From 8e95bba69787bfc34ad63f60565f8f064b51cb88 Mon Sep 17 00:00:00 2001
From: Michal Piszczek <imichaljp@gmail.com>
Date: Mon, 22 Aug 2022 17:11:40 -0700
Subject: [PATCH 001/704] Remove mutable defaults in mlp_model (#12546)

---
 .../tvm/meta_schedule/cost_model/mlp_model.py | 37 +++++++++++--------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/python/tvm/meta_schedule/cost_model/mlp_model.py b/python/tvm/meta_schedule/cost_model/mlp_model.py
index 04ccca0563f9..e7f07f0a4542 100644
--- a/python/tvm/meta_schedule/cost_model/mlp_model.py
+++ b/python/tvm/meta_schedule/cost_model/mlp_model.py
@@ -26,7 +26,7 @@
 import tempfile
 from collections import OrderedDict
 from itertools import chain as itertools_chain
-from typing import Dict, List, NamedTuple, Tuple
+from typing import Dict, List, NamedTuple, Optional, Tuple
 
 import numpy as np  # type: ignore
 import torch  # type: ignore
@@ -418,8 +418,8 @@ def forward(  # pylint: disable=missing-function-docstring
 def extract_features(
     context: TuneContext,
     candidates: List[MeasureCandidate],
-    results: List[RunnerResult] = None,
-    extractor: FeatureExtractor = PerStoreFeature(extract_workload=True),
+    results: Optional[List[RunnerResult]] = None,
+    extractor: Optional[FeatureExtractor] = None,
 ):
     """Extract feature vectors and compute mean costs.
 
@@ -429,9 +429,9 @@ def extract_features(
         The tuning context.
     candidates: List[MeasureCandidate]
         The measure candidates.
-    results: List[RunnerResult]
+    results: Optional[List[RunnerResult]]
         The measured results, can be None if used in prediction.
-    extractor: FeatureExtractor
+    extractor: Optional[FeatureExtractor]
         The feature extractor.
 
     Returns
@@ -441,6 +441,7 @@ def extract_features(
     new_mean_costs: np.ndarray
         The mean costs.
     """
+    extractor = extractor or PerStoreFeature(extract_workload=True)
 
     def _feature(feature: NDArray) -> np.ndarray:
         return feature.numpy().astype("float32")
@@ -481,9 +482,12 @@ class State:
 
     def __init__(
         self,
-        model_config: SegmentSumMLPConfig = SegmentSumMLPConfig(),
-        extractor: FeatureExtractor = PerStoreFeature(extract_workload=True),
+        model_config: Optional[SegmentSumMLPConfig] = None,
+        extractor: Optional[FeatureExtractor] = None,
     ):
+        model_config = model_config or SegmentSumMLPConfig()
+        extractor = extractor or PerStoreFeature(extract_workload=True)
+
         self.model = SegmentSumMLP(**model_config.to_dict())
         self.data = OrderedDict()
         self.data_size = 0
@@ -662,9 +666,12 @@ class SegmentSumMLPTrainer:
 
     def __init__(
         self,
-        train_config: TrainerConfig = TrainerConfig(),
-        state: State = State(),
+        train_config: Optional[TrainerConfig] = None,
+        state: Optional[State] = None,
     ):
+        train_config = train_config or TrainerConfig()
+        state = state or State()
+
         config = train_config.to_dict()
         for attr in config:
             setattr(self, attr, config[attr])
@@ -676,7 +683,7 @@ def train_step(
         self,
         data: Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"],
         batch: int = 0,
-        train_loss: float = None,
+        train_loss: Optional[float] = None,
     ) -> float:
         """Helper function for training on a single batch.
 
@@ -686,7 +693,7 @@ def train_step(
             A batch of data, should be a tuple of (segment_sizes, features, gt_results).
         batch: int = 0
             The current batch number.
-        train_loss: float = None
+        train_loss: Optional[float] = None
             The previous averaged training loss, None if it is the first batch.
 
         Returns
@@ -863,7 +870,7 @@ def train_incremental(
     def predict_incremental(
         self,
         features: List[np.ndarray],
-        results: np.ndarray = None,
+        results: Optional[np.ndarray] = None,
     ) -> np.ndarray:
         """Predicting (validating) on incremental data.
 
@@ -871,7 +878,7 @@ def predict_incremental(
         ----------
         features: List[np.ndarray]
             The extracted features.
-        results: np.ndarray
+        results: Optional[np.ndarray]
             The measured results, can be None if used for predicting.
 
         Returns
@@ -943,10 +950,10 @@ class MLPModel(PyCostModel):
     def __init__(
         self,
         *,
-        trainer: SegmentSumMLPTrainer = SegmentSumMLPTrainer(),
+        trainer: Optional[SegmentSumMLPTrainer] = None,
     ):
         super().__init__()
-        self.trainer = trainer
+        self.trainer = trainer or SegmentSumMLPTrainer()
 
     def load(self, path: str) -> None:
         """Load the cost model, cached data or raw data from given file location.

From 3bd168194f25c95904dac8835f8e74abd423a5a3 Mon Sep 17 00:00:00 2001
From: Mohamad Katanbaf <mtkatanbaf@gmail.com>
Date: Mon, 22 Aug 2022 18:39:16 -0700
Subject: [PATCH 002/704] check for CMSIS_PATH in project generation (#12547)

Co-authored-by: Mohamad <mkatanbaf@users.noreply.github.com>
---
 .../zephyr/template_project/CMakeLists.txt.template         | 3 ++-
 .../microtvm/zephyr/template_project/microtvm_api_server.py | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
index 742433e82d0d..b5182bf8ac1f 100644
--- a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
+++ b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
@@ -29,7 +29,7 @@ find_package(Zephyr HINTS $ENV{ZEPHYR_BASE})
 project(microtvm_autogenerated_project)
 
 if(${ENABLE_CMSIS})
-  set(CMSIS_PATH $ENV{CMSIS_PATH})
+  set(CMSIS_PATH <CMSIS_PATH>)
 
   file(GLOB_RECURSE cmsis_lib_srcs
     ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/*.c
@@ -40,6 +40,7 @@ if(${ENABLE_CMSIS})
   )
 
   set(cmsis_includes
+    ${CMSIS_PATH}/CMSIS/Core/Include
     ${CMSIS_PATH}/CMSIS/NN/Include
     ${CMSIS_PATH}/CMSIS/DSP/Include
     ${CMSIS_PATH}/CMSIS/DSP/Include/dsp
diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
index c55bd63fa4dd..eb20c3e88448 100644
--- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py
+++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
@@ -463,6 +463,7 @@ def _create_prj_conf(self, project_dir, options):
     API_SERVER_CRT_LIBS_TOKEN = "<API_SERVER_CRT_LIBS>"
     CMAKE_ARGS_TOKEN = "<CMAKE_ARGS>"
     QEMU_PIPE_TOKEN = "<QEMU_PIPE>"
+    CMSIS_PATH_TOKEN = "<CMSIS_PATH>"
 
     CRT_LIBS_BY_PROJECT_TYPE = {
         "host_driven": "microtvm_rpc_server microtvm_rpc_common aot_executor_module aot_executor common",
@@ -521,6 +522,8 @@ def _generate_cmake_args(self, mlf_extracted_path, options) -> str:
         cmake_args += f"set(BOARD {options['zephyr_board']})\n"
 
         enable_cmsis = self._cmsis_required(mlf_extracted_path)
+        if enable_cmsis:
+            assert os.environ.get("CMSIS_PATH"), "CMSIS_PATH is not defined."
         cmake_args += f"set(ENABLE_CMSIS {str(enable_cmsis).upper()})\n"
 
         return cmake_args
@@ -587,6 +590,9 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
                         self.qemu_pipe_dir = pathlib.Path(tempfile.mkdtemp())
                         line = line.replace(self.QEMU_PIPE_TOKEN, str(self.qemu_pipe_dir / "fifo"))
 
+                    if self.CMSIS_PATH_TOKEN in line and self._cmsis_required(extract_path):
+                        line = line.replace(self.CMSIS_PATH_TOKEN, str(os.environ["CMSIS_PATH"]))
+
                     cmake_f.write(line)
 
                 if options.get("compile_definitions"):

From 5cef6bf559265e74b84504ed2e190f29f5c5bf33 Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Tue, 23 Aug 2022 08:58:54 +0700
Subject: [PATCH 003/704] [microTVM] Rework evaluate_model_accuracy into a more
 generic helper function (#12539)

* Add workaround for #12538

* Rework evaluate_model_accuracy into predict_labels_aot
---
 python/tvm/micro/testing/__init__.py   |  2 +-
 python/tvm/micro/testing/evaluation.py | 21 ++++++---------------
 tests/micro/common/test_autotune.py    | 13 +++++++------
 3 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/python/tvm/micro/testing/__init__.py b/python/tvm/micro/testing/__init__.py
index 9062f061bda3..0dc24102cb89 100644
--- a/python/tvm/micro/testing/__init__.py
+++ b/python/tvm/micro/testing/__init__.py
@@ -16,5 +16,5 @@
 # under the License.
 
 """Allows the tools specified below to be imported directly from tvm.micro.testing"""
-from .evaluation import tune_model, create_aot_session, evaluate_model_accuracy
+from .evaluation import tune_model, create_aot_session, predict_labels_aot
 from .utils import get_supported_boards, get_target
diff --git a/python/tvm/micro/testing/evaluation.py b/python/tvm/micro/testing/evaluation.py
index 5f47e06a17f9..32de1d2a370d 100644
--- a/python/tvm/micro/testing/evaluation.py
+++ b/python/tvm/micro/testing/evaluation.py
@@ -142,27 +142,18 @@ def create_aot_session(
     return tvm.micro.Session(project.transport(), timeout_override=timeout_override)
 
 
-# This utility functions was designed ONLY for one input / one output models
-# where the outputs are confidences for different classes.
-def evaluate_model_accuracy(session, aot_executor, input_data, true_labels, runs_per_sample=1):
-    """Evaluates an AOT-compiled model's accuracy and runtime over an RPC session. Works well
-    when used with create_aot_session."""
+def predict_labels_aot(session, aot_executor, input_data, runs_per_sample=1):
+    """Predicts labels for each sample in input_data using host-driven AOT.
+    Returns an iterator of (label, runtime) tuples. This function can only
+    be used with models for which the output is the confidence for each class."""
 
     assert aot_executor.get_num_inputs() == 1
     assert aot_executor.get_num_outputs() == 1
     assert runs_per_sample > 0
 
-    predicted_labels = []
-    aot_runtimes = []
     for sample in input_data:
         aot_executor.get_input(0).copyfrom(sample)
         result = aot_executor.module.time_evaluator("run", session.device, number=runs_per_sample)()
+        predicted_label = aot_executor.get_output(0).numpy().argmax()
         runtime = result.mean
-        output = aot_executor.get_output(0).numpy()
-        predicted_labels.append(output.argmax())
-        aot_runtimes.append(runtime)
-
-    num_correct = sum(u == v for u, v in zip(true_labels, predicted_labels))
-    average_time = sum(aot_runtimes) / len(aot_runtimes)
-    accuracy = num_correct / len(predicted_labels)
-    return average_time, accuracy, predicted_labels
+        yield predicted_label, runtime
diff --git a/tests/micro/common/test_autotune.py b/tests/micro/common/test_autotune.py
index 60b38ff211a4..b79260dd46ed 100644
--- a/tests/micro/common/test_autotune.py
+++ b/tests/micro/common/test_autotune.py
@@ -76,17 +76,18 @@ def test_kws_autotune_workflow(platform, board, tmp_path):
             np.random.randint(low=-127, high=128, size=(1, 1960), dtype=np.int8) for x in range(3)
         )
 
-        labels = [0, 0, 0]
-
         # Validate perforance across random runs
-        time, _, _ = tvm.micro.testing.evaluate_model_accuracy(
-            session, aot_executor, samples, labels, runs_per_sample=20
-        )
+        runtimes = [
+            runtime
+            for _, runtime in tvm.micro.testing.predict_labels_aot(
+                session, aot_executor, samples, runs_per_sample=20
+            )
+        ]
         # `time` is the average time taken to execute model inference on the
         # device, measured in seconds. It does not include the time to upload
         # the input data via RPC. On slow boards like the Arduino Due, time
         # is around 0.12 (120 ms), so this gives us plenty of buffer.
-        assert time < 1
+        assert np.median(runtimes) < 1
 
 
 if __name__ == "__main__":

From 58f2139ffdd39de61fcea3b090dcfa5f7d0db4be Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Tue, 23 Aug 2022 08:59:26 +0700
Subject: [PATCH 004/704] [microTVM] Replace static fixtures with
 parameterization (#12530)

* Replace microTVM static fixtures with parameterization

* [microTVM] Only perform parameterization when fixture is present

* Reformat with black

* Fix Cortex-M tests

* Add docstring to pytest_generate_tests

* Remove trailing space from docstring
---
 python/tvm/micro/testing/pytest_plugin.py    | 30 ++++++++++++++++----
 python/tvm/micro/testing/utils.py            |  5 ++++
 tests/micro/arduino/test_arduino_workflow.py | 13 ++++++---
 tests/micro/common/conftest.py               | 16 -----------
 4 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/python/tvm/micro/testing/pytest_plugin.py b/python/tvm/micro/testing/pytest_plugin.py
index 5c63711d28b3..9864b49abb61 100644
--- a/python/tvm/micro/testing/pytest_plugin.py
+++ b/python/tvm/micro/testing/pytest_plugin.py
@@ -26,14 +26,18 @@
 
 from tvm.contrib.utils import tempdir
 
-from .utils import get_supported_boards
+from .utils import get_supported_platforms, get_supported_boards
 
 
 def pytest_addoption(parser):
     """Adds more pytest arguments"""
+    parser.addoption(
+        "--platform",
+        choices=get_supported_platforms(),
+        help=("microTVM platform for tests."),
+    )
     parser.addoption(
         "--board",
-        required=True,
         choices=list(get_supported_boards("zephyr").keys())
         + list(get_supported_boards("arduino").keys()),
         help=(
@@ -58,9 +62,25 @@ def pytest_addoption(parser):
     )
 
 
-@pytest.fixture(scope="session")
-def board(request):
-    return request.config.getoption("--board")
+def pytest_generate_tests(metafunc):
+    """Hooks into pytest to add platform and board fixtures to tests that
+    require them. To make sure that "platform" and "board" are treated as
+    parameters for the appropriate tests (and included in the test names),
+    we add them as function level parametrizations. This prevents data
+    from being overwritten in Junit XML files if multiple platforms
+    or boards are tested."""
+
+    for argument in ["platform", "board"]:
+        if argument in metafunc.fixturenames:
+            value = metafunc.config.getoption(f"--{argument}", default=None)
+
+            if not value:
+                raise ValueError(
+                    f"Test {metafunc.function.__name__} in module {metafunc.module.__name__} "
+                    f"requires a --{argument} argument, but none was given."
+                )
+
+            metafunc.parametrize(argument, [metafunc.config.getoption(f"--{argument}")])
 
 
 @pytest.fixture(scope="session")
diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py
index 323108b253a2..794f443e47a6 100644
--- a/python/tvm/micro/testing/utils.py
+++ b/python/tvm/micro/testing/utils.py
@@ -33,6 +33,11 @@
 TIMEOUT_SEC = 10
 
 
+@lru_cache(maxsize=None)
+def get_supported_platforms():
+    return ["arduino", "zephyr"]
+
+
 @lru_cache(maxsize=None)
 def get_supported_boards(platform: str):
     template = Path(tvm.micro.get_microtvm_template_projects(platform))
diff --git a/tests/micro/arduino/test_arduino_workflow.py b/tests/micro/arduino/test_arduino_workflow.py
index 35bfa2556231..8d5d541d408c 100644
--- a/tests/micro/arduino/test_arduino_workflow.py
+++ b/tests/micro/arduino/test_arduino_workflow.py
@@ -37,9 +37,12 @@
 """
 
 # Since these tests are sequential, we'll use the same project/workspace
-# directory for all tests in this file
+# directory for all tests in this file. Note that --board can't be loaded
+# from the fixture, since the fixture is function scoped (it has to be
+# for the tests to be named correctly via parameterization).
 @pytest.fixture(scope="module")
-def workflow_workspace_dir(request, board):
+def workflow_workspace_dir(request):
+    board = request.config.getoption("--board")
     return test_utils.make_workspace_dir("arduino_workflow", board)
 
 
@@ -48,9 +51,11 @@ def project_dir(workflow_workspace_dir):
     return workflow_workspace_dir / "project"
 
 
-# We MUST pass workspace_dir, not project_dir, or the workspace will be dereferenced too soon
+# We MUST pass workspace_dir, not project_dir, or the workspace will be dereferenced
+# too soon. We can't use the board fixture either for the reason mentioned above.
 @pytest.fixture(scope="module")
-def project(board, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir):
+def project(request, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir):
+    board = request.config.getoption("--board")
     return test_utils.make_kws_project(
         board, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir
     )
diff --git a/tests/micro/common/conftest.py b/tests/micro/common/conftest.py
index 0bf70ed06138..d86fd41bd8bf 100644
--- a/tests/micro/common/conftest.py
+++ b/tests/micro/common/conftest.py
@@ -17,19 +17,3 @@
 pytest_plugins = [
     "tvm.micro.testing.pytest_plugin",
 ]
-
-import pytest
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--platform",
-        required=True,
-        choices=["arduino", "zephyr"],
-        help="Platform to run tests with",
-    )
-
-
-@pytest.fixture
-def platform(request):
-    return request.config.getoption("--platform")

From e252d7f3ab6eac631c960cdcb7826862958c6e59 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Mon, 22 Aug 2022 19:59:58 -0700
Subject: [PATCH 005/704] [docs] Add CI contribution instructions (#12551)

This PR documents the steps to introducing a new CI docker image, which we've been doing a lot lately.
---
 docs/contribute/ci.rst | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/docs/contribute/ci.rst b/docs/contribute/ci.rst
index a421103ab457..1284fd95fbea 100644
--- a/docs/contribute/ci.rst
+++ b/docs/contribute/ci.rst
@@ -174,6 +174,29 @@ The images for these containers are hosted in the `tlcpack Docker Hub <https://h
 and referenced in the `Jenkinsfile.j2 <https://github.com/apache/tvm/tree/main/Jenkinsfile.j2>`_. These can be inspected and run
 locally via standard Docker commands.
 
+Adding a new Docker image
+"""""""""""""""""""""""""
+
+New docker images can be added to test TVM on a variety of platforms. Here are the steps for adding
+a new CI image:
+
+1.  Define the ``docker/Dockerfile.ci_foo`` and associated scripts in ``docker/install``. Create a PR containing only these changes (no ``Jenkinsfile`` changes).
+
+    Example: https://github.com/apache/tvm/pull/12230/files
+
+2. A committer verifies the image builds locally and then reviews/approves this PR.
+3. A committer creates the ci-foo repos in https://hub.docker.com/u/tlcpack and https://hub.docker.com/u/tlcpackstaging.
+4. Create a PR to create an ECR repo for the image in tlcpack/ci: https://github.com/tlc-pack/ci/pull/46/files
+5. A committer creates and gets merged a PR to add the image to the ``Jenkinsfile``
+
+    Example: https://github.com/apache/tvm/pull/12369/files.
+
+    **NOTE**: The PR must be opened from a branch in apache/tvm, not from a branch in a forked repo.
+
+6. A committer adds this image to the daily docker rebuild/validation run in tlcpack.
+
+    Example: https://github.com/tlc-pack/tlcpack/pull/131
+
 
 ``ci-docker-staging``
 ^^^^^^^^^^^^^^^^^^^^^

From d26bf809e4c3c8d6576d4e436475997eb12deb3e Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Tue, 23 Aug 2022 06:46:26 +0100
Subject: [PATCH 006/704] [ACL] Adjust mobilenet test for Keras 2.9 (#12541)

In Keras 2.7, one "reshape" operator was removed from
the Mobilenet model, making our test which verifies the
number of operators to be incorrect.

This patch adjusts the operator count so that it is in line
with the changes in Keras. For reference, the change in
keras repo was done in hash b6abfaed132 "Remove unnecessary
reshape layer in MobileNet architecture".
---
 .../test_arm_compute_lib/test_network.py      | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py
index 8fcafe489cb9..b5b9ed6b6ef9 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_network.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_network.py
@@ -16,6 +16,8 @@
 # under the License.
 """Arm Compute Library network tests."""
 
+from distutils.version import LooseVersion
+
 import numpy as np
 import pytest
 from tvm import testing
@@ -111,6 +113,7 @@ def get_model():
 
 
 def test_mobilenet():
+    keras = pytest.importorskip("keras")
     Device.load("test_config.json")
 
     if skip_runtime_test():
@@ -131,8 +134,25 @@ def get_model():
         mod, params = _get_keras_model(mobilenet, inputs)
         return mod, params, inputs
 
+    if keras.__version__ < LooseVersion("2.9"):
+        # This can be removed after we migrate to TF/Keras >= 2.9
+        expected_tvm_ops = 56
+        expected_acl_partitions = 31
+    else:
+        # In Keras >= 2.7, one reshape operator was removed
+        # from the MobileNet model, so it impacted this test
+        # which now needs to be reduce in by 1
+        # The change in Keras is `b6abfaed1326e3c`
+        expected_tvm_ops = 55
+        expected_acl_partitions = 30
+
     _build_and_run_network(
-        *get_model(), device=device, tvm_ops=56, acl_partitions=31, atol=0.002, rtol=0.01
+        *get_model(),
+        device=device,
+        tvm_ops=expected_tvm_ops,
+        acl_partitions=expected_acl_partitions,
+        atol=0.002,
+        rtol=0.01,
     )
 
 
From 3983a472c6f3ad4ad9604ceeffdf80cce01d166b Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Tue, 23 Aug 2022 07:37:39 +0100
Subject: [PATCH 007/704] [COMMUNITY] @konturn -> Reviewer (#12543)

Co-authored-by: Leandro Nunes <leanun01@e123855.arm.com>
---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 7c6f2dfa7112..e3b4fe339a4f 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -121,6 +121,7 @@ We do encourage everyone to work anything they are interested in.
 - [Elen Kalda](https://github.com/ekalda): @ekalda
 - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame
 - [Michael J. Klaiber](https://github.com/MichaelJKlaiber/) @MichaelJKlaiber
+- [Noah Kontur](https://github.com/konturn/) @konturn
 - [Tristan Konolige](https://github.com/tkonolige): @tkonolige
 - [Denise Kutnick](https://github.com/denise-k): @denise-k
 - [Ruihang Lai](https://github.com/MasterJH5574): @MasterJH5574

From 383bd419310fac4d9d78e0c59760cbef3efa5555 Mon Sep 17 00:00:00 2001
From: Nicola Lancellotti <nicola.lancellotti@arm.com>
Date: Tue, 23 Aug 2022 08:38:45 +0100
Subject: [PATCH 008/704] Fix TFLite 2.9 tests (#12130)

This pr fixes the tests that will be broken when we will update TFLite to
the 2.9 version.

We will update TensorFlow and TFLite versions to 2.9 so that we can
benefit from improvements in packaging to support multiple platforms
and Operating Systems.
---
 python/tvm/relay/frontend/keras.py           |  8 +++--
 tests/python/frontend/tflite/test_forward.py | 33 +++++++++++++++-----
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py
index 3f7a96544a65..8c8a4a1ddcd3 100644
--- a/python/tvm/relay/frontend/keras.py
+++ b/python/tvm/relay/frontend/keras.py
@@ -635,9 +635,11 @@ def _convert_pooling(
             _op.nn.global_max_pool2d(inexpr, **global_pool_params), keras_layer, etab, data_layout
         )
     if pool_type == "GlobalAveragePooling2D":
-        return _convert_flatten(
-            _op.nn.global_avg_pool2d(inexpr, **global_pool_params), keras_layer, etab, data_layout
-        )
+        global_avg_pool2d = _op.nn.global_avg_pool2d(inexpr, **global_pool_params)
+        keep_dims = len(keras_layer.input.shape) == len(keras_layer.output.shape)
+        if keep_dims:
+            return global_avg_pool2d
+        return _convert_flatten(global_avg_pool2d, keras_layer, etab, data_layout)
     pool_h, pool_w = keras_layer.pool_size
     stride_h, stride_w = keras_layer.strides
     params = {
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 9121721d8ea2..7267b725483d 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -963,6 +963,10 @@ def representative_data_gen():
     input_node = subgraph.Tensors(model_input).Name().decode("utf-8")
 
     tflite_output = run_tflite_graph(tflite_model_quant, data)
+    if tf.__version__ < LooseVersion("2.9"):
+        input_node = data_in.name.replace(":0", "")
+    else:
+        input_node = "serving_default_" + data_in.name + ":0"
     tvm_output = run_tvm_graph(tflite_model_quant, data, input_node)
     tvm.testing.assert_allclose(
         np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-2, atol=1e-2
@@ -1997,10 +2001,12 @@ def _test_abs(data, quantized, int_quant_dtype=tf.int8):
         # TFLite 2.6.x upgrade support
         if tf.__version__ < LooseVersion("2.6.1"):
             in_node = ["serving_default_input_int8"]
-        else:
+        elif tf.__version__ < LooseVersion("2.9"):
             in_node = (
                 ["serving_default_input_int16"] if int_quant_dtype == tf.int16 else ["tfl.quantize"]
             )
+        else:
+            in_node = "serving_default_input"
 
         tvm_output = run_tvm_graph(tflite_model_quant, data, in_node)
         tvm.testing.assert_allclose(
@@ -2028,8 +2034,10 @@ def _test_rsqrt(data, quantized, int_quant_dtype=tf.int8):
             tf.math.rsqrt, data, int_quant_dtype=int_quant_dtype
         )
         tflite_output = run_tflite_graph(tflite_model_quant, data)
-        in_node = ["tfl.quantize"]
-
+        if tf.__version__ < LooseVersion("2.9"):
+            in_node = ["tfl.quantize"]
+        else:
+            in_node = "serving_default_input"
         tvm_output = run_tvm_graph(tflite_model_quant, data, in_node)
         tvm.testing.assert_allclose(
             np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2
@@ -2110,7 +2118,10 @@ def _test_cos(data, quantized, int_quant_dtype=tf.int8):
             tf.math.cos, data, int_quant_dtype=int_quant_dtype
         )
         tflite_output = run_tflite_graph(tflite_model_quant, data)
-        in_node = ["tfl.quantize"]
+        if tf.__version__ < LooseVersion("2.9"):
+            in_node = ["tfl.quantize"]
+        else:
+            in_node = "serving_default_input"
         tvm_output = run_tvm_graph(tflite_model_quant, data, in_node)
         tvm.testing.assert_allclose(
             np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2
@@ -3024,7 +3035,6 @@ def _test_quantize_dequantize(data):
     add = tf.keras.layers.Add()([data_in, relu])
     concat = tf.keras.layers.Concatenate(axis=0)([relu, add])
     keras_model = tf.keras.models.Model(inputs=data_in, outputs=concat)
-    input_name = data_in.name.split(":")[0]
 
     # To create quantized values with dynamic range of activations, needs representative dataset
     def representative_data_gen():
@@ -3034,7 +3044,11 @@ def representative_data_gen():
     tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen, True, True)
 
     tflite_output = run_tflite_graph(tflite_model_quant, data)
-    tvm_output = run_tvm_graph(tflite_model_quant, data, input_name)
+    if tf.__version__ < LooseVersion("2.9"):
+        in_node = data_in.name.split(":")[0]
+    else:
+        in_node = "serving_default_" + data_in.name + ":0"
+    tvm_output = run_tvm_graph(tflite_model_quant, data, in_node)
     tvm.testing.assert_allclose(
         np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2
     )
@@ -3051,7 +3065,6 @@ def _test_quantize_dequantize_const(data):
     add = tf.keras.layers.Add()([data, relu])
     concat = tf.keras.layers.Concatenate(axis=0)([relu, add])
     keras_model = tf.keras.models.Model(inputs=data_in, outputs=concat)
-    input_name = data_in.name.split(":")[0]
 
     # To create quantized values with dynamic range of activations, needs representative dataset
     def representative_data_gen():
@@ -3061,7 +3074,11 @@ def representative_data_gen():
     tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen, True, True)
 
     tflite_output = run_tflite_graph(tflite_model_quant, data)
-    tvm_output = run_tvm_graph(tflite_model_quant, data, input_name)
+    if tf.__version__ < LooseVersion("2.9"):
+        in_node = data_in.name.split(":")[0]
+    else:
+        in_node = "serving_default_" + data_in.name + ":0"
+    tvm_output = run_tvm_graph(tflite_model_quant, data, in_node)
     tvm.testing.assert_allclose(
         np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2
     )

From 52779f1273b05d53d8213e23e70d9b0ac82fd0b9 Mon Sep 17 00:00:00 2001
From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com>
Date: Tue, 23 Aug 2022 10:00:34 +0100
Subject: [PATCH 009/704] [CMSIS-NN] Pad fusion with QNN Conv2D (#12353)

Pass that fuses nn.pad and qnn.conv2d for CMSIS-NN target.
---
 python/tvm/relay/op/contrib/cmsisnn.py        |  50 ++-
 .../backend/contrib/cmsisnn/fuse_pads.cc      | 209 +++++++++++
 .../contrib/test_cmsisnn/test_conv2d.py       | 277 ++++++++++++--
 .../contrib/test_cmsisnn/test_fuse_pads.py    | 340 ++++++++++++++++++
 tests/python/contrib/test_cmsisnn/utils.py    |  45 ++-
 5 files changed, 886 insertions(+), 35 deletions(-)
 create mode 100644 src/relay/backend/contrib/cmsisnn/fuse_pads.cc
 create mode 100644 tests/python/contrib/test_cmsisnn/test_fuse_pads.py

diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index 8d714b7269d9..b887fafd7e00 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -59,6 +59,7 @@ def partition_for_cmsisnn(mod, params=None, mod_name="default", **opts):
             transform.AnnotateTarget("cmsis-nn"),
             transform.PartitionGraph(mod_name=mod_name),
             GenerateCMSISNNConstants(),
+            CMSISNNFusePads(),
             ScalarToTensorConstants(),
             ExtractConstantsFromPartitionedFunction(),
             transform.InferType(),
@@ -91,10 +92,18 @@ def check_qnn_softmax(pattern):
             and dequantize_call.args[0].checked_type.dtype == "int8"
         )
 
-    def qnn_conv2d_pattern():
-        """Create pattern for qnn.conv2D with optional fused relu."""
+    def qnn_conv2d_pattern(with_pad):
+        """Create pattern for qnn.conv2D with optional pad and/or optional fused relu."""
+        conv2d_input = wildcard()
+        if with_pad:
+            conv2d_input = is_op("nn.pad")(wildcard(), is_constant())
         qnn_conv2d = is_op("qnn.conv2d")(
-            wildcard(), is_constant(), is_constant(), is_constant(), is_constant(), is_constant()
+            conv2d_input,
+            is_constant(),
+            is_constant(),
+            is_constant(),
+            is_constant(),
+            is_constant(),
         )
         bias_add = is_op("nn.bias_add")(qnn_conv2d, is_constant())
         req = is_op("qnn.requantize")(
@@ -136,7 +145,7 @@ def check_qnn_conv2d(pattern):
         ):
             is_depthwise = True
 
-        return (
+        ret = (
             conv2d.attrs.out_dtype == "int32"
             and conv2d_input.checked_type.dtype == "int8"
             and conv2d_weight.checked_type.dtype == "int8"
@@ -145,6 +154,36 @@ def check_qnn_conv2d(pattern):
             and all([zp == 0 for zp in kernel_zp])
             and (not is_depthwise or bias_add is not None)
         )
+        return ret
+
+    def check_qnn_conv2d_pad(pattern):
+        """Check if the Pad followed by Conv2D is supported by CMSIS-NN."""
+        if str(pattern.op.name) == "clip":
+            relu = pattern
+            requantize = relu.args[0]
+        else:
+            requantize = pattern
+        requantize_input = requantize.args[0]
+        if str(requantize_input.op.name) == "nn.bias_add":
+            bias_add = requantize_input
+            conv2d = bias_add.args[0]
+        else:
+            conv2d = requantize_input
+        conv2d_input = conv2d.args[0]
+
+        # check if sum of paddings from pad() and conv2d() satisfies CMSIS-NN constraints
+        can_pad_be_fused = True
+        if isinstance(conv2d_input, tvm.relay.expr.Call) and str(conv2d_input.op.name) == "nn.pad":
+            pad_top, pad_left, pad_bottom, pad_right = GetEffectiveConv2DPadding(
+                conv2d, conv2d_input
+            )
+            # check if difference in the side paddings is 1 along each dimension
+            pad_w_diff = int(pad_right - pad_left)
+            pad_h_diff = int(pad_bottom - pad_top)
+            can_pad_be_fused = pad_w_diff in [0, 1] and pad_h_diff in [0, 1]
+
+        ret = check_qnn_conv2d(pattern) and can_pad_be_fused
+        return ret
 
     def qnn_fully_connected_pattern():
         """Create pattern for qnn.dense with optional Relu."""
@@ -275,7 +314,8 @@ def check_qnn_binary_op(pattern):
         )
 
     return [
-        ("cmsis-nn.qnn_conv2d", qnn_conv2d_pattern(), check_qnn_conv2d),
+        ("cmsis-nn.qnn_conv2d", qnn_conv2d_pattern(with_pad=True), check_qnn_conv2d_pad),
+        ("cmsis-nn.qnn_conv2d", qnn_conv2d_pattern(with_pad=False), check_qnn_conv2d),
         ("cmsis-nn.qnn_fully_connected", qnn_fully_connected_pattern(), check_qnn_fully_connected),
         ("cmsis-nn.qnn_avg_pool2d", qnn_avg_pool2d_pattern(), check_qnn_avg_pool2d),
         ("cmsis-nn.qnn_max_pool2d", qnn_max_pool2d_pattern(), check_qnn_max_pool2d),
diff --git a/src/relay/backend/contrib/cmsisnn/fuse_pads.cc b/src/relay/backend/contrib/cmsisnn/fuse_pads.cc
new file mode 100644
index 000000000000..71c31c303588
--- /dev/null
+++ b/src/relay/backend/contrib/cmsisnn/fuse_pads.cc
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * \file src/relay/backend/contrib/cmsisnn/fuse_pads.cc
+ * \brief Fuses pads that precede qnn.conv2d ops inside CMSIS-NN composite functions.
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/attrs/transform.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/runtime/ndarray.h>
+
+#include "../../../op/make_op.h"
+#include "../../../qnn/utils.h"
+#include "../../../transforms/pattern_utils.h"
+#include "convolutions.h"
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+namespace cmsisnn {
+
+inline IntImm ToIntImm(int32_t value) { return IntImm(DataType::Int(32), value); }
+
+/*!
+ * \brief From padding attributes of nn.pad and qnn.conv2d, calculates effective padding along H
+ * and W dimensions.
+ */
+Array<IntImm> GetEffectiveConv2DPadding(Expr conv2d, Expr pad) {
+  // pad_width: ((), (top, bottom), (left, right), ()) for NHWC layout
+  // conv2d_attrs->padding: (top, left, bottom, right)
+  auto* conv2d_call = conv2d.as<CallNode>();
+  auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
+  std::string data_layout = conv2d_attrs->data_layout.c_str();
+  int pos_h = data_layout.find("H");
+  int pos_w = data_layout.find("W");
+
+  auto* pad_call = pad.as<CallNode>();
+  Array<Array<Integer>> pad_width = pad_call->attrs.as<PadAttrs>()->pad_width;
+  int pad_top =
+      qnn::get_const_int(conv2d_attrs->padding[0]) + qnn::get_const_int(pad_width[pos_h][0]);
+  int pad_left =
+      qnn::get_const_int(conv2d_attrs->padding[1]) + qnn::get_const_int(pad_width[pos_w][0]);
+  int pad_bottom =
+      qnn::get_const_int(conv2d_attrs->padding[2]) + qnn::get_const_int(pad_width[pos_h][1]);
+  int pad_right =
+      qnn::get_const_int(conv2d_attrs->padding[3]) + qnn::get_const_int(pad_width[pos_w][1]);
+
+  return {ToIntImm(pad_top), ToIntImm(pad_left), ToIntImm(pad_bottom), ToIntImm(pad_right)};
+}
+
+/*!
+ * \brief This Mutator will find all partitioned functions meant for CMSIS-NN Conv2D.
+ * Then, it will fuse preceding pads with qnn.conv2d.
+ */
+class FusePadsMutator : public MixedModeMutator {
+ public:
+  explicit FusePadsMutator(const IRModule& mod) : mod_(mod) {}
+
+ private:
+  /*!
+   * \brief In order to eliminate preceding nn.pad op, pad_width of nn.pad is passed onto
+   * convolution layer to update Conv2DAttrs's padding attribute. */
+  void UpdateConv2DPadding(const CallNode* conv2d_call, const CallNode* pad_call,
+                           Attrs* new_attrs) {
+    Array<IntImm> effective_padding =
+        GetEffectiveConv2DPadding(GetRef<Call>(conv2d_call), GetRef<Call>(pad_call));
+    int pad_top = effective_padding[0]->value;
+    int pad_left = effective_padding[1]->value;
+    int pad_bottom = effective_padding[2]->value;
+    int pad_right = effective_padding[3]->value;
+    int pad_diff_w = pad_right - pad_left;
+    int pad_diff_h = pad_bottom - pad_top;
+    bool can_pad_be_fused =
+        ((pad_diff_w == 0 || pad_diff_w == 1) && (pad_diff_h == 0 || pad_diff_h == 1));
+    std::string error = "Difference on each side of a dimension should be either 0 or 1. ";
+    error += "Effective padding in this case: (pad_top, pad_left, pad_bottom, pad_right)=(";
+    error += std::to_string(pad_top);
+    error += ", ";
+    error += std::to_string(pad_left);
+    error += ", ";
+    error += std::to_string(pad_bottom);
+    error += ", ";
+    error += std::to_string(pad_right);
+    error += ")";
+    ICHECK(can_pad_be_fused) << error;
+
+    // Prepare new attrs as padding has changed
+    auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
+    auto attrs = make_object<Conv2DAttrs>();
+    attrs->strides = std::move(conv2d_attrs->strides);
+    attrs->dilation = std::move(conv2d_attrs->dilation);
+    attrs->groups = conv2d_attrs->groups;
+    attrs->channels = std::move(conv2d_attrs->channels);
+    attrs->kernel_size = std::move(conv2d_attrs->kernel_size);
+    attrs->data_layout = std::move(conv2d_attrs->data_layout);
+    attrs->kernel_layout = std::move(conv2d_attrs->kernel_layout);
+    attrs->out_layout = std::move(conv2d_attrs->out_layout);
+    attrs->out_dtype = std::move(conv2d_attrs->out_dtype);
+    attrs->padding = {pad_top, pad_left, pad_bottom, pad_right};
+    *new_attrs = tvm::Attrs{attrs};
+  }
+
+  /*!
+   * \brief Identifies the sequence for qnn.conv2D and fuses the preceding nn.pad present within the
+   * CMSIS-NN partitioned function. */
+  Expr FusePadConv2d(const CallNode* conv2d_call) {
+    // create new paddings for qnn.conv2d
+    tvm::Attrs new_conv2d_attrs = conv2d_call->attrs;
+    Expr new_conv2d_input = conv2d_call->args[0];
+    if (auto* pad_call = conv2d_call->args[0].as<CallNode>()) {
+      if (auto* pad_call_op = pad_call->op.as<OpNode>()) {
+        if (pad_call_op->name == "nn.pad") {
+          new_conv2d_input = pad_call->args[0];
+          UpdateConv2DPadding(conv2d_call, pad_call, &new_conv2d_attrs);
+        }
+      }
+    }
+
+    // Conv2D arguments: pad's input + rest of the origin args
+    auto new_conv2d_args = conv2d_call->args;
+    new_conv2d_args.erase(new_conv2d_args.begin());
+    new_conv2d_args.insert(new_conv2d_args.begin(), new_conv2d_input);
+    Call ret_call = Call(conv2d_call->op, new_conv2d_args, new_conv2d_attrs, {});
+    return std::move(ret_call);
+  }
+
+  Expr Rewrite_(const CallNode* call, const Expr& post) final {
+    Expr ret_call = post;
+    auto* post_call = post.as<CallNode>();
+
+    // Fuse nn.pad and qnn.conv2d
+    if (auto* conv2d_op = post_call->op.as<OpNode>()) {
+      if (conv2d_op->name == "qnn.conv2d") {
+        ret_call = FusePadConv2d(post_call);
+      }
+    }
+
+    // Identify qnn.conv2d partitioned function
+    if (post_call->op.as<FunctionNode>()) {
+      auto* func = call->op.as<FunctionNode>();
+      auto func_name = func->GetAttr<String>(attr::kComposite);
+      if (func_name.defined() && func_name == "cmsis-nn.qnn_conv2d") {
+        Expr new_body = VisitExpr(func->body);
+        Function new_func = Function(FreeVars(new_body), new_body, func->ret_type,
+                                     FreeTypeVars(new_body, mod_), func->attrs);
+        ret_call = Call(new_func, post_call->args);
+      }
+    }
+
+    return ret_call;
+  }
+
+ private:
+  IRModule mod_;
+};
+
+IRModule FusePads(const IRModule& mod) {
+  for (auto gv : mod->GetGlobalVars()) {
+    Function func = Downcast<Function>(mod->Lookup(gv));
+
+    // only mutate CMSIS-NN partitioned functions
+    auto compiler_name = func->GetAttr<String>(attr::kCompiler);
+    if (!compiler_name.defined() || compiler_name != "cmsis-nn") {
+      continue;
+    }
+
+    auto fuse_pads_mutator = FusePadsMutator(mod);
+    auto new_func_body = fuse_pads_mutator.VisitExpr(func->body);
+    if (!new_func_body.same_as(func->body)) {
+      Function new_func =
+          Function(func->params, new_func_body, func->ret_type, func->type_params, func->attrs);
+      mod->Update(gv, new_func);
+    }
+  }
+  return mod;
+}
+
+transform::Pass CMSISNNFusePads() {
+  runtime::TypedPackedFunc<IRModule(IRModule, transform::PassContext)> pass_func =
+      [=](IRModule m, transform::PassContext pc) { return FusePads(m); };
+  return tvm::transform::CreateModulePass(pass_func, 0, "CMSISNNFusePads", {});
+}
+
+TVM_REGISTER_GLOBAL("relay.ext.cmsisnn.transform.CMSISNNFusePads").set_body_typed(CMSISNNFusePads);
+TVM_REGISTER_GLOBAL("relay.ext.cmsisnn.transform.GetEffectiveConv2DPadding")
+    .set_body_typed(GetEffectiveConv2DPadding);
+
+}  // namespace cmsisnn
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py
index 502743387bfa..d33d71261613 100644
--- a/tests/python/contrib/test_cmsisnn/test_conv2d.py
+++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py
@@ -40,6 +40,7 @@
     assert_partitioned_function,
     assert_no_external_function,
     create_test_runner,
+    CheckForPadsWithinCompositeFunc,
 )
 
 
@@ -62,23 +63,21 @@ def make_model(
     weight_format,
     enable_bias,
     relu_type,
+    input_op=None,
 ):
     """Return a model and any parameters it may have"""
+    if input_op:
+        op = input_op
+    else:
+        op = relay.var("input", shape=shape, dtype=dtype)
+
     h_index = weight_format.index("H")
     w_index = weight_format.index("W")
     kernel_h = kernel_shape[h_index]
     kernel_w = kernel_shape[w_index]
-    invar = relay.var("input", shape=shape, dtype=dtype)
     p = (0, 0, 0, 0)
     if padding == "SAME":
         p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
-        invar = relay.nn.pad(
-            invar,
-            pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)],
-            pad_value=input_zero_point,
-            pad_mode="constant",
-        )
-        shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3])
 
     rng = np.random.default_rng(12321)
     weight = tvm.nd.array(
@@ -92,7 +91,7 @@ def make_model(
     weight_const = relay.const(weight, kernel_dtype)
     conv2d_kernel_sc = kernel_scale[0] if out_channels == 1 else kernel_scale
     conv = relay.qnn.op.conv2d(
-        invar,
+        op,
         weight_const,
         input_zero_point=relay.const(input_zero_point, "int32"),
         kernel_zero_point=relay.const(kernel_zero_point, "int32"),
@@ -165,9 +164,9 @@ def test_conv2d_number_primfunc_args(
         input_zero_point,
         kernel_scale,
         kernel_zero_point,
-        dtype,
-        dtype,
-        dtype,
+        input_dtype=dtype,
+        weights_dtype=dtype,
+        output_dtype=dtype,
     )
 
     model, params = make_model(
@@ -265,9 +264,9 @@ def test_conv2d_symmetric_padding_int8(
         input_zero_point,
         kernel_scale,
         kernel_zero_point,
-        dtype,
-        dtype,
-        dtype,
+        input_dtype=dtype,
+        weights_dtype=dtype,
+        output_dtype=dtype,
     )
 
     model, params = make_model(
@@ -355,9 +354,110 @@ def test_conv2d_asymmetric_padding_int8(
         input_zero_point,
         kernel_scale,
         kernel_zero_point,
+        input_dtype=dtype,
+        weights_dtype=dtype,
+        output_dtype=dtype,
+    )
+
+    model, params = make_model(
+        ifm_shape,
+        kernel_shape,
+        input_zero_point,
+        input_scale,
+        kernel_zero_point,
+        kernel_scale,
+        output_zero_point,
+        output_scale,
+        padding,
+        strides,
+        dilation,
+        groups,
         dtype,
         dtype,
-        dtype,
+        out_channels,
+        weight_format,
+        enable_bias,
+        relu_type,
+    )
+    orig_mod = make_module(model)
+    cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params)
+    # validate pattern matching
+    assert_partitioned_function(orig_mod, cmsisnn_mod)
+
+    # validate the output
+    rng = np.random.default_rng(12345)
+    inputs = {"input": rng.integers(in_min, high=in_max, size=ifm_shape, dtype=dtype)}
+    output_list = generate_ref_data(orig_mod["main"], inputs, params)
+    compile_and_run(
+        AOTTestModel(
+            module=cmsisnn_mod,
+            inputs=inputs,
+            outputs=output_list,
+            params=params,
+            output_tolerance=1,
+        ),
+        test_runner,
+        interface_api,
+        use_unpacked_api,
+    )
+
+
+@tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize("ifm_shape", [(1, 25, 25, 12), (1, 64, 100, 4)])
+@pytest.mark.parametrize(
+    "pad_width",
+    [
+        ((0, 0), (0, 1), (1, 2), (0, 0)),
+        ((0, 0), (1, 1), (1, 1), (0, 0)),
+        ((0, 0), (2, 2), (3, 4), (0, 0)),
+    ],
+)
+def test_pad_conv2d_fusion_int8(
+    ifm_shape,
+    pad_width,
+):
+    """Tests QNN Conv2D where the padding is asymmetric on different sides of input"""
+    interface_api = "c"
+    use_unpacked_api = True
+    test_runner = AOT_USMP_CORSTONE300_RUNNER
+
+    ifm_shape = (1, 25, 25, 12)
+    kernel_size = (5, 5)
+    strides = (2, 2)
+    dilation = (1, 1)
+    padding = "SAME"
+    dtype = "int8"
+    enable_bias = True
+    relu_type = "NONE"
+    input_zero_point = 10
+    input_scale = 0.0128
+    kernel_scale = [0.11, 0.22]
+    out_channels = 2
+    groups = 1
+    weight_format = "HWIO"
+    kernel_h = kernel_size[0]
+    kernel_w = kernel_size[1]
+    kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
+    kernel_zero_point = 0
+    in_min, in_max = get_range_for_dtype_str(dtype)
+
+    output_scale, output_zero_point = get_conv2d_qnn_params(
+        kernel_shape,
+        input_scale,
+        input_zero_point,
+        kernel_scale,
+        kernel_zero_point,
+        input_dtype=dtype,
+        weights_dtype=dtype,
+        output_dtype=dtype,
+    )
+
+    invar = relay.var("input", shape=ifm_shape, dtype=dtype)
+    pad = relay.nn.pad(
+        invar,
+        pad_width=pad_width,  # ((), (top, bottom), (left, right), ())
+        pad_value=input_zero_point,
+        pad_mode="constant",
     )
 
     model, params = make_model(
@@ -379,12 +479,139 @@ def test_conv2d_asymmetric_padding_int8(
         weight_format,
         enable_bias,
         relu_type,
+        input_op=pad,
     )
     orig_mod = make_module(model)
     cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params)
+
+    # validate pattern matching
+    assert_partitioned_function(orig_mod, cmsisnn_mod, False)
+
+    # check pad is not present inside CMSIS-NN partitioned function
+    cmsisnn_func = None
+    for var in cmsisnn_mod.get_global_vars():
+        if "cmsis_nn_main_0" in var.name_hint:
+            cmsisnn_func = cmsisnn_mod[var]
+            pad_verifier = CheckForPadsWithinCompositeFunc()
+            pad_verifier.visit_function(cmsisnn_func)
+            pad_verifier.assert_no_pads_within_func()
+
+    # validate the output
+    rng = np.random.default_rng(12345)
+    inputs = {"input": rng.integers(in_min, high=in_max, size=ifm_shape, dtype=dtype)}
+    output_list = generate_ref_data(orig_mod["main"], inputs, params)
+    compile_and_run(
+        AOTTestModel(
+            module=cmsisnn_mod,
+            inputs=inputs,
+            outputs=output_list,
+            params=params,
+            output_tolerance=1,
+        ),
+        test_runner,
+        interface_api,
+        use_unpacked_api,
+    )
+
+
+@tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize(
+    "ifm_shape, pad_width, conv2d_padding",
+    [
+        [(1, 25, 25, 12), ((0, 0), (0, 2), (1, 2), (0, 0)), "SAME"],
+        [(1, 64, 100, 4), ((0, 0), (1, 3), (1, 1), (0, 0)), "VALID"],
+        [(1, 55, 55, 3), ((0, 0), (2, 1), (3, 5), (0, 0)), "SAME"],
+    ],
+)
+def test_invalid_pad_conv2d_fusion_int8(
+    ifm_shape,
+    pad_width,
+    conv2d_padding,
+):
+    """Tests QNN Conv2D where the padding is asymmetric on different sides of input"""
+    interface_api = "c"
+    use_unpacked_api = True
+    test_runner = AOT_USMP_CORSTONE300_RUNNER
+
+    ifm_shape = (1, 25, 25, 12)
+    kernel_size = (5, 5)
+    strides = (2, 2)
+    dilation = (1, 1)
+    dtype = "int8"
+    enable_bias = True
+    relu_type = "NONE"
+    input_zero_point = 10
+    input_scale = 0.0128
+    kernel_scale = [0.11, 0.22]
+    out_channels = 2
+    groups = 1
+    weight_format = "HWIO"
+    kernel_h = kernel_size[0]
+    kernel_w = kernel_size[1]
+    kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
+    kernel_zero_point = 0
+    in_min, in_max = get_range_for_dtype_str(dtype)
+
+    output_scale, output_zero_point = get_conv2d_qnn_params(
+        kernel_shape,
+        input_scale,
+        input_zero_point,
+        kernel_scale,
+        kernel_zero_point,
+        input_dtype=dtype,
+        weights_dtype=dtype,
+        output_dtype=dtype,
+    )
+
+    invar = relay.var("input", shape=ifm_shape, dtype=dtype)
+    pad = relay.nn.pad(
+        invar,
+        pad_width=pad_width,  # ((), (top, bottom), (left, right), ())
+        pad_value=input_zero_point,
+        pad_mode="constant",
+    )
+
+    model, params = make_model(
+        ifm_shape,
+        kernel_shape,
+        input_zero_point,
+        input_scale,
+        kernel_zero_point,
+        kernel_scale,
+        output_zero_point,
+        output_scale,
+        conv2d_padding,
+        strides,
+        dilation,
+        groups,
+        dtype,
+        dtype,
+        out_channels,
+        weight_format,
+        enable_bias,
+        relu_type,
+        input_op=pad,
+    )
+    orig_mod = make_module(model)
+    cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params)
+
     # validate pattern matching
     assert_partitioned_function(orig_mod, cmsisnn_mod)
 
+    # check pad is only present inside main function
+    cmsisnn_func = None
+    for var in cmsisnn_mod.get_global_vars():
+        if "cmsis_nn_main_0" in var.name_hint:
+            cmsisnn_func = cmsisnn_mod[var]
+            pad_verifier = CheckForPadsWithinCompositeFunc()
+            pad_verifier.visit_function(cmsisnn_func)
+            pad_verifier.assert_no_pads_within_func()
+        else:
+            main_func = cmsisnn_mod[var]
+            pad_verifier = CheckForPadsWithinCompositeFunc()
+            pad_verifier.visit_function(main_func)
+            pad_verifier.assert_pads_within_func()
+
     # validate the output
     rng = np.random.default_rng(12345)
     inputs = {"input": rng.integers(in_min, high=in_max, size=ifm_shape, dtype=dtype)}
@@ -506,10 +733,10 @@ def test_depthwise_int8(
         input_zero_point,
         kernel_scale,
         kernel_zero_point,
-        dtype,
-        dtype,
-        dtype,
-        True,
+        input_dtype=dtype,
+        weights_dtype=dtype,
+        output_dtype=dtype,
+        is_depthwise=True,
     )
 
     model, params = make_model(
@@ -611,10 +838,10 @@ def test_relay_conv2d_cmsisnn_depthwise_int8(
         input_zero_point,
         kernel_scale,
         kernel_zero_point,
-        dtype,
-        dtype,
-        dtype,
-        True,
+        input_dtype=dtype,
+        weights_dtype=dtype,
+        output_dtype=dtype,
+        is_depthwise=True,
     )
 
     model, params = make_model(
@@ -729,7 +956,7 @@ def test_invalid_parameters(
         in_dtype,
         kernel_dtype,
         in_dtype,
-        False,
+        is_depthwise=False,
     )
     model, params = make_model(
         shape=ifm_shape,
diff --git a/tests/python/contrib/test_cmsisnn/test_fuse_pads.py b/tests/python/contrib/test_cmsisnn/test_fuse_pads.py
new file mode 100644
index 000000000000..f57dc5cd5bab
--- /dev/null
+++ b/tests/python/contrib/test_cmsisnn/test_fuse_pads.py
@@ -0,0 +1,340 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""CMSIS-NN integration tests: fuse_pads pass"""
+import numpy as np
+import pytest
+import tvm
+import tvm.testing
+from tvm import relay
+from .utils import CheckForPadsWithinCompositeFunc
+
+tvm._ffi._init_api("relay.ext.cmsisnn.transform", __name__)
+
+
+def set_external_func_attr(func, compiler, ext_symbol):
+    func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+    func = func.with_attr("Compiler", compiler)
+    func = func.with_attr("global_symbol", ext_symbol)
+    return func
+
+
+def set_composite_func_attr(func, name):
+    func = func.with_attr("Composite", name)
+    return func
+
+
+@pytest.mark.parametrize(
+    "ifm_shape, pad_width, conv2d_padding, ofm_shape",
+    [
+        [(1, 25, 25, 12), ((0, 0), (0, 2), (1, 2), (0, 0)), (1, 1, 1, 1), (1, 26, 28, 2)],
+        [(1, 64, 100, 4), ((0, 0), (1, 3), (1, 1), (0, 0)), (0, 0, 0, 0), (1, 64, 100, 2)],
+        [(1, 55, 55, 3), ((0, 0), (2, 1), (3, 5), (0, 0)), (0, 0, 1, 1), (1, 57, 59, 2)],
+    ],
+)
+def test_invalid_padding_for_fusion(ifm_shape, pad_width, conv2d_padding, ofm_shape):
+    """Negative tests for pads preceding Conv2D that cannot be fused."""
+    dtype = "int8"
+    kernel_size = (3, 3)
+    ofm_channels = 2
+    local_input = relay.var("local_input", shape=ifm_shape, dtype=dtype)
+    pad = relay.nn.pad(
+        local_input,
+        pad_width=pad_width,  # ((), (top, bottom), (left, right), ())
+        pad_value=10,
+        pad_mode="constant",
+    )
+    rng = np.random.default_rng(12321)
+    local_weight = tvm.nd.array(
+        rng.integers(
+            np.iinfo(dtype).min,
+            high=np.iinfo(dtype).max,
+            size=(ofm_channels, kernel_size[0], kernel_size[1], ifm_shape[3]),
+            dtype=dtype,
+        )
+    )
+    local_weight = relay.const(local_weight, dtype)
+    conv2d = relay.qnn.op.conv2d(
+        pad,
+        local_weight,
+        relay.const(1, "int32"),
+        relay.const(1, "int32"),
+        relay.const(1, "float32"),
+        relay.const(1, "float32"),
+        data_layout="NHWC",
+        kernel_layout="OHWI",
+        channels=ofm_channels,
+        kernel_size=(3, 3),
+        padding=conv2d_padding,
+        out_dtype="int32",
+    )
+    requantize = relay.qnn.op.requantize(
+        conv2d,
+        relay.const(1, "float32"),
+        relay.const(1, "int32"),
+        relay.const(1, "float32"),
+        relay.const(1, "int32"),
+        axis=0,
+        out_dtype=dtype,
+    )
+    local_func = relay.Function(relay.analysis.free_vars(requantize), requantize)
+    local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_conv2d")
+
+    mod = tvm.IRModule()
+    ext_input = relay.var("ext_input", shape=ifm_shape, dtype=dtype)
+    call_local_func = relay.Call(local_func, [ext_input])
+    extern_func = relay.Function(relay.analysis.free_vars(call_local_func), call_local_func)
+    extern_var = relay.GlobalVar("external_function")
+    extern_func = set_external_func_attr(extern_func, "cmsis-nn", extern_var.name_hint)
+    mod[extern_var] = extern_func
+
+    main_input = relay.var("main_input", shape=ifm_shape, dtype=dtype)
+    call_extern_func = relay.Call(extern_var, [main_input])
+    main_func = relay.Function([main_input], call_extern_func, relay.TensorType(ofm_shape, dtype))
+    main_var = relay.GlobalVar("main")
+    mod[main_var] = main_func
+
+    mod = relay.transform.InferType()(mod)
+
+    error_regex = r"Difference on each side of a dimension should be either 0 or 1"
+
+    with pytest.raises(tvm.TVMError, match=error_regex):
+        mod = CMSISNNFusePads()(mod)
+
+
+@pytest.mark.parametrize(
+    "ifm_shape, pad_width, conv2d_padding, ofm_shape",
+    [
+        [(1, 25, 25, 12), ((0, 0), (0, 1), (1, 2), (0, 0)), (1, 1, 1, 1), (1, 26, 28, 2)],
+        [(1, 64, 100, 4), ((0, 0), (1, 1), (1, 1), (0, 0)), (0, 0, 0, 0), (1, 64, 100, 2)],
+        [(1, 55, 55, 3), ((0, 0), (2, 1), (3, 2), (0, 0)), (0, 0, 1, 1), (1, 57, 59, 2)],
+    ],
+)
+def test_pad_conv2d_fusion_noncmsisnn_target(ifm_shape, pad_width, conv2d_padding, ofm_shape):
+    """Tests the pads and conv2d fusion for non-cmsisnn targets.
+    It is expected that pad will not be fused with Conv2D in this case.
+    """
+    dtype = "int8"
+    kernel_size = (3, 3)
+    ofm_channels = 2
+    local_input = relay.var("local_input", shape=ifm_shape, dtype=dtype)
+    pad = relay.nn.pad(
+        local_input,
+        pad_width=pad_width,  # ((), (top, bottom), (left, right), ())
+        pad_value=10,
+        pad_mode="constant",
+    )
+    rng = np.random.default_rng(12321)
+    local_weight = tvm.nd.array(
+        rng.integers(
+            np.iinfo(dtype).min,
+            high=np.iinfo(dtype).max,
+            size=(ofm_channels, kernel_size[0], kernel_size[1], ifm_shape[3]),
+            dtype=dtype,
+        )
+    )
+    local_weight = relay.const(local_weight, dtype)
+    conv2d = relay.qnn.op.conv2d(
+        pad,
+        local_weight,
+        relay.const(1, "int32"),
+        relay.const(1, "int32"),
+        relay.const(1, "float32"),
+        relay.const(1, "float32"),
+        data_layout="NHWC",
+        kernel_layout="OHWI",
+        channels=ofm_channels,
+        kernel_size=(3, 3),
+        padding=conv2d_padding,
+        out_dtype="int32",
+    )
+    requantize = relay.qnn.op.requantize(
+        conv2d,
+        relay.const(1, "float32"),
+        relay.const(1, "int32"),
+        relay.const(1, "float32"),
+        relay.const(1, "int32"),
+        axis=0,
+        out_dtype=dtype,
+    )
+    local_func = relay.Function(relay.analysis.free_vars(requantize), requantize)
+    local_func = set_composite_func_attr(local_func, "noncmsis-nn.qnn_conv2d")
+
+    mod = tvm.IRModule()
+    ext_input = relay.var("ext_input", shape=ifm_shape, dtype=dtype)
+    call_local_func = relay.Call(local_func, [ext_input])
+    extern_func = relay.Function(relay.analysis.free_vars(call_local_func), call_local_func)
+    extern_var = relay.GlobalVar("external_function")
+    extern_func = set_external_func_attr(extern_func, "noncmsis-nn", extern_var.name_hint)
+    mod[extern_var] = extern_func
+
+    main_input = relay.var("main_input", shape=ifm_shape, dtype=dtype)
+    call_extern_func = relay.Call(extern_var, [main_input])
+    main_func = relay.Function([main_input], call_extern_func, relay.TensorType(ofm_shape, dtype))
+    main_var = relay.GlobalVar("main")
+    mod[main_var] = main_func
+
+    mod = relay.transform.InferType()(mod)
+
+    mod = CMSISNNFusePads()(mod)
+    pad_verifier = CheckForPadsWithinCompositeFunc()
+    pad_verifier.visit_function(mod[extern_var])
+    pad_verifier.assert_pads_within_func()
+
+
+@pytest.mark.parametrize(
+    "ifm_shape, pad_width, conv2d_padding, ofm_shape",
+    [
+        [(1, 25, 25, 12), ((0, 0), (0, 1), (1, 2), (0, 0)), (1, 1, 1, 1), (1, 26, 28, 2)],
+        [(1, 64, 100, 4), ((0, 0), (1, 1), (1, 1), (0, 0)), (0, 0, 0, 0), (1, 64, 100, 2)],
+        [(1, 55, 55, 3), ((0, 0), (2, 1), (3, 2), (0, 0)), (0, 0, 1, 1), (1, 57, 59, 2)],
+    ],
+)
+def test_pad_conv2d_fusion(ifm_shape, pad_width, conv2d_padding, ofm_shape):
+    """Tests the pads and conv2d fusion."""
+    dtype = "int8"
+    kernel_size = (3, 3)
+    ofm_channels = 2
+    local_input = relay.var("local_input", shape=ifm_shape, dtype=dtype)
+    pad = relay.nn.pad(
+        local_input,
+        pad_width=pad_width,  # ((), (top, bottom), (left, right), ())
+        pad_value=10,
+        pad_mode="constant",
+    )
+    rng = np.random.default_rng(12321)
+    local_weight = tvm.nd.array(
+        rng.integers(
+            np.iinfo(dtype).min,
+            high=np.iinfo(dtype).max,
+            size=(ofm_channels, kernel_size[0], kernel_size[1], ifm_shape[3]),
+            dtype=dtype,
+        )
+    )
+    local_weight = relay.const(local_weight, dtype)
+    conv2d = relay.qnn.op.conv2d(
+        pad,
+        local_weight,
+        relay.const(1, "int32"),
+        relay.const(1, "int32"),
+        relay.const(1, "float32"),
+        relay.const(1, "float32"),
+        data_layout="NHWC",
+        kernel_layout="OHWI",
+        channels=ofm_channels,
+        kernel_size=(3, 3),
+        padding=conv2d_padding,
+        out_dtype="int32",
+    )
+    requantize = relay.qnn.op.requantize(
+        conv2d,
+        relay.const(1, "float32"),
+        relay.const(1, "int32"),
+        relay.const(1, "float32"),
+        relay.const(1, "int32"),
+        axis=0,
+        out_dtype=dtype,
+    )
+    local_func = relay.Function(relay.analysis.free_vars(requantize), requantize)
+    local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_conv2d")
+
+    mod = tvm.IRModule()
+    ext_input = relay.var("ext_input", shape=ifm_shape, dtype=dtype)
+    call_local_func = relay.Call(local_func, [ext_input])
+    extern_func = relay.Function(relay.analysis.free_vars(call_local_func), call_local_func)
+    extern_var = relay.GlobalVar("external_function")
+    extern_func = set_external_func_attr(extern_func, "cmsis-nn", extern_var.name_hint)
+    mod[extern_var] = extern_func
+
+    main_input = relay.var("main_input", shape=ifm_shape, dtype=dtype)
+    call_extern_func = relay.Call(extern_var, [main_input])
+    main_func = relay.Function([main_input], call_extern_func, relay.TensorType(ofm_shape, dtype))
+    main_var = relay.GlobalVar("main")
+    mod[main_var] = main_func
+
+    mod = relay.transform.InferType()(mod)
+
+    mod = CMSISNNFusePads()(mod)
+    pad_verifier = CheckForPadsWithinCompositeFunc()
+    pad_verifier.visit_function(mod[extern_var])
+    pad_verifier.assert_no_pads_within_func()
+
+
+def test_without_preceding_pad():
+    """Tests the pass FusePads when padding is not present before qnn.conv2d."""
+    dtype = "int8"
+    ifm_shape = (1, 56, 56, 64)
+    ofm_shape = (1, 56, 56, 64)
+    local_input = relay.var("local_input", shape=ifm_shape, dtype=dtype)
+    rng = np.random.default_rng(12321)
+    local_weight = tvm.nd.array(
+        rng.integers(
+            np.iinfo(dtype).min,
+            high=np.iinfo(dtype).max,
+            size=(64, 3, 3, 64),
+            dtype=dtype,
+        )
+    )
+    local_weight = relay.const(local_weight, dtype)
+    conv2d = relay.qnn.op.conv2d(
+        local_input,
+        local_weight,
+        relay.const(1, "int32"),
+        relay.const(1, "int32"),
+        relay.const(1, "float32"),
+        relay.const(1, "float32"),
+        data_layout="NHWC",
+        kernel_layout="OHWI",
+        channels=64,
+        kernel_size=(3, 3),
+        padding=(1, 1, 1, 1),
+        out_dtype="int32",
+    )
+    requantize = relay.qnn.op.requantize(
+        conv2d,
+        relay.const(1, "float32"),
+        relay.const(1, "int32"),
+        relay.const(1, "float32"),
+        relay.const(1, "int32"),
+        axis=0,
+        out_dtype=dtype,
+    )
+    relu = relay.nn.relu(requantize)
+    local_func = relay.Function(relay.analysis.free_vars(relu), relu)
+    local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_conv2d")
+
+    mod = tvm.IRModule()
+    ext_input = relay.var("ext_input", shape=ifm_shape, dtype=dtype)
+    call_local_func = relay.Call(local_func, [ext_input])
+    extern_func = relay.Function(relay.analysis.free_vars(call_local_func), call_local_func)
+    extern_var = relay.GlobalVar("external_function")
+    extern_func = set_external_func_attr(extern_func, "cmsis-nn", extern_var.name_hint)
+    mod[extern_var] = extern_func
+
+    main_input = relay.var("main_input", shape=ifm_shape, dtype=dtype)
+    call_extern_func = relay.Call(extern_var, [main_input])
+    main_func = relay.Function(relay.analysis.free_vars(call_extern_func), call_extern_func)
+    main_func = relay.Function([main_input], call_extern_func, relay.TensorType(ofm_shape, dtype))
+    main_var = relay.GlobalVar("main")
+    mod[main_var] = main_func
+
+    mod = relay.transform.InferType()(mod)
+
+    mod = CMSISNNFusePads()(mod)
+    pad_verifier = CheckForPadsWithinCompositeFunc()
+    pad_verifier.visit_function(mod[extern_var])
+    pad_verifier.assert_no_pads_within_func()
diff --git a/tests/python/contrib/test_cmsisnn/utils.py b/tests/python/contrib/test_cmsisnn/utils.py
index d36ec4219a0e..9fdb89289aff 100644
--- a/tests/python/contrib/test_cmsisnn/utils.py
+++ b/tests/python/contrib/test_cmsisnn/utils.py
@@ -50,8 +50,19 @@ def visit_call(self, call):
     return counter.count
 
 
-def assert_partitioned_function(orig_mod, cmsisnn_mod):
-    """If kCompiler attribute is missing, this function raises assertion"""
+def assert_partitioned_function(orig_mod, cmsisnn_mod, expected_ops_unchanged=True):
+    """
+    if KCompiler attribute is missing, this function raises an assertion.
+
+    Parameters
+    ----------
+    orig_mod : IRModule
+        Pre-partitioning module
+    cmsisnn_mod : IRModule
+        Post-partitioning module
+    is_num_calls_same: bool
+        Are number of CallNode(s) before and after partitioning expected to be the same
+    """
     attrs = [
         cmsisnn_mod[var.name_hint].attrs
         for var in cmsisnn_mod.get_global_vars()
@@ -64,9 +75,10 @@ def assert_partitioned_function(orig_mod, cmsisnn_mod):
     ]
     assert any(compilers), "Module does not contain function for cmsisnn target."
 
-    assert count_num_calls(orig_mod) == count_num_calls(
-        cmsisnn_mod
-    ), "Number of calls changed during partitioning"
+    if expected_ops_unchanged:
+        assert count_num_calls(orig_mod) == count_num_calls(
+            cmsisnn_mod
+        ), "Number of calls changed during partitioning"
 
 
 def assert_no_external_function(mod):
@@ -228,6 +240,29 @@ def make_qnn_relu(expr, fused_activation_fn, scale, zero_point, dtype):
     raise ValueError("Invalid argument provided with fused_activation_fn")
 
 
+class CheckForPadsWithinCompositeFunc(tvm.relay.ExprVisitor):
+    """Provides method to test number of pads present inside the function being visited."""
+
+    def __init__(self):
+        super().__init__()
+        self.num_pads_ = 0
+
+    def visit_call(self, call):
+        super().visit_call(call)
+        if (
+            isinstance(call, tvm.relay.Call)
+            and isinstance(call.op, tvm.ir.op.Op)
+            and call.op.name == "nn.pad"
+        ):
+            self.num_pads_ += 1
+
+    def assert_no_pads_within_func(self):
+        assert self.num_pads_ == 0, "CMSIS-NN composite function should not have pads."
+
+    def assert_pads_within_func(self):
+        assert self.num_pads_ > 0, "Composite function should have pads within it."
+
+
 def create_test_runner(compiler_cpu="cortex-m55", cpu_flags=""):
     """
     Creates AOT test runner for CMSIS-NN tests.

From d27167838888ca79eb53cd16449ae4483c9b6249 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Tue, 23 Aug 2022 14:03:04 +0100
Subject: [PATCH 010/704] [CI][AArch64] Skip libgomp failures in integration
 tests (#12554)

Some integration tests are failing when running in CI machines that
have torch installed (validated only in AARch64 for now), with an
error message related to libgomp, similar to the one above:

OSError: /.../dist-packages/torch/lib/libgomp-d22c30c5.so.1: cannot
allocate memory in static TLS block

As part of enabling the integration tests in AArch64, I'm marking this
tests as skipped, so that tests can start executing and don't regress
while we take time to investigate these specific failures.
---
 tests/python/driver/tvmc/test_autotuner.py | 9 +++++++++
 tests/python/driver/tvmc/test_frontends.py | 9 +++++++++
 tests/python/driver/tvmc/test_model.py     | 4 ++++
 3 files changed, 22 insertions(+)

diff --git a/tests/python/driver/tvmc/test_autotuner.py b/tests/python/driver/tvmc/test_autotuner.py
index 66017823a669..7c05ff804fa4 100644
--- a/tests/python/driver/tvmc/test_autotuner.py
+++ b/tests/python/driver/tvmc/test_autotuner.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import platform
 import pytest
 import os
 
@@ -73,6 +74,10 @@ def test_get_tuning_tasks(onnx_mnist):
     assert all([type(x) is expected_task_type for x in sut]) is True
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
+)
 def test_tune_tasks__tuner__xgb(onnx_mnist, tmpdir_factory):
     pytest.importorskip("onnx")
 
@@ -141,6 +146,10 @@ def test_tune_tasks__tuner__xgb__no_early_stopping(onnx_mnist, tmpdir_factory):
     _tuner_test_helper(onnx_mnist, "xgb", tmpdir_name, early_stopping=None)
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
+)
 def test_tune_tasks__tuner__xgb__no_tuning_records(onnx_mnist, tmpdir_factory):
     pytest.importorskip("onnx")
 
diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py
index 0cd02181ac40..98659b05ae5c 100644
--- a/tests/python/driver/tvmc/test_frontends.py
+++ b/tests/python/driver/tvmc/test_frontends.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import platform
 import pytest
 import builtins
 import importlib
@@ -74,6 +75,10 @@ def test_guess_frontend_onnx():
     assert type(sut) is tvmc.frontends.OnnxFrontend
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
+)
 def test_guess_frontend_pytorch():
     # some CI environments wont offer pytorch, so skip in case it is not present
     pytest.importorskip("torch")
@@ -245,6 +250,10 @@ def test_load_model__pth(pytorch_resnet18):
     assert "layer1.0.conv1.weight" in tvmc_model.params.keys()
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
+)
 def test_load_quantized_model__pth(pytorch_mobilenetv2_quantized):
     # some CI environments wont offer torch, so skip in case it is not present
     pytest.importorskip("torch")
diff --git a/tests/python/driver/tvmc/test_model.py b/tests/python/driver/tvmc/test_model.py
index 74c1c4ded8a4..fb1f718c1bed 100644
--- a/tests/python/driver/tvmc/test_model.py
+++ b/tests/python/driver/tvmc/test_model.py
@@ -55,6 +55,10 @@ def test_tvmc_workflow(use_vm, keras_simple):
     assert "output_0" in result.outputs.keys()
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
+)
 @pytest.mark.parametrize("use_vm", [True, False])
 def test_save_load_model(use_vm, keras_simple, tmpdir_factory):
     pytest.importorskip("onnx")

From ff46fa15e063ef499f666e63b9d5ed3faf2e3bfb Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Tue, 23 Aug 2022 14:28:12 +0100
Subject: [PATCH 011/704] [ETHOSN] Fix requantize output conversion (#12540)

Fixes a small issue when converting the output information to the support library API. The `requantize_info` output datatype needed updating with the output datatype from the relay function to ensure the graph is compiled correctly by the support library. Included a test to prevent regression in the future.
---
 .../backend/contrib/ethosn/ethosn_api.cc      | 22 ++++---
 .../contrib/test_ethosn/test_requantize.py    | 63 +++++++++++++++++++
 2 files changed, 75 insertions(+), 10 deletions(-)

diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc
index c828762096d6..55d0b57bcc2f 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.cc
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc
@@ -678,11 +678,17 @@ EthosnError EthosnAPI::Relu(const Expr& expr, ReluParams* params) {
 
 EthosnError EthosnAPI::Requantize(const Expr& expr, RequantizeParams* params) {
   Call call = Downcast<Call>(expr);
-  const auto* input_dtype = call->args[0]->checked_type().as<TensorTypeNode>();
+  const auto* input_ttype = call->args[0]->checked_type().as<TensorTypeNode>();
   sl::TensorShape input_tensor_shape = {1, 1, 1, 1};
   sl::DataType input_data_type;
-  EthosnError err = Tvm2Npu(input_dtype->shape, &input_tensor_shape);
-  err += Tvm2Npu(input_dtype->dtype, &input_data_type);
+  EthosnError err = Tvm2Npu(input_ttype->shape, &input_tensor_shape);
+  err += Tvm2Npu(input_ttype->dtype, &input_data_type);
+
+  const auto* output_ttype = call->checked_type().as<TensorTypeNode>();
+  sl::TensorShape output_tensor_shape = {1, 1, 1, 1};
+  sl::DataType output_data_type;
+  err += Tvm2Npu(output_ttype->shape, &output_tensor_shape);
+  err += Tvm2Npu(output_ttype->dtype, &output_data_type);
 
   float input_sc, output_sc;
   int input_zp, output_zp;
@@ -699,14 +705,10 @@ EthosnError EthosnAPI::Requantize(const Expr& expr, RequantizeParams* params) {
   sl::QuantizationInfo requantize_q_info;
   err += Tvm2Npu(output_zp, output_sc, &requantize_q_info);
   params->requantize_info = sl::RequantizeInfo(requantize_q_info);
+  params->requantize_info.m_OutputDataType = output_data_type;
 
-  sl::TensorInfo output_info = params->input_info;
-  output_info.m_QuantizationInfo = params->requantize_info.m_OutputQuantizationInfo;
-  if (params->requantize_info.m_OutputDataType.has_value()) {
-    output_info.m_DataType = params->requantize_info.m_OutputDataType.value();
-  }
-  params->output_info = output_info;
-
+  params->output_info = sl::TensorInfo(output_tensor_shape, output_data_type, sl::DataFormat::NHWC,
+                                       requantize_q_info);
   return err;
 }
 
diff --git a/tests/python/contrib/test_ethosn/test_requantize.py b/tests/python/contrib/test_ethosn/test_requantize.py
index 4626a0d92bc1..e20c3beeabfa 100644
--- a/tests/python/contrib/test_ethosn/test_requantize.py
+++ b/tests/python/contrib/test_ethosn/test_requantize.py
@@ -68,6 +68,69 @@ def test_requantize(in_dtype, out_dtype, shape):
     tei.verify(outputs, out_dtype, 1)
 
 
+@requires_ethosn
+def test_requantize_mixed_precision_with_following_op():
+    """
+    Checks a requantize operation that changes precision from uint8 to int8 with a
+    following add op.
+    """
+    np.random.seed(0)
+    shape = (1, 4, 6, 8)
+    in_sc = 0.012566
+    in_zp = 131
+    out_sc = 0.012566
+    out_zp = 3
+    in_dtype = "uint8"
+    out_dtype = "int8"
+
+    def get_model():
+        a = relay.var("a", shape=shape, dtype=in_dtype)
+        b = relay.var("b", shape=shape, dtype=out_dtype)
+        req = relay.qnn.op.requantize(
+            data=a,
+            input_scale=relay.const(in_sc, "float32"),
+            input_zero_point=relay.const(in_zp, "int32"),
+            output_scale=relay.const(out_sc, "float32"),
+            output_zero_point=relay.const(out_zp, "int32"),
+            out_dtype=out_dtype,
+        )
+        req = relay.qnn.op.add(
+            req,
+            b,
+            lhs_scale=relay.const(out_sc, "float32"),
+            lhs_zero_point=relay.const(out_zp, "int32"),
+            rhs_scale=relay.const(out_sc, "float32"),
+            rhs_zero_point=relay.const(out_zp, "int32"),
+            output_scale=relay.const(out_sc, "float32"),
+            output_zero_point=relay.const(out_zp, "int32"),
+        )
+        return req
+
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(
+                low=np.iinfo(in_dtype).min, high=np.iinfo(in_dtype).max, size=shape, dtype=in_dtype
+            )
+        ),
+        "b": tvm.nd.array(
+            np.random.randint(
+                low=np.iinfo(out_dtype).min,
+                high=np.iinfo(out_dtype).max,
+                size=shape,
+                dtype=out_dtype,
+            )
+        ),
+    }
+    outputs = []
+    for npu in [False, True]:
+        model = get_model()
+        mod = tei.make_module(model, {})
+        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu)
+        outputs.append(x)
+
+    tei.verify(outputs, out_dtype, 1)
+
+
 @requires_ethosn
 def test_requantize_failure():
     input_sc = 0.8

From dd7ae2d3e5a7e169021e75d0c9d0f6a8cc477a9c Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Tue, 23 Aug 2022 09:51:04 -0600
Subject: [PATCH 012/704] [Relay] Add Rsqrt to SimplifyExpr (#12363)

* Add Rsqrt to SimplifyExpr

* fix unit tests
---
 python/tvm/relay/op/_tensor.py                |  1 +
 python/tvm/relay/op/contrib/dnnl.py           |  3 ++-
 src/relay/transforms/simplify_expr.cc         | 24 +++++++++++++++++++
 tests/python/relay/test_pass_simplify_expr.py | 19 +++++++++++++++
 4 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 37cb263c489d..a04199f6a5b1 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -292,6 +292,7 @@ def elemwise_shape_func(attrs, inputs, _):
 register_shape_func("right_shift", False, broadcast_shape_func)
 
 register_shape_func("sqrt", False, elemwise_shape_func)
+register_shape_func("rsqrt", False, elemwise_shape_func)
 register_shape_func("negative", False, elemwise_shape_func)
 register_shape_func("exp", False, elemwise_shape_func)
 register_shape_func("tan", False, elemwise_shape_func)
diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py
index 4ef342a26b0b..f7752e41b056 100644
--- a/python/tvm/relay/op/contrib/dnnl.py
+++ b/python/tvm/relay/op/contrib/dnnl.py
@@ -856,7 +856,8 @@ def __init__(self):
         added_eps = is_op("add")(mp1, eps)
         deno = is_op("sqrt")(added_eps)
         div_out = is_op("divide")(diff, deno)
-        weighted = is_op("multiply")(div_out, self.gamma)
+        div_out2 = diff * is_op("rsqrt")(added_eps)
+        weighted = is_op("multiply")(div_out | div_out2, self.gamma)
         added_bias = is_op("add")(weighted, self.beta)
         self.pattern = added_bias
 
diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index 04d0edb26d75..a6751933a88c 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -685,6 +685,29 @@ class SimplifyConsecutiveAdd : public DFPatternRewrite {
   DFPattern const2_;
 };
 
+class SimplifyRSqrt : public DFPatternRewrite {
+ public:
+  SimplifyRSqrt() {
+    x_ = IsWildcard();
+    numerator_ = IsWildcard();
+    auto sqrt = IsOp("sqrt");
+    pattern_ = IsOp("divide")({numerator_, sqrt({x_})});
+  }
+
+  Expr Callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
+    static const Op& op = Op::Get("rsqrt");
+    auto x = node_map[x_][0];
+    auto numerator = node_map[numerator_][0];
+    return Call(Op::Get("multiply"), {numerator, Call(op, {x})});
+  }
+
+ private:
+  /*! \brief Pattern input */
+  DFPattern x_;
+  DFPattern numerator_;
+};
+
 Expr SimplifyExpr(const Expr& expr, const IRModule& mod) {
   // the rewrites will be applied in the given order, and repeated until fixed point
   DFPatternRewriteComposer composer;
@@ -694,6 +717,7 @@ Expr SimplifyExpr(const Expr& expr, const IRModule& mod) {
   composer.AddRewrite<ConcretizeReshapeLikeRewrite>();
   composer.AddRewrite<ConcretizeCollapseSumLikeRewrite>();
   composer.AddRewrite<ConcretizeBroadcastToLikeRewrite>();
+  composer.AddRewrite<SimplifyRSqrt>();
   composer.AddRewrite<EliminateIdentityRewrite>();
   composer.AddRewrite<SimplifyReshape>();
   composer.AddRewrite<SimplifyTranspose>();
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index 162ac6e73ddb..837b15a48dc1 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -584,5 +584,24 @@ def expected():
     assert tvm.ir.structural_equal(zzl, after)
 
 
+def test_simplify_rsqrt():
+    shape = (32, 1, 1)
+    x = relay.var("x", shape=shape, dtype="float32")
+
+    def before(c):
+        return relay.const(c) / relay.sqrt(x)
+
+    def expected(c):
+        if c == 1:
+            return relay.rsqrt(x)
+        else:
+            return relay.const(c) * relay.rsqrt(x)
+
+    for c in [1.0, 2.0, 2.5]:
+        opt = run_opt_pass(before(c), transform.SimplifyExpr())
+        after = run_opt_pass(expected(c), transform.InferType())
+        assert tvm.ir.structural_equal(opt, after)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From da5836f230525afe8984dcbfea8ee788a6286b5c Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Tue, 23 Aug 2022 23:32:56 +0700
Subject: [PATCH 013/704] [AutoTVM] Add support for text buffers to
 ApplyHistoryBest (#12521)

Currently, AutoTVM's ApplyHistoryBest class does not support loading tuning logs from memory. This is a pet peeve of mine, as it requires you to work with a tempfile whenever writing autotuning tests. This is also just strange, as the rest of AutoTVM has support for text buffers (e.g. tvm.autotvm.callback.log_to_file supports passing in a text buffer, letting us write to but not read from them).

Additionally, ApplyHistoryBest handles input arguments very unintuitively. Before this PR, it allowed users to pass string filepaths, a list of string filepaths, or an Iterable (such as a list) of input and result tuples. However, it did not support taking in StringIO objects as mentioned above, nor pathlib.Path objects, nor combinations of a filepath and an Iterable of tuples.

In a perfect world, we would change ApplyHistoryBest to take as input a path-like object, file-like object, or an Iterable of input and result tuples (similar to what ApplyGraphBest takes as an argument). However, this would break the existing functionality to take as input a list of filepaths.

To be backwards compatible, while fixing this issue, this pull request defines a new type inside dispatcher.py:

Records = Union[
    Union[str, bytes, Path],  # Path-like objects
    TextIOBase,  # File-like objects
    Iterable[Tuple[MeasureInput, MeasureResult]],
]
It then rewrites ApplyHistoryBest.load so it takes the following arguments:

def load(self, records: Union[Records, Iterable[Records]]):
This PR also adds unit tests for this new functionality, and fixes a relevant bug in tests/micro/common/test_autotune.py in which a StringIO object was passed to apply_history_best, causing it to appear to pass but not actually read any data.
---
 python/tvm/autotvm/record.py                 | 31 ++++++-
 python/tvm/autotvm/task/dispatcher.py        | 87 ++++++++++--------
 tests/micro/common/test_autotune.py          |  1 +
 tests/python/unittest/test_autotvm_record.py | 92 ++++++++++++++++----
 4 files changed, 155 insertions(+), 56 deletions(-)

diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py
index b2faee243be0..8e54e011c0b7 100644
--- a/python/tvm/autotvm/record.py
+++ b/python/tvm/autotvm/record.py
@@ -20,10 +20,12 @@
 
 import argparse
 import base64
+from io import TextIOBase
 import logging
 import pickle
 import json
 import time
+from typing import Union
 import os
 import itertools
 from collections import OrderedDict
@@ -194,20 +196,41 @@ def clean_json_to_python(x):
     raise RuntimeError("Invalid log protocol: " + protocol)
 
 
-def load_from_file(filename):
-    """Generator: load records from file.
+def load_from_buffer(file: TextIOBase):
+    """Generator: load records from buffer.
     This is a generator that yields the records.
 
     Parameters
     ----------
-    filename: str
+    file: io.TextIOBase
 
     Yields
     ------
     input: autotvm.measure.MeasureInput
     result: autotvm.measure.MeasureResult
     """
-    with open(filename) as f:
+    for row in file:
+        if row and not row.startswith("#"):
+            ret = decode(row)
+            if ret is None:
+                continue
+            yield ret
+
+
+def load_from_file(filepath: Union[str, bytes, os.PathLike]):
+    """Generator: load records from path.
+    This is a generator that yields the records.
+
+    Parameters
+    ----------
+    filepath: str, bytes, or os.PathLike
+
+    Yields
+    ------
+    input: autotvm.measure.MeasureInput
+    result: autotvm.measure.MeasureResult
+    """
+    with open(filepath) as f:
         for row in f:
             if row and not row.startswith("#"):
                 ret = decode(row)
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index 11a608d4cbbf..8b2e7eb01fe6 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -30,18 +30,26 @@
 
 from __future__ import absolute_import as _abs
 
+from io import TextIOBase
 import logging
-import typing
-from typing import Union
-from collections.abc import Iterable
+from os import PathLike
+from pathlib import Path
+from typing import List, Iterable, Tuple, Union
 
 import numpy as np
 
 from .space import FallbackConfigEntity
 from .. import env as _env
+from ..measure import MeasureInput, MeasureResult
 
 logger = logging.getLogger("autotvm")
 
+Records = Union[
+    Union[str, bytes, Path],  # Path-like objects
+    TextIOBase,  # File-like objects
+    Iterable[Tuple[MeasureInput, MeasureResult]],
+]
+
 
 class DispatchContext(object):
     """
@@ -194,7 +202,7 @@ class ApplyFixedConfig(DispatchContext):
         Name of schedules to use.
     """
 
-    def __init__(self, tasks, schedule_names: Union[str, typing.List[str]]):
+    def __init__(self, tasks, schedule_names: Union[str, List[str]]):
         super(ApplyFixedConfig, self).__init__()
         if isinstance(schedule_names, str):
             self._schedule_names = list(schedule_names)
@@ -238,15 +246,15 @@ class ApplyHistoryBest(DispatchContext):
 
     Parameters
     ----------
-    records : str, list of str, or iterator of (autotvm.measure.MeasureInput,\
-                                                autotvm.measure.MeasureResult)
-        Collection of tuning records.
-        If is str, then it should be the filename of a records log file.
-        Each row of this file is an encoded record pair. If it is a list, it can either be
-        a list of paths to log files that will be loaded jointly or an iterator or records.
+    records : None, Records, or iterator of Records objects, where a
+              Records object is a path-like object, a file-like object,
+              or an iterator of (MeasureInput, MeasureResult).
+
+        Collection of tuning records. If multiple Records objects are passed, their
+        contents will be merged.
     """
 
-    def __init__(self, records):
+    def __init__(self, records: Union[None, Records, Iterable[Records]]):
         super(ApplyHistoryBest, self).__init__()
 
         self.best_by_targetkey = {}
@@ -256,46 +264,48 @@ def __init__(self, records):
         if records:
             self.load(records)
 
-    def load(self, records):
+    def load(self, records: Union[Records, Iterable[Records]]):
         """Load records to this dispatch context
 
         Parameters
         ----------
         records : str, list of str, or iterator of (autotvm.measure.MeasureInput,\
                                                     autotvm.measure.MeasureResult)
-            Collection of tuning records.
-            If is str, then it should be the filename of a records log file.
-            Each row of this file is an encoded record pair. If it is a list
-            it can either be a list of paths to logs that will be loaded jointly or
-            an iterator of measurement results.
+
+            Collection of tuning records. If multiple Records objects are passed, their
+            contents will be merged.
         """
         # pylint: disable=import-outside-toplevel
-        from pathlib import Path
-        from ..record import load_from_file
+        from ..record import load_from_file, load_from_buffer
 
-        joint_records = []
-        if not isinstance(records, Iterable) or isinstance(records, str):
-            records = [records]
+        def _unpack_records(
+            records: Union[Records, Iterable[Records]]
+        ) -> List[Tuple[MeasureInput, MeasureResult]]:
 
-        for rec in records:
-            if isinstance(rec, Path):
-                rec = str(rec)
+            if isinstance(records, (str, bytes, PathLike)):
+                return load_from_file(records)
 
-            if isinstance(rec, str):
-                rec = load_from_file(rec)
-                joint_records += rec
-            else:
-                if rec is not None:
-                    joint_records.append(rec)
+            if isinstance(records, TextIOBase):
+                return load_from_buffer(records)
 
-        if not joint_records:
+            joint_records = []
+            for record in records:
+                if isinstance(record, Tuple) and isinstance(record[0], MeasureInput):
+                    joint_records.append(record)
+                else:
+                    joint_records += _unpack_records(record)
+
+            return joint_records
+
+        flattened_records = _unpack_records(records)
+        if not flattened_records:
             return
 
         best_by_targetkey = self.best_by_targetkey
         best_by_model = self.best_by_model
 
         counter = 0
-        for inp, res in joint_records:
+        for inp, res in flattened_records:
             counter += 1
             if res.error_no != 0:
                 continue
@@ -447,7 +457,7 @@ class ApplyGraphBest(DispatchContext):
     node index.
     """
 
-    def __init__(self, records):
+    def __init__(self, records: Records):
         """
         Parameters
         ----------
@@ -458,11 +468,16 @@ def __init__(self, records):
             Otherwise, it is an iterator.
         """
         # pylint: disable=import-outside-toplevel
-        from ..record import load_from_file
+        from ..record import load_from_file, load_from_buffer
 
         super(ApplyGraphBest, self).__init__()
-        if isinstance(records, str):
+        if isinstance(records, (str, bytes, PathLike)):
             records = load_from_file(records)
+        elif isinstance(records, TextIOBase):
+            records = load_from_buffer(records)
+        else:
+            records = list(records)
+
         self._records = list(records)
         self._counter = 0
         self._global_cfg_dict = {}
diff --git a/tests/micro/common/test_autotune.py b/tests/micro/common/test_autotune.py
index b79260dd46ed..46f6d8889a9a 100644
--- a/tests/micro/common/test_autotune.py
+++ b/tests/micro/common/test_autotune.py
@@ -61,6 +61,7 @@ def test_kws_autotune_workflow(platform, board, tmp_path):
     assert logs[0]["config"]["entity"] != logs[1]["config"]["entity"]
 
     # Compile the best model with AOT and connect to it
+    str_io_logs.seek(0)
     with tvm.micro.testing.create_aot_session(
         platform,
         board,
diff --git a/tests/python/unittest/test_autotvm_record.py b/tests/python/unittest/test_autotvm_record.py
index 147122ff10d6..693810d3f979 100644
--- a/tests/python/unittest/test_autotvm_record.py
+++ b/tests/python/unittest/test_autotvm_record.py
@@ -15,10 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 """test the correctness of dump and load of data log"""
+from io import StringIO
+from os import PathLike
 import time
 
-import tvm
-from tvm import te
 from tvm.contrib import utils
 
 from tvm import autotvm
@@ -78,23 +78,83 @@ def test_file_io():
     assert str(x) == str(inputs[0][2])
 
 
-def test_apply_history_best():
+def test_apply_history_best(tmpdir):
     tsk, target = get_sample_task()
+    best = str(tsk.config_space.get(2))
 
-    records = [
-        (MeasureInput(target, tsk, tsk.config_space.get(0)), MeasureResult((0.1,), 0, 2.3, 0)),
-        (MeasureInput(target, tsk, tsk.config_space.get(1)), MeasureResult((0.3,), 0, 2.3, 0)),
-        (MeasureInput(target, tsk, tsk.config_space.get(2)), MeasureResult((0.01,), 0, 2.3, 0)),
-        (MeasureInput(target, tsk, tsk.config_space.get(4)), MeasureResult((0.4,), 0, 2.3, 0)),
-    ]
-    hist_best = ApplyHistoryBest(records)
-    x = hist_best.query(target, tsk.workload)
-    assert str(x) == str(tsk.config_space.get(2))
+    inputs_batch_1 = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(3)]
+    results_batch_1 = [MeasureResult((i,), 0, 0, 0) for i in range(1, 3)]
+    results_batch_1.append(MeasureResult((0.5,), 0, 2.3, 0))
 
-    # Confirm same functionality for iterators.
-    hist_best = ApplyHistoryBest(iter(records))
-    x = hist_best.query(target, tsk.workload)
-    assert str(x) == str(tsk.config_space.get(2))
+    # Write data out to file
+    filepath_batch_1 = tmpdir / "batch_1.log"
+    with open(filepath_batch_1, "w") as file:
+        autotvm.callback.log_to_file(file)(None, inputs_batch_1, results_batch_1)
+
+    # Load best results from Path
+    assert isinstance(filepath_batch_1, PathLike)
+    hist_best = ApplyHistoryBest(filepath_batch_1)
+    assert str(hist_best.query(target, tsk.workload)) == best
+
+    # Load best results from str(Path)
+    hist_best = ApplyHistoryBest(str(filepath_batch_1))
+    assert str(hist_best.query(target, tsk.workload)) == best
+
+    # Write data into StringIO buffer
+    stringio_batch_1 = StringIO()
+    assert isinstance(filepath_batch_1, PathLike)
+    callback = autotvm.callback.log_to_file(stringio_batch_1)
+    callback(None, inputs_batch_1, results_batch_1)
+    stringio_batch_1.seek(0)
+
+    # Load best results from strIO
+    hist_best = ApplyHistoryBest(stringio_batch_1)
+    assert str(hist_best.query(target, tsk.workload)) == best
+
+    # Load best result from list of tuples (MeasureInput, MeasureResult)
+    hist_best = ApplyHistoryBest(list(zip(inputs_batch_1, results_batch_1)))
+    assert str(hist_best.query(target, tsk.workload)) == best
+
+    # Same thing, but iterable instead of list (i.e. no subscripting)
+    hist_best = ApplyHistoryBest(zip(inputs_batch_1, results_batch_1))
+    assert str(hist_best.query(target, tsk.workload)) == best
+
+
+def test_apply_history_best_multiple_batches(tmpdir):
+    tsk, target = get_sample_task()
+    best = str(tsk.config_space.get(2))
+
+    inputs_batch_1 = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(2)]
+    results_batch_1 = [MeasureResult((i,), 0, 0, 0) for i in range(1, 3)]
+    filepath_batch_1 = tmpdir / "batch_1.log"
+    with open(filepath_batch_1, "w") as file:
+        autotvm.callback.log_to_file(file)(None, inputs_batch_1, results_batch_1)
+
+    inputs_batch_2 = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(2, 4)]
+    results_batch_2 = [MeasureResult((0.5,), 0, 0, 0), MeasureResult((3,), 0, 0, 0)]
+    filepath_batch_2 = tmpdir / "batch_2.log"
+    with open(filepath_batch_2, "w") as file:
+        autotvm.callback.log_to_file(file)(None, inputs_batch_2, results_batch_2)
+
+    # Check two Path filepaths works
+    hist_best = ApplyHistoryBest([filepath_batch_1, filepath_batch_2])
+    assert str(hist_best.query(target, tsk.workload)) == best
+
+    # Check that an arbitrary Iterable of Paths works
+    # Calling zip() on a single list gives a non-subscriptable Iterable
+    hist_best = ApplyHistoryBest(zip([filepath_batch_1, filepath_batch_2]))
+    assert str(hist_best.query(target, tsk.workload)) == best
+
+    # Check that Iterable of Iterable of tuples is correctly merged
+    hist_best = ApplyHistoryBest(
+        zip(
+            [
+                zip(inputs_batch_1, results_batch_1),
+                zip(inputs_batch_2, results_batch_2),
+            ]
+        )
+    )
+    assert str(hist_best.query(target, tsk.workload)) == best
 
 
 if __name__ == "__main__":

From 1d71c1b4aad72843540f897e27c01aa73256a463 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 23 Aug 2022 11:08:55 -0700
Subject: [PATCH 014/704] [skip ci][ci] Mark more ethosu tests with xfail
 (#12560)

See #12511 for context. Since more parameterizations are popping up as
failed, this disables whole tests rather than specific combinations of
parameters.
---
 .../python/contrib/test_ethosu/test_codegen.py  | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py
index 5b4643edb4a0..ae7d0821bb7f 100644
--- a/tests/python/contrib/test_ethosu/test_codegen.py
+++ b/tests/python/contrib/test_ethosu/test_codegen.py
@@ -347,9 +347,7 @@ def binary_elementwise(lhs, rhs):
         ([1, 4, 4], [4, 1]),
     ],
 )
-@tvm.testing.xfail_parameterizations(
-    "ifm_shape0-ifm2_shape0-ethos-u55-64", reason="See https://github.com/apache/tvm/issues/12511"
-)
+@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511")
 def test_binary_add_with_non_4d_shapes(
     request,
     accel_type,
@@ -608,9 +606,7 @@ def rounding_right_shift(lhs, rhs):
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
 @pytest.mark.parametrize("ifm_shape", [(3, 2), (1, 15, 11, 7), (3, 1, 12), (400,)])
 @pytest.mark.parametrize("ifm_scale, ifm_zp, ofm_scale, ofm_zp", [(1, 0, 1, 0), (0.015, 3, 0.2, 5)])
-@tvm.testing.xfail_parameterizations(
-    "1-0-1-0-ifm_shape3-ethos-u55-128", reason="See https://github.com/apache/tvm/issues/12511"
-)
+@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511")
 def test_ethosu_identity_codegen(
     request, ifm_shape, ifm_scale, ifm_zp, ofm_scale, ofm_zp, accel_type
 ):
@@ -659,6 +655,7 @@ def generate_output_data(input_data):
         ((8, 7, 3), (-4, 1, 8, -2)),
     ],
 )
+@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511")
 def test_relay_reshape_codegen(ifm_shape, new_shape, accel_type):
     np.random.seed(0)
 
@@ -691,9 +688,7 @@ def create_model():
         ([5000], [123], [2151]),
     ],
 )
-@tvm.testing.xfail_parameterizations(
-    "ifm_shape3-begin3-size3-ethos-u55-32", reason="See https://github.com/apache/tvm/issues/12511"
-)
+@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511")
 def test_tflite_slice(request, accel_type, ifm_shape, begin, size):
     np.random.seed(0)
 
@@ -729,9 +724,7 @@ def strided_slice_func(x):
     "ifm_shape",
     [[1, 5, 12, 4], [1, 1, 2], [4, 3, 2], [10, 20], [345]],
 )
-@tvm.testing.xfail_parameterizations(
-    "ifm_shape4-ABS-ethos-u55-64", reason="See https://github.com/apache/tvm/issues/12511"
-)
+@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511")
 def test_ethosu_unary_elementwise(
     request,
     accel_type,

From 99b9b74b12d8687966c8d009a9a0cfca6f36defc Mon Sep 17 00:00:00 2001
From: Elen Kalda <elen.kalda@arm.com>
Date: Tue, 23 Aug 2022 20:49:52 +0100
Subject: [PATCH 015/704] [CI] Remove Vela from ci_cpu (#12533)

While the dependencies for microNPU and CMSIS-NN moved into ci_cortexm,
Vela is still installed in ci_cpu. As a result, we have some of the microNPU tests outside of
test_ethosu folder failing since they use precence of Vela to decide whether to skip the
test.

This change will
* remove Vela from ci_cpu
* remove unnecessary PATH update
---
 docker/Dockerfile.ci_cpu | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 013ebfb59e88..3812bfbd197e 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -124,13 +124,6 @@ RUN bash /install/ubuntu_install_androidsdk.sh
 ENV ANDROID_HOME=/opt/android-sdk-linux/
 ENV ANDROID_NDK_HOME=/opt/android-sdk-linux/ndk/21.3.6528147/
 
-# Install Vela compiler
-COPY install/ubuntu_install_vela.sh /install/ubuntu_install_vela.sh
-RUN bash /install/ubuntu_install_vela.sh
-
-# Update PATH
-ENV PATH /opt/arm/gcc-arm-none-eabi/bin:/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4:$PATH
-
 # PaddlePaddle deps
 COPY install/ubuntu_install_paddle.sh /install/ubuntu_install_paddle.sh
 RUN bash /install/ubuntu_install_paddle.sh

From 4d104e5ec6b02d0b1d08c93c26bc322f54189cba Mon Sep 17 00:00:00 2001
From: Nicola Lancellotti <nicola.lancellotti@arm.com>
Date: Tue, 23 Aug 2022 20:57:24 +0100
Subject: [PATCH 016/704] [ETHOSN] Add support for special indices of Reshape
 (#12556)

This pr adds support for the special indices values of the reshape operator for the Arm(R) Ethos(TM)-N NPU.
---
 .../backend/contrib/ethosn/ethosn_api.cc      | 38 +++----------------
 .../contrib/test_ethosn/test_reshape.py       | 37 ++++--------------
 2 files changed, 12 insertions(+), 63 deletions(-)

diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc
index 55d0b57bcc2f..c1f67d0d2b16 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.cc
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc
@@ -36,6 +36,7 @@
 #include <utility>
 #include <vector>
 
+#include "../../../op/tensor/transform.h"
 #include "ethosn_support_library/Support.hpp"
 #include "ethosn_support_library/SupportQueries.hpp"
 #include "tvm/relay/qnn/attrs.h"
@@ -293,12 +294,6 @@ EthosnError EthosnAPI::Reshape(const Expr& expr, ReshapeParams* params) {
   // Create input info
   Call reshape = Downcast<Call>(expr);
   const auto* input_dtype = reshape->args[0]->checked_type().as<TensorTypeNode>();
-  const auto& reshape_attrs = reshape->attrs.as<ReshapeAttrs>();
-
-  if (reshape_attrs->newshape.size() > params->new_shape.size()) {
-    return EthosnError(ErrStrm() << "reshape dimension=" << reshape_attrs->newshape.size()
-                                 << ", reshape dimension must be <= " << params->new_shape.size());
-  }
 
   sl::TensorShape input_tensor_shape = {1, 1, 1, 1};
   sl::DataType input_data_type;
@@ -309,35 +304,12 @@ EthosnError EthosnAPI::Reshape(const Expr& expr, ReshapeParams* params) {
     tensor_size *= dim;
   }
 
-  int infer_index = -1;
-  int reshaped_size = 1;
-  Array<Integer> inferred_shape = {1, 1, 1, 1};
-  for (size_t i = 0; i < reshape_attrs->newshape.size(); i++) {
-    int value = reshape_attrs->newshape[i].as<IntImmNode>()->value;
-    if (value < -1) {
-      return EthosnError(ErrStrm()
-                         << "reshape dimension=" << value << ", reshape dimension must be >= -1");
-    }
-    if (value == -1) {
-      if (infer_index != -1) {
-        return EthosnError("only one reshape dimension can be inferred");
-      }
-      infer_index = i;
-    } else {
-      inferred_shape.Set(i, value);
-      reshaped_size *= value;
-    }
+  Array<IndexExpr> inferred_shape = {1, 1, 1, 1};
+  Array<IndexExpr> new_shape = InferNewShape(input_dtype->shape, reshape->attrs, false);
+  for (size_t i = 0; i < new_shape.size(); ++i) {
+    inferred_shape.Set(i, new_shape[i]);
   }
 
-  if (infer_index != -1) {
-    if (tensor_size % reshaped_size != 0) {
-      return EthosnError(ErrStrm()
-                         << "reshaped size=" << reshaped_size
-                         << ", must be an integer factor of the input size " << tensor_size);
-    }
-    int value = tensor_size / reshaped_size;
-    inferred_shape.Set(infer_index, Integer(value));
-  }
   err += Tvm2Npu(inferred_shape, &params->new_shape);
   params->input_info =
       sl::TensorInfo(input_tensor_shape, input_data_type, params->input_info.m_DataFormat,
diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py
index 6266367e90cc..cb8a49be2d81 100644
--- a/tests/python/contrib/test_ethosn/test_reshape.py
+++ b/tests/python/contrib/test_ethosn/test_reshape.py
@@ -20,7 +20,6 @@
 import tvm
 from tvm import relay
 from tvm.testing import requires_ethosn
-from tvm.relay.op.contrib import get_pattern_table
 import numpy as np
 import pytest
 from . import infrastructure as tei
@@ -43,7 +42,14 @@ def _get_model(input_shape, output_shape, dtype):
         ((1, 15, 4, 1), (1, 30, 2)),
         ((1, 15, 4, 1), (1, 4, 15, 1)),
         ((1, 15, 4, 1), (1, 12, 5, 1)),
+        ((1, 15, 4, 1), (1, 0, 2, 2)),
         ((1, 15, 4, 1), (1, -1, 2, 1)),
+        ((1, 15, 4, 1), (1, -2)),
+        ((1, 15, 4, 1), (1, -3, 1, 1)),
+        ((1, 15, 4, 1), (1, -4, 3, 5, 4)),
+        ((1, 15, 4, 1), (0, -1, -2)),
+        ((1, 15, 4, 1), (0, -1, -3, 1)),
+        ((1, 15, 4, 1), (1, -4, -1, 5, 4)),
     ],
 )
 def test_reshape(dtype, input_shape, output_shape):
@@ -65,32 +71,3 @@ def test_reshape(dtype, input_shape, output_shape):
         outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
 
     tei.verify(outputs, dtype, 1)
-
-
-@requires_ethosn
-@pytest.mark.parametrize(
-    "input_shape, output_shape, dtype, err_msg",
-    [
-        (
-            (1, 15, 4, 1),
-            (1, 15, -2),
-            "uint8",
-            "reshape dimension=-2, reshape dimension must be >= -1",
-        ),
-        (
-            (1, 1, 4, 1),
-            (1, 1, 2, 2, 1),
-            "uint8",
-            "reshape dimension=5, reshape dimension must be <= 4",
-        ),
-    ],
-)
-def test_reshape_failure(input_shape, output_shape, dtype, err_msg):
-    np.random.seed(0)
-    model, params = _get_model(input_shape, output_shape, dtype)
-    mod = tei.make_module(model, params)
-    pattern = get_pattern_table("ethos-n")
-    mod = tei.make_module(model, params)
-    mod = relay.transform.MergeComposite(pattern)(mod)
-    mod = tei.make_ethosn_partition(mod["main"].body)
-    tei.test_error(mod, {}, err_msg)

From 8c23469e2098659cffcbfebb56b4d32c0df7a6ed Mon Sep 17 00:00:00 2001
From: Mohamad Katanbaf <mtkatanbaf@gmail.com>
Date: Tue, 23 Aug 2022 13:10:50 -0700
Subject: [PATCH 017/704] [MicroTVM] add heap-size to project options (#12390)

* heap-size is added to project options

* change stm32l4r5zi recommended heap size

* change stm32l4r5zi recommended heap size

* addressing comments

* addressing comments

* addressing comments

Co-authored-by: Mohamad <mkatanbaf@users.noreply.github.com>
---
 .../zephyr/template_project/boards.json       |  3 +-
 .../template_project/microtvm_api_server.py   | 45 +++++++++++++++++++
 .../template_project/src/host_driven/main.c   |  2 +-
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/apps/microtvm/zephyr/template_project/boards.json b/apps/microtvm/zephyr/template_project/boards.json
index aae764a8239e..dcca9c800224 100644
--- a/apps/microtvm/zephyr/template_project/boards.json
+++ b/apps/microtvm/zephyr/template_project/boards.json
@@ -54,7 +54,8 @@
         "is_qemu": false,
         "fpu": true,
         "vid_hex": "0483",
-        "pid_hex": "374b"
+        "pid_hex": "374b",
+        "recommended_heap_size_bytes": 512000
     },
     "qemu_cortex_r5": {
         "board": "qemu_cortex_r5",
diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
index eb20c3e88448..38a7ec0c2939 100644
--- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py
+++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
@@ -195,6 +195,33 @@ def _get_device_args(options):
     )
 
 
+def _get_board_mem_size_bytes(options):
+    board_file_path = (
+        pathlib.Path(get_zephyr_base(options))
+        / "boards"
+        / "arm"
+        / options["zephyr_board"]
+        / (options["zephyr_board"] + ".yaml")
+    )
+    try:
+        with open(board_file_path) as f:
+            board_data = yaml.load(f, Loader=yaml.FullLoader)
+            return int(board_data["ram"]) * 1024
+    except:
+        _LOG.warning("Board memory information is not available.")
+    return None
+
+
+DEFAULT_HEAP_SIZE_BYTES = 216 * 1024
+
+
+def _get_recommended_heap_size_bytes(options):
+    prop = BOARD_PROPERTIES[options["zephyr_board"]]
+    if "recommended_heap_size_bytes" in prop:
+        return prop["recommended_heap_size_bytes"]
+    return DEFAULT_HEAP_SIZE_BYTES
+
+
 def generic_find_serial_port(serial_number=None):
     """Find a USB serial port based on its serial number or its VID:PID.
 
@@ -370,6 +397,12 @@ def _get_nrf_device_args(options):
         type="bool",
         help="Run on the FVP emulator instead of hardware.",
     ),
+    server.ProjectOption(
+        "heap_size_bytes",
+        optional=["generate_project"],
+        type="int",
+        help="Sets the value for HEAP_SIZE_BYTES passed to K_HEAP_DEFINE() to service TVM memory allocation requests.",
+    ),
 ]
 
 
@@ -595,6 +628,18 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
 
                     cmake_f.write(line)
 
+                heap_size = _get_recommended_heap_size_bytes(options)
+                if options.get("heap_size_bytes"):
+                    board_mem_size = _get_board_mem_size_bytes(options)
+                    heap_size = options["heap_size_bytes"]
+                    if board_mem_size is not None:
+                        assert (
+                            heap_size < board_mem_size
+                        ), f"Heap size {heap_size} is larger than memory size {board_mem_size} on this board."
+                cmake_f.write(
+                    f"target_compile_definitions(app PUBLIC -DHEAP_SIZE_BYTES={heap_size})\n"
+                )
+
                 if options.get("compile_definitions"):
                     flags = options.get("compile_definitions")
                     for item in flags:
diff --git a/apps/microtvm/zephyr/template_project/src/host_driven/main.c b/apps/microtvm/zephyr/template_project/src/host_driven/main.c
index c0286dc0c74f..7dd082e2e588 100644
--- a/apps/microtvm/zephyr/template_project/src/host_driven/main.c
+++ b/apps/microtvm/zephyr/template_project/src/host_driven/main.c
@@ -142,7 +142,7 @@ tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
 }
 
 // Heap for use by TVMPlatformMemoryAllocate.
-K_HEAP_DEFINE(tvm_heap, 216 * 1024);
+K_HEAP_DEFINE(tvm_heap, HEAP_SIZE_BYTES);
 
 // Called by TVM to allocate memory.
 tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {

From 13ebbfb37f8cec1da71d88fbcbecdd4ad4d24dcc Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Tue, 23 Aug 2022 15:44:34 -0500
Subject: [PATCH 018/704] Replace std::result_of (deprecated in C++17) with
 std::invoke_result, NFC (#12562)

---
 include/tvm/script/printer/traced_object.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/script/printer/traced_object.h b/include/tvm/script/printer/traced_object.h
index 4c09b0a41b79..cb63c31cd4a5 100644
--- a/include/tvm/script/printer/traced_object.h
+++ b/include/tvm/script/printer/traced_object.h
@@ -450,7 +450,7 @@ class TracedBasicValue {
    * \brief Transform the wrapped value without changing its path.
    */
   template <typename F>
-  typename detail::TracedObjectWrapperSelector<typename std::result_of<F(const T&)>::type>::Type
+  typename detail::TracedObjectWrapperSelector<typename std::invoke_result<F, const T&>::type>::Type
   ApplyFunc(F&& f) const {
     return MakeTraced(f(value_), path_);
   }

From 8174d082e8168db9ad63826c9d68aee8c76c7090 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Tue, 23 Aug 2022 16:42:43 -0500
Subject: [PATCH 019/704] Add using directives for otherwise hidden virtual
 functions, NFC (#12561)

This silences warning
```
warning: 'foo' hides overloaded virtual functions [-Woverloaded-virtual]
```
typically caused by overriding only some overloads of `VisitExpr_` from
a set defined in the base class.
---
 src/relay/backend/annotate_used_memory.cc        | 2 +-
 src/relay/transforms/annotate_texture_storage.cc | 4 ++++
 src/relay/transforms/compiler_function_utils.cc  | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/relay/backend/annotate_used_memory.cc b/src/relay/backend/annotate_used_memory.cc
index ad370c73ad1e..4dcdb2e541c5 100644
--- a/src/relay/backend/annotate_used_memory.cc
+++ b/src/relay/backend/annotate_used_memory.cc
@@ -110,7 +110,7 @@ class AnnotateUsedMemoryMutator : public transform::DeviceAwareExprMutator {
   /*!
    * \brief Establish which let bindings have primitive function values.
    */
-  std::pair<Var, Expr> PreVisitLetBinding_(const Var& var, const Expr& value) {
+  std::pair<Var, Expr> PreVisitLetBinding_(const Var& var, const Expr& value) override {
     if (const auto* func_node = value.as<FunctionNode>()) {
       ICHECK(func_node->attrs.HasNonzeroAttr(attr::kPrimitive))
           << "Expect top-level functions to be primitive.";
diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc
index b3ed28db4574..c9cf45e06929 100644
--- a/src/relay/transforms/annotate_texture_storage.cc
+++ b/src/relay/transforms/annotate_texture_storage.cc
@@ -117,6 +117,8 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
   }
 
  private:
+  using transform::DeviceAwareExprVisitor::VisitExpr_;
+
   void Visit(const Expr& expr) {
     // Pre-order traversal to enable upward propagation
     // of consumer storage scopes to producers when desirable.
@@ -426,6 +428,8 @@ class RewriteVDStorageScopes : public transform::DeviceAwareExprMutator {
   using VarMap = std::unordered_map<Expr, Var, ObjectPtrHash, ObjectPtrEqual>;
 
  public:
+  using transform::DeviceAwareExprMutator::VisitExpr_;
+
   explicit RewriteVDStorageScopes(const Map<Expr, Map<Expr, Array<String>>>& storage_scope)
       : transform::DeviceAwareExprMutator(Optional<IRModule>()), storage_scope_(storage_scope) {}
 
diff --git a/src/relay/transforms/compiler_function_utils.cc b/src/relay/transforms/compiler_function_utils.cc
index 1dafcd10a361..f1e7e223541b 100644
--- a/src/relay/transforms/compiler_function_utils.cc
+++ b/src/relay/transforms/compiler_function_utils.cc
@@ -54,6 +54,8 @@ const FunctionNode* AsFunctionNode(const Expr& expr, const std::string& compiler
  */
 class Outliner : public MixedModeMutator {
  public:
+  using MixedModeMutator::VisitExpr_;
+
   Outliner(GlobalSymbolCache* cache, std::string compiler_filter, IRModule mod)
       : cache_(cache), compiler_filter_(std::move(compiler_filter)), mod_(std::move(mod)) {}
 

From c15cc5ef6d36288abd58587b7bf4f0440596a54f Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Tue, 23 Aug 2022 21:02:27 -0700
Subject: [PATCH 020/704] [Target] Remove deprecated parameters from target
 (#12416)

* remove depricated parameters in target

* lint

* fix cpp tests

fix

* remove more configs in test files

* address comments

* fix error

* fix hexagon

* fix micro tutorial

* fix integration tests

* fix hexagon

* lint

* fix unittest

* fix readme

* fix assert executor in target

* address comments

* fix tutorials

* fix hexagon target

* fix tutorial

* fix for tutorials

* hexagon
---
 apps/hexagon_launcher/README.md               |  4 +-
 apps/howto_deploy/prepare_test_libs.py        |  2 +-
 apps/sgx/src/build_model.py                   |  7 +-
 .../wasm-graph/tools/build_graph_lib.py       |  9 ++-
 .../ci_logs/matmul.json                       |  2 +-
 .../ci_logs/resnet-50-NHWC-B1-llvm.json       | 52 ++++++-------
 .../ci_logs/sparse_dense.json                 |  2 +-
 .../tune_with_autotvm/tune_relay_x86.py       |  4 +-
 .../how_to/work_with_microtvm/micro_tvmc.sh   |  4 +-
 gallery/tutorial/auto_scheduler_matmul_x86.py |  2 -
 python/tvm/contrib/hexagon/pytest_plugin.py   |  2 +-
 python/tvm/relay/build_module.py              | 78 -------------------
 python/tvm/target/target.py                   | 19 +----
 src/target/target_kind.cc                     | 38 ++-------
 tests/cpp/c_codegen_test.cc                   | 10 +--
 tests/cpp/target_test.cc                      |  4 +-
 .../test_hexagon/topi/test_softmax_slice.py   |  1 -
 tests/python/driver/tvmc/test_target.py       |  6 +-
 .../python/driver/tvmc/test_target_options.py |  2 +-
 tests/python/relay/aot/test_cpp_aot.py        |  2 +-
 tests/python/relay/aot/test_crt_aot.py        | 36 ---------
 tests/python/relay/test_build_module.py       | 47 +++--------
 .../test_tir_transform_common_subexpr_elim.py |  2 +-
 .../unittest/test_tvmscript_roundtrip.py      |  4 +-
 tests/scripts/task_python_docs.sh             |  2 +-
 25 files changed, 80 insertions(+), 261 deletions(-)

diff --git a/apps/hexagon_launcher/README.md b/apps/hexagon_launcher/README.md
index 210759a80c7c..cc433f245759 100644
--- a/apps/hexagon_launcher/README.md
+++ b/apps/hexagon_launcher/README.md
@@ -118,7 +118,7 @@ mod, params = relay.frontend.from_tflite(
     tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict
 )
 
-target = tvm.target.hexagon('v68', link_params=True)
+target = tvm.target.hexagon('v68')
 with tvm.transform.PassContext(opt_level=3):
     lib = relay.build(mod, tvm.target.Target(target, host=target), params=params, mod_name="default")
 
@@ -172,7 +172,7 @@ A sample output JSON from running the Inception V3 model may look like
 
 When using AoT, the `target` needs to be `llvm`:
 ```
-aot_target = "llvm -keys=hexagon -link-params=0 -mattr=+hvxv69,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp -mcpu=hexagonv69 -mtriple=hexagon"
+aot_target = "llvm -keys=hexagon -mattr=+hvxv69,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp -mcpu=hexagonv69 -mtriple=hexagon"
 aot_host_target = aot_target
 ```
 
diff --git a/apps/howto_deploy/prepare_test_libs.py b/apps/howto_deploy/prepare_test_libs.py
index a6c7688d2084..8e9f8b5f7335 100644
--- a/apps/howto_deploy/prepare_test_libs.py
+++ b/apps/howto_deploy/prepare_test_libs.py
@@ -33,7 +33,7 @@ def prepare_test_libs(base_path):
     fadd_dylib.export_library(dylib_path)
 
     # Compile library in system library mode
-    fadd_syslib = tvm.build(s, [A, B], "llvm --system-lib", name="addonesys")
+    fadd_syslib = tvm.build(s, [A, B], "llvm", name="addonesys")
     syslib_path = os.path.join(base_path, "test_addone_sys.o")
     fadd_syslib.save(syslib_path)
 
diff --git a/apps/sgx/src/build_model.py b/apps/sgx/src/build_model.py
index 1fc297d8a094..ea3b4ed992ad 100755
--- a/apps/sgx/src/build_model.py
+++ b/apps/sgx/src/build_model.py
@@ -39,7 +39,12 @@ def main():
     )
 
     with tvm.transform.PassContext(opt_level=3):
-        graph, lib, params = relay.build(net, "llvm --system-lib", params=params)
+        graph, lib, params = relay.build(
+            net,
+            "llvm",
+            params=params,
+            runtime=tvm.relay.backend.Runtime("cpp", {"system-lib": True}),
+        )
 
     build_dir = osp.abspath(sys.argv[1])
     if not osp.isdir(build_dir):
diff --git a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
index 9b262c398e00..c2f9089710a3 100755
--- a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
+++ b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
@@ -72,10 +72,15 @@ def build_graph_lib(opt_level):
     shape_dict = {input_name: img_data.shape}
 
     mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
-    target = "llvm -mtriple=wasm32-unknown-unknown -mattr=+simd128 --system-lib"
+    target = "llvm -mtriple=wasm32-unknown-unknown -mattr=+simd128"
 
     with tvm.transform.PassContext(opt_level=opt_level):
-        factory = relay.build(mod, target=target, params=params)
+        factory = relay.build(
+            mod,
+            target=target,
+            params=params,
+            runtime=tvm.relay.backend.Runtime("cpp", {"system-lib": True}),
+        )
 
     # Save the model artifacts to obj_file
     obj_file = os.path.join(out_dir, "graph.o")
diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json
index 2e3a98404dda..b0d33a911a63 100644
--- a/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json
+++ b/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json
@@ -1,2 +1,2 @@
 # Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI.
-{"i": [["[\"matmul_add\", 1024, 1024, 1024, \"float32\"]", "llvm -keys=cpu -link-params=0", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 2, 0, 1024, [2, 1, 4], 1], ["SP", 2, 4, 1024, [1, 1, 8], 1], ["SP", 2, 8, 1024, [4], 1], ["RE", 2, [0, 4, 1, 5, 8, 2, 6, 9, 3, 7]], ["FSP", 4, 0, 0, 2], ["FSP", 4, 3, 1, 2], ["RE", 4, [0, 3, 1, 4, 2, 5]], ["CA", 2, 4, 3], ["FU", 4, [0, 1]], ["AN", 4, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$8"], ["AN", 2, 9, 2], ["AN", 4, 4, 2]]]], "r": [[0.0044742], 0, 0.335558, 1607112214], "v": "v0.3"}
+{"i": [["[\"matmul_add\", 1024, 1024, 1024, \"float32\"]", "llvm -keys=cpu", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 2, 0, 1024, [2, 1, 4], 1], ["SP", 2, 4, 1024, [1, 1, 8], 1], ["SP", 2, 8, 1024, [4], 1], ["RE", 2, [0, 4, 1, 5, 8, 2, 6, 9, 3, 7]], ["FSP", 4, 0, 0, 2], ["FSP", 4, 3, 1, 2], ["RE", 4, [0, 3, 1, 4, 2, 5]], ["CA", 2, 4, 3], ["FU", 4, [0, 1]], ["AN", 4, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$8"], ["AN", 2, 9, 2], ["AN", 4, 4, 2]]]], "r": [[0.0044742], 0, 0.335558, 1607112214], "v": "v0.3"}
diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json
index 3dd4541fd33a..4fb148c887bd 100644
--- a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json
+++ b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json
@@ -1,28 +1,28 @@
 # Provide valid schedules for resnet-50 for CPU.
 # This is used to run the tutorial on the documentation web server.
-{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 1, 1000, [50], 1], ["RF", 3, 2, 1], ["RE", 3, [0, 2, 1]], ["SP", 1, 1, 1000, [20], 1], ["RF", 1, 2, 1], ["RE", 1, [0, 2, 1]], ["CR", 6], ["CA", 5, 6, 1], ["CR", 4], ["CA", 2, 3, 1], ["AN", 1, 0, 3], ["FU", 3, [0, 1]], ["AN", 3, 0, 3], ["AN", 4, 0, 3], ["FU", 6, [0, 1]], ["AN", 6, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[8.75e-06, 1.0781e-05, 9.875e-06, 9.836e-06, 1.0357e-05, 1.0238e-05, 1.0341e-05, 9.75e-06, 9.561e-06, 1.0122e-05], 0, 0.17921, 1606960872], "v": "v0.5"}
-{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 2048, 1, 1, 1, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CA", 1, 2, 3], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[6.28e-06, 8.176e-06, 8.048e-06, 7.942e-06, 7.977e-06, 8.002e-06, 8.093e-06, 7.924e-06, 7.943e-06, 7.924e-06], 0, 0.130759, 1606960900], "v": "v0.5"}
-{"i": [["[\"875556d12d0be2269206a7775d5296a6\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 1, 1, 2048, 1, 1, 1, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 9], ["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 2048, [8, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 10, 0, 3, 2], ["FSP", 10, 3, 4, 2], ["FSP", 10, 6, 5, 2], ["FSP", 10, 9, 6, 2], ["RE", 10, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 10, 7], ["CI", 1], ["FU", 10, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 10, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 10, 4, 2]]]], "r": [[0.000175984, 0.000171372, 0.00018538, 0.000178085, 0.00017879, 0.000179878, 0.000179221, 0.000178598, 0.000176714, 0.000168318], 0, 0.277929, 1606960917], "v": "v0.5"}
-{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 14, 14, 1024, 1, 1, 1024, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 7], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 512, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 14, 2]]]], "r": [[0.00012651, 0.00012801, 0.000128605, 0.00013267, 0.00012596, 0.000126418, 0.000121995, 0.000127242, 0.000128152, 0.000129989], 0, 0.310011, 1606960986], "v": "v0.5"}
-{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 1, 1, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 1024, [1, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6]], ["AN", 3, 0, 3], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000183629, 0.000188334, 0.000195553, 0.000187308, 0.000196409, 0.000190496, 0.000190344, 0.000188567, 0.000186319, 0.000187136], 0, 0.384722, 1606961002], "v": "v0.5"}
-{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 28, 28, 512, 1, 1, 512, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [2, 1, 7], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 256, [16, 4, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["CR", 6], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 14, 2]]]], "r": [[0.000118033, 0.000116806, 0.000134047, 0.000116701, 0.000116219, 0.000116834, 0.000117132, 0.000117029, 0.000116393, 0.000116778], 0, 0.31025, 1606961069], "v": "v0.5"}
-{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 1, 1, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 1], 1], ["SP", 3, 8, 28, [7, 1, 1], 1], ["SP", 3, 12, 512, [1, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.00019554, 0.000203491, 0.000199599, 0.000194289, 0.000197556, 0.000199504, 0.000198527, 0.000200656, 0.000200037, 0.000201954], 0, 0.240599, 1606961080], "v": "v0.5"}
-{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 56, 56, 256, 1, 1, 256, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [14, 1, 2], 1], ["SP", 3, 8, 28, [2, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [16], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000128461, 0.000158344, 0.000154659, 0.000148478, 0.000162668, 0.000155789, 0.000149412, 0.000141607, 0.000148815, 0.000165989], 0, 0.299928, 1606961156], "v": "v0.5"}
-{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [14, 2, 2], 1], ["SP", 3, 8, 56, [2, 1, 2], 1], ["SP", 3, 12, 64, [1, 16, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[7.8291e-05, 7.4365e-05, 6.7147e-05, 6.7413e-05, 8.1894e-05, 7.1771e-05, 7.2916e-05, 6.6615e-05, 7.3038e-05, 7.4967e-05], 0, 1.09095, 1606961258], "v": "v0.5"}
-{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 4], ["CA", 2, 5, 3], ["CR", 1], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 5, [0, 1, 2]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$64"]]]], "r": [[2.9217e-05, 3.1065e-05, 3.188e-05, 3.0897e-05, 3.1295e-05, 3.1307e-05, 3.19e-05, 3.1038e-05, 3.1919e-05, 3.2077e-05], 0, 0.217184, 1606961266], "v": "v0.5"}
-{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 14, 14, 1024, 1, 1, 1024, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 256, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[0.000153068, 0.000161094, 0.000164674, 0.000160245, 0.000159626, 0.000146788, 0.000140718, 0.000159237, 0.000162109, 0.000139686], 0, 0.273946, 1606961647], "v": "v0.5"}
-{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 112, [1, 1, 4], 1], ["SP", 3, 8, 112, [4, 2, 1], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 7, [7], 1], ["SP", 3, 18, 7, [7], 1], ["SP", 3, 20, 3, [3], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 6, 3], ["FU", 6, [0, 1, 2]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], ["AN", 6, 9, 2]]]], "r": [[0.000247808, 0.000233393, 0.000251767, 0.000252226, 0.000254169, 0.000254176, 0.00025333, 0.00025511, 0.000253678, 0.000251738], 0, 0.315503, 1606961659], "v": "v0.5"}
-{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 1], 1], ["SP", 3, 8, 7, [1, 1, 7], 1], ["SP", 3, 12, 2048, [256, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000169437, 0.000169021, 0.00016965, 0.00017079, 0.000170862, 0.0001692, 0.000164768, 0.000175541, 0.000171528, 0.000169094], 0, 0.25194, 1606961681], "v": "v0.5"}
-{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 256, 1, 1, 256, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 1, 1], 1], ["SP", 3, 8, 56, [7, 4, 1], 1], ["SP", 3, 12, 64, [4, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.00015141, 0.000158121, 0.000132758, 0.00015109, 0.000148266, 0.000152599, 0.000150809, 0.000151947, 0.000150702, 0.000156091], 0, 0.221869, 1606961698], "v": "v0.5"}
-{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 56, 56, 64, 3, 3, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [1, 1, 1], 1], ["SP", 3, 8, 56, [7, 1, 4], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 64, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.000221341, 0.000225005, 0.000209954, 0.000209741, 0.000228281, 0.000208451, 0.000223046, 0.000222672, 0.000228098, 0.000220093], 0, 0.231218, 1606961709], "v": "v0.5"}
-{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 7, 7, 2048, 1, 1, 2048, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 512, [2, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 2048, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000165941, 0.000152645, 0.000165687, 0.000166639, 0.000166094, 0.00016649, 0.000164394, 0.000169288, 0.000169497, 0.000168535], 0, 0.245559, 1606961724], "v": "v0.5"}
-{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 2, 1], 1], ["SP", 3, 8, 56, [1, 1, 1], 1], ["SP", 3, 12, 256, [2, 4, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 6, 2]]]], "r": [[0.000161206, 0.000161372, 0.000158862, 0.000159596, 0.00014964, 0.000162042, 0.000159626, 0.000158166, 0.000161209, 0.000159408], 0, 0.337652, 1606961748], "v": "v0.5"}
-{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 512, 1, 1, 512, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 2, 1], 1], ["SP", 3, 8, 14, [14, 1, 1], 1], ["SP", 3, 12, 1024, [2, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2]]]], "r": [[0.000238006, 0.000235502, 0.000239805, 0.000234637, 0.000235266, 0.000238355, 0.000240836, 0.000232856, 0.000231219, 0.000238776], 0, 0.219506, 1606961782], "v": "v0.5"}
-{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 256, 1, 1, 256, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [2, 1, 2], 1], ["SP", 3, 8, 28, [1, 2, 1], 1], ["SP", 3, 12, 512, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000213071, 0.000218117, 0.000216346, 0.000216237, 0.000214703, 0.00021605, 0.000210522, 0.000214234, 0.000218293, 0.00021484], 0, 0.291873, 1606961801], "v": "v0.5"}
-{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 1024, 1, 1, 1024, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 2048, [128, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000265306, 0.000259738, 0.000256412, 0.000284932, 0.000267557, 0.000266362, 0.00026533, 0.000263389, 0.000263022, 0.000263069], 0, 0.296232, 1606961838], "v": "v0.5"}
-{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 7, 7, 512, 3, 3, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 8], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CR", 1], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000269786, 0.0002657, 0.000261922, 0.000267462, 0.000270495, 0.000265371, 0.000273858, 0.000268022, 0.000266746, 0.000272337], 0, 0.331923, 1606961848], "v": "v0.5"}
-{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 1024, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000159777, 0.00015711, 0.000163052, 0.000152569, 0.00015342, 0.000154918, 0.000153887, 0.000154133, 0.000154319, 0.000150102], 0, 0.195628, 1606961878], "v": "v0.5"}
-{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [56, 1, 1], 1], ["SP", 3, 8, 56, [14, 1, 2], 1], ["SP", 3, 12, 256, [1, 2, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000159044, 0.000157356, 0.000158889, 0.000160304, 0.000158648, 0.000159749, 0.000143679, 0.000156393, 0.000164916, 0.000155957], 0, 0.240777, 1606961918], "v": "v0.5"}
-{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000168259, 0.000157338, 0.0001551, 0.000156552, 0.000160492, 0.000164505, 0.000144937, 0.000138397, 0.000153011, 0.000153186], 0, 0.231498, 1606961965], "v": "v0.5"}
-{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 1, 1, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [7, 1, 4], 1], ["SP", 3, 8, 56, [4, 2, 1], 1], ["SP", 3, 12, 256, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3, 4, 5, 6]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 8, 5, 2]]]], "r": [[0.000185957, 0.000180964, 0.000179419, 0.000168205, 0.000176155, 0.000178243, 0.000180175, 0.00017753, 0.000174475, 0.000158878], 0, 0.316404, 1606961979], "v": "v0.5"}
-{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 28, 28, 512, 1, 1, 512, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [7, 1, 4], 1], ["SP", 3, 8, 28, [14, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000150378, 0.000154444, 0.000156051, 0.000130306, 0.000156154, 0.000131167, 0.000142357, 0.000152532, 0.000131899, 0.000157696], 0, 0.18509, 1606962002], "v": "v0.5"}
+{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 1, 1000, [50], 1], ["RF", 3, 2, 1], ["RE", 3, [0, 2, 1]], ["SP", 1, 1, 1000, [20], 1], ["RF", 1, 2, 1], ["RE", 1, [0, 2, 1]], ["CR", 6], ["CA", 5, 6, 1], ["CR", 4], ["CA", 2, 3, 1], ["AN", 1, 0, 3], ["FU", 3, [0, 1]], ["AN", 3, 0, 3], ["AN", 4, 0, 3], ["FU", 6, [0, 1]], ["AN", 6, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[8.75e-06, 1.0781e-05, 9.875e-06, 9.836e-06, 1.0357e-05, 1.0238e-05, 1.0341e-05, 9.75e-06, 9.561e-06, 1.0122e-05], 0, 0.17921, 1606960872], "v": "v0.5"}
+{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 2048, 1, 1, 1, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CA", 1, 2, 3], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[6.28e-06, 8.176e-06, 8.048e-06, 7.942e-06, 7.977e-06, 8.002e-06, 8.093e-06, 7.924e-06, 7.943e-06, 7.924e-06], 0, 0.130759, 1606960900], "v": "v0.5"}
+{"i": [["[\"875556d12d0be2269206a7775d5296a6\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 1, 1, 2048, 1, 1, 1, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 9], ["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 2048, [8, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 10, 0, 3, 2], ["FSP", 10, 3, 4, 2], ["FSP", 10, 6, 5, 2], ["FSP", 10, 9, 6, 2], ["RE", 10, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 10, 7], ["CI", 1], ["FU", 10, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 10, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 10, 4, 2]]]], "r": [[0.000175984, 0.000171372, 0.00018538, 0.000178085, 0.00017879, 0.000179878, 0.000179221, 0.000178598, 0.000176714, 0.000168318], 0, 0.277929, 1606960917], "v": "v0.5"}
+{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 14, 14, 1024, 1, 1, 1024, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 7], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 512, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 14, 2]]]], "r": [[0.00012651, 0.00012801, 0.000128605, 0.00013267, 0.00012596, 0.000126418, 0.000121995, 0.000127242, 0.000128152, 0.000129989], 0, 0.310011, 1606960986], "v": "v0.5"}
+{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 1, 1, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 1024, [1, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6]], ["AN", 3, 0, 3], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000183629, 0.000188334, 0.000195553, 0.000187308, 0.000196409, 0.000190496, 0.000190344, 0.000188567, 0.000186319, 0.000187136], 0, 0.384722, 1606961002], "v": "v0.5"}
+{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 28, 28, 512, 1, 1, 512, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [2, 1, 7], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 256, [16, 4, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["CR", 6], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 14, 2]]]], "r": [[0.000118033, 0.000116806, 0.000134047, 0.000116701, 0.000116219, 0.000116834, 0.000117132, 0.000117029, 0.000116393, 0.000116778], 0, 0.31025, 1606961069], "v": "v0.5"}
+{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 1, 1, 512, 1, 28, 28, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 1], 1], ["SP", 3, 8, 28, [7, 1, 1], 1], ["SP", 3, 12, 512, [1, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.00019554, 0.000203491, 0.000199599, 0.000194289, 0.000197556, 0.000199504, 0.000198527, 0.000200656, 0.000200037, 0.000201954], 0, 0.240599, 1606961080], "v": "v0.5"}
+{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 56, 56, 256, 1, 1, 256, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [14, 1, 2], 1], ["SP", 3, 8, 28, [2, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [16], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000128461, 0.000158344, 0.000154659, 0.000148478, 0.000162668, 0.000155789, 0.000149412, 0.000141607, 0.000148815, 0.000165989], 0, 0.299928, 1606961156], "v": "v0.5"}
+{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [14, 2, 2], 1], ["SP", 3, 8, 56, [2, 1, 2], 1], ["SP", 3, 12, 64, [1, 16, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[7.8291e-05, 7.4365e-05, 6.7147e-05, 6.7413e-05, 8.1894e-05, 7.1771e-05, 7.2916e-05, 6.6615e-05, 7.3038e-05, 7.4967e-05], 0, 1.09095, 1606961258], "v": "v0.5"}
+{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 4], ["CA", 2, 5, 3], ["CR", 1], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 5, [0, 1, 2]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$64"]]]], "r": [[2.9217e-05, 3.1065e-05, 3.188e-05, 3.0897e-05, 3.1295e-05, 3.1307e-05, 3.19e-05, 3.1038e-05, 3.1919e-05, 3.2077e-05], 0, 0.217184, 1606961266], "v": "v0.5"}
+{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 14, 14, 1024, 1, 1, 1024, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 256, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[0.000153068, 0.000161094, 0.000164674, 0.000160245, 0.000159626, 0.000146788, 0.000140718, 0.000159237, 0.000162109, 0.000139686], 0, 0.273946, 1606961647], "v": "v0.5"}
+{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 112, [1, 1, 4], 1], ["SP", 3, 8, 112, [4, 2, 1], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 7, [7], 1], ["SP", 3, 18, 7, [7], 1], ["SP", 3, 20, 3, [3], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 6, 3], ["FU", 6, [0, 1, 2]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], ["AN", 6, 9, 2]]]], "r": [[0.000247808, 0.000233393, 0.000251767, 0.000252226, 0.000254169, 0.000254176, 0.00025333, 0.00025511, 0.000253678, 0.000251738], 0, 0.315503, 1606961659], "v": "v0.5"}
+{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 1], 1], ["SP", 3, 8, 7, [1, 1, 7], 1], ["SP", 3, 12, 2048, [256, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000169437, 0.000169021, 0.00016965, 0.00017079, 0.000170862, 0.0001692, 0.000164768, 0.000175541, 0.000171528, 0.000169094], 0, 0.25194, 1606961681], "v": "v0.5"}
+{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 256, 1, 1, 256, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 1, 1], 1], ["SP", 3, 8, 56, [7, 4, 1], 1], ["SP", 3, 12, 64, [4, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.00015141, 0.000158121, 0.000132758, 0.00015109, 0.000148266, 0.000152599, 0.000150809, 0.000151947, 0.000150702, 0.000156091], 0, 0.221869, 1606961698], "v": "v0.5"}
+{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 56, 56, 64, 3, 3, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [1, 1, 1], 1], ["SP", 3, 8, 56, [7, 1, 4], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 64, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.000221341, 0.000225005, 0.000209954, 0.000209741, 0.000228281, 0.000208451, 0.000223046, 0.000222672, 0.000228098, 0.000220093], 0, 0.231218, 1606961709], "v": "v0.5"}
+{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 7, 7, 2048, 1, 1, 2048, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 512, [2, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 2048, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000165941, 0.000152645, 0.000165687, 0.000166639, 0.000166094, 0.00016649, 0.000164394, 0.000169288, 0.000169497, 0.000168535], 0, 0.245559, 1606961724], "v": "v0.5"}
+{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 56, 56, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 2, 1], 1], ["SP", 3, 8, 56, [1, 1, 1], 1], ["SP", 3, 12, 256, [2, 4, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 6, 2]]]], "r": [[0.000161206, 0.000161372, 0.000158862, 0.000159596, 0.00014964, 0.000162042, 0.000159626, 0.000158166, 0.000161209, 0.000159408], 0, 0.337652, 1606961748], "v": "v0.5"}
+{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 512, 1, 1, 512, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 2, 1], 1], ["SP", 3, 8, 14, [14, 1, 1], 1], ["SP", 3, 12, 1024, [2, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2]]]], "r": [[0.000238006, 0.000235502, 0.000239805, 0.000234637, 0.000235266, 0.000238355, 0.000240836, 0.000232856, 0.000231219, 0.000238776], 0, 0.219506, 1606961782], "v": "v0.5"}
+{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 256, 1, 1, 256, 512, 1, 28, 28, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [2, 1, 2], 1], ["SP", 3, 8, 28, [1, 2, 1], 1], ["SP", 3, 12, 512, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000213071, 0.000218117, 0.000216346, 0.000216237, 0.000214703, 0.00021605, 0.000210522, 0.000214234, 0.000218293, 0.00021484], 0, 0.291873, 1606961801], "v": "v0.5"}
+{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 1024, 1, 1, 1024, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 2048, [128, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000265306, 0.000259738, 0.000256412, 0.000284932, 0.000267557, 0.000266362, 0.00026533, 0.000263389, 0.000263022, 0.000263069], 0, 0.296232, 1606961838], "v": "v0.5"}
+{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 7, 7, 512, 3, 3, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 8], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CR", 1], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000269786, 0.0002657, 0.000261922, 0.000267462, 0.000270495, 0.000265371, 0.000273858, 0.000268022, 0.000266746, 0.000272337], 0, 0.331923, 1606961848], "v": "v0.5"}
+{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 1024, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000159777, 0.00015711, 0.000163052, 0.000152569, 0.00015342, 0.000154918, 0.000153887, 0.000154133, 0.000154319, 0.000150102], 0, 0.195628, 1606961878], "v": "v0.5"}
+{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [56, 1, 1], 1], ["SP", 3, 8, 56, [14, 1, 2], 1], ["SP", 3, 12, 256, [1, 2, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000159044, 0.000157356, 0.000158889, 0.000160304, 0.000158648, 0.000159749, 0.000143679, 0.000156393, 0.000164916, 0.000155957], 0, 0.240777, 1606961918], "v": "v0.5"}
+{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 28, 28, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000168259, 0.000157338, 0.0001551, 0.000156552, 0.000160492, 0.000164505, 0.000144937, 0.000138397, 0.000153011, 0.000153186], 0, 0.231498, 1606961965], "v": "v0.5"}
+{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 1, 1, 256, 1, 56, 56, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [7, 1, 4], 1], ["SP", 3, 8, 56, [4, 2, 1], 1], ["SP", 3, 12, 256, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3, 4, 5, 6]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 8, 5, 2]]]], "r": [[0.000185957, 0.000180964, 0.000179419, 0.000168205, 0.000176155, 0.000178243, 0.000180175, 0.00017753, 0.000174475, 0.000158878], 0, 0.316404, 1606961979], "v": "v0.5"}
+{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 28, 28, 512, 1, 1, 512, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [7, 1, 4], 1], ["SP", 3, 8, 28, [14, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000150378, 0.000154444, 0.000156051, 0.000130306, 0.000156154, 0.000131167, 0.000142357, 0.000152532, 0.000131899, 0.000157696], 0, 0.18509, 1606962002], "v": "v0.5"}
diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json
index 7c1c100124dc..9bf6af0b17d8 100644
--- a/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json
+++ b/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json
@@ -1,2 +1,2 @@
 # Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI.
-{"i": [["[\"sparse_dense\", 512, 512, 512, [9831, 16, 1], [9831], [33], \"float32\"]", "llvm -keys=cpu -link-params=0", [6, 64, 64, 0, 0, 0, 0, 0], "", 1, ["sparse_dense_bsr_512_512_512_16_1_0.60_W_data", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indices", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indptr"]], [[], [["CI", 8], ["CI", 6], ["SP", 5, 0, 512, [1, 8], 1], ["FSP", 9, 0, 2, 1], ["SP", 5, 3, 32, [32], 1], ["FSP", 9, 2, 4, 1], ["RE", 5, [0, 3, 1, 4, 6, 2, 5, 7]], ["RE", 9, [0, 2, 1, 3]], ["CA", 5, 9, 1], ["CI", 4], ["FU", 9, [0, 1]], ["AN", 9, 0, 3], ["PR", 5, 0, "auto_unroll_max_step$0"], ["AN", 9, 2, 2]]]], "r": [[0.000957008], 0, 0.605709, 1614689820], "v": "v0.6"}
+{"i": [["[\"sparse_dense\", 512, 512, 512, [9831, 16, 1], [9831], [33], \"float32\"]", "llvm -keys=cpu", [6, 64, 64, 0, 0, 0, 0, 0], "", 1, ["sparse_dense_bsr_512_512_512_16_1_0.60_W_data", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indices", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indptr"]], [[], [["CI", 8], ["CI", 6], ["SP", 5, 0, 512, [1, 8], 1], ["FSP", 9, 0, 2, 1], ["SP", 5, 3, 32, [32], 1], ["FSP", 9, 2, 4, 1], ["RE", 5, [0, 3, 1, 4, 6, 2, 5, 7]], ["RE", 9, [0, 2, 1, 3]], ["CA", 5, 9, 1], ["CI", 4], ["FU", 9, [0, 1]], ["AN", 9, 0, 3], ["PR", 5, 0, "auto_unroll_max_step$0"], ["AN", 9, 2, 2]]]], "r": [[0.000957008], 0, 0.605709, 1614689820], "v": "v0.6"}
diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_x86.py b/gallery/how_to/tune_with_autotvm/tune_relay_x86.py
index 6e46fbd8ffc8..2ba597d1da19 100644
--- a/gallery/how_to/tune_with_autotvm/tune_relay_x86.py
+++ b/gallery/how_to/tune_with_autotvm/tune_relay_x86.py
@@ -298,7 +298,7 @@ def tune_and_evaluate(tuning_opt):
 #
 #    Evaluation of the network been tuned on graph level:
 #    Compile...
-#    Config for target=llvm -keys=cpu -link-params=0, workload=('dense_nopack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32') is missing in ApplyGraphBest context. A fallback configuration is used, which may bring great performance regression.
-#    Config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32') is missing in ApplyGraphBest context. A fallback configuration is used, which may bring great performance regression.
+#    Config for target=llvm -keys=cpu, workload=('dense_nopack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32') is missing in ApplyGraphBest context. A fallback configuration is used, which may bring great performance regression.
+#    Config for target=llvm -keys=cpu, workload=('dense_pack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32') is missing in ApplyGraphBest context. A fallback configuration is used, which may bring great performance regression.
 #    Evaluate inference time cost...
 #    Mean inference time (std dev): 3.16 ms (0.03 ms)
diff --git a/gallery/how_to/work_with_microtvm/micro_tvmc.sh b/gallery/how_to/work_with_microtvm/micro_tvmc.sh
index 5ec718884559..0eaef9c6a836 100755
--- a/gallery/how_to/work_with_microtvm/micro_tvmc.sh
+++ b/gallery/how_to/work_with_microtvm/micro_tvmc.sh
@@ -99,7 +99,7 @@ wget https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/e
 #
 # bash
 tvmc compile magic_wand.tflite \
-    --target='c -keys=cpu -link-params=0 -model=host' \
+    --target='c -keys=cpu -model=host' \
     --runtime=crt \
     --runtime-crt-system-lib 1 \
     --executor='graph' \
@@ -111,7 +111,7 @@ tvmc compile magic_wand.tflite \
 # bash
 # This will generate a ``model.tar`` file which contains TVM compiler output files. To run this command for
 # a different Zephyr device, you need to update ``target``. For instance, for ``nrf5340dk_nrf5340_cpuapp`` board
-# the target is ``--target='c -keys=cpu -link-params=0 -model=nrf5340dk'``.
+# the target is ``--target='c -keys=cpu -model=nrf5340dk'``.
 #
 
 
diff --git a/gallery/tutorial/auto_scheduler_matmul_x86.py b/gallery/tutorial/auto_scheduler_matmul_x86.py
index 279987f00d81..98fd95c33878 100644
--- a/gallery/tutorial/auto_scheduler_matmul_x86.py
+++ b/gallery/tutorial/auto_scheduler_matmul_x86.py
@@ -44,8 +44,6 @@
 testing.utils.install_request_hook(depth=3)
 # sphinx_gallery_end_ignore
 
-import os
-
 import numpy as np
 import tvm
 from tvm import te, auto_scheduler
diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index 6b61b6f4ba55..f735c81ee0aa 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -245,7 +245,7 @@ def terminate_rpc_servers():
 
 aot_host_target = tvm.testing.parameter(
     "c",
-    "llvm -keys=hexagon -link-params=0 "
+    "llvm -keys=hexagon "
     "-mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp "
     "-mcpu=hexagonv68 -mtriple=hexagon",
 )
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index f3de1a085692..6cdc79ceb587 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -274,69 +274,6 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo
     return _build_module_no_factory_impl(mod, target, target_host, params, mod_name)
 
 
-def _reconstruct_from_deprecated_options(deprecated_params_target):
-    executor = None
-    runtime = None
-
-    deprecated_executor = None
-    deprecated_executor_args = {}
-    if "executor" in deprecated_params_target.attrs:
-        _deprecated_target_param_warning("Executor", "executor")
-        deprecated_executor = deprecated_params_target.attrs.get("executor", "graph")
-    if "interface-api" in deprecated_params_target.attrs:
-        _deprecated_target_sub_param_warning("Executor", "interface-api")
-        deprecated_executor_args.update(
-            {"interface-api": deprecated_params_target.attrs["interface-api"]}
-        )
-    if "unpacked-api" in deprecated_params_target.attrs:
-        _deprecated_target_sub_param_warning("Executor", "unpacked-api")
-        deprecated_executor_args.update(
-            {"unpacked-api": deprecated_params_target.attrs["unpacked-api"]}
-        )
-    if (
-        "link-params" in deprecated_params_target.attrs
-        and deprecated_params_target.attrs["link-params"]
-    ):
-        _deprecated_target_sub_param_warning("Executor", "link-params")
-        if deprecated_executor != "aot":
-            deprecated_executor_args.update(
-                {"link-params": deprecated_params_target.attrs["link-params"]}
-            )
-    if deprecated_executor or deprecated_executor_args:
-        executor = Executor(deprecated_executor or "graph", deprecated_executor_args)
-
-    deprecated_runtime = None
-    deprecated_runtime_args = {}
-    if "runtime" in deprecated_params_target.attrs:
-        _deprecated_target_param_warning("Runtime", "runtime")
-        deprecated_runtime = deprecated_params_target.attrs.get("runtime", "cpp")
-        if deprecated_runtime == "c":
-            deprecated_runtime = "crt"
-    if "system-lib" in deprecated_params_target.attrs:
-        _deprecated_target_sub_param_warning("Runtime", "system-lib")
-        deprecated_runtime_args.update({"system-lib": deprecated_params_target.attrs["system-lib"]})
-    if deprecated_runtime or deprecated_runtime_args:
-        runtime = Runtime(deprecated_runtime or "cpp", deprecated_runtime_args)
-
-    return executor, runtime
-
-
-def _deprecated_target_param_warning(registry, param):
-    warnings.warn(
-        f"Please use {registry} (tvm.relay.backend.{registry}) "
-        f"instead of deprecated Target parameter -{param}",
-        DeprecationWarning,
-    )
-
-
-def _deprecated_target_sub_param_warning(registry, param):
-    warnings.warn(
-        f"Please use {registry} (tvm.relay.backend.{registry}) parameter {param} "
-        f"instead of deprecated Target parameter -{param}",
-        DeprecationWarning,
-    )
-
-
 def build(
     ir_mod,
     target=None,
@@ -415,17 +352,6 @@ def build(
     assert len(raw_targets) > 0
     target_host = raw_targets[0].host
 
-    # All of this logic is to raise deprecation warnings for various parameters
-    # TODO(Mousius) Remove these after some time
-    deprecated_params_target = target_host or list(raw_targets)[0]
-    deprecated_executor, deprecated_runtime = _reconstruct_from_deprecated_options(
-        deprecated_params_target
-    )
-    if deprecated_executor:
-        executor = deprecated_executor
-    if deprecated_runtime:
-        runtime = deprecated_runtime
-
     # If current dispatch context is fallback context (the default root context),
     # then load pre-tuned parameters from TopHub
     if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
@@ -756,9 +682,5 @@ def create_executor(kind="debug", mod=None, device=None, target="llvm", params=N
     if kind == "vm":
         return VMExecutor(mod, device, raw_targets)
     if kind == "aot":
-        # The AOT requires the executor as a target attribute.
-        # (The compilation paths for the other executors currently do not always provide this
-        # attribute, hence the above generic assert is more forgiving).
-        assert "executor" in raw_targets[0].attrs
         return AotExecutor(mod, device, raw_targets)
     raise RuntimeError("unknown execution strategy: {0}".format(kind))
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index e0e5f0177b5e..a558fcbeaf5b 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -636,8 +636,6 @@ def hexagon(cpu_ver="v66", **kwargs):
         Whether to use QFloat HVX instructions.
     use_ieee_fp : bool (default: False)
         Whether to use IEEE HVX instructions
-    link_params : bool (default: False)
-        Whether to link graph parameters into the LLVM module.
 
     Note: Floating point support in HVX requires LLVM 14+.
     """
@@ -671,7 +669,6 @@ def get_arch_version(cpu_ver):
         "llvm_options": None,
         "use_qfloat": arch_version >= 68,
         "use_ieee_fp": False,
-        "link_params": False,
     }
     config.update(kwargs)
 
@@ -738,24 +735,10 @@ def create_llvm_options(cpu_ver, config):  # pylint: disable=unused-argument
         args = [s.replace("=", "@") for s in llvm_options.split()]
         return "--llvm-options=" + ",".join(args)
 
-    # TVM target attributes string
-    def create_tvm_options(cpu_ver, config):  # pylint: disable=unused-argument
-        """Create TVM target features string."""
-
-        features = {
-            "link_params": "link-params",
-        }
-        opts = ""
-        for k in config:
-            if k in features:
-                opts += " --" + features[k] + "=" + str(config[k])
-        return opts
-
     target_str = create_llvm_target(cpu_ver, config)
     llvm_str = create_llvm_options(cpu_ver, config)
-    tvm_str = create_tvm_options(cpu_ver, config)
 
-    args_list = target_str.split() + llvm_str.split() + tvm_str.split()
+    args_list = target_str.split() + llvm_str.split()
 
     return Target(" ".join(["hexagon"] + args_list))
 
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index 38ee536e7818..e3b2d7b096fd 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -264,12 +264,7 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU)
     .add_attr_option<String>("mtriple")
     .add_attr_option<String>("mfloat-abi")
     .add_attr_option<String>("mabi")
-    .add_attr_option<Bool>("system-lib")
-    .add_attr_option<String>("runtime")
     .add_attr_option<Integer>("num-cores")
-    .add_attr_option<Bool>("link-params", Bool(false))
-    .add_attr_option<Bool>("unpacked-api")
-    .add_attr_option<String>("interface-api")
     // Fast math flags, see https://llvm.org/docs/LangRef.html#fast-math-flags
     .add_attr_option<Bool>("fast-math")  // implies all the below
     .add_attr_option<Bool>("fast-math-nnan")
@@ -310,23 +305,16 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU)
 // Hence the type is "uint".
 
 TVM_REGISTER_TARGET_KIND("c", kDLCPU)
-    .add_attr_option<Bool>("system-lib")
-    .add_attr_option<Bool>("link-params", Bool(false))
-    .add_attr_option<String>("runtime")
     .add_attr_option<String>("mcpu")
     .add_attr_option<String>("march")
-    .add_attr_option<String>("executor")
     .add_attr_option<Integer>("workspace-byte-alignment")
     .add_attr_option<Integer>("constants-byte-alignment")
-    .add_attr_option<Bool>("unpacked-api")
-    .add_attr_option<String>("interface-api")
     .set_default_keys({"cpu"})
     .set_target_parser(tvm::target::parsers::cpu::ParseTarget);
 
 TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA)
     .add_attr_option<String>("mcpu")
     .add_attr_option<String>("arch")
-    .add_attr_option<Bool>("system-lib")
     .add_attr_option<Integer>("max_shared_memory_per_block")
     .add_attr_option<Integer>("max_threads_per_block")
     .add_attr_option<Integer>("thread_warp_size", Integer(32))
@@ -338,7 +326,6 @@ TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA)
 TVM_REGISTER_TARGET_KIND("nvptx", kDLCUDA)
     .add_attr_option<String>("mcpu")
     .add_attr_option<String>("mtriple")
-    .add_attr_option<Bool>("system-lib")
     .add_attr_option<Integer>("max_num_threads", Integer(1024))
     .add_attr_option<Integer>("thread_warp_size", Integer(32))
     .set_default_keys({"cuda", "gpu"})
@@ -348,7 +335,6 @@ TVM_REGISTER_TARGET_KIND("rocm", kDLROCM)
     .add_attr_option<String>("mcpu")
     .add_attr_option<String>("mtriple")
     .add_attr_option<Array<String>>("mattr")
-    .add_attr_option<Bool>("system-lib")
     // TODO(masahi): Support querying from a target device
     // On RDNA cards, thread_warp_size should be 32
     .add_attr_option<Integer>("max_num_threads", Integer(256))
@@ -359,7 +345,6 @@ TVM_REGISTER_TARGET_KIND("rocm", kDLROCM)
     .set_target_parser(UpdateROCmAttrs);
 
 TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL)
-    .add_attr_option<Bool>("system-lib")
     .add_attr_option<Integer>("max_num_threads", Integer(256))
     .add_attr_option<Integer>("thread_warp_size", Integer(1))
     .add_attr_option<Integer>("texture_spatial_limit", Integer(16384))
@@ -370,7 +355,6 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL)
 // information about this limitation can be found here:
 // https://developer.apple.com/documentation/metal/buffers/about_argument_buffers?language=objc
 TVM_REGISTER_TARGET_KIND("metal", kDLMetal)
-    .add_attr_option<Bool>("system-lib")
     .add_attr_option<Integer>("max_num_threads", Integer(256))
     .add_attr_option<Integer>("thread_warp_size", Integer(16))
     .add_attr_option<Integer>("max_function_args", Integer(31))
@@ -378,7 +362,6 @@ TVM_REGISTER_TARGET_KIND("metal", kDLMetal)
 
 TVM_REGISTER_TARGET_KIND("vulkan", kDLVulkan)
     .add_attr_option<Array<String>>("mattr")
-    .add_attr_option<Bool>("system-lib")
     // Feature support
     .add_attr_option<Bool>("supports_float16")
     .add_attr_option<Bool>("supports_float32", Bool(true))
@@ -417,39 +400,30 @@ TVM_REGISTER_TARGET_KIND("vulkan", kDLVulkan)
     .set_default_keys({"vulkan", "gpu"});
 
 TVM_REGISTER_TARGET_KIND("webgpu", kDLWebGPU)
-    .add_attr_option<Bool>("system-lib")
     .add_attr_option<Integer>("max_num_threads", Integer(256))
     .set_default_keys({"webgpu", "gpu"});
 
-TVM_REGISTER_TARGET_KIND("sdaccel", kDLOpenCL)
-    .add_attr_option<Bool>("system-lib")
+TVM_REGISTER_TARGET_KIND("sdaccel", kDLOpenCL)  // line break
     .set_default_keys({"sdaccel", "hls"});
 
-TVM_REGISTER_TARGET_KIND("aocl", kDLAOCL)
-    .add_attr_option<Bool>("system-lib")
+TVM_REGISTER_TARGET_KIND("aocl", kDLAOCL)  // line break
     .set_default_keys({"aocl", "hls"});
 
-TVM_REGISTER_TARGET_KIND("aocl_sw_emu", kDLAOCL)
-    .add_attr_option<Bool>("system-lib")
+TVM_REGISTER_TARGET_KIND("aocl_sw_emu", kDLAOCL)  // line break
     .set_default_keys({"aocl", "hls"});
 
 TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon)
     .add_attr_option<Array<String>>("mattr")
     .add_attr_option<String>("mcpu")
     .add_attr_option<String>("mtriple")
-    .add_attr_option<Bool>("system-lib")
-    .add_attr_option<Bool>("link-params", Bool(false))
     .add_attr_option<Array<String>>("llvm-options")
     .set_default_keys({"hexagon"});
 
-TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU)  // line break
-    .add_attr_option<Bool>("system-lib");
+TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU);
 
-TVM_REGISTER_TARGET_KIND("ext_dev", kDLExtDev)  // line break
-    .add_attr_option<Bool>("system-lib");
+TVM_REGISTER_TARGET_KIND("ext_dev", kDLExtDev);
 
-TVM_REGISTER_TARGET_KIND("hybrid", kDLCPU)  // line break
-    .add_attr_option<Bool>("system-lib");
+TVM_REGISTER_TARGET_KIND("hybrid", kDLCPU);
 
 TVM_REGISTER_TARGET_KIND("composite", kDLCPU)  // line break
     .add_attr_option<Array<Target>>("devices");
diff --git a/tests/cpp/c_codegen_test.cc b/tests/cpp/c_codegen_test.cc
index 442f76a8cff3..e764d21505d4 100644
--- a/tests/cpp/c_codegen_test.cc
+++ b/tests/cpp/c_codegen_test.cc
@@ -33,7 +33,7 @@ TEST(CCodegen, MainFunctionOrder) {
 
   std::string tvm_module_main = std::string(runtime::symbol::tvm_module_main);
 
-  tvm::Target target_c = tvm::Target("c -keys=cpu -link-params=0");
+  tvm::Target target_c = tvm::Target("c -keys=cpu");
 
   const int n = 4;
   Array<PrimExpr> shape{n};
@@ -104,16 +104,16 @@ TEST(CCodegen, FunctionOrder) {
   using namespace tvm;
   using namespace tvm::te;
 
-  Target target = Target("c -keys=cpu -link-params=0");
+  Target target = Target("c -keys=cpu");
 
   // add schedules in reverse order
   Map<tvm::Target, IRModule> inputs;
-  inputs.Set(Target("c -keys=cpu -link-params=0"), BuildLowered("op_2", target));
-  inputs.Set(Target("c -keys=cpu -link-params=0"), BuildLowered("op_1", target));
+  inputs.Set(Target("c -keys=cpu"), BuildLowered("op_2", target));
+  inputs.Set(Target("c -keys=cpu"), BuildLowered("op_1", target));
 
   for (uint32_t counter = 99; IsSorted(inputs) && counter > 0; counter--) {
     std::string op_name = "op_" + std::to_string(counter);
-    inputs.Set(Target("c -keys=cpu -link-params=0"), BuildLowered(op_name, target));
+    inputs.Set(Target("c -keys=cpu"), BuildLowered(op_name, target));
   }
 
   EXPECT_FALSE(IsSorted(inputs));
diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc
index f238393ce923..37a8eeb44840 100644
--- a/tests/cpp/target_test.cc
+++ b/tests/cpp/target_test.cc
@@ -493,9 +493,8 @@ TEST(TargetCreation, DeduplicateKeys) {
   ICHECK_EQ(target->keys.size(), 2U);
   ICHECK_EQ(target->keys[0], "cpu");
   ICHECK_EQ(target->keys[1], "arm_cpu");
-  ICHECK_EQ(target->attrs.size(), 2U);
+  ICHECK_EQ(target->attrs.size(), 1U);
   ICHECK_EQ(target->GetAttr<String>("device"), "arm_cpu");
-  ICHECK_EQ(target->GetAttr<Bool>("link-params"), false);
 }
 
 TEST(TargetKindRegistry, ListTargetKinds) {
@@ -511,5 +510,4 @@ TEST(TargetKindRegistry, ListTargetOptions) {
 
   ICHECK_EQ(attrs["mattr"], "Array");
   ICHECK_EQ(attrs["mcpu"], "runtime.String");
-  ICHECK_EQ(attrs["system-lib"], "IntImm");
 }
diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py
index 9bbecdd7f81b..91b51cb5cc75 100644
--- a/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py
@@ -14,7 +14,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import pytest
 import numpy as np
 
 import tvm
diff --git a/tests/python/driver/tvmc/test_target.py b/tests/python/driver/tvmc/test_target.py
index 4438ec437cb4..39e90e6d6ac4 100644
--- a/tests/python/driver/tvmc/test_target.py
+++ b/tests/python/driver/tvmc/test_target.py
@@ -114,9 +114,7 @@ def test_parse_multiple_target():
 
 def test_parse_hybrid_target():
     """Hybrid Target and external codegen"""
-    targets = parse_target(
-        "cmsis-nn -accelerator_config=ethos-u55-256, llvm -device=arm_cpu --system-lib"
-    )
+    targets = parse_target("cmsis-nn -accelerator_config=ethos-u55-256, llvm -device=arm_cpu")
 
     assert len(targets) == 2
     assert "cmsis-nn" == targets[0]["name"]
@@ -154,7 +152,7 @@ def test_parse_quotes_and_separators_on_options():
 
 
 def test_parse_multiple_target_with_opts_ethos_n78():
-    targets = parse_target("ethos-n -myopt=value, llvm -device=arm_cpu --system-lib")
+    targets = parse_target("ethos-n -myopt=value, llvm -device=arm_cpu")
 
     assert len(targets) == 2
     assert "ethos-n" == targets[0]["name"]
diff --git a/tests/python/driver/tvmc/test_target_options.py b/tests/python/driver/tvmc/test_target_options.py
index c73dc288cdd8..891df86f0c1f 100644
--- a/tests/python/driver/tvmc/test_target_options.py
+++ b/tests/python/driver/tvmc/test_target_options.py
@@ -86,7 +86,7 @@ def test_skip_target_from_codegen():
 def test_target_recombobulation_single():
     tvm_target, _ = target_from_cli("llvm", {"llvm": {"mcpu": "cortex-m3"}})
 
-    assert str(tvm_target) == "llvm -keys=arm_cpu,cpu -link-params=0 -mcpu=cortex-m3"
+    assert str(tvm_target) == "llvm -keys=arm_cpu,cpu -mcpu=cortex-m3"
 
 
 def test_target_recombobulation_many():
diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py
index 3f641c995652..4ffe302763f8 100644
--- a/tests/python/relay/aot/test_cpp_aot.py
+++ b/tests/python/relay/aot/test_cpp_aot.py
@@ -169,7 +169,7 @@ def test_create_executor():
     x = tvm.relay.var("x", tvm.relay.TensorType([1], dtype="float32"))
     expr = tvm.relay.add(x, tvm.relay.Constant(tvm.nd.array(np.array([1], dtype="float32"))))
     actual = relay.create_executor(
-        "aot", mod=tvm.IRModule.from_expr(tvm.relay.Function([x], expr)), target="c -executor=aot"
+        "aot", mod=tvm.IRModule.from_expr(tvm.relay.Function([x], expr)), target="c"
     ).evaluate()(np.array([2], dtype="float32"))
 
     np.isfinite(np.array([3], dtype="float32"))
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index edf23ff22781..c3426f147e0d 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -710,42 +710,6 @@ def test_name_sanitiser_name_clash():
         )
 
 
-# This tests for deprecated AOT executor arguments
-# TODO(Mousius) Remove deprecated arguments later
-def test_deprecated_target_arguments():
-    """Tests we can still use relay.build with -executor, -runtime and -link-params"""
-
-    interface_api = "c"
-    use_unpacked_api = True
-    test_runner = AOT_DEFAULT_RUNNER
-
-    input_x = relay.var("x", shape=(1, 10))
-    input_y = relay.var("y", shape=(1, 10))
-    func_add = relay.add(input_x, input_y)
-    func = relay.Function([input_x, input_y], func_add)
-
-    x_in = np.ones((1, 10)).astype("float32")
-    y_in = np.random.uniform(size=(1, 10)).astype("float32")
-
-    params = {"x": x_in}
-    inputs = {"y": y_in}
-    output_list = generate_ref_data(func, inputs, params)
-
-    compile_and_run(
-        AOTTestModel(
-            module=IRModule.from_expr(func),
-            inputs=inputs,
-            outputs=output_list,
-            params=params,
-        ),
-        test_runner,
-        interface_api,
-        use_unpacked_api,
-        use_runtime_executor=False,
-        target="c -executor=aot --link-params -runtime=c -interface-api=c --unpacked-api",
-    )
-
-
 def test_aot_codegen_backend_alloc_workspace_calls():
     """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls"""
 
diff --git a/tests/python/relay/test_build_module.py b/tests/python/relay/test_build_module.py
index d51cfd29dc97..5cfc27330aff 100644
--- a/tests/python/relay/test_build_module.py
+++ b/tests/python/relay/test_build_module.py
@@ -22,48 +22,23 @@
 from tvm import relay
 from tvm.target.target import Target
 from tvm.relay.backend import Runtime, Executor, graph_executor_codegen
-from tvm.relay.build_module import _reconstruct_from_deprecated_options
 
 
 @pytest.mark.parametrize(
-    "target,executor,runtime",
+    "test_target,unsupported_config",
     [
-        [Target("c"), None, None],
-        [Target("c -runtime=c"), None, Runtime("crt")],
-        [Target("c -system-lib"), None, Runtime("cpp", {"system-lib": True})],
-        [Target("c -runtime=c -system-lib"), None, Runtime("crt", {"system-lib": True})],
-        [Target("c -executor=aot"), Executor("aot"), None],
-        [
-            Target("c -executor=aot -interface-api=c"),
-            Executor("aot", {"interface-api": "c"}),
-            None,
-        ],
-        [
-            Target("c -executor=aot -unpacked-api=1"),
-            Executor("aot", {"unpacked-api": 1}),
-            None,
-        ],
-        [Target("c -executor=aot -link-params=1"), Executor("aot"), None],
-        [Target("c -link-params=1"), Executor("graph", {"link-params": 1}), None],
-        [
-            Target(
-                "c -executor=aot -link-params=1 -interface-api=c"
-                "  -unpacked-api=1 -runtime=c -system-lib"
-            ),
-            Executor("aot", {"unpacked-api": 1, "interface-api": "c"}),
-            Runtime("crt", {"system-lib": True}),
-        ],
+        ["c", "-runtime=c"],
+        ["c", "-system-lib=1"],
+        ["c", "-executor=aot"],
+        ["c", "-interface-api=c"],
+        ["c", "-unpacked-api=1"],
+        ["c", "-link-params=1"],
     ],
 )
-def test_deprecated_target_parameters(target, executor, runtime):
-    actual_executor, actual_runtime = _reconstruct_from_deprecated_options(target)
-
-    assert (executor is None and actual_executor is None) or (executor.name == actual_executor.name)
-    # sort as TVM Map cannot guarantee round-trip order.
-    assert (executor is None and actual_executor is None) or (
-        sorted(executor.attrs.items()) == sorted(actual_executor.attrs.items())
-    )
-    assert runtime == actual_runtime
+def test_deprecated_target_parameters(test_target, unsupported_config):
+    with pytest.raises(ValueError) as e_info:
+        Target(f"{test_target} {unsupported_config}")
+        assert f"Cannot recognize '{unsupported_config}" in str(e_info.execption)
 
 
 def test_build_relay_graph_():
diff --git a/tests/python/unittest/test_tir_transform_common_subexpr_elim.py b/tests/python/unittest/test_tir_transform_common_subexpr_elim.py
index a546c16a648e..be229a580f01 100644
--- a/tests/python/unittest/test_tir_transform_common_subexpr_elim.py
+++ b/tests/python/unittest/test_tir_transform_common_subexpr_elim.py
@@ -449,7 +449,7 @@ def test_deterministic_cse():
 
 # Needed for the second test on determinism
 LOG_LINE = '{"i": [["[\\"conv2d_layer\\", 1, 7, 7, 512, 512, 3, 3, [1, 1], [1, 1]]", \
-            "llvm -keys=cpu -link-params=0 -mcpu=broadwell -num-cores=2", \
+            "llvm -keys=cpu -mcpu=broadwell -num-cores=2", \
             [8, 64, 64, 0, 0, 0, 0, 0], "", 1, []], [[], [["CI", 5], \
             ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 512, [1, 32, 16], 1], \
             ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 7, [1, 1, 1], 1], \
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index 0a2cec6011ef..e5f5ae752aac 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3087,9 +3087,7 @@ def func_with_target_spec_by_config() -> None:
                         "kind": "cuda",
                         "tag": "",
                         "keys": ["cuda", "gpu"],
-                        "host": T.target(
-                            {"kind": "llvm", "tag": "", "keys": ["cpu"], "link-params": False}
-                        ),
+                        "host": T.target({"kind": "llvm", "tag": "", "keys": ["cpu"]}),
                     }
                 )
             }
diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh
index 8b390c962e98..d8578fde2817 100755
--- a/tests/scripts/task_python_docs.sh
+++ b/tests/scripts/task_python_docs.sh
@@ -83,7 +83,7 @@ IGNORED_WARNINGS=(
     'strategy:depthwise_conv2d NHWC layout is not optimized for x86 with autotvm.'
     'strategy:depthwise_conv2d with layout NHWC is not optimized for arm cpu.'
     'strategy:dense is not optimized for arm cpu.'
-    'autotvm:Cannot find config for target=llvm -keys=cpu -link-params=0'
+    'autotvm:Cannot find config for target=llvm -keys=cpu'
     'autotvm:One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.'
     'autotvm:Cannot find config for target=cuda -keys=cuda,gpu'
     # Warning is thrown during TFLite quantization for micro_train tutorial

From 577826182ff5c0029348b66b8f977c29e21c4ad4 Mon Sep 17 00:00:00 2001
From: crawlingcub <86861129+crawlingcub@users.noreply.github.com>
Date: Wed, 24 Aug 2022 02:08:47 -0500
Subject: [PATCH 021/704] [PyTorch][Fix] Fix for numerically unstable
 logsigmoid (#12563)

* Fix numerical instability for log sigmoid

Fix numerical instability for log sigmoid in pytorch frontend

* update

* add test for overflow check

* merging two tests
---
 python/tvm/relay/frontend/pytorch.py          | 4 +++-
 tests/python/frontend/pytorch/test_forward.py | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 8ed94c2a81c9..04a25c86b799 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -911,7 +911,9 @@ def glu(self, inputs, input_types):
 
     def log_sigmoid(self, inputs, input_types):
         data = inputs[0]
-        return _op.log(_op.tensor.sigmoid(data))
+        mn = _op.minimum(_op.const(0, dtype=input_types[0]), data)
+        z = _op.exp(-_op.abs(data))
+        return mn - self.log1p([z], input_types)
 
     def cross_entropy_loss_with_logits(self, inputs, input_types):
         input = inputs[0]
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index a030c5141a31..7e00770cd593 100755
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -811,7 +811,9 @@ def test_forward_log_sigmoid():
     torch.set_grad_enabled(False)
     input_shape = [10, 10]
     input_data = torch.rand(input_shape).float()
+    input_data_overflow = torch.tensor([-300.0, -100.0]).float()
     verify_model(torch.nn.LogSigmoid().eval(), input_data=input_data)
+    verify_model(torch.nn.LogSigmoid().eval(), input_data=input_data_overflow)
 
 
 @tvm.testing.uses_gpu

From e468dc28eac3c78a3c70c2b1616c6345d4767eab Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 24 Aug 2022 08:10:59 +0100
Subject: [PATCH 022/704] [microNPU] Force compute_cycles_hint to be
 interpreted as an int64 value (#12558)

`compute_cycles` can be the size of an int64 value, however it seems
that when that value is attached to the IR as a pragma from Python,
it is interpreted as an `int`, rather than `int64_t`. This commit adds
an explicit cast to ensure the value is interpreted correctly.

The reason these values started appearing very large and randomly is
still yet to be solved, although the hope is that this fix will unblock
CI.

Change-Id: Idcdd7d37af1acd665590c87624446a025b50eb3d
---
 python/tvm/contrib/ethosu/cascader/scheduler.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py b/python/tvm/contrib/ethosu/cascader/scheduler.py
index 2c804a3b3b64..5ebc95d7ef88 100644
--- a/python/tvm/contrib/ethosu/cascader/scheduler.py
+++ b/python/tvm/contrib/ethosu/cascader/scheduler.py
@@ -154,7 +154,11 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) -> None:
 
                 # Attach AttrStmt directly to npu op so it isn't removed by ReplaceOperators
                 npu_op = part.subgraph.output_tensor.op.input_tensors[0].op.input_tensors[0]
-                sch[npu_op].pragma(npu_op.op.axis[0], "compute_cycles_hint", compute_cycles)
+                # Force the pragma to interpret the compute cycles as an int64 value
+                compute_cycles_int64_cast = tvm.tir.IntImm("int64", compute_cycles)
+                sch[npu_op].pragma(
+                    npu_op.op.axis[0], "compute_cycles_hint", compute_cycles_int64_cast
+                )
 
         output_tensor_config = plan.output_config
         output_tensor = output_tensor_config.tensor

From 90b2f0d36996be10d71f0c923f588c6dfa0e8546 Mon Sep 17 00:00:00 2001
From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com>
Date: Wed, 24 Aug 2022 08:13:30 +0100
Subject: [PATCH 023/704] [CI][CMSIS-NN] Running tests parallel using
 pytest-xdist (#12557)

Introducing -n auto for CMSIS-NN tests to run them in
parallel with pytest-xdist. This is needed because of
additional parameterization done over cpu variants.

Change-Id: I02e1b37ead0b0a562b5b1b2dacfeb3fdd7cc1ce3
---
 tests/scripts/task_python_microtvm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh
index e65f2253bb28..a2ef53a123bf 100755
--- a/tests/scripts/task_python_microtvm.sh
+++ b/tests/scripts/task_python_microtvm.sh
@@ -57,5 +57,5 @@ python3 gallery/how_to/work_with_microtvm/micro_aot.py
 
 run_pytest ctypes python-relay-strategy-arm_cpu tests/python/relay/strategy/arm_cpu --enable-corstone300-tests
 run_pytest ctypes python-integration-m7-simd tests/python/integration/test_arm_mprofile_dsp.py --enable-corstone300-tests
-run_pytest ctypes python-integration-contrib-test_cmsisnn tests/python/contrib/test_cmsisnn
+run_pytest ctypes python-integration-contrib-test_cmsisnn tests/python/contrib/test_cmsisnn -n auto
 run_pytest ctypes python-integration-contrib-test_ethosu tests/python/contrib/test_ethosu -n auto

From 989e5a11285503716c2033f4e56f1bba6b6d00c7 Mon Sep 17 00:00:00 2001
From: Nicola Lancellotti <nicola.lancellotti@arm.com>
Date: Wed, 24 Aug 2022 08:16:45 +0100
Subject: [PATCH 024/704] [ETHOSN] Add support for resize (#12535)

This commit adds support for the `resize` operator for
Arm(R) Ethos(TM)-N NPU.
---
 python/tvm/relay/op/contrib/ethosn.py         |  15 ++
 src/relay/backend/contrib/ethosn/codegen.cc   |  39 +++++
 .../backend/contrib/ethosn/codegen_ethosn.h   |   1 +
 .../backend/contrib/ethosn/ethosn_api.cc      |  40 ++++++
 src/relay/backend/contrib/ethosn/ethosn_api.h |   8 ++
 .../python/contrib/test_ethosn/test_resize.py | 134 ++++++++++++++++++
 6 files changed, 237 insertions(+)
 create mode 100644 tests/python/contrib/test_ethosn/test_resize.py

diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index eb753ef1391f..469939ecf0b8 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -176,6 +176,13 @@ def qnn_requantize_pattern():
         )
         return pattern
 
+    def qnn_resize_pattern():
+        pattern = is_op("image.resize2d")(wildcard()).has_attr({"method": "nearest_neighbor"})
+        pattern = is_op("qnn.requantize")(
+            pattern, is_constant(), is_constant(), is_constant(), is_constant()
+        )
+        return pattern
+
     def check_conv2d(extract):
         """Check if a conv2d is supported by Ethos-N."""
         if not ethosn_available():
@@ -232,6 +239,13 @@ def check_requantize(extract):
 
         return support.requantize(extract)
 
+    def check_resize(extract):
+        """Check if resize (nearest neighbor) is supported."""
+        if not ethosn_available():
+            return False
+
+        return support.resize(extract)
+
     return [
         ("ethos-n.qnn_conv2d", qnn_conv_pattern(), check_conv2d),
         ("ethos-n.qnn_avg_pool2d", qnn_avg_pool2d_pattern(), check_avg_pool2d),
@@ -240,6 +254,7 @@ def check_requantize(extract):
         ("ethos-n.qnn_mean", qnn_mean_pattern(), check_mean),
         ("ethos-n.qnn_tanh", qnn_tanh_pattern(), check_tanh),
         ("ethos-n.qnn_leaky_relu", qnn_leaky_relu_pattern(), check_leaky_relu),
+        ("ethos-n.qnn_resize", qnn_resize_pattern(), check_resize),
         ("ethos-n.qnn_requantize", qnn_requantize_pattern(), check_requantize),
     ]
 
diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index f5cce30e4521..bc4613b80155 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -148,6 +148,10 @@ void InferTensorsVisitor::InferCall(const CallNode* cn) {
     RequantizeParams params;
     err += EthosnAPI::Requantize(cn->op.as<FunctionNode>()->body, &params);
     tensor_table_[cn->args[0]] = {params.input_info};
+  } else if (IsEthosnFunc(call, "ethos-n.qnn_resize")) {
+    ResizeParams params;
+    err += EthosnAPI::Resize(cn->op.as<FunctionNode>()->body, &params);
+    tensor_table_[cn->args[0]] = {params.input_info};
   } else {
     err = EthosnError("unknown operator");
   }
@@ -322,6 +326,9 @@ sl::TensorsAndId ConstructNetworkVisitor::HandleCall(const CallNode* cn) {
   } else if (IsEthosnFunc(call, "ethos-n.qnn_requantize")) {
     if ((err = MakeRequantizeLayer(call, &tensor))) ReportFatalError(call, err);
     return MakeOps(tensor);
+  } else if (IsEthosnFunc(call, "ethos-n.qnn_resize")) {
+    if ((err = MakeResizeLayer(call, &tensor))) ReportFatalError(call, err);
+    return MakeOps(tensor);
   } else {
     ReportFatalError(call, EthosnError("unknown operator"));
     return {};
@@ -622,6 +629,24 @@ EthosnError ConstructNetworkVisitor::MakeRequantizeLayer(const Call& call,
   return EthosnError();
 }
 
+EthosnError ConstructNetworkVisitor::MakeResizeLayer(const Call& call,
+                                                     sl::TensorAndId<sl::Operand>* out) {
+  ResizeParams params;
+  params.input_info = GetTensorInfo(tensor_table_, call);
+  if (auto err = EthosnAPI::Resize(call->op.as<FunctionNode>()->body, &params)) {
+    return err;
+  }
+
+  auto input = operand_table_[call->args[0]][0];
+
+  try {
+    *out = AddResize(network_, *input, params.resize_info);
+  } catch (const sl::NotSupportedException& e) {
+    return EthosnError(e.what());
+  }
+  return EthosnError();
+}
+
 runtime::Module EthosnCompiler::CreateRuntimeModule(const ObjectRef& ref) {
   std::vector<runtime::ethosn::OrderedCompiledNetwork> cmms;
   if (ref->IsInstance<FunctionNode>()) {
@@ -958,6 +983,20 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.requantize")
       err += EthosnError(reason);
     });
 
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.resize")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      ResizeParams params;
+      auto err = EthosnAPI::Resize(call, &params);
+      err += EthosnCompiler::SupportedSetup();
+      char reason[kReasonMaxLength];
+      reason[0] = '\0';
+      *rv = !err &&
+            EthosnCompiler::GetSupported()->IsResizeSupported(
+                params.resize_info, params.input_info, &params.output_info, reason, sizeof(reason));
+      err += EthosnError(reason);
+    });
+
 TVM_REGISTER_GLOBAL("relay.ethos-n.query").set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
 #if defined ETHOSN_HW
   *rv = true;
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index 66aefab16d2d..863a032cafba 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -212,6 +212,7 @@ class ConstructNetworkVisitor : public MixedModeVisitor, private ErrorReportingP
   EthosnError MakeReluLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeLeakyReLULayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeRequantizeLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
+  EthosnError MakeResizeLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
 
   /*! \brief A look-up table from Expr to layers. */
   std::map<Expr, std::vector<std::shared_ptr<sl::Operand>>> operand_table_;
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc
index c1f67d0d2b16..ccca1779f6d9 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.cc
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc
@@ -23,6 +23,7 @@
 
 #include "ethosn_api.h"
 
+#include <tvm/relay/attrs/image.h>
 #include <tvm/relay/attrs/nn.h>
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
@@ -684,6 +685,45 @@ EthosnError EthosnAPI::Requantize(const Expr& expr, RequantizeParams* params) {
   return err;
 }
 
+EthosnError EthosnAPI::Resize(const Expr& expr, ResizeParams* params) {
+  Call requantize = Downcast<Call>(expr);
+  Call resize = Downcast<Call>(requantize->args[0]);
+
+  const auto* input_dtype = resize->args[0]->checked_type().as<TensorTypeNode>();
+  sl::TensorShape input_tensor_shape = {1, 1, 1, 1};
+  EthosnError err = Tvm2Npu(input_dtype->shape, &input_tensor_shape);
+  sl::DataType input_tensor_dtype;
+  err += Tvm2Npu(input_dtype->dtype, &input_tensor_dtype);
+  float input_sc;
+  int input_zp;
+  err += AsConstant(requantize->args[2], &input_zp);
+  err += AsConstant(requantize->args[1], &input_sc);
+  sl::QuantizationInfo input_q_info;
+  err += Tvm2Npu(input_zp, input_sc, &input_q_info);
+  params->input_info =
+      sl::TensorInfo(input_tensor_shape, input_tensor_dtype, sl::DataFormat::NHWC, input_q_info);
+
+  float output_sc;
+  int output_zp;
+  err += AsConstant(requantize->args[3], &output_sc);
+  err += AsConstant(requantize->args[4], &output_zp);
+  sl::QuantizationInfo resize_q_info;
+  err += Tvm2Npu(output_zp, output_sc, &resize_q_info);
+  const auto* attrs = resize->attrs.as<Resize2DAttrs>();
+  uint32_t height, width;
+  err += Tvm2Npu(attrs->size, &height, &width);
+  params->resize_info =
+      sl::ResizeInfo{sl::ResizeAlgorithm::NEAREST_NEIGHBOUR, height, width, resize_q_info};
+
+  sl::TensorInfo output_info = params->input_info;
+  output_info.m_Dimensions[1] = params->resize_info.m_NewHeight;
+  output_info.m_Dimensions[2] = params->resize_info.m_NewWidth;
+  output_info.m_QuantizationInfo = params->resize_info.m_OutputQuantizationInfo;
+  params->output_info = output_info;
+
+  return err;
+}
+
 EthosnError EthosnAPI::Tvm2Npu(const Array<IndexExpr>& padding, sl::Padding* npu_padding) {
   std::array<uint32_t, 4> dim;
   if (EthosnError err = AsArray<IndexExpr, uint32_t>(padding, &dim)) {
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.h b/src/relay/backend/contrib/ethosn/ethosn_api.h
index bb1cd29a5bc4..afe4736bfc40 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.h
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.h
@@ -146,6 +146,12 @@ struct RequantizeParams {
   sl::TensorInfo output_info;
 };
 
+struct ResizeParams {
+  sl::ResizeInfo resize_info;
+  sl::TensorInfo input_info;
+  sl::TensorInfo output_info;
+};
+
 /*!
  * \brief A wrapper around std::stringstream to build an EthosnError.
  */
@@ -241,6 +247,8 @@ class EthosnAPI {
   static EthosnError Relu(const Expr& expr, ReluParams* params);
   /*! \brief Extract the Support Library requantize params from a Relay qnn.requantize call */
   static EthosnError Requantize(const Expr& expr, RequantizeParams* params);
+  /*! \brief Extract the Support Library resize params from a Relay resize call */
+  static EthosnError Resize(const Expr& expr, ResizeParams* params);
 
  private:
   /*! \brief Convert a TVM IndexExpr array to a SL tensor shape */
diff --git a/tests/python/contrib/test_ethosn/test_resize.py b/tests/python/contrib/test_ethosn/test_resize.py
new file mode 100644
index 000000000000..b9d807d21926
--- /dev/null
+++ b/tests/python/contrib/test_ethosn/test_resize.py
@@ -0,0 +1,134 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Arm(R) Ethos(TM)-N integration resize tests"""
+
+import pytest
+import numpy as np
+import tvm
+from tvm import relay
+from tvm.testing import requires_ethosn
+from . import infrastructure as tei
+
+
+def _get_model(
+    shape,
+    dtype,
+    size,
+    input_zp,
+    input_sc,
+    output_zp,
+    output_sc,
+    coordinate_transformation_mode,
+    rounding_method,
+):
+    x = relay.var("x", shape=shape, dtype=dtype)
+    resize = relay.image.resize2d(
+        data=x,
+        size=size,
+        layout="NHWC",
+        method="nearest_neighbor",
+        coordinate_transformation_mode=coordinate_transformation_mode,
+        rounding_method=rounding_method,
+    )
+    model = relay.qnn.op.requantize(
+        resize,
+        input_scale=relay.const(input_sc, "float32"),
+        input_zero_point=relay.const(input_zp, "int32"),
+        output_scale=relay.const(output_sc, "float32"),
+        output_zero_point=relay.const(output_zp, "int32"),
+        out_dtype=dtype,
+    )
+    return model
+
+
+@requires_ethosn
+@pytest.mark.parametrize("dtype", ["uint8", "int8"])
+@pytest.mark.parametrize(
+    "shape, size, coordinate_transformation_mode, rounding_method",
+    [
+        ((1, 4, 4, 2), (8, 8), "half_pixel", "round_prefer_ceil"),
+        ((1, 4, 4, 2), (7, 7), "asymmetric", "floor"),
+        ((1, 4, 8, 3), (8, 16), "half_pixel", "round_prefer_ceil"),
+        ((1, 4, 8, 3), (7, 15), "asymmetric", "floor"),
+    ],
+)
+def test_resize(dtype, shape, size, coordinate_transformation_mode, rounding_method):
+    np.random.seed(0)
+    zp_min = np.iinfo(dtype).min
+    zp_max = np.iinfo(dtype).max
+    inputs = {
+        "x": tvm.nd.array(np.random.randint(zp_min, high=zp_max + 1, size=shape, dtype=dtype)),
+    }
+    outputs = []
+    for npu in [False, True]:
+        model = _get_model(
+            shape=shape,
+            dtype=dtype,
+            size=size,
+            input_zp=zp_min + 128,
+            input_sc=0.0784314,
+            output_zp=zp_min + 128,
+            output_sc=0.0784314,
+            coordinate_transformation_mode=coordinate_transformation_mode,
+            rounding_method=rounding_method,
+        )
+        mod = tei.make_module(model, {})
+        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu)
+        outputs.append(x)
+
+    tei.verify(outputs, dtype, 1)
+
+
+@requires_ethosn
+def test_resize_failure():
+    trials = [
+        (
+            (30, 20),
+            "Requested height isn't supported",
+        ),
+        (
+            (20, 30),
+            "Requested width isn't supported",
+        ),
+        (
+            (19, 20),
+            "Requested width and height must be both even or both odd",
+        ),
+        (
+            (20, 19),
+            "Requested width and height must be both even or both odd",
+        ),
+    ]
+    dtype = "int8"
+    zp_min = np.iinfo(dtype).min
+
+    for size, err_msg in trials:
+        model = _get_model(
+            shape=(1, 10, 10, 1),
+            dtype=dtype,
+            size=size,
+            input_zp=zp_min + 128,
+            input_sc=0.0784314,
+            output_zp=zp_min + 128,
+            output_sc=0.0784314,
+            coordinate_transformation_mode="half_pixel",
+            rounding_method="round_prefer_ceil",
+        )
+        model = tei.make_ethosn_composite(model, "ethos-n.qnn_resize")
+        mod = tei.make_ethosn_partition(model)
+        tei.test_error(mod, {}, err_msg)

From 1ec2c369128c9d57bb09087ab16cb3a2527dd9de Mon Sep 17 00:00:00 2001
From: wrongtest <wrongtest0@gmail.com>
Date: Wed, 24 Aug 2022 17:44:22 +0800
Subject: [PATCH 025/704] [TIR][CompactBufferAllocation] Improve upperbound
 estimation of buffer compaction (#12527)

Hi, this change wants to add some minor updation to region estimator used by buffer compaction:
- Add and clearify among `EstimateRegionStrictBound`, `EstimateRegionLowerBound` and `EstimateRegionUpperBound`

  Originally we have `EstimateRegionLowerBound`, actually it implements strict bound estimation IMO. Now add `upper` and `strict` version for where we actually want them.

- When estimating upperbounds (eg. in buffer compaction), try estimate each dimension independently when they are dependent accesses where `EstimateRegionLowerBound` is expected to fail.

  Eg, `A[i, i], 3 < i < 16`  fails via `EstimateRegionLowerBound` who check indices be independent. But we can still try best to invoke strict bound analysis on each dimension individually.

- If range->extent == 1 for `EvalSet(range, dom)`, invoke `EvalSet(range->min, dom)` instead.

  Eg, `EvalSet([k*k, k*k+1), dom_k)` results to [-inf, +inf] due to current algorithm limitation but  `EvalSet(k*k, dom_k)` results to a range which makes more sense.
---
 include/tvm/arith/int_set.h                   |  39 +-
 python/tvm/arith/__init__.py                  |   8 +-
 python/tvm/arith/int_set.py                   |  48 +++
 src/arith/int_set.cc                          | 131 +++++--
 src/tir/schedule/primitive/compute_at.cc      |   2 +-
 src/tir/schedule/state.cc                     |  14 +-
 src/tir/schedule/utils.h                      |  18 -
 src/tir/transforms/compact_buffer_region.cc   |   2 +-
 tests/python/unittest/test_arith_intset.py    | 354 ++++++++++--------
 ...est_tir_transform_compact_buffer_region.py | 100 +++++
 10 files changed, 496 insertions(+), 220 deletions(-)

diff --git a/include/tvm/arith/int_set.h b/include/tvm/arith/int_set.h
index 7cc4efe6b012..5ef7108d9797 100644
--- a/include/tvm/arith/int_set.h
+++ b/include/tvm/arith/int_set.h
@@ -261,7 +261,29 @@ Array<IntSet> UnionRegionLowerBound(const Array<Array<IntSet>>& nd_int_sets);
 IntSet Intersect(const Array<IntSet>& sets);
 
 /*!
- * \brief Analyze the region with affine map, given the domain of variables and their predicate
+ * \brief Converts the Ranges to IntSets
+ * \param var_dom The ranges of variables
+ * \return The integer sets of the variables
+ */
+Map<Var, arith::IntSet> AsIntSet(const Map<Var, Range>& var_dom);
+
+/*!
+ * \brief Analyze the region with affine map, given the domain of variables and their predicate.
+ * The result should be strict, i.e. no region is discarded or relaxed.
+ * \param region The region to be analyzed
+ * \param var_dom The ranges of the variables
+ * \param predicate The predicate for the affine map
+ * \param analyzer The analyzer used
+ * \return NullOpt if the detection fails, or an array of arith::IntSet as the result of analysis
+ */
+TVM_DLL Optional<Array<IntSet>> EstimateRegionStrictBound(const Array<Range>& region,
+                                                          const Map<Var, Range>& var_dom,
+                                                          const PrimExpr& predicate,
+                                                          arith::Analyzer* analyzer);
+
+/*!
+ * \brief Analyze the region with affine map, given the domain of variables and their predicate.
+ * Some subregion may be discarded during the lower-bound analysis.
  * \param region The region to be analyzed
  * \param var_dom The ranges of the variables
  * \param predicate The predicate for the affine map
@@ -273,6 +295,21 @@ TVM_DLL Optional<Array<IntSet>> EstimateRegionLowerBound(const Array<Range>& reg
                                                          const PrimExpr& predicate,
                                                          arith::Analyzer* analyzer);
 
+/*!
+ * \brief Analyze the region with affine map, given the domain of variables and their predicate
+ * Relaxation of the region may be used in upper-bound analysis, i.e. some extra region may be added
+ * to the result.
+ * \param region The region to be analyzed
+ * \param var_dom The ranges of the variables
+ * \param predicate The predicate for the affine map
+ * \param analyzer The analyzer used
+ * \return an array of arith::IntSet as the result of analysis
+ */
+TVM_DLL Array<IntSet> EstimateRegionUpperBound(const Array<Range>& region,
+                                               const Map<Var, Range>& var_dom,
+                                               const PrimExpr& predicate,
+                                               arith::Analyzer* analyzer);
+
 }  // namespace arith
 }  // namespace tvm
 #endif  // TVM_ARITH_INT_SET_H_
diff --git a/python/tvm/arith/__init__.py b/python/tvm/arith/__init__.py
index f5a0478dc008..03c0769850c9 100644
--- a/python/tvm/arith/__init__.py
+++ b/python/tvm/arith/__init__.py
@@ -16,7 +16,13 @@
 # under the License.
 """Integer bound analysis, simplification and pattern detection."""
 
-from .int_set import IntSet, IntervalSet, estimate_region_lower_bound
+from .int_set import (
+    IntSet,
+    IntervalSet,
+    estimate_region_lower_bound,
+    estimate_region_strict_bound,
+    estimate_region_upper_bound,
+)
 from .analyzer import ModularSet, ConstIntBound, Analyzer
 from .bound import deduce_bound
 from .pattern import detect_linear_equation, detect_clip_bound
diff --git a/python/tvm/arith/int_set.py b/python/tvm/arith/int_set.py
index b5f2100b7c7d..151461bcaf9f 100644
--- a/python/tvm/arith/int_set.py
+++ b/python/tvm/arith/int_set.py
@@ -83,6 +83,7 @@ def __init__(self, min_value, max_value):
 
 def estimate_region_lower_bound(region, var_dom, predicate):
     """Analyze the region with affine map, given the domain of variables and their predicate
+    Some subregion may be discarded during the lower-bound analysis.
 
     Parameters
     ----------
@@ -103,6 +104,53 @@ def estimate_region_lower_bound(region, var_dom, predicate):
     return _ffi_api.EstimateRegionLowerBound(region, var_dom, predicate)
 
 
+def estimate_region_strict_bound(region, var_dom, predicate):
+    """Analyze the region with affine map, given the domain of variables and their predicate
+    The result should be strict, i.e. no region is discarded or relaxed.
+
+    Parameters
+    ----------
+    region : List[Range]
+        The region to be analyzed.
+
+    var_dom : Dict[Var, Range]
+        The ranges of the variables
+
+    predicate : PrimExpr
+        The predicate for the affine map
+
+    Returns
+    ----------
+    region_int_set : Optional[List[IntSet]]
+        None if the detection fails, or an array of IntSets as the result of analysis
+    """
+    return _ffi_api.EstimateRegionStrictBound(region, var_dom, predicate)
+
+
+def estimate_region_upper_bound(region, var_dom, predicate):
+    """Analyze the region with affine map, given the domain of variables and their predicate
+    Relaxation of the region may be used in upper-bound analysis,
+    i.e. some extra region may be added to the result.
+
+    Parameters
+    ----------
+    region : List[Range]
+        The region to be analyzed.
+
+    var_dom : Dict[Var, Range]
+        The ranges of the variables
+
+    predicate : PrimExpr
+        The predicate for the affine map
+
+    Returns
+    ----------
+    region_int_set : List[IntSet]
+        an array of IntSets as the result of analysis
+    """
+    return _ffi_api.EstimateRegionUpperBound(region, var_dom, predicate)
+
+
 def pos_inf():
     """Returns the symbolic positive infinity
 
diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc
index 584bbe8f04ea..e8e223ceca09 100644
--- a/src/arith/int_set.cc
+++ b/src/arith/int_set.cc
@@ -975,6 +975,9 @@ IntSet EvalSet(PrimExpr e, const std::unordered_map<const VarNode*, IntSet>& dom
 
 IntSet EvalSet(Range r, const Map<Var, IntSet>& dom_map) {
   Analyzer ana;
+  if ((r->min->dtype.is_int() || r->min->dtype.is_uint()) && ana.CanProveEqual(r->extent, 1)) {
+    return EvalSet(r->min, dom_map);
+  }
   IntervalSetEvaluator m(&ana, dom_map);
   // Simplifying first can give tighter bounds if r->min and r->extent share variables
   PrimExpr sum = r->min + r->extent - 1;
@@ -1035,15 +1038,57 @@ IntSet EvalSet(Range r, const Map<IterVar, IntSet>& dom_map) {
   return EvalSet(r, ConvertDomMap(dom_map));
 }
 
-Optional<Array<IntSet>> EstimateRegionLowerBound(const Array<Range>& region,
-                                                 const Map<Var, Range>& var_dom,
-                                                 const PrimExpr& predicate, Analyzer* analyzer) {
+Map<Var, arith::IntSet> AsIntSet(const Map<Var, Range>& var_dom) {
+  Map<Var, arith::IntSet> result;
+  for (auto kv : var_dom) {
+    const Var& var = kv.first;
+    const Range& range = kv.second;
+    result.Set(var, arith::IntSet::FromRange(range));
+  }
+  return result;
+}
+
+/*! \brief Helper function to convert IterSumExpr to the actual touched range. */
+static Optional<IntSet> EvalIterSum(const IterSumExpr& iter_min, const PrimExpr& extent,
+                                    Analyzer* analyzer) {
+  if (iter_min->args.empty()) {
+    return IntSet::FromMinExtent(iter_min->base, extent);
+  }
+  ICHECK_EQ(iter_min->args.size(), 1) << "The `EvalIterSum` expects fused iter sum expr";
+  const IterSplitExpr& split = iter_min->args[0];
+  if (!analyzer->CanProve(extent >= split->scale)) {
+    return NullOpt;
+  }
+
+  const PrimExpr& base = iter_min->base;
+  // IterSplitExpr: (source // lower_factor) % extent * scale
+  // where `(source // lower_factor) % extent` is within [0, extent - 1]
+  if (analyzer->CanProve(split->scale < 0)) {
+    // If scale is negative, the var dom is [(extent - 1) * scale, 0]
+    // The total base is `base + (extent - 1) * scale`,
+    // while total extent is `dom_extent + (extent - 1) * (-scale)`
+    const PrimExpr& var_extent = (split->extent - 1) * split->scale;
+    return IntSet::FromMinExtent(base + var_extent, extent - var_extent);
+  } else {
+    // If scale is positive, the var dom is [0, (extent - 1) * scale]
+    // The total dom is [base, dom_extent + (extent - 1) * scale]
+    return IntSet::FromMinExtent(base, extent + (split->extent - 1) * split->scale);
+  }
+}
+
+Optional<Array<IntSet>> EstimateRegionStrictBound(const Array<Range>& region,
+                                                  const Map<Var, Range>& var_dom,
+                                                  const PrimExpr& predicate, Analyzer* analyzer) {
   int ndim = region.size();
   Array<IterSumExpr> iter_sum_exprs{nullptr};
   {
     Array<PrimExpr> affine_indices;
     affine_indices.reserve(ndim);
     for (const Range& range : region) {
+      if (!is_const_number(range->extent)) {
+        // dynamic extent is not supported yet.
+        return NullOpt;
+      }
       affine_indices.push_back(range->min);
     }
     auto res = DetectIterMap(
@@ -1060,31 +1105,57 @@ Optional<Array<IntSet>> EstimateRegionLowerBound(const Array<Range>& region,
   for (int i = 0; i < ndim; ++i) {
     const IterSumExpr& sum_expr = iter_sum_exprs[i];
     const Range& range = region[i];
-    if (sum_expr->args.empty()) {
-      result.push_back(IntSet::FromMinExtent(sum_expr->base, range->extent));
-      continue;
-    }
-    ICHECK_EQ(sum_expr->args.size(), 1);
-    const IterSplitExpr& split = sum_expr->args[0];
-    if (!analyzer->CanProve(range->extent >= split->scale)) {
+    Optional<IntSet> int_set = EvalIterSum(sum_expr, range->extent, analyzer);
+    if (int_set.defined()) {
+      result.push_back(int_set.value());
+    } else {
       return NullOpt;
     }
+  }
+  return result;
+}
 
-    const PrimExpr& base = sum_expr->base;
-    // IterSplitExpr: (source // lower_factor) % extent * scale
-    // where `(source // lower_factor) % extent` is within [0, extent - 1]
-    if (analyzer->CanProve(split->scale < 0)) {
-      // If scale is negative, the var dom is [(extent - 1) * scale, 0]
-      // The total base is `base + (extent - 1) * scale`,
-      // while total extent is `dom_extent + (extent - 1) * (-scale)`
-      const PrimExpr& var_extent = (split->extent - 1) * split->scale;
-      result.push_back(IntSet::FromMinExtent(base + var_extent, range->extent - var_extent));
-    } else {
-      // If scale is positive, the var dom is [0, (extent - 1) * scale]
-      // The total dom is [base, dom_extent + (extent - 1) * scale]
-      result.push_back(
-          IntSet::FromMinExtent(base, range->extent + (split->extent - 1) * split->scale));
+Optional<Array<IntSet>> EstimateRegionLowerBound(const Array<Range>& region,
+                                                 const Map<Var, Range>& var_dom,
+                                                 const PrimExpr& predicate,
+                                                 arith::Analyzer* analyzer) {
+  return EstimateRegionStrictBound(region, var_dom, predicate, analyzer);
+}
+
+Array<IntSet> EstimateRegionUpperBound(const Array<Range>& region, const Map<Var, Range>& var_dom,
+                                       const PrimExpr& predicate, Analyzer* analyzer) {
+  if (Optional<Array<arith::IntSet>> result = EstimateRegionStrictBound(
+          /*region=*/region,
+          /*var_dom=*/var_dom,
+          /*predicate=*/predicate, /*analyzer=*/analyzer)) {
+    return result.value();
+  }
+  Array<IntSet> result;
+  result.reserve(region.size());
+  // try estimate each dimension independently
+  for (const Range& range : region) {
+    auto res = DetectIterMap(
+        /*indices=*/{range->min}, /*input_iters=*/var_dom,
+        /*predicate=*/predicate, /*check_level=*/IterMapLevel::Surjective, analyzer);
+    if (!res->indices.empty()) {
+      ICHECK_EQ(res->indices.size(), 1U);
+      IterSumExpr sum_expr = res->indices[0];
+
+      // dynamic extent is not supported yet.
+      PrimExpr extent = range->extent;
+      if (!is_const_number(extent)) {
+        IntSet relaxed = EvalSet(extent, AsIntSet(var_dom));
+        ICHECK(relaxed.HasUpperBound());
+        extent = relaxed.max();
+      }
+
+      if (Optional<IntSet> int_set = EvalIterSum(sum_expr, range->extent, analyzer)) {
+        result.push_back(int_set.value());
+        continue;
+      }
     }
+    // fallback to coarse grained evalset
+    result.push_back(EvalSet(range, AsIntSet(var_dom)));
   }
   return result;
 }
@@ -1118,6 +1189,18 @@ TVM_REGISTER_GLOBAL("arith.EstimateRegionLowerBound")
       Analyzer analyzer;
       return EstimateRegionLowerBound(region, var_dom, predicate, &analyzer);
     });
+TVM_REGISTER_GLOBAL("arith.EstimateRegionStrictBound")
+    .set_body_typed([](Array<Range> region, Map<Var, Range> var_dom,
+                       PrimExpr predicate) -> Optional<Array<IntSet>> {
+      Analyzer analyzer;
+      return EstimateRegionStrictBound(region, var_dom, predicate, &analyzer);
+    });
+TVM_REGISTER_GLOBAL("arith.EstimateRegionUpperBound")
+    .set_body_typed([](Array<Range> region, Map<Var, Range> var_dom,
+                       PrimExpr predicate) -> Optional<Array<IntSet>> {
+      Analyzer analyzer;
+      return EstimateRegionUpperBound(region, var_dom, predicate, &analyzer);
+    });
 
 TVM_REGISTER_GLOBAL("arith.PosInf").set_body_typed([]() { return SymbolicLimits::pos_inf_; });
 TVM_REGISTER_GLOBAL("arith.NegInf").set_body_typed([]() { return SymbolicLimits::neg_inf_; });
diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc
index 7b0d749f03dc..98a6b2400ee3 100644
--- a/src/tir/schedule/primitive/compute_at.cc
+++ b/src/tir/schedule/primitive/compute_at.cc
@@ -356,7 +356,7 @@ void RelaxBufferRegions(const Map<Var, PrimExpr>& binding,
     runtime::StorageRank rank = scope.rank;
     if (rank != previous_rank || !var_dom.defined()) {
       previous_rank = rank;
-      var_dom = AsIntSet(LoopDomainOfSRefTreePath(
+      var_dom = arith::AsIntSet(LoopDomainOfSRefTreePath(
           /*low_inclusive=*/relax_path_low_inclusive,
           /*high_exclusive=*/relax_path_high_exclusive,
           /*extra_relax_scope=*/scope));
diff --git a/src/tir/schedule/state.cc b/src/tir/schedule/state.cc
index dadabba48540..07481ddb19e3 100644
--- a/src/tir/schedule/state.cc
+++ b/src/tir/schedule/state.cc
@@ -16,8 +16,9 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#include "./utils.h"
+#include <tvm/arith/int_set.h>
 
+#include "./utils.h"
 namespace tvm {
 namespace tir {
 
@@ -44,13 +45,10 @@ Array<arith::IntSet> AnalyzeRegionUpperBound(const BufferRegion& region,
       /*low_inclusive=*/dom_low_inclusive,
       /*high_exclusive=*/dom_high_exclusive,
       /*extra_relax_scope=*/runtime::StorageScope::Create(region->buffer.scope()));
-  if (Optional<Array<arith::IntSet>> result = EstimateRegionLowerBound(
-          /*region=*/region->region,
-          /*var_dom=*/var_dom,
-          /*predicate=*/predicate, /*analyzer=*/analyzer)) {
-    return result.value();
-  }
-  return arith::EvalSet(region->region, AsIntSet(var_dom));
+  return EstimateRegionUpperBound(
+      /*region=*/region->region,
+      /*var_dom=*/var_dom,
+      /*predicate=*/predicate, /*analyzer=*/analyzer);
 }
 
 /*!
diff --git a/src/tir/schedule/utils.h b/src/tir/schedule/utils.h
index 53cafa798b54..3db80989ae10 100644
--- a/src/tir/schedule/utils.h
+++ b/src/tir/schedule/utils.h
@@ -249,24 +249,6 @@ inline bool IsThreadIdx(const runtime::ThreadScope& thread_scope) {
   return thread_scope.rank == 1 && thread_scope.dim_index >= 0;
 }
 
-/******** Integer set ********/
-
-/*!
- * \brief Converts the Ranges to IntSets
- * \param var_dom The ranges of variables
- * \return The integer sets of the variables
- */
-inline Map<Var, arith::IntSet> AsIntSet(const Map<Var, Range>& var_dom) {
-  std::unordered_map<Var, arith::IntSet, ObjectPtrHash, ObjectPtrEqual> result;
-  result.reserve(var_dom.size());
-  for (auto kv : var_dom) {
-    Var& var = kv.first;
-    Range& range = kv.second;
-    result.emplace(std::move(var), arith::IntSet::FromRange(std::move(range)));
-  }
-  return {result.begin(), result.end()};
-}
-
 /**************** Loop extents ****************/
 
 /*!
diff --git a/src/tir/transforms/compact_buffer_region.cc b/src/tir/transforms/compact_buffer_region.cc
index 2844f1b35e9e..249b8cca77b0 100644
--- a/src/tir/transforms/compact_buffer_region.cc
+++ b/src/tir/transforms/compact_buffer_region.cc
@@ -88,7 +88,7 @@ NDIntSet NDIntSetEval(Region region, PrimExpr predicate,
     var_dom[GetRef<Var>(it.first)] = it.second.CoverRange(Range::FromMinExtent(0, 0));
   }
   Optional<Array<arith::IntSet>> eval_res =
-      arith::EstimateRegionLowerBound(region, var_dom, predicate, analyzer);
+      arith::EstimateRegionUpperBound(region, var_dom, predicate, analyzer);
   if (eval_res.defined()) {
     return NDIntSet(eval_res.value().begin(), eval_res.value().end());
   }
diff --git a/tests/python/unittest/test_arith_intset.py b/tests/python/unittest/test_arith_intset.py
index 2302d0ed54f2..24228fb52703 100644
--- a/tests/python/unittest/test_arith_intset.py
+++ b/tests/python/unittest/test_arith_intset.py
@@ -15,9 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+import tvm.testing
 from tvm import te
 from tvm import tir
-from tvm.ir.base import structural_equal
+from tvm.arith.analyzer import Analyzer
 
 
 class IntSetChecker:
@@ -128,66 +129,139 @@ def test_select():
     ck.verify(tvm.tir.Select(x > 0, x - 1, x + 1), {x: tvm.arith.IntervalSet(0, 10)}, (-1, 11))
 
 
-def test_region_lower_bound_not_independent():
+def check_region_bound(expect_region, var_dom, mode, predicate=None):
+    """Helper to check region bound estimation.
+
+    Parameters
+    ----------
+    expect_region: dict
+        The keys are of form (begin, end) or PrimExpr as a single point. The values are
+        expected estimated region or region dict on different bindings.
+
+    var_dom: dict
+        Map var to iteration domain range.
+
+    mode: str
+        Specify "lowerbound", "upperbound" or else use strict bound estimation.
+
+    predicate: PrimExpr
+        Extra predicate, defaults to True.
+    """
+    if predicate is None:
+        predicate = tvm.tir.IntImm("bool", 1)
+    region = []
+    expect = []
+    for k, v in expect_region.items():
+        if not isinstance(k, (tuple, list)):
+            k = (k, k + 1)
+        region.append(tvm.ir.Range.from_min_extent(k[0], Analyzer().simplify(k[1] - k[0])))
+        expect.append(v)
+    if mode == "lowerbound":
+        result = tvm.arith.estimate_region_lower_bound(
+            region=region, var_dom=var_dom, predicate=predicate
+        )
+    elif mode == "upperbound":
+        result = tvm.arith.estimate_region_upper_bound(
+            region=region, var_dom=var_dom, predicate=predicate
+        )
+    else:
+        result = tvm.arith.estimate_region_strict_bound(
+            region=region, var_dom=var_dom, predicate=predicate
+        )
+    if result is None:
+        assert all([_ is None for _ in expect])
+        return
+    assert len(result) == len(expect)
+    for intset, expect_desc in zip(result, expect):
+        if isinstance(expect_desc, dict):
+            # check range on different free var bindings
+            for binding in expect_desc:
+                analyzer = Analyzer()
+                for k, v in binding:
+                    analyzer.bind(k, v)
+                expect_begin, expect_end = expect_desc[binding]
+                result_begin = analyzer.simplify(intset.min_value, 3)
+                result_end = analyzer.simplify(intset.max_value + 1, 3)
+                print(result_end)
+                assert analyzer.can_prove_equal(
+                    result_begin - expect_begin, 0
+                ), f"{result_begin} vs {expect_begin}"
+                assert analyzer.can_prove_equal(
+                    result_end - expect_end, 0
+                ), f"{result_end} vs {expect_end}"
+        else:
+            # check range
+            expect_begin, expect_end = expect_desc
+            analyzer = Analyzer()
+            assert analyzer.can_prove_equal(
+                intset.min_value - expect_begin, 0
+            ), f"{intset.min_value} vs {expect_begin}"
+            assert analyzer.can_prove_equal(
+                intset.max_value - expect_end + 1, 0
+            ), f"{intset.max_value} vs {expect_end - 1}"
+
+
+def test_region_bound_not_independent():
+    # (i, i+2) and (i+2, i+4) are dependent, this the lowerbound is not available
     i = tvm.tir.Var("i", "int32")
-    result = tvm.arith.estimate_region_lower_bound(
-        region=[
-            tvm.ir.Range(begin=i, end=i + 2),
-            tvm.ir.Range(begin=i + 1, end=i + 4),
-        ],
-        var_dom={
-            i: tvm.ir.Range(begin=0, end=64),
-        },
-        predicate=tvm.tir.IntImm("bool", 1),
+    var_dom = {
+        i: tvm.ir.Range(begin=0, end=64),
+    }
+    check_region_bound({(i, i + 2): None, (i + 2, i + 4): None}, var_dom, mode="lowerbound")
+    check_region_bound({(i, i + 2): (0, 65), (i + 2, i + 4): (2, 67)}, var_dom, mode="upperbound")
+
+    # when only a subset of access indices are affine
+    i, j, k = tvm.tir.Var("i", "int32"), tvm.tir.Var("j", "int32"), tvm.tir.Var("k", "int32")
+    var_dom = {
+        i: tvm.ir.Range(begin=0, end=16),
+        j: tvm.ir.Range(begin=0, end=16),
+        k: tvm.ir.Range(begin=0, end=16),
+    }
+    check_region_bound(
+        {i // 4: None, j * 4 + i % 4: None, tir.truncdiv(k, 2): None},
+        var_dom,
+        predicate=j * 4 + i % 4 > 3,
+        mode="lowerbound",
+    )
+    check_region_bound(
+        {i // 4: (0, 4), j * 4 + i % 4: (4, 64), tir.truncdiv(k, 2): (0, 8)},
+        var_dom,
+        predicate=j * 4 + i % 4 > 3,
+        mode="upperbound",
     )
-    assert result is None
 
 
-def test_region_lower_bound_stride_too_wide():
+def test_region_bound_stride_too_wide():
     i = tvm.tir.Var("i", "int32")
-    result = tvm.arith.estimate_region_lower_bound(
-        region=[
-            tvm.ir.Range(begin=i * 4, end=i * 4 + 2),
-        ],
-        var_dom={
-            i: tvm.ir.Range(begin=0, end=64),
-        },
-        predicate=tvm.tir.IntImm("bool", 1),
-    )
-    assert result is None
+    var_dom = {i: tvm.ir.Range(begin=0, end=64)}
+    check_region_bound({(i * 4, i * 4 + 2): None}, var_dom, mode="lowerbound")
+    check_region_bound({(i * 4, i * 4 + 2): (0, 254)}, var_dom, mode="upperbound")
 
 
-def test_region_lower_bound_small_stride():
+def test_region_bound_small_stride():
     i = tvm.tir.Var("i", "int32")
-    (result,) = tvm.arith.estimate_region_lower_bound(
-        region=[
-            tvm.ir.Range.from_min_extent(min_value=i * 4, extent=8),
-        ],
-        var_dom={
-            i: tvm.ir.Range(begin=0, end=64),
-        },
-        predicate=tvm.tir.IntImm("bool", 1),
-    )
-    assert result.min_value.value == 0
-    assert result.max_value.value == 259
+    var_dom = {
+        i: tvm.ir.Range(begin=0, end=64),
+    }
+    check_region_bound({(i * 4, i * 4 + 8): (0, 260)}, var_dom, mode="lowerbound")
 
 
 def test_region_lower_bound_split_predicate():
     x_o = tvm.tir.Var("xo", "int32")
     x_i = tvm.tir.Var("xi", "int32")
     x = x_o * 4 + x_i
-    (result,) = tvm.arith.estimate_region_lower_bound(
-        region=[
-            tvm.ir.Range.from_min_extent(min_value=x * 4, extent=8),
-        ],
-        var_dom={
-            x_o: tvm.ir.Range(begin=0, end=16),
-            x_i: tvm.ir.Range(begin=0, end=4),
-        },
+    var_dom = {
+        x_o: tvm.ir.Range(begin=0, end=16),
+        x_i: tvm.ir.Range(begin=0, end=4),
+    }
+    check_region_bound({(x * 4, x * 4 + 8): (0, 256)}, var_dom, predicate=x < 63, mode="lowerbound")
+
+    check_region_bound(
+        {(x * 4, x * 4 + 8): (0, 256), (x * 3, x * 3 + 5): (0, 191)},
+        var_dom,
         predicate=x < 63,
+        mode="upperbound",
     )
-    assert result.min_value.value == 0
-    assert result.max_value.value == 255
 
 
 def test_region_lower_bound_multiple_variables():
@@ -198,127 +272,94 @@ def test_region_lower_bound_multiple_variables():
     i = div(x, 16)
     j = div(mod(x, 16), 4) * 8 + mod(x, 4) + div(wid, 32) * 4
     k = wid % 32
-    (i_int_set, j_int_set, k_int_set) = tvm.arith.estimate_region_lower_bound(
-        region=[
-            tvm.ir.Range.from_min_extent(min_value=i, extent=1),
-            tvm.ir.Range.from_min_extent(min_value=j, extent=1),
-            tvm.ir.Range.from_min_extent(min_value=k, extent=1),
-        ],
-        var_dom={
-            x: tvm.ir.Range(begin=0, end=32),
-            wid: tvm.ir.Range(begin=0, end=64),
-        },
-        predicate=tvm.tir.IntImm("bool", 1),
-    )
-    assert i_int_set.min_value.value == 0
-    assert i_int_set.max_value.value == 1
-    assert j_int_set.min_value.value == 0
-    assert j_int_set.max_value.value == 31
-    assert k_int_set.min_value.value == 0
-    assert k_int_set.max_value.value == 31
+    var_dom = {
+        x: tvm.ir.Range(begin=0, end=32),
+        wid: tvm.ir.Range(begin=0, end=64),
+    }
+    check_region_bound({i: (0, 2), j: (0, 32), k: (0, 32)}, var_dom, mode="lowerbound")
 
 
 def test_region_lower_bound_negative_scale():
     i = tvm.tir.Var("i", "int32")
     j = tvm.tir.Var("j", "int32")
-    int_set_0, int_set_1 = tvm.arith.estimate_region_lower_bound(
-        region=[
-            tvm.ir.Range.from_min_extent(min_value=1 - i, extent=4),
-            tvm.ir.Range.from_min_extent(min_value=20 - j * 4, extent=16),
-        ],
-        var_dom={
-            i: tvm.ir.Range(begin=0, end=4),
-            j: tvm.ir.Range(begin=0, end=4),
-        },
-        predicate=tvm.tir.IntImm("bool", 1),
+    var_dom = {
+        i: tvm.ir.Range(begin=0, end=4),
+        j: tvm.ir.Range(begin=0, end=4),
+    }
+    check_region_bound(
+        {(1 - i, 5 - i): (-2, 5), (20 - j * 4, 36 - j * 4): (8, 36)}, var_dom, mode="lowerbound"
     )
-    assert int_set_0.min_value.value == -2
-    assert int_set_0.max_value.value == 4
-    assert int_set_1.min_value.value == 8
-    assert int_set_1.max_value.value == 35
 
 
 def test_region_lower_bound_for_non_perfect_tile():
     h1 = tvm.tir.Var("h1", "int32")
     h2 = tvm.tir.Var("h2", "int32")
     h3 = tvm.tir.Var("h3", "int32")
-    analyzer = tvm.arith.Analyzer()
-
-    def do_test_point_access(point, predicates, var_dom, expect):
-        regions = tvm.arith.estimate_region_lower_bound(
-            region=[
-                tvm.ir.Range.from_min_extent(min_value=point, extent=1),
-            ],
-            var_dom=var_dom,
-            predicate=tvm.tir.all(*predicates),
-        )
-        if expect is None:  # expect a failure
-            assert regions is None
-        else:
-            assert len(regions) == 1
-            for binding, expect_min, expect_max in expect:
-                min_diff = expect_min - regions[0].min_value
-                assert analyzer.simplify(tir.stmt_functor.substitute(min_diff, binding), 3) == 0
-                max_diff = expect_max - regions[0].max_value
-                assert analyzer.simplify(tir.stmt_functor.substitute(max_diff, binding), 3) == 0
 
     # non-uniform tiling, single inner variable
-    # h3 == 0: region is [1, 9]
-    # 0 < h3 <= 26: region is [h3 * 8, h3 * 8 + 9]
-    # h3 > 26: region is [h3 * 8, 223]
-    do_test_point_access(
-        point=h3 * 8 + h2,
-        predicates=[1 <= h3 * 8 + h2, h3 * 8 + h2 < 224],
-        var_dom={
-            h2: tvm.ir.Range(begin=0, end=10),
+    var_dom = {
+        h2: tvm.ir.Range(begin=0, end=10),
+    }
+    check_region_bound(
+        {
+            h3 * 8
+            + h2: {
+                (): (
+                    tvm.tir.max(h3 * 8, 1),
+                    tvm.tir.max(h3 * 8, 1)
+                    - tvm.tir.max(h3 * 8, 214)
+                    - tvm.tir.max(1 - h3 * 8, 0)
+                    + 224,
+                ),
+                ((h3, 0),): (1, 10),  # h3 == 0: region is [1, 10)
+                ((h3, 10),): (h3 * 8, h3 * 8 + 10),  # 0 < h3 <= 26: region is [h3 * 8, h3 * 8 + 10)
+                ((h3, 27),): (h3 * 8, 224),  # h3 > 26: region is [h3 * 8, 224)
+            }
         },
-        expect=[
-            (
-                {},
-                tvm.tir.max(h3 * 8, 1),
-                tvm.tir.max(h3 * 8, 1)
-                - tvm.tir.max(h3 * 8, 214)
-                - tvm.tir.max(1 - h3 * 8, 0)
-                + 223,
-            ),
-            ({h3: 0}, 1, 9),
-            ({h3: 10}, h3 * 8, h3 * 8 + 9),
-            ({h3: 27}, h3 * 8, 223),
-        ],
+        var_dom,
+        predicate=tvm.tir.all(1 <= h3 * 8 + h2, h3 * 8 + h2 < 224),
+        mode="lowerbound",
     )
 
     # non-uniform tiling, two inner variables
-    do_test_point_access(
-        point=h3 * 8 + h2 * 5 + h1,
-        predicates=[1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h2 * 5 + h1 < 224],
-        var_dom={
-            h2: tvm.ir.Range(begin=0, end=2),
-            h1: tvm.ir.Range(begin=0, end=5),
+    var_dom = {
+        h1: tvm.ir.Range(begin=0, end=5),
+        h2: tvm.ir.Range(begin=0, end=2),
+    }
+    check_region_bound(
+        {
+            h3 * 8
+            + h2 * 5
+            + h1: {
+                (): (
+                    tvm.tir.max(h3 * 8, 1),
+                    tvm.tir.max(h3 * 8, 1)
+                    - tvm.tir.max(h3 * 8, 214)
+                    - tvm.tir.max(1 - h3 * 8, 0)
+                    + 224,
+                ),
+                ((h3, 0),): (1, 10),
+                ((h3, 10),): (h3 * 8, h3 * 8 + 10),
+                ((h3, 27),): (h3 * 8, 224),
+            }
         },
-        expect=[
-            (
-                {},
-                tvm.tir.max(h3 * 8, 1),
-                tvm.tir.max(h3 * 8, 1)
-                - tvm.tir.max(h3 * 8, 214)
-                - tvm.tir.max(1 - h3 * 8, 0)
-                + 223,
-            ),
-            ({h3: 0}, 1, 9),
-            ({h3: 10}, h3 * 8, h3 * 8 + 9),
-            ({h3: 27}, h3 * 8, 223),
-        ],
+        var_dom,
+        predicate=tvm.tir.all(1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h2 * 5 + h1 < 224),
+        mode="lowerbound",
     )
 
-    # should fail on incompatible predicates
-    do_test_point_access(
-        point=h3 * 8 + h2 * 5 + h1,
-        predicates=[1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h1 * 2 + h2 < 224],
-        var_dom={
-            h2: tvm.ir.Range(begin=0, end=2),
-            h1: tvm.ir.Range(begin=0, end=5),
-        },
-        expect=None,
+    # lowerbound should fail on incompatible predicates
+    check_region_bound(
+        {h3 * 8 + h2 * 5 + h1: None},
+        var_dom,
+        predicate=tvm.tir.all(1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h1 * 2 + h2 < 224),
+        mode="lowerbound",
+    )
+    check_region_bound(
+        {h3 * 8 + h2 * 5 + h1: (h3 * 8, h3 * 8 + 10)},
+        var_dom,
+        predicate=tvm.tir.all(1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h1 * 2 + h2 < 224),
+        mode="upperbound",
     )
 
 
@@ -328,12 +369,7 @@ def test_region_lower_bound_unfusable():
         tvm.tir.Var("j", "int32"): tvm.ir.Range(4),
     }
     i, j = var_dom
-    region = [
-        tvm.ir.Range.from_min_extent((i + j) // 2, 1),
-    ]
-    result = tvm.arith.estimate_region_lower_bound(region, var_dom, predicate=True)
-    assert result[0].min_value == 0
-    assert result[0].max_value == 5
+    check_region_bound({(i + j) // 2: (0, 6)}, var_dom, mode="lowerbound")
 
 
 def test_union_lower_bound():
@@ -347,18 +383,4 @@ def test_union_lower_bound():
 
 
 if __name__ == "__main__":
-    test_basic()
-    test_vector()
-    test_add_sub()
-    test_mul_div()
-    test_max_min()
-    test_select()
-    test_mod()
-    test_region_lower_bound_not_independent()
-    test_region_lower_bound_stride_too_wide()
-    test_region_lower_bound_small_stride()
-    test_region_lower_bound_split_predicate()
-    test_region_lower_bound_multiple_variables()
-    test_region_lower_bound_negative_scale()
-    test_region_lower_bound_for_non_perfect_tile()
-    test_union_lower_bound()
+    tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_transform_compact_buffer_region.py b/tests/python/unittest/test_tir_transform_compact_buffer_region.py
index 31bb9b8b7cdb..049de0bed4f9 100644
--- a/tests/python/unittest/test_tir_transform_compact_buffer_region.py
+++ b/tests/python/unittest/test_tir_transform_compact_buffer_region.py
@@ -909,5 +909,105 @@ def compacted_func(A: T.Buffer[(960, 770), "float32"], B: T.Buffer[(770, 2304),
     _check(func, compacted_func)
 
 
+def test_compact_dependent_buffer_indices():
+    """Check the upper bound on different indices could be independently estimated."""
+
+    @T.prim_func
+    def diagonal_access():
+        for i in range(8):
+            with T.block():
+                A = T.alloc_buffer((256, 256), "float32")
+                for j, k in T.grid(8, 8):
+                    with T.block():
+                        T.where(j * 8 + k < 60)
+                        A[i * 64 + j * 8 + k, i * 64 + j * 8 + k] = 1.0
+
+    @T.prim_func
+    def diagonal_access_compacted() -> None:
+        for i in T.serial(8):
+            with T.block():
+                A = T.alloc_buffer([60, 60], dtype="float32")
+                for j, k in T.grid(8, 8):
+                    with T.block():
+                        T.where(j * 8 + k < 60)
+                        A[j * 8 + k, j * 8 + k] = 1.0
+
+    _check(diagonal_access, diagonal_access_compacted)
+
+
+def test_compact_dependent_buffer_indices_of_packed_matmul():
+    """Check the outer dimension of the packed M-dim should be compacted to 1 wrt split condition."""
+
+    @T.prim_func
+    def nonuniform_packed_matmul_write_cache(
+        A: T.Buffer[(1020, 64), "float32"],
+        B: T.Buffer[(1000, 64), "float32"],
+        C: T.Buffer[(1020, 1000), "float32"],
+    ):
+        for i0, i1 in T.grid(4, 1):
+            with T.block():
+                C_local2 = T.alloc_buffer([4, 1, 16, 1000, 16], dtype="float32", scope="local")
+                C_local1 = T.alloc_buffer([1020, 1000], dtype="float32", scope="local")
+                for ax0, ax1, ax2 in T.grid(255, 1000, 64):
+                    with T.block("matmul"):
+                        if ax2 == 0:
+                            C_local1[i0 * 255 + ax0, ax1] = 0
+                        C_local1[i0 * 255 + ax0, ax1] = (
+                            C_local1[i0 * 255 + ax0, ax1] + A[i0 * 255 + ax0, ax2] * B[ax1, ax2]
+                        )
+                for ax0, ax1 in T.grid(255, 1000):
+                    with T.block("st1"):
+                        C_local2[
+                            (i0 * 255 + ax0) // 255,
+                            0,
+                            (i0 * 255 + ax0) % 255 // 16,
+                            ax1,
+                            (i0 * 255 + ax0) % 255 % 16,
+                        ] = C_local1[i0 * 255 + ax0, ax1]
+                for ax0, ax1, ax2 in T.grid(16, 16, 1000):
+                    with T.block("st2"):
+                        T.where(ax0 * 16 + ax1 < 255)
+                        C[i0 * 255 + (ax0 * 16 + ax1), i1 * 1000 + ax2] = C_local2[
+                            (i0 * 255 + ax0 * 16 + ax1) // 255,
+                            0,
+                            (i0 * 255 + ax0 * 16 + ax1) % 255 // 16,
+                            i1 * 1000 + ax2,
+                            (i0 * 255 + ax0 * 16 + ax1) % 255 % 16,
+                        ]
+
+    @T.prim_func
+    def nonuniform_packed_matmul_write_cache_compacted(
+        A: T.Buffer[(1020, 64), "float32"],
+        B: T.Buffer[(1000, 64), "float32"],
+        C: T.Buffer[(1020, 1000), "float32"],
+    ) -> None:
+        for i0, i1 in T.grid(4, 1):
+            with T.block():
+                C_local2 = T.alloc_buffer([1, 1, 15, 1000, 16], dtype="float32", scope="local")
+                C_local1 = T.alloc_buffer([255, 1000], dtype="float32", scope="local")
+                for ax0, ax1, ax2 in T.grid(255, 1000, 64):
+                    with T.block("matmul"):
+                        if ax2 == 0:
+                            C_local1[ax0, ax1] = 0
+                        C_local1[ax0, ax1] = (
+                            C_local1[ax0, ax1] + A[i0 * 255 + ax0, ax2] * B[ax1, ax2]
+                        )
+                for ax0, ax1 in T.grid(255, 1000):
+                    with T.block("st1"):
+                        C_local2[0, 0, ax0 // 16, ax1, ax0 % 16] = C_local1[ax0, ax1]
+                for ax0, ax1, ax2 in T.grid(16, 16, 1000):
+                    with T.block("st2"):
+                        T.where(ax0 * 16 + ax1 < 255)
+                        C[i0 * 255 + ax0 * 16 + ax1, ax2] = C_local2[
+                            (ax0 * 16 + ax1) // 255,
+                            0,
+                            (ax0 * 16 + ax1) % 255 // 16,
+                            ax2,
+                            (ax0 * 16 + ax1) % 255 % 16,
+                        ]
+
+    _check(nonuniform_packed_matmul_write_cache, nonuniform_packed_matmul_write_cache_compacted)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 592148abf6866a41eefa736efca067d42f5aea86 Mon Sep 17 00:00:00 2001
From: Christopher Sidebottom <chris.sidebottom@arm.com>
Date: Wed, 24 Aug 2022 11:24:05 +0100
Subject: [PATCH 026/704] [Target] Replace IsaAnalyzer with Target Features
 (#12322)

This is clean up to use the new `target.features` instead of `IsaAnalyzer`.
---
 python/tvm/relay/op/strategy/arm_cpu.py | 17 ++++-------
 python/tvm/target/arm_isa.py            | 39 -------------------------
 tests/micro/zephyr/test_zephyr.py       |  4 +--
 3 files changed, 6 insertions(+), 54 deletions(-)
 delete mode 100644 python/tvm/target/arm_isa.py

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 54e1c871f504..ba28b6c7c31c 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -24,7 +24,6 @@
 
 from ....auto_scheduler import is_auto_scheduler_enabled
 from ....meta_schedule import is_meta_schedule_enabled
-from ....target import arm_isa
 from ....topi.generic import conv2d as conv2d_generic
 from .. import op as _op
 from .generic import *
@@ -57,15 +56,14 @@ def schedule_concatenate_arm_cpu(_, outs, target):
 def schedule_pool_arm_cpu(attrs, outs, target):
     """schedule pooling ops arm cpu"""
     layout = attrs.layout
-    isa = arm_isa.IsaAnalyzer(target)
     avg_pool = isinstance(attrs, relay.op.op_attrs.AvgPool2DAttrs)
     with target:
         if (
             avg_pool
-            and isa.has_dsp_support
+            and target.features.has_dsp
             and layout in ("NCW", "NCHW")
             or not avg_pool
-            and isa.has_dsp_support
+            and target.features.has_dsp
             and layout in ("NWC", "NHWC")
         ):
             return topi.arm_cpu.schedule_pool(outs, layout)
@@ -87,8 +85,6 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     if dilation_h < 1 or dilation_w < 1:
         raise ValueError("dilation should be positive value")
 
-    isa = arm_isa.IsaAnalyzer(target)
-
     if groups == 1:
         if layout == "NCHW":
             if kernel_layout == "OIHW":
@@ -163,7 +159,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 name="conv2d_hwcn.generic",
             )
         elif layout == "NHWC":
-            if isa.has_dsp_support and kernel_layout == "HWOI":
+            if target.features.has_dsp and kernel_layout == "HWOI":
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_dsp),
                     wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_dsp),
@@ -473,10 +469,9 @@ def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target):
 def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
     """dense arm cpu strategy"""
     strategy = _op.OpStrategy()
-    isa = arm_isa.IsaAnalyzer(target)
     data, _ = inputs
 
-    if isa.has_dsp_support and data.dtype in ["int8", "int16"]:
+    if target.features.has_dsp and data.dtype in ["int8", "int16"]:
         strategy.add_implementation(
             wrap_compute_dense(topi.arm_cpu.dense_dsp),
             wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
@@ -506,10 +501,8 @@ def conv1d_strategy_arm_cpu(attrs, inputs, out_type, target):
     if dilation[0] < 1:
         raise ValueError("dilation should be a positive value")
 
-    isa = arm_isa.IsaAnalyzer(target)
-
     if kernel_layout == "WOI":
-        if layout == "NWC" and isa.has_dsp_support:
+        if layout == "NWC" and target.features.has_dsp:
             strategy.add_implementation(
                 wrap_compute_conv1d(topi.arm_cpu.conv1d_nwc_dsp),
                 wrap_topi_schedule(topi.arm_cpu.schedule_conv1d_nwc_dsp),
diff --git a/python/tvm/target/arm_isa.py b/python/tvm/target/arm_isa.py
deleted file mode 100644
index a5ac9b1563a5..000000000000
--- a/python/tvm/target/arm_isa.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Defines functions to analyze available opcodes in the ARM ISA."""
-
-import tvm.target
-
-
-ARM_MPROFILE_DSP_SUPPORT_LIST = [
-    "cortex-m7",
-    "cortex-m4",
-    "cortex-m33",
-    "cortex-m35p",
-    "cortex-m55",
-]
-
-
-class IsaAnalyzer(object):
-    """Checks ISA support for given target"""
-
-    def __init__(self, target):
-        self.target = tvm.target.Target(target)
-
-    @property
-    def has_dsp_support(self):
-        return self.target.mcpu is not None and self.target.mcpu in ARM_MPROFILE_DSP_SUPPORT_LIST
diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py
index 9c0c3fefb488..8d9a73704d8e 100644
--- a/tests/micro/zephyr/test_zephyr.py
+++ b/tests/micro/zephyr/test_zephyr.py
@@ -32,7 +32,6 @@
 from tvm.relay.testing import byoc
 from tvm.contrib import utils
 from tvm.micro.testing.utils import check_tune_log
-from tvm.target import arm_isa
 
 import test_utils
 
@@ -549,8 +548,7 @@ def test_schedule_build_with_cmsis_dependency(
     build_config = {"debug": microtvm_debug}
     target = tvm.target.target.micro(model, options=["-keys=arm_cpu,cpu"])
 
-    isa = arm_isa.IsaAnalyzer(target)
-    if not isa.has_dsp_support:
+    if not target.features.has_dsp:
         pytest.skip(f"ISA does not support DSP. target: {target}")
 
     # Create a Relay conv2d

From 6e79f64108b26b504089354cfce5e182001a70d1 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Wed, 24 Aug 2022 13:44:55 +0100
Subject: [PATCH 027/704] [CI] Set test python.contrib.test_onnx.test_resize as
 xfail (#12568)

`python.contrib.test_onnx.test_resize` is failing due to a numerical
accuracy issue, reported in #12567. This patch marks that test as
an xfail, so that other tests can be enabled, while this one is
investigated separately.
---
 tests/python/contrib/test_onnx.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python/contrib/test_onnx.py b/tests/python/contrib/test_onnx.py
index 214166cebb9d..afebc2295a68 100644
--- a/tests/python/contrib/test_onnx.py
+++ b/tests/python/contrib/test_onnx.py
@@ -655,6 +655,7 @@ def verify_cast(dshape, dtype):
             verify_cast(i, o_dtype)
 
 
+@pytest.mark.xfail(reason="Known failing test. See issue #12567.")
 def test_resize():
     """Resize unit test."""
 

From a0fe74b3c3608929b21faeaea422ac09aa2f75eb Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 24 Aug 2022 13:45:57 +0100
Subject: [PATCH 028/704] [ETHOSN] Support multiply conversion to depthwise
 (#12403)

Multiply can be supported when offloaded to the NPU by a conversion to a depthwise convolution operation. This is only supported when the multiply operation has a single single variable input with the other being a constant of shape [1, ..., C]. This commit adds a new pass "ConvertEquivalents" (name subject to change) to handle this conversion before codegen.
---
 python/tvm/relay/op/contrib/_ethosn.py        |   1 +
 python/tvm/relay/op/contrib/ethosn.py         |  80 ++++++--
 .../contrib/ethosn/convert_equivalent.cc      | 144 +++++++++++++
 src/relay/op/make_op.h                        |   2 +
 src/relay/qnn/utils.h                         |   4 +
 src/relay/transforms/pattern_utils.h          |  34 +++
 .../test_ethosn/test_convert_equivalents.py   | 142 +++++++++++++
 .../contrib/test_ethosn/test_multiply.py      | 193 ++++++++++++++++++
 8 files changed, 582 insertions(+), 18 deletions(-)
 create mode 100644 src/relay/backend/contrib/ethosn/convert_equivalent.cc
 create mode 100644 tests/python/contrib/test_ethosn/test_convert_equivalents.py
 create mode 100644 tests/python/contrib/test_ethosn/test_multiply.py

diff --git a/python/tvm/relay/op/contrib/_ethosn.py b/python/tvm/relay/op/contrib/_ethosn.py
index ea2915675ec6..9c7c922fdfb0 100644
--- a/python/tvm/relay/op/contrib/_ethosn.py
+++ b/python/tvm/relay/op/contrib/_ethosn.py
@@ -20,3 +20,4 @@
 import tvm._ffi
 
 tvm._ffi._init_api("relay.ethos-n.support", __name__)
+tvm._ffi._init_api("relay.backend.contrib.ethos-n", __name__)
diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index 469939ecf0b8..73dd6b735775 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -25,7 +25,7 @@
 from tvm.relay.build_module import bind_params_by_name
 
 from ...dataflow_pattern import is_constant, is_op, wildcard
-from . import _ethosn as support
+from . import _ethosn
 from .register import register_pattern_table
 
 
@@ -60,6 +60,18 @@ def ethosn_api_version() -> str:
     return tvm.get_global_func("relay.ethos-n.api.version")()
 
 
+def ConvertEquivalents() -> tvm.ir.IRModule:  # pylint: disable=invalid-name
+    """Converts operations into a numerically equivalent form
+    that can be understood by the NPU codegen.
+
+    Return
+    ------
+    Pass
+        The module pass.
+    """
+    return _ethosn.ConvertEquivalents()
+
+
 def partition_for_ethosn(mod, params=None, **opts):
     """Partition the graph greedily offloading supported
     operators to Arm Ethos-N NPU.
@@ -107,9 +119,9 @@ def partition_for_ethosn(mod, params=None, **opts):
             transform.AnnotateTarget("ethos-n"),
             transform.MergeCompilerRegions(),
             transform.PartitionGraph(),
+            ConvertEquivalents(),
         ]
     )
-
     return seq(mod)
 
 
@@ -183,70 +195,102 @@ def qnn_resize_pattern():
         )
         return pattern
 
+    def qnn_mul_pattern():
+        """
+        Multiply is supported when one input is a constant of shape [1, ..., C],
+        where C matches the number of channels of the other input.
+        """
+        mul_op = is_op("qnn.mul")
+        gen_mul_inputs = lambda x, y: mul_op(
+            x,
+            y,
+            is_constant(),
+            is_constant(),
+            is_constant(),
+            is_constant(),
+            is_constant(),
+            is_constant(),
+        )
+        input_is_left = gen_mul_inputs(wildcard(), is_constant())
+        input_is_right = gen_mul_inputs(is_constant(), wildcard())
+        return input_is_left | input_is_right
+
     def check_conv2d(extract):
         """Check if a conv2d is supported by Ethos-N."""
         if not ethosn_available():
             return False
 
-        return support.conv2d(extract)
+        return _ethosn.conv2d(extract)
 
     def check_fc(extract):
         """Check if a fully connected is supported by Ethos-N."""
         if not ethosn_available():
             return False
 
-        return support.fc(extract)
+        return _ethosn.fc(extract)
 
     def check_avg_pool2d(extract):
         """Check if a avg pool2d is supported by Ethos-N."""
         if not ethosn_available():
             return False
 
-        return support.avg_pool2d(extract)
+        return _ethosn.avg_pool2d(extract)
 
     def check_mean(extract):
         """Check if mean is supported by Ethos-N."""
         if not ethosn_available():
             return False
 
-        return support.mean(extract)
+        return _ethosn.mean(extract)
 
     def check_sigmoid(extract):
         """Check if a sigmoid is supported by Ethos-N."""
         if not ethosn_available():
             return False
 
-        return support.sigmoid(extract)
+        return _ethosn.sigmoid(extract)
 
     def check_tanh(extract):
         """Check if tanh is supported by Ethos-N."""
         if not ethosn_available():
             return False
 
-        return support.tanh(extract)
+        return _ethosn.tanh(extract)
 
     def check_leaky_relu(extract):
         """Check if Leaky ReLU is supported."""
         if not ethosn_available():
             return False
 
-        return support.leaky_relu(extract)
+        return _ethosn.leaky_relu(extract)
+
+    def check_mul(extract):
+        """Check if Mul is supported."""
+        if not ethosn_available():
+            return False
+        # Do not support scalar constants for now
+        check_scalar = lambda i: isinstance(i, tvm.relay.Constant) and len(i.data.shape) == 0
+        if check_scalar(extract.args[0]) or check_scalar(extract.args[1]):
+            return False
+        extract = _ethosn.ConvertQnnMultiply(extract)
+        return _ethosn.conv2d(extract)
 
     def check_requantize(extract):
         """Check if requantize is supported."""
         if not ethosn_available():
             return False
 
-        return support.requantize(extract)
+        return _ethosn.requantize(extract)
 
     def check_resize(extract):
         """Check if resize (nearest neighbor) is supported."""
         if not ethosn_available():
             return False
 
-        return support.resize(extract)
+        return _ethosn.resize(extract)
 
     return [
+        ("ethos-n.qnn_mul", qnn_mul_pattern(), check_mul),
         ("ethos-n.qnn_conv2d", qnn_conv_pattern(), check_conv2d),
         ("ethos-n.qnn_avg_pool2d", qnn_avg_pool2d_pattern(), check_avg_pool2d),
         ("ethos-n.qnn_sigmoid", qnn_sigmoid_pattern(), check_sigmoid),
@@ -274,7 +318,7 @@ def max_pool2d(expr):
     if not ethosn_available():
         return False
 
-    return support.max_pool2d(expr)
+    return _ethosn.max_pool2d(expr)
 
 
 @tvm.ir.register_op_attr("reshape", "target.ethos-n")
@@ -285,7 +329,7 @@ def reshape(expr):
     if not _is_ethosn_composite(expr.args[0]):
         return False
 
-    return support.reshape(expr)
+    return _ethosn.reshape(expr)
 
 
 @tvm.ir.register_op_attr("qnn.add", "target.ethos-n")
@@ -294,7 +338,7 @@ def qnn_add(expr):
     if not ethosn_available():
         return False
 
-    return support.addition(expr)
+    return _ethosn.addition(expr)
 
 
 @tvm.ir.register_op_attr("qnn.concatenate", "target.ethos-n")
@@ -302,7 +346,7 @@ def qnn_concatenate(expr):
     """Check if a concatenate is supported by Ethos-N."""
     if not ethosn_available():
         return False
-    if not support.concatenate(expr):
+    if not _ethosn.concatenate(expr):
         return False
 
     # Support library has some unenforced restrictions on qnn params
@@ -332,7 +376,7 @@ def split(expr):
         return False
     if ethosn_api_version() >= LooseVersion("3.0.1"):
         return False
-    if not support.split(expr):
+    if not _ethosn.split(expr):
         return False
 
     return True
@@ -343,7 +387,7 @@ def depth_to_space(expr):
     """Check if a depth_to_space is supported by Ethos-N."""
     if not ethosn_available():
         return False
-    if not support.depth_to_space(expr):
+    if not _ethosn.depth_to_space(expr):
         return False
 
     return True
@@ -354,7 +398,7 @@ def clip(expr):
     """Check if a clip is supported by Ethos-N."""
     if not ethosn_available():
         return False
-    if not support.relu(expr):
+    if not _ethosn.relu(expr):
         return False
 
     return True
diff --git a/src/relay/backend/contrib/ethosn/convert_equivalent.cc b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
new file mode 100644
index 000000000000..6b64467047f4
--- /dev/null
+++ b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/contrib/ethosn/convert_equivalent.cc
+ * \brief Converts operations into a numerically equivalent form
+ * that can be understood by the NPU codegen.
+ */
+
+#include <tvm/relay/dataflow_matcher.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+
+#include <unordered_map>
+
+#include "../../../qnn/utils.h"
+#include "../../../transforms/pattern_utils.h"
+#include "../../../transforms/simplify_expr.h"
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+namespace ethosn {
+
+/*!
+ * \brief Converts qnn.mul to mathematically equivalent
+ * qnn.conv2d depthwise operation.
+ */
+Expr ConvertQnnMultiply(const Expr& expr) {
+  Call call = Downcast<Call>(expr);
+
+  Expr input1 = call->args[0];
+  Expr input2 = call->args[1];
+  Expr input1_scale = call->args[2];
+  Expr input1_zero_point = call->args[3];
+  Expr input2_scale = call->args[4];
+  Expr input2_zero_point = call->args[5];
+  // Reverse the inputs if the constant is first input
+  if (call->args[0]->IsInstance<ConstantNode>()) {
+    input1 = call->args[1];
+    input2 = call->args[0];
+    input1_scale = call->args[4];
+    input1_zero_point = call->args[5];
+    input2_scale = call->args[2];
+    input2_zero_point = call->args[3];
+  }
+  Expr output_scale = call->args[6];
+  Expr output_zero_point = call->args[7];
+
+  const auto* input_constant = input2.as<ConstantNode>();
+  ICHECK(input_constant) << "Expected ConstantNode but got " << input2->GetTypeKey();
+  const auto* input_constant_tt = input_constant->checked_type().as<TensorTypeNode>();
+  int channels = input_constant_tt->shape.back().as<IntImmNode>()->value;
+
+  runtime::NDArray input_data = input_constant->data;
+  runtime::NDArray kernel_data_hwoi =
+      runtime::NDArray::Empty({1, 1, channels, 1}, input_data->dtype, input_data->device);
+  kernel_data_hwoi.CopyFrom(input_data);
+  Constant kernel = Constant(kernel_data_hwoi, input_constant->span);
+
+  Type output_type = expr->checked_type();
+  auto output_tt = output_type.as<TensorTypeNode>();
+  ICHECK(output_tt) << "Expected TensorTypeNode but got " << output_type->GetTypeKey();
+  DataType output_dtype = output_tt->dtype;
+
+  Expr conv2d = qnn::MakeQnnConv2D(
+      input1, kernel, input1_zero_point, input2_zero_point, input1_scale, input2_scale, {1, 1},
+      {0, 0, 0, 0}, {1, 1}, channels, channels, {1, 1}, "NHWC", "HWOI", "NHWC", DataType::Int(32));
+  Constant bias_data = MakeConstantZeros(DataType::Int(32), {channels});
+  Expr bias_add = MakeBiasAdd(conv2d, bias_data, 3);
+  Expr requantize = qnn::MakeRequantize(bias_add, input1_scale, input1_zero_point, output_scale,
+                                        output_zero_point, -1, "None", "None", output_dtype);
+
+  return InferType(requantize);
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnMultiply")
+    .set_body_typed(ConvertQnnMultiply);
+
+class ConvertEquivalentsMutator : public MixedModeMutator {
+ public:
+  Expr Rewrite_(const CallNode* pre, const Expr& post) override {
+    Call call = Downcast<Call>(post);
+    if (!call->op->IsInstance<FunctionNode>()) {
+      return post;
+    }
+
+    Function func = Downcast<Function>(call->op);
+    Function new_func = Function(func);
+    auto composite_name = func->GetAttr<String>(attr::kComposite);
+    if (composite_name == "ethos-n.qnn_mul") {
+      Expr new_func_body = ConvertQnnMultiply(func->body);
+      new_func = WithFields(func, func->params, new_func_body);
+      new_func = WithAttr(std::move(new_func), attr::kComposite, String("ethos-n.qnn_conv2d"));
+    }
+
+    Call new_call = WithFields(call, new_func);
+    return Downcast<Expr>(new_call);
+  }
+};
+
+tvm::transform::Pass ConvertEquivalents() {
+  runtime::TypedPackedFunc<IRModule(IRModule, transform::PassContext)> pass_func =
+      [=](IRModule mod, transform::PassContext ctx) {
+        for (auto gv : mod->GetGlobalVars()) {
+          Function func = Downcast<Function>(mod->Lookup(gv));
+          auto compiler_name = func->GetAttr<String>(attr::kCompiler);
+          if (compiler_name.defined() && compiler_name == "ethos-n") {
+            auto new_body = ConvertEquivalentsMutator().VisitExpr(func->body);
+            if (!new_body.same_as(func->body)) {
+              Function new_func = WithFields(func, func->params, new_body);
+              mod->Update(gv, new_func);
+            }
+          }
+        }
+        return mod;
+      };
+  return tvm::transform::CreateModulePass(
+      pass_func, 0, "relay.backend.contrib.ethos-n.ConvertEquivalents", {"InferType"});
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertEquivalents")
+    .set_body_typed(ConvertEquivalents);
+
+}  // namespace ethosn
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h
index c850bf8958c9..85938a739182 100644
--- a/src/relay/op/make_op.h
+++ b/src/relay/op/make_op.h
@@ -117,6 +117,8 @@ Expr MakeShapeOf(Expr data, DataType dtype);
 
 Expr MakeTake(Expr data, Expr indices, Integer batch_dims, Integer axis, String mode);
 
+Expr MakeBiasAdd(Expr data, Expr bias, int axis);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_OP_MAKE_OP_H_
diff --git a/src/relay/qnn/utils.h b/src/relay/qnn/utils.h
index 18c592f2ed69..d084e4871e95 100644
--- a/src/relay/qnn/utils.h
+++ b/src/relay/qnn/utils.h
@@ -121,6 +121,10 @@ static inline Expr Requantize(const Expr& data, const Array<IndexExpr>& input_sh
                          attrs.operator->(), input_shape, attrs->out_dtype);
 }
 
+Expr MakeRequantize(Expr data, Expr input_scale, Expr input_zero_point, Expr output_scale,
+                    Expr output_zero_point, int axis, String rounding, String compute_dtype,
+                    DataType out_dtype);
+
 Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale,
                      const Expr& input_zero_point, const Array<tvm::relay::Type>& types,
                      const DequantizeAttrs* attrs);
diff --git a/src/relay/transforms/pattern_utils.h b/src/relay/transforms/pattern_utils.h
index d05d39b733d3..ffe1cc2ca2ab 100644
--- a/src/relay/transforms/pattern_utils.h
+++ b/src/relay/transforms/pattern_utils.h
@@ -344,6 +344,40 @@ static inline Constant MakeConstantTensor(DataType dtype, std::vector<int64_t> s
   return Constant(arr);
 }
 
+/*!
+ * \brief Create a Constant tensor of zeros.
+ *
+ * \param dtype The data type.
+ * \param shape The shape of the output constant tensor.
+ * \return A Constant.
+ */
+static inline Constant MakeConstantZeros(DataType dtype, std::vector<int64_t> shape) {
+  runtime::NDArray arr = runtime::NDArray::Empty(shape, dtype, {kDLCPU, 0});
+  int64_t data_size = 1;
+  for (int64_t dim : shape) {
+    data_size *= dim;
+  }
+  TVM_DTYPE_DISPATCH(dtype, DType, {
+    for (int64_t i = 0; i < data_size; i++) {
+      if (dtype == DataType::Float(16)) {
+        // convert to float16
+        // storage is uint16_t
+        // Similar handling as that in MakeConstantScalar
+        *(static_cast<DType*>(arr->data) + i) =
+            __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(static_cast<float>(0));
+      } else if (dtype == DataType::BFloat(16)) {
+        // convert to bfloat16
+        // storage is uint16_t
+        *(static_cast<DType*>(arr->data) + i) =
+            __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 7>(static_cast<float>(0));
+      } else {
+        *(static_cast<DType*>(arr->data) + i) = 0;
+      }
+    }
+  })
+  return Constant(arr);
+}
+
 /*!
  * \brief Check whether a shape is static and create corresponding Constant.
  Eventually this will be removed and replaced with CheckConstantShapeArrayInteger
diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
new file mode 100644
index 000000000000..570009422067
--- /dev/null
+++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
@@ -0,0 +1,142 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Unit tests for the convert equivalents pass."""
+
+import pytest
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.testing import requires_ethosn
+from tvm.relay.op.contrib.ethosn import ConvertEquivalents
+
+from . import infrastructure as tei
+
+
+def _assert_structural_equal(a, b):
+    """Check structural equality of two Relay expressions."""
+    reason = (
+        "Actual and expected relay functions are not equal. "
+        "ConvertEquivalents is not correctly transforming the input "
+        "graph."
+    )
+    assert tvm.ir.structural_equal(a, b), reason
+
+
+def _create_npu_module(inputs, expr, composite_name, ext_func_name):
+    """Wraps an operator as an NPU module."""
+    gen_vars = lambda prefix, vars: [
+        relay.var(
+            prefix + var.name_hint, shape=var.type_annotation.shape, dtype=var.type_annotation.dtype
+        )
+        for var in vars
+    ]
+
+    mod = tvm.ir.IRModule()
+
+    func = relay.Function(relay.analysis.free_vars(expr), expr)
+    func = func.with_attr("Composite", composite_name)
+    inner_vars = gen_vars("inner_", inputs)
+    call = relay.Call(func, inner_vars)
+
+    func2 = relay.Function(relay.analysis.free_vars(call), call)
+    func2 = func2.with_attr("Compiler", "ethos-n")
+    func2 = func2.with_attr("global_symbol", ext_func_name)
+    mod[ext_func_name] = func2
+    mod = relay.transform.InferType()(mod)
+
+    outer_vars = gen_vars("outer_", inputs)
+    out = relay.Call(mod.get_global_var(ext_func_name), outer_vars)
+    mod["main"] = relay.Function(relay.analysis.free_vars(out), out)
+    mod = relay.transform.InferType()(mod)
+    return mod
+
+
+@requires_ethosn
+@pytest.mark.parametrize("dtype", ["uint8", "int8"])
+@pytest.mark.parametrize("shape,channels", [((1, 4, 4, 8), 8), ((1, 16, 12, 4), 4)])
+@pytest.mark.parametrize("reverse_inputs", [True, False])
+def test_multiply_to_depthwise(dtype, shape, channels, reverse_inputs):
+    """Check that multiply is correctly converted to a depthwise operation."""
+    np.random.seed(0)
+
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    input_zp = np.random.randint(data_min, data_max)
+    input_sc = np.random.random() * 2
+    input2_zp = np.random.randint(data_min, data_max)
+    input2_sc = np.random.random() * 2
+    output_zp, output_sc = tei.get_conv2d_qnn_params(
+        dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[3]
+    )
+    x = relay.var("x", shape=shape, dtype=dtype)
+    constant_shape = (1, 1, 1, channels)
+    y_data = np.random.randint(data_min, data_max + 1, size=constant_shape, dtype=dtype)
+
+    def before():
+        y = relay.const(y_data, dtype=dtype)
+        expr = relay.qnn.op.mul(
+            y if reverse_inputs else x,
+            x if reverse_inputs else y,
+            relay.const(input_sc, "float32"),
+            relay.const(input_zp, "int32"),
+            relay.const(input2_sc, "float32"),
+            relay.const(input2_zp, "int32"),
+            relay.const(output_sc, "float32"),
+            relay.const(output_zp, "int32"),
+        )
+        return _create_npu_module([x], expr, "ethos-n.qnn_mul", "ext_func")
+
+    def expected():
+        constant_shape_hwoi = (1, 1, channels, 1)
+        y_data_hwoi = y_data.reshape(constant_shape_hwoi)
+        y_hwoi = relay.const(y_data_hwoi, dtype=dtype)
+        expr = relay.qnn.op.conv2d(
+            x,
+            y_hwoi,
+            relay.const(input2_zp if reverse_inputs else input_zp, "int32"),
+            relay.const(input_zp if reverse_inputs else input2_zp, "int32"),
+            relay.const(input2_sc if reverse_inputs else input_sc, "float32"),
+            relay.const(input_sc if reverse_inputs else input2_sc, "float32"),
+            (1, 1),
+            channels,
+            (1, 1),
+            (0, 0),
+            (1, 1),
+            channels,
+            "NHWC",
+            "HWOI",
+            "NHWC",
+            "int32",
+        )
+        expr = relay.nn.bias_add(expr, relay.const(np.zeros((channels,), dtype="int32")), axis=3)
+        expr = relay.qnn.op.requantize(
+            expr,
+            relay.const(input2_sc if reverse_inputs else input_sc, "float32"),
+            relay.const(input2_zp if reverse_inputs else input_zp, "int32"),
+            relay.const(output_sc, "float32"),
+            relay.const(output_zp, "int32"),
+            out_dtype=dtype,
+        )
+        return _create_npu_module([x], expr, "ethos-n.qnn_conv2d", "ext_func")
+
+    mod = before()
+    mod = ConvertEquivalents()(mod)
+    expected_mod = expected()
+    _assert_structural_equal(mod["ext_func"], expected_mod["ext_func"])
diff --git a/tests/python/contrib/test_ethosn/test_multiply.py b/tests/python/contrib/test_ethosn/test_multiply.py
new file mode 100644
index 000000000000..38d8516b6721
--- /dev/null
+++ b/tests/python/contrib/test_ethosn/test_multiply.py
@@ -0,0 +1,193 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Integration tests for Multiply."""
+
+import pytest
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.testing import requires_ethosn
+
+from . import infrastructure as tei
+
+
+def _get_model(
+    shape,
+    constant_shape,
+    input_zp,
+    input_sc,
+    input2_zp,
+    input2_sc,
+    output_zp,
+    output_sc,
+    dtype,
+    reverse_inputs=False,
+):
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+
+    x = relay.var("x", shape=shape, dtype=dtype)
+    y_data = np.random.randint(data_min, data_max + 1, size=constant_shape, dtype=dtype)
+    y = relay.const(y_data, dtype=dtype)
+
+    out = relay.qnn.op.mul(
+        y if reverse_inputs else x,
+        x if reverse_inputs else y,
+        relay.const(input_sc, "float32"),
+        relay.const(input_zp, "int32"),
+        relay.const(input2_sc, "float32"),
+        relay.const(input2_zp, "int32"),
+        relay.const(output_sc, "float32"),
+        relay.const(output_zp, "int32"),
+    )
+    params = {"y": y_data}
+    return out, params
+
+
+@requires_ethosn
+@pytest.mark.parametrize("dtype", ["uint8", "int8"])
+@pytest.mark.parametrize(
+    "shape,constant_shape", [((1, 4, 4, 8), (1, 1, 1, 8)), ((1, 16, 12, 4), (4,))]
+)
+@pytest.mark.parametrize("reverse_inputs", [False, True])
+def test_multiply(dtype, shape, constant_shape, reverse_inputs):
+    """Compare Multiply output with TVM."""
+    np.random.seed(0)
+
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    input_zp = np.random.randint(data_min, data_max)
+    input_sc = np.random.random() * 2
+    input2_zp = np.random.randint(data_min, data_max)
+    input2_sc = np.random.random() * 2
+    output_zp, output_sc = tei.get_conv2d_qnn_params(
+        dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[3]
+    )
+
+    model, params = _get_model(
+        shape,
+        constant_shape,
+        input_zp,
+        input_sc,
+        input2_zp,
+        input2_sc,
+        output_zp,
+        output_sc,
+        dtype,
+        reverse_inputs,
+    )
+    inputs = {"x": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=shape, dtype=dtype))}
+    outputs = []
+    for npu in [False, True]:
+        mod = tei.make_module(model, params)
+        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+
+    tei.verify(outputs, dtype, 1)
+
+
+@requires_ethosn
+def test_multiply_multiple_inputs_unsupported():
+    """Check multiply operator with two inputs is not offloaded."""
+    np.random.seed(0)
+
+    shape = (1, 4, 5, 6)
+    dtype = "int8"
+
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    input_zp = np.random.randint(data_min, data_max)
+    input_sc = np.random.random() * 2
+    input2_zp = np.random.randint(data_min, data_max)
+    input2_sc = np.random.random() * 2
+    output_zp, output_sc = tei.get_conv2d_qnn_params(
+        dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[3]
+    )
+
+    x = relay.var("x", shape=shape, dtype=dtype)
+    y = relay.var("y", shape=shape, dtype=dtype)
+    model = relay.qnn.op.mul(
+        x,
+        y,
+        relay.const(input_sc, "float32"),
+        relay.const(input_zp, "int32"),
+        relay.const(input2_sc, "float32"),
+        relay.const(input2_zp, "int32"),
+        relay.const(output_sc, "float32"),
+        relay.const(output_zp, "int32"),
+    )
+
+    expected_host_ops = 1
+    npu_partitions = 0
+    for npu in [False, True]:
+        mod = tei.make_module(model, {})
+        tei.build(
+            mod,
+            {},
+            npu=npu,
+            expected_host_ops=expected_host_ops,
+            npu_partitions=npu_partitions,
+        )
+
+
+@requires_ethosn
+def test_multiply_unsupported_datatype():
+    """Check multiply operator with unsupported datatype is not offloaded."""
+    np.random.seed(0)
+
+    shape = (1, 4, 5, 6)
+    dtype = "int16"
+
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    input_zp = np.random.randint(data_min, data_max)
+    input_sc = np.random.random() * 2
+    input2_zp = np.random.randint(data_min, data_max)
+    input2_sc = np.random.random() * 2
+    output_zp, output_sc = tei.get_conv2d_qnn_params(
+        dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[3]
+    )
+
+    x = relay.var("x", shape=shape, dtype=dtype)
+    y = relay.var("y", shape=shape, dtype=dtype)
+    model = relay.qnn.op.mul(
+        x,
+        y,
+        relay.const(input_sc, "float32"),
+        relay.const(input_zp, "int32"),
+        relay.const(input2_sc, "float32"),
+        relay.const(input2_zp, "int32"),
+        relay.const(output_sc, "float32"),
+        relay.const(output_zp, "int32"),
+    )
+
+    expected_host_ops = 1
+    npu_partitions = 0
+    for npu in [False, True]:
+        mod = tei.make_module(model, {})
+        tei.build(
+            mod,
+            {},
+            npu=npu,
+            expected_host_ops=expected_host_ops,
+            npu_partitions=npu_partitions,
+        )

From 038523e5a21e13ff2802913ec32b73fb47413b35 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Wed, 24 Aug 2022 07:13:28 -0700
Subject: [PATCH 029/704] [TIR] Expose Vector-related API in Python (#12571)

This PR exposes the following TIR operation in python:

- `vectorlow`: tested [here](https://github.com/apache/tvm/blob/592148abf6866a41eefa736efca067d42f5aea86/python/tvm/tir/tensor_intrin/arm_cpu.py#L62)
- `vectorhigh`: tested [here](https://github.com/apache/tvm/blob/592148abf6866a41eefa736efca067d42f5aea86/python/tvm/tir/tensor_intrin/arm_cpu.py#L79)
- `vectorcombine`: add new unittest

Co-Authored-By: yongwww <yongcale@gmail.com>
---
 python/tvm/tir/__init__.py                 |  1 +
 python/tvm/tir/op.py                       | 57 ++++++++++++++++++++++
 tests/python/unittest/test_tir_op_types.py | 24 +++++++++
 3 files changed, 82 insertions(+)

diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 7ea8c02bed85..f61e05cc92e9 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -52,6 +52,7 @@
 from .op import tvm_tuple, tvm_struct_get, tvm_struct_set
 from .op import address_of, lookup_param, assume, undef
 from .op import tvm_thread_allreduce, type_annotation, tvm_access_ptr, tvm_throw_last_error
+from .op import vectorlow, vectorhigh, vectorcombine
 from .op import infinity, reinterpret
 from .op import exp, exp2, exp10, log, log2, log10, log1p, ldexp, clz
 from .op import sin, sinh, asin, asinh
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index 7ab1f3aaae23..c4618042b2dc 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -595,6 +595,63 @@ def tvm_throw_last_error():
     return call_intrin("handle", "tir.tvm_throw_last_error")
 
 
+def vectorlow(dtype, vec):
+    """Get the low level half of the vector
+
+    Parameters
+    ----------
+    dtype : str
+       The data type of the result.
+
+    vec : list
+       The input vector.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(dtype, "tir.vectorlow", vec)
+
+
+def vectorhigh(dtype, vec):
+    """Get the high level half of the vector
+
+    Parameters
+    ----------
+    dtype : str
+       The data type of the result.
+
+    vec : list
+       The input vector.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(dtype, "tir.vectorhigh", vec)
+
+
+def vectorcombine(dtype, vec1, vec2):
+    """Concat two vectors
+
+    Parameters
+    ----------
+    vec1 : list
+       The input vector.
+
+    vec2 : list
+       The input vector.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(dtype, "tir.vectorcombine", vec1, vec2)
+
+
 def ret(val):
     """Create a tir return expression
 
diff --git a/tests/python/unittest/test_tir_op_types.py b/tests/python/unittest/test_tir_op_types.py
index ffee3b3b57c9..3f0ec37adb85 100644
--- a/tests/python/unittest/test_tir_op_types.py
+++ b/tests/python/unittest/test_tir_op_types.py
@@ -104,6 +104,27 @@ def test_tir_op_tvm_throw_last_error():
     assert expr.op.name == "tir.tvm_throw_last_error"
 
 
+def test_tir_op_vectorlow():
+    buffer = tir.decl_buffer((4, 4), "int8", offset_factor=1)
+    vec = buffer.vload([0, 0], dtype="int8x16")
+    expr = tir.vectorlow("int8x8", vec)
+    assert expr.op.name == "tir.vectorlow"
+
+
+def test_tir_op_vectorhigh():
+    buffer = tir.decl_buffer((4, 4), "int8", offset_factor=1)
+    vec = buffer.vload([0, 0], dtype="int8x16")
+    expr = tir.vectorhigh("int8x8", vec)
+    assert expr.op.name == "tir.vectorhigh"
+
+
+def test_tir_op_vectorcombine():
+    buffer = tir.decl_buffer((4, 4), "int8", offset_factor=1)
+    vec = buffer.vload([0, 0], dtype="int8x16")
+    expr = tir.vectorcombine("int8x8", vec, vec)
+    assert expr.op.name == "tir.vectorcombine"
+
+
 def test_tir_op_TVMBackendAllocWorkspace():
     expr = tir.TVMBackendAllocWorkspace(0, 1, 2, 3, 4)
     assert expr.op.name == "tir.TVMBackendAllocWorkspace"
@@ -130,5 +151,8 @@ def test_tir_op_TVMBackendFreeWorkspace():
     test_tir_op_type_annotation()
     test_tir_op_tvm_access_ptr()
     test_tir_op_tvm_throw_last_error()
+    test_tir_op_vectorlow()
+    test_tir_op_vectorhigh()
+    test_tir_op_vectorcombine()
     test_tir_op_TVMBackendAllocWorkspace()
     test_tir_op_TVMBackendFreeWorkspace()

From bf65b396c15b3cbec18fb1aecfa6862f58a2f307 Mon Sep 17 00:00:00 2001
From: Farshid Salemi Parizi <fparizi@octoml.ai>
Date: Wed, 24 Aug 2022 08:29:30 -0700
Subject: [PATCH 030/704] [Hexagon] Add support to run on multiple devices
 (#12504)

* working in parralel using worker

* creating launchers per test and clean up

* clean up

* ci change to distrube tests

* ci work with any number of devices

* fix running on simulator

* adding function docstring

* fix android_serial_number to always return a list of string

* lint issue

* fix internal error when skipping tests while androideserial number is not set

* lint issue
---
 python/tvm/contrib/hexagon/pytest_plugin.py | 60 +++++++++++++++------
 tests/scripts/setup-pytest-env.sh           |  2 +-
 tests/scripts/task_python_hexagon.sh        | 14 ++++-
 3 files changed, 59 insertions(+), 17 deletions(-)

diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index f735c81ee0aa..65475d67f555 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -56,13 +56,16 @@ def _compose(args, decs):
 requires_hexagon_toolchain = tvm.testing.requires_hexagon(support_required="compile-only")
 
 
-@pytest.fixture(scope="session")
 def android_serial_number() -> Optional[str]:
+    """Return the android serial number"""
     serial = os.getenv(ANDROID_SERIAL_NUMBER, default="")
     # Setting ANDROID_SERIAL_NUMBER to an empty string should be
     # equivalent to having it unset.
     if not serial.strip():
-        serial = None
+        return None
+
+    # Split android serial numbers into a list
+    serial = serial.split(",")
     return serial
 
 
@@ -155,12 +158,16 @@ def adb_server_socket() -> str:
 
 @pytest.fixture(scope="session")
 def hexagon_server_process(
-    request, android_serial_number, rpc_server_port_for_session, adb_server_socket, skip_rpc
+    request, rpc_server_port_for_session, adb_server_socket, skip_rpc
 ) -> HexagonLauncherRPC:
     """Initials and returns hexagon launcher if ANDROID_SERIAL_NUMBER is defined.
     This launcher is started only once per test session.
     """
-    if android_serial_number is None or android_serial_number == "simulator":
+    android_serial_num = android_serial_number()
+
+    if android_serial_num is None:
+        pytest.skip("ANDROID_SERIAL_NUMBER is not set.")
+    if android_serial_num == ["simulator"]:
         yield None
     else:
         # Requesting these fixtures sets up a local tracker, if one
@@ -175,16 +182,37 @@ def hexagon_server_process(
             "rpc_server_port": rpc_server_port_for_session,
             "adb_server_socket": adb_server_socket,
         }
-        launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
+        workerinput = getattr(request.config, "workerinput", None)
+        if workerinput is None:  # single-process execution
+            device_adr = read_device_list()[0]
+        else:  # running in a subprocess here
+            device_adr = workerinput["device_adr"]
+        launcher = HexagonLauncher(serial_number=device_adr, rpc_info=rpc_info)
         try:
             if not skip_rpc:
                 launcher.start_server()
-            yield launcher
+            yield {"launcher": launcher, "device_adr": device_adr}
         finally:
             if not skip_rpc:
                 launcher.stop_server()
 
 
+def read_device_list():
+    return android_serial_number()
+
+
+def pytest_configure(config):
+    # read device list if we are on the master
+    if not hasattr(config, "workerinput"):
+        config.iplist = read_device_list()
+
+
+def pytest_configure_node(node):
+    # the master for each node fills slaveinput dictionary
+    # which pytest-xdist will transfer to the subprocess
+    node.workerinput["device_adr"] = node.config.iplist.pop()
+
+
 @pytest.fixture
 def hexagon_launcher(
     hexagon_server_process,
@@ -192,14 +220,12 @@ def hexagon_launcher(
     tvm_tracker_host,
     tvm_tracker_port,
     adb_server_socket,
-    android_serial_number,
 ) -> HexagonLauncherRPC:
     """Initials and returns hexagon launcher which reuses RPC info and Android serial number."""
-    if android_serial_number is None:
-        yield None
+    android_serial_num = android_serial_number()
 
-    if android_serial_number != "simulator":
-        rpc_info = hexagon_server_process._rpc_info
+    if android_serial_num != ["simulator"]:
+        rpc_info = hexagon_server_process["launcher"]._rpc_info
     else:
         rpc_info = {
             "rpc_tracker_host": tvm_tracker_host,
@@ -208,13 +234,17 @@ def hexagon_launcher(
             "adb_server_socket": adb_server_socket,
         }
 
-    launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
     try:
-        if android_serial_number == "simulator":
+        if android_serial_num == ["simulator"]:
+            launcher = HexagonLauncher(serial_number=android_serial_num[0], rpc_info=rpc_info)
             launcher.start_server()
+        else:
+            launcher = HexagonLauncher(
+                serial_number=hexagon_server_process["device_adr"], rpc_info=rpc_info
+            )
         yield launcher
     finally:
-        if android_serial_number == "simulator":
+        if android_serial_num == ["simulator"]:
             launcher.stop_server()
         launcher.cleanup_directory()
 
@@ -239,7 +269,7 @@ def terminate_rpc_servers():
     # yield happens every time.
     serial = os.environ.get(ANDROID_SERIAL_NUMBER)
     yield []
-    if serial == "simulator":
+    if serial == ["simulator"]:
         os.system("ps ax | grep tvm_rpc_x86 | awk '{print $1}' | xargs kill")
 
 
diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh
index d27f008093e0..afb759c09356 100755
--- a/tests/scripts/setup-pytest-env.sh
+++ b/tests/scripts/setup-pytest-env.sh
@@ -74,7 +74,7 @@ function run_pytest() {
 
     suite_name="${test_suite_name}-${current_shard}-${ffi_type}"
 
-    if [[ ! "${extra_args[@]}" == *" -n"* ]]; then
+    if [ ! "${extra_args[@]}" == *" -n"* ] && [! "${extra_args[@]}" == *" -dist"* ]; then
         extra_args+=("-n=1")
     fi
 
diff --git a/tests/scripts/task_python_hexagon.sh b/tests/scripts/task_python_hexagon.sh
index c87bc9b250fa..f7c0a43c48e8 100755
--- a/tests/scripts/task_python_hexagon.sh
+++ b/tests/scripts/task_python_hexagon.sh
@@ -39,8 +39,20 @@ if [[ "${device_serial}" == "simulator" ]]; then
     export HEXAGON_SHARED_LINK_FLAGS="-Lbuild/hexagon_api_output -lhexagon_rpc_sim"
 fi
 
+num_of_devices=0
+if [ ! "${device_serial}" == "simulator" ]; then
+    IFS=',' read -ra ADDR <<< "$device_serial"
+    for i in "${ADDR[@]}"; do
+        num_of_devices=$(($num_of_devices+1))
+    done
+fi
+
 export ANDROID_SERIAL_NUMBER=${device_serial}
-run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon
+if [ "${device_serial}" == "simulator" ]; then
+    run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon
+else
+    run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon --tx $num_of_devices*popen --dist=load
+fi
 
 if [[ "${device_serial}" == "simulator" ]]; then
     kill ${TRACKER_PID}

From f53ee0cecf96adad71db92d2a0c488ca2dd6bee7 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 24 Aug 2022 11:44:59 -0700
Subject: [PATCH 031/704] [Hexagon] Fix missing pytest import (#12565)

* Add pytest

* lint
---
 tests/python/contrib/test_hexagon/topi/test_cast_slice.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py b/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
index 6569ce36bb0e..1b235a4daf52 100644
--- a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """ Tests for Hexagon slice cast ops """
+import pytest
 import numpy as np
 
 import tvm
@@ -75,6 +76,7 @@ def test_cast_fp16_fp32_slice(
         """
         if hexagon_session._launcher._serial_number != "simulator":
             pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957")
+
         target_hexagon = tvm.target.hexagon("v69")
         target = tvm.target.Target(target_hexagon, host=target_hexagon)
         cast_input = te.placeholder(input_shape, name="A", dtype=dtype)

From 1afd0593956066635ee49297b731726c9218c91c Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <73191103+jverma-quic@users.noreply.github.com>
Date: Wed, 24 Aug 2022 13:55:50 -0500
Subject: [PATCH 032/704] [TOPI][Hexagon] Implement quantized avgpool (#12340)

* [TOPI][Hexagon] Implement quantized avgpool

* Fix pylint errors

* Needed to adjust input padding for int8 buffer layout

* Fix formatting issue

* Add unit test for fixed-point conversion utility function

Also, address review comments.

* Remove pytest.skip for test_avg_pool2d_slice.py to enable on-target testing

* Fix formatting issue

* Update python/tvm/topi/hexagon/utils.py

Co-authored-by: Christian Convey <christian.convey@gmail.com>

* Update comments and error messages

* Address review comments

* Import Tuple from typing

* Address pylint error

Co-authored-by: Christian Convey <christian.convey@gmail.com>
---
 python/tvm/topi/hexagon/__init__.py           |   1 +
 python/tvm/topi/hexagon/qnn/__init__.py       |  20 ++
 python/tvm/topi/hexagon/qnn/avg_pool2d.py     | 205 +++++++++++++++++
 python/tvm/topi/hexagon/slice_ops/__init__.py |   2 +-
 .../tvm/topi/hexagon/slice_ops/avg_pool2d.py  |  24 +-
 python/tvm/topi/hexagon/utils.py              | 136 ++++++++++++
 .../contrib/test_hexagon/infrastructure.py    |  55 ++++-
 .../test_fixed_point_conversion.py            |  64 ++++++
 .../topi/test_avg_pool2d_slice.py             | 209 +++++++++++-------
 9 files changed, 625 insertions(+), 91 deletions(-)
 create mode 100644 python/tvm/topi/hexagon/qnn/__init__.py
 create mode 100644 python/tvm/topi/hexagon/qnn/avg_pool2d.py
 create mode 100644 tests/python/contrib/test_hexagon/test_fixed_point_conversion.py

diff --git a/python/tvm/topi/hexagon/__init__.py b/python/tvm/topi/hexagon/__init__.py
index 7b0aa59c8de3..dfe739288187 100644
--- a/python/tvm/topi/hexagon/__init__.py
+++ b/python/tvm/topi/hexagon/__init__.py
@@ -26,3 +26,4 @@
 from .pooling import *
 from .reduce import *
 from .resize2d import *
+from .qnn import *
diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
new file mode 100644
index 000000000000..e27e3793d565
--- /dev/null
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Computes and schedules for Hexagon quantized ops """
+
+from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule
diff --git a/python/tvm/topi/hexagon/qnn/avg_pool2d.py b/python/tvm/topi/hexagon/qnn/avg_pool2d.py
new file mode 100644
index 000000000000..4aac15cbdc17
--- /dev/null
+++ b/python/tvm/topi/hexagon/qnn/avg_pool2d.py
@@ -0,0 +1,205 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals
+
+""" Compute and schedule for quantized avg_pool2d op
+
+Please note the following assumptions made by the implementation:
+
+1) The input must be padded in advance to account for 'padding'. In addition,
+   both input and output must be padded as per the physical buffer layout.
+2) The current implementation assumes 'count_include_pad' to be 'True'. It can be
+   modified to support 'False' case but the element count for the pooling window
+   must be pre-computed and provided as an input to reduce the run-time overhead.
+3) 'padding' is ignored. It must be handled outside of the sliced op.
+4) Please note that this implementation will not work if the output includes any
+   physical layout related padding as it can result into out-of-bound access
+   for the input.
+"""
+
+from tvm import te
+from tvm import tir
+from ..utils import get_layout_transform_fn, get_fixed_point_value
+
+
+def validate_out_shape(out_shape: list, in_shape: list, kernel: list, stride: list, dilation: list):
+    """Validate output shape"""
+    _, oh, ow, _ = out_shape
+    _, ih, iw, _ = in_shape
+    kh, kw = kernel
+    sh, sw = stride
+    dh, dw = dilation
+    if ih < (oh - 1) * sh + dh * (kh - 1) + 1:
+        raise RuntimeError("Output height is too large")
+    if iw < (ow - 1) * sw + dw * (kw - 1) + 1:
+        raise RuntimeError("Output width is too large")
+
+
+def saturate(x: te.Tensor, dtype: str):
+    """Saturate value for the specified data type"""
+    return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype)))
+
+
+def qnn_avg_pool2d_compute(
+    data: te.Tensor,
+    kernel: list,
+    stride: list,
+    dilation: list,
+    oshape: list,
+    odtype: str,
+    # quantization params:
+    input_zero_point: int,
+    input_scale: float,
+    output_zero_point: int,
+    output_scale: float,
+):
+    """Compute for quantized avg_pool2d"""
+    kh, kw = kernel
+    rh = te.reduce_axis((0, kh), name="rh")
+    rw = te.reduce_axis((0, kw), name="rw")
+    ob, oh, ow, oc = oshape
+    if isinstance(ob, int):
+        validate_out_shape(oshape, data.shape, kernel, stride, dilation)
+
+    if odtype == "uint8":
+        temp_dtype = "uint16"
+    elif odtype == "int8":
+        temp_dtype = "int16"
+    else:
+        raise RuntimeError(f"Unsupported output dtype, {odtype}'")
+
+    sh, sw = stride
+    dh, dw = dilation
+
+    PoolArea = kh * kw
+
+    scale = input_scale / output_scale
+    scale_fixed_point, rsh = get_fixed_point_value(scale, "int16")
+    scale_with_area = scale_fixed_point // PoolArea
+    corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point
+
+    Sum = te.compute(
+        oshape,
+        lambda b, h, w, c: te.sum(
+            data[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw]
+        ),
+        name="sum",
+    )
+
+    Avg = te.compute(
+        oshape,
+        lambda b, h, w, c: saturate(
+            ((Sum[b, h, w, c] * scale_with_area) + corr) >> rsh, odtype
+        ).astype(odtype),
+        name="avg",
+    )
+    return Avg
+
+
+def schedule_nhwc_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str):
+    """Schedule for input and output layout nhwc-8h8w32c"""
+    func = te.create_prim_func([ins, outs])
+    s = tir.Schedule(func)
+    Sum = s.get_block("sum")
+    Avg = s.get_block("avg")
+
+    input_transform_fn = get_layout_transform_fn(input_layout)
+    output_transform_fn = get_layout_transform_fn(output_layout)
+    s.transform_layout(Sum, ("read", 0), input_transform_fn)
+    s.transform_layout(Avg, ("write", 0), output_transform_fn)
+
+    # Schedule 'Avg'
+    # Split and reorder the axes to iterate over the output tensor chunks.
+    # Each chunk consists for 2048 bytes with 32 channels being the fastest
+    # changing axis, followed by 8 width and then 8 height.
+    # The width is split by a factor of 4 and then fused with 32 channels
+    # to provide full vector length of data for the output tensor chunks.
+    # NOTE: These schedules are a work in progress and may require
+    # adjustments in future as some of the missing features for 2-d tensors
+    # become available.
+    n, h, w, c = s.get_loops(Avg)
+    ho, hi = s.split(h, [None, 8])
+    wo, wi = s.split(w, [None, 8])
+    wio, wii = s.split(wi, [None, 4])
+    co, ci = s.split(c, [None, 32])
+    s.reorder(n, ho, wo, co, hi, wio, wii, ci)
+    wii_ci = s.fuse(wii, ci)
+    s.vectorize(wii_ci)
+
+    # Schedule 'Sum'
+    s.compute_at(Sum, wio)
+    Sum_axis = s.get_loops(Sum)
+    # Compute for 'Sum' includes reduction along height and width. The axes
+    # are being reordered so that 4 width and 32 channels become the
+    # inner-most loops which then can be fused and vectorized. However,
+    # vectorization of the 2-d tensors doesn't work when reduction is
+    # involved and requires codegen support that is yet to be added.
+    s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-4], Sum_axis[-3])
+    ci_wii = s.fuse(Sum_axis[-4], Sum_axis[-3])
+    # s.vectorize(ci_wii) # Doesn't work
+    return s
+
+
+def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str):
+    """Schedule for output layout: n11c-2048c, input layout: nhwc-8h8w32c"""
+    func = te.create_prim_func([ins, outs])
+    s = tir.Schedule(func)
+    Sum = s.get_block("sum")
+    Avg = s.get_block("avg")
+
+    input_transform_fn = get_layout_transform_fn(input_layout)
+    output_transform_fn = get_layout_transform_fn(output_layout)
+    s.transform_layout(Sum, ("read", 0), input_transform_fn)
+    s.transform_layout(Avg, ("write", 0), output_transform_fn)
+
+    # Schedule 'Avg'
+    # Split and reorder the axes to iterate over the output tensor chunks.
+    # Each chunk consists for 2048 bytes. For n11c-2048c tensor layout, each chunk
+    # only contains 2048 channels which get split by a factor of 128 to be vectorized.
+    # NOTE: These schedules are a work in progress and may require
+    # adjustments in future as some of the missing features for 2-d tensors
+    # become available.
+    n, h, w, c = s.get_loops(Avg)
+    co, ci = s.split(c, [None, 2048])
+    cio, cii = s.split(ci, [None, 128])
+    s.vectorize(cii)
+
+    # Schedule 'Sum'
+    # Compute for 'Sum' includes reduction along height and width. The axes are being
+    # reordered so that 128 channels become the inner-most loop and can be vectorized.
+    # However, vectorization of the 2-d tensors doesn't work when reduction is
+    # involved and requires codegen support that is yet to be added.
+    s.compute_at(Sum, cio)
+    Sum_axis = s.get_loops(Sum)
+    s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-3])
+    # s.vectorize(Sum_axis[-3]) # Doesn't work
+    return s
+
+
+def qnn_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str):
+    """Quantized avg_pool2d schedule
+
+    NOTE: This schedule assumes that both input and output tensors are in the form of
+    2d discontiguous buffer and data is already arranged as per the input and output layout
+    respectively.
+
+    """
+    if output_layout == "nhwc-8h8w32c-2d":
+        return schedule_nhwc_8h8w32c(outs, ins, output_layout, input_layout)
+    if output_layout == "n11c-2048c-2d":
+        return schedule_n11c_2048c(outs, ins, output_layout, input_layout)
+    raise RuntimeError(f"Unexpected layout '{output_layout}'")
diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py
index cda63e2e1c73..b96156dc46d2 100644
--- a/python/tvm/topi/hexagon/slice_ops/__init__.py
+++ b/python/tvm/topi/hexagon/slice_ops/__init__.py
@@ -17,7 +17,7 @@
 
 """ Computes and Schedules for Hexagon slice ops. """
 
-from .avg_pool2d import avg_pool2d_compute, avg_pool2d_STIR_schedule
+from .avg_pool2d import avg_pool2d_compute, avg_pool2d_schedule
 from .max_pool2d import max_pool2d_compute, max_pool2d_STIR_schedule
 from .add_subtract_multiply import *
 from .argmax import argmax_compute, argmax_schedule
diff --git a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py
index 306be543d8fb..38e2ea577b68 100644
--- a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py
+++ b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py
@@ -49,33 +49,35 @@ def validate_out_shape(out_shape, in_shape, kernel, stride, dilation):
         raise RuntimeError("Output width is too large")
 
 
-def avg_pool2d_compute(A, out_shape, kernel, stride, dilation):
+def avg_pool2d_compute(A, kernel, stride, dilation, oshape, odtype="float16"):
     """avg_pool2d compute"""
+    if odtype != "float16":
+        RuntimeError(f"Unsupported output dtype '{odtype}'")
     kh, kw = kernel
     rh = te.reduce_axis((0, kh), name="rh")
     rw = te.reduce_axis((0, kw), name="rw")
-    ob, oh, ow, oc = out_shape
+    ob, oh, ow, oc = oshape
     if isinstance(ob, int):
-        validate_out_shape(out_shape, A.shape, kernel, stride, dilation)
+        validate_out_shape(oshape, A.shape, kernel, stride, dilation)
 
     sh, sw = stride
     dh, dw = dilation
     InvArea = float(1) / (kh * kw)
 
     Sum = te.compute(
-        out_shape,
+        oshape,
         lambda b, h, w, c: te.sum(
             A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw]
         ),
         name="sum",
     )
     Avg = te.compute(
-        out_shape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(A.dtype), name="avg"
+        oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(A.dtype), name="avg"
     )
     return Avg
 
 
-def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str):
+def schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str):
     """Schedule for input and output layout nhwc-8h2w32c2w"""
     func = te.create_prim_func([ins, outs])
     s = tir.Schedule(func)
@@ -106,7 +108,7 @@ def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: st
     return s
 
 
-def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str):
+def schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str):
     """Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w"""
     func = te.create_prim_func([ins, outs])
     s = tir.Schedule(func)
@@ -132,10 +134,10 @@ def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str):
     return s
 
 
-def avg_pool2d_STIR_schedule(outs, ins, output_layout: str, input_layout: str):
-    """STIR based schedule"""
+def avg_pool2d_schedule(outs, ins, output_layout: str, input_layout: str):
+    """avg_pool2d schedule"""
     if output_layout == "nhwc-8h2w32c2w-2d":
-        return STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout)
+        return schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout)
     if output_layout == "n11c-1024c-2d":
-        return STIR_schedule_n11c_1024c(outs, ins, output_layout, input_layout)
+        return schedule_n11c_1024c(outs, ins, output_layout, input_layout)
     raise RuntimeError(f"Unexpected layout '{output_layout}'")
diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py
index 3b8914ffe937..c056408947b7 100644
--- a/python/tvm/topi/hexagon/utils.py
+++ b/python/tvm/topi/hexagon/utils.py
@@ -19,6 +19,9 @@
 
 
 """Common hexagon specific utilities"""
+import math
+import struct
+from typing import Tuple
 from tvm import te
 
 
@@ -102,6 +105,11 @@ def nhwc_8h8w32c_2d(n, h, w, c):
     return [n, h // 8, w // 8, c // 32, te.AXIS_SEPARATOR, h % 8, w % 8, c % 32]
 
 
+def n11c_2048c_2d(n, h, w, c):
+    """Return index map for n11c_2048c 2d layout"""
+    return [n, h, w, c // 2048, te.AXIS_SEPARATOR, c % 2048]
+
+
 def iohw_16i32o2i_1d(height, width, in_channel, out_channel):
     return [
         in_channel // 32,
@@ -150,4 +158,132 @@ def get_layout_transform_fn(layout):
         return nc_2048_2d
     if layout == "nhwc-8h8w32c-2d":
         return nhwc_8h8w32c_2d
+    if layout == "n11c-2048c-2d":
+        return n11c_2048c_2d
     raise RuntimeError(f"Unexpected layout '{layout}'")
+
+
+def get_fixed_point_value(flp: float, dtype: str = "int16") -> Tuple[int, int]:
+    """
+    Return fixed-point value and the corresponding log2 of the scale factor used to compute
+    this value.
+
+    Parameters
+    ----------
+    flp : float
+        Floating-point value to be converted
+    dtype : str
+        Type of the resulting fixed-point value. By default, it's set to "int16"
+
+    Returns
+    -------
+    fixed_point_value : int
+        Fixed-point value for the given floating-point value
+    exp_scale_factor : int
+        log2 of the scale factor
+
+    Convert floating-point value into fixed-point number. This is done by
+    multiplying the value by a scaling factor and then rounding it to the nearest
+    integer value.
+
+    As per IEEE-754 standard, a floating-point value can be represented as follows
+    [see: https://en.wikipedia.org/wiki/IEEE_754-1985]:
+        (-1)^S * M * 2^(E-Bias)
+
+    Here,
+    * S is the signed bit (0 or 1).
+    * M is the mantissa. It's composed of an implicit 1 for the normalized floating-point
+      values or 0 for the denormalized values, and the fraction part. This ensures that
+      mantissa is always within [0, 2) range. Please note that this function doesn't
+      handle denormalized values.
+    * E is the exponent.
+
+    In single precision, 23 bits are used to represent the fraction part of
+    the mantissa (and therefore, '23' shows up in one of the computations below) and
+    8 bits are used for the exponent. Since exponent field needs to reperesent both
+    positive and negative values, a bias (127 for single precision) is added to the actual
+    value. Therefore, to compute the actual exponent, 127 must be subtracted from the stored
+    value.
+
+    As mentioned above, to find the corresponding fixed-point number, we multiply the
+    value with a scaling factor and then round it to the nearest integer. The scaling factor
+    is chosen to be a power for 2 and it's the largest value that can be safely multiplied
+    to the floating-point value, without causing the resulting value to overflow the range
+    of the integer type used to represent the fixed-point value.
+
+    So, if we assume the scaling factor to be 2^x, the resulting fixed-point value will be:
+        round((-1)^S * (M) * 2^(E-Bias) * 2^x)
+
+    This can be simplified to:
+        round((-1)^S * M * 2^(E-Bias+x)
+
+    Now, if 'int16' is used for fixed-point value, then it has to be >= -(2 * 2^14)
+    and <= (2 * 2^14) - 1. Since M (Mantissa) is always < 2, in order for the fixed-point value
+    to be within this range, 2^(E - Bias + x) must be <= 2^14 - 1.
+    And, if we ignore -1, (E - Bias + x) should be <= 14. Note: if mantissa gets too close to 2,
+    this will cause the resulting value to go out of range and require it to be saturated.
+    In the following implementation, we perform range check and adjust the scale to avoid
+    saturation.
+    For most cases, 2^x, where x = 14 - (E - Bias) or 14 - (E - 127) for single precision, is the
+    best scaling factor for 'int16' type that can be used to convert the floating-point value to
+    fixed-point with the least amount of precision loss.
+
+    Additonal notes on various floating-point values:
+    ------------------------------------------------
+    1) Denormalized values: causes assertion failure. The problem with the denormalized values
+        is that they require a very large scale factor (>= 2^127) to be converted to a fixed-point
+        value. As the denormalzied values get smaller, the scale factor becomes too large to be
+        represented as a IEEE-754 floating point value (as being done in the computaton below)
+        and therefore, the denormalized values aren't being handled here.
+    2) NaN and INF: assertion failure
+    """
+
+    def within_range(val, dtype):
+        if dtype == "int16":
+            return -32768 <= val <= 32767
+        raise RuntimeError(f"Unsupported dtype, {dtype}'")
+
+    # Make sure that 'flp' isn't NaN or infinity
+    if math.isnan(flp) or math.isinf(flp):
+        raise RuntimeError("NaN or INF can not be represented as fixed-point")
+
+    flp_f = struct.pack("f", flp)
+    flp_i = struct.unpack("I", flp_f)
+    exp_stored_value = (flp_i[0] >> 23) & 0xFF
+
+    if exp_stored_value == 0:
+        raise RuntimeError(
+            "Denormalized values are not considered for float -> fixed-point conversion!"
+        )
+
+    exp_value = ((flp_i[0] >> 23) & 0xFF) - 127
+    if dtype == "int16":
+        max_bits = 14
+    else:
+        raise RuntimeError(f"Unsupported dtype, {dtype}'")
+
+    exp_scale_factor = max_bits - exp_value  # log2 of the scale_factor
+
+    if exp_scale_factor > 127:
+        raise RuntimeError("Value too small for fixed-point conversion!")
+
+    # Scaling factor = 2^exp_scale_factor
+    # Since exp_scale_factor can be -ve or +ve, scaling factor is calculated by first
+    # representing the value in the binary format as per IEEE floating-point standand and then
+    # reinterpreting it as a float using struct.pack and struct.unpack functions.
+    # struct.pack returns a bytes object packed as integer and struct.unpack
+    # unpacks this bytes object into float.
+    scale = ((exp_scale_factor + 127) & 0xFF) << 23
+    scale_i = struct.pack("I", scale)
+    scale_f = struct.unpack("f", scale_i)
+    fixed_point_value = int(round(flp * scale_f[0]))
+
+    if not within_range(fixed_point_value, dtype):
+        # Adjust scale factor to avoid overflow.
+        exp_scale_factor -= 1
+        scale = ((exp_scale_factor + 127) & 0xFF) << 23
+        scale_i = struct.pack("I", scale)
+        scale_f = struct.unpack("f", scale_i)
+        fixed_point_value = int(round(flp * scale_f[0]))
+
+    return fixed_point_value, exp_scale_factor
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
index ab5f62498262..70e50fcb68d6 100644
--- a/tests/python/contrib/test_hexagon/infrastructure.py
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -267,8 +267,8 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str):
             assert h == 1 and w == 1, "The size of h and w must be 1"
             return arr_np.reshape([n, 1, 1, c // 1024, 1024])
         if new_layout == "nc-1024-2d":
-            N, C = arr_np.shape
-            return arr_np.reshape([N, C // 1024, 1024])
+            n, c = arr_np.shape
+            return arr_np.reshape([n, c // 1024, 1024])
         if new_layout == "nhwc-1024c-2d":
             N, H, W, C = arr_np.shape
             return arr_np.reshape([N, H, W, C // 1024, 1024])
@@ -278,11 +278,16 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str):
         if new_layout == "nhwc-2048c-2d":
             N, H, W, C = arr_np.shape
             return arr_np.reshape([N, H, W, C // 2048, 2048])
-        if new_layout in ["nhwc-8h8w32c-2d"]:
+        if new_layout == "nhwc-8h8w32c-2d":
             n, h, w, c = arr_np.shape
             return arr_np.reshape([n, h // 8, 8, w // 8, 8, c // 32, 32]).transpose(
                 0, 1, 3, 5, 2, 4, 6
             )
+        if new_layout == "n11c-2048c-2d":
+            n, h, w, c = arr_np.shape
+            assert h == 1 and w == 1, "The size of h and w must be 1"
+            return arr_np.reshape([n, h, w, c // 2048, 2048])
+        raise RuntimeError(f"Unexpected new_layout '{new_layout}'")
 
     if current_layout == "nc":
         n, c = arr_np.shape
@@ -300,3 +305,47 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str):
         raise RuntimeError(f"Unexpected new_layout '{new_layout}'")
 
     raise RuntimeError(f"Unexpected current_layout '{current_layout}'")
+
+
+def quantize_np(arr_np: numpy.ndarray, dtype: str):
+    """
+    Returns quantized array along with scale and zero-point
+
+    Parameters
+    ----------
+    arr_np: numpy.ndarray
+        Input numpy array to be quantized
+    dtype: str
+        dtype of the quantized array: "uint8", "int8", etc
+
+    Returns
+    -------
+    quant_np: numpy.ndarray
+        Quantized numpy array
+    scale: float
+        Scale
+    zero_point: int
+        Value corresponding to float 0
+
+    """
+    if dtype == "uint8":
+        qmax = 255
+        qmin = 0
+    elif dtype == "int8":
+        qmax = 128
+        qmin = -127
+    else:
+        raise RuntimeError(f"Unsupported quantized data type '{dtype}'")
+    fmin = numpy.amin(arr_np)
+    fmax = numpy.amax(arr_np)
+
+    # Include floating-point zero in the range
+    if fmax < 0:
+        fmax = 0.0
+    elif fmin > 0:
+        fmin = 0.0
+
+    scale = (fmax - fmin) / (qmax - qmin)
+    zero_point = numpy.rint((fmax * qmin - fmin * qmax) / (fmax - fmin)).astype("int32")
+    quant_np = (arr_np / scale + zero_point).astype(dtype)
+    return quant_np, scale, zero_point
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py b/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
new file mode 100644
index 000000000000..5ec46cf4ae70
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import math
+import struct
+import numpy as np
+import tvm.topi.hexagon.utils as utils
+
+"""
+Test float to fixed-point conversion. We do it by constructing a numpy array with the
+wide range of floating-point values. These values are converted into the
+fixed-point value using topi.hexagon.utils.get_fixed_point_value. Then, these values are
+converted back into float using scale_factor provided by the function. These converted
+floating point values are then compared against the original values and an assertion is
+raised if they happened to be outside of the expected tolerance.
+"""
+
+
+class TestFixedPointConversion:
+    def test_fixed_point_conversion(self):
+        # Construct array with wide range of values
+        fp1 = np.random.uniform(0.00001, 0.0002, size=(10))
+        fp2 = np.random.uniform(0.001, 0.02, size=(10))
+        fp3 = np.random.uniform(1, 20, size=(10))
+        fp4 = np.random.uniform(900, 1000, size=(10))
+        fp5 = np.random.uniform(1e9, 1e10, size=(10))
+
+        # Test for values with largest possible exponent as per IEEE-754 floating-point
+        # standard (actual exp value = 127, stored exp value = 254).
+        fp6 = np.random.uniform(2.4e38, 2.5e38, size=(1))
+
+        # Test for very small floating-point values.
+        fp7 = np.random.uniform(1.4e-34, 1.7e-34, size=(1))
+
+        float_arr = np.concatenate((fp1, fp2, fp3, fp4, fp5, fp6, fp7))
+        for flp in float_arr:
+            fxp, rsh = utils.get_fixed_point_value(flp, "int16")
+            # Compute scale_factor using rsh (rsh is log2 of the scale_factor). While doing this,
+            # we use IEEE-754 floating-point representation since rsh can be negative or positive.
+
+            scale = ((rsh + 127) & 0xFF) << 23  # Add bias (127) and position it into exponent bits
+            scale_i = struct.pack("I", scale)  # Pack it as integer
+            scale_f = struct.unpack("f", scale_i)  # Unpack as float
+
+            converted_flp = fxp / scale_f[0]
+            assert math.isclose(flp, converted_flp, rel_tol=1e-2)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py
index af60e0f2e084..743519901542 100644
--- a/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py
@@ -25,35 +25,67 @@
 from tvm.contrib.hexagon.build import HexagonLauncher
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.hexagon.slice_ops as sl
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+import tvm.topi.hexagon.qnn as qn
+from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
 from ..pytest_util import (
     get_multitest_ids,
     create_populated_numpy_ndarray,
-    TensorContentConstant,
     TensorContentRandom,
-    TensorContentDtypeMin,
-    TensorContentDtypeMax,
 )
 
-
 input_layout = tvm.testing.parameter(
     "nhwc-8h2w32c2w-2d",
 )
 
+dtype = tvm.testing.parameter("float16", "uint8")
+
+
+@tvm.testing.fixture
+def output_layout(output_shape, dtype):
+    o_b, o_h, o_w, o_c = output_shape
+    if dtype == "float16":
+        if o_h == 1 and o_w == 1:
+            return "n11c-1024c-2d"
+        else:
+            assert o_h % 8 == 0 and o_w % 4 == 0, "Invalid output shape"
+            return "nhwc-8h2w32c2w-2d"
+    elif dtype == "int8" or "uint8":
+        if o_h == 1 and o_w == 1:
+            return "n11c-2048c-2d"
+        else:
+            assert o_h % 8 == 0 and o_w % 8 == 0, "Invalid output shape"
+            return "nhwc-8h8w32c-2d"
+    else:
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
 
 @tvm.testing.fixture
 def input_np(input_shape, dtype: str, input_tensor_populator):
+    if dtype == "uint8":
+        dtype = "float32"  # Use "float32" input which will be quantized later
     return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator)
 
 
 @tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, output_layout):
-    return transform_numpy(expected_output_np, "nhwc", output_layout)
+def transformed_expected_output_np(expected_output_np, output_layout, dtype):
+    if dtype == "float16":
+        return transform_numpy(expected_output_np, "nhwc", output_layout)
+    elif dtype in ("uint8", "int8"):
+        quant_arr, scale, zero_point = quantize_np(expected_output_np, dtype)
+        return [transform_numpy(quant_arr, "nhwc", output_layout), scale, zero_point]
+    else:
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
 
 
 @tvm.testing.fixture
-def transformed_input_np_padded(input_np_padded, input_layout):
-    return transform_numpy(input_np_padded, "nhwc", input_layout)
+def transformed_input_np_padded(input_np_padded, input_layout, dtype):
+    if dtype == "float16":
+        return transform_numpy(input_np_padded, "nhwc", input_layout)
+    elif dtype in ("uint8", "int8"):
+        quant_arr, scale, zero_point = quantize_np(input_np_padded, dtype)
+        return [transform_numpy(quant_arr, "nhwc", input_layout), scale, zero_point]
+    else:
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
 
 
 class TestAvgPool2dSlice:
@@ -65,8 +97,6 @@ class TestAvgPool2dSlice:
         "pad",  # padding
         "ceil",  # ceil_mode
         "cnt_padded",  # count_include_pad
-        "out_layout",  # output_layout
-        None,  # dtype
         None,  # input_tensor_populator
     ]
 
@@ -79,8 +109,6 @@ class TestAvgPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -91,8 +119,6 @@ class TestAvgPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -103,8 +129,6 @@ class TestAvgPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         # Test non-one stride and dilation
@@ -116,8 +140,6 @@ class TestAvgPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -128,8 +150,6 @@ class TestAvgPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -140,8 +160,6 @@ class TestAvgPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         # Test non-zero padding
@@ -153,8 +171,6 @@ class TestAvgPool2dSlice:
             [1, 1, 1, 1],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -165,8 +181,6 @@ class TestAvgPool2dSlice:
             [1, 2, 3, 4],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -177,8 +191,6 @@ class TestAvgPool2dSlice:
             [1, 2, 3, 4],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -189,8 +201,6 @@ class TestAvgPool2dSlice:
             [1, 2, 3, 4],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         # Test n11c-1024c-2d layout which will require input and output to have different layout
@@ -202,8 +212,6 @@ class TestAvgPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "n11c-1024c-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -214,8 +222,6 @@ class TestAvgPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "n11c-1024c-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -226,8 +232,6 @@ class TestAvgPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "n11c-1024c-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -238,8 +242,6 @@ class TestAvgPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "n11c-1024c-2d",
-            "float16",
             TensorContentRandom(),
         ),
     ]
@@ -255,8 +257,6 @@ class TestAvgPool2dSlice:
         padding,
         ceil_mode,
         count_include_pad,
-        output_layout,
-        dtype,
         input_tensor_populator,
     ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids)
 
@@ -309,15 +309,32 @@ def input_shape(self, output_shape, kernel, padding, stride, dilation, output_la
         return [o_b, in_h, in_w, o_c]
 
     @tvm.testing.fixture
-    def input_shape_padded(self, input_shape, padding, output_layout):
+    def input_shape_padded(self, input_shape, padding, output_layout, dtype):
         # Input shape is adjusted to account for 'padding'. Also, due to the physical
         # layout of the buffer, height and width are adjusted so that they are a
-        # multiple of 8 and 4 respectively.
-        # NOTE: Input layout is always assumed to be nhwc-8h2w32c2w-2d.
+        # multiple of the buffer size dictated by the layout.
+        # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and
+        # for int8/uint8, it's nhwc-8h8w32c-2d.
+        # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple
+        # of 8. However, the width should be a multiple of 4 for the first case and 8 for
+        # the second case.
+
+        height_mult = 8
+        if dtype == "float16":
+            width_mult = 4  # input layout : nhwc-8h2w32c2w-2d
+        elif dtype in ("uint8", "int8"):
+            width_mult = 8  # input layout : nhwc-8h8w32c-2d
+        else:
+            raise RuntimeError(f"Unsupport dtype '{dtype}'")
+
         pad_before_h, pad_before_w = padding[:2]
         pad_after_h, pad_after_w = padding[2:]
-        padded_input_height = ((input_shape[1] + pad_before_h + pad_after_h + 7) // 8) * 8
-        padded_input_width = ((input_shape[2] + pad_before_w + pad_after_w + 3) // 4) * 4
+        padded_input_height = (
+            (input_shape[1] + pad_before_h + pad_after_h + height_mult - 1) // height_mult
+        ) * height_mult
+        padded_input_width = (
+            (input_shape[2] + pad_before_w + pad_after_w + width_mult - 1) // width_mult
+        ) * width_mult
         return [input_shape[0], padded_input_height, padded_input_width, input_shape[3]]
 
     @tvm.testing.fixture
@@ -332,80 +349,120 @@ def input_np_padded(self, input_np, input_shape, input_shape_padded, padding):
         )
         return input_padded
 
-    @tvm.testing.requires_hexagon
-    def test_avg_pool2d_slice(
+    @tvm.testing.fixture
+    def schedule_args(
         self,
         stride,
         kernel,
         dtype,
         dilation,
-        padding,
-        count_include_pad,
         input_layout,
         output_layout,
         output_shape,
-        input_shape,
         input_shape_padded,
-        input_np,
-        input_np_padded,
         transformed_input_np_padded,
         transformed_expected_output_np,
-        expected_output_np,
-        hexagon_session: Session,
     ):
-        if hexagon_session._launcher._serial_number != "simulator":
-            pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11928")
-
-        target_hexagon = tvm.target.hexagon("v69")
+        """
+        Construct schedule args based on dtype
+        """
         A = te.placeholder(input_shape_padded, name="A", dtype=dtype)
 
-        M = sl.avg_pool2d_compute(A, output_shape, kernel, stride, dilation)
+        if dtype == "float16":
+            M = sl.avg_pool2d_compute(A, kernel, stride, dilation, output_shape)
+            tir_schedule = sl.avg_pool2d_schedule(M, A, output_layout, input_layout)
+        elif dtype in ("uint8", "int8"):
+            in_data, in_scale, in_zero_point = transformed_input_np_padded
+            _, out_scale, out_zero_point = transformed_expected_output_np
+            M = qn.qnn_avg_pool2d_compute(
+                A,
+                kernel,
+                stride,
+                dilation,
+                output_shape,
+                dtype,
+                in_zero_point,
+                in_scale,
+                out_zero_point,
+                out_scale,
+            )
+            tir_schedule = qn.qnn_avg_pool2d_schedule(M, A, output_layout, input_layout)
 
-        # tir schedule
-        tir_schedule = sl.avg_pool2d_STIR_schedule(M, A, output_layout, input_layout)
-        sch = tir_schedule.mod
+        return [tir_schedule.mod, [A, M]]
 
-        input_axis_separator = [4]
-        if output_layout == "nhwc-8h2w32c2w-2d":
-            output_axis_separator = [4]
-        elif output_layout == "n11c-1024c-2d":
-            output_axis_separator = [4]
-        else:
-            raise RuntimeError(f"Unexpected layout '{output_layout}'")
+    @tvm.testing.requires_hexagon
+    def test_avg_pool2d_slice(
+        self,
+        dtype,
+        output_layout,
+        output_shape,
+        transformed_input_np_padded,
+        transformed_expected_output_np,
+        schedule_args,
+        hexagon_session: Session,
+    ):
+        target_hexagon = tvm.target.hexagon("v69")
+        in_data = transformed_input_np_padded
 
         with tvm.transform.PassContext(opt_level=3):
             func = tvm.build(
-                sch,
-                [A, M],
+                *schedule_args,
                 tvm.target.Target(target_hexagon, host=target_hexagon),
                 name="avg_pool2d",
             )
 
+        input_axis_separator = [4]
+        if output_layout in (
+            "nhwc-8h2w32c2w-2d",
+            "nhwc-8h8w32c-2d",
+            "n11c-1024c-2d",
+            "n11c-2048c-2d",
+        ):
+            output_axis_separator = [4]
+        else:
+            raise RuntimeError(f"Unexpected layout '{output_layout}'")
+
+        if dtype == "float16":
+            in_data_np = transformed_input_np_padded
+            out_data_np = transformed_expected_output_np
+        elif dtype in ("uint8", "int8"):
+            in_data_np, _, _ = transformed_input_np_padded
+            out_data_np, _, _ = transformed_expected_output_np
+        else:
+            raise RuntimeError(f"Unsupport dtype '{dtype}'")
+
         input_arr = allocate_hexagon_array(
             hexagon_session.device,
-            data=transformed_input_np_padded,
+            data=in_data_np,
             axis_separators=input_axis_separator,
             mem_scope="global.vtcm",
         )
         output_arr = allocate_hexagon_array(
             hexagon_session.device,
-            transformed_expected_output_np.shape,
+            out_data_np.shape,
             dtype,
             axis_separators=output_axis_separator,
             mem_scope="global.vtcm",
         )
 
         mod = hexagon_session.load_module(func)
+
         mod(input_arr, output_arr)
         b, h, w, c = output_shape
         if output_layout == "nhwc-8h2w32c2w-2d":
             output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2])
+        elif output_layout == "nhwc-8h8w32c-2d":
+            output_np = output_arr.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32])
+        elif output_layout == "n11c-2048c-2d":
+            output_np = output_arr.numpy().reshape([b, 1, 1, c // 2048, 2048])
         elif output_layout == "n11c-1024c-2d":
             output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024])
         else:
             raise RuntimeError(f"Unexpected layout '{output_layout}'")
-
-        np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-3, atol=1e-3)
+        if dtype == "float16":
+            np.testing.assert_allclose(output_np, out_data_np, rtol=1e-3, atol=1e-3)
+        else:
+            np.testing.assert_allclose(output_np, out_data_np, rtol=1, atol=1)
 
 
 if __name__ == "__main__":

From 17989e8ab519bdcc66014ccee42438f0dfd32023 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 24 Aug 2022 17:45:14 -0700
Subject: [PATCH 033/704] [microTVM] Fix `build` directory exists error
 (#12575)

When you build a project from existing project directory using `tvm.micro.project.GeneratedProject.from_directory` it would show up error if build directory previously existed.
---
 apps/microtvm/zephyr/template_project/microtvm_api_server.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
index 38a7ec0c2939..76895c430bd6 100644
--- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py
+++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
@@ -673,6 +673,8 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
                 tf.extractall(project_dir)
 
     def build(self, options):
+        if BUILD_DIR.exists():
+            shutil.rmtree(BUILD_DIR)
         BUILD_DIR.mkdir()
 
         zephyr_board = _find_board_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME)

From b8fbfe26ae3b5e323d2d85ffe02913d78bd0fd20 Mon Sep 17 00:00:00 2001
From: Yuchao Zhang <16538059+Lucien0@users.noreply.github.com>
Date: Thu, 25 Aug 2022 08:46:08 +0800
Subject: [PATCH 034/704] [MicroTVM] fix compile error when the compiler
 implements char as unsigned (#12519)

When compiling tvm with micro on the compiler which implements char as unsigned(such as arm-linux-gcc), there is an error:
`src/runtime/crt/graph_executor/load_json.c:218:12: error: result of comparison of constant -1 with expression of type 'char' is always false [-Werror,-Wtautological-constant-out-of-range-compare]`
`    if (ch == EOF || ch == '\r' || ch == '\n') {`
The reason is because the implementation of char is undefined, so it's better to specify here that it is signed.
---
 src/runtime/crt/graph_executor/load_json.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/crt/graph_executor/load_json.c b/src/runtime/crt/graph_executor/load_json.c
index f1c1f6768168..3d3cdb8d1ce9 100644
--- a/src/runtime/crt/graph_executor/load_json.c
+++ b/src/runtime/crt/graph_executor/load_json.c
@@ -177,7 +177,7 @@ char JSONReader_PeekNextNonSpace(JSONReader* reader) {
  */
 int JSONReader_ReadString(JSONReader* reader, char* out_str, size_t out_str_size) {
   int status = 0;
-  char ch = reader->NextNonSpace(reader);
+  int ch = reader->NextNonSpace(reader);
   size_t output_counter = 0;
   while (output_counter < out_str_size || out_str == NULL) {
     ch = reader->NextChar(reader);

From cd8fd9121deb22b078c9fe73cd8a554e6e7a0e15 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Wed, 24 Aug 2022 19:21:35 -0700
Subject: [PATCH 035/704] [TIR] Expose `shift_left` and `shift_right` to Python
 (#12584)

This PR exposes the following TIR operation in python:

- `shift_left`: tested [here](https://github.com/apache/tvm/blob/1afd0593956066635ee49297b731726c9218c91c/tests/python/unittest/test_tir_transform_simplify.py#L487)
- `shift_right`: add new unittest

Co-authored-by: yongwww <yongcale@gmail.com>
---
 python/tvm/tir/__init__.py                 |  2 +-
 python/tvm/tir/op.py                       | 38 ++++++++++++++++++++++
 tests/python/unittest/test_tir_op_types.py | 16 +++++++++
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index f61e05cc92e9..94efe6e1abfe 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -63,7 +63,7 @@
 from .op import likely, isnan, isnullptr, isfinite, isinf, copysign
 from .op import div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod, ceildiv
 from .op import comm_reducer, min, max, sum
-from .op import q_multiply_shift
+from .op import q_multiply_shift, shift_left, shift_right
 from .op import TVMBackendAllocWorkspace, TVMBackendFreeWorkspace
 
 from .schedule import StmtSRef, BlockScope, ScheduleState, Schedule, ScheduleError
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index c4618042b2dc..4f26b0f94765 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -1604,6 +1604,44 @@ def q_multiply_shift(x, y, q, s):
     return call_intrin("int32", "tir.q_multiply_shift", x, y, q, s)
 
 
+def shift_left(x, y, span=None):
+    """Return the result of x left shifted by y bits.
+
+    Parameters
+    ----------
+    x : PrimExpr
+        Input argument.
+
+    y : PrimExpr
+        Input argument.
+
+    Returns
+    -------
+    z : PrimExpr
+        The result.
+    """
+    return _ffi_api.left_shift(x, y, span)
+
+
+def shift_right(x, y, span=None):
+    """Return the result of x right shifted by y bits.
+
+    Parameters
+    ----------
+    x : PrimExpr
+        Input argument.
+
+    y : PrimExpr
+        Input argument.
+
+    Returns
+    -------
+    z : PrimExpr
+        The result.
+    """
+    return _ffi_api.right_shift(x, y, span)
+
+
 def fmod(x, y):
     """Return the remainder of x divided by y with the same sign as x.
 
diff --git a/tests/python/unittest/test_tir_op_types.py b/tests/python/unittest/test_tir_op_types.py
index 3f0ec37adb85..835a397ee3b2 100644
--- a/tests/python/unittest/test_tir_op_types.py
+++ b/tests/python/unittest/test_tir_op_types.py
@@ -125,6 +125,20 @@ def test_tir_op_vectorcombine():
     assert expr.op.name == "tir.vectorcombine"
 
 
+def test_tir_op_shift_left():
+    x = tir.Var("x", dtype="int32")
+    y = tir.Var("x", dtype="int32")
+    expr = tir.shift_left(x, y)
+    assert expr.op.name == "tir.shift_left"
+
+
+def test_tir_op_shift_right():
+    x = tir.Var("x", dtype="int32")
+    y = tir.Var("x", dtype="int32")
+    expr = tir.shift_right(x, y)
+    assert expr.op.name == "tir.shift_right"
+
+
 def test_tir_op_TVMBackendAllocWorkspace():
     expr = tir.TVMBackendAllocWorkspace(0, 1, 2, 3, 4)
     assert expr.op.name == "tir.TVMBackendAllocWorkspace"
@@ -154,5 +168,7 @@ def test_tir_op_TVMBackendFreeWorkspace():
     test_tir_op_vectorlow()
     test_tir_op_vectorhigh()
     test_tir_op_vectorcombine()
+    test_tir_op_shift_left()
+    test_tir_op_shift_right()
     test_tir_op_TVMBackendAllocWorkspace()
     test_tir_op_TVMBackendFreeWorkspace()

From 9aac161a46e5aca4c433ccb901c1bb84e6c8bd0c Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Wed, 24 Aug 2022 23:28:54 -0700
Subject: [PATCH 036/704] [MetaSchedule] Add software pipeline in CUDA tensor
 core auto tensorization (#12544)

cc @Hzfengsy @junrushao @junrushao1994 @masahi @spectrometerHBH
---
 include/tvm/meta_schedule/schedule_rule.h     |   3 +-
 python/tvm/meta_schedule/default_config.py    |   1 +
 .../schedule_rule/multi_level_tiling.py       |   4 +
 .../meta_schedule/testing/schedule_rule.py    |   2 +
 .../multi_level_tiling_tensor_core.cc         | 122 ++++++++++++++++-
 ...hedule_schedule_rule_multi_level_tiling.py | 125 ++++++++++++++++++
 6 files changed, 255 insertions(+), 2 deletions(-)

diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h
index b5f4a17b698d..2da441c95e0b 100644
--- a/include/tvm/meta_schedule/schedule_rule.h
+++ b/include/tvm/meta_schedule/schedule_rule.h
@@ -190,13 +190,14 @@ class ScheduleRule : public runtime::ObjectRef {
    * NullOpt means disable vectorization
    * \param reuse_read Data reuse configuration for reading. NullOpt means no reuse.
    * \param reuse_write Data reuse configuration for writing. NullOpt means no reuse.
+   * \param use_software_pipeline Whether use the software pipeline.
    * \return The schedule rule created
    */
   TVM_DLL static ScheduleRule MultiLevelTilingTensorCore(
       Array<Map<String, String>> intrin_groups, String structure,
       Optional<Array<String>> tile_binds, Optional<Integer> max_innermost_factor,
       Optional<Array<Integer>> vector_load_lens, Optional<Map<String, ObjectRef>> reuse_read,
-      Optional<Map<String, ObjectRef>> reuse_write);
+      Optional<Map<String, ObjectRef>> reuse_write, bool use_software_pipeline);
 
   /*!
    * \brief Create a rule: add-rfactor to some blocks if needed
diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py
index 105b3467de0e..0f1f7d3c2c6a 100644
--- a/python/tvm/meta_schedule/default_config.py
+++ b/python/tvm/meta_schedule/default_config.py
@@ -381,6 +381,7 @@ def schedule_rules():
                     levels=[2],
                     scope="shared",
                 ),
+                use_software_pipeline=False,
             ),
             *_DefaultCUDA.schedule_rules(),
         ]
diff --git a/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py b/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
index a728a91eb74e..6703bc5716e9 100644
--- a/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
+++ b/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
@@ -161,6 +161,8 @@ class MultiLevelTilingTensorCore(ScheduleRule):
         Data reuse configuration for reading. None means no reuse.
     reuse_write : Optional[ReuseType]
         Data reuse configuration for writing. None means no reuse.
+    use_software_pipeline : bool
+        Whether to use the software pipeline.
     """
 
     def __init__(
@@ -172,6 +174,7 @@ def __init__(
         vector_load_lens: Optional[List[int]] = None,
         reuse_read: Optional[ReuseType] = None,
         reuse_write: Optional[ReuseType] = None,
+        use_software_pipeline: bool = False,
     ) -> None:
         self.__init_handle_by_constructor__(
             _ffi_api.ScheduleRuleMultiLevelTilingTensorCore,  # type: ignore # pylint: disable=no-member
@@ -182,4 +185,5 @@ def __init__(
             vector_load_lens,
             reuse_read.as_dict() if reuse_read is not None else None,
             reuse_write.as_dict() if reuse_write is not None else None,
+            use_software_pipeline,
         )
diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py
index 441ca930f858..46df4b95ce07 100644
--- a/python/tvm/meta_schedule/testing/schedule_rule.py
+++ b/python/tvm/meta_schedule/testing/schedule_rule.py
@@ -119,6 +119,7 @@ def multi_level_tiling_tensor_core(
     in_dtype: Union[str, List[str]] = "float16",
     out_dtype: Union[str, List[str]] = "float32",
     trans_b: Union[bool, List[bool]] = False,
+    use_software_pipeline: bool = False,
 ) -> ScheduleRule:
     """Default schedule rules for with multi-level tiling reuse for tensor core"""
     assert write_reuse_scope in ["shared", "global"]
@@ -154,6 +155,7 @@ def multi_level_tiling_tensor_core(
                 levels=[2],
                 scope=write_reuse_scope,
             ),
+            use_software_pipeline=use_software_pipeline,
         )
     raise NotImplementedError(f"{target.kind.name} is not supported")
 
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 7a3ec513db84..49704fb66b15 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -128,6 +128,8 @@ class MultiLevelTilingTensorCoreNode : public MultiLevelTilingNode {
   inline std::vector<State> AddReadReuseTensorCore(TensorCoreState state) const;
   // Subrule: Add tensorized store
   inline std::vector<State> AddWriteReuseTensorCore(TensorCoreState state) const;
+  // Subrule: Add software pipeline
+  inline std::vector<State> AddSoftwarePipeline(TensorCoreState state) const;
 
   // Override ApplySubRules to apply tensorization-specific sub-rules
   std::vector<State> ApplySubRules(std::vector<State> states) final;
@@ -155,6 +157,8 @@ class MultiLevelTilingTensorCoreNode : public MultiLevelTilingNode {
  public:
   /*! \brief The candidate tensor core intrin groups to apply */
   std::vector<TensorCoreIntrinGroup> intrin_groups;
+  /*! \brief Whether to use software pipeline */
+  bool use_software_pipeline = false;
   static constexpr const char* _type_key = "meta_schedule.MultiLevelTilingTensorCore";
   TVM_DECLARE_FINAL_OBJECT_INFO(MultiLevelTilingTensorCoreNode, MultiLevelTilingNode);
 
@@ -222,6 +226,9 @@ std::vector<State> MultiLevelTilingTensorCoreNode::ApplySubRules(std::vector<Sta
   states = SubRule(std::move(states), [&](State state) {
     return AddReadReuseTensorCore(Downcast<TensorCoreState>(state));
   });
+  states = SubRule(std::move(states), [&](State state) {
+    return AddSoftwarePipeline(Downcast<TensorCoreState>(state));
+  });
   return states;
 }
 
@@ -286,6 +293,117 @@ std::vector<State> MultiLevelTilingTensorCoreNode::AddReadReuseTensorCore(
   return {state};
 }
 
+std::vector<State> MultiLevelTilingTensorCoreNode::AddSoftwarePipeline(
+    TensorCoreState state) const {
+  if (!use_software_pipeline) {
+    return {state};
+  }
+  // The current config is not suitable for software pipelining.
+  if (r_indices_.size() < 2) {
+    return {state};
+  }
+
+  Schedule& sch = state->sch;
+  // Check reduction length after blockize.
+  int64_t reduction_length = 1;
+  for (int r_index : r_indices_) {
+    const Array<LoopRV>& tiles = state->tiles[r_index];
+    for (const LoopRV& tile : tiles) {
+      const auto* extent = sch->Get(tile)->extent.as<IntImmNode>();
+      ICHECK(extent != nullptr) << "Dynamic extent is not supported.";
+      reduction_length *= extent->value;
+    }
+  }
+  if (reduction_length <= 1) {
+    return {state};
+  }
+
+  // Add local stage and double buffering
+  for (int i = 0; i < 2; ++i) {
+    const tir::BlockRV cache_read = state->read_reuse.at(i);
+    sch->Annotate(cache_read, tir::attr::manifest_shared_memory_local_stage, Bool(true));
+    sch->Annotate(cache_read, tir::attr::double_buffer_scope, Integer(0));
+  }
+
+  // Add annotations of software pipeline
+  //
+  // Before pipelining, the original loop can be expressed as the pseudo code below:
+  //
+  // for k0 in [0, K0):
+  //   load tile k0 to registers
+  //   load tile k0 from registers to shared memory
+  //
+  //   for k1 in [0, K1):
+  //     load fragment k1 of tile k0
+  //     compute matmul with fragment k1
+  //
+
+  // Inner software pipeline: Prefetch to tensor core fragment by one iteration
+  // The following annotation for the inner loop is equivalent the pesudo code below:
+  //
+  // Pipelined inner loop:
+  //
+  // prologue:
+  //   load fragment 0
+  // body:
+  //   for k1 in [0, K1 - 1):
+  //     load fragment k1 + 1
+  //     compute matmul with fragment k1
+  // epilogue:
+  //   compute matmul with fragment K1 - 1
+  //
+  sch->Annotate(state->tiles[r_indices_[1]].back(), tir::attr::software_pipeline_stage,
+                Array<Integer>{0, 0, 1});
+  sch->Annotate(state->tiles[r_indices_[1]].back(), tir::attr::software_pipeline_order,
+                Array<Integer>{0, 1, 2});
+  // Outer software pipeline: Interleave the outer loop with the (pipelined) inner loop.
+  // The prefetching stage of the inner pipeline is executed by one iteration in the outer loop.
+  // The following annotation for the outer loop is equivalent the pesudo code below:
+  //
+  // Pipelined outer loop with nested inner pipeline:
+  //
+  // prologue:
+  //   load tile 0 to registers
+  //   load tile 0 from registers to shared memory
+  //
+  //   // prologue of the inner pipeline
+  //   load fragment 0 of tile 0
+  //
+  // body:
+  //   for k0 in [0, K0 - 1):
+  //     load tile k0 + 1 to registers
+  //
+  //     // body of the inner pipeline
+  //     for k1 in [0, K1 - 1):
+  //       load fragment k1 + 1 of tile k0
+  //       compute matmul with fragment k1 of tile k0
+  //
+  //     load tile k0 + 1 from registers to shared memory
+  //
+  //     // prologue of the inner pipeline
+  //     load fragment 0 of tile k0 + 1
+  //
+  //     // epilogue of the inner pipeline
+  //     compute matmul with fragment K1 - 1 of tile k0
+  //
+  // epilogue:
+  //
+  //   // body of the inner pipeline
+  //   for k1 in [0, K1 - 1):
+  //     load fragment k1 + 1 of tile K0 - 1
+  //     compute matmul with fragment k1 of tile K0 - 1
+  //
+  //   // epilogue of the inner pipeline
+  //   compute matmul with fragment K1 - 1 of tile K0 - 1
+  //
+  sch->Annotate(state->tiles[r_indices_[0]].back(), tir::attr::software_pipeline_stage,
+                Array<Integer>{0, 0, 0, 0, 0, 1, 1});
+  sch->Annotate(state->tiles[r_indices_[0]].back(), tir::attr::software_pipeline_order,
+                Array<Integer>{0, 3, 1, 4, 5, 2, 6});
+
+  return {state};
+}
+
 Optional<LoopRV> MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin(
     TensorCoreStateNode* state, const String& intrin_name) const {
   BlockRV block_rv = state->block_rv;
@@ -418,7 +536,8 @@ inline std::vector<State> MultiLevelTilingTensorCoreNode::TransformForTensorizat
 ScheduleRule ScheduleRule::MultiLevelTilingTensorCore(
     Array<Map<String, String>> intrin_groups, String structure, Optional<Array<String>> tile_binds,
     Optional<Integer> max_innermost_factor, Optional<Array<Integer>> vector_load_lens,
-    Optional<Map<String, ObjectRef>> reuse_read, Optional<Map<String, ObjectRef>> reuse_write) {
+    Optional<Map<String, ObjectRef>> reuse_read, Optional<Map<String, ObjectRef>> reuse_write,
+    bool use_software_pipeline) {
   auto node = MultiLevelTilingInitCommon<MultiLevelTilingTensorCoreNode>(
       structure, tile_binds, max_innermost_factor, vector_load_lens, reuse_read, reuse_write);
 
@@ -426,6 +545,7 @@ ScheduleRule ScheduleRule::MultiLevelTilingTensorCore(
   for (const auto& intrin_group_config : intrin_groups) {
     node->intrin_groups.emplace_back(TensorCoreIntrinGroup::FromConfig(intrin_group_config));
   }
+  node->use_software_pipeline = use_software_pipeline;
   return ScheduleRule(node);
 }
 
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py b/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py
index 4da870e455d3..87159fcb3110 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py
@@ -709,6 +709,131 @@ def test_cuda_tensor_core_matmul_relu():
     check_trace(spaces, expected)
 
 
+def test_cuda_tensor_core_software_pipeline_matmul_relu():
+    m = n = k = 128
+    target = Target("cuda", host="llvm")
+    ctx = _create_context(
+        create_prim_func(
+            te_workload.matmul_relu(
+                n=n,
+                m=m,
+                k=k,
+                in_dtype="float16",
+                out_dtype="float32",
+            )
+        ),
+        target=target,
+        rule=[
+            multi_level_tiling_tensor_core(
+                target=target, write_reuse_scope="shared", use_software_pipeline=True
+            ),
+            auto_inline(target),
+        ],
+    )
+    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
+    assert len(spaces) == 1
+
+    expected = [
+        """b0 = sch.get_block(name="C", func_name="main")
+b1 = sch.get_block(name="compute", func_name="main")
+sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
+b2 = sch.reindex(block=b0, buffer=("write", 0))
+b3 = sch.reindex(block=b0, buffer=("read", 0))
+b4 = sch.reindex(block=b0, buffer=("read", 1))
+sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, ))
+sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (k, j, ))
+sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, ))
+sch.transform_block_layout(block=b2, index_map=lambda i, j, k: (i, j, k, ))
+sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, ))
+sch.transform_block_layout(block=b4, index_map=lambda i, j, k: (i, j, k, ))
+sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, ))
+l5, l6, l7 = sch.get_loops(block=b0)
+l8, l9 = sch.split(loop=l7, factors=[None, 16], preserve_unit_iters=True)
+l10, l11 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True)
+l12, l13 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
+l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0)
+sch.reorder(l16, l18, l13, l11, l9)
+b20 = sch.blockize(loop=l13)
+sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32")
+sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32")
+sch.annotate(block_or_loop=b20, ann_key="warp_execution", ann_val=1)
+l21, l22, l23 = sch.get_loops(block=b20)
+v24, v25, v26, v27, v28 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4)
+l29, l30, l31, l32, l33 = sch.split(loop=l21, factors=[v24, v25, v26, v27, v28], preserve_unit_iters=True)
+v34, v35, v36, v37, v38 = sch.sample_perfect_tile(loop=l22, n=5, max_innermost_factor=4)
+l39, l40, l41, l42, l43 = sch.split(loop=l22, factors=[v34, v35, v36, v37, v38], preserve_unit_iters=True)
+v44, v45, v46 = sch.sample_perfect_tile(loop=l23, n=3, max_innermost_factor=4)
+l47, l48, l49 = sch.split(loop=l23, factors=[v44, v45, v46], preserve_unit_iters=True)
+sch.reorder(l29, l39, l30, l40, l31, l41, l47, l48, l32, l42, l49, l33, l43)
+l50 = sch.fuse(l29, l39, preserve_unit_iters=True)
+sch.bind(loop=l50, thread_axis="blockIdx.y")
+l51 = sch.fuse(l30, l40, preserve_unit_iters=True)
+sch.bind(loop=l51, thread_axis="blockIdx.x")
+l52 = sch.fuse(l31, l41, preserve_unit_iters=True)
+sch.bind(loop=l52, thread_axis="threadIdx.y")
+b53 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="shared")
+sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True)
+b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator")
+sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True)
+v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
+sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55)
+sch.reverse_compute_inline(block=b2)
+l56, l57, l58, l59, l60 = sch.get_loops(block=b54)
+l61, l62 = sch.split(loop=l60, factors=[None, 16], preserve_unit_iters=True)
+l63, l64 = sch.split(loop=l59, factors=[None, 16], preserve_unit_iters=True)
+l65, l66, l67, l68, l69, l70, l71 = sch.get_loops(block=b54)
+sch.reorder(l70, l64, l62)
+b72 = sch.blockize(loop=l64)
+sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared")
+b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared")
+sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True)
+l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73)
+l80 = sch.fuse(l78, l79, preserve_unit_iters=True)
+v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
+sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81)
+b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared")
+sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True)
+l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82)
+l89 = sch.fuse(l87, l88, preserve_unit_iters=True)
+v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
+sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90)
+b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a")
+sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True)
+l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91)
+l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True)
+l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True)
+l103, l104, l105, l106, l107, l108, l109, l110, l111 = sch.get_loops(block=b91)
+sch.reorder(l110, l102, l100)
+b112 = sch.blockize(loop=l102)
+sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
+b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b")
+sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True)
+l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113)
+l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True)
+l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True)
+l125, l126, l127, l128, l129, l130, l131, l132, l133 = sch.get_loops(block=b113)
+sch.reorder(l132, l124, l122)
+b134 = sch.blockize(loop=l124)
+sch.annotate(block_or_loop=b134, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b")
+sch.compute_inline(block=b3)
+sch.compute_inline(block=b4)
+sch.storage_align(block=b73, buffer_index=0, axis=-2, factor=32, offset=8)
+sch.storage_align(block=b82, buffer_index=0, axis=-2, factor=32, offset=8)
+sch.annotate(block_or_loop=b73, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1)
+sch.annotate(block_or_loop=b73, ann_key="double_buffer_scope", ann_val=0)
+sch.annotate(block_or_loop=b82, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1)
+sch.annotate(block_or_loop=b82, ann_key="double_buffer_scope", ann_val=0)
+sch.annotate(block_or_loop=l48, ann_key="software_pipeline_stage", ann_val=[0, 0, 1])
+sch.annotate(block_or_loop=l48, ann_key="software_pipeline_order", ann_val=[0, 1, 2])
+sch.annotate(block_or_loop=l47, ann_key="software_pipeline_stage", ann_val=[0, 0, 0, 0, 0, 1, 1])
+sch.annotate(block_or_loop=l47, ann_key="software_pipeline_order", ann_val=[0, 3, 1, 4, 5, 2, 6])
+sch.reverse_compute_inline(block=b1)""".split(
+            "\n"
+        )
+    ]
+    check_trace(spaces, expected)
+
+
 def test_cuda_tensor_core_matmul_relu_global():
     m = n = k = 128
     target = Target("cuda", host="llvm")

From b38738434b13e138916c994b326b5a128ed14004 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Thu, 25 Aug 2022 03:03:27 -0700
Subject: [PATCH 037/704] [TIR] Expose WMMA-related TensorCore builtins
 (#12589)

This PR exposes the following TIR operation in python:

`tvm_load_matrix_sync`: tested [here](https://github.com/apache/tvm/blob/cd8fd9121deb22b078c9fe73cd8a554e6e7a0e15/tests/python/unittest/test_tvmscript_roundtrip.py#L711)
`tvm_store_matrix_sync`: tested [here](https://github.com/apache/tvm/blob/cd8fd9121deb22b078c9fe73cd8a554e6e7a0e15/tests/python/unittest/test_tvmscript_roundtrip.py#L913)
`tvm_mma_sync`: tested [here](https://github.com/apache/tvm/blob/cd8fd9121deb22b078c9fe73cd8a554e6e7a0e15/tests/python/unittest/test_tvmscript_roundtrip.py#L860)
`tvm_bmma_sync`: add new unittest
`tvm_fill_fragment`: tested [here](https://github.com/apache/tvm/blob/cd8fd9121deb22b078c9fe73cd8a554e6e7a0e15/tests/python/unittest/test_tvmscript_roundtrip.py#L571)

Co-authored-by: yongwww <yongcale@gmail.com>

cc: @junrushao


cc @Hzfengsy @junrushao1994

Co-authored-by: yongwww <yongcale@gmail.com>
---
 python/tvm/tir/__init__.py                 |   7 +
 python/tvm/tir/op.py                       | 236 +++++++++++++++++++++
 tests/python/unittest/test_tir_op_types.py |  43 ++++
 3 files changed, 286 insertions(+)

diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 94efe6e1abfe..04ab7f80daa9 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -52,6 +52,13 @@
 from .op import tvm_tuple, tvm_struct_get, tvm_struct_set
 from .op import address_of, lookup_param, assume, undef
 from .op import tvm_thread_allreduce, type_annotation, tvm_access_ptr, tvm_throw_last_error
+from .op import (
+    tvm_load_matrix_sync,
+    tvm_store_matrix_sync,
+    tvm_mma_sync,
+    tvm_bmma_sync,
+    tvm_fill_fragment,
+)
 from .op import vectorlow, vectorhigh, vectorcombine
 from .op import infinity, reinterpret
 from .op import exp, exp2, exp10, log, log2, log10, log1p, ldexp, clz
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index 4f26b0f94765..cf7985e8f489 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -595,6 +595,242 @@ def tvm_throw_last_error():
     return call_intrin("handle", "tir.tvm_throw_last_error")
 
 
+def tvm_load_matrix_sync(fragment, m, n, k, index, buffer_ptr, stride, layout):
+    """TVM intrinsic for tensor core load operators
+
+    Parameters
+    ----------
+    fragment : Var
+        The wmma fragment.
+
+    m : UIntImm
+        The shape of wmma fragment.
+
+    n : UIntImm
+        The shape of wmma fragment.
+
+    k : UIntImm
+        The shape of wmma fragment.
+
+    index : Expr
+        The fragment index.
+
+    buffer_ptr : Expr
+        The fragment buffer pointer.
+
+    stride : Expr
+        The fragment stride.
+
+    layout : Literal["row_major", "column_major"]
+        The fragment layout.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(
+        "handle",
+        "tir.tvm_load_matrix_sync",
+        fragment,
+        m,
+        n,
+        k,
+        index,
+        buffer_ptr,
+        stride,
+        layout,
+    )
+
+
+def tvm_mma_sync(
+    fragment_d, index_d, fragment_a, index_a, fragment_b, index_b, fragment_c, index_c
+):
+    """TVM intrinsic for tensor core mma_sync operators
+
+    Parameters
+    ----------
+    fragment_d : Var
+        The wmma fragment_d.
+
+    index_d : Expr
+        The fragment_d index.
+
+    fragment_a : Var
+        The wmma fragment_a.
+
+    index_a : Expr
+        The fragment_a index.
+
+    fragment_b : Var
+        The wmma fragment_b.
+
+    index_b : Expr
+        The fragment_b index.
+
+    fragment_c : Var
+        The wmma fragment_c.
+
+    index_c : Expr
+        The fragment_c index.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(
+        "handle",
+        "tir.tvm_mma_sync",
+        fragment_d,
+        index_d,
+        fragment_a,
+        index_a,
+        fragment_b,
+        index_b,
+        fragment_c,
+        index_c,
+    )
+
+
+def tvm_bmma_sync(
+    fragment_d, index_d, fragment_a, index_a, fragment_b, index_b, fragment_c, index_c
+):
+    """TVM intrinsic for tensor core bmma_sync operators
+
+    Parameters
+    ----------
+    fragment_d : Var
+        The bwmma fragment_d.
+
+    index_d : Expr
+        The fragment_d index.
+
+    fragment_a : Var
+        The bwmma fragment_a.
+
+    index_a : Expr
+        The fragment_a index.
+
+    fragment_b : Var
+        The bwmma fragment_b.
+
+    index_b : Expr
+        The fragment_b index.
+
+    fragment_c : Var
+        The bwmma fragment_c.
+
+    index_c : Expr
+        The fragment_c index.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(
+        "handle",
+        "tir.tvm_bmma_sync",
+        fragment_d,
+        index_d,
+        fragment_a,
+        index_a,
+        fragment_b,
+        index_b,
+        fragment_c,
+        index_c,
+    )
+
+
+def tvm_fill_fragment(fragment, m, n, k, index, value):
+    """TVM intrinsic for tensor core fill_fragment operators
+
+    Parameters
+    ----------
+    fragment : Var
+        The wmma fragment
+
+    m : UIntImm
+        The shape of wmma fragment.
+
+    n : UIntImm
+        The shape of wmma fragment.
+
+    k : UIntImm
+        The shape of wmma fragment.
+
+    index : Expr
+        The fragment index.
+
+    value : Expr
+        The value to be filled in fragment.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(
+        "handle",
+        "tir.tvm_fill_fragment",
+        fragment,
+        m,
+        n,
+        k,
+        index,
+        value,
+    )
+
+
+def tvm_store_matrix_sync(fragment, m, n, k, index, buffer_ptr, stride, layout):
+    """TVM intrinsic for tensor core store operators
+
+    Parameters
+    ----------
+    fragment : Var
+        The wmma fragment.
+
+    m : UIntImm
+        The shape of wmma fragment.
+
+    n : UIntImm
+        The shape of wmma fragment.
+
+    k : UIntImm
+        The shape of wmma fragment.
+
+    index : Expr
+        The fragment index.
+
+    buffer_ptr : Expr
+        The fragment buffer pointer.
+
+    stride : Expr
+        The fragment stride.
+
+    layout : Literal["row_major", "column_major"]
+        The fragment layout.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(
+        "handle",
+        "tir.tvm_store_matrix_sync",
+        fragment,
+        m,
+        n,
+        k,
+        index,
+        buffer_ptr,
+        stride,
+        layout,
+    )
+
+
 def vectorlow(dtype, vec):
     """Get the low level half of the vector
 
diff --git a/tests/python/unittest/test_tir_op_types.py b/tests/python/unittest/test_tir_op_types.py
index 835a397ee3b2..5254e7326e24 100644
--- a/tests/python/unittest/test_tir_op_types.py
+++ b/tests/python/unittest/test_tir_op_types.py
@@ -104,6 +104,44 @@ def test_tir_op_tvm_throw_last_error():
     assert expr.op.name == "tir.tvm_throw_last_error"
 
 
+def test_tir_op_tvm_load_matrix_sync():
+    buffer = tir.decl_buffer((16, 16), "float32")
+    x = tir.Var("x", "handle")
+    expr = tir.tvm_load_matrix_sync(buffer.data, 16, 16, 16, 0, x, 128, "row_major")
+    assert expr.op.name == "tir.tvm_load_matrix_sync"
+
+
+def test_tir_op_tvm_store_matrix_sync():
+    buffer = tir.decl_buffer((16, 16), "float32")
+    x = tir.Var("x", "handle")
+    expr = tir.tvm_store_matrix_sync(buffer.data, 16, 16, 16, 0, x, 128, "row_major")
+    assert expr.op.name == "tir.tvm_store_matrix_sync"
+
+
+def test_tir_op_tvm_mma_sync():
+    buffer_0 = tir.decl_buffer((16, 16), "float32")
+    buffer_1 = tir.decl_buffer((16, 16), "float32")
+    buffer_2 = tir.decl_buffer((16, 16), "float32")
+    buffer_3 = tir.decl_buffer((16, 16), "float32")
+    expr = tir.tvm_mma_sync(buffer_0.data, 0, buffer_1.data, 0, buffer_2.data, 0, buffer_3.data, 0)
+    assert expr.op.name == "tir.tvm_mma_sync"
+
+
+def test_tir_op_tvm_bmma_sync():
+    buffer_0 = tir.decl_buffer((16, 16), "float32")
+    buffer_1 = tir.decl_buffer((16, 16), "float32")
+    buffer_2 = tir.decl_buffer((16, 16), "float32")
+    buffer_3 = tir.decl_buffer((16, 16), "float32")
+    expr = tir.tvm_bmma_sync(buffer_0.data, 0, buffer_1.data, 0, buffer_2.data, 0, buffer_3.data, 0)
+    assert expr.op.name == "tir.tvm_bmma_sync"
+
+
+def test_tir_op_tvm_fill_fragment():
+    buffer = tir.decl_buffer((16, 16), "float32")
+    expr = tir.tvm_fill_fragment(buffer.data, 16, 16, 16, 0, 0)
+    assert expr.op.name == "tir.tvm_fill_fragment"
+
+
 def test_tir_op_vectorlow():
     buffer = tir.decl_buffer((4, 4), "int8", offset_factor=1)
     vec = buffer.vload([0, 0], dtype="int8x16")
@@ -165,6 +203,11 @@ def test_tir_op_TVMBackendFreeWorkspace():
     test_tir_op_type_annotation()
     test_tir_op_tvm_access_ptr()
     test_tir_op_tvm_throw_last_error()
+    test_tir_op_tvm_load_matrix_sync(),
+    test_tir_op_tvm_store_matrix_sync(),
+    test_tir_op_tvm_mma_sync(),
+    test_tir_op_tvm_bmma_sync(),
+    test_tir_op_tvm_fill_fragment(),
     test_tir_op_vectorlow()
     test_tir_op_vectorhigh()
     test_tir_op_vectorcombine()

From 40bdea8d7ae1109e33ac64265b4819bb8ebef8b3 Mon Sep 17 00:00:00 2001
From: Yuanjing Shi <yuanjing@octoml.ai>
Date: Thu, 25 Aug 2022 00:04:07 -1000
Subject: [PATCH 038/704] [PyTorch] Add aten::new_empty (#12591)

This PR intends to add `aten::new_empty` which is used for model like `hf_Longformer`.

cc: @masahi
---
 python/tvm/relay/frontend/pytorch.py          | 16 ++++++++++++++++
 tests/python/frontend/pytorch/test_forward.py | 17 +++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 04a25c86b799..9f808203a6e1 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2506,6 +2506,21 @@ def empty_like(self, inputs, input_types):
             dtype = input_types[0]
         return _op.zeros(shape, dtype)
 
+    def new_empty(self, inputs, input_types):
+        size = inputs[1]
+
+        import torch
+
+        if not isinstance(size, (_expr.Expr, list, tuple, torch.Size, np.ndarray)):
+            msg = "Data type %s could not be parsed in empty op" % (type(size))
+            raise AssertionError(msg)
+
+        if inputs[2] is not None:
+            dtype = _convert_dtype_value(inputs[2])
+        else:
+            dtype = input_types[0]
+        return _op.zeros(size, dtype)
+
     def randn(self, inputs, input_types):
         import time  # use current time as seed
 
@@ -3639,6 +3654,7 @@ def create_convert_map(self):
             "aten::numel": self.numel,
             "aten::empty": self.empty,
             "aten::empty_like": self.empty_like,
+            "aten::new_empty": self.new_empty,
             "aten::randn": self.randn,
             "aten::bincount": self.bincount,
             "aten::scatter_add": self.scatter_add,
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 7e00770cd593..2d0a476e372d 100755
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -4162,6 +4162,23 @@ def test_func(data):
     verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()], assert_shape_only=True)
 
 
+@tvm.testing.uses_gpu
+def test_new_empty():
+    """test_forward_new_ones"""
+    torch.set_grad_enabled(False)
+    input_shape = [1, 3, 10, 10]
+
+    def test_func(input_tensor):
+        return input_tensor.new_empty([3, 10, 10])
+
+    verify_model_with_input(test_func, [torch.rand(input_shape).float()], assert_shape_only=True)
+
+    def test_func1(input_tensor):
+        return input_tensor.new_empty([3, 10, 10], dtype=torch.int32)
+
+    verify_model_with_input(test_func1, [torch.rand(input_shape).float()], assert_shape_only=True)
+
+
 def test_randn():
     """Test for aten::randn"""
 

From fb7cf97fbc2cc19a7eea879a3a1598780f6aa6aa Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 25 Aug 2022 20:05:45 +0900
Subject: [PATCH 039/704] [CI] Install xgboost in Hexagon image (#12592)

Needed for https://github.com/apache/tvm/pull/12587

@mehrdadh

cc @Mousius @areusch @driazati @gigiblender
---
 docker/Dockerfile.ci_hexagon | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon
index cf7407c2ab05..66b78ae0800c 100644
--- a/docker/Dockerfile.ci_hexagon
+++ b/docker/Dockerfile.ci_hexagon
@@ -83,3 +83,7 @@ RUN bash /install/ubuntu_install_tflite.sh
 # Install ONNX
 COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh
 RUN bash /install/ubuntu_install_onnx.sh
+
+# xgboost (for tuning)
+COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
+RUN bash /install/ubuntu_install_redis.sh

From cc19cdd711b620582baacff82318d3adf5b15115 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Thu, 25 Aug 2022 07:22:37 -0700
Subject: [PATCH 040/704] [microTVM][Zephyr] Add recommended heap size for NRF
 and qemu_x86 (#12585)

This PR sets recommended heap size for qemu_x86 and NRF board to fix memory size with models like VWW using AoT host driven executor.
---
 apps/microtvm/zephyr/template_project/boards.json | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/apps/microtvm/zephyr/template_project/boards.json b/apps/microtvm/zephyr/template_project/boards.json
index dcca9c800224..28cbee54d602 100644
--- a/apps/microtvm/zephyr/template_project/boards.json
+++ b/apps/microtvm/zephyr/template_project/boards.json
@@ -38,7 +38,8 @@
         "is_qemu": false,
         "fpu": true,
         "vid_hex": "1366",
-        "pid_hex": "1055"
+        "pid_hex": "1055",
+        "recommended_heap_size_bytes": 368640
     },
     "nucleo_f746zg": {
         "board": "nucleo_f746zg",
@@ -55,7 +56,7 @@
         "fpu": true,
         "vid_hex": "0483",
         "pid_hex": "374b",
-        "recommended_heap_size_bytes": 512000
+        "recommended_heap_size_bytes": 524288
     },
     "qemu_cortex_r5": {
         "board": "qemu_cortex_r5",
@@ -87,7 +88,8 @@
         "is_qemu": true,
         "fpu": true,
         "vid_hex": "",
-        "pid_hex": ""
+        "pid_hex": "",
+        "recommended_heap_size_bytes": 524288
     },
     "stm32f746g_disco": {
         "board": "stm32f746g_disco",

From 56b7c8ae9676ad2184443b60e0c795672e2b6fc9 Mon Sep 17 00:00:00 2001
From: Florin Blanaru <florin.blanaru96@gmail.com>
Date: Thu, 25 Aug 2022 16:43:06 +0100
Subject: [PATCH 041/704] [CI] Assert some unittests are not skipped in CI
 (#12436)

This PR adds a script that does a diff of skipped tests between the latest successful build on the main and the current branch. Then, it posts a comment with the report on the open PR.

#11670
---
 .github/workflows/tests_bot.yml               |  21 ++
 tests/python/ci/test_ci.py                    | 179 ++++++++++++
 tests/scripts/github_skipped_tests_comment.py | 256 ++++++++++++++++++
 3 files changed, 456 insertions(+)
 create mode 100644 .github/workflows/tests_bot.yml
 create mode 100755 tests/scripts/github_skipped_tests_comment.py

diff --git a/.github/workflows/tests_bot.yml b/.github/workflows/tests_bot.yml
new file mode 100644
index 000000000000..e9d7d81375e4
--- /dev/null
+++ b/.github/workflows/tests_bot.yml
@@ -0,0 +1,21 @@
+
+name: tests-bot
+on:
+  status
+jobs:
+  run-tests-bot:
+    if: ${{ github.repository == 'apache/tvm' && github.event.state == 'success' && github.event.context == 'tvm-ci/pr-head' }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Comment skipped tests
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.CI_RESOURCES_AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.CI_RESOURCES_AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: us-west-2
+          COMMIT_SHA: ${{ github.event.sha }}
+          TARGET_URL: ${{ github.event.target_url }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -eux
+          python tests/scripts/github_skipped_tests_comment.py
\ No newline at end of file
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index 1e2008fdd7ba..c45a0d8d8ee0 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Test various CI scripts and GitHub Actions workflows"""
+import shutil
 import subprocess
 import json
 import textwrap
@@ -33,6 +34,184 @@ def parameterize_named(*values):
     return pytest.mark.parametrize(",".join(keys), [tuple(d.values()) for d in values])
 
 
+# pylint: disable=line-too-long
+TEST_DATA_SKIPPED_BOT = {
+    "found-diff": {
+        "main_xml_file": "unittest/file1.xml",
+        "main_xml_content": """<?xml version="1.0" encoding="utf-8"?>
+                <testsuites>
+                    <testsuite errors="0" failures="0" hostname="13e7c5f749d8" name="python-unittest-gpu-0-shard-1-ctypes" skipped="102"
+                               tests="165" time="79.312" timestamp="2022-08-10T22:39:36.673781">
+                        <testcase classname="ctypes.tests.python.unittest.test_auto_scheduler_search_policy"
+                                  name="test_sketch_search_policy_cuda_rpc_runner" time="9.679">
+                        </testcase>
+                    </testsuite>
+                </testsuites>
+                """,
+        "pr_xml_file": "unittest/file2.xml",
+        "pr_xml_content": """<?xml version="1.0" encoding="utf-8"?>
+                <testsuites>
+                    <testsuite errors="0" failures="0" hostname="13e7c5f749d8" name="python-unittest-gpu-0-shard-1-ctypes" skipped="102"
+                               tests="165" time="79.312" timestamp="2022-08-10T22:39:36.673781">
+                        <testcase classname="ctypes.tests.python.unittest.test_auto_scheduler_search_policy"
+                                  name="test_sketch_search_policy_cuda_rpc_runner" time="9.679">
+                            <skipped message="This test is skipped" type="pytest.skip">
+                                Skipped
+                            </skipped>
+                        </testcase>
+                        <testcase classname="ctypes.tests.python.unittest.test_roofline"
+                                  name="test_estimate_peak_bandwidth[cuda]" time="4.679">
+                            <skipped message="This is another skippe test" type="pytest.skip">
+                                Skipped
+                            </skipped>
+                        </testcase>
+                    </testsuite>
+                </testsuites>
+                """,
+        "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
+        "s3_prefix": "tvm-jenkins-artifacts-prod",
+        "jenkins_prefix": "ci.tlcpack.ai",
+        "common_main_build": """{"build_number": "4115", "state": "success"}""",
+        "commit_sha": "SHA",
+        "expected_url": "issues/11594/comments",
+        "expected_body": """<!---skipped-tests-comment-->\n\nThe list below shows some tests that ran in main SHA but were skipped in the CI build of SHA:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).""",
+    },
+    "no-diff": {
+        "main_xml_file": "unittest/file1.xml",
+        "main_xml_content": """<?xml version="1.0" encoding="utf-8"?>
+                <testsuites>
+                    <testsuite errors="0" failures="0" hostname="13e7c5f749d8" name="python-unittest-gpu-0-shard-1-ctypes" skipped="102"
+                               tests="165" time="79.312" timestamp="2022-08-10T22:39:36.673781">
+                        <testcase classname="ctypes.tests.python.unittest.test_auto_scheduler_search_policy"
+                                  name="test_sketch_search_policy_cuda_rpc_runner" time="9.679">
+                            <skipped message="This test is skipped" type="pytest.skip">
+                                Skipped
+                            </skipped>
+                        </testcase>
+                    </testsuite>
+                </testsuites>
+                """,
+        "pr_xml_file": "unittest/file2.xml",
+        "pr_xml_content": """<?xml version="1.0" encoding="utf-8"?>
+                <testsuites>
+                    <testsuite errors="0" failures="0" hostname="13e7c5f749d8" name="python-unittest-gpu-0-shard-1-ctypes" skipped="102"
+                               tests="165" time="79.312" timestamp="2022-08-10T22:39:36.673781">
+                        <testcase classname="ctypes.tests.python.unittest.test_auto_scheduler_search_policy"
+                                  name="test_sketch_search_policy_cuda_rpc_runner" time="9.679">
+                            <skipped message="This test is skipped" type="pytest.skip">
+                                Skipped
+                            </skipped>
+                        </testcase>
+                    </testsuite>
+                </testsuites>
+                """,
+        "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
+        "s3_prefix": "tvm-jenkins-artifacts-prod",
+        "jenkins_prefix": "ci.tlcpack.ai",
+        "common_main_build": """{"build_number": "4115", "state": "success"}""",
+        "commit_sha": "SHA",
+        "expected_url": "issues/11594/comments",
+        "expected_body": """<!---skipped-tests-comment-->\n\nNo additional skipped tests found in this branch for commit SHA.""",
+    },
+    "unable-to-run": {
+        "main_xml_file": "unittest/file1.xml",
+        "main_xml_content": """<?xml version="1.0" encoding="utf-8"?>
+                    <testsuites>
+                    </testsuites>
+                    """,
+        "pr_xml_file": "unittest/file2.xml",
+        "pr_xml_content": """<?xml version="1.0" encoding="utf-8"?>
+                    <testsuites>
+                    </testsuites>
+                    """,
+        "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
+        "s3_prefix": "tvm-jenkins-artifacts-prod",
+        "jenkins_prefix": "ci.tlcpack.ai",
+        "common_main_build": """{"build_number": "4115", "state": "failed"}""",
+        "commit_sha": "SHA",
+        "expected_url": "issues/11594/comments",
+        "expected_body": """<!---skipped-tests-comment-->\n\nUnable to run tests bot because main failed to pass CI at SHA.""",
+    },
+}
+# pylint: enable=line-too-long
+
+
+@tvm.testing.skip_if_wheel_test
+@pytest.mark.parametrize(
+    [
+        "main_xml_file",
+        "main_xml_content",
+        "pr_xml_file",
+        "pr_xml_content",
+        "target_url",
+        "s3_prefix",
+        "jenkins_prefix",
+        "common_main_build",
+        "commit_sha",
+        "expected_url",
+        "expected_body",
+    ],
+    [tuple(d.values()) for d in TEST_DATA_SKIPPED_BOT.values()],
+    ids=TEST_DATA_SKIPPED_BOT.keys(),
+)
+# pylint: enable=line-too-long
+def test_skipped_tests_comment(
+    tmpdir_factory,
+    main_xml_file,
+    main_xml_content,
+    pr_xml_file,
+    pr_xml_content,
+    target_url,
+    s3_prefix,
+    jenkins_prefix,
+    common_main_build,
+    commit_sha,
+    expected_url,
+    expected_body,
+):
+    """
+    Test that a comment with a link to the docs is successfully left on PRs
+    """
+    skipped_tests_script = REPO_ROOT / "tests" / "scripts" / "github_skipped_tests_comment.py"
+
+    def write_xml_file(root_dir, xml_file, xml_content):
+        shutil.rmtree(root_dir, ignore_errors=True)
+        file = root_dir / xml_file
+        file.parent.mkdir(parents=True)
+        with open(file, "w") as f:
+            f.write(textwrap.dedent(xml_content))
+
+    git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
+    git.run("init")
+    git.run("checkout", "-b", "main")
+    git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
+
+    pr_test_report_dir = Path(git.cwd) / "pr-reports"
+    write_xml_file(pr_test_report_dir, pr_xml_file, pr_xml_content)
+    main_test_report_dir = Path(git.cwd) / "main-reports"
+    write_xml_file(main_test_report_dir, main_xml_file, main_xml_content)
+
+    proc = subprocess.run(
+        [
+            str(skipped_tests_script),
+            "--dry-run",
+            f"--s3-prefix={s3_prefix}",
+            f"--jenkins-prefix={jenkins_prefix}",
+            f"--common-main-build={common_main_build}",
+        ],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        env={"TARGET_URL": target_url, "COMMIT_SHA": commit_sha},
+        encoding="utf-8",
+        cwd=git.cwd,
+        check=False,
+    )
+    if proc.returncode != 0:
+        raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}")
+
+    assert f"Dry run, would have posted {expected_url} with data {expected_body}." in proc.stderr
+
+
 @tvm.testing.skip_if_wheel_test
 @pytest.mark.parametrize(
     "target_url,base_url,commit_sha,expected_url,expected_body",
diff --git a/tests/scripts/github_skipped_tests_comment.py b/tests/scripts/github_skipped_tests_comment.py
new file mode 100755
index 000000000000..ef0630620b97
--- /dev/null
+++ b/tests/scripts/github_skipped_tests_comment.py
@@ -0,0 +1,256 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+import os
+import logging
+import argparse
+import subprocess
+import sys
+from urllib import error
+from xml.etree import ElementTree
+
+import requests
+
+from git_utils import git, GitHubRepo, parse_remote
+from cmd_utils import init_log
+
+SKIPPED_TESTS_COMMENT_MARKER = "<!---skipped-tests-comment-->\n\n"
+GITHUB_ACTIONS_BOT_LOGIN = "github-actions[bot]"
+
+PR_TEST_REPORT_DIR = "pr-reports"
+MAIN_TEST_REPORT_DIR = "main-reports"
+
+
+def run_subprocess(command):
+    logging.info(f"Running command {command}")
+    proc = subprocess.run(command, shell=True, stdout=subprocess.PIPE, encoding="utf-8")
+    if proc.returncode != 0:
+        raise RuntimeError(f"Command failed {command}:\nstdout:\n{proc.stdout}")
+    return proc
+
+
+def retrieve_test_report(s3_url, target_dir):
+    command = f"aws s3 cp {s3_url} {target_dir} --recursive"
+    run_subprocess(command)
+
+
+def get_common_commit_sha():
+    command = "git merge-base origin/main HEAD"
+    proc = run_subprocess(command)
+    return proc.stdout.strip()
+
+
+def get_main_jenkins_build_number(github, common_commit):
+    json = github.get(f"commits/{common_commit}/status")
+    for status in reversed(json["statuses"]):
+        if status["context"] != "tvm-ci/branch":
+            continue
+        state = status["state"]
+        target_url = str(status["target_url"])
+        build_number = (
+            target_url[target_url.find("job/main") : len(target_url)]
+            .strip("job/main/")
+            .strip("/display/redirect")
+        )
+        assert build_number.isdigit()
+        return {"build_number": build_number, "state": state}
+    raise RuntimeError(f"Failed to find main build number for commit {common_commit}")
+
+
+def retrieve_test_reports(common_main_build, pr_number, build_number, s3_prefix):
+    cur_build_s3_link = (
+        f"s3://{s3_prefix}/tvm/PR-{str(pr_number)}/{str(build_number)}/pytest-results"
+    )
+    retrieve_test_report(cur_build_s3_link, PR_TEST_REPORT_DIR)
+
+    common_build_s3_link = f"s3://{s3_prefix}/tvm/main/{common_main_build}/pytest-results"
+    retrieve_test_report(common_build_s3_link, MAIN_TEST_REPORT_DIR)
+
+
+def get_pr_and_build_numbers(target_url):
+    target_url = target_url[target_url.find("PR-") : len(target_url)]
+    split = target_url.split("/")
+    pr_number = split[0].strip("PR-")
+    build_number = split[1]
+    return {"pr_number": pr_number, "build_number": build_number}
+
+
+def build_test_set(directory):
+    subdir_to_skipped = {}
+    subdirs = [
+        item for item in os.listdir(directory) if os.path.isdir(os.path.join(directory, item))
+    ]
+    for subdir in subdirs:
+        subdir_to_skipped[subdir] = set()
+        for root, _, files in os.walk(directory + "/" + subdir):
+            for file in files:
+                test_report = ElementTree.parse(root + "/" + file)
+                for testcase in test_report.iter("testcase"):
+                    skipped = testcase.find("skipped")
+                    if skipped is not None:
+                        key = testcase.attrib["classname"] + "#" + testcase.attrib["name"]
+                        subdir_to_skipped[subdir].add(key)
+    return subdir_to_skipped
+
+
+def to_node_name(dir_name: str):
+    return dir_name.replace("_", ": ", 1)
+
+
+def build_comment(
+    common_commit_sha,
+    common_main_build,
+    skipped_list,
+    pr_number,
+    build_number,
+    commit_sha,
+    jenkins_prefix,
+):
+    if common_main_build["state"] != "success":
+        return f"{SKIPPED_TESTS_COMMENT_MARKER}Unable to run tests bot because main failed to pass CI at {common_commit_sha}."
+
+    if len(skipped_list) == 0:
+        return f"{SKIPPED_TESTS_COMMENT_MARKER}No additional skipped tests found in this branch for commit {commit_sha}."
+
+    text = (
+        f"{SKIPPED_TESTS_COMMENT_MARKER}The list below shows some tests that ran in main {common_commit_sha} but were "
+        f"skipped in the CI build of {commit_sha}:\n"
+        f"```\n"
+    )
+    for skip in skipped_list:
+        text += skip + "\n"
+    text += (
+        f"```\nA detailed report of ran tests is [here](https://{jenkins_prefix}/job/tvm/job/PR-{str(pr_number)}"
+        f"/{str(build_number)}/testReport/)."
+    )
+    return text
+
+
+def get_pr_comments(github, url):
+    try:
+        return github.get(url)
+    except error.HTTPError as e:
+        logging.exception(f"Failed to retrieve PR comments: {url}: {e}")
+        return []
+
+
+def search_for_docs_comment(comments):
+    for comment in comments:
+        if (
+            comment["user"]["login"] == GITHUB_ACTIONS_BOT_LOGIN
+            and SKIPPED_TESTS_COMMENT_MARKER in comment["body"]
+        ):
+            return comment
+    return None
+
+
+if __name__ == "__main__":
+    help = (
+        "Compares the skipped tests of this PR against the last successful build on main. Also comments on the PR "
+        "issue when tests are skipped in this PR and not on main."
+    )
+    parser = argparse.ArgumentParser(description=help)
+    parser.add_argument("--remote", default="origin", help="ssh remote to parse")
+    parser.add_argument("--s3-prefix", default="tvm-jenkins-artifacts-prod")
+    parser.add_argument("--jenkins-prefix", default="ci.tlcpack.ai")
+    parser.add_argument("--common-main-build")
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        default=False,
+        help="run but don't send any request to GitHub",
+    )
+    args = parser.parse_args()
+    init_log()
+
+    remote = git(["config", "--get", f"remote.{args.remote}.url"])
+    user, repo = parse_remote(remote)
+
+    target_url = os.environ["TARGET_URL"]
+    pr_and_build = get_pr_and_build_numbers(target_url)
+
+    commit_sha = os.environ["COMMIT_SHA"]
+
+    if not args.dry_run:
+        github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo)
+        common_commit_sha = get_common_commit_sha()
+        common_main_build = get_main_jenkins_build_number(github, common_commit_sha)
+        retrieve_test_reports(
+            common_main_build=common_main_build["build_number"],
+            pr_number=pr_and_build["pr_number"],
+            build_number=pr_and_build["build_number"],
+            s3_prefix=args.s3_prefix,
+        )
+    else:
+        assert args.common_main_build is not None
+        common_main_build = json.loads(args.common_main_build)
+        common_commit_sha = os.environ["COMMIT_SHA"]
+
+    main_tests = build_test_set(MAIN_TEST_REPORT_DIR)
+    build_tests = build_test_set(PR_TEST_REPORT_DIR)
+
+    skipped_list = []
+    for subdir, skipped_set in build_tests.items():
+        skipped_main = main_tests[subdir]
+        if skipped_main is None:
+            logging.warning(f"Could not find directory {subdir} in main.")
+            continue
+
+        diff_set = skipped_set - skipped_main
+        if len(diff_set) != 0:
+            for test in diff_set:
+                skipped_list.append(f"{to_node_name(subdir)} -> {test}")
+
+    # Sort the list to maintain an order in the output. Helps when validating the output in tests.
+    skipped_list.sort()
+
+    if len(skipped_list) == 0:
+        logging.info("No skipped tests found.")
+
+    body = build_comment(
+        common_commit_sha,
+        common_main_build,
+        skipped_list,
+        pr_and_build["pr_number"],
+        pr_and_build["build_number"],
+        commit_sha,
+        args.jenkins_prefix,
+    )
+    url = f'issues/{pr_and_build["pr_number"]}/comments'
+    if not args.dry_run:
+        # For now, only comment for PRs open by driazati, gigiblender and areusch.
+        get_pr_url = f'pulls/{pr_and_build["pr_number"]}'
+        pull_request_body = github.get(get_pr_url)
+        author = pull_request_body["user"]["login"]
+        if author not in ["driazati", "gigiblender", "areusch"]:
+            logging.info(f"Skipping this action for user {author}")
+            sys.exit(0)
+
+        pr_comments = get_pr_comments(github, url)
+        comment = search_for_docs_comment(pr_comments)
+
+        if comment is not None:
+            comment_url = comment["url"]
+            comment_id = comment_url[comment_url.find("comments/") : len(comment_url)].strip(
+                "comments/"
+            )
+            github.patch(f"issues/comments/{comment_id}", {"body": body})
+        else:
+            github.post(url, {"body": body})
+    else:
+        logging.info(f"Dry run, would have posted {url} with data {body}.")

From 61c034ae27712d5cab4720b3f259df68cf004ac2 Mon Sep 17 00:00:00 2001
From: Huan Mei <352648791@qq.com>
Date: Thu, 25 Aug 2022 23:44:50 +0800
Subject: [PATCH 042/704] [DOC] fix code-block error in debuggging TVM part
 (#12597)

The code block in part Debuggging TVM is not showing up.

Just fix it.
---
 docs/dev/how_to/debugging_tvm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/dev/how_to/debugging_tvm.rst b/docs/dev/how_to/debugging_tvm.rst
index 6060f797b3e4..0ad44fdd17ce 100644
--- a/docs/dev/how_to/debugging_tvm.rst
+++ b/docs/dev/how_to/debugging_tvm.rst
@@ -60,7 +60,7 @@ optimization). To enable VLOGging, do the following:
 
 Examples:
 
-.. code-block: shell
+.. code-block:: shell
 
    # enable VLOG(0), VLOG(1), VLOG(2) in all files.
    $ TVM_LOG_DEBUG=DEFAULT=2 python3 -c 'import tvm'

From b547106fdeb634d2fc692d8a516899c4abe6edbc Mon Sep 17 00:00:00 2001
From: Lite Ye <liteye859@gmail.com>
Date: Thu, 25 Aug 2022 11:45:43 -0400
Subject: [PATCH 043/704] [CI] github_cc_reviewers: Catch all exceptions so all
 reviewers can be processed (#12578)

In a recent change, `github.post` throws `RuntimeError` instead of `HTTPError` when the requested reviewer isn't a project collaborator. This prevents other reviewers to be added to the PR, for example, https://github.com/apache/tvm/runs/8001367110?check_suite_focus=true.

This PR changes the caller to catch any exception so the execution won't be interrupted.

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>
---
 tests/scripts/github_cc_reviewers.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/scripts/github_cc_reviewers.py b/tests/scripts/github_cc_reviewers.py
index bfc0077b6691..d8323221a7b0 100755
--- a/tests/scripts/github_cc_reviewers.py
+++ b/tests/scripts/github_cc_reviewers.py
@@ -16,6 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import sys
 import os
 import json
 import argparse
@@ -106,5 +107,8 @@ def find_reviewers(body: str) -> List[str]:
         for reviewer in to_add:
             try:
                 github.post(f"pulls/{number}/requested_reviewers", {"reviewers": [reviewer]})
-            except error.HTTPError as e:
+            except KeyboardInterrupt:
+                sys.exit()
+            except (RuntimeError, error.HTTPError) as e:
+                # Catch any exception so other reviewers can be processed
                 print(f"Failed to add reviewer {reviewer}: {e}")

From 399f2e9b7006c95a2ebf0b3d35cdbacb340dd68d Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 25 Aug 2022 16:48:40 +0100
Subject: [PATCH 044/704] [microNPU] Remove xfail from tests relating to #12511
 (#12570)

Removes tests previously marked as xfail since the issue has now
been resolved.
---
 tests/python/contrib/test_ethosu/test_codegen.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py
index ae7d0821bb7f..28ea48f00932 100644
--- a/tests/python/contrib/test_ethosu/test_codegen.py
+++ b/tests/python/contrib/test_ethosu/test_codegen.py
@@ -347,7 +347,6 @@ def binary_elementwise(lhs, rhs):
         ([1, 4, 4], [4, 1]),
     ],
 )
-@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511")
 def test_binary_add_with_non_4d_shapes(
     request,
     accel_type,
@@ -606,7 +605,6 @@ def rounding_right_shift(lhs, rhs):
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
 @pytest.mark.parametrize("ifm_shape", [(3, 2), (1, 15, 11, 7), (3, 1, 12), (400,)])
 @pytest.mark.parametrize("ifm_scale, ifm_zp, ofm_scale, ofm_zp", [(1, 0, 1, 0), (0.015, 3, 0.2, 5)])
-@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511")
 def test_ethosu_identity_codegen(
     request, ifm_shape, ifm_scale, ifm_zp, ofm_scale, ofm_zp, accel_type
 ):
@@ -655,7 +653,6 @@ def generate_output_data(input_data):
         ((8, 7, 3), (-4, 1, 8, -2)),
     ],
 )
-@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511")
 def test_relay_reshape_codegen(ifm_shape, new_shape, accel_type):
     np.random.seed(0)
 
@@ -688,7 +685,6 @@ def create_model():
         ([5000], [123], [2151]),
     ],
 )
-@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511")
 def test_tflite_slice(request, accel_type, ifm_shape, begin, size):
     np.random.seed(0)
 
@@ -724,7 +720,6 @@ def strided_slice_func(x):
     "ifm_shape",
     [[1, 5, 12, 4], [1, 1, 2], [4, 3, 2], [10, 20], [345]],
 )
-@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511")
 def test_ethosu_unary_elementwise(
     request,
     accel_type,

From f7c143608f9bb45dce8e3f93c3a89275a7c104f6 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 25 Aug 2022 17:17:22 +0100
Subject: [PATCH 045/704] [ETHOSN] Support conversion of add to depthwise
 (#12531)

In similar fashion to the conversion of mul to depthwise, this commit
converts add when one input is a constant of shape [1, ..., n] to a
depthwise convolution. If neither input is a constant, the add is
offloaded naturally like before.

The addition testing has been improved to use pytest features.
---
 python/tvm/relay/op/contrib/ethosn.py         |  43 +++-
 src/relay/backend/contrib/ethosn/codegen.cc   |   8 +-
 .../contrib/ethosn/convert_equivalent.cc      | 109 ++++++++-
 .../contrib/test_ethosn/infrastructure.py     |   3 +-
 .../contrib/test_ethosn/test_addition.py      | 214 +++++++++++++-----
 .../test_ethosn/test_convert_equivalents.py   |  99 +++++---
 .../contrib/test_ethosn/test_networks.py      |   2 +-
 7 files changed, 377 insertions(+), 101 deletions(-)

diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index 73dd6b735775..83972bd08b41 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -215,6 +215,24 @@ def qnn_mul_pattern():
         input_is_right = gen_mul_inputs(is_constant(), wildcard())
         return input_is_left | input_is_right
 
+    def qnn_add_pattern():
+        add_op = is_op("qnn.add")
+        gen_add_inputs = lambda x, y: add_op(
+            x,
+            y,
+            is_constant(),
+            is_constant(),
+            is_constant(),
+            is_constant(),
+            is_constant(),
+            is_constant(),
+        )
+        two_inputs = gen_add_inputs(wildcard(), wildcard())
+        input_is_left = gen_add_inputs(wildcard(), is_constant())
+        input_is_right = gen_add_inputs(is_constant(), wildcard())
+
+        return input_is_left | input_is_right | two_inputs
+
     def check_conv2d(extract):
         """Check if a conv2d is supported by Ethos-N."""
         if not ethosn_available():
@@ -289,8 +307,24 @@ def check_resize(extract):
 
         return _ethosn.resize(extract)
 
+    def check_add(extract):
+        """Check if an addition is supported by Ethos-N."""
+        if not ethosn_available():
+            return False
+        # Do not support scalar constants for now
+        check_scalar = lambda i: isinstance(i, tvm.relay.Constant) and len(i.data.shape) == 0
+        if check_scalar(extract.args[0]) or check_scalar(extract.args[1]):
+            return False
+
+        inputs = extract.args[0:2]
+        if any([isinstance(i, tvm.relay.Constant) for i in inputs]):
+            extract = _ethosn.ConvertQnnAdd(extract)
+            return _ethosn.conv2d(extract)
+        return _ethosn.addition(extract)
+
     return [
         ("ethos-n.qnn_mul", qnn_mul_pattern(), check_mul),
+        ("ethos-n.qnn_add", qnn_add_pattern(), check_add),
         ("ethos-n.qnn_conv2d", qnn_conv_pattern(), check_conv2d),
         ("ethos-n.qnn_avg_pool2d", qnn_avg_pool2d_pattern(), check_avg_pool2d),
         ("ethos-n.qnn_sigmoid", qnn_sigmoid_pattern(), check_sigmoid),
@@ -332,15 +366,6 @@ def reshape(expr):
     return _ethosn.reshape(expr)
 
 
-@tvm.ir.register_op_attr("qnn.add", "target.ethos-n")
-def qnn_add(expr):
-    """Check if an addition is supported by Ethos-N."""
-    if not ethosn_available():
-        return False
-
-    return _ethosn.addition(expr)
-
-
 @tvm.ir.register_op_attr("qnn.concatenate", "target.ethos-n")
 def qnn_concatenate(expr):
     """Check if a concatenate is supported by Ethos-N."""
diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index bc4613b80155..69672a143585 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -104,9 +104,9 @@ void InferTensorsVisitor::InferCall(const CallNode* cn) {
     params.input_info = GetTensorInfo(tensor_table_, call);
     err += EthosnAPI::Reshape(call, &params);
     tensor_table_[cn->args[0]] = {params.input_info};
-  } else if (IsEthosnOp(call, "qnn.add")) {
+  } else if (IsEthosnFunc(call, "ethos-n.qnn_add")) {
     AdditionParams params;
-    err += EthosnAPI::Addition(call, &params);
+    err += EthosnAPI::Addition(cn->op.as<FunctionNode>()->body, &params);
     tensor_table_[cn->args[0]] = {params.lhs_info};
     tensor_table_[cn->args[1]] = {params.rhs_info};
   } else if (IsEthosnFunc(call, "ethos-n.qnn_sigmoid")) {
@@ -296,7 +296,7 @@ sl::TensorsAndId ConstructNetworkVisitor::HandleCall(const CallNode* cn) {
   } else if (IsEthosnOp(call, "reshape")) {
     if ((err = MakeReshapeLayer(call, &tensor))) ReportFatalError(call, err);
     return MakeOps(tensor);
-  } else if (IsEthosnOp(call, "qnn.add")) {
+  } else if (IsEthosnFunc(call, "ethos-n.qnn_add")) {
     if ((err = MakeAdditionLayer(call, &tensor))) ReportFatalError(call, err);
     return MakeOps(tensor);
   } else if (IsEthosnFunc(call, "ethos-n.qnn_sigmoid")) {
@@ -468,7 +468,7 @@ EthosnError ConstructNetworkVisitor::MakeReshapeLayer(const Call& call,
 EthosnError ConstructNetworkVisitor::MakeAdditionLayer(const Call& call,
                                                        sl::TensorAndId<sl::Operand>* out) {
   AdditionParams params;
-  if (auto err = EthosnAPI::Addition(call, &params)) {
+  if (auto err = EthosnAPI::Addition(call->op.as<FunctionNode>()->body, &params)) {
     return err;
   }
 
diff --git a/src/relay/backend/contrib/ethosn/convert_equivalent.cc b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
index 6b64467047f4..12b5a12afb35 100644
--- a/src/relay/backend/contrib/ethosn/convert_equivalent.cc
+++ b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
@@ -38,6 +38,20 @@ namespace relay {
 namespace contrib {
 namespace ethosn {
 
+/*!
+ * \brief Apply constant folding on an expression.
+ *
+ * \param expr The expression to fold.
+ * \param fold_qnn Whether to fold constants for QNN operations.
+ * \returns The new folded expression.
+ */
+Expr FoldConstantExpr(const Expr& expr, bool fold_qnn = true) {
+  auto mod = IRModule::FromExpr(expr);
+  mod = transform::FoldConstant(fold_qnn)(mod);
+  auto entry_func = Downcast<Function>(mod->Lookup("main"));
+  return expr.as<FunctionNode>() == nullptr ? entry_func->body : entry_func;
+}
+
 /*!
  * \brief Converts qnn.mul to mathematically equivalent
  * qnn.conv2d depthwise operation.
@@ -65,7 +79,9 @@ Expr ConvertQnnMultiply(const Expr& expr) {
 
   const auto* input_constant = input2.as<ConstantNode>();
   ICHECK(input_constant) << "Expected ConstantNode but got " << input2->GetTypeKey();
-  const auto* input_constant_tt = input_constant->checked_type().as<TensorTypeNode>();
+  Type input_constant_type = input_constant->checked_type();
+  const auto* input_constant_tt = input_constant_type.as<TensorTypeNode>();
+  ICHECK(input_constant) << "Expected TensorTypeNode but got " << input_constant_type->GetTypeKey();
   int channels = input_constant_tt->shape.back().as<IntImmNode>()->value;
 
   runtime::NDArray input_data = input_constant->data;
@@ -93,6 +109,83 @@ Expr ConvertQnnMultiply(const Expr& expr) {
 TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnMultiply")
     .set_body_typed(ConvertQnnMultiply);
 
+/*!
+ * \brief Converts qnn.add to a mathematically equivalent
+ * qnn.conv2d depthwise operation.
+ */
+Expr ConvertQnnAdd(const Expr& expr) {
+  Call call = Downcast<Call>(expr);
+
+  Expr input1 = call->args[0];
+  Expr input2 = call->args[1];
+  Expr input1_scale = call->args[2];
+  Expr input1_zero_point = call->args[3];
+  Expr input2_scale = call->args[4];
+  Expr input2_zero_point = call->args[5];
+  // Reverse the inputs if the constant is first input
+  if (call->args[0]->IsInstance<ConstantNode>()) {
+    input1 = call->args[1];
+    input2 = call->args[0];
+    input1_scale = call->args[4];
+    input1_zero_point = call->args[5];
+    input2_scale = call->args[2];
+    input2_zero_point = call->args[3];
+  }
+  Expr output_scale = call->args[6];
+  Expr output_zero_point = call->args[7];
+
+  const auto* input_constant = input2.as<ConstantNode>();
+  ICHECK(input_constant) << "Expected ConstantNode but got " << input2->GetTypeKey();
+  Type input_constant_type = input_constant->checked_type();
+  const auto* input_constant_tt = input_constant_type.as<TensorTypeNode>();
+  ICHECK(input_constant) << "Expected TensorTypeNode but got " << input_constant_type->GetTypeKey();
+  int channels = input_constant_tt->shape.back().as<IntImmNode>()->value;
+
+  // Create the identity kernel. The kernel data is constructed such that it produces an identity
+  // operation in the quantized space. Therefore, the input is not scaled in any way which allows
+  // us to later use the bias to perform the addition.
+  float input_scale_value = GetScalarFromConstant<float>(input1_scale);
+  float output_scale_value = GetScalarFromConstant<float>(output_scale);
+  float identity_kernel_scale_ub = std::min(output_scale_value / input_scale_value, 1.f);
+  float identity_kernel_scale_lb = (1.f / 255.f);
+  float identity_kernel_scale_target = (identity_kernel_scale_ub + identity_kernel_scale_lb) / 2.f;
+  float identity_kernel_scale_recip_rounded = std::round(1.f / identity_kernel_scale_target);
+  float identity_kernel_scale_value = 1.f / identity_kernel_scale_recip_rounded;
+  Constant identity_kernel_scale =
+      MakeConstantScalar(DataType::Float(32), identity_kernel_scale_value);
+  Constant identity_kernel_zero_point = MakeConstantScalar(DataType::Int(32), 0);
+  float identity_kernel_quantized_data = identity_kernel_scale_recip_rounded;
+  std::vector<uint8_t> identity_kernel_data(channels,
+                                            static_cast<uint8_t>(identity_kernel_quantized_data));
+  Constant identity_kernel =
+      MakeConstantTensor(input_constant_tt->dtype, {1, 1, channels, 1}, identity_kernel_data);
+
+  // Calculate the bias, this is where the addition happens. The bias values are calculated by
+  // scaling the constant input to input_scale * identity_kernel_scale.
+  Constant bias_scale =
+      MakeConstantScalar(DataType::Float(32), input_scale_value * identity_kernel_scale_value);
+  Constant bias_zero_point = MakeConstantScalar(DataType::Int(32), 0);
+  Expr requantize_bias =
+      qnn::MakeRequantize(input2, input2_scale, input2_zero_point, bias_scale, bias_zero_point, -1,
+                          "None", "None", DataType::Int(32));
+  Expr reshape_bias = MakeReshape(requantize_bias, {channels});
+  Constant bias = Downcast<Constant>(FoldConstantExpr(reshape_bias));
+
+  // Make depthwise conv2d operation
+  Expr conv2d =
+      qnn::MakeQnnConv2D(input1, identity_kernel, input1_zero_point, identity_kernel_zero_point,
+                         input1_scale, identity_kernel_scale, {1, 1}, {0, 0, 0, 0}, {1, 1},
+                         channels, channels, {1, 1}, "NHWC", "HWOI", "NHWC", DataType::Int(32));
+  Expr bias_add = MakeBiasAdd(conv2d, bias, 3);
+  Expr requantize =
+      qnn::MakeRequantize(bias_add, input1_scale, input1_zero_point, output_scale,
+                          output_zero_point, -1, "None", "None", input_constant_tt->dtype);
+
+  return InferType(requantize);
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnAdd").set_body_typed(ConvertQnnAdd);
+
 class ConvertEquivalentsMutator : public MixedModeMutator {
  public:
   Expr Rewrite_(const CallNode* pre, const Expr& post) override {
@@ -108,11 +201,25 @@ class ConvertEquivalentsMutator : public MixedModeMutator {
       Expr new_func_body = ConvertQnnMultiply(func->body);
       new_func = WithFields(func, func->params, new_func_body);
       new_func = WithAttr(std::move(new_func), attr::kComposite, String("ethos-n.qnn_conv2d"));
+    } else if (composite_name == "ethos-n.qnn_add" && CheckCanConvertAdd(func->body)) {
+      Expr new_func_body = ConvertQnnAdd(func->body);
+      new_func = WithFields(func, func->params, new_func_body);
+      new_func = WithAttr(std::move(new_func), attr::kComposite, String("ethos-n.qnn_conv2d"));
     }
 
     Call new_call = WithFields(call, new_func);
     return Downcast<Expr>(new_call);
   }
+
+ private:
+  /*!
+   * \brief Check whether add can be converted to depthwise, or whether
+   * it should be offloaded as a normal add operation.
+   */
+  bool CheckCanConvertAdd(const Expr& expr) {
+    Call call = Downcast<Call>(expr);
+    return call->args[0]->IsInstance<ConstantNode>() || call->args[1]->IsInstance<ConstantNode>();
+  }
 };
 
 tvm::transform::Pass ConvertEquivalents() {
diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
index c227ef5c3aea..a1c8ca0a32d2 100644
--- a/tests/python/contrib/test_ethosn/infrastructure.py
+++ b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -83,7 +83,8 @@ def make_module(func, params):
 
 def make_ethosn_composite(ethosn_expr, name):
     vars = relay.analysis.free_vars(ethosn_expr)
-    func = relay.Function([relay.Var("a")], ethosn_expr)
+    inner_vars = [relay.Var(v.name_hint, v.type_annotation) for v in vars]
+    func = relay.Function(inner_vars, ethosn_expr)
     func = func.with_attr("Composite", name)
     call = relay.Call(func, vars)
     return call
diff --git a/tests/python/contrib/test_ethosn/test_addition.py b/tests/python/contrib/test_ethosn/test_addition.py
index cc8e030d372d..72981182e17f 100644
--- a/tests/python/contrib/test_ethosn/test_addition.py
+++ b/tests/python/contrib/test_ethosn/test_addition.py
@@ -25,11 +25,37 @@
 from . import infrastructure as tei
 
 
-def _get_model(input_shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype):
+def _get_model(
+    lhs_shape,
+    rhs_shape,
+    lhs_zp,
+    lhs_sc,
+    rhs_zp,
+    rhs_sc,
+    out_zp,
+    out_sc,
+    dtype,
+    lhs_is_constant=False,
+    rhs_is_constant=False,
+):
     """Return a model and any parameters it may have"""
 
-    a = relay.var("a", shape=input_shape, dtype=dtype)
-    b = relay.var("b", shape=input_shape, dtype=dtype)
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+
+    if lhs_is_constant:
+        a_data = np.random.randint(data_min, data_max + 1, size=lhs_shape, dtype=dtype)
+        a = relay.const(a_data, dtype=dtype)
+    else:
+        a = relay.var("a", shape=lhs_shape, dtype=dtype)
+
+    if rhs_is_constant:
+        b_data = np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype)
+        b = relay.const(b_data, dtype=dtype)
+    else:
+        b = relay.var("b", shape=rhs_shape, dtype=dtype)
+
     model = relay.qnn.op.add(
         lhs=a,
         rhs=b,
@@ -43,74 +69,156 @@ def _get_model(input_shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtyp
     return model
 
 
-def _get_addition_qnn_params(dtype, input1_zp, input1_sc, input2_zp, input2_sc):
-    input1_max = input1_sc * (255 - input1_zp)
-    input1_min = -input1_sc * input1_zp
-    input2_max = input2_sc * (255 - input2_zp)
-    input2_min = -input2_sc * input2_zp
+def _get_addition_qnn_params(dtype):
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    lhs_zp = np.random.randint(data_min, data_max)
+    lhs_sc = np.random.random() * 2
+    rhs_zp = np.random.randint(data_min, data_max)
+    rhs_sc = np.random.random() * 2
+
+    input1_max = lhs_sc * (255 - lhs_zp)
+    input1_min = -lhs_sc * lhs_zp
+    input2_max = rhs_sc * (255 - rhs_zp)
+    input2_min = -rhs_sc * rhs_zp
     output_max = input1_max + input2_max
     output_min = input1_min + input2_min
     output_sc = (output_max - output_min) / 255
     output_zp = -int(output_min / output_sc)
-    return output_zp, output_sc
+    return lhs_zp, lhs_sc, rhs_zp, rhs_sc, output_zp, output_sc
+
+
+@requires_ethosn
+@pytest.mark.parametrize("dtype", ["uint8", "int8"])
+@pytest.mark.parametrize("shape", [(1, 22, 9, 9), (1, 27, 21, 16)])
+def test_addition(dtype, shape):
+    """Compare Addition output with TVM."""
+    np.random.seed(0)
+
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype)
+
+    outputs = []
+    inputs = {
+        "a": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=shape, dtype=dtype)),
+        "b": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=shape, dtype=dtype)),
+    }
+    model = _get_model(shape, shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype)
+    for npu in [False, True]:
+        mod = tei.make_module(model, [])
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+
+    tei.verify(outputs, dtype, 1)
+
+
+@requires_ethosn
+@pytest.mark.parametrize("dtype", ["uint8", "int8"])
+@pytest.mark.parametrize(
+    "lhs_shape,rhs_shape",
+    [
+        ((1, 4, 4, 8), (1, 1, 1, 8)),
+        ((1, 16, 12, 4), (4,)),
+    ],
+)
+def test_addition_to_depthwise_rhs_constant(dtype, lhs_shape, rhs_shape):
+    """Compare addition to depthwise with TVM."""
+    np.random.seed(0)
+
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype)
+
+    model = _get_model(
+        lhs_shape,
+        rhs_shape,
+        lhs_zp,
+        lhs_sc,
+        rhs_zp,
+        rhs_sc,
+        out_zp,
+        out_sc,
+        dtype,
+        lhs_is_constant=False,
+        rhs_is_constant=True,
+    )
+    inputs = {
+        "a": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=lhs_shape, dtype=dtype))
+    }
+    outputs = []
+    for npu in [False, True]:
+        mod = tei.make_module(model, {})
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+    tei.verify(outputs, dtype, 1)
 
 
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_addition(dtype):
-    zp_min = np.iinfo(dtype).min
-    zp_max = np.iinfo(dtype).max
-    trials = [
-        ((1, 22, 9, 9), zp_min + 24, 1.057, zp_max - 3, 0.452),
-        ((1, 27, 21, 16), zp_min + 79, 0.850, 24, 0.380),
-        ((1, 7, 12, 28), zp_min + 125, 1.293, zp_max - 16, 0.320),
-        ((1, 14, 9, 6), zp_min + 14, 0.942, zp_max - 28, 1.562),
-        ((1, 13, 16, 22), zp_min + 15, 0.727, zp_max - 75, 0.461),
-    ]
+@pytest.mark.parametrize(
+    "lhs_shape,rhs_shape",
+    [
+        ((1, 8), (1, 20, 15, 8)),
+    ],
+)
+def test_addition_to_depthwise_lhs_constant(dtype, lhs_shape, rhs_shape):
+    """Compare addition to depthwise with TVM."""
     np.random.seed(0)
-    for shape, rhs_zp, rhs_sc, lhs_zp, lhs_sc in trials:
-        outputs = []
-        inputs = {
-            "a": tvm.nd.array(np.random.randint(zp_min, zp_max + 1, size=shape, dtype=dtype)),
-            "b": tvm.nd.array(np.random.randint(zp_min, zp_max + 1, size=shape, dtype=dtype)),
-        }
-        out_zp, out_sc = _get_addition_qnn_params(dtype, lhs_zp, lhs_sc, rhs_zp, rhs_sc)
-        model = _get_model(shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype)
-        for npu in [False, True]:
-            mod = tei.make_module(model, [])
-            outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
 
-        tei.verify(outputs, dtype, 2)
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype)
+
+    model = _get_model(
+        lhs_shape,
+        rhs_shape,
+        lhs_zp,
+        lhs_sc,
+        rhs_zp,
+        rhs_sc,
+        out_zp,
+        out_sc,
+        dtype,
+        lhs_is_constant=True,
+        rhs_is_constant=False,
+    )
+    inputs = {
+        "b": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype))
+    }
+    outputs = []
+    for npu in [False, True]:
+        mod = tei.make_module(model, {})
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+    tei.verify(outputs, dtype, 1)
 
 
 @requires_ethosn
-def test_addition_failure():
-    trials = [
+@pytest.mark.parametrize(
+    "dtype,shape,err_msg",
+    [
         (
-            (2, 4, 4, 4),
             "uint8",
-            0,
-            1,
-            0,
-            1,
-            0,
-            1,
+            (2, 4, 4, 4),
             "batch size=2, batch size must = 1; batch size=2, batch size must = 1",
         ),
         (
-            (1, 4, 4, 4),
             "int16",
-            0,
-            1,
-            0,
-            1,
-            0,
-            1,
-            "dtype='int16', dtype must be either uint8, int8 or int32; dtype='int16', dtype must be either uint8, int8 or int32",
+            (1, 4, 4, 4),
+            "dtype='int16', dtype must be either uint8, int8 or int32; dtype='int16', "
+            "dtype must be either uint8, int8 or int32",
         ),
-    ]
+    ],
+)
+def test_addition_failure(dtype, shape, err_msg):
+    """Check addition error messages."""
+    np.random.seed(0)
+
+    lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype)
 
-    for shape, dtype, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, err_msg in trials:
-        model = _get_model(shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype)
-        mod = tei.make_ethosn_partition(model)
-        tei.test_error(mod, {}, err_msg)
+    model = _get_model(shape, shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype)
+    model = tei.make_ethosn_composite(model, "ethos-n.qnn_add")
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)
diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
index 570009422067..fe9b346691b6 100644
--- a/tests/python/contrib/test_ethosn/test_convert_equivalents.py
+++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
@@ -24,8 +24,10 @@
 from tvm import relay
 from tvm.testing import requires_ethosn
 from tvm.relay.op.contrib.ethosn import ConvertEquivalents
+from tvm.relay import ExprVisitor
 
 from . import infrastructure as tei
+from .test_addition import _get_addition_qnn_params
 
 
 def _assert_structural_equal(a, b):
@@ -38,35 +40,6 @@ def _assert_structural_equal(a, b):
     assert tvm.ir.structural_equal(a, b), reason
 
 
-def _create_npu_module(inputs, expr, composite_name, ext_func_name):
-    """Wraps an operator as an NPU module."""
-    gen_vars = lambda prefix, vars: [
-        relay.var(
-            prefix + var.name_hint, shape=var.type_annotation.shape, dtype=var.type_annotation.dtype
-        )
-        for var in vars
-    ]
-
-    mod = tvm.ir.IRModule()
-
-    func = relay.Function(relay.analysis.free_vars(expr), expr)
-    func = func.with_attr("Composite", composite_name)
-    inner_vars = gen_vars("inner_", inputs)
-    call = relay.Call(func, inner_vars)
-
-    func2 = relay.Function(relay.analysis.free_vars(call), call)
-    func2 = func2.with_attr("Compiler", "ethos-n")
-    func2 = func2.with_attr("global_symbol", ext_func_name)
-    mod[ext_func_name] = func2
-    mod = relay.transform.InferType()(mod)
-
-    outer_vars = gen_vars("outer_", inputs)
-    out = relay.Call(mod.get_global_var(ext_func_name), outer_vars)
-    mod["main"] = relay.Function(relay.analysis.free_vars(out), out)
-    mod = relay.transform.InferType()(mod)
-    return mod
-
-
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 @pytest.mark.parametrize("shape,channels", [((1, 4, 4, 8), 8), ((1, 16, 12, 4), 4)])
@@ -101,7 +74,8 @@ def before():
             relay.const(output_sc, "float32"),
             relay.const(output_zp, "int32"),
         )
-        return _create_npu_module([x], expr, "ethos-n.qnn_mul", "ext_func")
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_mul")
+        return tei.make_ethosn_partition(composite)
 
     def expected():
         constant_shape_hwoi = (1, 1, channels, 1)
@@ -134,9 +108,70 @@ def expected():
             relay.const(output_zp, "int32"),
             out_dtype=dtype,
         )
-        return _create_npu_module([x], expr, "ethos-n.qnn_conv2d", "ext_func")
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_conv2d")
+        return tei.make_ethosn_partition(composite)
 
     mod = before()
     mod = ConvertEquivalents()(mod)
     expected_mod = expected()
-    _assert_structural_equal(mod["ext_func"], expected_mod["ext_func"])
+    _assert_structural_equal(mod["ethos-n_0"], expected_mod["ethos-n_0"])
+
+
+@requires_ethosn
+@pytest.mark.parametrize("reverse_inputs", [True, False])
+def test_add_to_depthwise(reverse_inputs):
+    """
+    Check that add is converted correctly.
+    """
+    dtype = "uint8"
+    lhs_shape = (1, 2, 4, 8)
+    rhs_shape = (1, 1, 1, 8)
+    np.random.seed(0)
+
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype)
+
+    x = relay.var("x", shape=lhs_shape, dtype=dtype)
+    y_data = np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype)
+
+    def before():
+        y = relay.const(y_data)
+        expr = relay.qnn.op.add(
+            lhs=y if reverse_inputs else x,
+            rhs=x if reverse_inputs else y,
+            lhs_scale=relay.const(lhs_sc, "float32"),
+            lhs_zero_point=relay.const(lhs_zp, "int32"),
+            rhs_scale=relay.const(rhs_sc, "float32"),
+            rhs_zero_point=relay.const(rhs_zp, "int32"),
+            output_scale=relay.const(out_sc, "float32"),
+            output_zero_point=relay.const(out_zp, "int32"),
+        )
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_add")
+        return tei.make_ethosn_partition(composite)
+
+    class ConversionChecker(ExprVisitor):
+        """
+        Pass to check the new composite function is in the expected format.
+        """
+
+        sequence = ["qnn.conv2d", "nn.bias_add", "qnn.requantize"]
+
+        def visit_function(self, fn):
+            composite_name = fn.attrs["Composite"]
+            expected = "ethos-n.qnn_conv2d"
+            assert (
+                composite_name == expected
+            ), f"Expected Composite attribute {expected} but got {composite_name}"
+            super().visit_function(fn)
+
+        def visit_call(self, call):
+            op_name = call.op.name
+            expected_name = self.sequence.pop()
+            assert op_name == expected_name, f"Got operator {op_name} but expected {expected_name}"
+            super().visit_call(call)
+
+    mod = before()
+    mod = ConvertEquivalents()(mod)
+    mod = ConversionChecker().visit(mod["ethos-n_0"].body.op)
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index abc4d37a7359..d16bf5bf325c 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -143,7 +143,7 @@ def test_resnet_50_int8():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    _compile_hash = {"60404ad60fc2bfbb68464d8a14cc0452", "4225fa951c145bb1e48e28cad6a3bdd4"}
+    _compile_hash = {"9245965b2c01e7f3d9b478e38a186eb4", "4225fa951c145bb1e48e28cad6a3bdd4"}
     _test_image_network(
         model_url="https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/"
         "models/Quantized/resnet_50_quantized.tflite",

From 21db1eb586f14b272b36f7e33830acc630823b5f Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Thu, 25 Aug 2022 10:23:46 -0600
Subject: [PATCH 046/704] [F2QI] Fix a rounding error on AvgPool when input and
 output affine scales differ (#12577)

cc @sfvaroglu @AndrewZhaoLuo
---
 .../transform/fake_quantization_to_integer.py | 64 ++++++++++++++++---
 .../test_pass_fake_quantization_to_integer.py | 15 ++---
 2 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py
index 58dcc3477f6a..bb874c131cd8 100644
--- a/python/tvm/relay/transform/fake_quantization_to_integer.py
+++ b/python/tvm/relay/transform/fake_quantization_to_integer.py
@@ -114,11 +114,26 @@ def adaptive_avgpool1d(expr, type_map):
     """Rewrite an adaptive avgpool op"""
     arg = expr.args[0]
     t = type_map[arg]
-    arg = relay.op.cast(arg, "int32")
+    out_t = type_map[expr]
+    if not (
+        approx_equal(t.scale, out_t.scale)
+        and approx_equal(t.zero_point, out_t.zero_point)
+        and tvm.ir.structural_equal(t.dtype, out_t.dtype)
+    ):
+        arg = relay.qnn.op.requantize(
+            arg,
+            t.scale,
+            t.zero_point,
+            out_t.scale,
+            out_t.zero_point,
+            out_dtype="int32",
+            axis=t.axis,
+        )
+    else:
+        arg = relay.op.cast(arg, "int32")
     output_size = expr.attrs.output_size
     out = relay.op.nn.adaptive_avg_pool1d(arg, output_size)
-    out = relay.op.cast(out, t.dtype)
-    return [out, t]
+    return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)]
 
 
 @register_fake_quantization_to_integer("nn.avg_pool2d")
@@ -126,10 +141,25 @@ def avgpool2d(expr, type_map):
     """Rewrite a avgpool op"""
     arg = expr.args[0]
     t = type_map[arg]
-    arg = relay.op.cast(arg, "int32")
+    out_t = type_map[expr]
+    if not (
+        approx_equal(t.scale, out_t.scale)
+        and approx_equal(t.zero_point, out_t.zero_point)
+        and tvm.ir.structural_equal(t.dtype, out_t.dtype)
+    ):
+        arg = relay.qnn.op.requantize(
+            arg,
+            t.scale,
+            t.zero_point,
+            out_t.scale,
+            out_t.zero_point,
+            out_dtype="int32",
+            axis=t.axis,
+        )
+    else:
+        arg = relay.op.cast(arg, "int32")
     out = relay.op.nn.avg_pool2d(arg, **expr.attrs)
-    out = relay.op.cast(out, t.dtype)
-    return [out, t]
+    return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)]
 
 
 @register_fake_quantization_to_integer("nn.global_avg_pool2d")
@@ -137,10 +167,26 @@ def global_avgpool2d(expr, type_map):
     """Rewrite a global_avgpool op"""
     arg = expr.args[0]
     t = type_map[arg]
-    arg = relay.op.cast(arg, "int32")
+    out_t = type_map[expr]
+    out_t = type_map[expr]
+    if not (
+        approx_equal(t.scale, out_t.scale)
+        and approx_equal(t.zero_point, out_t.zero_point)
+        and tvm.ir.structural_equal(t.dtype, out_t.dtype)
+    ):
+        arg = relay.qnn.op.requantize(
+            arg,
+            t.scale,
+            t.zero_point,
+            out_t.scale,
+            out_t.zero_point,
+            out_dtype="int32",
+            axis=t.axis,
+        )
+    else:
+        arg = relay.op.cast(arg, "int32")
     out = relay.op.nn.global_avg_pool2d(arg)
-    out = relay.op.cast(out, t.dtype)
-    return [out, t]
+    return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)]
 
 
 @register_fake_quantization_to_integer("broadcast_to")
diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py
index cdf5fd42a138..a63d82e68750 100644
--- a/tests/python/relay/test_pass_fake_quantization_to_integer.py
+++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py
@@ -281,10 +281,9 @@ def test_fake_quantize_maxpool():
 def test_fake_quantize_adaptive_avgpool1d(output_size):
     x = relay.var("x", shape=[1, 128, 768], dtype="int8")
 
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
+    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(-12))
     op = relay.op.nn.adaptive_avg_pool1d(x, output_size)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
+    op = relay.qnn.op.quantize(op, relay.const(0.5), relay.const(10))
 
     x_np = np.random.randint(-128, 127, size=[1, 128, 768], dtype="int8")
 
@@ -294,10 +293,9 @@ def test_fake_quantize_adaptive_avgpool1d(output_size):
 def test_fake_quantize_avgpool():
     x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
 
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
+    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(-12))
     op = relay.op.nn.avg_pool2d(x, [3, 3])
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
+    op = relay.qnn.op.quantize(op, relay.const(0.5), relay.const(10))
 
     x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
 
@@ -307,10 +305,9 @@ def test_fake_quantize_avgpool():
 def test_fake_quantize_global_avg_pool():
     x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
 
-    zero = relay.const(0)
-    x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
+    x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(-12))
     op = relay.op.nn.global_avg_pool2d(x)
-    op = relay.qnn.op.quantize(op, relay.const(2.0), zero)
+    op = relay.qnn.op.quantize(op, relay.const(0.5), relay.const(10))
 
     x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
 

From bb00a15c265ba12341aede06bbbf216dda585211 Mon Sep 17 00:00:00 2001
From: wrongtest <wrongtest0@gmail.com>
Date: Fri, 26 Aug 2022 01:42:57 +0800
Subject: [PATCH 047/704] [CUDA][CodeGen] Fix cuda codegen's fp16 inf literal
 (#12581)

* Fix cuda codegen's fp16 inf literal

* add relay testcase
---
 src/target/source/codegen_cuda.cc    |  6 ++++--
 tests/python/relay/test_op_level3.py | 16 ++++++++++++----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 2239cef92060..d96e0cbc1679 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -1197,8 +1197,10 @@ inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p)
       break;
     }
     case 16: {
-      os << "__float2half_rn";
-      os << '(' << std::scientific << op->value << 'f' << ')';
+      os << "__float2half_rn" << '(';
+      FloatImm const_f32 = FloatImm(DataType::Float(32), op->value);
+      PrintConst(const_f32.get(), os, p);
+      os << ')';
       break;
     }
     default:
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 2fe40ae2f88e..400f7dcf0b42 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1344,7 +1344,7 @@ def verify_gather_nd(xshape, yshape, y_data, batch_dims=0, indices_dtype="int32"
     verify_gather_nd((2, 2, 2), (2, 2, 1), [[[1], [0]], [[0], [1]]], 1, indices_dtype="uint32")
 
 
-def _verify_infiniteness_ops(relay_op, ref_op):
+def _verify_infiniteness_ops(relay_op, ref_op, target="llvm", dev=None):
     for dtype in ["float32", "float16", "float16", "int32", "int16"]:
         shape = (2, 8, 8)
         x = relay.var("x", relay.TensorType(shape, dtype))
@@ -1359,17 +1359,25 @@ def _verify_infiniteness_ops(relay_op, ref_op):
             ] = np.infty
             data.ravel()[np.random.choice(data.size, int(data.size * 0.5), replace=False)] = np.nan
 
-        op_res = create_executor().evaluate(y, {x: data})
+        op_res = create_executor(target=target, device=dev).evaluate(y, {x: data})
         ref_res = ref_op(data)
         np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
 
 
+@tvm.testing.requires_gpu
 def test_isfinite():
-    _verify_infiniteness_ops(relay.isfinite, np.isfinite)
+    for target, dev in tvm.testing.enabled_targets():
+        if target not in ["llvm", "cuda"]:
+            continue
+        _verify_infiniteness_ops(relay.isfinite, np.isfinite, target=target, dev=dev)
 
 
+@tvm.testing.requires_gpu
 def test_isinf():
-    _verify_infiniteness_ops(relay.isinf, np.isinf)
+    for target, dev in tvm.testing.enabled_targets():
+        if target not in ["llvm", "cuda"]:
+            continue
+        _verify_infiniteness_ops(relay.isinf, np.isinf, target=target, dev=dev)
 
 
 def test_unravel_index(target, dev, executor_kind):

From 01fcdfcf5fcfda313df4e176ca3d919b076f77fc Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Thu, 25 Aug 2022 10:55:58 -0700
Subject: [PATCH 048/704] [ci] Default to n=2 for test parallelism (#12414)

* Revert "[skip ci] Revert "[ci] Default to n=2 for test parallelism (#12376)" (#12413)"

This reverts commit 478b672f2b7bb37f529fa6477b3c4ac353217b7a.

* [ci] Default to n=2 for test parallelism

This is attempt #2 of #12376 which was reverted in #12413. The changes
in `plugin.py` should keep all the tests on the same node so sporadic
failures don't happen due to scheduling.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                       | 60 +++++++++++++++++++++++++++++--
 ci/jenkins/Jenkinsfile.j2         |  2 +-
 ci/jenkins/macros.j2              |  3 ++
 python/tvm/testing/plugin.py      | 42 ++++++++++++++++++++++
 tests/scripts/setup-pytest-env.sh |  8 ++++-
 5 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 15cd4927d0ba..8c1ce9ed5020 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-08-15T16:55:31.189354
+// Generated at 2022-08-19T15:38:38.311410
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -97,7 +97,7 @@ properties([
 upstream_revision = null
 
 // command to start a docker container
-docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS'
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
 docker_build = 'docker/build.sh'
 // timeout in minutes
 max_time = 180
@@ -610,6 +610,7 @@ def lint() {
         timeout(time: max_time, unit: 'MINUTES') {
           withEnv([
             'TVM_NUM_SHARDS=2',
+            'TEST_STEP_NAME=Lint',
             'TVM_SHARD_INDEX=0',
             "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
             sh (
@@ -629,6 +630,7 @@ def lint() {
         timeout(time: max_time, unit: 'MINUTES') {
           withEnv([
             'TVM_NUM_SHARDS=2',
+            'TEST_STEP_NAME=Lint',
             'TVM_SHARD_INDEX=1',
             "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
             sh (
@@ -1225,6 +1227,7 @@ def shard_run_unittest_GPU_1_of_3() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=unittest: GPU',
               'TVM_NUM_SHARDS=3',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -1340,6 +1343,7 @@ def shard_run_unittest_GPU_2_of_3() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=unittest: GPU',
               'TVM_NUM_SHARDS=3',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -1421,6 +1425,7 @@ def shard_run_unittest_GPU_3_of_3() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=unittest: GPU',
               'TVM_NUM_SHARDS=3',
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -1499,6 +1504,7 @@ def shard_run_integration_CPU_1_of_10() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
               'TVM_NUM_SHARDS=10',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -1574,6 +1580,7 @@ def shard_run_integration_CPU_2_of_10() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
               'TVM_NUM_SHARDS=10',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -1649,6 +1656,7 @@ def shard_run_integration_CPU_3_of_10() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
               'TVM_NUM_SHARDS=10',
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -1724,6 +1732,7 @@ def shard_run_integration_CPU_4_of_10() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
               'TVM_NUM_SHARDS=10',
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -1799,6 +1808,7 @@ def shard_run_integration_CPU_5_of_10() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
               'TVM_NUM_SHARDS=10',
               'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -1874,6 +1884,7 @@ def shard_run_integration_CPU_6_of_10() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
               'TVM_NUM_SHARDS=10',
               'TVM_SHARD_INDEX=5',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -1949,6 +1960,7 @@ def shard_run_integration_CPU_7_of_10() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
               'TVM_NUM_SHARDS=10',
               'TVM_SHARD_INDEX=6',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2024,6 +2036,7 @@ def shard_run_integration_CPU_8_of_10() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
               'TVM_NUM_SHARDS=10',
               'TVM_SHARD_INDEX=7',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2099,6 +2112,7 @@ def shard_run_integration_CPU_9_of_10() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
               'TVM_NUM_SHARDS=10',
               'TVM_SHARD_INDEX=8',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2174,6 +2188,7 @@ def shard_run_integration_CPU_10_of_10() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
               'TVM_NUM_SHARDS=10',
               'TVM_SHARD_INDEX=9',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2250,6 +2265,7 @@ def shard_run_python_i386_1_of_5() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=i386',
+              'TEST_STEP_NAME=python: i386',
               'TVM_NUM_SHARDS=5',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2325,6 +2341,7 @@ def shard_run_python_i386_2_of_5() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=i386',
+              'TEST_STEP_NAME=python: i386',
               'TVM_NUM_SHARDS=5',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2400,6 +2417,7 @@ def shard_run_python_i386_3_of_5() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=i386',
+              'TEST_STEP_NAME=python: i386',
               'TVM_NUM_SHARDS=5',
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2474,6 +2492,7 @@ def shard_run_python_i386_4_of_5() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=i386',
+              'TEST_STEP_NAME=python: i386',
               'TVM_NUM_SHARDS=5',
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2548,6 +2567,7 @@ def shard_run_python_i386_5_of_5() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=i386',
+              'TEST_STEP_NAME=python: i386',
               'TVM_NUM_SHARDS=5',
               'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2623,6 +2643,7 @@ def shard_run_test_Hexagon_1_of_7() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
               'TVM_NUM_SHARDS=7',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2697,6 +2718,7 @@ def shard_run_test_Hexagon_2_of_7() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
               'TVM_NUM_SHARDS=7',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2770,6 +2792,7 @@ def shard_run_test_Hexagon_3_of_7() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
               'TVM_NUM_SHARDS=7',
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2843,6 +2866,7 @@ def shard_run_test_Hexagon_4_of_7() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
               'TVM_NUM_SHARDS=7',
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2916,6 +2940,7 @@ def shard_run_test_Hexagon_5_of_7() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
               'TVM_NUM_SHARDS=7',
               'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -2989,6 +3014,7 @@ def shard_run_test_Hexagon_6_of_7() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
               'TVM_NUM_SHARDS=7',
               'TVM_SHARD_INDEX=5',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3062,6 +3088,7 @@ def shard_run_test_Hexagon_7_of_7() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
               'TVM_NUM_SHARDS=7',
               'TVM_SHARD_INDEX=6',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3136,6 +3163,7 @@ def shard_run_integration_aarch64_1_of_4() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
               'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3210,6 +3238,7 @@ def shard_run_integration_aarch64_2_of_4() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
               'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3284,6 +3313,7 @@ def shard_run_integration_aarch64_3_of_4() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
               'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3358,6 +3388,7 @@ def shard_run_integration_aarch64_4_of_4() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
               'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3433,6 +3464,7 @@ def shard_run_topi_GPU_1_of_4() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=topi: GPU',
               'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3506,6 +3538,7 @@ def shard_run_topi_GPU_2_of_4() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=topi: GPU',
               'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3579,6 +3612,7 @@ def shard_run_topi_GPU_3_of_4() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=topi: GPU',
               'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3652,6 +3686,7 @@ def shard_run_topi_GPU_4_of_4() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=topi: GPU',
               'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3726,6 +3761,7 @@ def shard_run_frontend_GPU_1_of_6() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
               'TVM_NUM_SHARDS=6',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3799,6 +3835,7 @@ def shard_run_frontend_GPU_2_of_6() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
               'TVM_NUM_SHARDS=6',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3872,6 +3909,7 @@ def shard_run_frontend_GPU_3_of_6() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
               'TVM_NUM_SHARDS=6',
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -3945,6 +3983,7 @@ def shard_run_frontend_GPU_4_of_6() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
               'TVM_NUM_SHARDS=6',
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4018,6 +4057,7 @@ def shard_run_frontend_GPU_5_of_6() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
               'TVM_NUM_SHARDS=6',
               'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4091,6 +4131,7 @@ def shard_run_frontend_GPU_6_of_6() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
               'TVM_NUM_SHARDS=6',
               'TVM_SHARD_INDEX=5',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4165,6 +4206,7 @@ def shard_run_topi_aarch64_1_of_2() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
+              'TEST_STEP_NAME=topi: aarch64',
               'TVM_NUM_SHARDS=2',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4243,6 +4285,7 @@ def shard_run_topi_aarch64_2_of_2() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
+              'TEST_STEP_NAME=topi: aarch64',
               'TVM_NUM_SHARDS=2',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4321,6 +4364,7 @@ def shard_run_frontend_aarch64_1_of_2() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
+              'TEST_STEP_NAME=frontend: aarch64',
               'TVM_NUM_SHARDS=2',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4394,6 +4438,7 @@ def shard_run_frontend_aarch64_2_of_2() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
+              'TEST_STEP_NAME=frontend: aarch64',
               'TVM_NUM_SHARDS=2',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4468,6 +4513,7 @@ def shard_run_test_Cortex_M_1_of_8() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
               'TVM_NUM_SHARDS=8',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4546,6 +4592,7 @@ def shard_run_test_Cortex_M_2_of_8() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
               'TVM_NUM_SHARDS=8',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4619,6 +4666,7 @@ def shard_run_test_Cortex_M_3_of_8() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
               'TVM_NUM_SHARDS=8',
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4692,6 +4740,7 @@ def shard_run_test_Cortex_M_4_of_8() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
               'TVM_NUM_SHARDS=8',
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4765,6 +4814,7 @@ def shard_run_test_Cortex_M_5_of_8() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
               'TVM_NUM_SHARDS=8',
               'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4838,6 +4888,7 @@ def shard_run_test_Cortex_M_6_of_8() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
               'TVM_NUM_SHARDS=8',
               'TVM_SHARD_INDEX=5',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4911,6 +4962,7 @@ def shard_run_test_Cortex_M_7_of_8() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
               'TVM_NUM_SHARDS=8',
               'TVM_SHARD_INDEX=6',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -4984,6 +5036,7 @@ def shard_run_test_Cortex_M_8_of_8() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
               'TVM_NUM_SHARDS=8',
               'TVM_SHARD_INDEX=7',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -5058,6 +5111,7 @@ def shard_run_test_RISC_V_1_of_1() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=riscv',
+              'TEST_STEP_NAME=test: RISC-V',
               'TVM_NUM_SHARDS=1',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -5361,6 +5415,7 @@ stage('Test') {
               docker_init(ci_cpu)
               init_git()
               withEnv(['PLATFORM=cpu',
+              'TEST_STEP_NAME=unittest: CPU',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
                 sh(
                         script: """
@@ -5435,6 +5490,7 @@ stage('Test') {
               docker_init(ci_cpu)
               init_git()
               withEnv(['PLATFORM=cpu',
+              'TEST_STEP_NAME=frontend: CPU',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
                 sh(
                         script: """
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index 4960d4f0fa57..be2776c6d9e3 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -85,7 +85,7 @@ properties([
 upstream_revision = null
 
 // command to start a docker container
-docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS'
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
 docker_build = 'docker/build.sh'
 // timeout in minutes
 max_time = 180
diff --git a/ci/jenkins/macros.j2 b/ci/jenkins/macros.j2
index dbd6ac551db4..9d02ad68d6da 100644
--- a/ci/jenkins/macros.j2
+++ b/ci/jenkins/macros.j2
@@ -44,6 +44,7 @@ def {{ method_name }}() {
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM={{ platform }}',
+              'TEST_STEP_NAME={{ name }}',
               'TVM_NUM_SHARDS={{ num_shards }}',
               'TVM_SHARD_INDEX={{ shard_index - 1 }}',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -75,6 +76,7 @@ def {{ method_name }}() {
         timeout(time: max_time, unit: 'MINUTES') {
           withEnv([
             'TVM_NUM_SHARDS={{ num_shards }}',
+            'TEST_STEP_NAME={{ name }}',
             'TVM_SHARD_INDEX={{ shard_index - 1 }}',
             "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
             {{ caller() | trim | indent(width=6) }}
@@ -121,6 +123,7 @@ def {{ method_name }}() {
               docker_init({{ docker_image }})
               init_git()
               withEnv(['PLATFORM={{ platform }}',
+              'TEST_STEP_NAME={{ name }}',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
                 {{ caller() | indent(width=12) | trim }}
               })
diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py
index 1f4f983b7210..2d845b70ff11 100644
--- a/python/tvm/testing/plugin.py
+++ b/python/tvm/testing/plugin.py
@@ -37,6 +37,13 @@
 import tvm
 from tvm.testing import utils
 
+try:
+    from xdist.scheduler.loadscope import LoadScopeScheduling
+
+    HAVE_XDIST = True
+except ImportError:
+    HAVE_XDIST = False
+
 
 MARKERS = {
     "gpu": "mark a test as requiring a gpu",
@@ -319,3 +326,38 @@ def _parametrize_correlated_parameters(metafunc):
             names = ",".join(name for name, values in params)
             value_sets = zip(*[values for name, values in params])
             metafunc.parametrize(names, value_sets, indirect=True, ids=ids)
+
+
+# pytest-xdist isn't required but is used in CI, so guard on its presence
+if HAVE_XDIST:
+
+    def pytest_xdist_make_scheduler(config, log):
+        """
+        Serialize certain tests for pytest-xdist that have inter-test
+        dependencies
+        """
+
+        class TvmTestScheduler(LoadScopeScheduling):
+            """
+            Scheduler to serializer tests
+            """
+
+            def _split_scope(self, nodeid):
+                """
+                Returns a specific string for classes of nodeids
+                """
+                # NOTE: these tests contain inter-test dependencies and must be
+                # serialized
+                items = {
+                    "test_tvm_testing_features": "functional-tests",
+                    "tests/python/unittest/test_crt": "crt-tests",
+                    "tests/python/driver/tvmc": "tvmc-tests",
+                }
+
+                for nodeid_pattern, suite_name in items.items():
+                    if nodeid_pattern in nodeid:
+                        return suite_name
+
+                return nodeid
+
+        return TvmTestScheduler(config, log)
diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh
index afb759c09356..d6c49a42819a 100755
--- a/tests/scripts/setup-pytest-env.sh
+++ b/tests/scripts/setup-pytest-env.sh
@@ -74,8 +74,14 @@ function run_pytest() {
 
     suite_name="${test_suite_name}-${current_shard}-${ffi_type}"
 
+    # Some test environments don't play well with parallelism
+    DEFAULT_PARALLELISM=2
+    if [[ "${TEST_STEP_NAME:-default}" == "frontend: GPU"* ]] || [[ "${TEST_STEP_NAME:-default}" == "test: Hexagon"* ]]; then
+        DEFAULT_PARALLELISM=1
+    fi
+
     if [ ! "${extra_args[@]}" == *" -n"* ] && [! "${extra_args[@]}" == *" -dist"* ]; then
-        extra_args+=("-n=1")
+        extra_args+=("-n=$DEFAULT_PARALLELISM")
     fi
 
     exit_code=0

From 8d60b3cbbcacc5675383c353a1180be0cbc59cb9 Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@octoml.ai>
Date: Thu, 25 Aug 2022 13:05:09 -0700
Subject: [PATCH 049/704] [Runtime] Change default alignment to 64 bytes.
 (#12586)

* Change default alignment to 64 bits.

* Run dlpack test a few times.

* Update alignment in tests.

* Revert mma alignment change.

* Change default printing of buffer.

* Change crt runtime default allocation.
---
 include/tvm/runtime/device_api.h              |   4 +-
 python/tvm/tir/tensor_intrin/cuda.py          |  54 ++++---
 src/printer/tir_text_printer.cc               |   2 +-
 src/runtime/crt/common/crt_runtime_api.c      |   2 +-
 tests/python/contrib/test_dlpack.py           |  10 +-
 .../test_ethosu/test_tir_to_cs_translator.py  |  68 ++++-----
 .../contrib/test_ethosu/test_vela_api.py      |  18 +--
 .../test_tir_analysis_calculate_workspace.py  |  16 +-
 tests/python/unittest/test_tir_intrin.py      |   8 +-
 .../unittest/test_tir_schedule_analysis.py    |   6 +-
 .../unittest/test_tir_schedule_reduction.py   |  16 +-
 .../test_tir_schedule_storage_align.py        |  18 +--
 .../unittest/test_tir_schedule_tensorize.py   |  30 ++--
 ..._tir_transform_convert_for_loops_serial.py |   8 +-
 ...est_tir_transform_inject_rolling_buffer.py |  12 +-
 tests/python/unittest/test_tir_usmp_algo.py   |  18 +--
 ...st_tir_usmp_analysis_extract_bufferinfo.py | 138 +++++++++---------
 ...orm_convert_pool_allocations_to_offsets.py |  36 ++---
 ..._tir_usmp_transform_create_io_allocates.py |  48 +++---
 tests/python/unittest/test_tir_usmp_utils.py  |  18 +--
 .../unittest/test_tvmscript_complete.py       |  18 +--
 .../unittest/test_tvmscript_roundtrip.py      |  36 ++---
 .../unittest/test_tvmscript_syntax_sugar.py   |  12 +-
 tests/python/unittest/test_tvmscript_type.py  |   6 +-
 24 files changed, 303 insertions(+), 299 deletions(-)

diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index c3d83bf2993f..1bb10fa17ae6 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -52,10 +52,10 @@ enum DeviceAttrKind : int {
 };
 
 /*! \brief Number of bytes each allocation must align to */
-constexpr int kAllocAlignment = 128;
+constexpr int kAllocAlignment = 64;
 
 /*! \brief Number of bytes each allocation must align to in temporary allocation */
-constexpr int kTempAllocaAlignment = 128;
+constexpr int kTempAllocaAlignment = 64;
 
 /*! \brief Maximum size that can be allocated on stack */
 constexpr int kMaxStackAlloca = 1024;
diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py
index b4f5d1d331e5..64d7c24840ae 100644
--- a/python/tvm/tir/tensor_intrin/cuda.py
+++ b/python/tvm/tir/tensor_intrin/cuda.py
@@ -120,12 +120,12 @@ def ldmatrix_desc(warp_handle: T.handle, shared_handle: T.handle) -> None:
             shared_handle,
             shmem_shape,
             dtype,
-            align=128,
+            align=64,
             offset_factor=16,
             scope=shared_scope,
         )
         warp = T.match_buffer(
-            warp_handle, (WARP_SIZE, local_size), dtype, align=128, offset_factor=16, scope="warp"
+            warp_handle, (WARP_SIZE, local_size), dtype, align=64, offset_factor=16, scope="warp"
         )
 
         with T.block("root"):
@@ -149,13 +149,13 @@ def ldmatrix_impl(warp_handle: T.handle, shared_handle: T.handle) -> None:
             shared_handle,
             shmem_shape,
             dtype,
-            align=128,
+            align=64,
             offset_factor=16,
             scope=shared_scope,
             strides=[s0, s1],
         )
         warp = T.match_buffer(
-            warp_handle, (WARP_SIZE, local_size), dtype, align=128, offset_factor=16, scope="warp"
+            warp_handle, (WARP_SIZE, local_size), dtype, align=64, offset_factor=16, scope="warp"
         )
 
         with T.block("root"):
@@ -222,13 +222,13 @@ def maybe_swap(i, j):
     @T.prim_func
     def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
         A = T.match_buffer(
-            a, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp"
+            a, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp"
         )
         B = T.match_buffer(
-            b, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp"
+            b, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp"
         )
         C = T.match_buffer(
-            c, (WARP_SIZE, local_size_out), out_dtype, align=128, offset_factor=16, scope="warp"
+            c, (WARP_SIZE, local_size_out), out_dtype, align=64, offset_factor=16, scope="warp"
         )
 
         with T.block("root"):
@@ -262,13 +262,13 @@ def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
     @T.prim_func
     def mma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None:
         A = T.match_buffer(
-            a, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp"
+            a, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp"
         )
         B = T.match_buffer(
-            b, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp"
+            b, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp"
         )
         C = T.match_buffer(
-            c, (WARP_SIZE, local_size_out), out_dtype, align=128, offset_factor=16, scope="warp"
+            c, (WARP_SIZE, local_size_out), out_dtype, align=64, offset_factor=16, scope="warp"
         )
 
         with T.block("root"):
@@ -510,11 +510,9 @@ def get_wmma_load_intrin(
 
     @T.prim_func
     def wmma_load_desc(a: T.handle, c: T.handle) -> None:
-        A = T.match_buffer(
-            a, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=shared_scope
-        )
+        A = T.match_buffer(a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=shared_scope)
         C = T.match_buffer(
-            c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=wmma_fragment_scope
+            c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=wmma_fragment_scope
         )
         with T.block("root"):
             T.reads(A[0:m_dim, 0:n_dim])
@@ -532,13 +530,13 @@ def wmma_load_impl(a: T.handle, c: T.handle) -> None:
             a,
             (m_dim, n_dim),
             dtype,
-            align=128,
+            align=64,
             offset_factor=16,
             scope=shared_scope,
             strides=[s1, s0],
         )
         C = T.match_buffer(
-            c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=wmma_fragment_scope
+            c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=wmma_fragment_scope
         )
         with T.block("root"):
             T.reads(A[0:m_dim, 0:n_dim])
@@ -569,7 +567,7 @@ def get_wmma_fill_intrin(
     @T.prim_func
     def wmma_fill_desc(c: T.handle) -> None:
         C = T.match_buffer(
-            c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator"
+            c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator"
         )
         with T.block("root"):
             T.reads()
@@ -582,7 +580,7 @@ def wmma_fill_desc(c: T.handle) -> None:
     @T.prim_func
     def wmma_fill_impl(c: T.handle) -> None:
         C = T.match_buffer(
-            c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator"
+            c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator"
         )
         with T.block("root"):
             T.reads()
@@ -610,9 +608,9 @@ def get_wmma_store_intrin(
     @T.prim_func
     def wmma_store_desc(a: T.handle, c: T.handle) -> None:
         A = T.match_buffer(
-            a, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator"
+            a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator"
         )
-        C = T.match_buffer(c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=scope)
+        C = T.match_buffer(c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=scope)
         with T.block("root"):
             T.reads(A[0:m_dim, 0:n_dim])
             T.writes(C[0:m_dim, 0:n_dim])
@@ -626,10 +624,10 @@ def wmma_store_impl(a: T.handle, c: T.handle) -> None:
         s1 = T.var("int32")
         s0 = T.var("int32")
         A = T.match_buffer(
-            a, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator"
+            a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator"
         )
         C = T.match_buffer(
-            c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=scope, strides=[s1, s0]
+            c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=scope, strides=[s1, s0]
         )
         with T.block("root"):
             T.reads(A[0:m_dim, 0:n_dim])
@@ -671,18 +669,18 @@ def maybe_swap(i, j):
     @T.prim_func
     def wmma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
         A = T.match_buffer(
-            a, (m_dim, k_dim), in_dtype, align=128, offset_factor=16, scope="wmma.matrix_a"
+            a, (m_dim, k_dim), in_dtype, align=64, offset_factor=16, scope="wmma.matrix_a"
         )
         B = T.match_buffer(
             b,
             maybe_swap(k_dim, n_dim),
             in_dtype,
-            align=128,
+            align=64,
             offset_factor=16,
             scope="wmma.matrix_b",
         )
         C = T.match_buffer(
-            c, (m_dim, n_dim), out_dtype, align=128, offset_factor=16, scope="wmma.accumulator"
+            c, (m_dim, n_dim), out_dtype, align=64, offset_factor=16, scope="wmma.accumulator"
         )
 
         with T.block("root"):
@@ -699,18 +697,18 @@ def wmma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
     @T.prim_func
     def wmma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None:
         A = T.match_buffer(
-            a, (m_dim, k_dim), in_dtype, align=128, offset_factor=16, scope="wmma.matrix_a"
+            a, (m_dim, k_dim), in_dtype, align=64, offset_factor=16, scope="wmma.matrix_a"
         )
         B = T.match_buffer(
             b,
             maybe_swap(k_dim, n_dim),
             in_dtype,
-            align=128,
+            align=64,
             offset_factor=16,
             scope="wmma.matrix_b",
         )
         C = T.match_buffer(
-            c, (m_dim, n_dim), out_dtype, align=128, offset_factor=16, scope="wmma.accumulator"
+            c, (m_dim, n_dim), out_dtype, align=64, offset_factor=16, scope="wmma.accumulator"
         )
 
         with T.block("root"):
diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc
index 894a9cec1e2a..cdfc8fd318fd 100644
--- a/src/printer/tir_text_printer.cc
+++ b/src/printer/tir_text_printer.cc
@@ -251,7 +251,7 @@ Doc TIRTextPrinter::BufferNode2Doc(const BufferNode* buf, Doc doc) {
   if (GetRef<Buffer>(buf).scope() != "global") {
     doc << ", scope=" << Doc::StrLiteral(GetRef<Buffer>(buf).scope());
   }
-  if (buf->data_alignment != 128) {
+  if (buf->data_alignment != runtime::kAllocAlignment) {
     doc << ", align=" << buf->data_alignment;
   }
   if (buf->offset_factor != 1) {
diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c
index 2151c23f8462..7df610b53c45 100644
--- a/src/runtime/crt/common/crt_runtime_api.c
+++ b/src/runtime/crt/common/crt_runtime_api.c
@@ -104,7 +104,7 @@ int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shap
   }
   nbytes *= (dtype.bits * dtype.lanes + 7) / 8;
 
-  int kAllocAlignment = 128;
+  int kAllocAlignment = 64;
   size_t align = (dtype.bits / 8) * dtype.lanes;
   if (align < kAllocAlignment) align = kAllocAlignment;
   return TVMDeviceAllocDataSpace(dev, nbytes, align, dtype, out_data);
diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py
index c71fc45d0346..4e65f79c518e 100644
--- a/tests/python/contrib/test_dlpack.py
+++ b/tests/python/contrib/test_dlpack.py
@@ -21,7 +21,7 @@
 from tvm.contrib.dlpack import to_pytorch_func
 
 
-def test():
+def verify_torch_dlpack():
     a = np.random.randn(1337)
     tvm_a = tvm.nd.array(a)
     np.testing.assert_equal(tvm.nd.from_dlpack(tvm_a.to_dlpack()).numpy(), a)
@@ -63,5 +63,11 @@ def test():
         pass
 
 
+def test_torch_dlpack():
+    # Run dlpack interoperability test a few times to make sure it's stable.
+    for i in range(5):
+        verify_torch_dlpack()
+
+
 if __name__ == "__main__":
-    test()
+    test_torch_dlpack()
diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py
index 28522138cafc..e1a0e143281b 100644
--- a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py
+++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py
@@ -525,10 +525,10 @@ class SingleEthosuDepthwiseConv2D:
     def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, ethosu_depthwise_conv2d: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder_1, [18], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_5 = T.match_buffer(placeholder_2, [30], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_3 = T.match_buffer(placeholder, [192], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_depthwise_conv2d_1 = T.match_buffer(ethosu_depthwise_conv2d, [126], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder_1, [18], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_5 = T.match_buffer(placeholder_2, [30], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_3 = T.match_buffer(placeholder, [192], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_depthwise_conv2d_1 = T.match_buffer(ethosu_depthwise_conv2d, [126], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 8, 3, 8, 0, 8, placeholder_3[0], 0, 0, 0, T.float32(0.6), 11, "NHWC", 24, 3, 1, "int8", 6, 7, 3, 6, 0, 7, ethosu_depthwise_conv2d_1[0], 0, 0, 0, T.float32(0.26), 15, "NHWC", 21, 3, 1, 2, 3, 1, 1, 1, 1, placeholder_4[0], 18, 13, placeholder_5[0], 30, 0, 0, 0, 0, "CLIP", 15, 105, "TFL", "NONE", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -991,8 +991,8 @@ class SingleEthosuPooling:
     def main(placeholder: T.handle, placeholder_3: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [75], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [75], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_pooling", "int8", 5, 9, 3, 5, 0, 9, placeholder_4[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 5, 3, 5, 0, 5, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 15, 3, 1, "AVG", 2, 3, 2, 1, 1, 1, 1, 1, 1, 0, "CLIP", 10, 100, "TFL", "NONE", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -1065,10 +1065,10 @@ def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         placeholder_2 = T.match_buffer(
-            placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1
+            placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1
         )
         ethosu_write_2 = T.match_buffer(
-            ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1
+            ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1
         )
         # body
         T.evaluate(T.call_extern( "ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "ADD", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8"))
@@ -1084,8 +1084,8 @@ class SingleEthosuBinaryElementwiseSub:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "SUB", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -1099,8 +1099,8 @@ class SingleEthosuBinaryElementwiseMul:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "MUL", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -1115,8 +1115,8 @@ class SingleEthosuBinaryElementwiseMin:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "MIN", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -1131,8 +1131,8 @@ class SingleEthosuBinaryElementwiseMax:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "MAX", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -1147,8 +1147,8 @@ class SingleEthosuBinaryElementwiseShr:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "SHR", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="int32"))
     __tvm_meta__ = None
@@ -1163,8 +1163,8 @@ class SingleEthosuBinaryElementwiseShl:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "SHL", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int32"))
     __tvm_meta__ = None
@@ -1284,8 +1284,8 @@ class SingleEthosuBinaryElementwiseAddBroadcasting:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "ADD", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -1299,8 +1299,8 @@ class SingleEthosuBinaryElementwiseSubBroadcasting:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "SUB", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -1314,8 +1314,8 @@ class SingleEthosuBinaryElementwiseMulBroadcasting:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "MUL", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -1330,8 +1330,8 @@ class SingleEthosuBinaryElementwiseMinBroadcasting:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "MIN", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -1346,8 +1346,8 @@ class SingleEthosuBinaryElementwiseMaxBroadcasting:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "MAX", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8"))
     __tvm_meta__ = None
@@ -1362,8 +1362,8 @@ class SingleEthosuBinaryElementwiseShrBroadcasting:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int32", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int32", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "SHR", 1, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="int32"))
     __tvm_meta__ = None
@@ -1378,8 +1378,8 @@ class SingleEthosuBinaryElementwiseShlBroadcasting:
     def main(placeholder: T.handle, ethosu_write: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=128, offset_factor=1)
+        placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int32", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int32", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "SHL", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int32"))
     __tvm_meta__ = None
diff --git a/tests/python/contrib/test_ethosu/test_vela_api.py b/tests/python/contrib/test_ethosu/test_vela_api.py
index e2e4b2cb3a91..75ca22d08202 100644
--- a/tests/python/contrib/test_ethosu/test_vela_api.py
+++ b/tests/python/contrib/test_ethosu/test_vela_api.py
@@ -50,16 +50,16 @@ def main(
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         placeholder_3 = T.match_buffer(
-            placeholder, [192], dtype="uint8", elem_offset=0, align=128, offset_factor=1
+            placeholder, [192], dtype="uint8", elem_offset=0, align=64, offset_factor=1
         )
         placeholder_4 = T.match_buffer(
-            placeholder_1, [48], dtype="uint8", elem_offset=0, align=128, offset_factor=1
+            placeholder_1, [48], dtype="uint8", elem_offset=0, align=64, offset_factor=1
         )
         placeholder_5 = T.match_buffer(
-            placeholder_2, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1
+            placeholder_2, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1
         )
         ethosu_conv2d_1 = T.match_buffer(
-            ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=128, offset_factor=1
+            ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=64, offset_factor=1
         )
         # body
         T.evaluate(
@@ -142,20 +142,20 @@ def main(
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         placeholder_3 = T.match_buffer(
-            placeholder, [192], dtype="uint8", elem_offset=0, align=128, offset_factor=1
+            placeholder, [192], dtype="uint8", elem_offset=0, align=64, offset_factor=1
         )
         placeholder_4 = T.match_buffer(
-            placeholder_1, [48], dtype="uint8", elem_offset=0, align=128, offset_factor=1
+            placeholder_1, [48], dtype="uint8", elem_offset=0, align=64, offset_factor=1
         )
         placeholder_5 = T.match_buffer(
-            placeholder_2, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1
+            placeholder_2, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1
         )
         # Per-channel weight scales
         placeholder_7 = T.match_buffer(
-            placeholder_6, [16], dtype="float32", elem_offset=0, align=128, offset_factor=1
+            placeholder_6, [16], dtype="float32", elem_offset=0, align=64, offset_factor=1
         )
         ethosu_conv2d_1 = T.match_buffer(
-            ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=128, offset_factor=1
+            ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=64, offset_factor=1
         )
         # body
         T.evaluate(
diff --git a/tests/python/unittest/test_tir_analysis_calculate_workspace.py b/tests/python/unittest/test_tir_analysis_calculate_workspace.py
index 8d3163c111c8..1d78458b930d 100644
--- a/tests/python/unittest/test_tir_analysis_calculate_workspace.py
+++ b/tests/python/unittest/test_tir_analysis_calculate_workspace.py
@@ -26,10 +26,10 @@
 def primfunc_global_allocates(placeholder_144: T.handle, placeholder_145: T.handle, placeholder_146: T.handle, T_cast_48: T.handle) -> None:
     # function attr dict
     T.func_attr({"global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_13", "tir.noalias": True})
-    placeholder_147 = T.match_buffer(placeholder_144, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-    placeholder_148 = T.match_buffer(placeholder_145, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-    placeholder_149 = T.match_buffer(placeholder_146, [512], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-    T_cast_49 = T.match_buffer(T_cast_48, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+    placeholder_147 = T.match_buffer(placeholder_144, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+    placeholder_148 = T.match_buffer(placeholder_145, [4608], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+    placeholder_149 = T.match_buffer(placeholder_146, [512], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+    T_cast_49 = T.match_buffer(T_cast_48, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1)
     # body
     PaddedInput_22 = T.allocate([131072], "int16", "global")
     DepthwiseConv2d_9 = T.allocate([100352], "int32", "global")
@@ -57,10 +57,10 @@ def primfunc_global_allocates(placeholder_144: T.handle, placeholder_145: T.hand
 def primfunc_local_allocates(placeholder_162: T.handle, placeholder_163: T.handle, placeholder_164: T.handle, T_cast_76: T.handle) -> None:
     # function attr dict
     T.func_attr({"global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_9", "tir.noalias": True})
-    placeholder_165 = T.match_buffer(placeholder_162, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-    placeholder_166 = T.match_buffer(placeholder_163, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-    placeholder_167 = T.match_buffer(placeholder_164, [512], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-    T_cast_77 = T.match_buffer(T_cast_76, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+    placeholder_165 = T.match_buffer(placeholder_162, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+    placeholder_166 = T.match_buffer(placeholder_163, [4608], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+    placeholder_167 = T.match_buffer(placeholder_164, [512], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+    T_cast_77 = T.match_buffer(T_cast_76, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1)
     sid_21 = T.allocate_const([0,1,2,3,4,5,6,7], "int8", [8])
     # body
     PaddedInput_25 = T.allocate([131072], "int16", "global")
diff --git a/tests/python/unittest/test_tir_intrin.py b/tests/python/unittest/test_tir_intrin.py
index b8061fc0732a..f887f8877a22 100644
--- a/tests/python/unittest/test_tir_intrin.py
+++ b/tests/python/unittest/test_tir_intrin.py
@@ -203,7 +203,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None:
             [n],
             strides=[stride],
             elem_offset=0,
-            align=128,
+            align=64,
             offset_factor=1,
             buffer_type="auto",
         )
@@ -212,7 +212,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None:
             [n],
             strides=[stride_1],
             elem_offset=0,
-            align=128,
+            align=64,
             offset_factor=1,
             buffer_type="auto",
         )
@@ -221,7 +221,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None:
             [n],
             strides=[stride_2],
             elem_offset=0,
-            align=128,
+            align=64,
             offset_factor=1,
             buffer_type="auto",
         )
@@ -230,7 +230,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None:
             [n],
             strides=[stride_3],
             elem_offset=0,
-            align=128,
+            align=64,
             offset_factor=1,
             buffer_type="auto",
         )
diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py
index d3e6033e880c..5524abbaf094 100644
--- a/tests/python/unittest/test_tir_schedule_analysis.py
+++ b/tests/python/unittest/test_tir_schedule_analysis.py
@@ -218,9 +218,9 @@ def test_get_tensorize_loop_mapping_conv2d_nchwc_vnni():
 def test_get_tensorize_loop_mapping_matmul_mma():
     @T.prim_func
     def matmul_16x16x16xf16f16f16_desc(
-        A: T.Buffer((16, 16), "float16", align=128, offset_factor=1),
-        B: T.Buffer((16, 16), "float16", align=128, offset_factor=1),
-        C: T.Buffer((16, 16), "float16", align=128, offset_factor=1),
+        A: T.Buffer((16, 16), "float16", align=64, offset_factor=1),
+        B: T.Buffer((16, 16), "float16", align=64, offset_factor=1),
+        C: T.Buffer((16, 16), "float16", align=64, offset_factor=1),
     ) -> None:
         with T.block("root"):
             T.reads(C[0:16, 0:16], A[0:16, 0:16], B[0:16, 0:16])
diff --git a/tests/python/unittest/test_tir_schedule_reduction.py b/tests/python/unittest/test_tir_schedule_reduction.py
index f3503460e50a..1600b27f5e78 100644
--- a/tests/python/unittest/test_tir_schedule_reduction.py
+++ b/tests/python/unittest/test_tir_schedule_reduction.py
@@ -78,8 +78,8 @@ def matmul_decompose0(a: T.handle, b: T.handle, c: T.handle) -> None:
 
 @T.prim_func
 def matmul_decompose1(a: T.handle, b: T.handle) -> None:
-    A = T.match_buffer(a, [32, 4, 128], elem_offset=0, align=128, offset_factor=1)
-    B = T.match_buffer(b, [32, 4], elem_offset=0, align=128, offset_factor=1)
+    A = T.match_buffer(a, [32, 4, 128], elem_offset=0, align=64, offset_factor=1)
+    B = T.match_buffer(b, [32, 4], elem_offset=0, align=64, offset_factor=1)
 
     for i0 in T.serial(0, 32):
         with T.block("blockized_B_init"):
@@ -100,9 +100,9 @@ def matmul_decompose1(a: T.handle, b: T.handle) -> None:
 
 @T.prim_func
 def matmul_decompose2(a: T.handle, b: T.handle, c: T.handle) -> None:
-    C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1)
+    C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1)
 
     for i0, i1 in T.grid(128, 128):
         with T.block("update_init"):
@@ -130,9 +130,9 @@ def matmul_decompose_fail3(a: T.handle, b: T.handle, c: T.handle) -> None:
 
 @T.prim_func
 def matmul_decompose4(a: T.handle, b: T.handle, c: T.handle) -> None:
-    C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1)
+    C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1)
     # body
     with T.block("root"):
         T.reads([])
diff --git a/tests/python/unittest/test_tir_schedule_storage_align.py b/tests/python/unittest/test_tir_schedule_storage_align.py
index 072640c8f3af..23cb5d3b5339 100644
--- a/tests/python/unittest/test_tir_schedule_storage_align.py
+++ b/tests/python/unittest/test_tir_schedule_storage_align.py
@@ -26,13 +26,13 @@
 
 @T.prim_func
 def element_wise(a: T.handle, c: T.handle) -> None:
-    C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1)
+    C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1)
     # body
     with T.block("root"):
         T.reads([])
         T.writes([])
-        B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1)
+        B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1)
         for i0 in T.serial(0, 128):
             for ax1 in T.serial(0, 128):
                 with T.block("B"):
@@ -50,13 +50,13 @@ def element_wise(a: T.handle, c: T.handle) -> None:
 
 @T.prim_func
 def element_wise_storage_align(a: T.handle, c: T.handle) -> None:
-    C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1)
+    C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1)
     # body
     with T.block("root"):
         T.reads([])
         T.writes([])
-        B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1)
+        B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1)
         for i0 in T.serial(0, 128):
             for ax1 in T.serial(0, 128):
                 with T.block("B"):
@@ -75,13 +75,13 @@ def element_wise_storage_align(a: T.handle, c: T.handle) -> None:
 
 @T.prim_func
 def element_wise_invalid_annotation(a: T.handle, c: T.handle) -> None:
-    C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1)
+    C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1)
     # body
     with T.block("root"):
         T.reads([])
         T.writes([])
-        B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1)
+        B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1)
         for i0 in T.serial(0, 128):
             for ax1 in T.serial(0, 128):
                 with T.block("B"):
diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py
index 929a6cfa19bc..828dad2fc036 100644
--- a/tests/python/unittest/test_tir_schedule_tensorize.py
+++ b/tests/python/unittest/test_tir_schedule_tensorize.py
@@ -36,9 +36,9 @@
 
 @T.prim_func
 def mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), align=128, offset_factor=1)
-    B = T.match_buffer(b, (16, 16), align=128, offset_factor=1)
-    C = T.match_buffer(c, (16, 16), align=128, offset_factor=1)
+    A = T.match_buffer(a, (16, 16), align=64, offset_factor=1)
+    B = T.match_buffer(b, (16, 16), align=64, offset_factor=1)
+    C = T.match_buffer(c, (16, 16), align=64, offset_factor=1)
 
     with T.block("root"):
         T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16])
@@ -51,9 +51,9 @@ def mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
 
 @T.prim_func
 def mma_intrin(a: T.handle, b: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), align=128, offset_factor=1)
-    B = T.match_buffer(b, (16, 16), align=128, offset_factor=1)
-    C = T.match_buffer(c, (16, 16), align=128, offset_factor=1)
+    A = T.match_buffer(a, (16, 16), align=64, offset_factor=1)
+    B = T.match_buffer(b, (16, 16), align=64, offset_factor=1)
+    C = T.match_buffer(c, (16, 16), align=64, offset_factor=1)
 
     with T.block("root"):
         T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16])
@@ -173,9 +173,9 @@ def matmul(
 
 @T.prim_func
 def tensorized_matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
-    C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1)
+    C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1)
 
     for i_outer, j_outer in T.grid(8, 8):
         for i_inner_init, j_inner_init in T.grid(16, 16):
@@ -375,9 +375,9 @@ def tensorized_batch_matmul_outer_product(
 
 @T.prim_func
 def annotated_mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), align=128, offset_factor=1)
-    B = T.match_buffer(b, (16, 16), align=128, offset_factor=1)
-    C = T.match_buffer(c, (16, 16), align=128, offset_factor=1)
+    A = T.match_buffer(a, (16, 16), align=64, offset_factor=1)
+    B = T.match_buffer(b, (16, 16), align=64, offset_factor=1)
+    C = T.match_buffer(c, (16, 16), align=64, offset_factor=1)
 
     with T.block("root"):
         T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16])
@@ -406,9 +406,9 @@ def annotated_matmul(
 
 @T.prim_func
 def annotated_tensorized_matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
-    C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1)
+    C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1)
 
     for i_outer, j_outer in T.grid(8, 8):
         for i_inner_init, j_inner_init in T.grid(16, 16):
diff --git a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py
index 38431705611b..1a3afdd4c1e2 100644
--- a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py
+++ b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py
@@ -26,10 +26,10 @@
 def fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(placeholder_30: T.handle, placeholder_31: T.handle, placeholder_32: T.handle, T_cast_8: T.handle) -> None:
     # function attr dict
     T.func_attr({"global_symbol": "fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", "tir.noalias": True})
-    placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-    placeholder_34 = T.match_buffer(placeholder_31, [3072], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-    placeholder_35 = T.match_buffer(placeholder_32, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-    T_cast_9 = T.match_buffer(T_cast_8, [12544], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+    placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+    placeholder_34 = T.match_buffer(placeholder_31, [3072], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+    placeholder_35 = T.match_buffer(placeholder_32, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+    T_cast_9 = T.match_buffer(T_cast_8, [12544], dtype="int16", elem_offset=0, align=64, offset_factor=1)
     # body
     PaddedInput_3 = T.allocate([150528], "int16", "global")
     for i0_i1_fused_3 in T.parallel(0, 28):
diff --git a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py
index 073a0ebd4e84..65a586b8ecfd 100644
--- a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py
+++ b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py
@@ -196,9 +196,9 @@ def main(A: T.handle, tensor: T.handle) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         # buffer definition
-        tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.realize(tensor_1[0:1, 0:8, 0:8, 0:16], "")
         for ax1_outer in T.serial(0, 2):
@@ -228,9 +228,9 @@ def main(A: T.handle, tensor: T.handle) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         # buffer definition
-        tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
-        tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
+        tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1)
+        tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1)
         # body
         T.realize(tensor_1[0:1, 0:8, 0:8, 0:16], "")
         T.realize(tensor_2[0:1, 0:6, 0:12, 0:16], "")
diff --git a/tests/python/unittest/test_tir_usmp_algo.py b/tests/python/unittest/test_tir_usmp_algo.py
index 140f6d1b146e..f67148189d8c 100644
--- a/tests/python/unittest/test_tir_usmp_algo.py
+++ b/tests/python/unittest/test_tir_usmp_algo.py
@@ -299,9 +299,9 @@ class MobilenetStructure:
     def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_1 in T.serial(0, 224):
             for ax2_1, ax3_inner_1 in T.grid(224, 3):
@@ -311,10 +311,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True})
-        placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_21 = T.match_buffer(T_cast_20, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_21 = T.match_buffer(T_cast_20, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_7 = T.allocate([157323], "int16", "global")
         for i0_i1_fused_7 in T.serial(0, 229):
@@ -333,8 +333,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
     def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True})
-        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         tensor_2 = T.allocate([200704], "uint8", "global")
         for ax0_ax1_fused_4 in T.serial(0, 56):
diff --git a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py
index d4e62362495c..60360ecade70 100644
--- a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py
+++ b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py
@@ -111,9 +111,9 @@ class LinearStructure:
     def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_1 in T.serial(0, 224):
             for ax2_1, ax3_inner_1 in T.grid(224, 3):
@@ -123,10 +123,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True})
-        placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_7 = T.allocate([157323], "int16", "global")
         for i0_i1_fused_7 in T.serial(0, 229):
@@ -145,8 +145,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
     def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True})
-        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         tensor_2 = T.allocate([200704], "uint8", "global")
         for ax0_ax1_fused_4 in T.serial(0, 56):
@@ -215,10 +215,10 @@ class ParallelSerialMixedForLoops:
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placeholder_68: T.handle, placeholder_69: T.handle, placeholder_70: T.handle, T_cast_22: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", "tir.noalias": True})
-        placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_8 = T.allocate([215296], "int16", "global")
         for i0_i1_fused_8 in T.serial(0, 58):
@@ -256,10 +256,10 @@ class AllSerialForLoops:
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placeholder_68: T.handle, placeholder_69: T.handle, placeholder_70: T.handle, T_cast_22: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", "tir.noalias": True})
-        placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_8 = T.allocate([215296], "int16", "global")
         for i0_i1_fused_8 in T.serial(0, 58):
@@ -338,8 +338,8 @@ class InceptionStructure:
     def tvmgen_default_fused_nn_max_pool2d(placeholder: T.handle, tensor: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d", "tir.noalias": True})
-        placeholder_1 = T.match_buffer(placeholder, [602112], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        tensor_1 = T.match_buffer(tensor, [249], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_1 = T.match_buffer(placeholder, [602112], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        tensor_1 = T.match_buffer(tensor, [249], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused in T.serial(0, 28):
             for ax2 in T.serial(0, 28):
@@ -352,9 +352,9 @@ def tvmgen_default_fused_nn_max_pool2d(placeholder: T.handle, tensor: T.handle)
     def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_1 in T.serial(0, 224):
             for ax2_1, ax3_inner_1 in T.grid(224, 3):
@@ -364,8 +364,8 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
     def tvmgen_default_fused_cast(placeholder_6: T.handle, T_cast: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast", "tir.noalias": True})
-        placeholder_7 = T.match_buffer(placeholder_6, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T_cast_1 = T.match_buffer(T_cast, [249], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_7 = T.match_buffer(placeholder_6, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T_cast_1 = T.match_buffer(T_cast, [249], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_2 in T.serial(0, 28):
             for ax2_2, ax3_outer_1, ax3_inner_2 in T.grid(28, 12, 16):
@@ -375,11 +375,11 @@ def tvmgen_default_fused_cast(placeholder_6: T.handle, T_cast: T.handle) -> None
     def tvmgen_default_fused_concatenate(placeholder_8: T.handle, placeholder_9: T.handle, placeholder_10: T.handle, placeholder_11: T.handle, T_concat: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_concatenate", "tir.noalias": True})
-        placeholder_12 = T.match_buffer(placeholder_8, [50176], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T_concat_1 = T.match_buffer(T_concat, [313], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_13 = T.match_buffer(placeholder_9, [100352], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_14 = T.match_buffer(placeholder_11, [25088], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_15 = T.match_buffer(placeholder_10, [25088], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_12 = T.match_buffer(placeholder_8, [50176], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T_concat_1 = T.match_buffer(T_concat, [313], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_13 = T.match_buffer(placeholder_9, [100352], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_14 = T.match_buffer(placeholder_11, [25088], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_15 = T.match_buffer(placeholder_10, [25088], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_3 in T.serial(0, 28):
             for ax2_3, ax3 in T.grid(28, 256):
@@ -389,10 +389,10 @@ def tvmgen_default_fused_concatenate(placeholder_8: T.handle, placeholder_9: T.h
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(placeholder_16: T.handle, placeholder_17: T.handle, placeholder_18: T.handle, T_cast_2: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", "tir.noalias": True})
-        placeholder_19 = T.match_buffer(placeholder_16, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_20 = T.match_buffer(placeholder_17, [4096], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_21 = T.match_buffer(placeholder_18, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_3 = T.match_buffer(T_cast_2, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_19 = T.match_buffer(placeholder_16, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_20 = T.match_buffer(placeholder_17, [4096], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_21 = T.match_buffer(placeholder_18, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_3 = T.match_buffer(T_cast_2, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput = T.allocate([200704], "int16", "global")
         for i0_i1_fused in T.serial(0, 56):
@@ -411,10 +411,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(placeholder_22: T.handle, placeholder_23: T.handle, placeholder_24: T.handle, T_cast_4: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", "tir.noalias": True})
-        placeholder_25 = T.match_buffer(placeholder_22, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_26 = T.match_buffer(placeholder_23, [18432], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_27 = T.match_buffer(placeholder_24, [96], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_5 = T.match_buffer(T_cast_4, [153], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_25 = T.match_buffer(placeholder_22, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_26 = T.match_buffer(placeholder_23, [18432], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_27 = T.match_buffer(placeholder_24, [96], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_5 = T.match_buffer(T_cast_4, [153], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_1 = T.allocate([150528], "int16", "global")
         for i0_i1_fused_1 in T.serial(0, 28):
@@ -432,8 +432,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla
     def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True})
-        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         tensor_2 = T.allocate([200704], "uint8", "global")
         for ax0_ax1_fused_4 in T.serial(0, 56):
@@ -450,10 +450,10 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6:
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2(placeholder_30: T.handle, placeholder_31: T.handle, placeholder_32: T.handle, T_cast_8: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2", "tir.noalias": True})
-        placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_34 = T.match_buffer(placeholder_31, [12288], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_35 = T.match_buffer(placeholder_32, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_9 = T.match_buffer(T_cast_8, [121], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_34 = T.match_buffer(placeholder_31, [12288], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_35 = T.match_buffer(placeholder_32, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_9 = T.match_buffer(T_cast_8, [121], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_2 = T.allocate([150528], "int16", "global")
         for i0_i1_fused_2 in T.serial(0, 28):
@@ -472,8 +472,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2(placehol
     def tvmgen_default_fused_nn_max_pool2d_cast_1(placeholder_36: T.handle, T_cast_10: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast_1", "tir.noalias": True})
-        placeholder_37 = T.match_buffer(placeholder_36, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T_cast_11 = T.match_buffer(T_cast_10, [249], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_37 = T.match_buffer(placeholder_36, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T_cast_11 = T.match_buffer(T_cast_10, [249], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         tensor_3 = T.allocate([150528], "uint8", "global")
         for ax0_ax1_fused_6 in T.serial(0, 28):
@@ -490,10 +490,10 @@ def tvmgen_default_fused_nn_max_pool2d_cast_1(placeholder_36: T.handle, T_cast_1
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__2(placeholder_38: T.handle, placeholder_39: T.handle, placeholder_40: T.handle, T_cast_12: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__2", "tir.noalias": True})
-        placeholder_41 = T.match_buffer(placeholder_38, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_42 = T.match_buffer(placeholder_39, [6144], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_43 = T.match_buffer(placeholder_40, [32], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_13 = T.match_buffer(T_cast_12, [89], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_41 = T.match_buffer(placeholder_38, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_42 = T.match_buffer(placeholder_39, [6144], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_43 = T.match_buffer(placeholder_40, [32], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_13 = T.match_buffer(T_cast_12, [89], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_3 = T.allocate([150528], "int16", "global")
         for i0_i1_fused_3 in T.serial(0, 28):
@@ -511,10 +511,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(placeholder_44: T.handle, placeholder_45: T.handle, placeholder_46: T.handle, T_cast_14: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", "tir.noalias": True})
-        placeholder_47 = T.match_buffer(placeholder_44, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_48 = T.match_buffer(placeholder_45, [3072], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_49 = T.match_buffer(placeholder_46, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_15 = T.match_buffer(T_cast_14, [73], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_47 = T.match_buffer(placeholder_44, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_48 = T.match_buffer(placeholder_45, [3072], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_49 = T.match_buffer(placeholder_46, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_15 = T.match_buffer(T_cast_14, [73], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_4 = T.allocate([150528], "int16", "global")
         for i0_i1_fused_4 in T.serial(0, 28):
@@ -532,10 +532,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(pla
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__1(placeholder_50: T.handle, placeholder_51: T.handle, placeholder_52: T.handle, T_cast_16: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__1", "tir.noalias": True})
-        placeholder_53 = T.match_buffer(placeholder_50, [12544], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_54 = T.match_buffer(placeholder_51, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_55 = T.match_buffer(placeholder_52, [32], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_17 = T.match_buffer(T_cast_16, [89], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_53 = T.match_buffer(placeholder_50, [12544], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_54 = T.match_buffer(placeholder_51, [4608], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_55 = T.match_buffer(placeholder_52, [32], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_17 = T.match_buffer(T_cast_16, [89], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_5 = T.allocate([14400], "int16", "global")
         for i0_i1_fused_5 in T.serial(0, 30):
@@ -553,10 +553,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320_(placeholder_56: T.handle, placeholder_57: T.handle, placeholder_58: T.handle, T_cast_18: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320_", "tir.noalias": True})
-        placeholder_59 = T.match_buffer(placeholder_56, [75264], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_60 = T.match_buffer(placeholder_57, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_61 = T.match_buffer(placeholder_58, [128], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_19 = T.match_buffer(T_cast_18, [185], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_59 = T.match_buffer(placeholder_56, [75264], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_60 = T.match_buffer(placeholder_57, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_61 = T.match_buffer(placeholder_58, [128], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_19 = T.match_buffer(T_cast_18, [185], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_6 = T.allocate([86400], "int16", "global")
         for i0_i1_fused_6 in T.serial(0, 30):
@@ -576,10 +576,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "T.noalias": True})
-        placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_7 = T.allocate([157323], "int16", "global")
         for i0_i1_fused_7 in T.serial(0, 229):
@@ -598,10 +598,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placeholder_68: T.handle, placeholder_69: T.handle, placeholder_70: T.handle, T_cast_22: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", "tir.noalias": True})
-        placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_8 = T.allocate([215296], "int16", "global")
         for i0_i1_fused_8 in T.serial(0, 58):
diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
index 0a3e39b52f46..e6d123118757 100644
--- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
+++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
@@ -74,12 +74,12 @@ class LinearStructure:
     def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T.preflattened_buffer(placeholder_4, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T.preflattened_buffer(placeholder_5, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T.preflattened_buffer(T_subtract_1, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T.preflattened_buffer(placeholder_4, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T.preflattened_buffer(placeholder_5, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T.preflattened_buffer(T_subtract_1, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_1 in T.serial(0, 224):
             for ax2_1, ax3_inner_1 in T.grid(224, 3):
@@ -89,14 +89,14 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True})
-        placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T.preflattened_buffer(placeholder_65, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T.preflattened_buffer(placeholder_66, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T.preflattened_buffer(placeholder_67, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T.preflattened_buffer(T_cast_21, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T.preflattened_buffer(placeholder_65, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T.preflattened_buffer(placeholder_66, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T.preflattened_buffer(placeholder_67, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T.preflattened_buffer(T_cast_21, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_7 = T.allocate([157323], "int16", "global")
         for i0_i1_fused_7 in T.serial(0, 229):
@@ -115,10 +115,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
     def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True})
-        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T.preflattened_buffer(placeholder_29, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T.preflattened_buffer(T_cast_7, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T.preflattened_buffer(placeholder_29, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T.preflattened_buffer(T_cast_7, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         tensor_2 = T.allocate([200704], "uint8", "global")
         for ax0_ax1_fused_4 in T.serial(0, 56):
diff --git a/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py b/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py
index d72cb7f72ede..53a381c82b14 100644
--- a/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py
+++ b/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py
@@ -28,9 +28,9 @@ class SingleInputSingleOutput:
     def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_1 in T.serial(0, 224):
             for ax2_1, ax3_inner_1 in T.grid(224, 3):
@@ -40,8 +40,8 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
     def __tvm_main__(input: T.handle, output: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True})
-        input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input_buffer_var.data, T.lookup_param("p0", dtype="handle"), output_buffer_var.data, dtype="int32"))
 # fmt: on
@@ -54,9 +54,9 @@ class TwoInputSingleOutput:
     def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_1 in T.serial(0, 224):
             for ax2_1, ax3_inner_1 in T.grid(224, 3):
@@ -66,9 +66,9 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
     def __tvm_main__(input1: T.handle, input2: T.handle, output: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True})
-        input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        input2_buffer_var = T.match_buffer(input2, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        input2_buffer_var = T.match_buffer(input2, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input1_buffer_var.data, input2_buffer_var.data, output_buffer_var.data, dtype="int32"))
 # fmt: on
@@ -81,9 +81,9 @@ class TwoInputTwoOutput:
     def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_1 in T.serial(0, 224):
             for ax2_1, ax3_inner_1 in T.grid(224, 3):
@@ -93,10 +93,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
     def __tvm_main__(input1: T.handle, input2: T.handle, output1: T.handle, output2: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True})
-        input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        input2_buffer_var = T.match_buffer(input2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        input2_buffer_var = T.match_buffer(input2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input1_buffer_var.data, T.lookup_param("p0", dtype="handle"), output1_buffer_var.data, dtype="int32"))
         T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input2_buffer_var.data, T.lookup_param("p1", dtype="handle"), output2_buffer_var.data, dtype="int32"))
@@ -110,9 +110,9 @@ class SingleInputTwoOutput:
     def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_1 in T.serial(0, 224):
             for ax2_1, ax3_inner_1 in T.grid(224, 3):
@@ -122,9 +122,9 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
     def __tvm_main__(input: T.handle, output1: T.handle, output2: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True})
-        input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input_buffer_var.data, T.lookup_param("p0", dtype="handle"), output1_buffer_var.data, dtype="int32"))
         T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input_buffer_var.data, T.lookup_param("p1", dtype="handle"), output2_buffer_var.data, dtype="int32"))
diff --git a/tests/python/unittest/test_tir_usmp_utils.py b/tests/python/unittest/test_tir_usmp_utils.py
index 6e53bcb5e597..155ff0962def 100644
--- a/tests/python/unittest/test_tir_usmp_utils.py
+++ b/tests/python/unittest/test_tir_usmp_utils.py
@@ -31,9 +31,9 @@ class LinearStructure:
     def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
-        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_1 in T.serial(0, 224):
             for ax2_1, ax3_inner_1 in T.grid(224, 3):
@@ -43,10 +43,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True})
-        placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1)
-        placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-        T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
+        placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1)
+        placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+        T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_7 = T.allocate([157323], "int16", "global")
         for i0_i1_fused_7 in T.serial(0, 229):
@@ -65,8 +65,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
     def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True})
-        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         tensor_2 = T.allocate([200704], "uint8", "global")
         for ax0_ax1_fused_4 in T.serial(0, 56):
diff --git a/tests/python/unittest/test_tvmscript_complete.py b/tests/python/unittest/test_tvmscript_complete.py
index c4b4afb24f82..29ac5dc5da0d 100644
--- a/tests/python/unittest/test_tvmscript_complete.py
+++ b/tests/python/unittest/test_tvmscript_complete.py
@@ -201,12 +201,12 @@ def func_with_bufferslice_indices(data: T.handle, index: T.handle) -> None:
 
 @T.prim_func
 def expected_bufferslice_indices(data: T.handle, index: T.handle) -> None:
-    index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-    data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=128, offset_factor=1)
+    index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+    data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=64, offset_factor=1)
     with T.block("root"):
         T.reads([])
         T.writes([])
-        out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=128, offset_factor=1)
+        out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=64, offset_factor=1)
         for i0, i1 in T.grid(16, 16):
             with T.block():
                 vi, vj = T.axis.remap("SS", [i0, i1])
@@ -229,12 +229,12 @@ def func_with_recursive_bufferslice_indices(data: T.handle, index: T.handle) ->
 
 @T.prim_func
 def expected_recursive_bufferslice_indices(data: T.handle, index: T.handle) -> None:
-    index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=128, offset_factor=1)
-    data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=128, offset_factor=1)
+    index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=64, offset_factor=1)
+    data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=64, offset_factor=1)
     with T.block("root"):
         T.reads([])
         T.writes([])
-        out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=128, offset_factor=1)
+        out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=64, offset_factor=1)
         for i0, i1 in T.grid(16, 16):
             with T.block():
                 vi, vj = T.axis.remap("SS", [i0, i1])
@@ -303,12 +303,12 @@ def alloc_buffer_func(a: T.handle, b: T.handle) -> None:
 
 @T.prim_func
 def expect_alloc_buffer_func(a: T.handle, b: T.handle) -> None:
-    A = T.match_buffer(a, [2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1)
-    B = T.match_buffer(b, [2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1)
+    A = T.match_buffer(a, [2, 2], dtype="float32", elem_offset=0, align=64, offset_factor=1)
+    B = T.match_buffer(b, [2, 2], dtype="float32", elem_offset=0, align=64, offset_factor=1)
     with T.block("root"):
         T.reads([])
         T.writes([])
-        C = T.alloc_buffer([2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1)
+        C = T.alloc_buffer([2, 2], dtype="float32", elem_offset=0, align=64, offset_factor=1)
         A[(0, 0)] = T.float32(2)
         C[(0, 0)] = A[(0, 0)] + B[(0, 0)]
         B[(0, 0)] = C[(0, 0)]
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index e5f5ae752aac..e98f5057d8c4 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -34,11 +34,11 @@ def mmult(A: T.handle, B: T.handle, C: T.handle) -> None:
             # function attr dict
             T.func_attr({"global_symbol": "mmult", "tir.noalias": True})
             # buffer definition
-            C_global = T.buffer_decl([1024, 1024], elem_offset=0, align=128, offset_factor=1)
-            packedB = T.buffer_decl([32, 1024, 32], elem_offset=0, align=128, offset_factor=1)
-            A_1 = T.match_buffer(A, [1024, 1024], elem_offset=0, align=128, offset_factor=1)
-            B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=128, offset_factor=1)
-            C_1 = T.match_buffer(C, [1024, 1024], elem_offset=0, align=128, offset_factor=1)
+            C_global = T.buffer_decl([1024, 1024], elem_offset=0, align=64, offset_factor=1)
+            packedB = T.buffer_decl([32, 1024, 32], elem_offset=0, align=64, offset_factor=1)
+            A_1 = T.match_buffer(A, [1024, 1024], elem_offset=0, align=64, offset_factor=1)
+            B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=64, offset_factor=1)
+            C_1 = T.match_buffer(C, [1024, 1024], elem_offset=0, align=64, offset_factor=1)
             # body
             T.realize(packedB[0:32, 0:1024, 0:32], "")
             for x in T.parallel(0, 32):
@@ -90,9 +90,9 @@ class Module:
         def mmult(A: T.handle, B: T.handle, C: T.handle) -> None:
             # function attr dict
             T.func_attr({"global_symbol": "mmult", "tir.noalias": True})
-            A_1 = T.match_buffer(A, [1024 * 1024], elem_offset=0, align=128, offset_factor=1)
-            B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=128, offset_factor=1)
-            C_1 = T.match_buffer(C, [1024 * 1024], elem_offset=0, align=128, offset_factor=1)
+            A_1 = T.match_buffer(A, [1024 * 1024], elem_offset=0, align=64, offset_factor=1)
+            B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=64, offset_factor=1)
+            C_1 = T.match_buffer(C, [1024 * 1024], elem_offset=0, align=64, offset_factor=1)
             # body
             packedB = T.allocate([32768], "float32", "global")
             for x in T.parallel(0, 32):
@@ -484,10 +484,10 @@ def func(A: T.handle, W: T.handle, Conv: T.handle) -> None:
         tz = T.env_thread("threadIdx.z")
         # buffer definition
         Apad_shared = T.buffer_decl(
-            [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1
+            [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1
         )
         Apad_shared_wmma_matrix_a = T.buffer_decl(
-            [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1
+            [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1
         )
         BA = T.buffer_decl(
             [16, 16], dtype="float16", scope="wmma.matrix_a", align=32, offset_factor=256
@@ -497,13 +497,13 @@ def func(A: T.handle, W: T.handle, Conv: T.handle) -> None:
         )
         BC = T.buffer_decl([16, 16], scope="wmma.accumulator", align=32, offset_factor=256)
         Conv_wmma_accumulator = T.buffer_decl(
-            [16, 14, 14, 32, 16, 16], elem_offset=0, align=128, offset_factor=1
+            [16, 14, 14, 32, 16, 16], elem_offset=0, align=64, offset_factor=1
         )
         W_shared = T.buffer_decl(
-            [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1
+            [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1
         )
         W_shared_wmma_matrix_b = T.buffer_decl(
-            [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1
+            [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1
         )
         buffer = T.buffer_decl(
             [16, 16], dtype="float16", scope="shared", align=32, offset_factor=256
@@ -520,13 +520,13 @@ def func(A: T.handle, W: T.handle, Conv: T.handle) -> None:
         buffer_4 = T.buffer_decl([16, 16], scope="wmma.accumulator", align=32, offset_factor=256)
         buffer_5 = T.buffer_decl([16, 16], align=32, offset_factor=256)
         A_1 = T.match_buffer(
-            A, [16, 14, 14, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1
+            A, [16, 14, 14, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1
         )
         W_1 = T.match_buffer(
-            W, [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1
+            W, [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1
         )
         Conv_1 = T.match_buffer(
-            Conv, [16, 14, 14, 32, 16, 16], elem_offset=0, align=128, offset_factor=1
+            Conv, [16, 14, 14, 32, 16, 16], elem_offset=0, align=64, offset_factor=1
         )
         # body
         T.realize(Conv_1[0:16, 0:14, 0:14, 0:32, 0:16, 0:16], "")
@@ -2958,8 +2958,8 @@ def primfunc_with_allocate_annotations():
     def primfunc_with_allocate_annotations(placeholder_28: T.handle, T_cast_6: T.handle) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True})
-        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
-        T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1)
+        placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
+        T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         tensor_2 = T.allocate([200704], "uint8", "global", annotations={"attr1_key": "attr1_value"})
         for ax0_ax1_fused_4 in T.serial(0, 56):
diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py
index 329a397724f3..d955ec0a8c80 100644
--- a/tests/python/unittest/test_tvmscript_syntax_sugar.py
+++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py
@@ -288,9 +288,9 @@ def shared_16x16_to_ldmatrix_32x8_layout(i, j):
 
     @T.prim_func
     def mma_sync_m16n16k16_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
-        A = T.match_buffer(a, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
-        B = T.match_buffer(b, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
-        C = T.match_buffer(c, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
+        A = T.match_buffer(a, (32, 8), "float16", align=64, offset_factor=16, scope="warp")
+        B = T.match_buffer(b, (32, 8), "float16", align=64, offset_factor=16, scope="warp")
+        C = T.match_buffer(c, (32, 8), "float16", align=64, offset_factor=16, scope="warp")
 
         with T.block("root"):
             T.reads(C[0:32, 0:8], A[0:32, 0:8], B[0:32, 0:8])
@@ -315,9 +315,9 @@ def mma_sync_m16n16k16_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
 
     @T.prim_func
     def mma_sync_m16n16k16_desc_manual(a: T.handle, b: T.handle, c: T.handle) -> None:
-        A = T.match_buffer(a, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
-        B = T.match_buffer(b, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
-        C = T.match_buffer(c, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
+        A = T.match_buffer(a, (32, 8), "float16", align=64, offset_factor=16, scope="warp")
+        B = T.match_buffer(b, (32, 8), "float16", align=64, offset_factor=16, scope="warp")
+        C = T.match_buffer(c, (32, 8), "float16", align=64, offset_factor=16, scope="warp")
 
         with T.block("root"):
             T.reads(C[0:32, 0:8], A[0:32, 0:8], B[0:32, 0:8])
diff --git a/tests/python/unittest/test_tvmscript_type.py b/tests/python/unittest/test_tvmscript_type.py
index 12954e31e5ec..8228363a95ac 100644
--- a/tests/python/unittest/test_tvmscript_type.py
+++ b/tests/python/unittest/test_tvmscript_type.py
@@ -25,13 +25,13 @@
 
 @T.prim_func
 def element_wise_storage_align(a: T.handle, c: T.handle) -> None:
-    C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1)
-    A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1)
+    C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1)
+    A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1)
     # body
     with T.block("root"):
         T.reads([])
         T.writes([])
-        B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1)
+        B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1)
         for i0 in T.serial(0, 128):
             for ax1 in T.serial(0, 128):
                 with T.block("B"):

From 5db38ba8993d30ab0a89c82ff69e582d1bcc1678 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Thu, 25 Aug 2022 21:06:54 +0100
Subject: [PATCH 050/704] [COMMUNITY] @cconvey -> Reviewer (#12598)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index e3b4fe339a4f..1f9808ff2510 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -97,6 +97,7 @@ We do encourage everyone to work anything they are interested in.
 - [Zhi Chen](https://github.com/zhiics): @zhiics
 - [Valery Chernov](https://github.com/vvchernov): @vvchernov
 - [Neo Chien](https://github.com/cchung100m): @cchung100m
+- [Christian Convey](https://github.com/cconvey/): @cconvey
 - [Meghan Cowan](https://github.com/cowanmeg): @cowanmeg
 - [Balint Cristian](https://github.com/cbalint13): @cbalint13
 - [Egor Churaev](https://github.com/echuraev): @echuraev - metal

From a9f7c32e42a5f09e641dbe83f81cc4a73869af12 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Thu, 25 Aug 2022 14:21:13 -0700
Subject: [PATCH 051/704] [skip ci][Community] Wuwei Lin -> PMC (#12605)

[Community] Wuwei Lin -> PMC
---
 CONTRIBUTORS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 1f9808ff2510..771eb1c63eda 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -49,7 +49,7 @@ We do encourage everyone to work anything they are interested in.
 - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay
 - [Tristan Konolige](https://github.com/tkonolige): @tkonolige - profiling, relay, tir, runtime
 - [Ruihang Lai](https://github.com/MasterJH5574): @MasterJH5574 - tir, tvm-script
-- [Wuwei Lin](https://github.com/vinx13): @vinx13 - relay, topi
+- [Wuwei Lin](https://github.com/vinx13) (PMC): @vinx13 - relay, topi, tir, meta_schedule
 - [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
 - [Hao Lu](https://github.com/hlu1): @hlu1 - nnpack, frontends
 - [Eric Lunderberg](https://github.com/Lunderberg): @Lunderberg - CI, Vulkan backend

From 3224817d0835909c2673184a6c20bac3b7672632 Mon Sep 17 00:00:00 2001
From: WANG Zihan <wzh1999_frog@126.com>
Date: Fri, 26 Aug 2022 14:19:19 +0800
Subject: [PATCH 052/704] [TOPI][Bugfix] Make semantics of empty `axis` in
 `squeeze` consistent with Relay (#12596)

* Fix empty axis of `squeeze` in TOPI.

* Add test case for `squeeze` with empty `axis`.

* Add LLVM target for `test_squeeze`.
---
 include/tvm/topi/transform.h                    | 4 ++--
 tests/python/topi/python/test_topi_transform.py | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index 81935dd72dda..7accbf86912d 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -396,7 +396,7 @@ inline Tensor unravel_index(const Tensor& x, const Tensor& shape, std::string na
  * The removed dimensions must have a constant size of 1.
  *
  * \param x The input tensor
- * \param axis Indices of the dimensions to remove. If this is empty,
+ * \param axis Indices of the dimensions to remove. If this is None,
  * all entries with a constant size of 1 will be removed.
  * \param atleast1d Whether the output need to be atleast1d.
  * \param name The name of the operation
@@ -408,7 +408,7 @@ inline Tensor squeeze(const Tensor& x, Array<Integer> axis, bool atleast1d = fal
                       std::string name = "T_squeeze", std::string tag = kInjective) {
   auto ndim = x->shape.size();
   std::vector<int> axis_val;
-  if (!axis.defined() || axis.size() == 0) {
+  if (!axis.defined()) {
     for (size_t i = 0; i < ndim; ++i) {
       if (IsConstInt(x->shape[i]) && GetConstInt(x->shape[i]) == 1) {
         axis_val.push_back(static_cast<int>(i));
diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py
index c3155c948a8d..dd5ad1b11926 100644
--- a/tests/python/topi/python/test_topi_transform.py
+++ b/tests/python/topi/python/test_topi_transform.py
@@ -940,18 +940,19 @@ def test_where():
     verify_where((1, 2, 3, 4))
 
 
-@tvm.testing.requires_gpu
+@tvm.testing.uses_gpu
 def test_squeeze():
     verify_squeeze((1, 2, 3, 4), 0)
     verify_squeeze((1, 2, 1, 4), None)
     verify_squeeze((1, 1, 1, 4), (1, 2))
     verify_squeeze((1, 1, 1, 1), None)
+    verify_squeeze((1, 1, 1, 1), ())
 
     # a special case to trigger inline let expression
     A = te.placeholder((2,), "float32", "A")
     E = topi.squeeze(A)
     C = te.compute((1,), lambda i: E[(2 * A[0] - 1).astype("int32")])
-    for target in ["cuda", "opencl"]:
+    for target in ["llvm", "cuda", "opencl"]:
         dev = tvm.device(target, 0)
         if tvm.testing.device_enabled(target):
             with tvm.target.Target(target):

From 4f431c87c2b8bb5ea0773c44d92658e506251dda Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Fri, 26 Aug 2022 02:30:38 -0700
Subject: [PATCH 053/704] [TIR] Expose Memory Copy-Related PTX Builtins
 (#12611)

* Expose Memory Copy-Related PTX Builtins

This PR exposes the following TIR operation in python:

`ptx_ldmatrix`: tested
`ptx_cp_async`: tested
`ptx_commit_group`: tested
`ptx_wait_group`: tested

Co-authored-by: yongwww <yongcale@gmail.com>

* apply code review suggestion

Co-authored-by: yongwww <yongcale@gmail.com>
---
 python/tvm/tir/__init__.py                 |   1 +
 python/tvm/tir/op.py                       | 111 +++++++++++++++++++++
 tests/python/unittest/test_tir_op_types.py |  54 +++++-----
 3 files changed, 140 insertions(+), 26 deletions(-)

diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 04ab7f80daa9..4a6f32d03a2b 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -59,6 +59,7 @@
     tvm_bmma_sync,
     tvm_fill_fragment,
 )
+from .op import ptx_ldmatrix, ptx_cp_async, ptx_commit_group, ptx_wait_group
 from .op import vectorlow, vectorhigh, vectorcombine
 from .op import infinity, reinterpret
 from .op import exp, exp2, exp10, log, log2, log10, log1p, ldexp, clz
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index cf7985e8f489..e510f68a68a1 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -831,6 +831,117 @@ def tvm_store_matrix_sync(fragment, m, n, k, index, buffer_ptr, stride, layout):
     )
 
 
+def ptx_ldmatrix(dtype, trans, num, type, local_ptr, local_offset, smem_ptr, smem_offset):
+    """TVM intrinsic for ptx load matrix from shared memory
+    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
+
+    Parameters
+    ----------
+    dtype : str
+       The data type of the result.
+
+    trans : bool
+        The matrix is loaded in column-major format.
+
+    num : IntImm
+        The number of matrices.
+
+    type : Literal[".b16"]
+        The data type of the matrices.
+
+    local_ptr : Var
+        The local pointer variable.
+
+    local_offset : Expr
+        The offset of local pointer.
+
+    smem_ptr : Var
+        The shared memory pointer variable.
+
+    smem_offset : Expr
+        The offset of shared memort pointer.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(
+        dtype,
+        "tir.ptx_ldmatrix",
+        trans,
+        num,
+        type,
+        local_ptr,
+        local_offset,
+        smem_ptr,
+        smem_offset,
+    )
+
+
+def ptx_cp_async(dtype, shared_ptr, shared_offset, global_ptr, global_offset, bytes):
+    """TVM intrinsic for ptx async copy from global to shared memory
+    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
+
+    Parameters
+    ----------
+    dtype : str
+       The data type of the result.
+
+    shared_ptr : Var
+        The shared memory pointer variable.
+
+    shared_offset : Expr
+        The offset of shared memory pointer.
+
+    global_ptr : Var
+        The global memory pointer variable.
+
+    global_offset : Expr
+        The offset of global memory pointer.
+
+    bytes : int
+        The data size to copy.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(
+        dtype, "tir.ptx_cp_async", shared_ptr, shared_offset, global_ptr, global_offset, bytes
+    )
+
+
+def ptx_commit_group():
+    """TVM intrinsic for ptx async copy commit
+    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin("", "tir.ptx_commit_group")
+
+
+def ptx_wait_group(num):
+    """TVM intrinsic for ptx async copy wait
+    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group
+
+    Parameters
+    ----------
+    num : int
+        The number of the most recent uncommitted pending cp.async groups to wait.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin("", "tir.ptx_wait_group", num)
+
+
 def vectorlow(dtype, vec):
     """Get the low level half of the vector
 
diff --git a/tests/python/unittest/test_tir_op_types.py b/tests/python/unittest/test_tir_op_types.py
index 5254e7326e24..f8e8de074c42 100644
--- a/tests/python/unittest/test_tir_op_types.py
+++ b/tests/python/unittest/test_tir_op_types.py
@@ -16,6 +16,7 @@
 # under the License.
 # pylint: disable=missing-docstring
 import tvm
+import tvm.testing
 from tvm import tir
 
 
@@ -142,6 +143,32 @@ def test_tir_op_tvm_fill_fragment():
     assert expr.op.name == "tir.tvm_fill_fragment"
 
 
+def test_op_ptx_ldmatrix():
+    buffer_shared = tir.decl_buffer([16, 16], "float16", scope="shared")
+    buffer_local = tir.decl_buffer([8], "float16", scope="local")
+    expr = tir.ptx_ldmatrix(
+        "float16", False, 4, ".b16", buffer_local.data, 0, buffer_shared.data, 0
+    )
+    assert expr.op.name == "tir.ptx_ldmatrix"
+
+
+def test_op_ptx_cp_async():
+    buffer_shared = tir.decl_buffer([16, 16], "float16", scope="shared")
+    buffer_local = tir.decl_buffer([8], "float16", scope="local")
+    expr = tir.ptx_cp_async("float16", buffer_shared.data, 0, buffer_local.data, 0, 16)
+    assert expr.op.name == "tir.ptx_cp_async"
+
+
+def test_op_ptx_commit_group():
+    expr = tir.ptx_commit_group()
+    assert expr.op.name == "tir.ptx_commit_group"
+
+
+def test_op_ptx_wait_group():
+    expr = tir.ptx_wait_group(8)
+    assert expr.op.name == "tir.ptx_wait_group"
+
+
 def test_tir_op_vectorlow():
     buffer = tir.decl_buffer((4, 4), "int8", offset_factor=1)
     vec = buffer.vload([0, 0], dtype="int8x16")
@@ -189,29 +216,4 @@ def test_tir_op_TVMBackendFreeWorkspace():
 
 
 if __name__ == "__main__":
-    test_tir_op_tvm_tuple()
-    test_tir_op_tvm_struct_get()
-    test_tir_op_tvm_struct_set()
-    test_tir_op_address_of()
-    test_tir_op_lookup_param()
-    test_tir_op_reinterpret()
-    test_tir_op_isnullptr()
-    test_tir_op_call_assume()
-    test_tir_op_call_undef()
-    test_tir_op_call_likely()
-    test_tir_op_tvm_thread_allreduce()
-    test_tir_op_type_annotation()
-    test_tir_op_tvm_access_ptr()
-    test_tir_op_tvm_throw_last_error()
-    test_tir_op_tvm_load_matrix_sync(),
-    test_tir_op_tvm_store_matrix_sync(),
-    test_tir_op_tvm_mma_sync(),
-    test_tir_op_tvm_bmma_sync(),
-    test_tir_op_tvm_fill_fragment(),
-    test_tir_op_vectorlow()
-    test_tir_op_vectorhigh()
-    test_tir_op_vectorcombine()
-    test_tir_op_shift_left()
-    test_tir_op_shift_right()
-    test_tir_op_TVMBackendAllocWorkspace()
-    test_tir_op_TVMBackendFreeWorkspace()
+    tvm.testing.main()

From e02f2f9fddd8cd38589e3569c41de9f7af39971c Mon Sep 17 00:00:00 2001
From: "yin.changsheng" <yin.changsheng@intellif.com>
Date: Fri, 26 Aug 2022 19:42:57 +0800
Subject: [PATCH 054/704] [TIR][Schedule] enhance compute_at and
 reverse_compute_at primitive to choose possible position (#12450)

Current TIR "compute_at" primitive will compute at it's closest consumers. When a block has multiple producers, whoever compute at later who is behind. But for some special hardware, we usually hope keep the a certain order whatever it's compute at early or late.
eg: block A and block B are producers of block C. block A compute at block C first and block B compute at block C later. We hope the result is block B->block A->block C under some loop var.
---
 include/tvm/tir/schedule/schedule.h           |  14 +-
 python/tvm/tir/schedule/schedule.py           |  16 ++
 src/tir/schedule/concrete_schedule.cc         |   8 +-
 src/tir/schedule/concrete_schedule.h          |   7 +-
 src/tir/schedule/primitive.h                  |  13 +-
 src/tir/schedule/primitive/compute_at.cc      |  67 +++++---
 src/tir/schedule/traced_schedule.cc           |  19 +--
 src/tir/schedule/traced_schedule.h            |   7 +-
 ...le_schedule_rule_cross_thread_reduction.py |  16 +-
 ...hedule_schedule_rule_multi_level_tiling.py |  86 +++++-----
 ...e_schedule_rule_random_compute_location.py |   2 +-
 .../unittest/test_tir_schedule_compute_at.py  | 152 ++++++++++++++++++
 12 files changed, 308 insertions(+), 99 deletions(-)

diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h
index 11fec642c718..da399ab976d6 100644
--- a/include/tvm/tir/schedule/schedule.h
+++ b/include/tvm/tir/schedule/schedule.h
@@ -432,9 +432,13 @@ class ScheduleNode : public runtime::Object {
    * \param block_rv The block to be moved
    * \param loop_rv The loop where the block to be moved under
    * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1
+   * \param index The block index of the loop body subtree blocks:
+   * - `index = -1` means inserted into the last possible insertion point;
+   * - `index = -2` means inserted into the first possible insertion point;
+   * - Otherwise, `index` is a nonnegative number that indicates the insertion point
    */
-  virtual void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
-                         bool preserve_unit_loops) = 0;
+  virtual void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops,
+                         int index = -1) = 0;
   /*!
    * \brief Move a consumer block under the specific loop, and regenerate the
    * loops induced by the block so that the buffer region consumed by the consumer block could
@@ -449,9 +453,13 @@ class ScheduleNode : public runtime::Object {
    * \param block_rv The block to be moved
    * \param loop_rv The loop where the block to be moved under
    * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1
+   * \param index The block index of the loop body subtree blocks:
+   * - `index = -1` means inserted into the last possible insertion point;
+   * - `index = -2` means inserted into the first possible insertion point;
+   * - Otherwise, `index` is a nonnegative number that indicates the insertion point
    */
   virtual void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
-                                bool preserve_unit_loops) = 0;
+                                bool preserve_unit_loops, int index = -1) = 0;
   /*!
    * \brief Inline a block into its consumer(s). It requires:
    * 1) The block is a complete non-root block, which only produces one buffer
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index e18bee35a5e1..04cc1bc26ad1 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -1274,6 +1274,7 @@ def compute_at(
         block: Union[BlockRV, str],
         loop: LoopRV,
         preserve_unit_loops: bool = False,
+        index: int = -1,
     ) -> None:
         """Compute-At. Move a producer block under the specific loop, and regenerate the
         loops induced by the block so that the buffer region produced by the producer block could
@@ -1303,6 +1304,12 @@ def compute_at(
         preserve_unit_loops: bool
             Whether to keep the trivial loops whose extents are 1
 
+        index: int
+            The block index of the loop body subtree blocks:
+            - `index = -1` means inserted into the last possible insertion point;
+            - `index = -2` means inserted into the first possible insertion point;
+            - Otherwise, `index` is a nonnegative number that indicates the insertion point
+
         Examples
         --------
 
@@ -1360,6 +1367,7 @@ def after_compute_at(a: T.handle, c: T.handle) -> None:
             block,
             loop,
             preserve_unit_loops,
+            index,
         )
 
     @type_checked
@@ -1368,6 +1376,7 @@ def reverse_compute_at(
         block: Union[BlockRV, str],
         loop: LoopRV,
         preserve_unit_loops: bool = False,
+        index: int = -1,
     ) -> None:
         """Reverse-Compute-At. Move a consumer block under the specific loop, and regenerate the
         loops induced by the block so that the buffer region consumed by the consumer block could
@@ -1394,6 +1403,12 @@ def reverse_compute_at(
         preserve_unit_loops: bool
             Whether to keep the trivial loops whose extents are 1
 
+        index: int
+            The block index of the loop body subtree blocks:
+            - `index = -1` means inserted into the last possible insertion point;
+            - `index = -2` means inserted into the first possible insertion point;
+            - Otherwise, `index` is a nonnegative number that indicates the insertion point
+
         Examples
         --------
 
@@ -1451,6 +1466,7 @@ def after_reverse_compute_at(a: T.handle, c: T.handle) -> None:
             block,
             loop,
             preserve_unit_loops,
+            index,
         )
 
     @type_checked
diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc
index c16638f748b4..5f773a02d6ff 100644
--- a/src/tir/schedule/concrete_schedule.cc
+++ b/src/tir/schedule/concrete_schedule.cc
@@ -574,7 +574,7 @@ BlockRV ConcreteScheduleNode::ReIndex(const BlockRV& block_rv, int buffer_index,
 /******** Schedule: Compute location ********/
 
 void ConcreteScheduleNode::ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
-                                     bool preserve_unit_loops) {
+                                     bool preserve_unit_loops, int index) {
   static StmtSRef inline_mark = StmtSRef::InlineMark();
   static StmtSRef root_mark = StmtSRef::RootMark();
   StmtSRef loop_sref = this->GetSRef(loop_rv);
@@ -586,14 +586,14 @@ void ConcreteScheduleNode::ComputeAt(const BlockRV& block_rv, const LoopRV& loop
     TVM_TIR_SCHEDULE_END("compute-at", this->error_render_level_);
   } else {
     TVM_TIR_SCHEDULE_BEGIN();
-    tir::ComputeAt(state_, this->GetSRef(block_rv), loop_sref, preserve_unit_loops);
+    tir::ComputeAt(state_, this->GetSRef(block_rv), loop_sref, preserve_unit_loops, index);
     TVM_TIR_SCHEDULE_END("compute-at", this->error_render_level_);
   }
   this->state_->DebugVerify();
 }
 
 void ConcreteScheduleNode::ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
-                                            bool preserve_unit_loops) {
+                                            bool preserve_unit_loops, int index) {
   static StmtSRef inline_mark = StmtSRef::InlineMark();
   static StmtSRef root_mark = StmtSRef::RootMark();
   StmtSRef loop_sref = this->GetSRef(loop_rv);
@@ -605,7 +605,7 @@ void ConcreteScheduleNode::ReverseComputeAt(const BlockRV& block_rv, const LoopR
     TVM_TIR_SCHEDULE_END("reverse-compute-at", this->error_render_level_);
   } else {
     TVM_TIR_SCHEDULE_BEGIN();
-    tir::ReverseComputeAt(state_, this->GetSRef(block_rv), loop_sref, preserve_unit_loops);
+    tir::ReverseComputeAt(state_, this->GetSRef(block_rv), loop_sref, preserve_unit_loops, index);
     TVM_TIR_SCHEDULE_END("reverse-compute-at", this->error_render_level_);
   }
   this->state_->DebugVerify();
diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h
index cdd0a5b7b0a2..92b9de408873 100644
--- a/src/tir/schedule/concrete_schedule.h
+++ b/src/tir/schedule/concrete_schedule.h
@@ -119,9 +119,10 @@ class ConcreteScheduleNode : public ScheduleNode {
   BlockRV ReIndex(const BlockRV& block_rv, int buffer_index,
                   BufferIndexType buffer_index_type) override;
   /******** Schedule: Compute location ********/
-  void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops) override;
-  void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
-                        bool preserve_unit_loops) override;
+  void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops,
+                 int index = -1) override;
+  void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops,
+                        int index = -1) override;
   void ComputeInline(const BlockRV& block) override;
   void ReverseComputeInline(const BlockRV& block) override;
   /******** Schedule: Reduction ********/
diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h
index 14203a0d167e..05d9e4cf944a 100644
--- a/src/tir/schedule/primitive.h
+++ b/src/tir/schedule/primitive.h
@@ -299,10 +299,13 @@ TVM_DLL StmtSRef ReIndex(ScheduleState self, const StmtSRef& block_sref, int buf
  * \param self The schedule state
  * \param block_sref The block to be moved
  * \param loop_sref The loop where the block to be moved to
- * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1
+ * \param index The block index of the loop body subtree blocks:
+ * - `index = -1` means inserted into the last possible insertion point;
+ * - `index = -2` means inserted into the first possible insertion point;
+ * - Otherwise, `index` is a nonnegative number that indicates the insertion point
  */
 TVM_DLL void ComputeAt(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref,
-                       bool preserve_unit_loops);
+                       bool preserve_unit_loops, int index = -1);
 /*!
  * \brief Move a consumer block under the specific loop, and regenerate the
  * loops induced by the block so that the buffer region consumed by the consumer block could
@@ -318,9 +321,13 @@ TVM_DLL void ComputeAt(ScheduleState self, const StmtSRef& block_sref, const Stm
  * \param block_sref The block to be moved
  * \param loop_sref The loop where the block to be moved to
  * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1
+ * \param index The block index of the loop body subtree blocks:
+ * - `index = -1` means inserted into the last possible insertion point;
+ * - `index = -2` means inserted into the first possible insertion point;
+ * - Otherwise, `index` is a nonnegative number that indicates the insertion point
  */
 TVM_DLL void ReverseComputeAt(ScheduleState self, const StmtSRef& block_sref,
-                              const StmtSRef& loop_sref, bool preserve_unit_loops);
+                              const StmtSRef& loop_sref, bool preserve_unit_loops, int index = -1);
 /*!
  * \brief Inline a block into its consumer(s). It requires:
  * 1) The block is a complete non-root block, which only produces one buffer
diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc
index 98a6b2400ee3..8baedfd70dd0 100644
--- a/src/tir/schedule/primitive/compute_at.cc
+++ b/src/tir/schedule/primitive/compute_at.cc
@@ -129,15 +129,19 @@ class NotInSameScopeError : public ScheduleError {
  * \param producer_srefs The producer blocks
  * \param consumer_srefs The consumer blocks
  * \param block2realize A cache that maps a block to its realize
- * \return The last position the new block can be inserted onto, and the
+ * \param index The block index of the loop body subtree blocks:
+ * - `index = -1` means inserted into the last possible insertion point;
+ * - `index = -2` means inserted into the first possible insertion point;
+ * - Otherwise, `index` is a nonnegative number that indicates the insertion point
+ * \return The possible position the new block can be inserted into, and the
  * producer-consumer-relationship is still satisfied.
  * \throws ScheduleError if there is no such insertion point found
  */
 template <bool require_all_producers_visited, bool require_all_consumers_visited>
-int FindInsertionPoint(
-    const ScheduleState& self, const Array<Stmt>& subtrees, const Array<StmtSRef>& producer_srefs,
-    const Array<StmtSRef>& consumer_srefs,
-    std::unordered_map<const BlockNode*, const BlockRealizeNode*>* block2realize) {
+int FindInsertionPoint(const ScheduleState& self, const Array<Stmt>& subtrees,
+                       const Array<StmtSRef>& producer_srefs, const Array<StmtSRef>& consumer_srefs,
+                       std::unordered_map<const BlockNode*, const BlockRealizeNode*>* block2realize,
+                       int index) {
   ProducerConsumerSplit split =
       ProducerConsumerSplit::Find(self, subtrees, producer_srefs, consumer_srefs, block2realize);
   // Step 1. Check if all the producers are visited in the subtrees, if required to
@@ -159,8 +163,22 @@ int FindInsertionPoint(
   // Step 3. Check if there is at least one index of the position can be inserted into
   // The valid indices are: (last_producer_position, first_consumer_position]
   ICHECK(split.last_producer_position < split.first_consumer_position);
-  // Step 4. Return the last valid insertion point
-  return split.first_consumer_position;
+  // Step 4. Return the possible insertion point according to index
+  int insert_position;
+  if (index == -1) {
+    insert_position = split.first_consumer_position;
+  } else if (index == -2) {
+    insert_position = split.last_producer_position + 1;
+  } else if (index >= 0 && index >= split.last_producer_position + 1 &&
+             index <= split.first_consumer_position) {
+    insert_position = index;
+  } else {
+    LOG(FATAL) << "Valid index:(-1, -2, [" << split.last_producer_position + 1 << ", "
+               << split.first_consumer_position << "]), "
+               << "current index=" << index;
+    throw;
+  }
+  return insert_position;
 }
 
 /*!
@@ -556,7 +574,8 @@ void CalculateProvidedRequiredRegions(
 template <bool is_compute_at>
 void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_sref,
                                      const StmtSRef& loop_sref, bool preserve_unit_loops,
-                                     arith::Analyzer* analyzer, bool check_only = false) {
+                                     arith::Analyzer* analyzer, bool check_only = false,
+                                     int index = -1) {
   const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
   const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
   // Step 1. Bunch of checks
@@ -588,7 +607,8 @@ void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_s
       /*self=*/self,
       /*subtrees=*/AsArray(loop->body),
       /*producer_srefs=*/producer_srefs,
-      /*consumer_srefs=*/consumer_srefs, /*block2realize=*/&block2realize);
+      /*consumer_srefs=*/consumer_srefs, /*block2realize=*/&block2realize,
+      /*index=*/index);
   // Step 4. Calculate the region provided by a single execution instance of `block`,
   // as well as the region required by dependent blocks under `loop`.
   // Here is the definition of `provide` and `require`:
@@ -626,17 +646,17 @@ void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_s
 }
 
 void ComputeAt(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref,
-               bool preserve_unit_loops) {
+               bool preserve_unit_loops, int index) {
   arith::Analyzer analyzer;
-  ComputeAtOrReverseComputeAtImpl<true>(self, block_sref, loop_sref, preserve_unit_loops,
-                                        &analyzer);
+  ComputeAtOrReverseComputeAtImpl<true>(self, block_sref, loop_sref, preserve_unit_loops, &analyzer,
+                                        false, index);
 }
 
 void ReverseComputeAt(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref,
-                      bool preserve_unit_loops) {
+                      bool preserve_unit_loops, int index) {
   arith::Analyzer analyzer;
   ComputeAtOrReverseComputeAtImpl<false>(self, block_sref, loop_sref, preserve_unit_loops,
-                                         &analyzer);
+                                         &analyzer, false, index);
 }
 
 bool CanComputeAt(const ScheduleState& self, const StmtSRef& block_sref, const StmtSRef& loop_sref,
@@ -671,20 +691,21 @@ struct ComputeAtTraits : public UnpackedInstTraits<ComputeAtTraits> {
 
  private:
   static constexpr size_t kNumInputs = 2;
-  static constexpr size_t kNumAttrs = 1;
+  static constexpr size_t kNumAttrs = 2;
   static constexpr size_t kNumDecisions = 0;
 
   static void UnpackedApplyToSchedule(Schedule sch, BlockRV block_rv, LoopRV loop_rv,
-                                      Bool preserve_unit_loops) {
-    return sch->ComputeAt(block_rv, loop_rv, preserve_unit_loops.operator bool());
+                                      Bool preserve_unit_loops, IntImm index) {
+    return sch->ComputeAt(block_rv, loop_rv, preserve_unit_loops.operator bool(), index->value);
   }
 
   static String UnpackedAsPython(Array<String> outputs, String block_rv, String loop_rv,
-                                 Bool preserve_unit_loops) {
+                                 Bool preserve_unit_loops, IntImm index) {
     PythonAPICall py("compute_at");
     py.Input("block", block_rv);
     py.Input("loop", loop_rv);
     py.Input("preserve_unit_loops", preserve_unit_loops.operator bool());
+    py.Input("index", index);
     return py.Str();
   }
 
@@ -698,20 +719,22 @@ struct ReverseComputeAtTraits : public UnpackedInstTraits<ReverseComputeAtTraits
 
  private:
   static constexpr size_t kNumInputs = 2;
-  static constexpr size_t kNumAttrs = 1;
+  static constexpr size_t kNumAttrs = 2;
   static constexpr size_t kNumDecisions = 0;
 
   static void UnpackedApplyToSchedule(Schedule sch, BlockRV block_rv, LoopRV loop_rv,
-                                      Bool preserve_unit_loops) {
-    return sch->ReverseComputeAt(block_rv, loop_rv, preserve_unit_loops.operator bool());
+                                      Bool preserve_unit_loops, IntImm index) {
+    return sch->ReverseComputeAt(block_rv, loop_rv, preserve_unit_loops.operator bool(),
+                                 index->value);
   }
 
   static String UnpackedAsPython(Array<String> outputs, String block_rv, String loop_rv,
-                                 Bool preserve_unit_loops) {
+                                 Bool preserve_unit_loops, IntImm index) {
     PythonAPICall py("reverse_compute_at");
     py.Input("block", block_rv);
     py.Input("loop", loop_rv);
     py.Input("preserve_unit_loops", preserve_unit_loops.operator bool());
+    py.Input("index", index);
     return py.Str();
   }
 
diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc
index 07d4da54d7fb..04ddc0507dc4 100644
--- a/src/tir/schedule/traced_schedule.cc
+++ b/src/tir/schedule/traced_schedule.cc
@@ -322,24 +322,25 @@ BlockRV TracedScheduleNode::ReIndex(const BlockRV& block_rv, int buffer_index,
 /******** Schedule: Compute location ********/
 
 void TracedScheduleNode::ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
-                                   bool preserve_unit_loops) {
-  ConcreteScheduleNode::ComputeAt(block_rv, loop_rv, preserve_unit_loops);
+                                   bool preserve_unit_loops, int index) {
+  ConcreteScheduleNode::ComputeAt(block_rv, loop_rv, preserve_unit_loops, index);
 
   static const InstructionKind& kind = InstructionKind::Get("ComputeAt");
-  trace_->Append(/*inst=*/Instruction(/*kind=*/kind,
-                                      /*inputs=*/{block_rv, loop_rv},
-                                      /*attrs=*/{Integer(preserve_unit_loops)},
-                                      /*outputs=*/{}));
+  trace_->Append(
+      /*inst=*/Instruction(/*kind=*/kind,
+                           /*inputs=*/{block_rv, loop_rv},
+                           /*attrs=*/{Integer(preserve_unit_loops), Integer(index)},
+                           /*outputs=*/{}));
 }
 
 void TracedScheduleNode::ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
-                                          bool preserve_unit_loops) {
-  ConcreteScheduleNode::ReverseComputeAt(block_rv, loop_rv, preserve_unit_loops);
+                                          bool preserve_unit_loops, int index) {
+  ConcreteScheduleNode::ReverseComputeAt(block_rv, loop_rv, preserve_unit_loops, index);
 
   static const InstructionKind& kind = InstructionKind::Get("ReverseComputeAt");
   trace_->Append(/*inst=*/Instruction(/*kind=*/kind,
                                       /*inputs=*/{block_rv, loop_rv},
-                                      /*attrs=*/{Integer(preserve_unit_loops)},
+                                      /*attrs=*/{Integer(preserve_unit_loops), Integer(index)},
                                       /*outputs=*/{}));
 }
 
diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h
index 865a21687950..d98e4ba4bb95 100644
--- a/src/tir/schedule/traced_schedule.h
+++ b/src/tir/schedule/traced_schedule.h
@@ -79,9 +79,10 @@ class TracedScheduleNode : public ConcreteScheduleNode {
   BlockRV ReIndex(const BlockRV& block_rv, int buffer_index,
                   BufferIndexType buffer_index_type) final;
   /******** Schedule: Compute location ********/
-  void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops) final;
-  void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
-                        bool preserve_unit_loops) final;
+  void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops,
+                 int index = -1) final;
+  void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops,
+                        int index = -1) final;
   void ComputeInline(const BlockRV& block_rv) final;
   void ReverseComputeInline(const BlockRV& block_rv) final;
   /******** Schedule: Reduction ********/
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
index 5f76e77592e3..592d32d6245d 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
@@ -80,7 +80,7 @@ def test_gpu_softmax_mn():
             "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
             "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
             'sch.bind(loop=l6, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True)",
+            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)",
             'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
             "l7, l8, l9 = sch.get_loops(block=b0)",
             "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)",
@@ -93,7 +93,7 @@ def test_gpu_softmax_mn():
             "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
             "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
             'sch.bind(loop=l6, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True)",
+            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)",
             'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
             "l7, l8, l9 = sch.get_loops(block=b0)",
             "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)",
@@ -107,7 +107,7 @@ def test_gpu_softmax_mn():
             "v5 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
             "l6, l7 = sch.split(loop=l4, factors=[None, v5], preserve_unit_iters=True)",
             'sch.bind(loop=l7, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True)",
+            "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True, index=-1)",
             'sch.set_scope(block=b1, buffer_index=0, storage_scope="shared")',
             "l8, l9, l10 = sch.get_loops(block=b1)",
             "l11, l12 = sch.split(loop=l10, factors=[None, v5], preserve_unit_iters=True)",
@@ -117,7 +117,7 @@ def test_gpu_softmax_mn():
             "v16 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
             "l17, l18 = sch.split(loop=l15, factors=[None, v16], preserve_unit_iters=True)",
             'sch.bind(loop=l18, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b0, loop=l14, preserve_unit_loops=True)",
+            "sch.compute_at(block=b0, loop=l14, preserve_unit_loops=True, index=-1)",
             'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
             "l19, l20, l21 = sch.get_loops(block=b0)",
             "l22, l23 = sch.split(loop=l21, factors=[None, v16], preserve_unit_iters=True)",
@@ -157,7 +157,7 @@ def test_gpu_softmax_mn_after_inline():
             "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
             "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
             'sch.bind(loop=l6, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True)",
+            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)",
             'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
             "l7, l8, l9 = sch.get_loops(block=b0)",
             "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)",
@@ -171,14 +171,14 @@ def test_gpu_softmax_mn_after_inline():
             "v5 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
             "l6, l7 = sch.split(loop=l4, factors=[None, v5], preserve_unit_iters=True)",
             'sch.bind(loop=l7, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True)",
+            "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True, index=-1)",
             'sch.set_scope(block=b1, buffer_index=0, storage_scope="shared")',
             "l8, l9, l10 = sch.get_loops(block=b1)",
             "l11, l12 = sch.split(loop=l10, factors=[None, v5], preserve_unit_iters=True)",
             'sch.bind(loop=l12, thread_axis="threadIdx.x")',
             "b13, b14 = sch.get_consumers(block=b0)",
             "l15, l16, l17, l18 = sch.get_loops(block=b13)",
-            "sch.compute_at(block=b0, loop=l15, preserve_unit_loops=True)",
+            "sch.compute_at(block=b0, loop=l15, preserve_unit_loops=True, index=-1)",
             'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
             "l19, l20, l21 = sch.get_loops(block=b0)",
             "l22, l23 = sch.split(loop=l21, factors=[None, v5], preserve_unit_iters=True)",
@@ -206,7 +206,7 @@ def test_gpu_batch_norm_bmn():
             "v3 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
             "l4, l5 = sch.split(loop=l2, factors=[None, v3], preserve_unit_iters=True)",
             'sch.bind(loop=l5, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b0, loop=l4, preserve_unit_loops=True)",
+            "sch.compute_at(block=b0, loop=l4, preserve_unit_loops=True, index=-1)",
             'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
             "l6, l7, l8, l9 = sch.get_loops(block=b0)",
             "l10 = sch.fuse(l8, l9, preserve_unit_iters=True)",
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py b/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py
index 87159fcb3110..fe1220c50925 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py
@@ -62,7 +62,7 @@ def test_cpu_matmul():
             "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)",
             "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)",
             'b24 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="global")',
-            "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True)",
+            "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True, index=-1)",
         ],
         [
             'b0 = sch.get_block(name="C", func_name="main")',
@@ -76,7 +76,7 @@ def test_cpu_matmul():
             "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)",
             "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)",
             'b24 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="global")',
-            "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True)",
+            "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True, index=-1)",
         ],
         [
             'b0 = sch.get_block(name="C", func_name="main")',
@@ -123,7 +123,7 @@ def test_cpu_matmul_relu():
             "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)",
             "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)",
             "b24, = sch.get_consumers(block=b0)",
-            "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True)",
+            "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True, index=-1)",
         ],
         [
             'b0 = sch.get_block(name="C", func_name="main")',
@@ -137,7 +137,7 @@ def test_cpu_matmul_relu():
             "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)",
             "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)",
             "b24, = sch.get_consumers(block=b0)",
-            "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True)",
+            "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True, index=-1)",
         ],
         [
             'b0 = sch.get_block(name="C", func_name="main")',
@@ -193,15 +193,15 @@ def test_cuda_matmul():
             'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32)',
             'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024)',
             'b33 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")',
-            "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True)",
+            "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True, index=-1)",
             'b34 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared")',
-            "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True)",
+            "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True, index=-1)",
             "l35, l36, l37, l38, l39, l40 = sch.get_loops(block=b34)",
             "l41 = sch.fuse(l39, l40, preserve_unit_iters=True)",
             "v42 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])",
             'sch.annotate(block_or_loop=b34, ann_key="meta_schedule.cooperative_fetch", ann_val=v42)',
             'b43 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared")',
-            "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True)",
+            "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True, index=-1)",
             "l44, l45, l46, l47, l48, l49 = sch.get_loops(block=b43)",
             "l50 = sch.fuse(l48, l49, preserve_unit_iters=True)",
             "v51 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])",
@@ -247,15 +247,15 @@ def test_cuda_matmul_relu():
             "l32 = sch.fuse(l11, l21, preserve_unit_iters=True)",
             'sch.bind(loop=l32, thread_axis="threadIdx.x")',
             'b33 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")',
-            "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True)",
+            "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True, index=-1)",
             'b34 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared")',
-            "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True)",
+            "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True, index=-1)",
             "l35, l36, l37, l38, l39, l40 = sch.get_loops(block=b34)",
             "l41 = sch.fuse(l39, l40, preserve_unit_iters=True)",
             "v42 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])",
             'sch.annotate(block_or_loop=b34, ann_key="meta_schedule.cooperative_fetch", ann_val=v42)',
             'b43 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared")',
-            "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True)",
+            "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True, index=-1)",
             "l44, l45, l46, l47, l48, l49 = sch.get_loops(block=b43)",
             "l50 = sch.fuse(l48, l49, preserve_unit_iters=True)",
             "v51 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])",
@@ -402,7 +402,7 @@ def test_multi_level_tiling_conv2d_nchwc_vnni():
 l96, l97 = sch.split(loop=l37, factors=[v94, v95], preserve_unit_iters=True)
 sch.reorder(l42, l50, l58, l66, l74, l43, l51, l59, l67, l75, l80, l84, l88, l92, l96, l44, l52, l60, l68, l76, l81, l85, l89, l93, l97, l45, l53, l61, l69, l77)
 b98 = sch.cache_write(block=b27, write_buffer_index=0, storage_scope="global")
-sch.reverse_compute_at(block=b98, loop=l75, preserve_unit_loops=True)""".split(
+sch.reverse_compute_at(block=b98, loop=l75, preserve_unit_loops=True, index=-1)""".split(
             "\n"
         ),
         """b0 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main")
@@ -437,7 +437,7 @@ def test_multi_level_tiling_conv2d_nchwc_vnni():
 l96, l97 = sch.split(loop=l37, factors=[v94, v95], preserve_unit_iters=True)
 sch.reorder(l42, l50, l58, l66, l74, l43, l51, l59, l67, l75, l80, l84, l88, l92, l96, l44, l52, l60, l68, l76, l81, l85, l89, l93, l97, l45, l53, l61, l69, l77)
 b98 = sch.cache_write(block=b27, write_buffer_index=0, storage_scope="global")
-sch.reverse_compute_at(block=b98, loop=l74, preserve_unit_loops=True)""".split(
+sch.reverse_compute_at(block=b98, loop=l74, preserve_unit_loops=True, index=-1)""".split(
             "\n"
         ),
         """b0 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main")
@@ -546,15 +546,15 @@ def test_multi_level_tiling_dense_dp4a():
 l38 = sch.fuse(l17, l27, preserve_unit_iters=True)
 sch.bind(loop=l38, thread_axis="threadIdx.x")
 b39 = sch.cache_write(block=b6, write_buffer_index=0, storage_scope="local")
-sch.reverse_compute_at(block=b39, loop=l38, preserve_unit_loops=True)
+sch.reverse_compute_at(block=b39, loop=l38, preserve_unit_loops=True, index=-1)
 b40 = sch.cache_read(block=b6, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b40, loop=l33, preserve_unit_loops=True)
+sch.compute_at(block=b40, loop=l33, preserve_unit_loops=True, index=-1)
 l41, l42, l43, l44, l45, l46 = sch.get_loops(block=b40)
 l47 = sch.fuse(l45, l46, preserve_unit_iters=True)
 v48 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b40, ann_key="meta_schedule.cooperative_fetch", ann_val=v48)
 b49 = sch.cache_read(block=b6, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b49, loop=l33, preserve_unit_loops=True)
+sch.compute_at(block=b49, loop=l33, preserve_unit_loops=True, index=-1)
 l50, l51, l52, l53, l54, l55 = sch.get_loops(block=b49)
 l56 = sch.fuse(l54, l55, preserve_unit_iters=True)
 v57 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
@@ -632,9 +632,9 @@ def test_cuda_tensor_core_matmul_relu():
 l52 = sch.fuse(l31, l41, preserve_unit_iters=True)
 sch.bind(loop=l52, thread_axis="threadIdx.y")
 b53 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="shared")
-sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True)
+sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True, index=-1)
 b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator")
-sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True)
+sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True, index=-1)
 v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55)
 sch.reverse_compute_inline(block=b2)
@@ -646,19 +646,19 @@ def test_cuda_tensor_core_matmul_relu():
 b72 = sch.blockize(loop=l64)
 sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared")
 b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True)
+sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True, index=-1)
 l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73)
 l80 = sch.fuse(l78, l79, preserve_unit_iters=True)
 v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81)
 b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True)
+sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True, index=-1)
 l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82)
 l89 = sch.fuse(l87, l88, preserve_unit_iters=True)
 v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90)
 b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a")
-sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True)
+sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True, index=-1)
 l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91)
 l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True)
 l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True)
@@ -667,7 +667,7 @@ def test_cuda_tensor_core_matmul_relu():
 b112 = sch.blockize(loop=l102)
 sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
 b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b")
-sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True)
+sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True, index=-1)
 l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113)
 l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True)
 l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True)
@@ -772,9 +772,9 @@ def test_cuda_tensor_core_software_pipeline_matmul_relu():
 l52 = sch.fuse(l31, l41, preserve_unit_iters=True)
 sch.bind(loop=l52, thread_axis="threadIdx.y")
 b53 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="shared")
-sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True)
+sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True, index=-1)
 b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator")
-sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True)
+sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True, index=-1)
 v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55)
 sch.reverse_compute_inline(block=b2)
@@ -786,19 +786,19 @@ def test_cuda_tensor_core_software_pipeline_matmul_relu():
 b72 = sch.blockize(loop=l64)
 sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared")
 b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True)
+sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True, index=-1)
 l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73)
 l80 = sch.fuse(l78, l79, preserve_unit_iters=True)
 v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81)
 b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True)
+sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True, index=-1)
 l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82)
 l89 = sch.fuse(l87, l88, preserve_unit_iters=True)
 v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90)
 b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a")
-sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True)
+sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True, index=-1)
 l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91)
 l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True)
 l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True)
@@ -807,7 +807,7 @@ def test_cuda_tensor_core_software_pipeline_matmul_relu():
 b112 = sch.blockize(loop=l102)
 sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
 b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b")
-sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True)
+sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True, index=-1)
 l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113)
 l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True)
 l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True)
@@ -895,7 +895,7 @@ def test_cuda_tensor_core_matmul_relu_global():
 l51 = sch.fuse(l30, l40, preserve_unit_iters=True)
 sch.bind(loop=l51, thread_axis="threadIdx.y")
 b52 = sch.cache_write(block=b19, write_buffer_index=0, storage_scope="wmma.accumulator")
-sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True)
+sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True, index=-1)
 sch.reverse_compute_inline(block=b1)
 l53, l54, l55, l56, l57 = sch.get_loops(block=b52)
 l58, l59 = sch.split(loop=l57, factors=[None, 16], preserve_unit_iters=True)
@@ -905,19 +905,19 @@ def test_cuda_tensor_core_matmul_relu_global():
 b69 = sch.blockize(loop=l61)
 sch.annotate(block_or_loop=b69, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_global")
 b70 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True)
+sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True, index=-1)
 l71, l72, l73, l74, l75, l76 = sch.get_loops(block=b70)
 l77 = sch.fuse(l75, l76, preserve_unit_iters=True)
 v78 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b70, ann_key="meta_schedule.cooperative_fetch", ann_val=v78)
 b79 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True)
+sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True, index=-1)
 l80, l81, l82, l83, l84, l85 = sch.get_loops(block=b79)
 l86 = sch.fuse(l84, l85, preserve_unit_iters=True)
 v87 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b79, ann_key="meta_schedule.cooperative_fetch", ann_val=v87)
 b88 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="wmma.matrix_a")
-sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True)
+sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True, index=-1)
 l89, l90, l91, l92, l93, l94, l95 = sch.get_loops(block=b88)
 l96, l97 = sch.split(loop=l95, factors=[None, 16], preserve_unit_iters=True)
 l98, l99 = sch.split(loop=l94, factors=[None, 16], preserve_unit_iters=True)
@@ -926,7 +926,7 @@ def test_cuda_tensor_core_matmul_relu_global():
 b109 = sch.blockize(loop=l99)
 sch.annotate(block_or_loop=b109, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
 b110 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="wmma.matrix_b")
-sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True)
+sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True, index=-1)
 l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b110)
 l118, l119 = sch.split(loop=l117, factors=[None, 16], preserve_unit_iters=True)
 l120, l121 = sch.split(loop=l116, factors=[None, 16], preserve_unit_iters=True)
@@ -995,7 +995,7 @@ def test_cuda_tensor_core_matmul_relu_global():
 l51 = sch.fuse(l30, l40, preserve_unit_iters=True)
 sch.bind(loop=l51, thread_axis="threadIdx.y")
 b52 = sch.cache_write(block=b19, write_buffer_index=0, storage_scope="wmma.accumulator")
-sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True)
+sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True, index=-1)
 sch.reverse_compute_inline(block=b1)
 l53, l54, l55, l56, l57 = sch.get_loops(block=b52)
 l58, l59 = sch.split(loop=l57, factors=[None, 16], preserve_unit_iters=True)
@@ -1005,19 +1005,19 @@ def test_cuda_tensor_core_matmul_relu_global():
 b69 = sch.blockize(loop=l61)
 sch.annotate(block_or_loop=b69, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_global")
 b70 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True)
+sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True, index=-1)
 l71, l72, l73, l74, l75, l76 = sch.get_loops(block=b70)
 l77 = sch.fuse(l75, l76, preserve_unit_iters=True)
 v78 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b70, ann_key="meta_schedule.cooperative_fetch", ann_val=v78)
 b79 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True)
+sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True, index=-1)
 l80, l81, l82, l83, l84, l85 = sch.get_loops(block=b79)
 l86 = sch.fuse(l84, l85, preserve_unit_iters=True)
 v87 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b79, ann_key="meta_schedule.cooperative_fetch", ann_val=v87)
 b88 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="wmma.matrix_a")
-sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True)
+sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True, index=-1)
 l89, l90, l91, l92, l93, l94, l95 = sch.get_loops(block=b88)
 l96, l97 = sch.split(loop=l95, factors=[None, 16], preserve_unit_iters=True)
 l98, l99 = sch.split(loop=l94, factors=[None, 16], preserve_unit_iters=True)
@@ -1026,7 +1026,7 @@ def test_cuda_tensor_core_matmul_relu_global():
 b109 = sch.blockize(loop=l99)
 sch.annotate(block_or_loop=b109, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
 b110 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="wmma.matrix_b")
-sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True)
+sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True, index=-1)
 l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b110)
 l118, l119 = sch.split(loop=l117, factors=[None, 16], preserve_unit_iters=True)
 l120, l121 = sch.split(loop=l116, factors=[None, 16], preserve_unit_iters=True)
@@ -1133,9 +1133,9 @@ def test_cuda_tensor_core_conv2d():
 l64 = sch.fuse(l33, l43, l53, preserve_unit_iters=True)
 sch.bind(loop=l64, thread_axis="threadIdx.y")
 b65 = sch.cache_write(block=b21, write_buffer_index=0, storage_scope="shared")
-sch.reverse_compute_at(block=b65, loop=l63, preserve_unit_loops=True)
+sch.reverse_compute_at(block=b65, loop=l63, preserve_unit_loops=True, index=-1)
 b66 = sch.cache_write(block=b21, write_buffer_index=0, storage_scope="wmma.accumulator")
-sch.reverse_compute_at(block=b66, loop=l64, preserve_unit_loops=True)
+sch.reverse_compute_at(block=b66, loop=l64, preserve_unit_loops=True, index=-1)
 v67 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b65, ann_key="meta_schedule.cooperative_fetch", ann_val=v67)
 sch.reverse_compute_inline(block=b1)
@@ -1147,19 +1147,19 @@ def test_cuda_tensor_core_conv2d():
 b84 = sch.blockize(loop=l76)
 sch.annotate(block_or_loop=b84, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared")
 b85 = sch.cache_read(block=b21, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b85, loop=l59, preserve_unit_loops=True)
+sch.compute_at(block=b85, loop=l59, preserve_unit_loops=True, index=-1)
 l86, l87, l88, l89, l90, l91 = sch.get_loops(block=b85)
 l92 = sch.fuse(l90, l91, preserve_unit_iters=True)
 v93 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b85, ann_key="meta_schedule.cooperative_fetch", ann_val=v93)
 b94 = sch.cache_read(block=b21, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b94, loop=l59, preserve_unit_loops=True)
+sch.compute_at(block=b94, loop=l59, preserve_unit_loops=True, index=-1)
 l95, l96, l97, l98, l99, l100 = sch.get_loops(block=b94)
 l101 = sch.fuse(l99, l100, preserve_unit_iters=True)
 v102 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
 sch.annotate(block_or_loop=b94, ann_key="meta_schedule.cooperative_fetch", ann_val=v102)
 b103 = sch.cache_read(block=b21, read_buffer_index=0, storage_scope="wmma.matrix_a")
-sch.compute_at(block=b103, loop=l60, preserve_unit_loops=True)
+sch.compute_at(block=b103, loop=l60, preserve_unit_loops=True, index=-1)
 l104, l105, l106, l107, l108, l109, l110 = sch.get_loops(block=b103)
 l111, l112 = sch.split(loop=l110, factors=[None, 16], preserve_unit_iters=True)
 l113, l114 = sch.split(loop=l109, factors=[None, 16], preserve_unit_iters=True)
@@ -1168,7 +1168,7 @@ def test_cuda_tensor_core_conv2d():
 b124 = sch.blockize(loop=l114)
 sch.annotate(block_or_loop=b124, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
 b125 = sch.cache_read(block=b21, read_buffer_index=1, storage_scope="wmma.matrix_b")
-sch.compute_at(block=b125, loop=l60, preserve_unit_loops=True)
+sch.compute_at(block=b125, loop=l60, preserve_unit_loops=True, index=-1)
 l126, l127, l128, l129, l130, l131, l132 = sch.get_loops(block=b125)
 l133, l134 = sch.split(loop=l132, factors=[None, 16], preserve_unit_iters=True)
 l135, l136 = sch.split(loop=l131, factors=[None, 16], preserve_unit_iters=True)
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py b/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py
index b2df408e9d01..c951a5adf386 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py
@@ -71,7 +71,7 @@ def test_random_compute_location():
         [
             'b0 = sch.get_block(name="move", func_name="main")',
             "l1 = sch.sample_compute_location(block=b0)",
-            "sch.compute_at(block=b0, loop=l1, preserve_unit_loops=True)",
+            "sch.compute_at(block=b0, loop=l1, preserve_unit_loops=True, index=-1)",
         ]
     ]
     mod = Add
diff --git a/tests/python/unittest/test_tir_schedule_compute_at.py b/tests/python/unittest/test_tir_schedule_compute_at.py
index 0c20a4783ca0..72cba1a8fdc4 100644
--- a/tests/python/unittest/test_tir_schedule_compute_at.py
+++ b/tests/python/unittest/test_tir_schedule_compute_at.py
@@ -1353,5 +1353,157 @@ def _create_prim_func():
     verify_trace_roundtrip(sch=sch, mod=mod)
 
 
+def test_compute_at_to_index():
+    @T.prim_func
+    def multi_producers_conv(
+        data: T.Buffer[(1, 3, 224, 224), "int8"],
+        w: T.Buffer[(16, 3, 7, 7), "int8"],
+        conv: T.Buffer[(1, 16, 112, 112), "int32"],
+    ) -> None:
+        pad = T.alloc_buffer([1, 3, 230, 230], dtype="int8")
+        wbuf = T.alloc_buffer([16, 3, 7, 7], dtype="int8")
+        for i0, i1, i2, i3 in T.grid(1, 3, 230, 230):
+            with T.block("pad"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(data[i0_1, i1_1, i2_1 - 3, i3_1 - 3])
+                T.writes(pad[i0_1, i1_1, i2_1, i3_1])
+                pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(
+                    3 <= i2_1 and i2_1 < 227 and 3 <= i3_1 and i3_1 < 227,
+                    data[i0_1, i1_1, i2_1 - 3, i3_1 - 3],
+                    T.int8(0),
+                    dtype="int8",
+                )
+        for i0 in T.serial(1):
+            for ax0, ax1, ax2, ax3 in T.grid(16, 3, 7, 7):
+                with T.block("wbuf"):
+                    v0, v1, v2, v3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3])
+                    T.reads(w[v0, v1, v2, v3])
+                    T.writes(wbuf[v0, v1, v2, v3])
+                    wbuf[v0, v1, v2, v3] = w[v0, v1, v2, v3]
+            for i1, i2, i3, i4, i5, i6 in T.grid(16, 112, 112, 3, 7, 7):
+                with T.block("conv"):
+                    nn, ff, yy, xx, rc, ry, rx = T.axis.remap(
+                        "SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]
+                    )
+                    T.reads(pad[nn, rc, yy * 2 + ry, xx * 2 + rx], wbuf[ff, rc, ry, rx])
+                    T.writes(conv[nn, ff, yy, xx])
+                    with T.init():
+                        conv[nn, ff, yy, xx] = 0
+                    conv[nn, ff, yy, xx] = conv[nn, ff, yy, xx] + T.cast(
+                        pad[nn, rc, yy * 2 + ry, xx * 2 + rx], "int32"
+                    ) * T.cast(wbuf[ff, rc, ry, rx], "int32")
+
+    @T.prim_func
+    def multi_producers_after_compute_at(
+        data: T.Buffer[(1, 3, 224, 224), "int8"],
+        w: T.Buffer[(16, 3, 7, 7), "int8"],
+        conv: T.Buffer[(1, 16, 112, 112), "int32"],
+    ) -> None:
+        pad = T.alloc_buffer([1, 3, 230, 230], dtype="int8")
+        wbuf = T.alloc_buffer([16, 3, 7, 7], dtype="int8")
+        for i0 in T.serial(1):
+            for ax0, ax1, ax2 in T.grid(3, 229, 229):
+                with T.block("pad"):
+                    i0_1 = T.axis.spatial(1, 0)
+                    i1_1 = T.axis.spatial(3, ax0)
+                    i2_1 = T.axis.spatial(230, ax1)
+                    i3_1 = T.axis.spatial(230, ax2)
+                    T.reads(data[i0_1, i1_1, i2_1 - 3, i3_1 - 3])
+                    T.writes(pad[i0_1, i1_1, i2_1, i3_1])
+                    pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(
+                        3 <= i2_1 and i2_1 < 227 and 3 <= i3_1 and i3_1 < 227,
+                        data[i0_1, i1_1, i2_1 - 3, i3_1 - 3],
+                        T.int8(0),
+                        dtype="int8",
+                    )
+            for ax0, ax1, ax2, ax3 in T.grid(16, 3, 7, 7):
+                with T.block("wbuf"):
+                    v0, v1, v2, v3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3])
+                    T.reads(w[v0, v1, v2, v3])
+                    T.writes(wbuf[v0, v1, v2, v3])
+                    wbuf[v0, v1, v2, v3] = w[v0, v1, v2, v3]
+            for i1, i2, i3, i4, i5, i6 in T.grid(16, 112, 112, 3, 7, 7):
+                with T.block("conv"):
+                    nn, ff, yy, xx, rc, ry, rx = T.axis.remap(
+                        "SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]
+                    )
+                    T.reads(pad[nn, rc, yy * 2 + ry, xx * 2 + rx], wbuf[ff, rc, ry, rx])
+                    T.writes(conv[nn, ff, yy, xx])
+                    with T.init():
+                        conv[nn, ff, yy, xx] = 0
+                    conv[nn, ff, yy, xx] = conv[nn, ff, yy, xx] + T.cast(
+                        pad[nn, rc, yy * 2 + ry, xx * 2 + rx], "int32"
+                    ) * T.cast(wbuf[ff, rc, ry, rx], "int32")
+
+    sch = tir.Schedule(multi_producers_conv, debug_mask="all")
+    block_c = sch.get_block("pad")
+    axis = sch.get_loops("conv")[0]
+    sch.compute_at(block_c, axis, index=-2)
+    tvm.ir.assert_structural_equal(multi_producers_after_compute_at, sch.mod["main"])
+
+
+def test_reverse_compute_at_to_index():
+    @T.prim_func
+    def main(A: T.Buffer[(128, 128), "float32"], D: T.Buffer[(128, 128), "float32"]) -> None:
+        B = T.alloc_buffer([128, 128], dtype="float32")
+        C = T.alloc_buffer([128, 128], dtype="float32")
+        for i_0, j_0, i_1 in T.grid(8, 8, 16):
+            for j_1 in T.serial(16):
+                with T.block("B"):
+                    vi = T.axis.spatial(128, i_0 * 16 + i_1)
+                    vj = T.axis.spatial(128, j_0 * 16 + j_1)
+                    T.reads(A[vi, vj])
+                    T.writes(B[vi, vj])
+                    B[vi, vj] = A[vi, vj] * T.float32(2)
+            for ax0 in T.serial(16):
+                with T.block("C"):
+                    vi = T.axis.spatial(128, i_0 * 16 + i_1)
+                    vj = T.axis.spatial(128, j_0 * 16 + ax0)
+                    T.reads(B[vi, vj])
+                    T.writes(C[vi, vj])
+                    C[vi, vj] = B[vi, vj] + T.float32(1)
+        for i, j in T.grid(128, 128):
+            with T.block("D"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.reads(B[vi, vj])
+                T.writes(D[vi, vj])
+                D[vi, vj] = B[vi, vj] + T.float32(1)
+
+    @T.prim_func
+    def main_reverse_compute_at(
+        A: T.Buffer[(128, 128), "float32"], D: T.Buffer[(128, 128), "float32"]
+    ) -> None:
+        B = T.alloc_buffer([128, 128], dtype="float32")
+        C = T.alloc_buffer([128, 128], dtype="float32")
+        for i_0, j_0, i_1 in T.grid(8, 8, 16):
+            for j_1 in T.serial(16):
+                with T.block("B"):
+                    vi = T.axis.spatial(128, i_0 * 16 + i_1)
+                    vj = T.axis.spatial(128, j_0 * 16 + j_1)
+                    T.reads(A[vi, vj])
+                    T.writes(B[vi, vj])
+                    B[vi, vj] = A[vi, vj] * T.float32(2)
+            for ax0 in T.serial(16):
+                with T.block("D"):
+                    vi = T.axis.spatial(128, i_0 * 16 + i_1)
+                    vj = T.axis.spatial(128, j_0 * 16 + ax0)
+                    T.reads(B[vi, vj])
+                    T.writes(D[vi, vj])
+                    D[vi, vj] = B[vi, vj] + T.float32(1)
+            for ax0 in T.serial(16):
+                with T.block("C"):
+                    vi = T.axis.spatial(128, i_0 * 16 + i_1)
+                    vj = T.axis.spatial(128, j_0 * 16 + ax0)
+                    T.reads(B[vi, vj])
+                    T.writes(C[vi, vj])
+                    C[vi, vj] = B[vi, vj] + T.float32(1)
+
+    sch = tir.Schedule(main, debug_mask="all")
+    block_c = sch.get_block("D")
+    axis = sch.get_loops("B")[2]
+    sch.reverse_compute_at(block_c, axis, index=1)
+    tvm.ir.assert_structural_equal(main_reverse_compute_at, sch.mod["main"])
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From d171b4af09b89683f8648a9df4a1d5cb5902bd99 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Fri, 26 Aug 2022 10:28:20 -0600
Subject: [PATCH 055/704] [SimplifyExpr] Add simplify for dq->arg funcs
 (#12580)

* add simplify for dq->arg funcs

* add comments, fix lint

* move comments to the right spots
---
 src/relay/transforms/simplify_expr.cc         | 48 +++++++++++++++++++
 tests/python/relay/test_pass_simplify_expr.py | 48 +++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index a6751933a88c..463f76995436 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -685,6 +685,7 @@ class SimplifyConsecutiveAdd : public DFPatternRewrite {
   DFPattern const2_;
 };
 
+/*! \brief Simplifying x/sqrt to x*sqrt */
 class SimplifyRSqrt : public DFPatternRewrite {
  public:
   SimplifyRSqrt() {
@@ -708,6 +709,50 @@ class SimplifyRSqrt : public DFPatternRewrite {
   DFPattern numerator_;
 };
 
+/*! \brief Base class for simplifying dequantize followed by arg ops */
+class SimplifyDQArgFunc : public DFPatternRewrite {
+ public:
+  explicit SimplifyDQArgFunc(std::string op) : op_(op) {
+    x_ = IsWildcard();
+    dq_ = IsOp("qnn.dequantize")({x_, IsWildcard(), IsWildcard()});
+    pattern_ = IsOp(op_)({dq_});
+  }
+
+  Expr Callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
+    const CallNode* call = pre.as<CallNode>();
+    ICHECK(call);
+    auto x = node_map[x_][0];
+    return Call(Op::Get(op_), {x}, call->attrs);
+  }
+
+ protected:
+  /*! \brief Pattern input */
+  DFPattern x_;
+  /*! \brief dequantize op */
+  DFPattern dq_;
+  /*! \brief Name of op to simplify */
+  String op_;
+};
+
+/*! \brief Simplify dequantize follwed by argmax */
+class SimplifyDQArgMax : public SimplifyDQArgFunc {
+ public:
+  SimplifyDQArgMax() : SimplifyDQArgFunc("argmax") {}
+};
+
+/*! \brief Simplify dequantize follwed by argmin */
+class SimplifyDQArgMin : public SimplifyDQArgFunc {
+ public:
+  SimplifyDQArgMin() : SimplifyDQArgFunc("argmin") {}
+};
+
+/*! \brief Simplify dequantize follwed by argsort */
+class SimplifyDQArgSort : public SimplifyDQArgFunc {
+ public:
+  SimplifyDQArgSort() : SimplifyDQArgFunc("argsort") {}
+};
+
 Expr SimplifyExpr(const Expr& expr, const IRModule& mod) {
   // the rewrites will be applied in the given order, and repeated until fixed point
   DFPatternRewriteComposer composer;
@@ -725,6 +770,9 @@ Expr SimplifyExpr(const Expr& expr, const IRModule& mod) {
   composer.AddRewrite<SimplifyConsecutiveCast>();
   composer.AddRewrite<FullElementwise>();
   composer.AddRewrite<SimplifyConsecutiveAdd>();
+  composer.AddRewrite<SimplifyDQArgMax>();
+  composer.AddRewrite<SimplifyDQArgMin>();
+  composer.AddRewrite<SimplifyDQArgSort>();
   return RewritePatterns(composer.MakeCallbacks(), expr, mod);
 }
 
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index 837b15a48dc1..dcd58602b0ac 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -603,5 +603,53 @@ def expected(c):
         assert tvm.ir.structural_equal(opt, after)
 
 
+def test_simplify_dq_argmax():
+    shape = (4, 32, 1, 1)
+    x = relay.var("x", shape=shape, dtype="int8")
+
+    def before():
+        y = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0))
+        return relay.op.argmax(y, axis=1)
+
+    def expected():
+        return relay.op.argmax(x, axis=1)
+
+    opt = run_opt_pass(before(), transform.SimplifyExpr())
+    after = run_opt_pass(expected(), transform.InferType())
+    assert tvm.ir.structural_equal(opt, after)
+
+
+def test_simplify_dq_argmin():
+    shape = (4, 32, 1, 1)
+    x = relay.var("x", shape=shape, dtype="int8")
+
+    def before():
+        y = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0))
+        return relay.op.argmin(y, axis=1)
+
+    def expected():
+        return relay.op.argmin(x, axis=1)
+
+    opt = run_opt_pass(before(), transform.SimplifyExpr())
+    after = run_opt_pass(expected(), transform.InferType())
+    assert tvm.ir.structural_equal(opt, after)
+
+
+def test_simplify_dq_argsort():
+    shape = (4, 32, 1, 1)
+    x = relay.var("x", shape=shape, dtype="int8")
+
+    def before():
+        y = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0))
+        return relay.op.argsort(y, axis=1)
+
+    def expected():
+        return relay.op.argsort(x, axis=1)
+
+    opt = run_opt_pass(before(), transform.SimplifyExpr())
+    after = run_opt_pass(expected(), transform.InferType())
+    assert tvm.ir.structural_equal(opt, after)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From d87fa854b8eb0c8f603d8dc459121eaa1a365e12 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Sat, 27 Aug 2022 02:01:24 +0900
Subject: [PATCH 056/704] [Hexagon] Initial support for meta schedule tuning
 (#12587)

Enables AutoTVM-style, template-based tuning for Hexagon.

To run compiled code on Hexagon, we need to use Hexagon `Session` object https://github.com/apache/tvm/blob/dc522a6ff65b68532cd1bba43827cd981114df2c/python/tvm/contrib/hexagon/session.py#L35 in the metaschedule `RPCRunner`. But for RPC "session", `RPCRunner` expects an instance of `RPCSession`, https://github.com/apache/tvm/blob/53fe5966823eee4e011d7228bceab3c82c1d9caa/python/tvm/rpc/client.py#L32,  to be created and used by various customizable functions.

Since `RPCSession` and Hexagon `Session` have slightly different API, we cannot use `RPCRunner` with customizable functions directly. So I introduced an alternative implementation of `RPCRunner` for Hexagon.

The test is disabled for simulator since `HexagonLauncherSimulator` is not pickle-able due to its `multiprocessing.Process` attribute: https://github.com/apache/tvm/blob/c97895e0ffb512e73c89de7cdee9846f052244fc/python/tvm/contrib/hexagon/build.py#L614


Output log from tuning `vrmpy` dense (included in the test)

```
 ID | Name |      FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Terminated
--------------------------------------------------------------------------------------------------------------
  0 | main | 150994944 |      1 |       380.3399 |     397.0000 |              397.0000 |     32 |
--------------------------------------------------------------------------------------------------------------
```
---
 apps/hexagon_api/CMakeLists.txt               |   2 +
 python/tvm/contrib/hexagon/meta_schedule.py   | 166 ++++++++++++++
 python/tvm/contrib/hexagon/session.py         |   8 +-
 python/tvm/contrib/hexagon/tools.py           |   7 +
 python/tvm/meta_schedule/default_config.py    |   6 +-
 python/tvm/target/target.py                   |   5 +
 python/tvm/tir/tensor_intrin/__init__.py      |   2 +-
 python/tvm/tir/tensor_intrin/hexagon.py       |  71 ++++++
 src/target/target_kind.cc                     |   1 +
 .../test_hexagon/test_meta_schedule.py        | 211 ++++++++++++++++++
 10 files changed, 472 insertions(+), 7 deletions(-)
 create mode 100644 python/tvm/contrib/hexagon/meta_schedule.py
 create mode 100644 python/tvm/tir/tensor_intrin/hexagon.py
 create mode 100644 tests/python/contrib/test_hexagon/test_meta_schedule.py

diff --git a/apps/hexagon_api/CMakeLists.txt b/apps/hexagon_api/CMakeLists.txt
index aa971c875307..9a05cf3675b6 100644
--- a/apps/hexagon_api/CMakeLists.txt
+++ b/apps/hexagon_api/CMakeLists.txt
@@ -87,6 +87,7 @@ ExternalProject_Add(android_tvm_runtime_rpc
     "-DUSE_HEXAGON_RPC=ON"
     "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
     "-DUSE_ALTERNATIVE_LINKER=OFF"
+    "-DUSE_RANDOM=ON"
   INSTALL_COMMAND ""
   BUILD_ALWAYS ON
 )
@@ -133,6 +134,7 @@ ExternalProject_Add(hexagon_tvm_runtime_rpc
     "-DUSE_ALTERNATIVE_LINKER=OFF"
     "-DUSE_CUSTOM_LOGGING=ON"
     "-DUSE_HEXAGON_QHL=ON"
+    "-DUSE_RANDOM=ON"
     "${GTEST_FLAG}"
   INSTALL_COMMAND ""
   BUILD_ALWAYS ON
diff --git a/python/tvm/contrib/hexagon/meta_schedule.py b/python/tvm/contrib/hexagon/meta_schedule.py
new file mode 100644
index 000000000000..8a4de74b6131
--- /dev/null
+++ b/python/tvm/contrib/hexagon/meta_schedule.py
@@ -0,0 +1,166 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Meta schedule tuning utilities for Hexagon."""
+import os
+import tempfile
+from typing import Callable, List, Optional
+from tvm.contrib.popen_pool import PopenPoolExecutor
+from tvm.meta_schedule.utils import cpu_count, derived_object
+from tvm.meta_schedule.builder import LocalBuilder
+from tvm.meta_schedule.runner import (
+    EvaluatorConfig,
+    RunnerInput,
+    RunnerFuture,
+    PyRunner,
+)
+from tvm.meta_schedule.runner.rpc_runner import (
+    default_alloc_argument,
+    default_run_evaluator,
+    RPCRunnerFuture,
+)
+
+from .build import HexagonLauncherRPC
+from .tools import export_module
+
+
+@derived_object
+class HexagonRPCRunner(PyRunner):
+    """RPCRunner for Hexagon. See the documentation of RPCRunner for more details."""
+
+    def __init__(
+        self,
+        hexagon_launcher: HexagonLauncherRPC,
+        evaluator_config: Optional[EvaluatorConfig] = None,
+        cooldown_sec: float = 0.0,
+        alloc_repeat: int = 1,
+        max_workers: Optional[int] = None,
+        initializer: Optional[Callable[[], None]] = None,
+    ):
+        """
+        Parameters
+        ----------
+        hexagon_launcher : HexagonLauncherRPC
+            The RPC launcher for Hexagon. It is needed for creating hexagon.Session
+            object inside the worker function.
+        evaluator_config: EvaluatorConfig
+            The evaluator configuration.
+        cooldown_sec: float
+            The cooldown in seconds.
+        alloc_repeat: int
+            The number of times to random fill the allocation.
+        max_workers: Optional[int] = None
+            The maximum number of connections. Defaults to number of logical CPU cores.
+        initializer: Optional[Callable[[], None]]
+            The initializer function.
+        """
+
+        super().__init__()
+        self.hexagon_launcher = hexagon_launcher
+        self.evaluator_config = EvaluatorConfig._normalized(evaluator_config)
+        self.cooldown_sec = cooldown_sec
+        self.alloc_repeat = alloc_repeat
+        if max_workers is None:
+            max_workers = cpu_count(logical=True)
+        self.pool = PopenPoolExecutor(
+            max_workers=max_workers,
+            timeout=100,
+            initializer=initializer,
+        )
+
+    def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
+        results = []
+        for runner_input in runner_inputs:
+            future = RPCRunnerFuture(
+                future=self.pool.submit(
+                    _worker_func,
+                    self.hexagon_launcher,
+                    self.evaluator_config,
+                    self.alloc_repeat,
+                    str(runner_input.artifact_path),
+                    tuple(arg_info.as_json() for arg_info in runner_input.args_info),
+                ),
+                timeout_sec=100,
+            )
+            results.append(future)
+        return results
+
+
+def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path, args_info):
+    with hexagon_launcher.start_session() as session:
+        device = session.device
+        _, remote_path = os.path.split(artifact_path)
+        uploaded = session.upload(artifact_path, remote_path)
+        rt_mod = session.load_module(uploaded)
+        repeated_args = default_alloc_argument(
+            session,
+            device,
+            args_info,
+            alloc_repeat,
+        )
+        costs = default_run_evaluator(
+            session,
+            rt_mod,
+            device,
+            evaluator_config,
+            repeated_args,
+        )
+    return costs
+
+
+def get_hexagon_local_builder():
+    """Return Hexagon-compatible Builder for meta schedule."""
+
+    def export_func(mod):
+        binary_path = export_module(mod, tempfile.mkdtemp())
+        return str(binary_path)
+
+    return LocalBuilder(f_export=export_func)
+
+
+def get_hexagon_rpc_runner(
+    hexagon_launcher: HexagonLauncherRPC, number=3, repeat=1, min_repeat_ms=100
+):
+    """Return Hexagon-compatible RPC Runner for meta schedule.
+
+    Parameters
+    ----------
+    hexagon_launcher : HexagonLauncherRPC
+        The RPC launcher for Hexagon.
+    number: int
+        The number of times to run this function for taking average.
+        We call these runs as one `repeat` of measurement.
+    repeat: int
+        The number of times to repeat the measurement.
+        In total, the function will be invoked (1 + number x repeat) times,
+        where the first one is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int
+        Minimum repeat time in ms. if the execution latency is too short,
+        increase the number of runs to the given time (in ms) to reduce the measurement error.
+    """
+    evaluator_config = EvaluatorConfig(
+        number=number,
+        repeat=repeat,
+        min_repeat_ms=min_repeat_ms,
+        enable_cpu_cache_flush=False,
+    )
+
+    return HexagonRPCRunner(
+        hexagon_launcher,
+        evaluator_config,
+    )
diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index 0c0bf296df44..9308e396b2a5 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -30,6 +30,7 @@
     AOTExecutorFactoryModule,
     GraphExecutorFactoryModule,
 )
+from .tools import export_module
 
 
 class Session:
@@ -110,6 +111,9 @@ def device(self):
 
         return self._device
 
+    def get_function(self, name):
+        return self._rpc.get_function(name)
+
     def upload(self, local_path: Union[str, pathlib.Path], remote_filename: str) -> pathlib.Path:
         """Upload a local file to the remote workspace.
 
@@ -154,10 +158,8 @@ def load_module(self, module: Union[str, pathlib.Path, tvm.runtime.Module]):
 
         if isinstance(module, tvm.runtime.Module):
             with tempfile.TemporaryDirectory() as temp_dir:
-                temp_dir = pathlib.Path(temp_dir)
                 binary_name = "test_binary.so"
-                binary_path = temp_dir / binary_name
-                module.save(str(binary_path))
+                binary_path = export_module(module, temp_dir, binary_name)
                 remote_file_path = self.upload(binary_path, binary_name)
         else:
             remote_file_path = module
diff --git a/python/tvm/contrib/hexagon/tools.py b/python/tvm/contrib/hexagon/tools.py
index 1aec8c7d565b..3f4adb90f645 100644
--- a/python/tvm/contrib/hexagon/tools.py
+++ b/python/tvm/contrib/hexagon/tools.py
@@ -194,3 +194,10 @@ def create_aot_shared(so_name: Union[str, pathlib.Path], files, hexagon_arch: st
     cross_compile.output_format = "o"
     c_files = [str(file) for file in files]
     cross_compile(str(so_name), c_files, options=compile_options + options)
+
+
+def export_module(module, out_dir, binary_name="test_binary.so"):
+    """Export Hexagon shared object to a file."""
+    binary_path = pathlib.Path(out_dir) / binary_name
+    module.save(str(binary_path))
+    return binary_path
diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py
index 0f1f7d3c2c6a..97cbfc58a6c1 100644
--- a/python/tvm/meta_schedule/default_config.py
+++ b/python/tvm/meta_schedule/default_config.py
@@ -178,7 +178,7 @@ def schedule_rules(  # pylint: disable=redefined-outer-name
         return sch_rules()
     if sch_rules is not None:
         raise TypeError(f"Expected `sch_rules` to be None or callable, but gets: {sch_rules}")
-    if target.kind.name == "llvm":
+    if target.kind.name in ["llvm", "hexagon"]:
         return _DefaultLLVM.schedule_rules()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.schedule_rules()
@@ -194,7 +194,7 @@ def postproc(  # pylint: disable=redefined-outer-name
         return postproc()
     if postproc is not None:
         raise TypeError(f"Expected `postproc` to be None or callable, but gets: {postproc}")
-    if target.kind.name == "llvm":
+    if target.kind.name in ["llvm", "hexagon"]:
         return _DefaultLLVM.postprocs()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.postprocs()
@@ -212,7 +212,7 @@ def mutator_probs(  # pylint: disable=redefined-outer-name
         raise TypeError(
             f"Expected `mutator_probs` to be None or callable, but gets: {mutator_probs}"
         )
-    if target.kind.name == "llvm":
+    if target.kind.name in ["llvm", "hexagon"]:
         return _DefaultLLVM.mutator_probs()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.mutator_probs()
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index a558fcbeaf5b..1e9e2e698c44 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -636,6 +636,8 @@ def hexagon(cpu_ver="v66", **kwargs):
         Whether to use QFloat HVX instructions.
     use_ieee_fp : bool (default: False)
         Whether to use IEEE HVX instructions
+    num_cores : int (default: 4)
+        The number of HVX threads. This attribute is required by meta scheduler.
 
     Note: Floating point support in HVX requires LLVM 14+.
     """
@@ -740,6 +742,9 @@ def create_llvm_options(cpu_ver, config):  # pylint: disable=unused-argument
 
     args_list = target_str.split() + llvm_str.split()
 
+    num_cores = config["num_cores"] if "num_cores" in kwargs else 4
+    args_list.append("--num-cores=%d" % num_cores)
+
     return Target(" ".join(["hexagon"] + args_list))
 
 
diff --git a/python/tvm/tir/tensor_intrin/__init__.py b/python/tvm/tir/tensor_intrin/__init__.py
index f0725b666e3b..7e5a26bdeb43 100644
--- a/python/tvm/tir/tensor_intrin/__init__.py
+++ b/python/tvm/tir/tensor_intrin/__init__.py
@@ -16,4 +16,4 @@
 # under the License.
 # pylint: disable=unused-import
 """Intrinsics for tensorization."""
-from . import arm_cpu, cuda, rocm, x86
+from . import arm_cpu, cuda, rocm, x86, hexagon
diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py
new file mode 100644
index 000000000000..0227312d6373
--- /dev/null
+++ b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,missing-function-docstring
+"""Intrinsics for Hexagon tensorization."""
+from tvm.script import tir as T
+from .. import TensorIntrin
+
+
+@T.prim_func
+def dot_product_32x4_u8u8i32_desc(
+    A: T.Buffer((4,), "uint8", offset_factor=1),
+    B: T.Buffer((32, 4), "uint8", offset_factor=1),
+    C: T.Buffer((32,), "int32", offset_factor=1),
+) -> None:
+    with T.block("root"):
+        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+        T.writes(C[0:32])
+        for i in T.serial(0, 32):
+            with T.init():
+                C[i] = T.int32(0)
+            for k in T.serial(0, 4):
+                with T.block("update"):
+                    vi, vk = T.axis.remap("SR", [i, k])
+                    C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
+
+
+@T.prim_func
+def dot_product_32x4_u8u8i32_vrmpy(
+    A: T.Buffer((4,), "uint8", offset_factor=1),
+    B: T.Buffer((32, 4), "uint8", offset_factor=1),
+    C: T.Buffer((32,), "int32", offset_factor=1),
+) -> None:
+    with T.block("root"):
+        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+        T.writes(C[0:32])
+
+        A_u8x4 = A.vload([0], "uint8x4")
+        A_i32 = T.reinterpret(A_u8x4, dtype="int32")
+
+        B_i8x128 = B.vload([0, 0], dtype="uint8x128")
+        B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+
+        C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
+            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+            T.uint32(3),
+            C[T.ramp(T.int32(0), 1, 32)],
+            B_i32x32,
+            A_i32,
+            dtype="int32x32",
+        )
+
+
+VRMPY_u8u8i32_INTRIN = "dot_32x4_u8u8i32_vrmpy"
+
+TensorIntrin.register(
+    VRMPY_u8u8i32_INTRIN, dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy
+)
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index e3b2d7b096fd..a95f55357f2d 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -417,6 +417,7 @@ TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon)
     .add_attr_option<String>("mcpu")
     .add_attr_option<String>("mtriple")
     .add_attr_option<Array<String>>("llvm-options")
+    .add_attr_option<Integer>("num-cores")
     .set_default_keys({"hexagon"});
 
 TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU);
diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py
new file mode 100644
index 000000000000..96d18c9b3076
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test rpc based launcher for hexagon """
+import pytest
+import numpy as np
+import tempfile
+
+import tvm.testing
+from tvm import te
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.arg_info import TensorInfo
+from tvm.meta_schedule.builder import BuilderInput
+from tvm.script import tir as T
+from tvm.tir import FloatImm
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.meta_schedule.runner import RunnerInput
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+
+MATMUL_N = 16
+MATMUL_M = 32
+
+
+@tvm.script.ir_module
+class MatmulModule:
+    @T.prim_func
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, (16, 16), "float32")
+        B = T.match_buffer(b, (16, 16), "float32")
+        C = T.match_buffer(c, (16, 16), "float32")
+        for i, j, k in T.grid(16, 16, 16):
+            with T.block("matmul"):
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    C[vi, vj] = 0.0
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@tvm.testing.requires_hexagon
+def test_builder_runner(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    mod = MatmulModule
+
+    builder = get_hexagon_local_builder()
+    runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0)
+
+    (builder_result,) = builder.build([BuilderInput(mod, target)])
+    assert builder_result.artifact_path is not None
+    assert builder_result.error_msg is None
+
+    runner_input = RunnerInput(
+        builder_result.artifact_path,
+        "llvm",
+        [
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+        ],
+    )
+
+    # Run the module
+    (runner_future,) = runner.run([runner_input])
+    runner_result = runner_future.result()
+
+    assert runner_result.error_msg is None
+    for result in runner_result.run_secs:
+        if isinstance(result, FloatImm):
+            result = result.value
+        assert isinstance(result, float)
+        assert result >= 0.0
+
+
+def dense(m, n, k):
+    X = te.placeholder((m, k), name="X", dtype="uint8")
+    packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8")
+
+    ak = te.reduce_axis((0, k), name="k")
+    out = te.compute(
+        (m, n),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
+                "int32"
+            ),
+            axis=ak,
+        ),
+        name="compute",
+    )
+    return [X, packedW, out]
+
+
+def schedule_dense(sch, block, M, do_tune):
+    a_y, a_x, _ = sch.get_loops(block)[-3:]
+
+    if do_tune:
+        y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
+        a_yo, a_yi = sch.split(a_y, factors=y_factors)
+    else:
+        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])
+
+    a_xo, a_xi = sch.split(a_x, factors=[None, 32])
+    sch.reorder(a_yo, a_xo, a_yi, a_xi)
+
+    a_xi, a_k = sch.get_loops(block)[-2:]
+    a_ko, a_ki = sch.split(a_k, factors=[None, 4])
+    sch.reorder(a_ko, a_xi, a_ki)
+
+    fused = sch.fuse(a_yo, a_xo)
+
+    sch.parallel(fused)
+
+    dec = sch.decompose_reduction(block, a_ko)
+
+    init_loop = sch.get_loops(dec)[-1]
+    sch.vectorize(init_loop)
+
+    sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
+
+
+def verify_dense(sch, target, M, N, K, hexagon_session):
+    f = tvm.build(sch.mod["main"], target=target, name="dense")
+    mod = hexagon_session.load_module(f)
+    dev = hexagon_session.device
+
+    a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
+    b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
+    c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))
+
+    packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8")
+
+    for r_idx in range(N // 32):
+        for ko in range(K // 4):
+            for s_idx in range(32):
+                for t_idx in range(4):
+                    packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx]
+
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(packW, dev)
+    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+
+    mod(a, b, c)
+    np.testing.assert_equal(c.numpy(), c_np)
+
+    evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
+    gflops = (N * M * K) * 2 / 1e9
+    time_ms = evaluator(a, b, c).mean * 1e3
+    print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
+
+
+@pytest.mark.skip(reason="xgboost not installed on CI")
+@tvm.testing.requires_hexagon
+def test_vrmpy_dense(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    do_tune = True
+    target_hexagon = tvm.target.hexagon("v68")
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+
+    M, N, K = 128, 768, 768
+    workload = te.create_prim_func(dense(M, N, K))
+
+    if not do_tune:
+        ir_module = tvm.IRModule({"main": workload})
+        sch = tvm.tir.Schedule(ir_module)
+        block = sch.get_block("compute")
+        schedule_dense(sch, block, M, do_tune)
+    else:
+        with tempfile.TemporaryDirectory() as work_dir:
+            config = ms.TuneConfig(
+                strategy="replay_trace",
+                num_trials_per_iter=8,
+                max_trials_per_task=8,
+                max_trials_global=8,
+            )
+
+            def schedule_dense_for_tune(sch):
+                block = sch.get_block("compute")
+                return schedule_dense(sch, block, None, True)
+
+            sch = ms.tune_tir(
+                mod=workload,
+                target=target,
+                config=config,
+                work_dir=work_dir,
+                space=ms.space_generator.ScheduleFn(schedule_dense_for_tune),
+                builder=get_hexagon_local_builder(),
+                runner=get_hexagon_rpc_runner(hexagon_launcher, number=10),
+            )
+
+    with hexagon_launcher.start_session() as session:
+        verify_dense(sch, target, M, N, K, session)

From 49b3c72935b290afa9eee1f1c57a4b4c2f10a445 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 26 Aug 2022 10:15:54 -0700
Subject: [PATCH 057/704] [TIR] More hygenic TVM_SREF macros (#12607)

Previously, the `TVM_SREF_TO_BLOCK`, `TVM_SREF_TO_FOR`, and
`TVM_TYPE_AS` macros required both the input and output variables.
The input variable name is useful for improving the error message
returned, but the output variable name isn't necessary for this
functionality, and prevents the macro from being used as part of an
expression.

* Generate an immediately-invoked lambda expression to allow for an
  independently-scoped `result` variable.

* Use parentheses around the input argument, in case the sref is
  the result of an expression.

* Update all call sites to remove the macro argument providing the
  first argument.
---
 src/meta_schedule/mutator/mutate_parallel.cc  |  4 +-
 .../mutator/mutate_thread_binding.cc          |  8 +--
 src/meta_schedule/mutator/mutate_tile_size.cc |  4 +-
 src/meta_schedule/mutator/mutate_unroll.cc    |  4 +-
 .../rewrite_parallel_vectorize_unroll.cc      |  4 +-
 src/meta_schedule/schedule_rule/auto_bind.cc  |  2 +-
 .../schedule_rule/auto_inline.cc              |  2 +-
 .../schedule_rule/multi_level_tiling.cc       |  2 +-
 .../multi_level_tiling_tensor_core.cc         |  4 +-
 .../schedule_rule/random_compute_location.cc  |  2 +-
 src/meta_schedule/utils.h                     |  2 +-
 src/tir/schedule/analysis/analysis.cc         | 48 ++++++++---------
 src/tir/schedule/block_scope.cc               |  2 +-
 src/tir/schedule/concrete_schedule.cc         |  4 +-
 src/tir/schedule/concrete_schedule.h          |  6 +--
 src/tir/schedule/primitive/block_annotate.cc  |  6 +--
 .../schedule/primitive/blockize_tensorize.cc  |  2 +-
 .../schedule/primitive/cache_read_write.cc    | 14 ++---
 src/tir/schedule/primitive/compute_at.cc      | 12 ++---
 src/tir/schedule/primitive/compute_inline.cc  |  8 +--
 .../schedule/primitive/decompose_padding.cc   |  2 +-
 src/tir/schedule/primitive/for_kind.cc        |  4 +-
 src/tir/schedule/primitive/get_block_loop.cc  |  2 +-
 .../primitive/layout_transformation.cc        | 10 ++--
 .../schedule/primitive/loop_transformation.cc | 10 ++--
 src/tir/schedule/primitive/reduction.cc       | 12 ++---
 src/tir/schedule/primitive/sampling.cc        |  2 +-
 src/tir/schedule/state.cc                     | 14 ++---
 src/tir/schedule/transform.cc                 |  6 +--
 src/tir/schedule/utils.h                      | 51 ++++++++++++-------
 30 files changed, 133 insertions(+), 120 deletions(-)

diff --git a/src/meta_schedule/mutator/mutate_parallel.cc b/src/meta_schedule/mutator/mutate_parallel.cc
index 5b7fe7f5148d..82b91da682c6 100644
--- a/src/meta_schedule/mutator/mutate_parallel.cc
+++ b/src/meta_schedule/mutator/mutate_parallel.cc
@@ -64,7 +64,7 @@ const BlockRVNode* GetInstGetBlockOutput(const Instruction& inst) {
     return nullptr;
   }
   ICHECK_EQ(inst->outputs.size(), 1);
-  const BlockRVNode* block = TVM_TYPE_AS(block, inst->outputs[0], BlockRVNode);
+  const BlockRVNode* block = TVM_TYPE_AS(inst->outputs[0], BlockRVNode);
   return block;
 }
 
@@ -82,7 +82,7 @@ std::vector<std::vector<int64_t>> AnalyzeParallel(const ScheduleState& self,
   Array<StmtSRef> block_srefs =
       tir::GetBlocks(self, block_name, self->mod->GetGlobalVar(func_name));
   ICHECK_EQ(block_srefs.size(), 1);
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_srefs[0]);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_srefs[0]);
   ScopeBlockLoopInfo info = GetScopeBlockLoopInfo(GetRef<Block>(block));
   std::vector<std::vector<int64_t>> results;
   results.reserve(info.realizes.size());
diff --git a/src/meta_schedule/mutator/mutate_thread_binding.cc b/src/meta_schedule/mutator/mutate_thread_binding.cc
index 41207162ee1d..de780b53e2d9 100644
--- a/src/meta_schedule/mutator/mutate_thread_binding.cc
+++ b/src/meta_schedule/mutator/mutate_thread_binding.cc
@@ -109,12 +109,12 @@ std::vector<MutateThreadBindingNode::Candidate> MutateThreadBindingNode::FindCan
   for (const Instruction& inst : trace->insts) {
     if (inst->kind.same_as(inst_sample_categorical)) {
       ICHECK_EQ(inst->outputs.size(), 1);
-      const PrimExprNode* var_rv = TVM_TYPE_AS(var_rv, inst->outputs[0], PrimExprNode);
+      const PrimExprNode* var_rv = TVM_TYPE_AS(inst->outputs[0], PrimExprNode);
       sample_insts[var_rv] = inst.get();
     } else if (is_split_by_sample(inst)) {
       CHECK_EQ(inst->outputs.size(), 2);
       // Only consider the inner loop, which can be bound to threadIdx.x
-      const tir::LoopRVNode* var_rv = TVM_TYPE_AS(var_rv, inst->outputs[1], tir::LoopRVNode);
+      const tir::LoopRVNode* var_rv = TVM_TYPE_AS(inst->outputs[1], tir::LoopRVNode);
       sampled_split_insts[var_rv] = inst.get();
     } else if (is_thread_binding_by_sample(inst)) {
       bind_insts.push_back(inst.get());
@@ -122,12 +122,12 @@ std::vector<MutateThreadBindingNode::Candidate> MutateThreadBindingNode::FindCan
   }
 
   for (const InstructionNode* bind_inst : bind_insts) {
-    const auto* loop_rv = TVM_TYPE_AS(loop_rv, bind_inst->inputs[0], tir::LoopRVNode);
+    const auto* loop_rv = TVM_TYPE_AS(bind_inst->inputs[0], tir::LoopRVNode);
     auto split_it = sampled_split_insts.find(loop_rv);
     ICHECK(split_it != sampled_split_insts.end());
     const InstructionNode* split_inst = split_it->second;
 
-    const auto* expr_rv = TVM_TYPE_AS(expr_rv, split_inst->inputs[2], PrimExprNode);
+    const auto* expr_rv = TVM_TYPE_AS(split_inst->inputs[2], PrimExprNode);
     auto sample_it = sample_insts.find(expr_rv);
     ICHECK(sample_it != sample_insts.end());
     const InstructionNode* sample_inst = sample_it->second;
diff --git a/src/meta_schedule/mutator/mutate_tile_size.cc b/src/meta_schedule/mutator/mutate_tile_size.cc
index 00967aef7acd..4a3bfda8a4a8 100644
--- a/src/meta_schedule/mutator/mutate_tile_size.cc
+++ b/src/meta_schedule/mutator/mutate_tile_size.cc
@@ -34,7 +34,7 @@ using tir::Trace;
  * \return The result of downcast
  */
 std::vector<int64_t> DowncastTilingDecision(const ObjectRef& decision) {
-  const auto* arr = TVM_TYPE_AS(arr, decision, runtime::ArrayNode);
+  const auto* arr = TVM_TYPE_AS(decision, runtime::ArrayNode);
   return support::AsVector<ObjectRef, int64_t>(GetRef<Array<ObjectRef>>(arr));
 }
 
@@ -123,7 +123,7 @@ void FindSampleVectorize(const Trace& trace, std::vector<Instruction>* inst,
     if (inst->kind.same_as(inst_sample_categorical)) {
       ICHECK_EQ(inst->outputs.size(), 1);
       if (annotated.count(inst->outputs[0].get())) {
-        const auto* d = TVM_TYPE_AS(d, decision, IntImmNode);
+        const auto* d = TVM_TYPE_AS(decision, IntImmNode);
         instructions.push_back(inst);
         decisions.push_back(d->value);
       }
diff --git a/src/meta_schedule/mutator/mutate_unroll.cc b/src/meta_schedule/mutator/mutate_unroll.cc
index 94e83488584e..c282a171c3b7 100644
--- a/src/meta_schedule/mutator/mutate_unroll.cc
+++ b/src/meta_schedule/mutator/mutate_unroll.cc
@@ -91,7 +91,7 @@ bool FindUnrollDecision(const Trace& trace, TRandState* rand_state,
   for (const Instruction& inst : trace->insts) {
     if (inst->kind.same_as(inst_sample_categorical)) {
       ICHECK_EQ(inst->outputs.size(), 1);
-      const PrimExprNode* var_rv = TVM_TYPE_AS(var_rv, inst->outputs[0], PrimExprNode);
+      const PrimExprNode* var_rv = TVM_TYPE_AS(inst->outputs[0], PrimExprNode);
       sample_insts[var_rv] = inst.get();
     } else if (IsAnnotateWithUnroll(inst)) {
       ann_insts.push_back(inst.get());
@@ -103,7 +103,7 @@ bool FindUnrollDecision(const Trace& trace, TRandState* rand_state,
   }
   const InstructionNode* ann_inst = ann_insts[tir::SampleInt(rand_state, 0, n_ann_insts)];
   ICHECK_EQ(ann_inst->inputs.size(), 2);
-  const auto* var_rv = TVM_TYPE_AS(var_rv, ann_inst->inputs[1], PrimExprNode);
+  const auto* var_rv = TVM_TYPE_AS(ann_inst->inputs[1], PrimExprNode);
   ICHECK(sample_insts.count(var_rv));
   const InstructionNode* sample_inst = sample_insts.at(var_rv);
   ICHECK_EQ(sample_inst->attrs.size(), 2);
diff --git a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc
index f3c2b1328bc3..08d25d017840 100644
--- a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc
+++ b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc
@@ -233,7 +233,7 @@ void AdjustParallelVectorize(const Schedule& sch, const BlockRV& block_rv,
     int64_t prod_extent = 1;
     for (int i = 0; i < n_loops && loop_types[i] == IterVarType::kDataPar; ++i) {
       const StmtSRef& loop_sref = loop_srefs[i];
-      const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+      const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
       if (HasAnnOrBinding(loop)) {
         break;
       }
@@ -262,7 +262,7 @@ void AdjustParallelVectorize(const Schedule& sch, const BlockRV& block_rv,
     for (int i = n_loops - 1;
          i >= 0 && loop_types[i] == IterVarType::kDataPar && num_fusible < max_fusible; --i) {
       const StmtSRef& loop_sref = loop_srefs[i];
-      const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+      const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
       if (HasAnnOrBinding(loop)) {
         break;
       }
diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc
index ff4d26084e57..d8f52fa8e1de 100644
--- a/src/meta_schedule/schedule_rule/auto_bind.cc
+++ b/src/meta_schedule/schedule_rule/auto_bind.cc
@@ -45,7 +45,7 @@ void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block_rv,
   int i_spatial_loop = -1;
   for (int i = 0; i < n; ++i) {
     const StmtSRef& loop_sref = loops[i];
-    const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+    const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
     runtime::ThreadScope thread_scope = GetThreadScope(loop);
     if (IsBlockIdx(thread_scope)) {
       if (i_block_idx == -1) {
diff --git a/src/meta_schedule/schedule_rule/auto_inline.cc b/src/meta_schedule/schedule_rule/auto_inline.cc
index df4d3ac85911..76313f46d1c8 100644
--- a/src/meta_schedule/schedule_rule/auto_inline.cc
+++ b/src/meta_schedule/schedule_rule/auto_inline.cc
@@ -96,7 +96,7 @@ inline InlineType AutoInlineNode::CheckInline(const tir::Schedule& sch,
   StmtSRef block_sref = sch->GetSRef(block_rv);
   bool is_pure_sptial = IsInSpatialPrimFunc(sch, block_sref);
   ScheduleState state = sch->state();
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   BlockRealize realize = GetBlockRealize(state, block_sref);
   // Cond 1. The block has only one write buffer
   if (block->writes.size() != 1) {
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
index eefc2eea411b..c126c854462c 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -37,7 +37,7 @@ namespace tir {
  * of multi-level tiling, so it's intentionally kept inside this file not in the analysis header
  */
 std::vector<int> GetReadBufferNDims(const StmtSRef& block_sref) {
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   const BufferNode* write_buffer = block->writes[0]->buffer.get();
   int n = block->reads.size();
   std::vector<int> results(n, -1);
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 49704fb66b15..7ddda9b2635b 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -411,7 +411,7 @@ Optional<LoopRV> MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin(
   tir::StmtSRef block_sref = state->sch->GetSRef(state->block_rv);
 
   // Add reindex stages
-  const tir::BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const tir::BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   // Hold the reference of the block before reindex
   const tir::Block block_before_reindex = GetRef<tir::Block>(block);
   if (block->reads.size() != 2 || block->writes.size() != 1) {
@@ -488,7 +488,7 @@ Optional<LoopRV> MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin(
     }
     visited_buffers.insert(lhs_buffer);
     // Refresh block pointer (block sref is not invalidated)
-    block = TVM_SREF_TO_BLOCK(block, block_sref);
+    block = TVM_SREF_TO_BLOCK(block_sref);
     const tir::BufferRegion& reindexed_buffer_region = tir::GetNthAccessBufferRegion(
         state->sch->state(), GetRef<tir::Block>(block), buffer_index, index_type);
     auto sub_index_map = f_get_sub_index_map(lhs_buffer, reindexed_buffer_region->region);
diff --git a/src/meta_schedule/schedule_rule/random_compute_location.cc b/src/meta_schedule/schedule_rule/random_compute_location.cc
index e4b5d5bde256..65988dfd5688 100644
--- a/src/meta_schedule/schedule_rule/random_compute_location.cc
+++ b/src/meta_schedule/schedule_rule/random_compute_location.cc
@@ -60,7 +60,7 @@ class RandomComputeLocationNode : public ScheduleRuleNode {
  private:
   bool CheckConditions(const tir::Schedule sch, const tir::BlockRV& block_rv) const {
     tir::StmtSRef block_sref = sch->GetSRef(block_rv);
-    const tir::BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+    TVM_SREF_TO_BLOCK(block_sref);
 
     // Cond 1. The block is not the root block.
     if (block_sref->parent == nullptr) {
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index cb84596eed11..664a6a609e7f 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -238,7 +238,7 @@ inline std::string Concat(const Array<String>& strs, const std::string& delim) {
  */
 inline tir::BlockRV GetRVFromSRef(const tir::Schedule& sch, const tir::StmtSRef& block_sref,
                                   const String& global_var_name) {
-  const tir::BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const tir::BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   return sch->GetBlock(block->name_hint, global_var_name);
 }
 
diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index 62ec0b468f9d..b9e99257f37c 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -150,7 +150,7 @@ Definition of a scope that is a stage pipeline:
   if (require_stage_pipeline) {
     bool stage_pipeline = self->GetBlockInfo(scope_root_sref).scope->stage_pipeline;
     if (stage_pipeline == false) {
-      const BlockNode* block = TVM_SREF_TO_BLOCK(block, scope_root_sref);
+      const BlockNode* block = TVM_SREF_TO_BLOCK(scope_root_sref);
       throw NotStagePipelineError(self->mod, GetRef<Block>(block));
     }
   }
@@ -229,7 +229,7 @@ bool IsDominantBlock(const ScheduleState& self, const StmtSRef& scope_root_sref,
     }
   }
   // Check whether the input block is the only writer of its outputs
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   for (const BufferRegion& write_region : block->writes) {
     if (buffer_writers.count(write_region->buffer)) {
       if (buffer_writers.at(write_region->buffer).size() != 1) {
@@ -252,7 +252,7 @@ bool IsDominantBlock(const ScheduleState& self, const StmtSRef& scope_root_sref,
 int CheckCompleteBlockErrorCode(const ScheduleState& self, const StmtSRef& block_sref,
                                 const StmtSRef& scope_root_sref) {
   // Cond 1. All block vars are data parallel
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   for (const IterVar& iter_var : block->iter_vars) {
     if (iter_var->iter_type != kDataPar) {
       return 1;
@@ -328,7 +328,7 @@ void CheckCompleteBlock(const ScheduleState& self, const StmtSRef& block_sref,
 
   int error_code = CheckCompleteBlockErrorCode(self, block_sref, scope_root_sref);
   if (error_code != 0) {
-    const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+    const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
     throw IncompleteBlockError(self->mod, GetRef<Block>(block), error_code);
   }
 }
@@ -344,7 +344,7 @@ void CheckCompleteBlock(const ScheduleState& self, const StmtSRef& block_sref,
  */
 int CheckReductionBlockErrorCode(const ScheduleState& self, const StmtSRef& block_sref,
                                  const StmtSRef& scope_root_sref) {
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   // Cond 1. The block has the `init` statement.
   if (!block->init.defined()) {
     return 1;
@@ -394,7 +394,7 @@ void CheckReductionBlock(const ScheduleState& self, const StmtSRef& block_sref,
 
   int error_code = CheckReductionBlockErrorCode(self, block_sref, scope_root_sref);
   if (error_code != 0) {
-    const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+    const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
     throw NotReductionBlockError(self->mod, GetRef<Block>(block), error_code);
   }
 }
@@ -441,7 +441,7 @@ void CheckCompleteOrReductionBlock(const ScheduleState& self, const StmtSRef& bl
   if (reduction_block_error_code == 0) {
     return;
   }
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   throw NotCompleteOrReductionBlockError(self->mod, GetRef<Block>(block), complete_block_error_code,
                                          reduction_block_error_code);
 }
@@ -491,7 +491,7 @@ void CheckSubtreeCompactDataflow(const ScheduleState& self, const StmtSRef& subt
     int local_complete_block_code = CheckCompleteBlockErrorCode(self, block_sref, subtree_root),
         local_reduction_block_code = CheckReductionBlockErrorCode(self, block_sref, subtree_root);
     if (local_complete_block_code != 0 && local_reduction_block_code != 0) {
-      const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+      const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
       throw NotCompactDataFlowError(self->mod, GetRef<Stmt>(subtree_root->stmt),
                                     GetRef<Block>(block), local_complete_block_code,
                                     local_reduction_block_code);
@@ -501,8 +501,8 @@ void CheckSubtreeCompactDataflow(const ScheduleState& self, const StmtSRef& subt
 
 bool IsOutputBlock(const ScheduleState& self, const StmtSRef& block_sref,
                    const StmtSRef& scope_root_sref) {
-  const BlockNode* scope_root = TVM_SREF_TO_BLOCK(scope_root, scope_root_sref);
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* scope_root = TVM_SREF_TO_BLOCK(scope_root_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   std::unordered_set<const BufferNode*> scope_allocated;
   scope_allocated.reserve(scope_root->alloc_buffers.size());
   for (const Buffer& buffer : scope_root->alloc_buffers) {
@@ -532,7 +532,7 @@ void CheckNotOutputBlock(const ScheduleState& self, const StmtSRef& block_sref,
     Block block_;
   };
   if (IsOutputBlock(self, block_sref, scope_root_sref)) {
-    const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+    const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
     throw OutputBlockError(self->mod, GetRef<Block>(block));
   }
 }
@@ -547,12 +547,12 @@ std::vector<IterVarType> GetBlockVarTypes(const BlockNode* block) {
 }
 
 std::vector<IterVarType> GetBlockVarTypes(const StmtSRef& block_sref) {
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   return GetBlockVarTypes(block);
 }
 
 bool IsWriteCache(const StmtSRef& block_sref) {
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   if (block->writes.size() != 1) {
     return false;
   }
@@ -751,7 +751,7 @@ void CheckLoopStartsWithZero(const ScheduleState& self, const StmtSRef& loop_sre
     IRModule mod_;
     For loop_;
   };
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
   if (!analyzer->CanProve(loop->min == 0)) {
     throw LoopNotStartWithZeroError(self->mod, GetRef<For>(loop));
   }
@@ -856,7 +856,7 @@ BlockRealize GetBlockRealize(const ScheduleState& self, const StmtSRef& block_sr
     const BlockRealizeNode* result;
   };
 
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   if (block_sref->parent == nullptr) {
     const PrimFuncNode* func = GetRootPrimFunc(self->mod, block, nullptr);
     return Downcast<BlockRealize>(func->body);
@@ -870,7 +870,7 @@ BlockRealize GetBlockRealize(const ScheduleState& self, const StmtSRef& block_sr
 }
 
 IterVarType GetLoopIterType(const StmtSRef& loop_sref) {
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
   const Var& loop_var = loop->loop_var;
   int n_spatial = 0;
   int n_reduce = 0;
@@ -1924,7 +1924,7 @@ void CheckStorageScope(const ScheduleState& self, String storage_scope) {
 }
 
 bool IsSpatial(const StmtSRef& block_sref) {
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   for (const IterVar& iter_var : block->iter_vars) {
     if (iter_var->iter_type != IterVarType::kDataPar) {
       return false;
@@ -1934,14 +1934,14 @@ bool IsSpatial(const StmtSRef& block_sref) {
 }
 
 bool IsTrivialBinding(const ScheduleState& self, const StmtSRef& block_sref) {
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  TVM_SREF_TO_BLOCK(block_sref);
   Array<StmtSRef> loops = GetLoops(block_sref);
   Array<PrimExpr> binds = GetBlockRealize(self, block_sref)->iter_values;
   if (loops.size() != binds.size()) {
     return false;
   }
   for (int i = 0, n = loops.size(); i < n; ++i) {
-    const ForNode* loop = TVM_SREF_TO_FOR(loop, loops[i]);
+    const ForNode* loop = TVM_SREF_TO_FOR(loops[i]);
     if (binds[i].get() != loop->loop_var.get()) {
       return false;
     }
@@ -1953,7 +1953,7 @@ bool NeedsMultiLevelTiling(const ScheduleState& self, const StmtSRef& block_sref
   if (HasBeenMultiLevelTiled(block_sref)) {
     return false;
   }
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   if (block->writes.size() != 1 || block->reads.empty() || IsSpatial(block_sref) ||
       !IsTrivialBinding(self, block_sref)) {
     return false;
@@ -2065,7 +2065,7 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self,   //
                                         const tir::StmtSRef& block_sref,  //
                                         int64_t max_parallel_extent,      //
                                         int64_t max_parallel_basic) {
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   Array<tir::StmtSRef> loops = tir::GetLoops(block_sref);
 
   // Cond 1. The block has only one write buffer
@@ -2100,9 +2100,9 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self,   //
     }
 
     // Cond 5.
-    const ForNode* loop_i = TVM_SREF_TO_FOR(loop_i, loops[i]);
+    const ForNode* loop_i = TVM_SREF_TO_FOR(loops[i]);
     if (i < loops.size() - 1) {
-      const ForNode* loop_i1 = TVM_SREF_TO_FOR(loop_i1, loops[i + 1]);
+      const ForNode* loop_i1 = TVM_SREF_TO_FOR(loops[i + 1]);
       if (loop_i->body.get() != loop_i1) {
         return false;
       }
@@ -2194,7 +2194,7 @@ Optional<TensorizeInfo> GetTensorizeLoopMapping(const tir::ScheduleState& self,
   TensorIntrinDescInfo desc_info = ExtractTensorIntrinDescInfo(&analyzer, desc_func);
   // Step 2. Collect loops from block_sref
   const tir::StmtSRef& scope_sref = GetScopeRoot(self, block_sref, false);
-  const tir::BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_block, scope_sref);
+  TVM_SREF_TO_BLOCK(scope_sref);
   std::vector<const tir::ForNode*> block_loops;
   std::unordered_set<const tir::VarNode*> block_loop_vars;
   {
diff --git a/src/tir/schedule/block_scope.cc b/src/tir/schedule/block_scope.cc
index f1ce65e48e03..31452f4a8f15 100644
--- a/src/tir/schedule/block_scope.cc
+++ b/src/tir/schedule/block_scope.cc
@@ -76,7 +76,7 @@ BlockScope::BlockScope(const Array<StmtSRef>& child_block_srefs) {
   SMap<Buffer, Array<StmtSRef>> buffer_readers;
   SMap<Buffer, Array<StmtSRef>>& buffer_writers = n->buffer_writers;
   for (const StmtSRef& child_block_sref : child_block_srefs) {
-    const BlockNode* child_block = TVM_SREF_TO_BLOCK(child_block, child_block_sref);
+    const BlockNode* child_block = TVM_SREF_TO_BLOCK(child_block_sref);
     // Step 1. Update `buffer_readers` and `buffer_writers` for each buffer
     for (const BufferRegion& region : child_block->reads) {
       buffer_readers[region->buffer].push_back(child_block_sref);
diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc
index 5f773a02d6ff..afc675799706 100644
--- a/src/tir/schedule/concrete_schedule.cc
+++ b/src/tir/schedule/concrete_schedule.cc
@@ -269,7 +269,7 @@ BlockRV ConcreteScheduleNode::GetBlock(const String& name, const Optional<String
         : name_(name), mod_(mod), blocks_{} {
       blocks_.reserve(blocks.size());
       for (const StmtSRef& block_sref : blocks) {
-        const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+        const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
         blocks_.push_back(GetRef<Block>(block));
       }
     }
@@ -432,7 +432,7 @@ Array<LoopRV> ConcreteScheduleNode::Split(const LoopRV& loop_rv,
 
   // Prepare for the splitting
   StmtSRef loop_sref = this->GetSRef(loop_rv);
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
   Array<PrimExpr> factors;
   factors.reserve(factor_rvs.size());
   int infer_index = -1;
diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h
index 92b9de408873..e79d1d528809 100644
--- a/src/tir/schedule/concrete_schedule.h
+++ b/src/tir/schedule/concrete_schedule.h
@@ -206,13 +206,13 @@ class ConcreteScheduleNode : public ScheduleNode {
 
 inline Block ConcreteScheduleNode::Get(const BlockRV& block_rv) const {
   StmtSRef sref = this->GetSRef(block_rv);
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(sref);
   return GetRef<Block>(block);
 }
 
 inline For ConcreteScheduleNode::Get(const LoopRV& loop_rv) const {
   StmtSRef sref = this->GetSRef(loop_rv);
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(sref);
   return GetRef<For>(loop);
 }
 
@@ -223,7 +223,7 @@ inline PrimExpr ConcreteScheduleNode::Get(const ExprRV& expr_rv) const {
       LOG(FATAL) << "IndexError: Cannot find corresponding ExprRV: " << var;
     }
     const ObjectRef& obj = (*it).second;
-    const auto* int_imm = TVM_TYPE_AS(int_imm, obj, IntImmNode);
+    const auto* int_imm = TVM_TYPE_AS(obj, IntImmNode);
     return Integer(int_imm->value);
   });
   return this->analyzer_->Simplify(transformed);
diff --git a/src/tir/schedule/primitive/block_annotate.cc b/src/tir/schedule/primitive/block_annotate.cc
index 2d876d9bf7fa..31c938313fed 100644
--- a/src/tir/schedule/primitive/block_annotate.cc
+++ b/src/tir/schedule/primitive/block_annotate.cc
@@ -238,7 +238,7 @@ class StorageScopeMutator : private ReplaceBufferMutator {
 
 void StorageAlign(ScheduleState self, const StmtSRef& block_sref, int buffer_index, int axis,
                   int factor, int offset) {
-  const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref);
+  const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref);
   Buffer buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block_ptr), buffer_index, BufferIndexType::kWrite);
   StorageAlignInvalidFactorError::Check(self->mod, factor);
@@ -274,7 +274,7 @@ void StorageAlign(ScheduleState self, const StmtSRef& block_sref, int buffer_ind
 
 void SetScope(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
               const String& storage_scope) {
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   Buffer buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block), buffer_index, BufferIndexType::kWrite);
 
@@ -289,7 +289,7 @@ void SetScope(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
   // Step 3. Get the allocation site of the target buffer.
   StmtSRef alloc_site_sref =
       NonAllocatedBufferError::CheckAndGetBufferAllocationSite(self->mod, block_sref, buffer);
-  const BlockNode* alloc_site = TVM_SREF_TO_BLOCK(alloc_site, alloc_site_sref);
+  const BlockNode* alloc_site = TVM_SREF_TO_BLOCK(alloc_site_sref);
 
   // Step 4. Recursively replace the old buffer to a new buffer, where the new buffer has the given
   // storage scope. In the meanwhile, collect the block sref reuse information.
diff --git a/src/tir/schedule/primitive/blockize_tensorize.cc b/src/tir/schedule/primitive/blockize_tensorize.cc
index cf6532e82d46..7481a7c92494 100644
--- a/src/tir/schedule/primitive/blockize_tensorize.cc
+++ b/src/tir/schedule/primitive/blockize_tensorize.cc
@@ -426,7 +426,7 @@ Stmt MakeLoopNest(Stmt stmt, const std::vector<const ForNode*>& loops) {
 
 BlockRealize BlockizeImpl(const ScheduleState& self, const StmtSRef& loop_sref,
                           Map<Block, Block>* block_sref_reuse, arith::Analyzer* analyzer) {
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  TVM_SREF_TO_FOR(loop_sref);
   // Step 1: Check and get the only block under `loop`.
   BlockRealize block_realize = CheckGetSingleChildBlockRealizeOnSRefTree(self, loop_sref);
   Block block = block_realize->block;
diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index 529d3333cd18..a221733eb394 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -31,7 +31,7 @@ class NotSingleWriteBlock : public ScheduleError {
     ICHECK_GT(write_blocks.size(), 1);
     write_blocks_.reserve(write_blocks.size());
     for (const StmtSRef& block_sref : write_blocks) {
-      const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+      const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
       write_blocks_.push_back(GetRef<Block>(block));
     }
   }
@@ -532,7 +532,7 @@ class CacheReadRewriter : public StmtExprMutator {
     bool is_consumer = info_->consumer_blocks.empty();
     // Otherwise check if this is one of the specified blocks.
     for (StmtSRef consumer_sref : info_->consumer_blocks) {
-      const BlockNode* consumer_node = TVM_SREF_TO_BLOCK(consumer_node, consumer_sref);
+      const BlockNode* consumer_node = TVM_SREF_TO_BLOCK(consumer_sref);
       Block consumer_block = GetRef<Block>(consumer_node);
       if (old_stmt.same_as(consumer_block)) {
         is_consumer = true;
@@ -999,11 +999,11 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff
   CheckStorageScope(self, storage_scope);
 
   // Step 1. Check index, getting the target buffer and the parent scope
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   Buffer read_buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block), read_buffer_index, BufferIndexType::kRead);
   StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true);
-  const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_block, scope_sref);
+  const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref);
 
   // Step 2. Create CacheStageInfo
   CacheStageInfo info;
@@ -1020,7 +1020,7 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff
   if (Optional<StmtSRef> _write_block_sref = GetOnlyWriteBlock(self, scope_sref, read_buffer)) {
     // Case 1. The buffer is written inside the block.
     StmtSRef write_block_sref = _write_block_sref.value();
-    const BlockNode* write_block = TVM_SREF_TO_BLOCK(write_block, write_block_sref);
+    const BlockNode* write_block = TVM_SREF_TO_BLOCK(write_block_sref);
     // Find the producing region
     BufferRegion region = GetBufferRegionFromBuffer(write_block->writes, read_buffer).value();
     StmtSRef parent_sref = GetRef<StmtSRef>(write_block_sref->parent);
@@ -1072,7 +1072,7 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu
   CheckStorageScope(self, storage_scope);
 
   // Step 1. Checking index, getting the target buffer and the parent scope
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   Buffer write_buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block), write_buffer_index, BufferIndexType::kWrite);
   StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true);
@@ -1114,7 +1114,7 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu
 
 StmtSRef ReIndex(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
                  BufferIndexType buffer_index_type) {
-  const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref);
+  const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref);
   Block block = GetRef<Block>(block_ptr);
   Buffer buffer = GetNthAccessBuffer(self, block, buffer_index, buffer_index_type);
   StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true);
diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc
index 8baedfd70dd0..83342e351b91 100644
--- a/src/tir/schedule/primitive/compute_at.cc
+++ b/src/tir/schedule/primitive/compute_at.cc
@@ -37,7 +37,7 @@ class NotAllRequiredBlocksAreVisitedError : public ScheduleError {
       : mod_(mod), num_not_visited_(num_not_visited) {
     required_.reserve(required.size());
     for (const StmtSRef& block_sref : required) {
-      const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+      const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
       required_.push_back(GetRef<Block>(block));
     }
   }
@@ -306,14 +306,14 @@ class ScopeReconstructor : private StmtMutator {
       return GetRef<Block>(block);
     }
     if (block == rm_src_stmt_.get()) {
-      block = TVM_TYPE_AS(block, rm_tgt_stmt_, BlockNode);
+      block = TVM_TYPE_AS(rm_tgt_stmt_, BlockNode);
     }
     return StmtMutator::VisitStmt_(block);
   }
 
   Stmt VisitStmt_(const ForNode* loop) final {
     if (loop == rm_src_stmt_.get()) {
-      loop = TVM_TYPE_AS(loop, rm_tgt_stmt_, ForNode);
+      loop = TVM_TYPE_AS(rm_tgt_stmt_, ForNode);
     }
     if (loop == loop_.get()) {
       return new_loop_;
@@ -559,7 +559,7 @@ void CalculateProvidedRequiredRegions(
   }
   // Step 2. Calculate the region required by dependent blocks under `loop`
   for (const StmtSRef& required_block_sref : is_compute_at ? consumer_srefs : producer_srefs) {
-    const BlockNode* required_block = TVM_SREF_TO_BLOCK(required_block, required_block_sref);
+    const BlockNode* required_block = TVM_SREF_TO_BLOCK(required_block_sref);
     ICHECK(block2realize.count(required_block));
     RelaxBufferRegions</*relax_storage_scope=*/is_compute_at>(
         /*binding=*/GetBindings(GetRef<BlockRealize>(block2realize.at(required_block))),
@@ -576,8 +576,8 @@ void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_s
                                      const StmtSRef& loop_sref, bool preserve_unit_loops,
                                      arith::Analyzer* analyzer, bool check_only = false,
                                      int index = -1) {
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
   // Step 1. Bunch of checks
   // Check condition 1) : scope stage pipeline
   StmtSRef scope_root_sref = GetScopeRoot(self, block_sref,
diff --git a/src/tir/schedule/primitive/compute_inline.cc b/src/tir/schedule/primitive/compute_inline.cc
index ad15e06e285a..bfda66036fe3 100644
--- a/src/tir/schedule/primitive/compute_inline.cc
+++ b/src/tir/schedule/primitive/compute_inline.cc
@@ -174,7 +174,7 @@ class NonSingleProducerError : public ScheduleError {
         }
       }
     }
-    const BlockNode* block = TVM_SREF_TO_BLOCK(block, consumer_block_sref);
+    const BlockNode* block = TVM_SREF_TO_BLOCK(consumer_block_sref);
     throw NonSingleProducerError(self->mod, GetRef<Block>(block));
   }
 };
@@ -183,7 +183,7 @@ class OpaqueAccessError : public ScheduleError {
  public:
   explicit OpaqueAccessError(IRModule mod, StmtSRef scope_root_sref)
       : mod_(mod), scope_root_(nullptr) {
-    const BlockNode* scope_root = TVM_SREF_TO_BLOCK(scope_root, scope_root_sref);
+    const BlockNode* scope_root = TVM_SREF_TO_BLOCK(scope_root_sref);
     this->scope_root_ = GetRef<Block>(scope_root);
   }
 
@@ -653,7 +653,7 @@ class ReverseComputeInliner : public BaseInliner {
 
 void ComputeInlineImpl(ScheduleState self, const StmtSRef& producer_block_sref,
                        bool check_only = false) {
-  const BlockNode* _producer_block = TVM_SREF_TO_BLOCK(_producer_block, producer_block_sref);
+  const BlockNode* _producer_block = TVM_SREF_TO_BLOCK(producer_block_sref);
   Block producer_block = GetRef<Block>(_producer_block);
   HasInitBlock::Check(self->mod, producer_block);
   Buffer inlined_buffer = NotSingleReadWriteBuffer::GetSingleWrite(self, producer_block);
@@ -698,7 +698,7 @@ bool CanComputeInline(const ScheduleState& self, const StmtSRef& producer_block_
 
 void ReverseComputeInlineImpl(ScheduleState self, const StmtSRef& consumer_block_sref,
                               bool check_only = false) {
-  const BlockNode* _consumer_block = TVM_SREF_TO_BLOCK(_consumer_block, consumer_block_sref);
+  const BlockNode* _consumer_block = TVM_SREF_TO_BLOCK(consumer_block_sref);
   Block consumer_block = GetRef<Block>(_consumer_block);
   HasInitBlock::Check(self->mod, consumer_block);
   // Step 1. Get the scope block
diff --git a/src/tir/schedule/primitive/decompose_padding.cc b/src/tir/schedule/primitive/decompose_padding.cc
index 365c6d43f127..93fb88e66619 100644
--- a/src/tir/schedule/primitive/decompose_padding.cc
+++ b/src/tir/schedule/primitive/decompose_padding.cc
@@ -415,7 +415,7 @@ StmtSRef DecomposePaddingImpl(ScheduleState self, const StmtSRef& block_sref,
    *    - trim original block to write non-padding part only
    */
   // Condition Checks and Information Collection
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   const BlockRealizeNode* realize = GetBlockRealize(self, block_sref).get();
   Map<Var, Range> dom_map;
   arith::Analyzer analyzer;
diff --git a/src/tir/schedule/primitive/for_kind.cc b/src/tir/schedule/primitive/for_kind.cc
index ec337224e59d..cc8cb55fd3fa 100644
--- a/src/tir/schedule/primitive/for_kind.cc
+++ b/src/tir/schedule/primitive/for_kind.cc
@@ -145,7 +145,7 @@ void CheckParallelizability(const ScheduleState& self, const For& loop, ForKind
  */
 void ParallelizeComputation(const ScheduleState& self, const StmtSRef& loop_sref, ForKind for_kind,
                             Optional<IterVar> thread_axis) {
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
 
   /*
    * Check:
@@ -186,7 +186,7 @@ void Bind(ScheduleState self, const StmtSRef& loop_sref, const IterVar& thread_a
 }
 
 void Unroll(ScheduleState self, const StmtSRef& loop_sref) {
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
   ObjectPtr<ForNode> new_loop = make_object<ForNode>(*loop);
   new_loop->kind = ForKind::kUnrolled;
   new_loop->thread_binding = NullOpt;
diff --git a/src/tir/schedule/primitive/get_block_loop.cc b/src/tir/schedule/primitive/get_block_loop.cc
index 746918ac4e34..cbdb99c6444f 100644
--- a/src/tir/schedule/primitive/get_block_loop.cc
+++ b/src/tir/schedule/primitive/get_block_loop.cc
@@ -40,7 +40,7 @@ Array<StmtSRef> GetBlocks(const ScheduleState& self, const String& name, const G
   };
 
   BaseFunc func = self->mod->Lookup(gv);
-  const auto* prim_func = TVM_TYPE_AS(prim_func, func, PrimFuncNode);
+  const auto* prim_func = TVM_TYPE_AS(func, PrimFuncNode);
   Finder finder(self, name);
   finder(prim_func->body);
   return std::move(finder.results_);
diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc
index 148b3ee033c3..b4e40fa120fe 100644
--- a/src/tir/schedule/primitive/layout_transformation.cc
+++ b/src/tir/schedule/primitive/layout_transformation.cc
@@ -134,7 +134,7 @@ class BufferIsSubregionError : public ScheduleError {
 
 void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
                      BufferIndexType buffer_index_type, const IndexMap& index_map) {
-  const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref);
+  const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref);
   Buffer old_buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block_ptr), buffer_index, buffer_index_type);
   Optional<StmtSRef> defining_site_sref;
@@ -147,7 +147,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_
   StmtSRef scope_sref = defining_site_sref.defined()
                             ? defining_site_sref.value()
                             : GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
-  const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_block, scope_sref);
+  const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref);
 
   // Step 1: Infer the shape of the new buffer
   ObjectPtr<BufferNode> new_buffer_node = make_object<BufferNode>(*(old_buffer.get()));
@@ -344,7 +344,7 @@ class OpaqueNewIterTypeError : public ScheduleError {
 
 void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
                           const IndexMap& index_map) {
-  const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref);
+  const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref);
   const Block& block = GetRef<Block>(block_ptr);
   arith::Analyzer analyzer;
 
@@ -489,7 +489,7 @@ class BufferAxisSeparatorMutator : private ReplaceBufferMutator {
 
 void SetAxisSeparator(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
                       BufferIndexType buffer_index_type, const Array<IntImm>& axis_separators) {
-  const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref);
+  const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref);
   Buffer old_buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block_ptr), buffer_index, buffer_index_type);
   Optional<StmtSRef> defining_site_sref;
@@ -502,7 +502,7 @@ void SetAxisSeparator(ScheduleState self, const StmtSRef& block_sref, int buffer
   StmtSRef scope_sref = defining_site_sref.defined()
                             ? defining_site_sref.value()
                             : GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
-  const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_block, scope_sref);
+  const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref);
 
   // Step 1: Check and update axis_separators of the buffer.
   Buffer new_buffer = old_buffer;
diff --git a/src/tir/schedule/primitive/loop_transformation.cc b/src/tir/schedule/primitive/loop_transformation.cc
index f1b6f46e1b8f..2db3eb902aba 100644
--- a/src/tir/schedule/primitive/loop_transformation.cc
+++ b/src/tir/schedule/primitive/loop_transformation.cc
@@ -87,7 +87,7 @@ class IterMapSimplifyBlockBinding : public StmtExprMutator {
                               bool preserve_unit_iters) {
     Map<Var, Range> loop_var2extent;
     for (const StmtSRef& sref : loop_srefs) {
-      const ForNode* loop = TVM_SREF_TO_FOR(loop, sref);
+      const ForNode* loop = TVM_SREF_TO_FOR(sref);
       loop_var2extent.Set(loop->loop_var, Range::FromMinExtent(loop->min, loop->extent));
     }
     return Downcast<For>(IterMapSimplifyBlockBinding(opaque_blocks, std::move(loop_var2extent),
@@ -389,7 +389,7 @@ Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, const Array
   // - The execution order has not changed. (The block executes with the same args and the same
   // order with before.
   // Step 1. Check correctness
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
   if (!loop->annotations.empty() || loop->thread_binding.defined()) {
     throw HasAnnotationOrThreadBindingError(self->mod, GetRef<For>(loop));
   }
@@ -445,7 +445,7 @@ Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, const Array
   result_srefs.reserve(n);
   for (int i = 0; i < n; i++) {
     result_srefs.push_back(self->stmt2ref.at(new_stmt.get()));
-    const ForNode* outer_loop = TVM_TYPE_AS(outer_loop, new_stmt, ForNode);
+    const ForNode* outer_loop = TVM_TYPE_AS(new_stmt, ForNode);
     new_stmt = outer_loop->body;
   }
   return result_srefs;
@@ -464,7 +464,7 @@ StmtSRef Fuse(ScheduleState self, const Array<StmtSRef>& loop_srefs, bool preser
   std::unordered_set<const VarNode*> outer_loop_vars;
   // Step 1. check correctness
   for (const StmtSRef& sref : loop_srefs) {
-    const ForNode* loop = TVM_SREF_TO_FOR(loop, sref);
+    const ForNode* loop = TVM_SREF_TO_FOR(sref);
     if (!loop->annotations.empty() || loop->thread_binding.defined()) {
       throw HasAnnotationOrThreadBindingError(self->mod, GetRef<For>(loop));
     }
@@ -554,7 +554,7 @@ std::unordered_set<const StmtSRefNode*> CollectLoopsIntoSet(
   for (const StmtSRef& loop_sref : ordered_loop_srefs) {
     auto inserted = loop_srefs.insert(loop_sref.get());
     if (!inserted.second) {
-      const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+      const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
       throw LoopMultiAppearanceError(self->mod, GetRef<For>(loop));
     }
   }
diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc
index ad9043e4f2db..7a4ace736e48 100644
--- a/src/tir/schedule/primitive/reduction.cc
+++ b/src/tir/schedule/primitive/reduction.cc
@@ -123,7 +123,7 @@ class LoopHeightError : public ScheduleError {
         // loop_var of a higher loop shouldn't contain loop var
         const Var& loop_var = higher_loop->StmtAs<ForNode>()->loop_var;
         if (UsesVar(binding, [v = loop_var.get()](const VarNode* var) { return var == v; })) {
-          const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+          const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
           throw LoopHeightError(mod, GetRef<For>(loop), GetRef<Block>(block));
         }
       }
@@ -183,8 +183,8 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref,
    *    - generate corresponding init block and update block
    */
   // Condition Checks and Information Collection
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
   // Get the outer loops from high to low
   Array<StmtSRef> loops = GetLoops(block_sref);
   const BlockRealizeNode* realize = GetBlockRealize(self, block_sref).get();
@@ -264,7 +264,7 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref,
   std::unordered_map<Var, PrimExpr, ObjectPtrHash, ObjectPtrEqual> loop_var_map;
   Stmt body = BlockRealize(init_realize);
   for (int i : chosen_loops) {
-    const ForNode* old_loop = TVM_SREF_TO_FOR(old_loop, loops[i]);
+    const ForNode* old_loop = TVM_SREF_TO_FOR(loops[i]);
     // Create a new equivalent to the chosen loop
     Var old_loop_var = old_loop->loop_var;
     Var new_loop_var = old_loop_var.copy_with_suffix("_init");
@@ -277,7 +277,7 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref,
   }
   body = Substitute(body, loop_var_map);
   // Step 6. Mutate IR
-  const BlockNode* old_scope_root = TVM_SREF_TO_BLOCK(old_scope_root, scope_root_sref);
+  const BlockNode* old_scope_root = TVM_SREF_TO_BLOCK(scope_root_sref);
   Block new_scope_root{nullptr};
   Block new_reduction_block{nullptr};
   std::tie(new_scope_root, new_reduction_block) = DecomposeReductionBlockReplacer::Replace(
@@ -1013,7 +1013,7 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax
   StmtSRef scope_root = GetScopeRoot(self, block_sref,  //
                                      /*require_stage_pipeline=*/true);
   CheckReductionBlock(self, block_sref, scope_root);
-  const ForNode* rf_loop = TVM_SREF_TO_FOR(rf_loop, rf_loop_sref);
+  const ForNode* rf_loop = TVM_SREF_TO_FOR(rf_loop_sref);
   if (rf_loop->kind != ForKind::kSerial) {
     throw NotSerialLoopKindError(self->mod, GetRef<For>(rf_loop));
   }
diff --git a/src/tir/schedule/primitive/sampling.cc b/src/tir/schedule/primitive/sampling.cc
index 1961565aac75..52b5add2bc9e 100644
--- a/src/tir/schedule/primitive/sampling.cc
+++ b/src/tir/schedule/primitive/sampling.cc
@@ -311,7 +311,7 @@ std::vector<int64_t> SamplePerfectTile(
     support::LinearCongruentialEngine::TRandState* rand_state,  //
     const tir::StmtSRef& loop_sref, int32_t n_splits, int32_t max_innermost_factor,
     Optional<Array<Integer>>* decision) {
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
   const int64_t* extent = GetLoopIntExtent(loop);
   std::vector<int64_t> result;
   if (extent == nullptr) {
diff --git a/src/tir/schedule/state.cc b/src/tir/schedule/state.cc
index 07481ddb19e3..15d0e08ddc2c 100644
--- a/src/tir/schedule/state.cc
+++ b/src/tir/schedule/state.cc
@@ -208,7 +208,7 @@ class BlockInfoCollector : private StmtVisitor {
     if (is_root_block) {
       // If the block doesn't have outer loops and BlockRealize,
       // then we set the affine binding flag as true only if the block has no block vars
-      const BlockNode* block = TVM_SREF_TO_BLOCK(block, scope_root);
+      const BlockNode* block = TVM_SREF_TO_BLOCK(scope_root);
       if (block->iter_vars.empty()) info.affine_binding = true;
     } else {
       info.affine_binding =
@@ -233,7 +233,7 @@ class BlockInfoCollector : private StmtVisitor {
     block_reads_unbound.reserve(child_block_srefs.size());
     block_writes_unbound.reserve(child_block_srefs.size());
     for (const StmtSRef& block_sref : child_block_srefs) {
-      const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+      const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
       Map<Var, PrimExpr> binding = GetBindings(block2realize_.at(block));
       // Step 1.1. Unbind read regions
       Array<BufferRegion> reads;
@@ -254,7 +254,7 @@ class BlockInfoCollector : private StmtVisitor {
     for (const auto& kv : info.scope->dst2deps) {
       const StmtSRef& consumer_block_sref = kv.first;
       const Array<Dependency>& deps = kv.second;
-      const BlockNode* consumer_block = TVM_SREF_TO_BLOCK(consumer_block, consumer_block_sref);
+      const BlockNode* consumer_block = TVM_SREF_TO_BLOCK(consumer_block_sref);
       const BlockRealize& consumer_realize = block2realize_.at(consumer_block);
       bool& region_cover = self_->block_info.at(consumer_block_sref).region_cover = true;
       // Step 2.1. Extract the path to the scope root
@@ -851,7 +851,7 @@ class ChildReplacer : private StmtMutator {
       } else if (const auto* realize = stmt.as<BlockRealizeNode>()) {
         // Case 2. stmt is BlockRealize, src_stmt is Block
         if (realize->block.get() == src_stmt) {
-          const auto* tgt_block = TVM_TYPE_AS(tgt_block, tgt_stmt_, BlockNode);
+          const auto* tgt_block = TVM_TYPE_AS(tgt_stmt_, BlockNode);
           ObjectPtr<BlockRealizeNode> new_realize = make_object<BlockRealizeNode>(*realize);
           new_realize->block = GetRef<Block>(tgt_block);
           new_stmt = BlockRealize(std::move(new_realize));
@@ -1044,9 +1044,9 @@ void ScheduleStateNode::Replace(const tir::StmtSRef& _src_sref, const Stmt& tgt_
     // If `g_func` was unique, after the 3 lines above:
     //   `ref_new_func` points to the same unique function that `g_func` points to
     // Update the body of the function the sref belongs to Assign
-    const auto* realize = TVM_TYPE_AS(realize, g_func->body, BlockRealizeNode);
+    const auto* realize = TVM_TYPE_AS(g_func->body, BlockRealizeNode);
     // Make `child_tgt_stmt` the root block
-    const auto* child_block = TVM_TYPE_AS(child_block, child_tgt_stmt, BlockNode);
+    const auto* child_block = TVM_TYPE_AS(child_tgt_stmt, BlockNode);
     ObjectPtr<BlockRealizeNode> new_realize = make_object<BlockRealizeNode>(*realize);
     new_realize->block = GetRef<Block>(child_block);
     new_func->body = BlockRealize(std::move(new_realize));
@@ -1078,7 +1078,7 @@ void ScheduleStateNode::DebugVerify() const {
 /**************** BlockInfo-related ****************/
 
 BlockInfo ScheduleStateNode::GetBlockInfo(const StmtSRef& block_sref) const {
-  const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref);
+  TVM_SREF_TO_BLOCK(block_sref);
   auto it = this->block_info.find(block_sref);
   CHECK(it != this->block_info.end())
       << "IndexError: Cannot find the corresponding BlockScope to the block sref:\n"
diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc
index 1c21d770db30..1ebaf202d487 100644
--- a/src/tir/schedule/transform.cc
+++ b/src/tir/schedule/transform.cc
@@ -36,7 +36,7 @@ Block WithAnnotation(const BlockNode* block, const String& attr_key, const Objec
 Buffer WithScope(const Buffer& buffer, const String& scope) {
   ObjectPtr<BufferNode> new_buffer = make_object<BufferNode>(*buffer.get());
   ObjectPtr<VarNode> new_var = make_object<VarNode>(*buffer->data.get());
-  const auto* ptr_type = TVM_TYPE_AS(ptr_type, buffer->data->type_annotation, PointerTypeNode);
+  const auto* ptr_type = TVM_TYPE_AS(buffer->data->type_annotation, PointerTypeNode);
   new_var->type_annotation = PointerType(ptr_type->element_type, scope);
   new_buffer->data = Var(new_var->name_hint + "_" + scope, new_var->type_annotation);
   new_buffer->name = buffer->name + "_" + scope;
@@ -253,8 +253,8 @@ void LeafBlockRemovalPlan(const ScheduleState& self, const StmtSRef& leaf_block_
     }
   }
   ICHECK(sref != nullptr && sref->stmt != nullptr);
-  const auto* leaf_block = TVM_SREF_TO_BLOCK(leaf_block, leaf_block_sref);
-  const auto* scope_block = TVM_SREF_TO_BLOCK(scope_block, sref);
+  const auto* leaf_block = TVM_SREF_TO_BLOCK(leaf_block_sref);
+  const auto* scope_block = TVM_SREF_TO_BLOCK(sref);
   throw OnlyLeafError(self->mod, GetRef<Block>(leaf_block), GetRef<Block>(scope_block));
 }
 
diff --git a/src/tir/schedule/utils.h b/src/tir/schedule/utils.h
index 3db80989ae10..c289309acc2d 100644
--- a/src/tir/schedule/utils.h
+++ b/src/tir/schedule/utils.h
@@ -62,25 +62,35 @@ namespace tir {
 
 /*!
  * \brief A helper macro to convert an sref to the block it points to,
- * throwing an internal error if downcasting fails
- * \param Result The result variable, used for checking
+ *
+ * Throws an internal error if downcasting fails.  The variable name
+ * in the parent scope is used for the error message.
+ *
  * \param SRef The SRef to be cast
  */
-#define TVM_SREF_TO_BLOCK(Result, SRef)                   \
-  TVM_SREF_AS_OR_ERR(Result, SRef, ::tvm::tir::BlockNode) \
-      << "TypeError: Expects StmtSRef `" << #SRef         \
-      << "` points to `Block`, but gets: " << (SRef->stmt ? SRef->stmt->GetTypeKey() : "None")
+#define TVM_SREF_TO_BLOCK(SRef)                                                                    \
+  [&]() {                                                                                          \
+    auto result = TVM_SREF_AS_OR_ERR(result, (SRef), ::tvm::tir::BlockNode)                        \
+                  << "TypeError: Expects StmtSRef `" << #SRef << "` points to `Block`, but gets: " \
+                  << ((SRef)->stmt ? (SRef)->stmt->GetTypeKey() : "None");                         \
+    return result;                                                                                 \
+  }()
 
 /*!
- * \brief A helper macro to convert an sref to the for-loop it points to,
- * throwing an internal error if downcasting fails
- * \param Result The name of the result variable, used for checking
+ * \brief A helper macro to convert an sref to the for-loop it points to
+ *
+ * Throws an internal error if downcasting fails.  The variable name
+ * in the parent scope is used for the error message.
+ *
  * \param SRef The SRef to be cast
  */
-#define TVM_SREF_TO_FOR(Result, SRef)                   \
-  TVM_SREF_AS_OR_ERR(Result, SRef, ::tvm::tir::ForNode) \
-      << "TypeError: Expects StmtSRef `" << #SRef       \
-      << "` points to `Loop`, but gets: " << (SRef->stmt ? SRef->stmt->GetTypeKey() : "None")
+#define TVM_SREF_TO_FOR(SRef)                                                                     \
+  [&]() {                                                                                         \
+    auto result = TVM_SREF_AS_OR_ERR(result, (SRef), ::tvm::tir::ForNode)                         \
+                  << "TypeError: Expects StmtSRef `" << #SRef << "` points to `Loop`, but gets: " \
+                  << ((SRef)->stmt ? (SRef)->stmt->GetTypeKey() : "None");                        \
+    return result;                                                                                \
+  }()
 
 /*!
  * \brief Downcast a TVM ObjectRef to its corresponding container using `ObjectRef::as<Type>`,
@@ -100,10 +110,13 @@ namespace tir {
  * \param From The ObjectRef to be downcast
  * \param Type The type to be downcast to
  */
-#define TVM_TYPE_AS(Result, From, Type)                                           \
-  TVM_TYPE_AS_OR_ERR(Result, From, Type)                                          \
-      << "TypeError: Expects `" << #From << "` to have type `" << Type::_type_key \
-      << "`, but gets: " << (From.defined() ? From->GetTypeKey() : "None")
+#define TVM_TYPE_AS(From, Type)                                                               \
+  [&]() {                                                                                     \
+    auto result = TVM_TYPE_AS_OR_ERR(result, (From), Type)                                    \
+                  << "TypeError: Expects `" << #From << "` to have type `" << Type::_type_key \
+                  << "`, but gets: " << ((From).defined() ? (From)->GetTypeKey() : "None");   \
+    return result;                                                                            \
+  }()
 
 /*!
  * \brief Convert an array of loop StmtSRefs to an array of loops
@@ -114,7 +127,7 @@ inline Array<For> LoopSRefs2Loops(const Array<StmtSRef>& loop_srefs) {
   Array<For> loops;
   loops.reserve(loop_srefs.size());
   for (StmtSRef loop_sref : loop_srefs) {
-    const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+    const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
     loops.push_back(GetRef<For>(loop));
   }
   return loops;
@@ -264,7 +277,7 @@ inline const int64_t* GetLoopIntExtent(const ForNode* loop) { return as_const_in
  * \return The extent of the loop, nullptr if the extent is not constant
  */
 inline const int64_t* GetLoopIntExtent(const StmtSRef& loop_sref) {
-  const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+  const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
   return as_const_int(loop->extent);
 }
 

From 2e83e03b2c57f1e65938d7da48a48296c781f7a1 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Sat, 27 Aug 2022 04:37:32 +0900
Subject: [PATCH 058/704] [CI] Update Hexagon image to install boost (#12613)

The new image has xgboost installed, which I need for https://github.com/apache/tvm/pull/12587

Validated in https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/ci-docker-staging/279/pipeline
---
 Jenkinsfile               | 4 ++--
 ci/jenkins/Jenkinsfile.j2 | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 8c1ce9ed5020..3278e83098b7 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-08-19T15:38:38.311410
+// Generated at 2022-08-26T15:09:39.104767
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -57,7 +57,7 @@ ci_wasm = 'tlcpack/ci-wasm:20220810-060142-fae79bbc3'
 ci_i386 = 'tlcpack/ci-i386:20220810-060142-fae79bbc3'
 ci_cortexm = 'tlcpack/ci-cortexm:20220810-060142-fae79bbc3'
 ci_arm = 'tlcpack/ci-arm:20220810-060142-fae79bbc3'
-ci_hexagon = 'tlcpack/ci-hexagon:20220810-060142-fae79bbc3'
+ci_hexagon = 'tlcpack/ci-hexagon:20220825-145056-fb7cf97f'
 ci_riscv = 'tlcpack/ci-riscv:20220810-060142-fae79bbc3'
 // <--- End of regex-scanned config.
 
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index be2776c6d9e3..c932431a44a1 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -59,7 +59,7 @@ ci_wasm = 'tlcpack/ci-wasm:20220810-060142-fae79bbc3'
 ci_i386 = 'tlcpack/ci-i386:20220810-060142-fae79bbc3'
 ci_cortexm = 'tlcpack/ci-cortexm:20220810-060142-fae79bbc3'
 ci_arm = 'tlcpack/ci-arm:20220810-060142-fae79bbc3'
-ci_hexagon = 'tlcpack/ci-hexagon:20220810-060142-fae79bbc3'
+ci_hexagon = 'tlcpack/ci-hexagon:20220825-145056-fb7cf97f'
 ci_riscv = 'tlcpack/ci-riscv:20220810-060142-fae79bbc3'
 // <--- End of regex-scanned config.
 

From 23e794422a66ccfca8d58435e341c2af58f505e2 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Fri, 26 Aug 2022 15:59:53 -0500
Subject: [PATCH 059/704] Replace '> >' in templates with >>, NFC (#12615)

The problem with greedy lexing of >> as an operator was solved in
C++11, and now templates no longer require spaces between >'s.
---
 docs/arch/convert_layout.rst                  | 10 +++---
 docs/arch/inferbound.rst                      |  4 +--
 .../how_to/relay_bring_your_own_codegen.rst   |  2 +-
 include/tvm/auto_scheduler/feature.h          |  8 ++---
 include/tvm/relay/attrs/image.h               | 14 ++++----
 include/tvm/runtime/module.h                  |  2 +-
 include/tvm/support/span.h                    |  2 +-
 include/tvm/te/operation.h                    |  2 +-
 include/tvm/topi/detail/extern.h              |  2 +-
 include/tvm/topi/transform.h                  |  2 +-
 .../native/org_apache_tvm_native_c_api.cc     |  4 +--
 src/arith/analyzer.cc                         |  2 +-
 src/autotvm/touch_extractor.cc                | 14 ++++----
 src/contrib/ethosu/cascader/propagator.cc     |  8 ++---
 src/contrib/ethosu/cascader/propagator.h      |  6 ++--
 src/ir/span.cc                                |  2 +-
 src/node/reflection.cc                        |  2 +-
 src/printer/meta_data.h                       |  2 +-
 src/relay/analysis/dependency_graph.cc        |  4 +--
 src/relay/ir/transform.cc                     |  2 +-
 src/relay/transforms/convert_sparse_dense.cc  |  8 ++---
 src/relay/transforms/fuse_ops.cc              |  2 +-
 src/relay/transforms/let_list.h               |  2 +-
 src/relay/transforms/partial_eval.cc          |  2 +-
 src/relay/transforms/type_infer.cc            |  4 +--
 src/runtime/contrib/ethosn/ethosn_device.cc   |  6 ++--
 src/runtime/graph_executor/graph_executor.cc  |  4 +--
 src/runtime/metal/metal_common.h              |  4 +--
 src/runtime/thread_pool.cc                    |  2 +-
 src/runtime/threading_backend.cc              |  2 +-
 src/runtime/vm/pooled_allocator.h             |  2 +-
 src/target/source/codegen_vhls.cc             |  2 +-
 src/te/operation/compute_op.cc                |  8 ++---
 src/te/operation/compute_op.h                 |  4 +--
 src/te/operation/tensor_compute_op.cc         | 13 ++++---
 src/te/operation/tensorize.cc                 | 29 ++++++++--------
 src/te/schedule/graph.h                       |  6 ++--
 src/te/schedule/schedule_dataflow_rewrite.cc  |  2 +-
 src/tir/ir/buffer.cc                          |  8 ++---
 src/tir/transforms/coproc_sync.cc             | 34 +++++++++----------
 src/tir/transforms/inject_double_buffer.cc    |  4 +--
 src/tir/transforms/inject_virtual_thread.cc   |  2 +-
 src/tir/transforms/ir_utils.h                 |  2 +-
 src/tir/transforms/make_packed_api.cc         |  6 ++--
 src/tir/transforms/storage_access.h           |  2 +-
 src/tir/transforms/storage_rewrite.cc         |  4 +--
 46 files changed, 128 insertions(+), 130 deletions(-)

diff --git a/docs/arch/convert_layout.rst b/docs/arch/convert_layout.rst
index 53038e9605e8..51917fce44df 100644
--- a/docs/arch/convert_layout.rst
+++ b/docs/arch/convert_layout.rst
@@ -150,10 +150,10 @@ First example is for layout agnostic operators. These operators do not have any
     // 		.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout);
 
     // Take arbitrary input layouts and copy to outputs.
-    inline Array<Array<Layout> > ElemwiseArbitraryLayout(const Attrs& attrs,
-                                                         const Array<Layout>& new_in_layouts,
-                                                         const Array<Layout>& old_in_layouts,
-                                                         const Array<Array<IndexExpr>> &old_in_shapes) {
+    inline Array<Array<Layout>> ElemwiseArbitraryLayout(const Attrs& attrs,
+                                                        const Array<Layout>& new_in_layouts,
+                                                        const Array<Layout>& old_in_layouts,
+                                                        const Array<Array<IndexExpr>> &old_in_shapes) {
       Layout ret;
 
       if (new_in_layouts.defined()) {
@@ -168,7 +168,7 @@ First example is for layout agnostic operators. These operators do not have any
         }
       }
 
-      return Array<Array<Layout> >{Array<Layout>(old_in_layouts.size(), ret), {ret}};
+      return Array<Array<Layout>>{Array<Layout>(old_in_layouts.size(), ret), {ret}};
     }
 
 
diff --git a/docs/arch/inferbound.rst b/docs/arch/inferbound.rst
index 9c78a9da7440..cc516359bdba 100644
--- a/docs/arch/inferbound.rst
+++ b/docs/arch/inferbound.rst
@@ -280,7 +280,7 @@ Phase 3: Propagate IntSets to consumer's input tensors
 
    /*
     * Input: Map<IterVar, IntSet> dom_map: consumer root -> IntSet
-    * Output: Map<Tensor, TensorDom> tmap: output tensor -> vector<vector<IntSet> >
+    * Output: Map<Tensor, TensorDom> tmap: output tensor -> vector<vector<IntSet>>
     */
 
 Note that the consumer's input tensors are output tensors of the stage InferBound is working on. So by establishing information about the consumer's input tensors, we actually obtain information about the stage's output tensors too: the consumers require certain regions of these tensors to be computed. This information can then be propagated through the rest of the stage, eventually obtaining Ranges for the stage's root_iter_vars by the end of Phase 4.
@@ -306,7 +306,7 @@ Phase 4: Consolidate across all consumers
 .. code:: cpp
 
    /*
-    * Input: Map<Tensor, TensorDom> tmap: output tensor -> vector<vector<IntSet> >
+    * Input: Map<Tensor, TensorDom> tmap: output tensor -> vector<vector<IntSet>>
     * Output: Map<IterVar, Range> rmap: rmap is populated for all of the stage's root_iter_vars
     */
 
diff --git a/docs/dev/how_to/relay_bring_your_own_codegen.rst b/docs/dev/how_to/relay_bring_your_own_codegen.rst
index 304bd016dec2..c106bb2a6372 100644
--- a/docs/dev/how_to/relay_bring_your_own_codegen.rst
+++ b/docs/dev/how_to/relay_bring_your_own_codegen.rst
@@ -676,7 +676,7 @@ Again, we first define a customized runtime class as follows. The class has to b
 	  /* \brief The subgraph that being processed. */
 	  std::string curr_subgraph_;
 	  /*! \brief A simple graph from subgraph id to node entries. */
-	  std::map<std::string, std::vector<NodeEntry> > graph_;
+	  std::map<std::string, std::vector<NodeEntry>> graph_;
 	  /* \brief A simple pool to contain the tensor for each node in the graph. */
 	  std::vector<NDArray> data_entry_;
 	  /* \brief A mapping from node id to op name. */
diff --git a/include/tvm/auto_scheduler/feature.h b/include/tvm/auto_scheduler/feature.h
index 71d00f249210..a8b88b7f11f9 100644
--- a/include/tvm/auto_scheduler/feature.h
+++ b/include/tvm/auto_scheduler/feature.h
@@ -70,7 +70,7 @@ void GetPerStoreFeatureName(int max_n_bufs, std::vector<std::string>* ret);
  */
 void GetPerStoreFeaturesFromStates(const Array<State>& states, const SearchTask& task,
                                    int skip_first_n_feature_extraction, int max_n_bufs,
-                                   std::vector<std::vector<float> >* features);
+                                   std::vector<std::vector<float>>* features);
 
 /*!
  * \brief Get per-store feature from states of different tasks
@@ -83,7 +83,7 @@ void GetPerStoreFeaturesFromStates(const Array<State>& states, const SearchTask&
  */
 void GetPerStoreFeaturesFromStates(const Array<State>& states, const std::vector<SearchTask>& tasks,
                                    int skip_first_n_feature_extraction, int max_n_bufs,
-                                   std::vector<std::vector<float> >* features);
+                                   std::vector<std::vector<float>>* features);
 
 /*!
  * \brief Get per-store features from a log file
@@ -96,7 +96,7 @@ void GetPerStoreFeaturesFromStates(const Array<State>& states, const std::vector
  * \param task_ids The task ids for all states
  */
 void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int max_n_bufs,
-                                 std::vector<std::vector<float> >* features,
+                                 std::vector<std::vector<float>>* features,
                                  std::vector<float>* normalized_throughputs,
                                  std::vector<int>* task_ids);
 
@@ -114,7 +114,7 @@ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int
 void GetPerStoreFeaturesFromMeasurePairs(const Array<MeasureInput>& inputs,
                                          const Array<MeasureResult>& results,
                                          int skip_first_n_feature_extraction, int max_n_bufs,
-                                         std::vector<std::vector<float> >* features,
+                                         std::vector<std::vector<float>>* features,
                                          std::vector<float>* normalized_throughputs,
                                          std::vector<int>* task_ids);
 
diff --git a/include/tvm/relay/attrs/image.h b/include/tvm/relay/attrs/image.h
index e0ee6dc748c2..43510ea68501 100644
--- a/include/tvm/relay/attrs/image.h
+++ b/include/tvm/relay/attrs/image.h
@@ -46,9 +46,9 @@ struct Resize1DAttrs : public tvm::AttrsNode<Resize1DAttrs> {
   DataType out_dtype;
 
   TVM_DECLARE_ATTRS(Resize1DAttrs, "relay.attrs.Resize1DAttrs") {
-    TVM_ATTR_FIELD(size).set_default(NullValue<Array<IndexExpr> >()).describe("Output Size.");
+    TVM_ATTR_FIELD(size).set_default(NullValue<Array<IndexExpr>>()).describe("Output Size.");
     TVM_ATTR_FIELD(roi)
-        .set_default(NullValue<Array<FloatImm> >())
+        .set_default(NullValue<Array<FloatImm>>())
         .describe("Region of Interest for coordinate transformation mode 'tf_crop_and_resize'");
     TVM_ATTR_FIELD(layout).set_default("NCW").describe(
         "Dimension ordering of input data. Can be 'NCW', 'NWC', etc."
@@ -99,9 +99,9 @@ struct Resize2DAttrs : public tvm::AttrsNode<Resize2DAttrs> {
   DataType out_dtype;
 
   TVM_DECLARE_ATTRS(Resize2DAttrs, "relay.attrs.Resize2DAttrs") {
-    TVM_ATTR_FIELD(size).set_default(NullValue<Array<IndexExpr> >()).describe("Output Size.");
+    TVM_ATTR_FIELD(size).set_default(NullValue<Array<IndexExpr>>()).describe("Output Size.");
     TVM_ATTR_FIELD(roi)
-        .set_default(NullValue<Array<FloatImm> >())
+        .set_default(NullValue<Array<FloatImm>>())
         .describe("Region of Interest for coordinate transformation mode 'tf_crop_and_resize'");
     TVM_ATTR_FIELD(layout).set_default("NCHW").describe(
         "Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
@@ -152,9 +152,9 @@ struct Resize3DAttrs : public tvm::AttrsNode<Resize3DAttrs> {
   DataType out_dtype;
 
   TVM_DECLARE_ATTRS(Resize3DAttrs, "relay.attrs.Resize3DAttrs") {
-    TVM_ATTR_FIELD(size).set_default(NullValue<Array<IndexExpr> >()).describe("Output Size.");
+    TVM_ATTR_FIELD(size).set_default(NullValue<Array<IndexExpr>>()).describe("Output Size.");
     TVM_ATTR_FIELD(roi)
-        .set_default(NullValue<Array<FloatImm> >())
+        .set_default(NullValue<Array<FloatImm>>())
         .describe("Region of Interest for coordinate transformation mode 'tf_crop_and_resize'");
     TVM_ATTR_FIELD(layout).set_default("NCDHW").describe(
         "Dimension ordering of input data. Can be 'NCDHW', 'NDHWC', etc."
@@ -200,7 +200,7 @@ struct CropAndResizeAttrs : public tvm::AttrsNode<CropAndResizeAttrs> {
   DataType out_dtype;
 
   TVM_DECLARE_ATTRS(CropAndResizeAttrs, "relay.attrs.CropAndResizeAttrs") {
-    TVM_ATTR_FIELD(crop_size).set_default(NullValue<Array<IndexExpr> >()).describe("Target Size.");
+    TVM_ATTR_FIELD(crop_size).set_default(NullValue<Array<IndexExpr>>()).describe("Target Size.");
     TVM_ATTR_FIELD(layout).set_default("NCHW").describe(
         "Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
         "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h
index 9d139c9feff3..a54f98a558f3 100644
--- a/include/tvm/runtime/module.h
+++ b/include/tvm/runtime/module.h
@@ -234,7 +234,7 @@ class TVM_DLL ModuleNode : public Object {
 
  private:
   /*! \brief Cache used by GetImport */
-  std::unordered_map<std::string, std::shared_ptr<PackedFunc> > import_cache_;
+  std::unordered_map<std::string, std::shared_ptr<PackedFunc>> import_cache_;
   std::mutex mutex_;
 };
 
diff --git a/include/tvm/support/span.h b/include/tvm/support/span.h
index 603fb531f43a..689a48dee788 100644
--- a/include/tvm/support/span.h
+++ b/include/tvm/support/span.h
@@ -68,7 +68,7 @@ class Span {
 
     inline bool operator!=(iterator_base<W1> other) { return !(*this == other); }
 
-    template <class X = W1, typename = std::enable_if_t<!std::is_const<X>::value> >
+    template <class X = W1, typename = std::enable_if_t<!std::is_const<X>::value>>
     inline operator iterator_base<const_W>() const {
       return iterator_base<const_W>(ptr_, end_);
     }
diff --git a/include/tvm/te/operation.h b/include/tvm/te/operation.h
index e91a0930f37b..2c50f3c3157b 100644
--- a/include/tvm/te/operation.h
+++ b/include/tvm/te/operation.h
@@ -47,7 +47,7 @@ struct TensorDom {
   // constructor
   explicit TensorDom(int ndim) : data(ndim) {}
   /*! \brief The domain data */
-  std::vector<std::vector<IntSet> > data;
+  std::vector<std::vector<IntSet>> data;
 };
 
 /*!
diff --git a/include/tvm/topi/detail/extern.h b/include/tvm/topi/detail/extern.h
index 2561f8d1ca27..dee4bf70a729 100644
--- a/include/tvm/topi/detail/extern.h
+++ b/include/tvm/topi/detail/extern.h
@@ -75,7 +75,7 @@ using FExtern = std::function<PrimExpr(Array<Buffer>, Array<Buffer>)>;
  * be one output Tensor for each element of out_shapes, with dtype equal to the corresponding
  * element of out_types.
  */
-inline Array<Tensor> make_extern(const Array<Array<PrimExpr> >& out_shapes,
+inline Array<Tensor> make_extern(const Array<Array<PrimExpr>>& out_shapes,
                                  const std::vector<DataType>& out_types,
                                  const Array<Tensor>& inputs, FExtern fextern, std::string name,
                                  std::string tag, ::tvm::Map<String, ObjectRef> attrs) {
diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index 7accbf86912d..4c96ed42f6e9 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -592,7 +592,7 @@ inline Array<Tensor> split(const Tensor& x, Array<PrimExpr> split_indices, int a
     begin_ids.push_back(idx);
   }
 
-  Array<Array<PrimExpr> > out_shapes;
+  Array<Array<PrimExpr>> out_shapes;
   for (size_t i = 0; i < begin_ids.size(); ++i) {
     PrimExpr out_axis_size;
     if (i == begin_ids.size() - 1) {
diff --git a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc
index f7be0cf80eb0..f86191d45bbc 100644
--- a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc
+++ b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc
@@ -42,8 +42,8 @@ struct TVMFuncArgsThreadLocalEntry {
   std::vector<TVMValue> tvmFuncArgValues;
   std::vector<int> tvmFuncArgTypes;
   // for later release
-  std::vector<std::pair<jstring, const char*> > tvmFuncArgPushedStrs;
-  std::vector<std::pair<jbyteArray, TVMByteArray*> > tvmFuncArgPushedBytes;
+  std::vector<std::pair<jstring, const char*>> tvmFuncArgPushedStrs;
+  std::vector<std::pair<jbyteArray, TVMByteArray*>> tvmFuncArgPushedBytes;
 };
 typedef dmlc::ThreadLocalStore<TVMFuncArgsThreadLocalEntry> TVMFuncArgsThreadLocalStore;
 
diff --git a/src/arith/analyzer.cc b/src/arith/analyzer.cc
index f32c9b2ff4cf..ad52a6578b24 100644
--- a/src/arith/analyzer.cc
+++ b/src/arith/analyzer.cc
@@ -186,7 +186,7 @@ TVM_REGISTER_GLOBAL("arith.CreateAnalyzer").set_body([](TVMArgs args, TVMRetValu
       return PackedFunc([self](TVMArgs args, TVMRetValue* ret) {
         // can't use make_shared due to noexcept(false) decl in destructor,
         // see https://stackoverflow.com/a/43907314
-        auto ctx = std::shared_ptr<With<ConstraintContext> >(
+        auto ctx = std::shared_ptr<With<ConstraintContext>>(
             new With<ConstraintContext>(self.get(), args[0]));
         auto fexit = [ctx](TVMArgs, TVMRetValue*) mutable { ctx.reset(); };
         *ret = PackedFunc(fexit);
diff --git a/src/autotvm/touch_extractor.cc b/src/autotvm/touch_extractor.cc
index 10ead718bae2..dd3cf88f7bf6 100644
--- a/src/autotvm/touch_extractor.cc
+++ b/src/autotvm/touch_extractor.cc
@@ -220,7 +220,7 @@ void TouchExtractor::ExitMem_() {}
  * \note If you want to flatten these features as the input of your model,
  * You can use the faster one GetItervarFeatureFlatten below.
  */
-void GetItervarFeature(Stmt stmt, bool take_log, Array<Array<Array<PrimExpr> > >* ret_feature) {
+void GetItervarFeature(Stmt stmt, bool take_log, Array<Array<Array<PrimExpr>>>* ret_feature) {
   // extract
   TouchExtractor touch_analyzer;
   touch_analyzer.Analyze(stmt);
@@ -248,7 +248,7 @@ void GetItervarFeature(Stmt stmt, bool take_log, Array<Array<Array<PrimExpr> > >
 
   // serialize for front end
   for (auto var : vars) {
-    Array<Array<PrimExpr> > feature_row;
+    Array<Array<PrimExpr>> feature_row;
     ItervarFeature& fea = touch_analyzer.itervar_map[var];
     feature_row.push_back(Array<PrimExpr>{tvm::tir::StringImm("_itervar_"), var});
 
@@ -389,10 +389,10 @@ void GetCurveSampleFeatureFlatten(Stmt stmt, int sample_n, std::vector<float>* r
   });
 
   int max_depth = 0;
-  std::map<TouchedBuffer, std::vector<double> > reuse_curve;
-  std::map<TouchedBuffer, std::vector<double> > count_curve;
-  std::map<TouchedBuffer, std::vector<double> > topdown_curve;
-  std::map<TouchedBuffer, std::vector<double> > bottomup_curve;
+  std::map<TouchedBuffer, std::vector<double>> reuse_curve;
+  std::map<TouchedBuffer, std::vector<double>> count_curve;
+  std::map<TouchedBuffer, std::vector<double>> topdown_curve;
+  std::map<TouchedBuffer, std::vector<double>> bottomup_curve;
   std::set<TouchedBuffer> innermost_buffers;
   std::set<std::string> added;
 
@@ -485,7 +485,7 @@ TVM_REGISTER_GLOBAL("autotvm.feature.GetItervarFeature")
     .set_body([](TVMArgs args, TVMRetValue* ret) {
       Stmt stmt = args[0];
       bool take_log = args[1];
-      Array<Array<Array<PrimExpr> > > ret_feature;
+      Array<Array<Array<PrimExpr>>> ret_feature;
 
       GetItervarFeature(stmt, take_log, &ret_feature);
 
diff --git a/src/contrib/ethosu/cascader/propagator.cc b/src/contrib/ethosu/cascader/propagator.cc
index 25b711a53d05..ca8aaf6e27d5 100644
--- a/src/contrib/ethosu/cascader/propagator.cc
+++ b/src/contrib/ethosu/cascader/propagator.cc
@@ -34,7 +34,7 @@ namespace ethosu {
 namespace cascader {
 
 void PropagatorNode::VisitAttrs(AttrVisitor* v) {
-  Array<Array<FloatImm> > tmp_transform;
+  Array<Array<FloatImm>> tmp_transform;
   for (const auto& vec : transform_) {
     tmp_transform.push_back(make_array(vec));
   }
@@ -43,7 +43,7 @@ void PropagatorNode::VisitAttrs(AttrVisitor* v) {
   v->Visit("_offset", &tmp_arr);
 }
 
-Propagator::Propagator(const std::vector<std::vector<float> >& transform,
+Propagator::Propagator(const std::vector<std::vector<float>>& transform,
                        const std::vector<int>& offset) {
   auto n = make_object<PropagatorNode>();
   size_t rows = transform.size();
@@ -102,8 +102,8 @@ StripeConfig PropagatorNode::propagate(const StripeConfig& stripe_config) const
 }
 
 TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.Propagator")
-    .set_body_typed([](Array<Array<FloatImm> > transform, Array<Integer> offset) {
-      std::vector<std::vector<float> > vtransform;
+    .set_body_typed([](Array<Array<FloatImm>> transform, Array<Integer> offset) {
+      std::vector<std::vector<float>> vtransform;
       for (const auto& vec : transform) {
         vtransform.push_back(make_vector<float, FloatImm>(vec));
       }
diff --git a/src/contrib/ethosu/cascader/propagator.h b/src/contrib/ethosu/cascader/propagator.h
index 2d4bd0d0154a..3946d0806a0c 100644
--- a/src/contrib/ethosu/cascader/propagator.h
+++ b/src/contrib/ethosu/cascader/propagator.h
@@ -43,7 +43,7 @@ class PropagatorNode : public Object {
   void VisitAttrs(AttrVisitor* v);
 
   /*! \return The transform matrix to apply to the StripeConfigs */
-  const std::vector<std::vector<float> > GetTransform() const { return transform_; }
+  const std::vector<std::vector<float>> GetTransform() const { return transform_; }
   /*! \return The offset vector to apply to the StripeConfigs */
   const std::vector<int> GetOffset() const { return offset_; }
   /*! \return The number of input dimensions */
@@ -92,7 +92,7 @@ class PropagatorNode : public Object {
   friend class Propagator;
 
   /*! \brief The transform matrix to apply to the StripeConfigs */
-  std::vector<std::vector<float> > transform_;
+  std::vector<std::vector<float>> transform_;
   /*! \brief The offset vector to apply to the StripeConfigs */
   std::vector<int> offset_;
 };
@@ -124,7 +124,7 @@ class PropagatorNode : public Object {
  */
 class Propagator : public ObjectRef {
  public:
-  Propagator(const std::vector<std::vector<float> >& transform, const std::vector<int>& offset);
+  Propagator(const std::vector<std::vector<float>>& transform, const std::vector<int>& offset);
 
   TVM_DEFINE_OBJECT_REF_METHODS(Propagator, ObjectRef, PropagatorNode);
 };
diff --git a/src/ir/span.cc b/src/ir/span.cc
index 4a26f3a6eb11..e19bef4cb864 100644
--- a/src/ir/span.cc
+++ b/src/ir/span.cc
@@ -30,7 +30,7 @@ namespace tvm {
 ObjectPtr<Object> GetSourceNameNode(const String& name) {
   // always return pointer as the reference can change as map re-allocate.
   // or use another level of indirection by creating a unique_ptr
-  static std::unordered_map<String, ObjectPtr<SourceNameNode> > source_map;
+  static std::unordered_map<String, ObjectPtr<SourceNameNode>> source_map;
 
   auto sn = source_map.find(name);
   if (sn == source_map.end()) {
diff --git a/src/node/reflection.cc b/src/node/reflection.cc
index a0f83f6cf5ad..aa572e99658c 100644
--- a/src/node/reflection.cc
+++ b/src/node/reflection.cc
@@ -254,7 +254,7 @@ void NodeListAttrNames(TVMArgs args, TVMRetValue* ret) {
   Object* self = static_cast<Object*>(args[0].value().v_handle);
 
   auto names =
-      std::make_shared<std::vector<std::string> >(ReflectionVTable::Global()->ListAttrNames(self));
+      std::make_shared<std::vector<std::string>>(ReflectionVTable::Global()->ListAttrNames(self));
 
   *ret = PackedFunc([names](TVMArgs args, TVMRetValue* rv) {
     int64_t i = args[0];
diff --git a/src/printer/meta_data.h b/src/printer/meta_data.h
index b076ad07caaf..ddf0d78087ee 100644
--- a/src/printer/meta_data.h
+++ b/src/printer/meta_data.h
@@ -136,7 +136,7 @@ class TextMetaDataContext {
 
  private:
   /*! \brief additional metadata stored in TVM json format */
-  std::unordered_map<String, Array<ObjectRef> > meta_data_;
+  std::unordered_map<String, Array<ObjectRef>> meta_data_;
   /*! \brief map from meta data into its string representation */
   std::unordered_map<ObjectRef, Doc, ObjectPtrHash, ObjectPtrEqual> meta_repr_;
 };
diff --git a/src/relay/analysis/dependency_graph.cc b/src/relay/analysis/dependency_graph.cc
index 18913ca37562..91711fa4baa8 100644
--- a/src/relay/analysis/dependency_graph.cc
+++ b/src/relay/analysis/dependency_graph.cc
@@ -56,11 +56,11 @@ class DependencyGraph::Creator : private MixedModeVisitor {
   }
 
   void Depend(DependencyGraph::Node* parent, DependencyGraph::Node* child) {
-    auto* parent_link = arena_->make<LinkNode<DependencyGraph::Node*> >();
+    auto* parent_link = arena_->make<LinkNode<DependencyGraph::Node*>>();
     parent_link->value = parent;
     child->parents.Push(parent_link);
 
-    auto* child_link = arena_->make<LinkNode<DependencyGraph::Node*> >();
+    auto* child_link = arena_->make<LinkNode<DependencyGraph::Node*>>();
     child_link->value = child;
     parent->children.Push(child_link);
   }
diff --git a/src/relay/ir/transform.cc b/src/relay/ir/transform.cc
index 1a16cc9becf1..fc1f3a15077e 100644
--- a/src/relay/ir/transform.cc
+++ b/src/relay/ir/transform.cc
@@ -126,7 +126,7 @@ IRModule FunctionPassNode::operator()(IRModule mod, const PassContext& pass_ctx)
 
   IRModule updated_mod = mod->ShallowCopy();
 
-  std::vector<std::pair<GlobalVar, Function> > updates;
+  std::vector<std::pair<GlobalVar, Function>> updates;
   for (const auto& kv : mod->functions) {
     // only process optimizable Relay Functions
     if (const auto* function_node = AsOptimizableFunctionNode(kv.second)) {
diff --git a/src/relay/transforms/convert_sparse_dense.cc b/src/relay/transforms/convert_sparse_dense.cc
index faba366eca49..7053f1301cca 100644
--- a/src/relay/transforms/convert_sparse_dense.cc
+++ b/src/relay/transforms/convert_sparse_dense.cc
@@ -73,7 +73,7 @@ TVM_REGISTER_GLOBAL("relay.analysis.search_dense_op_weight").set_body_typed(Sear
 class DenseToSparseDenseMutator : public ExprRewriter {
  public:
   DenseToSparseDenseMutator(const Array<ObjectRef>& weight_name,
-                            const Array<Array<PrimExpr> >& weight_shape)
+                            const Array<Array<PrimExpr>>& weight_shape)
       : dense_op_(Op::Get("nn.dense")), sparse_dense_op_(Op::Get("nn.sparse_dense")) {
     ICHECK_EQ(weight_name.size(), weight_shape.size());
     for (size_t i = 0; i < weight_name.size(); ++i) {
@@ -117,11 +117,11 @@ class DenseToSparseDenseMutator : public ExprRewriter {
   // Cached op
   const Op& dense_op_;
   const Op& sparse_dense_op_;
-  std::unordered_map<std::string, std::vector<int> > target_weights_;
+  std::unordered_map<std::string, std::vector<int>> target_weights_;
 };  // class DenseToSparseDenseAlter
 
 Expr DenseToSparse(const Expr& e, const Array<ObjectRef>& weight_name,
-                   const Array<Array<PrimExpr> >& weight_shape) {
+                   const Array<Array<PrimExpr>>& weight_shape) {
   auto rewriter = DenseToSparseDenseMutator(weight_name, weight_shape);
   return PostOrderRewrite(e, &rewriter);
 }
@@ -129,7 +129,7 @@ Expr DenseToSparse(const Expr& e, const Array<ObjectRef>& weight_name,
 namespace transform {
 
 Pass DenseToSparse(const Array<ObjectRef>& weight_name,
-                   const Array<Array<PrimExpr> >& weight_shape) {
+                   const Array<Array<PrimExpr>>& weight_shape) {
   runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
       [=](Function f, IRModule m, PassContext pc) {
         // Remove FreeVar warnings
diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc
index 1ced0883a14c..dac5dc69ead5 100644
--- a/src/relay/transforms/fuse_ops.cc
+++ b/src/relay/transforms/fuse_ops.cc
@@ -180,7 +180,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor {
       graph_.node_map[key] = current;
     }
     if (parent != nullptr) {
-      auto* link = arena_->make<LinkNode<IndexedForwardGraph::Edge> >();
+      auto* link = arena_->make<LinkNode<IndexedForwardGraph::Edge>>();
       link->value.node = parent;
       link->value.pattern = pattern;
       current->outputs.Push(link);
diff --git a/src/relay/transforms/let_list.h b/src/relay/transforms/let_list.h
index f449d6c3b011..f908fbcee514 100644
--- a/src/relay/transforms/let_list.h
+++ b/src/relay/transforms/let_list.h
@@ -145,7 +145,7 @@ class LetList {
   }
 
  private:
-  std::vector<std::pair<Var, Expr> > lets_;
+  std::vector<std::pair<Var, Expr>> lets_;
   bool used_ = false;
 };
 
diff --git a/src/relay/transforms/partial_eval.cc b/src/relay/transforms/partial_eval.cc
index fc9922ca03ef..f791192e25c1 100644
--- a/src/relay/transforms/partial_eval.cc
+++ b/src/relay/transforms/partial_eval.cc
@@ -772,7 +772,7 @@ class PartialEvaluator : public ExprFunctor<PStatic(const Expr& e, LetList* ll)>
     if (func->HasNonzeroAttr(attr::kPrimitive)) {
       return ConstEvaluateFunc(func);
     }
-    std::vector<std::pair<Var, PStatic> > free_vars;
+    std::vector<std::pair<Var, PStatic>> free_vars;
     for (const auto& v : FreeVars(func)) {
       if (v != var) {
         free_vars.push_back(std::pair<Var, PStatic>(v, env_.Lookup(v)));
diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc
index 9c01c40517f4..d2eb48073f7d 100644
--- a/src/relay/transforms/type_infer.cc
+++ b/src/relay/transforms/type_infer.cc
@@ -829,7 +829,7 @@ void EnsureCheckedType(const Expr& e) { AllCheckTypePopulated().VisitExpr(e); }
 
 // TODO(@jroesch): Can we optimize this?
 void AddGlobalTypes(IRModule mod) {
-  std::vector<std::pair<GlobalVar, Function> > updates;
+  std::vector<std::pair<GlobalVar, Function>> updates;
   for (const auto& it : mod->functions) {
     // Currently we don't type check TIR.
     // The inferencer will only check Relay functions
@@ -961,7 +961,7 @@ Pass InferType() {
         // Add all the type annotations to the functions in the model.
         AddGlobalTypes(mod);
 
-        std::vector<std::pair<GlobalVar, Function> > updates;
+        std::vector<std::pair<GlobalVar, Function>> updates;
         for (const auto& it : updated_mod->functions) {
           // Currently we don't type check TIR.
           //
diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc
index 628f99788d16..900ae65afcc3 100644
--- a/src/runtime/contrib/ethosn/ethosn_device.cc
+++ b/src/runtime/contrib/ethosn/ethosn_device.cc
@@ -87,7 +87,7 @@ void CopyOutput(dl::Buffer* source_buffers[], std::vector<DLTensor*>* outputs) {
   }
 }
 
-void CreateBuffers(std::vector<std::shared_ptr<dl::Buffer> >* fm,
+void CreateBuffers(std::vector<std::shared_ptr<dl::Buffer>>* fm,
                    const std::vector<DLTensor*>& tensors, const std::vector<uint32_t>& tensor_sizes,
                    bool input) {
   for (size_t i = 0; i < tensors.size(); i++) {
@@ -118,11 +118,11 @@ bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
   }
 
   // Set up input buffers
-  std::vector<std::shared_ptr<dl::Buffer> > ifm(inputs.size());
+  std::vector<std::shared_ptr<dl::Buffer>> ifm(inputs.size());
   CreateBuffers(&ifm, inputs, input_sizes, true);
 
   // Set up output buffers
-  std::vector<std::shared_ptr<dl::Buffer> > ofm(outputs.size());
+  std::vector<std::shared_ptr<dl::Buffer>> ofm(outputs.size());
   CreateBuffers(&ofm, outputs, output_sizes, false);
 
   // Raw pointers for the inference
diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc
index 78e65f6f2319..e3113dbfe54c 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -519,8 +519,8 @@ void GraphExecutor::SetupOpExecs() {
   }
 }
 
-std::pair<std::function<void()>, std::shared_ptr<GraphExecutor::OpArgs> >
-GraphExecutor::CreateTVMOp(const TVMOpParam& param, const std::vector<DLTensor>& args) {
+std::pair<std::function<void()>, std::shared_ptr<GraphExecutor::OpArgs>> GraphExecutor::CreateTVMOp(
+    const TVMOpParam& param, const std::vector<DLTensor>& args) {
   std::shared_ptr<GraphExecutor::OpArgs> arg_ptr = std::make_shared<GraphExecutor::OpArgs>();
   // setup address.
   arg_ptr->args = args;
diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h
index 47a5999fdce9..dad156bcdddc 100644
--- a/src/runtime/metal/metal_common.h
+++ b/src/runtime/metal/metal_common.h
@@ -133,7 +133,7 @@ class Stream {
 class MetalWorkspace final : public DeviceAPI {
  public:
   // the devices
-  std::vector<id<MTLDevice> > devices;
+  std::vector<id<MTLDevice>> devices;
   // Warp size constant
   std::vector<int> warp_size;
   // Whether it is initialized.
@@ -186,7 +186,7 @@ class MetalThreadEntry {
   /*! \brief The current stream */
   std::vector<Stream*> stream;
   /*! \brief The shared buffer used for copy. */
-  std::vector<id<MTLBuffer> > temp_buffer_;
+  std::vector<id<MTLBuffer>> temp_buffer_;
   /*! \brief workspace pool */
   WorkspacePool pool;
   // constructor
diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc
index 7744174ec866..665244d3d1bd 100644
--- a/src/runtime/thread_pool.cc
+++ b/src/runtime/thread_pool.cc
@@ -369,7 +369,7 @@ class ThreadPool {
   int num_workers_used_;
   // if or not to exclude worker 0 and use main to run task 0
   bool exclude_worker0_{true};
-  std::vector<std::unique_ptr<SpscTaskQueue> > queues_;
+  std::vector<std::unique_ptr<SpscTaskQueue>> queues_;
   std::unique_ptr<tvm::runtime::threading::ThreadGroup> threads_;
 };
 
diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
index 14b5f27dd495..ef1aa69f6455 100644
--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -285,7 +285,7 @@ class ThreadGroup::Impl {
     // is not supported in earlier versions of QuRT. In such cases assume 4.
     if (threads == 0) threads = 4;
 #endif
-    std::vector<std::pair<unsigned int, int64_t> > max_freqs;
+    std::vector<std::pair<unsigned int, int64_t>> max_freqs;
 
     for (unsigned int i = 0; i < threads; ++i) {
       int64_t cur_freq = 0;
diff --git a/src/runtime/vm/pooled_allocator.h b/src/runtime/vm/pooled_allocator.h
index e5f236983a73..9c11c783011e 100644
--- a/src/runtime/vm/pooled_allocator.h
+++ b/src/runtime/vm/pooled_allocator.h
@@ -99,7 +99,7 @@ class PooledAllocator final : public Allocator {
  private:
   size_t page_size_;
   std::atomic<size_t> used_memory_;
-  std::unordered_map<size_t, std::vector<Buffer> > memory_pool_;
+  std::unordered_map<size_t, std::vector<Buffer>> memory_pool_;
   std::recursive_mutex mu_;
   Device device_;
 };
diff --git a/src/target/source/codegen_vhls.cc b/src/target/source/codegen_vhls.cc
index 9896d8b833f9..4091b64f4524 100644
--- a/src/target/source/codegen_vhls.cc
+++ b/src/target/source/codegen_vhls.cc
@@ -157,7 +157,7 @@ runtime::Module BuildSDAccel(IRModule mod, Target target) {
   std::string whole_code = cg.Finish();
 
   // Generate source code for compilation.
-  Array<Array<runtime::String> > kernel_info;
+  Array<Array<runtime::String>> kernel_info;
 
   for (auto kv : mod->functions) {
     ICHECK(kv.second->IsInstance<PrimFuncNode>()) << "CodeGenOpenCL: Can only take PrimFunc";
diff --git a/src/te/operation/compute_op.cc b/src/te/operation/compute_op.cc
index c3062045939a..7f8facad5568 100644
--- a/src/te/operation/compute_op.cc
+++ b/src/te/operation/compute_op.cc
@@ -357,10 +357,10 @@ Stmt MakeComputeStmt(const ComputeOpNode* self, const Stage& stage,
     init = MergeNest(n.init_nest, init);
     init = Substitute(init, n.init_vmap);
     // common nest
-    std::vector<std::vector<Stmt> > common(n.main_nest.begin(),
-                                           n.main_nest.begin() + n.num_common_loop + 1);
-    std::vector<std::vector<Stmt> > reduce(n.main_nest.begin() + n.num_common_loop + 1,
-                                           n.main_nest.end());
+    std::vector<std::vector<Stmt>> common(n.main_nest.begin(),
+                                          n.main_nest.begin() + n.num_common_loop + 1);
+    std::vector<std::vector<Stmt>> reduce(n.main_nest.begin() + n.num_common_loop + 1,
+                                          n.main_nest.end());
     provide = MergeNest(reduce, provide);
     if (debug_keep_trivial_loop) {
       provide = MergeNest(common, provide);
diff --git a/src/te/operation/compute_op.h b/src/te/operation/compute_op.h
index 2661eb976f2e..944334a41fdb 100644
--- a/src/te/operation/compute_op.h
+++ b/src/te/operation/compute_op.h
@@ -41,13 +41,13 @@ struct ComputeLoopNest {
   // predicates for the initialize loop
   std::vector<PrimExpr> init_predicates;
   // Initialization nest involved.
-  std::vector<std::vector<Stmt> > init_nest;
+  std::vector<std::vector<Stmt>> init_nest;
   // Value map for the init code
   std::unordered_map<IterVar, PrimExpr> init_vmap;
   // Predicates for the main update loop
   std::vector<PrimExpr> main_predicates;
   // The general loop nest
-  std::vector<std::vector<Stmt> > main_nest;
+  std::vector<std::vector<Stmt>> main_nest;
   // Value map for the IterVar.
   std::unordered_map<IterVar, PrimExpr> main_vmap;
 
diff --git a/src/te/operation/tensor_compute_op.cc b/src/te/operation/tensor_compute_op.cc
index 262e5a2b97f4..00f751c58a09 100644
--- a/src/te/operation/tensor_compute_op.cc
+++ b/src/te/operation/tensor_compute_op.cc
@@ -202,7 +202,7 @@ Stmt TensorComputeOpNode::BuildProvide(const Stage& stage,
   ComputeLoopNest n = ComputeLoopNest::Create(this, stage, dom_map, debug_keep_trivial_loop);
 
   if (this->reduce_axis.size() == 0) {
-    std::vector<std::vector<Stmt> > nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1);
+    std::vector<std::vector<Stmt>> nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1);
     nest.emplace_back(MakeIfNest(n.main_predicates));
     ICHECK_EQ(n.init_predicates.size(), 0U);
     ICHECK(this->intrin->body.defined())
@@ -219,16 +219,15 @@ Stmt TensorComputeOpNode::BuildProvide(const Stage& stage,
     ICHECK(this->intrin->reduce_update.defined()) << "Reduction update op is not defined";
     // Need init and update steps
     ICHECK_NE(this->reduce_axis.size(), 0U);
-    std::vector<std::vector<Stmt> > common(n.main_nest.begin(),
-                                           n.main_nest.begin() + n.num_common_loop + 1);
-    std::vector<std::vector<Stmt> > update_nest(n.main_nest.begin() + n.num_common_loop + 1,
-                                                n.main_nest.begin() + tloc + 1);
+    std::vector<std::vector<Stmt>> common(n.main_nest.begin(),
+                                          n.main_nest.begin() + n.num_common_loop + 1);
+    std::vector<std::vector<Stmt>> update_nest(n.main_nest.begin() + n.num_common_loop + 1,
+                                               n.main_nest.begin() + tloc + 1);
     update_nest.emplace_back(MakeIfNest(n.main_predicates));
 
     if (this->intrin->reduce_init.defined()) {
       // init nest
-      std::vector<std::vector<Stmt> > init_nest(n.init_nest.begin(),
-                                                n.init_nest.begin() + tloc + 1);
+      std::vector<std::vector<Stmt>> init_nest(n.init_nest.begin(), n.init_nest.begin() + tloc + 1);
       init_nest.emplace_back(MakeIfNest(n.init_predicates));
       Stmt init = MergeNest(output_bind_nest, this->intrin->reduce_init);
       init = te::Substitute(init, n.init_vmap);
diff --git a/src/te/operation/tensorize.cc b/src/te/operation/tensorize.cc
index b31b61b739c1..138aeeb37f19 100644
--- a/src/te/operation/tensorize.cc
+++ b/src/te/operation/tensorize.cc
@@ -42,7 +42,7 @@ using namespace tir;
 size_t InferTensorizeRegion(const ComputeOpNode* self, const Stage& stage,
                             const std::unordered_map<IterVar, Range>& dom_map,
                             std::unordered_map<IterVar, Range>* out_dom,
-                            std::unordered_map<Tensor, Array<Range> >* in_region) {
+                            std::unordered_map<Tensor, Array<Range>>* in_region) {
   // Get the bound of the tensorized scope.
   bool found_point = false;
   size_t loc_scope = 0;
@@ -198,7 +198,7 @@ class TensorIntrinMatcher final : public StmtExprMutator {
   void Init(const ComputeOpNode* self, const Stage& stage,
             const std::unordered_map<IterVar, Range>& dom_map,
             const std::unordered_map<IterVar, Range>& out_dom,
-            const std::unordered_map<Tensor, Array<Range> >& in_region, const TensorIntrin& intrin,
+            const std::unordered_map<Tensor, Array<Range>>& in_region, const TensorIntrin& intrin,
             Map<Var, Range>* compute_intrin_iter_space) {
     ICHECK(self == stage->op.get());
 
@@ -298,7 +298,7 @@ class TensorIntrinMatcher final : public StmtExprMutator {
 Array<PrimExpr> MatchTensorizeBody(const ComputeOpNode* self, const Stage& stage,
                                    const std::unordered_map<IterVar, Range>& dom_map,
                                    const std::unordered_map<IterVar, Range>& out_dom,
-                                   const std::unordered_map<Tensor, Array<Range> >& in_region,
+                                   const std::unordered_map<Tensor, Array<Range>>& in_region,
                                    const TensorIntrin& intrin,
                                    Map<Var, Range>* compute_intrin_iter_space) {
   TensorIntrinMatcher matcher;
@@ -314,7 +314,7 @@ void VerifyTensorizeBody(const ComputeOpNode* self, const Stage& stage,
                          const std::unordered_map<IterVar, PrimExpr>& value_map,
                          const std::unordered_map<IterVar, Range>& dom_map,
                          const std::unordered_map<IterVar, Range>& out_dom,
-                         const std::unordered_map<Tensor, Array<Range> >& in_region,
+                         const std::unordered_map<Tensor, Array<Range>>& in_region,
                          const TensorIntrin& intrin) {
   StructuralEqual expr_equal;
   Map<Var, Range> compute_intrin_iter_space;
@@ -346,7 +346,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage,
                    const std::unordered_map<IterVar, Range>& dom_map,
                    bool debug_keep_trivial_loop) {
   std::unordered_map<IterVar, Range> out_dom;
-  std::unordered_map<Tensor, Array<Range> > in_region;
+  std::unordered_map<Tensor, Array<Range>> in_region;
   size_t tloc = InferTensorizeRegion(self, stage, dom_map, &out_dom, &in_region);
   TensorIntrin intrin = stage->iter_var_attrs.at(stage->leaf_iter_vars[tloc])->tensor_intrin;
   ICHECK(intrin.defined());
@@ -418,7 +418,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage,
   }
   if (tloc <= n.num_common_loop) {
     // Do no need to split reduction
-    std::vector<std::vector<Stmt> > nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1);
+    std::vector<std::vector<Stmt>> nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1);
     nest.emplace_back(MakeIfNest(n.main_predicates));
     ICHECK_EQ(n.init_predicates.size(), 0U);
     ICHECK(intrin->body.defined()) << "Normal store op for intrin " << intrin << " is not defined";
@@ -434,16 +434,15 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage,
         << "Reduction update op for intrin " << intrin << " is not defined";
     // Need init and update steps
     ICHECK_NE(self->reduce_axis.size(), 0U);
-    std::vector<std::vector<Stmt> > common(n.main_nest.begin(),
-                                           n.main_nest.begin() + n.num_common_loop + 1);
-    std::vector<std::vector<Stmt> > update_nest(n.main_nest.begin() + n.num_common_loop + 1,
-                                                n.main_nest.begin() + tloc + 1);
+    std::vector<std::vector<Stmt>> common(n.main_nest.begin(),
+                                          n.main_nest.begin() + n.num_common_loop + 1);
+    std::vector<std::vector<Stmt>> update_nest(n.main_nest.begin() + n.num_common_loop + 1,
+                                               n.main_nest.begin() + tloc + 1);
     update_nest.emplace_back(MakeIfNest(n.main_predicates));
 
     if (intrin->reduce_init.defined()) {
       // init nest
-      std::vector<std::vector<Stmt> > init_nest(n.init_nest.begin(),
-                                                n.init_nest.begin() + tloc + 1);
+      std::vector<std::vector<Stmt>> init_nest(n.init_nest.begin(), n.init_nest.begin() + tloc + 1);
       init_nest.emplace_back(MakeIfNest(n.init_predicates));
       Stmt init = MergeNest(output_bind_nest, intrin->reduce_init);
       init = te::Substitute(init, n.init_vmap);
@@ -476,17 +475,17 @@ TVM_REGISTER_GLOBAL("test.op.InferTensorizeRegion").set_body([](TVMArgs args, TV
   Stage stage = args[0];
   Map<IterVar, Range> dmap = args[1];
   std::unordered_map<IterVar, Range> out_dom;
-  std::unordered_map<Tensor, Array<Range> > in_region;
+  std::unordered_map<Tensor, Array<Range>> in_region;
   ICHECK(stage->op.as<ComputeOpNode>());
   InferTensorizeRegion(stage->op.as<ComputeOpNode>(), stage, as_unordered_map(dmap), &out_dom,
                        &in_region);
-  *ret = Array<ObjectRef>{Map<IterVar, Range>(out_dom), Map<Tensor, Array<Range> >(in_region)};
+  *ret = Array<ObjectRef>{Map<IterVar, Range>(out_dom), Map<Tensor, Array<Range>>(in_region)};
 });
 
 TVM_REGISTER_GLOBAL("test.op.MatchTensorizeBody").set_body([](TVMArgs args, TVMRetValue* ret) {
   Stage stage = args[0];
   Map<IterVar, Range> out_dom = args[1];
-  Map<Tensor, Array<Range> > in_region = args[2];
+  Map<Tensor, Array<Range>> in_region = args[2];
   TensorIntrin intrin = args[3];
   Map<Var, Range> vrange;
   ICHECK(stage->op.as<ComputeOpNode>());
diff --git a/src/te/schedule/graph.h b/src/te/schedule/graph.h
index bb98ff4b706d..d31473d1b5a0 100644
--- a/src/te/schedule/graph.h
+++ b/src/te/schedule/graph.h
@@ -38,17 +38,17 @@ namespace te {
 /*!
  * \brief data structure of Operation->Tensors it reads
  */
-using ReadGraph = Map<Operation, Array<Tensor> >;
+using ReadGraph = Map<Operation, Array<Tensor>>;
 
 /*!
  * \brief AttachPath maps op-> a list of IterVar
  */
-using AttachPath = Map<Operation, Array<IterVar> >;
+using AttachPath = Map<Operation, Array<IterVar>>;
 
 /*!
  * \brief The map between tensor and operation it feeds to.
  */
-using FeedGraph = std::unordered_map<Tensor, std::vector<Operation> >;
+using FeedGraph = std::unordered_map<Tensor, std::vector<Operation>>;
 
 /*!
  * \brief Get read graph of each operation to all the
diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc
index a8363fd084cd..39243bf2216f 100644
--- a/src/te/schedule/schedule_dataflow_rewrite.cc
+++ b/src/te/schedule/schedule_dataflow_rewrite.cc
@@ -507,7 +507,7 @@ void RebaseNonZeroMinLoop(ScheduleNode* sch) {
 void InjectInline(ScheduleNode* sch, bool feature_extraction_mode) {
   sch->InvalidateCache();
 
-  std::vector<Array<PrimExpr> > new_body(sch->stages.size());
+  std::vector<Array<PrimExpr>> new_body(sch->stages.size());
   std::vector<bool> changed(sch->stages.size(), false);
   std::vector<Stmt> new_hybrid_body(sch->stages.size());
   std::vector<bool> hybrid_changed(sch->stages.size(), false);
diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc
index 1ac0f1f1705e..cae4109a6026 100644
--- a/src/tir/ir/buffer.cc
+++ b/src/tir/ir/buffer.cc
@@ -152,7 +152,7 @@ inline std::pair<bool, PrimExpr> MergeMulModInner(arith::Analyzer* analyzer,
 // Otherwise, the elements will be added to the no_opt_sum variable
 inline void MergeMulModInsertElements(const std::vector<const PrimExpr*>& eles,
                                       std::list<PrimExpr>* mult_exprs,
-                                      std::list<std::pair<PrimExpr, PrimExpr> >* mod_exprs,
+                                      std::list<std::pair<PrimExpr, PrimExpr>>* mod_exprs,
                                       PrimExpr* no_opt_sum, bool* has_mult, bool* has_mod) {
   using namespace tir;
   *has_mult = false;
@@ -194,13 +194,13 @@ inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) {
   simplified_base = analyzer->Simplify(simplified_base);
   std::vector<const PrimExpr*> eles = ExprSplitAddition(simplified_base);
   std::list<PrimExpr> mult_exprs;
-  std::list<std::pair<PrimExpr, PrimExpr> > mod_exprs;
+  std::list<std::pair<PrimExpr, PrimExpr>> mod_exprs;
   PrimExpr no_opt_sum;
   bool has_mult;
   bool has_mod;
   MergeMulModInsertElements(eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, &has_mod);
   bool find_opt = false;
-  std::list<std::pair<PrimExpr, PrimExpr> >::iterator search_mod_it = mod_exprs.begin();
+  std::list<std::pair<PrimExpr, PrimExpr>>::iterator search_mod_it = mod_exprs.begin();
   // 2. Exhaustive Search
   while (search_mod_it != mod_exprs.end()) {
     std::list<PrimExpr>::iterator mult_it = mult_exprs.begin();
@@ -238,7 +238,7 @@ inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) {
   for (std::list<PrimExpr>::iterator it = mult_exprs.begin(); it != mult_exprs.end(); ++it) {
     no_opt_sum = no_opt_sum.get() ? no_opt_sum + *it : *it;
   }
-  for (std::list<std::pair<PrimExpr, PrimExpr> >::iterator it = mod_exprs.begin();
+  for (std::list<std::pair<PrimExpr, PrimExpr>>::iterator it = mod_exprs.begin();
        it != mod_exprs.end(); ++it) {
     no_opt_sum = no_opt_sum.get() ? no_opt_sum + indexmod(it->first, it->second)
                                   : indexmod(it->first, it->second);
diff --git a/src/tir/transforms/coproc_sync.cc b/src/tir/transforms/coproc_sync.cc
index f3a9f990599f..1b1cabeadb71 100644
--- a/src/tir/transforms/coproc_sync.cc
+++ b/src/tir/transforms/coproc_sync.cc
@@ -111,7 +111,7 @@ class CoProcSyncPlanner : public StorageAccessVisitor {
   }
 
   // Write synchronization to be inserted before or after stmt.
-  std::unordered_map<const Object*, std::vector<Stmt> > sync_;
+  std::unordered_map<const Object*, std::vector<Stmt>> sync_;
 
  protected:
   bool Enabled(const VarNode* buf, const StorageScope& scope) const final {
@@ -230,8 +230,8 @@ class CoProcBarrierDetector : public StorageAccessVisitor {
     PlanWriteBarrier(scope_.back(), nullptr);
   }
 
-  std::unordered_map<const Object*, std::vector<Stmt> > barrier_before_;
-  std::unordered_map<const Object*, std::vector<Stmt> > barrier_after_;
+  std::unordered_map<const Object*, std::vector<Stmt>> barrier_before_;
+  std::unordered_map<const Object*, std::vector<Stmt>> barrier_after_;
 
  protected:
   bool Enabled(const VarNode* buf, const StorageScope& scope) const final {
@@ -251,7 +251,7 @@ class CoProcBarrierDetector : public StorageAccessVisitor {
   // Plan write barrier at Read after write point.
   std::vector<AccessEntry> PlanWriteBarrier(std::vector<StmtEntry> seq, const ForNode* loop) {
     std::vector<AccessEntry> read_seq;
-    std::unordered_map<const VarNode*, std::vector<AccessEntry> > write_set;
+    std::unordered_map<const VarNode*, std::vector<AccessEntry>> write_set;
 
     auto fupdate = [&](size_t i, const AccessEntry& acc) {
       auto it = write_set.find(acc.buffer.get());
@@ -289,7 +289,7 @@ class CoProcBarrierDetector : public StorageAccessVisitor {
 
   std::vector<AccessEntry> PlanReadBarrier(std::vector<StmtEntry> seq, const ForNode* loop) {
     std::vector<AccessEntry> write_seq;
-    std::unordered_map<const VarNode*, std::vector<AccessEntry> > read_set;
+    std::unordered_map<const VarNode*, std::vector<AccessEntry>> read_set;
 
     auto fupdate = [&](size_t i, const AccessEntry& acc) {
       auto it = read_set.find(acc.buffer.get());
@@ -443,8 +443,8 @@ class CoProcInstDepDetector : public StmtVisitor {
 
   // insert before is stored in reverse order
   // the first element is closest to the node.
-  std::unordered_map<const Object*, std::vector<Stmt> > insert_before_;
-  std::unordered_map<const Object*, std::vector<Stmt> > insert_after_;
+  std::unordered_map<const Object*, std::vector<Stmt>> insert_before_;
+  std::unordered_map<const Object*, std::vector<Stmt>> insert_after_;
 
  private:
   // state in the sync entry
@@ -456,9 +456,9 @@ class CoProcInstDepDetector : public StmtVisitor {
     // Set of all possible contexts in the exit moment.
     std::unordered_set<int> exit_ctx;
     // existing pop performed at enter
-    std::vector<std::pair<int, int> > enter_pop;
+    std::vector<std::pair<int, int>> enter_pop;
     // existing push performed at exit
-    std::vector<std::pair<int, int> > exit_push;
+    std::vector<std::pair<int, int>> exit_push;
     // clear the state
     void clear() {
       node = nullptr;
@@ -473,8 +473,8 @@ class CoProcInstDepDetector : public StmtVisitor {
   // return the push/pop message at enter/exit of the Block
   // after considering the existing unmatcheded events and added events
   void InjectSync(const SyncState& prev, const SyncState& next,
-                  std::vector<std::pair<int, int> >* prev_exit_push,
-                  std::vector<std::pair<int, int> >* next_enter_pop) {
+                  std::vector<std::pair<int, int>>* prev_exit_push,
+                  std::vector<std::pair<int, int>>* next_enter_pop) {
     prev_exit_push->clear();
     next_enter_pop->clear();
     // quick path
@@ -491,9 +491,9 @@ class CoProcInstDepDetector : public StmtVisitor {
       return;
     }
     // complicate path.
-    std::vector<std::pair<int, int> > vpush = prev.exit_push;
-    std::vector<std::pair<int, int> > vpop = next.enter_pop;
-    std::vector<std::pair<int, int> > pending;
+    std::vector<std::pair<int, int>> vpush = prev.exit_push;
+    std::vector<std::pair<int, int>> vpop = next.enter_pop;
+    std::vector<std::pair<int, int>> pending;
     for (int from : prev.exit_ctx) {
       for (int to : next.enter_ctx) {
         if (from != to) {
@@ -556,7 +556,7 @@ class CoProcInstDepDetector : public StmtVisitor {
 
   void UpdateState() {
     if (last_state_.node != nullptr) {
-      std::vector<std::pair<int, int> > t1, t2;
+      std::vector<std::pair<int, int>> t1, t2;
       InjectSync(last_state_, curr_state_, &t1, &t2);
       std::swap(last_state_, curr_state_);
     } else {
@@ -642,8 +642,8 @@ class CoProcSyncInserter : public StmtMutator {
  private:
   // insert before is stored in reverse order
   // the first element is closest to the node.
-  std::unordered_map<const Object*, std::vector<Stmt> > insert_before_;
-  std::unordered_map<const Object*, std::vector<Stmt> > insert_after_;
+  std::unordered_map<const Object*, std::vector<Stmt>> insert_before_;
+  std::unordered_map<const Object*, std::vector<Stmt>> insert_after_;
 };
 
 Stmt CoProcSync(Stmt stmt) { return CoProcSyncInserter().Insert(std::move(stmt)); }
diff --git a/src/tir/transforms/inject_double_buffer.cc b/src/tir/transforms/inject_double_buffer.cc
index 03f2ccd40dd1..d974e3c8108a 100644
--- a/src/tir/transforms/inject_double_buffer.cc
+++ b/src/tir/transforms/inject_double_buffer.cc
@@ -299,9 +299,9 @@ class DoubleBufferInjector : public StmtExprMutator {
   // The current loop next
   std::vector<const ForNode*> loop_nest_;
   // The allocs to be appended before the loop
-  std::unordered_map<const ForNode*, std::vector<Stmt> > loop_allocs_;
+  std::unordered_map<const ForNode*, std::vector<Stmt>> loop_allocs_;
   // The stmt to be appended before the loop
-  std::unordered_map<const ForNode*, std::vector<Stmt> > loop_pre_;
+  std::unordered_map<const ForNode*, std::vector<Stmt>> loop_pre_;
   // The allocation size of the buffer
   std::unordered_map<const VarNode*, StorageEntry> dbuffer_info_;
   // The updated Buffer objects
diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc
index 83722d7b8aab..455140c75c13 100644
--- a/src/tir/transforms/inject_virtual_thread.cc
+++ b/src/tir/transforms/inject_virtual_thread.cc
@@ -177,7 +177,7 @@ class VarTouchedAnalysis : public StmtVisitor {
   // Whether variable is touched by the thread variable.
   std::unordered_set<const VarNode*> touched_var_;
   // x -> all the buffers x read from
-  std::unordered_map<const VarNode*, std::vector<const VarNode*> > affect_;
+  std::unordered_map<const VarNode*, std::vector<const VarNode*>> affect_;
 };
 
 // Inject virtual thread loop
diff --git a/src/tir/transforms/ir_utils.h b/src/tir/transforms/ir_utils.h
index a54eebe4ed05..6915a0e3acc9 100644
--- a/src/tir/transforms/ir_utils.h
+++ b/src/tir/transforms/ir_utils.h
@@ -54,7 +54,7 @@ Stmt MergeNest(const std::vector<Stmt>& nest, Stmt body);
  * \param body body
  * \return The combined Stmt
  */
-Stmt MergeNest(const std::vector<std::vector<Stmt> >& nest, Stmt body);
+Stmt MergeNest(const std::vector<std::vector<Stmt>>& nest, Stmt body);
 
 /*!
  * \brief update array with an unary function
diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc
index 35c96e4fe4e1..4f8ad1223cd2 100644
--- a/src/tir/transforms/make_packed_api.cc
+++ b/src/tir/transforms/make_packed_api.cc
@@ -204,8 +204,8 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
   }
 
   // Need to re-declare vars, in case some arguments also appears in the buffer.
-  std::vector<std::pair<Var, Var> > var_def;
-  std::vector<std::pair<Var, Buffer> > buffer_def;
+  std::vector<std::pair<Var, Var>> var_def;
+  std::vector<std::pair<Var, Buffer>> buffer_def;
 
   for (int i = 0; i < static_cast<int>(func_ptr->params.size()); ++i) {
     Var param = func_ptr->params[i];
@@ -343,7 +343,7 @@ Pass MakePackedAPI(int num_unpacked_args) {
   // packed arguments anyway while `num_unpacked_args` is -1
   auto pass_func = [num_unpacked_args](IRModule m, PassContext ctx) {
     IRModuleNode* mptr = m.CopyOnWrite();
-    std::vector<std::pair<GlobalVar, PrimFunc> > updates;
+    std::vector<std::pair<GlobalVar, PrimFunc>> updates;
 
     for (const auto& kv : mptr->functions) {
       if (auto* n = kv.second.as<PrimFuncNode>()) {
diff --git a/src/tir/transforms/storage_access.h b/src/tir/transforms/storage_access.h
index a48ee73f17fc..ac64e2f5cb65 100644
--- a/src/tir/transforms/storage_access.h
+++ b/src/tir/transforms/storage_access.h
@@ -125,7 +125,7 @@ class StorageAccessVisitor : public StmtExprVisitor {
    */
   StorageScope GetScope(Var buffer_var) const;
   // access scope
-  std::vector<std::vector<StmtEntry> > scope_;
+  std::vector<std::vector<StmtEntry>> scope_;
 
  private:
   // whether access appending is enabled.
diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc
index acb052650036..177017f9a245 100644
--- a/src/tir/transforms/storage_rewrite.cc
+++ b/src/tir/transforms/storage_rewrite.cc
@@ -1010,11 +1010,11 @@ class StoragePlanRewriter : public StmtExprMutator {
   // symbolic free list, for non constant items.
   std::list<StorageEntry*> sym_free_list_;
   // The allocation attach map
-  std::unordered_map<const Object*, std::vector<StorageEntry*> > attach_map_;
+  std::unordered_map<const Object*, std::vector<StorageEntry*>> attach_map_;
   // The allocation assign map
   std::unordered_map<const VarNode*, StorageEntry*> alloc_map_;
   // The allocations
-  std::vector<std::unique_ptr<StorageEntry> > alloc_vec_;
+  std::vector<std::unique_ptr<StorageEntry>> alloc_vec_;
   // The buffer objects being remapped
   std::unordered_map<const BufferNode*, Buffer> buffer_remap_;
   // analyzer

From 7f1856d34f03113dc3a7733c010be43446161944 Mon Sep 17 00:00:00 2001
From: Adam Straw <astraw@octoml.ai>
Date: Fri, 26 Aug 2022 14:22:04 -0700
Subject: [PATCH 060/704] [Hexagon] Asynchronous DMA support (#12411)

Adds adds asynchronous DMA support through the Hexagon User DMA engine with unit tests to validate basic functionality. Asynchronous DMA support here means the ability to "kick off" asynchronously a number of DMAs using the Copy API and then to Poll for or Wait on a number of "in flight" (not done) DMAs. Enables future testing and development for asynchronous memory copy on Hexagon. For now, Hexagon DMA support remains synchronous in nature through existing hexagon_user_dma_1d_sync interface which uses asynchronous capable HexagonUserDMA class in a synchronous way --- calling Copy and Wait back to back for each request.

* use ring buffer to store DMA descriptors

* add RingBuffer class; used by HexUserDMA to store descriptors

* add test to overflow the HexagonUserDMA ring buffer
---
 src/runtime/hexagon/hexagon_device_api.cc     |   3 +-
 src/runtime/hexagon/hexagon_user_dma.cc       | 112 ++++++-----
 src/runtime/hexagon/hexagon_user_dma.h        |  97 +++++++++
 .../hexagon/hexagon_user_dma_descriptors.h    |   2 -
 .../hexagon/hexagon_user_dma_instructions.h   |   8 +-
 src/runtime/hexagon/ring_buffer.h             |  94 +++++++++
 .../hexagon/hexagon_user_dma_tests.cc         | 178 ++++++++++++++++
 .../cpp-runtime/hexagon/ring_buffer_tests.cc  | 190 ++++++++++++++++++
 8 files changed, 631 insertions(+), 53 deletions(-)
 create mode 100644 src/runtime/hexagon/hexagon_user_dma.h
 create mode 100644 src/runtime/hexagon/ring_buffer.h
 create mode 100644 tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
 create mode 100644 tests/cpp-runtime/hexagon/ring_buffer_tests.cc

diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 92a7b22784fb..f22afca10bfa 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -170,7 +170,8 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM
   void* src = args[1];
   int size = args[2];
 
-  hexagon_user_dma_1d_sync(dst, src, size);
+  int error_code = hexagon_user_dma_1d_sync(dst, src, size);
+  CHECK_EQ(error_code, 0);
 
   *rv = static_cast<int32_t>(0);
 });
diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc
index 0e3fbd2048f6..8d45b7590bc4 100644
--- a/src/runtime/hexagon/hexagon_user_dma.cc
+++ b/src/runtime/hexagon/hexagon_user_dma.cc
@@ -17,66 +17,47 @@
  * under the License.
  */
 
-#include <algorithm>
+#include "hexagon_user_dma.h"
 
-#include "hexagon_common.h"
-#include "hexagon_user_dma_descriptors.h"
-#include "hexagon_user_dma_instructions.h"
-#include "hexagon_user_dma_registers.h"
+#include <algorithm>
 
 namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-int init_hexagon_user_dma() {
-#if __HEXAGON_ARCH__ >= 68
-  // reset DMA engine
+unsigned int HexagonUserDMA::Init() {
   unsigned int status = dmpause() & DM0_STATUS_MASK;
-  if (status != DM0_STATUS_IDLE) {
-    return DMA_FAILURE;
-  }
-#endif
-  return DMA_SUCCESS;
+  return status;
 }
 
-int hexagon_user_dma_1d_sync_helper(void* dst, void* src, uint32_t length) {
-#if __HEXAGON_ARCH__ >= 68
-  static int config_dma = init_hexagon_user_dma();
-  if (config_dma != DMA_SUCCESS) {
+int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) {
+  // length limited to 24 bits
+  if (length > DESC_LENGTH_MASK) {
     return DMA_FAILURE;
   }
 
-  uint64_t src64 = reinterpret_cast<uint64_t>(src);
   // source address limited to 32 bits
-  if (src64 > DESC_SRC_MASK) {
+  uint64_t src64 = reinterpret_cast<uint64_t>(src);
+  if (!src64 || src64 > DESC_SRC_MASK) {
     return DMA_FAILURE;
   }
 
-  uint64_t dst64 = reinterpret_cast<uint64_t>(dst);
   // destination address limited to 32 bits
-  if (dst64 > DESC_DST_MASK) {
-    return DMA_FAILURE;
-  }
-
-  // length limited to 24 bits
-  if (length > DESC_LENGTH_MASK) {
+  uint64_t dst64 = reinterpret_cast<uint64_t>(dst);
+  if (!dst64 || dst64 > DESC_DST_MASK) {
     return DMA_FAILURE;
   }
 
-  uint32_t src32 = src64 & DESC_SRC_MASK;
-  uint32_t dst32 = dst64 & DESC_DST_MASK;
-
-  void* dma_desc = nullptr;
-
-  int ret = posix_memalign(&dma_desc, DMA_DESC_2D_SIZE, DMA_DESC_2D_SIZE);
-  if (ret) {
-    return DMA_FAILURE;
-  }
+  uint32_t src32 = static_cast<uint32_t>(src64);
+  uint32_t dst32 = static_cast<uint32_t>(dst64);
 
+  // get pointer to next descriptor
+  dma_desc_2d_t* dma_desc = descriptors_->Next();
   if (!dma_desc) {
-    return DMA_FAILURE;
+    return DMA_RETRY;
   }
 
+  // populate descriptor fields
   dma_desc_set_state(dma_desc, DESC_STATE_READY);
   dma_desc_set_next(dma_desc, DMA_NULL_PTR);
   dma_desc_set_length(dma_desc, length);
@@ -90,23 +71,60 @@ int hexagon_user_dma_1d_sync_helper(void* dst, void* src, uint32_t length) {
   dma_desc_set_src(dma_desc, src32);
   dma_desc_set_dst(dma_desc, dst32);
 
-  dmstart(dma_desc);
-  unsigned int status = dmwait() & DM0_STATUS_MASK;
-  unsigned int done = dma_desc_get_done(dma_desc);
+  if (first_dma_) {
+    // `dmstart` first descriptor
+    dmstart(dma_desc);
+    first_dma_ = false;
+  } else {
+    // `dmlink` descriptor to tail descriptor
+    dmlink(tail_dma_desc_, dma_desc);
+  }
 
-  free(dma_desc);
+  // update tail
+  tail_dma_desc_ = dma_desc;
+  return DMA_SUCCESS;
+}
 
-  if (status == DM0_STATUS_IDLE && done == DESC_DONE_COMPLETE) {
-    return DMA_SUCCESS;
+void HexagonUserDMA::Wait(uint32_t max_dmas_in_flight) {
+  // wait (forever) until max DMAs in flight <= actual DMAs in flight
+  while (DMAsInFlight() > max_dmas_in_flight) {
   }
-#endif
-  return DMA_FAILURE;
+}
+
+uint32_t HexagonUserDMA::Poll() { return DMAsInFlight(); }
+
+uint32_t HexagonUserDMA::DMAsInFlight() {
+  dmpoll();  // update DMA engine status
+  return descriptors_->InFlight();
+}
+
+HexagonUserDMA::HexagonUserDMA() {
+  // reset DMA engine
+  unsigned int status = Init();
+  CHECK_EQ(status, DM0_STATUS_IDLE);
+
+  auto desc_in_flight = [](dma_desc_2d_t* dma_desc) {
+    unsigned int done = dma_desc_get_done(dma_desc);
+    return (done != DESC_DONE_COMPLETE);
+  };
+  descriptors_ = new RingBuffer<dma_desc_2d_t>(MAX_DMA_DESCRIPTORS, desc_in_flight);
+}
+
+HexagonUserDMA::~HexagonUserDMA() {
+  Init();  // stop DMA engine
+  delete descriptors_;
 }
 
 int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) {
   // One DMA transfer can copy at most DESC_LENGTH_MASK bytes.
   // Make the common case quick.
-  if (length <= DESC_LENGTH_MASK) return hexagon_user_dma_1d_sync_helper(dst, src, length);
+  if (length <= DESC_LENGTH_MASK) {
+    // sync DMA -> `Copy` and then `Wait(0)`
+    int ret_val = HexagonUserDMA::Get().Copy(dst, src, length);
+    if (ret_val != DMA_SUCCESS) return ret_val;
+    HexagonUserDMA::Get().Wait(0);
+    return DMA_SUCCESS;
+  }
 
   // Split big transfers into smaller transfers.
   char* cast_src = static_cast<char*>(src);
@@ -114,8 +132,10 @@ int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) {
   for (uint32_t i = 0; i < length;) {
     // Ensure there is no overflow while updating i
     uint32_t cur_len = std::min<uint32_t>(length - i, DESC_LENGTH_MASK);
-    int ret_val = hexagon_user_dma_1d_sync_helper(&cast_dst[i], &cast_src[i], cur_len);
+    // sync DMA -> `Copy` and then `Wait(0)`
+    int ret_val = HexagonUserDMA::Get().Copy(&cast_dst[i], &cast_src[i], cur_len);
     if (ret_val != DMA_SUCCESS) return ret_val;
+    HexagonUserDMA::Get().Wait(0);
     // 2 cases for new val for i:
     // 1. length - i <= DESC_LENGTH_MASK (<= MAX_UINT)
     //    new_i = i + (length - i) = length, no more iter
diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h
new file mode 100644
index 000000000000..aa00df79c4d0
--- /dev/null
+++ b/src/runtime/hexagon/hexagon_user_dma.h
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_
+#define TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_
+
+#include "hexagon_common.h"
+#include "hexagon_user_dma_descriptors.h"
+#include "hexagon_user_dma_instructions.h"
+#include "hexagon_user_dma_registers.h"
+#include "ring_buffer.h"
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+#define DMA_SUCCESS 0
+#define DMA_FAILURE -1
+#define DMA_RETRY 1
+#define MAX_DMA_DESCRIPTORS 100
+
+class HexagonUserDMA {
+ public:
+  /*!
+   * \brief Initiate DMA to copy memory from source to destination address
+   * \param dst Destination address
+   * \param src Source address
+   * \param length Length in bytes to copy
+   * \returns Status: DMA_SUCCESS or DMA_FAILURE
+   */
+  int Copy(void* dst, void* src, uint32_t length);
+
+  /*!
+   * \brief Wait until the number of DMAs in flight is less than or equal to some maximum
+   * \param max_dmas_in_flight Maximum number of DMAs allowed to be in flight
+   * to satisfy the `Wait` e.g. use `Wait(0)` to wait on "all" outstanding DMAs to complete
+   */
+  void Wait(uint32_t max_dmas_in_flight);
+
+  /*!
+   * \brief Poll the number of DMAs in flight
+   * \returns Number of DMAs in flight
+   */
+  uint32_t Poll();
+
+  //! \brief HexagonUserDMA uses the singleton pattern
+  static HexagonUserDMA& Get() {
+    static HexagonUserDMA* hud = new HexagonUserDMA();
+    return *hud;
+  }
+
+ private:
+  // HexagonUserDMA uses the singleton pattern
+  HexagonUserDMA();
+  ~HexagonUserDMA();
+  HexagonUserDMA(const HexagonUserDMA&) = delete;
+  HexagonUserDMA& operator=(const HexagonUserDMA&) = delete;
+  HexagonUserDMA(HexagonUserDMA&&) = delete;
+  HexagonUserDMA& operator=(HexagonUserDMA&&) = delete;
+
+  //! \brief Initializes the Hexagon User DMA engine
+  unsigned int Init();
+
+  //! \brief Calculates and returns the number of DMAs in flight
+  uint32_t DMAsInFlight();
+
+  //! \brief Tracks whether the very first DMA has been executed
+  bool first_dma_{true};
+
+  //! \brief Tracks the tail DMA descriptor
+  void* tail_dma_desc_{nullptr};
+
+  //! \brief Storage for all DMA descriptors
+  RingBuffer<dma_desc_2d_t>* descriptors_{nullptr};
+};
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_
diff --git a/src/runtime/hexagon/hexagon_user_dma_descriptors.h b/src/runtime/hexagon/hexagon_user_dma_descriptors.h
index 643dbc5e8bf5..913b025df138 100644
--- a/src/runtime/hexagon/hexagon_user_dma_descriptors.h
+++ b/src/runtime/hexagon/hexagon_user_dma_descriptors.h
@@ -126,8 +126,6 @@ namespace hexagon {
 #define DESC_DSTWIDTHOFFSET_MASK 0xFFFF0000
 #define DESC_DSTWIDTHOFFSET_SHIFT 16
 
-#define DMA_SUCCESS 0
-#define DMA_FAILURE -1
 #define DMA_NULL_PTR 0
 
 /**************************/
diff --git a/src/runtime/hexagon/hexagon_user_dma_instructions.h b/src/runtime/hexagon/hexagon_user_dma_instructions.h
index e160b7395658..2345d4daaf21 100644
--- a/src/runtime/hexagon/hexagon_user_dma_instructions.h
+++ b/src/runtime/hexagon/hexagon_user_dma_instructions.h
@@ -24,8 +24,6 @@ namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-#if __HEXAGON_ARCH__ >= 68
-
 inline unsigned int dmpause() {
   unsigned int dm0 = 0;
   asm volatile(" %0 = dmpause" : "=r"(dm0));
@@ -34,6 +32,10 @@ inline unsigned int dmpause() {
 
 inline void dmstart(void* next) { asm volatile(" dmstart(%0)" : : "r"(next)); }
 
+inline void dmlink(void* tail, void* next) {
+  asm volatile(" dmlink(%0, %1)" : : "r"(tail), "r"(next));
+}
+
 inline unsigned int dmpoll() {
   unsigned int dm0 = 0;
   asm volatile(" %0 = dmpoll" : "=r"(dm0));
@@ -70,8 +72,6 @@ inline void dmcfgwr(unsigned int dmindex, unsigned int data) {
   asm volatile(" dmcfgwr(%0, %1)" : : "r"(dmindex), "r"(data));
 }
 
-#endif
-
 }  // namespace hexagon
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/hexagon/ring_buffer.h b/src/runtime/hexagon/ring_buffer.h
new file mode 100644
index 000000000000..d21b2b9953c2
--- /dev/null
+++ b/src/runtime/hexagon/ring_buffer.h
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_HEXAGON_RING_BUFFER_H_
+#define TVM_RUNTIME_HEXAGON_RING_BUFFER_H_
+
+#include <functional>
+
+#include "hexagon_common.h"
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+template <class T>
+class RingBuffer {
+ public:
+  //! \brief Returns the number of Ts in flight
+  uint32_t InFlight() {
+    while (id_oldest_ < id_next_ && !in_flight_(GetAddr(id_oldest_))) {
+      id_oldest_++;
+    }
+    return id_next_ - id_oldest_;
+  }
+
+  //! \brief Returns pointer to next T; null if ring buffer is full
+  T* Next() {
+    if (InFlight() == ring_buff_size_) {
+      return nullptr;
+    }
+    T* next = GetAddr(id_next_);
+    id_next_++;
+    return next;
+  }
+
+  /*! \brief Creates a ring buffer for storage items of type T
+   *  \param ring_buff_size Size of the ring buffer in number of Ts
+   *  \param in_flight Function that determines whether a T is in flight
+   */
+  RingBuffer(uint32_t ring_buff_size, std::function<bool(T*)> in_flight)
+      : ring_buff_size_(ring_buff_size), in_flight_(in_flight) {
+    CHECK_NE(ring_buff_size, 0);
+    int ret = posix_memalign(reinterpret_cast<void**>(&ring_buff_ptr_), sizeof(T),
+                             sizeof(T) * ring_buff_size_);
+    CHECK_EQ(ret, 0);
+    CHECK_NE(ring_buff_ptr_, nullptr);
+  }
+
+  ~RingBuffer() { free(ring_buff_ptr_); }
+
+ private:
+  //! \brief Returns the address of a T given its index
+  T* GetAddr(uint32_t id) const {
+    uint32_t ring_buff_index = id % ring_buff_size_;
+    return ring_buff_ptr_ + ring_buff_index;
+  }
+
+  //! \brief Pointer to the ring buffer
+  T* ring_buff_ptr_{nullptr};
+
+  //! \brief Size of the ring buffer in number of Ts
+  const uint32_t ring_buff_size_;
+
+  //! \brief Function that determines whether a T is in flight
+  const std::function<bool(T*)> in_flight_;
+
+  //! \brief Tracks the ID of the next T to be added to the ring buffer
+  uint32_t id_next_{0};
+
+  //! \brief Tracks the ID of the oldest T in flight
+  uint32_t id_oldest_{0};
+};
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_HEXAGON_RING_BUFFER_H_
diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
new file mode 100644
index 000000000000..bf7a23712d7d
--- /dev/null
+++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../src/runtime/hexagon/hexagon_user_dma.h"
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::hexagon;
+
+class HexagonUserDMATest : public ::testing::Test {
+  void SetUp() override {
+    src = malloc(length);
+    dst = malloc(length);
+    ASSERT_NE(src, nullptr);
+    ASSERT_NE(dst, nullptr);
+
+    src_char = static_cast<char*>(src);
+    dst_char = static_cast<char*>(dst);
+    for (uint32_t i = 0; i < length; ++i) {
+      src_char[i] = 1;
+      dst_char[i] = 0;
+    }
+  }
+  void TearDown() override {
+    free(src);
+    free(dst);
+  }
+
+ public:
+  int ret{0};
+  void* src{nullptr};
+  void* dst{nullptr};
+  char* src_char{nullptr};
+  char* dst_char{nullptr};
+  uint32_t length{0x4000};  // 16KB
+};
+
+TEST_F(HexagonUserDMATest, wait) {
+  HexagonUserDMA::Get().Wait(0);
+  HexagonUserDMA::Get().Wait(10);
+}
+
+TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(HexagonUserDMA::Get().Poll(), 0); }
+
+TEST_F(HexagonUserDMATest, bad_copy) {
+  uint64_t bigaddr = 0x100000000;
+  void* src64 = reinterpret_cast<void*>(bigaddr);
+  void* dst64 = reinterpret_cast<void*>(bigaddr);
+  uint32_t biglength = 0x1000000;
+  ASSERT_NE(HexagonUserDMA::Get().Copy(dst64, src, length), DMA_SUCCESS);
+  ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src64, length), DMA_SUCCESS);
+  ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src, biglength), DMA_SUCCESS);
+}
+
+TEST_F(HexagonUserDMATest, sync_dma) {
+  // kick off 1 DMA
+  ret = HexagonUserDMA::Get().Copy(dst, src, length);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // wait for DMA to complete
+  HexagonUserDMA::Get().Wait(0);
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+}
+
+TEST_F(HexagonUserDMATest, async_dma_wait) {
+  // kick off 10x duplicate DMAs
+  for (uint32_t i = 0; i < 10; ++i) {
+    ret = HexagonUserDMA::Get().Copy(dst, src, length);
+    ASSERT_EQ(ret, DMA_SUCCESS);
+  }
+
+  // wait for at least 1 DMA to complete
+  HexagonUserDMA::Get().Wait(9);
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+
+  // empty the DMA queue
+  HexagonUserDMA::Get().Wait(0);
+}
+
+TEST_F(HexagonUserDMATest, async_dma_poll) {
+  // kick off 10x duplicate DMAs
+  for (uint32_t i = 0; i < 10; ++i) {
+    ret = HexagonUserDMA::Get().Copy(dst, src, length);
+    ASSERT_EQ(ret, DMA_SUCCESS);
+  }
+
+  // poll until at least 1 DMA is complete
+  while (HexagonUserDMA::Get().Poll() == 10) {
+  };
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+
+  // empty the DMA queue
+  HexagonUserDMA::Get().Wait(0);
+}
+
+// TODO: Run non-pipelined case with sync DMA and execution time vs. pipelined case
+TEST_F(HexagonUserDMATest, pipeline) {
+  uint32_t pipeline_depth = 4;
+  uint32_t pipeline_length = length / pipeline_depth;
+
+  for (uint32_t i = 0; i < pipeline_depth; ++i) {
+    ret |= HexagonUserDMA::Get().Copy(dst_char + i * pipeline_length,
+                                      src_char + i * pipeline_length, pipeline_length);
+  }
+
+  HexagonUserDMA::Get().Wait(3);
+  for (uint32_t i = 0; i < pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+
+  HexagonUserDMA::Get().Wait(2);
+  for (uint32_t i = pipeline_length; i < 2 * pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+
+  HexagonUserDMA::Get().Wait(1);
+  for (uint32_t i = 2 * pipeline_length; i < 3 * pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+
+  HexagonUserDMA::Get().Wait(0);
+  for (uint32_t i = 3 * pipeline_length; i < 4 * pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+
+  // verify
+  ASSERT_EQ(ret, DMA_SUCCESS);
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(2, dst_char[i]);
+  }
+}
+
+TEST_F(HexagonUserDMATest, overflow_ring_buffer) {
+  uint32_t number_of_dmas = 0x400;  // 1k
+  uint32_t length_of_each_dma = length / number_of_dmas;
+
+  for (uint32_t i = 0; i < number_of_dmas; ++i) {
+    do {
+      ret = HexagonUserDMA::Get().Copy(dst_char + i * length_of_each_dma,
+                                       src_char + i * length_of_each_dma, length_of_each_dma);
+    } while (ret == DMA_RETRY);
+    ASSERT_EQ(ret, DMA_SUCCESS);
+  }
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+}
\ No newline at end of file
diff --git a/tests/cpp-runtime/hexagon/ring_buffer_tests.cc b/tests/cpp-runtime/hexagon/ring_buffer_tests.cc
new file mode 100644
index 000000000000..cd40dca87b02
--- /dev/null
+++ b/tests/cpp-runtime/hexagon/ring_buffer_tests.cc
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../src/runtime/hexagon/ring_buffer.h"
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::hexagon;
+
+class RingBufferTest : public ::testing::Test {
+  void SetUp() override { ring_buff = new RingBuffer<int>(size, in_flight); }
+  void TearDown() override { delete ring_buff; }
+
+ public:
+  std::function<bool(int*)> in_flight = [](int* ptr) {
+    if (*ptr == 42) {
+      // finished
+      return false;
+    }
+    // in flight
+    return true;
+  };
+
+  int finished = 42;
+  int inflight = 43;
+  uint32_t size = 4;
+  uint32_t half = size / 2;
+  RingBuffer<int>* ring_buff;
+};
+
+TEST_F(RingBufferTest, zero_size_ring_buffer) {
+  ASSERT_THROW(RingBuffer<int>(0, in_flight), InternalError);
+}
+
+TEST_F(RingBufferTest, in_flight) { ASSERT_EQ(ring_buff->InFlight(), 0); }
+
+TEST_F(RingBufferTest, next) {
+  // get pointer to first item
+  int* ptr = ring_buff->Next();
+  ASSERT_NE(ptr, nullptr);
+
+  // mark it in flight and check
+  *ptr = inflight;
+  ASSERT_EQ(ring_buff->InFlight(), 1);
+
+  // mark it finished and check
+  *ptr = finished;
+  ASSERT_EQ(ring_buff->InFlight(), 0);
+}
+
+TEST_F(RingBufferTest, full) {
+  // fill the ring buffer
+  for (int i = 0; i < size; ++i) {
+    int* ptr = ring_buff->Next();
+    ASSERT_NE(ptr, nullptr);
+
+    // mark in flight and check
+    *ptr = inflight;
+    ASSERT_EQ(ring_buff->InFlight(), i + 1);
+  }
+
+  // check that the ring buffer is full
+  ASSERT_EQ(ring_buff->Next(), nullptr);
+  ASSERT_EQ(ring_buff->InFlight(), size);
+}
+
+TEST_F(RingBufferTest, wrap) {
+  // fill the ring buffer, but mark each finished
+  bool first = true;
+  int* firstptr = nullptr;
+  for (int i = 0; i < size; ++i) {
+    int* ptr = ring_buff->Next();
+    ASSERT_NE(ptr, nullptr);
+
+    // save first ptr for later comparison
+    if (first) {
+      firstptr = ptr;
+      first = false;
+    }
+
+    // mark finished and check
+    *ptr = finished;
+    ASSERT_EQ(ring_buff->InFlight(), 0);
+  }
+
+  // reuse the first ring buffer entry
+  int* ptr = ring_buff->Next();
+  ASSERT_EQ(ptr, firstptr);
+
+  // mark it in flight and check
+  *ptr = inflight;
+  ASSERT_EQ(ring_buff->InFlight(), 1);
+
+  // mark it finished and check
+  *ptr = finished;
+  ASSERT_EQ(ring_buff->InFlight(), 0);
+}
+
+TEST_F(RingBufferTest, wrap_corner) {
+  for (int i = 0; i < size; ++i) {
+    int* ptr = ring_buff->Next();
+    *ptr = finished;
+  }
+
+  // reuse the first ring buffer entry
+  int* ptr = ring_buff->Next();
+  ASSERT_NE(ptr, nullptr);
+
+  // user must mark the item "inflight" before checking in flight count
+  // here the "finished" status is inherited from the reused ring buffer entry
+  // thus the in flight count is zero instead one; which the user might expect
+  ASSERT_EQ(ring_buff->InFlight(), 0);
+
+  // marking the item "inflight" after checking the in flight count
+  // will not change the outcome; the ring buffer considers the item "finished"
+  *ptr = inflight;
+  ASSERT_EQ(ring_buff->InFlight(), 0);
+}
+
+TEST_F(RingBufferTest, half_in_flight) {
+  // these will complete
+  for (int i = 0; i < half; ++i) {
+    int* ptr = ring_buff->Next();
+    ASSERT_NE(ptr, nullptr);
+    *ptr = finished;
+    ASSERT_EQ(ring_buff->InFlight(), 0);
+  }
+
+  // these will not complete
+  for (int i = 0; i < half; ++i) {
+    int* ptr = ring_buff->Next();
+    ASSERT_NE(ptr, nullptr);
+    *ptr = inflight;
+    ASSERT_EQ(ring_buff->InFlight(), i + 1);
+  }
+
+  // check half in flight
+  ASSERT_EQ(ring_buff->InFlight(), half);
+
+  // get pointer to next item
+  int* ptr = ring_buff->Next();
+  ASSERT_NE(ptr, nullptr);
+
+  // mark it inflight and check
+  *ptr = inflight;
+  ASSERT_EQ(ring_buff->InFlight(), 3);
+
+  // mark it finished and check also blocked
+  *ptr = finished;
+  ASSERT_EQ(ring_buff->InFlight(), 3);
+}
+
+TEST_F(RingBufferTest, half_in_flight_blocked) {
+  // these will not complete
+  for (int i = 0; i < half; ++i) {
+    int* ptr = ring_buff->Next();
+    ASSERT_NE(ptr, nullptr);
+    *ptr = inflight;
+    ASSERT_EQ(ring_buff->InFlight(), i + 1);
+  }
+
+  // these would complete, but they are blocked
+  for (int i = half; i < size; ++i) {
+    int* ptr = ring_buff->Next();
+    ASSERT_NE(ptr, nullptr);
+    *ptr = finished;
+    ASSERT_EQ(ring_buff->InFlight(), i + 1);
+  }
+
+  // check that the ring buffer is full
+  ASSERT_EQ(ring_buff->Next(), nullptr);
+  ASSERT_EQ(ring_buff->InFlight(), size);
+}

From 370abe69d24519a5453cead846d328a1c378957f Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Fri, 26 Aug 2022 20:20:42 -0700
Subject: [PATCH 061/704] [MetaSchedule][UX] Make `Database` with-able (#12520)

`ApplyHistoryBest` right now plays a role as the database adaptor to query inside the database.
In fact, the logic could be simplified and users only have to deal with `Database` instead of this
extra object.

- [x] Add `EnterWithScope`/`ExitWithScope`/`Current` to Database
- [x] Migrate `te_filter_func` => "tir_filter" in Relay's pass context
- [x] Migrate `f_take_tuning_record` => "Database.query_tuning_record"
- [x] Migrate `TECompiler` to use `Database`
- [x] Remove apply-history-best

Next PR:
- Migrate `f_direct_dispatch` (potentially unify with `apply_fixed_schedule`?)
---
 .../tvm/meta_schedule/apply_history_best.h    | 115 ------------
 include/tvm/meta_schedule/database.h          |  28 +++
 include/tvm/meta_schedule/extracted_task.h    |  20 ---
 .../tvm/auto_scheduler/testing/tune_relay.py  |  93 +++++-----
 python/tvm/meta_schedule/__init__.py          |   1 -
 .../tvm/meta_schedule/apply_history_best.py   | 130 --------------
 python/tvm/meta_schedule/database/database.py | 104 ++++++++++-
 python/tvm/meta_schedule/default_config.py    |   4 -
 python/tvm/meta_schedule/relay_integration.py |  29 ++-
 .../tvm/meta_schedule/testing/tune_relay.py   |  30 +++-
 python/tvm/meta_schedule/testing/utils.py     |  26 +--
 python/tvm/meta_schedule/tune.py              |  12 +-
 src/meta_schedule/apply_history_best.cc       | 165 ------------------
 src/meta_schedule/database/database.cc        |  64 +++++++
 src/meta_schedule/extracted_task.cc           |  70 --------
 src/meta_schedule/utils.h                     |   1 -
 src/relay/backend/task_extraction.cc          |  25 +--
 src/relay/backend/te_compiler.cc              |   1 +
 src/relay/backend/te_compiler_cache.cc        |  70 ++++----
 src/relay/backend/utils.cc                    |  73 ++++++++
 src/relay/backend/utils.h                     |  31 ++++
 .../test_meta_schedule_auto_tensorize.py      |  25 ++-
 tests/python/unittest/test_link_params.py     |  19 +-
 .../test_meta_schedule_integration.py         |  62 +------
 .../test_meta_schedule_multi_anchor.py        |   2 +-
 .../test_meta_schedule_relay_tir_compute.py   |  18 +-
 .../unittest/test_meta_schedule_tune_relay.py |  57 +++---
 27 files changed, 511 insertions(+), 764 deletions(-)
 delete mode 100644 include/tvm/meta_schedule/apply_history_best.h
 delete mode 100644 python/tvm/meta_schedule/apply_history_best.py
 delete mode 100644 src/meta_schedule/apply_history_best.cc

diff --git a/include/tvm/meta_schedule/apply_history_best.h b/include/tvm/meta_schedule/apply_history_best.h
deleted file mode 100644
index 44a34b3ee496..000000000000
--- a/include/tvm/meta_schedule/apply_history_best.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-#ifndef TVM_META_SCHEDULE_APPLY_HISTORY_BEST_H_
-#define TVM_META_SCHEDULE_APPLY_HISTORY_BEST_H_
-
-#include <tvm/ir/module.h>
-#include <tvm/meta_schedule/database.h>
-#include <tvm/node/reflection.h>
-#include <tvm/runtime/container/array.h>
-#include <tvm/runtime/container/optional.h>
-#include <tvm/runtime/container/string.h>
-#include <tvm/runtime/object.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/target/target.h>
-#include <tvm/te/tensor.h>
-
-namespace tvm {
-namespace meta_schedule {
-
-/*!
- * \brief An integration context that allows application of historically best records from a
- * database
- */
-class ApplyHistoryBestNode : public runtime::Object {
- public:
-  /*! \brief A callback function that filters TE compute */
-  using FTEFilterFunc = runtime::TypedPackedFunc<Optional<tir::PrimFunc>(
-      const Array<te::Tensor, void>&, const Array<runtime::NDArray>&)>;
-  /*! \brief  A callback function that takes a tuning record and does something with it */
-  using FTakeTuningRecord = runtime::TypedPackedFunc<void(const TuningRecord&)>;
-  using FDirectDispatch = runtime::TypedPackedFunc<Optional<IRModule>(const IRModule&)>;
-
-  /*! \brief The database to be queried from */
-  Database database{nullptr};
-  /*! \brief The filtering function for TE computation */
-  FTEFilterFunc te_filter_func{nullptr};
-  /*! \brief The logging function to be used */
-  PackedFunc logging_func;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("database", &database);
-    // `te_filter_func` is not visited
-    // `logging_func` is not visited
-  }
-  /*!
-   * \brief Query the best entry from the database
-   * \param task_name The name of the task to be queried
-   * \param mod The module to be queried
-   * \param target The target to be queried
-   * \param dispatched The IRs after dispatch
-   * \param f_take_tuning_record A callback function that takes a tuning record and does something
-   *   with it.
-   * \param f_direct_dispatch A function that directly dispatches an IRModule to the given workload
-   *   as result if available, skipping the database query.
-   */
-  Optional<IRModule> Query(runtime::String task_name, IRModule mod, Target target,
-                           Optional<Array<IRModule>> dispatched,
-                           FTakeTuningRecord f_take_tuning_record,
-                           FDirectDispatch f_direct_dispatch = nullptr);
-
-  static constexpr const char* _type_key = "meta_schedule.ApplyHistoryBest";
-  TVM_DECLARE_FINAL_OBJECT_INFO(ApplyHistoryBestNode, runtime::Object);
-};
-
-/*!
- * \brief Managed reference to ApplyHistoryBestNode
- * \sa ApplyHistoryBestNode
- */
-class ApplyHistoryBest : public runtime::ObjectRef {
- public:
-  /*!
-   * \brief Constructor
-   * \param database The database to be queried from
-   * \param te_filter_func The filtering function for TE computation
-   * \param logging_func The logging function to use
-   */
-  explicit ApplyHistoryBest(Database database, ApplyHistoryBestNode::FTEFilterFunc te_filter_func,
-                            PackedFunc logging_func);
-  /*!
-   * \brief The current ApplyHistoryBest in the context
-   * \return The ApplyHistoryBest in the current scope.
-   */
-  static Optional<ApplyHistoryBest> Current();
-
-  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(ApplyHistoryBest, runtime::ObjectRef,
-                                                    ApplyHistoryBestNode);
-
- protected:
-  friend class ApplyHistoryBestInternal;
-  /*! \brief Entering the scope of the context manager */
-  void EnterWithScope();
-  /*! \brief Exiting the scope of the context manager */
-  void ExitWithScope();
-};
-
-}  // namespace meta_schedule
-}  // namespace tvm
-
-#endif  // TVM_META_SCHEDULE_APPLY_HISTORY_BEST_H_
diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h
index 1c260d9d748a..0e7f45d39332 100644
--- a/include/tvm/meta_schedule/database.h
+++ b/include/tvm/meta_schedule/database.h
@@ -203,6 +203,27 @@ class DatabaseNode : public runtime::Object {
    * \return The size of the database.
    */
   virtual int64_t Size() = 0;
+  /*!
+   * \brief Query the best record of the given workload from the database.
+   * \param mod The IRModule to be searched for.
+   * \param target The target to be searched for.
+   * \return The best record of the given workload; NullOpt if not found.
+   */
+  virtual Optional<TuningRecord> QueryTuningRecord(IRModule mod, Target target);
+  /*!
+   * \brief Query the best schedule of the given workload from the database.
+   * \param mod The IRModule to be searched for.
+   * \param target The target to be searched for.
+   * \return The schedule in the best schedule of the given workload; NullOpt if not found.
+   */
+  virtual Optional<tir::Schedule> QuerySchedule(IRModule mod, Target target);
+  /*!
+   * \brief Query the best IRModule of the given workload from the database.
+   * \param mod The IRModule to be searched for.
+   * \param target The target to be searched for.
+   * \return The IRModule in the best IRModule of the given workload; NullOpt if not found.
+   */
+  virtual Optional<IRModule> QueryIRModule(IRModule mod, Target target);
 
   static constexpr const char* _type_key = "meta_schedule.Database";
   TVM_DECLARE_BASE_OBJECT_INFO(DatabaseNode, runtime::Object);
@@ -339,6 +360,13 @@ class Database : public runtime::ObjectRef {
                                      PyDatabaseNode::FGetTopK f_get_top_k,
                                      PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records,
                                      PyDatabaseNode::FSize f_size);
+  /*! \return The current Database in the scope. */
+  static Optional<Database> Current();
+  /*! \brief Entering the scope of the context manager */
+  void EnterWithScope();
+  /*! \brief Exiting the scope of the context manager */
+  void ExitWithScope();
+
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(Database, runtime::ObjectRef, DatabaseNode);
 };
 
diff --git a/include/tvm/meta_schedule/extracted_task.h b/include/tvm/meta_schedule/extracted_task.h
index bce40e6b95f0..239bf0dc5777 100644
--- a/include/tvm/meta_schedule/extracted_task.h
+++ b/include/tvm/meta_schedule/extracted_task.h
@@ -76,26 +76,6 @@ class ExtractedTask : public runtime::ObjectRef {
                                                     ExtractedTaskNode);
 };
 
-/*!
- * \brief The default TE task filter
- * \param args The input/output arguments of the TE compute graph
- * \param constants Raw data for constant tensors in args. If the size of this array is N, the last
- * N tensors in args will be treated as constant tensors.
- * \return NullOpt if the task is filtered out, otherwise the task in PrimFunc
- */
-Optional<tvm::tir::PrimFunc> DefaultTaskFilter(const Array<tvm::te::Tensor, void>& args,
-                                               const Array<runtime::NDArray>& constants);
-
-/*!
- * \brief The default TE task filter, with `te.extern` allowed
- * \param args The input/output arguments of the TE compute graph
- * \param constants Raw data for constant tensors in args. If the size of this array is N, the last
- * N tensors in args will be treated as constant tensors.
- * \return NullOpt if the task is filtered out, otherwise the task in PrimFunc
- */
-Optional<tir::PrimFunc> DefaultTaskFilterAllowExtern(const Array<tvm::te::Tensor, void>& args,
-                                                     const Array<runtime::NDArray>& constants);
-
 }  // namespace meta_schedule
 }  // namespace tvm
 
diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py
index fe747af7972c..2d84389f9de1 100644
--- a/python/tvm/auto_scheduler/testing/tune_relay.py
+++ b/python/tvm/auto_scheduler/testing/tune_relay.py
@@ -15,10 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-docstring
-from distutils.util import strtobool
 import argparse
 import json
 import os
+from distutils.util import strtobool
 
 import tvm
 from tvm import auto_scheduler
@@ -26,7 +26,7 @@
 from tvm import relay
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer
+from tvm.meta_schedule.testing.tune_utils import create_timer, generate_input_data
 from tvm.meta_schedule.utils import cpu_count
 from tvm.support import describe
 
@@ -170,53 +170,62 @@ def main():
         ARGS.input_shape,
         cache_dir=ARGS.cache_dir,
     )
-    input_info = {input_name: input_shape}
+    input_info = [
+        {
+            "name": input_name,
+            "shape": input_shape,
+            "dtype": input_dtype,
+        },
+    ]
     input_data = {
-        item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape
+        item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in input_info
     }
-    for input_name, input_shape in input_info.items():
-        print(f"  input_name : {input_name}")
-        print(f"  input_shape: {input_shape}")
-        print(f"  input_dtype: {input_dtype}")
+    for item in input_info:
+        print(f"  input_name : {item['name']}")
+        print(f"  input_shape: {item['shape']}")
+        print(f"  input_dtype: {item['dtype']}")
 
     with ms.Profiler() as profiler:
-        tasks, task_weights = auto_scheduler.extract_tasks(
-            mod["main"],
-            params,
-            target=ARGS.target,
-            hardware_params=hardware_params,
-        )
-        for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
-            print(
-                f"==== Task {idx}: {task.desc} "
-                f"(weight {task_weight} key: {task.workload_key}) ====="
-            )
-            print(task.compute_dag)
-
-        if ARGS.num_trials > 0:
-            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
-            tuner.tune(
-                auto_scheduler.TuningOptions(
-                    num_measure_trials=ARGS.num_trials,
-                    runner=runner,
-                    measure_callbacks=[
-                        auto_scheduler.RecordToFile(log_file),
-                    ],
-                ),
-                adaptive_training=ARGS.adaptive_training,
+        with ms.Profiler.timeit("TaskExtraction"):
+            tasks, task_weights = auto_scheduler.extract_tasks(
+                mod["main"],
+                params,
+                target=ARGS.target,
+                hardware_params=hardware_params,
             )
+            for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
+                print(
+                    f"==== Task {idx}: {task.desc} "
+                    f"(weight {task_weight} key: {task.workload_key}) ====="
+                )
+                print(task.compute_dag)
+
+        with ms.Profiler.timeit("Tuning"):
+            if ARGS.num_trials > 0:
+                tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
+                tuner.tune(
+                    auto_scheduler.TuningOptions(
+                        num_measure_trials=ARGS.num_trials,
+                        runner=runner,
+                        measure_callbacks=[
+                            auto_scheduler.RecordToFile(log_file),
+                        ],
+                    ),
+                    adaptive_training=ARGS.adaptive_training,
+                )
 
         relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend]
-        with auto_scheduler.ApplyHistoryBest(log_file):
-            with tvm.transform.PassContext(
-                opt_level=3,
-                config={"relay.backend.use_auto_scheduler": True},
-            ):
-                lib = relay_build(
-                    mod,
-                    target=ARGS.target,
-                    params=params,
-                )
+        with ms.Profiler.timeit("PostTuningCompilation"):
+            with auto_scheduler.ApplyHistoryBest(log_file):
+                with tvm.transform.PassContext(
+                    opt_level=3,
+                    config={"relay.backend.use_auto_scheduler": True},
+                ):
+                    lib = relay_build(
+                        mod,
+                        target=ARGS.target,
+                        params=params,
+                    )
     print("Tuning Time:")
     print(profiler.table())
 
diff --git a/python/tvm/meta_schedule/__init__.py b/python/tvm/meta_schedule/__init__.py
index f60d0a5490f5..cf348d49f4e2 100644
--- a/python/tvm/meta_schedule/__init__.py
+++ b/python/tvm/meta_schedule/__init__.py
@@ -30,7 +30,6 @@
     search_strategy,
     space_generator,
 )
-from .apply_history_best import ApplyHistoryBest
 from .extracted_task import ExtractedTask
 from .profiler import Profiler
 from .relay_integration import (
diff --git a/python/tvm/meta_schedule/apply_history_best.py b/python/tvm/meta_schedule/apply_history_best.py
deleted file mode 100644
index a7b9b20bf244..000000000000
--- a/python/tvm/meta_schedule/apply_history_best.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""A context manager that injects the best tuning record in the database into compilation"""
-import logging
-from typing import Callable, List, Optional, Union
-
-from tvm._ffi import get_global_func, register_object
-from tvm.ir import IRModule
-from tvm.runtime import Object
-from tvm.target import Target
-from tvm.te import Tensor
-from tvm.tir import PrimFunc
-
-from . import _ffi_api
-from .database import Database, TuningRecord
-from .utils import make_logging_func
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-
-
-@register_object("meta_schedule.ApplyHistoryBest")
-class ApplyHistoryBest(Object):
-    """An integration context that allows application of historically best records from a database
-
-    Parameters
-    ----------
-    database : Database
-        The database to be queried from
-    te_filter_func : Union[str, None, Callable[[List[Tensor], List[NDArray]], PrimFunc]] = None
-        The filtering function for TE computation
-        If it's a string, it's the name of the filtering function. Built in functions are
-          - "meta_schedule.DefaultTaskFilter"
-          - "meta_schedule.DefaultTaskFilterAllowExtern"
-        If it's None, it's the default filtering function
-        If it's a callable, it's the filtering function
-    """
-
-    database: Database
-
-    def __init__(
-        self,
-        database: Database,
-        te_filter_func: Union[str, None, Callable[[List[Tensor]], PrimFunc]] = None,
-    ) -> None:
-        if isinstance(te_filter_func, str):
-            te_filter_func = get_global_func(te_filter_func)
-        self.__init_handle_by_constructor__(
-            _ffi_api.ApplyHistoryBest,  # type: ignore # pylint: disable=no-member
-            database,
-            te_filter_func,
-            make_logging_func(logger),
-        )
-
-    def query(
-        self,
-        task_name: str,
-        mod: IRModule,
-        target: Target,
-        dispatched: Optional[List[IRModule]],
-        f_take_tuning_record: Optional[Callable[[TuningRecord], None]] = None,
-        f_direct_dispatch: Optional[Callable[[IRModule], Optional[IRModule]]] = None,
-    ) -> Union[IRModule, None]:
-        """The entry point of the integration
-
-        Parameters
-        ----------
-        task_name : str
-            The name of the task extracted
-        mod : IRModule
-            The high-level IR
-        target: Target
-            Target Info
-        dispatched : Optional[List[IRModule]]
-            A list of low-level IRs that the high-level IR could potentially dispatch to
-        f_take_tuning_record : Optional[Callable[[TuningRecord], None]] = None
-            A callback function that takes a tuning record and does something with it
-        f_direct_dispatch : Optional[Callable[[IRModule], Optional[IRModule]]] = None
-            A function that directly dispatches an IRModule to the given workload as result if
-            available, skipping the database query.
-
-        Returns
-        -------
-        result : IRModule or None
-            Currently we only have to return tir::PrimFunc, but we wrap it under IRModule for
-            more general future use. None is returned if there is no feedback hint.
-        """
-        return _ffi_api.ApplyHistoryBestQuery(  # type: ignore # pylint: disable=no-member
-            self,
-            task_name,
-            mod,
-            target,
-            dispatched,
-            f_take_tuning_record,
-            f_direct_dispatch,
-        )
-
-    @staticmethod
-    def current() -> Optional["ApplyHistoryBest"]:
-        """The context manager in the current scope
-
-        Returns
-        -------
-        ctx : Optional[ApplyHistoryBest]
-            The ApplyHistoryBest context manager in the current scope.
-            None if it's currently not under any ApplyHistoryBest context.
-        """
-        return _ffi_api.ApplyHistoryBestCurrent()  # type: ignore # pylint: disable=no-member
-
-    def __enter__(self) -> "ApplyHistoryBest":
-        """Entering the scope of the context manager"""
-        _ffi_api.ApplyHistoryBestEnterScope(self)  # type: ignore # pylint: disable=no-member
-        return self
-
-    def __exit__(self, ptype, value, trace) -> None:
-        """Exiting the scope of the context manager"""
-        _ffi_api.ApplyHistoryBestExitScope(self)  # type: ignore # pylint: disable=no-member
diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py
index 0c11f77591cc..68283b4554e5 100644
--- a/python/tvm/meta_schedule/database/database.py
+++ b/python/tvm/meta_schedule/database/database.py
@@ -15,13 +15,14 @@
 # specific language governing permissions and limitations
 # under the License.
 """TuningRecord database"""
-from typing import Any, Callable, List, Optional
+from typing import Any, Callable, List, Optional, Union
 
 from tvm._ffi import register_object
 from tvm.ir.module import IRModule
 from tvm.runtime import Object
 from tvm.target import Target
-from tvm.tir.schedule import Trace
+from tvm.tir.schedule import Schedule, Trace
+from typing_extensions import Literal  # pylint: disable=wrong-import-order
 
 from .. import _ffi_api
 from ..arg_info import ArgInfo
@@ -234,6 +235,105 @@ def __len__(self) -> int:
         """
         return _ffi_api.DatabaseSize(self)  # type: ignore # pylint: disable=no-member
 
+    def query_tuning_record(self, mod: IRModule, target: Target) -> Optional[TuningRecord]:
+        """Query the best record of the given workload from the database.
+
+        Parameters
+        ----------
+        mod : IRModule
+            The IRModule to be searched for.
+        target : Target
+            The target to be searched for.
+
+        Returns
+        -------
+        tuning_record : Optional[TuningRecord]
+            The best record of the given workload; None if not found.
+        """
+        return _ffi_api.DatabaseQueryTuningRecord(self, mod, target)  # type: ignore # pylint: disable=no-member
+
+    def query_schedule(self, mod: IRModule, target: Target) -> Optional[Schedule]:
+        """Query the best schedule of the given workload from the database.
+
+        Parameters
+        ----------
+        mod : IRModule
+            The IRModule to be searched for.
+        target : Target
+            The target to be searched for.
+
+        Returns
+        -------
+        schedule : Optional[Schedule]
+            The best schedule of the given workload; None if not found.
+        """
+        return _ffi_api.DatabaseQuerySchedule(self, mod, target)  # type: ignore # pylint: disable=no-member
+
+    def query_ir_module(self, mod: IRModule, target: Target) -> Optional[IRModule]:
+        """Query the best IRModule of the given workload from the database.
+
+        Parameters
+        ----------
+        mod : IRModule
+            The IRModule to be searched for.
+        target : Target
+            The target to be searched for.
+
+        Returns
+        -------
+        ir_module : Optional[IRModule]
+            The best IRModule of the given workload; None if not found.
+        """
+        return _ffi_api.DatabaseQueryIRModule(self, mod, target)  # type: ignore # pylint: disable=no-member
+
+    def query(
+        self,
+        mod: IRModule,
+        target: Target,
+        kind: Union[
+            Literal["schedule"],
+            Literal["record"],
+            Literal["ir_module"],
+        ] = "schedule",
+    ) -> Union[Schedule, IRModule, TuningRecord]:
+        """Query the database to retrieve the best optimization outcome of the given workload.
+
+        Parameters
+        ----------
+        mod : IRModule
+            The IRModule to be searched for.
+        target : Target
+            The target to be searched for.
+        kind : str = "schedule" | "record" | "ir_module"
+            The kind of the optimization outcome to be returned.
+
+        Returns
+        -------
+        result : Union[Schedule, IRModule, TuningRecord]
+            The best optimization outcome of the given workload.
+        """
+        if kind == "schedule":
+            return self.query_schedule(mod, target)
+        if kind == "record":
+            return self.query_tuning_record(mod, target)
+        if kind == "ir_module":
+            return self.query_ir_module(mod, target)
+        raise ValueError(f'Unknown kind: {kind}. Candidates are: "schedule", "record", "ir_module"')
+
+    def __enter__(self) -> "Database":
+        """Entering the scope of the context manager"""
+        _ffi_api.DatabaseEnterWithScope(self)  # type: ignore # pylint: disable=no-member
+        return self
+
+    def __exit__(self, ptype, value, trace) -> None:
+        """Exiting the scope of the context manager"""
+        _ffi_api.DatabaseExitWithScope(self)  # type: ignore # pylint: disable=no-member
+
+    @staticmethod
+    def current() -> Optional["Database"]:
+        """Get the current database under scope."""
+        return _ffi_api.DatabaseCurrent()  # type: ignore # pylint: disable=no-member
+
 
 @register_object("meta_schedule.PyDatabase")
 class _PyDatabase(Database):
diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py
index 97cbfc58a6c1..652f09261b2f 100644
--- a/python/tvm/meta_schedule/default_config.py
+++ b/python/tvm/meta_schedule/default_config.py
@@ -20,7 +20,6 @@
 from os import path as osp
 from typing import Callable, Dict, List, Optional, Union
 
-from tvm._ffi.registry import register_func
 from tvm.ir import IRModule
 from tvm.target import Target
 from tvm.tir import PrimFunc
@@ -44,7 +43,6 @@
 FnMutatorProb = Callable[[], Dict[Mutator, float]]
 
 
-@register_func("tvm.meta_schedule.tune.parse_mod")  # for use in ApplyHistoryBest
 def mod(mod: Union[PrimFunc, IRModule]) -> IRModule:  # pylint: disable=redefined-outer-name
     """Normalize the input to an IRModule"""
     if isinstance(mod, PrimFunc):
@@ -53,8 +51,6 @@ def mod(mod: Union[PrimFunc, IRModule]) -> IRModule:  # pylint: disable=redefine
         mod = IRModule({"main": mod})
     if not isinstance(mod, IRModule):
         raise TypeError(f"Expected `mod` to be PrimFunc or IRModule, but gets: {mod}")
-    # in order to make sure the mod can be found in ApplyHistoryBest
-    # different func name can cause structural unequal
     func_names = mod.get_global_vars()
     (func_name,) = func_names
     if len(func_names) == 1 and func_name != "main":
diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py
index d3b3ea796532..24009ab07fcf 100644
--- a/python/tvm/meta_schedule/relay_integration.py
+++ b/python/tvm/meta_schedule/relay_integration.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """MetaSchedule-Relay integration"""
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional
 
 import numpy as np  # type: ignore
 from tvm import nd
@@ -23,8 +23,6 @@
 from tvm.ir import IRModule, transform
 from tvm.runtime import NDArray
 from tvm.target import Target
-from tvm.te import Tensor
-from tvm.tir import PrimFunc
 
 from .extracted_task import ExtractedTask
 from .utils import autotvm_silencer
@@ -38,7 +36,7 @@ def extract_task_from_relay(
     opt_level: int = 3,
     pass_config: Optional[Dict[str, Any]] = None,
     disabled_pass: Optional[List[str]] = None,
-    te_filter_func: Union[str, None, Callable[[List[Tensor]], PrimFunc]] = None,
+    tir_converter: str = "default",
 ) -> List[ExtractedTask]:
     """Extract tuning tasks from a relay program.
 
@@ -56,13 +54,13 @@ def extract_task_from_relay(
         The pass config of the compiler
     disabled_pass : Optional[List[str]]
         The list of disabled passes of the compiler
-    te_filter_func : Callable[[List[tvm.te.Tensor], List[NDArray]], bool]
-        The filter function to filter out the extracted tasks
-        If it's a string, it's the name of the filtering function. Built in functions are
-          - "meta_schedule.DefaultTaskFilter"
-          - "meta_schedule.DefaultTaskFilterAllowExtern"
-        If it's None, it's the default filtering function
-        If it's a callable, it's the filtering function
+    tir_converter : str
+        The filter function to filter out the extracted tasks. Builtin filters:
+          - "default"
+          - "allow_extern"
+        The converter is a PackedFunc registered as f"relay.backend.tir_converter.{tir_converter}",
+        with the signature below:
+            (args: List[te.Tensor], constants: List[NDArray]) -> Optional[tir.PrimFunc]
 
     Returns
     -------
@@ -75,8 +73,6 @@ def extract_task_from_relay(
 
     # pylint: enable=import-outside-toplevel
 
-    if isinstance(te_filter_func, str):
-        te_filter_func = get_global_func(te_filter_func)
     extract_task_func = get_global_func(
         "relay.backend.MetaScheduleExtractTask",
         allow_missing=False,
@@ -89,7 +85,10 @@ def extract_task_from_relay(
     if disabled_pass is None:
         disabled_pass = []
     if pass_config is None:
-        pass_config = {"relay.backend.use_meta_schedule": True}
+        pass_config = {
+            "relay.backend.use_meta_schedule": True,
+            "relay.backend.tir_converter": tir_converter,
+        }
     if params is None:
         params = {}
     relay_params = {}
@@ -110,7 +109,7 @@ def extract_task_from_relay(
         else:
             tophub_context = autotvm.utils.EmptyContext()
         with tophub_context:
-            return list(extract_task_func(mod, target, relay_params, te_filter_func))
+            return list(extract_task_func(mod, target, relay_params))
 
 
 def is_meta_schedule_enabled() -> bool:
diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py
index 8010e36fd656..596a5a736333 100644
--- a/python/tvm/meta_schedule/testing/tune_relay.py
+++ b/python/tvm/meta_schedule/testing/tune_relay.py
@@ -15,16 +15,18 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-docstring
-from distutils.util import strtobool
 import argparse
 import json
 import logging
+from distutils.util import strtobool
+from typing import Dict
 
+import numpy as np  # type: ignore
 import tvm
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer
+from tvm.meta_schedule.testing.tune_utils import create_timer, generate_input_data
 from tvm.support import describe
 
 
@@ -137,14 +139,24 @@ def main():
         ARGS.input_shape,
         cache_dir=ARGS.cache_dir,
     )
-    input_info = {input_name: input_shape}
-    input_data = {
-        item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape
+    input_info = [
+        {
+            "name": input_name,
+            "shape": input_shape,
+            "dtype": input_dtype,
+        },
+    ]
+    input_data: Dict[str, np.ndarray] = {
+        item["name"]: generate_input_data(  # type: ignore
+            item["shape"],  # type: ignore
+            item["dtype"],  # type: ignore
+        )
+        for item in input_info
     }
-    for input_name, input_shape in input_info.items():
-        print(f"  input_name : {input_name}")
-        print(f"  input_shape: {input_shape}")
-        print(f"  input_dtype: {input_dtype}")
+    for item in input_info:
+        print(f"  input_name : {item['name']}")
+        print(f"  input_shape: {item['shape']}")
+        print(f"  input_dtype: {item['dtype']}")
 
     runner = ms.runner.RPCRunner(
         rpc_config=ARGS.rpc_config,
diff --git a/python/tvm/meta_schedule/testing/utils.py b/python/tvm/meta_schedule/testing/utils.py
index dda492008ffe..5919fb47c809 100644
--- a/python/tvm/meta_schedule/testing/utils.py
+++ b/python/tvm/meta_schedule/testing/utils.py
@@ -16,12 +16,13 @@
 # under the License.
 """Testing utility functions in meta schedule"""
 from typing import Callable, Dict, Optional, Union
+
+from tvm import meta_schedule as ms
 from tvm.ir import IRModule, transform
 from tvm.relay import Function as RelayFunc
 from tvm.runtime import NDArray
 from tvm.target import Target
 from tvm.tir import Schedule
-from tvm import meta_schedule as ms
 
 
 def apply_fixed_schedules(
@@ -29,10 +30,10 @@ def apply_fixed_schedules(
     target: Union[str, Target],
     params: Optional[Dict[str, NDArray]],
     schedule_fn: Callable[[ms.ExtractedTask, Schedule], bool],
-    te_filter_func=None,
+    tir_converter: str = "default",
 ):
     """Apply fixed schedules (manually written, without any tunable knobs) as specified by
-    schedule_fn to extracted tasks, and return a database that can be passed to ApplyHistoryBest.
+    schedule_fn to extracted tasks, and return a database that can be passed to compilation.
 
     Parameters
     ----------
@@ -45,13 +46,13 @@ def apply_fixed_schedules(
     schedule_fn : Callable[[ExtractedTask, Schedule], bool]
         A callable that is applied for each extracted task and the corresponding default schedule.
         Returns True if the given schedule should be committed to the database, False otherwise.
-    te_filter_func : Union[str, None, Callable[[List[Tensor], List[NDArray]], PrimFunc]] = None
-        The filtering function for TE computation
-        If it's a string, it's the name of the filtering function. Built in functions are
-          - "meta_schedule.DefaultTaskFilter"
-          - "meta_schedule.DefaultTaskFilterAllowExtern"
-        If it's None, it's the default filtering function
-        If it's a callable, it's the filtering function
+    tir_converter : str
+        The filter function to filter out the extracted tasks. Builtin filters:
+          - "default"
+          - "allow_extern"
+        The converter is a PackedFunc registered as f"relay.backend.tir_converter.{tir_converter}",
+        with the signature below:
+            (args: List[te.Tensor], constants: List[NDArray]) -> Optional[tir.PrimFunc]
 
     Returns
     -------
@@ -64,7 +65,10 @@ def apply_fixed_schedules(
         config[k] = v
 
     extracted_tasks = ms.extract_task_from_relay(
-        relay_mod, target, params, te_filter_func=te_filter_func, pass_config=config
+        relay_mod,
+        target,
+        params,
+        tir_converter=tir_converter,
     )
     database = ms.database.MemoryDatabase()
     for task in extracted_tasks:
diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py
index 447fb56637ef..20eccc30a113 100644
--- a/python/tvm/meta_schedule/tune.py
+++ b/python/tvm/meta_schedule/tune.py
@@ -24,14 +24,12 @@
 
 from tvm.ir import IRModule
 from tvm.ir.transform import PassContext
-from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
 from tvm.runtime import Module, NDArray, vm
 from tvm.target import Target
 from tvm.te import Tensor, create_prim_func
 from tvm.tir import PrimFunc, Schedule
 
 from . import default_config
-from .apply_history_best import ApplyHistoryBest
 from .builder import Builder
 from .cost_model import CostModel
 from .database import Database, TuningRecord
@@ -43,7 +41,7 @@
 from .runner import Runner
 from .schedule_rule import ScheduleRule
 from .search_strategy import EvolutionarySearch, ReplayFunc, ReplayTrace
-from .space_generator import SpaceGenerator
+from .space_generator import PostOrderApply, SpaceGenerator
 from .task_scheduler import GradientBased, RoundRobin
 from .tune_context import TuneContext
 from .utils import autotvm_silencer, batch_parameterize_config
@@ -461,7 +459,7 @@ def _f_block_filter(block, target_names) -> bool:
         mutator_probs=mutator_probs,
         num_threads=num_threads,
     )
-    with Profiler.timeit("ApplyHistoryBest"):
+    with Profiler.timeit("PostTuningCompilation"):
         bests: List[TuningRecord] = database.get_top_k(database.commit_workload(mod), top_k=1)
         if not bests:
             return None
@@ -591,6 +589,7 @@ def tune_relay(
     """
     # pylint: disable=import-outside-toplevel
     from tvm import relay
+
     from .relay_integration import extract_task_from_relay
 
     # pylint: disable=protected-access, enable=import-outside-toplevel
@@ -615,13 +614,14 @@ def tune_relay(
         num_threads=num_threads,
     )
     relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend]
-    with Profiler.timeit("ApplyHistoryBest"):
-        with target, autotvm_silencer(), ApplyHistoryBest(database):
+    with Profiler.timeit("PostTuningCompilation"):
+        with target, autotvm_silencer(), database:
             with PassContext(
                 opt_level=3,
                 config={
                     "relay.backend.use_meta_schedule": True,
                     "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda",
+                    "relay.backend.tir_converter": "default",
                 },
             ):
                 return relay_build(mod, target=target, params=params)
diff --git a/src/meta_schedule/apply_history_best.cc b/src/meta_schedule/apply_history_best.cc
deleted file mode 100644
index 62db29306777..000000000000
--- a/src/meta_schedule/apply_history_best.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-#include <tvm/te/tensor.h>
-
-#include "./utils.h"
-
-namespace tvm {
-namespace meta_schedule {
-
-/**************** Utility functions ****************/
-
-template <class FunctionType, class RetType, class Callback>
-Optional<RetType> GetOnlyOneFunctionCommon(const IRModule& mod, Callback on_found) {
-  if (mod->functions.size() != 1) {
-    return NullOpt;
-  }
-  for (const auto& kv : mod->functions) {
-    const BaseFunc& func = kv.second;
-    if (!func->IsInstance<typename FunctionType::ContainerType>()) {
-      return NullOpt;
-    } else {
-      return on_found(kv);
-    }
-  }
-  return NullOpt;
-}
-
-template <class FunctionType>
-Optional<GlobalVar> GetOnlyOneFunctionKey(const IRModule& mod) {
-  return GetOnlyOneFunctionCommon<FunctionType, GlobalVar>(mod, [](auto kv) { return kv.first; });
-}
-
-template <class FunctionType>
-Optional<FunctionType> GetOnlyOneFunction(const IRModule& mod) {
-  return GetOnlyOneFunctionCommon<FunctionType, FunctionType>(
-      mod, [](auto kv) { return Downcast<FunctionType>(kv.second); });
-}
-
-template <class FunctionType>
-bool HasOnlyOneFunction(const IRModule& mod) {
-  return GetOnlyOneFunction<FunctionType>(mod).defined();
-}
-
-/**************** Context Manager ****************/
-
-class ApplyHistoryBestInternal {
- public:
-  static void EnterScope(ApplyHistoryBest ctx) { ctx.EnterWithScope(); }
-  static void ExitScope(ApplyHistoryBest ctx) { ctx.ExitWithScope(); }
-};
-
-struct ApplyHistoryBestThreadLocalEntry {
-  Optional<ApplyHistoryBest> ctx;
-};
-
-using ApplyHistoryBestThreadLocalStore = dmlc::ThreadLocalStore<ApplyHistoryBestThreadLocalEntry>;
-
-Optional<ApplyHistoryBest> ApplyHistoryBest::Current() {
-  return ApplyHistoryBestThreadLocalStore::Get()->ctx;
-}
-
-void ApplyHistoryBest::EnterWithScope() {
-  Optional<ApplyHistoryBest>& ctx = ApplyHistoryBestThreadLocalStore::Get()->ctx;
-  CHECK(!ctx.defined()) << "ValueError: Nested ApplyHistoryBest context managers are not allowed";
-  ctx = *this;
-}
-
-void ApplyHistoryBest::ExitWithScope() {
-  Optional<ApplyHistoryBest>& ctx = ApplyHistoryBestThreadLocalStore::Get()->ctx;
-  ICHECK(ctx.defined());
-  ctx = NullOpt;
-}
-
-/**************** ApplyHistoryBest ****************/
-
-ApplyHistoryBest::ApplyHistoryBest(Database database,
-                                   ApplyHistoryBestNode::FTEFilterFunc te_filter_func,
-                                   PackedFunc logging_func) {
-  ObjectPtr<ApplyHistoryBestNode> n = make_object<ApplyHistoryBestNode>();
-  n->database = database;
-  n->te_filter_func = te_filter_func;
-  n->logging_func = logging_func;
-  if (te_filter_func == nullptr) {
-    n->te_filter_func = DefaultTaskFilter;
-  }
-  data_ = n;
-}
-
-Optional<IRModule> ApplyHistoryBestNode::Query(runtime::String task_name, IRModule mod,
-                                               Target target, Optional<Array<IRModule>> dispatched,
-                                               FTakeTuningRecord f_take_tuning_record,
-                                               FDirectDispatch f_direct_dispatch) {
-  ICHECK(dispatched.defined());
-  ICHECK_EQ(dispatched.value().size(), 1);
-  ICHECK(HasOnlyOneFunction<relay::Function>(mod)) << mod;
-  IRModule prim_mod = dispatched.value()[0];
-  ICHECK(HasOnlyOneFunction<tir::PrimFunc>(prim_mod)) << prim_mod;
-
-  // Keep the original func name to be returned later.
-  GlobalVar gv = GetOnlyOneFunctionKey<tir::PrimFunc>(prim_mod).value();
-
-  // Unify func name to make sure it can be found in database
-  const auto* parse_mod_func = runtime::Registry::Get("tvm.meta_schedule.tune.parse_mod");
-  ICHECK(parse_mod_func) << "Parse mod function not defined!";
-  prim_mod = (*parse_mod_func)(prim_mod);
-
-  if (f_direct_dispatch != nullptr) {
-    Optional<IRModule> mod = f_direct_dispatch(prim_mod);
-    if (mod.defined()) {
-      TVM_PY_LOG(INFO, logging_func) << "Direct dispatch applied for workload: " << task_name;
-      return mod.value();
-    }
-  }
-  if (database->HasWorkload(prim_mod)) {
-    Array<TuningRecord> records = database->GetTopK(database->CommitWorkload(prim_mod), 1);
-    if (records.size() == 1) {
-      if (f_take_tuning_record != nullptr) {
-        f_take_tuning_record(records[0]);
-      }
-      tir::Schedule sch =
-          tir::Schedule::Traced(records[0]->workload->mod, /*seed=*/-1, /*debug_mask=*/0,
-                                /*error_render_level=*/tir::ScheduleErrorRenderLevel::kNone);
-      records[0]->trace->ApplyToSchedule(sch, false);
-      tir::PrimFunc func = GetOnlyOneFunction<tir::PrimFunc>(sch->mod()).value();
-      // Make sure we return the updated PrimFunc paired with the original func name.
-      return IRModule({{gv, func}});
-    }
-  }
-  TVM_PY_LOG(WARNING, logging_func) << "Cannot find workload: " << task_name;
-  return NullOpt;
-}
-
-TVM_REGISTER_NODE_TYPE(ApplyHistoryBestNode);
-TVM_REGISTER_GLOBAL("meta_schedule.ApplyHistoryBest")
-    .set_body_typed([](Database database, ApplyHistoryBestNode::FTEFilterFunc te_filter_func,
-                       PackedFunc logging_func) -> ApplyHistoryBest {
-      return ApplyHistoryBest(database, te_filter_func, logging_func);
-    });
-TVM_REGISTER_GLOBAL("meta_schedule.ApplyHistoryBestEnterScope")
-    .set_body_typed(ApplyHistoryBestInternal::EnterScope);
-TVM_REGISTER_GLOBAL("meta_schedule.ApplyHistoryBestExitScope")
-    .set_body_typed(ApplyHistoryBestInternal::ExitScope);
-TVM_REGISTER_GLOBAL("meta_schedule.ApplyHistoryBestCurrent")
-    .set_body_typed(ApplyHistoryBest::Current);
-TVM_REGISTER_GLOBAL("meta_schedule.ApplyHistoryBestQuery")
-    .set_body_method<ApplyHistoryBest>(&ApplyHistoryBestNode::Query);
-
-}  // namespace meta_schedule
-}  // namespace tvm
diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc
index 4e180c4fab61..fedd2aa35278 100644
--- a/src/meta_schedule/database/database.cc
+++ b/src/meta_schedule/database/database.cc
@@ -154,6 +154,59 @@ TuningRecord TuningRecord::FromJSON(const ObjectRef& json_obj, const Workload& w
   return TuningRecord(trace, workload, run_secs, target, args_info);
 }
 
+/******** Database ********/
+
+Optional<TuningRecord> DatabaseNode::QueryTuningRecord(IRModule mod, Target target) {
+  if (!this->HasWorkload(mod)) {
+    return NullOpt;
+  }
+  Array<TuningRecord> records = this->GetTopK(this->CommitWorkload(mod), 1);
+  if (records.empty()) {
+    return NullOpt;
+  }
+  ICHECK_EQ(records.size(), 1);
+  return records[0];
+}
+
+Optional<tir::Schedule> DatabaseNode::QuerySchedule(IRModule mod, Target target) {
+  if (Optional<TuningRecord> opt_record = this->QueryTuningRecord(mod, target)) {
+    TuningRecord record = opt_record.value();
+    tir::Schedule sch =
+        tir::Schedule::Traced(record->workload->mod, /*seed=*/-1, /*debug_mask=*/0,
+                              /*error_render_level=*/tir::ScheduleErrorRenderLevel::kDetail);
+    record->trace->ApplyToSchedule(sch, false);
+    return sch;
+  } else {
+    return NullOpt;
+  }
+}
+
+Optional<IRModule> DatabaseNode::QueryIRModule(IRModule mod, Target target) {
+  if (Optional<tir::Schedule> opt_sch = this->QuerySchedule(mod, target)) {
+    return opt_sch.value()->mod();
+  } else {
+    return NullOpt;
+  }
+}
+
+std::vector<Database>* ThreadLocalDatabases() {
+  static thread_local std::vector<Database> tls;
+  return &tls;
+}
+
+void Database::EnterWithScope() { ThreadLocalDatabases()->push_back(*this); }
+
+void Database::ExitWithScope() { ThreadLocalDatabases()->pop_back(); }
+
+Optional<Database> Database::Current() {
+  std::vector<Database>* tls = ThreadLocalDatabases();
+  if (tls->empty()) {
+    return NullOpt;
+  } else {
+    return tls->back();
+  }
+}
+
 /******** PyDatabase ********/
 
 Database Database::PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload,
@@ -194,6 +247,11 @@ TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordAsMeasureCandidate")
 TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordAsJSON")
     .set_body_method<TuningRecord>(&TuningRecordNode::AsJSON);
 TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordFromJSON").set_body_typed(TuningRecord::FromJSON);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseEnterWithScope")
+    .set_body_method(&Database::EnterWithScope);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseExitWithScope")
+    .set_body_method(&Database::ExitWithScope);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseCurrent").set_body_typed(Database::Current);
 TVM_REGISTER_GLOBAL("meta_schedule.DatabaseHasWorkload")
     .set_body_method<Database>(&DatabaseNode::HasWorkload);
 TVM_REGISTER_GLOBAL("meta_schedule.DatabaseCommitWorkload")
@@ -205,6 +263,12 @@ TVM_REGISTER_GLOBAL("meta_schedule.DatabaseGetTopK")
 TVM_REGISTER_GLOBAL("meta_schedule.DatabaseGetAllTuningRecords")
     .set_body_method<Database>(&DatabaseNode::GetAllTuningRecords);
 TVM_REGISTER_GLOBAL("meta_schedule.DatabaseSize").set_body_method<Database>(&DatabaseNode::Size);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseQueryTuningRecord")
+    .set_body_method<Database>(&DatabaseNode::QueryTuningRecord);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseQuerySchedule")
+    .set_body_method<Database>(&DatabaseNode::QuerySchedule);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseQueryIRModule")
+    .set_body_method<Database>(&DatabaseNode::QueryIRModule);
 TVM_REGISTER_GLOBAL("meta_schedule.DatabasePyDatabase").set_body_typed(Database::PyDatabase);
 
 }  // namespace meta_schedule
diff --git a/src/meta_schedule/extracted_task.cc b/src/meta_schedule/extracted_task.cc
index 3406f82eb1f0..ec04361f51ec 100644
--- a/src/meta_schedule/extracted_task.cc
+++ b/src/meta_schedule/extracted_task.cc
@@ -38,67 +38,6 @@ ExtractedTask::ExtractedTask(String task_name, IRModule mod, Target target,
   data_ = n;
 }
 
-Optional<tir::PrimFunc> DefaultTaskFilterImpl(const Array<te::Tensor>& args,
-                                              const Array<runtime::NDArray>& constants,
-                                              bool allow_extern_op) {
-  using namespace ::tvm::te;
-  std::vector<Tensor> stack;
-  std::unordered_set<const TensorNode*> visited;
-  for (const Tensor& v : args) {
-    for (const PrimExpr& e : v->shape) {
-      // Dynamic shape is not supported for now
-      if (!e->IsInstance<IntImmNode>()) {
-        return NullOpt;
-      }
-    }
-    if (!visited.count(v.get())) {
-      visited.insert(v.get());
-      stack.push_back(v);
-    }
-  }
-  while (!stack.empty()) {
-    Tensor tensor = stack.back();
-    stack.pop_back();
-    if (tensor->op->IsInstance<PlaceholderOpNode>()) {
-      // do nothing
-    } else if (tensor->op->IsInstance<ComputeOpNode>() ||
-               (allow_extern_op && tensor->op->IsInstance<ExternOpNode>())) {
-      Array<Tensor> inputs = tensor->op->InputTensors();
-      for (const Tensor& v : inputs) {
-        if (!visited.count(v.get())) {
-          visited.insert(v.get());
-          stack.push_back(v);
-        }
-      }
-    } else {
-      return NullOpt;
-    }
-  }
-  PrimFunc func = te::CreatePrimFuncWithConstants(args, constants);
-  bool dynamic_loop_extent = false;
-  PostOrderVisit(func->body, [&dynamic_loop_extent](const ObjectRef& obj) -> void {
-    if (const auto* loop = obj.as<tir::ForNode>()) {
-      if (!loop->extent->IsInstance<IntImmNode>()) {
-        dynamic_loop_extent = true;
-      }
-    }
-  });
-  if (dynamic_loop_extent) {
-    return NullOpt;
-  }
-  return func;
-}
-
-Optional<tir::PrimFunc> DefaultTaskFilter(const Array<te::Tensor>& args,
-                                          const Array<runtime::NDArray>& constants) {
-  return DefaultTaskFilterImpl(args, constants, false);
-}
-
-Optional<tir::PrimFunc> DefaultTaskFilterAllowExtern(const Array<te::Tensor>& args,
-                                                     const Array<runtime::NDArray>& constants) {
-  return DefaultTaskFilterImpl(args, constants, true);
-}
-
 TVM_REGISTER_NODE_TYPE(ExtractedTaskNode);
 TVM_REGISTER_GLOBAL("meta_schedule.ExtractedTask")
     .set_body_typed([](String task_name, IRModule mod, Target target, Array<IRModule> dispatched,
@@ -106,14 +45,5 @@ TVM_REGISTER_GLOBAL("meta_schedule.ExtractedTask")
       return ExtractedTask(task_name, mod, target, dispatched, weight);
     });
 
-TVM_REGISTER_GLOBAL("meta_schedule.DefaultTaskFilter")
-    .set_body_typed([](const Array<te::Tensor>& args, const Array<runtime::NDArray>& constants) {
-      return DefaultTaskFilter(args, constants);
-    });
-
-TVM_REGISTER_GLOBAL("meta_schedule.DefaultTaskFilterAllowExtern")
-    .set_body_typed([](const Array<te::Tensor>& args, const Array<runtime::NDArray>& constants) {
-      return DefaultTaskFilterAllowExtern(args, constants);
-    });
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index 664a6a609e7f..db37935ec206 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -21,7 +21,6 @@
 
 #include <dmlc/memory_io.h>
 #include <tvm/arith/analyzer.h>
-#include <tvm/meta_schedule/apply_history_best.h>
 #include <tvm/meta_schedule/arg_info.h>
 #include <tvm/meta_schedule/builder.h>
 #include <tvm/meta_schedule/cost_model.h>
diff --git a/src/relay/backend/task_extraction.cc b/src/relay/backend/task_extraction.cc
index 4f83b6eeed60..213841c621de 100644
--- a/src/relay/backend/task_extraction.cc
+++ b/src/relay/backend/task_extraction.cc
@@ -16,8 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
-#include <tvm/meta_schedule/apply_history_best.h>
 #include <tvm/meta_schedule/extracted_task.h>
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
@@ -32,13 +30,10 @@ namespace tvm {
 namespace relay {
 namespace backend {
 
-Array<meta_schedule::ExtractedTask> ExtractTask(
-    IRModule mod, Target target, Map<String, runtime::NDArray> params,
-    meta_schedule::ApplyHistoryBestNode::FTEFilterFunc filter_func) {
+Array<meta_schedule::ExtractedTask> ExtractTask(IRModule mod, Target target,
+                                                Map<String, runtime::NDArray> params) {
   using meta_schedule::ExtractedTask;
-  if (filter_func == nullptr) {
-    filter_func = tvm::meta_schedule::DefaultTaskFilter;
-  }
+  backend::FTECompilerTIRConverter tir_converter = backend::GetTIRConverter();
   backend::BindParamsInModule(mod, params);
   // is_vm=true for backward compatibility
   Array<Pass> pass_seqs = relay::backend::GetPassPrefix(/*is_homogenous=*/true, /*is_vm=*/true);
@@ -48,7 +43,7 @@ Array<meta_schedule::ExtractedTask> ExtractTask(
 
   std::vector<ExtractedTask> tasks;
   std::unordered_map<tec::CCacheKey, ExtractedTask> cache;
-  PostOrderVisit(mod->Lookup("main"), [&target, &tasks, &cache, &filter_func](const Expr& exp) {
+  PostOrderVisit(mod->Lookup("main"), [&target, &tasks, &cache, &tir_converter](const Expr& exp) {
     if (exp->IsInstance<FunctionNode>()) {
       Function relay_func = Downcast<Function>(exp);
       if (!relay_func->HasNonzeroAttr(attr::kPrimitive)) {
@@ -62,13 +57,11 @@ Array<meta_schedule::ExtractedTask> ExtractTask(
       }
       auto [inputs_outputs, constants, fused_name] =
           tec::LowerTECompute(relay_func, target, /*return_inputs=*/true);
-      if (Optional<tir::PrimFunc> prim_func = filter_func(inputs_outputs, constants)) {
-        GlobalVar prim_fn_var(fused_name);
-        IRModule relay_mod({{prim_fn_var, relay_func}});
-        IRModule tir_mod({{prim_fn_var, prim_func.value()}});
-        ExtractedTask extracted_task(fused_name, relay_mod, target, {tir_mod}, 1);
-        tasks.push_back(extracted_task);
-        cache.emplace(cache_key, extracted_task);
+      if (Optional<tir::PrimFunc> f = tir_converter(inputs_outputs, constants)) {
+        IRModule relay_mod({{GlobalVar(fused_name), relay_func}});
+        ExtractedTask task(fused_name, relay_mod, target, {PrimFuncToIRModule(f.value())}, 1);
+        tasks.push_back(task);
+        cache.emplace(cache_key, task);
       }
     }
   });
diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc
index 5c79ed2070cc..8fa8610c0fca 100644
--- a/src/relay/backend/te_compiler.cc
+++ b/src/relay/backend/te_compiler.cc
@@ -548,6 +548,7 @@ TECompiler& TECompiler::Global() {
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_auto_scheduler", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_meta_schedule", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_meta_schedule_dispatch", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.tir_converter", String);
 
 TVM_REGISTER_GLOBAL("relay.backend._TECompilerGlobal").set_body_typed([]() {
   return TECompiler::Global();
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index 92cc6f8cfa46..0e2a3e270257 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -21,7 +21,7 @@
 
 #include <tvm/driver/driver_api.h>
 #include <tvm/ir/type_functor.h>
-#include <tvm/meta_schedule/apply_history_best.h>
+#include <tvm/meta_schedule/database.h>
 #include <tvm/relay/analysis.h>
 #include <tvm/relay/attrs/device_copy.h>
 #include <tvm/relay/expr.h>
@@ -37,6 +37,7 @@
 #include <tvm/te/schedule_pass.h>
 #include <tvm/tir/function.h>
 #include <tvm/tir/index_map.h>
+#include <tvm/tir/schedule/schedule.h>
 #include <tvm/tir/transform.h>
 #include <tvm/topi/tags.h>
 
@@ -61,16 +62,6 @@ TVM_REGISTER_NODE_TYPE(CachedFuncNode);
 TVM_REGISTER_NODE_TYPE(CCacheKeyNode);
 TVM_REGISTER_NODE_TYPE(CCacheValueNode);
 
-void ExtractTransformLayout(const meta_schedule::TuningRecord& record) {
-  static tir::InstructionKind kind_transform_layout = tir::InstructionKind::Get("TransformLayout");
-  for (const tir::Instruction& inst : record->trace->insts) {
-    if (inst->kind.same_as(kind_transform_layout)) {
-      ICHECK_EQ(inst->attrs.size(), 3);
-      relay::MetaScheduleLayoutRewriter::LayoutQueuePush(Downcast<tir::IndexMap>(inst->attrs[2]));
-    }
-  }
-}
-
 LoweredOutput::LoweredOutput(tvm::Array<te::Tensor> outputs, OpImplementation impl) {
   auto n = make_object<LoweredOutputNode>();
   n->outputs = std::move(outputs);
@@ -317,11 +308,11 @@ class ScheduleBuilder : public ExprVisitor {
     // Whether to use auto_scheduler schedule.
     use_auto_scheduler_ = backend::IsAutoSchedulerEnabled();
     if (backend::IsMetaScheduleEnabled()) {
-      meta_schedule_ctx_ = meta_schedule::ApplyHistoryBest::Current();
-      CHECK(meta_schedule_ctx_.defined()) << "ValueError: `use_meta_schedule` is enabled in Relay "
-                                             "build, but no ApplyHistoryBest context is provided. ";
+      database_ = meta_schedule::Database::Current();
+      CHECK(database_.defined()) << "ValueError: `use_meta_schedule` is enabled in Relay "
+                                    "build, but no `meta_schedule.Database` context is provided. ";
     } else {
-      meta_schedule_ctx_ = NullOpt;
+      database_ = NullOpt;
     }
   }
 
@@ -359,32 +350,43 @@ class ScheduleBuilder : public ExprVisitor {
           schedule = Downcast<te::Schedule>(obj);
         }
       }
-      if (meta_schedule_ctx_) {
+      if (database_) {
+        using tvm::meta_schedule::TuningRecord;
+        using tvm::tir::IndexMap;
+        using tvm::tir::Instruction;
+        using tvm::tir::InstructionKind;
+        using tvm::tir::PrimFunc;
+        using tvm::tir::Schedule;
+        backend::FTECompilerTIRConverter tir_converter = backend::GetTIRConverter();
         Array<te::Tensor> te_args = Concat(fn_inputs, tensor_outs);
         Array<runtime::NDArray> constants;
         for (auto [const_node, te_tensor] : lower_te_compute.constant_tensors_) {
           te_args.push_back(te_tensor);
           constants.push_back(const_node->data);
         }
-
-        if (Optional<tir::PrimFunc> tir_func =
-                meta_schedule_ctx_.value()->te_filter_func(te_args, constants)) {
-          IRModule relay_mod({{prim_fn_var, relay_func}});
-          IRModule tir_mod({{prim_fn_var, tir_func.value()}});
-          if (Optional<IRModule> opt_scheduled_mod = meta_schedule_ctx_.value()->Query(
-                  /*task_name=*/prim_fn_var->name_hint,     //
-                  /*mod=*/relay_mod,                        //
-                  /*target=*/target_,                       //
-                  /*dispatched=*/Array<IRModule>{tir_mod},  //
-                  /*f_take_tuning_record=*/ExtractTransformLayout)) {
-            IRModule scheduled_mod =
-                tir::transform::RemoveWeightLayoutRewriteBlock()(opt_scheduled_mod.value());
-            ICHECK_EQ(scheduled_mod->functions.count(prim_fn_var), 1);
-            prim_func = Downcast<tir::PrimFunc>(scheduled_mod->functions[prim_fn_var]);
+        if (Optional<PrimFunc> f = tir_converter(te_args, constants)) {
+          if (Optional<TuningRecord> opt_record = database_.value()->QueryTuningRecord(
+                  /*mod=*/backend::PrimFuncToIRModule(f.value()),
+                  /*target=*/target_)) {
+            static InstructionKind kind_transform_layout = InstructionKind::Get("TransformLayout");
+            TuningRecord record = opt_record.value();
+            for (const Instruction& inst : record->trace->insts) {
+              if (inst->kind.same_as(kind_transform_layout)) {
+                ICHECK_EQ(inst->attrs.size(), 3);
+                MetaScheduleLayoutRewriter::LayoutQueuePush(Downcast<IndexMap>(inst->attrs[2]));
+              }
+            }
+            Schedule sch = Schedule::Traced(record->workload->mod, /*seed=*/-1, /*debug_mask=*/0,
+                                            tir::ScheduleErrorRenderLevel::kDetail);
+            record->trace->ApplyToSchedule(sch, /*remove_postproc=*/false);
+            IRModule mod = sch->mod();
+            ICHECK_EQ(mod->functions.size(), 1);
+            mod = tir::transform::RemoveWeightLayoutRewriteBlock()(std::move(mod));
+            prim_func = Downcast<PrimFunc>(mod->Lookup("main"));
           }
         }
       }
-      // Use TOPI schedule if user specificed, or the function has no auto_scheduler schedule.
+      // Use TOPI schedule if user specified, or the function has no auto_scheduler schedule.
       if (!schedule.defined() && !prim_func.defined()) {
         if (anchor_op_.defined()) {
           auto anchor_impl = lower_te_compute.op_implementations_.find(anchor_op_.operator->());
@@ -422,7 +424,7 @@ class ScheduleBuilder : public ExprVisitor {
     }
 
     int op_pattern = fpattern[op];
-    if (!use_auto_scheduler_ && !meta_schedule_ctx_.defined() && op_pattern >= kCommReduce) {
+    if (!use_auto_scheduler_ && !database_.defined() && op_pattern >= kCommReduce) {
       ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce)
           << "Cannot apply TOPI schedule to a primitive function with two complicated ops"
           << " anchor=" << anchor_op_ << " current=" << op;
@@ -440,7 +442,7 @@ class ScheduleBuilder : public ExprVisitor {
   Attrs anchor_attrs_;
   int anchor_op_pattern_{0};
   bool use_auto_scheduler_;
-  Optional<meta_schedule::ApplyHistoryBest> meta_schedule_ctx_;
+  Optional<meta_schedule::Database> database_;
 };
 
 /*!
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc
index 340986770e93..5cf7a5563d19 100644
--- a/src/relay/backend/utils.cc
+++ b/src/relay/backend/utils.cc
@@ -28,6 +28,9 @@
 #include <tvm/parser/parser.h>
 #include <tvm/relay/qnn/transform.h>
 #include <tvm/runtime/ndarray.h>
+#include <tvm/tir/stmt_functor.h>
+
+#include "../../te/operation/create_primfunc.h"
 
 namespace tvm {
 namespace relay {
@@ -368,6 +371,76 @@ void BindParamsInModule(IRModule mod, Map<String, runtime::NDArray> params) {
   BindParamsInModule(mod, params_tmp);
 }
 
+/*!
+ * \brief A default TE compute to TIR compute.
+ * \param args The inputs/outputs of the TE compute graph.
+ * \param constants The constants bound to TIR
+ * \param allow_extern_op Whether to allow extern operation in TE.
+ * \return The TIR converted; NullOpt if not supported (dynamic shape)
+ */
+Optional<tir::PrimFunc> DefaultTIRConverterImpl(const Array<te::Tensor>& args,
+                                                const Array<runtime::NDArray>& constants,
+                                                bool allow_extern_op) {
+  using namespace ::tvm::te;
+  std::vector<Tensor> stack;
+  std::unordered_set<const TensorNode*> visited;
+  for (const Tensor& v : args) {
+    for (const PrimExpr& e : v->shape) {
+      // Dynamic shape is not supported for now
+      if (!e->IsInstance<IntImmNode>()) {
+        return NullOpt;
+      }
+    }
+    if (!visited.count(v.get())) {
+      visited.insert(v.get());
+      stack.push_back(v);
+    }
+  }
+  while (!stack.empty()) {
+    Tensor tensor = stack.back();
+    stack.pop_back();
+    if (tensor->op->IsInstance<PlaceholderOpNode>()) {
+      // do nothing
+    } else if (tensor->op->IsInstance<ComputeOpNode>() ||
+               (allow_extern_op && tensor->op->IsInstance<ExternOpNode>())) {
+      Array<Tensor> inputs = tensor->op->InputTensors();
+      for (const Tensor& v : inputs) {
+        if (!visited.count(v.get())) {
+          visited.insert(v.get());
+          stack.push_back(v);
+        }
+      }
+    } else {
+      return NullOpt;
+    }
+  }
+  PrimFunc func = te::CreatePrimFuncWithConstants(args, constants);
+  bool dynamic_loop_extent = false;
+  tir::PostOrderVisit(func->body, [&dynamic_loop_extent](const ObjectRef& obj) -> void {
+    if (const auto* loop = obj.as<tir::ForNode>()) {
+      if (!loop->extent->IsInstance<IntImmNode>()) {
+        dynamic_loop_extent = true;
+      }
+    }
+  });
+  if (dynamic_loop_extent) {
+    return NullOpt;
+  }
+  return func;
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.tir_converter.default")
+    .set_body_typed([](const Array<te::Tensor>& args,
+                       const Array<runtime::NDArray>& constants) -> Optional<tir::PrimFunc> {
+      return DefaultTIRConverterImpl(args, constants, false);
+    });
+
+TVM_REGISTER_GLOBAL("relay.backend.tir_converter.allow_extern")
+    .set_body_typed([](const Array<te::Tensor>& args,
+                       const Array<runtime::NDArray>& constants) -> Optional<tir::PrimFunc> {
+      return DefaultTIRConverterImpl(args, constants, true);
+    });
+
 }  // namespace backend
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h
index 57c066131181..37ae9d803a35 100644
--- a/src/relay/backend/utils.h
+++ b/src/relay/backend/utils.h
@@ -558,6 +558,37 @@ inline bool IsMetaScheduleEnabled() {
       .value();
 }
 
+/*!
+ * \brief Method in TECompiler to convert TE compute to scheduleable TIR
+ * \param args The arguments of the TE compute
+ * \param constants The constants used in AllocateConst
+ * \return NullOpt if conversion fails; Otherwise the converted TIR
+ * \note This method could be further used as a task filtering mechanism in task extraction
+ */
+using FTECompilerTIRConverter = runtime::TypedPackedFunc<  //
+    Optional<tir::PrimFunc>(                               //
+        const Array<te::Tensor>& args,                     //
+        const Array<runtime::NDArray>& constants)>;
+
+/*! \brief Return a task filter for AutoTIR according to `relay.backend.tir_converter` */
+inline FTECompilerTIRConverter GetTIRConverter() {
+  String name = transform::PassContext::Current()
+                    ->GetConfig<String>("relay.backend.tir_converter", "default")
+                    .value();
+  const PackedFunc* f = runtime::Registry::Get("relay.backend.tir_converter." + name);
+  ICHECK(f != nullptr) << "IndexError: Cannot find TIR converter: " << name;
+  return FTECompilerTIRConverter(*f);
+}
+
+/*! \brief Converts a PrimFunc to IRModule. */
+inline IRModule PrimFuncToIRModule(tir::PrimFunc f) {
+  f = WithAttrs(f, Map<String, ObjectRef>{
+                       {tvm::attr::kGlobalSymbol, String("main")},
+                       {tvm::tir::attr::kNoAlias, Bool(1)},
+                   });
+  return IRModule({{GlobalVar("main"), f}});
+}
+
 /*!
  * \brief Get the sequence of Relay optimization passes based on backend type.
  * The prefix of the Relay passes almost overlaps between the vm and graph backend, with some slight
diff --git a/tests/python/integration/test_meta_schedule_auto_tensorize.py b/tests/python/integration/test_meta_schedule_auto_tensorize.py
index 3397eaabbef2..7227ef0c7b79 100644
--- a/tests/python/integration/test_meta_schedule_auto_tensorize.py
+++ b/tests/python/integration/test_meta_schedule_auto_tensorize.py
@@ -19,13 +19,12 @@
 
 import numpy as np
 import pytest
-
 import tvm
 import tvm.testing
 import tvm.topi.testing
 from tvm import meta_schedule as ms
 from tvm import relay
-from tvm.meta_schedule import ApplyHistoryBest, postproc, schedule_rule
+from tvm.meta_schedule import postproc, schedule_rule
 from tvm.meta_schedule.relay_integration import extract_task_from_relay
 from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base
 from tvm.meta_schedule.tune import tune_extracted_tasks
@@ -176,12 +175,11 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos
             postprocs=lambda: postprocs,
         )
 
-    with ApplyHistoryBest(database):
-        with tvm.transform.PassContext(
-            opt_level=3,
-            config={"relay.backend.use_meta_schedule": True},
-        ):
-            lib = relay.build(relay_mod, target=target, params=params)
+    with database, tvm.transform.PassContext(
+        opt_level=3,
+        config={"relay.backend.use_meta_schedule": True},
+    ):
+        lib = relay.build(relay_mod, target=target, params=params)
 
     if "cascadelake" in target:
         asm = lib.lib.get_source("asm")
@@ -267,12 +265,11 @@ def _test_bert_int8(target, sch_rules, postprocs):
             postprocs=lambda: postprocs,
         )
 
-    with ApplyHistoryBest(database):
-        with tvm.transform.PassContext(
-            opt_level=3,
-            config={"relay.backend.use_meta_schedule": True},
-        ):
-            lib = relay.build(relay_mod, target=target, params=params)
+    with database, tvm.transform.PassContext(
+        opt_level=3,
+        config={"relay.backend.use_meta_schedule": True},
+    ):
+        lib = relay.build(relay_mod, target=target, params=params)
 
     dev = tvm.device("cuda" if "nvidia" in target else target, 0)
     runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py
index 8e299dc935d5..c741ecb59ae0 100644
--- a/tests/python/unittest/test_link_params.py
+++ b/tests/python/unittest/test_link_params.py
@@ -19,20 +19,18 @@
 import json
 import os
 import re
-from io import StringIO
 from contextlib import redirect_stderr
+from io import StringIO
 
 import numpy as np
-
 import tvm
 import tvm.relay
 import tvm.testing
 from tvm import meta_schedule as ms
 from tvm import relay
-from tvm.relay.backend import Executor, Runtime
 from tvm.contrib import utils
 from tvm.meta_schedule.testing.utils import apply_fixed_schedules
-
+from tvm.relay.backend import Executor, Runtime
 
 INPUT_SHAPE = (1, 3, 16, 16)
 
@@ -421,13 +419,12 @@ def schedule_fn(task, sch):
         database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)
 
     with StringIO() as stderr_buf, redirect_stderr(stderr_buf):
-        with ms.ApplyHistoryBest(database):
-            with tvm.transform.PassContext(
-                opt_level=3,
-                config={"relay.backend.use_meta_schedule": True},
-            ):
-                executor = Executor("graph", {"link-params": link_params})
-                lib = relay.build(relay_mod, target=target, executor=executor)
+        with database, tvm.transform.PassContext(
+            opt_level=3,
+            config={"relay.backend.use_meta_schedule": True},
+        ):
+            executor = Executor("graph", {"link-params": link_params})
+            lib = relay.build(relay_mod, target=target, executor=executor)
 
         # Workload look up should succeed. This does not work when the test is invoked from pytest.
         assert not "Cannot find workload" in stderr_buf.getvalue()
diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py
index afce19a590e3..69522831ee55 100644
--- a/tests/python/unittest/test_meta_schedule_integration.py
+++ b/tests/python/unittest/test_meta_schedule_integration.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """Integration test for MetaSchedule"""
-from typing import Optional
 import numpy as np
 import pytest
 import tvm
@@ -23,11 +22,10 @@
 from tvm import IRModule
 from tvm import meta_schedule as ms
 from tvm import relay, te, tir
+from tvm._ffi import register_func
 from tvm.meta_schedule.testing.relay_workload import get_network
 from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base
 from tvm.script import tir as T
-from tvm.target import Target
-from tvm.tir import Schedule
 
 # pylint: disable=no-member,line-too-long,too-many-nested-blocks,unbalanced-tuple-unpacking,no-self-argument,missing-docstring,invalid-name
 
@@ -58,10 +56,6 @@ def _has_torch():
 requires_torch = pytest.mark.skipif(not _has_torch(), reason="torch is not installed")
 
 
-def test_meta_schedule_apply_history_best_no_current():
-    assert ms.ApplyHistoryBest.current() is None
-
-
 def test_meta_schedule_dynamic_loop_extent():
     a = relay.var("a", shape=(1, 8, 8, 512), dtype="float32")
     b = relay.nn.adaptive_avg_pool2d(a, (7, 7), "NHWC")
@@ -125,7 +119,7 @@ def test_meta_schedule_integration_extract_from_bert_base():
             12,
             [[64, 768], [3072, 768], [64, 3072]],
         ),
-        "fused_subtract_add_sqrt_divide_multiply_add": (
+        "fused_subtract_add_rsqrt_multiply_multiply_add": (
             25,
             [[1, 64, 768], [1, 64, 1], [1, 64, 1], [768], [768], [1, 64, 768]],
         ),
@@ -206,7 +200,8 @@ def test_meta_schedule_integration_extract_from_bert_base():
 
 @requires_torch
 def test_meta_schedule_integration_extract_from_resnet_with_filter_func():
-    def filter_func(args) -> bool:
+    @register_func("relay.backend.tir_converter.remove_purely_spatial", override=True)
+    def filter_func(args, _) -> bool:
         from tvm.te import create_prim_func  # pylint: disable=import-outside-toplevel
 
         has_complex_op = False
@@ -236,7 +231,7 @@ def traverse(t):
         mod,
         target="llvm",
         params=params,
-        te_filter_func=filter_func,
+        tir_converter="remove_purely_spatial",
     )
     expected_task_names = [
         "fused_" + s
@@ -267,53 +262,6 @@ def traverse(t):
         assert t.task_name in expected_task_names, t.task_name
 
 
-@requires_torch
-def test_meta_schedule_integration_apply_history_best():
-    mod, _, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
-    database = ms.database.MemoryDatabase()
-    env = ms.ApplyHistoryBest(database)
-    target = Target("llvm")
-    workload = database.commit_workload(MockModule)
-    database.commit_tuning_record(
-        ms.database.TuningRecord(
-            trace=Schedule(MockModule).trace,
-            workload=workload,
-            run_secs=[1.0],
-            target=target,
-            args_info=[],
-        )
-    )
-    mod = env.query(
-        task_name="mock-task",
-        mod=mod,
-        target=target,
-        dispatched=[MockModule],
-    )
-    assert tvm.ir.structural_equal(mod, workload.mod)
-
-
-@requires_torch
-def test_meta_schedule_integration_apply_history_best_direct_dispatch():
-    def direct_dispatch(mod: IRModule) -> Optional[IRModule]:
-        if tvm.ir.structural_equal(mod, MockModule):
-            return MockModule
-        return None
-
-    mod, _, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
-    database = ms.database.MemoryDatabase()
-    env = ms.ApplyHistoryBest(database)
-    target = Target("llvm")
-    workload = database.commit_workload(MockModule)
-    mod = env.query(
-        task_name="mock-task-direct-dispatch",
-        mod=mod,
-        target=target,
-        dispatched=[MockModule],
-        f_direct_dispatch=direct_dispatch,
-    )
-    assert tvm.ir.structural_equal(mod, workload.mod)
-
-
 @pytest.mark.skip("Too slow on CI")
 def extract_task_qbert():
     mod, params, _ = load_quantized_bert_base(batch_size=1, seq_len=128)
diff --git a/tests/python/unittest/test_meta_schedule_multi_anchor.py b/tests/python/unittest/test_meta_schedule_multi_anchor.py
index b7d012ca04d6..177001781179 100644
--- a/tests/python/unittest/test_meta_schedule_multi_anchor.py
+++ b/tests/python/unittest/test_meta_schedule_multi_anchor.py
@@ -70,7 +70,7 @@ def schedule_fn(task, sch):
         return False
 
     database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)
-    with ms.ApplyHistoryBest(database):
+    with database:
         with tvm.transform.PassContext(
             opt_level=3,
             config={"relay.backend.use_meta_schedule": True},
diff --git a/tests/python/unittest/test_meta_schedule_relay_tir_compute.py b/tests/python/unittest/test_meta_schedule_relay_tir_compute.py
index 058012cb643a..939851a65731 100644
--- a/tests/python/unittest/test_meta_schedule_relay_tir_compute.py
+++ b/tests/python/unittest/test_meta_schedule_relay_tir_compute.py
@@ -19,7 +19,6 @@
 import tvm.testing
 import tvm.topi.testing
 from tvm import autotvm, relay, te
-from tvm.meta_schedule import ApplyHistoryBest
 from tvm.meta_schedule.testing.utils import apply_fixed_schedules
 from tvm.relay.testing.temp_op_attr import TempOpAttr
 from tvm.script import tir as T
@@ -152,17 +151,16 @@ def schedule_fn(task, sch):
             target,
             params,
             schedule_fn,
-            te_filter_func="meta_schedule.DefaultTaskFilterAllowExtern",
+            tir_converter="allow_extern",
         )
-        with ApplyHistoryBest(
-            database,
-            te_filter_func="meta_schedule.DefaultTaskFilterAllowExtern",
+        with database, tvm.transform.PassContext(
+            opt_level=3,
+            config={
+                "relay.backend.use_meta_schedule": True,
+                "relay.backend.tir_converter": "allow_extern",
+            },
         ):
-            with tvm.transform.PassContext(
-                opt_level=3,
-                config={"relay.backend.use_meta_schedule": True},
-            ):
-                lib = relay.build(relay_mod, target=target, params=params)
+            lib = relay.build(relay_mod, target=target, params=params)
 
     dev = tvm.device(target, 0)
 
diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py
index 7d85b8757ae2..bc37fed7d691 100644
--- a/tests/python/unittest/test_meta_schedule_tune_relay.py
+++ b/tests/python/unittest/test_meta_schedule_tune_relay.py
@@ -245,12 +245,11 @@ def print_results(self) -> None:
     database.commit_workload(tvmgen_default_fused_layout_transform_1)
     database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc)
 
-    with ms.ApplyHistoryBest(database):
-        with tvm.transform.PassContext(
-            opt_level=3,
-            config={"relay.backend.use_meta_schedule": True},
-        ):
-            rt_mod1 = relay.build(mod, target=target, params=params)
+    with database, tvm.transform.PassContext(
+        opt_level=3,
+        config={"relay.backend.use_meta_schedule": True},
+    ):
+        rt_mod1 = relay.build(mod, target=target, params=params)
 
     # Compile without meta-schedule for correctness check
     with tvm.transform.PassContext(opt_level=0):
@@ -307,12 +306,11 @@ def test_meta_schedule_relay_lowering():
                 args_info=[],
             )
         )
-        with ms.ApplyHistoryBest(database):
-            with tvm.transform.PassContext(
-                opt_level=3,
-                config={"relay.backend.use_meta_schedule": True},
-            ):
-                rt_mod1 = relay.build(mod, target=target, params=params)
+        with database, tvm.transform.PassContext(
+            opt_level=3,
+            config={"relay.backend.use_meta_schedule": True},
+        ):
+            rt_mod1 = relay.build(mod, target=target, params=params)
 
         # Compile without meta-schedule for correctness check
         with tvm.transform.PassContext(opt_level=0):
@@ -472,24 +470,23 @@ def schedule_fn(task, sch):
 
         database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)
 
-    with ms.ApplyHistoryBest(database):
-        with tvm.transform.PassContext(
-            opt_level=3,
-            config={"relay.backend.use_meta_schedule": True},
-        ):
-            # pylint: disable=W0105
-            """
-            The log should say
-            Warning: Cannot find workload: tvmgen_default_fused_expand_dims
-            Warning: Cannot find workload: tvmgen_default_fused_cast
-            Warning: Cannot find workload: tvmgen_default_fused_cast_1
-            Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul
-
-            This means batch matmul and others are scheduled by TE, and dense (the one not warned)
-            is found in the meta schedule tuning database during ApplyHistoryBest
-            """
-            # pylint: enable=W0105
-            lib = relay.build(relay_mod, target=target, params=params)
+    with database, tvm.transform.PassContext(
+        opt_level=3,
+        config={"relay.backend.use_meta_schedule": True},
+    ):
+        # pylint: disable=W0105
+        """
+        The log should say
+        Warning: Cannot find workload: tvmgen_default_fused_expand_dims
+        Warning: Cannot find workload: tvmgen_default_fused_cast
+        Warning: Cannot find workload: tvmgen_default_fused_cast_1
+        Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul
+
+        This means batch matmul and others are scheduled by TE, and dense (the one not warned)
+        is found in the meta schedule tuning database during compilation
+        """
+        # pylint: enable=W0105
+        lib = relay.build(relay_mod, target=target, params=params)
 
     runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
 

From 534412896e6d39ee4f830d63370d02e8e5f09050 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Sat, 27 Aug 2022 11:14:58 -0700
Subject: [PATCH 062/704] [TIR] Expose MMA-related PTX builtins (#12623)

Expose MMA-related PTX builtins

This PR exposes the following TIR operation in python:

`ptx_mma`: tested
`ptx_mma_sp`: tested
`mma_store`: add new unittest
`mma_fill`: add new unittest

Co-authored-by: yongwww <yongcale@gmail.com>

Co-authored-by: yongwww <yongcale@gmail.com>
---
 python/tvm/tir/__init__.py                 |   1 +
 python/tvm/tir/op.py                       | 287 +++++++++++++++++++++
 tests/python/unittest/test_tir_op_types.py |  75 ++++++
 3 files changed, 363 insertions(+)

diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 4a6f32d03a2b..8e637d2d6564 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -59,6 +59,7 @@
     tvm_bmma_sync,
     tvm_fill_fragment,
 )
+from .op import ptx_mma, ptx_mma_sp, mma_store, mma_fill
 from .op import ptx_ldmatrix, ptx_cp_async, ptx_commit_group, ptx_wait_group
 from .op import vectorlow, vectorhigh, vectorcombine
 from .op import infinity, reinterpret
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index e510f68a68a1..1fd3050c0a7f 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -831,6 +831,293 @@ def tvm_store_matrix_sync(fragment, m, n, k, index, buffer_ptr, stride, layout):
     )
 
 
+def ptx_mma(
+    dtype,
+    shape,
+    A_layout,
+    B_layout,
+    A_dtype,
+    B_dtype,
+    C_dtype,
+    multiplicand_a,
+    a_index,
+    multiplicand_b,
+    b_index,
+    accumulator,
+    c_index,
+    saturate,
+    operator=None,
+):
+    """TVM intrinsic for ptx tensor core mma instructions
+    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-mma
+
+    Parameters
+    ----------
+    dtype : str
+        The data type of the result.
+
+    shape : str
+        The shape of mma fragment.
+
+    A_layout : Literal["row", "col"]
+        The layout of multiplicand fragment A.
+
+    B_layout : Literal["row", "col"]
+        The layout of multiplicand fragment B.
+
+    A_dtype : str
+        The data type of multiplicand fragment A.
+
+    B_dtype : str
+        The data type of multiplicand fragment B.
+
+    C_dtype : str
+        The data type of accumulator fragment C.
+
+    multiplicand_a : Var
+        The multiplicand fragment A variable.
+
+    a_index : Expr
+        The index of multiplicand fragment A.
+
+    multiplicand_b : Var
+        The multiplicand fragment B variable.
+
+    b_index : Expr
+        The index of multiplicand fragment A.
+
+    accumulator : Var
+        The accumulator fragment C variable.
+
+    c_index : Expr
+        The index of accumulator fragment C.
+
+    saturate : bool
+        The optional saturation at the output.
+
+
+    operator : Optional[Literal["xor", "and"]]
+        The 1-bit operator.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    if operator is None:
+        return call_intrin(
+            dtype,
+            "tir.ptx_mma",
+            shape,
+            A_layout,
+            B_layout,
+            A_dtype,
+            B_dtype,
+            C_dtype,
+            multiplicand_a,
+            a_index,
+            multiplicand_b,
+            b_index,
+            accumulator,
+            c_index,
+            saturate,
+        )
+    return call_intrin(
+        dtype,
+        "tir.ptx_mma",
+        shape,
+        A_layout,
+        B_layout,
+        A_dtype,
+        B_dtype,
+        C_dtype,
+        multiplicand_a,
+        a_index,
+        multiplicand_b,
+        b_index,
+        accumulator,
+        c_index,
+        saturate,
+        operator,
+    )
+
+
+def ptx_mma_sp(
+    dtype,
+    shape,
+    A_layout,
+    B_layout,
+    A_dtype,
+    B_dtype,
+    C_dtype,
+    multiplicand_a,
+    a_index,
+    multiplicand_b,
+    b_index,
+    accumulator,
+    c_index,
+    metadata,
+    meta_index,
+    sparse_selector,
+    saturate,
+):
+    """TVM intrinsic for sparse tensor core ptx instructions
+    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma
+
+    Parameters
+    ----------
+    dtype : str
+        The data type of the result.
+
+    shape : str
+        The shape of mma fragment.
+
+    A_layout : Literal["row", "col"]
+        The layout of multiplicand fragment A.
+
+    B_layout : Literal["row", "col"]
+        The layout of multiplicand fragment B.
+
+    A_dtype : str
+        The data type of multiplicand fragment A.
+
+    B_dtype : str
+        The data type of multiplicand fragment B.
+
+    C_dtype : str
+        The data type of multiplicand fragment C.
+
+    multiplicand_a : Var
+        The multiplicand fragment A variable.
+
+    a_index : Expr
+        The index of multiplicand fragment A.
+
+    multiplicand_b : Var
+        The multiplicand fragment B variable.
+
+    b_index : Expr
+        The index of multiplicand fragment B.
+
+    accumulator : Var
+        The accumulator fragment C variable.
+
+    c_index : Expr
+        The index of accumulator fragment C.
+
+    metadata : Expr
+        The metadata of operand.
+
+    meta_index : Expr
+        The metadata index of operand.
+
+    sparse_selector : Expr
+        The sparse selector indicating the thread that stores the metadata.
+
+    saturate : bool
+        The optional saturation at the output.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(
+        dtype,
+        "tir.ptx_mma_sp",
+        shape,
+        A_layout,
+        B_layout,
+        A_dtype,
+        B_dtype,
+        C_dtype,
+        multiplicand_a,
+        a_index,
+        multiplicand_b,
+        b_index,
+        accumulator,
+        c_index,
+        metadata,
+        meta_index,
+        sparse_selector,
+        saturate,
+    )
+
+
+def mma_store(dtype, m, n, dst_ptr, src_ptr, src_offset, dst_stride):
+    """TVM intrinsic for storing the result of PTX MMA into a destination pointer
+
+    Parameters
+    ----------
+    dtype : str
+        The data type of the result.
+
+    m : IntImm
+        The shape of mma fragment.
+
+    n : IntImm
+        The shape of mma fragment.
+
+    dst_ptr : Var
+        The destination pointer variable.
+
+    src_ptr : Var
+        The source pointer variable.
+
+    src_offset : Expr
+        The source offset.
+
+    dst_stride : Var
+        The destination stride.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(
+        dtype,
+        "tir.mma_store",
+        m,
+        n,
+        dst_ptr,
+        src_ptr,
+        src_offset,
+        dst_stride,
+    )
+
+
+def mma_fill(dtype, local_size, local_ptr, offset):
+    """TVM intrinsic for zero-initalizing an MMA accumulation registor
+
+    Parameters
+    ----------
+    dtype : str
+        The data type of the result.
+
+    local_size : IntImm
+        The number of elements.
+
+    local_ptr : Var
+        The destination pointer variable.
+
+    offset : Expr
+        The destination offset.
+
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin(
+        dtype,
+        "tir.mma_fill",
+        local_size,
+        local_ptr,
+        offset,
+    )
+
+
 def ptx_ldmatrix(dtype, trans, num, type, local_ptr, local_offset, smem_ptr, smem_offset):
     """TVM intrinsic for ptx load matrix from shared memory
     https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
diff --git a/tests/python/unittest/test_tir_op_types.py b/tests/python/unittest/test_tir_op_types.py
index f8e8de074c42..23a264bef75a 100644
--- a/tests/python/unittest/test_tir_op_types.py
+++ b/tests/python/unittest/test_tir_op_types.py
@@ -143,6 +143,81 @@ def test_tir_op_tvm_fill_fragment():
     assert expr.op.name == "tir.tvm_fill_fragment"
 
 
+def test_tir_op_ptx_mma():
+    buffer_a = tir.decl_buffer([32], "int4", scope="local")
+    buffer_b = tir.decl_buffer([16], "uint4", scope="local")
+    buffer_c = tir.decl_buffer([4], "int32", scope="local")
+    expr = tir.ptx_mma(
+        "int32",
+        "m8n8k32",
+        "row",
+        "col",
+        "int4",
+        "uint4",
+        "int32",
+        buffer_a.data,
+        0,
+        buffer_b.data,
+        0,
+        buffer_c.data,
+        0,
+        False,
+    )
+    assert expr.op.name == "tir.ptx_mma"
+
+
+def test_tir_op_ptx_mma_sp():
+    buffer_a = tir.decl_buffer([32], "int4", scope="local")
+    buffer_b = tir.decl_buffer([16], "uint4", scope="local")
+    buffer_c = tir.decl_buffer([4], "int32", scope="local")
+    buffer_d = tir.decl_buffer([1], "uint32", scope="local")
+    expr = tir.ptx_mma_sp(
+        "int32",
+        "m8n8k32",
+        "row",
+        "col",
+        "int4",
+        "uint4",
+        "int32",
+        buffer_a.data,
+        0,
+        buffer_b.data,
+        0,
+        buffer_c.data,
+        0,
+        buffer_d.data,
+        0,
+        0,
+        False,
+    )
+    assert expr.op.name == "tir.ptx_mma_sp"
+
+
+def test_tir_op_mma_store():
+    x = tir.Var("x", dtype="int32")
+    y = tir.Var("y", dtype="int32")
+    buffer_w = tir.decl_buffer([16, 8], dtype="int32", scope="warp", offset_factor=1)
+    buffer = tir.decl_buffer(
+        [16, 16], dtype="int32", scope="global", offset_factor=1, strides=[x, y]
+    )
+    expr = tir.mma_store(
+        "int32",
+        16,
+        16,
+        buffer.access_ptr("w"),
+        buffer_w.data,
+        buffer_w.elem_offset,
+        x,
+    )
+    assert expr.op.name == "tir.mma_store"
+
+
+def test_tir_op_mma_fill():
+    buffer_w = tir.decl_buffer([16, 8], dtype="int32", scope="warp", offset_factor=1)
+    expr = tir.mma_fill("int32", 8, buffer_w.data, buffer_w.elem_offset)
+    assert expr.op.name == "tir.mma_fill"
+
+
 def test_op_ptx_ldmatrix():
     buffer_shared = tir.decl_buffer([16, 16], "float16", scope="shared")
     buffer_local = tir.decl_buffer([8], "float16", scope="local")

From 648a29a53a641f1e923220600dce9c9215104879 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Mon, 29 Aug 2022 00:34:11 -0700
Subject: [PATCH 063/704] [MetaSchedule] Introduce `ScheduleFnDatabase`
 (#12626)

Following #12520, this PR introduces `ScheduleFnDatabase`, a mocked
database to allow injecting handcrafted schedules provided by a schedule
function.

The schedule function comes with the following signature:

```python
def schedule_fn(
  sch: tir.Schedule,
) -> bool:
  task_name = sch.mod.attrs["task_name"]
  # ^^^ provides an optional name of the task queried
  ...
```

This mocked database helps incorporate the existing testing utility
`apply_fixed_schedule` more formally into the MetaSchedule-Relay build
pipeline, and allows further extension to Relax with the same interface.

Next as another follow-up, we will introduce ConcatDatabase that allows
mixing multiple databases, including the mocked and ones from JSON
files.
---
 include/tvm/meta_schedule/database.h          |  19 +++-
 python/tvm/meta_schedule/database/__init__.py |   1 +
 python/tvm/meta_schedule/database/database.py |  41 +++++--
 .../database/schedule_fn_database.py          |  38 +++++++
 python/tvm/meta_schedule/testing/utils.py     |  83 --------------
 src/meta_schedule/database/database.cc        |  13 ++-
 src/meta_schedule/database/memory_database.cc |  10 +-
 .../database/schedule_fn_database.cc          | 103 ++++++++++++++++++
 src/relay/backend/te_compiler_cache.cc        |   5 +-
 tests/python/unittest/test_link_params.py     |  15 ++-
 .../test_meta_schedule_multi_anchor.py        |   8 +-
 .../test_meta_schedule_relay_tir_compute.py   |  18 +--
 .../unittest/test_meta_schedule_tune_relay.py |   7 +-
 13 files changed, 226 insertions(+), 135 deletions(-)
 create mode 100644 python/tvm/meta_schedule/database/schedule_fn_database.py
 delete mode 100644 python/tvm/meta_schedule/testing/utils.py
 create mode 100644 src/meta_schedule/database/schedule_fn_database.cc

diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h
index 0e7f45d39332..88db2e227786 100644
--- a/include/tvm/meta_schedule/database.h
+++ b/include/tvm/meta_schedule/database.h
@@ -207,23 +207,29 @@ class DatabaseNode : public runtime::Object {
    * \brief Query the best record of the given workload from the database.
    * \param mod The IRModule to be searched for.
    * \param target The target to be searched for.
+   * \param workload_name The name of the workload to be searched for.
    * \return The best record of the given workload; NullOpt if not found.
    */
-  virtual Optional<TuningRecord> QueryTuningRecord(IRModule mod, Target target);
+  virtual Optional<TuningRecord> QueryTuningRecord(const IRModule& mod, const Target& target,
+                                                   const String& workload_name);
   /*!
    * \brief Query the best schedule of the given workload from the database.
    * \param mod The IRModule to be searched for.
    * \param target The target to be searched for.
+   * \param workload_name The name of the workload to be searched for.
    * \return The schedule in the best schedule of the given workload; NullOpt if not found.
    */
-  virtual Optional<tir::Schedule> QuerySchedule(IRModule mod, Target target);
+  virtual Optional<tir::Schedule> QuerySchedule(const IRModule& mod, const Target& target,
+                                                const String& workload_name);
   /*!
    * \brief Query the best IRModule of the given workload from the database.
    * \param mod The IRModule to be searched for.
    * \param target The target to be searched for.
+   * \param workload_name The name of the workload to be searched for.
    * \return The IRModule in the best IRModule of the given workload; NullOpt if not found.
    */
-  virtual Optional<IRModule> QueryIRModule(IRModule mod, Target target);
+  virtual Optional<IRModule> QueryIRModule(const IRModule& mod, const Target& target,
+                                           const String& workload_name);
 
   static constexpr const char* _type_key = "meta_schedule.Database";
   TVM_DECLARE_BASE_OBJECT_INFO(DatabaseNode, runtime::Object);
@@ -336,6 +342,13 @@ class Database : public runtime::ObjectRef {
  public:
   /*! An in-memory database. */
   TVM_DLL static Database MemoryDatabase();
+  /*!
+   * \brief A database for injecting handcrafted schedule functions.
+   * \param schedule_fn The function to do scheduling, which takes a TIR schedule,
+   * and returns a boolean indicating if the schedule is successful.
+   */
+  TVM_DLL static Database ScheduleFnDatabase(
+      runtime::TypedPackedFunc<bool(tir::Schedule)> schedule_fn);
   /*!
    * \brief Create a default database that uses JSON file for tuning records.
    * \param path_workload The path to the workload table.
diff --git a/python/tvm/meta_schedule/database/__init__.py b/python/tvm/meta_schedule/database/__init__.py
index 2a87eea147d9..7726daf6eb63 100644
--- a/python/tvm/meta_schedule/database/__init__.py
+++ b/python/tvm/meta_schedule/database/__init__.py
@@ -21,3 +21,4 @@
 from .database import Database, PyDatabase, TuningRecord, Workload
 from .json_database import JSONDatabase
 from .memory_database import MemoryDatabase
+from .schedule_fn_database import ScheduleFnDatabase
diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py
index 68283b4554e5..aa509b715132 100644
--- a/python/tvm/meta_schedule/database/database.py
+++ b/python/tvm/meta_schedule/database/database.py
@@ -235,7 +235,12 @@ def __len__(self) -> int:
         """
         return _ffi_api.DatabaseSize(self)  # type: ignore # pylint: disable=no-member
 
-    def query_tuning_record(self, mod: IRModule, target: Target) -> Optional[TuningRecord]:
+    def query_tuning_record(
+        self,
+        mod: IRModule,
+        target: Target,
+        workload_name: str,
+    ) -> Optional[TuningRecord]:
         """Query the best record of the given workload from the database.
 
         Parameters
@@ -244,15 +249,22 @@ def query_tuning_record(self, mod: IRModule, target: Target) -> Optional[TuningR
             The IRModule to be searched for.
         target : Target
             The target to be searched for.
+        workload_name : str
+            The name of the workload to be searched for.
 
         Returns
         -------
         tuning_record : Optional[TuningRecord]
             The best record of the given workload; None if not found.
         """
-        return _ffi_api.DatabaseQueryTuningRecord(self, mod, target)  # type: ignore # pylint: disable=no-member
+        return _ffi_api.DatabaseQueryTuningRecord(self, mod, target, workload_name)  # type: ignore # pylint: disable=no-member
 
-    def query_schedule(self, mod: IRModule, target: Target) -> Optional[Schedule]:
+    def query_schedule(
+        self,
+        mod: IRModule,
+        target: Target,
+        workload_name: str,
+    ) -> Optional[Schedule]:
         """Query the best schedule of the given workload from the database.
 
         Parameters
@@ -261,15 +273,22 @@ def query_schedule(self, mod: IRModule, target: Target) -> Optional[Schedule]:
             The IRModule to be searched for.
         target : Target
             The target to be searched for.
+        workload_name : str
+            The name of the workload to be searched for.
 
         Returns
         -------
         schedule : Optional[Schedule]
             The best schedule of the given workload; None if not found.
         """
-        return _ffi_api.DatabaseQuerySchedule(self, mod, target)  # type: ignore # pylint: disable=no-member
+        return _ffi_api.DatabaseQuerySchedule(self, mod, target, workload_name)  # type: ignore # pylint: disable=no-member
 
-    def query_ir_module(self, mod: IRModule, target: Target) -> Optional[IRModule]:
+    def query_ir_module(
+        self,
+        mod: IRModule,
+        target: Target,
+        workload_name: str,
+    ) -> Optional[IRModule]:
         """Query the best IRModule of the given workload from the database.
 
         Parameters
@@ -278,18 +297,22 @@ def query_ir_module(self, mod: IRModule, target: Target) -> Optional[IRModule]:
             The IRModule to be searched for.
         target : Target
             The target to be searched for.
+        workload_name : str
+            The name of the workload to be searched for.
 
         Returns
         -------
         ir_module : Optional[IRModule]
             The best IRModule of the given workload; None if not found.
         """
-        return _ffi_api.DatabaseQueryIRModule(self, mod, target)  # type: ignore # pylint: disable=no-member
+        return _ffi_api.DatabaseQueryIRModule(self, mod, target, workload_name)  # type: ignore # pylint: disable=no-member
 
     def query(
         self,
         mod: IRModule,
         target: Target,
+        *,
+        workload_name: str = "main",
         kind: Union[
             Literal["schedule"],
             Literal["record"],
@@ -313,11 +336,11 @@ def query(
             The best optimization outcome of the given workload.
         """
         if kind == "schedule":
-            return self.query_schedule(mod, target)
+            return self.query_schedule(mod, target, workload_name)
         if kind == "record":
-            return self.query_tuning_record(mod, target)
+            return self.query_tuning_record(mod, target, workload_name)
         if kind == "ir_module":
-            return self.query_ir_module(mod, target)
+            return self.query_ir_module(mod, target, workload_name)
         raise ValueError(f'Unknown kind: {kind}. Candidates are: "schedule", "record", "ir_module"')
 
     def __enter__(self) -> "Database":
diff --git a/python/tvm/meta_schedule/database/schedule_fn_database.py b/python/tvm/meta_schedule/database/schedule_fn_database.py
new file mode 100644
index 000000000000..2918f05799dc
--- /dev/null
+++ b/python/tvm/meta_schedule/database/schedule_fn_database.py
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""A database for injecting handcrafted schedule functions."""
+from typing import Callable
+
+from tvm._ffi import register_object
+from tvm.tir import Schedule
+
+from .. import _ffi_api
+from .database import Database
+
+
+@register_object("meta_schedule.ScheduleFnDatabase")
+class ScheduleFnDatabase(Database):
+    """A database for injecting handcrafted schedule functions."""
+
+    def __init__(
+        self,
+        schedule_fn: Callable[[Schedule], bool],
+    ) -> None:
+        self.__init_handle_by_constructor__(
+            _ffi_api.DatabaseScheduleFnDatabase,  # type: ignore # pylint: disable=no-member
+            schedule_fn,
+        )
diff --git a/python/tvm/meta_schedule/testing/utils.py b/python/tvm/meta_schedule/testing/utils.py
deleted file mode 100644
index 5919fb47c809..000000000000
--- a/python/tvm/meta_schedule/testing/utils.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Testing utility functions in meta schedule"""
-from typing import Callable, Dict, Optional, Union
-
-from tvm import meta_schedule as ms
-from tvm.ir import IRModule, transform
-from tvm.relay import Function as RelayFunc
-from tvm.runtime import NDArray
-from tvm.target import Target
-from tvm.tir import Schedule
-
-
-def apply_fixed_schedules(
-    relay_mod: Union[RelayFunc, IRModule],
-    target: Union[str, Target],
-    params: Optional[Dict[str, NDArray]],
-    schedule_fn: Callable[[ms.ExtractedTask, Schedule], bool],
-    tir_converter: str = "default",
-):
-    """Apply fixed schedules (manually written, without any tunable knobs) as specified by
-    schedule_fn to extracted tasks, and return a database that can be passed to compilation.
-
-    Parameters
-    ----------
-    mod : Union[RelayFunc, IRModule]
-        The Relay module to apply fixed schedules.
-    target : Union[str, Target]
-        The target used to extract tasks.
-    params : Optional[Dict[str, tvm.runtime.NDArray]]
-        The associated parameters of the module.
-    schedule_fn : Callable[[ExtractedTask, Schedule], bool]
-        A callable that is applied for each extracted task and the corresponding default schedule.
-        Returns True if the given schedule should be committed to the database, False otherwise.
-    tir_converter : str
-        The filter function to filter out the extracted tasks. Builtin filters:
-          - "default"
-          - "allow_extern"
-        The converter is a PackedFunc registered as f"relay.backend.tir_converter.{tir_converter}",
-        with the signature below:
-            (args: List[te.Tensor], constants: List[NDArray]) -> Optional[tir.PrimFunc]
-
-    Returns
-    -------
-    database : Database
-        The database containing dummy tuning records for manually scheduled traces.
-    """
-    target = Target(target) if isinstance(target, str) else target
-    config = {"relay.backend.use_meta_schedule": True}
-    for k, v in transform.PassContext.current().config.items():
-        config[k] = v
-
-    extracted_tasks = ms.extract_task_from_relay(
-        relay_mod,
-        target,
-        params,
-        tir_converter=tir_converter,
-    )
-    database = ms.database.MemoryDatabase()
-    for task in extracted_tasks:
-        mod = ms.default_config.mod(task.dispatched[0])
-        sch = Schedule(mod)
-
-        if schedule_fn(task, sch):
-            workload = database.commit_workload(mod)
-            tune_rec = ms.database.TuningRecord(sch.trace, workload, [0.0], target, [])
-            database.commit_tuning_record(tune_rec)
-
-    return database
diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc
index fedd2aa35278..d082ff7a3901 100644
--- a/src/meta_schedule/database/database.cc
+++ b/src/meta_schedule/database/database.cc
@@ -156,7 +156,8 @@ TuningRecord TuningRecord::FromJSON(const ObjectRef& json_obj, const Workload& w
 
 /******** Database ********/
 
-Optional<TuningRecord> DatabaseNode::QueryTuningRecord(IRModule mod, Target target) {
+Optional<TuningRecord> DatabaseNode::QueryTuningRecord(const IRModule& mod, const Target& target,
+                                                       const String& workload_name) {
   if (!this->HasWorkload(mod)) {
     return NullOpt;
   }
@@ -168,8 +169,9 @@ Optional<TuningRecord> DatabaseNode::QueryTuningRecord(IRModule mod, Target targ
   return records[0];
 }
 
-Optional<tir::Schedule> DatabaseNode::QuerySchedule(IRModule mod, Target target) {
-  if (Optional<TuningRecord> opt_record = this->QueryTuningRecord(mod, target)) {
+Optional<tir::Schedule> DatabaseNode::QuerySchedule(const IRModule& mod, const Target& target,
+                                                    const String& workload_name) {
+  if (Optional<TuningRecord> opt_record = this->QueryTuningRecord(mod, target, workload_name)) {
     TuningRecord record = opt_record.value();
     tir::Schedule sch =
         tir::Schedule::Traced(record->workload->mod, /*seed=*/-1, /*debug_mask=*/0,
@@ -181,8 +183,9 @@ Optional<tir::Schedule> DatabaseNode::QuerySchedule(IRModule mod, Target target)
   }
 }
 
-Optional<IRModule> DatabaseNode::QueryIRModule(IRModule mod, Target target) {
-  if (Optional<tir::Schedule> opt_sch = this->QuerySchedule(mod, target)) {
+Optional<IRModule> DatabaseNode::QueryIRModule(const IRModule& mod, const Target& target,
+                                               const String& workload_name) {
+  if (Optional<tir::Schedule> opt_sch = this->QuerySchedule(mod, target, workload_name)) {
     return opt_sch.value()->mod();
   } else {
     return NullOpt;
diff --git a/src/meta_schedule/database/memory_database.cc b/src/meta_schedule/database/memory_database.cc
index a00d5501ad1d..b6c635555152 100644
--- a/src/meta_schedule/database/memory_database.cc
+++ b/src/meta_schedule/database/memory_database.cc
@@ -44,7 +44,7 @@ class MemoryDatabaseNode : public DatabaseNode {
     return false;
   }
 
-  Workload CommitWorkload(const IRModule& mod) {
+  Workload CommitWorkload(const IRModule& mod) final {
     for (const auto& workload : workloads) {
       if (StructuralEqual()(workload->mod, mod)) {
         return workload;
@@ -55,9 +55,9 @@ class MemoryDatabaseNode : public DatabaseNode {
     return workload;
   }
 
-  void CommitTuningRecord(const TuningRecord& record) { records.push_back(record); }
+  void CommitTuningRecord(const TuningRecord& record) final { records.push_back(record); }
 
-  Array<TuningRecord> GetTopK(const Workload& workload, int top_k) {
+  Array<TuningRecord> GetTopK(const Workload& workload, int top_k) final {
     std::vector<std::pair<double, TuningRecord>> results;
     results.reserve(this->records.size());
     for (const TuningRecord& record : records) {
@@ -91,9 +91,9 @@ class MemoryDatabaseNode : public DatabaseNode {
     return ret;
   }
 
-  Array<TuningRecord> GetAllTuningRecords() { return records; }
+  Array<TuningRecord> GetAllTuningRecords() final { return records; }
 
-  int64_t Size() { return records.size(); }
+  int64_t Size() final { return records.size(); }
 };
 
 Database Database::MemoryDatabase() {
diff --git a/src/meta_schedule/database/schedule_fn_database.cc b/src/meta_schedule/database/schedule_fn_database.cc
new file mode 100644
index 000000000000..751721fe52d4
--- /dev/null
+++ b/src/meta_schedule/database/schedule_fn_database.cc
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+class ScheduleFnDatabaseNode : public DatabaseNode {
+ public:
+  runtime::TypedPackedFunc<bool(tir::Schedule)> schedule_fn;
+
+  void VisitAttrs(AttrVisitor* v) {
+    // `schedule_fn` is not visited.
+  }
+
+  static constexpr const char* _type_key = "meta_schedule.ScheduleFnDatabase";
+  TVM_DECLARE_FINAL_OBJECT_INFO(ScheduleFnDatabaseNode, DatabaseNode);
+
+ public:
+  Optional<TuningRecord> QueryTuningRecord(const IRModule& mod, const Target& target,
+                                           const String& workload_name) final {
+    if (Optional<tir::Schedule> sch = this->QuerySchedule(mod, target, workload_name)) {
+      return TuningRecord(sch.value()->trace().value(),
+                          /*workload=*/Workload(mod, 0),  //
+                          /*run_secs=*/NullOpt,           //
+                          /*target=*/target,              //
+                          /*arg_info=*/NullOpt);
+    }
+    return NullOpt;
+  }
+
+  Optional<tir::Schedule> QuerySchedule(const IRModule& mod, const Target& target,
+                                        const String& workload_name) final {
+    tir::Schedule sch =
+        tir::Schedule::Traced(WithAttr<IRModule>(mod, "task_name", workload_name),
+                              /*rand_state=*/-1,
+                              /*debug_mode=*/0,
+                              /*error_render_level=*/tir::ScheduleErrorRenderLevel::kDetail);
+    if (!schedule_fn(sch)) {
+      return NullOpt;
+    }
+    return sch;
+  }
+
+  bool HasWorkload(const IRModule& mod) final {
+    LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.HasWorkload";
+    throw;
+  }
+
+  Workload CommitWorkload(const IRModule& mod) final {
+    LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.CommitWorkload";
+    throw;
+  }
+
+  void CommitTuningRecord(const TuningRecord& record) final {
+    LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.CommitTuningRecord";
+    throw;
+  }
+
+  Array<TuningRecord> GetTopK(const Workload& workload, int top_k) final {
+    LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.GetTopK";
+    throw;
+  }
+
+  Array<TuningRecord> GetAllTuningRecords() final {
+    LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.GetAllTuningRecords";
+    throw;
+  }
+
+  int64_t Size() final {
+    LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.size";
+    throw;
+  }
+};
+
+Database Database::ScheduleFnDatabase(runtime::TypedPackedFunc<bool(tir::Schedule)> schedule_fn) {
+  ObjectPtr<ScheduleFnDatabaseNode> n = make_object<ScheduleFnDatabaseNode>();
+  n->schedule_fn = std::move(schedule_fn);
+  return Database(n);
+}
+
+TVM_REGISTER_NODE_TYPE(ScheduleFnDatabaseNode);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseScheduleFnDatabase")
+    .set_body_typed(Database::ScheduleFnDatabase);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index 0e2a3e270257..1d7566ebe2bd 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -367,7 +367,8 @@ class ScheduleBuilder : public ExprVisitor {
         if (Optional<PrimFunc> f = tir_converter(te_args, constants)) {
           if (Optional<TuningRecord> opt_record = database_.value()->QueryTuningRecord(
                   /*mod=*/backend::PrimFuncToIRModule(f.value()),
-                  /*target=*/target_)) {
+                  /*target=*/target_,
+                  /*workload_name=*/prim_fn_var->name_hint)) {
             static InstructionKind kind_transform_layout = InstructionKind::Get("TransformLayout");
             TuningRecord record = opt_record.value();
             for (const Instruction& inst : record->trace->insts) {
@@ -383,6 +384,8 @@ class ScheduleBuilder : public ExprVisitor {
             ICHECK_EQ(mod->functions.size(), 1);
             mod = tir::transform::RemoveWeightLayoutRewriteBlock()(std::move(mod));
             prim_func = Downcast<PrimFunc>(mod->Lookup("main"));
+          } else {
+            LOG(WARNING) << "Cannot find workload: " << prim_fn_var->name_hint;
           }
         }
       }
diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py
index c741ecb59ae0..b14c18e55f4b 100644
--- a/tests/python/unittest/test_link_params.py
+++ b/tests/python/unittest/test_link_params.py
@@ -29,7 +29,6 @@
 from tvm import meta_schedule as ms
 from tvm import relay
 from tvm.contrib import utils
-from tvm.meta_schedule.testing.utils import apply_fixed_schedules
 from tvm.relay.backend import Executor, Runtime
 
 INPUT_SHAPE = (1, 3, 16, 16)
@@ -407,21 +406,21 @@ def schedule_dense(sch):
     target = "llvm"
     params = {"weight": weight_np}
 
-    def schedule_fn(task, sch):
-        if "nn_dense" in task.task_name:
+    def schedule_fn(sch):
+        if "nn_dense" in sch.mod.attrs["task_name"]:
             schedule_dense(sch)
             return True
         return False
 
     link_params = True
 
-    with tvm.transform.PassContext(config={"relay.FuseOps.link_params": link_params}):
-        database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)
-
     with StringIO() as stderr_buf, redirect_stderr(stderr_buf):
-        with database, tvm.transform.PassContext(
+        with ms.database.ScheduleFnDatabase(schedule_fn), tvm.transform.PassContext(
             opt_level=3,
-            config={"relay.backend.use_meta_schedule": True},
+            config={
+                "relay.backend.use_meta_schedule": True,
+                "relay.FuseOps.link_params": link_params,
+            },
         ):
             executor = Executor("graph", {"link-params": link_params})
             lib = relay.build(relay_mod, target=target, executor=executor)
diff --git a/tests/python/unittest/test_meta_schedule_multi_anchor.py b/tests/python/unittest/test_meta_schedule_multi_anchor.py
index 177001781179..cb6f59c6e5d5 100644
--- a/tests/python/unittest/test_meta_schedule_multi_anchor.py
+++ b/tests/python/unittest/test_meta_schedule_multi_anchor.py
@@ -19,7 +19,6 @@
 import tvm.testing
 from tvm import meta_schedule as ms
 from tvm import relay
-from tvm.meta_schedule.testing.utils import apply_fixed_schedules
 
 
 def get_dense_dense(data_shape, weight_shape):
@@ -63,14 +62,13 @@ def test_dense_dense():
     target = "llvm"
     params = {"weight1": weight1_np, "weight2": weight2_np}
 
-    def schedule_fn(task, sch):
-        if "nn_dense_nn_dense" in task.task_name:
+    def schedule_fn(sch):
+        if "nn_dense_nn_dense" in sch.mod.attrs["task_name"]:
             schedule_dense_dense(sch)
             return True
         return False
 
-    database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)
-    with database:
+    with ms.database.ScheduleFnDatabase(schedule_fn):
         with tvm.transform.PassContext(
             opt_level=3,
             config={"relay.backend.use_meta_schedule": True},
diff --git a/tests/python/unittest/test_meta_schedule_relay_tir_compute.py b/tests/python/unittest/test_meta_schedule_relay_tir_compute.py
index 939851a65731..b37333803603 100644
--- a/tests/python/unittest/test_meta_schedule_relay_tir_compute.py
+++ b/tests/python/unittest/test_meta_schedule_relay_tir_compute.py
@@ -18,8 +18,9 @@
 import tvm
 import tvm.testing
 import tvm.topi.testing
-from tvm import autotvm, relay, te
-from tvm.meta_schedule.testing.utils import apply_fixed_schedules
+from tvm import autotvm
+from tvm import meta_schedule as ms
+from tvm import relay, te
 from tvm.relay.testing.temp_op_attr import TempOpAttr
 from tvm.script import tir as T
 
@@ -139,21 +140,14 @@ def test_conv2d():
     target = "llvm"
     params = {"weight": weight_np}
 
-    def schedule_fn(task, sch):
-        if "nn_conv2d" in task.task_name:
+    def schedule_fn(sch):
+        if "nn_conv2d" in sch.mod.attrs["task_name"]:
             schedule_tir_conv2d_nchw_oihw(sch)
             return True
         return False
 
     with TempOpAttr("nn.conv2d", "FTVMStrategy", _tmp_strategy):
-        database = apply_fixed_schedules(
-            relay_mod,
-            target,
-            params,
-            schedule_fn,
-            tir_converter="allow_extern",
-        )
-        with database, tvm.transform.PassContext(
+        with ms.database.ScheduleFnDatabase(schedule_fn), tvm.transform.PassContext(
             opt_level=3,
             config={
                 "relay.backend.use_meta_schedule": True,
diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py
index bc37fed7d691..b05b57feaf4c 100644
--- a/tests/python/unittest/test_meta_schedule_tune_relay.py
+++ b/tests/python/unittest/test_meta_schedule_tune_relay.py
@@ -29,7 +29,6 @@
 from tvm.contrib import graph_executor
 from tvm.ir import IRModule
 from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.meta_schedule.testing.utils import apply_fixed_schedules
 from tvm.script import tir as T
 from tvm.target.target import Target
 from tvm.tir.schedule import BlockRV, Schedule
@@ -452,8 +451,8 @@ def manual_tir_common(do_tune=False):
             )
     else:
 
-        def schedule_fn(task, sch):
-            if "dense" not in task.task_name:
+        def schedule_fn(sch) -> bool:
+            if "dense" not in sch.mod.attrs["task_name"]:
                 return False
 
             block = sch.get_block("compute")
@@ -468,7 +467,7 @@ def schedule_fn(task, sch):
 
             return True
 
-        database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)
+        database = ms.database.ScheduleFnDatabase(schedule_fn)
 
     with database, tvm.transform.PassContext(
         opt_level=3,

From 3d41ac3a9ab58ba5f7d3182e6afe915924568f8d Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 29 Aug 2022 02:29:24 -0700
Subject: [PATCH 064/704] [Refactor] Replace std::tie with structured bindings
 (#12610)

* [Refactor] Replace std::tie with structured bindings

With C++17 enabled in https://github.com/apache/tvm/pull/12337, using
structured bindings to replace cases where `std::tie` is used to
define local variables.

* Added missing header for <optional>

* Silenced unused variable warnings after structured bindings

This is a bug in gcc version 7, resolved in gcc 8.  While gcc version
7 is used for CI, we'll need to silence unused variable warnings
resulting from using only part of a structured binding.

More information: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
---
 src/auto_scheduler/auto_schedule.cc           |  4 +-
 src/auto_scheduler/compute_dag.cc             | 17 +++----
 src/auto_scheduler/feature.cc                 |  9 +---
 .../search_policy/search_policy.cc            |  4 +-
 .../search_policy/sketch_policy_rules.cc      |  3 +-
 src/ir/instrument.cc                          |  5 +-
 src/meta_schedule/database/json_database.cc   |  4 +-
 .../mutator/mutate_compute_location.cc        |  4 +-
 .../schedule_rule/cross_thread_reduction.cc   |  6 +--
 .../space_generator/post_order_apply.cc       |  4 +-
 src/relay/collage/partition_rule.cc           | 12 ++---
 src/relay/collage/sub_graph.cc                |  8 +--
 src/relay/qnn/op/convolution.cc               |  4 +-
 src/relay/qnn/op/leaky_relu.cc                |  6 +--
 src/relay/qnn/op/requantize.cc                |  3 +-
 src/relay/qnn/utils.cc                        |  6 +--
 src/relay/quantize/realize.cc                 |  6 +--
 .../transforms/combine_parallel_conv2d.cc     |  4 +-
 .../transforms/combine_parallel_dense.cc      |  4 +-
 src/runtime/graph_executor/graph_executor.cc  |  4 +-
 src/target/source/ptx.cc                      | 12 ++---
 src/te/autodiff/ad_simplify.cc                | 10 ++--
 src/te/autodiff/ad_utils.cc                   |  8 +--
 src/te/autodiff/jacobian.cc                   |  4 +-
 src/tir/schedule/analysis/analysis.cc         | 11 ++--
 src/tir/schedule/primitive/block_annotate.cc  |  4 +-
 .../primitive/layout_transformation.cc        | 12 ++---
 .../schedule/primitive/loop_transformation.cc |  4 +-
 src/tir/schedule/primitive/reduction.cc       | 12 ++---
 src/tir/schedule/primitive/sampling.cc        |  4 +-
 src/tir/transforms/loop_partition.cc          | 51 ++++++++++++-------
 .../lower_cross_thread_reduction.cc           | 15 ++----
 src/tir/transforms/lower_thread_allreduce.cc  |  5 +-
 src/tir/transforms/lower_warp_memory.cc       | 11 ++--
 .../manifest_shared_memory_local_stage.cc     | 10 +---
 35 files changed, 105 insertions(+), 185 deletions(-)

diff --git a/src/auto_scheduler/auto_schedule.cc b/src/auto_scheduler/auto_schedule.cc
index 747aa01cfa05..41aa49c77193 100755
--- a/src/auto_scheduler/auto_schedule.cc
+++ b/src/auto_scheduler/auto_schedule.cc
@@ -78,9 +78,7 @@ TVM_REGISTER_GLOBAL("auto_scheduler.TuningOptions")
 
 TVM_REGISTER_GLOBAL("auto_scheduler.AutoSchedule")
     .set_body_typed([](SearchPolicy search_policy, TuningOptions tuning_options) {
-      te::Schedule sch;
-      Array<te::Tensor> return_tensors;
-      std::tie(sch, return_tensors) = AutoSchedule(search_policy, tuning_options);
+      auto [sch, return_tensors] = AutoSchedule(search_policy, tuning_options);
       return Array<ObjectRef>{sch, return_tensors};
     });
 }  // namespace auto_scheduler
diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc
index dad55db0303f..5500707fb9af 100644
--- a/src/auto_scheduler/compute_dag.cc
+++ b/src/auto_scheduler/compute_dag.cc
@@ -1325,10 +1325,9 @@ State ComputeDAG::InferBound(const State& state) const {
 
   Array<te::Stage> stages;
   StageToAxesMap stage_to_axes;
-  te::Schedule sch;
-  Array<te::Tensor> tensors;
   // Replay steps to tvm::Schedule
-  std::tie(sch, tensors) = ApplySteps(pstate->transform_steps, &stages, &stage_to_axes);
+  auto [sch, tensors] = ApplySteps(pstate->transform_steps, &stages, &stage_to_axes);
+  (void)tensors;  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
   sch = sch.normalize_for_feature_extraction();
   // Get bound information from TVM schedule
   Map<IterVar, Range> bounds = te::InferBound(sch);
@@ -1382,9 +1381,8 @@ Array<State> ComputeDAG::InferBound(const Array<State>& states) const {
 }
 
 ComputeDAG ComputeDAG::ReplayAndGetDAG(const Array<Step>& transform_steps) const {
-  te::Schedule sch;
-  Array<te::Tensor> old_tensors;
-  std::tie(sch, old_tensors) = ApplySteps(transform_steps);
+  auto [sch, old_tensors] = ApplySteps(transform_steps);
+  (void)old_tensors;  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
   return ComputeDAG(sch);
 }
 
@@ -1481,11 +1479,8 @@ TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAG")
 
 TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGApplyStepsFromState")
     .set_body_typed([](const ComputeDAG& dag, const State& state, int layout_rewrite) {
-      te::Schedule sch;
-      Array<te::Tensor> return_tensors;
-      std::tie(sch, return_tensors) =
-          dag.ApplySteps(state->transform_steps, nullptr, nullptr,
-                         static_cast<LayoutRewriteOption>(layout_rewrite));
+      auto [sch, return_tensors] = dag.ApplySteps(state->transform_steps, nullptr, nullptr,
+                                                  static_cast<LayoutRewriteOption>(layout_rewrite));
       return Array<ObjectRef>{sch, return_tensors};
     });
 
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index c930bf0c4e73..e079018151a7 100644
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -952,9 +952,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
         unique_lines = std::max(unique_lines, 1.0f);
       }
 
-      ReuseType reuse_type;
-      float reuse_dis_iter, reuse_dis_bytes, reuse_ct;
-      std::tie(reuse_type, reuse_dis_iter, reuse_dis_bytes, reuse_ct) =
+      auto [reuse_type, reuse_dis_iter, reuse_dis_bytes, reuse_ct] =
           ComputeReuse(t, acc.indices, for_loop_stack_, for_touch_regions_, ana_);
 
       acc_feas.emplace_back();
@@ -1356,10 +1354,7 @@ void GetPerStoreFeatureName(int max_n_bufs, std::vector<std::string>* ret) {
 
 void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, int max_n_bufs,
                                    std::vector<float>* feature, std::atomic<int>* error_ct) {
-  te::Schedule sch;
-  Array<te::Tensor> tensors;
-
-  std::tie(sch, tensors) = task->compute_dag.ApplySteps(state->transform_steps);
+  auto [sch, tensors] = task->compute_dag.ApplySteps(state->transform_steps);
 
   // When inlining, replace const matrices with const values.
   // Produces wrong IR, but good enough for feature extraction, and
diff --git a/src/auto_scheduler/search_policy/search_policy.cc b/src/auto_scheduler/search_policy/search_policy.cc
index 702eec087668..196bee8ff0e2 100644
--- a/src/auto_scheduler/search_policy/search_policy.cc
+++ b/src/auto_scheduler/search_policy/search_policy.cc
@@ -106,9 +106,7 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyRunCallbacks")
 
 TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyContinueSearchOneRound")
     .set_body_typed([](SearchPolicy policy, int num_measure, ProgramMeasurer measurer) {
-      Array<MeasureInput> inputs;
-      Array<MeasureResult> results;
-      std::tie(inputs, results) = policy->ContinueSearchOneRound(num_measure, measurer);
+      auto [inputs, results] = policy->ContinueSearchOneRound(num_measure, measurer);
       return Array<ObjectRef>{inputs, results};
     });
 
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
index 8df69fc7ce3b..862e593c9dd3 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
@@ -343,8 +343,7 @@ SketchGenerationRule::ConditionKind RuleCrossThreadReduction::MeetCondition(
   const auto& op = state->stages[stage_id]->op;
   if (op->IsInstance<te::ComputeOpNode>()) {
     // Compute the product of lengths of all space iters and all reduce iters
-    int cum_space_len, cum_reduce_len;
-    std::tie(cum_space_len, cum_reduce_len) =
+    auto [cum_space_len, cum_reduce_len] =
         GetCumulativeSpaceAndReductionLength(state->stages[stage_id]);
 
     if (NeedsMultilevelTiling(policy.search_task, state, stage_id)) {
diff --git a/src/ir/instrument.cc b/src/ir/instrument.cc
index 795e5b8cb542..6701308fbfb7 100644
--- a/src/ir/instrument.cc
+++ b/src/ir/instrument.cc
@@ -288,10 +288,7 @@ String RenderPassProfiles() {
   os << std::fixed;
 
   while (profiles.size() > 0) {
-    size_t depth;
-    PassProfile::Duration parent_duration;
-    PassProfile* profile;
-    std::tie(depth, parent_duration, profile) = profiles.top();
+    auto [depth, parent_duration, profile] = profiles.top();
     profiles.pop();
 
     // indent depth
diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc
index f8fb64e92407..2e4f85260835 100644
--- a/src/meta_schedule/database/json_database.cc
+++ b/src/meta_schedule/database/json_database.cc
@@ -115,9 +115,7 @@ class JSONDatabaseNode : public DatabaseNode {
 
   Workload CommitWorkload(const IRModule& mod) {
     // Try to insert `mod` into `workloads_`
-    decltype(this->workloads2idx_)::iterator it;
-    bool inserted = false;
-    std::tie(it, inserted) =
+    auto [it, inserted] =
         this->workloads2idx_.emplace(Workload(mod, tvm::StructuralHash()(mod)), -1);
     Workload workload = it->first;
     // If `mod` is new in `workloads2idx_`, append it to the workload file
diff --git a/src/meta_schedule/mutator/mutate_compute_location.cc b/src/meta_schedule/mutator/mutate_compute_location.cc
index 3ed56df1b381..9d6d69ba355f 100644
--- a/src/meta_schedule/mutator/mutate_compute_location.cc
+++ b/src/meta_schedule/mutator/mutate_compute_location.cc
@@ -86,9 +86,7 @@ std::vector<MutateComputeLocationNode::Candidate> MutateComputeLocationNode::Fin
       int old_decision = Downcast<Integer>(decision)->value;
 
       // Step 2. Collect all the compute_at locations.
-      Array<tir::StmtSRef> location_srefs;
-      std::vector<int> location_indices;
-      std::tie(location_srefs, location_indices) = CollectComputeLocation(sch->state(), block_sref);
+      auto [location_srefs, location_indices] = CollectComputeLocation(sch->state(), block_sref);
       // Step 3. Remove the old decision.
       auto it = std::find(location_indices.begin(), location_indices.end(), old_decision);
       if (it != location_indices.end()) {
diff --git a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
index 242f1aea89c5..0f0ab99e7259 100644
--- a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
+++ b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
@@ -64,15 +64,11 @@ class CrossThreadReductionNode : public ScheduleRuleNode {
     // Step 2. Check the opportunity for block fusion. We say "fusible", if we can compute-at the
     // block to its consumers. We want to fuse as much as possible because it results in
     // significantly faster schedule.
-    bool fusible = false;
     // `target_loop` is the loop position where the input block will be computed at.
-    tir::LoopRV target_loop{nullptr};
     // `target_block` is the consumer block that we want to compute-at the input block to.
-    tir::BlockRV target_block{nullptr};
     // `tgt_block_innermost_loop` is the innermost loop outside the target block.
-    tir::LoopRV tgt_block_innermost_loop{nullptr};
 
-    std::tie(fusible, target_loop, target_block, tgt_block_innermost_loop) =
+    auto [fusible, target_loop, target_block, tgt_block_innermost_loop] =
         GetComputeTargetLoopAndBlock(tmp_sch, block_rv);
 
     // Step 3. Try block fusion.
diff --git a/src/meta_schedule/space_generator/post_order_apply.cc b/src/meta_schedule/space_generator/post_order_apply.cc
index eab084f8978f..9be89e2d9c70 100644
--- a/src/meta_schedule/space_generator/post_order_apply.cc
+++ b/src/meta_schedule/space_generator/post_order_apply.cc
@@ -140,9 +140,7 @@ class PostOrderApplyNode : public SpaceGeneratorNode {
       result.clear();
       while (!stack.empty()) {
         // get the stack.top()
-        tir::Schedule sch;
-        Array<tir::BlockRV> blocks;
-        std::tie(sch, blocks) = stack.back();
+        auto [sch, blocks] = stack.back();
         stack.pop_back();
         // if all blocks are visited
         if (blocks.empty()) {
diff --git a/src/relay/collage/partition_rule.cc b/src/relay/collage/partition_rule.cc
index e11f740acfe9..1d8c5e9723ee 100644
--- a/src/relay/collage/partition_rule.cc
+++ b/src/relay/collage/partition_rule.cc
@@ -92,9 +92,7 @@ std::vector<CandidatePartition> DFPatternPartitionRuleNode::AllCandidates(
       continue;
     }
     IndexSet inside = MatcherToIndexSet(matcher);
-    OpPatternKind kind;
-    String label;
-    std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside);
+    auto [kind, label] = SubGraphKindAndLabel(dataflow_graph, inside);
     SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label));
     String rule_name = rule_name_.empty() ? sub_graph->label_ : rule_name_;
     CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec);
@@ -256,9 +254,7 @@ std::vector<CandidatePartition> OpCallByKindPartitionRuleNode::AllCandidates(
     auto node = dataflow_graph.index_to_node(index);
     Expr sub_expr = node->ref();
     if (sub_expr->IsInstance<CallNode>()) {
-      OpPatternKind kind;
-      String label;
-      std::tie(kind, label) = SubExprKindAndLabel(sub_expr);
+      auto [kind, label] = SubExprKindAndLabel(sub_expr);
       if (kind <= kOutEWiseFusable) {
         IndexSet inside(dataflow_graph.size(), {index});
         SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label));
@@ -404,9 +400,7 @@ std::vector<CandidatePartition> HostPartitionRuleNode::AllCandidates(
       continue;
     }
     IndexSet inside(dataflow_graph.size(), {index});
-    OpPatternKind kind;
-    String label;
-    std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside);
+    auto [kind, label] = SubGraphKindAndLabel(dataflow_graph, inside);
     SubGraph sub_graph(dataflow_graph, std::move(inside), kind, label);
     String rule_name = NestLabels(rule_name_, sub_graph->label_);
     // We'll a zero cost for the candidate since we'll never want to actually estimate the cost
diff --git a/src/relay/collage/sub_graph.cc b/src/relay/collage/sub_graph.cc
index 63edc8c079fb..dee72093fd2f 100644
--- a/src/relay/collage/sub_graph.cc
+++ b/src/relay/collage/sub_graph.cc
@@ -439,9 +439,7 @@ std::pair<OpPatternKind, std::string> SubGraphKindAndLabel(const DataflowGraph&
   bool first = true;
   OpPatternKind max_kind = kElemWise;
   for (PostDfsIndex index : inside) {
-    OpPatternKind sub_kind;
-    std::string sub_label;
-    std::tie(sub_kind, sub_label) = SubExprKindAndLabel(dataflow_graph.index_to_node(index)->ref());
+    auto [sub_kind, sub_label] = SubExprKindAndLabel(dataflow_graph.index_to_node(index)->ref());
     if (!sub_label.empty()) {
       if (first) {
         first = false;
@@ -995,9 +993,7 @@ transform::Pass PartitionForTesting(Integer max_exits, Bool allow_taps, String c
     // Build the overall sub-graph, which will include any "Composite" functions as
     // well as any nodes without a label.
     IndexSet inside(dataflow_graph.size(), node_indexes);
-    OpPatternKind kind;
-    String label;
-    std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside);
+    auto [kind, label] = SubGraphKindAndLabel(dataflow_graph, inside);
     SubGraph sub_graph(dataflow_graph, inside, kind, label, std::move(nested_sub_graphs));
 
     // Push the overall sub-graph into the final "Compiler" function.
diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc
index 42e4540f0f2c..64a5a02e6e25 100644
--- a/src/relay/qnn/op/convolution.cc
+++ b/src/relay/qnn/op/convolution.cc
@@ -722,9 +722,9 @@ Expr QnnConv2DCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
       << "qnn.conv2d supports only OIHW/HWIO/HWOI/OHWI kernel data layout.";
   ICHECK(param->kernel_size.defined()) << "qnn.conv2d requires kernel size to be specified.";
 
-  int batch_size, in_channels, out_channels, kernel_h, kernel_w, channel_multiplier;
-  std::tie(batch_size, in_channels, out_channels, kernel_h, kernel_w, channel_multiplier) =
+  auto [batch_size, in_channels, out_channels, kernel_h, kernel_w, channel_multiplier] =
       GetWorkload(arg_types, param);
+  (void)batch_size;  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
 
   // zero points are allowed to be non-scalar. Let's check if that's the case.
   bool dynamic_zp = false;
diff --git a/src/relay/qnn/op/leaky_relu.cc b/src/relay/qnn/op/leaky_relu.cc
index 75bfabb7db85..458fde0d8a08 100644
--- a/src/relay/qnn/op/leaky_relu.cc
+++ b/src/relay/qnn/op/leaky_relu.cc
@@ -125,13 +125,11 @@ Expr QnnLeakyReluCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
                                              output_zero_point, input_shape);
 
   // alpha * Q_i'
-  int32_t fixed_point_multiplier, shift;
-  std::tie(fixed_point_multiplier, shift) = GetFixedPointMultiplierShift(alpha);
+  auto [fixed_point_multiplier, shift] = GetFixedPointMultiplierShift(alpha);
   auto prod = FixedPointMultiply(requantized_expr, fixed_point_multiplier, shift);
 
   // (1 - alpha) * zp_o
-  int32_t fixed_point_multiplier_z, shift_z;
-  std::tie(fixed_point_multiplier_z, shift_z) = GetFixedPointMultiplierShift(1 - alpha);
+  auto [fixed_point_multiplier_z, shift_z] = GetFixedPointMultiplierShift(1 - alpha);
   auto scaled_z = FixedPointMultiply(output_zero_point, fixed_point_multiplier_z, shift_z);
 
   // alpha * Q_i' + (1 - alpha) * zp_o
diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc
index 5bf53a95edda..ae321b459788 100644
--- a/src/relay/qnn/op/requantize.cc
+++ b/src/relay/qnn/op/requantize.cc
@@ -223,8 +223,7 @@ Expr RequantizeLowerInt(const Expr& input_tensor, const Expr& input_scale,
         static_cast<double>(input_scale_float) / static_cast<double>(output_scale_float);
     // Skip if input and output scales are same.
     if (!IsEqualScalar(input_scale, output_scale)) {
-      int32_t fixed_point_multiplier, shift;
-      std::tie(fixed_point_multiplier, shift) = GetFixedPointMultiplierShift(double_multiplier);
+      auto [fixed_point_multiplier, shift] = GetFixedPointMultiplierShift(double_multiplier);
 
       const bool is_upward_rounding = (param->rounding == "UPWARD");
 
diff --git a/src/relay/qnn/utils.cc b/src/relay/qnn/utils.cc
index 7dfd788d96c6..ed7a415cf6af 100644
--- a/src/relay/qnn/utils.cc
+++ b/src/relay/qnn/utils.cc
@@ -64,8 +64,7 @@ Expr FixedPointMultiplyToNearest(Expr tensor, double multiplier,
   tensor = Cast(tensor, hp_dtype);
 
   // 1) Calculating the integer multiplier and integer shift
-  int32_t fixed_point_multiplier, shift;
-  std::tie(fixed_point_multiplier, shift) = GetFixedPointMultiplierShift(multiplier);
+  auto [fixed_point_multiplier, shift] = GetFixedPointMultiplierShift(multiplier);
   int left_shift = shift > 0 ? shift : 0;
   int right_shift = shift > 0 ? 0 : -shift;
 
@@ -128,8 +127,7 @@ Expr FixedPointMultiplyPerChannel(Expr tensor, std::vector<double> multipliers,
   std::vector<int32_t> fixed_pt_multipliers, lshifts, rshifts;
   bool is_lshift_required = false;
   for (auto multiplier : multipliers) {
-    int32_t fixed_pt_multiplier, shift;
-    std::tie(fixed_pt_multiplier, shift) = GetFixedPointMultiplierShift(multiplier);
+    auto [fixed_pt_multiplier, shift] = GetFixedPointMultiplierShift(multiplier);
     int lshift = shift > 0 ? shift : 0;
     int rshift = shift > 0 ? 0 : -shift;
     fixed_pt_multipliers.push_back(fixed_pt_multiplier);
diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc
index 5766c62eaa43..720ef25cd33d 100644
--- a/src/relay/quantize/realize.cc
+++ b/src/relay/quantize/realize.cc
@@ -77,8 +77,7 @@ inline Expr MulAndDiv(Expr data, float s1, float s2, DataType dtype,
     return Multiply(data, MakeConstantScalar(dtype, factor));
   } else {
     if (cfg->rounding == "UPWARD") {
-      int32_t fixed_point_multiplier, shift;
-      std::tie(fixed_point_multiplier, shift) = qnn::GetFixedPointMultiplierShift(factor);
+      auto [fixed_point_multiplier, shift] = qnn::GetFixedPointMultiplierShift(factor);
       data = relay::FixedPointMultiply(data, fixed_point_multiplier, shift);
     } else {
       data = qnn::FixedPointMultiplyToNearest(data, factor, data_shape);
@@ -135,8 +134,7 @@ Expr QuantizeRealize(const Call& ref_call, const Array<Expr>& new_args, const Ob
     } else {
       data = Cast(data, DataType::Int(64));
       if (cfg->rounding == "UPWARD") {
-        int32_t fixed_point_multiplier, shift;
-        std::tie(fixed_point_multiplier, shift) =
+        auto [fixed_point_multiplier, shift] =
             qnn::GetFixedPointMultiplierShift(idom_scale_imm / odom_scale_imm);
         data = relay::FixedPointMultiply(data, fixed_point_multiplier, shift);
       } else {
diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc
index 20b206e0423c..9c7bcc27ec82 100644
--- a/src/relay/transforms/combine_parallel_conv2d.cc
+++ b/src/relay/transforms/combine_parallel_conv2d.cc
@@ -83,9 +83,7 @@ class ParallelConv2DCombiner : public ParallelOpCombiner {
   Call MakeCombinedOp(const Group& branches) {
     const Op& conv2d = Op::Get("nn.conv2d");
     Expr data = branches[0][0]->args[0];
-    Expr new_weight;
-    IndexExpr new_channels;
-    std::tie(new_weight, new_channels) = TransformWeight(branches);
+    auto [new_weight, new_channels] = TransformWeight(branches);
 
     const CallNode* group_root = branches[0][0];
     const auto* attrs = group_root->attrs.as<Conv2DAttrs>();
diff --git a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc
index d5404ba30f90..7cf102b5bcab 100644
--- a/src/relay/transforms/combine_parallel_dense.cc
+++ b/src/relay/transforms/combine_parallel_dense.cc
@@ -116,10 +116,8 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner {
   Call MakeCombinedOp(const Group& branches) {
     const Op& dense_op = Op::Get("nn.dense");
     Expr input = branches[0][0]->args[0];
-    Expr new_weight;
-    IndexExpr new_output_dims;
     // concat all weights into one
-    std::tie(new_weight, new_output_dims) = TransformWeight(branches);
+    auto [new_weight, new_output_dims] = TransformWeight(branches);
     const auto* origin_attrs = branches[0][0]->attrs.as<DenseAttrs>();
     ICHECK(origin_attrs);
     const auto dense_attrs = make_object<DenseAttrs>();
diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc
index e3113dbfe54c..fc7e82bed4e2 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -674,9 +674,7 @@ PackedFunc GraphExecutor::GetFunction(const std::string& name,
     });
   } else if (name == "get_input_info") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-      GraphExecutor::ShapeInfo shape_info;
-      GraphExecutor::DtypeInfo dtype_info;
-      std::tie(shape_info, dtype_info) = this->GetInputInfo();
+      auto [shape_info, dtype_info] = this->GetInputInfo();
       Map<String, ObjectRef> input_info;
       input_info.Set("shape", shape_info);
       input_info.Set("dtype", dtype_info);
diff --git a/src/target/source/ptx.cc b/src/target/source/ptx.cc
index c5e3bf98ec2d..881c425e7742 100644
--- a/src/target/source/ptx.cc
+++ b/src/target/source/ptx.cc
@@ -403,8 +403,7 @@ class Replacer {
   }
   std::string rewrite(std::string str) {
     for (auto&& rule : _rules) {
-      std::string pattern, replacement;
-      std::tie(pattern, replacement) = rule;
+      auto [pattern, replacement] = rule;
       size_t len = pattern.size();
       size_t new_len = replacement.size();
       size_t pos = str.find(pattern);
@@ -532,8 +531,7 @@ std::string PrintMMAAssembly(const std::string& shape, const std::string& A_layo
                 dtype_c = ptx::DTypeFromString(C_dtype);
   ptx::LayoutType layout_a = ptx::LayoutTypeFromString(A_layout),
                   layout_b = ptx::LayoutTypeFromString(B_layout);
-  int m, n, k;
-  std::tie(m, n, k) = ptx::ParseMMAShape(shape);
+  auto [m, n, k] = ptx::ParseMMAShape(shape);
   CheckMMAConfigValidity(m, n, k, layout_a, layout_b, dtype_a, dtype_b, dtype_c, bit_op, sparse,
                          saturate);
   std::string asm_code = R"(
@@ -545,8 +543,7 @@ std::string PrintMMAAssembly(const std::string& shape, const std::string& A_layo
       : {inputs});
   }
 )";
-  std::string templates_str, inputs_str, outputs_str;
-  std::tie(templates_str, inputs_str, outputs_str) =
+  auto [templates_str, inputs_str, outputs_str] =
       GetMMAOperands(m, n, k, dtype_a, dtype_b, dtype_c, sparse);
 
   // replace patterns
@@ -622,8 +619,7 @@ std::string PrintLoadMatrixAssembly(bool trans, int num, const std::string& type
     );
   }
 )";
-  std::string templates_str, outputs_str;
-  std::tie(templates_str, outputs_str) = GetLoadMatrixOperands(num, local_ptr, local_elem_offset);
+  auto [templates_str, outputs_str] = GetLoadMatrixOperands(num, local_ptr, local_elem_offset);
 
   Replacer replacer;
   replacer.register_rule("{.shape}", ".m8n8");
diff --git a/src/te/autodiff/ad_simplify.cc b/src/te/autodiff/ad_simplify.cc
index 28f57c77da70..26047e879e9b 100644
--- a/src/te/autodiff/ad_simplify.cc
+++ b/src/te/autodiff/ad_simplify.cc
@@ -1183,21 +1183,19 @@ PrimExpr RemoveJacobianAndLiftNonzeroCondImpl(const PrimExpr& expr_orig, const A
         return RemoveJacobianAndLiftNonzeroCondImpl(new_red, axis, vranges);
       }
 
-      PrimExpr new_outer_cond, new_reduce_cond;
       Array<PrimExpr> new_source = red->source;
 
       // Partially lift conditions from the reduce condition
-      std::tie(new_outer_cond, new_reduce_cond) =
+      auto [new_outer_cond, new_reduce_cond] =
           LiftConditionsThroughReduction(red->condition, red->axis, axis);
 
       // If it's not sum then we haven't yet lifted nonzeroness cond from the source
       if (!is_sum) {
-        PrimExpr outer_nz_cond, nz_cond, nz_source;
         auto nz = NonzeronessCondition(red->source[red->value_index]);
         // Append conditions from the reduction
-        nz_cond = new_reduce_cond && nz.cond;
-        nz_source = nz.value;
-        std::tie(outer_nz_cond, nz_cond) = LiftConditionsThroughReduction(nz_cond, red->axis, axis);
+        PrimExpr nz_source = nz.value;
+        auto [outer_nz_cond, nz_cond] =
+            LiftConditionsThroughReduction(new_reduce_cond && nz.cond, red->axis, axis);
         new_outer_cond = new_outer_cond && outer_nz_cond;
         new_source.Set(red->value_index, Select(nz_cond, nz_source, make_zero(nz_source.dtype())));
       }
diff --git a/src/te/autodiff/ad_utils.cc b/src/te/autodiff/ad_utils.cc
index 268abab9cacb..0d1e4927cdfe 100644
--- a/src/te/autodiff/ad_utils.cc
+++ b/src/te/autodiff/ad_utils.cc
@@ -47,9 +47,7 @@ std::pair<Array<IterVar>, Map<Var, PrimExpr>> CloneIterVars(const Array<IterVar>
 
 PrimExpr CloneReduction(const PrimExpr& expr) {
   if (const ReduceNode* red = expr.as<ReduceNode>()) {
-    Array<IterVar> new_axis;
-    Map<Var, PrimExpr> vmap;
-    std::tie(new_axis, vmap) = CloneIterVars(red->axis);
+    auto [new_axis, vmap] = CloneIterVars(red->axis);
 
     Array<PrimExpr> src_with_newaxis;
     for (const auto& src : red->source) {
@@ -71,9 +69,7 @@ Operation ComputeOpFromExprs(const Array<PrimExpr>& exprs, const Array<IterVar>&
                              const std::string& name, const std::string& tag,
                              const Map<String, ObjectRef>& attrs, bool clone_axis) {
   if (clone_axis) {
-    Array<IterVar> new_axis = axis;
-    Map<Var, PrimExpr> vmap;
-    std::tie(new_axis, vmap) = CloneIterVars(axis);
+    auto [new_axis, vmap] = CloneIterVars(axis);
     Array<PrimExpr> new_exprs;
     for (const PrimExpr& e : exprs) {
       new_exprs.push_back(Substitute(CloneReduction(e), vmap));
diff --git a/src/te/autodiff/jacobian.cc b/src/te/autodiff/jacobian.cc
index 7104424957af..e61a590c409d 100644
--- a/src/te/autodiff/jacobian.cc
+++ b/src/te/autodiff/jacobian.cc
@@ -317,9 +317,7 @@ Tensor Jacobian(const Tensor& output, const Tensor& input) {
 
   // We have to clone the iteration axes because otherwise the original expression
   // cannot be used together with the derivative (it will lead to errors during lowering)
-  Array<IterVar> new_axis;
-  Map<Var, PrimExpr> vmap;
-  std::tie(new_axis, vmap) = te::CloneIterVars(op->axis);
+  auto [new_axis, vmap] = te::CloneIterVars(op->axis);
 
   Array<PrimExpr> input_indices;
   size_t i = 0;
diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index b9e99257f37c..fb09a3480a3a 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -558,9 +558,13 @@ bool IsWriteCache(const StmtSRef& block_sref) {
   }
   const BufferRegion& write_region = block->writes[0];
   for (const BufferRegion& read_region : block->reads) {
-    bool exists, surjective, injective, ordered, no_const_read, no_shift_read;
-    std::tie(exists, surjective, injective, ordered, no_const_read, no_shift_read) =
+    auto [exists, surjective, injective, ordered, no_const_read, no_shift_read] =
         AnalyzeReadWritePattern(read_region, write_region);
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
+    (void)exists;
+    (void)surjective;
+    (void)no_const_read;
+    (void)no_shift_read;
     if (!(injective && ordered)) {
       return false;
     }
@@ -2118,8 +2122,7 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self,   //
   }
 
   // Cond 6. Can successfully calculating the cumulative loop length.
-  int64_t cum_space_len, cum_reduce_len;
-  std::tie(cum_space_len, cum_reduce_len) = GetCumulativeSpaceAndReductionLength(self, block_sref);
+  auto [cum_space_len, cum_reduce_len] = GetCumulativeSpaceAndReductionLength(self, block_sref);
   if (cum_space_len == -1 || cum_reduce_len == -1) {
     return false;
   }
diff --git a/src/tir/schedule/primitive/block_annotate.cc b/src/tir/schedule/primitive/block_annotate.cc
index 31c938313fed..0912e36836e3 100644
--- a/src/tir/schedule/primitive/block_annotate.cc
+++ b/src/tir/schedule/primitive/block_annotate.cc
@@ -82,9 +82,7 @@ class NonAllocatedBufferError : public ScheduleError {
 
   static StmtSRef CheckAndGetBufferAllocationSite(const IRModule& mod, const StmtSRef& block_sref,
                                                   const Buffer& buffer) {
-    Optional<StmtSRef> defining_site_sref;
-    bool is_alloc;
-    std::tie(defining_site_sref, is_alloc) = GetBufferDefiningSite(block_sref, buffer);
+    auto [defining_site_sref, is_alloc] = GetBufferDefiningSite(block_sref, buffer);
     if (!defining_site_sref.defined() || !is_alloc) {
       throw NonAllocatedBufferError(mod, buffer);
     }
diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc
index b4e40fa120fe..8e2643db0103 100644
--- a/src/tir/schedule/primitive/layout_transformation.cc
+++ b/src/tir/schedule/primitive/layout_transformation.cc
@@ -137,9 +137,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_
   const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref);
   Buffer old_buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block_ptr), buffer_index, buffer_index_type);
-  Optional<StmtSRef> defining_site_sref;
-  bool is_alloc;
-  std::tie(defining_site_sref, is_alloc) = GetBufferDefiningSite(block_sref, old_buffer);
+  auto [defining_site_sref, is_alloc] = GetBufferDefiningSite(block_sref, old_buffer);
   if (defining_site_sref.defined() && !is_alloc) {
     throw BufferIsSubregionError(self->mod, old_buffer);
   }
@@ -155,9 +153,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_
   Buffer new_buffer{new_buffer_node};
 
   // Step 2: Rewrite access indices and regions of the buffer
-  Stmt new_stmt;
-  Map<Block, Block> block_sref_reuse;
-  std::tie(new_stmt, block_sref_reuse) = TransformLayoutRewriter::Rewrite(
+  auto [new_stmt, block_sref_reuse] = TransformLayoutRewriter::Rewrite(
       GetRef<Block>(scope_block), old_buffer, new_buffer, index_map);
   Block new_scope_block = Downcast<Block>(new_stmt);
 
@@ -492,9 +488,7 @@ void SetAxisSeparator(ScheduleState self, const StmtSRef& block_sref, int buffer
   const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref);
   Buffer old_buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block_ptr), buffer_index, buffer_index_type);
-  Optional<StmtSRef> defining_site_sref;
-  bool is_alloc;
-  std::tie(defining_site_sref, is_alloc) = GetBufferDefiningSite(block_sref, old_buffer);
+  auto [defining_site_sref, is_alloc] = GetBufferDefiningSite(block_sref, old_buffer);
   if (defining_site_sref.defined() && !is_alloc) {
     throw BufferIsSubregionError(self->mod, old_buffer);
   }
diff --git a/src/tir/schedule/primitive/loop_transformation.cc b/src/tir/schedule/primitive/loop_transformation.cc
index 2db3eb902aba..992817e87e2d 100644
--- a/src/tir/schedule/primitive/loop_transformation.cc
+++ b/src/tir/schedule/primitive/loop_transformation.cc
@@ -704,9 +704,7 @@ void Reorder(ScheduleState self, const Array<StmtSRef>& ordered_loop_srefs) {
   //   the input array
   // - the bottom of the reorder range is the last loop in the input array which is not visited in
   // the previous traversals
-  const StmtSRefNode* top = nullptr;
-  const StmtSRefNode* bottom = nullptr;
-  std::tie(top, bottom) = GetBoundaryOfReorderRange(self, loop_srefs);
+  auto [top, bottom] = GetBoundaryOfReorderRange(self, loop_srefs);
   // Step 3. Collect all loops in the chain and check the loops are single-branch
   std::vector<const StmtSRefNode*> chain = GetLoopsInReorderRange(self, top, bottom);
   // Step 4. Check the block below has all its block_var to be data-parallel or reduction,
diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc
index 7a4ace736e48..1198e67d710a 100644
--- a/src/tir/schedule/primitive/reduction.cc
+++ b/src/tir/schedule/primitive/reduction.cc
@@ -278,9 +278,7 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref,
   body = Substitute(body, loop_var_map);
   // Step 6. Mutate IR
   const BlockNode* old_scope_root = TVM_SREF_TO_BLOCK(scope_root_sref);
-  Block new_scope_root{nullptr};
-  Block new_reduction_block{nullptr};
-  std::tie(new_scope_root, new_reduction_block) = DecomposeReductionBlockReplacer::Replace(
+  auto [new_scope_root, new_reduction_block] = DecomposeReductionBlockReplacer::Replace(
       GetRef<Block>(old_scope_root), GetRef<For>(loop), body, GetRef<Block>(block));
   self->Replace(scope_root_sref, new_scope_root,
                 {{GetRef<Block>(old_scope_root), new_scope_root},
@@ -1042,12 +1040,8 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax
   // commutative reducer, combiner lhs and combiner rhs from the reduction identity and the
   // reduction combiner. The lhs will be used when constructing the write-back block, and the rhs
   // will be used when constructing the rfactor block.
-  BufferStore init;
-  BufferStore update;
-  CommReducer reducer;
-  PrimExpr combiner_lhs, combiner_rhs;
-  std::tie(init, update) = GetBufferStoresFromReductionBlock(self, block);
-  std::tie(reducer, combiner_lhs, combiner_rhs) =
+  auto [init, update] = GetBufferStoresFromReductionBlock(self, block);
+  auto [reducer, combiner_lhs, combiner_rhs] =
       GetReducerAndCombinerLhsRhs(self, init->value, update);
 
   // Step 6. Check whether `factor_axis` is in a correct range, and convert it to non-negative if it
diff --git a/src/tir/schedule/primitive/sampling.cc b/src/tir/schedule/primitive/sampling.cc
index 52b5add2bc9e..b1001a7f9455 100644
--- a/src/tir/schedule/primitive/sampling.cc
+++ b/src/tir/schedule/primitive/sampling.cc
@@ -348,9 +348,7 @@ tir::StmtSRef SampleComputeLocation(tir::ScheduleState self,
                                     support::LinearCongruentialEngine::TRandState* rand_state,
                                     const StmtSRef& block_sref, Optional<Integer>* decision) {
   // Step 1. Collect all possible compute-at locations.
-  Array<tir::StmtSRef> location_srefs;
-  std::vector<int> location_indices;
-  std::tie(location_srefs, location_indices) = CollectComputeLocation(self, block_sref);
+  auto [location_srefs, location_indices] = CollectComputeLocation(self, block_sref);
   ICHECK_EQ(location_srefs.size(), location_indices.size());
 
   // Step 2. If there was a previous decision, keep the decision unchanged if it exists in the
diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc
index 677506889e57..6ecc6459b904 100644
--- a/src/tir/transforms/loop_partition.cc
+++ b/src/tir/transforms/loop_partition.cc
@@ -29,6 +29,7 @@
 #include <tvm/tir/stmt_functor.h>
 #include <tvm/tir/transform.h>
 
+#include <optional>
 #include <unordered_map>
 #include <unordered_set>
 
@@ -553,25 +554,39 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim
   if (finder.partitions.empty()) return Stmt();
 
   arith::IntervalSet for_interval(min, max);
-  bool cond_value;
-  IntSet middle_interval;
-  ExpressionSet cond_set;
-  // find an interval in which all conditions on var are true
-  std::tie(middle_interval, cond_set) =
-      GetIntervalAndCondset(finder.partitions, for_interval, true, has_partition_hint_);
-  if (middle_interval.IsNothing()) {
-    // if such interval doesn't exist, find an interval in which all
-    // conditions on var are false
-    std::tie(middle_interval, cond_set) =
-        GetIntervalAndCondset(finder.partitions, for_interval, false, has_partition_hint_);
-    if (middle_interval.IsNothing())
-      // we couldn't find an interval in which the conditions are provably true or false
-      // Therefore, we can't partition the loop based on those conds
-      return Stmt();
-    cond_value = false;
-  } else {
-    cond_value = true;
+
+  auto [middle_interval, cond_set,
+        opt_cond_value] = [&]() -> std::tuple<IntSet, ExpressionSet, std::optional<bool>> {
+    {
+      // find an interval in which all conditions on var are true
+      auto [middle_interval, cond_set] =
+          GetIntervalAndCondset(finder.partitions, for_interval, true, has_partition_hint_);
+      if (!middle_interval.IsNothing()) {
+        return {middle_interval, cond_set, true};
+      }
+    }
+
+    {
+      // if such interval doesn't exist, find an interval in which all
+      // conditions on var are false
+      auto [middle_interval, cond_set] =
+          GetIntervalAndCondset(finder.partitions, for_interval, false, has_partition_hint_);
+
+      if (!middle_interval.IsNothing()) {
+        return {middle_interval, cond_set, false};
+      }
+    }
+
+    // we couldn't find an interval in which the conditions are
+    // provably true or false.  Therefore, we can't partition the loop
+    // based on those conds
+    return {{}, {}, std::nullopt};
+  }();
+
+  if (!opt_cond_value.has_value()) {
+    return Stmt();
   }
+  bool cond_value = opt_cond_value.value();
 
   IntervalSet middle_interval_i = Downcast<IntervalSet>(middle_interval);
   // middle_interval is the subrange of the loop variable range for which a
diff --git a/src/tir/transforms/lower_cross_thread_reduction.cc b/src/tir/transforms/lower_cross_thread_reduction.cc
index df8bf69e7468..04b025b5f9ae 100644
--- a/src/tir/transforms/lower_cross_thread_reduction.cc
+++ b/src/tir/transforms/lower_cross_thread_reduction.cc
@@ -497,14 +497,10 @@ class CrossThreadReductionTransformer : public StmtMutator {
     // both be BufferStores with the same buffer and indices;
     // Extract the commutative reducer, combiner lhs and combiner rhs from the reduction identity
     // and the reduction combiner.
-    BufferStore init{nullptr};
-    BufferStore update{nullptr};
-    CommReducer reducer{nullptr};
-    PrimExpr combiner_lhs{nullptr};
-    PrimExpr combiner_rhs{nullptr};
-    std::tie(init, update) = GetBufferStoresFromReductionBlock(NullOpt, GetRef<Block>(block));
-    std::tie(reducer, combiner_lhs, combiner_rhs) =
+    auto [init, update] = GetBufferStoresFromReductionBlock(NullOpt, GetRef<Block>(block));
+    auto [reducer, combiner_lhs, combiner_rhs] =
         GetReducerAndCombinerLhsRhs(NullOpt, init->value, update);
+    (void)combiner_lhs;  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
 
     // Condition 5. The block should be the last block under the first reduction-related loop.
     bool visit = false;
@@ -577,10 +573,7 @@ class CrossThreadReductionTransformer : public StmtMutator {
     ++reduction_id_;
     // Step 2. Check whether cross-thread reduction can be applied. If no, throw an exception on
     // which condition the block violates.
-    int n_bound_reduction_loops = 0;
-    CommReducer reducer{nullptr};
-    PrimExpr combiner_rhs{nullptr};
-    std::tie(n_bound_reduction_loops, reducer, combiner_rhs) =
+    auto [n_bound_reduction_loops, reducer, combiner_rhs] =
         CheckCanApplyCrossThreadReduction(block, reduction_loops);
     // Step 3. Before doing the cross-thread reduction, in-thread reduction is needed when
     //  - not all the reduction-related loops are bound to thread axes, or
diff --git a/src/tir/transforms/lower_thread_allreduce.cc b/src/tir/transforms/lower_thread_allreduce.cc
index 43f7a103db7f..bd6b5185eb4a 100644
--- a/src/tir/transforms/lower_thread_allreduce.cc
+++ b/src/tir/transforms/lower_thread_allreduce.cc
@@ -301,9 +301,8 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
     // sort according to dim_index
     std::sort(block_threads.begin(), block_threads.end());
     for (auto&& thr_attr : block_threads) {
-      int dim_index, extent;
-      bool is_reduce;
-      std::tie(dim_index, extent, is_reduce) = thr_attr;
+      auto [dim_index, extent, is_reduce] = thr_attr;
+      (void)dim_index;  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
       if (is_reduce) {
         contiguous_reduce_extent *= extent;
       } else {
diff --git a/src/tir/transforms/lower_warp_memory.cc b/src/tir/transforms/lower_warp_memory.cc
index 408cdbd04ec7..e12e2772ab22 100644
--- a/src/tir/transforms/lower_warp_memory.cc
+++ b/src/tir/transforms/lower_warp_memory.cc
@@ -311,8 +311,8 @@ class WarpAccessRewriter : protected StmtExprMutator {
                                           << "Has StorageFlatten (TE-based schedule) or "
                                           << "FlattenBuffer (TIR-based schedules) been run?";
 
-      PrimExpr local_index, group;
-      std::tie(local_index, group) = SplitIndexByGroup(store->indices[0]);
+      auto [local_index, group] = SplitIndexByGroup(store->indices[0]);
+      (void)group;  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
 
       auto writer = store.CopyOnWrite();
       writer->indices = {local_index};
@@ -332,8 +332,7 @@ class WarpAccessRewriter : protected StmtExprMutator {
                                      << "Has StorageFlatten (TE-based schedule) or "
                                      << "FlattenBuffer (TIR-based schedules) been run?";
 
-    PrimExpr local_index, group;
-    std::tie(local_index, group) = SplitIndexByGroup(op->indices[0]);
+    auto [local_index, group] = SplitIndexByGroup(op->indices[0]);
     // invariance: local index must do not contain warp id
     ICHECK(!UsesVar(local_index, [this](const VarNode* var) { return var == warp_index_.get(); }))
         << "LowerWarpMemory failed to rewrite load to shuffle for index " << op->indices[0]
@@ -357,12 +356,10 @@ class WarpAccessRewriter : protected StmtExprMutator {
   // in this access pattern.
   std::pair<PrimExpr, PrimExpr> SplitIndexByGroup(const PrimExpr& index) {
     if (index.dtype().lanes() != 1) {
-      PrimExpr local_index, group;
-
       arith::PVar<PrimExpr> base;
       ICHECK(arith::ramp(base, 1, index.dtype().lanes()).Match(index));
 
-      std::tie(local_index, group) = SplitIndexByGroup(base.Eval());
+      auto [local_index, group] = SplitIndexByGroup(base.Eval());
       local_index = Ramp(local_index, make_const(local_index.dtype(), 1), index.dtype().lanes());
       return std::make_pair(local_index, group);
     }
diff --git a/src/tir/transforms/manifest_shared_memory_local_stage.cc b/src/tir/transforms/manifest_shared_memory_local_stage.cc
index 16c85642d1e5..0f56c8b8b7c9 100644
--- a/src/tir/transforms/manifest_shared_memory_local_stage.cc
+++ b/src/tir/transforms/manifest_shared_memory_local_stage.cc
@@ -61,9 +61,7 @@ class IntermediateStageRewriter {
     std::vector<const ForNode*> relaxed_loops = CollectRelaxedOuterLoops(block, target_buffer);
 
     // Step 1: Create buffer for the local stage
-    Buffer new_buffer{nullptr};
-    Array<PrimExpr> buffer_indices;
-    std::tie(new_buffer, buffer_indices) = CreateIntermediateBuffer(relaxed_loops, target_buffer);
+    auto [new_buffer, buffer_indices] = CreateIntermediateBuffer(relaxed_loops, target_buffer);
 
     // Step 2: Create the local stage block
     Stmt local_stage = MakeLocalStage(block, new_buffer, buffer_indices, relaxed_loops, store);
@@ -190,12 +188,8 @@ class SharedMemoryLocalStageInserter : public StmtMutator {
       // The annotated block must be a leaf block (will be checked during rewriting). No need to
       // visit its body recursively.
 
-      Buffer target_buffer{nullptr};
-      Buffer new_buffer{nullptr};
-      Block new_block{nullptr};
-      Stmt local_stage{nullptr};
       IntermediateStageRewriter rewriter(ancestor_loop_or_blocks_);
-      std::tie(target_buffer, new_buffer, new_block, local_stage) = rewriter.Rewrite(op);
+      auto [target_buffer, new_buffer, new_block, local_stage] = rewriter.Rewrite(op);
       buffer_remap_.Set(target_buffer, new_buffer);
 
       new_block.CopyOnWrite()->annotations.erase(attr::manifest_shared_memory_local_stage);

From c5c99a4b523c9165adb4d552d284f8666520336f Mon Sep 17 00:00:00 2001
From: zhaoyang-star <zhaoyangstar@foxmail.com>
Date: Mon, 29 Aug 2022 18:31:00 +0800
Subject: [PATCH 065/704] [QNN] Align output_scale/zero_point of sigmoid to
 Torch (#12624)

* [QNN] Align output_scale/zero_point of sigmoid to Torch

* [QNN] Align output_scale/zero_point of sigmoid to Torch
---
 python/tvm/relay/frontend/pytorch.py   |  6 ++--
 python/tvm/relay/frontend/qnn_torch.py | 40 ++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 9f808203a6e1..2255396c0633 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -1565,10 +1565,8 @@ def func(x):
             return _op.tensor.sigmoid(x)
 
         if self.is_quantized_tensor(data):
-            assert len(inputs) == 3, "Input quant param not found in op inputs"
-            input_scale = _expr.const(inputs[1])
-            input_zero_point = _expr.const(inputs[2])
-            return qnn_torch.quantized_sigmoid(data, input_scale, input_zero_point)
+            assert len(inputs) == 5, "Input/Ouput quant param not found in op inputs"
+            return qnn_torch.quantized_sigmoid(inputs)
 
         return func(data)
 
diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py
index c2e233d5961e..45cb8dedfd53 100644
--- a/python/tvm/relay/frontend/qnn_torch.py
+++ b/python/tvm/relay/frontend/qnn_torch.py
@@ -272,6 +272,7 @@ def _get_quant_param_for_input(input_value):
         "quantized::hardswish": (1, 2),
         "quantized::conv_transpose2d": qconv_indices,
         "quantized::leaky_relu": (3, 4),
+        "aten::sigmoid": (1, 2),
     }
 
     def dfs(current_node):
@@ -395,6 +396,33 @@ def _add_output_quant_params_to_scalar_op(node, graph, input_scale, input_zero_p
     node.addInput(out_zero_point_node.output())
 
 
+def _add_output_quant_params_to_sigmoid_op(node, graph):
+    """
+    Refer to aten/src/ATen/native/quantized/cpu/qsigmoid.cpp,
+    the output scale and zp of sigmoid op are two fixed numbers.
+    So we need to make two new constant nodes in the input IR and
+    add these params to the inputs of sigmoid op.
+    """
+    # pylint: disable=c-extension-no-member
+    import torch
+
+    # suppose scale_type is uint8
+    out_scale = 1.0 / 256
+    out_zero_point = 0
+
+    # create new constant nodes and add them to graph
+    out_scale_node = graph.create("prim::Constant")
+    out_zero_point_node = graph.create("prim::Constant")
+    out_scale_node.insertBefore(node)
+    out_zero_point_node.insertBefore(node)
+    out_scale_node.f_("value", out_scale)
+    out_zero_point_node.i_("value", out_zero_point)
+    out_scale_node.output().setType(torch._C.FloatType.get())
+    out_zero_point_node.output().setType(torch._C.IntType.get())
+    node.addInput(out_scale_node.output())
+    node.addInput(out_zero_point_node.output())
+
+
 def add_input_quant_params_to_op_inputs(graph):
     """
     In Torch, input quant params are not explicitly passed around
@@ -483,6 +511,9 @@ def add_input_quant_params_to_op_inputs(graph):
             # see the comments in this function above
             _add_output_quant_params_to_scalar_op(node, graph, inp_scale, inp_zero_point, scalar)
 
+        if operator == "aten::sigmoid":
+            _add_output_quant_params_to_sigmoid_op(node, graph)
+
         for scale, zp in zip(input_scales, input_zero_points):
             node.addInput(scale)
             node.addInput(zp)
@@ -571,9 +602,12 @@ def quantized_relu(data, input_zero_point):
     return _op.tensor.maximum(data, zp)
 
 
-def quantized_sigmoid(data, input_scale, input_zero_point):
-    output_scale = input_scale
-    output_zero_point = input_zero_point
+def quantized_sigmoid(inputs):
+    data = inputs[0]
+    output_scale = _expr.const(inputs[1])
+    output_zero_point = _expr.const(inputs[2])
+    input_scale = _expr.const(inputs[3])
+    input_zero_point = _expr.const(inputs[4])
     return relay.qnn.op.sigmoid(
         data, input_scale, input_zero_point, output_scale, output_zero_point
     )

From 0de22196db5f818a6937f026db43785935b9e731 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Mon, 29 Aug 2022 09:59:10 -0700
Subject: [PATCH 066/704] [microTVM][Zephyr] Disable test_armv7m_intrinsic
 since it's broken (#12620)

add xfail
---
 tests/micro/zephyr/test_zephyr_armv7m.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/micro/zephyr/test_zephyr_armv7m.py b/tests/micro/zephyr/test_zephyr_armv7m.py
index 1f6a1a1bc13e..6a1dff254591 100644
--- a/tests/micro/zephyr/test_zephyr_armv7m.py
+++ b/tests/micro/zephyr/test_zephyr_armv7m.py
@@ -104,12 +104,12 @@ def _apply_desired_layout_no_simd(relay_mod):
 
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
+@pytest.mark.xfail(reason="due https://github.com/apache/tvm/issues/12619")
 def test_armv7m_intrinsic(workspace_dir, board, west_cmd, microtvm_debug):
     """Testing a ARM v7m SIMD extension."""
-
     if board not in [
         "mps2_an521",
-        "stm32f746xx_disco",
+        "stm32f746g_disco",
         "nucleo_f746zg",
         "nucleo_l4r5zi",
         "nrf5340dk_nrf5340_cpuapp",

From c31a762b985894f64d3a80407b75fadb60240862 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Mon, 29 Aug 2022 11:00:54 -0700
Subject: [PATCH 067/704] [ci] Don't update Jenkinsfile timestamp on image
 updates (#12621)

The timestamp in the Jenkinsfile is there to prevent post-merge
conflicts from different PRs that edit the templates merging
non-sequentially. This is not an issue when a line is edited in place
though, which is often the case when Docker image tags are updated. This
PR makes it so the timestamp is not updated in these cases which should
reduce merge conflicts on these types of PRs.
---
 ci/jenkins/generate.py | 62 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/ci/jenkins/generate.py b/ci/jenkins/generate.py
index 901d413364b3..3ccdedc6d924 100644
--- a/ci/jenkins/generate.py
+++ b/ci/jenkins/generate.py
@@ -18,11 +18,12 @@
 import jinja2
 import argparse
 import difflib
-import re
 import datetime
+import re
 import textwrap
 
 from pathlib import Path
+from typing import List
 
 
 REPO_ROOT = Path(__file__).resolve().parent.parent.parent
@@ -82,9 +83,51 @@ def lines_without_generated_tag(content):
     ]
 
 
+def is_changed_images_only(lines: List[str]) -> bool:
+    """
+    Return True if 'line' only edits an image tag or if 'line' is not a changed
+    line in a diff
+    """
+    added_images = []
+    removed_images = []
+    diff_lines = []
+
+    for line in lines[2:]:
+        if not line.startswith("-") and not line.startswith("+"):
+            # not a diff line, ignore it
+            continue
+
+        diff_lines.append(line)
+
+    if len(diff_lines) == 0:
+        # no changes made
+        return True
+
+    for line in diff_lines:
+        is_add = line.startswith("+")
+        line = line.strip().lstrip("+").lstrip("-")
+        match = re.search(
+            r"^(ci_[a-zA-Z0-9]+) = \'.*\'$",
+            line.strip().lstrip("+").lstrip("-"),
+            flags=re.MULTILINE,
+        )
+        if match is None:
+            # matched a non-image line, quit early
+            return False
+
+        if is_add:
+            added_images.append(match.groups()[0])
+        else:
+            removed_images.append(match.groups()[0])
+
+    # make sure that the added image lines match the removed image lines
+    return len(added_images) > 0 and added_images == removed_images
+
+
 if __name__ == "__main__":
     help = "Regenerate Jenkinsfile from template"
     parser = argparse.ArgumentParser(description=help)
+    parser.add_argument("--force", action="store_true", help="always overwrite timestamp")
     parser.add_argument("--check", action="store_true", help="just verify the output didn't change")
     args = parser.parse_args()
 
@@ -92,6 +135,10 @@ def lines_without_generated_tag(content):
         content = f.read()
 
     data["generated_time"] = datetime.datetime.now().isoformat()
+    timestamp_match = re.search(r"^// Generated at (.*)$", content, flags=re.MULTILINE)
+    if not timestamp_match:
+        raise RuntimeError("Could not find timestamp in Jenkinsfile")
+    original_timestamp = timestamp_match.groups()[0]
 
     environment = jinja2.Environment(
         loader=jinja2.FileSystemLoader(REPO_ROOT),
@@ -103,11 +150,18 @@ def lines_without_generated_tag(content):
     template = environment.get_template(str(JENKINSFILE_TEMPLATE.relative_to(REPO_ROOT)))
     new_content = template.render(**data)
 
-    diff = "".join(
-        difflib.unified_diff(
+    diff = [
+        line
+        for line in difflib.unified_diff(
             lines_without_generated_tag(content), lines_without_generated_tag(new_content)
         )
-    )
+    ]
+    if not args.force and is_changed_images_only(diff):
+        new_content = new_content.replace(data["generated_time"], original_timestamp)
+        print("Detected only Docker-image name changed, skipping timestamp update")
+
+    diff = "".join(diff)
+
     if args.check:
         if not diff:
             print("Success, the newly generated Jenkinsfile matched the one on disk")

From 74988d36bd578b791bbdcea383d343d62029e9cf Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 29 Aug 2022 14:33:04 -0700
Subject: [PATCH 068/704] [Utils] Handled Callable in
 tir.schedule._type_checker (#12633)

Previously, `Callable` was handled as an atomic type.  This worked
when it was included as last element of a `Union[]` annotation with no
subtypes, but raised an error for other use cases, including
`Optional[Callable]`.

This commit adds explicit checks for `Callable` type annotations to
validate whether the argument is callable, but doesn't recursively
validate the signature of the callable object, because lambda
functions cannot have type
annotations. (https://peps.python.org/pep-3107/#lambda)
---
 python/tvm/tir/schedule/_type_checker.py      | 40 ++++++++++
 .../unittest/test_type_annotation_checker.py  | 77 +++++++++++++++----
 2 files changed, 103 insertions(+), 14 deletions(-)

diff --git a/python/tvm/tir/schedule/_type_checker.py b/python/tvm/tir/schedule/_type_checker.py
index d45b4fb84b27..0b48dfc2b0e6 100644
--- a/python/tvm/tir/schedule/_type_checker.py
+++ b/python/tvm/tir/schedule/_type_checker.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 """Type checking functionality"""
+import collections
+import collections.abc
 import functools
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, Union
@@ -26,6 +28,7 @@ def _is_none_type(type_: Any) -> bool:
 
 
 if hasattr(typing, "_GenericAlias"):
+    # For python versions 3.7 onward, check the __origin__ attribute.
 
     class _Subtype:
         @staticmethod
@@ -71,7 +74,15 @@ def union(type_: Any) -> Optional[List[type]]:
                     return list(subtypes)
             return None
 
+        @staticmethod
+        def callable(type_: Any) -> Optional[List[type]]:
+            if _Subtype._origin(type_) is collections.abc.Callable:
+                subtypes = type_.__args__
+                return subtypes
+            return None
+
 elif hasattr(typing, "_Union"):
+    # For python 3.6 and below, check the __name__ attribute, or CallableMeta.
 
     class _Subtype:  # type: ignore
         @staticmethod
@@ -114,6 +125,13 @@ def union(type_: Any) -> Optional[List[type]]:
                     return list(subtypes)
             return None
 
+        @staticmethod
+        def callable(type_: Any) -> Optional[List[type]]:
+            if isinstance(type_, typing.CallableMeta):  # type: ignore # pylint: disable=no-member,protected-access
+                subtypes = type_.__args__
+                return subtypes
+            return None
+
 
 def _dispatcher(type_: Any) -> Tuple[str, List[type]]:
     if _is_none_type(type_):
@@ -139,12 +157,27 @@ def _dispatcher(type_: Any) -> Tuple[str, List[type]]:
     if subtype is not None:
         return "union", subtype
 
+    subtype = _Subtype.callable(type_)
+    if subtype is not None:
+        return "callable", subtype
+
     return "atomic", [type_]
 
 
+def callable_str(subtypes):
+    if subtypes:
+        *arg_types, return_type = subtypes
+        arg_str = ", ".join(_type2str(arg_type) for arg_type in arg_types)
+        return_type_str = _type2str(return_type)
+        return f"Callable[[{arg_str}], {return_type_str}]"
+    else:
+        return "Callable"
+
+
 _TYPE2STR: Dict[Any, Callable] = {
     "none": lambda: "None",
     "atomic": lambda t: str(t.__name__),
+    "callable": callable_str,
     "list": lambda t: f"List[{_type2str(t)}]",
     "dict": lambda k, v: f"Dict[{_type2str(k)}, {_type2str(v)}]",
     "tuple": lambda *t: f"Tuple[{', '.join([_type2str(x) for x in t])}]",
@@ -188,6 +221,12 @@ def _type_check_none(v: Any, name: str) -> Optional[str]:
     def _type_check_atomic(v: Any, name: str, type_: Any) -> Optional[str]:
         return None if isinstance(v, type_) else _type_check_err(v, name, type_)
 
+    def _type_check_callable(v: Any, name: str, *_subtypes: Any) -> Optional[str]:
+        # Current implementation only validates that the argument is
+        # callable, and doesn't validate the arguments accepted by the
+        # callable, if any.
+        return None if callable(v) else _type_check_err(v, name, Callable)
+
     def _type_check_list(v: List[Any], name: str, type_: Any) -> Optional[str]:
         if not isinstance(v, (list, tuple)):
             return _type_check_err(v, name, list)
@@ -234,6 +273,7 @@ def _type_check_union(v: Any, name: str, *types: Any) -> Optional[str]:
     return {
         "none": _type_check_none,
         "atomic": _type_check_atomic,
+        "callable": _type_check_callable,
         "list": _type_check_list,
         "dict": _type_check_dict,
         "tuple": _type_check_tuple,
diff --git a/tests/python/unittest/test_type_annotation_checker.py b/tests/python/unittest/test_type_annotation_checker.py
index e84ae043d356..204c15331339 100644
--- a/tests/python/unittest/test_type_annotation_checker.py
+++ b/tests/python/unittest/test_type_annotation_checker.py
@@ -17,13 +17,22 @@
 """Test type checker based on python's type annotations"""
 
 import sys
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Tuple, Union, Callable
 
 import pytest
+import _pytest
 
 from tvm.tir.schedule._type_checker import type_checked
 
 
+def int_func(x: int) -> int:
+    return 2 * x
+
+
+def str_func(x: str) -> str:
+    return 2 * x
+
+
 test_cases = [
     {
         "type_annotation": int,
@@ -90,30 +99,71 @@
             None,
         ],
     },
+    {
+        "type_annotation": Callable,
+        "positive_cases": [str_func, int_func],
+        "negative_cases": [
+            None,
+            "x",
+            42,
+        ],
+    },
+    {
+        "type_annotation": Callable[[int], int],
+        "positive_cases": [int_func],
+        "negative_cases": [
+            None,
+            "x",
+            42,
+            pytest.param(
+                str_func,
+                marks=pytest.mark.xfail(
+                    reason="Signature of Callable arguments not currently checked"
+                ),
+            ),
+        ],
+    },
 ]
 
-positive_cases = [
-    (config["type_annotation"], case) for config in test_cases for case in config["positive_cases"]
-]
-
-negative_cases = [
-    (config["type_annotation"], case) for config in test_cases for case in config["negative_cases"]
-]
 
+def make_parametrization(type_annotation, case):
+    if isinstance(case, _pytest.mark.structures.ParameterSet):
+        marks = case.marks
+        (case,) = case.values
+    else:
+        marks = []
 
-def format_name(type_annotation, case):
     try:
-        name = type_annotation.__name__
+        annotation_name = type_annotation.__name__
     except AttributeError:
-        name = str(type_annotation).replace("typing.", "")
+        annotation_name = str(type_annotation).replace("typing.", "")
+
+    if hasattr(case, "__name__"):
+        case_name = case.__name__
+    else:
+        case_name = str(case)
 
-    return f"{name}_{case}"
+    name = f"{annotation_name}, {case_name}"
+
+    return pytest.param(type_annotation, case, marks=marks, id=name)
+
+
+positive_cases = [
+    make_parametrization(config["type_annotation"], case)
+    for config in test_cases
+    for case in config["positive_cases"]
+]
+
+negative_cases = [
+    make_parametrization(config["type_annotation"], case)
+    for config in test_cases
+    for case in config["negative_cases"]
+]
 
 
 @pytest.mark.parametrize(
     ["type_annotation", "case"],
     positive_cases,
-    ids=[format_name(t, c) for t, c in positive_cases],
 )
 def test_matches_type(type_annotation, case):
     @type_checked
@@ -126,7 +176,6 @@ def func(_: type_annotation):
 @pytest.mark.parametrize(
     ["type_annotation", "case"],
     negative_cases,
-    ids=[format_name(t, c) for t, c in negative_cases],
 )
 def test_not_matches(type_annotation, case):
     @type_checked

From 9e88723385f83a2d27a60432cbe50782bed2885f Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 29 Aug 2022 17:27:34 -0700
Subject: [PATCH 069/704] [TIR] Improved error messages for PrimExpr operator
 overloads (#12638)

Previously, type-checks in boolean operators on `PrimExpr` would
state that the type is incorrect, but further investigation would be
required in order to determine what expression caused the error.
After this commit, error messages for these type checks include the
expression that was used, and the dtype of that expression.
---
 src/tir/op/op.cc | 58 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 18 deletions(-)

diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc
index 69d1da5e8c1c..b9e0c3c37068 100644
--- a/src/tir/op/op.cc
+++ b/src/tir/op/op.cc
@@ -520,10 +520,37 @@ PrimExpr not_equal(PrimExpr a, PrimExpr b, Span span) {
   return tir::NE(a, b, span);
 }
 
+namespace {
+void type_check_boolean_args(const PrimExpr& arg, const char* op) {
+  ICHECK(arg.dtype().is_bool()) << "Expected boolean argument for " << op << ", but received "
+                                << arg << " of type " << arg.dtype();
+}
+void type_check_boolean_args(const PrimExpr& lhs, const PrimExpr& rhs, const char* op) {
+  ICHECK(lhs.dtype().is_bool()) << "Expected boolean argument as LHS of " << op << ", but received "
+                                << lhs << " of type " << lhs.dtype();
+  ICHECK(rhs.dtype().is_bool()) << "Expected boolean argument as RHS of " << op << ", but received "
+                                << rhs << " of type " << rhs.dtype();
+}
+
+void type_check_integer_args(const PrimExpr& arg, const char* op) {
+  ICHECK(arg.dtype().is_int() || arg.dtype().is_uint())
+      << "Expected integer argument for " << op << ", but received " << arg << " of type "
+      << arg.dtype();
+}
+
+void type_check_integer_args(const PrimExpr& lhs, const PrimExpr& rhs, const char* op) {
+  ICHECK(lhs.dtype().is_int() || lhs.dtype().is_uint())
+      << "Expected integer argument as LHS of " << op << ", but received " << lhs << " of type "
+      << lhs.dtype();
+  ICHECK(rhs.dtype().is_int() || rhs.dtype().is_uint())
+      << "Expected integer argument as RHS of " << op << ", but received " << rhs << " of type "
+      << rhs.dtype();
+}
+}  // namespace
+
 PrimExpr operator&&(PrimExpr a, PrimExpr b) { return logical_and(a, b); }
 PrimExpr logical_and(PrimExpr a, PrimExpr b, Span span) {
-  ICHECK(a.dtype().is_bool());
-  ICHECK(b.dtype().is_bool());
+  type_check_boolean_args(a, b, "&& operator (logical AND)");
   PrimExpr ret = arith::TryConstFold<tir::And>(a, b);
   if (ret.defined()) return ret;
   return tir::And(a, b, span);
@@ -531,8 +558,7 @@ PrimExpr logical_and(PrimExpr a, PrimExpr b, Span span) {
 
 PrimExpr operator||(PrimExpr a, PrimExpr b) { return logical_or(a, b); }
 PrimExpr logical_or(PrimExpr a, PrimExpr b, Span span) {
-  ICHECK(a.dtype().is_bool());
-  ICHECK(b.dtype().is_bool());
+  type_check_boolean_args(a, b, "|| operator (logical OR)");
   PrimExpr ret = arith::TryConstFold<tir::Or>(a, b);
   if (ret.defined()) return ret;
   return tir::Or(a, b, span);
@@ -540,7 +566,7 @@ PrimExpr logical_or(PrimExpr a, PrimExpr b, Span span) {
 
 PrimExpr operator!(PrimExpr a) { return logical_not(a); }
 PrimExpr logical_not(PrimExpr a, Span span) {
-  ICHECK(a.dtype().is_bool());
+  type_check_boolean_args(a, "! operator (logical NOT)");
   PrimExpr ret = arith::TryConstFold<tir::Not>(a);
   if (ret.defined()) return ret;
   return tir::Not(a, span);
@@ -550,8 +576,8 @@ PrimExpr logical_not(PrimExpr a, Span span) {
 PrimExpr operator>>(PrimExpr a, PrimExpr b) { return right_shift(a, b); }
 
 PrimExpr right_shift(PrimExpr a, PrimExpr b, Span span) {
-  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
-  ICHECK(b.dtype().is_int() || b.dtype().is_uint());
+  type_check_integer_args(a, b, ">> operator (right shift)");
+
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
@@ -573,8 +599,7 @@ PrimExpr right_shift(PrimExpr a, PrimExpr b, Span span) {
 // shift left
 PrimExpr operator<<(PrimExpr a, PrimExpr b) { return left_shift(a, b); }
 PrimExpr left_shift(PrimExpr a, PrimExpr b, Span span) {
-  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
-  ICHECK(b.dtype().is_int() || b.dtype().is_uint());
+  type_check_integer_args(a, b, "<< operator (left shift)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
@@ -593,8 +618,7 @@ PrimExpr left_shift(PrimExpr a, PrimExpr b, Span span) {
 // bitwise and
 PrimExpr operator&(PrimExpr a, PrimExpr b) { return bitwise_and(a, b); }
 PrimExpr bitwise_and(PrimExpr a, PrimExpr b, Span span) {
-  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
-  ICHECK(b.dtype().is_int() || b.dtype().is_uint());
+  type_check_integer_args(a, b, "& operator (bitwise AND)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
@@ -606,8 +630,7 @@ PrimExpr bitwise_and(PrimExpr a, PrimExpr b, Span span) {
 // bitwise_or
 PrimExpr operator|(PrimExpr a, PrimExpr b) { return bitwise_or(a, b); }
 PrimExpr bitwise_or(PrimExpr a, PrimExpr b, Span span) {
-  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
-  ICHECK(b.dtype().is_int() || b.dtype().is_uint());
+  type_check_integer_args(a, b, "| operator (bitwise OR)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
@@ -619,8 +642,7 @@ PrimExpr bitwise_or(PrimExpr a, PrimExpr b, Span span) {
 // bitwise_xor
 PrimExpr operator^(PrimExpr a, PrimExpr b) { return bitwise_xor(a, b); }
 PrimExpr bitwise_xor(PrimExpr a, PrimExpr b, Span span) {
-  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
-  ICHECK(b.dtype().is_int() || b.dtype().is_uint());
+  type_check_integer_args(a, b, "^ operator (bitwise XOR)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
@@ -633,7 +655,7 @@ PrimExpr bitwise_xor(PrimExpr a, PrimExpr b, Span span) {
 PrimExpr operator~(PrimExpr a) { return bitwise_neg(a); }
 
 PrimExpr bitwise_neg(PrimExpr a, Span span) {
-  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
+  type_check_integer_args(a, "~ operator (bitwise NOT)");
   return tir::Call(a.dtype(), tir::builtin::bitwise_not(), {a}, span);
 }
 
@@ -728,7 +750,7 @@ PrimExpr sum(PrimExpr source, Array<IterVar> rdom, Array<PrimExpr> init, Span sp
 }
 
 PrimExpr all(PrimExpr source, Array<IterVar> rdom, Array<PrimExpr> init, Span span) {
-  ICHECK(source.dtype().is_bool());
+  type_check_boolean_args(source, "tvm::all");
   Var x("x", source.dtype(), span), y("y", source.dtype());
   PrimExpr result = tir::And(x, y, span);
   PrimExpr identity_element = make_const(source.dtype(), true, span);
@@ -737,7 +759,7 @@ PrimExpr all(PrimExpr source, Array<IterVar> rdom, Array<PrimExpr> init, Span sp
 }
 
 PrimExpr any(PrimExpr source, Array<IterVar> rdom, Array<PrimExpr> init, Span span) {
-  ICHECK(source.dtype().is_bool());
+  type_check_boolean_args(source, "tvm::any");
   Var x("x", source.dtype(), span), y("y", source.dtype(), span);
   PrimExpr result = tir::Or(x, y, span);
   PrimExpr identity_element = make_const(source.dtype(), false, span);

From 5287d8f11e28cf4953ca3b5638880397e7ceb48e Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 30 Aug 2022 11:51:12 -0700
Subject: [PATCH 070/704] [ci] Move non-task CI scripts into ci/ folder
 (#12609)

[CI] Update Hexagon image to install boost (#12613)

The new image has xgboost installed, which I need for https://github.com/apache/tvm/pull/12587

Validated in https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/ci-docker-staging/279/pipeline

Co-authored-by: masahi <masahi129@gmail.com>
---
 .github/ISSUE_TEMPLATE/flaky-test.md          |  2 +-
 .github/workflows/cc_bot.yml                  |  2 +-
 .github/workflows/docs_bot.yml                |  2 +-
 .github/workflows/nightly_docker_update.yml   |  2 +-
 .github/workflows/ping_reviewers.yml          |  2 +-
 .github/workflows/tag_teams.yml               |  2 +-
 .github/workflows/tests_bot.yml               |  2 +-
 .github/workflows/tvmbot.yml                  |  2 +-
 .../update_last_successful_branch.yml         |  2 +-
 Jenkinsfile                                   | 14 ++++++------
 ci/README.md                                  |  1 +
 ci/jenkins/Prepare.groovy.j2                  | 12 +++++-----
 {tests => ci}/scripts/cmd_utils.py            |  0
 .../scripts/determine_docker_images.py        |  0
 {tests => ci}/scripts/git_change_docker.sh    |  0
 {tests => ci}/scripts/git_change_docs.sh      |  0
 {tests => ci}/scripts/git_skip_ci.py          |  0
 {tests => ci}/scripts/git_skip_ci_globs.py    |  0
 {tests => ci}/scripts/git_utils.py            |  0
 {tests => ci}/scripts/github_cc_reviewers.py  |  0
 {tests => ci}/scripts/github_docs_comment.py  |  0
 .../scripts/github_skipped_tests_comment.py   |  0
 {tests => ci}/scripts/github_tag_teams.py     |  0
 {tests => ci}/scripts/github_tvmbot.py        |  0
 {tests => ci}/scripts/http_utils.py           |  0
 .../scripts/open_docker_update_pr.py          |  0
 {tests => ci}/scripts/ping_reviewers.py       |  0
 {tests => ci}/scripts/pytest_ids.py           |  0
 {tests => ci}/scripts/pytest_wrapper.py       |  0
 .../scripts/should_rebuild_docker.py          |  0
 .../scripts/should_run_slow_tests.py          |  0
 {tests => ci}/scripts/update_branch.py        |  0
 docker/bash.sh                                |  2 +-
 tests/python/ci/test_ci.py                    | 22 +++++++++----------
 tests/python/ci/test_tvmbot.py                |  2 +-
 tests/scripts/setup-pytest-env.sh             |  2 +-
 tests/scripts/task_build.py                   |  5 +++++
 tests/scripts/task_python_frontend.sh         |  2 +-
 38 files changed, 43 insertions(+), 37 deletions(-)
 rename {tests => ci}/scripts/cmd_utils.py (100%)
 rename {tests => ci}/scripts/determine_docker_images.py (100%)
 rename {tests => ci}/scripts/git_change_docker.sh (100%)
 rename {tests => ci}/scripts/git_change_docs.sh (100%)
 rename {tests => ci}/scripts/git_skip_ci.py (100%)
 rename {tests => ci}/scripts/git_skip_ci_globs.py (100%)
 rename {tests => ci}/scripts/git_utils.py (100%)
 rename {tests => ci}/scripts/github_cc_reviewers.py (100%)
 rename {tests => ci}/scripts/github_docs_comment.py (100%)
 rename {tests => ci}/scripts/github_skipped_tests_comment.py (100%)
 rename {tests => ci}/scripts/github_tag_teams.py (100%)
 rename {tests => ci}/scripts/github_tvmbot.py (100%)
 rename {tests => ci}/scripts/http_utils.py (100%)
 rename {tests => ci}/scripts/open_docker_update_pr.py (100%)
 rename {tests => ci}/scripts/ping_reviewers.py (100%)
 rename {tests => ci}/scripts/pytest_ids.py (100%)
 rename {tests => ci}/scripts/pytest_wrapper.py (100%)
 rename {tests => ci}/scripts/should_rebuild_docker.py (100%)
 rename {tests => ci}/scripts/should_run_slow_tests.py (100%)
 rename {tests => ci}/scripts/update_branch.py (100%)

diff --git a/.github/ISSUE_TEMPLATE/flaky-test.md b/.github/ISSUE_TEMPLATE/flaky-test.md
index 1d61bbb632a4..1e8d267f8ec1 100644
--- a/.github/ISSUE_TEMPLATE/flaky-test.md
+++ b/.github/ISSUE_TEMPLATE/flaky-test.md
@@ -7,7 +7,7 @@ labels: "test: flaky"
 
 Thanks for participating in the TVM community! We use https://discuss.tvm.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking. You are always welcomed to post on the forum first :smile_cat:
 
-These tests were found to be flaky (intermittently failing on `main` or failed in a PR with unrelated changes). As per [the docs](https://github.com/apache/tvm/blob/main/docs/contribute/ci.rst#handling-flaky-failures, these failures will be disabled in a PR that references this issue until the test owners can fix the source of the flakiness.
+These tests were found to be flaky (intermittently failing on `main` or failed in a PR with unrelated changes). As per [the docs](https://github.com/apache/tvm/blob/main/docs/contribute/ci.rst#handling-flaky-failures), these failures will be disabled in a PR that references this issue until the test owners can fix the source of the flakiness.
 
 ### Test(s)
 
diff --git a/.github/workflows/cc_bot.yml b/.github/workflows/cc_bot.yml
index ac0baa490222..95aa96426229 100644
--- a/.github/workflows/cc_bot.yml
+++ b/.github/workflows/cc_bot.yml
@@ -44,4 +44,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python tests/scripts/github_cc_reviewers.py || echo step failed
+          python ci/scripts/github_cc_reviewers.py || echo step failed
diff --git a/.github/workflows/docs_bot.yml b/.github/workflows/docs_bot.yml
index 9480a1176f15..73c12a8d7d05 100644
--- a/.github/workflows/docs_bot.yml
+++ b/.github/workflows/docs_bot.yml
@@ -15,4 +15,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python tests/scripts/github_docs_comment.py
\ No newline at end of file
+          python ci/scripts/github_docs_comment.py
\ No newline at end of file
diff --git a/.github/workflows/nightly_docker_update.yml b/.github/workflows/nightly_docker_update.yml
index 08945555af34..c2441807430f 100644
--- a/.github/workflows/nightly_docker_update.yml
+++ b/.github/workflows/nightly_docker_update.yml
@@ -28,4 +28,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python tests/scripts/open_docker_update_pr.py
+          python ci/scripts/open_docker_update_pr.py
diff --git a/.github/workflows/ping_reviewers.yml b/.github/workflows/ping_reviewers.yml
index 96c20434d9b5..a2e3e996a033 100644
--- a/.github/workflows/ping_reviewers.yml
+++ b/.github/workflows/ping_reviewers.yml
@@ -20,4 +20,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python tests/scripts/ping_reviewers.py --wait-time-minutes 10080 || echo failed
+          python ci/scripts/ping_reviewers.py --wait-time-minutes 10080 || echo failed
diff --git a/.github/workflows/tag_teams.yml b/.github/workflows/tag_teams.yml
index 2518cf87db5b..7c10f9c33d9f 100644
--- a/.github/workflows/tag_teams.yml
+++ b/.github/workflows/tag_teams.yml
@@ -45,4 +45,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python tests/scripts/github_tag_teams.py || echo failed
+          python ci/scripts/github_tag_teams.py || echo failed
diff --git a/.github/workflows/tests_bot.yml b/.github/workflows/tests_bot.yml
index e9d7d81375e4..0ddae2afb771 100644
--- a/.github/workflows/tests_bot.yml
+++ b/.github/workflows/tests_bot.yml
@@ -18,4 +18,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python tests/scripts/github_skipped_tests_comment.py
\ No newline at end of file
+          python ci/scripts/github_skipped_tests_comment.py
\ No newline at end of file
diff --git a/.github/workflows/tvmbot.yml b/.github/workflows/tvmbot.yml
index 87292ec211d1..23e90aed5329 100644
--- a/.github/workflows/tvmbot.yml
+++ b/.github/workflows/tvmbot.yml
@@ -34,4 +34,4 @@ jobs:
           RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
         run: |
           set -eux
-          python tests/scripts/github_tvmbot.py --pr "$PR_NUMBER" --run-url "$RUN_URL" --trigger-comment-json "$ISSUE_COMMENT"
+          python ci/scripts/github_tvmbot.py --pr "$PR_NUMBER" --run-url "$RUN_URL" --trigger-comment-json "$ISSUE_COMMENT"
diff --git a/.github/workflows/update_last_successful_branch.yml b/.github/workflows/update_last_successful_branch.yml
index fc2f2d0d4f2a..6635b9ef4c47 100644
--- a/.github/workflows/update_last_successful_branch.yml
+++ b/.github/workflows/update_last_successful_branch.yml
@@ -41,4 +41,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python tests/scripts/update_branch.py || echo step failed
+          python ci/scripts/update_branch.py || echo step failed
diff --git a/Jenkinsfile b/Jenkinsfile
index 3278e83098b7..1b615e38304c 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-08-26T15:09:39.104767
+// Generated at 2022-08-26T15:48:19.597592
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -230,7 +230,7 @@ def should_skip_slow_tests(pr_number) {
     // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
     result = sh (
       returnStatus: true,
-      script: "./tests/scripts/should_run_slow_tests.py --pr '${pr_number}'",
+      script: "./ci/scripts/should_run_slow_tests.py --pr '${pr_number}'",
       label: 'Check if CI should run slow tests',
     )
   }
@@ -255,7 +255,7 @@ def should_skip_ci(pr_number) {
   }
   glob_skip_ci_code = sh (
     returnStatus: true,
-    script: "./tests/scripts/git_skip_ci_globs.py",
+    script: "./ci/scripts/git_skip_ci_globs.py",
     label: 'Check if CI should be skipped due to changed files',
   )
   if (glob_skip_ci_code == 0) {
@@ -269,7 +269,7 @@ def should_skip_ci(pr_number) {
     // full CI just in case). Exit code of 0 means skip CI.
     git_skip_ci_code = sh (
       returnStatus: true,
-      script: "./tests/scripts/git_skip_ci.py --pr '${pr_number}'",
+      script: "./ci/scripts/git_skip_ci.py --pr '${pr_number}'",
       label: 'Check if CI should be skipped',
     )
   }
@@ -284,7 +284,7 @@ def prepare() {
 
         if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
           sh(
-            script: "./tests/scripts/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            script: "./ci/scripts/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
             label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
           )
           // Pull image names from the results of should_rebuild_docker.py
@@ -367,14 +367,14 @@ def prepare() {
 
         is_docs_only_build = sh (
           returnStatus: true,
-          script: './tests/scripts/git_change_docs.sh',
+          script: './ci/scripts/git_change_docs.sh',
           label: 'Check for docs only changes',
         )
         skip_ci = should_skip_ci(env.CHANGE_ID)
         skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
         rebuild_docker_images = sh (
           returnStatus: true,
-          script: './tests/scripts/git_change_docker.sh',
+          script: './ci/scripts/git_change_docker.sh',
           label: 'Check for any docker changes',
         )
 
diff --git a/ci/README.md b/ci/README.md
index 38995549236c..2cb915e70207 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -26,6 +26,7 @@ TVM project in a healthy state and preventing breakages. CI in TVM is broken int
  - The tests themselves, all of which live underneath [`tests`](../tests).
  - Definitions of test suites, with each suite defined as a separate `task_` script in
    [`tests/scripts`](../tests/scripts).
+ - Scripts and automation [`ci/scripts`](../ci/scripts).
  - The linux test sequence (in [`Jenkinsfile`](../Jenkinsfile)), which lints and builds TVM and runs test
    suites using Docker on Linux.
  - The Windows and Mac test sequences (in [`.github/actions`](../.github/actions)).
diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2
index 7115d39ffce3..404d2870c9e2 100644
--- a/ci/jenkins/Prepare.groovy.j2
+++ b/ci/jenkins/Prepare.groovy.j2
@@ -80,7 +80,7 @@ def should_skip_slow_tests(pr_number) {
     // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
     result = sh (
       returnStatus: true,
-      script: "./tests/scripts/should_run_slow_tests.py --pr '${pr_number}'",
+      script: "./ci/scripts/should_run_slow_tests.py --pr '${pr_number}'",
       label: 'Check if CI should run slow tests',
     )
   }
@@ -105,7 +105,7 @@ def should_skip_ci(pr_number) {
   }
   glob_skip_ci_code = sh (
     returnStatus: true,
-    script: "./tests/scripts/git_skip_ci_globs.py",
+    script: "./ci/scripts/git_skip_ci_globs.py",
     label: 'Check if CI should be skipped due to changed files',
   )
   if (glob_skip_ci_code == 0) {
@@ -119,7 +119,7 @@ def should_skip_ci(pr_number) {
     // full CI just in case). Exit code of 0 means skip CI.
     git_skip_ci_code = sh (
       returnStatus: true,
-      script: "./tests/scripts/git_skip_ci.py --pr '${pr_number}'",
+      script: "./ci/scripts/git_skip_ci.py --pr '${pr_number}'",
       label: 'Check if CI should be skipped',
     )
   }
@@ -134,7 +134,7 @@ def prepare() {
 
         if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
           sh(
-            script: "./tests/scripts/determine_docker_images.py {% for image in images %}{{ image.name }}={% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %} {% endfor %}",
+            script: "./ci/scripts/determine_docker_images.py {% for image in images %}{{ image.name }}={% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %} {% endfor %}",
             label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
           )
           // Pull image names from the results of should_rebuild_docker.py
@@ -160,14 +160,14 @@ def prepare() {
 
         is_docs_only_build = sh (
           returnStatus: true,
-          script: './tests/scripts/git_change_docs.sh',
+          script: './ci/scripts/git_change_docs.sh',
           label: 'Check for docs only changes',
         )
         skip_ci = should_skip_ci(env.CHANGE_ID)
         skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
         rebuild_docker_images = sh (
           returnStatus: true,
-          script: './tests/scripts/git_change_docker.sh',
+          script: './ci/scripts/git_change_docker.sh',
           label: 'Check for any docker changes',
         )
 
diff --git a/tests/scripts/cmd_utils.py b/ci/scripts/cmd_utils.py
similarity index 100%
rename from tests/scripts/cmd_utils.py
rename to ci/scripts/cmd_utils.py
diff --git a/tests/scripts/determine_docker_images.py b/ci/scripts/determine_docker_images.py
similarity index 100%
rename from tests/scripts/determine_docker_images.py
rename to ci/scripts/determine_docker_images.py
diff --git a/tests/scripts/git_change_docker.sh b/ci/scripts/git_change_docker.sh
similarity index 100%
rename from tests/scripts/git_change_docker.sh
rename to ci/scripts/git_change_docker.sh
diff --git a/tests/scripts/git_change_docs.sh b/ci/scripts/git_change_docs.sh
similarity index 100%
rename from tests/scripts/git_change_docs.sh
rename to ci/scripts/git_change_docs.sh
diff --git a/tests/scripts/git_skip_ci.py b/ci/scripts/git_skip_ci.py
similarity index 100%
rename from tests/scripts/git_skip_ci.py
rename to ci/scripts/git_skip_ci.py
diff --git a/tests/scripts/git_skip_ci_globs.py b/ci/scripts/git_skip_ci_globs.py
similarity index 100%
rename from tests/scripts/git_skip_ci_globs.py
rename to ci/scripts/git_skip_ci_globs.py
diff --git a/tests/scripts/git_utils.py b/ci/scripts/git_utils.py
similarity index 100%
rename from tests/scripts/git_utils.py
rename to ci/scripts/git_utils.py
diff --git a/tests/scripts/github_cc_reviewers.py b/ci/scripts/github_cc_reviewers.py
similarity index 100%
rename from tests/scripts/github_cc_reviewers.py
rename to ci/scripts/github_cc_reviewers.py
diff --git a/tests/scripts/github_docs_comment.py b/ci/scripts/github_docs_comment.py
similarity index 100%
rename from tests/scripts/github_docs_comment.py
rename to ci/scripts/github_docs_comment.py
diff --git a/tests/scripts/github_skipped_tests_comment.py b/ci/scripts/github_skipped_tests_comment.py
similarity index 100%
rename from tests/scripts/github_skipped_tests_comment.py
rename to ci/scripts/github_skipped_tests_comment.py
diff --git a/tests/scripts/github_tag_teams.py b/ci/scripts/github_tag_teams.py
similarity index 100%
rename from tests/scripts/github_tag_teams.py
rename to ci/scripts/github_tag_teams.py
diff --git a/tests/scripts/github_tvmbot.py b/ci/scripts/github_tvmbot.py
similarity index 100%
rename from tests/scripts/github_tvmbot.py
rename to ci/scripts/github_tvmbot.py
diff --git a/tests/scripts/http_utils.py b/ci/scripts/http_utils.py
similarity index 100%
rename from tests/scripts/http_utils.py
rename to ci/scripts/http_utils.py
diff --git a/tests/scripts/open_docker_update_pr.py b/ci/scripts/open_docker_update_pr.py
similarity index 100%
rename from tests/scripts/open_docker_update_pr.py
rename to ci/scripts/open_docker_update_pr.py
diff --git a/tests/scripts/ping_reviewers.py b/ci/scripts/ping_reviewers.py
similarity index 100%
rename from tests/scripts/ping_reviewers.py
rename to ci/scripts/ping_reviewers.py
diff --git a/tests/scripts/pytest_ids.py b/ci/scripts/pytest_ids.py
similarity index 100%
rename from tests/scripts/pytest_ids.py
rename to ci/scripts/pytest_ids.py
diff --git a/tests/scripts/pytest_wrapper.py b/ci/scripts/pytest_wrapper.py
similarity index 100%
rename from tests/scripts/pytest_wrapper.py
rename to ci/scripts/pytest_wrapper.py
diff --git a/tests/scripts/should_rebuild_docker.py b/ci/scripts/should_rebuild_docker.py
similarity index 100%
rename from tests/scripts/should_rebuild_docker.py
rename to ci/scripts/should_rebuild_docker.py
diff --git a/tests/scripts/should_run_slow_tests.py b/ci/scripts/should_run_slow_tests.py
similarity index 100%
rename from tests/scripts/should_run_slow_tests.py
rename to ci/scripts/should_run_slow_tests.py
diff --git a/tests/scripts/update_branch.py b/ci/scripts/update_branch.py
similarity index 100%
rename from tests/scripts/update_branch.py
rename to ci/scripts/update_branch.py
diff --git a/docker/bash.sh b/docker/bash.sh
index 62b71ba3539e..10d80478d3f7 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -295,7 +295,7 @@ if [ -n "${EXPANDED_SHORTCUT}" ]; then
     if [ "${CI+x}" == "x" ]; then
         DOCKER_IMAGE_NAME="${EXPANDED_SHORTCUT}"
     else
-        python3 tests/scripts/determine_docker_images.py "$DOCKER_IMAGE_NAME=$EXPANDED_SHORTCUT" 2> /dev/null
+        python3 ci/scripts/determine_docker_images.py "$DOCKER_IMAGE_NAME=$EXPANDED_SHORTCUT" 2> /dev/null
         DOCKER_IMAGE_NAME=$(cat ".docker-image-names/$DOCKER_IMAGE_NAME")
         if [[ "$DOCKER_IMAGE_NAME" == *"tlcpackstaging"* ]]; then
             echo "WARNING: resolved docker image to fallback tag in tlcpackstaging" >&2
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index c45a0d8d8ee0..0939aae10ab5 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -172,7 +172,7 @@ def test_skipped_tests_comment(
     """
     Test that a comment with a link to the docs is successfully left on PRs
     """
-    skipped_tests_script = REPO_ROOT / "tests" / "scripts" / "github_skipped_tests_comment.py"
+    skipped_tests_script = REPO_ROOT / "ci" / "scripts" / "github_skipped_tests_comment.py"
 
     def write_xml_file(root_dir, xml_file, xml_content):
         shutil.rmtree(root_dir, ignore_errors=True)
@@ -232,7 +232,7 @@ def test_docs_comment(
     """
     Test that a comment with a link to the docs is successfully left on PRs
     """
-    docs_comment_script = REPO_ROOT / "tests" / "scripts" / "github_docs_comment.py"
+    docs_comment_script = REPO_ROOT / "ci" / "scripts" / "github_docs_comment.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
     git.run("init")
@@ -258,7 +258,7 @@ def test_cc_reviewers(tmpdir_factory):
     """
     Test that reviewers are added from 'cc @someone' messages in PRs
     """
-    reviewers_script = REPO_ROOT / "tests" / "scripts" / "github_cc_reviewers.py"
+    reviewers_script = REPO_ROOT / "ci" / "scripts" / "github_cc_reviewers.py"
 
     def run(pr_body, requested_reviewers, existing_review_users, expected_reviewers):
         git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
@@ -335,7 +335,7 @@ def test_update_branch(tmpdir_factory):
     """
     Test that the last-successful branch script updates successfully
     """
-    update_script = REPO_ROOT / "tests" / "scripts" / "update_branch.py"
+    update_script = REPO_ROOT / "ci" / "scripts" / "update_branch.py"
 
     def run(statuses, expected_rc, expected_output):
         git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
@@ -515,7 +515,7 @@ def test_skip_ci(tmpdir_factory, commands, should_skip, pr_title, why):
     """
     Test that CI is skipped when it should be
     """
-    skip_ci_script = REPO_ROOT / "tests" / "scripts" / "git_skip_ci.py"
+    skip_ci_script = REPO_ROOT / "ci" / "scripts" / "git_skip_ci.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
     # Jenkins git is too old and doesn't have 'git init --initial-branch'
@@ -548,7 +548,7 @@ def test_skip_globs(tmpdir_factory):
     """
     Test that CI is skipped if only certain files are edited
     """
-    script = REPO_ROOT / "tests" / "scripts" / "git_skip_ci_globs.py"
+    script = REPO_ROOT / "ci" / "scripts" / "git_skip_ci_globs.py"
 
     def run(files, should_skip):
         git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
@@ -587,7 +587,7 @@ def test_ping_reviewers(tmpdir_factory):
     """
     Test that reviewers are messaged after a time period of inactivity
     """
-    reviewers_script = REPO_ROOT / "tests" / "scripts" / "ping_reviewers.py"
+    reviewers_script = REPO_ROOT / "ci" / "scripts" / "ping_reviewers.py"
 
     def run(pull_request, check):
         git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
@@ -744,7 +744,7 @@ def test_github_tag_teams(tmpdir_factory):
     """
     Check that individuals are tagged from team headers
     """
-    tag_script = REPO_ROOT / "tests" / "scripts" / "github_tag_teams.py"
+    tag_script = REPO_ROOT / "ci" / "scripts" / "github_tag_teams.py"
 
     def run(source_type, data, check):
         git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
@@ -1081,7 +1081,7 @@ def test_open_docker_update_pr(
     tmpdir_factory, tlcpackstaging_body, tlcpack_body, expected, expected_images
 ):
     """Test workflow to open a PR to update Docker images"""
-    tag_script = REPO_ROOT / "tests" / "scripts" / "open_docker_update_pr.py"
+    tag_script = REPO_ROOT / "ci" / "scripts" / "open_docker_update_pr.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
     git.run("init")
@@ -1152,7 +1152,7 @@ def test_open_docker_update_pr(
 )
 def test_determine_docker_images(tmpdir_factory, images, expected):
     """Test script to decide whether to use tlcpack or tlcpackstaging for images"""
-    tag_script = REPO_ROOT / "tests" / "scripts" / "determine_docker_images.py"
+    tag_script = REPO_ROOT / "ci" / "scripts" / "determine_docker_images.py"
 
     git_dir = tmpdir_factory.mktemp("tmp_git_dir")
 
@@ -1219,7 +1219,7 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec
     """
     Check that the Docker images are built when necessary
     """
-    tag_script = REPO_ROOT / "tests" / "scripts" / "should_rebuild_docker.py"
+    tag_script = REPO_ROOT / "ci" / "scripts" / "should_rebuild_docker.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
     git.run("init")
diff --git a/tests/python/ci/test_tvmbot.py b/tests/python/ci/test_tvmbot.py
index 9568a0469bb0..2c7a0eaec0d4 100644
--- a/tests/python/ci/test_tvmbot.py
+++ b/tests/python/ci/test_tvmbot.py
@@ -147,7 +147,7 @@ def test_tvmbot(tmpdir_factory, number, filename, expected, comment, user, detai
     """
     Test the mergebot test cases
     """
-    mergebot_script = REPO_ROOT / "tests" / "scripts" / "github_tvmbot.py"
+    mergebot_script = REPO_ROOT / "ci" / "scripts" / "github_tvmbot.py"
     test_json_dir = Path(__file__).resolve().parent / "sample_prs"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh
index d6c49a42819a..895979293122 100755
--- a/tests/scripts/setup-pytest-env.sh
+++ b/tests/scripts/setup-pytest-env.sh
@@ -39,7 +39,7 @@ function cleanup() {
     set +x
     if [ "${#pytest_errors[@]}" -gt 0 ]; then
         echo "These pytest invocations failed, the results can be found in the Jenkins 'Tests' tab or by scrolling up through the raw logs here."
-        python3 tests/scripts/pytest_wrapper.py "${pytest_errors[@]}"
+        python3 ci/scripts/pytest_wrapper.py "${pytest_errors[@]}"
         exit 1
     fi
     set -x
diff --git a/tests/scripts/task_build.py b/tests/scripts/task_build.py
index e4583fe6af04..1a8a1d112fc0 100755
--- a/tests/scripts/task_build.py
+++ b/tests/scripts/task_build.py
@@ -19,9 +19,14 @@
 import shutil
 import os
 import logging
+import sys
 import multiprocessing
 
 from pathlib import Path
+
+# Hackery to enable importing of utils from ci/scripts
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.append(str(REPO_ROOT / "ci" / "scripts"))
 from cmd_utils import Sh, init_log, REPO_ROOT
 
 
diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh
index 2c7e34fac592..61d7238a594b 100755
--- a/tests/scripts/task_python_frontend.sh
+++ b/tests/scripts/task_python_frontend.sh
@@ -42,7 +42,7 @@ run_pytest cython python-frontend-pytorch tests/python/frontend/pytorch
 
 echo "Running relay Tensorflow frontend test..."
 # Note: Tensorflow tests often have memory issues, so invoke each one separately
-TENSORFLOW_TESTS=$(./tests/scripts/pytest_ids.py --folder tests/python/frontend/tensorflow)
+TENSORFLOW_TESTS=$(./ci/scripts/pytest_ids.py --folder tests/python/frontend/tensorflow)
 i=0
 for node_id in $TENSORFLOW_TESTS; do
     echo "$node_id"

From 58ee935a53893bfd47b9cd7ea4738ecec8d7181e Mon Sep 17 00:00:00 2001
From: Yuanjing Shi <yuanjing@octoml.ai>
Date: Tue, 30 Aug 2022 08:51:53 -1000
Subject: [PATCH 071/704] [TVMScript] support float inf, -inf and nan in
 TVMScript parser and printer (#12618)

* support float inf, -inf and nan in TVMScript parser and printer

* address comment and fix lint

* use type_extensions.Literal

* address comments

* fix win build

* remove template
---
 python/tvm/script/tir/__init__.pyi            | 18 +++++++--
 python/tvm/script/tir/intrin.py               |  4 ++
 src/printer/tvmscript_printer.cc              | 37 +++++++++++++++----
 .../unittest/test_tvmscript_roundtrip.py      | 22 +++++++++++
 4 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/python/tvm/script/tir/__init__.pyi b/python/tvm/script/tir/__init__.pyi
index a62fb102bec5..a64eed055ae8 100644
--- a/python/tvm/script/tir/__init__.pyi
+++ b/python/tvm/script/tir/__init__.pyi
@@ -464,14 +464,24 @@ class uint32(PrimExpr):
 class uint64(PrimExpr):
     def __init__(self: uint64, imm: Union[PrimExpr, int]): ...
 
+# use typing.Literal instead for python 3.8 or higher
+import sys
+
+if sys.version_info >= (3, 8):
+    from typing import Literal
+
+    SpecialFloatLiteral = Literal["inf", "-inf", "nan"]
+else:
+    SpecialFloatLiteral = str
+
 class float8(PrimExpr):
-    def __init__(self: float8, imm: Union[PrimExpr, int, float]): ...
+    def __init__(self: float8, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ...
 
 class float16(PrimExpr):
-    def __init__(self: float16, imm: Union[PrimExpr, int, float]): ...
+    def __init__(self: float16, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ...
 
 class float32(PrimExpr):
-    def __init__(self: float32, imm: Union[PrimExpr, int, float]): ...
+    def __init__(self: float32, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ...
 
 class float64(PrimExpr):
-    def __init__(self: float64, imm: Union[PrimExpr, int, float]): ...
+    def __init__(self: float64, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ...
diff --git a/python/tvm/script/tir/intrin.py b/python/tvm/script/tir/intrin.py
index 382431c2296a..f3919afe5a24 100644
--- a/python/tvm/script/tir/intrin.py
+++ b/python/tvm/script/tir/intrin.py
@@ -20,6 +20,7 @@
 from typing import List, Any
 
 import tvm.tir
+from tvm.tir import FloatImm
 from ..registry import register
 from ...target import codegen
 from ..utils import get_param_list, tvm_span_from_synr
@@ -51,6 +52,9 @@ def bool(imm, span):
             # nest closures so we copy the name string
             def wrap(name):
                 def f(imm, span):
+                    if name.startswith("float"):
+                        if imm in {"inf", "-inf", "nan"}:
+                            return FloatImm(dtype=name, value=float(imm), span=span)
                     return imm.astype(name, span)
 
                 f.__name__ = name
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index 7649b6101919..f5300e1e6985 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -381,18 +381,16 @@ class TVMScriptPrinter : public StmtFunctor<Doc(const Stmt&)>,
   }
 
   /*!
-   * \brief special method to print out const scalar
+   * \brief special method to print out const int64_t scalar
    * \param dtype The data type
    * \param data The pointer to hold the data.
    */
-  template <typename T>
-  Doc PrintConstScalar(DataType dtype, const T* data) const {
+  Doc PrintConstScalar(DataType dtype, const int64_t* data) const {
     Doc doc;
     std::ostringstream os;
-    if (dtype.is_float() || dtype.is_float16() || dtype.is_bfloat16()) {
-      os.precision(17);
-    }
+
     os << data[0];
+
     if (dtype == DataType::Int(32)) {
       doc << Doc::Text(os.str());
     } else if (dtype == DataType::Bool()) {
@@ -404,6 +402,29 @@ class TVMScriptPrinter : public StmtFunctor<Doc(const Stmt&)>,
     return doc;
   }
 
+  /*!
+   * \brief special method to print out const double scalar
+   * \param dtype The data type
+   * \param data The pointer to hold the data.
+   * \note this overriden function is created as std::isnan of msvc will complain about int64_t
+   */
+  Doc PrintConstScalar(DataType dtype, const double* data) const {
+    Doc doc;
+    std::ostringstream os;
+
+    os.precision(17);
+    if (std::isinf(data[0]) || std::isnan(data[0])) {
+      os << "\"" << data[0] << "\"";
+    } else {
+      os << data[0];
+    }
+
+    doc << tir_prefix_ << "." << runtime::DLDataType2String(dtype) << "(" << Doc::Text(os.str())
+        << ")";
+
+    return doc;
+  }
+
  public:
   static Doc PrintHeader(const std::string& tir_prefix) {
     Doc header;
@@ -731,12 +752,12 @@ Doc TVMScriptPrinter::VisitStmtDefault_(const Object* op) {
 
 Doc TVMScriptPrinter::VisitExpr_(const IntImmNode* op, ExprPrecedence* out_precedence) {
   *out_precedence = ExprPrecedence::kIdentity;
-  return PrintConstScalar<int64_t>(op->dtype, &(op->value));
+  return PrintConstScalar(op->dtype, &(op->value));
 }
 
 Doc TVMScriptPrinter::VisitExpr_(const FloatImmNode* op, ExprPrecedence* out_precedence) {
   *out_precedence = ExprPrecedence::kIdentity;
-  return PrintConstScalar<double>(op->dtype, &(op->value));
+  return PrintConstScalar(op->dtype, &(op->value));
 }
 
 Doc TVMScriptPrinter::VisitExpr_(const StringImmNode* op, ExprPrecedence* out_precedence) {
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index e98f5057d8c4..45ea88f829ec 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3313,6 +3313,27 @@ def func(A: T.Buffer[(16, 16), "float32"], B: T.Buffer[(16, 16), "float32"]) ->
     return func
 
 
+def float_infinity():
+    @T.prim_func
+    def func(
+        placeholder: T.Buffer[(1, 512, 768), "float32"], T_isinf: T.Buffer[(1, 512, 768), "bool"]
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        for i0, i1, i2 in T.grid(1, 512, 768):
+            with T.block("T_isinf"):
+                ax0, ax1, ax2 = T.axis.remap("SSS", [i0, i1, i2])
+                T.reads(placeholder[ax0, ax1, ax2])
+                T.writes(T_isinf[ax0, ax1, ax2])
+                T_isinf[ax0, ax1, ax2] = T.fabs(
+                    placeholder[ax0, ax1, ax2], dtype="float32"
+                ) == T.float32("inf") and not (T.isnan(placeholder[ax0, ax1, ax2], dtype="bool"))
+
+    return func
+
+
 ir_generator = tvm.testing.parameter(
     opt_gemm_normalize,
     opt_gemm_lower,
@@ -3353,6 +3374,7 @@ def func(A: T.Buffer[(16, 16), "float32"], B: T.Buffer[(16, 16), "float32"]) ->
     let_expression,
     void_ptr,
     decl_buffer,
+    float_infinity,
 )
 
 
From b44f1343a10ccc908de5e65b864012c72d564a7b Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Tue, 30 Aug 2022 12:48:43 -0700
Subject: [PATCH 072/704] [microTVM][ARM-DSP] Fix pool schedule  (#12653)

When I built keyword spotting ONNX model, there was an issue with the pool schedule because certain schedules like broadcast and elemwise do not have input tensors.
---
 python/tvm/topi/arm_cpu/mprofile/dsp/pool.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py
index 99470a28530a..441683112447 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py
@@ -105,8 +105,8 @@ def pool_dsp_schedule(outs, layout):
     s = te.create_schedule([x.op for x in outs])
 
     def _callback(op):
-        in_dtype = op.input_tensors[0].dtype
         if "pool_max" in op.tag:
+            in_dtype = op.input_tensors[0].dtype
             if in_dtype != "int8":
                 logger.warning("Does not have micro-kernel for %s maxpool.", in_dtype)
             elif layout == "NWC":
@@ -114,6 +114,7 @@ def _callback(op):
             elif layout == "NHWC":
                 schedule_maxpool_2d_nhwc(s, op)
         elif "pool_sum" in op.tag:
+            in_dtype = op.input_tensors[0].dtype
             if in_dtype != "int16":
                 logger.warning("Does not have micro-kernel for %s avgpool.", in_dtype)
             elif layout == "NCW":

From d421e32f1a3be11a908f897118deee018e309d97 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Tue, 30 Aug 2022 13:40:18 -0700
Subject: [PATCH 073/704] [microTVM]Fix test util functions (#12641)

* Fix test utils
* Update python/tvm/micro/testing/utils.py

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>
---
 python/tvm/micro/testing/evaluation.py | 4 +++-
 python/tvm/micro/testing/utils.py      | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/python/tvm/micro/testing/evaluation.py b/python/tvm/micro/testing/evaluation.py
index 32de1d2a370d..c8a90ff5b40f 100644
--- a/python/tvm/micro/testing/evaluation.py
+++ b/python/tvm/micro/testing/evaluation.py
@@ -22,6 +22,7 @@
 
 """
 
+import logging
 from io import StringIO
 from pathlib import Path
 from contextlib import ExitStack
@@ -151,7 +152,8 @@ def predict_labels_aot(session, aot_executor, input_data, runs_per_sample=1):
     assert aot_executor.get_num_outputs() == 1
     assert runs_per_sample > 0
 
-    for sample in input_data:
+    for counter, sample in enumerate(input_data):
+        logging.info("Evaluating sample %d", counter)
         aot_executor.get_input(0).copyfrom(sample)
         result = aot_executor.module.time_evaluator("run", session.device, number=runs_per_sample)()
         predicted_label = aot_executor.get_output(0).numpy().argmax()
diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py
index 794f443e47a6..097fbf283a58 100644
--- a/python/tvm/micro/testing/utils.py
+++ b/python/tvm/micro/testing/utils.py
@@ -45,12 +45,12 @@ def get_supported_boards(platform: str):
         return json.load(f)
 
 
-def get_target(platform: str, board: str):
-    """Intentionally simple function for making target strings for microcontrollers.
+def get_target(platform: str, board: str) -> tvm.target.Target:
+    """Intentionally simple function for making Targets for microcontrollers.
     If you need more complex arguments, one should call target.micro directly. Note
     that almost all, but not all, supported microcontrollers are Arm-based."""
     model = get_supported_boards(platform)[board]["model"]
-    return str(tvm.target.target.micro(model, options=["-device=arm_cpu"]))
+    return tvm.target.target.micro(model, options=["-device=arm_cpu"])
 
 
 def check_tune_log(log_path: Union[Path, str]):

From 1c32798a2c21fd0db6b0e8c938abee4666163bbd Mon Sep 17 00:00:00 2001
From: Adam Straw <astraw@octoml.ai>
Date: Tue, 30 Aug 2022 13:56:04 -0700
Subject: [PATCH 074/704] [Hexagon] Expose gtest output through runtime
 exception (#12502)

Expose Hexagon gtest output in CI by raising it as a runtime exception rather than printing it to stdout.
---
 tests/python/contrib/test_hexagon/test_run_unit_tests.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_run_unit_tests.py b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
index fd75775a0115..24c9f33a8ecb 100644
--- a/tests/python/contrib/test_hexagon/test_run_unit_tests.py
+++ b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
@@ -17,8 +17,6 @@
 
 """ capture gtest output and return over FFI """
 
-import numpy as np
-
 import tvm
 from tvm.contrib.hexagon.session import Session
 
@@ -46,4 +44,7 @@ def test_run_unit_tests(hexagon_session: Session, gtest_args):
     gtest_error_code = int(gtest_error_code_and_output.splitlines()[0])
     gtest_output = gtest_error_code_and_output.split("\n", 1)[-1]
     print(gtest_output)
-    np.testing.assert_equal(gtest_error_code, 0)
+    if gtest_error_code != 0:
+        raise RuntimeError(
+            f"Hexagon gtest retruned non-zero error code = {gtest_error_code}:\n{gtest_output}"
+        )

From 775520c8f3dede1d2b3fb0d34e80ff874b35a99b Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Tue, 30 Aug 2022 15:10:54 -0700
Subject: [PATCH 075/704] [microTVM][Zephyr] Add missing CMSIS-NN source files
 to cmake file (#12642)

This PR adds missing CMSIS-NN source files to Zephyr cmake template file for models like keyword spotting, anomaly detection, VWW and image classification.
---
 .../zephyr/template_project/CMakeLists.txt.template         | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
index b5182bf8ac1f..bbd975315e88 100644
--- a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
+++ b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
@@ -32,11 +32,15 @@ if(${ENABLE_CMSIS})
   set(CMSIS_PATH <CMSIS_PATH>)
 
   file(GLOB_RECURSE cmsis_lib_srcs
-    ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/*.c
+    ${CMSIS_PATH}/CMSIS/NN/Source/ActivationFunctions/*.c
+    ${CMSIS_PATH}/CMSIS/NN/Source/BasicMathFunctions/*.c
+    ${CMSIS_PATH}/CMSIS/NN/Source/ConcatenationFunctions/*.c
     ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/*.c
     ${CMSIS_PATH}/CMSIS/NN/Source/FullyConnectedFunctions/*.c
     ${CMSIS_PATH}/CMSIS/NN/Source/NNSupportFunctions/*.c
     ${CMSIS_PATH}/CMSIS/NN/Source/PoolingFunctions/*.c
+    ${CMSIS_PATH}/CMSIS/NN/Source/ReshapeFunctions/*.c
+    ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/*.c
   )
 
   set(cmsis_includes

From caf326fab2963dac8fe03d266ea33d323f4b4470 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 30 Aug 2022 15:19:27 -0700
Subject: [PATCH 076/704] [ci] Add mechanism for trust on certain CI scripts
 (#12604)

This makes it so changes to certain files from users not listed in
`CONTRIBUTING.md` are not tested in CI. This is necessary since these
scripts run on the baremetal EC2 instances and not inside Docker
containers, so they can affect other builds and potentially grab Jenkins
secrets. This checks out the version from the upstream for the listed
files after running `git checkout`. Tested in CI: [positive](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-12604/6/pipeline/) and [negative](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-12604/9/pipeline/)
---
 Jenkinsfile                  | 27 ++++++++++++++++++++++++++-
 ci/jenkins/Prepare.groovy.j2 | 25 +++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 1b615e38304c..50eee01fa974 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-08-26T15:48:19.597592
+// Generated at 2022-08-30T11:58:06.036509
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -169,6 +169,7 @@ def init_git() {
     """,
     label: 'Update git submodules',
   )
+  checkout_trusted_files()
 }
 
 def docker_init(image) {
@@ -248,6 +249,30 @@ def cancel_previous_build() {
   }
 }
 
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (!env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ci/scripts/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
 def should_skip_ci(pr_number) {
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
     // never skip CI on build sourced from a branch
diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2
index 404d2870c9e2..94575a7b4b64 100644
--- a/ci/jenkins/Prepare.groovy.j2
+++ b/ci/jenkins/Prepare.groovy.j2
@@ -38,6 +38,7 @@ def init_git() {
     """,
     label: 'Update git submodules',
   )
+  checkout_trusted_files()
 }
 
 def docker_init(image) {
@@ -98,6 +99,30 @@ def cancel_previous_build() {
   }
 }
 
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (!env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ci/scripts/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
 def should_skip_ci(pr_number) {
   if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
     // never skip CI on build sourced from a branch

From f7cc992a9812872396bf5d42cc70461c3bd7e81f Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Tue, 30 Aug 2022 20:09:15 -0700
Subject: [PATCH 077/704] [MetaSchedule] Complete NCHW Conv2D Winograd Kernel
 Scheduling (#12648)

* Complete winograd scheduling.

* Fix test.
---
 python/tvm/topi/cuda/conv2d_winograd.py       |  1 +
 src/meta_schedule/schedule_rule/winograd.cc   | 29 ++++++++++++++++++-
 .../unittest/test_meta_schedule_space_cuda.py | 13 +++++++--
 3 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/python/tvm/topi/cuda/conv2d_winograd.py b/python/tvm/topi/cuda/conv2d_winograd.py
index f5e6cd88a5e3..239d05844b40 100644
--- a/python/tvm/topi/cuda/conv2d_winograd.py
+++ b/python/tvm/topi/cuda/conv2d_winograd.py
@@ -104,6 +104,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_
                 kernel[co][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]
             ),
             name="kernel_pack",
+            attrs={"schedule_rule": "meta_schedule.winograd_kernel_pack.nchw.cuda"},
         )
     else:
         kernel_pack = kernel
diff --git a/src/meta_schedule/schedule_rule/winograd.cc b/src/meta_schedule/schedule_rule/winograd.cc
index 8ae8118731dd..22e2300d63b6 100644
--- a/src/meta_schedule/schedule_rule/winograd.cc
+++ b/src/meta_schedule/schedule_rule/winograd.cc
@@ -185,6 +185,32 @@ TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse.nchw.cuda")
       return {sch};
     });
 
+TVM_REGISTER_GLOBAL("meta_schedule.winograd_kernel_pack.nchw.cuda")
+    .set_body_typed([](Schedule sch, BlockRV kernel_pack) -> Array<Schedule> {
+      Array<LoopRV> loops = sch->GetLoops(kernel_pack);
+      ICHECK_EQ(loops.size(), 6);
+      if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[0]))) {
+        if (*i <= 16) {
+          sch->Unroll(loops[0]);
+        }
+      }
+      if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[1]))) {
+        if (*i <= 16) {
+          sch->Unroll(loops[1]);
+        }
+      }
+      sch->Unroll(loops[4]);
+      sch->Unroll(loops[5]);
+
+      LoopRV fused = sch->Fuse({loops[2], loops[3]});
+
+      int64_t max_threadblocks = 256;
+      int64_t max_threads_per_block = 1024;
+      auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024});
+      BindBlockThreadIdx(sch, kernel_pack, max_threadblocks, max_threads_per_block, get_factor);
+      return {sch};
+    });
+
 TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.cuda")
     .set_body_typed([](Schedule sch, BlockRV data_pack) -> Array<Schedule> {
       BlockRV input_tile = GetOnlyProducer(sch, data_pack);
@@ -206,9 +232,10 @@ TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.nchw.cuda")
       BlockRV data_pad = GetOnlyProducer(sch, input_tile);
 
       BlockRV data_l = sch->CacheWrite(data_pack, /*buffer_index=*/0, /*storage_scope=*/"local");
+      BlockRV d = sch->CacheRead(data_pack, /*buffer_index=*/0, /*storage_scope=*/"local");
       LoopRV loop = ScheduleDataPackNCHW(sch, data_pack);
       sch->ReverseComputeAt(data_l, loop, /*preserve_unit_loops=*/true);
-      sch->ComputeAt(input_tile, /*loop_rv=*/loop, /*preserve_unit_loops=*/true);
+      sch->ComputeAt(d, /*loop_rv=*/loop, /*preserve_unit_loops=*/true);
       sch->ComputeInline(data_pad);
 
       int64_t max_threadblocks = 256;
diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py
index ce333887ec83..ffa2b57ba8ec 100644
--- a/tests/python/unittest/test_meta_schedule_space_cuda.py
+++ b/tests/python/unittest/test_meta_schedule_space_cuda.py
@@ -1338,11 +1338,20 @@ def winograd_nchw_conv2d(data: T.Buffer[(1, 64, 224, 224), "float32"], kernel: T
             bgemm = T.alloc_buffer([6, 6, 64, 3136], dtype="float32")
             inverse_local = T.alloc_buffer([64, 3136, 4, 4], dtype="float32", scope="local")
             data_pack_local = T.alloc_buffer([6, 6, 64, 3136], dtype="float32", scope="local")
+            d_local = T.alloc_buffer([64, 3136, 6, 6], dtype="float32", scope="local")
             bgemm_local = T.alloc_buffer([6, 6, 64, 3136], dtype="float32", scope="local")
             kernel_shared = T.alloc_buffer([6, 6, 64, 64], dtype="float32", scope="shared")
             data_pack_shared = T.alloc_buffer([6, 6, 64, 3136], dtype="float32", scope="shared")
             for i2_i3_0_fused_i3_1_fused_0 in T.thread_binding(3136, thread="blockIdx.x"):
                 for i2_i3_0_fused_i3_1_fused_1 in T.thread_binding(64, thread="threadIdx.x"):
+                    for ax0, ax1, ax2, ax3 in T.grid(1, 1, 6, 6):
+                        with T.block("d_local"):
+                            v0 = T.axis.spatial(64, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) // 3136 + ax0)
+                            v1 = T.axis.spatial(3136, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 3136 // 7 * 7 + (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 7 + ax1)
+                            v2, v3 = T.axis.remap("SS", [ax2, ax3])
+                            T.reads(data[v1 // 3136, v0, v1 % 3136 // 56 * 4 + v2 - 1, v1 % 56 * 4 + v3 - 1])
+                            T.writes(d_local[v0, v1, v2, v3])
+                            d_local[v0, v1, v2, v3] = T.if_then_else(1 <= v1 % 3136 // 56 * 4 + v2 and v1 % 3136 // 56 * 4 + v2 < 225 and 1 <= v1 % 56 * 4 + v3 and v1 % 56 * 4 + v3 < 225, data[v1 // 3136, v0, v1 % 3136 // 56 * 4 + v2 - 1, v1 % 56 * 4 + v3 - 1], T.float32(0), dtype="float32")
                     for i0 in T.unroll(6):
                         for i1 in T.unroll(6):
                             for i4 in T.unroll(6):
@@ -1352,12 +1361,12 @@ def winograd_nchw_conv2d(data: T.Buffer[(1, 64, 224, 224), "float32"], kernel: T
                                         ci = T.axis.spatial(64, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) // 3136)
                                         p = T.axis.spatial(3136, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 3136 // 7 * 7 + (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 7)
                                         r_a, r_a_1 = T.axis.remap("RR", [i4, i5])
-                                        T.reads(data[p // 3136, ci, p % 3136 // 56 * 4 + r_a - 1, p % 56 * 4 + r_a_1 - 1])
+                                        T.reads(d_local[ci, p, r_a, r_a_1])
                                         T.writes(data_pack_local[eps, nu, ci, p])
                                         T.block_attr({"schedule_rule":"meta_schedule.winograd_data_pack.nchw.cuda"})
                                         with T.init():
                                             data_pack_local[eps, nu, ci, p] = T.float32(0)
-                                        data_pack_local[eps, nu, ci, p] = data_pack_local[eps, nu, ci, p] + T.if_then_else(1 <= p % 3136 // 56 * 4 + r_a and p % 3136 // 56 * 4 + r_a < 225 and 1 <= p % 56 * 4 + r_a_1 and p % 56 * 4 + r_a_1 < 225, data[p // 3136, ci, p % 3136 // 56 * 4 + r_a - 1, p % 56 * 4 + r_a_1 - 1], T.float32(0), dtype="float32") * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_a_1 % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_a_1 % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_a_1 % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_a_1 % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_a_1 % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_a_1 % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_a_1 % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_a_1 % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_a_1 % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_a_1 % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_a_1 % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_a_1 % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_a_1 % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 1, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
+                                        data_pack_local[eps, nu, ci, p] = data_pack_local[eps, nu, ci, p] + d_local[ci, p, r_a, r_a_1] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_a_1 % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_a_1 % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_a_1 % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_a_1 % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_a_1 % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_a_1 % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_a_1 % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_a_1 % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_a_1 % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_a_1 % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_a_1 % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_a_1 % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_a_1 % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 1, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
                     for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1):
                         with T.block("data_pack_local"):
                             v0, v1 = T.axis.remap("SS", [ax0, ax1])

From f114d55bee538b5420c9c993aa789abff245d897 Mon Sep 17 00:00:00 2001
From: wrongtest <wrongtest0@gmail.com>
Date: Wed, 31 Aug 2022 11:12:05 +0800
Subject: [PATCH 078/704] [TIR] Preserve annotations after lower opaque block
 (#12572)

---
 src/tir/transforms/lower_opaque_block.cc      | 60 ++++++++++++++-----
 .../test_tir_transform_lower_opaque_block.py  | 37 ++++++++++++
 2 files changed, 83 insertions(+), 14 deletions(-)

diff --git a/src/tir/transforms/lower_opaque_block.cc b/src/tir/transforms/lower_opaque_block.cc
index 69d8787aa1a1..a4655ebbaed5 100644
--- a/src/tir/transforms/lower_opaque_block.cc
+++ b/src/tir/transforms/lower_opaque_block.cc
@@ -59,6 +59,12 @@ class OpaqueBlockLower : public StmtExprMutator {
       }
       body = Allocate(buffer->data, buffer->dtype, new_shape, const_true(), std::move(body));
     }
+    // Step 4. Handle annotations, block annotations are not preserved by default.
+    std::vector<std::pair<std::string, PrimExpr>> pragma_attrs;
+    HandleAnnotations(new_block->annotations, &pragma_attrs, /*is_block=*/true);
+    for (auto it = pragma_attrs.rbegin(); it != pragma_attrs.rend(); ++it) {
+      body = AttrStmt(Integer(0), it->first, it->second, std::move(body));
+    }
     return body;
   }
 
@@ -72,7 +78,11 @@ class OpaqueBlockLower : public StmtExprMutator {
     }
     // Step 2. Visit recursively
     Stmt body = this->VisitStmt(op->body);
-    // Step 3. Create new For loop accordingly
+    // Step 3. Handle annotations
+    std::vector<std::pair<std::string, PrimExpr>> pragma_attrs;
+    Map<String, ObjectRef> new_annotations =
+        HandleAnnotations(op->annotations, &pragma_attrs, /*is_block=*/false);
+    // Step 4. Create new For loop accordingly
     if (op->kind == ForKind::kThreadBinding) {
       // Case 1. Thread binding
       ICHECK(op->thread_binding.defined());
@@ -83,20 +93,12 @@ class OpaqueBlockLower : public StmtExprMutator {
       return body;
     } else {
       // Case 3. An ordinary loop
-      body = For(op->loop_var, std::move(min), std::move(extent), op->kind, std::move(body));
-    }
-    // Step 4. Handle annotations
-    std::set<std::string> ordered_ann_keys;
-    for (const auto& annotation : op->annotations) {
-      ordered_ann_keys.insert(annotation.first);
+      body = For(op->loop_var, std::move(min), std::move(extent), op->kind, std::move(body),
+                 NullOpt, new_annotations);
     }
-    for (auto it = ordered_ann_keys.rbegin(); it != ordered_ann_keys.rend(); ++it) {
-      const std::string& ann_key = *it;
-      const ObjectRef& ann_value = op->annotations.at(ann_key);
-      if (attr::IsPragmaKey(ann_key)) {
-        body =
-            AttrStmt(op->loop_var, ann_key, ConvertAttrValue(ann_key, ann_value), std::move(body));
-      }
+    // Step 5. Insert nested attrs
+    for (auto it = pragma_attrs.rbegin(); it != pragma_attrs.rend(); ++it) {
+      body = AttrStmt(op->loop_var, it->first, it->second, std::move(body));
     }
     return body;
   }
@@ -146,8 +148,38 @@ class OpaqueBlockLower : public StmtExprMutator {
     }
   }
 
+  /*!
+   * \brief Helper to handle annotation dict.
+   * (1) if the attr key is prefixed by `pragma_`, move to ordered kv list. They
+   * are lowered to `AttrStmt` by legacy TE schedule convention.
+   * (2) the non-pragma loop annotations are preserved
+   * (3) the non-pragma block annotations are dropped
+   * \return New annotation dict with preserved keys. Also update pragma attr pairs ordered by key.
+   */
+  Map<String, ObjectRef> HandleAnnotations(
+      const Map<String, ObjectRef>& annotations,
+      std::vector<std::pair<std::string, PrimExpr>>* pragma_attrs, bool is_block) {
+    Map<String, ObjectRef> preserved_annotations;
+    pragma_attrs->clear();
+    for (const auto& kv : annotations) {
+      const String& key = kv.first;
+      if (attr::IsPragmaKey(key)) {
+        pragma_attrs->emplace_back(key, ConvertAttrValue(key, kv.second));
+      } else if (!is_block) {
+        // the loop annotation is preserved
+        preserved_annotations.Set(key, kv.second);
+      }
+    }
+    std::sort(pragma_attrs->begin(), pragma_attrs->end(),
+              [](const auto& p1, const auto& p2) { return p1.first < p2.first; });
+    return preserved_annotations;
+  }
+
   /*! \brief Record the loop_var and loop start value of unit loops, whose extent is one. */
   std::unordered_map<Var, PrimExpr, ObjectPtrHash, ObjectPtrEqual> unit_loop_vars_;
+
+  /*! \brief Attr keys to preserve into loop annotations. */
+  std::unordered_set<std::string> preserved_annotations_;
 };
 
 PrimFunc LowerOpaqueBlock(PrimFunc f) {
diff --git a/tests/python/unittest/test_tir_transform_lower_opaque_block.py b/tests/python/unittest/test_tir_transform_lower_opaque_block.py
index 9b18c407c40c..6f557ba09d43 100644
--- a/tests/python/unittest/test_tir_transform_lower_opaque_block.py
+++ b/tests/python/unittest/test_tir_transform_lower_opaque_block.py
@@ -321,6 +321,43 @@ def test_annotated_loops():
     tvm.ir.assert_structural_equal(attr3.value, tvm.tir.FloatImm("float32", 0.0))
 
 
+def test_annotated_block():
+    @T.prim_func
+    def annotated_block() -> None:
+        with T.block():
+            T.block_attr({"pragma_1": "str_value", "pragma_2": 1, "pragma_3": 0.0})
+            T.evaluate(0)
+
+    mod = tvm.IRModule.from_expr(annotated_block)
+    mod = tvm.tir.transform.LowerOpaqueBlock()(mod)
+    attr1 = mod["main"].body
+    attr2 = attr1.body
+    attr3 = attr2.body
+    assert attr1.attr_key == "pragma_1" and attr1.value == "str_value"
+    assert attr2.attr_key == "pragma_2"
+    tvm.ir.assert_structural_equal(attr2.value, tvm.tir.IntImm("int32", 1))
+    assert attr3.attr_key == "pragma_3"
+    tvm.ir.assert_structural_equal(attr3.value, tvm.tir.FloatImm("float32", 0.0))
+
+
+def test_preserved_annotations():
+    @T.prim_func
+    def before(A: T.Buffer[8, "float32"], B: T.Buffer[8, "float32"]):
+        for i in T.serial(8, annotations={"k_0": 1, "k_1": [2, 3], "k_2": 3.14}):
+            with T.block("block"):
+                T.block_attr({"k_3": "oops"})
+                B[i] = A[i] + 1.0
+
+    @T.prim_func
+    def after(A: T.Buffer[8, "float32"], B: T.Buffer[8, "float32"]):
+        for i in T.serial(8, annotations={"k_0": 1, "k_1": [2, 3], "k_2": 3.14}):
+            B[i] = A[i] + 1.0
+
+    mod = tvm.IRModule.from_expr(before)
+    mod = tvm.tir.transform.LowerOpaqueBlock()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], after)
+
+
 def test_boolean_handling():
     _check(boolean_handling_before, boolean_handling_after)
 

From c2824a84d51ad3d64be2e72680b33c378f033f99 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Wed, 31 Aug 2022 01:21:36 -0700
Subject: [PATCH 079/704] [Testing] Allow NCHW layout in `relay_workload`
 (#12656)

---
 .../tvm/auto_scheduler/testing/tune_relay.py  |  6 +++
 .../meta_schedule/testing/relay_workload.py   | 43 ++++++++++++-------
 .../tvm/meta_schedule/testing/tune_relay.py   |  6 +++
 .../unittest/test_meta_schedule_tune_relay.py | 36 +++++++++-------
 4 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py
index 2d84389f9de1..9773fbbc65ad 100644
--- a/python/tvm/auto_scheduler/testing/tune_relay.py
+++ b/python/tvm/auto_scheduler/testing/tune_relay.py
@@ -73,6 +73,11 @@ def _parse_args():
         type=str,
         required=True,
     )
+    args.add_argument(
+        "--layout",
+        type=str,
+        default=None,
+    )
     args.add_argument(
         "--cache-dir",
         type=str,
@@ -168,6 +173,7 @@ def main():
     mod, params, (input_name, input_shape, input_dtype) = get_network(
         ARGS.workload,
         ARGS.input_shape,
+        layout=ARGS.layout,
         cache_dir=ARGS.cache_dir,
     )
     input_info = [
diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py
index 3cdf251fe4b6..016263489527 100644
--- a/python/tvm/meta_schedule/testing/relay_workload.py
+++ b/python/tvm/meta_schedule/testing/relay_workload.py
@@ -34,11 +34,12 @@
 
 
 def _get_network(
-    args: Tuple[str, List[int]]
+    args: Tuple[str, List[int], str]
 ) -> Tuple[IRModule, bytearray, Tuple[str, List[int], str]]:
     name: str
     input_shape: List[int]
-    name, input_shape = args
+    layout: str
+    name, input_shape, layout = args
 
     mod: IRModule
 
@@ -57,6 +58,8 @@ def _get_network(
         import torch  # type: ignore
         from torchvision import models  # type: ignore
 
+        assert layout is None or layout in ["NCHW", "NHWC"]
+
         if name in ["resnet_18", "resnet_50"]:
             model = getattr(models, name.replace("_", ""))(pretrained=False)
         elif name == "wide_resnet_50":
@@ -86,20 +89,21 @@ def _get_network(
         input_name = "input0"
         shape_list = [(input_name, input_shape)]
         mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
+        passes = [relay.transform.RemoveUnusedFunctions()]
+        if layout == "NHWC":
+            # PyTorch is imported as NCHW by default
+            passes.append(
+                relay.transform.ConvertLayout(
+                    {
+                        "nn.conv2d": ["NHWC", "default"],
+                        "nn.conv3d": ["NDHWC", "default"],
+                        "nn.max_pool2d": ["NHWC", "default"],
+                        "nn.avg_pool2d": ["NHWC", "default"],
+                    }
+                )
+            )
         with tvm.transform.PassContext(opt_level=3):
-            mod = tvm.transform.Sequential(
-                [
-                    relay.transform.RemoveUnusedFunctions(),
-                    relay.transform.ConvertLayout(
-                        {
-                            "nn.conv2d": ["NHWC", "default"],
-                            "nn.conv3d": ["NDHWC", "default"],
-                            "nn.max_pool2d": ["NHWC", "default"],
-                            "nn.avg_pool2d": ["NHWC", "default"],
-                        }
-                    ),
-                ]
-            )(mod)
+            mod = tvm.transform.Sequential(passes)(mod)
         inputs = (input_name, input_shape, dtype)
     elif name in ["bert_tiny", "bert_base", "bert_medium", "bert_large"]:
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -107,6 +111,8 @@ def _get_network(
         import torch  # type: ignore
         import transformers  # type: ignore
 
+        assert layout is None
+
         config_dict = {
             "bert_tiny": transformers.BertConfig(
                 num_hidden_layers=6,
@@ -151,6 +157,8 @@ def _get_network(
         mod = relay.transform.CombineParallelBatchMatmul()(mod)
         inputs = (input_name, input_shape, input_dtype)
     elif name == "dcgan":
+        assert layout is None
+
         output_shape = input_shape
         batch_size = output_shape[0]
         oshape = output_shape[1:]
@@ -190,6 +198,7 @@ def get_network(
     name: str,
     input_shape: List[int],
     *,
+    layout: Optional[str] = None,
     cache_dir: Optional[str] = None,
 ) -> Tuple[IRModule, Dict[str, NDArray], Tuple[str, List[int], str]]:
     """Get the symbol definition and random weight of a network
@@ -200,6 +209,8 @@ def get_network(
         The name of the network.
     input_shape : List[int]
         The shape of the input tensor.
+    layout : Optional[str]
+        The layout of the input tensor. For vision models, the layout is by default NHWC.
     cache_dir : Optional[str], optional
         The directory to cache the generated network.
         If not specified, the cache will be disabled.
@@ -223,7 +234,7 @@ def get_network(
     cached = _load_cache(cache_dir, filename)
     if cached is None:
         with multiprocessing.Pool(processes=1) as pool:
-            result = pool.map(_get_network, [(name, input_shape)])
+            result = pool.map(_get_network, [(name, input_shape, layout)])
         ((mod, params_bytearray, inputs),) = result
         cached = [mod, params_bytearray, inputs]
         _save_cache(cache_dir, filename, cached)
diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py
index 596a5a736333..7c5977495db5 100644
--- a/python/tvm/meta_schedule/testing/tune_relay.py
+++ b/python/tvm/meta_schedule/testing/tune_relay.py
@@ -72,6 +72,11 @@ def _parse_args():
         type=str,
         required=True,
     )
+    args.add_argument(
+        "--layout",
+        type=str,
+        default=None,
+    )
     args.add_argument(
         "--cache-dir",
         type=str,
@@ -137,6 +142,7 @@ def main():
     mod, params, (input_name, input_shape, input_dtype) = get_network(
         ARGS.workload,
         ARGS.input_shape,
+        layout=ARGS.layout,
         cache_dir=ARGS.cache_dir,
     )
     input_info = [
diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py
index b05b57feaf4c..0267352fd697 100644
--- a/tests/python/unittest/test_meta_schedule_tune_relay.py
+++ b/tests/python/unittest/test_meta_schedule_tune_relay.py
@@ -18,7 +18,7 @@
 import logging
 import tempfile
 from os import path as osp
-from typing import List
+from typing import List, Optional
 
 import numpy as np  # type: ignore
 import pytest
@@ -113,20 +113,21 @@ def main(placeholder: T.Buffer[(1, 2, 16, 16, 4), "float32"], T_layout_trans: T.
 
 @pytest.mark.skip("Integration test")
 @pytest.mark.parametrize(
-    "model_name, input_shape, target",
+    "model_name, input_shape, target, layout",
     [
-        ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16"),
-        ("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070"),
-        ("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16"),
-        ("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070"),
-        ("bert_base", [1, 64], "llvm --num-cores=16"),
-        ("bert_base", [1, 64], "nvidia/geforce-rtx-3070"),
+        ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC"),
+        ("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NHWC"),
+        ("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC"),
+        ("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NHWC"),
+        ("bert_base", [1, 64], "llvm --num-cores=16", None),
+        ("bert_base", [1, 64], "nvidia/geforce-rtx-3070", None),
     ],
 )
 def test_meta_schedule_tune_relay(
     model_name: str,
     input_shape: List[int],
     target: str,
+    layout: Optional[str],
 ):
     dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
     if model_name.startswith("bert"):
@@ -134,7 +135,12 @@ def test_meta_schedule_tune_relay(
     else:
         data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)
 
-    mod, params, (input_name, _, _) = get_network(name=model_name, input_shape=input_shape)
+    mod, params, (input_name, _, _) = get_network(
+        name=model_name,
+        input_shape=input_shape,
+        layout=layout,
+    )
+
     target = Target(target)
     with tempfile.TemporaryDirectory() as work_dir:
         with ms.Profiler() as profiler:
@@ -536,12 +542,12 @@ def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV):
 
 
 if __name__ == """__main__""":
-    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16")
-    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070")
-    test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16")
-    test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070")
-    test_meta_schedule_tune_relay("bert_base", [1, 64], "llvm --num-cores=16")
-    test_meta_schedule_tune_relay("bert_base", [1, 64], "nvidia/geforce-rtx-3070")
+    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", None)
+    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NCHW")
+    test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16", None)
+    test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", None)
+    test_meta_schedule_tune_relay("bert_base", [1, 64], "llvm --num-cores=16", None)
+    test_meta_schedule_tune_relay("bert_base", [1, 64], "nvidia/geforce-rtx-3070", None)
     test_meta_schedule_te2primfunc_argument_order()
     test_meta_schedule_relay_lowering()
     test_tune_relay_manual_tir_vnni()

From acbbd9f15a9ce79ecc88f16f5be9b0c07122cfc4 Mon Sep 17 00:00:00 2001
From: Nicola Lancellotti <nicola.lancellotti@arm.com>
Date: Wed, 31 Aug 2022 10:54:54 +0100
Subject: [PATCH 080/704] [ETHOSN] Improve inferring new shape of the Reshape
 operator (#12594)

Fixes the case when reshape is > 4 dims. While this cannot be offloaded to the NPU, the check was previously producing an error preventing further compilation. The correct behavior is to ensure the check returns False and not offload the reshape.
---
 python/tvm/relay/op/contrib/ethosn.py         |  2 --
 .../backend/contrib/ethosn/ethosn_api.cc      | 18 ++++++-------
 .../contrib/test_ethosn/test_networks.py      |  4 +--
 .../contrib/test_ethosn/test_reshape.py       | 25 ++++++++++++++++---
 4 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index 83972bd08b41..a4e9d9647c95 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -360,8 +360,6 @@ def reshape(expr):
     """Check if a reshape is supported by Ethos-N."""
     if not ethosn_available():
         return False
-    if not _is_ethosn_composite(expr.args[0]):
-        return False
 
     return _ethosn.reshape(expr)
 
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc
index ccca1779f6d9..55e8901dae08 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.cc
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc
@@ -37,7 +37,6 @@
 #include <utility>
 #include <vector>
 
-#include "../../../op/tensor/transform.h"
 #include "ethosn_support_library/Support.hpp"
 #include "ethosn_support_library/SupportQueries.hpp"
 #include "tvm/relay/qnn/attrs.h"
@@ -300,15 +299,16 @@ EthosnError EthosnAPI::Reshape(const Expr& expr, ReshapeParams* params) {
   sl::DataType input_data_type;
   EthosnError err = Tvm2Npu(input_dtype->shape, &input_tensor_shape);
   err += Tvm2Npu(input_dtype->dtype, &input_data_type);
-  int tensor_size = 1;
-  for (const auto& dim : input_tensor_shape) {
-    tensor_size *= dim;
-  }
 
-  Array<IndexExpr> inferred_shape = {1, 1, 1, 1};
-  Array<IndexExpr> new_shape = InferNewShape(input_dtype->shape, reshape->attrs, false);
-  for (size_t i = 0; i < new_shape.size(); ++i) {
-    inferred_shape.Set(i, new_shape[i]);
+  Array<IndexExpr> inferred_shape;
+  Array<IndexExpr> new_shape = reshape->checked_type().as<TensorTypeNode>()->shape;
+  if (new_shape.size() < 4) {
+    inferred_shape = {1, 1, 1, 1};
+    for (size_t i = 0; i < new_shape.size(); ++i) {
+      inferred_shape.Set(i, new_shape[i]);
+    }
+  } else {
+    inferred_shape = new_shape;
   }
 
   err += Tvm2Npu(inferred_shape, &params->new_shape);
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index d16bf5bf325c..11745409d4ea 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -151,7 +151,7 @@ def test_resnet_50_int8():
         input_dict={"input": (1, 224, 224, 3)},
         compile_hash=_compile_hash,
         output_count=1,
-        host_ops=11,
+        host_ops=10,
         npu_partitions=2,
     )
 
@@ -211,6 +211,6 @@ def test_ssd_mobilenet_v1():
         input_dict={"normalized_input_image_tensor": (1, 300, 300, 3)},
         compile_hash=_compile_hash,
         output_count=4,
-        host_ops=28,
+        host_ops=27,
         npu_partitions=2,
     )
diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py
index cb8a49be2d81..e165cea9c63b 100644
--- a/tests/python/contrib/test_ethosn/test_reshape.py
+++ b/tests/python/contrib/test_ethosn/test_reshape.py
@@ -28,9 +28,8 @@
 def _get_model(input_shape, output_shape, dtype):
     """Return a model and any parameters it may have"""
     a = relay.var("a", shape=input_shape, dtype=dtype)
-    conv, params = tei.get_conv2d(a, input_shape, dtype)
-    req = relay.reshape(conv, output_shape)
-    return req, params
+    req = relay.reshape(a, output_shape)
+    return req, {}
 
 
 @requires_ethosn
@@ -53,6 +52,8 @@ def _get_model(input_shape, output_shape, dtype):
     ],
 )
 def test_reshape(dtype, input_shape, output_shape):
+    """Compare Reshape output with TVM."""
+
     np.random.seed(0)
     inputs = {
         "a": tvm.nd.array(
@@ -71,3 +72,21 @@ def test_reshape(dtype, input_shape, output_shape):
         outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
 
     tei.verify(outputs, dtype, 1)
+
+
+@requires_ethosn
+@pytest.mark.parametrize(
+    "input_shape, output_shape",
+    [
+        (
+            (1, 13, 13, 255),
+            (1, 13, 13, 3, 85),
+        ),
+    ],
+)
+def test_reshape_failure(input_shape, output_shape):
+    """Check Resize is not offloaded."""
+
+    model, params = _get_model(input_shape, output_shape, "int8")
+    mod = tei.make_module(model, params)
+    tei.build(mod, params, expected_host_ops=1, npu_partitions=0)

From 0c374544a3e1dc358b23e99eaff719631a9984d7 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Wed, 31 Aug 2022 09:41:41 -0700
Subject: [PATCH 081/704] [TIR][TVMScript] Update printer / parser to make
 T.allocate return buffer var (#12412)

* Updated TVMScript syntax of `T.allocate` to return buffer var.

* Added syntax sugar for `T.decl_buffer`. When `data` field is not
  specified, `data` will be implicitly created via `Allocate` stmt.

* Updated the existing test cases. Most test cases can be updated by
  changing `T.allocate` to `T.decl_buffer`. `T.allocate` in some tests
  are updated to `T.allocate` + `T.buffer_decl`, to maintain the
  legacy behavior of allocation and implicit buffer declaration (will
  be followed up in future PR to adopt `T.decl_buffer`).
---
 python/tvm/script/tir/scope_handler.py        |  57 ++---
 src/printer/tvmscript_printer.cc              | 128 +++++-----
 .../test_copy_compute_reordering.py           | 228 +++++++++---------
 .../test_ethosu/test_encode_constants.py      |  48 ++--
 .../test_ethosu/test_hoist_allocates.py       |  75 ++++--
 .../test_ethosu/test_merge_constants.py       | 158 +++++++-----
 .../test_ethosu/test_remove_concatenates.py   |   3 +-
 .../test_ethosu/test_replace_conv2d.py        |  18 +-
 .../contrib/test_ethosu/test_replace_copy.py  |   9 +-
 .../contrib/test_ethosu/test_scheduler.py     |  12 +-
 .../test_ethosu/test_tir_to_cs_translator.py  |  31 ++-
 ..._meta_schedule_postproc_verify_gpu_code.py |  24 +-
 .../test_tir_analysis_calculate_workspace.py  |  18 +-
 ...t_tir_analysis_detect_buffer_access_lca.py |   2 +-
 tests/python/unittest/test_tir_ptx_mma.py     | 103 ++++----
 tests/python/unittest/test_tir_ptx_mma_sp.py  |  32 +--
 tests/python/unittest/test_tir_renew_defs.py  |   3 +-
 .../test_tir_structural_equal_hash.py         |   2 +-
 ..._tir_transform_convert_for_loops_serial.py |   4 +-
 .../test_tir_transform_extract_constants.py   |   9 +-
 .../test_tir_transform_flatten_buffer.py      |  36 ++-
 ...est_tir_transform_inject_virtual_thread.py |  12 +-
 .../test_tir_transform_lower_opaque_block.py  |  18 +-
 ...tir_transform_renormalize_split_pattern.py |  18 +-
 .../test_tir_transform_storage_flatten.py     |   4 +-
 .../test_tir_transform_storage_rewrite.py     |   6 +-
 .../test_tir_transform_unroll_loop.py         |   9 +-
 tests/python/unittest/test_tir_usmp_algo.py   |  38 +--
 ...st_tir_usmp_analysis_extract_bufferinfo.py | 146 +++++------
 ...orm_convert_pool_allocations_to_offsets.py |  49 ++--
 tests/python/unittest/test_tir_usmp_utils.py  |  12 +-
 .../unittest/test_tvmscript_roundtrip.py      |  82 +++++--
 32 files changed, 804 insertions(+), 590 deletions(-)

diff --git a/python/tvm/script/tir/scope_handler.py b/python/tvm/script/tir/scope_handler.py
index 41fa6a5fa2f7..1d2550eecde2 100644
--- a/python/tvm/script/tir/scope_handler.py
+++ b/python/tvm/script/tir/scope_handler.py
@@ -112,9 +112,9 @@ def allocate(extents, dtype, scope, condition=True, annotations=None, span=None)
             scope = tvm.runtime.convert(scope)
 
             return tvm.tir.Allocate(
-                self.buffer.data,
-                self.buffer.dtype,
-                self.buffer.shape,
+                self.buffer_var,
+                dtype,
+                extents,
                 condition,
                 self.body,
                 annotations=annotations,
@@ -122,7 +122,7 @@ def allocate(extents, dtype, scope, condition=True, annotations=None, span=None)
             )
 
         super().__init__(allocate, concise_scope=True, def_symbol=True)
-        self.buffer = None
+        self.buffer_var = None
 
     def enter_scope(
         self,
@@ -146,20 +146,15 @@ def enter_scope(
         else:
             raise Exception("Internal Bug")
 
-        def setup_buffer(
+        def setup_buffer_var(
             extents, dtype, scope, condition=True, annotations=None, span: Span = None
         ):
-            """Setup buffer object for a given type."""
-            self.buffer = tvm.tir.decl_buffer(
-                shape=extents,
-                dtype=dtype,
-                name=name,
-                scope=scope,
-                span=span,
-            )
+            """Setup buffer var for a given type."""
+            buffer_ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype), scope)
+            self.buffer_var = tvm.tir.Var(name, buffer_ptr_type, span)
 
-        setup_buffer(*arg_list, span=tvm_span_from_synr(var_span))
-        context.update_symbol(name, self.buffer, node)
+        setup_buffer_var(*arg_list, span=tvm_span_from_synr(var_span))
+        context.update_symbol(name, self.buffer_var, node)
 
 
 @register
@@ -176,7 +171,7 @@ def allocate_const(raw_data, dtype, shape, annotations=None, span=None):
                 list_data.append(i.value)
             nd_data = tvm.nd.array(np.asarray(list_data, dtype=dtype))
             n = tvm.tir.AllocateConst(
-                self.buffer.data,
+                self.buffer_var,
                 dtype,
                 shape,
                 nd_data,
@@ -187,7 +182,7 @@ def allocate_const(raw_data, dtype, shape, annotations=None, span=None):
             return n
 
         super().__init__(allocate_const, concise_scope=True, def_symbol=True)
-        self.buffer = None
+        self.buffer_var = None
 
     def enter_scope(
         self,
@@ -211,17 +206,13 @@ def enter_scope(
         else:
             raise Exception("Internal Bug")
 
-        def setup_buffer(data, dtype, shape, annotations: dict = None, span: Span = None):
+        def setup_buffer_var(data, dtype, shape, annotations: dict = None, span: Span = None):
             """Setup buffer var for a given type."""
-            self.buffer = tvm.tir.decl_buffer(
-                shape=shape,
-                dtype=dtype,
-                name=name,
-                span=span,
-            )
+            buffer_ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype))
+            self.buffer_var = tvm.tir.Var(name, buffer_ptr_type, span)
 
-        setup_buffer(*arg_list, span=tvm_span_from_synr(var_span))
-        context.update_symbol(name, self.buffer, node)
+        setup_buffer_var(*arg_list, span=tvm_span_from_synr(var_span))
+        context.update_symbol(name, self.buffer_var, node)
 
 
 @register
@@ -248,7 +239,18 @@ def decl_buffer(
             axis_separators=None,
             span=None,
         ):
-            return tvm.tir.DeclBuffer(self.buffer, self.body, span=span)
+            decl_buffer = tvm.tir.DeclBuffer(self.buffer, self.body, span=span)
+            if data is None:
+                # when data is not specified, the buffer is implicitly allocated
+                return tvm.tir.Allocate(
+                    self.buffer.data,
+                    dtype,
+                    shape,
+                    tvm.runtime.convert(True),
+                    decl_buffer,
+                    span=span,
+                )
+            return decl_buffer
 
         super().__init__(decl_buffer, concise_scope=True, def_symbol=True)
 
@@ -298,6 +300,7 @@ def setup_buffer(
                 offset_factor=offset_factor,
                 buffer_type=buffer_type,
                 axis_separators=axis_separators,
+                name=name,
                 span=span,
             )
 
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index f5300e1e6985..5da81de4dc5d 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -100,6 +100,12 @@ class BufferUsageFinder : public StmtExprVisitor {
     StmtExprVisitor::VisitStmt_(op);
   }
 
+  void VisitStmt_(const DeclBufferNode* op) final {
+    buffers_declared_.insert(op->buffer.get());
+    StmtExprVisitor::VisitStmt_(op);
+    buffers_declared_.erase(op->buffer.get());
+  }
+
  private:
   explicit BufferUsageFinder(Map<Var, Array<Buffer>> usage) : usage_(usage) {}
 
@@ -107,6 +113,9 @@ class BufferUsageFinder : public StmtExprVisitor {
     if (buffers_visited_.count(buffer.get())) {
       return;
     }
+    if (buffers_declared_.count(buffer.get())) {
+      return;
+    }
     buffers_visited_.insert(buffer.get());
 
     Array<Buffer> arr = usage_.Get(buffer->data).value_or({});
@@ -119,6 +128,9 @@ class BufferUsageFinder : public StmtExprVisitor {
   // The buffers that have been visited so far, to avoid duplicate
   // entries in the search result.
   std::unordered_set<const BufferNode*> buffers_visited_;
+  // The buffers declared via `DeclBuffer`. These buffers are excluded from the result because
+  // T.buffer_decl shouldn't be printed for them.
+  std::unordered_set<const BufferNode*> buffers_declared_;
 };
 
 /*!
@@ -1055,58 +1067,57 @@ Doc TVMScriptPrinter::VisitStmt_(const BufferRealizeNode* op) {
 }
 
 namespace {
-struct AllocUsage {
-  Buffer alloc_buffer;
-  Array<Buffer> aliasing_buffers;
-};
 
-template <typename AllocNode>
-AllocUsage FindAllocateUsage(AllocNode* op, Map<Var, Array<Buffer>>* cache_ptr) {
-  Map<Var, Array<Buffer>>& cache = *cache_ptr;
-  if (!cache.count(op->buffer_var)) {
-    cache = BufferUsageFinder::FindUsage(std::move(cache), op->body);
+bool IsAllocateDeclBufferPattern(const AllocateNode* allocate) {
+  const Var& buffer_var = allocate->buffer_var;
+  const DeclBufferNode* decl_buffer = allocate->body.as<DeclBufferNode>();
+  if (!decl_buffer) {
+    return false;
   }
-  Array<Buffer> buffer_usage = cache.Get(op->buffer_var).value_or({});
-
-  auto is_exact_match = [](Buffer a, Buffer b) {
-    if (a->dtype != b->dtype) return false;
-    if (a->shape.size() != b->shape.size()) return false;
-
-    arith::Analyzer analyzer;
-    for (size_t i = 0; i < a->shape.size(); i++) {
-      if (!analyzer.CanProveEqual(a->shape[i], b->shape[i])) {
-        return false;
-      }
-    }
-    return true;
-  };
-
-  // If the buffer allocated via T.allocate is an exact match to the
-  // usage of the buffer later on, then that buffer is the return
-  // value of T.allocate, and no T.buffer_decl statement is needed.
-  Buffer alloc_buffer(op->buffer_var, op->dtype, op->extents, {}, 0, op->buffer_var->name_hint, 0,
-                      0, kDefault);
-  bool found_alloc_buf = false;
-  Array<Buffer> aliasing_buffers;
-  for (const auto& buf : buffer_usage) {
-    if (!found_alloc_buf && is_exact_match(buf, alloc_buffer)) {
-      alloc_buffer = buf;
-      found_alloc_buf = true;
-    } else {
-      aliasing_buffers.push_back(buf);
+  const Buffer& buffer = decl_buffer->buffer;
+  if (!buffer_var.same_as(buffer->data)) {
+    return false;
+  }
+  if (allocate->dtype != buffer->dtype) {
+    return false;
+  }
+  if (!is_one(allocate->condition)) {
+    return false;
+  }
+  if (allocate->annotations.size()) {
+    return false;
+  }
+  if (allocate->extents.size() != buffer->shape.size()) {
+    return false;
+  }
+  tir::ExprDeepEqual expr_equal;
+  for (size_t i = 0, n = allocate->extents.size(); i < n; ++i) {
+    if (!expr_equal(allocate->extents[i], buffer->shape[i])) {
+      return false;
     }
   }
-
-  return AllocUsage{alloc_buffer, aliasing_buffers};
+  return true;
 }
+
 }  // namespace
 
 Doc TVMScriptPrinter::VisitStmt_(const AllocateNode* op) {
-  auto usage = FindAllocateUsage(op, &buffer_var_usage_);
-  Buffer& alloc_buffer = usage.alloc_buffer;
-  Array<Buffer>& aliasing_buffers = usage.aliasing_buffers;
-  buf_not_in_headers_.insert(alloc_buffer.get());
-  var_not_in_headers_.insert(alloc_buffer->data.get());
+  var_not_in_headers_.insert(op->buffer_var.get());
+
+  if (!buffer_var_usage_.count(op->buffer_var)) {
+    buffer_var_usage_ = BufferUsageFinder::FindUsage(std::move(buffer_var_usage_), op->body);
+  }
+  Array<Buffer> buffer_usage = buffer_var_usage_.Get(op->buffer_var).value_or({});
+
+  if (buffer_usage.empty()) {
+    if (IsAllocateDeclBufferPattern(op)) {
+      // As a syntax sugar, we identify the pattern of Allocate and DeclBuffer and print a single
+      // DeclBuffer statement. It is intentionally to call `Print` instead of `PrintBody` here to
+      // delegate the printing of the current node to `DeclBufferNode` while maintaining the
+      // same value of `current_num_` and `num_child_`.
+      return Print(op->body);
+    }
+  }
 
   auto storage_scope = GetPtrStorageScope(op->buffer_var);
   Doc func_call;
@@ -1124,12 +1135,12 @@ Doc TVMScriptPrinter::VisitStmt_(const AllocateNode* op) {
 
   Doc doc;
   if (current_num_ != num_child_ - 1) {
-    doc << "with " << func_call << " as " << Print(alloc_buffer) << ":";
-    doc << Doc::Indent(4, Doc::NewLine() << PrintNonHeaderBufferDeclarations(aliasing_buffers)
-                                         << PrintBody(op->body));
+    doc << "with " << func_call << " as " << Print(op->buffer_var) << ":";
+    doc << Doc::Indent(
+        4, Doc::NewLine() << PrintNonHeaderBufferDeclarations(buffer_usage) << PrintBody(op->body));
   } else {
-    doc << Print(alloc_buffer) << " = " << func_call << Doc::NewLine();
-    doc << PrintNonHeaderBufferDeclarations(aliasing_buffers) << PrintBody(op->body);
+    doc << Print(op->buffer_var) << " = " << func_call << Doc::NewLine();
+    doc << PrintNonHeaderBufferDeclarations(buffer_usage) << PrintBody(op->body);
   }
   TryDeallocVar(op->buffer_var);
   return doc;
@@ -1179,11 +1190,12 @@ Doc TVMScriptPrinter::VisitStmt_(const AllocateConstNode* alloc) {
   }
   auto ndarray_str = ss.str();
 
-  auto usage = FindAllocateUsage(alloc, &buffer_var_usage_);
-  Buffer& alloc_buffer = usage.alloc_buffer;
-  Array<Buffer>& aliasing_buffers = usage.aliasing_buffers;
-  buf_not_in_headers_.insert(alloc_buffer.get());
-  var_not_in_headers_.insert(alloc_buffer->data.get());
+  var_not_in_headers_.insert(alloc->buffer_var.get());
+
+  if (!buffer_var_usage_.count(alloc->buffer_var)) {
+    buffer_var_usage_ = BufferUsageFinder::FindUsage(std::move(buffer_var_usage_), alloc->body);
+  }
+  Array<Buffer> buffer_usage = buffer_var_usage_.Get(alloc->buffer_var).value_or({});
 
   Doc func_call;
   func_call << tir_prefix_ << ".allocate_const(" << ndarray_str << ", " << PrintDType(alloc->dtype)
@@ -1192,12 +1204,12 @@ Doc TVMScriptPrinter::VisitStmt_(const AllocateConstNode* alloc) {
   Doc doc;
   var_not_in_headers_.insert(alloc->buffer_var.get());
   if (current_num_ != num_child_ - 1) {
-    doc << "with " << func_call << " as " << Print(alloc_buffer) << ":";
-    doc << Doc::Indent(4, Doc::NewLine() << PrintNonHeaderBufferDeclarations(aliasing_buffers)
+    doc << "with " << func_call << " as " << Print(alloc->buffer_var) << ":";
+    doc << Doc::Indent(4, Doc::NewLine() << PrintNonHeaderBufferDeclarations(buffer_usage)
                                          << PrintBody(alloc->body));
   } else {
-    doc << Print(alloc_buffer) << " = " << func_call << Doc::NewLine();
-    doc << PrintNonHeaderBufferDeclarations(aliasing_buffers) << PrintBody(alloc->body);
+    doc << Print(alloc->buffer_var) << " = " << func_call << Doc::NewLine();
+    doc << PrintNonHeaderBufferDeclarations(buffer_usage) << PrintBody(alloc->body);
   }
   return doc;
 }
diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
index f348fd7f5a77..8c598fe0d794 100644
--- a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
+++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
@@ -40,14 +40,14 @@ def main() -> None:
         buffer9 = T.buffer_decl([32], "uint8")
         buffer10 = T.buffer_decl([2048], "int8")
         # body
-        p1 = T.allocate([128], "uint8", "global")
-        p2 = T.allocate([112], "uint8", "global")
-        p3 = T.allocate([112], "uint8", "global")
-        p4 = T.allocate([32], "uint8", "global")
-        p5 = T.allocate([32], "uint8", "global")
-        p6 = T.allocate([32], "uint8", "global")
-        p7 = T.allocate([112], "uint8", "global")
-        p8 = T.allocate([32], "uint8", "global")
+        p1 = T.decl_buffer([128], "uint8")
+        p2 = T.decl_buffer([112], "uint8")
+        p3 = T.decl_buffer([112], "uint8")
+        p4 = T.decl_buffer([32], "uint8")
+        p5 = T.decl_buffer([32], "uint8")
+        p6 = T.decl_buffer([32], "uint8")
+        p7 = T.decl_buffer([112], "uint8")
+        p8 = T.decl_buffer([32], "uint8")
         T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -88,14 +88,14 @@ def main() -> None:
             buffer9 = T.buffer_decl([32], "uint8")
             buffer10 = T.buffer_decl([2048], "int8")
             # body
-            p1 = T.allocate([128], "uint8", "global")
-            p2 = T.allocate([112], "uint8", "global")
-            p3 = T.allocate([112], "uint8", "global")
-            p4 = T.allocate([32], "uint8", "global")
-            p5 = T.allocate([32], "uint8", "global")
-            p6 = T.allocate([32], "uint8", "global")
-            p7 = T.allocate([112], "uint8", "global")
-            p8 = T.allocate([32], "uint8", "global")
+            p1 = T.decl_buffer([128], "uint8")
+            p2 = T.decl_buffer([112], "uint8")
+            p3 = T.decl_buffer([112], "uint8")
+            p4 = T.decl_buffer([32], "uint8")
+            p5 = T.decl_buffer([32], "uint8")
+            p6 = T.decl_buffer([32], "uint8")
+            p7 = T.decl_buffer([112], "uint8")
+            p8 = T.decl_buffer([32], "uint8")
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
@@ -134,14 +134,14 @@ def main() -> None:
             buffer9 = T.buffer_decl([32], "uint8")
             buffer10 = T.buffer_decl([2048], "int8")
             # body
-            p1 = T.allocate([128], "uint8", "global")
-            p2 = T.allocate([112], "uint8", "global")
-            p3 = T.allocate([112], "uint8", "global")
-            p4 = T.allocate([32], "uint8", "global")
-            p5 = T.allocate([32], "uint8", "global")
-            p6 = T.allocate([32], "uint8", "global")
-            p7 = T.allocate([112], "uint8", "global")
-            p8 = T.allocate([32], "uint8", "global")
+            p1 = T.decl_buffer([128], "uint8")
+            p2 = T.decl_buffer([112], "uint8")
+            p3 = T.decl_buffer([112], "uint8")
+            p4 = T.decl_buffer([32], "uint8")
+            p5 = T.decl_buffer([32], "uint8")
+            p6 = T.decl_buffer([32], "uint8")
+            p7 = T.decl_buffer([112], "uint8")
+            p8 = T.decl_buffer([32], "uint8")
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
@@ -166,11 +166,11 @@ def main() -> None:
 class AllOperatorsWithoutWeights:
     @T.prim_func
     def main() -> None:
-        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer1 = T.buffer_decl([36], "int8")
         buffer2 = T.buffer_decl([9], "int8")
         # body
-        p1 = T.allocate([96], "int8", "global")
+        p1 = T.decl_buffer([96], "int8")
         T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 4, 3, 3, 0, 4, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 12, 3, 1, "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "int8", 3, 1, 3, 3, 0, 1, buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 3, 1, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
 # fmt: on
@@ -188,19 +188,19 @@ def test_all_operators_without_weights(max_copy_movements):
 class OperatorsWithAndWithoutWeights:
     @T.prim_func
     def main() -> None:
-        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer1 = T.buffer_decl([97156], "int8")
         buffer2 = T.buffer_decl([80], "uint8")
         buffer3 = T.buffer_decl([64], "uint8")
         buffer4 = T.buffer_decl([96], "uint8")
         buffer5 = T.buffer_decl([32], "uint8")
         # body
-        p1 = T.allocate([390336], "int8", "global")
-        p2 = T.allocate([80], "uint8", "global")
-        p3 = T.allocate([64], "uint8", "global")
-        p4 = T.allocate([390336], "int8", "global")
-        p5 = T.allocate([96], "uint8", "global")
-        p6 = T.allocate([32], "uint8", "global")
+        p1 = T.decl_buffer([390336], "int8")
+        p2 = T.decl_buffer([80], "uint8")
+        p3 = T.decl_buffer([64], "uint8")
+        p4 = T.decl_buffer([390336], "int8")
+        p5 = T.decl_buffer([96], "uint8")
+        p6 = T.decl_buffer([32], "uint8")
         T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
@@ -230,12 +230,12 @@ def main() -> None:
             buffer4 = T.buffer_decl([96], "uint8")
             buffer5 = T.buffer_decl([32], "uint8")
             # body
-            p1 = T.allocate([390336], "int8", "global")
-            p2 = T.allocate([80], "uint8", "global")
-            p3 = T.allocate([64], "uint8", "global")
-            p4 = T.allocate([390336], "int8", "global")
-            p5 = T.allocate([96], "uint8", "global")
-            p6 = T.allocate([32], "uint8", "global")
+            p1 = T.decl_buffer([390336], "int8")
+            p2 = T.decl_buffer([80], "uint8")
+            p3 = T.decl_buffer([64], "uint8")
+            p4 = T.decl_buffer([390336], "int8")
+            p5 = T.decl_buffer([96], "uint8")
+            p6 = T.decl_buffer([32], "uint8")
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -256,19 +256,19 @@ def test_operators_with_and_without_weights_max_copy_movements_2():
     class ReferenceModule:
         @T.prim_func
         def main() -> None:
-            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             buffer1 = T.buffer_decl([97156], "int8")
             buffer2 = T.buffer_decl([80], "uint8")
             buffer3 = T.buffer_decl([64], "uint8")
             buffer4 = T.buffer_decl([96], "uint8")
             buffer5 = T.buffer_decl([32], "uint8")
             # body
-            p1 = T.allocate([390336], "int8", "global")
-            p2 = T.allocate([80], "uint8", "global")
-            p3 = T.allocate([64], "uint8", "global")
-            p4 = T.allocate([390336], "int8", "global")
-            p5 = T.allocate([96], "uint8", "global")
-            p6 = T.allocate([32], "uint8", "global")
+            p1 = T.decl_buffer([390336], "int8")
+            p2 = T.decl_buffer([80], "uint8")
+            p3 = T.decl_buffer([64], "uint8")
+            p4 = T.decl_buffer([390336], "int8")
+            p5 = T.decl_buffer([96], "uint8")
+            p6 = T.decl_buffer([32], "uint8")
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle"))
@@ -288,7 +288,7 @@ def main() -> None:
 class CopyToBufferWithLocalScope:
     @T.prim_func
     def main() -> None:
-        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer1 = T.buffer_decl([64], "uint8")
         buffer2 = T.buffer_decl([48], "uint8")
         buffer3 = T.buffer_decl([48], "uint8")
@@ -298,13 +298,13 @@ def main() -> None:
         buffer7 = T.buffer_decl([256], "uint8")
         buffer8 = T.buffer_decl([64], "uint8")
         # body
-        p1 = T.allocate([48], "uint8", "global")
-        p2 = T.allocate([48], "uint8", "global")
-        p3 = T.allocate([256], "int8", "local")
-        p4 = T.allocate([256], "int8", "global")
-        p5 = T.allocate([16], "uint8", "global")
-        p6 = T.allocate([48], "uint8", "global")
-        p7 = T.allocate([256], "int8", "local")
+        p1 = T.decl_buffer([48], "uint8")
+        p2 = T.decl_buffer([48], "uint8")
+        p3 = T.decl_buffer([256], "int8", scope="local")
+        p4 = T.decl_buffer([256], "int8")
+        p5 = T.decl_buffer([16], "uint8")
+        p6 = T.decl_buffer([48], "uint8")
+        p7 = T.decl_buffer([256], "int8", scope="local")
         T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle"))
@@ -339,13 +339,13 @@ def main() -> None:
             buffer7 = T.buffer_decl([256], "uint8")
             buffer8 = T.buffer_decl([64], "uint8")
             # body
-            p1 = T.allocate([48], "uint8", "global")
-            p2 = T.allocate([48], "uint8", "global")
-            p3 = T.allocate([256], "int8", "local")
-            p4 = T.allocate([256], "int8", "global")
-            p5 = T.allocate([16], "uint8", "global")
-            p6 = T.allocate([48], "uint8", "global")
-            p7 = T.allocate([256], "int8", "local")
+            p1 = T.decl_buffer([48], "uint8")
+            p2 = T.decl_buffer([48], "uint8")
+            p3 = T.decl_buffer([256], "int8", scope="local")
+            p4 = T.decl_buffer([256], "int8")
+            p5 = T.decl_buffer([16], "uint8")
+            p6 = T.decl_buffer([48], "uint8")
+            p7 = T.decl_buffer([256], "int8", scope="local")
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle"))
@@ -412,12 +412,12 @@ def main() -> None:
             buffer4 = T.buffer_decl([96], "uint8")
             buffer5 = T.buffer_decl([32], "uint8")
             # body
-            p1 = T.allocate([390336], "int8", "global")
-            p2 = T.allocate([80], "uint8", "global")
-            p3 = T.allocate([64], "uint8", "global")
-            p4 = T.allocate([390336], "int8", "global")
-            p5 = T.allocate([96], "uint8", "global")
-            p6 = T.allocate([32], "uint8", "global")
+            p1 = T.decl_buffer([390336], "int8")
+            p2 = T.decl_buffer([80], "uint8")
+            p3 = T.decl_buffer([64], "uint8")
+            p4 = T.decl_buffer([390336], "int8")
+            p5 = T.decl_buffer([96], "uint8")
+            p6 = T.decl_buffer([32], "uint8")
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -438,19 +438,19 @@ def test_pass_context_option_max_copy_movements():
     class ReferenceModule:
         @T.prim_func
         def main() -> None:
-            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             buffer1 = T.buffer_decl([97156], "int8")
             buffer2 = T.buffer_decl([80], "uint8")
             buffer3 = T.buffer_decl([64], "uint8")
             buffer4 = T.buffer_decl([96], "uint8")
             buffer5 = T.buffer_decl([32], "uint8")
             # body
-            p1 = T.allocate([390336], "int8", "global")
-            p2 = T.allocate([80], "uint8", "global")
-            p3 = T.allocate([64], "uint8", "global")
-            p4 = T.allocate([390336], "int8", "global")
-            p5 = T.allocate([96], "uint8", "global")
-            p6 = T.allocate([32], "uint8", "global")
+            p1 = T.decl_buffer([390336], "int8")
+            p2 = T.decl_buffer([80], "uint8")
+            p3 = T.decl_buffer([64], "uint8")
+            p4 = T.decl_buffer([390336], "int8")
+            p5 = T.decl_buffer([96], "uint8")
+            p6 = T.decl_buffer([32], "uint8")
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle"))
@@ -487,15 +487,15 @@ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208
             nn_4 = T.var("int32")
             nn_5 = T.var("int32")
             # body
-            placeholder_d_global = T.allocate([208], "uint8", "global")
-            placeholder_d_global_1 = T.allocate([112], "uint8", "global")
-            placeholder_d_global_2 = T.allocate([96], "uint8", "global")
-            placeholder_d_global_3 = T.allocate([112], "uint8", "global")
-            ethosu_write_1 = T.allocate([195168], "int8", "global")
-            ethosu_write_2 = T.allocate([184800], "int8", "global")
-            ethosu_write_3 = T.allocate([174688], "int8", "global")
-            ethosu_write_4 = T.allocate([174688], "int8", "global")
-            ethosu_write_5 = T.allocate([174688], "int8", "global")
+            placeholder_d_global = T.decl_buffer([208], "uint8")
+            placeholder_d_global_1 = T.decl_buffer([112], "uint8")
+            placeholder_d_global_2 = T.decl_buffer([96], "uint8")
+            placeholder_d_global_3 = T.decl_buffer([112], "uint8")
+            ethosu_write_1 = T.decl_buffer([195168], "int8")
+            ethosu_write_2 = T.decl_buffer([184800], "int8")
+            ethosu_write_3 = T.decl_buffer([174688], "int8")
+            ethosu_write_4 = T.decl_buffer([174688], "int8")
+            ethosu_write_5 = T.decl_buffer([174688], "int8")
             with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
                 T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_global[0], dtype="handle"))
             with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 250):
@@ -535,15 +535,15 @@ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208
             nn_4 = T.var("int32")
             nn_5 = T.var("int32")
             # body
-            placeholder_d_global = T.allocate([208], "uint8", "global")
-            placeholder_d_global_1 = T.allocate([112], "uint8", "global")
-            placeholder_d_global_2 = T.allocate([96], "uint8", "global")
-            placeholder_d_global_3 = T.allocate([112], "uint8", "global")
-            ethosu_write_1 = T.allocate([195168], "int8", "global")
-            ethosu_write_2 = T.allocate([184800], "int8", "global")
-            ethosu_write_3 = T.allocate([174688], "int8", "global")
-            ethosu_write_4 = T.allocate([174688], "int8", "global")
-            ethosu_write_5 = T.allocate([174688], "int8", "global")
+            placeholder_d_global = T.decl_buffer([208], "uint8")
+            placeholder_d_global_1 = T.decl_buffer([112], "uint8")
+            placeholder_d_global_2 = T.decl_buffer([96], "uint8")
+            placeholder_d_global_3 = T.decl_buffer([112], "uint8")
+            ethosu_write_1 = T.decl_buffer([195168], "int8")
+            ethosu_write_2 = T.decl_buffer([184800], "int8")
+            ethosu_write_3 = T.decl_buffer([174688], "int8")
+            ethosu_write_4 = T.decl_buffer([174688], "int8")
+            ethosu_write_5 = T.decl_buffer([174688], "int8")
             with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
                 T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_global[0], dtype="handle"))
             with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
@@ -589,17 +589,17 @@ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208
             nn_4 = T.var("int32")
             nn_5 = T.var("int32")
             # body
-            placeholder_d_d_global = T.allocate([208], "uint8", "global")
-            placeholder_d_d_global_1 = T.allocate([112], "uint8", "global")
-            placeholder_d_global = T.allocate([96], "uint8", "global")
-            ethosu_write_1 = T.allocate([195168], "int8", "global")
-            placeholder_local = T.allocate([256], "int8", "local")
-            ethosu_write_2 = T.allocate([184800], "int8", "global")
-            ethosu_write_3 = T.allocate([184800], "int8", "global")
-            ethosu_write_4 = T.allocate([184800], "int8", "global")
-            placeholder_d_local = T.allocate([256], "int8", "local")
-            ethosu_write_5 = T.allocate([184800], "int8", "global")
-            placeholder_d_d_local = T.allocate([256], "int8", "local")
+            placeholder_d_d_global = T.decl_buffer([208], "uint8")
+            placeholder_d_d_global_1 = T.decl_buffer([112], "uint8")
+            placeholder_d_global = T.decl_buffer([96], "uint8")
+            ethosu_write_1 = T.decl_buffer([195168], "int8")
+            placeholder_local = T.decl_buffer([256], "int8", scope="local")
+            ethosu_write_2 = T.decl_buffer([184800], "int8")
+            ethosu_write_3 = T.decl_buffer([184800], "int8")
+            ethosu_write_4 = T.decl_buffer([184800], "int8")
+            placeholder_d_local = T.decl_buffer([256], "int8", scope="local")
+            ethosu_write_5 = T.decl_buffer([184800], "int8")
+            placeholder_d_d_local = T.decl_buffer([256], "int8", scope="local")
             with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
                 T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_d_global[0], dtype="handle"))
             with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 73668):
@@ -639,17 +639,17 @@ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208
             nn_4 = T.var("int32")
             nn_5 = T.var("int32")
             # body
-            placeholder_d_d_global = T.allocate([208], "uint8", "global")
-            placeholder_d_d_global_1 = T.allocate([112], "uint8", "global")
-            placeholder_d_global = T.allocate([96], "uint8", "global")
-            ethosu_write_1 = T.allocate([195168], "int8", "global")
-            placeholder_local = T.allocate([256], "int8", "local")
-            ethosu_write_2 = T.allocate([184800], "int8", "global")
-            ethosu_write_3 = T.allocate([184800], "int8", "global")
-            ethosu_write_4 = T.allocate([184800], "int8", "global")
-            placeholder_d_local = T.allocate([256], "int8", "local")
-            ethosu_write_5 = T.allocate([184800], "int8", "global")
-            placeholder_d_d_local = T.allocate([256], "int8", "local")
+            placeholder_d_d_global = T.decl_buffer([208], "uint8")
+            placeholder_d_d_global_1 = T.decl_buffer([112], "uint8")
+            placeholder_d_global = T.decl_buffer([96], "uint8")
+            ethosu_write_1 = T.decl_buffer([195168], "int8")
+            placeholder_local = T.decl_buffer([256], "int8", scope="local")
+            ethosu_write_2 = T.decl_buffer([184800], "int8")
+            ethosu_write_3 = T.decl_buffer([184800], "int8")
+            ethosu_write_4 = T.decl_buffer([184800], "int8")
+            placeholder_d_local = T.decl_buffer([256], "int8", scope="local")
+            ethosu_write_5 = T.decl_buffer([184800], "int8")
+            placeholder_d_d_local = T.decl_buffer([256], "int8", scope="local")
             with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
                 T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_d_global[0], dtype="handle"))
             with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 384):
diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py
index fd9f373739e1..6ffbf22312ff 100644
--- a/tests/python/contrib/test_ethosu/test_encode_constants.py
+++ b/tests/python/contrib/test_ethosu/test_encode_constants.py
@@ -43,8 +43,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
         buffer7 = T.buffer_decl([144], "uint8")
         buffer8 = T.buffer_decl([32], "uint8")
         # body
-        p1 = T.allocate([160], "uint8", "global", annotations={"disable_lower_builtin":True})
-        p2 = T.allocate([144], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1_data = T.allocate([160], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1 = T.buffer_decl([160], "uint8", data=p1_data)
+        p2_data = T.allocate([144], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.buffer_decl([144], "uint8", data=p2_data)
         buffer9 = T.buffer_decl([144], "uint8", data=p1.data)
         T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 160, p1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 144, p2[0], dtype="handle"))
@@ -69,8 +71,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
         buffer_encoded_4_1 = T.buffer_decl([208], dtype="uint8")
         buffer_encoded_6_1 = T.buffer_decl([192], dtype="uint8")
         # body
-        p1 = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin":True})
-        p2 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1_data = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1 = T.buffer_decl([208], "uint8", data=p1_data)
+        p2_data = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.buffer_decl([192], "uint8", data=p2_data)
         p3 = T.buffer_decl([192], dtype="uint8", data=p1.data)
         T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 192, p3[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 192, p2[0], dtype="handle"))
@@ -149,8 +153,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer1 = T.buffer_decl([384], "uint8")
         # body
-        p1 = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True})
-        p2 = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1 = T.buffer_decl([384], "uint8", data=p1_data)
+        p2_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.buffer_decl([384], "uint8", data=p2_data)
         T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 384, p1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 384, p2[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, T.int8(-1), T.int8(-1), 12, p1[304], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -167,8 +173,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
         # buffer definition
         placeholder_encoded_1 = T.buffer_decl([464], "uint8")
         # body
-        p1 = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True})
-        p2 = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1_data = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1 = T.buffer_decl([464], "uint8", data=p1_data)
+        p2_data = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.buffer_decl([464], "uint8", data=p2_data)
         T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 464, p1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 464, p2[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -246,7 +254,8 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
         buffer_2 = T.buffer_decl([160], "uint8")
         buffer_3 = T.buffer_decl([80], "uint8")
         # body
-        ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data)
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, buffer_2[0], 160, T.int8(-1), T.int8(-1), 12, buffer_3[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
@@ -264,7 +273,8 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
         placeholder_encoded_2 = T.buffer_decl([208], dtype="uint8")
         placeholder_encoded_3 = T.buffer_decl([96], dtype="uint8")
         # body
-        ethosu_write_2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        ethosu_write_2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        ethosu_write_2 = T.buffer_decl([4096], "int8", data=ethosu_write_2_data)
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded[0], 304, placeholder_encoded[304], 304, 12, placeholder_encoded_1[0], 80, placeholder_encoded_1[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded_2[0], 112, placeholder_encoded_2[112], 96, 12, placeholder_encoded_3[0], 48, placeholder_encoded_3[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
@@ -340,9 +350,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(112,)
         buffer10 = T.buffer_decl([160], "uint8")
         buffer11 = T.buffer_decl([2048], "int8")
         # body
-        p1 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True})
-        p3 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
-        p2 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1_data = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1 = T.buffer_decl([112], "uint8", data=p1_data)
+        p3_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        p3 = T.buffer_decl([4096], "int8", data=p3_data)
+        p2_data = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.buffer_decl([112], "uint8", data=p2_data)
         T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 112, p1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 592, T.int8(-1), T.int8(-1), 12, buffer10[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 112, p2[0], dtype="handle"))
@@ -369,9 +382,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(128,)
         buffer4 = T.buffer_decl([608], dtype="uint8")
         buffer5 = T.buffer_decl([160], dtype="uint8")
         buffer6 = T.buffer_decl([2048], dtype="int8")
-        p1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
-        p2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
-        p3 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1 = T.buffer_decl([128], "uint8", data=p1_data)
+        p2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.buffer_decl([4096], "int8", data=p2_data)
+        p3_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p3 = T.buffer_decl([128], "uint8", data=p3_data)
         T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer4[0], 304, buffer4[304], 304, 12, buffer5[0], 80, buffer5[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p3[0], dtype="handle"))
diff --git a/tests/python/contrib/test_ethosu/test_hoist_allocates.py b/tests/python/contrib/test_ethosu/test_hoist_allocates.py
index b54b92950180..6c6d51fa06b9 100644
--- a/tests/python/contrib/test_ethosu/test_hoist_allocates.py
+++ b/tests/python/contrib/test_ethosu/test_hoist_allocates.py
@@ -116,15 +116,20 @@ def main(placeholder: T.Buffer[(3402,), "int8"], placeholder_encoded: T.Buffer[(
             T.preflattened_buffer(placeholder_encoded_3, [3, 10], dtype="uint8")
             T.preflattened_buffer(ethosu_write, [1, 27, 42, 3], dtype="int8", data=ethosu_write.data)
             # body
-            placeholder_global = T.allocate([128], "uint8", "global")
+            placeholder_global_data = T.allocate([128], "uint8", "global")
+            placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data)
             T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 128, placeholder_global[0], dtype="handle"))
-            placeholder_d_global = T.allocate([32], "uint8", "global")
+            placeholder_d_global_data = T.allocate([32], "uint8", "global")
+            placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data)
             T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 32, placeholder_d_global[0], dtype="handle"))
-            ethosu_write_2 = T.allocate([18144], "int8", "global")
+            ethosu_write_2_data = T.allocate([18144], "int8", "global")
+            ethosu_write_2 = T.buffer_decl([18144], "int8", data=ethosu_write_2_data)
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 27, 42, 3, 27, 0, 42, placeholder[0], 0, 0, 0, T.float32(0.0039215646684169769), -128, "NHWC", 126, 3, 1, "int8", 27, 42, 3, 27, 0, 42, ethosu_write_2[0], 0, 0, 0, T.float32(0.031308155506849289), -128, "NHCWB16", 672, 16, 1, 2, 3, 1, 1, 1, 2, placeholder_global[0], 128, 0, placeholder_d_global[0], 32, 2, 0, 2, 1, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
-            placeholder_d_global_1 = T.allocate([128], "uint8", "global")
+            placeholder_d_global_1_data = T.allocate([128], "uint8", "global")
+            placeholder_d_global_1 = T.buffer_decl([128], "uint8", data=placeholder_d_global_1_data)
             T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 128, placeholder_d_global_1[0], dtype="handle"))
-            placeholder_d_global_2 = T.allocate([32], "uint8", "global")
+            placeholder_d_global_2_data = T.allocate([32], "uint8", "global")
+            placeholder_d_global_2 = T.buffer_decl([32], "uint8", data=placeholder_d_global_2_data)
             T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_3[0], 32, placeholder_d_global_2[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 27, 42, 3, 27, 0, 42, ethosu_write_2[0], 0, 0, 0, T.float32(0.031308155506849289), -128, "NHCWB16", 672, 16, 1, "int8", 27, 42, 3, 27, 0, 42, ethosu_write[0], 0, 0, 0, T.float32(0.23604340851306915), -128, "NHWC", 126, 3, 1, 2, 3, 1, 1, 1, 2, placeholder_d_global_1[0], 128, 0, placeholder_d_global_2[0], 32, 2, 0, 2, 1, "CLIP", -128, 127, "TFL", "NONE", dtype="handle"))
     # fmt: on
@@ -151,14 +156,18 @@ def main(placeholder: T.Buffer[(24,), "int8"], T_concat: T.Buffer[(24,), "int8"]
             T.preflattened_buffer(placeholder, [1, 2, 3, 4], dtype="int8", data=placeholder.data)
             T.preflattened_buffer(T_concat, [24], dtype="int8", data=T_concat.data)
             # body
-            ethosu_write = T.allocate([12], "int8", "global")
+            ethosu_write_data = T.allocate([12], "int8", "global")
+            ethosu_write = T.buffer_decl([12], "int8", data=ethosu_write_data)
             T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, placeholder[12], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
-            ethosu_write_1 = T.allocate([12], "int8", "global")
+            ethosu_write_1_data = T.allocate([12], "int8", "global")
+            ethosu_write_1 = T.buffer_decl([12], "int8", data=ethosu_write_1_data)
             T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write_1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
             T.evaluate(T.call_extern("ethosu_identity", "int8", 12, 1, 1, 12, 0, 1, ethosu_write_1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "int8", 12, 1, 1, 12, 0, 1, T_concat[12], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
-            ethosu_write_2 = T.allocate([12], "int8", "global")
+            ethosu_write_2_data = T.allocate([12], "int8", "global")
+            ethosu_write_2 = T.buffer_decl([12], "int8", data=ethosu_write_2_data)
             T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, placeholder[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
-            ethosu_write_3 = T.allocate([12], "int8", "global")
+            ethosu_write_3_data = T.allocate([12], "int8", "global")
+            ethosu_write_3 = T.buffer_decl([12], "int8", data=ethosu_write_3_data)
             T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
             T.evaluate(T.call_extern("ethosu_identity", "int8", 12, 1, 1, 12, 0, 1, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "int8", 12, 1, 1, 12, 0, 1, T_concat[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
     # fmt: on
@@ -185,24 +194,32 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
             T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
             T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
             # body
-            with T.allocate([128], "uint8", "global") as placeholder_global:
+            with T.allocate([128], "uint8", "global") as placeholder_global_data:
+                placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data)
                 T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 128, placeholder_global[0], dtype="handle"))
-                placeholder_d_global = T.allocate([32], "uint8", "global")
+                placeholder_d_global_data = T.allocate([32], "uint8", "global")
+                placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data)
                 T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle"))
                 T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 128, 12, placeholder_d_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-            with T.allocate([112], "uint8", "global") as placeholder_global_1:
+            with T.allocate([112], "uint8", "global") as placeholder_global_1_data:
+                placeholder_global_1 = T.buffer_decl([112], "uint8", data=placeholder_global_1_data)
                 T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2[0], 112, placeholder_global_1[0], dtype="handle"))
-                placeholder_d_global_1 = T.allocate([32], "uint8", "global")
+                placeholder_d_global_1_data = T.allocate([32], "uint8", "global")
+                placeholder_d_global_1 = T.buffer_decl([32], "uint8", data=placeholder_d_global_1_data)
                 T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3[0], 32, placeholder_d_global_1[0], dtype="handle"))
                 T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 112, 12, placeholder_d_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-            with T.allocate([112], "uint8", "global") as placeholder_global_2:
+            with T.allocate([112], "uint8", "global") as placeholder_global_2_data:
+                placeholder_global_2 = T.buffer_decl([112], "uint8", data=placeholder_global_2_data)
                 T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 112, placeholder_global_2[0], dtype="handle"))
-                placeholder_d_global_2 = T.allocate([32], "uint8", "global")
+                placeholder_d_global_2_data = T.allocate([32], "uint8", "global")
+                placeholder_d_global_2 = T.buffer_decl([32], "uint8", data=placeholder_d_global_2_data)
                 T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle"))
                 T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 112, 12, placeholder_d_global_2[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-            placeholder_global_3 = T.allocate([112], "uint8", "global")
+            placeholder_global_3_data = T.allocate([112], "uint8", "global")
+            placeholder_global_3 = T.buffer_decl([112], "uint8", data=placeholder_global_3_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 112, placeholder_global_3[0], dtype="handle"))
-            placeholder_d_global_3 = T.allocate([32], "uint8", "global")
+            placeholder_d_global_3_data = T.allocate([32], "uint8", "global")
+            placeholder_d_global_3 = T.buffer_decl([32], "uint8", data=placeholder_d_global_3_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_3[0], 112, 12, placeholder_d_global_3[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     # fmt: on
@@ -227,13 +244,20 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
             T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
             T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
             # body
-            placeholder_global = T.allocate([128], "uint8", "global")
-            placeholder_global_1 = T.allocate([112], "uint8", "global")
-            placeholder_global_2 = T.allocate([112], "uint8", "global")
-            placeholder_d_global = T.allocate([32], "uint8", "global")
-            placeholder_d_global_1 = T.allocate([32], "uint8", "global")
-            placeholder_d_global_2 = T.allocate([32], "uint8", "global")
-            placeholder_global_3 = T.allocate([112], "uint8", "global")
+            placeholder_global_data = T.allocate([128], "uint8", "global")
+            placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data)
+            placeholder_global_1_data = T.allocate([112], "uint8", "global")
+            placeholder_global_1 = T.buffer_decl([112], "uint8", data=placeholder_global_1_data)
+            placeholder_global_2_data = T.allocate([112], "uint8", "global")
+            placeholder_global_2 = T.buffer_decl([112], "uint8", data=placeholder_global_2_data)
+            placeholder_d_global_data = T.allocate([32], "uint8", "global")
+            placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data)
+            placeholder_d_global_1_data = T.allocate([32], "uint8", "global")
+            placeholder_d_global_1 = T.buffer_decl([32], "uint8", data=placeholder_d_global_1_data)
+            placeholder_d_global_2_data = T.allocate([32], "uint8", "global")
+            placeholder_d_global_2 = T.buffer_decl([32], "uint8", data=placeholder_d_global_2_data)
+            placeholder_global_3_data = T.allocate([112], "uint8", "global")
+            placeholder_global_3 = T.buffer_decl([112], "uint8", data=placeholder_global_3_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 128, placeholder_global[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 128, 12, placeholder_d_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -242,7 +266,8 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 112, 12, placeholder_d_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 112, placeholder_global_2[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle"))
-            placeholder_d_global_3 = T.allocate([32], "uint8", "global")
+            placeholder_d_global_3_data = T.allocate([32], "uint8", "global")
+            placeholder_d_global_3 = T.buffer_decl([32], "uint8", data=placeholder_d_global_3_data)
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 112, 12, placeholder_d_global_2[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 112, placeholder_global_3[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle"))
diff --git a/tests/python/contrib/test_ethosu/test_merge_constants.py b/tests/python/contrib/test_ethosu/test_merge_constants.py
index caf09abdb020..337b5c70d125 100644
--- a/tests/python/contrib/test_ethosu/test_merge_constants.py
+++ b/tests/python/contrib/test_ethosu/test_merge_constants.py
@@ -44,8 +44,10 @@ def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"])
             buffer1 = T.buffer_decl([8192], "int8")
             buffer10 = T.buffer_decl([2048], "int8")
             # body
-            p1 = T.allocate([128], "uint8", "global")
-            p4 = T.allocate([32], "uint8", "global")
+            p1_data = T.allocate([128], "uint8", "global")
+            p1 = T.buffer_decl([128], "uint8", data=p1_data)
+            p4_data = T.allocate([32], "uint8", "global")
+            p4 = T.buffer_decl([32], "uint8", data=p4_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -60,7 +62,8 @@ def main(buffer2: T.Buffer[(160,), "uint8"]) -> None:
             buffer1 = T.buffer_decl([8192], "int8")
             buffer10 = T.buffer_decl([2048], "int8")
             # body
-            p4 = T.allocate([160], "uint8", "global")
+            p4_data = T.allocate([160], "uint8", "global")
+            p4 = T.buffer_decl([160], "uint8", data=p4_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 128, 12, p4[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     # fmt: on
@@ -86,14 +89,22 @@ def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"],
             buffer1 = T.buffer_decl([8192], "int8")
             buffer10 = T.buffer_decl([2048], "int8")
             # body
-            p1 = T.allocate([128], "uint8", "global")
-            p2 = T.allocate([112], "uint8", "global")
-            p3 = T.allocate([112], "uint8", "global")
-            p4 = T.allocate([32], "uint8", "global")
-            p5 = T.allocate([32], "uint8", "global")
-            p6 = T.allocate([32], "uint8", "global")
-            p7 = T.allocate([112], "uint8", "global")
-            p8 = T.allocate([3], "uint8", "global")
+            p1_data = T.allocate([128], "uint8", "global")
+            p1 = T.buffer_decl([128], "uint8", data=p1_data)
+            p2_data = T.allocate([112], "uint8", "global")
+            p2 = T.buffer_decl([112], "uint8", data=p2_data)
+            p3_data = T.allocate([112], "uint8", "global")
+            p3 = T.buffer_decl([112], "uint8", data=p3_data)
+            p4_data = T.allocate([32], "uint8", "global")
+            p4 = T.buffer_decl([32], "uint8", data=p4_data)
+            p5_data = T.allocate([32], "uint8", "global")
+            p5 = T.buffer_decl([32], "uint8", data=p5_data)
+            p6_data = T.allocate([32], "uint8", "global")
+            p6 = T.buffer_decl([32], "uint8", data=p6_data)
+            p7_data = T.allocate([112], "uint8", "global")
+            p7 = T.buffer_decl([112], "uint8", data=p7_data)
+            p8_data = T.allocate([3], "uint8", "global")
+            p8 = T.buffer_decl([3], "uint8", data=p8_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
@@ -117,10 +128,14 @@ def main(buffer2: T.Buffer[(160,), "uint8"], buffer4: T.Buffer[(144,), "uint8"],
             buffer1 = T.buffer_decl([8192], "int8")
             buffer10 = T.buffer_decl([2048], "int8")
             # body
-            p4 = T.allocate([160], "uint8", "global")
-            p7 = T.allocate([144], "uint8", "global")
-            p10 = T.allocate([144], "uint8", "global")
-            p11 = T.allocate([144], "uint8", "global")
+            p4_data = T.allocate([160], "uint8", "global")
+            p4 = T.buffer_decl([160], "uint8", data=p4_data)
+            p7_data = T.allocate([144], "uint8", "global")
+            p7 = T.buffer_decl([144], "uint8", data=p7_data)
+            p10_data = T.allocate([144], "uint8", "global")
+            p10 = T.buffer_decl([144], "uint8", data=p10_data)
+            p11_data = T.allocate([144], "uint8", "global")
+            p11 = T.buffer_decl([144], "uint8", data=p11_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 144, p7[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 128, 12, p4[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -159,13 +174,15 @@ def test_operators_with_and_without_weights():
     class InputModule:
         @T.prim_func
         def main(buffer2: T.Buffer[(80,), "uint8"], buffer3: T.Buffer[(64,), "uint8"]) -> None:
-            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             buffer0 = T.buffer_decl([390336], "int8")
             buffer1 = T.buffer_decl([97156], "int8")
             buffer6 = T.buffer_decl([390336], "int8")
             # body
-            p2 = T.allocate([80], "uint8", "global")
-            p3 = T.allocate([64], "uint8", "global")
+            p2_data = T.allocate([80], "uint8", "global")
+            p2 = T.buffer_decl([80], "uint8", data=p2_data)
+            p3_data = T.allocate([64], "uint8", "global")
+            p3 = T.buffer_decl([64], "uint8", data=p3_data)
             T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
@@ -176,12 +193,13 @@ def main(buffer2: T.Buffer[(80,), "uint8"], buffer3: T.Buffer[(64,), "uint8"]) -
     class ReferenceModule:
         @T.prim_func
         def main(buffer2: T.Buffer[(144,), "uint8"]) -> None:
-            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             buffer0 = T.buffer_decl([390336], "int8")
             buffer1 = T.buffer_decl([97156], "int8")
             buffer6 = T.buffer_decl([390336], "int8")
             # body
-            p3 = T.allocate([144], "uint8", "global")
+            p3_data = T.allocate([144], "uint8", "global")
+            p3 = T.buffer_decl([144], "uint8", data=p3_data)
             T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 144, p3[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, buffer6[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p3[0], 80, 0, p3[80], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -203,8 +221,8 @@ def test_copy_to_buffer_with_local_scope():
     @tvm.script.ir_module
     class InputModule:
         @T.prim_func
-        def main(buffer1: T.Buffer[(64,), "uint8"], 
-        buffer2: T.Buffer[(48,), "uint8"], 
+        def main(buffer1: T.Buffer[(64,), "uint8"],
+        buffer2: T.Buffer[(48,), "uint8"],
         buffer3: T.Buffer[(256,), "uint8"],
         buffer4: T.Buffer[(256,), "uint8"],
         buffer5: T.Buffer[(16,), "uint8"],
@@ -215,12 +233,18 @@ def main(buffer1: T.Buffer[(64,), "uint8"],
         ) -> None:
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             # body
-            p1 = T.allocate([48], "uint8", "global")
-            p2 = T.allocate([48], "uint8", "global")
-            p3 = T.allocate([256], "int8", "local")
-            p5 = T.allocate([16], "uint8", "global")
-            p6 = T.allocate([48], "uint8", "global")
-            p7 = T.allocate([256], "int8", "local")
+            p1_data = T.allocate([48], "uint8", "global")
+            p1 = T.buffer_decl([48], "uint8", data=p1_data)
+            p2_data = T.allocate([48], "uint8", "global")
+            p2 = T.buffer_decl([48], "uint8", data=p2_data)
+            p3_data = T.allocate([256], "int8", "local")
+            p3 = T.buffer_decl([256], "int8", data=p3_data, scope="local")
+            p5_data = T.allocate([16], "uint8", "global")
+            p5 = T.buffer_decl([16], "uint8", data=p5_data)
+            p6_data = T.allocate([48], "uint8", "global")
+            p6 = T.buffer_decl([48], "uint8", data=p6_data)
+            p7_data = T.allocate([256], "int8", "local")
+            p7 = T.buffer_decl([256], "int8", data=p7_data, scope="local")
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) # Local
@@ -234,8 +258,8 @@ def main(buffer1: T.Buffer[(64,), "uint8"],
     @tvm.script.ir_module
     class ReferenceModule:
         @T.prim_func
-        def main(buffer1: T.Buffer[(64,), "uint8"], 
-            buffer2: T.Buffer[(96,), "uint8"], 
+        def main(buffer1: T.Buffer[(64,), "uint8"],
+            buffer2: T.Buffer[(96,), "uint8"],
             buffer4: T.Buffer[(256,), "uint8"],
             buffer5: T.Buffer[(64,), "uint8"],
             buffer7: T.Buffer[(256,), "uint8"],
@@ -244,10 +268,14 @@ def main(buffer1: T.Buffer[(64,), "uint8"],
             ) -> None:
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             # body
-            p1 = T.allocate([96], "uint8", "global")
-            p2 = T.allocate([64], "uint8", "global")
-            p3 = T.allocate([256], "int8", "local")
-            p7 = T.allocate([256], "int8", "local")
+            p1_data = T.allocate([96], "uint8", "global")
+            p1 = T.buffer_decl([96], "uint8", data=p1_data)
+            p2_data = T.allocate([64], "uint8", "global")
+            p2 = T.buffer_decl([64], "uint8", data=p2_data)
+            p3_data = T.allocate([256], "int8", "local")
+            p3 = T.buffer_decl([256], "int8", data=p3_data, scope="local")
+            p7_data = T.allocate([256], "int8", "local")
+            p7 = T.buffer_decl([256], "int8", data=p7_data, scope="local")
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p1[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) # Local
             T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 64, p2[0], dtype="handle"))
@@ -287,10 +315,11 @@ def main() -> None:
             placeholder = T.buffer_decl([20], "int8")
             ethosu_write = T.buffer_decl([16], "int8")
             # body
-            ethosu_write_4 = T.allocate([16], "int8", "global")
+            ethosu_write_4_data = T.allocate([16], "int8", "global")
+            ethosu_write_4 = T.buffer_decl([16], "int8", data=ethosu_write_4_data)
             T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 1, 4, 4, 1, 0, 4, placeholder[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "int8", 1, 4, 1, 1, 0, 4, placeholder[16], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 1, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "MAX", 0, "CLIP", -128, 127, "TFL", 1, 4, 4, dtype="handle"))
             T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-    
+
     @tvm.script.ir_module
     class ReferenceModule:
         @T.prim_func
@@ -300,7 +329,8 @@ def main() -> None:
             placeholder = T.buffer_decl([20], "int8")
             ethosu_write = T.buffer_decl([16], "int8")
             # body
-            ethosu_write_4 = T.allocate([16], "int8", "global")
+            ethosu_write_4_data = T.allocate([16], "int8", "global")
+            ethosu_write_4 = T.buffer_decl([16], "int8", data=ethosu_write_4_data)
             T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 1, 4, 4, 1, 0, 4, placeholder[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "int8", 1, 4, 1, 1, 0, 4, placeholder[16], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 1, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "MAX", 0, "CLIP", -128, 127, "TFL", 1, 4, 4, dtype="handle"))
             T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     # fmt: on
@@ -324,8 +354,10 @@ def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"])
             buffer1 = T.buffer_decl([8192], "int8")
             buffer10 = T.buffer_decl([2048], "int8")
             # body
-            p1 = T.allocate([128], "uint8", "global")
-            p4 = T.allocate([32], "uint8", "global")
+            p1_data = T.allocate([128], "uint8", "global")
+            p1 = T.buffer_decl([128], "uint8", data=p1_data)
+            p4_data = T.allocate([32], "uint8", "global")
+            p4 = T.buffer_decl([32], "uint8", data=p4_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -343,7 +375,8 @@ def main(buffer2: T.Buffer[(160,), "uint8"]) -> None:
             buffer1 = T.buffer_decl([8192], "int8")
             buffer10 = T.buffer_decl([2048], "int8")
             # body
-            p5 = T.allocate([160], "uint8", "global")
+            p5_data = T.allocate([160], "uint8", "global")
+            p5 = T.buffer_decl([160], "uint8", data=p5_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p5[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5[0], 128, 12, p5[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p5[0], dtype="handle"))
@@ -373,8 +406,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint
             T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
             T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
             # body
-            p1 = T.allocate([368], "uint8", "global")
-            p2 = T.allocate([96], "uint8", "global") 
+            p1_data = T.allocate([368], "uint8", "global")
+            p1 = T.buffer_decl([368], "uint8", data=p1_data)
+            p2_data = T.allocate([96], "uint8", "global")
+            p2 = T.buffer_decl([96], "uint8", data=p2_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p2[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p2[0], 48, p2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -388,7 +423,8 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             # body
-            p1 = T.allocate([464], "uint8", "global")
+            p1_data = T.allocate([464], "uint8", "global")
+            p1 = T.buffer_decl([464], "uint8", data=p1_data)
             T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 464, p1[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
@@ -428,14 +464,22 @@ def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"],
             buffer1 = T.buffer_decl([8192], "int8")
             buffer10 = T.buffer_decl([2048], "int8")
             # body
-            p1 = T.allocate([128], "uint8", "global")
-            p2 = T.allocate([112], "uint8", "global")
-            p3 = T.allocate([112], "uint8", "global")
-            p4 = T.allocate([32], "uint8", "global")
-            p5 = T.allocate([32], "uint8", "global")
-            p6 = T.allocate([32], "uint8", "global")
-            p7 = T.allocate([112], "uint8", "global")
-            p8 = T.allocate([3], "uint8", "global")
+            p1_data = T.allocate([128], "uint8", "global")
+            p1 = T.buffer_decl([128], "uint8", data=p1_data)
+            p2_data = T.allocate([112], "uint8", "global")
+            p2 = T.buffer_decl([112], "uint8", data=p2_data)
+            p3_data = T.allocate([112], "uint8", "global")
+            p3 = T.buffer_decl([112], "uint8", data=p3_data)
+            p4_data = T.allocate([32], "uint8", "global")
+            p4 = T.buffer_decl([32], "uint8", data=p4_data)
+            p5_data = T.allocate([32], "uint8", "global")
+            p5 = T.buffer_decl([32], "uint8", data=p5_data)
+            p6_data = T.allocate([32], "uint8", "global")
+            p6 = T.buffer_decl([32], "uint8", data=p6_data)
+            p7_data = T.allocate([112], "uint8", "global")
+            p7 = T.buffer_decl([112], "uint8", data=p7_data)
+            p8_data = T.allocate([3], "uint8", "global")
+            p8 = T.buffer_decl([3], "uint8", data=p8_data)
             with T.attr(T.iter_var(v1a, None, "DataPar", ""), "pragma_compute_cycles_hint", 100):
                 T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
             with T.attr(T.iter_var(v1b, None, "DataPar", ""), "pragma_compute_cycles_hint", 101):
@@ -479,10 +523,14 @@ def main(buffer2: T.Buffer[(160,), "uint8"], buffer4: T.Buffer[(144,), "uint8"],
             buffer1 = T.buffer_decl([8192], "int8")
             buffer10 = T.buffer_decl([2048], "int8")
             # body
-            p4 = T.allocate([160], "uint8", "global")
-            p7 = T.allocate([144], "uint8", "global")
-            p10 = T.allocate([144], "uint8", "global")
-            p11 = T.allocate([144], "uint8", "global")
+            p4_data = T.allocate([160], "uint8", "global")
+            p4 = T.buffer_decl([160], "uint8", data=p4_data)
+            p7_data = T.allocate([144], "uint8", "global")
+            p7 = T.buffer_decl([144], "uint8", data=p7_data)
+            p10_data = T.allocate([144], "uint8", "global")
+            p10 = T.buffer_decl([144], "uint8", data=p10_data)
+            p11_data = T.allocate([144], "uint8", "global")
+            p11 = T.buffer_decl([144], "uint8", data=p11_data)
             with T.attr(T.iter_var(v1a, None, "DataPar", ""), "pragma_compute_cycles_hint", 201):
                 T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle"))
             with T.attr(T.iter_var(v2a, None, "DataPar", ""), "pragma_compute_cycles_hint", 205):
diff --git a/tests/python/contrib/test_ethosu/test_remove_concatenates.py b/tests/python/contrib/test_ethosu/test_remove_concatenates.py
index d2c759a0ae4d..e6414c24d4a3 100644
--- a/tests/python/contrib/test_ethosu/test_remove_concatenates.py
+++ b/tests/python/contrib/test_ethosu/test_remove_concatenates.py
@@ -42,7 +42,8 @@ def main(placeholder: T.Buffer[(1536,), "int8"], placeholder_1: T.Buffer[(1280,)
         buffer_6 = T.buffer_decl([2992], "uint8")
         buffer_7 = T.buffer_decl([160], "uint8")
         # body
-        T_concat_1 = T.allocate([2816], "int8", "global", annotations={"disable_lower_builtin":True})
+        T_concat_1_data = T.allocate([2816], "int8", "global", annotations={"disable_lower_builtin":True})
+        T_concat_1 = T.buffer_decl([2816], "int8", data=T_concat_1_data)
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 10, placeholder_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 160, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer[0], 2992, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.5), 10, "NHWC", 352, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat[352], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 16, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 2992, T.int8(-1), T.int8(-1), 12, buffer_3[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 12, 16, 8, 0, 12, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 192, 16, 1, "int8", 8, 12, 16, 8, 0, 12, T_concat_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer_4[0], 2992, T.int8(-1), T.int8(-1), 12, buffer_5[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
index 46a3c5a15bf5..ae46057369e0 100644
--- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py
+++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
@@ -374,7 +374,8 @@ def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512,
         buffer_2 = T.buffer_decl([320], "uint8")
         buffer_3 = T.buffer_decl([160], "uint8")
         # body
-        ethosu_write_2 = T.allocate([1024], "int8", "global", annotations={"disable_lower_builtin": True})
+        ethosu_write_2_data = T.allocate([1024], "int8", "global", annotations={"disable_lower_builtin": True})
+        ethosu_write_2 = T.buffer_decl([1024], "int8", data=ethosu_write_2_data)
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, buffer_3[0], 160, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 32, 8, 0, 4, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 128, 32, 1, "int8", 8, 4, 8, 8, 0, 4, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, buffer[0], 304, T.int8(-1), T.int8(-1), 12, buffer_1[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, placeholder_5[12], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, buffer_3[0], 160, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -393,7 +394,8 @@ def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512,
         buffer_2 = T.buffer_decl([1312], "uint8")
         buffer_3 = T.buffer_decl([2608], "uint8")
         # body
-        ethosu_write_2 = T.allocate([1536], "int8", "global", annotations={"disable_lower_builtin": True})
+        ethosu_write_2_data = T.allocate([1536], "int8", "global", annotations={"disable_lower_builtin": True})
+        ethosu_write_2 = T.buffer_decl([1536], "int8", data=ethosu_write_2_data)
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, buffer_3[0], 2608, T.int8(-1), T.int8(-1), 12, buffer[0], 80, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[48], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -412,7 +414,8 @@ def main(placeholder_5: T.Buffer[(768,), "int8"], ethosu_write_1: T.Buffer[(640,
         buffer_2 = T.buffer_decl([320], "uint8")
         buffer_3 = T.buffer_decl([880], "uint8")
         # body
-        ethosu_write_2 = T.allocate([2560], "int8", "global", annotations={"disable_lower_builtin": True})
+        ethosu_write_2_data = T.allocate([2560], "int8", "global", annotations={"disable_lower_builtin": True})
+        ethosu_write_2 = T.buffer_decl([2560], "int8", data=ethosu_write_2_data)
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 16, 3, 8, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 48, 3, 1, "int8", 8, 8, 32, 8, 0, 8, ethosu_write_2[512], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 2, 3, 2, 1, 2, 1, buffer_3[0], 880, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 2, 1, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 32, 8, 0, 8, ethosu_write_2[512], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 8, 4, 8, 8, 0, 4, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 8, 1, 2, 3, 2, 1, 2, 1, buffer[0], 1744, T.int8(-1), T.int8(-1), 12, buffer_1[0], 80, T.int8(-1), T.int8(-1), 2, 1, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 12, 16, 3, 12, 0, 16, placeholder_5[192], 0, 0, 0, T.float32(0.5), 10, "NHWC", 48, 3, 1, "int8", 10, 8, 32, 10, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 2, 3, 2, 1, 2, 1, buffer_3[0], 880, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 0, 1, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -433,7 +436,8 @@ def main(placeholder_5: T.Buffer[(1024,), "int8"], ethosu_write_1: T.Buffer[(204
         buffer_2 = T.buffer_decl([272], "uint8")
         buffer_3 = T.buffer_decl([11040], "uint8")
         # body
-        ethosu_write_2 = T.allocate([2304], "int8", "global", annotations={"disable_lower_builtin": True})
+        ethosu_write_2_data = T.allocate([2304], "int8", "global", annotations={"disable_lower_builtin": True})
+        ethosu_write_2 = T.buffer_decl((2304,), "int8", data=ethosu_write_2_data)
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, buffer_3[0], 11040, T.int8(-1), T.int8(-1), 12, buffer_2[0], 272, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[256], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -452,7 +456,8 @@ def main(placeholder: T.Buffer[(192,), "int8"], ethosu_write: T.Buffer[(8192,),
         buffer_2 = T.buffer_decl([304], "uint8")
         buffer_3 = T.buffer_decl([80], "uint8")
         # body
-        ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data)
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 8, 3, 4, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 32, 1, 1, 1, 1, 1, 1, 1, buffer[0], 160, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 32, 8, 16, 0, 32, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 8, 1, 1, 1, 1, 1, 1, 1, buffer_2[0], 304, T.int8(-1), T.int8(-1), 12, buffer_3[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 8, 3, 4, 0, 8, placeholder[96], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 32, 1, 1, 1, 1, 1, 1, 1, buffer[0], 160, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle"))
@@ -471,7 +476,8 @@ def main(placeholder: T.Buffer[(1024,), "int8"], ethosu_write: T.Buffer[(32768,)
         buffer_2 = T.buffer_decl([11040], "uint8")
         buffer_3 = T.buffer_decl([272], "uint8")
         # body
-        ethosu_write_1 = T.allocate([12288], "int8", "global", annotations={"disable_lower_builtin":True})
+        ethosu_write_1_data = T.allocate([12288], "int8", "global", annotations={"disable_lower_builtin":True})
+        ethosu_write_1 = T.buffer_decl([12288], "int8", data=ethosu_write_1_data)
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 3, 8, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 16, 16, 35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 768, 16, 256, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 768, 16, 256, "int8", 32, 32, 26, 32, 0, 32, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 1024, 16, 512, 3, 3, 1, 1, 1, 1, buffer_2[0], 11040, T.int8(-1), T.int8(-1), 12, buffer_3[0], 272, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py
index 6b97b38d80e6..8c7ff35272ef 100644
--- a/tests/python/contrib/test_ethosu/test_replace_copy.py
+++ b/tests/python/contrib/test_ethosu/test_replace_copy.py
@@ -36,7 +36,8 @@ def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(204
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer_1 = T.buffer_decl([384], "uint8")
         # body
-        placeholder_global = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin": True})
+        placeholder_global_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin": True})
+        placeholder_global = T.buffer_decl([384], "uint8", data=placeholder_global_data)
         T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 384, placeholder_global[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_global[304], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
@@ -78,8 +79,10 @@ def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(409
         buffer = T.buffer_decl([528], "uint8")
         buffer_2 = T.buffer_decl([336], "uint8")
         # body
-        placeholder_d_global = T.allocate([528], "uint8", "global", annotations={"disable_lower_builtin": True})
-        placeholder_d_global_1 = T.allocate([336], "uint8", "global", annotations={"disable_lower_builtin": True})
+        placeholder_d_global_data = T.allocate([528], "uint8", "global", annotations={"disable_lower_builtin": True})
+        placeholder_d_global = T.buffer_decl([528], "uint8", data=placeholder_d_global_data)
+        placeholder_d_global_1_data = T.allocate([336], "uint8", "global", annotations={"disable_lower_builtin": True})
+        placeholder_d_global_1 = T.buffer_decl([336], "uint8", data=placeholder_d_global_1_data)
         T.evaluate(T.call_extern("ethosu_copy", buffer[0], 528, placeholder_d_global[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 336, placeholder_d_global_1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_d_global[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global[416], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py
index ba050de2b473..254abab644a2 100644
--- a/tests/python/contrib/test_ethosu/test_scheduler.py
+++ b/tests/python/contrib/test_ethosu/test_scheduler.py
@@ -184,10 +184,14 @@ def main(placeholder: T.Buffer[(301056,), "int8"], ethosu_write: T.Buffer[(75264
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer1 = T.buffer_decl([2848], "uint8")
         buffer3 = T.buffer_decl([976], "uint8")
-        p1 = T.allocate([2848], "uint8", "global", annotations={"disable_lower_builtin":True})
-        p2 = T.allocate([976], "uint8", "global", annotations={"disable_lower_builtin":True})
-        p5 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True})
-        p6 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True})
+        p1_data = T.allocate([2848], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p1 = T.buffer_decl([2848], "uint8", data=p1_data)
+        p2_data = T.allocate([976], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.buffer_decl([976], "uint8", data=p2_data)
+        p5_data = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True})
+        p5 = T.buffer_decl([75264], "int8", data=p5_data)
+        p6_data = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True})
+        p6 = T.buffer_decl([75264], "int8", data=p6_data)
         T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2848, p1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 976, p2[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, T.int8(-1), T.int8(-1), 12, p1[2608], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py
index e1a0e143281b..f8a84aa08367 100644
--- a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py
+++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py
@@ -56,8 +56,8 @@ def main(placeholder_6: T.Buffer[(192,), "int8"], ethosu_conv2d_1: T.Buffer[(512
         placeholder_8 = T.buffer_decl([1], "uint8")
         placeholder_5 = T.buffer_decl([1], "uint8")
         # body
-        ethosu_conv2d_2 = T.allocate([1024], "uint8", "global")
-        ethosu_conv2d_3 = T.allocate([2048], "uint8", "global")
+        ethosu_conv2d_2 = T.decl_buffer([1024], "uint8")
+        ethosu_conv2d_3 = T.decl_buffer([2048], "uint8")
         T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 4, 8, 3, 4, 0, 8, placeholder_6[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "uint8", 4, 8, 32, 4, 0, 8, ethosu_conv2d_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 1, 1, 1, 1, 1, 1, placeholder_7[0], 0, T.int8(-1), T.int8(-1), 12, placeholder_8[0], 0, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="uint8"))
         T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 4, 8, 32, 4, 0, 8, ethosu_conv2d_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "uint8", 4, 8, 8, 4, 0, 8, ethosu_conv2d_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_9[0], 0, T.int8(-1), T.int8(-1), 12, placeholder_5[0], 0, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "CLIP", 0, 255, "TFL", "NONE", 0, 0, 0, dtype="uint8"))
         T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 4, 8, 3, 4, 0, 8, placeholder_6[96], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "uint8", 4, 8, 32, 4, 0, 8, ethosu_conv2d_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 1, 1, 1, 1, 1, 1, placeholder_7[0], 0, T.int8(-1), T.int8(-1), 12, placeholder_8[0], 0, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "CLIP", 0, 255, "TFL", "NONE", 0, 0, 0, dtype="uint8"))
@@ -76,8 +76,8 @@ def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_conv2d_1: T.Buffer[(20
         placeholder_5 = T.buffer_decl([1], "int32")
         placeholder_4 = T.buffer_decl([1], "uint8")
         # body
-        placeholder_global = T.allocate([256], "uint8", "global")
-        placeholder_d_global = T.allocate([8], "int32", "global")
+        placeholder_global = T.decl_buffer([256], "uint8")
+        placeholder_d_global = T.decl_buffer([8], "int32")
         T.evaluate(T.call_extern("ethosu_copy", placeholder_4[0], 256,  placeholder_global[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", placeholder_5[0], 8, placeholder_d_global[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "uint8", 16, 16, 8, 16, 0, 16, ethosu_conv2d_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 0, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 0, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "CLIP", 0, 255, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -110,8 +110,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
                                        buffer_6.name: buffer_6,
                                        buffer_7.name: buffer_7}})
         # body
-        placeholder_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
+        placeholder_global_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
+        placeholder_global = T.decl_buffer([128], "uint8", data=placeholder_global_data)
+        placeholder_d_global_data = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
+        placeholder_d_global = T.decl_buffer([32], "uint8", data=placeholder_d_global_data)
         T.evaluate(T.call_extern("ethosu_copy", buffer[0], 128, placeholder_global[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 32, placeholder_d_global[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 128, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -158,9 +160,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
                                    buffer_8.name: buffer_8,
                                    buffer_9.name: buffer_9}})
         # body
-        ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
+        ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data)
+        placeholder_global_data = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True})
+        placeholder_global = T.buffer_decl([80], "uint8", data=placeholder_global_data)
+        placeholder_d_global_data = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
+        placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data)
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 80, placeholder_global[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, placeholder_d_global[0], dtype="handle"))
@@ -678,10 +683,10 @@ def main(placeholder_4: T.Buffer[(2048,), "int8"], ethosu_write_1: T.Buffer[(16,
                                    buffer_1.name: buffer_1,
                                    buffer_2.name: buffer_2}})
         # body
-        placeholder_global = T.allocate([272], "uint8", "global")
-        placeholder_d_global = T.allocate([160], "uint8", "global")
-        ethosu_write_2 = T.allocate([16], "int16", "global")
-        placeholder_d_global_1 = T.allocate([1], "int16", "global")
+        placeholder_global = T.decl_buffer([272], "uint8")
+        placeholder_d_global = T.decl_buffer([160], "uint8")
+        ethosu_write_2 = T.decl_buffer([16], "int16")
+        placeholder_d_global_1 = T.decl_buffer([1], "int16")
         T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 272, placeholder_global[0], dtype="uint8"))
         T.evaluate(T.call_extern("ethosu_copy", buffer[0], 160, placeholder_d_global[0], dtype="uint8"))
         T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 16, 16, 8, 0, 16, placeholder_4[0], 0, 0, 0, T.float32(0.0039215548895299435), -128, "NHWC", 256, 16, 1, "int16", 1, 1, 16, 1, 0, 1, ethosu_write_2[0], 0, 0, 0, T.float32(0.0023205536417663097), -128, "NHWC", 1, 1, 1, 16, 8, 1, 1, 1, 1, placeholder_global[0], 272, 0, placeholder_d_global[0], 160, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="int16"))
diff --git a/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py b/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py
index 0b1e0f402b9d..e7632561c05c 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py
@@ -63,9 +63,9 @@ def main(a: T.handle, b: T.handle) -> None:
         B = T.match_buffer(b, [14*14*512*256], dtype="float32")
         # body
         T.launch_thread(blockIdx_z, 196)
-        B_local = T.allocate([64], "float32", "local")
-        Apad_shared = T.allocate([512], "float32", "shared")
-        Apad_shared_local = T.allocate([8], "float32", "local")
+        B_local = T.decl_buffer([64], "float32", scope="local")
+        Apad_shared = T.decl_buffer([512], "float32", scope="shared")
+        Apad_shared_local = T.decl_buffer([8], "float32", scope="local")
         T.launch_thread(blockIdx_y, 8)
         T.launch_thread(blockIdx_x, 4)
         T.launch_thread(threadIdx_y, 8)
@@ -105,9 +105,9 @@ def main(a: T.handle, b: T.handle) -> None:
         B = T.match_buffer(b, [14*14*512*256], dtype="float32")
         # body
         T.launch_thread(blockIdx_z, 196)
-        B_local = T.allocate([6400000], "float32", "local")
-        Apad_shared = T.allocate([512], "float32", "shared")
-        Apad_shared_local = T.allocate([8], "float32", "local")
+        B_local = T.decl_buffer([6400000], "float32", scope="local")
+        Apad_shared = T.decl_buffer([512], "float32", scope="shared")
+        Apad_shared_local = T.decl_buffer([8], "float32", scope="local")
         T.launch_thread(blockIdx_y, 8)
         T.launch_thread(blockIdx_x, 4)
         T.launch_thread(threadIdx_y, 8)
@@ -151,9 +151,9 @@ def main(a: T.handle, b: T.handle) -> None:
         B = T.match_buffer(b, [14*14*512*256], dtype="float32")
         # body
         T.launch_thread(blockIdx_z, 196)
-        B_local = T.allocate([64], "float32", "local")
-        Apad_shared = T.allocate([512000], "float32", "shared")
-        Apad_shared_local = T.allocate([8], "float32", "local")
+        B_local = T.decl_buffer([64], "float32", scope="local")
+        Apad_shared = T.decl_buffer([512000], "float32", scope="shared")
+        Apad_shared_local = T.decl_buffer([8], "float32", scope="local")
         T.launch_thread(blockIdx_y, 8)
         T.launch_thread(blockIdx_x, 4)
         T.launch_thread(threadIdx_y, 8)
@@ -197,9 +197,9 @@ def main(a: T.handle, b: T.handle) -> None:
         B = T.match_buffer(b, [14*14*512*256], dtype="float32")
         # body
         T.launch_thread(blockIdx_z, 196)
-        B_local = T.allocate([64], "float32", "local")
-        Apad_shared = T.allocate([512], "float32", "shared")
-        Apad_shared_local = T.allocate([8], "float32", "local")
+        B_local = T.decl_buffer([64], "float32", scope="local")
+        Apad_shared = T.decl_buffer([512], "float32", scope="shared")
+        Apad_shared_local = T.decl_buffer([8], "float32", scope="local")
         T.launch_thread(blockIdx_y, 8)
         T.launch_thread(blockIdx_x, 4)
         T.launch_thread(threadIdx_y, 8)
diff --git a/tests/python/unittest/test_tir_analysis_calculate_workspace.py b/tests/python/unittest/test_tir_analysis_calculate_workspace.py
index 1d78458b930d..12c892a04b07 100644
--- a/tests/python/unittest/test_tir_analysis_calculate_workspace.py
+++ b/tests/python/unittest/test_tir_analysis_calculate_workspace.py
@@ -31,8 +31,8 @@ def primfunc_global_allocates(placeholder_144: T.handle, placeholder_145: T.hand
     placeholder_149 = T.match_buffer(placeholder_146, [512], dtype="int32", elem_offset=0, align=64, offset_factor=1)
     T_cast_49 = T.match_buffer(T_cast_48, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1)
     # body
-    PaddedInput_22 = T.allocate([131072], "int16", "global")
-    DepthwiseConv2d_9 = T.allocate([100352], "int32", "global")
+    PaddedInput_22 = T.decl_buffer([131072], "int16")
+    DepthwiseConv2d_9 = T.decl_buffer([100352], "int32")
     for i1_29, i2_39, i3_40 in T.grid(16, 16, 512):
         PaddedInput_22[(((i1_29*8192) + (i2_39*512)) + i3_40)] = T.if_then_else(((((1 <= i1_29) and (i1_29 < 15)) and (1 <= i2_39)) and (i2_39 < 15)), placeholder_147[((((i1_29*7168) + (i2_39*512)) + i3_40) - 7680)], T.int16(0), dtype="int16")
     for i_9, j_9, c_9 in T.grid(14, 14, 512):
@@ -63,25 +63,25 @@ def primfunc_local_allocates(placeholder_162: T.handle, placeholder_163: T.handl
     T_cast_77 = T.match_buffer(T_cast_76, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1)
     sid_21 = T.allocate_const([0,1,2,3,4,5,6,7], "int8", [8])
     # body
-    PaddedInput_25 = T.allocate([131072], "int16", "global")
+    PaddedInput_25 = T.decl_buffer([131072], "int16")
     for i1_35, i2_46, i3_47 in T.grid(16, 16, 512):
         PaddedInput_25[(((i1_35*8192) + (i2_46*512)) + i3_47)] = T.if_then_else(((((1 <= i1_35) and (i1_35 < 15)) and (1 <= i2_46)) and (i2_46 < 15)), placeholder_165[((((i1_35*7168) + (i2_46*512)) + i3_47) - 7680)], T.int16(0), dtype="int16")
-    T_add_11 = T.allocate([100352], "int32", "global")
-    with T.allocate([100352], "int32", "global") as DepthwiseConv2d_11:
+    T_add_11 = T.decl_buffer([100352], "int32")
+    with T.decl_buffer([100352], "int32") as DepthwiseConv2d_11:
         for i_11, j_11, c_11 in T.grid(14, 14, 512):
             DepthwiseConv2d_11[(((i_11*7168) + (j_11*512)) + c_11)] = 0
             for di_11, dj_11 in T.grid(3, 3):
                 DepthwiseConv2d_11[(((i_11*7168) + (j_11*512)) + c_11)] = (DepthwiseConv2d_11[(((i_11*7168) + (j_11*512)) + c_11)] + (PaddedInput_25[(((((i_11*8192) + (di_11*8192)) + (j_11*512)) + (dj_11*512)) + c_11)].astype("int32")*placeholder_166[(((di_11*1536) + (dj_11*512)) + c_11)].astype("int32")))
         for ax1_44, ax2_45, ax3_47 in T.grid(14, 14, 512):
             T_add_11[(((ax1_44*7168) + (ax2_45*512)) + ax3_47)] = (DepthwiseConv2d_11[(((ax1_44*7168) + (ax2_45*512)) + ax3_47)] + placeholder_167[ax3_47])
-    compute_22 = T.allocate([100352], "int32", "global")
-    with T.allocate([100352], "int32", "global") as T_cast_78:
+    compute_22 = T.decl_buffer([100352], "int32")
+    with T.decl_buffer([100352], "int32") as T_cast_78:
         for ax1_45, ax2_46, ax3_48 in T.grid(14, 14, 512):
             T_cast_78[(((ax1_45*7168) + (ax2_46*512)) + ax3_48)] = T_add_11[(((ax1_45*7168) + (ax2_46*512)) + ax3_48)]
         for i1_36, i2_47, i3_48 in T.grid(14, 14, 512):
             compute_22[(((i1_36*7168) + (i2_47*512)) + i3_48)] = T.q_multiply_shift(T_cast_78[(((i1_36*7168) + (i2_47*512)) + i3_48)], 1948805937, 31, -5, dtype="int32")
-    T_cast_79 = T.allocate([100352], "uint8", "global")
-    with T.allocate([100352], "int32", "global") as compute_23:
+    T_cast_79 = T.decl_buffer([100352], "uint8")
+    with T.decl_buffer([100352], "int32") as compute_23:
         for i1_37, i2_48, i3_49 in T.grid(14, 14, 512):
             compute_23[(((i1_37*7168) + (i2_48*512)) + i3_49)] = T.max(T.max(compute_22[(((i1_37*7168) + (i2_48*512)) + i3_49)], 255), 0)
         for ax1_46, ax2_47, ax3_49 in T.grid(14, 14, 512):
diff --git a/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py b/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py
index 49121614ffa0..344f37a23677 100644
--- a/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py
+++ b/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py
@@ -52,7 +52,7 @@ def buffer_opaque_access(b: T.handle, c: T.handle) -> None:
     with T.block():
         T.reads([])
         T.writes(B[0:16, 0:16])
-        A = T.allocate([256], "float32", "global")
+        A = T.decl_buffer([256], "float32")
         for i, j in T.grid(16, 16):
             A[i * 16 + j] = 1
         for i in range(0, 16):
diff --git a/tests/python/unittest/test_tir_ptx_mma.py b/tests/python/unittest/test_tir_ptx_mma.py
index 23405fdee98a..bee9b7b48020 100644
--- a/tests/python/unittest/test_tir_ptx_mma.py
+++ b/tests/python/unittest/test_tir_ptx_mma.py
@@ -36,9 +36,9 @@ def gemm_mma_m8n8k4_row_col_fp64pf64fp64(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([1], "float64", scope="local")
-    MultiB = T.allocate([1], "float64", scope="local")
-    Accum = T.allocate([2], "float64", scope="local")
+    MultiA = T.decl_buffer([1], "float64", scope="local")
+    MultiB = T.decl_buffer([1], "float64", scope="local")
+    Accum = T.decl_buffer([2], "float64", scope="local")
     for i in range(2):
         Accum[i] = T.float64(0)
 
@@ -106,9 +106,9 @@ def gemm_mma_m8n8k4_row_row_fp16fp16fp16(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([4], "float16", scope="local")
-    MultiB = T.allocate([4], "float16", scope="local")
-    Accum = T.allocate([8], "float16", scope="local")
+    MultiA = T.decl_buffer([4], "float16", scope="local")
+    MultiB = T.decl_buffer([4], "float16", scope="local")
+    Accum = T.decl_buffer([8], "float16", scope="local")
     for i in range(8):
         Accum[i] = T.float32(0)
 
@@ -187,9 +187,10 @@ def gemm_mma_m8n8k4_row_row_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([4], "float16", scope="local")
-    MultiB = T.allocate([4], "float16", scope="local")
-    Accum = T.allocate([8], "float32", scope="local")
+    MultiA = T.decl_buffer([4], "float16", scope="local")
+    MultiB = T.decl_buffer([4], "float16", scope="local")
+    Accum = T.decl_buffer([8], "float32", scope="local")
+
     for i in range(8):
         Accum[i] = T.float32(0)
 
@@ -274,9 +275,9 @@ def gemm_mma_m8n8k16_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([4], "int8", scope="local")
-    MultiB = T.allocate([4], "int8", scope="local")
-    Accum = T.allocate([2], "int32", scope="local")
+    MultiA = T.decl_buffer([4], "int8", scope="local")
+    MultiB = T.decl_buffer([4], "int8", scope="local")
+    Accum = T.decl_buffer([2], "int32", scope="local")
     for i in range(2):
         Accum[i] = T.int32(0)
 
@@ -350,9 +351,9 @@ def gemm_mma_m8n8k16_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([4], "int8", scope="local")
-    MultiB = T.allocate([4], "uint8", scope="local")
-    Accum = T.allocate([2], "int32", scope="local")
+    MultiA = T.decl_buffer([4], "int8", scope="local")
+    MultiB = T.decl_buffer([4], "uint8", scope="local")
+    Accum = T.decl_buffer([2], "int32", scope="local")
     for i in range(2):
         Accum[i] = T.int32(0)
 
@@ -426,9 +427,9 @@ def gemm_mma_m8n8k32_row_col_s4s4s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([8], "int4", scope="local")
-    MultiB = T.allocate([8], "int4", scope="local")
-    Accum = T.allocate([2], "int32", scope="local")
+    MultiA = T.decl_buffer([8], "int4", scope="local")
+    MultiB = T.decl_buffer([8], "int4", scope="local")
+    Accum = T.decl_buffer([2], "int32", scope="local")
     for i in range(2):
         Accum[i] = T.int32(0)
 
@@ -494,9 +495,9 @@ def gemm_mma_m8n8k32_row_col_s4u4s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([8], "int4", scope="local")
-    MultiB = T.allocate([8], "uint4", scope="local")
-    Accum = T.allocate([2], "int32", scope="local")
+    MultiA = T.decl_buffer([8], "int4", scope="local")
+    MultiB = T.decl_buffer([8], "uint4", scope="local")
+    Accum = T.decl_buffer([2], "int32", scope="local")
     for i in range(2):
         Accum[i] = T.int32(0)
 
@@ -562,9 +563,9 @@ def gemm_mma_m16n8k8_row_col_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle)
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([4], "float16", scope="local")
-    MultiB = T.allocate([2], "float16", scope="local")
-    Accum = T.allocate([4], "float32", scope="local")
+    MultiA = T.decl_buffer([4], "float16", scope="local")
+    MultiB = T.decl_buffer([2], "float16", scope="local")
+    Accum = T.decl_buffer([4], "float32", scope="local")
     for i in range(4):
         Accum[i] = T.float32(0)
 
@@ -640,9 +641,9 @@ def gemm_mma_m16n8k16_row_col_fp16fp16fp16(a: T.handle, b: T.handle, c: T.handle
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([8], "float16", scope="local")
-    MultiB = T.allocate([4], "float16", scope="local")
-    Accum = T.allocate([4], "float16", scope="local")
+    MultiA = T.decl_buffer([8], "float16", scope="local")
+    MultiB = T.decl_buffer([4], "float16", scope="local")
+    Accum = T.decl_buffer([4], "float16", scope="local")
     for i in range(4):
         Accum[i] = T.float32(0)
 
@@ -722,9 +723,9 @@ def gemm_mma_m16n8k16_row_col_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([8], "float16", scope="local")
-    MultiB = T.allocate([4], "float16", scope="local")
-    Accum = T.allocate([4], "float32", scope="local")
+    MultiA = T.decl_buffer([8], "float16", scope="local")
+    MultiB = T.decl_buffer([4], "float16", scope="local")
+    Accum = T.decl_buffer([4], "float32", scope="local")
     for i in range(4):
         Accum[i] = T.float32(0)
 
@@ -804,9 +805,9 @@ def gemm_mma_m16n8k16_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([8], "int8", scope="local")
-    MultiB = T.allocate([4], "int8", scope="local")
-    Accum = T.allocate([4], "int32", scope="local")
+    MultiA = T.decl_buffer([8], "int8", scope="local")
+    MultiB = T.decl_buffer([4], "int8", scope="local")
+    Accum = T.decl_buffer([4], "int32", scope="local")
     for i in range(4):
         Accum[i] = T.int32(0)
 
@@ -886,9 +887,9 @@ def gemm_mma_m16n8k16_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([8], "int8", scope="local")
-    MultiB = T.allocate([4], "uint8", scope="local")
-    Accum = T.allocate([4], "int32", scope="local")
+    MultiA = T.decl_buffer([8], "int8", scope="local")
+    MultiB = T.decl_buffer([4], "uint8", scope="local")
+    Accum = T.decl_buffer([4], "int32", scope="local")
     for i in range(4):
         Accum[i] = T.int32(0)
 
@@ -968,9 +969,9 @@ def gemm_mma_m16n8k32_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([16], "int8", scope="local")
-    MultiB = T.allocate([8], "int8", scope="local")
-    Accum = T.allocate([4], "int32", scope="local")
+    MultiA = T.decl_buffer([16], "int8", scope="local")
+    MultiB = T.decl_buffer([8], "int8", scope="local")
+    Accum = T.decl_buffer([4], "int32", scope="local")
     for i in range(4):
         Accum[i] = T.int32(0)
 
@@ -1050,9 +1051,9 @@ def gemm_mma_m16n8k32_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([16], "int8", scope="local")
-    MultiB = T.allocate([8], "uint8", scope="local")
-    Accum = T.allocate([4], "int32", scope="local")
+    MultiA = T.decl_buffer([16], "int8", scope="local")
+    MultiB = T.decl_buffer([8], "uint8", scope="local")
+    Accum = T.decl_buffer([4], "int32", scope="local")
     for i in range(4):
         Accum[i] = T.int32(0)
 
@@ -1132,9 +1133,9 @@ def gemm_mma_m16n8k64_row_col_s4s4s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([32], "int4", scope="local")
-    MultiB = T.allocate([16], "int4", scope="local")
-    Accum = T.allocate([4], "int32", scope="local")
+    MultiA = T.decl_buffer([32], "int4", scope="local")
+    MultiB = T.decl_buffer([16], "int4", scope="local")
+    Accum = T.decl_buffer([4], "int32", scope="local")
     for i in range(4):
         Accum[i] = T.int32(0)
 
@@ -1206,9 +1207,9 @@ def gemm_mma_m16n8k64_row_col_s4u4s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([32], "int4", scope="local")
-    MultiB = T.allocate([16], "uint4", scope="local")
-    Accum = T.allocate([4], "int32", scope="local")
+    MultiA = T.decl_buffer([32], "int4", scope="local")
+    MultiB = T.decl_buffer([16], "uint4", scope="local")
+    Accum = T.decl_buffer([4], "int32", scope="local")
     for i in range(4):
         Accum[i] = T.int32(0)
 
@@ -1280,9 +1281,9 @@ def gemm_mma_m16n8k256_row_col_b1b1s32(a: T.handle, b: T.handle, c: T.handle):
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    MultiA = T.allocate([128], "int1", scope="local")
-    MultiB = T.allocate([64], "int1", scope="local")
-    Accum = T.allocate([4], "int32", scope="local")
+    MultiA = T.decl_buffer([128], "int1", scope="local")
+    MultiB = T.decl_buffer([64], "int1", scope="local")
+    Accum = T.decl_buffer([4], "int32", scope="local")
     for i in range(4):
         Accum[i] = T.int32(0)
 
diff --git a/tests/python/unittest/test_tir_ptx_mma_sp.py b/tests/python/unittest/test_tir_ptx_mma_sp.py
index 321cd28ff6f7..24170b4898f9 100644
--- a/tests/python/unittest/test_tir_ptx_mma_sp.py
+++ b/tests/python/unittest/test_tir_ptx_mma_sp.py
@@ -52,10 +52,10 @@ def mma_sp_m16n8k16_f16f16f16(a: T.handle, b: T.handle, c: T.handle, _metadata:
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    multi_a = T.allocate([4], "float16", scope="local")
-    multi_b = T.allocate([4], "float16", scope="local")
-    accum = T.allocate([4], "float16", scope="local")
-    meta_local = T.allocate([1], "uint32", scope="local")
+    multi_a = T.decl_buffer([4], "float16", scope="local")
+    multi_b = T.decl_buffer([4], "float16", scope="local")
+    accum = T.decl_buffer([4], "float16", scope="local")
+    meta_local = T.decl_buffer([1], "uint32", scope="local")
     for i in range(4):
         accum[i] = T.float16(0)
 
@@ -106,10 +106,10 @@ def mma_sp_m16n8k16_f16f16f32(a: T.handle, b: T.handle, c: T.handle, _metadata:
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    multi_a = T.allocate([4], "float16", scope="local")
-    multi_b = T.allocate([4], "float16", scope="local")
-    accum = T.allocate([4], "float32", scope="local")
-    meta_local = T.allocate([1], "uint32", scope="local")
+    multi_a = T.decl_buffer([4], "float16", scope="local")
+    multi_b = T.decl_buffer([4], "float16", scope="local")
+    accum = T.decl_buffer([4], "float32", scope="local")
+    meta_local = T.decl_buffer([1], "uint32", scope="local")
     for i in range(4):
         accum[i] = T.float16(0)
 
@@ -160,10 +160,10 @@ def mma_sp_m16n8k32_f16f16f16(a: T.handle, b: T.handle, c: T.handle, _metadata:
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    multi_a = T.allocate([8], "float16", scope="local")
-    multi_b = T.allocate([8], "float16", scope="local")
-    accum = T.allocate([4], "float16", scope="local")
-    meta_local = T.allocate([1], "uint32", scope="local")
+    multi_a = T.decl_buffer([8], "float16", scope="local")
+    multi_b = T.decl_buffer([8], "float16", scope="local")
+    accum = T.decl_buffer([4], "float16", scope="local")
+    meta_local = T.decl_buffer([1], "uint32", scope="local")
     for i in range(4):
         accum[i] = T.float16(0)
 
@@ -214,10 +214,10 @@ def mma_sp_m16n8k32_f16f16f32(a: T.handle, b: T.handle, c: T.handle, _metadata:
     T.launch_thread(brow, 1)
     T.launch_thread(bcol, 1)
     T.launch_thread(tx, 32)
-    multi_a = T.allocate([8], "float16", scope="local")
-    multi_b = T.allocate([8], "float16", scope="local")
-    accum = T.allocate([4], "float32", scope="local")
-    meta_local = T.allocate([1], "uint32", scope="local")
+    multi_a = T.decl_buffer([8], "float16", scope="local")
+    multi_b = T.decl_buffer([8], "float16", scope="local")
+    accum = T.decl_buffer([4], "float32", scope="local")
+    meta_local = T.decl_buffer([1], "uint32", scope="local")
     for i in range(4):
         accum[i] = T.float16(0)
 
diff --git a/tests/python/unittest/test_tir_renew_defs.py b/tests/python/unittest/test_tir_renew_defs.py
index 36cc52c16935..28b440a608dc 100644
--- a/tests/python/unittest/test_tir_renew_defs.py
+++ b/tests/python/unittest/test_tir_renew_defs.py
@@ -135,7 +135,8 @@ def test_undefined_buffer():
     @T.prim_func
     def access_alloc():
         # Buffer A should be remapped
-        A = T.allocate([128], "float16", "global")
+        A_data = T.allocate([128], "float16", "global")
+        A = T.buffer_decl(shape=[128], dtype="float16", data=A_data)
         # check if buffer var also get remapped
         T.evaluate(A.data)
         for i in range(128):
diff --git a/tests/python/unittest/test_tir_structural_equal_hash.py b/tests/python/unittest/test_tir_structural_equal_hash.py
index d5feb21f0db7..4bb13ed77ad8 100644
--- a/tests/python/unittest/test_tir_structural_equal_hash.py
+++ b/tests/python/unittest/test_tir_structural_equal_hash.py
@@ -234,7 +234,7 @@ def test_buffer_storage_scope():
 
     buffer_local_0 = tvm.tir.decl_buffer((10, 10), "float32", scope="local")
     buffer_local_1 = tvm.tir.decl_buffer((10, 10), "float32", scope="local")
-    buffer_global = tvm.tir.decl_buffer((10, 10), "float32", scope="global")
+    buffer_global = tvm.tir.decl_buffer((10, 10), "float32")
     buffer_empty = tvm.tir.decl_buffer((10, 10), "float32", scope="")
 
     func0 = tvm.tir.PrimFunc([x], tvm.tir.Evaluate(x), buffer_map={x: buffer_local_0})
diff --git a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py
index 1a3afdd4c1e2..e08f04fa1f25 100644
--- a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py
+++ b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py
@@ -31,13 +31,13 @@ def fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(placeholder_30: T.
     placeholder_35 = T.match_buffer(placeholder_32, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1)
     T_cast_9 = T.match_buffer(T_cast_8, [12544], dtype="int16", elem_offset=0, align=64, offset_factor=1)
     # body
-    PaddedInput_3 = T.allocate([150528], "int16", "global")
+    PaddedInput_3 = T.decl_buffer([150528], "int16")
     for i0_i1_fused_3 in T.parallel(0, 28):
         for i2_3, i3_3 in T.grid(28, 192):
             PaddedInput_3[(((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3) ] = placeholder_33[(((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3)]
     for ax0_ax1_fused_ax2_fused_3 in T.parallel(0, 784):
         for ax3_2 in T.serial(0, 16):
-            Conv2dOutput_3 = T.allocate([1], "int32", "global")
+            Conv2dOutput_3 = T.decl_buffer([1], "int32")
             Conv2dOutput_3[0] = 0
             for rc_3 in T.serial(0, 192):
                 Conv2dOutput_3[0] = (Conv2dOutput_3[0] + (T.cast(PaddedInput_3[((ax0_ax1_fused_ax2_fused_3*192) + rc_3)], "int32")*T.cast(placeholder_34[((rc_3*16) + ax3_2)], "int32")))
diff --git a/tests/python/unittest/test_tir_transform_extract_constants.py b/tests/python/unittest/test_tir_transform_extract_constants.py
index 82f4f6515c09..5de06e38a557 100644
--- a/tests/python/unittest/test_tir_transform_extract_constants.py
+++ b/tests/python/unittest/test_tir_transform_extract_constants.py
@@ -27,7 +27,8 @@ class Module4:
     def constant1(a: T.handle) -> None:
         A = T.match_buffer(a, (10), "int32")
         B = T.alloc_buffer((10), "int32")
-        K = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+        K_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+        K = T.buffer_decl(shape=(10), dtype="int32", data=K_data)
         for x in T.serial(0, 10):
             B[x] = A[x] + K[x]
 
@@ -35,7 +36,8 @@ def constant1(a: T.handle) -> None:
     def constant2(a: T.handle) -> None:
         A = T.match_buffer(a, (10), "int32")
         B = T.alloc_buffer((10), "int32")
-        K = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+        K_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+        K = T.buffer_decl(shape=(10), dtype="int32", data=K_data)
         for x in T.serial(0, 10):
             B[x] = A[x] + K[x]
 
@@ -43,7 +45,8 @@ def constant2(a: T.handle) -> None:
     def constant3(a: T.handle) -> None:
         A = T.match_buffer(a, (10), "int32")
         B = T.alloc_buffer((10), "int32")
-        K = T.allocate_const([1, 2, 3, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+        K_data = T.allocate_const([1, 2, 3, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+        K = T.buffer_decl(shape=(10), dtype="int32", data=K_data)
         for x in T.serial(0, 10):
             B[x] = A[x] + K[x]
 
diff --git a/tests/python/unittest/test_tir_transform_flatten_buffer.py b/tests/python/unittest/test_tir_transform_flatten_buffer.py
index a1195a9d2a65..4cdf71889eee 100644
--- a/tests/python/unittest/test_tir_transform_flatten_buffer.py
+++ b/tests/python/unittest/test_tir_transform_flatten_buffer.py
@@ -33,7 +33,8 @@ def elementwise_func(a: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (16, 16), "float32")
     C = T.match_buffer(c, (16, 16), "float32")
     for i in T.serial(0, 16):
-        B_new = T.allocate([1, 16], "float32", "global")
+        B_new_data = T.allocate([1, 16], "float32", "global")
+        B_new = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_new_data)
         for j in T.serial(0, 16):
             B_new[0, j] = A[i, j] + 1.0
         for j in T.serial(0, 16):
@@ -47,7 +48,8 @@ def flattened_elementwise_func(a: T.handle, c: T.handle) -> None:
     T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
     T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)
     for i in T.serial(0, 16):
-        B_new = T.allocate([16], "float32", "global")
+        B_new_data = T.allocate([16], "float32", "global")
+        B_new = T.buffer_decl(shape=[16], dtype="float32", data=B_new_data)
         for j in T.serial(0, 16):
             B_new[j] = A[((i * 16) + j)] + 1.0
         for j in T.serial(0, 16):
@@ -66,7 +68,8 @@ def gpu_func(a: T.handle, c: T.handle) -> None:
     T.launch_thread(i0, 4)
     T.launch_thread(i1, 2)
     T.launch_thread(i2, 2)
-    B = T.allocate([1, 16], "float32", "local")
+    B_data = T.allocate([1, 16], "float32", "local")
+    B = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_data, scope="local")
     for j in range(0, 16):
         B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0
     for j in range(0, 16):
@@ -87,7 +90,8 @@ def flattened_gpu_func(a: T.handle, c: T.handle) -> None:
     T.launch_thread(i0, 4)
     T.launch_thread(i1, 2)
     T.launch_thread(i2, 2)
-    B = T.allocate([16], "float32", "local")
+    B_data = T.allocate([16], "float32", "local")
+    B = T.buffer_decl(shape=[16], dtype="float32", data=B_data, scope="local")
     for j in range(0, 16):
         B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + 1.0
     for j in range(0, 16):
@@ -100,7 +104,8 @@ def symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None:
     C = T.match_buffer(c, (n, m), "float32")
 
     for i in range(0, n):
-        B = T.allocate([m], "float32", "global")
+        B_data = T.allocate([m], "float32", "global")
+        B = T.buffer_decl(shape=[m], dtype="float32", data=B_data)
         for j in range(0, m):
             B[j] = A[i, j] + 1.0
         for j in range(0, m):
@@ -115,7 +120,8 @@ def flattened_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) ->
     T.preflattened_buffer(C, (n, m), "float32", data=C.data)
 
     for i in range(0, n):
-        B = T.allocate([m], "float32", "global")
+        B_data = T.allocate([m], "float32", "global")
+        B = T.buffer_decl(shape=[m], dtype="float32", data=B_data)
         for j in range(0, m):
             B[j] = A[i * m + j] + 1.0
         for j in range(0, m):
@@ -128,8 +134,10 @@ def multi_alloc_func(a: T.handle, d: T.handle) -> None:
     D = T.match_buffer(d, (4, 32), "float32")
 
     for i, j in T.grid(4, 32):
-        B = T.allocate((4, 32), "float32", scope="global")
-        C = T.allocate((4, 32), "float32", scope="global")
+        B_data = T.allocate((4, 32), "float32", scope="global")
+        B = T.buffer_decl(shape=(4, 32), dtype="float32", data=B_data)
+        C_data = T.allocate((4, 32), "float32", scope="global")
+        C = T.buffer_decl(shape=(4, 32), dtype="float32", data=C_data)
         B[i, j] = A[i, j] + 1.0
         C[i, j] = A[i, j] + B[i, j]
         D[i, j] = C[i, j] * 2.0
@@ -143,8 +151,10 @@ def flattened_multi_alloc_func(a: T.handle, d: T.handle) -> None:
     T.preflattened_buffer(D, (4, 32), "float32", data=D.data)
 
     for i, j in T.grid(4, 32):
-        B = T.allocate([128], "float32", "global")
-        C = T.allocate([128], "float32", "global")
+        B_data = T.allocate([128], "float32", "global")
+        B = T.buffer_decl(shape=[128], dtype="float32", data=B_data)
+        C_data = T.allocate([128], "float32", "global")
+        C = T.buffer_decl(shape=[128], dtype="float32", data=C_data)
         B[i * 32 + j] = A[i * 32 + j] + 1.0
         C[i * 32 + j] = A[i * 32 + j] + B[i * 32 + j]
         D[i * 32 + j] = C[i * 32 + j] * 2.0
@@ -155,7 +165,8 @@ def strided_buffer_func(a: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (16, 16), "float32")
     C = T.match_buffer(c, (16, 16), "float32")
     for i0 in T.serial(4):
-        B = T.allocate([4, 17], "float32", "global")
+        B_data = T.allocate([4, 17], "float32", "global")
+        B = T.buffer_decl(shape=[4, 17], dtype="float32", data=B_data)
         B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1])
         for i1, j in T.grid(4, 16):
             B_1[i1, j] = A[i0 * 4 + i1, j] + 1.0
@@ -170,7 +181,8 @@ def flattened_strided_buffer_func(a: T.handle, c: T.handle) -> None:
     T.preflattened_buffer(A, [16, 16], dtype="float32", data=A.data)
     T.preflattened_buffer(C, [16, 16], dtype="float32", data=C.data)
     for i0 in T.serial(0, 4):
-        B_new = T.allocate([68], "float32", "global")
+        B_new_data = T.allocate([68], "float32", "global")
+        B_new = T.buffer_decl(shape=[68], dtype="float32", data=B_new_data)
         for i1 in T.serial(0, 4):
             for j in T.serial(0, 16):
                 B_new[i1 * 17 + j] = A[i0 * 64 + i1 * 16 + j] + 1.0
diff --git a/tests/python/unittest/test_tir_transform_inject_virtual_thread.py b/tests/python/unittest/test_tir_transform_inject_virtual_thread.py
index b96afb6a0941..548f3bc8d1d2 100644
--- a/tests/python/unittest/test_tir_transform_inject_virtual_thread.py
+++ b/tests/python/unittest/test_tir_transform_inject_virtual_thread.py
@@ -145,12 +145,14 @@ def test_vthread_simplified():
     def before_func():
         vthread = T.env_thread("vthread")
         T.launch_thread(vthread, 4)
-        B = T.allocate([4], "int32", "shared")
+        B_data = T.allocate([4], "int32", scope="shared")
+        B = T.buffer_decl([4], "int32", data=B_data, scope="shared")
         B[0:4] = T.broadcast(vthread, 4)
 
     @T.prim_func
     def expected_func():
-        B = T.allocate([16], "int32", "shared")
+        B_data = T.allocate([16], "int32", scope="shared")
+        B = T.buffer_decl([16], "int32", data=B_data, scope="shared")
         # The indices for B should each be a single Ramp node, and
         # should not be the sum of a Ramp and Broadcast node.
         B[0 * 4 : 0 * 4 + 4] = T.broadcast(0, 4)
@@ -172,12 +174,14 @@ def test_vthread_vectorized():
     def before_func():
         vthread = T.env_thread("vthread")
         T.launch_thread(vthread, 4)
-        B = T.allocate([4], "int32", "shared")
+        B_data = T.allocate([4], "int32", "shared")
+        B = T.buffer_decl([4], "int32", data=B_data, scope="shared")
         B[0:4] = T.broadcast(vthread, 4)
 
     @T.prim_func
     def expected_func():
-        B = T.allocate([4], "int32x4", "shared")
+        B_data = T.allocate([4], "int32x4", "shared")
+        B = T.buffer_decl([4], "int32x4", data=B_data, scope="shared")
         B[0 * 4 / 4] = T.broadcast(0, 4)
         B[1 * 4 / 4] = T.broadcast(1, 4)
         B[2 * 4 / 4] = T.broadcast(2, 4)
diff --git a/tests/python/unittest/test_tir_transform_lower_opaque_block.py b/tests/python/unittest/test_tir_transform_lower_opaque_block.py
index 6f557ba09d43..f8f3e3a5aced 100644
--- a/tests/python/unittest/test_tir_transform_lower_opaque_block.py
+++ b/tests/python/unittest/test_tir_transform_lower_opaque_block.py
@@ -54,7 +54,8 @@ def transformed_elementwise_func(a: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (16, 16), "float32")
     C = T.match_buffer(c, (16, 16), "float32")
     for i in T.serial(0, 16):
-        B_new = T.allocate([1, 16], "float32", "global")
+        B_new_data = T.allocate([1, 16], "float32", "global")
+        B_new = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_new_data)
         for j in T.serial(0, 16):
             B_new[0, j] = A[i, j] + 1.0
         for j in T.serial(0, 16):
@@ -96,7 +97,8 @@ def transformed_gpu_func(a: T.handle, c: T.handle) -> None:
     T.launch_thread(i0, 4)
     T.launch_thread(i1, 2)
     T.launch_thread(i2, 2)
-    B = T.allocate([1, 16], "float32", "local")
+    B_data = T.allocate([1, 16], "float32", "local")
+    B = T.buffer_decl(shape=[1, 16], dtype="float32", scope="local", data=B_data)
     for j in range(0, 16):
         B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0
     for j in range(0, 16):
@@ -131,7 +133,8 @@ def transformed_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32)
     C = T.match_buffer(c, (n, m), "float32")
 
     for i in range(0, n):
-        B = T.allocate([m], "float32", "global")
+        B_data = T.allocate([m], "float32", "global")
+        B = T.buffer_decl(shape=[m], dtype="float32", data=B_data)
         for j in range(0, m):
             B[j] = A[i, j] + 1.0
         for j in range(0, m):
@@ -204,8 +207,10 @@ def transformed_multi_alloc_func(a: T.handle, d: T.handle) -> None:
     D = T.match_buffer(d, (32), "float32")
 
     for i in range(0, 32):
-        B = T.allocate((32,), "float32", "global")
-        C = T.allocate((32,), "float32", "global")
+        B_data = T.allocate((32,), "float32", "global")
+        B = T.buffer_decl(shape=(32,), dtype="float32", data=B_data)
+        C_data = T.allocate((32,), "float32", "global")
+        C = T.buffer_decl(shape=(32,), dtype="float32", data=C_data)
         B[i] = A[i] + 1.0
         C[i] = A[i] + B[i]
         D[i] = C[i] * 2.0
@@ -240,7 +245,8 @@ def transformed_strided_buffer_func(
 ) -> None:
     # body
     for i0 in T.serial(4):
-        B = T.allocate([4, 17], "float32", "global")
+        B_data = T.allocate([4, 17], "float32", "global")
+        B = T.buffer_decl(shape=[4, 17], dtype="float32", data=B_data)
         B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1])
         for i1, j in T.grid(4, 16):
             B_1[i1, j] = A[i0 * 4 + i1, j] + T.float32(1)
diff --git a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py
index fd08f7e2249a..bfa132d4cecf 100644
--- a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py
+++ b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py
@@ -36,9 +36,9 @@ def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo
         blockIdx_x = T.env_thread("blockIdx.x")
         # body
         T.launch_thread(blockIdx_x, 64)
-        conv2d_transpose_nhwc_local = T.allocate([8], "float32", "local")
-        PadInput_shared = T.allocate([768], "float32", "shared")
-        weight_shared = T.allocate([4096], "float32", "shared")
+        conv2d_transpose_nhwc_local = T.decl_buffer([8], "float32", scope="local")
+        PadInput_shared = T.decl_buffer([768], "float32", scope="shared")
+        weight_shared = T.decl_buffer([4096], "float32", scope="shared")
         T.launch_thread(threadIdx_x, 32)
         for i2_3_init, i1_4_init, i2_4_init in T.grid(2, 2, 2):
             conv2d_transpose_nhwc_local[i1_4_init * 4 + i2_3_init * 2 + i2_4_init] = T.float32(0)
@@ -67,9 +67,9 @@ def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo
         blockIdx_x = T.env_thread("blockIdx.x")
         # body
         T.launch_thread(blockIdx_x, 64)
-        conv2d_transpose_nhwc_local = T.allocate([8], "float32", "local")
-        PadInput_shared = T.allocate([768], "float32", "shared")
-        weight_shared = T.allocate([4096], "float32", "shared")
+        conv2d_transpose_nhwc_local = T.decl_buffer([8], "float32", scope="local")
+        PadInput_shared = T.decl_buffer([768], "float32", scope="shared")
+        weight_shared = T.decl_buffer([4096], "float32", scope="shared")
         T.launch_thread(threadIdx_x, 32)
         for i2_3_init, i1_4_init, i2_4_init in T.grid(2, 2, 2):
             conv2d_transpose_nhwc_local[i1_4_init * 4 + i2_3_init * 2 + i2_4_init] = T.float32(0)
@@ -98,9 +98,9 @@ def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo
         T.preflattened_buffer(conv2d_transpose_nhwc, [1, 8, 8, 256], dtype="float32", data=conv2d_transpose_nhwc.data)
         # body
         T.launch_thread(blockIdx_x, 64)
-        conv2d_transpose_nhwc_local = T.allocate([8], "float32", "local")
-        PadInput_shared = T.allocate([768], "float32", "shared")
-        weight_shared = T.allocate([4096], "float32", "shared")
+        conv2d_transpose_nhwc_local = T.decl_buffer([8], "float32", scope="local")
+        PadInput_shared = T.decl_buffer([768], "float32", scope="shared")
+        weight_shared = T.decl_buffer([4096], "float32", scope="shared")
         T.launch_thread(threadIdx_x, 32)
         for i2_3_init, i1_4_init, i2_4_init in T.grid(2, 2, 2):
             conv2d_transpose_nhwc_local[i1_4_init * 4 + i2_3_init * 2 + i2_4_init] = T.float32(0)
diff --git a/tests/python/unittest/test_tir_transform_storage_flatten.py b/tests/python/unittest/test_tir_transform_storage_flatten.py
index ff59f10c0168..95e2eaed55fa 100644
--- a/tests/python/unittest/test_tir_transform_storage_flatten.py
+++ b/tests/python/unittest/test_tir_transform_storage_flatten.py
@@ -95,7 +95,7 @@ def main(A_param: T.handle, C_param: T.handle):
             threadIdx_x = T.env_thread("threadIdx.x")
             T.launch_thread(threadIdx_x, 1)
             for i in T.serial(0, 100):
-                B = T.allocate([4], "float32", scope="shared")
+                B = T.decl_buffer([4], "float32", scope="shared")
                 with T.attr(B.data, "double_buffer_scope", 1):
                     for j in T.serial(0, 4):
                         B[j] = A[4 * i + j]
@@ -142,7 +142,7 @@ def main():
             A_data: T.Ptr[T.int32] = T.call_extern("dummy_extern_function", dtype="handle")
 
             # and a buffer is backed by that pointer,
-            A = T.buffer_decl([1], dtype="float32", data=A_data)
+            A = T.decl_buffer([1], dtype="float32", data=A_data)
             T.evaluate(A[0])
 
     # then the call to StorageFlatten would result in an exception
diff --git a/tests/python/unittest/test_tir_transform_storage_rewrite.py b/tests/python/unittest/test_tir_transform_storage_rewrite.py
index b7cb75594997..581afef88942 100644
--- a/tests/python/unittest/test_tir_transform_storage_rewrite.py
+++ b/tests/python/unittest/test_tir_transform_storage_rewrite.py
@@ -654,14 +654,16 @@ def test_access_in_let_value():
     @T.prim_func
     def func(A: T.Buffer[(8,), "float32"]):
         for i in range(8):
-            B = T.allocate((1,), "float32", "global")
+            B_data = T.allocate((1,), "float32", "global")
+            B = T.buffer_decl(shape=[1], dtype="float32", data=B_data)
             B[0] = 3.14
             x: T.float32 = T.exp(B[0], dtype="float32")
             A[i] = (x + 1.0) / (x - 1.0)
 
     @T.prim_func
     def func_rewritten(A: T.Buffer[(8,), "float32"]) -> None:
-        B = T.allocate((1,), "float32", "global")
+        B_data = T.allocate((1,), "float32", "global")
+        B = T.buffer_decl(shape=[1], dtype="float32", data=B_data)
         for i in range(8):
             B[0] = 3.14
             x: T.float32 = T.exp(B[0], dtype="float32")
diff --git a/tests/python/unittest/test_tir_transform_unroll_loop.py b/tests/python/unittest/test_tir_transform_unroll_loop.py
index 6dba694e45ac..3a638ba45122 100644
--- a/tests/python/unittest/test_tir_transform_unroll_loop.py
+++ b/tests/python/unittest/test_tir_transform_unroll_loop.py
@@ -117,16 +117,19 @@ class before:
         @T.prim_func
         def main():
             for i in T.unroll(2):
-                with T.allocate([16], "float32", "global") as buf:
+                with T.allocate([16], "float32", "global") as buf_data:
+                    buf = T.buffer_decl(shape=[16], dtype="float32", data=buf_data)
                     buf[0] = 0.0
 
     @tvm.script.ir_module
     class expected:
         @T.prim_func
         def main():
-            with T.allocate([16], "float32", "global") as buf1:
+            with T.allocate([16], "float32", "global") as buf1_data:
+                buf1 = T.buffer_decl(shape=[16], dtype="float32", data=buf1_data)
                 buf1[0] = 0.0
-            with T.allocate([16], "float32", "global") as buf2:
+            with T.allocate([16], "float32", "global") as buf2_data:
+                buf2 = T.buffer_decl(shape=[16], dtype="float32", data=buf2_data)
                 buf2[0] = 0.0
 
     after = tvm.tir.transform.UnrollLoop()(before)
diff --git a/tests/python/unittest/test_tir_usmp_algo.py b/tests/python/unittest/test_tir_usmp_algo.py
index f67148189d8c..265e6fe5d5d5 100644
--- a/tests/python/unittest/test_tir_usmp_algo.py
+++ b/tests/python/unittest/test_tir_usmp_algo.py
@@ -316,12 +316,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
         placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_21 = T.match_buffer(T_cast_20, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_7 = T.allocate([157323], "int16", "global")
+        PaddedInput_7 = T.decl_buffer([157323], "int16")
         for i0_i1_fused_7 in T.serial(0, 229):
             for i2_7, i3_7 in T.grid(229, 3):
                 PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544):
-            Conv2dOutput_7 = T.allocate([64], "int32", "global")
+            Conv2dOutput_7 = T.decl_buffer([64], "int32")
             for ff_3 in T.serial(0, 64):
                 Conv2dOutput_7[ff_3] = 0
                 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3):
@@ -336,7 +336,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6:
         placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
-        tensor_2 = T.allocate([200704], "uint8", "global")
+        tensor_2 = T.decl_buffer([200704], "uint8")
         for ax0_ax1_fused_4 in T.serial(0, 56):
             for ax2_4 in T.serial(0, 56):
                 for ax3_init in T.serial(0, 64):
@@ -356,9 +356,9 @@ def run_model(input: T.handle, output: T.handle) -> None:
         T.attr("default", "device_type", 1)
         sid_9 = T.allocate([301056], "int8", "global")
         sid_8 = T.allocate([802816], "int8", "global")
-        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8.data, output, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8, output, dtype="int32"))
     __tvm_meta__ = None
 # fmt: on
 
@@ -436,11 +436,11 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla
         placeholder_15 = T.match_buffer(placeholder_12, [64], dtype="int32")
         T_cast_5 = T.match_buffer(T_cast_4, [360000], dtype="int16")
         # body
-        PaddedInput_1 = T.allocate([379456], "int16", "global")
+        PaddedInput_1 = T.decl_buffer([379456], "int16")
         for i0_i1_fused_1, i2_1, i3_1 in T.grid(77, 77, 64):
             PaddedInput_1[i0_i1_fused_1 * 4928 + i2_1 * 64 + i3_1] = T.if_then_else(1 <= i0_i1_fused_1 and i0_i1_fused_1 < 76 and 1 <= i2_1 and i2_1 < 76, placeholder_13[i0_i1_fused_1 * 4800 + i2_1 * 64 + i3_1 - 4864], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 5625):
-            Conv2dOutput_1 = T.allocate([64], "int32", "global")
+            Conv2dOutput_1 = T.decl_buffer([64], "int32")
             for ff_1 in T.serial(0, 64):
                 Conv2dOutput_1[ff_1] = 0
                 for ry, rx, rc_1 in T.grid(3, 3, 64):
@@ -457,11 +457,11 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s
         placeholder_21 = T.match_buffer(placeholder_18, [256], dtype="int32")
         T_add_1 = T.match_buffer(T_add, [1440000], dtype="int32")
         # body
-        PaddedInput_2 = T.allocate([360000], "int16", "global")
+        PaddedInput_2 = T.decl_buffer([360000], "int16")
         for i0_i1_fused_2, i2_2, i3_2 in T.grid(75, 75, 64):
             PaddedInput_2[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2] = placeholder_19[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2]
         for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 5625):
-            Conv2dOutput_2 = T.allocate([64], "int32", "global")
+            Conv2dOutput_2 = T.decl_buffer([64], "int32")
             for ax3_outer_1 in T.serial(0, 4):
                 for ff_2 in T.serial(0, 64):
                     Conv2dOutput_2[ff_2] = 0
@@ -480,11 +480,11 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s
         placeholder_28 = T.match_buffer(placeholder_25, [1440000], dtype="int32")
         T_cast_7 = T.match_buffer(T_cast_6, [1440000], dtype="uint8")
         # body
-        PaddedInput_3 = T.allocate([360000], "int16", "global")
+        PaddedInput_3 = T.decl_buffer([360000], "int16")
         for i0_i1_fused_3, i2_3, i3_3 in T.grid(75, 75, 64):
             PaddedInput_3[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3] = placeholder_29[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3]
         for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 5625):
-            Conv2dOutput_3 = T.allocate([64], "int32", "global")
+            Conv2dOutput_3 = T.decl_buffer([64], "int32")
             for ax3_outer_2 in T.serial(0, 4):
                 for ff_3 in T.serial(0, 64):
                     Conv2dOutput_3[ff_3] = 0
@@ -504,11 +504,11 @@ def tvmgen_default_run_model(input: T.handle, output: T.handle) -> None:
         sid_6 = T.allocate([5760000], "int8", "global")
         sid_7 = T.allocate([720000], "int8", "global")
         sid_8 = T.allocate([720000], "int8", "global")
-        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast", input, T.lookup_param("p0", dtype="handle"), sid_2.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_2.data, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_8.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_8.data, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_7.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_", sid_7.data, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_6.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_", sid_2.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_6.data, output, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast", input, T.lookup_param("p0", dtype="handle"), sid_2, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_2, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_8, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_8, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_7, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_", sid_7, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_6, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_", sid_2, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_6, output, dtype="int32"))
 
     @T.prim_func
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(placeholder_4: T.handle, placeholder_5: T.handle, placeholder_6: T.handle, T_cast_2: T.handle) -> None:
@@ -519,11 +519,11 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place
         placeholder_9 = T.match_buffer(placeholder_6, [64], dtype="int32")
         T_cast_3 = T.match_buffer(T_cast_2, [360000], dtype="int16")
         # body
-        PaddedInput = T.allocate([360000], "int16", "global")
+        PaddedInput = T.decl_buffer([360000], "int16")
         for i0_i1_fused, i2, i3 in T.grid(75, 75, 64):
             PaddedInput[i0_i1_fused * 4800 + i2 * 64 + i3] = placeholder_7[i0_i1_fused * 4800 + i2 * 64 + i3]
         for ax0_ax1_fused_ax2_fused in T.serial(0, 5625):
-            Conv2dOutput = T.allocate([64], "int32", "global")
+            Conv2dOutput = T.decl_buffer([64], "int32")
             for ff in T.serial(0, 64):
                 Conv2dOutput[ff] = 0
                 for rc in T.serial(0, 64):
diff --git a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py
index 60360ecade70..52880e40cbee 100644
--- a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py
+++ b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py
@@ -128,12 +128,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
         placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_7 = T.allocate([157323], "int16", "global")
+        PaddedInput_7 = T.decl_buffer([157323], "int16")
         for i0_i1_fused_7 in T.serial(0, 229):
             for i2_7, i3_7 in T.grid(229, 3):
                 PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544):
-            Conv2dOutput_7 = T.allocate([64], "int32", "global")
+            Conv2dOutput_7 = T.decl_buffer([64], "int32")
             for ff_3 in T.serial(0, 64):
                 Conv2dOutput_7[ff_3] = 0
                 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3):
@@ -148,7 +148,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6:
         placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
-        tensor_2 = T.allocate([200704], "uint8", "global")
+        tensor_2 = T.decl_buffer([200704], "uint8")
         for ax0_ax1_fused_4 in T.serial(0, 56):
             for ax2_4 in T.serial(0, 56):
                 for ax3_init in T.serial(0, 64):
@@ -168,9 +168,9 @@ def run_model(input: T.handle, output: T.handle) -> None:
         T.attr("default", "device_type", 1)
         sid_9 = T.allocate([301056], "int8", "global")
         sid_8 = T.allocate([802816], "int8", "global")
-        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8.data, output, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8, output, dtype="int32"))
     __tvm_meta__ = None
 # fmt: on
 
@@ -220,14 +220,14 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placehol
         placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_8 = T.allocate([215296], "int16", "global")
+        PaddedInput_8 = T.decl_buffer([215296], "int16")
         for i0_i1_fused_8 in T.serial(0, 58):
             for i2_8, i3_8 in T.grid(58, 64):
                 PaddedInput_8[(((i0_i1_fused_8*3712) + (i2_8*64)) + i3_8)] = T.if_then_else(((((1 <= i0_i1_fused_8) and (i0_i1_fused_8 < 57)) and (1 <= i2_8)) and (i2_8 < 57)), placeholder_71[((((i0_i1_fused_8*3584) + (i2_8*64)) + i3_8) - 3648)], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_8 in T.parallel(0, 3136):
-            dummy_allocate = T.allocate([1], "int32", "global")
+            dummy_allocate = T.decl_buffer([1], "int32")
             for ax3_outer_4 in T.serial(0, 3):
-                Conv2dOutput_8 = T.allocate([64], "int32", "global")
+                Conv2dOutput_8 = T.decl_buffer([64], "int32")
                 for ff_4 in T.serial(0, 64):
                     Conv2dOutput_8[ff_4] = 0
                     for ry_3, rx_3, rc_8 in T.grid(3, 3, 64):
@@ -261,14 +261,14 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placehol
         placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_8 = T.allocate([215296], "int16", "global")
+        PaddedInput_8 = T.decl_buffer([215296], "int16")
         for i0_i1_fused_8 in T.serial(0, 58):
             for i2_8, i3_8 in T.grid(58, 64):
                 PaddedInput_8[(((i0_i1_fused_8*3712) + (i2_8*64)) + i3_8)] = T.if_then_else(((((1 <= i0_i1_fused_8) and (i0_i1_fused_8 < 57)) and (1 <= i2_8)) and (i2_8 < 57)), placeholder_71[((((i0_i1_fused_8*3584) + (i2_8*64)) + i3_8) - 3648)], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_8 in T.serial(0, 3136):
-            dummy_allocate = T.allocate([1], "int32", "global")
+            dummy_allocate = T.decl_buffer([1], "int32")
             for ax3_outer_4 in T.serial(0, 3):
-                Conv2dOutput_8 = T.allocate([64], "int32", "global")
+                Conv2dOutput_8 = T.decl_buffer([64], "int32")
                 for ff_4 in T.serial(0, 64):
                     Conv2dOutput_8[ff_4] = 0
                     for ry_3, rx_3, rc_8 in T.grid(3, 3, 64):
@@ -394,12 +394,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place
         placeholder_21 = T.match_buffer(placeholder_18, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_3 = T.match_buffer(T_cast_2, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput = T.allocate([200704], "int16", "global")
+        PaddedInput = T.decl_buffer([200704], "int16")
         for i0_i1_fused in T.serial(0, 56):
             for i2, i3 in T.grid(56, 64):
                 PaddedInput[(((i0_i1_fused*3584) + (i2*64)) + i3)] = placeholder_19[(((i0_i1_fused*3584) + (i2*64)) + i3)]
         for ax0_ax1_fused_ax2_fused in T.serial(0, 3136):
-            Conv2dOutput = T.allocate([64], "int32", "global")
+            Conv2dOutput = T.decl_buffer([64], "int32")
             for ff in T.serial(0, 64):
                 Conv2dOutput[ff] = 0
                 for rc in T.serial(0, 64):
@@ -416,12 +416,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla
         placeholder_27 = T.match_buffer(placeholder_24, [96], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_5 = T.match_buffer(T_cast_4, [153], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_1 = T.allocate([150528], "int16", "global")
+        PaddedInput_1 = T.decl_buffer([150528], "int16")
         for i0_i1_fused_1 in T.serial(0, 28):
             for i2_1, i3_1 in T.grid(28, 192):
                 PaddedInput_1[(((i0_i1_fused_1*5376) + (i2_1*192)) + i3_1)] = placeholder_25[(((i0_i1_fused_1*5376) + (i2_1*192)) + i3_1)]
         for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 784):
-            Conv2dOutput_1 = T.allocate([1], "int32", "global")
+            Conv2dOutput_1 = T.decl_buffer([1], "int32")
             for ax3_1 in T.serial(0, 96):
                 Conv2dOutput_1[0] = 0
                 for rc_1 in T.serial(0, 192):
@@ -435,7 +435,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6:
         placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
-        tensor_2 = T.allocate([200704], "uint8", "global")
+        tensor_2 = T.decl_buffer([200704], "uint8")
         for ax0_ax1_fused_4 in T.serial(0, 56):
             for ax2_4 in T.serial(0, 56):
                 for ax3_init in T.serial(0, 64):
@@ -455,12 +455,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2(placehol
         placeholder_35 = T.match_buffer(placeholder_32, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_9 = T.match_buffer(T_cast_8, [121], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_2 = T.allocate([150528], "int16", "global")
+        PaddedInput_2 = T.decl_buffer([150528], "int16")
         for i0_i1_fused_2 in T.serial(0, 28):
             for i2_2, i3_2 in T.grid(28, 192):
                 PaddedInput_2[(((i0_i1_fused_2*5376) + (i2_2*192)) + i3_2)] = placeholder_33[(((i0_i1_fused_2*5376) + (i2_2*192)) + i3_2)]
         for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 784):
-            Conv2dOutput_2 = T.allocate([64], "int32", "global")
+            Conv2dOutput_2 = T.decl_buffer([64], "int32")
             for ff_1 in T.serial(0, 64):
                 Conv2dOutput_2[ff_1] = 0
                 for rc_2 in T.serial(0, 192):
@@ -475,7 +475,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast_1(placeholder_36: T.handle, T_cast_1
         placeholder_37 = T.match_buffer(placeholder_36, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         T_cast_11 = T.match_buffer(T_cast_10, [249], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
-        tensor_3 = T.allocate([150528], "uint8", "global")
+        tensor_3 = T.decl_buffer([150528], "uint8")
         for ax0_ax1_fused_6 in T.serial(0, 28):
             for ax2_6 in T.serial(0, 28):
                 for ax3_outer_init_1, ax3_inner_init_1 in T.grid(3, 64):
@@ -495,12 +495,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed
         placeholder_43 = T.match_buffer(placeholder_40, [32], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_13 = T.match_buffer(T_cast_12, [89], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_3 = T.allocate([150528], "int16", "global")
+        PaddedInput_3 = T.decl_buffer([150528], "int16")
         for i0_i1_fused_3 in T.serial(0, 28):
             for i2_3, i3_3 in T.grid(28, 192):
                 PaddedInput_3[(((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3)] = placeholder_41[(((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3)]
         for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 784):
-            Conv2dOutput_3 = T.allocate([1], "int32", "global")
+            Conv2dOutput_3 = T.decl_buffer([1], "int32")
             for ax3_5 in T.serial(0, 32):
                 Conv2dOutput_3[0] = 0
                 for rc_3 in T.serial(0, 192):
@@ -516,12 +516,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(pla
         placeholder_49 = T.match_buffer(placeholder_46, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_15 = T.match_buffer(T_cast_14, [73], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_4 = T.allocate([150528], "int16", "global")
+        PaddedInput_4 = T.decl_buffer([150528], "int16")
         for i0_i1_fused_4 in T.serial(0, 28):
             for i2_4, i3_4 in T.grid(28, 192):
                 PaddedInput_4[(((i0_i1_fused_4*5376) + (i2_4*192)) + i3_4)] = placeholder_47[(((i0_i1_fused_4*5376) + (i2_4*192)) + i3_4)]
         for ax0_ax1_fused_ax2_fused_4 in T.serial(0, 784):
-            Conv2dOutput_4 = T.allocate([1], "int32", "global")
+            Conv2dOutput_4 = T.decl_buffer([1], "int32")
             for ax3_6 in T.serial(0, 16):
                 Conv2dOutput_4[0] = 0
                 for rc_4 in T.serial(0, 192):
@@ -537,12 +537,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed
         placeholder_55 = T.match_buffer(placeholder_52, [32], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_17 = T.match_buffer(T_cast_16, [89], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_5 = T.allocate([14400], "int16", "global")
+        PaddedInput_5 = T.decl_buffer([14400], "int16")
         for i0_i1_fused_5 in T.serial(0, 30):
             for i2_5, i3_5 in T.grid(30, 16):
                 PaddedInput_5[(((i0_i1_fused_5*480) + (i2_5*16)) + i3_5)] = T.if_then_else(((((1 <= i0_i1_fused_5) and (i0_i1_fused_5 < 29)) and (1 <= i2_5)) and (i2_5 < 29)), placeholder_53[((((i0_i1_fused_5*448) + (i2_5*16)) + i3_5) - 464)], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_5 in T.serial(0, 784):
-            Conv2dOutput_5 = T.allocate([1], "int32", "global")
+            Conv2dOutput_5 = T.decl_buffer([1], "int32")
             for ax3_7 in T.serial(0, 32):
                 Conv2dOutput_5[0] = 0
                 for ry, rx, rc_5 in T.grid(3, 3, 16):
@@ -558,12 +558,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed
         placeholder_61 = T.match_buffer(placeholder_58, [128], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_19 = T.match_buffer(T_cast_18, [185], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_6 = T.allocate([86400], "int16", "global")
+        PaddedInput_6 = T.decl_buffer([86400], "int16")
         for i0_i1_fused_6 in T.serial(0, 30):
             for i2_6, i3_6 in T.grid(30, 96):
                 PaddedInput_6[(((i0_i1_fused_6*2880) + (i2_6*96)) + i3_6)] = T.if_then_else(((((1 <= i0_i1_fused_6) and (i0_i1_fused_6 < 29)) and (1 <= i2_6)) and (i2_6 < 29)), placeholder_59[((((i0_i1_fused_6*2688) + (i2_6*96)) + i3_6) - 2784)], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_6 in T.serial(0, 784):
-            Conv2dOutput_6 = T.allocate([64], "int32", "global")
+            Conv2dOutput_6 = T.decl_buffer([64], "int32")
             for ax3_outer_3 in T.serial(0, 2):
                 for ff_2 in T.serial(0, 64):
                     Conv2dOutput_6[ff_2] = 0
@@ -581,12 +581,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
         placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_7 = T.allocate([157323], "int16", "global")
+        PaddedInput_7 = T.decl_buffer([157323], "int16")
         for i0_i1_fused_7 in T.serial(0, 229):
             for i2_7, i3_7 in T.grid(229, 3):
                 PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544):
-            Conv2dOutput_7 = T.allocate([64], "int32", "global")
+            Conv2dOutput_7 = T.decl_buffer([64], "int32")
             for ff_3 in T.serial(0, 64):
                 Conv2dOutput_7[ff_3] = 0
                 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3):
@@ -603,12 +603,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placehol
         placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_8 = T.allocate([215296], "int16", "global")
+        PaddedInput_8 = T.decl_buffer([215296], "int16")
         for i0_i1_fused_8 in T.serial(0, 58):
             for i2_8, i3_8 in T.grid(58, 64):
                 PaddedInput_8[(((i0_i1_fused_8*3712) + (i2_8*64)) + i3_8)] = T.if_then_else(((((1 <= i0_i1_fused_8) and (i0_i1_fused_8 < 57)) and (1 <= i2_8)) and (i2_8 < 57)), placeholder_71[((((i0_i1_fused_8*3584) + (i2_8*64)) + i3_8) - 3648)], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_8 in T.serial(0, 3136):
-            Conv2dOutput_8 = T.allocate([64], "int32", "global")
+            Conv2dOutput_8 = T.decl_buffer([64], "int32")
             for ax3_outer_4 in T.serial(0, 3):
                 for ff_4 in T.serial(0, 64):
                     Conv2dOutput_8[ff_4] = 0
@@ -638,21 +638,21 @@ def run_model(input: T.handle, output: T.handle) -> None:
         sid_25 = T.allocate([25088], "int8", "global")
         sid_26 = T.allocate([25088], "int8", "global")
         sid_31 = T.allocate([25088], "int8", "global")
-        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8.data, sid_7.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_7.data, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_6.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", sid_6.data, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_5.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d", sid_5.data, sid_4.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_cast", sid_4.data, sid_3.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2", sid_3.data, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_2.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_3.data, T.lookup_param("p9", dtype="handle"), T.lookup_param("p10", dtype="handle"), sid_20.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320_", sid_20.data, T.lookup_param("p11", dtype="handle"), T.lookup_param("p12", dtype="handle"), sid_19.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", sid_3.data, T.lookup_param("p13", dtype="handle"), T.lookup_param("p14", dtype="handle"), sid_26.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__1", sid_26.data, T.lookup_param("p15", dtype="handle"), T.lookup_param("p16", dtype="handle"), sid_25.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast_1", sid_4.data, sid_32.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__2", sid_32.data, T.lookup_param("p17", dtype="handle"), T.lookup_param("p18", dtype="handle"), sid_31.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_concatenate", sid_2.data, sid_19.data, sid_25.data, sid_31.data, output, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8, sid_7, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_7, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_6, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", sid_6, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_5, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d", sid_5, sid_4, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_cast", sid_4, sid_3, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2", sid_3, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_2, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_3, T.lookup_param("p9", dtype="handle"), T.lookup_param("p10", dtype="handle"), sid_20, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320_", sid_20, T.lookup_param("p11", dtype="handle"), T.lookup_param("p12", dtype="handle"), sid_19, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", sid_3, T.lookup_param("p13", dtype="handle"), T.lookup_param("p14", dtype="handle"), sid_26, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__1", sid_26, T.lookup_param("p15", dtype="handle"), T.lookup_param("p16", dtype="handle"), sid_25, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast_1", sid_4, sid_32, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__2", sid_32, T.lookup_param("p17", dtype="handle"), T.lookup_param("p18", dtype="handle"), sid_31, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_concatenate", sid_2, sid_19, sid_25, sid_31, output, dtype="int32"))
     __tvm_meta__ = None
 # fmt: on
 
@@ -1129,11 +1129,11 @@ def tvmgen_default_fused_nn_contrib_conv2d_NCHWc(placeholder_2: T.handle, placeh
         placeholder_5 = T.match_buffer(placeholder_3, [81], dtype="float32")
         conv2d_NCHWc_1 = T.match_buffer(conv2d_NCHWc, [41], dtype="float32")
         # body
-        data_pad = T.allocate([1092], "float32", "global")
+        data_pad = T.decl_buffer([1092], "float32")
         for i0_i1_fused_i2_fused, i3, i4 in T.grid(26, 14, 3):
             data_pad[i0_i1_fused_i2_fused * 42 + i3 * 3 + i4] = T.if_then_else(1 <= i0_i1_fused_i2_fused and i0_i1_fused_i2_fused < 25 and 1 <= i3 and i3 < 13, placeholder_4[i0_i1_fused_i2_fused * 36 + i3 * 3 + i4 - 39], T.float32(0), dtype="float32")
         for n_oc_chunk_fused_oh_fused in T.serial(0, 24):
-            conv2d_NCHWc_global = T.allocate([36], "float32", "global")
+            conv2d_NCHWc_global = T.decl_buffer([36], "float32")
             for oc_block_c_init in T.serial(0, 3):
                 conv2d_NCHWc_global[oc_block_c_init] = T.float32(0)
             for oc_block_c_init in T.serial(0, 3):
@@ -1198,15 +1198,15 @@ def tvmgen_default_fused_nn_softmax_add_add_multiply_add(placeholder_6: T.handle
         T_add_1 = T.match_buffer(T_add, [864], dtype="float32")
         # body
         for ax0_ax1_fused_ax2_fused in T.serial(0, 72):
-            T_softmax_norm = T.allocate([12], "float32", "global")
-            with T.allocate([1], "float32", "global") as T_softmax_maxelem:
+            T_softmax_norm = T.decl_buffer([12], "float32")
+            with T.decl_buffer([1], "float32") as T_softmax_maxelem:
                 T_softmax_maxelem[0] = T.float32(-3.4028234663852886e+38)
                 for k in T.serial(0, 12):
                     T_softmax_maxelem[0] = T.max(T_softmax_maxelem[0], placeholder_11[ax0_ax1_fused_ax2_fused * 12 + k])
-                T_softmax_exp = T.allocate([12], "float32", "global")
+                T_softmax_exp = T.decl_buffer([12], "float32")
                 for i3 in T.serial(0, 12):
                     T_softmax_exp[i3] = T.exp(placeholder_11[ax0_ax1_fused_ax2_fused * 12 + i3] - T_softmax_maxelem[0], dtype="float32")
-                T_softmax_expsum = T.allocate([1], "float32", "global")
+                T_softmax_expsum = T.decl_buffer([1], "float32")
                 T_softmax_expsum[0] = T.float32(0)
                 for k in T.serial(0, 12):
                     T_softmax_expsum[0] = T_softmax_expsum[0] + T_softmax_exp[k]
@@ -1224,8 +1224,8 @@ def tvmgen_default_fused_nn_contrib_dense_pack_nn_relu(placeholder_16: T.handle,
         T_relu_1 = T.match_buffer(T_relu, [864], dtype="float32")
         # body
         for ax1_outer_ax0_outer_fused in T.serial(0, 18):
-            compute = T.allocate([48], "float32", "global")
-            with T.allocate([48], "float32", "global") as compute_global:
+            compute = T.decl_buffer([48], "float32")
+            with T.decl_buffer([48], "float32") as compute_global:
                 for x_c_init in T.serial(0, 6):
                     compute_global[x_c_init] = T.float32(0)
                 for x_c_init in T.serial(0, 6):
@@ -1317,15 +1317,15 @@ def tvmgen_default_fused_nn_softmax_add(placeholder_26: T.handle, placeholder_27
         T_add_3 = T.match_buffer(T_add_2, [864], dtype="float32")
         # body
         for ax0_ax1_fused_ax2_fused in T.serial(0, 72):
-            T_softmax_norm = T.allocate([12], "float32", "global")
-            with T.allocate([1], "float32", "global") as T_softmax_maxelem:
+            T_softmax_norm = T.decl_buffer([12], "float32")
+            with T.decl_buffer([1], "float32") as T_softmax_maxelem:
                 T_softmax_maxelem[0] = T.float32(-3.4028234663852886e+38)
                 for k in T.serial(0, 12):
                     T_softmax_maxelem[0] = T.max(T_softmax_maxelem[0], placeholder_28[ax0_ax1_fused_ax2_fused * 12 + k])
-                T_softmax_exp = T.allocate([12], "float32", "global")
+                T_softmax_exp= T.decl_buffer([12], "float32")
                 for i3 in T.serial(0, 12):
                     T_softmax_exp[i3] = T.exp(placeholder_28[ax0_ax1_fused_ax2_fused * 12 + i3] - T_softmax_maxelem[0], dtype="float32")
-                T_softmax_expsum = T.allocate([1], "float32", "global")
+                T_softmax_expsum = T.decl_buffer([1], "float32")
                 T_softmax_expsum[0] = T.float32(0)
                 for k in T.serial(0, 12):
                     T_softmax_expsum[0] = T_softmax_expsum[0] + T_softmax_exp[k]
@@ -1359,20 +1359,20 @@ def run_model(data: T.handle, output: T.handle) -> None:
         sid_22 = T.allocate_const([1], "int8", [1])
         sid_23 = T.allocate_const([2,1], "int8", [3456])
 
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform_1", data_buffer.data, sid_23.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_conv2d_NCHWc", sid_8.data, T.cast(T.lookup_param("p0", dtype="handle"), "handle"), sid_7.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform", sid_7.data, sid_6.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape_1", data_buffer.data, sid_12.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_dense_pack_nn_relu", sid_12.data, T.cast(T.lookup_param("p1", dtype="handle"), "handle"), sid_11.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape", sid_11.data, sid_10.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_softmax_add_add_multiply_add", sid_6.data, sid_10.data, T.cast(T.lookup_param("p2", dtype="handle"), "handle"), T.cast(T.lookup_param("p3", dtype="handle"), "handle"), T.cast(T.lookup_param("p4", dtype="handle"), "handle"), sid_5.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform_1", sid_5.data, sid_4.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_conv2d_NCHWc", sid_4.data, T.cast(T.lookup_param("p5", dtype="handle"), "handle"), sid_3.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform", sid_3.data, sid_2.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape_1", sid_5.data, sid_20.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_dense_pack_nn_relu", sid_20.data, T.cast(T.lookup_param("p6", dtype="handle"), "handle"), sid_19.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape", sid_19.data, sid_18.data, dtype="int32"))
-        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_softmax_add", sid_2.data, sid_18.data, output_buffer.data, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform_1", data_buffer.data, sid_23, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_conv2d_NCHWc", sid_8, T.cast(T.lookup_param("p0", dtype="handle"), "handle"), sid_7, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform", sid_7, sid_6, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape_1", data_buffer.data, sid_12, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_dense_pack_nn_relu", sid_12, T.cast(T.lookup_param("p1", dtype="handle"), "handle"), sid_11, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape", sid_11, sid_10, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_softmax_add_add_multiply_add", sid_6, sid_10, T.cast(T.lookup_param("p2", dtype="handle"), "handle"), T.cast(T.lookup_param("p3", dtype="handle"), "handle"), T.cast(T.lookup_param("p4", dtype="handle"), "handle"), sid_5, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform_1", sid_5, sid_4, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_conv2d_NCHWc", sid_4, T.cast(T.lookup_param("p5", dtype="handle"), "handle"), sid_3, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform", sid_3, sid_2, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape_1", sid_5, sid_20, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_dense_pack_nn_relu", sid_20, T.cast(T.lookup_param("p6", dtype="handle"), "handle"), sid_19, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape", sid_19, sid_18, dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_softmax_add", sid_2, sid_18, output_buffer.data, dtype="int32"))
 # fmt: on
 
 
diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
index e6d123118757..fdda400a779f 100644
--- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
+++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
@@ -98,12 +98,14 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
         T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         T.preflattened_buffer(T_cast_21, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_7 = T.allocate([157323], "int16", "global")
+        PaddedInput_7_data = T.allocate([157323], "int16", "global")
+        PaddedInput_7 = T.buffer_decl(shape=[157323], dtype="int16", data=PaddedInput_7_data)
         for i0_i1_fused_7 in T.serial(0, 229):
             for i2_7, i3_7 in T.grid(229, 3):
                 PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544):
-            Conv2dOutput_7 = T.allocate([64], "int32", "global")
+            Conv2dOutput_7_data = T.allocate([64], "int32", "global")
+            Conv2dOutput_7 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_7_data)
             for ff_3 in T.serial(0, 64):
                 Conv2dOutput_7[ff_3] = 0
                 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3):
@@ -120,7 +122,8 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6:
         T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         T.preflattened_buffer(T_cast_7, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
-        tensor_2 = T.allocate([200704], "uint8", "global")
+        tensor_2_data = T.allocate([200704], "uint8", "global")
+        tensor_2 = T.buffer_decl(shape=[200704], dtype="uint8", data=tensor_2_data)
         for ax0_ax1_fused_4 in T.serial(0, 56):
             for ax2_4 in T.serial(0, 56):
                 for ax3_init in T.serial(0, 64):
@@ -140,9 +143,9 @@ def __tvm_main__(input: T.handle, output: T.handle) -> None:
         T.attr("default", "device_type", 1)
         sid_9 = T.allocate([301056], "int8", "global")
         sid_8 = T.allocate([802816], "int8", "global")
-        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8.data, output, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8, output, dtype="int32"))
 # fmt: on
 
 
@@ -299,11 +302,13 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla
         T_cast_5 = T.match_buffer(T_cast_4, [215], dtype="int16")
         T.preflattened_buffer(T_cast_5, [215], dtype="int16")
         # body
-        PaddedInput_1 = T.allocate([379456], "int16", "global")
+        PaddedInput_1_data = T.allocate([379456], "int16", "global")
+        PaddedInput_1 = T.buffer_decl(shape=[379456], dtype="int16", data=PaddedInput_1_data)
         for i0_i1_fused_1, i2_1, i3_1 in T.grid(77, 77, 64):
             PaddedInput_1[i0_i1_fused_1 * 4928 + i2_1 * 64 + i3_1] = T.if_then_else(1 <= i0_i1_fused_1 and i0_i1_fused_1 < 76 and 1 <= i2_1 and i2_1 < 76, placeholder_13[i0_i1_fused_1 * 4800 + i2_1 * 64 + i3_1 - 4864], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 5625):
-            Conv2dOutput_1 = T.allocate([64], "int32", "global")
+            Conv2dOutput_1_data = T.allocate([64], "int32", "global")
+            Conv2dOutput_1 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_1_data)
             for ff_1 in T.serial(0, 64):
                 Conv2dOutput_1[ff_1] = 0
                 for ry, rx, rc_1 in T.grid(3, 3, 64):
@@ -324,11 +329,13 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s
         T_add_1 = T.match_buffer(T_add, [407], dtype="int32")
         T.preflattened_buffer(T_add_1, [407], dtype="int32")
         # body
-        PaddedInput_2 = T.allocate([360000], "int16", "global")
+        PaddedInput_2_data = T.allocate([360000], "int16", "global")
+        PaddedInput_2 = T.buffer_decl(shape=[360000], dtype="int16", data=PaddedInput_2_data)
         for i0_i1_fused_2, i2_2, i3_2 in T.grid(75, 75, 64):
             PaddedInput_2[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2] = placeholder_19[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2]
         for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 5625):
-            Conv2dOutput_2 = T.allocate([64], "int32", "global")
+            Conv2dOutput_2_data = T.allocate([64], "int32", "global")
+            Conv2dOutput_2 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_2_data)
             for ax3_outer_1 in T.serial(0, 4):
                 for ff_2 in T.serial(0, 64):
                     Conv2dOutput_2[ff_2] = 0
@@ -352,11 +359,13 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s
         T_cast_7 = T.match_buffer(T_cast_6, [407], dtype="uint8")
         T.preflattened_buffer(T_cast_7, [407], dtype="uint8")
         # body
-        PaddedInput_3 = T.allocate([360000], "int16", "global")
+        PaddedInput_3_data = T.allocate([360000], "int16", "global")
+        PaddedInput_3 = T.buffer_decl(shape=[360000], dtype="int16", data=PaddedInput_3_data)
         for i0_i1_fused_3, i2_3, i3_3 in T.grid(75, 75, 64):
             PaddedInput_3[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3] = placeholder_29[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3]
         for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 5625):
-            Conv2dOutput_3 = T.allocate([64], "int32", "global")
+            Conv2dOutput_3_data = T.allocate([64], "int32", "global")
+            Conv2dOutput_3 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_3_data)
             for ax3_outer_2 in T.serial(0, 4):
                 for ff_3 in T.serial(0, 64):
                     Conv2dOutput_3[ff_3] = 0
@@ -376,11 +385,11 @@ def __tvm_main__(input: T.handle, output: T.handle) -> None:
         sid_6 = T.allocate([5760000], "int8", "global")
         sid_7 = T.allocate([720000], "int8", "global")
         sid_8 = T.allocate([720000], "int8", "global")
-        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast", input, T.lookup_param("p0", dtype="handle"), sid_2.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_2.data, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_8.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_8.data, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_7.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_", sid_7.data, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_6.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_", sid_2.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_6.data, output, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast", input, T.lookup_param("p0", dtype="handle"), sid_2, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_2, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_8, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_8, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_7, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_", sid_7, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_6, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_", sid_2, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_6, output, dtype="int32"))
 
     @T.prim_func
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(placeholder_4: T.handle, placeholder_5: T.handle, placeholder_6: T.handle, T_cast_2: T.handle) -> None:
@@ -395,11 +404,13 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place
         T_cast_3 = T.match_buffer(T_cast_2, [215], dtype="int16")
         T.preflattened_buffer(T_cast_3, [215], dtype="int16")
         # body
-        PaddedInput = T.allocate([360000], "int16", "global")
+        PaddedInput_data = T.allocate([360000], "int16", "global")
+        PaddedInput = T.buffer_decl([360000], "int16", data=PaddedInput_data)
         for i0_i1_fused, i2, i3 in T.grid(75, 75, 64):
             PaddedInput[i0_i1_fused * 4800 + i2 * 64 + i3] = placeholder_7[i0_i1_fused * 4800 + i2 * 64 + i3]
         for ax0_ax1_fused_ax2_fused in T.serial(0, 5625):
-            Conv2dOutput = T.allocate([64], "int32", "global")
+            Conv2dOutput_data = T.allocate([64], "int32", "global")
+            Conv2dOutput = T.buffer_decl([64], "int32", data=Conv2dOutput_data)
             for ff in T.serial(0, 64):
                 Conv2dOutput[ff] = 0
                 for rc in T.serial(0, 64):
diff --git a/tests/python/unittest/test_tir_usmp_utils.py b/tests/python/unittest/test_tir_usmp_utils.py
index 155ff0962def..756b97b0d223 100644
--- a/tests/python/unittest/test_tir_usmp_utils.py
+++ b/tests/python/unittest/test_tir_usmp_utils.py
@@ -48,12 +48,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
         placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
-        PaddedInput_7 = T.allocate([157323], "int16", "global")
+        PaddedInput_7 = T.decl_buffer([157323], "int16")
         for i0_i1_fused_7 in T.serial(0, 229):
             for i2_7, i3_7 in T.grid(229, 3):
                 PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16")
         for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544):
-            Conv2dOutput_7 = T.allocate([64], "int32", "global")
+            Conv2dOutput_7 = T.decl_buffer([64], "int32")
             for ff_3 in T.serial(0, 64):
                 Conv2dOutput_7[ff_3] = 0
                 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3):
@@ -68,7 +68,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6:
         placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
-        tensor_2 = T.allocate([200704], "uint8", "global")
+        tensor_2 = T.decl_buffer([200704], "uint8")
         for ax0_ax1_fused_4 in T.serial(0, 56):
             for ax2_4 in T.serial(0, 56):
                 for ax3_init in T.serial(0, 64):
@@ -88,9 +88,9 @@ def tvmgen_default_run_model(input: T.handle, output: T.handle) -> None:
         T.attr("default", "device_type", 1)
         sid_9 = T.allocate([301056], "int8", "global")
         sid_8 = T.allocate([802816], "int8", "global")
-        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8.data, dtype="int32"))
-        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8.data, output, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8, dtype="int32"))
+        T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8, output, dtype="int32"))
     __tvm_meta__ = None
 # fmt: on
 
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index 45ea88f829ec..17622789558d 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -94,12 +94,18 @@ def mmult(A: T.handle, B: T.handle, C: T.handle) -> None:
             B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=64, offset_factor=1)
             C_1 = T.match_buffer(C, [1024 * 1024], elem_offset=0, align=64, offset_factor=1)
             # body
-            packedB = T.allocate([32768], "float32", "global")
+            packedB_data = T.allocate([32768], "float32", "global")
+            packedB = T.buffer_decl(
+                shape=[32768], dtype="float32", scope="global", data=packedB_data
+            )
             for x in T.parallel(0, 32):
                 for y in T.serial(0, 1024):
                     packedB[T.ramp(((x * 32768) + (y * 32)), 1, 32)] = B_1[y, T.ramp(x * 32, 1, 32)]
             for x_outer in T.parallel(0, 32):
-                C_global = T.allocate([1024], "float32", "global")
+                C_global_data = T.allocate([1024], "float32", "global")
+                C_global = T.buffer_decl(
+                    shape=[1024], dtype="float32", scope="global", data=C_global_data
+                )
                 for y_outer in T.serial(0, 32):
                     for x_c_init in T.serial(0, 32):
                         C_global[T.ramp((x_c_init * 32), 1, 32)] = T.broadcast(T.float32(0), 32)
@@ -953,11 +959,24 @@ def func(
         ty = T.env_thread("threadIdx.y")
         tz = T.env_thread("threadIdx.z")
         T.launch_thread(bz, 196)
-        Conv_wmma_accumulator = T.allocate([2048], "float32", "wmma.accumulator")
-        Apad_shared = T.allocate([12288], "float16", "shared")
-        W_shared = T.allocate([12288], "float16", "shared")
-        Apad_shared_wmma_matrix_a = T.allocate([512], "float16", "wmma.matrix_a")
-        W_shared_wmma_matrix_b = T.allocate([1024], "float16", "wmma.matrix_b")
+        Conv_wmma_accumulator_data = T.allocate([2048], "float32", "wmma.accumulator")
+        Conv_wmma_accumulator = T.buffer_decl(
+            shape=[2048], dtype="float32", scope="wmma.accumulator", data=Conv_wmma_accumulator_data
+        )
+        Apad_shared_data = T.allocate([12288], "float16", "shared")
+        Apad_shared = T.buffer_decl(
+            shape=[12288], dtype="float16", scope="shared", data=Apad_shared_data
+        )
+        W_shared_data = T.allocate([12288], "float16", "shared")
+        W_shared = T.buffer_decl(shape=[12288], dtype="float16", scope="shared", data=W_shared_data)
+        Apad_shared_wmma_matrix_a_data = T.allocate([512], "float16", "wmma.matrix_a")
+        Apad_shared_wmma_matrix_a = T.buffer_decl(
+            shape=[512], dtype="float16", scope="wmma.matrix_a", data=Apad_shared_wmma_matrix_a_data
+        )
+        W_shared_wmma_matrix_b_data = T.allocate([1024], "float16", "wmma.matrix_b")
+        W_shared_wmma_matrix_b = T.buffer_decl(
+            shape=[1024], dtype="float16", scope="wmma.matrix_b", data=W_shared_wmma_matrix_b_data
+        )
         T.launch_thread(bx, 2)
         T.launch_thread(by, 4)
         T.launch_thread(ty, 4)
@@ -2479,7 +2498,8 @@ def vthread_func(a: T.handle, c: T.handle) -> None:
         T.launch_thread(i0, 4)
         T.launch_thread(i1, 2)
         T.launch_thread(i2, 2)
-        B = T.allocate([16], "float32", "local")
+        B_data = T.allocate([16], "float32", "local")
+        B = T.buffer_decl(shape=[16], dtype="float32", scope="local", data=B_data)
         for j in range(16):
             B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + T.float32(1)
         for j in range(16):
@@ -2792,11 +2812,13 @@ def B(a: T.handle, c: T.handle) -> None:
             C = T.match_buffer(c, (10), "int32")
             B = T.alloc_buffer((10), "int32")
 
-            K1 = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+            K1_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+            K1 = T.buffer_decl(shape=[10], dtype="int32", data=K1_data)
             for x in T.serial(0, 10):
                 B[x] = A[x] + K1[x]
 
-            K2 = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+            K2_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+            K2 = T.buffer_decl(shape=[10], dtype="int32", data=K2_data)
             for x in T.serial(0, 10):
                 B[x] = B[x] + K2[x]
 
@@ -2812,7 +2834,8 @@ def constant(a: T.handle, c: T.handle) -> None:
         A = T.match_buffer(a, (10), "int32")
         C = T.match_buffer(c, (10), "int32")
         B = T.alloc_buffer((10), "int32")
-        K = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+        K_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10])
+        K = T.buffer_decl(shape=[10], dtype="int32", data=K_data)
         for x in T.serial(0, 10):
             B[x] = A[x] + K[x]
 
@@ -2961,7 +2984,8 @@ def primfunc_with_allocate_annotations(placeholder_28: T.handle, T_cast_6: T.han
         placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
-        tensor_2 = T.allocate([200704], "uint8", "global", annotations={"attr1_key": "attr1_value"})
+        tensor_2_data = T.allocate([200704], "uint8", "global", annotations={"attr1_key": "attr1_value"})
+        tensor_2 = T.buffer_decl(shape=[200704], dtype="uint8", scope="global", data=tensor_2_data)
         for ax0_ax1_fused_4 in T.serial(0, 56):
             for ax2_4 in T.serial(0, 56):
                 for ax3_init in T.serial(0, 64):
@@ -2987,7 +3011,8 @@ def comm_reducer_single_reduce_group(a: T.handle, b: T.handle) -> None:
         A = T.match_buffer(a, [128 * 128], dtype="float32")
         for i in T.serial(0, 128):
             T.launch_thread(threadIdx_x, 128)
-            reduce_temp0 = T.allocate([1], "float32", "local")
+            reduce_temp0_data = T.allocate([1], "float32", "local")
+            reduce_temp0 = T.buffer_decl(shape=[1], dtype="float32", scope="local", data=reduce_temp0_data)
             with T.attr(T.comm_reducer(lambda x, y: x + y, [T.float32(0)]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle")):
                 T.evaluate(T.tvm_thread_allreduce(T.uint32(1), A[i * 128 + threadIdx_x], True, reduce_temp0.data, threadIdx_x, dtype="handle"))
 
@@ -3002,7 +3027,8 @@ def comm_reducer_multiple_reduce_groups(a: T.handle, b: T.handle) -> None:
         A = T.match_buffer(a, [128 * 128], dtype="float32")
         for i in T.serial(0, 128):
             T.launch_thread(threadIdx_x, 128)
-            reduce_temp0 = T.allocate([1], "float32", "local")
+            reduce_temp0_data = T.allocate([1], "float32", "local")
+            reduce_temp0 = T.buffer_decl(shape=[1], dtype="float32", scope="local", data=reduce_temp0_data)
             with T.attr(T.comm_reducer(lambda x0, x1, y0, y1: (T.Select((x1 >= y1), x0, y0), T.Select((x1 >= y1), x1, y1)), [T.int32(-1), T.min_value("float32")]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle")):
                 T.evaluate(T.tvm_thread_allreduce(T.uint32(1), A[i * 128 + threadIdx_x], True, reduce_temp0.data, threadIdx_x, dtype="handle"))
 
@@ -3149,7 +3175,8 @@ def func_T_ptr_let_statement(
 def func_T_ptr_allocate():
     @T.prim_func
     def func_T_ptr_allocate() -> None:
-        A = T.allocate([1024], "float32", "global")
+        A_data = T.allocate([1024], "float32", "global")
+        A = T.buffer_decl(shape=[1024], dtype="float32", scope="global", data=A_data)
         A[0] = 0.0
 
     return func_T_ptr_allocate
@@ -3240,8 +3267,10 @@ def string_annotation_of_special_chars():
 def pointer_type():
     @T.prim_func
     def func_with_ptr_type_annotations(x: T.Ptr[T.int32], y: T.Ptr[T.int32, "shared"]):
-        xx = T.allocate([16], "int32", "global")
-        yy = T.allocate([16], "int32", "shared")
+        xx_data = T.allocate([16], "int32", "global")
+        xx = T.buffer_decl(shape=[16], dtype="int32", scope="global", data=xx_data)
+        yy_data = T.allocate([16], "int32", "shared")
+        yy = T.buffer_decl(shape=[16], dtype="int32", scope="shared", data=yy_data)
         a: T.Ptr[T.int32] = T.address_of(xx[0], dtype="handle")
         b: T.Ptr[T.int32, "shared"] = T.address_of(yy[0], dtype="handle")
         T.evaluate(T.call_extern("copy", a, b, dtype=""))
@@ -3313,6 +3342,24 @@ def func(A: T.Buffer[(16, 16), "float32"], B: T.Buffer[(16, 16), "float32"]) ->
     return func
 
 
+def allocate_and_decl_buffer():
+    @T.prim_func
+    def func(A: T.Buffer[(16,), "float32"], B: T.Buffer[(16,), "float32"]) -> None:
+        D_data = T.allocate((16,), "float32", "global")
+        D = T.decl_buffer((16,), "float32", data=D_data)
+        for i in range(4):
+            with T.allocate((4,), "float32", "global") as C_data:
+                C = T.decl_buffer((4,), "float32", data=C_data)
+                for j in range(4):
+                    C[j] = A[i * 4 + j] + T.float32(1.0)
+                for j in range(4):
+                    D[j] = C[j]
+            for j in range(4):
+                B[i * 4 + j] = D[j]
+
+    return func
+
+
 def float_infinity():
     @T.prim_func
     def func(
@@ -3374,6 +3421,7 @@ def func(
     let_expression,
     void_ptr,
     decl_buffer,
+    allocate_and_decl_buffer,
     float_infinity,
 )
 

From d54c0651ecae088e24fcfd448cfca31c77e8c2cb Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Wed, 31 Aug 2022 20:51:21 +0100
Subject: [PATCH 082/704] [Torch][AArch64] Skip
 test_load_model___wrong_language__to_pytorch (#12660)

This patch makes test_load_model___wrong_language__to_pytorch to be
skipped in AArch64 due to a bug that can be reproduced when enabling
Integration Tests in machines with Torch installed in TVM.

```
The error message seen is:
OSError: /usr/local/lib/python3.7/dist-packages/torch/lib/
libgomp-d22c30c5.so.1: cannot allocate memory in static TLS block
```

While the test needs further investigation, it is being set as
skipped so other tests can be enabled and not to regress and allow
time for the investigation to be made.

This relates to the issue described in #10673.
---
 tests/python/driver/tvmc/test_frontends.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py
index 98659b05ae5c..1ccac7696fcc 100644
--- a/tests/python/driver/tvmc/test_frontends.py
+++ b/tests/python/driver/tvmc/test_frontends.py
@@ -269,6 +269,10 @@ def test_load_quantized_model__pth(pytorch_mobilenetv2_quantized):
         assert p.dtype in ["int8", "uint8", "int32"]  # int32 for bias
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
+)
 def test_load_model___wrong_language__to_pytorch(tflite_mobilenet_v1_1_quant):
     # some CI environments wont offer pytorch, so skip in case it is not present
     pytest.importorskip("torch")

From a399e6ce9759cd524fcb8f804749baa426096e4b Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 31 Aug 2022 16:10:07 -0700
Subject: [PATCH 083/704] [ci] Add linter for PR title and body (#12367)

* [skip ci][ci] Fix Jenkinsfile (#12387)

This got out of date after merging #12178

Co-authored-by: driazati <driazati@users.noreply.github.com>

* Address comments

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                  |  23 +++++-
 ci/jenkins/Prepare.groovy.j2 |  21 ++++-
 ci/scripts/check_pr.py       | 150 +++++++++++++++++++++++++++++++++++
 ci/scripts/git_skip_ci.py    |   2 +-
 4 files changed, 192 insertions(+), 4 deletions(-)
 create mode 100644 ci/scripts/check_pr.py

diff --git a/Jenkinsfile b/Jenkinsfile
index 50eee01fa974..2b73508da0d3 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-08-30T11:58:06.036509
+// Generated at 2022-08-30T15:26:50.100067
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -288,7 +288,7 @@ def should_skip_ci(pr_number) {
   }
   withCredentials([string(
     credentialsId: 'tvm-bot-jenkins-reader',
-    variable: 'TOKEN',
+    variable: 'GITHUB_TOKEN',
     )]) {
     // Exit code of 1 means run full CI (or the script had an error, so run
     // full CI just in case). Exit code of 0 means skip CI.
@@ -301,12 +301,31 @@ def should_skip_ci(pr_number) {
   return git_skip_ci_code == 0
 }
 
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ci/scripts/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
 def prepare() {
   stage('Prepare') {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
         init_git()
 
+        check_pr(env.CHANGE_ID)
+
         if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
           sh(
             script: "./ci/scripts/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2
index 94575a7b4b64..6d0c0ec9c4b6 100644
--- a/ci/jenkins/Prepare.groovy.j2
+++ b/ci/jenkins/Prepare.groovy.j2
@@ -138,7 +138,7 @@ def should_skip_ci(pr_number) {
   }
   withCredentials([string(
     credentialsId: 'tvm-bot-jenkins-reader',
-    variable: 'TOKEN',
+    variable: 'GITHUB_TOKEN',
     )]) {
     // Exit code of 1 means run full CI (or the script had an error, so run
     // full CI just in case). Exit code of 0 means skip CI.
@@ -151,12 +151,31 @@ def should_skip_ci(pr_number) {
   return git_skip_ci_code == 0
 }
 
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ci/scripts/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
 def prepare() {
   stage('Prepare') {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
         init_git()
 
+        check_pr(env.CHANGE_ID)
+
         if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
           sh(
             script: "./ci/scripts/determine_docker_images.py {% for image in images %}{{ image.name }}={% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %} {% endfor %}",
diff --git a/ci/scripts/check_pr.py b/ci/scripts/check_pr.py
new file mode 100644
index 000000000000..45d502c6a72e
--- /dev/null
+++ b/ci/scripts/check_pr.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import argparse
+import re
+import os
+import textwrap
+from dataclasses import dataclass
+from typing import Any, List, Callable
+
+
+from git_utils import GitHubRepo, parse_remote, git
+from cmd_utils import init_log, tags_from_title
+
+
+GITHUB_USERNAME_REGEX = re.compile(r"(@[a-zA-Z0-9-]+)", flags=re.MULTILINE)
+OK = object()
+FAIL = object()
+
+
+@dataclass
+class Check:
+    # check to run, returning OK means it passed, anything else means it failed
+    check: Callable[[str], Any]
+
+    # function to call to generate the error message
+    error_fn: Callable[[Any], str]
+
+
+def non_empty(s: str):
+    if len(s) == 0:
+        return FAIL
+    return OK
+
+
+def usernames(s: str):
+    m = GITHUB_USERNAME_REGEX.findall(s)
+    return m if m else OK
+
+
+def tags(s: str):
+    items = tags_from_title(s)
+    if len(items) == 0:
+        return FAIL
+    return OK
+
+
+def trailing_period(s: str):
+    if s.endswith("."):
+        return FAIL
+    return OK
+
+
+title_checks = [
+    Check(check=non_empty, error_fn=lambda d: "PR must have a title but title was empty"),
+    Check(check=trailing_period, error_fn=lambda d: "PR must not end in a tailing '.'"),
+    # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done
+    # Check(
+    #     check=usernames,
+    #     error_fn=lambda d: f"PR title must not tag anyone but found these usernames: {d}",
+    # ),
+]
+body_checks = [
+    Check(check=non_empty, error_fn=lambda d: "PR must have a body but body was empty"),
+    # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done
+    # Check(
+    #     check=usernames,
+    #     error_fn=lambda d: f"PR body must not tag anyone but found these usernames: {d}",
+    # ),
+]
+
+
+def run_checks(checks: List[Check], s: str, name: str) -> bool:
+    print(f"Running checks for {name}")
+    print(textwrap.indent(s, prefix="    "))
+    passed = True
+    print("    Checks:")
+    for i, check in enumerate(checks):
+        result = check.check(s)
+        if result == OK:
+            print(f"        [{i+1}] {check.check.__name__}: PASSED")
+        else:
+            passed = False
+            msg = check.error_fn(result)
+            print(f"        [{i+1}] {check.check.__name__}: FAILED: {msg}")
+
+    return passed
+
+
+if __name__ == "__main__":
+    init_log()
+    help = "Check a PR's title and body for conformance to guidelines"
+    parser = argparse.ArgumentParser(description=help)
+    parser.add_argument("--pr", required=True)
+    parser.add_argument("--remote", default="origin", help="ssh remote to parse")
+    parser.add_argument(
+        "--pr-body", help="(testing) PR body to use instead of fetching from GitHub"
+    )
+    parser.add_argument(
+        "--pr-title", help="(testing) PR title to use instead of fetching from GitHub"
+    )
+    args = parser.parse_args()
+
+    try:
+        pr = int(args.pr)
+    except ValueError:
+        print(f"PR was not a number: {args.pr}")
+        exit(0)
+
+    if args.pr_body:
+        body = args.pr_body
+        title = args.pr_title
+    else:
+        remote = git(["config", "--get", f"remote.{args.remote}.url"])
+        user, repo = parse_remote(remote)
+
+        github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo)
+        pr = github.get(f"pulls/{args.pr}")
+        body = pr["body"]
+        title = pr["title"]
+
+    body = body.strip()
+    title = title.strip()
+
+    title_passed = run_checks(checks=title_checks, s=title, name="PR title")
+    print("")
+    body_passed = run_checks(checks=body_checks, s=body, name="PR body")
+
+    if title_passed and body_passed:
+        print("All checks passed!")
+        exit(0)
+    else:
+        print(
+            "Some checks failed, please review the logs above and edit your PR on GitHub accordingly"
+        )
+        exit(1)
diff --git a/ci/scripts/git_skip_ci.py b/ci/scripts/git_skip_ci.py
index 1e02fcb964fc..162e513275c4 100755
--- a/ci/scripts/git_skip_ci.py
+++ b/ci/scripts/git_skip_ci.py
@@ -46,7 +46,7 @@ def check_pr_title():
         if args.pr_title:
             title = args.pr_title
         else:
-            github = GitHubRepo(token=os.environ["TOKEN"], user=user, repo=repo)
+            github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo)
             pr = github.get(f"pulls/{args.pr}")
             title = pr["title"]
         logging.info(f"pr title: {title}")

From c6516a534fded605ae24bf56e24ec871b68ca9e2 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 31 Aug 2022 19:23:15 -0700
Subject: [PATCH 084/704] [TIR] Allow string/buffer arguments to Schedule
 cache_read/write (#12661)

Previously, the argument needed to be an integer specifying the index
into the read/write regions of a block.  Now, the argument can be a
string specifying the name of the buffer, or the Buffer object itself.
This is a follow-up from https://github.com/apache/tvm/pull/11624.
---
 python/tvm/tir/schedule/schedule.py           | 42 ++++++++++++++++---
 .../test_tir_schedule_cache_read_write.py     |  8 +++-
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index 04cc1bc26ad1..d1293371a0e0 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -1014,7 +1014,7 @@ def after_unroll(a: T.handle, b: T.handle) -> None:
     def cache_read(
         self,
         block: Union[BlockRV, str],
-        read_buffer_index: int,
+        read_buffer_index: Union[int, str, Buffer],
         storage_scope: str,
         consumer_blocks: Optional[List[Union[BlockRV, str]]] = None,
     ) -> BlockRV:
@@ -1029,8 +1029,10 @@ def cache_read(
         block : Union[BlockRV, str]
             The consumer block of the target buffer.
 
-        read_buffer_index: int
-            The index of the buffer in block's read region.
+        buffer: Union[int, str, Buffer]
+            The index of the buffer in block's read region, the unique
+            name of a read buffer in the block, or a Buffer object
+            that is within the blocks read region.
 
         storage_scope: str
             The target storage scope.
@@ -1093,13 +1095,21 @@ def after_cache_read(a: T.handle, b: T.handle) -> None:
         # Convert any string block names into Block RVs.
         consumer_blocks = [self._normalize_block_arg(b) for b in consumer_blocks]
         block = self._normalize_block_arg(block)
+
+        if not isinstance(read_buffer_index, int):
+            _, read_buffer_index, _ = self._normalize_buffer_arg(
+                block, read_buffer_index, required_buffer_type="read"
+            )
         return _ffi_api.ScheduleCacheRead(  # type: ignore # pylint: disable=no-member
             self, block, read_buffer_index, storage_scope, consumer_blocks
         )
 
     @type_checked
     def cache_write(
-        self, block: Union[BlockRV, str], write_buffer_index: int, storage_scope: str
+        self,
+        block: Union[BlockRV, str],
+        write_buffer_index: Union[int, str, Buffer],
+        storage_scope: str,
     ) -> BlockRV:
         """Create a block that reads a buffer region into a write cache. It requires:
 
@@ -1113,7 +1123,9 @@ def cache_write(
             The producer block of the target buffer.
 
         write_buffer_index: int
-            The index of the buffer in block's write region.
+            The index of the buffer in block's write region, the unique
+            name of a write buffer in the block, or a Buffer object
+            that is within the blocks write region.
 
         storage_scope: str
             The target storage scope.
@@ -1168,6 +1180,11 @@ def after_cache_write(a: T.handle, b: T.handle) -> None:
 
         """
         block = self._normalize_block_arg(block)
+
+        if not isinstance(write_buffer_index, int):
+            _, write_buffer_index, _ = self._normalize_buffer_arg(
+                block, write_buffer_index, required_buffer_type="write"
+            )
         return _ffi_api.ScheduleCacheWrite(  # type: ignore # pylint: disable=no-member
             self, block, write_buffer_index, storage_scope
         )
@@ -2352,7 +2369,10 @@ def _normalize_block_arg(self, block: Union[BlockRV, str]) -> BlockRV:
         return block
 
     def _normalize_buffer_arg(
-        self, block: BlockRV, buffer: Union[Tuple[str, int], str, Buffer]
+        self,
+        block: BlockRV,
+        buffer: Union[Tuple[str, int], int, str, Buffer],
+        required_buffer_type=None,
     ) -> Tuple[str, int, Buffer]:
 
         block_obj: Block = self.get(block)
@@ -2364,6 +2384,9 @@ def iter_buffers():
             for i, write in enumerate(block_obj.writes):
                 yield "write", i, write.buffer
 
+        if isinstance(buffer, int):
+            buffer = (required_buffer_type, buffer)
+
         if isinstance(buffer, str):
             possible_buffers = {}
             # String lookup requires ensuring that the name is unique
@@ -2405,6 +2428,13 @@ def iter_buffers():
         else:
             raise TypeError(f"Invalid type for argument 'buffer': {type(buffer)}")
 
+        if required_buffer_type is not None:
+            assert buffer_index_type == required_buffer_type, (
+                f"Expected buffer to be read buffer, "
+                f"but {buffer_obj.name} was a {buffer_index_type} buffer "
+                f"in the specified block"
+            )
+
         return (buffer_index_type, buffer_index, buffer_obj)
 
     @type_checked
diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py b/tests/python/unittest/test_tir_schedule_cache_read_write.py
index 255ca34118d0..cf4836e5361e 100644
--- a/tests/python/unittest/test_tir_schedule_cache_read_write.py
+++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py
@@ -774,8 +774,12 @@ def test_cache_read_elementwise(use_block_name):
     sch = tir.Schedule(elementwise, debug_mask="all")
     block_b = sch.get_block("B")
     block_c = sch.get_block("C")
-    cached_a = sch.cache_read("B" if use_block_name else block_b, 0, "global")
-    cached_b = sch.cache_read("C" if use_block_name else block_c, 0, "local")
+    if use_block_name:
+        cached_a = sch.cache_read("B", "A", "global")
+        cached_b = sch.cache_read("C", "B", "local")
+    else:
+        cached_a = sch.cache_read(block_b, 0, "global")
+        cached_b = sch.cache_read(block_c, 0, "local")
     assert sch.get(cached_a) == sch.get(sch.get_block("A_global"))
     assert sch.get(cached_b) == sch.get(sch.get_block("B_local"))
     assert sch.get(block_b) == sch.get(sch.get_block("B"))

From aa6c7123d0a2cdd93256c6a4576ff029008fd375 Mon Sep 17 00:00:00 2001
From: Nicola Lancellotti <nicola.lancellotti@arm.com>
Date: Thu, 1 Sep 2022 08:10:55 +0100
Subject: [PATCH 085/704] [ETHOSN] Fix tests pylint errors (#12649)

This pr fixes pylint errors in tests/python/contrib/test_ethosn as reported in issue #11414.
---
 tests/lint/pylint.sh                          |  1 +
 .../contrib/test_ethosn/infrastructure.py     | 50 +++++++++++--------
 .../contrib/test_ethosn/test_concatenate.py   | 10 +++-
 .../test_ethosn/test_constant_duplication.py  | 10 ++--
 .../python/contrib/test_ethosn/test_conv2d.py | 18 ++++---
 .../test_ethosn/test_depth_to_space.py        |  4 ++
 .../test_ethosn/test_fullyconnected.py        | 25 +++++-----
 .../contrib/test_ethosn/test_leaky_relu.py    |  2 +
 tests/python/contrib/test_ethosn/test_mean.py |  2 +
 .../contrib/test_ethosn/test_multiply.py      |  3 ++
 .../contrib/test_ethosn/test_networks.py      | 13 +++--
 .../test_ethosn/test_partition_params.py      | 24 ++++++---
 .../contrib/test_ethosn/test_pooling.py       |  8 ++-
 tests/python/contrib/test_ethosn/test_relu.py |  4 ++
 .../contrib/test_ethosn/test_requantize.py    |  5 ++
 .../python/contrib/test_ethosn/test_resize.py |  4 ++
 .../contrib/test_ethosn/test_sigmoid.py       | 11 ++--
 .../python/contrib/test_ethosn/test_split.py  |  9 +++-
 tests/python/contrib/test_ethosn/test_tanh.py |  4 ++
 .../contrib/test_ethosn/test_topologies.py    | 33 +++++++-----
 20 files changed, 158 insertions(+), 82 deletions(-)

diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh
index 2228e110c15e..94fae289b6b9 100755
--- a/tests/lint/pylint.sh
+++ b/tests/lint/pylint.sh
@@ -21,6 +21,7 @@ python3 -m pylint python/tvm --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint vta/python/vta --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/unittest/test_tvmscript_type.py --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/contrib/test_cmsisnn --rcfile="$(dirname "$0")"/pylintrc
+python3 -m pylint tests/python/contrib/test_ethosn --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/relay/aot/*.py --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/ci --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc
diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
index a1c8ca0a32d2..0071b1a7f52e 100644
--- a/tests/python/contrib/test_ethosn/infrastructure.py
+++ b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -18,17 +18,17 @@
 """Arm(R) Ethos(TM)-N test functions"""
 
 from __future__ import absolute_import, print_function
-import tvm
-from tvm import relay
-from tvm.contrib import utils, graph_executor, download
 from hashlib import md5
 from itertools import zip_longest, combinations
+import os
 import numpy as np
 from PIL import Image
-import os
 
-from . import _infrastructure
+import tvm
+from tvm import relay
+from tvm.contrib import utils, graph_executor, download
 from tvm.relay.op.contrib import partition_for_ethosn
+from . import _infrastructure
 
 
 def get_real_image(im_height, im_width):
@@ -82,23 +82,25 @@ def make_module(func, params):
 
 
 def make_ethosn_composite(ethosn_expr, name):
-    vars = relay.analysis.free_vars(ethosn_expr)
-    inner_vars = [relay.Var(v.name_hint, v.type_annotation) for v in vars]
+    variables = relay.analysis.free_vars(ethosn_expr)
+    inner_vars = [relay.Var(v.name_hint, v.type_annotation) for v in variables]
     func = relay.Function(inner_vars, ethosn_expr)
     func = func.with_attr("Composite", name)
-    call = relay.Call(func, vars)
+    call = relay.Call(func, variables)
     return call
 
 
 def make_ethosn_partition(ethosn_expr):
+    """Make an Ethos(TM)-N partition."""
+
     # Create an Ethos-N global function
     mod = tvm.IRModule({})
-    vars = relay.analysis.free_vars(ethosn_expr)
+    variables = relay.analysis.free_vars(ethosn_expr)
     # NB: it is illegal to reuse variables inside and outside a scope in Relay
     # if you want to duplicate types and names you must re-allocate them.
-    fresh_vars = [relay.Var(v.name_hint, v.type_annotation) for v in vars]
+    fresh_vars = [relay.Var(v.name_hint, v.type_annotation) for v in variables]
     binds = {}
-    for var, fresh_var in zip(vars, fresh_vars):
+    for var, fresh_var in zip(variables, fresh_vars):
         binds[var] = fresh_var
     ethosn_expr_fresh = relay.bind(ethosn_expr, binds)
     func = relay.Function(fresh_vars, ethosn_expr_fresh)
@@ -106,19 +108,21 @@ def make_ethosn_partition(ethosn_expr):
     func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
     func = func.with_attr("Compiler", "ethos-n")
     func = func.with_attr("global_symbol", "ethos-n_0")
-    g1 = relay.GlobalVar("ethos-n_0")
-    mod[g1] = func
+    global_var = relay.GlobalVar("ethos-n_0")
+    mod[global_var] = func
     mod = relay.transform.InferType()(mod)
 
     # These are the vars to call the Ethos-N partition with
     more_vars = relay.analysis.free_vars(ethosn_expr)
     # Call the Ethos-N partition in main
-    call_fn1 = g1(*more_vars)
+    call_fn1 = global_var(*more_vars)
     mod["main"] = relay.Function(more_vars, call_fn1)
     return relay.transform.InferType()(mod)
 
 
 def get_host_op_count(mod):
+    """Return the number of host operators."""
+
     class Counter(tvm.relay.ExprVisitor):
         def __init__(self):
             super().__init__()
@@ -219,9 +223,7 @@ def run(lib, inputs, outputs, npu=True):
     return out
 
 
-def build_and_run(
-    mod, inputs, outputs, params, device=tvm.cpu(), npu=True, expected_host_ops=0, npu_partitions=1
-):
+def build_and_run(mod, inputs, outputs, params, npu=True, expected_host_ops=0, npu_partitions=1):
     lib = build(mod, params, npu, expected_host_ops, npu_partitions)
     return run(lib, inputs, outputs, npu)
 
@@ -254,6 +256,8 @@ def inference_result(outputs):
 
 
 def test_error(mod, params, err_msg):
+    """Test an operator error message."""
+
     caught = None
     with tvm.transform.PassContext(
         opt_level=3, config={"relay.ext.ethos-n.options": {"variant": get_ethosn_variant()}}
@@ -262,8 +266,8 @@ def test_error(mod, params, err_msg):
             try:
                 mod = relay.transform.InferType()(mod)
                 relay.build(mod, params=params)
-            except tvm.error.TVMError as e:
-                caught = e.args[0]
+            except tvm.error.TVMError as error:
+                caught = error.args[0]
             finally:
                 relay.backend.te_compiler.get().clear()
 
@@ -275,8 +279,8 @@ def get_conv2d(var, shape, dtype):
     """Standard convolution to test activation functions"""
 
     weight_shape = (1, 1, shape[3], 1)
-    w = tvm.nd.array(np.ones(weight_shape, dtype))
-    weights = relay.const(w, dtype)
+    weights_array = tvm.nd.array(np.ones(weight_shape, dtype))
+    weights = relay.const(weights_array, dtype)
     conv = relay.qnn.op.conv2d(
         var,
         weights,
@@ -300,13 +304,15 @@ def get_conv2d(var, shape, dtype):
         relay.const(0, "int32"),  # output zero point
         out_dtype=dtype,
     )
-    params = {"w": w, "b": b}
+    params = {"w": weights_array, "b": b}
     return req, params
 
 
 def get_conv2d_qnn_params(
     dtype, input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, channels
 ):
+    """Return Conv2D QNN params."""
+
     kernel_sc = (
         kernel_sc.numpy() if isinstance(kernel_sc, tvm.runtime.ndarray.NDArray) else [kernel_sc]
     )
diff --git a/tests/python/contrib/test_ethosn/test_concatenate.py b/tests/python/contrib/test_ethosn/test_concatenate.py
index b2eba6d650e0..cd4ec7a4e4b2 100644
--- a/tests/python/contrib/test_ethosn/test_concatenate.py
+++ b/tests/python/contrib/test_ethosn/test_concatenate.py
@@ -57,6 +57,8 @@ def _get_model(shapes, dtype, axis):
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_concatenate(dtype):
+    """Compare Concatenate output with TVM."""
+
     trials = [
         ([(1, 4), (1, 6)], 1),
         ([(1, 16, 4), (1, 16, 4)], 1),
@@ -78,19 +80,23 @@ def test_concatenate(dtype):
 
 @requires_ethosn
 def test_concatenate_failure():
+    """Check Concatenate error messages."""
+
     trials = [
         ([(1, 4, 4, 4, 4), (1, 4, 4, 4, 4)], "uint8", 1, "dimensions=5, dimensions must be <= 4;"),
         (
             [(1, 4, 4, 4), (1, 4, 4, 4)],
             "uint8",
             3,
-            "Concatenation along the channels dimension (axis 3) requires input tensors with a multiple of 16 channels;",
+            "Concatenation along the channels dimension (axis 3) "
+            "requires input tensors with a multiple of 16 channels;",
         ),
         (
             [(1, 4, 4, 4), (1, 4, 4, 4)],
             "int16",
             2,
-            "dtype='int16', dtype must be either uint8, int8 or int32; dtype='int16', dtype must be either uint8, int8 or int32;",
+            "dtype='int16', dtype must be either uint8, int8 or int32; dtype='int16', "
+            "dtype must be either uint8, int8 or int32;",
         ),
         (
             [(2, 4, 4, 4), (2, 4, 4, 4)],
diff --git a/tests/python/contrib/test_ethosn/test_constant_duplication.py b/tests/python/contrib/test_ethosn/test_constant_duplication.py
index 84956840ecbb..b3cd0046f508 100644
--- a/tests/python/contrib/test_ethosn/test_constant_duplication.py
+++ b/tests/python/contrib/test_ethosn/test_constant_duplication.py
@@ -36,8 +36,10 @@ def _get_model():
     add_const = relay.const(add_const_value, "uint8")
     a = relay.add(a, add_const)
     weight_shape = (kernel_h, kernel_w, shape[3], out_channels)
-    w = tvm.nd.array(np.random.randint(low=0, high=255, size=weight_shape, dtype="uint8"))
-    weights = relay.const(w, "uint8")
+    weights_array = tvm.nd.array(
+        np.random.randint(low=0, high=255, size=weight_shape, dtype="uint8")
+    )
+    weights = relay.const(weights_array, "uint8")
     conv = relay.qnn.op.conv2d(
         a,
         weights,
@@ -66,12 +68,14 @@ def _get_model():
         relay.const(0, "int32"),  # output zero point
         out_dtype="uint8",
     )
-    params = {"w": w, "b": b}
+    params = {"w": weights_array, "b": b}
     return req, params
 
 
 @requires_ethosn
 def test_constant_duplication():
+    """Test that constants are not duplicated."""
+
     np.random.seed(0)
     model, params = _get_model()
     mod = tei.make_module(model, params)
diff --git a/tests/python/contrib/test_ethosn/test_conv2d.py b/tests/python/contrib/test_ethosn/test_conv2d.py
index a411701ea0bc..ffe66f0d2be2 100644
--- a/tests/python/contrib/test_ethosn/test_conv2d.py
+++ b/tests/python/contrib/test_ethosn/test_conv2d.py
@@ -17,9 +17,9 @@
 
 """Arm(R) Ethos(TM)-N integration conv2d tests"""
 
+import math
 import numpy as np
 import pytest
-import math
 import tvm
 from tvm import relay
 from tvm.testing import requires_ethosn
@@ -61,7 +61,7 @@ def _get_model(
 ):
     """Return a model and any parameters it may have"""
     a = relay.var("a", shape=shape, dtype=dtype)
-    if pad == "op" or pad == "both":
+    if pad in ("op", "both"):
         p = _get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
         a = relay.nn.pad(
             a,
@@ -76,12 +76,12 @@ def _get_model(
         weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels)
     else:
         weight_shape = (kernel_h, kernel_w, out_channels, 1)
-    w = tvm.nd.array(
+    weights_array = tvm.nd.array(
         np.random.randint(
             np.iinfo(dtype).min, high=np.iinfo(dtype).max + 1, size=weight_shape, dtype=dtype
         )
     )
-    weights = relay.const(w, dtype)
+    weights = relay.const(weights_array, dtype)
     conv = relay.qnn.op.conv2d(
         a,
         weights,
@@ -96,7 +96,7 @@ def _get_model(
         strides=strides,
         groups=groups,
         channels=out_channels,
-        padding=p if pad == "attr" or pad == "both" else (0, 0, 0, 0),
+        padding=p if pad in ("attr", "both") else (0, 0, 0, 0),
         out_dtype="int32",
     )
     b = tvm.nd.array(
@@ -118,7 +118,7 @@ def _get_model(
         relay.const(output_zp, "int32"),  # output zero point
         out_dtype=dtype,
     )
-    params = {"w": w, "b": b}
+    params = {"w": weights_array, "b": b}
     return req, params
 
 
@@ -126,6 +126,8 @@ def _get_model(
 @pytest.mark.parametrize("depthwise", [False, True])
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_conv2d(dtype, depthwise):
+    """Compare Conv2D output with TVM."""
+
     trials = [
         [(1, 17, 20, 26), 4, 3, 1, "attr", (2, 2), (1, 1), False],
         [(1, 30, 27, 30), 5, 5, 3, "none", (1, 1), (1, 1), False],
@@ -208,6 +210,8 @@ def test_conv2d(dtype, depthwise):
 
 @requires_ethosn
 def test_conv2d_failure():
+    """Check Conv2D error messages."""
+
     trials = [
         (
             (1, 4, 4, 4),
@@ -326,7 +330,7 @@ def test_conv2d_failure():
         weight_format,
         err_msg,
     ) in trials:
-        model, params = _get_model(
+        model, _ = _get_model(
             shape,
             kernel_h,
             kernel_w,
diff --git a/tests/python/contrib/test_ethosn/test_depth_to_space.py b/tests/python/contrib/test_ethosn/test_depth_to_space.py
index 1675b82eeace..c071fe00f212 100644
--- a/tests/python/contrib/test_ethosn/test_depth_to_space.py
+++ b/tests/python/contrib/test_ethosn/test_depth_to_space.py
@@ -34,6 +34,8 @@ def _get_model(shape, block, dtype, layout):
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_depth_to_space(dtype):
+    """Compare Depth To Space output with TVM."""
+
     trials = [
         (1, 16, 16, 16),
         (1, 64, 32, 16),
@@ -59,6 +61,8 @@ def test_depth_to_space(dtype):
 
 @requires_ethosn
 def test_depth_to_space_failure():
+    """Check Depth To Space error messages."""
+
     trials = [
         ((2, 16, 16, 16), 2, "uint8", "NHWC", "batch size=2, batch size must = 1"),
         (
diff --git a/tests/python/contrib/test_ethosn/test_fullyconnected.py b/tests/python/contrib/test_ethosn/test_fullyconnected.py
index 2caca9e890a2..d5510bb79d2c 100644
--- a/tests/python/contrib/test_ethosn/test_fullyconnected.py
+++ b/tests/python/contrib/test_ethosn/test_fullyconnected.py
@@ -30,9 +30,9 @@ def _get_model(
 ):
     """Return a model an any parameters it may have"""
     a = relay.var("a", shape=shape, dtype=dtype)
-    w = tvm.nd.array(np.ones(weight_shape, dtype))
-    weights = relay.const(w, dtype)
-    fc = relay.qnn.op.dense(
+    weights_array = tvm.nd.array(np.ones(weight_shape, dtype))
+    weights = relay.const(weights_array, dtype)
+    dense = relay.qnn.op.dense(
         a,
         weights,
         input_zero_point=relay.const(input_zp, "int32"),
@@ -44,7 +44,7 @@ def _get_model(
     )
     b = tvm.nd.array(np.random.randint(0, high=255, size=(weight_shape[0],), dtype="int32"))
     biasc = relay.const(b, "int32")
-    bias = relay.nn.bias_add(fc, biasc)
+    bias = relay.nn.bias_add(dense, biasc)
     req = relay.qnn.op.requantize(
         bias,
         relay.const(input_sc * kernel_sc, "float32"),  # input zero scale
@@ -53,7 +53,7 @@ def _get_model(
         relay.const(output_zp, "int32"),  # output zero point
         out_dtype=dtype,
     )
-    params = {"w": w, "b": b}
+    params = {"w": weights_array, "b": b}
     return req, params
 
 
@@ -76,9 +76,8 @@ def _get_model(
     ],
 )
 def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_zp, kernel_sc):
-    """
-    Test fully connected offloading.
-    """
+    """Compare Fully Connected output with TVM."""
+
     np.random.seed(0)
     inputs = {
         "a": tvm.nd.array(
@@ -116,6 +115,8 @@ def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_z
 
 @requires_ethosn
 def test_fullyconnected_failure():
+    """Check Fully Connected error messages."""
+
     trials = [
         (
             (1, 64),
@@ -139,7 +140,8 @@ def test_fullyconnected_failure():
             0,
             1,
             "uint8",
-            "Weights tensor must have I dimension equal to the number of channels of the input tensor.;",
+            "Weights tensor must have I dimension equal to the number"
+            " of channels of the input tensor.;",
         ),
         ((1024, 64), (1, 64), 0, 1, 0, 1, 0, 1, "uint8", "batch size=1024, batch size must = 1;"),
     ]
@@ -157,10 +159,7 @@ def test_fullyconnected_failure():
         dtype,
         err_msg,
     ) in trials:
-        inputs = {
-            "a": tvm.nd.array(np.random.randint(0, high=255, size=shape, dtype=dtype)),
-        }
-        model, params = _get_model(
+        model, _ = _get_model(
             shape,
             weight_shape,
             input_zp,
diff --git a/tests/python/contrib/test_ethosn/test_leaky_relu.py b/tests/python/contrib/test_ethosn/test_leaky_relu.py
index cdd06f5e73e4..3c3bbc709679 100644
--- a/tests/python/contrib/test_ethosn/test_leaky_relu.py
+++ b/tests/python/contrib/test_ethosn/test_leaky_relu.py
@@ -49,6 +49,7 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype, alpha):
 @pytest.mark.parametrize("alpha", [0.001, 0.5678])
 def test_leaky_relu(dtype, shape, alpha):
     """Compare Leaky ReLU output with TVM."""
+
     np.random.seed(0)
 
     iinfo = np.iinfo(dtype)
@@ -75,6 +76,7 @@ def test_leaky_relu(dtype, shape, alpha):
 @pytest.mark.parametrize("alpha", [-1.34, 2.32, 1, 0])
 def test_leaky_relu_unsupported_alpha(dtype, shape, alpha):
     """Test unsupported values of alpha (<= 0, >= 1) in Leaky ReLU."""
+
     iinfo = np.iinfo(dtype)
     zp_min = iinfo.min
 
diff --git a/tests/python/contrib/test_ethosn/test_mean.py b/tests/python/contrib/test_ethosn/test_mean.py
index 548743fe9548..0ad7e17faed8 100644
--- a/tests/python/contrib/test_ethosn/test_mean.py
+++ b/tests/python/contrib/test_ethosn/test_mean.py
@@ -45,6 +45,7 @@ def _get_model(shape, axis, keepdims, input_zp, input_sc, output_zp, output_sc,
 @pytest.mark.parametrize("shape", [(1, 7, 7, 2048), (1, 8, 8)])
 def test_mean(dtype, shape):
     """Compare Mean output with TVM."""
+
     np.random.seed(0)
 
     zp_min = np.iinfo(dtype).min
@@ -68,6 +69,7 @@ def test_mean(dtype, shape):
 @pytest.mark.parametrize("dtype", ["int8", "uint8"])
 def test_mean_non_equal_quantization(dtype):
     """Test mean is not offloaded when quantization is not equal."""
+
     np.random.seed(0)
 
     shape = (1, 7, 7, 2048)
diff --git a/tests/python/contrib/test_ethosn/test_multiply.py b/tests/python/contrib/test_ethosn/test_multiply.py
index 38d8516b6721..cb95a97db529 100644
--- a/tests/python/contrib/test_ethosn/test_multiply.py
+++ b/tests/python/contrib/test_ethosn/test_multiply.py
@@ -69,6 +69,7 @@ def _get_model(
 @pytest.mark.parametrize("reverse_inputs", [False, True])
 def test_multiply(dtype, shape, constant_shape, reverse_inputs):
     """Compare Multiply output with TVM."""
+
     np.random.seed(0)
 
     iinfo = np.iinfo(dtype)
@@ -106,6 +107,7 @@ def test_multiply(dtype, shape, constant_shape, reverse_inputs):
 @requires_ethosn
 def test_multiply_multiple_inputs_unsupported():
     """Check multiply operator with two inputs is not offloaded."""
+
     np.random.seed(0)
 
     shape = (1, 4, 5, 6)
@@ -151,6 +153,7 @@ def test_multiply_multiple_inputs_unsupported():
 @requires_ethosn
 def test_multiply_unsupported_datatype():
     """Check multiply operator with unsupported datatype is not offloaded."""
+
     np.random.seed(0)
 
     shape = (1, 4, 5, 6)
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index 11745409d4ea..db1b41244846 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
+# pylint: disable=wrong-import-position
 """Arm(R) Ethos(TM)-N integration end-to-end network tests"""
 
 import pytest
@@ -22,12 +22,11 @@
 pytest.importorskip("tflite")
 pytest.importorskip("tensorflow")
 
+import tflite.Model
 from tvm import relay
 from tvm.testing import requires_ethosn
 from tvm.contrib import download
-
 import tvm.relay.testing.tf as tf_testing
-import tflite.Model
 from . import infrastructure as tei
 
 
@@ -41,10 +40,10 @@ def _get_tflite_model(tflite_model_path, inputs_dict, dtype):
         tflite_model = tflite.Model.GetRootAsModel(tflite_model_buffer, 0)
     shape_dict = {}
     dtype_dict = {}
-    for input in inputs_dict:
-        input_shape = inputs_dict[input]
-        shape_dict[input] = input_shape
-        dtype_dict[input] = dtype
+    for value in inputs_dict:
+        input_shape = inputs_dict[value]
+        shape_dict[value] = input_shape
+        dtype_dict[value] = dtype
 
     return relay.frontend.from_tflite(
         tflite_model,
diff --git a/tests/python/contrib/test_ethosn/test_partition_params.py b/tests/python/contrib/test_ethosn/test_partition_params.py
index 34e22e6aaba8..e8ac687c04b0 100644
--- a/tests/python/contrib/test_ethosn/test_partition_params.py
+++ b/tests/python/contrib/test_ethosn/test_partition_params.py
@@ -18,19 +18,23 @@
 """Arm(R) Ethos(TM)-N partition parameter tests"""
 
 import pytest
-import tvm
-from tvm import relay
 import numpy as np
 
+import tvm
+from tvm import relay
 from tvm.relay.op.contrib.ethosn import partition_for_ethosn
 from tvm.testing import requires_ethosn
 
 
 @requires_ethosn
 def test_ethosn78_partition_no_error():
+    """Test Arm(R) Ethos(TM)-N78 partition"""
+
     a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8")
-    w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
-    res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8")
+    weights = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
+    res = relay.nn.conv2d(
+        a, weights, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8"
+    )
     b = relay.var("b", shape=[8], dtype="uint8")
     res = relay.nn.bias_add(res, b, axis=1)
 
@@ -41,13 +45,15 @@ def test_ethosn78_partition_no_error():
 
 @requires_ethosn
 def test_ethosn78_partition_undefined_variant():
+    """Test Arm(R) Ethos(TM)-N78 partition with undefined variant"""
+
     with pytest.raises(
         ValueError, match=r".*Please specify a variant in the target string, e.g. -variant=n78.*"
     ):
         a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8")
-        w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
+        weights = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
         res = relay.nn.conv2d(
-            a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8"
+            a, weights, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8"
         )
         b = relay.var("b", shape=[8], dtype="uint8")
         res = relay.nn.bias_add(res, b, axis=1)
@@ -58,13 +64,15 @@ def test_ethosn78_partition_undefined_variant():
 
 @requires_ethosn
 def test_ethosn78_partition_invalid_variant():
+    """Test Arm(R) Ethos(TM)-N78 partition with invalid variant"""
+
     with pytest.raises(
         ValueError, match=r".*When targeting Ethos\(TM\)-N78, -variant=n78 should be set.*"
     ):
         a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8")
-        w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
+        wwights = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
         res = relay.nn.conv2d(
-            a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8"
+            a, wwights, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8"
         )
         b = relay.var("b", shape=[8], dtype="uint8")
         res = relay.nn.bias_add(res, b, axis=1)
diff --git a/tests/python/contrib/test_ethosn/test_pooling.py b/tests/python/contrib/test_ethosn/test_pooling.py
index 3defaa55e853..e1c7358f71a1 100644
--- a/tests/python/contrib/test_ethosn/test_pooling.py
+++ b/tests/python/contrib/test_ethosn/test_pooling.py
@@ -28,10 +28,10 @@
 def _get_model(shape, typef, sizes, strides, pads, layout, dtype):
     """Return a model and any parameters it may have"""
     req = relay.var("a", shape=shape, dtype=dtype)
-    if typef == relay.nn.avg_pool2d:
+    if typef is relay.nn.avg_pool2d:
         req = relay.cast(req, "int32")
     req = typef(req, pool_size=sizes, strides=strides, padding=pads, ceil_mode=True, layout=layout)
-    if typef == relay.nn.avg_pool2d:
+    if typef is relay.nn.avg_pool2d:
         req = relay.cast(req, dtype)
     return req
 
@@ -39,6 +39,8 @@ def _get_model(shape, typef, sizes, strides, pads, layout, dtype):
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_pooling(dtype):
+    """Compare Pooling output with TVM."""
+
     trials = [
         ((1, 8, 8, 8), relay.nn.max_pool2d, (2, 2), (2, 2), (0, 0, 0, 0), "NHWC"),
         ((1, 9, 9, 9), relay.nn.max_pool2d, (3, 3), (2, 2), (0, 0, 0, 0), "NHWC"),
@@ -65,6 +67,8 @@ def test_pooling(dtype):
 
 @requires_ethosn
 def test_pooling_failure():
+    """Check Pooling error messages."""
+
     trials = [
         (
             (2, 8, 8, 8),
diff --git a/tests/python/contrib/test_ethosn/test_relu.py b/tests/python/contrib/test_ethosn/test_relu.py
index 5d3e8f1e9921..f56a1cd7ad3c 100644
--- a/tests/python/contrib/test_ethosn/test_relu.py
+++ b/tests/python/contrib/test_ethosn/test_relu.py
@@ -35,6 +35,8 @@ def _get_model(shape, dtype, a_min, a_max):
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_relu(dtype):
+    """Compare Relu output with TVM."""
+
     trials = [
         ((1, 4, 4, 4), 65, 178, "uint8"),
         ((1, 8, 4, 2), 1, 254, "uint8"),
@@ -68,6 +70,8 @@ def test_relu(dtype):
 
 @requires_ethosn
 def test_relu_failure():
+    """Check Relu error messages."""
+
     trials = [
         ((1, 4, 4, 4, 4), "uint8", 65, 78, "dimensions=5, dimensions must be <= 4"),
         ((1, 8, 4, 2), "int16", 1, 254, "dtype='int16', dtype must be either uint8, int8 or int32"),
diff --git a/tests/python/contrib/test_ethosn/test_requantize.py b/tests/python/contrib/test_ethosn/test_requantize.py
index e20c3beeabfa..3187c22f3391 100644
--- a/tests/python/contrib/test_ethosn/test_requantize.py
+++ b/tests/python/contrib/test_ethosn/test_requantize.py
@@ -43,6 +43,8 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, in_dtype, out_dt
 @pytest.mark.parametrize("out_dtype", ["int8", "uint8"])
 @pytest.mark.parametrize("shape", [(1, 52, 52, 3)])
 def test_requantize(in_dtype, out_dtype, shape):
+    """Compare Requantize output with TVM."""
+
     np.random.seed(0)
     low = 0 if in_dtype == "uint8" else -5
     high = low + 10
@@ -74,6 +76,7 @@ def test_requantize_mixed_precision_with_following_op():
     Checks a requantize operation that changes precision from uint8 to int8 with a
     following add op.
     """
+
     np.random.seed(0)
     shape = (1, 4, 6, 8)
     in_sc = 0.012566
@@ -133,6 +136,8 @@ def get_model():
 
 @requires_ethosn
 def test_requantize_failure():
+    """Check Requantize error messages."""
+
     input_sc = 0.8
     output_sc = (input_sc / 128) - 0.0001
     model = _get_model(
diff --git a/tests/python/contrib/test_ethosn/test_resize.py b/tests/python/contrib/test_ethosn/test_resize.py
index b9d807d21926..2cc641e63b5c 100644
--- a/tests/python/contrib/test_ethosn/test_resize.py
+++ b/tests/python/contrib/test_ethosn/test_resize.py
@@ -68,6 +68,8 @@ def _get_model(
     ],
 )
 def test_resize(dtype, shape, size, coordinate_transformation_mode, rounding_method):
+    """Compare Resize output with TVM."""
+
     np.random.seed(0)
     zp_min = np.iinfo(dtype).min
     zp_max = np.iinfo(dtype).max
@@ -96,6 +98,8 @@ def test_resize(dtype, shape, size, coordinate_transformation_mode, rounding_met
 
 @requires_ethosn
 def test_resize_failure():
+    """Check Resize error messages."""
+
     trials = [
         (
             (30, 20),
diff --git a/tests/python/contrib/test_ethosn/test_sigmoid.py b/tests/python/contrib/test_ethosn/test_sigmoid.py
index 9947bee3b86b..ae8c301ff01a 100644
--- a/tests/python/contrib/test_ethosn/test_sigmoid.py
+++ b/tests/python/contrib/test_ethosn/test_sigmoid.py
@@ -45,6 +45,8 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype):
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_sigmoid(dtype):
+    """Compare Sigmoid output with TVM."""
+
     trials = [
         (1, 16, 16, 16),
         (1, 8, 8),
@@ -61,7 +63,7 @@ def test_sigmoid(dtype):
         }
         outputs = []
         for npu in [False, True]:
-            for d in range(1, 2):
+            for _ in range(1, 2):
                 if dtype == "uint8":
                     input_zp = 0
                     output_zp = 0
@@ -78,21 +80,22 @@ def test_sigmoid(dtype):
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_sigmoid_failure(dtype):
+    """Check Sigmoid error messages."""
+
     test_zp = 0 if dtype == "uint8" else -128
     trials = [
-        ((2, 4, 4, 4), 64, 0.2, test_zp, 1 / 256, dtype, "batch size=2, batch size must = 1"),
+        ((2, 4, 4, 4), 64, 0.2, test_zp, 1 / 256, "batch size=2, batch size must = 1"),
         (
             (1, 4, 4, 4),
             64,
             0.2,
             3,
             1,
-            dtype,
             f"output quantization params=(3, 1), must = ({test_zp}, 1/256)",
         ),
     ]
 
-    for shape, input_zp, input_sc, output_zp, output_sc, dtype, err_msg in trials:
+    for shape, input_zp, input_sc, output_zp, output_sc, err_msg in trials:
         model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype)
         model = tei.make_ethosn_composite(model, "ethos-n.qnn_sigmoid")
         mod = tei.make_ethosn_partition(model)
diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py
index 4d1743d07a32..7f8787afe947 100644
--- a/tests/python/contrib/test_ethosn/test_split.py
+++ b/tests/python/contrib/test_ethosn/test_split.py
@@ -37,6 +37,8 @@ def _get_model(shape, dtype, splits, axis):
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_split(dtype):
+    """Compare Split output with TVM."""
+
     trials = [
         ((1, 16, 16, 32), (2, 7, 10), 2),
         ((1, 12, 8, 16), 3, 1),
@@ -55,7 +57,7 @@ def test_split(dtype):
         for npu in [False, True]:
             model = _get_model(shape, dtype, splits, axis)
             mod = tei.make_module(model, {})
-            output_count = splits if type(splits) == int else len(splits) + 1
+            output_count = splits if isinstance(splits, int) else len(splits) + 1
             outputs.append(tei.build_and_run(mod, inputs, output_count, {}, npu=npu))
 
         tei.verify(outputs, dtype, 0)
@@ -64,6 +66,8 @@ def test_split(dtype):
 @pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.")
 @requires_ethosn
 def test_split_failure():
+    """Check Split error messages."""
+
     trials = [
         ((1, 4, 4, 4, 4), "uint8", 4, 2, "dimensions=5, dimensions must be <= 4;"),
         ((1, 4, 4, 4), "int16", 4, 2, "dtype='int16', dtype must be either uint8, int8 or int32;"),
@@ -74,7 +78,8 @@ def test_split_failure():
             "uint8",
             4,
             3,
-            "Split along the channels dimension (axis 3) requires all output sizes (specified in splitInfo.m_Sizes) to be multiples of 16;",
+            "Split along the channels dimension (axis 3) requires all output sizes "
+            "(specified in splitInfo.m_Sizes) to be multiples of 16;",
         ),
     ]
 
diff --git a/tests/python/contrib/test_ethosn/test_tanh.py b/tests/python/contrib/test_ethosn/test_tanh.py
index 8f44936fdc4f..68170601c5f8 100644
--- a/tests/python/contrib/test_ethosn/test_tanh.py
+++ b/tests/python/contrib/test_ethosn/test_tanh.py
@@ -46,6 +46,8 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype):
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 @pytest.mark.parametrize("shape", [(1, 52, 52, 3)])
 def test_tanh(dtype, shape):
+    """Compare Tanh output with TVM."""
+
     zp_min = np.iinfo(dtype).min
     zp_max = np.iinfo(dtype).max
 
@@ -78,6 +80,8 @@ def test_tanh(dtype, shape):
     ],
 )
 def test_tanh_failure(shape, input_zp, input_sc, output_zp, output_sc, err_msg, dtype):
+    """Check Tanh error messages."""
+
     test_zp = 0 if dtype == "int8" else 128
     model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype)
     model = tei.make_ethosn_composite(model, "ethos-n.qnn_tanh")
diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py
index 970f7dce5cbd..19d7accadb6d 100644
--- a/tests/python/contrib/test_ethosn/test_topologies.py
+++ b/tests/python/contrib/test_ethosn/test_topologies.py
@@ -31,6 +31,8 @@
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_split_add_concat(dtype):
+    """Test a model with split, add and contatenate."""
+
     def get_model(input_shape, dtype, var_names):
         """Return a model"""
 
@@ -148,23 +150,25 @@ def get_model(dtype):
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_output_order(dtype):
+    """Test the output order."""
+
     def get_model(input_shape, dtype, var_names):
         """Return a model"""
 
-        min = np.iinfo(dtype).min
-        max = np.iinfo(dtype).max
+        min_value = np.iinfo(dtype).min
+        max_value = np.iinfo(dtype).max
         a = relay.var(next(var_names), shape=input_shape, dtype=dtype)
 
-        z = relay.op.clip(a, min, max)
-        b = relay.op.clip(z, min, min + 15)
-        c = relay.op.clip(z, min + 16, min + 31)
-        d = relay.op.clip(z, min + 32, min + 47)
-        e = relay.op.clip(z, min + 48, min + 63)
-        f = relay.op.clip(z, min + 64, min + 79)
-        g = relay.op.clip(z, min + 80, min + 95)
-        h = relay.op.clip(z, min + 96, min + 111)
-        i = relay.op.clip(z, min + 112, max)
-        return relay.Tuple((d, c, e, f, i, b, h, g))
+        op_z = relay.op.clip(a, min_value, max_value)
+        op_b = relay.op.clip(op_z, min_value, min_value + 15)
+        op_c = relay.op.clip(op_z, min_value + 16, min_value + 31)
+        op_d = relay.op.clip(op_z, min_value + 32, min_value + 47)
+        op_e = relay.op.clip(op_z, min_value + 48, min_value + 63)
+        op_f = relay.op.clip(op_z, min_value + 64, min_value + 79)
+        op_g = relay.op.clip(op_z, min_value + 80, min_value + 95)
+        op_h = relay.op.clip(op_z, min_value + 96, min_value + 111)
+        op_i = relay.op.clip(op_z, min_value + 112, max_value)
+        return relay.Tuple((op_d, op_c, op_e, op_f, op_i, op_b, op_h, op_g))
 
     np.random.seed(0)
     inputs = {
@@ -190,6 +194,7 @@ def test_output_order_different_sizes(dtype):
     """
     Test the output order when there are multiple outputs of different sizes.
     """
+
     np.random.seed(0)
     input_name = "a"
     input_shape = (1, 8, 8, 4)
@@ -233,6 +238,8 @@ def get_model():
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_split_with_asym_concats(dtype):
+    """Test a model with split and contatenates."""
+
     def get_model(shape, dtype, splits, axis):
         a = relay.var("a", shape=shape, dtype=dtype)
         split = relay.op.split(a, indices_or_sections=splits, axis=axis)
@@ -335,6 +342,8 @@ def get_model(dtype):
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_input_tuples(dtype):
+    """Test a model with a tuple as input."""
+
     def get_model(shapes, dtype, axis):
         tup = []
         for i, shape in enumerate(shapes):

From 38ba8c0bb69dd76203a96ba6b2a5c067fe0b2ba0 Mon Sep 17 00:00:00 2001
From: sisleyli <43139237+sisleyli@users.noreply.github.com>
Date: Thu, 1 Sep 2022 18:32:42 +0800
Subject: [PATCH 086/704] [Relay] Extract intermediate node by its expression
 ID (#12646)

[Relay] Extract Intermediate Expr by relay expr ID for analysis

modify doc comments

Co-authored-by: Bin Li <binli1@amd.com>
---
 python/tvm/relay/analysis/analysis.py         |  38 +++++
 .../analysis/extract_intermediate_expr.cc     |  88 ++++++++++++
 ...test_analysis_extract_intermediate_expr.py | 130 ++++++++++++++++++
 3 files changed, 256 insertions(+)
 create mode 100644 src/relay/analysis/extract_intermediate_expr.cc
 create mode 100644 tests/python/relay/test_analysis_extract_intermediate_expr.py

diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py
index 3b38c07a0a8a..12f659f0037c 100644
--- a/python/tvm/relay/analysis/analysis.py
+++ b/python/tvm/relay/analysis/analysis.py
@@ -431,3 +431,41 @@ def get_calibration_data(mod, data):
         calib_data[gvar] = value
 
     return calib_data
+
+
+def extract_intermdeiate_expr(mod, expr_id):
+    """Extract Relay Expr by its expression ID
+
+    This function is used for extracting Relay Expr
+    by its expression ID of the main function
+    that we can see in `print(mod["main"])`.
+
+    Parameters
+    ----------
+    mod : tvm.IRModule
+
+    expr_id : the Expr ID that we want to extract
+
+    Returns
+    -------
+    ret : Extracted IRModule
+
+    Examples
+    --------
+    .. code-block:: python
+
+        # Suppose our module is printed like this:
+        # def @main(%x: Tensor[(1, 1, 5, 1), float32], %w1, %w2) {
+        #   %0 = nn.conv2d(%x, %w1, padding=[1, 1, 1, 1], channels=1, kernel_size=[3, 3]);
+        #   %1 = nn.conv2d(%0, %w2, padding=[1, 1, 1, 1], channels=1, kernel_size=[3, 3]);
+        #   %2 = add(%0, %1);
+        #   %3 = split(%2, indices_or_sections=1);
+        #   %4 = %3.0;
+        #   add(%4, 1f)
+        # }
+        # if we want to extract `%1 = nn.conv2d`
+        from tvm import relay
+
+        relay.analysis.extract_intermdeiate_expr(mod, 1)
+    """
+    return _ffi_api.ExtractIntermediateExpr(mod, expr_id)
diff --git a/src/relay/analysis/extract_intermediate_expr.cc b/src/relay/analysis/extract_intermediate_expr.cc
new file mode 100644
index 000000000000..d7466e2729db
--- /dev/null
+++ b/src/relay/analysis/extract_intermediate_expr.cc
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file extract_intermediate_expr.cc
+ * \brief Used for extracting Relay Expr
+    by the expression ID of the main function
+    that we can see in `print(mod["main"])`.
+ */
+#include <tvm/node/structural_hash.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+
+namespace tvm {
+namespace relay {
+
+class ExtractIntermediateExprWrapper : private MixedModeVisitor {
+ public:
+  explicit ExtractIntermediateExprWrapper(const IRModule& mod, const int expr_id)
+      : mod_(mod), target_expr_id_(expr_id), counter_(0) {}
+
+  IRModule Extract() {
+    VisitExpr(this->mod_->Lookup("main"));
+
+    // ensure the target expr_id we want to extract is valid.
+    ICHECK(target_expr_id_ >= 0 && target_expr_id_ < counter_);
+
+    return IRModule::FromExpr(target_op_, {});
+  }
+
+ private:
+  using MixedModeVisitor::VisitExpr_;
+
+  const IRModule mod_;
+  /*! \brief the expr id that we want to extract. */
+  const int target_expr_id_;
+  int counter_;
+  Expr target_op_;
+
+  void VisitExpr_(const CallNode* n) final {
+    CheckCounterAndIncrease(GetRef<Expr>(n));
+    MixedModeVisitor::VisitExpr_(n);
+  }
+
+  void VisitExpr_(const TupleNode* n) final {
+    CheckCounterAndIncrease(GetRef<Expr>(n));
+    MixedModeVisitor::VisitExpr_(n);
+  }
+
+  void VisitExpr_(const TupleGetItemNode* n) final {
+    CheckCounterAndIncrease(GetRef<Expr>(n));
+    MixedModeVisitor::VisitExpr_(n);
+  }
+
+  void CheckCounterAndIncrease(const Expr& expr) {
+    if (target_expr_id_ == counter_) {
+      target_op_ = expr;
+    }
+    ++counter_;
+  }
+};
+
+IRModule ExtractIntermediateExprPacked(const IRModule& mod, const int expr_id) {
+  return ExtractIntermediateExprWrapper(mod, expr_id).Extract();
+}
+
+TVM_REGISTER_GLOBAL("relay.analysis.ExtractIntermediateExpr")
+    .set_body_typed(ExtractIntermediateExprPacked);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_analysis_extract_intermediate_expr.py b/tests/python/relay/test_analysis_extract_intermediate_expr.py
new file mode 100644
index 000000000000..abcaf880b4aa
--- /dev/null
+++ b/tests/python/relay/test_analysis_extract_intermediate_expr.py
@@ -0,0 +1,130 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test function extraction"""
+import pytest
+import tvm
+from tvm import relay
+
+
+def get_conv_net():
+    """This gets the net for:
+          conv2d
+          /  |
+         /   |
+    conv2d   |
+        \    |
+         \   |
+        elemwise add
+             |
+             |
+             |
+           split
+             |
+             |
+             |
+        elemwise add
+    """
+    dshape = (1, 1, 5, 1)
+    x = relay.var("x", shape=dshape)
+    y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+    x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+
+    z = relay.add(y, x1)
+
+    tuple_out = relay.op.split(z, indices_or_sections=1, axis=0)
+
+    tuple_0_add = relay.add(tuple_out[0], relay.const(1, dtype="float32"))
+
+    return tvm.IRModule.from_expr(tuple_0_add)
+
+
+def get_conv2d():
+    x = relay.var("x", shape=(1, 56, 56, 64))
+    weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
+    y = relay.nn.conv2d(
+        x,
+        weight1,
+        channels=32,
+        kernel_size=(3, 3),
+        padding=(1, 1),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+    )
+    return tvm.IRModule.from_expr(y)
+
+
+def test_extract():
+    dshape = (1, 1, 5, 1)
+
+    def before():
+        return get_conv_net()
+
+    def expected_0():
+        x = relay.var("x", shape=dshape)
+        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+        return tvm.IRModule.from_expr(y)
+
+    def expected_1():
+        x = relay.var("x", shape=dshape)
+        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+        x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+        return tvm.IRModule.from_expr(x1)
+
+    def expected_2():
+        x = relay.var("x", shape=dshape)
+        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+        x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+        z = relay.add(y, x1)
+        return tvm.IRModule.from_expr(z)
+
+    def expected_3():
+        x = relay.var("x", shape=dshape)
+        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+        x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+        z = relay.add(y, x1)
+        tuple_out = relay.op.split(z, indices_or_sections=1, axis=0)
+        return tvm.IRModule.from_expr(tuple_out.astuple())
+
+    def expected_4():
+        # check tuple node
+        x = relay.var("x", shape=dshape)
+        y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+        x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+        z = relay.add(y, x1)
+        tuple_out = relay.op.split(z, indices_or_sections=1, axis=0)
+        return tvm.IRModule.from_expr(tuple_out[0])
+
+    assert tvm.ir.structural_equal(
+        relay.analysis.extract_intermdeiate_expr(before(), 0), expected_0()
+    )
+    assert tvm.ir.structural_equal(
+        relay.analysis.extract_intermdeiate_expr(before(), 1), expected_1()
+    )
+    assert tvm.ir.structural_equal(
+        relay.analysis.extract_intermdeiate_expr(before(), 2), expected_2()
+    )
+    assert tvm.ir.structural_equal(
+        (relay.analysis.extract_intermdeiate_expr(before(), 3)), expected_3()
+    )
+    assert tvm.ir.structural_equal(
+        relay.analysis.extract_intermdeiate_expr(before(), 4), expected_4()
+    )
+    assert tvm.ir.structural_equal(relay.analysis.extract_intermdeiate_expr(before(), 5), before())
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])

From 038f15b5e204120709186a8791e5b49986060bb0 Mon Sep 17 00:00:00 2001
From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com>
Date: Thu, 1 Sep 2022 16:31:54 +0300
Subject: [PATCH 087/704] [Hexagon] Implement fixed_point_multiply op through
 intrinsics. (#12659)

This commit adds high-performance implementation of fixed_point_multiply
operation based on Hexagon intrinsics for vmpye/vmpyo instructions.

Benchmarking of 'fixed_point_multiply' op with (1,8,56,56,32) input
tensor on Qualcomm SM8350:
  * default implementation: 10.06 ms
  * optimized implementation: 1.42 ms
  * speedup: 7x times (!!!)

Please note that this is introducing a small round-up error for some
corner cases with negative shift argument (The same as for ARM CPU, see
PR#5980). This is because we are rounding twice instead than only once:
  * original q_multiply_shift: round(x*y*2^-s)
  * hexagon q_multiply_shift: round(round(x*y)*2^-s)
---
 python/tvm/topi/hexagon/__init__.py           |   1 +
 python/tvm/topi/hexagon/injective.py          |   7 +-
 python/tvm/topi/hexagon/tensor_intrin.py      |  71 +++++++++
 .../test_hexagon/test_fixed_point_multiply.py | 140 ++++++++++++++++++
 4 files changed, 216 insertions(+), 3 deletions(-)
 create mode 100644 python/tvm/topi/hexagon/tensor_intrin.py
 create mode 100644 tests/python/contrib/test_hexagon/test_fixed_point_multiply.py

diff --git a/python/tvm/topi/hexagon/__init__.py b/python/tvm/topi/hexagon/__init__.py
index dfe739288187..a3768a6e809e 100644
--- a/python/tvm/topi/hexagon/__init__.py
+++ b/python/tvm/topi/hexagon/__init__.py
@@ -26,4 +26,5 @@
 from .pooling import *
 from .reduce import *
 from .resize2d import *
+from .tensor_intrin import *
 from .qnn import *
diff --git a/python/tvm/topi/hexagon/injective.py b/python/tvm/topi/hexagon/injective.py
index 9ced0ac7d399..b1d1e1541961 100644
--- a/python/tvm/topi/hexagon/injective.py
+++ b/python/tvm/topi/hexagon/injective.py
@@ -19,6 +19,8 @@
 
 import tvm
 
+import numpy as np
+
 
 def schedule_injective(outs):
     """Schedule for injective op.
@@ -37,11 +39,10 @@ def schedule_injective(outs):
     outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
     s = tvm.te.create_schedule([x.op for x in outs])
     tvm.te.schedule.AutoInlineInjective(s)
-
-    # Fuse axes and vectorize inner 128 elements
+    # Fuse axes and vectorize inner elements
     for x in outs:
         fused = s[x].fuse(*x.op.axis)
-        _, inner = s[x].split(fused, factor=128)
+        _, inner = s[x].split(fused, factor=128 // np.dtype(x.dtype).itemsize)
         s[x].vectorize(inner)
     return s
 
diff --git a/python/tvm/topi/hexagon/tensor_intrin.py b/python/tvm/topi/hexagon/tensor_intrin.py
new file mode 100644
index 000000000000..bdc63854328b
--- /dev/null
+++ b/python/tvm/topi/hexagon/tensor_intrin.py
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Optimized implementation of q_multiply_shift based on LLVM intrinsics"""
+
+import tvm
+from tvm.ir import register_intrin_lowering
+
+
+def _q_multiply_shift_hexagon(op):
+    """
+    Implementation of q_multiply_shift through hexagon intrinsics vmpyewuh and vmpyowh when q == 31.
+
+    Please note that this is introducing a small round-up error for some corner cases with negative
+    shift argument. This is because we are rounding twice instead than only once. I.e.:
+
+        * original q_multiply_shift: round(x*y*2^-s)
+        * hexagon q_multiply_shift: round(round(x*y)*2^-s)
+    """
+    x = op.args[0]
+    y = op.args[1]
+    fractional_bits = op.args[2]
+    shift = op.args[3]
+
+    # Don't use this intrinsic if we don't have a int32x32 vector
+    # or if we are not multiplying q31 numbers
+    if x.dtype != "int32x32" or fractional_bits.value != 31:
+        return op
+
+    # Case 1, shift is negative
+    mul_e_1 = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y
+    )
+    mul_o_1 = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vmpyowh.rnd.sacc.128B", tvm.tir.const(3, "uint32"), mul_e_1, x, y
+    )
+    fixup = mul_o_1 & (-shift)
+    round_mul = mul_o_1 + fixup
+    out_negative_shift = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vaslwv.128B", tvm.tir.const(2, "uint32"), round_mul, shift
+    )
+
+    # Case 2, shift is positive
+    x = x * (1 << (shift))
+    mul_e_2 = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y
+    )
+    mul_o_2 = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vmpyowh.rnd.sacc.128B", tvm.tir.const(3, "uint32"), mul_e_2, x, y
+    )
+
+    # Select depending on the shift
+    return tvm.tir.Select(shift < 0, out_negative_shift, mul_o_2)
+
+
+register_intrin_lowering(
+    "tir.q_multiply_shift", target="hexagon", f=_q_multiply_shift_hexagon, level=99
+)
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
new file mode 100644
index 000000000000..8ee04a649990
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
@@ -0,0 +1,140 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm.testing
+from tvm import relay
+from tvm.relay.backend import Executor
+from tvm.contrib.hexagon.session import Session
+
+import re
+import numpy as np
+
+
+@tvm.testing.requires_hexagon
+def test_vmpy_intrinsic_presence():
+    """
+    check intrinsic lowering for fixed_point_multiply operation
+    """
+    ishape = (1, 128)
+    a = relay.var("a", relay.TensorType(ishape, "int32"))
+
+    y = relay.fixed_point_multiply(a, 1395864320, 1)  # 1.3
+
+    relay_mod = tvm.IRModule.from_expr(y)
+
+    params = {}
+    target_hexagon = tvm.target.hexagon("v68")
+    executor = Executor("graph", {"link-params": True})
+
+    with tvm.transform.PassContext(opt_level=3):
+        hexagon_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            executor=executor,
+            params=params,
+        )
+
+    asm = hexagon_lowered.lib.get_source("asm")
+
+    # Check that 'vmpye' instruction was generated in asm file.
+    vmpye_regex = re.compile(r"v\d{1,2}.w = vmpye\(v\d{1,2}.w,v\d{1,2}.uh\)")
+    assert vmpye_regex.search(asm) is not None
+
+    # Check that 'vmpyo' instruction was generated in asm file.
+    vmpyo_regex = re.compile(r"v\d{1,2}.w \+= vmpyo\(v\d{1,2}.w,v\d{1,2}.h\):<<1:rnd:sat:shift")
+    assert vmpyo_regex.search(asm) is not None
+
+
+def build_module(relay_mod, target):
+    params = {}
+    executor = Executor("graph", {"link-params": True})
+    lowered = tvm.relay.build(
+        relay_mod,
+        tvm.target.Target(target, host=target),
+        executor=executor,
+        params=params,
+    )
+    return lowered
+
+
+def run_module(graph_mod, inputs):
+    graph_mod.set_input(**inputs)
+    graph_mod.run()
+    output = graph_mod.get_output(0).numpy()
+    return output
+
+
+@tvm.testing.requires_hexagon
+def test_fixed_point_multiply_positive_shift(hexagon_session: Session):
+    ishape = (6, 32)
+    a = relay.var("a", relay.TensorType(ishape, "int32"))
+    multiplier, shift = (1395864320, 1)  # 1.3
+    fpm = relay.fixed_point_multiply(a, multiplier, shift)
+    relay_mod = tvm.IRModule.from_expr(fpm)
+
+    with tvm.transform.PassContext(opt_level=3):
+        # Compile for Hexagon...
+        hexagon_lowered = build_module(relay_mod, tvm.target.hexagon("v68"))
+
+        # Compile for LLVM...
+        llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm"))
+
+    data_in = np.arange(-96, 96).reshape(ishape)
+    inputs = {"a": data_in}
+
+    # Run hexagon...
+    graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
+    hexagon_output = run_module(graph_mod, inputs)
+
+    # Run llvm...
+    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+    expected_output = run_module(llvm_graph_mod, inputs)
+
+    tvm.testing.assert_allclose(hexagon_output, expected_output)
+
+
+@tvm.testing.requires_hexagon
+def test_fixed_point_multiply_negative_shift(hexagon_session: Session):
+    ishape = (6, 32)
+    a = relay.var("a", relay.TensorType(ishape, "int32"))
+    multiplier, shift = (1288490240, -2)  # 0.15
+    fpm = relay.fixed_point_multiply(a, multiplier, shift)
+    relay_mod = tvm.IRModule.from_expr(fpm)
+
+    with tvm.transform.PassContext(opt_level=3):
+        # Compile for Hexagon...
+        hexagon_lowered = build_module(relay_mod, tvm.target.hexagon("v68"))
+
+        # Compile for LLVM...
+        llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm"))
+
+    data_in = np.arange(-96, 96).reshape(ishape)
+    inputs = {"a": data_in}
+
+    # Run hexagon...
+    graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
+    hexagon_output = run_module(graph_mod, inputs)
+
+    # Run llvm...
+    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+    expected_output = run_module(llvm_graph_mod, inputs)
+
+    tvm.testing.assert_allclose(hexagon_output, expected_output, atol=1)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 32f9a5f4d4f03a0875d64ac42df46cafe8ae3cfa Mon Sep 17 00:00:00 2001
From: Yuanjing Shi <yuanjing@octoml.ai>
Date: Thu, 1 Sep 2022 04:23:35 -1000
Subject: [PATCH 088/704] [MetaSchedule] Fix autoinline for single const
 consumer block (#12668)

fix autoinline and add test
---
 .../schedule_rule/auto_inline.cc              |  5 +++-
 ...meta_schedule_schedule_rule_auto_inline.py | 28 +++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/meta_schedule/schedule_rule/auto_inline.cc b/src/meta_schedule/schedule_rule/auto_inline.cc
index 76313f46d1c8..446c8ead7e8e 100644
--- a/src/meta_schedule/schedule_rule/auto_inline.cc
+++ b/src/meta_schedule/schedule_rule/auto_inline.cc
@@ -104,7 +104,10 @@ inline InlineType AutoInlineNode::CheckInline(const tir::Schedule& sch,
   }
   // Cond 2. For a block that generates a constant tensor, ignore all other conditions
   if (inline_const_tensor && block->reads.empty()) {
-    return InlineType::kInlineIntoConsumer;
+    Array<tir::StmtSRef> consumer_srefs = GetConsumers(state, block_sref);
+    if (!consumer_srefs.empty() && CanComputeInline(state, block_sref)) {
+      return InlineType::kInlineIntoConsumer;
+    }
   }
   // Cond 3. The block doesn't contain any disallowed operators
   if (!is_pure_sptial && !disallow_op.empty() && HasOp(realize, disallow_op)) {
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
index a8ffa6ff9d3f..fcf6a8571b7f 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
@@ -320,6 +320,21 @@ def main(placeholder: T.Buffer[(1, 384), "int64"], placeholder_1: T.Buffer[(3052
                 T.writes(T_add[ax0, ax1, ax2])
                 T_add[ax0, ax1, ax2] = placeholder_1[T.min(T.max(T.int64(0), T.Select(T.cast(placeholder[ax0, ax1] < T.int64(0), "int32") != 0, placeholder[ax0, ax1] + T.int64(30522), placeholder[ax0, ax1])), T.int64(30521)), ax2] + placeholder_2[ax0, ax1, ax2]
 
+@tvm.script.ir_module
+class ConstConsumer:
+    @T.prim_func
+    def main(T_full: T.Buffer[(1, 12, 4096), "int64"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        for i0, i1, i2 in T.grid(1, 12, 4096):
+            with T.block("T_full"):
+                ax0, ax1, ax2 = T.axis.remap("SSS", [i0, i1, i2])
+                T.reads()
+                T.writes(T_full[ax0, ax1, ax2])
+                T_full[ax0, ax1, ax2] = T.int64(0)
+
 # pylint: enable=no-member,invalid-name,unused-variable,no-self-argument,line-too-long,chained-comparison,not-callable,too-many-nested-blocks
 # fmt: on
 
@@ -383,8 +398,21 @@ def test_inline_pure_spatial():
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=AfterPureSpatial)
 
 
+def test_inline_constant_tensor():
+    mod = ConstConsumer
+    target = Target("cuda", host="llvm")
+    ctx = _create_context(
+        mod=mod,
+        target=target,
+        rule=auto_inline(target=target),
+    )
+    (space,) = ctx.space_generator.generate_design_space(mod=mod)
+    tvm.ir.assert_structural_equal(lhs=space.mod, rhs=ConstConsumer)
+
+
 if __name__ == "__main__":
     test_inline_consumer_chain()
     test_inline_into_cache()
     test_inline_into_multiple_consumers()
     test_inline_pure_spatial()
+    test_inline_constant_tensor()

From effcd2251b4bb04e47f8ec288b056b0756ea4f4f Mon Sep 17 00:00:00 2001
From: Robert Kimball <bobkimball@gmail.com>
Date: Thu, 1 Sep 2022 08:57:40 -0700
Subject: [PATCH 089/704] Add methods to get and set late-bound constants.
 (#12664)

* Add methods to read and restore late-bound constants on Executable.

* Add bindings for new functions

* Cleanup

* Fix function name

* Add tests for python API to access new load/save functions

* Add another tests for python API to access new load/save functions where there are no constants
---
 include/tvm/runtime/vm/executable.h | 13 +++++
 python/tvm/runtime/vm.py            | 10 ++++
 src/runtime/vm/executable.cc        | 24 ++++++++-
 tests/python/relay/test_vm.py       | 80 +++++++++++++++++++++++++++++
 4 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h
index 2405b3c0ba8c..fdbc1769c353 100644
--- a/include/tvm/runtime/vm/executable.h
+++ b/include/tvm/runtime/vm/executable.h
@@ -126,6 +126,11 @@ class TVM_DLL Executable : public ModuleNode {
    */
   void MoveLateBoundConstantsToFile(const std::string& path, size_t byte_limit);
 
+  /*!
+   * \brief Get a map of all constants with larger that byte_limit in size.
+   */
+  Map<String, NDArray> GetLateBoundConstants(size_t byte_limit);
+
   /*!
    * \brief Restores the late-bound constants for the executable (if any) from given byte-stream.
    *
@@ -134,6 +139,14 @@ class TVM_DLL Executable : public ModuleNode {
    */
   void LoadLateBoundConstantsFromStream(dmlc::Stream* stream);
 
+  /*!
+   * \brief Restores the late-bound constants for the executable (if any) from given map.
+   *
+   * Must be called after \p Load but before any other methods if \p MoveLateBoundConstantsToBinary
+   * was used when saving. Otherwise can be ignored.
+   */
+  void LoadLateBoundConstantsFromMap(Map<String, NDArray> map);
+
   /*!
    * \brief As for \p LoadLateBoundConstantsFromStream, but load from file at \p path.
    */
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index c065d77a7c9f..615f66fdcc1c 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -86,7 +86,9 @@ def __init__(self, mod):
         self._get_function_arity = self.mod["get_function_arity"]
         self._get_function_param_name = self.mod["get_function_param_name"]
         self._move_late_bound_consts = self.mod["move_late_bound_consts"]
+        self._get_late_bound_consts = self.mod["get_late_bound_consts"]
         self._load_late_bound_consts = self.mod["load_late_bound_consts"]
+        self._load_late_bound_consts_from_map = self.mod["load_late_bound_consts_from_map"]
 
     def save(self):
         """Save the Relay VM Executable.
@@ -312,10 +314,18 @@ def move_late_bound_consts(self, path, byte_limit):
         """Move all constants of byte size greater or equal to byte_limit to file at path"""
         return self._move_late_bound_consts(path, byte_limit)
 
+    def get_late_bound_consts(self, byte_limit):
+        """Return all constants of byte size greater or equal to byte_limit"""
+        return self._get_late_bound_consts(byte_limit)
+
     def load_late_bound_consts(self, path):
         """Re-load constants previously saved to file at path"""
         return self._load_late_bound_consts(path)
 
+    def load_late_bound_consts_from_map(self, map):
+        """Re-load constants supplied in map"""
+        return self._load_late_bound_consts_from_map(map)
+
 
 class VirtualMachine(object):
     """Relay VM runtime.
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index 85dad2839a8a..2484ece3081d 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -97,12 +97,25 @@ PackedFunc Executable::GetFunction(const std::string& name, const ObjectPtr<Obje
       uint64_t byte_limit = args[1];
       MoveLateBoundConstantsToFile(path, static_cast<size_t>(byte_limit));
     });
+  } else if (name == "get_late_bound_consts") {
+    return PackedFunc([this](TVMArgs args, TVMRetValue* rv) {
+      CHECK_EQ(args.size(), 1);
+      uint64_t byte_limit = args[0];
+      Map<String, NDArray> consts = GetLateBoundConstants(static_cast<size_t>(byte_limit));
+      *rv = consts;
+    });
   } else if (name == "load_late_bound_consts") {
     return PackedFunc([this](TVMArgs args, TVMRetValue* rv) {
       CHECK_EQ(args.size(), 1);
       std::string path = args[0];
       LoadLateBoundConstantsFromFile(path);
     });
+  } else if (name == "load_late_bound_consts_from_map") {
+    return PackedFunc([this](TVMArgs args, TVMRetValue* rv) {
+      CHECK_EQ(args.size(), 1);
+      Map<String, NDArray> map = args[0];
+      LoadLateBoundConstantsFromMap(map);
+    });
   } else {
     LOG(FATAL) << "Unknown packed function: " << name;
     return PackedFunc();
@@ -300,7 +313,7 @@ void Executable::SaveVirtualDevicesSection(dmlc::Stream* strm) {
   strm->Write(host_device_index);
 }
 
-void Executable::MoveLateBoundConstantsToStream(dmlc::Stream* stream, size_t byte_limit) {
+Map<String, NDArray> Executable::GetLateBoundConstants(size_t byte_limit) {
   ICHECK(late_bound_constant_names.empty());
   late_bound_constant_names.reserve(constants.size());
   Map<String, NDArray> map;
@@ -323,6 +336,11 @@ void Executable::MoveLateBoundConstantsToStream(dmlc::Stream* stream, size_t byt
   }
   VLOG(1) << "moved " << map.size() << " constants of " << total_late_bound_bytes
           << " bytes (out of " << constants.size() << " overall) to be late-bound";
+  return map;
+}
+
+void Executable::MoveLateBoundConstantsToStream(dmlc::Stream* stream, size_t byte_limit) {
+  Map<String, NDArray> map = GetLateBoundConstants(byte_limit);
   runtime::SaveParams(stream, map);
 }
 
@@ -341,6 +359,10 @@ void Executable::LoadLateBoundConstantsFromStream(dmlc::Stream* stream) {
   ICHECK_EQ(late_bound_constant_names.size(), constants.size());
   Map<String, NDArray> map = runtime::LoadParams(stream);
   VLOG(1) << "loaded " << map.size() << " late-bound constants";
+  LoadLateBoundConstantsFromMap(map);
+}
+
+void Executable::LoadLateBoundConstantsFromMap(Map<String, NDArray> map) {
   for (size_t const_index = 0; const_index < constants.size(); ++const_index) {
     if (!late_bound_constant_names[const_index].defined()) {
       ICHECK(constants[const_index].defined())
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index 4f649ad9beba..0b62db85c904 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -1405,5 +1405,85 @@ def test_vm_save_and_load_without_designating_late_bound_consts():
     tvm.testing.assert_allclose(expected, actual.numpy())
 
 
+def test_load_and_save_constants_via_map():
+    """Large constants can be serialized outside of executable"""
+    target = tvm.target.Target("llvm")
+    dev = tvm.cpu()
+
+    # fn(x) { add(x, <large constant>) }
+    x = relay.var("x", shape=(1000, 1000))
+    const_data = np.random.rand(1000, 1000).astype("float32")
+    const = relay.const(const_data, dtype="float32")
+    func = relay.Function([x], relay.op.add(x, const))
+    mod = tvm.IRModule.from_expr(func)
+
+    # Compile to executable.
+    vm_exec = vm.compile(mod, target=target)
+
+    consts_map = vm_exec.get_late_bound_consts(byte_limit=256)
+
+    # Save to constants and library files
+    temp = utils.tempdir()
+    path_dso = temp.relpath("lib.so")
+    vm_exec.mod.export_library(path_dso)
+
+    # Load library files and constants
+    mod = runtime.load_module(path_dso)
+    mod["load_late_bound_consts_from_map"](consts_map)
+
+    # Test main
+    x_data = np.random.rand(1000, 1000).astype("float32")
+    the_vm = runtime.vm.VirtualMachine(mod, dev)
+    actual = the_vm.invoke("main", x_data)
+    expected = x_data + const_data
+    tvm.testing.assert_allclose(expected, actual.numpy())
+
+    # We load the mod again so it's missing the consts.
+    mod = runtime.load_module(path_dso)
+    exe = runtime.vm.Executable(mod)
+
+    # Also test loading consts via the VM's wrapper API.
+    exe.load_late_bound_consts_from_map(consts_map)
+
+    # Test main again with consts now loaded via the above API.
+    x_data = np.random.rand(1000, 1000).astype("float32")
+    the_vm = runtime.vm.VirtualMachine(exe, dev)
+    actual = the_vm.invoke("main", x_data)
+    expected = x_data + const_data
+    tvm.testing.assert_allclose(expected, actual.numpy())
+
+
+def test_load_late_bound_consts_via_map_with_no_late_bound_consts():
+    """Check that load_late_bound_consts handles a model with no late bound consts."""
+    target = tvm.target.Target("llvm")
+    dev = tvm.cpu()
+
+    const_data = np.random.rand(1).astype("float64")
+    x = relay.var("x", shape=(1,), dtype="float64")
+    const = relay.const(const_data, dtype="float64")
+
+    func = relay.Function([x], relay.op.add(x, const))
+    mod = tvm.IRModule.from_expr(func)
+
+    vm_exec = vm.compile(mod, target=target)
+
+    temp = utils.tempdir()
+    path_dso = temp.relpath("lib.so")
+
+    # Ensure const_data is below the byte threshold for a late-bound const.
+    byte_limit = len(const_data.tobytes()) + 1
+    consts_map = vm_exec.get_late_bound_consts(byte_limit=byte_limit)
+    vm_exec.mod.export_library(path_dso)
+
+    mod = runtime.load_module(path_dso)
+    mod["load_late_bound_consts_from_map"](consts_map)
+
+    x_data = np.random.rand(1).astype("float64")
+    loaded_vm = runtime.vm.VirtualMachine(mod, dev)
+    actual = loaded_vm.invoke("main", x_data)
+    expected = x_data + const_data
+    tvm.testing.assert_allclose(expected, actual.numpy())
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From e814f798edc5bf6977a4f4f74ec8d1d7e363c608 Mon Sep 17 00:00:00 2001
From: Andrey Malyshev <elvin.nnov@gmail.com>
Date: Thu, 1 Sep 2022 19:33:15 +0300
Subject: [PATCH 090/704] [Adreno] Change compute/schedule for ToMixedPrecision
 pass (#12537)

* [Adreno] Change compute/schedule for ToMixedPrecision pass

* Address CI fails

* address PR comments

* Fix AutoTVM flow
---
 python/tvm/relay/op/strategy/adreno.py        | 142 ++++++------------
 python/tvm/topi/adreno/conv2d_alter_op.py     |  48 +++---
 python/tvm/topi/adreno/conv2d_nchw.py         | 117 +++++++--------
 .../tvm/topi/adreno/conv2d_nchw_winograd.py   |  45 +-----
 python/tvm/topi/adreno/conv2d_nhwc.py         | 111 +++++++-------
 .../tvm/topi/adreno/conv2d_nhwc_winograd.py   |  45 +-----
 .../tvm/topi/adreno/conv2d_winograd_common.py |  19 +--
 .../tvm/topi/adreno/depthwise_conv2d_nchw.py  |  42 +-----
 .../tvm/topi/adreno/depthwise_conv2d_nhwc.py  |  38 +----
 .../python/relay/test_conv2d_nchw_texture.py  |   4 +-
 .../python/relay/test_conv2d_nhwc_texture.py  |   2 +-
 11 files changed, 218 insertions(+), 395 deletions(-)

diff --git a/python/tvm/relay/op/strategy/adreno.py b/python/tvm/relay/op/strategy/adreno.py
index a537fa1e7b90..9429fd71e1d9 100644
--- a/python/tvm/relay/op/strategy/adreno.py
+++ b/python/tvm/relay/op/strategy/adreno.py
@@ -36,8 +36,10 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target):
         raise ValueError("dilation should be positive value")
 
     if groups == 1:
-        if (data_layout == "NCHW" and kernel_layout == "OIHW") or (
-            data_layout == "NCHW4c" and kernel_layout == "OIHW4o"
+        if (
+            (data_layout == "NCHW" and kernel_layout == "OIHW")
+            or (data_layout == "NCHW4c" and kernel_layout == "OIHW4o")
+            or (data_layout == "NCHW" and kernel_layout == "OIHW4o")
         ):
             if len(kernel.shape) == 4:
                 _, _, kh, kw = get_const_tuple(kernel.shape)
@@ -47,35 +49,24 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target):
                 (2 < kh < 8 and 2 < kw < 8 and kh == kw)
                 and (stride_h == 1 and stride_w == 1)
                 and (dilation_h == 1 and dilation_w == 1)
+                and not (data_layout == "NCHW" and kernel_layout == "OIHW4o")
             ):
-                if out_type.dtype == "float16":
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd),
-                        wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd),
-                        name="conv2d_nchw_winograd.image2d",
-                        plevel=5,
-                    )
                 strategy.add_implementation(
-                    wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_acc32),
-                    wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd_acc32),
-                    name="conv2d_nchw_winograd_acc32.image2d",
-                    plevel=7,
-                )
-            if out_type.dtype == "float16":
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.adreno.conv2d_nchwc),
-                    wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc),
-                    name="conv2d_nchwc.image2d",
-                    plevel=10,
+                    wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd),
+                    wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd),
+                    name="conv2d_nchw_winograd.image2d",
+                    plevel=5,
                 )
             strategy.add_implementation(
-                wrap_compute_conv2d(topi.adreno.conv2d_nchwc_acc32),
-                wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc_acc32),
-                name="conv2d_nchwc_acc32.image2d",
-                plevel=20,
+                wrap_compute_conv2d(topi.adreno.conv2d_nchwc),
+                wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc),
+                name="conv2d_nchwc.image2d",
+                plevel=10,
             )
-        elif (data_layout == "NHWC" and kernel_layout == "HWIO") or (
-            data_layout == "NHWC4c" and kernel_layout == "HWIO4o"
+        elif (
+            (data_layout == "NHWC" and kernel_layout == "HWIO")
+            or (data_layout == "NHWC4c" and kernel_layout == "HWIO4o")
+            or (data_layout == "NHWC" and kernel_layout == "HWIO4o")
         ):
             if len(kernel.shape) == 4:
                 kh, kw, _, _ = get_const_tuple(kernel.shape)
@@ -85,32 +76,19 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target):
                 (2 < kh < 8 and 2 < kw < 8 and kh == kw)
                 and (stride_h == 1 and stride_w == 1)
                 and (dilation_h == 1 and dilation_w == 1)
+                and not (data_layout == "NHWC" and kernel_layout == "HWIO4o")
             ):
-                if out_type.dtype == "float16":
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd),
-                        wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd),
-                        name="conv2d_nhwc_winograd.image2d",
-                        plevel=5,
-                    )
                 strategy.add_implementation(
-                    wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_acc32),
-                    wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd_acc32),
-                    name="conv2d_nhwc_winograd_acc32.image2d",
-                    plevel=7,
-                )
-            if out_type.dtype == "float16":
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.adreno.conv2d_nhwc),
-                    wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc),
-                    name="conv2d_nhwc.image2d",
-                    plevel=10,
+                    wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd),
+                    wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd),
+                    name="conv2d_nhwc_winograd.image2d",
+                    plevel=5,
                 )
             strategy.add_implementation(
-                wrap_compute_conv2d(topi.adreno.conv2d_nhwc_acc32),
-                wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_acc32),
-                name="conv2d_nhwc_acc32.image2d",
-                plevel=20,
+                wrap_compute_conv2d(topi.adreno.conv2d_nhwc),
+                wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc),
+                name="conv2d_nhwc.image2d",
+                plevel=10,
             )
         else:
             raise RuntimeError(
@@ -149,35 +127,21 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target):
             if (data_layout == "NCHW" and kernel_layout == "OIHW") or (
                 data_layout == "NCHW4c" and kernel_layout == "OIHW4o"
             ):
-                if out_type.dtype == "float16":
-                    strategy.add_implementation(
-                        wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc),
-                        wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc),
-                        name="depthwise_conv2d_nchwc.image2d",
-                        plevel=10,
-                    )
                 strategy.add_implementation(
-                    wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc_acc32),
-                    wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc_acc32),
-                    name="depthwise_conv2d_nchwc_acc32.image2d",
-                    plevel=20,
+                    wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc),
+                    wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc),
+                    name="depthwise_conv2d_nchwc.image2d",
+                    plevel=10,
                 )
             elif (data_layout == "NHWC" and kernel_layout == "HWOI") or (
                 data_layout == "NHWC4c" and kernel_layout == "HWOI4o"
             ):
                 if data.shape[-1] >= 4:
-                    if out_type.dtype == "float16":
-                        strategy.add_implementation(
-                            wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc),
-                            wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc),
-                            name="depthwise_conv2d_nhwc.image2d",
-                            plevel=10,
-                        )
                     strategy.add_implementation(
-                        wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc_acc32),
-                        wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc_acc32),
-                        name="depthwise_conv2d_nhwc_acc32.image2d",
-                        plevel=20,
+                        wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc),
+                        wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc),
+                        name="depthwise_conv2d_nhwc.image2d",
+                        plevel=10,
                     )
                 else:
                     strategy.add_implementation(
@@ -208,40 +172,18 @@ def conv2d_winograd_without_weight_transfrom_strategy_adreno(attrs, inputs, out_
     assert groups == 1, "Do not supoort arbitrary group number"
     strategy = _op.OpStrategy()
     if layout in ("NCHW", "NCHW4c"):
-        if out_type.dtype == "float16":
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform),
-                wrap_topi_schedule(
-                    topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform
-                ),
-                name="conv2d_nchw_winograd_without_weight_transform.image2d",
-                plevel=5,
-            )
         strategy.add_implementation(
-            wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform_acc32),
-            wrap_topi_schedule(
-                topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform_acc32
-            ),
-            name="conv2d_nchw_winograd_without_weight_transform_acc32.image2d",
-            plevel=7,
+            wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform),
+            wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform),
+            name="conv2d_nchw_winograd_without_weight_transform.image2d",
+            plevel=5,
         )
     elif layout in ("NHWC", "NHWC4c"):
-        if out_type.dtype == "float16":
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform),
-                wrap_topi_schedule(
-                    topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform
-                ),
-                name="conv2d_nhwc_winograd_without_weight_transform.image2d",
-                plevel=5,
-            )
         strategy.add_implementation(
-            wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform_acc32),
-            wrap_topi_schedule(
-                topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform_acc32
-            ),
-            name="conv2d_nhwc_winograd_without_weight_transform_acc32.image2d",
-            plevel=7,
+            wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform),
+            wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform),
+            name="conv2d_nhwc_winograd_without_weight_transform.image2d",
+            plevel=5,
         )
     else:
         raise RuntimeError(
diff --git a/python/tvm/topi/adreno/conv2d_alter_op.py b/python/tvm/topi/adreno/conv2d_alter_op.py
index 16573991e09c..6cf749a62b27 100644
--- a/python/tvm/topi/adreno/conv2d_alter_op.py
+++ b/python/tvm/topi/adreno/conv2d_alter_op.py
@@ -304,7 +304,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 num_filter_block = 4
 
             # no support yet for tensors that cannot be divisible by factor 4
-            if in_channel_block != 4 or num_filter_block != 4:
+            if num_filter_block != 4:
                 return None
 
             batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
@@ -312,16 +312,22 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
 
             # update new attrs
             new_attrs["channels"] = out_channel
-            new_attrs["data_layout"] = "NCHW%dc" % in_channel_block
+            if in_channel_block == 4:
+                new_attrs["data_layout"] = "NCHW%dc" % in_channel_block
+            else:
+                new_attrs["data_layout"] = "NCHW"
             # (oc, ic, h, w) -> (OC, ic, h, w, oc)
             new_attrs["kernel_layout"] = "OIHW%do" % num_filter_block
             new_attrs["out_layout"] = "NCHW%dc" % num_filter_block
 
             # Store altered operator's config for applying of tuned AutoTVM statistics
-            new_data = te.placeholder(
-                (batch_size, in_channel // in_channel_block, height, width, in_channel_block),
-                dtype=data_dtype,
-            )
+            if in_channel_block == 4:
+                new_data = te.placeholder(
+                    (batch_size, in_channel // in_channel_block, height, width, in_channel_block),
+                    dtype=data_dtype,
+                )
+            else:
+                new_data = data_tensor
             new_kernel = te.placeholder(
                 (out_channel // num_filter_block, in_filter_channel, kh, kw, num_filter_block),
                 dtype=kernel_tensor.dtype,
@@ -361,12 +367,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 num_filter_block = 4
 
             # no support yet for tensors cannot be divisible by factor 4
-            if in_channel_block != 4 or num_filter_block != 4:
+            if num_filter_block != 4:
                 return None
 
             # update new attrs
             new_attrs["channels"] = out_channles
-            new_attrs["data_layout"] = "NHWC%dc" % in_channel_block
+            if in_channel_block == 4:
+                new_attrs["data_layout"] = "NHWC%dc" % in_channel_block
+            else:
+                new_attrs["data_layout"] = "NHWC"
             # (h, w, ic, oc) -> (h, w, ic, OC, oc)
             if kernel_layout == "HWIO":
                 new_attrs["kernel_layout"] = "HWIO%do" % num_filter_block
@@ -375,16 +384,19 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
             new_attrs["out_layout"] = "NHWC%dc" % num_filter_block
 
             # Store altered operator's config for applying of tuned AutoTVM statistics
-            new_data = te.placeholder(
-                (
-                    batch_size,
-                    in_height,
-                    in_width,
-                    in_channels // in_channel_block,
-                    in_channel_block,
-                ),
-                dtype=data_dtype,
-            )
+            if in_channel_block == 4:
+                new_data = te.placeholder(
+                    (
+                        batch_size,
+                        in_height,
+                        in_width,
+                        in_channels // in_channel_block,
+                        in_channel_block,
+                    ),
+                    dtype=data_dtype,
+                )
+            else:
+                new_data = data_tensor
             if kernel_layout == "HWIO":
                 new_kernel = te.placeholder(
                     (
diff --git a/python/tvm/topi/adreno/conv2d_nchw.py b/python/tvm/topi/adreno/conv2d_nchw.py
index 65cd8e0150a8..082f71364af8 100644
--- a/python/tvm/topi/adreno/conv2d_nchw.py
+++ b/python/tvm/topi/adreno/conv2d_nchw.py
@@ -33,48 +33,22 @@
 )
 
 
-@autotvm.register_topi_compute("conv2d_nchwc.image2d")
-def conv2d_nchwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"):
-    """Compute conv2d with NCHWc layout"""
-    args = {"shared": False, "accumulator": "float16"}
-    return compute_conv2d_NCHWc_KCRSk(
-        data, kernel, strides, padding, dilation, out_dtype, args=args
-    )
-
-
-@autotvm.register_topi_compute("conv2d_nchwc_acc32.image2d")
-def conv2d_nchwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"):
-    """Compute conv2d with NCHWc layout"""
-    args = {"shared": False, "accumulator": "float32"}
-    return compute_conv2d_NCHWc_KCRSk(
-        data, kernel, strides, padding, dilation, out_dtype, args=args
-    )
-
-
 @autotvm.register_topi_schedule("conv2d_nchwc.image2d")
 def schedule_conv2d_nchwc(cfg, outs):
-    return schedule_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc16")
-
-
-@autotvm.register_topi_schedule("conv2d_nchwc_acc32.image2d")
-def schedule_conv2d_nchwc_acc32(cfg, outs):
-    return schedule_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc32")
-
-
-def schedule_conv2d_nchwc_impl(cfg, outs, tag):
     """Create the schedule for conv2d_nchw"""
     outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
     s = te.create_schedule([x.op for x in outs])
 
     def _callback(op):
-        if op.tag == tag:
+        if op.tag == "adreno_conv2d_latest_op":
             schedule_conv2d_NCHWc_KCRSk(cfg, s, op.output(0))
 
     traverse_inline(s, outs[0].op, _callback)
     return s
 
 
-def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype, args):
+@autotvm.register_topi_compute("conv2d_nchwc.image2d")
+def conv2d_nchwc(cfg, Input, Filter, stride, padding, dilation, out_dtype):
     """
     Convolution operator in NCHWc layout.
     Algo:
@@ -109,18 +83,12 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty
     convert_from4d = False
     if len(Input.shape) == 4:
         batch, in_channels, in_height, in_width = Input.shape
-        out_channles, in_filter_channels, kernel_h, kernel_w = Filter.shape
-
         in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4)
-        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4)
 
         if autotvm.GLOBAL_SCOPE.in_tuning:
             dshape = (batch, in_channel_chunks, in_height, in_width, in_channel_block)
             Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder")
-            kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block)
-            Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder")
         else:
-            convert_from4d = True
             Input = pack_input(
                 Input,
                 "NCHW",
@@ -131,6 +99,18 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty
                 in_height,
                 in_width,
             )
+    else:
+        batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape
+
+    if len(Filter.shape) == 4:
+        out_channles, in_filter_channels, kernel_h, kernel_w = Filter.shape
+        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4)
+
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block)
+            Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder")
+        else:
+            convert_from4d = True
             Filter = pack_filter(
                 Filter,
                 "OIHW",
@@ -144,9 +124,7 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty
                 kernel_h,
                 kernel_w,
             )
-
     else:
-        batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape
         out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block = Filter.shape
 
     out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions(
@@ -178,7 +156,7 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty
             (
                 temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb]
                 * Filter[ffc, rcc * in_channel_block + rcb, ry, rx, ffb]
-            ).astype(args["accumulator"]),
+            ).astype(out_dtype),
             axis=[rcc, rcb, ry, rx],
         ),
         tag="conv2d_nchwc",
@@ -193,13 +171,13 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty
         return te.compute(
             (batch, out_channles, out_height_orig, out_width_orig),
             lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block],
-            tag="cast_from_acc" + args["accumulator"][-2:],
+            tag="adreno_conv2d_latest_op",
         )
     else:
         return te.compute(
             (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block),
             lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype),
-            tag="cast_from_acc" + args["accumulator"][-2:],
+            tag="adreno_conv2d_latest_op",
         )
 
 
@@ -234,6 +212,20 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output):
         conv = output.op.input_tensors[0]
         latest_blocked = latest
 
+    pad_data, kernel = s[conv].op.input_tensors
+    filter_pack_rt = bool(
+        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag
+    )
+
+    if "pad_temp" in pad_data.op.name:
+        input_pad_temp = pad_data.op.input_tensors[0]
+    else:
+        input_pad_temp = pad_data
+
+    input_pack_rt = bool(
+        isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in input_pad_temp.op.tag
+    )
+
     ##### space definition begin #####
     n, fc, y, x, fb = s[conv].op.axis
     rcc, rcb, ry, rx = s[conv].op.reduce_axis
@@ -274,37 +266,40 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output):
     ##### space definition end #####
 
     pad_data, kernel = s[conv].op.input_tensors
-    if (
-        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag
-    ):  # len(latest.op.axis) == 4:
-        # manage scheduling of datacopy
-        pad_data, kernel = s[conv].op.input_tensors
-        if "pad_temp" in pad_data.op.name:
-            pack_data = pad_data.op.input_tensors[0]
-            bind_data_copy(s[pack_data])
+    # There are several conditions that have to be handled:
+    # 1. If we are in the tuning, we always add cache read for data to main conv kernel
+    #    to get texture in tuning opencl kernel
+    # 2. If we are repacking input in runtime, we should always explicit schedule this one more
+    #    stage of data copy from 4d to 5d (referred as pack_data).
+    # 3. If we have pad (independently if we have runtime repack or not) we should inline it in the
+    #    cache_read("texture")
+    if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt:
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            if "pad_temp" in pad_data.op.name:
+                s[pad_data].compute_inline()
         else:
-            bind_data_copy(s[pad_data])
-        bind_data_copy(s[kernel])
-
-    pad_data, kernel = s[conv].op.input_tensors
+            if "pad_temp" in pad_data.op.name:
+                pack_data = pad_data.op.input_tensors[0]
+                bind_data_copy(s[pack_data])
+                s[pad_data].compute_inline()
+            else:
+                pack_data = pad_data
+                bind_data_copy(s[pack_data])
 
-    if (
-        autotvm.GLOBAL_SCOPE.in_tuning
-        or isinstance(kernel.op, tvm.te.ComputeOp)
-        and "filter_pack" in kernel.op.tag
-    ):
-        if "pad_temp" in pad_data.op.name:
-            s[pad_data].compute_inline()
         AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
         bind_data_copy(s[AT])
-        WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
-        bind_data_copy(s[WT])
     elif "pad_temp" in pad_data.op.name:
         s[pad_data].compute_inline()
         # create cache stage
         AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
         bind_data_copy(s[AT])
 
+    if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt:
+        if not autotvm.GLOBAL_SCOPE.in_tuning:
+            bind_data_copy(s[kernel])
+        WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
+        bind_data_copy(s[WT])
+
     s[conv].set_scope("local")
     if latest_blocked == latest and output != latest:
         s[output].compute_inline()
diff --git a/python/tvm/topi/adreno/conv2d_nchw_winograd.py b/python/tvm/topi/adreno/conv2d_nchw_winograd.py
index 16f7cb8b19d9..0ddc0e7f2c0d 100644
--- a/python/tvm/topi/adreno/conv2d_nchw_winograd.py
+++ b/python/tvm/topi/adreno/conv2d_nchw_winograd.py
@@ -27,62 +27,32 @@
 
 @autotvm.register_topi_compute("conv2d_nchw_winograd.image2d")
 def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    args = {"shared": False, "accumulator": "float16"}
     return conv2d_nchw_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False
-    )
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd_acc32.image2d")
-def conv2d_nchw_winograd_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    args = {"shared": False, "accumulator": "float32"}
-    return conv2d_nchw_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False
+        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False
     )
 
 
 @autotvm.register_topi_schedule("conv2d_nchw_winograd.image2d")
 def schedule_conv2d_nchw_winograd(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16")
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd_acc32.image2d")
-def schedule_conv2d_nchw_winograd_acc32(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32")
+    return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at")
 
 
 @autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform.image2d")
 def conv2d_nchw_winograd_without_weight_transform(
     cfg, data, kernel, strides, padding, dilation, out_dtype
 ):
-    args = {"shared": False, "accumulator": "float16"}
     return conv2d_nchw_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True
-    )
-
-
-@autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform_acc32.image2d")
-def conv2d_nchw_winograd_without_weight_transform_acc32(
-    cfg, data, kernel, strides, padding, dilation, out_dtype
-):
-    args = {"shared": False, "accumulator": "float32"}
-    return conv2d_nchw_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True
+        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True
     )
 
 
 @autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform.image2d")
 def schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16", pre_computed=True)
-
-
-@autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform_acc32.image2d")
-def schedule_conv2d_nchw_winograd_without_weight_transform_acc32(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32", pre_computed=True)
+    return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at", pre_computed=True)
 
 
 def conv2d_nchw_winograd_comp(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed
+    cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed
 ):
     """Compute declaration for winograd
 
@@ -111,9 +81,6 @@ def conv2d_nchw_winograd_comp(
     out_dtype: str
         The output type. This is used for mixed precision.
 
-    args: dict
-        Dictionary with additional arguments, e.g. accumulator type
-
     pre_computed: bool
         Flag if weights were pre computed if true or the weights should be
         computed in runtime
@@ -124,5 +91,5 @@ def conv2d_nchw_winograd_comp(
         4-D or 5-D with shape NCHW or NCHW4c
     """
     return conv2d_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, "NCHW"
+        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, "NCHW"
     )
diff --git a/python/tvm/topi/adreno/conv2d_nhwc.py b/python/tvm/topi/adreno/conv2d_nhwc.py
index b377169ca8c9..993b63252531 100644
--- a/python/tvm/topi/adreno/conv2d_nhwc.py
+++ b/python/tvm/topi/adreno/conv2d_nhwc.py
@@ -33,44 +33,22 @@
 )
 
 
-@autotvm.register_topi_compute("conv2d_nhwc.image2d")
-def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"):
-    """Compute conv2d with NCHWc layout"""
-    args = {"shared": False, "accumulator": "float16"}
-    return compute_conv2d_NHWC_HWIO(data, kernel, strides, padding, dilation, out_dtype, args=args)
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_acc32.image2d")
-def conv2d_nhwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"):
-    """Compute conv2d with NCHWc layout"""
-    args = {"shared": False, "accumulator": "float32"}
-    return compute_conv2d_NHWC_HWIO(data, kernel, strides, padding, dilation, out_dtype, args=args)
-
-
 @autotvm.register_topi_schedule("conv2d_nhwc.image2d")
 def schedule_conv2d_nhwc(cfg, outs):
-    return schedule_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc16")
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_acc32.image2d")
-def schedule_conv2d_nhwc_acc32(cfg, outs):
-    return schedule_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc32")
-
-
-def schedule_conv2d_nhwc_impl(cfg, outs, tag):
     """Create the schedule for conv2d_nhwc"""
     outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
     s = te.create_schedule([x.op for x in outs])
 
     def _callback(op):
-        if op.tag == tag:
+        if op.tag == "adreno_conv2d_latest_op":
             schedule_conv2d_NHWC(cfg, s, op.output(0))
 
     traverse_inline(s, outs[0].op, _callback)
     return s
 
 
-def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype, args):
+@autotvm.register_topi_compute("conv2d_nhwc.image2d")
+def conv2d_nhwc(cfg, Input, Filter, stride, padding, dilation, out_dtype):
     """
     Convolution operator in NHWC layout.
     Algo:
@@ -105,18 +83,12 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype
     convert_from4d = False
     if len(Input.shape) == 4:
         batch, in_height, in_width, in_channels = Input.shape
-        kernel_h, kernel_w, in_filter_channels, out_channles = Filter.shape
-
         in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4)
-        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4)
 
         if autotvm.GLOBAL_SCOPE.in_tuning:
             dshape = (batch, in_height, in_width, in_channel_chunks, in_channel_block)
             Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder")
-            kshape = (kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block)
-            Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder")
         else:
-            convert_from4d = True
             Input = pack_input(
                 Input,
                 "NHWC",
@@ -127,6 +99,17 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype
                 in_height,
                 in_width,
             )
+    else:
+        batch, in_height, in_width, in_channel_chunks, in_channel_block = Input.shape
+
+    if len(Filter.shape) == 4:
+        kernel_h, kernel_w, in_filter_channels, out_channles = Filter.shape
+        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4)
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            kshape = (kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block)
+            Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder")
+        else:
+            convert_from4d = True
             Filter = pack_filter(
                 Filter,
                 "HWIO",
@@ -140,9 +123,7 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype
                 kernel_h,
                 kernel_w,
             )
-
     else:
-        batch, in_height, in_width, in_channel_chunks, in_channel_block = Input.shape
         kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block = Filter.shape
 
     out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions(
@@ -173,7 +154,7 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype
             (
                 temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcc, rcb]
                 * Filter[ry, rx, rcc * in_channel_block + rcb, fc, fb]
-            ).astype(args["accumulator"]),
+            ).astype(out_dtype),
             axis=[ry, rx, rcc, rcb],
         ),
         tag="conv2d_nhwc",
@@ -188,13 +169,13 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype
         return te.compute(
             (batch, out_height_orig, out_width_orig, out_channles),
             lambda n, y, x, c: dummy_cast[n, y, x, c // out_channel_block, c % out_channel_block],
-            tag="cast_from_acc" + args["accumulator"][-2:],
+            tag="adreno_conv2d_latest_op",
         )
     else:
         return te.compute(
             (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block),
             lambda n, y, x, ffc, ffb: conv[n, y, x, ffc, ffb].astype(out_dtype),
-            tag="cast_from_acc" + args["accumulator"][-2:],
+            tag="adreno_conv2d_latest_op",
         )
 
 
@@ -229,6 +210,19 @@ def schedule_conv2d_NHWC(cfg, s, output):
         conv = output.op.input_tensors[0]
         latest_blocked = latest
 
+    pad_data, kernel = s[conv].op.input_tensors
+    filter_pack_rt = bool(
+        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag
+    )
+
+    if "pad_temp" in pad_data.op.name:
+        input_pad_temp = pad_data.op.input_tensors[0]
+    else:
+        input_pad_temp = pad_data
+
+    input_pack_rt = bool(
+        isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in input_pad_temp.op.tag
+    )
     ##### space definition begin #####
     n, y, x, fc, fb = s[conv].op.axis
     ry, rx, rcc, rcb = s[conv].op.reduce_axis
@@ -270,37 +264,40 @@ def schedule_conv2d_NHWC(cfg, s, output):
     ##### space definition end #####
 
     pad_data, kernel = s[conv].op.input_tensors
-    if (
-        isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag
-    ):  # len(latest.op.axis) == 4:
-        # manage scheduling of datacopy
-        pad_data, kernel = s[conv].op.input_tensors
-        if "pad_temp" in pad_data.op.name:
-            pack_data = pad_data.op.input_tensors[0]
-            bind_data_copy(s[pack_data])
+    # There are several conditions that have to be handled:
+    # 1. If we are in the tuning, we always add cache read for data to main conv kernel
+    #    to get texture in tuning opencl kernel
+    # 2. If we are repacking input in runtime, we should always explicit schedule this one more
+    #    stage of data copy from 4d to 5d (referred as pack_data).
+    # 3. If we have pad (independently if we have runtime repack or not) we should inline it in the
+    #    cache_read("texture")
+    if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt:
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            if "pad_temp" in pad_data.op.name:
+                s[pad_data].compute_inline()
         else:
-            bind_data_copy(s[pad_data])
-        bind_data_copy(s[kernel])
-
-    pad_data, kernel = s[conv].op.input_tensors
+            if "pad_temp" in pad_data.op.name:
+                s[pad_data].compute_inline()
+                pack_data = pad_data.op.input_tensors[0]
+                bind_data_copy(s[pack_data])
+            else:
+                pack_data = pad_data
+                bind_data_copy(s[pack_data])
 
-    if (
-        autotvm.GLOBAL_SCOPE.in_tuning
-        or isinstance(kernel.op, tvm.te.ComputeOp)
-        and "filter_pack" in kernel.op.tag
-    ):
-        if "pad_temp" in pad_data.op.name:
-            s[pad_data].compute_inline()
         AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
         bind_data_copy(s[AT])
-        WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
-        bind_data_copy(s[WT])
     elif "pad_temp" in pad_data.op.name:
         s[pad_data].compute_inline()
         # create cache stage
         AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
         bind_data_copy(s[AT])
 
+    if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt:
+        if not autotvm.GLOBAL_SCOPE.in_tuning:
+            bind_data_copy(s[kernel])
+        WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
+        bind_data_copy(s[WT])
+
     s[conv].set_scope("local")
     if latest_blocked == latest and output != latest:
         s[output].compute_inline()
diff --git a/python/tvm/topi/adreno/conv2d_nhwc_winograd.py b/python/tvm/topi/adreno/conv2d_nhwc_winograd.py
index bfe385f210a4..b055b388e1a7 100644
--- a/python/tvm/topi/adreno/conv2d_nhwc_winograd.py
+++ b/python/tvm/topi/adreno/conv2d_nhwc_winograd.py
@@ -27,62 +27,32 @@
 
 @autotvm.register_topi_compute("conv2d_nhwc_winograd.image2d")
 def conv2d_nhwc_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    args = {"shared": False, "accumulator": "float16"}
     return conv2d_nhwc_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False
-    )
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_winograd_acc32.image2d")
-def conv2d_nhwc_winograd_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype):
-    args = {"shared": False, "accumulator": "float32"}
-    return conv2d_nhwc_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False
+        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False
     )
 
 
 @autotvm.register_topi_schedule("conv2d_nhwc_winograd.image2d")
 def schedule_conv2d_nhwc_winograd(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16")
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_winograd_acc32.image2d")
-def schedule_conv2d_nhwc_winograd_acc32(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32")
+    return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at")
 
 
 @autotvm.register_topi_compute("conv2d_nhwc_winograd_without_weight_transform.image2d")
 def conv2d_nhwc_winograd_without_weight_transform(
     cfg, data, kernel, strides, padding, dilation, out_dtype
 ):
-    args = {"shared": False, "accumulator": "float16"}
     return conv2d_nhwc_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True
-    )
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_winograd_without_weight_transform_acc32.image2d")
-def conv2d_nhwc_winograd_without_weight_transform_acc32(
-    cfg, data, kernel, strides, padding, dilation, out_dtype
-):
-    args = {"shared": False, "accumulator": "float32"}
-    return conv2d_nhwc_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True
+        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True
     )
 
 
 @autotvm.register_topi_schedule("conv2d_nhwc_winograd_without_weight_transform.image2d")
 def schedule_conv2d_nhwc_winograd_without_weight_transform(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16", pre_computed=True)
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_winograd_without_weight_transform_acc32.image2d")
-def schedule_conv2d_nhwc_winograd_without_weight_transform_acc32(cfg, outs):
-    return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32", pre_computed=True)
+    return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at", pre_computed=True)
 
 
 def conv2d_nhwc_winograd_comp(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed
+    cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed
 ):
     """Compute declaration for winograd
 
@@ -111,9 +81,6 @@ def conv2d_nhwc_winograd_comp(
     out_dtype: str
         The output type. This is used for mixed precision.
 
-    args: dict
-        Dictionary with additional arguments, e.g. accumulator type
-
     pre_computed: bool
         Flag if weights were pre computed if true or the weights should be
         computed in runtime
@@ -124,5 +91,5 @@ def conv2d_nhwc_winograd_comp(
         4-D or 5-D with shape NCHW or NCHW4c
     """
     return conv2d_winograd_comp(
-        cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, "NHWC"
+        cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, "NHWC"
     )
diff --git a/python/tvm/topi/adreno/conv2d_winograd_common.py b/python/tvm/topi/adreno/conv2d_winograd_common.py
index b0cec0f70280..501773ad46fa 100644
--- a/python/tvm/topi/adreno/conv2d_winograd_common.py
+++ b/python/tvm/topi/adreno/conv2d_winograd_common.py
@@ -35,7 +35,7 @@
 
 
 def conv2d_winograd_comp(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, layout
+    cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, layout
 ):
     """Compute declaration for winograd
 
@@ -64,9 +64,6 @@ def conv2d_winograd_comp(
     out_dtype: str
         The output type. This is used for mixed precision.
 
-    args: dict
-        Dictionary with additional arguments, e.g. accumulator type
-
     pre_computed: bool
         Flag if weights were pre computed if true or the weights should be
         computed in runtime
@@ -186,7 +183,7 @@ def conv2d_winograd_comp(
 
     r = KW
     m = tile_size
-    A, B, G = winograd_transform_matrices(m, r, out_dtype)
+    A, B, G = winograd_transform_matrices(m, r, data.dtype)
 
     H = (H + pt + pb - KH) // HSTR + 1
     W = (W + pl + pr - KW) // WSTR + 1
@@ -268,7 +265,7 @@ def conv2d_winograd_comp(
         lambda eps, nu, co, p, cob: te.sum(
             (
                 kernel_pack[eps][nu][ci * CB + cb][co][cob] * data_pack_trans[eps][nu][ci][p][cb]
-            ).astype(args["accumulator"]),
+            ).astype(out_dtype),
             axis=[ci, cb],
         ),
         name="bgemm",
@@ -280,7 +277,7 @@ def conv2d_winograd_comp(
     inverse = te.compute(
         (CO, P, m, m, COB),
         lambda co, p, vh, vw, cob: te.sum(
-            bgemm[r_a][r_b][co][p][cob] * (A[r_a][vh] * A[r_b][vw]).astype(args["accumulator"]),
+            bgemm[r_a][r_b][co][p][cob] * (A[r_a][vh] * A[r_b][vw]).astype(out_dtype),
             axis=[r_a, r_b],
         ),
         name="inverse",
@@ -295,7 +292,7 @@ def conv2d_winograd_comp(
                     idxmod(h, m)
                 ][idxmod(w, m)][c % CB].astype(out_dtype),
                 name="output",
-                tag="cast_from_acc" + args["accumulator"][-2:],
+                tag="dummy_compute_at",
             )
         else:
             output = te.compute(
@@ -304,7 +301,7 @@ def conv2d_winograd_comp(
                     n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)
                 ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype),
                 name="output",
-                tag="cast_from_acc" + args["accumulator"][-2:],
+                tag="dummy_compute_at",
             )
     else:
         if convert_from4d and autotvm.GLOBAL_SCOPE.in_tuning is False:
@@ -314,7 +311,7 @@ def conv2d_winograd_comp(
                     idxmod(h, m)
                 ][idxmod(w, m)][c % CB].astype(out_dtype),
                 name="output",
-                tag="cast_from_acc" + args["accumulator"][-2:],
+                tag="dummy_compute_at",
             )
         else:
             output = te.compute(
@@ -323,7 +320,7 @@ def conv2d_winograd_comp(
                     n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)
                 ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype),
                 name="output",
-                tag="cast_from_acc" + args["accumulator"][-2:],
+                tag="dummy_compute_at",
             )
 
     if isinstance(N, int):
diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py
index 37713b4584b9..eb998bdbcd6e 100644
--- a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py
+++ b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py
@@ -33,50 +33,22 @@
 )
 
 
-@autotvm.register_topi_compute("depthwise_conv2d_nchwc.image2d")
-def depthwise_conv2d_nchwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"):
-    """Compute depthwise_conv2d with NCHWc layout"""
-    args = {"shared": False, "accumulator": "float16"}
-    return compute_depthwise_conv2d_NCHWc_KCRSk(
-        data, kernel, strides, padding, dilation, out_dtype, args=args
-    )
-
-
-@autotvm.register_topi_compute("depthwise_conv2d_nchwc_acc32.image2d")
-def depthwise_conv2d_nchwc_acc32(
-    cfg, data, kernel, strides, padding, dilation, out_dtype="float16"
-):
-    """Compute depthwise_conv2d with NCHWc layout"""
-    args = {"shared": False, "accumulator": "float32"}
-    return compute_depthwise_conv2d_NCHWc_KCRSk(
-        data, kernel, strides, padding, dilation, out_dtype, args=args
-    )
-
-
 @autotvm.register_topi_schedule("depthwise_conv2d_nchwc.image2d")
 def schedule_depthwise_conv2d_nchwc(cfg, outs):
-    return schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc16")
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nchwc_acc32.image2d")
-def schedule_depthwise_conv2d_nchwc_acc32(cfg, outs):
-    return schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc32")
-
-
-def schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag):
     """Create the schedule for depthwise conv2d_nchw4c_ohwi4o"""
     outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
     s = te.create_schedule([x.op for x in outs])
 
     def _callback(op):
-        if op.tag == tag:
+        if op.tag == "adreno_dw_conv2d_latest_op":
             schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, op.output(0))
 
     traverse_inline(s, outs[0].op, _callback)
     return s
 
 
-def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype, args):
+@autotvm.register_topi_compute("depthwise_conv2d_nchwc.image2d")
+def depthwise_conv2d_nchwc(cfg, Input, Filter, stride, padding, dilation, out_dtype):
     """
     Depthwise convolution operator in NCHWc layout.
     Algo:
@@ -183,10 +155,10 @@ def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilatio
                     ffb,
                 ]
                 * Filter[ffc // in_filter_channels, ffc % in_filter_channels, ry, rx, ffb]
-            ).astype(args["accumulator"]),
+            ).astype(out_dtype),
             axis=[ry, rx],
         ),
-        tag="depthwise_conv2d_nchwc_kcrsk",
+        tag="depthwise_conv2d_nchwc",
     )
 
     if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning:
@@ -198,13 +170,13 @@ def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilatio
         return te.compute(
             (batch, out_channles, out_height_orig, out_width_orig),
             lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block],
-            tag="cast_from_acc" + args["accumulator"][-2:],
+            tag="adreno_dw_conv2d_latest_op",
         )
     else:
         return te.compute(
             (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block),
             lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype),
-            tag="cast_from_acc" + args["accumulator"][-2:],
+            tag="adreno_dw_conv2d_latest_op",
         )
 
 
diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py
index 2b228b444fca..c27f2a9eae7c 100644
--- a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py
+++ b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py
@@ -33,48 +33,22 @@
 )
 
 
-@autotvm.register_topi_compute("depthwise_conv2d_nhwc.image2d")
-def depthwise_conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"):
-    """Compute depthwise_conv2d with NHWC layout"""
-    args = {"shared": False, "accumulator": "float16"}
-    return compute_depthwise_conv2d_NHWC_HWOI(
-        data, kernel, strides, padding, dilation, out_dtype, args=args
-    )
-
-
-@autotvm.register_topi_compute("depthwise_conv2d_nhwc_acc32.image2d")
-def depthwise_conv2d_nhwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"):
-    """Compute depthwise_conv2d with NHWC layout"""
-    args = {"shared": False, "accumulator": "float32"}
-    return compute_depthwise_conv2d_NHWC_HWOI(
-        data, kernel, strides, padding, dilation, out_dtype, args=args
-    )
-
-
 @autotvm.register_topi_schedule("depthwise_conv2d_nhwc.image2d")
 def schedule_depthwise_conv2d_nhwc(cfg, outs):
-    return schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc16")
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nhwc_acc32.image2d")
-def schedule_depthwise_conv2d_nhwc_acc32(cfg, outs):
-    return schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc32")
-
-
-def schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag):
     """Create the schedule for depthwise conv2d_nchw4c_ohwi4o"""
     outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
     s = te.create_schedule([x.op for x in outs])
 
     def _callback(op):
-        if op.tag == tag:
+        if op.tag == "adreno_dw_conv2d_latest_op":
             schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, op.output(0))
 
     traverse_inline(s, outs[0].op, _callback)
     return s
 
 
-def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation, out_dtype, args):
+@autotvm.register_topi_compute("depthwise_conv2d_nhwc.image2d")
+def depthwise_conv2d_nhwc(cfg, Input, Filter, stride, padding, dilation, out_dtype):
     """
     Depthwise convolution operator in NCHWc layout.
     Algo:
@@ -175,7 +149,7 @@ def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation,
             (
                 temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, ffc, ffb]
                 * Filter[ry, rx, ffc, 0, ffb]
-            ).astype(args["accumulator"]),
+            ).astype(out_dtype),
             axis=[ry, rx],
         ),
         tag="depthwise_conv2d_nhwc",
@@ -190,13 +164,13 @@ def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation,
         return te.compute(
             (batch, out_height_orig, out_width_orig, out_channles),
             lambda n, y, x, c: dummy_cast[n, y, x, c // out_channel_block, c % out_channel_block],
-            tag="cast_from_acc" + args["accumulator"][-2:],
+            tag="adreno_dw_conv2d_latest_op",
         )
     else:
         return te.compute(
             (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block),
             lambda n, y, x, ffc, ffb: conv[n, y, x, ffc, ffb].astype(out_dtype),
-            tag="cast_from_acc" + args["accumulator"][-2:],
+            tag="adreno_dw_conv2d_latest_op",
         )
 
 
diff --git a/tests/python/relay/test_conv2d_nchw_texture.py b/tests/python/relay/test_conv2d_nchw_texture.py
index 6eadd8fc1c7a..ab12e40b39cb 100644
--- a/tests/python/relay/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/test_conv2d_nchw_texture.py
@@ -437,7 +437,7 @@ def test_conv2d_vgg16_winograd_4d():
     stat_file = temp.relpath("stat.log")
     with open(stat_file, "w") as f:
         f.write(
-            '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd_acc32.image2d", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n'
+            '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n'
         )
     graph = build_run_compare(
         mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
@@ -486,7 +486,7 @@ def test_conv2d_winograd_conv():
     stat_file = temp.relpath("stat.log")
     with open(stat_file, "w") as f:
         f.write(
-            '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd_acc32.image2d", [["TENSOR", [1, 4, 3, 3], "float16"], ["TENSOR", [8, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n'
+            '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 4, 3, 3], "float16"], ["TENSOR", [8, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n'
         )
     graph = build_run_compare(
         mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
diff --git a/tests/python/relay/test_conv2d_nhwc_texture.py b/tests/python/relay/test_conv2d_nhwc_texture.py
index be5cefd46038..cf8116c076cc 100644
--- a/tests/python/relay/test_conv2d_nhwc_texture.py
+++ b/tests/python/relay/test_conv2d_nhwc_texture.py
@@ -598,7 +598,7 @@ def test_conv2d_vgg16_winograd_4d():
     stat_file = temp.relpath("stat.log")
     with open(stat_file, "w") as f:
         f.write(
-            '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd_acc32.image2d", [["TENSOR", [1, 28, 28, 512], "float16"], ["TENSOR", [3, 3, 512, 512], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n'
+            '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 512], "float16"], ["TENSOR", [3, 3, 512, 512], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n'
         )
     graph = build_run_compare(
         mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file

From 54786bbff340426109de7785bb2de4c1dfc2a738 Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Thu, 1 Sep 2022 16:51:33 -0400
Subject: [PATCH 091/704] [hexagon][tests] re-enable maxpool hardware test
 (#12676)

- Re-enable test_max_pool2d_slice.py when run on Hexagon
  hardware (as opposed to hexagon-sim).

  This is now safe because https://github.com/apache/tvm/issues/11928
  has been fixed.
---
 .../python/contrib/test_hexagon/topi/test_max_pool2d_slice.py  | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
index 373a59e0b613..f827f025af17 100644
--- a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
@@ -330,9 +330,6 @@ def test_max_pool2d_slice(
         expected_output_np,
         hexagon_session: Session,
     ):
-        if hexagon_session._launcher._serial_number != "simulator":
-            pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11928")
-
         target_hexagon = tvm.target.hexagon("v69")
         A = te.placeholder(input_shape_padded, name="A", dtype=dtype)
 

From 50dad0d9a3c85f7692025b5330ceb902e264bb92 Mon Sep 17 00:00:00 2001
From: arangasa <76030063+arangasa@users.noreply.github.com>
Date: Fri, 2 Sep 2022 02:49:40 +0530
Subject: [PATCH 092/704] [HEXAGON][TOPI]Slice Op Argmax uint8 (#12472)

---
 python/tvm/topi/hexagon/slice_ops/argmax.py        |  7 +++++++
 .../contrib/test_hexagon/topi/test_argmax_slice.py | 14 ++++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/python/tvm/topi/hexagon/slice_ops/argmax.py b/python/tvm/topi/hexagon/slice_ops/argmax.py
index 4d34cb50a0b0..a3a0ea37c37c 100644
--- a/python/tvm/topi/hexagon/slice_ops/argmax.py
+++ b/python/tvm/topi/hexagon/slice_ops/argmax.py
@@ -43,4 +43,11 @@ def argmax_schedule(argmax_func, in_layout_str, out_layout_str):
             argmax_func, fp16_layout_transform, int32_layout_transform
         )
         return tir_s
+    if (in_layout_str == "nhwc-8h8w32c-2d") and (out_layout_str == "nhw-32h16w-2d"):
+        int8_layout_transform = get_layout_transform_fn(in_layout_str)
+        int32_layout_transform = get_layout_transform_fn(out_layout_str)
+        tir_s = argmax_stir_schedule_nhwc(
+            argmax_func, int8_layout_transform, int32_layout_transform
+        )
+        return tir_s
     raise RuntimeError(f"Unexpected input_layout, output_layout '{in_layout_str, out_layout_str}'")
diff --git a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py
index eaba9fafde3a..32d7a5097384 100644
--- a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """ Tests for Hexagon slice argmax op """
-import pytest
 import numpy as np
 
 import tvm
@@ -33,15 +32,18 @@ class TestArgMaxSlice:
         input_shape,
         input_layout,
         output_layout,
+        dtype,
         in_axis,
         in_axis_sep,
         out_axis_sep,
     ) = tvm.testing.parameters(
-        ((1, 64, 64, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", [3], [4], [3]),
-        ((3, 32, 16, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", [3], [4], [3]),
-        ((1, 32, 32, 64), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", [3], [4], [3]),
+        ((1, 64, 64, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", "float16", [3], [4], [3]),
+        ((3, 32, 16, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", "float16", [3], [4], [3]),
+        ((1, 32, 32, 64), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", "float16", [3], [4], [3]),
+        ((1, 64, 64, 32), "nhwc-8h8w32c-2d", "nhw-32h16w-2d", "int8", [3], [4], [3]),
+        ((3, 32, 16, 32), "nhwc-8h8w32c-2d", "nhw-32h16w-2d", "int8", [3], [4], [3]),
+        ((1, 32, 32, 64), "nhwc-8h8w32c-2d", "nhw-32h16w-2d", "int8", [3], [4], [3]),
     )
-    dtype = tvm.testing.parameter("float16")
     working_scope = tvm.testing.parameter("global.vtcm")
 
     @tvm.testing.fixture
@@ -96,7 +98,7 @@ def test_argmax_slice(
             axis_separators=out_axis_sep,
             mem_scope=working_scope,
         )
-        with tvm.transform.PassContext(opt_level=3, config={"tir.disable_assert": True}):
+        with tvm.transform.PassContext(opt_level=3):
             tir_irm = tvm.lower(tir_s.mod, [argmax_input, output], name="argmax")
             runtime_module = tvm.build(
                 tir_irm, [argmax_input, output], target=target, name="argmax"

From eecb7fd494052ca941f3d123daa2e887f14b7e75 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Thu, 1 Sep 2022 16:44:42 -0700
Subject: [PATCH 093/704] [MetaSchedule] Introduce `Union` and `OrderedUnion`
 in Database (#12628)

Following up #12520 and #12626, this PR introduces two database classes:
`UnionDatabase` and `OrderedUnionDatabase`, both of which allow users to
organically compose multiple databases together, so that the high-level
IR (Relay, Relax) could select the best tuning records according to
running time or a preferred order given by users.

To each query, `UnionDatabase` returns the best record among all the
databases given; Instead, `OrderedUnionDatabase` returns he record from
the first database that responds to the query.

Used together, users may specify complicated dispatching patterns like
below:

Examples below demonstrate the usecases of and difference between
UnionDatabase and OrderDatabase.

Assumption:
* db1, db2 do not have tuning records for the target workload.
* Each of db3, db4, db5 has tuning records r3, r4, r5 for target
workload respectively.

```python
#### Case 1. `UnionDatabase`:
merged_db = ms.database.UnionDatabase(
    db1, # no record
    db2, # no record
    db3, # has r3
    db4  # has r4
)
# returns the better one between r3 and r4
merged_db.query_tuning_record(..., target_workload)

### Case 2. `OrderedUnionDatabase`
merged_db = ms.database.OrderedUnionDatabase(
    db1, # no record
    db2, # no record
    db3, # has r3
    db4  # has r4
)
# returns r3
merged_db.query_tuning_record(..., target_workload)

### Case 3. Mix-use scenario
merged_db = ms.database.UnionDatabase(
    db1, # no record
    db2, # no record
    db3, # has r3
    ms.database.OrderedUnionDatabase( # returns r4
        db4,  # has r4
        db5,  # has r5
    )
)
# returns the better one between r3 and r4
merged_db.query_tuning_record(..., target_workload)

### Case 4. Another mix-use scenario
merged_db = ms.database.UnionDatabase(
    db1, # no record
    db2, # no record
    db3, # has r3
    ms.database.UnionDatabase( # returns the better one between r4 and r5
        db4,  # has r4
        db5,  # has r5
    )
)
# returns the best one among r3, r4 and r5
merged_db.query_tuning_record(..., target_workload)

### Case 5. Yet another mix-use scenario
merged_db = ms.database.OrderedUnionDatabase(
    db1, # no record
    db2, # no record
    ms.database.UnionDatabase( # returns the better one between r3 and r4
        db3, # has r3
        db4, # has r4
    )
    db5,  # has r5
)
# returns the better one between r3 and r4
merged_db.query_tuning_record(..., target_workload)
```

Co-authored-by: sunggg <49998730+sunggg@users.noreply.github.com>
---
 include/tvm/meta_schedule/database.h          |  16 +++
 python/tvm/meta_schedule/database/__init__.py |   2 +
 .../database/ordered_union_database.py        | 112 ++++++++++++++++++
 .../meta_schedule/database/union_database.py  | 112 ++++++++++++++++++
 src/meta_schedule/database/json_database.cc   |  22 ----
 .../database/ordered_union_database.cc        |  86 ++++++++++++++
 src/meta_schedule/database/union_database.cc  |  88 ++++++++++++++
 src/meta_schedule/utils.h                     |  22 ++++
 tests/python/unittest/test_link_params.py     |   9 +-
 .../unittest/test_meta_schedule_database.py   |  37 ++++++
 10 files changed, 477 insertions(+), 29 deletions(-)
 create mode 100644 python/tvm/meta_schedule/database/ordered_union_database.py
 create mode 100644 python/tvm/meta_schedule/database/union_database.py
 create mode 100644 src/meta_schedule/database/ordered_union_database.cc
 create mode 100644 src/meta_schedule/database/union_database.cc

diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h
index 88db2e227786..fa488a38ce0a 100644
--- a/include/tvm/meta_schedule/database.h
+++ b/include/tvm/meta_schedule/database.h
@@ -357,6 +357,22 @@ class Database : public runtime::ObjectRef {
    */
   TVM_DLL static Database JSONDatabase(String path_workload, String path_tuning_record,
                                        bool allow_missing);
+  /*!
+   * \brief A database composed of multiple databases, allowing users to guide IR rewriting using
+   * combined knowledge of those databases. To each query, it returns the best record among all the
+   * databases given.
+   * \param databases The list of databases to be combined.
+   * \return The combined database.
+   */
+  TVM_DLL static Database UnionDatabase(Array<Database, void> databases);
+  /*!
+   * \brief A database composed of multiple databases, allowing users to guide IR rewriting using
+   * combined knowledge of those databases. To each query, it returns the record from the first
+   * database that responds to the query.
+   * \param databases The database to be subsetted.
+   * \return The subsetted database.
+   */
+  TVM_DLL static Database OrderedUnionDatabase(Array<Database, void> databases);
   /*!
    * \brief Create a database with customized methods on the python-side.
    * \param f_has_workload The packed function of `HasWorkload`.
diff --git a/python/tvm/meta_schedule/database/__init__.py b/python/tvm/meta_schedule/database/__init__.py
index 7726daf6eb63..679923e47936 100644
--- a/python/tvm/meta_schedule/database/__init__.py
+++ b/python/tvm/meta_schedule/database/__init__.py
@@ -21,4 +21,6 @@
 from .database import Database, PyDatabase, TuningRecord, Workload
 from .json_database import JSONDatabase
 from .memory_database import MemoryDatabase
+from .ordered_union_database import OrderedUnionDatabase
 from .schedule_fn_database import ScheduleFnDatabase
+from .union_database import UnionDatabase
diff --git a/python/tvm/meta_schedule/database/ordered_union_database.py b/python/tvm/meta_schedule/database/ordered_union_database.py
new file mode 100644
index 000000000000..35b0a9e282c1
--- /dev/null
+++ b/python/tvm/meta_schedule/database/ordered_union_database.py
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""A database consists of multiple databases."""
+from tvm._ffi import register_object
+
+from .. import _ffi_api
+from .database import Database
+
+
+@register_object("meta_schedule.OrderedUnionDatabase")
+class OrderedUnionDatabase(Database):
+    """A database composed of multiple databases, allowing users to guide IR rewriting using
+    combined knowledge of those databases. To each query, it returns the record from the first
+    database that responds to the query.
+
+    Examples
+    --------
+    Examples below demonstrate the usecases of and difference between UnionDatabase and
+    OrderDatabase.
+
+    Assumption:
+    * db1, db2 do not have tuning records for the target workload.
+    * Each of db3, db4, db5 has tuning records r3, r4, r5 for target workload respectively.
+
+    .. code-block:: python
+
+    #### Case 1. `UnionDatabase`:
+    merged_db = ms.database.UnionDatabase(
+        db1, # no record
+        db2, # no record
+        db3, # has r3
+        db4  # has r4
+    )
+    # returns the better one between r3 and r4
+    merged_db.query_tuning_record(..., target_workload)
+
+    ### Case 2. `OrderedUnionDatabase`
+    merged_db = ms.database.OrderedUnionDatabase(
+        db1, # no record
+        db2, # no record
+        db3, # has r3
+        db4  # has r4
+    )
+    # returns r3
+    merged_db.query_tuning_record(..., target_workload)
+
+    ### Case 3. Mix-use scenario
+    merged_db = ms.database.UnionDatabase(
+        db1, # no record
+        db2, # no record
+        db3, # has r3
+        ms.database.OrderedUnionDatabase( # returns r4
+            db4,  # has r4
+            db5,  # has r5
+        )
+    )
+    # returns the better one between r3 and r4
+    merged_db.query_tuning_record(..., target_workload)
+
+    ### Case 4. Another mix-use scenario
+    merged_db = ms.database.UnionDatabase(
+        db1, # no record
+        db2, # no record
+        db3, # has r3
+        ms.database.UnionDatabase( # returns best one between r4 and r5
+            db4,  # has r4
+            db5,  # has r5
+        )
+    )
+    # returns the best one among r3, r4 and r5
+    merged_db.query_tuning_record(..., target_workload)
+
+    ### Case 5. Yet another mix-use scenario
+    merged_db = ms.database.OrderedUnionDatabase(
+        db1, # no record
+        db2, # no record
+        ms.database.UnionDatabase( # returns best one between r3 and r4
+            db3, # has r3
+            db4,  # has r4
+        )
+        db5,  # has r5
+    )
+    # returns the better one between r3 and r4
+    merged_db.query_tuning_record(..., target_workload)
+    """
+
+    def __init__(self, *databases: Database) -> None:
+        """Construct a merged database from multiple databases.
+
+        Parameters
+        ----------
+        *databases : Database
+            The list of databases to combine.
+        """
+        self.__init_handle_by_constructor__(
+            _ffi_api.DatabaseOrderedUnionDatabase,  # type: ignore # pylint: disable=no-member
+            databases,
+        )
diff --git a/python/tvm/meta_schedule/database/union_database.py b/python/tvm/meta_schedule/database/union_database.py
new file mode 100644
index 000000000000..ae55ebe79614
--- /dev/null
+++ b/python/tvm/meta_schedule/database/union_database.py
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""A database consists of multiple databases."""
+from tvm._ffi import register_object
+
+from .. import _ffi_api
+from .database import Database
+
+
+@register_object("meta_schedule.UnionDatabase")
+class UnionDatabase(Database):
+    """A database composed of multiple databases, allowing users to guide IR rewriting using
+    combined knowledge of those databases. To each query, it returns the best record among all the
+    databases given.
+
+    Examples
+    --------
+    Examples below demonstrate the usecases of and difference between UnionDatabase and
+    OrderDatabase.
+
+    Assumption:
+    * db1, db2 do not have tuning records for the target workload.
+    * Each of db3, db4, db5 has tuning records r3, r4, r5 for target workload respectively.
+
+    .. code-block:: python
+
+    #### Case 1. `UnionDatabase`:
+    merged_db = ms.database.UnionDatabase(
+        db1, # no record
+        db2, # no record
+        db3, # has r3
+        db4  # has r4
+    )
+    # returns the better one between r3 and r4
+    merged_db.query_tuning_record(..., target_workload)
+
+    ### Case 2. `OrderedUnionDatabase`
+    merged_db = ms.database.OrderedUnionDatabase(
+        db1, # no record
+        db2, # no record
+        db3, # has r3
+        db4  # has r4
+    )
+    # returns r3
+    merged_db.query_tuning_record(..., target_workload)
+
+    ### Case 3. Mix-use scenario
+    merged_db = ms.database.UnionDatabase(
+        db1, # no record
+        db2, # no record
+        db3, # has r3
+        ms.database.OrderedUnionDatabase( # returns r4
+            db4,  # has r4
+            db5,  # has r5
+        )
+    )
+    # returns the better one between r3 and r4
+    merged_db.query_tuning_record(..., target_workload)
+
+    ### Case 4. Another mix-use scenario
+    merged_db = ms.database.UnionDatabase(
+        db1, # no record
+        db2, # no record
+        db3, # has r3
+        ms.database.UnionDatabase( # returns best one between r4 and r5
+            db4,  # has r4
+            db5,  # has r5
+        )
+    )
+    # returns the best one among r3, r4 and r5
+    merged_db.query_tuning_record(..., target_workload)
+
+    ### Case 5. Yet another mix-use scenario
+    merged_db = ms.database.OrderedUnionDatabase(
+        db1, # no record
+        db2, # no record
+        ms.database.UnionDatabase( # returns best one between r3 and r4
+            db3, # has r3
+            db4,  # has r4
+        )
+        db5,  # has r5
+    )
+    # returns the better one between r3 and r4
+    merged_db.query_tuning_record(..., target_workload)
+    """
+
+    def __init__(self, *databases: Database) -> None:
+        """Construct a merged database from multiple databases.
+
+        Parameters
+        ----------
+        *databases : Database
+            The list of databases to combine.
+        """
+        self.__init_handle_by_constructor__(
+            _ffi_api.DatabaseUnionDatabase,  # type: ignore # pylint: disable=no-member
+            databases,
+        )
diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc
index 2e4f85260835..91b96c82479f 100644
--- a/src/meta_schedule/database/json_database.cc
+++ b/src/meta_schedule/database/json_database.cc
@@ -25,28 +25,6 @@
 namespace tvm {
 namespace meta_schedule {
 
-/*! \brief The struct defining comparison function of sorting by mean run seconds. */
-struct SortTuningRecordByMeanRunSecs {
-  static const constexpr double kMaxMeanTime = 1e10;
-
-  static double Mean(const Array<FloatImm>& a) {
-    if (a.empty()) {
-      return kMaxMeanTime;
-    }
-    double sum = 0.0;
-    for (const FloatImm& i : a) {
-      sum += i->value;
-    }
-    return sum / a.size();
-  }
-
-  bool operator()(const TuningRecord& a, const TuningRecord& b) const {
-    double a_time = Mean(a->run_secs.value_or({}));
-    double b_time = Mean(b->run_secs.value_or({}));
-    return a_time < b_time;
-  }
-};
-
 /*!
  * \brief Read lines from a json file.
  * \param path The path to the json file.
diff --git a/src/meta_schedule/database/ordered_union_database.cc b/src/meta_schedule/database/ordered_union_database.cc
new file mode 100644
index 000000000000..3aaee2112c0c
--- /dev/null
+++ b/src/meta_schedule/database/ordered_union_database.cc
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+class OrderedUnionDatabaseNode : public DatabaseNode {
+ public:
+  Array<Database> databases;
+
+  void VisitAttrs(AttrVisitor* v) { v->Visit("databases", &databases); }
+
+  static constexpr const char* _type_key = "meta_schedule.OrderedUnionDatabase";
+  TVM_DECLARE_FINAL_OBJECT_INFO(OrderedUnionDatabaseNode, DatabaseNode);
+
+ public:
+  Optional<TuningRecord> QueryTuningRecord(const IRModule& mod, const Target& target,
+                                           const String& task_name) final {
+    for (const Database& db : databases) {
+      if (Optional<TuningRecord> record = db->QueryTuningRecord(mod, target, task_name)) {
+        return record;
+      }
+    }
+    return NullOpt;
+  }
+
+  bool HasWorkload(const IRModule& mod) final {
+    LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.HasWorkload";
+    throw;
+  }
+
+  Workload CommitWorkload(const IRModule& mod) final {
+    LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.CommitWorkload";
+    throw;
+  }
+
+  void CommitTuningRecord(const TuningRecord& record) final {
+    LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.CommitTuningRecord";
+    throw;
+  }
+
+  Array<TuningRecord> GetTopK(const Workload& workload, int top_k) final {
+    LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.GetTopK";
+    throw;
+  }
+
+  Array<TuningRecord> GetAllTuningRecords() final {
+    LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.GetAllTuningRecords";
+    throw;
+  }
+
+  int64_t Size() final {
+    LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.size";
+    throw;
+  }
+};
+
+Database Database::OrderedUnionDatabase(Array<Database> databases) {
+  ObjectPtr<OrderedUnionDatabaseNode> n = make_object<OrderedUnionDatabaseNode>();
+  n->databases = std::move(databases);
+  return Database(n);
+}
+
+TVM_REGISTER_NODE_TYPE(OrderedUnionDatabaseNode);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseOrderedUnionDatabase")
+    .set_body_typed(Database::OrderedUnionDatabase);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/database/union_database.cc b/src/meta_schedule/database/union_database.cc
new file mode 100644
index 000000000000..6d19a38c6d9e
--- /dev/null
+++ b/src/meta_schedule/database/union_database.cc
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+class UnionDatabaseNode : public DatabaseNode {
+ public:
+  Array<Database> databases;
+
+  void VisitAttrs(AttrVisitor* v) { v->Visit("databases", &databases); }
+
+  static constexpr const char* _type_key = "meta_schedule.UnionDatabase";
+  TVM_DECLARE_FINAL_OBJECT_INFO(UnionDatabaseNode, DatabaseNode);
+
+ public:
+  Optional<TuningRecord> QueryTuningRecord(const IRModule& mod, const Target& target,
+                                           const String& task_name) final {
+    std::vector<TuningRecord> results;
+    results.reserve(databases.size());
+    for (const Database& db : databases) {
+      if (Optional<TuningRecord> record = db->QueryTuningRecord(mod, target, task_name)) {
+        results.push_back(record.value());
+      }
+    }
+    std::stable_sort(results.begin(), results.end(), SortTuningRecordByMeanRunSecs());
+    return results.empty() ? Optional<TuningRecord>(NullOpt) : results[0];
+  }
+
+  bool HasWorkload(const IRModule& mod) final {
+    LOG(FATAL) << "NotImplementedError: UnionDatabase.HasWorkload";
+    throw;
+  }
+
+  Workload CommitWorkload(const IRModule& mod) final {
+    LOG(FATAL) << "NotImplementedError: UnionDatabase.CommitWorkload";
+    throw;
+  }
+
+  void CommitTuningRecord(const TuningRecord& record) final {
+    LOG(FATAL) << "NotImplementedError: UnionDatabase.CommitTuningRecord";
+    throw;
+  }
+
+  Array<TuningRecord> GetTopK(const Workload& workload, int top_k) final {
+    LOG(FATAL) << "NotImplementedError: UnionDatabase.GetTopK";
+    throw;
+  }
+
+  Array<TuningRecord> GetAllTuningRecords() final {
+    LOG(FATAL) << "NotImplementedError: UnionDatabase.GetAllTuningRecords";
+    throw;
+  }
+
+  int64_t Size() final {
+    LOG(FATAL) << "NotImplementedError: UnionDatabase.size";
+    throw;
+  }
+};
+
+Database Database::UnionDatabase(Array<Database> databases) {
+  ObjectPtr<UnionDatabaseNode> n = make_object<UnionDatabaseNode>();
+  n->databases = std::move(databases);
+  return Database(n);
+}
+
+TVM_REGISTER_NODE_TYPE(UnionDatabaseNode);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseUnionDatabase").set_body_typed(Database::UnionDatabase);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index db37935ec206..ad56fa7f6a52 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -404,6 +404,28 @@ inline Array<Integer> AsIntArray(const ObjectRef& obj) {
   return results;
 }
 
+/*! \brief The struct defining comparison function of sorting by mean run seconds. */
+struct SortTuningRecordByMeanRunSecs {
+  static const constexpr double kMaxMeanTime = 1e10;
+
+  static double Mean(const Array<FloatImm>& a) {
+    if (a.empty()) {
+      return kMaxMeanTime;
+    }
+    double sum = 0.0;
+    for (const FloatImm& i : a) {
+      sum += i->value;
+    }
+    return sum / a.size();
+  }
+
+  bool operator()(const TuningRecord& a, const TuningRecord& b) const {
+    double a_time = Mean(a->run_secs.value_or({}));
+    double b_time = Mean(b->run_secs.value_or({}));
+    return a_time < b_time;
+  }
+};
+
 }  // namespace meta_schedule
 }  // namespace tvm
 
diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py
index b14c18e55f4b..e5b8cd77445f 100644
--- a/tests/python/unittest/test_link_params.py
+++ b/tests/python/unittest/test_link_params.py
@@ -412,17 +412,12 @@ def schedule_fn(sch):
             return True
         return False
 
-    link_params = True
-
     with StringIO() as stderr_buf, redirect_stderr(stderr_buf):
         with ms.database.ScheduleFnDatabase(schedule_fn), tvm.transform.PassContext(
             opt_level=3,
-            config={
-                "relay.backend.use_meta_schedule": True,
-                "relay.FuseOps.link_params": link_params,
-            },
+            config={"relay.backend.use_meta_schedule": True},
         ):
-            executor = Executor("graph", {"link-params": link_params})
+            executor = Executor("graph", {"link-params": True})
             lib = relay.build(relay_mod, target=target, executor=executor)
 
         # Workload look up should succeed. This does not work when the test is invoked from pytest.
diff --git a/tests/python/unittest/test_meta_schedule_database.py b/tests/python/unittest/test_meta_schedule_database.py
index ff0f350d8914..e6342f1c3536 100644
--- a/tests/python/unittest/test_meta_schedule_database.py
+++ b/tests/python/unittest/test_meta_schedule_database.py
@@ -294,5 +294,42 @@ def test_meta_schedule_database_reload():
             _equal_record(ret[1], records[2])
 
 
+def test_meta_schedule_database_union():
+    mod: IRModule = Matmul
+    target = tvm.target.Target("llvm")
+    arg_info = ms.arg_info.ArgInfo.from_prim_func(func=mod["main"])
+    db_1 = ms.database.MemoryDatabase()
+    db_2 = ms.database.MemoryDatabase()
+    trace = _create_schedule(mod, _schedule_matmul).trace
+
+    def query(db):
+        return db.query_tuning_record(mod=mod, target=target, workload_name="main").run_secs
+
+    def commit_record(db, run_sec):
+        db.commit_tuning_record(
+            ms.database.TuningRecord(
+                trace,
+                workload=db.commit_workload(mod),
+                run_secs=[run_sec],
+                target=target,
+                args_info=arg_info,
+            )
+        )
+
+    commit_record(db_1, 1.0)
+    (run_sec,) = query(db_1)
+    assert run_sec.value == 1.0
+
+    commit_record(db_2, 0.5)
+    (run_sec,) = query(db_2)
+    assert run_sec.value == 0.5
+
+    (run_secs,) = query(ms.database.UnionDatabase(db_1, db_2))
+    assert run_secs.value == 0.5
+
+    (run_secs,) = query(ms.database.OrderedUnionDatabase(db_1, db_2))
+    assert run_secs.value == 1.0
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 8ca8f24d54d65be552448e5368d879710930711b Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 1 Sep 2022 18:56:32 -0700
Subject: [PATCH 094/704] [TIR] Handle DeclBuffer in ToSSA (#12679)

---
 include/tvm/tir/stmt.h                                  | 1 +
 src/tir/transforms/ir_utils.cc                          | 9 +++++++++
 tests/python/unittest/test_tir_transform_unroll_loop.py | 9 +++------
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index bee9819a228e..e16d773f02b3 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -713,6 +713,7 @@ class DeclBuffer : public Stmt {
  public:
   TVM_DLL DeclBuffer(Buffer buffer, Stmt body, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(DeclBuffer, Stmt, DeclBufferNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(DeclBufferNode);
 };
 
 /*!
diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc
index 66b04bd67892..b7e3e01f7506 100644
--- a/src/tir/transforms/ir_utils.cc
+++ b/src/tir/transforms/ir_utils.cc
@@ -132,6 +132,15 @@ class IRConvertSSA final : public StmtExprMutator {
     return std::move(output);
   }
 
+  Stmt VisitStmt_(const DeclBufferNode* op) final {
+    DeclBuffer decl = Downcast<DeclBuffer>(StmtExprMutator::VisitStmt_(op));
+    Buffer new_buffer = GetRemappedBuffer(decl->buffer);
+    if (!new_buffer.same_as(decl->buffer)) {
+      decl.CopyOnWrite()->buffer = std::move(new_buffer);
+    }
+    return std::move(decl);
+  }
+
   template <typename Node>
   Node VisitBufferAccess(Node node) {
     Buffer new_buf = GetRemappedBuffer(node->buffer);
diff --git a/tests/python/unittest/test_tir_transform_unroll_loop.py b/tests/python/unittest/test_tir_transform_unroll_loop.py
index 3a638ba45122..a76e6135b3c4 100644
--- a/tests/python/unittest/test_tir_transform_unroll_loop.py
+++ b/tests/python/unittest/test_tir_transform_unroll_loop.py
@@ -117,19 +117,16 @@ class before:
         @T.prim_func
         def main():
             for i in T.unroll(2):
-                with T.allocate([16], "float32", "global") as buf_data:
-                    buf = T.buffer_decl(shape=[16], dtype="float32", data=buf_data)
+                with T.decl_buffer([16], "float32") as buf:
                     buf[0] = 0.0
 
     @tvm.script.ir_module
     class expected:
         @T.prim_func
         def main():
-            with T.allocate([16], "float32", "global") as buf1_data:
-                buf1 = T.buffer_decl(shape=[16], dtype="float32", data=buf1_data)
+            with T.decl_buffer([16], "float32") as buf1:
                 buf1[0] = 0.0
-            with T.allocate([16], "float32", "global") as buf2_data:
-                buf2 = T.buffer_decl(shape=[16], dtype="float32", data=buf2_data)
+            with T.decl_buffer([16], "float32") as buf2:
                 buf2[0] = 0.0
 
     after = tvm.tir.transform.UnrollLoop()(before)

From 4acddb1d036a5f055f5e62f348b18c5e8562140e Mon Sep 17 00:00:00 2001
From: Siyuan Feng <Hzfengsy@sjtu.edu.cn>
Date: Fri, 2 Sep 2022 13:13:20 +0800
Subject: [PATCH 095/704] [COMMUNITY] Yaxing Cai -> Reviewer (#12683)

Please join me in welcoming Yaxing Cai (@cyx-6) as a new reviewer in TVM. Yaxing has brought the PackedFunc into TVM object system ([RFC-051](https://github.com/apache/tvm-rfcs/pull/51)), designed and implemented the new parser infrastructure for TVMScript and meta-programming ([RFC-079](https://github.com/apache/tvm-rfcs/pull/79))

- [Commits History](https://github.com/apache/tvm/commits?author=cyx-6)
- [Code Review](https://github.com/apache/tvm/pulls?q=reviewed-by%3Acyx-6+)
---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 771eb1c63eda..01cf7058a069 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -92,6 +92,7 @@ We do encourage everyone to work anything they are interested in.
 - [Matthew Barrett](https://github.com/mbaret): @mbaret
 - [Arnaud Bergeron](https://github.com/abergeron): @abergeron
 - [Matthew Brookhart](https://github.com/mbrookhart): @mbrookhart
+- [Yaxing Cai](https://github.com/cyx-6): @cyx-6
 - [Liangfu Chen](https://github.com/liangfu): @liangfu
 - [Tianqi Chen](https://github.com/tqchen): @tqchen
 - [Zhi Chen](https://github.com/zhiics): @zhiics

From b2d660006446f720f0c9488f96d28387cbd0d294 Mon Sep 17 00:00:00 2001
From: Yuanjing Shi <yuanjing@octoml.ai>
Date: Fri, 2 Sep 2022 00:24:04 -0700
Subject: [PATCH 096/704] [PyTorch] Fix aten::arange for pytorch (#12681)

fix arange for pytorch nightly 20220815
---
 python/tvm/relay/frontend/pytorch.py | 32 ++++++++++++----------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 2255396c0633..7c52393b8468 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -348,28 +348,24 @@ def _get_type(val, inp_type):
         # - if a dtype is given, start, stop, step are converted to that dtype
         # - if no dtype is given and all args are integral, dtype is int64
         # - if no dtype is given and there is a float arg, dtype is float32
-        if len(inputs) == 5:
-            dtype0 = _get_type(inputs[0], input_types[0])
-            if inputs[1] is not None:
-                dtype = _convert_dtype_value(inputs[1])
-            elif dtype0.startswith("float"):
-                dtype = "float32"
-            else:
-                dtype = "int64"
-            start = _expr.const(0, dtype)
-            stop = _get_value(inputs[0], dtype)
-            step = _expr.const(1, dtype)
-        elif len(inputs) == 7:
-            types = [_get_type(inputs[i], input_types[i]) for i in range(3)]
-            if inputs[3] is not None:
-                dtype = _convert_dtype_value(inputs[3])
+        if len(inputs) in {5, 6, 7}:
+            # inputs look like [_,_,_,dtype,layout,device,requires_grad]
+            # therefore dtype_idx is always the length of inputs minus 4
+            dtype_idx = len(inputs) - 4
+            types = [_get_type(inputs[i], input_types[i]) for i in range(dtype_idx)]
+            if inputs[dtype_idx] is not None:
+                dtype = _convert_dtype_value(inputs[dtype_idx])
             elif any([t.startswith("float") for t in types]):
                 dtype = "float32"
             else:
                 dtype = "int64"
-            start = _get_value(inputs[0], dtype)
-            stop = _get_value(inputs[1], dtype)
-            step = _get_value(inputs[2], dtype)
+
+            # - if len(inputs) == 5, inputs = [stop, dtype, ...]
+            # - if len(inputs) == 6, inputs = [start, stop, dtype, ...]
+            # - if len(inputs) == 7, inputs = [start, stop, step, dtype, ...]
+            start = _get_value(inputs[0], dtype) if len(inputs) > 5 else _expr.const(0, dtype)
+            stop = _get_value(inputs[1 if len(inputs) > 5 else 0], dtype)
+            step = _get_value(inputs[2], dtype) if len(inputs) > 6 else _expr.const(1, dtype)
         else:
             msg = "Unknown number of arguments (%d) to parse." % (len(inputs))
             raise AssertionError(msg)

From bb56f2a972606b33e5479d1e18d4c4f13751eeed Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Fri, 2 Sep 2022 00:47:38 -0700
Subject: [PATCH 097/704] [MetaSchedule][UX] Convenient Object Creation
 (#12643)

This PR introduces a set of `.create` methods making it easier to create
MetaSchedule objects.

For example:

```python
ms.database.JSONDatabase(...)
ms.database.create("json")

ms.runner.RPCRunner(...)
ms.runner.create("rpc")
```

Besides, this PR allows `JSONDatabase` to be created via `work_dir`:

```python
db = ms.database.create("json", work_dir="/path/to/db/")
db = ms.database.create(work_dir="/path/to/db/")  # or even simpler
```
---
 python/tvm/meta_schedule/builder/__init__.py  |  2 +-
 python/tvm/meta_schedule/builder/builder.py   | 17 ++++++++
 python/tvm/meta_schedule/database/__init__.py |  2 +-
 python/tvm/meta_schedule/database/database.py | 41 ++++++++++++++++++-
 .../meta_schedule/database/json_database.py   | 31 +++++++++++---
 python/tvm/meta_schedule/runner/__init__.py   | 12 +++++-
 python/tvm/meta_schedule/runner/runner.py     | 22 +++++++++-
 .../meta_schedule/search_strategy/__init__.py |  2 +-
 .../search_strategy/search_strategy.py        | 29 +++++++++++++
 .../meta_schedule/space_generator/__init__.py |  2 +-
 .../space_generator/space_generator.py        | 28 +++++++++++++
 .../meta_schedule/task_scheduler/__init__.py  |  4 +-
 .../task_scheduler/task_scheduler.py          | 20 +++++++++
 .../meta_schedule/testing/relay_workload.py   |  4 +-
 14 files changed, 198 insertions(+), 18 deletions(-)

diff --git a/python/tvm/meta_schedule/builder/__init__.py b/python/tvm/meta_schedule/builder/__init__.py
index 859c74d75622..ac71e3a0c1fc 100644
--- a/python/tvm/meta_schedule/builder/__init__.py
+++ b/python/tvm/meta_schedule/builder/__init__.py
@@ -19,5 +19,5 @@
 Meta Schedule builders that translate IRModule to runtime.Module,
 and then export
 """
-from .builder import Builder, BuilderInput, BuilderResult, PyBuilder
+from .builder import Builder, BuilderInput, BuilderResult, PyBuilder, create
 from .local_builder import LocalBuilder
diff --git a/python/tvm/meta_schedule/builder/builder.py b/python/tvm/meta_schedule/builder/builder.py
index daa9f7be4214..a2254f243380 100644
--- a/python/tvm/meta_schedule/builder/builder.py
+++ b/python/tvm/meta_schedule/builder/builder.py
@@ -17,6 +17,10 @@
 """Meta Schedule builders that translate IRModule to runtime.Module, and then export"""
 from typing import Callable, Dict, List, Optional
 
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 from tvm._ffi import register_object
 from tvm.ir import IRModule
 from tvm.runtime import NDArray, Object
@@ -164,3 +168,16 @@ def build(self, build_inputs: List[BuilderInput]) -> List[BuilderResult]:
             The results of building the given inputs.
         """
         raise NotImplementedError
+
+
+def create(  # pylint: disable=keyword-arg-before-vararg
+    kind: Literal["local"] = "local",
+    *args,
+    **kwargs,
+) -> Builder:
+    """Create a Builder."""
+    from . import LocalBuilder  # pylint: disable=import-outside-toplevel
+
+    if kind == "local":
+        return LocalBuilder(*args, **kwargs)  # type: ignore
+    raise ValueError(f"Unknown Builder: {kind}")
diff --git a/python/tvm/meta_schedule/database/__init__.py b/python/tvm/meta_schedule/database/__init__.py
index 679923e47936..66d011ed5246 100644
--- a/python/tvm/meta_schedule/database/__init__.py
+++ b/python/tvm/meta_schedule/database/__init__.py
@@ -18,7 +18,7 @@
 The tvm.meta_schedule.database package.
 The database that stores serialized tuning records and workloads
 """
-from .database import Database, PyDatabase, TuningRecord, Workload
+from .database import Database, PyDatabase, TuningRecord, Workload, create
 from .json_database import JSONDatabase
 from .memory_database import MemoryDatabase
 from .ordered_union_database import OrderedUnionDatabase
diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py
index aa509b715132..7a1338f46b20 100644
--- a/python/tvm/meta_schedule/database/database.py
+++ b/python/tvm/meta_schedule/database/database.py
@@ -17,12 +17,16 @@
 """TuningRecord database"""
 from typing import Any, Callable, List, Optional, Union
 
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
+
 from tvm._ffi import register_object
 from tvm.ir.module import IRModule
 from tvm.runtime import Object
 from tvm.target import Target
 from tvm.tir.schedule import Schedule, Trace
-from typing_extensions import Literal  # pylint: disable=wrong-import-order
 
 from .. import _ffi_api
 from ..arg_info import ArgInfo
@@ -483,3 +487,38 @@ def __len__(self) -> int:
             The number of records in the database
         """
         raise NotImplementedError
+
+
+def create(  # pylint: disable=keyword-arg-before-vararg
+    kind: Union[
+        Literal[
+            "json",
+            "memory",
+            "union",
+            "ordered_union",
+        ],
+        Callable[[Schedule], bool],
+    ] = "json",
+    *args,
+    **kwargs,
+) -> Database:
+    """Create a Database."""
+    from . import (  # pylint: disable=import-outside-toplevel
+        JSONDatabase,
+        MemoryDatabase,
+        OrderedUnionDatabase,
+        ScheduleFnDatabase,
+        UnionDatabase,
+    )
+
+    if callable(kind):
+        return ScheduleFnDatabase(kind, *args, **kwargs)  # type: ignore
+    if kind == "json":
+        return JSONDatabase(*args, **kwargs)
+    if kind == "memory":
+        return MemoryDatabase(*args, **kwargs)  # type: ignore
+    if kind == "union":
+        return UnionDatabase(*args, **kwargs)  # type: ignore
+    if kind == "ordered_union":
+        return OrderedUnionDatabase(*args, **kwargs)  # type: ignore
+    raise ValueError(f"Unknown Database: {kind}")
diff --git a/python/tvm/meta_schedule/database/json_database.py b/python/tvm/meta_schedule/database/json_database.py
index 6897b82d9888..b36ac61ef2fb 100644
--- a/python/tvm/meta_schedule/database/json_database.py
+++ b/python/tvm/meta_schedule/database/json_database.py
@@ -15,6 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 """The default database that uses a JSON File to store tuning records"""
+import os.path as osp
+from typing import Optional
+
 from tvm._ffi import register_object
 
 from .. import _ffi_api
@@ -38,21 +41,37 @@ class JSONDatabase(Database):
 
     def __init__(
         self,
-        path_workload: str,
-        path_tuning_record: str,
+        path_workload: Optional[str] = None,
+        path_tuning_record: Optional[str] = None,
+        *,
+        work_dir: Optional[str] = None,
         allow_missing: bool = True,
     ) -> None:
         """Constructor.
 
         Parameters
         ----------
-        path_workload : str
-            The path to the workload table.
-        path_tuning_record : str
-            The path to the tuning record table.
+        path_workload : Optional[str] = None
+            The path to the workload table. If not specified,
+            will be generated from `work_dir` as `$work_dir/database_workload.json`.
+        path_tuning_record : Optional[str] = None
+            The path to the tuning record table. If not specified,
+            will be generated from `work_dir` as `$work_dir/database_tuning_record.json`.
+        work_dir : Optional[str] = None
+            The work directory, if specified, will be used to generate `path_tuning_record`
+            and `path_workload`.
         allow_missing : bool
             Whether to create new file when the given path is not found.
         """
+        if work_dir is not None:
+            if path_workload is None:
+                path_workload = osp.join(work_dir, "database_workload.json")
+            if path_tuning_record is None:
+                path_tuning_record = osp.join(work_dir, "database_tuning_record.json")
+        if path_workload is None:
+            raise ValueError("`path_workload` is not specified.")
+        if path_tuning_record is None:
+            raise ValueError("`path_tuning_record` is not specified.")
         self.__init_handle_by_constructor__(
             _ffi_api.DatabaseJSONDatabase,  # type: ignore # pylint: disable=no-member
             path_workload,
diff --git a/python/tvm/meta_schedule/runner/__init__.py b/python/tvm/meta_schedule/runner/__init__.py
index 413bea6d2fab..f0e1028bbf28 100644
--- a/python/tvm/meta_schedule/runner/__init__.py
+++ b/python/tvm/meta_schedule/runner/__init__.py
@@ -19,6 +19,14 @@
 Meta Schedule runners that runs an artifact either locally or through the RPC interface
 """
 from .config import EvaluatorConfig, RPCConfig
-from .rpc_runner import RPCRunner
 from .local_runner import LocalRunner, LocalRunnerFuture
-from .runner import PyRunner, Runner, RunnerFuture, RunnerInput, RunnerResult, PyRunnerFuture
+from .rpc_runner import RPCRunner
+from .runner import (
+    PyRunner,
+    PyRunnerFuture,
+    Runner,
+    RunnerFuture,
+    RunnerInput,
+    RunnerResult,
+    create,
+)
diff --git a/python/tvm/meta_schedule/runner/runner.py b/python/tvm/meta_schedule/runner/runner.py
index 90b53fde8c29..539e47f15c41 100644
--- a/python/tvm/meta_schedule/runner/runner.py
+++ b/python/tvm/meta_schedule/runner/runner.py
@@ -15,7 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 """Runners"""
-from typing import Callable, Optional, List
+from typing import Callable, List, Optional
+
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 
 from tvm._ffi import register_object
 from tvm.runtime import Object
@@ -223,3 +228,18 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
             The runner futures.
         """
         raise NotImplementedError
+
+
+def create(  # pylint: disable=keyword-arg-before-vararg
+    kind: Literal["local", "rpc"] = "local",
+    *args,
+    **kwargs,
+) -> Runner:
+    """Create a Runner."""
+    from . import LocalRunner, RPCRunner  # pylint: disable=import-outside-toplevel
+
+    if kind == "local":
+        return LocalRunner(*args, **kwargs)  # type: ignore
+    elif kind == "rpc":
+        return RPCRunner(*args, **kwargs)  # type: ignore
+    raise ValueError(f"Unknown Runner: {kind}")
diff --git a/python/tvm/meta_schedule/search_strategy/__init__.py b/python/tvm/meta_schedule/search_strategy/__init__.py
index 2046067d6c00..ffe7e1473954 100644
--- a/python/tvm/meta_schedule/search_strategy/__init__.py
+++ b/python/tvm/meta_schedule/search_strategy/__init__.py
@@ -23,4 +23,4 @@
 from .evolutionary_search import EvolutionarySearch
 from .replay_func import ReplayFunc
 from .replay_trace import ReplayTrace
-from .search_strategy import MeasureCandidate, PySearchStrategy, SearchStrategy
+from .search_strategy import MeasureCandidate, PySearchStrategy, SearchStrategy, create
diff --git a/python/tvm/meta_schedule/search_strategy/search_strategy.py b/python/tvm/meta_schedule/search_strategy/search_strategy.py
index 1cd8a448fe8e..e88cdf825a79 100644
--- a/python/tvm/meta_schedule/search_strategy/search_strategy.py
+++ b/python/tvm/meta_schedule/search_strategy/search_strategy.py
@@ -20,6 +20,10 @@
 """
 from typing import TYPE_CHECKING, Callable, List, Optional
 
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 from tvm._ffi import register_object
 from tvm.runtime import Object
 from tvm.tir.schedule import Schedule
@@ -245,3 +249,28 @@ def notify_runner_results(
             The profiling results from the runner.
         """
         raise NotImplementedError
+
+
+def create(  # pylint: disable=keyword-arg-before-vararg
+    kind: Literal[
+        "evolutionary",
+        "replay_trace",
+        "replay_func",
+    ] = "evolutionary",
+    *args,
+    **kwargs,
+) -> SearchStrategy:
+    """Create a search strategy."""
+    from . import (  # pylint: disable=import-outside-toplevel
+        EvolutionarySearch,
+        ReplayFunc,
+        ReplayTrace,
+    )
+
+    if kind == "evolutionary":
+        return EvolutionarySearch(*args, **kwargs)
+    if kind == "replay_trace":
+        return ReplayTrace(*args, **kwargs)
+    if kind == "replay_func":
+        return ReplayFunc(*args, **kwargs)
+    raise ValueError(f"Unknown SearchStrategy: {kind}")
diff --git a/python/tvm/meta_schedule/space_generator/__init__.py b/python/tvm/meta_schedule/space_generator/__init__.py
index d2039c4511c9..c417ec2d7d4a 100644
--- a/python/tvm/meta_schedule/space_generator/__init__.py
+++ b/python/tvm/meta_schedule/space_generator/__init__.py
@@ -21,5 +21,5 @@
 """
 from .post_order_apply import PostOrderApply
 from .schedule_fn import ScheduleFn
-from .space_generator import PySpaceGenerator, ScheduleFnType, SpaceGenerator
+from .space_generator import PySpaceGenerator, ScheduleFnType, SpaceGenerator, create
 from .space_generator_union import SpaceGeneratorUnion
diff --git a/python/tvm/meta_schedule/space_generator/space_generator.py b/python/tvm/meta_schedule/space_generator/space_generator.py
index 74c29b4de0dd..9d7ebf3bae26 100644
--- a/python/tvm/meta_schedule/space_generator/space_generator.py
+++ b/python/tvm/meta_schedule/space_generator/space_generator.py
@@ -20,6 +20,10 @@
 """
 from typing import TYPE_CHECKING, Callable, List, Optional, Union
 
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 from tvm._ffi import register_object
 from tvm.ir import IRModule
 from tvm.runtime import Object
@@ -132,3 +136,27 @@ def generate_design_space(self, mod: IRModule) -> List[Schedule]:
             The generated design spaces, i.e., schedules.
         """
         raise NotImplementedError
+
+
+def create(  # pylint: disable=keyword-arg-before-vararg
+    kind: Union[
+        Literal["post_order_apply", "union"],
+        ScheduleFnType,
+    ] = "post_order_apply",
+    *args,
+    **kwargs,
+) -> SpaceGenerator:
+    """Create a design space generator."""
+    from . import (  # pylint: disable=import-outside-toplevel
+        PostOrderApply,
+        ScheduleFn,
+        SpaceGeneratorUnion,
+    )
+
+    if callable(kind):
+        return ScheduleFn(kind, *args, **kwargs)  # type: ignore
+    if kind == "post_order_apply":
+        return PostOrderApply(*args, **kwargs)
+    if kind == "union":
+        return SpaceGeneratorUnion(*args, **kwargs)
+    raise ValueError(f"Unknown SpaceGenerator: {kind}")
diff --git a/python/tvm/meta_schedule/task_scheduler/__init__.py b/python/tvm/meta_schedule/task_scheduler/__init__.py
index 1a67aa6f6831..51985570b06f 100644
--- a/python/tvm/meta_schedule/task_scheduler/__init__.py
+++ b/python/tvm/meta_schedule/task_scheduler/__init__.py
@@ -20,6 +20,6 @@
 for measure candidates generation and measurement, then save
 records to the database.
 """
-from .task_scheduler import TaskScheduler, PyTaskScheduler
-from .round_robin import RoundRobin
 from .gradient_based import GradientBased
+from .round_robin import RoundRobin
+from .task_scheduler import PyTaskScheduler, TaskScheduler, create
diff --git a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py
index 3d57a6b01b9d..29a5f18dfb8a 100644
--- a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py
+++ b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py
@@ -19,6 +19,11 @@
 import logging
 from typing import Callable, List, Optional
 
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
+
 from tvm._ffi import register_object
 from tvm.runtime import Object
 
@@ -255,3 +260,18 @@ def touch_task(self, task_id: int) -> None:
         """
         # Using self._outer to replace the self pointer
         _ffi_api.TaskSchedulerTouchTask(self._outer(), task_id)  # type: ignore # pylint: disable=no-member
+
+
+def create(  # pylint: disable=keyword-arg-before-vararg
+    kind: Literal["round-robin", "gradient"] = "gradient",
+    *args,
+    **kwargs,
+) -> "TaskScheduler":
+    """Create a task scheduler."""
+    from . import GradientBased, RoundRobin  # pylint: disable=import-outside-toplevel
+
+    if kind == "round-robin":
+        return RoundRobin(*args, **kwargs)
+    if kind == "gradient":
+        return GradientBased(*args, **kwargs)
+    raise ValueError(f"Unknown TaskScheduler name: {kind}")
diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py
index 016263489527..f4f6336df33f 100644
--- a/python/tvm/meta_schedule/testing/relay_workload.py
+++ b/python/tvm/meta_schedule/testing/relay_workload.py
@@ -85,7 +85,7 @@ def _get_network(
                 "float32": torch.float32,  # pylint: disable=no-member
             }[dtype]
         )
-        scripted_model = torch.jit.trace(model, input_data).eval()
+        scripted_model = torch.jit.trace(model, input_data).eval()  # type: ignore
         input_name = "input0"
         shape_list = [(input_name, input_shape)]
         mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
@@ -149,7 +149,7 @@ def _get_network(
         input_dtype = "int64"
         a = torch.randint(10000, input_shape)  # pylint: disable=no-member
         model.eval()
-        scripted_model = torch.jit.trace(model, [a], strict=False)
+        scripted_model = torch.jit.trace(model, [a], strict=False)  # type: ignore
         input_name = "input_ids"
         shape_list = [(input_name, input_shape)]
         mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

From 445a14f4c637ea88f4a1c39ed238da752fc6cecf Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Fri, 2 Sep 2022 08:53:44 +0100
Subject: [PATCH 098/704] [ETHOSN] Fix some more pylint issues (#12675)

Fixing a few more pylint issues caught when using pylint==2.9.3.

Change-Id: Ie7ca61e1a8083a40e0ffccf1418192966884707a
---
 tests/python/contrib/test_ethosn/infrastructure.py     |  3 ++-
 .../contrib/test_ethosn/test_convert_equivalents.py    |  1 +
 tests/python/contrib/test_ethosn/test_networks.py      | 10 ++++++----
 tests/python/contrib/test_ethosn/test_reshape.py       |  6 ++++--
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
index 0071b1a7f52e..c658b33747c3 100644
--- a/tests/python/contrib/test_ethosn/infrastructure.py
+++ b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -67,7 +67,8 @@ def assert_lib_hash(lib, golden):
     for mod in lib.imported_modules:
         if mod.type_key == "ethos-n":
             mod.save(path)
-            lib_hash = md5(open(path, "rb").read()).hexdigest()
+            with open(path, "rb") as compiled_model:
+                lib_hash = md5(compiled_model.read()).hexdigest()
             hash_set.add(lib_hash)
 
     assert hash_set == golden, "Expected hash: {} Got hash: {}".format(golden, hash_set)
diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
index fe9b346691b6..c8d1b5729d83 100644
--- a/tests/python/contrib/test_ethosn/test_convert_equivalents.py
+++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
@@ -158,6 +158,7 @@ class ConversionChecker(ExprVisitor):
 
         sequence = ["qnn.conv2d", "nn.bias_add", "qnn.requantize"]
 
+        # pylint: disable=invalid-name
         def visit_function(self, fn):
             composite_name = fn.attrs["Composite"]
             expected = "ethos-n.qnn_conv2d"
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index db1b41244846..b584a579b8be 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -34,10 +34,7 @@ def _get_tflite_model(tflite_model_path, inputs_dict, dtype):
     with open(tflite_model_path, "rb") as f:
         tflite_model_buffer = f.read()
 
-    try:
-        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buffer, 0)
-    except AttributeError:
-        tflite_model = tflite.Model.GetRootAsModel(tflite_model_buffer, 0)
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buffer, 0)
     shape_dict = {}
     dtype_dict = {}
     for value in inputs_dict:
@@ -116,6 +113,7 @@ def get_model():
 
 @requires_ethosn
 def test_mobilenet_v1():
+    """Compare compile hashes for mobilenetv1 with an expected result."""
     # If this test is failing due to a hash mismatch, please notify @lhutton1 and
     # @Leo-arm. The hash is there to catch any changes in the behaviour of the
     # codegen, which could come about from either a change in Support Library
@@ -137,6 +135,7 @@ def test_mobilenet_v1():
 
 @requires_ethosn
 def test_resnet_50_int8():
+    """Compare compile hashes for resnet50 with an expected result."""
     # If this test is failing due to a hash mismatch, please notify @lhutton1 and
     # @Leo-arm. The hash is there to catch any changes in the behaviour of the
     # codegen, which could come about from either a change in Support Library
@@ -157,6 +156,7 @@ def test_resnet_50_int8():
 
 @requires_ethosn
 def test_inception_v3():
+    """Compare compile hashes for inceptionv3 with an expected result."""
     # If this test is failing due to a hash mismatch, please notify @lhutton1 and
     # @Leo-arm. The hash is there to catch any changes in the behaviour of the
     # codegen, which could come about from either a change in Support Library
@@ -177,6 +177,7 @@ def test_inception_v3():
 
 @requires_ethosn
 def test_inception_v4():
+    """Compare compile hashes for inceptionv4 with an expected result."""
     # If this test is failing due to a hash mismatch, please notify @lhutton1 and
     # @Leo-arm. The hash is there to catch any changes in the behaviour of the
     # codegen, which could come about from either a change in Support Library
@@ -197,6 +198,7 @@ def test_inception_v4():
 
 @requires_ethosn
 def test_ssd_mobilenet_v1():
+    """Compare compile hashes for ssdmobilenetv1 with an expected result."""
     # If this test is failing due to a hash mismatch, please notify @lhutton1 and
     # @Leo-arm. The hash is there to catch any changes in the behaviour of the
     # codegen, which could come about from either a change in Support Library
diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py
index e165cea9c63b..2d6eae9b2522 100644
--- a/tests/python/contrib/test_ethosn/test_reshape.py
+++ b/tests/python/contrib/test_ethosn/test_reshape.py
@@ -17,11 +17,13 @@
 
 """Arm(R) Ethos(TM)-N integration reshape tests"""
 
+import numpy as np
+import pytest
+
 import tvm
 from tvm import relay
 from tvm.testing import requires_ethosn
-import numpy as np
-import pytest
+
 from . import infrastructure as tei
 
 
From 0549a08f4de40a5a0db277cfe1ae00ab22fc9107 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Fri, 2 Sep 2022 13:52:23 +0100
Subject: [PATCH 099/704] [ETHOSN] Add support for concatenate with negative
 axis (#12686)

Supports offloading concatenate with a negative axis to the NPU. In addition, parameterized the concatenate unit tests.
---
 .../backend/contrib/ethosn/ethosn_api.cc      |  7 ++-
 .../contrib/test_ethosn/test_concatenate.py   | 49 ++++++++++---------
 2 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc
index 55e8901dae08..4f01c924cf6e 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.cc
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc
@@ -520,7 +520,12 @@ EthosnError EthosnAPI::LeakyReLU(const Expr& expr, LeakyReLUParams* params) {
 EthosnError EthosnAPI::Concatenate(const Expr& expr, ConcatenateParams* params) {
   Call call = Downcast<Call>(expr);
   const auto& attrs = call->attrs.as<ConcatenateAttrs>();
-  params->concat_info.m_Axis = attrs->axis;
+  int axis = attrs->axis;
+  if (axis < 0) {
+    int output_dims = Downcast<TensorType>(call->checked_type())->shape.size();
+    axis = output_dims + axis;
+  }
+  params->concat_info.m_Axis = axis;
 
   float output_sc;
   int output_zp;
diff --git a/tests/python/contrib/test_ethosn/test_concatenate.py b/tests/python/contrib/test_ethosn/test_concatenate.py
index cd4ec7a4e4b2..0389b3c5b103 100644
--- a/tests/python/contrib/test_ethosn/test_concatenate.py
+++ b/tests/python/contrib/test_ethosn/test_concatenate.py
@@ -56,33 +56,35 @@ def _get_model(shapes, dtype, axis):
 
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_concatenate(dtype):
-    """Compare Concatenate output with TVM."""
-
-    trials = [
+@pytest.mark.parametrize(
+    "shapes,axis",
+    [
         ([(1, 4), (1, 6)], 1),
         ([(1, 16, 4), (1, 16, 4)], 1),
         ([(1, 25, 4, 16)] * 3, 3),
         ([(1, 25, 4, 16), (1, 25, 5, 16), (1, 25, 6, 16)], 2),
-    ]
-
+        ([(1, 4), (1, 6)], -1),
+        ([(1, 16, 4), (1, 16, 4)], -2),
+    ],
+)
+def test_concatenate(dtype, shapes, axis):
+    """Compare Concatenate output with TVM."""
     np.random.seed(0)
-    for shapes, axis in trials:
-        outputs = []
-        inputs = _get_inputs(shapes, dtype)
-        for npu in [False, True]:
-            model = _get_model(shapes, dtype, axis)
-            mod = tei.make_module(model, {})
-            outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+
+    outputs = []
+    inputs = _get_inputs(shapes, dtype)
+    for npu in [False, True]:
+        model = _get_model(shapes, dtype, axis)
+        mod = tei.make_module(model, {})
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
 
         tei.verify(outputs, dtype, 0)
 
 
 @requires_ethosn
-def test_concatenate_failure():
-    """Check Concatenate error messages."""
-
-    trials = [
+@pytest.mark.parametrize(
+    "shapes,dtype,axis,err_msg",
+    [
         ([(1, 4, 4, 4, 4), (1, 4, 4, 4, 4)], "uint8", 1, "dimensions=5, dimensions must be <= 4;"),
         (
             [(1, 4, 4, 4), (1, 4, 4, 4)],
@@ -110,9 +112,10 @@ def test_concatenate_failure():
             0,
             "Concatenation cannot be performed along batch axis (axis 0);",
         ),
-    ]
-
-    for shapes, dtype, axis, err_msg in trials:
-        model = _get_model(shapes, dtype, axis)
-        mod = tei.make_ethosn_partition(model)
-        tei.test_error(mod, {}, err_msg)
+    ],
+)
+def test_concatenate_failure(shapes, dtype, axis, err_msg):
+    """Check Concatenate error messages."""
+    model = _get_model(shapes, dtype, axis)
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)

From 7c7b0f7a2fb7833a3afe8900f8b38ccf144f96f0 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 2 Sep 2022 09:44:22 -0700
Subject: [PATCH 100/704] [ci][tvmbot] Trigger GitHub Actions after merging
 (#12361)

This fixes the issue where merging from GitHub Actions (i.e. with the default `GITHUB_TOKEN`) doesn't trigger post merge GitHub Actions on the commit it creates in `main`. Instead these jobs are triggered manually by a call to the Actions API after the merge has taken place.

This also updates the tvmbot testing code (and by extension some of the other CI testing code) to remove the fixtures for each test in favor of constructing them from a single sample at runtime, this makes it a lot easier to add new tests and see what is different between each data sample and clean up the testing anti-patterns that were there before (e.g. `run()` instead of `pytest.mark.parameterize`, but none of the tests in `test_ci.py` have changed)

Tested in https://github.com/driazati/tvm/pull/36 which ran https://github.com/driazati/tvm/actions/runs/2881047903
---
 ci/scripts/github_tvmbot.py                   |  22 +-
 tests/python/ci/sample_prs/pr10786-badci.json | 130 ---
 .../sample_prs/pr10786-changes-requested.json | 131 ---
 .../ci/sample_prs/pr10786-co-authors.json     | 129 ---
 .../ci/sample_prs/pr10786-invalid-author.json | 130 ---
 .../python/ci/sample_prs/pr10786-merges.json  | 129 ---
 .../ci/sample_prs/pr10786-missing-job.json    | 129 ---
 .../ci/sample_prs/pr10786-nottriggered.json   | 129 ---
 .../ci/sample_prs/pr10786-oldreview.json      | 129 ---
 ...{pr10786-ignore-jobs.json => pr10786.json} |   5 +-
 .../pr11244-unauthorized-comment.json         | 103 ---
 .../ci/sample_prs/pr11267-no-review.json      | 144 ----
 .../ci/sample_prs/pr11442-rerun-ci.json       | 183 ----
 tests/python/ci/test_ci.py                    | 803 ++++++++----------
 tests/python/ci/test_tvmbot.py                | 400 +++++----
 tests/python/ci/test_utils.py                 |  33 +-
 16 files changed, 624 insertions(+), 2105 deletions(-)
 delete mode 100644 tests/python/ci/sample_prs/pr10786-badci.json
 delete mode 100644 tests/python/ci/sample_prs/pr10786-changes-requested.json
 delete mode 100644 tests/python/ci/sample_prs/pr10786-co-authors.json
 delete mode 100644 tests/python/ci/sample_prs/pr10786-invalid-author.json
 delete mode 100644 tests/python/ci/sample_prs/pr10786-merges.json
 delete mode 100644 tests/python/ci/sample_prs/pr10786-missing-job.json
 delete mode 100644 tests/python/ci/sample_prs/pr10786-nottriggered.json
 delete mode 100644 tests/python/ci/sample_prs/pr10786-oldreview.json
 rename tests/python/ci/sample_prs/{pr10786-ignore-jobs.json => pr10786.json} (78%)
 delete mode 100644 tests/python/ci/sample_prs/pr11244-unauthorized-comment.json
 delete mode 100644 tests/python/ci/sample_prs/pr11267-no-review.json
 delete mode 100644 tests/python/ci/sample_prs/pr11442-rerun-ci.json

diff --git a/ci/scripts/github_tvmbot.py b/ci/scripts/github_tvmbot.py
index 3a39e69694d8..ee9607dd0254 100755
--- a/ci/scripts/github_tvmbot.py
+++ b/ci/scripts/github_tvmbot.py
@@ -195,6 +195,7 @@ def __init__(
         self.number = number
         self.repo_name = repo
         self.dry_run = dry_run
+        self.has_error = False
 
         if dry_run and raw_data:
             # In test mode there is no need to fetch anything
@@ -468,7 +469,10 @@ def find_missing_expected_jobs(self) -> List[str]:
 
     def trigger_gha_ci(self, sha: str) -> None:
         logging.info(f"POST-ing a workflow_dispatch event to main.yml")
-        r = self.github.post(
+        actions_github = GitHubRepo(
+            user=self.github.user, repo=self.github.repo, token=GH_ACTIONS_TOKEN
+        )
+        r = actions_github.post(
             url="actions/workflows/main.yml/dispatches",
             data={
                 "ref": "main",
@@ -537,9 +541,12 @@ def rerun_github_actions(self) -> None:
 
         workflow_ids = list(set(workflow_ids))
         logging.info(f"Rerunning GitHub Actions workflows with IDs: {workflow_ids}")
-        actions_github = GitHubRepo(
-            user=self.github.user, repo=self.github.repo, token=GH_ACTIONS_TOKEN
-        )
+        if self.dry_run:
+            actions_github = None
+        else:
+            actions_github = GitHubRepo(
+                user=self.github.user, repo=self.github.repo, token=GH_ACTIONS_TOKEN
+            )
         for workflow_id in workflow_ids:
             if self.dry_run:
                 logging.info(f"Dry run, not restarting workflow {workflow_id}")
@@ -576,6 +583,7 @@ def comment_failure(self, msg: str, exceptions: Union[Exception, List[Exception]
             comment += "</details>"
 
         pr.comment(comment)
+        pr.has_error = True
         return exception
 
 
@@ -750,6 +758,9 @@ def run(pr: PR):
     for name, check in command_to_run.auth:
         if check(pr, comment, args):
             logging.info(f"Passed auth check '{name}', continuing")
+            # Only one authorization check needs to pass (e.g. just mentionable
+            # or PR author), not all of them so quit
+            break
         else:
             logging.info(f"Failed auth check '{name}', quitting")
             # Add a sad face
@@ -767,3 +778,6 @@ def run(pr: PR):
 
     # Run the command
     command_to_run.run(pr)
+
+    if pr.has_error:
+        raise RuntimeError("PR commented a failure")
diff --git a/tests/python/ci/sample_prs/pr10786-badci.json b/tests/python/ci/sample_prs/pr10786-badci.json
deleted file mode 100644
index 7e9d10d0b648..000000000000
--- a/tests/python/ci/sample_prs/pr10786-badci.json
+++ /dev/null
@@ -1,130 +0,0 @@
-{
-  "title": "[Hexagon] 2-d allocation cleanup",
-  "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\".  The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`.  The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions.  Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw <astraw@octoml.ai>",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": []
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "Eric Lunderberg",
-                "email": "elunderberg@octoml.ai"
-              },
-              {
-                "name": "Adam Straw",
-                "email": "astraw@octoml.ai"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "MacOS",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945392"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945029"
-                },
-                {
-                  "name": "tag-teams",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "Teams"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945030"
-                },
-                {
-                  "name": "Windows",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945524"
-                },
-                {
-                  "state": "FAILED",
-                  "context": "tvm-ci/pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "APPROVED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "body": "@tvm-bot merge",
-        "updatedAt": "2022-03-25T22:13:50Z",
-        "authorCanPushToRepository": true,
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd"
-        },
-        "id": 123,
-        "author": {
-          "login": "kparzysz-quic"
-        },
-        "state": "APPROVED"
-      }
-    ]
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/sample_prs/pr10786-changes-requested.json b/tests/python/ci/sample_prs/pr10786-changes-requested.json
deleted file mode 100644
index 24e261099a4f..000000000000
--- a/tests/python/ci/sample_prs/pr10786-changes-requested.json
+++ /dev/null
@@ -1,131 +0,0 @@
-{
-  "title": "[Hexagon] 2-d allocation cleanup",
-  "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\".  The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`.  The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions.  Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw <astraw@octoml.ai>",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": []
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "Eric Lunderberg",
-                "email": "elunderberg@octoml.ai"
-              },
-              {
-                "name": "Adam Straw",
-                "email": "astraw@octoml.ai"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "MacOS",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945392"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945029"
-                },
-                {
-                  "name": "tag-teams",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "Teams"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945030"
-                },
-                {
-                  "name": "Windows",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945524"
-                },
-                {
-                  "state": "SUCCESS",
-                  "context": "tvm-ci/pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "CHANGES_REQUESTED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "body": "@tvm-bot merge",
-        "updatedAt": "2022-03-25T22:13:50Z",
-        "url": "https://github.com/apache/tvm/pull/10786#pullrequestreview-922186273",
-        "authorCanPushToRepository": true,
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd"
-        },
-        "id": 123,
-        "author": {
-          "login": "kparzysz-quic"
-        },
-        "state": "CHANGES_REQUESTED"
-      }
-    ]
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/sample_prs/pr10786-co-authors.json b/tests/python/ci/sample_prs/pr10786-co-authors.json
deleted file mode 100644
index 75f272825059..000000000000
--- a/tests/python/ci/sample_prs/pr10786-co-authors.json
+++ /dev/null
@@ -1,129 +0,0 @@
-{
-  "title": "[Hexagon] 2-d allocation cleanup",
-  "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\".  The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`.  The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions.  Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw <astraw@octoml.ai>",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": []
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "Eric Lunderberg",
-                "email": "elunderberg@octoml.ai"
-              },
-              {
-                "name": "Some One",
-                "email": "someone@email.com"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "MacOS",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945392"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945029"
-                },
-                {
-                  "name": "tag-teams",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "Teams"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945030"
-                },
-                {
-                  "name": "Windows",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945524"
-                },
-                {
-                  "state": "SUCCESS",
-                  "context": "tvm-ci/pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "APPROVED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "body": "@tvm-bot merge",
-        "updatedAt": "2022-03-25T22:13:50Z",
-        "authorCanPushToRepository": true,
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd"
-        },
-        "author": {
-          "login": "kparzysz-quic"
-        },
-        "state": "APPROVED"
-      }
-    ]
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/sample_prs/pr10786-invalid-author.json b/tests/python/ci/sample_prs/pr10786-invalid-author.json
deleted file mode 100644
index 81b028e3196a..000000000000
--- a/tests/python/ci/sample_prs/pr10786-invalid-author.json
+++ /dev/null
@@ -1,130 +0,0 @@
-{
-  "title": "[Hexagon] 2-d allocation cleanup",
-  "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\".  The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`.  The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions.  Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw <astraw@octoml.ai>",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": []
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "Eric Lunderberg",
-                "email": "elunderberg@octoml.ai"
-              },
-              {
-                "name": "Adam Straw",
-                "email": "astraw@octoml.ai"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "MacOS",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945392"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945029"
-                },
-                {
-                  "name": "tag-teams",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "Teams"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945030"
-                },
-                {
-                  "name": "Windows",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945524"
-                },
-                {
-                  "state": "SUCCESS",
-                  "context": "tvm-ci/pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "APPROVED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "body": "@tvm-bot merge",
-        "id": 123,
-        "updatedAt": "2022-03-25T22:13:50Z",
-        "authorCanPushToRepository": false,
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd"
-        },
-        "author": {
-          "login": "kparzysz-quic"
-        },
-        "state": "APPROVED"
-      }
-    ]
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/sample_prs/pr10786-merges.json b/tests/python/ci/sample_prs/pr10786-merges.json
deleted file mode 100644
index 0226c8ab5245..000000000000
--- a/tests/python/ci/sample_prs/pr10786-merges.json
+++ /dev/null
@@ -1,129 +0,0 @@
-{
-  "title": "[Hexagon] 2-d allocation cleanup",
-  "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\".  The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`.  The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free.\n\n\nThanks for contributing to TVM!   Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.\n\n\nPreviously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions.  Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\n\n\ncc @someone\n\r\n\r\nCo-authored-by: Adam Straw <astraw@octoml.ai>\n\n\nThanks for contributing to TVM!   Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.\n\n",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": []
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "Eric Lunderberg",
-                "email": "elunderberg@octoml.ai"
-              },
-              {
-                "name": "Adam Straw",
-                "email": "astraw@octoml.ai"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "MacOS",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945392"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945029"
-                },
-                {
-                  "name": "tag-teams",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "Teams"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945030"
-                },
-                {
-                  "name": "Windows",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945524"
-                },
-                {
-                  "state": "SUCCESS",
-                  "context": "tvm-ci/pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "APPROVED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "body": "@tvm-bot merge",
-        "updatedAt": "2022-03-25T22:13:50Z",
-        "authorCanPushToRepository": true,
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd"
-        },
-        "author": {
-          "login": "kparzysz-quic"
-        },
-        "state": "APPROVED"
-      }
-    ]
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/sample_prs/pr10786-missing-job.json b/tests/python/ci/sample_prs/pr10786-missing-job.json
deleted file mode 100644
index 13739b793fb5..000000000000
--- a/tests/python/ci/sample_prs/pr10786-missing-job.json
+++ /dev/null
@@ -1,129 +0,0 @@
-{
-  "title": "[Hexagon] 2-d allocation cleanup",
-  "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\".  The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`.  The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions.  Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw <astraw@octoml.ai>",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": []
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "Eric Lunderberg",
-                "email": "elunderberg@octoml.ai"
-              },
-              {
-                "name": "Adam Straw",
-                "email": "astraw@octoml.ai"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "MacOS",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945392"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945029"
-                },
-                {
-                  "name": "tag-teams",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "Teams"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945030"
-                },
-                {
-                  "name": "Windows",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945524"
-                },
-                {
-                  "state": "SUCCESS",
-                  "context": "tvm-ci/definitely-not-pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "APPROVED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "body": "@tvm-bot merge",
-        "updatedAt": "2022-03-25T22:13:50Z",
-        "authorCanPushToRepository": true,
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd"
-        },
-        "author": {
-          "login": "kparzysz-quic"
-        },
-        "state": "APPROVED"
-      }
-    ]
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/sample_prs/pr10786-nottriggered.json b/tests/python/ci/sample_prs/pr10786-nottriggered.json
deleted file mode 100644
index 0da541c4342d..000000000000
--- a/tests/python/ci/sample_prs/pr10786-nottriggered.json
+++ /dev/null
@@ -1,129 +0,0 @@
-{
-  "title": "[Hexagon] 2-d allocation cleanup",
-  "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\".  The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`.  The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions.  Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw <astraw@octoml.ai>",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": []
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "Eric Lunderberg",
-                "email": "elunderberg@octoml.ai"
-              },
-              {
-                "name": "Adam Straw",
-                "email": "astraw@octoml.ai"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "MacOS",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945392"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945029"
-                },
-                {
-                  "name": "tag-teams",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "Teams"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945030"
-                },
-                {
-                  "name": "Windows",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945524"
-                },
-                {
-                  "state": "SUCCESS",
-                  "context": "tvm-ci/pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "APPROVED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "body": "",
-        "updatedAt": "2022-03-25T22:13:50Z",
-        "authorCanPushToRepository": true,
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd"
-        },
-        "author": {
-          "login": "kparzysz-quic"
-        },
-        "state": "APPROVED"
-      }
-    ]
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/sample_prs/pr10786-oldreview.json b/tests/python/ci/sample_prs/pr10786-oldreview.json
deleted file mode 100644
index 1a2556cb6f5f..000000000000
--- a/tests/python/ci/sample_prs/pr10786-oldreview.json
+++ /dev/null
@@ -1,129 +0,0 @@
-{
-  "title": "[Hexagon] 2-d allocation cleanup",
-  "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\".  The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`.  The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions.  Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw <astraw@octoml.ai>",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": []
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "Eric Lunderberg",
-                "email": "elunderberg@octoml.ai"
-              },
-              {
-                "name": "Adam Straw",
-                "email": "astraw@octoml.ai"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "MacOS",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945392"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945029"
-                },
-                {
-                  "name": "tag-teams",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "Teams"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945030"
-                },
-                {
-                  "name": "Windows",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/5694945524"
-                },
-                {
-                  "state": "SUCCESS",
-                  "context": "tvm-ci/pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "APPROVED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "body": "@tvm-bot merge",
-        "updatedAt": "2022-03-25T22:13:50Z",
-        "authorCanPushToRepository": true,
-        "commit": {
-          "oid": "abc12345"
-        },
-        "author": {
-          "login": "kparzysz-quic"
-        },
-        "state": "APPROVED"
-      }
-    ]
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/sample_prs/pr10786-ignore-jobs.json b/tests/python/ci/sample_prs/pr10786.json
similarity index 78%
rename from tests/python/ci/sample_prs/pr10786-ignore-jobs.json
rename to tests/python/ci/sample_prs/pr10786.json
index dfcd806ff14b..79f20ca6094b 100644
--- a/tests/python/ci/sample_prs/pr10786-ignore-jobs.json
+++ b/tests/python/ci/sample_prs/pr10786.json
@@ -1,6 +1,6 @@
 {
     "title": "[Hexagon] 2-d allocation cleanup",
-    "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\".  The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`.  The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions.  Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw <astraw@octoml.ai>",
+    "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\".  The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`.  The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free.\n\n\nThanks for contributing to TVM!   Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.\n\n\nPreviously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions.  Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\n\n\ncc @someone\n\r\n\r\nCo-authored-by: Adam Straw <astraw@octoml.ai>\n\n\nThanks for contributing to TVM!   Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.\n\n",
     "state": "OPEN",
     "author": {
       "login": "abc"
@@ -65,7 +65,7 @@
                       }
                     },
                     "status": "COMPLETED",
-                    "conclusion": "FAILED",
+                    "conclusion": "SUCCESS",
                     "url": "https://github.com/apache/tvm/runs/5694945029"
                   },
                   {
@@ -119,7 +119,6 @@
           "commit": {
             "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd"
           },
-          "id": 123,
           "author": {
             "login": "kparzysz-quic"
           },
diff --git a/tests/python/ci/sample_prs/pr11244-unauthorized-comment.json b/tests/python/ci/sample_prs/pr11244-unauthorized-comment.json
deleted file mode 100644
index beafc05958b6..000000000000
--- a/tests/python/ci/sample_prs/pr11244-unauthorized-comment.json
+++ /dev/null
@@ -1,103 +0,0 @@
-{
-  "title": "[CRT runtime] Added functions TVMPlatformPreFuncCall and TVMPlatformPostFuncCall",
-  "body": "See [this thread ](https://discuss.tvm.apache.org/t/crt-add-platform-specific-pre-and-post-function-calls-in-crt-runtime/12723)for an explanation.",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "authorAssociation": "NONE",
-        "author": {
-          "login": "abc"
-        },
-        "updatedAt": "2022-05-09T13:39:04Z",
-        "body": "@tvm-bot merge"
-      },
-      {
-        "authorAssociation": "CONTRIBUTOR",
-        "author": {
-          "login": "areusch"
-        },
-        "updatedAt": "2022-05-11T19:22:01Z",
-        "body": "i commented on the discuss forum thread. let's resolve there and then continue this PR."
-      }
-    ]
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "Federico Peccia",
-                "email": "peccia@fzi.de"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "79d355c5f837b3bdadb5d25b2a5d0d2802783ae2",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6352791017"
-                },
-                {
-                  "name": "tag-teams",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "Teams"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6352791014"
-                },
-                {
-                  "state": "ERROR",
-                  "context": "tvm-ci/pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-11244/1/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "REVIEW_REQUIRED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": []
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/sample_prs/pr11267-no-review.json b/tests/python/ci/sample_prs/pr11267-no-review.json
deleted file mode 100644
index d2ad164673e5..000000000000
--- a/tests/python/ci/sample_prs/pr11267-no-review.json
+++ /dev/null
@@ -1,144 +0,0 @@
-{
-  "title": "[ci][docker] Use sccache everywhere by default",
-  "body": "This adds `/opt/sccache` to the PATH of each of the CI docker images so when cmake looks for a C compiler it will pick up the sccache wrapper by default. This fixes some issues where compiler invocations weren't being run though sccache. With this approach the invoker doesn't need to do anything specific to set up sccache.\n\nThis will require a follow up PR to update the Docker images and remove some of the sccache logic in `task_build.py`\n\n\n\ncc @Mousius @areusch",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "authorAssociation": "CONTRIBUTOR",
-        "author": {
-          "login": "areusch"
-        },
-        "id": 124,
-        "updatedAt": "2022-05-11T16:54:32Z",
-        "body": "just confirming--we can disable this when doing a local build, correct? what's the mechanism by which we do that?"
-      },
-      {
-        "authorAssociation": "COLLABORATOR",
-        "author": {
-          "login": "driazati"
-        },
-        "id": 123,
-        "updatedAt": "2022-05-11T18:46:54Z",
-        "body": "@tvm-bot merge"
-      }
-    ]
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "driazati",
-                "email": "driazati@users.noreply.github.com"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "bb7f51d3e0fd50997012dfcce3c9b2b852cd3136",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "MacOS",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6377784092"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6377778488"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6390508806"
-                },
-                {
-                  "name": "tag-teams",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "Teams"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6390511833"
-                },
-                {
-                  "name": "Windows",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6377784248"
-                },
-                {
-                  "state": "SUCCESS",
-                  "context": "tvm-ci/pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-11267/2/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "REVIEW_REQUIRED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": []
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/sample_prs/pr11442-rerun-ci.json b/tests/python/ci/sample_prs/pr11442-rerun-ci.json
deleted file mode 100644
index 0199b2921f64..000000000000
--- a/tests/python/ci/sample_prs/pr11442-rerun-ci.json
+++ /dev/null
@@ -1,183 +0,0 @@
-{
-  "title": "Add 'static_library' runtime::Module",
-  "body": "(See https://discuss.tvm.apache.org/t/byoc-supporting-cutlass-byoc-with-collage/12796/6 for\r\ncontext, which in turn is part of Collage (https://github.com/apache/tvm-rfcs/blob/main/rfcs/0062-collage.md).\r\n\r\nThis adds a new 'DSO exportable' runtime module representing the contents of a .o file. It\r\nallows external codegen toolchains to yield a result which:\r\n - Like CSource modules, can be conveyed directly to the final export_library compilation\r\n   step for linking into the final .so and saved to a know location without risk the\r\n   underlying code artifact will be lost.\r\n - Like DSOLibrary modules, are self contained so that no additional compile-time arguments\r\n   need be conveyed from the CSource module to the final export_library command line\r\n\r\nSince this is the third flavor of 'DSO exportable' module, add a Module::IsDSOExportable.\r\n\r\nSince adding the above, can't resist also adding a Module::ImplementsFunction virtual and\r\ncalling it from TEComplier to check if an external codegen function actually provided the\r\nimplementation it promised.\r\n\r\nNote:\r\n - I've left the existing implementation of runtime.load_module alone which\r\n   relinks .o files to .so files.\r\n - Though also contained in the .o metadata, I require static libraries to always\r\n   carry their list of exported function names.\r\n\r\nThis is all pretty stop gap pending a good rework of TVM to supoprt the notion of artifacts\r\nand, perhaps, build rules.\r\n",
-  "state": "OPEN",
-  "author": {
-    "login": "abc"
-  },
-  "comments": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "authorAssociation": "MEMBER",
-        "author": {
-          "login": "tqchen"
-        },
-        "updatedAt": "2022-05-24T22:13:29Z",
-        "body": "Thanks @mbs-octoml  . I think we go with this as a temp workaround with a mind that the IsDSOExportable and ImplementsFunction likely should go to Artifact."
-      },
-      {
-        "authorAssociation": "CONTRIBUTOR",
-        "author": {
-          "login": "mbs-octoml"
-        },
-        "updatedAt": "2022-05-24T22:56:07Z",
-        "body": "Yeah, we really need to put some love into that.\r\n\r\nCollecting all the pieces needed for deployment along with their metadata a la Artifact is pretty clearly needed, though I suspect that will need to be abstract to cover the spectrum from firmware image to dynamically loadable .so to ready-to-call JITed code to tar.\r\n\r\nI can't help thinking we should also think about build rules guarded by target kinds & attributes, since again there's just so may ways to proceed."
-      },
-      {
-        "authorAssociation": "MEMBER",
-        "author": {
-          "login": "tqchen"
-        },
-        "updatedAt": "2022-05-24T23:08:00Z",
-        "body": "Perhaps we will end up building our own cmake/bazel :p in another time"
-      },
-      {
-        "authorAssociation": "CONTRIBUTOR",
-        "author": {
-          "login": "mbs-octoml"
-        },
-        "updatedAt": "2022-05-25T22:11:44Z",
-        "body": "Thanks Tianqi. Let's see if  this new fancy bot works...\r\n\r\n"
-      },
-      {
-        "authorAssociation": "CONTRIBUTOR",
-        "author": {
-          "login": "mbs-octoml"
-        },
-        "updatedAt": "2022-05-25T22:11:50Z",
-        "body": "@tvm-bot merge"
-      },
-      {
-        "authorAssociation": "NONE",
-        "author": {
-          "login": "github-actions"
-        },
-        "updatedAt": "2022-05-25T22:12:10Z",
-        "body": "Cannot merge, did not find any approving reviews from users with write access on 96d4e62da5a7b78da18d0ee28cc6261d8fbf31c4"
-      },
-      {
-        "authorAssociation": "CONTRIBUTOR",
-        "author": {
-          "login": "mbs-octoml"
-        },
-        "updatedAt": "2022-05-25T22:12:37Z",
-        "body": "@tvm-bot rerun"
-      }
-    ]
-  },
-  "authorCommits": {
-    "nodes": [
-      {
-        "commit": {
-          "authors": {
-            "nodes": [
-              {
-                "name": "mbs-octoml",
-                "email": "mbs@octoml.ai"
-              }
-            ]
-          }
-        }
-      }
-    ]
-  },
-  "commits": {
-    "nodes": [
-      {
-        "commit": {
-          "oid": "96d4e62da5a7b78da18d0ee28cc6261d8fbf31c4",
-          "statusCheckRollup": {
-            "contexts": {
-              "pageInfo": {
-                "hasNextPage": false
-              },
-              "nodes": [
-                {
-                  "name": "MacOS",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6598275844"
-                },
-                {
-                  "name": "cc-reviewers",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "PR"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6598273162"
-                },
-                {
-                  "name": "Windows",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6598275717"
-                },
-                {
-                  "name": "Android",
-                  "checkSuite": {
-                    "workflowRun": {
-                      "workflow": {
-                        "name": "CI"
-                      }
-                    }
-                  },
-                  "status": "COMPLETED",
-                  "conclusion": "SUCCESS",
-                  "url": "https://github.com/apache/tvm/runs/6598275593"
-                },
-                {
-                  "state": "SUCCESS",
-                  "context": "tvm-ci/pr-head",
-                  "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-11442/4/display/redirect"
-                }
-              ]
-            }
-          }
-        }
-      }
-    ]
-  },
-  "reviewDecision": "APPROVED",
-  "reviews": {
-    "pageInfo": {
-      "hasPreviousPage": false
-    },
-    "nodes": [
-      {
-        "body": "",
-        "updatedAt": "2022-05-24T23:08:31Z",
-        "url": "https://github.com/apache/tvm/pull/11442#pullrequestreview-983954561",
-        "authorCanPushToRepository": true,
-        "commit": {
-          "oid": "23c600097cf1c2a55acda059626a060e106dd023"
-        },
-        "author": {
-          "login": "tqchen"
-        },
-        "state": "APPROVED"
-      }
-    ]
-  }
-}
\ No newline at end of file
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index 0939aae10ab5..f2e686d1e582 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -23,15 +23,14 @@
 
 import pytest
 import tvm.testing
-from .test_utils import REPO_ROOT, TempGit
+from .test_utils import REPO_ROOT, TempGit, run_script
 
 
-def parameterize_named(*values):
-    keys = list(values[0].keys())
-    if len(keys) == 1:
-        return pytest.mark.parametrize(",".join(keys), [d[keys[0]] for d in values])
-
-    return pytest.mark.parametrize(",".join(keys), [tuple(d.values()) for d in values])
+def parameterize_named(**kwargs):
+    keys = next(iter(kwargs.values())).keys()
+    return pytest.mark.parametrize(
+        ",".join(keys), [tuple(d.values()) for d in kwargs.values()], ids=kwargs.keys()
+    )
 
 
 # pylint: disable=line-too-long
@@ -137,23 +136,7 @@ def parameterize_named(*values):
 
 
 @tvm.testing.skip_if_wheel_test
-@pytest.mark.parametrize(
-    [
-        "main_xml_file",
-        "main_xml_content",
-        "pr_xml_file",
-        "pr_xml_content",
-        "target_url",
-        "s3_prefix",
-        "jenkins_prefix",
-        "common_main_build",
-        "commit_sha",
-        "expected_url",
-        "expected_body",
-    ],
-    [tuple(d.values()) for d in TEST_DATA_SKIPPED_BOT.values()],
-    ids=TEST_DATA_SKIPPED_BOT.keys(),
-)
+@parameterize_named(**TEST_DATA_SKIPPED_BOT)
 # pylint: enable=line-too-long
 def test_skipped_tests_comment(
     tmpdir_factory,
@@ -182,49 +165,37 @@ def write_xml_file(root_dir, xml_file, xml_content):
             f.write(textwrap.dedent(xml_content))
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-    git.run("init")
-    git.run("checkout", "-b", "main")
-    git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
 
     pr_test_report_dir = Path(git.cwd) / "pr-reports"
     write_xml_file(pr_test_report_dir, pr_xml_file, pr_xml_content)
     main_test_report_dir = Path(git.cwd) / "main-reports"
     write_xml_file(main_test_report_dir, main_xml_file, main_xml_content)
 
-    proc = subprocess.run(
+    proc = run_script(
         [
-            str(skipped_tests_script),
+            skipped_tests_script,
             "--dry-run",
             f"--s3-prefix={s3_prefix}",
             f"--jenkins-prefix={jenkins_prefix}",
             f"--common-main-build={common_main_build}",
         ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
         env={"TARGET_URL": target_url, "COMMIT_SHA": commit_sha},
-        encoding="utf-8",
         cwd=git.cwd,
-        check=False,
     )
-    if proc.returncode != 0:
-        raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}")
 
-    assert f"Dry run, would have posted {expected_url} with data {expected_body}." in proc.stderr
+    assert_in(f"Dry run, would have posted {expected_url} with data {expected_body}.", proc.stderr)
 
 
 @tvm.testing.skip_if_wheel_test
-@pytest.mark.parametrize(
-    "target_url,base_url,commit_sha,expected_url,expected_body",
-    [
-        (
-            "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
-            "https://pr-docs.tlcpack.ai",
-            "SHA",
-            "issues/11594/comments",
-            "<!---docs-bot-comment-->\n\nBuilt docs for commit SHA can be found "
-            "[here](https://pr-docs.tlcpack.ai/PR-11594/3/docs/index.html).",
-        )
-    ],
+@parameterize_named(
+    doc_link=dict(
+        target_url="https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
+        base_url="https://pr-docs.tlcpack.ai",
+        commit_sha="SHA",
+        expected_url="issues/11594/comments",
+        expected_body="<!---docs-bot-comment-->\n\nBuilt docs for commit SHA can be found "
+        "[here](https://pr-docs.tlcpack.ai/PR-11594/3/docs/index.html).",
+    )
 )
 def test_docs_comment(
     tmpdir_factory, target_url, base_url, commit_sha, expected_url, expected_body
@@ -235,146 +206,93 @@ def test_docs_comment(
     docs_comment_script = REPO_ROOT / "ci" / "scripts" / "github_docs_comment.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-    git.run("init")
-    git.run("checkout", "-b", "main")
-    git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
-    proc = subprocess.run(
-        [str(docs_comment_script), "--dry-run", f"--base-url-docs={base_url}"],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+    proc = run_script(
+        [docs_comment_script, "--dry-run", f"--base-url-docs={base_url}"],
         env={"TARGET_URL": target_url, "COMMIT_SHA": commit_sha},
-        encoding="utf-8",
         cwd=git.cwd,
-        check=False,
     )
-    if proc.returncode != 0:
-        raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}")
 
-    assert f"Dry run, would have posted {expected_url} with data {expected_body}." in proc.stderr
+    assert_in(f"Dry run, would have posted {expected_url} with data {expected_body}.", proc.stderr)
 
 
 @tvm.testing.skip_if_wheel_test
-def test_cc_reviewers(tmpdir_factory):
-    """
-    Test that reviewers are added from 'cc @someone' messages in PRs
-    """
-    reviewers_script = REPO_ROOT / "ci" / "scripts" / "github_cc_reviewers.py"
-
-    def run(pr_body, requested_reviewers, existing_review_users, expected_reviewers):
-        git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-        git.run("init")
-        git.run("checkout", "-b", "main")
-        git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
-        reviews = [{"user": {"login": r}} for r in existing_review_users]
-        requested_reviewers = [{"login": r} for r in requested_reviewers]
-        proc = subprocess.run(
-            [str(reviewers_script), "--dry-run", "--testing-reviews-json", json.dumps(reviews)],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env={
-                "PR": json.dumps(
-                    {"number": 1, "body": pr_body, "requested_reviewers": requested_reviewers}
-                )
-            },
-            encoding="utf-8",
-            cwd=git.cwd,
-            check=False,
-        )
-        if proc.returncode != 0:
-            raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}")
-
-        assert f"After filtering existing reviewers, adding: {expected_reviewers}" in proc.stdout
-
-    run(pr_body="abc", requested_reviewers=[], existing_review_users=[], expected_reviewers=[])
-    run(
+@parameterize_named(
+    cc_no_one=dict(
+        pr_body="abc", requested_reviewers=[], existing_review_users=[], expected_reviewers=[]
+    ),
+    cc_abc=dict(
         pr_body="cc @abc",
         requested_reviewers=[],
         existing_review_users=[],
         expected_reviewers=["abc"],
-    )
-    run(pr_body="cc @", requested_reviewers=[], existing_review_users=[], expected_reviewers=[])
-    run(
+    ),
+    bad_cc_line=dict(
+        pr_body="cc @", requested_reviewers=[], existing_review_users=[], expected_reviewers=[]
+    ),
+    cc_multiple=dict(
         pr_body="cc @abc @def",
         requested_reviewers=[],
         existing_review_users=[],
         expected_reviewers=["abc", "def"],
-    )
-    run(
+    ),
+    with_existing=dict(
         pr_body="some text cc @abc @def something else",
         requested_reviewers=[],
         existing_review_users=[],
         expected_reviewers=["abc", "def"],
-    )
-    run(
+    ),
+    with_existing_split=dict(
         pr_body="some text cc @abc @def something else\n\n another cc @zzz z",
         requested_reviewers=[],
         existing_review_users=[],
         expected_reviewers=["abc", "def", "zzz"],
-    )
-    run(
+    ),
+    with_existing_request=dict(
         pr_body="some text cc @abc @def something else\n\n another cc @zzz z",
         requested_reviewers=["abc"],
         existing_review_users=[],
         expected_reviewers=["def", "zzz"],
-    )
-    run(
+    ),
+    with_existing_reviewers=dict(
         pr_body="some text cc @abc @def something else\n\n another cc @zzz z",
         requested_reviewers=["abc"],
         existing_review_users=["abc"],
         expected_reviewers=["def", "zzz"],
-    )
-    run(
+    ),
+    with_no_reviewers=dict(
         pr_body="some text cc @abc @def something else\n\n another cc @zzz z",
         requested_reviewers=[],
         existing_review_users=["abc"],
         expected_reviewers=["def", "zzz"],
-    )
-
-
-def test_update_branch(tmpdir_factory):
+    ),
+)
+def test_cc_reviewers(
+    tmpdir_factory, pr_body, requested_reviewers, existing_review_users, expected_reviewers
+):
     """
-    Test that the last-successful branch script updates successfully
+    Test that reviewers are added from 'cc @someone' messages in PRs
     """
-    update_script = REPO_ROOT / "ci" / "scripts" / "update_branch.py"
-
-    def run(statuses, expected_rc, expected_output):
-        git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-        git.run("init")
-        git.run("checkout", "-b", "main")
-        git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
-        commit = {
-            "statusCheckRollup": {"contexts": {"nodes": statuses}},
-            "oid": "123",
-            "messageHeadline": "hello",
-        }
-        data = {
-            "data": {
-                "repository": {
-                    "defaultBranchRef": {"target": {"history": {"edges": [], "nodes": [commit]}}}
-                }
-            }
-        }
-        proc = subprocess.run(
-            [str(update_script), "--dry-run", "--testonly-json", json.dumps(data)],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            encoding="utf-8",
-            cwd=git.cwd,
-            check=False,
-        )
+    reviewers_script = REPO_ROOT / "ci" / "scripts" / "github_cc_reviewers.py"
 
-        if proc.returncode != expected_rc:
-            raise RuntimeError(
-                f"Wrong return code:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}"
+    git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
+    reviews = [{"user": {"login": r}} for r in existing_review_users]
+    requested_reviewers = [{"login": r} for r in requested_reviewers]
+    proc = run_script(
+        [reviewers_script, "--dry-run", "--testing-reviews-json", json.dumps(reviews)],
+        env={
+            "PR": json.dumps(
+                {"number": 1, "body": pr_body, "requested_reviewers": requested_reviewers}
             )
+        },
+        cwd=git.cwd,
+    )
 
-        if expected_output not in proc.stdout:
-            raise RuntimeError(
-                f"Missing {expected_output}:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}"
-            )
+    assert f"After filtering existing reviewers, adding: {expected_reviewers}" in proc.stdout
 
+
+@parameterize_named(
     # Missing expected tvm-ci/branch test
-    run(
+    missing_tvm_ci_branch=dict(
         statuses=[
             {
                 "context": "test",
@@ -383,10 +301,9 @@ def run(statuses, expected_rc, expected_output):
         ],
         expected_rc=1,
         expected_output="No good commits found in the last 1 commits",
-    )
-
+    ),
     # Only has the right passing test
-    run(
+    has_expected_test=dict(
         statuses=[
             {
                 "context": "tvm-ci/branch",
@@ -395,10 +312,9 @@ def run(statuses, expected_rc, expected_output):
         ],
         expected_rc=0,
         expected_output="Found last good commit: 123: hello",
-    )
-
+    ),
     # Check with many statuses
-    run(
+    many_statuses=dict(
         statuses=[
             {
                 "context": "tvm-ci/branch",
@@ -415,8 +331,8 @@ def run(statuses, expected_rc, expected_output):
         ],
         expected_rc=1,
         expected_output="No good commits found in the last 1 commits",
-    )
-    run(
+    ),
+    many_success_statuses=dict(
         statuses=[
             {
                 "context": "tvm-ci/branch",
@@ -433,17 +349,50 @@ def run(statuses, expected_rc, expected_output):
         ],
         expected_rc=0,
         expected_output="Found last good commit: 123: hello",
+    ),
+)
+def test_update_branch(tmpdir_factory, statuses, expected_rc, expected_output):
+    """
+    Test that the last-successful branch script updates successfully
+    """
+    update_script = REPO_ROOT / "ci" / "scripts" / "update_branch.py"
+
+    git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
+    commit = {
+        "statusCheckRollup": {"contexts": {"nodes": statuses}},
+        "oid": "123",
+        "messageHeadline": "hello",
+    }
+    data = {
+        "data": {
+            "repository": {
+                "defaultBranchRef": {"target": {"history": {"edges": [], "nodes": [commit]}}}
+            }
+        }
+    }
+    proc = run_script(
+        [update_script, "--dry-run", "--testonly-json", json.dumps(data)],
+        cwd=git.cwd,
+        check=False,
     )
 
+    if proc.returncode != expected_rc:
+        raise RuntimeError(f"Wrong return code:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}")
+
+    if expected_output not in proc.stdout:
+        raise RuntimeError(
+            f"Missing {expected_output}:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}"
+        )
+
 
 @parameterize_named(
-    dict(
+    dont_skip_main=dict(
         commands=[],
         should_skip=False,
         pr_title="[skip ci] test",
         why="ci should not be skipped on main",
     ),
-    dict(
+    dont_skip_main_with_commit=dict(
         commands=[
             ["commit", "--allow-empty", "--message", "[skip ci] commit 1"],
         ],
@@ -451,7 +400,7 @@ def run(statuses, expected_rc, expected_output):
         pr_title="[skip ci] test",
         why="ci should not be skipped on main",
     ),
-    dict(
+    skip_on_new_branch=dict(
         commands=[
             ["checkout", "-b", "some_new_branch"],
             ["commit", "--allow-empty", "--message", "[skip ci] commit 1"],
@@ -460,7 +409,7 @@ def run(statuses, expected_rc, expected_output):
         pr_title="[skip ci] test",
         why="ci should be skipped on a branch with [skip ci] in the last commit",
     ),
-    dict(
+    no_skip_in_pr_title=dict(
         commands=[
             ["checkout", "-b", "some_new_branch"],
             ["commit", "--allow-empty", "--message", "[skip ci] commit 1"],
@@ -470,7 +419,7 @@ def run(statuses, expected_rc, expected_output):
         why="ci should not be skipped on a branch with "
         "[skip ci] in the last commit but not the PR title",
     ),
-    dict(
+    skip_in_pr_title=dict(
         commands=[
             ["checkout", "-b", "some_new_branch"],
             ["commit", "--allow-empty", "--message", "[skip ci] commit 1"],
@@ -480,17 +429,7 @@ def run(statuses, expected_rc, expected_output):
         pr_title="[skip ci] test",
         why="ci should be skipped with [skip ci] in the PR title",
     ),
-    dict(
-        commands=[
-            ["checkout", "-b", "some_new_branch"],
-            ["commit", "--allow-empty", "--message", "[skip ci] commit 1"],
-            ["commit", "--allow-empty", "--message", "commit 2"],
-        ],
-        should_skip=True,
-        pr_title="[skip ci] test",
-        why="ci should be skipped with [skip ci] in the PR title",
-    ),
-    dict(
+    skip_in_pr_title_many_commits=dict(
         commands=[
             ["checkout", "-b", "some_new_branch"],
             ["commit", "--allow-empty", "--message", "commit 1"],
@@ -502,7 +441,7 @@ def run(statuses, expected_rc, expected_output):
         pr_title="[skip ci] test",
         why="ci should be skipped with [skip ci] in the PR title",
     ),
-    dict(
+    skip_anywhere_in_title=dict(
         commands=[
             ["checkout", "-b", "some_new_branch"],
         ],
@@ -518,22 +457,16 @@ def test_skip_ci(tmpdir_factory, commands, should_skip, pr_title, why):
     skip_ci_script = REPO_ROOT / "ci" / "scripts" / "git_skip_ci.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-    # Jenkins git is too old and doesn't have 'git init --initial-branch'
-    git.run("init")
-    git.run("checkout", "-b", "main")
-    git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
+
     git.run("config", "user.name", "ci")
     git.run("config", "user.email", "email@example.com")
     git.run("commit", "--allow-empty", "--message", "base commit")
     for command in commands:
         git.run(*command)
     pr_number = "1234"
-    proc = subprocess.run(
-        [str(skip_ci_script), "--pr", pr_number, "--pr-title", pr_title],
+    proc = run_script(
+        [skip_ci_script, "--pr", pr_number, "--pr-title", pr_title],
         cwd=git.cwd,
-        stderr=subprocess.STDOUT,
-        stdout=subprocess.PIPE,
-        encoding="utf-8",
         check=False,
     )
     expected = 0 if should_skip else 1
@@ -544,120 +477,66 @@ def test_skip_ci(tmpdir_factory, commands, should_skip, pr_title, why):
         )
 
 
-def test_skip_globs(tmpdir_factory):
+@parameterize_named(
+    no_file=dict(files=[], should_skip=True),
+    readme=dict(files=["README.md"], should_skip=True),
+    c_file=dict(files=["test.c"], should_skip=False),
+    c_and_readme=dict(files=["test.c", "README.md"], should_skip=False),
+    src_file_and_readme=dict(
+        files=["src/autotvm/feature_visitor.cc", "README.md"], should_skip=False
+    ),
+    yaml_and_readme=dict(files=[".asf.yaml", "docs/README.md"], should_skip=True),
+)
+def test_skip_globs(tmpdir_factory, files, should_skip):
     """
     Test that CI is skipped if only certain files are edited
     """
     script = REPO_ROOT / "ci" / "scripts" / "git_skip_ci_globs.py"
 
-    def run(files, should_skip):
-        git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-        # Jenkins git is too old and doesn't have 'git init --initial-branch'
-        git.run("init")
-        git.run("checkout", "-b", "main")
-        git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
-
-        proc = subprocess.run(
-            [
-                str(script),
-                "--files",
-                ",".join(files),
-            ],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            encoding="utf-8",
-            cwd=git.cwd,
-            check=False,
-        )
-
-        if should_skip:
-            assert proc.returncode == 0
-        else:
-            assert proc.returncode == 1
-
-    run([], should_skip=True)
-    run(["README.md"], should_skip=True)
-    run(["test.c"], should_skip=False)
-    run(["test.c", "README.md"], should_skip=False)
-    run(["src/autotvm/feature_visitor.cc", "README.md"], should_skip=False)
-    run([".asf.yaml", "docs/README.md"], should_skip=True)
+    git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
 
+    proc = run_script(
+        [
+            script,
+            "--files",
+            ",".join(files),
+        ],
+        check=False,
+        cwd=git.cwd,
+    )
 
-def test_ping_reviewers(tmpdir_factory):
-    """
-    Test that reviewers are messaged after a time period of inactivity
-    """
-    reviewers_script = REPO_ROOT / "ci" / "scripts" / "ping_reviewers.py"
+    if should_skip:
+        assert proc.returncode == 0
+    else:
+        assert proc.returncode == 1
 
-    def run(pull_request, check):
-        git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-        # Jenkins git is too old and doesn't have 'git init --initial-branch'
-        git.run("init")
-        git.run("checkout", "-b", "main")
-        git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
-
-        data = {
-            "data": {
-                "repository": {
-                    "pullRequests": {
-                        "nodes": [pull_request],
-                        "edges": [],
-                    }
-                }
-            }
-        }
-        proc = subprocess.run(
-            [
-                str(reviewers_script),
-                "--dry-run",
-                "--wait-time-minutes",
-                "1",
-                "--cutoff-pr-number",
-                "5",
-                "--allowlist",
-                "user",
-                "--pr-json",
-                json.dumps(data),
-                "--now",
-                "2022-01-26T17:54:19Z",
-            ],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            encoding="utf-8",
-            cwd=git.cwd,
-            check=False,
-        )
-        if proc.returncode != 0:
-            raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}")
 
-        assert check in proc.stdout
+def all_time_keys(time):
+    return {
+        "updatedAt": time,
+        "lastEditedAt": time,
+        "createdAt": time,
+        "publishedAt": time,
+    }
 
-    def all_time_keys(time):
-        return {
-            "updatedAt": time,
-            "lastEditedAt": time,
-            "createdAt": time,
-            "publishedAt": time,
-        }
 
-    run(
-        {
+@parameterize_named(
+    draft=dict(
+        pull_request={
             "isDraft": True,
             "number": 2,
         },
-        "Checking 0 of 1 fetched",
-    )
-
-    run(
-        {
+        check="Checking 0 of 1 fetched",
+    ),
+    not_draft=dict(
+        pull_request={
             "isDraft": False,
             "number": 2,
         },
-        "Checking 0 of 1 fetched",
-    )
-
-    run(
-        {
+        check="Checking 0 of 1 fetched",
+    ),
+    week_old=dict(
+        pull_request={
             "number": 123,
             "url": "https://github.com/apache/tvm/pull/123",
             "body": "cc @someone",
@@ -667,12 +546,11 @@ def all_time_keys(time):
             **all_time_keys("2022-01-18T17:54:19Z"),
             "comments": {"nodes": []},
         },
-        "Pinging reviewers ['someone'] on https://github.com/apache/tvm/pull/123",
-    )
-
+        check="Pinging reviewers ['someone'] on https://github.com/apache/tvm/pull/123",
+    ),
     # Check allowlist functionality
-    run(
-        {
+    allowlist=dict(
+        pull_request={
             "number": 123,
             "url": "https://github.com/apache/tvm/pull/123",
             "body": "cc @someone",
@@ -686,12 +564,11 @@ def all_time_keys(time):
                 ]
             },
         },
-        "Checking 0 of 1 fetched",
-    )
-
+        check="Checking 0 of 1 fetched",
+    ),
     # Old comment, ping
-    run(
-        {
+    old_comment=dict(
+        pull_request={
             "number": 123,
             "url": "https://github.com/apache/tvm/pull/123",
             "body": "cc @someone",
@@ -708,12 +585,11 @@ def all_time_keys(time):
                 ]
             },
         },
-        "Pinging reviewers ['someone'] on https://github.com/apache/tvm/pull/123",
-    )
-
+        check="Pinging reviewers ['someone'] on https://github.com/apache/tvm/pull/123",
+    ),
     # New comment, don't ping
-    run(
-        {
+    new_comment=dict(
+        pull_request={
             "number": 123,
             "url": "https://github.com/apache/tvm/pull/123",
             "body": "cc @someone",
@@ -727,8 +603,45 @@ def all_time_keys(time):
                 ]
             },
         },
-        "Not pinging PR 123",
+        check="Not pinging PR 123",
+    ),
+)
+def test_ping_reviewers(tmpdir_factory, pull_request, check):
+    """
+    Test that reviewers are messaged after a time period of inactivity
+    """
+    reviewers_script = REPO_ROOT / "ci" / "scripts" / "ping_reviewers.py"
+
+    git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
+
+    data = {
+        "data": {
+            "repository": {
+                "pullRequests": {
+                    "nodes": [pull_request],
+                    "edges": [],
+                }
+            }
+        }
+    }
+    proc = run_script(
+        [
+            reviewers_script,
+            "--dry-run",
+            "--wait-time-minutes",
+            "1",
+            "--cutoff-pr-number",
+            "5",
+            "--allowlist",
+            "user",
+            "--pr-json",
+            json.dumps(data),
+            "--now",
+            "2022-01-26T17:54:19Z",
+        ],
+        cwd=git.cwd,
     )
+    assert_in(check, proc.stdout)
 
 
 def assert_in(needle: str, haystack: str):
@@ -740,69 +653,8 @@ def assert_in(needle: str, haystack: str):
 
 
 @tvm.testing.skip_if_wheel_test
-def test_github_tag_teams(tmpdir_factory):
-    """
-    Check that individuals are tagged from team headers
-    """
-    tag_script = REPO_ROOT / "ci" / "scripts" / "github_tag_teams.py"
-
-    def run(source_type, data, check):
-        git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-        git.run("init")
-        git.run("checkout", "-b", "main")
-        git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
-
-        issue_body = """
-        some text
-        [temporary] opt-in: @person5
-
-        - something: @person1 @person2
-        - something3: @person1 @person2 @SOME1-ONE-
-        - something else @person1 @person2
-        - something else2: @person1 @person2
-        - something-else @person1 @person2
-        """
-        comment1 = """
-        another thing: @person3
-        another-thing @person3
-        """
-        comment2 = """
-        something @person4
-        @person5
-        """
-        teams = {
-            "data": {
-                "repository": {
-                    "issue": {
-                        "body": issue_body,
-                        "comments": {"nodes": [{"body": comment1}, {"body": comment2}]},
-                    }
-                }
-            }
-        }
-        env = {
-            source_type: json.dumps(data),
-        }
-        proc = subprocess.run(
-            [
-                str(tag_script),
-                "--dry-run",
-                "--team-issue-json",
-                json.dumps(teams),
-            ],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            encoding="utf-8",
-            cwd=git.cwd,
-            env=env,
-            check=False,
-        )
-        if proc.returncode != 0:
-            raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}")
-
-        assert_in(check, proc.stdout)
-
-    run(
+@parameterize_named(
+    no_cc=dict(
         source_type="ISSUE",
         data={
             "title": "A title",
@@ -818,9 +670,8 @@ def run(source_type, data, check):
             ),
         },
         check="No one to cc, exiting",
-    )
-
-    run(
+    ),
+    no_additional_cc=dict(
         source_type="ISSUE",
         data={
             "title": "A title",
@@ -838,9 +689,8 @@ def run(source_type, data, check):
             ),
         },
         check="No one to cc, exiting",
-    )
-
-    run(
+    ),
+    cc_update=dict(
         source_type="ISSUE",
         data={
             "title": "A title",
@@ -858,9 +708,8 @@ def run(source_type, data, check):
         },
         check="would have updated issues/1234 with {'body': "
         "'\\nhello\\n\\nsomething\\n\\ncc @person1 @person2 @person4'}",
-    )
-
-    run(
+    ),
+    already_cced=dict(
         source_type="ISSUE",
         data={
             "title": "A title",
@@ -877,9 +726,8 @@ def run(source_type, data, check):
             ),
         },
         check="No one to cc, exiting",
-    )
-
-    run(
+    ),
+    not_already_cced=dict(
         source_type="ISSUE",
         data={
             "title": "[something] A title",
@@ -897,9 +745,8 @@ def run(source_type, data, check):
         },
         check="would have updated issues/1234 with {'body': "
         "'\\nhello\\n\\nsomething\\n\\ncc @person1 @person2 @person4'}",
-    )
-
-    run(
+    ),
+    no_new_ccs=dict(
         source_type="ISSUE",
         data={
             "title": "[something] A title",
@@ -916,9 +763,8 @@ def run(source_type, data, check):
             ),
         },
         check="No one to cc, exiting",
-    )
-
-    run(
+    ),
+    mismatching_tags=dict(
         source_type="PR",
         data={
             "title": "[something] A title",
@@ -936,9 +782,8 @@ def run(source_type, data, check):
             ),
         },
         check="No one to cc, exiting",
-    )
-
-    run(
+    ),
+    draft_pr=dict(
         source_type="PR",
         data={
             "title": "[something] A title",
@@ -956,9 +801,8 @@ def run(source_type, data, check):
             ),
         },
         check="Terminating since 1234 is a draft",
-    )
-
-    run(
+    ),
+    edit_inplace=dict(
         source_type="ISSUE",
         data={
             "title": "[something] A title",
@@ -974,9 +818,8 @@ def run(source_type, data, check):
         check="would have updated issues/1234 with {'body': '`mold` and `lld` can be a much"
         " faster alternative to `ld` from gcc. We should modify our CMakeLists.txt to "
         "detect and use these when possible. cc @person1\\n\\ncc @person2 @person4'}",
-    )
-
-    run(
+    ),
+    edit_out_of_place=dict(
         source_type="ISSUE",
         data={
             "title": "[something3] A title",
@@ -989,9 +832,8 @@ def run(source_type, data, check):
         },
         check="Dry run, would have updated issues/1234 with"
         " {'body': '@person2 @SOME1-ONE-\\n\\ncc @person1'}",
-    )
-
-    run(
+    ),
+    atted_but_not_cced=dict(
         source_type="ISSUE",
         data={
             "title": "[] A title",
@@ -1003,12 +845,64 @@ def run(source_type, data, check):
             "body": "@person2 @SOME1-ONE-",
         },
         check="No one to cc, exiting",
+    ),
+)
+def test_github_tag_teams(tmpdir_factory, source_type, data, check):
+    """
+    Check that individuals are tagged from team headers
+    """
+    tag_script = REPO_ROOT / "ci" / "scripts" / "github_tag_teams.py"
+
+    git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
+
+    issue_body = """
+    some text
+    [temporary] opt-in: @person5
+
+    - something: @person1 @person2
+    - something3: @person1 @person2 @SOME1-ONE-
+    - something else @person1 @person2
+    - something else2: @person1 @person2
+    - something-else @person1 @person2
+    """
+    comment1 = """
+    another thing: @person3
+    another-thing @person3
+    """
+    comment2 = """
+    something @person4
+    @person5
+    """
+    teams = {
+        "data": {
+            "repository": {
+                "issue": {
+                    "body": issue_body,
+                    "comments": {"nodes": [{"body": comment1}, {"body": comment2}]},
+                }
+            }
+        }
+    }
+    env = {
+        source_type: json.dumps(data),
+    }
+    proc = run_script(
+        [
+            tag_script,
+            "--dry-run",
+            "--team-issue-json",
+            json.dumps(teams),
+        ],
+        cwd=git.cwd,
+        env=env,
     )
 
+    assert_in(check, proc.stdout)
+
 
 @tvm.testing.skip_if_wheel_test
 @parameterize_named(
-    dict(
+    same_tags=dict(
         tlcpackstaging_body={
             "results": [
                 {
@@ -1028,7 +922,7 @@ def run(source_type, data, check):
         expected="Tag names were the same, no update needed",
         expected_images=[],
     ),
-    dict(
+    staging_update=dict(
         tlcpackstaging_body={
             "results": [
                 {
@@ -1054,7 +948,7 @@ def run(source_type, data, check):
             "ci_arm = 'tlcpack/ci-arm:456-456-abc'",
         ],
     ),
-    dict(
+    tlcpack_update=dict(
         tlcpackstaging_body={
             "results": [
                 {
@@ -1084,22 +978,19 @@ def test_open_docker_update_pr(
     tag_script = REPO_ROOT / "ci" / "scripts" / "open_docker_update_pr.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-    git.run("init")
     git.run("config", "user.name", "ci")
     git.run("config", "user.email", "email@example.com")
-    git.run("checkout", "-b", "main")
-    git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
     images = [
-        "ci_lint",
-        "ci_gpu",
-        "ci_cpu",
-        "ci_minimal",
-        "ci_wasm",
-        "ci_i386",
-        "ci_cortexm",
         "ci_arm",
+        "ci_cortexm",
+        "ci_cpu",
+        "ci_gpu",
         "ci_hexagon",
+        "ci_i386",
+        "ci_lint",
+        "ci_minimal",
         "ci_riscv",
+        "ci_wasm",
     ]
 
     docker_data = {}
@@ -1107,52 +998,43 @@ def test_open_docker_update_pr(
         docker_data[f"repositories/tlcpackstaging/{image}/tags"] = tlcpackstaging_body
         docker_data[f"repositories/tlcpack/{image.replace('_', '-')}/tags"] = tlcpack_body
 
-    proc = subprocess.run(
+    proc = run_script(
         [
-            str(tag_script),
+            tag_script,
             "--dry-run",
             "--testing-docker-data",
             json.dumps(docker_data),
         ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        encoding="utf-8",
         cwd=git.cwd,
         env={"GITHUB_TOKEN": "1234"},
-        check=False,
+        stderr=subprocess.STDOUT,
     )
 
     for line in expected_images:
         if line not in proc.stdout:
             raise RuntimeError(f"Missing line {line} in output:\n{proc.stdout}")
 
-    if proc.returncode != 0:
-        raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}")
-
     assert_in(expected, proc.stdout)
 
 
-@pytest.mark.parametrize(
-    "images,expected",
-    [
-        (
-            ["ci_arm=tlcpack/ci-arm:abc-abc-123", "ci_lint=tlcpack/ci-lint:abc-abc-234"],
-            {
-                "ci_arm": "tlcpack/ci-arm:abc-abc-123",
-                "ci_lint": "tlcpack/ci-lint:abc-abc-234",
-            },
-        ),
-        (
-            ["ci_arm2=tlcpack/ci-arm2:abc-abc-123"],
-            {
-                "ci_arm2": "tlcpackstaging/ci_arm2:abc-abc-123",
-            },
-        ),
-    ],
+@parameterize_named(
+    use_tlcpack=dict(
+        images=["ci_arm=tlcpack/ci-arm:abc-abc-123", "ci_lint=tlcpack/ci-lint:abc-abc-234"],
+        expected={
+            "ci_arm": "tlcpack/ci-arm:abc-abc-123",
+            "ci_lint": "tlcpack/ci-lint:abc-abc-234",
+        },
+    ),
+    use_staging=dict(
+        images=["ci_arm2=tlcpack/ci-arm2:abc-abc-123"],
+        expected={
+            "ci_arm2": "tlcpackstaging/ci_arm2:abc-abc-123",
+        },
+    ),
 )
 def test_determine_docker_images(tmpdir_factory, images, expected):
     """Test script to decide whether to use tlcpack or tlcpackstaging for images"""
-    tag_script = REPO_ROOT / "ci" / "scripts" / "determine_docker_images.py"
+    script = REPO_ROOT / "ci" / "scripts" / "determine_docker_images.py"
 
     git_dir = tmpdir_factory.mktemp("tmp_git_dir")
 
@@ -1161,23 +1043,17 @@ def test_determine_docker_images(tmpdir_factory, images, expected):
         "repositories/tlcpack/ci-lint/tags/abc-abc-234": {},
     }
 
-    proc = subprocess.run(
+    run_script(
         [
-            str(tag_script),
+            script,
             "--testing-docker-data",
             json.dumps(docker_data),
             "--base-dir",
             git_dir,
         ]
         + images,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        encoding="utf-8",
         cwd=git_dir,
-        check=False,
     )
-    if proc.returncode != 0:
-        raise RuntimeError(f"Failed to run script:\n{proc.stdout}")
 
     for expected_filename, expected_image in expected.items():
         with open(Path(git_dir) / expected_filename) as f:
@@ -1186,34 +1062,28 @@ def test_determine_docker_images(tmpdir_factory, images, expected):
         assert actual_image == expected_image
 
 
-@pytest.mark.parametrize(
-    "changed_files,name,check,expected_code",
-    [
-        d.values()
-        for d in [
-            dict(
-                changed_files=[],
-                name="abc",
-                check="Image abc is not using new naming scheme",
-                expected_code=1,
-            ),
-            dict(
-                changed_files=[], name="123-123-abc", check="No extant hash found", expected_code=1
-            ),
-            dict(
-                changed_files=[["test.txt"]],
-                name=None,
-                check="Did not find changes, no rebuild necessary",
-                expected_code=0,
-            ),
-            dict(
-                changed_files=[["test.txt"], ["docker/test.txt"]],
-                name=None,
-                check="Found docker changes",
-                expected_code=2,
-            ),
-        ]
-    ],
+@parameterize_named(
+    invalid_name=dict(
+        changed_files=[],
+        name="abc",
+        check="Image abc is not using new naming scheme",
+        expected_code=1,
+    ),
+    no_hash=dict(
+        changed_files=[], name="123-123-abc", check="No extant hash found", expected_code=1
+    ),
+    no_changes=dict(
+        changed_files=[["test.txt"]],
+        name=None,
+        check="Did not find changes, no rebuild necessary",
+        expected_code=0,
+    ),
+    docker_changes=dict(
+        changed_files=[["test.txt"], ["docker/test.txt"]],
+        name=None,
+        check="Found docker changes",
+        expected_code=2,
+    ),
 )
 def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expected_code):
     """
@@ -1222,11 +1092,8 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec
     tag_script = REPO_ROOT / "ci" / "scripts" / "should_rebuild_docker.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-    git.run("init")
     git.run("config", "user.name", "ci")
     git.run("config", "user.email", "email@example.com")
-    git.run("checkout", "-b", "main")
-    git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
 
     git_path = Path(git.cwd)
     for i, commits in enumerate(changed_files):
@@ -1262,15 +1129,13 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec
         },
     }
 
-    proc = subprocess.run(
+    proc = run_script(
         [
-            str(tag_script),
+            tag_script,
             "--testing-docker-data",
             json.dumps(docker_data),
         ],
-        stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
-        encoding="utf-8",
         cwd=git.cwd,
         check=False,
     )
diff --git a/tests/python/ci/test_tvmbot.py b/tests/python/ci/test_tvmbot.py
index 2c7a0eaec0d4..ceabd46a9b03 100644
--- a/tests/python/ci/test_tvmbot.py
+++ b/tests/python/ci/test_tvmbot.py
@@ -18,13 +18,12 @@
 Test the @tvm-bot merge code
 """
 
-import subprocess
 import json
 from pathlib import Path
+from typing import Dict, Any
 
-import pytest
 import tvm
-from .test_utils import REPO_ROOT, TempGit
+from .test_utils import REPO_ROOT, TempGit, run_script
 
 
 SUCCESS_EXPECTED_OUTPUT = """
@@ -37,167 +36,244 @@
 """.strip()
 
 
-TEST_DATA = {
-    "successful-merge": {
-        "number": 10786,
-        "filename": "pr10786-merges.json",
-        "expected": SUCCESS_EXPECTED_OUTPUT,
-        "comment": "@tvm-bot merge",
-        "user": "abc",
-        "detail": "Everything is fine so this PR will merge",
-    },
-    "no-request": {
-        "number": 10786,
-        "filename": "pr10786-nottriggered.json",
-        "expected": "Command 'do something else' did not match anything",
-        "comment": "@tvm-bot do something else",
-        "user": "abc",
-        "detail": "A PR for which the mergebot runs but no merge is requested",
-    },
-    "bad-ci": {
-        "number": 10786,
-        "filename": "pr10786-badci.json",
-        "expected": "Cannot merge, these CI jobs are not successful on",
-        "comment": "@tvm-bot merge",
-        "user": "abc",
-        "detail": "A PR which failed CI and cannot merge",
-    },
-    "old-review": {
-        "number": 10786,
-        "filename": "pr10786-oldreview.json",
-        "expected": "Cannot merge, did not find any approving reviews",
-        "comment": "@tvm-bot merge",
-        "user": "abc",
-        "detail": "A PR with passing CI and approving reviews on an old commit so it cannot merge",
-    },
-    "missing-job": {
-        "number": 10786,
-        "filename": "pr10786-missing-job.json",
-        "expected": "Cannot merge, missing expected jobs",
-        "comment": "@tvm-bot merge",
-        "user": "abc",
-        "detail": "PR missing an expected CI job and cannot merge",
-    },
-    "invalid-author": {
-        "number": 10786,
-        "filename": "pr10786-invalid-author.json",
-        "expected": "Failed auth check 'collaborators', quitting",
-        "comment": "@tvm-bot merge",
-        "user": "not-abc",
-        "detail": "Merge requester is not a committer and cannot merge",
-    },
-    "unauthorized-comment": {
-        "number": 11244,
-        "filename": "pr11244-unauthorized-comment.json",
-        "expected": "Failed auth check 'collaborators'",
-        "comment": "@tvm-bot merge",
-        "user": "not-abc2",
-        "detail": "Check that a merge comment not from a CONTRIBUTOR is rejected",
-    },
-    "no-review": {
-        "number": 11267,
-        "filename": "pr11267-no-review.json",
-        "expected": "Cannot merge, did not find any approving reviews from users with write access",
-        "comment": "@tvm-bot merge",
-        "user": "abc",
-        "detail": "Check that a merge request without any reviews is rejected",
-    },
-    "changes-requested": {
-        "number": 10786,
-        "filename": "pr10786-changes-requested.json",
-        "expected": "Cannot merge, found [this review]",
-        "comment": "@tvm-bot merge",
-        "user": "abc",
-        "detail": "Check that a merge request with a 'Changes Requested' review is rejected",
-    },
-    "co-authors": {
-        "number": 10786,
-        "filename": "pr10786-co-authors.json",
-        "expected": "Co-authored-by: Some One <someone@email.com>",
-        "comment": "@tvm-bot merge",
-        "user": "abc",
-        "detail": "Check that a merge request with co-authors generates the correct commit message",
-    },
-    "rerun-ci": {
-        "number": 11442,
-        "filename": "pr11442-rerun-ci.json",
-        "expected": "Rerunning ci with",
-        "comment": "@tvm-bot rerun",
-        "user": "abc",
-        "detail": "Start a new CI job",
-    },
-    "ignore-jobs": {
-        "number": 10786,
-        "filename": "pr10786-ignore-jobs.json",
-        "expected": "Dry run, would have merged",
-        "comment": "@tvm-bot merge",
-        "user": "abc",
-        "detail": "Ignore GitHub Actions jobs that don't start with CI / ",
-    },
-}
+class _TvmBotTest:
+    NUMBER = 10786
+
+    def preprocess_data(self, data: Dict[str, Any]):
+        """
+        Used to pre-process PR data before running the test. Override as
+        necessary to edit data for specific test cases.
+        """
+        return data
+
+    @tvm.testing.skip_if_wheel_test
+    def test(self, tmpdir_factory):
+        """
+        Run the tvm-bot script using the data from preprocess_data
+        """
+        mergebot_script = REPO_ROOT / "ci" / "scripts" / "github_tvmbot.py"
+        test_json_dir = Path(__file__).resolve().parent / "sample_prs"
+        with open(test_json_dir / f"pr{self.NUMBER}.json") as f:
+            test_data = json.load(f)
+
+        # Update testing data with replacements / additions
+        test_data = self.preprocess_data(test_data)
+
+        git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
+
+        comment = {
+            "body": self.COMMENT,
+            "id": 123,
+            "user": {
+                "login": self.USER,
+            },
+        }
+        allowed_users = [{"login": "abc"}, {"login": "other-abc"}]
+
+        proc = run_script(
+            [
+                mergebot_script,
+                "--pr",
+                self.NUMBER,
+                "--dry-run",
+                "--run-url",
+                "https://example.com",
+                "--testing-pr-json",
+                json.dumps(test_data),
+                "--testing-collaborators-json",
+                json.dumps(allowed_users),
+                "--testing-mentionable-users-json",
+                json.dumps(allowed_users),
+                "--trigger-comment-json",
+                json.dumps(comment),
+            ],
+            env={
+                "TVM_BOT_JENKINS_TOKEN": "123",
+                "GH_ACTIONS_TOKEN": "123",
+            },
+            cwd=git.cwd,
+        )
+
+        if self.EXPECTED not in proc.stderr:
+            raise RuntimeError(f"{proc.stderr}\ndid not contain\n{self.EXPECTED}")
+
+
+class TestNoRequest(_TvmBotTest):
+    """
+    A PR for which the mergebot runs but no merge is requested
+    """
+
+    COMMENT = "@tvm-bot do something else"
+    USER = "abc"
+    EXPECTED = "Command 'do something else' did not match anything"
+
+    def preprocess_data(self, data: Dict[str, Any]):
+        data["reviews"]["nodes"][0]["body"] = "nothing"
+        return data
+
+
+class TestSuccessfulMerge(_TvmBotTest):
+    """
+    Everything is fine so this PR will merge
+    """
+
+    COMMENT = "@tvm-bot merge"
+    USER = "abc"
+    EXPECTED = SUCCESS_EXPECTED_OUTPUT
+
+
+class TestBadCI(_TvmBotTest):
+    """
+    A PR which failed CI and cannot merge
+    """
+
+    COMMENT = "@tvm-bot merge"
+    USER = "abc"
+    EXPECTED = "Cannot merge, these CI jobs are not successful on"
+
+    def preprocess_data(self, data: Dict[str, Any]):
+        # Mark the Jenkins build as failed
+        contexts = data["commits"]["nodes"][0]["commit"]["statusCheckRollup"]["contexts"]["nodes"]
+        for context in contexts:
+            if "context" in context and context["context"] == "tvm-ci/pr-head":
+                context["state"] = "FAILED"
+        return data
+
+
+class TestOldReview(_TvmBotTest):
+    """
+    A PR with passing CI and approving reviews on an old commit so it cannot merge
+    """
+
+    COMMENT = "@tvm-bot merge"
+    USER = "abc"
+    EXPECTED = "Cannot merge, did not find any approving reviews"
+
+    def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        data["reviews"]["nodes"][0]["commit"]["oid"] = "abc12345"
+        return data
+
+
+class TestMissingJob(_TvmBotTest):
+    """
+    PR missing an expected CI job and cannot merge
+    """
+
+    COMMENT = "@tvm-bot merge"
+    USER = "abc"
+    EXPECTED = "Cannot merge, missing expected jobs"
+
+    def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        contexts = data["commits"]["nodes"][0]["commit"]["statusCheckRollup"]["contexts"]["nodes"]
+        for context in contexts:
+            if "context" in context and context["context"] == "tvm-ci/pr-head":
+                context["context"] = "something"
+        return data
+
+
+class TestInvalidAuthor(_TvmBotTest):
+    """
+    Merge requester is not a committer and cannot merge
+    """
+
+    COMMENT = "@tvm-bot merge"
+    USER = "not-abc"
+    EXPECTED = "Failed auth check 'collaborators', quitting"
 
 
-@tvm.testing.skip_if_wheel_test
-@pytest.mark.parametrize(
-    ["number", "filename", "expected", "comment", "user", "detail"],
-    [tuple(d.values()) for d in TEST_DATA.values()],
-    ids=TEST_DATA.keys(),
-)
-def test_tvmbot(tmpdir_factory, number, filename, expected, comment, user, detail):
-    """
-    Test the mergebot test cases
-    """
-    mergebot_script = REPO_ROOT / "ci" / "scripts" / "github_tvmbot.py"
-    test_json_dir = Path(__file__).resolve().parent / "sample_prs"
-
-    git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-    git.run("init", stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-    git.run("checkout", "-b", "main", stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-    git.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
-    with open(test_json_dir / filename) as f:
-        test_data = json.load(f)
-
-    comment = {
-        "body": comment,
-        "id": 123,
-        "user": {
-            "login": user,
-        },
-    }
-    allowed_users = [{"login": "abc"}]
-
-    proc = subprocess.run(
-        [
-            str(mergebot_script),
-            "--pr",
-            str(number),
-            "--dry-run",
-            "--run-url",
-            "https://example.com",
-            "--testing-pr-json",
-            json.dumps(test_data),
-            "--testing-collaborators-json",
-            json.dumps(allowed_users),
-            "--testing-mentionable-users-json",
-            json.dumps(allowed_users),
-            "--trigger-comment-json",
-            json.dumps(comment),
-        ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        encoding="utf-8",
-        env={
-            "TVM_BOT_JENKINS_TOKEN": "123",
-            "GH_ACTIONS_TOKEN": "123",
-        },
-        cwd=git.cwd,
-        check=False,
-    )
-    if proc.returncode != 0:
-        raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}")
-
-    if expected not in proc.stderr:
-        raise RuntimeError(f"{proc.stderr}\ndid not contain\n{expected}")
+class TestUnauthorizedComment(_TvmBotTest):
+    """
+    Check that a merge comment not from a CONTRIBUTOR is rejected
+    """
+
+    COMMENT = "@tvm-bot merge"
+    USER = "not-abc2"
+    EXPECTED = "Failed auth check 'collaborators'"
+
+
+class TestNoReview(_TvmBotTest):
+    """
+    Check that a merge request without any reviews is rejected
+    """
+
+    COMMENT = "@tvm-bot merge"
+    USER = "abc"
+    EXPECTED = "Cannot merge, did not find any approving reviews from users with write access"
+
+    def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        data["reviews"]["nodes"] = []
+        return data
+
+
+class TestChangesRequested(_TvmBotTest):
+    """
+    Check that a merge request with a 'Changes Requested' review is rejected
+    """
+
+    COMMENT = "@tvm-bot merge"
+    USER = "abc"
+    EXPECTED = "Cannot merge, found [this review]"
+
+    def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        data["reviews"]["nodes"][0]["state"] = "CHANGES_REQUESTED"
+        data["reviews"]["nodes"][0]["url"] = "http://example.com"
+        return data
+
+
+class TestCoAuthors(_TvmBotTest):
+    """
+    Check that a merge request with co-authors generates the correct commit message
+    """
+
+    COMMENT = "@tvm-bot merge"
+    USER = "abc"
+    EXPECTED = "Co-authored-by: Some One <someone@email.com>"
+
+    def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        data["authorCommits"]["nodes"][0]["commit"]["authors"]["nodes"].append(
+            {"name": "Some One", "email": "someone@email.com"}
+        )
+        return data
+
+
+class TestRerunCI(_TvmBotTest):
+    """
+    Start a new CI job
+    """
+
+    COMMENT = "@tvm-bot rerun"
+    USER = "abc"
+    EXPECTED = "Rerunning ci with"
+
+
+class TestRerunPermissions(_TvmBotTest):
+    """
+    Start a new CI job as an unauthorized user
+    """
+
+    COMMENT = "@tvm-bot rerun"
+    USER = "someone"
+    EXPECTED = "Failed auth check 'metionable_users', quitting"
+
+
+class TestRerunNonAuthor(_TvmBotTest):
+    """
+    Start a new CI job as a mentionable user
+    """
+
+    COMMENT = "@tvm-bot rerun"
+    USER = "other-abc"
+    EXPECTED = "Passed auth check 'metionable_users', continuing"
+
+
+class TestIgnoreJobs(_TvmBotTest):
+    """
+    Ignore GitHub Actions jobs that don't start with CI /
+    """
+
+    COMMENT = "@tvm-bot merge"
+    USER = "abc"
+    EXPECTED = "Dry run, would have merged"
 
 
 if __name__ == "__main__":
diff --git a/tests/python/ci/test_utils.py b/tests/python/ci/test_utils.py
index 513601aa1b46..4a0f2710e74a 100644
--- a/tests/python/ci/test_utils.py
+++ b/tests/python/ci/test_utils.py
@@ -19,19 +19,28 @@
 """
 import subprocess
 import pathlib
+from typing import List, Any
 
 REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent
 
 
 class TempGit:
     """
-    A wrapper to run commands in a directory
+    A wrapper to run commands in a directory (specifically for use in CI tests)
     """
 
     def __init__(self, cwd):
         self.cwd = cwd
+        # Jenkins git is too old and doesn't have 'git init --initial-branch',
+        # so init and checkout need to be separate steps
+        self.run("init", stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+        self.run("checkout", "-b", "main", stderr=subprocess.PIPE)
+        self.run("remote", "add", "origin", "https://github.com/apache/tvm.git")
 
     def run(self, *args, **kwargs):
+        """
+        Run a git command based on *args
+        """
         proc = subprocess.run(
             ["git"] + list(args), encoding="utf-8", cwd=self.cwd, check=False, **kwargs
         )
@@ -39,3 +48,25 @@ def run(self, *args, **kwargs):
             raise RuntimeError(f"git command failed: '{args}'")
 
         return proc
+
+
+def run_script(command: List[Any], check: bool = True, **kwargs):
+    """
+    Wrapper to run a script and print its output if there was an error
+    """
+    command = [str(c) for c in command]
+    kwargs_to_send = {
+        "stdout": subprocess.PIPE,
+        "stderr": subprocess.PIPE,
+        "encoding": "utf-8",
+    }
+    kwargs_to_send.update(kwargs)
+    proc = subprocess.run(
+        command,
+        check=False,
+        **kwargs_to_send,
+    )
+    if check and proc.returncode != 0:
+        raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}")
+
+    return proc

From 0cbf3aa6e22e77a62256e35a9eef4dbe327b6fa0 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Fri, 2 Sep 2022 14:27:45 -0700
Subject: [PATCH 101/704] [AutoTVM][Testing] Add `tune_relay` scripts (#12685)

Example:

```bash
python -m tvm.autotvm.testing.tune_relay  \
       --workload bert_base               \
       --input-shape '[1,64]'             \
       --target "llvm"                    \
       --num-trials 800                   \
       --rpc-host 192.168.6.66            \
       --rpc-port 4445                    \
       --rpc-key 3090ti                   \
       --work-dir /logs/autotvm-bert_base \
       --cache-dir /cache-workloads       \
       --graph-tuner True                 \
       --cpu-flush True                   \
       --backend graph
```
---
 python/tvm/autotvm/testing/__init__.py   |  17 ++
 python/tvm/autotvm/testing/tune_relay.py | 263 +++++++++++++++++++++++
 2 files changed, 280 insertions(+)
 create mode 100644 python/tvm/autotvm/testing/__init__.py
 create mode 100644 python/tvm/autotvm/testing/tune_relay.py

diff --git a/python/tvm/autotvm/testing/__init__.py b/python/tvm/autotvm/testing/__init__.py
new file mode 100644
index 000000000000..972d0cbaae5c
--- /dev/null
+++ b/python/tvm/autotvm/testing/__init__.py
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Testing utilities for autotvm"""
diff --git a/python/tvm/autotvm/testing/tune_relay.py b/python/tvm/autotvm/testing/tune_relay.py
new file mode 100644
index 000000000000..e4745963741f
--- /dev/null
+++ b/python/tvm/autotvm/testing/tune_relay.py
@@ -0,0 +1,263 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-docstring
+import argparse
+import json
+import os
+import warnings
+from distutils.util import strtobool
+
+import tvm
+from tvm import autotvm
+from tvm import meta_schedule as ms
+from tvm import relay
+from tvm.autotvm.graph_tuner import DPTuner
+from tvm.autotvm.tuner import XGBTuner
+from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
+from tvm.meta_schedule.testing.relay_workload import get_network
+from tvm.meta_schedule.testing.tune_utils import create_timer, generate_input_data
+from tvm.support import describe
+
+
+def _parse_args():
+    args = argparse.ArgumentParser()
+    args.add_argument(
+        "--workload",
+        type=str,
+        required=True,
+        help="The name of the workload to tune. Supported models: "
+        "https://github.com/apache/tvm/blob/main/python/tvm/meta_schedule/testing/relay_workload.py#L303-L322",  # pylint: disable=line-too-long
+    )
+    args.add_argument(
+        "--input-shape",
+        type=str,
+        required=True,
+        help="The input shape of the workload. Example: '[1, 3, 224, 224]'",
+    )
+    args.add_argument(
+        "--target",
+        type=str,
+        required=True,
+        help="The target device to tune. "
+        "Example: 'aws/cpu/c5.9xlarge', 'nvidia/nvidia-v100', 'nvidia/geforce-rtx-3090'",
+    )
+    args.add_argument(
+        "--num-trials",
+        type=int,
+        required=True,
+        help="The number of trials per kernel. Example: 800",
+    )
+    args.add_argument(
+        "--rpc-host",
+        type=str,
+        required=True,
+        help="The host address of the RPC tracker. Example: 192.168.6.66",
+    )
+    args.add_argument(
+        "--rpc-port",
+        type=int,
+        required=True,
+        help="The port of the RPC tracker. Example: 4445",
+    )
+    args.add_argument(
+        "--rpc-key",
+        type=str,
+        required=True,
+        help="The key of the RPC tracker. Example: '3090ti'",
+    )
+    args.add_argument(
+        "--work-dir",
+        type=str,
+        required=True,
+        help="The working directory to store the tuning logs. Example: '/tmp/tune_relay'",
+    )
+    args.add_argument(
+        "--layout",
+        type=str,
+        default=None,
+        help="The layout of the workload. Example: 'NCHW', 'NHWC'",
+    )
+    args.add_argument(
+        "--cache-dir",
+        type=str,
+        default=None,
+    )
+    args.add_argument(
+        "--number",
+        type=int,
+        default=3,
+    )
+    args.add_argument(
+        "--repeat",
+        type=int,
+        default=1,
+    )
+    args.add_argument(
+        "--min-repeat-ms",
+        type=int,
+        default=100,
+    )
+    args.add_argument(
+        "--cpu-flush",
+        type=lambda x: bool(strtobool(x)),
+        help="example: True / False",
+        required=True,
+    )
+    args.add_argument(
+        "--graph-tuner",
+        type=lambda x: bool(strtobool(x)),
+        help="example: True / False",
+        required=True,
+    )
+    args.add_argument(
+        "--backend",
+        type=str,
+        choices=["graph", "vm"],
+        help="example: graph / vm",
+        required=True,
+    )
+    parsed = args.parse_args()
+    parsed.target = tvm.target.Target(parsed.target)
+    parsed.input_shape = json.loads(parsed.input_shape)
+    parsed.rpc_config = ms.runner.RPCConfig(
+        tracker_host=parsed.rpc_host,
+        tracker_port=parsed.rpc_port,
+        tracker_key=parsed.rpc_key,
+        session_timeout_sec=600,
+    )
+    if ARGS.target.kind.name != "llvm" and ARGS.graph_tuner:
+        raise ValueError("GraphTuner only supports llvm target")
+    if ARGS.target.kind.name != "llvm" and ARGS.cpu_flush:
+        raise ValueError("cpu_flush only supports llvm target")
+    if ARGS.target.kind.name == "llvm" and not ARGS.cpu_flush:
+        warnings.warn("cpu_flush is not enabled for llvm target")
+    return parsed
+
+
+ARGS = _parse_args()
+
+
+def main():
+    log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json")
+    graph_opt_sch_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}_graph_opt.log")
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.RPCRunner(
+            key=ARGS.rpc_key,
+            host=ARGS.rpc_host,
+            port=ARGS.rpc_port,
+            number=ARGS.number,
+            repeat=ARGS.repeat,
+            min_repeat_ms=ARGS.min_repeat_ms,
+            enable_cpu_cache_flush=ARGS.cpu_flush,
+        ),
+    )
+    describe()
+    print(f"Workload: {ARGS.workload}")
+    mod, params, (input_name, input_shape, input_dtype) = get_network(
+        ARGS.workload,
+        ARGS.input_shape,
+        layout=ARGS.layout,
+        cache_dir=ARGS.cache_dir,
+    )
+    input_info = [
+        {
+            "name": input_name,
+            "shape": input_shape,
+            "dtype": input_dtype,
+        },
+    ]
+    input_data = {
+        item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in input_info
+    }
+    for item in input_info:
+        print(f"  input_name : {item['name']}")
+        print(f"  input_shape: {item['shape']}")
+        print(f"  input_dtype: {item['dtype']}")
+
+    with ms.Profiler() as profiler:
+        with ms.Profiler.timeit("TaskExtraction"):
+            # extract workloads from relay program
+            tasks = autotvm.task.extract_from_program(
+                mod["main"],
+                target=ARGS.target,
+                params=params,
+                ops=(
+                    relay.op.get("nn.conv2d"),
+                    relay.op.get("nn.conv3d"),
+                    relay.op.get("nn.conv2d_transpose"),
+                    relay.op.get("nn.dense"),
+                    relay.op.get("nn.batch_matmul"),
+                ),
+            )
+            for i, task in enumerate(tasks):
+                print(f"Task {i} {task.name}: {task}")
+
+        with ms.Profiler.timeit("Tuning"):
+            if ARGS.num_trials > 0:
+                for i, task in enumerate(tasks):
+                    prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
+                    tuner_obj = XGBTuner(task, loss_type="rank")
+                    n_trial = min(len(task.config_space), ARGS.num_trials)
+                    tuner_obj.tune(
+                        n_trial=n_trial,
+                        early_stopping=800,
+                        measure_option=measure_option,
+                        callbacks=[
+                            autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                            autotvm.callback.log_to_file(log_file),
+                        ],
+                    )
+                if ARGS.graph_tuner:
+                    executor = DPTuner(
+                        graph=mod["main"],
+                        input_shapes={input_name: input_shape},
+                        records=log_file,
+                        target_ops=[
+                            relay.op.get("nn.conv2d"),
+                        ],
+                        target=ARGS.target,
+                    )
+                    executor.benchmark_layout_transform(min_exec_num=1000)
+                    executor.run()
+                    executor.write_opt_sch2record_file(graph_opt_sch_file)
+
+        relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend]
+        with ms.Profiler.timeit("PostTuningCompilation"):
+            if ARGS.graph_tuner:
+                ctx = autotvm.apply_graph_best(graph_opt_sch_file)
+            else:
+                ctx = autotvm.apply_history_best(log_file)
+            with ctx:
+                print("compile...")
+                with tvm.transform.PassContext(opt_level=3):
+                    lib = relay_build(mod, target=ARGS.target, params=params)
+    print("Tuning Time:")
+    print(profiler.table())
+
+    run_module_via_rpc(
+        rpc_config=ARGS.rpc_config,
+        lib=lib,
+        dev_type=ARGS.target.kind.name,
+        args=input_data,
+        continuation=create_timer(ARGS.backend),
+        backend=ARGS.backend,
+    )
+
+
+if __name__ == "__main__":
+    main()

From 4ed6564f764eea10af360a3e4bfb904061f5cc32 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 2 Sep 2022 15:01:22 -0700
Subject: [PATCH 102/704] [ci] Add tests for PR linter (#12680)

This adds some checks for the current usages of the PR linter and fixes the case where the script would error uncleanly when a PR body was `null`.
---
 ci/scripts/check_pr.py     | 17 ++++++---------
 tests/python/ci/test_ci.py | 43 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 11 deletions(-)
 mode change 100644 => 100755 ci/scripts/check_pr.py

diff --git a/ci/scripts/check_pr.py b/ci/scripts/check_pr.py
old mode 100644
new mode 100755
index 45d502c6a72e..9af5ec5580a3
--- a/ci/scripts/check_pr.py
+++ b/ci/scripts/check_pr.py
@@ -18,6 +18,7 @@
 import argparse
 import re
 import os
+import json
 import textwrap
 from dataclasses import dataclass
 from typing import Any, List, Callable
@@ -108,10 +109,7 @@ def run_checks(checks: List[Check], s: str, name: str) -> bool:
     parser.add_argument("--pr", required=True)
     parser.add_argument("--remote", default="origin", help="ssh remote to parse")
     parser.add_argument(
-        "--pr-body", help="(testing) PR body to use instead of fetching from GitHub"
-    )
-    parser.add_argument(
-        "--pr-title", help="(testing) PR title to use instead of fetching from GitHub"
+        "--pr-data", help="(testing) PR data to use instead of fetching from GitHub"
     )
     args = parser.parse_args()
 
@@ -121,20 +119,17 @@ def run_checks(checks: List[Check], s: str, name: str) -> bool:
         print(f"PR was not a number: {args.pr}")
         exit(0)
 
-    if args.pr_body:
-        body = args.pr_body
-        title = args.pr_title
+    if args.pr_data:
+        pr = json.loads(args.pr_data)
     else:
         remote = git(["config", "--get", f"remote.{args.remote}.url"])
         user, repo = parse_remote(remote)
 
         github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo)
         pr = github.get(f"pulls/{args.pr}")
-        body = pr["body"]
-        title = pr["title"]
 
-    body = body.strip()
-    title = title.strip()
+    body = "" if pr["body"] is None else pr["body"].strip()
+    title = "" if pr["title"] is None else pr["title"].strip()
 
     title_passed = run_checks(checks=title_checks, s=title, name="PR title")
     print("")
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index f2e686d1e582..79c72ce988c3 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -1144,5 +1144,48 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec
     assert proc.returncode == expected_code
 
 
+@parameterize_named(
+    passing=dict(
+        title="[something] a change",
+        body="something",
+        expected="All checks passed",
+        expected_code=0,
+    ),
+    period=dict(
+        title="[something] a change.",
+        body="something",
+        expected="trailing_period: FAILED",
+        expected_code=1,
+    ),
+    empty_body=dict(
+        title="[something] a change",
+        body=None,
+        expected="non_empty: FAILED",
+        expected_code=1,
+    ),
+)
+def test_pr_linter(title, body, expected, expected_code):
+    """
+    Test the PR linter
+    """
+    tag_script = REPO_ROOT / "ci" / "scripts" / "check_pr.py"
+    pr_data = {
+        "title": title,
+        "body": body,
+    }
+    proc = run_script(
+        [
+            tag_script,
+            "--pr",
+            1234,
+            "--pr-data",
+            json.dumps(pr_data),
+        ],
+        check=False,
+    )
+    assert proc.returncode == expected_code
+    assert_in(expected, proc.stdout)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 2734d044a24bdfcdab1fb473d07b93f4ed6b64eb Mon Sep 17 00:00:00 2001
From: Alexey Voronov <alexey.voronov@deelvin.com>
Date: Sat, 3 Sep 2022 01:42:59 +0300
Subject: [PATCH 103/704] [Adreno] Define memory_info for global.texture*
 (#12647)

There are now many warnings in the tuning process about undefined memory information when using textures. A definition is required as textures* are tagged.
---
 include/tvm/target/target_info.h |  6 +++---
 python/tvm/topi/adreno/utils.py  | 14 ++++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/tvm/target/target_info.h b/include/tvm/target/target_info.h
index 1de15a5bd526..946161f905f3 100644
--- a/include/tvm/target/target_info.h
+++ b/include/tvm/target/target_info.h
@@ -37,11 +37,11 @@ namespace tvm {
 class MemoryInfoNode : public Object {
  public:
   /*! \brief The addressable unit */
-  int unit_bits;
+  int64_t unit_bits;
   /*! \brief Maximum number of bits supported in the memory */
-  int max_num_bits;
+  int64_t max_num_bits;
   /*! \brief maximum number of bits to be used in simd op */
-  int max_simd_bits;
+  int64_t max_simd_bits;
   /*!
    * \brief head address of the buffer, if visible to CPU
    *  This address can be None.
diff --git a/python/tvm/topi/adreno/utils.py b/python/tvm/topi/adreno/utils.py
index 6ad5271744b2..de0505af03d4 100644
--- a/python/tvm/topi/adreno/utils.py
+++ b/python/tvm/topi/adreno/utils.py
@@ -20,6 +20,7 @@
 import tvm
 import numpy
 from tvm import te
+from tvm._ffi.registry import register_func
 from tvm.topi.utils import simplify
 from tvm.topi import nn
 from tvm.autotvm.task.space import SplitEntity
@@ -571,6 +572,19 @@ def get_texture_storage(shape):
         return "global.texture-weight"
 
 
+@register_func("tvm.info.mem.global.texture")
+@register_func("tvm.info.mem.global.texture-nhwc")
+@register_func("tvm.info.mem.global.texture-weight")
+def mem_info_global_texture_variants():
+    return tvm.ir.make_node(
+        "MemoryInfo",
+        unit_bits=16,
+        max_num_bits=16384 * 16384 * 4 * 32,
+        max_simd_bits=4 * 32,
+        head_address=None,
+    )
+
+
 def infer_tile_size(data, layout):
     """Compute the tile size for Winograd algorithm
 

From 28cad58fd06f6fd395390f5a33c81acda4c27d12 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 2 Sep 2022 16:43:31 -0700
Subject: [PATCH 104/704] [Web][Emscripten] Update EMCC C++ standard to C++17
 (#12693)

As a follow-up to https://github.com/apache/tvm/pull/12337, updating
the EMCC flags from `-std=c++14` to `-std=c++17`.
---
 web/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/Makefile b/web/Makefile
index 34a1b8172484..d6adc94170fc 100644
--- a/web/Makefile
+++ b/web/Makefile
@@ -26,7 +26,7 @@ all: dist/wasm/tvmjs_runtime.wasm dist/wasm/tvmjs_runtime.wasi.js
 
 EMCC = emcc
 
-EMCC_CFLAGS = $(INCLUDE_FLAGS) -O3 -std=c++14 -Wno-ignored-attributes --no-entry \
+EMCC_CFLAGS = $(INCLUDE_FLAGS) -O3 -std=c++17 -Wno-ignored-attributes --no-entry \
 	-s ALLOW_MEMORY_GROWTH=1 -s STANDALONE_WASM=1 -s ERROR_ON_UNDEFINED_SYMBOLS=0 
 
 EMCC_LDFLAGS = --pre-js emcc/preload.js

From 5dcf62288b1d998df74ac36e48fcfe2424a0def8 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Mon, 5 Sep 2022 09:27:03 +0100
Subject: [PATCH 105/704] [ETHOSN] Use pytest parameterization for integration
 tests (#12688)

Using pytest parameterization helps identify the particular parameter combinations that are failing for a given test. Additionally, it can be useful when parallelizing the tests. This commit makes sure that "trials" have been replaced by parameterization as well as completing a general cleanup.
---
 .../python/contrib/test_ethosn/test_conv2d.py | 399 ++++++++++--------
 .../test_ethosn/test_depth_to_space.py        |  59 ++-
 .../test_ethosn/test_fullyconnected.py        |  95 ++---
 .../contrib/test_ethosn/test_pooling.py       |  77 ++--
 tests/python/contrib/test_ethosn/test_relu.py |  71 ++--
 .../python/contrib/test_ethosn/test_resize.py |  42 +-
 .../contrib/test_ethosn/test_sigmoid.py       |  82 ++--
 .../python/contrib/test_ethosn/test_split.py  |  59 ++-
 .../contrib/test_ethosn/test_topologies.py    |  73 ++--
 9 files changed, 492 insertions(+), 465 deletions(-)

diff --git a/tests/python/contrib/test_ethosn/test_conv2d.py b/tests/python/contrib/test_ethosn/test_conv2d.py
index ffe66f0d2be2..4026f8267d72 100644
--- a/tests/python/contrib/test_ethosn/test_conv2d.py
+++ b/tests/python/contrib/test_ethosn/test_conv2d.py
@@ -18,11 +18,14 @@
 """Arm(R) Ethos(TM)-N integration conv2d tests"""
 
 import math
+
 import numpy as np
 import pytest
+
 import tvm
 from tvm import relay
 from tvm.testing import requires_ethosn
+
 from . import infrastructure as tei
 
 
@@ -99,12 +102,12 @@ def _get_model(
         padding=p if pad in ("attr", "both") else (0, 0, 0, 0),
         out_dtype="int32",
     )
-    b = tvm.nd.array(
+    bias_data = tvm.nd.array(
         np.random.randint(
             np.iinfo(dtype).min, high=np.iinfo(dtype).max + 1, size=(out_channels,), dtype="int32"
         )
     )
-    biasc = relay.const(b, "int32")
+    biasc = relay.const(bias_data, "int32")
     bias = relay.nn.bias_add(conv, biasc, axis=3)
     if isinstance(kernel_sc, tvm.runtime.ndarray.NDArray):
         req_input_sc = [sc * input_sc for sc in kernel_sc.numpy()]
@@ -118,209 +121,222 @@ def _get_model(
         relay.const(output_zp, "int32"),  # output zero point
         out_dtype=dtype,
     )
-    params = {"w": weights_array, "b": b}
+    params = {"w": weights_array, "b": bias_data}
     return req, params
 
 
 @requires_ethosn
-@pytest.mark.parametrize("depthwise", [False, True])
-@pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_conv2d(dtype, depthwise):
+@pytest.mark.parametrize(
+    "dtype,qnn_per_channel", [("uint8", False), ("int8", False), ("int8", True)]
+)
+@pytest.mark.parametrize("pad,stride", [("attr", (2, 2)), ("none", (2, 2)), ("op", (1, 1))])
+@pytest.mark.parametrize(
+    "shape,out_channels,kernel_size",
+    [
+        [(1, 17, 20, 26), 4, (3, 1)],
+        [(1, 9, 20, 30), 7, (1, 5)],
+        [(1, 21, 21, 22), 8, (2, 2)],
+    ],
+)
+def test_conv2d(
+    dtype,
+    shape,
+    out_channels,
+    kernel_size,
+    pad,
+    stride,
+    qnn_per_channel,
+):
     """Compare Conv2D output with TVM."""
-
-    trials = [
-        [(1, 17, 20, 26), 4, 3, 1, "attr", (2, 2), (1, 1), False],
-        [(1, 30, 27, 30), 5, 5, 3, "none", (1, 1), (1, 1), False],
-        [(1, 30, 27, 30), 5, 5, 3, "none", (1, 1), (1, 1), dtype == "int8"],
-        [(1, 14, 28, 11), 6, 2, 2, "op", (2, 2), (1, 1), False],
-        [(1, 9, 20, 30), 7, 1, 5, "none", (1, 1), (1, 1), False],
-        [(1, 21, 21, 22), 8, 5, 1, "attr", (2, 2), (1, 1), False],
-        [(1, 21, 21, 22), 8, 5, 1, "attr", (2, 2), (1, 1), dtype == "int8"],
-        [(1, 21, 25, 29), 9, 2, 5, "op", (1, 1), (1, 1), False],
-        [(1, 21, 25, 29), 9, 2, 5, "op", (1, 1), (1, 1), dtype == "int8"],
-        [(1, 31, 28, 15), 10, 1, 2, "attr", (2, 2), (1, 1), False],
-        [(1, 21, 21, 8), 11, 3, 3, "none", (1, 1), (1, 1), False],
-        [(1, 5, 11, 6), 12, 5, 2, "op", (2, 2), (1, 1), False],
-        [(1, 12, 7, 18), 13, 1, 3, "op", (1, 1), (1, 1), False],
-        [(1, 24, 6, 26), 14, 3, 5, "none", (2, 2), (1, 1), False],
-        [(1, 19, 24, 16), 15, 2, 1, "attr", (1, 1), (1, 1), False],
-    ]
-
     np.random.seed(0)
-    for shape, out_channels, kernel_h, kernel_w, pad, stride, dilation, qnn_per_channel in trials:
-        if depthwise:
-            out_channels = shape[3]
-            groups = out_channels
-            kernel_w = kernel_h
-            weight_format = "HWOI"
-            stride = (1, 1) if kernel_w == 1 else (2, 2)
-        else:
-            groups = 1
-            weight_format = "HWIO"
 
-        outputs = []
-        inputs = {
-            "a": tvm.nd.array(
-                np.random.randint(
-                    np.iinfo(dtype).min,
-                    np.iinfo(dtype).max + 1,
-                    size=shape,
-                    dtype=dtype,
-                )
-            ),
-        }
-        input_zp = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max)
-        input_sc = np.random.random() * 2
-        if qnn_per_channel:
-            kernel_sc = tvm.nd.array(
-                np.random.uniform(low=0, high=2, size=(out_channels,)).astype(np.float32)
+    dilation = (1, 1)
+    groups = 1
+    weight_format = "HWIO"
+
+    outputs = []
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(
+                np.iinfo(dtype).min,
+                np.iinfo(dtype).max + 1,
+                size=shape,
+                dtype=dtype,
             )
-        else:
-            kernel_sc = np.random.random() * 2
-        kernel_zp = (
-            0 if dtype == "int8" else np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max)
-        )
-        output_zp, output_sc = tei.get_conv2d_qnn_params(
-            dtype, input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, shape[3]
-        )
-        model, params = _get_model(
-            shape,
-            kernel_h,
-            kernel_w,
-            input_zp,
-            input_sc,
-            kernel_zp,
-            kernel_sc,
-            output_zp,
-            output_sc,
-            pad,
-            stride,
-            dilation,
-            groups,
-            dtype,
-            out_channels,
-            weight_format,
+        ),
+    }
+    input_zp = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max)
+    input_sc = np.random.random() * 2
+    if qnn_per_channel:
+        kernel_sc = tvm.nd.array(
+            np.random.uniform(low=0, high=2, size=(out_channels,)).astype(np.float32)
         )
-        for npu in [False, True]:
-            mod = tei.make_module(model, params)
-            outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+    else:
+        kernel_sc = np.random.random() * 2
+    kernel_zp = (
+        0 if dtype == "int8" else np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max)
+    )
+    output_zp, output_sc = tei.get_conv2d_qnn_params(
+        dtype, input_zp, input_sc, kernel_zp, kernel_sc, kernel_size[0], kernel_size[1], shape[3]
+    )
+    model, params = _get_model(
+        shape,
+        kernel_size[0],
+        kernel_size[1],
+        input_zp,
+        input_sc,
+        kernel_zp,
+        kernel_sc,
+        output_zp,
+        output_sc,
+        pad,
+        stride,
+        dilation,
+        groups,
+        dtype,
+        out_channels,
+        weight_format,
+    )
+    for npu in [False, True]:
+        mod = tei.make_module(model, params)
+        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
 
-        tei.verify(outputs, dtype, 1)
+    tei.verify(outputs, dtype, 1)
 
 
 @requires_ethosn
-def test_conv2d_failure():
-    """Check Conv2D error messages."""
+@pytest.mark.parametrize(
+    "dtype,qnn_per_channel", [("uint8", False), ("int8", False), ("int8", True)]
+)
+@pytest.mark.parametrize("pad,stride", [("attr", (2, 2)), ("none", (2, 2)), ("op", (1, 1))])
+@pytest.mark.parametrize(
+    "shape,kernel_size",
+    [
+        [(1, 17, 20, 28), (3, 3)],
+        [(1, 9, 20, 30), (5, 5)],
+        [(1, 21, 21, 22), (2, 2)],
+    ],
+)
+def test_conv2d_depthwise(
+    dtype,
+    shape,
+    kernel_size,
+    pad,
+    stride,
+    qnn_per_channel,
+):
+    """Compare Conv2D output with TVM."""
+    np.random.seed(0)
 
-    trials = [
-        (
-            (1, 4, 4, 4),
-            1,
-            1,
-            0,
-            1024,
-            0,
-            1024,
-            0,
-            1,
-            "none",
-            (1, 1),
-            (1, 1),
-            1,
-            "uint8",
-            8,
-            "HWIO",
-            "Overall scale (of the input * weights / output) should be in the range (2^-32, 65536)",
+    dilation = (1, 1)
+    out_channels = shape[3]
+    groups = out_channels
+    weight_format = "HWOI"
+
+    outputs = []
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(
+                np.iinfo(dtype).min,
+                np.iinfo(dtype).max + 1,
+                size=shape,
+                dtype=dtype,
+            )
         ),
+    }
+    input_zp = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max)
+    input_sc = np.random.random() * 2
+    if qnn_per_channel:
+        kernel_sc = tvm.nd.array(
+            np.random.uniform(low=0, high=2, size=(out_channels,)).astype(np.float32)
+        )
+    else:
+        kernel_sc = np.random.random() * 2
+    kernel_zp = (
+        0 if dtype == "int8" else np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max)
+    )
+    output_zp, output_sc = tei.get_conv2d_qnn_params(
+        dtype, input_zp, input_sc, kernel_zp, kernel_sc, kernel_size[0], kernel_size[1], shape[3]
+    )
+    model, params = _get_model(
+        shape,
+        kernel_size[0],
+        kernel_size[1],
+        input_zp,
+        input_sc,
+        kernel_zp,
+        kernel_sc,
+        output_zp,
+        output_sc,
+        pad,
+        stride,
+        dilation,
+        groups,
+        dtype,
+        out_channels,
+        weight_format,
+    )
+    for npu in [False, True]:
+        mod = tei.make_module(model, params)
+        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+
+    tei.verify(outputs, dtype, 1)
+
+
+@requires_ethosn
+@pytest.mark.parametrize(
+    "shape,pad,stride,dilation,err_msg",
+    [
         (
             (1, 4, 4, 4),
-            2,
-            2,
-            0,
-            1,
-            0,
-            1,
-            0,
-            2,
             "both",
             (1, 1),
             (1, 1),
-            1,
-            "uint8",
-            8,
-            "HWIO",
             "both op and attr padding exist, must be either op/attr only or no padding",
         ),
         (
             (1, 4, 4, 4),
-            1,
-            1,
-            0,
-            1,
-            0,
-            1,
-            0,
-            2,
             "none",
             (1, 1, 1),
             (1, 1),
-            1,
-            "uint8",
-            8,
-            "HWIO",
             "stride size=3, stride size must = 2",
         ),
         (
             (1, 4, 4, 4),
-            1,
-            1,
-            0,
-            1,
-            0,
-            1,
-            0,
-            2,
             "none",
             (1, 1),
             (2, 1),
-            1,
-            "uint8",
-            8,
-            "HWIO",
             "dilation=[2, 1], dilation must = [1, 1]",
         ),
         (
             (2, 4, 4, 4),
-            1,
-            1,
-            0,
-            1,
-            0,
-            1,
-            0,
-            2,
             "none",
             (1, 1),
             (1, 1),
-            1,
-            "uint8",
-            8,
-            "HWIO",
             "batch size=2, batch size must = 1",
         ),
-    ]
-
+    ],
+)
+def test_conv2d_failure(shape, pad, stride, dilation, err_msg):
+    """Check Conv2D error messages."""
     np.random.seed(0)
-    for (
+
+    kernel_size = (2, 2)
+    groups = 1
+    dtype = "uint8"
+    out_channels = 8
+    weight_format = "HWIO"
+
+    model, _ = _get_model(
         shape,
-        kernel_h,
-        kernel_w,
-        input_zp,
-        input_sc,
-        kernel_zp,
-        kernel_sc,
-        output_zp,
-        output_sc,
+        kernel_size[0],
+        kernel_size[1],
+        0,
+        1,
+        0,
+        1,
+        0,
+        1,
         pad,
         stride,
         dilation,
@@ -328,26 +344,43 @@ def test_conv2d_failure():
         dtype,
         out_channels,
         weight_format,
-        err_msg,
-    ) in trials:
-        model, _ = _get_model(
-            shape,
-            kernel_h,
-            kernel_w,
-            input_zp,
-            input_sc,
-            kernel_zp,
-            kernel_sc,
-            output_zp,
-            output_sc,
-            pad,
-            stride,
-            dilation,
-            groups,
-            dtype,
-            out_channels,
-            weight_format,
-        )
-        model = tei.make_ethosn_composite(model, "ethos-n.qnn_conv2d")
-        mod = tei.make_ethosn_partition(model)
-        tei.test_error(mod, {}, err_msg)
+    )
+    model = tei.make_ethosn_composite(model, "ethos-n.qnn_conv2d")
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)
+
+
+@requires_ethosn
+def test_conv2d_out_of_range_scale():
+    """Check Conv2D scale out of range error."""
+    np.random.seed(0)
+
+    input_sc = 1024
+    kernel_sc = 1024
+    output_sc = 1
+
+    model, _ = _get_model(
+        (1, 4, 4, 4),
+        1,
+        1,
+        0,
+        input_sc,
+        0,
+        kernel_sc,
+        0,
+        output_sc,
+        "none",
+        (1, 1),
+        (1, 1),
+        1,
+        "uint8",
+        8,
+        "HWIO",
+    )
+    model = tei.make_ethosn_composite(model, "ethos-n.qnn_conv2d")
+    mod = tei.make_ethosn_partition(model)
+
+    expected_err_msg = (
+        "Overall scale (of the input * weights / output) should be in the range (2^-32, 65536)"
+    )
+    tei.test_error(mod, {}, expected_err_msg)
diff --git a/tests/python/contrib/test_ethosn/test_depth_to_space.py b/tests/python/contrib/test_ethosn/test_depth_to_space.py
index c071fe00f212..732932d8f324 100644
--- a/tests/python/contrib/test_ethosn/test_depth_to_space.py
+++ b/tests/python/contrib/test_ethosn/test_depth_to_space.py
@@ -33,37 +33,35 @@ def _get_model(shape, block, dtype, layout):
 
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_depth_to_space(dtype):
-    """Compare Depth To Space output with TVM."""
-
-    trials = [
+@pytest.mark.parametrize(
+    "shape",
+    [
         (1, 16, 16, 16),
         (1, 64, 32, 16),
-    ]
-
+    ],
+)
+def test_depth_to_space(dtype, shape):
+    """Compare Depth To Space output with TVM."""
     np.random.seed(0)
-    for shape in trials:
-        inputs = {
-            "a": tvm.nd.array(
-                np.random.randint(
-                    np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype
-                )
-            )
-        }
-        outputs = []
-        for npu in [False, True]:
-            model = _get_model(shape, 2, dtype, "NHWC")
-            mod = tei.make_module(model, {})
-            outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
 
-        tei.verify(outputs, dtype, 1)
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype)
+        )
+    }
+    outputs = []
+    for npu in [False, True]:
+        model = _get_model(shape, 2, dtype, "NHWC")
+        mod = tei.make_module(model, {})
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
 
+    tei.verify(outputs, dtype, 1)
 
-@requires_ethosn
-def test_depth_to_space_failure():
-    """Check Depth To Space error messages."""
 
-    trials = [
+@requires_ethosn
+@pytest.mark.parametrize(
+    "shape,block,dtype,layout,err_msg",
+    [
         ((2, 16, 16, 16), 2, "uint8", "NHWC", "batch size=2, batch size must = 1"),
         (
             (1, 16, 16, 16),
@@ -74,9 +72,10 @@ def test_depth_to_space_failure():
         ),
         ((1, 16, 16, 16), 4, "uint8", "NHWC", "Only block size of 2 is supported"),
         ((1, 16, 16, 16), 2, "uint8", "NCHW", "Input layer must be NHWC or NHWCB"),
-    ]
-
-    for shape, block, dtype, layout, err_msg in trials:
-        model = _get_model(shape, block, dtype, layout)
-        mod = tei.make_ethosn_partition(model)
-        tei.test_error(mod, {}, err_msg)
+    ],
+)
+def test_depth_to_space_failure(shape, block, dtype, layout, err_msg):
+    """Check Depth To Space error messages."""
+    model = _get_model(shape, block, dtype, layout)
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)
diff --git a/tests/python/contrib/test_ethosn/test_fullyconnected.py b/tests/python/contrib/test_ethosn/test_fullyconnected.py
index d5510bb79d2c..d38b2528c7bb 100644
--- a/tests/python/contrib/test_ethosn/test_fullyconnected.py
+++ b/tests/python/contrib/test_ethosn/test_fullyconnected.py
@@ -114,62 +114,63 @@ def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_z
 
 
 @requires_ethosn
-def test_fullyconnected_failure():
-    """Check Fully Connected error messages."""
-
-    trials = [
-        (
-            (1, 64),
-            (1, 64),
-            0,
-            1024,
-            0,
-            1024,
-            0,
-            1,
-            "uint8",
-            "Overall scale (of the input * weights / output) should be in the range (2^-32, 65536)",
-        ),
+@pytest.mark.parametrize(
+    "shape,weight_shape,err_msg",
+    [
         (
             (1, 1, 1, 64),
             (1, 64),
-            0,
-            1,
-            0,
-            1,
-            0,
-            1,
-            "uint8",
             "Weights tensor must have I dimension equal to the number"
             " of channels of the input tensor.;",
         ),
-        ((1024, 64), (1, 64), 0, 1, 0, 1, 0, 1, "uint8", "batch size=1024, batch size must = 1;"),
-    ]
-
+        ((1024, 64), (1, 64), "batch size=1024, batch size must = 1;"),
+    ],
+)
+def test_fullyconnected_failure(shape, weight_shape, err_msg):
+    """Check Fully Connected error messages."""
     np.random.seed(0)
-    for (
+
+    dtype = "uint8"
+
+    model, _ = _get_model(
         shape,
         weight_shape,
-        input_zp,
+        0,
+        1,
+        0,
+        1,
+        0,
+        1,
+        dtype,
+    )
+    model = tei.make_ethosn_composite(model, "ethos-n.qnn_fc")
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)
+
+
+@requires_ethosn
+def test_fullyconnected_scale_out_of_range():
+    """Check Fully Connected out of range scale error message."""
+    np.random.seed(0)
+
+    input_sc = 1024
+    kernel_sc = 1024
+    output_sc = 1
+
+    model, _ = _get_model(
+        (1, 64),
+        (1, 64),
+        0,
         input_sc,
-        kernel_zp,
+        0,
         kernel_sc,
-        output_zp,
+        0,
         output_sc,
-        dtype,
-        err_msg,
-    ) in trials:
-        model, _ = _get_model(
-            shape,
-            weight_shape,
-            input_zp,
-            input_sc,
-            kernel_zp,
-            kernel_sc,
-            output_zp,
-            output_sc,
-            dtype,
-        )
-        model = tei.make_ethosn_composite(model, "ethos-n.qnn_fc")
-        mod = tei.make_ethosn_partition(model)
-        tei.test_error(mod, {}, err_msg)
+        "uint8",
+    )
+    model = tei.make_ethosn_composite(model, "ethos-n.qnn_fc")
+    mod = tei.make_ethosn_partition(model)
+    expected_error_msg = (
+        "Overall scale (of the input * weights / output) should be in the range (2^-32, 65536)"
+    )
+    tei.test_error(mod, {}, expected_error_msg)
diff --git a/tests/python/contrib/test_ethosn/test_pooling.py b/tests/python/contrib/test_ethosn/test_pooling.py
index e1c7358f71a1..1e0487d76778 100644
--- a/tests/python/contrib/test_ethosn/test_pooling.py
+++ b/tests/python/contrib/test_ethosn/test_pooling.py
@@ -38,91 +38,88 @@ def _get_model(shape, typef, sizes, strides, pads, layout, dtype):
 
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_pooling(dtype):
+@pytest.mark.parametrize(
+    "shape,typef,size,stride,pad",
+    [
+        ((1, 8, 8, 8), relay.nn.max_pool2d, (2, 2), (2, 2), (0, 0, 0, 0)),
+        ((1, 9, 9, 9), relay.nn.max_pool2d, (3, 3), (2, 2), (0, 0, 0, 0)),
+        ((1, 8, 8, 8), relay.nn.avg_pool2d, (3, 3), (1, 1), (1, 1, 1, 1)),
+    ],
+)
+def test_pooling(dtype, shape, typef, size, stride, pad):
     """Compare Pooling output with TVM."""
+    np.random.seed(0)
 
-    trials = [
-        ((1, 8, 8, 8), relay.nn.max_pool2d, (2, 2), (2, 2), (0, 0, 0, 0), "NHWC"),
-        ((1, 9, 9, 9), relay.nn.max_pool2d, (3, 3), (2, 2), (0, 0, 0, 0), "NHWC"),
-        ((1, 8, 8, 8), relay.nn.avg_pool2d, (3, 3), (1, 1), (1, 1, 1, 1), "NHWC"),
-    ]
+    layout = "NHWC"
 
-    np.random.seed(0)
-    for shape, typef, size, stride, pad, layout in trials:
-        inputs = {
-            "a": tvm.nd.array(
-                np.random.randint(
-                    low=np.iinfo(dtype).min, high=np.iinfo(dtype).max + 1, size=shape, dtype=dtype
-                )
-            ),
-        }
-        outputs = []
-        model = _get_model(shape, typef, size, stride, pad, layout, dtype)
-        for npu in [False, True]:
-            mod = tei.make_module(model, {})
-            outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(
+                low=np.iinfo(dtype).min, high=np.iinfo(dtype).max + 1, size=shape, dtype=dtype
+            )
+        ),
+    }
+    outputs = []
+    model = _get_model(shape, typef, size, stride, pad, layout, dtype)
+    for npu in [False, True]:
+        mod = tei.make_module(model, {})
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
 
-        tei.verify(outputs, dtype, 1)
+    tei.verify(outputs, dtype, 1)
 
 
 @requires_ethosn
-def test_pooling_failure():
-    """Check Pooling error messages."""
-
-    trials = [
+@pytest.mark.parametrize(
+    "shape,size,stride,layout,dtype,err_msg",
+    [
         (
             (2, 8, 8, 8),
-            relay.nn.max_pool2d,
             (2, 2),
             (2, 2),
-            (0, 0, 0, 0),
             "NHWC",
             "uint8",
             "batch size=2, batch size must = 1",
         ),
         (
             (1, 8, 8, 8),
-            relay.nn.max_pool2d,
             (2, 2),
             (2, 2),
-            (0, 0, 0, 0),
             "NHWC",
             "int16",
             "dtype='int16', dtype must be either uint8, int8 or int32",
         ),
         (
             (1, 8, 8, 8),
-            relay.nn.max_pool2d,
             (2, 2),
             (2, 2),
-            (0, 0, 0, 0),
             "NCHW",
             "uint8",
             "data format=NCHW, data format must = NHWC",
         ),
         (
             (1, 8, 8, 8),
-            relay.nn.max_pool2d,
             (2, 2),
             (2, 2, 2),
-            (0, 0, 0, 0),
             "NHWC",
             "uint8",
             "stride size=3, stride size must = 2",
         ),
         (
             (1, 8, 8, 8),
-            relay.nn.max_pool2d,
             (2, 2, 2),
             (2, 2),
-            (0, 0, 0, 0),
             "NHWC",
             "uint8",
             "dimensions=3, dimensions must = 2",
         ),
-    ]
+    ],
+)
+def test_pooling_failure(shape, size, stride, layout, dtype, err_msg):
+    """Check Pooling error messages."""
+
+    typef = relay.nn.max_pool2d
+    pad = (0, 0, 0, 0)
 
-    for shape, typef, size, stride, pad, layout, dtype, err_msg in trials:
-        model = _get_model(shape, typef, size, stride, pad, layout, dtype)
-        mod = tei.make_ethosn_partition(model)
-        tei.test_error(mod, {}, err_msg)
+    model = _get_model(shape, typef, size, stride, pad, layout, dtype)
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)
diff --git a/tests/python/contrib/test_ethosn/test_relu.py b/tests/python/contrib/test_ethosn/test_relu.py
index f56a1cd7ad3c..db1894931dd9 100644
--- a/tests/python/contrib/test_ethosn/test_relu.py
+++ b/tests/python/contrib/test_ethosn/test_relu.py
@@ -33,53 +33,50 @@ def _get_model(shape, dtype, a_min, a_max):
 
 
 @requires_ethosn
-@pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_relu(dtype):
-    """Compare Relu output with TVM."""
-
-    trials = [
+@pytest.mark.parametrize(
+    "shape,a_min,a_max,dtype",
+    [
         ((1, 4, 4, 4), 65, 178, "uint8"),
         ((1, 8, 4, 2), 1, 254, "uint8"),
-        ((1, 16), 12, 76, "uint8"),
-        ((1, 4, 4, 4), 65, 125, "int8"),
         ((1, 8, 4, 2), -100, 100, "int8"),
         ((1, 16), -120, -20, "int8"),
-    ]
-
+    ],
+)
+def test_relu(dtype, shape, a_min, a_max):
+    """Compare Relu output with TVM."""
     np.random.seed(0)
-    for shape, a_min, a_max, trial_dtype in trials:
-        if trial_dtype == dtype:
-            inputs = {
-                "a": tvm.nd.array(
-                    np.random.randint(
-                        low=np.iinfo(dtype).min,
-                        high=np.iinfo(dtype).max + 1,
-                        size=shape,
-                        dtype=dtype,
-                    )
-                ),
-            }
-            outputs = []
-            for npu in [False, True]:
-                model = _get_model(inputs["a"].shape, dtype, a_min, a_max)
-                mod = tei.make_module(model, {})
-                outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
 
-            tei.verify(outputs, dtype, 1)
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(
+                low=np.iinfo(dtype).min,
+                high=np.iinfo(dtype).max + 1,
+                size=shape,
+                dtype=dtype,
+            )
+        ),
+    }
+    outputs = []
+    for npu in [False, True]:
+        model = _get_model(inputs["a"].shape, dtype, a_min, a_max)
+        mod = tei.make_module(model, {})
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
 
+    tei.verify(outputs, dtype, 1)
 
-@requires_ethosn
-def test_relu_failure():
-    """Check Relu error messages."""
 
-    trials = [
+@requires_ethosn
+@pytest.mark.parametrize(
+    "shape,dtype,a_min,a_max,err_msg",
+    [
         ((1, 4, 4, 4, 4), "uint8", 65, 78, "dimensions=5, dimensions must be <= 4"),
         ((1, 8, 4, 2), "int16", 1, 254, "dtype='int16', dtype must be either uint8, int8 or int32"),
         ((1, 8, 4, 2), "uint8", 254, 1, "Relu has lower bound > upper bound"),
         ((2, 2, 2, 2), "uint8", 1, 63, "batch size=2, batch size must = 1; "),
-    ]
-
-    for shape, dtype, a_min, a_max, err_msg in trials:
-        model = _get_model(shape, dtype, a_min, a_max)
-        mod = tei.make_ethosn_partition(model)
-        tei.test_error(mod, {}, err_msg)
+    ],
+)
+def test_relu_failure(shape, dtype, a_min, a_max, err_msg):
+    """Check Relu error messages."""
+    model = _get_model(shape, dtype, a_min, a_max)
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)
diff --git a/tests/python/contrib/test_ethosn/test_resize.py b/tests/python/contrib/test_ethosn/test_resize.py
index 2cc641e63b5c..b437ad1e545c 100644
--- a/tests/python/contrib/test_ethosn/test_resize.py
+++ b/tests/python/contrib/test_ethosn/test_resize.py
@@ -97,10 +97,9 @@ def test_resize(dtype, shape, size, coordinate_transformation_mode, rounding_met
 
 
 @requires_ethosn
-def test_resize_failure():
-    """Check Resize error messages."""
-
-    trials = [
+@pytest.mark.parametrize(
+    "size,err_msg",
+    [
         (
             (30, 20),
             "Requested height isn't supported",
@@ -117,22 +116,25 @@ def test_resize_failure():
             (20, 19),
             "Requested width and height must be both even or both odd",
         ),
-    ]
+    ],
+)
+def test_resize_failure(size, err_msg):
+    """Check Resize error messages."""
+
     dtype = "int8"
     zp_min = np.iinfo(dtype).min
 
-    for size, err_msg in trials:
-        model = _get_model(
-            shape=(1, 10, 10, 1),
-            dtype=dtype,
-            size=size,
-            input_zp=zp_min + 128,
-            input_sc=0.0784314,
-            output_zp=zp_min + 128,
-            output_sc=0.0784314,
-            coordinate_transformation_mode="half_pixel",
-            rounding_method="round_prefer_ceil",
-        )
-        model = tei.make_ethosn_composite(model, "ethos-n.qnn_resize")
-        mod = tei.make_ethosn_partition(model)
-        tei.test_error(mod, {}, err_msg)
+    model = _get_model(
+        shape=(1, 10, 10, 1),
+        dtype=dtype,
+        size=size,
+        input_zp=zp_min + 128,
+        input_sc=0.0784314,
+        output_zp=zp_min + 128,
+        output_sc=0.0784314,
+        coordinate_transformation_mode="half_pixel",
+        rounding_method="round_prefer_ceil",
+    )
+    model = tei.make_ethosn_composite(model, "ethos-n.qnn_resize")
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)
diff --git a/tests/python/contrib/test_ethosn/test_sigmoid.py b/tests/python/contrib/test_ethosn/test_sigmoid.py
index ae8c301ff01a..bddd16049144 100644
--- a/tests/python/contrib/test_ethosn/test_sigmoid.py
+++ b/tests/python/contrib/test_ethosn/test_sigmoid.py
@@ -44,59 +44,59 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype):
 
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_sigmoid(dtype):
-    """Compare Sigmoid output with TVM."""
-
-    trials = [
+@pytest.mark.parametrize(
+    "shape",
+    [
         (1, 16, 16, 16),
         (1, 8, 8),
-    ]
-
+    ],
+)
+def test_sigmoid(dtype, shape):
+    """Compare Sigmoid output with TVM."""
     np.random.seed(0)
-    for shape in trials:
-        inputs = {
-            "a": tvm.nd.array(
-                np.random.randint(
-                    np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype
-                )
-            ),
-        }
-        outputs = []
-        for npu in [False, True]:
-            for _ in range(1, 2):
-                if dtype == "uint8":
-                    input_zp = 0
-                    output_zp = 0
-                else:
-                    input_zp = 127
-                    output_zp = -128
-                model = _get_model(shape, input_zp, 0.02, output_zp, 1.0 / 256.0, dtype)
-                mod = tei.make_module(model, [])
-                outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
 
-        tei.verify(outputs, dtype, 1)
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype)
+        ),
+    }
+    outputs = []
+    for npu in [False, True]:
+        for _ in range(1, 2):
+            if dtype == "uint8":
+                input_zp = 0
+                output_zp = 0
+            else:
+                input_zp = 127
+                output_zp = -128
+            model = _get_model(shape, input_zp, 0.02, output_zp, 1.0 / 256.0, dtype)
+            mod = tei.make_module(model, [])
+            outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
 
+    tei.verify(outputs, dtype, 1)
 
-@requires_ethosn
-@pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_sigmoid_failure(dtype):
-    """Check Sigmoid error messages."""
 
-    test_zp = 0 if dtype == "uint8" else -128
-    trials = [
-        ((2, 4, 4, 4), 64, 0.2, test_zp, 1 / 256, "batch size=2, batch size must = 1"),
+@requires_ethosn
+@pytest.mark.parametrize(
+    "shape,input_zp,input_sc,output_zp,output_sc,err_msg",
+    [
+        ((2, 4, 4, 4), 64, 0.2, 0, 1 / 256, "batch size=2, batch size must = 1"),
         (
             (1, 4, 4, 4),
             64,
             0.2,
             3,
             1,
-            f"output quantization params=(3, 1), must = ({test_zp}, 1/256)",
+            "output quantization params=(3, 1), must = (0, 1/256)",
         ),
-    ]
+    ],
+)
+def test_sigmoid_failure(shape, input_zp, input_sc, output_zp, output_sc, err_msg):
+    """Check Sigmoid error messages."""
+
+    dtype = "uint8"
 
-    for shape, input_zp, input_sc, output_zp, output_sc, err_msg in trials:
-        model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype)
-        model = tei.make_ethosn_composite(model, "ethos-n.qnn_sigmoid")
-        mod = tei.make_ethosn_partition(model)
-        tei.test_error(mod, {}, err_msg)
+    model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype)
+    model = tei.make_ethosn_composite(model, "ethos-n.qnn_sigmoid")
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)
diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py
index 7f8787afe947..afbc45a0805d 100644
--- a/tests/python/contrib/test_ethosn/test_split.py
+++ b/tests/python/contrib/test_ethosn/test_split.py
@@ -36,39 +36,37 @@ def _get_model(shape, dtype, splits, axis):
 @pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.")
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_split(dtype):
-    """Compare Split output with TVM."""
-
-    trials = [
+@pytest.mark.parametrize(
+    "shape,splits,axis",
+    [
         ((1, 16, 16, 32), (2, 7, 10), 2),
         ((1, 12, 8, 16), 3, 1),
-    ]
-
+    ],
+)
+def test_split(dtype, shape, splits, axis):
+    """Compare Split output with TVM."""
     np.random.seed(0)
-    for shape, splits, axis in trials:
-        outputs = []
-        inputs = {
-            "a": tvm.nd.array(
-                np.random.randint(
-                    np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype
-                )
-            )
-        }
-        for npu in [False, True]:
-            model = _get_model(shape, dtype, splits, axis)
-            mod = tei.make_module(model, {})
-            output_count = splits if isinstance(splits, int) else len(splits) + 1
-            outputs.append(tei.build_and_run(mod, inputs, output_count, {}, npu=npu))
+
+    outputs = []
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype)
+        )
+    }
+    for npu in [False, True]:
+        model = _get_model(shape, dtype, splits, axis)
+        mod = tei.make_module(model, {})
+        output_count = splits if isinstance(splits, int) else len(splits) + 1
+        outputs.append(tei.build_and_run(mod, inputs, output_count, {}, npu=npu))
 
         tei.verify(outputs, dtype, 0)
 
 
 @pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.")
 @requires_ethosn
-def test_split_failure():
-    """Check Split error messages."""
-
-    trials = [
+@pytest.mark.parametrize(
+    "shape,dtype,splits,axis,err_msg",
+    [
         ((1, 4, 4, 4, 4), "uint8", 4, 2, "dimensions=5, dimensions must be <= 4;"),
         ((1, 4, 4, 4), "int16", 4, 2, "dtype='int16', dtype must be either uint8, int8 or int32;"),
         ((2, 4, 4, 4), "uint8", 4, 2, "batch size=2, batch size must = 1;"),
@@ -81,9 +79,10 @@ def test_split_failure():
             "Split along the channels dimension (axis 3) requires all output sizes "
             "(specified in splitInfo.m_Sizes) to be multiples of 16;",
         ),
-    ]
-
-    for shape, dtype, splits, axis, err_msg in trials:
-        model = _get_model(shape, dtype, splits, axis)
-        mod = tei.make_ethosn_partition(model)
-        tei.test_error(mod, {}, err_msg)
+    ],
+)
+def test_split_failure(shape, dtype, splits, axis, err_msg):
+    """Check Split error messages."""
+    model = _get_model(shape, dtype, splits, axis)
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)
diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py
index 19d7accadb6d..dc6a2ed086d4 100644
--- a/tests/python/contrib/test_ethosn/test_topologies.py
+++ b/tests/python/contrib/test_ethosn/test_topologies.py
@@ -237,8 +237,15 @@ def get_model():
 
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
-def test_split_with_asym_concats(dtype):
+@pytest.mark.parametrize(
+    "shape,splits,axis",
+    [
+        ((1, 16, 16, 32), (2, 7, 10), 2),
+    ],
+)
+def test_split_with_asym_concats(dtype, shape, splits, axis):
     """Test a model with split and contatenates."""
+    np.random.seed(0)
 
     def get_model(shape, dtype, splits, axis):
         a = relay.var("a", shape=shape, dtype=dtype)
@@ -263,51 +270,43 @@ def get_model(shape, dtype, splits, axis):
         )
         return relay.Tuple((con2, con1))
 
-    trials = [
-        ((1, 16, 16, 32), (2, 7, 10), 2),
-    ]
-
-    np.random.seed(0)
-    for shape, splits, axis in trials:
-        outputs = []
-        inputs = {
-            "a": tvm.nd.array(
-                np.random.randint(
-                    np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype
-                )
-            )
-        }
-        for npu in [False, True]:
-            model = get_model(shape, dtype, splits, axis)
-            mod = tei.make_module(model, {})
+    outputs = []
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype)
+        )
+    }
+    for npu in [False, True]:
+        model = get_model(shape, dtype, splits, axis)
+        mod = tei.make_module(model, {})
 
-            expected_host_ops = 1
-            npu_partitions = 2
+        expected_host_ops = 1
+        npu_partitions = 2
 
-            # Mock inference is only supported when the whole graph is offloaded to the NPU
-            if ethosn_available() == Available.SW_ONLY:
-                tei.build(
+        # Mock inference is only supported when the whole graph is offloaded to the NPU
+        if ethosn_available() == Available.SW_ONLY:
+            tei.build(
+                mod,
+                {},
+                npu=npu,
+                expected_host_ops=expected_host_ops,
+                npu_partitions=npu_partitions,
+            )
+        else:
+            outputs.append(
+                tei.build_and_run(
                     mod,
+                    inputs,
+                    2,
                     {},
                     npu=npu,
                     expected_host_ops=expected_host_ops,
                     npu_partitions=npu_partitions,
                 )
-            else:
-                outputs.append(
-                    tei.build_and_run(
-                        mod,
-                        inputs,
-                        2,
-                        {},
-                        npu=npu,
-                        expected_host_ops=expected_host_ops,
-                        npu_partitions=npu_partitions,
-                    )
-                )
+            )
 
-        if outputs:
-            tei.verify(outputs, dtype, 0)
+    if outputs:
+        tei.verify(outputs, dtype, 0)
 
 
 @pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.")

From b3edb6e227be0dea73413d5780d15a4cbdc3d83b Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Tue, 6 Sep 2022 14:12:14 +0100
Subject: [PATCH 106/704] [Apps] Pin android_camera TensorFlow/Keras dependency
 version (#12710)

At the moment, android camera is installing latest TF and Keras
which is causing the following issue in CI:

```
  File ".../keras/dtensor/lazy_variable.py", line 26, in <module>
    from tensorflow.python.trackable import base as trackable
ModuleNotFoundError: No module named 'tensorflow.python.trackable'
```

This patch fixes the versions in the last known working versions
of both: TF 2.9.1 and Keras 2.9.
---
 apps/android_camera/models/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/android_camera/models/requirements.txt b/apps/android_camera/models/requirements.txt
index 98aa53def46f..1deff2b3548b 100644
--- a/apps/android_camera/models/requirements.txt
+++ b/apps/android_camera/models/requirements.txt
@@ -1,4 +1,4 @@
-keras
+keras==2.9
 mxnet
 scipy
-tensorflow
\ No newline at end of file
+tensorflow==2.9.1
\ No newline at end of file

From 832cffa1c1729c88c799e81c3340a80fb4a48baa Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Tue, 6 Sep 2022 11:06:03 -0400
Subject: [PATCH 107/704] [Hexagon][Runtime] Better support for 2-tier memory
 (#12574)

- Introduce 'global.ddr' memory scope:
  - Like 'global', this allocates memory from the Hexagon SoC's
    DDR memory.
  - Like 'global.vtcm', the specified tensor shape must be 1d
    or 2d, where 2d indicates Hexagon's "indirect tensor"
    (i.e., discontiguous) allocation scheme.

- Change memory-alignment strategy to always be 2048-byte aligned
  on Hexagon.  (This can be refined in the future, but for now it
  ensures all allocations meet the strictest alignment requirements
  for any Hexagon operations.)
---
 src/runtime/hexagon/hexagon_buffer.cc         | 17 ++--
 src/runtime/hexagon/hexagon_device_api.cc     | 47 +++++++---
 .../contrib/test_hexagon/test_memory_alloc.py | 85 +++++++++++++++++++
 3 files changed, 126 insertions(+), 23 deletions(-)
 create mode 100644 tests/python/contrib/test_hexagon/test_memory_alloc.py

diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc
index 0fc71d8ac29c..f23317fd01ed 100644
--- a/src/runtime/hexagon/hexagon_buffer.cc
+++ b/src/runtime/hexagon/hexagon_buffer.cc
@@ -161,17 +161,16 @@ void* HexagonBuffer::GetPointer() {
 HexagonBuffer::StorageScope HexagonBuffer::GetStorageScope() const { return storage_scope_; }
 
 void HexagonBuffer::SetStorageScope(Optional<String> scope) {
-  if (!scope.defined()) {
+  const std::string s = scope.value_or("global");
+
+  if (s == "global") {
+    storage_scope_ = StorageScope::kDDR;
+  } else if (s == "global.ddr") {
     storage_scope_ = StorageScope::kDDR;
+  } else if (s == "global.vtcm") {
+    storage_scope_ = StorageScope::kVTCM;
   } else {
-    if (scope.value() == "global") {
-      storage_scope_ = StorageScope::kDDR;
-    } else if (scope.value() == "global.vtcm") {
-      storage_scope_ = StorageScope::kVTCM;
-    } else {
-      CHECK(false) << "Encountered unknown HexagonBuffer storage scope: "
-                   << std::string(scope.value());
-    }
+    CHECK(false) << "Encountered unknown HexagonBuffer storage scope: " << std::string(s);
   }
 }
 
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index f22afca10bfa..cf384ae88db7 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -57,34 +57,53 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap
   CHECK(shape) << "shape array is null";
   CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type;
 
+  // IMPORTANT NOTE!
+  // Hexagon treats "global" memory scope VERY DIFFERENTLY from all the others.
+  //
+  // With "global":
+  //    - As with "global.ddr", this uses the target device's DDR memory.
+  //    - The memory allocation must be a single, contiguous region of
+  //      (virtual) memory addresses.
+  //    - 'ndim' and 'shape' give the dimensions of the tensor to be stored
+  //      in this allocation.  There's no (practical) limit on the maximum
+  //      rank (ndim) of the tensor.
+  //
+  // All other supported memory-scope names:
+  //   - 'ndim' must be exactly 1 or 2:
+  //      1: A single, contiguous region of memory is requested.
+  //      2: A two-level memory allocation is required, suitable for storing a tensor
+  //         in Hexagon's "indirect tensor" format:
+  //         - shape[0] indicates the number of tensor-content memory allocations.
+  //         - shape[1] indicates the size of each tensor-content memory allocation.
   if (!mem_scope.defined() || mem_scope.value() == "global") {
     return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
   }
 
-  // must be Hexagon device and VTCM scope after this point
-  CHECK_EQ(mem_scope.value(), "global.vtcm");
-  CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type;
+  // NOTE: This check should be superfluous, but it's probably a good idea to leave it in
+  // until the AoT executor's multi-device dispatch code is mature. --cconvey 2022-08-26
+  CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon)
+      << "dev.device_type: " << dev.device_type << " DeviceName(" << dev.device_type
+      << "): " << DeviceName(dev.device_type) << "";
 
-  size_t typesize = (dtype.bits / 8) * dtype.lanes;
+  CHECK(ndim >= 0 && ndim <= 2)
+      << "Hexagon Device API supports only 1d and 2d allocations, but received ndim = " << ndim;
 
-  size_t alignment = shape[ndim - 1] * typesize;
-  if (alignment < kHexagonAllocAlignment) {
-    alignment = kHexagonAllocAlignment;
-  }
+  const size_t typesize = (dtype.bits / 8) * dtype.lanes;
 
   if (ndim == 0) {
-    return hexbuffs.AllocateHexagonBuffer(typesize, alignment, mem_scope);
+    // Allocate storage for a single scalar value.
+    return hexbuffs.AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
   } else if (ndim == 1) {
+    // Allocate a single, contiguous memory region.
     size_t nbytes = shape[0] * typesize;
-    return hexbuffs.AllocateHexagonBuffer(nbytes, alignment, mem_scope);
+    return hexbuffs.AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
   } else if (ndim == 2) {
+    // Allocate the region(s) needed for Hexagon's indirect-tensor format.
     size_t nallocs = shape[0];
     size_t nbytes = shape[1] * typesize;
-    return hexbuffs.AllocateHexagonBuffer(nallocs, nbytes, alignment, mem_scope);
+    return hexbuffs.AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope);
   } else {
-    LOG(FATAL) << "Hexagon Device API supports only 1d and 2d allocations, but received ndim = "
-               << ndim;
-    return nullptr;
+    return nullptr;  // unreachable
   }
 }
 
diff --git a/tests/python/contrib/test_hexagon/test_memory_alloc.py b/tests/python/contrib/test_hexagon/test_memory_alloc.py
new file mode 100644
index 000000000000..fd948ea524f2
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_memory_alloc.py
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import os.path
+import sys
+import tempfile
+
+import numpy as np
+import pytest
+
+import tvm
+from tvm.script import tir as T
+
+from .infrastructure import allocate_hexagon_array
+
+_HEXAGON_TARGET = tvm.target.hexagon("v69", link_params=True)
+
+
+@tvm.testing.fixture
+def generated_func(shape, scope, dtype, axis_separators):
+    dim0, dim1 = shape
+
+    @T.prim_func
+    def elwise(a: T.handle, b: T.handle):
+        A = T.match_buffer(a, shape, dtype=dtype, axis_separators=axis_separators)
+        B = T.match_buffer(b, shape, dtype=dtype, axis_separators=axis_separators)
+
+        for i, j in T.grid(dim0, dim1):
+            with T.block("compute"):
+                B[i, j] = A[i, j] * T.cast(2, dtype=dtype)
+
+    return elwise
+
+
+class TestMemoryAlloc:
+    dtype = tvm.testing.parameter("int8")
+    shape = tvm.testing.parameter((128, 128))
+
+    (scope, axis_separators,) = tvm.testing.parameters(
+        ("global", []),
+        ("global.vtcm", []),
+        ("global.vtcm", [1]),
+        ("global.ddr", []),
+        ("global.ddr", [1]),
+    )
+
+    def test_global_axis_separator(
+        self, hexagon_session, generated_func, shape, dtype, scope, axis_separators
+    ):
+        mod1 = tvm.build(
+            generated_func, target=tvm.target.Target(_HEXAGON_TARGET, host=_HEXAGON_TARGET)
+        )
+        mod2 = hexagon_session.load_module(mod1)
+
+        a_np = np.ones(shape=shape, dtype=dtype)
+        a = allocate_hexagon_array(
+            hexagon_session.device, data=a_np, mem_scope=scope, axis_separators=axis_separators
+        )
+
+        b_np = np.zeros(shape=shape, dtype=dtype)
+        b = allocate_hexagon_array(
+            hexagon_session.device, data=b_np, mem_scope=scope, axis_separators=axis_separators
+        )
+
+        mod2(a, b)
+        tvm.testing.assert_allclose(a.numpy() * 2, b.numpy(), atol=1e-4, rtol=1e-4)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 744649e53bd32b53eb53020a111479facff3b88a Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Tue, 6 Sep 2022 10:31:39 -0700
Subject: [PATCH 108/704] [TIR][StorageRewrite] Allow in-place buffer reuse of
 non-flat memory (#12655)

* [TIR][StorageRewrite] Allow in-place buffer reuse of non-flat memory

Previously, shared buffer use was entirely disabled for non-flat
memory, since the existing checks for shared memory assume flat 1-d
spaces.  This was enforced in `FindAlloc` and validated in
`PrepareNewAlloc`.  The validation in `PrepareNewAlloc` could trigger,
if the buffer sharing was due to an in-place operation, and not
through the `FindAlloc` function.

In-place operations do not require N-d packing, nor do they introduce
ambiguity in how different code generators may interpret non-flat
physical indices.  Therefore, this commit relaxes the validation in
`PrepareNewAlloc`, allowing buffer reuse of non-flat buffers for
in-place operations.

* Update new StorageRewrite with correct allocate/buffer_decl usage
---
 src/tir/transforms/storage_rewrite.cc         |  20 ++-
 .../test_tir_transform_storage_rewrite.py     | 116 +++++++++++++++++-
 2 files changed, 132 insertions(+), 4 deletions(-)

diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc
index 177017f9a245..67972ce67282 100644
--- a/src/tir/transforms/storage_rewrite.cc
+++ b/src/tir/transforms/storage_rewrite.cc
@@ -655,7 +655,25 @@ class StoragePlanRewriter : public StmtExprMutator {
           }
         }
 
-        if (e->allocs.size() == 1) {
+        bool all_allocs_identical = std::all_of(
+            e->allocs.begin() + 1, e->allocs.end(), [&](const AllocateNode* op) -> bool {
+              const AllocateNode* first = *e->allocs.begin();
+              if (op->dtype != first->dtype) {
+                return false;
+              }
+              if (op->extents.size() != first->extents.size()) {
+                return false;
+              }
+              ExprDeepEqual expr_equal;
+              for (size_t i = 0; i < op->extents.size(); i++) {
+                if (!expr_equal(op->extents[i], first->extents[i])) {
+                  return false;
+                }
+              }
+              return true;
+            });
+
+        if (all_allocs_identical) {
           // simply use the original allocation.
           e->new_alloc = Allocate(e->alloc_var, alloc_type, e->allocs[0]->extents,
                                   e->allocs[0]->condition, Evaluate(0));
diff --git a/tests/python/unittest/test_tir_transform_storage_rewrite.py b/tests/python/unittest/test_tir_transform_storage_rewrite.py
index 581afef88942..533a835e0f9c 100644
--- a/tests/python/unittest/test_tir_transform_storage_rewrite.py
+++ b/tests/python/unittest/test_tir_transform_storage_rewrite.py
@@ -673,7 +673,11 @@ def func_rewritten(A: T.Buffer[(8,), "float32"]) -> None:
     tvm.ir.assert_structural_equal(mod["main"], func_rewritten)
 
 
-class TestLetBufferRewrite(tvm.testing.CompareBeforeAfter):
+class BaseCompare(tvm.testing.CompareBeforeAfter):
+    transform = tvm.tir.transform.StorageRewrite()
+
+
+class TestLetBufferRewrite(BaseCompare):
     """StorageRewrite replaces the bound var of backing allocations
 
     If StorageRewrite replaces the backing variable of an array, such
@@ -684,8 +688,6 @@ class TestLetBufferRewrite(tvm.testing.CompareBeforeAfter):
     handled.
     """
 
-    transform = tvm.tir.transform.StorageRewrite()
-
     def before() -> None:
         A_data: T.Ptr[T.int32] = T.call_extern("dummy_func", dtype="handle")
         A = T.buffer_decl([8], "int32", data=A_data)
@@ -697,5 +699,113 @@ def expected() -> None:
         A[0] = T.broadcast(42, 8)
 
 
+class TestRewriteInPlaceUseOfNonFlatBuffer(BaseCompare):
+    """A non-flat buffer may be re-used for in-place operations"""
+
+    def before(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]):
+        B_data = T.allocate(
+            [16, 16],
+            dtype="float32",
+            scope="global",
+        )
+        B = T.buffer_decl(
+            [16, 16],
+            dtype="float32",
+            axis_separators=[1],
+            data=B_data,
+        )
+        C_data = T.allocate(
+            [16, 16],
+            dtype="float32",
+            scope="global",
+        )
+        C = T.buffer_decl(
+            [16, 16],
+            dtype="float32",
+            axis_separators=[1],
+            data=C_data,
+        )
+
+        for i, j in T.grid(16, 16):
+            B[i, j] = A[i, j]
+
+        for i, j in T.grid(16, 16):
+            C[i, j] = 2.0 * B[i, j]
+
+        for i, j in T.grid(16, 16):
+            D[i, j] = C[i, j]
+
+    def expected(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]):
+        B_data = T.allocate(
+            [16, 16],
+            dtype="float32",
+            scope="global",
+        )
+        B = T.buffer_decl([16, 16], dtype="float32", axis_separators=[1], data=B_data)
+        C = T.buffer_decl(
+            [16, 16],
+            dtype="float32",
+            axis_separators=[1],
+            data=B.data,
+        )
+
+        for i, j in T.grid(16, 16):
+            B[i, j] = A[i, j]
+
+        for i, j in T.grid(16, 16):
+            C[i, j] = 2.0 * B[i, j]
+
+        for i, j in T.grid(16, 16):
+            D[i, j] = C[i, j]
+
+
+class TestNoRewriteOfSharedNonFlatBuffer(BaseCompare):
+    """In general, sharing of non-flat buffer isn't supported
+
+    The current packing algorithms in StorageRewrite assume a flat
+    memory space, and do not support packing of N-d buffers.  For
+    buffers with axis separators, normal buffer sharing should be
+    disabled.
+
+    Like TestRewriteInPlaceUseOfNonFlatBuffer, except that B and C do
+    not have matching shapes.
+    """
+
+    def before(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]):
+        B_data = T.allocate(
+            [16, 16],
+            dtype="float32",
+            scope="global",
+        )
+        B = T.buffer_decl(
+            [16, 16],
+            dtype="float32",
+            axis_separators=[1],
+            data=B_data,
+        )
+        C_data = T.allocate(
+            [20, 20],
+            dtype="float32",
+            scope="global",
+        )
+        C = T.buffer_decl(
+            [20, 20],
+            dtype="float32",
+            axis_separators=[1],
+            data=C_data,
+        )
+
+        for i, j in T.grid(16, 16):
+            B[i, j] = A[i, j]
+
+        for i, j in T.grid(16, 16):
+            C[i, j] = 2.0 * B[i, j]
+
+        for i, j in T.grid(16, 16):
+            D[i, j] = C[i, j]
+
+    expected = before
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From d4201a9d8e56a391231cb71bf80d82ab36a9dfaf Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Tue, 6 Sep 2022 10:33:58 -0700
Subject: [PATCH 109/704] [COMMUNITY] ekalda -> Committer (#12715)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 01cf7058a069..2231fac66596 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -46,6 +46,7 @@ We do encourage everyone to work anything they are interested in.
 - [Chenfan Jia](https://github.com/jcf94): @jcf94 - auto_scheduler
 - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
 - [Manupa Karunaratne](https://github.com/manupa-arm): @manupa-arm - ethos-u, memory planner
+- [Elen Kalda](https://github.com/ekalda): @ekalda - ethos-u, arm
 - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay
 - [Tristan Konolige](https://github.com/tkonolige): @tkonolige - profiling, relay, tir, runtime
 - [Ruihang Lai](https://github.com/MasterJH5574): @MasterJH5574 - tir, tvm-script

From 141b17b23a801799576bab02b0654d062e071380 Mon Sep 17 00:00:00 2001
From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com>
Date: Tue, 6 Sep 2022 22:45:53 +0300
Subject: [PATCH 110/704] [Hexagon] Add optimized schedule for nn.pad (#12714)

Motivation:
In case of quantized models nn.pad operation typically is not fused with QNN ops
and lives as a standalone operation. In this case it uses default injective
schedule for Hexagon target and it is not optimized very well (based on
analysis of real models like ResNet50 INT8).

What was done:
New schedule for Pad operation was implemented instead of default injective schedule.
For Hexagon target injective schedule does fusion of all axis and vectorization
on 128/64/32 (depends on dtype). It works fine for Add, Sub, etc... but not for Pad.
New optimized schedule does these steps (fusion+vectorization) only if last tensor
dimension is divisible by 128/64/32 (depends on dtype). It was done only for Hexagon,
for other targets (x86, cuda, etc.) there is no changes and it uses default injective
schedule.

Benchmark results on Snapdragon 888:

4d NHWC layout with ((0, 0), (1, 1), (1, 1), (0, 0)) padding, "uint8" dtype:

shape              | default schedule, ms | optimized schedule, ms |      speedup      |
-------------------|----------------------|------------------------|-------------------|
(1, 112, 112, 32)  |         10,03        |           0.2          |    50.1x times    |
(1, 56, 56, 128)   |         0,099        |          0,085         |  ~1x (no speedup) |
---------------------------------------------------------------------------------------|

4d NCHW layout with ((0, 0), (0, 0), (1, 1), (1, 1)) padding, "uint8" dtype:

shape              | default schedule, ms | optimized schedule, ms |      speedup      |
-------------------|----------------------|------------------------|-------------------|
(1, 128, 56, 56)   |         10.96        |          1.38          |    7.9x times     |
(1, 32, 126, 126)  |          1.66        |          1.58          |  ~1x (no speedup) |
(1, 32, 128, 128)  |         13.98        |          2.66          |    5.25x times    |
---------------------------------------------------------------------------------------|

5d NCHWc layout with ((0, 0), (0, 0), (1, 1), (1, 1), (0, 0)) padding, "uint8" dtype:

shape              | default schedule, ms | optimized schedule, ms |      speedup      |
-------------------|----------------------|------------------------|-------------------|
(1, 4, 56, 56, 32) |          6.39        |          0.29          |     22x times     |
(1, 56, 56, 128)   |          0.15        |          0.15          |  ~1x (no speedup) |
---------------------------------------------------------------------------------------|

Summary:
For some input tensors we get up to 50x times speedup, for other performance is the same.
No performance degradations were detected.
---
 python/tvm/relay/op/nn/_nn.py                 |  2 +-
 python/tvm/relay/op/strategy/generic.py       |  8 +++
 python/tvm/relay/op/strategy/hexagon.py       |  7 +++
 python/tvm/topi/hexagon/__init__.py           |  1 +
 python/tvm/topi/hexagon/pad.py                | 51 +++++++++++++++++
 .../contrib/test_hexagon/topi/test_pad.py     | 57 +++++++++++++++++++
 6 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 python/tvm/topi/hexagon/pad.py
 create mode 100644 tests/python/contrib/test_hexagon/topi/test_pad.py

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index ff213f098319..90a94c422992 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -701,7 +701,7 @@ def compute_upsampling3d(attrs, inputs, out_dtype):
 
 
 # pad
-reg.register_broadcast_schedule("nn.pad")
+reg.register_schedule("nn.pad", strategy.schedule_pad)
 
 
 # mirror_pad
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 74abd9281f87..6ab281abeb37 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -205,6 +205,14 @@ def schedule_lrn(attrs, outs, target):
         return topi.generic.schedule_lrn(outs)
 
 
+# pad
+@generic_func
+def schedule_pad(attrs, outs, target):
+    """Schedule PAD op"""
+    with target:
+        return schedule_injective(attrs, outs, target)
+
+
 # bitpack
 @generic_func
 def schedule_bitpack(attrs, outs, target):
diff --git a/python/tvm/relay/op/strategy/hexagon.py b/python/tvm/relay/op/strategy/hexagon.py
index be01ee50fba8..13c808f96b95 100644
--- a/python/tvm/relay/op/strategy/hexagon.py
+++ b/python/tvm/relay/op/strategy/hexagon.py
@@ -168,6 +168,13 @@ def schedule_concatenate_hexagon(attrs, outs, target):
         return topi.hexagon.schedule_injective(outs)
 
 
+@schedule_pad.register("hexagon")
+def schedule_pad_hexagon(attrs, outs, target):
+    """Schedule pad ops for Hexagon"""
+    with target:
+        return topi.hexagon.schedule_pad(outs)
+
+
 @schedule_pool.register("hexagon")
 def schedule_pool_hexagon(attrs, outs, target):
     """Schedule pool ops for Hexagon"""
diff --git a/python/tvm/topi/hexagon/__init__.py b/python/tvm/topi/hexagon/__init__.py
index a3768a6e809e..295152d11631 100644
--- a/python/tvm/topi/hexagon/__init__.py
+++ b/python/tvm/topi/hexagon/__init__.py
@@ -23,6 +23,7 @@
 from .conv2d import *
 from .dense import *
 from .injective import *
+from .pad import *
 from .pooling import *
 from .reduce import *
 from .resize2d import *
diff --git a/python/tvm/topi/hexagon/pad.py b/python/tvm/topi/hexagon/pad.py
new file mode 100644
index 000000000000..c744d47fefa1
--- /dev/null
+++ b/python/tvm/topi/hexagon/pad.py
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Schedule for nn.pad operator"""
+
+import tvm
+
+import numpy as np
+
+
+def schedule_pad(outs):
+    """Schedule for pad op.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of injective in the format
+        of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
+    s = tvm.te.create_schedule([x.op for x in outs])
+    tvm.te.schedule.AutoInlineInjective(s)
+
+    # Fuse axes and vectorize only if last output tensor dimension is divisible by a factor:
+    factor = 128 // np.dtype(outs[0].dtype).itemsize
+    last_dim = outs[0].shape[-1]
+    if last_dim % factor == 0 and last_dim // factor >= 0:
+        fused = s[outs[0]].fuse(*outs[0].op.axis)
+        _, inner = s[outs[0]].split(fused, factor=factor)
+        s[outs[0]].vectorize(inner)
+
+    return s
diff --git a/tests/python/contrib/test_hexagon/topi/test_pad.py b/tests/python/contrib/test_hexagon/topi/test_pad.py
new file mode 100644
index 000000000000..631cb979dcbd
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/test_pad.py
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test code for reduce"""
+import numpy as np
+
+import tvm
+from tvm import te, topi
+from tvm.contrib.hexagon.session import Session
+from tvm.topi.utils import get_const_tuple
+
+
+@tvm.testing.requires_hexagon
+def test_nn_pad(hexagon_session: Session):
+    dtype = "uint8"
+    in_shape = (1, 56, 56, 32)
+
+    data_in = np.ones(in_shape).astype(dtype)
+
+    A = te.placeholder(shape=in_shape, name="A", dtype=dtype)
+
+    C = topi.nn.pad(A, [0, 1, 1, 0], [0, 1, 1, 0], pad_value=0)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    with tvm.target.Target(target_hexagon):
+        fschedule = topi.hexagon.schedule_pad
+        s = fschedule(C)
+
+    func = tvm.build(s, [A, C], tvm.target.Target(target_hexagon, host=target_hexagon), name="pad")
+    mod = hexagon_session.load_module(func)
+
+    dev = hexagon_session.device
+    a = tvm.nd.array(data_in, dev)
+    b = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
+    mod["pad"](a, b)
+
+    # Reference numpy pad output
+    ref_out = np.pad(data_in, pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
+
+    tvm.testing.assert_allclose(b.numpy(), ref_out)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From da48e13b66fa053578815343c3f247f47364d0bb Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Tue, 6 Sep 2022 21:10:36 +0100
Subject: [PATCH 111/704] [TVMC] Run module once by default (#12713)

* [TVMC] Run module once by default

Currently executing `tvmc run module.tar` will run the input model
twice. For benchmaking this is to be expected as the first run is used
to prime caches etc before taking a measurement. However, this seems a
bit unintuitive to have as default, especially when benchmarking is not
always intended. In this sense, this commit aims to amend the
number of runs for the default: `tvmc run module.tar` to a single run.

After inspection, this seems to be down to the use of the `.benchmark()`
method which runs (1 + repeat * number) executions in total. This means
that at least two runs are required (i.e. when repeat=1, number=1). It
also seems that it is only necessary to benchmark the model when
`--print-time` has been set from the CLI POV. From the python interface
POV, benchmarking is always run, but this may not always be necessary.

This commit makes use of the `.run()` method to singularly execute the
model by default. From the CLI this will be used when `--print-time` is
set to False whereas from the python interface this will be used when
`benchmark=False`. Otherwise, the `.benchmark()` method will be used
as before. Complementary to this change `repeat`, `number` and
`end_to_end` parameters are only used when either `--print-time` or
`benchmark` are set to True - and the documentation has been updated to
indicate this.

Change-Id: I18a38a9d430d660264f7fce5caf0779aa059fed3

* improve documentation with number of exectuions when benchmarking

Change-Id: Iecf557594420fcc9f3abcec5ce7d952db2c94271
---
 python/tvm/driver/tvmc/runner.py        | 58 +++++++++++++++++--------
 tests/python/driver/tvmc/conftest.py    | 16 +++++++
 tests/python/driver/tvmc/test_model.py  |  4 +-
 tests/python/driver/tvmc/test_runner.py | 42 ++++++++++++++++++
 4 files changed, 101 insertions(+), 19 deletions(-)

diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py
index afb198ce1c6e..216f3bb2653b 100644
--- a/python/tvm/driver/tvmc/runner.py
+++ b/python/tvm/driver/tvmc/runner.py
@@ -92,7 +92,8 @@ def add_run_parser(subparsers, main_parser, json_params):
     parser.add_argument(
         "--print-time",
         action="store_true",
-        help="record and print the execution time(s). (non-micro devices only)",
+        help="record and print the execution time(s). Enabling print-time will result "
+        " in (1 + repeat * number) executions of the model. (non-micro devices only)",
     )
     parser.add_argument(
         "--print-top",
@@ -112,13 +113,24 @@ def add_run_parser(subparsers, main_parser, json_params):
         "--end-to-end",
         action="store_true",
         help="Measure data transfers as well as model execution. This can provide a "
-        "more realistic performance measurement in many cases.",
+        "more realistic performance measurement in many cases. Requires "
+        "'--print-time' to be specified.",
     )
     parser.add_argument(
-        "--repeat", metavar="N", type=int, default=1, help="run the model n times. Defaults to '1'"
+        "--repeat",
+        metavar="N",
+        type=int,
+        default=1,
+        help="How many times to repeat the run. Requires '--print-time' to be "
+        "specified. Defaults to '1'",
     )
     parser.add_argument(
-        "--number", metavar="N", type=int, default=1, help="repeat the run n times. Defaults to '1'"
+        "--number",
+        metavar="N",
+        type=int,
+        default=1,
+        help="The number of runs to measure within each repeat. Requires "
+        "'--print-time' to be specified. Defaults to '1'",
     )
     parser.add_argument(
         "--rpc-key",
@@ -273,6 +285,7 @@ def drive_run(args):
         rpc_key=args.rpc_key,
         inputs=inputs,
         fill_mode=args.fill_mode,
+        benchmark=args.print_time,
         repeat=args.repeat,
         number=args.number,
         profile=args.profile,
@@ -462,6 +475,7 @@ def run_module(
     rpc_key: Optional[str] = None,
     inputs: Optional[Dict[str, np.ndarray]] = None,
     fill_mode: str = "random",
+    benchmark: bool = False,
     repeat: int = 10,
     number: int = 10,
     profile: bool = False,
@@ -495,23 +509,26 @@ def run_module(
         The fill-mode to use when generating data for input tensors.
         Valid options are "zeros", "ones" and "random".
         Defaults to "random".
+    benchmark : bool, optional
+        Whether to benchmark the execution of the module. Enabling benchmark will
+        result in (1 + repeat * number) executions of the model.
     repeat : int, optional
-        How many times to repeat the run.
+        How many times to repeat the run. Requires `benchmark` to be set to True.
     number : int, optional
         The number of runs to measure within each repeat.
+        Requires `benchmark` to be set to True.
     profile : bool
         Whether to profile the run with the debug executor.
     end_to_end : bool
         Whether to measure the time of memory copies as well as model
         execution. Turning this on can provide a more realistic estimate
         of how long running the model in production would take.
+        Requires `benchmark` to be set to True.
 
     Returns
     -------
-    outputs : dict
-        a dictionary with output tensors, generated by the module
-    times : list of str
-        execution times generated by the time evaluator
+    TVMCResult
+        The results of the run, including the output data.
     """
     if not isinstance(tvmc_package, TVMCPackage):
         raise TVMCException(
@@ -605,14 +622,19 @@ def run_module(
                 exe = vm.VirtualMachine(lib, dev)
 
             exe_outputs = exe.invoke("main", **input_tensor)
-            times = exe.benchmark(
-                dev,
-                **input_tensor,
-                func_name="main",
-                repeat=repeat,
-                number=number,
-                end_to_end=end_to_end,
-            )
+
+            if benchmark:
+                times = exe.benchmark(
+                    dev,
+                    **input_tensor,
+                    func_name="main",
+                    repeat=repeat,
+                    number=number,
+                    end_to_end=end_to_end,
+                )
+            else:
+                exe.run(**input_tensor)
+                times = []
 
             # Special handling if the output only has a single value
             if not isinstance(exe_outputs, list):
@@ -662,7 +684,7 @@ def run_module(
                 # This print is intentional
                 print(report)
 
-            if device == "micro":
+            if not benchmark or device == "micro":
                 # TODO(gromero): Fix time_evaluator() for micro targets. Once it's
                 # fixed module.benchmark() can be used instead and this if/else can
                 # be removed.
diff --git a/tests/python/driver/tvmc/conftest.py b/tests/python/driver/tvmc/conftest.py
index 48b465e507ae..8009448bff77 100644
--- a/tests/python/driver/tvmc/conftest.py
+++ b/tests/python/driver/tvmc/conftest.py
@@ -192,6 +192,22 @@ def model_compiler(model_file, **overrides):
     return model_compiler
 
 
+@pytest.fixture
+def relay_compile_model(tmpdir_factory):
+    """Support function that returns a TFLite compiled module"""
+
+    def model_compiler(model_file, shape_dict, **overrides):
+        package_path = tmpdir_factory.mktemp("data").join("mock.tar")
+        tvmc_model = tvmc.frontends.load_model(
+            model_file, model_format="relay", shape_dict=shape_dict
+        )
+        args = {"target": "llvm", **overrides}
+        return tvmc.compiler.compile_model(tvmc_model, package_path=package_path, **args)
+
+    # Returns a TVMCPackage
+    return model_compiler
+
+
 @pytest.fixture(scope="session")
 def imagenet_cat(tmpdir_factory):
     tmpdir_name = tmpdir_factory.mktemp("data")
diff --git a/tests/python/driver/tvmc/test_model.py b/tests/python/driver/tvmc/test_model.py
index fb1f718c1bed..4d937212e9cc 100644
--- a/tests/python/driver/tvmc/test_model.py
+++ b/tests/python/driver/tvmc/test_model.py
@@ -45,7 +45,9 @@ def test_tvmc_workflow(use_vm, keras_simple):
     )
     input_dict = {"input_1": np.random.uniform(size=(1, 32, 32, 3)).astype("float32")}
 
-    result = tvmc.run(tvmc_package, device="cpu", end_to_end=True, inputs=input_dict)
+    result = tvmc.run(
+        tvmc_package, device="cpu", end_to_end=True, benchmark=True, inputs=input_dict
+    )
     assert type(tvmc_model) is TVMCModel
     assert type(tvmc_package) is TVMCPackage
     assert type(result) is TVMCResult
diff --git a/tests/python/driver/tvmc/test_runner.py b/tests/python/driver/tvmc/test_runner.py
index f0d363dc59ac..5e6386614b1c 100644
--- a/tests/python/driver/tvmc/test_runner.py
+++ b/tests/python/driver/tvmc/test_runner.py
@@ -87,6 +87,7 @@ def test_run_tflite_module__with_profile__valid_input(
     result = tvmc.run(
         tflite_compiled_model,
         inputs=input_dict,
+        benchmark=True,
         hostname=None,
         device="cpu",
         profile=True,
@@ -145,3 +146,44 @@ def test_run_tflite_module_with_rpc(
     ), "tiger cat is expected in the top-5 for mobilenet v1"
     assert isinstance(result.outputs, dict)
     assert "output_0" in result.outputs.keys()
+
+
+@pytest.mark.parametrize("use_vm", [True, False])
+@pytest.mark.parametrize(
+    "benchmark,repeat,number,expected_len", [(False, 1, 1, 0), (True, 1, 1, 1), (True, 3, 2, 3)]
+)
+def test_run_relay_module__benchmarking(
+    use_vm,
+    benchmark,
+    repeat,
+    number,
+    expected_len,
+    relay_text_conv2d,
+    relay_compile_model,
+):
+    """Check the length of the results from benchmarking is what is expected by expected_len."""
+    shape_dict = {"data": (1, 3, 64, 64), "weight": (3, 3, 5, 5)}
+    input_dict = {
+        "data": np.random.randint(low=0, high=10, size=shape_dict["data"], dtype="uint8"),
+        "weight": np.random.randint(low=0, high=10, size=shape_dict["weight"], dtype="int8"),
+    }
+
+    tflite_compiled_model = relay_compile_model(
+        relay_text_conv2d, shape_dict=shape_dict, use_vm=use_vm
+    )
+    result = tvmc.run(
+        tflite_compiled_model,
+        inputs=input_dict,
+        hostname=None,
+        device="cpu",
+        benchmark=benchmark,
+        repeat=repeat,
+        number=number,
+    )
+
+    # When no benchmarking is used, an empty list is used to
+    # represent an absence of results.
+    if isinstance(result.times, list):
+        assert len(result.times) == expected_len
+    else:
+        assert len(result.times.results) == expected_len

From 85bf80c822ec930939eabba1dd8a774c88d88bdd Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@users.noreply.github.com>
Date: Wed, 7 Sep 2022 04:00:24 -0300
Subject: [PATCH 112/704] [Docs] Add Commit Message Guideline (#12689)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the Commit Message Guideline text to Apache TVM
documentation in ./docs/contribute/pull_request.rst, under section
'Submit a Pull Request', below subsection 'Guidelines', as a subsection
named “Commit Message Guideline”. The text in the second-last item in
subsection 'Guidelines' that mentions PR tags is also updated to refer
to this guideline.

This documentation will help guide contributors on how to write good
commit messages when submitting code / creating Pull Requests, in
accordance with RFC-0088:

https://github.com/apache/tvm-rfcs/blob/main/rfcs/0088-commit-message-guideline.md
---
 docs/contribute/pull_request.rst | 113 ++++++++++++++++++++++++++++++-
 1 file changed, 112 insertions(+), 1 deletion(-)

diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst
index 81852a212610..7b5509be0aa9 100644
--- a/docs/contribute/pull_request.rst
+++ b/docs/contribute/pull_request.rst
@@ -62,7 +62,12 @@ Guidelines
 - Add test-cases to cover the new features or bugfix the patch introduces.
 - Document the code you wrote, see more at :ref:`doc_guide`
 - `Create a pull request <https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request>`_ and fix the problems reported by CI checks.
-- Request code reviews from other contributors and improve your patch according to their reviews by ``@``-ing them in your pull request. Tags in PR titles will automatically tag subscribed users, so make sure to put relevant topics in your PR titles (e.g. ``[microTVM] a cool change`` and not ``a cool change for microTVM``).
+- Request code reviews from other contributors and improve your patch according
+  to their reviews by ``@``-ing them in your pull request. Tags in PR titles
+  will automatically tag subscribed users, so make sure to put relevant topics
+  in your PR titles (e.g. ``[microTVM] Add a cool change`` and not ``a cool change for microTVM``).
+  Please see the Commit Message Guideline below on the guidelines about the tags
+  in a PR/commit title and how to write good PR/commit messages.
 
   - To get your code reviewed quickly, we encourage you to help review others' code so they can do the favor in return.
   - Code review is a shepherding process that helps to improve contributor's code quality.
@@ -72,6 +77,112 @@ Guidelines
 
 - The PR can be merged after the reviewers approve the pull request.
 
+Commit Message Guideline
+------------------------
+
+Apache TVM uses the Github (GH) platform for patch submission and code review
+via Pull Requests (PRs). The final commit (title and body) that is merged into
+the Apache TVM main tree is composed of the PR's title and body and must be kept
+updated and reflecting the new changes in the code as per the reviews and
+discussions.
+
+Although these guidelines apply essentially to the PRs’ title and body messages,
+because GH auto-generates the PR’s title and body from the commits on a given
+branch, it’s recommended to follow these guidelines right from the beginning,
+when preparing commits in general to be submitted to the Apache TVM project.
+This will ease the creation of a new PR, avoiding rework, and also will help the
+review.
+
+The rules below will help to achieve uniformity that has several benefits, both
+for review and for the code base maintenance as a whole, helping you to write
+commit messages with a good quality suitable for the Apache TVM project,
+allowing fast log searches, bisecting, and so on.
+
+*PR/commit title*:
+
+ - Guarantee a title exists (enforced);
+ - Don’t use Github usernames in the title, like @username (enforced);
+ - A tag must be present as a hint about what component(s) of the code
+   the PRs / commits “touch” (enforced). For example [BugFix], [CI], [microTVM],
+   and [TVMC]. Tags go between square brackets and appear first in the title. If
+   more than one tag exist, multiple brackets should be used, like [BugFix][CI].
+   The case recommended for tags, in geral, is the upper camel case. For example,
+   prefer the forms [Fix], [BugFix], and [Docker] instead of [fix], [bug_fix],
+   and [docker]. Acronyms should be kept as such so, for example, use [CI] and
+   [TVMC] instead of [ci] and [tvmc]. Tags help reviewers to identify the PRs
+   they can/want to review and also help the release folks when generating the
+   release notes;
+ - Use an imperative mood. Avoid titles like “Added operator X” and “Updated
+   image Y in the CI”, instead use the forms “Add feature X” and “Update image Y
+   in the CI” instead;
+ - Observe proper use of caps at the beginning (uppercase for the first letter)
+   and for acronyms, like, for instance, TVM, FVP, OpenCL. Hence instead of
+   “fix tvm use of opencl library”, write it as “Fix TVM use of OpenCL library”;
+ - Do not put a period at the end of the title.
+
+*PR/commit body*:
+
+ - Guarantee a body exists (enforced);
+ - Don’t use Github usernames in body text, like @username (enforced);
+ - Avoid “bullet” commit message bodies: “bullet” commit message bodies are not
+   bad per se, but “bullet” commit messages without any description or
+   explanation is likely as bad as commits without any description, rationale,
+   or explanation in the body.
+
+For minor deviations from these guidelines, the community will normally favor
+reminding the contributor of this policy over reverting or blocking a commmit /
+PR.
+
+Commits and PRs without a title and/or a body are not considered minor
+deviations from these guidelines and hence must be avoided.
+
+Most importantly, the contents of the commit message, especially the body,
+should be written to convey the intention of the change, so it should avoid
+being vague. For example, commits with a title like “Fix”, “Cleanup”, and
+“Fix flaky test” and without any body text should be avoided. Also, for the
+review, it will leave the reviewer wondering about what exactly was fixed or
+changed and why the change is necessary, slowing the review.
+
+Below is an example that can be used as a model:
+
+::
+
+ [microTVM] Zephyr: Remove zephyr_board option from build, flash, and open_transport methods
+
+ Currently it’s necessary to pass the board type via ‘zephyr_board’ option to
+ the Project API build, flash, and open_transport methods.
+
+ However, since the board type is already configured when the project is
+ created (i.e. when the generate_project method is called), it’s possible to
+ avoid this redundancy by obtaining the board type from the project
+ configuration files.
+
+ This commit adds code to obtain the board type from the project CMake files,
+ removing this option from build, flash, and open_transport methods, so it’s
+ only necessary to specify the ‘zephyr_board’ option when calling
+ generate_project.
+
+ This commit also moves the ‘verbose’ and ‘west_cmd’ options from ‘build’
+ method to ‘generate_project’, reducing further the number of required options
+ when building a project, since the ‘build’ method is usually called more often
+ than the ‘generate_project’.
+
+After a new PR is created and the review starts it’s common that reviewers will
+request changes. Usually the author will address the reviewers’ comments and
+push additional commits on top of the initial ones. For these additional commits
+there is no recommendation regarding the commit messages. However if the
+additional commits render the PR title and/or body outdated then it's the
+author's responsibility to keep the PR title and body in sync with new changes
+in the code and updated the PR title and body accordingly (remember that the PR
+title and body will be used to compose the final commit message that will land
+in the main tree).
+
+Committers will seek to fix any issues with the commit message prior to
+committing but they retain the right to inform the author of the rules and
+encourage them to follow them in future. Also, they retain the right to ask to
+the author to update the PR title and/or body when they are not correctly
+updated or fixed.
+
 CI Environment
 --------------
 We use Docker images to create stable CI environments that can be deployed to multiple machines.

From 6cd31e7bf1d9fed7e2e9f5de1b725d1fdc5a4659 Mon Sep 17 00:00:00 2001
From: "yin.changsheng" <yin.changsheng@intellif.com>
Date: Wed, 7 Sep 2022 15:03:47 +0800
Subject: [PATCH 113/704] [TIR] Fix pragma_loop_partition_hint attrs should
 check it's value (#12699)

Current LoopPartition doesn't check the value of attribute key "pragma_loop_partition_hint". Whatever I set pragma_loop_partition_hint to True or False, the result is same, which is confused for debug.

This PR fix pragma_loop_partition_hint attribute key should check it's value.
---
 src/tir/transforms/loop_partition.cc            | 17 ++++++++++-------
 .../test_tir_transform_loop_partition.py        |  2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc
index 6ecc6459b904..d410f8cfa471 100644
--- a/src/tir/transforms/loop_partition.cc
+++ b/src/tir/transforms/loop_partition.cc
@@ -139,14 +139,16 @@ class CandidateSelector final : public StmtExprVisitor {
         return;
       }
     } else if (op->attr_key == attr::pragma_loop_partition_hint) {
-      const VarNode* var = nullptr;
-      if (op->node->IsInstance<VarNode>()) {
-        var = op->node.as<VarNode>();
-      } else if (op->node->IsInstance<IterVarNode>()) {
-        var = op->node.as<IterVarNode>()->var.get();
+      if (analyzer_.CanProve(op->value)) {
+        const VarNode* var = nullptr;
+        if (op->node->IsInstance<VarNode>()) {
+          var = op->node.as<VarNode>();
+        } else if (op->node->IsInstance<IterVarNode>()) {
+          var = op->node.as<IterVarNode>()->var.get();
+        }
+        ICHECK(var);
+        partition_hint_vars.insert(var);
       }
-      ICHECK(var);
-      partition_hint_vars.insert(var);
     }
     StmtExprVisitor::VisitStmt_(op);
   }
@@ -191,6 +193,7 @@ class CandidateSelector final : public StmtExprVisitor {
   bool no_split_{false};
   bool partition_const_loop_{false};
   std::unordered_map<const VarNode*, VarIsUsed> record_;
+  arith::Analyzer analyzer_;
 };
 
 // Finder try best to find partitions for hinted vars
diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py
index 86f2b6696b3d..b6e8d92f8d39 100644
--- a/tests/python/unittest/test_tir_transform_loop_partition.py
+++ b/tests/python/unittest/test_tir_transform_loop_partition.py
@@ -559,7 +559,7 @@ def test_explicit_partition_hint():
     C = te.compute((32,), lambda i: te.if_then_else(i < 16, A[i], B[i]), name="C")
     s = te.create_schedule(C.op)
     s.normalize()
-    s[C].pragma(s[C].op.axis[0], "loop_partition_hint")
+    s[C].pragma(s[C].op.axis[0], "loop_partition_hint", True)
     mod = tvm.driver.build_module.schedule_to_module(s, [A, B, C], "main", None)
     with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
         mod = tvm.tir.transform.StorageFlatten(64)(mod)

From 291dd2f06331342f5c89216d5d211cb61fe3d19f Mon Sep 17 00:00:00 2001
From: cery999 <112694109+cery999@users.noreply.github.com>
Date: Wed, 7 Sep 2022 15:06:31 +0800
Subject: [PATCH 114/704] support false-positive fast math (#12702)

---
 include/tvm/topi/elemwise.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/topi/elemwise.h b/include/tvm/topi/elemwise.h
index fc9ab139887e..f26105cb180b 100644
--- a/include/tvm/topi/elemwise.h
+++ b/include/tvm/topi/elemwise.h
@@ -81,7 +81,7 @@ TOPI_DECLARE_UNARY_OP(isinf);
 inline Tensor fast_tanh_float(const Tensor& in, std::string name, std::string tag) {
   // Clamp the inputs to the range [-9, 9] since anything outside
   // this range is +/-1.0f in single-precision.
-  auto x = maximum(minimum(in, make_const(in->dtype, 9.0)), make_const(in->dtype, -9.0));
+  auto x = maximum(make_const(in->dtype, -9.0), minimum(make_const(in->dtype, 9.0), in));
 
   // The monomial coefficients of the numerator polynomial (odd).
   auto alpha_1 = make_const(in->dtype, 4.89352455891786e-03);

From b55ffcd18b049ae7a76e02d561535530f384c5d8 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 7 Sep 2022 10:05:52 +0100
Subject: [PATCH 115/704] [ETHOSN] Add support for transpose convolution
 (#12674)

Adds support for offloading transpose convolution with an optional bias
to the NPU.

Co-authored-by: Samuel Panijel <samuel.panijel@arm.com>
Co-authored-by: Leo Blonk <leo.blonk@arm.com>
---
 python/tvm/relay/op/contrib/ethosn.py         |  18 ++
 src/relay/backend/contrib/ethosn/codegen.cc   |  39 +++
 .../backend/contrib/ethosn/codegen_ethosn.h   |   1 +
 .../contrib/ethosn/convert_equivalent.cc      |  15 +-
 .../backend/contrib/ethosn/ethosn_api.cc      | 126 ++++++++++
 src/relay/backend/contrib/ethosn/ethosn_api.h |  23 ++
 .../contrib/test_ethosn/infrastructure.py     |  43 ++++
 .../python/contrib/test_ethosn/test_conv2d.py |  21 +-
 .../test_ethosn/test_conv2d_transpose.py      | 234 ++++++++++++++++++
 9 files changed, 487 insertions(+), 33 deletions(-)
 create mode 100644 tests/python/contrib/test_ethosn/test_conv2d_transpose.py

diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index a4e9d9647c95..5129ed9ffaef 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -233,6 +233,16 @@ def qnn_add_pattern():
 
         return input_is_left | input_is_right | two_inputs
 
+    def qnn_conv2d_transpose_pattern():
+        pattern = is_op("qnn.conv2d_transpose")(
+            wildcard(), is_constant(), is_constant(), is_constant(), is_constant(), is_constant()
+        ).has_attr({"data_layout": "NHWC"})
+        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
+        pattern = is_op("qnn.requantize")(
+            pattern, is_constant(), is_constant(), is_constant(), is_constant()
+        )
+        return pattern
+
     def check_conv2d(extract):
         """Check if a conv2d is supported by Ethos-N."""
         if not ethosn_available():
@@ -261,6 +271,13 @@ def check_mean(extract):
 
         return _ethosn.mean(extract)
 
+    def check_conv2d_transpose(extract):
+        """Check if conv2d_transpose is supported by Ethos-N."""
+        if not ethosn_available():
+            return False
+
+        return _ethosn.conv2d_transpose(extract)
+
     def check_sigmoid(extract):
         """Check if a sigmoid is supported by Ethos-N."""
         if not ethosn_available():
@@ -326,6 +343,7 @@ def check_add(extract):
         ("ethos-n.qnn_mul", qnn_mul_pattern(), check_mul),
         ("ethos-n.qnn_add", qnn_add_pattern(), check_add),
         ("ethos-n.qnn_conv2d", qnn_conv_pattern(), check_conv2d),
+        ("ethos-n.qnn_conv2d_transpose", qnn_conv2d_transpose_pattern(), check_conv2d_transpose),
         ("ethos-n.qnn_avg_pool2d", qnn_avg_pool2d_pattern(), check_avg_pool2d),
         ("ethos-n.qnn_sigmoid", qnn_sigmoid_pattern(), check_sigmoid),
         ("ethos-n.qnn_fc", qnn_fc_pattern(), check_fc),
diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index 69672a143585..c7109b754d2b 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -125,6 +125,10 @@ void InferTensorsVisitor::InferCall(const CallNode* cn) {
     LeakyReLUParams params;
     err += EthosnAPI::LeakyReLU(cn->op.as<FunctionNode>()->body, &params);
     tensor_table_[cn->args[0]] = {params.input_info};
+  } else if (IsEthosnFunc(call, "ethos-n.qnn_conv2d_transpose")) {
+    QnnConv2dTransposeParams params;
+    err += EthosnAPI::QnnConv2dTranspose(cn->op.as<FunctionNode>()->body, &params);
+    tensor_table_[cn->args[0]] = {params.input_info};
   } else if (IsEthosnOp(call, "qnn.concatenate")) {
     ConcatenateParams params;
     err = EthosnAPI::Concatenate(call, &params);
@@ -311,6 +315,9 @@ sl::TensorsAndId ConstructNetworkVisitor::HandleCall(const CallNode* cn) {
   } else if (IsEthosnFunc(call, "ethos-n.qnn_leaky_relu")) {
     if ((err = MakeLeakyReLULayer(call, &tensor))) ReportFatalError(call, err);
     return MakeOps(tensor);
+  } else if (IsEthosnFunc(call, "ethos-n.qnn_conv2d_transpose")) {
+    if ((err = MakeConv2DTransposeLayer(call, &tensor))) ReportFatalError(call, err);
+    return MakeOps(tensor);
   } else if (IsEthosnOp(call, "qnn.concatenate")) {
     if ((err = MakeConcatenateLayer(call, &tensor))) ReportFatalError(call, err);
     return MakeOps(tensor);
@@ -537,6 +544,24 @@ EthosnError ConstructNetworkVisitor::MakeLeakyReLULayer(const Call& call,
   return EthosnError();
 }
 
+EthosnError ConstructNetworkVisitor::MakeConv2DTransposeLayer(const Call& call,
+                                                              sl::TensorAndId<sl::Operand>* out) {
+  QnnConv2dTransposeParams params;
+  if (auto err = EthosnAPI::QnnConv2dTranspose(call->op.as<FunctionNode>()->body, &params)) {
+    return err;
+  }
+
+  auto activation = operand_table_[call->args[0]][0];
+  auto weights = AddConstant(network_, params.weights_info, params.raw_weights->data).tensor;
+  auto bias = AddConstant(network_, params.bias_info, params.raw_bias->data).tensor;
+  try {
+    *out = AddTransposeConvolution(network_, *activation, *bias, *weights, params.conv_info);
+  } catch (const sl::NotSupportedException& e) {
+    return EthosnError(e.what());
+  }
+  return EthosnError();
+}
+
 EthosnError ConstructNetworkVisitor::MakeConcatenateLayer(const Call& call,
                                                           sl::TensorAndId<sl::Operand>* out) {
   ConcatenateParams params;
@@ -913,6 +938,20 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.leaky_relu")
       err += EthosnError(reason);
     });
 
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.conv2d_transpose")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      QnnConv2dTransposeParams params;
+      auto err = EthosnAPI::QnnConv2dTranspose(call, &params);
+      err += EthosnCompiler::SupportedSetup();
+      char reason[kReasonMaxLength];
+      reason[0] = '\0';
+      *rv = !err && EthosnCompiler::GetSupported()->IsTransposeConvolutionSupported(
+                        params.bias_info, params.weights_info, params.conv_info, params.input_info,
+                        &params.output_info, reason, sizeof(reason));
+      err += EthosnError(reason);
+    });
+
 TVM_REGISTER_GLOBAL("relay.ethos-n.support.concatenate")
     .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
       Call call = args[0];
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index 863a032cafba..a653b0b8dc97 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -206,6 +206,7 @@ class ConstructNetworkVisitor : public MixedModeVisitor, private ErrorReportingP
   EthosnError MakeSigmoidLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeMeanLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeTanhLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
+  EthosnError MakeConv2DTransposeLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeConcatenateLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeSplitLayer(const Call& call, sl::TensorsAndId* outs);
   EthosnError MakeDepthToSpaceLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
diff --git a/src/relay/backend/contrib/ethosn/convert_equivalent.cc b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
index 12b5a12afb35..91c924b1b04f 100644
--- a/src/relay/backend/contrib/ethosn/convert_equivalent.cc
+++ b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
@@ -32,26 +32,13 @@
 #include "../../../qnn/utils.h"
 #include "../../../transforms/pattern_utils.h"
 #include "../../../transforms/simplify_expr.h"
+#include "ethosn_api.h"
 
 namespace tvm {
 namespace relay {
 namespace contrib {
 namespace ethosn {
 
-/*!
- * \brief Apply constant folding on an expression.
- *
- * \param expr The expression to fold.
- * \param fold_qnn Whether to fold constants for QNN operations.
- * \returns The new folded expression.
- */
-Expr FoldConstantExpr(const Expr& expr, bool fold_qnn = true) {
-  auto mod = IRModule::FromExpr(expr);
-  mod = transform::FoldConstant(fold_qnn)(mod);
-  auto entry_func = Downcast<Function>(mod->Lookup("main"));
-  return expr.as<FunctionNode>() == nullptr ? entry_func->body : entry_func;
-}
-
 /*!
  * \brief Converts qnn.mul to mathematically equivalent
  * qnn.conv2d depthwise operation.
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc
index 4f01c924cf6e..ce57cc23419a 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.cc
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc
@@ -23,6 +23,7 @@
 
 #include "ethosn_api.h"
 
+#include <tvm/relay/analysis.h>
 #include <tvm/relay/attrs/image.h>
 #include <tvm/relay/attrs/nn.h>
 #include <tvm/relay/expr.h>
@@ -37,6 +38,9 @@
 #include <utility>
 #include <vector>
 
+#include "../../../op/make_op.h"
+#include "../../../transforms/pattern_utils.h"
+#include "../../../transforms/simplify_expr.h"
 #include "ethosn_support_library/Support.hpp"
 #include "ethosn_support_library/SupportQueries.hpp"
 #include "tvm/relay/qnn/attrs.h"
@@ -445,6 +449,121 @@ EthosnError EthosnAPI::Mean(const Expr& expr, MeanParams* params) {
   return err;
 }
 
+Constant TransposeWeights(const Constant& data, const std::string& input_layout) {
+  int pos_h = input_layout.find("H");
+  int pos_w = input_layout.find("W");
+  int pos_i = input_layout.find("I");
+  int pos_o = input_layout.find("O");
+
+  // Currently the expected target layout is HWIO only.
+  Array<Integer> target_shape = {pos_h, pos_w, pos_i, pos_o};
+
+  Expr transpose = MakeTranspose(data, target_shape);
+  transpose = InferType(FoldConstantExpr(transpose));
+  Constant transposed_data = Downcast<Constant>(transpose);
+  return transposed_data;
+}
+
+EthosnError EthosnAPI::QnnConv2dTranspose(const Expr& expr, QnnConv2dTransposeParams* params) {
+  Call requantize = Downcast<Call>(expr);
+  Call bias;
+  Call conv2d_transpose;
+  if (requantize->args[0]->IsInstance<CallNode>() &&
+      Downcast<Call>(requantize->args[0])->op == Op::Get("nn.bias_add")) {
+    bias = Downcast<Call>(requantize->args[0]);
+    conv2d_transpose = Downcast<Call>(bias->args[0]);
+  } else {
+    conv2d_transpose = Downcast<Call>(requantize->args[0]);
+  }
+  const auto& conv_attr = conv2d_transpose->attrs.as<Conv2DTransposeAttrs>();
+  ICHECK(conv_attr) << "Expected type Conv2DTransposeAttrs but was "
+                    << conv2d_transpose->attrs->GetTypeKey();
+
+  int input_zero_point;
+  int kernel_zero_point;
+  int output_zero_point;
+  std::valarray<float> input_scale;
+  std::valarray<float> kernel_scale;
+  float output_scale;
+  unsigned int qaxis = conv_attr->kernel_layout.find("O");
+
+  EthosnError err = AsConstant(conv2d_transpose->args[2], &input_zero_point);
+  err += AsConstant(conv2d_transpose->args[3], &kernel_zero_point);
+  err += AsConstant(requantize->args[4], &output_zero_point);
+  err += AsConstant(conv2d_transpose->args[4], &input_scale);
+  err += AsConstant(conv2d_transpose->args[5], &kernel_scale);
+  err += AsConstant(requantize->args[3], &output_scale);
+
+  // Convert quantization params
+  sl::QuantizationInfo input_q_info;
+  sl::QuantizationInfo weights_q_info;
+  sl::QuantizationInfo bias_q_info;
+  sl::QuantizationInfo output_q_info;
+  err += Tvm2Npu(input_zero_point, input_scale, qaxis, &input_q_info);
+  err += Tvm2Npu(kernel_zero_point, kernel_scale, qaxis, &weights_q_info);
+  std::valarray<float> bias_scales = input_q_info.GetScales() * weights_q_info.GetScales();
+  err += Tvm2Npu(0, bias_scales, 3, &bias_q_info);
+  err += Tvm2Npu(output_zero_point, output_scale, &output_q_info);
+
+  // Convert convolution attributes
+  sl::Padding padding;
+  err += Tvm2Npu(conv_attr->padding, &padding);
+  sl::Stride stride;
+  err += Tvm2Npu(conv_attr->strides, &stride);
+  // Dilation is not supported
+  std::array<uint32_t, 2> dilation = {1, 1};
+  AsArray(conv_attr->dilation, &dilation);
+  if (conv_attr->dilation.size() != 2 || dilation[0] != 1 || dilation[1] != 1) {
+    err +=
+        EthosnError(ErrStrm() << "dilation=" << conv_attr->dilation << ", dilation must = [1, 1]");
+  }
+
+  // Create convolution info
+  params->conv_info = sl::ConvolutionInfo(padding, stride, output_q_info);
+
+  // Create input info
+  sl::TensorInfo input_tensor_info;
+  err += Tvm2Npu(conv2d_transpose->args[0]->checked_type(), &input_tensor_info);
+  input_tensor_info.m_QuantizationInfo = input_q_info;
+  params->input_info = input_tensor_info;
+
+  // Create weights info
+  Constant weights_data = Downcast<Constant>(conv2d_transpose->args[1]);
+  if (conv_attr->kernel_layout != "HWIO") {
+    weights_data = TransposeWeights(weights_data, conv_attr->kernel_layout);
+  }
+  const auto* weights_ttype = weights_data->checked_type().as<TensorTypeNode>();
+  sl::TensorShape weights_tensor_shape;
+  sl::DataType weights_data_type;
+  sl::DataFormat weights_data_format;
+  // Ignore the error here because weights don't have a batch axis
+  Tvm2Npu(weights_ttype->shape, &weights_tensor_shape);
+  err += Tvm2Npu(weights_ttype->dtype, &weights_data_type);
+  err += Tvm2Npu("HWIO", &weights_data_format);
+  params->weights_info =
+      sl::TensorInfo(weights_tensor_shape, weights_data_type, weights_data_format, weights_q_info);
+
+  params->raw_weights = weights_data->data;
+
+  // Create bias info
+  unsigned int out_channels = Downcast<IntImm>(conv_attr->channels)->value;
+  params->bias_info = sl::TensorInfo({1, 1, 1, out_channels}, sl::DataType::INT32_QUANTIZED,
+                                     sl::DataFormat::NHWC, bias_q_info);
+  if (bias.defined()) {
+    params->raw_bias = Downcast<Constant>(bias->args[1])->data;
+  } else {
+    params->raw_bias = MakeConstantZeros(tvm::DataType::Int(32), {1, 1, 1, out_channels})->data;
+  }
+
+  // Create output info
+  sl::TensorInfo output_tensor_info;
+  err += Tvm2Npu(requantize->checked_type(), &output_tensor_info);
+  output_tensor_info.m_QuantizationInfo = output_q_info;
+  params->output_info = output_tensor_info;
+
+  return err;
+}
+
 EthosnError EthosnAPI::Tanh(const Expr& expr, TanhParams* params) {
   Call quantize = Downcast<Call>(expr);
   Call tanh = Downcast<Call>(quantize->args[0]);
@@ -925,6 +1044,13 @@ EthosnError EthosnAPI::AsConstant(const Expr& expr, T* out) {
   return EthosnError();
 }
 
+Expr FoldConstantExpr(const Expr& expr, bool fold_qnn) {
+  auto mod = IRModule::FromExpr(expr);
+  mod = transform::FoldConstant(fold_qnn)(mod);
+  auto entry_func = Downcast<Function>(mod->Lookup("main"));
+  return expr.as<FunctionNode>() == nullptr ? entry_func->body : entry_func;
+}
+
 }  // namespace ethosn
 }  // namespace contrib
 }  // namespace relay
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.h b/src/relay/backend/contrib/ethosn/ethosn_api.h
index afe4736bfc40..167106c3d06d 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.h
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.h
@@ -24,6 +24,7 @@
 #ifndef TVM_RELAY_BACKEND_CONTRIB_ETHOSN_ETHOSN_API_H_
 #define TVM_RELAY_BACKEND_CONTRIB_ETHOSN_ETHOSN_API_H_
 
+#include <tvm/relay/attrs/nn.h>
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/transform.h>
@@ -115,6 +116,16 @@ struct LeakyReLUParams {
   sl::TensorInfo output_info;
 };
 
+struct QnnConv2dTransposeParams {
+  sl::ConvolutionInfo conv_info;
+  sl::TensorInfo input_info;
+  sl::TensorInfo weights_info;
+  sl::TensorInfo bias_info;
+  sl::TensorInfo output_info;
+  runtime::NDArray raw_weights;
+  runtime::NDArray raw_bias;
+};
+
 struct ConcatenateParams {
   sl::QuantizationInfo qInfo;
   sl::ConcatenationInfo concat_info = sl::ConcatenationInfo(1, qInfo);
@@ -237,6 +248,9 @@ class EthosnAPI {
   static EthosnError Tanh(const Expr& expr, TanhParams* params);
   /*! \brief Extract the Support Library leaky relu params from an ethos-n leaky relu Relu call. */
   static EthosnError LeakyReLU(const Expr& expr, LeakyReLUParams* params);
+  /*! \brief Extract the Support Library transpose params from a Relay
+   * ethos-n.qnn_conv2d_transpose func */
+  static EthosnError QnnConv2dTranspose(const Expr& expr, QnnConv2dTransposeParams* params);
   /*! \brief Extract the Support Library concatenate params from a Relay qnn.concatenate call */
   static EthosnError Concatenate(const Expr& expr, ConcatenateParams* params);
   /*! \brief Extract the Support Library split params from a Relay split call */
@@ -294,6 +308,15 @@ class EthosnAPI {
   static EthosnError AsConstant(const Expr& expr, std::valarray<float>* out);
 };
 
+/*!
+ * \brief Apply constant folding on an expression.
+ *
+ * \param expr The expression to fold.
+ * \param fold_qnn Whether to fold constants for QNN operations.
+ * \returns The new folded expression.
+ */
+Expr FoldConstantExpr(const Expr& expr, bool fold_qnn = true);
+
 }  // namespace ethosn
 }  // namespace contrib
 }  // namespace relay
diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
index c658b33747c3..6b019686968e 100644
--- a/tests/python/contrib/test_ethosn/infrastructure.py
+++ b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -21,6 +21,9 @@
 from hashlib import md5
 from itertools import zip_longest, combinations
 import os
+from typing import Tuple
+import math
+
 import numpy as np
 from PIL import Image
 
@@ -28,6 +31,7 @@
 from tvm import relay
 from tvm.contrib import utils, graph_executor, download
 from tvm.relay.op.contrib import partition_for_ethosn
+
 from . import _infrastructure
 
 
@@ -340,5 +344,44 @@ def get_conv2d_qnn_params(
     return output_zp, output_sc
 
 
+def get_same_padding(
+    data: Tuple[int, int],
+    kernel: Tuple[int, int],
+    dilation: Tuple[int, int],
+    stride: Tuple[int, int],
+) -> Tuple[int, int, int, int]:
+    """
+    Get the padding values required for 'SAME' padding.
+
+    Parameters
+    ----------
+    data : Tuple[int, int]
+        The height and width of the data respectively.
+    kernel : Tuple[int, int]
+        The height and width of the kernel respectively.
+    dilation : Tuple[int, int]
+        The dilation of the kernel.
+    stride : Tuple[int, int]
+        The stride of the kernel.
+
+    Returns
+    -------
+    Tuple[int, int, int, int]
+        The padding values for top, left, bottom and right respectively.
+    """
+    dilated_kernel_h = dilation[0] * (kernel[0] - 1) + 1
+    dilated_kernel_w = dilation[1] * (kernel[1] - 1) + 1
+    out = int(math.ceil(float(data[0]) / float(stride[0])))
+    pad = max(0, (out - 1) * stride[0] + dilated_kernel_h - data[0])
+    pad_top = pad // 2
+    pad_bottom = pad - pad_top
+
+    out = int(math.ceil(float(data[1]) / float(stride[1])))
+    pad = max(0, (out - 1) * stride[1] + dilated_kernel_w - data[1])
+    pad_left = pad // 2
+    pad_right = pad - pad_left
+    return (pad_top, pad_left, pad_bottom, pad_right)
+
+
 def get_ethosn_variant():
     return os.getenv("ETHOSN_VARIANT_CONFIG", default="Ethos-N78_1TOPS_2PLE_RATIO")
diff --git a/tests/python/contrib/test_ethosn/test_conv2d.py b/tests/python/contrib/test_ethosn/test_conv2d.py
index 4026f8267d72..a6ce73656bfc 100644
--- a/tests/python/contrib/test_ethosn/test_conv2d.py
+++ b/tests/python/contrib/test_ethosn/test_conv2d.py
@@ -17,8 +17,6 @@
 
 """Arm(R) Ethos(TM)-N integration conv2d tests"""
 
-import math
-
 import numpy as np
 import pytest
 
@@ -29,21 +27,6 @@
 from . import infrastructure as tei
 
 
-def _get_same_padding(data, kernel, dilation, stride):
-    dilated_kernel_h = dilation[0] * (kernel[0] - 1) + 1
-    dilated_kernel_w = dilation[1] * (kernel[1] - 1) + 1
-    out = int(math.ceil(float(data[0]) / float(stride[0])))
-    pad = max(0, (out - 1) * stride[0] + dilated_kernel_h - data[0])
-    pad_top = pad // 2
-    pad_bottom = pad - pad_top
-
-    out = int(math.ceil(float(data[1]) / float(stride[1])))
-    pad = max(0, (out - 1) * stride[1] + dilated_kernel_w - data[1])
-    pad_left = pad // 2
-    pad_right = pad - pad_left
-    return [pad_top, pad_left, pad_bottom, pad_right]
-
-
 def _get_model(
     shape,
     kernel_h,
@@ -65,7 +48,7 @@ def _get_model(
     """Return a model and any parameters it may have"""
     a = relay.var("a", shape=shape, dtype=dtype)
     if pad in ("op", "both"):
-        p = _get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
+        p = tei.get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
         a = relay.nn.pad(
             a,
             pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)],
@@ -74,7 +57,7 @@ def _get_model(
         )
         shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3])
 
-    p = _get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
+    p = tei.get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
     if weight_format == "HWIO":
         weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels)
     else:
diff --git a/tests/python/contrib/test_ethosn/test_conv2d_transpose.py b/tests/python/contrib/test_ethosn/test_conv2d_transpose.py
new file mode 100644
index 000000000000..84aa7e969b30
--- /dev/null
+++ b/tests/python/contrib/test_ethosn/test_conv2d_transpose.py
@@ -0,0 +1,234 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Arm(R) Ethos(TM)-N integration conv2d tests"""
+
+import pytest
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.testing import requires_ethosn
+from . import infrastructure as tei
+
+
+def _get_model(
+    shape,
+    kernel_h,
+    kernel_w,
+    input_zp,
+    input_sc,
+    kernel_zp,
+    kernel_sc,
+    output_zp,
+    output_sc,
+    stride,
+    dilation,
+    groups,
+    kernel_layout,
+    dtype,
+    out_channels,
+    bias,
+):
+    """Return a model and any parameters it may have"""
+    a = relay.var("a", shape=shape, dtype=dtype)
+    p = tei.get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, stride)
+    weight_shape = (shape[3], out_channels // groups, kernel_h, kernel_w)
+
+    weight_data = tvm.nd.array(
+        np.random.randint(
+            np.iinfo(dtype).min,
+            high=(np.iinfo(dtype).max + 1),
+            size=weight_shape,
+            dtype=dtype,
+        )
+    )
+    weights = relay.const(weight_data, dtype)
+    op = relay.qnn.op.conv2d_transpose(
+        a,
+        weights,
+        input_zero_point=relay.const(input_zp, "int32"),
+        input_scale=relay.const(input_sc, "float32"),
+        kernel_zero_point=relay.const(kernel_zp, "int32"),
+        kernel_scale=relay.const(kernel_sc, "float32"),
+        kernel_size=(kernel_h, kernel_w),
+        padding=p,
+        strides=stride,
+        dilation=dilation,
+        data_layout="NHWC",
+        kernel_layout=kernel_layout,
+        out_dtype="int32",
+        channels=out_channels,
+        groups=groups,
+    )
+    if bias:
+        bias_data = tvm.nd.array(
+            np.random.randint(
+                np.iinfo(dtype).min,
+                high=np.iinfo(dtype).max + 1,
+                size=(out_channels,),
+                dtype="int32",
+            )
+        )
+        biasc = relay.const(bias_data, "int32")
+        op = relay.nn.bias_add(op, biasc, axis=3)
+
+    if isinstance(kernel_sc, tvm.runtime.ndarray.NDArray):
+        req_input_sc = [sc * input_sc for sc in kernel_sc.numpy()]
+    else:
+        req_input_sc = input_sc * kernel_sc
+
+    op = relay.qnn.op.requantize(
+        op,
+        input_zero_point=relay.const(input_zp, "int32"),
+        input_scale=relay.const(req_input_sc, "float32"),
+        output_zero_point=relay.const(output_zp, "int32"),
+        output_scale=relay.const(output_sc, "float32"),
+        axis=3,
+        rounding="UPWARD",
+        out_dtype=dtype,
+    )
+    params = {"w": weight_data}
+    if bias:
+        params["b"] = bias_data
+    return op, params
+
+
+@requires_ethosn
+@pytest.mark.parametrize("dtype", ["uint8", "int8"])
+@pytest.mark.parametrize(
+    "ifm_shape,strides,kernel_size,out_channels,bias",
+    [
+        ((1, 2, 2, 1), (2, 2), (1, 1), 1, False),
+        ((1, 2, 2, 5), (2, 2), (3, 5), 4, False),
+        ((1, 7, 7, 4), (2, 2), (7, 9), 8, True),
+    ],
+)
+def test_conv2d_transpose(ifm_shape, strides, kernel_size, out_channels, dtype, bias):
+    """Check transpose convolution output with TVM."""
+    np.random.seed(0)
+
+    kernel_layout = "IOHW"
+    dilation = (1, 1)
+    groups = 1
+
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+
+    input_zp = np.random.randint(data_min, data_max)
+    input_sc = np.random.random() * 2
+    kernel_zp = np.random.randint(data_min, data_max)
+    kernel_sc = np.random.random() * 4
+    output_zp, output_sc = tei.get_conv2d_qnn_params(
+        dtype, input_zp, input_sc, kernel_zp, kernel_sc, ifm_shape[1], ifm_shape[2], ifm_shape[3]
+    )
+
+    model, params = _get_model(
+        shape=ifm_shape,
+        kernel_h=kernel_size[0],
+        kernel_w=kernel_size[1],
+        input_zp=input_zp,
+        input_sc=input_sc,
+        kernel_zp=kernel_zp,
+        kernel_sc=kernel_sc,
+        output_zp=output_zp,
+        output_sc=output_sc,
+        stride=strides,
+        dilation=dilation,
+        groups=groups,
+        kernel_layout=kernel_layout,
+        dtype=dtype,
+        out_channels=out_channels,
+        bias=bias,
+    )
+
+    outputs = []
+    inputs = {
+        "a": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=ifm_shape, dtype=dtype))
+    }
+
+    for npu in [False, True]:
+        mod = tei.make_module(model, params)
+        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+
+    tei.verify(outputs, dtype, 1)
+
+
+@requires_ethosn
+@pytest.mark.parametrize("dtype", ["uint8", "int8"])
+@pytest.mark.parametrize(
+    "shape, stride, dilation, groups, err_msg",
+    [
+        (
+            (1, 4, 4, 4),
+            (1, 1, 1),
+            (1, 1),
+            1,
+            "stride size=3, stride size must = 2",
+        ),
+        (
+            (1, 4, 4, 4),
+            (2, 2),
+            (2, 2),
+            2,
+            "dilation=[2, 2], dilation must = [1, 1]",
+        ),
+        (
+            (2, 4, 4, 4),
+            (1, 1),
+            (1, 1),
+            1,
+            "batch size=2, batch size must = 1",
+        ),
+    ],
+)
+def test_conv2d_transpose_failure(
+    shape,
+    stride,
+    dilation,
+    groups,
+    err_msg,
+    dtype,
+):
+    """
+    Test transpose_conv2d error messages.
+    """
+    np.random.seed(0)
+    out_channels = 8
+
+    model, _ = _get_model(
+        shape=shape,
+        kernel_h=1,
+        kernel_w=1,
+        input_zp=0,
+        input_sc=1,
+        kernel_zp=0,
+        kernel_sc=1,
+        output_zp=0,
+        output_sc=1,
+        stride=stride,
+        dilation=dilation,
+        groups=groups,
+        kernel_layout="IOHW",
+        dtype=dtype,
+        out_channels=out_channels,
+        bias=False,
+    )
+    model = tei.make_ethosn_composite(model, "ethos-n.qnn_conv2d_transpose")
+    mod = tei.make_ethosn_partition(model)
+    tei.test_error(mod, {}, err_msg)

From ff9a5309ecd713214a61e9e848c90289831f70c5 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 7 Sep 2022 07:26:09 -0700
Subject: [PATCH 116/704] [microTVM][Zephyr] Enable -O2 optimization on build
 by default (#12718)

* add spped optimization flag

* trigger

* add exception for qemu_riscv64
---
 .../zephyr/template_project/microtvm_api_server.py       | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
index 76895c430bd6..b73779f68148 100644
--- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py
+++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
@@ -456,6 +456,7 @@ def server_info_query(self, tvm_version):
     }
 
     def _create_prj_conf(self, project_dir, options):
+        zephyr_board = options["zephyr_board"]
         with open(project_dir / "prj.conf", "w") as f:
             f.write(
                 "# For UART used from main().\n"
@@ -477,7 +478,7 @@ def _create_prj_conf(self, project_dir, options):
 
             f.write("# For math routines\n" "CONFIG_NEWLIB_LIBC=y\n" "\n")
 
-            if self._has_fpu(options["zephyr_board"]):
+            if self._has_fpu(zephyr_board):
                 f.write("# For models with floating point.\n" "CONFIG_FPU=y\n" "\n")
 
             # Set main stack size, if needed.
@@ -488,9 +489,13 @@ def _create_prj_conf(self, project_dir, options):
 
             f.write("\n# Extra prj.conf directives\n")
             for line, board_list in self.EXTRA_PRJ_CONF_DIRECTIVES.items():
-                if options["zephyr_board"] in board_list:
+                if zephyr_board in board_list:
                     f.write(f"{line}\n")
 
+            # TODO(mehrdadh): due to https://github.com/apache/tvm/issues/12721
+            if zephyr_board not in ["qemu_riscv64"]:
+                f.write("# For setting -O2 in compiler.\n" "CONFIG_SPEED_OPTIMIZATIONS=y\n")
+
             f.write("\n")
 
     API_SERVER_CRT_LIBS_TOKEN = "<API_SERVER_CRT_LIBS>"

From 269d536be0308f6594b22615d33cc0f0539ad39a Mon Sep 17 00:00:00 2001
From: Aakanksha Verma <89928182+avquicinc@users.noreply.github.com>
Date: Wed, 7 Sep 2022 19:59:54 +0530
Subject: [PATCH 117/704] [HEXAGON] [TOPI] Dequantize (#12677)

dequantize op hexagon
---
 python/tvm/topi/hexagon/qnn/__init__.py       |   5 +
 python/tvm/topi/hexagon/qnn/dequantize.py     |  94 ++++++++++++++
 python/tvm/topi/hexagon/utils.py              |   7 +
 .../contrib/test_hexagon/infrastructure.py    |   2 +
 .../topi/test_dequantize_slice.py             | 121 ++++++++++++++++++
 5 files changed, 229 insertions(+)
 create mode 100644 python/tvm/topi/hexagon/qnn/dequantize.py
 create mode 100644 tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py

diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
index e27e3793d565..25d1e6d1854d 100644
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -18,3 +18,8 @@
 """ Computes and schedules for Hexagon quantized ops """
 
 from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule
+
+from .dequantize import (
+    dequantize_compute,
+    dequantize_schedule,
+)
diff --git a/python/tvm/topi/hexagon/qnn/dequantize.py b/python/tvm/topi/hexagon/qnn/dequantize.py
new file mode 100644
index 000000000000..3e1466e88b38
--- /dev/null
+++ b/python/tvm/topi/hexagon/qnn/dequantize.py
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+""" Hexagon qnn.dequantize slice op compute and schedule"""
+
+from tvm import te
+from tvm import tir
+from ..utils import get_layout_transform_fn
+
+
+def dequantize_compute(tensor_A, scale_A, zero_point_A):
+
+    return te.compute(
+        tensor_A.shape,
+        lambda *indices: (scale_A * (tensor_A[indices] - zero_point_A)).astype("float32"),
+        name="dequantize",
+    )
+
+
+def dequantize_stir_schedule_nhwc_8h8w32c(
+    _in,
+    _out,
+    in_layout,
+    out_layout,
+):
+    """Schedule for nhwc int8/uint8 to f32 : nhwc layout"""
+    func = te.create_prim_func([_in, _out])
+    sch = tir.Schedule(func, debug_mask="all")
+    block_name = "dequantize"
+    n, h, w, c = sch.get_loops(sch.get_block(block_name))
+    ho, hi = sch.split(h, [None, 4])
+    wo, wi = sch.split(w, [None, 8])
+    wio, wii = sch.split(wi, [None, 4])
+    co, ci = sch.split(c, [None, 32])
+    sch.transform_layout(block_name, "A", in_layout)
+    sch.transform_layout(block_name, block_name, out_layout)
+    sch.reorder(n, ho, wo, co, hi, wio, wii, ci)
+    wii_ci = sch.fuse(wii, ci)
+    sch.vectorize(wii_ci)
+    return sch
+
+
+def dequantize_stir_schedule_nc(
+    _in,
+    _out,
+    in_layout,
+    out_layout,
+):
+    """Schedule for nc int8/uint8 to f32 : nc layout"""
+    func = te.create_prim_func([_in, _out])
+    sch = tir.Schedule(func, debug_mask="all")
+    block_name = "dequantize"
+    _, c_orig = sch.get_loops(sch.get_block(block_name))
+    _, c_inner = sch.split(c_orig, [None, 512])
+    sch.transform_layout(block_name, "A", in_layout)
+    sch.transform_layout(block_name, block_name, out_layout)
+    sch.vectorize(c_inner)
+    return sch
+
+
+def dequantize_schedule(_in, _output, in_layout_str, out_layout_str):
+    """Schedule for int8/uint8 to f32 : top level function"""
+    f32_layout_transform_func = get_layout_transform_fn(out_layout_str)
+    in_layout_transform_func = get_layout_transform_fn(in_layout_str)
+    if out_layout_str == "nhwc-4h2w32c2w-2d":
+        return dequantize_stir_schedule_nhwc_8h8w32c(
+            _in,
+            _output,
+            in_layout_transform_func,
+            f32_layout_transform_func,
+        )
+    if out_layout_str == "nc-512c-2d":
+        return dequantize_stir_schedule_nc(
+            _in,
+            _output,
+            in_layout_transform_func,
+            f32_layout_transform_func,
+        )
+    raise RuntimeError(f"Unexpected layout '{layout}'")
diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py
index c056408947b7..9939e5b6fbb7 100644
--- a/python/tvm/topi/hexagon/utils.py
+++ b/python/tvm/topi/hexagon/utils.py
@@ -100,6 +100,11 @@ def nc_2048_2d(n, c):
     return [n, c // 2048, te.AXIS_SEPARATOR, c % 2048]
 
 
+def nc_2048c_2d(n, c):
+    """Return index map for nc_2048 2d layout"""
+    return [n, c // 2048, te.AXIS_SEPARATOR, c % 2048]
+
+
 def nhwc_8h8w32c_2d(n, h, w, c):
     """Return index map for nhwc_8h8w32c 2d layout"""
     return [n, h // 8, w // 8, c // 32, te.AXIS_SEPARATOR, h % 8, w % 8, c % 32]
@@ -156,6 +161,8 @@ def get_layout_transform_fn(layout):
         return nhwc_2048c_2d
     if layout == "nc-2048-2d":
         return nc_2048_2d
+    if layout == "nc-2048c-2d":
+        return nc_2048c_2d
     if layout == "nhwc-8h8w32c-2d":
         return nhwc_8h8w32c_2d
     if layout == "n11c-2048c-2d":
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
index 70e50fcb68d6..71960b649ea2 100644
--- a/tests/python/contrib/test_hexagon/infrastructure.py
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -295,6 +295,8 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str):
             return arr_np.reshape([n, c // 1024, 1024])
         if new_layout in ["nc-512c-2d"]:
             return arr_np.reshape([n, c // 512, 512])
+        if new_layout in ["nc-2048c-2d"]:
+            return arr_np.reshape([n, c // 2048, 2048])
         raise RuntimeError(f"Unexpected new_layout '{new_layout}'")
 
     if current_layout == "nhw":
diff --git a/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py b/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py
new file mode 100644
index 000000000000..e9b3dd132692
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+""" Tests for Hexagon dequantize """
+import numpy as np
+
+import tvm
+import tvm.testing
+from tvm import te
+from tvm.topi.hexagon import qnn
+from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
+
+
+class TestDequantizeSlice2d:
+    """
+    For testing Dequantize Slice ops
+    """
+
+    input_shape, orig_layout, input_layout, output_layout, axis_sep, dtype = tvm.testing.parameters(
+        ((1, 16, 64, 128), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "int8"),
+        ((1, 16, 64, 128), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "uint8"),
+        ((1, 8, 8, 32), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "int8"),
+        ((1, 8, 8, 32), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "uint8"),
+        ((1, 2048), "nc", "nc-2048c-2d", "nc-512c-2d", [2], "int8"),
+        ((1, 2048), "nc", "nc-2048c-2d", "nc-512c-2d", [2], "uint8"),
+    )
+
+    working_scope = tvm.testing.parameter("global.vtcm")
+
+    @tvm.testing.fixture
+    def input_np(self, input_shape):
+        arr_np = np.random.random(size=input_shape).astype("float32")
+        return arr_np
+
+    @tvm.testing.fixture
+    def transformed_input_np(self, input_np, orig_layout, input_layout, dtype):
+        quant_arr, scale, zero_point = quantize_np(input_np, dtype)
+        return [transform_numpy(quant_arr, orig_layout, input_layout), scale, zero_point]
+
+    @tvm.testing.fixture
+    def expected_output_np(self, input_np, dtype):
+        quant_np, scale, zero_point = quantize_np(input_np, dtype)
+        ref_np = (scale * (quant_np.astype("int32") - zero_point)).astype("float32")
+        return ref_np
+
+    @tvm.testing.fixture
+    def transformed_expected_output_np(self, expected_output_np, orig_layout, output_layout):
+        return transform_numpy(expected_output_np, orig_layout, output_layout)
+
+    @tvm.testing.requires_hexagon
+    def test_dequant_qnn(
+        self,
+        input_shape,
+        dtype,
+        input_layout,
+        output_layout,
+        transformed_input_np,
+        transformed_expected_output_np,
+        axis_sep,
+        hexagon_session,
+        working_scope,
+    ):
+        """
+        Top level testing function for dequantize
+        """
+        target_hexagon = tvm.target.hexagon("v69")
+        target = tvm.target.Target(target_hexagon, host=target_hexagon)
+
+        dequant_input = te.placeholder(input_shape, name="A", dtype=dtype)
+
+        in_data_np, in_scale, in_zero_pt = transformed_input_np
+
+        dequant_output = qnn.dequantize_compute(dequant_input, in_scale, in_zero_pt)
+
+        tir_s = qnn.dequantize_schedule(dequant_input, dequant_output, input_layout, output_layout)
+
+        input_data = allocate_hexagon_array(
+            hexagon_session.device,
+            data=in_data_np,
+            axis_separators=axis_sep,
+            mem_scope=working_scope,
+        )
+        output_data = allocate_hexagon_array(
+            hexagon_session.device,
+            tensor_shape=transformed_expected_output_np.shape,
+            dtype=transformed_expected_output_np.dtype,
+            axis_separators=axis_sep,
+            mem_scope=working_scope,
+        )
+        with tvm.transform.PassContext(opt_level=3):
+            tir_irm = tvm.lower(tir_s.mod, [dequant_input, dequant_output], name="dequantize")
+            runtime_module = tvm.build(tir_irm, target=target, name="dequantize")
+        mod = hexagon_session.load_module(runtime_module)
+
+        mod(input_data, output_data)
+        output_np = output_data.numpy()
+        tvm.testing.assert_allclose(
+            output_np,
+            transformed_expected_output_np,
+            1e-3,
+            1e-3,
+        )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 2622ac9e638b259cae017813ad93937c0ff8a2f9 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 7 Sep 2022 09:12:02 -0700
Subject: [PATCH 118/704] [Build] Update C++ standard to C++17 for AOT, iOS,
 VTA (#12712)

Follow-up from https://github.com/apache/tvm/pull/12337 and
https://github.com/apache/tvm/pull/12693, updating a few additional
locations that specified C++14.
---
 apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj | 4 ++--
 include/tvm/support/span.h                    | 2 +-
 tests/python/relay/aot/test_cpp_aot.py        | 2 +-
 vta/python/vta/exec/rpc_server.py             | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
index 61427d0ca248..ccc61707d3f2 100644
--- a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
+++ b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
@@ -255,7 +255,7 @@
 				ALWAYS_SEARCH_USER_PATHS = NO;
 				CLANG_ANALYZER_NONNULL = YES;
 				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++17";
 				CLANG_CXX_LIBRARY = "libc++";
 				CLANG_ENABLE_MODULES = YES;
 				CLANG_ENABLE_OBJC_ARC = YES;
@@ -308,7 +308,7 @@
 				ALWAYS_SEARCH_USER_PATHS = NO;
 				CLANG_ANALYZER_NONNULL = YES;
 				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++17";
 				CLANG_CXX_LIBRARY = "libc++";
 				CLANG_ENABLE_MODULES = YES;
 				CLANG_ENABLE_OBJC_ARC = YES;
diff --git a/include/tvm/support/span.h b/include/tvm/support/span.h
index 689a48dee788..768252f77ce9 100644
--- a/include/tvm/support/span.h
+++ b/include/tvm/support/span.h
@@ -36,7 +36,7 @@ namespace support {
 /*!
  * \brief A partial implementation of the C++20 std::span.
  *
- * At the time of writing, TVM must compile against C++14.
+ * At the time of writing, TVM must compile against C++17.
  */
 template <class T, class W>
 class Span {
diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py
index 4ffe302763f8..b67bc90d34fd 100644
--- a/tests/python/relay/aot/test_cpp_aot.py
+++ b/tests/python/relay/aot/test_cpp_aot.py
@@ -138,7 +138,7 @@ def test_mobilenet(enable_usmp, target_kind):
 
     temp_dir = tvm.contrib.utils.TempDirectory()
     test_so_path = temp_dir / "test.so"
-    mod.export_library(test_so_path, cc="c++", options=["-std=gnu++14", "-g3", "-O0"])
+    mod.export_library(test_so_path, cc="c++", options=["-std=gnu++17", "-g3", "-O0"])
     loaded_mod = tvm.runtime.load_module(test_so_path)
     runner = tvm.runtime.executor.AotModule(loaded_mod["default"](tvm.cpu(0)))
     runner.set_input(**inputs)
diff --git a/vta/python/vta/exec/rpc_server.py b/vta/python/vta/exec/rpc_server.py
index dcf564dd0314..1abad98b2216 100644
--- a/vta/python/vta/exec/rpc_server.py
+++ b/vta/python/vta/exec/rpc_server.py
@@ -106,7 +106,7 @@ def reconfig_runtime(cfg_json):
             if pkg.same_config(old_cfg):
                 logging.info("Skip reconfig_runtime due to same config.")
                 return
-        cflags = ["-O2", "-std=c++14"]
+        cflags = ["-O2", "-std=c++17"]
         cflags += pkg.cflags
         ldflags = pkg.ldflags
         lib_name = dll_path

From 010c662938245d607fbffd4bd10a9c7fb93e4270 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Wed, 7 Sep 2022 12:17:59 -0700
Subject: [PATCH 119/704] [TVMScript] IRBuilder methods for `IRModule` (#12694)

* IRBuilder methods for `IRModule`

This PR introduces IRBuilder methods for `IRModule`.

Co-authored-by: yongwww <yongcale@gmail.com>

* apply code review suggestion

Co-authored-by: yongwww <yongcale@gmail.com>
---
 include/tvm/script/ir_builder/ir/frame.h      | 71 +++++++++++++++++++
 include/tvm/script/ir_builder/ir/ir.h         | 43 +++++++++++
 python/tvm/script/ir_builder/ir/__init__.py   | 19 +++++
 python/tvm/script/ir_builder/ir/_ffi_api.py   | 20 ++++++
 python/tvm/script/ir_builder/ir/frame.py      | 26 +++++++
 python/tvm/script/ir_builder/ir/ir.py         | 24 +++++++
 src/script/ir_builder/ir/frame.cc             | 43 +++++++++++
 src/script/ir_builder/ir/ir.cc                | 38 ++++++++++
 .../test_tvmscript_ir_builder_irmodule.py     | 41 +++++++++++
 9 files changed, 325 insertions(+)
 create mode 100644 include/tvm/script/ir_builder/ir/frame.h
 create mode 100644 include/tvm/script/ir_builder/ir/ir.h
 create mode 100644 python/tvm/script/ir_builder/ir/__init__.py
 create mode 100644 python/tvm/script/ir_builder/ir/_ffi_api.py
 create mode 100644 python/tvm/script/ir_builder/ir/frame.py
 create mode 100644 python/tvm/script/ir_builder/ir/ir.py
 create mode 100644 src/script/ir_builder/ir/frame.cc
 create mode 100644 src/script/ir_builder/ir/ir.cc
 create mode 100644 tests/python/unittest/test_tvmscript_ir_builder_irmodule.py

diff --git a/include/tvm/script/ir_builder/ir/frame.h b/include/tvm/script/ir_builder/ir/frame.h
new file mode 100644
index 000000000000..181774bc53bc
--- /dev/null
+++ b/include/tvm/script/ir_builder/ir/frame.h
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_SCRIPT_IR_BUILDER_IR_FRAME_H_
+#define TVM_SCRIPT_IR_BUILDER_IR_FRAME_H_
+
+#include <tvm/ir/expr.h>
+#include <tvm/ir/function.h>
+#include <tvm/node/node.h>
+#include <tvm/script/ir_builder/base.h>
+
+#include <vector>
+
+namespace tvm {
+namespace script {
+namespace ir_builder {
+
+/*!
+ * \brief A frame that represents the IRModule frame with functions and global variables.
+ *
+ * \sa IRModuleFrame
+ */
+class IRModuleFrameNode : public IRBuilderFrameNode {
+ public:
+  Array<GlobalVar> global_vars;
+  Array<BaseFunc> functions;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    IRBuilderFrameNode::VisitAttrs(v);
+    v->Visit("global_vars", &global_vars);
+    v->Visit("functions", &functions);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.IRModuleFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(IRModuleFrameNode, IRBuilderFrameNode);
+
+ public:
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to IRModuleFrameNode.
+ *
+ * \sa IRModuleFrameNode
+ */
+class IRModuleFrame : public IRBuilderFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(IRModuleFrame, IRBuilderFrame,
+                                                    IRModuleFrameNode);
+};
+
+}  // namespace ir_builder
+}  // namespace script
+}  // namespace tvm
+
+#endif  // TVM_SCRIPT_IR_BUILDER_IR_FRAME_H_
diff --git a/include/tvm/script/ir_builder/ir/ir.h b/include/tvm/script/ir_builder/ir/ir.h
new file mode 100644
index 000000000000..0bd5473c7eaf
--- /dev/null
+++ b/include/tvm/script/ir_builder/ir/ir.h
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_SCRIPT_IR_BUILDER_IR_IR_H_
+#define TVM_SCRIPT_IR_BUILDER_IR_IR_H_
+
+#include <tvm/ir/expr.h>
+#include <tvm/ir/function.h>
+#include <tvm/node/node.h>
+#include <tvm/script/ir_builder/ir/frame.h>
+
+#include <vector>
+
+namespace tvm {
+namespace script {
+namespace ir_builder {
+
+/*!
+ * \brief The IRModule declaration statement.
+ * \return The IRModuleFrame.
+ */
+TVM_DLL IRModuleFrame IRModule();
+
+}  // namespace ir_builder
+}  // namespace script
+}  // namespace tvm
+
+#endif  // TVM_SCRIPT_IR_BUILDER_IR_IR_H_
diff --git a/python/tvm/script/ir_builder/ir/__init__.py b/python/tvm/script/ir_builder/ir/__init__.py
new file mode 100644
index 000000000000..ebb9728737ad
--- /dev/null
+++ b/python/tvm/script/ir_builder/ir/__init__.py
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Package tvm.script.ir_builder.ir"""
+from .frame import IRModuleFrame
+from .ir import ir_module
diff --git a/python/tvm/script/ir_builder/ir/_ffi_api.py b/python/tvm/script/ir_builder/ir/_ffi_api.py
new file mode 100644
index 000000000000..874cc278af83
--- /dev/null
+++ b/python/tvm/script/ir_builder/ir/_ffi_api.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""FFI APIs"""
+import tvm._ffi
+
+tvm._ffi._init_api("script.ir_builder.ir", __name__)  # pylint: disable=protected-access
diff --git a/python/tvm/script/ir_builder/ir/frame.py b/python/tvm/script/ir_builder/ir/frame.py
new file mode 100644
index 000000000000..e16d86dc227e
--- /dev/null
+++ b/python/tvm/script/ir_builder/ir/frame.py
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Package tvm.script.ir_builder.ir.frame"""
+
+from tvm._ffi import register_object as _register_object
+
+from ..base import IRBuilderFrame
+
+
+@_register_object("script.ir_builder.IRModuleFrame")
+class IRModuleFrame(IRBuilderFrame):
+    ...
diff --git a/python/tvm/script/ir_builder/ir/ir.py b/python/tvm/script/ir_builder/ir/ir.py
new file mode 100644
index 000000000000..df920364356b
--- /dev/null
+++ b/python/tvm/script/ir_builder/ir/ir.py
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Package tvm.script.ir_builder.ir.ir"""
+
+from . import _ffi_api
+from .frame import IRModuleFrame
+
+
+def ir_module() -> IRModuleFrame:
+    return _ffi_api.IRModule()  # pylint: disable=no-member # type: ignore
diff --git a/src/script/ir_builder/ir/frame.cc b/src/script/ir_builder/ir/frame.cc
new file mode 100644
index 000000000000..c85e30544aca
--- /dev/null
+++ b/src/script/ir_builder/ir/frame.cc
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/ir/module.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/script/ir_builder/ir/frame.h>
+
+namespace tvm {
+namespace script {
+namespace ir_builder {
+
+void IRModuleFrameNode::ExitWithScope() {
+  ICHECK_EQ(functions.size(), global_vars.size());
+  int n = functions.size();
+  Map<GlobalVar, BaseFunc> func_map;
+  for (int i = 0; i < n; ++i) {
+    func_map.Set(global_vars[i], functions[i]);
+  }
+  IRBuilder builder = IRBuilder::Current();
+  ICHECK(!builder->result.defined()) << "ValueError: Builder.result has already been set";
+  builder->result = tvm::IRModule(func_map);
+}
+
+TVM_REGISTER_NODE_TYPE(IRModuleFrameNode);
+
+}  // namespace ir_builder
+}  // namespace script
+}  // namespace tvm
diff --git a/src/script/ir_builder/ir/ir.cc b/src/script/ir_builder/ir/ir.cc
new file mode 100644
index 000000000000..bcd21de144bb
--- /dev/null
+++ b/src/script/ir_builder/ir/ir.cc
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/ir/module.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/script/ir_builder/ir/ir.h>
+
+namespace tvm {
+namespace script {
+namespace ir_builder {
+
+IRModuleFrame IRModule() {
+  ObjectPtr<IRModuleFrameNode> n = make_object<IRModuleFrameNode>();
+  n->global_vars.clear();
+  n->functions.clear();
+  return IRModuleFrame(n);
+}
+
+TVM_REGISTER_GLOBAL("script.ir_builder.ir.IRModule").set_body_typed(IRModule);
+
+}  // namespace ir_builder
+}  // namespace script
+}  // namespace tvm
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_irmodule.py b/tests/python/unittest/test_tvmscript_ir_builder_irmodule.py
new file mode 100644
index 000000000000..7adf192df36b
--- /dev/null
+++ b/tests/python/unittest/test_tvmscript_ir_builder_irmodule.py
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Unittests for tvm.script.ir_builder.ir"""
+import pytest
+import tvm.testing
+from tvm.script.ir_builder import IRBuilder
+from tvm.script.ir_builder import ir as I
+from tvm import ir
+from tvm.ir.base import assert_structural_equal
+
+
+def test_ir_builder_irmodule():
+    with IRBuilder() as ib:  # pylint: disable=invalid-name
+        with I.ir_module():
+            pass
+
+    # the ir_module generated by IRBuilder
+    ir_module_actual = ib.get()
+
+    # the expected prim_func
+    ir_module_expected = ir.IRModule(None, None)
+
+    assert_structural_equal(ir_module_actual, ir_module_expected, map_free_vars=True)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From bee562790894ee195bd934740a30dabfbb2f5483 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Wed, 7 Sep 2022 20:53:54 +0100
Subject: [PATCH 120/704] [TFLite][CI] Update TensorFlow dependency to 2.9.1
 (#12131)

This updates the TF version to be used in TVM CI to 2.9.1,
which brings improvements so that more platforms are supported by
official packages.

When building TFLite, an update to CMake was also required,
which is updated now to 3.18.4.

ethos-u-vela dependency is also updated, from version 3.2.0 to 3.4.0
so that it is closer to the TensorFlow version being proposed here.

This PR updates the Docker images scripting to install TF and TFLite.

Change-Id: I290085f0c018ad57606f1295494c19ff6e1af2dd
---
 cmake/modules/contrib/TFLite.cmake            |  2 ++
 docker/Dockerfile.ci_cortexm                  |  3 +++
 docker/Dockerfile.ci_cpu                      |  3 +++
 docker/Dockerfile.ci_gpu                      |  3 +++
 docker/Dockerfile.ci_riscv                    |  3 +++
 docker/install/ubuntu_install_cmake_source.sh |  4 ++--
 .../install/ubuntu_install_python_package.sh  |  2 +-
 docker/install/ubuntu_install_tensorflow.sh   |  5 ++--
 .../ubuntu_install_tensorflow_aarch64.sh      | 23 ++-----------------
 docker/install/ubuntu_install_tflite.sh       | 13 +++++++++--
 docker/install/ubuntu_install_vela.sh         |  2 +-
 docker/install/ubuntu_install_zephyr.sh       |  3 ++-
 12 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/cmake/modules/contrib/TFLite.cmake b/cmake/modules/contrib/TFLite.cmake
index 31597109095a..b8d6a0daff19 100644
--- a/cmake/modules/contrib/TFLite.cmake
+++ b/cmake/modules/contrib/TFLite.cmake
@@ -38,8 +38,10 @@ if(NOT USE_TFLITE STREQUAL "OFF")
     set(USE_TFLITE ${USE_TENSORFLOW_PATH}/tensorflow/lite/tools/make/gen/*/lib)
   endif()
   find_library(TFLITE_CONTRIB_LIB libtensorflow-lite.a ${USE_TFLITE})
+  file(GLOB_RECURSE TFLITE_DEPS "${USE_TFLITE}/*.a")
 
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${TFLITE_CONTRIB_LIB})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${TFLITE_DEPS})
 
   if (NOT USE_FLATBUFFERS_PATH STREQUAL "none")
     include_directories(${USE_FLATBUFFERS_PATH}/include)
diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm
index 63089f3d65f2..fb3c10d393f0 100644
--- a/docker/Dockerfile.ci_cortexm
+++ b/docker/Dockerfile.ci_cortexm
@@ -32,6 +32,9 @@ RUN bash /install/ubuntu_install_googletest.sh
 COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
 RUN bash /install/ubuntu1804_install_python.sh
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 COPY install/ubuntu1804_install_python_venv.sh /install/ubuntu1804_install_python_venv.sh
 RUN bash /install/ubuntu1804_install_python_venv.sh
 ENV PATH=/opt/tvm-venv/bin:/opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 3812bfbd197e..d9f353d41be1 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -40,6 +40,9 @@ RUN bash /install/ubuntu_install_python_package.sh
 COPY install/ubuntu1804_install_llvm.sh /install/ubuntu1804_install_llvm.sh
 RUN bash /install/ubuntu1804_install_llvm.sh
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 COPY install/ubuntu_install_dnnl.sh /install/ubuntu_install_dnnl.sh
 RUN bash /install/ubuntu_install_dnnl.sh
 
diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index f04d8515b8dc..6f02ab97c09e 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -32,6 +32,9 @@ RUN apt-get update --fix-missing
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv
index b65b87a86386..1ca792e20c98 100644
--- a/docker/Dockerfile.ci_riscv
+++ b/docker/Dockerfile.ci_riscv
@@ -32,6 +32,9 @@ RUN bash /install/ubuntu_install_googletest.sh
 COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
 RUN bash /install/ubuntu1804_install_python.sh
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 COPY install/ubuntu1804_install_python_venv.sh /install/ubuntu1804_install_python_venv.sh
 RUN bash /install/ubuntu1804_install_python_venv.sh
 ENV PATH=/opt/tvm-venv/bin:/opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH
diff --git a/docker/install/ubuntu_install_cmake_source.sh b/docker/install/ubuntu_install_cmake_source.sh
index 18335c98c403..030cb4ea0406 100755
--- a/docker/install/ubuntu_install_cmake_source.sh
+++ b/docker/install/ubuntu_install_cmake_source.sh
@@ -20,8 +20,8 @@ set -e
 set -u
 set -o pipefail
 
-v=3.14
-version=3.14.7
+v=3.18
+version=3.18.4
 wget https://cmake.org/files/v${v}/cmake-${version}.tar.gz
 tar xvf cmake-${version}.tar.gz
 cd cmake-${version}
diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
index 3fc310c47e34..9fee9d01425c 100755
--- a/docker/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -28,7 +28,7 @@ pip3 install --upgrade \
     cython \
     decorator \
     mypy \
-    numpy~=1.19.5 \
+    numpy==1.21.* \
     orderedset \
     packaging \
     Pillow==9.1.0 \
diff --git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh
index 17d2b31d9bc2..2225b7aef3b8 100755
--- a/docker/install/ubuntu_install_tensorflow.sh
+++ b/docker/install/ubuntu_install_tensorflow.sh
@@ -21,6 +21,5 @@ set -u
 set -o pipefail
 
 pip3 install \
-    "h5py==3.1.0" \
-    keras==2.6 \
-    tensorflow==2.6.5
+    keras==2.9 \
+    tensorflow==2.9.1
diff --git a/docker/install/ubuntu_install_tensorflow_aarch64.sh b/docker/install/ubuntu_install_tensorflow_aarch64.sh
index 59cc5b4814b3..09efe5db5707 100755
--- a/docker/install/ubuntu_install_tensorflow_aarch64.sh
+++ b/docker/install/ubuntu_install_tensorflow_aarch64.sh
@@ -21,27 +21,8 @@ set -euxo pipefail
 # Build dependencies
 apt-install-and-clear -y --no-install-recommends libhdf5-dev
 
-# Downloading Tensorflow and installing it manually is needed
-# just as a temporary workaround while we move to a newer
-# version (>2.7) that is hosted in the official PyPI repository.
-linaro_repo="https://snapshots.linaro.org/ldcg/python/tensorflow-manylinux/43/tensorflow-aarch64"
-tensorflow_package="tensorflow_aarch64-2.6.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"
-tmpdir=$(mktemp -d)
-
-cleanup()
-{
-  rm -rf "$tmpdir"
-}
-
-trap cleanup 0
-
-cd "${tmpdir}"
-wget -q "${linaro_repo}/${tensorflow_package}"
-
 # We're only using the TensorFlow wheel snapshot here as the
 # h5py wheel tries to use the wrong .so file
 pip3 install \
-    ${tensorflow_package} \
-    "h5py==3.1.0" \
-    keras==2.6 \
-    "protobuf<4"
+    keras==2.9 \
+    tensorflow-aarch64==2.9.1
diff --git a/docker/install/ubuntu_install_tflite.sh b/docker/install/ubuntu_install_tflite.sh
index 8a394302fdd3..4b73c202bc7f 100755
--- a/docker/install/ubuntu_install_tflite.sh
+++ b/docker/install/ubuntu_install_tflite.sh
@@ -18,6 +18,7 @@
 
 set -e
 set -u
+set -x
 set -o pipefail
 
 # The tflite version should have matched versions to the tensorflow
@@ -38,8 +39,16 @@ pip3 install flatbuffers
 # The library is built at:
 # tensorflow/tensorflow/lite/tools/make/gen/*/lib/libtensorflow-lite.a.
 git clone https://github.com/tensorflow/tensorflow --branch=v${TENSORFLOW_VERSION} --depth 1
-./tensorflow/tensorflow/lite/tools/make/download_dependencies.sh
-./tensorflow/tensorflow/lite/tools/make/build_lib.sh
+
+mkdir -p /opt/tflite
+cd /opt/tflite
+cmake \
+  -DTFLITE_ENABLE_XNNPACK=OFF \
+  /tensorflow/tensorflow/lite
+
+cmake --build .
+cd -
+
 
 # Setup tflite from schema
 mkdir tflite
diff --git a/docker/install/ubuntu_install_vela.sh b/docker/install/ubuntu_install_vela.sh
index c72d11823345..9e32889cd4eb 100755
--- a/docker/install/ubuntu_install_vela.sh
+++ b/docker/install/ubuntu_install_vela.sh
@@ -20,4 +20,4 @@ set -e
 set -u
 set -o pipefail
 
-pip3 install ethos-u-vela==3.2.0
+pip3 install ethos-u-vela==3.4.0
diff --git a/docker/install/ubuntu_install_zephyr.sh b/docker/install/ubuntu_install_zephyr.sh
index d25027f00709..f955a7ff9b19 100755
--- a/docker/install/ubuntu_install_zephyr.sh
+++ b/docker/install/ubuntu_install_zephyr.sh
@@ -31,7 +31,8 @@ sudo apt-install-and-clear -y --no-install-recommends \
      git cmake ninja-build gperf \
      ccache dfu-util device-tree-compiler wget \
      python3-dev python3-pip python3-setuptools python3-tk python3-wheel python3-venv \
-     xz-utils file make gcc gcc-multilib g++-multilib apt-transport-https
+     xz-utils file make gcc gcc-multilib g++-multilib apt-transport-https libudev-dev \
+     libmagic1
 
 wget --no-verbose https://apt.kitware.com/keys/kitware-archive-latest.asc
 sudo apt-key add kitware-archive-latest.asc

From 7f788dca4ecc76203b3a1873154106d4127c4f98 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 7 Sep 2022 13:15:42 -0700
Subject: [PATCH 121/704] [ci] Add onnx model to S3 (#12716)

Addresses this CI failure on `main`:
https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4235/pipeline/

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 .github/workflows/upload_ci_resource.yml   | 2 ++
 tests/scripts/request_hook/request_hook.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/.github/workflows/upload_ci_resource.yml b/.github/workflows/upload_ci_resource.yml
index 10bba56583c9..6d85c26c25b3 100644
--- a/.github/workflows/upload_ci_resource.yml
+++ b/.github/workflows/upload_ci_resource.yml
@@ -56,3 +56,5 @@ jobs:
           echo "$SHA256 downloaded_file" | sha256sum --check
           aws s3 cp downloaded_file "s3://tvm-ci-resources/$UPLOAD_PATH"
           echo "The item is available at https://tvm-ci-resources.s3.us-west-2.amazonaws.com/$UPLOAD_PATH"
+          echo "Add this line to tests/scripts/request_hook/request_hook.py"
+          echo "    \"$URL\": f\"{BASE}/$UPLOAD_PATH\",
diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py
index 1cabdba76b02..46448f0a38a8 100644
--- a/tests/scripts/request_hook/request_hook.py
+++ b/tests/scripts/request_hook/request_hook.py
@@ -40,6 +40,7 @@
     "http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel": f"{BASE}/bvlc_alexnet.caffemodel",
     "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel": f"{BASE}/bvlc_googlenet.caffemodel",
     "https://github.com/dmlc/web-data/blob/main/darknet/data/dog.jpg": f"{BASE}/dog.jpg",
+    "https://github.com/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/vision/classification/mnist/model/mnist-1.onnx": f"{BASE}/onnx/mnist-1.onnx",
 }
 
 
From 546a7da2febe8ced256a4e9759413a9542c68d66 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 7 Sep 2022 13:17:12 -0700
Subject: [PATCH 122/704] [ci] Re-balance shards (#12473)

Replace '> >' in templates with >>, NFC (#12615)

The problem with greedy lexing of >> as an operator was solved in
C++11, and now templates no longer require spaces between >'s.

Co-authored-by: Krzysztof Parzyszek <kparzysz@quicinc.com>
---
 Jenkinsfile                                 | 1397 +++++++------------
 ci/jenkins/Test.groovy.j2                   |   12 +-
 ci/jenkins/generate.py                      |   23 +-
 python/tvm/contrib/hexagon/pytest_plugin.py |    3 +-
 tests/scripts/setup-pytest-env.sh           |   10 +-
 5 files changed, 567 insertions(+), 878 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 2b73508da0d3..78071fde4599 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-08-30T15:26:50.100067
+// Generated at 2022-09-01T11:52:42.195970
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -1538,7 +1538,7 @@ def shard_run_unittest_GPU_3_of_3() {
 }
 
 
-def shard_run_integration_CPU_1_of_10() {
+def shard_run_integration_CPU_1_of_4() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
@@ -1549,7 +1549,7 @@ def shard_run_integration_CPU_1_of_10() {
             withEnv([
               'PLATFORM=cpu',
               'TEST_STEP_NAME=integration: CPU',
-              'TVM_NUM_SHARDS=10',
+              'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
@@ -1610,11 +1610,11 @@ def shard_run_integration_CPU_1_of_10() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('integration: CPU 1 of 10')
+    Utils.markStageSkippedForConditional('integration: CPU 1 of 4')
   }
 }
 
-def shard_run_integration_CPU_2_of_10() {
+def shard_run_integration_CPU_2_of_4() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
@@ -1625,7 +1625,7 @@ def shard_run_integration_CPU_2_of_10() {
             withEnv([
               'PLATFORM=cpu',
               'TEST_STEP_NAME=integration: CPU',
-              'TVM_NUM_SHARDS=10',
+              'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
@@ -1686,11 +1686,11 @@ def shard_run_integration_CPU_2_of_10() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('integration: CPU 2 of 10')
+    Utils.markStageSkippedForConditional('integration: CPU 2 of 4')
   }
 }
 
-def shard_run_integration_CPU_3_of_10() {
+def shard_run_integration_CPU_3_of_4() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
@@ -1701,7 +1701,7 @@ def shard_run_integration_CPU_3_of_10() {
             withEnv([
               'PLATFORM=cpu',
               'TEST_STEP_NAME=integration: CPU',
-              'TVM_NUM_SHARDS=10',
+              'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
@@ -1762,11 +1762,11 @@ def shard_run_integration_CPU_3_of_10() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('integration: CPU 3 of 10')
+    Utils.markStageSkippedForConditional('integration: CPU 3 of 4')
   }
 }
 
-def shard_run_integration_CPU_4_of_10() {
+def shard_run_integration_CPU_4_of_4() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
@@ -1777,7 +1777,7 @@ def shard_run_integration_CPU_4_of_10() {
             withEnv([
               'PLATFORM=cpu',
               'TEST_STEP_NAME=integration: CPU',
-              'TVM_NUM_SHARDS=10',
+              'TVM_NUM_SHARDS=4',
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
@@ -1838,327 +1838,24 @@ def shard_run_integration_CPU_4_of_10() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('integration: CPU 4 of 10')
+    Utils.markStageSkippedForConditional('integration: CPU 4 of 4')
   }
 }
 
-def shard_run_integration_CPU_5_of_10() {
-  if (!skip_ci && is_docs_only_build != 1) {
-    node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
-        try {
-          docker_init(ci_cpu)
-          init_git()
-          timeout(time: max_time, unit: 'MINUTES') {
-            withEnv([
-              'PLATFORM=cpu',
-              'TEST_STEP_NAME=integration: CPU',
-              'TVM_NUM_SHARDS=10',
-              'TVM_SHARD_INDEX=4',
-              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-              sh(
-                        script: """
-                          set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
-
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-            })
-          }
-        } finally {
-          sh(
-            script: """
-              set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
-            """,
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        }
-      }
-    }
-  } else {
-    Utils.markStageSkippedForConditional('integration: CPU 5 of 10')
-  }
-}
 
-def shard_run_integration_CPU_6_of_10() {
+def shard_run_python_i386_1_of_3() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
-        try {
-          docker_init(ci_cpu)
-          init_git()
-          timeout(time: max_time, unit: 'MINUTES') {
-            withEnv([
-              'PLATFORM=cpu',
-              'TEST_STEP_NAME=integration: CPU',
-              'TVM_NUM_SHARDS=10',
-              'TVM_SHARD_INDEX=5',
-              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-              sh(
-                        script: """
-                          set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
-
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-            })
-          }
-        } finally {
-          sh(
-            script: """
-              set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
-            """,
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        }
-      }
-    }
-  } else {
-    Utils.markStageSkippedForConditional('integration: CPU 6 of 10')
-  }
-}
-
-def shard_run_integration_CPU_7_of_10() {
-  if (!skip_ci && is_docs_only_build != 1) {
-    node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
-        try {
-          docker_init(ci_cpu)
-          init_git()
-          timeout(time: max_time, unit: 'MINUTES') {
-            withEnv([
-              'PLATFORM=cpu',
-              'TEST_STEP_NAME=integration: CPU',
-              'TVM_NUM_SHARDS=10',
-              'TVM_SHARD_INDEX=6',
-              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-              sh(
-                        script: """
-                          set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
-
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-            })
-          }
-        } finally {
-          sh(
-            script: """
-              set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
-            """,
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        }
-      }
-    }
-  } else {
-    Utils.markStageSkippedForConditional('integration: CPU 7 of 10')
-  }
-}
-
-def shard_run_integration_CPU_8_of_10() {
-  if (!skip_ci && is_docs_only_build != 1) {
-    node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
-        try {
-          docker_init(ci_cpu)
-          init_git()
-          timeout(time: max_time, unit: 'MINUTES') {
-            withEnv([
-              'PLATFORM=cpu',
-              'TEST_STEP_NAME=integration: CPU',
-              'TVM_NUM_SHARDS=10',
-              'TVM_SHARD_INDEX=7',
-              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-              sh(
-                        script: """
-                          set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
-
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-            })
-          }
-        } finally {
-          sh(
-            script: """
-              set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
-            """,
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        }
-      }
-    }
-  } else {
-    Utils.markStageSkippedForConditional('integration: CPU 8 of 10')
-  }
-}
-
-def shard_run_integration_CPU_9_of_10() {
-  if (!skip_ci && is_docs_only_build != 1) {
-    node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
         try {
-          docker_init(ci_cpu)
+          docker_init(ci_i386)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=cpu',
-              'TEST_STEP_NAME=integration: CPU',
-              'TVM_NUM_SHARDS=10',
-              'TVM_SHARD_INDEX=8',
+              'PLATFORM=i386',
+              'TEST_STEP_NAME=python: i386',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2183,24 +1880,24 @@ def shard_run_integration_CPU_9_of_10() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
                           md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_cpu)
+              ci_setup(ci_i386)
+              cpp_unittest(ci_i386)
+              python_unittest(ci_i386)
               sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
+                script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
+                label: 'Run i386 integration tests',
               )
             })
           }
@@ -2208,7 +1905,7 @@ def shard_run_integration_CPU_9_of_10() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2218,23 +1915,23 @@ def shard_run_integration_CPU_9_of_10() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('integration: CPU 9 of 10')
+    Utils.markStageSkippedForConditional('python: i386 1 of 3')
   }
 }
 
-def shard_run_integration_CPU_10_of_10() {
+def shard_run_python_i386_2_of_3() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
         try {
-          docker_init(ci_cpu)
+          docker_init(ci_i386)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=cpu',
-              'TEST_STEP_NAME=integration: CPU',
-              'TVM_NUM_SHARDS=10',
-              'TVM_SHARD_INDEX=9',
+              'PLATFORM=i386',
+              'TEST_STEP_NAME=python: i386',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2259,32 +1956,32 @@ def shard_run_integration_CPU_10_of_10() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
                           md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_cpu)
+              ci_setup(ci_i386)
+              python_unittest(ci_i386)
               sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
+                script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
+                label: 'Run i386 integration tests',
               )
+              fsim_test(ci_i386)
             })
           }
         } finally {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2294,12 +1991,11 @@ def shard_run_integration_CPU_10_of_10() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('integration: CPU 10 of 10')
+    Utils.markStageSkippedForConditional('python: i386 2 of 3')
   }
 }
 
-
-def shard_run_python_i386_1_of_5() {
+def shard_run_python_i386_3_of_3() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
@@ -2310,8 +2006,8 @@ def shard_run_python_i386_1_of_5() {
             withEnv([
               'PLATFORM=i386',
               'TEST_STEP_NAME=python: i386',
-              'TVM_NUM_SHARDS=5',
-              'TVM_SHARD_INDEX=0',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2349,7 +2045,6 @@ def shard_run_python_i386_1_of_5() {
                       )
 
               ci_setup(ci_i386)
-              cpp_unittest(ci_i386)
               python_unittest(ci_i386)
               sh (
                 script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
@@ -2371,23 +2066,24 @@ def shard_run_python_i386_1_of_5() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('python: i386 1 of 5')
+    Utils.markStageSkippedForConditional('python: i386 3 of 3')
   }
 }
 
-def shard_run_python_i386_2_of_5() {
+
+def shard_run_test_Hexagon_1_of_8() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_i386)
+          docker_init(ci_hexagon)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=i386',
-              'TEST_STEP_NAME=python: i386',
-              'TVM_NUM_SHARDS=5',
-              'TVM_SHARD_INDEX=1',
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2412,32 +2108,31 @@ def shard_run_python_i386_2_of_5() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_i386)
-              python_unittest(ci_i386)
+              add_hexagon_permissions()
+              ci_setup(ci_hexagon)
+              cpp_unittest(ci_hexagon)
               sh (
-                script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
-                label: 'Run i386 integration tests',
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
               )
-              fsim_test(ci_i386)
             })
           }
         } finally {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2447,23 +2142,23 @@ def shard_run_python_i386_2_of_5() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('python: i386 2 of 5')
+    Utils.markStageSkippedForConditional('test: Hexagon 1 of 8')
   }
 }
 
-def shard_run_python_i386_3_of_5() {
+def shard_run_test_Hexagon_2_of_8() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_i386)
+          docker_init(ci_hexagon)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=i386',
-              'TEST_STEP_NAME=python: i386',
-              'TVM_NUM_SHARDS=5',
-              'TVM_SHARD_INDEX=2',
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2488,23 +2183,22 @@ def shard_run_python_i386_3_of_5() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_i386)
-              python_unittest(ci_i386)
+              add_hexagon_permissions()
+              ci_setup(ci_hexagon)
               sh (
-                script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
-                label: 'Run i386 integration tests',
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
               )
             })
           }
@@ -2512,7 +2206,7 @@ def shard_run_python_i386_3_of_5() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2522,23 +2216,23 @@ def shard_run_python_i386_3_of_5() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('python: i386 3 of 5')
+    Utils.markStageSkippedForConditional('test: Hexagon 2 of 8')
   }
 }
 
-def shard_run_python_i386_4_of_5() {
+def shard_run_test_Hexagon_3_of_8() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_i386)
+          docker_init(ci_hexagon)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=i386',
-              'TEST_STEP_NAME=python: i386',
-              'TVM_NUM_SHARDS=5',
-              'TVM_SHARD_INDEX=3',
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2563,23 +2257,22 @@ def shard_run_python_i386_4_of_5() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_i386)
-              python_unittest(ci_i386)
+              add_hexagon_permissions()
+              ci_setup(ci_hexagon)
               sh (
-                script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
-                label: 'Run i386 integration tests',
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
               )
             })
           }
@@ -2587,7 +2280,7 @@ def shard_run_python_i386_4_of_5() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2597,23 +2290,23 @@ def shard_run_python_i386_4_of_5() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('python: i386 4 of 5')
+    Utils.markStageSkippedForConditional('test: Hexagon 3 of 8')
   }
 }
 
-def shard_run_python_i386_5_of_5() {
+def shard_run_test_Hexagon_4_of_8() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_i386)
+          docker_init(ci_hexagon)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=i386',
-              'TEST_STEP_NAME=python: i386',
-              'TVM_NUM_SHARDS=5',
-              'TVM_SHARD_INDEX=4',
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2638,23 +2331,22 @@ def shard_run_python_i386_5_of_5() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_i386)
-              python_unittest(ci_i386)
+              add_hexagon_permissions()
+              ci_setup(ci_hexagon)
               sh (
-                script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
-                label: 'Run i386 integration tests',
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
               )
             })
           }
@@ -2662,7 +2354,7 @@ def shard_run_python_i386_5_of_5() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2672,12 +2364,11 @@ def shard_run_python_i386_5_of_5() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('python: i386 5 of 5')
+    Utils.markStageSkippedForConditional('test: Hexagon 4 of 8')
   }
 }
 
-
-def shard_run_test_Hexagon_1_of_7() {
+def shard_run_test_Hexagon_5_of_8() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
@@ -2688,8 +2379,8 @@ def shard_run_test_Hexagon_1_of_7() {
             withEnv([
               'PLATFORM=hexagon',
               'TEST_STEP_NAME=test: Hexagon',
-              'TVM_NUM_SHARDS=7',
-              'TVM_SHARD_INDEX=0',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2727,7 +2418,6 @@ def shard_run_test_Hexagon_1_of_7() {
 
               add_hexagon_permissions()
               ci_setup(ci_hexagon)
-              cpp_unittest(ci_hexagon)
               sh (
                 script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
                 label: 'Run Hexagon tests',
@@ -2748,11 +2438,11 @@ def shard_run_test_Hexagon_1_of_7() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 1 of 7')
+    Utils.markStageSkippedForConditional('test: Hexagon 5 of 8')
   }
 }
 
-def shard_run_test_Hexagon_2_of_7() {
+def shard_run_test_Hexagon_6_of_8() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
@@ -2763,8 +2453,8 @@ def shard_run_test_Hexagon_2_of_7() {
             withEnv([
               'PLATFORM=hexagon',
               'TEST_STEP_NAME=test: Hexagon',
-              'TVM_NUM_SHARDS=7',
-              'TVM_SHARD_INDEX=1',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=5',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2822,11 +2512,11 @@ def shard_run_test_Hexagon_2_of_7() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 2 of 7')
+    Utils.markStageSkippedForConditional('test: Hexagon 6 of 8')
   }
 }
 
-def shard_run_test_Hexagon_3_of_7() {
+def shard_run_test_Hexagon_7_of_8() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
@@ -2837,8 +2527,8 @@ def shard_run_test_Hexagon_3_of_7() {
             withEnv([
               'PLATFORM=hexagon',
               'TEST_STEP_NAME=test: Hexagon',
-              'TVM_NUM_SHARDS=7',
-              'TVM_SHARD_INDEX=2',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=6',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2896,11 +2586,11 @@ def shard_run_test_Hexagon_3_of_7() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 3 of 7')
+    Utils.markStageSkippedForConditional('test: Hexagon 7 of 8')
   }
 }
 
-def shard_run_test_Hexagon_4_of_7() {
+def shard_run_test_Hexagon_8_of_8() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
@@ -2911,8 +2601,8 @@ def shard_run_test_Hexagon_4_of_7() {
             withEnv([
               'PLATFORM=hexagon',
               'TEST_STEP_NAME=test: Hexagon',
-              'TVM_NUM_SHARDS=7',
-              'TVM_SHARD_INDEX=3',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=7',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -2970,23 +2660,24 @@ def shard_run_test_Hexagon_4_of_7() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 4 of 7')
+    Utils.markStageSkippedForConditional('test: Hexagon 8 of 8')
   }
 }
 
-def shard_run_test_Hexagon_5_of_7() {
+
+def shard_run_integration_aarch64_1_of_4() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_hexagon)
+          docker_init(ci_arm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=hexagon',
-              'TEST_STEP_NAME=test: Hexagon',
-              'TVM_NUM_SHARDS=7',
-              'TVM_SHARD_INDEX=4',
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -3011,22 +2702,23 @@ def shard_run_test_Hexagon_5_of_7() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              add_hexagon_permissions()
-              ci_setup(ci_hexagon)
+              ci_setup(ci_arm)
+              python_unittest(ci_arm)
               sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
               )
             })
           }
@@ -3034,7 +2726,7 @@ def shard_run_test_Hexagon_5_of_7() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3044,23 +2736,23 @@ def shard_run_test_Hexagon_5_of_7() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 5 of 7')
+    Utils.markStageSkippedForConditional('integration: aarch64 1 of 4')
   }
 }
 
-def shard_run_test_Hexagon_6_of_7() {
+def shard_run_integration_aarch64_2_of_4() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_hexagon)
+          docker_init(ci_arm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=hexagon',
-              'TEST_STEP_NAME=test: Hexagon',
-              'TVM_NUM_SHARDS=7',
-              'TVM_SHARD_INDEX=5',
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -3085,22 +2777,23 @@ def shard_run_test_Hexagon_6_of_7() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              add_hexagon_permissions()
-              ci_setup(ci_hexagon)
+              ci_setup(ci_arm)
+              python_unittest(ci_arm)
               sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
               )
             })
           }
@@ -3108,7 +2801,7 @@ def shard_run_test_Hexagon_6_of_7() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3118,23 +2811,23 @@ def shard_run_test_Hexagon_6_of_7() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 6 of 7')
+    Utils.markStageSkippedForConditional('integration: aarch64 2 of 4')
   }
 }
 
-def shard_run_test_Hexagon_7_of_7() {
+def shard_run_integration_aarch64_3_of_4() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_hexagon)
+          docker_init(ci_arm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=hexagon',
-              'TEST_STEP_NAME=test: Hexagon',
-              'TVM_NUM_SHARDS=7',
-              'TVM_SHARD_INDEX=6',
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -3159,22 +2852,23 @@ def shard_run_test_Hexagon_7_of_7() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              add_hexagon_permissions()
-              ci_setup(ci_hexagon)
+              ci_setup(ci_arm)
+              python_unittest(ci_arm)
               sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
               )
             })
           }
@@ -3182,7 +2876,7 @@ def shard_run_test_Hexagon_7_of_7() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3192,12 +2886,11 @@ def shard_run_test_Hexagon_7_of_7() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 7 of 7')
+    Utils.markStageSkippedForConditional('integration: aarch64 3 of 4')
   }
 }
 
-
-def shard_run_integration_aarch64_1_of_4() {
+def shard_run_integration_aarch64_4_of_4() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('ARM-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
@@ -3209,7 +2902,7 @@ def shard_run_integration_aarch64_1_of_4() {
               'PLATFORM=arm',
               'TEST_STEP_NAME=integration: aarch64',
               'TVM_NUM_SHARDS=4',
-              'TVM_SHARD_INDEX=0',
+              'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -3268,23 +2961,24 @@ def shard_run_integration_aarch64_1_of_4() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 1 of 4')
+    Utils.markStageSkippedForConditional('integration: aarch64 4 of 4')
   }
 }
 
-def shard_run_integration_aarch64_2_of_4() {
+
+def shard_run_topi_GPU_1_of_3() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('ARM-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
         try {
-          docker_init(ci_arm)
+          docker_init(ci_gpu)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=arm',
-              'TEST_STEP_NAME=integration: aarch64',
-              'TVM_NUM_SHARDS=4',
-              'TVM_SHARD_INDEX=1',
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=topi: GPU',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -3309,23 +3003,22 @@ def shard_run_integration_aarch64_2_of_4() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
                           md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
+              ci_setup(ci_gpu)
               sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+                label: 'Run TOPI tests',
               )
             })
           }
@@ -3333,7 +3026,7 @@ def shard_run_integration_aarch64_2_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3343,23 +3036,23 @@ def shard_run_integration_aarch64_2_of_4() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 2 of 4')
+    Utils.markStageSkippedForConditional('topi: GPU 1 of 3')
   }
 }
 
-def shard_run_integration_aarch64_3_of_4() {
+def shard_run_topi_GPU_2_of_3() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('ARM-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
         try {
-          docker_init(ci_arm)
+          docker_init(ci_gpu)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=arm',
-              'TEST_STEP_NAME=integration: aarch64',
-              'TVM_NUM_SHARDS=4',
-              'TVM_SHARD_INDEX=2',
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=topi: GPU',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -3384,23 +3077,22 @@ def shard_run_integration_aarch64_3_of_4() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
                           md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
+              ci_setup(ci_gpu)
               sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+                label: 'Run TOPI tests',
               )
             })
           }
@@ -3408,7 +3100,7 @@ def shard_run_integration_aarch64_3_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3418,23 +3110,23 @@ def shard_run_integration_aarch64_3_of_4() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 3 of 4')
+    Utils.markStageSkippedForConditional('topi: GPU 2 of 3')
   }
 }
 
-def shard_run_integration_aarch64_4_of_4() {
+def shard_run_topi_GPU_3_of_3() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('ARM-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
         try {
-          docker_init(ci_arm)
+          docker_init(ci_gpu)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=arm',
-              'TEST_STEP_NAME=integration: aarch64',
-              'TVM_NUM_SHARDS=4',
-              'TVM_SHARD_INDEX=3',
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=topi: GPU',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -3459,23 +3151,22 @@ def shard_run_integration_aarch64_4_of_4() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
                           md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
+              ci_setup(ci_gpu)
               sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+                label: 'Run TOPI tests',
               )
             })
           }
@@ -3483,7 +3174,7 @@ def shard_run_integration_aarch64_4_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3493,23 +3184,23 @@ def shard_run_integration_aarch64_4_of_4() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 4 of 4')
+    Utils.markStageSkippedForConditional('topi: GPU 3 of 3')
   }
 }
 
 
-def shard_run_topi_GPU_1_of_4() {
+def shard_run_frontend_GPU_1_of_6() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('GPU') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
         try {
           docker_init(ci_gpu)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
-              'TEST_STEP_NAME=topi: GPU',
-              'TVM_NUM_SHARDS=4',
+              'TEST_STEP_NAME=frontend: GPU',
+              'TVM_NUM_SHARDS=6',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
@@ -3549,8 +3240,8 @@ def shard_run_topi_GPU_1_of_4() {
 
               ci_setup(ci_gpu)
               sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                label: 'Run Python frontend tests',
               )
             })
           }
@@ -3558,7 +3249,7 @@ def shard_run_topi_GPU_1_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3568,22 +3259,22 @@ def shard_run_topi_GPU_1_of_4() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('topi: GPU 1 of 4')
+    Utils.markStageSkippedForConditional('frontend: GPU 1 of 6')
   }
 }
 
-def shard_run_topi_GPU_2_of_4() {
+def shard_run_frontend_GPU_2_of_6() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('GPU') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
         try {
           docker_init(ci_gpu)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
-              'TEST_STEP_NAME=topi: GPU',
-              'TVM_NUM_SHARDS=4',
+              'TEST_STEP_NAME=frontend: GPU',
+              'TVM_NUM_SHARDS=6',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
@@ -3623,8 +3314,8 @@ def shard_run_topi_GPU_2_of_4() {
 
               ci_setup(ci_gpu)
               sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                label: 'Run Python frontend tests',
               )
             })
           }
@@ -3632,7 +3323,7 @@ def shard_run_topi_GPU_2_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3642,22 +3333,22 @@ def shard_run_topi_GPU_2_of_4() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('topi: GPU 2 of 4')
+    Utils.markStageSkippedForConditional('frontend: GPU 2 of 6')
   }
 }
 
-def shard_run_topi_GPU_3_of_4() {
+def shard_run_frontend_GPU_3_of_6() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('GPU') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
         try {
           docker_init(ci_gpu)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
-              'TEST_STEP_NAME=topi: GPU',
-              'TVM_NUM_SHARDS=4',
+              'TEST_STEP_NAME=frontend: GPU',
+              'TVM_NUM_SHARDS=6',
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
@@ -3697,8 +3388,8 @@ def shard_run_topi_GPU_3_of_4() {
 
               ci_setup(ci_gpu)
               sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                label: 'Run Python frontend tests',
               )
             })
           }
@@ -3706,7 +3397,7 @@ def shard_run_topi_GPU_3_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3716,22 +3407,22 @@ def shard_run_topi_GPU_3_of_4() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('topi: GPU 3 of 4')
+    Utils.markStageSkippedForConditional('frontend: GPU 3 of 6')
   }
 }
 
-def shard_run_topi_GPU_4_of_4() {
+def shard_run_frontend_GPU_4_of_6() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('GPU') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
         try {
           docker_init(ci_gpu)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
-              'TEST_STEP_NAME=topi: GPU',
-              'TVM_NUM_SHARDS=4',
+              'TEST_STEP_NAME=frontend: GPU',
+              'TVM_NUM_SHARDS=6',
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
@@ -3771,8 +3462,8 @@ def shard_run_topi_GPU_4_of_4() {
 
               ci_setup(ci_gpu)
               sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                label: 'Run Python frontend tests',
               )
             })
           }
@@ -3780,7 +3471,7 @@ def shard_run_topi_GPU_4_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3790,12 +3481,11 @@ def shard_run_topi_GPU_4_of_4() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('topi: GPU 4 of 4')
+    Utils.markStageSkippedForConditional('frontend: GPU 4 of 6')
   }
 }
 
-
-def shard_run_frontend_GPU_1_of_6() {
+def shard_run_frontend_GPU_5_of_6() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
@@ -3807,7 +3497,7 @@ def shard_run_frontend_GPU_1_of_6() {
               'PLATFORM=gpu',
               'TEST_STEP_NAME=frontend: GPU',
               'TVM_NUM_SHARDS=6',
-              'TVM_SHARD_INDEX=0',
+              'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -3865,11 +3555,11 @@ def shard_run_frontend_GPU_1_of_6() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('frontend: GPU 1 of 6')
+    Utils.markStageSkippedForConditional('frontend: GPU 5 of 6')
   }
 }
 
-def shard_run_frontend_GPU_2_of_6() {
+def shard_run_frontend_GPU_6_of_6() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
@@ -3881,7 +3571,7 @@ def shard_run_frontend_GPU_2_of_6() {
               'PLATFORM=gpu',
               'TEST_STEP_NAME=frontend: GPU',
               'TVM_NUM_SHARDS=6',
-              'TVM_SHARD_INDEX=1',
+              'TVM_SHARD_INDEX=5',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -3939,23 +3629,24 @@ def shard_run_frontend_GPU_2_of_6() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('frontend: GPU 2 of 6')
+    Utils.markStageSkippedForConditional('frontend: GPU 6 of 6')
   }
 }
 
-def shard_run_frontend_GPU_3_of_6() {
+
+def shard_run_topi_aarch64_1_of_2() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('GPU') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_gpu)
+          docker_init(ci_arm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=gpu',
-              'TEST_STEP_NAME=frontend: GPU',
-              'TVM_NUM_SHARDS=6',
-              'TVM_SHARD_INDEX=2',
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=topi: aarch64',
+              'TVM_NUM_SHARDS=2',
+              'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -3980,22 +3671,27 @@ def shard_run_frontend_GPU_3_of_6() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
                           md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_gpu)
+              ci_setup(ci_arm)
+              cpp_unittest(ci_arm)
               sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
-                label: 'Run Python frontend tests',
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
+                label: 'Run test_arm_compute_lib test',
+              )
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
+                label: 'Run TOPI tests',
               )
             })
           }
@@ -4003,7 +3699,7 @@ def shard_run_frontend_GPU_3_of_6() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -4013,23 +3709,23 @@ def shard_run_frontend_GPU_3_of_6() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('frontend: GPU 3 of 6')
+    Utils.markStageSkippedForConditional('topi: aarch64 1 of 2')
   }
 }
 
-def shard_run_frontend_GPU_4_of_6() {
+def shard_run_topi_aarch64_2_of_2() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('GPU') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_gpu)
+          docker_init(ci_arm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=gpu',
-              'TEST_STEP_NAME=frontend: GPU',
-              'TVM_NUM_SHARDS=6',
-              'TVM_SHARD_INDEX=3',
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=topi: aarch64',
+              'TVM_NUM_SHARDS=2',
+              'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4054,22 +3750,26 @@ def shard_run_frontend_GPU_4_of_6() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
                           md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_gpu)
+              ci_setup(ci_arm)
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
+                label: 'Run test_arm_compute_lib test',
+              )
               sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
-                label: 'Run Python frontend tests',
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
+                label: 'Run TOPI tests',
               )
             })
           }
@@ -4077,7 +3777,7 @@ def shard_run_frontend_GPU_4_of_6() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -4087,23 +3787,24 @@ def shard_run_frontend_GPU_4_of_6() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('frontend: GPU 4 of 6')
+    Utils.markStageSkippedForConditional('topi: aarch64 2 of 2')
   }
 }
 
-def shard_run_frontend_GPU_5_of_6() {
+
+def shard_run_frontend_aarch64_1_of_2() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('GPU') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") {
         try {
-          docker_init(ci_gpu)
+          docker_init(ci_arm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=gpu',
-              'TEST_STEP_NAME=frontend: GPU',
-              'TVM_NUM_SHARDS=6',
-              'TVM_SHARD_INDEX=4',
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=frontend: aarch64',
+              'TVM_NUM_SHARDS=2',
+              'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4128,21 +3829,21 @@ def shard_run_frontend_GPU_5_of_6() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
                           md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_gpu)
+              ci_setup(ci_arm)
               sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
                 label: 'Run Python frontend tests',
               )
             })
@@ -4151,7 +3852,7 @@ def shard_run_frontend_GPU_5_of_6() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -4161,23 +3862,23 @@ def shard_run_frontend_GPU_5_of_6() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('frontend: GPU 5 of 6')
+    Utils.markStageSkippedForConditional('frontend: aarch64 1 of 2')
   }
 }
 
-def shard_run_frontend_GPU_6_of_6() {
+def shard_run_frontend_aarch64_2_of_2() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('GPU') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") {
         try {
-          docker_init(ci_gpu)
+          docker_init(ci_arm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=gpu',
-              'TEST_STEP_NAME=frontend: GPU',
-              'TVM_NUM_SHARDS=6',
-              'TVM_SHARD_INDEX=5',
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=frontend: aarch64',
+              'TVM_NUM_SHARDS=2',
+              'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4202,21 +3903,21 @@ def shard_run_frontend_GPU_6_of_6() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
                           md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_gpu)
+              ci_setup(ci_arm)
               sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
                 label: 'Run Python frontend tests',
               )
             })
@@ -4225,7 +3926,7 @@ def shard_run_frontend_GPU_6_of_6() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -4235,23 +3936,23 @@ def shard_run_frontend_GPU_6_of_6() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('frontend: GPU 6 of 6')
+    Utils.markStageSkippedForConditional('frontend: aarch64 2 of 2')
   }
 }
 
 
-def shard_run_topi_aarch64_1_of_2() {
+def shard_run_test_Cortex_M_1_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('ARM-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_arm)
+          docker_init(ci_cortexm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=arm',
-              'TEST_STEP_NAME=topi: aarch64',
-              'TVM_NUM_SHARDS=2',
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
@@ -4277,27 +3978,27 @@ def shard_run_topi_aarch64_1_of_2() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_arm)
-              cpp_unittest(ci_arm)
+              add_microtvm_permissions()
+              ci_setup(ci_cortexm)
+              cpp_unittest(ci_cortexm)
               sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
-                label: 'Run test_arm_compute_lib test',
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_demo_microtvm.sh",
+                label: 'Run microTVM demos',
               )
               sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
               )
             })
           }
@@ -4305,7 +4006,7 @@ def shard_run_topi_aarch64_1_of_2() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -4315,22 +4016,22 @@ def shard_run_topi_aarch64_1_of_2() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('topi: aarch64 1 of 2')
+    Utils.markStageSkippedForConditional('test: Cortex-M 1 of 12')
   }
 }
 
-def shard_run_topi_aarch64_2_of_2() {
+def shard_run_test_Cortex_M_2_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('ARM-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_arm)
+          docker_init(ci_cortexm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=arm',
-              'TEST_STEP_NAME=topi: aarch64',
-              'TVM_NUM_SHARDS=2',
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
@@ -4356,26 +4057,22 @@ def shard_run_topi_aarch64_2_of_2() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
-                label: 'Run test_arm_compute_lib test',
-              )
+              add_microtvm_permissions()
+              ci_setup(ci_cortexm)
               sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
               )
             })
           }
@@ -4383,7 +4080,7 @@ def shard_run_topi_aarch64_2_of_2() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -4393,24 +4090,23 @@ def shard_run_topi_aarch64_2_of_2() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('topi: aarch64 2 of 2')
+    Utils.markStageSkippedForConditional('test: Cortex-M 2 of 12')
   }
 }
 
-
-def shard_run_frontend_aarch64_1_of_2() {
+def shard_run_test_Cortex_M_3_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('ARM-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_arm)
+          docker_init(ci_cortexm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=arm',
-              'TEST_STEP_NAME=frontend: aarch64',
-              'TVM_NUM_SHARDS=2',
-              'TVM_SHARD_INDEX=0',
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4435,22 +4131,22 @@ def shard_run_frontend_aarch64_1_of_2() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_arm)
+              add_microtvm_permissions()
+              ci_setup(ci_cortexm)
               sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
-                label: 'Run Python frontend tests',
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
               )
             })
           }
@@ -4458,7 +4154,7 @@ def shard_run_frontend_aarch64_1_of_2() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -4468,23 +4164,23 @@ def shard_run_frontend_aarch64_1_of_2() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('frontend: aarch64 1 of 2')
+    Utils.markStageSkippedForConditional('test: Cortex-M 3 of 12')
   }
 }
 
-def shard_run_frontend_aarch64_2_of_2() {
+def shard_run_test_Cortex_M_4_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
-    node('ARM-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_arm)
+          docker_init(ci_cortexm)
           init_git()
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
-              'PLATFORM=arm',
-              'TEST_STEP_NAME=frontend: aarch64',
-              'TVM_NUM_SHARDS=2',
-              'TVM_SHARD_INDEX=1',
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4509,22 +4205,22 @@ def shard_run_frontend_aarch64_2_of_2() {
                             done
                           }
 
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
                           md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
                           md5sum build/config.cmake
+                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
                         """,
                         label: 'Download artifacts from S3',
                       )
 
-              ci_setup(ci_arm)
+              add_microtvm_permissions()
+              ci_setup(ci_cortexm)
               sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
-                label: 'Run Python frontend tests',
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
               )
             })
           }
@@ -4532,7 +4228,7 @@ def shard_run_frontend_aarch64_2_of_2() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive
+              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -4542,12 +4238,11 @@ def shard_run_frontend_aarch64_2_of_2() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('frontend: aarch64 2 of 2')
+    Utils.markStageSkippedForConditional('test: Cortex-M 4 of 12')
   }
 }
 
-
-def shard_run_test_Cortex_M_1_of_8() {
+def shard_run_test_Cortex_M_5_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
@@ -4558,8 +4253,8 @@ def shard_run_test_Cortex_M_1_of_8() {
             withEnv([
               'PLATFORM=cortexm',
               'TEST_STEP_NAME=test: Cortex-M',
-              'TVM_NUM_SHARDS=8',
-              'TVM_SHARD_INDEX=0',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4597,11 +4292,6 @@ def shard_run_test_Cortex_M_1_of_8() {
 
               add_microtvm_permissions()
               ci_setup(ci_cortexm)
-              cpp_unittest(ci_cortexm)
-              sh (
-                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_demo_microtvm.sh",
-                label: 'Run microTVM demos',
-              )
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
                 label: 'Run microTVM tests',
@@ -4622,11 +4312,11 @@ def shard_run_test_Cortex_M_1_of_8() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Cortex-M 1 of 8')
+    Utils.markStageSkippedForConditional('test: Cortex-M 5 of 12')
   }
 }
 
-def shard_run_test_Cortex_M_2_of_8() {
+def shard_run_test_Cortex_M_6_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
@@ -4637,8 +4327,8 @@ def shard_run_test_Cortex_M_2_of_8() {
             withEnv([
               'PLATFORM=cortexm',
               'TEST_STEP_NAME=test: Cortex-M',
-              'TVM_NUM_SHARDS=8',
-              'TVM_SHARD_INDEX=1',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=5',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4696,11 +4386,11 @@ def shard_run_test_Cortex_M_2_of_8() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Cortex-M 2 of 8')
+    Utils.markStageSkippedForConditional('test: Cortex-M 6 of 12')
   }
 }
 
-def shard_run_test_Cortex_M_3_of_8() {
+def shard_run_test_Cortex_M_7_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
@@ -4711,8 +4401,8 @@ def shard_run_test_Cortex_M_3_of_8() {
             withEnv([
               'PLATFORM=cortexm',
               'TEST_STEP_NAME=test: Cortex-M',
-              'TVM_NUM_SHARDS=8',
-              'TVM_SHARD_INDEX=2',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=6',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4770,11 +4460,11 @@ def shard_run_test_Cortex_M_3_of_8() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Cortex-M 3 of 8')
+    Utils.markStageSkippedForConditional('test: Cortex-M 7 of 12')
   }
 }
 
-def shard_run_test_Cortex_M_4_of_8() {
+def shard_run_test_Cortex_M_8_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
@@ -4785,8 +4475,8 @@ def shard_run_test_Cortex_M_4_of_8() {
             withEnv([
               'PLATFORM=cortexm',
               'TEST_STEP_NAME=test: Cortex-M',
-              'TVM_NUM_SHARDS=8',
-              'TVM_SHARD_INDEX=3',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=7',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4844,11 +4534,11 @@ def shard_run_test_Cortex_M_4_of_8() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Cortex-M 4 of 8')
+    Utils.markStageSkippedForConditional('test: Cortex-M 8 of 12')
   }
 }
 
-def shard_run_test_Cortex_M_5_of_8() {
+def shard_run_test_Cortex_M_9_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
@@ -4859,8 +4549,8 @@ def shard_run_test_Cortex_M_5_of_8() {
             withEnv([
               'PLATFORM=cortexm',
               'TEST_STEP_NAME=test: Cortex-M',
-              'TVM_NUM_SHARDS=8',
-              'TVM_SHARD_INDEX=4',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=8',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4918,11 +4608,11 @@ def shard_run_test_Cortex_M_5_of_8() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Cortex-M 5 of 8')
+    Utils.markStageSkippedForConditional('test: Cortex-M 9 of 12')
   }
 }
 
-def shard_run_test_Cortex_M_6_of_8() {
+def shard_run_test_Cortex_M_10_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
@@ -4933,8 +4623,8 @@ def shard_run_test_Cortex_M_6_of_8() {
             withEnv([
               'PLATFORM=cortexm',
               'TEST_STEP_NAME=test: Cortex-M',
-              'TVM_NUM_SHARDS=8',
-              'TVM_SHARD_INDEX=5',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=9',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -4992,11 +4682,11 @@ def shard_run_test_Cortex_M_6_of_8() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Cortex-M 6 of 8')
+    Utils.markStageSkippedForConditional('test: Cortex-M 10 of 12')
   }
 }
 
-def shard_run_test_Cortex_M_7_of_8() {
+def shard_run_test_Cortex_M_11_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
@@ -5007,8 +4697,8 @@ def shard_run_test_Cortex_M_7_of_8() {
             withEnv([
               'PLATFORM=cortexm',
               'TEST_STEP_NAME=test: Cortex-M',
-              'TVM_NUM_SHARDS=8',
-              'TVM_SHARD_INDEX=6',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=10',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -5066,11 +4756,11 @@ def shard_run_test_Cortex_M_7_of_8() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Cortex-M 7 of 8')
+    Utils.markStageSkippedForConditional('test: Cortex-M 11 of 12')
   }
 }
 
-def shard_run_test_Cortex_M_8_of_8() {
+def shard_run_test_Cortex_M_12_of_12() {
   if (!skip_ci && is_docs_only_build != 1) {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
@@ -5081,8 +4771,8 @@ def shard_run_test_Cortex_M_8_of_8() {
             withEnv([
               'PLATFORM=cortexm',
               'TEST_STEP_NAME=test: Cortex-M',
-              'TVM_NUM_SHARDS=8',
-              'TVM_SHARD_INDEX=7',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=11',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
                         script: """
@@ -5140,7 +4830,7 @@ def shard_run_test_Cortex_M_8_of_8() {
       }
     }
   } else {
-    Utils.markStageSkippedForConditional('test: Cortex-M 8 of 8')
+    Utils.markStageSkippedForConditional('test: Cortex-M 12 of 12')
   }
 }
 
@@ -5300,71 +4990,50 @@ stage('Test') {
   'unittest: GPU 3 of 3': {
     shard_run_unittest_GPU_3_of_3()
   },
-  'integration: CPU 1 of 10': {
-    shard_run_integration_CPU_1_of_10()
-  },
-  'integration: CPU 2 of 10': {
-    shard_run_integration_CPU_2_of_10()
-  },
-  'integration: CPU 3 of 10': {
-    shard_run_integration_CPU_3_of_10()
-  },
-  'integration: CPU 4 of 10': {
-    shard_run_integration_CPU_4_of_10()
-  },
-  'integration: CPU 5 of 10': {
-    shard_run_integration_CPU_5_of_10()
-  },
-  'integration: CPU 6 of 10': {
-    shard_run_integration_CPU_6_of_10()
+  'integration: CPU 1 of 4': {
+    shard_run_integration_CPU_1_of_4()
   },
-  'integration: CPU 7 of 10': {
-    shard_run_integration_CPU_7_of_10()
+  'integration: CPU 2 of 4': {
+    shard_run_integration_CPU_2_of_4()
   },
-  'integration: CPU 8 of 10': {
-    shard_run_integration_CPU_8_of_10()
+  'integration: CPU 3 of 4': {
+    shard_run_integration_CPU_3_of_4()
   },
-  'integration: CPU 9 of 10': {
-    shard_run_integration_CPU_9_of_10()
+  'integration: CPU 4 of 4': {
+    shard_run_integration_CPU_4_of_4()
   },
-  'integration: CPU 10 of 10': {
-    shard_run_integration_CPU_10_of_10()
+  'python: i386 1 of 3': {
+    shard_run_python_i386_1_of_3()
   },
-  'python: i386 1 of 5': {
-    shard_run_python_i386_1_of_5()
+  'python: i386 2 of 3': {
+    shard_run_python_i386_2_of_3()
   },
-  'python: i386 2 of 5': {
-    shard_run_python_i386_2_of_5()
+  'python: i386 3 of 3': {
+    shard_run_python_i386_3_of_3()
   },
-  'python: i386 3 of 5': {
-    shard_run_python_i386_3_of_5()
+  'test: Hexagon 1 of 8': {
+    shard_run_test_Hexagon_1_of_8()
   },
-  'python: i386 4 of 5': {
-    shard_run_python_i386_4_of_5()
+  'test: Hexagon 2 of 8': {
+    shard_run_test_Hexagon_2_of_8()
   },
-  'python: i386 5 of 5': {
-    shard_run_python_i386_5_of_5()
+  'test: Hexagon 3 of 8': {
+    shard_run_test_Hexagon_3_of_8()
   },
-  'test: Hexagon 1 of 7': {
-    shard_run_test_Hexagon_1_of_7()
+  'test: Hexagon 4 of 8': {
+    shard_run_test_Hexagon_4_of_8()
   },
-  'test: Hexagon 2 of 7': {
-    shard_run_test_Hexagon_2_of_7()
+  'test: Hexagon 5 of 8': {
+    shard_run_test_Hexagon_5_of_8()
   },
-  'test: Hexagon 3 of 7': {
-    shard_run_test_Hexagon_3_of_7()
+  'test: Hexagon 6 of 8': {
+    shard_run_test_Hexagon_6_of_8()
   },
-  'test: Hexagon 4 of 7': {
-    shard_run_test_Hexagon_4_of_7()
+  'test: Hexagon 7 of 8': {
+    shard_run_test_Hexagon_7_of_8()
   },
-  'test: Hexagon 5 of 7': {
-    shard_run_test_Hexagon_5_of_7()
-  },
-  'test: Hexagon 6 of 7': {
-    shard_run_test_Hexagon_6_of_7()
-  },
-  'test: Hexagon 7 of 7': {
-    shard_run_test_Hexagon_7_of_7()
+  'test: Hexagon 8 of 8': {
+    shard_run_test_Hexagon_8_of_8()
   },
   'integration: aarch64 1 of 4': {
     shard_run_integration_aarch64_1_of_4()
@@ -5378,17 +5047,14 @@ stage('Test') {
   'integration: aarch64 4 of 4': {
     shard_run_integration_aarch64_4_of_4()
   },
-  'topi: GPU 1 of 4': {
-    shard_run_topi_GPU_1_of_4()
+  'topi: GPU 1 of 3': {
+    shard_run_topi_GPU_1_of_3()
   },
-  'topi: GPU 2 of 4': {
-    shard_run_topi_GPU_2_of_4()
+  'topi: GPU 2 of 3': {
+    shard_run_topi_GPU_2_of_3()
   },
-  'topi: GPU 3 of 4': {
-    shard_run_topi_GPU_3_of_4()
-  },
-  'topi: GPU 4 of 4': {
-    shard_run_topi_GPU_4_of_4()
+  'topi: GPU 3 of 3': {
+    shard_run_topi_GPU_3_of_3()
   },
   'frontend: GPU 1 of 6': {
     shard_run_frontend_GPU_1_of_6()
@@ -5420,29 +5086,41 @@ stage('Test') {
   'frontend: aarch64 2 of 2': {
     shard_run_frontend_aarch64_2_of_2()
   },
-  'test: Cortex-M 1 of 8': {
-    shard_run_test_Cortex_M_1_of_8()
+  'test: Cortex-M 1 of 12': {
+    shard_run_test_Cortex_M_1_of_12()
+  },
+  'test: Cortex-M 2 of 12': {
+    shard_run_test_Cortex_M_2_of_12()
   },
-  'test: Cortex-M 2 of 8': {
-    shard_run_test_Cortex_M_2_of_8()
+  'test: Cortex-M 3 of 12': {
+    shard_run_test_Cortex_M_3_of_12()
   },
-  'test: Cortex-M 3 of 8': {
-    shard_run_test_Cortex_M_3_of_8()
+  'test: Cortex-M 4 of 12': {
+    shard_run_test_Cortex_M_4_of_12()
   },
-  'test: Cortex-M 4 of 8': {
-    shard_run_test_Cortex_M_4_of_8()
+  'test: Cortex-M 5 of 12': {
+    shard_run_test_Cortex_M_5_of_12()
   },
-  'test: Cortex-M 5 of 8': {
-    shard_run_test_Cortex_M_5_of_8()
+  'test: Cortex-M 6 of 12': {
+    shard_run_test_Cortex_M_6_of_12()
   },
-  'test: Cortex-M 6 of 8': {
-    shard_run_test_Cortex_M_6_of_8()
+  'test: Cortex-M 7 of 12': {
+    shard_run_test_Cortex_M_7_of_12()
   },
-  'test: Cortex-M 7 of 8': {
-    shard_run_test_Cortex_M_7_of_8()
+  'test: Cortex-M 8 of 12': {
+    shard_run_test_Cortex_M_8_of_12()
   },
-  'test: Cortex-M 8 of 8': {
-    shard_run_test_Cortex_M_8_of_8()
+  'test: Cortex-M 9 of 12': {
+    shard_run_test_Cortex_M_9_of_12()
+  },
+  'test: Cortex-M 10 of 12': {
+    shard_run_test_Cortex_M_10_of_12()
+  },
+  'test: Cortex-M 11 of 12': {
+    shard_run_test_Cortex_M_11_of_12()
+  },
+  'test: Cortex-M 12 of 12': {
+    shard_run_test_Cortex_M_12_of_12()
   },
   'test: RISC-V 1 of 1': {
     shard_run_test_RISC_V_1_of_1()
@@ -5684,7 +5362,8 @@ stage('Test') {
   },
   )
 }
-}/*
+}
+/*
 stage('Build packages') {
   parallel 'conda CPU': {
     node('CPU') {
diff --git a/ci/jenkins/Test.groovy.j2 b/ci/jenkins/Test.groovy.j2
index 9e8c9ac28b01..4ed149da9be0 100644
--- a/ci/jenkins/Test.groovy.j2
+++ b/ci/jenkins/Test.groovy.j2
@@ -42,7 +42,7 @@
 {% call(shard_index, num_shards) m.sharded_test_step(
   name="integration: CPU",
   node="CPU-SMALL",
-  num_shards=10,
+  num_shards=4,
   ws="tvm/integration-python-cpu",
   platform="cpu",
   docker_image="ci_cpu",
@@ -58,7 +58,7 @@
 {% call(shard_index, num_shards) m.sharded_test_step(
   name="python: i386",
   node="CPU-SMALL",
-  num_shards=5,
+  num_shards=3,
   ws="tvm/integration-python-i386",
   platform="i386",
   docker_image="ci_i386",
@@ -85,7 +85,7 @@
   platform="hexagon",
   docker_image="ci_hexagon",
   test_method_names=test_method_names,
-  num_shards=7,
+  num_shards=8,
 ) %}
   {{ m.download_artifacts(tag='hexagon', filenames=tvm_lib, folders=hexagon_api) }}
   add_hexagon_permissions()
@@ -118,7 +118,7 @@
 {% call(shard_index, num_shards) m.sharded_test_step(
   name="topi: GPU",
   node="GPU",
-  num_shards=4,
+  num_shards=3,
   ws="tvm/topi-python-gpu",
   platform="gpu",
   docker_image="ci_gpu",
@@ -192,7 +192,7 @@
   ws="tvm/test-cortexm",
   platform="cortexm",
   docker_image="ci_cortexm",
-  num_shards=8,
+  num_shards=12,
   test_method_names=test_method_names,
 ) %}
   {{ m.download_artifacts(tag='cortexm', filenames=tvm_lib, folders=microtvm_template_projects) }}
@@ -316,4 +316,4 @@ stage('Test') {
   },
   )
 }
-}
\ No newline at end of file
+}
diff --git a/ci/jenkins/generate.py b/ci/jenkins/generate.py
index 3ccdedc6d924..07bf4b5a8dad 100644
--- a/ci/jenkins/generate.py
+++ b/ci/jenkins/generate.py
@@ -31,6 +31,12 @@
 JENKINSFILE = REPO_ROOT / "Jenkinsfile"
 
 
+class Change:
+    IMAGES_ONLY = object()
+    NONE = object()
+    FULL = object()
+
+
 data = {
     "images": [
         {
@@ -83,7 +89,7 @@ def lines_without_generated_tag(content):
     ]
 
 
-def is_changed_images_only(lines: List[str]) -> bool:
+def change_type(lines: List[str]) -> Change:
     """
     Return True if 'line' only edits an image tag or if 'line' is not a changed
     line in a diff
@@ -101,7 +107,7 @@ def is_changed_images_only(lines: List[str]) -> bool:
 
     if len(diff_lines) == 0:
         # no changes made
-        return True
+        return Change.NONE
 
     for line in diff_lines:
         is_add = line.startswith("+")
@@ -113,7 +119,7 @@ def is_changed_images_only(lines: List[str]) -> bool:
         )
         if match is None:
             # matched a non-image line, quit early
-            return False
+            return Change.FULL
 
         if is_add:
             added_images.append(match.groups()[0])
@@ -121,7 +127,10 @@ def is_changed_images_only(lines: List[str]) -> bool:
             removed_images.append(match.groups()[0])
 
     # make sure that the added image lines match the removed image lines
-    return len(added_images) > 0 and added_images == removed_images
+    if len(added_images) > 0 and added_images == removed_images:
+        return Change.IMAGES_ONLY
+    else:
+        return Change.FULL
 
 
 if __name__ == "__main__":
@@ -156,9 +165,11 @@ def is_changed_images_only(lines: List[str]) -> bool:
             lines_without_generated_tag(content), lines_without_generated_tag(new_content)
         )
     ]
-    if not args.force and is_changed_images_only(diff):
+    change = change_type(diff)
+    if not args.force and change == Change.IMAGES_ONLY or change == Change.NONE:
+        if change != Change.NONE:
+            print("Detected only Docker-image name changes, skipping timestamp update")
         new_content = new_content.replace(data["generated_time"], original_timestamp)
-        print("Detected only Docker-image name changed, skipping timestamp update")
 
     diff = "".join(diff)
 
diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index 65475d67f555..0b9f65540c34 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -210,7 +210,8 @@ def pytest_configure(config):
 def pytest_configure_node(node):
     # the master for each node fills slaveinput dictionary
     # which pytest-xdist will transfer to the subprocess
-    node.workerinput["device_adr"] = node.config.iplist.pop()
+    if node.config.iplist is not None:
+        node.workerinput["device_adr"] = node.config.iplist.pop()
 
 
 @pytest.fixture
diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh
index 895979293122..305f626d666c 100755
--- a/tests/scripts/setup-pytest-env.sh
+++ b/tests/scripts/setup-pytest-env.sh
@@ -47,6 +47,7 @@ function cleanup() {
 trap cleanup 0
 
 function run_pytest() {
+    set -e
     local ffi_type="$1"
     shift
     local test_suite_name="$1"
@@ -74,17 +75,14 @@ function run_pytest() {
 
     suite_name="${test_suite_name}-${current_shard}-${ffi_type}"
 
-    # Some test environments don't play well with parallelism
-    DEFAULT_PARALLELISM=2
-    if [[ "${TEST_STEP_NAME:-default}" == "frontend: GPU"* ]] || [[ "${TEST_STEP_NAME:-default}" == "test: Hexagon"* ]]; then
-        DEFAULT_PARALLELISM=1
-    fi
+    DEFAULT_PARALLELISM=1
 
-    if [ ! "${extra_args[@]}" == *" -n"* ] && [! "${extra_args[@]}" == *" -dist"* ]; then
+    if [[ ! "${extra_args[*]}" == *" -n"* ]] && [[ ! "${extra_args[*]}" == *" -dist"* ]]; then
         extra_args+=("-n=$DEFAULT_PARALLELISM")
     fi
 
     exit_code=0
+    set +e
     TVM_FFI=${ffi_type} python3 -m pytest \
            -o "junit_suite_name=${suite_name}" \
            "--junit-xml=${TVM_PYTEST_RESULT_DIR}/${suite_name}.xml" \

From abb2aa062fd240980844faa6e0ebcc2256a5191c Mon Sep 17 00:00:00 2001
From: "yin.changsheng" <yin.changsheng@intellif.com>
Date: Thu, 8 Sep 2022 11:28:36 +0800
Subject: [PATCH 123/704] [TIR] Add unroll_loop_with_partition_hint_no_interval
 attr in LoopPartitionConfig     to unroll loop (#12631)

[TIR] Add unroll_loop_with_partition_hint_no_interval attr in LoopPartitionConfig
to unroll loop
---
 src/tir/transforms/loop_partition.cc          | 28 ++++++--
 .../test_tir_transform_loop_partition.py      | 72 +++++++++++++------
 2 files changed, 71 insertions(+), 29 deletions(-)

diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc
index d410f8cfa471..e1445d29dacf 100644
--- a/src/tir/transforms/loop_partition.cc
+++ b/src/tir/transforms/loop_partition.cc
@@ -43,12 +43,16 @@ namespace tir {
 struct LoopPartitionConfigNode : public tvm::AttrsNode<LoopPartitionConfigNode> {
   bool partition_const_loop;
   bool no_unroll_loop_with_extent_one;
+  bool unroll_loop_with_partition_hint_no_interval;
 
   TVM_DECLARE_ATTRS(LoopPartitionConfigNode, "tir.transform.LoopPartitionConfig") {
     TVM_ATTR_FIELD(partition_const_loop).describe("Split constant loop").set_default(false);
     TVM_ATTR_FIELD(no_unroll_loop_with_extent_one)
         .describe("Don't unroll loops with extent 1")
         .set_default(false);
+    TVM_ATTR_FIELD(unroll_loop_with_partition_hint_no_interval)
+        .describe("Unroll loops with pragma_loop_partition_hint and no interval")
+        .set_default(false);
   }
 };
 
@@ -377,9 +381,11 @@ class ThreadPartitionInserter : public StmtMutator {
 // likely conditions
 class LoopPartitioner : public StmtMutator {
  public:
-  explicit LoopPartitioner(bool partition_const_loop, bool no_unroll_loop_with_extent_one)
+  explicit LoopPartitioner(bool partition_const_loop, bool no_unroll_loop_with_extent_one,
+                           bool unroll_loop_with_partition_hint_no_interval)
       : selector(CandidateSelector(partition_const_loop)),
-        no_unroll_loop_with_extent_one_(no_unroll_loop_with_extent_one) {}
+        no_unroll_loop_with_extent_one_(no_unroll_loop_with_extent_one),
+        unroll_loop_with_partition_hint_no_interval_(unroll_loop_with_partition_hint_no_interval) {}
 
   Stmt VisitAndMutate(Stmt stmt) {
     selector(stmt);
@@ -447,6 +453,7 @@ class LoopPartitioner : public StmtMutator {
   arith::Analyzer analyzer_;
   CandidateSelector selector;
   bool no_unroll_loop_with_extent_one_;
+  bool unroll_loop_with_partition_hint_no_interval_;
 };
 
 // Returns an interval (in the first component) in which all the conditions
@@ -587,6 +594,10 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim
   }();
 
   if (!opt_cond_value.has_value()) {
+    if (has_partition_hint_ && unroll_loop_with_partition_hint_no_interval_ &&
+        analyzer_.CanProve(max - min > 0)) {
+      return For(var, min, max - min + 1, ForKind::kUnrolled, body);
+    }
     return Stmt();
   }
   bool cond_value = opt_cond_value.value();
@@ -658,11 +669,11 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim
       Stmt simplified_body = ConditionEliminator(cond_set, cond_value)(body);
       Stmt new_body = Substitute(simplified_body, {{Var{var}, var + body_begin}});
       mid_stmt = MakeFor(stmt.get(), post_doubt_begin - body_begin, new_body);
-
+      // Recurse until partitions is empty
+      mid_stmt = VisitAndMutate(mid_stmt);
       // Recurse for each non-empty subrange only if there are at least
       // two non-empty subranges
       if (pre_stmt.defined() || post_stmt.defined()) {
-        mid_stmt = VisitAndMutate(mid_stmt);
         if (pre_stmt.defined() && pre_stmt_recurse) {
           pre_stmt = VisitAndMutate(pre_stmt);
         }
@@ -714,8 +725,10 @@ class RemoveLikelyTagsAndHints : public StmtExprMutator {
   }
 };
 
-Stmt LoopPartition(Stmt stmt, bool partition_const_loop, bool no_unroll_loop_with_extent_one) {
-  stmt = LoopPartitioner(partition_const_loop, no_unroll_loop_with_extent_one)
+Stmt LoopPartition(Stmt stmt, bool partition_const_loop, bool no_unroll_loop_with_extent_one,
+                   bool unroll_loop_with_partition_hint_no_interval) {
+  stmt = LoopPartitioner(partition_const_loop, no_unroll_loop_with_extent_one,
+                         unroll_loop_with_partition_hint_no_interval)
              .VisitAndMutate(std::move(stmt));
   stmt = RemoveLikelyTagsAndHints()(std::move(stmt));
   return stmt;
@@ -731,7 +744,8 @@ Pass LoopPartition() {
       cfg = AttrsWithDefaultValues<LoopPartitionConfig>();
     }
     n->body = LoopPartition(std::move(n->body), cfg.value()->partition_const_loop,
-                            cfg.value()->no_unroll_loop_with_extent_one);
+                            cfg.value()->no_unroll_loop_with_extent_one,
+                            cfg.value()->unroll_loop_with_partition_hint_no_interval);
     return f;
   };
   return CreatePrimFuncPass(pass_func, 0, "tir.LoopPartition", {});
diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py
index b6e8d92f8d39..23a0064ee6ff 100644
--- a/tests/python/unittest/test_tir_transform_loop_partition.py
+++ b/tests/python/unittest/test_tir_transform_loop_partition.py
@@ -619,26 +619,54 @@ def test_condition_mutually_exclusive():
     assert tvm.ir.structural_equal(mod["main"], partitioned_concat_3)
 
 
+def test_loop_partition_unroll_hint():
+    @T.prim_func
+    def main(A: T.Buffer[150528, "int8"], B: T.Buffer[25088, "int8"]) -> None:
+        T.preflattened_buffer(A, [1, 3, 224, 224], "int8", data=A.data)
+        T.preflattened_buffer(B, [1, 224, 7, 16], "int8", data=B.data)
+        for ax0 in T.serial(
+            112,
+            annotations={"pragma_loop_partition_hint": True},
+        ):
+            for ax1, ax2, ax3 in T.grid(224, 7, 16):
+                if 3 <= ax0 * 2 + ax2 and ax0 * 2 + ax2 < 227 and ax3 < 3:
+                    B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax0 * 2 + ax2 - 3]
+
+    @T.prim_func
+    def partitioned_main(A: T.Buffer[150528, "int8"], B: T.Buffer[25088, "int8"]) -> None:
+        T.preflattened_buffer(A, [1, 3, 224, 224], dtype="int8", data=A.data)
+        T.preflattened_buffer(B, [1, 224, 7, 16], dtype="int8", data=B.data)
+        # body
+        for ax1, ax2, ax3 in T.grid(224, 7, 16):
+            if 3 <= ax2 and ax3 < 3:
+                B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax2 - 3]
+        for ax1, ax2, ax3 in T.grid(224, 7, 16):
+            if 1 <= ax2 and ax3 < 3:
+                B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax2 - 1]
+        for ax0, ax1, ax2, ax3 in T.grid(109, 224, 7, 16):
+            if ax3 < 3:
+                B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax0 * 2 + ax2 + 1]
+        for ax1, ax2, ax3 in T.grid(224, 7, 16):
+            if ax2 < 5 and ax3 < 3:
+                B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax2 + 219]
+
+    mod = tvm.ir.module.IRModule.from_expr(main)
+    with tvm.transform.PassContext(
+        config={
+            "tir.LoopPartition": {
+                "partition_const_loop": True,
+                "unroll_loop_with_partition_hint_no_interval": True,
+            }
+        }
+    ):
+        mod = tvm.tir.transform.LowerOpaqueBlock()(mod)
+        mod = tvm.tir.transform.FlattenBuffer()(mod)
+        mod = tvm.tir.transform.LoopPartition()(mod)
+        mod = tvm.tir.transform.UnrollLoop()(mod)
+        mod = tvm.tir.transform.RemoveNoOp()(mod)
+        mod = tvm.tir.transform.Simplify()(mod)
+    assert tvm.ir.structural_equal(mod["main"], partitioned_main)
+
+
 if __name__ == "__main__":
-    test_basic()
-    test_const_loop()
-    test_multi_loop()
-    test_multi_if()
-    test_thread_axis()
-    test_vectorize()
-    test_condition()
-    test_condition_EQ()
-    test_thread_axis2()
-    test_everything_during_deduction()
-    test_single_likely()
-    test_multi_likely()
-    test_oneD_pool()
-    test_cce_loop_1()
-    test_cce_loop_2()
-    test_cce_loop_3()
-    test_conv_tiling()
-    test_double_splitting_with_indivisible_factors()
-    test_multilevel_splitting_with_indivisble_factors()
-    test_simple_rfactor()
-    test_explicit_partition_hint()
-    test_condition_mutually_exclusive()
+    tvm.testing.main()

From 6be04d72c2a2d65b791a43a40167101ce4064ff2 Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Thu, 8 Sep 2022 10:28:17 +0530
Subject: [PATCH 124/704] =?UTF-8?q?[OpenCLML]=20CLML=20Profiling=20fixes?=
 =?UTF-8?q?=20corresponding=20to=20OpenCL=20Timer=20recent=20=E2=80=A6=20(?=
 =?UTF-8?q?#12711)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [OpenCLML] CLML Profiling fixes corresponding to OpenCL Timer recent changes.

* [OpenCLML] Review comments.

* * review comment
---
 src/runtime/contrib/clml/clml_runtime.cc      | 161 ++++++++----------
 .../contrib/test_clml/infrastructure.py       |   6 +-
 .../python/contrib/test_clml/test_network.py  |   4 +-
 tests/python/contrib/test_clml/test_ops.py    |   2 +-
 4 files changed, 80 insertions(+), 93 deletions(-)

diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index 7966c0e78b2d..da41442ef91d 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -131,37 +131,14 @@ class CLMLRuntime : public JSONRuntimeBase {
     // Setup CLML Context
     cl_int result = 0;
 
-    // Initialize Context and Command Queue
-    result = clGetPlatformIDs(1, &platform, NULL);
-    ICHECK(result == CL_SUCCESS) << "clGetPlatformIDs:" << result;
+    workspace = cl::OpenCLWorkspace::Global();
+    workspace->Init();
+    tentry = workspace->GetThreadEntry();
 
-    uint32_t num_devices = 0;
-    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
-    ICHECK(result == CL_SUCCESS && num_devices == 1) << "clGetDeviceIDs:" << result;
-
-    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
-    ICHECK(device_id && result == CL_SUCCESS) << "clGetDeviceIDs:" << result;
-
-    if (!ExtensionStringPresent(device_id)) {
+    if (!ExtensionStringPresent()) {
       LOG(WARNING) << "CLML Runtime Init: Qualcomm extn not present.\n";
       return;
     }
-
-    // Reuse the OpenCl work space from TVM Device API.
-    auto func = tvm::runtime::Registry::Get("device_api.opencl");
-    ICHECK(func != nullptr) << "Cannot find OpenCL device_api in registry";
-    auto device_api = static_cast<cl::OpenCLWorkspace*>(((*func)()).operator void*());
-    this->context = device_api->context;
-    bool queue_found = false;
-    for (size_t i = 0; i < device_api->devices.size(); ++i) {
-      if (device_api->devices[i] == device_id) {
-        this->queue = device_api->queues[i];
-        this->evts = &(device_api->events[i]);
-        queue_found = true;
-      }
-    }
-    ICHECK(queue_found != false) << "Device queue not found in OpenCL Workspace";
-
     // Query and Get CLML Interface
     static const cl_uint MAX_VERSIONS = 256;
     cl_int majorVersions[MAX_VERSIONS];
@@ -220,8 +197,8 @@ class CLMLRuntime : public JSONRuntimeBase {
                             cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM) {
     cl_int result = 0;
     cl_event evt = NULL;
-    result = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, data, layout, tensor->tensor,
-                                                        tensor->memory,
+    result = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(workspace->GetQueue(tentry->device), data,
+                                                        layout, tensor->tensor, tensor->memory,
                                                         0,      // n waitlist
                                                         NULL,   // waitlist
                                                         &evt);  // event
@@ -233,8 +210,8 @@ class CLMLRuntime : public JSONRuntimeBase {
     cl_int result = 0;
     cl_event readEvent = NULL;
     // Read the output tensor
-    result = h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(queue, tensor->tensor, tensor->memory, data,
-                                                       layout,
+    result = h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(workspace->GetQueue(tentry->device),
+                                                       tensor->tensor, tensor->memory, data, layout,
                                                        0,            // n waitlist
                                                        NULL,         // waitlist
                                                        &readEvent);  // event
@@ -253,6 +230,8 @@ class CLMLRuntime : public JSONRuntimeBase {
    */
   void Run() override {
     cl_int result = 0;
+    cl_command_queue queue = workspace->GetQueue(tentry->device);
+    std::vector<cl_event>& evts = workspace->GetEventQueue(tentry->device);
     for (size_t i = 0; i < input_nodes_.size(); ++i) {
       auto nid = input_nodes_[i];
       uint32_t eid = EntryID(nid, 0);
@@ -286,22 +265,26 @@ class CLMLRuntime : public JSONRuntimeBase {
     }
 
     for (size_t i = 0; i < this->layer_.function.size(); ++i) {
-      this->evts->resize(this->evts->size() + 1);
-      cl_event* evt = &(this->evts->back());
-      result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
-                                             this->layer_.descriptorSet, 0, NULL, evt);
+      if (getenv("CLML_PROFILING")) {
+        evts.resize(evts.size() + 1);
+        cl_event* evt = &(evts.back());
+        result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
+                                               this->layer_.descriptorSet, 0, NULL, evt);
+      } else {
+        result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
+                                               this->layer_.descriptorSet, 0, NULL, NULL);
+      }
       ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM:" << result;
     }
 
     if (getenv("CLML_PROFILING")) {
       cl_ulong start, end;
       cl_ulong duration = 0;
-      clWaitForEvents(1, &(this->evts->back()));
+      clWaitForEvents(1, &(evts.back()));
       for (size_t i = 0; i < this->layer_.layer_names.size(); ++i) {
-        clGetEventProfilingInfo((*this->evts)[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
-                                &start, nullptr);
-        clGetEventProfilingInfo((*this->evts)[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end,
+        clGetEventProfilingInfo(evts[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start,
                                 nullptr);
+        clGetEventProfilingInfo(evts[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr);
         duration += (end - start);
         LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] << " Duration:" << (end - start);
       }
@@ -425,7 +408,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       JSONGraphNode node = it->second.second;
       void* node_data = nullptr;
 
-      allocateTensorMemory(h_ClmlIntf, context, tensor_desc);
+      allocateTensorMemory(h_ClmlIntf, workspace->context, tensor_desc);
 
       if (node.GetOpType() == "const") {
         node_data = data_entry_[EntryID(it->first, 0)]->data;
@@ -449,8 +432,9 @@ class CLMLRuntime : public JSONRuntimeBase {
       LOG(WARNING) << "CLML Tunning In Progress:";
       for (size_t i = 0; i < this->layer_.function.size(); ++i) {
         LOG(WARNING) << "CLML Tunning:" << i;
-        result = h_ClmlIntf->clTuneMLOpQCOM(queue, this->layer_.function[i],
-                                            this->layer_.descriptorSet, this->tuning_cache, NULL);
+        result = h_ClmlIntf->clTuneMLOpQCOM(workspace->GetQueue(tentry->device),
+                                            this->layer_.function[i], this->layer_.descriptorSet,
+                                            this->tuning_cache, NULL);
         ICHECK(result == CL_SUCCESS) << "clTuneMLOpQCOM:" << result;
       }
 
@@ -499,10 +483,13 @@ class CLMLRuntime : public JSONRuntimeBase {
     uint32_t n, c, h, w;
   };
 
-  bool ExtensionStringPresent(cl_device_id device_id) {
+  bool ExtensionStringPresent(void) {
     cl_int result = 0;
-
+    if (workspace->platform_id == nullptr) {
+      return 0;
+    }
     size_t reqd_size = 0;
+    cl_device_id device_id = workspace->devices[workspace->GetThreadEntry()->device.device_id];
     result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, NULL, &reqd_size);
     ICHECK(reqd_size > 0u && result == CL_SUCCESS) << "clGetDeviceInfo:" << result;
 
@@ -525,7 +512,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     cl_ml_tensor_desc_qcom desc = {
         dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, { 0 }};
     CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast<CLMLInterfaceV2QCOM*>(pClmlIntf);
-    result = clmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &tensor);
+    result = clmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &tensor);
     ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result;
     (void)result;
     return tensor;
@@ -538,10 +525,11 @@ class CLMLRuntime : public JSONRuntimeBase {
     cl_mem buffer = NULL;
 
     CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast<CLMLInterfaceV2QCOM*>(pClmlIntf);
-    result = clmlIntf->clGetMLTensorMemorySizeQCOM(context, pTensorMemDesc->tensor, &size);
+    result =
+        clmlIntf->clGetMLTensorMemorySizeQCOM(workspace->context, pTensorMemDesc->tensor, &size);
     ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result;
 
-    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &result);
+    buffer = clCreateBuffer(workspace->context, CL_MEM_READ_WRITE, size, NULL, &result);
     ICHECK(result == CL_SUCCESS) << "clCreateBuffer:" << result;
 
     pTensorMemDesc->memory = buffer;
@@ -592,7 +580,8 @@ class CLMLRuntime : public JSONRuntimeBase {
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
 
     auto tensor_dsc = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-    tensor_dsc->tensor = DeviceMakeCLMLTensor(h_ClmlIntf, context, dims, layout, cl_dtype);
+    tensor_dsc->tensor =
+        DeviceMakeCLMLTensor(h_ClmlIntf, workspace->context, dims, layout, cl_dtype);
     return tensor_dsc;
   }
 
@@ -703,7 +692,8 @@ class CLMLRuntime : public JSONRuntimeBase {
     } else {
       cl_ml_tensor_desc_qcom desc = {};
       desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
-      result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &layer_.unusedTensor);
+      result =
+          h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor);
       ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result;
       bias->tensor = layer_.unusedTensor;
     }
@@ -723,13 +713,13 @@ class CLMLRuntime : public JSONRuntimeBase {
     if (!has_bn) {
       if (!has_act) {
         result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM(
-            context, 0, &conv_desc, input->tensor, weight->tensor, bias->tensor, output->tensor,
-            &op, NULL);
+            workspace->context, 0, &conv_desc, input->tensor, weight->tensor, bias->tensor,
+            output->tensor, &op, NULL);
         ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
       } else {
         result = h_ClmlIntf->clCreateMLOpFusedConvolutionActivationForwardQCOM(
-            context, 0, &conv_desc, &act_desc, input->tensor, weight->tensor, bias->tensor, NULL,
-            output->tensor, &op, tuning_cache);
+            workspace->context, 0, &conv_desc, &act_desc, input->tensor, weight->tensor,
+            bias->tensor, NULL, output->tensor, &op, tuning_cache);
         ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
       }
       layer_.func_ins.push_back(input);
@@ -753,13 +743,13 @@ class CLMLRuntime : public JSONRuntimeBase {
                                               CL_ARITHMETIC_MODE_FP32_QCOM};
       if (!has_act) {
         result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM(
-            context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor, bias->tensor,
-            output->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op,
-            tuning_cache);
+            workspace->context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor,
+            bias->tensor, output->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor,
+            bn_bias->tensor, &op, tuning_cache);
         ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
       } else {
         result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM(
-            context, 0, &conv_desc, &bn_desc, &act_desc, input->tensor, weight->tensor,
+            workspace->context, 0, &conv_desc, &bn_desc, &act_desc, input->tensor, weight->tensor,
             bias->tensor, output->tensor, NULL, bn_mean->tensor, bn_var->tensor, bn_scale->tensor,
             bn_bias->tensor, &op, tuning_cache);
 
@@ -790,12 +780,13 @@ class CLMLRuntime : public JSONRuntimeBase {
 
     cl_ml_tensor_desc_qcom desc = {};
     desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
-    result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &layer_.unusedTensor);
+    result =
+        h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor);
     ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result;
 
-    result = h_ClmlIntf->clCreateMLOpActivationForwardQCOM(context, 0, &act_desc, input->tensor,
-                                                           layer_.unusedTensor, output->tensor, &op,
-                                                           tuning_cache);
+    result = h_ClmlIntf->clCreateMLOpActivationForwardQCOM(workspace->context, 0, &act_desc,
+                                                           input->tensor, layer_.unusedTensor,
+                                                           output->tensor, &op, tuning_cache);
     ICHECK(op && result == CL_SUCCESS) << "Activation Error:" << result;
 
     layer_.func_ins.push_back(input);
@@ -834,8 +825,8 @@ class CLMLRuntime : public JSONRuntimeBase {
                                             CL_ARITHMETIC_MODE_FP32_QCOM};
 
     result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM(
-        context, 0, &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor,
-        bn_bias->tensor, output->tensor, &op, tuning_cache);
+        workspace->context, 0, &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor,
+        bn_scale->tensor, bn_bias->tensor, output->tensor, &op, tuning_cache);
     ICHECK(op && result == CL_SUCCESS) << "Batchnorm Error:" << result;
 
     layer->function.push_back(op);
@@ -872,12 +863,13 @@ class CLMLRuntime : public JSONRuntimeBase {
 
     cl_ml_tensor_desc_qcom desc = {};
     desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
-    result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &layer_.unusedTensor);
+    result =
+        h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor);
     ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result;
 
-    result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(context, 0, &pool_desc, input->tensor,
-                                                        layer_.unusedTensor, output->tensor, &op,
-                                                        tuning_cache);
+    result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(workspace->context, 0, &pool_desc,
+                                                        input->tensor, layer_.unusedTensor,
+                                                        output->tensor, &op, tuning_cache);
     ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result;
 
     layer_.func_ins.push_back(input);
@@ -904,8 +896,8 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                CL_SOFTMAX_MODE_INSTANCE_QCOM,
                                                CL_ARITHMETIC_MODE_FP32_QCOM};
 
-    result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(context, 0, &softmax_desc, input->tensor,
-                                                 output->tensor, &op, tuning_cache);
+    result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(workspace->context, 0, &softmax_desc,
+                                                 input->tensor, output->tensor, &op, tuning_cache);
     ICHECK(op && result == CL_SUCCESS) << "SoftMax Error:" << result;
 
     layer_.func_ins.push_back(input);
@@ -946,8 +938,8 @@ class CLMLRuntime : public JSONRuntimeBase {
         {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0},
         CL_ARITHMETIC_MODE_FP32_QCOM};
 
-    result = h_ClmlIntf->clCreateMLOpPadQCOM(context, 0, &pad_desc, input->tensor, output->tensor,
-                                             &op, tuning_cache);
+    result = h_ClmlIntf->clCreateMLOpPadQCOM(workspace->context, 0, &pad_desc, input->tensor,
+                                             output->tensor, &op, tuning_cache);
     ICHECK(op && result == CL_SUCCESS) << "Pad Error:" << result;
 
     layer_.func_ins.push_back(input);
@@ -968,8 +960,8 @@ class CLMLRuntime : public JSONRuntimeBase {
     auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
     auto output = MakeCLMLTensorFromJSONNode(node);
 
-    result = h_ClmlIntf->clCreateMLOpReshapeQCOM(context, 0, input->tensor, output->tensor, &op,
-                                                 tuning_cache);
+    result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->context, 0, input->tensor,
+                                                 output->tensor, &op, tuning_cache);
     ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result;
 
     layer_.func_ins.push_back(input);
@@ -1004,13 +996,13 @@ class CLMLRuntime : public JSONRuntimeBase {
 
     auto output = MakeCLMLTensorFromJSONNode(node);
     if (has_bias) {
-      result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(context, 0, &fc_desc, input->tensor,
-                                                          weight->tensor, bias->tensor,
-                                                          output->tensor, &op, tuning_cache);
+      result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(
+          workspace->context, 0, &fc_desc, input->tensor, weight->tensor, bias->tensor,
+          output->tensor, &op, tuning_cache);
     } else {
-      result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(context, 0, &fc_desc, input->tensor,
-                                                          weight->tensor, NULL, output->tensor, &op,
-                                                          tuning_cache);
+      result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(workspace->context, 0, &fc_desc,
+                                                          input->tensor, weight->tensor, NULL,
+                                                          output->tensor, &op, tuning_cache);
     }
     ICHECK(op && result == CL_SUCCESS) << "Fully Connected Error:" << result;
 
@@ -1039,8 +1031,8 @@ class CLMLRuntime : public JSONRuntimeBase {
                                          {{a_min}, CL_FLOAT},
                                          CL_ARITHMETIC_MODE_FP32_QCOM};
 
-    result = h_ClmlIntf->clCreateMLOpClipQCOM(context, 0, &clip_desc, input->tensor, output->tensor,
-                                              &op, tuning_cache);
+    result = h_ClmlIntf->clCreateMLOpClipQCOM(workspace->context, 0, &clip_desc, input->tensor,
+                                              output->tensor, &op, tuning_cache);
     ICHECK(op && result == CL_SUCCESS) << "Clip Error:" << result;
 
     layer_.func_ins.push_back(input);
@@ -1056,11 +1048,8 @@ class CLMLRuntime : public JSONRuntimeBase {
   CachedLayer layer_;
   // CLML Context
   CLMLInterfaceV2QCOM* h_ClmlIntf = NULL;
-  cl_platform_id platform = NULL;
-  cl_context context = NULL;
-  cl_device_id device_id = NULL;
-  cl_command_queue queue = NULL;
-  std::vector<cl_event>* evts;
+  cl::OpenCLWorkspace* workspace = NULL;
+  cl::OpenCLThreadEntry* tentry = NULL;
   cl_ml_tuningcache_qcom tuning_cache = NULL;
   bool is_tuning_run;
   char* tuning_file;
diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py
index 19901d733e4c..0cf76079e8fb 100644
--- a/tests/python/contrib/test_clml/infrastructure.py
+++ b/tests/python/contrib/test_clml/infrastructure.py
@@ -73,11 +73,11 @@ class Device:
 
     connection_type = "tracker"
     host = "localhost"
-    port = 9090
+    port = 9150
     target = "opencl"
     target_host = "llvm -mtriple=aarch64-linux-gnu"
-    device_key = ""
-    cross_compile = ""
+    device_key = "android"
+    cross_compile = "aarch64-linux-android-g++"
 
     def __init__(self):
         """Keep remote device for lifetime of object."""
diff --git a/tests/python/contrib/test_clml/test_network.py b/tests/python/contrib/test_clml/test_network.py
index d89676f10e3a..405f5782ff2e 100644
--- a/tests/python/contrib/test_clml/test_network.py
+++ b/tests/python/contrib/test_clml/test_network.py
@@ -22,8 +22,7 @@
 from tvm import relay
 
 import tvm
-from test_clml.infrastructure import skip_runtime_test, build_and_run
-from test_clml.infrastructure import Device
+from test_clml.infrastructure import skip_runtime_test, build_and_run, Device
 
 
 def _build_and_run_network(mod, params, inputs, data, device, atol, rtol):
@@ -86,7 +85,6 @@ def get_model():
         mobilenet = MobileNet(
             include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000
         )
-        mobilenet.load_weights("mobilenet_1_0_224_tf.h5")
         inputs = {mobilenet.input_names[0]: ((1, 3, 224, 224), "float32")}
 
         data = {}
diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py
index 63f5bc168fd0..13f49d152714 100644
--- a/tests/python/contrib/test_clml/test_ops.py
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -212,5 +212,5 @@ def test_batchnorm():
 
 
 if __name__ == "__main__":
-    # test_conv2d()
+    test_conv2d()
     test_batchnorm()

From 62bdc91b1aee1c88dc128273abb637174d0e2071 Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Wed, 7 Sep 2022 22:16:46 -0700
Subject: [PATCH 125/704] Add Arm DSP implementation of Depthwise Conv2D
 (#12448)

---
 python/tvm/relay/op/strategy/arm_cpu.py       |  22 ++
 python/tvm/topi/arm_cpu/depthwise_conv2d.py   |  19 ++
 .../arm_cpu/mprofile/dsp/depthwise_conv2d.py  | 245 ++++++++++++++++++
 .../dsp/micro_kernel/quad_channel_convolve.py | 180 +++++++++++++
 .../strategy/arm_cpu/test_depthwise_conv2d.py |  25 ++
 5 files changed, 491 insertions(+)
 create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
 create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index ba28b6c7c31c..2d9ef99ba8a6 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -235,6 +235,28 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                     wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc),
                     name="depthwise_conv2d_nhwc.arm_cpu",
                 )
+
+            # Optimized special case depthwiseConv2D operation. Requires a 3x3 kernel, a
+            # NHWC layout, a HWOI kernel layout (which we rearrange), no dilation, int8 inputs,
+            # int32 output, the same number of input and output channels, and for that channel
+            # count to be divisible by 4. Additional work could remove these restrictions.
+
+            elif (
+                target.features.has_dsp
+                and kernel.shape[0] == kernel.shape[1] == 3
+                and dilation_w == dilation_h == 1
+                and kernel.shape[3] == 1  # channel_multiplier == 1
+                and data.dtype == "int8"
+                and out_type.dtype == "int32"
+                and data.shape[3] % 4 == 0
+                and (padding != "SAME" or data.shape[1] % stride_h == data.shape[2] % stride_w == 0)
+            ):
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nhwc_dsp),
+                    wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc_dsp),
+                    name="depthwise_conv2d_nhwc_dsp.arm_cpu",
+                )
+
             else:
                 logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.")
                 strategy.add_implementation(
diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
index c21480724ae4..333db3d5e014 100644
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -28,6 +28,11 @@
 from .tensor_intrin import smlal_int16_int32
 from .arm_utils import is_aarch64_arm
 
+from .mprofile.dsp.depthwise_conv2d import (
+    depthwise_conv2d_nhwc_dsp_compute,
+    depthwise_conv2d_nhwc_dsp_schedule,
+)
+
 
 @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu")
 def depthwise_conv2d_nchw(_, data, kernel, strides, padding, dilation, out_dtype):
@@ -699,3 +704,17 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, last):
             s[kernel_vec].parallel(co)
 
     return s
+
+
+@autotvm.register_topi_compute("depthwise_conv2d_nhwc_dsp.arm_cpu")
+def depthwise_conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """Compute conv2d_nhwc with v7e-m DSP instructions."""
+    return depthwise_conv2d_nhwc_dsp_compute(
+        cfg, data, kernel, strides, padding, dilation, out_dtype
+    )
+
+
+@autotvm.register_topi_schedule("depthwise_conv2d_nhwc_dsp.arm_cpu")
+def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs):
+    """Create schedule for conv2d_nhwc_dsp"""
+    return depthwise_conv2d_nhwc_dsp_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
new file mode 100644
index 000000000000..162bf65a21f9
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
@@ -0,0 +1,245 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""ARM Cortex-M DSP schedule for depthwise_conv2d"""
+
+import random
+import string
+
+from tvm import te
+from tvm.topi.utils import traverse_inline, get_const_tuple
+from tvm.topi.nn.pad import pad
+from tvm import tir
+
+from .micro_kernel.quad_channel_convolve import (
+    intrin_quad_channel_convolve,
+    quad_channel_convolve_impl,
+)
+
+# For depthwise_conv2d, kernels are normally given in HWOI format,
+# which when input_channels = output channels, we will call HWC.
+# This is bad, as we want "related" parts of the kernel to be next
+# to each other, so we can use __SMLAD later.
+#
+# Consider a 3x3 int8 kernel with no bias vector, with eight
+# channels. Let us specify entries in the kernel as H_W_C - i.e.
+# where 0_2_3 represents the rightmost position in the first row
+# of channel 4/8 (4 because of zero indexing). Each [ ] represents
+# a 32-bit integer. We currently store the kernel as:
+#
+# 0 ................................31
+# [ 0_0_0 || 0_0_1 || 0_0_2 || 0_0_3 ] [ 0_0_4 || 0_0_5 || 0_0_6 || 0_0_7 ]
+# [ 0_1_0 || 0_1_1 || 0_1_2 || 0_1_3 ] [ 0_1_4 || 0_1_5 || 0_1_6 || 0_1_7 ]
+# [ 0_2_0 || 0_2_1 || 0_2_2 || 0_2_3 ] [ 0_2_4 || 0_2_5 || 0_2_6 || 0_2_7 ]
+# [ 1_0_0 || 1_0_1 || 1_0_2 || 1_0_3 ] [ 1_0_4 || 1_0_5 || 1_0_6 || 1_0_7 ]
+# [ 1_1_0 || 1_1_1 || 1_1_2 || 1_1_3 ] [ 1_1_4 || 1_1_5 || 1_1_6 || 1_1_7 ]
+# [ 1_2_0 || 1_2_1 || 1_2_2 || 1_2_3 ] [ 1_2_4 || 1_2_5 || 1_2_6 || 1_2_7 ]
+# [ 2_0_0 || 2_0_1 || 2_0_2 || 2_0_3 ] [ 2_0_4 || 2_0_5 || 2_0_6 || 2_0_7 ]
+# [ 2_1_0 || 2_1_1 || 2_1_2 || 2_1_3 ] [ 2_1_4 || 2_1_5 || 2_1_6 || 2_1_7 ]
+# [ 2_2_0 || 2_2_1 || 2_2_2 || 2_2_3 ] [ 2_2_4 || 2_2_5 || 2_2_6 || 2_2_7 ]
+#
+# Let 0x00 be all zeros. We rearrange into:
+#
+# 0 ................................31
+# [ 0_0_0 || 0_0_1 || 0_1_0 || 0_1_1 ] [ 0_0_2 || 0_0_3 || 0_1_2 || 0_1_3 ]
+# [ 0_2_0 || 0_2_1 || 1_0_0 || 1_0_1 ] [ 0_2_2 || 0_2_3 || 1_0_2 || 1_0_3 ]
+# [ 1_1_0 || 1_1_1 || 1_2_0 || 1_2_1 ] [ 1_1_2 || 1_1_3 || 1_2_2 || 1_2_3 ]
+# [ 2_0_0 || 2_0_1 || 2_1_0 || 2_1_1 ] [ 2_0_2 || 2_0_3 || 2_1_2 || 2_1_3 ]
+# [ 2_2_0 || 2_2_1 || 0x000 || 0x000 ] [ 2_2_2 || 2_2_3 || 0x000 || 0x000 ]
+# [ 0_0_4 || 0_0_5 || 0_1_4 || 0_1_5 ] [ 0_0_6 || 0_0_7 || 0_1_6 || 0_1_7 ]
+# [ 0_2_4 || 0_2_5 || 1_0_4 || 1_0_5 ] [ 0_2_6 || 0_2_7 || 1_0_6 || 1_0_7 ]
+# [ 1_1_4 || 1_1_5 || 1_2_4 || 1_2_5 ] [ 1_1_6 || 1_1_7 || 1_2_6 || 1_2_7 ]
+# [ 2_0_4 || 2_0_5 || 2_1_4 || 2_1_5 ] [ 2_0_6 || 2_0_7 || 2_1_6 || 2_1_7 ]
+# [ 2_2_4 || 2_2_5 || 0x000 || 0x000 ] [ 2_2_6 || 2_2_7 || 0x000 || 0x000 ]
+#
+# This saves us six operations comapred to the original ordering, as we
+# do not need halfword packing instructions.
+#
+# This kernel re-arranging function will be used for 3x3 kernels (as that
+# is all this DSP implementation currently supports) but would work with
+# any M*N kernel such that M*N is odd.
+
+
+def _rearrange_kernel(kernel):
+    # Kernel must be HWC format.
+    kernel_h, kernel_w, channels, _ = get_const_tuple(kernel.shape)
+    assert channels % 4 == 0
+
+    # This restriction could be removed by only using tir.if_then_else to add padding
+    # zeros if (kernel_w * kernel_h) % 2 == 1, and filling completely otherwise.
+    assert (kernel_w * kernel_h) % 2 == 1
+
+    def fcompute(c_o, pos, c_i):
+        channel = (2 * (pos % 2)) + (c_i % 2) + (4 * c_o)
+        true_pos_index = 2 * (pos // 2) + (c_i // 2)
+
+        return tir.if_then_else(
+            true_pos_index < (kernel_h * kernel_w),
+            kernel[true_pos_index // kernel_w, true_pos_index % kernel_w, channel, 0],
+            tir.const(0, "int8"),
+        )
+
+    return te.compute(
+        (channels // 4, kernel_h * kernel_w + 1, 4),
+        fcompute,
+        name="packed_kernel",
+    )
+
+
+def depthwise_conv2d_nhwc_dsp_compute(_cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """Compute function for v7e-m DSP instructions of DepthwiseConv2D. Has a lot of requirements
+    for use - if not all apply, the fallback implementation will be used instead."""
+    assert isinstance(strides, int) or len(strides) == 2
+    assert isinstance(dilation, int) or len(dilation) == 2
+
+    if isinstance(strides, int):
+        stride_h = stride_w = strides
+    else:
+        stride_h, stride_w = strides
+
+    # We do not support dilation currently. It would be possible, but it would require
+    # modifying the way the kernel is packed. Gnarly.
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+    assert dilation_h == dilation_w == 1
+
+    batch_size, height, width, channels = data.shape
+    kernel_h, kernel_w, _, _ = kernel.shape
+
+    # We require that the number of channels be divisible by 4. This restriction could
+    # be removed with strip mining if people cared.
+    assert channels % 4 == 0
+
+    # We don't support different numbers of input and output channels.
+    assert channels == kernel.shape[2]
+    assert kernel.shape[3] == 1
+
+    # We take in int8 as our dtype, but we spit out int32. This is because we cannot
+    # round until we compute activations.
+    assert out_dtype == "int32"
+
+    # This can pretty easily be generalized in the future. Likely worth doing, and this
+    # function was written to make doing so easy. Should only require adding more calls
+    # to QUAD_CHANNEL_REARRANGE_SUM.
+    assert kernel_w == kernel_h == 3
+
+    # Padding the data requires COPYING THE ENTIRE INPUT TENSOR, which
+    # is slow and bad. We should really implement a strip mining
+    # routine to avoid this, but TVM has terrible support for that.
+
+    if padding == "SAME":
+        # This assumption makes the logic easier. Could be removed with work.
+        assert height % stride_h == width % stride_w == 0
+
+        output_h = height // stride_h
+        output_w = width // stride_w
+
+        # This padding behavior is consistent with other TVM depthwise_conv2d schedules. However it
+        # differs from the TensorFlow, which only pads the bottom right if stride > 1. This probably
+        # brings down accuracy slightly for models imported from TFLite.
+        pad_down = 1 if stride_h == 1 else 0
+        pad_right = 1 if stride_w == 1 else 0
+
+        padded_data = pad(
+            data,
+            [0, kernel_h // 2, kernel_w // 2, 0],
+            [0, pad_down, pad_right, 0],
+            name="padded_data",
+        )
+
+    elif padding == "VALID":
+        assert height > kernel_h and width > kernel_w
+        output_h = (height - kernel_h) // stride_h + 1
+        output_w = (width - kernel_w) // stride_w + 1
+        padded_data = data
+
+    elif isinstance(padding, tuple):
+        if len(padding) == 2:
+            pad_up, pad_down = padding[0]
+            pad_left, pad_right = padding[1]
+        else:
+            pad_up, pad_left, pad_down, pad_right = padding
+
+        output_h = (height - kernel_h + pad_up + pad_down) // stride_h + 1
+        output_w = (width - kernel_w + pad_left + pad_right) // stride_w + 1
+        padded_data = pad(
+            data,
+            [0, pad_up, pad_left, 0],
+            [0, pad_down, pad_right, 0],
+            name="padded_data",
+        )
+
+    else:
+        raise RuntimeError()
+    _, padded_h, padded_w, _ = padded_data.shape
+
+    packed_kernel = _rearrange_kernel(kernel)
+    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
+    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
+    return te.compute(
+        (batch_size, output_h, output_w, channels),
+        lambda h, i, j, k: te.sum(
+            padded_data[h, (i * stride_h) + kh_i, (j * stride_w) + kw_i, k].astype("int32")
+            * packed_kernel[
+                k // 4,
+                (2 * ((3 * kh_i + kw_i) // 2)) + ((k % 4) // 2),
+                (2 * ((kh_i + kw_i) % 2)) + (k % 2),
+            ].astype("int32"),
+            axis=(kh_i, kw_i),
+        ),
+        name="depthwise_conv2d",
+        tag=f"depthwise_conv2d_nhwc_{padded_h}_{padded_w}_dsp",
+    )
+
+
+def depthwise_conv2d_nhwc_dsp_schedule(_cfg, outs):
+
+    """Schedule function for v7e-m DSP instructions of conv2d."""
+    schedule = te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if "depthwise_conv2d_nhwc" not in op.tag:
+            return
+
+        # extract tensors
+        output = op.output(0)
+        padded_data = output.op.input_tensors[0]
+        packed_kernel = output.op.input_tensors[1]
+        kernel = packed_kernel.op.input_tensors[0]
+
+        _, _, padded_w, channels = padded_data.shape
+        kernel_h, kernel_w, _, _ = kernel.shape
+        suffix = "".join(random.choices(string.ascii_uppercase, k=8))
+
+        b_ax, y_ax, x_ax, c_ax = schedule[output].op.axis
+        ky_ax, kx_ax = schedule[output].op.reduce_axis
+        c_ax_o, c_ax_i = schedule[output].split(c_ax, factor=4)
+        schedule[output].reorder(b_ax, c_ax_o, y_ax, x_ax, ky_ax, kx_ax, c_ax_i)
+
+        quad_channel_convolve = intrin_quad_channel_convolve(
+            padded_w, channels, kernel_h, kernel_w, suffix
+        )
+        schedule[output].tensorize(ky_ax, quad_channel_convolve)
+        schedule[output].pragma(
+            b_ax,
+            "import_c",
+            quad_channel_convolve_impl(padded_w, channels, kernel_h, kernel_w, suffix),
+        )
+
+    traverse_inline(schedule, outs[-1].op, _callback)
+    return schedule
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py
new file mode 100644
index 000000000000..960ef8fadc0e
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py
@@ -0,0 +1,180 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""This is a special intrinsic used for depthwise convolution using Cortex-M DSP instructions
+(v7e-m). It takes as inputs an int8 HWC data tensor and an int8 CHWc kernel. This intrinsic "lays"
+the kernel on top of the data tensors starting from a given pointer, performs signed sixteen-bit
+multiplies on each pair of values, and sums all the products in an int32 accumlator. This process is
+repeated four times giving four int32 outputs - one per channel."""
+
+import textwrap
+
+from tvm import te, tir
+
+
+def intrin_quad_channel_convolve(tensor_w, channels, kernel_h, kernel_w, suffix):
+    """Defines a v7e-m DSP-accelerated four-channel convolution."""
+    data_slice = te.placeholder((kernel_h, kernel_w, 4), name="a", dtype="int8")
+
+    if kernel_h * kernel_w % 2 == 1:
+        kernel_length = kernel_h * kernel_w + 1
+    else:
+        kernel_length = kernel_h * kernel_w
+    kernel_slice = te.placeholder((kernel_length, 4), name="b", dtype="int8")
+
+    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
+    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
+
+    output_slice = te.compute(
+        (4,),
+        lambda k: te.sum(
+            data_slice[kh_i, kw_i, k].astype("int32")
+            * kernel_slice[
+                (2 * ((3 * kh_i + kw_i) // 2)) + ((k % 4) // 2),
+                (2 * ((kh_i + kw_i) % 2)) + (k % 2),
+            ].astype("int32"),
+            axis=(kh_i, kw_i),
+        ),
+        name="c",
+    )
+
+    data_buf = tir.decl_buffer(
+        data_slice.shape,
+        data_slice.dtype,
+        name="data",
+        offset_factor=1,
+        strides=[tensor_w * channels, channels, 1],
+    )
+    kernel_buf = tir.decl_buffer(
+        kernel_slice.shape, kernel_slice.dtype, name="kernel", offset_factor=1, strides=[4, 1]
+    )
+    output_buf = tir.decl_buffer(
+        output_slice.shape, output_slice.dtype, name="output", offset_factor=1, strides=[1]
+    )
+
+    def intrin_func(ins, outs):
+        builder = tir.ir_builder.create()
+        builder.emit(
+            tir.call_extern(
+                "int32",
+                f"kernel_convolve_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}",
+                outs[0].access_ptr("w"),
+                ins[0].access_ptr("r"),
+                ins[1].access_ptr("r"),
+            )
+        )
+        return builder.get()
+
+    return te.decl_tensor_intrin(
+        output_slice.op,
+        intrin_func,
+        binds={data_slice: data_buf, kernel_slice: kernel_buf, output_slice: output_buf},
+    )
+
+
+def quad_channel_convolve_impl(tensor_w, channels, kernel_h, kernel_w, suffix):
+    """Emits C code for quad_channel_convolve. Note that while intrin_quad_channel_convolve supports
+    any kernel size, this function only supports 3x3 kernels (this could be fixed with work)."""
+    assert kernel_h == kernel_w == 3
+
+    return textwrap.dedent(
+        (
+            f"""
+        #include <stdint.h>
+        #include <arm_nnsupportfunctions.h>
+
+        // __SXTB16(_ROR(X, Y)) is combined into one assembly instruction
+
+        #define TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP( \
+            arranged_kernel, \
+            tensor_v0_c3210, tensor_v1_c3210, \
+            sum0, sum1, sum2, sum3) {{ \
+          \
+          uint32_t tensor_v0_c20 = __SXTB16(tensor_v0_c3210); \
+          uint32_t tensor_v0_c31 = __SXTB16(__ROR(tensor_v0_c3210, 8)); \
+          uint32_t tensor_v1_c20 = __SXTB16(tensor_v1_c3210); \
+          uint32_t tensor_v1_c31 = __SXTB16(__ROR(tensor_v1_c3210, 8)); \
+          \
+          uint32_t kernel_v1c1_v1c0_v0c1_v0c0 = *arranged_kernel++; \
+          uint32_t kernel_v1c3_v1c2_v0c3_v0c2 = *arranged_kernel++; \
+          \
+          uint32_t kernel_v10_c0 = __SXTB16(kernel_v1c1_v1c0_v0c1_v0c0); \
+          uint32_t kernel_v10_c1 = __SXTB16(__ROR(kernel_v1c1_v1c0_v0c1_v0c0, 8)); \
+          uint32_t kernel_v10_c2 = __SXTB16(kernel_v1c3_v1c2_v0c3_v0c2); \
+          uint32_t kernel_v10_c3 = __SXTB16(__ROR(kernel_v1c3_v1c2_v0c3_v0c2, 8)); \
+          \
+          uint32_t tensor_v10_c0 = __PKHBT(tensor_v0_c20, tensor_v1_c20, 16); \
+          uint32_t tensor_v10_c1 = __PKHBT(tensor_v0_c31, tensor_v1_c31, 16); \
+          uint32_t tensor_v10_c2 = __PKHTB(tensor_v1_c20, tensor_v0_c20, 16); \
+          uint32_t tensor_v10_c3 = __PKHTB(tensor_v1_c31, tensor_v0_c31, 16); \
+          \
+          sum_c0 = __SMLAD(tensor_v10_c0, kernel_v10_c0, sum_c0); \
+          sum_c1 = __SMLAD(tensor_v10_c1, kernel_v10_c1, sum_c1); \
+          sum_c2 = __SMLAD(tensor_v10_c2, kernel_v10_c2, sum_c2); \
+          sum_c3 = __SMLAD(tensor_v10_c3, kernel_v10_c3, sum_c3); \
+        }}
+
+        /* We do four channels at once to get this speed boost. */
+        #ifdef __cplusplus
+        extern "C"
+        #endif
+        int32_t kernel_convolve_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}(
+            uint32_t *out,
+            uint32_t *tensor,
+            uint32_t *packed_kernel) {{
+
+          uint32_t sum_c0 = 0;
+          uint32_t sum_c1 = 0;
+          uint32_t sum_c2 = 0;
+          uint32_t sum_c3 = 0;
+
+          TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP(
+            packed_kernel,
+            *tensor,
+            *(tensor + {channels // 4}),
+            sum_c0, sum_c1, sum_c2, sum_c3)
+          TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP(
+            packed_kernel,
+            *(tensor + {(2) * channels // 4}),
+            *(tensor + {tensor_w * (channels // 4)}),
+            sum_c0, sum_c1, sum_c2, sum_c3)
+          TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP(
+            packed_kernel,
+            *(tensor + {(tensor_w + 1) * (channels // 4)}),
+            *(tensor + {(tensor_w + 2) * (channels // 4)}),
+            sum_c0, sum_c1, sum_c2, sum_c3)
+          TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP(
+            packed_kernel,
+            *(tensor + {(2 * tensor_w) * (channels // 4)}),
+            *(tensor + {(2 * tensor_w + 1) * (channels // 4)}),
+            sum_c0, sum_c1, sum_c2, sum_c3)
+          TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP(
+            packed_kernel,
+            *(tensor + {(2 * tensor_w + 2) * (channels // 4)}),
+            0,
+            sum_c0, sum_c1, sum_c2, sum_c3)
+
+          out[0] = sum_c0;
+          out[1] = sum_c1;
+          out[2] = sum_c2;
+          out[3] = sum_c3;
+          return 0;
+        }}
+
+        #undef TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP
+        """
+        )
+    )
diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
index ee0d51c321f7..18c5082f2a0c 100644
--- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
@@ -147,5 +147,30 @@ class TestDepthwiseConv2d_NHWC_HWOI(BasicDepthwiseConv2dTests):
     schedule_name = tvm.testing.parameter("depthwise_conv2d_nhwc.generic")
 
 
+class TestDepthwiseConv2d_NHWC_HWOI_DSP(BasicDepthwiseConv2dTests):
+    """This test is for depthwise_conv2d_nhwc_dsp.arm_cpu schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
+        # The LLVM implementation doesn't support "SAME" and "VALID" padding,
+        # so padding must be explicitly specified.
+        # Depthwise_conv2d parameters from MobileNetV1 0.25x
+        ((1, 48, 48, 8), (3, 3), 8, (1, 1), 1, 1),
+        ((1, 48, 48, 16), (3, 3), 16, (2, 2), (1, 1, 0, 0), 1),
+        ((1, 24, 24, 32), (3, 3), 32, (1, 1), 1, 1),
+        ((1, 24, 24, 32), (3, 3), 32, (2, 2), (1, 1, 0, 0), 1),
+        ((1, 12, 12, 64), (3, 3), 64, (1, 1), 1, 1),
+        ((1, 12, 12, 64), (3, 3), 64, (2, 2), (1, 1, 0, 0), 1),
+        ((1, 6, 6, 128), (3, 3), 128, (1, 1), 1, 1),
+        ((1, 6, 6, 128), (3, 3), 128, (2, 2), (1, 1, 0, 0), 1),
+        ((1, 3, 3, 256), (3, 3), 256, (1, 1), 1, 1),
+        # Asymmetric height and width
+        ((1, 25, 5, 64), (3, 3), 64, (1, 1), 1, 1),
+    )
+    data_layout = tvm.testing.parameter("NHWC")
+    dtype = tvm.testing.parameter("int8")
+    kernel_layout = tvm.testing.parameter("HWOI")
+    schedule_name = tvm.testing.parameter("depthwise_conv2d_nhwc_dsp.arm_cpu")
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From cd99ca64cb2416219215745c1d478b86776378ed Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Thu, 8 Sep 2022 01:54:10 -0700
Subject: [PATCH 126/704] [Relay] Change when int8 operations are converted to
 int16 on Arm (#12671)

Currently, Relay QNN uses its `helper_no_fast_int8_hw_legalization` to convert most `int8` convolution and dense operations into `int16` ones on Arm. This currently occurs on ARM chips except for `v8.2a` chips with `dotprod` support.

However, this behavior means that `int8` operations are replaced with `int16` ones on Cortex-M chips. On these chips `int16` is substantially slower, as while it saves a few sign extension operations, it doubles the amount of memory loads we need to perform.

This PR changes when `helper_no_fast_int8_hw_legalization` is used on Arm, and instead makes **not** doing this replacement the standard. We will only do this replacement if we are on a chip with ASIMD support but without `v8.2a` and `dotprod`. This ensures that Cortex-M microcontrollers do not have `int8` operations turned into `int16` ones.

I have also verified that this does, in fact, improve performance for some common models. For example, MobileNet_v1_0.25 on the Cortex-M4 saw a 10% performance improvement, compared to before this change. Accuracy does not seem to be affected.
---
 python/tvm/relay/qnn/op/legalizations.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py
index 2fcdaf362a22..9bc6efdad00f 100644
--- a/python/tvm/relay/qnn/op/legalizations.py
+++ b/python/tvm/relay/qnn/op/legalizations.py
@@ -424,7 +424,8 @@ def is_aarch64_arm():
 
 @qnn_conv2d_legalize.register("arm_cpu")
 def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types):
-    # ARM prefers the dtypes to be same.
+    target = tvm.target.Target.current(allow_none=False)
+    has_asimd = is_aarch64_arm() or "+neon" in target.mattr
     is_depthwise = relay.op.strategy.is_depthwise_conv2d(
         types[0].shape,
         attrs["data_layout"],
@@ -432,18 +433,23 @@ def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types):
         attrs["kernel_layout"],
         attrs["groups"],
     )
-    use_int8_on_arm = (not is_depthwise) and is_aarch64_arm() and attrs["data_layout"] == "NHWC"
-    if use_int8_on_arm or is_fast_int8_on_arm():
-        return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d)
-    return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d)
+    use_int8_on_arm = (not is_depthwise) and attrs["data_layout"] == "NHWC"
+    has_dotprod = is_fast_int8_on_arm()
+    other_options = use_int8_on_arm or has_dotprod
+    if has_asimd and not other_options:
+        return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d)
+    # ARM prefers the dtypes to be same.
+    return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d)
 
 
 @qnn_dense_legalize.register("arm_cpu")
 def _qnn_dense_legalize_arm_cpu(attrs, inputs, types):
+    target = tvm.target.Target.current(allow_none=False)
+    has_asimd = is_aarch64_arm() or "+neon" in target.mattr
+    if has_asimd and not is_fast_int8_on_arm():
+        return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.dense)
     # ARM prefers the dtypes to be same.
-    if is_fast_int8_on_arm():
-        return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense)
-    return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.dense)
+    return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense)
 
 
 ##########################

From 2d36e460079f6920ab97a6b2de31fe678895ce62 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Thu, 8 Sep 2022 14:49:55 +0100
Subject: [PATCH 127/704] [CI][AArch64] Mark tests to be skipped due to torch
 crash (#12730)

Some integration tests are not being run on CI due to the
configuration of the machine with onnx and torch not calling
the integration tests script.

This patch skips two more tests failing with the error message
below:

```
"OSError: /.../torch/lib/libgomp-d22c30c5.so.1:
cannot allocate memory in static TLS block"
```
---
 tests/python/driver/tvmc/test_frontends.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py
index 1ccac7696fcc..c1a3be67c208 100644
--- a/tests/python/driver/tvmc/test_frontends.py
+++ b/tests/python/driver/tvmc/test_frontends.py
@@ -237,6 +237,10 @@ def test_load_model___wrong_language__to_onnx(tflite_mobilenet_v1_1_quant):
         tvmc.load(tflite_mobilenet_v1_1_quant, model_format="onnx")
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
+)
 def test_load_model__pth(pytorch_resnet18):
     # some CI environments wont offer torch, so skip in case it is not present
     pytest.importorskip("torch")
@@ -432,6 +436,10 @@ def test_import_tensorflow_friendly_message(pb_mobilenet_v1_1_quant, monkeypatch
         _ = tvmc.frontends.load_model(pb_mobilenet_v1_1_quant, model_format="pb")
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64",
+    reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673",
+)
 def test_import_torch_friendly_message(pytorch_resnet18, monkeypatch):
     monkeypatch.setattr("importlib.import_module", mock_error_on_name("torch"))
 

From 4f4bc26607712adfed539e21916cddc3dc2dd601 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Thu, 8 Sep 2022 14:50:14 +0100
Subject: [PATCH 128/704] [MetaSchedule] Mark two tests as xfail (#12733)

This patch marks two tests as xfail for further investigation:
* test_meta_schedule_integration_extract_from_resnet_with_filter_func
* test_meta_schedule_integration_extract_from_resnet
---
 tests/python/unittest/test_meta_schedule_integration.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py
index 69522831ee55..366a2e4887ed 100644
--- a/tests/python/unittest/test_meta_schedule_integration.py
+++ b/tests/python/unittest/test_meta_schedule_integration.py
@@ -64,6 +64,7 @@ def test_meta_schedule_dynamic_loop_extent():
     assert not extracted_tasks
 
 
+@pytest.mark.xfail(strict=True, reason="See https://github.com/apache/tvm/issues/12732")
 @requires_torch
 def test_meta_schedule_integration_extract_from_resnet():
     mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
@@ -198,6 +199,7 @@ def test_meta_schedule_integration_extract_from_bert_base():
         assert expected_shape == shape, t.task_name
 
 
+@pytest.mark.xfail(strict=True, reason="See https://github.com/apache/tvm/issues/12732")
 @requires_torch
 def test_meta_schedule_integration_extract_from_resnet_with_filter_func():
     @register_func("relay.backend.tir_converter.remove_purely_spatial", override=True)

From ed630122c281f47493e2941a7dc471e201904587 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Thu, 8 Sep 2022 14:50:36 +0100
Subject: [PATCH 129/704] [Test] Add tvm.testing.requires_libtorch (#12737)

Create a specific test dependency to map to USE_LIBTORCH, which
is disabled by deafult, and is independent from torch being
installed on the underlying machine, so it causes problems in
machines that have torch installed but TVM is build with
USE_LIBTORCH OFF.

Mark tests.python.contrib.test_libtorch_ops.test_backend with
this new decorator.
---
 python/tvm/testing/utils.py               | 3 +++
 tests/python/contrib/test_libtorch_ops.py | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 5b70eb06911b..37a27a4213e9 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -945,6 +945,9 @@ def _any_gpu_exists():
 # Mark a test as requiring Arm(R) Ethos(TM)-N to run
 requires_ethosn = Feature("ethosn", "Arm(R) Ethos(TM)-N", cmake_flag="USE_ETHOSN")
 
+# Mark a test as requiring libtorch to run
+requires_libtorch = Feature("libtorch", "LibTorch", cmake_flag="USE_LIBTORCH")
+
 # Mark a test as requiring Hexagon to run
 requires_hexagon = Feature(
     "hexagon",
diff --git a/tests/python/contrib/test_libtorch_ops.py b/tests/python/contrib/test_libtorch_ops.py
index 28ae39c329f5..2bfb78b407aa 100644
--- a/tests/python/contrib/test_libtorch_ops.py
+++ b/tests/python/contrib/test_libtorch_ops.py
@@ -19,6 +19,7 @@
 
 import tvm.relay
 from tvm.relay.op.contrib import torchop
+from tvm.testing import requires_libtorch
 
 import_torch_error = None
 
@@ -30,6 +31,7 @@
 
 
 @pytest.mark.skipif(torch is None, reason=f"PyTorch is not available: {import_torch_error}")
+@requires_libtorch
 def test_backend():
     @torch.jit.script
     def script_fn(x, y):

From b2bd434ef944315a6f241803ac03c59c9aaa9847 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 8 Sep 2022 08:02:42 -0700
Subject: [PATCH 130/704] [TIR] Handle axis_separators during FlattenBuffer
 (#12652)

* [TIR] Moved tir.FlattenBuffer to occur before tir.LowerOpaqueBlock

For buffers with more than one physical axis, the `axis_separators`
are required in order to know which groups of logical axes to fuse
into each physical axis.  The implementation in `tir.FlattenBuffer`
assumed that all buffers were being flattened to a single physical
axis.  Because `tir.LowerOpaqueBlock` replaces the
`BlockNode::alloc_buffers` with `Allocate` nodes, `tir.FlattenBuffer`
no longer has access to the axis separators and performs inconsistent
flattening for `Allocate` as opposed to `BufferLoad`/`BufferStore`.
This was introduced in https://github.com/apache/tvm/pull/12172, which
decoupled the lowering/flattening steps.

The commit reorders the `tir.FlattenBuffer` to occur before
`tir.LowerOpaqueBlock`, to make use of the axis separators.  Any
`Allocate` nodes that exist at that point (e.g. from hand-written
schedules) are still flattened to 1-d physical buffers, but the
`BlockNode::alloc_buffers` are flattened according to the axis
separators.

* Add unit test to validate non-flat memory after tvm.lower

* Explicitly write T.reads for test on BufferRegion updates

* Update incorrect docstring for test

* Use DeclBuffer information in FlattenBuffer

The DeclBuffer node can be inserted during LowerOpaqueBlock, then
provide the missing Buffer information required to flatten the
allocation.

* Use T.allocate in unit tests

With the insertion of `DeclBuffer` nodes, `LowerOpaqueBlock` no longer
needs to be before `FlattenBuffer`, and has been moved back to its
original position.  Revering the tests to use `T.allocate` instead of
`T.alloc_buffer` more closely represents the functions as they are
being lowered.

* Fix usage of T.decl_buffer in updated tests

* Update LowerOpaqueBuffer to expect the DeclBuffer nodes

* Strip DeclBuffer annotation in FlattenBuffer

The DeclBuffer annotations aren't yet supported in all passes.  This
restricts them to being introduced in LowerOpaqueBuffer, then
immediately removed in FlattenBuffer.

* Strip out all DeclBuffer nodes in FlattenBuffer

* Update unit tests to remove expectation of DeclBuffer nodes
---
 src/tir/transforms/flatten_buffer.cc          | 123 ++++-
 src/tir/transforms/lower_opaque_block.cc      |   1 +
 .../test_tir_transform_flatten_buffer.py      | 502 ++++++++++--------
 .../test_tir_transform_lower_opaque_block.py  |  22 +-
 4 files changed, 417 insertions(+), 231 deletions(-)

diff --git a/src/tir/transforms/flatten_buffer.cc b/src/tir/transforms/flatten_buffer.cc
index 22aef136bcff..5441120491c6 100644
--- a/src/tir/transforms/flatten_buffer.cc
+++ b/src/tir/transforms/flatten_buffer.cc
@@ -21,6 +21,7 @@
  * \file flatten_buffer.cc
  */
 
+#include <tvm/tir/analysis.h>
 #include <tvm/tir/stmt_functor.h>
 #include <tvm/tir/transform.h>
 
@@ -53,6 +54,34 @@ class BufferFlattener : public StmtExprMutator {
     }
   }
 
+  Stmt VisitStmt_(const BlockNode* op) final {
+    ICHECK_EQ(op->match_buffers.size(), 0)
+        << "Unexpected MatchBufferRegion found during tir.transform.FlattenBuffer.  "
+        << "All MatchBufferRegion should be removed in tir.transform.LowerMatchBuffer.";
+
+    Block block = GetRef<Block>(op);
+
+    Array<Buffer> alloc_buffers = op->alloc_buffers;
+    alloc_buffers.MutateByApply([this](Buffer buf) { return GetFlattenedBuffer(buf); });
+    if (!alloc_buffers.same_as(op->alloc_buffers)) {
+      block.CopyOnWrite()->alloc_buffers = alloc_buffers;
+    }
+
+    Array<BufferRegion> reads = op->reads;
+    reads.MutateByApply([this](BufferRegion region) { return MutateBufferRegion(region); });
+    if (!reads.same_as(op->reads)) {
+      block.CopyOnWrite()->reads = reads;
+    }
+
+    Array<BufferRegion> writes = op->writes;
+    writes.MutateByApply([this](BufferRegion region) { return MutateBufferRegion(region); });
+    if (!writes.same_as(op->writes)) {
+      block.CopyOnWrite()->writes = writes;
+    }
+
+    return StmtExprMutator::VisitStmt_(block.get());
+  }
+
   Stmt VisitStmt_(const AllocateNode* op) final {
     Allocate alloc = Downcast<Allocate>(StmtExprMutator::VisitStmt_(op));
     // TODO(Lunderberg): Move the handling of boolean into a
@@ -61,18 +90,70 @@ class BufferFlattener : public StmtExprMutator {
       auto writer = alloc.CopyOnWrite();
       writer->dtype = DataType::Int(8);
     }
-    // Handle multi-dimension allocations
+
     if (alloc->extents.size() == 1) {
-      return std::move(alloc);
-    } else {
-      Array<PrimExpr> flat_extent(static_cast<size_t>(1), 1);
-      for (size_t i = 0; i < alloc->extents.size(); i++) {
-        flat_extent.Set(0, flat_extent[0] * alloc->extents[i]);
+      // No flattening required for buffers that are already flat
+
+      // TODO(rfc-70): Keep the DeclBuffer node as-is.  Stripping it
+      // out in the current implementation as not all lowering passes
+      // support DeclBuffer.
+      if (auto* decl_buffer = alloc->body.as<DeclBufferNode>()) {
+        alloc.CopyOnWrite()->body = std::move(decl_buffer->body);
       }
-      auto n = alloc.CopyOnWrite();
-      n->extents = flat_extent;
+
       return std::move(alloc);
     }
+
+    if (auto* decl_buffer = alloc->body.as<DeclBufferNode>();
+        decl_buffer && decl_buffer->buffer->data.same_as(alloc->buffer_var)) {
+      // N-d buffer, use the DeclBuffer inside to determine how it
+      // should be flattened.
+      auto& buffer = decl_buffer->buffer;
+      bool matching_buffer = [&]() {
+        if (alloc->dtype != buffer->dtype) {
+          return false;
+        }
+        if (alloc->extents.size() != buffer->shape.size()) {
+          return false;
+        }
+        ExprDeepEqual expr_equal;
+        for (size_t i = 0; i < alloc->extents.size(); i++) {
+          if (!expr_equal(alloc->extents[i], buffer->shape[i])) {
+            return false;
+          }
+        }
+        return true;
+      }();
+
+      if (matching_buffer) {
+        Buffer flattened = GetFlattenedBuffer(buffer);
+
+        auto n = alloc.CopyOnWrite();
+        // TODO(rfc-70): Update the DeclBuffer node instead of
+        // stripping it out.  Stripping it out in the current
+        // implementation as not all lowering passes support
+        // DeclBuffer.
+        //
+        // n->body = DeclBuffer(flattened, std::move(decl_buffer->body));
+        n->body = std::move(decl_buffer->body);
+        n->extents = flattened->shape;
+        return std::move(alloc);
+      } else {
+        ICHECK(decl_buffer->buffer->axis_separators.empty())
+            << "DeclBuffer node doesn't match Allocate extents, but also shouldn't be "
+               "flattened to 1-d physical memory";
+      }
+    }
+
+    // Fallback, this is an allocation without a matching DeclBuffer
+    PrimExpr flat_extent = 1;
+    for (const auto& dim : alloc->extents) {
+      flat_extent *= dim;
+    }
+
+    auto n = alloc.CopyOnWrite();
+    n->extents = {flat_extent};
+    return std::move(alloc);
   }
 
   Buffer GetFlattenedBuffer(Buffer buf) {
@@ -141,6 +222,32 @@ class BufferFlattener : public StmtExprMutator {
     return node;
   }
 
+  BufferRegion MutateBufferRegion(BufferRegion region) {
+    Buffer orig_buf = region->buffer;
+    Buffer flattened_buf = GetFlattenedBuffer(orig_buf);
+    if (flattened_buf.same_as(orig_buf)) {
+      return region;
+    }
+
+    Array<PrimExpr> min_values;
+    Array<PrimExpr> max_values;
+    for (const auto& range : region->region) {
+      min_values.push_back(range->min);
+      max_values.push_back(range->min + range->extent - 1);
+    }
+
+    Array<PrimExpr> flattened_min = orig_buf->ElemOffset(min_values);
+    Array<PrimExpr> flattened_max = orig_buf->ElemOffset(max_values);
+
+    Array<Range> flattened_ranges;
+    ICHECK_EQ(flattened_min.size(), flattened_max.size());
+    for (size_t i = 0; i < flattened_min.size(); i++) {
+      flattened_ranges.push_back(Range(flattened_min[i], flattened_max[i] + 1));
+    }
+
+    return BufferRegion(flattened_buf, flattened_ranges);
+  }
+
   /*! \brief Map of buffers being remapped. */
   std::unordered_map<Buffer, Buffer, ObjectPtrHash, ObjectPtrEqual> buffer_remap_;
 
diff --git a/src/tir/transforms/lower_opaque_block.cc b/src/tir/transforms/lower_opaque_block.cc
index a4655ebbaed5..ce74fdc4c17b 100644
--- a/src/tir/transforms/lower_opaque_block.cc
+++ b/src/tir/transforms/lower_opaque_block.cc
@@ -57,6 +57,7 @@ class OpaqueBlockLower : public StmtExprMutator {
           new_shape.Set(i, buffer->strides[i - 1] / buffer->strides[i]);
         }
       }
+      body = DeclBuffer(buffer, std::move(body));
       body = Allocate(buffer->data, buffer->dtype, new_shape, const_true(), std::move(body));
     }
     // Step 4. Handle annotations, block annotations are not preserved by default.
diff --git a/tests/python/unittest/test_tir_transform_flatten_buffer.py b/tests/python/unittest/test_tir_transform_flatten_buffer.py
index 4cdf71889eee..870208499e7a 100644
--- a/tests/python/unittest/test_tir_transform_flatten_buffer.py
+++ b/tests/python/unittest/test_tir_transform_flatten_buffer.py
@@ -20,223 +20,307 @@
 from tvm.script import tir as T
 
 
-def _check(original, transformed):
-    func = original
-    mod = tvm.IRModule.from_expr(func)
-    mod = tvm.tir.transform.FlattenBuffer()(mod)
-    mod = tvm.tir.transform.Simplify()(mod)
-    tvm.ir.assert_structural_equal(mod["main"], transformed, True)
-
-
-@T.prim_func
-def elementwise_func(a: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
-    C = T.match_buffer(c, (16, 16), "float32")
-    for i in T.serial(0, 16):
-        B_new_data = T.allocate([1, 16], "float32", "global")
-        B_new = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_new_data)
-        for j in T.serial(0, 16):
-            B_new[0, j] = A[i, j] + 1.0
-        for j in T.serial(0, 16):
-            C[i, j] = B_new[0, j] * 2.0
-
-
-@T.prim_func
-def flattened_elementwise_func(a: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, 256, "float32")
-    C = T.match_buffer(c, 256, "float32")
-    T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
-    T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)
-    for i in T.serial(0, 16):
-        B_new_data = T.allocate([16], "float32", "global")
-        B_new = T.buffer_decl(shape=[16], dtype="float32", data=B_new_data)
-        for j in T.serial(0, 16):
-            B_new[j] = A[((i * 16) + j)] + 1.0
-        for j in T.serial(0, 16):
-            C[((i * 16) + j)] = B_new[j] * 2.0
-
-
-@T.prim_func
-def gpu_func(a: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
-    C = T.match_buffer(c, (16, 16), "float32")
-
-    i0 = T.env_thread("blockIdx.x")
-    i1 = T.env_thread("threadIdx.x")
-    i2 = T.env_thread("vthread")
-
-    T.launch_thread(i0, 4)
-    T.launch_thread(i1, 2)
-    T.launch_thread(i2, 2)
-    B_data = T.allocate([1, 16], "float32", "local")
-    B = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_data, scope="local")
-    for j in range(0, 16):
-        B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0
-    for j in range(0, 16):
-        C[i0 * 4 + i1 * 2 + i2, j] = B[0, j] * 2.0
-
-
-@T.prim_func
-def flattened_gpu_func(a: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, 256, "float32")
-    C = T.match_buffer(c, 256, "float32")
-    T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
-    T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)
-
-    i0 = T.env_thread("blockIdx.x")
-    i1 = T.env_thread("threadIdx.x")
-    i2 = T.env_thread("vthread")
-
-    T.launch_thread(i0, 4)
-    T.launch_thread(i1, 2)
-    T.launch_thread(i2, 2)
-    B_data = T.allocate([16], "float32", "local")
-    B = T.buffer_decl(shape=[16], dtype="float32", data=B_data, scope="local")
-    for j in range(0, 16):
-        B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + 1.0
-    for j in range(0, 16):
-        C[i0 * 64 + i1 * 32 + i2 * 16 + j] = B[j] * 2.0
-
-
-@T.prim_func
-def symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None:
-    A = T.match_buffer(a, (n, m), "float32")
-    C = T.match_buffer(c, (n, m), "float32")
-
-    for i in range(0, n):
-        B_data = T.allocate([m], "float32", "global")
-        B = T.buffer_decl(shape=[m], dtype="float32", data=B_data)
-        for j in range(0, m):
-            B[j] = A[i, j] + 1.0
-        for j in range(0, m):
-            C[i, j] = B[j] * 2.0
-
-
-@T.prim_func
-def flattened_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None:
-    A = T.match_buffer(a, n * m, "float32")
-    C = T.match_buffer(c, n * m, "float32")
-    T.preflattened_buffer(A, (n, m), "float32", data=A.data)
-    T.preflattened_buffer(C, (n, m), "float32", data=C.data)
-
-    for i in range(0, n):
-        B_data = T.allocate([m], "float32", "global")
-        B = T.buffer_decl(shape=[m], dtype="float32", data=B_data)
-        for j in range(0, m):
-            B[j] = A[i * m + j] + 1.0
-        for j in range(0, m):
-            C[i * m + j] = B[j] * 2.0
-
-
-@T.prim_func
-def multi_alloc_func(a: T.handle, d: T.handle) -> None:
-    A = T.match_buffer(a, (4, 32), "float32")
-    D = T.match_buffer(d, (4, 32), "float32")
-
-    for i, j in T.grid(4, 32):
-        B_data = T.allocate((4, 32), "float32", scope="global")
-        B = T.buffer_decl(shape=(4, 32), dtype="float32", data=B_data)
-        C_data = T.allocate((4, 32), "float32", scope="global")
-        C = T.buffer_decl(shape=(4, 32), dtype="float32", data=C_data)
-        B[i, j] = A[i, j] + 1.0
-        C[i, j] = A[i, j] + B[i, j]
-        D[i, j] = C[i, j] * 2.0
-
-
-@T.prim_func
-def flattened_multi_alloc_func(a: T.handle, d: T.handle) -> None:
-    A = T.match_buffer(a, 128, "float32")
-    D = T.match_buffer(d, 128, "float32")
-    T.preflattened_buffer(A, (4, 32), "float32", data=A.data)
-    T.preflattened_buffer(D, (4, 32), "float32", data=D.data)
-
-    for i, j in T.grid(4, 32):
-        B_data = T.allocate([128], "float32", "global")
-        B = T.buffer_decl(shape=[128], dtype="float32", data=B_data)
-        C_data = T.allocate([128], "float32", "global")
-        C = T.buffer_decl(shape=[128], dtype="float32", data=C_data)
-        B[i * 32 + j] = A[i * 32 + j] + 1.0
-        C[i * 32 + j] = A[i * 32 + j] + B[i * 32 + j]
-        D[i * 32 + j] = C[i * 32 + j] * 2.0
-
-
-@T.prim_func
-def strided_buffer_func(a: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
-    C = T.match_buffer(c, (16, 16), "float32")
-    for i0 in T.serial(4):
-        B_data = T.allocate([4, 17], "float32", "global")
-        B = T.buffer_decl(shape=[4, 17], dtype="float32", data=B_data)
-        B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1])
-        for i1, j in T.grid(4, 16):
-            B_1[i1, j] = A[i0 * 4 + i1, j] + 1.0
-        for i1, j in T.grid(4, 16):
-            C[i0 * 4 + i1, j] = B_1[i1, j] * 2.0
-
-
-@T.prim_func
-def flattened_strided_buffer_func(a: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, (256,), "float32")
-    C = T.match_buffer(c, (256,), "float32")
-    T.preflattened_buffer(A, [16, 16], dtype="float32", data=A.data)
-    T.preflattened_buffer(C, [16, 16], dtype="float32", data=C.data)
-    for i0 in T.serial(0, 4):
-        B_new_data = T.allocate([68], "float32", "global")
-        B_new = T.buffer_decl(shape=[68], dtype="float32", data=B_new_data)
-        for i1 in T.serial(0, 4):
-            for j in T.serial(0, 16):
-                B_new[i1 * 17 + j] = A[i0 * 64 + i1 * 16 + j] + 1.0
-        for i1 in T.serial(0, 4):
-            for j in T.serial(0, 16):
-                C[i0 * 64 + i1 * 16 + j] = B_new[i1 * 17 + j] * 2.0
-
-
-@T.prim_func
-def boolean_handling_before(a: T.Buffer[10, "bool"], b: T.Buffer[10, "bool"]) -> None:
-    for i0 in T.serial(10):
-        b[i0] = a[i0]
-
-
-@T.prim_func
-def boolean_handling_after(a: T.Buffer[10, "int8"], b: T.Buffer[10, "int8"]) -> None:
-    T.preflattened_buffer(a, [10], dtype="bool", data=a.data)
-    T.preflattened_buffer(b, [10], dtype="bool", data=b.data)
-    # body
-    for i0 in T.serial(10):
-        b[i0] = T.cast(T.cast(a[i0], "bool"), "int8")
-
-
-def test_elementwise():
-    _check(elementwise_func, flattened_elementwise_func)
-
-
-def test_gpu_workload():
-    _check(gpu_func, flattened_gpu_func)
+class BaseCompare(tvm.testing.CompareBeforeAfter):
+    transform = tvm.transform.Sequential(
+        [
+            tvm.tir.transform.FlattenBuffer(),
+            tvm.tir.transform.Simplify(),
+        ]
+    )
 
 
-def test_symbolic_shape():
-    _check(symbolic_func, flattened_symbolic_func)
-
-
-def test_multi_alloc():
-    _check(multi_alloc_func, flattened_multi_alloc_func)
+class TestElementwise(BaseCompare):
+    """2-d buffers are flattened to 1-d"""
 
+    def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]):
+        for i in T.serial(0, 16):
+            B_new = T.decl_buffer([1, 16], "float32")
+            for j in T.serial(0, 16):
+                B_new[0, j] = A[i, j] + 1.0
+            for j in T.serial(0, 16):
+                C[i, j] = B_new[0, j] * 2.0
+
+    def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]):
+        T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
+        T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)
+        for i in T.serial(0, 16):
+            B_new_data = T.allocate([16], "float32", scope="global")
+            B_new = T.buffer_decl([16], "float32", scope="global", data=B_new_data)
+            for j in T.serial(0, 16):
+                B_new[j] = A[((i * 16) + j)] + 1.0
+            for j in T.serial(0, 16):
+                C[((i * 16) + j)] = B_new[j] * 2.0
 
-def test_strided_buffer():
-    _check(strided_buffer_func, flattened_strided_buffer_func)
 
+class TestElementwiseWithoutDeclBuffer(BaseCompare):
+    """2-d buffers are flattened to 1-d
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.FlattenBuffer()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # FlattenBuffer should do nothing on TE
+    Like TestElementwise, but the TIR doesn't have the DeclBuffer
+    node.  The T.buffer_decl declaration applies only during the
+    parsing the TVMScript, and doesn't occur in the TIR itself.  In
+    this case, the allocation should be assumed to be targeting flat
+    memory, and should be flattened to a 1-d allocation.
+    """
 
+    def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]):
+        for i in T.serial(0, 16):
+            B_new_data = T.allocate([1, 16], "float32", "global")
+            B_new = T.buffer_decl([1, 16], "float32", data=B_new_data)
+            for j in T.serial(0, 16):
+                B_new[0, j] = A[i, j] + 1.0
+            for j in T.serial(0, 16):
+                C[i, j] = B_new[0, j] * 2.0
+
+    def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]):
+        T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
+        T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)
+        for i in T.serial(0, 16):
+            B_new_data = T.allocate([16], "float32", "global")
+            B_new = T.buffer_decl(16, "float32", data=B_new_data)
+            for j in T.serial(0, 16):
+                B_new[j] = A[((i * 16) + j)] + 1.0
+            for j in T.serial(0, 16):
+                C[((i * 16) + j)] = B_new[j] * 2.0
+
+
+class TestGPU(BaseCompare):
+    """Buffer flattening may have indices based on GPU thread vars"""
+
+    def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]):
+        i0 = T.env_thread("blockIdx.x")
+        i1 = T.env_thread("threadIdx.x")
+        i2 = T.env_thread("vthread")
+
+        T.launch_thread(i0, 4)
+        T.launch_thread(i1, 2)
+        T.launch_thread(i2, 2)
+        B = T.decl_buffer([1, 16], "float32", scope="local")
+        for j in range(0, 16):
+            B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0
+        for j in range(0, 16):
+            C[i0 * 4 + i1 * 2 + i2, j] = B[0, j] * 2.0
+
+    def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]):
+        T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
+        T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)
+
+        i0 = T.env_thread("blockIdx.x")
+        i1 = T.env_thread("threadIdx.x")
+        i2 = T.env_thread("vthread")
+
+        T.launch_thread(i0, 4)
+        T.launch_thread(i1, 2)
+        T.launch_thread(i2, 2)
+        B_data = T.allocate([16], "float32", scope="local")
+        B = T.buffer_decl([16], "float32", scope="local", data=B_data)
+        for j in range(0, 16):
+            B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + 1.0
+        for j in range(0, 16):
+            C[i0 * 64 + i1 * 32 + i2 * 16 + j] = B[j] * 2.0
+
+
+class TestSymbolic(BaseCompare):
+    """Dynamically-sized arrrays are flattened"""
+
+    def before(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None:
+        A = T.match_buffer(a, (n, m), "float32")
+        C = T.match_buffer(c, (n, m), "float32")
+
+        for i in range(0, n):
+            B = T.decl_buffer([m], "float32")
+            for j in range(0, m):
+                B[j] = A[i, j] + 1.0
+            for j in range(0, m):
+                C[i, j] = B[j] * 2.0
+
+    def expected(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None:
+        A = T.match_buffer(a, n * m, "float32")
+        C = T.match_buffer(c, n * m, "float32")
+        T.preflattened_buffer(A, (n, m), "float32", data=A.data)
+        T.preflattened_buffer(C, (n, m), "float32", data=C.data)
+
+        for i in range(0, n):
+            B_data = T.allocate([m], "float32", scope="global")
+            B = T.buffer_decl([m], "float32", scope="global", data=B_data)
+            for j in range(0, m):
+                B[j] = A[i * m + j] + 1.0
+            for j in range(0, m):
+                C[i * m + j] = B[j] * 2.0
+
+
+class TestMultiAlloc(BaseCompare):
+    """If multiple allocations occur, all are flattened."""
+
+    def before(A: T.Buffer[(4, 32), "float32"], D: T.Buffer[(4, 32), "float32"]):
+        for i, j in T.grid(4, 32):
+            B = T.decl_buffer((4, 32), "float32", scope="global")
+            C = T.decl_buffer((4, 32), "float32", scope="global")
+            B[i, j] = A[i, j] + 1.0
+            C[i, j] = A[i, j] + B[i, j]
+            D[i, j] = C[i, j] * 2.0
+
+    def expected(A: T.Buffer[128, "float32"], D: T.Buffer[128, "float32"]):
+        T.preflattened_buffer(A, (4, 32), "float32", data=A.data)
+        T.preflattened_buffer(D, (4, 32), "float32", data=D.data)
+
+        for i, j in T.grid(4, 32):
+            B_data = T.allocate([128], "float32", scope="global")
+            B = T.buffer_decl([128], "float32", scope="global", data=B_data)
+            C_data = T.allocate([128], "float32", scope="global")
+            C = T.buffer_decl([128], "float32", scope="global", data=C_data)
+            B[i * 32 + j] = A[i * 32 + j] + 1.0
+            C[i * 32 + j] = A[i * 32 + j] + B[i * 32 + j]
+            D[i * 32 + j] = C[i * 32 + j] * 2.0
+
+
+class TestStrided(BaseCompare):
+    """Indices for flattened buffers use the specified striding."""
+
+    def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]):
+        for i0 in T.serial(4):
+            B = T.decl_buffer([4, 17], "float32")
+            B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1])
+            for i1, j in T.grid(4, 16):
+                B_1[i1, j] = A[i0 * 4 + i1, j] + 1.0
+            for i1, j in T.grid(4, 16):
+                C[i0 * 4 + i1, j] = B_1[i1, j] * 2.0
+
+    def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]):
+        T.preflattened_buffer(A, [16, 16], dtype="float32", data=A.data)
+        T.preflattened_buffer(C, [16, 16], dtype="float32", data=C.data)
+        for i0 in T.serial(0, 4):
+            B_new_data = T.allocate([68], "float32", scope="global")
+            B_new = T.buffer_decl([68], "float32", scope="global", data=B_new_data)
+            for i1 in T.serial(0, 4):
+                for j in T.serial(0, 16):
+                    B_new[i1 * 17 + j] = A[i0 * 64 + i1 * 16 + j] + 1.0
+            for i1 in T.serial(0, 4):
+                for j in T.serial(0, 16):
+                    C[i0 * 64 + i1 * 16 + j] = B_new[i1 * 17 + j] * 2.0
+
+
+class TestBoolean(BaseCompare):
+    """Boolean buffers should be replaced by a backing int8 array"""
+
+    def before(A: T.Buffer[10, "bool"], B: T.Buffer[10, "bool"]) -> None:
+        for i0 in T.serial(10):
+            B[i0] = A[i0]
+
+    def expected(A: T.Buffer[10, "int8"], B: T.Buffer[10, "int8"]) -> None:
+        T.preflattened_buffer(A, [10], dtype="bool", data=A.data)
+        T.preflattened_buffer(B, [10], dtype="bool", data=B.data)
+        # body
+        for i0 in T.serial(10):
+            B[i0] = T.cast(T.cast(A[i0], "bool"), "int8")
+
+
+class TestLowerTE(BaseCompare):
+    """FlattenBuffer should do nothing on TE-based functions"""
+
+    def before(self):
+        x = te.placeholder((1,))
+        y = te.compute((1,), lambda i: x[i] + 2)
+        s = te.create_schedule(y.op)
+        mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
+        return mod["main"]
+
+    expected = before
+
+
+class TestFlattenInsideBlock(BaseCompare):
+    """Flattening access inside a block flattens the accessed region."""
+
+    def before():
+        A = T.alloc_buffer([32, 32])
+        for i, j in T.grid(32, 32):
+            with T.block("block"):
+                T.reads(A[i, j])
+                T.evaluate(A[i, j])
+
+    def expected():
+        A = T.alloc_buffer([1024])
+        for i, j in T.grid(32, 32):
+            with T.block("block"):
+                T.reads(A[i * 32 + j])
+                T.evaluate(A[i * 32 + j])
+
+
+class TestNoChangeTo2DPhysicalBuffer(BaseCompare):
+    """Flattening preserves axis separators."""
+
+    def before():
+        A = T.alloc_buffer([32, 32], axis_separators=[1])
+        for i, j in T.grid(32, 32):
+            T.evaluate(A[i, j])
+
+    expected = before
+
+
+class TestFlattenAllocBufferWithAxisSeparators(BaseCompare):
+    """Flattening preserves axis separators"""
+
+    def before():
+        A = T.alloc_buffer([2, 3, 5, 7, 11, 13], axis_separators=[3])
+        for i0, i1, i2, i3, i4, i5 in T.grid(2, 3, 5, 7, 11, 13):
+            T.evaluate(A[i0, i1, i2, i3, i4, i5])
+
+    def expected():
+        A = T.alloc_buffer([30, 1001], axis_separators=[1])
+        for i0, i1, i2, i3, i4, i5 in T.grid(2, 3, 5, 7, 11, 13):
+            T.evaluate(A[i0 * 15 + i1 * 5 + i2, i3 * 143 + i4 * 13 + i5])
+
+
+class TestFlattenDeclBufferWithAxisSeparators(BaseCompare):
+    """Flattening preserves axis separators
+
+    Like TestFlattenAllocBufferWithAxisSeparators, but the allocations
+    is done using Allocate/DeclBuffer, rather than through
+    BlockNode::alloc_buffers.
+    """
+
+    def before():
+        A = T.decl_buffer([2, 3, 5, 7, 11, 13], axis_separators=[3])
+        for i0, i1, i2, i3, i4, i5 in T.grid(2, 3, 5, 7, 11, 13):
+            T.evaluate(A[i0, i1, i2, i3, i4, i5])
+
+    def expected():
+        A_data = T.allocate([30, 1001], dtype="float32", scope="global")
+        A = T.buffer_decl(
+            [30, 1001], dtype="float32", scope="global", axis_separators=[1], data=A_data
+        )
+        for i0, i1, i2, i3, i4, i5 in T.grid(2, 3, 5, 7, 11, 13):
+            T.evaluate(A[i0 * 15 + i1 * 5 + i2, i3 * 143 + i4 * 13 + i5])
+
+
+def test_lower_2d_physical_memory():
+    """Axis separators should preserve 2-d buffers through lowering.
 
-def test_boolean_handling():
-    _check(boolean_handling_before, boolean_handling_after)
+    A catch-all test to ensure that defining axis_separators is
+    sufficient to maintain non-flat buffer descriptions through all
+    lowering steps.
+    """
+
+    # This test doesn't use CompareBeforeAfter, because the after step
+    # is not currently expressible in TVMScript.  This test can be
+    # re-written after https://github.com/apache/tvm/pull/12412.
+
+    @T.prim_func
+    def func():
+        buf = T.alloc_buffer(
+            [1, 1],
+            dtype="int32",
+            scope="global",
+            axis_separators=[1],
+        )
+        buf[0, 0] = 0
+
+    lowered = tvm.lower(func)["main"]
+    assert isinstance(lowered.body, tvm.tir.Allocate)
+    assert list(lowered.body.extents) == [1, 1], (
+        "Non-flat buffer allocations, "
+        "marked by axis_separators, "
+        "flattened to flat memory allocation."
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_tir_transform_lower_opaque_block.py b/tests/python/unittest/test_tir_transform_lower_opaque_block.py
index f8f3e3a5aced..824cef174055 100644
--- a/tests/python/unittest/test_tir_transform_lower_opaque_block.py
+++ b/tests/python/unittest/test_tir_transform_lower_opaque_block.py
@@ -54,8 +54,7 @@ def transformed_elementwise_func(a: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (16, 16), "float32")
     C = T.match_buffer(c, (16, 16), "float32")
     for i in T.serial(0, 16):
-        B_new_data = T.allocate([1, 16], "float32", "global")
-        B_new = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_new_data)
+        B_new = T.decl_buffer(shape=[1, 16], dtype="float32")
         for j in T.serial(0, 16):
             B_new[0, j] = A[i, j] + 1.0
         for j in T.serial(0, 16):
@@ -97,8 +96,7 @@ def transformed_gpu_func(a: T.handle, c: T.handle) -> None:
     T.launch_thread(i0, 4)
     T.launch_thread(i1, 2)
     T.launch_thread(i2, 2)
-    B_data = T.allocate([1, 16], "float32", "local")
-    B = T.buffer_decl(shape=[1, 16], dtype="float32", scope="local", data=B_data)
+    B = T.decl_buffer(shape=[1, 16], dtype="float32", scope="local")
     for j in range(0, 16):
         B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0
     for j in range(0, 16):
@@ -133,8 +131,7 @@ def transformed_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32)
     C = T.match_buffer(c, (n, m), "float32")
 
     for i in range(0, n):
-        B_data = T.allocate([m], "float32", "global")
-        B = T.buffer_decl(shape=[m], dtype="float32", data=B_data)
+        B = T.decl_buffer(shape=[m], dtype="float32")
         for j in range(0, m):
             B[j] = A[i, j] + 1.0
         for j in range(0, m):
@@ -207,10 +204,8 @@ def transformed_multi_alloc_func(a: T.handle, d: T.handle) -> None:
     D = T.match_buffer(d, (32), "float32")
 
     for i in range(0, 32):
-        B_data = T.allocate((32,), "float32", "global")
-        B = T.buffer_decl(shape=(32,), dtype="float32", data=B_data)
-        C_data = T.allocate((32,), "float32", "global")
-        C = T.buffer_decl(shape=(32,), dtype="float32", data=C_data)
+        B = T.decl_buffer(shape=(32,), dtype="float32")
+        C = T.decl_buffer(shape=(32,), dtype="float32")
         B[i] = A[i] + 1.0
         C[i] = A[i] + B[i]
         D[i] = C[i] * 2.0
@@ -246,12 +241,11 @@ def transformed_strided_buffer_func(
     # body
     for i0 in T.serial(4):
         B_data = T.allocate([4, 17], "float32", "global")
-        B = T.buffer_decl(shape=[4, 17], dtype="float32", data=B_data)
-        B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1])
+        B = T.decl_buffer(shape=[4, 16], dtype="float32", strides=[17, 1], data=B_data)
         for i1, j in T.grid(4, 16):
-            B_1[i1, j] = A[i0 * 4 + i1, j] + T.float32(1)
+            B[i1, j] = A[i0 * 4 + i1, j] + T.float32(1)
         for i1, j in T.grid(4, 16):
-            C[i0 * 4 + i1, j] = B_1[i1, j] * T.float32(2)
+            C[i0 * 4 + i1, j] = B[i1, j] * T.float32(2)
 
 
 @T.prim_func

From 299ca267e7641b5fa6e78dd131d0574e310f9a13 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 8 Sep 2022 09:35:58 -0700
Subject: [PATCH 131/704] [TIR] Update region min/extent in
 ReplaceBufferMutator (#12725)

Prior to this commit, `ReplaceBufferMutator` only checks
`BufferRegionNode::buffer` to determine if a `BufferRegion` needs to
be replaced, and doesn't check the `BufferRegionNode::region`.  As a
result, updating `T.reads(A[B[i]])` would fail to replace `B`.

This commit checks `BufferRegionNode::region` for buffer usage to
resolve this issue.
---
 src/tir/schedule/transform.cc                 | 27 ++++++++++++++++---
 .../test_tir_schedule_set_axis_separator.py   | 24 +++++++++++++++++
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc
index 1ebaf202d487..c11fa656d6da 100644
--- a/src/tir/schedule/transform.cc
+++ b/src/tir/schedule/transform.cc
@@ -138,9 +138,30 @@ Stmt ReplaceBufferMutator::VisitStmt_(const BlockNode* block) {
     return this->VisitMatchBufferRegion(match_buffer);
   };
   auto f_mutate_read_write_region = [this](const BufferRegion& buffer_region) {
-    auto it = buffer_var_map_.find(buffer_region->buffer->data.get());
-    return it == buffer_var_map_.end() ? buffer_region
-                                       : BufferRegion(it->second, buffer_region->region);
+    auto region = MutateArray(buffer_region->region, [this](const Range& range) {
+      PrimExpr min = VisitExpr(range->min);
+      PrimExpr extent = VisitExpr(range->extent);
+      if (min.same_as(range->min) && extent.same_as(range->extent)) {
+        return range;
+      } else {
+        return Range::FromMinExtent(min, extent);
+      }
+    });
+
+    Buffer buf = [&]() {
+      auto it = buffer_var_map_.find(buffer_region->buffer->data.get());
+      if (it == buffer_var_map_.end()) {
+        return buffer_region->buffer;
+      } else {
+        return it->second;
+      }
+    }();
+
+    if (buf.same_as(buffer_region->buffer) && region.same_as(buffer_region->region)) {
+      return buffer_region;
+    } else {
+      return BufferRegion(buf, region);
+    }
   };
   auto f_mutate_alloc_buffers = [this](const Buffer& buffer) {
     auto it = buffer_var_map_.find(buffer->data.get());
diff --git a/tests/python/unittest/test_tir_schedule_set_axis_separator.py b/tests/python/unittest/test_tir_schedule_set_axis_separator.py
index 9502da182926..b432fbb61066 100644
--- a/tests/python/unittest/test_tir_schedule_set_axis_separator.py
+++ b/tests/python/unittest/test_tir_schedule_set_axis_separator.py
@@ -154,6 +154,30 @@ def test_set_axis_separator_subregion(use_sugared_transform):
     tvm.ir.assert_structural_equal(element_wise_subregion_match_set_axis_separator, s.mod["main"])
     verify_trace_roundtrip(sch=s, mod=func)
 
+class TestIndexedLookup(tvm.testing.CompareBeforeAfter):
+    def transform(self):
+        def func(mod):
+            sch = tir.Schedule(mod)
+            sch.set_axis_separator('block', 'B', [1])
+            return sch.mod
+        return func
+
+    @T.prim_func
+    def before():
+        A = T.alloc_buffer([4,4], dtype="int32")
+        B = T.alloc_buffer([1,1], dtype="int32")
+        for j in T.serial(4):
+            with T.block('block'):
+                A[B[0,0],j] = 0
+
+    @T.prim_func
+    def expected():
+        A = T.alloc_buffer([4,4], dtype="int32")
+        B = T.alloc_buffer([1,1], dtype="int32", axis_separators=[1])
+        for j in T.serial(4):
+            with T.block('block'):
+                A[B[0,0],j] = 0
+
 
 if __name__ == "__main__":
     tvm.testing.main()

From 64031d56d634a535c8e3832d9231855b688f0648 Mon Sep 17 00:00:00 2001
From: Robert Kimball <bobkimball@gmail.com>
Date: Thu, 8 Sep 2022 15:30:38 -0700
Subject: [PATCH 132/704] Move static array initialization into a function go
 avoid link errors (#12678)

* Move static array initialization into a function go avoid link errors

* Fix line length
---
 include/tvm/runtime/container/map.h | 63 +++++++++++++++--------------
 src/runtime/container.cc            |  4 --
 2 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/include/tvm/runtime/container/map.h b/include/tvm/runtime/container/map.h
index 4c76a3b0ad4f..53c37cc20e6b 100644
--- a/include/tvm/runtime/container/map.h
+++ b/include/tvm/runtime/container/map.h
@@ -1038,10 +1038,10 @@ class DenseMapNode : public MapNode {
       new (&Data()) KVType(std::move(v));
     }
     /*! \brief If the entry has next entry on the linked list */
-    bool HasNext() const { return kNextProbeLocation[Meta() & 0b01111111] != 0; }
+    bool HasNext() const { return NextProbeLocation(Meta() & 0b01111111) != 0; }
     /*! \brief Move the entry to the next entry on the linked list */
     bool MoveToNext(const DenseMapNode* self, uint8_t meta) {
-      uint64_t offset = kNextProbeLocation[meta & 0b01111111];
+      uint64_t offset = NextProbeLocation(meta & 0b01111111);
       if (offset == 0) {
         index = 0;
         block = nullptr;
@@ -1066,7 +1066,7 @@ class DenseMapNode : public MapNode {
     /*! \brief Get the next empty jump */
     bool GetNextEmpty(const DenseMapNode* self, uint8_t* jump, ListNode* result) const {
       for (uint8_t idx = 1; idx < kNumJumpDists; ++idx) {
-        ListNode candidate((index + kNextProbeLocation[idx]) & (self->slots_), self);
+        ListNode candidate((index + NextProbeLocation(idx)) & (self->slots_), self);
         if (candidate.IsEmpty()) {
           *jump = idx;
           *result = candidate;
@@ -1086,33 +1086,36 @@ class DenseMapNode : public MapNode {
   uint32_t fib_shift_;
   /*! \brief array of data blocks */
   Block* data_;
-  /* clang-format off */
-  /*! \brief Candidates of probing distance */
-  TVM_DLL static constexpr uint64_t kNextProbeLocation[kNumJumpDists] {
-    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-    // Quadratic probing with triangle numbers. See also:
-    // 1) https://en.wikipedia.org/wiki/Quadratic_probing
-    // 2) https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
-    // 3) https://github.com/skarupke/flat_hash_map
-    21, 28, 36, 45, 55, 66, 78, 91, 105, 120,
-    136, 153, 171, 190, 210, 231, 253, 276, 300, 325,
-    351, 378, 406, 435, 465, 496, 528, 561, 595, 630,
-    666, 703, 741, 780, 820, 861, 903, 946, 990, 1035,
-    1081, 1128, 1176, 1225, 1275, 1326, 1378, 1431, 1485, 1540,
-    1596, 1653, 1711, 1770, 1830, 1891, 1953, 2016, 2080, 2145,
-    2211, 2278, 2346, 2415, 2485, 2556, 2628,
-    // larger triangle numbers
-    8515, 19110, 42778, 96141, 216153,
-    486591, 1092981, 2458653, 5532801, 12442566,
-    27993903, 62983476, 141717030, 318844378, 717352503,
-    1614057336, 3631522476, 8170957530, 18384510628, 41364789378,
-    93070452520, 209408356380, 471168559170, 1060128894105, 2385289465695,
-    5366898840628, 12075518705635, 27169915244790, 61132312065111, 137547689707000,
-    309482283181501, 696335127828753, 1566753995631385, 3525196511162271, 7931691992677701,
-    17846306936293605, 40154190677507445, 90346928918121501, 203280589587557251, 457381325854679626,
-    1029107982097042876, 2315492959180353330, 5209859154120846435,
-  };
-  /* clang-format on */
+  static uint64_t NextProbeLocation(size_t index) {
+    /* clang-format off */
+    /*! \brief Candidates of probing distance */
+    static const uint64_t kNextProbeLocation[kNumJumpDists] {
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      // Quadratic probing with triangle numbers. See also:
+      // 1) https://en.wikipedia.org/wiki/Quadratic_probing
+      // 2) https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
+      // 3) https://github.com/skarupke/flat_hash_map
+      21, 28, 36, 45, 55, 66, 78, 91, 105, 120,
+      136, 153, 171, 190, 210, 231, 253, 276, 300, 325,
+      351, 378, 406, 435, 465, 496, 528, 561, 595, 630,
+      666, 703, 741, 780, 820, 861, 903, 946, 990, 1035,
+      1081, 1128, 1176, 1225, 1275, 1326, 1378, 1431, 1485, 1540,
+      1596, 1653, 1711, 1770, 1830, 1891, 1953, 2016, 2080, 2145,
+      2211, 2278, 2346, 2415, 2485, 2556, 2628,
+      // larger triangle numbers
+      8515, 19110, 42778, 96141, 216153,
+      486591, 1092981, 2458653, 5532801, 12442566,
+      27993903, 62983476, 141717030, 318844378, 717352503,
+      1614057336, 3631522476, 8170957530, 18384510628, 41364789378,
+      93070452520, 209408356380, 471168559170, 1060128894105, 2385289465695,
+      5366898840628, 12075518705635, 27169915244790, 61132312065111, 137547689707000,
+      309482283181501, 696335127828753, 1566753995631385, 3525196511162271, 7931691992677701,
+      17846306936293605, 40154190677507445, 90346928918121501, 203280589587557251,
+      457381325854679626, 1029107982097042876, 2315492959180353330, 5209859154120846435,
+    };
+    /* clang-format on */
+    return kNextProbeLocation[index];
+  }
   friend class MapNode;
 };
 
diff --git a/src/runtime/container.cc b/src/runtime/container.cc
index 159404be5351..adcaecbc64cf 100644
--- a/src/runtime/container.cc
+++ b/src/runtime/container.cc
@@ -180,10 +180,6 @@ TVM_REGISTER_GLOBAL("runtime.MapItems").set_body([](TVMArgs args, TVMRetValue* r
   *ret = std::move(rkvs);
 });
 
-#if (USE_FALLBACK_STL_MAP == 0)
-TVM_DLL constexpr uint64_t DenseMapNode::kNextProbeLocation[];
-#endif
-
 // Closure
 TVM_REGISTER_OBJECT_TYPE(ClosureObj);
 

From 89ce171b8697d223032f53b5e14c459332316da8 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 8 Sep 2022 18:51:32 -0700
Subject: [PATCH 133/704] [TIR, Schedule] Check consumer in-bound and covered
 in reverse_compute_inline (#12717)

* [TIR, Schedule] Generate consumer-in-bound predicate after reverse_compute_inline

* Check consumer block iters are covered

* fix lint
---
 src/tir/schedule/primitive/compute_inline.cc  | 131 ++++++++++++++++--
 .../test_tir_schedule_compute_inline.py       |  61 ++++++++
 2 files changed, 178 insertions(+), 14 deletions(-)

diff --git a/src/tir/schedule/primitive/compute_inline.cc b/src/tir/schedule/primitive/compute_inline.cc
index bfda66036fe3..2ea641a2cbd4 100644
--- a/src/tir/schedule/primitive/compute_inline.cc
+++ b/src/tir/schedule/primitive/compute_inline.cc
@@ -30,7 +30,8 @@ static const char kErrBodyReverseInline[] = R"(The body of the inlined block sho
     `B[...] = g(i, j, k, A[f(i, j, k, ...)] ...)`,
 where A is the only buffer the block consumes, whose indices are distinct atomic variables,
 and there should be no variables other than the index variables), and f is a bijective affine
-mapping)";
+mapping and there should not be predicates in the inlined block. The iter domains of the inlined
+block should be covered by the producer block.)";
 
 class HasInitBlock : public ScheduleError {
  public:
@@ -161,16 +162,25 @@ class NonSingleProducerError : public ScheduleError {
   IRModule mod_;
   Block block_;
 
-  static void Check(const ScheduleState& self, const StmtSRef& consumer_block_sref,
-                    const StmtSRef& scope_root_sref) {
+  /*!
+   * \brief Check if the block has a single producer.
+   * \param self The schedule state
+   * \param block_sref The sref of the block to be checked
+   * \param scope_root_sref The sref of the scope root
+   * \return The sref of the producer block if the block has a single producer
+   * \throw ScheduleError if the block does not have a single producer
+   */
+  static StmtSRef Check(const ScheduleState& self, const StmtSRef& consumer_block_sref,
+                        const StmtSRef& scope_root_sref) {
     BlockScope scope = self->GetBlockScope(scope_root_sref);
     Array<Dependency> producers = scope->GetDepsByDst(consumer_block_sref);
+    StmtSRef producer_block_sref{nullptr};
     if (producers.size() == 1 && producers[0]->kind == DepKind::kRAW) {
-      const StmtSRef& producer_block_sref = producers[0]->src;
+      producer_block_sref = producers[0]->src;
       if (IsCompleteBlock(self, producer_block_sref, scope_root_sref)) {
         Array<Dependency> consumers = scope->GetDepsBySrc(producer_block_sref);
         if (consumers.size() == 1) {
-          return;
+          return producer_block_sref;
         }
       }
     }
@@ -521,11 +531,28 @@ class ReverseComputeInliner : public BaseInliner {
   };
 
  public:
-  explicit ReverseComputeInliner(const Buffer& inlined_buffer, const Block& consumer_block,
+  explicit ReverseComputeInliner(const Buffer& inlined_buffer, const BlockNode* producer_block,
+                                 const BlockRealize& consumer_block_realize,
                                  const StmtSRef& scope_root_sref)
-      : BaseInliner(inlined_buffer, consumer_block, scope_root_sref) {}
+      : BaseInliner(inlined_buffer, consumer_block_realize->block, scope_root_sref),
+        producer_block_(producer_block),
+        consumer_block_(consumer_block_realize->block.get()) {
+    // Initialize the predicates to ensure consumer block iters are in-bound
+    consumer_iter_in_bound_ = Bool(true);
+    for (const IterVar& iter : consumer_block_realize->block->iter_vars) {
+      consumer_iter_in_bound_ =
+          consumer_iter_in_bound_ &&
+          (iter->var >= iter->dom->min && iter->var < iter->dom->min + iter->dom->extent);
+    }
+  }
 
-  bool BodyPatternAllowInline(const Block& consumer_block) {
+  bool BodyPatternAllowInline(const BlockRealize& consumer_block_realize) {
+    const Block& consumer_block = consumer_block_realize->block;
+
+    if (!is_one(consumer_block_realize->predicate)) {
+      // Failure: Predicate is the consumer block is not supported
+      return false;
+    }
     if (inlined_store_ == nullptr) {
       // Failure: block body is not BufferStore
       return false;
@@ -557,13 +584,25 @@ class ReverseComputeInliner : public BaseInliner {
         /*input_iters=*/consumer_iter_doms,
         /*predicate=*/true,
         /*check_level=*/arith::IterMapLevel::Bijective,
-        /*analyzer=*/&analyzer,
+        /*analyzer=*/&analyzer_,
         /*simplify_trivial_iterators=*/false);
     buffer_load_iter_map_ = res->indices;
     if (buffer_load_iter_map_.empty()) {
       // Failure: indices of BufferLoad are not bijective affine
       return false;
     }
+
+    const BufferStoreNode* producer_store = producer_block_->body.as<BufferStoreNode>();
+    if (producer_store == nullptr) {
+      // Failure: producer block body is not BufferStore
+      return false;
+    }
+    CreateInverseMapping(producer_store->indices);
+    if (!CheckConsumerCovered()) {
+      // Failure: consumer block iter domains are not covered by the producer block
+      return false;
+    }
+
     return true;
   }
 
@@ -571,6 +610,34 @@ class ReverseComputeInliner : public BaseInliner {
   using BaseInliner::VisitExpr_;
   using BaseInliner::VisitStmt_;
 
+  /*! \brief Generate the predicate after inlining based on the consumer predicate */
+  PrimExpr BuildInlinedConsumerPredicate(const BlockRealizeNode* producer_block_realize) {
+    // Bind the producer block iter domains for simplification
+    Map<Var, PrimExpr> subst_map;
+    for (int i = 0, n = producer_block_realize->iter_values.size(); i < n; ++i) {
+      const IterVar& iter = producer_block_realize->block->iter_vars[i];
+      analyzer_.Bind(iter->var, Range::FromMinExtent(iter->dom->min, iter->dom->extent));
+      subst_map.Set(iter->var, producer_block_realize->iter_values[i]);
+    }
+    // Substitute the consumer block iters with the corresponding iters in the producer blocks
+    PrimExpr predicate = Substituter(this)(consumer_iter_in_bound_);
+    // Simplify the predicate using the producer block iter domains
+    predicate = analyzer_.Simplify(predicate);
+    // Substitute the producer block iters with the its bindings since the predicate in BlockRealize
+    // should not contain the block iters
+    predicate = Substitute(predicate, subst_map);
+    return predicate;
+  }
+
+  Stmt VisitStmt_(const BlockRealizeNode* op) final {
+    BlockRealize new_block_realize = Downcast<BlockRealize>(StmtMutator::VisitStmt_(op));
+    if (op->block.get() == producer_block_) {
+      new_block_realize.CopyOnWrite()->predicate =
+          BuildInlinedConsumerPredicate(new_block_realize.get());
+    }
+    return std::move(new_block_realize);
+  }
+
   Stmt VisitStmt_(const BufferStoreNode* _store) final {
     BufferStore store = Downcast<BufferStore>(StmtExprMutator::VisitStmt_(_store));
     if (!store->buffer.same_as(inlined_buffer_)) {
@@ -579,6 +646,32 @@ class ReverseComputeInliner : public BaseInliner {
     return ReplaceInlinedBuffer(std::move(store));
   }
 
+  /*!
+   * \brief Check the consumer block iter domains are covered by the producer block iter domains
+   * \return Whether the consumer block iter domains are covered
+   */
+  bool CheckConsumerCovered() {
+    Map<IterVar, arith::IntSet> producer_iter_doms;
+    for (const IterVar& iter_var : producer_block_->iter_vars) {
+      producer_iter_doms.Set(iter_var, arith::IntSet::FromRange(iter_var->dom));
+    }
+    // For each block iter in the consumer block, find the corresponding expression in the producer
+    for (const IterVar& iter : consumer_block_->iter_vars) {
+      if (auto it = idx_sub_.find(iter->var.get()); it != idx_sub_.end()) {
+        const PrimExpr& producer_iter = it->second;
+        arith::IntSet producer_iter_range = arith::EvalSet(producer_iter, producer_iter_doms);
+        if (analyzer_.CanProve(producer_iter_range.min() > iter->dom->min) ||
+            analyzer_.CanProve(producer_iter_range.max() <
+                               iter->dom->min + iter->dom->extent - 1)) {
+          return false;
+        }
+      } else {
+        return false;
+      }
+    }
+    return true;
+  }
+
   /*!
    * \brief Apply the inverse of `buffer_load_iter_map_` to producer indices. Update `idx_sub_` with
    *        the result. It will be later used to transform the BufferStore indices of the producer.
@@ -592,7 +685,6 @@ class ReverseComputeInliner : public BaseInliner {
   }
 
   Stmt ReplaceInlinedBuffer(BufferStore producer) {
-    CreateInverseMapping(producer->indices);
     producer_rhs_ = producer->value;
     return Substituter(this)(GetRef<BufferStore>(inlined_store_));
   }
@@ -647,8 +739,16 @@ class ReverseComputeInliner : public BaseInliner {
   Array<PrimExpr> buffer_load_indices_;
   /*! \brief The IterMap representing the indices of the consumer's BufferLoad */
   Array<arith::IterSumExpr> buffer_load_iter_map_{nullptr};
+  /*! \brief The producer block */
+  const BlockNode* producer_block_{nullptr};
+  /* \brief The consumer block */
+  const BlockNode* consumer_block_{nullptr};
+  /*! \brief The predicate to ensure the consumer block iters are in-bound. It will be inserted
+   * as the predicate of the producer block after inlining.
+   */
+  PrimExpr consumer_iter_in_bound_{nullptr};
   /*! \brief The arithmetic analyzer */
-  arith::Analyzer analyzer;
+  arith::Analyzer analyzer_;
 };
 
 void ComputeInlineImpl(ScheduleState self, const StmtSRef& producer_block_sref,
@@ -700,6 +800,7 @@ void ReverseComputeInlineImpl(ScheduleState self, const StmtSRef& consumer_block
                               bool check_only = false) {
   const BlockNode* _consumer_block = TVM_SREF_TO_BLOCK(consumer_block_sref);
   Block consumer_block = GetRef<Block>(_consumer_block);
+  BlockRealize consumer_block_realize = GetBlockRealize(self, consumer_block_sref);
   HasInitBlock::Check(self->mod, consumer_block);
   // Step 1. Get the scope block
   StmtSRef scope_root_sref = GetScopeRoot(self, consumer_block_sref,  //
@@ -709,10 +810,12 @@ void ReverseComputeInlineImpl(ScheduleState self, const StmtSRef& consumer_block
   // Step 2. Check completeness
   CheckCompleteBlock(self, consumer_block_sref, scope_root_sref);
   // Step 3. Check if the consumer has a single complete producer
-  NonSingleProducerError::Check(self, consumer_block_sref, scope_root_sref);
+  StmtSRef producer_block_sref =
+      NonSingleProducerError::Check(self, consumer_block_sref, scope_root_sref);
   // Step 4. Analyze the block body
-  ReverseComputeInliner inliner(inlined_buffer, consumer_block, scope_root_sref);
-  if (!inliner.BodyPatternAllowInline(consumer_block)) {
+  ReverseComputeInliner inliner(inlined_buffer, producer_block_sref->StmtAs<BlockNode>(),
+                                consumer_block_realize, scope_root_sref);
+  if (!inliner.BodyPatternAllowInline(consumer_block_realize)) {
     throw BodyAnalysisError(true, self->mod, consumer_block);
   }
   // Step 5. Create a plan that removes the leaf block to be inlined
diff --git a/tests/python/unittest/test_tir_schedule_compute_inline.py b/tests/python/unittest/test_tir_schedule_compute_inline.py
index ec19402969e3..20eafabc7a22 100644
--- a/tests/python/unittest/test_tir_schedule_compute_inline.py
+++ b/tests/python/unittest/test_tir_schedule_compute_inline.py
@@ -585,6 +585,47 @@ def exp_exp_opaque_access_with_tvm_access_ptr_inlined(
             )
 
 
+@T.prim_func
+def elementwise_overcomputed_producer(
+    A: T.Buffer[(128, 128), "float32"], C: T.Buffer[(127, 127), "float32"]
+) -> None:
+    B = T.alloc_buffer((128, 128))
+    for i, j in T.grid(128, 128):
+        with T.block("B"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            B[vi, vj] = A[vi, vj] * 2.0
+    for i, j in T.grid(127, 127):
+        with T.block("C"):
+            cvi, cvj = T.axis.remap("SS", [i, j])
+            C[cvi, cvj] = B[cvi, cvj] + 1.0
+
+
+@T.prim_func
+def elementwise_overcomputed_producer_reverse_inlined(
+    A: T.Buffer[(128, 128), "float32"], C: T.Buffer[(127, 127), "float32"]
+) -> None:
+    for i, j in T.grid(128, 128):
+        with T.block("B"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            T.where(i < 127 and j < 127)
+            C[vi, vj] = A[vi, vj] * 2.0 + 1.0
+
+
+@T.prim_func
+def elementwise_producer_not_cover_consumer(
+    A: T.Buffer[(128, 128), "float32"], D: T.Buffer[(256, 128), "float32"]
+) -> None:
+    B = T.alloc_buffer((128, 128))
+    for i, j in T.grid(128, 128):
+        with T.block("B"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            B[vi, vj] = A[vi, vj] * 2.0
+    for i, j in T.grid(256, 128):
+        with T.block("C"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            D[vi, vj] = T.if_then_else(vi >= 128, B[vi - 128, vj], T.float32(0), dtype="float32")
+
+
 # pylint: enable=no-member,invalid-name,unused-variable
 
 use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True})
@@ -822,5 +863,25 @@ def test_compute_inline_opaque_access_with_tvm_access_ptr(use_block_name):
     )
 
 
+def test_reverse_compute_inline_overcomputed_producer(use_block_name):
+    """Test reverse compute inline overcomputed producer"""
+    sch = tir.Schedule(elementwise_overcomputed_producer, debug_mask="all")
+    compute = "C" if use_block_name else sch.get_block("C")
+    sch.reverse_compute_inline(compute)
+    tvm.ir.assert_structural_equal(
+        elementwise_overcomputed_producer_reverse_inlined, sch.mod["main"]
+    )
+
+
+def test_reverse_compute_inline_error_producer_not_cover_consumer(use_block_name):
+    """Test reverse compute inline failure when the inlined block iter domains are not covered by
+    its producer
+    """
+    sch = tir.Schedule(elementwise_producer_not_cover_consumer, debug_mask="all")
+    compute = "C" if use_block_name else sch.get_block("C")
+    with pytest.raises(tvm.tir.ScheduleError):
+        sch.reverse_compute_inline(compute)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 1c5ffc67ad2497a2d34509e0599b3a787fcd464d Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 9 Sep 2022 00:07:09 -0700
Subject: [PATCH 134/704] [ci][docker] Use CMake 3.20.0 for cortexm (#12744)

The Zephyr project builds require 3.20.0 to work correctly

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 docker/Dockerfile.ci_cortexm                  |  2 +-
 docker/install/ubuntu_install_cmake_source.sh | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm
index fb3c10d393f0..d646704bb0a8 100644
--- a/docker/Dockerfile.ci_cortexm
+++ b/docker/Dockerfile.ci_cortexm
@@ -33,7 +33,7 @@ COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
 RUN bash /install/ubuntu1804_install_python.sh
 
 COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
-RUN bash /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh 3.20.0
 
 COPY install/ubuntu1804_install_python_venv.sh /install/ubuntu1804_install_python_venv.sh
 RUN bash /install/ubuntu1804_install_python_venv.sh
diff --git a/docker/install/ubuntu_install_cmake_source.sh b/docker/install/ubuntu_install_cmake_source.sh
index 030cb4ea0406..702130f07964 100755
--- a/docker/install/ubuntu_install_cmake_source.sh
+++ b/docker/install/ubuntu_install_cmake_source.sh
@@ -20,13 +20,19 @@ set -e
 set -u
 set -o pipefail
 
-v=3.18
-version=3.18.4
+if [ -z ${1+x} ]; then
+    version=3.18.4
+else
+    version=$1
+fi
+
+v=$(echo $version | sed 's/\(.*\)\..*/\1/g')
+echo "Installing cmake $version ($v)"
 wget https://cmake.org/files/v${v}/cmake-${version}.tar.gz
 tar xvf cmake-${version}.tar.gz
 cd cmake-${version}
 ./bootstrap
-make -j$(nproc)
+make -j"$(nproc)"
 make install
 cd ..
 rm -rf cmake-${version} cmake-${version}.tar.gz

From cb08a1251f247ee79d3ede2b0e843cc11c4925d0 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Fri, 9 Sep 2022 01:29:57 -0700
Subject: [PATCH 135/704] [TF] Add DenseBincount support (#12728)

---
 python/tvm/relay/frontend/tensorflow_ops.py   | 55 +++++++++++++++++++
 .../frontend/tensorflow/test_forward.py       | 41 ++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py
index c94a4ef2e6aa..4598f4f09a05 100644
--- a/python/tvm/relay/frontend/tensorflow_ops.py
+++ b/python/tvm/relay/frontend/tensorflow_ops.py
@@ -2868,6 +2868,60 @@ def _impl(inputs, attr, params, mod):
     return _impl
 
 
+def _dense_bincount():
+    def _impl(inputs, attr, params, mod):
+        input = inputs[0]  # input: int32, int64. 1D or 2D int Tensor
+        size = inputs[1]  # size: non-negative int scalar Tensor
+        # weights: int32, int64, float32, or float64 Tensor with the same shape as arr
+        # or a length-0 Tensor, in which case it acts as all weights equal to 1.
+        weights = inputs[2]
+        # Returns: Output: 1D Tensor with length equal to size
+        # or 2D Tensor with [batch_size, size].
+        # The counts or summed weights for each value in the range [0, size).
+
+        input_dtype = _infer_type(input, mod).checked_type.dtype
+        input_shape = _infer_shape(input, mod)
+        is_2d_input = len(input_shape) == 2
+
+        if input_dtype == "int64":
+            warnings.warn(
+                "Casting an int64 input to int32, since we do not have int64 atomic add"
+                "needed for bincount yet."
+            )
+            input = _op.cast(input, "int32")
+
+        is_weights_zero_tensor = True
+        if weights:
+            weights_shape = _infer_shape(weights, mod)
+            is_weights_zero_tensor = weights_shape == (0,)
+
+        # Output should have the same dtype as weights.
+        if is_weights_zero_tensor:
+            # if weights are length-0 Tensor - output dtype is float32
+            out_dtype = "float32"
+            updates = _op.cast(_op.ones_like(input), out_dtype)
+        else:
+            out_dtype = _infer_type(weights, mod).checked_type.dtype
+            updates = weights
+
+        if is_2d_input:
+            batch_arr = _op.take(_op.shape_of(input), _expr.const([0]))
+            size_arr = _op.reshape(size, [1])
+            counts_shape = _op.concatenate([batch_arr, size_arr], axis=0)
+            counts = _op.zeros(counts_shape, out_dtype)
+            out = _op.scatter_add(counts, input, updates, axis=1)
+        else:
+            counts_shape = _op.reshape(size, [1])
+            counts = _op.zeros(counts_shape, out_dtype)
+            out = _op.scatter_add(counts, input, updates, axis=0)
+
+        if attr["binary_output"]:
+            out = _op.cast(_op.cast(out, "bool"), out_dtype)
+        return out
+
+    return _impl
+
+
 # _convert_map defines maps of name to converter functor(callable)
 # for 1 to 1 mapping, use Renamer if nothing but name is different
 # use AttrCvt if attributes need to be converted
@@ -2913,6 +2967,7 @@ def _impl(inputs, attr, params, mod):
     "Cosh": AttrCvt("cosh"),
     "CropAndResize": _crop_and_resize(),
     "DecodeJpeg": _decode_image(),
+    "DenseBincount": _dense_bincount(),
     "DepthToSpace": _depth_to_space(),
     "DepthwiseConv2dNative": _conv("depthwise"),
     "Dilation2D": _dilation2d(),
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index c679425beab1..ebeb35e08f5d 100755
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -5758,5 +5758,46 @@ def test_invert_permutation():
         compare_tf_with_tvm(x, "Placeholder:0", out_name, no_gpu=False)
 
 
+#######################################################################
+# DenseBincount
+# ----
+
+
+def _test_dense_bincount(in_shape, size, weights, binary_output):
+    with tf.Graph().as_default():
+        inputs = []
+        data = []
+        inputs.append(tf.placeholder(shape=in_shape, dtype="int32", name="input0"))
+        data.append(np.random.uniform(0, size, size=in_shape).astype("int32"))
+        inputs.append(tf.placeholder(shape=(), dtype="int32", name="size"))
+        data.append(np.array(size, "int32"))
+        if weights:
+            inputs.append(tf.placeholder(shape=in_shape, dtype="float32", name="weights"))
+            data.append(np.reshape(weights, in_shape).astype("float32"))
+        else:
+            inputs.append(tf.placeholder(shape=(0,), dtype="float32", name="weights"))
+            data.append(np.array([], "float32"))
+        result = tf.raw_ops.DenseBincount(
+            input=data[0],
+            size=data[1],
+            weights=data[2],
+            binary_output=binary_output,
+        )
+        compare_tf_with_tvm(data, [a.name for a in inputs], result.name, mode="vm")
+
+
+def test_forward_dense_bincount():
+    """Test DenseBincount Op"""
+    for binary_output in [False, True]:
+        # 2D input
+        _test_dense_bincount((3, 10), 20, [1.0] * 30, binary_output)
+        _test_dense_bincount((3, 10), 20, [1.5] * 30, binary_output)
+        _test_dense_bincount((3, 10), 20, None, binary_output)
+        # 1D input
+        _test_dense_bincount((10,), 20, [1.0] * 10, binary_output)
+        _test_dense_bincount((10,), 20, [1.5] * 10, binary_output)
+        _test_dense_bincount((10,), 20, None, binary_output)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From 90fb79b74c49b585f39469e1a2eec233fdd592e0 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Fri, 9 Sep 2022 16:24:05 +0100
Subject: [PATCH 136/704] [CI] Update Docker images to bring TF 2.9 and
 integration tests (#12738)

[CI] Update Docker images to tag 20220908-060034-62bdc91b1

Updates all Docker images to tag 20220908-060034-62bdc91b1, to
update TensorFlow/TFLite/Keras to 2.9, and cascaded dependencies
such as numpy. Updates ethos-u-vela to 3.4.0.

It also brings ONNX and PyTorch to ci_arm, to enable Integration
tests to be run in CI.

Standadises the minimum CMake version required in CI to be 3.18.4,
fixing apps/microtvm/zephyr_cmsisnn to require this version.

Finally, adds a new import error in the tutorials documentation
which doesn't affect the final result. The new warning added is
'absl:Found untraced functions such as _jit_compiled_convolution_op'
---
 Jenkinsfile                            | 20 ++++++++++----------
 ci/jenkins/Jenkinsfile.j2              | 20 ++++++++++----------
 tests/scripts/task_config_build_cpu.sh | 11 ++++++++++-
 tests/scripts/task_python_docs.sh      |  1 +
 4 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 78071fde4599..ed1cf4b09e6e 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -49,16 +49,16 @@
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20220810-060142-fae79bbc3'
-ci_gpu = 'tlcpack/ci-gpu:20220810-060142-fae79bbc3'
-ci_cpu = 'tlcpack/ci-cpu:20220810-060142-fae79bbc3'
-ci_minimal = 'tlcpack/ci-minimal:20220725-133226-d3cefdaf1'
-ci_wasm = 'tlcpack/ci-wasm:20220810-060142-fae79bbc3'
-ci_i386 = 'tlcpack/ci-i386:20220810-060142-fae79bbc3'
-ci_cortexm = 'tlcpack/ci-cortexm:20220810-060142-fae79bbc3'
-ci_arm = 'tlcpack/ci-arm:20220810-060142-fae79bbc3'
-ci_hexagon = 'tlcpack/ci-hexagon:20220825-145056-fb7cf97f'
-ci_riscv = 'tlcpack/ci-riscv:20220810-060142-fae79bbc3'
+ci_lint = 'tlcpack/ci-lint:20220908-060034-62bdc91b1'
+ci_gpu = 'tlcpack/ci-gpu:20220908-060034-62bdc91b1'
+ci_cpu = 'tlcpack/ci-cpu:20220908-060034-62bdc91b1'
+ci_minimal = 'tlcpack/ci-minimal:20220908-060034-62bdc91b1'
+ci_wasm = 'tlcpack/ci-wasm:20220908-060034-62bdc91b1'
+ci_i386 = 'tlcpack/ci-i386:20220908-060034-62bdc91b1'
+ci_cortexm = 'tlcpack/ci-cortexm:20220909-090211-cb08a1251'
+ci_arm = 'tlcpack/ci-arm:20220908-060034-62bdc91b1'
+ci_hexagon = 'tlcpack/ci-hexagon:20220908-060034-62bdc91b1'
+ci_riscv = 'tlcpack/ci-riscv:20220908-060034-62bdc91b1'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index c932431a44a1..6ba0c2df8efd 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -51,16 +51,16 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 {% import 'ci/jenkins/macros.j2' as m with context -%}
 
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20220810-060142-fae79bbc3'
-ci_gpu = 'tlcpack/ci-gpu:20220810-060142-fae79bbc3'
-ci_cpu = 'tlcpack/ci-cpu:20220810-060142-fae79bbc3'
-ci_minimal = 'tlcpack/ci-minimal:20220725-133226-d3cefdaf1'
-ci_wasm = 'tlcpack/ci-wasm:20220810-060142-fae79bbc3'
-ci_i386 = 'tlcpack/ci-i386:20220810-060142-fae79bbc3'
-ci_cortexm = 'tlcpack/ci-cortexm:20220810-060142-fae79bbc3'
-ci_arm = 'tlcpack/ci-arm:20220810-060142-fae79bbc3'
-ci_hexagon = 'tlcpack/ci-hexagon:20220825-145056-fb7cf97f'
-ci_riscv = 'tlcpack/ci-riscv:20220810-060142-fae79bbc3'
+ci_lint = 'tlcpack/ci-lint:20220908-060034-62bdc91b1'
+ci_gpu = 'tlcpack/ci-gpu:20220908-060034-62bdc91b1'
+ci_cpu = 'tlcpack/ci-cpu:20220908-060034-62bdc91b1'
+ci_minimal = 'tlcpack/ci-minimal:20220908-060034-62bdc91b1'
+ci_wasm = 'tlcpack/ci-wasm:20220908-060034-62bdc91b1'
+ci_i386 = 'tlcpack/ci-i386:20220908-060034-62bdc91b1'
+ci_cortexm = 'tlcpack/ci-cortexm:20220909-090211-cb08a1251'
+ci_arm = 'tlcpack/ci-arm:20220908-060034-62bdc91b1'
+ci_hexagon = 'tlcpack/ci-hexagon:20220908-060034-62bdc91b1'
+ci_riscv = 'tlcpack/ci-riscv:20220908-060034-62bdc91b1'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images
diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index 9dc5c62efaa7..7f48839f23c0 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -37,7 +37,16 @@ echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
 echo set\(USE_VTA_TSIM ON\) >> config.cmake
 echo set\(USE_VTA_FSIM ON\) >> config.cmake
-echo set\(USE_TFLITE ON\) >> config.cmake
+
+# This conditional is just to support the transition to cope
+# with the change in the way TFLite is built. It can be
+# removed once we migrate to TensorFlow and TFLite > 2.9.1
+if [ -d "/opt/tflite" ]; then
+  echo set\(USE_TFLITE \"/opt/tflite\"\) >> config.cmake
+else
+  echo set\(USE_TFLITE ON\) >> config.cmake
+fi
+
 echo set\(USE_TENSORFLOW_PATH \"/tensorflow\"\) >> config.cmake
 echo set\(USE_FLATBUFFERS_PATH \"/flatbuffers\"\) >> config.cmake
 echo set\(USE_ETHOSN /opt/arm/ethosn-driver\) >> config.cmake
diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh
index d8578fde2817..fdce77bfd9cc 100755
--- a/tests/scripts/task_python_docs.sh
+++ b/tests/scripts/task_python_docs.sh
@@ -88,6 +88,7 @@ IGNORED_WARNINGS=(
     'autotvm:Cannot find config for target=cuda -keys=cuda,gpu'
     # Warning is thrown during TFLite quantization for micro_train tutorial
     'absl:For model inputs containing unsupported operations which cannot be quantized, the `inference_input_type` attribute will default to the original type.'
+    'absl:Found untraced functions such as _jit_compiled_convolution_op'
 )
 
 JOINED_WARNINGS=$(join_by '|' "${IGNORED_WARNINGS[@]}")

From 75969647fdf5e9f9b60635d1409952c97a29f0e4 Mon Sep 17 00:00:00 2001
From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com>
Date: Fri, 9 Sep 2022 16:49:37 +0100
Subject: [PATCH 137/704] Aligned CMSIS-NN SHA in TVM to CMSIS top of tree
 (#12723)

Aligned CMSIS-NN SHA in TVM to top of tree of CMSIS.

-Aligned buffer size APIs to CMSIS implementations.
-Updated the tests to match new CMSIS context buffer sizes.
-This change needs updates to cortex-m docker image.

Change-Id: I13f1ad29fe0ef02f08660eca4c818b5d66145ffc
---
 docker/install/ubuntu_install_cmsis.sh        |  4 ++--
 .../backend/contrib/cmsisnn/buffer_size.cc    | 20 ++++++++++---------
 .../backend/contrib/cmsisnn/buffer_size.h     |  7 ++++++-
 .../backend/contrib/cmsisnn/relay_to_tir.cc   |  4 ++--
 .../contrib/cmsisnn/buffer_size_test.cc       |  8 ++++----
 5 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/docker/install/ubuntu_install_cmsis.sh b/docker/install/ubuntu_install_cmsis.sh
index 1116b5bd6929..9fcbcf61cefa 100755
--- a/docker/install/ubuntu_install_cmsis.sh
+++ b/docker/install/ubuntu_install_cmsis.sh
@@ -39,8 +39,8 @@ shift
 mkdir -p "${INSTALLATION_PATH}"
 
 # Download and extract CMSIS
-CMSIS_SHA="e336766b1b5654f36244bca649917281f399bf37"
-CMSIS_SHASUM="30c40824c4e008dcb9c6c77adee5115efa0cb04b6701fe2bc31ddf7be2da59f2161aeb4dbe5780cbaa709af23a3e21ea460bb2b84fa12418563125b4d426ac86"
+CMSIS_SHA="51263182d16c92649a48144ba56c0945f9fce60e"
+CMSIS_SHASUM="d02573e5a8908c741d8558f01be2939aae6e940933ccb58123fa972864947759eefe5d554688db3910c8ed665a248b477b5e4458e12773385c67f8a2136b3b34"
 CMSIS_URL="http://github.com/ARM-software/CMSIS_5/archive/${CMSIS_SHA}.tar.gz"
 DOWNLOAD_PATH="/tmp/${CMSIS_SHA}.tar.gz"
 
diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.cc b/src/relay/backend/contrib/cmsisnn/buffer_size.cc
index d03d34897f5a..25f4d054e810 100644
--- a/src/relay/backend/contrib/cmsisnn/buffer_size.cc
+++ b/src/relay/backend/contrib/cmsisnn/buffer_size.cc
@@ -17,6 +17,8 @@
  * under the License.
  */
 
+#include "buffer_size.h"
+
 #include <tvm/ir/attrs.h>
 #include <tvm/ir/transform.h>
 
@@ -44,13 +46,13 @@ int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_
   }
 
   if (is1xN) {
-    if (!has_mve) {
-      return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
+    if (has_mve) {
+      return 0;
     }
-    return 0;
+    return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
   }
 
-  if (has_mve) {
+  if (has_mve || is1xN) {
     int32_t col_length = input_c * filter_w * filter_h;
     col_length = (col_length + 7) / 8;
     return 4 * col_length * 8 * (int32_t)sizeof(int8_t);
@@ -61,15 +63,15 @@ int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_
 }
 
 int DepthwiseConv2dBufferSize(Target target, int32_t input_n, int32_t input_c, int32_t output_c,
-                              int32_t filter_w, int32_t filter_h) {
+                              int32_t filter_w, int32_t filter_h, int32_t dilation_w,
+                              int32_t dilation_h) {
   bool has_mve = target->GetFeature<Bool>("has_mve").value_or(Bool(false));
   bool has_dsp = target->GetFeature<Bool>("has_dsp").value_or(Bool(false));
 
-  if (input_c == output_c && input_n == 1) {
+  if (input_c == output_c && input_n == 1 && dilation_w == 1 && dilation_h == 1) {
     if (has_mve) {
-      return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t) + 4;
-    }
-    if (has_dsp) {
+      return (4 * CH_IN_BLOCK_MVE * filter_w * filter_h) * (int32_t)sizeof(int8_t);
+    } else if (has_dsp) {
       return (input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
     }
   }
diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.h b/src/relay/backend/contrib/cmsisnn/buffer_size.h
index a6d3d588e2d9..9dae17c0a220 100644
--- a/src/relay/backend/contrib/cmsisnn/buffer_size.h
+++ b/src/relay/backend/contrib/cmsisnn/buffer_size.h
@@ -34,6 +34,8 @@ namespace relay {
 namespace contrib {
 namespace cmsisnn {
 
+#define CH_IN_BLOCK_MVE (124)
+
 /*!
  * \brief Calculates the appropriate buffer size for CMSIS-NN Convolutions
  * See:
@@ -70,11 +72,14 @@ int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_
  * \param output_c - Output channels
  * \param filter_w - Filter width
  * \param filter_h - Filter height
+ * \param dilation_w - Dilation width
+ * \param dilation_h - Dilation height
  *
  * \return Size of buffer to allocate for depthwise convolution
  */
 int DepthwiseConv2dBufferSize(Target target, int32_t input_n, int32_t input_c, int32_t output_c,
-                              int32_t filter_w, int32_t filter_h);
+                              int32_t filter_w, int32_t filter_h, int32_t dilation_w,
+                              int32_t dilation_h);
 
 /*!
  * \brief Calculates the appropriate buffer size for CMSIS-NN Average Pooling
diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
index 5683bc6698be..a5cdfd570fea 100644
--- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
+++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
@@ -242,8 +242,8 @@ class RelayToTIRVisitor : public MixedModeMutator {
     Target target = CreateTarget(transform::PassContext::Current());
     size_t context_buffer_size;
     if (is_depthwise) {
-      context_buffer_size =
-          DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w, filter_h);
+      context_buffer_size = DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w,
+                                                      filter_h, dilation_w, dilation_h);
     } else {
       context_buffer_size = Conv2dBufferSize(target, padding_w, padding_h, input_n, input_h,
                                              input_c, output_h, output_w, stride_w, stride_h,
diff --git a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc
index 9ff42e203ee6..d8870fa71525 100644
--- a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc
+++ b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc
@@ -143,7 +143,7 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, UnEvenChannels) {
   int32_t input_n = 1;
 
   auto depthwise_conv2d_with_channels = [=](Target target, int32_t input_c, int32_t output_c) {
-    return DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w, filter_h);
+    return DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w, filter_h, 1, 1);
   };
 
   ASSERT_EQ(depthwise_conv2d_with_channels(kNoExt, 4, 6), 0);
@@ -161,7 +161,7 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, MultipleBatches) {
 
   auto depthwise_conv2d_with_batch = [=](Target target, int32_t input_n) {
     return DepthwiseConv2dBufferSize(target, input_n, input_output_c, input_output_c, filter_w,
-                                     filter_h);
+                                     filter_h, 1, 1);
   };
 
   ASSERT_EQ(depthwise_conv2d_with_batch(kNoExt, 4), 0);
@@ -179,12 +179,12 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, Default) {
   int32_t input_n = 1;
 
   int32_t mve_calculated_buffer =
-      (2 * input_output_c * filter_w * filter_h) * (int32_t)sizeof(int16_t) + 4;
+      (4 * CH_IN_BLOCK_MVE * filter_w * filter_h) * (int32_t)sizeof(int8_t);
   int32_t dsp_calculated_buffer = (input_output_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
 
   auto depthwise_conv2d = [=](Target target) {
     return DepthwiseConv2dBufferSize(target, input_n, input_output_c, input_output_c, filter_w,
-                                     filter_h);
+                                     filter_h, 1, 1);
   };
 
   ASSERT_EQ(depthwise_conv2d(kNoExt), 0);

From 1d32c400f1d2a14cb3c663c2d17b977b94b2db48 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Fri, 9 Sep 2022 09:25:30 -0700
Subject: [PATCH 138/704] [microtvm][Zephyr] Add project overlay to overwrite
 device tree configs (#12741)

* add nucleo overlay
---
 .../app-overlay/nucleo_l4r5zi.overlay         | 23 +++++++++++++++++++
 .../template_project/microtvm_api_server.py   | 15 ++++++++----
 cmake/modules/Zephyr.cmake                    |  1 +
 tests/lint/check_file_type.py                 |  1 +
 4 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay

diff --git a/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay b/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay
new file mode 100644
index 000000000000..360e0753d4f5
--- /dev/null
+++ b/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay
@@ -0,0 +1,23 @@
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+&rcc {
+	clock-frequency = <DT_FREQ_M(120)>;
+};
diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
index b73779f68148..5a0bc7309c63 100644
--- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py
+++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
@@ -567,6 +567,8 @@ def _generate_cmake_args(self, mlf_extracted_path, options) -> str:
         return cmake_args
 
     def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options):
+        zephyr_board = options["zephyr_board"]
+
         # Check Zephyr version
         version = self._get_platform_version(get_zephyr_base(options))
         if version != ZEPHYR_VERSION:
@@ -586,6 +588,11 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
         # Copy boards.json file to generated project.
         shutil.copy2(BOARDS, project_dir / BOARDS.name)
 
+        # Copy overlay files
+        board_overlay_path = API_SERVER_DIR / "app-overlay" / f"{zephyr_board}.overlay"
+        if board_overlay_path.exists():
+            shutil.copy2(board_overlay_path, project_dir / f"{zephyr_board}.overlay")
+
         # Place Model Library Format tarball in the special location, which this script uses to decide
         # whether it's being invoked in a template or generated project.
         project_model_library_format_tar_path = project_dir / MODEL_LIBRARY_FORMAT_RELPATH
@@ -597,9 +604,9 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
             os.makedirs(extract_path)
             tf.extractall(path=extract_path)
 
-        if self._is_qemu(options["zephyr_board"], options.get("use_fvp")):
+        if self._is_qemu(zephyr_board, options.get("use_fvp")):
             shutil.copytree(API_SERVER_DIR / "qemu-hack", project_dir / "qemu-hack")
-        elif self._is_fvp(options["zephyr_board"], options.get("use_fvp")):
+        elif self._is_fvp(zephyr_board, options.get("use_fvp")):
             shutil.copytree(API_SERVER_DIR / "fvp-hack", project_dir / "fvp-hack")
 
         # Populate CRT.
@@ -650,7 +657,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
                     for item in flags:
                         cmake_f.write(f"target_compile_definitions(app PUBLIC {item})\n")
 
-                if self._is_fvp(options["zephyr_board"], options.get("use_fvp")):
+                if self._is_fvp(zephyr_board, options.get("use_fvp")):
                     cmake_f.write(f"target_compile_definitions(app PUBLIC -DFVP=1)\n")
 
         self._create_prj_conf(project_dir, options)
@@ -665,7 +672,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
         # Populate src/
         src_dir = project_dir / "src"
         if options["project_type"] != "host_driven" or self._is_fvp(
-            options["zephyr_board"], options.get("use_fvp")
+            zephyr_board, options.get("use_fvp")
         ):
             shutil.copytree(API_SERVER_DIR / "src" / options["project_type"], src_dir)
         else:
diff --git a/cmake/modules/Zephyr.cmake b/cmake/modules/Zephyr.cmake
index be4f85dac33d..644675dcf871 100644
--- a/cmake/modules/Zephyr.cmake
+++ b/cmake/modules/Zephyr.cmake
@@ -29,6 +29,7 @@ if(USE_MICRO)
       "apps/microtvm/zephyr/template_project/src/host_driven *.h -> zephyr/src/host_driven"
       "apps/microtvm/zephyr/template_project/fvp-hack * -> zephyr/fvp-hack"
       "apps/microtvm/zephyr/template_project/qemu-hack * -> zephyr/qemu-hack"
+      "apps/microtvm/zephyr/template_project/app-overlay * -> zephyr/app-overlay"
       "apps/microtvm/zephyr/template_project/crt_config *.h -> zephyr/crt_config"
     )
 
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index 7e09c3c7cfa6..51a80431d37f 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -148,6 +148,7 @@
     "apps/microtvm/zephyr/template_project/qemu-hack/qemu-system-riscv32",
     "apps/microtvm/zephyr/template_project/qemu-hack/qemu-system-riscv64",
     "apps/microtvm/zephyr/template_project/fvp-hack/FVP_Corstone_SSE-300_Ethos-U55",
+    "apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay",
     # microTVM Virtual Machines
     "apps/microtvm/poetry.lock",
     "apps/microtvm/reference-vm/Vagrantfile",

From 8bd81e6fbca3b7c8511b3b24601c37a3cff19864 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Fri, 9 Sep 2022 11:54:38 -0700
Subject: [PATCH 139/704] [TVMScript] Base IRBuilder methods for `PrimFunc`
 (#12745)

Base IRBuilder methods for `PrimFunc`

This PR introduces base IRBuilder methods for `PrimFunc`.

Co-authored-by: yongwww <yongcale@gmail.com>

Co-authored-by: yongwww <yongcale@gmail.com>
---
 include/tvm/script/ir_builder/ir/frame.h      |   2 +
 include/tvm/script/ir_builder/ir/ir.h         |   2 +
 include/tvm/script/ir_builder/tir/frame.h     | 155 ++++++++++++++++++
 include/tvm/script/ir_builder/tir/ir.h        |  48 ++++++
 python/tvm/script/ir_builder/tir/__init__.py  |  18 ++
 python/tvm/script/ir_builder/tir/_ffi_api.py  |  20 +++
 python/tvm/script/ir_builder/tir/frame.py     |  31 ++++
 python/tvm/script/ir_builder/tir/ir.py        |  55 +++++++
 src/script/ir_builder/ir/frame.cc             |   2 +
 src/script/ir_builder/ir/ir.cc                |   2 +
 src/script/ir_builder/tir/frame.cc            |  59 +++++++
 src/script/ir_builder/tir/ir.cc               |  50 ++++++
 src/script/ir_builder/tir/utils.h             |  68 ++++++++
 .../unittest/test_tvmscript_ir_builder_tir.py |  49 ++++++
 14 files changed, 561 insertions(+)
 create mode 100644 include/tvm/script/ir_builder/tir/frame.h
 create mode 100644 include/tvm/script/ir_builder/tir/ir.h
 create mode 100644 python/tvm/script/ir_builder/tir/__init__.py
 create mode 100644 python/tvm/script/ir_builder/tir/_ffi_api.py
 create mode 100644 python/tvm/script/ir_builder/tir/frame.py
 create mode 100644 python/tvm/script/ir_builder/tir/ir.py
 create mode 100644 src/script/ir_builder/tir/frame.cc
 create mode 100644 src/script/ir_builder/tir/ir.cc
 create mode 100644 src/script/ir_builder/tir/utils.h
 create mode 100644 tests/python/unittest/test_tvmscript_ir_builder_tir.py

diff --git a/include/tvm/script/ir_builder/ir/frame.h b/include/tvm/script/ir_builder/ir/frame.h
index 181774bc53bc..887981ccffc8 100644
--- a/include/tvm/script/ir_builder/ir/frame.h
+++ b/include/tvm/script/ir_builder/ir/frame.h
@@ -29,6 +29,7 @@
 namespace tvm {
 namespace script {
 namespace ir_builder {
+namespace ir {
 
 /*!
  * \brief A frame that represents the IRModule frame with functions and global variables.
@@ -64,6 +65,7 @@ class IRModuleFrame : public IRBuilderFrame {
                                                     IRModuleFrameNode);
 };
 
+}  // namespace ir
 }  // namespace ir_builder
 }  // namespace script
 }  // namespace tvm
diff --git a/include/tvm/script/ir_builder/ir/ir.h b/include/tvm/script/ir_builder/ir/ir.h
index 0bd5473c7eaf..f0e7cc6f5c2f 100644
--- a/include/tvm/script/ir_builder/ir/ir.h
+++ b/include/tvm/script/ir_builder/ir/ir.h
@@ -29,6 +29,7 @@
 namespace tvm {
 namespace script {
 namespace ir_builder {
+namespace ir {
 
 /*!
  * \brief The IRModule declaration statement.
@@ -36,6 +37,7 @@ namespace ir_builder {
  */
 TVM_DLL IRModuleFrame IRModule();
 
+}  // namespace ir
 }  // namespace ir_builder
 }  // namespace script
 }  // namespace tvm
diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h
new file mode 100644
index 000000000000..4bfd022af27a
--- /dev/null
+++ b/include/tvm/script/ir_builder/tir/frame.h
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_SCRIPT_IR_BUILDER_TIR_FRAME_H_
+#define TVM_SCRIPT_IR_BUILDER_TIR_FRAME_H_
+
+#include <tvm/script/ir_builder/base.h>
+#include <tvm/script/ir_builder/ir/frame.h>
+#include <tvm/tir/stmt.h>
+
+namespace tvm {
+namespace script {
+namespace ir_builder {
+namespace tir {
+
+/*!
+ * \brief A base frame that represents the TIR fame with body of statements.
+ *
+ * \sa TIRFrame
+ */
+class TIRFrameNode : public IRBuilderFrameNode {
+ public:
+  /*! \brief The Stmt within in this frame. */
+  Array<tvm::tir::Stmt> stmts;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    IRBuilderFrameNode::VisitAttrs(v);
+    v->Visit("stmts", &stmts);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.TIRFrame";
+  TVM_DECLARE_BASE_OBJECT_INFO(TIRFrameNode, IRBuilderFrameNode);
+};
+
+/*!
+ * \brief Managed reference to TIRFrameNode.
+ *
+ * \sa TIRFrameNode
+ */
+class TIRFrame : public IRBuilderFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(TIRFrame, IRBuilderFrame, TIRFrameNode);
+
+ protected:
+  TIRFrame() = default;
+};
+
+/*!
+ * \brief A frame that represents the PrimFunc containing TIR statements.
+ *
+ * \sa PrimFuncFrame
+ */
+class PrimFuncFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The name of the block. */
+  Optional<String> name;
+  /*! \brief Function parameters. */
+  Array<tvm::tir::Var> args;
+  /*! \brief The return type of the function. */
+  Optional<Type> ret_type;
+  /*! \brief Maps some parameters to specific Buffer data structures. */
+  Map<tvm::tir::Var, tvm::tir::Buffer> buffer_map;
+  /*! \brief The buffer map prior to flattening. */
+  Map<tvm::tir::Var, tvm::tir::Buffer> preflattened_buffer_map;
+  /*! \brief Additional attributes storing the meta-data */
+  Optional<Map<String, ObjectRef>> attrs;
+  /*! \brief The variable map bound to thread env. */
+  Map<tvm::tir::Var, tvm::tir::IterVar> env_threads;
+  /*! \brief The buffer allocated in root block. */
+  Array<tvm::tir::Buffer> root_alloc_buffers;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("name", &name);
+    v->Visit("args", &args);
+    v->Visit("ret_type", &ret_type);
+    v->Visit("buffer_map", &buffer_map);
+    v->Visit("preflattened_buffer_map", &preflattened_buffer_map);
+    v->Visit("attrs", &attrs);
+    v->Visit("env_threads", &env_threads);
+    v->Visit("root_alloc_buffers", &root_alloc_buffers);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.PrimFuncFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PrimFuncFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to PrimFuncFrameNode.
+ *
+ * \sa PrimFuncFrameNode
+ */
+class PrimFuncFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(PrimFuncFrame, TIRFrame, PrimFuncFrameNode);
+};
+
+/*!
+ * \brief A frame that represents the assert statement. Proceeds if the condition is true,
+ * otherwise aborts with the message.
+ *
+ * \sa AssertFrame
+ */
+class AssertFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The PrimExpr to test. */
+  PrimExpr condition;
+  /*! \brief The output error message when the assertion failed. */
+  PrimExpr message;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("condition", &condition);
+    v->Visit("message", &message);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.AssertFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(AssertFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+}  // namespace tir
+}  // namespace ir_builder
+}  // namespace script
+}  // namespace tvm
+
+#endif  // TVM_SCRIPT_IR_BUILDER_TIR_FRAME_H_
diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
new file mode 100644
index 000000000000..cee60ad4f827
--- /dev/null
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_SCRIPT_IR_BUILDER_TIR_IR_H_
+#define TVM_SCRIPT_IR_BUILDER_TIR_IR_H_
+
+#include <tvm/script/ir_builder/base.h>
+#include <tvm/script/ir_builder/tir/frame.h>
+#include <tvm/tir/op.h>
+
+namespace tvm {
+namespace script {
+namespace ir_builder {
+namespace tir {
+
+/*!
+ * \brief The primitive function statement.
+ * \return The PrimFuncFrame.
+ */
+PrimFuncFrame PrimFunc();
+
+/*!
+ * \brief Evaluate the input expression.
+ * \param value The input expression to evaluate.
+ */
+void Evaluate(PrimExpr value);
+
+}  // namespace tir
+}  // namespace ir_builder
+}  // namespace script
+}  // namespace tvm
+
+#endif  // TVM_SCRIPT_IR_BUILDER_TIR_IR_H_
diff --git a/python/tvm/script/ir_builder/tir/__init__.py b/python/tvm/script/ir_builder/tir/__init__.py
new file mode 100644
index 000000000000..1e43d1af3498
--- /dev/null
+++ b/python/tvm/script/ir_builder/tir/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Package tvm.script.ir_builder.tir"""
+from .ir import *  # pylint: disable=wildcard-import,redefined-builtin
diff --git a/python/tvm/script/ir_builder/tir/_ffi_api.py b/python/tvm/script/ir_builder/tir/_ffi_api.py
new file mode 100644
index 000000000000..876f5f3a35a0
--- /dev/null
+++ b/python/tvm/script/ir_builder/tir/_ffi_api.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""FFI APIs"""
+import tvm._ffi
+
+tvm._ffi._init_api("script.ir_builder.tir", __name__)  # pylint: disable=protected-access
diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py
new file mode 100644
index 000000000000..61418e0b2aa6
--- /dev/null
+++ b/python/tvm/script/ir_builder/tir/frame.py
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""IRBuilder for TIR"""
+
+from tvm._ffi import register_object as _register_object
+
+from ..base import IRBuilderFrame
+
+
+@_register_object("script.ir_builder.tir.TIRFrame")
+class TIRFrame(IRBuilderFrame):
+    ...
+
+
+@_register_object("script.ir_builder.tir.PrimFuncFrame")
+class PrimFuncFrame(TIRFrame):
+    ...
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
new file mode 100644
index 000000000000..ae5d5b260f65
--- /dev/null
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-docstring
+"""IRBuilder for TIR"""
+
+from tvm.tir import PrimExpr, StringImm
+
+from . import _ffi_api, frame
+
+
+def prim_func() -> frame.PrimFuncFrame:
+    """The primitive function statement.
+
+    Returns
+    -------
+    res : frame.PrimFuncFrame
+        The PrimFuncFrame.
+    """
+    return _ffi_api.PrimFunc()  # pylint: disable=no-member # type: ignore
+
+
+def evaluate(value: PrimExpr) -> None:
+    """Evaluate the input expression.
+
+    Parameters
+    ----------
+    value: PrimExpr
+        The input expression to evaluate.
+    """
+    if isinstance(value, str):
+        value = StringImm(value)
+    return _ffi_api.Evaluate(value)  # pylint: disable=no-member # type: ignore
+
+
+# pylint: enable=invalid-name
+
+
+__all__ = [
+    "evaluate",
+    "prim_func",
+]
diff --git a/src/script/ir_builder/ir/frame.cc b/src/script/ir_builder/ir/frame.cc
index c85e30544aca..a81c56922dff 100644
--- a/src/script/ir_builder/ir/frame.cc
+++ b/src/script/ir_builder/ir/frame.cc
@@ -23,6 +23,7 @@
 namespace tvm {
 namespace script {
 namespace ir_builder {
+namespace ir {
 
 void IRModuleFrameNode::ExitWithScope() {
   ICHECK_EQ(functions.size(), global_vars.size());
@@ -38,6 +39,7 @@ void IRModuleFrameNode::ExitWithScope() {
 
 TVM_REGISTER_NODE_TYPE(IRModuleFrameNode);
 
+}  // namespace ir
 }  // namespace ir_builder
 }  // namespace script
 }  // namespace tvm
diff --git a/src/script/ir_builder/ir/ir.cc b/src/script/ir_builder/ir/ir.cc
index bcd21de144bb..a8cc452e4f0c 100644
--- a/src/script/ir_builder/ir/ir.cc
+++ b/src/script/ir_builder/ir/ir.cc
@@ -23,6 +23,7 @@
 namespace tvm {
 namespace script {
 namespace ir_builder {
+namespace ir {
 
 IRModuleFrame IRModule() {
   ObjectPtr<IRModuleFrameNode> n = make_object<IRModuleFrameNode>();
@@ -33,6 +34,7 @@ IRModuleFrame IRModule() {
 
 TVM_REGISTER_GLOBAL("script.ir_builder.ir.IRModule").set_body_typed(IRModule);
 
+}  // namespace ir
 }  // namespace ir_builder
 }  // namespace script
 }  // namespace tvm
diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc
new file mode 100644
index 000000000000..139c8193b0ba
--- /dev/null
+++ b/src/script/ir_builder/tir/frame.cc
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/script/ir_builder/tir/frame.h>
+#include <tvm/tir/function.h>
+
+#include "../../../tir/ir/script/script_complete.h"
+#include "./utils.h"
+
+namespace tvm {
+namespace script {
+namespace ir_builder {
+namespace tir {
+
+void PrimFuncFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  tvm::tir::PrimFunc func(
+      /*params=*/args,
+      /*body=*/AsStmt(stmts),
+      /*ret_type=*/ret_type.value_or(TupleType::Empty()),
+      /*buffer_map=*/buffer_map,
+      /*preflattened_buffer_map=*/preflattened_buffer_map,
+      /*attrs=*/attrs.defined() ? DictAttrs(attrs.value()) : NullValue<DictAttrs>());
+  func = tvm::tir::ScriptComplete(func, root_alloc_buffers);
+  IRBuilder builder = IRBuilder::Current();
+  if (builder->frames.empty()) {
+    ICHECK(!builder->result.defined()) << "ValueError: Builder.result has already been set";
+    builder->result = func;
+  } else if (Optional<ir::IRModuleFrame> opt_frame = builder->FindFrame<ir::IRModuleFrame>()) {
+    ir::IRModuleFrame frame = opt_frame.value();
+    frame->global_vars.push_back(GlobalVar(name.value_or("")));
+    frame->functions.push_back(func);
+  } else {
+    LOG(FATAL) << "ValueError: Cannot find where to insert PrimFunc";
+  }
+}
+
+TVM_REGISTER_NODE_TYPE(TIRFrameNode);
+TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode);
+
+}  // namespace tir
+}  // namespace ir_builder
+}  // namespace script
+}  // namespace tvm
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
new file mode 100644
index 000000000000..5f994d71ca0a
--- /dev/null
+++ b/src/script/ir_builder/tir/ir.cc
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/arith/analyzer.h>
+#include <tvm/script/ir_builder/tir/ir.h>
+
+#include "./utils.h"
+
+namespace tvm {
+namespace script {
+namespace ir_builder {
+namespace tir {
+
+using tvm::tir::IterVar;
+
+PrimFuncFrame PrimFunc() {
+  ObjectPtr<PrimFuncFrameNode> n = make_object<PrimFuncFrameNode>();
+  n->name = NullOpt;
+  n->args.clear();
+  n->ret_type = NullOpt;
+  n->buffer_map.clear();
+  n->preflattened_buffer_map.clear();
+  n->attrs = NullOpt;
+  n->env_threads.clear();
+  n->root_alloc_buffers.clear();
+  return PrimFuncFrame(n);
+}
+
+void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); }
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.PrimFunc").set_body_typed(PrimFunc);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate);
+}  // namespace tir
+}  // namespace ir_builder
+}  // namespace script
+}  // namespace tvm
diff --git a/src/script/ir_builder/tir/utils.h b/src/script/ir_builder/tir/utils.h
new file mode 100644
index 000000000000..47557917cca5
--- /dev/null
+++ b/src/script/ir_builder/tir/utils.h
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_SCRIPT_IR_BUILDER_TIR_UTILS_H_
+#define TVM_SCRIPT_IR_BUILDER_TIR_UTILS_H_
+
+#include <tvm/script/ir_builder/tir/frame.h>
+#include <tvm/script/ir_builder/tir/ir.h>
+#include <tvm/tir/stmt.h>
+
+namespace tvm {
+namespace script {
+namespace ir_builder {
+namespace tir {
+
+inline void AddToParent(tvm::tir::Stmt stmt) {
+  IRBuilder builder = IRBuilder::Current();
+  if (builder->frames.empty()) {
+    ICHECK(!builder->result.defined()) << "ValueError: Builder.result has already been set";
+    builder->result = stmt;
+  } else if (const auto* tir_frame = builder->frames.back().as<TIRFrameNode>()) {
+    GetRef<TIRFrame>(tir_frame)->stmts.push_back(stmt);
+  } else {
+    LOG(FATAL) << "TypeError: Unsupported frame type: " << builder->frames.back();
+  }
+}
+
+inline tvm::tir::Stmt AsStmt(const Array<tvm::tir::Stmt>& stmt) {
+  using namespace tvm::tir;
+  if (stmt.empty()) {
+    return tvm::tir::Evaluate(0);
+  } else if (stmt.size() == 1) {
+    return stmt[0];
+  } else {
+    return SeqStmt(stmt);
+  }
+}
+
+inline PrimFuncFrame FindPrimFuncFrame(const String& method) {
+  if (Optional<PrimFuncFrame> frame = IRBuilder::Current()->GetLastFrame<PrimFuncFrame>()) {
+    return frame.value();
+  }
+  LOG(FATAL) << "ValueError: PrimFunc frame not find. Please ensure '" << method
+             << "' is called under T.prim_func()";
+  throw;
+}
+
+}  // namespace tir
+}  // namespace ir_builder
+}  // namespace script
+}  // namespace tvm
+
+#endif  // TVM_SCRIPT_IR_BUILDER_TIR_UTILS_H_
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
new file mode 100644
index 000000000000..70a8f3565d03
--- /dev/null
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, missing-docstring
+"""Unittests for tvm.script.ir_builder.tir"""
+import pytest
+import tvm.testing
+import tvm
+from tvm import tir
+from tvm.script.ir_builder import tir as T
+from tvm.script.ir_builder import IRBuilder
+from tvm.ir.base import assert_structural_equal
+
+
+def test_ir_builder_tir_primfunc():
+    with IRBuilder() as ib:
+        with T.prim_func():
+            T.evaluate(0)
+    # the prim_func generated by IRBuilder
+    prim_func_actual = ib.get()
+
+    # the expected prim_func
+    prim_func_expected = tir.PrimFunc(
+        params=[],
+        body=tir.Evaluate(0),
+        ret_type=None,
+        buffer_map=None,
+        preflattened_buffer_map=None,
+        attrs=None,
+    )
+    # Check if the generated ir is expected
+    assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 14999f8add61b1a81a0f733ba12aadf2b8057279 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 9 Sep 2022 13:59:55 -0500
Subject: [PATCH 140/704] [TVMScript][TIR] Clarify scope of
 BlockNode::iter_vars (#12726)

Previously, it was ambiguous whether `BlockNode::iter_vars` were
in-scope for `BlockRealizeNode::predicate`.  `ConvertBlocksToOpaque`
treated them as in-scope, and applied a mapping from `iter_vars` to
`iter_values`.  Similarly, TVMScript printing places `T.where`
statements below the `T.axis` statements, where `T.axis` definitions
are in scope.  However, `BlockRealizeNode::SEqualReduce` and
`BlockRealizeNode::SHashReduce` do not visit the block and `iter_vars`
until after visiting the predicate, placing the `iter_vars` out of
scope.

This commit updates the printing of `T.where` to be above `T.axis`,
and updates `ConvertBlocksToOpaque` to report an error if the
predicate contains references to `BlockNode::iter_vars`.  After this
commit, these three usages all consistently treat
`BlockNode::iter_vars` as out of scope for
`BlockRealizeNode::predicate`.
---
 src/printer/tvmscript_printer.cc              | 24 ++++++++----
 .../transforms/convert_blocks_to_opaque.cc    | 39 +++++++++++++++----
 ..._tir_transform_convert_blocks_to_opaque.py | 16 +++++++-
 3 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index 5da81de4dc5d..20720373589f 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -283,6 +283,7 @@ class TVMScriptPrinter : public StmtFunctor<Doc(const Stmt&)>,
   Doc AllocBufferDeclaration(const Buffer& buf);
   Doc PrintBlockVar(const IterVar& iter_var, const PrimExpr& value);
   Doc PrintBlockVarRemaps();
+  Doc PrintBlockPredicate(const BlockRealizeNode* op);
   Doc PrintBlockVars(const BlockRealizeNode* op);
   Doc PrintBlockAttr(const BlockRealizeNode* op);
   Doc PrintExpandedArray(const ArrayNode* op);
@@ -1417,6 +1418,14 @@ Doc TVMScriptPrinter::PrintBlockVarRemaps() {
   return doc;
 }
 
+Doc TVMScriptPrinter::PrintBlockPredicate(const BlockRealizeNode* op) {
+  Doc doc;
+  if (!is_one(op->predicate)) {
+    doc << Doc::NewLine() << tir_prefix_ << ".where(" << Print(op->predicate) << ")";
+  }
+  return doc;
+}
+
 Doc TVMScriptPrinter::PrintBlockVars(const BlockRealizeNode* op) {
   Doc doc;
   const auto* block_op = op->block.as<BlockNode>();
@@ -1457,10 +1466,7 @@ Doc TVMScriptPrinter::PrintBlockVars(const BlockRealizeNode* op) {
 Doc TVMScriptPrinter::PrintBlockAttr(const BlockRealizeNode* op) {
   const auto* block_op = op->block.as<BlockNode>();
   Doc block_attr_doc;
-  // print predicate, binding, read/write tensor region, annotations
-  if (!is_one(op->predicate)) {
-    block_attr_doc << Doc::NewLine() << tir_prefix_ << ".where(" << Print(op->predicate) << ")";
-  }
+  // print binding, read/write tensor region, annotations
   block_attr_doc << Doc::NewLine() << tir_prefix_ << ".reads("
                  << PrintExpandedArray(block_op->reads.as<ArrayNode>()) << ")";
   block_attr_doc << Doc::NewLine() << tir_prefix_ << ".writes("
@@ -1523,14 +1529,18 @@ Doc TVMScriptPrinter::PrintBlockName(const BlockNode* block_op) {
 Doc TVMScriptPrinter::VisitStmt_(const BlockRealizeNode* op) {
   const auto* block_op = op->block.as<BlockNode>();
   Doc doc = PrintOptionalInfo(GetRef<Stmt>(block_op));
-  // print block name and block vars
+  // print block name
   doc << PrintBlockName(block_op);
+  // Print block predicate.
+  Doc block_predicate = PrintBlockPredicate(op);
+  // Print the variable bindings, valid to use in block attributes and
+  // body
   Doc block_var = PrintBlockVars(op);
-  // print predicate, binding, read/write tensor region, annotations
+  // print read/write tensor region, annotations
   Doc block_attr_doc = PrintBlockAttr(op);
   // print body
   Doc body = PrintBlockBody(block_op);
-  doc << Doc::Indent(4, block_var << block_attr_doc << Doc::NewLine() << body);
+  doc << Doc::Indent(4, block_predicate << block_var << block_attr_doc << Doc::NewLine() << body);
   for (const auto& iter_var : block_op->iter_vars) {
     TryDeallocVar(iter_var->var);
   }
diff --git a/src/tir/transforms/convert_blocks_to_opaque.cc b/src/tir/transforms/convert_blocks_to_opaque.cc
index ddc2e1756908..95648713494c 100644
--- a/src/tir/transforms/convert_blocks_to_opaque.cc
+++ b/src/tir/transforms/convert_blocks_to_opaque.cc
@@ -45,6 +45,10 @@ class OpaqueBlockConverter : public StmtExprMutator {
   OpaqueBlockConverter() = default;
 
   PrimExpr VisitExpr_(const VarNode* var) final {
+    CHECK(!forbidden_iter_vars_.count(var))
+        << "Variable " << var->name_hint << " occurs in the predicate or iter_values of a block, "
+        << "but isn't defined until the body of the block";
+
     auto it = var_substitutes_.find(var);
     if (it != var_substitutes_.end()) {
       return it->second;
@@ -65,23 +69,42 @@ class OpaqueBlockConverter : public StmtExprMutator {
   Stmt VisitStmt_(const BlockRealizeNode* realize) final {
     const auto* block_op = realize->block.get();
     ICHECK(!block_op->init.defined());
-    // Step 1. Update "block vars => binding values" for substitution.
-    ICHECK_EQ(block_op->iter_vars.size(), realize->iter_values.size());
+
+    // Step 1. Visit the predicate and iter_values, without any variable bindings
+    for (const auto& iter : block_op->iter_vars) forbidden_iter_vars_.insert(iter->var.get());
+    PrimExpr predicate = VisitExpr(realize->predicate);
+    Array<PrimExpr> iter_values = realize->iter_values;
+    iter_values.MutateByApply([this](PrimExpr expr) { return VisitExpr(std::move(expr)); });
+    for (const auto& iter : block_op->iter_vars) forbidden_iter_vars_.erase(iter->var.get());
+
+    // Step 2. Update "block vars => binding values" for substitution.
+    ICHECK_EQ(block_op->iter_vars.size(), iter_values.size());
     for (int i = 0, n = block_op->iter_vars.size(); i < n; ++i) {
       IterVar block_var = block_op->iter_vars[i];
-      PrimExpr v = this->VisitExpr(realize->iter_values[i]);
+      PrimExpr v = this->VisitExpr(iter_values[i]);
       var_substitutes_.emplace(block_var->var.get(), v);
     }
-    // Step 2. Visit recursively.
-    BlockRealize new_realize = Downcast<BlockRealize>(StmtExprMutator::VisitStmt_(realize));
-    if (!new_realize->iter_values.empty()) {
-      new_realize.CopyOnWrite()->iter_values.clear();
+    // Step 3. Visit recursively.
+    Block new_block = Downcast<Block>(VisitStmt(realize->block));
+
+    // Step 4. Clear the variable bindings
+    for (const auto& block_var : block_op->iter_vars) {
+      var_substitutes_.erase(block_var->var.get());
+    }
+
+    // Step 5. Return
+    if (predicate.same_as(realize->predicate) && iter_values.same_as(realize->iter_values) &&
+        new_block.same_as(realize->block) && realize->iter_values.size() == 0) {
+      return GetRef<BlockRealize>(realize);
+    } else {
+      return BlockRealize({}, predicate, new_block);
     }
-    return std::move(new_realize);
   }
 
   /*! \brief The map from block vars to their binding values. */
   std::unordered_map<const VarNode*, PrimExpr> var_substitutes_;
+  /* \brief Variables that may not occur in the current context */
+  std::unordered_set<const VarNode*> forbidden_iter_vars_;
 };
 
 PrimFunc ConvertBlocksToOpaque(PrimFunc f) {
diff --git a/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py b/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py
index 6859a5d75b75..297943bc1381 100644
--- a/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py
+++ b/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py
@@ -82,6 +82,18 @@ def test_lower_te():
     tvm.ir.assert_structural_equal(mod, orig_mod)  # ConvertBlocksToOpaque should do nothing on TE
 
 
+class TestErrorIfPredicateUsesBlockVariables(tvm.testing.CompareBeforeAfter):
+    transform = tvm.tir.transform.ConvertBlocksToOpaque()
+
+    def before(A: T.Buffer[8, "int32"]):
+        for i in T.serial(8):
+            with T.block():
+                vi = T.axis.remap("S", [i])
+                T.where(vi < 6)
+                T.evaluate(0)
+
+    expected = tvm.TVMError
+
+
 if __name__ == "__main__":
-    test_elementwise()
-    test_lower_te()
+    tvm.testing.main()

From 574794e915ba424db05e1ddcf2218f37b2b65764 Mon Sep 17 00:00:00 2001
From: Matveenko Valery <50880524+valmat07@users.noreply.github.com>
Date: Fri, 9 Sep 2022 21:01:53 +0200
Subject: [PATCH 141/704] [OpenCL] Enable OpenCL for GPU tests (#12490)

* Add opencl target in test build script

* Fix fp16 test and compile test for opencl

* fix lint

* Fix relay OpenCL texture tests

* Fix lint

* Enable relay OpenCL tests

* Fix opencl relay texture tests

* fix lint

* Remove OpenCL gtest variable

* Fix unbound variable

* Skip tests that are not supported in CI

* fix lint

* Add path for opencl gtest directory

* Fix opencl gtests include directory

* Enable OpenCL googletest. Fix bug in opencl timer test

* testing fix for build cpp tests

* update googletest git version for opencl tests build

* update cmakelist

* Update CMakeList

* Update CMakeList

* Disable opencl googletests

* update Opecnl.cmake

* fix Opecnl.cmake

* Apply comments. Remove xfail decerator for opencl tests. Now specific tests are skipped in the environment script

* minor code changes

* apply comments

* apply comment

* skip test in ci by decorator

* fix pytest skipif warnings

* Fix skipif for opencl gtests
---
 src/runtime/opencl/opencl_common.h            |   2 +-
 tests/cpp-runtime/opencl/opencl_timer_test.cc |   1 +
 tests/cpp-runtime/opencl/run_gtests.cc        |   2 +-
 .../contrib/test_opencl/test_run_gtests.py    |   1 +
 tests/python/driver/tvmc/test_compiler.py     |   3 +-
 .../test_conv2d_nchw_texture.py               | 107 +++++++-----------
 .../test_conv2d_nhwc_texture.py               |  92 ++++++---------
 .../test_depthwise_conv2d_nchw_texture.py     |  26 ++---
 .../test_depthwise_conv2d_nhwc_texture.py     |  32 ++----
 .../utils/adreno_utils.py                     |   0
 .../unittest/test_target_codegen_vulkan.py    |   3 +
 tests/scripts/task_config_build_gpu.sh        |   1 +
 tests/scripts/task_python_integration.sh      |   6 +-
 .../task_python_integration_gpuonly.sh        |   3 +-
 14 files changed, 112 insertions(+), 167 deletions(-)
 rename tests/python/relay/{ => opencl_texture}/test_conv2d_nchw_texture.py (90%)
 rename tests/python/relay/{ => opencl_texture}/test_conv2d_nhwc_texture.py (87%)
 rename tests/python/relay/{ => opencl_texture}/test_depthwise_conv2d_nchw_texture.py (91%)
 rename tests/python/relay/{ => opencl_texture}/test_depthwise_conv2d_nhwc_texture.py (91%)
 rename tests/python/relay/{ => opencl_texture}/utils/adreno_utils.py (100%)

diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index f16e1e936d96..7f7f083cf303 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -439,9 +439,9 @@ class OpenCLTimerNode : public TimerNode {
  public:
   // Timer start
   virtual void Start() {
+    this->duration = 0;
     if (count_timer_execs == 0) {
       cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).clear();
-      this->duration = 0;
       // Very first call of Start() leads to the recreation of
       // OpenCL command queue in profiling mode. This allows to run profile after inference.
       recreateCommandQueue();
diff --git a/tests/cpp-runtime/opencl/opencl_timer_test.cc b/tests/cpp-runtime/opencl/opencl_timer_test.cc
index 40ec65d8dfe2..6faf2f6a1482 100644
--- a/tests/cpp-runtime/opencl/opencl_timer_test.cc
+++ b/tests/cpp-runtime/opencl/opencl_timer_test.cc
@@ -46,6 +46,7 @@ TEST(OpenCLTimerNode, nested_timers) {
     cl_mem cl_buf = clCreateBuffer(workspace->context, CL_MEM_READ_ONLY, BUFF_SIZE * sizeof(cl_int),
                                    NULL, &err);
     OPENCL_CHECK_ERROR(err);
+    queue = workspace->GetQueue(thr->device);
     OPENCL_CALL(clEnqueueWriteBuffer(queue, cl_buf, false, 0, BUFF_SIZE * sizeof(cl_int), tmp_buf,
                                      0, NULL, &ev));
     OPENCL_CALL(clReleaseMemObject(cl_buf));
diff --git a/tests/cpp-runtime/opencl/run_gtests.cc b/tests/cpp-runtime/opencl/run_gtests.cc
index b16ae3efc74d..ffe86a7f52c0 100644
--- a/tests/cpp-runtime/opencl/run_gtests.cc
+++ b/tests/cpp-runtime/opencl/run_gtests.cc
@@ -40,7 +40,7 @@ TVM_REGISTER_GLOBAL("opencl.run_gtests").set_body([](TVMArgs args, TVMRetValue*
   argv.push_back(const_cast<char*>("opencl_run_gtests"));
 
   // add parsed arguments
-  for (int i = 0; i < parsed_args.size(); ++i) {
+  for (size_t i = 0; i < parsed_args.size(); ++i) {
     argv.push_back(const_cast<char*>(parsed_args[i].data()));
   }
 
diff --git a/tests/python/contrib/test_opencl/test_run_gtests.py b/tests/python/contrib/test_opencl/test_run_gtests.py
index 4afcf7ee8d66..ee59086b25f1 100644
--- a/tests/python/contrib/test_opencl/test_run_gtests.py
+++ b/tests/python/contrib/test_opencl/test_run_gtests.py
@@ -28,6 +28,7 @@
 # for example to run all "foo" tests twice and observe gtest output run
 # pytest -sv <this file> --gtests_args="--gtest_filter=*foo* --gtest_repeat=2"
 @tvm.testing.requires_opencl
+@pytest.mark.skipif(tvm.testing.utils.IS_IN_CI, reason="failed due to nvidia libOpencl in the CI")
 def test_run_gtests(gtest_args):
     if (
         "TVM_TRACKER_HOST" in os.environ
diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py
index 27cd78d436c7..5535fc02249f 100644
--- a/tests/python/driver/tvmc/test_compiler.py
+++ b/tests/python/driver/tvmc/test_compiler.py
@@ -367,8 +367,9 @@ def test_compile_opencl(tflite_mobilenet_v1_0_25_128):
     tvmc_model = tvmc.load(tflite_mobilenet_v1_0_25_128)
     tvmc_package = tvmc.compile(
         tvmc_model,
-        target="opencl --host=llvm",
+        target="opencl -host=llvm",
         desired_layout="NCHW",
+        dump_code="asm",
     )
     dumps_path = tvmc_package.package_path + ".asm"
 
diff --git a/tests/python/relay/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
similarity index 90%
rename from tests/python/relay/test_conv2d_nchw_texture.py
rename to tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
index ab12e40b39cb..504a2b4e3ed3 100644
--- a/tests/python/relay/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
@@ -22,13 +22,15 @@
 from tvm.relay import testing
 from tvm.contrib import utils
 from utils.adreno_utils import gpu_preprocess, build_run_compare
+import pytest
 
 
-@tvm.testing.requires_opencl
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad():
-    target = "opencl --device=adreno"
-    dtype = "float16"
+dtype = tvm.testing.parameter("float32")
+
 
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(target, dtype):
     input_shape = (1, 32, 42, 42)
     filter_shape = (96, 32, 3, 3)
     bias_shape = (1, 96, 1, 1)
@@ -67,10 +69,8 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(target, dtype):
     input_shape = (1, 32, 40, 40)
     filter_shape = (96, 32, 2, 2)
     bias_shape = (1, 96, 1, 1)
@@ -109,10 +109,8 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_inceptionv3_35_35_strides():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_inceptionv3_35_35_strides(target, dtype):
     input_shape = (1, 48, 35, 35)
     filter_shape = (64, 48, 5, 5)
     bias_shape = (1, 64, 1, 1)
@@ -151,10 +149,8 @@ def test_conv2d_inceptionv3_35_35_strides():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_resnet50_v2_nchw_3c():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_resnet50_v2_nchw_3c(target, dtype):
     input_shape = (1, 3, 224, 224)
     filter_shape = (64, 3, 7, 7)
     bias_shape = (1, 64, 1, 1)
@@ -194,10 +190,8 @@ def test_conv2d_resnet50_v2_nchw_3c():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_inceptionv3_nchw_3c():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_inceptionv3_nchw_3c(target, dtype):
     input_shape = (1, 3, 299, 299)
     filter_shape = (64, 3, 3, 3)
     bias_shape = (1, 64, 1, 1)
@@ -236,10 +230,8 @@ def test_conv2d_inceptionv3_nchw_3c():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_1x1_16c16spatial():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_1x1_16c16spatial(target, dtype):
     input_shape = (1, 16, 256, 256)
     filter_shape = (32, 16, 4, 4)
     bias_shape = (1, 32, 1, 1)
@@ -278,10 +270,8 @@ def test_conv2d_1x1_16c16spatial():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_4x4_16c16pad():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_4x4_16c16pad(target, dtype):
     input_shape = (1, 32, 256, 256)
     filter_shape = (32, 32, 4, 4)
     bias_shape = (1, 32, 1, 1)
@@ -320,10 +310,8 @@ def test_conv2d_4x4_16c16pad():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_4x4x4_16c16pad():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_4x4x4_16c16pad(target, dtype):
     input_shape = (1, 32, 256, 256)
     filter_shape = (4, 32, 4, 4)
     bias_shape = (1, 4, 1, 1)
@@ -362,10 +350,8 @@ def test_conv2d_4x4x4_16c16pad():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_yolov3_v2_nchw_3c():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_yolov3_v2_nchw_3c(target, dtype):
     input_shape = (1, 1024, 13, 13)
     filter_shape = (255, 1024, 1, 1)
     A = relay.var("data", shape=input_shape, dtype=dtype)
@@ -397,10 +383,8 @@ def test_conv2d_yolov3_v2_nchw_3c():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_vgg16_winograd_4d():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_vgg16_winograd_4d(target, dtype):
     input_shape = (1, 512, 28, 28)
     filter_shape = (512, 512, 3, 3)
     bias_shape = (1, 512, 1, 1)
@@ -437,7 +421,7 @@ def test_conv2d_vgg16_winograd_4d():
     stat_file = temp.relpath("stat.log")
     with open(stat_file, "w") as f:
         f.write(
-            '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n'
+            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 512, 28, 28], "{dtype}"], ["TENSOR", [512, 512, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
         )
     graph = build_run_compare(
         mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
@@ -447,10 +431,8 @@ def test_conv2d_vgg16_winograd_4d():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_winograd_conv():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_winograd_conv(target, dtype):
     input_shape = (1, 4, 3, 3)
     A = relay.var("data", shape=input_shape, dtype=dtype)
     filter_shape3 = (8, 4, 3, 3)
@@ -486,7 +468,7 @@ def test_conv2d_winograd_conv():
     stat_file = temp.relpath("stat.log")
     with open(stat_file, "w") as f:
         f.write(
-            '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 4, 3, 3], "float16"], ["TENSOR", [8, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n'
+            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 4, 3, 3], "{dtype}"], ["TENSOR", [8, 4, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
         )
     graph = build_run_compare(
         mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
@@ -496,7 +478,9 @@ def test_conv2d_winograd_conv():
 
 
 @tvm.testing.requires_opencl
-def test_residual_block():
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+@pytest.mark.skipif(tvm.testing.utils.IS_IN_CI, reason="failed due to nvidia libOpencl in the CI")
+def test_residual_block(target, dtype):
     """
     - some kind of residual block followed by convolution to have texture after residual block
     - scalar data type verification which should be mapped to global memory scope
@@ -515,9 +499,6 @@ def test_residual_block():
                      |                      <- buffer
                layout_transform (NCHW4c->NCHW)
     """
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
     input_shape = (1, 32, 40, 40)
     filter_shape1 = (32, 32, 2, 2)
     filter_shape2 = (32, 32, 1, 1)
@@ -555,7 +536,7 @@ def test_residual_block():
         kernel_size=(1, 1),
     )
     D = relay.op.add(conv2, D)
-    D = D * relay.const(0.15, "float16")
+    D = D * relay.const(0.15, dtype)
     D = relay.op.nn.relu(D)
 
     conv3 = relay.nn.conv2d(
@@ -607,7 +588,8 @@ def test_residual_block():
 
 
 @tvm.testing.requires_opencl
-def test_concat():
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_concat(target, dtype):
     """
         layout_transform (NCHW->NCHW4c)
                   |                      <- buffer
@@ -619,9 +601,6 @@ def test_concat():
                      |                   <- buffer
                layout_transform (NCHW4c->NCHW)
     """
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
     input_shape = (1, 32, 40, 40)
     filter_shape1 = (96, 32, 2, 2)
     filter_shape2 = (32, 96, 2, 2)
@@ -721,7 +700,8 @@ def test_concat():
 
 
 @tvm.testing.requires_opencl
-def test_pooling_branching_texture_params():
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_pooling_branching_texture_params(target, dtype):
     """
     Verification of the pooling and many branches having textures
                 layout_transform (NCHW->NCHW4c)
@@ -738,9 +718,6 @@ def test_pooling_branching_texture_params():
                              |                   <- buffer
                     layout_transform (NCHW4c->NCHW)
     """
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
     input_shape = (1, 32, 40, 40)
     filter_shape0 = (32, 32, 1, 1)
     filter_shape1 = (32, 32, 2, 2)
@@ -849,7 +826,8 @@ def test_pooling_branching_texture_params():
 
 
 @tvm.testing.requires_opencl
-def test_branching_texture_params():
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_branching_texture_params(target, dtype):
     """
     Verification of passing texture to several consumers markup of relay variables in
     primary functions + on_device
@@ -866,9 +844,6 @@ def test_branching_texture_params():
                             |                   <- buffer
                     layout_transform (NCHW4c->NCHW)
     """
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
     input_shape = (1, 32, 40, 40)
     filter_shape0 = (32, 32, 1, 1)
     filter_shape1 = (32, 32, 2, 2)
@@ -976,7 +951,8 @@ def test_branching_texture_params():
 
 # function repeat, params scope are different in reused functions
 @tvm.testing.requires_opencl
-def test_conv2d_different_lowering_same_op():
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_different_lowering_same_op(target, dtype):
     """
     Use case for verification of caching compiled functions
     Three convolutions following by each other in this case should be
@@ -993,9 +969,6 @@ def test_conv2d_different_lowering_same_op():
                          |                      <- buffer
                     layout_transform (NCHW4c->NCHW)
     """
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
     input_shape = (1, 32, 40, 40)
     filter_shape1 = (32, 32, 1, 1)
     A = relay.var("data", shape=input_shape, dtype=dtype)
diff --git a/tests/python/relay/test_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
similarity index 87%
rename from tests/python/relay/test_conv2d_nhwc_texture.py
rename to tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
index cf8116c076cc..37c22137f035 100644
--- a/tests/python/relay/test_conv2d_nhwc_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
@@ -23,13 +23,15 @@
 from tvm.relay import testing
 from tvm.contrib import utils
 from utils.adreno_utils import gpu_preprocess, build_run_compare
+import pytest
 
 
-@tvm.testing.requires_opencl
-def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16():
-    target = "opencl --device=adreno"
-    dtype = "float16"
+dtype = tvm.testing.parameter("float32")
+
 
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(target, dtype):
     input_shape = (1, 257, 257, 32)
     filter_shape = (1, 1, 32, 16)
     bias_shape = (filter_shape[-1],)
@@ -65,10 +67,8 @@ def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(target, dtype):
     input_shape = (1, 257, 257, 32)
     filter_shape = (1, 1, 32, 16)
     bias_shape = (filter_shape[-1],)
@@ -107,10 +107,8 @@ def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_4_35_35_32x3_3_144_16():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_4_35_35_32x3_3_144_16(target, dtype):
     input_shape = (4, 35, 35, 32)
     filter_shape = (3, 3, 32, 16)
     bias_shape = (filter_shape[-1],)
@@ -147,10 +145,8 @@ def test_conv2d_4_35_35_32x3_3_144_16():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(target, dtype):
     input_shape = (1, 513, 513, 3)
     filter_shape = (3, 3, 3, 32)
     bias_shape = (filter_shape[-1],)
@@ -187,10 +183,8 @@ def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(target, dtype):
     input_shape = (1, 42, 42, 32)
     filter_shape = (3, 3, 32, 96)
     bias_shape = (1, 1, 1, 96)
@@ -229,10 +223,8 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(target, dtype):
     input_shape = (1, 40, 40, 32)
     filter_shape = (2, 2, 32, 96)
     bias_shape = (1, 1, 1, 96)
@@ -271,10 +263,8 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_inceptionv3_35_35_strides():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_inceptionv3_35_35_strides(target, dtype):
     input_shape = (1, 35, 35, 48)
     filter_shape = (5, 5, 48, 64)
     bias_shape = (1, 1, 1, 64)
@@ -313,10 +303,8 @@ def test_conv2d_inceptionv3_35_35_strides():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_resnet50_v2_nhwc_3c():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_resnet50_v2_nhwc_3c(target, dtype):
     input_shape = (1, 224, 224, 3)
     filter_shape = (7, 7, 3, 64)
     bias_shape = (1, 1, 1, 64)
@@ -356,10 +344,8 @@ def test_conv2d_resnet50_v2_nhwc_3c():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_inceptionv3_nhwc_3c():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_inceptionv3_nhwc_3c(target, dtype):
     input_shape = (1, 299, 299, 3)
     filter_shape = (3, 3, 3, 64)
     bias_shape = (1, 1, 1, 64)
@@ -398,11 +384,9 @@ def test_conv2d_inceptionv3_nhwc_3c():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_1x1_16c16spatial():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
-    input_shape = (1, 256, 256, 16)
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_1x1_16c16spatial(target, dtype):
+    input_shape = (1, 128, 128, 16)
     filter_shape = (4, 4, 16, 32)
     bias_shape = (1, 1, 1, 32)
     A = relay.var("data", shape=input_shape, dtype=dtype)
@@ -440,10 +424,8 @@ def test_conv2d_1x1_16c16spatial():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_4x4_16c16pad():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_4x4_16c16pad(target, dtype):
     input_shape = (1, 256, 256, 32)
     filter_shape = (4, 4, 32, 32)
     bias_shape = (1, 1, 1, 32)
@@ -482,10 +464,8 @@ def test_conv2d_4x4_16c16pad():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_4x4x4_16c16pad():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_4x4x4_16c16pad(target, dtype):
     input_shape = (1, 256, 256, 32)
     filter_shape = (4, 4, 32, 4)
     bias_shape = (1, 1, 1, 4)
@@ -523,10 +503,8 @@ def test_conv2d_4x4x4_16c16pad():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_yolov3_v2_nhwc_3c():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_yolov3_v2_nhwc_3c(target, dtype):
     input_shape = (1, 13, 13, 1024)
     filter_shape = (1, 1, 1024, 255)
     A = relay.var("data", shape=input_shape, dtype=dtype)
@@ -558,10 +536,8 @@ def test_conv2d_yolov3_v2_nhwc_3c():
 
 
 @tvm.testing.requires_opencl
-def test_conv2d_vgg16_winograd_4d():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_vgg16_winograd_4d(target, dtype):
     input_shape = (1, 28, 28, 512)
     filter_shape = (3, 3, 512, 512)
     bias_shape = (1, 1, 1, 512)
@@ -598,7 +574,7 @@ def test_conv2d_vgg16_winograd_4d():
     stat_file = temp.relpath("stat.log")
     with open(stat_file, "w") as f:
         f.write(
-            '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 512], "float16"], ["TENSOR", [3, 3, 512, 512], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n'
+            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 512], "{dtype}"], ["TENSOR", [3, 3, 512, 512], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
         )
     graph = build_run_compare(
         mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
diff --git a/tests/python/relay/test_depthwise_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py
similarity index 91%
rename from tests/python/relay/test_depthwise_conv2d_nchw_texture.py
rename to tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py
index c94d085b5115..0ac92d03b6f9 100644
--- a/tests/python/relay/test_depthwise_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py
@@ -22,12 +22,12 @@
 from tvm.relay import testing
 from utils.adreno_utils import gpu_preprocess, build_run_compare
 
+dtype = tvm.testing.parameter("float32")
 
-@tvm.testing.requires_opencl
-def test_depthwise_conv2d_bias_nchwc():
-    target = "opencl --device=adreno"
-    dtype = "float16"
 
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_depthwise_conv2d_bias_nchwc(target, dtype):
     input_shape = (1, 64, 112, 112)
     filter_shape = (64, 1, 3, 3)
     bias_shape = (1, 64, 1, 1)
@@ -68,10 +68,8 @@ def test_depthwise_conv2d_bias_nchwc():
 
 
 @tvm.testing.requires_opencl
-def test_depthwise_conv2d_nchwc():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_depthwise_conv2d_nchwc(target, dtype):
     input_shape = (1, 64, 112, 112)
     filter_shape = (64, 1, 3, 3)
     bias_shape = (1, 64, 1, 1)
@@ -107,10 +105,8 @@ def test_depthwise_conv2d_nchwc():
 
 
 @tvm.testing.requires_opencl
-def test_depthwise_conv2d_bias_nchw():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_depthwise_conv2d_bias_nchw(target, dtype):
     input_shape = (1, 64, 112, 112)
     filter_shape = (64, 1, 3, 3)
     bias_shape = (1, 64, 1, 1)
@@ -151,10 +147,8 @@ def test_depthwise_conv2d_bias_nchw():
 
 
 @tvm.testing.requires_opencl
-def test_depthwise_conv2d_repack_bias_nchw():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_depthwise_conv2d_repack_bias_nchw(target, dtype):
     input_shape = (1, 63, 112, 112)
     filter_shape = (63, 1, 3, 3)
     bias_shape = (1, 63, 1, 1)
diff --git a/tests/python/relay/test_depthwise_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py
similarity index 91%
rename from tests/python/relay/test_depthwise_conv2d_nhwc_texture.py
rename to tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py
index 16f9b8749909..3af7db3a4e1f 100644
--- a/tests/python/relay/test_depthwise_conv2d_nhwc_texture.py
+++ b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py
@@ -22,12 +22,12 @@
 from tvm.relay import testing
 from utils.adreno_utils import build_run_compare
 
+dtype = tvm.testing.parameter("float32")
 
-@tvm.testing.requires_opencl
-def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1():
-    target = "opencl --device=adreno"
-    dtype = "float16"
 
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(target, dtype):
     input_shape = (1, 129, 129, 144)
     filter_shape = (3, 3, 144, 1)
     kernel_size = (filter_shape[0], filter_shape[1])
@@ -66,10 +66,8 @@ def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1():
 
 
 @tvm.testing.requires_opencl
-def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(target, dtype):
     input_shape = (4, 35, 35, 576)
     filter_shape = (3, 3, 576, 1)
     kernel_size = (filter_shape[0], filter_shape[1])
@@ -108,10 +106,8 @@ def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1():
 
 
 @tvm.testing.requires_opencl
-def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding(target, dtype):
     input_shape = (1, 129, 129, 144)
     filter_shape = (3, 3, 144, 1)
     kernel_size = (filter_shape[0], filter_shape[1])
@@ -152,10 +148,8 @@ def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding():
 
 
 @tvm.testing.requires_opencl
-def test_depthwise_conv2d_1_513_513_7x3_3_7_1():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_depthwise_conv2d_1_513_513_7x3_3_7_1(target, dtype):
     input_shape = (1, 513, 513, 7)
     filter_shape = (3, 3, 7, 1)
     bias_shape = (filter_shape[2],)
@@ -193,10 +187,8 @@ def test_depthwise_conv2d_1_513_513_7x3_3_7_1():
 
 
 @tvm.testing.requires_opencl
-def test_depthwise_conv2d_1_513_513_3x3_3_3_1():
-    target = "opencl --device=adreno"
-    dtype = "float16"
-
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_depthwise_conv2d_1_513_513_3x3_3_3_1(target, dtype):
     input_shape = (1, 513, 513, 3)
     filter_shape = (3, 3, 3, 1)
     bias_shape = (filter_shape[2],)
diff --git a/tests/python/relay/utils/adreno_utils.py b/tests/python/relay/opencl_texture/utils/adreno_utils.py
similarity index 100%
rename from tests/python/relay/utils/adreno_utils.py
rename to tests/python/relay/opencl_texture/utils/adreno_utils.py
diff --git a/tests/python/unittest/test_target_codegen_vulkan.py b/tests/python/unittest/test_target_codegen_vulkan.py
index 73e840208549..76cad250e053 100644
--- a/tests/python/unittest/test_target_codegen_vulkan.py
+++ b/tests/python/unittest/test_target_codegen_vulkan.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import os
+from posixpath import split
 import random
 import re
 import threading
@@ -91,6 +92,8 @@ def test_array_copy(dev, dtype, fuzz_seed):
 def test_array_vectorize_add(target, dev, dtype):
     arr_size = 64
     lanes = 2
+    if "opencl" in target and dtype == "float16":
+        pytest.xfail("Opencl target does not support float16")
 
     num_thread = 8
 
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index f79076e213cb..5163a16da3cd 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -28,6 +28,7 @@ echo set\(USE_CUDNN ON\) >> config.cmake
 echo set\(USE_CUDA ON\) >> config.cmake
 echo set\(USE_VULKAN ON\) >> config.cmake
 echo set\(USE_OPENGL ON\) >> config.cmake
+echo set\(USE_OPENCL ON\) >> config.cmake
 echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_LLVM \"/usr/bin/llvm-config-9 --link-static\"\) >> config.cmake
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index fc7cbf3a88e7..5eac7b45ba61 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -61,12 +61,14 @@ run_pytest cython ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module-1 apps/dso
 run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-integration tests/python/integration
 
 # Ignoring Arm(R) Ethos(TM)-U NPU tests in the collective to run to run them in parallel in the next step.
-run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib --ignore=tests/python/contrib/test_ethosu --ignore=tests/python/contrib/test_cmsisnn
-
+run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib --ignore=tests/python/contrib/test_ethosu --ignore=tests/python/contrib/test_cmsisnn 
 # forked is needed because the global registry gets contaminated
 TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \
     run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay
 
+# OpenCL texture test. Deselected specific tests that fails  in CI
+TVM_TEST_TARGETS="${TVM_RELAY_OPENCL_TEXTURE_TARGETS:-opencl}" \
+    run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture tests/python/relay/opencl_texture
 # Command line driver test
 run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver
 
diff --git a/tests/scripts/task_python_integration_gpuonly.sh b/tests/scripts/task_python_integration_gpuonly.sh
index 3ce5571caa0e..432984c95561 100755
--- a/tests/scripts/task_python_integration_gpuonly.sh
+++ b/tests/scripts/task_python_integration_gpuonly.sh
@@ -18,9 +18,10 @@
 
 set -exo pipefail
 
-export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali,aocl_sw_emu"
+export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali,aocl_sw_emu,adreno"
 export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS"
 export TVM_RELAY_TEST_TARGETS="cuda"
+export TVM_RELAY_OPENCL_TEXTURE_TARGETS="opencl -device=adreno"
 export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu
 export TVM_INTEGRATION_GPU_ONLY=1
 

From b21bf6638bc9a0b339bdbebeae9630ddb583b5a9 Mon Sep 17 00:00:00 2001
From: Black <32191045+blackkker@users.noreply.github.com>
Date: Sat, 10 Sep 2022 03:02:16 +0800
Subject: [PATCH 142/704] [Frontend][Paddle] Fix op in paddle did't transmit
 layout information (#12658)

[Frontend][Paddle] Fix adaptive_avg_pool2d in paddle did't transmit layout information
---
 python/tvm/relay/frontend/paddlepaddle.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/paddlepaddle.py b/python/tvm/relay/frontend/paddlepaddle.py
index a869e2e1b807..9b909895e084 100644
--- a/python/tvm/relay/frontend/paddlepaddle.py
+++ b/python/tvm/relay/frontend/paddlepaddle.py
@@ -1193,6 +1193,7 @@ def convert_pool2d(g, op, block):
     paddings = op.attr("paddings")
     padding_algorithm = op.attr("padding_algorithm")
     pooling_type = op.attr("pooling_type")
+    data_format = op.attr("data_format")
 
     if global_pooling:
         adaptive = True
@@ -1260,7 +1261,9 @@ def convert_pool2d(g, op, block):
                 input_x, pool_size=ksize, strides=strides, padding=paddings, ceil_mode=ceil_mode
             )
     else:
-        out = getattr(_op.nn, "adaptive_" + op_map[pooling_type])(input_x, output_size=ksize)
+        out = getattr(_op.nn, "adaptive_" + op_map[pooling_type])(
+            input_x, output_size=ksize, layout=data_format
+        )
     g.add_node(op.output("Out")[0], out)
 
 
From 029fa462d22ce3c75bc5ea530eece999a160c05b Mon Sep 17 00:00:00 2001
From: wrongtest <wrongtest0@gmail.com>
Date: Sat, 10 Sep 2022 03:10:50 +0800
Subject: [PATCH 143/704] [TIR][Arith] Add more strict checking in imm
 construction and folding. (#12515)

* Add more strict check in tir imm construction and folding.

* fix bool-compare compile error

* fix some illegal imm construction in testcases

* do not test i64 overflow behaviour because it is not consistent on cython and ctypes

* fix float32 testcase

* auto-inferred dtype should be int64 when value exceeds int32 range

* add floatimm range check for fp16 and fp32

* add more folding testcases and fix store fp32 folding result to double

* fix i386 fp16 cases
---
 include/tvm/tir/op.h                          |   9 +-
 python/tvm/runtime/object_generic.py          |  14 +-
 python/tvm/script/tir/intrin.py               |   5 +
 src/arith/const_fold.h                        | 112 +++-
 src/ir/expr.cc                                |  32 +-
 src/support/scalars.cc                        |   4 -
 src/support/scalars.h                         |   4 +
 tests/python/relay/test_op_level4.py          |   2 +-
 tests/python/relay/test_pass_fuse_ops.py      |   2 +-
 .../unittest/test_arith_rewrite_simplify.py   |   2 +
 .../unittest/test_target_codegen_cuda.py      |   7 +-
 tests/python/unittest/test_tir_imm_values.py  | 577 ++++++++++++++++++
 .../test_tir_transform_narrow_datatype.py     |   9 -
 13 files changed, 743 insertions(+), 36 deletions(-)
 create mode 100644 tests/python/unittest/test_tir_imm_values.py

diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h
index b4c5d45cbf8e..0939e25efddf 100644
--- a/include/tvm/tir/op.h
+++ b/include/tvm/tir/op.h
@@ -911,7 +911,9 @@ inline PrimExpr MakeConstScalar(DataType t, ValueType value, Span span = Span())
   if (t.is_uint()) {
     // Use IntImm if it is a small integer
     uint64_t uval = static_cast<uint64_t>(value);
-    if (uval <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
+    if (value < static_cast<ValueType>(0)) {
+      LOG(FATAL) << "cannot make uint from negative value " << value;
+    } else if (uval <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
       return IntImm(t, static_cast<int64_t>(value), span);
     } else {
       uint64_t mask = (static_cast<uint64_t>(1) << 32U) - 1U;
@@ -932,6 +934,11 @@ inline PrimExpr MakeConstScalar(DataType t, ValueType value, Span span = Span())
   return PrimExpr();
 }
 
+template <>
+inline PrimExpr MakeConstScalar(DataType t, bool value, Span span) {
+  return MakeConstScalar(t, static_cast<int>(value), span);
+}
+
 template <typename ValueType, typename>
 inline PrimExpr make_const(DataType t, ValueType value, Span span) {
   if (t.lanes() == 1) {
diff --git a/python/tvm/runtime/object_generic.py b/python/tvm/runtime/object_generic.py
index 7a55d3ef244e..05426dfb1aeb 100644
--- a/python/tvm/runtime/object_generic.py
+++ b/python/tvm/runtime/object_generic.py
@@ -115,11 +115,17 @@ def _scalar_type_inference(value):
     elif isinstance(value, bool):
         dtype = "bool"
     elif isinstance(value, float):
-        # We intentionally convert the float to float32 since it's more common in DL.
-        dtype = "float32"
+        # We intentionally prefer convert the float to float32 since it's more common in DL.
+        if -3.40282347e38 <= value <= 3.40282347e38:
+            dtype = "float32"
+        else:
+            dtype = "float64"
     elif isinstance(value, int):
-        # We intentionally convert the python int to int32 since it's more common in DL.
-        dtype = "int32"
+        # We intentionally prefer convert the python int to int32 since it's more common in DL.
+        if -2147483648 <= value <= 2147483647:
+            dtype = "int32"
+        else:
+            dtype = "int64"
     else:
         raise NotImplementedError(
             "Cannot automatically inference the type." " value={}".format(value)
diff --git a/python/tvm/script/tir/intrin.py b/python/tvm/script/tir/intrin.py
index f3919afe5a24..bd9aa1fdadfd 100644
--- a/python/tvm/script/tir/intrin.py
+++ b/python/tvm/script/tir/intrin.py
@@ -89,6 +89,11 @@ def truncmod(x, y, span):
     return tvm.tir.truncmod(x, y, span)
 
 
+@register
+def truncdiv(x, y, span):
+    return tvm.tir.truncdiv(x, y, span)
+
+
 @register
 def ceildiv(x, y, span):
     return tvm.tir.ceildiv(x, y, span)
diff --git a/src/arith/const_fold.h b/src/arith/const_fold.h
index 9c3afe41b901..d0e09a1a7429 100644
--- a/src/arith/const_fold.h
+++ b/src/arith/const_fold.h
@@ -29,6 +29,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <limits>
 
 #include "int_operator.h"
 
@@ -73,6 +74,39 @@ inline bool IsIndexType(const DataType& type) {
   return type.is_int() && type.lanes() == 1 && (type.bits() == 32 || type.bits() == 64);
 }
 
+/*! \brief Helper to get const folding result repr in int64. */
+inline int64_t GetFoldResultInt64Repr(int64_t x, const DataType& dtype) {
+  if (dtype.bits() < 64) {
+    x &= (1LL << dtype.bits()) - 1;
+  }
+  if (dtype.is_int()) {
+    // get sign extended value of integer with specified bits
+    int64_t m = 1LL << (dtype.bits() - 1);
+    x = (x ^ m) - m;
+  }
+  return x;
+}
+
+/*! \brief Helper to get fp32 const folding result repr in double. */
+inline double GetFoldResultDoubleRepr(float x) {
+  double res = static_cast<double>(x);
+  if (std::isinf(res) || std::isnan(res)) {
+    return res;
+  }
+  // certain platform (eg, on gcc7-i386) do the folding arithmetic
+  // on float and write back to double is optimized to double
+  // precision arithmetic, this is legal and we check the output
+  // range thus to ensure consistency when the float result is inf.
+  if (res < std::numeric_limits<float>::lowest()) {
+    LOG(WARNING) << "underlying float value overflow";
+    return -std::numeric_limits<double>::infinity();
+  } else if (res > std::numeric_limits<float>::max()) {
+    LOG(WARNING) << "underlying float value overflow";
+    return std::numeric_limits<double>::infinity();
+  }
+  return res;
+}
+
 #define TVM_ARITH_CONST_PROPAGATION(BODY)        \
   using tir::FloatImmNode;                       \
   const IntImmNode* pa = a.as<IntImmNode>();     \
@@ -95,10 +129,22 @@ template <>
 inline PrimExpr TryConstFold<tir::Add>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, pa->value + pb->value);
+    if (pa && pb) {
+      int64_t res = pa->value + pb->value;
+      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+    }
     if (pa && pa->value == 0) return b;
     if (pb && pb->value == 0) return a;
-    if (fa && fb) return FloatImm(rtype, fa->value + fb->value);
+    if (fa && fb) {
+      if (rtype.bits() == 32) {
+        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) +
+                                                       static_cast<float>(fb->value)));
+      } else if (rtype.bits() == 64) {
+        return FloatImm(rtype, fa->value + fb->value);
+      } else {
+        return PrimExpr();
+      }
+    }
     if (fa && fa->value == 0) return b;
     if (fb && fb->value == 0) return a;
   });
@@ -113,9 +159,21 @@ inline PrimExpr TryConstFold<tir::Sub>(PrimExpr a, PrimExpr b) {
         << "Checked failed. Minuend 's value is 0U and it's dtype is uint "
         << "while Subtrahend's dtype is uint; which will cause a negative uint";
     const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, pa->value - pb->value);
+    if (pa && pb) {
+      int64_t res = pa->value - pb->value;
+      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+    }
     if (pb && pb->value == 0) return a;
-    if (fa && fb) return FloatImm(rtype, fa->value - fb->value);
+    if (fa && fb) {
+      if (rtype.bits() == 32) {
+        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) -
+                                                       static_cast<float>(fb->value)));
+      } else if (rtype.bits() == 64) {
+        return FloatImm(rtype, fa->value - fb->value);
+      } else {
+        return PrimExpr();
+      }
+    }
     if (fb && fb->value == 0) return a;
   });
   return PrimExpr();
@@ -125,7 +183,10 @@ template <>
 inline PrimExpr TryConstFold<tir::Mul>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, pa->value * pb->value);
+    if (pa && pb) {
+      int64_t res = pa->value * pb->value;
+      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+    }
     if (pa) {
       if (pa->value == 1) return b;
       if (pa->value == 0) return a;
@@ -134,7 +195,16 @@ inline PrimExpr TryConstFold<tir::Mul>(PrimExpr a, PrimExpr b) {
       if (pb->value == 1) return a;
       if (pb->value == 0) return b;
     }
-    if (fa && fb) return FloatImm(rtype, fa->value * fb->value);
+    if (fa && fb) {
+      if (rtype.bits() == 32) {
+        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) *
+                                                       static_cast<float>(fb->value)));
+      } else if (rtype.bits() == 64) {
+        return FloatImm(rtype, fa->value * fb->value);
+      } else {
+        return PrimExpr();
+      }
+    }
     if (fa) {
       if (fa->value == 1) return b;
       if (fa->value == 0) return a;
@@ -155,7 +225,8 @@ inline PrimExpr TryConstFold<tir::Div>(PrimExpr a, PrimExpr b) {
       // due to division and mod can have different modes
       // NOTE: this will assumes truc div.
       ICHECK_NE(pb->value, 0) << "Divide by zero";
-      return IntImm(rtype, pa->value / pb->value);
+      int64_t res = pa->value / pb->value;
+      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
     }
     if (pa) {
       if (pa->value == 0) return a;
@@ -165,7 +236,14 @@ inline PrimExpr TryConstFold<tir::Div>(PrimExpr a, PrimExpr b) {
       ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
     if (fa && fb && fb->value != 0) {
-      return FloatImm(rtype, fa->value / fb->value);
+      if (rtype.bits() == 32) {
+        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) /
+                                                       static_cast<float>(fb->value)));
+      } else if (rtype.bits() == 64) {
+        return FloatImm(rtype, fa->value / fb->value);
+      } else {
+        return PrimExpr();
+      }
     }
     if (fa && fa->value == 0) return a;
     if (fb) {
@@ -182,7 +260,8 @@ inline PrimExpr TryConstFold<tir::Mod>(PrimExpr a, PrimExpr b) {
     const DataType& rtype = a.dtype();
     if (pa && pb) {
       ICHECK_NE(pb->value, 0) << "Divide by zero";
-      return IntImm(rtype, pa->value % pb->value);
+      int64_t res = pa->value % pb->value;
+      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
     }
     if (pa) {
       if (pa->value == 0) return a;
@@ -201,7 +280,8 @@ inline PrimExpr TryConstFold<tir::FloorDiv>(PrimExpr a, PrimExpr b) {
     const DataType& rtype = a.dtype();
     if (pa && pb) {
       ICHECK_NE(pb->value, 0) << "Divide by zero";
-      return IntImm(rtype, arith::floordiv(pa->value, pb->value));
+      int64_t res = arith::floordiv(pa->value, pb->value);
+      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
     }
     if (pa) {
       if (pa->value == 0) return a;
@@ -211,7 +291,14 @@ inline PrimExpr TryConstFold<tir::FloorDiv>(PrimExpr a, PrimExpr b) {
       ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
     if (fa && fb && fb->value != 0) {
-      return FloatImm(rtype, std::floor(fa->value / fb->value));
+      if (rtype.bits() == 32) {
+        return FloatImm(rtype, GetFoldResultDoubleRepr(std::floor(static_cast<float>(fa->value) /
+                                                                  static_cast<float>(fb->value))));
+      } else if (rtype.bits() == 64) {
+        return FloatImm(rtype, std::floor(fa->value / fb->value));
+      } else {
+        return PrimExpr();
+      }
     }
     if (fa && fa->value == 0) return a;
     if (fb) {
@@ -228,7 +315,8 @@ inline PrimExpr TryConstFold<tir::FloorMod>(PrimExpr a, PrimExpr b) {
     const DataType& rtype = a.dtype();
     if (pa && pb) {
       ICHECK_NE(pb->value, 0) << "Divide by zero";
-      return IntImm(rtype, floormod(pa->value, pb->value));
+      int64_t res = arith::floormod(pa->value, pb->value);
+      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
     }
     if (pa) {
       if (pa->value == 0) return a;
diff --git a/src/ir/expr.cc b/src/ir/expr.cc
index d3e23800d6c7..c926cc56e89a 100644
--- a/src/ir/expr.cc
+++ b/src/ir/expr.cc
@@ -33,6 +33,8 @@
 #include <tvm/te/tensor.h>
 #include <tvm/tir/expr.h>
 
+#include "../support/scalars.h"
+
 namespace tvm {
 
 PrimExpr::PrimExpr(int32_t value) : PrimExpr(IntImm(DataType::Int(32), value)) {}
@@ -76,7 +78,20 @@ IntImm::IntImm(DataType dtype, int64_t value, Span span) {
   ICHECK(dtype.is_int() || dtype.is_uint())
       << "ValueError: IntImm supports only int or uint type, but " << dtype << " was supplied.";
   if (dtype.is_uint()) {
-    ICHECK_GE(value, 0U);
+    ICHECK_GE(value, 0U) << "ValueError: Literal value " << value
+                         << " is negative for unsigned integer type " << dtype;
+    if (dtype.bits() < 64) {
+      ICHECK_LT(value, 1LL << dtype.bits())
+          << "ValueError: Literal value " << value << " exceeds maximum of " << dtype;
+    }
+  } else if (dtype.bits() == 1) {
+    // int(1)
+    ICHECK(value == 0 || value == 1) << "ValueError: " << value << " exceeds range of " << dtype;
+  } else if (dtype.bits() < 64) {
+    ICHECK_GE(value, -(1LL << (dtype.bits() - 1)))
+        << "ValueError: Literal value " << value << " exceeds minimum of " << dtype;
+    ICHECK_LT(value, 1LL << (dtype.bits() - 1))
+        << "ValueError: Literal value " << value << " exceeds maximum of " << dtype;
   }
   ObjectPtr<IntImmNode> node = make_object<IntImmNode>();
   node->dtype = dtype;
@@ -103,6 +118,21 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 FloatImm::FloatImm(DataType dtype, double value, Span span) {
   ICHECK_EQ(dtype.lanes(), 1) << "ValueError: FloatImm can only take scalar.";
+
+  // check range for float32 and float16 since they have specified range.
+  if (!std::isinf(value) && !std::isnan(value)) {
+    if (dtype.bits() == 32) {
+      ICHECK_GE(value, std::numeric_limits<float>::lowest())
+          << "ValueError: Literal value " << value << " exceeds minimum of " << dtype;
+      ICHECK_LE(value, std::numeric_limits<float>::max())
+          << "ValueError: Literal value " << value << " exceeds maximum of " << dtype;
+    } else if (dtype.is_float16()) {
+      ICHECK_GE(value, -support::kMaxFloat16)
+          << "ValueError: Literal value " << value << " exceeds minimum of " << dtype;
+      ICHECK_LE(value, support::kMaxFloat16)
+          << "ValueError: Literal value " << value << " exceeds maximum of " << dtype;
+    }
+  }
   ObjectPtr<FloatImmNode> node = make_object<FloatImmNode>();
   node->dtype = dtype;
   node->value = value;
diff --git a/src/support/scalars.cc b/src/support/scalars.cc
index 9caa7ca58915..0ab16899bae9 100644
--- a/src/support/scalars.cc
+++ b/src/support/scalars.cc
@@ -174,10 +174,6 @@ IntImm ValueToIntImm(int64_t value, int width) {
   }
 }
 
-// 2^15 * (1 + 1023/1024)
-// See https://en.wikipedia.org/wiki/Half-precision_floating-point_format
-constexpr double kMaxFloat16 = 65504.0;
-
 FloatImm ValueToFloatImm(double value, int width) {
   if (width == 16) {
     if (!std::isinf(value) && (value < -kMaxFloat16 || value > kMaxFloat16)) {
diff --git a/src/support/scalars.h b/src/support/scalars.h
index 60b8fc40a8de..2fdbb001d922 100644
--- a/src/support/scalars.h
+++ b/src/support/scalars.h
@@ -61,6 +61,10 @@ std::string FloatImmToString(const FloatImm& float_imm);
 IntImm ValueToIntImm(int64_t value, int width);
 FloatImm ValueToFloatImm(double value, int width);
 
+// 2^15 * (1 + 1023/1024)
+// See https://en.wikipedia.org/wiki/Half-precision_floating-point_format
+constexpr double kMaxFloat16 = 65504.0;
+
 }  // namespace support
 }  // namespace tvm
 
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 89de2f6a9520..a8eb7f406c37 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -512,7 +512,7 @@ def verify(
     # Test backwards slicing.
     verify((3, 4, 3), [-1, -1, -1], [-5, -5, -5], [-1, -1, -1], (3, 4, 3))
     # Test slicing with overlarge indices.
-    verify((3, 4, 3), [0, 0, 0], [np.iinfo(np.int64).max] * 3, [1, 1, 1], (3, 4, 3))
+    verify((3, 4, 3), [0, 0, 0], [np.iinfo(np.int32).max] * 3, [1, 1, 1], (3, 4, 3))
     # Test slice mode.
     verify(
         (3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 1], (2, 4, 3), slice_mode="size", test_ref=False
diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py
index cacce5603e5f..fe662a30766c 100644
--- a/tests/python/relay/test_pass_fuse_ops.py
+++ b/tests/python/relay/test_pass_fuse_ops.py
@@ -777,7 +777,7 @@ def test_fuse_dynamic_squeeze_slice_take():
 
     squeeze = relay.op.squeeze(x, axis=[0])
     strided_slice = relay.op.strided_slice(
-        squeeze, begin=[0, 0], end=[15130, 9223372036854775807], strides=[1, 1]
+        squeeze, begin=[0, 0], end=[15130, 2147483647], strides=[1, 1]
     )
     take = relay.op.take(strided_slice, take_val, axis=0)
 
diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py
index 82e1372f991e..c880f90ddffe 100644
--- a/tests/python/unittest/test_arith_rewrite_simplify.py
+++ b/tests/python/unittest/test_arith_rewrite_simplify.py
@@ -951,6 +951,8 @@ def test_cast_simplify():
         ck.verify(tvm.tir.Cast(dtype1, x == x), tvm.tir.const(1, dtype1))
         for dtype2 in dtypes:
             for i in [0, 1, 2, 3]:
+                if i > 1 and (dtype1 == "bool" or dtype2 == "bool"):
+                    continue
                 ck.verify(tvm.tir.Cast(dtype1, tvm.tir.const(i, dtype2)), tvm.tir.const(i, dtype1))
 
 
diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py
index 994a85095728..96b947e20655 100644
--- a/tests/python/unittest/test_target_codegen_cuda.py
+++ b/tests/python/unittest/test_target_codegen_cuda.py
@@ -1,4 +1,5 @@
 # Licensed to the Apache Software Foundation (ASF) under one
+
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
@@ -194,13 +195,13 @@ def check_cuda(n, value, lanes):
         fun(a)
         np.testing.assert_equal(a.numpy(), np_a)
 
-    check_cuda(64, 0xAB, 4)
+    check_cuda(64, np.int8(0xAB), 4)
     check_cuda(64, 0, 4)
     check_cuda(64, -3, 4)
-    check_cuda(64, 0xAB, 3)
+    check_cuda(64, np.int8(0xAB), 3)
     check_cuda(64, 0, 3)
     check_cuda(64, -3, 3)
-    check_cuda(64, 0xAB, 2)
+    check_cuda(64, np.int8(0xAB), 2)
     check_cuda(64, 0, 2)
     check_cuda(64, -3, 2)
 
diff --git a/tests/python/unittest/test_tir_imm_values.py b/tests/python/unittest/test_tir_imm_values.py
new file mode 100644
index 000000000000..a2a19a09ad87
--- /dev/null
+++ b/tests/python/unittest/test_tir_imm_values.py
@@ -0,0 +1,577 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import math
+import random
+import numpy as np
+import tvm
+import tvm.testing
+import pytest
+from tvm import tir
+from tvm.script import tir as T
+import pytest
+
+
+@pytest.mark.parametrize(
+    "dtype, literals",
+    [
+        ["int8", [-128, 0, 127]],
+        ["uint8", [0, 255]],
+        ["int32", [-2147483648, 2147483647]],
+        ["uint32", [0, 4294967295]],
+        ["int64", [-9223372036854775808, 9223372036854775807]],
+        ["uint64", [0, 9223372036854775807]],
+    ],
+)
+def test_tir_make_intimm(dtype, literals):
+    for l in literals:
+        imm = tir.const(l, dtype)
+        assert imm.value == l, imm
+
+
+@pytest.mark.parametrize(
+    "dtype, literals",
+    [
+        ["int8", [-129, 128]],
+        ["uint8", [-1, 256]],
+        ["int32", [-2147483650, 2147483648]],
+        ["uint32", [-1, 4294967296]],
+        ["uint64", [-1, 18446744073709551616]],
+    ],
+)
+def test_tir_invalid_intimm(dtype, literals):
+    for l in literals:
+        with pytest.raises(tvm.TVMError):
+            tir.const(l, dtype)
+
+
+@pytest.mark.parametrize(
+    "dtype, literals",
+    [
+        [
+            "uint64",
+            {
+                9223372036854775807: 9223372036854775807,
+                18446744073709551615: 18446744073709551615,
+            },
+        ],
+    ],
+)
+def test_tir_large_py_int_literals(dtype, literals):
+    """
+    For large uint value, use LargeUIntImm intrin,
+    """
+    for l in literals:
+        x = tir.const(l, dtype)
+        if isinstance(x, (tir.IntImm, tir.FloatImm)):
+            assert x.value == literals[l]
+        else:
+            # LargeUIntImm(low32, hi32)
+            assert (int(x.args[1]) << 32) + int(x.args[0]) == literals[l]
+
+
+def test_tir_intimm_overflow():
+    assert int(tir.const(255, "uint8") + tir.const(1, "uint8")) == 0
+    assert int(tir.const(2**31 - 1, "int32") + tir.const(1, "int32")) == -(2**31)
+    assert int(tir.const(2**32 - 1, "uint32") + tir.const(1, "uint32")) == 0
+    assert int(tir.const(2**63 - 1, "int64") + tir.const(1, "int64")) == -(2**63)
+    assert int(tir.const(2**32, "uint64") * tir.const(2**32, "uint64")) == 0
+    # customized int types
+    assert int(tir.const(7, "int4") + tir.const(1, "int4")) == -8
+    assert int(tir.const(2**39 - 1, "int40") + tir.const(1, "int40")) == -(2**39)
+
+
+def compare_float_value(value, expect, msg):
+    if math.isfinite(value):
+        assert np.abs(value - expect) < 1e-5, f"{value} vs {expect}, {msg}"
+    elif math.isnan(value):
+        assert math.isnan(expect), f"{value} vs {expect}, {msg}"
+    elif math.isinf(value):
+        assert math.isinf(expect), f"{value} vs {expect}, {msg}"
+
+
+@pytest.mark.parametrize(
+    "dtype, literals",
+    [
+        ["float16", [-65504.0, 3.14, 65504.0, np.inf, np.nan]],
+        ["bfloat16", [-3.38953139e38, 3.38953139e38, 3.14]],
+        ["float32", [np.finfo("float32").min, 3.14, np.finfo("float32").max, np.inf, np.nan]],
+        ["float64", [np.finfo("float64").min, 3.14, np.finfo("float64").max, np.inf, np.nan]],
+    ],
+)
+def test_tir_make_floatimm(dtype, literals):
+    for l in literals:
+        imm = tir.const(l, dtype)
+        compare_float_value(imm.value, l, "imm value should match feed value")
+
+
+@pytest.mark.parametrize(
+    "dtype, literals",
+    [
+        ["float16", [-65505.0, 65505.0]],
+        ["float32", [-3.402e39, 3.402e39]],
+    ],
+)
+def test_tir_invalid_floatimm(dtype, literals):
+    """Currently only fp16 and fp32 have range check."""
+    for l in literals:
+        with pytest.raises(tvm.TVMError):
+            tir.const(l, dtype)
+
+
+@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
+@pytest.mark.parametrize("literal", [3.14, np.nan, np.inf])
+def test_tir_special_floatimms(dtype, literal):
+    x = tir.const(literal, dtype)
+    compare_float_value(x.value, literal, "imm value should match feed value")
+
+
+@tvm.testing.requires_llvm()
+def test_tir_too_large_literal_f64():
+    # Behavior check: if literal f64 value is out of dtype range, the
+    # object is still constructed, and eval to infinity.
+    @T.prim_func
+    def imm_overflow_fp64() -> T.float64:
+        T.evaluate(T.ret(T.float64(1.7976e309), dtype="float64"))
+
+    f = tvm.build(imm_overflow_fp64, target="llvm")
+    assert math.isinf(f())
+
+
+@pytest.mark.parametrize(
+    "literal, expect_dtype",
+    [
+        (256, "int32"),
+        (2147483647, "int32"),
+        (-2147483648, "int32"),
+        (2147483648, "int64"),
+        (-2147483649, "int64"),
+        (3.14159, "float32"),
+        (np.finfo("float32").min, "float32"),
+        (np.finfo("float32").max, "float32"),
+        (-3.402e39, "float64"),
+        (3.402e39, "float64"),
+    ],
+)
+def test_tir_const_auto_dtype(literal, expect_dtype):
+    x = tir.const(literal, dtype=None)
+    assert x.dtype == expect_dtype
+    assert x.value == literal
+
+
+def check_tir_const_fold(
+    dtype, foldf, calcf, x_range=None, y_range=None, expect=None, skip_overflow=False
+):
+    """Helper to check constant folding behavior
+
+    Parameters
+    ----------
+    dtype: str
+        Datatype of constants
+
+    foldf: (x, y) -> z
+        Folding function to call
+
+    calcf: (x, y) -> z
+        Compiled calculation function to call
+
+    x_range: Union[int, float, tuple]
+        Single value or value range [min, max]
+
+    y_range: Union[int, float, tuple]
+        Single value or value range [min, max]
+
+    expect: Union[int, float]
+        Expected calculation result
+
+    skip_overflow: bool
+        Skip assertion if the overflow happens
+    """
+    seed = random.randint(0, 2147483648)
+    np.random.seed(seed)
+    ninfo = np.finfo(dtype) if dtype.startswith("float") else np.iinfo(dtype)
+
+    if x_range is None:
+        x_range = (ninfo.min, ninfo.max)
+    if isinstance(x_range, (int, float)):
+        x = x_range
+    elif dtype.startswith("int") or dtype.startswith("uint"):
+        x = np.random.randint(x_range[0], x_range[1] + 1, dtype=dtype)
+    else:
+        x = np.random.uniform(x_range[0], x_range[1])
+
+    if y_range is None:
+        y_range = (ninfo.min, ninfo.max)
+    if isinstance(y_range, (int, float)):
+        y = y_range
+    elif dtype.startswith("int") or dtype.startswith("uint"):
+        y = np.random.randint(y_range[0], y_range[1] + 1, dtype=dtype)
+    else:
+        y = np.random.uniform(y_range[0], y_range[1])
+
+    if skip_overflow:
+        py_res = foldf(x, y)
+        if isinstance(py_res, (tir.IntImm, tir.FloatImm)):
+            py_res = py_res.value
+        if not (ninfo.min <= py_res <= ninfo.max):
+            # If the result overflow, certain arithmetics is non-defined
+            # thus we intentionally do not make the test failed.
+            return
+
+    fold_res = foldf(tir.const(x, dtype), tir.const(y, dtype))
+    calc_res = calcf(x, y)
+
+    flaky_msg = (
+        f"{dtype} ({x}, {y}, {expect}) const folding check failed.\n"
+        + "This test is intentionally non-deterministic, "
+        + f"if it fails please report it in github issue together with this seed {seed}\n"
+    )
+    if dtype.startswith("float"):
+        compare_float_value(calc_res, fold_res.value, flaky_msg)
+        if expect:
+            compare_float_value(expect, calc_res, flaky_msg)
+    else:
+        assert calc_res == fold_res.value, flaky_msg
+        if expect:
+            assert expect == calc_res, flaky_msg
+
+
+@tvm.testing.requires_llvm()
+def test_tir_floatimm_const_fold():
+    """Behavior check: folding fp32 match platform f32 arithmetic"""
+
+    @T.prim_func
+    def float_imm_multiply(x: T.float32, y: T.float32, z: T.Buffer[(), "float32"]):
+        z[()] = x * y
+
+    @T.prim_func
+    def float_imm_add(x: T.float32, y: T.float32, z: T.Buffer[(), "float32"]):
+        z[()] = x + y
+
+    @T.prim_func
+    def float_imm_sub(x: T.float32, y: T.float32, z: T.Buffer[(), "float32"]):
+        z[()] = x - y
+
+    @T.prim_func
+    def float_imm_div(x: T.float32, y: T.float32, z: T.Buffer[(), "float32"]):
+        z[()] = x / y
+
+    def __wrap_build(f):
+        lib = tvm.build(f, target="llvm")
+        z = tvm.nd.array(np.zeros([]).astype("float32"))
+
+        def _func(x, y):
+            lib(x, y, z)
+            return z.numpy()
+
+        return _func
+
+    fmul = __wrap_build(float_imm_multiply)
+    fadd = __wrap_build(float_imm_add)
+    fsub = __wrap_build(float_imm_sub)
+    fdiv = __wrap_build(float_imm_div)
+
+    # overflow
+    check_tir_const_fold("float32", lambda x, y: x * y, fmul, 3.0e30, 3.0e30, np.inf)
+    check_tir_const_fold("float32", lambda x, y: x * y, fmul, 3.0e30, -3.0e30, -np.inf)
+    check_tir_const_fold("float32", lambda x, y: x / y, fdiv, 3.0e30, 3.0e-30, np.inf)
+
+    # divide by zero
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("float32", lambda x, y: x / y, fdiv, 1.0, 0.0)
+
+    # nan and inf
+    check_tir_const_fold("float32", lambda x, y: x + y, fadd, 1.0, np.nan, np.nan)
+    check_tir_const_fold("float32", lambda x, y: x + y, fadd, 1.0, np.inf, np.inf)
+    check_tir_const_fold("float32", lambda x, y: x + y, fadd, 1.0, -np.inf, -np.inf)
+
+    # randomized check
+    check_tir_const_fold("float32", lambda x, y: x * y, fmul)
+    check_tir_const_fold("float32", lambda x, y: x + y, fadd)
+    check_tir_const_fold("float32", lambda x, y: x - y, fsub)
+    check_tir_const_fold(
+        "float32", lambda x, y: x / y, fdiv, y_range=(0.01, np.finfo("float32").max)
+    )
+
+
+@tvm.testing.requires_llvm()
+def test_tir_int8_const_fold():
+    """Behavior check: folding i8 operation match platform i8 arithmetic"""
+
+    @T.prim_func
+    def imm_multiply(x: T.int8, y: T.int8) -> T.int8:
+        T.evaluate(T.ret(x * y, dtype="int8"))
+
+    @T.prim_func
+    def imm_add(x: T.int8, y: T.int8) -> T.int8:
+        T.evaluate(T.ret(x + y, dtype="int8"))
+
+    @T.prim_func
+    def imm_sub(x: T.int8, y: T.int8) -> T.int8:
+        T.evaluate(T.ret(x - y, dtype="int8"))
+
+    @T.prim_func
+    def imm_truncdiv(x: T.int8, y: T.int8) -> T.int8:
+        T.evaluate(T.ret(T.truncdiv(x, y), dtype="int8"))
+
+    @T.prim_func
+    def imm_floordiv(x: T.int8, y: T.int8) -> T.int8:
+        T.evaluate(T.ret(T.floordiv(x, y), dtype="int8"))
+
+    fmul = tvm.build(imm_multiply, target="llvm")
+    fadd = tvm.build(imm_add, target="llvm")
+    fsub = tvm.build(imm_sub, target="llvm")
+    ffloordiv = tvm.build(imm_floordiv, target="llvm")
+    ftruncdiv = tvm.build(imm_truncdiv, target="llvm")
+
+    # overflow
+    check_tir_const_fold("int8", lambda x, y: x + y, fadd, 127, 1, -128)
+    check_tir_const_fold("int8", lambda x, y: x * y, fmul, 127, 127, 1)
+
+    # divide by zero
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("int8", lambda x, y: tir.floordiv(x, y), ffloordiv, 1, 0)
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("int8", lambda x, y: tir.truncdiv(x, y), ftruncdiv, 1, 0)
+
+    # i8 mod folding is not implemented
+    assert not isinstance(tir.floormod(tir.const(7, "int8"), tir.const(3, "int8")), tir.IntImm)
+    assert not isinstance(tir.truncmod(tir.const(7, "int8"), tir.const(3, "int8")), tir.IntImm)
+
+    # randomized check
+    check_tir_const_fold("int8", lambda x, y: x * y, fmul)
+    check_tir_const_fold("int8", lambda x, y: x + y, fadd)
+    check_tir_const_fold("int8", lambda x, y: x - y, fsub)
+    check_tir_const_fold(
+        "int8", lambda x, y: tir.floordiv(x, y), ffloordiv, y_range=(1, np.iinfo("int8").max)
+    )
+    check_tir_const_fold(
+        "int8", lambda x, y: tir.truncdiv(x, y), ftruncdiv, y_range=(1, np.iinfo("int8").max)
+    )
+
+
+@tvm.testing.requires_llvm()
+def test_tir_uint8_const_fold():
+    """Behavior check: folding u8 operation match platform u8 arithmetic"""
+
+    @T.prim_func
+    def imm_multiply(x: T.uint8, y: T.uint8) -> T.uint8:
+        T.evaluate(T.ret(x * y, dtype="uint8"))
+
+    @T.prim_func
+    def imm_add(x: T.uint8, y: T.uint8) -> T.uint8:
+        T.evaluate(T.ret(x + y, dtype="uint8"))
+
+    @T.prim_func
+    def imm_sub(x: T.uint8, y: T.uint8) -> T.uint8:
+        T.evaluate(T.ret(x - y, dtype="uint8"))
+
+    @T.prim_func
+    def imm_truncdiv(x: T.uint8, y: T.uint8) -> T.uint8:
+        T.evaluate(T.ret(T.truncdiv(x, y), dtype="uint8"))
+
+    @T.prim_func
+    def imm_floordiv(x: T.uint8, y: T.uint8) -> T.uint8:
+        T.evaluate(T.ret(T.floordiv(x, y), dtype="uint8"))
+
+    fmul = tvm.build(imm_multiply, target="llvm")
+    fadd = tvm.build(imm_add, target="llvm")
+    fsub = tvm.build(imm_sub, target="llvm")
+    ffloordiv = tvm.build(imm_floordiv, target="llvm")
+    ftruncdiv = tvm.build(imm_truncdiv, target="llvm")
+
+    # overflow
+    check_tir_const_fold("uint8", lambda x, y: x + y, fadd, 255, 1, 0)
+
+    # zero sub
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("uint8", lambda x, y: x - y, fsub, 0, 10)
+
+    # divide by zero
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("uint8", lambda x, y: tir.floordiv(x, y), ffloordiv, 1, 0)
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("uint8", lambda x, y: tir.truncdiv(x, y), ftruncdiv, 1, 0)
+
+    # u8 mod folding is not implemented
+    assert not isinstance(tir.floormod(tir.const(7, "uint8"), tir.const(3, "uint8")), tir.IntImm)
+    assert not isinstance(tir.truncmod(tir.const(7, "uint8"), tir.const(3, "uint8")), tir.IntImm)
+
+    # randomized check
+    check_tir_const_fold("uint8", lambda x, y: x * y, fmul)
+    check_tir_const_fold("uint8", lambda x, y: x + y, fadd)
+    check_tir_const_fold("uint8", lambda x, y: x - y, fsub)
+    check_tir_const_fold(
+        "uint8", lambda x, y: tir.floordiv(x, y), ffloordiv, y_range=(1, np.iinfo("uint8").max)
+    )
+    check_tir_const_fold(
+        "uint8", lambda x, y: tir.truncdiv(x, y), ftruncdiv, y_range=(1, np.iinfo("uint8").max)
+    )
+
+
+@tvm.testing.requires_llvm()
+def test_tir_int32_const_fold():
+    """Behavior check: folding i32 operation match platform i32 arithmetic"""
+
+    @T.prim_func
+    def imm_multiply(x: T.int32, y: T.int32) -> T.int32:
+        T.evaluate(T.ret(x * y, dtype="int32"))
+
+    @T.prim_func
+    def imm_add(x: T.int32, y: T.int32) -> T.int32:
+        T.evaluate(T.ret(x + y, dtype="int32"))
+
+    @T.prim_func
+    def imm_sub(x: T.int32, y: T.int32) -> T.int32:
+        T.evaluate(T.ret(x - y, dtype="int32"))
+
+    @T.prim_func
+    def imm_truncdiv(x: T.int32, y: T.int32) -> T.int32:
+        T.evaluate(T.ret(T.truncdiv(x, y), dtype="int32"))
+
+    @T.prim_func
+    def imm_truncmod(x: T.int32, y: T.int32) -> T.int32:
+        T.evaluate(T.ret(T.truncmod(x, y), dtype="int32"))
+
+    @T.prim_func
+    def imm_floordiv(x: T.int32, y: T.int32) -> T.int32:
+        T.evaluate(T.ret(T.floordiv(x, y), dtype="int32"))
+
+    @T.prim_func
+    def imm_floormod(x: T.int32, y: T.int32) -> T.int32:
+        T.evaluate(T.ret(T.floormod(x, y), dtype="int32"))
+
+    fmul = tvm.build(imm_multiply, target="llvm")
+    fadd = tvm.build(imm_add, target="llvm")
+    fsub = tvm.build(imm_sub, target="llvm")
+    ffloordiv = tvm.build(imm_floordiv, target="llvm")
+    ffloormod = tvm.build(imm_floormod, target="llvm")
+    ftruncdiv = tvm.build(imm_truncdiv, target="llvm")
+    ftruncmod = tvm.build(imm_truncmod, target="llvm")
+
+    # i32 overflow is not specified, only check for range
+    assert -(2**31) <= int(tir.const(2**31 - 1, "int32") + tir.const(1, "int32")) < 2**31
+    assert -(2**31) <= int(tir.const(-(2**31), "int32") - tir.const(1, "int32")) < 2**31
+
+    # divide by zero
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("int32", lambda x, y: tir.floordiv(x, y), ffloordiv, 1, 0)
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("int32", lambda x, y: tir.floormod(x, y), ffloormod, 1, 0)
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("int32", lambda x, y: tir.truncdiv(x, y), ftruncdiv, 1, 0)
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("int32", lambda x, y: tir.truncmod(x, y), ftruncmod, 1, 0)
+
+    # randomized check
+    check_tir_const_fold("int32", lambda x, y: x * y, fmul, skip_overflow=True)
+    check_tir_const_fold("int32", lambda x, y: x + y, fadd, skip_overflow=True)
+    check_tir_const_fold("int32", lambda x, y: x - y, fsub, skip_overflow=True)
+    check_tir_const_fold(
+        "int32",
+        lambda x, y: tir.floordiv(x, y),
+        ffloordiv,
+        y_range=(1, np.iinfo("int32").max),
+        skip_overflow=True,
+    )
+    check_tir_const_fold(
+        "int32",
+        lambda x, y: tir.truncdiv(x, y),
+        ftruncdiv,
+        y_range=(1, np.iinfo("int32").max),
+        skip_overflow=True,
+    )
+    check_tir_const_fold(
+        "int32",
+        lambda x, y: tir.floormod(x, y),
+        ffloormod,
+        y_range=(1, np.iinfo("int32").max),
+        skip_overflow=False,
+    )
+    check_tir_const_fold(
+        "int32",
+        lambda x, y: tir.truncmod(x, y),
+        ftruncmod,
+        y_range=(1, np.iinfo("int32").max),
+        skip_overflow=False,
+    )
+
+
+@tvm.testing.requires_llvm()
+def test_tir_uint32_const_fold():
+    """Behavior check: folding u32 operation match platform u32 arithmetic"""
+
+    @T.prim_func
+    def imm_multiply(x: T.uint32, y: T.uint32) -> T.uint32:
+        T.evaluate(T.ret(x * y, dtype="uint32"))
+
+    @T.prim_func
+    def imm_add(x: T.uint32, y: T.uint32) -> T.uint32:
+        T.evaluate(T.ret(x + y, dtype="uint32"))
+
+    @T.prim_func
+    def imm_sub(x: T.uint32, y: T.uint32) -> T.uint32:
+        T.evaluate(T.ret(x - y, dtype="uint32"))
+
+    @T.prim_func
+    def imm_truncdiv(x: T.uint32, y: T.uint32) -> T.uint32:
+        T.evaluate(T.ret(T.truncdiv(x, y), dtype="uint32"))
+
+    @T.prim_func
+    def imm_floordiv(x: T.uint32, y: T.uint32) -> T.uint32:
+        T.evaluate(T.ret(T.floordiv(x, y), dtype="uint32"))
+
+    fmul = tvm.build(imm_multiply, target="llvm")
+    fadd = tvm.build(imm_add, target="llvm")
+    fsub = tvm.build(imm_sub, target="llvm")
+    ffloordiv = tvm.build(imm_floordiv, target="llvm")
+    ftruncdiv = tvm.build(imm_truncdiv, target="llvm")
+
+    # u32 overflow is not specified, only check for range
+    assert 0 <= int(tir.const(2**32 - 1, "uint32") + tir.const(1, "uint32")) < 2**32
+
+    # divide by zero
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("uint32", lambda x, y: tir.floordiv(x, y), ffloordiv, 1, 0)
+    with pytest.raises(tvm.TVMError):
+        check_tir_const_fold("uint32", lambda x, y: tir.truncdiv(x, y), ftruncdiv, 1, 0)
+
+    # u8 mod folding is not implemented
+    assert not isinstance(tir.floormod(tir.const(7, "uint32"), tir.const(3, "uint32")), tir.IntImm)
+    assert not isinstance(tir.truncmod(tir.const(7, "uint32"), tir.const(3, "uint32")), tir.IntImm)
+
+    # randomized check
+    check_tir_const_fold("uint32", lambda x, y: x * y, fmul, skip_overflow=True)
+    check_tir_const_fold("uint32", lambda x, y: x + y, fadd, skip_overflow=True)
+    check_tir_const_fold("uint32", lambda x, y: x - y, fsub, skip_overflow=True)
+    check_tir_const_fold(
+        "uint32",
+        lambda x, y: tir.floordiv(x, y),
+        ffloordiv,
+        y_range=(1, np.iinfo("uint32").max),
+        skip_overflow=False,
+    )
+    check_tir_const_fold(
+        "uint32",
+        lambda x, y: tir.truncdiv(x, y),
+        ftruncdiv,
+        y_range=(1, np.iinfo("uint32").max),
+        skip_overflow=False,
+    )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_transform_narrow_datatype.py b/tests/python/unittest/test_tir_transform_narrow_datatype.py
index d66b4ef5dd5b..20818a5b326a 100644
--- a/tests/python/unittest/test_tir_transform_narrow_datatype.py
+++ b/tests/python/unittest/test_tir_transform_narrow_datatype.py
@@ -67,8 +67,6 @@ def check(m, n, target_bits, target_dtype):
     # const shape
     # i32 -> i32
     check(2, 2, 32, "int32")
-    # i32 + i32 is not promoted to i64 even if overflow
-    check(2**16, 2**16, 32, "int32")
     # i64 -> i32
     check(const(2, dtype="int64"), const(2, dtype="int64"), 32, "int32")
     check(const(2**16, dtype="int64"), const(2**16, dtype="int64"), 32, "int64")
@@ -100,12 +98,6 @@ def check(m, n, target_bits, target_dtype):
 
     # i32 -> i32
     check(2, 32, target_bits=32, target_dtype="int32")
-    check(
-        2**30,
-        32,  # i32 + i32 is not promoted to i64 even in the case of overflow
-        target_bits=32,
-        target_dtype="int32",
-    )
     # i64 -> i32
     check(const(2, dtype="int64"), const(32, dtype="int64"), target_bits=32, target_dtype="int32")
     check(
@@ -162,7 +154,6 @@ def check(m, lanes, target_bits, target_dtype):
 
     # i32 -> i32
     check(const(2**10, dtype="int32"), 2, target_bits=32, target_dtype="int32")
-    check(const(2**32, dtype="int32"), 2, target_bits=32, target_dtype="int32")
     # i64 -> i32
     check(const(2**10, dtype="int64"), 2, target_bits=32, target_dtype="int32")
     check(const(2**32, dtype="int64"), 2, target_bits=32, target_dtype="int64")

From 4c05656c65e6ab73f398f3fa982f84d48b16b55d Mon Sep 17 00:00:00 2001
From: Tasmia Rahman <89925728+trahman-quic@users.noreply.github.com>
Date: Fri, 9 Sep 2022 16:19:45 -0500
Subject: [PATCH 144/704] [TOPI][Hexagon] Add test and schedule for uint8
 resize2d (#12559)

* [TOPI][Hexagon] Add test and schedule for uint8 resize2d

* Fix correctness issue

* Reformat

* Remove cubic from testing

* Remove unnecessary else
---
 python/tvm/topi/hexagon/resize2d.py           | 41 +++++++++++++--
 .../test_hexagon/topi/test_resize2d.py        | 52 +++++++++++++++----
 2 files changed, 80 insertions(+), 13 deletions(-)

diff --git a/python/tvm/topi/hexagon/resize2d.py b/python/tvm/topi/hexagon/resize2d.py
index ed544143b583..0e817e2e9330 100755
--- a/python/tvm/topi/hexagon/resize2d.py
+++ b/python/tvm/topi/hexagon/resize2d.py
@@ -58,24 +58,59 @@ def resize2d_compute(
     )
 
 
-def tir_broadcast_schedule(
+def tir_resize2d_schedule(
     out_m,
     input_a,
     input_layout: str,
     output_layout: str,
 ):
-    """Schedule for input and output layout nhwc-8h2w32c2w-2d"""
+    """Schedule for input and output layout nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d"""
     func = te.create_prim_func([input_a, out_m])
 
     s = tir.Schedule(func)
 
     block = s.get_block("resize")
 
-    if input_layout == "nhwc-8h2w32c2w-2d":
+    if input_layout in (
+        "nhwc-8h2w32c2w-2d",
+        "nhwc-8h8w32c-2d",
+    ):
         input_transformed_layout = get_layout_transform_fn(input_layout)
         s.transform_layout(block, buffer=("read", 0), index_map=input_transformed_layout)
 
     output_transformed_layout = get_layout_transform_fn(output_layout)
     s.transform_layout(block, buffer=("write", 0), index_map=output_transformed_layout)
 
+    if output_layout == "nhwc-8h2w32c2w-2d":
+        # Fixed chunk size is 2048 byte
+        # For fp16 the layout for fixed chunk is 8x4x32
+        # where each element is 2 bytes
+        # Split and reorder is done to iterate over the fixed chunk
+        # Channel is split by a factor of 32
+        # Width is split by a factor of 4
+        # Height is split by a factor of 8
+        n, h, w, c = s.get_loops(block)
+
+        ho, hi = s.split(h, [None, 8])
+        wo, wi = s.split(w, [None, 4])
+        co, ci = s.split(c, [None, 32])
+
+        s.reorder(n, ho, wo, co, hi, wi, ci)
+
+    elif output_layout == "nhwc-8h8w32c-2d":
+        # Fixed chunk size is 2048 byte
+        # For uint8 the layout for fixed chunk is 8x8x32
+        # where each element is 1 bytes
+        # Split and reorder is done to iterate over the fixed chunk
+        # Channel is split by a factor of 32
+        # Width is split by a factor of 8
+        # Height is split by a factor of 8
+        n, h, w, c = s.get_loops(block)
+
+        ho, hi = s.split(h, [None, 8])
+        wo, wi = s.split(w, [None, 8])
+        co, ci = s.split(c, [None, 32])
+
+        s.reorder(n, ho, wo, co, hi, wi, ci)
+
     return s
diff --git a/tests/python/contrib/test_hexagon/topi/test_resize2d.py b/tests/python/contrib/test_hexagon/topi/test_resize2d.py
index d0c2c1464a95..1ef9f50977c5 100755
--- a/tests/python/contrib/test_hexagon/topi/test_resize2d.py
+++ b/tests/python/contrib/test_hexagon/topi/test_resize2d.py
@@ -26,26 +26,46 @@
 
 @tvm.testing.fixture
 def expected_output_np(
-    input_np, in_height, in_width, out_height, out_width, layout, method, coord_trans
+    input_np,
+    in_height,
+    in_width,
+    out_height,
+    out_width,
+    layout,
+    method,
+    coord_trans,
+    dtype,
 ):
     scale_h = out_height / in_height
     scale_w = out_width / in_width
+
     return resize2d_python(input_np, (scale_h, scale_w), layout, method, coord_trans)
 
 
 @tvm.testing.fixture
 def input_np(input_shape, dtype):
-    return np.random.random(input_shape).astype(dtype)
+    if dtype == "float16":
+        return np.random.random(input_shape).astype(dtype)
+    if dtype == "uint8":
+        return np.random.randint(0, 255, input_shape).astype(dtype)
+    if dtype == "int8":
+        return np.random.randint(-128, 127, input_shape).astype(dtype)
 
 
 @tvm.testing.fixture
-def transformed_input_np(input_np, layout, input_crouton_layout):
-    return transform_numpy(input_np, layout.lower(), input_crouton_layout)
+def transformed_input_np(input_np, layout, input_crouton_layout, dtype):
+    if dtype == "float16" or dtype == "uint8" or dtype == "int8":
+        return transform_numpy(input_np, layout.lower(), input_crouton_layout)
+
+    raise RuntimeError(f"Unsupported data type '{dtype}'")
 
 
 @tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, layout, output_layout):
-    return transform_numpy(expected_output_np, layout.lower(), output_layout)
+def transformed_expected_output_np(expected_output_np, layout, output_layout, dtype):
+    if dtype == "float16" or dtype == "uint8" or dtype == "int8":
+        return transform_numpy(expected_output_np, layout.lower(), output_layout)
+
+    raise RuntimeError(f"Unsupported data type '{dtype}'")
 
 
 @tvm.testing.fixture
@@ -80,10 +100,11 @@ class TestResize2d:
 
     (layout, input_crouton_layout, output_layout, dtype,) = tvm.testing.parameters(
         ("NHWC", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"),
+        ("NHWC", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"),
     )
 
     coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel")
-    method = tvm.testing.parameter("nearest_neighbor", "linear", "cubic")
+    method = tvm.testing.parameter("nearest_neighbor", "linear")
 
     @tvm.testing.requires_hexagon
     def test_resize2d(
@@ -112,14 +133,18 @@ def test_resize2d(
             layout=layout,
             coordinate_transformation_mode=coord_trans,
             method=method,
+            out_dtype=dtype,
         )
 
-        tir_schedule = s1.tir_broadcast_schedule(M, A, input_crouton_layout, output_layout)
+        tir_schedule = s1.tir_resize2d_schedule(M, A, input_crouton_layout, output_layout)
 
         sch = tir_schedule.mod
 
         input_axis_separator = [4]
-        if output_layout == "nhwc-8h2w32c2w-2d":
+        if output_layout in (
+            "nhwc-8h2w32c2w-2d",
+            "nhwc-8h8w32c-2d",
+        ):
             output_axis_separator = [4]
         else:
             raise RuntimeError(f"Unexpected layout '{output_layout}'")
@@ -155,8 +180,15 @@ def test_resize2d(
         # convert nd to np and reshape to fixed chunk size layout
         if output_layout == "nhwc-8h2w32c2w-2d":
             M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2])
+        elif output_layout == "nhwc-8h8w32c-2d":
+            M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32])
 
-        np.testing.assert_allclose(transformed_expected_output_np, M_data_np, rtol=1e-3, atol=1e-3)
+        if dtype == "float16":
+            np.testing.assert_allclose(
+                transformed_expected_output_np, M_data_np, rtol=1e-3, atol=1e-3
+            )
+        elif dtype == "int8" or dtype == "uint8":
+            np.testing.assert_allclose(transformed_expected_output_np, M_data_np, rtol=1, atol=1)
 
 
 if __name__ == "__main__":

From 2eed6636436901f8a862304603d9b40d83432261 Mon Sep 17 00:00:00 2001
From: Tasmia Rahman <89925728+trahman-quic@users.noreply.github.com>
Date: Fri, 9 Sep 2022 16:21:23 -0500
Subject: [PATCH 145/704] [TOPI][Hexagon] Implement quantized elementwise for
 hexagon (#12606)

* [TOPI][Hexagon] Add test and schedule for uint8 resize2d

* Fix correctness issue

* Reformat

* [TOPI][Hexagon] Implement quantized elementwise

* Reformat

* Address review comments

* Reformat

* Revert

* Address review comments
---
 python/tvm/topi/hexagon/qnn/__init__.py       |   2 +-
 python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py | 270 ++++++++++++++++++
 .../topi/test_add_subtract_multiply.py        | 217 ++++++++++++--
 3 files changed, 463 insertions(+), 26 deletions(-)
 create mode 100755 python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py

diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
index 25d1e6d1854d..ef9c025ba5b2 100644
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -18,7 +18,7 @@
 """ Computes and schedules for Hexagon quantized ops """
 
 from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule
-
+from .qadd_qsub_qmul import *
 from .dequantize import (
     dequantize_compute,
     dequantize_schedule,
diff --git a/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py b/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py
new file mode 100755
index 000000000000..043ad313bdef
--- /dev/null
+++ b/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py
@@ -0,0 +1,270 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Compute and schedule for quantized add, multiply, subtract op
+
+Please note the following assumptions made by the implementation:
+
+1) The inputs will be multiple of crouton layout except for the axis that needs broadcasting."""
+
+from tvm import te
+from tvm import tir
+from ..utils import get_layout_transform_fn, get_fixed_point_value
+
+
+def broadcast_axis(tensor_A, tensor_B):
+    """Find out the indices that will have broadcasting"""
+    A_broadcast = []
+    B_broadcast = []
+
+    for i in range(len(tensor_A.shape)):
+        if tensor_A.shape[i] == tensor_B.shape[i]:
+            A_broadcast.append(1)
+            B_broadcast.append(1)
+        elif tensor_A.shape[i] == 1:
+            A_broadcast.append(0)
+            B_broadcast.append(1)
+        elif tensor_B.shape[i] == 1:
+            A_broadcast.append(1)
+            B_broadcast.append(0)
+    return A_broadcast, B_broadcast
+
+
+def saturate(x: te.Tensor, dtype: str):
+    """Saturate value for the specified data type"""
+    return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype)))
+
+
+def get_int_scale(
+    scale_A: float,
+    scale_B: float,
+    scale_M: float,
+    zero_point_A: int,
+    zero_point_B: int,
+    zero_point_M: int,
+    op: str,
+):
+    """
+    Get fixed-point number and exp_scale_factor from topi.hexagon.utils.get_fixed_point_value.
+    Also, depending on the op, this function uses exp_scale_factor(log2 of the scale factor)
+    to adjust the output's zero_point.
+    """
+
+    C_recip = 1 / scale_M
+
+    if op == "qmul":
+        scale = scale_A * scale_B * C_recip
+        scale_fixed_point, rsh = get_fixed_point_value(scale, "int16")
+
+        # We need to adjust output's zero point value since the compute for the op is multiplied
+        # by a scaling factor.
+        # The scaling factor is 2^x where x is the exp_scale_factor which is assigned to rsh here.
+        # Since zero_point_M is multipled by 2^rsh while converting floating-point scale value
+        # into fixed-point number, we left shift it by rsh in our compute to reflect that.
+
+        corr = zero_point_M << rsh
+
+        return scale_fixed_point, rsh, corr
+
+    a_scale_f = scale_A * C_recip
+    b_scale_f = scale_B * C_recip
+    scale_fixed_point_a, rsh_a = get_fixed_point_value(a_scale_f, "int16")
+    scale_fixed_point_b, rsh_b = get_fixed_point_value(b_scale_f, "int16")
+
+    # Here we have two exp_scale_factors rsh_a and rsh_b.
+    # To avoid complexity, we want to use a common exp_scale_factor and
+    # we want to use the lowest of the two.
+
+    # Since, either of scale_fixed_point_a or scale_fixed_point_b has already been multiplied
+    # by 2^max(rsh_a, rsh_b) in topi.hexagon.utils.get_fixed_point_value,
+    # we want to undo that by right shifting that scale_fixed_point value
+    # by the difference of rsh_a and rsh_b.
+
+    # This results into having a common exp_scale_factor for both scale_fixed_point_a
+    # and scale_fixed_point_b.
+
+    # We also set rsh here which is used to adjust the zero_point_M and compute the corr value,
+    # computation of which comes from the original equation of the op's compute.
+
+    if rsh_a > rsh_b:
+        scale_fixed_point_a = scale_fixed_point_a >> (rsh_a - rsh_b)
+        rsh = rsh_b
+    else:
+        scale_fixed_point_b = scale_fixed_point_b >> (rsh_b - rsh_a)
+        rsh = rsh_a
+
+    if op == "qadd":
+        corr = (zero_point_M << rsh) - (
+            zero_point_A * scale_fixed_point_a + zero_point_B * scale_fixed_point_b
+        )
+    else:
+        corr = (zero_point_M << rsh) - (
+            zero_point_A * scale_fixed_point_a - zero_point_B * scale_fixed_point_b
+        )
+
+    return scale_fixed_point_a, scale_fixed_point_b, rsh, corr
+
+
+def qadd_broadcast_compute(
+    tensor_A: te.Tensor,
+    tensor_B: te.Tensor,
+    output_shape: list,
+    zero_point_A: int,
+    scale_A: float,
+    zero_point_B: int,
+    scale_B: float,
+    zero_point_M: int,
+    scale_M: float,
+    dtype: str,
+):
+    """Compute quantized add with broadcasting"""
+    A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B)
+    n_a, h_a, w_a, c_a = A_broadcast
+    n_b, h_b, w_b, c_b = B_broadcast
+
+    scale_a, scale_b, rsh, corr = get_int_scale(
+        scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qadd"
+    )
+
+    return te.compute(
+        output_shape,
+        lambda n, h, w, c: saturate(
+            (
+                (
+                    (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] * scale_a)
+                    + (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] * scale_b)
+                    + corr
+                )
+                >> rsh
+            ),
+            dtype,
+        ).astype(dtype),
+    )
+
+
+def qsubtract_broadcast_compute(
+    tensor_A: te.Tensor,
+    tensor_B: te.Tensor,
+    output_shape: list,
+    zero_point_A: int,
+    scale_A: float,
+    zero_point_B: int,
+    scale_B: float,
+    zero_point_M: int,
+    scale_M: float,
+    dtype: str,
+):
+    """Compute quantized subtract with broadcasting"""
+    A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B)
+    n_a, h_a, w_a, c_a = A_broadcast
+    n_b, h_b, w_b, c_b = B_broadcast
+
+    scale_a, scale_b, rsh, corr = get_int_scale(
+        scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qsub"
+    )
+
+    return te.compute(
+        output_shape,
+        lambda n, h, w, c: saturate(
+            (
+                (
+                    (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] * scale_a)
+                    - (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] * scale_b)
+                    + corr
+                )
+                >> rsh
+            ),
+            dtype,
+        ).astype(dtype),
+    )
+
+
+def qmultiply_broadcast_compute(
+    tensor_A: te.Tensor,
+    tensor_B: te.Tensor,
+    output_shape: list,
+    zero_point_A: int,
+    scale_A: float,
+    zero_point_B: int,
+    scale_B: float,
+    zero_point_M: int,
+    scale_M: float,
+    dtype: str,
+):
+    """Compute quantized multiply with broadcasting"""
+    A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B)
+    n_a, h_a, w_a, c_a = A_broadcast
+    n_b, h_b, w_b, c_b = B_broadcast
+
+    scale_int, rsh, corr = get_int_scale(
+        scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qmul"
+    )
+
+    return te.compute(
+        output_shape,
+        lambda n, h, w, c: saturate(
+            (
+                (
+                    scale_int
+                    * (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] - zero_point_A)
+                    * (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] - zero_point_B)
+                    + corr
+                )
+                >> rsh
+            ),
+            dtype,
+        ).astype(dtype),
+    )
+
+
+def tir_schedule_quant(
+    out_M: te.Tensor,
+    tensor_A: te.Tensor,
+    tensor_B: te.Tensor,
+    output_layout: str,
+    tensor_A_layout: str,
+    tensor_B_layout: str,
+):
+    """Schedule for output layout nhwc-8h8w32c-2d"""
+    func = te.create_prim_func([tensor_A, tensor_B, out_M])
+
+    s = tir.Schedule(func)
+
+    block = s.get_block("compute")
+
+    if tensor_A_layout == "nhwc-8h8w32c-2d":
+        tensor_A_transformed_layout = get_layout_transform_fn(tensor_A_layout)
+        s.transform_layout(block, buffer=tensor_A.name, index_map=tensor_A_transformed_layout)
+
+    if tensor_B_layout == "nhwc-8h8w32c-2d":
+        tensor_B_transformed_layout = get_layout_transform_fn(tensor_B_layout)
+        s.transform_layout(block, buffer=tensor_B.name, index_map=tensor_B_transformed_layout)
+
+    output_transformed_layout = get_layout_transform_fn(output_layout)
+    s.transform_layout(block, buffer=out_M.name, index_map=output_transformed_layout)
+
+    n, h, w, c = s.get_loops(block)
+
+    h_o, h_i = s.split(h, [None, 8])
+    w_o, w_i = s.split(w, [None, 8])
+    c_o, c_i = s.split(c, [None, 32])
+    wio, wii = s.split(w_i, [None, 4])
+
+    s.reorder(n, h_o, w_o, c_o, h_i, wio, wii, c_i)
+
+    return s
diff --git a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
index 606aa628d009..fe70745143a9 100755
--- a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
+++ b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
@@ -22,7 +22,8 @@
 import tvm
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+import tvm.topi.hexagon.qnn as qn
+from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
 
 
 @tvm.testing.fixture
@@ -38,34 +39,77 @@ def expected_output_np(input_np_A, input_np_B, op_name):
 
 @tvm.testing.fixture
 def input_np_A(input_shape_A, dtype):
+    if dtype == "uint8" or dtype == "int8":
+        dtype = "float32"
     return np.random.random(input_shape_A).astype(dtype)
 
 
 @tvm.testing.fixture
 def input_np_B(input_shape_B, dtype):
+    if dtype == "uint8" or dtype == "int8":
+        dtype = "float32"
     return np.random.random(input_shape_B).astype(dtype)
 
 
 @tvm.testing.fixture
-def transformed_input_np_A(input_np_A, input_A_layout):
-    return transform_numpy(input_np_A, "nhwc", input_A_layout)
+def quantize_input_np_A(input_np_A, dtype):
+    if dtype == "uint8" or dtype == "int8":
+        global zero_point_A_val, scale_A_val
+        input_np_A_quantized, scale_A_val, zero_point_A_val = quantize_np(input_np_A, dtype)
+        return input_np_A_quantized
 
 
 @tvm.testing.fixture
-def transformed_input_np_B(input_np_B, input_B_layout):
-    return transform_numpy(input_np_B, "nhwc", input_B_layout)
+def quantize_input_np_B(input_np_B, dtype):
+    if dtype == "uint8" or dtype == "int8":
+        global zero_point_B_val, scale_B_val
+        input_np_B_quantized, scale_B_val, zero_point_B_val = quantize_np(input_np_B, dtype)
+        return input_np_B_quantized
 
 
 @tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, output_layout):
-    return transform_numpy(expected_output_np, "nhwc", output_layout)
+def transformed_input_np_A(input_np_A, quantize_input_np_A, input_A_layout, dtype):
+    if dtype == "float16":
+        return transform_numpy(input_np_A, "nhwc", input_A_layout)
+    if dtype == "uint8" or dtype == "int8":
+        return transform_numpy(quantize_input_np_A, "nhwc", input_A_layout)
+
+    raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+
+@tvm.testing.fixture
+def transformed_input_np_B(input_np_B, quantize_input_np_B, input_B_layout, dtype):
+    if dtype == "float16":
+        return transform_numpy(input_np_B, "nhwc", input_B_layout)
+    if dtype == "uint8" or dtype == "int8":
+        return transform_numpy(quantize_input_np_B, "nhwc", input_B_layout)
+
+    raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+
+@tvm.testing.fixture
+def transformed_expected_output_np(expected_output_np, output_layout, dtype):
+    if dtype == "float16":
+        return transform_numpy(expected_output_np, "nhwc", output_layout)
+    if dtype == "uint8" or dtype == "int8":
+        global zero_point_M_val, scale_M_val
+        out_ref_quantized, scale_M_val, zero_point_M_val = quantize_np(expected_output_np, dtype)
+        return transform_numpy(out_ref_quantized, "nhwc", output_layout)
+
+    raise RuntimeError(f"Unsupported data type '{dtype}'")
 
 
 def hexagon_wrapper_allocation(
-    device, layout, axis_separators, tensor_shape=None, data=None, transformed_data=None, dtype=None
+    device,
+    layout,
+    axis_separators,
+    tensor_shape=None,
+    data_original=None,
+    transformed_data=None,
+    dtype=None,
 ):
     """Input layout can either be nhwc-8h2w32c2w-2d or nhwc"""
-    if layout == "nhwc-8h2w32c2w-2d":
+    if layout == "nhwc-8h2w32c2w-2d" or layout == "nhwc-8h8w32c-2d":
         data_nd = allocate_hexagon_array(
             device,
             tensor_shape=tensor_shape,
@@ -77,7 +121,7 @@ def hexagon_wrapper_allocation(
     elif layout == "nhwc":
         data_nd = allocate_hexagon_array(
             device,
-            data=data,
+            data=data_original,
         )
     return data_nd
 
@@ -136,6 +180,86 @@ class TestAddSubtractMultiplyBroadcast2d:
             "nhwc-8h2w32c2w-2d",
             "float16",
         ),
+        # broadcast all axes in one input
+        (
+            [1, 48, 56, 32],
+            [1, 1, 1, 1],
+            "nhwc-8h2w32c2w-2d",
+            "nhwc",
+            "nhwc-8h2w32c2w-2d",
+            "float16",
+        ),
+        (
+            [1, 48, 32, 64],
+            [1, 48, 32, 64],
+            "nhwc-8h8w32c-2d",
+            "nhwc-8h8w32c-2d",
+            "nhwc-8h8w32c-2d",
+            "uint8",
+        ),
+        # broadcast axis 2 in one input
+        (
+            [1, 48, 32, 64],
+            [1, 48, 1, 64],
+            "nhwc-8h8w32c-2d",
+            "nhwc",
+            "nhwc-8h8w32c-2d",
+            "uint8",
+        ),
+        # broadcast axis 1 in one input
+        (
+            [1, 48, 32, 64],
+            [1, 1, 32, 64],
+            "nhwc-8h8w32c-2d",
+            "nhwc",
+            "nhwc-8h8w32c-2d",
+            "uint8",
+        ),
+        # broadcast axis 3 in one input
+        (
+            [1, 8, 8, 32],
+            [1, 8, 8, 1],
+            "nhwc-8h8w32c-2d",
+            "nhwc",
+            "nhwc-8h8w32c-2d",
+            "uint8",
+        ),
+        # broadcast both inputs
+        (
+            [1, 56, 1, 128],
+            [1, 1, 64, 1],
+            "nhwc",
+            "nhwc",
+            "nhwc-8h8w32c-2d",
+            "uint8",
+        ),
+        # broadcast both inputs
+        (
+            [1, 48, 1, 1],
+            [1, 1, 32, 32],
+            "nhwc",
+            "nhwc",
+            "nhwc-8h8w32c-2d",
+            "uint8",
+        ),
+        # broadcast both inputs
+        (
+            [1, 48, 1, 32],
+            [1, 1, 32, 1],
+            "nhwc",
+            "nhwc",
+            "nhwc-8h8w32c-2d",
+            "uint8",
+        ),
+        # broadcast all axes in one input
+        (
+            [1, 48, 56, 32],
+            [1, 1, 1, 1],
+            "nhwc-8h8w32c-2d",
+            "nhwc",
+            "nhwc-8h8w32c-2d",
+            "uint8",
+        ),
     )
 
     op_name = tvm.testing.parameter("add", "subtract", "multiply")
@@ -148,6 +272,8 @@ def test_transform(
         input_shape_B,
         input_np_A,
         input_np_B,
+        quantize_input_np_A,
+        quantize_input_np_B,
         transformed_input_np_A,
         transformed_input_np_B,
         expected_output_np,
@@ -158,23 +284,50 @@ def test_transform(
         input_B_layout,
         op_name,
     ):
+        output_shape = expected_output_np.shape
         target_hexagon = tvm.target.hexagon("v69")
         A = te.placeholder(input_shape_A, name="A", dtype=dtype)
         B = te.placeholder(input_shape_B, name="B", dtype=dtype)
-        if op_name == "add":
-            M = sl.add_broadcast_compute(A, B)
-        elif op_name == "subtract":
-            M = sl.subtract_broadcast_compute(A, B)
-        elif op_name == "multiply":
-            M = sl.multiply_broadcast_compute(A, B)
-
-        tir_schedule = sl.tir_broadcast_schedule(
-            M, A, B, output_layout, input_A_layout, input_B_layout, op_name
-        )
+        if dtype == "float16":
+            if op_name == "add":
+                M = sl.add_broadcast_compute(A, B)
+            elif op_name == "subtract":
+                M = sl.subtract_broadcast_compute(A, B)
+            elif op_name == "multiply":
+                M = sl.multiply_broadcast_compute(A, B)
+            tir_schedule = sl.tir_broadcast_schedule(
+                M, A, B, output_layout, input_A_layout, input_B_layout, op_name
+            )
+        elif dtype == "uint8" or dtype == "int8":
+            args = [
+                A,
+                B,
+                output_shape,
+                zero_point_A_val,
+                scale_A_val,
+                zero_point_B_val,
+                scale_B_val,
+                zero_point_M_val,
+                scale_M_val,
+                dtype,
+            ]
+            if op_name == "add":
+                M = qn.qadd_broadcast_compute(*args)
+            elif op_name == "subtract":
+                M = qn.qsubtract_broadcast_compute(*args)
+            elif op_name == "multiply":
+                M = qn.qmultiply_broadcast_compute(*args)
+            tir_schedule = qn.tir_schedule_quant(
+                M, A, B, output_layout, input_A_layout, input_B_layout
+            )
+
         sch = tir_schedule.mod
 
         input_axis_separator = [4]
-        if output_layout == "nhwc-8h2w32c2w-2d":
+        if output_layout in (
+            "nhwc-8h2w32c2w-2d",
+            "nhwc-8h8w32c-2d",
+        ):
             output_axis_separator = [4]
         else:
             raise RuntimeError(f"Unexpected layout '{output_layout}'")
@@ -187,19 +340,26 @@ def test_transform(
                 name="slice_op_with_transform",
             )
 
-        output_shape = expected_output_np.shape
+        if dtype == "float16":
+            in_data_np_A = input_np_A
+            in_data_np_B = input_np_B
+        elif dtype == "int8" or dtype == "uint8":
+            in_data_np_A = quantize_input_np_A
+            in_data_np_B = quantize_input_np_B
+        else:
+            raise RuntimeError(f"Unsupport dtype '{dtype}'")
 
         A_data_nd = hexagon_wrapper_allocation(
             hexagon_session.device,
             layout=input_A_layout,
-            data=input_np_A,
+            data_original=in_data_np_A,
             transformed_data=transformed_input_np_A,
             axis_separators=input_axis_separator,
         )
         B_data_nd = hexagon_wrapper_allocation(
             hexagon_session.device,
             layout=input_B_layout,
-            data=input_np_B,
+            data_original=in_data_np_B,
             transformed_data=transformed_input_np_B,
             axis_separators=input_axis_separator,
         )
@@ -218,8 +378,15 @@ def test_transform(
         # convert nd to np and reshape to fixed chunk size layout
         if output_layout == "nhwc-8h2w32c2w-2d":
             M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2])
+        elif output_layout == "nhwc-8h8w32c-2d":
+            M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32])
 
-        np.testing.assert_allclose(transformed_expected_output_np, M_data_np, rtol=1e-3, atol=1e-3)
+        if dtype == "float16":
+            np.testing.assert_allclose(
+                transformed_expected_output_np, M_data_np, rtol=1e-3, atol=1e-3
+            )
+        elif dtype == "int8" or dtype == "uint8":
+            np.testing.assert_allclose(transformed_expected_output_np, M_data_np, rtol=1, atol=1)
 
 
 if __name__ == "__main__":

From 76f91b42b96b7f3274509ed713a118c117ed2f65 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Sat, 10 Sep 2022 10:58:45 +0100
Subject: [PATCH 146/704] [ETHOSN] Update driver stack version to 22.08
 (#12650)

Updates the driver stack used by the NPU to the latest released version
(semantic version 3.1.0), while maintaining backwards compatibility for
the previous version 22.05 (semantic 3.0.1) during the migration period.
In addition, support for split is re-introduced as this is now supported
in 22.08.

Change-Id: I86bce3469f0b8ad52e66461ae055dec6717b3527
---
 .../ubuntu_install_ethosn_driver_stack.sh     |  2 +-
 python/tvm/relay/op/contrib/ethosn.py         |  8 ++---
 .../contrib/test_ethosn/test_networks.py      | 33 +++++++++++++++----
 .../python/contrib/test_ethosn/test_resize.py |  9 -----
 .../python/contrib/test_ethosn/test_split.py  | 15 +++++++--
 .../contrib/test_ethosn/test_topologies.py    | 18 ++++++----
 6 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/docker/install/ubuntu_install_ethosn_driver_stack.sh b/docker/install/ubuntu_install_ethosn_driver_stack.sh
index 1f8373a839e9..1696b3230e2f 100755
--- a/docker/install/ubuntu_install_ethosn_driver_stack.sh
+++ b/docker/install/ubuntu_install_ethosn_driver_stack.sh
@@ -22,7 +22,7 @@ set -o pipefail
 
 repo_url="https://github.com/Arm-software/ethos-n-driver-stack"
 repo_dir="ethosn-driver"
-repo_revision="22.05"
+repo_revision="22.08"
 install_path="/opt/arm/$repo_dir"
 
 tmpdir=$(mktemp -d)
diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index 5129ed9ffaef..c8003c8da4d5 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -102,11 +102,11 @@ def partition_for_ethosn(mod, params=None, **opts):
         raise ValueError("When targeting Ethos(TM)-N78, -variant=n78 should be set.")
 
     api_version = ethosn_api_version()
-    expected_api_version = "3.0.1"
-    if api_version != LooseVersion(expected_api_version):
+    supported_api_versions = ["3.0.1", "3.1.0"]
+    if all(api_version != LooseVersion(exp_ver) for exp_ver in supported_api_versions):
         raise ValueError(
             f"Driver stack version {api_version} is unsupported. "
-            f"Please use version {expected_api_version}."
+            f"Please use version in {supported_api_versions}."
         )
 
     if params:
@@ -415,7 +415,7 @@ def split(expr):
     """Check if a split is supported by Ethos-N."""
     if not ethosn_available():
         return False
-    if ethosn_api_version() >= LooseVersion("3.0.1"):
+    if ethosn_api_version() == LooseVersion("3.0.1"):
         return False
     if not _ethosn.split(expr):
         return False
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index b584a579b8be..75f3479a5a9c 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -14,7 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=wrong-import-position
+# pylint: disable=wrong-import-position, wrong-import-order
+
 """Arm(R) Ethos(TM)-N integration end-to-end network tests"""
 
 import pytest
@@ -22,11 +23,16 @@
 pytest.importorskip("tflite")
 pytest.importorskip("tensorflow")
 
+from distutils.version import LooseVersion
+
 import tflite.Model
+
 from tvm import relay
 from tvm.testing import requires_ethosn
 from tvm.contrib import download
+from tvm.relay.op.contrib.ethosn import ethosn_api_version
 import tvm.relay.testing.tf as tf_testing
+
 from . import infrastructure as tei
 
 
@@ -119,7 +125,10 @@ def test_mobilenet_v1():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    _compile_hash = {"50186822915909303e813205db80e032"}
+    if ethosn_api_version() == LooseVersion("3.1.0"):
+        _compile_hash = {"c37fec1f214c7f93ce49ee4e3b587969"}
+    else:
+        _compile_hash = {"50186822915909303e813205db80e032"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
@@ -141,7 +150,10 @@ def test_resnet_50_int8():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    _compile_hash = {"9245965b2c01e7f3d9b478e38a186eb4", "4225fa951c145bb1e48e28cad6a3bdd4"}
+    if ethosn_api_version() == LooseVersion("3.1.0"):
+        _compile_hash = {"12d65aec33594c88b6d0d31dcd5144e6", "6a64d69ccb36dfb6b30dd2abdba4b005"}
+    else:
+        _compile_hash = {"9245965b2c01e7f3d9b478e38a186eb4", "4225fa951c145bb1e48e28cad6a3bdd4"}
     _test_image_network(
         model_url="https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/"
         "models/Quantized/resnet_50_quantized.tflite",
@@ -162,7 +174,10 @@ def test_inception_v3():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    _compile_hash = {"a5a2b5d2b618de754bf9a01033a020c0"}
+    if ethosn_api_version() == LooseVersion("3.1.0"):
+        _compile_hash = {"cff892eb15944756f22dad4b83c756d2"}
+    else:
+        _compile_hash = {"a5a2b5d2b618de754bf9a01033a020c0"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/tflite_11_05_08/inception_v3_quant.tgz",
@@ -183,7 +198,10 @@ def test_inception_v4():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    _compile_hash = {"61b4ade41898d7cb2451dbdc3340aced"}
+    if ethosn_api_version() == LooseVersion("3.1.0"):
+        _compile_hash = {"2eeae331898f8e94c74868e190077837"}
+    else:
+        _compile_hash = {"61b4ade41898d7cb2451dbdc3340aced"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/inception_v4_299_quant_20181026.tgz",
@@ -204,7 +222,10 @@ def test_ssd_mobilenet_v1():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    _compile_hash = {"789906c7d8ac787809b303d82781fc9d", "6b699f94795785d31b39940a5cf84a81"}
+    if ethosn_api_version() == LooseVersion("3.1.0"):
+        _compile_hash = {"ec2b78852192058f88b64d45c26620d5", "f68cbeaaba03874ea735ce3f5eab9227"}
+    else:
+        _compile_hash = {"789906c7d8ac787809b303d82781fc9d", "6b699f94795785d31b39940a5cf84a81"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip",
diff --git a/tests/python/contrib/test_ethosn/test_resize.py b/tests/python/contrib/test_ethosn/test_resize.py
index b437ad1e545c..30b29fb1612e 100644
--- a/tests/python/contrib/test_ethosn/test_resize.py
+++ b/tests/python/contrib/test_ethosn/test_resize.py
@@ -108,19 +108,10 @@ def test_resize(dtype, shape, size, coordinate_transformation_mode, rounding_met
             (20, 30),
             "Requested width isn't supported",
         ),
-        (
-            (19, 20),
-            "Requested width and height must be both even or both odd",
-        ),
-        (
-            (20, 19),
-            "Requested width and height must be both even or both odd",
-        ),
     ],
 )
 def test_resize_failure(size, err_msg):
     """Check Resize error messages."""
-
     dtype = "int8"
     zp_min = np.iinfo(dtype).min
 
diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py
index afbc45a0805d..a6155065a54c 100644
--- a/tests/python/contrib/test_ethosn/test_split.py
+++ b/tests/python/contrib/test_ethosn/test_split.py
@@ -17,12 +17,15 @@
 
 """Split tests for Arm(R) Ethos(TM)-N"""
 
+from distutils.version import LooseVersion
+
 import numpy as np
 import pytest
 
 import tvm
 from tvm import relay
 from tvm.testing import requires_ethosn
+from tvm.relay.op.contrib.ethosn import ethosn_api_version
 
 from . import infrastructure as tei
 
@@ -33,7 +36,6 @@ def _get_model(shape, dtype, splits, axis):
     return split.astuple()
 
 
-@pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.")
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 @pytest.mark.parametrize(
@@ -45,6 +47,11 @@ def _get_model(shape, dtype, splits, axis):
 )
 def test_split(dtype, shape, splits, axis):
     """Compare Split output with TVM."""
+    if ethosn_api_version() == LooseVersion("3.0.1"):
+        pytest.skip(
+            "Split is not supported by the 3.0.1 version of the driver stack.",
+        )
+
     np.random.seed(0)
 
     outputs = []
@@ -62,7 +69,6 @@ def test_split(dtype, shape, splits, axis):
         tei.verify(outputs, dtype, 0)
 
 
-@pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.")
 @requires_ethosn
 @pytest.mark.parametrize(
     "shape,dtype,splits,axis,err_msg",
@@ -83,6 +89,11 @@ def test_split(dtype, shape, splits, axis):
 )
 def test_split_failure(shape, dtype, splits, axis, err_msg):
     """Check Split error messages."""
+    if ethosn_api_version() == LooseVersion("3.0.1"):
+        pytest.skip(
+            "Split is not supported by the 3.0.1 version of the driver stack.",
+        )
+
     model = _get_model(shape, dtype, splits, axis)
     mod = tei.make_ethosn_partition(model)
     tei.test_error(mod, {}, err_msg)
diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py
index dc6a2ed086d4..47a01154d0b2 100644
--- a/tests/python/contrib/test_ethosn/test_topologies.py
+++ b/tests/python/contrib/test_ethosn/test_topologies.py
@@ -17,13 +17,15 @@
 
 """Arm(R) Ethos(TM)-N tests for complex network topologies."""
 
+from distutils.version import LooseVersion
+
 import numpy as np
 import pytest
 
 import tvm
 from tvm import relay
 from tvm.testing import requires_ethosn
-from tvm.relay.op.contrib.ethosn import Available, ethosn_available
+from tvm.relay.op.contrib.ethosn import Available, ethosn_available, ethosn_api_version
 
 from . import infrastructure as tei
 
@@ -78,8 +80,8 @@ def get_model(input_shape, dtype, var_names):
         model = get_model(inputs["a"].shape, dtype, iter(inputs))
         mod = tei.make_module(model, [])
 
-        expected_host_ops = 1
-        npu_partitions = 2
+        expected_host_ops = 1 if ethosn_api_version() == LooseVersion("3.0.1") else 0
+        npu_partitions = 2 if ethosn_api_version() == LooseVersion("3.0.1") else 1
 
         # Mock inference is only supported when the whole graph is offloaded to the NPU
         if ethosn_available() == Available.SW_ONLY:
@@ -280,8 +282,8 @@ def get_model(shape, dtype, splits, axis):
         model = get_model(shape, dtype, splits, axis)
         mod = tei.make_module(model, {})
 
-        expected_host_ops = 1
-        npu_partitions = 2
+        expected_host_ops = 1 if ethosn_api_version() == LooseVersion("3.0.1") else 0
+        npu_partitions = 2 if ethosn_api_version() == LooseVersion("3.0.1") else 1
 
         # Mock inference is only supported when the whole graph is offloaded to the NPU
         if ethosn_available() == Available.SW_ONLY:
@@ -309,13 +311,17 @@ def get_model(shape, dtype, splits, axis):
         tei.verify(outputs, dtype, 0)
 
 
-@pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.")
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 def test_output_tuple_propagation(dtype):
     """This tests the case where the output tuple must be inferred
     as having dummy tensor information."""
 
+    if ethosn_api_version() == LooseVersion("3.0.1"):
+        pytest.skip(
+            "Split is not supported by the 3.0.1 version of the driver stack.",
+        )
+
     def get_model(dtype):
         a = relay.var("a", shape=(1, 4, 4, 16), dtype=dtype)
         split = relay.op.split(a, indices_or_sections=4, axis=2)

From 286fadecb8d536940b89669e699d757399dad755 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Mon, 12 Sep 2022 00:01:10 -0700
Subject: [PATCH 147/704] [TF] Add Bincount support (#12751)

---
 python/tvm/relay/frontend/tensorflow_ops.py   | 41 ++++++++++++-
 .../frontend/tensorflow/test_forward.py       | 35 +++++++++++
 .../tensorflow2/test_functional_models.py     | 60 +++++++++++++++++++
 3 files changed, 135 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py
index 4598f4f09a05..66bb858edbf0 100644
--- a/python/tvm/relay/frontend/tensorflow_ops.py
+++ b/python/tvm/relay/frontend/tensorflow_ops.py
@@ -2868,11 +2868,49 @@ def _impl(inputs, attr, params, mod):
     return _impl
 
 
+def _bincount():
+    def _impl(inputs, attr, params, mod):
+        input = inputs[0]  # arr: int32 Tensor
+        size = inputs[1]  # size: non-negative int scalar Tensor
+        # weights: int32, int64, float32, or float64 Tensor with the same shape as arr
+        # or a length-0 Tensor, in which case it acts as all weights equal to 1.
+        weights = inputs[2]
+        # Returns: Output: 1D Tensor with length equal to size
+        # The counts or summed weights for each value in the range [0, size).
+
+        input_shape = _infer_shape(input, mod)
+        if len(input_shape) > 1:
+            input = _op.reshape(input, [-1])
+
+        is_weights_zero_tensor = True
+        if weights:
+            weights_shape = _infer_shape(weights, mod)
+            is_weights_zero_tensor = weights_shape == (0,)
+            if len(weights_shape) > 1:
+                weights = _op.reshape(weights, [-1])
+
+        # Output should have the same dtype as weights.
+        if is_weights_zero_tensor:
+            # if weights are length-0 Tensor - output dtype is float32
+            out_dtype = "float32"
+            updates = _op.cast(_op.ones_like(input), out_dtype)
+        else:
+            out_dtype = _infer_type(weights, mod).checked_type.dtype
+            updates = weights
+
+        counts_shape = _op.reshape(size, [1])
+        counts = _op.zeros(counts_shape, out_dtype)
+        out = _op.scatter_add(counts, input, updates, axis=0)
+        return out
+
+    return _impl
+
+
 def _dense_bincount():
     def _impl(inputs, attr, params, mod):
         input = inputs[0]  # input: int32, int64. 1D or 2D int Tensor
         size = inputs[1]  # size: non-negative int scalar Tensor
-        # weights: int32, int64, float32, or float64 Tensor with the same shape as arr
+        # weights: int32, int64, float32, or float64 Tensor with the same shape as input
         # or a length-0 Tensor, in which case it acts as all weights equal to 1.
         weights = inputs[2]
         # Returns: Output: 1D Tensor with length equal to size
@@ -2951,6 +2989,7 @@ def _impl(inputs, attr, params, mod):
     "BatchNormWithGlobalNormalization": _batch_norm(),
     "BatchToSpaceND": _batch_to_space_nd(),
     "BiasAdd": _bias_add(),
+    "Bincount": _bincount(),
     "BroadcastTo": _broadcast_to(),
     "BroadcastArgs": _broadcast_args(),
     "Cast": _cast(),
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index ebeb35e08f5d..8ed6d9108e5d 100755
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -5758,6 +5758,41 @@ def test_invert_permutation():
         compare_tf_with_tvm(x, "Placeholder:0", out_name, no_gpu=False)
 
 
+#######################################################################
+# Bincount
+# ----
+
+
+def _test_bincount(in_shape, size, weights):
+    with tf.Graph().as_default():
+        inputs = []
+        data = []
+        inputs.append(tf.placeholder(shape=in_shape, dtype="int32", name="input0"))
+        data.append(np.random.uniform(0, size, size=in_shape).astype("int32"))
+        inputs.append(tf.placeholder(shape=(), dtype="int32", name="size"))
+        data.append(np.array(size, "int32"))
+        if weights:
+            inputs.append(tf.placeholder(shape=in_shape, dtype="float32", name="weights"))
+            data.append(np.reshape(weights, in_shape).astype("float32"))
+        else:
+            inputs.append(tf.placeholder(shape=(0,), dtype="float32", name="weights"))
+            data.append(np.array([], "float32"))
+        result = tf.raw_ops.Bincount(arr=data[0], size=data[1], weights=data[2])
+        compare_tf_with_tvm(data, [a.name for a in inputs], result.name, mode="vm")
+
+
+def test_forward_bincount():
+    """Test Bincount Op"""
+    # 2D input
+    _test_bincount((3, 10), 20, [1.0] * 30)
+    _test_bincount((3, 10), 20, [1.5] * 30)
+    _test_bincount((3, 10), 20, None)
+    # 1D input
+    _test_bincount((10,), 20, [1.0] * 10)
+    _test_bincount((10,), 20, [1.5] * 10)
+    _test_bincount((10,), 20, None)
+
+
 #######################################################################
 # DenseBincount
 # ----
diff --git a/tests/python/frontend/tensorflow2/test_functional_models.py b/tests/python/frontend/tensorflow2/test_functional_models.py
index 001ba6de1967..42ad5b29af79 100644
--- a/tests/python/frontend/tensorflow2/test_functional_models.py
+++ b/tests/python/frontend/tensorflow2/test_functional_models.py
@@ -585,5 +585,65 @@ def func(self, x):
     run_test((-1, -1))
 
 
+def test_bincount_1d():
+    def run_test(weights, minlength, maxlength, axis, binary_output):
+        class Bincount1D(tf.Module):
+            def get_input(self):
+                return np.random.uniform(low=0, high=maxlength, size=(100,)).astype("int32")
+
+            @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.int32)])
+            def func(self, x):
+                return tf.math.bincount(
+                    x,
+                    weights=weights,
+                    minlength=minlength,
+                    maxlength=maxlength,
+                    axis=axis,
+                    binary_output=binary_output,
+                )
+
+        run_model_graph(Bincount1D)
+        run_func_graph(Bincount1D, runtime="vm")
+
+    for axis in [None, 0, -1]:
+        run_test(weights=None, minlength=20, maxlength=20, axis=axis, binary_output=False)
+        run_test(weights=None, minlength=20, maxlength=20, axis=axis, binary_output=True)
+
+    # weights and axis=None need operator UnsortedSegmentSum to be implemented. Skip axis=None
+    weights = np.random.uniform(low=0.2, high=5, size=(100,)).astype("float32")
+    for axis in [0, -1]:
+        run_test(weights=weights, minlength=20, maxlength=20, axis=axis, binary_output=False)
+
+
+def test_bincount_2d():
+    def run_test(weights, minlength, maxlength, axis, binary_output):
+        class Bincount2D(tf.Module):
+            def get_input(self):
+                return np.random.uniform(low=0, high=maxlength, size=(3, 100)).astype("int32")
+
+            @tf.function(input_signature=[tf.TensorSpec([None, None], tf.int32)])
+            def func(self, x):
+                return tf.math.bincount(
+                    x,
+                    weights=weights,
+                    minlength=minlength,
+                    maxlength=maxlength,
+                    axis=axis,
+                    binary_output=binary_output,
+                )
+
+        run_model_graph(Bincount2D)
+        run_func_graph(Bincount2D, runtime="vm")
+
+    for axis in [None, 0, -1]:
+        run_test(weights=None, minlength=20, maxlength=20, axis=axis, binary_output=False)
+        run_test(weights=None, minlength=20, maxlength=20, axis=axis, binary_output=True)
+
+    # weights and axis=None need operator UnsortedSegmentSum to be implemented. Skip axis=None
+    weights = np.random.uniform(low=0.2, high=5, size=(3, 100)).astype("float32")
+    for axis in [0, -1]:
+        run_test(weights=weights, minlength=20, maxlength=20, axis=axis, binary_output=False)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From 4c863fc115ee463284f20b5ee37c973ac0ed5d9a Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Mon, 12 Sep 2022 01:00:29 -0700
Subject: [PATCH 148/704] [TVMScript] Base IRBuilder methods for `Block`
 (#12748)

This PR introduces base IRBuilder methods for `Block`.

Co-authored-by: yongwww <yongcale@gmail.com>
---
 include/tvm/script/ir_builder/tir/frame.h     | 70 +++++++++++++++++++
 include/tvm/script/ir_builder/tir/ir.h        |  8 +++
 python/tvm/script/ir_builder/tir/frame.py     |  5 ++
 python/tvm/script/ir_builder/tir/ir.py        | 20 ++++++
 src/script/ir_builder/tir/frame.cc            | 24 +++++++
 src/script/ir_builder/tir/ir.cc               | 17 +++++
 src/script/ir_builder/tir/utils.h             |  9 +++
 .../unittest/test_tvmscript_ir_builder_tir.py | 27 +++++++
 8 files changed, 180 insertions(+)

diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h
index 4bfd022af27a..15ab77863e5e 100644
--- a/include/tvm/script/ir_builder/tir/frame.h
+++ b/include/tvm/script/ir_builder/tir/frame.h
@@ -117,6 +117,76 @@ class PrimFuncFrame : public TIRFrame {
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(PrimFuncFrame, TIRFrame, PrimFuncFrameNode);
 };
 
+/*!
+ * \brief A frame that represents the block.
+ *
+ * \sa BlockFrame
+ */
+class BlockFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The name of the block. */
+  String name;
+  /*! \brief The variables of the block. */
+  Array<tvm::tir::IterVar> iter_vars;
+  /*! \brief The read buffer regions of the block. */
+  Optional<Array<tvm::tir::BufferRegion>> reads;
+  /*! \brief The write buffer regions of the block. */
+  Optional<Array<tvm::tir::BufferRegion>> writes;
+  /*! \brief The init statement of the bolck. */
+  Optional<tvm::tir::Stmt> init;
+  /*! \brief The buffer allocated in the block. */
+  Array<tvm::tir::Buffer> alloc_buffers;
+  /*! \brief The match buffer regions. */
+  Array<tvm::tir::MatchBufferRegion> match_buffers;
+  /*! \brief The annotation of the block. */
+  Optional<Map<String, ObjectRef>> annotations;
+  /*! \brief The corresponding values of the iter vars. */
+  Array<PrimExpr> iter_values;
+  /*!
+   * \brief The predicate of the block realization, the block will only be executed when the
+   * predicate is true.
+   */
+  Optional<PrimExpr> predicate;
+  /*! \brief The flag whether to construct BlockRealize or Block. */
+  bool no_realize;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("name", &name);
+    v->Visit("iter_vars", &iter_vars);
+    v->Visit("reads", &reads);
+    v->Visit("writes", &writes);
+    v->Visit("init", &init);
+    v->Visit("alloc_buffers", &alloc_buffers);
+    v->Visit("match_buffers", &match_buffers);
+    v->Visit("annotations", &annotations);
+    v->Visit("iter_values", &iter_values);
+    v->Visit("predicate", &predicate);
+    v->Visit("no_realize", &no_realize);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.BlockFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(BlockFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to BlockFrameNode.
+ *
+ * \sa BlockFrameNode
+ */
+
+class BlockFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(BlockFrame, TIRFrame, BlockFrameNode);
+};
+
 /*!
  * \brief A frame that represents the assert statement. Proceeds if the condition is true,
  * otherwise aborts with the message.
diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
index cee60ad4f827..615ce90383dd 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -34,6 +34,14 @@ namespace tir {
  */
 PrimFuncFrame PrimFunc();
 
+/*!
+ * \brief The block declaration statement.
+ * \param name The name of the block.
+ * \param no_realize The flag whether to construct BlockRealize or Block.
+ * \return The BlockFrame.
+ */
+BlockFrame Block(String name, bool no_realize = false);
+
 /*!
  * \brief Evaluate the input expression.
  * \param value The input expression to evaluate.
diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py
index 61418e0b2aa6..0e7eb2bb4720 100644
--- a/python/tvm/script/ir_builder/tir/frame.py
+++ b/python/tvm/script/ir_builder/tir/frame.py
@@ -29,3 +29,8 @@ class TIRFrame(IRBuilderFrame):
 @_register_object("script.ir_builder.tir.PrimFuncFrame")
 class PrimFuncFrame(TIRFrame):
     ...
+
+
+@_register_object("script.ir_builder.tir.BlockFrame")
+class BlockFrame(TIRFrame):
+    ...
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index ae5d5b260f65..7ba2f6df9418 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -33,6 +33,25 @@ def prim_func() -> frame.PrimFuncFrame:
     return _ffi_api.PrimFunc()  # pylint: disable=no-member # type: ignore
 
 
+def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame:
+    """The block declaration statement.
+
+    Parameters
+    ----------
+    name : str
+        The name of the block.
+
+    no_realize : bool
+        The flag whether to construct BlockRealize or Block.
+
+    Returns
+    -------
+    res : frame.BlockFrame
+        The BlockFrame.
+    """
+    return _ffi_api.Block(name, no_realize)  # pylint: disable=no-member # type: ignore
+
+
 def evaluate(value: PrimExpr) -> None:
     """Evaluate the input expression.
 
@@ -50,6 +69,7 @@ def evaluate(value: PrimExpr) -> None:
 
 
 __all__ = [
+    "block",
     "evaluate",
     "prim_func",
 ]
diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc
index 139c8193b0ba..dd3097e388b7 100644
--- a/src/script/ir_builder/tir/frame.cc
+++ b/src/script/ir_builder/tir/frame.cc
@@ -50,8 +50,32 @@ void PrimFuncFrameNode::ExitWithScope() {
   }
 }
 
+void BlockFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  Array<tvm::tir::Buffer> tir_alloc_buffers;
+  for (const tvm::tir::Buffer& buffer : alloc_buffers) {
+    tir_alloc_buffers.push_back(buffer);
+  }
+  Map<String, ObjectRef> attrs = annotations.value_or({});
+  if (int detect_access = (!reads.defined()) | (!writes.defined() << 1)) {
+    attrs.Set("tir.script_parsing_detect_access", tvm::IntImm(DataType::Int(64), detect_access));
+  }
+  tvm::tir::Block block(iter_vars, reads.value_or(Array<tvm::tir::BufferRegion>()),
+                        writes.value_or(Array<tvm::tir::BufferRegion>()), name, AsStmt(stmts), init,
+                        tir_alloc_buffers, match_buffers, attrs);
+  if (no_realize) {
+    CHECK(iter_values.empty())
+        << "ValueError: Block bindings are not allowed when `no_realize=True`";
+    CHECK(!predicate.defined()) << "ValueError: `T.where` is not allowed when `no_realize=True`";
+    AddToParent(block);
+  } else {
+    AddToParent(tvm::tir::BlockRealize(iter_values, predicate.value_or(Bool(true)), block));
+  }
+}
+
 TVM_REGISTER_NODE_TYPE(TIRFrameNode);
 TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode);
+TVM_REGISTER_NODE_TYPE(BlockFrameNode);
 
 }  // namespace tir
 }  // namespace ir_builder
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index 5f994d71ca0a..4c2679ae6b56 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -41,8 +41,25 @@ PrimFuncFrame PrimFunc() {
   return PrimFuncFrame(n);
 }
 
+BlockFrame Block(String name, bool no_realize) {
+  ObjectPtr<BlockFrameNode> n = make_object<BlockFrameNode>();
+  n->name = name;
+  n->iter_vars.clear();
+  n->reads = NullOpt;
+  n->writes = NullOpt;
+  n->init = NullOpt;
+  n->alloc_buffers.clear();
+  n->match_buffers.clear();
+  n->annotations = NullOpt;
+  n->iter_values.clear();
+  n->predicate = NullOpt;
+  n->no_realize = no_realize;
+  return BlockFrame(n);
+}
+
 void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); }
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.PrimFunc").set_body_typed(PrimFunc);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate);
 }  // namespace tir
 }  // namespace ir_builder
diff --git a/src/script/ir_builder/tir/utils.h b/src/script/ir_builder/tir/utils.h
index 47557917cca5..4f8b3f77c6e1 100644
--- a/src/script/ir_builder/tir/utils.h
+++ b/src/script/ir_builder/tir/utils.h
@@ -60,6 +60,15 @@ inline PrimFuncFrame FindPrimFuncFrame(const String& method) {
   throw;
 }
 
+inline BlockFrame FindBlockFrame(const String& method) {
+  if (Optional<BlockFrame> frame = IRBuilder::Current()->GetLastFrame<BlockFrame>()) {
+    return frame.value();
+  }
+  LOG(FATAL) << "ValueError: Block frame not find. Please ensure '" << method
+             << "' is called under T.block()";
+  throw;
+}
+
 }  // namespace tir
 }  // namespace ir_builder
 }  // namespace script
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index 70a8f3565d03..85080c7c65fc 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -45,5 +45,32 @@ def test_ir_builder_tir_primfunc():
     assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True)
 
 
+def test_ir_builder_tir_block():
+    with IRBuilder() as ib:
+        with T.block("block"):
+            T.evaluate(0)
+    # the block generated by IRBuilder
+    block_realize_actual = ib.get()
+
+    # the expected block
+    block_expected = tir.Block(
+        iter_vars=[],
+        reads=[],
+        writes=[],
+        name_hint="block",
+        body=tir.Evaluate(0),
+        alloc_buffers=None,
+        match_buffers=None,
+        annotations={"tir.script_parsing_detect_access": tir.IntImm("int64", 3)},
+    )
+    block_realize_expected = tir.BlockRealize(
+        iter_values=[],
+        predicate=True,
+        block=block_expected,
+    )
+    # Check if the generated ir is expected
+    assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From a63d03a116e6b8a3a80b96a90519a96ca63e16b9 Mon Sep 17 00:00:00 2001
From: wrongtest <wrongtest0@gmail.com>
Date: Mon, 12 Sep 2022 16:07:00 +0800
Subject: [PATCH 149/704] [MetaSchedule] Fix typo of compare between GlobalVar
 and str (#12704)

fix typo of compare between GlobalVar and str
---
 python/tvm/meta_schedule/default_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py
index 652f09261b2f..ac4028ec50f8 100644
--- a/python/tvm/meta_schedule/default_config.py
+++ b/python/tvm/meta_schedule/default_config.py
@@ -53,7 +53,7 @@ def mod(mod: Union[PrimFunc, IRModule]) -> IRModule:  # pylint: disable=redefine
         raise TypeError(f"Expected `mod` to be PrimFunc or IRModule, but gets: {mod}")
     func_names = mod.get_global_vars()
     (func_name,) = func_names
-    if len(func_names) == 1 and func_name != "main":
+    if len(func_names) == 1 and func_name.name_hint != "main":
         mod = IRModule({"main": mod[func_name]})
     return mod
 

From a047e0228a3f7015e56c6756cdadb13444008623 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Mon, 12 Sep 2022 12:23:44 -0400
Subject: [PATCH 150/704] [CI] Always install into a python venv in ci
 containers (#12663)

This PR changes all ci_ to install TVM Python dependencies in a
virtualenv separate from the system Python dependencies.

 Sets the stage for adding the poetry-based dependency
generator to the CI container build process.

* Always install into a python venv in ci containers.
* Respect Dockerfile ENV PATH modifications in
docker/bash.sh lookups.
---
 docker/Dockerfile.ci_arm                      |  11 +-
 docker/Dockerfile.ci_cortexm                  |  15 +-
 docker/Dockerfile.ci_cpu                      |  11 +-
 docker/Dockerfile.ci_gpu                      |  12 +-
 docker/Dockerfile.ci_hexagon                  |  11 +-
 docker/Dockerfile.ci_i386                     |  11 +-
 docker/Dockerfile.ci_lint                     |  11 +-
 docker/Dockerfile.ci_minimal                  |  11 +-
 docker/Dockerfile.ci_riscv                    |  15 +-
 docker/Dockerfile.ci_wasm                     |  11 +-
 docker/install/ubuntu1804_install_python.sh   |  45 ----
 .../install/ubuntu1804_install_python_venv.sh |  30 ---
 docker/install/ubuntu_install_python.sh       |  79 ++++--
 docker/python/bootstrap-requirements.txt      |  82 ++++++
 docker/python/bootstrap/.gitignore            |   1 +
 docker/python/bootstrap/generate.sh           | 100 +++++++
 .../bootstrap/lockfiles/constraints-3.7.txt   | 254 ++++++++++++++++++
 .../bootstrap/lockfiles/constraints-3.8.txt   | 251 +++++++++++++++++
 .../bootstrap/lockfiles/requirements-3.7.txt  |   3 +
 .../bootstrap/lockfiles/requirements-3.8.txt  |   3 +
 docker/python/ci-constraints.txt              |  39 +++
 docker/with_the_same_user                     |  26 +-
 22 files changed, 875 insertions(+), 157 deletions(-)
 delete mode 100755 docker/install/ubuntu1804_install_python.sh
 delete mode 100755 docker/install/ubuntu1804_install_python_venv.sh
 create mode 100644 docker/python/bootstrap-requirements.txt
 create mode 100644 docker/python/bootstrap/.gitignore
 create mode 100755 docker/python/bootstrap/generate.sh
 create mode 100644 docker/python/bootstrap/lockfiles/constraints-3.7.txt
 create mode 100644 docker/python/bootstrap/lockfiles/constraints-3.8.txt
 create mode 100644 docker/python/bootstrap/lockfiles/requirements-3.7.txt
 create mode 100644 docker/python/bootstrap/lockfiles/requirements-3.8.txt
 create mode 100644 docker/python/ci-constraints.txt

diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm
index 90fbef4d441a..932687f1e568 100644
--- a/docker/Dockerfile.ci_arm
+++ b/docker/Dockerfile.ci_arm
@@ -47,11 +47,12 @@ ENV PATH /opt/sccache:$PATH
 COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
 RUN bash /install/ubuntu_install_llvm.sh
 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
-RUN bash /install/ubuntu1804_install_python.sh
-
-# Globally disable pip cache
-RUN pip config set global.no-cache-dir false
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
 RUN bash /install/ubuntu_install_cmake_source.sh
diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm
index d646704bb0a8..6ca2f2f40b75 100644
--- a/docker/Dockerfile.ci_cortexm
+++ b/docker/Dockerfile.ci_cortexm
@@ -29,18 +29,15 @@ RUN bash /install/ubuntu_install_core.sh
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
-RUN bash /install/ubuntu1804_install_python.sh
-
 COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
 RUN bash /install/ubuntu_install_cmake_source.sh 3.20.0
 
-COPY install/ubuntu1804_install_python_venv.sh /install/ubuntu1804_install_python_venv.sh
-RUN bash /install/ubuntu1804_install_python_venv.sh
-ENV PATH=/opt/tvm-venv/bin:/opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH
-
-# Globally disable pip cache
-RUN pip config set global.no-cache-dir false
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index d9f353d41be1..00fd9a4fcab3 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -28,11 +28,12 @@ RUN bash /install/ubuntu_install_core.sh
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
-RUN bash /install/ubuntu1804_install_python.sh
-
-# Globally disable pip cache
-RUN pip config set global.no-cache-dir false
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh
diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index 6f02ab97c09e..4b729a5f516e 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -38,13 +38,15 @@ RUN bash /install/ubuntu_install_cmake_source.sh
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
-RUN bash /install/ubuntu1804_install_python.sh
-
-# Globally disable pip cache
-RUN pip config set global.no-cache-dir false
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+# Globally disable pip cache
 RUN bash /install/ubuntu_install_cmake_source.sh
 
 COPY install/ubuntu1804_install_llvm.sh /install/ubuntu1804_install_llvm.sh
diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon
index 66b78ae0800c..d2ed29278488 100644
--- a/docker/Dockerfile.ci_hexagon
+++ b/docker/Dockerfile.ci_hexagon
@@ -31,11 +31,12 @@ RUN bash /install/ubuntu_install_core.sh
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
-COPY install/ubuntu2004_install_python.sh /install/ubuntu2004_install_python.sh
-RUN bash /install/ubuntu2004_install_python.sh
-
-# Globally disable pip cache
-RUN pip config set global.cache-dir false
+ENV TVM_VENV /venv/apache-tvm-py3.8
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 # Rust env (build early; takes a while)
 COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
diff --git a/docker/Dockerfile.ci_i386 b/docker/Dockerfile.ci_i386
index 0b6d8d28c4d7..dc767ff6def1 100644
--- a/docker/Dockerfile.ci_i386
+++ b/docker/Dockerfile.ci_i386
@@ -35,18 +35,19 @@ RUN bash /install/ubuntu_install_googletest.sh
 COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
 RUN bash /install/ubuntu_install_llvm.sh
 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
-RUN bash /install/ubuntu1804_install_python.sh
-
 # Rust env (build early; takes a while)
 COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
 RUN bash /install/ubuntu_install_rust.sh
 ENV RUSTUP_HOME /opt/rust
 ENV CARGO_HOME /opt/rust
 ENV PATH $PATH:$CARGO_HOME/bin
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
-# Globally disable pip cache
-RUN pip config set global.no-cache-dir false
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
 
 COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
 RUN bash /install/ubuntu_install_cmake_source.sh
diff --git a/docker/Dockerfile.ci_lint b/docker/Dockerfile.ci_lint
index 437ea71bd4be..860a43fa2194 100644
--- a/docker/Dockerfile.ci_lint
+++ b/docker/Dockerfile.ci_lint
@@ -26,11 +26,12 @@ RUN apt-get update --fix-missing
 
 RUN apt-install-and-clear -y wget git sudo make parallel
 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
-RUN bash /install/ubuntu1804_install_python.sh
-
-# Globally disable pip cache
-RUN pip config set global.no-cache-dir false
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 RUN apt-get update && apt-install-and-clear -y doxygen graphviz curl shellcheck
 
diff --git a/docker/Dockerfile.ci_minimal b/docker/Dockerfile.ci_minimal
index cf548989eba2..974f3eea11d6 100644
--- a/docker/Dockerfile.ci_minimal
+++ b/docker/Dockerfile.ci_minimal
@@ -28,11 +28,12 @@ RUN bash /install/ubuntu_install_core.sh
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
-RUN bash /install/ubuntu1804_install_python.sh
-
-# Globally disable pip cache
-RUN pip config set global.no-cache-dir false
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh
diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv
index 1ca792e20c98..9b956d55ddaa 100644
--- a/docker/Dockerfile.ci_riscv
+++ b/docker/Dockerfile.ci_riscv
@@ -29,18 +29,15 @@ RUN bash /install/ubuntu_install_core.sh
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
-RUN bash /install/ubuntu1804_install_python.sh
-
 COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
 RUN bash /install/ubuntu_install_cmake_source.sh
 
-COPY install/ubuntu1804_install_python_venv.sh /install/ubuntu1804_install_python_venv.sh
-RUN bash /install/ubuntu1804_install_python_venv.sh
-ENV PATH=/opt/tvm-venv/bin:/opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH
-
-# Globally disable pip cache
-RUN pip config set global.no-cache-dir false
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh
diff --git a/docker/Dockerfile.ci_wasm b/docker/Dockerfile.ci_wasm
index 49435b4f3d47..17230312f041 100644
--- a/docker/Dockerfile.ci_wasm
+++ b/docker/Dockerfile.ci_wasm
@@ -26,11 +26,12 @@ RUN bash /install/ubuntu_install_core.sh
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
-RUN bash /install/ubuntu1804_install_python.sh
-
-# Globally disable pip cache
-RUN pip config set global.no-cache-dir false
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh
diff --git a/docker/install/ubuntu1804_install_python.sh b/docker/install/ubuntu1804_install_python.sh
deleted file mode 100755
index 2cdddbd451a6..000000000000
--- a/docker/install/ubuntu1804_install_python.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-
-cleanup() {
-  rm -rf base-requirements.txt
-}
-
-trap cleanup 0
-
-
-# Install python and pip. Don't modify this to add Python package dependencies,
-# instead modify install_python_package.sh
-apt-get update
-apt-install-and-clear -y software-properties-common python3.7 python3.7-dev python3-pip
-update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
-
-# Pin pip and setuptools versions
-# Hashes generated via:
-#   $ pip download <package>==<version>
-#   $ pip hash --algorithm sha512 <package>.whl
-cat <<EOF > base-requirements.txt
-pip==19.3.1 --hash=sha256:6917c65fc3769ecdc61405d3dfd97afdedd75808d200b2838d7d961cebc0c2c7
-setuptools==58.4.0 --hash=sha256:e8b1d3127a0441fb99a130bcc3c2bf256c2d3ead3aba8fd400e5cbbaf788e036
-EOF
-pip3 install -r base-requirements.txt
diff --git a/docker/install/ubuntu1804_install_python_venv.sh b/docker/install/ubuntu1804_install_python_venv.sh
deleted file mode 100755
index 3f0fb3ee8971..000000000000
--- a/docker/install/ubuntu1804_install_python_venv.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-# install python and pip, don't modify this, modify install_python_package.sh
-apt-get update
-apt-install-and-clear -y software-properties-common python3.7-dev python3-setuptools python3.7-venv
-
-python3 -mvenv /opt/tvm-venv
-
-# Pin pip and setuptools versions
-/opt/tvm-venv/bin/pip3 install pip==19.3.1 setuptools==58.4.0
diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh
index ec50682c1454..66a80e1fdc52 100755
--- a/docker/install/ubuntu_install_python.sh
+++ b/docker/install/ubuntu_install_python.sh
@@ -18,28 +18,77 @@
 
 set -e
 set -u
-# Used for debugging RVM build
-set -x
 set -o pipefail
 
-# install python and pip, don't modify this, modify install_python_package.sh
+set -x
+
+if [ -z "${TVM_VENV+x}" ]; then
+    echo "ERROR: expect TVM_VENV env var to be set"
+    exit 2
+fi
+
 apt-get update
-apt-install-and-clear -y python-dev
 
-# python 3.6
+# Ensure lsb-release is installed.
+apt-install-and-clear -y \
+    lsb-core
+
+release=$(lsb_release -sc)
+if [ "${release}" == "bionic" ]; then
+    PYTHON_VERSION=3.7
+elif [ "${release}" == "focal" ]; then
+    PYTHON_VERSION=3.8
+else
+    echo "Don't know which version of python to install for lsb-release ${release}"
+    exit 2
+fi
+
+# Install python and pip. Don't modify this to add Python package dependencies,
+# instead modify install_python_package.sh
 apt-install-and-clear -y software-properties-common
+apt-install-and-clear -y \
+    acl \
+    python${PYTHON_VERSION} \
+    python${PYTHON_VERSION}-dev \
+    python3-pip \
+    python${PYTHON_VERSION}-venv
 
-add-apt-repository -y ppa:deadsnakes/ppa
-apt-get update
-apt-install-and-clear -y python-pip python-dev python3.6 python3.6-dev
+update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1
+
+# Allow disabling user site-packages, even with sudo; this makes it harder to repro CI failures
+# locally because it's hard to tell what might be in this directory.
+echo "Defaults env_keep += \"PYTHONNOUSERSITE\"" >/etc/sudoers.d/91-preserve-python-nousersite
+export PYTHONNOUSERSITE=1
+
+venv_dir="$(python3 -c "import os.path;print(os.path.dirname(\"${TVM_VENV}\"))")"
+mkdir -p "${venv_dir}"
+python3 -mvenv "${TVM_VENV}"
+. "${TVM_VENV}/bin/activate"
+
+# Update pip to match version used to produce requirements-hashed.txt. This step
+# is necessary so that pip's dependency solver is recent.
+pip_spec=$(cat /install/python/bootstrap/lockfiles/constraints-${PYTHON_VERSION}.txt | grep 'pip==')
+pip3 install -U --require-hashes -r <(echo "${pip_spec}") \
+     -c /install/python/bootstrap/lockfiles/constraints-${PYTHON_VERSION}.txt
 
-rm -f /usr/bin/python3 && ln -s /usr/bin/python3.6 /usr/bin/python3
+# Python configuration
+pip3 config set global.no-cache-dir true  # Never cache packages
 
-# python 3.7
-apt-install-and-clear -y python3.7
+# Now install the remaining base packages.
+pip3 install \
+     --require-hashes \
+     -r /install/python/bootstrap/lockfiles/constraints-${PYTHON_VERSION}.txt
 
-# Install pip
-wget -q https://bootstrap.pypa.io/get-pip.py && python3.7 get-pip.py
+addgroup tvm-venv
+chgrp -R tvm-venv "${TVM_VENV}"
+setfacl -R -d -m group:tvm-venv:rwx "${TVM_VENV}"
 
-# Pin pip and setuptools versions
-pip3 install pip==19.3.1 setuptools==58.4.0
+# Prevent further use of pip3 via the system.
+# There may be multiple (i.e. from python3-pip apt package and pip3 install -U).
+deactivate
+while [ "$(which pip3)" != "" ]; do
+    rm "$(which pip3)"
+done
+while [ "$(which pip)" != "" ]; do
+    rm "$(which pip)"
+done
diff --git a/docker/python/bootstrap-requirements.txt b/docker/python/bootstrap-requirements.txt
new file mode 100644
index 000000000000..5c036b8ed97d
--- /dev/null
+++ b/docker/python/bootstrap-requirements.txt
@@ -0,0 +1,82 @@
+CacheControl==0.12.11 \
+    --hash=sha256:2c75d6a8938cb1933c75c50184549ad42728a27e9f6b92fd677c3151aa72555b
+SecretStorage==3.3.2 \
+    --hash=sha256:755dc845b6ad76dcbcbc07ea3da75ae54bb1ea529eb72d15f83d26499a5df319
+cachy==0.3.0 \
+    --hash=sha256:338ca09c8860e76b275aff52374330efedc4d5a5e45dc1c5b539c1ead0786fe7
+certifi==2022.5.18.1 \
+    --hash=sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a
+cffi==1.15.0 \
+    --hash=sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997
+charset-normalizer==2.0.12 \
+    --hash=sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df
+cleo==0.8.1 \
+    --hash=sha256:141cda6dc94a92343be626bb87a0b6c86ae291dfc732a57bf04310d4b4201753
+clikit==0.6.2 \
+    --hash=sha256:71268e074e68082306e23d7369a7b99f824a0ef926e55ba2665e911f7208489e
+crashtest==0.3.1 \
+    --hash=sha256:300f4b0825f57688b47b6d70c6a31de33512eb2fa1ac614f780939aa0cf91680
+cryptography==37.0.2 \
+    --hash=sha256:0cc20f655157d4cfc7bada909dc5cc228211b075ba8407c46467f63597c78178
+distlib==0.3.4 \
+    --hash=sha256:6564fe0a8f51e734df6333d08b8b94d4ea8ee6b99b5ed50613f731fd4089f34b
+filelock==3.7.0 \
+    --hash=sha256:c7b5fdb219b398a5b28c8e4c1893ef5f98ece6a38c6ab2c22e26ec161556fed6
+html5lib==1.1 \
+    --hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d
+idna==3.3 \
+    --hash=sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff
+importlib-metadata==1.7.0 \
+    --hash=sha256:dc15b2969b4ce36305c51eebe62d418ac7791e9a157911d58bfb1f9ccd8e2070
+jeepney==0.8.0 \
+    --hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755
+keyring==22.3.0 \
+    --hash=sha256:2bc8363ebdd63886126a012057a85c8cb6e143877afa02619ac7dbc9f38a207b
+lockfile==0.12.2 \
+    --hash=sha256:6c3cb24f344923d30b2785d5ad75182c8ea7ac1b6171b08657258ec7429d50fa
+msgpack==1.0.3 \
+    --hash=sha256:9c0903bd93cbd34653dd63bbfcb99d7539c372795201f39d16fdfde4418de43a
+packaging==20.9 \
+    --hash=sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a
+pastel==0.2.1 \
+    --hash=sha256:4349225fcdf6c2bb34d483e523475de5bb04a5c10ef711263452cb37d7dd4364
+pexpect==4.8.0 \
+    --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937
+pip==22.1.1 \
+    --hash=sha256:e7bcf0b2cbdec2af84cc1b7b79b25fdbd7228fbdb61a4dca0b82810d0ba9d18b
+pkginfo==1.8.2 \
+    --hash=sha256:c24c487c6a7f72c66e816ab1796b96ac6c3d14d49338293d2141664330b55ffc
+platformdirs==2.5.2 \
+    --hash=sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788
+poetry==1.1.13 \
+    --hash=sha256:52deb0792a2e801967ba9c4cdb39b56fe68b0b5cd3f195b004bef603db9d51a7
+poetry-core==1.0.8 \
+    --hash=sha256:54b0fab6f7b313886e547a52f8bf52b8cf43e65b2633c65117f8755289061924
+ptyprocess==0.7.0 \
+    --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35
+pycparser==2.21 \
+    --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9
+pylev==1.4.0 \
+    --hash=sha256:7b2e2aa7b00e05bb3f7650eb506fc89f474f70493271a35c242d9a92188ad3dd
+pyparsing==3.0.9 \
+    --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc
+requests==2.27.1 \
+    --hash=sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d
+requests-toolbelt==0.9.1 \
+    --hash=sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f
+setuptools==62.3.2 \
+    --hash=sha256:68e45d17c9281ba25dc0104eadd2647172b3472d9e01f911efa57965e8d51a36
+shellingham==1.4.0 \
+    --hash=sha256:536b67a0697f2e4af32ab176c00a50ac2899c5a05e0d8e2dadac8e58888283f9
+six==1.16.0 \
+    --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254
+tomlkit==0.10.2 \
+    --hash=sha256:905cf92c2111ef80d355708f47ac24ad1b6fc2adc5107455940088c9bbecaedb
+urllib3==1.26.9 \
+    --hash=sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14
+virtualenv==20.14.1 \
+    --hash=sha256:e617f16e25b42eb4f6e74096b9c9e37713cf10bf30168fb4a739f3fa8f898a3a
+webencodings==0.5.1 \
+    --hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78
+zipp==3.8.0 \
+    --hash=sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099
diff --git a/docker/python/bootstrap/.gitignore b/docker/python/bootstrap/.gitignore
new file mode 100644
index 000000000000..3d2dbd4b6317
--- /dev/null
+++ b/docker/python/bootstrap/.gitignore
@@ -0,0 +1 @@
+/_venv
diff --git a/docker/python/bootstrap/generate.sh b/docker/python/bootstrap/generate.sh
new file mode 100755
index 000000000000..116b8d8daee0
--- /dev/null
+++ b/docker/python/bootstrap/generate.sh
@@ -0,0 +1,100 @@
+#!/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+set -x
+
+cd "$(dirname "$0")"
+
+rm -rf build lockfiles
+mkdir build
+mkdir lockfiles
+
+function lock() {
+    mkdir -p build/$1
+    cat >build/$1/pyproject.toml <<EOF
+# AUTOGENERATED DO NOT EDIT
+
+[tool.poetry]
+name = "apache-tvm-bootstrap"
+authors = []
+version = "0.0.1"
+description = ""
+
+[tool.poetry.dependencies]
+python = "^$1"
+pip = "*"
+poetry = "1.2.0b1"
+setuptools = "*"
+EOF
+
+    # Install poetry so that the env can be locked.
+    python3 -mvenv build/$1/_venv
+    pwd
+    . build/$1/_venv/bin/activate
+    (mkdir -p build/$1/downloaded && cd build/$1/downloaded && pip3 download pip setuptools && pip3 install *.whl)
+    pip3 install poetry
+    (cd build/$1 && poetry lock)
+
+    # Now export requirements.txt and constraints.txt for
+    # requirements.txt has to be generated by scanning pyproject.toml and translating the [tool.poetry.dependencies] section.
+    (cd build/$1 && python3 <<EOF )
+found_deps = False
+requirements = []
+for line in open("pyproject.toml"):
+    if line.startswith("[tool.poetry.dependencies]"):
+        found_deps = True
+        continue
+    if found_deps and "=" in line:
+        package = line.split("=", 1)[0].strip()
+        if package != "python":
+            requirements.append(package)
+
+with open("requirements.txt", "w") as f:
+    for r in sorted(requirements):
+        f.write(f"{r}\n")
+EOF
+
+    # For
+    (cd build/$1 && poetry export -o constraints.txt)
+
+
+    (cd build/$1 && python3 <<EOF )
+import os
+import pkginfo
+import subprocess
+
+with open("constraints.txt", "a") as constraints_f:
+    for f in sorted(os.scandir("downloaded"), key=lambda x: x.name):
+        if not f.is_file():
+            continue
+        p = pkginfo.get_metadata("downloaded/" + f.name)
+        constraints_f.write(
+            f"{p.name}=={p.version} {subprocess.check_output(['pip3', 'hash', '-a', 'sha256', p.filename], encoding='utf-8').split()[1]}\n")
+EOF
+
+    # Assemble the directory passed to docker
+    cp build/$1/requirements.txt lockfiles/requirements-$1.txt
+    cp build/$1/constraints.txt lockfiles/constraints-$1.txt
+    deactivate
+}
+
+lock 3.7
+lock 3.8
diff --git a/docker/python/bootstrap/lockfiles/constraints-3.7.txt b/docker/python/bootstrap/lockfiles/constraints-3.7.txt
new file mode 100644
index 000000000000..5b3a0a41e397
--- /dev/null
+++ b/docker/python/bootstrap/lockfiles/constraints-3.7.txt
@@ -0,0 +1,254 @@
+cachecontrol==0.12.11; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:2c75d6a8938cb1933c75c50184549ad42728a27e9f6b92fd677c3151aa72555b \
+    --hash=sha256:a5b9fcc986b184db101aa280b42ecdcdfc524892596f606858e0b7a8b4d9e144
+cachy==0.3.0; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.4.0" \
+    --hash=sha256:338ca09c8860e76b275aff52374330efedc4d5a5e45dc1c5b539c1ead0786fe7 \
+    --hash=sha256:186581f4ceb42a0bbe040c407da73c14092379b1e4c0e327fdb72ae4a9b269b1
+certifi==2022.6.15; python_version >= "3.7" and python_version < "4" \
+    --hash=sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412 \
+    --hash=sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d
+cffi==1.15.1; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \
+    --hash=sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2 \
+    --hash=sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2 \
+    --hash=sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914 \
+    --hash=sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3 \
+    --hash=sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e \
+    --hash=sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162 \
+    --hash=sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b \
+    --hash=sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21 \
+    --hash=sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185 \
+    --hash=sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd \
+    --hash=sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc \
+    --hash=sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f \
+    --hash=sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e \
+    --hash=sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4 \
+    --hash=sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01 \
+    --hash=sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e \
+    --hash=sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2 \
+    --hash=sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d \
+    --hash=sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac \
+    --hash=sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83 \
+    --hash=sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9 \
+    --hash=sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c \
+    --hash=sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325 \
+    --hash=sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c \
+    --hash=sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef \
+    --hash=sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8 \
+    --hash=sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d \
+    --hash=sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104 \
+    --hash=sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7 \
+    --hash=sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6 \
+    --hash=sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d \
+    --hash=sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a \
+    --hash=sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405 \
+    --hash=sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e \
+    --hash=sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf \
+    --hash=sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497 \
+    --hash=sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375 \
+    --hash=sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e \
+    --hash=sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82 \
+    --hash=sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b \
+    --hash=sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c \
+    --hash=sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426 \
+    --hash=sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9 \
+    --hash=sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045 \
+    --hash=sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3 \
+    --hash=sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a \
+    --hash=sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5 \
+    --hash=sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca \
+    --hash=sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02 \
+    --hash=sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192 \
+    --hash=sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314 \
+    --hash=sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5 \
+    --hash=sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585 \
+    --hash=sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0 \
+    --hash=sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415 \
+    --hash=sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d \
+    --hash=sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984 \
+    --hash=sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35 \
+    --hash=sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27 \
+    --hash=sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76 \
+    --hash=sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3 \
+    --hash=sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee \
+    --hash=sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c \
+    --hash=sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9
+charset-normalizer==2.1.0; python_version >= "3.7" and python_version < "4" and python_full_version >= "3.6.0" \
+    --hash=sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413 \
+    --hash=sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5
+cleo==1.0.0a5; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:ff53056589300976e960f75afb792dfbfc9c78dcbb5a448e207a17b643826360 \
+    --hash=sha256:097c9d0e0332fd53cc89fc11eb0a6ba0309e6a3933c08f7b38558555486925d3
+crashtest==0.3.1; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:300f4b0825f57688b47b6d70c6a31de33512eb2fa1ac614f780939aa0cf91680 \
+    --hash=sha256:42ca7b6ce88b6c7433e2ce47ea884e91ec93104a4b754998be498a8e6c3d37dd
+cryptography==37.0.3; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \
+    --hash=sha256:d10413d493e98075060d3e62e5826de372912ea653ccc948f3c41b21ddca087f \
+    --hash=sha256:cd64147ff16506632893ceb2569624b48c84daa3ba4d89695f7c7bc24188eee9 \
+    --hash=sha256:17c74f7d9e9e9bb7e84521243695c1b4bdc3a0e44ca764e6bcf8f05f3de3d0df \
+    --hash=sha256:0713bee6c8077786c56bdec9c5d3f099d40d2c862ff3200416f6862e9dd63156 \
+    --hash=sha256:b9c2008417741cdfbe945ef2d16b7b7ba0790886a0b49e1de533acf93eb66ed6 \
+    --hash=sha256:646905ff7a712e415bf0d0f214e0eb669dd2257c4d7a27db1e8baec5d2a1d55f \
+    --hash=sha256:dcafadb5a06cb7a6bb49fb4c1de7414ee2f8c8e12b047606d97c3175d690f582 \
+    --hash=sha256:0b4bfc5ccfe4e5c7de535670680398fed4a0bbc5dfd52b3a295baad42230abdf \
+    --hash=sha256:a03dbc0d8ce8c1146c177cd0e3a66ea106f36733fb1b997ea4d051f8a68539ff \
+    --hash=sha256:190a24c14e91c1fa3101069aac7e77d11c5a73911c3904128367f52946bbb6fd \
+    --hash=sha256:b05c5478524deb7a019e240f2a970040c4b0f01f58f0425e6262c96b126c6a3e \
+    --hash=sha256:891ed8312840fd43e0696468a6520a582a033c0109f7b14b96067bfe1123226b \
+    --hash=sha256:30d6aabf623a01affc7c0824936c3dde6590076b61f5dd299df3cc2c75fc5915 \
+    --hash=sha256:31a7c1f1c2551f013d4294d06e22848e2ccd77825f0987cba3239df6ebf7b020 \
+    --hash=sha256:a94fd1ff80001cb97add71d07f596d8b865b716f25ef501183e0e199390e50d3 \
+    --hash=sha256:8a85dbcc770256918b40c2f40bd3ffd3b2ae45b0cf19068b561db8f8d61bf492 \
+    --hash=sha256:773d5b5f2e2bd2c7cbb1bd24902ad41283c88b9dd463a0f82adc9a2870d9d066 \
+    --hash=sha256:0f9193428a55a4347af2d4fd8141a2002dedbcc26487e67fd2ae19f977ee8afc \
+    --hash=sha256:7bf652c73e8f7c32a3f92f7184bf7f9106dacdf5ef59c3c3683d7dae2c4972fb \
+    --hash=sha256:c3c8b1ad2c266fdf7adc041cc4156d6a3d14db93de2f81b26a5af97ef3f209e5 \
+    --hash=sha256:2383d6c3088e863304c37c65cd2ea404b7fbb4886823eab1d74137cc27f3d2ee \
+    --hash=sha256:ae430d51c67ac638dfbb42edf56c669ca9c74744f4d225ad11c6f3d355858187
+distlib==0.3.4; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:6564fe0a8f51e734df6333d08b8b94d4ea8ee6b99b5ed50613f731fd4089f34b \
+    --hash=sha256:e4b58818180336dc9c529bfb9a0b58728ffc09ad92027a3f30b7cd91e3458579
+entrypoints==0.3; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19 \
+    --hash=sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451
+filelock==3.7.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404 \
+    --hash=sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04
+html5lib==1.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d \
+    --hash=sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f
+idna==3.3; python_version >= "3.7" and python_version < "4" \
+    --hash=sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff \
+    --hash=sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d
+importlib-metadata==4.12.0; python_version >= "3.7" and python_version < "3.8" and (python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "3.8" or python_version >= "3.7" and python_version < "3.8" and python_full_version >= "3.5.0") \
+    --hash=sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23 \
+    --hash=sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670
+jeepney==0.8.0; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \
+    --hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755 \
+    --hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806
+keyring==23.6.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:372ff2fc43ab779e3f87911c26e6c7acc8bb440cbd82683e383ca37594cb0617 \
+    --hash=sha256:3ac00c26e4c93739e19103091a9986a9f79665a78cf15a4df1dba7ea9ac8da2f
+lockfile==0.12.2; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:6c3cb24f344923d30b2785d5ad75182c8ea7ac1b6171b08657258ec7429d50fa \
+    --hash=sha256:6aed02de03cba24efabcd600b30540140634fc06cfa603822d508d5361e9f799
+msgpack==1.0.4; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:4ab251d229d10498e9a2f3b1e68ef64cb393394ec477e3370c457f9430ce9250 \
+    --hash=sha256:112b0f93202d7c0fef0b7810d465fde23c746a2d482e1e2de2aafd2ce1492c88 \
+    --hash=sha256:002b5c72b6cd9b4bafd790f364b8480e859b4712e91f43014fe01e4f957b8467 \
+    --hash=sha256:35bc0faa494b0f1d851fd29129b2575b2e26d41d177caacd4206d81502d4c6a6 \
+    --hash=sha256:4733359808c56d5d7756628736061c432ded018e7a1dff2d35a02439043321aa \
+    --hash=sha256:eb514ad14edf07a1dbe63761fd30f89ae79b42625731e1ccf5e1f1092950eaa6 \
+    --hash=sha256:c23080fdeec4716aede32b4e0ef7e213c7b1093eede9ee010949f2a418ced6ba \
+    --hash=sha256:49565b0e3d7896d9ea71d9095df15b7f75a035c49be733051c34762ca95bbf7e \
+    --hash=sha256:aca0f1644d6b5a73eb3e74d4d64d5d8c6c3d577e753a04c9e9c87d07692c58db \
+    --hash=sha256:0dfe3947db5fb9ce52aaea6ca28112a170db9eae75adf9339a1aec434dc954ef \
+    --hash=sha256:4dea20515f660aa6b7e964433b1808d098dcfcabbebeaaad240d11f909298075 \
+    --hash=sha256:e83f80a7fec1a62cf4e6c9a660e39c7f878f603737a0cdac8c13131d11d97f52 \
+    --hash=sha256:3c11a48cf5e59026ad7cb0dc29e29a01b5a66a3e333dc11c04f7e991fc5510a9 \
+    --hash=sha256:1276e8f34e139aeff1c77a3cefb295598b504ac5314d32c8c3d54d24fadb94c9 \
+    --hash=sha256:6c9566f2c39ccced0a38d37c26cc3570983b97833c365a6044edef3574a00c08 \
+    --hash=sha256:fcb8a47f43acc113e24e910399376f7277cf8508b27e5b88499f053de6b115a8 \
+    --hash=sha256:76ee788122de3a68a02ed6f3a16bbcd97bc7c2e39bd4d94be2f1821e7c4a64e6 \
+    --hash=sha256:0a68d3ac0104e2d3510de90a1091720157c319ceeb90d74f7b5295a6bee51bae \
+    --hash=sha256:85f279d88d8e833ec015650fd15ae5eddce0791e1e8a59165318f371158efec6 \
+    --hash=sha256:c1683841cd4fa45ac427c18854c3ec3cd9b681694caf5bff04edb9387602d661 \
+    --hash=sha256:a75dfb03f8b06f4ab093dafe3ddcc2d633259e6c3f74bb1b01996f5d8aa5868c \
+    --hash=sha256:9667bdfdf523c40d2511f0e98a6c9d3603be6b371ae9a238b7ef2dc4e7a427b0 \
+    --hash=sha256:11184bc7e56fd74c00ead4f9cc9a3091d62ecb96e97653add7a879a14b003227 \
+    --hash=sha256:ac5bd7901487c4a1dd51a8c58f2632b15d838d07ceedaa5e4c080f7190925bff \
+    --hash=sha256:1e91d641d2bfe91ba4c52039adc5bccf27c335356055825c7f88742c8bb900dd \
+    --hash=sha256:2a2df1b55a78eb5f5b7d2a4bb221cd8363913830145fad05374a80bf0877cb1e \
+    --hash=sha256:545e3cf0cf74f3e48b470f68ed19551ae6f9722814ea969305794645da091236 \
+    --hash=sha256:2cc5ca2712ac0003bcb625c96368fd08a0f86bbc1a5578802512d87bc592fe44 \
+    --hash=sha256:eba96145051ccec0ec86611fe9cf693ce55f2a3ce89c06ed307de0e085730ec1 \
+    --hash=sha256:7760f85956c415578c17edb39eed99f9181a48375b0d4a94076d84148cf67b2d \
+    --hash=sha256:449e57cc1ff18d3b444eb554e44613cffcccb32805d16726a5494038c3b93dab \
+    --hash=sha256:d603de2b8d2ea3f3bcb2efe286849aa7a81531abc52d8454da12f46235092bcb \
+    --hash=sha256:48f5d88c99f64c456413d74a975bd605a9b0526293218a3b77220a2c15458ba9 \
+    --hash=sha256:6916c78f33602ecf0509cc40379271ba0f9ab572b066bd4bdafd7434dee4bc6e \
+    --hash=sha256:81fc7ba725464651190b196f3cd848e8553d4d510114a954681fd0b9c479d7e1 \
+    --hash=sha256:d5b5b962221fa2c5d3a7f8133f9abffc114fe218eb4365e40f17732ade576c8e \
+    --hash=sha256:77ccd2af37f3db0ea59fb280fa2165bf1b096510ba9fe0cc2bf8fa92a22fdb43 \
+    --hash=sha256:b17be2478b622939e39b816e0aa8242611cc8d3583d1cd8ec31b249f04623243 \
+    --hash=sha256:2bb8cdf50dd623392fa75525cce44a65a12a00c98e1e37bf0fb08ddce2ff60d2 \
+    --hash=sha256:26b8feaca40a90cbe031b03d82b2898bf560027160d3eae1423f4a67654ec5d6 \
+    --hash=sha256:462497af5fd4e0edbb1559c352ad84f6c577ffbbb708566a0abaaa84acd9f3ae \
+    --hash=sha256:2999623886c5c02deefe156e8f869c3b0aaeba14bfc50aa2486a0415178fce55 \
+    --hash=sha256:f0029245c51fd9473dc1aede1160b0a29f4a912e6b1dd353fa6d317085b219da \
+    --hash=sha256:ed6f7b854a823ea44cf94919ba3f727e230da29feb4a99711433f25800cf747f \
+    --hash=sha256:0df96d6eaf45ceca04b3f3b4b111b86b33785683d682c655063ef8057d61fd92 \
+    --hash=sha256:6a4192b1ab40f8dca3f2877b70e63799d95c62c068c84dc028b40a6cb03ccd0f \
+    --hash=sha256:0e3590f9fb9f7fbc36df366267870e77269c03172d086fa76bb4eba8b2b46624 \
+    --hash=sha256:1576bd97527a93c44fa856770197dec00d223b0b9f36ef03f65bac60197cedf8 \
+    --hash=sha256:63e29d6e8c9ca22b21846234913c3466b7e4ee6e422f205a2988083de3b08cae \
+    --hash=sha256:fb62ea4b62bfcb0b380d5680f9a4b3f9a2d166d9394e9bbd9666c0ee09a3645c \
+    --hash=sha256:4d5834a2a48965a349da1c5a79760d94a1a0172fbb5ab6b5b33cbf8447e109ce \
+    --hash=sha256:f5d869c18f030202eb412f08b28d2afeea553d6613aee89e200d7aca7ef01f5f
+packaging==20.9; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.4.0" \
+    --hash=sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a \
+    --hash=sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5
+pexpect==4.8.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937 \
+    --hash=sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c
+pkginfo==1.8.3; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.0" \
+    --hash=sha256:848865108ec99d4901b2f7e84058b6e7660aae8ae10164e015a6dcf5b242a594 \
+    --hash=sha256:a84da4318dd86f870a9447a8c98340aa06216bfc6f2b7bdc4b8766984ae1867c
+platformdirs==2.5.2; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788 \
+    --hash=sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19
+poetry-core==1.1.0b2; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:48ef71ff8a4c2f0b4eaf9c138c12feb96dbf32e65baac8ca673769d05edf142f \
+    --hash=sha256:4967fe08f745291b353328d4226d378a1731de2997a25b7a0c891e302460108d
+poetry==1.2.0b1; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:e3d68c88492550c48df10c738e962f1f770ad71e715bab878a46f527e1ce81d2 \
+    --hash=sha256:26cf8d309a74fff25d768219c2215a989a530acab886c01de3db07ab70bc7abf
+ptyprocess==0.7.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 \
+    --hash=sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220
+pycparser==2.21; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" and sys_platform == "linux" or python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" and python_full_version >= "3.4.0" \
+    --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \
+    --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206
+pylev==1.4.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:7b2e2aa7b00e05bb3f7650eb506fc89f474f70493271a35c242d9a92188ad3dd \
+    --hash=sha256:9e77e941042ad3a4cc305dcdf2b2dec1aec2fbe3dd9015d2698ad02b173006d1
+pyparsing==3.0.9; python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.8" \
+    --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc \
+    --hash=sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb
+pywin32-ctypes==0.2.0; python_version >= "3.7" and python_version < "4.0" and sys_platform == "win32" \
+    --hash=sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942 \
+    --hash=sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98
+requests-toolbelt==0.9.1; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0 \
+    --hash=sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f
+requests==2.28.1; python_version >= "3.7" and python_version < "4" \
+    --hash=sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349 \
+    --hash=sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983
+secretstorage==3.3.2; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \
+    --hash=sha256:755dc845b6ad76dcbcbc07ea3da75ae54bb1ea529eb72d15f83d26499a5df319 \
+    --hash=sha256:0a8eb9645b320881c222e827c26f4cfcf55363e8b374a021981ef886657a912f
+shellingham==1.4.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:536b67a0697f2e4af32ab176c00a50ac2899c5a05e0d8e2dadac8e58888283f9 \
+    --hash=sha256:4855c2458d6904829bd34c299f11fdeed7cfefbf8a2c522e4caea6cd76b3171e
+six==1.16.0; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 \
+    --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926
+tomlkit==0.11.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:0f4050db66fd445b885778900ce4dd9aea8c90c4721141fde0d6ade893820ef1 \
+    --hash=sha256:71ceb10c0eefd8b8f11fe34e8a51ad07812cb1dc3de23247425fbc9ddc47b9dd
+typing-extensions==4.3.0; python_version >= "3.7" and python_version < "3.8" \
+    --hash=sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02 \
+    --hash=sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6
+urllib3==1.26.9; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_full_version >= "3.5.0" and python_version < "4" and python_version >= "3.7" \
+    --hash=sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14 \
+    --hash=sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e
+virtualenv==20.15.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:b30aefac647e86af6d82bfc944c556f8f1a9c90427b2fb4e3bfbf338cb82becf \
+    --hash=sha256:288171134a2ff3bfb1a2f54f119e77cd1b81c29fc1265a2356f3e8d14c7d58c4
+webencodings==0.5.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 \
+    --hash=sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923
+zipp==3.8.0; python_version >= "3.7" and python_version < "3.8" \
+    --hash=sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099 \
+    --hash=sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad
+pip==22.1.2 --hash=sha256:a3edacb89022ef5258bf61852728bf866632a394da837ca49eb4303635835f17
+setuptools==62.6.0 --hash=sha256:c1848f654aea2e3526d17fc3ce6aeaa5e7e24e66e645b5be2171f3f6b4e5a178
diff --git a/docker/python/bootstrap/lockfiles/constraints-3.8.txt b/docker/python/bootstrap/lockfiles/constraints-3.8.txt
new file mode 100644
index 000000000000..f15e0e427cd4
--- /dev/null
+++ b/docker/python/bootstrap/lockfiles/constraints-3.8.txt
@@ -0,0 +1,251 @@
+cachecontrol==0.12.11; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:2c75d6a8938cb1933c75c50184549ad42728a27e9f6b92fd677c3151aa72555b \
+    --hash=sha256:a5b9fcc986b184db101aa280b42ecdcdfc524892596f606858e0b7a8b4d9e144
+cachy==0.3.0; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.4.0" \
+    --hash=sha256:338ca09c8860e76b275aff52374330efedc4d5a5e45dc1c5b539c1ead0786fe7 \
+    --hash=sha256:186581f4ceb42a0bbe040c407da73c14092379b1e4c0e327fdb72ae4a9b269b1
+certifi==2022.6.15; python_version >= "3.7" and python_version < "4" \
+    --hash=sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412 \
+    --hash=sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d
+cffi==1.15.1; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \
+    --hash=sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2 \
+    --hash=sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2 \
+    --hash=sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914 \
+    --hash=sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3 \
+    --hash=sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e \
+    --hash=sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162 \
+    --hash=sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b \
+    --hash=sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21 \
+    --hash=sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185 \
+    --hash=sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd \
+    --hash=sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc \
+    --hash=sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f \
+    --hash=sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e \
+    --hash=sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4 \
+    --hash=sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01 \
+    --hash=sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e \
+    --hash=sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2 \
+    --hash=sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d \
+    --hash=sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac \
+    --hash=sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83 \
+    --hash=sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9 \
+    --hash=sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c \
+    --hash=sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325 \
+    --hash=sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c \
+    --hash=sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef \
+    --hash=sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8 \
+    --hash=sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d \
+    --hash=sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104 \
+    --hash=sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7 \
+    --hash=sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6 \
+    --hash=sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d \
+    --hash=sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a \
+    --hash=sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405 \
+    --hash=sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e \
+    --hash=sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf \
+    --hash=sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497 \
+    --hash=sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375 \
+    --hash=sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e \
+    --hash=sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82 \
+    --hash=sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b \
+    --hash=sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c \
+    --hash=sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426 \
+    --hash=sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9 \
+    --hash=sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045 \
+    --hash=sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3 \
+    --hash=sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a \
+    --hash=sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5 \
+    --hash=sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca \
+    --hash=sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02 \
+    --hash=sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192 \
+    --hash=sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314 \
+    --hash=sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5 \
+    --hash=sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585 \
+    --hash=sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0 \
+    --hash=sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415 \
+    --hash=sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d \
+    --hash=sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984 \
+    --hash=sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35 \
+    --hash=sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27 \
+    --hash=sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76 \
+    --hash=sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3 \
+    --hash=sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee \
+    --hash=sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c \
+    --hash=sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9
+charset-normalizer==2.1.0; python_version >= "3.7" and python_version < "4" and python_full_version >= "3.6.0" \
+    --hash=sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413 \
+    --hash=sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5
+cleo==1.0.0a5; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:ff53056589300976e960f75afb792dfbfc9c78dcbb5a448e207a17b643826360 \
+    --hash=sha256:097c9d0e0332fd53cc89fc11eb0a6ba0309e6a3933c08f7b38558555486925d3
+crashtest==0.3.1; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:300f4b0825f57688b47b6d70c6a31de33512eb2fa1ac614f780939aa0cf91680 \
+    --hash=sha256:42ca7b6ce88b6c7433e2ce47ea884e91ec93104a4b754998be498a8e6c3d37dd
+cryptography==37.0.3; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \
+    --hash=sha256:d10413d493e98075060d3e62e5826de372912ea653ccc948f3c41b21ddca087f \
+    --hash=sha256:cd64147ff16506632893ceb2569624b48c84daa3ba4d89695f7c7bc24188eee9 \
+    --hash=sha256:17c74f7d9e9e9bb7e84521243695c1b4bdc3a0e44ca764e6bcf8f05f3de3d0df \
+    --hash=sha256:0713bee6c8077786c56bdec9c5d3f099d40d2c862ff3200416f6862e9dd63156 \
+    --hash=sha256:b9c2008417741cdfbe945ef2d16b7b7ba0790886a0b49e1de533acf93eb66ed6 \
+    --hash=sha256:646905ff7a712e415bf0d0f214e0eb669dd2257c4d7a27db1e8baec5d2a1d55f \
+    --hash=sha256:dcafadb5a06cb7a6bb49fb4c1de7414ee2f8c8e12b047606d97c3175d690f582 \
+    --hash=sha256:0b4bfc5ccfe4e5c7de535670680398fed4a0bbc5dfd52b3a295baad42230abdf \
+    --hash=sha256:a03dbc0d8ce8c1146c177cd0e3a66ea106f36733fb1b997ea4d051f8a68539ff \
+    --hash=sha256:190a24c14e91c1fa3101069aac7e77d11c5a73911c3904128367f52946bbb6fd \
+    --hash=sha256:b05c5478524deb7a019e240f2a970040c4b0f01f58f0425e6262c96b126c6a3e \
+    --hash=sha256:891ed8312840fd43e0696468a6520a582a033c0109f7b14b96067bfe1123226b \
+    --hash=sha256:30d6aabf623a01affc7c0824936c3dde6590076b61f5dd299df3cc2c75fc5915 \
+    --hash=sha256:31a7c1f1c2551f013d4294d06e22848e2ccd77825f0987cba3239df6ebf7b020 \
+    --hash=sha256:a94fd1ff80001cb97add71d07f596d8b865b716f25ef501183e0e199390e50d3 \
+    --hash=sha256:8a85dbcc770256918b40c2f40bd3ffd3b2ae45b0cf19068b561db8f8d61bf492 \
+    --hash=sha256:773d5b5f2e2bd2c7cbb1bd24902ad41283c88b9dd463a0f82adc9a2870d9d066 \
+    --hash=sha256:0f9193428a55a4347af2d4fd8141a2002dedbcc26487e67fd2ae19f977ee8afc \
+    --hash=sha256:7bf652c73e8f7c32a3f92f7184bf7f9106dacdf5ef59c3c3683d7dae2c4972fb \
+    --hash=sha256:c3c8b1ad2c266fdf7adc041cc4156d6a3d14db93de2f81b26a5af97ef3f209e5 \
+    --hash=sha256:2383d6c3088e863304c37c65cd2ea404b7fbb4886823eab1d74137cc27f3d2ee \
+    --hash=sha256:ae430d51c67ac638dfbb42edf56c669ca9c74744f4d225ad11c6f3d355858187
+distlib==0.3.4; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:6564fe0a8f51e734df6333d08b8b94d4ea8ee6b99b5ed50613f731fd4089f34b \
+    --hash=sha256:e4b58818180336dc9c529bfb9a0b58728ffc09ad92027a3f30b7cd91e3458579
+entrypoints==0.3; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19 \
+    --hash=sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451
+filelock==3.7.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404 \
+    --hash=sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04
+html5lib==1.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d \
+    --hash=sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f
+idna==3.3; python_version >= "3.7" and python_version < "4" \
+    --hash=sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff \
+    --hash=sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d
+importlib-metadata==4.12.0; python_version >= "3.7" and python_version < "3.10" \
+    --hash=sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23 \
+    --hash=sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670
+jeepney==0.8.0; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \
+    --hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755 \
+    --hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806
+keyring==23.6.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:372ff2fc43ab779e3f87911c26e6c7acc8bb440cbd82683e383ca37594cb0617 \
+    --hash=sha256:3ac00c26e4c93739e19103091a9986a9f79665a78cf15a4df1dba7ea9ac8da2f
+lockfile==0.12.2; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:6c3cb24f344923d30b2785d5ad75182c8ea7ac1b6171b08657258ec7429d50fa \
+    --hash=sha256:6aed02de03cba24efabcd600b30540140634fc06cfa603822d508d5361e9f799
+msgpack==1.0.4; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:4ab251d229d10498e9a2f3b1e68ef64cb393394ec477e3370c457f9430ce9250 \
+    --hash=sha256:112b0f93202d7c0fef0b7810d465fde23c746a2d482e1e2de2aafd2ce1492c88 \
+    --hash=sha256:002b5c72b6cd9b4bafd790f364b8480e859b4712e91f43014fe01e4f957b8467 \
+    --hash=sha256:35bc0faa494b0f1d851fd29129b2575b2e26d41d177caacd4206d81502d4c6a6 \
+    --hash=sha256:4733359808c56d5d7756628736061c432ded018e7a1dff2d35a02439043321aa \
+    --hash=sha256:eb514ad14edf07a1dbe63761fd30f89ae79b42625731e1ccf5e1f1092950eaa6 \
+    --hash=sha256:c23080fdeec4716aede32b4e0ef7e213c7b1093eede9ee010949f2a418ced6ba \
+    --hash=sha256:49565b0e3d7896d9ea71d9095df15b7f75a035c49be733051c34762ca95bbf7e \
+    --hash=sha256:aca0f1644d6b5a73eb3e74d4d64d5d8c6c3d577e753a04c9e9c87d07692c58db \
+    --hash=sha256:0dfe3947db5fb9ce52aaea6ca28112a170db9eae75adf9339a1aec434dc954ef \
+    --hash=sha256:4dea20515f660aa6b7e964433b1808d098dcfcabbebeaaad240d11f909298075 \
+    --hash=sha256:e83f80a7fec1a62cf4e6c9a660e39c7f878f603737a0cdac8c13131d11d97f52 \
+    --hash=sha256:3c11a48cf5e59026ad7cb0dc29e29a01b5a66a3e333dc11c04f7e991fc5510a9 \
+    --hash=sha256:1276e8f34e139aeff1c77a3cefb295598b504ac5314d32c8c3d54d24fadb94c9 \
+    --hash=sha256:6c9566f2c39ccced0a38d37c26cc3570983b97833c365a6044edef3574a00c08 \
+    --hash=sha256:fcb8a47f43acc113e24e910399376f7277cf8508b27e5b88499f053de6b115a8 \
+    --hash=sha256:76ee788122de3a68a02ed6f3a16bbcd97bc7c2e39bd4d94be2f1821e7c4a64e6 \
+    --hash=sha256:0a68d3ac0104e2d3510de90a1091720157c319ceeb90d74f7b5295a6bee51bae \
+    --hash=sha256:85f279d88d8e833ec015650fd15ae5eddce0791e1e8a59165318f371158efec6 \
+    --hash=sha256:c1683841cd4fa45ac427c18854c3ec3cd9b681694caf5bff04edb9387602d661 \
+    --hash=sha256:a75dfb03f8b06f4ab093dafe3ddcc2d633259e6c3f74bb1b01996f5d8aa5868c \
+    --hash=sha256:9667bdfdf523c40d2511f0e98a6c9d3603be6b371ae9a238b7ef2dc4e7a427b0 \
+    --hash=sha256:11184bc7e56fd74c00ead4f9cc9a3091d62ecb96e97653add7a879a14b003227 \
+    --hash=sha256:ac5bd7901487c4a1dd51a8c58f2632b15d838d07ceedaa5e4c080f7190925bff \
+    --hash=sha256:1e91d641d2bfe91ba4c52039adc5bccf27c335356055825c7f88742c8bb900dd \
+    --hash=sha256:2a2df1b55a78eb5f5b7d2a4bb221cd8363913830145fad05374a80bf0877cb1e \
+    --hash=sha256:545e3cf0cf74f3e48b470f68ed19551ae6f9722814ea969305794645da091236 \
+    --hash=sha256:2cc5ca2712ac0003bcb625c96368fd08a0f86bbc1a5578802512d87bc592fe44 \
+    --hash=sha256:eba96145051ccec0ec86611fe9cf693ce55f2a3ce89c06ed307de0e085730ec1 \
+    --hash=sha256:7760f85956c415578c17edb39eed99f9181a48375b0d4a94076d84148cf67b2d \
+    --hash=sha256:449e57cc1ff18d3b444eb554e44613cffcccb32805d16726a5494038c3b93dab \
+    --hash=sha256:d603de2b8d2ea3f3bcb2efe286849aa7a81531abc52d8454da12f46235092bcb \
+    --hash=sha256:48f5d88c99f64c456413d74a975bd605a9b0526293218a3b77220a2c15458ba9 \
+    --hash=sha256:6916c78f33602ecf0509cc40379271ba0f9ab572b066bd4bdafd7434dee4bc6e \
+    --hash=sha256:81fc7ba725464651190b196f3cd848e8553d4d510114a954681fd0b9c479d7e1 \
+    --hash=sha256:d5b5b962221fa2c5d3a7f8133f9abffc114fe218eb4365e40f17732ade576c8e \
+    --hash=sha256:77ccd2af37f3db0ea59fb280fa2165bf1b096510ba9fe0cc2bf8fa92a22fdb43 \
+    --hash=sha256:b17be2478b622939e39b816e0aa8242611cc8d3583d1cd8ec31b249f04623243 \
+    --hash=sha256:2bb8cdf50dd623392fa75525cce44a65a12a00c98e1e37bf0fb08ddce2ff60d2 \
+    --hash=sha256:26b8feaca40a90cbe031b03d82b2898bf560027160d3eae1423f4a67654ec5d6 \
+    --hash=sha256:462497af5fd4e0edbb1559c352ad84f6c577ffbbb708566a0abaaa84acd9f3ae \
+    --hash=sha256:2999623886c5c02deefe156e8f869c3b0aaeba14bfc50aa2486a0415178fce55 \
+    --hash=sha256:f0029245c51fd9473dc1aede1160b0a29f4a912e6b1dd353fa6d317085b219da \
+    --hash=sha256:ed6f7b854a823ea44cf94919ba3f727e230da29feb4a99711433f25800cf747f \
+    --hash=sha256:0df96d6eaf45ceca04b3f3b4b111b86b33785683d682c655063ef8057d61fd92 \
+    --hash=sha256:6a4192b1ab40f8dca3f2877b70e63799d95c62c068c84dc028b40a6cb03ccd0f \
+    --hash=sha256:0e3590f9fb9f7fbc36df366267870e77269c03172d086fa76bb4eba8b2b46624 \
+    --hash=sha256:1576bd97527a93c44fa856770197dec00d223b0b9f36ef03f65bac60197cedf8 \
+    --hash=sha256:63e29d6e8c9ca22b21846234913c3466b7e4ee6e422f205a2988083de3b08cae \
+    --hash=sha256:fb62ea4b62bfcb0b380d5680f9a4b3f9a2d166d9394e9bbd9666c0ee09a3645c \
+    --hash=sha256:4d5834a2a48965a349da1c5a79760d94a1a0172fbb5ab6b5b33cbf8447e109ce \
+    --hash=sha256:f5d869c18f030202eb412f08b28d2afeea553d6613aee89e200d7aca7ef01f5f
+packaging==20.9; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.4.0" \
+    --hash=sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a \
+    --hash=sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5
+pexpect==4.8.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937 \
+    --hash=sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c
+pkginfo==1.8.3; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.0" \
+    --hash=sha256:848865108ec99d4901b2f7e84058b6e7660aae8ae10164e015a6dcf5b242a594 \
+    --hash=sha256:a84da4318dd86f870a9447a8c98340aa06216bfc6f2b7bdc4b8766984ae1867c
+platformdirs==2.5.2; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788 \
+    --hash=sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19
+poetry-core==1.1.0b2; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:48ef71ff8a4c2f0b4eaf9c138c12feb96dbf32e65baac8ca673769d05edf142f \
+    --hash=sha256:4967fe08f745291b353328d4226d378a1731de2997a25b7a0c891e302460108d
+poetry==1.2.0b1; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:e3d68c88492550c48df10c738e962f1f770ad71e715bab878a46f527e1ce81d2 \
+    --hash=sha256:26cf8d309a74fff25d768219c2215a989a530acab886c01de3db07ab70bc7abf
+ptyprocess==0.7.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 \
+    --hash=sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220
+pycparser==2.21; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" and sys_platform == "linux" or python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" and python_full_version >= "3.4.0" \
+    --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \
+    --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206
+pylev==1.4.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:7b2e2aa7b00e05bb3f7650eb506fc89f474f70493271a35c242d9a92188ad3dd \
+    --hash=sha256:9e77e941042ad3a4cc305dcdf2b2dec1aec2fbe3dd9015d2698ad02b173006d1
+pyparsing==3.0.9; python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.8" \
+    --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc \
+    --hash=sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb
+pywin32-ctypes==0.2.0; python_version >= "3.7" and python_version < "4.0" and sys_platform == "win32" \
+    --hash=sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942 \
+    --hash=sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98
+requests-toolbelt==0.9.1; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0 \
+    --hash=sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f
+requests==2.28.1; python_version >= "3.7" and python_version < "4" \
+    --hash=sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349 \
+    --hash=sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983
+secretstorage==3.3.2; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \
+    --hash=sha256:755dc845b6ad76dcbcbc07ea3da75ae54bb1ea529eb72d15f83d26499a5df319 \
+    --hash=sha256:0a8eb9645b320881c222e827c26f4cfcf55363e8b374a021981ef886657a912f
+shellingham==1.4.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:536b67a0697f2e4af32ab176c00a50ac2899c5a05e0d8e2dadac8e58888283f9 \
+    --hash=sha256:4855c2458d6904829bd34c299f11fdeed7cfefbf8a2c522e4caea6cd76b3171e
+six==1.16.0; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 \
+    --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926
+tomlkit==0.11.0; python_version >= "3.7" and python_version < "4.0" \
+    --hash=sha256:0f4050db66fd445b885778900ce4dd9aea8c90c4721141fde0d6ade893820ef1 \
+    --hash=sha256:71ceb10c0eefd8b8f11fe34e8a51ad07812cb1dc3de23247425fbc9ddc47b9dd
+urllib3==1.26.9; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_full_version >= "3.5.0" and python_version < "4" and python_version >= "3.7" \
+    --hash=sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14 \
+    --hash=sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e
+virtualenv==20.15.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:b30aefac647e86af6d82bfc944c556f8f1a9c90427b2fb4e3bfbf338cb82becf \
+    --hash=sha256:288171134a2ff3bfb1a2f54f119e77cd1b81c29fc1265a2356f3e8d14c7d58c4
+webencodings==0.5.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \
+    --hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 \
+    --hash=sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923
+zipp==3.8.0; python_version >= "3.7" and python_version < "3.10" \
+    --hash=sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099 \
+    --hash=sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad
+pip==22.1.2 --hash=sha256:a3edacb89022ef5258bf61852728bf866632a394da837ca49eb4303635835f17
+setuptools==62.6.0 --hash=sha256:c1848f654aea2e3526d17fc3ce6aeaa5e7e24e66e645b5be2171f3f6b4e5a178
diff --git a/docker/python/bootstrap/lockfiles/requirements-3.7.txt b/docker/python/bootstrap/lockfiles/requirements-3.7.txt
new file mode 100644
index 000000000000..43a3c2405739
--- /dev/null
+++ b/docker/python/bootstrap/lockfiles/requirements-3.7.txt
@@ -0,0 +1,3 @@
+pip
+poetry
+setuptools
diff --git a/docker/python/bootstrap/lockfiles/requirements-3.8.txt b/docker/python/bootstrap/lockfiles/requirements-3.8.txt
new file mode 100644
index 000000000000..43a3c2405739
--- /dev/null
+++ b/docker/python/bootstrap/lockfiles/requirements-3.8.txt
@@ -0,0 +1,3 @@
+pip
+poetry
+setuptools
diff --git a/docker/python/ci-constraints.txt b/docker/python/ci-constraints.txt
new file mode 100644
index 000000000000..6e586b14ae3d
--- /dev/null
+++ b/docker/python/ci-constraints.txt
@@ -0,0 +1,39 @@
+# This file lists packages we intentionally hold back in CI for no reason other than that
+# updates outside of these bounds require a considerable amount of work, and allowing them to float
+# freely would mean that small changes to the TVM dependency set could be held up behind large
+# migration tasks if a new version of these packages were to be released. Holding packages back
+# here allows us to decide when to tackle such migration work.
+#keras = "^2.6.0"
+#mxnet = "^1.6.0"
+
+#black = "<21.8b0"  # Breaks tensorflow-gpu. Revisit when tensorflow is upgraded.
+blocklint = "==0.2.3"
+#commonmark = ">=0.7.3"
+cpplint = "==1.6.0"
+#docutils = ">=0.11,<0.17"
+#ethos-u-vela = "==3.2.0"
+flake8 = "==3.9.2"
+flowvision = "==0.1.0"
+#h5py = "==3.1.0"
+keras = "==2.7"
+jinja2 = "==3.0.3"
+mxnet = "==1.6.0"
+mypy = "==0.902"
+oneflow = "==0.7.0"
+onnx = "==1.10.2"
+onnxruntime = "==1.9.0"
+numpy = "==1.19.3"
+paddlepaddle = "==2.1.3"
+pillow = "==9.1.0"
+pylint = "==2.4.4"
+scipy = "==1.7.3"
+sphinx = "==4.2.0"
+sphinx-gallery = "==0.4.0"
+tensorflow = "==2.7.2"
+tensorflow-aarch64 = "==2.7.2"
+tensorflow-estimator = "==2.7.0"
+tensorflow-gpu = "==2.7.2"
+tflite = "==2.4.0"
+torch = "==1.11.0"
+torchvision = "==0.12.0+cpu"
+#xgboost = "==1.4.2"
diff --git a/docker/with_the_same_user b/docker/with_the_same_user
index 71e701dcfb59..397b885ee166 100644
--- a/docker/with_the_same_user
+++ b/docker/with_the_same_user
@@ -25,7 +25,13 @@
 
 set -e
 
-COMMAND=("$@")
+# NOTE: sudo uses the env_reset option to reset environment variables to a secure bare minimum.
+# The --preserve-env option below passes those variables through to the invoked process; however,
+# this appears not to affect the environment used with execve, so we resolve the binary to run
+# in this file using the $PATH specified in the Dockerfile.
+COMMAND=( "$(which "$1")" )
+shift
+COMMAND=( "${COMMAND[@]}" "$@" )
 
 if ! touch /this_is_writable_file_system; then
   echo "You can't write to your filesystem!"
@@ -41,21 +47,23 @@ getent group "${CI_BUILD_GID}" || (
     if grep -q "^${CI_BUILD_GROUP}:" /etc/group; then
         CI_BUILD_GROUP="${CI_BUILD_GROUP}2"
     fi
-    addgroup --force-badname --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}")
+    addgroup --force-badname --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" >/dev/null)
+
+getent group tvm-venv || (addgroup tvm-venv >/dev/null)
 getent passwd "${CI_BUILD_UID}" || adduser --force-badname --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \
     --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \
     --disabled-password --home "${CI_BUILD_HOME}" --quiet "${CI_BUILD_USER}"
-usermod -a -G sudo "${CI_BUILD_USER}"
+usermod -a -G sudo -G tvm-venv "${CI_BUILD_USER}"
 
 # Add user to video group for ROCm
-if [[ ! -z $ROCM_ENABLED ]]; then
+if [[ ! -z "${ROCM_ENABLED-}" ]]; then
   usermod -a -G video "${CI_BUILD_USER}"
 fi
 
 # This is a grotesque hack to get PYTEST_ADD_OPTS available to all task scripts.
 echo "${CI_BUILD_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-nopasswd-sudo
 
-if [[ ! -z $CUDA_VISIBLE_DEVICES ]]; then
+if [[ ! -z "${CUDA_VISIBLE_DEVICES-}" ]]; then
     CUDA_ENV="CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}"
 else
     CUDA_ENV=""
@@ -65,8 +73,8 @@ sudo -u "#${CI_BUILD_UID}" --preserve-env \
 ${CUDA_ENV} \
 PATH=${PATH} \
 JAVA_HOME=${JAVA_HOME} \
-LD_LIBRARY_PATH=${LD_LIBRARY_PATH} \
-PYTHONPATH=${PYTHONPATH} \
-CI_IMAGE_NAME=${CI_IMAGE_NAME} \
-HOME=${CI_BUILD_HOME} \
+LD_LIBRARY_PATH="${LD_LIBRARY_PATH-}" \
+PYTHONPATH="${PYTHONPATH-}" \
+CI_IMAGE_NAME="${CI_IMAGE_NAME-}" \
+HOME="${CI_BUILD_HOME-}" \
 "${COMMAND[@]}"

From b22b872da800b0b44feeca67e808319e21b840a2 Mon Sep 17 00:00:00 2001
From: Anirudh Sundar <quic_sanirudh@quicinc.com>
Date: Tue, 13 Sep 2022 00:44:40 +0530
Subject: [PATCH 151/704] [Hexagon] Add Hand written HVX conv2d (#12204)

* [Hexagon] Add Hand written HVX conv2d

Co-authored-by: Krzysztof Parzyszek <kparzysz@quicinc.com>

* Address review comments

Co-authored-by: Krzysztof Parzyszek <kparzysz@quicinc.com>

* Add some more comments and a file rename

* Add gtest unit tests for blockize/deblockize

* Add gtest unit tests fp16 utils

Co-authored-by: Krzysztof Parzyszek <kparzysz@quicinc.com>
---
 cmake/modules/Hexagon.cmake                   |  10 +
 include/tvm/runtime/hexagon/ops/conv2d.h      | 198 +++++++
 src/runtime/hexagon/ops/conv2d_fp16_hvx.cc    | 489 ++++++++++++++++++
 src/runtime/hexagon/ops/conv_utils.cc         | 243 +++++++++
 .../hexagon/hexagon_fp16_utils_tests.cc       | 289 +++++++++++
 .../topi/test_conv2d_fp16_intrin.py           | 248 +++++++++
 6 files changed, 1477 insertions(+)
 create mode 100644 include/tvm/runtime/hexagon/ops/conv2d.h
 create mode 100644 src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
 create mode 100644 src/runtime/hexagon/ops/conv_utils.cc
 create mode 100644 tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc
 create mode 100644 tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py

diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index c08ea5eb1df1..aad770120201 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -172,6 +172,16 @@ if(BUILD_FOR_HEXAGON)
     list(APPEND TVM_RUNTIME_LINKER_LIBS -Wl,--whole-archive ${USE_HEXAGON_SDK}/libs/qhl/prebuilt/hexagon_toolv84_v68/libqhmath.a -Wl,--no-whole-archive)
 
   endif()
+
+  # Hand-written ops
+  file_glob_append(RUNTIME_HEXAGON_SRCS
+    "${TVMRT_SOURCE_DIR}/hexagon/ops/*.cc"
+  )
+
+  set_source_files_properties(
+    "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc"
+    PROPERTIES COMPILE_FLAGS "-mhvx"
+  )
 endif()
 
 if(USE_HEXAGON_RPC)
diff --git a/include/tvm/runtime/hexagon/ops/conv2d.h b/include/tvm/runtime/hexagon/ops/conv2d.h
new file mode 100644
index 000000000000..d759149727e8
--- /dev/null
+++ b/include/tvm/runtime/hexagon/ops/conv2d.h
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/device_api.h>
+
+#include <cassert>
+
+#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_
+#define TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0};
+
+// Standalone DLTensor: the standalone-ness means that this object owns the shape
+// (as opposed to a DLTensor).
+template <size_t NDIM>
+class SDLTensor : public DLTensor {
+ public:
+  SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, const int64_t* data_dims)
+      : SDLTensor(data_ptr, data_type, data_space) {
+    for (size_t i = 0; i < NDIM; ++i) dims[i] = data_dims[i];
+  }
+
+  SDLTensor(void* data_ptr, DLDataType data_type, void* data_space,
+            std::initializer_list<int64_t> data_dims)
+      : SDLTensor(data_ptr, data_type, data_space, data_dims.begin()) {}
+
+  void* GetDataSpace() const { return data_space; }
+
+ private:
+  /**
+   * @brief Construct SDLTensor
+   *
+   * @param data_ptr Either points to the same memory as data_space or an array of pointers to the
+   * start of each chunk of weight. Since weights can be of varying sizes, this array could contain
+   * the pointer to each chunk of memory
+   * @param data_type data type of the elements in Tensor
+   * @param data_space is meant to store the pointer returned from AllocDataSpace and can be freed
+   * by passing it to FreeDataSpace
+   */
+  SDLTensor(void* data_ptr, DLDataType data_type, void* data_space) : data_space(data_space) {
+    data = data_ptr;
+    device = hexagon_device;
+    ndim = NDIM;
+    dtype = data_type;
+    shape = dims;
+    strides = nullptr;
+    byte_offset = 0;
+  }
+
+  void* data_space = nullptr;
+  int64_t dims[NDIM];
+};
+
+inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); }
+
+inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); }
+
+constexpr int xyc_to_sm_16b(int y, int x, int c) {
+  // Map y,x,c coordinates within a block to the offset (in 16-bit elements)
+  // from the beginning of the block in spatial-major layout.
+  // 10-bit spatial mask: yyyxcccccx
+  assert(y >= 0 && x >= 0 && c >= 0);
+  return y << 7 | (x & 2) << 5 | c << 1 | (x & 1);
+}
+
+constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) {
+  // Map y,x,i,o coordinates within a chunk (assuming the origin at the
+  // top-left spatial corner) to the offset (in 16-bit elements) from the
+  // beginning of the chunk in spatial-major layout.
+  // Spatial mask: p..piiiioooooi, where p..p are position bits.
+  assert(width >= 1);
+  assert(y >= 0 && x >= 0 && i >= 0 && o >= 0);
+  int p = y * width + (width - 1 - x);
+  return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1);
+}
+
+inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; }
+
+// Returns the block address at the given index
+// Assumptions
+// - The data type of tensor is fp16
+// - There is only one batch, and hence n==0
+inline uintptr_t nhwc_at(const DLTensor& a, int n, int y, int x, int c) {
+  if (y < 0 || y >= a.shape[1]) return uintptr_t(0);
+  auto p = static_cast<uintptr_t*>(a.data);
+  assert(n == 0);
+  return p[y * a.shape[2] * a.shape[3] + x * a.shape[3] + c];
+}
+
+// Returns the address of the chunk stored at given index
+// Assumptions
+// - The data type of tensor is fp16
+inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) {
+  auto p = static_cast<uintptr_t*>(f.data);
+  return p[y * f.shape[1] * f.shape[2] * f.shape[3] + x * f.shape[2] * f.shape[3] + i * f.shape[3] +
+           o];
+}
+
+/**
+ * @brief Function to "blockize" the flat input data
+ * The term "blockize" is used to mention that the data is stored in non-contiguous blocks
+ *
+ * The input is mapped into the below mentioned layout (notation similar to index map used for
+ * transform layout):
+ *
+ * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2
+ *
+ * where AXIS_SEPARATOR represents split up in the physical layout
+ *
+ * @param out Pre-allocated output memory pointer
+ * @param inp_flat Flat input data pointer
+ * @param height
+ * @param width
+ * @param depth
+ */
+void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth);
+
+/**
+ * @brief Convert back from non-contguous layout to a flat layout
+ *
+ * @param out_flat Pre-allocated output memory pointer
+ * @param inp Blockized input data pointer
+ * @param height
+ * @param width
+ * @param depth
+ */
+void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth);
+
+/**
+ * @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below:
+ *
+ * Weights are packed into the below mentioned layout (notation similar to index map):
+ * Since weights cannot be exactly represented into a index map notation, the
+ * base split up is mentioned below with a few gotchas
+ *
+ * lambda h, w, i, o: h//8, w//4, o//32, i//32, h%8, w%4, (i%32)//2, o%32, i%2
+ *
+ * The gotchas are:
+ *  - (w%4) is actually stored in the right to left order, as in 3,2,1,0 instead of 0,1,2,3
+ *  - The h%8 and (w%4) dimensions are not padded up, leading to chunks of different sizes
+ *    (thereby the name "chunked" instead of packed)
+ *  - The thinnest chunk of width is stored first. For example, if a kernel is 5x5, the first
+ *    chunk along the width has size 1 (representing index 0) and then next one has size 4
+ *    representing indices (1,2,3,4)
+ *
+ * @param out_ptr Base pointer table to be filled with the list of pointers to the first addresses
+ * of the "chunked" weights
+ * @param out_ptr_size The number of chunks
+ * @param out Pointer to pre-allocated output memory
+ * @param inp Pointer to flat input data
+ * @param height
+ * @param width
+ * @param idepth
+ * @param odepth
+ */
+void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height,
+                       int width, int idepth, int odepth);
+
+SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat,
+                          bool copy_data);
+
+int calculate_num_weight_chunks(int64_t* shape_hwio);
+
+SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
+                          int num_chunks, void** ptr_table);
+
+template <size_t N>
+void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor<N>& tensor) {
+  if (auto* data_space = tensor.GetDataSpace()) {
+    device_api->FreeDataSpace(hexagon_device, data_space);
+  }
+}
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_
diff --git a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
new file mode 100644
index 000000000000..cf4dc43c6515
--- /dev/null
+++ b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
@@ -0,0 +1,489 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <HAP_compute_res.h>
+#include <hexagon_types.h>
+#include <hvx_hexagon_protos.h>
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/device_api.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+
+#include "tvm/runtime/hexagon/ops/conv2d.h"
+
+// Current limitations:
+// - N in NHWC must be 1
+// - dilated convolutions are not supported
+// - Bias is not accepted
+// - Optional "relu" is not performed
+
+// Packed arguments:
+//   0: DLTensor activations (NHWC)
+//   1: DLTensor weights (HWIO)
+//   2: int offset_top
+//   3: int offset_left
+//   4: int stride_h
+//   5: int stride_w
+//   6: DLTensor output (NHWC)
+extern "C" int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val,
+                                  int out_code, void* res_handle);
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+/**
+ * @brief Returns the pointer to the element within the given block
+ * assuming fp16 type and speicific layout as mentioned in blockize_hwc_16b.
+ * All the below params are explained with the same layout assumption
+ *
+ * @param block_out_y y-index of block
+ * @param block_out_x x-index of block
+ * @param block_out_c c-index of block
+ * @param yi height-offset within the block
+ * @param xio outer width offset within the block
+ * @param ci channel offset within the block
+ * @param xii inner width offset within the block
+ * @param block base DLTensor
+ *
+ * @return The pointer to the element within the given block
+ */
+static inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi,
+                                      int xio, int ci, int xii, const DLTensor& tensor) {
+  auto block_ptr = nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c);
+  auto block_offset = yi * 128 + xio * 64 + ci * 2 + xii;
+  auto first_element_ptr = reinterpret_cast<uint16_t*>(block_ptr);
+  return first_element_ptr + block_offset;
+}
+
+/**
+ * @brief Compute 2 vectors with ones in the even and odd lanes
+ *
+ * Output vectors are:
+ * vector 1     = [0xFFFF,0x0000,0xFFFFF,0x0000,...,0xFFFF,0x0000]
+ * vector lanes = [   0  ,   2  ,   3   ,   4  ,...,   62 ,   63 ]
+ *
+ * vector 2     = [0x0000,0xFFFF,0x0000,0xFFFFF,...,0xFFFF,0x0000]
+ * vector lanes = [   0  ,   2  ,   3   ,   4  ,...,   62 ,   63 ]
+ *
+ * @return Return the 2 vectors
+ */
+inline std::pair<HVX_Vector, HVX_Vector> getOddEvenOnes() {
+  HVX_Vector v0 = Q6_V_vzero();
+  HVX_Vector v1 = Q6_Vh_vsplat_R(0xFFFF);
+
+  HVX_Vector v1e = Q6_Vh_vshuffe_VhVh(v0, v1);
+  HVX_Vector v1o = Q6_V_vnot_V(v1e);
+  return {v1e, v1o};
+}
+
+/**
+ * @brief Return the input vector filled with the 2 channel elements(which is the 1st and 3rd
+ * element) from base_ptr filled up 32 times to get 64 elements
+ *
+ * 1. It's generated by first creating 2 vectors "splatted" with the 2 required elements
+ * 2. Then we andd it with vectors containing all ones (0xFFFF) in the even and odd lanes
+ * 3. Finally those 2 vectors are OR'ed together
+ *
+ * @param base_ptr pointer to the first of the 2 channel elements to be filled
+ *
+ * @return input vector
+ */
+inline HVX_Vector getInputVector(uint16_t* base_ptr) {
+  HVX_Vector v1 = Q6_Vh_vsplat_R(base_ptr[0]);
+  HVX_Vector v2 = Q6_Vh_vsplat_R(base_ptr[2]);
+
+  auto oddEvenOnes = getOddEvenOnes();
+  auto v1e = oddEvenOnes.first;
+  auto v1o = oddEvenOnes.second;
+
+  HVX_Vector v_even_vals = Q6_V_vand_VV(v1, v1e);
+  HVX_Vector v_odd_vals = Q6_V_vand_VV(v2, v1o);
+
+  return Q6_V_vor_VV(v_even_vals, v_odd_vals);
+}
+
+/**
+ * @brief Return the Output vector which contains the 32 output channels in the even lanes
+ *
+ * The output vector is commputed as:
+ * 1. vector multiply(vmpy) of input and weights
+ * 2. Rotate the vector right by 1 element and add with the first vector to add the 2 input channels
+ * 3. Then convert the results back from qfloat16 to IEEE half-precision float
+ * 4. The added values are in even lanes, so zero out the odd lanes by anding with ones in even
+ * lanes and return
+ *
+ * @param act_vec Input activations vector
+ * @param wgt_vec Weights vector
+ *
+ * @return output vector with 32 output channels even lanes
+ */
+inline HVX_Vector computeOuputVector(HVX_Vector act_vec, HVX_Vector wgt_vec) {
+  HVX_Vector v_res = Q6_Vqf16_vmpy_VhfVhf(act_vec, wgt_vec);  // result is in qf16
+  HVX_Vector v_rot = Q6_V_vror_VR(v_res, 2);
+  HVX_Vector v_reduced = Q6_Vqf16_vadd_Vqf16Vqf16(v_res, v_rot);
+  HVX_Vector v_hf = Q6_Vhf_equals_Vqf16(v_reduced);
+  HVX_Vector v1e = getOddEvenOnes().first;
+  HVX_Vector v_reduced_even_lanes = Q6_V_vand_VV(v_hf, v1e);
+  return v_reduced_even_lanes;
+}
+
+static int round_down(int v, int base) { return v - (v % base); }
+
+/**
+ * @brief Compute the convolution of inputs from cr_act, and weights from
+ * cr_filt to update the output to cr_out. The goal is to have an efficient
+ * HVX implementation
+ *
+ * Assumptions:
+ * -----------
+ * - This implementation right now assumes that the dilation is 1
+ * - there is zero padding or the input was already pre-padded.
+ * - block specific spatial padding is only expected at the end and hence
+ *   pad_top and pad_left are not yet used
+ * - Relu activation is not used
+ * - Bias add is not done
+ *
+ * @param cr_out blockized output tensor with zeros already filled in
+ * @param cr_act blockized activations
+ * @param cr_filt Chunkified weights as returned from output of prepare_hwio
+ * @param out_shape Original output shape of the tensor before blockization
+ * @param act_shape Original input shape
+ * @param bias_flat Flat bias values and are not used right now
+ *        TODO (quic-sanirudh) Add support for bias add
+ * @param filt_shape Original filter shape
+ * @param pad_shape Pad top and pad left shape
+ * @param relu Whether to apply relu after convolution, not done right now
+ *        TODO (quic-sanirudh) Add support for relu activation
+ * @param zero_block A block filled with zeros
+ *
+ * @return
+ */
+void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act,  // NOLINT(*)
+                         const DLTensor& cr_filt, const DLTensor& out_shape,
+                         const DLTensor& act_shape, const DLTensor& bias_flat,
+                         const DLTensor& filt_shape, const DLTensor& pad_shape, bool relu,
+                         int stride_h, int stride_w, uintptr_t zero_block) {
+  int64_t filt_height = filt_shape.shape[0];
+  int64_t filt_width = filt_shape.shape[1];
+  int64_t filt_idepth = filt_shape.shape[2];
+
+  int pad_top = pad_shape.shape[0];
+  int pad_left = pad_shape.shape[1];
+  LOG_INFO << "filt_height=" << filt_height << ", filt_width=" << filt_width
+           << ", filt_idepth=" << filt_idepth << ", pad_top=" << pad_top
+           << ", pad_left=" << pad_left << "\n";
+
+  ICHECK_LT(pad_top, 8) << "pad_top offset cannot be >= 8";
+  ICHECK_LT(pad_left, 4) << "pad_left offset cannot be >= 4";
+
+  int a_height = cr_act.shape[1];
+  int a_width = cr_act.shape[2];
+  int a_depth = cr_act.shape[3];
+
+  int w_height = cr_filt.shape[0];
+  int w_width = cr_filt.shape[1];
+
+  int o_depth = cr_out.shape[3];
+  int b_depth = bias_flat.shape[0];
+
+  int o_height = cr_out.shape[1];
+  int o_width = cr_out.shape[2];
+
+  int out_height = out_shape.shape[1];
+  int out_width = out_shape.shape[2];
+
+  LOG_INFO << "a: 1x" << a_height << "x" << a_width << "x" << a_depth << ", w: " << w_height << "x"
+           << w_width << "x" << static_cast<int>(cr_filt.shape[2]) << "x"
+           << static_cast<int>(cr_filt.shape[3]) << ", o: 1x" << o_height << "x" << o_width << "x"
+           << o_depth << ", b: " << b_depth << ", out_shape: " << out_height << "x" << out_width
+           << "\n";
+
+  ICHECK_EQ(a_depth, cr_filt.shape[2]) << "input depth should match weights input channels";
+  ICHECK_EQ(o_depth, cr_filt.shape[3]) << "output depth should match the weights output channel";
+
+  int rd = round_down(filt_width, 4);
+  int wgt_chunk_thin_width = filt_width - rd;
+
+  /*
+   * Compute the output vector of either 1 or 2 elements along the width and max 32 elements along
+   * the depth to constitue a maximum of 64 elements
+   *
+   * The weights are loaded directly in the order they're stored, which results
+   * in 2 input channels and 32 output channels
+   *
+   * Weights vector illustration:
+   * ------- ------ ------------
+   * weights_vec = [0-0,0-1,1-0,1-1,2-0,2-1,3-0,3-1,4-0,4-1,...,31-0,31-1] -> This is the
+   * vector representation of weights, where the elements are represented as
+   * "out_channel-input_channel"
+   *
+   *
+   * Same 2 input channels have to be multiplied across all output channels in the weights.
+   *
+   * Activations vector would thus be:
+   * ----------- ------ ----- ---- --
+   * act_vec = [i0,i1,i0,i1,i0,i1,...,i0,i1] - 2 elements of the input channels broadcasted 32 times
+   * to fill 64 elements of the vector
+   *
+   *
+   * Thus the computation is just a vmpy(act_vec,weights_vec) followed by a some rearrangement to
+   * add every pair of 16b lanes in the vector to reduce along the input channels
+   *
+   * This result is added to the result of the next pair of input channels all the way until we
+   * have reduced across the entire input channels.
+   *
+   * Then the same vector is added to the results of the following elements along the width and
+   * height to finally get 32 elements representing 32 output channels.
+   *
+   * Since the output block also has the 8h2w32c2w format, the 32 elements of the next element
+   * along the width is also added into the the same vector such that the first 32 channel elements
+   * occupy the even lanes and the next 32 occupy the odd lanes to form a single 64-element vector
+   * which is then stored
+   */
+  auto computeConv = [filt_height, filt_width, wgt_chunk_thin_width, filt_idepth, stride_h,
+                      stride_w, &cr_out, &cr_act, &cr_filt](int out_act_y, int out_act_x, int out_c,
+                                                            int h, int wo, bool skip_wi_1 = false) {
+    auto out_element_ptr = getElementPtr(out_act_y, out_act_x, out_c, h, wo, 0, 0, cr_out);
+
+    LOG_INFO << "out_act_y: " << out_act_y << ", out_act_x: " << out_act_x << ", out_c: " << out_c
+             << ", h: " << h << ", wo: " << wo << " out_element_ptr: " << out_element_ptr;
+
+    HVX_Vector* out_vector = reinterpret_cast<HVX_Vector*>(out_element_ptr);
+    HVX_Vector existing_out_vec = *out_vector;
+
+    for (int fh = 0; fh < filt_height; ++fh) {
+      for (int fw = 0; fw < filt_width; ++fw) {
+        int fch = fh / 8;
+        int fcw = 0;
+        if (fw >= wgt_chunk_thin_width) {
+          fcw = (fw - wgt_chunk_thin_width) / 4 + 1;
+        }
+        int fx = (fw < wgt_chunk_thin_width) ? fw : ((fw - wgt_chunk_thin_width) % 4);
+        int fy = fh % 8;
+        for (int c = 0; c < round_up(filt_idepth, 2); c += 2) {
+          int out_act_cc = c / 32;
+          int ci = c % 32;
+          auto wgt_chunk = hwio_at(cr_filt, fch, fcw, out_act_cc, out_c);
+
+          // Find weight chunk offset ptr
+          int max_x = (fcw == 0) ? wgt_chunk_thin_width : 4;
+
+          int wi = 0;
+
+          int out_width_idx = out_act_x * 4 + wo * 2 + wi;
+          int act_width_access_idx = out_width_idx * stride_w + fw;
+          int true_out_act_x = act_width_access_idx / 4;
+          int true_wo = (act_width_access_idx % 4) / 2;
+          int true_wi = act_width_access_idx % 2;
+
+          int out_height_idx = out_act_y * 8 + h;
+          int act_height_access_idx = out_height_idx * stride_h + fh;
+          int true_out_act_y = act_height_access_idx / 8;
+          int true_h = act_height_access_idx % 8;
+
+          int act_channel_idx = out_act_cc * 32 + ci;
+
+          auto act_element_ptr = getElementPtr(true_out_act_y, true_out_act_x, out_act_cc, true_h,
+                                               true_wo, ci, true_wi, cr_act);
+          HVX_Vector act_vec = getInputVector(act_element_ptr);
+
+          auto wgt_chunk_offset = hwio_to_sm_16b(max_x, fy, fx, ci, 0);
+          auto base_chunk_ptr = reinterpret_cast<uint16_t*>(wgt_chunk);
+          auto chunk_ptr = base_chunk_ptr + wgt_chunk_offset;
+
+          LOG_INFO << "act:  0x" << act_height_access_idx << "x" << act_width_access_idx << "x"
+                   << act_channel_idx << ", wgt: " << fh << "x" << fw << "x" << act_channel_idx
+                   << "x" << out_c * 32 << ", out: 0x" << out_height_idx << "x" << out_width_idx
+                   << "x" << out_c * 32 << ", wgt_chunk_offset: " << wgt_chunk_offset;
+
+          const HVX_Vector* weights_vec_ptr = reinterpret_cast<const HVX_Vector*>(chunk_ptr);
+          HVX_Vector weights_vec = *weights_vec_ptr;
+
+          HVX_Vector reduced_vec_even_elements = computeOuputVector(act_vec, weights_vec);
+
+          if (!skip_wi_1) {
+            wi = 1;
+
+            out_width_idx = out_act_x * 4 + wo * 2 + wi;
+            act_width_access_idx = out_width_idx * stride_w + fw;
+            true_out_act_x = act_width_access_idx / 4;
+            true_wo = (act_width_access_idx % 4) / 2;
+            true_wi = act_width_access_idx % 2;
+
+            act_element_ptr = getElementPtr(true_out_act_y, true_out_act_x, out_act_cc, true_h,
+                                            true_wo, ci, true_wi, cr_act);
+            act_vec = getInputVector(act_element_ptr);
+
+            LOG_INFO << "act:  0x" << act_height_access_idx << "x" << act_width_access_idx << "x"
+                     << act_channel_idx << ", wgt: " << fh << "x" << fw << "x" << act_channel_idx
+                     << "x" << out_c * 32 << ", out: 0x" << out_height_idx << "x" << out_width_idx
+                     << "x" << out_c * 32 << ", wgt_chunk_offset: " << wgt_chunk_offset;
+
+            HVX_Vector reduced_vec_odd_elements = computeOuputVector(act_vec, weights_vec);
+            reduced_vec_odd_elements = Q6_V_vror_VR(reduced_vec_odd_elements, -2);
+            HVX_Vector out_final = Q6_V_vor_VV(reduced_vec_even_elements, reduced_vec_odd_elements);
+
+            HVX_Vector out_vec_qf16 = Q6_Vqf16_vadd_VhfVhf(out_final, existing_out_vec);
+            existing_out_vec = Q6_Vhf_equals_Vqf16(out_vec_qf16);
+          } else {
+            HVX_Vector out_vec_qf16 =
+                Q6_Vqf16_vadd_VhfVhf(reduced_vec_even_elements, existing_out_vec);
+            existing_out_vec = Q6_Vhf_equals_Vqf16(out_vec_qf16);
+          }
+        }
+      }
+    }
+    *out_vector = existing_out_vec;
+  };
+
+  auto computeFullWidth = [&computeConv](int out_y, int out_x, int out_c, int h) {
+    for (int wo = 0; wo < 2; ++wo) {
+      computeConv(out_y, out_x, out_c, h, wo);
+    }
+  };
+
+  auto computePartialWidth = [out_width, o_width, &computeConv](int out_y, int out_c, int h) {
+    int out_x = o_width - 1;
+    int wo = 0;
+    for (; wo < (out_width % 4) / 2; ++wo) {
+      computeConv(out_y, out_x, out_c, h, wo);
+    }
+
+    if (out_width % 2) {
+      computeConv(out_y, out_x, out_c, h, wo, true /* skip_wi_1 */);
+    }
+  };
+
+  for (int out_c = 0; out_c < cr_filt.shape[3]; ++out_c) {
+    for (int out_act_y = 0; out_act_y < out_height / 8; ++out_act_y) {
+      int out_y = out_act_y;
+      for (int out_act_x = 0; out_act_x < out_width / 4; ++out_act_x) {
+        int out_x = out_act_x;
+        for (int h = 0; h < 8; ++h) {
+          computeFullWidth(out_y, out_x, out_c, h);
+        }
+      }
+
+      for (int h = 0; h < 8; ++h) {
+        computePartialWidth(out_y, out_c, h);
+      }
+    }
+
+    int out_y = o_height - 1;
+    for (int h = 0; h < out_height % 8; ++h) {
+      for (int out_act_x = 0; out_act_x < out_width / 4; ++out_act_x) {
+        int out_x = out_act_x;
+        computeFullWidth(out_y, out_x, out_c, h);
+      }
+      computePartialWidth(out_y, out_c, h);
+    }
+  }
+}
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
+
+int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val,
+                       int out_code, void* res_handle) {
+  namespace hexagonrt = tvm::runtime::hexagon;
+  ICHECK_EQ(num_args, 7) << "Unexpected number of arguments";
+  ICHECK_EQ(type_codes[0], kTVMDLTensorHandle)
+      << "First argument is expected to be the input tensor";  // Input activations
+  ICHECK_EQ(type_codes[1], kTVMDLTensorHandle)
+      << "Second argument is expected to be the weights tensor";  // Weights
+  ICHECK_EQ(type_codes[2], kDLInt)
+      << "Third argument is expected to be the pad_top offset";  // pad_top offset
+  ICHECK_EQ(type_codes[3], kDLInt)
+      << "Fourth argument is expected to be the pad_left offset";  // pad_left offset
+  ICHECK_EQ(type_codes[4], kDLInt) << "Fifth argument is expected to be the stride_h";  // stride_h
+  ICHECK_EQ(type_codes[5], kDLInt) << "Sixth argument is expected to be the stride_w";  // stride_w
+  ICHECK_EQ(type_codes[6], kTVMDLTensorHandle)
+      << "Seventh argument is expected to be the output tensor";  // output
+
+  auto* act_flat = static_cast<DLTensor*>(args[0].v_handle);
+  auto* wgt_flat = static_cast<DLTensor*>(args[1].v_handle);
+  auto* out_flat = static_cast<DLTensor*>(args[6].v_handle);
+
+  // Temporary assertion until multiple batches are supported
+  ICHECK_EQ(act_flat->shape[0], 1) << "Input batch size more than 1 is not supported yet";
+
+  // Temporary assertion until multiple batches are supported
+  ICHECK_EQ(out_flat->shape[0], 1) << "Output batch size more than 1 is not supported yet";
+
+  int pad_top = args[2].v_int64;
+  int pad_left = args[3].v_int64;
+  int stride_h = args[4].v_int64;
+  int stride_w = args[5].v_int64;
+
+  LOG_INFO << "act.shape=" << act_flat->shape[0] << "x" << act_flat->shape[1] << "x"
+           << act_flat->shape[2] << "x" << act_flat->shape[3]
+           << ", wgt.shape=" << wgt_flat->shape[0] << "x" << wgt_flat->shape[1] << "x"
+           << wgt_flat->shape[2] << "x" << wgt_flat->shape[3] << ", pad_top=" << pad_top
+           << ", pad_left=" << pad_left;
+
+  auto* device_api = tvm::runtime::DeviceAPI::Get(hexagonrt::hexagon_device, false);
+  ICHECK(device_api != nullptr);
+  tvm::runtime::String vtcm_scope = "global.vtcm";
+
+  auto act_vtcm = hexagonrt::prepare_nhwc(device_api, act_flat, /*copy_data=*/true);
+
+  ICHECK_NE(wgt_flat->shape[0], 0) << "Weights height should not be zero";
+  ICHECK_NE(wgt_flat->shape[1], 0) << "Weights width should not be zero";
+  ICHECK_NE(wgt_flat->shape[2], 0) << "Weights input channels should not be zero";
+  ICHECK_NE(wgt_flat->shape[3], 0) << "Weights output channels should not be zero";
+  int num_wgt_chunks = hexagonrt::calculate_num_weight_chunks(wgt_flat->shape);
+  LOG_INFO << "num_wgt_chunks: " << num_wgt_chunks;
+  auto wgt_ptr_table =
+      reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
+  auto wgt_vtcm = hexagonrt::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table);
+
+  auto out_vtcm = hexagonrt::prepare_nhwc(device_api, out_flat, /*copy_data=*/false);
+
+  // Prepare zero_block
+  int64_t block_nbytes = 2048;
+  void* zero_block = device_api->AllocDataSpace(hexagonrt::hexagon_device, 1, &block_nbytes,
+                                                tvm::runtime::DataType::UInt(8), vtcm_scope);
+  memset(zero_block, 0, 2048);
+
+  // FIXME: Setting bias to zero_block: this works for up to 256 output channels.
+  auto bias_flat =
+      hexagonrt::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]);
+  auto act_shape = hexagonrt::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape);
+  auto filt_shape = hexagonrt::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape);
+  auto pad_shape = hexagonrt::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left});
+  auto out_shape = hexagonrt::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape);
+  bool relu = false;
+
+  hexagonrt::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, bias_flat,
+                                 filt_shape, pad_shape, relu, stride_h, stride_w,
+                                 hexagonrt::to_uint(zero_block));
+
+  hexagonrt::deblockize_hwc_16b(out_flat->data, out_vtcm.data, out_flat->shape[1],
+                                out_flat->shape[2], out_flat->shape[3]);
+
+  device_api->FreeDataSpace(hexagonrt::hexagon_device, zero_block);
+  hexagonrt::release(device_api, out_vtcm);
+  hexagonrt::release(device_api, wgt_vtcm);
+  hexagonrt::release(device_api, act_vtcm);
+
+  return 0;
+}
diff --git a/src/runtime/hexagon/ops/conv_utils.cc b/src/runtime/hexagon/ops/conv_utils.cc
new file mode 100644
index 000000000000..e1ec1e17277d
--- /dev/null
+++ b/src/runtime/hexagon/ops/conv_utils.cc
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "tvm/runtime/hexagon/ops/conv2d.h"
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+/**
+ * @brief Function to "blockize" the flat input data
+ * The term "blockize" is used to mention that the data is stored in non-contiguous blocks
+ *
+ * The input is mapped into the below mentioned layout (notation similar to index map used for
+ * transform layout):
+ *
+ * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2
+ *
+ * where AXIS_SEPARATOR represents split up in the physical layout
+ *
+ * @param out Pre-allocated output memory pointer
+ * @param inp_flat Flat input data pointer
+ * @param height
+ * @param width
+ * @param depth
+ */
+void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth) {
+  auto inp_data = static_cast<uint16_t*>(inp_flat);
+  auto out_data = static_cast<uintptr_t*>(out);
+  const int stride_x = depth;
+  const int stride_y = stride_x * width;
+
+  for (int cy = 0; cy < height; cy += 8) {
+    for (int cx = 0; cx < width; cx += 4) {
+      for (int cc = 0; cc < depth; cc += 32) {
+        auto block = reinterpret_cast<uint16_t*>(*out_data++);
+        int max_y = std::min(8, height - cy);
+        int max_x = std::min(4, width - cx);
+        int max_c = std::min(32, depth - cc);
+        for (int y = 0; y < max_y; ++y) {
+          for (int x = 0; x < max_x; ++x) {
+            for (int c = 0; c < max_c; ++c) {
+              block[xyc_to_sm_16b(y, x, c)] =
+                  inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)];
+            }
+            for (int c = max_c; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0;
+          }
+          for (int x = max_x; x < 4; ++x) {
+            for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0;
+          }
+        }
+
+        for (int y = max_y; y < 8; ++y)
+          for (int x = 0; x < 4; ++x)
+            for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0;
+      }  // cc
+    }    // cx
+  }      // cy
+}
+
+/**
+ * @brief Convert back from non-contguous layout to a flat layout
+ *
+ * @param out_flat Pre-allocated output memory pointer
+ * @param inp Blockized input data pointer
+ * @param height
+ * @param width
+ * @param depth
+ */
+void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth) {
+  uintptr_t* inp_data = static_cast<uintptr_t*>(inp);
+  uint16_t* out_data = static_cast<uint16_t*>(out_flat);
+  const int stride_x = depth;
+  const int stride_y = stride_x * width;
+
+  for (int cy = 0; cy < height; cy += 8) {
+    for (int cx = 0; cx < width; cx += 4) {
+      for (int cc = 0; cc < depth; cc += 32) {
+        auto block = reinterpret_cast<uint16_t*>(*inp_data);
+        int max_y = std::min(8, height - cy);
+        int max_x = std::min(4, width - cx);
+        int max_c = std::min(32, depth - cc);
+        for (int y = 0; y < max_y; ++y) {
+          for (int x = 0; x < max_x; ++x) {
+            for (int c = 0; c < max_c; ++c) {
+              out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] =
+                  block[xyc_to_sm_16b(y, x, c)];
+            }
+          }
+        }
+
+        inp_data++;
+      }
+    }
+  }
+}
+
+/**
+ * @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below:
+ *
+ * Weights are packed into the below mentioned layout (notation similar to index map):
+ * Since weights cannot be exactly represented into a index map notation, the
+ * base split up is mentioned below with a few gotchas
+ *
+ * lambda h, w, i, o: h//8, w//4, o//32, i//32, h%8, w%4, (i%32)//2, o%32, i%2
+ *
+ * The gotchas are:
+ *  - (w%4) is actually stored in the right to left order, as in 3,2,1,0 instead of 0,1,2,3
+ *  - The h%8 and (w%4) dimensions are not padded up, leading to chunks of different sizes
+ *    (thereby the name "chunked" instead of packed)
+ *  - The thinnest chunk of width is stored first. For example, if a kernel is 5x5, the first
+ *    chunk along the width has size 1 (representing index 0) and then next one has size 4
+ *    representing indices (1,2,3,4)
+ *
+ * @param out_ptr Base pointer table to be filled with the list of pointers to the first addresses
+ * of the "chunked" weights
+ * @param out_ptr_size The number of chunks
+ * @param out Pointer to pre-allocated output memory
+ * @param inp Pointer to flat input data
+ * @param height
+ * @param width
+ * @param idepth
+ * @param odepth
+ */
+void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height,
+                       int width, int idepth, int odepth) {
+  auto inp_data = static_cast<uint16_t*>(inp);
+  auto out_data = static_cast<uintptr_t*>(out);
+  const int stride_i = odepth;
+  const int stride_x = stride_i * idepth;
+  const int stride_y = stride_x * width;
+
+  for (int cy = 0; cy < height; cy += 8) {
+    // In the chunkified tensor, the chunks are ordered in increasing
+    // x order, but they start from the thin one.
+    for (int cx = width - round_up(width, 4); cx < width; cx += 4) {
+      int cx0 = std::max(0, cx);
+      for (int ci = 0; ci < idepth; ci += 32) {
+        for (int co = 0; co < odepth; co += 32) {
+          int max_y = std::min(8, height - cy);
+          int max_x = std::min(4, cx + 4 - cx0);
+          int max_i = std::min(32, idepth - ci);
+          int max_o = std::min(32, odepth - co);
+
+          auto chunk = reinterpret_cast<uint16_t*>(out_data);
+          for (int y = 0; y < max_y; ++y) {
+            for (int x = max_x - 1; x >= 0; --x) {
+              for (int i = 0; i < max_i; ++i) {
+                for (int o = 0; o < max_o; ++o) {
+                  chunk[hwio_to_sm_16b(max_x, y, x, i, o)] =
+                      inp_data[(cy + y) * stride_y + (cx0 + x) * stride_x + (ci + i) * stride_i +
+                               (co + o)];
+                }
+                for (int o = max_o; o < 32; ++o) chunk[hwio_to_sm_16b(max_x, y, x, i, o)] = 0;
+              }
+              for (int i = max_i; i < 32; ++i)
+                for (int o = 0; o < 32; ++o) chunk[hwio_to_sm_16b(max_x, y, x, i, o)] = 0;
+            }
+          }
+
+          *out_ptr++ = chunk;
+          out_data += max_y * max_x * 32 * 32;
+          out_ptr_size--;
+          assert(out_ptr_size >= 0);
+        }
+      }
+    }
+  }
+}
+
+SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat,
+                          bool copy_data) {
+  tvm::runtime::String vtcm_scope = "global.vtcm";
+
+  // Allocate blocks for activations. We will use the block pointers
+  // directly from the allocated area.
+  int n = nhwc_flat->shape[0];
+  int h = round_up(nhwc_flat->shape[1], 8);
+  int w = round_up(nhwc_flat->shape[2], 4);
+  int c = round_up(nhwc_flat->shape[3], 32);
+  int64_t shape_2d[2] = {(n * h * w * c) / (8 * 4 * 32), 8 * 4 * 32};
+  void* nhwc_vtcm =
+      device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope);
+  if (copy_data) {
+    blockize_hwc_16b(nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2],
+                     nhwc_flat->shape[3]);
+  }
+
+  return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm, {n, h / 8, w / 4, c / 32});
+}
+
+SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
+                          int num_chunks, void** ptr_table) {
+  tvm::runtime::String vtcm_scope = "global.vtcm";
+
+  // Allocate one block for filter data. We will need to create our own
+  // pointer table. The reason is that filter chunks cannot be padded
+  // height- or width-wise, so filter chunks may have different sizes.
+  // A filter chunk is a block of size HxWx32x32, where H, W are at most
+  // height and width of a block respectively.
+  int h = hwio_flat->shape[0];
+  int w = hwio_flat->shape[1];
+  int i = round_up(hwio_flat->shape[2], 32);
+  int o = round_up(hwio_flat->shape[3], 32);
+  int64_t shape_1d[] = {h * w * i * o};
+  void* hwio_vtcm =
+      device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope);
+
+  chunkify_hwio_16b(ptr_table, num_chunks, hwio_vtcm, hwio_flat->data, hwio_flat->shape[0],
+                    hwio_flat->shape[1], hwio_flat->shape[2], hwio_flat->shape[3]);
+
+  return SDLTensor<4>(ptr_table, hwio_flat->dtype, hwio_vtcm,
+                      {round_up(h, 8) / 8, round_up(w, 4) / 4, i / 32, o / 32});
+}
+
+int calculate_num_weight_chunks(int64_t* shape_hwio) {
+  int h = round_up(shape_hwio[0], 8);
+  int w = round_up(shape_hwio[1], 4);
+  int i = round_up(shape_hwio[2], 32);
+  int o = round_up(shape_hwio[3], 32);
+
+  return (h * w * i * o) / (8 * 4 * 32 * 32);
+}
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
diff --git a/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc
new file mode 100644
index 000000000000..3b922fa6c2a8
--- /dev/null
+++ b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <dlpack/dlpack.h>
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <ctime>
+#include <functional>
+#include <string>
+#include <tuple>
+
+#include "tvm/runtime/hexagon/ops/conv2d.h"
+
+using namespace tvm::runtime::hexagon;
+
+class HexagonUtilsTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    vtcm_scope = "global.vtcm";
+    device_api = tvm::runtime::DeviceAPI::Get(hexagon_device, false);
+    float16.code = kDLFloat;
+    float16.bits = 16;
+    float16.lanes = 1;
+  }
+
+  void setupTensor(std::tuple<int64_t, int64_t, int64_t, int64_t> shape) {
+    auto [s1, s2, s3, s4] = shape;
+    tensor_shape[0] = s1;
+    tensor_shape[1] = s2;
+    tensor_shape[2] = s3;
+    tensor_shape[3] = s4;
+    int64_t shape_1d[1] = {s1 * s2 * s3 * s4};
+
+    flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope);
+    flat_mem_data = static_cast<uint16_t*>(flat_mem);
+    fill_vals(flat_mem_data, shape_1d[0]);
+
+    flat_tensor.data = flat_mem;
+    flat_tensor.device = hexagon_device;
+    flat_tensor.ndim = 4;
+    flat_tensor.dtype = float16;
+    flat_tensor.shape = tensor_shape;
+    flat_tensor.strides = nullptr;
+    flat_tensor.byte_offset = 0;
+  }
+
+  void TearDownTensor() {
+    if (flat_tensor.data) device_api->FreeDataSpace(hexagon_device, flat_mem);
+  }
+
+  static void fill_vals(uint16_t* arr, int size) {
+    // Testing with uint16 instead of float16 as generating random float16 is not easy within c++
+    uint16_t max = UINT16_MAX;
+    srand(std::time(0));
+    for (int i = 0; i < size; ++i) {
+      arr[i] = static_cast<uint16_t>(std::rand() % max);
+    }
+  }
+
+  static int flattened_idx(int nn, int hh, int ww, int cc, int64_t* shape) {
+    int h = shape[1];
+    int w = shape[2];
+    int c = shape[3];
+    return cc + c * (ww + w * (hh + h * (nn)));
+  }
+
+  DLTensor flat_tensor;
+  void* flat_mem;
+  uint16_t* flat_mem_data;
+  tvm::runtime::DeviceAPI* device_api;
+  tvm::runtime::String vtcm_scope;
+  DLDataType float16;
+  int64_t tensor_shape[4];
+};
+
+// Parameterized test fixture with 4 params representing n, h, w, c
+class HexagonUtilsActivationsBlockizeTest
+    : public HexagonUtilsTest,
+      public ::testing::WithParamInterface<std::tuple<
+          std::tuple<int64_t, int64_t, int64_t, int64_t>, std::tuple<int, int, int, int>>> {};
+
+// TODO (quic-sanirudh): See if we can test with random generated indices
+INSTANTIATE_TEST_SUITE_P(
+    BlockizeDeblockizeTestFixtures, HexagonUtilsActivationsBlockizeTest,
+    ::testing::Combine(::testing::Values(std::make_tuple(1, 14, 7, 60)),
+                       ::testing::Values(std::make_tuple(0, 0, 0, 0),   // first element
+                                         std::make_tuple(0, 7, 3, 31),  // last element
+                                         // Remaining are random element tests
+                                         std::make_tuple(0, 13, 6, 59),
+                                         std::make_tuple(0, 0, 0, 32), std::make_tuple(0, 0, 4, 32),
+                                         std::make_tuple(0, 2, 3, 4), std::make_tuple(0, 5, 6, 7),
+                                         std::make_tuple(0, 10, 4, 12))),
+    [](const ::testing::TestParamInfo<HexagonUtilsActivationsBlockizeTest::ParamType>& info) {
+      // Can use info.param here to generate the test suffix
+      auto indices = std::get<1>(info.param);
+      int h = std::get<1>(indices);
+      int w = std::get<2>(indices);
+      int c = std::get<3>(indices);
+      // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0
+      std::string name =
+          "hwc" + std::to_string(h) + "x" + std::to_string(w) + "x" + std::to_string(c);
+      return name;
+    });
+
+TEST_F(HexagonUtilsActivationsBlockizeTest, prepare_nhwc) {
+  auto shape = std::make_tuple(1, 14, 7, 60);
+  auto [n, h, w, c] = shape;
+  setupTensor(shape);
+
+  // // copy_data is set to false here as there's a separate test for blockize when copy_data
+  // becomes true
+  auto blocked_tensor = prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false);
+
+  EXPECT_EQ(blocked_tensor.shape[0], n);
+  EXPECT_EQ(blocked_tensor.shape[1], round_up(h, 8) / 8);
+  EXPECT_EQ(blocked_tensor.shape[2], round_up(w, 4) / 4);
+  EXPECT_EQ(blocked_tensor.shape[3], round_up(c, 32) / 32);
+
+  TearDownTensor();
+  release(device_api, blocked_tensor);
+}
+
+TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) {
+  auto shape_tuple = std::get<0>(GetParam());
+  setupTensor(shape_tuple);
+  auto [n, h, w, c] = shape_tuple;
+  int64_t shape[] = {n, h, w, c};
+
+  int h_rounded = round_up(h, 8);
+  int w_rounded = round_up(w, 4);
+  int c_rounded = round_up(c, 32);
+  int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 4 * 32), 8 * 4 * 32};
+
+  void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope);
+  int64_t blocked_shape[] = {n, h_rounded / 8, w_rounded / 4, c_rounded / 32};
+  blockize_hwc_16b(blocked_mem, flat_mem, h, w, c);
+
+  std::function<int(int, int, int, int, int64_t*)> flatten =
+      HexagonUtilsActivationsBlockizeTest::flattened_idx;
+
+  auto getBlockedElem = [&blocked_shape, blocked_mem, flatten](int nn, int hh, int ww, int cc) {
+    auto* blocks = static_cast<uintptr_t*>(blocked_mem);
+    int blockIdx = flatten(nn, hh / 8, ww / 4, cc / 32, blocked_shape);
+    uint16_t* block = reinterpret_cast<uint16_t*>(blocks[blockIdx]);
+    return block[xyc_to_sm_16b(hh % 8, ww % 4, cc % 32)];
+  };
+
+  auto [nn, hh, ww, cc] = std::get<1>(GetParam());
+
+  EXPECT_EQ(flat_mem_data[flattened_idx(nn, hh, ww, cc, shape)], getBlockedElem(nn, hh, ww, cc));
+
+  TearDownTensor();
+  device_api->FreeDataSpace(hexagon_device, blocked_mem);
+}
+
+TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) {
+  auto shape_tuple = std::get<0>(GetParam());
+  setupTensor(shape_tuple);
+  auto [n, h, w, c] = shape_tuple;
+  int64_t shape[] = {n, h, w, c};
+  int64_t shape_1d[1] = {n * h * w * c};
+
+  int h_rounded = round_up(h, 8);
+  int w_rounded = round_up(w, 4);
+  int c_rounded = round_up(c, 32);
+  int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 4 * 32), 8 * 4 * 32};
+
+  void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope);
+  blockize_hwc_16b(blocked_mem, flat_mem, h, w, c);
+
+  void* deblocked_flat_mem =
+      device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope);
+  deblockize_hwc_16b(deblocked_flat_mem, blocked_mem, h, w, c);
+  auto* deblocked_flat_mem_data = static_cast<uint16_t*>(deblocked_flat_mem);
+
+  auto [nn, hh, ww, cc] = std::get<1>(GetParam());
+
+  auto idx = flattened_idx(nn, hh, ww, cc, shape);
+  EXPECT_EQ(flat_mem_data[idx], deblocked_flat_mem_data[idx]);
+
+  TearDownTensor();
+  device_api->FreeDataSpace(hexagon_device, blocked_mem);
+  device_api->FreeDataSpace(hexagon_device, deblocked_flat_mem);
+}
+
+class HexagonUtilsWeightsChunkifyTest
+    : public HexagonUtilsTest,
+      public ::testing::WithParamInterface<std::tuple<
+          std::tuple<int64_t, int64_t, int64_t, int64_t>, std::tuple<int, int, int, int>>> {};
+
+INSTANTIATE_TEST_SUITE_P(
+    ChunkifyDechunkifyTests, HexagonUtilsWeightsChunkifyTest,
+    ::testing::Combine(::testing::Values(std::make_tuple(3, 3, 40, 40)),
+                       ::testing::Values(std::make_tuple(0, 0, 0, 0),    // first element
+                                         std::make_tuple(2, 2, 39, 39),  // Last element
+                                         // Remaining are random element tests
+                                         std::make_tuple(1, 1, 28, 33),
+                                         std::make_tuple(1, 2, 8, 38),
+                                         std::make_tuple(1, 0, 12, 15),
+                                         std::make_tuple(2, 1, 9, 22), std::make_tuple(0, 2, 6, 7),
+                                         std::make_tuple(1, 2, 3, 4))),
+    [](const ::testing::TestParamInfo<HexagonUtilsWeightsChunkifyTest::ParamType>& info) {
+      // Can use info.param here to generate the test suffix
+      auto indices = std::get<1>(info.param);
+      int h = std::get<0>(indices);
+      int w = std::get<1>(indices);
+      int i = std::get<2>(indices);
+      int o = std::get<3>(indices);
+      // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0
+      std::string name = "hwio" + std::to_string(h) + std::to_string(w) + "x" + std::to_string(i) +
+                         "x" + std::to_string(o);
+      return name;
+    });
+
+TEST_F(HexagonUtilsWeightsChunkifyTest, calculate_num_weight_chunks) {
+  int64_t shape[] = {3, 3, 40, 40};
+  int num_wgt_chunks = calculate_num_weight_chunks(shape);
+  EXPECT_EQ(num_wgt_chunks, 4);
+}
+
+TEST_F(HexagonUtilsWeightsChunkifyTest, prepare_hwio) {
+  int64_t shape[] = {3, 3, 40, 40};
+  auto [h, w, i, o] = shape;
+  auto shape_tuple = std::make_tuple(h, w, i, o);
+  setupTensor(shape_tuple);
+
+  // copy_data is set to false here as there's a separate test for blockize when copy_data becomes
+  // true
+  auto num_wgt_chunks = calculate_num_weight_chunks(shape);
+  auto wgt_ptr_table =
+      reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
+  auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table);
+
+  EXPECT_EQ(chunked_tensor.shape[0], round_up(h, 8) / 8);
+  EXPECT_EQ(chunked_tensor.shape[1], round_up(w, 4) / 4);
+  EXPECT_EQ(chunked_tensor.shape[2], round_up(i, 32) / 32);
+  EXPECT_EQ(chunked_tensor.shape[3], round_up(o, 32) / 32);
+
+  release(device_api, chunked_tensor);
+  TearDownTensor();
+}
+
+TEST_P(HexagonUtilsWeightsChunkifyTest, chunkify_hwio_16b) {
+  auto [shape_tuple, indices] = GetParam();
+  auto [h, w, i, o] = shape_tuple;
+  setupTensor(shape_tuple);
+  int64_t shape[] = {h, w, i, o};
+
+  auto num_wgt_chunks = calculate_num_weight_chunks(shape);
+  auto wgt_ptr_table =
+      reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
+  auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table);
+
+  int rd = w - (w % 4);  // round down by 4 for width
+  int thin_w = w - rd;
+
+  auto getChunkedElem = [thin_w, chunked_tensor](int hh, int ww, int ii, int oo) {
+    int fcw = 0;
+    if (ww >= thin_w) {
+      fcw = (ww - thin_w) / 4 + 1;
+      ww = (ww - thin_w) % 4;
+    }
+    auto chunk = hwio_at(chunked_tensor, hh / 8, fcw, ii / 32, oo / 32);
+    auto chunk_uint16 = reinterpret_cast<uint16_t*>(chunk);
+    return chunk_uint16[hwio_to_sm_16b(thin_w, hh % 8, ww, ii % 32, oo % 32)];
+  };
+
+  auto [hh, ww, ii, oo] = indices;
+
+  EXPECT_EQ(flat_mem_data[flattened_idx(hh, ww, ii, oo, shape)], getChunkedElem(hh, ww, ii, oo));
+  release(device_api, chunked_tensor);
+}
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
new file mode 100644
index 000000000000..e8efdb369590
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
@@ -0,0 +1,248 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test conv2d HVX intrinsic implementation"""
+
+import numpy as np
+
+import tvm
+import tvm.contrib.hexagon
+from tvm.topi.testing import conv2d_nhwc_python
+
+
+def build_conv2d(target):
+    """Build and the return the conv2d module that calls the intrinsic implementation"""
+    act_n, act_h, act_w, act_c = (
+        tvm.te.var("act_n"),
+        tvm.te.var("act_h"),
+        tvm.te.var("act_w"),
+        tvm.te.var("act_c"),
+    )
+    filt_h, filt_w, filt_o = tvm.te.var("filt_h"), tvm.te.var("fw"), tvm.te.var("filt_o")
+    off_l, off_t = tvm.te.var("off_l"), tvm.te.var("off_t")
+    stride_h, stride_w = tvm.te.var("stride_h"), tvm.te.var("stride_w")
+
+    act_flat = tvm.te.placeholder(
+        shape=(act_n, act_h, act_w, act_c), dtype="float16", name="act_flat"
+    )
+    wgt_flat = tvm.te.placeholder(
+        shape=(filt_h, filt_w, act_c, filt_o), dtype="float16", name="wgt_flat"
+    )
+
+    out_flat = tvm.te.extern(
+        shape=(act_n, (act_h - filt_h) // stride_h + 1, (act_w - filt_w) // stride_w + 1, filt_o),
+        inputs=[act_flat, wgt_flat],
+        fcompute=lambda ins, outs: tvm.tir.call_cpacked(
+            "conv2d_packed_fp16",  # Function from TVM runtime
+            ins[0],
+            ins[1],
+            off_t,
+            off_l,
+            stride_h,
+            stride_w,
+            outs[0],
+            tvm.runtime.const(0),  # resource_handle (unused)
+        ),
+        dtype="float16",
+    )
+
+    s = tvm.te.create_schedule(out_flat.op)
+
+    func_name = "extern_conv"
+    with tvm.transform.PassContext(opt_level=3):
+        module = tvm.build(
+            s,
+            [act_flat, wgt_flat, off_t, off_l, stride_h, stride_w, out_flat],
+            target=target,
+            name=func_name,
+        )
+
+    return module
+
+
+shape_parameters = [
+    (
+        (1, 8, 4, 3),
+        (3, 3, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 10, 14, 3),
+        (3, 3, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 14, 6, 3),
+        (3, 3, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 14, 6, 3),
+        (3, 3, 3, 64),
+        (1, 1),
+    ),
+    (
+        (1, 14, 6, 3),
+        (5, 5, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 8, 8, 3),
+        (2, 2, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 14, 6, 64),
+        (3, 3, 64, 3),
+        (1, 1),
+    ),
+    (
+        (1, 4, 4, 40),
+        (3, 3, 40, 3),
+        (1, 1),
+    ),
+    (
+        (1, 4, 4, 3),
+        (3, 3, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 5, 5, 3),
+        (3, 3, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 6, 6, 3),
+        (3, 3, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 7, 7, 3),
+        (3, 3, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 8, 8, 3),
+        (3, 3, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 8, 8, 3),
+        (5, 5, 3, 3),
+        (1, 1),
+    ),
+    (
+        (1, 8, 8, 64),
+        (2, 2, 64, 64),
+        (1, 1),
+    ),
+    (
+        (1, 8, 4, 3),
+        (3, 3, 3, 3),
+        (2, 2),
+    ),
+    (
+        (1, 14, 6, 3),
+        (3, 3, 3, 64),
+        (2, 2),
+    ),
+    (
+        (1, 14, 6, 3),
+        (5, 5, 3, 3),
+        (2, 2),
+    ),
+    (
+        (1, 8, 8, 3),
+        (2, 2, 3, 3),
+        (2, 2),
+    ),
+]
+
+
+def gen_config(params):
+    """Utility function to generate useful ids for shape_parameters"""
+
+    dims = lambda vals: "x".join(map(str, vals))
+
+    config = {}
+    for param in params:
+        act_shape, wgt_shape, inp_stride = param
+        name = f"nhwc{dims(act_shape)}-hwio{dims(wgt_shape)}-stride{dims(inp_stride)}"
+        config[name] = param
+
+    return config
+
+
+class TestConv2dIntrin:
+    """Test Conv2d Intrin class"""
+
+    config = gen_config(shape_parameters)
+    act_shape, wgt_shape, inp_stride = tvm.testing.parameters(*config.values(), ids=config.keys())
+    inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"])
+
+    @tvm.testing.requires_hexagon
+    def test_conv2d(self, act_shape, wgt_shape, inp_stride, inp_offset, hexagon_session):
+        """Test conv2d intrinsic implementation"""
+        assert act_shape[3] == wgt_shape[2]
+
+        target_hexagon = tvm.target.hexagon("v69")
+        target = tvm.target.Target(target_hexagon, host=target_hexagon)
+
+        # Currently, input offset does not affect the output shape
+        def get_out_shape(ash, wsh, inp_stride):
+            assert ash[3] == wsh[2]
+            osh = (
+                ash[0],
+                (ash[1] - wsh[0]) // inp_stride[0] + 1,
+                (ash[2] - wsh[1]) // inp_stride[1] + 1,
+                wsh[3],
+            )
+            assert tvm.tir.all([x > 0 for x in osh])
+            return osh
+
+        act = np.random.rand(*act_shape).astype("float16")
+        wgt = np.random.rand(*wgt_shape).astype("float16")
+
+        module = build_conv2d(target)
+
+        mod = hexagon_session.load_module(module)
+        output = tvm.nd.array(
+            np.zeros(get_out_shape(act_shape, wgt_shape, inp_stride), dtype="float16"),
+            device=hexagon_session.device,
+        )
+        mod(
+            tvm.nd.array(act, device=hexagon_session.device),
+            tvm.nd.array(wgt, device=hexagon_session.device),
+            inp_offset[0],  # off_t
+            inp_offset[1],  # off_l
+            inp_stride[0],  # stride_height
+            inp_stride[1],  # stride_width
+            output,
+        )
+
+        out = output.numpy()
+
+        # Generate reference output and compare:
+        ref_out = conv2d_nhwc_python(
+            act.astype("float32"), wgt.astype("float32"), stride=inp_stride, padding="VALID"
+        ).astype("float16")
+
+        tvm.testing.assert_allclose(out, ref_out, rtol=5e-2, atol=5e-2)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 12223983422868bbbc5444f66d175aeb9318b71f Mon Sep 17 00:00:00 2001
From: Dhruv Chauhan <89972057+dchauhan-arm@users.noreply.github.com>
Date: Mon, 12 Sep 2022 21:03:56 +0100
Subject: [PATCH 152/704] [TFLite] Support quantized GREATER op in TFLite
 frontend (#12754)

Support GREATER quantization operation conversion as part of issue #9187 Continuation of #11519.
---
 python/tvm/relay/frontend/tflite.py          | 19 ++++----
 tests/python/frontend/tflite/test_forward.py | 49 +++++++++++---------
 2 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index c38191b389c9..6c68230e0ecc 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -1291,7 +1291,13 @@ def convert_square(self, op):
 
         return out
 
-    def _convert_elemwise(self, relay_op, op, ignore_qnn_params=False):
+    def _convert_elemwise(
+        self,
+        relay_op,
+        op,
+        ignore_qnn_params=False,
+        comparison_op=False,
+    ):
         """Generic method to Convert TFLite elemwise"""
         try:
             from tflite.AddOptions import AddOptions
@@ -1316,7 +1322,7 @@ def _convert_elemwise(self, relay_op, op, ignore_qnn_params=False):
 
         # TFLite format demands equal scale and zero_point tuple parameters for some operations
         # to allow us to use non-quantized operation instead of quantized if ignore_qnn_params=True
-        if ignore_qnn_params:
+        if ignore_qnn_params and not comparison_op:
             assert (
                 lhs_tensor.qnn_params
                 and self.has_same_qnn_params(lhs_tensor, output_tensor)
@@ -1431,12 +1437,7 @@ def convert_minimum(self, op):
 
     def convert_greater(self, op):
         """Convert TFLite GREATER"""
-        # Check if the input tensor is quantized, call QNN op
-        if self.is_quantized(op):
-            raise tvm.error.OpNotImplemented(
-                "TFlite quantized GREATER operator is not supported yet."
-            )
-        return self._convert_elemwise(_op.greater, op)
+        return self._convert_elemwise(_op.greater, op, self.is_quantized(op), comparison_op=True)
 
     def convert_squared_difference(self, op):
         """Convert TFLite SQUARED DIFFERENCE"""
@@ -1475,7 +1476,7 @@ def convert_less_equal(self, op):
 
     def convert_equal(self, op):
         """Convert TFLite EQUAL"""
-        return self._convert_elemwise(_op.equal, op, self.is_quantized(op))
+        return self._convert_elemwise(_op.equal, op, self.is_quantized(op), comparison_op=True)
 
     def convert_not_equal(self, op):
         """Convert TFLite NOT_EQUAL"""
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 7267b725483d..18045b8e8365 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -2254,6 +2254,7 @@ def _test_elemwise(
     quantized=False,
     qnn_op=None,
     same_qnn_params=False,
+    comparison_op=False,
 ):
     """One iteration of elemwise"""
 
@@ -2298,7 +2299,7 @@ def __test_elemwise(in_data):
                 if x[0] is not None
             }
 
-            if math_op is math_ops.equal:
+            if comparison_op:
                 out = math_op(inq_data[0], inq_data[1])
                 out = with_fused_activation_function(out, fused_activation_function)
 
@@ -2307,6 +2308,9 @@ def __test_elemwise(in_data):
                     [x + ":0" for x in input_range.keys()],
                     [x[1] for x in zip(in_data, inq_data) if x[0] is not None],
                     [out],
+                    quantized=True,
+                    input_range=input_range,
+                    experimental_new_converter=same_qnn_params,
                 )
             else:
                 out = math_op(inq_data[0], inq_data[1])
@@ -2314,6 +2318,7 @@ def __test_elemwise(in_data):
                 out = tf.quantization.fake_quant_with_min_max_args(
                     out, min=out_min, max=out_max, name="out"
                 )
+
                 # Note same_qnn_params uses experimental_new_converter as toco failed
                 compare_tflite_with_tvm(
                     [x[1] for x in zip(in_data, data) if x[0] is not None],
@@ -2440,9 +2445,17 @@ def _test_minimum(data, fused_activation_function=None, quantized=False, qnn_op=
 # -------
 
 
-def _test_greater(data):
+def _test_greater(data, fused_activation_function=None, quantized=False, qnn_op=None):
     """One iteration of greater"""
-    return _test_elemwise(math_ops.greater, data)
+    return _test_elemwise(
+        math_ops.greater,
+        data,
+        fused_activation_function,
+        quantized,
+        qnn_op,
+        same_qnn_params=True,
+        comparison_op=True,
+    )
 
 
 #######################################################################
@@ -2489,6 +2502,7 @@ def _test_equal(data, fused_activation_function=None, quantized=False, qnn_op=No
         quantized,
         qnn_op,
         same_qnn_params=True,
+        comparison_op=True,
     )
 
 
@@ -2555,25 +2569,14 @@ def _test_forward_elemwise(testop):
 
 
 def _test_forward_elemwise_quantized(testop):
-    if testop is not _test_equal:
-        testop(
-            [
-                np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.uint8),
-                np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.uint8),
-            ],
-            quantized=True,
-            qnn_op=testop,
-        )
-    else:
-        # no need for fake_quant to hold tensors in float32 until conversion
-        testop(
-            [
-                np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.float32),
-                np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.float32),
-            ],
-            quantized=True,
-            qnn_op=testop,
-        )
+    testop(
+        [
+            np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.uint8),
+            np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.uint8),
+        ],
+        quantized=True,
+        qnn_op=testop,
+    )
 
 
 def _test_elemwise_qnn_out_range(qnn_op):
@@ -2585,6 +2588,7 @@ def _test_elemwise_qnn_out_range(qnn_op):
         _test_maximum: (-112, 111),
         _test_minimum: (-128, 127),
         _test_equal: (-150, 150),
+        _test_greater: (-150, 150),
     }
 
     return qnn_out_range[qnn_op]
@@ -2615,6 +2619,7 @@ def test_all_elemwise():
     _test_forward_elemwise(_test_minimum)
     _test_forward_elemwise_quantized(_test_minimum)
     _test_forward_elemwise(_test_greater)
+    _test_forward_elemwise_quantized(_test_greater)
     _test_forward_elemwise(_test_squared_difference)
     _test_forward_elemwise(_test_greater_equal)
     _test_forward_elemwise(_test_less)

From 9671aee942503815ad2a586406eef11391287ee5 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 12 Sep 2022 15:31:52 -0500
Subject: [PATCH 153/704] [Hexagon] Validate 2-d physical shapes for
 TIR-derived schedules (#12662)

Previously, the test cases only tested TE-based schedules.  This
commit runs the same tests for equivalent TIR-based schedules as
well.  This is intended to catch Hexagon-specific regressions, such as
the one resolved in https://github.com/apache/tvm/pull/12652.
---
 .../test_hexagon/test_2d_physical_buffers.py  | 59 ++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 tests/python/contrib/test_hexagon/test_2d_physical_buffers.py

diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
old mode 100644
new mode 100755
index cebb36edc35d..cba6ddc4433a
--- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
+++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
@@ -41,6 +41,8 @@
 # there as well
 # pylint: disable=invalid-name
 
+schedule_type = tvm.testing.parameter("TE", "TIR")
+
 dtype = tvm.testing.parameter("int8")
 batch_size = tvm.testing.parameter(
     16,
@@ -198,6 +200,7 @@ def output_shape(self, input_shape):
     @tvm.testing.fixture
     def schedule_args(
         self,
+        schedule_type,
         input_shape,
         dtype,
         input_layout,
@@ -206,12 +209,39 @@ def schedule_args(
         working_scope,
     ):
         """Create and return the schedule and input args after applying layout transform"""
+        if schedule_type == "TE":
+
+            return self._te_schedule_args(
+                input_shape, dtype, input_layout, output_layout, working_layout, working_scope
+            )
+        elif schedule_type == "TIR":
+            return self._tir_schedule_args(
+                input_shape, dtype, input_layout, output_layout, working_layout, working_scope
+            )
+
+        else:
+            raise ValueError(f"Unknown schedule type: {schedule_type}")
+
+    def _te_tensors(self, input_shape, dtype):
         input_tensor = te.placeholder(input_shape, dtype, name="Input")
         output_tensor = te.compute(
             shape=input_tensor.shape,
             fcompute=lambda *indices: (2 * input_tensor[indices]).astype(dtype),
             name="Output",
         )
+        return input_tensor, output_tensor
+
+    def _te_schedule_args(
+        self,
+        input_shape,
+        dtype,
+        input_layout,
+        output_layout,
+        working_layout,
+        working_scope,
+    ):
+        input_tensor, output_tensor = self._te_tensors(input_shape, dtype)
+
         schedule = te.create_schedule(output_tensor.op)
 
         write_cache = schedule.cache_write(output_tensor, working_scope)
@@ -235,6 +265,33 @@ def apply_transform(tensor, layout):
 
         return [schedule, [input_tensor, output_tensor]]
 
+    def _tir_schedule_args(
+        self, input_shape, dtype, input_layout, output_layout, working_layout, working_scope
+    ):
+        tensors = self._te_tensors(input_shape, dtype)
+
+        sch = tvm.tir.Schedule(te.create_prim_func(tensors))
+
+        cache_read_block = sch.cache_read("Output", 0, working_scope)
+        cache_write_block = sch.cache_write("Output", 0, working_scope)
+
+        def apply_transform(block, buffer_name, layout):
+            if layout == "nhwc":
+                pass
+            elif layout == "nchw-8h8w32c-1d":
+                sch.transform_layout(block, buffer_name, layout_transform_1d)
+            elif layout == "nchw-8h8w32c-2d":
+                sch.transform_layout(block, buffer_name, layout_transform_2d)
+            else:
+                raise RuntimeError(f"Unexpected layout '{layout}'")
+
+        apply_transform(cache_read_block, ("read", 0), input_layout)
+        apply_transform(cache_read_block, ("write", 0), working_layout)
+        apply_transform(cache_write_block, ("read", 0), working_layout)
+        apply_transform(cache_write_block, ("write", 0), output_layout)
+
+        return [sch.mod]
+
     @tvm.testing.fixture
     def ir_module(self, schedule_args):
         # If the two buffers are accessed with the same indices, CSE
@@ -272,7 +329,7 @@ def test_cache_shape(self, ir_module, input_layout, working_layout, output_layou
                 "Input.global.vtcm": working_layout,
                 "Output.global.vtcm": working_layout,
                 "Output": output_layout,
-            }[buffer.name]
+            }[buffer.name.replace("_", ".")]
 
             expected_physical_dimensions = {
                 "nhwc": 1,

From 4d2766409f1b95504aac171649367c2df2813029 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Mon, 12 Sep 2022 15:06:16 -0800
Subject: [PATCH 154/704] [AutoTVM] Fix `None` feature in AutoTVM tuning
 (#12760)

This PR introduces a couple of fixes to make AutoTVM working more
robustly:
- Fixed a very rarecase that `None` could pop up in AutoTVM features;
- Fixed a misuse of `ARGS` in the testing script;
- Fixed the filename for caching.
---
 python/tvm/autotvm/testing/tune_relay.py           | 13 +++++++------
 python/tvm/autotvm/tuner/xgboost_cost_model.py     |  7 +++----
 python/tvm/meta_schedule/testing/relay_workload.py |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/python/tvm/autotvm/testing/tune_relay.py b/python/tvm/autotvm/testing/tune_relay.py
index e4745963741f..743127ec1ded 100644
--- a/python/tvm/autotvm/testing/tune_relay.py
+++ b/python/tvm/autotvm/testing/tune_relay.py
@@ -139,12 +139,6 @@ def _parse_args():
         tracker_key=parsed.rpc_key,
         session_timeout_sec=600,
     )
-    if ARGS.target.kind.name != "llvm" and ARGS.graph_tuner:
-        raise ValueError("GraphTuner only supports llvm target")
-    if ARGS.target.kind.name != "llvm" and ARGS.cpu_flush:
-        raise ValueError("cpu_flush only supports llvm target")
-    if ARGS.target.kind.name == "llvm" and not ARGS.cpu_flush:
-        warnings.warn("cpu_flush is not enabled for llvm target")
     return parsed
 
 
@@ -152,6 +146,13 @@ def _parse_args():
 
 
 def main():
+    if ARGS.target.kind.name != "llvm" and ARGS.graph_tuner:
+        raise ValueError("GraphTuner only supports llvm target")
+    if ARGS.target.kind.name != "llvm" and ARGS.cpu_flush:
+        raise ValueError("cpu_flush only supports llvm target")
+    if ARGS.target.kind.name == "llvm" and not ARGS.cpu_flush:
+        warnings.warn("cpu_flush is not enabled for llvm target")
+
     log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json")
     graph_opt_sch_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}_graph_opt.log")
     measure_option = autotvm.measure_option(
diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py
index d4942ce6a4ca..6fa04f336f10 100644
--- a/python/tvm/autotvm/tuner/xgboost_cost_model.py
+++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py
@@ -21,12 +21,11 @@
 import time
 
 import numpy as np
-
 from tvm.contrib.popen_pool import PopenPoolExecutor, StatusKind
 
 from .. import feature
 from ..utils import get_rank
-from .metric import max_curve, recall_curve, cover_curve
+from .metric import cover_curve, max_curve, recall_curve
 from .model_based_tuner import CostModel, FeatureCache
 
 xgb = None
@@ -346,7 +345,7 @@ def _get_feature(self, indexes):
         ret = np.empty((len(indexes), feature_len), dtype=np.float32)
         for i, ii in enumerate(indexes):
             t = fea_cache[ii]
-            if t.shape[0] < feature_len:
+            if t is not None and t.shape[0] < feature_len:
                 t = np.pad(t, (0, feature_len - t.shape[0]))
             ret[i, :] = t if t is not None else 0
         return ret
@@ -449,8 +448,8 @@ def custom_callback(
 ):
     """callback function for xgboost to support multiple custom evaluation functions"""
     # pylint: disable=import-outside-toplevel
-    from xgboost.core import EarlyStopException
     from xgboost.callback import _fmt_metric
+    from xgboost.core import EarlyStopException
 
     try:
         from xgboost.training import aggcv
diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py
index f4f6336df33f..98bb99512020 100644
--- a/python/tvm/meta_schedule/testing/relay_workload.py
+++ b/python/tvm/meta_schedule/testing/relay_workload.py
@@ -230,7 +230,7 @@ def get_network(
     inputs: Tuple[str, List[int], str]
     params_bytearray: bytearray
 
-    filename = f'relay-{name}-{",".join(str(i) for i in input_shape)}.json'
+    filename = f'relay-{name}-{layout}-{",".join(str(i) for i in input_shape)}.json'
     cached = _load_cache(cache_dir, filename)
     if cached is None:
         with multiprocessing.Pool(processes=1) as pool:

From a23b71ce1e3011be6b8e6ca5162b023956358911 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Mon, 12 Sep 2022 15:42:40 -0800
Subject: [PATCH 155/704] [MetaSchedule][Test] Migrate AddRFactor to SEqual
 (#12758)

This PR migrates the usage of `check_trace` to `check_sketch`,
which prefers structural equality of TIRs insteda of string equalty
of traces.
---
 .../meta_schedule/testing/schedule_rule.py    |  16 +-
 python/tvm/tir/schedule/testing.py            |   8 +-
 .../schedule_rule/add_rfactor.cc              |   5 +-
 src/tir/schedule/primitive/sampling.cc        |   4 +-
 ...meta_schedule_schedule_rule_add_rfactor.py | 142 ++++++++++++------
 5 files changed, 109 insertions(+), 66 deletions(-)

diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py
index 46df4b95ce07..b08db0811dd3 100644
--- a/python/tvm/meta_schedule/testing/schedule_rule.py
+++ b/python/tvm/meta_schedule/testing/schedule_rule.py
@@ -18,7 +18,6 @@
 from typing import List, Union
 
 from tvm.meta_schedule.schedule_rule import (
-    AddRFactor,
     AutoBind,
     AutoInline,
     CrossThreadReduction,
@@ -28,7 +27,9 @@
     ReuseType,
     ScheduleRule,
 )
-from tvm.meta_schedule.schedule_rule.multi_level_tiling import MultiLevelTilingTensorCore
+from tvm.meta_schedule.schedule_rule.multi_level_tiling import (
+    MultiLevelTilingTensorCore,
+)
 from tvm.target import Target
 
 
@@ -64,13 +65,6 @@ def auto_inline(target: Target) -> ScheduleRule:
     raise NotImplementedError(f"{target.kind.name} is not supported")
 
 
-def add_rfactor(target: Target) -> ScheduleRule:
-    """Default schedule rules for with add_rfactor"""
-    if target.kind.name == "llvm":
-        return AddRFactor(max_jobs_per_core=16, max_innermost_factor=64)
-    raise NotImplementedError(f"{target.kind.name} is not supported")
-
-
 def cross_thread_reduction(target: Target) -> ScheduleRule:
     """Default schedule rules for with cross-thread reduction"""
     if target.kind.name == "cuda":
@@ -131,7 +125,9 @@ def multi_level_tiling_tensor_core(
         trans_b = [trans_b]
 
     if target.kind.name == "cuda":
-        from tvm.tir.tensor_intrin import cuda  # pylint: disable=import-outside-toplevel
+        from tvm.tir.tensor_intrin import (  # pylint: disable=import-outside-toplevel
+            cuda,
+        )
 
         intrin_groups = [
             cuda.get_wmma_intrin_group(write_reuse_scope, _in_dtype, _out_dtype, _trans_b)
diff --git a/python/tvm/tir/schedule/testing.py b/python/tvm/tir/schedule/testing.py
index 3689f756e83c..538cc6e143ee 100644
--- a/python/tvm/tir/schedule/testing.py
+++ b/python/tvm/tir/schedule/testing.py
@@ -15,12 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 """Testing utilities for the TensorIR schedule API"""
-from typing import Union, Sequence
+from typing import Sequence, Union
 
 import tvm
-from tvm.ir import IRModule, structural_equal
+from tvm.ir import IRModule, assert_structural_equal
 from tvm.tir import PrimFunc
-from tvm.tir.schedule import Trace, Schedule
+from tvm.tir.schedule import Schedule, Trace
 
 
 def verify_trace_roundtrip(
@@ -70,7 +70,7 @@ def verify_trace_roundtrip(
         assert text_format in ("json", "python"), f"Unknown text format: {text_format}"
 
     # Step 2. Verify that the round-trip produced the same scheduling
-    assert structural_equal(new_sch.mod, sch.mod)
+    assert_structural_equal(new_sch.mod, sch.mod)
 
     # Step 3. Check the consistency of the text format between the old and new traces
     py_repr = "\n".join(trace.as_python())
diff --git a/src/meta_schedule/schedule_rule/add_rfactor.cc b/src/meta_schedule/schedule_rule/add_rfactor.cc
index 5ef2ac3aad36..cf87f24ac233 100644
--- a/src/meta_schedule/schedule_rule/add_rfactor.cc
+++ b/src/meta_schedule/schedule_rule/add_rfactor.cc
@@ -90,8 +90,7 @@ Array<tir::Schedule> AddRFactorNode::Apply(const tir::Schedule& sch, const tir::
 
   // Split the fused reduction loop.
   Array<tir::ExprRV> factors = sch->SamplePerfectTile(fused_reduce_loop, 2, max_innermost_factor);
-  const Array<tir::LoopRV>& split_loops =
-      sch->Split(fused_reduce_loop, {factors.begin(), factors.end()});
+  Array<tir::LoopRV> split_loops = sch->Split(fused_reduce_loop, {factors.begin(), factors.end()});
 
   Array<tir::Schedule> res;
   for (const tir::LoopRV& split_loop : split_loops) {
@@ -104,7 +103,7 @@ Array<tir::Schedule> AddRFactorNode::Apply(const tir::Schedule& sch, const tir::
 
       // Annotate that the rfactor block, which is now the producer of the original block, needs to
       // be considered by the rule Random-Compute-Location.
-      sch_tmp->Annotate(block_rv, tir::attr::meta_schedule_random_compute_producer, Bool(true));
+      sch_tmp->Annotate(block_rv, tir::attr::meta_schedule_random_compute_producer, Integer(1));
       res.push_back(sch_tmp);
     } catch (const tvm::runtime::Error& e) {
     }
diff --git a/src/tir/schedule/primitive/sampling.cc b/src/tir/schedule/primitive/sampling.cc
index b1001a7f9455..ec12b045d3f0 100644
--- a/src/tir/schedule/primitive/sampling.cc
+++ b/src/tir/schedule/primitive/sampling.cc
@@ -338,7 +338,9 @@ std::vector<int64_t> SamplePerfectTile(
   } else {
     // Case 3. Use fresh new sampling result
     result = SamplePerfectTile(rand_state, *extent, n_splits, max_innermost_factor);
-    ICHECK_LE(result.back(), max_innermost_factor);
+    if (max_innermost_factor != -1) {
+      ICHECK_LE(result.back(), max_innermost_factor);
+    }
   }
   *decision = support::AsArray<int64_t, Integer>(result);
   return result;
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py b/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py
index a39c8aea5fb6..17f42654fcf7 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py
@@ -15,62 +15,108 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
-
-from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
+from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing import te_workload
-from tvm.meta_schedule.testing.schedule_rule import add_rfactor
-from tvm.meta_schedule.testing.space_generation import check_trace
-from tvm.meta_schedule.tune_context import TuneContext
+from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.script import tir as T
 from tvm.target import Target
-from tvm.te.operation import create_prim_func
+from tvm.te import create_prim_func
 
 
-def _create_context(mod, target, rule) -> TuneContext:
-    ctx = TuneContext(
-        mod=mod,
-        target=target,
-        space_generator=PostOrderApply(),
-        sch_rules=[rule],
-        task_name="test",
-    )
-    return ctx
+def test_cpu_matmul():
+    @T.prim_func
+    def cpu_matmul_0(
+        A: T.Buffer[(4, 512), "float32"],
+        B: T.Buffer[(512, 4), "float32"],
+        C: T.Buffer[(4, 4), "float32"],
+    ) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        for i0, i1, i2 in T.grid(4, 4, 512):
+            with T.block("C"):
+                i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+                T.reads(A[i, k], B[k, j])
+                T.writes(C[i, j])
+                with T.init():
+                    C[i, j] = T.float32(0)
+                C[i, j] = C[i, j] + A[i, k] * B[k, j]
 
+    @T.prim_func
+    def cpu_matmul_1(
+        A: T.Buffer[(4, 512), "float32"],
+        B: T.Buffer[(512, 4), "float32"],
+        C: T.Buffer[(4, 4), "float32"],
+    ) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        C_rf = T.alloc_buffer([4, 4, 128], dtype="float32")
+        for i0, i1, i2_0, i2_1 in T.grid(4, 4, 4, 128):
+            with T.block("C_rf"):
+                vi2_1, i, j, vi2_0 = T.axis.remap("SSSR", [i2_1, i0, i1, i2_0])
+                T.reads(A[i, vi2_0 * 128 + vi2_1], B[vi2_0 * 128 + vi2_1, j])
+                T.writes(C_rf[i, j, vi2_1])
+                with T.init():
+                    C_rf[i, j, vi2_1] = T.float32(0)
+                C_rf[i, j, vi2_1] = (
+                    C_rf[i, j, vi2_1] + A[i, vi2_0 * 128 + vi2_1] * B[vi2_0 * 128 + vi2_1, j]
+                )
+        for i0, i1, i2_1 in T.grid(4, 4, 128):
+            with T.block("C"):
+                vi2_1, i, j = T.axis.remap("RSS", [i2_1, i0, i1])
+                T.reads(C_rf[i, j, vi2_1])
+                T.writes(C[i, j])
+                T.block_attr({"meta_schedule.random_compute_producer": 1})
+                with T.init():
+                    C[i, j] = T.float32(0)
+                C[i, j] = C[i, j] + C_rf[i, j, vi2_1]
 
-def test_cpu_matmul():
-    expected = [
-        [],
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            "l1, l2, l3 = sch.get_loops(block=b0)",
-            "v4, v5 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)",
-            "l6, l7 = sch.split(loop=l3, factors=[v4, v5], preserve_unit_iters=True)",
-            "b8 = sch.rfactor(loop=l7, factor_axis=2)",
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.random_compute_producer", ann_val=1)',
-        ],
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            "l1, l2, l3 = sch.get_loops(block=b0)",
-            "v4, v5 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)",
-            "l6, l7 = sch.split(loop=l3, factors=[v4, v5], preserve_unit_iters=True)",
-            "b8 = sch.rfactor(loop=l6, factor_axis=2)",
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.random_compute_producer", ann_val=1)',
-        ],
+    @T.prim_func
+    def cpu_matmul_2(
+        A: T.Buffer[(4, 512), "float32"],
+        B: T.Buffer[(512, 4), "float32"],
+        C: T.Buffer[(4, 4), "float32"],
+    ) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        C_rf = T.alloc_buffer([4, 4, 4], dtype="float32")
+        for i0, i1, i2_0, i2_1 in T.grid(4, 4, 4, 128):
+            with T.block("C_rf"):
+                vi2_0, i, j, vi2_1 = T.axis.remap("SSSR", [i2_0, i0, i1, i2_1])
+                T.reads(A[i, vi2_0 * 128 + vi2_1], B[vi2_0 * 128 + vi2_1, j])
+                T.writes(C_rf[i, j, vi2_0])
+                with T.init():
+                    C_rf[i, j, vi2_0] = T.float32(0)
+                C_rf[i, j, vi2_0] = (
+                    C_rf[i, j, vi2_0] + A[i, vi2_0 * 128 + vi2_1] * B[vi2_0 * 128 + vi2_1, j]
+                )
+        for i0, i1, i2_0 in T.grid(4, 4, 4):
+            with T.block("C"):
+                vi2_0, i, j = T.axis.remap("RSS", [i2_0, i0, i1])
+                T.reads(C_rf[i, j, vi2_0])
+                T.writes(C[i, j])
+                T.block_attr({"meta_schedule.random_compute_producer": 1})
+                with T.init():
+                    C[i, j] = T.float32(0)
+                C[i, j] = C[i, j] + C_rf[i, j, vi2_0]
+
+    decision_0 = []  # type: ignore
+    decision_1 = [
+        ("SamplePerfectTile", [4, 128]),
+    ]
+    decision_2 = [
+        ("SamplePerfectTile", [4, 128]),
     ]
-    target = Target("llvm --num-cores=32")
-    ctx = _create_context(
-        create_prim_func(
-            te_workload.matmul(
-                n=4,
-                m=4,
-                k=512,
-            )
-        ),
-        target=target,
-        rule=add_rfactor(target=target),
+    mod = create_prim_func(te_workload.matmul(n=4, m=4, k=512))
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("llvm --num-cores=32"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[ms.schedule_rule.AddRFactor()],
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[cpu_matmul_0, cpu_matmul_1, cpu_matmul_2],
+        expected_decisions=[decision_0, decision_1, decision_2],
     )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 3
-    check_trace(spaces, expected)
 
 
 if __name__ == "__main__":

From ef784d68e04ab4b858ce4c953b2d83b5d5811eda Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Tue, 13 Sep 2022 02:20:30 -0700
Subject: [PATCH 156/704] [MetaSchedule][Test] Migrate `check_trace` to
 `check_sketch` (#12764)

* Migrate AutoBind

* Migrate RandomComputeLocation

* Migrate CrossThreadReduction

* Migrate ParallelVectorizeUnroll
---
 .../meta_schedule/testing/schedule_rule.py    |  48 +-
 ...t_meta_schedule_schedule_rule_auto_bind.py | 175 +++--
 ...le_schedule_rule_cross_thread_reduction.py | 665 +++++++++++++-----
 ...schedule_rule_parallel_vectorize_unroll.py | 111 +--
 ...e_schedule_rule_random_compute_location.py |  72 +-
 5 files changed, 718 insertions(+), 353 deletions(-)

diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py
index b08db0811dd3..12ca4200d77a 100644
--- a/python/tvm/meta_schedule/testing/schedule_rule.py
+++ b/python/tvm/meta_schedule/testing/schedule_rule.py
@@ -18,28 +18,15 @@
 from typing import List, Union
 
 from tvm.meta_schedule.schedule_rule import (
-    AutoBind,
     AutoInline,
-    CrossThreadReduction,
     MultiLevelTiling,
-    ParallelizeVectorizeUnroll,
-    RandomComputeLocation,
+    MultiLevelTilingTensorCore,
     ReuseType,
     ScheduleRule,
 )
-from tvm.meta_schedule.schedule_rule.multi_level_tiling import (
-    MultiLevelTilingTensorCore,
-)
 from tvm.target import Target
 
 
-def auto_bind(target: Target) -> ScheduleRule:
-    """Default schedule rules for auto bind"""
-    if target.kind.name == "cuda":
-        return AutoBind(max_threadblocks=256, thread_extents=[32, 64, 128, 256, 512, 1024])
-    raise NotImplementedError(f"{target.kind.name} is not supported")
-
-
 def auto_inline(target: Target) -> ScheduleRule:
     """Default schedule rules for auto inline"""
     if target.kind.name == "llvm":
@@ -65,13 +52,6 @@ def auto_inline(target: Target) -> ScheduleRule:
     raise NotImplementedError(f"{target.kind.name} is not supported")
 
 
-def cross_thread_reduction(target: Target) -> ScheduleRule:
-    """Default schedule rules for with cross-thread reduction"""
-    if target.kind.name == "cuda":
-        return CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
-    raise NotImplementedError(f"{target.kind.name} is not supported")
-
-
 def multi_level_tiling(target: Target) -> ScheduleRule:
     """Default schedule rules for with multi-level tiling and reuse"""
     if target.kind.name == "llvm":
@@ -154,29 +134,3 @@ def multi_level_tiling_tensor_core(
             use_software_pipeline=use_software_pipeline,
         )
     raise NotImplementedError(f"{target.kind.name} is not supported")
-
-
-def random_compute_location(target: Target) -> ScheduleRule:
-    """Default schedule rules for with random-compute-location"""
-    if target.kind.name == "llvm":
-        return RandomComputeLocation()
-    raise NotImplementedError(f"{target.kind.name} is not supported")
-
-
-def parallel_vectorize_unroll(target: Target) -> ScheduleRule:
-    """Default schedule rules for with parallel-vectorize-unroll"""
-    if target.kind.name == "llvm":
-        return ParallelizeVectorizeUnroll(
-            max_jobs_per_core=16,
-            max_vectorize_extent=32,
-            unroll_max_steps=[0, 16, 64, 512],
-            unroll_explicit=True,
-        )
-    if target.kind.name == "cuda":
-        return ParallelizeVectorizeUnroll(
-            max_jobs_per_core=-1,
-            max_vectorize_extent=-1,
-            unroll_max_steps=[0, 16, 64, 512, 1024],
-            unroll_explicit=True,
-        )
-    raise NotImplementedError(f"{target.kind.name} is not supported")
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
index a89cca72e1b1..21ad04da473e 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
@@ -15,10 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
-from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
-from tvm.meta_schedule.testing.schedule_rule import auto_bind
-from tvm.meta_schedule.testing.space_generation import check_trace
-from tvm.meta_schedule.tune_context import TuneContext
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.testing.space_generation import check_sketches
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -60,83 +58,120 @@ def zero_dim_add(
         C[()] = A[()] + B[()]
 
 
-def _create_context(mod, target, rule) -> TuneContext:
-    ctx = TuneContext(
-        mod=mod,
-        target=target,
-        space_generator=PostOrderApply(),
-        sch_rules=[rule],
-        task_name="test",
-    )
-    return ctx
-
-
 def test_cuda_element_wise():
-    expected = [
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            "l1, l2 = sch.get_loops(block=b0)",
-            "l3 = sch.fuse(l1, l2, preserve_unit_iters=True)",
-            "v4 = sch.sample_categorical(candidates=[32, 64, 128, 256, 512, 1024], probs=[0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666])",
-            "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
-            'sch.bind(loop=l5, thread_axis="blockIdx.x")',
-            'sch.bind(loop=l6, thread_axis="threadIdx.x")',
-        ]
+    @T.prim_func
+    def elementwise_0(
+        A: T.Buffer[(512, 512), "float32"],
+        B: T.Buffer[(512, 512), "float32"],
+    ) -> None:
+        # body
+        # with T.block("root")
+        for i_j_fused_0 in T.thread_binding(256, thread="blockIdx.x"):
+            for i_j_fused_1 in T.thread_binding(1024, thread="threadIdx.x"):
+                with T.block("C"):
+                    vi = T.axis.spatial(512, (i_j_fused_0 * 1024 + i_j_fused_1) // 512)
+                    vj = T.axis.spatial(512, (i_j_fused_0 * 1024 + i_j_fused_1) % 512)
+                    T.reads(A[vi, vj])
+                    T.writes(B[vi, vj])
+                    B[vi, vj] = A[vi, vj] + T.float32(1)
+
+    decision_0 = [
+        ("SampleCategorical", 5),
     ]
-    target = Target("nvidia/geforce-rtx-3080", host="llvm")
-    ctx = _create_context(
-        element_wise,
-        target=target,
-        rule=auto_bind(target=target),
+    mod = element_wise
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3080", host="llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.AutoBind(
+                max_threadblocks=256,
+                thread_extents=[32, 64, 128, 256, 512, 1024],
+            )
+        ],
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[elementwise_0],
+        expected_decisions=[decision_0],
     )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-    check_trace(spaces, expected)
 
 
 def test_cuda_reduction_loop_only():
-    expected = [
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            "l1, = sch.get_loops(block=b0)",
-            "l2 = sch.add_unit_loop(block_or_loop=l1)",
-            "l3 = sch.fuse(l2, preserve_unit_iters=True)",
-            "l4, l5 = sch.split(loop=l3, factors=[None, 1], preserve_unit_iters=True)",
-            'sch.bind(loop=l4, thread_axis="blockIdx.x")',
-            'sch.bind(loop=l5, thread_axis="threadIdx.x")',
-        ]
-    ]
-    target = Target("nvidia/geforce-rtx-3080", host="llvm")
-    ctx = _create_context(
-        reduction_loop_only,
-        target=target,
-        rule=auto_bind(target=target),
+    @T.prim_func
+    def reduction_loop_only_0(
+        A: T.Buffer[2, "float32"],
+        B: T.Buffer[2, "float32"],
+        C: T.Buffer[(), "float32"],
+    ) -> None:
+        for u_fused_0 in T.thread_binding(1, thread="blockIdx.x"):
+            for u_fused_1 in T.thread_binding(1, thread="threadIdx.x"):
+                for i0 in T.serial(2):
+                    with T.block("C"):
+                        k0 = T.axis.reduce(2, i0)
+                        T.reads(A[k0], B[k0])
+                        T.writes(C[()])
+                        with T.init():
+                            C[()] = T.float32(1)
+                        C[()] = T.min(C[()], A[k0] / B[k0])
+
+    mod = reduction_loop_only
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3080", host="llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.AutoBind(
+                max_threadblocks=256,
+                thread_extents=[32, 64, 128, 256, 512, 1024],
+            )
+        ],
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[reduction_loop_only_0],
+        expected_decisions=[[]],
     )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-    check_trace(spaces, expected)
 
 
 def test_cuda_zero_dim_add():
-    expected = [
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            "l1 = sch.add_unit_loop(block_or_loop=b0)",
-            "l2 = sch.fuse(l1, preserve_unit_iters=True)",
-            "l3, l4 = sch.split(loop=l2, factors=[None, 1], preserve_unit_iters=True)",
-            'sch.bind(loop=l3, thread_axis="blockIdx.x")',
-            'sch.bind(loop=l4, thread_axis="threadIdx.x")',
-        ]
-    ]
-    target = Target("nvidia/geforce-rtx-3080", host="llvm")
-    ctx = _create_context(
-        zero_dim_add,
-        target=target,
-        rule=auto_bind(target=target),
+    @T.prim_func
+    def zero_dim_add_0(
+        A: T.Buffer[(), "float32"],
+        B: T.Buffer[(), "float32"],
+        C: T.Buffer[(), "float32"],
+    ) -> None:
+        for u_fused_0 in T.thread_binding(1, thread="blockIdx.x"):
+            for u_fused_1 in T.thread_binding(1, thread="threadIdx.x"):
+                with T.block("C"):
+                    vi = T.axis.spatial(1, 0)
+                    T.reads(A[()], B[()])
+                    T.writes(C[()])
+                    C[()] = A[()] + B[()]
+
+    mod = zero_dim_add
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3080", host="llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.AutoBind(
+                max_threadblocks=256,
+                thread_extents=[32, 64, 128, 256, 512, 1024],
+            )
+        ],
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[zero_dim_add_0],
+        expected_decisions=[[]],
     )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-    check_trace(spaces, expected)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
index 592d32d6245d..a0ca47c09a34 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
@@ -17,14 +17,12 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 
 import tvm
-from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
+from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing import te_workload
-from tvm.meta_schedule.testing.schedule_rule import cross_thread_reduction
-from tvm.meta_schedule.testing.space_generation import check_trace
-from tvm.meta_schedule.tune_context import TuneContext
+from tvm.meta_schedule.testing.space_generation import check_sketches
 from tvm.script import tir as T
 from tvm.target import Target
-from tvm.te.operation import create_prim_func
+from tvm.te import create_prim_func
 
 
 @tvm.script.ir_module
@@ -59,179 +57,522 @@ def main(
                 )
 
 
-def _create_context(mod, target, rule) -> TuneContext:
-    ctx = TuneContext(
-        mod=mod,
-        target=target,
-        space_generator=PostOrderApply(),
-        sch_rules=[rule],
-        task_name="test",
-    )
-    return ctx
+def test_gpu_softmax_mn():
+    @T.prim_func
+    def softmax_mn_0(
+        A: T.Buffer[(256, 256), "float32"],
+        T_softmax_norm: T.Buffer[(256, 256), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        T_softmax_maxelem = T.alloc_buffer([256], dtype="float32")
+        T_softmax_exp = T.alloc_buffer([256, 256], dtype="float32")
+        T_softmax_expsum = T.alloc_buffer([256], dtype="float32")
+        for i0, i1 in T.grid(256, 256):
+            with T.block("T_softmax_maxelem"):
+                i0_1, k = T.axis.remap("SR", [i0, i1])
+                T.reads(A[i0_1, k])
+                T.writes(T_softmax_maxelem[i0_1])
+                with T.init():
+                    T_softmax_maxelem[i0_1] = T.float32(-3.4028234663852886e38)
+                T_softmax_maxelem[i0_1] = T.max(T_softmax_maxelem[i0_1], A[i0_1, k])
+        for i0, i1 in T.grid(256, 256):
+            with T.block("T_softmax_exp"):
+                i0_2, i1_1 = T.axis.remap("SS", [i0, i1])
+                T.reads(A[i0_2, i1_1], T_softmax_maxelem[i0_2])
+                T.writes(T_softmax_exp[i0_2, i1_1])
+                T_softmax_exp[i0_2, i1_1] = T.exp(
+                    A[i0_2, i1_1] - T_softmax_maxelem[i0_2], dtype="float32"
+                )
+        for i0_3, i1 in T.grid(256, 256):
+            with T.block("T_softmax_expsum"):
+                i0_4, k = T.axis.remap("SR", [i0_3, i1])
+                T.reads(T_softmax_exp[i0_4, k])
+                T.writes(T_softmax_expsum[i0_4])
+                with T.init():
+                    T_softmax_expsum[i0_4] = T.float32(0)
+                T_softmax_expsum[i0_4] = T_softmax_expsum[i0_4] + T_softmax_exp[i0_4, k]
+        for i0_5, i1 in T.grid(256, 256):
+            with T.block("T_softmax_norm"):
+                i0_6, i1_2 = T.axis.remap("SS", [i0_5, i1])
+                T.reads(T_softmax_exp[i0_6, i1_2], T_softmax_expsum[i0_6])
+                T.writes(T_softmax_norm[i0_6, i1_2])
+                T.block_attr({"axis": 1})
+                T_softmax_norm[i0_6, i1_2] = T_softmax_exp[i0_6, i1_2] / T_softmax_expsum[i0_6]
 
+    @T.prim_func
+    def softmax_mn_1(
+        A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"]
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        T_softmax_maxelem_shared = T.alloc_buffer([256], dtype="float32", scope="shared")
+        T_softmax_exp = T.alloc_buffer([256, 256], dtype="float32")
+        T_softmax_expsum = T.alloc_buffer([256], dtype="float32")
+        for i0 in T.serial(256):
+            for ax0, ax1_0 in T.grid(1, 1):
+                for ax1_1 in T.thread_binding(512, thread="threadIdx.x"):
+                    with T.block("T_softmax_maxelem"):
+                        T.where(ax1_0 * 512 + ax1_1 < 256)
+                        i0_1 = T.axis.spatial(256, ax0 + i0)
+                        k = T.axis.reduce(256, ax1_0 * 512 + ax1_1)
+                        T.reads(A[i0_1, k])
+                        T.writes(T_softmax_maxelem_shared[i0_1])
+                        with T.init():
+                            T_softmax_maxelem_shared[i0_1] = T.float32(-3.4028234663852886e38)
+                        T_softmax_maxelem_shared[i0_1] = T.max(
+                            T_softmax_maxelem_shared[i0_1], A[i0_1, k]
+                        )
+            for i1_0 in T.serial(1):
+                for i1_1 in T.thread_binding(512, thread="threadIdx.x"):
+                    with T.block("T_softmax_exp"):
+                        T.where(i1_0 * 512 + i1_1 < 256)
+                        i0_2 = T.axis.spatial(256, i0)
+                        i1 = T.axis.spatial(256, i1_0 * 512 + i1_1)
+                        T.reads(A[i0_2, i1], T_softmax_maxelem_shared[i0_2])
+                        T.writes(T_softmax_exp[i0_2, i1])
+                        T_softmax_exp[i0_2, i1] = T.exp(
+                            A[i0_2, i1] - T_softmax_maxelem_shared[i0_2], dtype="float32"
+                        )
+        for i0_3, i1 in T.grid(256, 256):
+            with T.block("T_softmax_expsum"):
+                i0_4, k = T.axis.remap("SR", [i0_3, i1])
+                T.reads(T_softmax_exp[i0_4, k])
+                T.writes(T_softmax_expsum[i0_4])
+                with T.init():
+                    T_softmax_expsum[i0_4] = T.float32(0)
+                T_softmax_expsum[i0_4] = T_softmax_expsum[i0_4] + T_softmax_exp[i0_4, k]
+        for i0_5, i1 in T.grid(256, 256):
+            with T.block("T_softmax_norm"):
+                i0_6, i1_2 = T.axis.remap("SS", [i0_5, i1])
+                T.reads(T_softmax_exp[i0_6, i1_2], T_softmax_expsum[i0_6])
+                T.writes(T_softmax_norm[i0_6, i1_2])
+                T.block_attr({"axis": 1})
+                T_softmax_norm[i0_6, i1_2] = T_softmax_exp[i0_6, i1_2] / T_softmax_expsum[i0_6]
 
-def test_gpu_softmax_mn():
-    expected = [
-        [],
-        [
-            'b0 = sch.get_block(name="T_softmax_maxelem", func_name="main")',
-            "b1, = sch.get_consumers(block=b0)",
-            "l2, l3 = sch.get_loops(block=b1)",
-            "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
-            "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
-            'sch.bind(loop=l6, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)",
-            'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
-            "l7, l8, l9 = sch.get_loops(block=b0)",
-            "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)",
-            'sch.bind(loop=l11, thread_axis="threadIdx.x")',
-        ],
-        [
-            'b0 = sch.get_block(name="T_softmax_expsum", func_name="main")',
-            "b1, = sch.get_consumers(block=b0)",
-            "l2, l3 = sch.get_loops(block=b1)",
-            "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
-            "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
-            'sch.bind(loop=l6, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)",
-            'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
-            "l7, l8, l9 = sch.get_loops(block=b0)",
-            "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)",
-            'sch.bind(loop=l11, thread_axis="threadIdx.x")',
-        ],
-        [
-            'b0 = sch.get_block(name="T_softmax_maxelem", func_name="main")',
-            'b1 = sch.get_block(name="T_softmax_expsum", func_name="main")',
-            "b2, = sch.get_consumers(block=b1)",
-            "l3, l4 = sch.get_loops(block=b2)",
-            "v5 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
-            "l6, l7 = sch.split(loop=l4, factors=[None, v5], preserve_unit_iters=True)",
-            'sch.bind(loop=l7, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True, index=-1)",
-            'sch.set_scope(block=b1, buffer_index=0, storage_scope="shared")',
-            "l8, l9, l10 = sch.get_loops(block=b1)",
-            "l11, l12 = sch.split(loop=l10, factors=[None, v5], preserve_unit_iters=True)",
-            'sch.bind(loop=l12, thread_axis="threadIdx.x")',
-            "b13, = sch.get_consumers(block=b0)",
-            "l14, l15 = sch.get_loops(block=b13)",
-            "v16 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
-            "l17, l18 = sch.split(loop=l15, factors=[None, v16], preserve_unit_iters=True)",
-            'sch.bind(loop=l18, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b0, loop=l14, preserve_unit_loops=True, index=-1)",
-            'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
-            "l19, l20, l21 = sch.get_loops(block=b0)",
-            "l22, l23 = sch.split(loop=l21, factors=[None, v16], preserve_unit_iters=True)",
-            'sch.bind(loop=l23, thread_axis="threadIdx.x")',
-        ],
+    @T.prim_func
+    def softmax_mn_2(
+        A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"]
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        T_softmax_maxelem = T.alloc_buffer([256], dtype="float32")
+        T_softmax_exp = T.alloc_buffer([256, 256], dtype="float32")
+        T_softmax_expsum_shared = T.alloc_buffer([256], dtype="float32", scope="shared")
+        for i0, i1 in T.grid(256, 256):
+            with T.block("T_softmax_maxelem"):
+                i0_1, k = T.axis.remap("SR", [i0, i1])
+                T.reads(A[i0_1, k])
+                T.writes(T_softmax_maxelem[i0_1])
+                with T.init():
+                    T_softmax_maxelem[i0_1] = T.float32(-3.4028234663852886e38)
+                T_softmax_maxelem[i0_1] = T.max(T_softmax_maxelem[i0_1], A[i0_1, k])
+        for i0, i1 in T.grid(256, 256):
+            with T.block("T_softmax_exp"):
+                i0_2, i1_1 = T.axis.remap("SS", [i0, i1])
+                T.reads(A[i0_2, i1_1], T_softmax_maxelem[i0_2])
+                T.writes(T_softmax_exp[i0_2, i1_1])
+                T_softmax_exp[i0_2, i1_1] = T.exp(
+                    A[i0_2, i1_1] - T_softmax_maxelem[i0_2], dtype="float32"
+                )
+        for i0_3 in T.serial(256):
+            for ax0, ax1_0 in T.grid(1, 32):
+                for ax1_1 in T.thread_binding(8, thread="threadIdx.x"):
+                    with T.block("T_softmax_expsum"):
+                        i0_4 = T.axis.spatial(256, ax0 + i0_3)
+                        k = T.axis.reduce(256, ax1_0 * 8 + ax1_1)
+                        T.reads(T_softmax_exp[i0_4, k])
+                        T.writes(T_softmax_expsum_shared[i0_4])
+                        with T.init():
+                            T_softmax_expsum_shared[i0_4] = T.float32(0)
+                        T_softmax_expsum_shared[i0_4] = (
+                            T_softmax_expsum_shared[i0_4] + T_softmax_exp[i0_4, k]
+                        )
+            for i1_0 in T.serial(32):
+                for i1_1_1 in T.thread_binding(8, thread="threadIdx.x"):
+                    with T.block("T_softmax_norm"):
+                        i0_5 = T.axis.spatial(256, i0_3)
+                        i1 = T.axis.spatial(256, i1_0 * 8 + i1_1_1)
+                        T.reads(T_softmax_exp[i0_5, i1], T_softmax_expsum_shared[i0_5])
+                        T.writes(T_softmax_norm[i0_5, i1])
+                        T.block_attr({"axis": 1})
+                        T_softmax_norm[i0_5, i1] = (
+                            T_softmax_exp[i0_5, i1] / T_softmax_expsum_shared[i0_5]
+                        )
+
+    @T.prim_func
+    def softmax_mn_3(
+        A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"]
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        T_softmax_maxelem_shared = T.alloc_buffer([256], dtype="float32", scope="shared")
+        T_softmax_exp = T.alloc_buffer([256, 256], dtype="float32")
+        T_softmax_expsum_shared = T.alloc_buffer([256], dtype="float32", scope="shared")
+        for i0 in T.serial(256):
+            for ax0, ax1_0 in T.grid(1, 1):
+                for ax1_1 in T.thread_binding(512, thread="threadIdx.x"):
+                    with T.block("T_softmax_maxelem"):
+                        T.where(ax1_0 * 512 + ax1_1 < 256)
+                        i0_1 = T.axis.spatial(256, ax0 + i0)
+                        k = T.axis.reduce(256, ax1_0 * 512 + ax1_1)
+                        T.reads(A[i0_1, k])
+                        T.writes(T_softmax_maxelem_shared[i0_1])
+                        with T.init():
+                            T_softmax_maxelem_shared[i0_1] = T.float32(-3.4028234663852886e38)
+                        T_softmax_maxelem_shared[i0_1] = T.max(
+                            T_softmax_maxelem_shared[i0_1], A[i0_1, k]
+                        )
+            for i1_0 in T.serial(1):
+                for i1_1 in T.thread_binding(512, thread="threadIdx.x"):
+                    with T.block("T_softmax_exp"):
+                        T.where(i1_0 * 512 + i1_1 < 256)
+                        i0_2 = T.axis.spatial(256, i0)
+                        i1 = T.axis.spatial(256, i1_0 * 512 + i1_1)
+                        T.reads(A[i0_2, i1], T_softmax_maxelem_shared[i0_2])
+                        T.writes(T_softmax_exp[i0_2, i1])
+                        T_softmax_exp[i0_2, i1] = T.exp(
+                            A[i0_2, i1] - T_softmax_maxelem_shared[i0_2], dtype="float32"
+                        )
+        for i0_3 in T.serial(256):
+            for ax0, ax1_0 in T.grid(1, 32):
+                for ax1_1 in T.thread_binding(8, thread="threadIdx.x"):
+                    with T.block("T_softmax_expsum"):
+                        i0_4 = T.axis.spatial(256, ax0 + i0_3)
+                        k = T.axis.reduce(256, ax1_0 * 8 + ax1_1)
+                        T.reads(T_softmax_exp[i0_4, k])
+                        T.writes(T_softmax_expsum_shared[i0_4])
+                        with T.init():
+                            T_softmax_expsum_shared[i0_4] = T.float32(0)
+                        T_softmax_expsum_shared[i0_4] = (
+                            T_softmax_expsum_shared[i0_4] + T_softmax_exp[i0_4, k]
+                        )
+            for i1_0 in T.serial(32):
+                for i1_1 in T.thread_binding(8, thread="threadIdx.x"):
+                    with T.block("T_softmax_norm"):
+                        i0_5 = T.axis.spatial(256, i0_3)
+                        i1 = T.axis.spatial(256, i1_0 * 8 + i1_1)
+                        T.reads(T_softmax_exp[i0_5, i1], T_softmax_expsum_shared[i0_5])
+                        T.writes(T_softmax_norm[i0_5, i1])
+                        T.block_attr({"axis": 1})
+                        T_softmax_norm[i0_5, i1] = (
+                            T_softmax_exp[i0_5, i1] / T_softmax_expsum_shared[i0_5]
+                        )
+
+    decision_0 = []  # type: ignore
+    decision_1 = [
+        ("SampleCategorical", 7),
+    ]
+    decision_2 = [
+        ("SampleCategorical", 1),
+    ]
+    decision_3 = [
+        ("SampleCategorical", 1),
+        ("SampleCategorical", 7),
     ]
-    target = Target("nvidia/geforce-rtx-3090", host="llvm")
-    ctx = _create_context(
-        create_prim_func(
-            te_workload.softmax_mn(
-                n=256,
-                m=256,
-            )
-        ),
-        target=target,
-        rule=cross_thread_reduction(target=target),
+    mod = create_prim_func(te_workload.softmax_mn(n=256, m=256))
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3090", host="llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
+        ],
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[softmax_mn_0, softmax_mn_1, softmax_mn_2, softmax_mn_3],
+        expected_decisions=[decision_0, decision_1, decision_2, decision_3],
     )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 4
-    check_trace(spaces, expected)
 
 
 def test_gpu_softmax_mn_after_inline():
-    expected = [
-        [],
-        [
-            'b0 = sch.get_block(name="T_softmax_maxelem", func_name="main")',
-            "v1 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
-            "l2, l3 = sch.get_loops(block=b0)",
-            "l4, l5 = sch.split(loop=l3, factors=[None, v1], preserve_unit_iters=True)",
-            'sch.bind(loop=l5, thread_axis="threadIdx.x")',
-        ],
-        [
-            'b0 = sch.get_block(name="T_softmax_expsum", func_name="main")',
-            "b1, = sch.get_consumers(block=b0)",
-            "l2, l3 = sch.get_loops(block=b1)",
-            "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
-            "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
-            'sch.bind(loop=l6, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)",
-            'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
-            "l7, l8, l9 = sch.get_loops(block=b0)",
-            "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)",
-            'sch.bind(loop=l11, thread_axis="threadIdx.x")',
+    @T.prim_func
+    def softmax_mn_after_inline_0(
+        A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"]
+    ) -> None:
+        T_softmax_maxelem = T.alloc_buffer([256], dtype="float32")
+        T_softmax_expsum = T.alloc_buffer([256], dtype="float32")
+        for i0, i1 in T.grid(256, 256):
+            with T.block("T_softmax_maxelem"):
+                i0_1, k = T.axis.remap("SR", [i0, i1])
+                T.reads(A[i0_1, k])
+                T.writes(T_softmax_maxelem[i0_1])
+                with T.init():
+                    T_softmax_maxelem[i0_1] = T.float32(-3.4028234663852886e38)
+                T_softmax_maxelem[i0_1] = T.max(T_softmax_maxelem[i0_1], A[i0_1, k])
+        for i0, i1 in T.grid(256, 256):
+            with T.block("T_softmax_expsum"):
+                i0_2, k = T.axis.remap("SR", [i0, i1])
+                T.reads(A[i0_2, k], T_softmax_maxelem[i0_2])
+                T.writes(T_softmax_expsum[i0_2])
+                with T.init():
+                    T_softmax_expsum[i0_2] = T.float32(0)
+                T_softmax_expsum[i0_2] = T_softmax_expsum[i0_2] + T.exp(
+                    A[i0_2, k] - T_softmax_maxelem[i0_2], dtype="float32"
+                )
+        for i0_3, i1 in T.grid(256, 256):
+            with T.block("T_softmax_norm"):
+                i0_4, i1_1 = T.axis.remap("SS", [i0_3, i1])
+                T.reads(A[i0_4, i1_1], T_softmax_maxelem[i0_4], T_softmax_expsum[i0_4])
+                T.writes(T_softmax_norm[i0_4, i1_1])
+                T.block_attr({"axis": 1})
+                T_softmax_norm[i0_4, i1_1] = (
+                    T.exp(A[i0_4, i1_1] - T_softmax_maxelem[i0_4], dtype="float32")
+                    / T_softmax_expsum[i0_4]
+                )
+
+    @T.prim_func
+    def softmax_mn_after_inline_1(
+        A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"]
+    ) -> None:
+        T_softmax_maxelem = T.alloc_buffer([256], dtype="float32")
+        T_softmax_expsum = T.alloc_buffer([256], dtype="float32")
+        for i0, i1_0 in T.grid(256, 4):
+            for i1_1 in T.thread_binding(64, thread="threadIdx.x"):
+                with T.block("T_softmax_maxelem"):
+                    i0_1 = T.axis.spatial(256, i0)
+                    k = T.axis.reduce(256, i1_0 * 64 + i1_1)
+                    T.reads(A[i0_1, k])
+                    T.writes(T_softmax_maxelem[i0_1])
+                    with T.init():
+                        T_softmax_maxelem[i0_1] = T.float32(-3.4028234663852886e38)
+                    T_softmax_maxelem[i0_1] = T.max(T_softmax_maxelem[i0_1], A[i0_1, k])
+        for i0, i1 in T.grid(256, 256):
+            with T.block("T_softmax_expsum"):
+                i0_2, k = T.axis.remap("SR", [i0, i1])
+                T.reads(A[i0_2, k], T_softmax_maxelem[i0_2])
+                T.writes(T_softmax_expsum[i0_2])
+                with T.init():
+                    T_softmax_expsum[i0_2] = T.float32(0)
+                T_softmax_expsum[i0_2] = T_softmax_expsum[i0_2] + T.exp(
+                    A[i0_2, k] - T_softmax_maxelem[i0_2], dtype="float32"
+                )
+        for i0_3, i1 in T.grid(256, 256):
+            with T.block("T_softmax_norm"):
+                i0_4, i1_1 = T.axis.remap("SS", [i0_3, i1])
+                T.reads(A[i0_4, i1_1], T_softmax_maxelem[i0_4], T_softmax_expsum[i0_4])
+                T.writes(T_softmax_norm[i0_4, i1_1])
+                T.block_attr({"axis": 1})
+                T_softmax_norm[i0_4, i1_1] = (
+                    T.exp(A[i0_4, i1_1] - T_softmax_maxelem[i0_4], dtype="float32")
+                    / T_softmax_expsum[i0_4]
+                )
+
+    @T.prim_func
+    def softmax_mn_after_inline_2(
+        A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"]
+    ) -> None:
+        T_softmax_maxelem = T.alloc_buffer([256], dtype="float32")
+        T_softmax_expsum_shared = T.alloc_buffer([256], dtype="float32", scope="shared")
+        for i0, i1 in T.grid(256, 256):
+            with T.block("T_softmax_maxelem"):
+                i0_1, k = T.axis.remap("SR", [i0, i1])
+                T.reads(A[i0_1, k])
+                T.writes(T_softmax_maxelem[i0_1])
+                with T.init():
+                    T_softmax_maxelem[i0_1] = T.float32(-3.4028234663852886e38)
+                T_softmax_maxelem[i0_1] = T.max(T_softmax_maxelem[i0_1], A[i0_1, k])
+        for i0_3 in T.serial(256):
+            for ax0, ax1_0 in T.grid(1, 1):
+                for ax1_1 in T.thread_binding(512, thread="threadIdx.x"):
+                    with T.block("T_softmax_expsum"):
+                        T.where(ax1_0 * 512 + ax1_1 < 256)
+                        i0_2 = T.axis.spatial(256, ax0 + i0_3)
+                        k = T.axis.reduce(256, ax1_0 * 512 + ax1_1)
+                        T.reads(A[i0_2, k], T_softmax_maxelem[i0_2])
+                        T.writes(T_softmax_expsum_shared[i0_2])
+                        with T.init():
+                            T_softmax_expsum_shared[i0_2] = T.float32(0)
+                        T_softmax_expsum_shared[i0_2] = T_softmax_expsum_shared[i0_2] + T.exp(
+                            A[i0_2, k] - T_softmax_maxelem[i0_2], dtype="float32"
+                        )
+            for i1_0 in T.serial(1):
+                for i1_1 in T.thread_binding(512, thread="threadIdx.x"):
+                    with T.block("T_softmax_norm"):
+                        T.where(i1_0 * 512 + i1_1 < 256)
+                        i0_4 = T.axis.spatial(256, i0_3)
+                        i1_1_1 = T.axis.spatial(256, i1_0 * 512 + i1_1)
+                        T.reads(
+                            A[i0_4, i1_1_1], T_softmax_maxelem[i0_4], T_softmax_expsum_shared[i0_4]
+                        )
+                        T.writes(T_softmax_norm[i0_4, i1_1_1])
+                        T.block_attr({"axis": 1})
+                        T_softmax_norm[i0_4, i1_1_1] = (
+                            T.exp(A[i0_4, i1_1_1] - T_softmax_maxelem[i0_4], dtype="float32")
+                            / T_softmax_expsum_shared[i0_4]
+                        )
+
+    @T.prim_func
+    def softmax_mn_after_inline_3(
+        A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"]
+    ) -> None:
+        T_softmax_maxelem_shared = T.alloc_buffer([256], dtype="float32", scope="shared")
+        T_softmax_expsum_shared = T.alloc_buffer([256], dtype="float32", scope="shared")
+        for i0_3 in T.serial(256):
+            for ax0, ax1_0 in T.grid(1, 1):
+                for ax1_1 in T.thread_binding(512, thread="threadIdx.x"):
+                    with T.block("T_softmax_maxelem"):
+                        T.where(ax1_0 * 512 + ax1_1 < 256)
+                        i0_1 = T.axis.spatial(256, ax0 + i0_3)
+                        k = T.axis.reduce(256, ax1_0 * 512 + ax1_1)
+                        T.reads(A[i0_1, k])
+                        T.writes(T_softmax_maxelem_shared[i0_1])
+                        with T.init():
+                            T_softmax_maxelem_shared[i0_1] = T.float32(-3.4028234663852886e38)
+                        T_softmax_maxelem_shared[i0_1] = T.max(
+                            T_softmax_maxelem_shared[i0_1], A[i0_1, k]
+                        )
+            for ax0, ax1_0 in T.grid(1, 1):
+                for ax1_1 in T.thread_binding(512, thread="threadIdx.x"):
+                    with T.block("T_softmax_expsum"):
+                        T.where(ax1_0 * 512 + ax1_1 < 256)
+                        i0_2 = T.axis.spatial(256, ax0 + i0_3)
+                        k = T.axis.reduce(256, ax1_0 * 512 + ax1_1)
+                        T.reads(A[i0_2, k], T_softmax_maxelem_shared[i0_2])
+                        T.writes(T_softmax_expsum_shared[i0_2])
+                        with T.init():
+                            T_softmax_expsum_shared[i0_2] = T.float32(0)
+                        T_softmax_expsum_shared[i0_2] = T_softmax_expsum_shared[i0_2] + T.exp(
+                            A[i0_2, k] - T_softmax_maxelem_shared[i0_2], dtype="float32"
+                        )
+            for i1_0 in T.serial(1):
+                for i1_1 in T.thread_binding(512, thread="threadIdx.x"):
+                    with T.block("T_softmax_norm"):
+                        T.where(i1_0 * 512 + i1_1 < 256)
+                        i0_4 = T.axis.spatial(256, i0_3)
+                        i1_1_1 = T.axis.spatial(256, i1_0 * 512 + i1_1)
+                        T.reads(
+                            A[i0_4, i1_1_1],
+                            T_softmax_maxelem_shared[i0_4],
+                            T_softmax_expsum_shared[i0_4],
+                        )
+                        T.writes(T_softmax_norm[i0_4, i1_1_1])
+                        T.block_attr({"axis": 1})
+                        T_softmax_norm[i0_4, i1_1_1] = (
+                            T.exp(A[i0_4, i1_1_1] - T_softmax_maxelem_shared[i0_4], dtype="float32")
+                            / T_softmax_expsum_shared[i0_4]
+                        )
+
+    decision_0 = []  # type: ignore
+    decision_1 = [
+        ("SampleCategorical", 4),
+    ]
+    decision_2 = [
+        ("SampleCategorical", 7),
+    ]
+    decision_3 = [
+        ("SampleCategorical", 7),
+        ("SampleCategorical", 0),
+    ]
+
+    mod = Softmax_mn_after_inline
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3090", host="llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
         ],
-        [
-            'b0 = sch.get_block(name="T_softmax_maxelem", func_name="main")',
-            'b1 = sch.get_block(name="T_softmax_expsum", func_name="main")',
-            "b2, = sch.get_consumers(block=b1)",
-            "l3, l4 = sch.get_loops(block=b2)",
-            "v5 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
-            "l6, l7 = sch.split(loop=l4, factors=[None, v5], preserve_unit_iters=True)",
-            'sch.bind(loop=l7, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True, index=-1)",
-            'sch.set_scope(block=b1, buffer_index=0, storage_scope="shared")',
-            "l8, l9, l10 = sch.get_loops(block=b1)",
-            "l11, l12 = sch.split(loop=l10, factors=[None, v5], preserve_unit_iters=True)",
-            'sch.bind(loop=l12, thread_axis="threadIdx.x")',
-            "b13, b14 = sch.get_consumers(block=b0)",
-            "l15, l16, l17, l18 = sch.get_loops(block=b13)",
-            "sch.compute_at(block=b0, loop=l15, preserve_unit_loops=True, index=-1)",
-            'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
-            "l19, l20, l21 = sch.get_loops(block=b0)",
-            "l22, l23 = sch.split(loop=l21, factors=[None, v5], preserve_unit_iters=True)",
-            'sch.bind(loop=l23, thread_axis="threadIdx.x")',
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[
+            softmax_mn_after_inline_0,
+            softmax_mn_after_inline_1,
+            softmax_mn_after_inline_2,
+            softmax_mn_after_inline_3,
         ],
-    ]
-    target = Target("nvidia/geforce-rtx-3090", host="llvm")
-    ctx = _create_context(
-        mod=Softmax_mn_after_inline,
-        target=target,
-        rule=cross_thread_reduction(target=target),
+        expected_decisions=[decision_0, decision_1, decision_2, decision_3],
     )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 4
-    check_trace(spaces, expected)
 
 
 def test_gpu_batch_norm_bmn():
-    expected = [
-        [],
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            "b1, = sch.get_consumers(block=b0)",
-            "l2, = sch.get_loops(block=b1)",
-            "v3 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
-            "l4, l5 = sch.split(loop=l2, factors=[None, v3], preserve_unit_iters=True)",
-            'sch.bind(loop=l5, thread_axis="threadIdx.x")',
-            "sch.compute_at(block=b0, loop=l4, preserve_unit_loops=True, index=-1)",
-            'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
-            "l6, l7, l8, l9 = sch.get_loops(block=b0)",
-            "l10 = sch.fuse(l8, l9, preserve_unit_iters=True)",
-            "l11, l12 = sch.split(loop=l10, factors=[None, v3], preserve_unit_iters=True)",
-            'sch.bind(loop=l12, thread_axis="threadIdx.x")',
-        ],
+    @T.prim_func
+    def batch_norm_bmn_0(A: T.Buffer[(1, 512, 512), "float32"], D: T.Buffer[1, "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C = T.alloc_buffer([1], dtype="float32")
+        for i0, i1, i2 in T.grid(1, 512, 512):
+            with T.block("C"):
+                b, i, j = T.axis.remap("SRR", [i0, i1, i2])
+                T.reads(A[b, i, j])
+                T.writes(C[b])
+                with T.init():
+                    C[b] = T.float32(0)
+                C[b] = C[b] + A[b, i, j] * A[b, i, j]
+        for i0 in T.serial(1):
+            with T.block("D"):
+                b = T.axis.spatial(1, i0)
+                T.reads(C[b])
+                T.writes(D[b])
+                D[b] = T.sqrt(C[b], dtype="float32")
+
+    @T.prim_func
+    def batch_norm_bmn_1(A: T.Buffer[(1, 512, 512), "float32"], D: T.Buffer[1, "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C_shared = T.alloc_buffer([1], dtype="float32", scope="shared")
+        for i0_0 in T.serial(1):
+            for ax0, ax1_ax2_fused_0 in T.grid(1, 1024):
+                for ax1_ax2_fused_1 in T.thread_binding(256, thread="threadIdx.x"):
+                    with T.block("C"):
+                        b = T.axis.spatial(1, ax0)
+                        i = T.axis.reduce(512, (ax1_ax2_fused_0 * 256 + ax1_ax2_fused_1) // 512)
+                        j = T.axis.reduce(512, (ax1_ax2_fused_0 * 256 + ax1_ax2_fused_1) % 512)
+                        T.reads(A[b, i, j])
+                        T.writes(C_shared[b])
+                        with T.init():
+                            C_shared[b] = T.float32(0)
+                        C_shared[b] = C_shared[b] + A[b, i, j] * A[b, i, j]
+            for i0_1 in T.thread_binding(256, thread="threadIdx.x"):
+                with T.block("D"):
+                    T.where(i0_0 * 256 + i0_1 < 1)
+                    b = T.axis.spatial(1, i0_0 * 256 + i0_1)
+                    T.reads(C_shared[b])
+                    T.writes(D[b])
+                    D[b] = T.sqrt(C_shared[b], dtype="float32")
+
+    decision_0 = []  # type: ignore
+    decision_1 = [
+        ("SampleCategorical", 6),
     ]
-    target = Target("nvidia/geforce-rtx-3090", host="llvm")
-    ctx = _create_context(
-        create_prim_func(
-            te_workload.norm_bmn(
-                B=1,
-                M=512,
-                N=512,
-            )
-        ),
-        target=target,
-        rule=cross_thread_reduction(target=target),
+
+    mod = create_prim_func(te_workload.norm_bmn(B=1, M=512, N=512))
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3090", host="llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
+        ],
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[batch_norm_bmn_0, batch_norm_bmn_1],
+        expected_decisions=[decision_0, decision_1],
     )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 2
-    check_trace(spaces, expected)
 
 
 if __name__ == "__main__":
-    # test_gpu_softmax_mn()
-    # test_gpu_softmax_mn_after_inline()
+    test_gpu_softmax_mn()
+    test_gpu_softmax_mn_after_inline()
     test_gpu_batch_norm_bmn()
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py b/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py
index 02b55350b7d5..8076fcaa8bd4 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py
@@ -17,10 +17,7 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 import tvm
 from tvm import meta_schedule as ms
-from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
-from tvm.meta_schedule.testing.schedule_rule import parallel_vectorize_unroll
-from tvm.meta_schedule.testing.space_generation import check_trace
-from tvm.meta_schedule.tune_context import TuneContext
+from tvm.meta_schedule.testing.space_generation import check_sketches
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -68,10 +65,7 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None:
 class PureSpatial:
     @T.prim_func
     def main(placeholder: T.Buffer[(1, 13, 13, 3, 85), "float32"], placeholder_1: T.Buffer[(1, 26, 26, 3, 85), "float32"], placeholder_2: T.Buffer[(1, 52, 52, 3, 85), "float32"], T_expand_dims: T.Buffer[(1, 80, 10647), "float32"]) -> None:
-        # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        # body
-        # with T.block("root")
         T_strided_slice_with_axes = T.alloc_buffer([1, 52, 52, 3, 1], dtype="float32")
         T_sigmoid = T.alloc_buffer([1, 52, 52, 3, 1], dtype="float32")
         T_strided_slice_with_axes_1 = T.alloc_buffer([1, 52, 52, 3, 80], dtype="float32")
@@ -224,55 +218,80 @@ def main(placeholder: T.Buffer[(1, 13, 13, 3, 85), "float32"], placeholder_1: T.
 # fmt: on
 
 
-def _create_context(mod, target, rule):
-    ctx = TuneContext(
-        mod=mod,
-        target=target,
-        space_generator=PostOrderApply(),
-        sch_rules=[rule],
-        task_name="test",
-    )
-    return ctx
-
-
 def test_parallel_vectorize_unroll():
-    expected = [
-        [
-            'b0 = sch.get_block(name="root", func_name="main")',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.parallel", ann_val=512)',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.vectorize", ann_val=32)',
-            "v1 = sch.sample_categorical(candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25])",
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.unroll_explicit", ann_val=v1)',
-        ]
+    @T.prim_func
+    def Matmul_0(
+        A: T.Buffer[(1024, 1024), "float32"],
+        B: T.Buffer[(1024, 1024), "float32"],
+        C: T.Buffer[(1024, 1024), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main"})
+        # body
+        with T.block("root"):
+            T.reads()
+            T.writes()
+            T.block_attr(
+                {
+                    "meta_schedule.parallel": 512,
+                    "meta_schedule.unroll_explicit": 16,
+                    "meta_schedule.vectorize": 32,
+                }
+            )
+            for i, j, k in T.grid(1024, 1024, 1024):
+                with T.block("matmul"):
+                    vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                    T.reads(A[vi, vk], B[vk, vj])
+                    T.writes(C[vi, vj])
+                    with T.init():
+                        C[vi, vj] = T.float32(0)
+                    C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+    decision_0 = [
+        ("SampleCategorical", 1),
     ]
+
     mod = Matmul
-    target = Target("llvm --num-cores=32")
-    ctx = _create_context(
+    actual = ms.TuneContext(
         mod=mod,
-        target=target,
-        rule=parallel_vectorize_unroll(target=target),
+        target=Target("llvm --num-cores=32"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.ParallelizeVectorizeUnroll(
+                max_jobs_per_core=16,
+                max_vectorize_extent=32,
+                unroll_max_steps=[0, 16, 64, 512],
+                unroll_explicit=True,
+            ),
+        ],
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[Matmul_0],
+        expected_decisions=[decision_0],
     )
-    spaces = ctx.space_generator.generate_design_space(mod=mod)
-    assert len(spaces) == 1
-    check_trace(spaces, expected)
 
 
 def test_parallel_vectorize_unroll_spatial():
     mod = PureSpatial
-    target = Target("llvm --num-cores=32")
-    ctx = _create_context(
+    actual = ms.TuneContext(
         mod=mod,
-        target=target,
-        rule=ms.schedule_rule.ParallelizeVectorizeUnroll(
-            max_jobs_per_core=-1,
-            max_vectorize_extent=-1,
-            unroll_max_steps=[1, 2, 4, 8, 16, 32, 64],
-            unroll_explicit=True,
-        ),
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=mod)
-    assert len(spaces) == 1
-    trace = spaces[0].trace.simplified(remove_postproc=True)
+        target=Target("llvm --num-cores=32"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.ParallelizeVectorizeUnroll(
+                max_jobs_per_core=-1,
+                max_vectorize_extent=-1,
+                unroll_max_steps=[0, 16, 64, 512],
+                unroll_explicit=True,
+            ),
+        ],
+        task_name="test",
+    ).generate_design_space()
+    assert len(actual) == 1
+    trace = actual[0].trace.simplified(remove_postproc=True)
     assert not trace.insts
 
 
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py b/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py
index c951a5adf386..fc52aa199cc1 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py
@@ -16,10 +16,8 @@
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 import tvm
-from tvm.meta_schedule.schedule_rule import RandomComputeLocation
-from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
-from tvm.meta_schedule.testing.space_generation import check_trace
-from tvm.meta_schedule.tune_context import TuneContext
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.testing.space_generation import check_sketches
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -55,35 +53,53 @@ def main(a: T.handle, b: T.handle) -> None:
 # fmt: on
 
 
-def _create_context(mod, target, rule):
-    ctx = TuneContext(
-        mod=mod,
-        target=target,
-        space_generator=PostOrderApply(),
-        sch_rules=[rule],
-        task_name="test",
-    )
-    return ctx
-
-
 def test_random_compute_location():
-    expected = [
-        [
-            'b0 = sch.get_block(name="move", func_name="main")',
-            "l1 = sch.sample_compute_location(block=b0)",
-            "sch.compute_at(block=b0, loop=l1, preserve_unit_loops=True, index=-1)",
-        ]
+    @T.prim_func
+    def add_0(
+        A: T.Buffer[(2048, 2048, 2048), "float32"],
+        B: T.Buffer[(2048, 2048, 2048), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main"})
+        # body
+        # with T.block("root")
+        A_cached = T.alloc_buffer([2048, 2048, 2048], dtype="float32")
+        for i0, j0, i1, j1, k0, i2 in T.grid(128, 64, 4, 4, 64, 4):
+            for ax0, ax1, ax2 in T.grid(1, 8, 32):
+                with T.block("move"):
+                    vi = T.axis.spatial(2048, i0 * 16 + i1 * 4 + i2 + ax0)
+                    vj = T.axis.spatial(2048, j0 * 32 + j1 * 8 + ax1)
+                    vk = T.axis.spatial(2048, k0 * 32 + ax2)
+                    T.reads(A[vi, vj, vk])
+                    T.writes(A_cached[vi, vj, vk])
+                    A_cached[vi, vj, vk] = A[vi, vj, vk]
+            for j2, k1 in T.grid(8, 32):
+                with T.block("add"):
+                    vi = T.axis.spatial(2048, i0 * 16 + i1 * 4 + i2)
+                    vj = T.axis.spatial(2048, j0 * 32 + j1 * 8 + j2)
+                    vk = T.axis.spatial(2048, k0 * 32 + k1)
+                    T.reads(A_cached[vi, vj, vk])
+                    T.writes(B[vi, vj, vk])
+                    B[vi, vj, vk] = A_cached[vi, vj, vk] + T.float32(1)
+
+    decision_0 = [
+        ("SampleComputeLocation", 5),
     ]
+
     mod = Add
-    target = Target("llvm")
-    ctx = _create_context(
+    actual = ms.TuneContext(
         mod=mod,
-        target=target,
-        rule=RandomComputeLocation(),
+        target=Target("llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[ms.schedule_rule.RandomComputeLocation()],
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[add_0],
+        expected_decisions=[decision_0],
     )
-    spaces = ctx.space_generator.generate_design_space(mod=mod)
-    assert len(spaces) == 1
-    check_trace(spaces, expected)
 
 
 if __name__ == "__main__":

From 8058423f096cb71952982188a5c386ad37f6105a Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Tue, 13 Sep 2022 14:23:35 -0700
Subject: [PATCH 157/704] [Hexagon] Create tests to showcase vtcm loading
 capabilities on Hexagon.  (#12667)

* [Hexagon] Increase max buffer size for tvm_rpc_android to 1GB.

* [Hexagon] Make errors more clear when unable to allocate VTCM buffers and throw an error to fail early.

* [Hexagon] Add mem_copy_DLTensor to enable directly calling DMA for mem copies.

* [Hexagon] Add new tests as examples of the performance to expect when copying data to VTCM.

* [Hexagon] Reduce rpc max size.

* [Hexagon] Fix test_parallel_hvx_load_vtcm.py test output to be human readable.

* Comment out tests that only work on 8Gen1 HDKs to get CI to pass
---
 python/tvm/contrib/hexagon/session.py         |   2 +-
 src/runtime/hexagon/hexagon_buffer.cc         |   9 +-
 src/runtime/hexagon/hexagon_device_api.cc     |  11 +
 .../test_parallel_hvx_load_vtcm.py            | 537 ++++++++++++++++++
 .../test_hexagon/test_vtcm_bandwidth.py       | 169 ++++++
 5 files changed, 723 insertions(+), 5 deletions(-)
 create mode 100644 tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
 create mode 100644 tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py

diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index 9308e396b2a5..5619d036e283 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -58,7 +58,7 @@ def __init__(
         remote_kw: dict,
         session_name: str = "hexagon-rpc",
         remote_stack_size_bytes: int = 256 * 1024,  # Min size for main thread in QuRT/sim
-        rpc_receive_buffer_size_bytes: int = 5 * 1024 * 1024,  # Size for passing hexagon tests
+        rpc_receive_buffer_size_bytes: int = 256 * 1024 * 1024,  # Size for passing hexagon tests
     ):
         self._launcher = launcher
         self._session_name: str = session_name
diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc
index f23317fd01ed..3ba1b5be3d3d 100644
--- a/src/runtime/hexagon/hexagon_buffer.cc
+++ b/src/runtime/hexagon/hexagon_buffer.cc
@@ -62,7 +62,7 @@ struct VTCMAllocation : public Allocation {
 
     // allocate nbytes of vtcm on a single page
     HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param(&res_info, /*vtcm_size = */ nbytes,
-                                                          /*b_single_page = */ 1));
+                                                          /*b_single_page = */ 0));
 
     // TODO(HWE): Investigate why a non-zero timeout results in
     // hanging, both in the simulator and on hardware.
@@ -71,13 +71,14 @@ struct VTCMAllocation : public Allocation {
     if (context_id_) {
       data_ = HAP_compute_res_attr_get_vtcm_ptr(&res_info);
       if (!data_) {
-        LOG(ERROR) << "ERROR: Allocated VTCM ptr is null.";
+        LOG(ERROR) << "ERROR: HAP_compute_res_acquire returned nullptr when allocating VTCM.";
         HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_));
         return;
       }
     } else {
-      LOG(ERROR) << "ERROR: Unable to acquire requeisted resource.";
-      return;
+      LOG(FATAL) << "FATAL: HAP_compute_res_acquire failed to acquire requested VTCM resource.";
+      throw std::runtime_error(
+          "HAP_compute_res_acquire failed to acquire requested VTCM resource.");
     }
   }
   ~VTCMAllocation() {
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index cf384ae88db7..fd3a0db2025b 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -184,6 +184,17 @@ void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void
   memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
 }
 
+TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy_DLTensor")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      DLTensor* dst = args[0];
+      DLTensor* src = args[1];
+      int size = args[2];
+
+      hexagon_user_dma_1d_sync(dst->data, src->data, size);
+
+      *rv = static_cast<int32_t>(0);
+    });
+
 TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) {
   void* dst = args[0];
   void* src = args[1];
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
new file mode 100644
index 000000000000..c9ff07c490c8
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -0,0 +1,537 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test different strategies for loading data into vtcm before running HVX workloads. """
+
+import numpy as np
+import tvm
+
+from tvm.script import tir as T
+from numpy.random import default_rng
+
+TEST_OUTPUT_TEMPLATE = "Test with {} MB of data to load... \n    -No VTCM: {} Gops \n    -Basic VTCM: {} Gops \n    -Vectorized: {} Gops\n    -Vectorized and Parallelized: {} Gops\n    -Preallocated and Vectorized: {} Gops\n    -Preallocated, Vectorized, and Parallelized: {} Gops\n    -Single DMA: {} Gops\n    -Preloaded: {} Gops\n"
+
+
+def apply_parallel_unroll_vectorize(sch, blocks, outer_split, unroll_split, vector_split):
+    for block in blocks:
+        vb, vi = sch.get_loops(block)
+        v = sch.fuse(vb, vi)
+        vbo, vbi, vio, vii = sch.split(v, factors=[outer_split, None, unroll_split, vector_split])
+        sch.vectorize(vii)
+        sch.unroll(vio)
+        sch.parallel(vbo)
+    return sch
+
+
+def apply_unroll_vectorize(sch, blocks, unroll_split, vector_split):
+    for block in blocks:
+        vb, vi = sch.get_loops(block)
+        v = sch.fuse(vb, vi)
+        _, vio, vii = sch.split(v, factors=[None, unroll_split, vector_split])
+        sch.vectorize(vii)
+        sch.unroll(vio)
+    return sch
+
+
+def apply_vrmpy_parallelization(sch):
+    block = sch.get_block("C")
+    b = sch.get_loops(block)
+    bo, _ = sch.split(b[0], factors=[4, None])
+    sch.parallel(bo)
+    return sch
+
+
+def apply_vtcm_cache_read_write(sch):
+    block = sch.get_block("C")
+    sch.cache_read(block, 0, "global.vtcm")
+    sch.cache_read(block, 1, "global.vtcm")
+    sch.cache_write(block, 0, "global.vtcm")
+    return sch
+
+
+def vrmpy(operations):
+    @T.prim_func
+    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128)
+        B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128)
+        C = T.match_buffer(c, [operations, 32], dtype="int32", align=128)
+        for n in T.grid(operations):
+            with T.block("C"):
+                vn = T.axis.remap("S", [n])
+                C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
+                    T.uint32(2),
+                    T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    dtype="int32x32",
+                )
+
+    return operator
+
+
+def preloaded_vrmpy(operations):
+    @T.prim_func
+    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(
+            a,
+            [T.cast(operations, "int32") * 128],
+            dtype="uint8",
+            align=128,
+            mem_scope="global.vtcm",
+        )
+        B = T.match_buffer(
+            b,
+            [T.cast(operations, "int32") * 128],
+            dtype="uint8",
+            align=128,
+            mem_scope="global.vtcm",
+        )
+        C = T.match_buffer(
+            c, [T.cast(operations, "int32") * 32], dtype="int32", align=128, mem_scope="global.vtcm"
+        )
+        for n in T.grid(operations):
+            with T.block("C"):
+                vn = T.axis.remap("S", [n])
+                C[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
+                    T.uint32(2),
+                    T.reinterpret(A[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(B[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"),
+                    dtype="int32x32",
+                )
+
+    return operator
+
+
+def preallocated_vrmpy(operations):
+    size = operations * 128
+    out_size = operations * 32
+
+    @T.prim_func
+    def operator(
+        a: T.handle, b: T.handle, c: T.handle, a_v: T.handle, b_v: T.handle, c_v: T.handle
+    ) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global")
+        B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global")
+        C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global")
+        A_global_vtcm = T.match_buffer(
+            a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
+        )
+        B_global_vtcm = T.match_buffer(
+            b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
+        )
+        C_global_vtcm = T.match_buffer(
+            c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm"
+        )
+        for n, i in T.grid(operations, 128):
+            with T.block("A_global.vtcm"):
+                vn, vi = T.axis.remap("SS", [n, i])
+                A_global_vtcm[vn * 128 + vi] = A[vn, vi]
+        for n, i in T.grid(operations, 128):
+            with T.block("B_global.vtcm"):
+                vn, vi = T.axis.remap("SS", [n, i])
+                B_global_vtcm[vn * 128 + vi] = B[vn, vi]
+        for n in T.grid(operations):
+            with T.block("C"):
+                vn = T.axis.remap("S", [n])
+                C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
+                    T.uint32(2),
+                    T.reinterpret(
+                        A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"
+                    ),
+                    T.reinterpret(
+                        B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"
+                    ),
+                    dtype="int32x32",
+                )
+        for n, i in T.grid(operations, 32):
+            with T.block("C_global.vtcm"):
+                vn, vi = T.axis.remap("SS", [n, i])
+                C[vn, vi] = C_global_vtcm[vn * 32 + vi]
+
+    return operator
+
+
+def preallocated_single_dma_vrmpy(operations):
+    size = operations * 128
+    out_size = operations * 32
+
+    @T.prim_func
+    def operator(
+        a: T.handle,
+        b: T.handle,
+        c: T.handle,
+        a_v: T.handle,
+        b_v: T.handle,
+        c_v: T.handle,
+    ) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global")
+        B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global")
+        C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global")
+        A_global_vtcm = T.match_buffer(
+            a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
+        )
+        B_global_vtcm = T.match_buffer(
+            b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
+        )
+        C_global_vtcm = T.match_buffer(
+            c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm"
+        )
+        T.evaluate(
+            T.tvm_call_packed(
+                "device_api.hexagon.mem_copy_DLTensor",
+                T.tvm_stack_make_array(
+                    A_global_vtcm.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    A_global_vtcm.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.tvm_stack_make_array(
+                    A.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    A.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.cast(size, dtype="int"),
+                dtype="int32",
+            )
+        )
+        T.evaluate(
+            T.tvm_call_packed(
+                "device_api.hexagon.mem_copy_DLTensor",
+                T.tvm_stack_make_array(
+                    B_global_vtcm.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    B_global_vtcm.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.tvm_stack_make_array(
+                    B.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    B.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.cast(size, dtype="int"),
+                dtype="int32",
+            )
+        )
+        for n in T.grid(operations):
+            with T.block("C"):
+                vn = T.axis.remap("S", [n])
+                C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
+                    T.uint32(2),
+                    T.reinterpret(
+                        A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"
+                    ),
+                    T.reinterpret(
+                        B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"
+                    ),
+                    dtype="int32x32",
+                )
+        T.evaluate(
+            T.tvm_call_packed(
+                "device_api.hexagon.mem_copy_DLTensor",
+                T.tvm_stack_make_array(
+                    C.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    C.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.tvm_stack_make_array(
+                    C_global_vtcm.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    C_global_vtcm.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.cast(size, dtype="int"),
+                dtype="int32",
+            )
+        )
+
+    return operator
+
+
+def evaluate_result(operations, tag, time, result, expected_output):
+    transfer_mb = round(3 * operations * 128 / 1e6, 2)
+    gops = round(operations * 128 * 3 / time.mean / 1e9, 3)
+    mean_ms = round(time.mean * 1000, 6)
+
+    print("\ntest_{}MB_{} took {} ms @ GOPS: {}".format(transfer_mb, tag, mean_ms, gops))
+    tvm.testing.assert_allclose(result, expected_output)
+
+
+def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global"):
+    target_hexagon = tvm.target.hexagon("v69")
+    func_tir = tvm.build(
+        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
+    )
+    module = hexagon_session.load_module(func_tir)
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope=mem_scope)
+    b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope=mem_scope)
+    c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope=mem_scope)
+    timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10)
+    time = timer(a_hexagon, b_hexagon, c_hexagon)
+    gops = round(operations * 128 * 3 / time.mean / 1e9, 4)
+    return gops, c_hexagon.asnumpy()
+
+
+def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations):
+    target_hexagon = tvm.target.hexagon("v69")
+    func_tir = tvm.build(
+        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
+    )
+    module = hexagon_session.load_module(func_tir)
+
+    a_vtcm = np.zeros((a.size), dtype="uint8")
+    b_vtcm = np.zeros((b.size), dtype="uint8")
+    c_vtcm = np.zeros((c.size), dtype="int32")
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global")
+    b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope="global")
+    c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope="global")
+    a_vtcm_hexagon = tvm.runtime.ndarray.array(
+        a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm"
+    )
+    b_vtcm_hexagon = tvm.runtime.ndarray.array(
+        b_vtcm, device=hexagon_session.device, mem_scope="global.vtcm"
+    )
+    c_vtcm_hexagon = tvm.runtime.ndarray.array(
+        c_vtcm, device=hexagon_session.device, mem_scope="global.vtcm"
+    )
+
+    timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10)
+    time = timer(a_hexagon, b_hexagon, c_hexagon, a_vtcm_hexagon, b_vtcm_hexagon, c_vtcm_hexagon)
+    gops = round(operations * 128 * 3 / time.mean / 1e9, 4)
+    return gops, c_hexagon.asnumpy()
+
+
+@tvm.testing.fixture
+def input_a(operations):
+    return default_rng().integers(0, 16, (operations, 128), dtype="uint8")
+
+
+@tvm.testing.fixture
+def input_b(operations):
+    return default_rng().integers(0, 16, (operations, 128), dtype="uint8")
+
+
+@tvm.testing.fixture
+def input_c(operations):
+    return np.zeros((operations, 32), dtype="int32")
+
+
+@tvm.testing.fixture
+def expected_output(operations, input_a, input_b, input_c):
+    expected_output = np.zeros(input_c.shape, dtype="int32")
+    for n in range(operations):
+        for i in range(32):
+            for r in range(4):
+                expected_output[n, i] = expected_output[n, i] + np.uint32(
+                    input_a[n, i * 4 + r]
+                ) * np.uint32(input_b[n, i * 4 + r])
+    return expected_output
+
+
+class TestMatMulVec:
+
+    operations = tvm.testing.parameter(
+        1024,
+        2048,
+        4096,
+        5 * 2048,  # 3.93MB of total transfer
+        # 16384, #Only works on 8Gen1 HDK's
+        # 5 * 4096,  # 7.86MB of total transfer. Only works on 8Gen1 HDK's
+    )
+
+    # Experimentally best configurations for the memcopy
+    outer_split = tvm.testing.parameter(4)
+    unroll_split = tvm.testing.parameter(8)
+    vector_split = tvm.testing.parameter(64)
+    c_vector_split = tvm.testing.parameter(16)
+    c_vector_split_unallocated = tvm.testing.parameter(8)
+
+    @tvm.testing.requires_hexagon
+    def test_loading_vtcm_for_vrmpy(
+        self,
+        hexagon_session,
+        operations,
+        input_a,
+        input_b,
+        input_c,
+        expected_output,
+        outer_split,
+        unroll_split,
+        vector_split,
+        c_vector_split,
+        c_vector_split_unallocated,
+    ):
+
+        # Run parallel vrmpy without loading to VTCM.
+        sch = tvm.tir.Schedule(vrmpy(operations))
+        sch = apply_vrmpy_parallelization(sch)
+        base_runtime, result = setup_and_run(
+            hexagon_session, sch, input_a, input_b, input_c, operations
+        )
+        tvm.testing.assert_allclose(result, expected_output)
+
+        # Run parallel vrmpy with basic memory loads to VTCM.
+        sch = tvm.tir.Schedule(vrmpy(operations))
+        sch = apply_vtcm_cache_read_write(sch)
+        sch = apply_vrmpy_parallelization(sch)
+        basic_load_runtime, result = setup_and_run(
+            hexagon_session, sch, input_a, input_b, input_c, operations
+        )
+        tvm.testing.assert_allclose(result, expected_output)
+
+        # Run parallel vrmpy with vectorized memory loads to VTCM.
+        sch = tvm.tir.Schedule(vrmpy(operations))
+        sch = apply_vtcm_cache_read_write(sch)
+        sch = apply_vrmpy_parallelization(sch)
+        sch = apply_unroll_vectorize(
+            sch,
+            [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+            unroll_split,
+            vector_split,
+        )
+        sch = apply_unroll_vectorize(
+            sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split_unallocated
+        )
+        vectorized_runtime, result = setup_and_run(
+            hexagon_session, sch, input_a, input_b, input_c, operations
+        )
+        tvm.testing.assert_allclose(result, expected_output)
+
+        # Run parallel vrmpy with vectorized and parallelized memory loads to VTCM.
+        sch = tvm.tir.Schedule(vrmpy(operations))
+        sch = apply_vtcm_cache_read_write(sch)
+        sch = apply_vrmpy_parallelization(sch)
+        sch = apply_parallel_unroll_vectorize(
+            sch,
+            [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+            outer_split,
+            unroll_split,
+            vector_split,
+        )
+        sch = apply_parallel_unroll_vectorize(
+            sch,
+            [sch.get_block("C_global.vtcm")],
+            outer_split,
+            unroll_split,
+            c_vector_split_unallocated,
+        )
+        vectorized_parallelized_runtime, result = setup_and_run(
+            hexagon_session, sch, input_a, input_b, input_c, operations
+        )
+        tvm.testing.assert_allclose(result, expected_output)
+
+        # Run parallel vrmpy with preallocated and vectorized memory loads to VTCM.
+        sch = tvm.tir.Schedule(preallocated_vrmpy(operations))
+        sch = apply_vrmpy_parallelization(sch)
+        sch = apply_unroll_vectorize(
+            sch,
+            [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+            unroll_split,
+            vector_split,
+        )
+        sch = apply_unroll_vectorize(
+            sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split
+        )
+        preallocated_vectorized_runtime, result = setup_and_run_preallocated(
+            hexagon_session, sch, input_a, input_b, input_c, operations
+        )
+        result = result.reshape((operations, 32))
+        tvm.testing.assert_allclose(result, expected_output)
+
+        # Run parallel vrmpy with preallocated, vectorized, and parallelized memory loads to VTCM.
+        sch = tvm.tir.Schedule(preallocated_vrmpy(operations))
+        sch = apply_vrmpy_parallelization(sch)
+        sch = apply_parallel_unroll_vectorize(
+            sch,
+            [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+            outer_split,
+            unroll_split,
+            vector_split,
+        )
+        sch = apply_parallel_unroll_vectorize(
+            sch, [sch.get_block("C_global.vtcm")], outer_split, unroll_split, c_vector_split
+        )
+        preallocated_vectorized_parallelized_runtime, result = setup_and_run_preallocated(
+            hexagon_session, sch, input_a, input_b, input_c, operations
+        )
+        result = result.reshape((operations, 32))
+        tvm.testing.assert_allclose(result, expected_output)
+
+        # Run parallel vrmpy with preallocated single dma memory load to VTCM.
+        sch = tvm.tir.Schedule(preallocated_single_dma_vrmpy(operations))
+        sch = apply_vrmpy_parallelization(sch)
+        single_dma_runtime, result = setup_and_run_preallocated(
+            hexagon_session, sch, input_a, input_b, input_c, operations
+        )
+        result = result.reshape((operations, 32))
+        tvm.testing.assert_allclose(result, expected_output)
+
+        # Run parallel vrmpy with data preloaded in VTCM.
+        sch = tvm.tir.Schedule(preloaded_vrmpy(operations))
+        sch = apply_vrmpy_parallelization(sch)
+        input_a = input_a.reshape(operations * 128)
+        input_b = input_b.reshape(operations * 128)
+        input_c = input_c.reshape(operations * 32)
+        preloaded_runtime, result = setup_and_run(
+            hexagon_session, sch, input_a, input_b, input_c, operations, "global.vtcm"
+        )
+        result = result.reshape((operations, 32))
+        tvm.testing.assert_allclose(result, expected_output)
+
+        transfer_mb = round(3 * operations * 128 / 1e6, 2)
+        print(
+            TEST_OUTPUT_TEMPLATE.format(
+                transfer_mb,
+                base_runtime,
+                basic_load_runtime,
+                vectorized_runtime,
+                vectorized_parallelized_runtime,
+                preallocated_vectorized_runtime,
+                preallocated_vectorized_parallelized_runtime,
+                single_dma_runtime,
+                preloaded_runtime,
+            )
+        )
diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
new file mode 100644
index 000000000000..6db8b9101997
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
@@ -0,0 +1,169 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Test theoretical bandwith for data transfers to VTCM for different strategies."""
+
+import numpy as np
+from tests.python.contrib.test_hexagon.infrastructure import allocate_hexagon_array
+import tvm
+
+from tvm.script import tir as T
+from numpy.random import default_rng
+
+MB = 1024**2
+KB = 1024
+TEST_OUTPUT_TEMPLATE = "Test bandwidth with buffer size {}MB... \n    -Base: {} GBps \n    -Vectorized: {} GBps\n    -Vectorized and Parallelized: {} GBps\n    -Single DMA Copy: {} GBps\n"
+
+
+def memcopy_operator(size):
+    @T.prim_func
+    def operator(a: T.handle, a_v: T.handle) -> None:
+        A = T.match_buffer(a, size, dtype="int8", align=128, scope="global")
+        A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
+        for ax0 in T.serial(size):
+            with T.block("A_global.vtcm"):
+                v0 = T.axis.spatial(size, ax0)
+                T.reads(A[v0])
+                T.writes(A_global_vtcm[v0])
+                A_global_vtcm[v0] = A[v0]
+
+    return operator
+
+
+def single_dma_operator(size):
+    @T.prim_func
+    def operator(a: T.handle, a_v: T.handle) -> None:
+        A = T.match_buffer(a, size, dtype="int8", align=128, scope="global")
+        A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
+        T.evaluate(
+            T.tvm_call_packed(
+                "device_api.hexagon.mem_copy_DLTensor",
+                T.tvm_stack_make_array(
+                    A_global_vtcm.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    A_global_vtcm.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.tvm_stack_make_array(
+                    A.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    A.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.cast(size, dtype="int"),
+                dtype="int32",
+            )
+        )
+
+    return operator
+
+
+def evaluate(hexagon_session, sch, size):
+    a_shape = size
+
+    target_hexagon = tvm.target.hexagon("v69")
+    func_tir = tvm.build(
+        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
+    )
+    module = hexagon_session.load_module(func_tir)
+
+    rng = default_rng()
+    a = rng.integers(-128, 127, a_shape, dtype="int8")
+    a_vtcm = np.zeros(a_shape, dtype="int8")
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global")
+    a_vtcm_hexagon = tvm.runtime.ndarray.array(
+        a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm"
+    )
+
+    # a_hexagon = allocate_hexagon_array(hexagon_session.device, data=a, mem_scope="global")
+    # a_vtcm_hexagon = allocate_hexagon_array(hexagon_session.device, data=a_vtcm, mem_scope="global.vtcm")
+
+    timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10)
+    runtime = timer(a_hexagon, a_vtcm_hexagon)
+
+    gbps = round((size / 2**30) / runtime.mean, 4)
+    tvm.testing.assert_allclose(a_vtcm_hexagon.asnumpy(), a)
+
+    return gbps
+
+
+class TestMatMulVec:
+
+    size = tvm.testing.parameter(
+        10 * KB,
+        20 * KB,
+        40 * KB,
+        80 * KB,
+        160 * KB,
+        320 * KB,
+        640 * KB,
+        MB,
+        2 * MB,
+        3 * MB,
+        4 * MB,
+        # 8 * MB,  # Only works on 8gen1 HDKs
+    )
+
+    outer_split = tvm.testing.parameter(4)
+    unroll_split = tvm.testing.parameter(2)
+    vector_split = tvm.testing.parameter(128)
+
+    @tvm.testing.requires_hexagon
+    def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split):
+
+        # Run the base memcopy operator.
+        sch = tvm.tir.Schedule(memcopy_operator(size))
+        base_gpbs = evaluate(hexagon_session, sch, size)
+
+        # Run with some basic unroll and vectorize scheduling.
+        sch = tvm.tir.Schedule(memcopy_operator(size))
+        vtcm_block_a = sch.get_block("A_global.vtcm")
+        vb = sch.get_loops(vtcm_block_a)
+        vbi_a, vio_a, vii_a = sch.split(vb[0], factors=[None, unroll_split, vector_split])
+        sch.unroll(vio_a)
+        sch.vectorize(vii_a)
+        vectorize_gbps = evaluate(hexagon_session, sch, size)
+
+        # Run with some basic unroll and vectorize scheduling and parallelization.
+        sch = tvm.tir.Schedule(memcopy_operator(size))
+        vtcm_block_a = sch.get_block("A_global.vtcm")
+        vb = sch.get_loops(vtcm_block_a)
+        vbo_a, vbi_a, vio_a, vii_a = sch.split(
+            vb[0], factors=[outer_split, None, unroll_split, vector_split]
+        )
+        sch.unroll(vio_a)
+        sch.vectorize(vii_a)
+        sch.parallel(vbo_a)
+        parallel_gbps = evaluate(hexagon_session, sch, size)
+
+        # Run using a single dma copy to transfer the data.
+        sch = tvm.tir.Schedule(single_dma_operator(size))
+        single_dma_gbps = evaluate(hexagon_session, sch, size)
+
+        mbs = round(size / MB, 2)
+        print(
+            TEST_OUTPUT_TEMPLATE.format(
+                mbs, base_gpbs, vectorize_gbps, parallel_gbps, single_dma_gbps
+            )
+        )

From 64635b7f372f229f4179806bf65e83f45e9ab856 Mon Sep 17 00:00:00 2001
From: Ziheng Jiang <ziheng@apache.org>
Date: Tue, 13 Sep 2022 14:43:23 -0700
Subject: [PATCH 158/704] [COMMUNITY] Josh Fromm -> PMC (#12768)

---
 CONTRIBUTORS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 2231fac66596..42f67e87df10 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -36,7 +36,7 @@ We do encourage everyone to work anything they are interested in.
 - [Wei Chen](https://github.com/wweic): @wweic - runtime, relay, vm
 - [Zhi Chen](https://github.com/zhiics) (PMC): @zhiics - relay, quantization, pass manager
 - [Siyuan Feng](https://github.com/Hzfengsy) (PMC): @Hzfengsy - tir
-- [Josh Fromm](https://github.com/jwfromm): @jwfromm - frontends, quantization, topi
+- [Josh Fromm](https://github.com/jwfromm) (PMC): @jwfromm - frontends, quantization, topi
 - [Mehrdad Hessar](https://github.com/mehrdadh): @mehrdadh - microTVM, hexagon
 - [Bohan Hou](https://github.com/spectrometerHBH): @spectrometerHBH - tir, arith, tvm-script
 - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends

From ab8fe34c8e0a73ceb886e95616417281019c4d1d Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Tue, 13 Sep 2022 18:24:33 -0600
Subject: [PATCH 159/704] [FQ2I] Quantized constant bias (#12666)

* support fp32 constants in quantized bias add

* add a test

* clean up comment

* assert the bias is floating point as well as constant before requantizing
---
 .../transform/fake_quantization_to_integer.py | 43 +++++++++++--------
 .../test_pass_fake_quantization_to_integer.py | 15 +++++--
 2 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py
index bb874c131cd8..242740399f96 100644
--- a/python/tvm/relay/transform/fake_quantization_to_integer.py
+++ b/python/tvm/relay/transform/fake_quantization_to_integer.py
@@ -79,7 +79,6 @@ def quantize(expr, type_map):
             out_dtype=expr.attrs.out_dtype,
             axis=t.axis,
         )
-
     return [
         out,
         TensorAffineType(expr.args[1], expr.args[2], expr.attrs.out_dtype, expr.attrs.axis),
@@ -204,23 +203,30 @@ def bias_add(expr, type_map):
     """Rewrite a bias_add op"""
     x, b = expr.args
     x_t = type_map[x]
-    b_t = type_map[b]
-    in_scale = fold_constant(x_t.scale)
-    in_zero_point = fold_constant(x_t.zero_point)
-    if not (
-        approx_equal(x_t.scale, b_t.scale)
-        and approx_equal(x_t.zero_point, b_t.zero_point)
-        and tvm.ir.structural_equal(x_t.dtype, b_t.dtype)
-    ):
-        b = relay.qnn.op.requantize(
-            b,
-            b_t.scale,
-            b_t.zero_point,
-            in_scale,
-            in_zero_point,
-            out_dtype=x_t.dtype,
-            axis=0,
-        )
+    if b in type_map:
+        # Ensure bias matches the previous op
+        b_t = type_map[b]
+        in_scale = fold_constant(x_t.scale)
+        in_zero_point = fold_constant(x_t.zero_point)
+        if not (
+            approx_equal(x_t.scale, b_t.scale)
+            and approx_equal(x_t.zero_point, b_t.zero_point)
+            and tvm.ir.structural_equal(x_t.dtype, b_t.dtype)
+        ):
+            b = relay.qnn.op.requantize(
+                b,
+                b_t.scale,
+                b_t.zero_point,
+                in_scale,
+                in_zero_point,
+                out_dtype=x_t.dtype,
+                axis=0,
+            )
+    else:
+        # If the bias is a constant, we need to quantize it
+        assert isinstance(b, relay.expr.Constant)
+        assert b.checked_type.dtype in ["float32", "float64", "float16", "bfloat16"]
+        b = relay.qnn.op.quantize(b, x_t.scale, x_t.zero_point, axis=0, out_dtype=x_t.dtype)
     out = relay.op.nn.bias_add(x, b, **expr.attrs)
     return [out, x_t]
 
@@ -431,6 +437,7 @@ def pad(expr, type_map):
     else:
         # If the pad-value is a constant, we need to quantize it
         assert isinstance(pad_value, relay.expr.Constant)
+        assert pad_value.checked_type.dtype in ["float32", "float64", "float16", "bfloat16"]
         pad_value = relay.qnn.op.quantize(pad_value, t.scale, t.zero_point)
 
     out = relay.op.nn.pad(arg, pad_value=pad_value, **expr.attrs)
diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py
index a63d82e68750..46979dfc3cba 100644
--- a/tests/python/relay/test_pass_fake_quantization_to_integer.py
+++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py
@@ -192,26 +192,33 @@ def test_fake_transpose_quantize_conv():
     compare_fq_to_int(op, [x_np, w_np])
 
 
-def test_fake_transpose_quantize_conv_bias_add():
+@pytest.mark.parametrize("const_bias", [False, True])
+def test_fake_transpose_quantize_conv_bias_add(const_bias):
     x = relay.var("x", shape=[1, 224, 224, 3], dtype="int8")
     w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
-    bias = relay.var("bias", shape=[16], dtype="int32")
     one = relay.const(1.0)
     zero = relay.const(0)
+    if const_bias:
+        bias = relay.const(np.random.random(16).astype("float32"))
+    else:
+        bias = relay.qnn.op.dequantize(relay.var("bias", shape=[16], dtype="int32"), one, zero)
 
     x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
     x = relay.transpose(x, [0, 3, 1, 2])
     op = relay.op.nn.conv2d(
         x, relay.qnn.op.dequantize(w, relay.const(0.5), zero), kernel_size=[5, 5]
     )
-    op = relay.op.nn.bias_add(op, relay.qnn.op.dequantize(bias, one, zero))
+    op = relay.op.nn.bias_add(op, bias)
     op = relay.qnn.op.quantize(op, one, zero)
 
     x_np = np.random.randint(-128, 127, size=[1, 224, 224, 3], dtype="int8")
     w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8")
     bias_np = np.random.randint(-32768, 32767, size=[16], dtype="int32")
+    args = [x_np, w_np]
 
-    compare_fq_to_int(op, [x_np, w_np, bias_np])
+    if not const_bias:
+        args.append(bias_np)
+    compare_fq_to_int(op, args)
 
 
 def test_fake_transpose_quantize_conv_bias_add_per_channel():

From 91bd9a3fec0dfc419e739d12ee098d0bc39f763d Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Tue, 13 Sep 2022 21:43:54 -0700
Subject: [PATCH 160/704] [Hybrid] Fix handling AST subcription for Python3.9
 (#12769)

fixed https://github.com/apache/tvm/issues/9955, this is covered by the existing test case `tests/python/relay/test_op_level3.py::test_unique`
---
 python/tvm/te/hybrid/parser.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/tvm/te/hybrid/parser.py b/python/tvm/te/hybrid/parser.py
index 1e1e4c50f7b9..4956aaf0be32 100644
--- a/python/tvm/te/hybrid/parser.py
+++ b/python/tvm/te/hybrid/parser.py
@@ -374,6 +374,10 @@ def visit_Attribute(self, node):
 
     def visit_Subscript(self, node):
         args = self.visit(node.slice)
+        if sys.version_info > (3, 8):
+            if not isinstance(node.slice, ast.Tuple):
+                args = [args]
+
         arr = self.visit(node.value)
         if isinstance(arr, Array):
             for i in args:

From f7f2cda6756c170755fc18cbe23f6bf4a4b0d584 Mon Sep 17 00:00:00 2001
From: Matthew Barrett <55580676+mbaret@users.noreply.github.com>
Date: Wed, 14 Sep 2022 10:25:45 +0100
Subject: [PATCH 161/704] [AOT] Add AOTLowerMain pass to lower a Relay main
 into TIR (#12550)

This is a pass refactored out of the AOTExecutorCodegen. Instead of
combining all of the functionality of the AOTExecutorCodegen into a
single monolithic pass, this pass only handles the lowering of the
Relay main function into TIR. Tests for the pass are included.
---
 CMakeLists.txt                                |   1 +
 python/tvm/relay/backend/_aot.py              |  21 +
 python/tvm/relay/backend/aot.py               |  43 +
 python/tvm/relay/backend/utils.py             |   7 +
 src/relay/backend/aot/aot_lower_main.cc       | 861 ++++++++++++++++++
 src/relay/backend/aot/aot_lower_main.h        |  58 ++
 src/relay/backend/utils.cc                    |  28 +-
 src/relay/backend/utils.h                     |  74 ++
 .../relay/backend/aot/aot_lower_main_test.cc  |  63 ++
 .../relay/aot/test_pass_aot_lower_main.py     | 429 +++++++++
 10 files changed, 1572 insertions(+), 13 deletions(-)
 create mode 100644 python/tvm/relay/backend/_aot.py
 create mode 100644 python/tvm/relay/backend/aot.py
 create mode 100644 src/relay/backend/aot/aot_lower_main.cc
 create mode 100644 src/relay/backend/aot/aot_lower_main.h
 create mode 100644 tests/cpp/relay/backend/aot/aot_lower_main_test.cc
 create mode 100644 tests/python/relay/aot/test_pass_aot_lower_main.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8995f9a87fb7..7c355238b8c8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -305,6 +305,7 @@ tvm_file_glob(GLOB_RECURSE RELAY_PASS_SRCS
 tvm_file_glob(GLOB RELAY_BACKEND_SRCS
     src/relay/backend/*.cc
     src/relay/backend/vm/*.cc
+    src/relay/backend/aot/*.cc
     )
 tvm_file_glob(GLOB_RECURSE RELAY_IR_SRCS
     src/relay/ir/*.cc
diff --git a/python/tvm/relay/backend/_aot.py b/python/tvm/relay/backend/_aot.py
new file mode 100644
index 000000000000..437cd71c4c35
--- /dev/null
+++ b/python/tvm/relay/backend/_aot.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The AOT FFI namespace.
+"""
+import tvm._ffi
+
+tvm._ffi._init_api("relay.backend.aot", __name__)
diff --git a/python/tvm/relay/backend/aot.py b/python/tvm/relay/backend/aot.py
new file mode 100644
index 000000000000..8e7406c72f32
--- /dev/null
+++ b/python/tvm/relay/backend/aot.py
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""AOT passes"""
+from tvm.ir.transform import Pass
+from .utils import CallType
+
+from . import _aot
+
+
+def AOTLowerMain(mod_name: str, config: object, call_type: CallType) -> Pass:
+    """Lower a Relay main function into an AOT TIR main function.
+
+    Parameters
+    ----------
+    mod_name: str
+        The name of the module.
+    config : CompilationConfig
+        The compilation configuration.
+    call_type : CallType
+        The calling convention to use.
+
+    Returns
+    -------
+    Pass
+        The AOTLowerMain pass.
+
+    """
+    return _aot.AOTLowerMain(mod_name, config, call_type.value)
diff --git a/python/tvm/relay/backend/utils.py b/python/tvm/relay/backend/utils.py
index b8430a9e6b6e..7289dbbc4af4 100644
--- a/python/tvm/relay/backend/utils.py
+++ b/python/tvm/relay/backend/utils.py
@@ -15,6 +15,13 @@
 # specific language governing permissions and limitations
 # under the License.
 """Utility backend functions."""
+from enum import Enum
+
+
+class CallType(Enum):
+    Packed = 0
+    CPacked = 1
+    Unpacked = 2
 
 
 def _is_valid_modname(mod_name):
diff --git a/src/relay/backend/aot/aot_lower_main.cc b/src/relay/backend/aot/aot_lower_main.cc
new file mode 100644
index 000000000000..ce72595dc10b
--- /dev/null
+++ b/src/relay/backend/aot/aot_lower_main.cc
@@ -0,0 +1,861 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/aot/aot_lower_main.cc
+ * \brief Lower the Relay main func into an AOT TIR main func.
+ */
+#include "./aot_lower_main.h"
+
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/transform.h>
+
+#include "../../op/call/call.h"
+#include "../../op/memory/device_copy.h"
+#include "../../op/memory/memory.h"
+#include "../../transforms/device_aware_visitors.h"
+#include "../name_transforms.h"
+#include "../utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+namespace aot {
+
+/*!
+ * \brief Looks at the expressions in a given function and produces an Expr to
+ * StorageInfo map by assigning one or more StorageInfos to the expressions that
+ * require storage.
+ *
+ * This pass is leveraged by AOTMainLowerer to perform an initial naive allocation
+ * for tensors in the Relay main function. The resulting storage map is then lowered
+ * into TIR allocations by AOTMainLowerer where the allocation can be subsequently
+ * optimized by later passes (e.g. USMP).
+ */
+class ExprAllocator : public transform::DeviceAwareExprVisitor {
+ public:
+  ExprAllocator() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  // run the visitor on a global function.
+  void Run(const Function& func) { VisitExpr(func); }
+
+  std::vector<int> GetReturnSIDs() const { return return_sids_; }
+
+  StorageMap GetStorageMap() const { return expr_storage_map_; }
+
+  using ExprVisitor::VisitExpr_;
+
+  void DeviceAwareVisitExpr_(const CallNode* call_node) final {
+    Array<Expr> args;
+
+    CallLoweredProps call_lowered_props = GetCallLoweredProps(call_node);
+    if (call_lowered_props.lowered_func.defined()) {
+      args = call_lowered_props.arguments;
+    } else {  // Relay functions that have not been lowered and lowered extern functions
+      args = call_node->args;
+      if (call_node->op.as<GlobalVarNode>()) {  // Lowered extern function
+        ICHECK(!(call_node->attrs.defined())) << "Extern functions should have null attributes.";
+      } else {  // Relay function which has not been lowered yet
+        ICHECK(call_node->op.as<FunctionNode>())
+            << "Expected the call to be to a lowered primfunc, a lowered extern function or a "
+               "unlowered Relay function.";
+      }
+    }
+    CreateStorage(call_node);
+    for (const Expr& arg : args) {
+      VisitExpr(arg);
+    }
+    AssignReturnSID(GetRef<Expr>(call_node));
+  }
+
+  void DeviceAwareVisitExpr_(const FunctionNode* func_node) final {
+    if (function_nesting() > 1) {
+      // Do not recurse into sub functions.
+      return;
+    }
+    for (const auto& param : func_node->params) {
+      CreateStorage(param.get());
+    }
+    VisitExpr(func_node->body);
+  }
+
+  void PreVisitLetBinding_(const Var& var, const Expr& value) final {
+    VisitExpr(value);
+    StorageInfo si = GetStorage(value);
+    expr_storage_map_[var] = si;
+  }
+
+  void VisitExpr_(const ConstantNode* op) final {
+    CreateStorage(op);
+    AssignReturnSID(GetRef<Expr>(op));
+  }
+
+  void VisitExpr_(const VarNode* op) final { AssignReturnSID(GetRef<Expr>(op)); }
+
+  void VisitExpr_(const TupleNode* op) final {
+    std::vector<int64_t> storage_ids;
+    std::vector<VirtualDevice> virtual_devices;
+    std::vector<int64_t> storage_sizes_in_bytes;
+    Expr expr = GetRef<Expr>(op);
+    for (Expr field : op->fields) {
+      auto sid = GetStorage(field);
+      storage_ids.insert(storage_ids.end(), sid->storage_ids.begin(), sid->storage_ids.end());
+      virtual_devices.insert(virtual_devices.end(), sid->virtual_devices.begin(),
+                             sid->virtual_devices.end());
+      storage_sizes_in_bytes.insert(storage_sizes_in_bytes.end(),
+                                    sid->storage_sizes_in_bytes.begin(),
+                                    sid->storage_sizes_in_bytes.end());
+    }
+    expr_storage_map_[expr] = StorageInfo(storage_ids, virtual_devices, storage_sizes_in_bytes);
+    AssignReturnSID(expr);
+  }
+
+  void VisitExpr_(const TupleGetItemNode* op) final {
+    Expr expr = GetRef<Expr>(op);
+    auto sids = GetStorage(op->tuple);
+    ICHECK_LT(static_cast<size_t>(op->index), sids->storage_ids.size());
+    expr_storage_map_[expr] =
+        StorageInfo({sids->storage_ids[op->index]}, {sids->virtual_devices[op->index]},
+                    {sids->storage_sizes_in_bytes[op->index]});
+    AssignReturnSID(expr);
+  }
+
+  void VisitExpr_(const IfNode* op) final { LOG(FATAL) << "'If' is not supported."; }
+
+ private:
+  /*!
+   * \brief Assign the expression's storage IDs as the return storage IDs.
+   * \note This is called when visiting every expression on the understanding
+   * that the returned expression will be visited last.
+   */
+  void AssignReturnSID(const Expr& e) {
+    if (expr_storage_map_.find(e) != expr_storage_map_.end()) {
+      StorageInfo& sinfo = expr_storage_map_[e];
+      return_sids_.clear();
+      for (auto sid : sinfo->storage_ids) {
+        return_sids_.push_back(sid);
+      }
+    }
+  }
+
+  /*!
+   * \brief Get the necessary storage for the expression.
+   * \param expr The expression.
+   * \return The corresponding token.
+   */
+  StorageInfo GetStorage(const Expr& expr) {
+    // See through "on_device" calls.
+    Expr true_expr = IgnoreOnDevice(expr);
+    VisitExpr(true_expr);
+    auto it = expr_storage_map_.find(true_expr);
+    ICHECK(it != expr_storage_map_.end()) << "Could not find " << true_expr->GetTypeKey() << " "
+                                          << PrettyPrint(true_expr) << " in storage device map";
+    return it->second;
+  }
+
+  /*!
+   * \brief Create storage for the expression.
+   */
+  void CreateStorage(const ExprNode* op) {
+    Expr expr = GetRef<Expr>(op);
+    return CreateStorage(expr, GetVirtualDevice(expr));
+  }
+
+  /*!
+   * \brief Create storage to hold the result of evaluating \p expr in \p virtual_device.
+   */
+  void CreateStorage(const Expr& expr, const VirtualDevice& virtual_device) {
+    ICHECK(!virtual_device->IsFullyUnconstrained())
+        << "invalid virtual device for expr:" << std::endl
+        << PrettyPrint(expr);
+    std::vector<int64_t> storage_ids;
+    std::vector<VirtualDevice> virtual_devices;
+    std::vector<int64_t> storage_sizes_in_bytes;
+    for (const auto& ttype : FlattenTupleType(expr->checked_type())) {
+      storage_ids.push_back(next_available_sid_++);
+      virtual_devices.push_back(virtual_device);
+      storage_sizes_in_bytes.push_back(GetMemorySizeBytes(ttype->shape, ttype->dtype));
+    }
+    expr_storage_map_[expr] = StorageInfo(std::move(storage_ids), std::move(virtual_devices),
+                                          std::move(storage_sizes_in_bytes));
+  }
+
+  /*! \brief Map between Exprs and StorageInfos */
+  StorageMap expr_storage_map_;
+  /*! \brief The next available storage ID to be used */
+  int next_available_sid_{0};
+  /*! \brief The storage IDs that correspond to return values */
+  std::vector<int> return_sids_;
+};
+
+std::tuple<StorageMap, std::vector<int>> CreateStorage(const Function& func) {
+  ExprAllocator expr_allocator;
+  expr_allocator.Run(func);
+  return std::make_tuple(expr_allocator.GetStorageMap(), expr_allocator.GetReturnSIDs());
+}
+
+class AOTMainLowerer : public MixedModeVisitor {
+ public:
+  AOTMainLowerer(tvm::CompilationConfig config, CallType call_type)
+      : config_(config), call_type_(call_type) {}
+
+  IRModule Lower(IRModule mod, String mod_name) {
+    VLOG_CONTEXT << "AOT";
+    IRModule lowered_mod = GetRef<IRModule>(mod.CopyOnWrite());
+
+    auto lowered_main = lowered_mod->Lookup("main");
+    auto lowered_main_func = GetRef<Function>(lowered_main.as<FunctionNode>());
+
+    // Assign StorageInfo to all the Relay exprs and get the return SIDs
+    std::tie(expr_storage_map_, return_sid_) = CreateStorage(lowered_main_func);
+
+    for (auto input : lowered_main_func->params) {
+      input_vars_.push_back(input);
+      std::string input_name = SanitizeName(input->name_hint());
+      // We don't want the compiler changing input names in the
+      // event of a sanitization collision. Therefore, enforcing
+      // the var created to use the input_name strictly.
+      CreateIOVar(input, input_name, /*use_unique_name = */ false);
+    }
+
+    // Define the storage allocator ids
+    for (auto kv : expr_storage_map_) {
+      for (auto sid : kv.second->storage_ids) {
+        // The buffer_var is created with storage_scope to be global.workspace to be serviced by
+        // TVMBackendAllocWorkspace(TVMBAW) calls, explicitly. The reasoning being the executor
+        // allocates should be serviced by TVMBAWs as the data could be accessed by many devices and
+        // should not be lowered to the stack. For more details please refer to the discussion here:
+        // https://github.com/apache/tvm/issues/9022
+        tir::Var buffer_var(MakeString("sid_", sid),
+                            PointerType(PrimType(DataType::Int(8)), "global.workspace"));
+        sids_table_[sid] = buffer_var;
+      }
+    }
+
+    // Create output vars for the TIR main func
+    // If output tensor names were provided use them
+    if (auto opt = lowered_main->GetAttr<Array<String>>("output_tensor_names")) {
+      Array<String> output_tensor_names = opt.value();
+      Expr output_expr = lowered_main_func->body;
+      if (output_expr->checked_type()->IsInstance<TupleTypeNode>()) {
+        TupleType output_tuple_type = Downcast<TupleType>(output_expr->checked_type());
+        for (unsigned i = 0; i < output_tuple_type->fields.size(); i++) {
+          // AoT Executor Codegen does not create these names,
+          // thus should be used as they are provided.
+          CreateIOVar(output_tuple_type->fields[i], output_tensor_names[i],
+                      /*use_unique_name = */ false);
+        }
+      } else {
+        // AoT Executor Codegen does not create these names,
+        // thus should be used as they are provided.
+        CreateIOVar(lowered_main_func->body, output_tensor_names[0], /*use_unique_name = */ false);
+      }
+    } else {
+      // If output tensor names are not provided we will generate output(x)
+      // where x is a counter to create unique names.
+      if (lowered_main_func->body->checked_type()->IsInstance<TupleTypeNode>()) {
+        CreateIOVar(lowered_main_func->body, "output");
+      } else {
+        CreateIOVar(lowered_main_func->body, "output", /*use_unique_name = */ false);
+      }
+    }
+
+    CollectDeviceVariables(lowered_mod->GetAttr<Map<GlobalVar, String>>("device_contexts")
+                               .value_or(Map<GlobalVar, String>()));
+    VisitExpr(lowered_main_func->body);
+
+    // Remove the Relay main and replace it with the lowered TIR version
+    lowered_mod->Remove(lowered_mod->GetGlobalVar("main"));
+    auto tir_main_func = CreateMainFunc(mod_name);
+    lowered_mod->Update(GlobalVar(runtime::symbol::tvm_module_main), tir_main_func);
+    lowered_mod = tir::transform::RemoveNoOp()(lowered_mod);
+    return lowered_mod;
+  }
+
+  void VisitExpr_(const CallNode* call_node) override {
+    OnDeviceProps on_device_props = GetOnDeviceProps(call_node);
+    if (on_device_props.body.defined()) {
+      VisitExpr(on_device_props.body);
+      return;
+    }
+
+    DeviceCopyProps device_copy_props = GetDeviceCopyProps(call_node);
+    CallLoweredProps call_lowered_props = GetCallLoweredProps(call_node);
+
+    if (device_copy_props.body.defined()) {
+      // TODO(mbs): device_copy cleaunp
+      // Suspect treating as no-op is better since already built into the StorageInfo?
+      LOG(FATAL) << "The AOT executor does not currently support device_copy";
+      return;
+    }
+
+    // At this point we should only see calls of the form call_lowered(@callee, (args...)),
+    // where @callee can be a PrimFunc we've compiled or an external function supplied via
+    // some other mechanism.
+    ICHECK(call_lowered_props.lowered_func.defined())
+        << "AOT does not support calling Relay functions. Attempting to call:" << std::endl
+        << PrettyPrint(GetRef<Call>(call_node));
+    for (const auto& arg : call_lowered_props.arguments) {
+      // Evaluate the args
+      VisitExpr(arg);
+    }
+    CreateFuncCall(call_lowered_props, GetRef<Call>(call_node));
+  }
+
+  void VisitExpr_(const VarNode* op) override {
+    Expr expr = GetRef<Expr>(op);
+    StorageInfo& sinfo = expr_storage_map_[expr];
+
+    // Let bound vars refer to a value, so these should not be considered "output" vars.
+    if (let_bound_vars_.find(GetRef<Var>(op)) != let_bound_vars_.end()) {
+      return;
+    }
+
+    // If the Var node is an output node we need to copy the content of the variable to the output
+    // It's safe to check the SID here because Var StorageToken are never reallocated
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sinfo->storage_ids[0]);
+    if (output_iter != return_sid_.end()) {
+      int output_index = std::distance(return_sid_.begin(), output_iter);
+      auto var_expr = FindExpr(expr);
+      CopyToOutput(GetBufferVarForIO(input_vars_.size() + output_index), var_expr[0],
+                   /*pack_input*/ false, sinfo->storage_sizes_in_bytes[0]);
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* op) override {
+    Expr expr = GetRef<Expr>(op);
+    ICHECK(expr_storage_map_.find(expr) != expr_storage_map_.end())
+        << "Storage map did not contain constant expr " << PrettyPrint(expr);
+    StorageInfo& sinfo = expr_storage_map_[expr];
+    std::stringstream ss;
+    ss << "constant_" << constant_map_.size();
+
+    tir::Var constant(ss.str(), PointerType(PrimType(DataType(op->data->dtype))));
+    constant_map_[constant] = op;
+    auto sid = sinfo->storage_ids[0];
+    sids_table_[sid] = constant;
+
+    // If the Constant node is an output node we need to copy the content of the parameter to the
+    // output. A node can only produce a single output
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid);
+    if (output_iter != return_sid_.end()) {
+      int output_index = std::distance(return_sid_.begin(), output_iter);
+      auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(),
+                                         {tir::StringImm(ss.str())});
+      CopyToOutput(GetBufferVarForIO(input_vars_.size() + output_index), constant,
+                   /* pack_input */ false, sinfo->storage_sizes_in_bytes[0]);
+    }
+  }
+
+  void VisitExpr_(const TupleNode* op) override {
+    for (auto field : op->fields) {
+      VisitExpr(field);
+    }
+  }
+
+  void VisitExpr_(const LetNode* op) override {
+    auto pre_visit = [this](const LetNode* op) {
+      let_bound_vars_.insert(op->var);
+      this->VisitExpr(op->value);
+    };
+    auto post_visit = [this](const LetNode* op) {
+      this->VisitExpr(op->body);
+      this->visit_counter_[op] += 1;
+    };
+    ExpandANormalForm(op, pre_visit, post_visit);
+  }
+
+  void VisitExpr_(const TupleGetItemNode* op) override { VisitExpr(op->tuple); }
+  void VisitExpr_(const OpNode* op) override {
+    if (GetRef<Op>(op) != CallLoweredOp() && GetRef<Op>(op) != OnDeviceOp()) {
+      LOG(FATAL) << "All OpNodes except for call_lowered should have been expanded";
+    }
+  }
+  void VisitExpr_(const IfNode* op) override {
+    LOG(FATAL) << "All GlobalVarNodes should be removed before AOT executor's Codegen is called";
+  }
+  void VisitExpr_(const FunctionNode* op) override {
+    ICHECK(op->GetAttr<String>(attr::kCompiler).defined())
+        << "FunctionNode only supported by custom codegen";
+  }
+  void VisitExpr_(const RefCreateNode* op) override {
+    LOG(FATAL) << "AOT executor does not support references (found RefCreateNode)";
+  }
+  void VisitExpr_(const RefReadNode* op) override {
+    LOG(FATAL) << "AOT executor does not support references (found RefReadNode)";
+  }
+  void VisitExpr_(const RefWriteNode* op) override {
+    LOG(FATAL) << "AOT executor does not support references (found RefWriteNode)";
+  }
+  void VisitExpr_(const ConstructorNode* op) override {
+    LOG(FATAL) << "AOT executor does not support ADTs (found ConstructorNode)";
+  }
+  void VisitExpr_(const MatchNode* op) override {
+    LOG(FATAL) << "AOT executor does not support matching (found MatchNode)";
+  }
+
+ private:
+  /*!
+   * \brief Create the main PrimFunc to execute the graph.
+   * \note The packed function calls don't pack their arguments. The AOT
+   * runner function needs to be legalized by the LegalizePackedCalls pass.
+   */
+  tir::PrimFunc CreateMainFunc(String mod_name) {
+    tir::Stmt body = tir::SeqStmt(stmts_);
+    // Allocate the sids
+    std::unordered_map<int, bool> allocated;
+    std::vector<std::pair<int64_t, int64_t>> sids_to_allocate;
+
+    for (auto kv : expr_storage_map_) {
+      // Only allocate sids that are needed
+      const bool is_input =
+          (std::find(input_vars_.begin(), input_vars_.end(), kv.first) != input_vars_.end());
+      if (is_input) {
+        continue;
+      }
+
+      for (unsigned int i = 0; i < kv.second->storage_ids.size(); i++) {
+        sids_to_allocate.push_back(
+            std::make_pair(kv.second->storage_ids[i], kv.second->storage_sizes_in_bytes[i]));
+      }
+    }
+
+    // Sort the SID allocation to make output deterministic
+    std::sort(sids_to_allocate.begin(), sids_to_allocate.end());
+
+    for (auto p : sids_to_allocate) {
+      int sid = p.first;
+      int size = p.second;
+
+      if (std::find(return_sid_.begin(), return_sid_.end(), sid) != return_sid_.end()) {
+        continue;
+      }
+
+      // Make sure it hasn't already been allocated, this can happen
+      // with let-bound var/value pairs.
+      if (allocated.find(sid) != allocated.end()) {
+        continue;
+      }
+
+      allocated[sid] = constant_map_.count(sids_table_[sid]);
+
+      // TODO(giuseros): we should allocate this once outside the PrimFunc
+      // so we don't pay the price of allocation for every inference
+      if (!allocated[sid]) {
+        PointerType ptype = Downcast<PointerType>(sids_table_[sid]->type_annotation);
+        DataType element_type = Downcast<PrimType>(ptype->element_type)->dtype;
+        body = tir::Allocate(sids_table_[sid], element_type, {size}, tir::const_true(), body);
+      }
+      allocated[sid] = true;
+    }
+
+    for (auto kv : constant_map_) {
+      auto buffer_var = kv.first;
+      auto dtype = DataType(kv.second->data->dtype);
+
+      int ndim = kv.second->data->ndim;
+      Array<PrimExpr> extents;
+
+      for (int i = 0; i < ndim; i++) {
+        int shape = kv.second->data->shape[i];
+        extents.push_back(tir::make_const(DataType::Int(32), shape, Span()));
+      }
+      body = tir::AllocateConst(buffer_var, dtype, extents, kv.second->data, body);
+    }
+
+    // Define the PrimFunc attributes
+    Map<String, ObjectRef> dict_attrs;
+    String run_func_name = runtime::get_name_mangled(mod_name, runtime::symbol::tvm_module_main);
+    dict_attrs.Set("global_symbol", run_func_name);
+    dict_attrs.Set("runner_function", Bool(true));
+    dict_attrs.Set(tvm::attr::kTarget, config_->host_target);
+    Array<tir::Var> input_vars =
+        Array<tir::Var>(main_signature_.begin(), main_signature_.begin() + input_vars_.size());
+    dict_attrs.Set("input_vars", input_vars);
+    Array<tir::Var> output_vars =
+        Array<tir::Var>(main_signature_.begin() + input_vars_.size(),
+                        main_signature_.begin() + input_vars_.size() + return_sid_.size());
+    dict_attrs.Set("output_vars", output_vars);
+
+    tir::Stmt device_activations = GenerateAllDeviceHook("Activate");
+    tir::Stmt device_deactivations = GenerateAllDeviceHook("Deactivate");
+    tir::Stmt final_body = tir::SeqStmt({device_activations, body, device_deactivations});
+
+    // Make the PrimFunc
+    return tir::PrimFunc(main_signature_, final_body, VoidType(), main_buffer_map_, {},
+                         DictAttrs(dict_attrs));
+  }
+
+  /*!
+   * \brief Collects device context variables for passing to operators
+   */
+  void CollectDeviceVariables(const Map<GlobalVar, String>& device_contexts) {
+    Map<TargetKind, tir::Var> target_contexts;
+    TargetKindAttrMap<Bool> target_attr_map = tvm::TargetKind::GetAttrMap<Bool>("use_device_api");
+
+    for (const auto& it : device_contexts) {
+      const GlobalVar& global_var = it.first;
+      const std::string device_context_name = it.second;
+
+      Optional<TargetKind> target_kind = tvm::TargetKind::Get(device_context_name);
+      if (!target_kind || !target_attr_map.count(target_kind.value())) {
+        return;
+      }
+      if (target_attr_map[target_kind.value()]) {
+        std::string context_name = SanitizeName(device_context_name);
+        tir::Var device_context_var("device_context_" + context_name, DataType::Handle());
+
+        auto pair = target_contexts.find(target_kind.value());
+        if (pair != target_contexts.end()) {
+          device_context_var = (*pair).second;
+        } else {
+          main_signature_.push_back(device_context_var);
+          devices_.Set(context_name, device_context_var);
+          target_contexts.Set(target_kind.value(), device_context_var);
+        }
+
+        device_contexts_.Set(global_var, device_context_var);
+      }
+    }
+  }
+
+  /*!
+   * \brief Return a vector of variables that represents the sids for the given Relay Expr
+   */
+  std::vector<tir::Var> PackSid(Expr expr) {
+    std::vector<tir::Var> buffer_vars;
+
+    ICHECK(expr_storage_map_.find(expr) != expr_storage_map_.end())
+        << "Storage map did not contain constant expr " << PrettyPrint(expr);
+    StorageInfo& sinfo = expr_storage_map_[expr];
+
+    // Note that an expression can have multiple sids associated with it
+    // e.g., returning multiple values from a function
+    for (auto sid : sinfo->storage_ids) {
+      // Determine if an sid is an output buffer
+      auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid);
+      if (output_iter != return_sid_.end()) {
+        int output_index = std::distance(return_sid_.begin(), output_iter);
+        buffer_vars.push_back(GetBufferVarForIO(input_vars_.size() + output_index));
+        continue;
+      }
+
+      auto sid_value = sids_table_[sid];
+      buffer_vars.push_back(sid_value);
+    }
+    return buffer_vars;
+  }
+
+  /*!
+   * \brief Given an expression return the variable(s) associated with that expression
+   */
+  std::vector<tir::Var> FindExpr(Expr arg) {
+    auto input_iter = std::find(input_vars_.begin(), input_vars_.end(), arg);
+    if (input_iter != input_vars_.end()) {
+      // Input variable
+      int main_index = std::distance(input_vars_.begin(), input_iter);
+      return {GetBufferVarForIO(main_index)};
+    } else {
+      // Storage identifier (i.e., intermediate memory)
+      return PackSid(arg);
+    }
+  }
+
+  void PushArgs(const Expr& expr, const std::vector<tir::Var>& sids, Array<PrimExpr>* args) {
+    const TupleNode* t = expr.as<TupleNode>();
+    if (t != nullptr) {
+      CHECK_EQ(sids.size(), t->fields.size()) << "Relay tuple does not map 1:1 into TIR; AOT can't "
+                                                 "handle this type of Relay Expr in a CallNode.";
+    }
+
+    args->insert(args->end(), sids.begin(), sids.end());
+  }
+
+  /*!
+   * \brief Wraps a call_extern with a tvm_check_return annotation if required otherwise
+   * returns the passed Call
+   */
+  tir::Call AddCheckReturn(tir::Call existing_call) {
+    Array<PrimExpr> args = {tir::make_const(DataType::Int(32, 1), 0, Span()),
+                            tir::make_const(DataType::Int(32, 1), -1, Span()), existing_call};
+    return tir::Call(DataType::Int(32), tir::builtin::tvm_check_return(), args);
+  }
+
+  /*!
+   * \brief Create a function call
+   * \param call_lowered_props The lowered function and the arguments to call it with
+   * \param result_expr The call we got func and args from (so as to recover the storage
+   * ids to hold the result).
+   */
+  void CreateFuncCall(CallLoweredProps call_lowered_props, const Expr& result_expr) {
+    std::string func_name = call_lowered_props.lowered_func->name_hint;
+    tvm::Array<PrimExpr> args{tvm::tir::StringImm(func_name)};
+    std::vector<tir::Stmt> create_func_call_stmts;
+
+    // Pack the inputs
+    for (const Expr& arg : call_lowered_props.arguments) {
+      auto sids = FindExpr(arg);
+      PushArgs(arg, sids, &args);
+    }
+
+    // Pack the return(s) value. A call node can produce multiple outputs
+    auto result_expr_sid = PackSid(result_expr);
+    PushArgs(result_expr, result_expr_sid, &args);
+
+    GlobalVar global_var = call_lowered_props.lowered_func;
+    bool has_c_device_api_context = device_contexts_.count(global_var) != 0;
+    tir::Var device_context;
+    tir::Stmt func_call;
+
+    switch (call_type_) {
+      case CallType::kUnpacked: {
+        // call_extern calling convention with optional context
+        if (has_c_device_api_context) {
+          device_context = device_contexts_.Get(global_var).value();
+          args.push_back(device_context);
+        }
+        func_call = tir::Evaluate(AddCheckReturn(
+            tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::call_extern(), args)));
+        break;
+      }
+      case CallType::kCPacked: {
+        if (has_c_device_api_context) {
+          device_context = device_contexts_.Get(global_var).value();
+          args.push_back(device_context);
+        } else {
+          // NOTE: LowerTVMBuiltin expects some device_context placeholder.
+          args.push_back(tir::make_zero(DataType::Handle()));
+        }
+        func_call = tir::Evaluate(
+            tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::tvm_call_cpacked(), args));
+        create_func_call_stmts.push_back(func_call);
+        break;
+      }
+      case CallType::kPacked: {
+        // call_packed does not accept a device context.
+        CHECK(!has_c_device_api_context) << "CallType::kPacked does not accept a device context";
+        func_call = tir::Evaluate(AddCheckReturn(
+            tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::tvm_call_packed(), args)));
+        create_func_call_stmts.push_back(func_call);
+        break;
+      }
+      default:
+        ICHECK(false) << "Unknown CallType: " << call_type_;
+    }
+
+    ICHECK(func_call.defined()) << "Must define func_call";
+
+    if (has_c_device_api_context) {
+      func_call = tir::SeqStmt(Array<tir::Stmt>({
+          GenerateDeviceHook(device_context, "Open"),
+          func_call,
+          GenerateDeviceHook(device_context, "Close"),
+      }));
+    }
+
+    tir::Stmt body = tir::SeqStmt({func_call});
+    stmts_.push_back(body);
+  }
+
+  /*!
+   * \brief Copy a variable to the output. This function is mainly used in edge cases
+   * when we want to return an input or a parameter.
+   * TODO(giuseros): we should try to avoid unnecessary copy to the output, e.g., in a
+   * copy-on-write fashion.
+   */
+  void CopyToOutput(PrimExpr out, PrimExpr in, bool pack_input, size_t size) {
+    // Define intermediate DLTensor to load/store the data
+    tir::Buffer tmp_read =
+        tir::decl_buffer({IntImm(DataType::UInt(64), size)}, DataType::UInt(8), "tmp_read");
+    tir::Buffer tmp_write =
+        tir::decl_buffer({IntImm(DataType::UInt(64), size)}, DataType::UInt(8), "tmp_write");
+    te::Var loop_idx("i", DataType::Int(32));
+    auto retval_i = tir::BufferLoad(tmp_read, {loop_idx});
+    // Copy the variable from the input to the output
+    tir::Stmt copy = tir::For(
+        loop_idx, 0, tir::make_const(DataType::Int(32, 1), size, Span()), tir::ForKind::kSerial,
+        tir::BufferStore(tmp_write, tir::Let(tmp_read->data, in, retval_i), {loop_idx}));
+    stmts_.push_back(tir::LetStmt(tmp_write->data, out, copy));
+  }
+
+  /*!
+   * \brief Generates a call to a given hook for all Devices found for C Device API
+   * \param hook Name of hook to generate statements for
+   * \return Statement with function calls for each device
+   */
+  tir::Stmt GenerateAllDeviceHook(const String& hook) {
+    std::vector<tir::Stmt> device_hooks;
+    for (const auto& it : devices_) {
+      const String& device_name = it.first;
+      const tir::Var& context = it.second;
+      Array<String> sections = {"Device", device_name, hook};
+      String device_hook_name = ToCFunctionStyle(PrefixName(sections));
+
+      tir::Evaluate device_hook(
+          AddCheckReturn(tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::call_extern(),
+                                        {tvm::tir::StringImm(device_hook_name), context})));
+      device_hooks.push_back(device_hook);
+    }
+    return tir::SeqStmt(device_hooks);
+  }
+
+  /*!
+   * \brief Generates a call to a given hook for a single Device function
+   * \param context Device context to call hook on
+   * \param hook Name of hook to generate statements for
+   * \return Statement with function call to Device API
+   */
+  tir::Stmt GenerateDeviceHook(const tir::Var& context, const String& hook) {
+    const auto& it = std::find_if(std::begin(devices_), std::end(devices_), [&](const auto& it) {
+      return it.second->name_hint == context->name_hint;
+    });
+    const String& device_name = (*it).first;
+    Array<String> sections = {"Device", device_name, hook};
+    String device_hook = ToCFunctionStyle(PrefixName(sections));
+
+    return tir::Evaluate(
+        AddCheckReturn(tir::Call(DataType::Int(32), tvm::tir::builtin::call_extern(),
+                                 {tvm::tir::StringImm(device_hook), context})));
+  }
+
+  /*!
+   * \brief Utility function to string together different arguments
+   */
+  template <typename... Args>
+  std::string MakeString(Args const&... args) {
+    std::ostringstream ss;
+    using List = int[];
+    (void)List{0, ((void)(ss << args), 0)...};
+
+    return ss.str();
+  }
+
+  /*!
+   * \brief Access IO vars using the buffer vars and
+   * not the actual var.
+   */
+  tir::Var GetBufferVarForIO(int index) { return main_buffer_map_[main_signature_[index]]->data; }
+
+  /*!
+   * \brief Create tir::Var for input/output while updating the buffer_maps.
+   * \param expr The expression to evaluate.
+   * \param original_name The name of the tir::Var.
+   * \param use_unique_name Whether to generate a new unique name where a name conflicts.
+   */
+  void CreateIOVar(const Expr& expr, const std::string& original_name,
+                   bool use_unique_name = true) {
+    CreateIOVar(expr->checked_type(), original_name, use_unique_name);
+  }
+
+  /*!
+   * \brief Create tir::Var for input/output while updating the buffer_maps.
+   * \param expr The expression to evaluate.
+   * \param original_name The name of the tir::Var.
+   * \param use_unique_name Whether to generate a new unique name where a name conflicts.
+   */
+  void CreateIOVar(const Type& type, const std::string& original_name,
+                   bool use_unique_name = true) {
+    if (type->IsInstance<TupleTypeNode>()) {
+      TupleType tuple_type = Downcast<TupleType>(type);
+      for (unsigned i = 0; i < tuple_type->fields.size(); i++) {
+        CreateIOVar(tuple_type->fields[i], original_name);
+      }
+    } else {
+      std::string name = original_name;
+      if (use_unique_name) {
+        name = GetUniqueIOVarName(original_name);
+      }
+      tir::Var var = tir::Var(name, DataType::Handle());
+      main_signature_.push_back(var);
+      auto tensor_type = type.as<TensorTypeNode>();
+      ICHECK(tensor_type) << "Expected TensorType node but was " << type->GetTypeKey();
+      DataType elem_type = tensor_type->dtype;
+      tir::Var buffer_var =
+          tir::Var(name + "_buffer_var", PointerType(PrimType(elem_type), "global"));
+      tir::Buffer buffer = tir::Buffer(buffer_var, elem_type, tensor_type->shape, {}, 0,
+                                       name + "_buffer", 16, 1, tir::BufferType::kDefault);
+      main_buffer_map_.Set(var, buffer);
+    }
+  }
+
+  /*!
+   * \brief Create a unique name for I/O Var
+   */
+  std::string GetUniqueIOVarName(std::string name) {
+    if (io_var_names_.find(name) == io_var_names_.end()) {
+      io_var_names_[name] = 1;
+      return name + std::to_string(io_var_names_[name] - 1);
+    } else {
+      io_var_names_[name] = io_var_names_[name] + 1;
+      return name + std::to_string(io_var_names_[name] - 1);
+    }
+  }
+
+  /*! \brief list of input expressions (i.e., variable passed by the user) */
+  std::vector<Var> input_vars_;
+  /*! \brief map of device contexts variables */
+  Map<String, tir::Var> devices_;
+  /*! \brief map of GlobalVars to C Device API contexts */
+  Map<GlobalVar, tir::Var> device_contexts_;
+  /*! \brief input and output variables belonging to the main function signature */
+  Array<tir::Var> main_signature_;
+  /*! \brief input and output variables belonging to the main function signature */
+  Map<tir::Var, tir::Buffer> main_buffer_map_;
+  /*! \brief All available targets. */
+  CompilationConfig config_;
+  /*!
+   * \brief The type of kernel call to be emitted.
+   * See CallType for more documentation.
+   */
+  CallType call_type_;
+  std::unordered_map<const tir::Var, const ConstantNode*, ObjectPtrHash, ObjectPtrEqual>
+      constant_map_;
+  /*! \brief plan memory of device result */
+  StorageMap expr_storage_map_;
+  /*! \brief mapping sid -> tir::Var */
+  std::unordered_map<int, tir::Var> sids_table_;
+  /*! \brief the set of statements that make the program */
+  std::vector<tir::Stmt> stmts_;
+  /*! \brief the list of return sids (note that the function might return more then one output */
+  std::vector<int> return_sid_;
+  /*! \brief This is per IO var name counter to aid the generating unique names */
+  std::unordered_map<std::string, int> io_var_names_;
+  /*! \brief A set of variables that are let bound. */
+  std::unordered_set<Var, ObjectPtrHash, ObjectPtrEqual> let_bound_vars_;
+};
+
+Pass AOTLowerMain(String mod_name, tvm::CompilationConfig config, CallType call_type) {
+  runtime::TypedPackedFunc<IRModule(IRModule, transform::PassContext)> pass_func =
+      [=](IRModule module, transform::PassContext ctx) {
+        return AOTMainLowerer(config, call_type).Lower(module, mod_name);
+      };
+
+  return tvm::transform::CreateModulePass(pass_func, 0, "AOTLowerMain", {"InferType"});
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.aot.AOTLowerMain")
+    .set_body_typed([](const String& mod_name, const tvm::CompilationConfig& config,
+                       int call_type) {
+      return AOTLowerMain(mod_name, config, static_cast<CallType>(call_type));
+    });
+
+}  // namespace aot
+}  // namespace backend
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/aot/aot_lower_main.h b/src/relay/backend/aot/aot_lower_main.h
new file mode 100644
index 000000000000..8981e7d7434f
--- /dev/null
+++ b/src/relay/backend/aot/aot_lower_main.h
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_RELAY_BACKEND_AOT_AOT_LOWER_MAIN_H_
+#define TVM_RELAY_BACKEND_AOT_AOT_LOWER_MAIN_H_
+
+#include <tvm/ir/transform.h>
+#include <tvm/target/compilation_config.h>
+
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+#include "../utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+namespace aot {
+
+using StorageMap =
+    std::unordered_map<Expr, StorageInfo, runtime::ObjectPtrHash, runtime::ObjectPtrEqual>;
+
+/*! \brief Exposed for testing, part of the implementation of AOTLowerMain */
+std::tuple<StorageMap, std::vector<int>> CreateStorage(const Function& func);
+
+/*! \brief Lower the Relay main function into TIR for use with the AOT executor.
+ *
+ * This pass expects that all operators have already been lowered to TIR and
+ * so only Calls to 'call_lowered' are present in main.
+ *
+ * \param mod_name The name of the module.
+ * \param config The compilation config.
+ * \param call_type The call type to use when calling functions.
+ */
+transform::Pass AOTLowerMain(String mod_name, tvm::CompilationConfig config, CallType call_type);
+
+}  // namespace aot
+}  // namespace backend
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_BACKEND_AOT_AOT_LOWER_MAIN_H_
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc
index 5cf7a5563d19..51bcab527d1b 100644
--- a/src/relay/backend/utils.cc
+++ b/src/relay/backend/utils.cc
@@ -138,8 +138,20 @@ TVM_REGISTER_GLOBAL("relay.ir.StaticMemoryPlan")
       return StaticMemoryPlan(expr_to_storage_info);
     });
 
-// TODO(mbs): Cf GetMemorySizeBytes in aot_executor_codegen.cc, GetMemorySize in
-// graph_plan_memory.cc
+size_t DivRoundUp(size_t size, size_t word_size) { return (size + word_size - 1) / word_size; }
+
+size_t GetMemorySizeBytes(const Array<PrimExpr>& shape, const DataType& dtype) {
+  size_t size = 1;
+  for (IndexExpr dim : shape) {
+    const int64_t* pval = tir::as_const_int(dim);
+    ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << shape;
+    ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
+    size *= static_cast<size_t>(pval[0]);
+  }
+  size *= DivRoundUp(dtype.bits() * dtype.lanes(), 8);
+  return size;
+}
+
 int64_t CalculateRelayExprSizeBytes(const Type& expr_type) {
   if (expr_type->IsInstance<TupleTypeNode>()) {
     auto tuple_type = Downcast<TupleType>(expr_type);
@@ -152,17 +164,7 @@ int64_t CalculateRelayExprSizeBytes(const Type& expr_type) {
   auto tensor_type = expr_type.as<TensorTypeNode>();
   ICHECK(tensor_type);
   auto shape = tensor_type->shape;
-  int num_of_elements = 1;
-  for (const auto& dim_index_expr : shape) {
-    if (dim_index_expr->IsInstance<IntImmNode>()) {
-      num_of_elements *= dim_index_expr.as<IntImmNode>()->value;
-    } else {
-      // If shape is dynamic, we cannot calculate workspace in compile time.
-      num_of_elements = 0;
-    }
-  }
-  auto element_size = tensor_type->dtype.bytes();
-  return element_size * num_of_elements;
+  return GetMemorySizeBytes(tensor_type->shape, tensor_type->dtype);
 }
 
 TVM_REGISTER_NODE_TYPE(FunctionInfoNode);
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h
index 37ae9d803a35..6c65a081f156 100644
--- a/src/relay/backend/utils.h
+++ b/src/relay/backend/utils.h
@@ -59,6 +59,73 @@ class TECompiler;
 namespace backend {
 using Pass = tvm::transform::Pass;
 
+/*! \brief Describes the type of kernel call emitted. */
+enum CallType {
+  /*!
+   * \brief Emit PackedFunc calls bound just-in-time using TVMBackend* functions.
+   *
+   * When this type is selected, assumes all operators must be called via TVMFuncCall. Given the
+   * implementation of TVMFuncCall in the C++ runtime, this in practice implies that those
+   * functions are of type TVMBackendPackedCFunc.
+   *
+   * The following code is emitted at call sites to call a function named `func`:
+   * void* func_ptr = TVMBackendGetFuncFromEnv("func");
+   * TVMFuncCall(func_ptr, values, tcodes, num_args, ret_values, ret_tcodes)
+   *
+   * The arguments given to the tir::Call node are encoded into `values`, `tcodes`, and `num_args`
+   * by LowerTVMBuiltin TIR transform.
+   *
+   * If `resource_handle` is passed to `func`, it is determined by TVMFuncCall (often,
+   * `resource_handle` is registered with the C++ runtime to provide a `this` equivalent when
+   * `func` is implemented in C).
+   *
+   * Compatible with both C++ and C runtimes, implemented with the C runtime only.
+   */
+  kPacked,  // Emit tir.call_packed and wrap all arguments in DLTensor.
+
+  /*!
+   * \brief Directly call a TVMBackendPackedCFunc named according to the tir::Call.
+   *
+   * When this type is selected, assumes all operators are implemented in functions of type
+   * `TVMBackendPackedCFunc` and should be called directly. That is, presumes at the time of
+   * downstream compilation that there is a symbol named after the 0th arg to tir::Call of
+   * type `TVMBackendPackedCFunc`. This situation should occur when target_host == target.
+   *
+   * The following code is emitted at call sites to call a function named `func`:
+   * func(values, tcodes, num_args, ret_values, ret_tcodes, resource_handle)
+   *
+   * The arguments given to the tir::Call node are encoded into `values`, `tcodes`, and `num_args`
+   * by LowerTVMBuiltin TIR transform.
+   *
+   * `resource_handle` is encoded as the final argument to the tir::Call node. In practice, it is
+   * always the device context parameter when not null. At present, the implementation does not
+   * support forwarding device context parameters to CPacked.
+   *
+   * Compatible with the C runtime and C++ runtime (so long as target_host == target). Implemented
+   * in the same scenarios.
+   */
+  kCPacked,  // Emit tir.call_cpacked and wrap all arguments in DLTensor.
+
+  /*! \brief Directly call a function accepting the `data` arrays as args.
+   *
+   * When this type is selected, assumes all operaotrs are implemented in C functions whose
+   * arguments are 1-to-1 with those in the tir::Call. DLTensor arguments are encoded as just the
+   * `data` parameters (i.e. no DLTensor object is passed along).
+   *
+   * The following code is emitted at call sites to a function named `func`:
+   * func(void* arg0, void* arg1, ..., void* argN) // no resource_handle
+   * -or-
+   * func(void* arg0, void* arg1, ..., void* argN, void* resource_handle) // with resource_handle
+   *
+   * `resource_handle` is encoded as the final argument to the tir::Call node. In practice, it is
+   * always the device context parameter when not null.
+   *
+   * Compatible with the C runtime and C++ runtime (so long as target_host == target). Implemented
+   * with the C runtime only.
+   */
+  kUnpacked,  // Emit tir.call_extern passing only the `data` part of DLTensors.
+};
+
 /*!
  * \brief Structure that can be optionally used by the executor codegen
  */
@@ -207,6 +274,13 @@ class FunctionInfo : public ObjectRef {
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(FunctionInfo, ObjectRef, FunctionInfoNode);
 };
 
+/*!
+ * \brief Calculate the bytes of memory needed to hold a tensor of a given shape and data type.
+ * \param shape The shape of the tensor
+ * \param dtype The data type of the tensor
+ */
+size_t GetMemorySizeBytes(const Array<PrimExpr>& shape, const DataType& dtype);
+
 /*!
  * \brief Calculate the storage required to store the type of relay.Expr
  *
diff --git a/tests/cpp/relay/backend/aot/aot_lower_main_test.cc b/tests/cpp/relay/backend/aot/aot_lower_main_test.cc
new file mode 100644
index 000000000000..31166f1e6bb8
--- /dev/null
+++ b/tests/cpp/relay/backend/aot/aot_lower_main_test.cc
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "../../../../../src/relay/backend/aot/aot_lower_main.h"
+
+#include <gtest/gtest.h>
+#include <tvm/parser/parser.h>
+
+namespace tvm {
+namespace relay {
+namespace backend {
+namespace aot {
+
+TEST(AOTLowerMain, ExprAllocatorSkipNestedFunc) {
+  constexpr const char* mod_text = R"(
+      #[version = "0.0.5"]
+      def @main(%x: Tensor[(10, 10), float32]) {
+        %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32]) {
+          nn.relu(%FunctionVar_01)
+        };
+        %0(%x)
+      }
+    )";
+  IRModule mod = parser::ParseModule("string", mod_text, {}, {});
+  auto host_target = tvm::Target("llvm");
+  auto prim_target = tvm::Target(host_target, host_target);
+  auto ctxt = tvm::transform::PassContext::Current();
+  auto config = tvm::CompilationConfig(ctxt, {prim_target});
+  mod = tvm::relay::transform::PlanDevices(config)(mod);
+  mod = tvm::relay::transform::InferType()(mod);
+
+  StorageMap storage_map;
+  std::vector<int> return_sids;
+  auto func = Downcast<Function>(mod->Lookup("main"));
+  std::tie(storage_map, return_sids) = CreateStorage(func);
+
+  auto nested_func = Downcast<Function>(Downcast<Call>(func->body)->op);
+  EXPECT_EQ(storage_map.find(nested_func->body), storage_map.end());
+  EXPECT_EQ(storage_map.find(nested_func->params[0]), storage_map.end());
+  EXPECT_NE(storage_map.find(func->body), storage_map.end());
+  EXPECT_NE(storage_map.find(func->params[0]), storage_map.end());
+}
+
+}  // namespace aot
+}  // namespace backend
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/aot/test_pass_aot_lower_main.py b/tests/python/relay/aot/test_pass_aot_lower_main.py
new file mode 100644
index 000000000000..c583b287727a
--- /dev/null
+++ b/tests/python/relay/aot/test_pass_aot_lower_main.py
@@ -0,0 +1,429 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=line-too-long,missing-class-docstring,missing-module-docstring,missing-function-docstring,no-self-argument,unused-argument,invalid-name
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+from tvm.script import tir as T
+from tvm.relay.backend.aot import AOTLowerMain, CallType
+
+
+def _make_const(dtype, shape):
+    return tvm.relay.const(np.zeros(shape).astype(dtype))
+
+
+def _make_consts(dtype, shapes):
+    return [_make_const(dtype, shape) for shape in shapes]
+
+
+def _plan_devices(mod):
+    host_target = tvm.target.Target("llvm")
+    prim_target = tvm.target.Target("llvm", host=host_target)
+    ctxt = tvm.transform.PassContext()
+    config = tvm.target.make_compilation_config(ctxt, prim_target)
+    mod = tvm.relay.transform.PlanDevices(config)(mod)
+    mod = tvm.relay.transform.InferType()(mod)
+    return mod, config
+
+
+def _assert_lowered_main(mod, main_func, call_type, print_script=False):
+    mod, config = _plan_devices(mod)
+    mod = AOTLowerMain("test_mod", config, call_type)(mod)
+    if print_script:
+        print(mod["__tvm_main__"].script())
+
+    assert mod["__tvm_main__"].script() == main_func.script()
+
+
+def test_single_call_cpacked():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add(%x: Tensor[(5, 7), float32]) { %x }
+
+def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %0 = (%a,) /* ty=(Tensor[(5, 7), float32],) */;
+  call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.CPacked)
+
+
+def test_single_call_packed():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add(%x: Tensor[(5, 7), float32]) { %x }
+
+def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %0 = (%a,) /* ty=(Tensor[(5, 7), float32],) */;
+  call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        T.evaluate(T.tvm_check_return(0, -1, T.tvm_call_packed("test_fused_add", a_buffer.data, output_buffer.data, dtype="int32"), dtype="int32"))
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.Packed)
+
+
+def test_single_call_unpacked():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add(%x: Tensor[(5, 7), float32]) { %x }
+
+def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %0 = (%a,) /* ty=(Tensor[(5, 7), float32],) */;
+  call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        T.evaluate(T.tvm_check_return(0, -1, T.call_extern("test_fused_add", a_buffer.data, output_buffer.data, dtype="int32"), dtype="int32"))
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.Unpacked)
+
+
+def test_constant():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) { %x }
+
+def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %0 = (%a, meta[relay.Constant][0]) /* ty=(Tensor[(5, 7), float32], Tensor[(5, 7), float32]) */;
+  call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */
+}
+        """,
+        init_meta_table={"relay.Constant": _make_consts("float32", [(5, 7)])},
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "global_symbol": "test_mod___tvm_main__", "input_vars": [a], "output_vars": [output]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        constant_0 = T.allocate_const([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "float32", [5, 7])
+        T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, constant_0, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.CPacked)
+
+
+# TODO(@mbaret) There seems to be a TVMScript round-trip bug causing this to fail
+@pytest.mark.xfail()
+def test_copy_to_output():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %a
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        tmp_read = T.buffer_var("uint8", "")
+        # buffer definition
+        tmp_read_1 = T.buffer_decl([T.uint64(140)], dtype="uint8", data=tmp_read)
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        tmp_write: T.Ptr[T.uint8] = output_buffer.data
+        tmp_write_1 = T.buffer_decl([T.uint64(140)], dtype="uint8", data=tmp_write)
+        for i in T.serial(140):
+            tmp_write_1[i] = T.let(tmp_read, a_buffer.data, tmp_read_1[i])
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.CPacked)
+
+
+def test_two_calls():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add(%x: Tensor[(5, 7), float32]) { %x }
+
+def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %0 = (%a,) /* ty=(Tensor[(5, 7), float32],) */;
+  %1 = call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */;
+  %2 = (%1,) /* ty=(Tensor[(5, 7), float32],) */;
+  call_lowered(@test_fused_add, %2) /* ty=Tensor[(5, 7), float32] */
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        sid_2 = T.allocate([140], "int8", "global.workspace")
+        T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, sid_2, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("test_fused_add", sid_2, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.CPacked)
+
+
+def test_tuple_output():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add(%x: Tensor[(5, 7), float32]) { (%x, %x) }
+
+def @main(%a: Tensor[(5, 7), float32]) -> (Tensor[(5, 7), float32], Tensor[(5, 7), float32]) {
+  %0 = (%a,) /* ty=(Tensor[(5, 7), float32],) */;
+  call_lowered(@test_fused_add, %0) /* ty=(Tensor[(5, 7), float32], Tensor[(5, 7), float32]) */
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output0: T.handle, output1: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output0, output1]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output0_buffer = T.match_buffer(output0, [5, 7], dtype="float32", align=16)
+        output1_buffer = T.match_buffer(output1, [5, 7], dtype="float32", align=16)
+        # body
+        T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, output0_buffer.data, output1_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.CPacked)
+
+
+def test_tuple_intermediate():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add_0(%x: Tensor[(5, 7), float32]) -> (Tensor[(5, 7), float32], Tensor[(5, 7), float32]) { (%x, %x) }
+def @test_fused_add_1(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { %x }
+
+def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %0 = (%a,);
+  %1 = call_lowered(@test_fused_add_0, %0);
+  %2 = (%1.0, %1.1);
+  call_lowered(@test_fused_add_1, %2)
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        sid_3 = T.allocate([140], "int8", "global.workspace")
+        sid_2 = T.allocate([140], "int8", "global.workspace")
+        T.evaluate(T.tvm_call_cpacked("test_fused_add_0", a_buffer.data, sid_2, sid_3, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("test_fused_add_1", sid_2, sid_3, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.CPacked)
+
+
+def test_multi_input():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) { %x }
+
+def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %0 = (%a, %b) /* ty=(Tensor[(5, 7), float32], Tensor[(5, 7), float32]) */;
+  call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, b: T.handle, output: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a, b], "output_vars": [output]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        b_buffer = T.match_buffer(b, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, b_buffer.data, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.CPacked)
+
+
+def test_let_binding():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add(%x: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { %x }
+
+def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %0 = (%a,);
+  let %v1 = call_lowered(@test_fused_add, %0);
+  %v1
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.CPacked)
+
+
+def test_let_binding_branch():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add_0(%x: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { %x }
+def @test_fused_add_1(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { %x }
+
+def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %0 = (%a,);
+  let %v0 = call_lowered(@test_fused_add_0, %0);
+  %1 = (%v0,);
+  let %v1 = call_lowered(@test_fused_add_0, %1);
+  %2 = (%v1,);
+  let %v2 = call_lowered(@test_fused_add_0, %2);
+  %3 = (%v1, %v2);
+  let %v3 = call_lowered(@test_fused_add_1, %3);
+  %v3
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        sid_3 = T.allocate([140], "int8", "global.workspace")
+        sid_2 = T.allocate([140], "int8", "global.workspace")
+        sid_1 = T.allocate([140], "int8", "global.workspace")
+        T.evaluate(T.tvm_call_cpacked("test_fused_add_0", a_buffer.data, sid_1, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_1, sid_2, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_2, sid_3, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+        T.evaluate(T.tvm_call_cpacked("test_fused_add_1", sid_2, sid_3, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    _assert_lowered_main(mod, func, CallType.CPacked)
+
+
+def test_device_hooks():
+    mod = tvm.parser.parse(
+        """
+#[version = "0.0.5"]
+def @test_fused_add(%x: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { %x }
+
+def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+  %0 = (%a,);
+  %1 = call_lowered(@test_fused_add, %0);
+  %2 = (%1,);
+  call_lowered(@test_fused_add, %2)
+}
+        """,
+    )
+
+    # fmt: off
+    @T.prim_func
+    def func(a: T.handle, output: T.handle, device_context_example_target_hook: T.handle) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+        output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+        # body
+        T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookActivate", device_context_example_target_hook, dtype="int32"), dtype="int32"))
+        with T.allocate([140], "int8", "global.workspace") as sid_2:
+            T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookOpen", device_context_example_target_hook, dtype="int32"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, sid_2, device_context_example_target_hook, dtype="int32"))
+            T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookClose", device_context_example_target_hook, dtype="int32"), dtype="int32"))
+            T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookOpen", device_context_example_target_hook, dtype="int32"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add", sid_2, output_buffer.data, device_context_example_target_hook, dtype="int32"))
+            T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookClose", device_context_example_target_hook, dtype="int32"), dtype="int32"))
+        T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookDeactivate", device_context_example_target_hook, dtype="int32"), dtype="int32"))
+    # fmt: on
+
+    device_contexts = {}
+    for gv in mod.get_global_vars():
+        device_contexts[gv] = "example_target_hook"
+
+    mod = mod.with_attr("device_contexts", device_contexts)
+
+    _assert_lowered_main(mod, func, CallType.CPacked)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 2aa0d1fbfcf4a31e343cc6852fdc4abd660c850a Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Wed, 14 Sep 2022 15:18:03 +0530
Subject: [PATCH 162/704] [OpenCLML] More ops and network coverage (#12762)

Added operators pooling (avg, max), binary operators (add, subtract, multiply, min, max) and concat.
Clip operator with min=0 and max=6 is remapped to relu6 to take advantage of CLML acceleration
without sub graphing this to fallback path.

Added new test cases for above listed operators and also end-to-end network test cases for Resnet50
& InceptionV3.

CLML support FP16 arithmetic mode which gives significant performance boost over FP32. This PR
enhances FP16 usage based on Operator datatype in relay graph.

Co-authored-by: Krishna Raju quic_kvegiraj@quicinc.com
Co-authored-by: Shwetank Singh quic_shwesing@quicinc.com
---
 python/tvm/relay/op/contrib/clml.py           |  35 +-
 src/relay/backend/contrib/clml/codegen.cc     |  37 ++
 src/runtime/contrib/clml/clml_runtime.cc      | 315 +++++++++++++++---
 .../contrib/test_clml/infrastructure.py       |  28 +-
 .../python/contrib/test_clml/test_network.py  | 139 ++++++--
 tests/python/contrib/test_clml/test_ops.py    |  83 ++++-
 6 files changed, 529 insertions(+), 108 deletions(-)

diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py
index cacd10de2865..d253544d45d9 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -23,7 +23,7 @@
 from tvm.relay import transform
 from tvm.relay.build_module import bind_params_by_name
 
-from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item
+from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item, is_tuple
 from .register import register_pattern_table
 from ..strategy.generic import is_depthwise_conv2d
 
@@ -135,6 +135,7 @@ def conv_pattern():
         """Create a convolution pattern."""
         pattern = is_op("nn.conv2d")(wildcard(), is_constant())
         pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
+        pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
         pattern = pattern.optional(
             lambda x: is_op("nn.batch_norm")(
                 x, is_constant(), is_constant(), is_constant(), is_constant()
@@ -142,6 +143,7 @@ def conv_pattern():
         )
         pattern = pattern.optional(is_tuple_get_item)
         pattern = pattern.optional(is_op("nn.relu"))
+        pattern = pattern.optional(is_op("clip"))
         return pattern
 
     def batch_norm_pattern():
@@ -152,10 +154,24 @@ def batch_norm_pattern():
         pattern = is_tuple_get_item(pattern)
         return pattern
 
+    def concat_pattern():
+        """Create a concat pattern.
+
+        Returns
+        -------
+        pattern : dataflow_pattern.AltPattern
+            Denotes the concat pattern.
+        """
+        pattern = is_tuple(None)
+        pattern = is_op("concatenate")(pattern)
+
+        return pattern
+
     def dense_pattern():
         """Create a dense pattern."""
         pattern = is_op("nn.dense")(wildcard(), is_constant())
         pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
+        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
         return pattern
 
     def pad_pattern():
@@ -172,6 +188,13 @@ def check_conv(extract):
             call = call.args[0]
             if isinstance(call, tvm.relay.expr.TupleGetItem):
                 call = call.tuple_value
+        elif call.op.name == "clip":
+            if call.attrs["a_min"] != 0.0 or call.attrs["a_max"] != 6.0:
+                return False
+            call = call.args[0]
+            if isinstance(call, tvm.relay.expr.TupleGetItem):
+                call = call.tuple_value
+
         while call.op.name != "nn.conv2d":
             call = call.args[0]
         attrs, args = call.attrs, call.args
@@ -194,6 +217,7 @@ def check_conv(extract):
         ("clml.conv2d", conv_pattern(), check_conv),
         ("clml.dense", dense_pattern()),
         ("clml.pad", pad_pattern()),
+        ("clml.concat", concat_pattern()),
         ("clml.batch_norm", batch_norm_pattern()),
     ]
 
@@ -207,11 +231,18 @@ def _func_wrapper(expr):
 
 
 _register_external_op_helper("clip")
-_register_external_op_helper("relu")
+_register_external_op_helper("nn.relu")
 _register_external_op_helper("nn.global_avg_pool2d")
 _register_external_op_helper("nn.global_max_pool2d")
+_register_external_op_helper("nn.avg_pool2d")
+_register_external_op_helper("nn.max_pool2d")
 _register_external_op_helper("nn.softmax")
 _register_external_op_helper("reshape")
+_register_external_op_helper("add")
+_register_external_op_helper("subtract")
+_register_external_op_helper("multiply")
+_register_external_op_helper("minimum")
+_register_external_op_helper("maximum")
 
 
 class OpAttrContext(object):
diff --git a/src/relay/backend/contrib/clml/codegen.cc b/src/relay/backend/contrib/clml/codegen.cc
index fa082a423d78..b89f05e17857 100644
--- a/src/relay/backend/contrib/clml/codegen.cc
+++ b/src/relay/backend/contrib/clml/codegen.cc
@@ -91,6 +91,8 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
       json_node = CreateDenseJSONNode(cn);
     } else if (name == "clml.pad") {
       json_node = CreatePadJSONNode(cn);
+    } else if (name == "clml.concat") {
+      json_node = CreateConcatJSONNode(cn);
     } else {
       LOG(FATAL) << "Unrecognized CLML  pattern: " << name;
     }
@@ -148,6 +150,15 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
       } else {
         current_call = current_call->args[0].as<CallNode>();
       }
+    } else if (backend::IsOp(current_call, "clip")) {
+      nodes.activation = current_call;
+      nodes.act_type = "relu6";
+      if (current_call->args[0].as<TupleGetItemNode>()) {
+        auto tuple_item = current_call->args[0].as<TupleGetItemNode>();
+        current_call = tuple_item->tuple.as<CallNode>();
+      } else {
+        current_call = current_call->args[0].as<CallNode>();
+      }
     }
     if (backend::IsOp(current_call, "nn.batch_norm")) {
       nodes.bn = current_call;
@@ -279,6 +290,32 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
     return json_node;
   }
 
+  /*!
+   * \brief Create a JSON representation of a Concat operator.
+   *
+   * \param cn The call to be represented.
+   * \return A JSON representation of a specific operator.
+   */
+  std::shared_ptr<JSONGraphNode> CreateConcatJSONNode(const CallNode* cn) {
+    const auto* fn = cn->op.as<FunctionNode>();
+    ICHECK(fn);
+    const auto* concat = fn->body.as<CallNode>();
+
+    ICHECK(backend::IsOp(concat, "concatenate"));
+    const auto* concat_op = concat->op.as<OpNode>();
+    ICHECK(concat_op);
+    const std::string name = concat_op->name;
+
+    std::vector<JSONGraphNodeEntry> inputs;
+    for (auto arg : cn->args) {
+      inputs.push_back(VisitExpr(arg)[0]);
+    }
+
+    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 1);
+    SetCallNodeAttribute(json_node, concat);
+    return json_node;
+  }
+
   /*!
    * \brief Create a JSON representation of a Dense operator.
    *
diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index da41442ef91d..cdc3b9a7b51c 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -335,13 +335,15 @@ class CLMLRuntime : public JSONRuntimeBase {
     size_t nid;
     for (nid = 0; nid < nodes_.size(); ++nid) {
       const auto& node = nodes_[nid];
+      DLDataType tvm_dtype = node.GetOpDataType()[0];
+      cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
       if (node.GetOpType() == "input") {
-        auto clml_input = MakeCLMLTensorFromJSONNode(node);
+        auto clml_input = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
         this->layer_.storage_map.insert({nid, std::make_pair(clml_input, node)});
         this->layer_.inputs.push_back(clml_input);
         // Input copy placeholder Tensor
         this->layer_.in_placeholder.push_back(
-            MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM));
+            MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype));
       } else if (node.GetOpType() == "kernel") {
         auto op_name = node.GetOpName();
         if ("nn.conv2d" == op_name) {
@@ -364,6 +366,11 @@ class CLMLRuntime : public JSONRuntimeBase {
           auto out = CreateBatchNormLayer(&layer_, node);
           this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
           this->layer_.func_outs.push_back(out);
+        } else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
+                   "nn.l2_pool2d" == op_name) {
+          auto out = CreatePoolingLayer(&layer_, node);
+          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
+          this->layer_.func_outs.push_back(out);
         } else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name) {
           auto out = CreateGlobalPoolingLayer(&layer_, node);
           this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
@@ -372,6 +379,10 @@ class CLMLRuntime : public JSONRuntimeBase {
           auto out = CreateReshapeLayer(&layer_, node);
           this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
           this->layer_.func_outs.push_back(out);
+        } else if ("concatenate" == op_name) {
+          auto out = CreateConcatLayer(&layer_, node);
+          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
+          this->layer_.func_outs.push_back(out);
         } else if ("nn.dense" == op_name) {
           auto out = CreateDenseLayer(&layer_, node);
           this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
@@ -388,6 +399,11 @@ class CLMLRuntime : public JSONRuntimeBase {
           auto out = CreateClipLayer(&layer_, node);
           this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
           this->layer_.func_outs.push_back(out);
+        } else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name ||
+                   "minimum" == op_name || "maximum" == op_name) {
+          auto out = CreateBinaryLayer(&layer_, node);
+          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
+          this->layer_.func_outs.push_back(out);
         } else {
           LOG(FATAL) << "Unsupported op: " << op_name;
         }
@@ -396,10 +412,14 @@ class CLMLRuntime : public JSONRuntimeBase {
         LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
       }
     }
-    if (nid > 0) {
-      this->layer_.outputs.push_back(this->layer_.storage_map[nid - 1].first);
+
+    for (size_t i = 0; i < outputs_.size(); ++i) {
+      nid = outputs_[i].id_;
+      DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
+      cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+      this->layer_.outputs.push_back(this->layer_.storage_map[nid].first);
       this->layer_.out_placeholder.push_back(
-          MakeCLMLTensorFromJSONNode(nodes_[nid - 1], CL_TENSOR_LAYOUT_NCHW_QCOM));
+          MakeCLMLTensorFromJSONNode(nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype));
     }
     // ALlocate device memories and initialize the params if any
     cl_int result = 0;
@@ -558,6 +578,20 @@ class CLMLRuntime : public JSONRuntimeBase {
     }
   }
 
+  cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type,
+                                          const cl_channel_type& acc_type = CL_FLOAT) {
+    if (data_type == CL_FLOAT && acc_type == CL_FLOAT) {
+      return CL_ARITHMETIC_MODE_FP32_QCOM;
+    } else if (data_type == CL_HALF_FLOAT && acc_type == CL_FLOAT) {
+      return CL_ARITHMETIC_MODE_FP16_ACC32_QCOM;
+    } else if (data_type == CL_HALF_FLOAT && acc_type == CL_HALF_FLOAT) {
+      return CL_ARITHMETIC_MODE_FP16_QCOM;
+    } else {
+      LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime";
+      return CL_ARITHMETIC_MODE_FP32_QCOM;
+    }
+  }
+
   std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(
       const JSONGraphNode& tensor_rep, void* data, std::vector<size_t> c_shape,
       cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_uint dtype = CL_FLOAT) {
@@ -634,6 +668,9 @@ class CLMLRuntime : public JSONRuntimeBase {
     std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
     std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
     std::vector<cl_uint> clml_padding = GetVectorValues(padding);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
     if (!node.HasAttr("padding")) {
       clml_padding.resize(4);
       std::fill(clml_padding.begin(), clml_padding.end(), 0);
@@ -668,7 +705,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       has_act = true;
     }
     cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
-                                              CL_ARITHMETIC_MODE_FP32_QCOM};
+                                              cl_arithmetic_mode};
 
     // Collect inputs and outputs, handling nn.conv2d.
     std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
@@ -680,15 +717,15 @@ class CLMLRuntime : public JSONRuntimeBase {
     has_bias = (num_inputs == 3) || (num_inputs == 7);
     has_bn = (num_inputs == 6) || (num_inputs == 7);
     // Input
-    auto input = MakeCLMLTensorFromJSONEntry(inputs[0]);
-
+    auto input =
+        MakeCLMLTensorFromJSONEntry(inputs[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     // Weight
-    auto weight = MakeCLMLTensorFromJSONEntry(inputs[1]);
-
+    auto weight =
+        MakeCLMLTensorFromJSONEntry(inputs[1], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     // Bias
     auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
     if (has_bias) {
-      bias = MakeCLMLTensorFromJSONEntry(inputs[2]);
+      bias = MakeCLMLTensorFromJSONEntry(inputs[2], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     } else {
       cl_ml_tensor_desc_qcom desc = {};
       desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
@@ -698,7 +735,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       bias->tensor = layer_.unusedTensor;
     }
     // Output
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     cl_ml_op_convolution_desc_qcom conv_desc{mode,
                                              groups,
                                              4,
@@ -707,7 +744,7 @@ class CLMLRuntime : public JSONRuntimeBase {
                                              {clml_strides[0], clml_strides[1]},
                                              {clml_dilation[0], clml_dilation[1]},
                                              0,
-                                             CL_ARITHMETIC_MODE_FP32_QCOM};
+                                             cl_arithmetic_mode};
 
     cl_ml_op_qcom op = NULL;
     if (!has_bn) {
@@ -734,13 +771,16 @@ class CLMLRuntime : public JSONRuntimeBase {
       auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
       auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
       auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-      bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape);
-      bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape);
-      bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape);
-      bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape);
-
-      cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM,
-                                              CL_ARITHMETIC_MODE_FP32_QCOM};
+      bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape,
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+      bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape,
+                                            CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+      bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape,
+                                            CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+      bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape,
+                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+
+      cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
       if (!has_act) {
         result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM(
             workspace->context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor,
@@ -772,11 +812,15 @@ class CLMLRuntime : public JSONRuntimeBase {
       cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) {
     cl_int result = 0;
     cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
     cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
-                                              CL_ARITHMETIC_MODE_FP32_QCOM};
+                                              cl_arithmetic_mode};
 
     cl_ml_tensor_desc_qcom desc = {};
     desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
@@ -805,7 +849,11 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                                       const JSONGraphNode& node) {
     cl_int result = 0;
     cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
     int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
     auto bn_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
     std::vector<size_t> bn_shape = {1, 1, 1, 1};
@@ -814,15 +862,18 @@ class CLMLRuntime : public JSONRuntimeBase {
     auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
     auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
     auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-    bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape);
-    bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape);
-    bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape);
-    bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape);
+    bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape,
+                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape,
+                                          CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape,
+                                          CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape,
+                                         CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
-    cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM,
-                                            CL_ARITHMETIC_MODE_FP32_QCOM};
+    cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
 
     result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM(
         workspace->context, 0, &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor,
@@ -834,6 +885,61 @@ class CLMLRuntime : public JSONRuntimeBase {
     return output;
   }
 
+  /*!
+   * \brief Create a creating pooling layer.
+   *
+   * \note Currently global_max_pool2d and global_avg_pool2d are supported.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreatePoolingLayer(CachedLayer* layer,
+                                                                    const JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
+
+    std::vector<std::string> windows = node.GetAttr<std::vector<std::string>>("pool_size");
+    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
+    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
+    std::vector<cl_uint> clml_window = GetVectorValues(windows);
+    std::vector<cl_uint> clml_stride = GetVectorValues(strides);
+    std::vector<cl_uint> clml_padding = GetVectorValues(padding);
+
+    cl_ml_op_pooling_desc_qcom pool_desc = {
+        node.GetOpName() == "nn.max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
+                                            : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
+        4,  // reserved
+        {clml_padding[0], clml_padding[1]},
+        {clml_padding[2], clml_padding[3]},
+        {clml_stride[0], clml_stride[1]},
+        {clml_window[0], clml_window[1]},
+        CL_PROPAGATE_NAN_QCOM,
+        cl_arithmetic_mode,
+    };
+
+    cl_ml_tensor_desc_qcom desc = {};
+    cl_ml_tensor_qcom unusedTensor = NULL;
+    desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
+    result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &unusedTensor);
+    ICHECK(unusedTensor && result == CL_SUCCESS) << ":" << result;
+
+    result =
+        h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(workspace->context, 0, &pool_desc, input->tensor,
+                                                   unusedTensor, output->tensor, &op, tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result;
+
+    layer_.func_ins.push_back(input);
+    layer->function.push_back(op);
+    return output;
+  }
+
   /*!
    * \brief Create a global pooling layer.
    *
@@ -846,8 +952,12 @@ class CLMLRuntime : public JSONRuntimeBase {
       CachedLayer* layer, const JSONGraphNode& node) {
     cl_int result = 0;
     cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
     cl_ml_op_pooling_desc_qcom pool_desc = {
         node.GetOpName() == "nn.global_max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
@@ -858,7 +968,7 @@ class CLMLRuntime : public JSONRuntimeBase {
         {1, 1},
         {in_dims.w, in_dims.h},
         CL_PROPAGATE_NAN_QCOM,
-        CL_ARITHMETIC_MODE_FP32_QCOM,
+        cl_arithmetic_mode,
     };
 
     cl_ml_tensor_desc_qcom desc = {};
@@ -887,14 +997,17 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                                     const JSONGraphNode& node) {
     cl_int result = 0;
     cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
     auto out_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
-    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, CL_FLOAT, nullptr,
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype, nullptr,
                                              {out_dims.n, out_dims.c, 1, 1});
 
     cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM,
-                                               CL_SOFTMAX_MODE_INSTANCE_QCOM,
-                                               CL_ARITHMETIC_MODE_FP32_QCOM};
+                                               CL_SOFTMAX_MODE_INSTANCE_QCOM, cl_arithmetic_mode};
 
     result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(workspace->context, 0, &softmax_desc,
                                                  input->tensor, output->tensor, &op, tuning_cache);
@@ -915,8 +1028,12 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                                 const JSONGraphNode& node) {
     cl_int result = 0;
     cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
     std::string pad_mode = node.GetAttr<std::vector<std::string>>("pad_mode")[0];
     std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("pad_width");
@@ -936,7 +1053,7 @@ class CLMLRuntime : public JSONRuntimeBase {
         clml_pad_mode,
         {0, 0},
         {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0},
-        CL_ARITHMETIC_MODE_FP32_QCOM};
+        cl_arithmetic_mode};
 
     result = h_ClmlIntf->clCreateMLOpPadQCOM(workspace->context, 0, &pad_desc, input->tensor,
                                              output->tensor, &op, tuning_cache);
@@ -957,8 +1074,11 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                                     const JSONGraphNode& node) {
     cl_int result = 0;
     cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
     result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->context, 0, input->tensor,
                                                  output->tensor, &op, tuning_cache);
@@ -969,6 +1089,42 @@ class CLMLRuntime : public JSONRuntimeBase {
     return output;
   }
 
+  /*!
+   * \brief Create a concat layer.
+   *
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateConcatLayer(CachedLayer* layer,
+                                                                   const JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    std::vector<JSONGraphNodeEntry> input_ = node.GetInputs();
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    int inputSize = input_.size();
+    int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[inputSize];
+    for (int i = 0; i < inputSize; i++) {
+      auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[i], {},
+                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+      concatInputs[i] = input->tensor;
+    }
+    cl_ml_op_concat_desc_qcom concatDesc = {1, (cl_uint)inputSize, cl_arithmetic_mode};
+
+    result = h_ClmlIntf->clCreateMLOpConcatQCOM(workspace->context, 0, &concatDesc, concatInputs,
+                                                output->tensor, &op, tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << "Concat Error:" << result;
+
+    layer->function.push_back(op);
+
+    delete[] concatInputs;
+    return output;
+  }
+
   /*!
    * \brief Create a dense layer.
    *
@@ -980,21 +1136,27 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                                   const JSONGraphNode& node) {
     cl_int result = 0;
     cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto inp_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {1, inp_dims.c, 1, 1},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     auto wt_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
     bool has_bias = node.GetInputs().size() == 3 ? true : false;
-
-    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.n, wt_dims.c});
+    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.n, wt_dims.c},
+                                              CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
     if (has_bias) {
       auto bias_dims = get_tensor_dims(nodes_[node.GetInputs()[2].id_]);
-      bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 1, 1});
+      bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 1, 1},
+                                         CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     }
 
     cl_ml_op_fully_connected_desc_qcom fc_desc = {1, CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM,
-                                                  CL_ARITHMETIC_MODE_FP32_QCOM};
+                                                  cl_arithmetic_mode};
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
-    auto output = MakeCLMLTensorFromJSONNode(node);
     if (has_bias) {
       result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(
           workspace->context, 0, &fc_desc, input->tensor, weight->tensor, bias->tensor,
@@ -1021,15 +1183,17 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                                  const JSONGraphNode& node) {
     cl_int result = 0;
     cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     cl_float a_max = std::stof(node.GetAttr<std::vector<std::string>>("a_max")[0]);
     cl_float a_min = std::stof(node.GetAttr<std::vector<std::string>>("a_min")[0]);
 
-    cl_ml_op_clip_desc_qcom clip_desc = {CL_CLIP_BY_VALUE_QCOM,
-                                         {{a_max}, CL_FLOAT},
-                                         {{a_min}, CL_FLOAT},
-                                         CL_ARITHMETIC_MODE_FP32_QCOM};
+    cl_ml_op_clip_desc_qcom clip_desc = {
+        CL_CLIP_BY_VALUE_QCOM, {{a_max}, CL_FLOAT}, {{a_min}, CL_FLOAT}, cl_arithmetic_mode};
 
     result = h_ClmlIntf->clCreateMLOpClipQCOM(workspace->context, 0, &clip_desc, input->tensor,
                                               output->tensor, &op, tuning_cache);
@@ -1040,6 +1204,47 @@ class CLMLRuntime : public JSONRuntimeBase {
     return output;
   }
 
+  /*!
+   * \brief Create a Binary layer.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateBinaryLayer(CachedLayer* layer,
+                                                                   const JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input_a = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {},
+                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto input_b = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {},
+                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    std::string op_name = node.GetOpName();
+    cl_binary_op_qcom binary_op = CL_TENSOR_OP_ADD_QCOM;
+    if (op_name == "subtract")
+      binary_op = CL_TENSOR_OP_SUB_QCOM;
+    else if (op_name == "multiply")
+      binary_op = CL_TENSOR_OP_MUL_QCOM;
+    else if (op_name == "minimum")
+      binary_op = CL_TENSOR_OP_MIN_QCOM;
+    else if (op_name == "maximum")
+      binary_op = CL_TENSOR_OP_MAX_QCOM;
+    cl_ml_op_binary_desc_qcom add_desc = {
+        binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, cl_arithmetic_mode};
+
+    result = h_ClmlIntf->clCreateMLOpBinaryQCOM(workspace->context, 0, &add_desc, input_a->tensor,
+                                                input_b->tensor, output->tensor, &op, tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << op_name << " Node Error:" << result;
+
+    layer_.func_ins.push_back(input_a);
+    layer_.func_ins.push_back(input_b);
+    layer->function.push_back(op);
+    return output;
+  }
+
   /*!
    * \brief The network layers represented by acl functions.
    * \note Currently only supports a single layer.
diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py
index 0cf76079e8fb..08b11525ecd2 100644
--- a/tests/python/contrib/test_clml/infrastructure.py
+++ b/tests/python/contrib/test_clml/infrastructure.py
@@ -29,6 +29,7 @@
 from tvm.contrib import graph_executor
 from tvm.relay.op.contrib import clml
 from tvm.contrib import utils
+from tvm import autotvm
 from tvm.autotvm.measure import request_remote
 from tvm.relay.expr_functor import ExprMutator, Call
 
@@ -144,35 +145,28 @@ def skip_codegen_test():
         return True
 
 
-def build_module(mod, target, target_host, params=None, enable_clml=True):
+def build_module(mod, target, target_host, params=None, enable_clml=True, tune_log=""):
     """Build module with option to build for CLML."""
     if isinstance(mod, tvm.relay.expr.Call):
         mod = tvm.IRModule.from_expr(mod)
 
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-        if enable_clml:
-            mod = clml.partition_for_clml(mod, params)
-        relay.backend.te_compiler.get().clear()
-        # print("Build  Mod:", mod)
-        return relay.build(mod, target=target, target_host=target_host, params=params)
+    with autotvm.apply_history_best(tune_log):
+        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
+            if enable_clml:
+                mod = clml.partition_for_clml(mod, params)
+            relay.backend.te_compiler.get().clear()
+            return relay.build(mod, target=target, target_host=target_host, params=params)
 
 
 def build_and_run(
-    mod,
-    inputs,
-    outputs,
-    params,
-    device,
-    enable_clml=True,
-    no_runs=1,
-    config=None,
+    mod, inputs, outputs, params, device, enable_clml=True, no_runs=1, config=None, tune_log=""
 ):
     """Build and run the relay module."""
     if config is None:
         config = {}
 
     try:
-        libm = build_module(mod, device.target, device.target_host, params, enable_clml)
+        libm = build_module(mod, device.target, device.target_host, params, enable_clml, tune_log)
 
         clml_modules = extract_clml_modules(libm)
         for mod in clml_modules:
@@ -198,7 +192,7 @@ def build_and_run(
     for _ in range(no_runs):
         gen_module.run()
         out.append([gen_module.get_output(i) for i in range(outputs)])
-    time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=50)
+    time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=1)
     cost = time_f().mean
     print("%g secs/iteration\n" % cost)
     return out
diff --git a/tests/python/contrib/test_clml/test_network.py b/tests/python/contrib/test_clml/test_network.py
index 405f5782ff2e..95f3a45baf78 100644
--- a/tests/python/contrib/test_clml/test_network.py
+++ b/tests/python/contrib/test_clml/test_network.py
@@ -25,20 +25,13 @@
 from test_clml.infrastructure import skip_runtime_test, build_and_run, Device
 
 
-def _build_and_run_network(mod, params, inputs, data, device, atol, rtol):
+def _build_and_run_network(mod, params, inputs, data, device, atol, rtol, tvm_log=""):
     """Helper function to build and run a network."""
 
     outputs = []
     for clml in [True, False]:
         outputs.append(
-            build_and_run(
-                mod,
-                data,
-                1,
-                params,
-                device,
-                enable_clml=clml,
-            )[0]
+            build_and_run(mod, data, 1, params, device, enable_clml=clml, tune_log=tvm_log)[0][0]
         )
     return outputs
 
@@ -55,11 +48,7 @@ def _get_keras_model(keras_model, inputs_dict, data):
     def get_bottom_top_model(model, layer_name):
         layer = model.get_layer(layer_name)
         bottom_input = model.layers[0].input
-        bottom_output = bottom_input
-        for layer in model.layers:
-            bottom_output = layer(bottom_output)
-            if layer.name == layer_name:
-                break
+        bottom_output = layer.output
         bottom_model = Model(bottom_input, bottom_output)
         return bottom_model
 
@@ -81,6 +70,9 @@ def test_mobilenet():
 
     def get_model():
         from tensorflow.keras.applications import MobileNet
+        import tensorflow as tf
+
+        tf.keras.backend.clear_session()
 
         mobilenet = MobileNet(
             include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000
@@ -106,32 +98,113 @@ def get_model():
     )
 
     # test
-    print("OpenCL:", outputs[0][0].asnumpy().shape)
-    print("CLML:", outputs[1][0].asnumpy().shape)
+    print("OpenCL:", outputs[0].asnumpy().shape)
+    print("CLML:", outputs[1].asnumpy().shape)
 
-    opencl_sort = np.argsort(outputs[1][0].asnumpy()).flatten()
-    clml_sort = np.argsort(outputs[0][0].asnumpy()).flatten()
+    opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
+    clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
 
     tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5)
 
 
-"""
-    tvm.testing.assert_allclose(
-         ref_outputs, outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
-    print("OpenCL to Keras looks good")
-    tvm.testing.assert_allclose(
-         outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
-    print("OpenCL to CLML looks good")
-    exit(0)
+def test_inception_v3():
+    Device.load("test_config.json")
+
+    if skip_runtime_test():
+        return
+
+    device = Device()
+    dtype = "float16"
+
+    def get_model():
+        from tensorflow.keras.applications import InceptionV3
+        import tensorflow as tf
+
+        tf.keras.backend.clear_session()
+
+        inceptionV3 = InceptionV3(
+            include_top=True, weights=None, input_shape=(299, 299, 3), classes=1000
+        )
+        inputs = {inceptionV3.input_names[0]: ((1, 3, 299, 299), "float16")}
+
+        data = {}
+        np.random.seed(0)
+        for name, (shape, dtype) in inputs.items():
+            if dtype == "uint8":
+                low, high = 0, 1
+            else:
+                low, high = -2, 1
+            data[name] = np.random.uniform(low, high, shape).astype(dtype)
+
+        mod, params, ref_outputs = _get_keras_model(inceptionV3, inputs, data)
+        return mod, params, inputs, data, ref_outputs
+
+    mod, params, inputs, input_data, ref_outputs = get_model()
+    outputs = _build_and_run_network(
+        mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
+    )
+
+    opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
+    clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
+
+    tvm.testing.assert_allclose(opencl_sort[:5], clml_sort[:5], rtol=1e-5, atol=1e-5)
+
+
+def test_resnet50v2():
+    Device.load("test_config.json")
+
+    if skip_runtime_test():
+        return
+
+    device = Device()
+    dtype = "float16"
+
+    def get_model():
+        from tensorflow.keras.applications import ResNet50V2
+        import tensorflow as tf
+
+        tf.keras.backend.clear_session()
 
-    tvm.testing.assert_allclose(
-         ref_outputs.transpose(0, 3, 1, 2), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
-    print("OpenCL to Keras looks good")
-    tvm.testing.assert_allclose(
-         outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
-    print("OpenCL to CLML looks good")
-"""
+        model = ResNet50V2(include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000)
+        inputs_dict = {model.input_names[0]: ((1, 3, 224, 224), "float32")}
+
+        data = {}
+        np.random.seed(0)
+
+        for name, (shape, dtype) in inputs_dict.items():
+            if dtype == "uint8":
+                low, high = 0, 1
+            else:
+                low, high = -1, 1
+            data[name] = np.random.uniform(low, high, shape).astype(dtype)
+
+        """Convert Keras graph to relay."""
+        inputs = {}
+        for name, (shape, _) in inputs_dict.items():
+            inputs[model.input_names[0]] = shape
+
+        ref_outputs = model.predict(data["input_1"].transpose(0, 2, 3, 1))
+
+        mod, params = relay.frontend.from_keras(model, inputs, layout="NCHW")
+
+        return mod, params, inputs, data, ref_outputs
+
+    mod, params, inputs, input_data, ref_outputs = get_model()
+    outputs = _build_and_run_network(
+        mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
+    )
+
+    # test
+    print("OpenCL:", outputs[0].asnumpy().shape)
+    print("CLML:", outputs[1].asnumpy().shape)
+
+    opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
+    clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
+
+    tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5)
 
 
 if __name__ == "__main__":
     test_mobilenet()
+    test_resnet50v2()
+    test_inception_v3()
diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py
index 13f49d152714..d14a5ec6e90d 100644
--- a/tests/python/contrib/test_clml/test_ops.py
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -211,6 +211,87 @@ def test_batchnorm():
     )
 
 
+def test_concat():
+    Device.load("test_config.json")
+
+    if skip_runtime_test():
+        return
+
+    device = Device()
+    dtype = "float16"
+    in_shape_1 = (1, 16, 16, 16)
+    in_shape_2 = (1, 16, 16, 16)
+    a = relay.var("input_1", shape=in_shape_1, dtype=dtype)
+    b = relay.var("input_2", shape=in_shape_2, dtype=dtype)
+    low, high = -1, 1
+    inputs = {
+        "input_1": tvm.nd.array(np.random.uniform(-1, 1, in_shape_1).astype(dtype)),
+        "input_2": tvm.nd.array(np.random.uniform(-1, 1, in_shape_2).astype(dtype)),
+    }
+
+    params = {}
+    func = relay.concatenate((a, b), axis=1)
+    mod = IRModule.from_expr(func)
+
+    opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0]
+    clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0]
+
+    tvm.testing.assert_allclose(
+        clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+    )
+
+
+def test_avgpool():
+    Device.load("test_config.json")
+
+    if skip_runtime_test():
+        return
+
+    device = Device()
+    dtype = "float16"
+    trials = [
+        # input size         pool_size stride  paading
+        [(1, 64, 147, 147), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+        [(1, 192, 71, 71), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+        [(1, 288, 35, 35), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+        [(1, 768, 17, 17), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+        [(1, 2048, 17, 17), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+        [(1, 192, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+        [(1, 256, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+        [(1, 288, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+        [(1, 768, 17, 17), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+        [(1, 1280, 8, 8), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+    ]
+    params = {}
+    for (
+        input_shape,
+        pool_size,
+        stride,
+        padding,
+        pooling_type,
+    ) in trials:
+        a = relay.var("input_1", shape=input_shape, dtype=dtype)
+        input_arr = tvm.nd.array(np.random.uniform(-1, 1, input_shape).astype(dtype))
+        inputs = {
+            "input_1": input_arr,
+        }
+
+        if pooling_type == "max":
+            func = relay.nn.max_pool2d(a, pool_size=pool_size, strides=stride, padding=padding)
+        else:
+            func = relay.nn.avg_pool2d(a, pool_size=pool_size, strides=stride, padding=padding)
+        mod = IRModule.from_expr(func)
+
+        opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0]
+        clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0]
+
+        tvm.testing.assert_allclose(
+            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+        )
+
+
 if __name__ == "__main__":
     test_conv2d()
-    test_batchnorm()
+    # test_batchnorm()
+    test_avgpool()
+    test_concat()

From a40849342d250bd585e19434e4a2473fcf978bcb Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 14 Sep 2022 09:23:51 -0500
Subject: [PATCH 163/704] [Relay][TE] Use Relay parameter name to generated TE
 tensor name (#10516)

* [Relay][TE] Use Relay parameter name to generated TE tensor name

Previously, the TE placeholders representing relay function parameters
were all named `"placeholder"`, which could be difficult to follow
when debugging larger functions.
---
 .../ci_logs/resnet-18-NHWC-B1-cuda.json       | 50 +++++++++----------
 python/tvm/auto_scheduler/measure.py          | 17 +++++--
 .../tvm/auto_scheduler/relay_integration.py   |  5 +-
 .../contrib/ethosu/tir_to_cs_translator.py    |  2 +-
 src/relay/backend/te_compiler_cache.cc        |  9 ++--
 5 files changed, 48 insertions(+), 35 deletions(-)

diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json
index 7cb3a67067b0..c8b9f41a5ca9 100644
--- a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json
+++ b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json
@@ -1,26 +1,24 @@
-# Provide valid schedules for resnet-18 on GPU.
-# This is used to run the tutorial on the documentation web server.
-{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["SP", 4, 1, 1000, [40], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["FSP", 1, 1, 0, 1], ["AN", 1, 2, 6], ["CA", 1, 4, 0], ["AN", 4, 0, 5], ["PR", 1, 0, "auto_unroll_max_step$512"], ["PR", 3, 0, "auto_unroll_max_step$512"]]]], "r": [[4.87396e-06], 0, 1.30575, 1606984701], "v": "v0.5"}
-{"i": [["[\"9847f8cc0b305137f49f2c5c0c8ab25d\", 1, 512, 1000, 512, 1000, 1, 1000]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [1, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 16], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 32, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 32, [1], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"]]]], "r": [[2.25155e-05], 0, 1.5128, 1606984719], "v": "v0.5"}
-{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 512, 1, 1, 1, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [2], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [32], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$64"]]]], "r": [[3.91068e-06], 0, 1.63708, 1606984742], "v": "v0.5"}
-{"i": [["[\"ad6cecbf5d85cb1cda3c2bb7af170211\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 1, 1, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [2], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 16, [4, 1, 4, 1], 1], ["SP", 6, 15, 512, [2, 8, 1, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 2, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000190231], 0, 1.95863, 1606984773], "v": "v0.5"}
-{"i": [["[\"3a69f9fbc63760d99e36b4c17b3bfc57\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 16, [4, 2, 2, 1], 1], ["SP", 6, 15, 512, [1, 16, 2, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000218188], 0, 2.05807, 1606984806], "v": "v0.5"}
-{"i": [["[\"d730bcd28f0920f6b97245e2a11bd8d6\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [1], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 1, 8], 1], ["SP", 6, 15, 512, [1, 16, 1, 2], 1], ["SP", 6, 20, 512, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [2], 1], ["SP", 4, 4, 512, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000165484], 0, 2.76154, 1606984831], "v": "v0.5"}
-{"i": [["[\"f3b6c10fcc6ce01ff01add933e4d21e9\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 49, [1, 1, 1, 7], 1], ["SP", 6, 15, 256, [1, 128, 1, 2], 1], ["SP", 6, 20, 256, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000157488], 0, 2.05375, 1606984883], "v": "v0.5"}
-{"i": [["[\"b8b52b9be9df6102466a22a014c44c1f\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 7, 1], 1], ["SP", 6, 15, 256, [1, 32, 1, 2], 1], ["SP", 6, 20, 256, [8, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 224, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.00011824], 0, 1.84964, 1606984912], "v": "v0.5"}
-{"i": [["[\"d374e472bd9d8164892b9e28a0a8cb59\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 4, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 49, [1, 1, 7, 1], 1], ["SP", 6, 15, 256, [4, 8, 1, 1], 1], ["SP", 6, 20, 256, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [49], 1], ["SP", 4, 4, 256, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.67244e-05], 0, 1.93124, 1606984935], "v": "v0.5"}
-{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 28, 28, 128, 3, 3, 128, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [2, 7, 1, 1], 1], ["SP", 3, 10, 14, [1, 7, 2, 1], 1], ["SP", 3, 15, 256, [2, 2, 1, 4], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [4, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 96, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 36, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[9.20105e-05], 0, 1.88263, 1606984952], "v": "v0.5"}
-{"i": [["[\"c4500b4e2fd04e695c32d2f31bbdc14a\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 1, 7], 1], ["SP", 6, 15, 128, [1, 4, 1, 16], 1], ["SP", 6, 20, 128, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000102747], 0, 2.2858, 1606984979], "v": "v0.5"}
-{"i": [["[\"e4cdf917b876dbdd64488c3818d9c141\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 4], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 128, [2, 8, 1, 1], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000133211], 0, 2.07337, 1606985017], "v": "v0.5"}
-{"i": [["[\"dac19035dd5fe9424ee8617421b9c817\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 2, 7, 1], 1], ["SP", 6, 15, 128, [1, 2, 2, 2], 1], ["SP", 6, 20, 128, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 128, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000150142], 0, 1.90539, 1606985042], "v": "v0.5"}
-{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 56, 56, 64, 3, 3, 64, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 2, 1], 1], ["SP", 3, 10, 28, [1, 7, 2, 2], 1], ["SP", 3, 15, 128, [1, 8, 8, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 64, [4, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 576, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 360, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000101548], 0, 1.92449, 1606985059], "v": "v0.5"}
-{"i": [["[\"1e3c4211ffd2f2db91078ae4d04b779d\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 14, 1, 1], 1], ["SP", 6, 15, 64, [2, 2, 4, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [7], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[5.64548e-05], 0, 3.15692, 1606985088], "v": "v0.5"}
-{"i": [["[\"b818b53148cd450f86569dfc3e04cb8a\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 3, 2, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 4, 1], 1], ["SP", 6, 15, 64, [1, 8, 1, 4], 1], ["SP", 6, 20, 64, [1, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 64, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000135574], 0, 2.88002, 1606985120], "v": "v0.5"}
-{"i": [["[\"3ea73fb9b0364374730d09e068821f95\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 3], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 1, 4], 1], ["SP", 6, 15, 64, [1, 8, 1, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 48, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 96, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000115802], 0, 4.06441, 1606985158], "v": "v0.5"}
-{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$16"]]]], "r": [[2.00968e-05], 0, 1.53065, 1606985193], "v": "v0.5"}
-{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [1, 2, 7, 1], 1], ["SP", 3, 10, 112, [1, 7, 1, 1], 1], ["SP", 3, 15, 64, [1, 8, 4, 1], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [1, 7], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 84, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 273, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[7.14326e-05], 0, 2.05623, 1606985220], "v": "v0.5"}
-{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [1, 2, 2, 2], 1], ["SP", 3, 10, 56, [1, 7, 1, 2], 1], ["SP", 3, 15, 64, [1, 16, 1, 4], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 256, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.17113e-05], 0, 1.9863, 1606985239], "v": "v0.5"}
-{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 64, 1, 1, 64, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 1, 1], 1], ["SP", 3, 10, 28, [1, 2, 1, 7], 1], ["SP", 3, 15, 128, [8, 8, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 208, [2], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$16"]]]], "r": [[1.76965e-05], 0, 1.63284, 1606985253], "v": "v0.5"}
-{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 128, 1, 1, 128, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 1], 1], ["SP", 3, 10, 14, [2, 1, 7, 1], 1], ["SP", 3, 15, 256, [2, 64, 1, 2], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 52, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[3.05015e-05], 0, 1.59532, 1606985280], "v": "v0.5"}
-{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 256, 1, 1, 256, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 7, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [4, 128, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 2704, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[2.18808e-05], 0, 1.88033, 1606985298], "v": "v0.5"}
-{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 14, 14, 256, 3, 3, 256, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [7, 1, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 4, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 256, [8, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 144, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000190239], 0, 2.28266, 1606985323], "v": "v0.5"}
+{"i": [["[\"f19692ed81d032b1697c08adee62f9a5\", [1, 28, 28, 128], [4, 4, 128, 128], [1, 28, 28, 128], [1, 1, 1, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 128, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 2, 2], 1], ["SP", 6, 10, 196, [1, 49, 2, 1], 1], ["SP", 6, 15, 128, [4, 2, 1, 1], 1], ["SP", 6, 20, 128, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [4], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000186843], 0, 0.965096, 1650980656], "v": "v0.6"}
+{"i": [["[\"2d10de6646307f0e3e5cf4b31c20e69b\", [1, 56, 56, 64], [1, 1, 64, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [1, 2, 2, 1], 1], ["SP", 3, 10, 56, [1, 8, 1, 7], 1], ["SP", 3, 15, 64, [1, 16, 4, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [32, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 4, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 14, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.37742e-05], 0, 1.18571, 1650980663], "v": "v0.6"}
+{"i": [["[\"a3df19e5b88592ef5a9ce584a1ca3010\", [1, 7, 7, 512], [4, 4, 512, 512], [1, 7, 7, 512], [1, 1, 1, 512], [1, 1, 1, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [1], 1], ["SP", 8, 4, 512, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 2, 1], 1], ["SP", 6, 5, 4, [2, 1, 2, 1], 1], ["SP", 6, 10, 16, [1, 8, 2, 1], 1], ["SP", 6, 15, 512, [2, 8, 1, 2], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [2], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000317285], 0, 0.910312, 1650980674], "v": "v0.6"}
+{"i": [["[\"0fad1b42d0d33418e0a8d15d3bbad3c9\", [1, 56, 56, 64], [1, 1, 64, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 2, 2], 1], ["SP", 3, 10, 28, [2, 7, 1, 2], 1], ["SP", 3, 15, 128, [2, 8, 4, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 21, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000229624], 0, 0.97359, 1650980681], "v": "v0.6"}
+{"i": [["[\"0bcf718c0e6566bcd6c3b1437a3b6291\", [1, 28, 28, 128], [4, 4, 128, 128], [1, 1, 1, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 2, 1], 1], ["SP", 6, 10, 196, [1, 7, 4, 1], 1], ["SP", 6, 15, 128, [1, 8, 2, 1], 1], ["SP", 6, 20, 128, [1, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 4, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000142796], 0, 0.851287, 1650980693], "v": "v0.6"}
+{"i": [["[\"1097323f3970e5c881ad3a0028ca79cb\", [1, 14, 14, 256], [4, 4, 256, 256], [1, 1, 1, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 2, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 1], 1], ["SP", 6, 10, 49, [7, 1, 1, 1], 1], ["SP", 6, 15, 256, [1, 64, 1, 2], 1], ["SP", 6, 20, 256, [1, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [1], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 4, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 2, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000289122], 0, 1.35723, 1650980705], "v": "v0.6"}
+{"i": [["[\"d78e8eb6021c4cdda0ad7775d10f751a\", [1, 7, 7, 512], [4, 4, 512, 512], [1, 7, 7, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [2, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 2], 1], ["SP", 6, 10, 16, [4, 1, 1, 2], 1], ["SP", 6, 15, 512, [1, 8, 1, 2], 1], ["SP", 6, 20, 512, [2, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000182145], 0, 0.954184, 1650980716], "v": "v0.6"}
+{"i": [["[\"7c2a4f1f432f81c44985590780dfb52d\", [1, 56, 56, 64], [6, 6, 64, 64], [1, 1, 1, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 2], 1], ["SP", 6, 5, 6, [2, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 2, 98, 1], 1], ["SP", 6, 15, 64, [2, 16, 1, 1], 1], ["SP", 6, 20, 64, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 64, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 392, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00029727], 0, 2.54044, 1650980730], "v": "v0.6"}
+{"i": [["[\"64b7ce5264a64cb340d78b444b0325e6\", [1, 14, 14, 256], [4, 4, 256, 256], [1, 14, 14, 256], [1, 1, 1, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [2, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 1], 1], ["SP", 6, 10, 49, [1, 1, 7, 7], 1], ["SP", 6, 15, 256, [4, 16, 2, 1], 1], ["SP", 6, 20, 256, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [8], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000645288], 0, 3.306, 1650980745], "v": "v0.6"}
+{"i": [["[\"be3babb9a46e32f66b717a3e2a2d522c\", [1, 7, 7, 512], [1, 1, 1, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [32], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$64"]]]], "r": [[3.49558e-06], 0, 0.880265, 1650980753], "v": "v0.6"}
+{"i": [["[\"7d79c516e212fe1d73f5dbb90eaca2cf\", [1, 1000], [1, 1000]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["SP", 4, 1, 1000, [20], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["AN", 4, 0, 5], ["AN", 1, 0, 6], ["PR", 1, 0, "auto_unroll_max_step$0"], ["PR", 3, 0, "auto_unroll_max_step$16"]]]], "r": [[1.66218e-05], 0, 1.00389, 1650980756], "v": "v0.6"}
+{"i": [["[\"40b1cf1fd37b0ef111b3cc0247302508\", [1, 7, 7, 512], [4, 4, 512, 512], [1, 1, 1, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [8], 1], ["SP", 8, 4, 512, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 16, [1, 4, 1, 4], 1], ["SP", 6, 15, 512, [1, 128, 1, 1], 1], ["SP", 6, 20, 512, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00019327], 0, 0.828601, 1650980768], "v": "v0.6"}
+{"i": [["[\"0fad1b42d0d33418e0a8d15d3bbad3c9\", [1, 28, 28, 128], [1, 1, 128, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 2, 1, 1], 1], ["SP", 3, 10, 14, [1, 1, 2, 1], 1], ["SP", 3, 15, 256, [4, 8, 1, 4], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [2, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 32, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 24, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.57402e-05], 0, 1.28219, 1650980774], "v": "v0.6"}
+{"i": [["[\"25577781e50c611c2e45e73c1cb3a6ca\", [1, 28, 28, 128], [4, 4, 128, 128], [1, 28, 28, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [7], 1], ["SP", 8, 4, 128, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 2, 1, 2], 1], ["SP", 6, 10, 196, [7, 7, 2, 2], 1], ["SP", 6, 15, 128, [1, 4, 2, 1], 1], ["SP", 6, 20, 128, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [7], 1], ["SP", 4, 4, 128, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000287883], 0, 1.48484, 1650980787], "v": "v0.6"}
+{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 28, 28, 128], [3, 3, 128, 256], [1, 1, 1, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 2, 1], 1], ["SP", 3, 10, 14, [1, 1, 2, 7], 1], ["SP", 3, 15, 256, [1, 16, 1, 1], 1], ["SP", 3, 20, 3, [1, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 128, [4, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 648, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000155192], 0, 0.945575, 1650980794], "v": "v0.6"}
+{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 14, 14, 256], [3, 3, 256, 512], [1, 1, 1, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 7, 1, 1], 1], ["SP", 3, 10, 7, [7, 1, 1, 1], 1], ["SP", 3, 15, 512, [1, 16, 2, 8], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 256, [1, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 576, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 36, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00083305], 0, 2.13994, 1650980802], "v": "v0.6"}
+{"i": [["[\"6c4f6234946e16bcf9e48bdf289f9200\", [1, 56, 56, 64], [6, 6, 64, 64], [1, 56, 56, 64], [1, 1, 1, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 64, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [2, 1, 3, 1], 1], ["SP", 6, 5, 6, [1, 6, 1, 1], 1], ["SP", 6, 10, 196, [1, 28, 1, 1], 1], ["SP", 6, 15, 64, [1, 1, 1, 4], 1], ["SP", 6, 20, 64, [4, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 96, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 24, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000112836], 0, 1.67377, 1650980816], "v": "v0.6"}
+{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 224, 224, 3], [7, 7, 3, 64], [1, 1, 1, 64], [1, 112, 112, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [1, 2, 28, 1], 1], ["SP", 3, 10, 112, [7, 1, 1, 1], 1], ["SP", 3, 15, 64, [1, 32, 1, 1], 1], ["SP", 3, 20, 7, [1, 7], 1], ["SP", 3, 23, 7, [7, 1], 1], ["SP", 3, 26, 3, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 49, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 91, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000303402], 0, 1.28361, 1650980824], "v": "v0.6"}
+{"i": [["[\"10b8215aaf2e14d47d40b4093e6f41a0\", [1, 56, 56, 64], [6, 6, 64, 64], [1, 56, 56, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [7], 1], ["SP", 8, 4, 64, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 1], 1], ["SP", 6, 10, 196, [1, 14, 1, 1], 1], ["SP", 6, 15, 64, [8, 2, 2, 1], 1], ["SP", 6, 20, 64, [4, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 64, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[6.50144e-05], 0, 3.25197, 1650980839], "v": "v0.6"}
+{"i": [["[\"7f3fee61bc3c2604395f5d343b840b7c\", [1, 14, 14, 256], [4, 4, 256, 256], [1, 14, 14, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [49], 1], ["SP", 8, 4, 256, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 2], 1], ["SP", 6, 10, 49, [1, 7, 1, 1], 1], ["SP", 6, 15, 256, [2, 32, 4, 1], 1], ["SP", 6, 20, 256, [4, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000233087], 0, 0.828703, 1650980851], "v": "v0.6"}
+{"i": [["[\"0fad1b42d0d33418e0a8d15d3bbad3c9\", [1, 14, 14, 256], [1, 1, 256, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 1, 1], 1], ["SP", 3, 10, 7, [7, 1, 1, 1], 1], ["SP", 3, 15, 512, [2, 128, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [8, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 208, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.42677e-05], 0, 1.28859, 1650980857], "v": "v0.6"}
+{"i": [["[\"affd3c4a65f665e451a06d65bf32750d\", [1, 112, 112, 64], [1, 1, 1, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [1], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [2], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000265616], 0, 0.615762, 1650980871], "v": "v0.6"}
+{"i": [["[\"00a059b856ac30ac172b6252254479a6\", [1, 512], [1000, 512], [1, 1000], [1, 1000]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [4, 50, 1, 1], 1], ["SP", 2, 10, 512, [2, 4], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 32, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 8, [2], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"]]]], "r": [[7.0238e-05], 0, 0.673282, 1650980874], "v": "v0.6"}
+{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 56, 56, 64], [3, 3, 64, 128], [1, 1, 1, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 2, 7, 2], 1], ["SP", 3, 10, 28, [2, 7, 1, 2], 1], ["SP", 3, 15, 128, [1, 8, 8, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 64, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [4], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 145, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00116892], 0, 1.6731, 1650980882], "v": "v0.6"}
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index 6f331499b042..e59e78f57154 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -780,7 +780,7 @@ def register(myf):
     return register
 
 
-def prepare_input_map(args):
+def prepare_input_map(args, workload_key=None):
     """This function deals with special task inputs. Map the input Tensor of a TVM subgraph
     to a specific buffer name in the global buffer map.
 
@@ -789,6 +789,11 @@ def prepare_input_map(args):
     args : List[Tensor]
         Input/output Tensor of a TVM subgraph.
 
+    workload_key: Optional[str]
+        The workload for which these inputs are being prepared.  This
+        is used to identify if an input is being provided by (see
+        `register_task_input_buffer`).
+
     Returns
     -------
     Dict[Tensor, str] :
@@ -803,13 +808,19 @@ def prepare_input_map(args):
 
     global TASK_INPUT_CHECK_FUNC_REGISTRY
 
+    from .search_task import TASK_INPUT_BUFFER_TABLE
+
     # A dict that maps the input tensor arg to a buffer name
     tensor_input_map = {}
 
     # Case 0: Check placeholder name
     for arg in args:
         if isinstance(arg.op, tvm.te.PlaceholderOp):
-            if arg.op.name != "placeholder":
+            if (
+                workload_key
+                and workload_key in TASK_INPUT_BUFFER_TABLE
+                and arg.op.name in TASK_INPUT_BUFFER_TABLE[workload_key]
+            ):
                 tensor_input_map[arg] = arg.op.name
 
     # Case 1: Check specific tensor inputs
@@ -843,7 +854,7 @@ def prepare_runner_args(inp, build_res):
     from .search_task import get_task_input_buffer  # lazily import to avoid recursive dependency
 
     task_input_names = inp.task.task_input_names
-    tensor_input_map = prepare_input_map(build_res.args)
+    tensor_input_map = prepare_input_map(build_res.args, inp.task.workload_key)
     if not task_input_names:
         tensor_input_map = {}
     args = []
diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index 9541232a6a38..52c7f44fcede 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -336,7 +336,8 @@ def auto_schedule_topi(func_name, outs):
         logger.info("Failed to create a ComputeDAG for auto_scheduler: %s", str(err))
         return None
 
-    key = register_workload_tensors(dag.workload_key(), io_tensors)
+    workload_key = dag.workload_key()
+    key = register_workload_tensors(workload_key, io_tensors)
     target = tvm.target.Target.current()
 
     dispatch_ctx = DispatchContext.current
@@ -356,7 +357,7 @@ def auto_schedule_topi(func_name, outs):
         # in the task extraction mode
         if has_complex_op or env.tracing_mode == TracingMode.EXTRACT_TASK:
             env.add_workload_key(func_name, key)
-            input_map = prepare_input_map(io_tensors)
+            input_map = prepare_input_map(io_tensors, workload_key)
             if input_map:
                 env.add_workload_input_names(key, list(input_map.values()))
     elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE:
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py
index a3d46170dfca..f5c8994bec77 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py
@@ -268,7 +268,7 @@ def extract_param_base_addresses(mod, buffer_info, scratch_region_map) -> List[u
         size_bytes = element_size_bytes * np.prod(list(buffer.shape))
         base_addresses.append(
             util.BaseAddress(
-                param.name,
+                param.name.replace("-", "_"),
                 idx,
                 _get_region(buffer_info[param].btype, param, scratch_region_map),
                 size_bytes,
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index 1d7566ebe2bd..a8eb6a58105f 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -131,7 +131,8 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
     for (Var param : relay_func->params) {
       Array<tvm::te::Tensor> inputs;
       for (const auto& ttype : FlattenTupleType(param->checked_type())) {
-        tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype);
+        tvm::te::Tensor tensor =
+            tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype, param->vid->name_hint);
         inputs.push_back(tensor);
         fn_inputs_.push_back(tensor);
       }
@@ -478,7 +479,8 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
       for (const auto& ttype : FlattenTupleType(param->checked_type())) {
         // Add data placeholder (in case we discover we need it below)
         Shape shape = GetShape(ttype->shape);
-        tvm::te::Tensor data_tensor = tvm::te::placeholder(shape, ttype->dtype);
+        tvm::te::Tensor data_tensor =
+            tvm::te::placeholder(shape, ttype->dtype, "data_" + param->vid->name_hint);
         data_inputs.push_back(data_tensor);
         // Add shape placeholder (in case we discover we need it below)
         int64_t ndim = shape.size();
@@ -486,7 +488,8 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
         if (ndim > 0) {
           sshape.push_back(tvm::Integer(ndim));
         }
-        tvm::te::Tensor shape_tensor = tvm::te::placeholder(sshape, DataType::Int(64));
+        tvm::te::Tensor shape_tensor =
+            tvm::te::placeholder(sshape, DataType::Int(64), "shape_" + param->vid->name_hint);
         shape_inputs.push_back(shape_tensor);
       }
       param_data_[param] = data_inputs;

From a0cbefbe9568468a35bc3dce7d23a143da3008b8 Mon Sep 17 00:00:00 2001
From: Elen Kalda <elen.kalda@arm.com>
Date: Wed, 14 Sep 2022 17:16:57 +0100
Subject: [PATCH 164/704] [CI] Set USE_CMSISNN and USE_ETHOSU off in
 task_config_build_cpu.sh (#12456)

The dependencies for these have moved into ci_cortexm Docker
image, so there is not much point in building them for ci_cpu as we
can't run the associated tests.
---
 tests/scripts/task_config_build_cpu.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index 7f48839f23c0..8d5a2a95bb89 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -51,11 +51,11 @@ echo set\(USE_TENSORFLOW_PATH \"/tensorflow\"\) >> config.cmake
 echo set\(USE_FLATBUFFERS_PATH \"/flatbuffers\"\) >> config.cmake
 echo set\(USE_ETHOSN /opt/arm/ethosn-driver\) >> config.cmake
 echo set\(USE_ETHOSN_HW OFF\) >> config.cmake
-echo set\(USE_CMSISNN ON\) >> config.cmake
+echo set\(USE_CMSISNN OFF\) >> config.cmake
 echo set\(USE_VITIS_AI ON\) >> config.cmake
 echo set\(USE_VERILATOR ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
-echo set\(USE_ETHOSU ON\) >> config.cmake
+echo set\(USE_ETHOSU OFF\) >> config.cmake
 echo set\(USE_UMA ON\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake

From 3d7439eb0bf3d0a2253e7011b7f115499b7f4f33 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Wed, 14 Sep 2022 12:36:10 -0700
Subject: [PATCH 165/704] [TVMScript] IRBuilder methods for `PrimFunc` (#12755)

This PR introduces remaining IRBuilder methods for `PrimFunc`.

Co-authored-by: yongwww <yongcale@gmail.com>
---
 include/tvm/script/ir_builder/tir/ir.h        | 126 ++++
 python/tvm/script/ir_builder/tir/ir.py        | 629 +++++++++++++++++-
 src/script/ir_builder/tir/ir.cc               | 194 ++++++
 src/script/ir_builder/tir/utils.h             |  32 +
 .../unittest/test_tvmscript_ir_builder_tir.py |  44 +-
 5 files changed, 1022 insertions(+), 3 deletions(-)

diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
index 615ce90383dd..aaa5442eede3 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -28,12 +28,111 @@ namespace script {
 namespace ir_builder {
 namespace tir {
 
+using tvm::tir::Buffer;
+using tvm::tir::Var;
+
+/*!
+ * \brief The buffer declaration function.
+ * \param shape The type of the buffer prior to flattening.
+ * \param dtype The data type in the content of the buffer.
+ * \param buffer_name The name of the buffer.
+ * \param data The pointer to the head of the data.
+ * \param strides The strides of each dimension.
+ * \param elem_offset The offset in terms of number of dtype elements (including lanes).
+ * \param storage_scope The optional storage scope of buffer data pointer.
+ * \param align The alignment requirement of data pointer in bytes.
+ * \param offset_factor The factor of elem_offset field.
+ * \param buffer_type The buffer type.
+ * \param axis_separators The separators between input axes when generating flattened output axes.
+ * \return The declared buffer.
+ */
+Buffer BufferDecl(Array<PrimExpr> shape, DataType dtype, String buffer_name, Optional<Var> data,
+                  Optional<Array<PrimExpr>> strides, Optional<PrimExpr> elem_offset,
+                  String storage_scope, int align, int offset_factor, String buffer_type,
+                  Optional<Array<IntImm>> axis_separators);
+
 /*!
  * \brief The primitive function statement.
  * \return The PrimFuncFrame.
  */
 PrimFuncFrame PrimFunc();
 
+/*!
+ * \brief The PrimFunc variable arguments adding function.
+ * \param name The name of the variable.
+ * \param var The variable argument.
+ * \return The variable.
+ */
+Var Arg(String name, Var var);
+
+/*!
+ * \brief The PrimFunc buffer arguments adding function.
+ * \param name The name of the buffer.
+ * \param buffer The buffer argument.
+ * \return The buffer.
+ */
+Buffer Arg(String name, Buffer buffer);
+
+/*!
+ * \brief The PrimFunc naming statement.
+ * \param name The name of the PrimFunc.
+ */
+void FuncName(String name);
+
+/*!
+ * \brief The PrimFunc annotation statement.
+ * \param attrs The annotations of the PrimFunc.
+ */
+void FuncAttrs(Map<String, ObjectRef> attrs);
+
+/*!
+ * \brief The PrimFunc return type statement.
+ * \param ret_type The return type of the PrimFunc.
+ * \return The return type.
+ */
+Type FuncRet(Type ret_type);
+
+/*!
+ * \brief The buffer match statement.
+ * \param param The parameter of the PrimFunc to match.
+ * \param shape The type of the buffer prior to flattening.
+ * \param dtype The data type in the content of the buffer.
+ * \param data The pointer to the head of the data.
+ * \param strides The strides of each dimension.
+ * \param elem_offset The offset in terms of number of dtype elements (including lanes).
+ * \param storage_scope The optional storage scope of buffer data pointer.
+ * \param align The alignment requirement of data pointer in bytes.
+ * \param offset_factor The factor of elem_offset field.
+ * \param buffer_type The buffer type.
+ * \param axis_separators The separators between input axes when generating flattened output axes.
+ * \return The matched buffer.
+ */
+Buffer MatchBuffer(ObjectRef param, Array<PrimExpr> shape, DataType dtype = DataType::Float(32),
+                   Optional<Var> data = NullOpt, Array<PrimExpr> strides = {},
+                   PrimExpr elem_offset = PrimExpr(), String storage_scope = "global",
+                   int align = -1, int offset_factor = 0, String buffer_type = "default",
+                   Array<IntImm> axis_separators = {});
+
+/*!
+ * \brief The pre-flattened buffer statement.
+ * \param postflattened_buffer The original buffer to be flattened.
+ * \param shape The type of the buffer prior to flattening.
+ * \param dtype The data type in the content of the buffer.
+ * \param data The pointer to the head of the data.
+ * \param strides The strides of each dimension.
+ * \param elem_offset The offset in terms of number of dtype elements (including lanes).
+ * \param storage_scope The optional storage scope of buffer data pointer.
+ * \param align The alignment requirement of data pointer in bytes.
+ * \param offset_factor The factor of elem_offset field.
+ * \param buffer_type The buffer type.
+ * \param axis_separators The separators between input axes when generating flattened output axes.
+ */
+void PreflattenedBuffer(Buffer postflattened_buffer, Array<PrimExpr> shape,
+                        DataType dtype = DataType::Float(32), Optional<Var> data = NullOpt,
+                        Array<PrimExpr> strides = {}, PrimExpr elem_offset = PrimExpr(),
+                        String storage_scope = "global", int align = -1, int offset_factor = 0,
+                        String buffer_type = "default", Array<IntImm> axis_separators = {});
+
 /*!
  * \brief The block declaration statement.
  * \param name The name of the block.
@@ -48,6 +147,33 @@ BlockFrame Block(String name, bool no_realize = false);
  */
 void Evaluate(PrimExpr value);
 
+#define TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(FuncName, DType)                             \
+  inline PrimExpr FuncName(Optional<PrimExpr> expr = NullOpt) {                        \
+    DataType dtype = DType;                                                            \
+    return expr.defined() ? tvm::cast(dtype, expr.value()) : tvm::tir::Var("", dtype); \
+  }
+
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int8, DataType::Int(8));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int16, DataType::Int(16));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32, DataType::Int(32));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int64, DataType::Int(64));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt8, DataType::UInt(8));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt16, DataType::UInt(16));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt32, DataType::UInt(32));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt64, DataType::UInt(64));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float8, DataType::Float(8));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float16, DataType::Float(16));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float32, DataType::Float(32));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float64, DataType::Float(64));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32x4, DataType::Int(32, 4));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32x8, DataType::Int(32, 8));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32x16, DataType::Int(32, 16));
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Boolean, DataType::Bool());
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Handle, DataType::Handle());
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Void, DataType::Void());
+
+#undef TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST
+
 }  // namespace tir
 }  // namespace ir_builder
 }  // namespace script
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index 7ba2f6df9418..63fd1291f4bc 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -17,11 +17,89 @@
 # pylint: disable=missing-docstring
 """IRBuilder for TIR"""
 
-from tvm.tir import PrimExpr, StringImm
+from numbers import Integral
+from typing import Any, Dict, List, Optional, Union, Tuple
+
+from tvm.ir import Type
+from tvm.tir import (
+    Buffer,
+    BufferLoad,
+    BufferRegion,
+    PrimExpr,
+    StringImm,
+    Var,
+)
 
 from . import _ffi_api, frame
 
 
+def buffer_decl(
+    shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral],
+    dtype: str = "float32",
+    data: Var = None,
+    strides: List[PrimExpr] = None,
+    elem_offset: PrimExpr = None,
+    scope: str = "",
+    align: int = 0,
+    offset_factor: int = 0,
+    buffer_type: str = "",
+    axis_separators: List[int] = None,
+) -> Buffer:
+    """The buffer declaration function.
+
+    Parameters
+    ----------
+    shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral]
+        The type of the buffer prior to flattening.
+
+    dtype : str
+        The data type in the content of the buffer.
+
+    data : Var
+        The pointer to the head of the data.
+
+    strides : List[PrimExpr]
+        The strides of each dimension.
+
+    elem_offset : PrimExpr
+        The offset in terms of number of dtype elements (including lanes).
+
+    scope : str
+        The optional storage scope of buffer data pointer.
+
+    align : int
+        The alignment requirement of data pointer in bytes.
+
+    offset_factor : int
+        The factor of elem_offset field.
+
+    buffer_type : str
+        The buffer type.
+
+    axis_separators : List[int]
+        The separators between input axes when generating flattened output axes.
+
+    Returns
+    -------
+    res : Buffer
+        The declared buffer.
+    """
+    shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape
+    return _ffi_api.BufferDecl(  # pylint: disable=no-member # type: ignore
+        shape,
+        dtype,
+        "",
+        data,
+        strides,
+        elem_offset,
+        scope,
+        align,
+        offset_factor,
+        buffer_type,
+        axis_separators,
+    )
+
+
 def prim_func() -> frame.PrimFuncFrame:
     """The primitive function statement.
 
@@ -33,6 +111,220 @@ def prim_func() -> frame.PrimFuncFrame:
     return _ffi_api.PrimFunc()  # pylint: disable=no-member # type: ignore
 
 
+def arg(name: str, obj: Union[Var, Buffer]) -> Union[Var, Buffer]:
+    """The PrimFunc arguments adding function.
+
+    Parameters
+    ----------
+    name : str
+        The name of the argument.
+
+    var : Union[Var, Buffer]
+        The argument of Var or Buffer.
+
+    Returns
+    -------
+    res : Union[Var, Buffer]
+        The argument.
+    """
+    return _ffi_api.Arg(name, obj)  # pylint: disable=no-member # type: ignore
+
+
+def func_name(name: str) -> None:
+    """The PrimFunc naming statement.
+
+    Parameters
+    ----------
+    name : str
+        The name of the PrimFunc.
+    """
+    _ffi_api.FuncName(name)  # pylint: disable=no-member # type: ignore
+
+
+def func_attr(attrs: Dict[str, Any]) -> None:
+    """The PrimFunc annotation statement.
+
+    Parameters
+    ----------
+    attrs : Dict[str, Any]
+        The annotations of the PrimFunc.
+    """
+    _ffi_api.FuncAttrs(attrs)  # pylint: disable=no-member # type: ignore
+
+
+def func_ret(ret_type: Type) -> Type:
+    """The PrimFunc return type statement.
+
+    Parameters
+    ----------
+    ret_type : Type
+        The return type of the PrimFunc.
+
+    Returns
+    -------
+    res : Type
+        The return type.
+    """
+    return _ffi_api.FuncRet(ret_type)  # pylint: disable=no-member # type: ignore
+
+
+def match_buffer(
+    param: Union[Var, BufferLoad, BufferRegion],
+    shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral],
+    dtype: str = "float32",
+    data: Var = None,
+    strides: List[PrimExpr] = None,
+    elem_offset: PrimExpr = None,
+    scope: str = "global",
+    align: int = -1,
+    offset_factor: int = 0,
+    buffer_type: str = "default",
+    axis_separators: List[int] = None,
+) -> Buffer:
+    """The buffer match function.
+
+    Note
+    ----
+    This function will perform different behavior, depending on the type of param.
+    If the param is a var in function parameter, it will create a buffer from DLTensor.
+    Else if the param is a subregion of other buffers, then create a subregion match inside a block.
+
+    Example
+    -------
+    Match buffer from function parameter
+    .. code-block:: python
+        A = T.match_buffer(a, (128, 128), dtype="float32")
+
+    Match buffer from Buffer subregion
+    .. code-block:: python
+        A = T.match_buffer(B[0:128, i * 128 : i * 128 + 128], (128, 128), dtype="float32")
+
+    Parameters
+    ----------
+    param : Union[Var, BufferLoad, BufferRegion]
+        The parameter of the PrimFunc to match.
+
+    shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral]
+        The type of the buffer prior to flattening.
+
+    dtype : str
+        The data type in the content of the buffer.
+
+    data : Var
+        The pointer to the head of the data.
+
+    strides : List[PrimExpr]
+        The strides of each dimension.
+
+    elem_offset : PrimExpr
+        The offset in terms of number of dtype elements (including lanes).
+
+    scope : str
+        The optional storage scope of buffer data pointer.
+
+    align : int
+        The alignment requirement of data pointer in bytes.
+
+    offset_factor : int
+        The factor of elem_offset field.
+
+    buffer_type : str
+        The buffer type.
+
+    axis_separators : List[int]
+        The separators between input axes when generating flattened output axes.
+
+    Returns
+    -------
+    res : Buffer
+        The matched buffer.
+    """
+    shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape
+    if strides is None:
+        strides = []
+    return _ffi_api.MatchBuffer(  # pylint: disable=no-member # type: ignore
+        param,
+        shape,
+        dtype,
+        data,
+        strides,
+        elem_offset,
+        scope,
+        align,
+        offset_factor,
+        buffer_type,
+        axis_separators,
+    )
+
+
+def preflattened_buffer(
+    postflattened: Buffer,
+    shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral],
+    dtype: str = "float32",
+    data: Var = None,
+    strides: List[PrimExpr] = None,
+    elem_offset: PrimExpr = None,
+    scope: str = "global",
+    align: int = -1,
+    offset_factor: int = 0,
+    buffer_type: str = "default",
+    axis_separators: List[int] = None,
+) -> None:
+    """The pre-flattened buffer statement.
+
+    Parameters
+    ----------
+    postflattened : Buffer
+        The original buffer to be flattened.
+
+    shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral]
+        The type of the buffer prior to flattening.
+
+    dtype : str
+        The data type in the content of the buffer.
+
+    data : Var
+        The pointer to the head of the data.
+
+    strides : List[PrimExpr]
+        The strides of each dimension.
+
+    elem_offset : PrimExpr
+        The offset in terms of number of dtype elements (including lanes).
+
+    scope : str
+        The optional storage scope of buffer data pointer.
+
+    align : int
+        The alignment requirement of data pointer in bytes.
+
+    offset_factor : int
+        The factor of elem_offset field.
+
+    buffer_type : str
+        The buffer type.
+
+    axis_separators : List[int]
+        The separators between input axes when generating flattened output axes.
+    """
+    shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape
+    if strides is None:
+        strides = []
+    _ffi_api.PreflattenedBuffer(  # pylint: disable=no-member # type: ignore
+        postflattened,
+        shape,
+        dtype,
+        data,
+        strides,
+        elem_offset,
+        scope,
+        align,
+        offset_factor,
+        buffer_type,
+        axis_separators,
+    )
+
+
 def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame:
     """The block declaration statement.
 
@@ -65,11 +357,344 @@ def evaluate(value: PrimExpr) -> None:
     return _ffi_api.Evaluate(value)  # pylint: disable=no-member # type: ignore
 
 
+def int8(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type int8 or cast expression to type int8.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type int8 or casted expression with type int8.
+    """
+    return _ffi_api.Int8(expr)  # pylint: disable=no-member # type: ignore
+
+
+def int16(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type int16 or cast expression to type int16.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type int16 or casted expression with type int16.
+    """
+    return _ffi_api.Int16(expr)  # pylint: disable=no-member # type: ignore
+
+
+def int32(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type int32 or cast expression to type int32.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type int32 or casted expression with type int32.
+    """
+    return _ffi_api.Int32(expr)  # pylint: disable=no-member # type: ignore
+
+
+def int64(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type int64 or cast expression to type int64.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type int64 or casted expression with type int64.
+    """
+    return _ffi_api.Int64(expr)  # pylint: disable=no-member # type: ignore
+
+
+def uint8(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type uint8 or cast expression to type uint8.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type uint8 or casted expression with type uint8.
+    """
+    return _ffi_api.UInt8(expr)  # pylint: disable=no-member # type: ignore
+
+
+def uint16(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type uint16 or cast expression to type uint16.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type uint16 or casted expression with type uint16.
+    """
+    return _ffi_api.UInt16(expr)  # pylint: disable=no-member # type: ignore
+
+
+def uint32(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type uint32 or cast expression to type uint32.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type uint32 or casted expression with type uint32.
+    """
+    return _ffi_api.UInt32(expr)  # pylint: disable=no-member # type: ignore
+
+
+def uint64(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type uint64 or cast expression to type uint64.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type uint64 or casted expression with type uint64.
+    """
+    return _ffi_api.UInt64(expr)  # pylint: disable=no-member # type: ignore
+
+
+def float8(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type float8 or cast expression to type float8.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type float8 or casted expression with type float8.
+    """
+    return _ffi_api.Float8(expr)  # pylint: disable=no-member # type: ignore
+
+
+def float16(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type float16 or cast expression to type float16.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type float16 or casted expression with type float16.
+    """
+    return _ffi_api.Float16(expr)  # pylint: disable=no-member # type: ignore
+
+
+def float32(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type float32 or cast expression to type float32.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type float32 or casted expression with type float32.
+    """
+    return _ffi_api.Float32(expr)  # pylint: disable=no-member # type: ignore
+
+
+def float64(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type float64 or cast expression to type float64.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type float64 or casted expression with type float64.
+    """
+    return _ffi_api.Float64(expr)  # pylint: disable=no-member # type: ignore
+
+
+def int32x4(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type int32x4 or cast expression to type int32x4.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type int32x4 or casted expression with type int32x4.
+    """
+    return _ffi_api.Int32x4(expr)  # pylint: disable=no-member # type: ignore
+
+
+def int32x8(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type int32x8 or cast expression to type int32x8.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type int32x8 or casted expression with type int32x8.
+    """
+    return _ffi_api.Int32x8(expr)  # pylint: disable=no-member # type: ignore
+
+
+def int32x16(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type int32x16 or cast expression to type int32x16.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type int32x16 or casted expression with type int32x16.
+    """
+    return _ffi_api.Int32x16(expr)  # pylint: disable=no-member # type: ignore
+
+
+def boolean(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type boolean or cast expression to type boolean.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type boolean or casted expression with type boolean.
+    """
+    return _ffi_api.Boolean(expr)  # pylint: disable=no-member # type: ignore
+
+
+def handle(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type handle or cast expression to type handle.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type handle or casted expression with type handle.
+    """
+    return _ffi_api.Handle(expr)  # pylint: disable=no-member # type: ignore
+
+
+def void(expr: Optional[PrimExpr] = None) -> PrimExpr:
+    """Construct a new tir.Var with type void or cast expression to type void.
+
+    Parameters
+    ----------
+    expr: PrimExpr
+        The expression to be cast.
+
+    Returns
+    -------
+    res : PrimExpr
+        The new tir.Var with type void or casted expression with type void.
+    """
+    return _ffi_api.Void(expr)  # pylint: disable=no-member # type: ignore
+
+
+def var(dtype, name="") -> Var:
+    """Construct a new tir.Var.
+
+    Parameters
+    ----------
+    dtype: str
+        The dtype of the Var.
+
+    name: str
+        The name of the Var.
+
+    Returns
+    -------
+    res : Var
+        The result tir.Var.
+    """
+    return Var(name, dtype)  # pylint: disable=no-member # type: ignore
+
+
 # pylint: enable=invalid-name
 
 
 __all__ = [
+    "buffer_decl",
+    "prim_func",
+    "arg",
+    "func_name",
+    "func_attr",
+    "func_ret",
+    "match_buffer",
+    "preflattened_buffer",
     "block",
     "evaluate",
-    "prim_func",
+    "int8",
+    "int16",
+    "int32",
+    "int64",
+    "uint8",
+    "uint16",
+    "uint32",
+    "uint64",
+    "float8",
+    "float16",
+    "float32",
+    "float64",
+    "int32x4",
+    "int32x8",
+    "int32x16",
+    "boolean",
+    "handle",
+    "void",
+    "var",
 ]
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index 4c2679ae6b56..e2c1218a7e87 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -28,6 +28,30 @@ namespace tir {
 
 using tvm::tir::IterVar;
 
+Buffer BufferDecl(Array<PrimExpr> shape, DataType dtype, String buffer_name, Optional<Var> data,
+                  Optional<Array<PrimExpr>> strides, Optional<PrimExpr> elem_offset,
+                  String storage_scope, int align, int offset_factor, String buffer_type,
+                  Optional<Array<IntImm>> axis_separators) {
+  Var buffer_data;
+  if (!data.defined()) {
+    DataType storage_dtype = dtype;
+    if (storage_dtype == DataType::Bool()) {
+      storage_dtype = DataType::Int(8);
+    }
+    buffer_data = tvm::tir::Var(buffer_name, PointerType(PrimType(storage_dtype), storage_scope));
+  } else {
+    buffer_data = data.value();
+  }
+  if (!elem_offset.defined() && offset_factor) {
+    DataType shape_dtype = shape.empty() ? DataType::Int(32) : shape[0]->dtype;
+    elem_offset = tvm::tir::Var("elem_offset", shape_dtype);
+  }
+  return Buffer(buffer_data, dtype, shape, strides.value_or(Array<PrimExpr>()),
+                elem_offset.value_or(PrimExpr()), buffer_name, align, offset_factor,
+                (buffer_type == "auto_broadcast") ? tvm::tir::kAutoBroadcast : tvm::tir::kDefault,
+                axis_separators.value_or(Array<IntImm>()));
+}
+
 PrimFuncFrame PrimFunc() {
   ObjectPtr<PrimFuncFrameNode> n = make_object<PrimFuncFrameNode>();
   n->name = NullOpt;
@@ -41,6 +65,98 @@ PrimFuncFrame PrimFunc() {
   return PrimFuncFrame(n);
 }
 
+Var Arg(String name, Var var) {
+  PrimFuncFrame frame = FindPrimFuncFrame("T.Arg");
+  details::Namer::Name(var, name);
+  frame->args.push_back(var);
+  return var;
+}
+
+Buffer Arg(String name, Buffer buffer) {
+  PrimFuncFrame frame = FindPrimFuncFrame("T.Arg");
+  details::Namer::Name(buffer, name);
+  Var handle(buffer->name + "_handle", DataType::Handle());
+  frame->args.push_back(handle);
+  frame->buffer_map.Set(handle, buffer);
+  return buffer;
+}
+
+void FuncName(String name) {
+  PrimFuncFrame frame = FindPrimFuncFrame("T.func_name");
+  if (frame->name.defined()) {
+    LOG(FATAL) << "ValueError: Duplicate prim func name, previous one is " << frame->name.value();
+  }
+  frame->name = name;
+}
+
+void FuncAttrs(Map<String, ObjectRef> attrs) {
+  using namespace tvm::tir;
+  PrimFuncFrame frame = FindPrimFuncFrame("T.func_attr");
+  if (frame->attrs.defined()) {
+    LOG(FATAL) << "ValueError: Duplicate prim func annotations, previous one is " << frame->attrs;
+  }
+  frame->attrs = attrs;
+}
+
+tvm::Type FuncRet(tvm::Type ret_type) {
+  PrimFuncFrame frame = FindPrimFuncFrame("T.ret_type");
+  if (frame->ret_type.defined()) {
+    LOG(FATAL) << "ValueError: Duplicate prim func return type, previous one is "
+               << frame->ret_type.value();
+  }
+  frame->ret_type = ret_type;
+  return ret_type;
+}
+
+Buffer MatchBuffer(ObjectRef param, Array<PrimExpr> shape, DataType dtype, Optional<Var> data,
+                   Array<PrimExpr> strides, PrimExpr elem_offset, String storage_scope, int align,
+                   int offset_factor, String buffer_type_str, Array<IntImm> axis_separators) {
+  Buffer buffer = BufferDecl(shape, dtype, "", data, strides, elem_offset, storage_scope, align,
+                             offset_factor, buffer_type_str, axis_separators);
+  if (const auto* var = param.as<tvm::tir::VarNode>()) {
+    PrimFuncFrame frame = FindPrimFuncFrame("T.match_buffer");
+    Var v = GetRef<Var>(var);
+    for (auto const& arg : frame->args) {
+      if (arg.same_as(v)) {
+        frame->buffer_map.Set(v, buffer);
+        return buffer;
+      }
+    }
+    LOG(FATAL) << "ValueError: Can not bind non-input param to buffer.";
+  } else if (const auto* buffer_load = param.as<tvm::tir::BufferLoadNode>()) {
+    BlockFrame frame = FindBlockFrame("T.match_buffer");
+    frame->match_buffers.push_back(tvm::tir::MatchBufferRegion(
+        buffer, BufferRegionFromLoad(GetRef<tvm::tir::BufferLoad>(buffer_load))));
+  } else if (const auto* buffer_region = param.as<tvm::tir::BufferRegionNode>()) {
+    BlockFrame frame = FindBlockFrame("T.match_buffer");
+    frame->match_buffers.push_back(
+        tvm::tir::MatchBufferRegion(buffer, GetRef<tvm::tir::BufferRegion>(buffer_region)));
+  } else {
+    LOG(FATAL) << "ValueError: Unexpected type for TIR MatchBuffer.";
+  }
+  return buffer;
+}
+
+void PreflattenedBuffer(Buffer postflattened_buffer, Array<PrimExpr> shape, DataType dtype,
+                        Optional<Var> data, Array<PrimExpr> strides, PrimExpr elem_offset,
+                        String storage_scope, int align, int offset_factor, String buffer_type_str,
+                        Array<IntImm> axis_separators) {
+  PrimFuncFrame frame = FindPrimFuncFrame("T.preflattened_buffer");
+  for (auto const& p : frame->buffer_map) {
+    if (p.second.same_as(postflattened_buffer)) {
+      String buffer_name(postflattened_buffer->name + "_preflatten");
+      Buffer buffer =
+          BufferDecl(shape, dtype, buffer_name, data.value_or(p.second->data), strides, elem_offset,
+                     storage_scope, align, offset_factor, buffer_type_str, axis_separators);
+      details::Namer::Name(buffer, buffer_name);
+      frame->preflattened_buffer_map.Set(p.first, buffer);
+      return;
+    }
+  }
+  LOG(FATAL) << "ValueError: postflattened buffer " << postflattened_buffer->name
+             << " does not exist.";
+}
+
 BlockFrame Block(String name, bool no_realize) {
   ObjectPtr<BlockFrameNode> n = make_object<BlockFrameNode>();
   n->name = name;
@@ -58,9 +174,87 @@ BlockFrame Block(String name, bool no_realize) {
 }
 
 void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); }
+
+using tvm::script::ir_builder::details::Namer;
+
+TVM_STATIC_IR_FUNCTOR(Namer, vtable)
+    .set_dispatch<tvm::tir::BufferNode>([](const ObjectRef& node, String name) -> void {
+      tvm::tir::BufferNode* buffer =
+          const_cast<tvm::tir::BufferNode*>(node.as<tvm::tir::BufferNode>());
+      buffer->name = name;
+      Namer::Name(buffer->data, name);
+      int n = buffer->strides.size();
+      for (int i = 0; i < n; ++i) {
+        PrimExpr e = buffer->strides[i];
+        if (const tvm::tir::VarNode* v = e.as<tvm::tir::VarNode>()) {
+          Namer::Name(GetRef<tvm::tir::Var>(v), name + "_s" + std::to_string(i));
+        }
+      }
+    });
+
+TVM_STATIC_IR_FUNCTOR(Namer, vtable)
+    .set_dispatch<tvm::tir::SizeVarNode>([](const ObjectRef& node, String name) -> void {
+      using namespace tvm::tir;
+      SizeVarNode* var = const_cast<SizeVarNode*>(node.as<SizeVarNode>());
+      var->name_hint = name;
+    });
+
+TVM_STATIC_IR_FUNCTOR(Namer, vtable)
+    .set_dispatch<tvm::tir::VarNode>([](const ObjectRef& node, String name) -> void {
+      using namespace tvm::tir;
+      VarNode* var = const_cast<VarNode*>(node.as<VarNode>());
+      var->name_hint = name;
+    });
+
+TVM_STATIC_IR_FUNCTOR(Namer, vtable)
+    .set_dispatch<tvm::tir::IterVarNode>([](const ObjectRef& node, String name) -> void {
+      using namespace tvm::tir;
+      IterVarNode* var = const_cast<IterVarNode*>(node.as<IterVarNode>());
+      Namer::Name(var->var, name);
+    });
+
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.BufferDecl").set_body_typed(BufferDecl);
+
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.PrimFunc").set_body_typed(PrimFunc);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Arg")
+    .set_body_typed([](String name, ObjectRef obj) -> ObjectRef {
+      using namespace tvm::tir;
+      if (const auto* var = obj.as<VarNode>()) {
+        return Arg(name, GetRef<tvm::tir::Var>(var));
+      }
+      if (const auto* buffer = obj.as<BufferNode>()) {
+        return Arg(name, GetRef<Buffer>(buffer));
+      }
+      LOG(FATAL) << "ValueError: Unexpected type for TIR Arg: " << obj->GetTypeKey();
+      throw;
+    });
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.FuncName").set_body_typed(FuncName);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.FuncAttrs").set_body_typed(FuncAttrs);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.FuncRet").set_body_typed(FuncRet);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.MatchBuffer").set_body_typed(MatchBuffer);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.PreflattenedBuffer").set_body_typed(PreflattenedBuffer);
+
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate);
+
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int16").set_body_typed(Int16);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32").set_body_typed(Int32);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int64").set_body_typed(Int64);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt8").set_body_typed(UInt8);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt16").set_body_typed(UInt16);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt32").set_body_typed(UInt32);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt64").set_body_typed(UInt64);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float8").set_body_typed(Float8);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float16").set_body_typed(Float16);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float32").set_body_typed(Float32);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float64").set_body_typed(Float64);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x4").set_body_typed(Int32x4);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x8").set_body_typed(Int32x8);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x16").set_body_typed(Int32x16);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Boolean").set_body_typed(Boolean);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Handle").set_body_typed(Handle);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Void").set_body_typed(Void);
 }  // namespace tir
 }  // namespace ir_builder
 }  // namespace script
diff --git a/src/script/ir_builder/tir/utils.h b/src/script/ir_builder/tir/utils.h
index 4f8b3f77c6e1..c29fae1c65e9 100644
--- a/src/script/ir_builder/tir/utils.h
+++ b/src/script/ir_builder/tir/utils.h
@@ -28,6 +28,10 @@ namespace script {
 namespace ir_builder {
 namespace tir {
 
+/*!
+ * \brief Add tir Stmt to the top frame in IRBuilder frame stack.
+ * \param stmt The Stmt.
+ */
 inline void AddToParent(tvm::tir::Stmt stmt) {
   IRBuilder builder = IRBuilder::Current();
   if (builder->frames.empty()) {
@@ -40,6 +44,11 @@ inline void AddToParent(tvm::tir::Stmt stmt) {
   }
 }
 
+/*!
+ * \brief Convert array of tir Stmt to single Stmt.
+ * \param stmt The array of Stmt.
+ * \return The SeqStmt.
+ */
 inline tvm::tir::Stmt AsStmt(const Array<tvm::tir::Stmt>& stmt) {
   using namespace tvm::tir;
   if (stmt.empty()) {
@@ -51,6 +60,11 @@ inline tvm::tir::Stmt AsStmt(const Array<tvm::tir::Stmt>& stmt) {
   }
 }
 
+/*!
+ * \brief Check whether the top frame in IRBuilder frame stack is PrimFuncFrame.
+ * \param method The method name to be printed when throwing exception.
+ * \return The top frame of PrimFuncFrame.
+ */
 inline PrimFuncFrame FindPrimFuncFrame(const String& method) {
   if (Optional<PrimFuncFrame> frame = IRBuilder::Current()->GetLastFrame<PrimFuncFrame>()) {
     return frame.value();
@@ -60,6 +74,11 @@ inline PrimFuncFrame FindPrimFuncFrame(const String& method) {
   throw;
 }
 
+/*!
+ * \brief Check whether the top frame in IRBuilder frame stack is BlockFrame.
+ * \param method The method name to be printed when throwing exception.
+ * \return The top frame of BlockFrame.
+ */
 inline BlockFrame FindBlockFrame(const String& method) {
   if (Optional<BlockFrame> frame = IRBuilder::Current()->GetLastFrame<BlockFrame>()) {
     return frame.value();
@@ -69,6 +88,19 @@ inline BlockFrame FindBlockFrame(const String& method) {
   throw;
 }
 
+/*!
+ * \brief Convert BufferLoad to BufferRegion.
+ * \param buffer_load The BufferLoad.
+ * \return The converted BufferRegion.
+ */
+inline tvm::tir::BufferRegion BufferRegionFromLoad(tvm::tir::BufferLoad buffer_load) {
+  Array<Range> ranges;
+  for (const PrimExpr& index : buffer_load->indices) {
+    ranges.push_back(Range::FromMinExtent(index, IntImm(index->dtype, 1)));
+  }
+  return tvm::tir::BufferRegion(buffer_load->buffer, ranges);
+}
+
 }  // namespace tir
 }  // namespace ir_builder
 }  // namespace script
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index 85080c7c65fc..5c93e99909d9 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -25,7 +25,7 @@
 from tvm.ir.base import assert_structural_equal
 
 
-def test_ir_builder_tir_primfunc():
+def test_ir_builder_tir_primfunc_base():
     with IRBuilder() as ib:
         with T.prim_func():
             T.evaluate(0)
@@ -45,6 +45,48 @@ def test_ir_builder_tir_primfunc():
     assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True)
 
 
+def test_ir_builder_tir_primfunc_complete():
+    with IRBuilder() as ib:
+        with T.prim_func():
+            T.arg("a", T.handle())
+            T.arg("b", T.var("int64"))
+            T.arg("c", T.buffer_decl((128, 128), "float32"))
+            d = T.arg("d", T.handle())
+            e = T.arg("e", T.buffer_decl((1024,), "int8"))
+            T.func_attr({"key": "value"})
+            T.func_ret(tvm.ir.PrimType("int64"))
+            buffer_d = T.match_buffer(d, (64, 64), "int64")
+            T.preflattened_buffer(e, (32, 32), "int8", data=e.data)
+            T.evaluate(0)
+    # the prim_func generated by IRBuilder
+    prim_func_actual = ib.get()
+
+    # the expected prim_func
+    c_handle, c_buffer = tir.Var("c_handle", "handle"), tir.decl_buffer(
+        (128, 128), "float32", name="c"
+    )
+    d_handle, d_buffer = tir.Var("d", "handle"), tir.decl_buffer((64, 64), "int64", name="d")
+    e_handle, e_buffer = tir.Var("e_handle", "handle"), tir.decl_buffer((1024,), "int8", name="e")
+    prim_func_expected = tir.PrimFunc(
+        params=[
+            tir.Var("a", "handle"),
+            tir.Var("b", "int64"),
+            c_handle,
+            d_handle,
+            e_handle,
+        ],
+        body=tir.Evaluate(0),
+        ret_type=tvm.ir.PrimType("int64"),
+        buffer_map={c_handle: c_buffer, d_handle: d_buffer, e_handle: e_buffer},
+        preflattened_buffer_map={
+            e_handle: tir.decl_buffer((32, 32), "int8", name="e_preflatten", data=e_buffer.data)
+        },
+        attrs=tvm.ir.make_node("DictAttrs", key="value"),
+    )
+    # Check if the generated ir is expected
+    assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True)
+
+
 def test_ir_builder_tir_block():
     with IRBuilder() as ib:
         with T.block("block"):

From 421ff76e3e02e0d97018623fc1a42f202fe202bc Mon Sep 17 00:00:00 2001
From: Ruihang Lai <lairuihangdongdong@qq.com>
Date: Wed, 14 Sep 2022 17:24:14 -0400
Subject: [PATCH 166/704] [TIR][Meta-Schedule] Tuple-reduction scheduling
 support (#11639)

[TIR][MetaSchedule] Support Tuple Reduction

This PR improves our TIR scheduling primitives/transformations (rfactor & cross-thread reduction)
designed for reduction operators, so that they can be applied to blocks of tuple-reduction.
---
 .../schedule_rule/cross_thread_reduction.cc   |   7 +
 src/tir/schedule/analysis.h                   |  48 +-
 src/tir/schedule/analysis/analysis.cc         | 524 +------------
 src/tir/schedule/analysis/reducer.cc          | 702 ++++++++++++++++++
 src/tir/schedule/primitive/reduction.cc       | 402 ++++++----
 .../lower_cross_thread_reduction.cc           | 323 ++++----
 ...meta_schedule_schedule_rule_add_rfactor.py | 166 +++++
 ...le_schedule_rule_cross_thread_reduction.py |  99 +++
 .../unittest/test_tir_schedule_rfactor.py     | 649 +++++++++++++++-
 ..._transform_lower_cross_thread_reduction.py | 244 +++++-
 10 files changed, 2314 insertions(+), 850 deletions(-)
 create mode 100644 src/tir/schedule/analysis/reducer.cc

diff --git a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
index 0f0ab99e7259..35be33f72e21 100644
--- a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
+++ b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
@@ -184,6 +184,13 @@ class CrossThreadReductionNode : public ScheduleRuleNode {
    */
   std::tuple<bool, tir::LoopRV, tir::BlockRV, tir::LoopRV> GetComputeTargetLoopAndBlock(
       const tir::Schedule& sch, const tir::BlockRV& block_rv) {
+    // Step 0. Due to technical reason of some primitives (e.g., compute-at), if the block is doing
+    // a tuple reduction, fusion is temporarily not supported.
+    if (sch->Get(block_rv)->writes.size() != 1) {
+      return std::make_tuple(false, tir::LoopRV{nullptr}, tir::BlockRV{nullptr},
+                             tir::LoopRV{nullptr});
+    }
+
     // Step 1. Get all the consumers of the input block.
     Array<tir::BlockRV> consumers = sch->GetConsumers(block_rv);
 
diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h
index 52ef17df162c..489df8959d1b 100644
--- a/src/tir/schedule/analysis.h
+++ b/src/tir/schedule/analysis.h
@@ -455,15 +455,14 @@ std::pair<Optional<StmtSRef>, bool> GetBufferDefiningSite(const StmtSRef& block_
 /******** Reduction Block Related ********/
 
 /*!
- * \brief Convert the `init` and `body` of the input block to BufferStores
- * \param self The schedule state
- * \param block The block to be analyzed
- * \return The BufferStores of the `init` and `body` of the input block
- * \throw ScheduleError If the `init` or `body` is not BufferStore, or they don't write to the same
- * buffer
+ * \brief Get the init values and the BufferStore updates from the input reduction block
+ * \param self The schedule state, used for error reporting
+ * \param block The block from which the init values and BufferStore updates are extracted from
+ * \return The extracted init values and BufferStore updates
+ * \throw ScheduleError If rfactor or cross-thread reduction cannot be applied to the block
  */
-std::pair<BufferStore, BufferStore> GetBufferStoresFromReductionBlock(
-    const Optional<ScheduleState>& self, const Block& block);
+std::pair<Array<PrimExpr>, Array<BufferStore>> GetInitValuesAndUpdatesFromReductionBlock(
+    const Optional<ScheduleState>& self, Block block);
 
 /*!
  * \brief Check whether the input array of IterVars only contains data-parallel and reduction block
@@ -484,16 +483,17 @@ bool ContainsOnlyDataParAndReductionBlockIter(const Array<IterVar>& iters);
 bool ReductionIterNotIndexOutputBuffer(const Block& block);
 
 /*!
- * \brief Given a reduction identity and a reduction combiner, detect the corresponding commutative
- * reducer, and extract the combiner lhs and combiner rhs
+ * \brief Given a list of reduction identities and a list of reduction combiners, detect the
+ * corresponding commutative reducer, and extract the combiner LHS values and combiner RHS values
  * \param self The schedule state
- * \param identity The reduction identity to be analyzed
- * \param combiner The reduction combiner to be analyzed
- * \return The corresponding CommReducer, the combiner lhs and the combiner rhs
+ * \param identities The reduction identities to be analyzed
+ * \param combiners The reduction combiners to be analyzed
+ * \return The corresponding CommReducer, combiner LHS values and combiner RHS values
  * \throw ScheduleError If no corresponding commutative reducer can be matched
  */
-std::tuple<CommReducer, PrimExpr, PrimExpr> GetReducerAndCombinerLhsRhs(
-    const Optional<ScheduleState>& self, const PrimExpr& identity, const BufferStore& combiner);
+std::tuple<CommReducer, Array<PrimExpr>, Array<PrimExpr>> GetReducerAndCombinerLhsRhs(
+    const Optional<ScheduleState>& self, const Array<PrimExpr>& identities,
+    const Array<BufferStore>& combiners);
 
 /******** Commutative Reducer ********/
 
@@ -502,20 +502,20 @@ std::tuple<CommReducer, PrimExpr, PrimExpr> GetReducerAndCombinerLhsRhs(
  * \return The list of the registered reducer-getter functions
  * \sa ReducerRegistry
  */
-std::vector<runtime::TypedPackedFunc<CommReducer(DataType)>> GetReducerGetters();
+std::vector<runtime::TypedPackedFunc<Optional<CommReducer>(Array<PrimExpr>)>> GetReducerGetters();
 
 /*!
- * \brief Given the input identity and the combiner BufferStore of a reduction, extract the
- * corresponding commutative reducer and its lhs, rhs if possible.
- * \param identity The identity of the reduction
- * \param combiner The combiner of the reduction
+ * \brief Given the input identities and the combiner BufferStores of a reduction, extract the
+ * corresponding commutative reducer, LHS values and RHS values, if possible.
+ * \param identities The identities of the reduction
+ * \param combiners The combiners of the reduction
  * \param result_reducer The extracted CommReducer
- * \param lhs The extracted lhs of the reducer
- * \param rhs The extracted rhs of the reducer
+ * \param lhs The extracted LHS values of the reducer
+ * \param rhs The extracted RHS values of the reducer
  * \return A boolean indicating whether a corresponding commutative reducer is found
  */
-bool FromIdentityCombiner(const PrimExpr& identity, const BufferStore& combiner,
-                          CommReducer* result_reducer, PrimExpr* lhs, PrimExpr* rhs);
+bool FromIdentityCombiner(const Array<PrimExpr>& identities, const Array<BufferStore>& combiners,
+                          CommReducer* result_reducer, Array<PrimExpr>* lhs, Array<PrimExpr>* rhs);
 
 /******** Misc ********/
 
diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index fb09a3480a3a..7ed60876ab22 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -16,9 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#include <tvm/runtime/container/optional.h>
-#include <tvm/tir/expr.h>
-
 #include "../ir_comparator.h"
 #include "../utils.h"
 
@@ -1237,523 +1234,6 @@ std::pair<Optional<StmtSRef>, bool> GetBufferDefiningSite(const StmtSRef& block_
   return {NullOpt, false};
 }
 
-/******** Pattern Matcher ********/
-
-/*!
- * \brief PrimExpr pattern matcher.
- *
- * It is different from the pattern matcher in arith/pattern_match.h, which is dedicated
- * for compile-time constant patterns. This pattern matcher can work on dynamic user-specific
- * patterns.
- *
- * The code below shows how to use the pattern matcher.
- *
- * \code
- *
- * Var x("x"), y("y");
- * // use PrimExpr to declare patterns, x, y are holes that can be filled with
- * PatternMatcher pattern_matcher(x + y);
- * // expr = C[i, j] + A[i, k] * B[k, j], which is the expr we want to match
- * pattern_matcher.Match(expr);
- *
- * if (pattern_matcher.Success()) {
- *   pattern_matcher.Eval(x) // C[i, j]
- *   pattern_matcher.Eval(y) // A[i, k] * B[k, j]
- * }
- *
- * \endcode
- */
-class PatternMatcher : public ExprVisitor {
- public:
-  explicit PatternMatcher(PrimExpr pattern) : pattern_(std::move(pattern)) {}
-
-  void VisitExpr_(const VarNode* op) final {
-    auto it = filled_map_.find(op);
-    if (it == filled_map_.end()) {
-      filled_map_[op] = expr_to_match_;
-    } else {
-      ExprDeepEqual equal;
-      if (it->second.same_as(expr_to_match_) || equal(it->second, expr_to_match_)) return;
-      match_success_ = false;
-    }
-  }
-
-  void VisitExpr_(const LoadNode* op) final {
-    const auto* ptr = expr_to_match_.as<LoadNode>();
-    if (ptr == nullptr) {
-      match_success_ = false;
-    } else {
-      if (!op->buffer_var.same_as(ptr->buffer_var)) {
-        match_success_ = false;
-      } else {
-        PrimExpr tmp = expr_to_match_;
-        expr_to_match_ = ptr->predicate;
-        VisitExpr(op->predicate);
-        expr_to_match_ = ptr->index;
-        VisitExpr(op->index);
-        std::swap(expr_to_match_, tmp);
-      }
-    }
-  }
-
-  void VisitExpr_(const LetNode* op) final {
-    const auto* ptr = expr_to_match_.as<LetNode>();
-    if (ptr == nullptr) {
-      match_success_ = false;
-    } else {
-      PrimExpr tmp = expr_to_match_;
-      expr_to_match_ = ptr->var;
-      VisitExpr(op->var);
-      expr_to_match_ = ptr->value;
-      VisitExpr(op->value);
-      expr_to_match_ = ptr->body;
-      VisitExpr(op->body);
-      std::swap(expr_to_match_, tmp);
-    }
-  }
-
-  void VisitExpr_(const CallNode* op) final {
-    const auto* ptr = expr_to_match_.as<CallNode>();
-    if (ptr == nullptr) {
-      match_success_ = false;
-    } else {
-      if (!op->op.same_as(ptr->op)) {
-        match_success_ = false;
-      } else {
-        PrimExpr tmp = expr_to_match_;
-        for (size_t i = 0; i < op->args.size(); ++i) {
-          expr_to_match_ = ptr->args[i];
-          VisitExpr(op->args[i]);
-        }
-        std::swap(expr_to_match_, tmp);
-      }
-    }
-  }
-
-#define TVM_DECLARE_PATTERN_MATCHER_BIN_OP(OpName) \
-  void VisitExpr_(const OpName* op) {              \
-    const auto* ptr = expr_to_match_.as<OpName>(); \
-    if (ptr == nullptr) {                          \
-      match_success_ = false;                      \
-    } else {                                       \
-      PrimExpr current = expr_to_match_;           \
-      expr_to_match_ = ptr->a;                     \
-      VisitExpr(op->a);                            \
-      expr_to_match_ = ptr->b;                     \
-      VisitExpr(op->b);                            \
-      std::swap(expr_to_match_, current);          \
-    }                                              \
-  }
-
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(AddNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(SubNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MulNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(DivNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(ModNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(FloorDivNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(FloorModNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MinNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MaxNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(EQNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(NENode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(LTNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(LENode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(GTNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(GENode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(AndNode);
-  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(OrNode);
-
-  void VisitExpr_(const CastNode* op) final {
-    const auto* ptr = expr_to_match_.as<CastNode>();
-    if (ptr == nullptr) {
-      match_success_ = false;
-    } else {
-      if (!runtime::TypeEqual(op->dtype, ptr->dtype)) {
-        match_success_ = false;
-      } else {
-        PrimExpr tmp = expr_to_match_;
-        expr_to_match_ = ptr->value;
-        VisitExpr(op->value);
-        std::swap(expr_to_match_, tmp);
-      }
-    }
-  }
-
-  void VisitExpr_(const NotNode* op) final {
-    const auto* ptr = expr_to_match_.as<NotNode>();
-    if (ptr == nullptr) {
-      match_success_ = false;
-    } else {
-      PrimExpr tmp = expr_to_match_;
-      expr_to_match_ = ptr->a;
-      VisitExpr(op->a);
-      std::swap(expr_to_match_, tmp);
-    }
-  }
-
-  void VisitExpr_(const SelectNode* op) final {
-    const auto* ptr = expr_to_match_.as<SelectNode>();
-    if (ptr == nullptr) {
-      match_success_ = false;
-    } else {
-      PrimExpr tmp = expr_to_match_;
-      expr_to_match_ = ptr->condition;
-      VisitExpr(op->condition);
-      expr_to_match_ = ptr->true_value;
-      VisitExpr(op->true_value);
-      expr_to_match_ = ptr->false_value;
-      VisitExpr(op->false_value);
-      std::swap(expr_to_match_, tmp);
-    }
-  }
-
-  void VisitExpr_(const RampNode* op) final {
-    const auto* ptr = expr_to_match_.as<RampNode>();
-    if (ptr == nullptr) {
-      match_success_ = false;
-    } else {
-      if (op->lanes != ptr->lanes) {
-        match_success_ = false;
-      } else {
-        PrimExpr tmp = expr_to_match_;
-        expr_to_match_ = ptr->base;
-        VisitExpr(op->base);
-        expr_to_match_ = ptr->stride;
-        VisitExpr(op->stride);
-        std::swap(expr_to_match_, tmp);
-      }
-    }
-  }
-
-  void VisitExpr_(const BroadcastNode* op) final {
-    const auto* ptr = expr_to_match_.as<BroadcastNode>();
-    if (ptr == nullptr) {
-      match_success_ = false;
-    } else {
-      if (op->lanes != ptr->lanes) {
-        match_success_ = false;
-      } else {
-        PrimExpr tmp = expr_to_match_;
-        expr_to_match_ = ptr->value;
-        VisitExpr(op->value);
-        std::swap(expr_to_match_, tmp);
-      }
-    }
-  }
-
-  void VisitExpr_(const ShuffleNode* op) final {
-    const auto* ptr = expr_to_match_.as<ShuffleNode>();
-    if (ptr == nullptr) {
-      match_success_ = false;
-    } else {
-      if (op->vectors.size() != ptr->vectors.size() || op->indices.size() != ptr->indices.size()) {
-        match_success_ = false;
-      } else {
-        PrimExpr tmp = expr_to_match_;
-        for (size_t i = 0; i < op->indices.size(); ++i) {
-          expr_to_match_ = ptr->indices[i];
-          VisitExpr(op->indices[i]);
-        }
-        for (size_t i = 0; i < op->vectors.size(); ++i) {
-          expr_to_match_ = ptr->vectors[i];
-          VisitExpr(op->vectors[i]);
-        }
-        std::swap(expr_to_match_, tmp);
-      }
-    }
-  }
-
-  void VisitExpr_(const IntImmNode* op) final {
-    const auto* ptr = expr_to_match_.as<IntImmNode>();
-    match_success_ = ptr != nullptr && op->value == ptr->value;
-  }
-
-  void VisitExpr_(const FloatImmNode* op) final {
-    const auto* ptr = expr_to_match_.as<FloatImmNode>();
-    match_success_ = ptr != nullptr && op->value == ptr->value;
-  }
-
-  void VisitExpr_(const StringImmNode* op) final {
-    const auto* ptr = expr_to_match_.as<StringImmNode>();
-    match_success_ = ptr != nullptr && op->value == ptr->value;
-  }
-
-  void VisitExpr_(const BufferLoadNode* op) final {
-    const auto* ptr = expr_to_match_.as<BufferLoadNode>();
-    if (ptr == nullptr) {
-      match_success_ = false;
-    } else {
-      if (!op->buffer.same_as(ptr->buffer) || op->indices.size() != ptr->indices.size()) {
-        match_success_ = false;
-      } else {
-        PrimExpr tmp = expr_to_match_;
-        for (size_t i = 0; i < op->indices.size(); ++i) {
-          expr_to_match_ = ptr->indices[i];
-          VisitExpr(op->indices[i]);
-        }
-        std::swap(expr_to_match_, tmp);
-      }
-    }
-  }
-
-  void Match(const PrimExpr& expr_to_match) {
-    this->match_success_ = true;
-    this->filled_map_.clear();
-    this->expr_to_match_ = expr_to_match;
-    this->operator()(pattern_);
-  }
-
-  PrimExpr Eval(const Var& var) {
-    auto it = filled_map_.find(var.operator->());
-    ICHECK(it != filled_map_.end()) << "Unknown pattern variable";
-    ICHECK(match_success_) << "Match failed";
-    return it->second;
-  }
-
-  bool Success() const { return match_success_; }
-
- private:
-  bool match_success_{true};
-  PrimExpr pattern_, expr_to_match_;
-  std::unordered_map<const VarNode*, PrimExpr> filled_map_;
-};
-
-/******** Reduction Block Related ********/
-
-class InitBodyNotBufferStoreError : public ScheduleError {
- public:
-  explicit InitBodyNotBufferStoreError(IRModule mod, Block block, bool init_is_bufferstore,
-                                       bool body_is_bufferstore)
-      : mod_(std::move(mod)),
-        block_(std::move(block)),
-        init_is_bufferstore_(init_is_bufferstore),
-        body_is_bufferstore_(body_is_bufferstore) {}
-
-  String FastErrorString() const final {
-    return "ScheduleError: The `init` and `body` of reduction block are required to be both "
-           "BufferStore so that rfactor or cross-thread reduction can be applied";
-  }
-
-  String DetailRenderTemplate() const final {
-    if (!init_is_bufferstore_ && !body_is_bufferstore_) {
-      return "The `init` and `body` of block {0} are required to be BufferStore so that rfactor or "
-             "cross-thread reduction can be applied";
-    } else if (!init_is_bufferstore_) {
-      return "The `init` of block {0} is required to be BufferStore so that rfactor or cross-thread"
-             " reduction can be applied";
-    } else {
-      ICHECK(!body_is_bufferstore_);
-      return "The `body` of block {0} is required to be BufferStore so that rfactor or cross-thread"
-             " reduction can be applied";
-    }
-  }
-
-  IRModule mod() const final { return mod_; }
-  Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
-
-  IRModule mod_;
-  Block block_;
-  bool init_is_bufferstore_;
-  bool body_is_bufferstore_;
-};
-
-class InitBodyNotSameBufferAccessError : public ScheduleError {
- public:
-  explicit InitBodyNotSameBufferAccessError(IRModule mod, Block block)
-      : mod_(std::move(mod)), block_(std::move(block)) {}
-
-  String FastErrorString() const final {
-    return "ScheduleError: The `init` and `body` of the reduction block are required to have the "
-           "same buffer access pattern";
-  }
-
-  String DetailRenderTemplate() const final {
-    std::ostringstream os;
-    const auto* init = block_->init.as<BufferStoreNode>();
-    const auto* update = block_->body.as<BufferStoreNode>();
-    os << "The `init` and `body` of the block {0} is required to have the same buffer access "
-          "pattern. However, in block {0} the `init` writes to "
-       << init->buffer->name << init->indices << ", and the `body` writes to "
-       << update->buffer->name << update->indices;
-    return os.str();
-  }
-
-  IRModule mod() const final { return mod_; }
-  Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
-
-  IRModule mod_;
-  Block block_;
-};
-
-std::pair<BufferStore, BufferStore> GetBufferStoresFromReductionBlock(
-    const Optional<ScheduleState>& self, const Block& block) {
-  static constexpr const char* error_str1 =
-      "ValueError: The `init` and `body` of the reduction block are required to be both "
-      "BufferStore so that rfactor or cross-thread reduction can be applied. However, a reduction "
-      "block that doesn't meet this requirement is ";
-  static constexpr const char* error_str2 =
-      "ValueError: The `init` and `body` of the reduction block are required to have the same "
-      "buffer access pattern so that rfactor or cross-thread reduction can be applied. However, a "
-      "reduction block that doesn't meet this requirement is ";
-
-  const auto* init = block->init.as<BufferStoreNode>();
-  const auto* body = block->body.as<BufferStoreNode>();
-  if (!(init && body)) {
-    if (self.defined()) {
-      throw InitBodyNotBufferStoreError(self.value()->mod, block, init != nullptr, body != nullptr);
-    } else {
-      LOG(FATAL) << error_str1 << block;
-    }
-  }
-  if (!init->buffer.same_as(body->buffer)) {
-    if (self.defined()) {
-      throw InitBodyNotSameBufferAccessError(self.value()->mod, block);
-    } else {
-      LOG(FATAL) << error_str2 << block;
-    }
-  }
-  int ndim = static_cast<int>(init->buffer->shape.size());
-  for (int i = 0; i < ndim; ++i) {
-    if (!ExprDeepEqual()(init->indices[i], body->indices[i])) {
-      if (self.defined()) {
-        throw InitBodyNotSameBufferAccessError(self.value()->mod, block);
-      } else {
-        LOG(FATAL) << error_str2 << block;
-      }
-    }
-  }
-  return std::make_pair(GetRef<BufferStore>(init), GetRef<BufferStore>(body));
-}
-
-bool ContainsOnlyDataParAndReductionBlockIter(const Array<IterVar>& iters) {
-  for (const IterVar& iter_var : iters) {
-    if (iter_var->iter_type != kDataPar && iter_var->iter_type != kCommReduce) {
-      return false;
-    }
-  }
-  return true;
-}
-
-bool ReductionIterNotIndexOutputBuffer(const Block& block) {
-  // Step 1. Collect the reduction block iters.
-  std::unordered_set<const VarNode*> reduction_block_iters;
-  reduction_block_iters.reserve(block->iter_vars.size());
-  for (const IterVar& iter_var : block->iter_vars) {
-    if (iter_var->iter_type == kCommReduce) {
-      reduction_block_iters.insert(iter_var->var.get());
-    }
-  }
-  // Step 2. Check if the reduction block iters are used to index the output buffer.
-  std::unordered_set<const BufferNode*> buffer_written;
-  buffer_written.reserve(block->writes.size());
-  for (const BufferRegion& write_region : block->writes) {
-    buffer_written.insert(write_region->buffer.get());
-  }
-  auto f_uses_reduction_block_var = [&](const PrimExpr& expr) -> bool {
-    return UsesVar(expr, [&](const VarNode* var) {  //
-      return reduction_block_iters.count(var);
-    });
-  };
-  bool affected = false;
-  PreOrderVisit(block->body, [&](const ObjectRef& obj) {
-    if (affected) {
-      return false;
-    }
-    const auto* store = obj.as<BufferStoreNode>();
-    if (!store) {
-      return true;
-    }
-    ICHECK(buffer_written.count(store->buffer.get()))
-        << "ValueError: The buffer \"" << store->buffer
-        << "\" is written in the block but is not in the block's signature";
-    for (const PrimExpr& index : store->indices) {
-      if (f_uses_reduction_block_var(index)) {
-        affected = true;
-        return false;
-      }
-    }
-    return false;
-  });
-  return !affected;
-}
-
-class NoMatchedReducerError : public ScheduleError {
- public:
-  explicit NoMatchedReducerError(IRModule mod, PrimExpr identity, BufferStore combiner)
-      : mod_(std::move(mod)), identity_(std::move(identity)), combiner_(std::move(combiner)) {}
-
-  String FastErrorString() const final {
-    return "ScheduleError: No matched reducer for the identity and the combiner of this reduction "
-           "block. So rfactor and cross-thread reduction cannot be applied.";
-  }
-
-  String DetailRenderTemplate() const final {
-    std::ostringstream os;
-    os << "No matched reducer for identity " << identity_ << " and combiner " << combiner_
-       << "In this case rfactor cannot be applied. You can check tvm::tir::ReducerRegistry for "
-          "default reducers or registering new reducers.";
-    return os.str();
-  }
-
-  IRModule mod() const final { return mod_; }
-  Array<ObjectRef> LocationsOfInterest() const final { return {}; }
-
-  IRModule mod_;
-  PrimExpr identity_;
-  BufferStore combiner_;
-};
-
-std::tuple<CommReducer, PrimExpr, PrimExpr> GetReducerAndCombinerLhsRhs(
-    const Optional<ScheduleState>& self, const PrimExpr& identity, const BufferStore& combiner) {
-  CommReducer reducer{nullptr};
-  PrimExpr combiner_lhs{nullptr}, combiner_rhs{nullptr};
-  bool matched = FromIdentityCombiner(identity, combiner, &reducer, &combiner_lhs, &combiner_rhs);
-  if (!matched) {
-    if (self.defined()) {
-      throw NoMatchedReducerError(self.value()->mod, identity, combiner);
-    } else {
-      LOG(FATAL) << "ValueError: No matched reducer for the identity and the combiner of the "
-                    "reduction block. So rfactor and cross-thread reduction cannot be applied.";
-    }
-  }
-  return std::make_tuple(std::move(reducer), std::move(combiner_lhs), std::move(combiner_rhs));
-}
-
-/******** Commutative Reducer ********/
-
-bool MatchReducer(const CommReducer& reducer, const PrimExpr& identity, const PrimExpr& combiner,
-                  const BufferLoad& load, PrimExpr* lhs, PrimExpr* rhs) {
-  if (!ExprDeepEqual()(reducer->identity_element[0], identity)) {
-    return false;
-  }
-  PatternMatcher pattern_matcher(reducer->result[0]);
-  pattern_matcher.Match(combiner);
-  if (pattern_matcher.Success()) {
-    PrimExpr lhs_tmp = pattern_matcher.Eval(reducer->lhs[0]);
-    PrimExpr rhs_tmp = pattern_matcher.Eval(reducer->rhs[0]);
-    if (ExprDeepEqual()(load, lhs_tmp)) {
-      *lhs = std::move(lhs_tmp);
-      *rhs = std::move(rhs_tmp);
-    }
-    return true;
-  }
-  return false;
-}
-
-bool FromIdentityCombiner(const PrimExpr& identity, const BufferStore& combiner,
-                          CommReducer* result_reducer, PrimExpr* lhs, PrimExpr* rhs) {
-  BufferLoad load(combiner->buffer, combiner->indices);
-  // Check reduction patterns.
-  for (const TypedPackedFunc<CommReducer(DataType)>& reducer_getter : GetReducerGetters()) {
-    CommReducer reducer = reducer_getter(identity.dtype());
-    if (MatchReducer(reducer, identity, combiner->value, load, lhs, rhs)) {
-      *result_reducer = std::move(reducer);
-      return true;
-    }
-  }
-  return false;
-}
-
 /******** SRef Tree Related ********/
 
 StmtSRef GetSRefTreeRoot(const StmtSRef& sref) {
@@ -2072,8 +1552,8 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self,   //
   const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   Array<tir::StmtSRef> loops = tir::GetLoops(block_sref);
 
-  // Cond 1. The block has only one write buffer
-  if (block->writes.size() != 1) {
+  // Cond 1. The block must have at lease one write buffer
+  if (block->writes.size() == 0) {
     return false;
   }
 
diff --git a/src/tir/schedule/analysis/reducer.cc b/src/tir/schedule/analysis/reducer.cc
new file mode 100644
index 000000000000..50813ef3cae8
--- /dev/null
+++ b/src/tir/schedule/analysis/reducer.cc
@@ -0,0 +1,702 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "../utils.h"
+
+namespace tvm {
+namespace tir {
+
+/******** Pattern Matcher ********/
+
+/*!
+ * \brief PrimExpr pattern matcher.
+ *
+ * It is different from the pattern matcher in arith/pattern_match.h, which is dedicated
+ * for compile-time constant patterns. This pattern matcher can work on dynamic user-specific
+ * patterns.
+ *
+ * The code below shows how to use the pattern matcher.
+ *
+ * \code
+ *
+ * Var x("x"), y("y");
+ * // use PrimExpr to declare patterns, x, y are holes that can be filled with
+ * PatternMatcher pattern_matcher(x + y);
+ * // expr = C[i, j] + A[i, k] * B[k, j], which is the expr we want to match
+ * pattern_matcher.Match(expr);
+ *
+ * if (pattern_matcher.Success()) {
+ *   pattern_matcher.Eval(x) // C[i, j]
+ *   pattern_matcher.Eval(y) // A[i, k] * B[k, j]
+ * }
+ *
+ * \endcode
+ */
+class PatternMatcher : public ExprVisitor {
+ public:
+  explicit PatternMatcher(Array<PrimExpr> pattern) : pattern_(std::move(pattern)) {}
+
+  void VisitExpr_(const VarNode* op) final {
+    auto it = filled_map_.find(op);
+    if (it == filled_map_.end()) {
+      filled_map_[op] = expr_to_match_;
+    } else {
+      ExprDeepEqual equal;
+      if (it->second.same_as(expr_to_match_) || equal(it->second, expr_to_match_)) return;
+      match_success_ = false;
+    }
+  }
+
+  void VisitExpr_(const LoadNode* op) final {
+    const auto* ptr = expr_to_match_.as<LoadNode>();
+    if (ptr == nullptr) {
+      match_success_ = false;
+    } else {
+      if (!op->buffer_var.same_as(ptr->buffer_var)) {
+        match_success_ = false;
+      } else {
+        PrimExpr tmp = expr_to_match_;
+        expr_to_match_ = ptr->predicate;
+        VisitExpr(op->predicate);
+        expr_to_match_ = ptr->index;
+        VisitExpr(op->index);
+        std::swap(expr_to_match_, tmp);
+      }
+    }
+  }
+
+  void VisitExpr_(const LetNode* op) final {
+    const auto* ptr = expr_to_match_.as<LetNode>();
+    if (ptr == nullptr) {
+      match_success_ = false;
+    } else {
+      PrimExpr tmp = expr_to_match_;
+      expr_to_match_ = ptr->var;
+      VisitExpr(op->var);
+      expr_to_match_ = ptr->value;
+      VisitExpr(op->value);
+      expr_to_match_ = ptr->body;
+      VisitExpr(op->body);
+      std::swap(expr_to_match_, tmp);
+    }
+  }
+
+  void VisitExpr_(const CallNode* op) final {
+    const auto* ptr = expr_to_match_.as<CallNode>();
+    if (ptr == nullptr) {
+      match_success_ = false;
+    } else {
+      if (!op->op.same_as(ptr->op)) {
+        match_success_ = false;
+      } else {
+        PrimExpr tmp = expr_to_match_;
+        for (size_t i = 0; i < op->args.size(); ++i) {
+          expr_to_match_ = ptr->args[i];
+          VisitExpr(op->args[i]);
+        }
+        std::swap(expr_to_match_, tmp);
+      }
+    }
+  }
+
+#define TVM_DECLARE_PATTERN_MATCHER_BIN_OP(OpName) \
+  void VisitExpr_(const OpName* op) {              \
+    const auto* ptr = expr_to_match_.as<OpName>(); \
+    if (ptr == nullptr) {                          \
+      match_success_ = false;                      \
+    } else {                                       \
+      PrimExpr current = expr_to_match_;           \
+      expr_to_match_ = ptr->a;                     \
+      VisitExpr(op->a);                            \
+      expr_to_match_ = ptr->b;                     \
+      VisitExpr(op->b);                            \
+      std::swap(expr_to_match_, current);          \
+    }                                              \
+  }
+
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(AddNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(SubNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MulNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(DivNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(ModNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(FloorDivNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(FloorModNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MinNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MaxNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(EQNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(NENode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(LTNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(LENode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(GTNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(GENode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(AndNode);
+  TVM_DECLARE_PATTERN_MATCHER_BIN_OP(OrNode);
+
+  void VisitExpr_(const CastNode* op) final {
+    const auto* ptr = expr_to_match_.as<CastNode>();
+    if (ptr == nullptr) {
+      match_success_ = false;
+    } else {
+      if (!runtime::TypeEqual(op->dtype, ptr->dtype)) {
+        match_success_ = false;
+      } else {
+        PrimExpr tmp = expr_to_match_;
+        expr_to_match_ = ptr->value;
+        VisitExpr(op->value);
+        std::swap(expr_to_match_, tmp);
+      }
+    }
+  }
+
+  void VisitExpr_(const NotNode* op) final {
+    const auto* ptr = expr_to_match_.as<NotNode>();
+    if (ptr == nullptr) {
+      match_success_ = false;
+    } else {
+      PrimExpr tmp = expr_to_match_;
+      expr_to_match_ = ptr->a;
+      VisitExpr(op->a);
+      std::swap(expr_to_match_, tmp);
+    }
+  }
+
+  void VisitExpr_(const SelectNode* op) final {
+    const auto* ptr = expr_to_match_.as<SelectNode>();
+    if (ptr == nullptr) {
+      match_success_ = false;
+    } else {
+      PrimExpr tmp = expr_to_match_;
+      expr_to_match_ = ptr->condition;
+      VisitExpr(op->condition);
+      expr_to_match_ = ptr->true_value;
+      VisitExpr(op->true_value);
+      expr_to_match_ = ptr->false_value;
+      VisitExpr(op->false_value);
+      std::swap(expr_to_match_, tmp);
+    }
+  }
+
+  void VisitExpr_(const RampNode* op) final {
+    const auto* ptr = expr_to_match_.as<RampNode>();
+    if (ptr == nullptr) {
+      match_success_ = false;
+    } else {
+      if (op->lanes != ptr->lanes) {
+        match_success_ = false;
+      } else {
+        PrimExpr tmp = expr_to_match_;
+        expr_to_match_ = ptr->base;
+        VisitExpr(op->base);
+        expr_to_match_ = ptr->stride;
+        VisitExpr(op->stride);
+        std::swap(expr_to_match_, tmp);
+      }
+    }
+  }
+
+  void VisitExpr_(const BroadcastNode* op) final {
+    const auto* ptr = expr_to_match_.as<BroadcastNode>();
+    if (ptr == nullptr) {
+      match_success_ = false;
+    } else {
+      if (op->lanes != ptr->lanes) {
+        match_success_ = false;
+      } else {
+        PrimExpr tmp = expr_to_match_;
+        expr_to_match_ = ptr->value;
+        VisitExpr(op->value);
+        std::swap(expr_to_match_, tmp);
+      }
+    }
+  }
+
+  void VisitExpr_(const ShuffleNode* op) final {
+    const auto* ptr = expr_to_match_.as<ShuffleNode>();
+    if (ptr == nullptr) {
+      match_success_ = false;
+    } else {
+      if (op->vectors.size() != ptr->vectors.size() || op->indices.size() != ptr->indices.size()) {
+        match_success_ = false;
+      } else {
+        PrimExpr tmp = expr_to_match_;
+        for (size_t i = 0; i < op->indices.size(); ++i) {
+          expr_to_match_ = ptr->indices[i];
+          VisitExpr(op->indices[i]);
+        }
+        for (size_t i = 0; i < op->vectors.size(); ++i) {
+          expr_to_match_ = ptr->vectors[i];
+          VisitExpr(op->vectors[i]);
+        }
+        std::swap(expr_to_match_, tmp);
+      }
+    }
+  }
+
+  void VisitExpr_(const IntImmNode* op) final {
+    const auto* ptr = expr_to_match_.as<IntImmNode>();
+    match_success_ = ptr != nullptr && op->value == ptr->value;
+  }
+
+  void VisitExpr_(const FloatImmNode* op) final {
+    const auto* ptr = expr_to_match_.as<FloatImmNode>();
+    match_success_ = ptr != nullptr && op->value == ptr->value;
+  }
+
+  void VisitExpr_(const StringImmNode* op) final {
+    const auto* ptr = expr_to_match_.as<StringImmNode>();
+    match_success_ = ptr != nullptr && op->value == ptr->value;
+  }
+
+  void VisitExpr_(const BufferLoadNode* op) final {
+    const auto* ptr = expr_to_match_.as<BufferLoadNode>();
+    if (ptr == nullptr) {
+      match_success_ = false;
+    } else {
+      if (!op->buffer.same_as(ptr->buffer) || op->indices.size() != ptr->indices.size()) {
+        match_success_ = false;
+      } else {
+        PrimExpr tmp = expr_to_match_;
+        for (size_t i = 0; i < op->indices.size(); ++i) {
+          expr_to_match_ = ptr->indices[i];
+          VisitExpr(op->indices[i]);
+        }
+        std::swap(expr_to_match_, tmp);
+      }
+    }
+  }
+
+  void Match(const Array<PrimExpr>& exprs_to_match) {
+    this->match_success_ = true;
+    this->filled_map_.clear();
+
+    ICHECK_EQ(pattern_.size(), exprs_to_match.size());
+    int n_buffers = pattern_.size();
+    for (int i = 0; i < n_buffers; ++i) {
+      this->expr_to_match_ = exprs_to_match[i];
+      this->operator()(pattern_[i]);
+    }
+  }
+
+  PrimExpr Eval(const Var& var) {
+    auto it = filled_map_.find(var.operator->());
+    ICHECK(it != filled_map_.end()) << "Unknown pattern variable";
+    ICHECK(match_success_) << "Match failed";
+    return it->second;
+  }
+
+  bool Success() const { return match_success_; }
+
+ private:
+  bool match_success_{true};
+  Array<PrimExpr> pattern_;
+  PrimExpr expr_to_match_;
+  std::unordered_map<const VarNode*, PrimExpr> filled_map_;
+};
+
+/******** Reduction Block Related ********/
+
+static const char* kRFactorCrossThreadReductionApplicableBlockDef =
+    R"(Definition of a reduction block that is applicable by RFactor and Cross-Thread Reduction:
+1) The block init should be a single BufferStore or a SeqStmt of BufferStores
+2) The buffers initialized in the block init should be all different
+3) The number of consecutive LetStmts in the block body (if any) should equal the number of BufferStores in the block init
+4) The variables of the LetStmts in the block body should be all different
+5) The body of the innermost LetStmt should be a single BufferStore or a SeqStmt of BufferStores
+6) The number of BufferStores under the block body should equal the number of BufferStores in the block init, and thereby equal the number of LetStmts above
+7) The variables bound by the LetStmts in the block body must all directly serve as values of the BufferStores inside, and the stored values of the BufferStores can only be those variables
+8) The variables stored by the BufferStores in the block body should be all different
+9) The buffers written by the BufferStores in the block body should be all different
+10) The buffers initialized in the block init and written in the block body should match
+11) The buffers written by the block should have same shape
+12) The indices of all BufferStores in the reduction block should be the same)";
+
+void ErrorRFactorCrossThreadReductionNotApplicable(const Optional<ScheduleState>& self, Block block,
+                                                   int violated_cond) {
+  class RFactorNotApplicableError : public ScheduleError {
+   public:
+    explicit RFactorNotApplicableError(IRModule mod, Block block, int violated_cond)
+        : mod_(std::move(mod)), block_(std::move(block)), violated_cond_(violated_cond) {}
+
+    String FastErrorString() const final {
+      return "ScheduleError: RFactor cannot be applied to the block since the block does not meet "
+             "the requirements";
+    }
+
+    String DetailRenderTemplate() const final {
+      std::ostringstream os;
+      os << "RFactor cannot be applied to block {0}, because the block violates condition #"
+         << violated_cond_ << ".\n"
+         << kRFactorCrossThreadReductionApplicableBlockDef;
+      return os.str();
+    }
+
+    IRModule mod() const final { return mod_; }
+    Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
+
+    IRModule mod_;
+    Block block_;
+    int violated_cond_;
+  };
+
+  if (self.defined()) {
+    throw RFactorNotApplicableError(self.value()->mod, std::move(block), violated_cond);
+  } else {
+    LOG(FATAL) << "ValueError: Cross-thread reduction cannot be applied to the block "
+               << block->name_hint << " because the block violates the condition #" << violated_cond
+               << ".\n"
+               << kRFactorCrossThreadReductionApplicableBlockDef;
+  }
+}
+
+/*!
+ * \brief Extract the BufferStores, which serve as the reduction updates, from the given LetStmt and
+ * the BufferStores inside. And meanwhile set the buffer order of the reduction
+ * \param self The schedule state, used for error reporting
+ * \param block The reduction block, used for error reporting
+ * \param let The LetStmt from which the reduction updates are extracted
+ * \param n_buffers The number of buffers participating in the reduction
+ * \param updates The extracted reduction updates
+ * \param buf2index A mapping from reduction buffers to their indices of the reduction order
+ * \throw ScheduleError If rfactor or cross-thread reduction cannot be applied to the block
+ */
+void ExtractReductionUpdates(const Optional<ScheduleState>& self, Block block,
+                             const LetStmtNode* let, int n_buffers, Array<BufferStore>* updates,
+                             std::unordered_map<const BufferNode*, int>* buf2index) {
+  std::unordered_map<const VarNode*, int> var2index;
+  Array<PrimExpr> let_values;
+  let_values.reserve(n_buffers);
+  updates->resize(n_buffers);
+
+  // Step 1.
+  // - Extract the BufferStore values from the LetStmts.
+  // - Construct the mapping from let variables to the index.
+  for (int i = 0; i < n_buffers; ++i) {
+    if (let == nullptr) {
+      ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/3);
+    }
+
+    let_values.push_back(let->value);
+    auto insert_result = var2index.insert(std::make_pair(let->var.get(), i));
+    if (!insert_result.second) {
+      ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/4);
+    }
+    if (i != n_buffers - 1) {
+      let = let->body.as<LetStmtNode>();
+    }
+  }
+
+  // There should be no more LetStmt.
+  if (let->body->IsInstance<LetStmtNode>()) {
+    ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/3);
+  }
+
+  // Now `let` is expected to be the innermost LetStmt, whose body should either be a SeqStmt or
+  // a BufferStore
+  const auto* p_seq = let->body.as<SeqStmtNode>();
+  const auto* p_buf_store = let->body.as<BufferStoreNode>();
+  if (p_seq == nullptr && p_buf_store == nullptr) {
+    ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/5);
+  }
+  SeqStmt seq =
+      p_seq != nullptr ? GetRef<SeqStmt>(p_seq) : SeqStmt({GetRef<BufferStore>(p_buf_store)});
+  if (static_cast<int>(seq->seq.size()) != n_buffers) {
+    ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/6);
+  }
+
+  // Step 2.
+  // - Create BufferStores according to the variables being stored.
+  // - Construct the mapping from reduction buffers to the index.
+  for (const Stmt& stmt : seq->seq) {
+    const auto* buf_store = stmt.as<BufferStoreNode>();
+    if (buf_store == nullptr) {
+      ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/5);
+    }
+    const auto* var = buf_store->value.as<VarNode>();
+    if (var == nullptr) {
+      ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/7);
+    }
+    auto it = var2index.find(var);
+    if (it == var2index.end()) {
+      ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/7);
+    }
+    int idx = it->second;
+    if ((*updates)[idx].defined()) {
+      ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/8);
+    }
+    updates->Set(idx, BufferStore(buf_store->buffer, let_values[idx], buf_store->indices));
+    auto insert_result = buf2index->insert(std::make_pair(buf_store->buffer.get(), idx));
+    if (!insert_result.second) {
+      ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/9);
+    }
+  }
+  for (int i = 0; i < n_buffers; ++i) {
+    ICHECK((*updates)[i].defined());
+  }
+}
+
+std::pair<Array<PrimExpr>, Array<BufferStore>> GetInitValuesAndUpdatesFromReductionBlock(
+    const Optional<ScheduleState>& self, Block block) {
+  Array<BufferStore> inits;
+  Array<BufferStore> updates;
+
+  // Step 1. Extract the BufferStores serving as block inits.
+  if (const auto* init = block->init.as<BufferStoreNode>()) {
+    inits.push_back(GetRef<BufferStore>(init));
+  } else if (const auto* seq_init = block->init.as<SeqStmtNode>()) {
+    std::unordered_set<const BufferNode*> init_buffers;
+    for (const Stmt& stmt : seq_init->seq) {
+      init = stmt.as<BufferStoreNode>();
+      if (init == nullptr) {
+        ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/1);
+      }
+      auto insert_result = init_buffers.insert(init->buffer.get());
+      if (!insert_result.second) {
+        ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/2);
+      }
+      inits.push_back(GetRef<BufferStore>(init));
+    }
+  } else {
+    ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/1);
+  }
+
+  // Step 2. Extract the block updates, in the form of BufferStores.
+  int n_buffers = inits.size();
+  std::unordered_map<const BufferNode*, int> buf2index;
+  if (const auto* update = block->body.as<BufferStoreNode>()) {
+    updates.push_back(GetRef<BufferStore>(update));
+    buf2index[update->buffer.get()] = 0;
+  } else {
+    const auto* let = block->body.as<LetStmtNode>();
+    ExtractReductionUpdates(self, block, let, n_buffers, &updates, &buf2index);
+  }
+  ICHECK_EQ(updates.size(), n_buffers);
+
+  // Step 3. Set the init values according to the buffer order in `updates`, with the help of the
+  // mapping `buf2index`.
+  Array<PrimExpr> init_values;
+  init_values.resize(n_buffers);
+
+  // - Check all buffers have the same shape
+  // - Check all indices of the BufferStores are the same
+  // - Check buffers written in the block init and the block body can match
+  // - Check buffers do not duplicate
+  const Array<PrimExpr>& expected_shape = updates[0]->buffer->shape;
+  const Array<PrimExpr>& expected_indices = updates[0]->indices;
+  ICHECK_EQ(expected_shape.size(), expected_indices.size());
+  int n_dim = expected_indices.size();
+  arith::Analyzer ana;
+  for (int i = 0; i < n_buffers; ++i) {
+    if (static_cast<int>(updates[i]->buffer->shape.size()) != n_dim) {
+      ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/11);
+    }
+    if (static_cast<int>(inits[i]->indices.size()) != n_dim ||
+        static_cast<int>(updates[i]->indices.size()) != n_dim) {
+      ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/12);
+    }
+    for (int d = 0; d < n_dim; ++d) {
+      if (!ana.CanProveEqual(updates[i]->buffer->shape[d], expected_shape[d])) {
+        ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/11);
+      }
+      if (!ana.CanProveEqual(inits[i]->indices[d], expected_indices[d]) ||
+          !ana.CanProveEqual(updates[i]->indices[d], expected_indices[d])) {
+        ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/12);
+      }
+    }
+
+    auto it = buf2index.find(inits[i]->buffer.get());
+    if (it == buf2index.end()) {
+      ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/10);
+    }
+    int idx = it->second;
+    ICHECK(updates[idx]->buffer.same_as(inits[i]->buffer));
+    ICHECK(!init_values[idx].defined());
+    init_values.Set(idx, inits[i]->value);
+  }
+  for (int i = 0; i < n_buffers; ++i) {
+    ICHECK(init_values[i].defined());
+  }
+
+  return std::make_pair(init_values, updates);
+}
+
+bool ContainsOnlyDataParAndReductionBlockIter(const Array<IterVar>& iters) {
+  for (const IterVar& iter_var : iters) {
+    if (iter_var->iter_type != kDataPar && iter_var->iter_type != kCommReduce) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ReductionIterNotIndexOutputBuffer(const Block& block) {
+  // Step 1. Collect the reduction block iters.
+  std::unordered_set<const VarNode*> reduction_block_iters;
+  reduction_block_iters.reserve(block->iter_vars.size());
+  for (const IterVar& iter_var : block->iter_vars) {
+    if (iter_var->iter_type == kCommReduce) {
+      reduction_block_iters.insert(iter_var->var.get());
+    }
+  }
+  // Step 2. Check if the reduction block iters are used to index the output buffer.
+  std::unordered_set<const BufferNode*> buffer_written;
+  buffer_written.reserve(block->writes.size());
+  for (const BufferRegion& write_region : block->writes) {
+    buffer_written.insert(write_region->buffer.get());
+  }
+  auto f_uses_reduction_block_var = [&](const PrimExpr& expr) -> bool {
+    return UsesVar(expr, [&](const VarNode* var) {  //
+      return reduction_block_iters.count(var);
+    });
+  };
+  bool affected = false;
+  PreOrderVisit(block->body, [&](const ObjectRef& obj) {
+    if (affected) {
+      return false;
+    }
+    const auto* store = obj.as<BufferStoreNode>();
+    if (!store) {
+      return true;
+    }
+    ICHECK(buffer_written.count(store->buffer.get()))
+        << "ValueError: The buffer \"" << store->buffer
+        << "\" is written in the block but is not in the block's signature";
+    for (const PrimExpr& index : store->indices) {
+      if (f_uses_reduction_block_var(index)) {
+        affected = true;
+        return false;
+      }
+    }
+    return false;
+  });
+  return !affected;
+}
+
+class NoMatchedReducerError : public ScheduleError {
+ public:
+  explicit NoMatchedReducerError(IRModule mod, Array<PrimExpr> identities,
+                                 Array<BufferStore> combiners)
+      : mod_(std::move(mod)),
+        identities_(std::move(identities)),
+        combiners_(std::move(combiners)) {}
+
+  String FastErrorString() const final {
+    return "ScheduleError: No matched reducer for the identity and the combiner of this reduction "
+           "block. So rfactor and cross-thread reduction cannot be applied.";
+  }
+
+  String DetailRenderTemplate() const final {
+    std::ostringstream os;
+    os << "No matched reducer for identity " << identities_ << " and combiner " << combiners_
+       << "In this case rfactor cannot be applied. You can check tvm::tir::ReducerRegistry for "
+          "default reducers or registering new reducers.";
+    return os.str();
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {}; }
+
+  IRModule mod_;
+  Array<PrimExpr> identities_;
+  Array<BufferStore> combiners_;
+};
+
+std::tuple<CommReducer, Array<PrimExpr>, Array<PrimExpr>> GetReducerAndCombinerLhsRhs(
+    const Optional<ScheduleState>& self, const Array<PrimExpr>& identities,
+    const Array<BufferStore>& combiners) {
+  CommReducer reducer{nullptr};
+  Array<PrimExpr> combiner_lhs, combiner_rhs;
+  bool matched =
+      FromIdentityCombiner(identities, combiners, &reducer, &combiner_lhs, &combiner_rhs);
+  if (!matched) {
+    if (self.defined()) {
+      throw NoMatchedReducerError(self.value()->mod, identities, combiners);
+    } else {
+      LOG(FATAL) << "ValueError: No matched reducer for the identity and the combiner of the "
+                    "reduction block. So rfactor and cross-thread reduction cannot be applied.";
+    }
+  }
+  return std::make_tuple(std::move(reducer), std::move(combiner_lhs), std::move(combiner_rhs));
+}
+
+/******** Commutative Reducer ********/
+
+bool MatchReducer(const CommReducer& reducer, const Array<PrimExpr>& identities,
+                  const Array<PrimExpr>& combined_values, const Array<BufferLoad>& buf_loads,
+                  Array<PrimExpr>* lhs, Array<PrimExpr>* rhs) {
+  ExprDeepEqual equal;
+  ICHECK_EQ(identities.size(), combined_values.size());
+  int n_buffers = identities.size();
+  for (int i = 0; i < n_buffers; ++i) {
+    if (!equal(reducer->identity_element[i], identities[i])) {
+      return false;
+    }
+  }
+
+  PatternMatcher pattern_matcher(reducer->result);
+  pattern_matcher.Match(combined_values);
+  Array<PrimExpr> lhs_tmp, rhs_tmp;
+  lhs_tmp.reserve(n_buffers);
+  rhs_tmp.reserve(n_buffers);
+  if (!pattern_matcher.Success()) {
+    return false;
+  }
+
+  for (int i = 0; i < n_buffers; ++i) {
+    PrimExpr l = pattern_matcher.Eval(reducer->lhs[i]);
+    PrimExpr r = pattern_matcher.Eval(reducer->rhs[i]);
+    if (!equal(buf_loads[i], l)) {
+      return false;
+    }
+    lhs_tmp.push_back(l);
+    rhs_tmp.push_back(r);
+  }
+  *lhs = std::move(lhs_tmp);
+  *rhs = std::move(rhs_tmp);
+  return true;
+}
+
+bool FromIdentityCombiner(const Array<PrimExpr>& identities, const Array<BufferStore>& combiners,
+                          CommReducer* result_reducer, Array<PrimExpr>* lhs, Array<PrimExpr>* rhs) {
+  int n = identities.size();
+  Array<BufferLoad> buf_loads;
+  Array<PrimExpr> stored_values;
+  buf_loads.reserve(n);
+  stored_values.reserve(n);
+
+  for (int i = 0; i < n; ++i) {
+    buf_loads.push_back(BufferLoad(combiners[i]->buffer, combiners[i]->indices));
+    stored_values.push_back(combiners[i]->value);
+  }
+
+  // Check reduction patterns.
+  for (const TypedPackedFunc<Optional<CommReducer>(Array<PrimExpr>)>& reducer_getter :
+       GetReducerGetters()) {
+    Optional<CommReducer> reducer = reducer_getter(identities);
+    if (!reducer.defined()) {
+      continue;
+    }
+    if (MatchReducer(reducer.value(), identities, stored_values, buf_loads, lhs, rhs)) {
+      *result_reducer = reducer.value();
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc
index 1198e67d710a..2dc47fa15bea 100644
--- a/src/tir/schedule/primitive/reduction.cc
+++ b/src/tir/schedule/primitive/reduction.cc
@@ -297,29 +297,85 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref,
  */
 struct ReducerRegistry {
   ReducerRegistry()
-      : reducer_getters{CreateReducerGetter([](const Var& x, const Var& y) { return x + y; },
-                                            [](DataType dtype) { return make_const(dtype, 0); }),
-                        CreateReducerGetter([](const Var& x, const Var& y) { return x * y; },
-                                            [](DataType dtype) { return make_const(dtype, 1); }),
-                        CreateReducerGetter([](const Var& x, const Var& y) { return min(x, y); },
-                                            [](DataType dtype) { return max_value(dtype); }),
-                        CreateReducerGetter([](const Var& x, const Var& y) { return max(x, y); },
-                                            [](DataType dtype) { return min_value(dtype); })} {}
-
-  static void RegisterReducer(TypedPackedFunc<PrimExpr(Var, Var)> combiner_getter,
-                              TypedPackedFunc<PrimExpr(DataType)> identity_getter) {
+      : reducer_getters{CreateReducerGetter(
+                            /*n_buffers=*/1,
+                            [](const Array<Var>& x, const Array<Var>& y) {
+                              return Array<PrimExpr>{x[0] + y[0]};
+                            },
+                            [](const Array<PrimExpr>& values) {
+                              return Array<PrimExpr>{make_const(values[0]->dtype, 0)};
+                            }),
+                        CreateReducerGetter(
+                            /*n_buffers=*/1,
+                            [](const Array<Var>& x, const Array<Var>& y) {
+                              return Array<PrimExpr>{x[0] * y[0]};
+                            },
+                            [](const Array<PrimExpr>& values) {
+                              return Array<PrimExpr>{make_const(values[0]->dtype, 1)};
+                            }),
+                        CreateReducerGetter(
+                            /*n_buffers=*/1,
+                            [](const Array<Var>& x, const Array<Var>& y) {
+                              return Array<PrimExpr>{min(x[0], y[0])};
+                            },
+                            [](const Array<PrimExpr>& values) {
+                              return Array<PrimExpr>{max_value(values[0]->dtype)};
+                            }),
+                        CreateReducerGetter(
+                            /*n_buffers=*/1,
+                            [](const Array<Var>& x, const Array<Var>& y) {
+                              return Array<PrimExpr>{max(x[0], y[0])};
+                            },
+                            [](const Array<PrimExpr>& values) {
+                              return Array<PrimExpr>{min_value(values[0]->dtype)};
+                            }),
+                        CreateReducerGetter(
+                            /*n_buffers=*/2,
+                            [](const Array<Var>& x, const Array<Var>& y) {
+                              PrimExpr idx = Select(x[1] >= y[1], x[0], y[0]);
+                              PrimExpr val = Select(x[1] >= y[1], x[1], y[1]);
+                              return Array<PrimExpr>{idx, val};
+                            },
+                            [](const Array<PrimExpr>& values) {
+                              return Array<PrimExpr>{make_const(values[0]->dtype, -1),
+                                                     min_value(values[1]->dtype)};
+                            }),
+                        CreateReducerGetter(
+                            /*n_buffers=*/2,
+                            [](const Array<Var>& x, const Array<Var>& y) {
+                              PrimExpr idx = Select(x[1] <= y[1], x[0], y[0]);
+                              PrimExpr val = Select(x[1] <= y[1], x[1], y[1]);
+                              return Array<PrimExpr>{idx, val};
+                            },
+                            [](const Array<PrimExpr>& values) {
+                              return Array<PrimExpr>{make_const(values[0]->dtype, -1),
+                                                     max_value(values[1]->dtype)};
+                            })} {}
+
+  static void RegisterReducer(
+      int n_buffers, TypedPackedFunc<Array<PrimExpr>(Array<Var>, Array<Var>)> combiner_getter,
+      TypedPackedFunc<Array<PrimExpr>(Array<PrimExpr>)> identity_getter) {
     ReducerRegistry::Global()->reducer_getters.push_back(ReducerRegistry::CreateReducerGetter(
-        std::move(combiner_getter), std::move(identity_getter)));
+        n_buffers, std::move(combiner_getter), std::move(identity_getter)));
   }
 
-  static TypedPackedFunc<CommReducer(DataType)> CreateReducerGetter(
-      TypedPackedFunc<PrimExpr(Var, Var)> combiner_getter,
-      TypedPackedFunc<PrimExpr(DataType)> identity_getter) {
-    return [combiner_getter = std::move(combiner_getter),
-            identity_getter = std::move(identity_getter)](DataType dtype) -> CommReducer {
-      Var lhs("x", dtype);
-      Var rhs("y", dtype);
-      return CommReducer({lhs}, {rhs}, {combiner_getter(lhs, rhs)}, {identity_getter(dtype)});
+  static TypedPackedFunc<Optional<CommReducer>(Array<PrimExpr>)> CreateReducerGetter(
+      int n_buffers, TypedPackedFunc<Array<PrimExpr>(Array<Var>, Array<Var>)> combiner_getter,
+      TypedPackedFunc<Array<PrimExpr>(Array<PrimExpr>)> identity_getter) {
+    return [n_buffers,                                     //
+            combiner_getter = std::move(combiner_getter),  //
+            identity_getter = std::move(identity_getter)   //
+    ](Array<PrimExpr> values) -> Optional<CommReducer> {
+      if (static_cast<int>(values.size()) != n_buffers) {
+        return NullOpt;
+      }
+      Array<Var> lhs;
+      Array<Var> rhs;
+      for (int i = 0; i < n_buffers; ++i) {
+        lhs.push_back(Var("x" + std::to_string(i), values[i]->dtype));
+        rhs.push_back(Var("y" + std::to_string(i), values[i]->dtype));
+      }
+      return CommReducer(lhs, rhs, combiner_getter(lhs, rhs), identity_getter(values));
     };
   }
 
@@ -328,10 +384,10 @@ struct ReducerRegistry {
     return &instance;
   }
 
-  std::vector<TypedPackedFunc<CommReducer(DataType)>> reducer_getters;
+  std::vector<TypedPackedFunc<Optional<CommReducer>(Array<PrimExpr>)>> reducer_getters;
 };
 
-std::vector<TypedPackedFunc<CommReducer(DataType)>> GetReducerGetters() {
+std::vector<TypedPackedFunc<Optional<CommReducer>(Array<PrimExpr>)>> GetReducerGetters() {
   return ReducerRegistry::Global()->reducer_getters;
 }
 
@@ -508,44 +564,57 @@ std::unordered_map<const VarNode*, For> GetLoopVar2LoopMap(const Array<For>& loo
 }
 
 /*!
- * \brief Create the intermediate rfactor buffer, which the rfactor block writes to and the
+ * \brief Create the intermediate rfactor buffers, which the rfactor block writes to and the
  * write-back block reads from
- * \param buffer The buffer written by the reduction block
+ * \param buf_stores The BufferStores of the original block, where the rfactor buffers will be
+ * created from
  * \param factor_axis The `factor_axis` parameter of rfactor
  * \param rf_loop The rfactor loop
  * \return The new created intermediate rfactor buffer
  */
-Buffer CreateRFactorBuffer(const Buffer& buffer, int factor_axis, const ForNode* rf_loop) {
-  Array<PrimExpr> rf_shape = buffer->shape;
-  rf_shape.insert(rf_shape.begin() + factor_axis, rf_loop->extent);
-
-  ObjectPtr<BufferNode> n = make_object<BufferNode>(*buffer.get());
-  n->shape = rf_shape;
-  n->name = buffer->name + ".rf";
-  n->data = buffer->data.copy_with_suffix(".rf");
-  return Buffer(n);
+Array<Buffer> CreateRFactorBuffers(const Array<BufferStore>& buf_stores, int factor_axis,
+                                   const ForNode* rf_loop) {
+  Array<Buffer> rf_buffers;
+  rf_buffers.reserve(buf_stores.size());
+  for (const BufferStore& buf_store : buf_stores) {
+    Buffer buffer = buf_store->buffer;
+    Array<PrimExpr> rf_shape = buffer->shape;
+    rf_shape.insert(rf_shape.begin() + factor_axis, rf_loop->extent);
+
+    ObjectPtr<BufferNode> n = make_object<BufferNode>(*buffer.get());
+    n->shape = rf_shape;
+    n->name = buffer->name + ".rf";
+    n->data = buffer->data.copy_with_suffix(".rf");
+    rf_buffers.push_back(Buffer(n));
+  }
+  return rf_buffers;
 }
 
 /*!
  * \brief The base class of the rfactor/write-back block creator, which creates the blocks in four
  * steps:
  * 1) Create the new block iters and the their iter bindings
- * 2) Create the reduction update of the new block
+ * 2) Create the body and init of the new block
  * 3) Create the read/write regions of the new block
  * 4) Create the new block and the new block-realize
  */
 class BaseBlockCreator {
  public:
   explicit BaseBlockCreator(BlockRealize old_block_realize, For rf_loop,
-                            BufferStore old_reduction_update, CommReducer reducer, Buffer rf_buffer,
-                            bool is_rf_block)
+                            Array<BufferStore> old_reduction_updates, CommReducer reducer,
+                            Array<Buffer> rf_buffers, bool is_rf_block)
       : old_block_realize_(std::move(old_block_realize)),
         rf_loop_(std::move(rf_loop)),
-        old_reduction_update_(std::move(old_reduction_update)),
+        old_reduction_updates_(std::move(old_reduction_updates)),
         reducer_(std::move(reducer)),
-        rf_buffer_(std::move(rf_buffer)),
+        rf_buffers_(std::move(rf_buffers)),
+        n_buffers_(static_cast<int>(rf_buffers_.size())),
         is_rf_block_(is_rf_block) {
     n_block_iters_ = static_cast<int>(old_block_realize_->iter_values.size());
+    update_buffers_.reserve(n_buffers_);
+    update_indices_.reserve(n_buffers_);
+    update_lhs_.reserve(n_buffers_);
+    update_rhs_.reserve(n_buffers_);
   }
 
   void CreateBlock() {
@@ -560,7 +629,15 @@ class BaseBlockCreator {
         break;
       }
     }
-    CreateReductionUpdate(has_reduce_iter);
+
+    // The pre-processing finds out the buffers written in the block, the indices of the buffer
+    // accesses, and the reduction LHS and RHS of the stored values.
+    PreProcess();
+    Stmt block_body = Substitute(CreateBlockBody(has_reduce_iter), var_map_);
+    Optional<Stmt> block_init = CreateBlockInit(has_reduce_iter);
+    if (block_init.defined()) {
+      block_init = Substitute(block_init.value(), var_map_);
+    }
     CreateReadWriteRegions();
 
     String new_block_name = old_block_realize_->block->name_hint;
@@ -569,17 +646,13 @@ class BaseBlockCreator {
       new_block_name = new_block_name + "_rf";
       predicate = old_block_realize_->predicate;
     }
-    Optional<Stmt> init_block =
-        has_reduce_iter ? BufferStore(new_reduction_update_->buffer, reducer_->identity_element[0],
-                                      new_reduction_update_->indices)
-                        : Optional<Stmt>(NullOpt);
     new_block_ = Block(
         /*iter_vars=*/iter_vars_,
         /*reads=*/read_regions_,
         /*writes=*/write_regions_,
         /*name_hint=*/new_block_name,
-        /*body=*/new_reduction_update_,
-        /*init=*/init_block,
+        /*body=*/std::move(block_body),
+        /*init=*/std::move(block_init),
         /*alloc_buffers=*/{},
         /*match_buffers=*/{},
         /*annotations=*/old_block_realize_->block->annotations);
@@ -589,9 +662,58 @@ class BaseBlockCreator {
  private:
   virtual void CreateAdditionalIter() = 0;
   virtual void CreateNormalIters(int idx) = 0;
-  virtual void CreateReductionUpdate(bool has_reduce_iter) = 0;
+  virtual void PreProcess() = 0;
   virtual void CreateReadWriteRegions() = 0;
 
+  Stmt CreateBlockBody(bool has_reduce_iter) {
+    Array<Stmt> buf_stores;
+    buf_stores.reserve(n_buffers_);
+
+    // Case 1. If the block has no reduction iterator, we just store the RHS values into the
+    // buffers.
+    if (!has_reduce_iter) {
+      for (int i = 0; i < n_buffers_; ++i) {
+        buf_stores.push_back(BufferStore(update_buffers_[i], update_rhs_[i], update_indices_[i]));
+      }
+      return n_buffers_ > 1 ? SeqStmt(buf_stores) : buf_stores[0];
+    }
+
+    // Case 2. If the reduction is for single buffer, the block body is a single BufferStore.
+    Array<PrimExpr> stored_values = (*reducer_.get())(update_lhs_, update_rhs_);
+    if (n_buffers_ == 1) {
+      return BufferStore(update_buffers_[0], stored_values[0], update_indices_[0]);
+    }
+
+    // Case 3. In case the reduction is for multiple buffers, we should create the reduction with
+    // LetStmt so that the reduction execution generates correct results.
+    Array<Var> let_vars;
+    let_vars.reserve(n_buffers_);
+    for (int i = 0; i < n_buffers_; ++i) {
+      Var var("v_" + update_buffers_[i]->name, PrimType(stored_values[i]->dtype));
+      let_vars.push_back(var);
+      buf_stores.push_back(BufferStore(update_buffers_[i], var, update_indices_[i]));
+    }
+    Stmt body = SeqStmt(buf_stores);
+    for (int i = n_buffers_ - 1; i >= 0; --i) {
+      body = LetStmt(let_vars[i], stored_values[i], std::move(body));
+    }
+    return body;
+  }
+
+  Optional<Stmt> CreateBlockInit(bool has_reduce_iter) {
+    if (!has_reduce_iter) {
+      return NullOpt;
+    }
+
+    Array<Stmt> inits;
+    inits.reserve(n_buffers_);
+    for (int i = 0; i < n_buffers_; ++i) {
+      inits.push_back(
+          BufferStore(update_buffers_[i], reducer_->identity_element[i], update_indices_[i]));
+    }
+    return n_buffers_ > 1 ? SeqStmt(inits) : inits[0];
+  }
+
  public:
   /*! \brief The new created block */
   Block new_block_;
@@ -607,12 +729,19 @@ class BaseBlockCreator {
   int n_block_iters_;
   /*! \brief The rfactor loop */
   For rf_loop_;
-  /*! \brief The update BufferStore of the old block */
-  BufferStore old_reduction_update_;
+  /*! \brief The update BufferStores of the old block */
+  Array<BufferStore> old_reduction_updates_;
   /*! \brief The matched commutative reducer */
   CommReducer reducer_;
-  /*! \brief The intermediate rfactor buffer */
-  Buffer rf_buffer_;
+  /*! \brief The intermediate rfactor buffers */
+  Array<Buffer> rf_buffers_;
+  /*! \brief The number of rfactor buffers. */
+  const int n_buffers_;
+  /*!
+   * \brief A mapping which maps old block iters to new expressions. The old iters will be replaced
+   * by the expressions in future substitution for the two blocks
+   */
+  Map<Var, PrimExpr> var_map_;
 
   /*! \brief Whether we are creating the rfactor block or the write-back block */
   bool is_rf_block_;
@@ -620,13 +749,14 @@ class BaseBlockCreator {
   std::vector<IterVar> iter_vars_;
   /*! \brief The new block iter bindings of the new created block-realize */
   std::vector<PrimExpr> iter_values_;
-  /*!
-   * \brief A mapping which maps old block iters to new expressions. The old iters will be replaced
-   * by the expressions in future substitution for the two blocks
-   */
-  Map<Var, PrimExpr> var_map_;
-  /*! \brief The update BufferStore of the new created block */
-  BufferStore new_reduction_update_;
+  /*! \brief The buffers updated in this block */
+  Array<Buffer> update_buffers_;
+  /*! \brief The indices of the buffers updated in this block, respectively */
+  Array<Array<PrimExpr>> update_indices_;
+  /*! \brief The LHS values of the reduction in this block */
+  Array<PrimExpr> update_lhs_;
+  /*! \brief THe RHS values of the reduction in this block */
+  Array<PrimExpr> update_rhs_;
   /*! \brief The read regions of the new created block */
   Array<BufferRegion> read_regions_;
   /*! \brief The write regions of the new created block */
@@ -658,13 +788,13 @@ class BaseBlockCreator {
 class RFactorBlockCreator : public BaseBlockCreator {
  public:
   explicit RFactorBlockCreator(BlockRealize old_block_realize, For rf_loop,
-                               BufferStore old_reduction_update, CommReducer reducer,
-                               Buffer rf_buffer,
+                               Array<BufferStore> old_reduction_updates, CommReducer reducer,
+                               Array<Buffer> rf_buffers,
                                std::unordered_map<const VarNode*, For> loop_vars2loop,
-                               int factor_axis, PrimExpr combiner_rhs)
+                               int factor_axis, Array<PrimExpr> combiner_rhs)
       : BaseBlockCreator(std::move(old_block_realize), std::move(rf_loop),
-                         std::move(old_reduction_update), std::move(reducer), std::move(rf_buffer),
-                         true),
+                         std::move(old_reduction_updates), std::move(reducer),
+                         std::move(rf_buffers), true),
         loop_vars2loop_(std::move(loop_vars2loop)),
         factor_axis_(factor_axis),
         combiner_rhs_(std::move(combiner_rhs)) {}
@@ -718,41 +848,38 @@ class RFactorBlockCreator : public BaseBlockCreator {
     var_map_.Set(old_iter->var, Substitute(old_binding, loop_var2block_binding_));
   }
 
-  void CreateReductionUpdate(bool has_reduce_iter) final {
-    rf_buf_access_indices_ = old_reduction_update_->indices;
+  void PreProcess() final {
+    // The accessed indices for all reduction buffers are the same.
+    rf_buf_access_indices_ = old_reduction_updates_[0]->indices;
     rf_buf_access_indices_.insert(rf_buf_access_indices_.begin() + factor_axis_,
                                   additional_iter_->var);
-    PrimExpr rhs{nullptr};
-    if (has_reduce_iter) {
-      rhs = (*reducer_.get())({BufferLoad(rf_buffer_, rf_buf_access_indices_)}, {combiner_rhs_})[0];
-    } else {
-      rhs = combiner_rhs_;
+    for (int i = 0; i < n_buffers_; ++i) {
+      update_buffers_.push_back(rf_buffers_[i]);
+      update_indices_.push_back(rf_buf_access_indices_);
+      update_lhs_.push_back(BufferLoad(update_buffers_[i], rf_buf_access_indices_));
+      update_rhs_.push_back(combiner_rhs_[i]);
     }
-    new_reduction_update_ = BufferStore(rf_buffer_, rhs, rf_buf_access_indices_);
-    new_reduction_update_ = Downcast<BufferStore>(Substitute(new_reduction_update_, var_map_));
   }
 
   void CreateReadWriteRegions() final {
+    Map<Buffer, Buffer> buffer_map;
+    for (int i = 0; i < n_buffers_; ++i) {
+      buffer_map.Set(old_reduction_updates_[i]->buffer, rf_buffers_[i]);
+    }
     const Block& old_block = old_block_realize_->block;
-    read_regions_ = CreateRegions(old_block->reads);
-    write_regions_ = CreateRegions(old_block->writes);
-  }
-
-  Array<BufferRegion> CreateRegions(const Array<BufferRegion>& old_regions) {
-    Array<BufferRegion> new_regions;
-    new_regions.reserve(old_regions.size());
-    for (const BufferRegion& buffer_region : old_regions) {
-      if (buffer_region->buffer.same_as(old_reduction_update_->buffer)) {
-        Array<Range> region = buffer_region->region;
-        region.insert(region.begin() + factor_axis_,
-                      Range::FromMinExtent(additional_iter_->var, 1));
-        new_regions.push_back(BufferRegion(rf_buffer_, Substitute(region, var_map_)));
-      } else {
-        new_regions.push_back(
-            BufferRegion(buffer_region->buffer, Substitute(buffer_region->region, var_map_)));
-      }
+    read_regions_.reserve(old_block->reads.size());
+    for (const BufferRegion& read_region : old_block->reads) {
+      read_regions_.push_back(
+          BufferRegion(read_region->buffer, Substitute(read_region->region, var_map_)));
+    }
+    write_regions_.reserve(old_block->writes.size());
+    for (const BufferRegion& write_region : old_block->writes) {
+      Array<Range> region = write_region->region;
+      region.insert(region.begin() + factor_axis_, Range::FromMinExtent(additional_iter_->var, 1));
+      Optional<Buffer> rf_buffer = buffer_map.Get(write_region->buffer);
+      ICHECK(rf_buffer.defined());
+      write_regions_.push_back(BufferRegion(rf_buffer.value(), Substitute(region, var_map_)));
     }
-    return new_regions;
   }
 
  public:
@@ -767,8 +894,8 @@ class RFactorBlockCreator : public BaseBlockCreator {
   std::unordered_map<const VarNode*, For> loop_vars2loop_;
   /*! \brief The factor_axis specified for rfactor */
   int factor_axis_;
-  /*! \brief The rhs of the combiner in the reduction update of the old block */
-  PrimExpr combiner_rhs_;
+  /*! \brief The RHS values of the reduction in the old block */
+  Array<PrimExpr> combiner_rhs_;
   /*!
    * \brief A mapping which maps loop vars to new created block iters. This map is used to
    * substitute the loop vars which appear in the bindings of some old block iters with the new
@@ -784,12 +911,13 @@ class RFactorBlockCreator : public BaseBlockCreator {
 class WriteBackBlockCreator : public BaseBlockCreator {
  public:
   explicit WriteBackBlockCreator(BlockRealize old_block_realize, For rf_loop,
-                                 BufferStore old_reduction_update, CommReducer reducer,
-                                 Buffer rf_buffer, IterVar rf_additional_iter,
-                                 PrimExpr combiner_lhs, Array<PrimExpr> rf_buf_access_indices)
+                                 Array<BufferStore> old_reduction_updates, CommReducer reducer,
+                                 Array<Buffer> rf_buffers, IterVar rf_additional_iter,
+                                 Array<PrimExpr> combiner_lhs,
+                                 Array<PrimExpr> rf_buf_access_indices)
       : BaseBlockCreator(std::move(old_block_realize), std::move(rf_loop),
-                         std::move(old_reduction_update), std::move(reducer), std::move(rf_buffer),
-                         false),
+                         std::move(old_reduction_updates), std::move(reducer),
+                         std::move(rf_buffers), false),
         rf_additional_iter_(std::move(rf_additional_iter)),
         combiner_lhs_(std::move(combiner_lhs)) {
     iter_vars_.reserve(n_block_iters_);
@@ -817,39 +945,40 @@ class WriteBackBlockCreator : public BaseBlockCreator {
     }
   }
 
-  void CreateReductionUpdate(bool has_reduce_iter) final {
-    wb_lhs_ = Downcast<BufferLoad>(Substitute(combiner_lhs_, var_map_));
-    wb_rhs_ =
-        Downcast<BufferLoad>(Substitute(BufferLoad(rf_buffer_, rf_buf_access_indices_), var_map_));
-    new_reduction_update_ =
-        BufferStore(old_reduction_update_->buffer, (*reducer_.get())({wb_lhs_}, {wb_rhs_})[0],
-                    old_reduction_update_->indices);
-    new_reduction_update_ = Downcast<BufferStore>(Substitute(new_reduction_update_, var_map_));
+  void PreProcess() final {
+    for (int i = 0; i < n_buffers_; ++i) {
+      PrimExpr rhs = BufferLoad(rf_buffers_[i], rf_buf_access_indices_);
+      update_buffers_.push_back(old_reduction_updates_[i]->buffer);
+      update_indices_.push_back(old_reduction_updates_[i]->indices);
+      update_lhs_.push_back(Substitute(combiner_lhs_[i], var_map_));
+      update_rhs_.push_back(Substitute(std::move(rhs), var_map_));
+    }
   }
 
   void CreateReadWriteRegions() final {
-    read_regions_.push_back(CreateRegion(wb_rhs_));
-    write_regions_.push_back(CreateRegion(wb_lhs_));
+    CreateRegion(update_rhs_, true);
+    CreateRegion(update_lhs_, false);
   }
 
-  static BufferRegion CreateRegion(const BufferLoad& load) {
-    Array<Range> region;
-    region.reserve(load->indices.size());
-    for (const PrimExpr& index : load->indices) {
-      region.push_back(Range::FromMinExtent(index, 1));
+  void CreateRegion(const Array<PrimExpr>& buf_loads, bool is_read) {
+    Array<BufferRegion>& buf_regions = is_read ? read_regions_ : write_regions_;
+    for (const PrimExpr& expr : buf_loads) {
+      const auto* buf_load = expr.as<BufferLoadNode>();
+      ICHECK(buf_load != nullptr);
+      Array<Range> region;
+      region.reserve(buf_load->indices.size());
+      for (const PrimExpr& index : buf_load->indices) {
+        region.push_back(Range::FromMinExtent(index, 1));
+      }
+      buf_regions.push_back(BufferRegion(buf_load->buffer, std::move(region)));
     }
-    return BufferRegion(load->buffer, std::move(region));
   }
 
  private:
   /*! \brief The new created additional block iter of the rfactor block */
   IterVar rf_additional_iter_;
-  /*! \brief The lhs of the combiner in the reduction update of the old block */
-  PrimExpr combiner_lhs_;
-  /*! \brief The lhs of the combiner of the write-back block */
-  BufferLoad wb_lhs_;
-  /*! \brief The rhs of the combiner of the write-back block */
-  BufferLoad wb_rhs_;
+  /*! \brief The LHS values of the reduction in the old block */
+  Array<PrimExpr> combiner_lhs_;
 };
 
 /*!
@@ -924,14 +1053,16 @@ class BlockReplacer : public StmtMutator {
                        BlockRealize wb_block_realize, BlockRealize old_block_realize, For rf_loop,
                        std::unordered_set<const VarNode*> reduce_loop_vars,
                        std::unordered_map<const VarNode*, For> loop_vars2loop,
-                       const Buffer& rf_buffer) {
+                       const Array<Buffer>& rf_buffers) {
     BlockReplacer replacer(std::move(rf_body), std::move(outermost_loop),
                            std::move(wb_block_realize), std::move(old_block_realize),
                            std::move(rf_loop), std::move(reduce_loop_vars),
                            std::move(loop_vars2loop));
     Block new_scope_root = Downcast<Block>(replacer(std::move(scope_root_block)));
     BlockNode* p = new_scope_root.CopyOnWrite();
-    p->alloc_buffers.push_back(rf_buffer);
+    for (const Buffer& rf_buffer : rf_buffers) {
+      p->alloc_buffers.push_back(rf_buffer);
+    }
     return new_scope_root;
   }
 
@@ -1040,13 +1171,19 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax
   // commutative reducer, combiner lhs and combiner rhs from the reduction identity and the
   // reduction combiner. The lhs will be used when constructing the write-back block, and the rhs
   // will be used when constructing the rfactor block.
-  auto [init, update] = GetBufferStoresFromReductionBlock(self, block);
-  auto [reducer, combiner_lhs, combiner_rhs] =
-      GetReducerAndCombinerLhsRhs(self, init->value, update);
+  Array<PrimExpr> init_values{nullptr};
+  Array<BufferStore> updates{nullptr};
+  CommReducer reducer{nullptr};
+  Array<PrimExpr> combiner_lhs{nullptr};
+  Array<PrimExpr> combiner_rhs{nullptr};
+  std::tie(init_values, updates) = GetInitValuesAndUpdatesFromReductionBlock(self, block);
+  std::tie(reducer, combiner_lhs, combiner_rhs) =
+      GetReducerAndCombinerLhsRhs(self, init_values, updates);
 
   // Step 6. Check whether `factor_axis` is in a correct range, and convert it to non-negative if it
   // is negative.
-  factor_axis = FactorAxisOutOfRangeError::CheckAndUpdate(self->mod, update->buffer, factor_axis);
+  factor_axis =
+      FactorAxisOutOfRangeError::CheckAndUpdate(self->mod, updates[0]->buffer, factor_axis);
 
   // *****************************************************
   // *                 IR Manipulation                   *
@@ -1056,17 +1193,17 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax
 
   // Step 1. Create the intermediate buffer (a.k.a. rfactor buffer), which has an additional
   // dimension that specified by `factor_axis` and `rf_loop`.
-  Buffer rf_buffer = CreateRFactorBuffer(update->buffer, factor_axis, rf_loop);
+  Array<Buffer> rf_buffers = CreateRFactorBuffers(updates, factor_axis, rf_loop);
 
   // Step 2. Create the rfactor block.
-  RFactorBlockCreator rf_block_creator(block_realize, GetRef<For>(rf_loop), update, reducer,
-                                       rf_buffer, loop_vars2loop, factor_axis,
+  RFactorBlockCreator rf_block_creator(block_realize, GetRef<For>(rf_loop), updates, reducer,
+                                       rf_buffers, loop_vars2loop, factor_axis,
                                        std::move(combiner_rhs));
   rf_block_creator.CreateBlock();
 
   // Step 3. Create the write-back block.
-  WriteBackBlockCreator wb_block_creator(block_realize, GetRef<For>(rf_loop), update, reducer,
-                                         rf_buffer, std::move(rf_block_creator.additional_iter_),
+  WriteBackBlockCreator wb_block_creator(block_realize, GetRef<For>(rf_loop), updates, reducer,
+                                         rf_buffers, std::move(rf_block_creator.additional_iter_),
                                          std::move(combiner_lhs),
                                          std::move(rf_block_creator.rf_buf_access_indices_));
   wb_block_creator.CreateBlock();
@@ -1082,7 +1219,7 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax
   Block old_scope_root_block = GetRef<Block>(scope_root->StmtAs<BlockNode>());
   Block new_scope_root_block = BlockReplacer::Replace(
       old_scope_root_block, rf_body, loops[0], wb_block_creator.new_block_realize_, block_realize,
-      GetRef<For>(rf_loop), reduce_loop_vars, loop_vars2loop, rf_buffer);
+      GetRef<For>(rf_loop), reduce_loop_vars, loop_vars2loop, rf_buffers);
   self->Replace(
       scope_root, new_scope_root_block,
       {{old_scope_root_block, new_scope_root_block}, {block, wb_block_creator.new_block_}});
@@ -1157,8 +1294,9 @@ TVM_REGISTER_INST_KIND_TRAITS(DecomposeReductionTraits);
 /******** FFI ********/
 
 TVM_REGISTER_GLOBAL("tir.schedule.RegisterReducer")
-    .set_body_typed([](PackedFunc combiner_getter, PackedFunc identity_getter) {
-      ReducerRegistry::RegisterReducer(std::move(combiner_getter), std::move(identity_getter));
+    .set_body_typed([](int n_buffers, PackedFunc combiner_getter, PackedFunc identity_getter) {
+      ReducerRegistry::RegisterReducer(n_buffers, std::move(combiner_getter),
+                                       std::move(identity_getter));
     });
 
 }  // namespace tir
diff --git a/src/tir/transforms/lower_cross_thread_reduction.cc b/src/tir/transforms/lower_cross_thread_reduction.cc
index 04b025b5f9ae..c10555e74d07 100644
--- a/src/tir/transforms/lower_cross_thread_reduction.cc
+++ b/src/tir/transforms/lower_cross_thread_reduction.cc
@@ -111,70 +111,66 @@ bool IsReductionBlock(const BlockRealize& realize, const Map<Var, Range>& loop_r
 }
 
 /*!
- * \brief Create an intermediate buffer with specified name and data type
- * \param name The specified name
- * \param dtype The specified data type
- * \return The created buffer
+ * \brief Create intermediate buffers according to the input buffers and buffer kind
+ * \param reduction_buffers The old reduction buffers which provide the buffer names and data types
+ * \param is_cross_thread_buffer A boolean indicating whether to create buffers for the cross-thread
+ * computation results or not, which is used for determine the buffer name prefix
+ * \return The created buffers
  */
-Buffer MakeScratchpad(String name, const DataType& dtype) {
-  return Buffer(/*ptr=*/Var(name, PointerType(PrimType(dtype), "local")),
-                /*dtype=*/dtype,
-                /*shape=*/{Integer(1)},
-                /*strides=*/{Integer(1)},
-                /*elem_offset=*/PrimExpr{nullptr},
-                /*name=*/name,
-                /*data_alignment=*/0,
-                /*offset_factor=*/0,
-                /*buffer_type=*/kDefault);
-}
-
-/*!
- * \brief Remove the BufferRegions whose buffer is the input buffer
- * \param buffer_regions The array of BufferRegions to be
- * \param buffer_to_remove The specified buffer
- * \return The mutated array of BufferRegions, no longer containing BufferRegion of the input buffer
- */
-Array<BufferRegion> RemoveBufferFromBufferRegions(const Array<BufferRegion>& buffer_regions,
-                                                  const Buffer& buffer_to_remove) {
-  Array<BufferRegion> res;
-  res.reserve(buffer_regions.size());
-  for (const BufferRegion& buffer_region : buffer_regions) {
-    if (!buffer_region->buffer.same_as(buffer_to_remove)) {
-      res.push_back(buffer_region);
-    }
+Array<Buffer> MakeScratchpads(const Array<Buffer>& reduction_buffers, bool is_cross_thread_buffer) {
+  Array<Buffer> new_buffers;
+  new_buffers.reserve(reduction_buffers.size());
+  for (const Buffer& buffer : reduction_buffers) {
+    String name = is_cross_thread_buffer ? "cross" : "in";
+    name = name + "_thread_" + buffer->name;
+    new_buffers.push_back(Buffer(/*ptr=*/Var(name, PointerType(PrimType(buffer->dtype), "local")),
+                                 /*dtype=*/buffer->dtype,
+                                 /*shape=*/{Integer(1)},
+                                 /*strides=*/{Integer(1)},
+                                 /*elem_offset=*/PrimExpr{nullptr},
+                                 /*name=*/name,
+                                 /*data_alignment=*/0,
+                                 /*offset_factor=*/0,
+                                 /*buffer_type=*/kDefault));
   }
-  return res;
+  return new_buffers;
 }
 
 /*!
- * \brief Substitute a given source buffer with a given target buffer in statements or expressions
+ * \brief Substitute given source buffers with given target buffers respectively in the input
+ * statement
  */
 class BufferReplacer : private StmtExprMutator {
  public:
-  static Stmt Run(Buffer src_buffer, Buffer tgt_buffer, Stmt stmt) {
-    return BufferReplacer(src_buffer, tgt_buffer)(std::move(stmt));
+  static Stmt Run(Array<Buffer> src_buffers, Array<Buffer> tgt_buffers, Stmt stmt) {
+    Map<Buffer, Buffer> buffer_map;
+    ICHECK_EQ(src_buffers.size(), tgt_buffers.size());
+    int n_buffers = src_buffers.size();
+    for (int i = 0; i < n_buffers; ++i) {
+      buffer_map.Set(src_buffers[i], tgt_buffers[i]);
+    }
+    return BufferReplacer(buffer_map)(std::move(stmt));
   }
 
  private:
-  explicit BufferReplacer(Buffer src_buffer, Buffer tgt_buffer)
-      : src_buffer_(std::move(src_buffer)), tgt_buffer_(std::move(tgt_buffer)) {}
+  explicit BufferReplacer(Map<Buffer, Buffer> buffer_map) : buffer_map_(std::move(buffer_map)) {}
 
   PrimExpr VisitExpr_(const BufferLoadNode* load) final {
-    return load->buffer.same_as(src_buffer_) ? BufferLoad(tgt_buffer_, {0})
-                                             : GetRef<BufferLoad>(load);
+    auto it = buffer_map_.find(load->buffer);
+    return it != buffer_map_.end() ? BufferLoad((*it).second, {0}) : GetRef<BufferLoad>(load);
   }
 
   Stmt VisitStmt_(const BufferStoreNode* store) final {
-    if (store->buffer.same_as(src_buffer_)) {
+    auto it = buffer_map_.find(store->buffer);
+    if (it != buffer_map_.end()) {
       PrimExpr value = StmtExprMutator::VisitExpr(store->value);
-      return BufferStore(tgt_buffer_, value, {0});
+      return BufferStore((*it).second, std::move(value), {0});
     } else {
       return StmtMutator::VisitStmt_(store);
     }
   }
 
-  Buffer src_buffer_;
-  Buffer tgt_buffer_;
+  Map<Buffer, Buffer> buffer_map_;
 };
 
 /*!
@@ -231,25 +227,40 @@ class InThreadReducerMaker : private StmtMutator {
 
 /*!
  * \brief Create the lowered allreduce block transformed from the input reduction block
- * \param reduction_block The input reduction block
- * \param it_buffer The buffer to store in-thread reduction results
- * \param ct_buffer The buffer to store cross-thread reduction results
+ * \param realize The block-realize which contains the old reduction block
+ * \param it_buffers The buffers to store in-thread reduction results
+ * \param ct_buffers The buffers to store cross-thread reduction results
+ * \param wb_buffers The buffers to store the final reduction results
+ * \param old_wb_indices The indices used to access the write-back buffers when storing the final
+ * reduction results into the write-back buffers
  * \param reducer The reduction function
- * \param combiner_rhs The RHS of the combiner
+ * \param combiner_rhs The RHS values of the combiner
  * \param reduction_loops The reduction loops
  */
-Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optional<Buffer>& it_buffer,
-                             const Buffer& ct_buffer, const CommReducer& reducer,
-                             const PrimExpr& combiner_rhs,
+Stmt TransformReductionBlock(const BlockRealizeNode* realize,            //
+                             const Optional<Array<Buffer>>& it_buffers,  //
+                             const Array<Buffer>& ct_buffers,            //
+                             const Array<Buffer>& wb_buffers,            //
+                             const Array<PrimExpr>& old_wb_indices,      //
+                             const CommReducer& reducer,                 //
+                             const Array<PrimExpr>& combiner_rhs,        //
                              const std::vector<const ForNode*>& reduction_loops) {
+  int n_buffers = wb_buffers.size();
   const BlockNode* block = realize->block.get();
-  Buffer wb_buffer = block->writes[0]->buffer;
-  Array<Range> wb_region = block->writes[0]->region;
 
-  BufferRegion ct_buffer_region(ct_buffer, {Range::FromMinExtent(0, 1)});
-  Optional<BufferRegion> it_buffer_region = NullOpt;
-  if (it_buffer.defined()) {
-    it_buffer_region = BufferRegion(it_buffer.value(), {Range::FromMinExtent(0, 1)});
+  auto f_create_buffer_regions = [](Array<Buffer> buffers) {
+    Array<BufferRegion> regions;
+    regions.reserve(buffers.size());
+    for (const Buffer& buffer : buffers) {
+      regions.push_back(BufferRegion(buffer, {Range::FromMinExtent(0, 1)}));
+    }
+    return regions;
+  };
+
+  Array<BufferRegion> ct_buffer_regions = f_create_buffer_regions(ct_buffers);
+  Optional<Array<BufferRegion>> it_buffer_regions = NullOpt;
+  if (it_buffers.defined()) {
+    it_buffer_regions = f_create_buffer_regions(it_buffers.value());
   }
   // In total, the block is transformed into at most 4 statements
   // - Stmt 1: initialize the buffer for in-thread reduction
@@ -259,35 +270,35 @@ Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optional<Buf
   Array<Stmt> stmts;
   stmts.reserve(4);
   // Stmt 1: initialize the buffer for in-thread reduction
-  if (it_buffer.defined()) {
-    BufferStore init = Downcast<BufferStore>(block->init);
-    stmts.push_back(BlockRealize(
-        /*iter_values=*/{},
-        /*predicate=*/const_true(),
-        /*block=*/
-        Block(/*iter_vars=*/{},
-              /*reads=*/{},
-              /*writes=*/{it_buffer_region.value()},
-              /*name_hint=*/block->name_hint + "_in_thread_init",
-              /*body=*/
-              BufferStore(/*buffer=*/it_buffer.value(),
-                          /*value=*/init->value,
-                          /*indices=*/{Integer(0)}))));
+  if (it_buffers.defined()) {
+    Array<Stmt> inits;
+    inits.reserve(n_buffers);
+    for (int i = 0; i < n_buffers; ++i) {
+      inits.push_back(
+          BufferStore(it_buffers.value()[i], reducer->identity_element[i], {Integer(0)}));
+    }
+    stmts.push_back(BlockRealize(/*iter_values=*/{},
+                                 /*predicate=*/const_true(),
+                                 /*block=*/
+                                 Block(/*iter_vars=*/{},
+                                       /*reads=*/{},
+                                       /*writes=*/it_buffer_regions.value(),
+                                       /*name_hint=*/block->name_hint + "_in_thread_init",
+                                       /*body=*/n_buffers > 1 ? SeqStmt(inits) : inits[0])));
   }
   // Stmt 2: do in-thread reduction
   {
     Optional<BlockRealize> new_realize = NullOpt;
     // If need to generate in-thread reduction,
-    // then replace `wb_buffer` with `it_buffer` accordingly in given BlockRealize
+    // then replace `wb_buffers` with `it_buffers` accordingly in given BlockRealize
     // otherwise, directly remove given BlockRealize
-    if (it_buffer.defined()) {
+    if (it_buffers.defined()) {
       ObjectPtr<BlockNode> new_block = make_object<BlockNode>(*block);
-      new_block->reads = RemoveBufferFromBufferRegions(std::move(new_block->reads), wb_buffer);
-      new_block->reads.push_back(it_buffer_region.value());
-      new_block->writes = {it_buffer_region.value()};
+      new_block->reads = std::move(new_block->reads);
+      new_block->writes = it_buffer_regions.value();
       new_block->name_hint = new_block->name_hint + "_in_thread";
       new_block->body =
-          BufferReplacer::Run(wb_buffer, it_buffer.value(), std::move(new_block->body));
+          BufferReplacer::Run(wb_buffers, it_buffers.value(), std::move(new_block->body));
       new_block->init = NullOpt;
       ObjectPtr<BlockRealizeNode> n = make_object<BlockRealizeNode>(*realize);
       n->block = Block(new_block);
@@ -303,19 +314,23 @@ Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optional<Buf
     // Step 3.1. Create the parameters to the intrinsic
     Array<PrimExpr> parameters;
     parameters.reserve(reduction_loops.size() + 4);
-    // 1-st argument: size
-    parameters.push_back(make_const(DataType::UInt(32), 1));
-    // 2-nd argument: source
-    if (it_buffer.defined()) {
-      parameters.push_back(BufferLoad(it_buffer.value(), {Integer(0)}));
+    // 1-st argument: number of buffers
+    parameters.push_back(make_const(DataType::UInt(32), n_buffers));
+    // Next `n_buffers` arguments: sources
+    if (it_buffers.defined()) {
+      for (int i = 0; i < n_buffers; ++i) {
+        parameters.push_back(BufferLoad(it_buffers.value()[i], {Integer(0)}));
+      }
     } else {
-      parameters.push_back(combiner_rhs);
+      parameters.insert(parameters.end(), combiner_rhs.begin(), combiner_rhs.end());
     }
-    // 3-rd argument: predicate
+    // Next argument: predicate
     parameters.push_back(const_true());
-    // 4-th argument: destination
-    parameters.push_back(BufferLoad(ct_buffer, {0}));
-    // next arguments: all the reduction threads
+    // Next `n_buffers` arguments: destinations
+    for (int i = 0; i < n_buffers; ++i) {
+      parameters.push_back(BufferLoad(ct_buffers[i], {0}));
+    }
+    // Next arguments: all the reduction threads
     for (const ForNode* reduction_loop : reduction_loops) {
       if (reduction_loop->thread_binding.defined()) {
         parameters.push_back(reduction_loop->loop_var);
@@ -325,14 +340,14 @@ Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optional<Buf
     Array<IterVar> iter_vars{nullptr};
     Array<PrimExpr> bindings{nullptr};
     Array<BufferRegion> reads{nullptr};
-    if (it_buffer.defined()) {
+    if (it_buffers.defined()) {
       iter_vars = Array<IterVar>{};
       bindings = Array<PrimExpr>{};
-      reads = {it_buffer_region.value()};
+      reads = it_buffer_regions.value();
     } else {
       iter_vars = block->iter_vars;
       bindings = realize->iter_values;
-      reads = {RemoveBufferFromBufferRegions(block->reads, wb_buffer)};
+      reads = block->reads;
     }
     stmts.push_back(BlockRealize(
         /*iter_values=*/std::move(bindings),
@@ -340,7 +355,7 @@ Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optional<Buf
         /*block=*/
         Block(/*iter_vars=*/std::move(iter_vars),
               /*reads=*/std::move(reads),
-              /*writes=*/{ct_buffer_region},
+              /*writes=*/ct_buffer_regions,
               /*name_hint=*/block->name_hint + "_cross_thread",
               /*body=*/
               AttrStmt(/*node=*/reducer,
@@ -376,21 +391,31 @@ Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optional<Buf
         var_map.Set(iter_var->var, new_iter_var->var);
       }
     }
-    BufferStore update = Downcast<BufferStore>(block->body);
-    update = Downcast<BufferStore>(Substitute(std::move(update), var_map));
+    Array<Stmt> wb_updates;
+    Array<BufferRegion> wb_regions;
+    wb_updates.reserve(n_buffers);
+    wb_regions.reserve(n_buffers);
+    int n_dim = static_cast<int>(old_wb_indices.size());
+    Array<Range> region = Substitute(block->writes[0]->region, var_map);
+    Array<PrimExpr> wb_indices;
+    wb_indices.reserve(n_dim);
+    for (int d = 0; d < n_dim; ++d) {
+      wb_indices.push_back(Substitute(old_wb_indices[d], var_map));
+    }
+    for (int i = 0; i < n_buffers; ++i) {
+      wb_updates.push_back(
+          BufferStore(wb_buffers[i], BufferLoad(ct_buffers[i], {Integer(0)}), wb_indices));
+      wb_regions.push_back(BufferRegion(wb_buffers[i], region));
+    }
     stmts.push_back(BlockRealize(
         /*iter_values=*/std::move(bindings),
         /*predicate=*/const_true(),
         /*block=*/
-        Block(
-            /*iter_vars=*/std::move(iter_vars),
-            /*reads=*/{std::move(ct_buffer_region)},
-            /*writes=*/{BufferRegion(wb_buffer, Substitute(wb_region, var_map))},
-            /*name_hint=*/block->name_hint + "_write_back",
-            /*body=*/
-            BufferStore(/*buffer=*/wb_buffer,
-                        /*value=*/BufferLoad(ct_buffer, {Integer(0)}),
-                        /*indices=*/update->indices))));
+        Block(/*iter_vars=*/std::move(iter_vars),
+              /*reads=*/std::move(ct_buffer_regions),
+              /*writes=*/std::move(wb_regions),
+              /*name_hint=*/block->name_hint + "_write_back",
+              /*body=*/n_buffers > 1 ? SeqStmt(wb_updates) : wb_updates[0])));
   }
   // Final step: Wrap all the above four statements with the reduction loops bound to threadIdx
   Stmt new_stmt = SeqStmt::Flatten(std::move(stmts));
@@ -447,18 +472,23 @@ class CrossThreadReductionTransformer : public StmtMutator {
     return need ? reduction_loops : std::vector<const ForNode*>{};
   }
 
-  // Given that the input block needs cross-thread reduction, check if cross-thread reduction can
-  // be applied to the block (i.e., the block satisfies all necessary conditions of cross-thread
-  // reduction).
-  std::tuple<int, CommReducer, PrimExpr> CheckCanApplyCrossThreadReduction(
-      const BlockNode* block, const std::vector<const ForNode*>& reduction_loops) const {
-    // Condition 1. The block being applied cross-thread reduction should write to single buffer.
-    CHECK_EQ(block->writes.size(), 1)
-        << "ValueError: Cross-thread reduction requires the block to only "
-           "write to single buffer. However, the block "
-        << block->name_hint << " writes to " << block->writes.size() << " buffer(s).";
-
-    // Condition 2. All the reduction-related loops should be the deepest among all statements
+  /*!
+   * \brief Given that the input block needs cross-thread reduction, check if cross-thread reduction
+   * can be applied to the block (i.e., the block satisfies all necessary conditions of cross-thread
+   * reduction)
+   * \param block The block to be checked
+   * \param reduction_loops The reduction loops above the block
+   * \return A tuple consisting of five elements:
+   *  - an integer which indicates the number of reduction loops that are bound to thread axes,
+   *  - the detected commutative reducer of the reduction,
+   *  - the reduction buffers which store the reduction results,
+   *  - the RHS values of the reduction updates,
+   *  - the indices which is used to access the reduction buffers when storing the reduction results
+   */
+  std::tuple<int, CommReducer, Array<Buffer>, Array<PrimExpr>, Array<PrimExpr>>
+  CheckCanApplyCrossThreadReduction(const BlockNode* block,
+                                    const std::vector<const ForNode*>& reduction_loops) const {
+    // Condition 1. All the reduction-related loops should be the deepest among all statements
     // outside the block (ignoring SeqStmt here).
     int n_deepest_reduction_loops = 0;
     for (auto rit = statement_stack_.rbegin() + 1; rit != statement_stack_.rend(); ++rit) {
@@ -480,7 +510,7 @@ class CrossThreadReductionTransformer : public StmtMutator {
         << " needs cross-thread reduction, while the reduction-related loops outside of it are not "
            "the deepest statements, which violates the condition.";
 
-    // Condition 3. All the reduction-related loops that are bound to thread axes should only be
+    // Condition 2. All the reduction-related loops that are bound to thread axes should only be
     // bound to `threadIdx.x/y/z`.
     int n_bound_reduction_loops = 0;
     for (const ForNode* reduction_loop : reduction_loops) {
@@ -493,16 +523,26 @@ class CrossThreadReductionTransformer : public StmtMutator {
       }
     }
 
-    // Condition 4. Get the `init` identity and the `update` combiner of the reduction. They should
-    // both be BufferStores with the same buffer and indices;
-    // Extract the commutative reducer, combiner lhs and combiner rhs from the reduction identity
-    // and the reduction combiner.
-    auto [init, update] = GetBufferStoresFromReductionBlock(NullOpt, GetRef<Block>(block));
-    auto [reducer, combiner_lhs, combiner_rhs] =
-        GetReducerAndCombinerLhsRhs(NullOpt, init->value, update);
-    (void)combiner_lhs;  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
+    // Condition 3. Get the identity values of the block init and the BufferStore block combiner
+    // updates of the reduction. Extract the commutative reducer, combiner lhs and combiner rhs from
+    // the reduction identities and the reduction combiner.
+    Array<PrimExpr> init_values{nullptr};
+    Array<BufferStore> updates{nullptr};
+    CommReducer reducer{nullptr};
+    Array<PrimExpr> combiner_lhs{nullptr};
+    Array<PrimExpr> combiner_rhs{nullptr};
+    std::tie(init_values, updates) =
+        GetInitValuesAndUpdatesFromReductionBlock(NullOpt, GetRef<Block>(block));
+    std::tie(reducer, combiner_lhs, combiner_rhs) =
+        GetReducerAndCombinerLhsRhs(NullOpt, init_values, updates);
+
+    Array<Buffer> reduction_buffers;
+    reduction_buffers.reserve(updates.size());
+    for (const BufferStore& buf_store : updates) {
+      reduction_buffers.push_back(buf_store->buffer);
+    }
 
-    // Condition 5. The block should be the last block under the first reduction-related loop.
+    // Condition 4. The block should be the last block under the first reduction-related loop.
     bool visit = false;
     PreOrderVisit(GetRef<For>(reduction_loops[0]), [block, &visit](const ObjectRef& obj) {
       if (const auto* realize = obj.as<BlockRealizeNode>()) {
@@ -515,7 +555,11 @@ class CrossThreadReductionTransformer : public StmtMutator {
       }
       return true;
     });
-    return std::make_tuple(n_bound_reduction_loops, reducer, combiner_rhs);
+    return std::make_tuple(n_bound_reduction_loops,       //
+                           std::move(reducer),            //
+                           std::move(reduction_buffers),  //
+                           std::move(combiner_rhs),       //
+                           updates[0]->indices);
   }
 
   Stmt VisitStmt(const Stmt& stmt) final {
@@ -570,10 +614,14 @@ class CrossThreadReductionTransformer : public StmtMutator {
     if (reduction_loops.empty()) {
       return StmtMutator::VisitStmt_(realize);
     }
-    ++reduction_id_;
     // Step 2. Check whether cross-thread reduction can be applied. If no, throw an exception on
     // which condition the block violates.
-    auto [n_bound_reduction_loops, reducer, combiner_rhs] =
+    int n_bound_reduction_loops = 0;
+    CommReducer reducer{nullptr};
+    Array<Buffer> reduction_buffers{nullptr};
+    Array<PrimExpr> combiner_rhs{nullptr};
+    Array<PrimExpr> wb_indices{nullptr};
+    std::tie(n_bound_reduction_loops, reducer, reduction_buffers, combiner_rhs, wb_indices) =
         CheckCanApplyCrossThreadReduction(block, reduction_loops);
     // Step 3. Before doing the cross-thread reduction, in-thread reduction is needed when
     //  - not all the reduction-related loops are bound to thread axes, or
@@ -581,31 +629,30 @@ class CrossThreadReductionTransformer : public StmtMutator {
     bool need_in_thread_reduction =
         n_bound_reduction_loops < static_cast<int>(reduction_loops.size()) ||
         !is_one(realize->predicate);
-    // Step 4. Create intermediate buffers, storing them in `ct_buffer` and
-    // `it_buffer`. Let the scope block allocate these new buffers.
-    std::vector<Buffer>& new_buffers = block2new_buffers_[block_stack_.back()];
-    DataType dtype = block->writes[0]->buffer->dtype;
-    Buffer ct_buffer = MakeScratchpad("cross_thread_" + std::to_string(reduction_id_), dtype);
-    new_buffers.push_back(ct_buffer);
-    Optional<Buffer> it_buffer = NullOpt;
+    // Step 4. Create intermediate buffers, storing them in `ct_buffers` and
+    // `it_buffers`. Let the scope block allocate these new buffers.
+    Array<Buffer>& new_buffers = block2new_buffers_[block_stack_.back()];
+    Array<Buffer> ct_buffers = MakeScratchpads(reduction_buffers, /*is_cross_thread_buffer=*/true);
+    new_buffers.insert(new_buffers.end(), ct_buffers.begin(), ct_buffers.end());
+    Optional<Array<Buffer>> it_buffers = NullOpt;
     if (need_in_thread_reduction) {
-      it_buffer = MakeScratchpad("in_thread_" + std::to_string(reduction_id_), dtype);
-      new_buffers.push_back(it_buffer.value());
+      it_buffers = MakeScratchpads(reduction_buffers, /*is_cross_thread_buffer=*/false);
+      new_buffers.insert(new_buffers.end(), it_buffers.value().begin(), it_buffers.value().end());
     }
     // Step 5. Transform.
-    loop2new_stmt_[reduction_loops[0]] = TransformReductionBlock(
-        realize, it_buffer, ct_buffer, reducer, combiner_rhs, reduction_loops);
+    loop2new_stmt_[reduction_loops[0]] =
+        TransformReductionBlock(realize, it_buffers, ct_buffers, reduction_buffers, wb_indices,
+                                reducer, combiner_rhs, reduction_loops);
     // Step 6. Return an empty statement, because the transformation result will be inserted when
     // returning to the first reduction-related loop.
     return Stmt{nullptr};
   }
 
  private:
-  int reduction_id_ = -1;
   std::vector<const StmtNode*> statement_stack_;
   std::vector<const ForNode*> loop_stack_;
   std::vector<const BlockNode*> block_stack_;
-  std::unordered_map<const BlockNode*, std::vector<Buffer>> block2new_buffers_;
+  std::unordered_map<const BlockNode*, Array<Buffer>> block2new_buffers_;
   std::unordered_map<const ForNode*, Stmt> loop2new_stmt_;
   Map<Var, Range> loop_range_map_;
   arith::Analyzer analyzer_;
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py b/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py
index 17f42654fcf7..70b49944ba0f 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py
@@ -119,5 +119,171 @@ def cpu_matmul_2(
     )
 
 
+def test_cpu_argmax():
+    @T.prim_func
+    def argmax(
+        idx: T.Buffer[(128, 128), "int32"],
+        val: T.Buffer[(128, 128), "float32"],
+        argmax_v0: T.Buffer[(128,), "int32"],
+        argmax_v1: T.Buffer[(128,), "float32"],
+    ) -> None:
+        for i0, i1 in T.grid(128, 128):
+            with T.block("argmax"):
+                i = T.axis.spatial(128, i0)
+                k = T.axis.reduce(128, i1)
+                T.reads(idx[i, k], val[i, k])
+                T.writes(argmax_v0[i], argmax_v1[i])
+                with T.init():
+                    argmax_v0[i] = -1
+                    argmax_v1[i] = T.min_value("float32")
+                v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+                v_argmax_v1: T.float32 = T.Select(
+                    argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]
+                )
+                argmax_v0[i] = v_argmax_v0
+                argmax_v1[i] = v_argmax_v1
+
+    @T.prim_func
+    def argmax_0(
+        idx: T.Buffer[(128, 128), "int32"],
+        val: T.Buffer[(128, 128), "float32"],
+        argmax_v0: T.Buffer[128, "int32"],
+        argmax_v1: T.Buffer[128, "float32"],
+    ) -> None:
+        for i0, i1 in T.grid(128, 128):
+            with T.block("argmax"):
+                i, k = T.axis.remap("SR", [i0, i1])
+                T.reads(idx[i, k], val[i, k])
+                T.writes(argmax_v0[i], argmax_v1[i])
+                with T.init():
+                    argmax_v0[i] = -1
+                    argmax_v1[i] = T.float32(-3.4028234663852886e38)
+                v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+                v_argmax_v1: T.float32 = T.Select(
+                    argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]
+                )
+                argmax_v0[i] = v_argmax_v0
+                argmax_v1[i] = v_argmax_v1
+
+    @T.prim_func
+    def argmax_1(
+        idx: T.Buffer[(128, 128), "int32"],
+        val: T.Buffer[(128, 128), "float32"],
+        argmax_v0: T.Buffer[128, "int32"],
+        argmax_v1: T.Buffer[128, "float32"],
+    ) -> None:
+        argmax_v0_rf = T.alloc_buffer([128, 16], dtype="int32")
+        argmax_v1_rf = T.alloc_buffer([128, 16], dtype="float32")
+        for i0, i1_0, i1_1 in T.grid(128, 8, 16):
+            with T.block("argmax_rf"):
+                vi1_1, i, vi1_0 = T.axis.remap("SSR", [i1_1, i0, i1_0])
+                T.reads(idx[i, vi1_0 * 16 + vi1_1], val[i, vi1_0 * 16 + vi1_1])
+                T.writes(argmax_v0_rf[i, vi1_1], argmax_v1_rf[i, vi1_1])
+                with T.init():
+                    argmax_v0_rf[i, vi1_1] = -1
+                    argmax_v1_rf[i, vi1_1] = T.float32(-3.4028234663852886e38)
+                v_argmax_v0_rf: T.int32 = T.Select(
+                    argmax_v1_rf[i, vi1_1] >= val[i, vi1_0 * 16 + vi1_1],
+                    argmax_v0_rf[i, vi1_1],
+                    idx[i, vi1_0 * 16 + vi1_1],
+                )
+                v_argmax_v1_rf: T.float32 = T.Select(
+                    argmax_v1_rf[i, vi1_1] >= val[i, vi1_0 * 16 + vi1_1],
+                    argmax_v1_rf[i, vi1_1],
+                    val[i, vi1_0 * 16 + vi1_1],
+                )
+                argmax_v0_rf[i, vi1_1] = v_argmax_v0_rf
+                argmax_v1_rf[i, vi1_1] = v_argmax_v1_rf
+        for i0, i1_1 in T.grid(128, 16):
+            with T.block("argmax"):
+                vi1_1, i = T.axis.remap("RS", [i1_1, i0])
+                T.reads(argmax_v0_rf[i, vi1_1], argmax_v1_rf[i, vi1_1])
+                T.writes(argmax_v0[i], argmax_v1[i])
+                T.block_attr({"meta_schedule.random_compute_producer": 1})
+                with T.init():
+                    argmax_v0[i] = -1
+                    argmax_v1[i] = T.float32(-3.4028234663852886e38)
+                v_argmax_v0: T.int32 = T.Select(
+                    argmax_v1[i] >= argmax_v1_rf[i, vi1_1], argmax_v0[i], argmax_v0_rf[i, vi1_1]
+                )
+                v_argmax_v1: T.float32 = T.Select(
+                    argmax_v1[i] >= argmax_v1_rf[i, vi1_1], argmax_v1[i], argmax_v1_rf[i, vi1_1]
+                )
+                argmax_v0[i] = v_argmax_v0
+                argmax_v1[i] = v_argmax_v1
+
+    @T.prim_func
+    def argmax_2(
+        idx: T.Buffer[(128, 128), "int32"],
+        val: T.Buffer[(128, 128), "float32"],
+        argmax_v0: T.Buffer[128, "int32"],
+        argmax_v1: T.Buffer[128, "float32"],
+    ) -> None:
+        # body
+        # with T.block("root")
+        argmax_v0_rf = T.alloc_buffer([128, 8], dtype="int32")
+        argmax_v1_rf = T.alloc_buffer([128, 8], dtype="float32")
+        for i0, i1_0, i1_1 in T.grid(128, 8, 16):
+            with T.block("argmax_rf"):
+                vi1_0, i, vi1_1 = T.axis.remap("SSR", [i1_0, i0, i1_1])
+                T.reads(idx[i, vi1_0 * 16 + vi1_1], val[i, vi1_0 * 16 + vi1_1])
+                T.writes(argmax_v0_rf[i, vi1_0], argmax_v1_rf[i, vi1_0])
+                with T.init():
+                    argmax_v0_rf[i, vi1_0] = -1
+                    argmax_v1_rf[i, vi1_0] = T.float32(-3.4028234663852886e38)
+                v_argmax_v0_rf: T.int32 = T.Select(
+                    argmax_v1_rf[i, vi1_0] >= val[i, vi1_0 * 16 + vi1_1],
+                    argmax_v0_rf[i, vi1_0],
+                    idx[i, vi1_0 * 16 + vi1_1],
+                )
+                v_argmax_v1_rf: T.float32 = T.Select(
+                    argmax_v1_rf[i, vi1_0] >= val[i, vi1_0 * 16 + vi1_1],
+                    argmax_v1_rf[i, vi1_0],
+                    val[i, vi1_0 * 16 + vi1_1],
+                )
+                argmax_v0_rf[i, vi1_0] = v_argmax_v0_rf
+                argmax_v1_rf[i, vi1_0] = v_argmax_v1_rf
+        for i0, i1_0 in T.grid(128, 8):
+            with T.block("argmax"):
+                vi1_0, i = T.axis.remap("RS", [i1_0, i0])
+                T.reads(argmax_v0_rf[i, vi1_0], argmax_v1_rf[i, vi1_0])
+                T.writes(argmax_v0[i], argmax_v1[i])
+                T.block_attr({"meta_schedule.random_compute_producer": 1})
+                with T.init():
+                    argmax_v0[i] = -1
+                    argmax_v1[i] = T.float32(-3.4028234663852886e38)
+                v_argmax_v0: T.int32 = T.Select(
+                    argmax_v1[i] >= argmax_v1_rf[i, vi1_0], argmax_v0[i], argmax_v0_rf[i, vi1_0]
+                )
+                v_argmax_v1: T.float32 = T.Select(
+                    argmax_v1[i] >= argmax_v1_rf[i, vi1_0], argmax_v1[i], argmax_v1_rf[i, vi1_0]
+                )
+                argmax_v0[i] = v_argmax_v0
+                argmax_v1[i] = v_argmax_v1
+
+    decision_0 = []  # type: ignore
+    decision_1 = [
+        ("SamplePerfectTile", [8, 16]),
+    ]
+    decision_2 = [
+        ("SamplePerfectTile", [8, 16]),
+    ]
+    mod = argmax
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("llvm --num-cores=32"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[ms.schedule_rule.AddRFactor()],
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[argmax_0, argmax_1, argmax_2],
+        expected_decisions=[decision_0, decision_1, decision_2],
+    )
+
+
 if __name__ == "__main__":
     test_cpu_matmul()
+    test_cpu_argmax()
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
index a0ca47c09a34..ab8df6678b0b 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
@@ -572,7 +572,106 @@ def batch_norm_bmn_1(A: T.Buffer[(1, 512, 512), "float32"], D: T.Buffer[1, "floa
     )
 
 
+@T.prim_func
+def argmax(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1 in T.grid(128, 128):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v1
+
+
+def test_gpu_argmax():
+    @T.prim_func
+    def argmax_0(
+        idx: T.Buffer[(128, 128), "int32"],
+        val: T.Buffer[(128, 128), "float32"],
+        argmax_v0: T.Buffer[128, "int32"],
+        argmax_v1: T.Buffer[128, "float32"],
+    ) -> None:
+        # body
+        # with T.block("root")
+        for i0, i1 in T.grid(128, 128):
+            with T.block("argmax"):
+                i, k = T.axis.remap("SR", [i0, i1])
+                T.reads(idx[i, k], val[i, k])
+                T.writes(argmax_v0[i], argmax_v1[i])
+                with T.init():
+                    argmax_v0[i] = -1
+                    argmax_v1[i] = T.float32(-3.4028234663852886e38)
+                v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+                v_argmax_v1: T.float32 = T.Select(
+                    argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]
+                )
+                argmax_v0[i] = v_argmax_v0
+                argmax_v1[i] = v_argmax_v1
+
+    @T.prim_func
+    def argmax_1(
+        idx: T.Buffer[(128, 128), "int32"],
+        val: T.Buffer[(128, 128), "float32"],
+        argmax_v0: T.Buffer[128, "int32"],
+        argmax_v1: T.Buffer[128, "float32"],
+    ) -> None:
+        # body
+        # with T.block("root")
+        for i0, i1_0 in T.grid(128, 2):
+            for i1_1 in T.thread_binding(64, thread="threadIdx.x"):
+                with T.block("argmax"):
+                    i = T.axis.spatial(128, i0)
+                    k = T.axis.reduce(128, i1_0 * 64 + i1_1)
+                    T.reads(idx[i, k], val[i, k])
+                    T.writes(argmax_v0[i], argmax_v1[i])
+                    with T.init():
+                        argmax_v0[i] = -1
+                        argmax_v1[i] = T.float32(-3.4028234663852886e38)
+                    v_argmax_v0: T.int32 = T.Select(
+                        argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]
+                    )
+                    v_argmax_v1: T.float32 = T.Select(
+                        argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]
+                    )
+                    argmax_v0[i] = v_argmax_v0
+                    argmax_v1[i] = v_argmax_v1
+
+    decision_0 = []  # type: ignore
+    decision_1 = [
+        ("SampleCategorical", 4),
+    ]
+
+    mod = argmax
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3090", host="llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
+        ],
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[argmax_0, argmax_1],
+        expected_decisions=[decision_0, decision_1],
+    )
+
+
 if __name__ == "__main__":
     test_gpu_softmax_mn()
     test_gpu_softmax_mn_after_inline()
     test_gpu_batch_norm_bmn()
+    test_gpu_argmax()
diff --git a/tests/python/unittest/test_tir_schedule_rfactor.py b/tests/python/unittest/test_tir_schedule_rfactor.py
index 4078b1e89682..f6db79f3ed23 100644
--- a/tests/python/unittest/test_tir_schedule_rfactor.py
+++ b/tests/python/unittest/test_tir_schedule_rfactor.py
@@ -29,9 +29,9 @@
 
 @T.prim_func
 def transformed_matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, [128, 128])
-    B = T.match_buffer(b, [128, 128])
-    C = T.match_buffer(c, [128, 128])
+    A = T.match_buffer(a, [128, 128], dtype="float32")
+    B = T.match_buffer(b, [128, 128], dtype="float32")
+    C = T.match_buffer(c, [128, 128], dtype="float32")
 
     for i0, i1, i2_outer, i2_inner_outer, i2_inner_inner in T.grid(128, 128, 4, 8, 4):
         with T.block("update"):
@@ -44,12 +44,30 @@ def transformed_matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
             C[vi, vj] = C[vi, vj] + (A[vi, vk] * B[vj, vk])
 
 
+@T.prim_func
+def transformed_matmul_with_let(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, [128, 128], dtype="float32")
+    B = T.match_buffer(b, [128, 128], dtype="float32")
+    C = T.match_buffer(c, [128, 128], dtype="float32")
+
+    for i0, i1, i2_outer, i2_inner_outer, i2_inner_inner in T.grid(128, 128, 4, 8, 4):
+        with T.block("update"):
+            vi, vj = T.axis.remap("SS", [i0, i1])
+            vk = T.axis.R(128, i2_outer * 32 + i2_inner_outer * 4 + i2_inner_inner)
+            T.reads([A[vi, vk], B[vj, vk]])
+            T.writes([C[vi, vj]])
+            with T.init():
+                C[vi, vj] = 0.0
+            v_C: T.float32 = C[vi, vj] + (A[vi, vk] * B[vj, vk])
+            C[vi, vj] = v_C
+
+
 @T.prim_func
 def matmul_rfactor(a: T.handle, b: T.handle, c: T.handle) -> None:
-    A = T.match_buffer(a, [128, 128])
-    B = T.match_buffer(b, [128, 128])
-    C = T.match_buffer(c, [128, 128])
-    C_rf = T.alloc_buffer([4, 128, 128])
+    A = T.match_buffer(a, [128, 128], dtype="float32")
+    B = T.match_buffer(b, [128, 128], dtype="float32")
+    C = T.match_buffer(c, [128, 128], dtype="float32")
+    C_rf = T.alloc_buffer([4, 128, 128], dtype="float32")
 
     for i0, i1, i2_outer, i2_inner_outer, i2_inner_inner in T.grid(128, 128, 4, 8, 4):
         with T.block("update_rf"):
@@ -436,6 +454,20 @@ def rowsum_wrong_reduce_pattern2(a: T.handle, b: T.handle) -> None:
             B[vi] = B[vi] - A[vi, vk]
 
 
+@T.prim_func
+def rowsum_init_not_bufferstore(a: T.handle, b: T.handle) -> None:
+    A = T.match_buffer(a, (128, 128))
+    B = T.match_buffer(b, (128,))
+
+    for i, k in T.grid(128, 128):
+        with T.block("B"):
+            vi, vk = T.axis.remap("SR", [i, k])
+            with T.init():
+                v_init: T.float32 = T.float32(0)
+                B[vi] = v_init
+            B[vi] = B[vi] + A[vi, vk]
+
+
 @T.prim_func
 def rowsum_transformed(a: T.handle, b: T.handle) -> None:
     A = T.match_buffer(a, (128, 128))
@@ -654,6 +686,453 @@ def rfactor_spatial_only_after(
             B[ax0, ax1, ax2, ax3] = B[ax0, ax1, ax2, ax3] + B_rf[ax0, ax1, ax2, ax3, vi4]
 
 
+@T.prim_func
+def argmax_split(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmin_split_init_update_reordered(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmin_v0: T.Buffer[(128,), "int32"],
+    argmin_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmin"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmin_v0[i], argmin_v1[i])
+            with T.init():
+                argmin_v1[i] = T.max_value("float32")
+                argmin_v0[i] = -1
+            v_argmin_v0: T.int32 = T.Select(argmin_v1[i] <= val[i, k], argmin_v0[i], idx[i, k])
+            v_argmin_v1: T.float32 = T.Select(argmin_v1[i] <= val[i, k], argmin_v1[i], val[i, k])
+            argmin_v1[i] = v_argmin_v1
+            argmin_v0[i] = v_argmin_v0
+
+
+@T.prim_func
+def argmax_split_different_shape(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(256,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmax_split_different_indices(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i + 1] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i + 1] = v_argmax_v1
+
+
+@T.prim_func
+def argmax_split_init_not_bufferstore(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                v1_init: T.float32 = T.min_value("float32")
+                argmax_v1[i] = v1_init
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmax_split_init_buffer_duplicate(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v0[i] = -1
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmax_split_letstmt_fewer_than_init(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+
+
+@T.prim_func
+def argmax_split_letstmt_more_than_init(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmax_split_let_body_neither_seqstmt_nor_bufferstore(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            T.evaluate(0)
+
+
+@T.prim_func
+def argmax_split_init_update_inconsistent_bufferstore_number(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v1
+            argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmax_split_body_seq_not_bufferstore(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            T.evaluate(0)
+
+
+@T.prim_func
+def argmax_split_body_bufferstore_value_not_var(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmax_split_body_bufferstore_value_unbound_var(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    v_unbound = T.var("int32")
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_unbound
+            argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmax_split_one_let_var_used_multi_times(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "int32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "int32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("int32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v0
+
+
+@T.prim_func
+def argmax_split_body_one_buffer_updated_multi_times(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "int32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "int32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("int32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v0[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmax_split_init_buffer_not_match(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v0_1: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(128, i0)
+            k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v0_1[i], argmax_v1[i])
+            with T.init():
+                argmax_v0_1[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmax_split_rfactor(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    argmax_v0_rf = T.alloc_buffer([128, 32], dtype="int32")
+    argmax_v1_rf = T.alloc_buffer([128, 32], dtype="float32")
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmax_rf"):
+            vi1_1, i, vi1_0 = T.axis.remap("SSR", [i1_1, i0, i1_0])
+            T.reads(idx[i, vi1_0 * 32 + vi1_1], val[i, vi1_0 * 32 + vi1_1])
+            T.writes(argmax_v0_rf[i, vi1_1], argmax_v1_rf[i, vi1_1])
+            with T.init():
+                argmax_v0_rf[i, vi1_1] = -1
+                argmax_v1_rf[i, vi1_1] = T.min_value("float32")
+            v_argmax_v0_rf: T.int32 = T.Select(
+                argmax_v1_rf[i, vi1_1] >= val[i, vi1_0 * 32 + vi1_1],
+                argmax_v0_rf[i, vi1_1],
+                idx[i, vi1_0 * 32 + vi1_1],
+            )
+            v_argmax_v1_rf: T.float32 = T.Select(
+                argmax_v1_rf[i, vi1_1] >= val[i, vi1_0 * 32 + vi1_1],
+                argmax_v1_rf[i, vi1_1],
+                val[i, vi1_0 * 32 + vi1_1],
+            )
+            argmax_v0_rf[i, vi1_1] = v_argmax_v0_rf
+            argmax_v1_rf[i, vi1_1] = v_argmax_v1_rf
+    for i0, i1_1 in T.grid(128, 32):
+        with T.block("argmax"):
+            vi1_1, i = T.axis.remap("RS", [i1_1, i0])
+            T.reads(argmax_v0_rf[i, vi1_1], argmax_v1_rf[i, vi1_1])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(
+                argmax_v1[i] >= argmax_v1_rf[i, vi1_1], argmax_v0[i], argmax_v0_rf[i, vi1_1]
+            )
+            v_argmax_v1: T.float32 = T.Select(
+                argmax_v1[i] >= argmax_v1_rf[i, vi1_1], argmax_v1[i], argmax_v1_rf[i, vi1_1]
+            )
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def argmin_split_rfactor(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmin_v0: T.Buffer[(128,), "int32"],
+    argmin_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    argmin_v0_rf = T.alloc_buffer([128, 32], dtype="int32")
+    argmin_v1_rf = T.alloc_buffer([128, 32], dtype="float32")
+    for i0, i1_0, i1_1 in T.grid(128, 4, 32):
+        with T.block("argmin_rf"):
+            vi1_1, i, vi1_0 = T.axis.remap("SSR", [i1_1, i0, i1_0])
+            T.reads(idx[i, vi1_0 * 32 + vi1_1], val[i, vi1_0 * 32 + vi1_1])
+            T.writes(argmin_v0_rf[i, vi1_1], argmin_v1_rf[i, vi1_1])
+            with T.init():
+                argmin_v0_rf[i, vi1_1] = -1
+                argmin_v1_rf[i, vi1_1] = T.max_value("float32")
+            v_argmin_v0_rf: T.int32 = T.Select(
+                argmin_v1_rf[i, vi1_1] <= val[i, vi1_0 * 32 + vi1_1],
+                argmin_v0_rf[i, vi1_1],
+                idx[i, vi1_0 * 32 + vi1_1],
+            )
+            v_argmin_v1_rf: T.float32 = T.Select(
+                argmin_v1_rf[i, vi1_1] <= val[i, vi1_0 * 32 + vi1_1],
+                argmin_v1_rf[i, vi1_1],
+                val[i, vi1_0 * 32 + vi1_1],
+            )
+            argmin_v0_rf[i, vi1_1] = v_argmin_v0_rf
+            argmin_v1_rf[i, vi1_1] = v_argmin_v1_rf
+    for i0, i1_1 in T.grid(128, 32):
+        with T.block("argmin"):
+            vi1_1, i = T.axis.remap("RS", [i1_1, i0])
+            T.reads(argmin_v0_rf[i, vi1_1], argmin_v1_rf[i, vi1_1])
+            T.writes(argmin_v0[i], argmin_v1[i])
+            with T.init():
+                argmin_v0[i] = -1
+                argmin_v1[i] = T.max_value("float32")
+            v_argmin_v0: T.int32 = T.Select(
+                argmin_v1[i] <= argmin_v1_rf[i, vi1_1], argmin_v0[i], argmin_v0_rf[i, vi1_1]
+            )
+            v_argmin_v1: T.float32 = T.Select(
+                argmin_v1[i] <= argmin_v1_rf[i, vi1_1], argmin_v1[i], argmin_v1_rf[i, vi1_1]
+            )
+            argmin_v0[i] = v_argmin_v0
+            argmin_v1[i] = v_argmin_v1
+
+
 # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
 
 
@@ -668,6 +1147,17 @@ def test_reduction_rfactor_matmul():
     verify_trace_roundtrip(s, mod=transformed_matmul)
 
 
+def test_reduction_rfactor_matmul_with_let():
+    s = tir.Schedule(transformed_matmul_with_let, debug_mask="all")
+    update = s.get_block("update")
+    _, _, _, _, kii = s.get_loops(update)
+    rf_block = s.rfactor(kii, 0)
+    tvm.ir.assert_structural_equal(s.mod["main"], matmul_rfactor)
+    assert s.get(rf_block).same_as(s.get(s.get_block("update_rf")))
+    assert s.get(update).same_as(s.get(s.get_block("update")))
+    verify_trace_roundtrip(s, mod=transformed_matmul_with_let)
+
+
 def test_reduction_rfactor_square_sum():
     s = tir.Schedule(square_sum, debug_mask="all")
     C = s.get_block("C")
@@ -773,6 +1263,13 @@ def test_reduction_rfactor_wrong_reduce_pattern2():
         s.rfactor(k, 0)
 
 
+def test_reduction_rfactor_init_not_bufferstore():
+    s = tir.Schedule(rowsum_init_not_bufferstore, debug_mask="all")
+    _, k = s.get_loops(s.get_block("B"))
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(k, 0)
+
+
 def test_reduction_rfactor_wrong_loops1():
     s = tir.Schedule(rowsum, debug_mask="all")
     i, _ = s.get_loops(s.get_block("B"))
@@ -852,10 +1349,146 @@ def test_reduction_rfactor_spatial_only():
     s = tir.Schedule(rfactor_spatial_only, debug_mask="all")
     block = s.get_block(name="acc", func_name="main")
     _, _, _, _, loop, _ = s.get_loops(block)
-    s.rfactor(loop=loop, factor_axis=4)
+    rf_block = s.rfactor(loop=loop, factor_axis=4)
     tvm.ir.assert_structural_equal(s.mod["main"], rfactor_spatial_only_after)
+    assert s.get(rf_block).same_as(s.get(s.get_block("acc_rf")))
+    assert s.get(block).same_as(s.get(s.get_block("acc")))
     verify_trace_roundtrip(s, mod=rfactor_spatial_only)
 
 
+def test_reduction_rfactor_argmax():
+    s = tir.Schedule(argmax_split, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    rf_block = s.rfactor(ki, 1)
+    tvm.ir.assert_structural_equal(s.mod["main"], argmax_split_rfactor)
+    assert s.get(rf_block).same_as(s.get(s.get_block("argmax_rf")))
+    assert s.get(argmax).same_as(s.get(s.get_block("argmax")))
+    verify_trace_roundtrip(s, mod=argmax_split)
+
+
+def test_reduction_rfactor_argmin_init_update_reordeded():
+    s = tir.Schedule(argmin_split_init_update_reordered, debug_mask="all")
+    argmin = s.get_block("argmin")
+    _, _, ki = s.get_loops(argmin)
+    rf_block = s.rfactor(ki, 1)
+    tvm.ir.assert_structural_equal(s.mod["main"], argmin_split_rfactor)
+    assert s.get(rf_block).same_as(s.get(s.get_block("argmin_rf")))
+    assert s.get(argmin).same_as(s.get(s.get_block("argmin")))
+    verify_trace_roundtrip(s, mod=argmin_split_init_update_reordered)
+
+
+def test_reduction_rfactor_argmax_reduction_buffer_different_shape():
+    s = tir.Schedule(argmax_split_different_shape, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_different_access_indices():
+    s = tir.Schedule(argmax_split_different_indices, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_init_not_bufferstore():
+    s = tir.Schedule(argmax_split_init_not_bufferstore, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_init_buffer_duplicate():
+    s = tir.Schedule(argmax_split_init_buffer_duplicate, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_letstmt_fewer_than_init():
+    s = tir.Schedule(argmax_split_letstmt_fewer_than_init, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_letstmt_more_than_init():
+    s = tir.Schedule(argmax_split_letstmt_more_than_init, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_let_body_neither_seqstmt_nor_bufferstore():
+    s = tir.Schedule(argmax_split_let_body_neither_seqstmt_nor_bufferstore, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_init_update_inconsistent_bufferstore_number():
+    s = tir.Schedule(argmax_split_init_update_inconsistent_bufferstore_number, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_body_seq_not_bufferstore():
+    s = tir.Schedule(argmax_split_body_seq_not_bufferstore, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_body_bufferstore_value_not_var():
+    s = tir.Schedule(argmax_split_body_bufferstore_value_not_var, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_body_bufferstore_value_unbound_var():
+    s = tir.Schedule(argmax_split_body_bufferstore_value_unbound_var, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_one_let_var_used_multi_times():
+    s = tir.Schedule(argmax_split_one_let_var_used_multi_times, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_body_one_buffer_updated_multi_times():
+    s = tir.Schedule(argmax_split_body_one_buffer_updated_multi_times, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
+def test_reduction_rfactor_argmax_init_buffer_not_match():
+    s = tir.Schedule(argmax_split_init_buffer_not_match, debug_mask="all")
+    argmax = s.get_block("argmax")
+    _, _, ki = s.get_loops(argmax)
+    with pytest.raises(tvm.tir.ScheduleError):
+        s.rfactor(ki, 1)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
index 9b5937ac6efd..ff1353d2265e 100644
--- a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
+++ b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+# pylint: disable=missing-function-docstring,missing-module-docstring
 import sys
 
 import pytest
@@ -22,6 +23,8 @@
 from tvm import te
 from tvm.script import tir as T
 
+# pylint: disable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
+
 
 def _check(original, transformed):
     mod = tvm.IRModule.from_expr(original)
@@ -44,7 +47,7 @@ def loop_split(a: T.handle, b: T.handle) -> None:
             with T.block("B"):
                 vi = T.axis.S(128, i)
                 vk = T.axis.R(128, ko * 32 + ki)
-                T.reads([B[vi], A[vi, vk]])
+                T.reads([A[vi, vk]])
                 T.writes([B[vi]])
                 with T.init():
                     B[vi] = T.float32(0)
@@ -67,7 +70,7 @@ def lowered_loop_split(a: T.handle, b: T.handle) -> None:
                 with T.block("B_normal_reduction"):
                     vi = T.axis.S(128, i)
                     vk = T.axis.R(128, ko * 32 + ki)
-                    T.reads([A[vi, vk], normal_reduce_temp0[0]])
+                    T.reads([A[vi, vk]])
                     T.writes([normal_reduce_temp0[0]])
                     normal_reduce_temp0[0] = normal_reduce_temp0[0] + A[vi, vk]
             with T.block("B_cross_thread_reduction"):
@@ -103,7 +106,7 @@ def no_normal_reduction(a: T.handle, b: T.handle) -> None:
         for k in T.thread_binding(0, 128, thread="threadIdx.x"):
             with T.block("B"):
                 vi, vk = T.axis.remap("SR", [i, k])
-                T.reads([B[vi], A[vi, vk]])
+                T.reads([A[vi, vk]])
                 T.writes([B[vi]])
                 with T.init():
                     B[vi] = T.float32(0)
@@ -148,7 +151,7 @@ def two_bound_loops(a: T.handle, b: T.handle) -> None:
                 with T.block("B"):
                     vi = T.axis.spatial(128, i)
                     vk = T.axis.reduce(128, ko * 32 + ki)
-                    T.reads([B[vi], A[vi, vk]])
+                    T.reads([A[vi, vk]])
                     T.writes([B[vi]])
                     with T.init():
                         B[vi] = T.float32(0)
@@ -196,7 +199,7 @@ def multiple_blocks_under_reduction_loop(a: T.handle, b: T.handle) -> None:
                 with T.block("B_rf"):
                     vk0 = T.axis.spatial(16, k0o * 4 + k0i0)
                     vi, vk1 = T.axis.remap("SR", [i, k1])
-                    T.reads([B_rf_local[vk0, vi], A[vi, vk0, vk1]])
+                    T.reads([A[vi, vk0, vk1]])
                     T.writes([B_rf_local[vk0, vi]])
                     with T.init():
                         B_rf_local[vk0, vi] = T.float32(0)
@@ -205,7 +208,7 @@ def multiple_blocks_under_reduction_loop(a: T.handle, b: T.handle) -> None:
                 with T.block("B"):
                     vk0 = T.axis.reduce(16, k0o * 4 + k0i1)
                     vi = T.axis.spatial(16, i)
-                    T.reads([B[vi], B_rf_local[vk0, vi]])
+                    T.reads([B_rf_local[vk0, vi]])
                     T.writes([B[vi]])
                     with T.init():
                         B[vi] = T.float32(0)
@@ -229,7 +232,7 @@ def lowered_multiple_blocks_under_reduction_loop(a: T.handle, b: T.handle) -> No
                 with T.block("B_rf"):
                     vk0 = T.axis.spatial(16, k0o * 4 + k0i0)
                     vi, vk1 = T.axis.remap("SR", [i, k1])
-                    T.reads([B_rf_local[vk0, vi], A[vi, vk0, vk1]])
+                    T.reads([A[vi, vk0, vk1]])
                     T.writes([B_rf_local[vk0, vi]])
                     with T.init():
                         B_rf_local[vk0, vi] = T.float32(0)
@@ -238,7 +241,7 @@ def lowered_multiple_blocks_under_reduction_loop(a: T.handle, b: T.handle) -> No
                 with T.block("B_normal_reduction"):
                     vk0 = T.axis.reduce(16, k0o * 4 + k0i1)
                     vi = T.axis.spatial(16, i)
-                    T.reads([B_rf_local[vk0, vi], normal_reduce_temp0[0]])
+                    T.reads([B_rf_local[vk0, vi]])
                     T.writes([normal_reduce_temp0[0]])
                     normal_reduce_temp0[0] = normal_reduce_temp0[0] + B_rf_local[vk0, vi]
             with T.block("B_cross_thread_reduction"):
@@ -276,7 +279,7 @@ def with_block_predicate(a: T.handle, b: T.handle) -> None:
                 vi = T.axis.spatial(128, i)
                 vk = T.axis.reduce(120, ko * 32 + ki)
                 T.where(ko * 32 + ki < 120)
-                T.reads([B[vi], A[vi, vk]])
+                T.reads([A[vi, vk]])
                 T.writes([B[vi]])
                 with T.init():
                     B[vi] = T.float32(0)
@@ -300,7 +303,7 @@ def lowered_with_block_predicate(a: T.handle, b: T.handle) -> None:
                     vi = T.axis.spatial(128, i)
                     vk = T.axis.reduce(120, ko * 32 + ki)
                     T.where(ko * 32 + ki < 120)
-                    T.reads([A[vi, vk], normal_reduce_temp0[0]])
+                    T.reads([A[vi, vk]])
                     T.writes([normal_reduce_temp0[0]])
                     normal_reduce_temp0[0] = normal_reduce_temp0[0] + A[vi, vk]
             with T.block("B_cross_thread_reduction"):
@@ -341,7 +344,7 @@ def single_reduction_loop_with_block_predicate(
                     i0_1 = T.axis.spatial(256, i0)
                     k = T.axis.reduce(256, ax1_1)
                     T.where(ax1_0 * 512 + ax1_1 < 256)
-                    T.reads(T_softmax_maxelem_shared[i0_1], A[i0_1, k])
+                    T.reads(A[i0_1, k])
                     T.writes(T_softmax_maxelem_shared[i0_1])
                     with T.init():
                         T_softmax_maxelem_shared[i0_1] = T.float32(-3.4028234663852886e38)
@@ -354,9 +357,7 @@ def single_reduction_loop_with_block_predicate(
                     i0_2 = T.axis.spatial(256, i0)
                     k = T.axis.reduce(256, ax1_1)
                     T.where(ax1_0 * 512 + ax1_1 < 256)
-                    T.reads(
-                        T_softmax_expsum_shared[i0_2], A[i0_2, k], T_softmax_maxelem_shared[i0_2]
-                    )
+                    T.reads(A[i0_2, k], T_softmax_maxelem_shared[i0_2])
                     T.writes(T_softmax_expsum_shared[i0_2])
                     with T.init():
                         T_softmax_expsum_shared[i0_2] = T.float32(0)
@@ -401,7 +402,7 @@ def lowered_single_reduction_loop_with_block_predicate(
                     i0_1 = T.axis.spatial(256, i0)
                     k = T.axis.reduce(256, ax1_1)
                     T.where(ax1_0 * 512 + ax1_1 < 256)
-                    T.reads(A[i0_1, k], in_thread_0[0])
+                    T.reads(A[i0_1, k])
                     T.writes(in_thread_0[0])
                     in_thread_0[0] = T.max(in_thread_0[0], A[i0_1, k])
                 with T.block("T_softmax_maxelem_cross_thread"):
@@ -439,7 +440,7 @@ def lowered_single_reduction_loop_with_block_predicate(
                     i0_3 = T.axis.spatial(256, i0)
                     k = T.axis.reduce(256, ax1_1)
                     T.where(ax1_0 * 512 + ax1_1 < 256)
-                    T.reads(A[i0_3, k], T_softmax_maxelem_shared[i0_3], in_thread_1[0])
+                    T.reads(A[i0_3, k], T_softmax_maxelem_shared[i0_3])
                     T.writes(in_thread_1[0])
                     in_thread_1[0] = in_thread_1[0] + T.exp(
                         A[i0_3, k] - T_softmax_maxelem_shared[i0_3], dtype="float32"
@@ -492,7 +493,7 @@ def reducer_max(a: T.handle, b: T.handle) -> None:
         for k in T.thread_binding(0, 128, thread="threadIdx.x"):
             with T.block("B"):
                 vi, vk = T.axis.remap("SR", [i, k])
-                T.reads([B[vi], A[vi, vk]])
+                T.reads([A[vi, vk]])
                 T.writes([B[vi]])
                 with T.init():
                     B[vi] = T.min_value("float32")
@@ -534,7 +535,7 @@ def zero_rank_buffer(a: T.handle, b: T.handle) -> None:
     for k in T.thread_binding(0, 128, thread="threadIdx.x"):
         with T.block("B"):
             vk = T.axis.reduce(128, k)
-            T.reads([B[()], A[vk]])
+            T.reads([A[vk]])
             T.writes([B[()]])
             with T.init():
                 B[()] = T.float32(0)
@@ -590,7 +591,7 @@ def reduction_loop_not_deepest(a: T.handle, b: T.handle) -> None:
         for i in T.serial(0, 128):
             with T.block("B"):
                 vi, vk = T.axis.remap("SR", [i, k])
-                T.reads([B[vi], A[vi, vk]])
+                T.reads([A[vi, vk]])
                 T.writes([B[vi]])
                 with T.init():
                     B[vi] = T.float32(0)
@@ -605,7 +606,7 @@ def reduction_loop_bound_to_blockidx(a: T.handle, b: T.handle) -> None:
         for k in T.thread_binding(0, 128, thread="blockIdx.x"):
             with T.block("B"):
                 vi, vk = T.axis.remap("SR", [i, k])
-                T.reads([B[vi], A[vi, vk]])
+                T.reads([A[vi, vk]])
                 T.writes([B[vi]])
                 with T.init():
                     B[vi] = T.float32(0)
@@ -620,7 +621,7 @@ def different_access_indices(a: T.handle, b: T.handle) -> None:
         for k in T.thread_binding(0, 128, thread="threadIdx.x"):
             with T.block("B"):
                 vi, vj, vk = T.axis.remap("SSR", [i, j, k])
-                T.reads([B[vi, vj], A[vi, vj, vk]])
+                T.reads([A[vi, vj, vk]])
                 T.writes(
                     [
                         B[
@@ -642,7 +643,7 @@ def invalid_reducer(a: T.handle, b: T.handle) -> None:
         for k in T.thread_binding(0, 128, thread="threadIdx.x"):
             with T.block("B"):
                 vi, vk = T.axis.remap("SR", [i, k])
-                T.reads([B[vi], A[vi, vk]])
+                T.reads([A[vi, vk]])
                 T.writes([B[vi]])
                 with T.init():
                     B[vi] = T.float32(0)
@@ -661,7 +662,7 @@ def softmax(var_A: T.handle, var_T_softmax_norm: T.handle) -> None:
                 with T.block("T_softmax_maxelem"):
                     i0_1 = T.axis.spatial(256, i0)
                     k = T.axis.reduce(256, ax0_0 * 32 + ax0_1)
-                    T.reads([T_softmax_maxelem_shared[i0_1], A[i0_1, k]])
+                    T.reads([A[i0_1, k]])
                     T.writes([T_softmax_maxelem_shared[i0_1]])
                     with T.init():
                         T_softmax_maxelem_shared[i0_1] = T.min_value("float32")
@@ -675,7 +676,6 @@ def softmax(var_A: T.handle, var_T_softmax_norm: T.handle) -> None:
                     k = T.axis.reduce(256, ax0_0 * 32 + ax0_1)
                     T.reads(
                         [
-                            T_softmax_expsum_shared[i0_2],
                             A[i0_2, k],
                             T_softmax_maxelem_shared[i0_2],
                         ]
@@ -729,7 +729,7 @@ def lowered_softmax(var_A: T.handle, var_T_softmax_norm: T.handle) -> None:
                 with T.block("T_softmax_maxelem_normal_reduction"):
                     i0_1 = T.axis.spatial(256, i0)
                     k = T.axis.reduce(256, ax0_0 * 32 + ax0_1)
-                    T.reads([A[i0_1, k], normal_reduce_temp0[0]])
+                    T.reads([A[i0_1, k]])
                     T.writes([normal_reduce_temp0[0]])
                     normal_reduce_temp0[0] = T.max(normal_reduce_temp0[0], A[i0_1, k])
             with T.block("T_softmax_maxelem_cross_thread_reduction"):
@@ -768,7 +768,6 @@ def lowered_softmax(var_A: T.handle, var_T_softmax_norm: T.handle) -> None:
                         [
                             A[i0_3, k],
                             T_softmax_maxelem_shared[i0_3],
-                            normal_reduce_temp1[0],
                         ]
                     )
                     T.writes([normal_reduce_temp1[0]])
@@ -821,6 +820,191 @@ def lowered_softmax(var_A: T.handle, var_T_softmax_norm: T.handle) -> None:
                     )
 
 
+@T.prim_func
+def argmax_split(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0 in T.grid(128, 4):
+        for i1_1 in T.thread_binding(32, thread="threadIdx.x"):
+            with T.block("argmax"):
+                i = T.axis.spatial(128, i0)
+                k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+                T.reads(idx[i, k], val[i, k])
+                T.writes(argmax_v0[i], argmax_v1[i])
+                with T.init():
+                    argmax_v0[i] = -1
+                    argmax_v1[i] = T.float32(-3.4028234663852886e38)
+                v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+                v_argmax_v1: T.float32 = T.Select(
+                    argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]
+                )
+                argmax_v0[i] = v_argmax_v0
+                argmax_v1[i] = v_argmax_v1
+
+
+@T.prim_func
+def lowered_argmax_split(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmax_v0: T.Buffer[(128,), "int32"],
+    argmax_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    cross_thread_argmax_v0 = T.alloc_buffer([1], dtype="int32", strides=[1], scope="local")
+    cross_thread_argmax_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local")
+    in_thread_argmax_v0 = T.alloc_buffer([1], dtype="int32", strides=[1], scope="local")
+    in_thread_argmax_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local")
+    for i0 in T.serial(128):
+        for i1_1 in T.thread_binding(32, thread="threadIdx.x"):
+            with T.block("argmax_in_thread_init"):
+                T.reads()
+                T.writes(in_thread_argmax_v0[0], in_thread_argmax_v1[0])
+                in_thread_argmax_v0[0] = -1
+                in_thread_argmax_v1[0] = T.float32(-3.4028234663852886e38)
+            for i1_0 in T.serial(4):
+                with T.block("argmax_in_thread"):
+                    i = T.axis.spatial(128, i0)
+                    k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+                    T.reads(idx[i, k], val[i, k])
+                    T.writes(in_thread_argmax_v0[0], in_thread_argmax_v1[0])
+                    v_argmax_v0: T.int32 = T.Select(
+                        in_thread_argmax_v1[0] >= val[i, k], in_thread_argmax_v0[0], idx[i, k]
+                    )
+                    v_argmax_v1: T.float32 = T.Select(
+                        in_thread_argmax_v1[0] >= val[i, k], in_thread_argmax_v1[0], val[i, k]
+                    )
+                    in_thread_argmax_v0[0] = v_argmax_v0
+                    in_thread_argmax_v1[0] = v_argmax_v1
+            with T.block("argmax_cross_thread"):
+                T.reads(in_thread_argmax_v0[0], in_thread_argmax_v1[0])
+                T.writes(cross_thread_argmax_v0[0], cross_thread_argmax_v1[0])
+                T.attr(
+                    T.comm_reducer(
+                        lambda x0, x1, y0, y1: (
+                            T.Select(x1 >= y1, x0, y0),
+                            T.Select(x1 >= y1, x1, y1),
+                        ),
+                        [-1, T.float32(-3.4028234663852886e38)],
+                    ),
+                    "reduce_scope",
+                    T.reinterpret(T.uint64(0), dtype="handle"),
+                )
+                T.evaluate(
+                    T.tvm_thread_allreduce(
+                        T.uint32(2),
+                        in_thread_argmax_v0[0],
+                        in_thread_argmax_v1[0],
+                        True,
+                        cross_thread_argmax_v0[0],
+                        cross_thread_argmax_v1[0],
+                        i1_1,
+                        dtype="handle",
+                    )
+                )
+            with T.block("argmax_write_back"):
+                i = T.axis.spatial(128, i0)
+                T.reads(cross_thread_argmax_v0[0], cross_thread_argmax_v1[0])
+                T.writes(argmax_v0[i], argmax_v1[i])
+                argmax_v0[i] = cross_thread_argmax_v0[0]
+                argmax_v1[i] = cross_thread_argmax_v1[0]
+
+
+@T.prim_func
+def argmin_split_init_update_reordered(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmin_v0: T.Buffer[(128,), "int32"],
+    argmin_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    for i0, i1_0 in T.grid(128, 4):
+        for i1_1 in T.thread_binding(32, thread="threadIdx.x"):
+            with T.block("argmin"):
+                i = T.axis.spatial(128, i0)
+                k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+                T.reads(idx[i, k], val[i, k])
+                T.writes(argmin_v0[i], argmin_v1[i])
+                with T.init():
+                    argmin_v1[i] = T.float32(3.4028234663852886e38)
+                    argmin_v0[i] = -1
+                v_argmin_v0: T.int32 = T.Select(argmin_v1[i] <= val[i, k], argmin_v0[i], idx[i, k])
+                v_argmin_v1: T.float32 = T.Select(
+                    argmin_v1[i] <= val[i, k], argmin_v1[i], val[i, k]
+                )
+                argmin_v1[i] = v_argmin_v1
+                argmin_v0[i] = v_argmin_v0
+
+
+@T.prim_func
+def lowered_argmin_split_init_update_reordered(
+    idx: T.Buffer[(128, 128), "int32"],
+    val: T.Buffer[(128, 128), "float32"],
+    argmin_v0: T.Buffer[(128,), "int32"],
+    argmin_v1: T.Buffer[(128,), "float32"],
+) -> None:
+    cross_thread_argmin_v0 = T.alloc_buffer([1], dtype="int32", strides=[1], scope="local")
+    cross_thread_argmin_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local")
+    in_thread_argmin_v0 = T.alloc_buffer([1], dtype="int32", strides=[1], scope="local")
+    in_thread_argmin_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local")
+    for i0 in T.serial(128):
+        for i1_1 in T.thread_binding(32, thread="threadIdx.x"):
+            with T.block("argmin_in_thread_init"):
+                T.reads()
+                T.writes(in_thread_argmin_v0[0], in_thread_argmin_v1[0])
+                in_thread_argmin_v0[0] = -1
+                in_thread_argmin_v1[0] = T.float32(3.4028234663852886e38)
+            for i1_0 in T.serial(4):
+                with T.block("argmin_in_thread"):
+                    i = T.axis.spatial(128, i0)
+                    k = T.axis.reduce(128, i1_0 * 32 + i1_1)
+                    T.reads(idx[i, k], val[i, k])
+                    T.writes(in_thread_argmin_v0[0], in_thread_argmin_v1[0])
+                    v_argmin_v0: T.int32 = T.Select(
+                        in_thread_argmin_v1[0] <= val[i, k], in_thread_argmin_v0[0], idx[i, k]
+                    )
+                    v_argmin_v1: T.float32 = T.Select(
+                        in_thread_argmin_v1[0] <= val[i, k], in_thread_argmin_v1[0], val[i, k]
+                    )
+                    in_thread_argmin_v1[0] = v_argmin_v1
+                    in_thread_argmin_v0[0] = v_argmin_v0
+            with T.block("argmin_cross_thread"):
+                T.reads(in_thread_argmin_v0[0], in_thread_argmin_v1[0])
+                T.writes(cross_thread_argmin_v0[0], cross_thread_argmin_v1[0])
+                T.attr(
+                    T.comm_reducer(
+                        lambda x0, x1, y0, y1: (
+                            T.Select(x1 <= y1, x0, y0),
+                            T.Select(x1 <= y1, x1, y1),
+                        ),
+                        [-1, T.float32(3.4028234663852886e38)],
+                    ),
+                    "reduce_scope",
+                    T.reinterpret(T.uint64(0), dtype="handle"),
+                )
+                T.evaluate(
+                    T.tvm_thread_allreduce(
+                        T.uint32(2),
+                        in_thread_argmin_v0[0],
+                        in_thread_argmin_v1[0],
+                        True,
+                        cross_thread_argmin_v0[0],
+                        cross_thread_argmin_v1[0],
+                        i1_1,
+                        dtype="handle",
+                    )
+                )
+            with T.block("argmin_write_back"):
+                i = T.axis.spatial(128, i0)
+                T.reads(cross_thread_argmin_v0[0], cross_thread_argmin_v1[0])
+                T.writes(argmin_v0[i], argmin_v1[i])
+                argmin_v0[i] = cross_thread_argmin_v0[0]
+                argmin_v1[i] = cross_thread_argmin_v1[0]
+
+
+# pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
+
+
 def test_loop_split():
     _check(loop_split, lowered_loop_split)
 
@@ -880,6 +1064,14 @@ def test_softmax():
     _check(softmax, lowered_softmax)
 
 
+def test_argmax_split():
+    _check(argmax_split, lowered_argmax_split)
+
+
+def test_argmin_split_init_update_reordered():
+    _check(argmin_split_init_update_reordered, lowered_argmin_split_init_update_reordered)
+
+
 def test_lower_te():
     a = te.placeholder((32, 2, 2))
     k1 = te.reduce_axis((0, 2), "k1")

From 296565aaf985adbc33ede565e9b167987138ddfc Mon Sep 17 00:00:00 2001
From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com>
Date: Wed, 14 Sep 2022 23:06:05 +0100
Subject: [PATCH 167/704] Fixed pylint issues after moving to venv in ci_lint
 docker (#12775)

Following change introduced installing python dependencies inside
virtual environments: https://github.com/apache/tvm/pull/12663
Previous to this fix, a different version of python was being
picked up that didn't catch the issues fixed in this commit.

Change-Id: Ie290d9474a799311e07d293fa1b8299326b11661
---
 python/tvm/relay/testing/darknet.py              | 2 +-
 tests/python/frontend/darknet/test_forward.py    | 2 +-
 tests/python/frontend/tensorflow/test_forward.py | 2 +-
 tests/python/frontend/tflite/test_forward.py     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/testing/darknet.py b/python/tvm/relay/testing/darknet.py
index e1345043c6bb..b1f364273e1b 100644
--- a/python/tvm/relay/testing/darknet.py
+++ b/python/tvm/relay/testing/darknet.py
@@ -23,9 +23,9 @@
 These are utility functions used for testing and tutorial file.
 """
 from __future__ import division
+from cffi import FFI
 import numpy as np
 import cv2
-from cffi import FFI
 
 
 def convert_image(image):
diff --git a/tests/python/frontend/darknet/test_forward.py b/tests/python/frontend/darknet/test_forward.py
index ffaa773fc1bd..5e6af51f3298 100644
--- a/tests/python/frontend/darknet/test_forward.py
+++ b/tests/python/frontend/darknet/test_forward.py
@@ -22,6 +22,7 @@
 All the required models and libraries will be downloaded from the internet
 by the script.
 """
+from cffi import FFI
 import numpy as np
 import tvm
 from tvm.contrib import graph_executor
@@ -31,7 +32,6 @@
 from tvm.relay.testing.darknet import __darknetffi__
 from tvm.relay.frontend.darknet import ACTIVATION
 from tvm import relay
-from cffi import FFI
 
 REPO_URL = "https://github.com/dmlc/web-data/blob/main/darknet/"
 DARKNET_LIB = "libdarknet2.0.so"
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index 8ed6d9108e5d..f3195f05d40f 100755
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -26,11 +26,11 @@
 import threading
 import platform
 import os.path
+from packaging import version as package_version
 import numpy as np
 import pytest
 
 from PIL import Image
-from packaging import version as package_version
 from tvm import relay
 from tvm.runtime.vm import VirtualMachine
 from tvm.relay.frontend.tensorflow import from_tensorflow
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 18045b8e8365..deaef72e1d7f 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -26,11 +26,11 @@
 
 import os
 import tempfile
+from packaging import version as package_version
 import pytest
 import numpy as np
 
 from PIL import Image
-from packaging import version as package_version
 
 import tvm
 import tvm.relay.testing.tf as tf_testing

From e5adb83d8e1cd3f5a9fe10946fb7b5b60bf54b94 Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@users.noreply.github.com>
Date: Wed, 14 Sep 2022 20:08:32 -0300
Subject: [PATCH 168/704] [microTVM][Zephyr] Fix PLL freq. in overlay for
 nucleo_l4r5zi board (#12756)

* [microTVM][Zephyr] Fix PLL freq. in overlay for nucleo_l4r5zi board

Commit 1d32c400f ("Add project overlay to overwrite device tree configs")
added overlay for setting 'clock-frequency' property of node 'rcc' to
120 MHz, however to effectively change the PLL frequency that drivers
the core it's necessary also to overlay the attributes for the 'pll'
node. This commit does that.

Signed-off-by: Gustavo Romero <gustavo.romero@linaro.org>

* Remove div-p and div-q properties from overlay

Remove div-p and div-q properties from the overlay file since values for
these properties will be inherited from the 'pll' that is overlaid.

Since currently microTVM does not use any subsystem which relies on
clocks associated to either P or Q params, these params can be left
unchanged for now.

Signed-off-by: Gustavo Romero <gustavo.romero@linaro.org>
---
 .../app-overlay/nucleo_l4r5zi.overlay         | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay b/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay
index 360e0753d4f5..532efe50d397 100644
--- a/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay
+++ b/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay
@@ -21,3 +21,25 @@
 &rcc {
 	clock-frequency = <DT_FREQ_M(120)>;
 };
+
+/*
+   Set PLL accordingly to freq. reported by 'clock-frequency' property, where:
+
+   VCO freq = PLL clock input freq (HSI: 16 MHz) * N / M and
+   Core freq = VCO freq / R.
+
+   Hence:
+
+   VCO freq = 16 * 30 / 2 = 240 MHz and
+   Core freq = 240 MHz / 2 = 120 MHz
+
+   Prop. 'div-p' and 'div-q' will be inherited from the overlaid 'pll' node.
+*/
+
+&pll {
+	div-m = <2>;
+	mul-n = <30>;
+	div-r = <2>;
+	clocks = <&clk_hsi>;
+	status = "okay";
+};

From 397cf8781eba7a2bcc35e832130801c1d1419c43 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 15 Sep 2022 06:39:20 -0500
Subject: [PATCH 169/704] [Arith][Refactor] Return Optional<PrimExpr> from
 TryConstFold (#12784)

Prior to this commit, the templated `TryConstFold` utility returned an
undefined `PrimExpr` to represent a failure to perform constant
folding.  This commit makes this explicit by returning
`Optional<PrimExpr>` instead.
---
 src/arith/canonical_simplify.cc | 21 +++-----
 src/arith/const_fold.h          | 91 +++++++++++++++++----------------
 src/arith/int_set.cc            | 10 ++--
 src/arith/iter_affine_map.cc    | 15 ++----
 src/arith/pattern_match.h       |  3 +-
 src/arith/rewrite_simplify.cc   | 42 +++++----------
 src/tir/op/op.cc                | 57 +++++++--------------
 7 files changed, 99 insertions(+), 140 deletions(-)

diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc
index 9f45317cba11..f5d2667aa64e 100644
--- a/src/arith/canonical_simplify.cc
+++ b/src/arith/canonical_simplify.cc
@@ -716,8 +716,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const AddNode* op) {
   PrimExpr b = this->CanonicalMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<Add>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Add>(a, b)) return const_res.value();
 
   // canonical form simplification.
   SumExpr ret = ToSumExpr(std::move(a));
@@ -741,8 +740,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const SubNode* op) {
   PrimExpr b = this->CanonicalMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<Sub>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Sub>(a, b)) return const_res.value();
 
   // canonical form simplification.
   SumExpr ret = ToSumExpr(std::move(a));
@@ -766,8 +764,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const MulNode* op) {
   PrimExpr b = this->CanonicalMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<Mul>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Mul>(a, b)) return const_res.value();
 
   // x * c
   if (a.as<IntImmNode>()) {
@@ -870,8 +867,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const DivNode* op) {
   PrimExpr b = this->CanonicalMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<Div>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Div>(a, b)) return const_res.value();
   PVar<IntImm> c1;
   // x / c1
   if (c1.Match(b) && c1.Eval()->value > 0) {
@@ -928,8 +924,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
   PrimExpr b = this->CanonicalMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<FloorDiv>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<FloorDiv>(a, b)) return const_res.value();
   PVar<IntImm> c1;
   // x / c1
   if (c1.Match(b) && c1.Eval()->value > 0) {
@@ -1037,8 +1032,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ModNode* op) {
   PrimExpr b = this->CanonicalMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<Mod>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Mod>(a, b)) return const_res.value();
 
   PVar<IntImm> c1;
   // x % c1
@@ -1105,8 +1099,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
   PrimExpr b = this->CanonicalMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<FloorMod>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<FloorMod>(a, b)) return const_res.value();
 
   PVar<IntImm> c1;
   // x % c1
diff --git a/src/arith/const_fold.h b/src/arith/const_fold.h
index d0e09a1a7429..a7466cf38c85 100644
--- a/src/arith/const_fold.h
+++ b/src/arith/const_fold.h
@@ -24,6 +24,7 @@
 #ifndef TVM_ARITH_CONST_FOLD_H_
 #define TVM_ARITH_CONST_FOLD_H_
 
+#include <tvm/runtime/container/optional.h>
 #include <tvm/tir/expr.h>
 #include <tvm/tir/op.h>
 
@@ -44,10 +45,10 @@ namespace arith {
  * \tparam Op The operator type.
  *
  * \note a and b Must already matched data types with each other.
- * \return nullptr if constant fold fails, otherwise return folded result.
+ * \return NullOpt if constant fold fails, otherwise return folded result.
  */
 template <typename Op>
-inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b);
+inline Optional<PrimExpr> TryConstFold(PrimExpr a, PrimExpr b);
 
 /*!
  * \brief Try to run unary compute with constant folding.
@@ -56,10 +57,10 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b);
  * \tparam Op The operator type.
  *
  * \note a and b Must already matched data types with each other.
- * \return nullptr if constant fold fails, otherwise return folded result.
+ * \return NullOpt if constant fold fails, otherwise return folded result.
  */
 template <typename Op>
-inline PrimExpr TryConstFold(PrimExpr a);
+inline Optional<PrimExpr> TryConstFold(PrimExpr a);
 
 /*!
  * \brief Check whether type is used to represent index.
@@ -126,7 +127,7 @@ inline double GetFoldResultDoubleRepr(float x) {
 
 // specialization of constant folders.
 template <>
-inline PrimExpr TryConstFold<tir::Add>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::Add>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
     if (pa && pb) {
@@ -142,17 +143,17 @@ inline PrimExpr TryConstFold<tir::Add>(PrimExpr a, PrimExpr b) {
       } else if (rtype.bits() == 64) {
         return FloatImm(rtype, fa->value + fb->value);
       } else {
-        return PrimExpr();
+        return NullOpt;
       }
     }
     if (fa && fa->value == 0) return b;
     if (fb && fb->value == 0) return a;
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::Sub>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::Sub>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     ICHECK(!((pa && pa->dtype.is_uint() && pa->value == 0U) &&
              (pb && pb->dtype.is_uint() && pb->value > 0U)))
@@ -171,16 +172,16 @@ inline PrimExpr TryConstFold<tir::Sub>(PrimExpr a, PrimExpr b) {
       } else if (rtype.bits() == 64) {
         return FloatImm(rtype, fa->value - fb->value);
       } else {
-        return PrimExpr();
+        return NullOpt;
       }
     }
     if (fb && fb->value == 0) return a;
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::Mul>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::Mul>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
     if (pa && pb) {
@@ -202,7 +203,7 @@ inline PrimExpr TryConstFold<tir::Mul>(PrimExpr a, PrimExpr b) {
       } else if (rtype.bits() == 64) {
         return FloatImm(rtype, fa->value * fb->value);
       } else {
-        return PrimExpr();
+        return NullOpt;
       }
     }
     if (fa) {
@@ -214,11 +215,11 @@ inline PrimExpr TryConstFold<tir::Mul>(PrimExpr a, PrimExpr b) {
       if (fb->value == 0) return b;
     }
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::Div>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::Div>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
     if (pa && pb) {
@@ -242,7 +243,7 @@ inline PrimExpr TryConstFold<tir::Div>(PrimExpr a, PrimExpr b) {
       } else if (rtype.bits() == 64) {
         return FloatImm(rtype, fa->value / fb->value);
       } else {
-        return PrimExpr();
+        return NullOpt;
       }
     }
     if (fa && fa->value == 0) return a;
@@ -251,11 +252,11 @@ inline PrimExpr TryConstFold<tir::Div>(PrimExpr a, PrimExpr b) {
       ICHECK_NE(fb->value, 0) << "Divide by zero";
     }
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::Mod>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::Mod>(PrimExpr a, PrimExpr b) {
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
     if (pa && pb) {
@@ -271,11 +272,11 @@ inline PrimExpr TryConstFold<tir::Mod>(PrimExpr a, PrimExpr b) {
       ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::FloorDiv>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::FloorDiv>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
     if (pa && pb) {
@@ -297,7 +298,7 @@ inline PrimExpr TryConstFold<tir::FloorDiv>(PrimExpr a, PrimExpr b) {
       } else if (rtype.bits() == 64) {
         return FloatImm(rtype, std::floor(fa->value / fb->value));
       } else {
-        return PrimExpr();
+        return NullOpt;
       }
     }
     if (fa && fa->value == 0) return a;
@@ -306,11 +307,11 @@ inline PrimExpr TryConstFold<tir::FloorDiv>(PrimExpr a, PrimExpr b) {
       ICHECK_NE(fb->value, 0) << "Divide by zero";
     }
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::FloorMod>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::FloorMod>(PrimExpr a, PrimExpr b) {
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
     if (pa && pb) {
@@ -326,114 +327,114 @@ inline PrimExpr TryConstFold<tir::FloorMod>(PrimExpr a, PrimExpr b) {
       ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::Min>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::Min>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
     if (pa && pb) return IntImm(rtype, std::min(pa->value, pb->value));
     if (fa && fb) return FloatImm(rtype, std::min(fa->value, fb->value));
   });
   if (a.same_as(b)) return a;
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::Max>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::Max>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
     if (pa && pb) return IntImm(rtype, std::max(pa->value, pb->value));
     if (fa && fb) return FloatImm(rtype, std::max(fa->value, fb->value));
   });
   if (a.same_as(b)) return a;
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::GT>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::GT>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     if (pa && pb) return IntImm(DataType::UInt(1), pa->value > pb->value);
     if (fa && fb) return IntImm(DataType::UInt(1), fa->value > fb->value);
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::GE>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::GE>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     if (pa && pb) return IntImm(DataType::UInt(1), pa->value >= pb->value);
     if (fa && fb) return IntImm(DataType::UInt(1), fa->value >= fb->value);
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::LT>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::LT>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     if (pa && pb) return IntImm(DataType::UInt(1), pa->value < pb->value);
     if (fa && fb) return IntImm(DataType::UInt(1), fa->value < fb->value);
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::LE>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::LE>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     if (pa && pb) return IntImm(DataType::UInt(1), pa->value <= pb->value);
     if (fa && fb) return IntImm(DataType::UInt(1), fa->value <= fb->value);
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::EQ>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::EQ>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     if (pa && pb) return IntImm(DataType::UInt(1), pa->value == pb->value);
     if (fa && fb) return IntImm(DataType::UInt(1), fa->value == fb->value);
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::NE>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::NE>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
     if (pa && pb) return IntImm(DataType::UInt(1), pa->value != pb->value);
     if (fa && fb) return IntImm(DataType::UInt(1), fa->value != fb->value);
   });
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::And>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::And>(PrimExpr a, PrimExpr b) {
   const IntImmNode* pa = a.as<IntImmNode>();
   const IntImmNode* pb = b.as<IntImmNode>();
   if (pa && pa->value) return b;
   if (pa && !pa->value) return a;
   if (pb && pb->value) return a;
   if (pb && !pb->value) return b;
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::Or>(PrimExpr a, PrimExpr b) {
+inline Optional<PrimExpr> TryConstFold<tir::Or>(PrimExpr a, PrimExpr b) {
   const IntImmNode* pa = a.as<IntImmNode>();
   const IntImmNode* pb = b.as<IntImmNode>();
   if (pa && pa->value) return a;
   if (pa && !pa->value) return b;
   if (pb && pb->value) return b;
   if (pb && !pb->value) return a;
-  return PrimExpr();
+  return NullOpt;
 }
 
 template <>
-inline PrimExpr TryConstFold<tir::Not>(PrimExpr a) {
+inline Optional<PrimExpr> TryConstFold<tir::Not>(PrimExpr a) {
   const IntImmNode* pa = a.as<IntImmNode>();
   if (pa) {
     return IntImm(DataType::UInt(1), !(pa->value));
   }
-  return PrimExpr();
+  return NullOpt;
 }
 
 /*! \brief Helper namespace for symbolic value limits */
diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc
index e8e223ceca09..35b12bb35238 100644
--- a/src/arith/int_set.cc
+++ b/src/arith/int_set.cc
@@ -108,9 +108,13 @@ TVM_DECLARE_LOGICAL_OP(Not);
 template <typename Op>
 inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b, DataType dtype) {
   if (a->IsSinglePoint() && b->IsSinglePoint()) {
-    PrimExpr res = TryConstFold<Op>(a->min_value, b->min_value);
-    if (!res.defined()) res = Op(a->min_value, b->min_value);
-    return IntervalSet::SinglePoint(res);
+    PrimExpr expr;
+    if (auto res = TryConstFold<Op>(a->min_value, b->min_value)) {
+      expr = res.value();
+    } else {
+      expr = Op(a->min_value, b->min_value);
+    }
+    return IntervalSet::SinglePoint(expr);
   }
   if (is_logical_op<Op>::value) {
     return IntervalSet(make_const(dtype, 0), make_const(dtype, 1));
diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
index 83e2821c9800..182eada24d96 100644
--- a/src/arith/iter_affine_map.cc
+++ b/src/arith/iter_affine_map.cc
@@ -1205,8 +1205,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const AddNode* op) {
   PrimExpr b = this->DirectMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<Add>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Add>(a, b)) return const_res.value();
   // does not contain iter map.
   if (!a->IsInstance<IterMapExprNode>() && !b->IsInstance<IterMapExprNode>()) {
     if (op->a.same_as(a) && op->b.same_as(b)) {
@@ -1240,8 +1239,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const SubNode* op) {
   PrimExpr b = this->DirectMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<Sub>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Sub>(a, b)) return const_res.value();
 
   // does not contain iter map.
   if (!a->IsInstance<IterMapExprNode>() && !b->IsInstance<IterMapExprNode>()) {
@@ -1276,8 +1274,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) {
   PrimExpr b = this->DirectMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<Mul>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Mul>(a, b)) return const_res.value();
 
   // does not contain iter map.
   if (!a->IsInstance<IterMapExprNode>() && !b->IsInstance<IterMapExprNode>()) {
@@ -1572,8 +1569,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) {
   PrimExpr b = this->DirectMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<FloorDiv>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<FloorDiv>(a, b)) return const_res.value();
 
   // does not contain iter map.
   if (!a->IsInstance<IterMapExprNode>() && !b->IsInstance<IterMapExprNode>()) {
@@ -1657,8 +1653,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) {
   PrimExpr b = this->DirectMutate(op->b);
 
   // const folding
-  PrimExpr const_res = TryConstFold<FloorMod>(a, b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<FloorMod>(a, b)) return const_res.value();
 
   // does not contain iter map.
   if (!a->IsInstance<IterMapExprNode>() && !b->IsInstance<IterMapExprNode>()) {
diff --git a/src/arith/pattern_match.h b/src/arith/pattern_match.h
index 6abcc728fc8d..69f064e11931 100644
--- a/src/arith/pattern_match.h
+++ b/src/arith/pattern_match.h
@@ -330,8 +330,7 @@ class PBinaryExpr : public Pattern<PBinaryExpr<OpType, TA, TB>> {
   PrimExpr Eval() const {
     PrimExpr lhs = a_.Eval();
     PrimExpr rhs = b_.Eval();
-    PrimExpr ret = TryConstFold<OpType>(lhs, rhs);
-    if (ret.defined()) return ret;
+    if (auto ret = TryConstFold<OpType>(lhs, rhs)) return ret.value();
     return OpType(lhs, rhs);
   }
 
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index d7866fc1307b..e3e9db62d0bd 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -124,8 +124,7 @@ void RewriteSimplifier::Impl::Update(const Var& var, const PrimExpr& info, bool
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AddNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<AddNode>();
-  PrimExpr const_res = TryConstFold<Add>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Add>(op->a, op->b)) return const_res.value();
   // Pattern var to match any expression
   PVar<PrimExpr> x, y, z, b1, b2, s1, s2;
   // Pattern var match IntImm
@@ -258,8 +257,7 @@ std::function<void()> RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const SubNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<SubNode>();
-  PrimExpr const_res = TryConstFold<Sub>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Sub>(op->a, op->b)) return const_res.value();
   // Pattern var to match any expression
   PVar<PrimExpr> x, y, z, b1, b2, s1, s2;
   // Pattern var match IntImm
@@ -450,8 +448,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const SubNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MulNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<MulNode>();
-  PrimExpr const_res = TryConstFold<Mul>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Mul>(op->a, op->b)) return const_res.value();
   // Pattern var to match any expression
   PVar<PrimExpr> x, y, z, b1, b2, s1, s2;
   // Pattern var match IntImm
@@ -490,8 +487,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MulNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<DivNode>();
-  PrimExpr const_res = TryConstFold<Div>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Div>(op->a, op->b)) return const_res.value();
   // Pattern var to match any expression
   PVar<PrimExpr> x, y, z, b1;
   // Pattern var match IntImm
@@ -666,8 +662,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<ModNode>();
-  PrimExpr const_res = TryConstFold<Mod>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Mod>(op->a, op->b)) return const_res.value();
 
   // Pattern var to match any expression
   PVar<PrimExpr> x, y, z, b1;
@@ -748,8 +743,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<FloorDivNode>();
-  PrimExpr const_res = TryConstFold<FloorDiv>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<FloorDiv>(op->a, op->b)) return const_res.value();
   // Pattern var to match any expression
   PVar<PrimExpr> x, y, z, b1;
   // Pattern var match IntImm
@@ -895,8 +889,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<FloorModNode>();
-  PrimExpr const_res = TryConstFold<FloorMod>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<FloorMod>(op->a, op->b)) return const_res.value();
 
   // Pattern var to match any expression
   PVar<PrimExpr> x, y, z, b1;
@@ -977,8 +970,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MinNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<MinNode>();
-  PrimExpr const_res = TryConstFold<Min>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Min>(op->a, op->b)) return const_res.value();
 
   // Pattern var to match any expression
   PVar<PrimExpr> x, y, z, s1, s2;
@@ -1149,8 +1141,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MinNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MaxNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<MaxNode>();
-  PrimExpr const_res = TryConstFold<Max>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Max>(op->a, op->b)) return const_res.value();
 
   // Pattern var to match any expression
   PVar<PrimExpr> x, y, z, s1, s2;
@@ -1327,8 +1318,7 @@ Optional<PrimExpr> RewriteSimplifier::Impl::TryMatchLiteralConstraint(const Prim
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const EQNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<EQNode>();
-  PrimExpr const_res = TryConstFold<EQ>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<EQ>(op->a, op->b)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
 
   // Pattern var to match any expression
@@ -1376,8 +1366,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const GENode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LTNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<LTNode>();
-  PrimExpr const_res = TryConstFold<LT>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<LT>(op->a, op->b)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
 
   // Pattern var to match any expression
@@ -1508,8 +1497,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LTNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NotNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<NotNode>();
-  PrimExpr const_res = TryConstFold<Not>(op->a);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Not>(op->a)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
 
   // Pattern var to match any expression
@@ -1534,8 +1522,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NotNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<AndNode>();
-  PrimExpr const_res = TryConstFold<And>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<And>(op->a, op->b)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
 
   // Pattern var to match any expression
@@ -1574,8 +1561,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<OrNode>();
-  PrimExpr const_res = TryConstFold<Or>(op->a, op->b);
-  if (const_res.defined()) return const_res;
+  if (auto const_res = TryConstFold<Or>(op->a, op->b)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
 
   // Pattern var to match any expression
diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc
index b9e0c3c37068..509badbebb92 100644
--- a/src/tir/op/op.cc
+++ b/src/tir/op/op.cc
@@ -327,8 +327,7 @@ PrimExpr operator+(PrimExpr a, PrimExpr b) { return add(a, b); }
 
 PrimExpr add(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::Add>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::Add>(a, b)) return ret.value();
   return tir::Add(a, b, span);
 }
 
@@ -349,23 +348,20 @@ PrimExpr operator-(PrimExpr a, PrimExpr b) { return sub(a, b); }
 
 PrimExpr sub(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::Sub>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::Sub>(a, b)) return ret.value();
   return tir::Sub(a, b, span);
 }
 
 PrimExpr operator*(PrimExpr a, PrimExpr b) { return mul(a, b); }
 PrimExpr mul(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::Mul>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::Mul>(a, b)) return ret.value();
   return tir::Mul(a, b, span);
 }
 
 PrimExpr div(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::Div>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::Div>(a, b)) return ret.value();
   return tir::Div(a, b, span);
 }
 
@@ -377,8 +373,7 @@ PrimExpr truncdiv(PrimExpr a, PrimExpr b, Span span) {
 
 PrimExpr truncmod(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::Mod>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::Mod>(a, b)) return ret.value();
   return tir::Mod(a, b, span);
 }
 
@@ -397,8 +392,7 @@ PrimExpr floordiv(PrimExpr a, PrimExpr b, Span span) {
   ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
   ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::FloorDiv>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::FloorDiv>(a, b)) return ret.value();
   return tir::FloorDiv(a, b, span);
 }
 
@@ -406,8 +400,7 @@ PrimExpr ceildiv(PrimExpr a, PrimExpr b, Span span) {
   ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
   ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::FloorDiv>(a + b - 1, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::FloorDiv>(a + b - 1, b)) return ret.value();
   return tir::FloorDiv(a + b - 1, b, span);
 }
 
@@ -415,8 +408,7 @@ PrimExpr floormod(PrimExpr a, PrimExpr b, Span span) {
   ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
   ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::FloorMod>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::FloorMod>(a, b)) return ret.value();
   return tir::FloorMod(a, b, span);
 }
 
@@ -429,8 +421,7 @@ PrimExpr min(PrimExpr a, PrimExpr b, Span span) {
   if (is_pos_inf(b)) return a;
   if (is_neg_inf(b)) return b;
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::Min>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::Min>(a, b)) return ret.value();
   return tir::Min(a, b, span);
 }
 
@@ -443,8 +434,7 @@ PrimExpr max(PrimExpr a, PrimExpr b, Span span) {
   if (is_pos_inf(b)) return b;
   if (is_neg_inf(b)) return a;
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::Max>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::Max>(a, b)) return ret.value();
   return tir::Max(a, b, span);
 }
 
@@ -475,48 +465,42 @@ PrimExpr likely(PrimExpr cond, Span span) {
 PrimExpr operator>(PrimExpr a, PrimExpr b) { return greater(a, b); }
 PrimExpr greater(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::GT>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::GT>(a, b)) return ret.value();
   return tir::GT(a, b, span);
 }
 
 PrimExpr operator>=(PrimExpr a, PrimExpr b) { return greater_equal(a, b); }
 PrimExpr greater_equal(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::GE>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::GE>(a, b)) return ret.value();
   return tir::GE(a, b, span);
 }
 
 PrimExpr operator<(PrimExpr a, PrimExpr b) { return less(a, b); }
 PrimExpr less(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::LT>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::LT>(a, b)) return ret.value();
   return tir::LT(a, b, span);
 }
 
 PrimExpr operator<=(PrimExpr a, PrimExpr b) { return less_equal(a, b); }
 PrimExpr less_equal(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::LE>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::LE>(a, b)) return ret.value();
   return tir::LE(a, b, span);
 }
 
 PrimExpr operator==(PrimExpr a, PrimExpr b) { return equal(a, b); }
 PrimExpr equal(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::EQ>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::EQ>(a, b)) return ret.value();
   return tir::EQ(a, b, span);
 }
 
 PrimExpr operator!=(PrimExpr a, PrimExpr b) { return not_equal(a, b); }
 PrimExpr not_equal(PrimExpr a, PrimExpr b, Span span) {
   BinaryOpMatchTypes(a, b, span);
-  PrimExpr ret = arith::TryConstFold<tir::NE>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::NE>(a, b)) return ret.value();
   return tir::NE(a, b, span);
 }
 
@@ -551,24 +535,21 @@ void type_check_integer_args(const PrimExpr& lhs, const PrimExpr& rhs, const cha
 PrimExpr operator&&(PrimExpr a, PrimExpr b) { return logical_and(a, b); }
 PrimExpr logical_and(PrimExpr a, PrimExpr b, Span span) {
   type_check_boolean_args(a, b, "&& operator (logical AND)");
-  PrimExpr ret = arith::TryConstFold<tir::And>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::And>(a, b)) return ret.value();
   return tir::And(a, b, span);
 }
 
 PrimExpr operator||(PrimExpr a, PrimExpr b) { return logical_or(a, b); }
 PrimExpr logical_or(PrimExpr a, PrimExpr b, Span span) {
   type_check_boolean_args(a, b, "|| operator (logical OR)");
-  PrimExpr ret = arith::TryConstFold<tir::Or>(a, b);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::Or>(a, b)) return ret.value();
   return tir::Or(a, b, span);
 }
 
 PrimExpr operator!(PrimExpr a) { return logical_not(a); }
 PrimExpr logical_not(PrimExpr a, Span span) {
   type_check_boolean_args(a, "! operator (logical NOT)");
-  PrimExpr ret = arith::TryConstFold<tir::Not>(a);
-  if (ret.defined()) return ret;
+  if (auto ret = arith::TryConstFold<tir::Not>(a)) return ret.value();
   return tir::Not(a, span);
 }
 

From 1f8b5dec29e6e34b4cf5f092acf5b1d197a59d42 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 15 Sep 2022 13:15:10 -0700
Subject: [PATCH 170/704] [TIR, Schedule] Add schedule primitive PadEinsum
 (#12750)

* [TIR, Schedule] Add schedule primitive PadEinsum

Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com>

* lint

* [TIR] Fix producer indices check in PadEinsum

* address comments

* simplify lambda expr

* fix

Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com>
---
 include/tvm/tir/schedule/schedule.h           |  20 +
 python/tvm/tir/schedule/schedule.py           | 122 +++++
 src/tir/schedule/analysis.h                   |  27 +
 src/tir/schedule/analysis/analysis.cc         |  29 ++
 src/tir/schedule/concrete_schedule.cc         |   6 +
 src/tir/schedule/concrete_schedule.h          |   1 +
 src/tir/schedule/primitive.h                  |  11 +-
 .../primitive/layout_transformation.cc        |  36 +-
 src/tir/schedule/primitive/pad_einsum.cc      | 474 ++++++++++++++++++
 src/tir/schedule/schedule.cc                  |   3 +-
 src/tir/schedule/traced_schedule.cc           |  12 +-
 src/tir/schedule/traced_schedule.h            |   3 +-
 src/tir/schedule/transform.cc                 |   8 +
 src/tir/schedule/transform.h                  |   7 +-
 .../unittest/test_tir_schedule_pad_einsum.py  | 122 +++++
 15 files changed, 841 insertions(+), 40 deletions(-)
 create mode 100644 src/tir/schedule/primitive/pad_einsum.cc
 create mode 100644 tests/python/unittest/test_tir_schedule_pad_einsum.py

diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h
index da399ab976d6..8e5cd34d2e0b 100644
--- a/include/tvm/tir/schedule/schedule.h
+++ b/include/tvm/tir/schedule/schedule.h
@@ -627,6 +627,7 @@ class ScheduleNode : public runtime::Object {
                                 BufferIndexType buffer_index_type,
                                 const Array<IntImm>& axis_separators) = 0;
 
+  /******** Schedule: Padding ********/
   /*!
    * \brief Decompose a padding block into a block filling const pad values and a block
    * writing in-bound values.
@@ -636,6 +637,25 @@ class ScheduleNode : public runtime::Object {
    */
   virtual BlockRV DecomposePadding(const BlockRV& block_rv, const LoopRV& loop_rv) = 0;
 
+  /*!
+   * \brief Pad the computation of Einsum.
+   * \param block_rv The block that matches the Einsum pattern.
+   * \param padding The padding for each block iter.
+   * \details This schedule primitives identifies the Einsum pattern in the block body, and find its
+   * producer blocks. It then pads the computation of the Einsum pattern and its producer blocks.
+   * The output buffer and the producer buffer is resized according to the padding size. It requires
+   * the output buffer and the producer buffer to be allocated inside the PrimFunc.
+   *
+   * The padding is a list of non-negative integers, each element corresponds to the padding for
+   * each block iter in the order of block iters. The block and its producer blocks should have
+   * trivial bindings, i.e. each block iter is bound to a single loop variable. After padding, the
+   * block iter extent and the corresponding outer loop is extended by the padding size.
+   *
+   * The size of the producer buffers are infered from the padding size of the Einsum computation.
+   * The producer buffers are padded by the initial value of the corresponding reduction.
+   */
+  virtual void PadEinsum(const BlockRV& block_rv, const Array<Integer>& padding) = 0;
+
   /******** Schedule: Misc ********/
   /*! \brief A no-op that marks the start of postprocessing phase of scheduling */
   virtual void EnterPostproc() = 0;
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index d1293371a0e0..fdc871703275 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -2783,6 +2783,128 @@ def can_decompose_padding(self, block: Union[BlockRV, str], loop: LoopRV) -> boo
         """Check whether the block match padding pattern and can be decomposed."""
         return _ffi_api.CanDecomposePadding(self, block, loop)  # type: ignore # pylint: disable=no-member
 
+    @type_checked
+    def pad_einsum(self, block: Union[BlockRV, str], padding: List[int]) -> None:
+        """Pad the computation of Einsum.
+
+        This schedule primitives identifies the Einsum pattern in the block body, and find its
+        producer blocks. It then pads the computation of the Einsum pattern and its producer blocks.
+        The output buffer and the producer buffer is resized according to the padding size. It
+        requires the output buffer and the producer buffer to be allocated inside the PrimFunc.
+
+        The padding is a list of non-negative integers, each element corresponds to the padding for
+        each block iter in the order of block iters. The block and it's producer blocks should have
+        trivial bindings, i.e. each block iter is bound to a single loop variable. After padding,
+        thblock iter extent and the corresponding outer loop is extended by the padding size.
+
+        The size of the producer buffers are infered from the padding size of the Einsum
+        computation. The producer buffers are padded by the initial value of the corresponding
+        reduction.
+
+        Parameters
+        ----------
+        block : Union[BlockRV, str]
+            The block that matches the Einsum pattern.
+
+        padding : List[int]
+            The padding for each block iter.
+
+        Examples
+        --------
+
+        Before applying pad-einsum, in TensorIR, the IR is:
+
+        .. code-block:: python
+
+            @T.prim_func
+            def before_pad_einsum(
+                A: T.Buffer[(128, 127), "float32"],
+                B: T.Buffer[(127, 127), "float32"],
+                C: T.Buffer[(128, 127), "float32"],
+            ) -> None:
+                A_shared = T.alloc_buffer((128, 127), "float32", scope="shared")
+                B_shared = T.alloc_buffer((127, 127), "float32", scope="shared")
+                C_shared = T.alloc_buffer((128, 127), "float32", scope="shared")
+                for i0, i1 in T.grid(128, 127):
+                    with T.block("A"):
+                        i, j = T.axis.remap("SS", [i0, i1])
+                        A_shared[i, j] = A[i, j]
+                for i0, i1 in T.grid(127, 127):
+                    with T.block("B"):
+                        i, j = T.axis.remap("SS", [i0, i1])
+                        B_shared[i, j] = B[i, j]
+                for i0, i1, i2 in T.grid(128, 127, 127):
+                    with T.block("C_shared"):
+                        i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+                        with T.init():
+                            C_shared[i, j] = T.float32(0)
+                        C_shared[i, j] = C_shared[i, j] + A_shared[i, k] * B_shared[k, j]
+                for i0, i1 in T.grid(128, 127):
+                    with T.block("C"):
+                        i, j = T.axis.remap("SS", [i0, i1])
+                        C[i, j] = C_shared[i, j]
+
+        Create the schedule and do pad-einsum with specified block:
+
+        .. code-block:: python
+
+            sch = tir.Schedule(before_pad_einsum, debug_mask="all")
+            block = sch.get_block("C_shared")
+            sch.pad_einsum(block, [0, 1, 1])
+            print(sch.mod["main"].script())
+
+        After applying decompose-padding, the IR becomes:
+
+        .. code-block:: python
+
+            @T.prim_func
+            def after_pad_einsum(
+                A: T.Buffer[(128, 127), "float32"],
+                B: T.Buffer[(127, 127), "float32"],
+                C: T.Buffer[(128, 127), "float32"],
+            ) -> None:
+                A_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+                B_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+                C_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+                for i0, i1 in T.grid(128, 128):
+                    with T.block("A"):
+                        i, j = T.axis.remap("SS", [i0, i1])
+                        T.reads(A[i, j])
+                        T.writes(A_shared_padded[i, j])
+                        A_shared_padded[i, j] = T.if_then_else(
+                            j < 127, A[i, j], T.float32(0), dtype="float32"
+                        )
+                for i0, i1 in T.grid(128, 128):
+                    with T.block("B"):
+                        i, j = T.axis.remap("SS", [i0, i1])
+                        T.reads(B[i, j])
+                        T.writes(B_shared_padded[i, j])
+                        B_shared_padded[i, j] = T.if_then_else(
+                            i < 127 and j < 127, B[i, j], T.float32(0), dtype="float32"
+                        )
+                for i0, i1, i2 in T.grid(128, 128, 128):
+                    with T.block("C_shared"):
+                        i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+                        T.reads(A_shared_padded[i, k], B_shared_padded[k, j])
+                        T.writes(C_shared_padded[i, j])
+                        with T.init():
+                            C_shared_padded[i, j] = T.float32(0)
+                        C_shared_padded[i, j] = (
+                            C_shared_padded[i, j] + A_shared_padded[i, k] * B_shared_padded[k, j]
+                        )
+                for i0, i1 in T.grid(128, 127):
+                    with T.block("C"):
+                        i, j = T.axis.remap("SS", [i0, i1])
+                        T.reads(C_shared_padded[i, j])
+                        T.writes(C[i, j])
+                        C[i, j] = C_shared_padded[i, j]
+
+        """
+        block = self._normalize_block_arg(block)
+        return _ffi_api.SchedulePadEinsum(  # type: ignore # pylint: disable=no-member
+            self, block, padding
+        )
+
     ########## Schedule: Misc ##########
 
     @type_checked
diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h
index 489df8959d1b..ca45bcac6b34 100644
--- a/src/tir/schedule/analysis.h
+++ b/src/tir/schedule/analysis.h
@@ -298,6 +298,15 @@ bool GetVarsTouchedByBlockIters(const BlockRealize& block_realize,
 void CheckLoopStartsWithZero(const ScheduleState& self, const StmtSRef& loop_sref,
                              arith::Analyzer* analyzer);
 
+/*!
+ * \brief Check whether a block has a trivial binding, i.e. each block var is bound to a outer loop,
+ * from outer to inner.
+ * \param self The schedule state
+ * \param block_sref The block to be checked
+ * \throw ScheduleError If the block does not have trivial bindings
+ */
+void CheckBlockHasTrivialBinding(const ScheduleState& self, const StmtSRef& block_sref);
+
 /******** Block-loop relation ********/
 
 /*!
@@ -697,6 +706,24 @@ Array<arith::IntSet> AnalyzeRegionLowerBound(const BufferRegion& region, const P
                                              const StmtSRef& dom_high_exclusive,
                                              arith::Analyzer* analyzer);
 
+/*!
+ * \brief Check if buffer indices are all Vars and extr
+ * \param buffer_access The BufferLoad or BufferStore
+ * \return The indices if the indices are all Vars, otherwise NullOpt
+ */
+template <typename T>
+Optional<Array<Var>> CheckTrivialBufferIndices(const T& buffer_access) {
+  Array<Var> indices;
+  for (const PrimExpr& index : buffer_access->indices) {
+    const VarNode* var = index.as<VarNode>();
+    if (var == nullptr) {
+      return NullOpt;
+    }
+    indices.push_back(GetRef<Var>(var));
+  }
+  return indices;
+}
+
 /*! \brief Necessary information used for tensorization */
 class TensorizeInfoNode : public Object {
  public:
diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index 7ed60876ab22..4f78b0c9cd43 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -652,6 +652,35 @@ void CheckAffineBinding(const ScheduleState& self, Block block) {
   CheckPartialAffineBinding(self, std::move(block), NullOpt);
 }
 
+void CheckBlockHasTrivialBinding(const ScheduleState& self, const StmtSRef& block_sref) {
+  class NotTrivialBindingError : public ScheduleError {
+   public:
+    explicit NotTrivialBindingError(IRModule mod, Block block)
+        : mod_(std::move(mod)), block_(std::move(block)) {}
+
+    String FastErrorString() const final {
+      return "ScheduleError: The binding values of the block are not variables of outer loops.";
+    }
+
+    String DetailRenderTemplate() const final {
+      std::ostringstream os;
+      os << "The binding values of the {0} are not variables of outer loops.";
+      return os.str();
+    }
+
+    IRModule mod() const final { return mod_; }
+    Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
+
+   private:
+    IRModule mod_;
+    Block block_;
+  };
+
+  if (!IsTrivialBinding(self, block_sref)) {
+    throw NotTrivialBindingError(self->mod, GetRef<Block>(block_sref->StmtAs<BlockNode>()));
+  }
+}
+
 Map<Var, Range> LoopDomainOfSRefTreePath(const StmtSRef& low_inclusive,
                                          const Optional<StmtSRef>& high_exclusive,
                                          const runtime::StorageScope& extra_relax_scope) {
diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc
index afc675799706..9d7dc6b95f50 100644
--- a/src/tir/schedule/concrete_schedule.cc
+++ b/src/tir/schedule/concrete_schedule.cc
@@ -795,6 +795,12 @@ BlockRV ConcreteScheduleNode::DecomposePadding(const BlockRV& block_rv, const Lo
   return CreateRV<BlockRV>(result);
 }
 
+void ConcreteScheduleNode::PadEinsum(const BlockRV& block_rv, const Array<Integer>& padding) {
+  TVM_TIR_SCHEDULE_BEGIN();
+  tir::PadEinsum(state_, this->GetSRef(block_rv), padding);
+  TVM_TIR_SCHEDULE_END("pad-einsum", this->error_render_level_);
+  this->state_->DebugVerify();
+}
 /******** Schedule: Misc ********/
 
 }  // namespace tir
diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h
index e79d1d528809..1aa9dafcc93e 100644
--- a/src/tir/schedule/concrete_schedule.h
+++ b/src/tir/schedule/concrete_schedule.h
@@ -128,6 +128,7 @@ class ConcreteScheduleNode : public ScheduleNode {
   /******** Schedule: Reduction ********/
   BlockRV RFactor(const LoopRV& loop_rv, int factor_axis) override;
   BlockRV DecomposeReduction(const BlockRV& block_rv, const LoopRV& loop_rv) override;
+  void PadEinsum(const BlockRV& block_rv, const Array<Integer>& padding) override;
   /******** Schedule: Block annotation ********/
   void StorageAlign(const BlockRV& block_rv, int buffer_index, int axis, int factor,
                     int offset) override;
diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h
index 05d9e4cf944a..97233fe4bc6f 100644
--- a/src/tir/schedule/primitive.h
+++ b/src/tir/schedule/primitive.h
@@ -490,7 +490,7 @@ TVM_DLL void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int
 TVM_DLL void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
                                   const IndexMap& index_map);
 
-/******** Schedule: Padding decomposition ********/
+/******** Schedule: Padding ********/
 /*!
  * \brief Decompose a padding block into a block filling const pad values and a block
  * writing in-bound values.
@@ -501,6 +501,15 @@ TVM_DLL void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref
 TVM_DLL StmtSRef DecomposePadding(ScheduleState self, const StmtSRef& block_sref,
                                   const StmtSRef& loop_sref);
 
+/*!
+ * \brief Pad the computation of Einsum.
+ * \param self The state of the schedule
+ * \param block_sref The block sref that matches the Einsum pattern.
+ * \param padding The padding for each block iter.
+ */
+TVM_DLL void PadEinsum(ScheduleState self, const StmtSRef& block_sref,
+                       const Array<Integer>& padding);
+
 /******** Schedule: Misc ********/
 
 }  // namespace tir
diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc
index 8e2643db0103..32ed279f028f 100644
--- a/src/tir/schedule/primitive/layout_transformation.cc
+++ b/src/tir/schedule/primitive/layout_transformation.cc
@@ -278,40 +278,6 @@ class IndexMapNotApplicableToBlockIterError : public ScheduleError {
   IndexMap index_map_;
 };
 
-class NotTrivialBindingError : public ScheduleError {
- public:
-  explicit NotTrivialBindingError(IRModule mod, Block block)
-      : mod_(std::move(mod)), block_(std::move(block)) {}
-
-  static void CheckBlockHasTrivialBinding(const IRModule& mod, const BlockRealize& block_realize,
-                                          std::unordered_set<const VarNode*> outer_loop_vars) {
-    // Step 2: Check all the binding values are loops vars
-    for (const PrimExpr& iter_value : block_realize->iter_values) {
-      const VarNode* loop_var = iter_value.as<VarNode>();
-      if (!loop_var || !outer_loop_vars.count(loop_var)) {
-        throw NotTrivialBindingError(mod, block_realize->block);
-      }
-    }
-  }
-
-  String FastErrorString() const final {
-    return "ScheduleError: The binding values of the block are not variables of outer loops.";
-  }
-
-  String DetailRenderTemplate() const final {
-    std::ostringstream os;
-    os << "The binding values of the {0} are not variables of outer loops.";
-    return os.str();
-  }
-
-  IRModule mod() const final { return mod_; }
-  Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
-
- private:
-  IRModule mod_;
-  Block block_;
-};
-
 class OpaqueNewIterTypeError : public ScheduleError {
  public:
   explicit OpaqueNewIterTypeError(IRModule mod, Block block, PrimExpr iter_value)
@@ -363,7 +329,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
   }
 
   BlockRealize block_realize = GetBlockRealize(self, block_sref);
-  NotTrivialBindingError::CheckBlockHasTrivialBinding(self->mod, block_realize, loop_vars);
+  CheckBlockHasTrivialBinding(self, block_sref);
 
   // Step 3: Collect information of block iter vars
   Array<PrimExpr> block_vars;      // iter_var->var of each block iter
diff --git a/src/tir/schedule/primitive/pad_einsum.cc b/src/tir/schedule/primitive/pad_einsum.cc
new file mode 100644
index 000000000000..7a7b88d686f9
--- /dev/null
+++ b/src/tir/schedule/primitive/pad_einsum.cc
@@ -0,0 +1,474 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <optional>
+
+#include "../utils.h"
+
+namespace tvm {
+namespace tir {
+
+/*! \brief The schedule error class when the padding size is invalid. */
+class InvalidPaddingError : public ScheduleError {
+ public:
+  InvalidPaddingError(IRModule mod, Block block, Array<Integer> padding)
+      : mod_(std::move(mod)), block_(std::move(block)), padding_(std::move(padding)) {}
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
+  String FastErrorString() const final {
+    return "ScheduleError: The padding size for the block is invalid.";
+  }
+  String DetailRenderTemplate() const final {
+    std::ostringstream os;
+    os << "The padding for the block {0} are invalid. It should be a list of "
+       << block_->iter_vars.size() << " non-negative integers. Got " << padding_;
+    return os.str();
+  }
+
+  static void Check(const ScheduleState& self, const Block& block, Array<Integer> padding) {
+    if (padding.size() != block->iter_vars.size()) {
+      throw InvalidPaddingError(self->mod, block, padding);
+    }
+    for (const auto& pad : padding) {
+      if (pad->value < 0) {
+        throw InvalidPaddingError(self->mod, block, padding);
+      }
+    }
+  }
+
+ private:
+  IRModule mod_;
+  Block block_;
+  Array<Integer> padding_;
+};
+
+/*! \brief The schedule error class when the block body is not an Einsum pattern. */
+class NonEinsumError : public ScheduleError {
+ public:
+  explicit NonEinsumError(IRModule mod, Block block)
+      : mod_(std::move(mod)), block_(std::move(block)) {}
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
+  String FastErrorString() const final {
+    return "ScheduleError: The block is not a computation of Einsum pattern.";
+  }
+  String DetailRenderTemplate() const final {
+    return "The block {0} not a computation of Einsum pattern.";
+  }
+
+ private:
+  IRModule mod_;
+  Block block_;
+};
+
+/*! \brief Data structure that represents a Einsum computation. */
+struct Einsum {
+  // The output buffer
+  Buffer output_buffer;
+  // The indices of the output buffer
+  Array<Var> output_indices;
+  // The indices of the input buffers
+  Map<Buffer, Array<Var>> input_indices;
+};
+
+class EinsumExtractor : public ExprVisitor {
+ public:
+  EinsumExtractor() = default;
+
+  std::optional<Einsum> Extract(const Block& block) {
+    const BufferStoreNode* update = block->body.as<BufferStoreNode>();
+    // Step 1: Check the body is a BufferStore and the block has the init statement, and the
+    // BufferStore and the init statement store have the same output buffer indices.
+    if (update == nullptr || !block->init.defined()) {
+      return std::nullopt;
+    }
+
+    if (Optional<Array<Var>> opt_indices = CheckTrivialBufferIndices(update);
+        opt_indices.defined()) {
+      ein_sum_.output_indices = std::move(opt_indices.value());
+    } else {
+      return std::nullopt;
+    }
+    ein_sum_.output_buffer = update->buffer;
+
+    const BufferStoreNode* init = block->init.value().as<BufferStoreNode>();
+    ICHECK(init != nullptr);
+    if (!CompareBufferIndices(init->indices, ein_sum_.output_indices)) {
+      return std::nullopt;
+    }
+    // Step 2: Check the BufferStore updates the output buffer and the input buffers indices are
+    // block iter variables.
+    CheckStoreValue(update->value);
+    if (fail_) {
+      return std::nullopt;
+    }
+    return std::move(ein_sum_);
+  }
+
+ private:
+  void CheckStoreValue(const PrimExpr& update) {
+    // Check the update part has the form:
+    //   Output[output_indices] += Input_0[input_indices_0] op_0 Input_1[input_indices_1] op_1 ...
+    // where output_indices and input_indices_i are the indices are arrays whose elements are the
+    // block iter variables instead of composite PrimExpr, and op_i are the binary operations.
+
+    // Check the value is Add and eithe LHS or RHS is the BufferLoad from the output buffer.
+    const AddNode* add = update.as<AddNode>();
+    if (add == nullptr) {
+      fail_ = true;
+      return;
+    }
+    const BufferLoadNode* lhs = add->a.as<BufferLoadNode>();
+    const BufferLoadNode* rhs = add->b.as<BufferLoadNode>();
+    if (lhs == nullptr && rhs != nullptr) {
+      std::swap(lhs, rhs);
+    }
+    if (lhs == nullptr || !lhs->buffer.same_as(ein_sum_.output_buffer) ||
+        !CompareBufferIndices(lhs->indices, ein_sum_.output_indices)) {
+      fail_ = true;
+      return;
+    }
+    VisitExpr(add->b);
+  }
+
+  void VisitExpr(const PrimExpr& n) final {
+    if (n->IsInstance<BufferLoadNode>() || n->IsInstance<MulNode>() || n->IsInstance<CastNode>()) {
+      ExprVisitor::VisitExpr(n);
+    } else {
+      fail_ = true;
+      return;
+    }
+  }
+
+  void VisitExpr_(const BufferLoadNode* op) final {
+    if (auto it = ein_sum_.input_indices.find(op->buffer);
+        it != ein_sum_.input_indices.end() && !CompareBufferIndices(op->indices, (*it).second)) {
+      fail_ = true;
+      return;
+    }
+    if (Optional<Array<Var>> opt_indices = CheckTrivialBufferIndices(op); opt_indices.defined()) {
+      ein_sum_.input_indices.Set(op->buffer, std::move(opt_indices.value()));
+    } else {
+      fail_ = true;
+      return;
+    }
+  }
+
+  void VisitExpr_(const CastNode* op) { VisitExpr(op->value); }
+
+  bool Fail() { return fail_; }
+
+  bool CompareBufferIndices(const Array<PrimExpr>& indices, const Array<Var>& other) {
+    return std::equal(indices.begin(), indices.end(), other.begin(), other.end(),
+                      [](const PrimExpr& a, const Var& b) { return a.same_as(b); });
+  }
+
+  Einsum ein_sum_;
+  bool fail_{false};
+};
+
+Einsum ExtractEinsum(const ScheduleState& self, const Block& block) {
+  EinsumExtractor extractor;
+  std::optional<Einsum> einsum = extractor.Extract(block);
+  if (!einsum.has_value()) {
+    throw NonEinsumError(self->mod, block);
+  }
+  return einsum.value();
+}
+
+class BufferNotAllocatedInScopeError : public ScheduleError {
+ public:
+  explicit BufferNotAllocatedInScopeError(IRModule mod, Buffer buffer)
+      : mod_(std::move(mod)), buffer_(std::move(buffer)) {}
+
+  String FastErrorString() const final {
+    return "ScheduleError: The buffer is not allocated as an intermediate buffer in current "
+           "PrimFunc.";
+  }
+
+  String DetailRenderTemplate() const final {
+    std::ostringstream os;
+    os << "The buffer " << buffer_->name
+       << " is not allocated as an intermediate buffer in current PrimFunc.";
+    return os.str();
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {}; }
+
+ private:
+  IRModule mod_;
+  Buffer buffer_;
+};
+
+class PadEinsumRewriter : public ReplaceBufferMutator {
+ public:
+  PadEinsumRewriter(const std::unordered_map<const BlockNode*, PrimExpr> producer_predicate,
+                    Map<Var, PrimExpr> padded_iter_extents, const Map<Buffer, Buffer>& buffer_remap,
+                    Map<Block, Block>* block_sref_reuse, arith::Analyzer* analyzer)
+      : ReplaceBufferMutator(buffer_remap, block_sref_reuse),
+        producer_predicate_(producer_predicate),
+        padded_iter_extents_(padded_iter_extents),
+        analyzer_(analyzer) {}
+
+  Stmt VisitStmt_(const ForNode* op) final {
+    For new_for = Downcast<For>(ReplaceBufferMutator::VisitStmt_(op));
+    if (padded_iter_extents_.count(new_for->loop_var)) {
+      new_for.CopyOnWrite()->extent = padded_iter_extents_.at(new_for->loop_var);
+    }
+    return std::move(new_for);
+  }
+
+  Block PadProducerBlock(Block block, const PrimExpr& predicate) {
+    BufferStore store = Downcast<BufferStore>(block->body);
+    store.CopyOnWrite()->value =
+        analyzer_->Simplify(if_then_else(predicate, store->value, make_zero(store->value.dtype())));
+    block.CopyOnWrite()->body = std::move(store);
+    return block;
+  }
+
+  Stmt VisitStmt_(const BlockNode* op) final {
+    Block old_block = GetRef<Block>(op);
+    Block new_block = Downcast<Block>(ReplaceBufferMutator::VisitStmt_(op));
+    if (auto it = producer_predicate_.find(op); it != producer_predicate_.end()) {
+      new_block = PadProducerBlock(std::move(new_block), (*it).second);
+    }
+
+    // Mutate block iters
+    Array<IterVar> new_iters;
+    bool changed = false;
+    for (const IterVar& iter : new_block->iter_vars) {
+      if (auto it = padded_iter_extents_.find(iter->var); it != padded_iter_extents_.end()) {
+        changed = true;
+        new_iters.push_back(
+            IterVar(Range::FromMinExtent(0, (*it).second), iter->var, iter->iter_type));
+      } else {
+        new_iters.push_back(iter);
+      }
+    }
+    if (changed) {
+      new_block.CopyOnWrite()->iter_vars = std::move(new_iters);
+    }
+    if (!old_block.same_as(new_block)) {
+      block_sref_reuse_->Set(old_block, new_block);
+    }
+    return std::move(new_block);
+  }
+
+ private:
+  const std::unordered_set<const BlockNode*> producer_blocks_;
+  const std::unordered_map<const BlockNode*, PrimExpr> producer_predicate_;
+  const Map<Var, PrimExpr> padded_iter_extents_;
+  arith::Analyzer* analyzer_;
+};
+
+/*! \brief The schedule error class when the producer block cannot be padded. */
+class InvalidProducerError : public ScheduleError {
+ public:
+  explicit InvalidProducerError(IRModule mod, Block producer)
+      : mod_(std::move(mod)), producer_(std::move(producer)) {}
+
+  String FastErrorString() const final {
+    return "ScheduleError: The producer block cannot be padded.";
+  }
+
+  String DetailRenderTemplate() const final {
+    std::ostringstream os;
+    os << "The producer block {0} cannot be padded. It should write to a single buffer and the "
+          "body should be a BufferStore.";
+    return os.str();
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {producer_}; }
+
+ private:
+  IRModule mod_;
+  Buffer buffer_;
+  Block producer_;
+};
+
+void PadEinsum(ScheduleState self, const StmtSRef& block_sref, const Array<Integer>& padding) {
+  arith::Analyzer analyzer;
+  // Step 1: Input checking and error handling
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
+  BlockRealize realize = GetBlockRealize(self, block_sref);
+
+  const StmtSRef& scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true);
+  InvalidPaddingError::Check(self, GetRef<Block>(block), padding);
+
+  const Array<StmtSRef> producers = GetProducers(self, block_sref);
+  {
+    auto f_check_block_properties = [&](const StmtSRef& block_sref, bool is_producer) {
+      CheckBlockHasTrivialBinding(self, block_sref);
+      if (is_producer) {
+        CheckCompleteBlock(self, block_sref, scope_sref);
+      } else {
+        CheckReductionBlock(self, block_sref, scope_sref);
+      }
+      Array loops = GetLoops(block_sref);
+      ICHECK(!loops.empty());
+      CheckGetSingleChildBlockRealizeOnSRefTree(self, loops.front());
+    };
+
+    // Check block properties of the computation block
+    f_check_block_properties(block_sref, false);
+
+    // Check block properties of the producer block
+    for (const StmtSRef& producer_sref : producers) {
+      f_check_block_properties(producer_sref, true);
+    }
+  }
+
+  Einsum einsum = ExtractEinsum(self, GetRef<Block>(block));
+
+  // Check input and output buffers are all allocated in the current scope.
+  {
+    auto f_check_buffer_allocated = [&](const Buffer& buffer) {
+      auto [defining_site_sref, is_allocate] = GetBufferDefiningSite(block_sref, buffer);
+      if (!defining_site_sref.defined() || !is_allocate) {
+        throw BufferNotAllocatedInScopeError(self->mod, buffer);
+      }
+    };
+    f_check_buffer_allocated(einsum.output_buffer);
+    for (const auto& buffer_indices_pair : einsum.input_indices) {
+      f_check_buffer_allocated(buffer_indices_pair.first);
+    }
+  }
+
+  // Step 2: Prepare buffer and variable remapping. Infer the new shape of the input and the output
+  // buffers. Infer the new extent of the block iters of the computation block and the producer
+  // block.
+
+  Map<Var, PrimExpr> padded_iter_extents;  // The new extents of both the block iters and loop vars
+
+  // Convert the input padding array to a map from variables to the padded extents
+  for (int i = 0, n = padding.size(); i < n; ++i) {
+    const IterVar& iter = block->iter_vars[i];
+    PrimExpr new_extent =
+        IntImm(iter->var->dtype, Downcast<Integer>(iter->dom->extent)->value + padding[i]->value);
+    padded_iter_extents.Set(iter->var, new_extent);
+    padded_iter_extents.Set(Downcast<Var>(realize->iter_values[i]), new_extent);
+  }
+
+  Map<Buffer, Buffer> buffer_remap;  // mapping from buffers to new buffers with padded shapes
+
+  // Utility function to pad a buffer with the new shape
+  auto f_pad_buffer = [&padded_iter_extents, &buffer_remap](Buffer buffer,
+                                                            const Array<Var>& indices) -> Buffer {
+    Array<PrimExpr> new_shape;
+    for (const Var& index : indices) {
+      new_shape.push_back(padded_iter_extents.at(index));
+    }
+    ICHECK_EQ(buffer->shape.size(), new_shape.size());
+    buffer.CopyOnWrite()->shape = std::move(new_shape);
+    return buffer;
+  };
+
+  buffer_remap.Set(einsum.output_buffer, f_pad_buffer(einsum.output_buffer, einsum.output_indices));
+
+  std::unordered_map<const BlockNode*, PrimExpr> producer_predicate;
+
+  // Different from the output block, the padding for the producer block is not directly specified
+  // as the input argument. Instead, it is inferred from indices of the producer buffer accessed in
+  // the output block.
+  // We will find the indices (which are block iters) in BufferStore to the producer buffer
+  // and infer the new extents of the block iters and the corresponding loop vars.
+  for (const StmtSRef& producer_sref : producers) {
+    const BlockNode* producer_block = TVM_SREF_TO_BLOCK(producer_sref);
+    const BufferStoreNode* buffer_store = producer_block->body.as<BufferStoreNode>();
+    Optional<Array<Var>> producer_store_indices;
+    if (!buffer_store || producer_block->writes.size() != 1 ||
+        !(producer_store_indices = CheckTrivialBufferIndices(buffer_store)).defined()) {
+      throw InvalidProducerError(self->mod, GetRef<Block>(producer_block));
+    }
+    BlockRealize producer_realize = GetBlockRealize(self, producer_sref);
+
+    const Buffer& old_buffer = producer_block->writes[0]->buffer;
+    Buffer new_buffer = f_pad_buffer(old_buffer, einsum.input_indices.at(old_buffer));
+    buffer_remap.Set(old_buffer, new_buffer);
+
+    // The predicate to ensure the producer block is in the original bound before padding
+    PrimExpr predicate = Bool(true);
+    Map<Var, PrimExpr> indices_to_padded_extents;  // buffer indices to padded extents
+    for (int i = 0, n = producer_store_indices.value().size(); i < n; ++i) {
+      const Var& index = producer_store_indices.value()[i];
+      PrimExpr padded_extent = new_buffer->shape[i];
+      if (!analyzer.CanProveEqual(padded_extent, old_buffer->shape[i])) {
+        predicate = predicate && (index < old_buffer->shape[i]);
+      }
+      indices_to_padded_extents.Set(index, padded_extent);
+    }
+
+    for (int i = 0, n = producer_block->iter_vars.size(); i < n; ++i) {
+      const IterVar& iter = producer_block->iter_vars[i];
+      if (auto it = indices_to_padded_extents.find(iter->var);
+          it != indices_to_padded_extents.end()) {
+        const PrimExpr& padded_extent = (*it).second;
+        padded_iter_extents.Set(iter->var, padded_extent);
+        padded_iter_extents.Set(Downcast<Var>(producer_realize->iter_values[i]), padded_extent);
+      } else if (!is_one(iter->dom->extent)) {
+        throw InvalidProducerError(self->mod, GetRef<Block>(producer_block));
+      }
+    }
+    producer_predicate[producer_block] = predicate;
+  }
+
+  // Step 3: Mutate the AST subtree with the new buffers and the new block iter extents.
+  Map<Block, Block> block_sref_reuse;
+  PadEinsumRewriter rewriter(producer_predicate, padded_iter_extents, buffer_remap,
+                             &block_sref_reuse, &analyzer);
+  const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref);
+  Stmt new_scope_block = rewriter(GetRef<Block>(scope_block));
+
+  // Step 4: Do the actual replacement.
+  self->Replace(scope_sref, new_scope_block, block_sref_reuse);
+}
+
+/******** Instruction Registration ********/
+
+struct PadEinsumTraits : public UnpackedInstTraits<PadEinsumTraits> {
+  static constexpr const char* kName = "PadEinsum";
+  static constexpr bool kIsPure = false;
+
+ private:
+  static constexpr size_t kNumInputs = 1;
+  static constexpr size_t kNumAttrs = 1;
+  static constexpr size_t kNumDecisions = 0;
+
+  static void UnpackedApplyToSchedule(Schedule sch, BlockRV block, Array<Integer> padding) {
+    sch->PadEinsum(block, padding);
+  }
+
+  static String UnpackedAsPython(Array<String> outputs, String block, Array<Integer> padding) {
+    PythonAPICall py("pad_einsum");
+    py.Input("block", block);
+    py.Input("padding", padding);
+    return py.Str();
+  }
+
+  template <typename>
+  friend struct ::tvm::tir::UnpackedInstTraits;
+};
+
+TVM_REGISTER_INST_KIND_TRAITS(PadEinsumTraits);
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc
index 091db344aadb..d72f67fb7c2d 100644
--- a/src/tir/schedule/schedule.cc
+++ b/src/tir/schedule/schedule.cc
@@ -264,7 +264,8 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleSetAxisSeparator")
 /******** (FFI) Padding decomposition ********/
 TVM_REGISTER_GLOBAL("tir.schedule.ScheduleDecomposePadding")
     .set_body_method<Schedule>(&ScheduleNode::DecomposePadding);
-
+TVM_REGISTER_GLOBAL("tir.schedule.SchedulePadEinsum")
+    .set_body_method<Schedule>(&ScheduleNode::PadEinsum);
 /******** (FFI) Misc ********/
 TVM_REGISTER_GLOBAL("tir.schedule.ScheduleEnterPostproc")
     .set_body_method<Schedule>(&ScheduleNode::EnterPostproc);
diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc
index 04ddc0507dc4..a31950d33115 100644
--- a/src/tir/schedule/traced_schedule.cc
+++ b/src/tir/schedule/traced_schedule.cc
@@ -520,7 +520,7 @@ void TracedScheduleNode::SetAxisSeparator(const BlockRV& block_rv, int buffer_in
       /*outputs=*/{}));
 }
 
-/******** Schedule: Padding decomposition ********/
+/******** Schedule: Padding ********/
 BlockRV TracedScheduleNode::DecomposePadding(const BlockRV& block_rv, const LoopRV& loop_rv) {
   BlockRV new_block = ConcreteScheduleNode::DecomposePadding(block_rv, loop_rv);
   static const InstructionKind& kind = InstructionKind::Get("DecomposePadding");
@@ -532,6 +532,16 @@ BlockRV TracedScheduleNode::DecomposePadding(const BlockRV& block_rv, const Loop
   return new_block;
 }
 
+void TracedScheduleNode::PadEinsum(const BlockRV& block_rv, const Array<Integer>& padding) {
+  ConcreteScheduleNode::PadEinsum(block_rv, padding);
+  static const InstructionKind& kind = InstructionKind::Get("PadEinsum");
+  trace_->Append(/*inst=*/Instruction(
+      /*kind=*/kind,
+      /*inputs=*/{block_rv},
+      /*attrs=*/{padding},
+      /*outputs=*/{}));
+}
+
 /******** Schedule: Misc ********/
 
 void TracedScheduleNode::EnterPostproc() {
diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h
index d98e4ba4bb95..ad44cc6ae552 100644
--- a/src/tir/schedule/traced_schedule.h
+++ b/src/tir/schedule/traced_schedule.h
@@ -108,8 +108,9 @@ class TracedScheduleNode : public ConcreteScheduleNode {
   void SetAxisSeparator(const BlockRV& block_rv, int buffer_index,
                         BufferIndexType buffer_index_type,
                         const Array<IntImm>& axis_separators) final;
-  /******** Schedule: Padding decomposition ********/
+  /******** Schedule: Padding ********/
   BlockRV DecomposePadding(const BlockRV& block_rv, const LoopRV& loop_rv) final;
+  void PadEinsum(const BlockRV& block_rv, const Array<Integer>& padding) final;
   /******** Schedule: Misc ********/
   void EnterPostproc() final;
 };
diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc
index c11fa656d6da..dfbd3dbcbcc4 100644
--- a/src/tir/schedule/transform.cc
+++ b/src/tir/schedule/transform.cc
@@ -103,6 +103,14 @@ ReplaceBufferMutator::ReplaceBufferMutator(const Buffer& old_buffer, Buffer new_
   buffer_var_map_[old_buffer->data.get()] = std::move(new_buffer);
 }
 
+ReplaceBufferMutator::ReplaceBufferMutator(const Map<Buffer, Buffer>& buffer_map,
+                                           Map<Block, Block>* block_sref_reuse)
+    : block_sref_reuse_(block_sref_reuse) {
+  for (const auto& [old_buffer, new_buffer] : buffer_map) {
+    buffer_var_map_[old_buffer->data.get()] = new_buffer;
+  }
+}
+
 PrimExpr ReplaceBufferMutator::VisitExpr_(const VarNode* var) {
   auto it = buffer_var_map_.find(var);
   return it != buffer_var_map_.end() ? it->second->data : GetRef<Var>(var);
diff --git a/src/tir/schedule/transform.h b/src/tir/schedule/transform.h
index 908a823c2d86..4de3685e2482 100644
--- a/src/tir/schedule/transform.h
+++ b/src/tir/schedule/transform.h
@@ -114,7 +114,12 @@ class ReplaceBufferMutator : public StmtExprMutator {
   ReplaceBufferMutator(const Buffer& old_buffer, Buffer new_buffer,
                        Map<Block, Block>* block_sref_reuse);
 
+  ReplaceBufferMutator(const Map<Buffer, Buffer>& buffer_map, Map<Block, Block>* block_sref_reuse);
+
  protected:
+  using StmtExprMutator::VisitExpr_;
+  using StmtExprMutator::VisitStmt_;
+
   PrimExpr VisitExpr_(const VarNode* var) final;
 
   template <typename Node>
@@ -132,7 +137,7 @@ class ReplaceBufferMutator : public StmtExprMutator {
 
   virtual MatchBufferRegion VisitMatchBufferRegion(const MatchBufferRegion& match_buffer);
 
-  Stmt VisitStmt_(const BlockNode* block) final;
+  Stmt VisitStmt_(const BlockNode* block) override;
 
   /*!
    * \brief A mapping which maps old buffer vars to new buffers, including the buffers defined in
diff --git a/tests/python/unittest/test_tir_schedule_pad_einsum.py b/tests/python/unittest/test_tir_schedule_pad_einsum.py
new file mode 100644
index 000000000000..89628db4ff74
--- /dev/null
+++ b/tests/python/unittest/test_tir_schedule_pad_einsum.py
@@ -0,0 +1,122 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-function-docstring,missing-module-docstring
+import sys
+
+import pytest
+import tvm
+import tvm.testing
+from tvm import tir, te
+from tvm.script import tir as T
+from tvm.tir.schedule.schedule import ScheduleError
+from tvm.tir.schedule.testing import verify_trace_roundtrip
+from tvm.meta_schedule.testing import te_workload
+
+# pylint: disable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
+
+
+@T.prim_func
+def matmul_before(
+    A: T.Buffer[(128, 127), "float32"],
+    B: T.Buffer[(127, 127), "float32"],
+    C: T.Buffer[(128, 127), "float32"],
+) -> None:
+    A_shared = T.alloc_buffer((128, 127), "float32", scope="shared")
+    B_shared = T.alloc_buffer((127, 127), "float32", scope="shared")
+    C_shared = T.alloc_buffer((128, 127), "float32", scope="shared")
+    for i0, i1 in T.grid(128, 127):
+        with T.block("A"):
+            i, j = T.axis.remap("SS", [i0, i1])
+            A_shared[i, j] = A[i, j]
+    for i0, i1 in T.grid(127, 127):
+        with T.block("B"):
+            i, j = T.axis.remap("SS", [i0, i1])
+            B_shared[i, j] = B[i, j]
+    for i0, i1, i2 in T.grid(128, 127, 127):
+        with T.block("C_shared"):
+            i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+            with T.init():
+                C_shared[i, j] = T.float32(0)
+            C_shared[i, j] = C_shared[i, j] + A_shared[i, k] * B_shared[k, j]
+    for i0, i1 in T.grid(128, 127):
+        with T.block("C"):
+            i, j = T.axis.remap("SS", [i0, i1])
+            C[i, j] = C_shared[i, j]
+
+
+@T.prim_func
+def matmul_expected(
+    A: T.Buffer[(128, 127), "float32"],
+    B: T.Buffer[(127, 127), "float32"],
+    C: T.Buffer[(128, 127), "float32"],
+) -> None:
+    A_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+    B_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+    C_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+    for i0, i1 in T.grid(128, 128):
+        with T.block("A"):
+            i, j = T.axis.remap("SS", [i0, i1])
+            T.reads(A[i, j])
+            T.writes(A_shared_padded[i, j])
+            A_shared_padded[i, j] = T.if_then_else(j < 127, A[i, j], T.float32(0), dtype="float32")
+    for i0, i1 in T.grid(128, 128):
+        with T.block("B"):
+            i, j = T.axis.remap("SS", [i0, i1])
+            T.reads(B[i, j])
+            T.writes(B_shared_padded[i, j])
+            B_shared_padded[i, j] = T.if_then_else(
+                i < 127 and j < 127, B[i, j], T.float32(0), dtype="float32"
+            )
+    for i0, i1, i2 in T.grid(128, 128, 128):
+        with T.block("C_shared"):
+            i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+            T.reads(A_shared_padded[i, k], B_shared_padded[k, j])
+            T.writes(C_shared_padded[i, j])
+            with T.init():
+                C_shared_padded[i, j] = T.float32(0)
+            C_shared_padded[i, j] = (
+                C_shared_padded[i, j] + A_shared_padded[i, k] * B_shared_padded[k, j]
+            )
+    for i0, i1 in T.grid(128, 127):
+        with T.block("C"):
+            i, j = T.axis.remap("SS", [i0, i1])
+            T.reads(C_shared_padded[i, j])
+            T.writes(C[i, j])
+            C[i, j] = C_shared_padded[i, j]
+
+
+# pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
+
+
+def test_pad_matmul():
+    sch = tir.Schedule(matmul_before, debug_mask="all")
+    C = sch.get_block("C_shared")
+    sch.pad_einsum(C, [0, 1, 1])
+    tvm.ir.assert_structural_equal(matmul_expected, sch.mod["main"])
+    verify_trace_roundtrip(sch, mod=matmul_before)
+
+
+def test_pad_matmul_error_non_intermediate_buffer():
+    func = te.create_prim_func(te_workload.matmul(128, 127, 127))
+    sch = tir.Schedule(func, debug_mask="all")
+    C = sch.get_block("C")
+    with pytest.raises(ScheduleError):
+        sch.pad_einsum(C, [0, 1, 1])
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 9b1042585effaad047808520158260a33f3f0f75 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 15 Sep 2022 13:30:08 -0700
Subject: [PATCH 171/704] [Arith] Simplify nested if_then_else (#12749)

[Arith] Simplify nested if_then_else

Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com>
---
 src/arith/rewrite_simplify.cc                 | 20 +++++++++++++++++++
 .../unittest/test_arith_rewrite_simplify.py   | 10 ++++++++++
 2 files changed, 30 insertions(+)

diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index e3e9db62d0bd..2f7b88dfc508 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -1654,6 +1654,26 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) {
     }
   }
 
+  if (op->op.same_as(tir::builtin::if_then_else())) {
+    // Simplify nested if_then_else
+    // if (cond) { if (inner_cond) { inner_then_expr } else { inner_else_expr } } else { else_expr }
+    // => if (cond && inner_cond) { inner_then_expr } else { else_expr }
+    const PrimExpr& cond = op->args[0];
+    const PrimExpr& then_expr = op->args[1];
+    const PrimExpr& else_expr = op->args[2];
+    const CallNode* inner_call = then_expr.as<CallNode>();
+    if (inner_call != nullptr && inner_call->op.same_as(tir::builtin::if_then_else())) {
+      const PrimExpr& inner_cond = inner_call->args[0];
+      const PrimExpr& inner_then_expr = inner_call->args[1];
+      const PrimExpr& inner_else_expr = inner_call->args[2];
+      // Only check constant cases to avoid recursion
+      if (is_const_number(inner_else_expr) && is_const_number(else_expr) &&
+          analyzer_->CanProve(inner_else_expr == else_expr)) {
+        return if_then_else(cond && inner_cond, inner_then_expr, else_expr);
+      }
+    }
+  }
+
   return ret;
 }
 
diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py
index c880f90ddffe..77751b160177 100644
--- a/tests/python/unittest/test_arith_rewrite_simplify.py
+++ b/tests/python/unittest/test_arith_rewrite_simplify.py
@@ -992,5 +992,15 @@ def test_sub_bufferload():
     ck.verify(expr, 0.0)
 
 
+def test_if_then_else_simplify():
+    ck = RewriteChecker()
+    x = te.var("x", "int32")
+    z = tvm.tir.if_then_else(x < 5, tvm.tir.if_then_else(x > 1, 1, 0), 0)
+    ck.verify(z, tvm.tir.if_then_else(tvm.tir.And(tvm.tir.LT(x, 5), tvm.tir.LT(1, x)), 1, 0))
+
+    z = tvm.tir.if_then_else(x > 2, tvm.tir.if_then_else(x > 1, 1, 0), 0)
+    ck.verify(z, tvm.tir.if_then_else(tvm.tir.LT(2, x), 1, 0))
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From f5517d4a08342e66fc70ba557930abc83f5cb20b Mon Sep 17 00:00:00 2001
From: Philipp van Kempen <philipp.van-kempen@tum.de>
Date: Thu, 15 Sep 2022 23:02:22 +0200
Subject: [PATCH 172/704] [Docker][CI][RISC-V] Build riscv-isa-sim (spike) in
 ci_riscv Docker image to enable RISC-V unit testing (#12534)

* Remove CSI-NN from ci_cortexm docker image

* [Docker] [RISC-V] Split up CSI-NN2 installation script into several files

[Docker] [RISC-V] move gcc toolchain installation out of csi-nn2 script

[Docker] [RISC-V] move qemu installation out of csi-nn2 script

* use updated version of qemu

* [Docker] [RISC-V] Install newlib (baremetal) gcc toolchain

* [Docker] [RISC-V] Install spike simulator

* [Docker] move initialization of timezone and DEBIAN_FRONTEND to ubuntu_install_core.sh script
---
 docker/Dockerfile.ci_cortexm                  |  6 --
 docker/Dockerfile.ci_riscv                    | 24 +++++-
 .../ubuntu_download_csinn2_compute_lib.sh     | 20 ++---
 .../ubuntu_download_xuantie_gcc_linux.sh      | 57 +++++++++++++
 .../ubuntu_download_xuantie_gcc_newlib.sh     | 57 +++++++++++++
 .../install/ubuntu_download_xuantie_qemu.sh   | 56 +++++++++++++
 docker/install/ubuntu_install_arduino.sh      |  1 -
 docker/install/ubuntu_install_core.sh         |  5 ++
 docker/install/ubuntu_install_spike_sim.sh    | 81 +++++++++++++++++++
 docker/install/ubuntu_install_zephyr.sh       |  5 --
 10 files changed, 288 insertions(+), 24 deletions(-)
 create mode 100755 docker/install/ubuntu_download_xuantie_gcc_linux.sh
 create mode 100755 docker/install/ubuntu_download_xuantie_gcc_newlib.sh
 create mode 100755 docker/install/ubuntu_download_xuantie_qemu.sh
 create mode 100755 docker/install/ubuntu_install_spike_sim.sh

diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm
index 6ca2f2f40b75..a62ea059fa8c 100644
--- a/docker/Dockerfile.ci_cortexm
+++ b/docker/Dockerfile.ci_cortexm
@@ -110,11 +110,5 @@ RUN bash /install/ubuntu_install_ethosu_driver_stack.sh
 COPY install/ubuntu_install_vela.sh /install/ubuntu_install_vela.sh
 RUN bash /install/ubuntu_install_vela.sh
 
-#Install CSI-NN2
-COPY install/ubuntu_download_csinn2_compute_lib.sh /install/ubuntu_download_csinn2_compute_lib.sh
-RUN bash /install/ubuntu_download_csinn2_compute_lib.sh
-
 # Update PATH
 ENV PATH /opt/arm/gcc-arm-none-eabi/bin:/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4:$PATH
-ENV PATH /opt/csi-nn2/tools/gcc-toolchain/bin:$PATH
-ENV PATH /opt/csi-nn2/tools/qemu/bin:$PATH
diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv
index 9b956d55ddaa..5c597135ee41 100644
--- a/docker/Dockerfile.ci_riscv
+++ b/docker/Dockerfile.ci_riscv
@@ -84,10 +84,28 @@ COPY install/ubuntu_install_zephyr_sdk.sh /install/ubuntu_install_zephyr_sdk.sh
 RUN bash /install/ubuntu_install_zephyr.sh
 ENV ZEPHYR_BASE=/opt/zephyrproject/zephyr
 
-#Install CSI-NN2
+# Download RISC-V gcc toolchain (linux)
+COPY install/ubuntu_download_xuantie_gcc_linux.sh /install/ubuntu_download_xuantie_gcc_linux.sh
+RUN bash /install/ubuntu_download_xuantie_gcc_linux.sh /opt/riscv/riscv64-unknown-linux-gnu
+
+# Download RISC-V gcc toolchain (baremetal)
+COPY install/ubuntu_download_xuantie_gcc_newlib.sh /install/ubuntu_download_xuantie_gcc_newlib.sh
+RUN bash /install/ubuntu_download_xuantie_gcc_newlib.sh /opt/riscv/riscv64-unknown-elf
+
+# Install RISC-V QEMU
+COPY install/ubuntu_download_xuantie_qemu.sh /install/ubuntu_download_xuantie_qemu.sh
+RUN bash /install/ubuntu_download_xuantie_qemu.sh /opt/riscv/qemu/
+
+# Install CSI-NN2
 COPY install/ubuntu_download_csinn2_compute_lib.sh /install/ubuntu_download_csinn2_compute_lib.sh
 RUN bash /install/ubuntu_download_csinn2_compute_lib.sh
 
+# Build spike (riscv-isa-sim) and proxy kernel (pk)
+COPY install/ubuntu_install_spike_sim.sh /install/ubuntu_install_spike_sim.sh
+RUN bash /install/ubuntu_install_spike_sim.sh /opt/riscv/riscv64-unknown-elf/
+
 # Update PATH
-ENV PATH /opt/csi-nn2/tools/gcc-toolchain/bin:$PATH
-ENV PATH /opt/csi-nn2/tools/qemu/bin:$PATH
+ENV PATH /opt/riscv/riscv64-unknown-linux-gnu/bin:$PATH
+ENV PATH /opt/riscv/riscv64-unknown-elf/bin:$PATH
+ENV PATH /opt/riscv/qemu/bin:$PATH
+ENV PATH /opt/riscv/spike/bin:$PATH
diff --git a/docker/install/ubuntu_download_csinn2_compute_lib.sh b/docker/install/ubuntu_download_csinn2_compute_lib.sh
index 568ee4146084..4e483d173cbd 100755
--- a/docker/install/ubuntu_download_csinn2_compute_lib.sh
+++ b/docker/install/ubuntu_download_csinn2_compute_lib.sh
@@ -23,19 +23,21 @@ install_path="/opt/csi-nn2"
 # Clone CSI-NN2 Compute Library source code
 git clone --depth 1 --branch 1.12.2 https://github.com/T-head-Semi/csi-nn2.git ${install_path}
 
-# download cross-compiler when not building natively.
-# riscv gcc toolchain will be downloaded to "/path/csi-nn2/tools/gcc-toolchain".
+# The toolchain is downloaded in: ubuntu_download_xuantie_gcc_linux.sh
 cd ${install_path}
-./script/download_toolchain.sh
-
-# download custom QEMU to "/path/csi-nn2/tools/qemu".
-./script/download_qemu.sh
 
 # build csinn2 lib for x86 and c906
 # lib will be installed in /path/csi-nn2/install
+
 # for x86
-make -j4; cd x86_build; make install; cd -
+make -j4
+cd x86_build
+make install
+cd -
+
 # for c906
 mkdir -p riscv_build; cd riscv_build
-cmake ../ -DBUILD_RISCV=ON; make -j4; make install; cd -
-
+export RISCV_GNU_GCC_PATH=/opt/riscv/riscv64-unknown-linux-gnu/bin
+cmake ../ -DBUILD_RISCV=ON
+make -j4
+make install; cd -
diff --git a/docker/install/ubuntu_download_xuantie_gcc_linux.sh b/docker/install/ubuntu_download_xuantie_gcc_linux.sh
new file mode 100755
index 000000000000..ab782b979785
--- /dev/null
+++ b/docker/install/ubuntu_download_xuantie_gcc_linux.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+ubuntu_install_spike_sim.sh
+
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+function show_usage() {
+    cat <<EOF
+Usage: docker/install/ubuntu_download_xuantie_gcc_linux.sh <INSTALLATION_PATH>
+INSTALLATION_PATH is the installation path for the toolchain.
+EOF
+}
+
+if [ "$#" -lt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then
+    show_usage
+    exit 1
+fi
+
+INSTALLATION_PATH=$1
+shift
+
+# Create installation path directory
+mkdir -p "${INSTALLATION_PATH}"
+
+# Download and extract RISC-V gcc
+RISCV_GCC_VERSION="2.6.0"
+RISCV_GCC_ID="1659325511536"
+RISCV_GCC_KERNEL_VERSION="5.10.4"
+RISCV_GCC_DATE="20220715"
+RISCV_GCC_ARCH="x86_64"
+RISCV_GCC_BASE="Xuantie-900-gcc-linux-${RISCV_GCC_KERNEL_VERSION}-glibc-${RISCV_GCC_ARCH}-V${RISCV_GCC_VERSION}-${RISCV_GCC_DATE}"
+RISCV_GCC_EXT="tar.gz"
+RISCV_GCC_URL="https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//${RISCV_GCC_ID}/${RISCV_GCC_BASE}.${RISCV_GCC_EXT}"
+DOWNLOAD_PATH="/tmp/${RISCV_GCC_BASE}.${RISCV_GCC_EXT}"
+
+wget ${RISCV_GCC_URL} -O "${DOWNLOAD_PATH}"
+tar -xf "${DOWNLOAD_PATH}" -C "${INSTALLATION_PATH}" --strip-components=1
+rm $DOWNLOAD_PATH
+echo "SUCCESS"
diff --git a/docker/install/ubuntu_download_xuantie_gcc_newlib.sh b/docker/install/ubuntu_download_xuantie_gcc_newlib.sh
new file mode 100755
index 000000000000..203bc1a2f076
--- /dev/null
+++ b/docker/install/ubuntu_download_xuantie_gcc_newlib.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+ubuntu_install_spike_sim.sh
+
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+function show_usage() {
+    cat <<EOF
+Usage: docker/install/ubuntu_download_xuantie_gcc_newlib.sh <INSTALLATION_PATH>
+INSTALLATION_PATH is the installation path for the toolchain.
+EOF
+}
+
+if [ "$#" -lt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then
+    show_usage
+    exit 1
+fi
+
+INSTALLATION_PATH=$1
+shift
+
+# Create installation path directory
+mkdir -p "${INSTALLATION_PATH}"
+
+# Download and extract RISC-V gcc
+RISCV_GCC_VERSION="2.6.0"
+RISCV_GCC_ID="1659318201401"
+RISCV_GCC_DATE="20220715"
+RISCV_GCC_ARCH="x86_64"
+RISCV_GCC_BASE="Xuantie-900-gcc-elf-newlib-${RISCV_GCC_ARCH}-V${RISCV_GCC_VERSION}-${RISCV_GCC_DATE}"
+RISCV_GCC_EXT="tar.gz"
+# extra forward slash is required somehow
+RISCV_GCC_URL="https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//${RISCV_GCC_ID}/${RISCV_GCC_BASE}.${RISCV_GCC_EXT}"
+DOWNLOAD_PATH="/tmp/${RISCV_GCC_BASE}.${RISCV_GCC_EXT}"
+
+wget ${RISCV_GCC_URL} -O "${DOWNLOAD_PATH}"
+tar -xf "${DOWNLOAD_PATH}" -C "${INSTALLATION_PATH}" --strip-components=1
+rm $DOWNLOAD_PATH
+echo "SUCCESS"
diff --git a/docker/install/ubuntu_download_xuantie_qemu.sh b/docker/install/ubuntu_download_xuantie_qemu.sh
new file mode 100755
index 000000000000..56f0f3d0a34f
--- /dev/null
+++ b/docker/install/ubuntu_download_xuantie_qemu.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+function show_usage() {
+    cat <<EOF
+Usage: docker/install/ubuntu_download_xuantie_qemu.sh <INSTALLATION_PATH>
+INSTALLATION_PATH is the installation path for the tool.
+EOF
+}
+
+if [ "$#" -lt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then
+    show_usage
+    exit 1
+fi
+
+INSTALLATION_PATH=$1
+
+# Create installation path directory
+mkdir -p "${INSTALLATION_PATH}"
+
+QEMU_DATE="20220623-0307"
+QEMU_SOURCE_ID="1655972947885"
+QEMU_ARCH="x86_64-Ubuntu-18.04"
+QEMU_BASE="xuantie-qemu-${QEMU_ARCH}-${QEMU_DATE}"
+QEMU_EXT="tar.gz"
+QEMU_URL="https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//${QEMU_SOURCE_ID}/${QEMU_BASE}.${QEMU_EXT}"
+DOWNLOAD_PATH="/tmp/${QEMU_BASE}.${QEMU_EXT}"
+
+wget ${QEMU_URL} -O "${DOWNLOAD_PATH}"
+tar -xf "${DOWNLOAD_PATH}" -C "${INSTALLATION_PATH}" --strip-components=1
+rm $DOWNLOAD_PATH
+
+# Remove non riscv64 binaries? (TODO)
+# ls $INSTALLATION_PATH/bin | grep -v qemu-riscv64 | xargs -i rm -rf $INSTALLATION_PATH/bin/{}
+# ls $INSTALLATION_PATH | grep -v bin | xargs -i rm -rf $INSTALLATION_PATH/{}
+
+echo "SUCCESS"
diff --git a/docker/install/ubuntu_install_arduino.sh b/docker/install/ubuntu_install_arduino.sh
index 107b452f8d3f..15dbd20fa758 100755
--- a/docker/install/ubuntu_install_arduino.sh
+++ b/docker/install/ubuntu_install_arduino.sh
@@ -20,7 +20,6 @@ set -e
 set -u
 set -o pipefail
 
-export DEBIAN_FRONTEND=noninteractive
 apt-install-and-clear -y ca-certificates
 
 ARDUINO_CLI_VERSION="0.21.1"
diff --git a/docker/install/ubuntu_install_core.sh b/docker/install/ubuntu_install_core.sh
index d20eeeba6998..a27c45433115 100755
--- a/docker/install/ubuntu_install_core.sh
+++ b/docker/install/ubuntu_install_core.sh
@@ -22,6 +22,11 @@ set -u
 set -x
 set -o pipefail
 
+export DEBIAN_FRONTEND=noninteractive
+export TZ=Etc/UTC
+ln -snf /usr/share/zoneinfo/$TZ /etc/localtime
+echo $TZ > /etc/timezone
+
 # install libraries for building c++ core on ubuntu
 apt-get update && apt-install-and-clear -y --no-install-recommends \
     apt-transport-https \
diff --git a/docker/install/ubuntu_install_spike_sim.sh b/docker/install/ubuntu_install_spike_sim.sh
new file mode 100755
index 000000000000..24a11d758c38
--- /dev/null
+++ b/docker/install/ubuntu_install_spike_sim.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+ubuntu_install_spike_sim.sh
+
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+set -x
+
+function show_usage() {
+    cat <<EOF
+Usage: docker/install/ubuntu_install_spike_sim.sh <RISCV_PATH>
+RISCV_PATH is the installation path of the risc-v gcc.
+EOF
+}
+
+if [ "$#" -lt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then
+    show_usage
+    exit -1
+fi
+
+export RISCV=$1
+export PATH=$RISCV/bin:$PATH
+shift
+
+sudo apt-install-and-clear -y --no-install-recommends device-tree-compiler
+
+# Install spike
+mkdir /tmp/spike
+cd /tmp/spike
+# TODO: freeze version?
+git clone https://github.com/riscv/riscv-isa-sim.git
+pushd riscv-isa-sim
+mkdir build
+cd build
+../configure --prefix=$RISCV --with-isa=RV32IMAC
+make -j`nproc`
+make install
+popd
+
+# Install pk
+git clone https://github.com/riscv/riscv-pk.git
+pushd riscv-pk
+
+# rv32imac
+mkdir build
+pushd build
+../configure --prefix=`pwd`/install --host=riscv64-unknown-elf --with-arch=rv32imac
+make -j`nproc`
+make install
+cp ./pk $RISCV/riscv64-unknown-elf/bin/pk
+popd
+
+git status
+
+# rv64imac
+mkdir build64
+pushd build64
+../configure --prefix=`pwd`/install --host=riscv64-unknown-elf --with-arch=rv64imac
+make -j`nproc`
+make install
+cp ./pk $RISCV/riscv64-unknown-elf/bin/pk64
+
+# cleanup
+rm -rf /tmp/spike
diff --git a/docker/install/ubuntu_install_zephyr.sh b/docker/install/ubuntu_install_zephyr.sh
index f955a7ff9b19..552ad2626029 100755
--- a/docker/install/ubuntu_install_zephyr.sh
+++ b/docker/install/ubuntu_install_zephyr.sh
@@ -21,11 +21,6 @@ set -u
 set -o pipefail
 set -x
 
-export DEBIAN_FRONTEND=noninteractive
-export TZ=Etc/UTC
-sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime
-echo $TZ > /etc/timezone
-
 sudo apt-install-and-clear -y --no-install-recommends \
      libsdl2-dev ca-certificates gnupg software-properties-common wget \
      git cmake ninja-build gperf \

From c9002509f6a16ea04e711d151973fe4bcce6a365 Mon Sep 17 00:00:00 2001
From: Christopher Sidebottom <chris.sidebottom@arm.com>
Date: Thu, 15 Sep 2022 22:03:21 +0100
Subject: [PATCH 173/704] [Target] Print deprecation warning before
 canonicalisation in build module (#12747)

Hopefully fixes #12742, as the warning should only be printed when a user passes `target_host`, in the current case if the user passes `None` as `target_host` it'll be processed by `canon_target_map_and_host` which seems to always produce a `target_host` and thus triggering the warning despite the user doing nothing wrong.
---
 python/tvm/driver/build_module.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py
index 54db421e1be0..9389e7fbee60 100644
--- a/python/tvm/driver/build_module.py
+++ b/python/tvm/driver/build_module.py
@@ -258,8 +258,6 @@ def build(
             raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.")
         annotated_mods[tar] = mod.with_attr("runtime", runtime)
 
-    annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods, target_host)
-
     # TODO(mbs): Both CompilationConfig and TIRToRuntime implement the same host target
     #  defaulting logic, but there's currently no way to get back the decided host.
     if target_host is not None:
@@ -267,6 +265,8 @@ def build(
             "target_host parameter is going to be deprecated. "
             "Please pass in tvm.target.Target(target, host=target_host) instead."
         )
+
+    annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods, target_host)
     if not target_host:
         for tar, mod in annotated_mods.items():
             device_type = ndarray.device(tar.kind.name, 0).device_type

From c00ce572c299a1cba6aede525be738c617e15325 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Thu, 15 Sep 2022 15:20:24 -0700
Subject: [PATCH 174/704] [ci] Add retries to docker push (#12773)

This should mitigate failures like in
https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4274/pipeline.
This also moves the `retry` function to a script now that we have
PR #12604.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                      | 1555 +++---------------------------
 ci/jenkins/Build.groovy.j2       |   18 +-
 ci/jenkins/Deploy.groovy.j2      |    6 +-
 ci/jenkins/DockerBuild.groovy.j2 |    6 +-
 ci/jenkins/Prepare.groovy.j2     |    6 +-
 ci/jenkins/Test.groovy.j2        |    2 +-
 ci/jenkins/macros.j2             |   34 +-
 ci/scripts/retry.sh              |   39 +
 8 files changed, 223 insertions(+), 1443 deletions(-)
 create mode 100644 ci/scripts/retry.sh

diff --git a/Jenkinsfile b/Jenkinsfile
index ed1cf4b09e6e..5835100fde3e 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-09-01T11:52:42.195970
+// Generated at 2022-09-14T11:22:31.582192
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -145,26 +145,7 @@ def init_git() {
   sh(
     script: """
       set -eux
-      retry() {
-  local max_retries=\$1
-  shift
-  local n=0
-  local backoff_max=30
-  until [ "\$n" -ge \$max_retries ]
-  do
-      "\$@" && break
-      n=\$((n+1))
-      if [ "\$n" -eq \$max_retries ]; then
-          echo "failed to update after attempt \$n / \$max_retries, giving up"
-          exit 1
-      fi
-
-      WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-      echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-      sleep \$WAIT
-  done
-}
-
+      . ci/scripts/retry.sh
       retry 3 timeout 5m git submodule update --init -f --jobs 0
     """,
     label: 'Update git submodules',
@@ -196,27 +177,8 @@ def docker_init(image) {
     sh(
       script: """
       set -eux
-      retry() {
-  local max_retries=\$1
-  shift
-  local n=0
-  local backoff_max=30
-  until [ "\$n" -ge \$max_retries ]
-  do
-      "\$@" && break
-      n=\$((n+1))
-      if [ "\$n" -eq \$max_retries ]; then
-          echo "failed to update after attempt \$n / \$max_retries, giving up"
-          exit 1
-      fi
-
-      WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-      echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-      sleep \$WAIT
-  done
-}
-
-      retry 3 docker pull ${image}
+      . ci/scripts/retry.sh
+      retry 5 docker pull ${image}
       """,
       label: 'Pull docker image',
     )
@@ -453,8 +415,9 @@ def ecr_push(full_name) {
       sh(
         script: """
           set -x
+          . ci/scripts/retry.sh
           docker tag ${full_name} \$AWS_ECR_REPO/${full_name}
-          docker push \$AWS_ECR_REPO/${full_name}
+          retry 5 docker push \$AWS_ECR_REPO/${full_name}
         """,
         label: 'Upload image to ECR'
       )
@@ -495,7 +458,8 @@ def ecr_pull(full_name) {
       sh(
         script: """
           set -eux
-          docker pull ${full_name}
+          . ci/scripts/retry.sh
+          retry 5 docker pull ${full_name}
         """,
         label: 'Pull image from ECR'
       )
@@ -649,8 +613,8 @@ def lint() {
   'Lint 1 of 2': {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") {
-        docker_init(ci_lint)
         init_git()
+        docker_init(ci_lint)
         timeout(time: max_time, unit: 'MINUTES') {
           withEnv([
             'TVM_NUM_SHARDS=2',
@@ -669,8 +633,8 @@ def lint() {
   'Lint 2 of 2': {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") {
-        docker_init(ci_lint)
         init_git()
+        docker_init(ci_lint)
         timeout(time: max_time, unit: 'MINUTES') {
           withEnv([
             'TVM_NUM_SHARDS=2',
@@ -771,33 +735,14 @@ stage('Build') {
     if (!skip_ci) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-gpu") {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
           make("${ci_gpu} --no-gpu", 'build', '-j2')
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu/build/libtvm.so
               md5sum build/libvta_fsim.so
@@ -818,26 +763,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu2/build/libtvm.so
               md5sum build/libvta_fsim.so
@@ -858,8 +784,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cpu") {
-          docker_init(ci_cpu)
           init_git()
+          docker_init(ci_cpu)
           sh (
             script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
             label: 'Create CPU cmake config',
@@ -868,26 +794,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               md5sum build/libvta_tsim.so
               retry 3 aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/cpu/build/libvta_tsim.so
               md5sum build/libtvm.so
@@ -918,8 +825,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cpu-minimal") {
-          docker_init(ci_minimal)
           init_git()
+          docker_init(ci_minimal)
           sh (
             script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
             label: 'Create CPU minimal cmake config',
@@ -928,26 +835,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cpu-minimal/build/libtvm.so
               md5sum build/libtvm_runtime.so
@@ -968,8 +856,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-wasm") {
-          docker_init(ci_wasm)
           init_git()
+          docker_init(ci_wasm)
           sh (
             script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
             label: 'Create WASM cmake config',
@@ -993,8 +881,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-i386") {
-          docker_init(ci_i386)
           init_git()
+          docker_init(ci_i386)
           sh (
             script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
             label: 'Create i386 cmake config',
@@ -1003,26 +891,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               md5sum build/libvta_tsim.so
               retry 3 aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/i386/build/libvta_tsim.so
               md5sum build/libtvm.so
@@ -1047,8 +916,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('ARM-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-arm") {
-          docker_init(ci_arm)
           init_git()
+          docker_init(ci_arm)
           sh (
             script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
             label: 'Create ARM cmake config',
@@ -1057,26 +926,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/arm/build/libtvm.so
               md5sum build/libvta_fsim.so
@@ -1099,8 +949,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cortexm") {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           sh (
             script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
             label: 'Create Cortex-M cmake config',
@@ -1109,26 +959,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cortexm/build/libtvm.so
               md5sum build/libtvm_runtime.so
@@ -1150,8 +981,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-hexagon") {
-          docker_init(ci_hexagon)
           init_git()
+          docker_init(ci_hexagon)
           sh (
             script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
             label: 'Create Hexagon cmake config',
@@ -1164,26 +995,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/hexagon/build/libtvm.so
               md5sum build/libtvm_runtime.so
@@ -1205,8 +1017,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-riscv") {
-          docker_init(ci_riscv)
           init_git()
+          docker_init(ci_riscv)
           sh (
             script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
             label: 'Create RISC-V cmake config',
@@ -1215,26 +1027,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/riscv/build/libtvm.so
               md5sum build/libtvm_runtime.so
@@ -1266,8 +1059,8 @@ def shard_run_unittest_GPU_1_of_3() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -1278,26 +1071,7 @@ def shard_run_unittest_GPU_1_of_3() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libvta_fsim.so build/libvta_fsim.so
@@ -1315,26 +1089,7 @@ def shard_run_unittest_GPU_1_of_3() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -1382,8 +1137,8 @@ def shard_run_unittest_GPU_2_of_3() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -1394,26 +1149,7 @@ def shard_run_unittest_GPU_2_of_3() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -1464,8 +1200,8 @@ def shard_run_unittest_GPU_3_of_3() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -1476,26 +1212,7 @@ def shard_run_unittest_GPU_3_of_3() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -1543,8 +1260,8 @@ def shard_run_integration_CPU_1_of_4() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
         try {
-          docker_init(ci_cpu)
           init_git()
+          docker_init(ci_cpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
@@ -1555,26 +1272,7 @@ def shard_run_integration_CPU_1_of_4() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
                           md5sum build/libvta_tsim.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
@@ -1619,8 +1317,8 @@ def shard_run_integration_CPU_2_of_4() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
         try {
-          docker_init(ci_cpu)
           init_git()
+          docker_init(ci_cpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
@@ -1631,26 +1329,7 @@ def shard_run_integration_CPU_2_of_4() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
                           md5sum build/libvta_tsim.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
@@ -1695,8 +1374,8 @@ def shard_run_integration_CPU_3_of_4() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
         try {
-          docker_init(ci_cpu)
           init_git()
+          docker_init(ci_cpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
@@ -1707,26 +1386,7 @@ def shard_run_integration_CPU_3_of_4() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
                           md5sum build/libvta_tsim.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
@@ -1771,8 +1431,8 @@ def shard_run_integration_CPU_4_of_4() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
         try {
-          docker_init(ci_cpu)
           init_git()
+          docker_init(ci_cpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cpu',
@@ -1783,26 +1443,7 @@ def shard_run_integration_CPU_4_of_4() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
                           md5sum build/libvta_tsim.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
@@ -1848,8 +1489,8 @@ def shard_run_python_i386_1_of_3() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
         try {
-          docker_init(ci_i386)
           init_git()
+          docker_init(ci_i386)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=i386',
@@ -1860,26 +1501,7 @@ def shard_run_python_i386_1_of_3() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
@@ -1924,8 +1546,8 @@ def shard_run_python_i386_2_of_3() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
         try {
-          docker_init(ci_i386)
           init_git()
+          docker_init(ci_i386)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=i386',
@@ -1936,26 +1558,7 @@ def shard_run_python_i386_2_of_3() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
@@ -2000,8 +1603,8 @@ def shard_run_python_i386_3_of_3() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
         try {
-          docker_init(ci_i386)
           init_git()
+          docker_init(ci_i386)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=i386',
@@ -2012,26 +1615,7 @@ def shard_run_python_i386_3_of_3() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
@@ -2076,8 +1660,8 @@ def shard_run_test_Hexagon_1_of_8() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_hexagon)
           init_git()
+          docker_init(ci_hexagon)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
@@ -2088,26 +1672,7 @@ def shard_run_test_Hexagon_1_of_8() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2151,8 +1716,8 @@ def shard_run_test_Hexagon_2_of_8() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_hexagon)
           init_git()
+          docker_init(ci_hexagon)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
@@ -2163,26 +1728,7 @@ def shard_run_test_Hexagon_2_of_8() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2225,8 +1771,8 @@ def shard_run_test_Hexagon_3_of_8() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_hexagon)
           init_git()
+          docker_init(ci_hexagon)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
@@ -2237,26 +1783,7 @@ def shard_run_test_Hexagon_3_of_8() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2299,8 +1826,8 @@ def shard_run_test_Hexagon_4_of_8() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_hexagon)
           init_git()
+          docker_init(ci_hexagon)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
@@ -2311,26 +1838,7 @@ def shard_run_test_Hexagon_4_of_8() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2373,8 +1881,8 @@ def shard_run_test_Hexagon_5_of_8() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_hexagon)
           init_git()
+          docker_init(ci_hexagon)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
@@ -2385,26 +1893,7 @@ def shard_run_test_Hexagon_5_of_8() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2447,8 +1936,8 @@ def shard_run_test_Hexagon_6_of_8() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_hexagon)
           init_git()
+          docker_init(ci_hexagon)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
@@ -2459,26 +1948,7 @@ def shard_run_test_Hexagon_6_of_8() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2521,8 +1991,8 @@ def shard_run_test_Hexagon_7_of_8() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_hexagon)
           init_git()
+          docker_init(ci_hexagon)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
@@ -2533,26 +2003,7 @@ def shard_run_test_Hexagon_7_of_8() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2595,8 +2046,8 @@ def shard_run_test_Hexagon_8_of_8() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
         try {
-          docker_init(ci_hexagon)
           init_git()
+          docker_init(ci_hexagon)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=hexagon',
@@ -2607,26 +2058,7 @@ def shard_run_test_Hexagon_8_of_8() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2670,8 +2102,8 @@ def shard_run_integration_aarch64_1_of_4() {
     node('ARM-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_arm)
           init_git()
+          docker_init(ci_arm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
@@ -2682,26 +2114,7 @@ def shard_run_integration_aarch64_1_of_4() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -2745,8 +2158,8 @@ def shard_run_integration_aarch64_2_of_4() {
     node('ARM-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_arm)
           init_git()
+          docker_init(ci_arm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
@@ -2757,26 +2170,7 @@ def shard_run_integration_aarch64_2_of_4() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -2820,8 +2214,8 @@ def shard_run_integration_aarch64_3_of_4() {
     node('ARM-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_arm)
           init_git()
+          docker_init(ci_arm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
@@ -2832,26 +2226,7 @@ def shard_run_integration_aarch64_3_of_4() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -2895,8 +2270,8 @@ def shard_run_integration_aarch64_4_of_4() {
     node('ARM-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_arm)
           init_git()
+          docker_init(ci_arm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
@@ -2907,26 +2282,7 @@ def shard_run_integration_aarch64_4_of_4() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -2971,8 +2327,8 @@ def shard_run_topi_GPU_1_of_3() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -2983,26 +2339,7 @@ def shard_run_topi_GPU_1_of_3() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -3045,8 +2382,8 @@ def shard_run_topi_GPU_2_of_3() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -3057,26 +2394,7 @@ def shard_run_topi_GPU_2_of_3() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -3119,8 +2437,8 @@ def shard_run_topi_GPU_3_of_3() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -3131,26 +2449,7 @@ def shard_run_topi_GPU_3_of_3() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -3194,8 +2493,8 @@ def shard_run_frontend_GPU_1_of_6() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -3206,26 +2505,7 @@ def shard_run_frontend_GPU_1_of_6() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -3268,8 +2548,8 @@ def shard_run_frontend_GPU_2_of_6() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -3280,26 +2560,7 @@ def shard_run_frontend_GPU_2_of_6() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -3342,8 +2603,8 @@ def shard_run_frontend_GPU_3_of_6() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -3354,26 +2615,7 @@ def shard_run_frontend_GPU_3_of_6() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -3416,8 +2658,8 @@ def shard_run_frontend_GPU_4_of_6() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -3428,26 +2670,7 @@ def shard_run_frontend_GPU_4_of_6() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -3490,8 +2713,8 @@ def shard_run_frontend_GPU_5_of_6() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -3502,26 +2725,7 @@ def shard_run_frontend_GPU_5_of_6() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -3564,8 +2768,8 @@ def shard_run_frontend_GPU_6_of_6() {
     node('GPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
         try {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=gpu',
@@ -3576,26 +2780,7 @@ def shard_run_frontend_GPU_6_of_6() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -3639,8 +2824,8 @@ def shard_run_topi_aarch64_1_of_2() {
     node('ARM-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_arm)
           init_git()
+          docker_init(ci_arm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
@@ -3651,26 +2836,7 @@ def shard_run_topi_aarch64_1_of_2() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -3718,8 +2884,8 @@ def shard_run_topi_aarch64_2_of_2() {
     node('ARM-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
         try {
-          docker_init(ci_arm)
           init_git()
+          docker_init(ci_arm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
@@ -3730,26 +2896,7 @@ def shard_run_topi_aarch64_2_of_2() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -3797,8 +2944,8 @@ def shard_run_frontend_aarch64_1_of_2() {
     node('ARM-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") {
         try {
-          docker_init(ci_arm)
           init_git()
+          docker_init(ci_arm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
@@ -3809,26 +2956,7 @@ def shard_run_frontend_aarch64_1_of_2() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -3871,8 +2999,8 @@ def shard_run_frontend_aarch64_2_of_2() {
     node('ARM-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") {
         try {
-          docker_init(ci_arm)
           init_git()
+          docker_init(ci_arm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=arm',
@@ -3883,26 +3011,7 @@ def shard_run_frontend_aarch64_2_of_2() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -3946,8 +3055,8 @@ def shard_run_test_Cortex_M_1_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -3958,26 +3067,7 @@ def shard_run_test_Cortex_M_1_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4025,8 +3115,8 @@ def shard_run_test_Cortex_M_2_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4037,26 +3127,7 @@ def shard_run_test_Cortex_M_2_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4099,8 +3170,8 @@ def shard_run_test_Cortex_M_3_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4111,26 +3182,7 @@ def shard_run_test_Cortex_M_3_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4173,8 +3225,8 @@ def shard_run_test_Cortex_M_4_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4185,26 +3237,7 @@ def shard_run_test_Cortex_M_4_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4247,8 +3280,8 @@ def shard_run_test_Cortex_M_5_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4259,26 +3292,7 @@ def shard_run_test_Cortex_M_5_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4321,8 +3335,8 @@ def shard_run_test_Cortex_M_6_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4333,26 +3347,7 @@ def shard_run_test_Cortex_M_6_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4395,8 +3390,8 @@ def shard_run_test_Cortex_M_7_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4407,26 +3402,7 @@ def shard_run_test_Cortex_M_7_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4469,8 +3445,8 @@ def shard_run_test_Cortex_M_8_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4481,26 +3457,7 @@ def shard_run_test_Cortex_M_8_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4543,8 +3500,8 @@ def shard_run_test_Cortex_M_9_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4555,26 +3512,7 @@ def shard_run_test_Cortex_M_9_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4617,8 +3555,8 @@ def shard_run_test_Cortex_M_10_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4629,26 +3567,7 @@ def shard_run_test_Cortex_M_10_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4691,8 +3610,8 @@ def shard_run_test_Cortex_M_11_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4703,26 +3622,7 @@ def shard_run_test_Cortex_M_11_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4765,8 +3665,8 @@ def shard_run_test_Cortex_M_12_of_12() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
         try {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=cortexm',
@@ -4777,26 +3677,7 @@ def shard_run_test_Cortex_M_12_of_12() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4840,8 +3721,8 @@ def shard_run_test_RISC_V_1_of_1() {
     node('CPU-SMALL') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-riscv") {
         try {
-          docker_init(ci_riscv)
           init_git()
+          docker_init(ci_riscv)
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM=riscv',
@@ -4852,26 +3733,7 @@ def shard_run_test_RISC_V_1_of_1() {
               sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/riscv/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/riscv/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4917,32 +3779,13 @@ def run_unittest_minimal() {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu-minimal") {
         timeout(time: max_time, unit: 'MINUTES') {
           try {
-            docker_init(ci_minimal)
             init_git()
+            docker_init(ci_minimal)
             withEnv(['PLATFORM=minimal'], {
               sh(
                     script: """
                       set -eux
-                      retry() {
-                        local max_retries=\$1
-                        shift
-                        local n=0
-                        local backoff_max=30
-                        until [ "\$n" -ge \$max_retries ]
-                        do
-                            "\$@" && break
-                            n=\$((n+1))
-                            if [ "\$n" -eq \$max_retries ]; then
-                                echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                exit 1
-                            fi
-
-                            WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                            echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                            sleep \$WAIT
-                        done
-                      }
-
+                      . ci/scripts/retry.sh
                       retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu-minimal/build/libtvm.so build/libtvm.so
                       md5sum build/libtvm.so
                       retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu-minimal/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -5134,34 +3977,15 @@ stage('Test') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") {
           timeout(time: max_time, unit: 'MINUTES') {
             try {
-              docker_init(ci_cpu)
               init_git()
+              docker_init(ci_cpu)
               withEnv(['PLATFORM=cpu',
               'TEST_STEP_NAME=unittest: CPU',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
                 sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
                           md5sum build/libvta_tsim.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
@@ -5209,34 +4033,15 @@ stage('Test') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-cpu") {
           timeout(time: max_time, unit: 'MINUTES') {
             try {
-              docker_init(ci_cpu)
               init_git()
+              docker_init(ci_cpu)
               withEnv(['PLATFORM=cpu',
               'TEST_STEP_NAME=frontend: CPU',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
                 sh(
                         script: """
                           set -eux
-                          retry() {
-                            local max_retries=\$1
-                            shift
-                            local n=0
-                            local backoff_max=30
-                            until [ "\$n" -ge \$max_retries ]
-                            do
-                                "\$@" && break
-                                n=\$((n+1))
-                                if [ "\$n" -eq \$max_retries ]; then
-                                    echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                    exit 1
-                                fi
-
-                                WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                                echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                                sleep \$WAIT
-                            done
-                          }
-
+                          . ci/scripts/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -5277,31 +4082,12 @@ stage('Test') {
     if (!skip_ci) {
       node('GPU') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/docs-python-gpu") {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -5326,26 +4112,7 @@ stage('Test') {
           sh(
             script: """
               set -eux
-              retry() {
-                local max_retries=\$1
-                shift
-                local n=0
-                local backoff_max=30
-                until [ "\$n" -ge \$max_retries ]
-                do
-                    "\$@" && break
-                    n=\$((n+1))
-                    if [ "\$n" -eq \$max_retries ]; then
-                        echo "failed to update after attempt \$n / \$max_retries, giving up"
-                        exit 1
-                    fi
-
-                    WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                    echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                    sleep \$WAIT
-                done
-              }
-
+              . ci/scripts/retry.sh
               md5sum docs.tgz
               retry 3 aws s3 cp --no-progress docs.tgz s3://${s3_prefix}/docs/docs.tgz
             """,
@@ -5395,10 +4162,11 @@ def update_docker(ecr_image, hub_image) {
   sh(
     script: """
     set -eux
+    . ci/scripts/retry.sh
     docker tag \
       ${ecr_image} \
       ${hub_image}
-    docker push ${hub_image}
+    retry 5 docker push ${hub_image}
     """,
     label: "Update ${hub_image} on Docker Hub",
   )
@@ -5457,26 +4225,7 @@ def deploy() {
             sh(
                       script: """
                         set -eux
-                        retry() {
-                          local max_retries=\$1
-                          shift
-                          local n=0
-                          local backoff_max=30
-                          until [ "\$n" -ge \$max_retries ]
-                          do
-                              "\$@" && break
-                              n=\$((n+1))
-                              if [ "\$n" -eq \$max_retries ]; then
-                                  echo "failed to update after attempt \$n / \$max_retries, giving up"
-                                  exit 1
-                              fi
-
-                              WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-                              echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-                              sleep \$WAIT
-                          done
-                        }
-
+                        . ci/scripts/retry.sh
                         retry 3 aws s3 cp --no-progress s3://${s3_prefix}/docs/docs.tgz docs.tgz
                         md5sum docs.tgz
                       """,
@@ -5555,9 +4304,10 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
+                              . ci/scripts/retry.sh
                               docker pull tlcpackstaging/ci_arm:${tag}
                               docker tag tlcpackstaging/ci_arm:${tag} tlcpack/ci-arm:${tag}
-                              docker push tlcpack/ci-arm:${tag}
+                              retry 5 docker push tlcpack/ci-arm:${tag}
                             """,
                             label: 'Tag tlcpackstaging/ci_arm image to tlcpack',
                           )
@@ -5568,9 +4318,10 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
+                              . ci/scripts/retry.sh
                               docker pull tlcpackstaging/ci_cortexm:${tag}
                               docker tag tlcpackstaging/ci_cortexm:${tag} tlcpack/ci-cortexm:${tag}
-                              docker push tlcpack/ci-cortexm:${tag}
+                              retry 5 docker push tlcpack/ci-cortexm:${tag}
                             """,
                             label: 'Tag tlcpackstaging/ci_cortexm image to tlcpack',
                           )
@@ -5581,9 +4332,10 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
+                              . ci/scripts/retry.sh
                               docker pull tlcpackstaging/ci_cpu:${tag}
                               docker tag tlcpackstaging/ci_cpu:${tag} tlcpack/ci-cpu:${tag}
-                              docker push tlcpack/ci-cpu:${tag}
+                              retry 5 docker push tlcpack/ci-cpu:${tag}
                             """,
                             label: 'Tag tlcpackstaging/ci_cpu image to tlcpack',
                           )
@@ -5594,9 +4346,10 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
+                              . ci/scripts/retry.sh
                               docker pull tlcpackstaging/ci_gpu:${tag}
                               docker tag tlcpackstaging/ci_gpu:${tag} tlcpack/ci-gpu:${tag}
-                              docker push tlcpack/ci-gpu:${tag}
+                              retry 5 docker push tlcpack/ci-gpu:${tag}
                             """,
                             label: 'Tag tlcpackstaging/ci_gpu image to tlcpack',
                           )
@@ -5607,9 +4360,10 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
+                              . ci/scripts/retry.sh
                               docker pull tlcpackstaging/ci_hexagon:${tag}
                               docker tag tlcpackstaging/ci_hexagon:${tag} tlcpack/ci-hexagon:${tag}
-                              docker push tlcpack/ci-hexagon:${tag}
+                              retry 5 docker push tlcpack/ci-hexagon:${tag}
                             """,
                             label: 'Tag tlcpackstaging/ci_hexagon image to tlcpack',
                           )
@@ -5620,9 +4374,10 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
+                              . ci/scripts/retry.sh
                               docker pull tlcpackstaging/ci_i386:${tag}
                               docker tag tlcpackstaging/ci_i386:${tag} tlcpack/ci-i386:${tag}
-                              docker push tlcpack/ci-i386:${tag}
+                              retry 5 docker push tlcpack/ci-i386:${tag}
                             """,
                             label: 'Tag tlcpackstaging/ci_i386 image to tlcpack',
                           )
@@ -5633,9 +4388,10 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
+                              . ci/scripts/retry.sh
                               docker pull tlcpackstaging/ci_lint:${tag}
                               docker tag tlcpackstaging/ci_lint:${tag} tlcpack/ci-lint:${tag}
-                              docker push tlcpack/ci-lint:${tag}
+                              retry 5 docker push tlcpack/ci-lint:${tag}
                             """,
                             label: 'Tag tlcpackstaging/ci_lint image to tlcpack',
                           )
@@ -5646,9 +4402,10 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
+                              . ci/scripts/retry.sh
                               docker pull tlcpackstaging/ci_minimal:${tag}
                               docker tag tlcpackstaging/ci_minimal:${tag} tlcpack/ci-minimal:${tag}
-                              docker push tlcpack/ci-minimal:${tag}
+                              retry 5 docker push tlcpack/ci-minimal:${tag}
                             """,
                             label: 'Tag tlcpackstaging/ci_minimal image to tlcpack',
                           )
@@ -5659,9 +4416,10 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
+                              . ci/scripts/retry.sh
                               docker pull tlcpackstaging/ci_riscv:${tag}
                               docker tag tlcpackstaging/ci_riscv:${tag} tlcpack/ci-riscv:${tag}
-                              docker push tlcpack/ci-riscv:${tag}
+                              retry 5 docker push tlcpack/ci-riscv:${tag}
                             """,
                             label: 'Tag tlcpackstaging/ci_riscv image to tlcpack',
                           )
@@ -5672,9 +4430,10 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
+                              . ci/scripts/retry.sh
                               docker pull tlcpackstaging/ci_wasm:${tag}
                               docker tag tlcpackstaging/ci_wasm:${tag} tlcpack/ci-wasm:${tag}
-                              docker push tlcpack/ci-wasm:${tag}
+                              retry 5 docker push tlcpack/ci-wasm:${tag}
                             """,
                             label: 'Tag tlcpackstaging/ci_wasm image to tlcpack',
                           )
diff --git a/ci/jenkins/Build.groovy.j2 b/ci/jenkins/Build.groovy.j2
index 51360b7d9c54..a083fe88ad80 100644
--- a/ci/jenkins/Build.groovy.j2
+++ b/ci/jenkins/Build.groovy.j2
@@ -84,8 +84,8 @@ stage('Build') {
     if (!skip_ci) {
       node('CPU-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-gpu') }}) {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
           make("${ci_gpu} --no-gpu", 'build', '-j2')
           {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
@@ -102,8 +102,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-cpu') }}) {
-          docker_init(ci_cpu)
           init_git()
+          docker_init(ci_cpu)
           sh (
             script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
             label: 'Create CPU cmake config',
@@ -126,8 +126,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-cpu-minimal') }}) {
-          docker_init(ci_minimal)
           init_git()
+          docker_init(ci_minimal)
           sh (
             script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
             label: 'Create CPU minimal cmake config',
@@ -144,8 +144,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-wasm') }}) {
-          docker_init(ci_wasm)
           init_git()
+          docker_init(ci_wasm)
           sh (
             script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
             label: 'Create WASM cmake config',
@@ -169,8 +169,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-i386') }}) {
-          docker_init(ci_i386)
           init_git()
+          docker_init(ci_i386)
           sh (
             script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
             label: 'Create i386 cmake config',
@@ -187,8 +187,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('ARM-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-arm') }}) {
-          docker_init(ci_arm)
           init_git()
+          docker_init(ci_arm)
           sh (
             script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
             label: 'Create ARM cmake config',
@@ -205,8 +205,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-cortexm') }}) {
-          docker_init(ci_cortexm)
           init_git()
+          docker_init(ci_cortexm)
           sh (
             script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
             label: 'Create Cortex-M cmake config',
@@ -223,8 +223,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-hexagon') }}) {
-          docker_init(ci_hexagon)
           init_git()
+          docker_init(ci_hexagon)
           sh (
             script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
             label: 'Create Hexagon cmake config',
@@ -245,8 +245,8 @@ stage('Build') {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-riscv') }}) {
-          docker_init(ci_riscv)
           init_git()
+          docker_init(ci_riscv)
           sh (
             script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
             label: 'Create RISC-V cmake config',
diff --git a/ci/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2
index 08516da41b9f..d2ee4360da6b 100644
--- a/ci/jenkins/Deploy.groovy.j2
+++ b/ci/jenkins/Deploy.groovy.j2
@@ -30,10 +30,11 @@ def update_docker(ecr_image, hub_image) {
   sh(
     script: """
     set -eux
+    . ci/scripts/retry.sh
     docker tag \
       ${ecr_image} \
       ${hub_image}
-    docker push ${hub_image}
+    retry 5 docker push ${hub_image}
     """,
     label: "Update ${hub_image} on Docker Hub",
   )
@@ -144,9 +145,10 @@ def deploy() {
                 sh(
                   script: """
                     set -eux
+                    . ci/scripts/retry.sh
                     docker pull tlcpackstaging/{{ image.name }}:${tag}
                     docker tag tlcpackstaging/{{ image.name }}:${tag} tlcpack/{{ image.name.replace("_", "-") }}:${tag}
-                    docker push tlcpack/{{ image.name.replace("_", "-") }}:${tag}
+                    retry 5 docker push tlcpack/{{ image.name.replace("_", "-") }}:${tag}
                   """,
                   label: 'Tag tlcpackstaging/{{ image.name }} image to tlcpack',
                 )
diff --git a/ci/jenkins/DockerBuild.groovy.j2 b/ci/jenkins/DockerBuild.groovy.j2
index 1f3bded86242..5ffbeded80fa 100644
--- a/ci/jenkins/DockerBuild.groovy.j2
+++ b/ci/jenkins/DockerBuild.groovy.j2
@@ -21,8 +21,9 @@ def ecr_push(full_name) {
       sh(
         script: """
           set -x
+          . ci/scripts/retry.sh
           docker tag ${full_name} \$AWS_ECR_REPO/${full_name}
-          docker push \$AWS_ECR_REPO/${full_name}
+          retry 5 docker push \$AWS_ECR_REPO/${full_name}
         """,
         label: 'Upload image to ECR'
       )
@@ -63,7 +64,8 @@ def ecr_pull(full_name) {
       sh(
         script: """
           set -eux
-          docker pull ${full_name}
+          . ci/scripts/retry.sh
+          retry 5 docker pull ${full_name}
         """,
         label: 'Pull image from ECR'
       )
diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2
index 6d0c0ec9c4b6..4464108968de 100644
--- a/ci/jenkins/Prepare.groovy.j2
+++ b/ci/jenkins/Prepare.groovy.j2
@@ -33,7 +33,7 @@ def init_git() {
   sh(
     script: """
       set -eux
-      {{ m.bash_retry() }}
+      . ci/scripts/retry.sh
       retry 3 timeout 5m git submodule update --init -f --jobs 0
     """,
     label: 'Update git submodules',
@@ -65,8 +65,8 @@ def docker_init(image) {
     sh(
       script: """
       set -eux
-      {{ m.bash_retry() }}
-      retry 3 docker pull ${image}
+      . ci/scripts/retry.sh
+      retry 5 docker pull ${image}
       """,
       label: 'Pull docker image',
     )
diff --git a/ci/jenkins/Test.groovy.j2 b/ci/jenkins/Test.groovy.j2
index 4ed149da9be0..52ed742d4cc0 100644
--- a/ci/jenkins/Test.groovy.j2
+++ b/ci/jenkins/Test.groovy.j2
@@ -294,8 +294,8 @@ stage('Test') {
     if (!skip_ci) {
       node('GPU') {
         ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) {
-          docker_init(ci_gpu)
           init_git()
+          docker_init(ci_gpu)
           {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
           add_microtvm_permissions()
           timeout(time: 180, unit: 'MINUTES') {
diff --git a/ci/jenkins/macros.j2 b/ci/jenkins/macros.j2
index 9d02ad68d6da..e6e69097b076 100644
--- a/ci/jenkins/macros.j2
+++ b/ci/jenkins/macros.j2
@@ -39,8 +39,8 @@ def {{ method_name }}() {
     node('{{ node }}') {
       ws({{ per_exec_ws(ws) }}) {
         try {
-          docker_init({{ docker_image }})
           init_git()
+          docker_init({{ docker_image }})
           timeout(time: max_time, unit: 'MINUTES') {
             withEnv([
               'PLATFORM={{ platform }}',
@@ -71,8 +71,8 @@ def {{ method_name }}() {
   '{{ name }} {{ shard_index }} of {{ num_shards }}': {
     node('{{ node }}') {
       ws({{ per_exec_ws(ws) }}) {
-        docker_init({{ docker_image }})
         init_git()
+        docker_init({{ docker_image }})
         timeout(time: max_time, unit: 'MINUTES') {
           withEnv([
             'TVM_NUM_SHARDS={{ num_shards }}',
@@ -95,8 +95,8 @@ def {{ method_name }}() {
       ws({{ per_exec_ws(ws) }}) {
         timeout(time: max_time, unit: 'MINUTES') {
           try {
-            docker_init({{ docker_image }})
             init_git()
+            docker_init({{ docker_image }})
             withEnv(['PLATFORM={{ platform }}'], {
               {{ caller() | indent(width=8) | trim }}
             })
@@ -120,8 +120,8 @@ def {{ method_name }}() {
         ws({{ per_exec_ws(ws) }}) {
           timeout(time: max_time, unit: 'MINUTES') {
             try {
-              docker_init({{ docker_image }})
               init_git()
+              docker_init({{ docker_image }})
               withEnv(['PLATFORM={{ platform }}',
               'TEST_STEP_NAME={{ name }}',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
@@ -140,28 +140,6 @@ def {{ method_name }}() {
   },
 {% endmacro %}
 
-{% macro bash_retry() %}
-retry() {
-  local max_retries=\$1
-  shift
-  local n=0
-  local backoff_max=30
-  until [ "\$n" -ge \$max_retries ]
-  do
-      "\$@" && break
-      n=\$((n+1))
-      if [ "\$n" -eq \$max_retries ]; then
-          echo "failed to update after attempt \$n / \$max_retries, giving up"
-          exit 1
-      fi
-
-      WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))')
-      echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again"
-      sleep \$WAIT
-  done
-}
-{% endmacro %}
-
 {% macro deploy_step(name, feature_flag, ws) %}
   '{{ name }}': {
     if ({{ feature_flag }}) {
@@ -182,7 +160,7 @@ retry() {
 sh(
             script: """
               set -eux
-              {{ bash_retry() | indent(width=14) }}
+              . ci/scripts/retry.sh
               {% for filename in filenames %}
               md5sum {{ filename }}
               retry 3 aws s3 cp --no-progress {{ filename }} s3://${s3_prefix}/{{ tag }}/{{ filename }}
@@ -199,7 +177,7 @@ sh(
 sh(
             script: """
               set -eux
-              {{ bash_retry() | indent(width=14) }}
+              . ci/scripts/retry.sh
               {% for filename in filenames %}
               retry 3 aws s3 cp --no-progress s3://${s3_prefix}/{{ tag }}/{{ filename }} {{ filename }}
               md5sum {{ filename }}
diff --git a/ci/scripts/retry.sh b/ci/scripts/retry.sh
new file mode 100644
index 000000000000..08958fedce89
--- /dev/null
+++ b/ci/scripts/retry.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -eux
+
+retry() {
+  local max_retries=$1
+  shift
+  local n=0
+  until [ "$n" -ge "$max_retries" ]
+  do
+      "$@" && break
+      n=$((n+1))
+      if [ "$n" -eq "$max_retries" ]; then
+          echo "failed to update after attempt $n / $max_retries, giving up"
+          exit 1
+      fi
+
+      WAIT=$(python3 -c 'import random; print(random.randint(10, 30))')
+      echo "failed to update $n / $max_retries, waiting $WAIT to try again"
+      sleep "$WAIT"
+  done
+}

From 111a88d04ece6a0c6803277d5b7b1d4852b5e46c Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Thu, 15 Sep 2022 15:29:15 -0700
Subject: [PATCH 175/704] [ci][docker] Always build cmake from source (#12774)

This should fix some version drift in the current cmake versions in the
Docker containers (currently running all of 3.10, 3.16, 3.18, and 3.20)

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 docker/Dockerfile.ci_arm              | 6 +++---
 docker/Dockerfile.ci_cortexm          | 6 +++---
 docker/Dockerfile.ci_cpu              | 6 +++---
 docker/Dockerfile.ci_hexagon          | 3 +++
 docker/Dockerfile.ci_i386             | 6 +++---
 docker/Dockerfile.ci_minimal          | 3 +++
 docker/Dockerfile.ci_riscv            | 6 +++---
 docker/Dockerfile.ci_wasm             | 3 +++
 docker/install/ubuntu_install_core.sh | 1 -
 9 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm
index 932687f1e568..2297e8f1e6e7 100644
--- a/docker/Dockerfile.ci_arm
+++ b/docker/Dockerfile.ci_arm
@@ -29,6 +29,9 @@ RUN apt-install-and-clear -y ca-certificates gnupg2
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
@@ -54,9 +57,6 @@ RUN bash /install/ubuntu_install_python.sh
 ENV PATH ${TVM_VENV}/bin:$PATH
 ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
-COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
-RUN bash /install/ubuntu_install_cmake_source.sh
-
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh
 
diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm
index a62ea059fa8c..db02792efda9 100644
--- a/docker/Dockerfile.ci_cortexm
+++ b/docker/Dockerfile.ci_cortexm
@@ -26,12 +26,12 @@ RUN apt-get update --fix-missing
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
-COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
-RUN bash /install/ubuntu_install_googletest.sh
-
 COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
 RUN bash /install/ubuntu_install_cmake_source.sh 3.20.0
 
+COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
+RUN bash /install/ubuntu_install_googletest.sh
+
 ENV TVM_VENV /venv/apache-tvm-py3.7
 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
 COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 00fd9a4fcab3..155f9ef7d914 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -25,6 +25,9 @@ RUN apt-get update --fix-missing
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
@@ -41,9 +44,6 @@ RUN bash /install/ubuntu_install_python_package.sh
 COPY install/ubuntu1804_install_llvm.sh /install/ubuntu1804_install_llvm.sh
 RUN bash /install/ubuntu1804_install_llvm.sh
 
-COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
-RUN bash /install/ubuntu_install_cmake_source.sh
-
 COPY install/ubuntu_install_dnnl.sh /install/ubuntu_install_dnnl.sh
 RUN bash /install/ubuntu_install_dnnl.sh
 
diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon
index d2ed29278488..f1fc7be52484 100644
--- a/docker/Dockerfile.ci_hexagon
+++ b/docker/Dockerfile.ci_hexagon
@@ -28,6 +28,9 @@ RUN apt-install-and-clear -y ca-certificates gnupg2 libxml2-dev
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
diff --git a/docker/Dockerfile.ci_i386 b/docker/Dockerfile.ci_i386
index dc767ff6def1..b37e849819be 100644
--- a/docker/Dockerfile.ci_i386
+++ b/docker/Dockerfile.ci_i386
@@ -29,6 +29,9 @@ RUN apt-install-and-clear -y ca-certificates
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
@@ -49,9 +52,6 @@ COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
 RUN bash /install/ubuntu_install_python.sh
 ENV PATH ${TVM_VENV}/bin:$PATH
 
-COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
-RUN bash /install/ubuntu_install_cmake_source.sh
-
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh
 
diff --git a/docker/Dockerfile.ci_minimal b/docker/Dockerfile.ci_minimal
index 974f3eea11d6..b4ba758901b4 100644
--- a/docker/Dockerfile.ci_minimal
+++ b/docker/Dockerfile.ci_minimal
@@ -25,6 +25,9 @@ RUN apt-get update --fix-missing
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv
index 5c597135ee41..0d03db15e39b 100644
--- a/docker/Dockerfile.ci_riscv
+++ b/docker/Dockerfile.ci_riscv
@@ -26,12 +26,12 @@ RUN apt-get update --fix-missing
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
-COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
-RUN bash /install/ubuntu_install_googletest.sh
-
 COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
 RUN bash /install/ubuntu_install_cmake_source.sh
 
+COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
+RUN bash /install/ubuntu_install_googletest.sh
+
 ENV TVM_VENV /venv/apache-tvm-py3.7
 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
 COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
diff --git a/docker/Dockerfile.ci_wasm b/docker/Dockerfile.ci_wasm
index 17230312f041..46f64b44dab5 100644
--- a/docker/Dockerfile.ci_wasm
+++ b/docker/Dockerfile.ci_wasm
@@ -23,6 +23,9 @@ RUN apt-get update --fix-missing
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
 RUN bash /install/ubuntu_install_googletest.sh
 
diff --git a/docker/install/ubuntu_install_core.sh b/docker/install/ubuntu_install_core.sh
index a27c45433115..7f26c6def25d 100755
--- a/docker/install/ubuntu_install_core.sh
+++ b/docker/install/ubuntu_install_core.sh
@@ -31,7 +31,6 @@ echo $TZ > /etc/timezone
 apt-get update && apt-install-and-clear -y --no-install-recommends \
     apt-transport-https \
     ca-certificates \
-    cmake \
     curl \
     g++ \
     gdb \

From 5b43c62ee64a7006dccc40811bd94de91d02a136 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Thu, 15 Sep 2022 15:43:33 -0700
Subject: [PATCH 176/704] [ci] Remove author check from ping bot (#12788)

This has been working fine for a while, this code opens it up so it's
not limited to the authors in #9983.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 ci/scripts/ping_reviewers.py | 19 +------------------
 tests/python/ci/test_ci.py   | 20 --------------------
 2 files changed, 1 insertion(+), 38 deletions(-)

diff --git a/ci/scripts/ping_reviewers.py b/ci/scripts/ping_reviewers.py
index 0b034a795efd..af642a52a0eb 100755
--- a/ci/scripts/ping_reviewers.py
+++ b/ci/scripts/ping_reviewers.py
@@ -189,7 +189,6 @@ def make_ping_message(pr, reviewers):
     parser.add_argument("--wait-time-minutes", required=True, type=int, help="ssh remote to parse")
     parser.add_argument("--cutoff-pr-number", default=0, type=int, help="ssh remote to parse")
     parser.add_argument("--dry-run", action="store_true", help="don't update GitHub")
-    parser.add_argument("--allowlist", help="filter by these PR authors")
     parser.add_argument("--pr-json", help="(testing) data for testing to use instead of GitHub")
     parser.add_argument("--now", help="(testing) custom string for current time")
     args = parser.parse_args()
@@ -208,17 +207,6 @@ def make_ping_message(pr, reviewers):
         end="",
     )
 
-    # [slow rollout]
-    # This code is here to gate this feature to a limited set of people before
-    # deploying it for everyone to avoid spamming in the case of bugs or
-    # ongoing development.
-    if args.allowlist:
-        author_allowlist = args.allowlist.split(",")
-    else:
-        github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo)
-        allowlist_issue = github.get("issues/9983")
-        author_allowlist = set(find_reviewers(allowlist_issue["body"]))
-
     if args.pr_json:
         r = json.loads(args.pr_json)
     else:
@@ -242,13 +230,8 @@ def make_ping_message(pr, reviewers):
                 print(
                     f"Skipping #{pr['number']} since it's too old ({pr['number']} <= {cutoff_pr_number})"
                 )
-            elif pr["author"]["login"] not in author_allowlist:
-                # [slow rollout]
-                print(
-                    f"Skipping #{pr['number']} since author {pr['author']['login']} is not in allowlist: {author_allowlist}"
-                )
             else:
-                print(f"Checking #{pr['number']} since author is in {author_allowlist}")
+                print(f"Checking #{pr['number']}")
                 prs_to_check.append(pr)
 
         print(f"Summary: Checking {len(prs_to_check)} of {len(prs)} fetched")
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index 79c72ce988c3..6c25694cfc74 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -548,24 +548,6 @@ def all_time_keys(time):
         },
         check="Pinging reviewers ['someone'] on https://github.com/apache/tvm/pull/123",
     ),
-    # Check allowlist functionality
-    allowlist=dict(
-        pull_request={
-            "number": 123,
-            "url": "https://github.com/apache/tvm/pull/123",
-            "body": "cc @someone",
-            "isDraft": False,
-            "author": {"login": "user2"},
-            "reviews": {"nodes": []},
-            **all_time_keys("2022-01-18T17:54:19Z"),
-            "comments": {
-                "nodes": [
-                    {**all_time_keys("2022-01-19T17:54:19Z"), "bodyText": "abc"},
-                ]
-            },
-        },
-        check="Checking 0 of 1 fetched",
-    ),
     # Old comment, ping
     old_comment=dict(
         pull_request={
@@ -632,8 +614,6 @@ def test_ping_reviewers(tmpdir_factory, pull_request, check):
             "1",
             "--cutoff-pr-number",
             "5",
-            "--allowlist",
-            "user",
             "--pr-json",
             json.dumps(data),
             "--now",

From afad20d8d9740baa1d251b7e80e8e56a1c2b7a4d Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 15 Sep 2022 16:38:38 -0700
Subject: [PATCH 177/704] Fix typo in doc of logging (#12798)

---
 include/tvm/runtime/logging.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h
index 7b635eab0422..7dbc6d810dc0 100644
--- a/include/tvm/runtime/logging.h
+++ b/include/tvm/runtime/logging.h
@@ -476,12 +476,12 @@ inline bool DebugLoggingEnabled() {
  *
  * To enable file \p relay/foo.cc up to level 2 and \p ir/bar.cc for level 0 only set:
  * \code
- * TVM_LOG_DEBUG="relay/foo.cc=2;ir/bar.cc=0"
+ * TVM_LOG_DEBUG="relay/foo.cc=2,ir/bar.cc=0"
  * \endcode
  *
  * To enable all files up to level 3 but disable \p ir/bar.cc set:
  * \code
- * TVM_LOG_DEBUG="DEFAULT=2;ir/bar.cc=-1"
+ * TVM_LOG_DEBUG="DEFAULT=2,ir/bar.cc=-1"
  * \endcode
  *
  * Any of these settings will also enable DLOG statements.

From 6a051843a9af11261cc0103837f517db14066fc5 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Thu, 15 Sep 2022 16:39:58 -0700
Subject: [PATCH 178/704] [TVMScript] IRBuilder methods for `For` (#12786)

This PR introduces remaining IRBuilder methods for `For`.

Co-authored-by: yongwww <yongcale@gmail.com>
---
 include/tvm/script/ir_builder/tir/frame.h     |  51 ++++++
 include/tvm/script/ir_builder/tir/ir.h        |  53 ++++++
 python/tvm/script/ir_builder/tir/frame.py     |   9 +
 python/tvm/script/ir_builder/tir/ir.py        | 172 ++++++++++++++++++
 src/script/ir_builder/tir/frame.cc            |   6 +
 src/script/ir_builder/tir/ir.cc               |  76 ++++++++
 .../unittest/test_tvmscript_ir_builder_tir.py |  55 ++++++
 7 files changed, 422 insertions(+)

diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h
index 15ab77863e5e..2902b982d5a6 100644
--- a/include/tvm/script/ir_builder/tir/frame.h
+++ b/include/tvm/script/ir_builder/tir/frame.h
@@ -187,6 +187,57 @@ class BlockFrame : public TIRFrame {
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(BlockFrame, TIRFrame, BlockFrameNode);
 };
 
+/*!
+ * \brief A frame that represents the for loop.
+ *
+ * \sa ForFrame
+ */
+class ForFrameNode : public TIRFrameNode {
+ public:
+  /*!
+   * \brief Functions that generate loop nests.
+   * \param loop_vars The loop variables, from outer to inner
+   * \param loop_extents The loop extents that correspond to loop variables
+   * \param loop_body The loop body
+   * \return A stmt, the loop nest
+   */
+  using FMakeForLoop = runtime::TypedPackedFunc<tvm::tir::Stmt(
+      Array<tvm::tir::Var> loop_vars, Array<Range> loop_extents, tvm::tir::Stmt loop_body)>;
+  /*! \brief The loop variable. */
+  Array<tvm::tir::Var> vars;
+  /*! \brief The domains of iteration. */
+  Array<Range> doms;
+  /*! \brief The for loop generating function. */
+  FMakeForLoop f_make_for_loop;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("vars", &vars);
+    v->Visit("doms", &doms);
+    // `f_make_for_loop` is not visited.
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.ForFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(ForFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to ForFrameNode.
+ *
+ * \sa ForFrameNode
+ */
+class ForFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(ForFrame, TIRFrame, ForFrameNode);
+};
+
 /*!
  * \brief A frame that represents the assert statement. Proceeds if the condition is true,
  * otherwise aborts with the message.
diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
index aaa5442eede3..68948196ff6b 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -141,6 +141,59 @@ void PreflattenedBuffer(Buffer postflattened_buffer, Array<PrimExpr> shape,
  */
 BlockFrame Block(String name, bool no_realize = false);
 
+/*!
+ * \brief The serial For statement.
+ * \param start The minimum value of iteration.
+ * \param stop The maximum value of iteration.
+ * \param annotations The optional annotations of the For statement.
+ * \return The ForFrame.
+ */
+ForFrame Serial(PrimExpr start, PrimExpr stop,
+                Optional<Map<String, ObjectRef>> annotations = NullOpt);
+/*!
+ * \brief The parallel For statement.
+ * \param start The minimum value of iteration.
+ * \param stop The maximum value of iteration.
+ * \param annotations The optional annotations of the For statement.
+ * \return The ForFrame.
+ */
+ForFrame Parallel(PrimExpr start, PrimExpr stop,
+                  Optional<Map<String, ObjectRef>> annotations = NullOpt);
+/*!
+ * \brief The vectorized For statement.
+ * \param start The minimum value of iteration.
+ * \param stop The maximum value of iteration.
+ * \param annotations The optional annotations of the For statement.
+ * \return The ForFrame.
+ */
+ForFrame Vectorized(PrimExpr start, PrimExpr stop,
+                    Optional<Map<String, ObjectRef>> annotations = NullOpt);
+/*!
+ * \brief The unrolled For statement.
+ * \param start The minimum value of iteration.
+ * \param stop The maximum value of iteration.
+ * \param annotations The optional annotations of the For statement.
+ * \return The ForFrame.
+ */
+ForFrame Unroll(PrimExpr start, PrimExpr stop,
+                Optional<Map<String, ObjectRef>> annotations = NullOpt);
+/*!
+ * \brief The thread-binding For statement.
+ * \param start The minimum value of iteration.
+ * \param stop The maximum value of iteration.
+ * \param thread The thread for loop variable to bind.
+ * \param annotations The optional annotations of the For statement.
+ * \return The ForFrame.
+ */
+ForFrame ThreadBinding(PrimExpr start, PrimExpr stop, String thread,
+                       Optional<Map<String, ObjectRef>> annotations = NullOpt);
+/*!
+ * \brief The grid For statement.
+ * \param extents The extents of the iteration.
+ * \return The ForFrame.
+ */
+ForFrame Grid(Array<PrimExpr> extents);
+
 /*!
  * \brief Evaluate the input expression.
  * \param value The input expression to evaluate.
diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py
index 0e7eb2bb4720..75bb0231aeef 100644
--- a/python/tvm/script/ir_builder/tir/frame.py
+++ b/python/tvm/script/ir_builder/tir/frame.py
@@ -15,8 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 """IRBuilder for TIR"""
+from typing import List, Union
 
 from tvm._ffi import register_object as _register_object
+from tvm.tir import Var
 
 from ..base import IRBuilderFrame
 
@@ -34,3 +36,10 @@ class PrimFuncFrame(TIRFrame):
 @_register_object("script.ir_builder.tir.BlockFrame")
 class BlockFrame(TIRFrame):
     ...
+
+
+@_register_object("script.ir_builder.tir.ForFrame")
+class ForFrame(TIRFrame):
+    def __enter__(self) -> Union[Var, List[Var]]:
+        super().__enter__()
+        return self.vars if len(self.vars) > 1 else self.vars[0]
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index 63fd1291f4bc..a5cdf8a3a105 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -344,6 +344,172 @@ def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame:
     return _ffi_api.Block(name, no_realize)  # pylint: disable=no-member # type: ignore
 
 
+def serial(
+    start: PrimExpr, stop: PrimExpr = None, *, annotations: Dict[str, Any] = None
+) -> frame.ForFrame:
+    """The serial For statement.
+
+    Parameters
+    ----------
+    start : PrimExpr
+        The minimum value of iteration.
+
+    stop : PrimExpr
+        The maximum value of iteration.
+
+    annotations : Dict[str, Any]
+        The optional annotations of the For statement.
+
+    Returns
+    -------
+    res : frame.ForFrame
+        The ForFrame.
+    """
+    if stop is None:
+        stop = start
+        start = 0
+    return _ffi_api.Serial(start, stop, annotations)  # pylint: disable=no-member # type: ignore
+
+
+def parallel(
+    start: PrimExpr, stop: PrimExpr = None, *, annotations: Dict[str, Any] = None
+) -> frame.ForFrame:
+    """The parallel For statement.
+
+    Parameters
+    ----------
+    start : PrimExpr
+        The minimum value of iteration.
+
+    stop : PrimExpr
+        The maximum value of iteration.
+
+    annotations : Dict[str, Any]
+        The optional annotations of the For statement.
+
+    Returns
+    -------
+    res : frame.ForFrame
+        The ForFrame.
+    """
+    if stop is None:
+        stop = start
+        start = 0
+    return _ffi_api.Parallel(start, stop, annotations)  # pylint: disable=no-member # type: ignore
+
+
+def vectorized(
+    start: PrimExpr, stop: PrimExpr = None, *, annotations: Dict[str, Any] = None
+) -> frame.ForFrame:
+    """The vectorized For statement.
+
+    Parameters
+    ----------
+    start : PrimExpr
+        The minimum value of iteration.
+
+    stop : PrimExpr
+        The maximum value of iteration.
+
+    annotations : Dict[str, Any]
+        The optional annotations of the For statement.
+
+    Returns
+    -------
+    res : frame.ForFrame
+        The ForFrame.
+    """
+    if stop is None:
+        stop = start
+        start = 0
+    return _ffi_api.Vectorized(start, stop, annotations)  # pylint: disable=no-member # type: ignore
+
+
+def unroll(
+    start: PrimExpr, stop: PrimExpr = None, *, annotations: Dict[str, Any] = None
+) -> frame.ForFrame:
+    """The unrolled For statement.
+
+    Parameters
+    ----------
+    start : PrimExpr
+        The minimum value of iteration.
+
+    stop : PrimExpr
+        The maximum value of iteration.
+
+    annotations : Dict[str, Any]
+        The optional annotations of the For statement.
+
+    Returns
+    -------
+    res : frame.ForFrame
+        The ForFrame.
+    """
+    if stop is None:
+        stop = start
+        start = 0
+    return _ffi_api.Unroll(start, stop, annotations)  # pylint: disable=no-member # type: ignore
+
+
+def thread_binding(
+    start: PrimExpr,
+    stop: PrimExpr = None,
+    thread: str = None,
+    *,
+    annotations: Dict[str, Any] = None,
+) -> frame.ForFrame:
+    """The thread-binding For statement.
+
+    Parameters
+    ----------
+    start : PrimExpr
+        The minimum value of iteration.
+
+    stop : PrimExpr
+        The maximum value of iteration.
+
+    thread : str
+        The thread for loop variable to bind.
+
+    annotations : Dict[str, Any]
+        The optional annotations of the For statement.
+
+    Returns
+    -------
+    res : frame.ForFrame
+        The ForFrame.
+    """
+    if thread is None:
+        if not isinstance(stop, str):
+            raise ValueError("Thread cannot be None for thread_binding")
+        thread = stop
+        stop = start
+        start = 0
+    elif stop is None:
+        stop = start
+        start = 0
+    return _ffi_api.ThreadBinding(  # pylint: disable=no-member # type: ignore
+        start, stop, thread, annotations
+    )
+
+
+def grid(*extents: PrimExpr) -> frame.ForFrame:
+    """The grid For statement.
+
+    Parameters
+    ----------
+    extents : PrimExpr
+        The extents of the iteration.
+
+    Returns
+    -------
+    res : frame.ForFrame
+        The ForFrame.
+    """
+    return _ffi_api.Grid(extents)  # pylint: disable=no-member # type: ignore
+
+
 def evaluate(value: PrimExpr) -> None:
     """Evaluate the input expression.
 
@@ -677,6 +843,12 @@ def var(dtype, name="") -> Var:
     "match_buffer",
     "preflattened_buffer",
     "block",
+    "serial",
+    "parallel",
+    "vectorized",
+    "unroll",
+    "thread_binding",
+    "grid",
     "evaluate",
     "int8",
     "int16",
diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc
index dd3097e388b7..e54bf75eeff2 100644
--- a/src/script/ir_builder/tir/frame.cc
+++ b/src/script/ir_builder/tir/frame.cc
@@ -73,9 +73,15 @@ void BlockFrameNode::ExitWithScope() {
   }
 }
 
+void ForFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  AddToParent(this->f_make_for_loop(vars, doms, AsStmt(stmts)));
+}
+
 TVM_REGISTER_NODE_TYPE(TIRFrameNode);
 TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode);
 TVM_REGISTER_NODE_TYPE(BlockFrameNode);
+TVM_REGISTER_NODE_TYPE(ForFrameNode);
 
 }  // namespace tir
 }  // namespace ir_builder
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index e2c1218a7e87..22c7face7084 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -173,6 +173,74 @@ BlockFrame Block(String name, bool no_realize) {
   return BlockFrame(n);
 }
 
+#define TVM_TIR_IR_BUILDER_FOR_FRAME(Method, Kind)                                                \
+  ForFrame Method(PrimExpr start, PrimExpr stop, Optional<Map<String, ObjectRef>> annotations) {  \
+    PrimExpr min = start;                                                                         \
+    PrimExpr extent = arith::Analyzer().Simplify(stop - start);                                   \
+    ObjectPtr<ForFrameNode> n = make_object<ForFrameNode>();                                      \
+    int bits = std::max(min.dtype().bits(), extent.dtype().bits());                               \
+    n->vars = {Var("v", DataType::Int(bits))};                                                    \
+    n->doms = {Range::FromMinExtent(min, extent)};                                                \
+    n->f_make_for_loop = [annotations](Array<Var> vars, Array<Range> doms, tvm::tir::Stmt body) { \
+      ICHECK_EQ(vars.size(), 1);                                                                  \
+      ICHECK_EQ(doms.size(), 1);                                                                  \
+      return tvm::tir::For(vars[0], doms[0]->min, doms[0]->extent, Kind, body, NullOpt,           \
+                           annotations.value_or(Map<String, ObjectRef>()));                       \
+    };                                                                                            \
+    return ForFrame(n);                                                                           \
+  }
+
+TVM_TIR_IR_BUILDER_FOR_FRAME(Serial, tvm::tir::ForKind::kSerial);
+TVM_TIR_IR_BUILDER_FOR_FRAME(Parallel, tvm::tir::ForKind::kParallel);
+TVM_TIR_IR_BUILDER_FOR_FRAME(Vectorized, tvm::tir::ForKind::kVectorized);
+TVM_TIR_IR_BUILDER_FOR_FRAME(Unroll, tvm::tir::ForKind::kUnrolled);
+
+#undef TVM_TIR_IR_BUILDER_FOR_FRAME
+
+ForFrame ThreadBinding(PrimExpr start, PrimExpr stop, String thread,
+                       Optional<Map<String, ObjectRef>> annotations) {
+  using namespace tvm::tir;
+  PrimExpr min = start;
+  PrimExpr extent = arith::Analyzer().Simplify(stop - start);
+  ObjectPtr<ForFrameNode> n = make_object<ForFrameNode>();
+  int bits = std::max(min.dtype().bits(), extent.dtype().bits());
+  n->vars = {Var("v", DataType::Int(bits))};
+  n->doms = {Range::FromMinExtent(min, extent)};
+  n->f_make_for_loop = [annotations, thread](Array<Var> vars, Array<Range> doms, Stmt body) -> For {
+    ICHECK_EQ(vars.size(), 1);
+    ICHECK_EQ(doms.size(), 1);
+    IterVar iter_var(Range(nullptr), Var("iter", DataType::Int(32)), IterVarType::kThreadIndex,
+                     thread);
+    return For(vars[0], doms[0]->min, doms[0]->extent, ForKind::kThreadBinding, body, iter_var,
+               annotations.value_or(Map<String, ObjectRef>()));
+  };
+  return ForFrame(n);
+}
+
+ForFrame Grid(Array<PrimExpr> extents) {
+  using namespace tvm::tir;
+  ObjectPtr<ForFrameNode> n = make_object<ForFrameNode>();
+  n->vars.reserve(extents.size());
+  n->doms.reserve(extents.size());
+  for (const auto& extent : extents) {
+    DataType dtype = extent.dtype();
+    n->vars.push_back(Var("v", extent.dtype()));
+    n->doms.push_back(Range(make_const(dtype, 0), extent));
+  }
+  n->f_make_for_loop = [](Array<Var> vars, Array<Range> doms, Stmt body) -> Stmt {
+    ICHECK_EQ(vars.size(), doms.size());
+    int n = vars.size();
+    for (int i = n - 1; i >= 0; --i) {
+      Range dom = doms[i];
+      Var var = vars[i];
+      body = For(var, dom->min, dom->extent, ForKind::kSerial, std::move(body),
+                 /*thread_binding=*/NullOpt, /*annotations=*/{});
+    }
+    return body;
+  };
+  return ForFrame(n);
+}
+
 void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); }
 
 using tvm::script::ir_builder::details::Namer;
@@ -235,6 +303,14 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.MatchBuffer").set_body_typed(MatchBuf
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.PreflattenedBuffer").set_body_typed(PreflattenedBuffer);
 
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block);
+
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Serial").set_body_typed(Serial);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Parallel").set_body_typed(Parallel);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Vectorized").set_body_typed(Vectorized);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Unroll").set_body_typed(Unroll);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.ThreadBinding").set_body_typed(ThreadBinding);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Grid").set_body_typed(Grid);
+
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate);
 
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8);
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index 5c93e99909d9..9cbfd75e2280 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -114,5 +114,60 @@ def test_ir_builder_tir_block():
     assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True)
 
 
+def test_ir_builder_tir_for():
+    with IRBuilder() as ib:
+        with T.serial(128) as a:
+            with T.parallel(64) as b:
+                with T.vectorized(32) as c:
+                    with T.unroll(16) as d:
+                        with T.thread_binding(8, thread="threadIdx.x") as e:
+                            T.evaluate(0)
+
+    # the for generated by IRBuilder
+    for_actual = ib.get()
+
+    # the expected for
+    thread_binding_expected = tir.For(
+        loop_var=tir.Var("", "int32"),
+        min_val=0,
+        extent=8,
+        kind=tir.ForKind.THREAD_BINDING,
+        body=tir.Evaluate(0),
+        thread_binding=tir.IterVar(
+            None, tir.Var("", "int32"), tir.IterVar.ThreadIndex, "threadIdx.x"
+        ),
+    )
+    unroll_expected = tir.For(
+        loop_var=tir.Var("", "int32"),
+        min_val=0,
+        extent=16,
+        kind=tir.ForKind.UNROLLED,
+        body=thread_binding_expected,
+    )
+    vectorized_expected = tir.For(
+        loop_var=tir.Var("", "int32"),
+        min_val=0,
+        extent=32,
+        kind=tir.ForKind.VECTORIZED,
+        body=unroll_expected,
+    )
+    parallel_expected = tir.For(
+        loop_var=tir.Var("", "int32"),
+        min_val=0,
+        extent=64,
+        kind=tir.ForKind.PARALLEL,
+        body=vectorized_expected,
+    )
+    for_expected = tir.For(
+        loop_var=tir.Var("", "int32"),
+        min_val=0,
+        extent=128,
+        kind=tir.ForKind.SERIAL,
+        body=parallel_expected,
+    )
+    # Check if the generated ir is expected
+    assert_structural_equal(for_actual, for_expected, map_free_vars=True)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 9a3b3dd1ceac8f9b065636146756baead39b8ab6 Mon Sep 17 00:00:00 2001
From: wrongtest <wrongtest0@gmail.com>
Date: Fri, 16 Sep 2022 07:40:55 +0800
Subject: [PATCH 179/704] [TVMScript] Fix parse minimal i32 literal for tir
 script (#12772)

This change tries to fix an issue due to #12515.

Previously the logic for `-2147483648` is  `parse(-literal)` = `-parse(literal)`, and all integer literals are converted to i32 (either the literal value actually overflow or not).

Since after #12515, parse `2147483648` results in an i64 typed integer rather than i32, `-2147483648` then becomes an i64 integer too, which is not reasonable.
---
 python/tvm/script/parser.py                       |  7 +++++++
 tests/python/unittest/test_tvmscript_roundtrip.py | 10 ++++++++++
 2 files changed, 17 insertions(+)

diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py
index e9b4286edad8..c34aae23453c 100644
--- a/python/tvm/script/parser.py
+++ b/python/tvm/script/parser.py
@@ -906,6 +906,13 @@ def transform_Call(self, node):
                 )
             if node.func_name.name in self._unaryop_maker:
                 rhs = self.transform(node.params[0])
+                if node.func_name.name == ast.BuiltinOp.USub and isinstance(
+                    node.params[0], ast.Constant
+                ):
+                    # '-literal' should be parsed together for proper literal type inference
+                    if not isinstance(rhs, (tvm.tir.IntImm, tvm.tir.FloatImm)):
+                        self.report_error("The literal is illegal after -", node.params[0].span)
+                    return tvm.tir.const(-rhs.value)
                 return self._unaryop_maker[node.func_name.name](
                     rhs, span=tvm_span_from_synr(node.span)
                 )
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index 17622789558d..1f5871b488e2 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3381,6 +3381,15 @@ def func(
     return func
 
 
+def minimal_i32_literal():
+    @T.prim_func
+    def func() -> None:
+        T.evaluate(T.int32(-2147483648))
+        T.evaluate(-T.int64(2147483648))
+
+    return func
+
+
 ir_generator = tvm.testing.parameter(
     opt_gemm_normalize,
     opt_gemm_lower,
@@ -3423,6 +3432,7 @@ def func(
     decl_buffer,
     allocate_and_decl_buffer,
     float_infinity,
+    minimal_i32_literal,
 )
 
 
From c96cc1101ff1a78b69945680574a69c1402a29ff Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Thu, 15 Sep 2022 17:07:39 -0700
Subject: [PATCH 180/704] [community] Fix outdated contributor GitHub usernames
 (#12799)

These couple names were linking to 404 pages, this PR updates them to
their current counterparts.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 CONTRIBUTORS.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 42f67e87df10..a5da6c8abc79 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -45,7 +45,7 @@ We do encourage everyone to work anything they are interested in.
 - [Animesh Jain](https://github.com/anijain2305): @anijain2305 - quantization, relay
 - [Chenfan Jia](https://github.com/jcf94): @jcf94 - auto_scheduler
 - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
-- [Manupa Karunaratne](https://github.com/manupa-arm): @manupa-arm - ethos-u, memory planner
+- [Manupa Karunaratne](https://github.com/manupak): @manupak - ethos-u, memory planner
 - [Elen Kalda](https://github.com/ekalda): @ekalda - ethos-u, arm
 - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay
 - [Tristan Konolige](https://github.com/tkonolige): @tkonolige - profiling, relay, tir, runtime
@@ -70,7 +70,7 @@ We do encourage everyone to work anything they are interested in.
 - [Giuseppe Rossini](https://github.com/giuseros): @giuseros - aot, arm
 - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends
 - [Christopher Sidebottom](https://github.com/Mousius): @Mousius - arm, ethos-u, relay
-- [Junru Shao](https://github.com/junrushao1994) (PMC): @junrushao1994 - relay, compiler
+- [Junru Shao](https://github.com/junrushao) (PMC): @junrushao - relay, compiler
 - [Haichen Shen](https://github.com/icemelon) (PMC): @icemelon - relay, topi
 - [Chris Sullivan](https://github.com/csullivan): @csullivan - amd backend
 - [Siva Rama Krishna Reddy](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang
@@ -85,7 +85,7 @@ We do encourage everyone to work anything they are interested in.
 - [Hao Yu](https://github.com/comaniac): @comaniac (PMC) - relay, byoc, auto_scheduler
 - [Lianmin Zheng](https://github.com/merrymercy) (PMC): @merrymercy - autotvm, auto_scheduler, topi, relay
 - [Xiyou Zhou](https://github.com/zxybazh): @zxybazh - relay
-- [wrongtest](https://github.com/wrongtest): @wrongtest - tir, tvm-script, arith
+- [wrongtest](https://github.com/wrongtest-intellif): @wrongtest-intellif - tir, tvm-script, arith
 
 ## Reviewers
 
@@ -120,7 +120,7 @@ We do encourage everyone to work anything they are interested in.
 - [Hua Jiang](https://github.com/huajsj): @huajsj
 - [Ziheng Jiang](https://github.com/ZihengJiang): @ZihengJiang
 - [Hongyi Jin](https://github.com/jinhongyii): @jinhongyii
-- [Manupa Karunaratne](https://github.com/manupa-arm): @manupa-arm
+- [Manupa Karunaratne](https://github.com/manupak): @manupak
 - [Elen Kalda](https://github.com/ekalda): @ekalda
 - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame
 - [Michael J. Klaiber](https://github.com/MichaelJKlaiber/) @MichaelJKlaiber
@@ -162,7 +162,7 @@ We do encourage everyone to work anything they are interested in.
 - [Gustavo Romero](https://github.com/gromero): @gromero
 - [Giuseppe Rossini](https://github.com/giuseros): @giuseros
 - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel
-- [Junru Shao](https://github.com/junrushao1994): @junrushao1994
+- [Junru Shao](https://github.com/junrushao): @junrushao
 - [Haichen Shen](https://github.com/icemelon): @icemelon
 - [Xingjian Shi](https://github.com/sxjscience): @sxjscience
 - [Yuanjing Shi](https://github.com/shingjan): @shingjan
@@ -187,7 +187,7 @@ We do encourage everyone to work anything they are interested in.
 - [Logan Weber](https://github.com/weberlo): @weberlo
 - [Matt Welsh](https://github.com/mdw-octoml): @mdw-octoml
 - [Jian Weng](https://github.com/were): @were
-- [wrongtest](https://github.com/wrongtest): @wrongtest
+- [wrongtest](https://github.com/wrongtest-intellif): @wrongtest-intellif
 - [Yong Wu](https://github.com/yongwww): @yongwww
 - [Zhao Wu](https://github.com/FrozenGene): @FrozenGene
 - [Bing Xu](https://github.com/antinucleon): @antinucleon

From e6525a30e6de3bc3f95564beeead8e9e8b1f9efc Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 15 Sep 2022 18:49:22 -0700
Subject: [PATCH 181/704] [TIR] Add extra simpliciation in region cover
 analysis (#12800)

Added extra simplify step to eliminate false negative cases.
---
 src/tir/schedule/state.cc                     |  5 ++
 .../test_tir_schedule_state_cached_flags.py   | 86 ++++++++++++++++++-
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/src/tir/schedule/state.cc b/src/tir/schedule/state.cc
index 15d0e08ddc2c..6d4a42236f57 100644
--- a/src/tir/schedule/state.cc
+++ b/src/tir/schedule/state.cc
@@ -108,6 +108,11 @@ bool ProducerCoversConsumer(const Array<PrimExpr>& buffer_shape,
     produced = arith::Intersect({produced, buffer_size});
     consumed = arith::Intersect({consumed, buffer_size});
 
+    produced = arith::IntSet::Interval(analyzer->Simplify(produced.min()),
+                                       analyzer->Simplify(produced.max()));
+    consumed = arith::IntSet::Interval(analyzer->Simplify(consumed.min()),
+                                       analyzer->Simplify(consumed.max()));
+
     if (!analyzer->CanProve((analyzer->canonical_simplify(produced.min() - consumed.min()) <= 0) &&
                             (analyzer->canonical_simplify(consumed.max() - produced.max()) <= 0))) {
       return false;
diff --git a/tests/python/unittest/test_tir_schedule_state_cached_flags.py b/tests/python/unittest/test_tir_schedule_state_cached_flags.py
index bbeb8d87600b..987821714078 100644
--- a/tests/python/unittest/test_tir_schedule_state_cached_flags.py
+++ b/tests/python/unittest/test_tir_schedule_state_cached_flags.py
@@ -26,7 +26,7 @@
 from tvm.tir.stmt_functor import post_order_visit
 
 # pylint: disable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
-
+# fmt: off
 
 @T.prim_func
 def elementwise(a: T.handle, c: T.handle) -> None:
@@ -366,7 +366,80 @@ def uncovered_producer_region(A: T.Buffer[(128,), "float32"], B: T.Buffer[(128,)
             B[vi] = A[vi]
 
 
+@T.prim_func
+def matmul_relu_padding(A: T.Buffer[(127, 127), "float16"], B: T.Buffer[(127, 127), "float16"], compute: T.Buffer[(127, 127), "float32"]) -> None:
+    # function attr dict
+    T.func_attr({"global_symbol": "main", "tir.noalias": True})
+    # body
+    # with T.block("root")
+    C = T.alloc_buffer([127, 127], dtype="float32")
+    A_reindex = T.alloc_buffer([128, 128], dtype="float16")
+    B_reindex = T.alloc_buffer([128, 128], dtype="float16")
+    C_reindex_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+    C_reindex_shared_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator")
+    for ax0, ax1, ax2 in T.grid(128, 1, 128):
+        with T.block("A_reindex"):
+            v0, v1, v2 = T.axis.remap("SSS", [ax0, ax1, ax2])
+            T.reads(A[v0, v2])
+            T.writes(A_reindex[v0, v2])
+            A_reindex[v0, v2] = T.if_then_else(v0 < 127 and v2 < 127, A[v0, v2], T.float16(0), dtype="float16")
+    for ax0, ax1, ax2 in T.grid(1, 128, 128):
+        with T.block("B_reindex"):
+            v0, v1, v2 = T.axis.remap("SSS", [ax0, ax1, ax2])
+            T.reads(B[v2, v1])
+            T.writes(B_reindex[v2, v1])
+            B_reindex[v2, v1] = T.if_then_else(v2 < 127 and v1 < 127, B[v2, v1], T.float16(0), dtype="float16")
+    for ax0_0_0_ax1_0_0_fused in T.thread_binding(2, thread="blockIdx.y"):
+        for ax0_0_1_ax1_0_1_fused in T.thread_binding(1, thread="blockIdx.x"):
+            for ax0_0_2_ax1_0_2_fused in T.thread_binding(16, thread="threadIdx.y"):
+                for ax2_0_0, ax2_0_1, ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(2, 2, 1, 2, 2, 1, 1):
+                    with T.block("C_o"):
+                        v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2 + ax0_0_3 + ax0_0_4)
+                        v1_o = T.axis.spatial(8, ax1_0_4 + ax0_0_0_ax1_0_0_fused * 4 + ax0_0_2_ax1_0_2_fused % 2 * 2 + ax1_0_3)
+                        v2_o = T.axis.reduce(8, ax2_0_0 * 4 + ax2_0_1 * 2 + ax2_0_2)
+                        T.reads(A_reindex[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                        T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                        T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
+                        with T.init():
+                            for ax0_1, ax1_1 in T.grid(16, 16):
+                                with T.block("C_init"):
+                                    v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1])
+                                    T.reads()
+                                    T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init])
+                                    C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0)
+                        for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16):
+                            with T.block("C"):
+                                v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1])
+                                T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex[v2_o * 16 + v2_i, v1_o * 16 + v1_i])
+                                T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32")
+                for ax0, ax1 in T.grid(16, 32):
+                    with T.block("C_reindex_shared_wmma.accumulator"):
+                        v0 = T.axis.spatial(128, ax0_0_2_ax1_0_2_fused // 2 * 16 + ax0)
+                        v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused * 64 + ax0_0_2_ax1_0_2_fused % 2 * 32 + ax1)
+                        T.reads(C_reindex_shared_wmma_accumulator[v0, v1])
+                        T.writes(C_reindex_shared[v0, v1])
+                        C_reindex_shared[v0, v1] = C_reindex_shared_wmma_accumulator[v0, v1]
+            for ax0, ax1 in T.grid(128, 64):
+                with T.block("C_reindex_shared"):
+                    v0 = T.axis.spatial(128, ax0)
+                    v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused * 64 + ax1)
+                    T.where(ax0 < 127 and ax0_0_0_ax1_0_0_fused * 64 + ax1 < 127)
+                    T.reads(C_reindex_shared[v0, v1])
+                    T.writes(C[v0, v1])
+                    T.block_attr({"meta_schedule.cooperative_fetch":3})
+                    C[v0, v1] = C_reindex_shared[v0, v1]
+    for i0, i1 in T.grid(127, 127):
+        with T.block("compute"):
+            i0_1, i1_1 = T.axis.remap("SS", [i0, i1])
+            T.reads(C[i0_1, i1_1])
+            T.writes(compute[i0_1, i1_1])
+            compute[i0_1, i1_1] = T.max(C[i0_1, i1_1], T.float32(0))
+
+
 # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
+# fmt: on
 
 
 def _get_block(s: tir.ScheduleState, name_hint: str) -> tir.StmtSRef:
@@ -781,5 +854,16 @@ def test_uncovered_producer_region():
     # pylint: enable=protected-access
 
 
+def test_matmul_relu_padding():
+    s = tir.ScheduleState(matmul_relu_padding, debug_mask="all")
+    # pylint: disable=protected-access
+    assert s._get_cached_flags(_get_block(s, "C_reindex_shared")) == CachedFlags(
+        affine_binding=True,
+        region_cover=True,
+        stage_pipeline=True,
+    )
+    # pylint: enable=protected-access
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 02c2eae510d6d6c15189427c97819f7ce05f002d Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Thu, 15 Sep 2022 22:01:33 -0700
Subject: [PATCH 182/704] [MetaSchedule] Enable Clone Function for Task-Level
 Classes (#12796)

This PR introduces a clone function for each of the task-level MetaSchedule classes for convenient class deep copying.

- [x] ScheduleRule
- [x] Postproc
- [x] Mutator
- [x] SpaceGenerator
- [x] SearchStrategy
- [x] TuneContext
---
 include/tvm/meta_schedule/mutator.h           |  88 ++++++++------
 include/tvm/meta_schedule/postproc.h          |  86 ++++++++-----
 include/tvm/meta_schedule/schedule_rule.h     |  86 ++++++++-----
 include/tvm/meta_schedule/search_strategy.h   | 114 +++++++++++-------
 include/tvm/meta_schedule/space_generator.h   |  78 +++++++-----
 include/tvm/meta_schedule/tune_context.h      |   6 +
 python/tvm/meta_schedule/mutator/mutator.py   |  24 +++-
 python/tvm/meta_schedule/postproc/postproc.py |  24 +++-
 .../schedule_rule/schedule_rule.py            |  32 +++--
 .../search_strategy/search_strategy.py        |  23 ++++
 .../space_generator/space_generator.py        |  24 +++-
 .../tvm/meta_schedule/testing/dummy_object.py |   3 +
 python/tvm/meta_schedule/tune_context.py      |  10 ++
 .../mutator/mutate_compute_location.cc        |   5 +
 src/meta_schedule/mutator/mutate_parallel.cc  |   5 +
 .../mutator/mutate_thread_binding.cc          |   5 +
 src/meta_schedule/mutator/mutate_tile_size.cc |   5 +
 src/meta_schedule/mutator/mutate_unroll.cc    |   5 +
 src/meta_schedule/mutator/mutator.cc          |   8 ++
 .../postproc/disallow_dynamic_loop.cc         |   5 +
 src/meta_schedule/postproc/postproc.cc        |   8 ++
 .../postproc/rewrite_cooperative_fetch.cc     |   5 +
 src/meta_schedule/postproc/rewrite_layout.cc  |   5 +
 .../rewrite_parallel_vectorize_unroll.cc      |   6 +
 .../postproc/rewrite_reduction_block.cc       |   5 +
 .../postproc/rewrite_tensorize.cc             |   5 +
 .../postproc/rewrite_unbound_block.cc         |   5 +
 src/meta_schedule/postproc/verify_gpu_code.cc |   6 +
 .../schedule_rule/add_rfactor.cc              |   6 +
 src/meta_schedule/schedule_rule/auto_bind.cc  |   6 +
 .../schedule_rule/auto_inline.cc              |   6 +
 .../schedule_rule/cross_thread_reduction.cc   |   6 +
 .../schedule_rule/multi_level_tiling.cc       |   6 +
 .../schedule_rule/multi_level_tiling.h        |   3 +
 .../multi_level_tiling_tensor_core.cc         |   7 ++
 .../multi_level_tiling_with_intrin.cc         |   7 ++
 .../parallel_vectorize_unroll.cc              |   7 ++
 .../schedule_rule/random_compute_location.cc  |   6 +
 .../schedule_rule/schedule_rule.cc            |   9 ++
 .../search_strategy/evolutionary_search.cc    |  18 +++
 .../search_strategy/replay_func.cc            |  10 ++
 .../search_strategy/replay_trace.cc           |  11 ++
 .../search_strategy/search_strategy.cc        |  11 +-
 .../space_generator/post_order_apply.cc       |   9 ++
 .../space_generator/schedule_fn.cc            |   5 +
 .../space_generator/space_generator.cc        |  12 +-
 .../space_generator/space_generator_union.cc  |   9 ++
 src/meta_schedule/tune_context.cc             |  26 ++++
 48 files changed, 675 insertions(+), 186 deletions(-)

diff --git a/include/tvm/meta_schedule/mutator.h b/include/tvm/meta_schedule/mutator.h
index 566cc82e9716..2b580e75e019 100644
--- a/include/tvm/meta_schedule/mutator.h
+++ b/include/tvm/meta_schedule/mutator.h
@@ -32,6 +32,7 @@ namespace tvm {
 namespace meta_schedule {
 
 class TuneContext;
+class Mutator;
 
 /*! \brief Mutator is designed to mutate the trace to explore the design space. */
 class MutatorNode : public runtime::Object {
@@ -57,12 +58,21 @@ class MutatorNode : public runtime::Object {
   virtual Optional<tir::Trace> Apply(const tir::Trace& trace,
                                      support::LinearCongruentialEngine::TRandState* rand_state) = 0;
 
+  /*!
+   * \brief Clone the mutator.
+   * \return The cloned mutator.
+   */
+  virtual Mutator Clone() const = 0;
+
   static constexpr const char* _type_key = "meta_schedule.Mutator";
   TVM_DECLARE_BASE_OBJECT_INFO(MutatorNode, Object);
 };
 
-/*! \brief The mutator with customized methods on the python-side. */
-class PyMutatorNode : public MutatorNode {
+/*!
+ * \brief Managed reference to MutatorNode
+ * \sa MutatorNode
+ */
+class Mutator : public runtime::ObjectRef {
  public:
   /*!
    * \brief The function type of `InitializeWithTuneContext` method.
@@ -76,39 +86,16 @@ class PyMutatorNode : public MutatorNode {
    */
   using FApply = runtime::TypedPackedFunc<Optional<tir::Trace>(
       const tir::Trace&, support::LinearCongruentialEngine::TRandState rand_state)>;
+  /*!
+   * \brief Clone the mutator.
+   * \return The cloned mutator.
+   */
+  using FClone = runtime::TypedPackedFunc<Mutator()>;
   /*!
    * \brief Get the mutator as string with name.
    * \return The string of the mutator.
    */
   using FAsString = runtime::TypedPackedFunc<String()>;
-
-  /*! \brief The packed function to the `InitializeWithTuneContext` function. */
-  FInitializeWithTuneContext f_initialize_with_tune_context;
-  /*! \brief The packed function to the `Apply` function. */
-  FApply f_apply;
-  /*! \brief The packed function to the `AsString` function. */
-  FAsString f_as_string;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    // `f_initialize_with_tune_context` is not visited
-    // `f_apply` is not visited
-    // `f_as_string` is not visited
-  }
-
-  void InitializeWithTuneContext(const TuneContext& context) final;
-  Optional<tir::Trace> Apply(const tir::Trace& trace,
-                             support::LinearCongruentialEngine::TRandState* rand_state) final;
-
-  static constexpr const char* _type_key = "meta_schedule.PyMutator";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PyMutatorNode, MutatorNode);
-};
-
-/*!
- * \brief Managed reference to MutatorNode
- * \sa MutatorNode
- */
-class Mutator : public runtime::ObjectRef {
- public:
   /*! \brief Create a Mutator that mutates the decision of instruction Sample-Perfect-Tile */
   TVM_DLL static Mutator MutateTileSize();
   /*!
@@ -136,16 +123,49 @@ class Mutator : public runtime::ObjectRef {
    * \brief Create a mutator with customized methods on the python-side.
    * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`.
    * \param f_apply The packed function of `Apply`.
+   * \param f_clone The packed function of `Clone`.
    * \param f_as_string The packed function of `AsString`.
    * \return The mutator created.
    */
-  TVM_DLL static Mutator PyMutator(
-      PyMutatorNode::FInitializeWithTuneContext f_initialize_with_tune_context,  //
-      PyMutatorNode::FApply f_apply,                                             //
-      PyMutatorNode::FAsString f_as_string);
+  TVM_DLL static Mutator PyMutator(FInitializeWithTuneContext f_initialize_with_tune_context,  //
+                                   FApply f_apply,                                             //
+                                   FClone f_clone,                                             //
+                                   FAsString f_as_string);
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Mutator, ObjectRef, MutatorNode);
 };
 
+/*! \brief The mutator with customized methods on the python-side. */
+class PyMutatorNode : public MutatorNode {
+ public:
+  using FInitializeWithTuneContext = Mutator::FInitializeWithTuneContext;
+  using FApply = Mutator::FApply;
+  using FClone = Mutator::FClone;
+  using FAsString = Mutator::FAsString;
+  /*! \brief The packed function to the `InitializeWithTuneContext` function. */
+  FInitializeWithTuneContext f_initialize_with_tune_context;
+  /*! \brief The packed function to the `Apply` function. */
+  FApply f_apply;
+  /*! \brief The packed function to the `Clone` function. */
+  FClone f_clone;
+  /*! \brief The packed function to the `AsString` function. */
+  FAsString f_as_string;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    // `f_initialize_with_tune_context` is not visited
+    // `f_apply` is not visited
+    // `f_clone` is not visited
+    // `f_as_string` is not visited
+  }
+
+  void InitializeWithTuneContext(const TuneContext& context) final;
+  Optional<tir::Trace> Apply(const tir::Trace& trace,
+                             support::LinearCongruentialEngine::TRandState* rand_state) final;
+  Mutator Clone() const final;
+
+  static constexpr const char* _type_key = "meta_schedule.PyMutator";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PyMutatorNode, MutatorNode);
+};
+
 }  // namespace meta_schedule
 }  // namespace tvm
 
diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h
index 5d99f6845463..4fafb9557631 100644
--- a/include/tvm/meta_schedule/postproc.h
+++ b/include/tvm/meta_schedule/postproc.h
@@ -29,6 +29,7 @@ namespace tvm {
 namespace meta_schedule {
 
 class TuneContext;
+class Postproc;
 
 /*!
  * \brief Rules to apply a postprocessor to a schedule.
@@ -54,12 +55,21 @@ class PostprocNode : public runtime::Object {
    */
   virtual bool Apply(const tir::Schedule& sch) = 0;
 
+  /*!
+   * \brief Clone the postprocessor.
+   * \return The cloned postprocessor.
+   */
+  virtual Postproc Clone() const = 0;
+
   static constexpr const char* _type_key = "meta_schedule.Postproc";
   TVM_DECLARE_BASE_OBJECT_INFO(PostprocNode, Object);
 };
 
-/*! \brief The postprocessor with customized methods on the python-side. */
-class PyPostprocNode : public PostprocNode {
+/*!
+ * \brief Managed reference to PostprocNode
+ * \sa PostprocNode
+ */
+class Postproc : public runtime::ObjectRef {
  public:
   /*!
    * \brief The function type of `InitializeWithTuneContext` method.
@@ -72,49 +82,28 @@ class PyPostprocNode : public PostprocNode {
    * \return Whether the postprocessor was successfully applied.
    */
   using FApply = runtime::TypedPackedFunc<bool(const tir::Schedule&)>;
+  /*!
+   * \brief Clone the postprocessor.
+   * \return The cloned postprocessor.
+   */
+  using FClone = runtime::TypedPackedFunc<Postproc()>;
   /*!
    * \brief Get the postprocessor function as string with name.
    * \return The string of the postprocessor function.
    */
   using FAsString = runtime::TypedPackedFunc<String()>;
-
-  /*! \brief The packed function to the `InitializeWithTuneContext` function. */
-  FInitializeWithTuneContext f_initialize_with_tune_context;
-  /*! \brief The packed function to the `Apply` function. */
-  FApply f_apply;
-  /*! \brief The packed function to the `AsString` function. */
-  FAsString f_as_string;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    // `f_initialize_with_tune_context` is not visited
-    // `f_apply` is not visited
-    // `f_as_string` is not visited
-  }
-
-  void InitializeWithTuneContext(const TuneContext& context) final;
-  bool Apply(const tir::Schedule& sch) final;
-
-  static constexpr const char* _type_key = "meta_schedule.PyPostproc";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PyPostprocNode, PostprocNode);
-};
-
-/*!
- * \brief Managed reference to PostprocNode
- * \sa PostprocNode
- */
-class Postproc : public runtime::ObjectRef {
- public:
   /*!
    * \brief Create a postprocessor with customized methods on the python-side.
    * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`.
    * \param f_apply The packed function of `Apply`.
+   * \param f_clone The packed function of `Clone`.
    * \param f_as_string The packed function of `AsString`.
    * \return The postprocessor created.
    */
-  TVM_DLL static Postproc PyPostproc(
-      PyPostprocNode::FInitializeWithTuneContext f_initialize_with_tune_context,  //
-      PyPostprocNode::FApply f_apply,                                             //
-      PyPostprocNode::FAsString f_as_string);
+  TVM_DLL static Postproc PyPostproc(FInitializeWithTuneContext f_initialize_with_tune_context,  //
+                                     FApply f_apply,                                             //
+                                     FClone f_clone,                                             //
+                                     FAsString f_as_string);
   /*!
    * \brief Create a postprocessor that checks if all loops are static
    * \return The postprocessor created
@@ -164,6 +153,37 @@ class Postproc : public runtime::ObjectRef {
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Postproc, ObjectRef, PostprocNode);
 };
 
+/*! \brief The postprocessor with customized methods on the python-side. */
+class PyPostprocNode : public PostprocNode {
+ public:
+  using FInitializeWithTuneContext = Postproc::FInitializeWithTuneContext;
+  using FApply = Postproc::FApply;
+  using FClone = Postproc::FClone;
+  using FAsString = Postproc::FAsString;
+  /*! \brief The packed function to the `InitializeWithTuneContext` function. */
+  FInitializeWithTuneContext f_initialize_with_tune_context;
+  /*! \brief The packed function to the `Apply` function. */
+  FApply f_apply;
+  /*! \brief The packed function to the `Clone` function. */
+  FClone f_clone;
+  /*! \brief The packed function to the `AsString` function. */
+  FAsString f_as_string;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    // `f_initialize_with_tune_context` is not visited
+    // `f_apply` is not visited
+    // `f_clone` is not visited
+    // `f_as_string` is not visited
+  }
+
+  void InitializeWithTuneContext(const TuneContext& context) final;
+  bool Apply(const tir::Schedule& sch) final;
+  Postproc Clone() const final;
+
+  static constexpr const char* _type_key = "meta_schedule.PyPostproc";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PyPostprocNode, PostprocNode);
+};
+
 }  // namespace meta_schedule
 }  // namespace tvm
 
diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h
index 2da441c95e0b..55704cf4a97d 100644
--- a/include/tvm/meta_schedule/schedule_rule.h
+++ b/include/tvm/meta_schedule/schedule_rule.h
@@ -34,6 +34,7 @@ namespace tvm {
 namespace meta_schedule {
 
 class TuneContext;
+class ScheduleRule;
 
 /*! \brief Rules to modify a block in a schedule. */
 class ScheduleRuleNode : public runtime::Object {
@@ -59,12 +60,21 @@ class ScheduleRuleNode : public runtime::Object {
   virtual runtime::Array<tir::Schedule> Apply(const tir::Schedule& sch,
                                               const tir::BlockRV& block) = 0;
 
+  /*!
+   * \brief Deep clone the schedule rule.
+   * \return The cloned schedule rule.
+   */
+  virtual ScheduleRule Clone() const = 0;
+
   static constexpr const char* _type_key = "meta_schedule.ScheduleRule";
   TVM_DECLARE_BASE_OBJECT_INFO(ScheduleRuleNode, Object);
 };
 
-/*! \brief The schedule rule with customized methods on the python-side. */
-class PyScheduleRuleNode : public ScheduleRuleNode {
+/*!
+ * \brief Managed reference to ScheduleRuleNode
+ * \sa ScheduleRuleNode
+ */
+class ScheduleRule : public runtime::ObjectRef {
  public:
   /*!
    * \brief The function type of `InitializeWithTuneContext` method.
@@ -84,33 +94,11 @@ class PyScheduleRuleNode : public ScheduleRuleNode {
    * \return The string of the schedule rule.
    */
   using FAsString = runtime::TypedPackedFunc<String()>;
-
-  /*! \brief The packed function to the `InitializeWithTuneContext` function. */
-  FInitializeWithTuneContext f_initialize_with_tune_context;
-  /*! \brief The packed function to the `Apply` function. */
-  FApply f_apply;
-  /*! \brief The packed function to the `AsString` function. */
-  FAsString f_as_string;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    // `f_initialize_with_tune_context` is not visited
-    // `f_apply` is not visited
-    // `f_as_string` is not visited
-  }
-
-  void InitializeWithTuneContext(const TuneContext& context) final;
-  Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& block) final;
-
-  static constexpr const char* _type_key = "meta_schedule.PyScheduleRule";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PyScheduleRuleNode, ScheduleRuleNode);
-};
-
-/*!
- * \brief Managed reference to ScheduleRuleNode
- * \sa ScheduleRuleNode
- */
-class ScheduleRule : public runtime::ObjectRef {
- public:
+  /*!
+   * \brief The function type of `Clone` method.
+   * \return The cloned schedule rule.
+   */
+  using FClone = runtime::TypedPackedFunc<ScheduleRule()>;
   /*!
    * \brief Create an auto-inline rule that inlines spatial blocks if it satisfies some conditions
    * \param into_producer If allows to inline a block into its producer
@@ -249,16 +237,50 @@ class ScheduleRule : public runtime::ObjectRef {
    * \brief Create a schedule rule with customized methods on the python-side.
    * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`.
    * \param f_apply The packed function of `Apply`.
+   * \param f_clone The packed function of `Clone`.
    * \param f_as_string The packed function of `AsString`.
    * \return The schedule rule created.
    */
   TVM_DLL static ScheduleRule PyScheduleRule(
-      PyScheduleRuleNode::FInitializeWithTuneContext f_initialize_with_tune_context,  //
-      PyScheduleRuleNode::FApply f_apply,                                             //
-      PyScheduleRuleNode::FAsString f_as_string);
+      FInitializeWithTuneContext f_initialize_with_tune_context,  //
+      FApply f_apply,                                             //
+      FClone f_clone,                                             //
+      FAsString f_as_string);
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(ScheduleRule, ObjectRef, ScheduleRuleNode);
 };
 
+/*! \brief The schedule rule with customized methods on the python-side. */
+class PyScheduleRuleNode : public ScheduleRuleNode {
+ public:
+  using FInitializeWithTuneContext = ScheduleRule::FInitializeWithTuneContext;
+  using FApply = ScheduleRule::FApply;
+  using FClone = ScheduleRule::FClone;
+  using FAsString = ScheduleRule::FAsString;
+
+  /*! \brief The packed function to the `InitializeWithTuneContext` function. */
+  FInitializeWithTuneContext f_initialize_with_tune_context;
+  /*! \brief The packed function to the `Apply` function. */
+  FApply f_apply;
+  /*! \brief The packed function to the `AsString` function. */
+  FAsString f_as_string;
+  /*! \brief The packed function to the `Clone` function. */
+  FClone f_clone;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    // `f_initialize_with_tune_context` is not visited
+    // `f_apply` is not visited
+    // `f_as_string` is not visited
+    // `f_clone` is not visited
+  }
+
+  void InitializeWithTuneContext(const TuneContext& context) final;
+  Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& block) final;
+  ScheduleRule Clone() const final;
+
+  static constexpr const char* _type_key = "meta_schedule.PyScheduleRule";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PyScheduleRuleNode, ScheduleRuleNode);
+};
+
 }  // namespace meta_schedule
 }  // namespace tvm
 
diff --git a/include/tvm/meta_schedule/search_strategy.h b/include/tvm/meta_schedule/search_strategy.h
index a75a4cd8ae86..efd3dc24524a 100644
--- a/include/tvm/meta_schedule/search_strategy.h
+++ b/include/tvm/meta_schedule/search_strategy.h
@@ -36,6 +36,7 @@ namespace meta_schedule {
 
 // Forward declaration
 class TuneContext;
+class SearchStrategy;
 
 /*!
  * \brief The search strategy for measure candidates generation.
@@ -119,12 +120,21 @@ class SearchStrategyNode : public runtime::Object {
   virtual void NotifyRunnerResults(const Array<MeasureCandidate>& measure_candidates,
                                    const Array<RunnerResult>& results) = 0;
 
+  /*!
+   * \brief Clone the search strategy.
+   * \return The cloned search strategy.
+   */
+  virtual SearchStrategy Clone() const = 0;
+
   static constexpr const char* _type_key = "meta_schedule.SearchStrategy";
   TVM_DECLARE_BASE_OBJECT_INFO(SearchStrategyNode, Object);
 };
 
-/*! \brief The python side customizable class for measure candidate generation */
-class PySearchStrategyNode : public SearchStrategyNode {
+/*!
+ * \brief Managed reference to SearchStrategyNode.
+ * \sa SearchStrategyNode
+ */
+class SearchStrategy : public runtime::ObjectRef {
  public:
   /*!
    * \brief The function type of `InitializeWithTuneContext` method.
@@ -150,44 +160,11 @@ class PySearchStrategyNode : public SearchStrategyNode {
    */
   using FNotifyRunnerResults =
       runtime::TypedPackedFunc<void(const Array<MeasureCandidate>&, const Array<RunnerResult>&)>;
-
-  /*! \brief The packed function to the `InitializeWithTuneContext` method. */
-  FInitializeWithTuneContext f_initialize_with_tune_context;
-  /*! \brief The packed function to the `PreTuning` method. */
-  FPreTuning f_pre_tuning;
-  /*! \brief The packed function to the `PostTuning` method. */
-  FPostTuning f_post_tuning;
-  /*! \brief The packed function to the `GenerateMeasureCandidates` method. */
-  FGenerateMeasureCandidates f_generate_measure_candidates;
-  /*! \brief The packed function to the `NotifyRunnerResults` method. */
-  FNotifyRunnerResults f_notify_runner_results;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    // `f_initialize_with_tune_context` is not visited
-    // `f_pre_tuning` is not visited
-    // `f_post_tuning` is not visited
-    // `f_generate_measure_candidates` is not visited
-    // `f_notify_runner_results` is not visited
-  }
-
-  void InitializeWithTuneContext(const TuneContext& context) final;
-  void PreTuning(const Array<tir::Schedule>& design_spaces, const Optional<Database>& database,
-                 const Optional<CostModel>& cost_model) final;
-  void PostTuning() final;
-  Optional<Array<MeasureCandidate>> GenerateMeasureCandidates() final;
-  void NotifyRunnerResults(const Array<MeasureCandidate>& measure_candidates,
-                           const Array<RunnerResult>& results);
-
-  static constexpr const char* _type_key = "meta_schedule.PySearchStrategy";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PySearchStrategyNode, SearchStrategyNode);
-};
-
-/*!
- * \brief Managed reference to SearchStrategyNode.
- * \sa SearchStrategyNode
- */
-class SearchStrategy : public runtime::ObjectRef {
- public:
+  /*!
+   * \brief The function type of `Clone` method.
+   * \return The cloned search strategy.
+   */
+  using FClone = runtime::TypedPackedFunc<SearchStrategy()>;
   /*!
    * \brief Create a search strategy with customized methods on the python-side.
    * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`.
@@ -195,14 +172,16 @@ class SearchStrategy : public runtime::ObjectRef {
    * \param f_post_tuning The packed function of `PostTuning`.
    * \param f_generate_measure_candidates The packed function of `GenerateMeasureCandidates`.
    * \param f_notify_runner_results The packed function of `NotifyRunnerResults`.
+   * \param f_clone The packed function of `Clone`.
    * \return The search strategy created.
    */
   TVM_DLL static SearchStrategy PySearchStrategy(
-      PySearchStrategyNode::FInitializeWithTuneContext f_initialize_with_tune_context,  //
-      PySearchStrategyNode::FPreTuning f_pre_tuning,                                    //
-      PySearchStrategyNode::FPostTuning f_post_tuning,                                  //
-      PySearchStrategyNode::FGenerateMeasureCandidates f_generate_measure_candidates,   //
-      PySearchStrategyNode::FNotifyRunnerResults f_notify_runner_results);
+      FInitializeWithTuneContext f_initialize_with_tune_context,  //
+      FPreTuning f_pre_tuning,                                    //
+      FPostTuning f_post_tuning,                                  //
+      FGenerateMeasureCandidates f_generate_measure_candidates,   //
+      FNotifyRunnerResults f_notify_runner_results,               //
+      FClone f_clone);
 
   /*!
    * \brief Constructor of replay trace search strategy.
@@ -245,6 +224,51 @@ class SearchStrategy : public runtime::ObjectRef {
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(SearchStrategy, ObjectRef, SearchStrategyNode);
 };
 
+/*! \brief The python side customizable class for measure candidate generation */
+class PySearchStrategyNode : public SearchStrategyNode {
+ public:
+  using FInitializeWithTuneContext = SearchStrategy::FInitializeWithTuneContext;
+  using FPreTuning = SearchStrategy::FPreTuning;
+  using FPostTuning = SearchStrategy::FPostTuning;
+  using FGenerateMeasureCandidates = SearchStrategy::FGenerateMeasureCandidates;
+  using FNotifyRunnerResults = SearchStrategy::FNotifyRunnerResults;
+  using FClone = SearchStrategy::FClone;
+
+  /*! \brief The packed function to the `InitializeWithTuneContext` method. */
+  FInitializeWithTuneContext f_initialize_with_tune_context;
+  /*! \brief The packed function to the `PreTuning` method. */
+  FPreTuning f_pre_tuning;
+  /*! \brief The packed function to the `PostTuning` method. */
+  FPostTuning f_post_tuning;
+  /*! \brief The packed function to the `GenerateMeasureCandidates` method. */
+  FGenerateMeasureCandidates f_generate_measure_candidates;
+  /*! \brief The packed function to the `NotifyRunnerResults` method. */
+  FNotifyRunnerResults f_notify_runner_results;
+  /*! \brief The packed function to the `Clone` method. */
+  FClone f_clone;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    // `f_initialize_with_tune_context` is not visited
+    // `f_pre_tuning` is not visited
+    // `f_post_tuning` is not visited
+    // `f_generate_measure_candidates` is not visited
+    // `f_notify_runner_results` is not visited
+    // `f_clone` is not visited
+  }
+
+  void InitializeWithTuneContext(const TuneContext& context) final;
+  void PreTuning(const Array<tir::Schedule>& design_spaces, const Optional<Database>& database,
+                 const Optional<CostModel>& cost_model) final;
+  void PostTuning() final;
+  Optional<Array<MeasureCandidate>> GenerateMeasureCandidates() final;
+  void NotifyRunnerResults(const Array<MeasureCandidate>& measure_candidates,
+                           const Array<RunnerResult>& results);
+  SearchStrategy Clone() const final;
+
+  static constexpr const char* _type_key = "meta_schedule.PySearchStrategy";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PySearchStrategyNode, SearchStrategyNode);
+};
+
 }  // namespace meta_schedule
 }  // namespace tvm
 
diff --git a/include/tvm/meta_schedule/space_generator.h b/include/tvm/meta_schedule/space_generator.h
index 2c1b2d4e4d7d..1e29e757a15c 100644
--- a/include/tvm/meta_schedule/space_generator.h
+++ b/include/tvm/meta_schedule/space_generator.h
@@ -31,6 +31,7 @@ namespace meta_schedule {
 
 // Forward declaration
 class TuneContext;
+class SpaceGenerator;
 
 /*!
  * \brief The abstract class for design space generation.
@@ -87,12 +88,21 @@ class SpaceGeneratorNode : public runtime::Object {
    */
   virtual Array<tir::Schedule> GenerateDesignSpace(const IRModule& mod) = 0;
 
+  /*!
+   * \brief Clone the space generator.
+   * \return The cloned space generator.
+   */
+  virtual SpaceGenerator Clone() const = 0;
+
   static constexpr const char* _type_key = "meta_schedule.SpaceGenerator";
   TVM_DECLARE_BASE_OBJECT_INFO(SpaceGeneratorNode, Object);
 };
 
-/*! \brief The design space generator with customized methods on the python-side. */
-class PySpaceGeneratorNode : public SpaceGeneratorNode {
+/*!
+ * \brief Managed reference to SpaceGeneratorNode.
+ * \sa SpaceGeneratorNode
+ */
+class SpaceGenerator : public runtime::ObjectRef {
  public:
   /*!
    * \brief The function type of `InitializeWithTuneContext` method.
@@ -105,29 +115,12 @@ class PySpaceGeneratorNode : public SpaceGeneratorNode {
    * \return The generated design spaces, i.e., schedules.
    */
   using FGenerateDesignSpace = runtime::TypedPackedFunc<Array<tir::Schedule>(const IRModule&)>;
+  /*!
+   * \brief The function type of `Clone` method.
+   * \return The cloned space generator.
+   */
+  using FClone = runtime::TypedPackedFunc<SpaceGenerator()>;
 
-  /*! \brief The packed function to the `InitializeWithTuneContext` function. */
-  FInitializeWithTuneContext f_initialize_with_tune_context;
-  /*! \brief The packed function to the `GenerateDesignSpace` function. */
-  FGenerateDesignSpace f_generate_design_space;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    // `f_initialize_with_tune_context` is not visited
-    // `f_generate_design_space` is not visited
-  }
-
-  void InitializeWithTuneContext(const TuneContext& context) final;
-  Array<tir::Schedule> GenerateDesignSpace(const IRModule& mod) final;
-
-  static constexpr const char* _type_key = "meta_schedule.PySpaceGenerator";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PySpaceGeneratorNode, SpaceGeneratorNode);
-};
-
-/*!
- * \brief Managed reference to SpaceGeneratorNode.
- * \sa SpaceGeneratorNode
- */
-class SpaceGenerator : public runtime::ObjectRef {
  protected:
   SpaceGenerator() = default;
 
@@ -136,11 +129,12 @@ class SpaceGenerator : public runtime::ObjectRef {
    * \brief Create a design space generator with customized methods on the python-side.
    * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`.
    * \param f_generate_design_space The packed function of `GenerateDesignSpace`.
+   * \param f_clone The packed function of `Clone`.
    * \return The design space generator created.
    */
   TVM_DLL static SpaceGenerator PySpaceGenerator(
-      PySpaceGeneratorNode::FInitializeWithTuneContext f_initialize_with_tune_context,
-      PySpaceGeneratorNode::FGenerateDesignSpace f_generate_design_space);
+      FInitializeWithTuneContext f_initialize_with_tune_context,
+      FGenerateDesignSpace f_generate_design_space, FClone f_clone);
   /*!
    * \brief Create a design space generator with customized schedule function.
    * \param schedule_fn The schedule function, which can have the following signatures:
@@ -156,14 +150,40 @@ class SpaceGenerator : public runtime::ObjectRef {
    */
   TVM_DLL static SpaceGenerator SpaceGeneratorUnion(Array<SpaceGenerator, void> space_generators);
   /*!
-   * \brief Create a design space generator that generates design spaces by applying schedule rules
-   *  to blocks in post-DFS order.
-   * \return The design space generator created.
+   * \brief Create a design space generator that generates design spaces by applying schedule
+   * rules to blocks in post-DFS order. \return The design space generator created.
    */
   TVM_DLL static SpaceGenerator PostOrderApply(runtime::PackedFunc f_block_filter = nullptr);
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(SpaceGenerator, ObjectRef, SpaceGeneratorNode);
 };
 
+/*! \brief The design space generator with customized methods on the python-side. */
+class PySpaceGeneratorNode : public SpaceGeneratorNode {
+ public:
+  using FInitializeWithTuneContext = SpaceGenerator::FInitializeWithTuneContext;
+  using FGenerateDesignSpace = SpaceGenerator::FGenerateDesignSpace;
+  using FClone = SpaceGenerator::FClone;
+  /*! \brief The packed function to the `InitializeWithTuneContext` function. */
+  FInitializeWithTuneContext f_initialize_with_tune_context;
+  /*! \brief The packed function to the `GenerateDesignSpace` function. */
+  FGenerateDesignSpace f_generate_design_space;
+  /*! \brief The packed function to the `Clone` function. */
+  FClone f_clone;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    // `f_initialize_with_tune_context` is not visited
+    // `f_generate_design_space` is not visited
+    // `f_clone` is not visited
+  }
+
+  void InitializeWithTuneContext(const TuneContext& context) final;
+  Array<tir::Schedule> GenerateDesignSpace(const IRModule& mod) final;
+  SpaceGenerator Clone() const final;
+
+  static constexpr const char* _type_key = "meta_schedule.PySpaceGenerator";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PySpaceGeneratorNode, SpaceGeneratorNode);
+};
+
 }  // namespace meta_schedule
 }  // namespace tvm
 
diff --git a/include/tvm/meta_schedule/tune_context.h b/include/tvm/meta_schedule/tune_context.h
index 3d732e7fbd99..4e2f00fb5a0c 100644
--- a/include/tvm/meta_schedule/tune_context.h
+++ b/include/tvm/meta_schedule/tune_context.h
@@ -43,6 +43,7 @@ namespace meta_schedule {
 
 class TaskSchedulerNode;
 class MeasureCallback;
+class TuneContext;
 
 /*! \brief The auto tuning context. */
 class TuneContextNode : public runtime::Object {
@@ -99,6 +100,11 @@ class TuneContextNode : public runtime::Object {
 
   /*! \brief Initialize members that needs initialization with tune context. */
   void Initialize();
+  /*!
+   * \brief Clone the tune context.
+   * \return The cloned tune context.
+   */
+  TuneContext Clone() const;
   /*! \brief Set the measure candidates from the SearchStrategy */
   void _SetMeasureCandidates(const Array<MeasureCandidate>& candidates);
   /*!
diff --git a/python/tvm/meta_schedule/mutator/mutator.py b/python/tvm/meta_schedule/mutator/mutator.py
index 0c8de9668034..c5286aced7d8 100644
--- a/python/tvm/meta_schedule/mutator/mutator.py
+++ b/python/tvm/meta_schedule/mutator/mutator.py
@@ -58,6 +58,16 @@ def apply(self, trace: Trace) -> Optional[Trace]:
         """
         return _ffi_api.MutatorApply(self, trace, -1)  # type: ignore # pylint: disable=no-member
 
+    def clone(self) -> "Mutator":
+        """Clone the mutator.
+
+        Returns
+        -------
+        mutator : Mutator
+            The cloned mutator.
+        """
+        return _ffi_api.MutatorClone(self)  # type: ignore # pylint: disable=no-member
+
 
 @register_object("meta_schedule.PyMutator")
 class _PyMutator(Mutator):
@@ -72,6 +82,7 @@ def __init__(
         self,
         f_initialize_with_tune_context: Callable = None,
         f_apply: Callable = None,
+        f_clone: Callable = None,
         f_as_string: Callable = None,
     ):
         """Constructor."""
@@ -80,6 +91,7 @@ def __init__(
             _ffi_api.MutatorPyMutator,  # type: ignore # pylint: disable=no-member
             f_initialize_with_tune_context,
             f_apply,
+            f_clone,
             f_as_string,
         )
 
@@ -94,7 +106,7 @@ class PyMutator:
 
     _tvm_metadata = {
         "cls": _PyMutator,
-        "methods": ["_initialize_with_tune_context", "apply", "__str__"],
+        "methods": ["_initialize_with_tune_context", "apply", "clone", "__str__"],
     }
 
     def _initialize_with_tune_context(self, context: "TuneContext") -> None:
@@ -122,6 +134,16 @@ def apply(self, trace: Trace, _) -> Optional[Trace]:
         """
         raise NotImplementedError
 
+    def clone(self) -> Mutator:
+        """Clone the mutator.
+
+        Returns
+        -------
+        mutator : Mutator
+            The cloned mutator.
+        """
+        raise NotImplementedError
+
     def __str__(self) -> str:
         """Get the mutator as string with name.
 
diff --git a/python/tvm/meta_schedule/postproc/postproc.py b/python/tvm/meta_schedule/postproc/postproc.py
index e37666bd1ce0..6eec2965ceeb 100644
--- a/python/tvm/meta_schedule/postproc/postproc.py
+++ b/python/tvm/meta_schedule/postproc/postproc.py
@@ -60,6 +60,16 @@ def apply(self, sch: Schedule) -> bool:
         """
         return _ffi_api.PostprocApply(self, sch)  # type: ignore # pylint: disable=no-member
 
+    def clone(self) -> "Postproc":
+        """Clone the postprocessor.
+
+        Returns
+        -------
+        cloned_postproc : Postproc
+            The cloned postprocessor.
+        """
+        return _ffi_api.PostprocClone(self)  # type: ignore # pylint: disable=no-member
+
 
 @register_object("meta_schedule.PyPostproc")
 class _PyPostproc(Postproc):
@@ -74,6 +84,7 @@ def __init__(
         self,
         f_initialize_with_tune_context: Callable = None,
         f_apply: Callable = None,
+        f_clone: Callable = None,
         f_as_string: Callable = None,
     ):
         """Constructor."""
@@ -82,6 +93,7 @@ def __init__(
             _ffi_api.PostprocPyPostproc,  # type: ignore # pylint: disable=no-member
             f_initialize_with_tune_context,
             f_apply,
+            f_clone,
             f_as_string,
         )
 
@@ -96,7 +108,7 @@ class PyPostproc:
 
     _tvm_metadata = {
         "cls": _PyPostproc,
-        "methods": ["_initialize_with_tune_context", "apply", "__str__"],
+        "methods": ["_initialize_with_tune_context", "apply", "clone", "__str__"],
     }
 
     def _initialize_with_tune_context(self, context: "TuneContext") -> None:
@@ -124,6 +136,16 @@ def apply(self, sch: Schedule) -> bool:
         """
         raise NotImplementedError
 
+    def clone(self) -> Postproc:
+        """Clone the postprocessor.
+
+        Returns
+        -------
+        cloned_postproc : Postproc
+            The cloned postprocessor.
+        """
+        raise NotImplementedError
+
     def __str__(self) -> str:
         """Get the post processor as string with name.
 
diff --git a/python/tvm/meta_schedule/schedule_rule/schedule_rule.py b/python/tvm/meta_schedule/schedule_rule/schedule_rule.py
index 481444341b86..2c8e223611aa 100644
--- a/python/tvm/meta_schedule/schedule_rule/schedule_rule.py
+++ b/python/tvm/meta_schedule/schedule_rule/schedule_rule.py
@@ -66,6 +66,16 @@ def apply(self, sch: Schedule, block: BlockRV) -> List[Schedule]:
             self, sch, block
         )
 
+    def clone(self) -> "ScheduleRule":
+        """Deep clone the schedule rule.
+
+        Returns
+        -------
+        cloned_rule : ScheduleRule
+            The cloned schedule rule.
+        """
+        return _ffi_api.ScheduleRuleClone(self)  # type: ignore # pylint: disable=no-member
+
 
 @register_object("meta_schedule.PyScheduleRule")
 class _PyScheduleRule(ScheduleRule):
@@ -80,6 +90,7 @@ def __init__(
         self,
         f_initialize_with_tune_context: Callable = None,
         f_apply: Callable = None,
+        f_clone: Callable = None,
         f_as_string: Callable = None,
     ):
         """Constructor."""
@@ -88,6 +99,7 @@ def __init__(
             _ffi_api.ScheduleRulePyScheduleRule,  # type: ignore # pylint: disable=no-member
             f_initialize_with_tune_context,
             f_apply,
+            f_clone,
             f_as_string,
         )
 
@@ -102,7 +114,7 @@ class PyScheduleRule:
 
     _tvm_metadata = {
         "cls": _PyScheduleRule,
-        "methods": ["_initialize_with_tune_context", "apply", "__str__"],
+        "methods": ["_initialize_with_tune_context", "apply", "clone", "__str__"],
     }
 
     def _initialize_with_tune_context(self, context: "TuneContext") -> None:
@@ -113,9 +125,7 @@ def _initialize_with_tune_context(self, context: "TuneContext") -> None:
         context : TuneContext
             The tuning context for initializing the schedule rule.
         """
-        _ffi_api.ScheduleRuleInitializeWithTuneContext(  # type: ignore # pylint: disable=no-member
-            self, context
-        )
+        raise NotImplementedError
 
     def apply(self, sch: Schedule, block: BlockRV) -> List[Schedule]:
         """Apply a schedule rule to the specific block in the given schedule.
@@ -132,9 +142,17 @@ def apply(self, sch: Schedule, block: BlockRV) -> List[Schedule]:
         design_spaces : List[Schedule]
             The list of schedules generated by applying the schedule rule.
         """
-        return _ffi_api.ScheduleRuleApply(  #  type: ignore # pylint: disable=no-member
-            self, sch, block
-        )
+        raise NotImplementedError
+
+    def clone(self) -> ScheduleRule:
+        """Deep clone the schedule rule.
+
+        Returns
+        -------
+        cloned_rule : ScheduleRule
+            The cloned schedule rule.
+        """
+        raise NotImplementedError
 
     def __str__(self) -> str:
         """Get the schedule rule as string with name.
diff --git a/python/tvm/meta_schedule/search_strategy/search_strategy.py b/python/tvm/meta_schedule/search_strategy/search_strategy.py
index e88cdf825a79..276e65713325 100644
--- a/python/tvm/meta_schedule/search_strategy/search_strategy.py
+++ b/python/tvm/meta_schedule/search_strategy/search_strategy.py
@@ -151,6 +151,16 @@ def notify_runner_results(
             results,
         )
 
+    def clone(self) -> "SearchStrategy":
+        """Clone the search strategy.
+
+        Returns
+        -------
+        cloned : SearchStrategy
+            The cloned search strategy.
+        """
+        return _ffi_api.SearchStrategyClone(self)  # type: ignore # pylint: disable=no-member
+
 
 @register_object("meta_schedule.PySearchStrategy")
 class _PySearchStrategy(SearchStrategy):
@@ -168,6 +178,7 @@ def __init__(
         f_post_tuning: Callable = None,
         f_generate_measure_candidates: Callable = None,
         f_notify_runner_results: Callable = None,
+        f_clone: Callable = None,
     ):
         """Constructor."""
 
@@ -178,6 +189,7 @@ def __init__(
             f_post_tuning,
             f_generate_measure_candidates,
             f_notify_runner_results,
+            f_clone,
         )
 
 
@@ -197,6 +209,7 @@ class PySearchStrategy:
             "post_tuning",
             "generate_measure_candidates",
             "notify_runner_results",
+            "clone",
         ],
     }
 
@@ -250,6 +263,16 @@ def notify_runner_results(
         """
         raise NotImplementedError
 
+    def clone(self) -> SearchStrategy:
+        """Clone the search strategy.
+
+        Returns
+        -------
+        strategy : SearchStrategy
+            The cloned search strategy.
+        """
+        raise NotImplementedError
+
 
 def create(  # pylint: disable=keyword-arg-before-vararg
     kind: Literal[
diff --git a/python/tvm/meta_schedule/space_generator/space_generator.py b/python/tvm/meta_schedule/space_generator/space_generator.py
index 9d7ebf3bae26..23c0361645b5 100644
--- a/python/tvm/meta_schedule/space_generator/space_generator.py
+++ b/python/tvm/meta_schedule/space_generator/space_generator.py
@@ -72,6 +72,16 @@ def generate_design_space(self, mod: IRModule) -> List[Schedule]:
         """
         return _ffi_api.SpaceGeneratorGenerateDesignSpace(self, mod)  # type: ignore # pylint: disable=no-member
 
+    def clone(self) -> "SpaceGenerator":
+        """Clone the design space generator.
+
+        Returns
+        -------
+        cloned_sg : SpaceGenerator
+            The cloned design space generator.
+        """
+        return _ffi_api.SpaceGeneratorClone(self)  # type: ignore # pylint: disable=no-member
+
 
 ScheduleFnType = SpaceGenerator.ScheduleFnType
 
@@ -89,6 +99,7 @@ def __init__(
         self,
         f_initialize_with_tune_context: Optional[Callable] = None,
         f_generate_design_space: Optional[Callable] = None,
+        f_clone: Optional[Callable] = None,
     ):
         """Constructor."""
 
@@ -96,6 +107,7 @@ def __init__(
             _ffi_api.SpaceGeneratorPySpaceGenerator,  # type: ignore # pylint: disable=no-member
             f_initialize_with_tune_context,
             f_generate_design_space,
+            f_clone,
         )
 
 
@@ -109,7 +121,7 @@ class PySpaceGenerator:
 
     _tvm_metadata = {
         "cls": _PySpaceGenerator,
-        "methods": ["_initialize_with_tune_context", "generate_design_space"],
+        "methods": ["_initialize_with_tune_context", "generate_design_space", "clone"],
     }
 
     def _initialize_with_tune_context(self, context: "TuneContext") -> None:
@@ -137,6 +149,16 @@ def generate_design_space(self, mod: IRModule) -> List[Schedule]:
         """
         raise NotImplementedError
 
+    def clone(self) -> SpaceGenerator:
+        """Clone the design space generator.
+
+        Returns
+        -------
+        cloned_sg : SpaceGenerator
+            The cloned design space generator.
+        """
+        raise NotImplementedError
+
 
 def create(  # pylint: disable=keyword-arg-before-vararg
     kind: Union[
diff --git a/python/tvm/meta_schedule/testing/dummy_object.py b/python/tvm/meta_schedule/testing/dummy_object.py
index 50ae974df5d8..bb2294544920 100644
--- a/python/tvm/meta_schedule/testing/dummy_object.py
+++ b/python/tvm/meta_schedule/testing/dummy_object.py
@@ -58,3 +58,6 @@ def _initialize_with_tune_context(self, context: "TuneContext") -> None:
 
     def apply(self, trace: Trace, _) -> Optional[Trace]:
         return Trace(trace.insts, {})
+
+    def clone(self):
+        return DummyMutator()
diff --git a/python/tvm/meta_schedule/tune_context.py b/python/tvm/meta_schedule/tune_context.py
index 17acad8d4a57..29cd94110c0c 100644
--- a/python/tvm/meta_schedule/tune_context.py
+++ b/python/tvm/meta_schedule/tune_context.py
@@ -331,3 +331,13 @@ def notify_runner_results(
                 "Please construct TuneContext with search_strategy"
             )
         return self.search_strategy.notify_runner_results(measure_candidates, results)
+
+    def clone(self) -> "TuneContext":
+        """Clone the TuneContext.
+
+        Returns
+        -------
+        cloned_context : TuneContext
+            The cloned TuneContext.
+        """
+        return _ffi_api.TuneContextClone(self)  # type: ignore # pylint: disable=no-member
diff --git a/src/meta_schedule/mutator/mutate_compute_location.cc b/src/meta_schedule/mutator/mutate_compute_location.cc
index 9d6d69ba355f..2a31d2da9b53 100644
--- a/src/meta_schedule/mutator/mutate_compute_location.cc
+++ b/src/meta_schedule/mutator/mutate_compute_location.cc
@@ -42,6 +42,11 @@ class MutateComputeLocationNode : public MutatorNode {
   }
   // Inherit from `MutatorNode`
   Optional<Trace> Apply(const Trace& trace, TRandState* rand_state) final;
+  // Inherit from `MutatorNode`
+  Mutator Clone() const final {
+    ObjectPtr<MutateComputeLocationNode> n = make_object<MutateComputeLocationNode>(*this);
+    return Mutator(n);
+  }
 
  private:
   struct Candidate {
diff --git a/src/meta_schedule/mutator/mutate_parallel.cc b/src/meta_schedule/mutator/mutate_parallel.cc
index 82b91da682c6..9feb4747d807 100644
--- a/src/meta_schedule/mutator/mutate_parallel.cc
+++ b/src/meta_schedule/mutator/mutate_parallel.cc
@@ -188,6 +188,11 @@ class MutateParallelNode : public MutatorNode {
   }
   // Inherit from `MutatorNode`
   Optional<Trace> Apply(const Trace& trace, TRandState* rand_state) final;
+  // Inherit from `MutatorNode`
+  Mutator Clone() const final {
+    ObjectPtr<MutateParallelNode> n = make_object<MutateParallelNode>(*this);
+    return Mutator(n);
+  }
 };
 
 /*! \brief The candidate to be mutated */
diff --git a/src/meta_schedule/mutator/mutate_thread_binding.cc b/src/meta_schedule/mutator/mutate_thread_binding.cc
index de780b53e2d9..f5d89a85092b 100644
--- a/src/meta_schedule/mutator/mutate_thread_binding.cc
+++ b/src/meta_schedule/mutator/mutate_thread_binding.cc
@@ -42,6 +42,11 @@ class MutateThreadBindingNode : public MutatorNode {
   }
   // Inherit from `MutatorNode`
   Optional<Trace> Apply(const Trace& trace, TRandState* rand_state) final;
+  // Inherit from `MutatorNode`
+  Mutator Clone() const final {
+    ObjectPtr<MutateThreadBindingNode> n = make_object<MutateThreadBindingNode>(*this);
+    return Mutator(n);
+  }
 
  private:
   struct Candidate {
diff --git a/src/meta_schedule/mutator/mutate_tile_size.cc b/src/meta_schedule/mutator/mutate_tile_size.cc
index 4a3bfda8a4a8..8fb83147ea7b 100644
--- a/src/meta_schedule/mutator/mutate_tile_size.cc
+++ b/src/meta_schedule/mutator/mutate_tile_size.cc
@@ -63,6 +63,11 @@ class MutateTileSizeNode : public MutatorNode {
   void InitializeWithTuneContext(const TuneContext& context) final {}
   // Inherit from `MutatorNode`
   Optional<Trace> Apply(const Trace& trace, TRandState* rand_state) final;
+  // Inherit from `MutatorNode`
+  Mutator Clone() const final {
+    ObjectPtr<MutateTileSizeNode> n = make_object<MutateTileSizeNode>(*this);
+    return Mutator(n);
+  }
 };
 
 /*!
diff --git a/src/meta_schedule/mutator/mutate_unroll.cc b/src/meta_schedule/mutator/mutate_unroll.cc
index c282a171c3b7..7bbf00343af3 100644
--- a/src/meta_schedule/mutator/mutate_unroll.cc
+++ b/src/meta_schedule/mutator/mutate_unroll.cc
@@ -60,6 +60,11 @@ class MutateUnrollNode : public MutatorNode {
   void InitializeWithTuneContext(const TuneContext& context) final {}
   // Inherit from `MutatorNode`
   Optional<Trace> Apply(const Trace& trace, TRandState* rand_state) final;
+  // Inherit from `MutatorNode`
+  Mutator Clone() const final {
+    ObjectPtr<MutateUnrollNode> n = make_object<MutateUnrollNode>(*this);
+    return Mutator(n);
+  }
 };
 
 /*! \brief A candidate to be mutated */
diff --git a/src/meta_schedule/mutator/mutator.cc b/src/meta_schedule/mutator/mutator.cc
index 43b95000c71d..25312ab61f99 100644
--- a/src/meta_schedule/mutator/mutator.cc
+++ b/src/meta_schedule/mutator/mutator.cc
@@ -33,13 +33,20 @@ Optional<tir::Trace> PyMutatorNode::Apply(
   return f_apply(trace, *rand_state);
 }
 
+Mutator PyMutatorNode::Clone() const {
+  ICHECK(f_clone != nullptr) << "PyMutator's Clone method not implemented!";
+  return f_clone();
+}
+
 Mutator Mutator::PyMutator(
     PyMutatorNode::FInitializeWithTuneContext f_initialize_with_tune_context,  //
     PyMutatorNode::FApply f_apply,                                             //
+    PyMutatorNode::FClone f_clone,                                             //
     PyMutatorNode::FAsString f_as_string) {
   ObjectPtr<PyMutatorNode> n = make_object<PyMutatorNode>();
   n->f_initialize_with_tune_context = std::move(f_initialize_with_tune_context);
   n->f_apply = std::move(f_apply);
+  n->f_clone = std::move(f_clone);
   n->f_as_string = std::move(f_as_string);
   return Mutator(n);
 }
@@ -63,6 +70,7 @@ TVM_REGISTER_GLOBAL("meta_schedule.MutatorApply")
       TRandState seed_ = (seed != -1) ? seed : support::LinearCongruentialEngine::DeviceRandom();
       return self->Apply(trace, &seed_);
     });
+TVM_REGISTER_GLOBAL("meta_schedule.MutatorClone").set_body_method<Mutator>(&MutatorNode::Clone);
 TVM_REGISTER_GLOBAL("meta_schedule.MutatorPyMutator").set_body_typed(Mutator::PyMutator);
 
 }  // namespace meta_schedule
diff --git a/src/meta_schedule/postproc/disallow_dynamic_loop.cc b/src/meta_schedule/postproc/disallow_dynamic_loop.cc
index 85a81f10fdcd..8362da552ea5 100644
--- a/src/meta_schedule/postproc/disallow_dynamic_loop.cc
+++ b/src/meta_schedule/postproc/disallow_dynamic_loop.cc
@@ -67,6 +67,11 @@ class DisallowDynamicLoopNode : public PostprocNode {
   void InitializeWithTuneContext(const TuneContext& context) final {}
   // Inherited from PostprocNode
   bool Apply(const tir::Schedule& sch) final { return !tir::DynamicExtentFinder::Find(sch->mod()); }
+  // Inherited from PostprocNode
+  Postproc Clone() const {
+    ObjectPtr<DisallowDynamicLoopNode> n = make_object<DisallowDynamicLoopNode>(*this);
+    return Postproc(n);
+  }
 
   static constexpr const char* _type_key = "meta_schedule.DisallowDynamicLoop";
   TVM_DECLARE_FINAL_OBJECT_INFO(DisallowDynamicLoopNode, PostprocNode);
diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc
index 0f4f1b1192f6..957d6e7364e4 100644
--- a/src/meta_schedule/postproc/postproc.cc
+++ b/src/meta_schedule/postproc/postproc.cc
@@ -32,13 +32,20 @@ bool PyPostprocNode::Apply(const tir::Schedule& sch) {
   return f_apply(sch);
 }
 
+Postproc PyPostprocNode::Clone() const {
+  ICHECK(f_clone != nullptr) << "PyPostproc's Clone method not implemented!";
+  return f_clone();
+}
+
 Postproc Postproc::PyPostproc(
     PyPostprocNode::FInitializeWithTuneContext f_initialize_with_tune_context,  //
     PyPostprocNode::FApply f_apply,                                             //
+    PyPostprocNode::FClone f_clone,                                             //
     PyPostprocNode::FAsString f_as_string) {
   ObjectPtr<PyPostprocNode> n = make_object<PyPostprocNode>();
   n->f_initialize_with_tune_context = std::move(f_initialize_with_tune_context);
   n->f_apply = std::move(f_apply);
+  n->f_clone = std::move(f_clone);
   n->f_as_string = std::move(f_as_string);
   return Postproc(n);
 }
@@ -58,6 +65,7 @@ TVM_REGISTER_NODE_TYPE(PyPostprocNode);
 TVM_REGISTER_GLOBAL("meta_schedule.PostprocInitializeWithTuneContext")
     .set_body_method<Postproc>(&PostprocNode::InitializeWithTuneContext);
 TVM_REGISTER_GLOBAL("meta_schedule.PostprocApply").set_body_method<Postproc>(&PostprocNode::Apply);
+TVM_REGISTER_GLOBAL("meta_schedule.PostprocClone").set_body_method<Postproc>(&PostprocNode::Clone);
 TVM_REGISTER_GLOBAL("meta_schedule.PostprocPyPostproc").set_body_typed(Postproc::PyPostproc);
 
 }  // namespace meta_schedule
diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc
index d111bdb42abb..ac9f45ca8ef4 100644
--- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc
+++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc
@@ -104,6 +104,11 @@ class RewriteCooperativeFetchNode : public PostprocNode {
   // Inherited from PostprocNode
   bool Apply(const tir::Schedule& sch) final;
 
+  Postproc Clone() const {
+    ObjectPtr<RewriteCooperativeFetchNode> n = make_object<RewriteCooperativeFetchNode>(*this);
+    return Postproc(n);
+  }
+
   void VisitAttrs(tvm::AttrVisitor* v) {}
 
   static constexpr const char* _type_key = "meta_schedule.RewriteCooperativeFetch";
diff --git a/src/meta_schedule/postproc/rewrite_layout.cc b/src/meta_schedule/postproc/rewrite_layout.cc
index f4cbdfe737fb..6ff9958c791f 100644
--- a/src/meta_schedule/postproc/rewrite_layout.cc
+++ b/src/meta_schedule/postproc/rewrite_layout.cc
@@ -167,6 +167,11 @@ class RewriteLayoutNode : public PostprocNode {
   // Inherited from PostprocNode
   bool Apply(const tir::Schedule& sch) final { return tir::RewriteLayout(sch); }
 
+  Postproc Clone() const {
+    ObjectPtr<RewriteLayoutNode> n = make_object<RewriteLayoutNode>(*this);
+    return Postproc(n);
+  }
+
   static constexpr const char* _type_key = "meta_schedule.RewriteLayout";
   TVM_DECLARE_FINAL_OBJECT_INFO(RewriteLayoutNode, PostprocNode);
 };
diff --git a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc
index 08d25d017840..c3cc0ef60152 100644
--- a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc
+++ b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc
@@ -384,6 +384,12 @@ class RewriteParallelVectorizeUnrollNode : public PostprocNode {
     return true;
   }
 
+  Postproc Clone() const {
+    ObjectPtr<RewriteParallelVectorizeUnrollNode> n =
+        make_object<RewriteParallelVectorizeUnrollNode>(*this);
+    return Postproc(n);
+  }
+
   static constexpr const char* _type_key = "meta_schedule.RewriteParallelVectorizeUnroll";
   TVM_DECLARE_FINAL_OBJECT_INFO(RewriteParallelVectorizeUnrollNode, PostprocNode);
 };
diff --git a/src/meta_schedule/postproc/rewrite_reduction_block.cc b/src/meta_schedule/postproc/rewrite_reduction_block.cc
index ea204e306133..05a7640f047c 100644
--- a/src/meta_schedule/postproc/rewrite_reduction_block.cc
+++ b/src/meta_schedule/postproc/rewrite_reduction_block.cc
@@ -114,6 +114,11 @@ class RewriteReductionBlockNode : public PostprocNode {
   // Inherited from PostprocNode
   bool Apply(const tir::Schedule& sch) final;
 
+  Postproc Clone() const {
+    ObjectPtr<RewriteReductionBlockNode> n = make_object<RewriteReductionBlockNode>(*this);
+    return Postproc(n);
+  }
+
   void VisitAttrs(tvm::AttrVisitor* v) {}
 
   static constexpr const char* _type_key = "meta_schedule.RewriteReductionBlock";
diff --git a/src/meta_schedule/postproc/rewrite_tensorize.cc b/src/meta_schedule/postproc/rewrite_tensorize.cc
index 3b6c438d0216..4f8e0fb213f8 100644
--- a/src/meta_schedule/postproc/rewrite_tensorize.cc
+++ b/src/meta_schedule/postproc/rewrite_tensorize.cc
@@ -68,6 +68,11 @@ class RewriteTensorizeNode : public PostprocNode {
 
   void VisitAttrs(tvm::AttrVisitor* v) {}
 
+  Postproc Clone() const {
+    ObjectPtr<RewriteTensorizeNode> n = make_object<RewriteTensorizeNode>(*this);
+    return Postproc(n);
+  }
+
   bool vectorize_init_loop = false;
 
   static constexpr const char* _type_key = "meta_schedule.RewriteTensorize";
diff --git a/src/meta_schedule/postproc/rewrite_unbound_block.cc b/src/meta_schedule/postproc/rewrite_unbound_block.cc
index eb57e90f82f6..1ba68538ea04 100644
--- a/src/meta_schedule/postproc/rewrite_unbound_block.cc
+++ b/src/meta_schedule/postproc/rewrite_unbound_block.cc
@@ -97,6 +97,11 @@ class RewriteUnboundBlockNode : public PostprocNode {
   // Inherited from PostprocNode
   bool Apply(const tir::Schedule& sch) final;
 
+  Postproc Clone() const {
+    ObjectPtr<RewriteUnboundBlockNode> n = make_object<RewriteUnboundBlockNode>(*this);
+    return Postproc(n);
+  }
+
  public:
   /*! \brief The max number of threads per block from Target */
   int max_threads_per_block_ = -1;
diff --git a/src/meta_schedule/postproc/verify_gpu_code.cc b/src/meta_schedule/postproc/verify_gpu_code.cc
index dfe2c5a06a17..0828ee538427 100644
--- a/src/meta_schedule/postproc/verify_gpu_code.cc
+++ b/src/meta_schedule/postproc/verify_gpu_code.cc
@@ -196,6 +196,12 @@ class VerifyGPUCodeNode : public PostprocNode {
     return true;
   }
 
+  Postproc Clone() const {
+    ObjectPtr<VerifyGPUCodeNode> n = make_object<VerifyGPUCodeNode>(*this);
+    n->target_constraints_ = this->target_constraints_;
+    return Postproc(n);
+  }
+
   static constexpr const char* _type_key = "meta_schedule.VerifyGPUCode";
   TVM_DECLARE_FINAL_OBJECT_INFO(VerifyGPUCodeNode, PostprocNode);
 };
diff --git a/src/meta_schedule/schedule_rule/add_rfactor.cc b/src/meta_schedule/schedule_rule/add_rfactor.cc
index cf87f24ac233..2fc1352677cb 100644
--- a/src/meta_schedule/schedule_rule/add_rfactor.cc
+++ b/src/meta_schedule/schedule_rule/add_rfactor.cc
@@ -36,6 +36,12 @@ class AddRFactorNode : public ScheduleRuleNode {
   // Inherited from ScheduleRuleNode
   Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv);
 
+  // Inherited from ScheduleRuleNode
+  ScheduleRule Clone() const final {
+    ObjectPtr<AddRFactorNode> n = make_object<AddRFactorNode>(*this);
+    return ScheduleRule(n);
+  }
+
  public:
   /*!
    * \brief The maximum number of jobs to be launched per core.
diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc
index d8f52fa8e1de..7af1418d8f3e 100644
--- a/src/meta_schedule/schedule_rule/auto_bind.cc
+++ b/src/meta_schedule/schedule_rule/auto_bind.cc
@@ -177,6 +177,12 @@ class AutoBindNode : public ScheduleRuleNode {
   // Inherited from ScheduleRuleNode
   Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) final;
 
+  // Inherited from ScheduleRuleNode
+  ScheduleRule Clone() const final {
+    ObjectPtr<AutoBindNode> n = make_object<AutoBindNode>(*this);
+    return ScheduleRule(n);
+  }
+
  public:
   /*! \brief The max number of threads per block from Target */
   int64_t max_threads_per_block_ = -1;
diff --git a/src/meta_schedule/schedule_rule/auto_inline.cc b/src/meta_schedule/schedule_rule/auto_inline.cc
index 446c8ead7e8e..dcdc83f95cb1 100644
--- a/src/meta_schedule/schedule_rule/auto_inline.cc
+++ b/src/meta_schedule/schedule_rule/auto_inline.cc
@@ -60,6 +60,12 @@ class AutoInlineNode : public ScheduleRuleNode {
     return {sch};
   }
 
+  // Inherited from ScheduleRuleNode
+  ScheduleRule Clone() const final {
+    ObjectPtr<AutoInlineNode> n = make_object<AutoInlineNode>(*this);
+    return ScheduleRule(n);
+  }
+
  public:
   /*! \brief If allows to inline a block into its producer */
   bool into_producer;
diff --git a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
index 35be33f72e21..f2fc67f74cc7 100644
--- a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
+++ b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
@@ -113,6 +113,12 @@ class CrossThreadReductionNode : public ScheduleRuleNode {
     return {tmp_sch, sch};
   }
 
+  // Inherited from ScheduleRuleNode
+  ScheduleRule Clone() const final {
+    ObjectPtr<CrossThreadReductionNode> n = make_object<CrossThreadReductionNode>(*this);
+    return ScheduleRule(n);
+  }
+
  private:
   /*!
    * \brief Check whether the input block is in thread scope, i.e., some of its outer loop is
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
index c126c854462c..1625a27b9aaf 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -104,6 +104,12 @@ Array<Schedule> MultiLevelTilingNode::Apply(const Schedule& sch, const BlockRV&
   return results;
 }
 
+// Inherited from ScheduleRuleNode
+ScheduleRule MultiLevelTilingNode::Clone() const {
+  ObjectPtr<MultiLevelTilingNode> n = make_object<MultiLevelTilingNode>(*this);
+  return ScheduleRule(n);
+}
+
 std::vector<State> MultiLevelTilingNode::ApplySubRules(std::vector<State> states) {
   states = SubRule(std::move(states), [&](State state) { return TileLoopNest(std::move(state)); });
   states = SubRule(std::move(states), [&](State state) { return AddWriteReuse(std::move(state)); });
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.h b/src/meta_schedule/schedule_rule/multi_level_tiling.h
index 9161a972c187..47da878c3be0 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.h
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.h
@@ -155,6 +155,9 @@ class MultiLevelTilingNode : public ScheduleRuleNode {
   // Entry of the mega rule; Inherited from ScheduleRuleNode
   Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) override;
 
+  // Inherited from ScheduleRuleNode
+  ScheduleRule Clone() const override;
+
  protected:
   virtual std::vector<State> ApplySubRules(std::vector<State> states);
 
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 7ddda9b2635b..13b00fa7deb6 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -137,6 +137,13 @@ class MultiLevelTilingTensorCoreNode : public MultiLevelTilingNode {
   // Override Apply to apply tensorization-specific analysis before applying sub-rules
   Array<Schedule> Apply(const Schedule& sch, const BlockRV& block_rv) final;
 
+  // Inherited from ScheduleRuleNode
+  ScheduleRule Clone() const final {
+    ObjectPtr<MultiLevelTilingTensorCoreNode> n =
+        make_object<MultiLevelTilingTensorCoreNode>(*this);
+    return ScheduleRule(n);
+  }
+
   /*!
    * \brief Transform and tensorize with the given tensor intrin
    * \param state The state of the meta schedule rule
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
index 3a299ed041e2..b953d1ad4b50 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
@@ -63,6 +63,13 @@ class MultiLevelTilingWithIntrinNode : public MultiLevelTilingNode {
     return res;
   }
 
+  // Inherited from ScheduleRuleNode
+  ScheduleRule Clone() const final {
+    ObjectPtr<MultiLevelTilingWithIntrinNode> n =
+        make_object<MultiLevelTilingWithIntrinNode>(*this);
+    return ScheduleRule(n);
+  }
+
   // Override ApplySubRules to tile the inner loops according to the given tensor intrinsic, then
   // tile the outerloops.
   virtual std::vector<State> ApplySubRules(std::vector<State> states) {
diff --git a/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc b/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc
index 19758996e608..045aa85b73ad 100644
--- a/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc
+++ b/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc
@@ -79,6 +79,13 @@ class ParallelizeVectorizeUnrollNode : public ScheduleRuleNode {
     return {sch};
   }
 
+  // Inherited from ScheduleRuleNode
+  ScheduleRule Clone() const final {
+    ObjectPtr<ParallelizeVectorizeUnrollNode> n =
+        make_object<ParallelizeVectorizeUnrollNode>(*this);
+    return ScheduleRule(n);
+  }
+
  public:
   /*!
    * \brief The maximum number of jobs to be launched per CPU core. It sets the
diff --git a/src/meta_schedule/schedule_rule/random_compute_location.cc b/src/meta_schedule/schedule_rule/random_compute_location.cc
index 65988dfd5688..7796eddd44d3 100644
--- a/src/meta_schedule/schedule_rule/random_compute_location.cc
+++ b/src/meta_schedule/schedule_rule/random_compute_location.cc
@@ -57,6 +57,12 @@ class RandomComputeLocationNode : public ScheduleRuleNode {
     return {res};
   }
 
+  // Inherited from ScheduleRuleNode
+  ScheduleRule Clone() const final {
+    ObjectPtr<RandomComputeLocationNode> n = make_object<RandomComputeLocationNode>(*this);
+    return ScheduleRule(n);
+  }
+
  private:
   bool CheckConditions(const tir::Schedule sch, const tir::BlockRV& block_rv) const {
     tir::StmtSRef block_sref = sch->GetSRef(block_rv);
diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc
index 80f8725b0c0d..416b43f46d56 100644
--- a/src/meta_schedule/schedule_rule/schedule_rule.cc
+++ b/src/meta_schedule/schedule_rule/schedule_rule.cc
@@ -33,13 +33,20 @@ Array<tir::Schedule> PyScheduleRuleNode::Apply(const tir::Schedule& sch,
   return f_apply(sch, block);
 }
 
+ScheduleRule PyScheduleRuleNode::Clone() const {
+  ICHECK(f_clone != nullptr) << "PyScheduleRule's Clone method not implemented!";
+  return f_clone();
+}
+
 ScheduleRule ScheduleRule::PyScheduleRule(
     PyScheduleRuleNode::FInitializeWithTuneContext f_initialize_with_tune_context,  //
     PyScheduleRuleNode::FApply f_apply,                                             //
+    PyScheduleRuleNode::FClone f_clone,                                             //
     PyScheduleRuleNode::FAsString f_as_string) {
   ObjectPtr<PyScheduleRuleNode> n = make_object<PyScheduleRuleNode>();
   n->f_initialize_with_tune_context = std::move(f_initialize_with_tune_context);
   n->f_apply = std::move(f_apply);
+  n->f_clone = std::move(f_clone);
   n->f_as_string = std::move(f_as_string);
   return ScheduleRule(n);
 }
@@ -60,6 +67,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleInitializeWithTuneContext")
     .set_body_method<ScheduleRule>(&ScheduleRuleNode::InitializeWithTuneContext);
 TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleApply")
     .set_body_method<ScheduleRule>(&ScheduleRuleNode::Apply);
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleClone")
+    .set_body_method<ScheduleRule>(&ScheduleRuleNode::Clone);
 TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRulePyScheduleRule")
     .set_body_typed(ScheduleRule::PyScheduleRule);
 
diff --git a/src/meta_schedule/search_strategy/evolutionary_search.cc b/src/meta_schedule/search_strategy/evolutionary_search.cc
index c5ff9008effe..5930704eb0d1 100644
--- a/src/meta_schedule/search_strategy/evolutionary_search.cc
+++ b/src/meta_schedule/search_strategy/evolutionary_search.cc
@@ -431,6 +431,24 @@ class EvolutionarySearchNode : public SearchStrategyNode {
     ICHECK(this->state_ != nullptr);
     this->state_->NotifyRunnerResults(measure_candidates, results);
   }
+
+  SearchStrategy Clone() const final {
+    ObjectPtr<EvolutionarySearchNode> n = make_object<EvolutionarySearchNode>();
+    n->max_trials_per_task = this->max_trials_per_task;
+    n->num_trials_per_iter = this->num_trials_per_iter;
+    n->population_size = this->population_size;
+    n->num_empty_iters_before_early_stop = this->num_empty_iters_before_early_stop;
+    n->init_measured_ratio = this->init_measured_ratio;
+    n->init_min_unmeasured = this->init_min_unmeasured;
+    n->genetic_num_iters = this->genetic_num_iters;
+    n->genetic_mutate_prob = this->genetic_mutate_prob;
+    n->genetic_max_fail_count = this->genetic_max_fail_count;
+    n->eps_greedy = this->eps_greedy;
+    n->context_ = this->context_;
+    n->rand_state_ = this->rand_state_;
+    n->state_ = nullptr;  // cleared the state
+    return SearchStrategy(n);
+  }
 };
 
 std::vector<Schedule> EvolutionarySearchNode::State::PickBestFromDatabase(int num) {
diff --git a/src/meta_schedule/search_strategy/replay_func.cc b/src/meta_schedule/search_strategy/replay_func.cc
index 4574c1c817a8..6914ab2f0f0a 100644
--- a/src/meta_schedule/search_strategy/replay_func.cc
+++ b/src/meta_schedule/search_strategy/replay_func.cc
@@ -100,6 +100,16 @@ class ReplayFuncNode : public SearchStrategyNode {
     ICHECK(this->state_ != nullptr);
     this->state_->NotifyRunnerResults(results);
   }
+
+  SearchStrategy Clone() const final {
+    ObjectPtr<ReplayFuncNode> n = make_object<ReplayFuncNode>();
+    n->num_trials_per_iter = this->num_trials_per_iter;
+    n->max_trials_per_task = this->max_trials_per_task;
+    n->context_ = this->context_;
+    n->rand_state_ = this->rand_state_;
+    n->state_ = nullptr;  // cleared the state
+    return SearchStrategy(n);
+  }
 };
 
 inline Optional<Array<MeasureCandidate>> ReplayFuncNode::State::GenerateMeasureCandidates() {
diff --git a/src/meta_schedule/search_strategy/replay_trace.cc b/src/meta_schedule/search_strategy/replay_trace.cc
index 64fc68394357..bd553bf037d1 100644
--- a/src/meta_schedule/search_strategy/replay_trace.cc
+++ b/src/meta_schedule/search_strategy/replay_trace.cc
@@ -118,6 +118,17 @@ class ReplayTraceNode : public SearchStrategyNode {
     ICHECK(this->state_ != nullptr);
     this->state_->NotifyRunnerResults(results);
   }
+
+  SearchStrategy Clone() const final {
+    ObjectPtr<ReplayTraceNode> n = make_object<ReplayTraceNode>();
+    n->num_trials_per_iter = this->num_trials_per_iter;
+    n->max_trials_per_task = this->max_trials_per_task;
+    n->max_fail_count = this->max_fail_count;
+    n->context_ = this->context_;
+    n->rand_state_ = this->rand_state_;
+    n->state_ = nullptr;  // cleared the state
+    return SearchStrategy(n);
+  }
 };
 
 inline Optional<Array<MeasureCandidate>> ReplayTraceNode::State::GenerateMeasureCandidates() {
diff --git a/src/meta_schedule/search_strategy/search_strategy.cc b/src/meta_schedule/search_strategy/search_strategy.cc
index 5865fc842248..81c7fda315b4 100644
--- a/src/meta_schedule/search_strategy/search_strategy.cc
+++ b/src/meta_schedule/search_strategy/search_strategy.cc
@@ -59,18 +59,25 @@ void PySearchStrategyNode::NotifyRunnerResults(const Array<MeasureCandidate>& me
   f_notify_runner_results(measure_candidates, results);
 }
 
+SearchStrategy PySearchStrategyNode::Clone() const {
+  ICHECK(f_clone != nullptr) << "PySearchStrategy's Clone method not implemented!";
+  return f_clone();
+}
+
 SearchStrategy SearchStrategy::PySearchStrategy(
     PySearchStrategyNode::FInitializeWithTuneContext f_initialize_with_tune_context,  //
     PySearchStrategyNode::FPreTuning f_pre_tuning,                                    //
     PySearchStrategyNode::FPostTuning f_post_tuning,                                  //
     PySearchStrategyNode::FGenerateMeasureCandidates f_generate_measure_candidates,   //
-    PySearchStrategyNode::FNotifyRunnerResults f_notify_runner_results) {
+    PySearchStrategyNode::FNotifyRunnerResults f_notify_runner_results,               //
+    PySearchStrategyNode::FClone f_clone) {
   ObjectPtr<PySearchStrategyNode> n = make_object<PySearchStrategyNode>();
   n->f_initialize_with_tune_context = f_initialize_with_tune_context;
   n->f_pre_tuning = f_pre_tuning;
   n->f_post_tuning = f_post_tuning;
   n->f_generate_measure_candidates = f_generate_measure_candidates;
   n->f_notify_runner_results = f_notify_runner_results;
+  n->f_clone = f_clone;
   return SearchStrategy(n);
 }
 
@@ -94,6 +101,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.SearchStrategyGenerateMeasureCandidates")
     .set_body_method<SearchStrategy>(&SearchStrategyNode::GenerateMeasureCandidates);
 TVM_REGISTER_GLOBAL("meta_schedule.SearchStrategyNotifyRunnerResults")
     .set_body_method<SearchStrategy>(&SearchStrategyNode::NotifyRunnerResults);
+TVM_REGISTER_GLOBAL("meta_schedule.SearchStrategyClone")
+    .set_body_method<SearchStrategy>(&SearchStrategyNode::Clone);
 
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/space_generator/post_order_apply.cc b/src/meta_schedule/space_generator/post_order_apply.cc
index 9be89e2d9c70..991e4fa08047 100644
--- a/src/meta_schedule/space_generator/post_order_apply.cc
+++ b/src/meta_schedule/space_generator/post_order_apply.cc
@@ -188,6 +188,15 @@ class PostOrderApplyNode : public SpaceGeneratorNode {
     }
     return result;
   }
+
+  SpaceGenerator Clone() const final {
+    ObjectPtr<PostOrderApplyNode> n = make_object<PostOrderApplyNode>(*this);
+    n->sch_rules_ = Array<ScheduleRule>();
+    for (const ScheduleRule& sch_rule : this->sch_rules_) {
+      n->sch_rules_.push_back(sch_rule->Clone());
+    }
+    return SpaceGenerator(n);
+  }
   static constexpr const char* _type_key = "meta_schedule.PostOrderApply";
   TVM_DECLARE_FINAL_OBJECT_INFO(PostOrderApplyNode, SpaceGeneratorNode);
 };
diff --git a/src/meta_schedule/space_generator/schedule_fn.cc b/src/meta_schedule/space_generator/schedule_fn.cc
index 70559fbcf1fb..adea139b1cd4 100644
--- a/src/meta_schedule/space_generator/schedule_fn.cc
+++ b/src/meta_schedule/space_generator/schedule_fn.cc
@@ -72,6 +72,11 @@ class ScheduleFnNode : public SpaceGeneratorNode {
     throw;
   }
 
+  SpaceGenerator Clone() const final {
+    ObjectPtr<ScheduleFnNode> n = make_object<ScheduleFnNode>(*this);
+    return SpaceGenerator(n);
+  }
+
   static constexpr const char* _type_key = "meta_schedule.ScheduleFn";
   TVM_DECLARE_FINAL_OBJECT_INFO(ScheduleFnNode, SpaceGeneratorNode);
 };
diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc
index 5c5ab6ebbae5..6fc31ed896f2 100644
--- a/src/meta_schedule/space_generator/space_generator.cc
+++ b/src/meta_schedule/space_generator/space_generator.cc
@@ -33,12 +33,18 @@ Array<tir::Schedule> PySpaceGeneratorNode::GenerateDesignSpace(const IRModule& m
   return f_generate_design_space(mod);
 }
 
+SpaceGenerator PySpaceGeneratorNode::Clone() const {
+  ICHECK(f_clone != nullptr) << "PySpaceGenerator's Clone method not implemented!";
+  return f_clone();
+}
+
 SpaceGenerator SpaceGenerator::PySpaceGenerator(
-    PySpaceGeneratorNode::FInitializeWithTuneContext f_initialize_with_tune_context,
-    PySpaceGeneratorNode::FGenerateDesignSpace f_generate_design_space) {
+    FInitializeWithTuneContext f_initialize_with_tune_context,
+    FGenerateDesignSpace f_generate_design_space, FClone f_clone) {
   ObjectPtr<PySpaceGeneratorNode> n = make_object<PySpaceGeneratorNode>();
   n->f_initialize_with_tune_context = std::move(f_initialize_with_tune_context);
   n->f_generate_design_space = std::move(f_generate_design_space);
+  n->f_clone = std::move(f_clone);
   return SpaceGenerator(n);
 }
 
@@ -51,6 +57,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.SpaceGeneratorGenerateDesignSpace")
     .set_body_method<SpaceGenerator>(&SpaceGeneratorNode::GenerateDesignSpace);
 TVM_REGISTER_GLOBAL("meta_schedule.SpaceGeneratorPySpaceGenerator")
     .set_body_typed(SpaceGenerator::PySpaceGenerator);
+TVM_REGISTER_GLOBAL("meta_schedule.SpaceGeneratorClone")
+    .set_body_method<SpaceGenerator>(&SpaceGeneratorNode::Clone);
 
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/space_generator/space_generator_union.cc b/src/meta_schedule/space_generator/space_generator_union.cc
index 6ea61824f932..771d0c187f97 100644
--- a/src/meta_schedule/space_generator/space_generator_union.cc
+++ b/src/meta_schedule/space_generator/space_generator_union.cc
@@ -47,6 +47,15 @@ class SpaceGeneratorUnionNode : public SpaceGeneratorNode {
     return design_spaces;
   }
 
+  SpaceGenerator Clone() const final {
+    ObjectPtr<SpaceGeneratorUnionNode> n = make_object<SpaceGeneratorUnionNode>(*this);
+    n->space_generators = Array<SpaceGenerator>();
+    for (const SpaceGenerator& space_generator : this->space_generators) {
+      n->space_generators.push_back(space_generator->Clone());
+    }
+    return SpaceGenerator(n);
+  }
+
   static constexpr const char* _type_key = "meta_schedule.SpaceGeneratorUnion";
   TVM_DECLARE_FINAL_OBJECT_INFO(SpaceGeneratorUnionNode, SpaceGeneratorNode);
 };
diff --git a/src/meta_schedule/tune_context.cc b/src/meta_schedule/tune_context.cc
index 57b2344c6f8d..3650c0374dab 100644
--- a/src/meta_schedule/tune_context.cc
+++ b/src/meta_schedule/tune_context.cc
@@ -52,6 +52,32 @@ TuneContext::TuneContext(Optional<IRModule> mod,
   data_ = std::move(n);
 }
 
+TuneContext TuneContextNode::Clone() const {
+  ObjectPtr<TuneContextNode> n = make_object<TuneContextNode>(*this);
+  if (this->sch_rules.defined()) {
+    n->sch_rules = Array<ScheduleRule>();
+    for (const ScheduleRule& sch_rule : this->sch_rules) {
+      n->sch_rules.push_back(sch_rule->Clone());
+    }
+  }
+  if (this->postprocs.defined()) {
+    n->postprocs = Array<Postproc>();
+    for (const Postproc& postproc : this->postprocs) {
+      n->postprocs.push_back(postproc->Clone());
+    }
+  }
+  if (this->mutator_probs.defined()) {
+    n->mutator_probs = Map<Mutator, FloatImm>();
+    for (const auto& kv : this->mutator_probs) {
+      n->mutator_probs.Set(kv.first->Clone(), kv.second);
+    }
+  }
+  if (this->space_generator.defined()) n->space_generator = this->space_generator.value()->Clone();
+  if (this->search_strategy.defined()) n->search_strategy = this->search_strategy.value()->Clone();
+  n->Initialize();
+  return TuneContext(n);
+}
+
 void TuneContextNode::Initialize() {
   if (this->space_generator.defined()) {
     this->space_generator.value()->InitializeWithTuneContext(GetRef<TuneContext>(this));

From 77d0a288df4a1975784def14b316bde576fe3980 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Thu, 15 Sep 2022 23:28:45 -0700
Subject: [PATCH 183/704] [MetaSchedule][Test] MLT uses SEqual tests (#12805)

This PR finishes migration from `check_trace` (string-based equality
check on TIR trace) to `check_sketch` (SEqual-based equality check on
TIR). Here, we split multi-level-tiling into 3 files:
- Plain multi-level tiling without any intrinsics
- Multi-level tiling with intrinsics like VNNI, DP4a
- Multi-level tiling with TensorCore which comes with different handling

Besides, we cleaned up the testing folder and removed several methods
that are no longer useful for unittests.
---
 .../meta_schedule/testing/schedule_rule.py    |  138 +-
 .../multi_level_tiling_tensor_core.cc         |    4 +-
 src/meta_schedule/utils.h                     |   35 +-
 ...t_meta_schedule_schedule_rule_auto_bind.py |   22 +-
 ...meta_schedule_schedule_rule_auto_inline.py |   19 +-
 ...le_schedule_rule_cross_thread_reduction.py |   17 +-
 .../test_meta_schedule_schedule_rule_mlt.py   |  529 ++++++++
 ..._meta_schedule_schedule_rule_mlt_intrin.py |  418 ++++++
 ...test_meta_schedule_schedule_rule_mlt_tc.py |  957 +++++++++++++
 ...hedule_schedule_rule_multi_level_tiling.py | 1205 -----------------
 10 files changed, 1961 insertions(+), 1383 deletions(-)
 create mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
 create mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py
 create mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
 delete mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py

diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py
index 12ca4200d77a..f14e90b6f0b2 100644
--- a/python/tvm/meta_schedule/testing/schedule_rule.py
+++ b/python/tvm/meta_schedule/testing/schedule_rule.py
@@ -15,122 +15,22 @@
 # specific language governing permissions and limitations
 # under the License.
 """Default schedule rules"""
-from typing import List, Union
-
-from tvm.meta_schedule.schedule_rule import (
-    AutoInline,
-    MultiLevelTiling,
-    MultiLevelTilingTensorCore,
-    ReuseType,
-    ScheduleRule,
-)
-from tvm.target import Target
-
-
-def auto_inline(target: Target) -> ScheduleRule:
-    """Default schedule rules for auto inline"""
-    if target.kind.name == "llvm":
-        return AutoInline(
-            into_producer=False,
-            into_consumer=True,
-            inline_const_tensor=True,
-            disallow_if_then_else=True,
-            require_injective=True,
-            require_ordered=True,
-            disallow_op=["tir.exp"],
-        )
-    if target.kind.name == "cuda":
-        return AutoInline(
-            into_producer=True,
-            into_consumer=True,
-            inline_const_tensor=True,
-            disallow_if_then_else=False,
-            require_injective=False,
-            require_ordered=False,
-            disallow_op=None,
-        )
-    raise NotImplementedError(f"{target.kind.name} is not supported")
-
-
-def multi_level_tiling(target: Target) -> ScheduleRule:
-    """Default schedule rules for with multi-level tiling and reuse"""
-    if target.kind.name == "llvm":
-        return MultiLevelTiling(
-            structure="SSRSRS",
-            tile_binds=None,
-            max_innermost_factor=64,
-            vector_load_lens=None,
-            reuse_read=None,
-            reuse_write=ReuseType(
-                req="may",
-                levels=[1, 2],
-                scope="global",
-            ),
-        )
-    if target.kind.name == "cuda":
-        return MultiLevelTiling(
-            structure="SSSRRSRS",
-            tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"],
-            max_innermost_factor=64,
-            vector_load_lens=[1, 2, 3, 4, 8, 16],
-            reuse_read=ReuseType(
-                req="must",
-                levels=[4],
-                scope="shared",
-            ),
-            reuse_write=ReuseType(
-                req="must",
-                levels=[3],
-                scope="local",
-            ),
-        )
-    raise NotImplementedError(f"{target.kind.name} is not supported")
-
-
-def multi_level_tiling_tensor_core(
-    target: Target,
-    write_reuse_scope: str = "shared",
-    in_dtype: Union[str, List[str]] = "float16",
-    out_dtype: Union[str, List[str]] = "float32",
-    trans_b: Union[bool, List[bool]] = False,
-    use_software_pipeline: bool = False,
-) -> ScheduleRule:
-    """Default schedule rules for with multi-level tiling reuse for tensor core"""
-    assert write_reuse_scope in ["shared", "global"]
-    if not isinstance(in_dtype, list):
-        in_dtype = [in_dtype]
-    if not isinstance(out_dtype, list):
-        out_dtype = [out_dtype]
-    if not isinstance(trans_b, list):
-        trans_b = [trans_b]
-
-    if target.kind.name == "cuda":
-        from tvm.tir.tensor_intrin import (  # pylint: disable=import-outside-toplevel
-            cuda,
-        )
-
-        intrin_groups = [
-            cuda.get_wmma_intrin_group(write_reuse_scope, _in_dtype, _out_dtype, _trans_b)
-            for _in_dtype in in_dtype
-            for _out_dtype in out_dtype
-            for _trans_b in trans_b
-        ]
-        return MultiLevelTilingTensorCore(
-            intrin_groups=intrin_groups,
-            structure="SSSRRSRS",
-            tile_binds=["blockIdx.y", "blockIdx.x", "threadIdx.y"],
-            max_innermost_factor=4,  # 64 // tensor intrin size
-            vector_load_lens=[1, 2, 3, 4, 8, 16],
-            reuse_read=ReuseType(
-                req="must",
-                levels=[4],
-                scope="shared",
-            ),
-            reuse_write=ReuseType(
-                req="must" if write_reuse_scope == "shared" else "no",
-                levels=[2],
-                scope=write_reuse_scope,
-            ),
-            use_software_pipeline=use_software_pipeline,
-        )
-    raise NotImplementedError(f"{target.kind.name} is not supported")
+from typing import List, Tuple, Union
+
+from tvm.meta_schedule import default_config
+from tvm.meta_schedule.schedule_rule import ScheduleRule
+
+
+def get_rules(kind: str, types: Union[type, Tuple[type, ...]]) -> List[ScheduleRule]:
+    """Get default schedule rules"""
+    # pylint: disable=protected-access
+    if kind == "llvm":
+        rules = default_config._DefaultLLVM.schedule_rules()
+    elif kind == "cuda":
+        rules = default_config._DefaultCUDA.schedule_rules()
+    elif kind == "tensor_core":
+        rules = default_config._DefaultCUDATensorCore.schedule_rules()
+    else:
+        raise NotImplementedError(f"{kind} is not supported")
+    # pylint: enable=protected-access
+    return [rule for rule in rules if isinstance(rule, types)]
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 13b00fa7deb6..8fcb8fe503b7 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -328,7 +328,7 @@ std::vector<State> MultiLevelTilingTensorCoreNode::AddSoftwarePipeline(
   // Add local stage and double buffering
   for (int i = 0; i < 2; ++i) {
     const tir::BlockRV cache_read = state->read_reuse.at(i);
-    sch->Annotate(cache_read, tir::attr::manifest_shared_memory_local_stage, Bool(true));
+    sch->Annotate(cache_read, tir::attr::manifest_shared_memory_local_stage, Integer(1));
     sch->Annotate(cache_read, tir::attr::double_buffer_scope, Integer(0));
   }
 
@@ -536,7 +536,7 @@ inline std::vector<State> MultiLevelTilingTensorCoreNode::TransformForTensorizat
                        state->intrin_group.compute_intrin);
   state->sch->Annotate(state->block_rv, tir::attr::meta_schedule_auto_tensorize_init,
                        state->intrin_group.init_intrin);
-  state->sch->Annotate(state->block_rv, tir::attr::warp_execution, Bool(true));
+  state->sch->Annotate(state->block_rv, tir::attr::warp_execution, Integer(1));
   return {std::move(state)};
 }
 
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index ad56fa7f6a52..cf9a32917031 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -77,33 +77,34 @@ class PyLogMessage {
     // FATAL not included
   };
 
-  PyLogMessage(const std::string& file, int lineno, PackedFunc logging_func, Level logging_level) {
-    this->logging_func = logging_func;
-    this->logging_level = logging_level;
-  }
+  explicit PyLogMessage(const char* file, int lineno, PackedFunc logging_func, Level logging_level)
+      : file_(file), lineno_(lineno), logging_func_(logging_func), logging_level_(logging_level) {}
+
   TVM_NO_INLINE ~PyLogMessage() {
-    if (this->logging_func.defined()) {
-      logging_func(static_cast<int>(logging_level), stream_.str());
+    if (this->logging_func_.defined()) {
+      logging_func_(static_cast<int>(logging_level_), stream_.str());
     } else {
-      if (logging_level == Level::INFO) {
-        LOG(INFO) << stream_.str();
-      } else if (logging_level == Level::WARNING) {
-        LOG(WARNING) << stream_.str();
-      } else if (logging_level == Level::ERROR) {
-        LOG(ERROR) << stream_.str();
-      } else if (logging_level == Level::DEBUG) {
-        DLOG(INFO) << stream_.str();
+      if (logging_level_ == Level::INFO) {
+        runtime::detail::LogMessage(file_, lineno_).stream() << stream_.str();
+      } else if (logging_level_ == Level::WARNING) {
+        runtime::detail::LogMessage(file_, lineno_).stream() << "Warning: " << stream_.str();
+      } else if (logging_level_ == Level::ERROR) {
+        runtime::detail::LogMessage(file_, lineno_).stream() << "Error: " << stream_.str();
+      } else if (logging_level_ == Level::DEBUG) {
+        runtime::detail::LogMessage(file_, lineno_).stream() << "Debug: " << stream_.str();
       } else {
-        LOG(FATAL) << stream_.str();
+        runtime::detail::LogFatal(file_, lineno_).stream() << stream_.str();
       }
     }
   }
   std::ostringstream& stream() { return stream_; }
 
  private:
+  const char* file_;
+  int lineno_;
   std::ostringstream stream_;
-  PackedFunc logging_func;
-  Level logging_level;
+  PackedFunc logging_func_;
+  Level logging_level_;
 };
 
 /*! \brief The type of the random state */
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
index 21ad04da473e..a50292df7ae3 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
@@ -16,6 +16,7 @@
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 from tvm import meta_schedule as ms
+from tvm.meta_schedule.testing.schedule_rule import get_rules
 from tvm.meta_schedule.testing.space_generation import check_sketches
 from tvm.script import tir as T
 from tvm.target import Target
@@ -83,12 +84,7 @@ def elementwise_0(
         mod=mod,
         target=Target("nvidia/geforce-rtx-3080", host="llvm"),
         space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[
-            ms.schedule_rule.AutoBind(
-                max_threadblocks=256,
-                thread_extents=[32, 64, 128, 256, 512, 1024],
-            )
-        ],
+        sch_rules=get_rules("cuda", ms.schedule_rule.AutoBind),
         task_name="test",
     ).generate_design_space()
     check_sketches(
@@ -122,12 +118,7 @@ def reduction_loop_only_0(
         mod=mod,
         target=Target("nvidia/geforce-rtx-3080", host="llvm"),
         space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[
-            ms.schedule_rule.AutoBind(
-                max_threadblocks=256,
-                thread_extents=[32, 64, 128, 256, 512, 1024],
-            )
-        ],
+        sch_rules=get_rules("cuda", ms.schedule_rule.AutoBind),
         task_name="test",
     ).generate_design_space()
     check_sketches(
@@ -158,12 +149,7 @@ def zero_dim_add_0(
         mod=mod,
         target=Target("nvidia/geforce-rtx-3080", host="llvm"),
         space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[
-            ms.schedule_rule.AutoBind(
-                max_threadblocks=256,
-                thread_extents=[32, 64, 128, 256, 512, 1024],
-            )
-        ],
+        sch_rules=get_rules("cuda", ms.schedule_rule.AutoBind),
         task_name="test",
     ).generate_design_space()
     check_sketches(
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
index fcf6a8571b7f..c0801c9d7b5e 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
@@ -16,9 +16,8 @@
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 import tvm
-from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
-from tvm.meta_schedule.testing.schedule_rule import auto_inline
-from tvm.meta_schedule.tune_context import TuneContext
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.testing.schedule_rule import get_rules
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -340,10 +339,10 @@ def main(T_full: T.Buffer[(1, 12, 4096), "int64"]) -> None:
 
 
 def _create_context(mod, target, rule):
-    ctx = TuneContext(
+    ctx = ms.TuneContext(
         mod=mod,
         target=target,
-        space_generator=PostOrderApply(),
+        space_generator=ms.space_generator.PostOrderApply(),
         sch_rules=[rule],
         task_name="test",
     )
@@ -356,7 +355,7 @@ def test_inline_consumer_chain():
     ctx = _create_context(
         mod=mod,
         target=target,
-        rule=auto_inline(target=target),
+        rule=get_rules("llvm", ms.schedule_rule.AutoInline)[0],
     )
     (space,) = ctx.space_generator.generate_design_space(mod=mod)
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=Conv2DBiasBnReLUInlined)
@@ -368,7 +367,7 @@ def test_inline_into_cache():
     ctx = _create_context(
         mod=mod,
         target=target,
-        rule=auto_inline(target=target),
+        rule=get_rules("cuda", ms.schedule_rule.AutoInline)[0],
     )
     (space,) = ctx.space_generator.generate_design_space(mod=mod)
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=MultiLevelTiledConv2DAfterInline)
@@ -380,7 +379,7 @@ def test_inline_into_multiple_consumers():
     ctx = _create_context(
         mod=mod,
         target=target,
-        rule=auto_inline(target=target),
+        rule=get_rules("cuda", ms.schedule_rule.AutoInline)[0],
     )
     (space,) = ctx.space_generator.generate_design_space(mod=mod)
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=SoftmaxAfterInline)
@@ -392,7 +391,7 @@ def test_inline_pure_spatial():
     ctx = _create_context(
         mod=mod,
         target=target,
-        rule=auto_inline(target=target),
+        rule=get_rules("llvm", ms.schedule_rule.AutoInline)[0],
     )
     (space,) = ctx.space_generator.generate_design_space(mod=mod)
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=AfterPureSpatial)
@@ -404,7 +403,7 @@ def test_inline_constant_tensor():
     ctx = _create_context(
         mod=mod,
         target=target,
-        rule=auto_inline(target=target),
+        rule=get_rules("cuda", ms.schedule_rule.AutoInline)[0],
     )
     (space,) = ctx.space_generator.generate_design_space(mod=mod)
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=ConstConsumer)
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
index ab8df6678b0b..4278638a1aa3 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
@@ -19,6 +19,7 @@
 import tvm
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing import te_workload
+from tvm.meta_schedule.testing.schedule_rule import get_rules
 from tvm.meta_schedule.testing.space_generation import check_sketches
 from tvm.script import tir as T
 from tvm.target import Target
@@ -283,9 +284,7 @@ def softmax_mn_3(
         mod=mod,
         target=Target("nvidia/geforce-rtx-3090", host="llvm"),
         space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[
-            ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
-        ],
+        sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction),
         task_name="test",
     ).generate_design_space()
     check_sketches(
@@ -481,9 +480,7 @@ def softmax_mn_after_inline_3(
         mod=mod,
         target=Target("nvidia/geforce-rtx-3090", host="llvm"),
         space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[
-            ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
-        ],
+        sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction),
         task_name="test",
     ).generate_design_space()
     check_sketches(
@@ -559,9 +556,7 @@ def batch_norm_bmn_1(A: T.Buffer[(1, 512, 512), "float32"], D: T.Buffer[1, "floa
         mod=mod,
         target=Target("nvidia/geforce-rtx-3090", host="llvm"),
         space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[
-            ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
-        ],
+        sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction),
         task_name="test",
     ).generate_design_space()
     check_sketches(
@@ -657,9 +652,7 @@ def argmax_1(
         mod=mod,
         target=Target("nvidia/geforce-rtx-3090", host="llvm"),
         space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[
-            ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
-        ],
+        sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction),
         task_name="test",
     ).generate_design_space()
     check_sketches(
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
new file mode 100644
index 000000000000..939ccbe54fa6
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
@@ -0,0 +1,529 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
+from tvm import meta_schedule as ms
+from tvm import te
+from tvm.meta_schedule.testing import te_workload
+from tvm.meta_schedule.testing.schedule_rule import get_rules
+from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.script import tir as T
+from tvm.target import Target
+
+
+def test_cpu_matmul():
+    @T.prim_func
+    def cpu_matmul_0(
+        A: T.Buffer[(512, 512), "float32"],
+        B: T.Buffer[(512, 512), "float32"],
+        C: T.Buffer[(512, 512), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C_global = T.alloc_buffer([512, 512], dtype="float32")
+        for i0_0, i1_0, i0_1, i1_1 in T.grid(1, 8, 8, 1):
+            for i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(16, 2, 8, 32, 32, 8):
+                with T.block("C"):
+                    i = T.axis.spatial(512, i0_0 * 512 + i0_1 * 64 + i0_2 * 32 + i0_3)
+                    j = T.axis.spatial(512, i1_0 * 64 + i1_1 * 64 + i1_2 * 8 + i1_3)
+                    k = T.axis.reduce(512, i2_0 * 32 + i2_1)
+                    T.reads(A[i, k], B[k, j])
+                    T.writes(C_global[i, j])
+                    T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"})
+                    with T.init():
+                        C_global[i, j] = T.float32(0)
+                    C_global[i, j] = C_global[i, j] + A[i, k] * B[k, j]
+            for ax0, ax1 in T.grid(64, 64):
+                with T.block("C_global"):
+                    v0 = T.axis.spatial(512, i0_1 * 64 + ax0)
+                    v1 = T.axis.spatial(512, i1_0 * 64 + ax1)
+                    T.reads(C_global[v0, v1])
+                    T.writes(C[v0, v1])
+                    C[v0, v1] = C_global[v0, v1]
+
+    @T.prim_func
+    def cpu_matmul_1(
+        A: T.Buffer[(512, 512), "float32"],
+        B: T.Buffer[(512, 512), "float32"],
+        C: T.Buffer[(512, 512), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C_global = T.alloc_buffer([512, 512], dtype="float32")
+        for i0_0, i1_0 in T.grid(1, 8):
+            for i0_1, i1_1, i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(8, 1, 16, 2, 8, 32, 32, 8):
+                with T.block("C"):
+                    i = T.axis.spatial(512, i0_0 * 512 + i0_1 * 64 + i0_2 * 32 + i0_3)
+                    j = T.axis.spatial(512, i1_0 * 64 + i1_1 * 64 + i1_2 * 8 + i1_3)
+                    k = T.axis.reduce(512, i2_0 * 32 + i2_1)
+                    T.reads(A[i, k], B[k, j])
+                    T.writes(C_global[i, j])
+                    T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"})
+                    with T.init():
+                        C_global[i, j] = T.float32(0)
+                    C_global[i, j] = C_global[i, j] + A[i, k] * B[k, j]
+            for ax0, ax1 in T.grid(512, 64):
+                with T.block("C_global"):
+                    v0 = T.axis.spatial(512, ax0)
+                    v1 = T.axis.spatial(512, i1_0 * 64 + ax1)
+                    T.reads(C_global[v0, v1])
+                    T.writes(C[v0, v1])
+                    C[v0, v1] = C_global[v0, v1]
+
+    @T.prim_func
+    def cpu_matmul_2(
+        A: T.Buffer[(512, 512), "float32"],
+        B: T.Buffer[(512, 512), "float32"],
+        C: T.Buffer[(512, 512), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        for i0_0, i1_0, i0_1, i1_1, i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(
+            1, 8, 8, 1, 16, 2, 8, 32, 32, 8
+        ):
+            with T.block("C"):
+                i = T.axis.spatial(512, i0_0 * 512 + i0_1 * 64 + i0_2 * 32 + i0_3)
+                j = T.axis.spatial(512, i1_0 * 64 + i1_1 * 64 + i1_2 * 8 + i1_3)
+                k = T.axis.reduce(512, i2_0 * 32 + i2_1)
+                T.reads(A[i, k], B[k, j])
+                T.writes(C[i, j])
+                T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"})
+                with T.init():
+                    C[i, j] = T.float32(0)
+                C[i, j] = C[i, j] + A[i, k] * B[k, j]
+
+    decision_0 = [
+        ("SamplePerfectTile", [1, 8, 2, 32]),
+        ("SamplePerfectTile", [8, 1, 8, 8]),
+        ("SamplePerfectTile", [16, 32]),
+    ]
+    decision_1 = [
+        ("SamplePerfectTile", [1, 8, 2, 32]),
+        ("SamplePerfectTile", [8, 1, 8, 8]),
+        ("SamplePerfectTile", [16, 32]),
+    ]
+    decision_2 = [
+        ("SamplePerfectTile", [1, 8, 2, 32]),
+        ("SamplePerfectTile", [8, 1, 8, 8]),
+        ("SamplePerfectTile", [16, 32]),
+    ]
+
+    mod = te.create_prim_func(te_workload.matmul(512, 512, 512))
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=get_rules("llvm", ms.schedule_rule.MultiLevelTiling),
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[cpu_matmul_0, cpu_matmul_1, cpu_matmul_2],
+        expected_decisions=[decision_0, decision_1, decision_2],
+    )
+
+
+def test_cpu_matmul_relu():
+    @T.prim_func
+    def cpu_matmul_relu_0(
+        A: T.Buffer[(512, 512), "float32"],
+        B: T.Buffer[(512, 512), "float32"],
+        compute: T.Buffer[(512, 512), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C = T.alloc_buffer([512, 512], dtype="float32")
+        for i0_0, i1_0, i0_1, i1_1, i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(
+            256, 4, 1, 4, 64, 1, 32, 8, 2, 1
+        ):
+            with T.block("C"):
+                i = T.axis.spatial(512, i0_0 * 2 + i0_1 * 2 + i0_2 * 2 + i0_3)
+                j = T.axis.spatial(512, i1_0 * 128 + i1_1 * 32 + i1_2 + i1_3)
+                k = T.axis.reduce(512, i2_0 * 8 + i2_1)
+                T.reads(A[i, k], B[k, j])
+                T.writes(C[i, j])
+                T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"})
+                with T.init():
+                    C[i, j] = T.float32(0)
+                C[i, j] = C[i, j] + A[i, k] * B[k, j]
+        for i0, i1 in T.grid(512, 512):
+            with T.block("compute"):
+                i0_4, i1_4 = T.axis.remap("SS", [i0, i1])
+                T.reads(C[i0_4, i1_4])
+                T.writes(compute[i0_4, i1_4])
+                compute[i0_4, i1_4] = T.max(C[i0_4, i1_4], T.float32(0))
+
+    @T.prim_func
+    def cpu_matmul_relu_1(
+        A: T.Buffer[(512, 512), "float32"],
+        B: T.Buffer[(512, 512), "float32"],
+        compute: T.Buffer[(512, 512), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C = T.alloc_buffer([512, 512], dtype="float32")
+        for i0_0, i1_0, i0_1, i1_1 in T.grid(256, 4, 1, 4):
+            for i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(64, 1, 32, 8, 2, 1):
+                with T.block("C"):
+                    i = T.axis.spatial(512, i0_0 * 2 + i0_1 * 2 + i0_2 * 2 + i0_3)
+                    j = T.axis.spatial(512, i1_0 * 128 + i1_1 * 32 + i1_2 + i1_3)
+                    k = T.axis.reduce(512, i2_0 * 8 + i2_1)
+                    T.reads(A[i, k], B[k, j])
+                    T.writes(C[i, j])
+                    T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"})
+                    with T.init():
+                        C[i, j] = T.float32(0)
+                    C[i, j] = C[i, j] + A[i, k] * B[k, j]
+            for ax0, ax1 in T.grid(2, 32):
+                with T.block("compute"):
+                    i0 = T.axis.spatial(512, i0_0 * 2 + ax0)
+                    i1 = T.axis.spatial(512, i1_0 * 128 + i1_1 * 32 + ax1)
+                    T.reads(C[i0, i1])
+                    T.writes(compute[i0, i1])
+                    compute[i0, i1] = T.max(C[i0, i1], T.float32(0))
+
+    @T.prim_func
+    def cpu_matmul_relu_2(
+        A: T.Buffer[(512, 512), "float32"],
+        B: T.Buffer[(512, 512), "float32"],
+        compute: T.Buffer[(512, 512), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C = T.alloc_buffer([512, 512], dtype="float32")
+        for i0_0, i1_0 in T.grid(256, 4):
+            for i0_1, i1_1, i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(1, 4, 64, 1, 32, 8, 2, 1):
+                with T.block("C"):
+                    i = T.axis.spatial(512, i0_0 * 2 + i0_1 * 2 + i0_2 * 2 + i0_3)
+                    j = T.axis.spatial(512, i1_0 * 128 + i1_1 * 32 + i1_2 + i1_3)
+                    k = T.axis.reduce(512, i2_0 * 8 + i2_1)
+                    T.reads(A[i, k], B[k, j])
+                    T.writes(C[i, j])
+                    T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"})
+                    with T.init():
+                        C[i, j] = T.float32(0)
+                    C[i, j] = C[i, j] + A[i, k] * B[k, j]
+            for ax0, ax1 in T.grid(2, 128):
+                with T.block("compute"):
+                    i0 = T.axis.spatial(512, i0_0 * 2 + ax0)
+                    i1 = T.axis.spatial(512, i1_0 * 128 + ax1)
+                    T.reads(C[i0, i1])
+                    T.writes(compute[i0, i1])
+                    compute[i0, i1] = T.max(C[i0, i1], T.float32(0))
+
+    decision_0 = [
+        ("SamplePerfectTile", [256, 1, 1, 2]),
+        ("SamplePerfectTile", [4, 4, 32, 1]),
+        ("SamplePerfectTile", [64, 8]),
+    ]
+    decision_1 = [
+        ("SamplePerfectTile", [256, 1, 1, 2]),
+        ("SamplePerfectTile", [4, 4, 32, 1]),
+        ("SamplePerfectTile", [64, 8]),
+    ]
+    decision_2 = [
+        ("SamplePerfectTile", [256, 1, 1, 2]),
+        ("SamplePerfectTile", [4, 4, 32, 1]),
+        ("SamplePerfectTile", [64, 8]),
+    ]
+    mod = te.create_prim_func(te_workload.matmul_relu(512, 512, 512))
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=get_rules("llvm", ms.schedule_rule.MultiLevelTiling),
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[cpu_matmul_relu_0, cpu_matmul_relu_1, cpu_matmul_relu_2],
+        expected_decisions=[decision_0, decision_1, decision_2],
+    )
+
+
+def test_cuda_matmul():
+    @T.prim_func
+    def cuda_matmul_0(
+        A: T.Buffer[(512, 512), "float32"],
+        B: T.Buffer[(512, 512), "float32"],
+        C: T.Buffer[(512, 512), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C_local = T.alloc_buffer([512, 512], dtype="float32", scope="local")
+        A_shared = T.alloc_buffer([512, 512], dtype="float32", scope="shared")
+        B_shared = T.alloc_buffer([512, 512], dtype="float32", scope="shared")
+        for i0_0_i1_0_fused in T.thread_binding(128, thread="blockIdx.x"):
+            for i0_1_i1_1_fused in T.thread_binding(8, thread="vthread.x"):
+                for i0_2_i1_2_fused in T.thread_binding(4, thread="threadIdx.x"):
+                    for i2_0 in T.serial(128):
+                        for ax0_ax1_fused in T.serial(256):
+                            with T.block("A_shared"):
+                                v0 = T.axis.spatial(
+                                    512, i0_0_i1_0_fused // 16 * 64 + ax0_ax1_fused // 4
+                                )
+                                v1 = T.axis.spatial(512, i2_0 * 4 + ax0_ax1_fused % 4)
+                                T.reads(A[v0, v1])
+                                T.writes(A_shared[v0, v1])
+                                T.block_attr({"meta_schedule.cooperative_fetch": 2})
+                                A_shared[v0, v1] = A[v0, v1]
+                        for ax0_ax1_fused in T.serial(128):
+                            with T.block("B_shared"):
+                                v0 = T.axis.spatial(512, i2_0 * 4 + ax0_ax1_fused // 32)
+                                v1 = T.axis.spatial(
+                                    512, i0_0_i1_0_fused % 16 * 32 + ax0_ax1_fused % 32
+                                )
+                                T.reads(B[v0, v1])
+                                T.writes(B_shared[v0, v1])
+                                T.block_attr({"meta_schedule.cooperative_fetch": 1})
+                                B_shared[v0, v1] = B[v0, v1]
+                        for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(2, 1, 1, 2, 16, 4):
+                            with T.block("C"):
+                                i = T.axis.spatial(
+                                    512,
+                                    i0_0_i1_0_fused // 16 * 64
+                                    + i0_1_i1_1_fused // 2 * 16
+                                    + i0_3 * 16
+                                    + i0_4,
+                                )
+                                j = T.axis.spatial(
+                                    512,
+                                    i0_0_i1_0_fused % 16 * 32
+                                    + i0_1_i1_1_fused % 2 * 16
+                                    + i0_2_i1_2_fused * 4
+                                    + i1_3 * 4
+                                    + i1_4,
+                                )
+                                k = T.axis.reduce(512, i2_0 * 4 + i2_1 * 2 + i2_2)
+                                T.reads(A_shared[i, k], B_shared[k, j])
+                                T.writes(C_local[i, j])
+                                T.block_attr(
+                                    {
+                                        "meta_schedule.thread_extent_high_inclusive": 1024,
+                                        "meta_schedule.thread_extent_low_inclusive": 32,
+                                        "meta_schedule.tiling_structure": "SSSRRSRS",
+                                    }
+                                )
+                                with T.init():
+                                    C_local[i, j] = T.float32(0)
+                                C_local[i, j] = C_local[i, j] + A_shared[i, k] * B_shared[k, j]
+                    for ax0, ax1 in T.grid(16, 4):
+                        with T.block("C_local"):
+                            v0 = T.axis.spatial(
+                                512, i0_0_i1_0_fused // 16 * 64 + i0_1_i1_1_fused // 2 * 16 + ax0
+                            )
+                            v1 = T.axis.spatial(
+                                512,
+                                i0_0_i1_0_fused % 16 * 32
+                                + i0_1_i1_1_fused % 2 * 16
+                                + i0_2_i1_2_fused * 4
+                                + ax1,
+                            )
+                            T.reads(C_local[v0, v1])
+                            T.writes(C[v0, v1])
+                            C[v0, v1] = C_local[v0, v1]
+
+    decision_0 = [
+        ("SamplePerfectTile", [8, 4, 1, 1, 16]),
+        ("SamplePerfectTile", [16, 2, 4, 1, 4]),
+        ("SamplePerfectTile", [128, 2, 2]),
+        ("SampleCategorical", 1),
+        ("SampleCategorical", 0),
+    ]
+    mod = te.create_prim_func(te_workload.matmul(512, 512, 512))
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3080"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=get_rules("cuda", ms.schedule_rule.MultiLevelTiling),
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[cuda_matmul_0],
+        expected_decisions=[decision_0],
+    )
+
+
+def test_cuda_matmul_relu():
+    @T.prim_func
+    def cuda_matmul_relu_0(
+        A: T.Buffer[(512, 512), "float32"],
+        B: T.Buffer[(512, 512), "float32"],
+        compute: T.Buffer[(512, 512), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C = T.alloc_buffer([512, 512], dtype="float32")
+        C_local = T.alloc_buffer([512, 512], dtype="float32", scope="local")
+        A_shared = T.alloc_buffer([512, 512], dtype="float32", scope="shared")
+        B_shared = T.alloc_buffer([512, 512], dtype="float32", scope="shared")
+        for i0_0_i1_0_fused in T.thread_binding(64, thread="blockIdx.x"):
+            for i0_1_i1_1_fused in T.thread_binding(64, thread="vthread.x"):
+                for i0_2_i1_2_fused in T.thread_binding(8, thread="threadIdx.x"):
+                    for i2_0 in T.serial(8):
+                        for ax0_ax1_fused in T.serial(4096):
+                            with T.block("A_shared"):
+                                v0 = T.axis.spatial(
+                                    512, i0_0_i1_0_fused // 8 * 64 + ax0_ax1_fused // 64
+                                )
+                                v1 = T.axis.spatial(512, i2_0 * 64 + ax0_ax1_fused % 64)
+                                T.reads(A[v0, v1])
+                                T.writes(A_shared[v0, v1])
+                                T.block_attr({"meta_schedule.cooperative_fetch": 2})
+                                A_shared[v0, v1] = A[v0, v1]
+                        for ax0_ax1_fused in T.serial(4096):
+                            with T.block("B_shared"):
+                                v0 = T.axis.spatial(512, i2_0 * 64 + ax0_ax1_fused // 64)
+                                v1 = T.axis.spatial(
+                                    512, i0_0_i1_0_fused % 8 * 64 + ax0_ax1_fused % 64
+                                )
+                                T.reads(B[v0, v1])
+                                T.writes(B_shared[v0, v1])
+                                T.block_attr({"meta_schedule.cooperative_fetch": 4})
+                                B_shared[v0, v1] = B[v0, v1]
+                        for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(8, 2, 1, 8, 2, 2):
+                            with T.block("C"):
+                                i = T.axis.spatial(
+                                    512,
+                                    i0_0_i1_0_fused // 8 * 64
+                                    + i0_1_i1_1_fused // 8 * 8
+                                    + i0_2_i1_2_fused // 4 * 4
+                                    + i0_3 * 2
+                                    + i0_4,
+                                )
+                                j = T.axis.spatial(
+                                    512,
+                                    i0_0_i1_0_fused % 8 * 64
+                                    + i0_1_i1_1_fused % 8 * 8
+                                    + i0_2_i1_2_fused % 4 * 2
+                                    + i1_3 * 2
+                                    + i1_4,
+                                )
+                                k = T.axis.reduce(512, i2_0 * 64 + i2_1 * 8 + i2_2)
+                                T.reads(A_shared[i, k], B_shared[k, j])
+                                T.writes(C_local[i, j])
+                                T.block_attr(
+                                    {
+                                        "meta_schedule.thread_extent_high_inclusive": 1024,
+                                        "meta_schedule.thread_extent_low_inclusive": 32,
+                                        "meta_schedule.tiling_structure": "SSSRRSRS",
+                                    }
+                                )
+                                with T.init():
+                                    C_local[i, j] = T.float32(0)
+                                C_local[i, j] = C_local[i, j] + A_shared[i, k] * B_shared[k, j]
+                    for ax0, ax1 in T.grid(4, 2):
+                        with T.block("C_local"):
+                            v0 = T.axis.spatial(
+                                512,
+                                i0_0_i1_0_fused // 8 * 64
+                                + i0_1_i1_1_fused // 8 * 8
+                                + i0_2_i1_2_fused // 4 * 4
+                                + ax0,
+                            )
+                            v1 = T.axis.spatial(
+                                512,
+                                i0_0_i1_0_fused % 8 * 64
+                                + i0_1_i1_1_fused % 8 * 8
+                                + i0_2_i1_2_fused % 4 * 2
+                                + ax1,
+                            )
+                            T.reads(C_local[v0, v1])
+                            T.writes(C[v0, v1])
+                            C[v0, v1] = C_local[v0, v1]
+        for i0, i1 in T.grid(512, 512):
+            with T.block("compute"):
+                i0_1, i1_1 = T.axis.remap("SS", [i0, i1])
+                T.reads(C[i0_1, i1_1])
+                T.writes(compute[i0_1, i1_1])
+                compute[i0_1, i1_1] = T.max(C[i0_1, i1_1], T.float32(0))
+
+    decision_0 = [
+        ("SamplePerfectTile", [8, 8, 2, 2, 2]),
+        ("SamplePerfectTile", [8, 8, 4, 1, 2]),
+        ("SamplePerfectTile", [8, 8, 8]),
+        ("SampleCategorical", 1),
+        ("SampleCategorical", 3),
+    ]
+    mod = te.create_prim_func(te_workload.matmul_relu(512, 512, 512))
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3080"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=get_rules("cuda", ms.schedule_rule.MultiLevelTiling),
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[cuda_matmul_relu_0],
+        expected_decisions=[decision_0],
+    )
+
+
+def test_cuda_sum_with_trivial_block_iter():
+    @T.prim_func
+    def sum_with_trivial_block_iter(
+        A: T.Buffer[(1, 64, 768), "float32"],
+        B: T.Buffer[(1, 64, 1), "float32"],
+    ) -> None:
+        for i0, i1, i2, i3 in T.grid(1, 64, 1, 768):
+            with T.block("sum"):
+                ax0, ax1, ax2, k2 = T.axis.remap("SSSR", [i0, i1, i2, i3])
+                T.reads(A[ax0, ax1, k2])
+                T.writes(B[ax0, ax1, ax2])
+                with T.init():
+                    B[ax0, ax1, ax2] = T.float32(0)
+                B[ax0, ax1, ax2] = B[ax0, ax1, ax2] + A[ax0, ax1, k2]
+
+    # Expect nothing to happen - the rule is not supposed to be applied in this case
+    mod = sum_with_trivial_block_iter
+    (sch,) = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3080"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=get_rules("cuda", ms.schedule_rule.MultiLevelTiling),
+        task_name="test",
+    ).generate_design_space()
+    assert not sch.trace.simplified(remove_postproc=True).insts
+
+
+if __name__ == "__main__":
+    test_cpu_matmul()
+    test_cpu_matmul_relu()
+    test_cuda_matmul()
+    test_cuda_matmul_relu()
+    test_cuda_sum_with_trivial_block_iter()
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py
new file mode 100644
index 000000000000..38ddb137e108
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py
@@ -0,0 +1,418 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
+from tvm import meta_schedule as ms
+from tvm import te
+from tvm.ir import assert_structural_equal
+from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.script import tir as T
+from tvm.target import Target
+from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN
+from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
+
+
+def test_vnni_conv2d_nchwc():
+    @T.prim_func
+    def conv2d_nchwc(
+        placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"],
+        placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"],
+        conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"],
+    ) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4):
+            with T.block("conv2d_NCHWc_int8"):
+                (
+                    n,
+                    oc_chunk,
+                    oh,
+                    ow,
+                    oc_block,
+                    kh,
+                    kw,
+                    ic_outer,
+                    ic_f_inner,
+                    ic_s_inner,
+                ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9])
+                T.reads(
+                    placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner],
+                    placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner],
+                )
+                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block])
+                with T.init():
+                    conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0
+                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[
+                    n, oc_chunk, oh, ow, oc_block
+                ] + T.cast(
+                    placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32"
+                ) * T.cast(
+                    placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner],
+                    "int32",
+                )
+
+    # fmt: off
+    @T.prim_func
+    def vnni_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        conv2d_NCHWc_int8_global = T.alloc_buffer([1, 16, 56, 56, 16], dtype="int32")
+        for i0_0, i1_0, i2_0, i3_0, i4_0_0, i0_1, i1_1, i2_1, i3_1, i4_0_1 in T.grid(1, 8, 28, 56, 1, 1, 2, 1, 1, 1):
+            for i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1):
+                with T.block("conv2d_NCHWc_int8_o"):
+                    n = T.axis.spatial(1, 0)
+                    oc_chunk = T.axis.spatial(16, i1_0 * 2 + i1_1 + i1_2 + i1_3)
+                    oh = T.axis.spatial(56, i2_0 * 2 + i2_1 * 2 + i2_2 + i2_3)
+                    ow = T.axis.spatial(56, i3_3 + i3_0 + i3_1 + i3_2)
+                    oc_block_o = T.axis.spatial(1, 0)
+                    kh = T.axis.reduce(1, 0)
+                    kw = T.axis.reduce(1, 0)
+                    ic_outer = T.axis.reduce(4, i7_0 * 4 + i7_1)
+                    ic_f_inner = T.axis.reduce(4, i8_0 + i8_1)
+                    ic_s_inner_o = T.axis.reduce(1, 0)
+                    T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4])
+                    T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, 0 : 16])
+                    T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"})
+                    with T.init():
+                        for i4_1 in T.serial(16):
+                            with T.block("conv2d_NCHWc_int8_init"):
+                                oc_block_i_init = T.axis.spatial(16, i4_1)
+                                T.reads()
+                                T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i_init])
+                                conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+                    for i4_1, i9_1 in T.grid(16, 4):
+                        with T.block("conv2d_NCHWc_int8"):
+                            oc_block_i, ic_s_inner_i = T.axis.remap("SR", [i4_1, i9_1])
+                            T.reads(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i], placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i])
+                            T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i])
+                            T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
+                            conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i] = conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i] + T.cast(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], "int32") * T.cast(placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i], "int32")
+            for ax0, ax1, ax2, ax3, ax4 in T.grid(1, 1, 2, 1, 16):
+                with T.block("conv2d_NCHWc_int8_global"):
+                    v0 = T.axis.spatial(1, ax0)
+                    v1 = T.axis.spatial(16, i1_0 * 2 + i1_1 + ax1)
+                    v2 = T.axis.spatial(56, i2_0 * 2 + ax2)
+                    v3 = T.axis.spatial(56, i3_0 + ax3)
+                    v4 = T.axis.spatial(16, ax4)
+                    T.reads(conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4])
+                    T.writes(conv2d_NCHWc_int8[v0, v1, v2, v3, v4])
+                    conv2d_NCHWc_int8[v0, v1, v2, v3, v4] = conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4]
+
+    @T.prim_func
+    def vnni_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        conv2d_NCHWc_int8_global = T.alloc_buffer([1, 16, 56, 56, 16], dtype="int32")
+        for i0_0, i1_0, i2_0, i3_0, i4_0_0 in T.grid(1, 8, 28, 56, 1):
+            for i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1):
+                with T.block("conv2d_NCHWc_int8_o"):
+                    n = T.axis.spatial(1, 0)
+                    oc_chunk = T.axis.spatial(16, i1_0 * 2 + i1_1 + i1_2 + i1_3)
+                    oh = T.axis.spatial(56, i2_0 * 2 + i2_1 * 2 + i2_2 + i2_3)
+                    ow = T.axis.spatial(56, i3_3 + i3_0 + i3_1 + i3_2)
+                    oc_block_o = T.axis.spatial(1, 0)
+                    kh = T.axis.reduce(1, 0)
+                    kw = T.axis.reduce(1, 0)
+                    ic_outer = T.axis.reduce(4, i7_0 * 4 + i7_1)
+                    ic_f_inner = T.axis.reduce(4, i8_0 + i8_1)
+                    ic_s_inner_o = T.axis.reduce(1, 0)
+                    T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4])
+                    T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, 0 : 16])
+                    T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"})
+                    with T.init():
+                        for i4_1 in T.serial(16):
+                            with T.block("conv2d_NCHWc_int8_init"):
+                                oc_block_i_init = T.axis.spatial(16, i4_1)
+                                T.reads()
+                                T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i_init])
+                                conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+                    for i4_1, i9_1 in T.grid(16, 4):
+                        with T.block("conv2d_NCHWc_int8"):
+                            oc_block_i, ic_s_inner_i = T.axis.remap("SR", [i4_1, i9_1])
+                            T.reads(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i], placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i])
+                            T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i])
+                            T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
+                            conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i] = conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i] + T.cast(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], "int32") * T.cast(placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i], "int32")
+            for ax0, ax1, ax2, ax3, ax4 in T.grid(1, 2, 2, 1, 16):
+                with T.block("conv2d_NCHWc_int8_global"):
+                    v0 = T.axis.spatial(1, ax0)
+                    v1 = T.axis.spatial(16, i1_0 * 2 + ax1)
+                    v2 = T.axis.spatial(56, i2_0 * 2 + ax2)
+                    v3 = T.axis.spatial(56, i3_0 + ax3)
+                    v4 = T.axis.spatial(16, ax4)
+                    T.reads(conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4])
+                    T.writes(conv2d_NCHWc_int8[v0, v1, v2, v3, v4])
+                    conv2d_NCHWc_int8[v0, v1, v2, v3, v4] = conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4]
+
+    @T.prim_func
+    def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        for i0_0, i1_0, i2_0, i3_0, i4_0_0, i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 8, 28, 56, 1, 1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1):
+            with T.block("conv2d_NCHWc_int8_o"):
+                n = T.axis.spatial(1, 0)
+                oc_chunk = T.axis.spatial(16, i1_0 * 2 + i1_1 + i1_2 + i1_3)
+                oh = T.axis.spatial(56, i2_0 * 2 + i2_1 * 2 + i2_2 + i2_3)
+                ow = T.axis.spatial(56, i3_3 + i3_0 + i3_1 + i3_2)
+                oc_block_o = T.axis.spatial(1, 0)
+                kh = T.axis.reduce(1, 0)
+                kw = T.axis.reduce(1, 0)
+                ic_outer = T.axis.reduce(4, i7_0 * 4 + i7_1)
+                ic_f_inner = T.axis.reduce(4, i8_0 + i8_1)
+                ic_s_inner_o = T.axis.reduce(1, 0)
+                T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4])
+                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16])
+                T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"})
+                with T.init():
+                    for i4_1 in T.serial(16):
+                        with T.block("conv2d_NCHWc_int8_init"):
+                            oc_block_i_init = T.axis.spatial(16, i4_1)
+                            T.reads()
+                            T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init])
+                            conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+                for i4_1, i9_1 in T.grid(16, 4):
+                    with T.block("conv2d_NCHWc_int8"):
+                        oc_block_i, ic_s_inner_i = T.axis.remap("SR", [i4_1, i9_1])
+                        T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i], placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i])
+                        T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i])
+                        T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
+                        conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i] = conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i] + T.cast(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], "int32") * T.cast(placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i], "int32")
+    # fmt: on
+    decision_0 = [
+        ("SamplePerfectTile", [1, 1, 1, 1]),
+        ("SamplePerfectTile", [8, 2, 1, 1]),
+        ("SamplePerfectTile", [28, 1, 2, 1]),
+        ("SamplePerfectTile", [56, 1, 1, 1]),
+        ("SamplePerfectTile", [1, 1, 1, 1]),
+        ("SamplePerfectTile", [1, 1]),
+        ("SamplePerfectTile", [1, 1]),
+        ("SamplePerfectTile", [1, 4]),
+        ("SamplePerfectTile", [4, 1]),
+        ("SamplePerfectTile", [1, 1]),
+    ]
+    decision_1 = [
+        ("SamplePerfectTile", [1, 1, 1, 1]),
+        ("SamplePerfectTile", [8, 2, 1, 1]),
+        ("SamplePerfectTile", [28, 1, 2, 1]),
+        ("SamplePerfectTile", [56, 1, 1, 1]),
+        ("SamplePerfectTile", [1, 1, 1, 1]),
+        ("SamplePerfectTile", [1, 1]),
+        ("SamplePerfectTile", [1, 1]),
+        ("SamplePerfectTile", [1, 4]),
+        ("SamplePerfectTile", [4, 1]),
+        ("SamplePerfectTile", [1, 1]),
+    ]
+    decision_2 = [
+        ("SamplePerfectTile", [1, 1, 1, 1]),
+        ("SamplePerfectTile", [8, 2, 1, 1]),
+        ("SamplePerfectTile", [28, 1, 2, 1]),
+        ("SamplePerfectTile", [56, 1, 1, 1]),
+        ("SamplePerfectTile", [1, 1, 1, 1]),
+        ("SamplePerfectTile", [1, 1]),
+        ("SamplePerfectTile", [1, 1]),
+        ("SamplePerfectTile", [1, 4]),
+        ("SamplePerfectTile", [4, 1]),
+        ("SamplePerfectTile", [1, 1]),
+    ]
+
+    mod = conv2d_nchwc
+    target = Target("llvm -mcpu=cascadelake -num-cores=4")
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target(target),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.MultiLevelTilingWithIntrin(
+                VNNI_INTRIN,
+                structure="SSRSRS",
+                tile_binds=None,
+                max_innermost_factor=64,
+                vector_load_lens=None,
+                reuse_read=None,
+                reuse_write=ms.schedule_rule.ReuseType(req="may", levels=[1, 2], scope="global"),
+            ),
+        ],
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[vnni_conv2d_nchwc_0, vnni_conv2d_nchwc_1, vnni_conv2d_nchwc_2],
+        expected_decisions=[decision_0, decision_1, decision_2],
+    )
+
+
+def _check_dp4a_dense(m, n, k, in_dtype, out_dtype, expected_mods, expected_decisions):
+    def _dense(m, n, k, in_dtype, out_dtype):
+        X = te.placeholder((m, k), name="X", dtype=in_dtype)
+        W = te.placeholder((n, k), name="W", dtype=in_dtype)
+        ak = te.reduce_axis((0, k), name="k")
+        matmul = te.compute(
+            (m, n),
+            lambda i, j: te.sum(
+                X[i, ak].astype(out_dtype) * W[j, ak].astype(out_dtype),
+                axis=ak,
+            ),
+            name="compute",
+        )
+        return te.create_prim_func([X, W, matmul])
+
+    mod = _dense(m, n, k, in_dtype, out_dtype)
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("cuda"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.MultiLevelTilingWithIntrin(
+                DP4A_INTRIN,
+                structure="SSSRRSRS",
+                tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"],
+                max_innermost_factor=64,
+                vector_load_lens=[1, 2, 3, 4],
+                reuse_read=ms.schedule_rule.ReuseType(req="must", levels=[4], scope="shared"),
+                reuse_write=ms.schedule_rule.ReuseType(req="must", levels=[3], scope="local"),
+            )
+        ],
+    ).generate_design_space()
+    if expected_mods is None:
+        assert expected_decisions is None
+        assert len(actual) == 1
+        assert_structural_equal(mod, actual[0].mod["main"])
+    else:
+        check_sketches(mod, actual, expected_mods, expected_decisions)
+
+
+def test_dp4a_dense():
+    @T.prim_func
+    def dp4a_dense_0(
+        X: T.Buffer[(128, 128), "int8"],
+        W: T.Buffer[(128, 128), "int8"],
+        compute: T.Buffer[(128, 128), "int32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        compute_local = T.alloc_buffer([128, 128], dtype="int32", scope="local")
+        X_shared = T.alloc_buffer([128, 128], dtype="int8", scope="shared")
+        W_shared = T.alloc_buffer([128, 128], dtype="int8", scope="shared")
+        for i0_0_i1_0_fused in T.thread_binding(1, thread="blockIdx.x"):
+            for i0_1_i1_1_fused in T.thread_binding(512, thread="vthread.x"):
+                for i0_2_i1_2_fused in T.thread_binding(2, thread="threadIdx.x"):
+                    for i2_0_0 in T.serial(1):
+                        for ax0_ax1_fused in T.serial(16384):
+                            with T.block("X_shared"):
+                                v0 = T.axis.spatial(128, ax0_ax1_fused // 128)
+                                v1 = T.axis.spatial(128, ax0_ax1_fused % 128)
+                                T.reads(X[v0, v1])
+                                T.writes(X_shared[v0, v1])
+                                T.block_attr({"meta_schedule.cooperative_fetch": 1})
+                                X_shared[v0, v1] = X[v0, v1]
+                        for ax0_ax1_fused in T.serial(16384):
+                            with T.block("W_shared"):
+                                v0 = T.axis.spatial(128, ax0_ax1_fused // 128)
+                                v1 = T.axis.spatial(128, ax0_ax1_fused % 128)
+                                T.reads(W[v0, v1])
+                                T.writes(W_shared[v0, v1])
+                                T.block_attr({"meta_schedule.cooperative_fetch": 1})
+                                W_shared[v0, v1] = W[v0, v1]
+                        for i2_0_1, i0_3, i1_3, i2_0_2, i0_4, i1_4 in T.grid(1, 2, 4, 32, 2, 1):
+                            with T.block("compute_o"):
+                                i = T.axis.spatial(
+                                    128,
+                                    i0_1_i1_1_fused // 32 * 8
+                                    + i0_2_i1_2_fused * 4
+                                    + i0_3 * 2
+                                    + i0_4,
+                                )
+                                j = T.axis.spatial(128, i1_4 + i0_1_i1_1_fused % 32 * 4 + i1_3)
+                                k_o = T.axis.reduce(32, i2_0_0 * 32 + i2_0_1 * 32 + i2_0_2)
+                                T.reads(
+                                    X_shared[i, k_o * 4 : k_o * 4 + 4],
+                                    W_shared[j, k_o * 4 : k_o * 4 + 4],
+                                )
+                                T.writes(compute_local[i, j])
+                                T.block_attr({"meta_schedule.auto_tensorize": "dp4a"})
+                                with T.init():
+                                    with T.block("compute_init"):
+                                        T.reads()
+                                        T.writes(compute_local[i, j])
+                                        compute_local[i, j] = 0
+                                for i2_1 in T.serial(4):
+                                    with T.block("compute"):
+                                        k_i = T.axis.reduce(4, i2_1)
+                                        T.reads(
+                                            compute_local[i, j],
+                                            X_shared[i, k_o * 4 + k_i],
+                                            W_shared[j, k_o * 4 + k_i],
+                                        )
+                                        T.writes(compute_local[i, j])
+                                        T.block_attr({"meta_schedule.tiling_structure": "SSSRRSRS"})
+                                        compute_local[i, j] = compute_local[i, j] + T.cast(
+                                            X_shared[i, k_o * 4 + k_i], "int32"
+                                        ) * T.cast(W_shared[j, k_o * 4 + k_i], "int32")
+                    for ax0, ax1 in T.grid(4, 4):
+                        with T.block("compute_local"):
+                            v0 = T.axis.spatial(
+                                128, i0_1_i1_1_fused // 32 * 8 + i0_2_i1_2_fused * 4 + ax0
+                            )
+                            v1 = T.axis.spatial(128, i0_1_i1_1_fused % 32 * 4 + ax1)
+                            T.reads(compute_local[v0, v1])
+                            T.writes(compute[v0, v1])
+                            compute[v0, v1] = compute_local[v0, v1]
+
+    decision_0 = [
+        ("SamplePerfectTile", [1, 16, 2, 2, 2]),
+        ("SamplePerfectTile", [1, 32, 1, 4, 1]),
+        ("SamplePerfectTile", [1, 1, 32]),
+        ("SampleCategorical", 0),
+        ("SampleCategorical", 0),
+    ]
+    _check_dp4a_dense(
+        m=128,
+        n=128,
+        k=128,
+        in_dtype="int8",
+        out_dtype="int32",
+        expected_mods=[dp4a_dense_0],
+        expected_decisions=[decision_0],
+    )
+
+
+def test_dp4a_dense_no_tensorize_1():
+    _check_dp4a_dense(
+        m=128,
+        n=128,
+        k=128,
+        in_dtype="float32",
+        out_dtype="float32",
+        expected_mods=None,
+        expected_decisions=None,
+    )
+
+
+def test_dp4a_dense_no_tensorize_2():
+    _check_dp4a_dense(
+        m=127,
+        n=127,
+        k=127,
+        in_dtype="int8",
+        out_dtype="int32",
+        expected_mods=None,
+        expected_decisions=None,
+    )
+
+
+if __name__ == "__main__":
+    test_vnni_conv2d_nchwc()
+    test_dp4a_dense()
+    test_dp4a_dense_no_tensorize_1()
+    test_dp4a_dense_no_tensorize_2()
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
new file mode 100644
index 000000000000..fbb74090b1e5
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
@@ -0,0 +1,957 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
+import tvm
+from tvm import meta_schedule as ms
+from tvm import te
+from tvm.meta_schedule.testing import te_workload
+from tvm.meta_schedule.testing.schedule_rule import get_rules
+from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.script import tir as T
+from tvm.tir.tensor_intrin.cuda import get_wmma_intrin_group
+
+
+def multi_level_tiling_tensor_core(
+    *,
+    write_reuse_scope="shared",
+    in_dtype="float16",
+    out_dtype="float32",
+    trans_b=False,
+    use_software_pipeline=False,
+) -> ms.schedule_rule.ScheduleRule:
+    assert write_reuse_scope in ["shared", "global"]
+    if not isinstance(in_dtype, list):
+        in_dtype = [in_dtype]
+    if not isinstance(out_dtype, list):
+        out_dtype = [out_dtype]
+    if not isinstance(trans_b, list):
+        trans_b = [trans_b]
+    return ms.schedule_rule.MultiLevelTilingTensorCore(
+        intrin_groups=[
+            get_wmma_intrin_group(write_reuse_scope, _in_dtype, _out_dtype, _trans_b)
+            for _in_dtype in in_dtype
+            for _out_dtype in out_dtype
+            for _trans_b in trans_b
+        ],
+        structure="SSSRRSRS",
+        tile_binds=["blockIdx.y", "blockIdx.x", "threadIdx.y"],
+        max_innermost_factor=4,  # 64 // tensor intrin size
+        vector_load_lens=[1, 2, 3, 4, 8, 16],
+        reuse_read=ms.schedule_rule.ReuseType(
+            req="must",
+            levels=[4],
+            scope="shared",
+        ),
+        reuse_write=ms.schedule_rule.ReuseType(
+            req="must" if write_reuse_scope == "shared" else "no",
+            levels=[2],
+            scope=write_reuse_scope,
+        ),
+        use_software_pipeline=use_software_pipeline,
+    )
+
+
+def test_matmul_relu():
+    # fmt: off
+    @T.prim_func
+    def matmul_relu_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "float16"], compute: T.Buffer[(128, 128), "float32"]) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        C_reindex_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+        C_reindex_shared_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator")
+        A_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared")
+        B_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared")
+        A_reindex_shared_wmma_matrix_a = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_a")
+        B_reindex_shared_wmma_matrix_b = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_b")
+        for ax0_0_0_ax1_0_0_fused in T.thread_binding(8, thread="blockIdx.y"):
+            for ax0_0_1_ax1_0_1_fused in T.thread_binding(2, thread="blockIdx.x"):
+                for ax0_0_2_ax1_0_2_fused in T.thread_binding(2, thread="threadIdx.y"):
+                    for ax2_0_0 in T.serial(1):
+                        for ax0_ax1_fused in T.serial(4096):
+                            with T.block("A_reindex_shared"):
+                                v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0_ax1_fused // 128)
+                                v1 = T.axis.spatial(128, ax0_ax1_fused % 128)
+                                T.reads(A[v0, v1])
+                                T.writes(A_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8})
+                                A_reindex_shared[v0, v1] = A[v0, v1]
+                        for ax0_ax1_fused in T.serial(4096):
+                            with T.block("B_reindex_shared"):
+                                v0 = T.axis.spatial(128, ax0_ax1_fused // 32)
+                                v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0_ax1_fused % 32)
+                                T.reads(B[v0, v1])
+                                T.writes(B_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":1})
+                                B_reindex_shared[v0, v1] = B[v0, v1]
+                        for ax2_0_1 in T.serial(4):
+                            for ax0_0, ax1_0 in T.grid(2, 2):
+                                with T.block("A_reindex_shared_wmma.matrix_a_o"):
+                                    v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0)
+                                    v1_o = T.axis.spatial(8, ax2_0_1 * 2 + ax1_0)
+                                    T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("A_reindex_shared_wmma.matrix_a"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0, ax1_0 in T.grid(2, 1):
+                                with T.block("B_reindex_shared_wmma.matrix_b_o"):
+                                    v0_o = T.axis.spatial(8, ax2_0_1 * 2 + ax0_0)
+                                    v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused)
+                                    T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("B_reindex_shared_wmma.matrix_b"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 1, 2, 2, 1):
+                                with T.block("C_o"):
+                                    v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0_3 * 2 + ax0_0_4)
+                                    v1_o = T.axis.spatial(8, ax1_0_4 + ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused + ax1_0_3)
+                                    v2_o = T.axis.reduce(8, ax2_0_0 * 8 + ax2_0_1 * 2 + ax2_0_2)
+                                    T.reads(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
+                                    with T.init():
+                                        for ax0_1, ax1_1 in T.grid(16, 16):
+                                            with T.block("C_init"):
+                                                v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1])
+                                                T.reads()
+                                                T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init])
+                                                C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0)
+                                    for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16):
+                                        with T.block("C"):
+                                            v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1])
+                                            T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i])
+                                            T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                            C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32")
+                    for ax0_0, ax1_0 in T.grid(2, 1):
+                        with T.block("C_reindex_shared_wmma.accumulator_o"):
+                            v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0)
+                            v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused)
+                            T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.writes(C_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"})
+                            for ax0_1, ax1_1 in T.grid(16, 16):
+                                with T.block("C_reindex_shared_wmma.accumulator"):
+                                    v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                    T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                for ax0, ax1 in T.grid(32, 32):
+                    with T.block("C_reindex_shared"):
+                        v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0)
+                        v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax1)
+                        T.reads(C_reindex_shared[v0, v1])
+                        T.writes(compute[v0, v1])
+                        T.block_attr({"meta_schedule.cooperative_fetch":4})
+                        compute[v0, v1] = T.max(C_reindex_shared[v0, v1], T.float32(0))
+    # fmt: on
+    decision_0 = [
+        ("SamplePerfectTile", [4, 1, 1, 1, 2]),
+        ("SamplePerfectTile", [2, 2, 2, 1, 1]),
+        ("SamplePerfectTile", [1, 4, 2]),
+        ("SampleCategorical", 3),
+        ("SampleCategorical", 3),
+        ("SampleCategorical", 0),
+    ]
+
+    mod = te.create_prim_func(
+        te_workload.matmul_relu(
+            n=128,
+            m=128,
+            k=128,
+            in_dtype="float16",
+            out_dtype="float32",
+        )
+    )
+    actual = ms.TuneContext(
+        mod=mod,
+        target=tvm.target.Target("cuda"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[multi_level_tiling_tensor_core()]
+        + get_rules("cuda", ms.schedule_rule.AutoInline),
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[matmul_relu_0],
+        expected_decisions=[decision_0],
+    )
+
+
+def test_matmul_relu_with_fallback():
+    # fmt: off
+    @T.prim_func
+    def matmul_relu_fallback_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "float16"], compute: T.Buffer[(128, 128), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C_reindex_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+        C_reindex_shared_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator")
+        A_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared")
+        B_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared")
+        A_reindex_shared_wmma_matrix_a = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_a")
+        B_reindex_shared_wmma_matrix_b = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_b")
+        for ax0_0_0_ax1_0_0_fused in T.thread_binding(2, thread="blockIdx.y"):
+            for ax0_0_1_ax1_0_1_fused in T.thread_binding(2, thread="blockIdx.x"):
+                for ax0_0_2_ax1_0_2_fused in T.thread_binding(2, thread="threadIdx.y"):
+                    for ax2_0_0 in T.serial(2):
+                        for ax0_ax1_fused in T.serial(2048):
+                            with T.block("A_reindex_shared"):
+                                v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0_ax1_fused // 64)
+                                v1 = T.axis.spatial(128, ax2_0_0 * 64 + ax0_ax1_fused % 64)
+                                T.reads(A[v0, v1])
+                                T.writes(A_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":4})
+                                A_reindex_shared[v0, v1] = A[v0, v1]
+                        for ax0_ax1_fused in T.serial(8192):
+                            with T.block("B_reindex_shared"):
+                                v0 = T.axis.spatial(128, ax2_0_0 * 64 + ax0_ax1_fused // 128)
+                                v1 = T.axis.spatial(128, ax0_ax1_fused % 128)
+                                T.reads(B[v0, v1])
+                                T.writes(B_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":2})
+                                B_reindex_shared[v0, v1] = B[v0, v1]
+                        for ax2_0_1 in T.serial(1):
+                            for ax0_0, ax1_0 in T.grid(2, 4):
+                                with T.block("A_reindex_shared_wmma.matrix_a_o"):
+                                    v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0)
+                                    v1_o = T.axis.spatial(8, ax2_0_0 * 4 + ax1_0)
+                                    T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("A_reindex_shared_wmma.matrix_a"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0, ax1_0 in T.grid(4, 4):
+                                with T.block("B_reindex_shared_wmma.matrix_b_o"):
+                                    v0_o = T.axis.spatial(8, ax2_0_0 * 4 + ax0_0)
+                                    v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused * 4 + ax1_0)
+                                    T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("B_reindex_shared_wmma.matrix_b"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 1, 4, 2, 4):
+                                with T.block("C_o"):
+                                    v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_3 * 2 + ax0_0_4)
+                                    v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused * 4 + ax1_0_3 * 4 + ax1_0_4)
+                                    v2_o = T.axis.reduce(8, ax2_0_0 * 4 + ax2_0_1 * 4 + ax2_0_2)
+                                    T.reads(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
+                                    with T.init():
+                                        for ax0_1, ax1_1 in T.grid(16, 16):
+                                            with T.block("C_init"):
+                                                v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1])
+                                                T.reads()
+                                                T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init])
+                                                C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0)
+                                    for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16):
+                                        with T.block("C"):
+                                            v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1])
+                                            T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i])
+                                            T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                            C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32")
+                    for ax0_0, ax1_0 in T.grid(2, 4):
+                        with T.block("C_reindex_shared_wmma.accumulator_o"):
+                            v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0)
+                            v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused * 4 + ax1_0)
+                            T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.writes(C_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"})
+                            for ax0_1, ax1_1 in T.grid(16, 16):
+                                with T.block("C_reindex_shared_wmma.accumulator"):
+                                    v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                    T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                for ax0, ax1 in T.grid(32, 128):
+                    with T.block("C_reindex_shared"):
+                        v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0)
+                        v1 = T.axis.spatial(128, ax1)
+                        T.reads(C_reindex_shared[v0, v1])
+                        T.writes(compute[v0, v1])
+                        T.block_attr({"meta_schedule.cooperative_fetch":4})
+                        compute[v0, v1] = T.max(C_reindex_shared[v0, v1], T.float32(0))
+    # fmt: on
+    decision_0 = [
+        ("SamplePerfectTile", [2, 2, 1, 1, 2]),
+        ("SamplePerfectTile", [1, 1, 2, 1, 4]),
+        ("SamplePerfectTile", [2, 1, 4]),
+        ("SampleCategorical", 3),
+        ("SampleCategorical", 2),
+        ("SampleCategorical", 1),
+    ]
+
+    mod = te.create_prim_func(
+        te_workload.matmul_relu(
+            n=128,
+            m=128,
+            k=128,
+            in_dtype="float16",
+            out_dtype="float32",
+        )
+    )
+    actual = ms.TuneContext(
+        mod=mod,
+        target=tvm.target.Target("cuda"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            multi_level_tiling_tensor_core(),
+        ]
+        + get_rules(
+            "cuda",
+            (
+                ms.schedule_rule.MultiLevelTiling,
+                ms.schedule_rule.AutoInline,
+            ),
+        ),
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[matmul_relu_fallback_0],
+        expected_decisions=[decision_0],
+    )
+
+
+def test_conv2d():
+    # fmt: off
+    @T.prim_func
+    def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3, 3, 32, 32), "float16"], conv2d_nhwc: T.Buffer[(1, 16, 16, 32), "float32"]) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        PadInput = T.alloc_buffer([1, 18, 18, 32], dtype="float16")
+        conv2d_nhwc_reindex_shared = T.alloc_buffer([256, 32], dtype="float32", scope="shared")
+        conv2d_nhwc_reindex_shared_wmma_accumulator = T.alloc_buffer([256, 32], dtype="float32", scope="wmma.accumulator")
+        PadInput_reindex_shared = T.alloc_buffer([256, 288], dtype="float16", scope="shared")
+        weight_reindex_shared = T.alloc_buffer([288, 32], dtype="float16", scope="shared")
+        PadInput_reindex_shared_wmma_matrix_a = T.alloc_buffer([256, 288], dtype="float16", scope="wmma.matrix_a")
+        weight_reindex_shared_wmma_matrix_b = T.alloc_buffer([288, 32], dtype="float16", scope="wmma.matrix_b")
+        for i0, i1, i2, i3 in T.grid(1, 18, 18, 32):
+            with T.block("PadInput"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1])
+                T.writes(PadInput[i0_1, i1_1, i2_1, i3_1])
+                PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 17 and 1 <= i2_1 and i2_1 < 17, inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float16(0), dtype="float16")
+        for ax0_0_ax1_0_0_ax2_0_0_fused in T.thread_binding(2, thread="blockIdx.y"):
+            for ax0_1_ax1_0_1_ax2_0_1_fused in T.thread_binding(16, thread="blockIdx.x"):
+                for ax0_2_ax1_0_2_ax2_0_2_fused in T.thread_binding(1, thread="threadIdx.y"):
+                    for ax3_0_0 in T.serial(1):
+                        for ax0_ax1_fused in T.serial(4608):
+                            with T.block("PadInput_reindex_shared"):
+                                v0 = T.axis.spatial(256, ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0_ax1_fused // 288)
+                                v1 = T.axis.spatial(288, ax0_ax1_fused % 288)
+                                T.reads(PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32])
+                                T.writes(PadInput_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":2})
+                                PadInput_reindex_shared[v0, v1] = PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32]
+                        for ax0_ax1_fused in T.serial(4608):
+                            with T.block("weight_reindex_shared"):
+                                v0 = T.axis.spatial(288, ax0_ax1_fused // 16)
+                                v1 = T.axis.spatial(32, ax0_0_ax1_0_0_ax2_0_0_fused * 16 + ax0_ax1_fused % 16)
+                                T.reads(weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1])
+                                T.writes(weight_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8})
+                                weight_reindex_shared[v0, v1] = weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1]
+                        for ax3_0_1 in T.serial(18):
+                            for ax0_0, ax1_0 in T.grid(1, 1):
+                                with T.block("PadInput_reindex_shared_wmma.matrix_a_o"):
+                                    v0_o, v1_o = T.axis.remap("SS", [ax0_1_ax1_0_1_ax2_0_1_fused, ax3_0_1])
+                                    T.reads(PadInput_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("PadInput_reindex_shared_wmma.matrix_a"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0, ax1_0 in T.grid(1, 1):
+                                with T.block("weight_reindex_shared_wmma.matrix_b_o"):
+                                    v0_o, v1_o = T.axis.remap("SS", [ax3_0_1, ax0_0_ax1_0_0_ax2_0_0_fused])
+                                    T.reads(weight_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("weight_reindex_shared_wmma.matrix_b"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_3, ax1_0_3, ax2_0_3, ax3_0_2, ax0_4, ax1_0_4, ax2_0_4 in T.grid(1, 1, 1, 1, 1, 1, 1):
+                                with T.block("conv2d_nhwc_o"):
+                                    v0 = T.axis.spatial(1, 0)
+                                    v1_o = T.axis.spatial(16, ax1_0_4 + ax0_1_ax1_0_1_ax2_0_1_fused + ax1_0_3)
+                                    v2_o = T.axis.spatial(2, ax0_0_ax1_0_0_ax2_0_0_fused + ax2_0_3 + ax2_0_4)
+                                    v3_o = T.axis.reduce(18, ax3_0_0 * 18 + ax3_0_1 + ax3_0_2)
+                                    T.reads(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 : v1_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], weight_reindex_shared_wmma_matrix_b[v3_o * 16 : v3_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16])
+                                    T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
+                                    with T.init():
+                                        for ax1_1, ax2_1 in T.grid(16, 16):
+                                            with T.block("conv2d_nhwc_init"):
+                                                v1_i_init, v2_i_init = T.axis.remap("SS", [ax1_1, ax2_1])
+                                                T.reads()
+                                                T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init])
+                                                conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init] = T.float32(0)
+                                    for ax1_1, ax2_1, ax3_1 in T.grid(16, 16, 16):
+                                        with T.block("conv2d_nhwc"):
+                                            v1_i, v2_i, v3_i = T.axis.remap("SSR", [ax1_1, ax2_1, ax3_1])
+                                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i], PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i])
+                                            T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i])
+                                            T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                            conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] + T.cast(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], "float32") * T.cast(weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i], "float32")
+                    for ax0_0, ax1_0 in T.grid(1, 1):
+                        with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"):
+                            v0_o, v1_o = T.axis.remap("SS", [ax0_1_ax1_0_1_ax2_0_1_fused, ax0_0_ax1_0_0_ax2_0_0_fused])
+                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"})
+                            for ax0_1, ax1_1 in T.grid(16, 16):
+                                with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator"):
+                                    v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                    T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                for ax0, ax1 in T.grid(16, 16):
+                    with T.block("conv2d_nhwc_reindex_shared"):
+                        v0 = T.axis.spatial(256, ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0)
+                        v1 = T.axis.spatial(32, ax0_0_ax1_0_0_ax2_0_0_fused * 16 + ax1)
+                        T.reads(conv2d_nhwc_reindex_shared[v0, v1])
+                        T.writes(conv2d_nhwc[0, v0 // 16, v0 % 16, v1])
+                        T.block_attr({"meta_schedule.cooperative_fetch":3})
+                        conv2d_nhwc[0, v0 // 16, v0 % 16, v1] = conv2d_nhwc_reindex_shared[v0, v1]
+    # fmt: on
+    decision_0 = [
+        ("SamplePerfectTile", [1, 1, 1, 1, 1]),
+        ("SamplePerfectTile", [1, 16, 1, 1, 1]),
+        ("SamplePerfectTile", [2, 1, 1, 1, 1]),
+        ("SamplePerfectTile", [1, 18, 1]),
+        ("SampleCategorical", 2),
+        ("SampleCategorical", 1),
+        ("SampleCategorical", 3),
+    ]
+    mod = te.create_prim_func(
+        te_workload.conv2d_nhwc(
+            N=1,
+            H=16,
+            W=16,
+            CI=32,
+            CO=32,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            in_dtype="float16",
+            out_dtype="float32",
+        )
+    )
+    actual = ms.TuneContext(
+        mod=mod,
+        target=tvm.target.Target("cuda"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[multi_level_tiling_tensor_core()],
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[conv2d_0],
+        expected_decisions=[decision_0],
+    )
+
+
+def test_conv2d_more_intrin():
+    # test adding inapplicable tensor intrinsics doesn't change the search space
+    # fmt: off
+    @T.prim_func
+    def conv2d_more_intrin_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3, 3, 32, 32), "float16"], conv2d_nhwc: T.Buffer[(1, 16, 16, 32), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        PadInput = T.alloc_buffer([1, 18, 18, 32], dtype="float16")
+        conv2d_nhwc_reindex_shared = T.alloc_buffer([256, 32], dtype="float32", scope="shared")
+        conv2d_nhwc_reindex_shared_wmma_accumulator = T.alloc_buffer([256, 32], dtype="float32", scope="wmma.accumulator")
+        PadInput_reindex_shared = T.alloc_buffer([256, 288], dtype="float16", scope="shared")
+        weight_reindex_shared = T.alloc_buffer([288, 32], dtype="float16", scope="shared")
+        PadInput_reindex_shared_wmma_matrix_a = T.alloc_buffer([256, 288], dtype="float16", scope="wmma.matrix_a")
+        weight_reindex_shared_wmma_matrix_b = T.alloc_buffer([288, 32], dtype="float16", scope="wmma.matrix_b")
+        for i0, i1, i2, i3 in T.grid(1, 18, 18, 32):
+            with T.block("PadInput"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1])
+                T.writes(PadInput[i0_1, i1_1, i2_1, i3_1])
+                PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 17 and 1 <= i2_1 and i2_1 < 17, inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float16(0), dtype="float16")
+        for ax0_0_ax1_0_0_ax2_0_0_fused in T.thread_binding(4, thread="blockIdx.y"):
+            for ax0_1_ax1_0_1_ax2_0_1_fused in T.thread_binding(4, thread="blockIdx.x"):
+                for ax0_2_ax1_0_2_ax2_0_2_fused in T.thread_binding(1, thread="threadIdx.y"):
+                    for ax3_0_0 in T.serial(3):
+                        for ax0_ax1_fused in T.serial(1536):
+                            with T.block("PadInput_reindex_shared"):
+                                v0 = T.axis.spatial(256, ax0_0_ax1_0_0_ax2_0_0_fused * 64 + ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0_ax1_fused // 96)
+                                v1 = T.axis.spatial(288, ax3_0_0 * 96 + ax0_ax1_fused % 96)
+                                T.reads(PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32])
+                                T.writes(PadInput_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8})
+                                PadInput_reindex_shared[v0, v1] = PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32]
+                        for ax0_ax1_fused in T.serial(3072):
+                            with T.block("weight_reindex_shared"):
+                                v0 = T.axis.spatial(288, ax3_0_0 * 96 + ax0_ax1_fused // 32)
+                                v1 = T.axis.spatial(32, ax0_ax1_fused % 32)
+                                T.reads(weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1])
+                                T.writes(weight_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8})
+                                weight_reindex_shared[v0, v1] = weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1]
+                        for ax3_0_1 in T.serial(2):
+                            for ax0_0, ax1_0 in T.grid(1, 3):
+                                with T.block("PadInput_reindex_shared_wmma.matrix_a_o"):
+                                    v0_o = T.axis.spatial(16, ax0_0_ax1_0_0_ax2_0_0_fused * 4 + ax0_1_ax1_0_1_ax2_0_1_fused)
+                                    v1_o = T.axis.spatial(18, ax3_0_0 * 6 + ax3_0_1 * 3 + ax1_0)
+                                    T.reads(PadInput_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("PadInput_reindex_shared_wmma.matrix_a"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0, ax1_0 in T.grid(3, 2):
+                                with T.block("weight_reindex_shared_wmma.matrix_b_o"):
+                                    v0_o = T.axis.spatial(18, ax3_0_0 * 6 + ax3_0_1 * 3 + ax0_0)
+                                    v1_o = T.axis.spatial(2, ax1_0)
+                                    T.reads(weight_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("weight_reindex_shared_wmma.matrix_b"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_3, ax1_0_3, ax2_0_3, ax3_0_2, ax0_4, ax1_0_4, ax2_0_4 in T.grid(1, 1, 2, 3, 1, 1, 1):
+                                with T.block("conv2d_nhwc_o"):
+                                    v0 = T.axis.spatial(1, 0)
+                                    v1_o = T.axis.spatial(16, ax1_0_4 + ax0_0_ax1_0_0_ax2_0_0_fused * 4 + ax0_1_ax1_0_1_ax2_0_1_fused + ax1_0_3)
+                                    v2_o = T.axis.spatial(2, ax2_0_4 + ax2_0_3)
+                                    v3_o = T.axis.reduce(18, ax3_0_0 * 6 + ax3_0_1 * 3 + ax3_0_2)
+                                    T.reads(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 : v1_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], weight_reindex_shared_wmma_matrix_b[v3_o * 16 : v3_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16])
+                                    T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
+                                    with T.init():
+                                        for ax1_1, ax2_1 in T.grid(16, 16):
+                                            with T.block("conv2d_nhwc_init"):
+                                                v1_i_init, v2_i_init = T.axis.remap("SS", [ax1_1, ax2_1])
+                                                T.reads()
+                                                T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init])
+                                                conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init] = T.float32(0)
+                                    for ax1_1, ax2_1, ax3_1 in T.grid(16, 16, 16):
+                                        with T.block("conv2d_nhwc"):
+                                            v1_i, v2_i, v3_i = T.axis.remap("SSR", [ax1_1, ax2_1, ax3_1])
+                                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i], PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i])
+                                            T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i])
+                                            T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                            conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] + T.cast(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], "float32") * T.cast(weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i], "float32")
+                    for ax0_0, ax1_0 in T.grid(1, 2):
+                        with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"):
+                            v0_o = T.axis.spatial(16, ax0_0_ax1_0_0_ax2_0_0_fused * 4 + ax0_1_ax1_0_1_ax2_0_1_fused)
+                            v1_o = T.axis.spatial(2, ax1_0)
+                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"})
+                            for ax0_1, ax1_1 in T.grid(16, 16):
+                                with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator"):
+                                    v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                    T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                for ax0, ax1 in T.grid(16, 32):
+                    with T.block("conv2d_nhwc_reindex_shared"):
+                        v0 = T.axis.spatial(256, ax0_0_ax1_0_0_ax2_0_0_fused * 64 + ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0)
+                        v1 = T.axis.spatial(32, ax1)
+                        T.reads(conv2d_nhwc_reindex_shared[v0, v1])
+                        T.writes(conv2d_nhwc[0, v0 // 16, v0 % 16, v1])
+                        T.block_attr({"meta_schedule.cooperative_fetch":3})
+                        conv2d_nhwc[0, v0 // 16, v0 % 16, v1] = conv2d_nhwc_reindex_shared[v0, v1]
+    # fmt: on
+    decision_0 = [
+        ("SamplePerfectTile", [1, 1, 1, 1, 1]),
+        ("SamplePerfectTile", [4, 4, 1, 1, 1]),
+        ("SamplePerfectTile", [1, 1, 1, 2, 1]),
+        ("SamplePerfectTile", [3, 2, 3]),
+        ("SampleCategorical", 2),
+        ("SampleCategorical", 3),
+        ("SampleCategorical", 3),
+    ]
+
+    mod = te.create_prim_func(
+        te_workload.conv2d_nhwc(
+            N=1,
+            H=16,
+            W=16,
+            CI=32,
+            CO=32,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            in_dtype="float16",
+            out_dtype="float32",
+        )
+    )
+    actual = ms.TuneContext(
+        mod=mod,
+        target=tvm.target.Target("cuda"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            multi_level_tiling_tensor_core(
+                in_dtype="float16",
+                out_dtype=["float16", "float32"],
+            ),
+        ],
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[conv2d_more_intrin_0],
+        expected_decisions=[decision_0],
+    )
+
+
+def test_matmul_relu_pipeline():
+    # fmt: off
+    @T.prim_func
+    def matmul_relu_pipeline_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "float16"], compute: T.Buffer[(128, 128), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C = T.alloc_buffer([128, 128], dtype="float32")
+        C_reindex_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+        C_reindex_shared_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator")
+        A_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared")
+        B_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared")
+        A_reindex_shared_wmma_matrix_a = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_a")
+        B_reindex_shared_wmma_matrix_b = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_b")
+        for ax0_0_0_ax1_0_0_fused in T.thread_binding(1, thread="blockIdx.y"):
+            for ax0_0_1_ax1_0_1_fused in T.thread_binding(16, thread="blockIdx.x"):
+                for ax0_0_2_ax1_0_2_fused in T.thread_binding(1, thread="threadIdx.y"):
+                    for ax2_0_0 in T.serial(4, annotations={"software_pipeline_order":[0, 3, 1, 4, 5, 2, 6], "software_pipeline_stage":[0, 0, 0, 0, 0, 1, 1]}):
+                        for ax0_ax1_fused in T.serial(1024):
+                            with T.block("A_reindex_shared"):
+                                v0 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused // 4 * 32 + ax0_ax1_fused // 32)
+                                v1 = T.axis.spatial(128, ax2_0_0 * 32 + ax0_ax1_fused % 32)
+                                T.reads(A[v0, v1])
+                                T.writes(A_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "double_buffer_scope":0, "meta_schedule.cooperative_fetch":4, "tir.manifest_shared_memory_local_stage":1})
+                                A_reindex_shared[v0, v1] = A[v0, v1]
+                        for ax0_ax1_fused in T.serial(1024):
+                            with T.block("B_reindex_shared"):
+                                v0 = T.axis.spatial(128, ax2_0_0 * 32 + ax0_ax1_fused // 32)
+                                v1 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused % 4 * 32 + ax0_ax1_fused % 32)
+                                T.reads(B[v0, v1])
+                                T.writes(B_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "double_buffer_scope":0, "meta_schedule.cooperative_fetch":2, "tir.manifest_shared_memory_local_stage":1})
+                                B_reindex_shared[v0, v1] = B[v0, v1]
+                        for ax2_0_1 in T.serial(2, annotations={"software_pipeline_order":[0, 1, 2], "software_pipeline_stage":[0, 0, 1]}):
+                            for ax0_0, ax1_0 in T.grid(2, 1):
+                                with T.block("A_reindex_shared_wmma.matrix_a_o"):
+                                    v0_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused // 4 * 2 + ax0_0)
+                                    v1_o = T.axis.spatial(8, ax2_0_0 * 2 + ax2_0_1)
+                                    T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("A_reindex_shared_wmma.matrix_a"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0, ax1_0 in T.grid(1, 2):
+                                with T.block("B_reindex_shared_wmma.matrix_b_o"):
+                                    v0_o = T.axis.spatial(8, ax2_0_0 * 2 + ax2_0_1)
+                                    v1_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused % 4 * 2 + ax1_0)
+                                    T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("B_reindex_shared_wmma.matrix_b"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 1, 1, 2, 2):
+                                with T.block("C_o"):
+                                    v0_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused // 4 * 2 + ax0_0_3 * 2 + ax0_0_4)
+                                    v1_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused % 4 * 2 + ax1_0_3 * 2 + ax1_0_4)
+                                    v2_o = T.axis.reduce(8, ax2_0_0 * 2 + ax2_0_1 + ax2_0_2)
+                                    T.reads(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
+                                    with T.init():
+                                        for ax0_1, ax1_1 in T.grid(16, 16):
+                                            with T.block("C_init"):
+                                                v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1])
+                                                T.reads()
+                                                T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init])
+                                                C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0)
+                                    for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16):
+                                        with T.block("C"):
+                                            v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1])
+                                            T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i])
+                                            T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                            C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32")
+                    for ax0_0, ax1_0 in T.grid(2, 2):
+                        with T.block("C_reindex_shared_wmma.accumulator_o"):
+                            v0_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused // 4 * 2 + ax0_0)
+                            v1_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused % 4 * 2 + ax1_0)
+                            T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.writes(C_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"})
+                            for ax0_1, ax1_1 in T.grid(16, 16):
+                                with T.block("C_reindex_shared_wmma.accumulator"):
+                                    v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                    T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                for ax0, ax1 in T.grid(32, 32):
+                    with T.block("C_reindex_shared"):
+                        v0 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused // 4 * 32 + ax0)
+                        v1 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused % 4 * 32 + ax1)
+                        T.reads(C_reindex_shared[v0, v1])
+                        T.writes(C[v0, v1])
+                        T.block_attr({"meta_schedule.cooperative_fetch":3})
+                        C[v0, v1] = C_reindex_shared[v0, v1]
+        for i0, i1 in T.grid(128, 128):
+            with T.block("compute"):
+                i0_1, i1_1 = T.axis.remap("SS", [i0, i1])
+                T.reads(C[i0_1, i1_1])
+                T.writes(compute[i0_1, i1_1])
+                compute[i0_1, i1_1] = T.max(C[i0_1, i1_1], T.float32(0))
+    # fmt: on
+    decision_0 = [
+        ("SamplePerfectTile", [1, 4, 1, 1, 2]),
+        ("SamplePerfectTile", [1, 4, 1, 1, 2]),
+        ("SamplePerfectTile", [4, 2, 1]),
+        ("SampleCategorical", 2),
+        ("SampleCategorical", 2),
+        ("SampleCategorical", 1),
+    ]
+    mod = te.create_prim_func(
+        te_workload.matmul_relu(
+            n=128,
+            m=128,
+            k=128,
+            in_dtype="float16",
+            out_dtype="float32",
+        )
+    )
+    actual = ms.TuneContext(
+        mod=mod,
+        target=tvm.target.Target("cuda"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            multi_level_tiling_tensor_core(
+                use_software_pipeline=True,
+            ),
+        ],
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[matmul_relu_pipeline_0],
+        expected_decisions=[decision_0],
+    )
+
+
+def test_matmul_relu_global():
+    # fmt: off
+    @T.prim_func
+    def matmul_relu_global_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "float16"], compute: T.Buffer[(128, 128), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C = T.alloc_buffer([128, 128], dtype="float32")
+        C_reindex_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator")
+        A_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared")
+        B_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared")
+        A_reindex_shared_wmma_matrix_a = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_a")
+        B_reindex_shared_wmma_matrix_b = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_b")
+        for ax0_0_0_ax1_0_0_fused in T.thread_binding(1, thread="blockIdx.y"):
+            for ax0_0_1_ax1_0_1_fused in T.thread_binding(1, thread="blockIdx.x"):
+                for ax0_0_2_ax1_0_2_fused in T.thread_binding(16, thread="threadIdx.y"):
+                    for ax2_0_0 in T.serial(2):
+                        for ax0_ax1_fused in T.serial(8192):
+                            with T.block("A_reindex_shared"):
+                                v0 = T.axis.spatial(128, ax0_ax1_fused // 64)
+                                v1 = T.axis.spatial(128, ax2_0_0 * 64 + ax0_ax1_fused % 64)
+                                T.reads(A[v0, v1])
+                                T.writes(A_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":1})
+                                A_reindex_shared[v0, v1] = A[v0, v1]
+                        for ax0_ax1_fused in T.serial(8192):
+                            with T.block("B_reindex_shared"):
+                                v0 = T.axis.spatial(128, ax2_0_0 * 64 + ax0_ax1_fused // 128)
+                                v1 = T.axis.spatial(128, ax0_ax1_fused % 128)
+                                T.reads(B[v0, v1])
+                                T.writes(B_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":1})
+                                B_reindex_shared[v0, v1] = B[v0, v1]
+                        for ax2_0_1 in T.serial(2):
+                            for ax0_0, ax1_0 in T.grid(1, 2):
+                                with T.block("A_reindex_shared_wmma.matrix_a_o"):
+                                    v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2)
+                                    v1_o = T.axis.spatial(8, ax2_0_0 * 4 + ax2_0_1 * 2 + ax1_0)
+                                    T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("A_reindex_shared_wmma.matrix_a"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0, ax1_0 in T.grid(2, 4):
+                                with T.block("B_reindex_shared_wmma.matrix_b_o"):
+                                    v0_o = T.axis.spatial(8, ax2_0_0 * 4 + ax2_0_1 * 2 + ax0_0)
+                                    v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused % 2 * 4 + ax1_0)
+                                    T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("B_reindex_shared_wmma.matrix_b"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 4, 2, 1, 1):
+                                with T.block("C_o"):
+                                    v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2 + ax0_0_3 + ax0_0_4)
+                                    v1_o = T.axis.spatial(8, ax1_0_4 + ax0_0_2_ax1_0_2_fused % 2 * 4 + ax1_0_3)
+                                    v2_o = T.axis.reduce(8, ax2_0_0 * 4 + ax2_0_1 * 2 + ax2_0_2)
+                                    T.reads(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(C_reindex_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
+                                    with T.init():
+                                        for ax0_1, ax1_1 in T.grid(16, 16):
+                                            with T.block("C_init"):
+                                                v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1])
+                                                T.reads()
+                                                T.writes(C_reindex_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init])
+                                                C_reindex_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0)
+                                    for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16):
+                                        with T.block("C"):
+                                            v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1])
+                                            T.reads(C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i])
+                                            T.writes(C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                            C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32")
+                    for ax0_0, ax1_0 in T.grid(1, 4):
+                        with T.block("C_reindex_wmma.accumulator_o"):
+                            v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2)
+                            v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused % 2 * 4 + ax1_0)
+                            T.reads(C_reindex_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.writes(C[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_global"})
+                            for ax0_1, ax1_1 in T.grid(16, 16):
+                                with T.block("C_reindex_wmma.accumulator"):
+                                    v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                    T.reads(C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    T.writes(C[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    C[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+        for i0, i1 in T.grid(128, 128):
+            with T.block("compute"):
+                i0_1, i1_1 = T.axis.remap("SS", [i0, i1])
+                T.reads(C[i0_1, i1_1])
+                T.writes(compute[i0_1, i1_1])
+                compute[i0_1, i1_1] = T.max(C[i0_1, i1_1], T.float32(0))
+    # fmt: on
+    decision_0 = [
+        ("SamplePerfectTile", [1, 1, 8, 1, 1]),
+        ("SamplePerfectTile", [1, 1, 2, 4, 1]),
+        ("SamplePerfectTile", [2, 2, 2]),
+        ("SampleCategorical", 0),
+        ("SampleCategorical", 0),
+    ]
+    mod = te.create_prim_func(
+        te_workload.matmul_relu(
+            n=128,
+            m=128,
+            k=128,
+            in_dtype="float16",
+            out_dtype="float32",
+        )
+    )
+    actual = ms.TuneContext(
+        mod=mod,
+        target=tvm.target.Target("cuda"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="global")]
+        + get_rules("cuda", ms.schedule_rule.AutoInline),
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[matmul_relu_global_0],
+        expected_decisions=[decision_0],
+    )
+
+
+def test_matmul_relu_non_tensorizable():
+    # expected to do nothing on non-tensorizable workloads
+    mod = te.create_prim_func(
+        te_workload.matmul_relu(  # dtype doesn't match tensor intrin
+            n=128,
+            m=128,
+            k=128,
+        )
+    )
+    (sch,) = ms.TuneContext(
+        mod=mod,
+        target=tvm.target.Target("cuda"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="global")]
+        + get_rules("cuda", ms.schedule_rule.AutoInline),
+    ).generate_design_space()
+    tvm.ir.assert_structural_equal(mod, sch.mod["main"])
+
+
+if __name__ == "__main__":
+    test_matmul_relu()
+    test_matmul_relu_with_fallback()
+    test_conv2d()
+    test_conv2d_more_intrin()
+    test_matmul_relu_pipeline()
+    test_matmul_relu_global()
+    test_matmul_relu_non_tensorizable()
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py b/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py
deleted file mode 100644
index fe1220c50925..000000000000
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py
+++ /dev/null
@@ -1,1205 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.meta_schedule import schedule_rule
-from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
-from tvm.meta_schedule.testing import te_workload
-from tvm.meta_schedule.testing.schedule_rule import (
-    auto_inline,
-    multi_level_tiling,
-    multi_level_tiling_tensor_core,
-)
-from tvm.meta_schedule.testing.space_generation import check_trace
-from tvm.meta_schedule.tune_context import TuneContext
-from tvm.script import tir as T
-from tvm.target import Target
-from tvm.te import create_prim_func
-from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN
-from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
-
-
-def _create_context(mod, target, rule) -> TuneContext:
-    if not isinstance(rule, (list, tuple)):
-        rule = [rule]
-    ctx = TuneContext(
-        mod=mod,
-        target=target,
-        space_generator=PostOrderApply(),
-        sch_rules=rule,
-        task_name="test",
-    )
-    return ctx
-
-
-def test_cpu_matmul():
-    expected = [
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")',
-            "l1, l2, l3 = sch.get_loops(block=b0)",
-            "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)",
-            "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)",
-            "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)",
-            "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)",
-            "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)",
-            "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)",
-            "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)",
-            'b24 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="global")',
-            "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True, index=-1)",
-        ],
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")',
-            "l1, l2, l3 = sch.get_loops(block=b0)",
-            "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)",
-            "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)",
-            "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)",
-            "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)",
-            "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)",
-            "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)",
-            "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)",
-            'b24 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="global")',
-            "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True, index=-1)",
-        ],
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")',
-            "l1, l2, l3 = sch.get_loops(block=b0)",
-            "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)",
-            "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)",
-            "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)",
-            "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)",
-            "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)",
-            "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)",
-            "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)",
-        ],
-    ]
-    target = Target("llvm")
-    ctx = _create_context(
-        create_prim_func(
-            te_workload.matmul(
-                n=512,
-                m=512,
-                k=512,
-            )
-        ),
-        target=target,
-        rule=multi_level_tiling(target=target),
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 3
-    check_trace(spaces, expected)
-
-
-def test_cpu_matmul_relu():
-    # pylint: disable=line-too-long
-    expected = [
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")',
-            "l1, l2, l3 = sch.get_loops(block=b0)",
-            "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)",
-            "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)",
-            "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)",
-            "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)",
-            "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)",
-            "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)",
-            "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)",
-            "b24, = sch.get_consumers(block=b0)",
-            "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True, index=-1)",
-        ],
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")',
-            "l1, l2, l3 = sch.get_loops(block=b0)",
-            "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)",
-            "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)",
-            "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)",
-            "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)",
-            "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)",
-            "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)",
-            "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)",
-            "b24, = sch.get_consumers(block=b0)",
-            "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True, index=-1)",
-        ],
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")',
-            "l1, l2, l3 = sch.get_loops(block=b0)",
-            "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)",
-            "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)",
-            "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)",
-            "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)",
-            "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)",
-            "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)",
-            "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)",
-        ],
-    ]
-    # pylint: enable=line-too-long
-    target = Target("llvm")
-    ctx = _create_context(
-        create_prim_func(
-            te_workload.matmul_relu(
-                n=512,
-                m=512,
-                k=512,
-            )
-        ),
-        target=target,
-        rule=multi_level_tiling(target=target),
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 3
-    check_trace(spaces, expected)
-
-
-def test_cuda_matmul():
-    # pylint: disable=line-too-long
-    expected = [
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")',
-            "l1, l2, l3 = sch.get_loops(block=b0)",
-            "v4, v5, v6, v7, v8 = sch.sample_perfect_tile(loop=l1, n=5, max_innermost_factor=64)",
-            "l9, l10, l11, l12, l13 = sch.split(loop=l1, factors=[v4, v5, v6, v7, v8], preserve_unit_iters=True)",
-            "v14, v15, v16, v17, v18 = sch.sample_perfect_tile(loop=l2, n=5, max_innermost_factor=64)",
-            "l19, l20, l21, l22, l23 = sch.split(loop=l2, factors=[v14, v15, v16, v17, v18], preserve_unit_iters=True)",
-            "v24, v25, v26 = sch.sample_perfect_tile(loop=l3, n=3, max_innermost_factor=64)",
-            "l27, l28, l29 = sch.split(loop=l3, factors=[v24, v25, v26], preserve_unit_iters=True)",
-            "sch.reorder(l9, l19, l10, l20, l11, l21, l27, l28, l12, l22, l29, l13, l23)",
-            "l30 = sch.fuse(l9, l19, preserve_unit_iters=True)",
-            'sch.bind(loop=l30, thread_axis="blockIdx.x")',
-            "l31 = sch.fuse(l10, l20, preserve_unit_iters=True)",
-            'sch.bind(loop=l31, thread_axis="vthread.x")',
-            "l32 = sch.fuse(l11, l21, preserve_unit_iters=True)",
-            'sch.bind(loop=l32, thread_axis="threadIdx.x")',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32)',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024)',
-            'b33 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")',
-            "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True, index=-1)",
-            'b34 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared")',
-            "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True, index=-1)",
-            "l35, l36, l37, l38, l39, l40 = sch.get_loops(block=b34)",
-            "l41 = sch.fuse(l39, l40, preserve_unit_iters=True)",
-            "v42 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])",
-            'sch.annotate(block_or_loop=b34, ann_key="meta_schedule.cooperative_fetch", ann_val=v42)',
-            'b43 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared")',
-            "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True, index=-1)",
-            "l44, l45, l46, l47, l48, l49 = sch.get_loops(block=b43)",
-            "l50 = sch.fuse(l48, l49, preserve_unit_iters=True)",
-            "v51 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])",
-            'sch.annotate(block_or_loop=b43, ann_key="meta_schedule.cooperative_fetch", ann_val=v51)',
-        ]
-    ]
-    # pylint: enable=line-too-long
-    target = Target("cuda --max_threads_per_block=1024 --thread_warp_size=32", host="llvm")
-    ctx = _create_context(
-        create_prim_func(
-            te_workload.matmul(
-                n=512,
-                m=512,
-                k=512,
-            )
-        ),
-        target=target,
-        rule=multi_level_tiling(target=target),
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-    check_trace(spaces, expected)
-
-
-def test_cuda_matmul_relu():
-    # pylint: disable=line-too-long
-    expected = [
-        [
-            'b0 = sch.get_block(name="C", func_name="main")',
-            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")',
-            "l1, l2, l3 = sch.get_loops(block=b0)",
-            "v4, v5, v6, v7, v8 = sch.sample_perfect_tile(loop=l1, n=5, max_innermost_factor=64)",
-            "l9, l10, l11, l12, l13 = sch.split(loop=l1, factors=[v4, v5, v6, v7, v8], preserve_unit_iters=True)",
-            "v14, v15, v16, v17, v18 = sch.sample_perfect_tile(loop=l2, n=5, max_innermost_factor=64)",
-            "l19, l20, l21, l22, l23 = sch.split(loop=l2, factors=[v14, v15, v16, v17, v18], preserve_unit_iters=True)",
-            "v24, v25, v26 = sch.sample_perfect_tile(loop=l3, n=3, max_innermost_factor=64)",
-            "l27, l28, l29 = sch.split(loop=l3, factors=[v24, v25, v26], preserve_unit_iters=True)",
-            "sch.reorder(l9, l19, l10, l20, l11, l21, l27, l28, l12, l22, l29, l13, l23)",
-            "l30 = sch.fuse(l9, l19, preserve_unit_iters=True)",
-            'sch.bind(loop=l30, thread_axis="blockIdx.x")',
-            "l31 = sch.fuse(l10, l20, preserve_unit_iters=True)",
-            'sch.bind(loop=l31, thread_axis="vthread.x")',
-            "l32 = sch.fuse(l11, l21, preserve_unit_iters=True)",
-            'sch.bind(loop=l32, thread_axis="threadIdx.x")',
-            'b33 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")',
-            "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True, index=-1)",
-            'b34 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared")',
-            "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True, index=-1)",
-            "l35, l36, l37, l38, l39, l40 = sch.get_loops(block=b34)",
-            "l41 = sch.fuse(l39, l40, preserve_unit_iters=True)",
-            "v42 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])",
-            'sch.annotate(block_or_loop=b34, ann_key="meta_schedule.cooperative_fetch", ann_val=v42)',
-            'b43 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared")',
-            "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True, index=-1)",
-            "l44, l45, l46, l47, l48, l49 = sch.get_loops(block=b43)",
-            "l50 = sch.fuse(l48, l49, preserve_unit_iters=True)",
-            "v51 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])",
-            'sch.annotate(block_or_loop=b43, ann_key="meta_schedule.cooperative_fetch", ann_val=v51)',
-        ]
-    ]
-    # pylint: enable=line-too-long
-    target = Target("cuda", host="llvm")
-    ctx = _create_context(
-        create_prim_func(
-            te_workload.matmul_relu(
-                n=512,
-                m=512,
-                k=512,
-            )
-        ),
-        target=target,
-        rule=multi_level_tiling(target=target),
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-    check_trace(spaces, expected)
-
-
-def test_cuda_sum_with_trivial_block_iter():
-    @T.prim_func
-    def sum_with_trivial_block_iter(
-        A: T.Buffer[(1, 64, 768), "float32"], B: T.Buffer[(1, 64, 1), "float32"]
-    ) -> None:
-        for i0, i1, i2, i3 in T.grid(1, 64, 1, 768):
-            with T.block("sum"):
-                ax0, ax1, ax2, k2 = T.axis.remap("SSSR", [i0, i1, i2, i3])
-                T.reads(A[ax0, ax1, k2])
-                T.writes(B[ax0, ax1, ax2])
-                with T.init():
-                    B[ax0, ax1, ax2] = T.float32(0)
-                B[ax0, ax1, ax2] = B[ax0, ax1, ax2] + A[ax0, ax1, k2]
-
-    # Expect nothing to happen - the rule is not supposed to be applied in this case
-    expected = [[]]
-    target = Target("cuda", host="llvm")
-    ctx = _create_context(
-        sum_with_trivial_block_iter,
-        target=target,
-        rule=multi_level_tiling(target=target),
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-    check_trace(spaces, expected)
-
-
-@tvm.script.ir_module
-class Conv2dNCHWcVNNIModule:
-    @T.prim_func
-    def main(
-        placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"],
-        placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"],
-        conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"],
-    ) -> None:
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4):
-            with T.block("conv2d_NCHWc_int8"):
-                (
-                    n,
-                    oc_chunk,
-                    oh,
-                    ow,
-                    oc_block,
-                    kh,
-                    kw,
-                    ic_outer,
-                    ic_f_inner,
-                    ic_s_inner,
-                ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9])
-                T.reads(
-                    placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner],
-                    placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner],
-                )
-                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block])
-                with T.init():
-                    conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0
-                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[
-                    n, oc_chunk, oh, ow, oc_block
-                ] + T.cast(
-                    placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32"
-                ) * T.cast(
-                    placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner],
-                    "int32",
-                )
-
-
-def test_multi_level_tiling_conv2d_nchwc_vnni():
-    target = "llvm -mcpu=cascadelake -num-cores 4"
-    ctx = _create_context(
-        Conv2dNCHWcVNNIModule,
-        target=tvm.target.Target(target),
-        rule=schedule_rule.MultiLevelTilingWithIntrin(
-            VNNI_INTRIN,
-            structure="SSRSRS",
-            tile_binds=None,
-            max_innermost_factor=64,
-            vector_load_lens=None,
-            reuse_read=None,
-            reuse_write=schedule_rule.ReuseType(
-                req="may",
-                levels=[1, 2],
-                scope="global",
-            ),
-        ),
-    )
-
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-
-    expected = [
-        """b0 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main")
-sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")
-l1, l2, l3, l4, l5, l6, l7, l8, l9, l10 = sch.get_loops(block=b0)
-l11, l12 = sch.split(loop=l10, factors=[None, 4], preserve_unit_iters=True)
-l13, l14 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
-l15, l16, l17, l18, l19, l20, l21, l22, l23, l24, l25, l26 = sch.get_loops(block=b0)
-sch.reorder(l21, l22, l23, l24, l25, l14, l12)
-b27 = sch.blockize(loop=l14)
-sch.annotate(block_or_loop=b27, ann_key="meta_schedule.auto_tensorize", ann_val="dot_16x4_vnni")
-l28, l29, l30, l31, l32, l33, l34, l35, l36, l37 = sch.get_loops(block=b27)
-v38, v39, v40, v41 = sch.sample_perfect_tile(loop=l28, n=4, max_innermost_factor=64)
-l42, l43, l44, l45 = sch.split(loop=l28, factors=[v38, v39, v40, v41], preserve_unit_iters=True)
-v46, v47, v48, v49 = sch.sample_perfect_tile(loop=l29, n=4, max_innermost_factor=64)
-l50, l51, l52, l53 = sch.split(loop=l29, factors=[v46, v47, v48, v49], preserve_unit_iters=True)
-v54, v55, v56, v57 = sch.sample_perfect_tile(loop=l30, n=4, max_innermost_factor=64)
-l58, l59, l60, l61 = sch.split(loop=l30, factors=[v54, v55, v56, v57], preserve_unit_iters=True)
-v62, v63, v64, v65 = sch.sample_perfect_tile(loop=l31, n=4, max_innermost_factor=64)
-l66, l67, l68, l69 = sch.split(loop=l31, factors=[v62, v63, v64, v65], preserve_unit_iters=True)
-v70, v71, v72, v73 = sch.sample_perfect_tile(loop=l32, n=4, max_innermost_factor=64)
-l74, l75, l76, l77 = sch.split(loop=l32, factors=[v70, v71, v72, v73], preserve_unit_iters=True)
-v78, v79 = sch.sample_perfect_tile(loop=l33, n=2, max_innermost_factor=64)
-l80, l81 = sch.split(loop=l33, factors=[v78, v79], preserve_unit_iters=True)
-v82, v83 = sch.sample_perfect_tile(loop=l34, n=2, max_innermost_factor=64)
-l84, l85 = sch.split(loop=l34, factors=[v82, v83], preserve_unit_iters=True)
-v86, v87 = sch.sample_perfect_tile(loop=l35, n=2, max_innermost_factor=64)
-l88, l89 = sch.split(loop=l35, factors=[v86, v87], preserve_unit_iters=True)
-v90, v91 = sch.sample_perfect_tile(loop=l36, n=2, max_innermost_factor=64)
-l92, l93 = sch.split(loop=l36, factors=[v90, v91], preserve_unit_iters=True)
-v94, v95 = sch.sample_perfect_tile(loop=l37, n=2, max_innermost_factor=64)
-l96, l97 = sch.split(loop=l37, factors=[v94, v95], preserve_unit_iters=True)
-sch.reorder(l42, l50, l58, l66, l74, l43, l51, l59, l67, l75, l80, l84, l88, l92, l96, l44, l52, l60, l68, l76, l81, l85, l89, l93, l97, l45, l53, l61, l69, l77)
-b98 = sch.cache_write(block=b27, write_buffer_index=0, storage_scope="global")
-sch.reverse_compute_at(block=b98, loop=l75, preserve_unit_loops=True, index=-1)""".split(
-            "\n"
-        ),
-        """b0 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main")
-sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")
-l1, l2, l3, l4, l5, l6, l7, l8, l9, l10 = sch.get_loops(block=b0)
-l11, l12 = sch.split(loop=l10, factors=[None, 4], preserve_unit_iters=True)
-l13, l14 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
-l15, l16, l17, l18, l19, l20, l21, l22, l23, l24, l25, l26 = sch.get_loops(block=b0)
-sch.reorder(l21, l22, l23, l24, l25, l14, l12)
-b27 = sch.blockize(loop=l14)
-sch.annotate(block_or_loop=b27, ann_key="meta_schedule.auto_tensorize", ann_val="dot_16x4_vnni")
-l28, l29, l30, l31, l32, l33, l34, l35, l36, l37 = sch.get_loops(block=b27)
-v38, v39, v40, v41 = sch.sample_perfect_tile(loop=l28, n=4, max_innermost_factor=64)
-l42, l43, l44, l45 = sch.split(loop=l28, factors=[v38, v39, v40, v41], preserve_unit_iters=True)
-v46, v47, v48, v49 = sch.sample_perfect_tile(loop=l29, n=4, max_innermost_factor=64)
-l50, l51, l52, l53 = sch.split(loop=l29, factors=[v46, v47, v48, v49], preserve_unit_iters=True)
-v54, v55, v56, v57 = sch.sample_perfect_tile(loop=l30, n=4, max_innermost_factor=64)
-l58, l59, l60, l61 = sch.split(loop=l30, factors=[v54, v55, v56, v57], preserve_unit_iters=True)
-v62, v63, v64, v65 = sch.sample_perfect_tile(loop=l31, n=4, max_innermost_factor=64)
-l66, l67, l68, l69 = sch.split(loop=l31, factors=[v62, v63, v64, v65], preserve_unit_iters=True)
-v70, v71, v72, v73 = sch.sample_perfect_tile(loop=l32, n=4, max_innermost_factor=64)
-l74, l75, l76, l77 = sch.split(loop=l32, factors=[v70, v71, v72, v73], preserve_unit_iters=True)
-v78, v79 = sch.sample_perfect_tile(loop=l33, n=2, max_innermost_factor=64)
-l80, l81 = sch.split(loop=l33, factors=[v78, v79], preserve_unit_iters=True)
-v82, v83 = sch.sample_perfect_tile(loop=l34, n=2, max_innermost_factor=64)
-l84, l85 = sch.split(loop=l34, factors=[v82, v83], preserve_unit_iters=True)
-v86, v87 = sch.sample_perfect_tile(loop=l35, n=2, max_innermost_factor=64)
-l88, l89 = sch.split(loop=l35, factors=[v86, v87], preserve_unit_iters=True)
-v90, v91 = sch.sample_perfect_tile(loop=l36, n=2, max_innermost_factor=64)
-l92, l93 = sch.split(loop=l36, factors=[v90, v91], preserve_unit_iters=True)
-v94, v95 = sch.sample_perfect_tile(loop=l37, n=2, max_innermost_factor=64)
-l96, l97 = sch.split(loop=l37, factors=[v94, v95], preserve_unit_iters=True)
-sch.reorder(l42, l50, l58, l66, l74, l43, l51, l59, l67, l75, l80, l84, l88, l92, l96, l44, l52, l60, l68, l76, l81, l85, l89, l93, l97, l45, l53, l61, l69, l77)
-b98 = sch.cache_write(block=b27, write_buffer_index=0, storage_scope="global")
-sch.reverse_compute_at(block=b98, loop=l74, preserve_unit_loops=True, index=-1)""".split(
-            "\n"
-        ),
-        """b0 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main")
-sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")
-l1, l2, l3, l4, l5, l6, l7, l8, l9, l10 = sch.get_loops(block=b0)
-l11, l12 = sch.split(loop=l10, factors=[None, 4], preserve_unit_iters=True)
-l13, l14 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
-l15, l16, l17, l18, l19, l20, l21, l22, l23, l24, l25, l26 = sch.get_loops(block=b0)
-sch.reorder(l21, l22, l23, l24, l25, l14, l12)
-b27 = sch.blockize(loop=l14)
-sch.annotate(block_or_loop=b27, ann_key="meta_schedule.auto_tensorize", ann_val="dot_16x4_vnni")
-l28, l29, l30, l31, l32, l33, l34, l35, l36, l37 = sch.get_loops(block=b27)
-v38, v39, v40, v41 = sch.sample_perfect_tile(loop=l28, n=4, max_innermost_factor=64)
-l42, l43, l44, l45 = sch.split(loop=l28, factors=[v38, v39, v40, v41], preserve_unit_iters=True)
-v46, v47, v48, v49 = sch.sample_perfect_tile(loop=l29, n=4, max_innermost_factor=64)
-l50, l51, l52, l53 = sch.split(loop=l29, factors=[v46, v47, v48, v49], preserve_unit_iters=True)
-v54, v55, v56, v57 = sch.sample_perfect_tile(loop=l30, n=4, max_innermost_factor=64)
-l58, l59, l60, l61 = sch.split(loop=l30, factors=[v54, v55, v56, v57], preserve_unit_iters=True)
-v62, v63, v64, v65 = sch.sample_perfect_tile(loop=l31, n=4, max_innermost_factor=64)
-l66, l67, l68, l69 = sch.split(loop=l31, factors=[v62, v63, v64, v65], preserve_unit_iters=True)
-v70, v71, v72, v73 = sch.sample_perfect_tile(loop=l32, n=4, max_innermost_factor=64)
-l74, l75, l76, l77 = sch.split(loop=l32, factors=[v70, v71, v72, v73], preserve_unit_iters=True)
-v78, v79 = sch.sample_perfect_tile(loop=l33, n=2, max_innermost_factor=64)
-l80, l81 = sch.split(loop=l33, factors=[v78, v79], preserve_unit_iters=True)
-v82, v83 = sch.sample_perfect_tile(loop=l34, n=2, max_innermost_factor=64)
-l84, l85 = sch.split(loop=l34, factors=[v82, v83], preserve_unit_iters=True)
-v86, v87 = sch.sample_perfect_tile(loop=l35, n=2, max_innermost_factor=64)
-l88, l89 = sch.split(loop=l35, factors=[v86, v87], preserve_unit_iters=True)
-v90, v91 = sch.sample_perfect_tile(loop=l36, n=2, max_innermost_factor=64)
-l92, l93 = sch.split(loop=l36, factors=[v90, v91], preserve_unit_iters=True)
-v94, v95 = sch.sample_perfect_tile(loop=l37, n=2, max_innermost_factor=64)
-l96, l97 = sch.split(loop=l37, factors=[v94, v95], preserve_unit_iters=True)
-sch.reorder(l42, l50, l58, l66, l74, l43, l51, l59, l67, l75, l80, l84, l88, l92, l96, l44, l52, l60, l68, l76, l81, l85, l89, l93, l97, l45, l53, l61, l69, l77)""".split(
-            "\n"
-        ),
-    ]
-
-    check_trace(spaces, expected)
-
-
-def _test_multi_level_tiling_dense_dp4a(m, n, k, in_dtype, out_dtype, expected):
-    X = te.placeholder((m, k), name="X", dtype=in_dtype)
-    W = te.placeholder((n, k), name="W", dtype=in_dtype)
-    ak = te.reduce_axis((0, k), name="k")
-
-    matmul = te.compute(
-        (m, n),
-        lambda i, j: te.sum(
-            X[i, ak].astype(out_dtype) * W[j, ak].astype(out_dtype),
-            axis=ak,
-        ),
-        name="compute",
-    )
-
-    func = te.create_prim_func([X, W, matmul])
-
-    ctx = _create_context(
-        func,
-        target=tvm.target.Target("cuda"),
-        rule=schedule_rule.MultiLevelTilingWithIntrin(
-            DP4A_INTRIN,
-            structure="SSSRRSRS",
-            tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"],
-            max_innermost_factor=64,
-            vector_load_lens=[1, 2, 3, 4],
-            reuse_read=schedule_rule.ReuseType(
-                req="must",
-                levels=[4],
-                scope="shared",
-            ),
-            reuse_write=schedule_rule.ReuseType(
-                req="must",
-                levels=[3],
-                scope="local",
-            ),
-        ),
-    )
-
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    check_trace(spaces, expected)
-
-
-def test_multi_level_tiling_dense_dp4a():
-    m, n, k = 128, 128, 128
-
-    expected = [
-        """b0 = sch.get_block(name="compute", func_name="main")
-sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
-l1, l2, l3 = sch.get_loops(block=b0)
-l4, l5 = sch.split(loop=l3, factors=[None, 4], preserve_unit_iters=True)
-sch.reorder(l5)
-b6 = sch.blockize(loop=l5)
-sch.annotate(block_or_loop=b6, ann_key="meta_schedule.auto_tensorize", ann_val="dp4a")
-l7, l8, l9 = sch.get_loops(block=b6)
-v10, v11, v12, v13, v14 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64)
-l15, l16, l17, l18, l19 = sch.split(loop=l7, factors=[v10, v11, v12, v13, v14], preserve_unit_iters=True)
-v20, v21, v22, v23, v24 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64)
-l25, l26, l27, l28, l29 = sch.split(loop=l8, factors=[v20, v21, v22, v23, v24], preserve_unit_iters=True)
-v30, v31, v32 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64)
-l33, l34, l35 = sch.split(loop=l9, factors=[v30, v31, v32], preserve_unit_iters=True)
-sch.reorder(l15, l25, l16, l26, l17, l27, l33, l34, l18, l28, l35, l19, l29)
-l36 = sch.fuse(l15, l25, preserve_unit_iters=True)
-sch.bind(loop=l36, thread_axis="blockIdx.x")
-l37 = sch.fuse(l16, l26, preserve_unit_iters=True)
-sch.bind(loop=l37, thread_axis="vthread.x")
-l38 = sch.fuse(l17, l27, preserve_unit_iters=True)
-sch.bind(loop=l38, thread_axis="threadIdx.x")
-b39 = sch.cache_write(block=b6, write_buffer_index=0, storage_scope="local")
-sch.reverse_compute_at(block=b39, loop=l38, preserve_unit_loops=True, index=-1)
-b40 = sch.cache_read(block=b6, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b40, loop=l33, preserve_unit_loops=True, index=-1)
-l41, l42, l43, l44, l45, l46 = sch.get_loops(block=b40)
-l47 = sch.fuse(l45, l46, preserve_unit_iters=True)
-v48 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b40, ann_key="meta_schedule.cooperative_fetch", ann_val=v48)
-b49 = sch.cache_read(block=b6, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b49, loop=l33, preserve_unit_loops=True, index=-1)
-l50, l51, l52, l53, l54, l55 = sch.get_loops(block=b49)
-l56 = sch.fuse(l54, l55, preserve_unit_iters=True)
-v57 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b49, ann_key="meta_schedule.cooperative_fetch", ann_val=v57)""".split(
-            "\n"
-        )
-    ]
-
-    _test_multi_level_tiling_dense_dp4a(m, n, k, "int8", "int32", expected)
-
-
-def test_multi_level_tiling_dense_dp4a_non_tensorizable():
-    _test_multi_level_tiling_dense_dp4a(128, 128, 128, "float32", "float32", [""])
-    _test_multi_level_tiling_dense_dp4a(127, 127, 127, "int8", "int32", [""])
-
-
-def test_cuda_tensor_core_matmul_relu():
-    m = n = k = 128
-    target = Target("cuda", host="llvm")
-    ctx = _create_context(
-        create_prim_func(
-            te_workload.matmul_relu(
-                n=n,
-                m=m,
-                k=k,
-                in_dtype="float16",
-                out_dtype="float32",
-            )
-        ),
-        target=target,
-        rule=[
-            multi_level_tiling_tensor_core(target=target, write_reuse_scope="shared"),
-            auto_inline(target),
-        ],
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-
-    expected = [
-        """b0 = sch.get_block(name="C", func_name="main")
-b1 = sch.get_block(name="compute", func_name="main")
-sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
-b2 = sch.reindex(block=b0, buffer=("write", 0))
-b3 = sch.reindex(block=b0, buffer=("read", 0))
-b4 = sch.reindex(block=b0, buffer=("read", 1))
-sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, ))
-sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (k, j, ))
-sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, ))
-sch.transform_block_layout(block=b2, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b4, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, ))
-l5, l6, l7 = sch.get_loops(block=b0)
-l8, l9 = sch.split(loop=l7, factors=[None, 16], preserve_unit_iters=True)
-l10, l11 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True)
-l12, l13 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
-l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0)
-sch.reorder(l16, l18, l13, l11, l9)
-b20 = sch.blockize(loop=l13)
-sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32")
-sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32")
-sch.annotate(block_or_loop=b20, ann_key="warp_execution", ann_val=1)
-l21, l22, l23 = sch.get_loops(block=b20)
-v24, v25, v26, v27, v28 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4)
-l29, l30, l31, l32, l33 = sch.split(loop=l21, factors=[v24, v25, v26, v27, v28], preserve_unit_iters=True)
-v34, v35, v36, v37, v38 = sch.sample_perfect_tile(loop=l22, n=5, max_innermost_factor=4)
-l39, l40, l41, l42, l43 = sch.split(loop=l22, factors=[v34, v35, v36, v37, v38], preserve_unit_iters=True)
-v44, v45, v46 = sch.sample_perfect_tile(loop=l23, n=3, max_innermost_factor=4)
-l47, l48, l49 = sch.split(loop=l23, factors=[v44, v45, v46], preserve_unit_iters=True)
-sch.reorder(l29, l39, l30, l40, l31, l41, l47, l48, l32, l42, l49, l33, l43)
-l50 = sch.fuse(l29, l39, preserve_unit_iters=True)
-sch.bind(loop=l50, thread_axis="blockIdx.y")
-l51 = sch.fuse(l30, l40, preserve_unit_iters=True)
-sch.bind(loop=l51, thread_axis="blockIdx.x")
-l52 = sch.fuse(l31, l41, preserve_unit_iters=True)
-sch.bind(loop=l52, thread_axis="threadIdx.y")
-b53 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="shared")
-sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True, index=-1)
-b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator")
-sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True, index=-1)
-v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55)
-sch.reverse_compute_inline(block=b2)
-l56, l57, l58, l59, l60 = sch.get_loops(block=b54)
-l61, l62 = sch.split(loop=l60, factors=[None, 16], preserve_unit_iters=True)
-l63, l64 = sch.split(loop=l59, factors=[None, 16], preserve_unit_iters=True)
-l65, l66, l67, l68, l69, l70, l71 = sch.get_loops(block=b54)
-sch.reorder(l70, l64, l62)
-b72 = sch.blockize(loop=l64)
-sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared")
-b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True, index=-1)
-l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73)
-l80 = sch.fuse(l78, l79, preserve_unit_iters=True)
-v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81)
-b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True, index=-1)
-l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82)
-l89 = sch.fuse(l87, l88, preserve_unit_iters=True)
-v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90)
-b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a")
-sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True, index=-1)
-l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91)
-l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True)
-l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True)
-l103, l104, l105, l106, l107, l108, l109, l110, l111 = sch.get_loops(block=b91)
-sch.reorder(l110, l102, l100)
-b112 = sch.blockize(loop=l102)
-sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
-b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b")
-sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True, index=-1)
-l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113)
-l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True)
-l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True)
-l125, l126, l127, l128, l129, l130, l131, l132, l133 = sch.get_loops(block=b113)
-sch.reorder(l132, l124, l122)
-b134 = sch.blockize(loop=l124)
-sch.annotate(block_or_loop=b134, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b")
-sch.compute_inline(block=b3)
-sch.compute_inline(block=b4)
-sch.storage_align(block=b73, buffer_index=0, axis=-2, factor=32, offset=8)
-sch.storage_align(block=b82, buffer_index=0, axis=-2, factor=32, offset=8)
-sch.reverse_compute_inline(block=b1)""".split(
-            "\n"
-        )
-    ]
-    check_trace(spaces, expected)
-
-    # test multi_level_tiling_tensor_core and multi_level_tiling can be used together in order
-    # to use multi_level_tiling as a fallback when the workload can't be tensorized
-    ctx = _create_context(
-        create_prim_func(
-            te_workload.matmul_relu(
-                n=n,
-                m=m,
-                k=k,
-                in_dtype="float16",
-                out_dtype="float32",
-            )
-        ),
-        target=target,
-        rule=[
-            multi_level_tiling_tensor_core(target=target, write_reuse_scope="shared"),
-            multi_level_tiling(target=target),
-            auto_inline(target),
-        ],
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-    check_trace(spaces, expected)
-
-
-def test_cuda_tensor_core_software_pipeline_matmul_relu():
-    m = n = k = 128
-    target = Target("cuda", host="llvm")
-    ctx = _create_context(
-        create_prim_func(
-            te_workload.matmul_relu(
-                n=n,
-                m=m,
-                k=k,
-                in_dtype="float16",
-                out_dtype="float32",
-            )
-        ),
-        target=target,
-        rule=[
-            multi_level_tiling_tensor_core(
-                target=target, write_reuse_scope="shared", use_software_pipeline=True
-            ),
-            auto_inline(target),
-        ],
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-
-    expected = [
-        """b0 = sch.get_block(name="C", func_name="main")
-b1 = sch.get_block(name="compute", func_name="main")
-sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
-b2 = sch.reindex(block=b0, buffer=("write", 0))
-b3 = sch.reindex(block=b0, buffer=("read", 0))
-b4 = sch.reindex(block=b0, buffer=("read", 1))
-sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, ))
-sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (k, j, ))
-sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, ))
-sch.transform_block_layout(block=b2, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b4, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, ))
-l5, l6, l7 = sch.get_loops(block=b0)
-l8, l9 = sch.split(loop=l7, factors=[None, 16], preserve_unit_iters=True)
-l10, l11 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True)
-l12, l13 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
-l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0)
-sch.reorder(l16, l18, l13, l11, l9)
-b20 = sch.blockize(loop=l13)
-sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32")
-sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32")
-sch.annotate(block_or_loop=b20, ann_key="warp_execution", ann_val=1)
-l21, l22, l23 = sch.get_loops(block=b20)
-v24, v25, v26, v27, v28 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4)
-l29, l30, l31, l32, l33 = sch.split(loop=l21, factors=[v24, v25, v26, v27, v28], preserve_unit_iters=True)
-v34, v35, v36, v37, v38 = sch.sample_perfect_tile(loop=l22, n=5, max_innermost_factor=4)
-l39, l40, l41, l42, l43 = sch.split(loop=l22, factors=[v34, v35, v36, v37, v38], preserve_unit_iters=True)
-v44, v45, v46 = sch.sample_perfect_tile(loop=l23, n=3, max_innermost_factor=4)
-l47, l48, l49 = sch.split(loop=l23, factors=[v44, v45, v46], preserve_unit_iters=True)
-sch.reorder(l29, l39, l30, l40, l31, l41, l47, l48, l32, l42, l49, l33, l43)
-l50 = sch.fuse(l29, l39, preserve_unit_iters=True)
-sch.bind(loop=l50, thread_axis="blockIdx.y")
-l51 = sch.fuse(l30, l40, preserve_unit_iters=True)
-sch.bind(loop=l51, thread_axis="blockIdx.x")
-l52 = sch.fuse(l31, l41, preserve_unit_iters=True)
-sch.bind(loop=l52, thread_axis="threadIdx.y")
-b53 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="shared")
-sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True, index=-1)
-b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator")
-sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True, index=-1)
-v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55)
-sch.reverse_compute_inline(block=b2)
-l56, l57, l58, l59, l60 = sch.get_loops(block=b54)
-l61, l62 = sch.split(loop=l60, factors=[None, 16], preserve_unit_iters=True)
-l63, l64 = sch.split(loop=l59, factors=[None, 16], preserve_unit_iters=True)
-l65, l66, l67, l68, l69, l70, l71 = sch.get_loops(block=b54)
-sch.reorder(l70, l64, l62)
-b72 = sch.blockize(loop=l64)
-sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared")
-b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True, index=-1)
-l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73)
-l80 = sch.fuse(l78, l79, preserve_unit_iters=True)
-v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81)
-b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True, index=-1)
-l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82)
-l89 = sch.fuse(l87, l88, preserve_unit_iters=True)
-v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90)
-b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a")
-sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True, index=-1)
-l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91)
-l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True)
-l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True)
-l103, l104, l105, l106, l107, l108, l109, l110, l111 = sch.get_loops(block=b91)
-sch.reorder(l110, l102, l100)
-b112 = sch.blockize(loop=l102)
-sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
-b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b")
-sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True, index=-1)
-l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113)
-l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True)
-l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True)
-l125, l126, l127, l128, l129, l130, l131, l132, l133 = sch.get_loops(block=b113)
-sch.reorder(l132, l124, l122)
-b134 = sch.blockize(loop=l124)
-sch.annotate(block_or_loop=b134, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b")
-sch.compute_inline(block=b3)
-sch.compute_inline(block=b4)
-sch.storage_align(block=b73, buffer_index=0, axis=-2, factor=32, offset=8)
-sch.storage_align(block=b82, buffer_index=0, axis=-2, factor=32, offset=8)
-sch.annotate(block_or_loop=b73, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1)
-sch.annotate(block_or_loop=b73, ann_key="double_buffer_scope", ann_val=0)
-sch.annotate(block_or_loop=b82, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1)
-sch.annotate(block_or_loop=b82, ann_key="double_buffer_scope", ann_val=0)
-sch.annotate(block_or_loop=l48, ann_key="software_pipeline_stage", ann_val=[0, 0, 1])
-sch.annotate(block_or_loop=l48, ann_key="software_pipeline_order", ann_val=[0, 1, 2])
-sch.annotate(block_or_loop=l47, ann_key="software_pipeline_stage", ann_val=[0, 0, 0, 0, 0, 1, 1])
-sch.annotate(block_or_loop=l47, ann_key="software_pipeline_order", ann_val=[0, 3, 1, 4, 5, 2, 6])
-sch.reverse_compute_inline(block=b1)""".split(
-            "\n"
-        )
-    ]
-    check_trace(spaces, expected)
-
-
-def test_cuda_tensor_core_matmul_relu_global():
-    m = n = k = 128
-    target = Target("cuda", host="llvm")
-    workload = create_prim_func(
-        te_workload.matmul_relu(
-            n=n,
-            m=m,
-            k=k,
-            in_dtype="float16",
-            out_dtype="float32",
-        ),
-    )
-    ctx = _create_context(
-        workload,
-        target=target,
-        rule=[
-            multi_level_tiling_tensor_core(target=target, write_reuse_scope="global"),
-            auto_inline(target),
-        ],
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-
-    expected = [
-        """b0 = sch.get_block(name="C", func_name="main")
-sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
-b1 = sch.reindex(block=b0, buffer=("write", 0))
-b2 = sch.reindex(block=b0, buffer=("read", 0))
-b3 = sch.reindex(block=b0, buffer=("read", 1))
-sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, ))
-sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (k, j, ))
-sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, ))
-sch.transform_block_layout(block=b1, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b2, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, ))
-l4, l5, l6 = sch.get_loops(block=b0)
-l7, l8 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True)
-l9, l10 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
-l11, l12 = sch.split(loop=l4, factors=[None, 16], preserve_unit_iters=True)
-l13, l14, l15, l16, l17, l18 = sch.get_loops(block=b0)
-sch.reorder(l15, l17, l12, l10, l8)
-b19 = sch.blockize(loop=l12)
-sch.annotate(block_or_loop=b19, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32")
-sch.annotate(block_or_loop=b19, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32")
-sch.annotate(block_or_loop=b19, ann_key="warp_execution", ann_val=1)
-l20, l21, l22 = sch.get_loops(block=b19)
-v23, v24, v25, v26, v27 = sch.sample_perfect_tile(loop=l20, n=5, max_innermost_factor=4)
-l28, l29, l30, l31, l32 = sch.split(loop=l20, factors=[v23, v24, v25, v26, v27], preserve_unit_iters=True)
-v33, v34, v35, v36, v37 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4)
-l38, l39, l40, l41, l42 = sch.split(loop=l21, factors=[v33, v34, v35, v36, v37], preserve_unit_iters=True)
-v43, v44, v45 = sch.sample_perfect_tile(loop=l22, n=3, max_innermost_factor=4)
-l46, l47, l48 = sch.split(loop=l22, factors=[v43, v44, v45], preserve_unit_iters=True)
-sch.reorder(l28, l38, l29, l39, l30, l40, l46, l47, l31, l41, l48, l32, l42)
-l49 = sch.fuse(l28, l38, preserve_unit_iters=True)
-sch.bind(loop=l49, thread_axis="blockIdx.y")
-l50 = sch.fuse(l29, l39, preserve_unit_iters=True)
-sch.bind(loop=l50, thread_axis="blockIdx.x")
-l51 = sch.fuse(l30, l40, preserve_unit_iters=True)
-sch.bind(loop=l51, thread_axis="threadIdx.y")
-b52 = sch.cache_write(block=b19, write_buffer_index=0, storage_scope="wmma.accumulator")
-sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True, index=-1)
-sch.reverse_compute_inline(block=b1)
-l53, l54, l55, l56, l57 = sch.get_loops(block=b52)
-l58, l59 = sch.split(loop=l57, factors=[None, 16], preserve_unit_iters=True)
-l60, l61 = sch.split(loop=l56, factors=[None, 16], preserve_unit_iters=True)
-l62, l63, l64, l65, l66, l67, l68 = sch.get_loops(block=b52)
-sch.reorder(l67, l61, l59)
-b69 = sch.blockize(loop=l61)
-sch.annotate(block_or_loop=b69, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_global")
-b70 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True, index=-1)
-l71, l72, l73, l74, l75, l76 = sch.get_loops(block=b70)
-l77 = sch.fuse(l75, l76, preserve_unit_iters=True)
-v78 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b70, ann_key="meta_schedule.cooperative_fetch", ann_val=v78)
-b79 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True, index=-1)
-l80, l81, l82, l83, l84, l85 = sch.get_loops(block=b79)
-l86 = sch.fuse(l84, l85, preserve_unit_iters=True)
-v87 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b79, ann_key="meta_schedule.cooperative_fetch", ann_val=v87)
-b88 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="wmma.matrix_a")
-sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True, index=-1)
-l89, l90, l91, l92, l93, l94, l95 = sch.get_loops(block=b88)
-l96, l97 = sch.split(loop=l95, factors=[None, 16], preserve_unit_iters=True)
-l98, l99 = sch.split(loop=l94, factors=[None, 16], preserve_unit_iters=True)
-l100, l101, l102, l103, l104, l105, l106, l107, l108 = sch.get_loops(block=b88)
-sch.reorder(l107, l99, l97)
-b109 = sch.blockize(loop=l99)
-sch.annotate(block_or_loop=b109, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
-b110 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="wmma.matrix_b")
-sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True, index=-1)
-l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b110)
-l118, l119 = sch.split(loop=l117, factors=[None, 16], preserve_unit_iters=True)
-l120, l121 = sch.split(loop=l116, factors=[None, 16], preserve_unit_iters=True)
-l122, l123, l124, l125, l126, l127, l128, l129, l130 = sch.get_loops(block=b110)
-sch.reorder(l129, l121, l119)
-b131 = sch.blockize(loop=l121)
-sch.annotate(block_or_loop=b131, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b")
-sch.compute_inline(block=b2)
-sch.compute_inline(block=b3)
-sch.storage_align(block=b70, buffer_index=0, axis=-2, factor=32, offset=8)
-sch.storage_align(block=b79, buffer_index=0, axis=-2, factor=32, offset=8)""".split(
-            "\n"
-        )
-    ]
-    check_trace(spaces, expected)
-
-    ctx = _create_context(
-        workload,
-        target=target,
-        rule=[
-            multi_level_tiling_tensor_core(
-                target=target, write_reuse_scope="global", trans_b=[False, True]
-            ),
-            auto_inline(target),
-        ],
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 2
-
-    expected = [
-        expected[0],
-        """b0 = sch.get_block(name="C", func_name="main")
-sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
-b1 = sch.reindex(block=b0, buffer=("write", 0))
-b2 = sch.reindex(block=b0, buffer=("read", 0))
-b3 = sch.reindex(block=b0, buffer=("read", 1))
-sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, ))
-sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (j, k, ))
-sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, ))
-sch.transform_block_layout(block=b1, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b2, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, ))
-sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, ))
-l4, l5, l6 = sch.get_loops(block=b0)
-l7, l8 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True)
-l9, l10 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
-l11, l12 = sch.split(loop=l4, factors=[None, 16], preserve_unit_iters=True)
-l13, l14, l15, l16, l17, l18 = sch.get_loops(block=b0)
-sch.reorder(l15, l17, l12, l10, l8)
-b19 = sch.blockize(loop=l12)
-sch.annotate(block_or_loop=b19, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32_trans")
-sch.annotate(block_or_loop=b19, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32")
-sch.annotate(block_or_loop=b19, ann_key="warp_execution", ann_val=1)
-l20, l21, l22 = sch.get_loops(block=b19)
-v23, v24, v25, v26, v27 = sch.sample_perfect_tile(loop=l20, n=5, max_innermost_factor=4)
-l28, l29, l30, l31, l32 = sch.split(loop=l20, factors=[v23, v24, v25, v26, v27], preserve_unit_iters=True)
-v33, v34, v35, v36, v37 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4)
-l38, l39, l40, l41, l42 = sch.split(loop=l21, factors=[v33, v34, v35, v36, v37], preserve_unit_iters=True)
-v43, v44, v45 = sch.sample_perfect_tile(loop=l22, n=3, max_innermost_factor=4)
-l46, l47, l48 = sch.split(loop=l22, factors=[v43, v44, v45], preserve_unit_iters=True)
-sch.reorder(l28, l38, l29, l39, l30, l40, l46, l47, l31, l41, l48, l32, l42)
-l49 = sch.fuse(l28, l38, preserve_unit_iters=True)
-sch.bind(loop=l49, thread_axis="blockIdx.y")
-l50 = sch.fuse(l29, l39, preserve_unit_iters=True)
-sch.bind(loop=l50, thread_axis="blockIdx.x")
-l51 = sch.fuse(l30, l40, preserve_unit_iters=True)
-sch.bind(loop=l51, thread_axis="threadIdx.y")
-b52 = sch.cache_write(block=b19, write_buffer_index=0, storage_scope="wmma.accumulator")
-sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True, index=-1)
-sch.reverse_compute_inline(block=b1)
-l53, l54, l55, l56, l57 = sch.get_loops(block=b52)
-l58, l59 = sch.split(loop=l57, factors=[None, 16], preserve_unit_iters=True)
-l60, l61 = sch.split(loop=l56, factors=[None, 16], preserve_unit_iters=True)
-l62, l63, l64, l65, l66, l67, l68 = sch.get_loops(block=b52)
-sch.reorder(l67, l61, l59)
-b69 = sch.blockize(loop=l61)
-sch.annotate(block_or_loop=b69, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_global")
-b70 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True, index=-1)
-l71, l72, l73, l74, l75, l76 = sch.get_loops(block=b70)
-l77 = sch.fuse(l75, l76, preserve_unit_iters=True)
-v78 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b70, ann_key="meta_schedule.cooperative_fetch", ann_val=v78)
-b79 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True, index=-1)
-l80, l81, l82, l83, l84, l85 = sch.get_loops(block=b79)
-l86 = sch.fuse(l84, l85, preserve_unit_iters=True)
-v87 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b79, ann_key="meta_schedule.cooperative_fetch", ann_val=v87)
-b88 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="wmma.matrix_a")
-sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True, index=-1)
-l89, l90, l91, l92, l93, l94, l95 = sch.get_loops(block=b88)
-l96, l97 = sch.split(loop=l95, factors=[None, 16], preserve_unit_iters=True)
-l98, l99 = sch.split(loop=l94, factors=[None, 16], preserve_unit_iters=True)
-l100, l101, l102, l103, l104, l105, l106, l107, l108 = sch.get_loops(block=b88)
-sch.reorder(l107, l99, l97)
-b109 = sch.blockize(loop=l99)
-sch.annotate(block_or_loop=b109, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
-b110 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="wmma.matrix_b")
-sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True, index=-1)
-l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b110)
-l118, l119 = sch.split(loop=l117, factors=[None, 16], preserve_unit_iters=True)
-l120, l121 = sch.split(loop=l116, factors=[None, 16], preserve_unit_iters=True)
-l122, l123, l124, l125, l126, l127, l128, l129, l130 = sch.get_loops(block=b110)
-sch.reorder(l129, l121, l119)
-b131 = sch.blockize(loop=l121)
-sch.annotate(block_or_loop=b131, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b_trans")
-sch.compute_inline(block=b2)
-sch.compute_inline(block=b3)
-sch.storage_align(block=b70, buffer_index=0, axis=-2, factor=32, offset=8)
-sch.storage_align(block=b79, buffer_index=0, axis=-2, factor=32, offset=8)""".split(
-            "\n"
-        ),
-    ]
-    check_trace(spaces, expected)
-
-
-def test_multi_level_tiling_non_tensorizable():
-    # expected to do nothing on non-tensorizable workloads
-    m = n = k = 128
-    target = Target("cuda", host="llvm")
-    ctx = _create_context(
-        create_prim_func(
-            # dtype doesn't match tensor intrin
-            te_workload.matmul_relu(
-                n=n,
-                m=m,
-                k=k,
-            )
-        ),
-        target=target,
-        rule=multi_level_tiling_tensor_core(target=target, write_reuse_scope="global"),
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-
-    expected = [
-        "",  # expected to do nothing when the workload can't be tensorized
-    ]
-    check_trace(spaces, expected)
-
-
-def test_cuda_tensor_core_conv2d():
-    target = Target("cuda", host="llvm")
-    workload = create_prim_func(
-        te_workload.conv2d_nhwc(
-            N=1,
-            H=16,
-            W=16,
-            CI=32,
-            CO=32,
-            kernel_size=3,
-            stride=1,
-            padding=1,
-            in_dtype="float16",
-            out_dtype="float32",
-        )
-    )
-    ctx = _create_context(
-        workload,
-        target=target,
-        rule=multi_level_tiling_tensor_core(target=target, write_reuse_scope="shared"),
-    )
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-
-    expected = [
-        """b0 = sch.get_block(name="conv2d_nhwc", func_name="main")
-sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
-b1 = sch.reindex(block=b0, buffer=("write", 0))
-b2 = sch.reindex(block=b0, buffer=("read", 0))
-b3 = sch.reindex(block=b0, buffer=("read", 1))
-sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda h, w, rh, rw, rc: (((h*16) + w), (((rh*96) + (rw*32)) + rc), ))
-sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda co, rh, rw, rc: ((((rh*96) + (rw*32)) + rc), co, ))
-sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda h, w, co: (((h*16) + w), co, ))
-sch.transform_block_layout(block=b1, index_map=lambda n, h, w, co, rh, rw, rc: (n, ((h*16) + w), co, (((rh*96) + (rw*32)) + rc), ))
-sch.transform_block_layout(block=b2, index_map=lambda n, h, w, co, rh, rw, rc: (n, ((h*16) + w), co, (((rh*96) + (rw*32)) + rc), ))
-sch.transform_block_layout(block=b3, index_map=lambda n, h, w, co, rh, rw, rc: (n, ((h*16) + w), co, (((rh*96) + (rw*32)) + rc), ))
-sch.transform_block_layout(block=b0, index_map=lambda n, h, w, co, rh, rw, rc: (n, ((h*16) + w), co, (((rh*96) + (rw*32)) + rc), ))
-l4, l5, l6, l7 = sch.get_loops(block=b0)
-l8, l9 = sch.split(loop=l7, factors=[None, 16], preserve_unit_iters=True)
-l10, l11 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True)
-l12, l13 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
-l14, l15, l16, l17, l18, l19, l20 = sch.get_loops(block=b0)
-sch.reorder(l17, l19, l13, l11, l9)
-b21 = sch.blockize(loop=l13)
-sch.annotate(block_or_loop=b21, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32")
-sch.annotate(block_or_loop=b21, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32")
-sch.annotate(block_or_loop=b21, ann_key="warp_execution", ann_val=1)
-l22, l23, l24, l25 = sch.get_loops(block=b21)
-v26, v27, v28, v29, v30 = sch.sample_perfect_tile(loop=l22, n=5, max_innermost_factor=4)
-l31, l32, l33, l34, l35 = sch.split(loop=l22, factors=[v26, v27, v28, v29, v30], preserve_unit_iters=True)
-v36, v37, v38, v39, v40 = sch.sample_perfect_tile(loop=l23, n=5, max_innermost_factor=4)
-l41, l42, l43, l44, l45 = sch.split(loop=l23, factors=[v36, v37, v38, v39, v40], preserve_unit_iters=True)
-v46, v47, v48, v49, v50 = sch.sample_perfect_tile(loop=l24, n=5, max_innermost_factor=4)
-l51, l52, l53, l54, l55 = sch.split(loop=l24, factors=[v46, v47, v48, v49, v50], preserve_unit_iters=True)
-v56, v57, v58 = sch.sample_perfect_tile(loop=l25, n=3, max_innermost_factor=4)
-l59, l60, l61 = sch.split(loop=l25, factors=[v56, v57, v58], preserve_unit_iters=True)
-sch.reorder(l31, l41, l51, l32, l42, l52, l33, l43, l53, l59, l60, l34, l44, l54, l61, l35, l45, l55)
-l62 = sch.fuse(l31, l41, l51, preserve_unit_iters=True)
-sch.bind(loop=l62, thread_axis="blockIdx.y")
-l63 = sch.fuse(l32, l42, l52, preserve_unit_iters=True)
-sch.bind(loop=l63, thread_axis="blockIdx.x")
-l64 = sch.fuse(l33, l43, l53, preserve_unit_iters=True)
-sch.bind(loop=l64, thread_axis="threadIdx.y")
-b65 = sch.cache_write(block=b21, write_buffer_index=0, storage_scope="shared")
-sch.reverse_compute_at(block=b65, loop=l63, preserve_unit_loops=True, index=-1)
-b66 = sch.cache_write(block=b21, write_buffer_index=0, storage_scope="wmma.accumulator")
-sch.reverse_compute_at(block=b66, loop=l64, preserve_unit_loops=True, index=-1)
-v67 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b65, ann_key="meta_schedule.cooperative_fetch", ann_val=v67)
-sch.reverse_compute_inline(block=b1)
-l68, l69, l70, l71, l72 = sch.get_loops(block=b66)
-l73, l74 = sch.split(loop=l72, factors=[None, 16], preserve_unit_iters=True)
-l75, l76 = sch.split(loop=l71, factors=[None, 16], preserve_unit_iters=True)
-l77, l78, l79, l80, l81, l82, l83 = sch.get_loops(block=b66)
-sch.reorder(l82, l76, l74)
-b84 = sch.blockize(loop=l76)
-sch.annotate(block_or_loop=b84, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared")
-b85 = sch.cache_read(block=b21, read_buffer_index=0, storage_scope="shared")
-sch.compute_at(block=b85, loop=l59, preserve_unit_loops=True, index=-1)
-l86, l87, l88, l89, l90, l91 = sch.get_loops(block=b85)
-l92 = sch.fuse(l90, l91, preserve_unit_iters=True)
-v93 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b85, ann_key="meta_schedule.cooperative_fetch", ann_val=v93)
-b94 = sch.cache_read(block=b21, read_buffer_index=1, storage_scope="shared")
-sch.compute_at(block=b94, loop=l59, preserve_unit_loops=True, index=-1)
-l95, l96, l97, l98, l99, l100 = sch.get_loops(block=b94)
-l101 = sch.fuse(l99, l100, preserve_unit_iters=True)
-v102 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
-sch.annotate(block_or_loop=b94, ann_key="meta_schedule.cooperative_fetch", ann_val=v102)
-b103 = sch.cache_read(block=b21, read_buffer_index=0, storage_scope="wmma.matrix_a")
-sch.compute_at(block=b103, loop=l60, preserve_unit_loops=True, index=-1)
-l104, l105, l106, l107, l108, l109, l110 = sch.get_loops(block=b103)
-l111, l112 = sch.split(loop=l110, factors=[None, 16], preserve_unit_iters=True)
-l113, l114 = sch.split(loop=l109, factors=[None, 16], preserve_unit_iters=True)
-l115, l116, l117, l118, l119, l120, l121, l122, l123 = sch.get_loops(block=b103)
-sch.reorder(l122, l114, l112)
-b124 = sch.blockize(loop=l114)
-sch.annotate(block_or_loop=b124, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
-b125 = sch.cache_read(block=b21, read_buffer_index=1, storage_scope="wmma.matrix_b")
-sch.compute_at(block=b125, loop=l60, preserve_unit_loops=True, index=-1)
-l126, l127, l128, l129, l130, l131, l132 = sch.get_loops(block=b125)
-l133, l134 = sch.split(loop=l132, factors=[None, 16], preserve_unit_iters=True)
-l135, l136 = sch.split(loop=l131, factors=[None, 16], preserve_unit_iters=True)
-l137, l138, l139, l140, l141, l142, l143, l144, l145 = sch.get_loops(block=b125)
-sch.reorder(l144, l136, l134)
-b146 = sch.blockize(loop=l136)
-sch.annotate(block_or_loop=b146, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b")
-sch.compute_inline(block=b2)
-sch.compute_inline(block=b3)
-sch.storage_align(block=b85, buffer_index=0, axis=-2, factor=32, offset=8)
-sch.storage_align(block=b94, buffer_index=0, axis=-2, factor=32, offset=8)""".split(
-            "\n"
-        )
-    ]
-    check_trace(spaces, expected)
-
-    # test adding unappliable tensor intrinsics doesn't change the search space
-    ctx = _create_context(
-        workload,
-        target,
-        multi_level_tiling_tensor_core(
-            target=target,
-            write_reuse_scope="shared",
-            in_dtype="float16",
-            out_dtype=["float16", "float32"],
-        ),
-    )
-    check_trace(spaces, expected)
-    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
-    assert len(spaces) == 1
-
-
-if __name__ == "__main__":
-    tvm.testing.main()

From c0d2734056d4d4bfc67a125b4e61194a809f22d5 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Thu, 15 Sep 2022 23:29:17 -0700
Subject: [PATCH 184/704] [TVMScript] IRBuilder methods for `Axis` (#12808)

This PR introduces remaining IRBuilder methods for `Axis`.

Co-authored-by: yongwww <yongcale@gmail.com>
---
 include/tvm/script/ir_builder/tir/ir.h        |  49 ++++++
 python/tvm/script/ir_builder/tir/ir.py        | 157 +++++++++++++++++-
 src/script/ir_builder/tir/ir.cc               |  86 ++++++++++
 .../unittest/test_tvmscript_ir_builder_tir.py |  43 +++++
 4 files changed, 334 insertions(+), 1 deletion(-)

diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
index 68948196ff6b..037606253adc 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -141,6 +141,55 @@ void PreflattenedBuffer(Buffer postflattened_buffer, Array<PrimExpr> shape,
  */
 BlockFrame Block(String name, bool no_realize = false);
 
+namespace axis {
+
+/*!
+ * \brief The spatial block axis defining function.
+ * \param dom The domain of the iteration variable.
+ * \param binding The binding value of the iteration variable.
+ * \param dtype The data type of the iteration variable.
+ * \return The iteration variable.
+ */
+Var Spatial(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+
+/*!
+ * \brief The reduced block axis defining function.
+ * \param dom The domain of the iteration variable.
+ * \param binding The binding value of the iteration variable.
+ * \param dtype The data type of the iteration variable.
+ * \return The iteration variable.
+ */
+Var Reduce(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+
+/*!
+ * \brief The scanning block axis defining function.
+ * \param dom The domain of the iteration variable.
+ * \param binding The binding value of the iteration variable.
+ * \param dtype The data type of the iteration variable.
+ * \return The iteration variable.
+ */
+Var Scan(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+
+/*!
+ * \brief The opaque block axis defining function.
+ * \param dom The domain of the iteration variable.
+ * \param binding The binding value of the iteration variable.
+ * \param dtype The data type of the iteration variable.
+ * \return The iteration variable.
+ */
+Var Opaque(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+
+/*!
+ * \brief The block axis remapping function.
+ * \param kinds The types of the iteration variables.
+ * \param bindings The binding values of the iteration variables.
+ * \param dtype The data types of the iteration variables.
+ * \return The iteration variables.
+ */
+Array<Var> Remap(String kinds, Array<PrimExpr> bindings, DataType dtype = DataType::Int(32));
+
+}  // namespace axis
+
 /*!
  * \brief The serial For statement.
  * \param start The minimum value of iteration.
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index a5cdf8a3a105..40cd99c744d7 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -20,7 +20,7 @@
 from numbers import Integral
 from typing import Any, Dict, List, Optional, Union, Tuple
 
-from tvm.ir import Type
+from tvm.ir import Range, Type
 from tvm.tir import (
     Buffer,
     BufferLoad,
@@ -344,6 +344,160 @@ def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame:
     return _ffi_api.Block(name, no_realize)  # pylint: disable=no-member # type: ignore
 
 
+def _as_range(dom: Union[Range, List[PrimExpr]]) -> Range:
+    """The range constructor.
+
+    Parameters
+    ----------
+    dom : Union[Range, List[PrimExpr]]
+        The domain.
+
+    Returns
+    -------
+    res : Range
+        The Range.
+    """
+    if isinstance(dom, Range):
+        return dom
+    if isinstance(dom, (list, tuple)):
+        return Range(dom[0], dom[1])
+    return Range(0, dom)
+
+
+class axis:  # pylint: disable=invalid-name
+    @staticmethod
+    def spatial(
+        dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32"
+    ) -> Var:
+        """The spatial block axis defining function.
+
+        Parameters
+        ----------
+        dom : Union[Range, List[PrimExpr], Tuple[PrimExpr]]
+            The domain of the iteration variable.
+
+        binding : PrimExpr
+            The binding value of the iteration variable.
+
+        dtype : str
+            The data type of the iteration variable.
+
+        Returns
+        -------
+        res : Var
+            The iteration variable.
+        """
+        return _ffi_api.AxisSpatial(  # pylint: disable=no-member # type: ignore
+            _as_range(dom), binding, dtype
+        )
+
+    @staticmethod
+    def reduce(
+        dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32"
+    ) -> Var:
+        """The reduced block axis defining function.
+
+        Parameters
+        ----------
+        dom : Union[Range, List[PrimExpr], Tuple[PrimExpr]]
+            The domain of the iteration variable.
+
+        binding : PrimExpr
+            The binding value of the iteration variable.
+
+        dtype : str
+            The data type of the iteration variable.
+
+        Returns
+        -------
+        res : Var
+            The iteration variable.
+        """
+        return _ffi_api.AxisReduce(  # pylint: disable=no-member # type: ignore
+            _as_range(dom), binding, dtype
+        )
+
+    @staticmethod
+    def scan(
+        dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32"
+    ) -> Var:
+        """The scanning block axis defining function.
+
+        Parameters
+        ----------
+        dom : Union[Range, List[PrimExpr], Tuple[PrimExpr]]
+            The domain of the iteration variable.
+
+        binding : PrimExpr
+            The binding value of the iteration variable.
+
+        dtype : str
+            The data type of the iteration variable.
+
+        Returns
+        -------
+        res : Var
+            The iteration variable.
+        """
+        return _ffi_api.AxisScan(  # pylint: disable=no-member # type: ignore
+            _as_range(dom), binding, dtype
+        )
+
+    @staticmethod
+    def opaque(
+        dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32"
+    ) -> Var:
+        """The opaque block axis defining function.
+
+        Parameters
+        ----------
+        dom : Union[Range, List[PrimExpr], Tuple[PrimExpr]]
+            The domain of the iteration variable.
+
+        binding : PrimExpr
+            The binding value of the iteration variable.
+
+        dtype : str
+            The data type of the iteration variable.
+
+        Returns
+        -------
+        res : Var
+            The iteration variable.
+        """
+        return _ffi_api.AxisOpaque(  # pylint: disable=no-member # type: ignore
+            _as_range(dom), binding, dtype
+        )
+
+    @staticmethod
+    def remap(kinds: str, bindings: List[PrimExpr], dtype: str = "int32") -> Union[List[Var], Var]:
+        """The block axis remapping function.
+
+        Parameters
+        ----------
+        kinds : str
+            The types of the iteration variables.
+
+        bindings : List[PrimExpr]
+            The binding values of the iteration variables.
+
+        dtype : str
+            The data types of the iteration variables.
+
+        Returns
+        -------
+        res : Var
+            The iteration variables.
+        """
+        iter_vars = _ffi_api.AxisRemap(  # pylint: disable=no-member # type: ignore
+            kinds, bindings, dtype
+        )
+        return iter_vars[0] if len(iter_vars) == 1 else iter_vars
+
+    S = spatial  # pylint: disable=invalid-name
+    R = reduce  # pylint: disable=invalid-name
+
+
 def serial(
     start: PrimExpr, stop: PrimExpr = None, *, annotations: Dict[str, Any] = None
 ) -> frame.ForFrame:
@@ -843,6 +997,7 @@ def var(dtype, name="") -> Var:
     "match_buffer",
     "preflattened_buffer",
     "block",
+    "axis",
     "serial",
     "parallel",
     "vectorized",
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index 22c7face7084..5013e321728e 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -173,6 +173,86 @@ BlockFrame Block(String name, bool no_realize) {
   return BlockFrame(n);
 }
 
+namespace axis {
+
+IterVar PushBlockVar(IterVar iter_var, PrimExpr binding) {
+  if (Optional<BlockFrame> opt_frame = IRBuilder::Current()->GetLastFrame<BlockFrame>()) {
+    BlockFrame frame = opt_frame.value();
+    frame->iter_vars.push_back(iter_var);
+    frame->iter_values.push_back(binding);
+  } else {
+    LOG(FATAL) << "TypeError: The last frame is not BlockFrame";
+  }
+  return iter_var;
+}
+
+#define TVM_TIR_IR_BUILDER_AXIS(Method, Kind, Name)                                           \
+  Var Method(Range dom, PrimExpr binding, DataType dtype) {                                   \
+    ICHECK(dom.defined()) << Name << " axis must have a domain";                              \
+    int bits = std::max({dom->min.dtype().bits(), dom->extent.dtype().bits(), dtype.bits()}); \
+    return PushBlockVar(IterVar(/*dom=*/dom, /*var=*/Var("", dtype.with_bits(bits)),          \
+                                /*iter_type=*/Kind, /*thread_tag=*/""),                       \
+                        binding)                                                              \
+        ->var;                                                                                \
+  }
+TVM_TIR_IR_BUILDER_AXIS(Spatial, tvm::tir::IterVarType::kDataPar, "Spatial");
+TVM_TIR_IR_BUILDER_AXIS(Reduce, tvm::tir::IterVarType::kCommReduce, "Reduction");
+TVM_TIR_IR_BUILDER_AXIS(Scan, tvm::tir::IterVarType::kOrdered, "Scan");
+TVM_TIR_IR_BUILDER_AXIS(Opaque, tvm::tir::IterVarType::kOpaque, "Opaque");
+#undef TVM_TIR_IR_BUILDER_AXIS
+
+Array<Var> Remap(String kinds, Array<PrimExpr> bindings, DataType dtype) {
+  using namespace tvm::tir;
+  Array<Var> results;
+  ICHECK_EQ(kinds.size(), bindings.size());
+  int n = bindings.size();
+  results.reserve(n);
+  for (int i = 0; i < n; ++i) {
+    char c = kinds.c_str()[i];
+    PrimExpr e = bindings[i];
+    const VarNode* v = e.as<VarNode>();
+    ICHECK(v) << "TypeError: Only Var is supported in T.axis.remap";
+    Range dom{nullptr};
+    for (const auto& frame : IRBuilder::Current()->frames) {
+      if (const auto* for_frame = frame.as<ForFrameNode>()) {
+        ICHECK_EQ(for_frame->doms.size(), for_frame->vars.size());
+        int n = for_frame->doms.size();
+        for (int i = 0; i < n; ++i) {
+          if (for_frame->vars[i].get() == v) {
+            dom = for_frame->doms[i];
+            break;
+          }
+        }
+        if (dom.defined()) {
+          break;
+        }
+      }
+    }
+    ICHECK(dom.defined()) << "TypeError: Variable is not in the loop: " << GetRef<Var>(v);
+    DataType dtype = v->dtype;
+    if (c == 'S') {
+      results.push_back(PushBlockVar(IterVar(/*dom=*/dom,
+                                             /*var=*/Var("", dtype),
+                                             /*iter_type=*/IterVarType::kDataPar,
+                                             /*thread_tag=*/""),
+                                     e)
+                            ->var);
+    } else if (c == 'R') {
+      results.push_back(PushBlockVar(IterVar(/*dom=*/dom,
+                                             /*var=*/Var("", dtype),
+                                             /*iter_type=*/IterVarType::kCommReduce,
+                                             /*thread_tag=*/""),
+                                     e)
+                            ->var);
+    } else {
+      LOG(FATAL) << "Unknown axis kind: " << c;
+    }
+  }
+  return results;
+}
+
+}  // namespace axis
+
 #define TVM_TIR_IR_BUILDER_FOR_FRAME(Method, Kind)                                                \
   ForFrame Method(PrimExpr start, PrimExpr stop, Optional<Map<String, ObjectRef>> annotations) {  \
     PrimExpr min = start;                                                                         \
@@ -304,6 +384,12 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.PreflattenedBuffer").set_body_typed(P
 
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block);
 
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisSpatial").set_body_typed(axis::Spatial);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisReduce").set_body_typed(axis::Reduce);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisScan").set_body_typed(axis::Scan);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisOpaque").set_body_typed(axis::Opaque);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisRemap").set_body_typed(axis::Remap);
+
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Serial").set_body_typed(Serial);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Parallel").set_body_typed(Parallel);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Vectorized").set_body_typed(Vectorized);
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index 9cbfd75e2280..d893ebc545c6 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -114,6 +114,49 @@ def test_ir_builder_tir_block():
     assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True)
 
 
+def test_ir_builder_tir_axis():
+    with IRBuilder() as ib:
+        a = T.var("int32", "a")
+        b = T.var("int32", "b")
+        c = T.var("int32", "c")
+        d = T.var("int32", "d")
+        with T.block("block"):
+            T.axis.spatial(8, a)
+            T.axis.reduce(16, b)
+            T.axis.scan(32, c)
+            T.axis.opaque(64, d)
+            T.evaluate(0)
+
+    # the block generated by IRBuilder
+    block_realize_actual = ib.get()
+
+    # the expected block
+    var_a = tir.Var("a", "int32")
+    var_b = tir.Var("b", "int32")
+    var_c = tir.Var("c", "int32")
+    var_d = tir.Var("d", "int32")
+    block_expected = tir.Block(
+        iter_vars=[
+            tir.IterVar((0, 8), tir.Var("", "int32"), iter_type=tir.IterVar.DataPar),
+            tir.IterVar((0, 16), tir.Var("", "int32"), iter_type=tir.IterVar.CommReduce),
+            tir.IterVar((0, 32), tir.Var("", "int32"), iter_type=tir.IterVar.Ordered),
+            tir.IterVar((0, 64), tir.Var("", "int32"), iter_type=tir.IterVar.DimInfo),
+        ],
+        reads=[],
+        writes=[],
+        name_hint="block",
+        body=tir.Evaluate(0),
+        annotations={"tir.script_parsing_detect_access": tir.IntImm("int64", 3)},
+    )
+    block_realize_expected = tir.BlockRealize(
+        iter_values=[var_a, var_b, var_c, var_d],
+        predicate=True,
+        block=block_expected,
+    )
+    # Check if the generated ir is expected
+    assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True)
+
+
 def test_ir_builder_tir_for():
     with IRBuilder() as ib:
         with T.serial(128) as a:

From 9b17f344a31a13226458a2d48dcb7b55ce282274 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 16 Sep 2022 00:02:18 -0700
Subject: [PATCH 185/704] [ci][docker] Fix nightly Docker tests (#12804)

These were broken due to this missing guard:
https://ci.tlcpack.ai/job/docker-images-ci/job/docker-image-run-tests/223/console

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                  | 4 ++--
 ci/jenkins/Prepare.groovy.j2 | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 5835100fde3e..8ca181a759ff 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-09-14T11:22:31.582192
+// Generated at 2022-09-15T16:03:21.407877
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -213,7 +213,7 @@ def cancel_previous_build() {
 
 def checkout_trusted_files() {
   // trust everything from branch builds
-  if (!env.BRANCH_NAME.startsWith('PR-')) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
     return;
   }
 
diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2
index 4464108968de..cb677f437a3c 100644
--- a/ci/jenkins/Prepare.groovy.j2
+++ b/ci/jenkins/Prepare.groovy.j2
@@ -101,7 +101,7 @@ def cancel_previous_build() {
 
 def checkout_trusted_files() {
   // trust everything from branch builds
-  if (!env.BRANCH_NAME.startsWith('PR-')) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
     return;
   }
 

From 6b3be496e6ac2e2de22a59d935e5256e04bc8c74 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Fri, 16 Sep 2022 06:11:07 -0700
Subject: [PATCH 186/704] [MetaSchedule][Minor]Fix Random State Fork in
 TuneContext Clone Function (#12811)

Fix random state fork in TuneContext Clone function.
---
 src/meta_schedule/tune_context.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/meta_schedule/tune_context.cc b/src/meta_schedule/tune_context.cc
index 3650c0374dab..ee24624fe9e4 100644
--- a/src/meta_schedule/tune_context.cc
+++ b/src/meta_schedule/tune_context.cc
@@ -74,6 +74,7 @@ TuneContext TuneContextNode::Clone() const {
   }
   if (this->space_generator.defined()) n->space_generator = this->space_generator.value()->Clone();
   if (this->search_strategy.defined()) n->search_strategy = this->search_strategy.value()->Clone();
+  n->rand_state = support::LinearCongruentialEngine(&n->rand_state).ForkSeed();
   n->Initialize();
   return TuneContext(n);
 }

From 8f8b6d8837a989a95ea9716b517644f898ef9b7e Mon Sep 17 00:00:00 2001
From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com>
Date: Fri, 16 Sep 2022 16:51:09 +0100
Subject: [PATCH 187/704] Fix for import requests and import caffe failures
 (#12813)

Recently virtual environments were introduced in the
docker images which was a great contribution to
localize errors: https://github.com/apache/tvm/pull/12663. In this fix, link to the caffe is
created inside this virtual env instead of adding it
to the system path of python. This fix also removes
importing request package where not needed.

Fixes #12663
---
 ci/scripts/github_skipped_tests_comment.py | 2 --
 docker/install/ubuntu_install_caffe.sh     | 8 +++++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/ci/scripts/github_skipped_tests_comment.py b/ci/scripts/github_skipped_tests_comment.py
index ef0630620b97..c07fbf4a8463 100755
--- a/ci/scripts/github_skipped_tests_comment.py
+++ b/ci/scripts/github_skipped_tests_comment.py
@@ -24,8 +24,6 @@
 from urllib import error
 from xml.etree import ElementTree
 
-import requests
-
 from git_utils import git, GitHubRepo, parse_remote
 from cmd_utils import init_log
 
diff --git a/docker/install/ubuntu_install_caffe.sh b/docker/install/ubuntu_install_caffe.sh
index c37bfb764935..4d9763b69aa3 100755
--- a/docker/install/ubuntu_install_caffe.sh
+++ b/docker/install/ubuntu_install_caffe.sh
@@ -18,6 +18,11 @@
 
 set -euxo pipefail
 
+if [ -z "${TVM_VENV+x}" ]; then
+    echo "ERROR: expect TVM_VENV env var to be set"
+    exit 2
+fi
+
 apt-get update --fix-missing
 
 # # Install dependencies
@@ -60,4 +65,5 @@ cd / && rm -rf /caffe_src
 
 PYCAFFE_ROOT=${CAFFE_HOME}/python
 echo "${CAFFE_HOME}/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig
-ln -s ${PYCAFFE_ROOT}/caffe /usr/local/lib/python3.7/dist-packages/caffe
+VENV_SITE_PACKAGE=$(pip3 show numpy | grep "Location:" | cut -d ' ' -f 2)
+ln -s ${PYCAFFE_ROOT}/caffe ${VENV_SITE_PACKAGE}/caffe

From 43d9a3b93baeeec33fa0f4953f50c7242e8183b5 Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Fri, 16 Sep 2022 10:50:30 -0700
Subject: [PATCH 188/704] =?UTF-8?q?[Hexagon]=20Reduce=20the=20number=20of?=
 =?UTF-8?q?=20tests=20run=20for=20VTCM=20testing=20in=20order=20to?=
 =?UTF-8?q?=E2=80=A6=20(#12783)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[Hexagon] Reduce the number of tests run for VTCM testing in order to speedup CI.
---
 .../test_parallel_hvx_load_vtcm.py            | 25 ++++++++++++----
 .../test_hexagon/test_vtcm_bandwidth.py       | 30 +++++++++++--------
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
index c9ff07c490c8..5dcb4b18b845 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -308,7 +308,14 @@ def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global")
     a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope=mem_scope)
     b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope=mem_scope)
     c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope=mem_scope)
-    timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10)
+
+    # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise.
+    number = 1
+    repeat = 1
+
+    timer = module.time_evaluator(
+        "__tvm_main__", hexagon_session.device, number=number, repeat=repeat
+    )
     time = timer(a_hexagon, b_hexagon, c_hexagon)
     gops = round(operations * 128 * 3 / time.mean / 1e9, 4)
     return gops, c_hexagon.asnumpy()
@@ -338,7 +345,13 @@ def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations):
         c_vtcm, device=hexagon_session.device, mem_scope="global.vtcm"
     )
 
-    timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10)
+    # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise.
+    number = 1
+    repeat = 1
+
+    timer = module.time_evaluator(
+        "__tvm_main__", hexagon_session.device, number=number, repeat=repeat
+    )
     time = timer(a_hexagon, b_hexagon, c_hexagon, a_vtcm_hexagon, b_vtcm_hexagon, c_vtcm_hexagon)
     gops = round(operations * 128 * 3 / time.mean / 1e9, 4)
     return gops, c_hexagon.asnumpy()
@@ -372,12 +385,12 @@ def expected_output(operations, input_a, input_b, input_c):
 
 
 class TestMatMulVec:
-
+    # Removed most of these to speedup CI.
     operations = tvm.testing.parameter(
         1024,
-        2048,
-        4096,
-        5 * 2048,  # 3.93MB of total transfer
+        # 2048,
+        # 4096,
+        # 5 * 2048,  # 3.93MB of total transfer
         # 16384, #Only works on 8Gen1 HDK's
         # 5 * 4096,  # 7.86MB of total transfer. Only works on 8Gen1 HDK's
     )
diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
index 6db8b9101997..83daf2458737 100644
--- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
+++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
@@ -96,10 +96,13 @@ def evaluate(hexagon_session, sch, size):
         a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm"
     )
 
-    # a_hexagon = allocate_hexagon_array(hexagon_session.device, data=a, mem_scope="global")
-    # a_vtcm_hexagon = allocate_hexagon_array(hexagon_session.device, data=a_vtcm, mem_scope="global.vtcm")
+    # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise.
+    number = 1
+    repeat = 1
 
-    timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10)
+    timer = module.time_evaluator(
+        "__tvm_main__", hexagon_session.device, number=number, repeat=repeat
+    )
     runtime = timer(a_hexagon, a_vtcm_hexagon)
 
     gbps = round((size / 2**30) / runtime.mean, 4)
@@ -110,18 +113,19 @@ def evaluate(hexagon_session, sch, size):
 
 class TestMatMulVec:
 
+    # Removed most of these to speedup CI.
     size = tvm.testing.parameter(
-        10 * KB,
-        20 * KB,
-        40 * KB,
-        80 * KB,
-        160 * KB,
-        320 * KB,
+        # 10 * KB,
+        # 20 * KB,
+        # 40 * KB,
+        # 80 * KB,
+        # 160 * KB,
+        # 320 * KB,
         640 * KB,
-        MB,
-        2 * MB,
-        3 * MB,
-        4 * MB,
+        # MB,
+        # 2 * MB,
+        # 3 * MB,
+        # 4 * MB,
         # 8 * MB,  # Only works on 8gen1 HDKs
     )
 

From 7c96e255ce7d6d6a22b3665449ebfafb581a9fc8 Mon Sep 17 00:00:00 2001
From: Janet Schneider <janet.schneider@gmail.com>
Date: Fri, 16 Sep 2022 11:53:53 -0700
Subject: [PATCH 189/704] [Hexagon] [runtime] Protect access to global
 HexagonBufferManager map (#12807)

* Protect access to global buffer manager map

* Fix lint
---
 src/runtime/hexagon/hexagon_buffer_manager.h | 25 +++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_buffer_manager.h b/src/runtime/hexagon/hexagon_buffer_manager.h
index 658a39fac8a8..a698b0ecb163 100644
--- a/src/runtime/hexagon/hexagon_buffer_manager.h
+++ b/src/runtime/hexagon/hexagon_buffer_manager.h
@@ -43,7 +43,10 @@ class HexagonBufferManager {
     CHECK(it != hexagon_buffer_map_.end())
         << "Attempt made to free unknown or already freed dataspace allocation";
     CHECK(it->second != nullptr);
-    hexagon_buffer_map_.erase(it);
+    {
+      std::lock_guard<std::mutex> lock(map_mutex_);
+      hexagon_buffer_map_.erase(it);
+    }
   }
   /*!
    * \brief Allocate a HexagonBuffer.
@@ -53,15 +56,22 @@ class HexagonBufferManager {
   void* AllocateHexagonBuffer(Args&&... args) {
     auto buf = std::make_unique<HexagonBuffer>(std::forward<Args>(args)...);
     void* ptr = buf->GetPointer();
-    hexagon_buffer_map_.insert({ptr, std::move(buf)});
+    {
+      std::lock_guard<std::mutex> lock(map_mutex_);
+      hexagon_buffer_map_.insert({ptr, std::move(buf)});
+    }
     return ptr;
   }
 
   //! \brief Returns whether the HexagonBuffer is in the map.
-  size_t count(void* ptr) { return hexagon_buffer_map_.count(ptr); }
+  size_t count(void* ptr) {
+    std::lock_guard<std::mutex> lock(map_mutex_);
+    return hexagon_buffer_map_.count(ptr);
+  }
 
   //! \brief Returns an iterator to the HexagonBuffer within the map.
   HexagonBuffer* find(void* ptr) {
+    std::lock_guard<std::mutex> lock(map_mutex_);
     auto it = hexagon_buffer_map_.find(ptr);
     if (it != hexagon_buffer_map_.end()) {
       return it->second.get();
@@ -69,9 +79,18 @@ class HexagonBufferManager {
     return nullptr;
   }
 
+  //! \brief Returns whether the HexagonBufferManager has any allocations.
+  bool empty() {
+    std::lock_guard<std::mutex> lock(map_mutex_);
+    return hexagon_buffer_map_.empty();
+  }
+
  private:
   //! \brief Contains the HexagonBuffer objects managed by this class.
   std::unordered_map<void*, std::unique_ptr<HexagonBuffer>> hexagon_buffer_map_;
+
+  //! \brief Protects updates to the map.
+  std::mutex map_mutex_;
 };
 
 }  // namespace hexagon

From 5d0a16749cdba494178dee7deefa2938d1f8a88b Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 16 Sep 2022 12:54:51 -0700
Subject: [PATCH 190/704] [ci] Fix docs push (#12810)

This was missing a repo checkout and failing as in
https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4302/pipeline.
This also adds in the changes from #12719:

Fixes #12600. The original solution there doesn't actually fix the
issue, there would need to be some job queue that could make sure to
reject old pushes. Since this case is pretty rare, generally the next
commit that comes along and builds will fix everything up so we can
ignore failures that happen on `push`es.
---
 Jenkinsfile                 | 7 ++++---
 ci/jenkins/Deploy.groovy.j2 | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 8ca181a759ff..a61ab1cd69a2 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-09-15T16:03:21.407877
+// Generated at 2022-09-16T08:47:49.743918
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -4205,7 +4205,7 @@ def deploy_docs() {
       script: '''
         cd tvm-site
         git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git
-        git push deploy $DOCS_DEPLOY_BRANCH
+        git push deploy $DOCS_DEPLOY_BRANCH || true
       ''',
       label: 'Upload docs to apache/tvm-site'
     )
@@ -4222,7 +4222,8 @@ def deploy() {
       node('CPU') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docs") {
           timeout(time: max_time, unit: 'MINUTES') {
-            sh(
+            init_git()
+                    sh(
                       script: """
                         set -eux
                         . ci/scripts/retry.sh
diff --git a/ci/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2
index d2ee4360da6b..9812e1113598 100644
--- a/ci/jenkins/Deploy.groovy.j2
+++ b/ci/jenkins/Deploy.groovy.j2
@@ -73,7 +73,7 @@ def deploy_docs() {
       script: '''
         cd tvm-site
         git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git
-        git push deploy $DOCS_DEPLOY_BRANCH
+        git push deploy $DOCS_DEPLOY_BRANCH || true
       ''',
       label: 'Upload docs to apache/tvm-site'
     )
@@ -90,6 +90,7 @@ def deploy() {
           feature_flag="env.DOCS_DEPLOY_ENABLED == 'yes'",
           ws="tvm/deploy-docs",
         ) %}
+          init_git()
           {{ m.download_artifacts(tag='docs', filenames=["docs.tgz"]) }}
           deploy_docs()
         {% endcall %}

From e037ae49928592afdfa8d2c27198fc68592f9528 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 16 Sep 2022 13:29:57 -0700
Subject: [PATCH 191/704] [ci] Add bot to post welcome comment (#12695)

This would post the comment that the tests bot and the docs comment bot
uses straightaway when a PR is posted. This will contain links to
generic info about posting PRs (and obviate the
`.github/PULL_REQUEST_TEMPLATE.md`) as well as dynamic info about the
specific PR (filled in later by the respective bots). This would make
things like the auto-cc bot more transparent since it would have a link
to the relevant issue.

Tested live here: https://github.com/driazati/tvm/pull/21#issuecomment-1236019529
---
 .github/PULL_REQUEST_TEMPLATE.md           |   1 -
 .github/workflows/docs_bot.yml             |  18 --
 .github/workflows/pr_comment_bot.yml       |  55 +++++
 .github/workflows/tag_teams.yml            |   7 -
 .github/workflows/tests_bot.yml            |  21 --
 ci/scripts/__init__.py                     |  19 ++
 ci/scripts/git_utils.py                    |  60 ++++-
 ci/scripts/github_commenter.py             | 132 +++++++++++
 ci/scripts/github_docs_comment.py          |  83 +------
 ci/scripts/github_pr_comment.py            | 141 +++++++++++
 ci/scripts/github_skipped_tests_comment.py | 140 ++++-------
 ci/scripts/github_tag_teams.py             |  85 +++++--
 tests/python/ci/test_ci.py                 | 262 ++++++++++++++++++---
 13 files changed, 741 insertions(+), 283 deletions(-)
 delete mode 100644 .github/PULL_REQUEST_TEMPLATE.md
 delete mode 100644 .github/workflows/docs_bot.yml
 create mode 100644 .github/workflows/pr_comment_bot.yml
 delete mode 100644 .github/workflows/tests_bot.yml
 create mode 100644 ci/scripts/__init__.py
 create mode 100644 ci/scripts/github_commenter.py
 create mode 100755 ci/scripts/github_pr_comment.py

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
deleted file mode 100644
index 093cdc483c78..000000000000
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ /dev/null
@@ -1 +0,0 @@
-Thanks for contributing to TVM!   Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.
diff --git a/.github/workflows/docs_bot.yml b/.github/workflows/docs_bot.yml
deleted file mode 100644
index 73c12a8d7d05..000000000000
--- a/.github/workflows/docs_bot.yml
+++ /dev/null
@@ -1,18 +0,0 @@
-
-name: docs-bot
-on:
-  status
-jobs:
-  run-docs-bot:
-    if: ${{ github.repository == 'apache/tvm' && github.event.state == 'success' && github.event.context == 'tvm-ci/pr-head' }}
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - name: Comment link to docs
-        env:
-          COMMIT_SHA: ${{ github.event.sha }}
-          TARGET_URL: ${{ github.event.target_url }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -eux
-          python ci/scripts/github_docs_comment.py
\ No newline at end of file
diff --git a/.github/workflows/pr_comment_bot.yml b/.github/workflows/pr_comment_bot.yml
new file mode 100644
index 000000000000..89416df928b8
--- /dev/null
+++ b/.github/workflows/pr_comment_bot.yml
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: comment-bot
+on:
+  pull_request_target:
+    types: [opened, reopened, edited, ready_for_review, labeled]
+  status:
+
+concurrency:
+  group: pr-comment-${{ github.event.number }}-${{ github.event.target_url }}
+  cancel-in-progress: true
+
+jobs:
+  run-comment-bot:
+    if: ${{ github.repository == 'apache/tvm' }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Comment bot comment (pr)
+        if: ${{ github.event.number }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ github.event.number }}
+        run: |
+          set -eux
+          python ci/scripts/github_pr_comment.py --pr "$PR_NUMBER"
+      - name: Comment bot comment (status)
+        if: ${{ github.event.state }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          URL: ${{ github.event.target_url }}
+        run: |
+          set -eux
+          if [[ "$URL" == *"PR-"* ]]; then
+            echo "PR status, sending comment"
+            PR_NUMBER=$(echo $URL | sed 's/.*PR-//g' | sed 's/\/.*//g')
+            python ci/scripts/github_pr_comment.py --pr "$PR_NUMBER"
+          else
+            echo "Not a PR status, skipping"
+          fi
diff --git a/.github/workflows/tag_teams.yml b/.github/workflows/tag_teams.yml
index 7c10f9c33d9f..c0c1b8b8299d 100644
--- a/.github/workflows/tag_teams.yml
+++ b/.github/workflows/tag_teams.yml
@@ -15,16 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# GH actions.
-# We use it to cover windows and mac builds
-# Jenkins is still the primary CI
-
 name: Teams
 
 on:
-  # See https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target
-  pull_request_target:
-    types: [opened, reopened, edited, ready_for_review, labeled]
   issues:
     types: [opened, edited, reopened, labeled]
 
diff --git a/.github/workflows/tests_bot.yml b/.github/workflows/tests_bot.yml
deleted file mode 100644
index 0ddae2afb771..000000000000
--- a/.github/workflows/tests_bot.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-
-name: tests-bot
-on:
-  status
-jobs:
-  run-tests-bot:
-    if: ${{ github.repository == 'apache/tvm' && github.event.state == 'success' && github.event.context == 'tvm-ci/pr-head' }}
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - name: Comment skipped tests
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.CI_RESOURCES_AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.CI_RESOURCES_AWS_SECRET_ACCESS_KEY }}
-          AWS_DEFAULT_REGION: us-west-2
-          COMMIT_SHA: ${{ github.event.sha }}
-          TARGET_URL: ${{ github.event.target_url }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -eux
-          python ci/scripts/github_skipped_tests_comment.py
\ No newline at end of file
diff --git a/ci/scripts/__init__.py b/ci/scripts/__init__.py
new file mode 100644
index 000000000000..064781fa158d
--- /dev/null
+++ b/ci/scripts/__init__.py
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Package to enable testing of CI scripts"""
+
+from . import github_skipped_tests_comment, github_pr_comment, github_tag_teams, github_docs_comment
diff --git a/ci/scripts/git_utils.py b/ci/scripts/git_utils.py
index cb639178c3f9..1295ff8e3c2c 100644
--- a/ci/scripts/git_utils.py
+++ b/ci/scripts/git_utils.py
@@ -19,11 +19,14 @@
 import json
 import subprocess
 import re
+import os
 import base64
 import logging
 from urllib import request, error
 from typing import Dict, Tuple, Any, Optional, List
 
+DRY_RUN = object()
+
 
 def compress_query(query: str) -> str:
     query = query.replace("\n", "")
@@ -32,7 +35,7 @@ def compress_query(query: str) -> str:
 
 
 def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] = None):
-    print(f"Requesting POST to", url, "with", body)
+    logging.info(f"Requesting POST to", url, "with", body)
     headers = {}
     req = request.Request(url, headers=headers, method="POST")
     if auth is not None:
@@ -51,11 +54,21 @@ def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] =
         return response.read()
 
 
+def dry_run_token(is_dry_run: bool) -> Any:
+    if is_dry_run:
+        return DRY_RUN
+    return os.environ["GITHUB_TOKEN"]
+
+
 class GitHubRepo:
-    def __init__(self, user, repo, token):
+    GRAPHQL_URL = "https://api.github.com/graphql"
+
+    def __init__(self, user, repo, token, test_data=None):
         self.token = token
         self.user = user
         self.repo = repo
+        self.test_data = test_data
+        self.num_calls = 0
         self.base = f"https://api.github.com/repos/{user}/{repo}/"
 
     def headers(self):
@@ -63,22 +76,41 @@ def headers(self):
             "Authorization": f"Bearer {self.token}",
         }
 
+    def dry_run(self) -> bool:
+        return self.token == DRY_RUN
+
     def graphql(self, query: str, variables: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         query = compress_query(query)
         if variables is None:
             variables = {}
+
         response = self._request(
-            "https://api.github.com/graphql",
+            self.GRAPHQL_URL,
             {"query": query, "variables": variables},
             method="POST",
         )
+        if self.dry_run():
+            return self.testing_response("POST", self.GRAPHQL_URL)
+
         if "data" not in response:
             msg = f"Error fetching data with query:\n{query}\n\nvariables:\n{variables}\n\nerror:\n{json.dumps(response, indent=2)}"
             raise RuntimeError(msg)
         return response
 
+    def testing_response(self, method: str, url: str) -> Any:
+        self.num_calls += 1
+        key = f"[{self.num_calls}] {method} - {url}"
+        if self.test_data is not None and key in self.test_data:
+            return self.test_data[key]
+        logging.info(f"Unknown URL in dry run: {key}")
+        return {}
+
     def _request(self, full_url: str, body: Dict[str, Any], method: str) -> Dict[str, Any]:
-        print(f"Requesting {method} to", full_url, "with", body)
+        if self.dry_run():
+            logging.info(f"Dry run, would have requested a {method} to {full_url} with {body}")
+            return self.testing_response(method, full_url)
+
+        logging.info(f"Requesting {method} to {full_url} with {body}")
         req = request.Request(full_url, headers=self.headers(), method=method.upper())
         req.add_header("Content-Type", "application/json; charset=utf-8")
         data = json.dumps(body)
@@ -111,16 +143,22 @@ def post(self, url: str, data: Dict[str, Any]) -> Dict[str, Any]:
         return self._request(self.base + url, data, method="POST")
 
     def get(self, url: str) -> Dict[str, Any]:
+        if self.dry_run():
+            logging.info(f"Dry run, would have requested a GET to {url}")
+            return self.testing_response("GET", url)
         url = self.base + url
-        print("Requesting GET to", url)
+        logging.info(f"Requesting GET to {url}")
         req = request.Request(url, headers=self.headers())
         with request.urlopen(req) as response:
             response = json.loads(response.read())
         return response
 
     def delete(self, url: str) -> Dict[str, Any]:
+        if self.dry_run():
+            logging.info(f"Dry run, would have requested a DELETE to {url}")
+            return self.testing_response("DELETE", url)
         url = self.base + url
-        print("Requesting DELETE to", url)
+        logging.info(f"Requesting DELETE to {url}")
         req = request.Request(url, headers=self.headers(), method="DELETE")
         with request.urlopen(req) as response:
             response = json.loads(response.read())
@@ -136,18 +174,22 @@ def parse_remote(remote: str) -> Tuple[str, str]:
         parts = remote.split("/")
         if len(parts) < 2:
             raise RuntimeError(f"Unable to parse remote '{remote}'")
-        return parts[-2], parts[-1].replace(".git", "")
+        user, repo = parts[-2], parts[-1].replace(".git", "")
     else:
         # Parse SSH remote
         m = re.search(r":(.*)/(.*)\.git", remote)
         if m is None or len(m.groups()) != 2:
             raise RuntimeError(f"Unable to parse remote '{remote}'")
-        return m.groups()
+        user, repo = m.groups()
+
+    user = os.getenv("DEBUG_USER", user)
+    repo = os.getenv("DEBUG_REPO", repo)
+    return user, repo
 
 
 def git(command, **kwargs):
     command = ["git"] + command
-    print("Running", command)
+    logging.info(f"Running {command}")
     proc = subprocess.run(command, stdout=subprocess.PIPE, encoding="utf-8", **kwargs)
     if proc.returncode != 0:
         raise RuntimeError(f"Command failed {command}:\nstdout:\n{proc.stdout}")
diff --git a/ci/scripts/github_commenter.py b/ci/scripts/github_commenter.py
new file mode 100644
index 000000000000..dc71fcd1fd32
--- /dev/null
+++ b/ci/scripts/github_commenter.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+import logging
+from typing import Dict, Tuple, Any, Optional, List, Union
+
+from git_utils import GitHubRepo
+
+BOT_COMMENT_START = "<!---bot-comment-->"
+WELCOME_TEXT = "Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment."
+
+
+class BotCommentBuilder:
+    ALLOWLIST_USERS = {"driazati", "gigiblender", "areusch"}
+
+    def __init__(self, github: GitHubRepo, data: Dict[str, Any]):
+        self.github = github
+        self.pr_number = data["number"]
+        self.comment_data = data["comments"]["nodes"]
+        self.author = data["author"]["login"]
+
+    def find_bot_comment(self) -> Optional[Dict[str, Any]]:
+        """
+        Return the existing bot comment or None if it does not exist
+        """
+        for comment in self.comment_data:
+            logging.info(f"Checking comment {comment}")
+            if (
+                comment["author"]["login"] == "github-actions"
+                and BOT_COMMENT_START in comment["body"]
+            ):
+                logging.info("Found existing comment")
+                return comment
+        logging.info("No existing comment found")
+        return None
+
+    def find_existing_body(self) -> Dict[str, str]:
+        """
+        Find existing dynamic bullet point items
+        """
+        existing_comment = self.find_bot_comment()
+        if existing_comment is None:
+            logging.info(f"No existing comment while searching for body items")
+            return {}
+
+        matches = re.findall(
+            r"<!--bot-comment-([a-z][a-z-]+)-start-->([\S\s]*?)<!--bot-comment-([a-z-]+)-end-->",
+            existing_comment["body"],
+            flags=re.MULTILINE,
+        )
+        logging.info(f"Fetch body item matches: {matches}")
+
+        items = {}
+        for start, text, end in matches:
+            if start != end:
+                raise RuntimeError(
+                    f"Malformed comment found: {start} marker did not have matching end, found instead {end}"
+                )
+            items[start] = text.strip().lstrip("* ")
+
+        logging.info(f"Found body items: {items}")
+        return items
+
+    def _post_comment(self, body_items: Dict[str, str]):
+        comment = BOT_COMMENT_START + "\n\n" + WELCOME_TEXT + "\n\n"
+        for key, content in body_items.items():
+            line = self.start_key(key) + "\n * " + content.strip() + self.end_key(key)
+            logging.info(f"Adding line {line}")
+            comment += line
+        comment += "\n\n<sub>Generated by [tvm-bot](https://github.com/apache/tvm/blob/main/ci/README.md#github-actions)</sub>"
+
+        data = {"body": comment}
+        url = f"issues/{self.pr_number}/comments"
+
+        logging.info(f"Commenting {comment} on {url}")
+
+        if self.author not in self.ALLOWLIST_USERS:
+            logging.info(f"Skipping comment for author {self.author}")
+            return
+
+        existing_comment = self.find_bot_comment()
+        if existing_comment is None:
+            # Comment does not exist, post it
+            r = self.github.post(url, data)
+        else:
+            # Comment does exist, update it
+            comment_url = f"issues/comments/{existing_comment['databaseId']}"
+            r = self.github.patch(comment_url, data)
+
+        logging.info(f"Got response from posting comment: {r}")
+
+    def start_key(self, key: str) -> str:
+        return f"<!--bot-comment-{key}-start-->"
+
+    def end_key(self, key: str) -> str:
+        return f"<!--bot-comment-{key}-end-->"
+
+    def post_items(self, items: List[Tuple[str, str]]):
+        """
+        Update or post bullet points in the PR based on 'items' which is a
+        list of (key, text) pairs
+        """
+        # Find the existing bullet points
+        body_items = self.find_existing_body()
+
+        # Add or update the requested items
+        for key, text in items:
+            if text is None or text.strip() == "":
+                logging.info(f"Skipping {key} since it was empty")
+                continue
+            logging.info(f"Updating comment items {key} with {text}")
+            body_items[key] = text.strip()
+
+        # Post or update the comment
+        # print(body_items)
+        self._post_comment(body_items=body_items)
diff --git a/ci/scripts/github_docs_comment.py b/ci/scripts/github_docs_comment.py
index 64377b632c48..0a29dde2038a 100755
--- a/ci/scripts/github_docs_comment.py
+++ b/ci/scripts/github_docs_comment.py
@@ -16,29 +16,19 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
-import logging
-import argparse
-import sys
-from urllib import error
-
-from git_utils import git, GitHubRepo, parse_remote
-from cmd_utils import init_log
-
-DOCS_BOT_MARKER = "<!---docs-bot-comment-->\n\n"
-GITHUB_ACTIONS_BOT_LOGIN = "github-actions[bot]"
+from typing import Dict, Any
 
 
 def build_docs_url(base_url_docs, pr_number, build_number):
     return f"{base_url_docs}/PR-{str(pr_number)}/{str(build_number)}/docs/index.html"
 
 
-def get_pr_comments(github, url):
-    try:
-        return github.get(url)
-    except error.HTTPError as e:
-        logging.exception(f"Failed to retrieve PR comments: {url}: {e}")
-        return []
+def find_target_url(pr_head: Dict[str, Any]):
+    for status in pr_head["statusCheckRollup"]["contexts"]["nodes"]:
+        if status.get("context", "") == "tvm-ci/pr-head":
+            return status["targetUrl"]
+
+    raise RuntimeError(f"Unable to find tvm-ci/pr-head status in {pr_head}")
 
 
 def get_pr_and_build_numbers(target_url):
@@ -49,62 +39,15 @@ def get_pr_and_build_numbers(target_url):
     return {"pr_number": pr_number, "build_number": build_number}
 
 
-def search_for_docs_comment(comments):
-    for comment in comments:
-        if (
-            comment["user"]["login"] == GITHUB_ACTIONS_BOT_LOGIN
-            and DOCS_BOT_MARKER in comment["body"]
-        ):
-            return comment
-    return None
-
-
-if __name__ == "__main__":
-    help = "Add comment with link to docs"
-    parser = argparse.ArgumentParser(description=help)
-    parser.add_argument("--remote", default="origin", help="ssh remote to parse")
-    parser.add_argument("--base-url-docs", default="https://pr-docs.tlcpack.ai")
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        default=False,
-        help="run but don't send any request to GitHub",
-    )
-    args = parser.parse_args()
-    init_log()
-
-    remote = git(["config", "--get", f"remote.{args.remote}.url"])
-    user, repo = parse_remote(remote)
-
-    target_url = os.environ["TARGET_URL"]
+def get_doc_url(pr: Dict[str, Any], base_docs_url: str = "https://pr-docs.tlcpack.ai") -> str:
+    pr_head = pr["commits"]["nodes"][0]["commit"]
+    target_url = find_target_url(pr_head)
     pr_and_build = get_pr_and_build_numbers(target_url)
 
-    commit_sha = os.environ["COMMIT_SHA"]
+    commit_sha = pr_head["oid"]
 
     docs_url = build_docs_url(
-        args.base_url_docs, pr_and_build["pr_number"], pr_and_build["build_number"]
+        base_docs_url, pr_and_build["pr_number"], pr_and_build["build_number"]
     )
 
-    url = f'issues/{pr_and_build["pr_number"]}/comments'
-    body = f"{DOCS_BOT_MARKER}Built docs for commit {commit_sha} can be found [here]({docs_url})."
-    if not args.dry_run:
-        github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo)
-
-        # For now, only comment for PRs open by driazati, gigiblender and areusch.
-        get_pr_url = f'pulls/{pr_and_build["pr_number"]}'
-        pull_request_body = github.get(get_pr_url)
-        author = pull_request_body["user"]["login"]
-        if author not in ["driazati", "gigiblender", "areusch"]:
-            logging.info(f"Skipping this action for user {author}")
-            sys.exit(0)
-
-        pr_comments = get_pr_comments(github, url)
-        comment = search_for_docs_comment(pr_comments)
-
-        if comment is not None:
-            comment_url = comment["url"]
-            github.patch(comment_url, {"body": body})
-        else:
-            github.post(url, {"body": body})
-    else:
-        logging.info(f"Dry run, would have posted {url} with data {body}.")
+    return f"Built docs for commit {commit_sha} can be found [here]({docs_url})."
diff --git a/ci/scripts/github_pr_comment.py b/ci/scripts/github_pr_comment.py
new file mode 100755
index 000000000000..bcf4c5096ab0
--- /dev/null
+++ b/ci/scripts/github_pr_comment.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import argparse
+import os
+import json
+
+from git_utils import git, GitHubRepo, parse_remote, DRY_RUN
+from cmd_utils import init_log
+from github_commenter import BotCommentBuilder
+from github_skipped_tests_comment import get_skipped_tests_comment
+from github_tag_teams import get_tags
+from github_docs_comment import get_doc_url
+
+PR_QUERY = """
+    query ($owner: String!, $name: String!, $number: Int!) {
+      repository(owner: $owner, name: $name) {
+        pullRequest(number: $number) {
+          title
+          body
+          state
+          number
+          author {
+            login
+          }
+          labels(first:100) {
+            nodes {
+              name
+            }
+          }
+          comments(last: 100) {
+            pageInfo {
+              hasPreviousPage
+            }
+            nodes {
+              author {
+                login
+              }
+              databaseId
+              body
+            }
+          }
+          commits(last: 1) {
+            nodes {
+              commit {
+                oid
+                statusCheckRollup {
+                  contexts(first: 100) {
+                    pageInfo {
+                      hasNextPage
+                    }
+                    nodes {
+                      ... on StatusContext {
+                        state
+                        context
+                        targetUrl
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+"""
+
+
+if __name__ == "__main__":
+    help = "Comment a welcome message on PRs"
+    parser = argparse.ArgumentParser(description=help)
+    parser.add_argument("--remote", default="origin", help="ssh remote to parse")
+    parser.add_argument("--pr", required=True)
+    parser.add_argument("--test-data", help="(testing) mock GitHub API data")
+    parser.add_argument("--test-comments", help="(testing) testing comments")
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        default=False,
+        help="run but don't send any request to GitHub",
+    )
+    args = parser.parse_args()
+    init_log()
+
+    remote = git(["config", "--get", f"remote.{args.remote}.url"])
+    user, repo = parse_remote(remote)
+
+    test_data = None
+    if args.test_data is not None:
+        test_data = json.loads(args.test_data)
+
+    github = GitHubRepo(
+        user=user,
+        repo=repo,
+        token=DRY_RUN if args.dry_run else os.environ["GITHUB_TOKEN"],
+        test_data=test_data,
+    )
+
+    pr_data = github.graphql(
+        PR_QUERY,
+        {
+            "owner": user,
+            "name": repo,
+            "number": int(args.pr),
+        },
+    )
+
+    pr_data = pr_data["data"]["repository"]["pullRequest"]
+    commenter = BotCommentBuilder(github=github, data=pr_data)
+
+    if args.test_comments is not None:
+        test_comments = json.loads(args.test_comments)
+        skipped_tests = test_comments["skipped-tests"]
+        ccs = test_comments["ccs"]
+        docs_info = test_comments["docs"]
+    else:
+        skipped_tests = get_skipped_tests_comment(pr_data, github=github)
+        ccs = get_tags(pr_data, github, team_issue=10317)
+        docs_info = get_doc_url(pr_data)
+
+    items = {
+        "ccs": ccs,
+        "skipped-tests": skipped_tests,
+        "docs": docs_info,
+    }
+    commenter.post_items(items=items.items())
diff --git a/ci/scripts/github_skipped_tests_comment.py b/ci/scripts/github_skipped_tests_comment.py
index c07fbf4a8463..7a62f16a5b81 100755
--- a/ci/scripts/github_skipped_tests_comment.py
+++ b/ci/scripts/github_skipped_tests_comment.py
@@ -15,23 +15,12 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import json
 import os
 import logging
-import argparse
 import subprocess
-import sys
-from urllib import error
 from xml.etree import ElementTree
-
-from git_utils import git, GitHubRepo, parse_remote
-from cmd_utils import init_log
-
-SKIPPED_TESTS_COMMENT_MARKER = "<!---skipped-tests-comment-->\n\n"
-GITHUB_ACTIONS_BOT_LOGIN = "github-actions[bot]"
-
-PR_TEST_REPORT_DIR = "pr-reports"
-MAIN_TEST_REPORT_DIR = "main-reports"
+from pathlib import Path
+from typing import Dict, Any, Optional
 
 
 def run_subprocess(command):
@@ -43,7 +32,7 @@ def run_subprocess(command):
 
 
 def retrieve_test_report(s3_url, target_dir):
-    command = f"aws s3 cp {s3_url} {target_dir} --recursive"
+    command = f"aws --region us-west-2 s3 cp {s3_url} {target_dir} --recursive --no-sign-request"
     run_subprocess(command)
 
 
@@ -70,14 +59,16 @@ def get_main_jenkins_build_number(github, common_commit):
     raise RuntimeError(f"Failed to find main build number for commit {common_commit}")
 
 
-def retrieve_test_reports(common_main_build, pr_number, build_number, s3_prefix):
+def retrieve_test_reports(
+    common_main_build, pr_number, build_number, s3_prefix, pr_test_report_dir, main_test_report_dir
+):
     cur_build_s3_link = (
         f"s3://{s3_prefix}/tvm/PR-{str(pr_number)}/{str(build_number)}/pytest-results"
     )
-    retrieve_test_report(cur_build_s3_link, PR_TEST_REPORT_DIR)
+    retrieve_test_report(cur_build_s3_link, pr_test_report_dir)
 
     common_build_s3_link = f"s3://{s3_prefix}/tvm/main/{common_main_build}/pytest-results"
-    retrieve_test_report(common_build_s3_link, MAIN_TEST_REPORT_DIR)
+    retrieve_test_report(common_build_s3_link, main_test_report_dir)
 
 
 def get_pr_and_build_numbers(target_url):
@@ -89,15 +80,16 @@ def get_pr_and_build_numbers(target_url):
 
 
 def build_test_set(directory):
+    directory = Path(directory)
     subdir_to_skipped = {}
     subdirs = [
         item for item in os.listdir(directory) if os.path.isdir(os.path.join(directory, item))
     ]
     for subdir in subdirs:
         subdir_to_skipped[subdir] = set()
-        for root, _, files in os.walk(directory + "/" + subdir):
+        for root, _, files in os.walk(directory / subdir):
             for file in files:
-                test_report = ElementTree.parse(root + "/" + file)
+                test_report = ElementTree.parse(Path(root) / file)
                 for testcase in test_report.iter("testcase"):
                     skipped = testcase.find("skipped")
                     if skipped is not None:
@@ -120,13 +112,13 @@ def build_comment(
     jenkins_prefix,
 ):
     if common_main_build["state"] != "success":
-        return f"{SKIPPED_TESTS_COMMENT_MARKER}Unable to run tests bot because main failed to pass CI at {common_commit_sha}."
+        return f"Unable to run tests bot because main failed to pass CI at {common_commit_sha}."
 
     if len(skipped_list) == 0:
-        return f"{SKIPPED_TESTS_COMMENT_MARKER}No additional skipped tests found in this branch for commit {commit_sha}."
+        return f"No additional skipped tests found in this branch for commit {commit_sha}."
 
     text = (
-        f"{SKIPPED_TESTS_COMMENT_MARKER}The list below shows some tests that ran in main {common_commit_sha} but were "
+        f"The list below shows some tests that ran in main {common_commit_sha} but were "
         f"skipped in the CI build of {commit_sha}:\n"
         f"```\n"
     )
@@ -139,68 +131,51 @@ def build_comment(
     return text
 
 
-def get_pr_comments(github, url):
-    try:
-        return github.get(url)
-    except error.HTTPError as e:
-        logging.exception(f"Failed to retrieve PR comments: {url}: {e}")
-        return []
+def find_target_url(pr_head: Dict[str, Any]):
+    for status in pr_head["statusCheckRollup"]["contexts"]["nodes"]:
+        if status.get("context", "") == "tvm-ci/pr-head":
+            return status["targetUrl"]
 
+    raise RuntimeError(f"Unable to find tvm-ci/pr-head status in {pr_head}")
 
-def search_for_docs_comment(comments):
-    for comment in comments:
-        if (
-            comment["user"]["login"] == GITHUB_ACTIONS_BOT_LOGIN
-            and SKIPPED_TESTS_COMMENT_MARKER in comment["body"]
-        ):
-            return comment
-    return None
 
-
-if __name__ == "__main__":
-    help = (
-        "Compares the skipped tests of this PR against the last successful build on main. Also comments on the PR "
-        "issue when tests are skipped in this PR and not on main."
-    )
-    parser = argparse.ArgumentParser(description=help)
-    parser.add_argument("--remote", default="origin", help="ssh remote to parse")
-    parser.add_argument("--s3-prefix", default="tvm-jenkins-artifacts-prod")
-    parser.add_argument("--jenkins-prefix", default="ci.tlcpack.ai")
-    parser.add_argument("--common-main-build")
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        default=False,
-        help="run but don't send any request to GitHub",
-    )
-    args = parser.parse_args()
-    init_log()
-
-    remote = git(["config", "--get", f"remote.{args.remote}.url"])
-    user, repo = parse_remote(remote)
-
-    target_url = os.environ["TARGET_URL"]
+def get_skipped_tests_comment(
+    pr: Dict[str, Any],
+    github,
+    s3_prefix: str = "tvm-jenkins-artifacts-prod",
+    jenkins_prefix: str = "ci.tlcpack.ai",
+    pr_test_report_dir: str = "pr-reports",
+    main_test_report_dir: str = "main-reports",
+    common_commit_sha: Optional[str] = None,
+    common_main_build: Optional[Dict[str, Any]] = None,
+) -> str:
+    pr_head = pr["commits"]["nodes"][0]["commit"]
+    target_url = find_target_url(pr_head)
     pr_and_build = get_pr_and_build_numbers(target_url)
+    logging.info(f"Getting comment for {pr_head} with target {target_url}")
+
+    commit_sha = pr_head["oid"]
 
-    commit_sha = os.environ["COMMIT_SHA"]
+    is_dry_run = common_commit_sha is not None
 
-    if not args.dry_run:
-        github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo)
+    if not is_dry_run:
+        logging.info("Fetching common commit sha and build info")
         common_commit_sha = get_common_commit_sha()
         common_main_build = get_main_jenkins_build_number(github, common_commit_sha)
+
         retrieve_test_reports(
             common_main_build=common_main_build["build_number"],
             pr_number=pr_and_build["pr_number"],
             build_number=pr_and_build["build_number"],
-            s3_prefix=args.s3_prefix,
+            s3_prefix=s3_prefix,
+            main_test_report_dir=main_test_report_dir,
+            pr_test_report_dir=pr_test_report_dir,
         )
     else:
-        assert args.common_main_build is not None
-        common_main_build = json.loads(args.common_main_build)
-        common_commit_sha = os.environ["COMMIT_SHA"]
+        logging.info("Dry run, expecting PR and main reports on disk")
 
-    main_tests = build_test_set(MAIN_TEST_REPORT_DIR)
-    build_tests = build_test_set(PR_TEST_REPORT_DIR)
+    main_tests = build_test_set(main_test_report_dir)
+    build_tests = build_test_set(pr_test_report_dir)
 
     skipped_list = []
     for subdir, skipped_set in build_tests.items():
@@ -227,28 +202,7 @@ def search_for_docs_comment(comments):
         pr_and_build["pr_number"],
         pr_and_build["build_number"],
         commit_sha,
-        args.jenkins_prefix,
+        jenkins_prefix,
     )
-    url = f'issues/{pr_and_build["pr_number"]}/comments'
-    if not args.dry_run:
-        # For now, only comment for PRs open by driazati, gigiblender and areusch.
-        get_pr_url = f'pulls/{pr_and_build["pr_number"]}'
-        pull_request_body = github.get(get_pr_url)
-        author = pull_request_body["user"]["login"]
-        if author not in ["driazati", "gigiblender", "areusch"]:
-            logging.info(f"Skipping this action for user {author}")
-            sys.exit(0)
-
-        pr_comments = get_pr_comments(github, url)
-        comment = search_for_docs_comment(pr_comments)
-
-        if comment is not None:
-            comment_url = comment["url"]
-            comment_id = comment_url[comment_url.find("comments/") : len(comment_url)].strip(
-                "comments/"
-            )
-            github.patch(f"issues/comments/{comment_id}", {"body": body})
-        else:
-            github.post(url, {"body": body})
-    else:
-        logging.info(f"Dry run, would have posted {url} with data {body}.")
+
+    return body
diff --git a/ci/scripts/github_tag_teams.py b/ci/scripts/github_tag_teams.py
index 4f03b4f71aea..fd63070db1ba 100755
--- a/ci/scripts/github_tag_teams.py
+++ b/ci/scripts/github_tag_teams.py
@@ -19,13 +19,13 @@
 import os
 import json
 import argparse
+import logging
 import re
-from urllib import error
-from typing import Dict, Any, List, Tuple
+from typing import Dict, Any, List, Tuple, Optional
 
 
-from git_utils import git, GitHubRepo, parse_remote, find_ccs
-from cmd_utils import tags_from_title
+from git_utils import git, GitHubRepo, parse_remote, find_ccs, dry_run_token
+from cmd_utils import tags_from_title, init_log
 
 
 GITHUB_NAME_REGEX = r"@[a-zA-Z0-9-]+"
@@ -168,6 +168,51 @@ def gen_cc_line(users):
     return "\n".join(lines)
 
 
+def determine_users_to_cc(
+    issue: Dict[str, Any], github: GitHubRepo, team_issue: str, issue_data: Optional[Dict[str, Any]]
+) -> List[str]:
+    if issue_data is None:
+        issue_data = fetch_issue(github, issue_number=int(team_issue))
+
+    # Fetch the list of teams
+    teams = parse_teams(issue_data, issue_number=int(team_issue))
+
+    logging.info(f"Found these teams in issue #{team_issue}\n{json.dumps(teams, indent=2)}")
+
+    title = issue["title"]
+    if "author" in issue:
+        author = issue["author"]["login"]
+    else:
+        author = issue["user"]["login"]
+    tags = tags_from_title(title)
+    if isinstance(issue["labels"], dict):
+        tags += tags_from_labels(issue["labels"]["nodes"])
+    else:
+        tags += tags_from_labels(issue["labels"])
+
+    tags = [t.lower() for t in tags]
+    logging.info(f"Found tags: {tags}")
+
+    # Update the PR or issue based on tags in the title and GitHub tags
+    to_cc = [teams.get(t, []) for t in tags]
+    to_cc = list(set(item for sublist in to_cc for item in sublist))
+    to_cc = [user for user in to_cc if user != author]
+    return to_cc
+
+
+def get_tags(pr_data: Dict[str, Any], github: GitHubRepo, team_issue: int) -> str:
+    to_cc = determine_users_to_cc(
+        issue=pr_data, github=github, team_issue=team_issue, issue_data=None
+    )
+
+    logging.info(f"Users to cc based on labels: {to_cc}")
+    description = "<sub>See [#10317](https://github.com/apache/tvm/issues/10317) for details</sub>"
+    if len(to_cc) == 0:
+        return "No users to tag found in teams " + description
+
+    return "cc " + ", ".join([f"@{user}" for user in to_cc]) + " " + description
+
+
 if __name__ == "__main__":
     help = "Automatically tag people based on PR / issue labels"
     parser = argparse.ArgumentParser(description=help)
@@ -183,21 +228,17 @@ def gen_cc_line(users):
         help="run but don't send any request to GitHub",
     )
     args = parser.parse_args()
+    init_log()
 
     remote = git(["config", "--get", f"remote.{args.remote}.url"])
     user, repo = parse_remote(remote)
 
+    github = GitHubRepo(token=dry_run_token(args.dry_run), user=user, repo=repo)
     if args.team_issue_json:
         issue_data = json.loads(args.team_issue_json)
     else:
-        github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo)
         issue_data = fetch_issue(github, issue_number=int(args.team_issue))
 
-    # Fetch the list of teams
-    teams = parse_teams(issue_data, issue_number=int(args.team_issue))
-
-    print(f"Found these teams in issue #{args.team_issue}\n{json.dumps(teams, indent=2)}")
-
     # Extract the payload from GitHub Actions
     issue = json.loads(os.getenv("ISSUE", "null"))
     pr = json.loads(os.getenv("PR", "null"))
@@ -213,33 +254,27 @@ def gen_cc_line(users):
     item = issue if issue is not None else pr
     title = item["title"]
     body = item["body"]
-    author = item["user"]["login"]
-    tags = tags_from_title(item["title"]) + tags_from_labels(item["labels"])
 
-    tags = [t.lower() for t in tags]
-    print(f"Found tags: {tags}")
-
-    # Update the PR or issue based on tags in the title and GitHub tags
-    to_cc = [teams.get(t, []) for t in tags]
-    to_cc = list(set(item for sublist in to_cc for item in sublist))
-    to_cc = [user for user in to_cc if user != author]
+    to_cc = determine_users_to_cc(
+        issue=item, github=github, team_issue=args.team_issue, issue_data=issue_data
+    )
     existing_tags = list(set(re.findall(GITHUB_NAME_REGEX, body)))
     existing_tags = set(tag.replace("@", "") for tag in existing_tags)
-    print(f"Found existing tags: {existing_tags}")
+    logging.info(f"Found existing tags: {existing_tags}")
     to_cc = [user for user in to_cc if user not in existing_tags]
-    print("Users to cc based on labels", to_cc)
+    logging.info("Users to cc based on labels", to_cc)
 
     # Create the new PR/issue body
     if len(to_cc) == 0:
-        print("No one to cc, exiting")
+        logging.info("No one to cc, exiting")
         exit(0)
 
     new_body = add_ccs_to_body(body, to_cc)
     if new_body is None:
-        print(f"Everyone to cc is already cc'ed, no update needed")
+        logging.info(f"Everyone to cc is already cc'ed, no update needed")
         exit(0)
 
-    print(f"Changing body from:\n----\n{body}\n----\nto:\n----\n{new_body}\n----")
+    logging.info(f"Changing body from:\n----\n{body}\n----\nto:\n----\n{new_body}\n----")
 
     # Set the PR/issue body on GitHub
     data = {"body": new_body}
@@ -255,4 +290,4 @@ def gen_cc_line(users):
     if not args.dry_run:
         github.post(url, data=data)
     else:
-        print(f"Dry run, would have updated {url} with {data}")
+        logging.info(f"Dry run, would have updated {url} with {data}")
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index 6c25694cfc74..4b8c5d9ad444 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -19,12 +19,23 @@
 import subprocess
 import json
 import textwrap
+import sys
+import logging
 from pathlib import Path
 
 import pytest
 import tvm.testing
+
 from .test_utils import REPO_ROOT, TempGit, run_script
 
+# pylint: disable=wrong-import-position,wrong-import-order
+sys.path.insert(0, str(REPO_ROOT / "ci"))
+sys.path.insert(0, str(REPO_ROOT / "ci" / "scripts"))
+
+import scripts
+
+# pylint: enable=wrong-import-position,wrong-import-order
+
 
 def parameterize_named(**kwargs):
     keys = next(iter(kwargs.values())).keys()
@@ -71,9 +82,8 @@ def parameterize_named(**kwargs):
         "s3_prefix": "tvm-jenkins-artifacts-prod",
         "jenkins_prefix": "ci.tlcpack.ai",
         "common_main_build": """{"build_number": "4115", "state": "success"}""",
-        "commit_sha": "SHA",
-        "expected_url": "issues/11594/comments",
-        "expected_body": """<!---skipped-tests-comment-->\n\nThe list below shows some tests that ran in main SHA but were skipped in the CI build of SHA:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).""",
+        "commit_sha": "sha1234",
+        "expected_body": "The list below shows some tests that ran in main sha1234 but were skipped in the CI build of sha1234:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).",
     },
     "no-diff": {
         "main_xml_file": "unittest/file1.xml",
@@ -108,9 +118,8 @@ def parameterize_named(**kwargs):
         "s3_prefix": "tvm-jenkins-artifacts-prod",
         "jenkins_prefix": "ci.tlcpack.ai",
         "common_main_build": """{"build_number": "4115", "state": "success"}""",
-        "commit_sha": "SHA",
-        "expected_url": "issues/11594/comments",
-        "expected_body": """<!---skipped-tests-comment-->\n\nNo additional skipped tests found in this branch for commit SHA.""",
+        "commit_sha": "sha1234",
+        "expected_body": "No additional skipped tests found in this branch for commit sha1234.",
     },
     "unable-to-run": {
         "main_xml_file": "unittest/file1.xml",
@@ -127,9 +136,8 @@ def parameterize_named(**kwargs):
         "s3_prefix": "tvm-jenkins-artifacts-prod",
         "jenkins_prefix": "ci.tlcpack.ai",
         "common_main_build": """{"build_number": "4115", "state": "failed"}""",
-        "commit_sha": "SHA",
-        "expected_url": "issues/11594/comments",
-        "expected_body": """<!---skipped-tests-comment-->\n\nUnable to run tests bot because main failed to pass CI at SHA.""",
+        "commit_sha": "sha1234",
+        "expected_body": "Unable to run tests bot because main failed to pass CI at sha1234.",
     },
 }
 # pylint: enable=line-too-long
@@ -139,6 +147,7 @@ def parameterize_named(**kwargs):
 @parameterize_named(**TEST_DATA_SKIPPED_BOT)
 # pylint: enable=line-too-long
 def test_skipped_tests_comment(
+    caplog,
     tmpdir_factory,
     main_xml_file,
     main_xml_content,
@@ -149,13 +158,11 @@ def test_skipped_tests_comment(
     jenkins_prefix,
     common_main_build,
     commit_sha,
-    expected_url,
     expected_body,
 ):
     """
     Test that a comment with a link to the docs is successfully left on PRs
     """
-    skipped_tests_script = REPO_ROOT / "ci" / "scripts" / "github_skipped_tests_comment.py"
 
     def write_xml_file(root_dir, xml_file, xml_content):
         shutil.rmtree(root_dir, ignore_errors=True)
@@ -165,25 +172,45 @@ def write_xml_file(root_dir, xml_file, xml_content):
             f.write(textwrap.dedent(xml_content))
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-
     pr_test_report_dir = Path(git.cwd) / "pr-reports"
     write_xml_file(pr_test_report_dir, pr_xml_file, pr_xml_content)
     main_test_report_dir = Path(git.cwd) / "main-reports"
     write_xml_file(main_test_report_dir, main_xml_file, main_xml_content)
 
-    proc = run_script(
-        [
-            skipped_tests_script,
-            "--dry-run",
-            f"--s3-prefix={s3_prefix}",
-            f"--jenkins-prefix={jenkins_prefix}",
-            f"--common-main-build={common_main_build}",
-        ],
-        env={"TARGET_URL": target_url, "COMMIT_SHA": commit_sha},
-        cwd=git.cwd,
-    )
-
-    assert_in(f"Dry run, would have posted {expected_url} with data {expected_body}.", proc.stderr)
+    pr_data = {
+        "commits": {
+            "nodes": [
+                {
+                    "commit": {
+                        "oid": commit_sha,
+                        "statusCheckRollup": {
+                            "contexts": {
+                                "nodes": [
+                                    {
+                                        "context": "tvm-ci/pr-head",
+                                        "targetUrl": target_url,
+                                    }
+                                ]
+                            }
+                        },
+                    }
+                }
+            ]
+        }
+    }
+    with caplog.at_level(logging.INFO):
+        comment = scripts.github_skipped_tests_comment.get_skipped_tests_comment(
+            pr=pr_data,
+            github=None,
+            s3_prefix=s3_prefix,
+            jenkins_prefix=jenkins_prefix,
+            common_commit_sha=commit_sha,
+            pr_test_report_dir=pr_test_report_dir,
+            main_test_report_dir=main_test_report_dir,
+            common_main_build=json.loads(common_main_build),
+        )
+    assert_in(expected_body, comment)
+    assert_in(f"with target {target_url}", caplog.text)
 
 
 @tvm.testing.skip_if_wheel_test
@@ -192,27 +219,40 @@ def write_xml_file(root_dir, xml_file, xml_content):
         target_url="https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
         base_url="https://pr-docs.tlcpack.ai",
         commit_sha="SHA",
-        expected_url="issues/11594/comments",
-        expected_body="<!---docs-bot-comment-->\n\nBuilt docs for commit SHA can be found "
+        expected_body="Built docs for commit SHA can be found "
         "[here](https://pr-docs.tlcpack.ai/PR-11594/3/docs/index.html).",
     )
 )
-def test_docs_comment(
-    tmpdir_factory, target_url, base_url, commit_sha, expected_url, expected_body
-):
+def test_docs_comment(target_url, base_url, commit_sha, expected_body):
     """
     Test that a comment with a link to the docs is successfully left on PRs
     """
-    docs_comment_script = REPO_ROOT / "ci" / "scripts" / "github_docs_comment.py"
-
-    git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
-    proc = run_script(
-        [docs_comment_script, "--dry-run", f"--base-url-docs={base_url}"],
-        env={"TARGET_URL": target_url, "COMMIT_SHA": commit_sha},
-        cwd=git.cwd,
+    pr_data = {
+        "commits": {
+            "nodes": [
+                {
+                    "commit": {
+                        "oid": commit_sha,
+                        "statusCheckRollup": {
+                            "contexts": {
+                                "nodes": [
+                                    {
+                                        "context": "tvm-ci/pr-head",
+                                        "targetUrl": target_url,
+                                    }
+                                ]
+                            }
+                        },
+                    }
+                }
+            ]
+        }
+    }
+    comment = scripts.github_docs_comment.get_doc_url(
+        pr=pr_data,
+        base_docs_url=base_url,
     )
-
-    assert_in(f"Dry run, would have posted {expected_url} with data {expected_body}.", proc.stderr)
+    assert_in(expected_body, comment)
 
 
 @tvm.testing.skip_if_wheel_test
@@ -385,6 +425,149 @@ def test_update_branch(tmpdir_factory, statuses, expected_rc, expected_output):
         )
 
 
+# pylint: disable=line-too-long
+@parameterize_named(
+    author_gate=dict(
+        pr_author="abc",
+        comments=[],
+        expected="Skipping comment for author abc",
+    ),
+    new_comment=dict(
+        pr_author="driazati",
+        comments=[],
+        expected="No existing comment found",
+    ),
+    update_comment=dict(
+        pr_author="driazati",
+        comments=[
+            {
+                "author": {"login": "github-actions"},
+                "databaseId": "comment456",
+                "body": "<!---bot-comment--> abc",
+            }
+        ],
+        expected="PATCH to https://api.github.com/repos/apache/tvm/issues/comments/comment456",
+    ),
+    new_body=dict(
+        pr_author="driazati",
+        comments=[],
+        expected="Commenting "
+        + textwrap.dedent(
+            """
+        <!---bot-comment-->
+
+        Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment.
+
+        <!--bot-comment-ccs-start-->
+         * the cc<!--bot-comment-ccs-end--><!--bot-comment-skipped-tests-start-->
+         * the skipped tests<!--bot-comment-skipped-tests-end--><!--bot-comment-docs-start-->
+         * the docs<!--bot-comment-docs-end-->
+        """
+        ).strip(),
+    ),
+    update_body=dict(
+        pr_author="driazati",
+        comments=[
+            {
+                "author": {"login": "github-actions"},
+                "databaseId": "comment456",
+                "body": textwrap.dedent(
+                    """
+        <!---bot-comment-->
+
+        Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment.
+
+        <!--bot-comment-ccs-start-->
+         * the cc<!--bot-comment-ccs-end--><!--bot-comment-something-tests-start-->
+         * something else<!--bot-comment-something-tests-end--><!--bot-comment-docs-start-->
+         * the docs<!--bot-comment-docs-end-->
+        """
+                ).strip(),
+            }
+        ],
+        expected="Commenting "
+        + textwrap.dedent(
+            """
+        <!---bot-comment-->
+
+        Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment.
+
+        <!--bot-comment-ccs-start-->
+         * the cc<!--bot-comment-ccs-end--><!--bot-comment-something-tests-start-->
+         * something else<!--bot-comment-something-tests-end--><!--bot-comment-docs-start-->
+         * the docs<!--bot-comment-docs-end--><!--bot-comment-skipped-tests-start-->
+         * the skipped tests<!--bot-comment-skipped-tests-end-->
+        """
+        ).strip(),
+    ),
+)
+# pylint: enable=line-too-long
+def test_pr_comment(tmpdir_factory, pr_author, comments, expected):
+    """
+    Test the PR commenting bot
+    """
+    comment_script = REPO_ROOT / "ci" / "scripts" / "github_pr_comment.py"
+
+    git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
+    target_url = "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect"
+    commit = {
+        "commit": {
+            "oid": "sha1234",
+            "statusCheckRollup": {
+                "contexts": {
+                    "nodes": [
+                        {
+                            "context": "tvm-ci/pr-head",
+                            "targetUrl": target_url,
+                        }
+                    ]
+                }
+            },
+        }
+    }
+    data = {
+        "[1] POST - https://api.github.com/graphql": {},
+        "[2] POST - https://api.github.com/graphql": {
+            "data": {
+                "repository": {
+                    "pullRequest": {
+                        "number": 1234,
+                        "comments": {
+                            "nodes": comments,
+                        },
+                        "author": {
+                            "login": pr_author,
+                        },
+                        "commits": {
+                            "nodes": [commit],
+                        },
+                    }
+                }
+            }
+        },
+    }
+    comments = {
+        "ccs": "the cc",
+        "docs": "the docs",
+        "skipped-tests": "the skipped tests",
+    }
+    proc = run_script(
+        [
+            comment_script,
+            "--dry-run",
+            "--test-data",
+            json.dumps(data),
+            "--test-comments",
+            json.dumps(comments),
+            "--pr",
+            "1234",
+        ],
+        stderr=subprocess.STDOUT,
+        cwd=git.cwd,
+    )
+    assert_in(expected, proc.stdout)
+
+
 @parameterize_named(
     dont_skip_main=dict(
         commands=[],
@@ -873,6 +1056,7 @@ def test_github_tag_teams(tmpdir_factory, source_type, data, check):
             "--team-issue-json",
             json.dumps(teams),
         ],
+        stderr=subprocess.STDOUT,
         cwd=git.cwd,
         env=env,
     )

From aded9d43ba1e798031900911cca4613487db84fe Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 16 Sep 2022 17:01:11 -0500
Subject: [PATCH 192/704] [Testing] Add decorator
 tvm.testing.requires_cuda_compute_version (#12778)

* [Testing] Add decorator tvm.testing.requires_cuda_compute_version

Previously, individual unit tests would call
`tvm.contrib.nvcc.get_target_compute_version` and return early.  This
was repeated boilerplate in many tests, and incorrectly reported a
test as `PASSED` if the required infrastructure wasn't present.

This commit introduces `tvm.testing.requires_cuda_compute_version`, a
decorator that checks the CUDA compute version and applies
`pytest.mark.skipif`.  If required infrastructure isn't present, a
test will be reported as `SKIPPED`.

* requires_cuda_compute_version skips test when no GPU is present
---
 python/tvm/testing/utils.py                   |  44 ++++++
 .../python/unittest/test_tir_ptx_cp_async.py  |   7 +-
 .../python/unittest/test_tir_ptx_ldmatrix.py  |   8 +-
 tests/python/unittest/test_tir_ptx_mma.py     | 146 +++---------------
 tests/python/unittest/test_tir_ptx_mma_sp.py  |  14 +-
 ...est_tir_schedule_tensorize_ldmatrix_mma.py |  13 +-
 6 files changed, 71 insertions(+), 161 deletions(-)

diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 37a27a4213e9..ad1e003d6e3f 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -1058,6 +1058,50 @@ def inner(func):
     return inner
 
 
+def requires_cuda_compute_version(major_version, minor_version=0):
+    """Mark a test as requiring at least a compute architecture
+
+    Unit test marked with this decorator will run only if the CUDA
+    compute architecture of the GPU is at least `(major_version,
+    minor_version)`.
+
+    This also marks the test as requiring a cuda support.
+
+    Parameters
+    ----------
+    major_version: int
+
+        The major version of the (major,minor) version tuple.
+
+    minor_version: int
+
+        The minor version of the (major,minor) version tuple.
+    """
+    min_version = (major_version, minor_version)
+    try:
+        arch = tvm.contrib.nvcc.get_target_compute_version()
+        compute_version = tvm.contrib.nvcc.parse_compute_version(arch)
+    except ValueError:
+        # No GPU present.  This test will be skipped from the
+        # requires_cuda() marks as well.
+        compute_version = (0, 0)
+
+    min_version_str = ".".join(str(v) for v in min_version)
+    compute_version_str = ".".join(str(v) for v in compute_version)
+    requires = [
+        pytest.mark.skipif(
+            compute_version < min_version,
+            reason=f"Requires CUDA compute >= {min_version_str}, but have {compute_version_str}",
+        ),
+        *requires_cuda.marks(),
+    ]
+
+    def inner(func):
+        return _compose([func], requires)
+
+    return inner
+
+
 def skip_if_32bit(reason):
     def decorator(*args):
         if "32bit" in platform.architecture()[0]:
diff --git a/tests/python/unittest/test_tir_ptx_cp_async.py b/tests/python/unittest/test_tir_ptx_cp_async.py
index 5e6535f295cb..dc521f3c471a 100644
--- a/tests/python/unittest/test_tir_ptx_cp_async.py
+++ b/tests/python/unittest/test_tir_ptx_cp_async.py
@@ -47,14 +47,9 @@ def ptx_cp_async(A: T.Buffer[(32, 128), "float16"], B: T.Buffer[(32, 128), "floa
             B[tx, i] = A_shared[tx, i]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_ptx_cp_async():
     f = ptx_cp_async
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
 
     mod = tvm.build(f, target="cuda")
     A_np = np.random.rand(32, 128).astype("float16")
diff --git a/tests/python/unittest/test_tir_ptx_ldmatrix.py b/tests/python/unittest/test_tir_ptx_ldmatrix.py
index f718082ff8a1..f652be442133 100644
--- a/tests/python/unittest/test_tir_ptx_ldmatrix.py
+++ b/tests/python/unittest/test_tir_ptx_ldmatrix.py
@@ -56,15 +56,11 @@ def ptx_ldmatrix(
                     B[8 * j + tx // 4, 8 * k + (tx % 4) * 2 + i] = A_local[4 * k + 2 * j + i]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(7, 5)
 def test_ptx_ldmatrix():
     f = ptx_ldmatrix
     _, _, param_num, param_trans = f.params
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major * 10 + minor < 75:
-        # Require at least SM75
-        return
+
     for num in [1, 2, 4]:
         for trans in [False, True]:
             mod = tvm.build(f.specialize({param_num: num, param_trans: trans}), target="cuda")
diff --git a/tests/python/unittest/test_tir_ptx_mma.py b/tests/python/unittest/test_tir_ptx_mma.py
index bee9b7b48020..cc9eec3a69d7 100644
--- a/tests/python/unittest/test_tir_ptx_mma.py
+++ b/tests/python/unittest/test_tir_ptx_mma.py
@@ -66,14 +66,9 @@ def gemm_mma_m8n8k4_row_col_fp64pf64fp64(a: T.handle, b: T.handle, c: T.handle):
         C[(tx % 32) // 4, (tx % 32) % 4 * 2 + mma_accum_c_id] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m8n8k4_row_col_fp64pf64fp64():
     sch = tvm.tir.Schedule(gemm_mma_m8n8k4_row_col_fp64pf64fp64)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-1, 1, [8, 4]).astype("float64")
@@ -147,14 +142,9 @@ def gemm_mma_m8n8k4_row_row_fp16fp16fp16(a: T.handle, b: T.handle, c: T.handle):
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(7)
 def test_gemm_mma_m8n8k4_row_row_fp16fp16fp16():
     sch = tvm.tir.Schedule(gemm_mma_m8n8k4_row_row_fp16fp16fp16)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 7:
-        # Require at least SM70
-        return
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-1, 1, [16, 4]).astype("float16")
@@ -235,14 +225,9 @@ def gemm_mma_m8n8k4_row_row_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle):
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(7)
 def test_gemm_mma_m8n8k4_row_row_fp16fp16fp32():
     sch = tvm.tir.Schedule(gemm_mma_m8n8k4_row_row_fp16fp16fp32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 7:
-        # Require at least SM70
-        return
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-1, 1, [16, 4]).astype("float16")
@@ -311,14 +296,9 @@ def gemm_mma_m8n8k16_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle):
 # Failure occurs during the external call to nvcc, when attempting to
 # generate the .fatbin file.
 @tvm.testing.requires_nvcc_version(11)
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(7, 5)
 def test_gemm_mma_m8n8k16_row_col_s8s8s32():
     sch = tvm.tir.Schedule(gemm_mma_m8n8k16_row_col_s8s8s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major * 10 + minor < 75:
-        # Require at least SM75
-        return
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-10, 10, [8, 16]).astype("int8")
@@ -387,14 +367,9 @@ def gemm_mma_m8n8k16_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle):
 # Failure occurs during the external call to nvcc, when attempting to
 # generate the .fatbin file.
 @tvm.testing.requires_nvcc_version(11)
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(7, 5)
 def test_gemm_mma_m8n8k16_row_col_s8u8s32():
     sch = tvm.tir.Schedule(gemm_mma_m8n8k16_row_col_s8u8s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major * 10 + minor < 75:
-        # Require at least SM75
-        return
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-10, 10, [8, 16]).astype("int8")
@@ -463,14 +438,9 @@ def gemm_mma_m8n8k32_row_col_s4s4s32(a: T.handle, b: T.handle, c: T.handle):
 # Failure occurs during the external call to nvcc, when attempting to
 # generate the .fatbin file.
 @tvm.testing.requires_nvcc_version(11)
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(7, 5)
 def test_gemm_mma_m8n8k32_row_col_s4s4s32():
     sch = tvm.tir.Schedule(gemm_mma_m8n8k32_row_col_s4s4s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major * 10 + minor < 75:
-        # Require at least SM75
-        return
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     ctx = tvm.cuda()
@@ -531,14 +501,9 @@ def gemm_mma_m8n8k32_row_col_s4u4s32(a: T.handle, b: T.handle, c: T.handle):
 # Failure occurs during the external call to nvcc, when attempting to
 # generate the .fatbin file.
 @tvm.testing.requires_nvcc_version(11)
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(7, 5)
 def test_gemm_mma_m8n8k32_row_col_s4u4s32():
     sch = tvm.tir.Schedule(gemm_mma_m8n8k32_row_col_s4u4s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major * 10 + minor < 75:
-        # Require at least SM75
-        return
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     ctx = tvm.cuda()
@@ -601,14 +566,9 @@ def gemm_mma_m16n8k8_row_col_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle)
         ]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m16n8k8_row_col_fp16fp16fp32():
     sch = tvm.tir.Schedule(gemm_mma_m16n8k8_row_col_fp16fp16fp32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-1, 1, [16, 8]).astype("float16")
@@ -682,15 +642,9 @@ def gemm_mma_m16n8k16_row_col_fp16fp16fp16(a: T.handle, b: T.handle, c: T.handle
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m16n8k16_row_col_fp16fp16fp16():
     sch = tvm.tir.Schedule(gemm_mma_m16n8k16_row_col_fp16fp16fp16)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
-    cuda_mod = tvm.build(sch.mod, target="cuda")
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-1, 1, [16, 16]).astype("float16")
@@ -764,15 +718,9 @@ def gemm_mma_m16n8k16_row_col_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m16n8k16_row_col_fp16fp16fp32():
     sch = tvm.tir.Schedule(gemm_mma_m16n8k16_row_col_fp16fp16fp32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
-    cuda_mod = tvm.build(sch.mod, target="cuda")
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-1, 1, [16, 16]).astype("float16")
@@ -846,15 +794,9 @@ def gemm_mma_m16n8k16_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle):
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m16n8k16_row_col_s8s8s32():
     sch = tvm.tir.Schedule(gemm_mma_m16n8k16_row_col_s8s8s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
-    cuda_mod = tvm.build(sch.mod, target="cuda")
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-10, 10, [16, 16]).astype("int8")
@@ -928,15 +870,9 @@ def gemm_mma_m16n8k16_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle):
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m16n8k16_row_col_s8u8s32():
     sch = tvm.tir.Schedule(gemm_mma_m16n8k16_row_col_s8u8s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
-    cuda_mod = tvm.build(sch.mod, target="cuda")
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-10, 10, [16, 16]).astype("int8")
@@ -1010,15 +946,9 @@ def gemm_mma_m16n8k32_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle):
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m16n8k32_row_col_s8s8s32():
     sch = tvm.tir.Schedule(gemm_mma_m16n8k32_row_col_s8s8s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
-    cuda_mod = tvm.build(sch.mod, target="cuda")
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-10, 10, [16, 32]).astype("int8")
@@ -1092,15 +1022,9 @@ def gemm_mma_m16n8k32_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle):
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m16n8k32_row_col_s8u8s32():
     sch = tvm.tir.Schedule(gemm_mma_m16n8k32_row_col_s8u8s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
-    cuda_mod = tvm.build(sch.mod, target="cuda")
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     A_np = np.random.uniform(-10, 10, [16, 32]).astype("int8")
@@ -1174,15 +1098,9 @@ def gemm_mma_m16n8k64_row_col_s4s4s32(a: T.handle, b: T.handle, c: T.handle):
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m16n8k64_row_col_s4s4s32():
     sch = tvm.tir.Schedule(gemm_mma_m16n8k64_row_col_s4s4s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
-    cuda_mod = tvm.build(sch.mod, target="cuda")
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     ctx = tvm.cuda()
@@ -1248,15 +1166,9 @@ def gemm_mma_m16n8k64_row_col_s4u4s32(a: T.handle, b: T.handle, c: T.handle):
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m16n8k64_row_col_s4u4s32():
     sch = tvm.tir.Schedule(gemm_mma_m16n8k64_row_col_s4u4s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
-    cuda_mod = tvm.build(sch.mod, target="cuda")
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     ctx = tvm.cuda()
@@ -1323,15 +1235,9 @@ def gemm_mma_m16n8k256_row_col_b1b1s32(a: T.handle, b: T.handle, c: T.handle):
         ] = Accum[mma_accum_c_id]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_gemm_mma_m16n8k256_row_col_b1b1s32():
     sch = tvm.tir.Schedule(gemm_mma_m16n8k256_row_col_b1b1s32)
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, minor = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # Require at least SM80
-        return
-    cuda_mod = tvm.build(sch.mod, target="cuda")
     cuda_mod = tvm.build(sch.mod, target="cuda")
 
     ctx = tvm.cuda()
@@ -1345,20 +1251,4 @@ def test_gemm_mma_m16n8k256_row_col_b1b1s32():
 
 
 if __name__ == "__main__":
-    test_gemm_mma_m8n8k4_row_col_fp64pf64fp64()
-    test_gemm_mma_m8n8k4_row_row_fp16fp16fp16()
-    test_gemm_mma_m8n8k4_row_row_fp16fp16fp32()
-    test_gemm_mma_m8n8k16_row_col_s8s8s32()
-    test_gemm_mma_m8n8k16_row_col_s8u8s32()
-    test_gemm_mma_m8n8k32_row_col_s4s4s32()
-    test_gemm_mma_m8n8k32_row_col_s4u4s32()
-    test_gemm_mma_m16n8k8_row_col_fp16fp16fp32()
-    test_gemm_mma_m16n8k16_row_col_fp16fp16fp16()
-    test_gemm_mma_m16n8k16_row_col_fp16fp16fp32()
-    test_gemm_mma_m16n8k16_row_col_s8s8s32()
-    test_gemm_mma_m16n8k16_row_col_s8u8s32()
-    test_gemm_mma_m16n8k32_row_col_s8s8s32()
-    test_gemm_mma_m16n8k32_row_col_s8u8s32()
-    test_gemm_mma_m16n8k64_row_col_s4s4s32()
-    test_gemm_mma_m16n8k64_row_col_s4u4s32()
-    test_gemm_mma_m16n8k256_row_col_b1b1s32()
+    tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_ptx_mma_sp.py b/tests/python/unittest/test_tir_ptx_mma_sp.py
index 24170b4898f9..0b5073864a43 100644
--- a/tests/python/unittest/test_tir_ptx_mma_sp.py
+++ b/tests/python/unittest/test_tir_ptx_mma_sp.py
@@ -255,7 +255,7 @@ def mma_sp_m16n8k32_f16f16f32(a: T.handle, b: T.handle, c: T.handle, _metadata:
         C[i // 2 * 8 + tx // 4, tx % 4 * 2 + i % 2] = accum[i]
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_mma_sp_m16n8k16_f16():
     def get_meta_m16n8k16_half(mask):
         assert mask.shape == (16, 4, 2)
@@ -273,11 +273,6 @@ def get_meta_m16n8k16_half(mask):
     for out_dtype in ["float16", "float32"]:
         func = mma_sp_m16n8k16_f16f16f16 if out_dtype == "float16" else mma_sp_m16n8k16_f16f16f32
         sch = tvm.tir.Schedule(func)
-        arch = tvm.contrib.nvcc.get_target_compute_version()
-        major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-        if major < 8:
-            # Requires SM80+
-            return
         cuda_mod = tvm.build(sch.mod, target="cuda")
 
         A_np = np.random.uniform(-1, 1, [16, 8]).astype("float16")
@@ -297,7 +292,7 @@ def get_meta_m16n8k16_half(mask):
         tvm.testing.assert_allclose(C_tvm.numpy(), C_np, atol=1e-3, rtol=1e-3)
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_mma_sp_m16n8k32_f16():
     def get_meta_m16n8k32_half(mask):
         assert mask.shape == (16, 8, 2)
@@ -317,11 +312,6 @@ def get_meta_m16n8k32_half(mask):
     for out_dtype in ["float16", "float32"]:
         func = mma_sp_m16n8k32_f16f16f16 if out_dtype == "float16" else mma_sp_m16n8k32_f16f16f32
         sch = tvm.tir.Schedule(func)
-        arch = tvm.contrib.nvcc.get_target_compute_version()
-        major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-        if major < 8:
-            # Requires SM80+
-            return
         cuda_mod = tvm.build(sch.mod, target="cuda")
 
         A_np = np.random.uniform(-1, 1, [16, 16]).astype("float16")
diff --git a/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py b/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py
index 32c1625653e5..2eda2b9ec458 100644
--- a/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py
+++ b/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py
@@ -111,9 +111,6 @@ def run_test(
         mma_store_intrin,
     )
 
-    if not tvm.testing.is_ampere_or_newer():
-        return None
-
     f = tvm.build(sch.mod["main"], target="cuda", name="dense")
 
     dev = tvm.device("cuda", 0)
@@ -155,7 +152,7 @@ def run_test(
     return lambda: f.time_evaluator(f.entry_name, dev, number=500)(a, b, c)
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_f16f16f32_m16n16k16():
     def index_map(i, j):
         return (
@@ -212,7 +209,7 @@ def index_map(i, j):
         print("f16f16f32_m16n16k16_trans: %f GFLOPS" % (gflops / (timer().mean)))
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_f16f16f16_m16n16k16():
     def index_map(i, j):
         return (
@@ -269,7 +266,7 @@ def index_map(i, j):
         print("f16f16f16_m16n16k16_trans: %f GFLOPS" % (gflops / (timer().mean)))
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.requires_cuda_compute_version(8)
 def test_i8i8i32_m16n16k32():
     def index_map_A(i, j):
         return (
@@ -341,6 +338,4 @@ def index_map_C(i, j):
 
 
 if __name__ == "__main__":
-    test_f16f16f32_m16n16k16()
-    test_f16f16f16_m16n16k16()
-    test_i8i8i32_m16n16k32()
+    tvm.testing.main()

From bb80f19ea8493af71c6130301f1b479143d213ee Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Fri, 16 Sep 2022 15:24:03 -0700
Subject: [PATCH 193/704] [Hexagon] Add debug option to hexagon pytest (#12795)

* add debug option to hexagon pytest

* address comment
---
 python/tvm/contrib/hexagon/build.py         |  9 +++++----
 python/tvm/contrib/hexagon/pytest_plugin.py | 21 +++++++++++++++++----
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py
index fe7434f7386d..8960d110b85e 100644
--- a/python/tvm/contrib/hexagon/build.py
+++ b/python/tvm/contrib/hexagon/build.py
@@ -145,7 +145,7 @@ def start_server(self):
         ...
 
     @abc.abstractmethod
-    def stop_server(self):
+    def stop_server(self, cleanup=True):
         """Stop the RPC server"""
         ...
 
@@ -509,11 +509,12 @@ def start_server(self):
         self._copy_binaries()
         self._run_server_script()
 
-    def stop_server(self):
+    def stop_server(self, cleanup=True):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
         self._cleanup_port_forwarding()
         self._terminate_remote()
-        self.cleanup_directory()
+        if cleanup:
+            self.cleanup_directory()
 
 
 class HexagonLauncherSimulator(HexagonLauncherRPC):
@@ -617,7 +618,7 @@ def _start(self):
     def cleanup_directory(self):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
 
-    def stop_server(self):
+    def stop_server(self, cleanup=True):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
         self._server_process.terminate()
 
diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index 0b9f65540c34..03f4a1a143c2 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -158,7 +158,7 @@ def adb_server_socket() -> str:
 
 @pytest.fixture(scope="session")
 def hexagon_server_process(
-    request, rpc_server_port_for_session, adb_server_socket, skip_rpc
+    request, rpc_server_port_for_session, adb_server_socket, skip_rpc, hexagon_debug
 ) -> HexagonLauncherRPC:
     """Initials and returns hexagon launcher if ANDROID_SERIAL_NUMBER is defined.
     This launcher is started only once per test session.
@@ -194,7 +194,7 @@ def hexagon_server_process(
             yield {"launcher": launcher, "device_adr": device_adr}
         finally:
             if not skip_rpc:
-                launcher.stop_server()
+                launcher.stop_server(cleanup=(not hexagon_debug))
 
 
 def read_device_list():
@@ -221,6 +221,7 @@ def hexagon_launcher(
     tvm_tracker_host,
     tvm_tracker_port,
     adb_server_socket,
+    hexagon_debug,
 ) -> HexagonLauncherRPC:
     """Initials and returns hexagon launcher which reuses RPC info and Android serial number."""
     android_serial_num = android_serial_number()
@@ -246,8 +247,9 @@ def hexagon_launcher(
         yield launcher
     finally:
         if android_serial_num == ["simulator"]:
-            launcher.stop_server()
-        launcher.cleanup_directory()
+            launcher.stop_server(cleanup=(not hexagon_debug))
+        elif not hexagon_debug:
+            launcher.cleanup_directory()
 
 
 @pytest.fixture
@@ -297,6 +299,11 @@ def skip_rpc(request) -> bool:
     return request.config.getoption("--skip-rpc")
 
 
+@pytest.fixture(scope="session")
+def hexagon_debug(request) -> bool:
+    return request.config.getoption("--hexagon-debug")
+
+
 def pytest_addoption(parser):
     parser.addoption("--gtest_args", action="store", default="")
 
@@ -306,6 +313,12 @@ def pytest_addoption(parser):
         default=False,
         help="If set true, the RPC server initialization on Android would be skipped",
     )
+    parser.addoption(
+        "--hexagon-debug",
+        action="store_true",
+        default=False,
+        help="If set true, it will keep the hexagon test directories on the target.",
+    )
 
 
 def pytest_generate_tests(metafunc):

From 38f53e8c95d6b4387510e38da89b02edb913e886 Mon Sep 17 00:00:00 2001
From: Janet Schneider <janetsc@octoml.ai>
Date: Fri, 16 Sep 2022 16:25:28 -0700
Subject: [PATCH 194/704] [Hexagon] [runtime] Improve runtime resource
 management (#12727)

* First pass at improving runtime resource management

* Add unit test

* Fix lint and clang format errors

* Disable resource reset for simulator

* Moved acquire/release calls to session object, separate buffer managers for non-runtime (static) and runtime (dynamic).

* Fix lint errors

* Fix lint errors

* Improve robustness of session shutdown

* Fix lint

* Address feedback

* Only allow call to Acquire in a clean state

* Use a pointer to indicate the "active" manager
---
 python/tvm/contrib/hexagon/session.py         | 15 +++++++++--
 src/runtime/hexagon/hexagon_device_api.cc     | 26 +++++++++++++-----
 src/runtime/hexagon/hexagon_device_api.h      | 27 ++++++++++++++++++-
 .../hexagon/hexagon_device_api_tests.cc       | 18 +++++++++++++
 4 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index 5619d036e283..e242a95aa8b8 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -88,14 +88,25 @@ def __enter__(self):
                     self._rpc_receive_buffer_size_bytes,
                 ],
             )
+            func = self._rpc.get_function("device_api.hexagon.acquire_resources")
+            func()
             return self
 
         except RuntimeError as exception:
             raise exception
 
     def __exit__(self, exc_type, exc_value, exc_traceback):
-        # close session to the tracker
-        del self._rpc
+        try:
+            func = self._rpc.get_function("device_api.hexagon.release_resources")
+            func()
+        except RuntimeError as exception:
+            print(
+                "Exception occurred while calling release_resources() during Session __exit__: ",
+                exception,
+            )
+        finally:
+            # close session to the tracker
+            del self._rpc
 
     @property
     def device(self):
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index fd3a0db2025b..463d9799b082 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -92,16 +92,16 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap
 
   if (ndim == 0) {
     // Allocate storage for a single scalar value.
-    return hexbuffs.AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
+    return mgr->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
   } else if (ndim == 1) {
     // Allocate a single, contiguous memory region.
     size_t nbytes = shape[0] * typesize;
-    return hexbuffs.AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
+    return mgr->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
   } else if (ndim == 2) {
     // Allocate the region(s) needed for Hexagon's indirect-tensor format.
     size_t nallocs = shape[0];
     size_t nbytes = shape[1] * typesize;
-    return hexbuffs.AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope);
+    return mgr->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope);
   } else {
     return nullptr;  // unreachable
   }
@@ -115,13 +115,13 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignme
   if (alignment < kHexagonAllocAlignment) {
     alignment = kHexagonAllocAlignment;
   }
-  return hexbuffs.AllocateHexagonBuffer(nbytes, alignment, String("global"));
+  return mgr->AllocateHexagonBuffer(nbytes, alignment, String("global"));
 }
 
 void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
   CHECK(ptr) << "buffer pointer is null";
   CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type;
-  hexbuffs.FreeHexagonBuffer(ptr);
+  mgr->FreeHexagonBuffer(ptr);
 }
 
 // WorkSpace: runtime allocations for Hexagon
@@ -137,7 +137,7 @@ void* HexagonDeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_
 
 void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) {
   CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type;
-  CHECK(hexbuffs.count(data) != 0)
+  CHECK(mgr->count(data) != 0)
       << "Attempt made to free unknown or already freed workspace allocation";
   dmlc::ThreadLocalStore<HexagonWorkspacePool>::Get()->FreeWorkspace(dev, data);
 }
@@ -161,7 +161,7 @@ void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHan
   CHECK_EQ(to->byte_offset, 0);
   CHECK_EQ(GetDataSize(*from), GetDataSize(*to));
 
-  auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* { return hexbuffs.find(ptr); };
+  auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* { return mgr->find(ptr); };
 
   HexagonBuffer* hex_from_buf = lookup_hexagon_buffer(from->data);
   HexagonBuffer* hex_to_buf = lookup_hexagon_buffer(to->data);
@@ -246,6 +246,18 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.free_nd").set_body([](TVMArgs args, TVMR
   *rv = static_cast<int32_t>(0);
 });
 
+TVM_REGISTER_GLOBAL("device_api.hexagon.acquire_resources")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      HexagonDeviceAPI* api = HexagonDeviceAPI::Global();
+      api->AcquireResources();
+    });
+
+TVM_REGISTER_GLOBAL("device_api.hexagon.release_resources")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      HexagonDeviceAPI* api = HexagonDeviceAPI::Global();
+      api->ReleaseResources();
+    });
+
 TVM_REGISTER_GLOBAL("device_api.hexagon").set_body([](TVMArgs args, TVMRetValue* rv) {
   DeviceAPI* ptr = HexagonDeviceAPI::Global();
   *rv = static_cast<void*>(ptr);
diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index 4da12e35fbe7..b8861238771b 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -45,11 +45,29 @@ class HexagonDeviceAPI final : public DeviceAPI {
   static HexagonDeviceAPI* Global();
 
   //! \brief Constructor
-  HexagonDeviceAPI() {}
+  HexagonDeviceAPI() { mgr = &hexbuffs; }
 
   //! \brief Destructor
   ~HexagonDeviceAPI() {}
 
+  //! \brief Ensures resource managers are in a good state for the runtime
+  void AcquireResources() {
+    CHECK_EQ(runtime_hexbuffs, nullptr);
+    runtime_hexbuffs = std::make_unique<HexagonBufferManager>();
+    LOG(INFO) << "runtime_hexbuffs created";
+    mgr = runtime_hexbuffs.get();
+  }
+
+  //! \brief Ensures all runtime resources are freed
+  void ReleaseResources() {
+    if (runtime_hexbuffs && !runtime_hexbuffs->empty()) {
+      LOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources";
+    }
+    mgr = &hexbuffs;
+    LOG(INFO) << "runtime_hexbuffs reset";
+    runtime_hexbuffs.reset();
+  }
+
   /*! \brief Currently unimplemented interface to specify the active
    *  Hexagon device.
    */
@@ -138,7 +156,14 @@ class HexagonDeviceAPI final : public DeviceAPI {
   }
 
   //! \brief Manages underlying HexagonBuffer allocations
+  // runtime_hexbuffs is used for runtime allocations.  It is created
+  // with a call to AcquireResources, and destroyed on ReleaseResources.
+  // hexbuffs is used for all allocations outside of the session lifetime.
   HexagonBufferManager hexbuffs;
+  std::unique_ptr<HexagonBufferManager> runtime_hexbuffs;
+
+  //! \brief Current buffer manager
+  HexagonBufferManager* mgr;
 };
 }  // namespace hexagon
 }  // namespace runtime
diff --git a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
index fbcee37cb154..1827c4059dea 100644
--- a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
@@ -146,3 +146,21 @@ TEST_F(HexagonDeviceAPITest, DISABLED_alloc_free_diff_dev) {
   CHECK(buf != nullptr);
   EXPECT_THROW(hexapi->FreeDataSpace(cpu_dev, buf), InternalError);
 }
+
+// Alloc a non-runtime buffer
+// Alloc a runtime buffer
+// "Release" resources for runtime
+// Verify the runtime buffer cannot be freed, but the non-runtime buffer can
+// This test should be run last
+TEST_F(HexagonDeviceAPITest, leak_resources) {
+  hexapi->ReleaseResources();
+  void* pre_runtime_buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8);
+  CHECK(pre_runtime_buf != nullptr);
+  hexapi->AcquireResources();
+  void* runtime_buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8);
+  CHECK(runtime_buf != nullptr);
+  hexapi->ReleaseResources();
+  EXPECT_THROW(hexapi->FreeDataSpace(hex_dev, runtime_buf), InternalError);
+  hexapi->FreeDataSpace(hex_dev, pre_runtime_buf);
+  hexapi->AcquireResources();
+}

From 41b65a3144595afb04228be1334dc77c08d11ba7 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Fri, 16 Sep 2022 18:11:06 -0700
Subject: [PATCH 195/704] [TVMScript] IRBuilder methods for `Block` (#12815)

This PR introduces remaining IRBuilder methods for `Block`.

Co-authored-by: yongwww <yongcale@gmail.com>
---
 include/tvm/script/ir_builder/tir/frame.h     |  35 +++
 include/tvm/script/ir_builder/tir/ir.h        |  49 ++++
 python/tvm/script/ir_builder/base.py          |  18 +-
 python/tvm/script/ir_builder/ir/ir.py         |   2 +-
 python/tvm/script/ir_builder/tir/frame.py     |   7 +-
 python/tvm/script/ir_builder/tir/ir.py        | 235 +++++++++++++++---
 src/script/ir_builder/tir/frame.cc            |  15 ++
 src/script/ir_builder/tir/ir.cc               |  80 ++++++
 .../unittest/test_tvmscript_ir_builder_tir.py |  50 +++-
 tests/scripts/task_mypy.sh                    |   3 +
 10 files changed, 442 insertions(+), 52 deletions(-)

diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h
index 2902b982d5a6..c76b400d96b4 100644
--- a/include/tvm/script/ir_builder/tir/frame.h
+++ b/include/tvm/script/ir_builder/tir/frame.h
@@ -187,6 +187,41 @@ class BlockFrame : public TIRFrame {
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(BlockFrame, TIRFrame, BlockFrameNode);
 };
 
+/*!
+ * \brief A frame that represents the block initialization statment.
+ *
+ * \sa BlockInitFrame
+ */
+class BlockInitFrameNode : public TIRFrameNode {
+ public:
+  void VisitAttrs(tvm::AttrVisitor* v) { TIRFrameNode::VisitAttrs(v); }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.BlockInitFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(BlockInitFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when entering RAII scope.
+   * \sa tvm::support::With
+   */
+  void EnterWithScope() final;
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to BlockInitFrameNode.
+ *
+ * \sa BlockInitFrameNode
+ */
+class BlockInitFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(BlockInitFrame, TIRFrame, BlockInitFrameNode);
+};
+
 /*!
  * \brief A frame that represents the for loop.
  *
diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
index 037606253adc..191887648dbd 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -141,6 +141,55 @@ void PreflattenedBuffer(Buffer postflattened_buffer, Array<PrimExpr> shape,
  */
 BlockFrame Block(String name, bool no_realize = false);
 
+/*!
+ * \brief The block initialization statement.
+ * \return The BlockInitFrame.
+ */
+BlockInitFrame Init();
+
+/*!
+ * \brief The block predicate statement.
+ * \param predicate The predicate condition.
+ */
+void Where(PrimExpr predicate);
+
+/*!
+ * \brief The block buffer region reading statement.
+ * \param buffer_slices The array of buffer regions to read.
+ */
+void Reads(Array<ObjectRef> buffer_slices);
+
+/*!
+ * \brief The block buffer region writing statement.
+ * \param buffer_slices The array of buffer regions to write.
+ */
+void Writes(Array<ObjectRef> buffer_slices);
+
+/*!
+ * \brief The block annotation statement.
+ * \param attrs The annotation of the block.
+ */
+void BlockAttrs(Map<String, ObjectRef> attrs);
+
+/*!
+ * \brief The buffer allocation function.
+ * \param shape The type of the buffer prior to flattening.
+ * \param dtype The data type in the content of the buffer.
+ * \param data The pointer to the head of the data.
+ * \param strides The strides of each dimension.
+ * \param elem_offset The offset in terms of number of dtype elements (including lanes).
+ * \param storage_scope The optional storage scope of buffer data pointer.
+ * \param align The alignment requirement of data pointer in bytes.
+ * \param offset_factor The factor of elem_offset field.
+ * \param buffer_type The buffer type.
+ * \param axis_separators The separators between input axes when generating flattened output axes.
+ * \return The allocated buffer.
+ */
+Buffer AllocBuffer(Array<PrimExpr> shape, DataType dtype = DataType::Float(32),
+                   Optional<Var> data = NullOpt, Array<PrimExpr> strides = {},
+                   PrimExpr elem_offset = PrimExpr(), String storage_scope = "", int align = -1,
+                   int offset_factor = 0, String buffer_type = "default",
+                   Array<IntImm> axis_separators = {});
 namespace axis {
 
 /*!
diff --git a/python/tvm/script/ir_builder/base.py b/python/tvm/script/ir_builder/base.py
index 767fa8bf2596..7aa33ee49c72 100644
--- a/python/tvm/script/ir_builder/base.py
+++ b/python/tvm/script/ir_builder/base.py
@@ -61,11 +61,11 @@ class IRBuilderFrame(_Object):
     """
 
     def __enter__(self) -> "IRBuilderFrame":
-        _ffi_api.IRBuilderFrameEnter(self)  # pylint: disable=no-member # type: ignore
+        _ffi_api.IRBuilderFrameEnter(self)  # type: ignore[attr-defined] # pylint: disable=no-member
         return self
 
     def __exit__(self, ptype, value, trace) -> None:  # pylint: disable=unused-argument
-        _ffi_api.IRBuilderFrameExit(self)  # pylint: disable=no-member # type: ignore
+        _ffi_api.IRBuilderFrameExit(self)  # type: ignore[attr-defined] # pylint: disable=no-member
 
     def add_callback(self, callback: Callable[[], None]) -> None:
         """Add a callback method invoked when exiting the with-scope.
@@ -75,7 +75,7 @@ def add_callback(self, callback: Callable[[], None]) -> None:
         callback : Callable[[], None]
             The callback method to be invoked.
         """
-        _ffi_api.IRBuilderFrameAddCallback(  # pylint: disable=no-member # type: ignore
+        _ffi_api.IRBuilderFrameAddCallback(  # type: ignore[attr-defined] # pylint: disable=no-member
             self, callback
         )
 
@@ -104,7 +104,7 @@ class IRBuilder(_Object):
     def __init__(self) -> None:
         """Construct an IRBuilder."""
         self.__init_handle_by_constructor__(
-            _ffi_api.IRBuilder  # pylint: disable=no-member # type: ignore
+            _ffi_api.IRBuilder  # type: ignore[attr-defined] # pylint: disable=no-member
         )
 
     def __enter__(self) -> "IRBuilder":
@@ -119,11 +119,11 @@ def __enter__(self) -> "IRBuilder":
         with IRBuilder() as builder:
             assert IRBuilder.current() == builder
         """
-        _ffi_api.IRBuilderEnter(self)  # pylint: disable=no-member # type: ignore
+        _ffi_api.IRBuilderEnter(self)  # type: ignore[attr-defined] # pylint: disable=no-member
         return self
 
     def __exit__(self, ptype, value, trace) -> None:  # pylint: disable=unused-argument
-        _ffi_api.IRBuilderExit(self)  # pylint: disable=no-member # type: ignore
+        _ffi_api.IRBuilderExit(self)  # type: ignore[attr-defined] # pylint: disable=no-member
 
     @staticmethod
     def current() -> "IRBuilder":
@@ -134,11 +134,11 @@ def current() -> "IRBuilder":
         builder : IRBuilder
             The current IRBuilder.
         """
-        return _ffi_api.IRBuilderCurrent()  # pylint: disable=no-member # type: ignore
+        return _ffi_api.IRBuilderCurrent()  # type: ignore[attr-defined] # pylint: disable=no-member
 
     def get(self) -> _Object:
         """Get the constructed IR."""
-        return _ffi_api.IRBuilderGet(self)  # pylint: disable=no-member # type: ignore
+        return _ffi_api.IRBuilderGet(self)  # type: ignore[attr-defined] # pylint: disable=no-member
 
     @staticmethod
     def name(s: str, v: Any) -> Any:
@@ -156,7 +156,7 @@ def name(s: str, v: Any) -> Any:
         v : Any
             The same object with the name set.
         """
-        return _ffi_api.IRBuilderName(s, v)  # pylint: disable=no-member # type: ignore
+        return _ffi_api.IRBuilderName(s, v)  # type: ignore[attr-defined] # pylint: disable=no-member
 
     @staticmethod
     def name_many(  # pylint: disable=invalid-name
diff --git a/python/tvm/script/ir_builder/ir/ir.py b/python/tvm/script/ir_builder/ir/ir.py
index df920364356b..213180463cb2 100644
--- a/python/tvm/script/ir_builder/ir/ir.py
+++ b/python/tvm/script/ir_builder/ir/ir.py
@@ -21,4 +21,4 @@
 
 
 def ir_module() -> IRModuleFrame:
-    return _ffi_api.IRModule()  # pylint: disable=no-member # type: ignore
+    return _ffi_api.IRModule()  # type: ignore[attr-defined] # pylint: disable=no-member
diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py
index 75bb0231aeef..2ad08f35160d 100644
--- a/python/tvm/script/ir_builder/tir/frame.py
+++ b/python/tvm/script/ir_builder/tir/frame.py
@@ -38,8 +38,13 @@ class BlockFrame(TIRFrame):
     ...
 
 
+@_register_object("script.ir_builder.tir.BlockInitFrame")
+class BlockInitFrame(TIRFrame):
+    ...
+
+
 @_register_object("script.ir_builder.tir.ForFrame")
 class ForFrame(TIRFrame):
-    def __enter__(self) -> Union[Var, List[Var]]:
+    def __enter__(self) -> Union[Var, List[Var]]:  # type: ignore[override]
         super().__enter__()
         return self.vars if len(self.vars) > 1 else self.vars[0]
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index 40cd99c744d7..d1dc1c89600d 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -25,6 +25,7 @@
     Buffer,
     BufferLoad,
     BufferRegion,
+    IntImm,
     PrimExpr,
     StringImm,
     Var,
@@ -85,7 +86,7 @@ def buffer_decl(
         The declared buffer.
     """
     shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape
-    return _ffi_api.BufferDecl(  # pylint: disable=no-member # type: ignore
+    return _ffi_api.BufferDecl(  # type: ignore[attr-defined] # pylint: disable=no-member
         shape,
         dtype,
         "",
@@ -108,7 +109,7 @@ def prim_func() -> frame.PrimFuncFrame:
     res : frame.PrimFuncFrame
         The PrimFuncFrame.
     """
-    return _ffi_api.PrimFunc()  # pylint: disable=no-member # type: ignore
+    return _ffi_api.PrimFunc()  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def arg(name: str, obj: Union[Var, Buffer]) -> Union[Var, Buffer]:
@@ -127,7 +128,7 @@ def arg(name: str, obj: Union[Var, Buffer]) -> Union[Var, Buffer]:
     res : Union[Var, Buffer]
         The argument.
     """
-    return _ffi_api.Arg(name, obj)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Arg(name, obj)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def func_name(name: str) -> None:
@@ -138,7 +139,7 @@ def func_name(name: str) -> None:
     name : str
         The name of the PrimFunc.
     """
-    _ffi_api.FuncName(name)  # pylint: disable=no-member # type: ignore
+    _ffi_api.FuncName(name)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def func_attr(attrs: Dict[str, Any]) -> None:
@@ -149,7 +150,7 @@ def func_attr(attrs: Dict[str, Any]) -> None:
     attrs : Dict[str, Any]
         The annotations of the PrimFunc.
     """
-    _ffi_api.FuncAttrs(attrs)  # pylint: disable=no-member # type: ignore
+    _ffi_api.FuncAttrs(attrs)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def func_ret(ret_type: Type) -> Type:
@@ -165,7 +166,7 @@ def func_ret(ret_type: Type) -> Type:
     res : Type
         The return type.
     """
-    return _ffi_api.FuncRet(ret_type)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.FuncRet(ret_type)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def match_buffer(
@@ -242,7 +243,7 @@ def match_buffer(
     shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape
     if strides is None:
         strides = []
-    return _ffi_api.MatchBuffer(  # pylint: disable=no-member # type: ignore
+    return _ffi_api.MatchBuffer(  # type: ignore[attr-defined] # pylint: disable=no-member
         param,
         shape,
         dtype,
@@ -310,7 +311,7 @@ def preflattened_buffer(
     shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape
     if strides is None:
         strides = []
-    _ffi_api.PreflattenedBuffer(  # pylint: disable=no-member # type: ignore
+    _ffi_api.PreflattenedBuffer(  # type: ignore[attr-defined] # pylint: disable=no-member
         postflattened,
         shape,
         dtype,
@@ -341,7 +342,155 @@ def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame:
     res : frame.BlockFrame
         The BlockFrame.
     """
-    return _ffi_api.Block(name, no_realize)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Block(name, no_realize)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def init() -> frame.BlockInitFrame:
+    """The block initialization statement.
+
+    Returns
+    -------
+    res : frame.BlockInitFrame
+        The BlockInitFrame.
+    """
+    return _ffi_api.Init()  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def where(predicate: Union[PrimExpr, int]) -> None:
+    """The block predicate statement.
+
+    Parameters
+    ----------
+    predicate : Union[PrimExpr, Literal[0, 1]]
+        The predicate condition.
+    """
+    if isinstance(predicate, bool):
+        predicate = IntImm("bool", predicate)
+    if isinstance(predicate, int):
+        if predicate in [0, 1]:
+            predicate = IntImm("bool", predicate)
+        else:
+            raise ValueError(f"Invalid value for predicate: {predicate}")
+    _ffi_api.Where(predicate)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def reads(*buffer_slices: List[Union[BufferRegion, BufferLoad]]) -> None:
+    """The block buffer region reading statement.
+
+    Parameters
+    ----------
+    buffer_slices : List[Union[BufferRegion, BufferLoad]]
+        The array of buffer regions to read.
+    """
+    if len(buffer_slices) == 1:
+        if isinstance(buffer_slices[0], tuple):
+            buffer_slices = list(buffer_slices[0])
+        elif isinstance(buffer_slices[0], list):
+            buffer_slices = buffer_slices[0]  # type: ignore[assignment]
+        else:
+            buffer_slices = [buffer_slices[0]]
+    else:
+        buffer_slices = list(buffer_slices)  # type: ignore[assignment]
+    _ffi_api.Reads(buffer_slices)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def writes(*buffer_slices: List[Union[BufferRegion, BufferLoad]]) -> None:
+    """The block buffer region writing statement.
+
+    Parameters
+    ----------
+    buffer_slices : List[Union[BufferRegion, BufferLoad]]
+        The array of buffer regions to write.
+    """
+    if len(buffer_slices) == 1:
+        if isinstance(buffer_slices[0], tuple):
+            buffer_slices = list(buffer_slices[0])
+        elif isinstance(buffer_slices[0], list):
+            buffer_slices = buffer_slices[0]  # type: ignore[assignment]
+        else:
+            buffer_slices = [buffer_slices[0]]
+    else:
+        buffer_slices = list(buffer_slices)  # type: ignore[assignment]
+    _ffi_api.Writes(buffer_slices)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def block_attr(attrs: Dict[str, Any]) -> None:
+    """The block annotation statement.
+
+    Parameters
+    ----------
+    attrs : Dict[str, Any]
+        The annotation of the block.
+    """
+    return _ffi_api.BlockAttrs(attrs)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def alloc_buffer(
+    shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral],
+    dtype: str = "float32",
+    data: Var = None,
+    strides: List[PrimExpr] = None,
+    elem_offset: PrimExpr = None,
+    scope: str = "",
+    align: int = -1,
+    offset_factor: int = 0,
+    buffer_type: str = "default",
+    axis_separators: List[int] = None,
+) -> Buffer:
+    """The buffer alllocation function.
+
+    Parameters
+    ----------
+    shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral]
+        The type of the buffer prior to flattening.
+
+    dtype : str
+        The data type in the content of the buffer.
+
+    data : Var
+        The pointer to the head of the data.
+
+    strides : List[PrimExpr]
+        The strides of each dimension.
+
+    elem_offset : PrimExpr
+        The offset in terms of number of dtype elements (including lanes).
+
+    scope : str
+        The optional storage scope of buffer data pointer.
+
+    align : int
+        The alignment requirement of data pointer in bytes.
+
+    offset_factor : int
+        The factor of elem_offset field.
+
+    buffer_type : str
+        The buffer type.
+
+    axis_separators : List[int]
+        The separators between input axes when generating flattened output axes.
+
+    Returns
+    -------
+    res : Buffer
+        The allocated buffer.
+    """
+    shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape
+    if strides is None:
+        strides = []
+    return _ffi_api.AllocBuffer(  # type: ignore[attr-defined] # pylint: disable=no-member
+        shape,
+        dtype,
+        data,
+        strides,
+        elem_offset,
+        scope,
+        align,
+        offset_factor,
+        buffer_type,
+        axis_separators,
+    )
 
 
 def _as_range(dom: Union[Range, List[PrimExpr]]) -> Range:
@@ -387,7 +536,7 @@ def spatial(
         res : Var
             The iteration variable.
         """
-        return _ffi_api.AxisSpatial(  # pylint: disable=no-member # type: ignore
+        return _ffi_api.AxisSpatial(  # type: ignore[attr-defined] # pylint: disable=no-member
             _as_range(dom), binding, dtype
         )
 
@@ -413,7 +562,7 @@ def reduce(
         res : Var
             The iteration variable.
         """
-        return _ffi_api.AxisReduce(  # pylint: disable=no-member # type: ignore
+        return _ffi_api.AxisReduce(  # type: ignore[attr-defined] # pylint: disable=no-member
             _as_range(dom), binding, dtype
         )
 
@@ -439,7 +588,7 @@ def scan(
         res : Var
             The iteration variable.
         """
-        return _ffi_api.AxisScan(  # pylint: disable=no-member # type: ignore
+        return _ffi_api.AxisScan(  # type: ignore[attr-defined] # pylint: disable=no-member
             _as_range(dom), binding, dtype
         )
 
@@ -465,7 +614,7 @@ def opaque(
         res : Var
             The iteration variable.
         """
-        return _ffi_api.AxisOpaque(  # pylint: disable=no-member # type: ignore
+        return _ffi_api.AxisOpaque(  # type: ignore[attr-defined] # pylint: disable=no-member
             _as_range(dom), binding, dtype
         )
 
@@ -489,7 +638,7 @@ def remap(kinds: str, bindings: List[PrimExpr], dtype: str = "int32") -> Union[L
         res : Var
             The iteration variables.
         """
-        iter_vars = _ffi_api.AxisRemap(  # pylint: disable=no-member # type: ignore
+        iter_vars = _ffi_api.AxisRemap(  # type: ignore[attr-defined] # pylint: disable=no-member
             kinds, bindings, dtype
         )
         return iter_vars[0] if len(iter_vars) == 1 else iter_vars
@@ -522,7 +671,7 @@ def serial(
     if stop is None:
         stop = start
         start = 0
-    return _ffi_api.Serial(start, stop, annotations)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Serial(start, stop, annotations)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def parallel(
@@ -549,7 +698,7 @@ def parallel(
     if stop is None:
         stop = start
         start = 0
-    return _ffi_api.Parallel(start, stop, annotations)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Parallel(start, stop, annotations)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def vectorized(
@@ -576,7 +725,7 @@ def vectorized(
     if stop is None:
         stop = start
         start = 0
-    return _ffi_api.Vectorized(start, stop, annotations)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Vectorized(start, stop, annotations)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def unroll(
@@ -603,7 +752,7 @@ def unroll(
     if stop is None:
         stop = start
         start = 0
-    return _ffi_api.Unroll(start, stop, annotations)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Unroll(start, stop, annotations)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def thread_binding(
@@ -643,7 +792,7 @@ def thread_binding(
     elif stop is None:
         stop = start
         start = 0
-    return _ffi_api.ThreadBinding(  # pylint: disable=no-member # type: ignore
+    return _ffi_api.ThreadBinding(  # type: ignore[attr-defined] # pylint: disable=no-member
         start, stop, thread, annotations
     )
 
@@ -661,7 +810,7 @@ def grid(*extents: PrimExpr) -> frame.ForFrame:
     res : frame.ForFrame
         The ForFrame.
     """
-    return _ffi_api.Grid(extents)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Grid(extents)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def evaluate(value: PrimExpr) -> None:
@@ -674,7 +823,7 @@ def evaluate(value: PrimExpr) -> None:
     """
     if isinstance(value, str):
         value = StringImm(value)
-    return _ffi_api.Evaluate(value)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Evaluate(value)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def int8(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -690,7 +839,7 @@ def int8(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type int8 or casted expression with type int8.
     """
-    return _ffi_api.Int8(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Int8(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def int16(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -706,7 +855,7 @@ def int16(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type int16 or casted expression with type int16.
     """
-    return _ffi_api.Int16(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Int16(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def int32(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -722,7 +871,7 @@ def int32(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type int32 or casted expression with type int32.
     """
-    return _ffi_api.Int32(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Int32(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def int64(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -738,7 +887,7 @@ def int64(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type int64 or casted expression with type int64.
     """
-    return _ffi_api.Int64(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Int64(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def uint8(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -754,7 +903,7 @@ def uint8(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type uint8 or casted expression with type uint8.
     """
-    return _ffi_api.UInt8(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.UInt8(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def uint16(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -770,7 +919,7 @@ def uint16(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type uint16 or casted expression with type uint16.
     """
-    return _ffi_api.UInt16(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.UInt16(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def uint32(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -786,7 +935,7 @@ def uint32(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type uint32 or casted expression with type uint32.
     """
-    return _ffi_api.UInt32(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.UInt32(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def uint64(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -802,7 +951,7 @@ def uint64(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type uint64 or casted expression with type uint64.
     """
-    return _ffi_api.UInt64(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.UInt64(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def float8(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -818,7 +967,7 @@ def float8(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type float8 or casted expression with type float8.
     """
-    return _ffi_api.Float8(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Float8(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def float16(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -834,7 +983,7 @@ def float16(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type float16 or casted expression with type float16.
     """
-    return _ffi_api.Float16(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Float16(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def float32(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -850,7 +999,7 @@ def float32(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type float32 or casted expression with type float32.
     """
-    return _ffi_api.Float32(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Float32(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def float64(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -866,7 +1015,7 @@ def float64(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type float64 or casted expression with type float64.
     """
-    return _ffi_api.Float64(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Float64(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def int32x4(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -882,7 +1031,7 @@ def int32x4(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type int32x4 or casted expression with type int32x4.
     """
-    return _ffi_api.Int32x4(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Int32x4(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def int32x8(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -898,7 +1047,7 @@ def int32x8(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type int32x8 or casted expression with type int32x8.
     """
-    return _ffi_api.Int32x8(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Int32x8(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def int32x16(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -914,7 +1063,7 @@ def int32x16(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type int32x16 or casted expression with type int32x16.
     """
-    return _ffi_api.Int32x16(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Int32x16(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def boolean(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -930,7 +1079,7 @@ def boolean(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type boolean or casted expression with type boolean.
     """
-    return _ffi_api.Boolean(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Boolean(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def handle(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -946,7 +1095,7 @@ def handle(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type handle or casted expression with type handle.
     """
-    return _ffi_api.Handle(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Handle(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def void(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -962,7 +1111,7 @@ def void(expr: Optional[PrimExpr] = None) -> PrimExpr:
     res : PrimExpr
         The new tir.Var with type void or casted expression with type void.
     """
-    return _ffi_api.Void(expr)  # pylint: disable=no-member # type: ignore
+    return _ffi_api.Void(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
 def var(dtype, name="") -> Var:
@@ -981,7 +1130,7 @@ def var(dtype, name="") -> Var:
     res : Var
         The result tir.Var.
     """
-    return Var(name, dtype)  # pylint: disable=no-member # type: ignore
+    return Var(name, dtype)  # pylint: disable=no-member
 
 
 # pylint: enable=invalid-name
@@ -997,6 +1146,12 @@ def var(dtype, name="") -> Var:
     "match_buffer",
     "preflattened_buffer",
     "block",
+    "init",
+    "where",
+    "reads",
+    "writes",
+    "block_attr",
+    "alloc_buffer",
     "axis",
     "serial",
     "parallel",
diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc
index e54bf75eeff2..8b8b2a4d80e0 100644
--- a/src/script/ir_builder/tir/frame.cc
+++ b/src/script/ir_builder/tir/frame.cc
@@ -73,6 +73,20 @@ void BlockFrameNode::ExitWithScope() {
   }
 }
 
+void BlockInitFrameNode::EnterWithScope() {
+  BlockFrame frame = FindBlockFrame("T.init");
+  if (frame->init.defined()) {
+    LOG(FATAL) << "ValueError: Duplicate block init declaration";
+  }
+  TIRFrameNode::EnterWithScope();
+}
+
+void BlockInitFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  BlockFrame frame = FindBlockFrame("T.init");
+  frame->init = AsStmt(stmts);
+}
+
 void ForFrameNode::ExitWithScope() {
   TIRFrameNode::ExitWithScope();
   AddToParent(this->f_make_for_loop(vars, doms, AsStmt(stmts)));
@@ -81,6 +95,7 @@ void ForFrameNode::ExitWithScope() {
 TVM_REGISTER_NODE_TYPE(TIRFrameNode);
 TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode);
 TVM_REGISTER_NODE_TYPE(BlockFrameNode);
+TVM_REGISTER_NODE_TYPE(BlockInitFrameNode);
 TVM_REGISTER_NODE_TYPE(ForFrameNode);
 
 }  // namespace tir
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index 5013e321728e..75e759262655 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -173,6 +173,80 @@ BlockFrame Block(String name, bool no_realize) {
   return BlockFrame(n);
 }
 
+BlockInitFrame Init() { return BlockInitFrame(make_object<BlockInitFrameNode>()); }
+
+void Where(PrimExpr predicate) {
+  BlockFrame frame = FindBlockFrame("T.where");
+  if (frame->predicate.defined()) {
+    LOG(FATAL) << "ValueError: Duplicate block predicate declaration, previous one is "
+               << frame->predicate;
+  }
+  frame->predicate = predicate;
+}
+
+void Reads(Array<ObjectRef> buffer_slices) {
+  using namespace tvm::tir;
+  BlockFrame frame = FindBlockFrame("T.reads");
+  if (frame->reads.defined()) {
+    LOG(FATAL) << "ValueError: Duplicate read region declaration, previous one is " << frame->reads;
+  }
+  Array<BufferRegion> reads;
+  for (const ObjectRef& obj : buffer_slices) {
+    if (const auto* buffer_region = obj.as<BufferRegionNode>()) {
+      reads.push_back(GetRef<BufferRegion>(buffer_region));
+    } else if (const auto* buffer_load = obj.as<BufferLoadNode>()) {
+      reads.push_back(BufferRegionFromLoad(GetRef<BufferLoad>(buffer_load)));
+    } else {
+      LOG(FATAL) << "Invalid type for buffer reads.";
+    }
+  }
+  frame->reads = reads;
+}
+
+void Writes(Array<ObjectRef> buffer_slices) {
+  using namespace tvm::tir;
+  BlockFrame frame = FindBlockFrame("T.writes");
+  if (frame->writes.defined()) {
+    LOG(FATAL) << "ValueError: Duplicate write region declaration, previous one is "
+               << frame->writes;
+  }
+  Array<BufferRegion> writes;
+  for (const ObjectRef& obj : buffer_slices) {
+    if (const auto* buffer_region = obj.as<BufferRegionNode>()) {
+      writes.push_back(GetRef<BufferRegion>(buffer_region));
+    } else if (const auto* buffer_load = obj.as<BufferLoadNode>()) {
+      writes.push_back(BufferRegionFromLoad(GetRef<BufferLoad>(buffer_load)));
+    } else {
+      LOG(FATAL) << "Invalid type for buffer writes.";
+    }
+  }
+  frame->writes = writes;
+}
+
+void BlockAttrs(Map<String, ObjectRef> attrs) {
+  BlockFrame frame = FindBlockFrame("T.block_attr");
+  if (frame->annotations.defined()) {
+    LOG(FATAL) << "ValueError: Duplicate block annotations, previous one is " << frame->annotations;
+  }
+  frame->annotations = attrs;
+}
+
+Buffer AllocBuffer(Array<PrimExpr> shape, DataType dtype, Optional<Var> data,
+                   Array<PrimExpr> strides, PrimExpr elem_offset, String storage_scope, int align,
+                   int offset_factor, String buffer_type_str, Array<IntImm> axis_separators) {
+  Buffer buffer = BufferDecl(shape, dtype, "", data, strides, elem_offset, storage_scope, align,
+                             offset_factor, buffer_type_str, axis_separators);
+  IRBuilder builder = IRBuilder::Current();
+  if (Optional<BlockFrame> frame = builder->GetLastFrame<BlockFrame>()) {
+    frame.value()->alloc_buffers.push_back(buffer);
+  } else if (Optional<PrimFuncFrame> frame = builder->GetLastFrame<PrimFuncFrame>()) {
+    frame.value()->root_alloc_buffers.push_back(buffer);
+  } else {
+    LOG(FATAL) << "ValueError: Block frame or PrimFunc frame not find. Please ensure "
+                  "'T.alloc_buffer' is called under T.block() or T.prim_func()";
+  }
+  return buffer;
+}
 namespace axis {
 
 IterVar PushBlockVar(IterVar iter_var, PrimExpr binding) {
@@ -383,6 +457,12 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.MatchBuffer").set_body_typed(MatchBuf
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.PreflattenedBuffer").set_body_typed(PreflattenedBuffer);
 
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Init").set_body_typed(Init);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Where").set_body_typed(Where);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Reads").set_body_typed(Reads);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Writes").set_body_typed(Writes);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.BlockAttrs").set_body_typed(BlockAttrs);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.AllocBuffer").set_body_typed(AllocBuffer);
 
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisSpatial").set_body_typed(axis::Spatial);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisReduce").set_body_typed(axis::Reduce);
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index d893ebc545c6..a5d8c1068064 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -87,7 +87,7 @@ def test_ir_builder_tir_primfunc_complete():
     assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True)
 
 
-def test_ir_builder_tir_block():
+def test_ir_builder_tir_block_base():
     with IRBuilder() as ib:
         with T.block("block"):
             T.evaluate(0)
@@ -114,6 +114,54 @@ def test_ir_builder_tir_block():
     assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True)
 
 
+def test_ir_builder_tir_block_complete():
+    with IRBuilder() as ib:
+        a = T.var("int64", "a")
+        b = T.buffer_decl((128, 128), "float32")
+        c = T.buffer_decl((128, 128), "float32")
+        d = T.var("int32", "d")
+        e = T.buffer_decl((128, 128), "float32")
+        f = T.var("int32", "f")
+        with T.block("block"):
+            T.where(a > 1)
+            T.reads(b[0:16, 0:16])
+            T.writes(c[d:128, d:128])
+            T.block_attr({"key": "value"})
+            T.alloc_buffer((128, 128), "float32")
+            T.match_buffer(e[0:32, 0:32], (32, 32), "float32")
+            T.axis.spatial(128, f)
+            T.evaluate(0)
+    # the block generated by IRBuilder
+    block_realize_actual = ib.get()
+
+    # the expected block
+    var_a = tir.Var("a", "int64")
+    buffer_b = tir.decl_buffer((128, 128), "float32", name="b")
+    buffer_c = tir.decl_buffer((128, 128), "float32", name="c")
+    var_d = tir.Var("d", "int32")
+    buffer_e = tir.decl_buffer((128, 128), "float32", name="c")
+    var_f = tir.Var("f", "int32")
+    block_expected = tir.Block(
+        iter_vars=[tir.IterVar((0, 128), tir.Var("", "int32"), iter_type=tir.IterVar.DataPar)],
+        reads=[buffer_b[0:16, 0:16]],
+        writes=[buffer_c[var_d:128, var_d:128]],
+        name_hint="block",
+        body=tir.Evaluate(0),
+        alloc_buffers=[tir.decl_buffer((128, 128), "float32")],
+        match_buffers=[
+            tir.MatchBufferRegion(tir.decl_buffer((32, 32), "float32"), buffer_e[0:32, 0:32])
+        ],
+        annotations={"key": "value"},
+    )
+    block_realize_expected = tir.BlockRealize(
+        iter_values=[var_f],
+        predicate=var_a > 1,
+        block=block_expected,
+    )
+    # Check if the generated ir is expected
+    assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True)
+
+
 def test_ir_builder_tir_axis():
     with IRBuilder() as ib:
         a = T.var("int32", "a")
diff --git a/tests/scripts/task_mypy.sh b/tests/scripts/task_mypy.sh
index f165adfe1bc4..c3e5d50b3e03 100755
--- a/tests/scripts/task_mypy.sh
+++ b/tests/scripts/task_mypy.sh
@@ -47,3 +47,6 @@ mypy --disallow-untyped-defs python/tvm/relay/op/contrib/tensorrt.py
 #TODO(@mikepapadim): This is failing atm
 # echo "Checking MyPy Type defs in the tvm.relay.backend.contrib.ethosu package."
 # mypy  --check-untyped-defs python/tvm/relay/backend/contrib/ethosu/
+
+echo "Checking MyPy Type defs in the tvmscript IRBuilder package."
+mypy  --check-untyped-defs python/tvm/script/ir_builder

From 2cae905a727930eaaeb59085393eef1e1421fc20 Mon Sep 17 00:00:00 2001
From: Ruihang Lai <ruihangl@cs.cmu.edu>
Date: Fri, 16 Sep 2022 21:11:31 -0400
Subject: [PATCH 196/704] [TIR] Support pattern matching argmax/argmin
 generated by TOPI (#12827)

This PR introduces two reducers to TIR reduction part, so that rfactor and cross-thread reduction can be applied to those functions who contains argmax/argmin computation generated by TOPI.
---
 src/tir/schedule/primitive/reduction.cc       | 134 +++++++++------
 .../unittest/test_tir_schedule_rfactor.py     | 156 +++++++++++++++++-
 2 files changed, 233 insertions(+), 57 deletions(-)

diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc
index 2dc47fa15bea..dd2bcf727c40 100644
--- a/src/tir/schedule/primitive/reduction.cc
+++ b/src/tir/schedule/primitive/reduction.cc
@@ -297,60 +297,86 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref,
  */
 struct ReducerRegistry {
   ReducerRegistry()
-      : reducer_getters{CreateReducerGetter(
-                            /*n_buffers=*/1,
-                            [](const Array<Var>& x, const Array<Var>& y) {
-                              return Array<PrimExpr>{x[0] + y[0]};
-                            },
-                            [](const Array<PrimExpr>& values) {
-                              return Array<PrimExpr>{make_const(values[0]->dtype, 0)};
-                            }),
-                        CreateReducerGetter(
-                            /*n_buffers=*/1,
-                            [](const Array<Var>& x, const Array<Var>& y) {
-                              return Array<PrimExpr>{x[0] * y[0]};
-                            },
-                            [](const Array<PrimExpr>& values) {
-                              return Array<PrimExpr>{make_const(values[0]->dtype, 1)};
-                            }),
-                        CreateReducerGetter(
-                            /*n_buffers=*/1,
-                            [](const Array<Var>& x, const Array<Var>& y) {
-                              return Array<PrimExpr>{min(x[0], y[0])};
-                            },
-                            [](const Array<PrimExpr>& values) {
-                              return Array<PrimExpr>{max_value(values[0]->dtype)};
-                            }),
-                        CreateReducerGetter(
-                            /*n_buffers=*/1,
-                            [](const Array<Var>& x, const Array<Var>& y) {
-                              return Array<PrimExpr>{max(x[0], y[0])};
-                            },
-                            [](const Array<PrimExpr>& values) {
-                              return Array<PrimExpr>{min_value(values[0]->dtype)};
-                            }),
-                        CreateReducerGetter(
-                            /*n_buffers=*/2,
-                            [](const Array<Var>& x, const Array<Var>& y) {
-                              PrimExpr idx = Select(x[1] >= y[1], x[0], y[0]);
-                              PrimExpr val = Select(x[1] >= y[1], x[1], y[1]);
-                              return Array<PrimExpr>{idx, val};
-                            },
-                            [](const Array<PrimExpr>& values) {
-                              return Array<PrimExpr>{make_const(values[0]->dtype, -1),
-                                                     min_value(values[1]->dtype)};
-                            }),
-                        CreateReducerGetter(
-                            /*n_buffers=*/2,
-                            [](const Array<Var>& x, const Array<Var>& y) {
-                              PrimExpr idx = Select(x[1] <= y[1], x[0], y[0]);
-                              PrimExpr val = Select(x[1] <= y[1], x[1], y[1]);
-                              return Array<PrimExpr>{idx, val};
-                            },
-                            [](const Array<PrimExpr>& values) {
-                              return Array<PrimExpr>{make_const(values[0]->dtype, -1),
-                                                     max_value(values[1]->dtype)};
-                            })} {}
+      : reducer_getters{
+            CreateReducerGetter(
+                /*n_buffers=*/1,
+                [](const Array<Var>& x, const Array<Var>& y) {
+                  return Array<PrimExpr>{x[0] + y[0]};
+                },
+                [](const Array<PrimExpr>& values) {
+                  return Array<PrimExpr>{make_const(values[0]->dtype, 0)};
+                }),
+            CreateReducerGetter(
+                /*n_buffers=*/1,
+                [](const Array<Var>& x, const Array<Var>& y) {
+                  return Array<PrimExpr>{x[0] * y[0]};
+                },
+                [](const Array<PrimExpr>& values) {
+                  return Array<PrimExpr>{make_const(values[0]->dtype, 1)};
+                }),
+            CreateReducerGetter(
+                /*n_buffers=*/1,
+                [](const Array<Var>& x, const Array<Var>& y) {
+                  return Array<PrimExpr>{min(x[0], y[0])};
+                },
+                [](const Array<PrimExpr>& values) {
+                  return Array<PrimExpr>{max_value(values[0]->dtype)};
+                }),
+            CreateReducerGetter(
+                /*n_buffers=*/1,
+                [](const Array<Var>& x, const Array<Var>& y) {
+                  return Array<PrimExpr>{max(x[0], y[0])};
+                },
+                [](const Array<PrimExpr>& values) {
+                  return Array<PrimExpr>{min_value(values[0]->dtype)};
+                }),
+            CreateReducerGetter(
+                /*n_buffers=*/2,
+                [](const Array<Var>& x, const Array<Var>& y) {
+                  PrimExpr idx = Select(x[1] >= y[1], x[0], y[0]);
+                  PrimExpr val = Select(x[1] >= y[1], x[1], y[1]);
+                  return Array<PrimExpr>{idx, val};
+                },
+                [](const Array<PrimExpr>& values) {
+                  return Array<PrimExpr>{make_const(values[0]->dtype, -1),
+                                         min_value(values[1]->dtype)};
+                }),
+            CreateReducerGetter(
+                /*n_buffers=*/2,
+                [](const Array<Var>& x, const Array<Var>& y) {
+                  PrimExpr idx =
+                      Select(Or(greater(x[1], y[1]), And(equal(x[1], y[1]), less(x[0], y[0]))),
+                             x[0], y[0]);
+                  PrimExpr val = Select(greater(x[1], y[1]), x[1], y[1]);
+                  return Array<PrimExpr>{idx, val};
+                },
+                [](const Array<PrimExpr>& values) {
+                  return Array<PrimExpr>{make_const(values[0]->dtype, -1),
+                                         min_value(values[1]->dtype)};
+                }),
+            CreateReducerGetter(
+                /*n_buffers=*/2,
+                [](const Array<Var>& x, const Array<Var>& y) {
+                  PrimExpr idx = Select(x[1] <= y[1], x[0], y[0]);
+                  PrimExpr val = Select(x[1] <= y[1], x[1], y[1]);
+                  return Array<PrimExpr>{idx, val};
+                },
+                [](const Array<PrimExpr>& values) {
+                  return Array<PrimExpr>{make_const(values[0]->dtype, -1),
+                                         max_value(values[1]->dtype)};
+                }),
+            CreateReducerGetter(
+                /*n_buffers=*/2,
+                [](const Array<Var>& x, const Array<Var>& y) {
+                  PrimExpr idx = Select(
+                      Or(less(x[1], y[1]), And(equal(x[1], y[1]), less(x[0], y[0]))), x[0], y[0]);
+                  PrimExpr val = Select(less(x[1], y[1]), x[1], y[1]);
+                  return Array<PrimExpr>{idx, val};
+                },
+                [](const Array<PrimExpr>& values) {
+                  return Array<PrimExpr>{make_const(values[0]->dtype, -1),
+                                         max_value(values[1]->dtype)};
+                })} {}
 
   static void RegisterReducer(
       int n_buffers, TypedPackedFunc<Array<PrimExpr>(Array<Var>, Array<Var>)> combiner_getter,
diff --git a/tests/python/unittest/test_tir_schedule_rfactor.py b/tests/python/unittest/test_tir_schedule_rfactor.py
index f6db79f3ed23..964fe772d8af 100644
--- a/tests/python/unittest/test_tir_schedule_rfactor.py
+++ b/tests/python/unittest/test_tir_schedule_rfactor.py
@@ -15,12 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-function-docstring,missing-module-docstring
-import sys
-
 import pytest
 import tvm
 import tvm.testing
-from tvm import tir
+from tvm import te, tir, topi
 from tvm.script import tir as T
 from tvm.tir.schedule.testing import verify_trace_roundtrip
 
@@ -1133,6 +1131,128 @@ def argmin_split_rfactor(
             argmin_v1[i] = v_argmin_v1
 
 
+@T.prim_func
+def argmax_topi_rfactor(
+    placeholder: T.Buffer[(1, 32), "int32"], placeholder_red: T.Buffer[1, "int32"]
+) -> None:
+    T.func_attr({"global_symbol": "main", "tir.noalias": True})
+    placeholder_red_temp_v0 = T.alloc_buffer([1], dtype="int32")
+    placeholder_red_temp_v1 = T.alloc_buffer([1], dtype="int32")
+    placeholder_red_temp_v0_rf = T.alloc_buffer([1, 8], dtype="int32")
+    placeholder_red_temp_v1_rf = T.alloc_buffer([1, 8], dtype="int32")
+    for i0, i1_0, i1_1 in T.grid(1, 4, 8):
+        with T.block("placeholder_red_temp_rf"):
+            vi1_1, ax0, vi1_0 = T.axis.remap("SSR", [i1_1, i0, i1_0])
+            T.reads(placeholder[ax0, vi1_0 * 8 + vi1_1])
+            T.writes(placeholder_red_temp_v0_rf[ax0, vi1_1], placeholder_red_temp_v1_rf[ax0, vi1_1])
+            with T.init():
+                placeholder_red_temp_v0_rf[ax0, vi1_1] = -1
+                placeholder_red_temp_v1_rf[ax0, vi1_1] = -2147483648
+            v_placeholder_red_temp_v0_rf: T.int32 = T.Select(
+                placeholder_red_temp_v1_rf[ax0, vi1_1] > placeholder[ax0, vi1_0 * 8 + vi1_1]
+                or placeholder_red_temp_v1_rf[ax0, vi1_1] == placeholder[ax0, vi1_0 * 8 + vi1_1]
+                and placeholder_red_temp_v0_rf[ax0, vi1_1] < vi1_0 * 8 + vi1_1,
+                placeholder_red_temp_v0_rf[ax0, vi1_1],
+                vi1_0 * 8 + vi1_1,
+            )
+            v_placeholder_red_temp_v1_rf: T.int32 = T.Select(
+                placeholder_red_temp_v1_rf[ax0, vi1_1] > placeholder[ax0, vi1_0 * 8 + vi1_1],
+                placeholder_red_temp_v1_rf[ax0, vi1_1],
+                placeholder[ax0, vi1_0 * 8 + vi1_1],
+            )
+            placeholder_red_temp_v0_rf[ax0, vi1_1] = v_placeholder_red_temp_v0_rf
+            placeholder_red_temp_v1_rf[ax0, vi1_1] = v_placeholder_red_temp_v1_rf
+    for i0, i1_1 in T.grid(1, 8):
+        with T.block("placeholder_red_temp"):
+            vi1_1, ax0 = T.axis.remap("RS", [i1_1, i0])
+            T.reads(placeholder_red_temp_v0_rf[ax0, vi1_1], placeholder_red_temp_v1_rf[ax0, vi1_1])
+            T.writes(placeholder_red_temp_v0[ax0], placeholder_red_temp_v1[ax0])
+            with T.init():
+                placeholder_red_temp_v0[ax0] = -1
+                placeholder_red_temp_v1[ax0] = -2147483648
+            v_placeholder_red_temp_v0: T.int32 = T.Select(
+                placeholder_red_temp_v1[ax0] > placeholder_red_temp_v1_rf[ax0, vi1_1]
+                or placeholder_red_temp_v1[ax0] == placeholder_red_temp_v1_rf[ax0, vi1_1]
+                and placeholder_red_temp_v0[ax0] < placeholder_red_temp_v0_rf[ax0, vi1_1],
+                placeholder_red_temp_v0[ax0],
+                placeholder_red_temp_v0_rf[ax0, vi1_1],
+            )
+            v_placeholder_red_temp_v1: T.int32 = T.Select(
+                placeholder_red_temp_v1[ax0] > placeholder_red_temp_v1_rf[ax0, vi1_1],
+                placeholder_red_temp_v1[ax0],
+                placeholder_red_temp_v1_rf[ax0, vi1_1],
+            )
+            placeholder_red_temp_v0[ax0] = v_placeholder_red_temp_v0
+            placeholder_red_temp_v1[ax0] = v_placeholder_red_temp_v1
+    for i0 in T.serial(1):
+        with T.block("placeholder_red"):
+            ax0 = T.axis.spatial(1, i0)
+            T.reads(placeholder_red_temp_v0[ax0])
+            T.writes(placeholder_red[ax0])
+            placeholder_red[ax0] = placeholder_red_temp_v0[ax0]
+
+
+@T.prim_func
+def argmin_topi_rfactor(
+    placeholder: T.Buffer[(1, 32), "int32"], placeholder_red: T.Buffer[1, "int32"]
+) -> None:
+    T.func_attr({"global_symbol": "main", "tir.noalias": True})
+    placeholder_red_temp_v0 = T.alloc_buffer([1], dtype="int32")
+    placeholder_red_temp_v1 = T.alloc_buffer([1], dtype="int32")
+    placeholder_red_temp_v0_rf = T.alloc_buffer([1, 8], dtype="int32")
+    placeholder_red_temp_v1_rf = T.alloc_buffer([1, 8], dtype="int32")
+    for i0, i1_0, i1_1 in T.grid(1, 4, 8):
+        with T.block("placeholder_red_temp_rf"):
+            vi1_1, ax0, vi1_0 = T.axis.remap("SSR", [i1_1, i0, i1_0])
+            T.reads(placeholder[ax0, vi1_0 * 8 + vi1_1])
+            T.writes(placeholder_red_temp_v0_rf[ax0, vi1_1], placeholder_red_temp_v1_rf[ax0, vi1_1])
+            with T.init():
+                placeholder_red_temp_v0_rf[ax0, vi1_1] = -1
+                placeholder_red_temp_v1_rf[ax0, vi1_1] = 2147483647
+            v_placeholder_red_temp_v0_rf: T.int32 = T.Select(
+                placeholder_red_temp_v1_rf[ax0, vi1_1] < placeholder[ax0, vi1_0 * 8 + vi1_1]
+                or placeholder_red_temp_v1_rf[ax0, vi1_1] == placeholder[ax0, vi1_0 * 8 + vi1_1]
+                and placeholder_red_temp_v0_rf[ax0, vi1_1] < vi1_0 * 8 + vi1_1,
+                placeholder_red_temp_v0_rf[ax0, vi1_1],
+                vi1_0 * 8 + vi1_1,
+            )
+            v_placeholder_red_temp_v1_rf: T.int32 = T.Select(
+                placeholder_red_temp_v1_rf[ax0, vi1_1] < placeholder[ax0, vi1_0 * 8 + vi1_1],
+                placeholder_red_temp_v1_rf[ax0, vi1_1],
+                placeholder[ax0, vi1_0 * 8 + vi1_1],
+            )
+            placeholder_red_temp_v0_rf[ax0, vi1_1] = v_placeholder_red_temp_v0_rf
+            placeholder_red_temp_v1_rf[ax0, vi1_1] = v_placeholder_red_temp_v1_rf
+    for i0, i1_1 in T.grid(1, 8):
+        with T.block("placeholder_red_temp"):
+            vi1_1, ax0 = T.axis.remap("RS", [i1_1, i0])
+            T.reads(placeholder_red_temp_v0_rf[ax0, vi1_1], placeholder_red_temp_v1_rf[ax0, vi1_1])
+            T.writes(placeholder_red_temp_v0[ax0], placeholder_red_temp_v1[ax0])
+            with T.init():
+                placeholder_red_temp_v0[ax0] = -1
+                placeholder_red_temp_v1[ax0] = 2147483647
+            v_placeholder_red_temp_v0: T.int32 = T.Select(
+                placeholder_red_temp_v1[ax0] < placeholder_red_temp_v1_rf[ax0, vi1_1]
+                or placeholder_red_temp_v1[ax0] == placeholder_red_temp_v1_rf[ax0, vi1_1]
+                and placeholder_red_temp_v0[ax0] < placeholder_red_temp_v0_rf[ax0, vi1_1],
+                placeholder_red_temp_v0[ax0],
+                placeholder_red_temp_v0_rf[ax0, vi1_1],
+            )
+            v_placeholder_red_temp_v1: T.int32 = T.Select(
+                placeholder_red_temp_v1[ax0] < placeholder_red_temp_v1_rf[ax0, vi1_1],
+                placeholder_red_temp_v1[ax0],
+                placeholder_red_temp_v1_rf[ax0, vi1_1],
+            )
+            placeholder_red_temp_v0[ax0] = v_placeholder_red_temp_v0
+            placeholder_red_temp_v1[ax0] = v_placeholder_red_temp_v1
+    for i0 in T.serial(1):
+        with T.block("placeholder_red"):
+            ax0 = T.axis.spatial(1, i0)
+            T.reads(placeholder_red_temp_v0[ax0])
+            T.writes(placeholder_red[ax0])
+            placeholder_red[ax0] = placeholder_red_temp_v0[ax0]
+
+
 # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
 
 
@@ -1490,5 +1610,35 @@ def test_reduction_rfactor_argmax_init_buffer_not_match():
         s.rfactor(ki, 1)
 
 
+def test_reduction_rfactor_topi_argmax():
+    A = te.placeholder((1, 32), dtype="int32")
+    B = topi.argmax(A, axis=1)
+    argmax_topi = te.create_prim_func([A, B])
+    s = tir.Schedule(argmax_topi, debug_mask="all")
+    argmax = s.get_block("placeholder_red_temp")
+    _, k = s.get_loops(argmax)
+    _, ki = s.split(k, [None, 8])
+    rf_block = s.rfactor(ki, 1)
+    tvm.ir.assert_structural_equal(s.mod["main"], argmax_topi_rfactor)
+    assert s.get(rf_block).same_as(s.get(s.get_block("placeholder_red_temp_rf")))
+    assert s.get(argmax).same_as(s.get(s.get_block("placeholder_red_temp")))
+    verify_trace_roundtrip(s, mod=argmax_topi)
+
+
+def test_reduction_rfactor_topi_argmin():
+    A = te.placeholder((1, 32), dtype="int32")
+    B = topi.argmin(A, axis=1)
+    argmin_topi = te.create_prim_func([A, B])
+    s = tir.Schedule(argmin_topi, debug_mask="all")
+    argmin = s.get_block("placeholder_red_temp")
+    _, k = s.get_loops(argmin)
+    _, ki = s.split(k, [None, 8])
+    rf_block = s.rfactor(ki, 1)
+    tvm.ir.assert_structural_equal(s.mod["main"], argmin_topi_rfactor)
+    assert s.get(rf_block).same_as(s.get(s.get_block("placeholder_red_temp_rf")))
+    assert s.get(argmin).same_as(s.get(s.get_block("placeholder_red_temp")))
+    verify_trace_roundtrip(s, mod=argmin_topi)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 91cce56cfa697a6a2e097bbae1c67ace22ef8af3 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Fri, 16 Sep 2022 18:13:58 -0700
Subject: [PATCH 197/704] [TIR] Construct the inverse in SuggestIndexMap
 (#12797)

Computing the inverse mapping requires arithmetic analysis which is not guaranteed to cover all cases. We provide the pre-defined inverse index map instead.
---
 include/tvm/tir/index_map.h                   | 26 +++++++++-
 python/tvm/tir/function.py                    | 46 ++++++++++++++---
 src/tir/ir/index_map.cc                       | 47 +++++++++++++++---
 src/tir/schedule/analysis/layout.cc           | 49 ++++++++++++++++---
 .../unittest/test_tir_schedule_analysis.py    | 41 ++++++++++++++++
 5 files changed, 188 insertions(+), 21 deletions(-)

diff --git a/include/tvm/tir/index_map.h b/include/tvm/tir/index_map.h
index f461c5640bb0..8a176cb3cee8 100644
--- a/include/tvm/tir/index_map.h
+++ b/include/tvm/tir/index_map.h
@@ -70,6 +70,18 @@ class IndexMapNode : public Object {
    */
   Array<PrimExpr> final_indices;
 
+  /*!
+   * \brief The inverse index map.
+   *
+   * When this is defined, IndexMap::Inverse will return the pre-defined inverse index map.
+   * Otherwise, the inverse index map will be computed on the fly.
+   * It is the user's responsibility to ensure the correctness of the pre-defined inverse index
+   * map.
+   *
+   * \note ObjectRef is used here instead of IndexMap to avoid circular reference.
+   */
+  Optional<ObjectRef> inverse_index_map;
+
   /*!
    * \brief Default constructor
    *
@@ -133,6 +145,7 @@ class IndexMapNode : public Object {
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("initial_indices", &initial_indices);
     v->Visit("final_indices", &final_indices);
+    v->Visit("inverse_index_map", &inverse_index_map);
   }
 
   bool SEqualReduce(const IndexMapNode* other, SEqualReducer equal) const {
@@ -153,15 +166,24 @@ class IndexMapNode : public Object {
 
 class IndexMap : public ObjectRef {
  public:
-  IndexMap(Array<Var> initial_indices, Array<PrimExpr> final_indices);
+  /*!
+   * \brief The constructor
+   * \param initial_indices Variables representing the indices prior to remapping
+   * \param final_indices Expressions defining the indices after remapping.
+   * \param inverse_index_map The optional pre-defined inverse index map
+   */
+  IndexMap(Array<Var> initial_indices, Array<PrimExpr> final_indices,
+           Optional<IndexMap> inverse_index_map = NullOpt);
 
   /*!
    * \brief Create an index map from a packed function
    * \param ndim The number of dimensions
    * \param func The function to be applied
+   * \param inverse_index_map The optional pre-defined inverse index map
    * \return The created index map
    */
-  static IndexMap FromFunc(int ndim, runtime::TypedPackedFunc<Array<PrimExpr>(Array<Var>)> func);
+  static IndexMap FromFunc(int ndim, runtime::TypedPackedFunc<Array<PrimExpr>(Array<Var>)> func,
+                           Optional<IndexMap> inverse_index_map = NullOpt);
 
   /*! \brief Generate the inverse mapping.
    *
diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py
index 12c8053e39cc..e525fc2cc31a 100644
--- a/python/tvm/tir/function.py
+++ b/python/tvm/tir/function.py
@@ -271,6 +271,12 @@ class IndexMap(Object):
         Variables representing the indices prior to remapping.
     final_indices : List[PrimExpr]
         Expressions defining the indices after remapping.
+    inverse_index_map : Union[Callable, Optional[IndexMap]]
+        The optional pre-defined inverse index map.
+        When this is defined, IndexMap::Inverse will return the pre-defined inverse index map.
+        Otherwise, the inverse index map will be computed on the fly.
+        It is the user's responsibility to ensure the correctness of the pre-defined inverse
+        index map.
     """
 
     initial_indices: List[Var]
@@ -281,11 +287,19 @@ class IndexMap(Object):
     # Stage.transform_layout for more details.
     AXIS_SEPARATOR = "axis_separator"
 
-    def __init__(self, initial_indices, final_indices):
-        self.__init_handle_by_constructor__(_ffi_api.IndexMap, initial_indices, final_indices)
+    def __init__(self, initial_indices, final_indices, inverse_index_map):
+        if isinstance(inverse_index_map, Callable):
+            inverse_index_map = IndexMap.from_func(inverse_index_map)
+        self.__init_handle_by_constructor__(
+            _ffi_api.IndexMap, initial_indices, final_indices, inverse_index_map
+        )
 
     @staticmethod
-    def from_func(mapping_function: Callable, ndim: Optional[int] = None):
+    def from_func(
+        mapping_function: Callable,
+        ndim: Optional[int] = None,
+        inverse_index_map: Union[Callable, Optional["IndexMap"]] = None,
+    ):
         """Create an index map from a function
 
         Parameters
@@ -305,6 +319,13 @@ def from_func(mapping_function: Callable, ndim: Optional[int] = None):
             mapping_function does not use variadic arguments, ndim is
             optional.
 
+        inverse_index_map : Union[Callable, Optional[IndexMap]]
+            The optional pre-defined inverse index map.
+            When this is defined, IndexMap::Inverse will return the pre-defined inverse index map.
+            Otherwise, the inverse index map will be computed on the fly.
+            It is the user's responsibility to ensure the correctness of the pre-defined inverse
+            index map.
+
         Returns
         -------
         index_map: IndexMap
@@ -312,7 +333,9 @@ def from_func(mapping_function: Callable, ndim: Optional[int] = None):
             Returns an IndexMap representing the `mapping_function`.
 
         """
-        index_map, axis_separators = IndexMap.from_func_with_separators(mapping_function, ndim)
+        index_map, axis_separators = IndexMap.from_func_with_separators(
+            mapping_function, ndim, inverse_index_map
+        )
         assert not axis_separators, (
             "The mapping_function provided to IndexMap.from_func "
             "may not return IndexMap.AXIS_SEPARATOR.  "
@@ -321,7 +344,11 @@ def from_func(mapping_function: Callable, ndim: Optional[int] = None):
         return index_map
 
     @staticmethod
-    def from_func_with_separators(mapping_function: Callable, ndim: Optional[int] = None):
+    def from_func_with_separators(
+        mapping_function: Callable,
+        ndim: Optional[int] = None,
+        inverse_index_map: Union[Callable, Optional["IndexMap"]] = None,
+    ):
         """Create an index map from a function
 
         Parameters
@@ -341,6 +368,13 @@ def from_func_with_separators(mapping_function: Callable, ndim: Optional[int] =
             mapping_function does not use variadic arguments, ndim is
             optional.
 
+        inverse_index_map : Union[Callable, Optional[IndexMap]]
+            The optional pre-defined inverse index map.
+            When this is defined, IndexMap::Inverse will return the pre-defined inverse index map.
+            Otherwise, the inverse index map will be computed on the fly.
+            It is the user's responsibility to ensure the correctness of the pre-defined inverse
+            index map.
+
         Returns
         -------
         ret: Tuple[IndexMap, List[int]]
@@ -401,7 +435,7 @@ def from_func_with_separators(mapping_function: Callable, ndim: Optional[int] =
                     f"Instead received {val} of type {type(val)}."
                 )
 
-        return IndexMap(initial_indices, final_indices), axis_separators
+        return IndexMap(initial_indices, final_indices, inverse_index_map), axis_separators
 
     def is_equivalent_to(self, other_map: "IndexMap") -> bool:
         """Return if the index maps are equivalent.
diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc
index 0e3c3b2774c8..cceff72ec82f 100644
--- a/src/tir/ir/index_map.cc
+++ b/src/tir/ir/index_map.cc
@@ -34,20 +34,23 @@
 namespace tvm {
 namespace tir {
 
-IndexMap::IndexMap(Array<Var> initial_indices, Array<PrimExpr> final_indices) {
+IndexMap::IndexMap(Array<Var> initial_indices, Array<PrimExpr> final_indices,
+                   Optional<IndexMap> inverse_index_map) {
   auto n = make_object<IndexMapNode>();
   n->initial_indices = std::move(initial_indices);
   n->final_indices = std::move(final_indices);
+  n->inverse_index_map = std::move(inverse_index_map);
   data_ = std::move(n);
 }
 
-IndexMap IndexMap::FromFunc(int ndim, runtime::TypedPackedFunc<Array<PrimExpr>(Array<Var>)> func) {
+IndexMap IndexMap::FromFunc(int ndim, runtime::TypedPackedFunc<Array<PrimExpr>(Array<Var>)> func,
+                            Optional<IndexMap> inverse_index_map) {
   Array<Var> initial_indices;
   initial_indices.reserve(ndim);
   for (int i = 0; i < ndim; ++i) {
     initial_indices.push_back(Var("i" + std::to_string(i), DataType::Int(32)));
   }
-  return IndexMap(initial_indices, func(initial_indices));
+  return IndexMap(initial_indices, func(initial_indices), std::move(inverse_index_map));
 }
 
 std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initial_ranges) const {
@@ -114,6 +117,10 @@ std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initia
 }
 
 IndexMap IndexMap::Inverse(Array<Range> initial_ranges) const {
+  if ((*this)->inverse_index_map.defined()) {
+    // return the pre-defined inverse index map if exists.
+    return Downcast<IndexMap>((*this)->inverse_index_map.value());
+  }
   // Dummy variables to represent the inverse's inputs.
   Array<Var> output_vars;
   for (size_t i = 0; i < (*this)->final_indices.size(); i++) {
@@ -232,7 +239,14 @@ Array<PrimExpr> IndexMapNode::MapShape(const Array<PrimExpr>& shape,
   return output;
 }
 
-String IndexMapNode::ToPythonString() const {
+/*!
+ * \brief Auxilarry function to comvert an index map to lambda expression in Python.
+ * \param initial_indices The initial indices in the index map.
+ * \param final_indices The final indices in the index map.
+ * \return The lambda expression string.
+ */
+std::string IndexMap2PythonLambdaExpr(const Array<Var>& initial_indices,
+                                      const Array<PrimExpr>& final_indices) {
   std::unordered_set<std::string> used_names;
   Map<Var, PrimExpr> var_remap;
   for (const Var& initial_index : initial_indices) {
@@ -259,10 +273,28 @@ String IndexMapNode::ToPythonString() const {
   }
   oss << ": (";
   for (size_t i = 0; i < final_indices.size(); ++i) {
+    if (i != 0) {
+      oss << " ";
+    }
     oss << Substitute(final_indices[i], var_remap);
-    oss << ", ";
+    oss << ",";
   }
   oss << ")";
+  return oss.str();
+}
+
+String IndexMapNode::ToPythonString() const {
+  std::string lambda_expr = IndexMap2PythonLambdaExpr(initial_indices, final_indices);
+  if (!inverse_index_map.defined()) {
+    return String(lambda_expr);
+  }
+  // Also convert the inverse index map.
+  IndexMap inverse = Downcast<IndexMap>(inverse_index_map.value());
+  std::string inverse_lambda_expr =
+      IndexMap2PythonLambdaExpr(inverse->initial_indices, inverse->final_indices);
+  std::ostringstream oss;
+  oss << "tvm.tir.IndexMap.from_func(" << lambda_expr
+      << ", inverse_index_map=" << inverse_lambda_expr << ")";
   return String(oss.str());
 }
 
@@ -275,8 +307,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 TVM_REGISTER_NODE_TYPE(IndexMapNode);
 
 TVM_REGISTER_GLOBAL("tir.IndexMap")
-    .set_body_typed([](Array<Var> initial_indices, Array<PrimExpr> final_indices) {
-      return IndexMap(initial_indices, final_indices);
+    .set_body_typed([](Array<Var> initial_indices, Array<PrimExpr> final_indices,
+                       Optional<IndexMap> inverse_index_map) {
+      return IndexMap(initial_indices, final_indices, inverse_index_map);
     });
 
 TVM_REGISTER_GLOBAL("tir.IndexMapMapIndices")
diff --git a/src/tir/schedule/analysis/layout.cc b/src/tir/schedule/analysis/layout.cc
index b0cafac3151f..b071b2d7e4a1 100644
--- a/src/tir/schedule/analysis/layout.cc
+++ b/src/tir/schedule/analysis/layout.cc
@@ -167,20 +167,25 @@ Optional<IndexMap> SuggestIndexMap(const Buffer& buffer, const Array<PrimExpr>&
     }
     return a.lower_factor > b.lower_factor;
   });
+  // Compute the inverse permutation by argsort
+  std::vector<int> inverse_order = order;
+  std::sort(inverse_order.begin(), inverse_order.end(),
+            [&order](int _a, int _b) -> bool { return order[_a] < order[_b]; });
   // Step 5. Create the indexing mapping
   auto f_alter_layout = [f_flatten_index = std::move(f_flatten_index),  //
-                         split_exprs = std::move(split_exprs),          //
-                         order = std::move(order),                      //
-                         shape = buffer->shape,                         //
+                         &split_exprs,                                  //
+                         &order,                                        //
+                             & shape = buffer->shape,                   //
                          analyzer                                       //
   ](Array<Var> indices) -> Array<PrimExpr> {
     ICHECK_EQ(indices.size(), shape.size());
     for (int i = 0, n = indices.size(); i < n; ++i) {
       analyzer->Bind(indices[i], Range::FromMinExtent(0, shape[i]));
     }
+    // Step 5.1: Fuse all indices into a flattened one
     PrimExpr index = f_flatten_index({indices.begin(), indices.end()});
     int ndim = split_exprs.size();
-    // Step 5.1. Split the flattened index according to `split_exprs`
+    // Step 5.2. Split the flattened index according to `split_exprs`
     std::vector<PrimExpr> split;
     split.reserve(ndim);
     for (int i = ndim - 1; i >= 0; --i) {
@@ -190,7 +195,7 @@ Optional<IndexMap> SuggestIndexMap(const Buffer& buffer, const Array<PrimExpr>&
       index = floordiv(index, extent);
     }
     std::reverse(split.begin(), split.end());
-    // Step 5.2. Reorder the indexing pattern according to `order`
+    // Step 5.3. Reorder the indexing pattern according to `order`
     Array<PrimExpr> results;
     results.reserve(ndim);
     for (int i = 0; i < ndim; ++i) {
@@ -198,7 +203,39 @@ Optional<IndexMap> SuggestIndexMap(const Buffer& buffer, const Array<PrimExpr>&
     }
     return results;
   };
-  return IndexMap::FromFunc(ndim, f_alter_layout);
+  // Step 6: Create the inverse index mapping.
+  auto f_inverse = [&inverse_order, &split_exprs, &shape = buffer->shape,
+                    analyzer](Array<Var> indices) -> Array<PrimExpr> {
+    ICHECK_EQ(indices.size(), split_exprs.size());
+    // Step 6.1: Reorder the indices according to `inverse_order`. This is the inverse of Step 5.3.
+    // After the inverse permutation, indices[i] corresponds to split_exprs[i]
+    Array<Var> inv_permuted_indices;
+    inv_permuted_indices.reserve(indices.size());
+    for (int i = 0, n = indices.size(); i < n; ++i) {
+      const Var& index = indices[inverse_order[i]];
+      inv_permuted_indices.push_back(index);
+      analyzer->Bind(index, Range::FromMinExtent(0, Integer(split_exprs[i].extent)));
+    }
+
+    // Step 6.2: Fuse all the indices. This is the inverse of Step 5.2.
+    PrimExpr flattened_index = make_const(indices[0]->dtype, 0);
+    int64_t stride = 1;
+    for (int i = static_cast<int>(split_exprs.size()) - 1; i >= 0; --i) {
+      flattened_index = inv_permuted_indices[i] * Integer(stride) + flattened_index;
+      stride *= split_exprs[i].extent;
+    }
+    // Step 6.3: Split the flattened index into multiple indices. This is the inverse of Step 5.1.
+    Array<PrimExpr> result;
+    result.reserve(shape.size());
+    for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
+      PrimExpr index = analyzer->Simplify(floormod(flattened_index, shape[i]));
+      flattened_index = floordiv(flattened_index, shape[i]);
+      result.push_back(index);
+    }
+    return Array<PrimExpr>(result.rbegin(), result.rend());
+  };
+  IndexMap inverse_index_map = IndexMap::FromFunc(split_exprs.size(), f_inverse);
+  return IndexMap::FromFunc(ndim, f_alter_layout, inverse_index_map);
 }
 
 TVM_REGISTER_GLOBAL("tir.schedule.SuggestIndexMap")
diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py
index 5524abbaf094..378e5183b49c 100644
--- a/tests/python/unittest/test_tir_schedule_analysis.py
+++ b/tests/python/unittest/test_tir_schedule_analysis.py
@@ -101,6 +101,47 @@ def test_suggest_index_map_bijective():
     assert index_map.is_equivalent_to(expected_index_map)
 
 
+def test_suggest_index_map_winograd():
+    """use case in winograd conv where the indices are complicated"""
+    fused_outer, i3_3_fused, i4_0, i4_1 = _make_vars("fused_outer", "i3_3_fused", "i4_0", "i4_1")
+    eps = floordiv(fused_outer, 336) * 2 + floordiv(floormod(fused_outer, 16), 8)
+    nu = floordiv(floormod(fused_outer, 336), 112) * 2 + floordiv(floormod(fused_outer, 8), 4)
+    co = floormod(fused_outer, 4) * 32 + i3_3_fused
+    ci = (i4_0 * 32) + i4_1
+    buffer = decl_buffer(shape=[6, 6, 128, 128])
+    index_map = suggest_index_map(
+        buffer=buffer,
+        indices=[eps, nu, co, ci],
+        loops=_make_loops(
+            loop_vars=[fused_outer, i3_3_fused, i4_0, i4_1],
+            extents=[1008, 32, 4, 32],
+        ),
+        predicate=True,
+    )
+    expected_index_map = IndexMap.from_func(
+        lambda i0, i1, i2, i3: (
+            floordiv(i0, 2),
+            floordiv(i1, 2),
+            floormod(i0, 2),
+            floormod(((i1 * 4) + floordiv(i2, 32)), 8),
+            floormod(i2, 32),
+            floordiv(i3, 32),
+            floormod(i3, 32),
+        )
+    )
+    assert index_map.is_equivalent_to(expected_index_map)
+    inverse_index_map = index_map.inverse(buffer.shape)
+    expected_inverse_index_map = IndexMap.from_func(
+        lambda i0, i1, i2, i3, i4, i5, i6: (
+            ((i0 * 2) + i2),
+            ((i1 * 2) + floordiv(((i3 * 32) + i4), 128)),
+            floormod(((i3 * 32) + i4), 128),
+            ((i5 * 32) + i6),
+        )
+    )
+    assert inverse_index_map.is_equivalent_to(expected_inverse_index_map)
+
+
 @tvm.script.ir_module
 class DenseVNNIModule:
     @T.prim_func

From e92f5d43f334752d4928764aa7203f229a07bd9b Mon Sep 17 00:00:00 2001
From: Ruihang Lai <ruihangl@cs.cmu.edu>
Date: Sat, 17 Sep 2022 11:08:34 -0400
Subject: [PATCH 198/704] [BugFix][TIR] Fix Buffer LCA Detector (#12819)

Prior to this PR, the LCA detector of buffers in TIR didn't take buffer memory scopes and GPU hierarchy into consideration. An consequent issue is that, when an intermediate buffer is in global memory, TIR's lowering passes don't necessarily allocated the intermediate buffer outside all `blockIdx`. As a result, the global intermediate buffer is allocated under a GPU thread block, which is illegal.

This PR fixes this issue by fixing the LCA detector, making it be aware of the buffer memory scopes and GPU hierarchy. With this fix, the global intermediate buffers are all allocated outside `blockIdx`.
---
 .../analysis/buffer_access_lca_detector.cc    | 45 ++++++++++++++++++-
 ...t_tir_analysis_detect_buffer_access_lca.py | 26 +++++++++++
 2 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/src/tir/analysis/buffer_access_lca_detector.cc b/src/tir/analysis/buffer_access_lca_detector.cc
index b71e6b27f486..7197e1ba83c5 100644
--- a/src/tir/analysis/buffer_access_lca_detector.cc
+++ b/src/tir/analysis/buffer_access_lca_detector.cc
@@ -25,6 +25,7 @@
 #include <tvm/tir/analysis.h>
 #include <tvm/tir/stmt_functor.h>
 
+#include "../../runtime/thread_storage_scope.h"
 #include "../../support/arena.h"
 
 namespace tvm {
@@ -32,7 +33,11 @@ namespace tir {
 
 /*!
  * \brief Detect the lowest common ancestor(LCA) position of Buffer access.
- * \note Only consider BlockNode and ForNode to be the LCA nodes.
+ * \note
+ * - Only consider BlockNode and ForNode to be the LCA nodes.
+ * - In the LCA locator, we are aware of the buffer scope and CUDA hierarchy so that any buffer in
+ * global memory will have its buffer access LCA outside all launch sites of `blockIdx`, in order to
+ * prevent conflicts between buffer memory scopes and CUDA hierarchy.
  */
 class LCADetector : public StmtExprVisitor {
  public:
@@ -51,6 +56,8 @@ class LCADetector : public StmtExprVisitor {
     detector.ancestor_scopes_.push_back(&root);
 
     detector(func->body);
+    detector.UpdateWithBlockidx();
+
     // Prepare the return
     Map<Buffer, Optional<Stmt>> buffer_lca;
     for (const auto& kv : detector.buffer_lca_) {
@@ -82,6 +89,15 @@ class LCADetector : public StmtExprVisitor {
     int n = ancestor_scopes_.size();
     const ScopeInfo* parent_scope = ancestor_scopes_.back();
     auto* current_scope = arena_.make<ScopeInfo>(parent_scope, op, n);
+
+    if (op->thread_binding.defined()) {
+      const runtime::ThreadScope& scope =
+          runtime::ThreadScope::Create(op->thread_binding.value()->thread_tag);
+      if (scope.rank == 0) {
+        blockidx_scopes_.push_back(current_scope);
+      }
+    }
+
     ancestor_scopes_.push_back(current_scope);
     StmtExprVisitor::VisitStmt_(op);
     ancestor_scopes_.pop_back();
@@ -107,6 +123,18 @@ class LCADetector : public StmtExprVisitor {
     ancestor_scopes_.pop_back();
   }
 
+  void VisitStmt_(const AttrStmtNode* op) final {
+    if (op->attr_key == attr::thread_extent) {
+      const auto* iter = op->node.as<IterVarNode>();
+      ICHECK_NOTNULL(iter);
+      const runtime::ThreadScope& scope = runtime::ThreadScope::Create(iter->thread_tag);
+      if (scope.rank == 0) {
+        blockidx_scopes_.push_back(ancestor_scopes_.back());
+      }
+    }
+    StmtExprVisitor::VisitStmt_(op);
+  }
+
   void VisitExpr_(const BufferLoadNode* op) final {
     UpdateBufferLCA(op->buffer.get());
     StmtExprVisitor::VisitExpr_(op);
@@ -150,6 +178,19 @@ class LCADetector : public StmtExprVisitor {
     }
   }
 
+  void UpdateWithBlockidx() {
+    for (const auto& it : buffer_lca_) {
+      const runtime::StorageScope& scope =
+          runtime::StorageScope::Create(GetRef<Buffer>(it.first).scope());
+      if (scope.rank == runtime::StorageRank::kGlobal) {
+        const ScopeInfo*& lca = buffer_lca_[it.first];
+        for (const ScopeInfo* blockidx_scope : blockidx_scopes_) {
+          lca = LowestCommonAncestor(lca, blockidx_scope);
+        }
+      }
+    }
+  }
+
   static const ScopeInfo* LowestCommonAncestor(const ScopeInfo* lhs, const ScopeInfo* rhs) {
     if (lhs == nullptr) return rhs;
     if (rhs == nullptr) return lhs;
@@ -186,6 +227,8 @@ class LCADetector : public StmtExprVisitor {
   std::unordered_map<const VarNode*, const BufferNode*> buffer_var_map_ = {};
   /*! \brief The match buffers inside blocks. */
   std::unordered_set<const BufferNode*> match_buffers_ = {};
+  /*! \brief The ForNodes/BlockNodes which contain immediate `blockIdx` launch. */
+  std::vector<const ScopeInfo*> blockidx_scopes_ = {};
   /*! \brief Internal arena. */
   support::Arena arena_;
 };
diff --git a/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py b/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py
index 344f37a23677..d438427e1fe1 100644
--- a/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py
+++ b/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py
@@ -93,6 +93,19 @@ def match_buffer_func(a: T.handle, b: T.handle) -> None:
             T.evaluate(B1.data)
 
 
+@T.prim_func
+def global_buffer_with_blockidx(
+    a: T.Buffer[(1, 32), "int32"], b: T.Buffer[(1, 32), "int32"]
+) -> None:
+    for i0 in T.thread_binding(0, 1, thread="blockIdx.x"):
+        for i1 in T.thread_binding(0, 32, thread="threadIdx.x"):
+            with T.block("copy"):
+                i, j = T.axis.remap("SS", [i0, i1])
+                T.reads(a[i, j])
+                T.writes(b[i, j])
+                b[i, j] = a[i, j]
+
+
 def test_buffer_load_store():
     func = buffer_load_store_func
     A, B = [func.buffer_map[x] for x in func.params]
@@ -154,8 +167,21 @@ def test_match_buffer():
     assert lca[B] == block
 
 
+def test_global_buffer_with_blockidx():
+    func = global_buffer_with_blockidx
+    A, B = [func.buffer_map[x] for x in func.params]
+    lca = tir.analysis.detect_buffer_access_lca(func)
+
+    root_block = func.body.block
+    blockidx_loop = root_block.body
+    # LCA of both A and B should be the loop bound to `blockIdx`
+    assert lca[A] == blockidx_loop
+    assert lca[B] == blockidx_loop
+
+
 if __name__ == "__main__":
     test_buffer_load_store()
     test_opaque_access()
     test_lca_func_root()
     test_match_buffer()
+    test_global_buffer_with_blockidx()

From 1ecf084eecaff167967df1a8c998de72e1198c24 Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Sat, 17 Sep 2022 16:54:01 -0400
Subject: [PATCH 199/704] [TVMScript] Add more helper functions to the printer
 infra  (#12829)

This PR is split from https://github.com/apache/tvm/pull/12492, to make the necessary updates to the printer infra for future PRs of TIR printer.

Tracking issue: https://github.com/apache/tvm/issues/11912

Co-authored-by: Greg Bonik <gbonik@octoml.ai>
---
 include/tvm/script/printer/doc.h              | 64 +++++++++++++
 .../script/printer/traced_object_functor.h    | 37 +-------
 include/tvm/script/printer/var_table.h        | 11 +++
 src/script/printer/doc.cc                     | 30 ++++--
 src/script/printer/ir_docsifier.cc            |  2 +-
 src/script/printer/utils.h                    | 93 +++++++++++++++++++
 src/script/printer/var_table.cc               |  3 +-
 .../cpp/tvmscript_printer_irdocsifier_test.cc | 13 ++-
 ...ript_printer_traced_object_functor_test.cc | 37 ++++----
 9 files changed, 228 insertions(+), 62 deletions(-)
 create mode 100644 src/script/printer/utils.h

diff --git a/include/tvm/script/printer/doc.h b/include/tvm/script/printer/doc.h
index 72f343354b1b..1ee7fd6a7fd4 100644
--- a/include/tvm/script/printer/doc.h
+++ b/include/tvm/script/printer/doc.h
@@ -22,6 +22,7 @@
 #include <tvm/ir/expr.h>
 #include <tvm/node/node.h>
 #include <tvm/runtime/data_type.h>
+#include <tvm/script/printer/traced_object.h>
 
 namespace tvm {
 namespace script {
@@ -87,6 +88,15 @@ class ExprDocNode : public DocNode {
    */
   ExprDoc Attr(String attr) const;
 
+  /*!
+   * \brief Create a doc representing attribute access on the current ExprDoc
+   * \param attr The attribute to access.
+   *
+   * The ObjectPath of attr will be pushed to the source_path of the returned
+   * doc.
+   */
+  ExprDoc Attr(TracedObject<String> attr) const;
+
   /*!
    * \brief Create a doc representing index access on the current ExprDoc
    * \param indices The indices to access.
@@ -242,6 +252,7 @@ class LiteralDocNode : public ExprDocNode {
 class LiteralDoc : public ExprDoc {
  protected:
   explicit LiteralDoc(ObjectRef value);
+  LiteralDoc(ObjectRef value, ObjectPath object_path);
 
  public:
   /*!
@@ -249,30 +260,83 @@ class LiteralDoc : public ExprDoc {
    */
   static LiteralDoc None() { return LiteralDoc(ObjectRef(nullptr)); }
 
+  /*!
+   * \brief Create a LiteralDoc to represent None/null/empty value.
+   * \param object_path The source path of the returned Doc.
+   */
+  static LiteralDoc None(ObjectPath object_path) {
+    return LiteralDoc(ObjectRef(nullptr), object_path);
+  }
+
   /*!
    * \brief Create a LiteralDoc to represent integer.
    * \param v The integer value.
    */
   static LiteralDoc Int(int v) { return LiteralDoc(IntImm(DataType::Int(64), v)); }
 
+  /*!
+   * \brief Create a LiteralDoc to represent integer.
+   * \param v The integer value.
+   *
+   * The ObjectPath of v will be pushed to the source_path of the returned doc.
+   */
+  static LiteralDoc Int(const TracedObject<IntImm>& v) { return LiteralDoc(v.Get(), v.GetPath()); }
+
+  /*!
+   * \brief Create a LiteralDoc to represent integer.
+   * \param v The integer value.
+   *
+   * The ObjectPath of v will be pushed to the source_path of the returned doc.
+   */
+  static LiteralDoc Int(const TracedBasicValue<int>& v) {
+    return LiteralDoc(IntImm(DataType::Int(64), v.Get()), v.GetPath());
+  }
   /*!
    * \brief Create a LiteralDoc to represent boolean.
    * \param v The boolean value.
    */
   static LiteralDoc Boolean(bool v) { return LiteralDoc(IntImm(DataType::Bool(), v)); }
 
+  /*!
+   * \brief Create a LiteralDoc to represent boolean.
+   * \param v The boolean value.
+   *
+   * The ObjectPath of v will be pushed to the source_path of the returned doc.
+   */
+  static LiteralDoc Boolean(const TracedBasicValue<bool>& v) {
+    return LiteralDoc(IntImm(DataType::Bool(), v.Get()), v.GetPath());
+  }
+
   /*!
    * \brief Create a LiteralDoc to represent float.
    * \param v The float value.
    */
   static LiteralDoc Float(double v) { return LiteralDoc(FloatImm(DataType::Float(64), v)); }
 
+  /*!
+   * \brief Create a LiteralDoc to represent float.
+   * \param v The float value.
+   *
+   * The ObjectPath of v will be pushed to the source_path of the returned doc.
+   */
+  static LiteralDoc Float(const TracedObject<FloatImm>& v) {
+    return LiteralDoc(v.Get(), v.GetPath());
+  }
+
   /*!
    * \brief Create a LiteralDoc to represent string.
    * \param v The string value.
    */
   static LiteralDoc Str(const String& v) { return LiteralDoc(v); }
 
+  /*!
+   * \brief Create a LiteralDoc to represent string.
+   * \param v The string value.
+   *
+   * The ObjectPath of v will be pushed to the source_path of the returned doc.
+   */
+  static LiteralDoc Str(const TracedObject<String>& v) { return LiteralDoc(v.Get(), v.GetPath()); }
+
   TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(LiteralDoc, ExprDoc, LiteralDocNode);
 };
 
diff --git a/include/tvm/script/printer/traced_object_functor.h b/include/tvm/script/printer/traced_object_functor.h
index 6caaf8a6e0d5..8f72d139a5a5 100644
--- a/include/tvm/script/printer/traced_object_functor.h
+++ b/include/tvm/script/printer/traced_object_functor.h
@@ -34,35 +34,6 @@ namespace tvm {
 namespace script {
 namespace printer {
 
-namespace {
-
-namespace detail {
-/*!
- * \brief Helper template class to extract the type of first argument of a function
- * \tparam FType The function type.
- */
-template <typename FType>
-struct FirstArgTypeGetter;
-
-template <typename R, typename ArgOne, typename... OtherArgs>
-struct FirstArgTypeGetter<R(ArgOne, OtherArgs...)> {
-  using T = ArgOne;
-};
-
-/*!
- * \brief Template alias for the type of first argument of a function
- * \tparam FType The function type.
- *
- * The name of public functions are in snake case to be consistent with
- * tvm/node/functor.h
- */
-template <typename FType>
-using FirstArgType = typename detail::FirstArgTypeGetter<
-    typename tvm::runtime::detail::function_signature<FType>::FType>::T;
-}  // namespace detail
-
-}  // namespace
-
 /*
  * This type alias and the following free functions are created to reduce the binary bloat
  * from template and also hide implementation details from this header
@@ -156,8 +127,7 @@ class TracedObjectFunctor {
    *
    * The diaptch function should have signature `R(TracedObject<TObjectRef>, Args...)`.
    */
-  template <typename TCallable,
-            typename TObjectRef = typename detail::FirstArgType<TCallable>::ObjectRefType,
+  template <typename TObjectRef, typename TCallable,
             typename = std::enable_if_t<IsDispatchFunction<TObjectRef, TCallable>::value>>
   TSelf& set_dispatch(String token, TCallable f) {
     return set_dispatch(
@@ -177,9 +147,10 @@ class TracedObjectFunctor {
    *
    * Default dispatch function has an empty string as dispatch token.
    */
-  template <typename TCallable>
+  template <typename TObjectRef, typename TCallable,
+            typename = std::enable_if_t<IsDispatchFunction<TObjectRef, TCallable>::value>>
   TSelf& set_dispatch(TCallable&& f) {
-    return set_dispatch(kDefaultDispatchToken, std::forward<TCallable>(f));
+    return set_dispatch<TObjectRef>(kDefaultDispatchToken, std::forward<TCallable>(f));
   }
 
   /*!
diff --git a/include/tvm/script/printer/var_table.h b/include/tvm/script/printer/var_table.h
index 9300a976c569..2cd9335213a3 100644
--- a/include/tvm/script/printer/var_table.h
+++ b/include/tvm/script/printer/var_table.h
@@ -103,6 +103,17 @@ class VarTableNode : public Object {
    */
   Optional<ExprDoc> GetVarDoc(const ObjectRef& obj, const ObjectPath& object_path) const;
 
+  /*!
+   * \brief Get the doc for variable.
+   * \param obj The traced variable object.
+   *
+   * \return The doc for variable, if it exists in the table. Otherwise it returns NullOpt.
+   */
+  template <typename TObjectRef>
+  Optional<ExprDoc> GetVarDoc(const TracedObject<TObjectRef> obj) const {
+    return GetVarDoc(obj.Get(), obj.GetPath());
+  }
+
   /*!
    * \brief Check if a variable exists in the table.
    * \param obj The variable object.
diff --git a/src/script/printer/doc.cc b/src/script/printer/doc.cc
index d6f5ff35ab53..f3b431bd62db 100644
--- a/src/script/printer/doc.cc
+++ b/src/script/printer/doc.cc
@@ -27,6 +27,12 @@ namespace printer {
 
 ExprDoc ExprDocNode::Attr(String attr) const { return AttrAccessDoc(GetRef<ExprDoc>(this), attr); }
 
+ExprDoc ExprDocNode::Attr(TracedObject<String> attr) const {
+  auto doc = AttrAccessDoc(GetRef<ExprDoc>(this), attr.Get());
+  doc->source_paths.push_back(attr.GetPath());
+  return doc;
+}
+
 ExprDoc ExprDocNode::operator[](Array<Doc> indices) const {
   return IndexDoc(GetRef<ExprDoc>(this), indices);
 }
@@ -54,6 +60,13 @@ LiteralDoc::LiteralDoc(ObjectRef value) {
   this->data_ = std::move(n);
 }
 
+LiteralDoc::LiteralDoc(ObjectRef value, ObjectPath object_path) {
+  ObjectPtr<LiteralDocNode> n = make_object<LiteralDocNode>();
+  n->value = value;
+  n->source_paths.push_back(object_path);
+  this->data_ = std::move(n);
+}
+
 IdDoc::IdDoc(String name) {
   ObjectPtr<IdDocNode> n = make_object<IdDocNode>();
   n->name = name;
@@ -225,7 +238,8 @@ TVM_REGISTER_GLOBAL("script.printer.DocSetSourcePaths")
     });
 
 TVM_REGISTER_NODE_TYPE(ExprDocNode);
-TVM_REGISTER_GLOBAL("script.printer.ExprDocAttr").set_body_method<ExprDoc>(&ExprDocNode::Attr);
+TVM_REGISTER_GLOBAL("script.printer.ExprDocAttr")
+    .set_body_method<ExprDoc, ExprDocNode, ExprDoc, String>(&ExprDocNode::Attr);
 TVM_REGISTER_GLOBAL("script.printer.ExprDocIndex")
     .set_body_method<ExprDoc>(&ExprDocNode::operator[]);
 TVM_REGISTER_GLOBAL("script.printer.ExprDocCall")
@@ -242,11 +256,15 @@ TVM_REGISTER_GLOBAL("script.printer.StmtBlockDoc").set_body_typed([](Array<StmtD
 });
 
 TVM_REGISTER_NODE_TYPE(LiteralDocNode);
-TVM_REGISTER_GLOBAL("script.printer.LiteralDocNone").set_body_typed(LiteralDoc::None);
-TVM_REGISTER_GLOBAL("script.printer.LiteralDocInt").set_body_typed(LiteralDoc::Int);
-TVM_REGISTER_GLOBAL("script.printer.LiteralDocBoolean").set_body_typed(LiteralDoc::Boolean);
-TVM_REGISTER_GLOBAL("script.printer.LiteralDocFloat").set_body_typed(LiteralDoc::Float);
-TVM_REGISTER_GLOBAL("script.printer.LiteralDocStr").set_body_typed(LiteralDoc::Str);
+TVM_REGISTER_GLOBAL("script.printer.LiteralDocNone").set_body_typed<LiteralDoc()>(LiteralDoc::None);
+TVM_REGISTER_GLOBAL("script.printer.LiteralDocInt")
+    .set_body_typed<LiteralDoc(int)>(LiteralDoc::Int);
+TVM_REGISTER_GLOBAL("script.printer.LiteralDocBoolean")
+    .set_body_typed<LiteralDoc(bool)>(LiteralDoc::Boolean);
+TVM_REGISTER_GLOBAL("script.printer.LiteralDocFloat")
+    .set_body_typed<LiteralDoc(double)>(LiteralDoc::Float);
+TVM_REGISTER_GLOBAL("script.printer.LiteralDocStr")
+    .set_body_typed<LiteralDoc(const String&)>(LiteralDoc::Str);
 
 TVM_REGISTER_NODE_TYPE(IdDocNode);
 TVM_REGISTER_GLOBAL("script.printer.IdDoc").set_body_typed([](String name) { return IdDoc(name); });
diff --git a/src/script/printer/ir_docsifier.cc b/src/script/printer/ir_docsifier.cc
index b72ed48db63b..7f032ec50269 100644
--- a/src/script/printer/ir_docsifier.cc
+++ b/src/script/printer/ir_docsifier.cc
@@ -61,7 +61,7 @@ RootNodeContainer::RootNodeContainer(ObjectRef root_node) {
 //     });
 // \endcode
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
-    .set_dispatch([](TracedObject<RootNodeContainer> obj, IRDocsifier p) -> Doc {
+    .set_dispatch<RootNodeContainer>([](TracedObject<RootNodeContainer> obj, IRDocsifier p) -> Doc {
       String top_dispatch_token = p->dispatch_tokens.back();
       ICHECK_NE(top_dispatch_token, "");
       ICHECK(false) << "Printing IR " << top_dispatch_token << " is not implemented.";
diff --git a/src/script/printer/utils.h b/src/script/printer/utils.h
new file mode 100644
index 000000000000..abe7ce5e9a88
--- /dev/null
+++ b/src/script/printer/utils.h
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_SCRIPT_PRINTER_UTILS_H_
+#define TVM_SCRIPT_PRINTER_UTILS_H_
+
+#include <tvm/script/printer/doc.h>
+#include <tvm/script/printer/ir_docsifier.h>
+
+#include <utility>
+
+namespace tvm {
+namespace script {
+namespace printer {
+
+template <typename DocType, typename NodeType>
+Array<DocType> AsDocArray(const TracedArray<NodeType>& refs, const IRDocsifier& ir_docsifier) {
+  Array<DocType> result;
+  for (auto ref : refs) {
+    result.push_back(ir_docsifier->AsExprDoc(ref));
+  }
+  return result;
+}
+
+template <typename DocType, typename NodeType>
+Array<DocType> AsDocArray(std::initializer_list<NodeType>&& refs, const IRDocsifier& ir_docsifier) {
+  Array<DocType> result;
+  for (auto& ref : refs) {
+    result.push_back(ir_docsifier->AsExprDoc(ref));
+  }
+  return result;
+}
+
+template <typename RefType>
+Array<ExprDoc> AsExprDocArray(const TracedArray<RefType>& refs, const IRDocsifier& ir_docsifier) {
+  return AsDocArray<ExprDoc>(refs, ir_docsifier);
+}
+
+template <typename RefType>
+Array<ExprDoc> AsExprDocArray(std::initializer_list<RefType>&& refs,
+                              const IRDocsifier& ir_docsifier) {
+  return AsDocArray<ExprDoc>(std::move(refs), ir_docsifier);
+}
+
+inline DictDoc AsDictDoc(const TracedMap<String, ObjectRef>& dict,
+                         const IRDocsifier& ir_docsifier) {
+  Array<ExprDoc> keys;
+  Array<ExprDoc> values;
+
+  for (auto p : dict) {
+    keys.push_back(LiteralDoc::Str(p.first));
+    values.push_back(ir_docsifier->AsExprDoc(p.second));
+  }
+
+  auto doc = DictDoc(keys, values);
+  doc->source_paths.push_back(dict.GetPath());
+  return doc;
+}
+
+template <typename T>
+inline ListDoc AsListDoc(const TracedArray<T>& arr, const IRDocsifier& ir_docsifier) {
+  auto ret = ListDoc(AsExprDocArray(arr, ir_docsifier));
+  ret->source_paths.push_back(arr.GetPath());
+  return ret;
+}
+
+template <typename T>
+inline TupleDoc AsTupleDoc(const TracedArray<T>& arr, const IRDocsifier& ir_docsifier) {
+  auto ret = TupleDoc(AsExprDocArray(arr, ir_docsifier));
+  ret->source_paths.push_back(arr.GetPath());
+  return ret;
+}
+
+}  // namespace printer
+}  // namespace script
+}  // namespace tvm
+
+#endif  // TVM_SCRIPT_PRINTER_UTILS_H_
diff --git a/src/script/printer/var_table.cc b/src/script/printer/var_table.cc
index 49ba93f9bcfe..62d8b2f66cc2 100644
--- a/src/script/printer/var_table.cc
+++ b/src/script/printer/var_table.cc
@@ -99,7 +99,8 @@ TVM_REGISTER_GLOBAL("script.printer.VarTableDefineByDoc")
           obj, [f = std::move(factory)]() { return f(); }, frame);
     });
 TVM_REGISTER_GLOBAL("script.printer.VarTableGetVarDoc")
-    .set_body_method<VarTable>(&VarTableNode::GetVarDoc);
+    .set_body_method<VarTable, VarTableNode, Optional<ExprDoc>, const ObjectRef&,
+                     const ObjectPath&>(&VarTableNode::GetVarDoc);
 TVM_REGISTER_GLOBAL("script.printer.VarTableIsVarDefined")
     .set_body_method<VarTable>(&VarTableNode::IsVarDefined);
 
diff --git a/tests/cpp/tvmscript_printer_irdocsifier_test.cc b/tests/cpp/tvmscript_printer_irdocsifier_test.cc
index fcdb5ed04e41..8c68399df222 100644
--- a/tests/cpp/tvmscript_printer_irdocsifier_test.cc
+++ b/tests/cpp/tvmscript_printer_irdocsifier_test.cc
@@ -45,14 +45,19 @@ class TestObject : public ObjectRef {
 TVM_REGISTER_NODE_TYPE(TestObjectNode);
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
-    .set_dispatch([](TracedObject<TestObject> obj, IRDocsifier p) { return IdDoc("x"); });
+    .set_dispatch<TestObject>([](TracedObject<TestObject> obj, IRDocsifier p) {
+      return IdDoc("x");
+    });
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
-    .set_dispatch("tir", [](TracedObject<TestObject> obj, IRDocsifier p) { return IdDoc("tir"); });
+    .set_dispatch<TestObject>("tir", [](TracedObject<TestObject> obj, IRDocsifier p) {
+      return IdDoc("tir");
+    });
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
-    .set_dispatch("relax",
-                  [](TracedObject<TestObject> obj, IRDocsifier p) { return IdDoc("relax"); });
+    .set_dispatch<TestObject>("relax", [](TracedObject<TestObject> obj, IRDocsifier p) {
+      return IdDoc("relax");
+    });
 
 TEST(PrinterIRDocsifierTest, AsDoc) {
   IRDocsifier p(Map<String, String>{});
diff --git a/tests/cpp/tvmscript_printer_traced_object_functor_test.cc b/tests/cpp/tvmscript_printer_traced_object_functor_test.cc
index 374eb609b6cb..d662ce132405 100644
--- a/tests/cpp/tvmscript_printer_traced_object_functor_test.cc
+++ b/tests/cpp/tvmscript_printer_traced_object_functor_test.cc
@@ -33,7 +33,7 @@ class FooObjectNode : public Object {
  public:
   void VisitAttrs(AttrVisitor* v) {}
 
-  static constexpr const char* _type_key = "test.FooObject";
+  static constexpr const char* _type_key = "test.TracedObjectFunctor.FooObject";
   TVM_DECLARE_FINAL_OBJECT_INFO(FooObjectNode, Object);
 };
 
@@ -49,7 +49,7 @@ class BarObjectNode : public Object {
  public:
   void VisitAttrs(AttrVisitor* v) {}
 
-  static constexpr const char* _type_key = "test.BarObject";
+  static constexpr const char* _type_key = "test.TracedObjectFunctor.BarObject";
   TVM_DECLARE_FINAL_OBJECT_INFO(BarObjectNode, Object);
 };
 
@@ -69,8 +69,8 @@ TEST(TracedObjectFunctorTest, NormalRegistration) {
   TracedObjectFunctor<String> functor;
   ObjectPath path = ObjectPath::Root();
 
-  functor.set_dispatch([](TracedObject<FooObject> o) -> String { return "Foo"; });
-  functor.set_dispatch([](TracedObject<BarObject> o) -> String { return "Bar"; });
+  functor.set_dispatch<FooObject>([](TracedObject<FooObject> o) -> String { return "Foo"; });
+  functor.set_dispatch<BarObject>([](TracedObject<BarObject> o) -> String { return "Bar"; });
 
   ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "Foo");
   ICHECK_EQ(functor("", MakeTraced(BarObject(), path)), "Bar");
@@ -80,8 +80,8 @@ TEST(TracedObjectFunctorTest, RegistrationWithFunction) {
   TracedObjectFunctor<String> functor;
   ObjectPath path = ObjectPath::Root();
 
-  functor.set_dispatch([](TracedObject<FooObject> o) -> String { return "FooLambda"; });
-  functor.set_dispatch("tir", ComputeFoo);
+  functor.set_dispatch<FooObject>([](TracedObject<FooObject> o) -> String { return "FooLambda"; });
+  functor.set_dispatch<FooObject>("tir", ComputeFoo);
 
   ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "FooLambda");
   ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "Foo");
@@ -91,9 +91,11 @@ TEST(TracedObjectFunctorTest, RegistrationWithDispatchToken) {
   TracedObjectFunctor<String> functor;
   ObjectPath path = ObjectPath::Root();
 
-  functor.set_dispatch([](TracedObject<FooObject> o) -> String { return "Foo"; });
-  functor.set_dispatch("tir", [](TracedObject<FooObject> o) -> String { return "Foo tir"; });
-  functor.set_dispatch("relax", [](TracedObject<FooObject> o) -> String { return "Foo relax"; });
+  functor.set_dispatch<FooObject>([](TracedObject<FooObject> o) -> String { return "Foo"; });
+  functor.set_dispatch<FooObject>("tir",
+                                  [](TracedObject<FooObject> o) -> String { return "Foo tir"; });
+  functor.set_dispatch<FooObject>("relax",
+                                  [](TracedObject<FooObject> o) -> String { return "Foo relax"; });
 
   ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "Foo");
   ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "Foo tir");
@@ -119,8 +121,8 @@ TEST(TracedObjectFunctorTest, ExtraArg) {
   TracedObjectFunctor<int, int> functor;
   ObjectPath path = ObjectPath::Root();
 
-  functor.set_dispatch([](TracedObject<FooObject> o, int x) { return x; });
-  functor.set_dispatch([](TracedObject<BarObject> o, int x) { return x + 1; });
+  functor.set_dispatch<FooObject>([](TracedObject<FooObject> o, int x) { return x; });
+  functor.set_dispatch<BarObject>([](TracedObject<BarObject> o, int x) { return x + 1; });
 
   ICHECK_EQ(functor("", MakeTraced(FooObject(), path), 2), 2);
   ICHECK_EQ(functor("", MakeTraced(BarObject(), path), 2), 3);
@@ -131,8 +133,9 @@ TEST(TracedObjectFunctorTest, RemoveDispatchFunction) {
   TracedObjectFunctor<String> functor;
   ObjectPath path = ObjectPath::Root();
 
-  functor.set_dispatch([](TracedObject<FooObject> o) -> String { return "Foo"; });
-  functor.set_dispatch("tir", [](TracedObject<FooObject> o) -> String { return "Foo tir"; });
+  functor.set_dispatch<FooObject>([](TracedObject<FooObject> o) -> String { return "Foo"; });
+  functor.set_dispatch<FooObject>("tir",
+                                  [](TracedObject<FooObject> o) -> String { return "Foo tir"; });
 
   ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "Foo");
   ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "Foo tir");
@@ -158,11 +161,11 @@ TEST(TracedObjectFunctorTest, DuplicateRegistration_WithoutToken) {
   TracedObjectFunctor<int, int> functor;
   ObjectPath path = ObjectPath::Root();
 
-  functor.set_dispatch([](TracedObject<FooObject> o, int x) { return x; });
+  functor.set_dispatch<FooObject>([](TracedObject<FooObject> o, int x) { return x; });
 
   bool failed = false;
   try {
-    functor.set_dispatch([](TracedObject<FooObject> o, int x) { return x; });
+    functor.set_dispatch<FooObject>([](TracedObject<FooObject> o, int x) { return x; });
   } catch (...) {
     failed = true;
   }
@@ -173,11 +176,11 @@ TEST(TracedObjectFunctorTest, DuplicateRegistration_WithToken) {
   TracedObjectFunctor<int, int> functor;
   ObjectPath path = ObjectPath::Root();
 
-  functor.set_dispatch("tir", [](TracedObject<FooObject> o, int x) { return x; });
+  functor.set_dispatch<FooObject>("tir", [](TracedObject<FooObject> o, int x) { return x; });
 
   bool failed = false;
   try {
-    functor.set_dispatch("tir", [](TracedObject<FooObject> o, int x) { return x; });
+    functor.set_dispatch<FooObject>("tir", [](TracedObject<FooObject> o, int x) { return x; });
   } catch (...) {
     failed = true;
   }

From d1871a6957b4f469f1b994aa6c89e0d209b64f05 Mon Sep 17 00:00:00 2001
From: Ruihang Lai <ruihangl@cs.cmu.edu>
Date: Sat, 17 Sep 2022 22:03:17 -0400
Subject: [PATCH 200/704] [MetaSchedule] Relax conditions of rule Cross-Thread
 Reduction (#12825)

This PR relaxes the conditions of Meta-Schedule schedule rule CrossThreadReduction. The rules are previously a bit over-strict, and some workloads with small reduction loop length are unable to be optimized by cross-thread reduction automatically. In this PR, we relax the rules so that such workloads can be optimized.
---
 src/tir/schedule/analysis/analysis.cc         |  6 +-
 ...le_schedule_rule_cross_thread_reduction.py | 98 +++++++++++++++++++
 2 files changed, 100 insertions(+), 4 deletions(-)

diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index 4f78b0c9cd43..e39f7b25543c 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -1640,11 +1640,9 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self,   //
   if (NeedsMultiLevelTiling(self, block_sref)) {
     // Do not use rfactor/cross-thread-reduction if we have enough parallelism on spatial loops.
     return !(cum_space_len >= cum_reduce_len || cum_space_len > max_parallel_extent);
-  } else if (cum_reduce_len > 1) {
-    // Always try rfactor/cross-thread-reduction for other reduction blocks.
-    return cum_reduce_len > max_parallel_basic;
   } else {
-    return false;
+    // Always try rfactor/cross-thread-reduction for other reduction blocks.
+    return cum_reduce_len > 1;
   }
 }
 
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
index 4278638a1aa3..718b264bddd2 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
@@ -589,6 +589,28 @@ def argmax(
             argmax_v1[i] = v_argmax_v1
 
 
+@T.prim_func
+def argmax_32(
+    idx: T.Buffer[(1, 32), "int32"],
+    val: T.Buffer[(1, 32), "float32"],
+    argmax_v0: T.Buffer[(1,), "int32"],
+    argmax_v1: T.Buffer[(1,), "float32"],
+) -> None:
+    for i0, i1 in T.grid(1, 32):
+        with T.block("argmax"):
+            i = T.axis.spatial(1, i0)
+            k = T.axis.reduce(32, i1)
+            T.reads(idx[i, k], val[i, k])
+            T.writes(argmax_v0[i], argmax_v1[i])
+            with T.init():
+                argmax_v0[i] = -1
+                argmax_v1[i] = T.min_value("float32")
+            v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+            v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k])
+            argmax_v0[i] = v_argmax_v0
+            argmax_v1[i] = v_argmax_v1
+
+
 def test_gpu_argmax():
     @T.prim_func
     def argmax_0(
@@ -663,8 +685,84 @@ def argmax_1(
     )
 
 
+def test_gpu_argmax_32():
+    @T.prim_func
+    def argmax_0(
+        idx: T.Buffer[(1, 32), "int32"],
+        val: T.Buffer[(1, 32), "float32"],
+        argmax_v0: T.Buffer[(1,), "int32"],
+        argmax_v1: T.Buffer[(1,), "float32"],
+    ) -> None:
+        # body
+        # with T.block("root")
+        for i0, i1 in T.grid(1, 32):
+            with T.block("argmax"):
+                i, k = T.axis.remap("SR", [i0, i1])
+                T.reads(idx[i, k], val[i, k])
+                T.writes(argmax_v0[i], argmax_v1[i])
+                with T.init():
+                    argmax_v0[i] = -1
+                    argmax_v1[i] = T.float32(-3.4028234663852886e38)
+                v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k])
+                v_argmax_v1: T.float32 = T.Select(
+                    argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]
+                )
+                argmax_v0[i] = v_argmax_v0
+                argmax_v1[i] = v_argmax_v1
+
+    @T.prim_func
+    def argmax_1(
+        idx: T.Buffer[(1, 32), "int32"],
+        val: T.Buffer[(1, 32), "float32"],
+        argmax_v0: T.Buffer[(1,), "int32"],
+        argmax_v1: T.Buffer[(1,), "float32"],
+    ) -> None:
+        # body
+        # with T.block("root")
+        for i0, i1_0 in T.grid(1, 1):
+            for i1_1 in T.thread_binding(64, thread="threadIdx.x"):
+                with T.block("argmax"):
+                    i = T.axis.spatial(1, i0)
+                    k = T.axis.reduce(32, i1_0 * 64 + i1_1)
+                    T.where(i1_0 * 64 + i1_1 < 32)
+                    T.reads(idx[i, k], val[i, k])
+                    T.writes(argmax_v0[i], argmax_v1[i])
+                    with T.init():
+                        argmax_v0[i] = -1
+                        argmax_v1[i] = T.float32(-3.4028234663852886e38)
+                    v_argmax_v0: T.int32 = T.Select(
+                        argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]
+                    )
+                    v_argmax_v1: T.float32 = T.Select(
+                        argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]
+                    )
+                    argmax_v0[i] = v_argmax_v0
+                    argmax_v1[i] = v_argmax_v1
+
+    decision_0 = []  # type: ignore
+    decision_1 = [
+        ("SampleCategorical", 4),
+    ]
+
+    mod = argmax_32
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3090", host="llvm"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction),
+        task_name="test",
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[argmax_0, argmax_1],
+        expected_decisions=[decision_0, decision_1],
+    )
+
+
 if __name__ == "__main__":
     test_gpu_softmax_mn()
     test_gpu_softmax_mn_after_inline()
     test_gpu_batch_norm_bmn()
     test_gpu_argmax()
+    test_gpu_argmax_32()

From b2c5addbb4e92aa770f0cd0847eabb43400ac9d2 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Sat, 17 Sep 2022 19:18:01 -0700
Subject: [PATCH 201/704] [TVMScript] IRBuilder methods for `Stmt` (#12830)

This PR introduces  IRBuilder methods for `Assert`, `Let`, `Realize`, `Evaluate`, `LaunchThread`, `EnvThread`.

Co-authored-by: yongwww <yongcale@gmail.com>
---
 include/tvm/script/ir_builder/tir/frame.h     | 132 ++++++++++++++++++
 include/tvm/script/ir_builder/tir/ir.h        |  40 ++++++
 python/tvm/script/ir_builder/tir/frame.py     |  20 +++
 python/tvm/script/ir_builder/tir/ir.py        | 131 +++++++++++++++++
 src/script/ir_builder/tir/frame.cc            |  27 ++++
 src/script/ir_builder/tir/ir.cc               |  67 +++++++++
 .../unittest/test_tvmscript_ir_builder_tir.py |  69 +++++++++
 7 files changed, 486 insertions(+)

diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h
index c76b400d96b4..38fe9009dd61 100644
--- a/include/tvm/script/ir_builder/tir/frame.h
+++ b/include/tvm/script/ir_builder/tir/frame.h
@@ -303,6 +303,138 @@ class AssertFrameNode : public TIRFrameNode {
   void ExitWithScope() final;
 };
 
+/*!
+ * \brief Managed reference to AssertFrameNode.
+ *
+ * \sa AssertFrameNode
+ */
+class AssertFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(AssertFrame, TIRFrame, AssertFrameNode);
+};
+
+/*!
+ * \brief A frame represents the let binding expression, which binds a var.
+ *
+ * \sa LetFrameNode
+ */
+class LetFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The variable we bind to */
+  tvm::tir::Var var;
+  /*! \brief The value we bind var to */
+  PrimExpr value;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("var", &var);
+    v->Visit("value", &value);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.LetFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(LetFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to LetFrameNode.
+ *
+ * \sa LetFrameNode
+ */
+class LetFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(LetFrame, TIRFrame, LetFrameNode);
+};
+
+/*!
+ * \brief The LaunchThreadFrameNode.
+ * \note It is used only inside a PrimFunc.
+ */
+class LaunchThreadFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The extent of environment thread. */
+  PrimExpr extent;
+  /*! \brief The attribute key, could be either virtual_thread or thread_extent. */
+  String attr_key;
+  /*! \brief The iteration variable. */
+  tvm::tir::IterVar iter_var;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("extent", &extent);
+    v->Visit("attr_key", &attr_key);
+    v->Visit("iter_var", &iter_var);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.LaunchThreadFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(LaunchThreadFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to LaunchThreadFrameNode.
+ *
+ * \sa LaunchThreadFrameNode
+ */
+class LaunchThreadFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(LaunchThreadFrame, TIRFrame,
+                                                    LaunchThreadFrameNode);
+};
+
+/*!
+ * \brief A frame that represents realization.
+ *
+ * \sa RealizeFrame
+ */
+class RealizeFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The region of buffer access. */
+  tvm::tir::BufferRegion buffer_slice;
+  /*! \brief The storage scope associated with this realization. */
+  String storage_scope;
+  /*! \brief The condition expression. */
+  PrimExpr condition;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("buffer_slice", &buffer_slice);
+    v->Visit("storage_scope", &storage_scope);
+    v->Visit("condition", &condition);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.RealizeFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(RealizeFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to RealizeFrameNode.
+ *
+ * \sa RealizeFrameNode
+ */
+class RealizeFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(RealizeFrame, TIRFrame, RealizeFrameNode);
+};
 }  // namespace tir
 }  // namespace ir_builder
 }  // namespace script
diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
index 191887648dbd..ec1f7f3753d1 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -292,6 +292,46 @@ ForFrame ThreadBinding(PrimExpr start, PrimExpr stop, String thread,
  */
 ForFrame Grid(Array<PrimExpr> extents);
 
+/*!
+ * \brief The assertion statement.
+ * \param condition The assertion condition.
+ * \param message The error message when the assertion fails.
+ * \return The AssertFrame.
+ */
+AssertFrame Assert(PrimExpr condition, String message);
+
+/*!
+ * \brief The let binding.
+ * \param var The variable to bind.
+ * \param value The value to be bound.
+ * \return The created LetFrame.
+ */
+LetFrame Let(Var var, PrimExpr value);
+
+/*!
+ * \brief The realization.
+ * \param buffer_slice The region of buffer access.
+ * \param storage_scope The storage scope associated with this realization.
+ * \param condition The condition expression.
+ * \return The result RealizeFrame.
+ */
+RealizeFrame Realize(tvm::tir::BufferRegion buffer_slice, String storage_scope, PrimExpr condition);
+
+/*!
+ * \brief Launch a thread.
+ * \param var The iteration variable.
+ * \param extent The extent of environment thread.
+ * \return The result LaunchThreadFrame.
+ */
+LaunchThreadFrame LaunchThread(Var var, PrimExpr extent);
+
+/*!
+ * \brief Bind a var to thread env.
+ * \param thread_tag The thread type tag.
+ * \return The result variable which gets bound to the thread env.
+ */
+Var EnvThread(String thread_tag);
+
 /*!
  * \brief Evaluate the input expression.
  * \param value The input expression to evaluate.
diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py
index 2ad08f35160d..69bc5bfc9676 100644
--- a/python/tvm/script/ir_builder/tir/frame.py
+++ b/python/tvm/script/ir_builder/tir/frame.py
@@ -48,3 +48,23 @@ class ForFrame(TIRFrame):
     def __enter__(self) -> Union[Var, List[Var]]:  # type: ignore[override]
         super().__enter__()
         return self.vars if len(self.vars) > 1 else self.vars[0]
+
+
+@_register_object("script.ir_builder.tir.AssertFrame")
+class AssertFrame(TIRFrame):
+    ...
+
+
+@_register_object("script.ir_builder.tir.LetFrame")
+class LetFrame(TIRFrame):
+    ...
+
+
+@_register_object("script.ir_builder.tir.RealizeFrame")
+class RealizeFrame(TIRFrame):
+    ...
+
+
+@_register_object("script.ir_builder.tir.LaunchThreadFrame")
+class LaunchThreadFrame(TIRFrame):
+    ...
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index d1dc1c89600d..6db8f40c32c8 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -26,6 +26,8 @@
     BufferLoad,
     BufferRegion,
     IntImm,
+    IterVar,
+    Let,
     PrimExpr,
     StringImm,
     Var,
@@ -813,6 +815,130 @@ def grid(*extents: PrimExpr) -> frame.ForFrame:
     return _ffi_api.Grid(extents)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
+def Assert(condition: PrimExpr, message: str) -> frame.AssertFrame:  # pylint: disable=invalid-name
+    """Create an assertion statement.
+
+    Parameters
+    ----------
+    condition : PrimExpr
+        The PrimExpr to test.
+
+    message : str
+        The output error message when the assertion fails.
+
+    Returns
+    -------
+    res : frame.AssertFrame
+        The result AssertFrame.
+    """
+    return _ffi_api.Assert(condition, message)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def let(
+    v: Var,
+    value: PrimExpr,
+    body: PrimExpr = None,
+) -> frame.LetFrame:
+    """Create a new let binding.
+
+    Parameters
+    ----------
+    v : Var
+        The variable to bind.
+
+    value : PrimExpr
+        The value to be bound.
+
+    body : PrimExpr
+        The body expression, None will be used if it was not specified.
+
+    Returns
+    -------
+    res : frame.LetFrame
+        The result LetFrame.
+    """
+    if body is None:
+        return _ffi_api.Let(v, value)  # type: ignore[attr-defined] # pylint: disable=no-member
+    return Let(v, value, body)
+
+
+def realize(
+    buffer_slice: BufferRegion,
+    storage_scope: str,
+    condition: PrimExpr = True,
+) -> frame.RealizeFrame:
+    """Create a realization.
+
+    Parameters
+    ----------
+    buffer_slice : BufferRegion
+        The region of buffer access.
+
+    storage_scope : str
+        The storage scope associated with this realization.
+
+    condition: PrimExpr
+        The condition expression, the default is True.
+
+    Returns
+    -------
+    res : frame.RealizeFrame
+        The result RealizeFrame.
+    """
+    return _ffi_api.Realize(  # type: ignore[attr-defined] # pylint: disable=no-member
+        buffer_slice, storage_scope, condition
+    )
+
+
+def launch_thread(
+    iter_var: IterVar,  # pylint: disable=redefined-outer-name
+    extent: PrimExpr,
+) -> frame.LaunchThreadFrame:
+    """Launch a thread.
+
+    Parameters
+    ----------
+    iter_var : IterVar
+        The iteration variable.
+
+    extent : PrimExpr
+        The extent of environment thread.
+
+    Returns
+    -------
+    res : frame.LaunchThreadFrame
+        The result LaunchThreadFrame.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+    from tvm.script.ir_builder import tir as T
+    brow = T.env_thread("blockIdx.y")
+    T.launch_thread(brow, 1)
+
+    """
+    return _ffi_api.LaunchThread(iter_var, extent)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def env_thread(thread_tag: str) -> IterVar:
+    """Bind a var to thread env"
+
+    Parameters
+    ----------
+    thread_tag : str
+        The thread type tag.
+
+    Returns
+    -------
+    res : IterVar
+        The result iteration variable gets bound to the thread env.
+
+    """
+    return _ffi_api.EnvThread(thread_tag)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
 def evaluate(value: PrimExpr) -> None:
     """Evaluate the input expression.
 
@@ -1159,6 +1285,11 @@ def var(dtype, name="") -> Var:
     "unroll",
     "thread_binding",
     "grid",
+    "Assert",
+    "let",
+    "realize",
+    "launch_thread",
+    "env_thread",
     "evaluate",
     "int8",
     "int16",
diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc
index 8b8b2a4d80e0..6c9459e6389c 100644
--- a/src/script/ir_builder/tir/frame.cc
+++ b/src/script/ir_builder/tir/frame.cc
@@ -92,11 +92,38 @@ void ForFrameNode::ExitWithScope() {
   AddToParent(this->f_make_for_loop(vars, doms, AsStmt(stmts)));
 }
 
+void AssertFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  AddToParent(tvm::tir::AssertStmt(condition, message, AsStmt(stmts)));
+}
+
+void LetFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  AddToParent(tvm::tir::LetStmt(var, value, AsStmt(stmts)));
+}
+
+void RealizeFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  AddToParent(tvm::tir::AttrStmt(buffer_slice->buffer, "realize_scope",
+                                 tvm::tir::StringImm(storage_scope),
+                                 tvm::tir::BufferRealize(buffer_slice->buffer, buffer_slice->region,
+                                                         condition, AsStmt(stmts))));
+}
+
+void LaunchThreadFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  AddToParent(tvm::tir::AttrStmt(iter_var, attr_key, extent, AsStmt(stmts)));
+}
+
 TVM_REGISTER_NODE_TYPE(TIRFrameNode);
 TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode);
 TVM_REGISTER_NODE_TYPE(BlockFrameNode);
 TVM_REGISTER_NODE_TYPE(BlockInitFrameNode);
 TVM_REGISTER_NODE_TYPE(ForFrameNode);
+TVM_REGISTER_NODE_TYPE(AssertFrameNode);
+TVM_REGISTER_NODE_TYPE(LetFrameNode);
+TVM_REGISTER_NODE_TYPE(RealizeFrameNode);
+TVM_REGISTER_NODE_TYPE(LaunchThreadFrameNode);
 
 }  // namespace tir
 }  // namespace ir_builder
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index 75e759262655..5951af298f62 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -395,6 +395,67 @@ ForFrame Grid(Array<PrimExpr> extents) {
   return ForFrame(n);
 }
 
+AssertFrame Assert(PrimExpr condition, String message) {
+  ObjectPtr<AssertFrameNode> n = make_object<AssertFrameNode>();
+  n->condition = condition;
+  n->message = tvm::tir::StringImm(message);
+  return AssertFrame(n);
+}
+
+LetFrame Let(Var var, PrimExpr value) {
+  ObjectPtr<LetFrameNode> n = make_object<LetFrameNode>();
+  n->var = var;
+  n->value = value;
+  return LetFrame(n);
+}
+
+LaunchThreadFrame LaunchThread(Var var, PrimExpr extent) {
+  IterVar iter_var{nullptr};
+
+  if (Optional<PrimFuncFrame> opt_frame = IRBuilder::Current()->FindFrame<PrimFuncFrame>()) {
+    if (Optional<IterVar> opt_iter_var = opt_frame.value()->env_threads.Get(var)) {
+      iter_var = opt_iter_var.value();
+    } else {
+      LOG(FATAL) << "ValueError: " << var->name_hint
+                 << " is not an env_thread created using T.env_thread.";
+    }
+  } else {
+    LOG(FATAL) << "LaunchThread can only be used inside a PrimFunc";
+  }
+  ObjectPtr<LaunchThreadFrameNode> n = make_object<LaunchThreadFrameNode>();
+  if (!iter_var->dom.defined()) {
+    const_cast<tvm::tir::IterVarNode*>(iter_var.get())->dom = Range(0, extent);
+  } else if (!arith::Analyzer().CanProveEqual(iter_var->dom->extent, extent)) {
+    LOG(FATAL) << "ValueError: Inconsistent extents of environment thread. "
+               << iter_var->dom->extent << " vs " << extent;
+  }
+  n->iter_var = iter_var;
+  n->extent = extent;
+  n->attr_key = iter_var->thread_tag == "vthread" ? "virtual_thread" : "thread_extent";
+  return LaunchThreadFrame(n);
+}
+
+RealizeFrame Realize(tvm::tir::BufferRegion buffer_slice, String storage_scope,
+                     PrimExpr condition) {
+  ObjectPtr<RealizeFrameNode> n = make_object<RealizeFrameNode>();
+  n->buffer_slice = buffer_slice;
+  n->storage_scope = storage_scope;
+  n->condition = condition;
+  return RealizeFrame(n);
+}
+
+Var EnvThread(String thread_tag) {
+  IterVar iter_var(Range{nullptr}, Var("", DataType::Int(32)), tvm::tir::IterVarType::kThreadIndex,
+                   thread_tag);
+  Var var = iter_var->var;
+  if (Optional<PrimFuncFrame> opt_frame = IRBuilder::Current()->FindFrame<PrimFuncFrame>()) {
+    opt_frame.value()->env_threads.Set(var, iter_var);
+  } else {
+    LOG(FATAL) << "EnvThread can only be used inside a PrimFunc";
+  }
+  return var;
+}
+
 void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); }
 
 using tvm::script::ir_builder::details::Namer;
@@ -477,6 +538,12 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.Unroll").set_body_typed(Unroll);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.ThreadBinding").set_body_typed(ThreadBinding);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Grid").set_body_typed(Grid);
 
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Assert").set_body_typed(Assert);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Let").set_body_typed(Let);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Realize").set_body_typed(Realize);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.LaunchThread").set_body_typed(LaunchThread);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.EnvThread").set_body_typed(EnvThread);
+
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate);
 
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8);
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index a5d8c1068064..7f2e6e1a4706 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -260,5 +260,74 @@ def test_ir_builder_tir_for():
     assert_structural_equal(for_actual, for_expected, map_free_vars=True)
 
 
+def test_ir_builder_tir_assert():
+    with IRBuilder() as ib:
+        with T.Assert(T.var("int32", name="a") == 0, message="a is 0"):
+            T.evaluate(0)
+    # the assert generated by IRBuilder
+    assert_actual = ib.get()
+
+    # the expected assert statement
+    assert_expected = tir.AssertStmt(
+        T.var("int32", name="a") == 0, tir.StringImm("a is 0"), tir.Evaluate(0)
+    )
+    # Check if the generated ir is expected
+    assert_structural_equal(assert_actual, assert_expected, map_free_vars=True)
+
+
+def test_ir_builder_tir_evaluate():
+    with IRBuilder() as ib:
+        T.evaluate(0)
+    # the evaluate generated by IRBuilder
+    eval_actual = ib.get()
+
+    # the expected evaluate
+    eval_expected = tir.Evaluate(0)
+    # Check if the generated ir is expected
+    assert_structural_equal(eval_actual, eval_expected, map_free_vars=True)
+
+
+def test_ir_builder_tir_let():
+    with IRBuilder() as ib:
+        with T.let(T.var("int32", name="a"), tir.IntImm("int32", 2)):
+            T.evaluate(0)
+    # the let binding generated by IRBuilder
+    let_actual = ib.get()
+
+    # the expected Let statement
+    let_expected = tir.LetStmt(T.var("int32", name="a"), tir.IntImm("int32", 2), tir.Evaluate(0))
+    assert_structural_equal(let_actual, let_expected, map_free_vars=True)
+
+
+def test_ir_builder_tir_realize():
+    buffer_a = T.buffer_decl((128, 128), "float32")
+    with IRBuilder() as ib:
+        with T.realize(buffer_a[0:128, 0:128], "test_storage_scope", True):
+            T.evaluate(0)
+    realize_actual = ib.get()
+
+    # the expected buffer realization
+    buffer_realize = tir.BufferRealize(
+        buffer_a, [tvm.ir.Range(0, 128), tvm.ir.Range(0, 128)], True, tir.Evaluate(0)
+    )
+    expected_realize = tir.AttrStmt(
+        buffer_a, "realize_scope", tir.StringImm("test_storage_scope"), buffer_realize
+    )
+    assert_structural_equal(realize_actual, expected_realize, map_free_vars=True)
+
+
+def test_ir_builder_tir_thread():
+    with IRBuilder() as ib:
+        with T.prim_func():
+            brow = T.env_thread("blockIdx.y")
+            with T.launch_thread(brow, 1):
+                T.evaluate(0)
+    ir_actual = ib.get()
+    iter_var = tir.IterVar((0, 1), "v", iter_type=1, thread_tag="blockIdx.y")
+    attr_stmt = tir.AttrStmt(iter_var, "thread_extent", 1, tir.Evaluate(0))
+    func = tir.PrimFunc([], attr_stmt)
+    assert_structural_equal(ir_actual, func, map_free_vars=True)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 052e7028271be2aa2932e8721faf847940d28429 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Sun, 18 Sep 2022 11:51:23 -0700
Subject: [PATCH 202/704] [TVMScript] IRBuilder methods for `Stmt` (#12831)

This PR introduces  IRBuilder methods for
`allocate`, `Let`, `allocate_const`, `attr`,  `While`, `If/Then/Else`, `decl_buffer`, `buffer_store`, `prefetch`.

Co-authored-by: yongwww <yongcale@gmail.com>
---
 include/tvm/script/ir_builder/tir/frame.h     | 307 ++++++++++++++++++
 include/tvm/script/ir_builder/tir/ir.h        |  97 ++++++
 python/tvm/script/ir_builder/tir/frame.py     |  48 ++-
 python/tvm/script/ir_builder/tir/ir.py        | 271 ++++++++++++++++
 src/script/ir_builder/tir/frame.cc            |  78 +++++
 src/script/ir_builder/tir/ir.cc               |  86 +++++
 src/script/ir_builder/tir/utils.h             |  15 +
 .../unittest/test_tvmscript_ir_builder_tir.py | 173 +++++++++-
 8 files changed, 1061 insertions(+), 14 deletions(-)

diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h
index 38fe9009dd61..aa2386e7f1e4 100644
--- a/include/tvm/script/ir_builder/tir/frame.h
+++ b/include/tvm/script/ir_builder/tir/frame.h
@@ -435,6 +435,313 @@ class RealizeFrame : public TIRFrame {
  public:
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(RealizeFrame, TIRFrame, RealizeFrameNode);
 };
+
+/*!
+ * \brief A frame represents the allocate.
+ *
+ * \sa AllocateFrame
+ */
+class AllocateFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The extents of the allocate. */
+  Array<PrimExpr> extents;
+  /*! \brief The data type of the buffer. */
+  DataType dtype;
+  /*! \brief The storage scope. */
+  String storage_scope;
+  /*! \brief The condition. */
+  PrimExpr condition;
+  /*! \brief Additional annotation hints. */
+  Map<String, ObjectRef> annotations;
+  /*! \brief The buffer. */
+  tvm::tir::Buffer buffer;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("extents", &extents);
+    v->Visit("dtype", &dtype);
+    v->Visit("storage_scope", &storage_scope);
+    v->Visit("condition", &condition);
+    v->Visit("annotations", &annotations);
+    v->Visit("buffer", &buffer);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.AllocateFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(AllocateFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to AllocateFrameNode.
+ *
+ * \sa AllocateFrameNode
+ */
+class AllocateFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(AllocateFrame, TIRFrame, AllocateFrameNode);
+};
+
+/*!
+ * \brief A frame represents the allocate constant.
+ *
+ * \sa AllocateConstFrame
+ */
+class AllocateConstFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The data type of the buffer. */
+  DataType dtype;
+  /*! \brief The extents of the allocate. */
+  Array<PrimExpr> extents;
+  /*! \brief The data associated with the constant. */
+  tvm::runtime::NDArray data;
+  /*! \brief The buffer */
+  tvm::tir::Buffer buffer;
+  /*! \brief Additional annotations about the allocation. */
+  Map<String, ObjectRef> annotations;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("dtype", &dtype);
+    v->Visit("extents", &extents);
+    v->Visit("data", &data);
+    v->Visit("buffer", &buffer);
+    v->Visit("annotations", &annotations);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.AllocateConstFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(AllocateConstFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to AllocateConstFrameNode.
+ *
+ * \sa AllocateConstFrameNode
+ */
+class AllocateConstFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(AllocateConstFrame, TIRFrame,
+                                                    AllocateConstFrameNode);
+};
+/*!
+ * \brief A frame that represents attribute node.
+ *
+ * \sa AttrFrame
+ */
+class AttrFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The node to annotate the attribute. */
+  ObjectRef node;
+  /*! \brief Attribute type key. */
+  String attr_key;
+  /*! \brief The value of the attribute. */
+  PrimExpr value;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("node", &node);
+    v->Visit("attr_key", &attr_key);
+    v->Visit("value", &value);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.AttrFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(AttrFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to AttrFrameNode.
+ *
+ * \sa AttrFrameNode
+ */
+class AttrFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(AttrFrame, TIRFrame, AttrFrameNode);
+};
+
+/*!
+ * \brief A frame that represents while loop.
+ *
+ * \sa WhileFrame
+ */
+class WhileFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The termination condition of while. */
+  PrimExpr condition;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("condition", &condition);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.WhileFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(WhileFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to WhileFrameNode.
+ *
+ * \sa WhileFrameNode
+ */
+class WhileFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(WhileFrame, TIRFrame, WhileFrameNode);
+};
+
+/*!
+ * \brief A frame that represents if statement.
+ *
+ * \sa IfFrame
+ */
+class IfFrameNode : public TIRFrameNode {
+ public:
+  /*! \brief The condition of the if statement. */
+  PrimExpr condition;
+  /*! \brief The statements in the true branch. */
+  Optional<Array<tvm::tir::Stmt>> then_stmts;
+  /*! \brief The stetements in the false branch. */
+  Optional<Array<tvm::tir::Stmt>> else_stmts;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("condition", &condition);
+    v->Visit("then_stmts", &then_stmts);
+    v->Visit("else_stmts", &else_stmts);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.IfFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(IfFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to IfFrameNode.
+ *
+ * \sa IfFrameNode
+ */
+class IfFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(IfFrame, TIRFrame, IfFrameNode);
+};
+
+/*!
+ * \brief A frame that represents then.
+ *
+ * \sa ThenFrame
+ */
+class ThenFrameNode : public TIRFrameNode {
+ public:
+  static constexpr const char* _type_key = "script.ir_builder.tir.ThenFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(ThenFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when entering RAII scope.
+   * \sa tvm::support::With
+   */
+  void EnterWithScope() final;
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to ThenFrameNode.
+ *
+ * \sa ThenFrameNode
+ */
+class ThenFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(ThenFrame, TIRFrame, ThenFrameNode);
+};
+
+/*!
+ * \brief A frame that represents else.
+ *
+ * \sa ElseFrame
+ */
+class ElseFrameNode : public TIRFrameNode {
+ public:
+  static constexpr const char* _type_key = "script.ir_builder.tir.ElseFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(ElseFrameNode, TIRFrameNode);
+
+ public:
+  /*!
+   * \brief The method called when entering RAII scope.
+   * \sa tvm::support::With
+   */
+  void EnterWithScope() final;
+  /*!
+   * \brief The method called when exiting RAII scope.
+   * \sa tvm::support::With
+   */
+  void ExitWithScope() final;
+};
+
+/*!
+ * \brief Managed reference to ElseFrameNode.
+ *
+ * \sa ElseFrameNode
+ */
+class ElseFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(ElseFrame, TIRFrame, ElseFrameNode);
+};
+
+class DeclBufferFrameNode : public TIRFrameNode {
+ public:
+  tvm::tir::Buffer buffer;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    TIRFrameNode::VisitAttrs(v);
+    v->Visit("buffer", &buffer);
+  }
+
+  static constexpr const char* _type_key = "script.ir_builder.tir.DeclBufferFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(DeclBufferFrameNode, TIRFrameNode);
+
+ public:
+  void ExitWithScope() final;
+};
+
+class DeclBufferFrame : public TIRFrame {
+ public:
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(DeclBufferFrame, TIRFrame, DeclBufferFrameNode);
+};
+
 }  // namespace tir
 }  // namespace ir_builder
 }  // namespace script
diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
index ec1f7f3753d1..dd289b691502 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -28,6 +28,7 @@ namespace script {
 namespace ir_builder {
 namespace tir {
 
+using tvm::runtime::NDArray;
 using tvm::tir::Buffer;
 using tvm::tir::Var;
 
@@ -317,6 +318,87 @@ LetFrame Let(Var var, PrimExpr value);
  */
 RealizeFrame Realize(tvm::tir::BufferRegion buffer_slice, String storage_scope, PrimExpr condition);
 
+/*!
+ * \brief The allocate node.
+ * \param extents The extents of the allocate.
+ * \param dtype The data type of the buffer.
+ * \param storage_scope The storage scope.
+ * \param condition The condition.
+ * \param annotations Additional annotation hints.
+ * \return The created AllocateFrame.
+ */
+AllocateFrame Allocate(Array<PrimExpr> extents, DataType dtype, String storage_scope = "",
+                       Optional<PrimExpr> condition = NullOpt,
+                       Optional<Map<String, ObjectRef>> annotations = NullOpt);
+
+/*!
+ * \brief The allocate constant node.
+ * \param data The data associated with the constant.
+ * \param dtype The data type of the buffer.
+ * \param extents The extents of the allocate.
+ * \param annotations Additional annotation hints.
+ * \return The created AllocateConstFrame.
+ */
+AllocateConstFrame AllocateConst(
+    NDArray data, DataType dtype, Array<PrimExpr> extents,
+    Map<String, ObjectRef> annotations = NullValue<Map<String, ObjectRef>>());
+
+/*!
+ * \brief Create an attribute.
+ * \param node The node to annotate the attribute.
+ * \param attr_key Attribute type key.
+ * \param value The value of the attribute.
+ * \return The result AttrFrame.
+ */
+AttrFrame Attr(ObjectRef node, String attr_key, PrimExpr value);
+
+/*!
+ * \brief Create a while loop.
+ * \param condition The termination condition of the loop.
+ * \return The result WhileFrame.
+ */
+WhileFrame While(PrimExpr condition);
+
+/*!
+ * \brief Create an if statement.
+ * \param condition The condition of if statement.
+ * \return The result IfFrame.
+ */
+IfFrame If(PrimExpr condition);
+
+/*!
+ * \brief Create a then.
+ * \return The result ThenFrame.
+ */
+ThenFrame Then();
+
+/*!
+ * \brief Create an else.
+ * \return The result ElseFrame.
+ */
+ElseFrame Else();
+
+/*!
+ * \brief The buffer declaration frame.
+ * \param shape The type of the buffer prior to flattening.
+ * \param dtype The data type in the content of the buffer.
+ * \param buffer_name The name of the buffer.
+ * \param data The pointer to the head of the data.
+ * \param strides The strides of each dimension.
+ * \param elem_offset The offset in terms of number of dtype elements (including lanes).
+ * \param storage_scope The optional storage scope of buffer data pointer.
+ * \param align The alignment requirement of data pointer in bytes.
+ * \param offset_factor The factor of elem_offset field.
+ * \param buffer_type The buffer type.
+ * \param axis_separators The separators between input axes when generating flattened output axes.
+ * \return The declared buffer.
+ */
+DeclBufferFrame DeclBuffer(Array<PrimExpr> shape, DataType dtype, String buffer_name,
+                           Optional<Var> data, Optional<Array<PrimExpr>> strides,
+                           Optional<PrimExpr> elem_offset, String storage_scope, int align,
+                           int offset_factor, String buffer_type,
+                           Optional<Array<IntImm>> axis_separators);
+
 /*!
  * \brief Launch a thread.
  * \param var The iteration variable.
@@ -332,6 +414,21 @@ LaunchThreadFrame LaunchThread(Var var, PrimExpr extent);
  */
 Var EnvThread(String thread_tag);
 
+/*!
+ * \brief Store data in a buffer.
+ * \param buffer The buffer.
+ * \param value The value to be stored.
+ * \param indices The indices location to be stored.
+ */
+void BufferStore(Buffer buffer, PrimExpr value, Array<PrimExpr> indices);
+
+/*!
+ * \brief The prefetch hint for a buffer
+ * \param buffer The buffer to be prefetched.
+ * \param bounds The bounds to be prefetched.
+ */
+void Prefetch(Buffer buffer, Array<Range> bounds);
+
 /*!
  * \brief Evaluate the input expression.
  * \param value The input expression to evaluate.
diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py
index 69bc5bfc9676..b9b50dfa9876 100644
--- a/python/tvm/script/ir_builder/tir/frame.py
+++ b/python/tvm/script/ir_builder/tir/frame.py
@@ -18,7 +18,7 @@
 from typing import List, Union
 
 from tvm._ffi import register_object as _register_object
-from tvm.tir import Var
+from tvm.tir import Buffer, Var
 
 from ..base import IRBuilderFrame
 
@@ -65,6 +65,52 @@ class RealizeFrame(TIRFrame):
     ...
 
 
+@_register_object("script.ir_builder.tir.AllocateFrame")
+class AllocateFrame(TIRFrame):
+    def __enter__(self) -> Buffer:
+        super().__enter__()
+        return self.buffer
+
+
+@_register_object("script.ir_builder.tir.AllocateConstFrame")
+class AllocateConstFrame(TIRFrame):
+    def __enter__(self) -> Buffer:
+        super().__enter__()
+        return self.buffer
+
+
+@_register_object("script.ir_builder.tir.AttrFrame")
+class AttrFrame(TIRFrame):
+    ...
+
+
+@_register_object("script.ir_builder.tir.WhileFrame")
+class WhileFrame(TIRFrame):
+    ...
+
+
+@_register_object("script.ir_builder.tir.IfFrame")
+class IfFrame(TIRFrame):
+    ...
+
+
+@_register_object("script.ir_builder.tir.ThenFrame")
+class ThenFrame(TIRFrame):
+    ...
+
+
+@_register_object("script.ir_builder.tir.ElseFrame")
+class ElseFrame(TIRFrame):
+    ...
+
+
+@_register_object("script.ir_builder.tir.DeclBufferFrame")
+class DeclBufferFrame(TIRFrame):
+    def __enter__(self) -> Buffer:
+        super().__enter__()
+        return self.buffer
+
+
 @_register_object("script.ir_builder.tir.LaunchThreadFrame")
 class LaunchThreadFrame(TIRFrame):
     ...
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index 6db8f40c32c8..625e1291ff20 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -19,8 +19,10 @@
 
 from numbers import Integral
 from typing import Any, Dict, List, Optional, Union, Tuple
+import numpy as np  # type: ignore
 
 from tvm.ir import Range, Type
+from tvm.runtime import convert, ndarray
 from tvm.tir import (
     Buffer,
     BufferLoad,
@@ -32,6 +34,7 @@
     StringImm,
     Var,
 )
+from tvm.tir import Ramp as ramp
 
 from . import _ffi_api, frame
 
@@ -890,6 +893,217 @@ def realize(
     )
 
 
+def allocate(
+    extents: List[PrimExpr],
+    dtype: str,
+    scope: str = "",
+    condition: PrimExpr = None,
+    annotations=None,
+) -> frame.AllocateFrame:
+    """Allocate node.
+
+    Parameters
+    ----------
+    extents : List[PrimExpr]
+        The extents of the allocate.
+
+    dtype : str
+        The data type of the buffer.
+
+    scope : str
+        The storage scope.
+
+    condition : PrimExpr
+        The condition.
+
+    annotations: Optional[Mapping[str, Object]]
+        Additional annotation hints.
+    """
+    if isinstance(condition, bool):
+        condition = IntImm("bool", condition)
+    return _ffi_api.Allocate(  # type: ignore[attr-defined] # pylint: disable=no-member
+        extents, dtype, scope, condition, annotations
+    )
+
+
+def allocate_const(
+    data: List[PrimExpr],
+    dtype: str,
+    extents: List[PrimExpr],
+    annotations=None,
+) -> frame.AllocateConstFrame:
+    """Allocate constant node.
+
+    Parameters
+    ----------
+    data : List[PrimExpr]
+        The data associated with the constant.
+
+    dtype : str
+        The data type of the buffer.
+
+    extents : List[PrimExpr]
+        The extents of the allocate.
+
+    annotations : Optional[Map]
+        Additional annotations about the allocation.
+    """
+
+    return _ffi_api.AllocateConst(  # type: ignore[attr-defined] # pylint: disable=no-member
+        ndarray.array(np.asarray(data, dtype)), dtype, extents, annotations
+    )
+
+
+def attr(node: Any, attr_key: str, value: Union[PrimExpr, str]) -> frame.AttrFrame:
+    """Create an attribute node.
+
+    Parameters
+    ----------
+    node : Any
+        The node to annotate the attribute.
+
+    attr_key : str
+        Attribute type key.
+
+    value : Union[PrimExpr, str]
+        The value of the attribute.
+
+    Returns
+    -------
+    res : frame.AttrFrame
+        The result AttrFrame.
+    """
+    node = convert(node)
+    value = convert(value)
+    return _ffi_api.Attr(node, attr_key, value)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def While(condition: PrimExpr) -> frame.WhileFrame:  # pylint: disable=invalid-name
+    """Create a while node.
+
+    Parameters
+    ----------
+    condition : PrimExpr
+        The termination condition of the loop.
+
+    Returns
+    -------
+    res : frame.WhileFrame
+        The result WhileFrame.
+    """
+    if isinstance(condition, bool):
+        condition = IntImm("bool", condition)
+    return _ffi_api.While(condition)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def If(condition: PrimExpr) -> frame.IfFrame:  # pylint: disable=invalid-name
+    """Create an if node.
+
+    Parameters
+    ----------
+    condition : PrimExpr
+        The condition of if statement, executes the true branch if the condition is true,
+        otherwise jump into the false branch.
+
+    Returns
+    -------
+    res : frame.IfFrame
+        The result IfFrame.
+    """
+    if isinstance(condition, bool):
+        condition = IntImm("bool", condition)
+    return _ffi_api.If(condition)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def Then() -> frame.ThenFrame:  # pylint: disable=invalid-name
+    """Create a then.
+
+    Returns
+    -------
+    res : frame.ThenFrame
+        The result ThenFrame.
+    """
+    return _ffi_api.Then()  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def Else() -> frame.ElseFrame:  # pylint: disable=invalid-name
+    """Create an else.
+
+    Returns
+    -------
+    res : frame.ElseFrame
+        The result ElseFrame.
+    """
+    return _ffi_api.Else()  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def decl_buffer(
+    shape,
+    dtype="float32",
+    data=None,
+    strides=None,
+    elem_offset=None,
+    scope="",
+    align=0,
+    offset_factor=0,
+    buffer_type="",
+    axis_separators=None,
+) -> frame.DeclBufferFrame:
+    """Create a buffer declaration node.
+
+    Parameters
+    ----------
+    shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral]
+        The type of the buffer prior to flattening.
+
+    dtype : str
+        The data type in the content of the buffer.
+
+    data : Var
+        The pointer to the head of the data.
+
+    strides : List[PrimExpr]
+        The strides of each dimension.
+
+    elem_offset : PrimExpr
+        The offset in terms of number of dtype elements (including lanes).
+
+    scope : str
+        The optional storage scope of buffer data pointer.
+
+    align : int
+        The alignment requirement of data pointer in bytes.
+
+    offset_factor : int
+        The factor of elem_offset field.
+
+    buffer_type : str
+        The buffer type.
+
+    axis_separators : List[int]
+        The separators between input axes when generating flattened output axes.
+
+    Returns
+    -------
+    res : frame.DeclBufferFrame
+        The result DeclBufferFrame.
+    """
+    shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape
+    return _ffi_api.DeclBuffer(  # type: ignore[attr-defined] # pylint: disable=no-member
+        shape,
+        dtype,
+        "",
+        data,
+        strides,
+        elem_offset,
+        scope,
+        align,
+        offset_factor,
+        buffer_type,
+        axis_separators,
+    )
+
+
 def launch_thread(
     iter_var: IterVar,  # pylint: disable=redefined-outer-name
     extent: PrimExpr,
@@ -939,6 +1153,53 @@ def env_thread(thread_tag: str) -> IterVar:
     return _ffi_api.EnvThread(thread_tag)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
+def buffer_store(buffer: Buffer, value: PrimExpr, indices: List[Union[PrimExpr, slice]]) -> None:
+    """Buffer store node.
+
+    Parameters
+    ----------
+    buffer : Buffer
+        The buffer.
+
+    value : PrimExpr
+        The value to be stored.
+
+    indices : List[Union[PrimExpr, slice]]
+        The indices location to be stored.
+    """
+    from tvm.arith import Analyzer  # pylint: disable=import-outside-toplevel
+
+    expr_indices = []
+    for index in indices:
+        if isinstance(index, slice):
+            step = 1 if index.step is None else index.step
+            lanes = Analyzer().simplify((index.stop - index.start + step - 1) // step)
+            if lanes == 1:
+                expr_indices.append(index.start)
+            else:
+                expr_indices.append(ramp(index.start, step, int(lanes)))
+        else:
+            expr_indices.append(index)
+    if isinstance(value, bool) and buffer.dtype == "bool":
+        value = IntImm("bool", value)
+    return _ffi_api.BufferStore(  # type: ignore[attr-defined] # pylint: disable=no-member
+        buffer, value, expr_indices
+    )
+
+
+def prefetch(buffer: Buffer, indices: List[PrimExpr]) -> None:
+    """The prefetch hint for a buffer.
+
+    Parameters
+    ----------
+    buffer : Buffer
+        The buffer to be prefetched.
+    indices : List[PrimExpr]
+        The indices of the buffer to extract.
+    """
+    return _ffi_api.Prefetch(buffer, indices)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
 def evaluate(value: PrimExpr) -> None:
     """Evaluate the input expression.
 
@@ -1288,8 +1549,18 @@ def var(dtype, name="") -> Var:
     "Assert",
     "let",
     "realize",
+    "allocate",
+    "allocate_const",
+    "attr",
+    "While",
+    "If",
+    "Then",
+    "Else",
+    "decl_buffer",
     "launch_thread",
     "env_thread",
+    "buffer_store",
+    "prefetch",
     "evaluate",
     "int8",
     "int16",
diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc
index 6c9459e6389c..aa9efa653f71 100644
--- a/src/script/ir_builder/tir/frame.cc
+++ b/src/script/ir_builder/tir/frame.cc
@@ -115,6 +115,76 @@ void LaunchThreadFrameNode::ExitWithScope() {
   AddToParent(tvm::tir::AttrStmt(iter_var, attr_key, extent, AsStmt(stmts)));
 }
 
+void AllocateFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  AddToParent(tvm::tir::Allocate(buffer->data, buffer->dtype, buffer->shape, condition,
+                                 AsStmt(stmts), annotations));
+}
+
+void AllocateConstFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  AddToParent(
+      tvm::tir::AllocateConst(buffer->data, dtype, extents, data, AsStmt(stmts), annotations));
+}
+void AttrFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  AddToParent(tvm::tir::AttrStmt(node, attr_key, value, AsStmt(stmts)));
+}
+
+void WhileFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  AddToParent(tvm::tir::While(condition, AsStmt(stmts)));
+}
+
+void IfFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  if (!stmts.empty()) {
+    LOG(FATAL) << "stmt within IfThenElse frame should be either in ThenFrame or ElseFrame";
+  }
+  if (!then_stmts.defined()) {
+    LOG(FATAL) << "IfThenElse frame should have at least one then branch";
+  }
+  AddToParent(tvm::tir::IfThenElse(
+      condition, AsStmt(then_stmts.value()),
+      else_stmts.defined() ? AsStmt(else_stmts.value()) : tvm::tir::Stmt(nullptr)));
+}
+
+void ThenFrameNode::EnterWithScope() {
+  IfFrame frame = FindIfFrame("T.then_");
+  if (frame->then_stmts.defined()) {
+    LOG(FATAL) << "ValueError: Duplicate then branch declaration, previous one is "
+               << frame->then_stmts.value();
+  }
+  TIRFrameNode::EnterWithScope();
+}
+
+void ThenFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  FindIfFrame("T.then_")->then_stmts = stmts;
+}
+
+void ElseFrameNode::EnterWithScope() {
+  IfFrame frame = FindIfFrame("T.else_");
+  if (!frame->then_stmts.defined()) {
+    LOG(FATAL) << "The else branch should follow then branch";
+  }
+  if (frame->else_stmts.defined()) {
+    LOG(FATAL) << "ValueError: Duplicate else branch declaration, previous one is "
+               << frame->else_stmts.value();
+  }
+  TIRFrameNode::EnterWithScope();
+}
+
+void ElseFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  FindIfFrame("T.else_")->else_stmts = stmts;
+}
+
+void DeclBufferFrameNode::ExitWithScope() {
+  TIRFrameNode::ExitWithScope();
+  AddToParent(tvm::tir::DeclBuffer(buffer, AsStmt(stmts)));
+}
+
 TVM_REGISTER_NODE_TYPE(TIRFrameNode);
 TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode);
 TVM_REGISTER_NODE_TYPE(BlockFrameNode);
@@ -124,6 +194,14 @@ TVM_REGISTER_NODE_TYPE(AssertFrameNode);
 TVM_REGISTER_NODE_TYPE(LetFrameNode);
 TVM_REGISTER_NODE_TYPE(RealizeFrameNode);
 TVM_REGISTER_NODE_TYPE(LaunchThreadFrameNode);
+TVM_REGISTER_NODE_TYPE(AllocateFrameNode);
+TVM_REGISTER_NODE_TYPE(AllocateConstFrameNode);
+TVM_REGISTER_NODE_TYPE(AttrFrameNode);
+TVM_REGISTER_NODE_TYPE(WhileFrameNode);
+TVM_REGISTER_NODE_TYPE(IfFrameNode);
+TVM_REGISTER_NODE_TYPE(ThenFrameNode);
+TVM_REGISTER_NODE_TYPE(ElseFrameNode);
+TVM_REGISTER_NODE_TYPE(DeclBufferFrameNode);
 
 }  // namespace tir
 }  // namespace ir_builder
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index 5951af298f62..28c3d69861fa 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -444,6 +444,63 @@ RealizeFrame Realize(tvm::tir::BufferRegion buffer_slice, String storage_scope,
   return RealizeFrame(n);
 }
 
+AllocateFrame Allocate(Array<PrimExpr> extents, DataType dtype, String storage_scope,
+                       Optional<PrimExpr> condition, Optional<Map<String, ObjectRef>> annotations) {
+  ObjectPtr<AllocateFrameNode> n = make_object<AllocateFrameNode>();
+  n->extents = extents;
+  n->dtype = dtype;
+  n->storage_scope = storage_scope;
+  n->condition = condition.value_or(tvm::Bool(true));
+  n->annotations = annotations.value_or(Map<String, ObjectRef>());
+  n->buffer = BufferDecl(extents, dtype, "", NullOpt, NullOpt, NullOpt, storage_scope, 0, 0,
+                         "default", NullOpt);
+  return AllocateFrame(n);
+}
+
+AllocateConstFrame AllocateConst(tvm::runtime::NDArray data, DataType dtype,
+                                 Array<PrimExpr> extents, Map<String, ObjectRef> annotations) {
+  ObjectPtr<AllocateConstFrameNode> n = make_object<AllocateConstFrameNode>();
+  n->dtype = dtype;
+  n->extents = extents;
+  n->data = data;
+  n->annotations = annotations;
+  n->buffer =
+      BufferDecl(extents, dtype, "", NullOpt, NullOpt, NullOpt, "", 0, 0, "default", NullOpt);
+  return AllocateConstFrame(n);
+}
+
+AttrFrame Attr(ObjectRef node, String attr_key, PrimExpr value) {
+  ObjectPtr<AttrFrameNode> n = make_object<AttrFrameNode>();
+  n->node = node;
+  n->attr_key = attr_key;
+  n->value = value;
+  return AttrFrame(n);
+}
+
+WhileFrame While(PrimExpr condition) {
+  ObjectPtr<WhileFrameNode> n = make_object<WhileFrameNode>();
+  n->condition = condition;
+  return WhileFrame(n);
+}
+
+IfFrame If(PrimExpr condition) {
+  ObjectPtr<IfFrameNode> n = make_object<IfFrameNode>();
+  n->condition = condition;
+  n->then_stmts = NullOpt;
+  n->else_stmts = NullOpt;
+  return IfFrame(n);
+}
+
+ThenFrame Then() {
+  ObjectPtr<ThenFrameNode> n = make_object<ThenFrameNode>();
+  return ThenFrame(n);
+}
+
+ElseFrame Else() {
+  ObjectPtr<ElseFrameNode> n = make_object<ElseFrameNode>();
+  return ElseFrame(n);
+}
+
 Var EnvThread(String thread_tag) {
   IterVar iter_var(Range{nullptr}, Var("", DataType::Int(32)), tvm::tir::IterVarType::kThreadIndex,
                    thread_tag);
@@ -456,6 +513,25 @@ Var EnvThread(String thread_tag) {
   return var;
 }
 
+void BufferStore(Buffer buffer, PrimExpr value, Array<PrimExpr> indices) {
+  AddToParent(tvm::tir::BufferStore(buffer, value, indices));
+}
+
+void Prefetch(Buffer buffer, Array<Range> bounds) {
+  AddToParent(tvm::tir::Prefetch(buffer, bounds));
+}
+
+DeclBufferFrame DeclBuffer(Array<PrimExpr> shape, DataType dtype, String buffer_name,
+                           Optional<Var> data, Optional<Array<PrimExpr>> strides,
+                           Optional<PrimExpr> elem_offset, String storage_scope, int align,
+                           int offset_factor, String buffer_type,
+                           Optional<Array<IntImm>> axis_separators) {
+  ObjectPtr<DeclBufferFrameNode> n = make_object<DeclBufferFrameNode>();
+  n->buffer = BufferDecl(shape, dtype, buffer_name, data, strides, elem_offset, storage_scope,
+                         align, offset_factor, buffer_type, axis_separators);
+  return DeclBufferFrame(n);
+}
+
 void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); }
 
 using tvm::script::ir_builder::details::Namer;
@@ -540,10 +616,20 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.Grid").set_body_typed(Grid);
 
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Assert").set_body_typed(Assert);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Let").set_body_typed(Let);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Allocate").set_body_typed(Allocate);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.AllocateConst").set_body_typed(AllocateConst);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Realize").set_body_typed(Realize);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Attr").set_body_typed(Attr);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.While").set_body_typed(While);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.If").set_body_typed(If);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Then").set_body_typed(Then);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Else").set_body_typed(Else);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.DeclBuffer").set_body_typed(DeclBuffer);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.LaunchThread").set_body_typed(LaunchThread);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.EnvThread").set_body_typed(EnvThread);
 
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.BufferStore").set_body_typed(BufferStore);
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Prefetch").set_body_typed(Prefetch);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate);
 
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8);
diff --git a/src/script/ir_builder/tir/utils.h b/src/script/ir_builder/tir/utils.h
index c29fae1c65e9..733c975fad7e 100644
--- a/src/script/ir_builder/tir/utils.h
+++ b/src/script/ir_builder/tir/utils.h
@@ -88,6 +88,21 @@ inline BlockFrame FindBlockFrame(const String& method) {
   throw;
 }
 
+/*!
+ * \brief Check whether the top frame in IRBuilder frame stack is IfFrame.
+ * \param method The method name to be printed when throwing exception.
+ * \return The top frame of IfFrame.
+ */
+inline IfFrame FindIfFrame(const String& method) {
+  if (Optional<IfFrame> frame = IRBuilder::Current()->GetLastFrame<IfFrame>()) {
+    return frame.value();
+  } else {
+    LOG(FATAL) << "ValueError: IfThenElse frame not find. Please ensure '" << method
+               << "' is called under T.if_()";
+  }
+  throw;
+}
+
 /*!
  * \brief Convert BufferLoad to BufferRegion.
  * \param buffer_load The BufferLoad.
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index 7f2e6e1a4706..40e13a2fbe2f 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -17,9 +17,11 @@
 # pylint: disable=invalid-name, missing-docstring
 """Unittests for tvm.script.ir_builder.tir"""
 import pytest
-import tvm.testing
+import numpy as np
 import tvm
+import tvm.testing
 from tvm import tir
+from tvm.runtime import ndarray
 from tvm.script.ir_builder import tir as T
 from tvm.script.ir_builder import IRBuilder
 from tvm.ir.base import assert_structural_equal
@@ -29,6 +31,7 @@ def test_ir_builder_tir_primfunc_base():
     with IRBuilder() as ib:
         with T.prim_func():
             T.evaluate(0)
+
     # the prim_func generated by IRBuilder
     prim_func_actual = ib.get()
 
@@ -41,6 +44,7 @@ def test_ir_builder_tir_primfunc_base():
         preflattened_buffer_map=None,
         attrs=None,
     )
+
     # Check if the generated ir is expected
     assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True)
 
@@ -58,6 +62,7 @@ def test_ir_builder_tir_primfunc_complete():
             buffer_d = T.match_buffer(d, (64, 64), "int64")
             T.preflattened_buffer(e, (32, 32), "int8", data=e.data)
             T.evaluate(0)
+
     # the prim_func generated by IRBuilder
     prim_func_actual = ib.get()
 
@@ -83,6 +88,7 @@ def test_ir_builder_tir_primfunc_complete():
         },
         attrs=tvm.ir.make_node("DictAttrs", key="value"),
     )
+
     # Check if the generated ir is expected
     assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True)
 
@@ -91,6 +97,7 @@ def test_ir_builder_tir_block_base():
     with IRBuilder() as ib:
         with T.block("block"):
             T.evaluate(0)
+
     # the block generated by IRBuilder
     block_realize_actual = ib.get()
 
@@ -110,6 +117,7 @@ def test_ir_builder_tir_block_base():
         predicate=True,
         block=block_expected,
     )
+
     # Check if the generated ir is expected
     assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True)
 
@@ -131,6 +139,7 @@ def test_ir_builder_tir_block_complete():
             T.match_buffer(e[0:32, 0:32], (32, 32), "float32")
             T.axis.spatial(128, f)
             T.evaluate(0)
+
     # the block generated by IRBuilder
     block_realize_actual = ib.get()
 
@@ -158,6 +167,7 @@ def test_ir_builder_tir_block_complete():
         predicate=var_a > 1,
         block=block_expected,
     )
+
     # Check if the generated ir is expected
     assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True)
 
@@ -201,6 +211,7 @@ def test_ir_builder_tir_axis():
         predicate=True,
         block=block_expected,
     )
+
     # Check if the generated ir is expected
     assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True)
 
@@ -256,6 +267,7 @@ def test_ir_builder_tir_for():
         kind=tir.ForKind.SERIAL,
         body=parallel_expected,
     )
+
     # Check if the generated ir is expected
     assert_structural_equal(for_actual, for_expected, map_free_vars=True)
 
@@ -271,20 +283,9 @@ def test_ir_builder_tir_assert():
     assert_expected = tir.AssertStmt(
         T.var("int32", name="a") == 0, tir.StringImm("a is 0"), tir.Evaluate(0)
     )
-    # Check if the generated ir is expected
-    assert_structural_equal(assert_actual, assert_expected, map_free_vars=True)
 
-
-def test_ir_builder_tir_evaluate():
-    with IRBuilder() as ib:
-        T.evaluate(0)
-    # the evaluate generated by IRBuilder
-    eval_actual = ib.get()
-
-    # the expected evaluate
-    eval_expected = tir.Evaluate(0)
     # Check if the generated ir is expected
-    assert_structural_equal(eval_actual, eval_expected, map_free_vars=True)
+    assert_structural_equal(assert_actual, assert_expected, map_free_vars=True)
 
 
 def test_ir_builder_tir_let():
@@ -296,6 +297,8 @@ def test_ir_builder_tir_let():
 
     # the expected Let statement
     let_expected = tir.LetStmt(T.var("int32", name="a"), tir.IntImm("int32", 2), tir.Evaluate(0))
+
+    # Check if the generated ir is expected
     assert_structural_equal(let_actual, let_expected, map_free_vars=True)
 
 
@@ -304,6 +307,8 @@ def test_ir_builder_tir_realize():
     with IRBuilder() as ib:
         with T.realize(buffer_a[0:128, 0:128], "test_storage_scope", True):
             T.evaluate(0)
+
+    # the buffer realization generated by IRBuilder
     realize_actual = ib.get()
 
     # the expected buffer realization
@@ -313,6 +318,8 @@ def test_ir_builder_tir_realize():
     expected_realize = tir.AttrStmt(
         buffer_a, "realize_scope", tir.StringImm("test_storage_scope"), buffer_realize
     )
+
+    # Check if the generated ir is expected
     assert_structural_equal(realize_actual, expected_realize, map_free_vars=True)
 
 
@@ -322,12 +329,152 @@ def test_ir_builder_tir_thread():
             brow = T.env_thread("blockIdx.y")
             with T.launch_thread(brow, 1):
                 T.evaluate(0)
+
+    # the prim_func generated by IRBuilder
     ir_actual = ib.get()
+
+    # the expected prim_func
     iter_var = tir.IterVar((0, 1), "v", iter_type=1, thread_tag="blockIdx.y")
     attr_stmt = tir.AttrStmt(iter_var, "thread_extent", 1, tir.Evaluate(0))
     func = tir.PrimFunc([], attr_stmt)
+
+    # Check if the generated ir is expected
     assert_structural_equal(ir_actual, func, map_free_vars=True)
 
 
+def test_ir_builder_tir_allocate():
+    with IRBuilder() as ib:
+        with T.allocate([10], "float32", scope="local"):
+            T.evaluate(1)
+
+    # the allocate generated by IRBuilder
+    ir_actual = ib.get()
+
+    # the expected allocate
+    buffer_var = tir.Var("v", tvm.ir.PointerType(tvm.ir.PrimType("float32"), "local"))
+    ir_expected = tir.Allocate(
+        buffer_var, "float32", [10], tvm.tir.const(1, "uint1"), tir.Evaluate(1)
+    )
+
+    # Check if the generated ir is expected
+    assert_structural_equal(ir_actual, ir_expected, map_free_vars=True)
+
+
+def test_ir_builder_tir_allocate_const():
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    with IRBuilder() as ib:
+        with T.allocate_const(data, "int32", [10]):
+            T.evaluate(1)
+
+    # the allocate const generated by IRBuilder
+    ir_actual = ib.get()
+
+    # the expected allocate const
+    buffer_var = tir.Var("v", tvm.ir.PointerType(tvm.ir.PrimType("int32")))
+    ir_expected = tir.AllocateConst(
+        buffer_var, "int32", [10], ndarray.array(np.asarray(data, "int32")), tir.Evaluate(1)
+    )
+
+    # Check if the generated ir is expected
+    assert_structural_equal(ir_actual, ir_expected, map_free_vars=True)
+
+
+def test_ir_builder_tir_while():
+    with IRBuilder() as ib:
+        with T.While(T.var("int32", "x") > 0):
+            T.evaluate(0)
+
+    # the while generated by IRBuilder
+    ir_actual = ib.get()
+
+    # the expected while
+    ir_expected = tir.While(tir.Var("x", "int32") > 0, tir.Evaluate(0))
+
+    # Check if the generated ir is expected
+    assert_structural_equal(ir_actual, ir_expected, map_free_vars=True)
+
+
+def test_ir_builder_tir_if_then_else():
+    with IRBuilder() as ib:
+        with T.If(T.var("int32", "c") < 12):
+            with T.Then():
+                T.evaluate(T.int32(0))
+            with T.Else():
+                T.evaluate(T.int32(1))
+
+    # the if_then_else generated by IRBuilder
+    ir_actual = ib.get()
+
+    # the expected if_then_else
+    ir_expected = tir.IfThenElse(
+        tir.Var("c", "int32") < 12,
+        tir.Evaluate(tir.IntImm("int32", 0)),
+        tir.Evaluate(tir.IntImm("int32", 1)),
+    )
+
+    # Check if the generated ir is expected
+    assert_structural_equal(ir_actual, ir_expected, map_free_vars=True)
+
+
+def test_ir_builder_tir_buffer_store():
+    buffer_a = T.buffer_decl((10, 10), "float32")
+    i = T.var("int32", "x")
+    with IRBuilder() as ib:
+        T.buffer_store(buffer_a, 0.1, [0, i])
+
+    # the buffer store generated by IRBuilder
+    ir_actual = ib.get()
+
+    # the expected buffer store
+    ir_expected = tir.BufferStore(buffer_a, 0.1, [0, i])
+
+    # Check if the generated ir is expected
+    assert_structural_equal(ir_actual, ir_expected, map_free_vars=True)
+
+
+def test_ir_builder_tir_prefetch():
+    with IRBuilder() as ib:
+        buffer_a = T.buffer_decl((128, 128), "float32")
+        T.prefetch(buffer_a, [])
+
+    # the prefetch generated by IRBuilder
+    ir_actual = ib.get()
+
+    # the expected prefetch
+    ir_expected = tir.Prefetch(buffer_a, [])
+
+    # Check if the generated ir is expected
+    assert_structural_equal(ir_actual, ir_expected, map_free_vars=True)
+
+
+def test_ir_builder_tir_evaluate():
+    with IRBuilder() as ib:
+        T.evaluate(0)
+    # the evaluate generated by IRBuilder
+    eval_actual = ib.get()
+
+    # the expected evaluate
+    eval_expected = tir.Evaluate(0)
+
+    # Check if the generated ir is expected
+    assert_structural_equal(eval_actual, eval_expected, map_free_vars=True)
+
+
+def test_ir_builder_tir_decl_buffer():
+    with IRBuilder() as ib:
+        with T.decl_buffer([128, 128], "float32"):
+            T.evaluate(0)
+
+    # the decl_buffer generated by IRBuilder
+    ir_actual = ib.get()
+
+    # the expected decl_buffer
+    buffer = T.buffer_decl((128, 128), "float32")
+    ir_expected = tir.DeclBuffer(buffer, tir.Evaluate(0))
+
+    # Check if the generated ir is expected
+    assert_structural_equal(ir_actual, ir_expected, map_free_vars=True)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 60cf692a63a22cd2698273c4945f037b4b22474b Mon Sep 17 00:00:00 2001
From: czh978 <41666381+czh978@users.noreply.github.com>
Date: Mon, 19 Sep 2022 13:49:04 +0800
Subject: [PATCH 203/704] [Frontend][TFLite] fix detection_postprocess's
 non_max_suppression_attrs["force_suppress"] (#12593)

* [Frontend][TFLite]fix detection_postprocess's non_max_suppression_attrs["force_suppress"]

Since tvm only supports operators detection_postprocess use_regular_nms
is false, which will suppress boxes that exceed the threshold regardless
of the class when implementing NMS in tflite, in order for the results
of tvm and tflite to be consistent, we need to set force_suppress to
True.

* [Frontend][TFLite]fix detection_postprocess's non_max_suppression_attrs[force_suppress]

Added a test case that reproduces inconsistent results between tvm and tflite
When the force_suppress is false,it will get a good result if you set the force_suppress as true
---
 python/tvm/relay/frontend/tflite.py          |  2 +-
 tests/python/frontend/tflite/test_forward.py | 37 ++++++++++++++------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 6c68230e0ecc..a7e10ad72e55 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -3355,7 +3355,7 @@ def convert_detection_postprocess(self, op):
         non_max_suppression_attrs = {}
         non_max_suppression_attrs["return_indices"] = False
         non_max_suppression_attrs["iou_threshold"] = custom_options["nms_iou_threshold"]
-        non_max_suppression_attrs["force_suppress"] = False
+        non_max_suppression_attrs["force_suppress"] = True
         non_max_suppression_attrs["top_k"] = anchor_boxes
         non_max_suppression_attrs["max_output_size"] = custom_options["max_detections"]
         non_max_suppression_attrs["invalid_to_bottom"] = False
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index deaef72e1d7f..7b2bd60d8a20 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -4311,13 +4311,8 @@ def test_forward_matrix_diag():
 # ----------------
 
 
-def test_detection_postprocess():
-    """Detection PostProcess"""
-    tf_model_file = tf_testing.get_workload_official(
-        "http://download.tensorflow.org/models/object_detection/"
-        "ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03.tar.gz",
-        "ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03/tflite_graph.pb",
-    )
+def _test_detection_postprocess(tf_model_file, box_encodings_size, class_predictions_size):
+    """One iteration of detection postProcess with given model and shapes"""
     converter = tf.lite.TFLiteConverter.from_frozen_graph(
         tf_model_file,
         input_arrays=["raw_outputs/box_encodings", "raw_outputs/class_predictions"],
@@ -4328,16 +4323,16 @@ def test_detection_postprocess():
             "TFLite_Detection_PostProcess:3",
         ],
         input_shapes={
-            "raw_outputs/box_encodings": (1, 1917, 4),
-            "raw_outputs/class_predictions": (1, 1917, 91),
+            "raw_outputs/box_encodings": box_encodings_size,
+            "raw_outputs/class_predictions": class_predictions_size,
         },
     )
     converter.allow_custom_ops = True
     converter.inference_type = tf.lite.constants.FLOAT
     tflite_model = converter.convert()
     np.random.seed(0)
-    box_encodings = np.random.uniform(size=(1, 1917, 4)).astype("float32")
-    class_predictions = np.random.uniform(size=(1, 1917, 91)).astype("float32")
+    box_encodings = np.random.uniform(size=box_encodings_size).astype("float32")
+    class_predictions = np.random.uniform(size=class_predictions_size).astype("float32")
     tflite_output = run_tflite_graph(tflite_model, [box_encodings, class_predictions])
     tvm_output = run_tvm_graph(
         tflite_model,
@@ -4382,6 +4377,26 @@ def test_detection_postprocess():
         )
 
 
+def test_detection_postprocess():
+    """Detection PostProcess"""
+    box_encodings_size = (1, 1917, 4)
+    class_predictions_size = (1, 1917, 91)
+    tf_model_file = tf_testing.get_workload_official(
+        "http://download.tensorflow.org/models/object_detection/"
+        "ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03.tar.gz",
+        "ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03/tflite_graph.pb",
+    )
+    _test_detection_postprocess(tf_model_file, box_encodings_size, class_predictions_size)
+
+    box_encodings_size = (1, 2034, 4)
+    class_predictions_size = (1, 2034, 91)
+    tf_model_file = download_testdata(
+        "https://github.com/czh978/models_for_tvm_test/raw/main/tflite_graph_with_postprocess.pb",
+        "tflite_graph_with_postprocess.pb",
+    )
+    _test_detection_postprocess(tf_model_file, box_encodings_size, class_predictions_size)
+
+
 #######################################################################
 # Custom Converter
 # ----------------

From 2af9b90ec191424724842795c552d4c15682eb8c Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 19 Sep 2022 08:20:33 -0500
Subject: [PATCH 204/704] [TIR] Implement API for padded layout transformations
 (#12720)

Implementation of API in `tvm.tir.schedule` for layout transformations
with padding, as part of https://github.com/apache/tvm/issues/12261,
item "Insert pad value into generated TIR, using `tir::if_then_else`,
`builtin::assume`, and `builtin::undef`".

Following the RFC discussion in
https://github.com/apache/tvm-rfcs/pull/77#issuecomment-1170294348 and
https://github.com/apache/tvm-rfcs/pull/77#issuecomment-1171290053,
this commit preferentially rewrites the loops that surround a padded
transformation where possible, in order to express padding in terms of
`tir::if_then_else`.
---
 include/tvm/tir/schedule/schedule.h           |  17 +-
 python/tvm/tir/function.py                    |  46 +-
 python/tvm/tir/schedule/_type_checker.py      |   2 +-
 python/tvm/tir/schedule/schedule.py           |  42 +-
 python/tvm/tir/tensor_intrin/cuda.py          |   2 +-
 src/meta_schedule/postproc/rewrite_layout.cc  |   3 +-
 .../multi_level_tiling_tensor_core.cc         |   2 +-
 src/tir/ir/index_map.cc                       |   2 +-
 src/tir/schedule/concrete_schedule.cc         |   6 +-
 src/tir/schedule/concrete_schedule.h          |   2 +-
 src/tir/schedule/instruction_traits.h         |   4 +-
 src/tir/schedule/primitive.h                  |   4 +-
 .../primitive/layout_transformation.cc        | 910 +++++++++++++++++-
 src/tir/schedule/schedule.cc                  |   6 +-
 src/tir/schedule/traced_schedule.cc           |  15 +-
 src/tir/schedule/traced_schedule.h            |   2 +-
 .../test_tir_schedule_transform_layout.py     | 410 ++++++++
 17 files changed, 1408 insertions(+), 67 deletions(-)

diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h
index 8e5cd34d2e0b..049f063240df 100644
--- a/include/tvm/tir/schedule/schedule.h
+++ b/include/tvm/tir/schedule/schedule.h
@@ -601,9 +601,24 @@ class ScheduleNode : public runtime::Object {
    * \param buffer_index The index of the buffer in block's read or write region.
    * \param buffer_index_type The type of the buffer index, kRead or kWrite.
    * \param index_map The transformation to apply.
+   *
+   * \param pad_value The value to write into padding introduced by
+   *    the transformation.  If the schedule contains a producer block
+   *    for the specified buffer, the pad value will be written as
+   *    part of the producer block if possible, or after the producer
+   *    block otherwise.  Otherwise, if the buffer is an input, will
+   *    insert an annotation block to state that the padding contains
+   *    the known value.
+   *
+   *    Note: If applied to an input buffer, the calling scope is
+   *    responsible for ensuring that the pad_value is present.
+   *    Algebraic symplifications, branch elimination, and other
+   *    optimizations may assume that this precondition is met, and
+   *    may result in incorrect results being returned.
    */
   virtual void TransformLayout(const BlockRV& block_rv, int buffer_index,
-                               BufferIndexType buffer_index_type, const IndexMap& index_map) = 0;
+                               BufferIndexType buffer_index_type, const IndexMap& index_map,
+                               const Optional<IndexMap>& pad_value = NullOpt) = 0;
 
   /*!
    * \brief Apply a transformation represented by IndexMap to block
diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py
index e525fc2cc31a..df39f8aebf71 100644
--- a/python/tvm/tir/function.py
+++ b/python/tvm/tir/function.py
@@ -308,8 +308,9 @@ def from_func(
 
             The function to map from source indices to target indices.
             The function should accept `tir.Var` parameters and return
-            a list. Each element of the returned list should be a
-            `tir.PrimExpr`.
+            a either a `tir.PrimExpr`, or a list of `tir.PrimExpr`.
+            Returning a `tir.PrimExpr` is equivalent to returning a
+            list of length 1 containing that `tir.PrimExpr`.
 
         ndim: Optional[int]
 
@@ -356,9 +357,12 @@ def from_func_with_separators(
         mapping_function : Callable
 
             The function to map from source indices to target indices.
-            The function should accept tir.Var parameters and return a
-            list. Each element of the returned list should be either a
-            `tir.PrimExpr` or the object `IndexMap.AXIS_SEPARATOR`.
+            The function should accept tir.Var parameters and return
+            either a `tir.PrimExpr` or a list.  Each element of the
+            returned list should be either a `tir.PrimExpr` or the
+            object `IndexMap.AXIS_SEPARATOR`.  Returning a
+            `tir.PrimExpr` is equivalent to returning a list of length
+            1 containing that `tir.PrimExpr`.
 
         ndim: Optional[int]
 
@@ -423,17 +427,27 @@ def from_func_with_separators(
 
         final_indices = []
         axis_separators = []
-        for val in mapping:
-            if isinstance(val, tvm.ir.PrimExpr):
-                final_indices.append(val)
-            elif val is IndexMap.AXIS_SEPARATOR:
-                axis_separators.append(len(final_indices))
-            else:
-                raise TypeError(
-                    "Expected mapping function to return list of "
-                    "either tvm.ir.PrimExpr or IndexMap.AXIS_SEPARATOR.  "
-                    f"Instead received {val} of type {type(val)}."
-                )
+
+        try:
+            iter(mapping)
+            is_iterable = True
+        except TypeError:
+            is_iterable = False
+
+        if is_iterable:
+            for val in mapping:
+                if isinstance(val, tvm.ir.PrimExpr):
+                    final_indices.append(val)
+                elif val is IndexMap.AXIS_SEPARATOR:
+                    axis_separators.append(len(final_indices))
+                else:
+                    raise TypeError(
+                        "Expected mapping function to return list of "
+                        "either tvm.ir.PrimExpr or IndexMap.AXIS_SEPARATOR.  "
+                        f"Instead received {val} of type {type(val)}."
+                    )
+        else:
+            final_indices.append(mapping)
 
         return IndexMap(initial_indices, final_indices, inverse_index_map), axis_separators
 
diff --git a/python/tvm/tir/schedule/_type_checker.py b/python/tvm/tir/schedule/_type_checker.py
index 0b48dfc2b0e6..0c66f7ef6cdf 100644
--- a/python/tvm/tir/schedule/_type_checker.py
+++ b/python/tvm/tir/schedule/_type_checker.py
@@ -164,7 +164,7 @@ def _dispatcher(type_: Any) -> Tuple[str, List[type]]:
     return "atomic", [type_]
 
 
-def callable_str(subtypes):
+def callable_str(*subtypes):
     if subtypes:
         *arg_types, return_type = subtypes
         arg_str = ", ".join(_type2str(arg_type) for arg_type in arg_types)
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index fdc871703275..b8f696b7a134 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -2443,6 +2443,7 @@ def transform_layout(
         block: Union[BlockRV, str],
         buffer: Union[Tuple[str, int], str, Buffer],
         index_map: Union[IndexMap, Callable],
+        pad_value: Optional[Union[int, float, IndexMap, Callable]] = None,
     ) -> None:
         """Apply a transformation represented by IndexMap to buffer
 
@@ -2479,6 +2480,36 @@ def transform_layout(
             primitive will be called in addition to the
             TransformLayout primitive.
 
+        pad_value: Optional[Union[int, float, PrimExpr, IndexMap, Callable]]
+
+            The value to be used for any padding introduced by the
+            transformation.  If the schedule contains a producer block
+            for the specified buffer, the pad value will be written as
+            part of the producer block if possible, or after the producer
+            block otherwise.  Otherwise, if the buffer is an input, will
+            insert an annotation block to state that the padding contains
+            the known value.
+
+            The pad value may not contain instances of BufferLoad,
+            except where it loads a value from the buffer being
+            transformed (e.g. to create a circular buffer with
+            padding that consists of repeated elements).
+
+            Note: If applied to an input buffer, the calling scope is
+            responsible for ensuring that the pad_value is present.
+            Algebraic symplifications, branch elimination, and other
+            optimizations may assume that this precondition is met, and
+            may result in incorrect results being returned.
+
+            If None, the transformation may not introduce padding.
+
+            If an int, float or PrimExpr, the transformation is the
+            specific value to be present in the padding.
+
+            If an IndexMap or Callable, the transformation is the
+            value to be present in the padding in terms of the
+            transformed index.
+
         Examples
         --------
         Before transform_layout, in TensorIR, the IR is:
@@ -2536,9 +2567,18 @@ def two_elementwise_transformed_intermediate_buffer(a: T.handle, c: T.handle) ->
         else:
             axis_separators = []
 
+        if pad_value is None:
+            pass
+        elif callable(pad_value):
+            pad_value = IndexMap.from_func(pad_value, ndim=len(index_map.final_indices))
+        elif not isinstance(pad_value, IndexMap):
+            pad_value = IndexMap.from_func(
+                lambda *indices: pad_value, ndim=len(index_map.final_indices)
+            )
+
         buffer_index_type_enum = 0 if buffer_index_type == "read" else 1
         _ffi_api.ScheduleTransformLayout(  # type: ignore # pylint: disable=no-member
-            self, block, buffer_index, buffer_index_type_enum, index_map
+            self, block, buffer_index, buffer_index_type_enum, index_map, pad_value
         )
         if axis_separators:
             _ffi_api.ScheduleSetAxisSeparator(  # type: ignore # pylint: disable=no-member
diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py
index 64d7c24840ae..a309b091285b 100644
--- a/python/tvm/tir/tensor_intrin/cuda.py
+++ b/python/tvm/tir/tensor_intrin/cuda.py
@@ -36,7 +36,7 @@ def shared_16x32_to_ldmatrix_32x16_layout(i, j):
 
 
 def shared_32x16_to_ldmatrix_32x16_layout(i, j):
-    thread_id = (i % 4) + 4 * (j % 8)
+    thread_id = (i % 16) // 4 + 4 * (j % 8)
     return thread_id, 8 * (j // 8) + (i // 16) * 4 + i % 4
 
 
diff --git a/src/meta_schedule/postproc/rewrite_layout.cc b/src/meta_schedule/postproc/rewrite_layout.cc
index 6ff9958c791f..998b22b57463 100644
--- a/src/meta_schedule/postproc/rewrite_layout.cc
+++ b/src/meta_schedule/postproc/rewrite_layout.cc
@@ -148,7 +148,8 @@ bool RewriteLayout(const Schedule& sch) {
       // Apply schedule
       BlockRV block_rv = sch->GetBlock(block->name_hint, func_name);
       BlockRV cached_block_rv = sch->CacheRead(block_rv, buffer_index, "global");
-      sch->TransformLayout(block_rv, buffer_index, BufferIndexType::kRead, index_map.value());
+      sch->TransformLayout(block_rv, buffer_index, BufferIndexType::kRead, index_map.value(),
+                           NullOpt);
       sch->Annotate(cached_block_rv, attr::meta_schedule_layout_rewrite_preproc, const_true());
     }
   }
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 8fcb8fe503b7..6759b59a3245 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -499,7 +499,7 @@ Optional<LoopRV> MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin(
     const tir::BufferRegion& reindexed_buffer_region = tir::GetNthAccessBufferRegion(
         state->sch->state(), GetRef<tir::Block>(block), buffer_index, index_type);
     auto sub_index_map = f_get_sub_index_map(lhs_buffer, reindexed_buffer_region->region);
-    state->sch->TransformLayout(state->block_rv, buffer_index, index_type, sub_index_map);
+    state->sch->TransformLayout(state->block_rv, buffer_index, index_type, sub_index_map, NullOpt);
   };
 
   for (int i = 0, n = block_before_reindex->reads.size(); i < n; ++i) {
diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc
index cceff72ec82f..64c5d5d5ddde 100644
--- a/src/tir/ir/index_map.cc
+++ b/src/tir/ir/index_map.cc
@@ -93,7 +93,7 @@ std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initia
   // Unpack the map to an array, maintaining the same parameter order.
   Array<PrimExpr> inverse_exprs;
   for (const auto& index : (*this)->initial_indices) {
-    inverse_exprs.push_back(inverse_exprs_map.at(index));
+    inverse_exprs.push_back(analyzer.Simplify(inverse_exprs_map.at(index)));
   }
 
   PrimExpr padding_predicate = padded_iter_map->padding_predicate;
diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc
index 9d7dc6b95f50..4558ad04baed 100644
--- a/src/tir/schedule/concrete_schedule.cc
+++ b/src/tir/schedule/concrete_schedule.cc
@@ -761,9 +761,11 @@ void ConcreteScheduleNode::Unannotate(const BlockRV& block_rv, const String& ann
 /******** Schedule: Layout transformation ********/
 void ConcreteScheduleNode::TransformLayout(const BlockRV& block_rv, int buffer_index,
                                            BufferIndexType buffer_index_type,
-                                           const IndexMap& index_map) {
+                                           const IndexMap& index_map,
+                                           const Optional<IndexMap>& pad_value) {
   TVM_TIR_SCHEDULE_BEGIN();
-  tir::TransformLayout(state_, this->GetSRef(block_rv), buffer_index, buffer_index_type, index_map);
+  tir::TransformLayout(state_, this->GetSRef(block_rv), buffer_index, buffer_index_type, index_map,
+                       pad_value);
   this->state_->DebugVerify();
   TVM_TIR_SCHEDULE_END("transform_layout", this->error_render_level_);
 }
diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h
index 1aa9dafcc93e..59a9e3752859 100644
--- a/src/tir/schedule/concrete_schedule.h
+++ b/src/tir/schedule/concrete_schedule.h
@@ -144,7 +144,7 @@ class ConcreteScheduleNode : public ScheduleNode {
   void Unannotate(const BlockRV& block_rv, const String& ann_key) override;
   /******** Schedule: Layout transformation ********/
   void TransformLayout(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type,
-                       const IndexMap& index_map) override;
+                       const IndexMap& index_map, const Optional<IndexMap>& pad_value) override;
   void TransformBlockLayout(const BlockRV& block_rv, const IndexMap& index_map) override;
   void SetAxisSeparator(const BlockRV& block_rv, int buffer_index,
                         BufferIndexType buffer_index_type,
diff --git a/src/tir/schedule/instruction_traits.h b/src/tir/schedule/instruction_traits.h
index 56c69224fe17..122c5ff0d9fe 100644
--- a/src/tir/schedule/instruction_traits.h
+++ b/src/tir/schedule/instruction_traits.h
@@ -430,7 +430,9 @@ TVM_ALWAYS_INLINE Array<ObjectRef> UnpackedInstTraits<TTraits>::_ConvertOutputs(
 /********** PythonAPICall **********/
 
 inline void PythonAPICall::AsPythonString(const ObjectRef& obj, std::ostream& os) {
-  if (const auto* str = obj.as<runtime::StringObj>()) {
+  if (!obj.defined()) {
+    os << "None";
+  } else if (const auto* str = obj.as<runtime::StringObj>()) {
     os << str->data;
   } else if (const auto* int_imm = obj.as<IntImmNode>()) {
     os << int_imm->value;
diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h
index 97233fe4bc6f..21388ff132ae 100644
--- a/src/tir/schedule/primitive.h
+++ b/src/tir/schedule/primitive.h
@@ -474,9 +474,11 @@ TVM_DLL void Unannotate(ScheduleState self, const StmtSRef& sref, const String&
  * \param buffer_index The index of the buffer in block's read or write region.
  * \param buffer_index_type The type of the buffer index, kRead or kWrite.
  * \param index_map The transformation to apply.
+ * \param pad_value The value to write into padding introduced by the transformation.
  */
 TVM_DLL void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
-                             BufferIndexType buffer_index_type, const IndexMap& index_map);
+                             BufferIndexType buffer_index_type, const IndexMap& index_map,
+                             const Optional<IndexMap>& pad_value);
 
 /*!
  * \brief Apply a transformation represented by IndexMap to block
diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc
index 32ed279f028f..025723e1793d 100644
--- a/src/tir/schedule/primitive/layout_transformation.cc
+++ b/src/tir/schedule/primitive/layout_transformation.cc
@@ -16,12 +16,647 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+
+#include <optional>
+#include <variant>
+
 #include "../../../arith/ir_mutator_with_analyzer.h"
 #include "../utils.h"
 
 namespace tvm {
 namespace tir {
 
+/*! \brief Planning stage prior to rewriting in TransformLayoutRewriter
+ *
+ * There are four ways that transformation may be handled.  Each
+ * updates the buffer shape and the indices used to acces the buffer
+ * in BufferStore/BufferLoad nodes, but differ in how they handle the
+ * `pad_value`.  In order of preference, the different strategies are
+ * as follows:
+ *
+ * 1. NoPaddingRequired.  The transformation does not introduce
+ * padding, so only local changes to update the indices of
+ * BufferLoad/BufferStore nodes are required.  No blocks are added,
+ * removed, or replaced.
+ *
+ * 2. ProloguePlan.  The transformation introduces padding, but the
+ * analyzed block has no write stages for the transformed buffer.
+ * This buffer is an input and the caller is responsible for ensuring
+ * that the padding contains the specified `pad_value`.  The generated
+ * prologue contains `builtin::assume()` calls that will expose this
+ * known value during scheduling/simplification, but will be removed
+ * during lowering.
+ *
+ * 3. ReplacementPlan.  The transformation introduces padding, has at
+ * least one write stage for the transformed buffer, and at least one
+ * of those write stages writes to all pre-transformation indices
+ * following a row-major traversal.  These write stage is rewritten to
+ * be row-major traversals of the post-transformation indices, with a
+ * `tir::if_then_else` call to write either the specified `pad_value`
+ * into padding or the computed value into non-padding.
+ *
+ * 4. EpiloguePlan.  The transformation introduces padding, has at
+ * least one write stage for the transformed buffer, but no write
+ * stage can be rewritten to use `tir::if_then_else`.  The
+ * transformation still requires the `pad_value` to be written into
+ * the padding, so a new block is inserted after the last write stage
+ * to explicitly fill the padding.
+ *
+ */
+class TransformLayoutPlanner : private StmtExprVisitor {
+ public:
+  // Statement to be inserted prior to the analyzed block
+  struct ProloguePlan {
+    Stmt prologue;
+  };
+
+  // Loops within the analyzed block that should be replaced
+  struct ReplacementPlan {
+    Map<For, Stmt> replacements;
+    Map<Block, Block> block_sref_reuse;
+  };
+
+  // The block to be inserted, along with the location at which it
+  // should be inserted.  The location will be either a For or a
+  // Block, and will be after all writes the transformed buffer.
+  struct EpiloguePlan {
+    Stmt insert_after;
+    Stmt new_block;
+  };
+
+  struct NoPaddingRequired {};
+
+  using TransformPlan =
+      std::variant<ProloguePlan, ReplacementPlan, EpiloguePlan, NoPaddingRequired>;
+
+  static TransformPlan Plan(Block block, Buffer old_buffer, Buffer new_buffer, IndexMap index_map,
+                            IndexMap inverse, PrimExpr padding_predicate,
+                            Optional<IndexMap> pad_value) {
+    ICHECK(!pad_value.defined() || pad_value.value()->final_indices.size() == 1)
+        << "Internal error: Should be caught by ScheduleError checks prior to this point";
+    TransformLayoutPlanner visitor(old_buffer);
+    visitor(block);
+    return visitor.Finalize(new_buffer, index_map, inverse, padding_predicate, pad_value);
+  }
+
+ private:
+  explicit TransformLayoutPlanner(Buffer old_buffer) : old_buffer_(old_buffer) {}
+
+  void VisitStmt_(const ForNode* op) override {
+    BindLoopVar context(this, GetRef<For>(op));
+    StmtExprVisitor::VisitStmt_(op);
+  }
+
+  void VisitStmt_(const LetStmtNode* op) override {
+    BindVariableDefinition context(this, op->var, op->value);
+    StmtExprVisitor::VisitStmt_(op);
+  }
+
+  void VisitStmt_(const BlockRealizeNode* op) override {
+    BindBlockRealize context(this, GetRef<BlockRealize>(op));
+    StmtExprVisitor::VisitStmt_(op);
+  }
+
+  void VisitStmt_(const BufferStoreNode* op) override {
+    if (!op->buffer.same_as(old_buffer_)) {
+      return;
+    }
+
+    std::optional<std::pair<size_t, size_t>> loop_dependency_range = std::nullopt;
+    for (const auto& index : op->indices) {
+      if (auto index_depth = LoopDependencyRange(index); index_depth.has_value()) {
+        if (loop_dependency_range) {
+          loop_dependency_range = {
+              std::min(loop_dependency_range.value().first, index_depth.value().first),
+              std::max(loop_dependency_range.value().second, index_depth.value().second)};
+        } else {
+          loop_dependency_range = index_depth;
+        }
+      }
+    }
+
+    WriteInfo write_info;
+    write_info.store = GetRef<BufferStore>(op);
+    if (loop_dependency_range) {
+      size_t i = loop_dependency_range.value().first;
+      size_t j = loop_dependency_range.value().second;
+      ICHECK_LT(i, active_loops_.size());
+      ICHECK_LT(j, active_loops_.size());
+
+      write_info.dependent_loopnest = {active_loops_.begin() + i, active_loops_.begin() + j + 1};
+    }
+    write_info.innermost_block_realize = innermost_block_realize_;
+
+    write_info.contains_row_major_traversal = [&]() -> bool {
+      const auto& loopnest = write_info.dependent_loopnest;
+      if (loopnest.empty()) {
+        return false;
+      }
+
+      if (loopnest.size() != old_buffer_->shape.size() || loopnest.size() != op->indices.size()) {
+        return false;
+      }
+
+      for (size_t i = 0; i < loopnest.size(); i++) {
+        const For& loop = loopnest[i];
+        const PrimExpr& buffer_dim = old_buffer_->shape[i];
+        PrimExpr index = Substitute(op->indices[i], active_var_bindings_);
+        bool is_loop_over_axis = index.same_as(loop->loop_var) && is_const_int(loop->min, 0) &&
+                                 ExprDeepEqual()(loop->extent, buffer_dim) &&
+                                 loop->kind == ForKind::kSerial;
+        if (!is_loop_over_axis) {
+          return false;
+        }
+      }
+
+      return true;
+    }();
+
+    write_info_.push_back(write_info);
+
+    // Don't need to continue recursing, as the entire goal was to
+    // find the BufferStore.
+  }
+
+  std::optional<std::pair<size_t, size_t>> LoopDependencyRange(const PrimExpr& expr) const {
+    std::optional<std::pair<size_t, size_t>> prev = std::nullopt;
+    for (const auto& var : UndefinedVars(expr)) {
+      auto it = loop_depth_lookup_.find(var.get());
+      if (it != loop_depth_lookup_.end()) {
+        if (prev.has_value()) {
+          prev = {std::min(prev.value().first, it->second.first),
+                  std::max(prev.value().second, it->second.second)};
+        } else {
+          prev = it->second;
+        }
+      }
+    }
+
+    return prev;
+  }
+
+  class BufferStoreReplacer : public StmtExprMutator {
+   public:
+    BufferStoreReplacer(std::function<Optional<Stmt>(const BufferStoreNode*)> replace_store,
+                        std::function<Optional<Stmt>(const BlockRealizeNode*, const BlockRealize&)>
+                            replace_block_realize)
+        : replace_store_(replace_store), replace_block_realize_(replace_block_realize) {}
+
+    Stmt VisitStmt_(const BufferStoreNode* op) final {
+      if (auto replacement = replace_store_(op)) {
+        auto store = Downcast<BufferStore>(replacement.value());
+        return StmtExprMutator::VisitStmt_(store.get());
+      } else {
+        return StmtExprMutator::VisitStmt_(op);
+      }
+    }
+
+    Stmt VisitStmt_(const BlockRealizeNode* op) final {
+      auto realize = Downcast<BlockRealize>(StmtExprMutator::VisitStmt_(op));
+      if (auto replacement = replace_block_realize_(op, realize)) {
+        return replacement.value();
+      } else {
+        return std::move(realize);
+      }
+    }
+
+   private:
+    std::function<Optional<Stmt>(const BufferStoreNode*)> replace_store_;
+    std::function<Optional<Stmt>(const BlockRealizeNode*, const BlockRealize&)>
+        replace_block_realize_;
+  };
+
+  TransformPlan Finalize(Buffer new_buffer, IndexMap index_map, IndexMap inverse,
+                         PrimExpr padding_predicate, Optional<IndexMap> pad_value) const {
+    if (auto prologue_plan =
+            FinalizeProloguePlan(new_buffer, index_map, inverse, padding_predicate, pad_value);
+        prologue_plan.has_value()) {
+      return prologue_plan.value();
+    } else if (auto replacement_plan = FinalizeReplacementPlan(new_buffer, index_map, inverse,
+                                                               padding_predicate, pad_value);
+               replacement_plan.has_value()) {
+      return replacement_plan.value();
+    } else if (auto epilogue_plan = FinalizeEpiloguePlan(new_buffer, index_map, inverse,
+                                                         padding_predicate, pad_value);
+               epilogue_plan.has_value()) {
+      return epilogue_plan.value();
+    } else {
+      return NoPaddingRequired();
+    }
+  }
+
+  std::optional<ProloguePlan> FinalizeProloguePlan(Buffer new_buffer, IndexMap index_map,
+                                                   IndexMap inverse, PrimExpr padding_predicate,
+                                                   Optional<IndexMap> pad_value) const {
+    if (write_info_.size() || is_zero(padding_predicate) || !pad_value.defined()) {
+      return std::nullopt;
+    }
+
+    Array<IterVar> iter_vars;
+    Array<PrimExpr> iter_values;
+    Array<PrimExpr> indices;
+    Map<Var, PrimExpr> loop_indices_to_block_indices;
+    ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size());
+    for (size_t i = 0; i < inverse->initial_indices.size(); i++) {
+      const auto& loop_var = inverse->initial_indices[i];
+      const auto& dim = new_buffer->shape[i];
+      Var block_var("v_" + loop_var->name_hint, loop_var->dtype);
+      IterVar iter_var(Range(0, dim), block_var, kDataPar);
+      loop_indices_to_block_indices.Set(loop_var, block_var);
+      indices.push_back(iter_var->var);
+      iter_vars.push_back(iter_var);
+      iter_values.push_back(loop_var);
+    }
+    padding_predicate = Substitute(std::move(padding_predicate), loop_indices_to_block_indices);
+
+    PrimExpr pad_value_at_index = pad_value.value()->MapIndices(indices)[0];
+    PrimExpr expr = (!padding_predicate) || (BufferLoad(new_buffer, indices) == pad_value_at_index);
+    Stmt stmt = Evaluate(Call(DataType::Bool(), builtin::assume(), {expr}));
+
+    std::stringstream block_name;
+    block_name << "buffer_" << new_buffer->name << "_assumptions";
+    auto read_region = BufferRegion::FromPoint(new_buffer, indices);
+    stmt = BlockRealize(iter_values, Bool(true),
+                        Block(iter_vars, {read_region}, {}, block_name.str(), stmt));
+
+    for (size_t rev_i = 0; rev_i < inverse->initial_indices.size(); rev_i++) {
+      size_t i = (inverse->initial_indices.size() - 1) - rev_i;
+      Var loop_var = inverse->initial_indices[i];
+      PrimExpr extent = new_buffer->shape[i];
+      stmt = For(loop_var, 0, extent, ForKind::kSerial, stmt);
+    }
+    return ProloguePlan{stmt};
+  }
+
+  std::optional<ReplacementPlan> FinalizeReplacementPlan(Buffer new_buffer, IndexMap index_map,
+                                                         IndexMap inverse,
+                                                         PrimExpr padding_predicate,
+                                                         Optional<IndexMap> pad_value) const {
+    if (write_info_.empty() || is_zero(padding_predicate) || !pad_value.defined()) {
+      return std::nullopt;
+    }
+
+    auto generate_if_then_else_block = [&](const WriteInfo& info) -> Optional<Stmt> {
+      if (!info.contains_row_major_traversal || !pad_value.defined() ||
+          is_zero(padding_predicate)) {
+        return NullOpt;
+      }
+
+      Array<PrimExpr> old_indices = info.store->indices;
+      PrimExpr if_then_else_condition = padding_predicate;
+      Array<PrimExpr> new_indices;
+      for (const auto& var : inverse->initial_indices) {
+        new_indices.push_back(var);
+      }
+
+      auto replace_block_realize =
+          [&]() -> std::function<Optional<Stmt>(const BlockRealizeNode*, const BlockRealize&)> {
+        auto no_change = [](const BlockRealizeNode*, const BlockRealize&) -> Optional<Stmt> {
+          return NullOpt;
+        };
+        if (!info.innermost_block_realize) {
+          return no_change;
+        }
+        if (old_indices.empty()) {
+          return no_change;
+        }
+
+        BlockRealize block_realize = info.innermost_block_realize.value();
+        const auto& block = block_realize->block;
+
+        // Find the block iterators that are used to access the buffer.  Must be in the same order
+        // as they appear in the indices.
+        if (block->iter_vars.size() < old_indices.size()) {
+          return no_change;
+        }
+        const auto& iter_vars = block->iter_vars;
+        size_t block_index_start = 0;
+        for (; block_index_start < iter_vars.size() - old_indices.size(); block_index_start++) {
+          if (old_indices[0].same_as(iter_vars[block_index_start]->var)) {
+            break;
+          }
+        }
+        if (block_index_start > iter_vars.size() - old_indices.size()) {
+          return no_change;
+        }
+
+        for (size_t i = 0; i < old_indices.size(); i++) {
+          if (!old_indices[i].same_as(iter_vars[block_index_start + i]->var) ||
+              iter_vars[block_index_start + i]->iter_type != kDataPar) {
+            return no_change;
+          }
+        }
+
+        // If we got to this point, all indices used to access the
+        // buffer are virtual indices defined in the innermost block.
+        // Therefore, generate new virtual indices for iterating over
+        // the post-transform buffer.
+        Array<PrimExpr> new_iter_values;             // For BlockRealize
+        Array<IterVar> new_iter_vars;                // For Block
+        Array<PrimExpr> new_access_indices;          // For BufferStore
+        Map<Var, PrimExpr> loop_var_to_virtual_var;  // For updating if_then_else_condition
+
+        for (size_t i = 0; i < block_index_start; i++) {
+          new_iter_vars.push_back(iter_vars[i]);
+          new_iter_values.push_back(block_realize->iter_values[i]);
+        }
+
+        ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size());
+        for (size_t i = 0; i < inverse->initial_indices.size(); i++) {
+          Var var = inverse->initial_indices[i];
+          PrimExpr dim = new_buffer->shape[i];
+          std::stringstream ss;
+          ss << "v_" << var->name_hint;
+          Var virtual_var(ss.str(), var.dtype());
+          new_iter_values.push_back(var);
+          new_iter_vars.push_back(IterVar(Range::FromMinExtent(0, dim), virtual_var, kDataPar));
+          new_access_indices.push_back(virtual_var);
+          loop_var_to_virtual_var.Set(var, virtual_var);
+        }
+
+        for (size_t i = block_index_start + old_indices.size(); i < iter_vars.size(); i++) {
+          new_iter_vars.push_back(iter_vars[i]);
+          new_iter_values.push_back(block_realize->iter_values[i]);
+        }
+
+        Map<Var, PrimExpr> old_virtual_var_to_new_virtual_var;
+        ICHECK_EQ(inverse->final_indices.size(), old_indices.size());
+        for (size_t i = 0; i < old_indices.size(); i++) {
+          Var var = Downcast<Var>(old_indices[i]);
+          PrimExpr expr = Substitute(inverse->final_indices[i], loop_var_to_virtual_var);
+          old_virtual_var_to_new_virtual_var.Set(var, expr);
+        }
+
+        if_then_else_condition = Substitute(if_then_else_condition, loop_var_to_virtual_var);
+        new_indices = new_access_indices;
+
+        return [target_realize = info.innermost_block_realize, new_iter_vars, new_iter_values,
+                old_virtual_var_to_new_virtual_var](const BlockRealizeNode* op,
+                                                    const BlockRealize& visited) -> Optional<Stmt> {
+          if (op == target_realize.get()) {
+            Block block = visited->block;
+            block =
+                Downcast<Block>(Substitute(std::move(block), old_virtual_var_to_new_virtual_var));
+            block.CopyOnWrite()->iter_vars = new_iter_vars;
+
+            BlockRealize realize = visited;
+            {
+              auto write_ptr = realize.CopyOnWrite();
+              write_ptr->block = block;
+              write_ptr->iter_values = new_iter_values;
+            }
+            return realize;
+          } else {
+            return NullOpt;
+          }
+        };
+      }();
+
+      bool all_stores_replaced = true;
+      auto replace_store = [&](const BufferStoreNode* op) -> Optional<Stmt> {
+        if (!op->buffer.same_as(info.store->buffer)) {
+          all_stores_replaced = false;
+          return NullOpt;
+        }
+        ICHECK_EQ(old_indices.size(), op->indices.size());
+        ExprDeepEqual expr_equal;
+        for (size_t i = 0; i < old_indices.size(); i++) {
+          if (!expr_equal(old_indices[i], op->indices[i])) {
+            all_stores_replaced = false;
+            return NullOpt;
+          }
+        }
+
+        PrimExpr pad_value_at_index = pad_value.value()->MapIndices(new_indices)[0];
+        return BufferStore(new_buffer,
+                           if_then_else(if_then_else_condition, pad_value_at_index, op->value),
+                           new_indices);
+      };
+
+      BufferStoreReplacer replacer(replace_store, replace_block_realize);
+      Stmt stmt = replacer(info.dependent_loopnest.back()->body);
+      if (!all_stores_replaced) {
+        return NullOpt;
+      }
+
+      std::unordered_map<const VarNode*, PrimExpr> var_remap;
+      ICHECK_EQ(info.dependent_loopnest.size(), inverse->final_indices.size());
+      for (size_t i = 0; i < info.dependent_loopnest.size(); i++) {
+        Var var = info.dependent_loopnest[i]->loop_var;
+        PrimExpr expr = inverse->final_indices[i];
+        var_remap[var.get()] = expr;
+      }
+      stmt = Substitute(std::move(stmt), var_remap);
+
+      ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size());
+      for (size_t rev_i = 0; rev_i < inverse->initial_indices.size(); rev_i++) {
+        size_t i = (inverse->initial_indices.size() - 1) - rev_i;
+        Var loop_var = inverse->initial_indices[i];
+        PrimExpr extent = new_buffer->shape[i];
+        stmt = For(loop_var, 0, extent, ForKind::kSerial, stmt);
+      }
+
+      return stmt;
+    };
+
+    Map<For, Stmt> loop_replacements;
+
+    for (const auto& info : write_info_) {
+      if (info.dependent_loopnest.size()) {
+        if (auto opt_stmt = generate_if_then_else_block(info)) {
+          loop_replacements.Set(info.dependent_loopnest[0], opt_stmt.value());
+        }
+      }
+    }
+
+    if (loop_replacements.size()) {
+      return ReplacementPlan{std::move(loop_replacements)};
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  std::optional<EpiloguePlan> FinalizeEpiloguePlan(Buffer new_buffer, IndexMap index_map,
+                                                   IndexMap inverse, PrimExpr padding_predicate,
+                                                   Optional<IndexMap> pad_value) const {
+    if (write_info_.empty() || is_zero(padding_predicate) || !pad_value.defined()) {
+      return std::nullopt;
+    }
+
+    Array<IterVar> iter_vars;
+    Array<PrimExpr> iter_values;
+    Array<PrimExpr> indices;
+    ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size());
+    for (size_t i = 0; i < inverse->initial_indices.size(); i++) {
+      const auto& loop_var = inverse->initial_indices[i];
+      const auto& dim = new_buffer->shape[i];
+      Var block_var("v_" + loop_var->name_hint, loop_var->dtype);
+      IterVar iter_var(Range(0, dim), block_var, kDataPar);
+      indices.push_back(iter_var->var);
+      iter_vars.push_back(iter_var);
+      iter_values.push_back(loop_var);
+    }
+
+    PrimExpr pad_value_at_index = pad_value.value()->MapIndices(indices)[0];
+    Stmt stmt = BufferStore(new_buffer, pad_value_at_index, indices);
+
+    std::stringstream block_name;
+    block_name << "buffer_" << new_buffer->name << "_padding";
+    auto write_region = BufferRegion::FromPoint(new_buffer, indices);
+    stmt = BlockRealize(iter_values, padding_predicate,
+                        Block(iter_vars, {}, {write_region}, block_name.str(), stmt));
+
+    ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size());
+    for (size_t rev_i = 0; rev_i < inverse->initial_indices.size(); rev_i++) {
+      size_t i = (inverse->initial_indices.size() - 1) - rev_i;
+      Var loop_var = inverse->initial_indices[i];
+      PrimExpr extent = new_buffer->shape[i];
+      stmt = For(loop_var, 0, extent, ForKind::kSerial, stmt);
+    }
+
+    const auto& info = write_info_.back();
+    Stmt insert_after = [&]() -> Stmt {
+      if (info.dependent_loopnest.size()) {
+        return info.dependent_loopnest.front();
+      } else if (info.innermost_block_realize) {
+        return info.innermost_block_realize.value();
+      } else {
+        LOG(FATAL) << "Write occured outside of any block/loop";
+        return Stmt();
+      }
+    }();
+    return EpiloguePlan{insert_after, stmt};
+  }
+
+  struct BindLoopVar {
+    BindLoopVar(TransformLayoutPlanner* self, For for_node)
+        : self_(self), var_(for_node->loop_var) {
+      size_t loop_depth = self_->active_loops_.size();
+      self_->loop_depth_lookup_[var_.get()] = {loop_depth, loop_depth};
+      self_->active_loops_.push_back(std::move(for_node));
+    }
+    ~BindLoopVar() {
+      self_->active_loops_.pop_back();
+      self_->loop_depth_lookup_.erase(var_.get());
+    }
+    BindLoopVar(const BindLoopVar&) = delete;
+    BindLoopVar& operator=(const BindLoopVar&) = delete;
+    BindLoopVar(BindLoopVar&&) = delete;
+    BindLoopVar& operator=(BindLoopVar&&) = delete;
+
+    TransformLayoutPlanner* self_{nullptr};
+    Var var_;
+  };
+
+  struct BindVariableDefinition {
+    BindVariableDefinition() {}
+    BindVariableDefinition(TransformLayoutPlanner* self, Var var, PrimExpr value)
+        : self_(self), var_(var) {
+      if (auto loop_depth = self->LoopDependencyRange(value); loop_depth.has_value()) {
+        self_->loop_depth_lookup_[var_.get()] = loop_depth.value();
+        self_->active_var_bindings_[var_.get()] = Substitute(value, self_->active_var_bindings_);
+      }
+    }
+    ~BindVariableDefinition() {
+      if (self_) {
+        self_->loop_depth_lookup_.erase(var_.get());
+        self_->active_var_bindings_.erase(var_.get());
+      }
+    }
+    BindVariableDefinition(const BindVariableDefinition&) = delete;
+    BindVariableDefinition& operator=(const BindVariableDefinition&) = delete;
+    BindVariableDefinition(BindVariableDefinition&& other) : BindVariableDefinition() {
+      swap(other);
+    }
+    BindVariableDefinition& operator=(BindVariableDefinition&& other) {
+      swap(other);
+      return *this;
+    }
+    void swap(BindVariableDefinition& other) {
+      std::swap(self_, other.self_);
+      std::swap(var_, other.var_);
+    }
+
+    TransformLayoutPlanner* self_{nullptr};
+    Var var_;
+  };
+
+  struct BindBlockRealize {
+    BindBlockRealize(TransformLayoutPlanner* self, BlockRealize block_realize) : self_(self) {
+      ICHECK_EQ(block_realize->iter_values.size(), block_realize->block->iter_vars.size());
+      for (size_t i = 0; i < block_realize->iter_values.size(); i++) {
+        bound_vars_.emplace_back(self, block_realize->block->iter_vars[i]->var,
+                                 block_realize->iter_values[i]);
+      }
+      cache_ = std::move(block_realize);
+      std::swap(self_->innermost_block_realize_, cache_);
+    }
+    ~BindBlockRealize() { std::swap(self_->innermost_block_realize_, cache_); }
+    BindBlockRealize(const BindBlockRealize&) = delete;
+    BindBlockRealize& operator=(const BindBlockRealize&) = delete;
+    BindBlockRealize(BindBlockRealize&&) = delete;
+    BindBlockRealize& operator=(BindBlockRealize&&) = delete;
+
+    TransformLayoutPlanner* self_{nullptr};
+    Optional<BlockRealize> cache_;
+    std::vector<BindVariableDefinition> bound_vars_;
+  };
+
+  struct WriteInfo {
+    // The BufferStore object
+    BufferStore store;
+
+    // The block realize that contains the store, if any.
+    Optional<BlockRealize> innermost_block_realize;
+
+    // The nested loops whose values contribute to the indices used in
+    // the store.  Not all loop variables in the loopnest need to
+    // contribute, but the first and last must.
+    std::vector<For> dependent_loopnest;
+
+    // Whether the padding could be represented as a tir::if_then_else
+    // node.  This requires that the surrounding loop iterators
+    // iterate over all pre-transformation buffer axes, that there are
+    // no data dependencies between loop iterations, and that
+    bool contains_row_major_traversal{false};
+  };
+
+  /*! \brief Collected information about each BufferStore */
+  std::vector<WriteInfo> write_info_;
+
+  /*! \brief The loop iterators surrounding the current node
+   *
+   * The outermost loop iterator is `active_loops_.front()`, and the
+   * innermost loop iterator is `active_loops_.back()`.
+   *
+   * Used to fill the `WriteInfo::dependent_loopnest` field.
+   */
+  std::vector<For> active_loops_;
+
+  /*! \brief Lookup for the outer/inner loops
+   *
+   * Used to fill the `WriteInfo::dependent_loopnest` field.
+   */
+  std::unordered_map<const VarNode*, std::pair<size_t, size_t>> loop_depth_lookup_;
+
+  /*! \brief The variable mappings that are currently in-scope
+   *
+   * Used to determine whether the indices of a BufferStore are a
+   * row-major traversal, even if they are rebound in let/block
+   * mappings.
+   */
+  std::unordered_map<const VarNode*, PrimExpr> active_var_bindings_;
+
+  /*! \brief The innermost BlockRealize surrounding the current node
+   *
+   * Used to fill the `WriteInfo::innermost_block_realize` field..
+   */
+  Optional<BlockRealize> innermost_block_realize_{NullOpt};
+
+  /*! \brief The buffer to be replaced */
+  Buffer old_buffer_;
+};
+
 class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
  public:
   /*!
@@ -33,23 +668,33 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
    * \return The new AST rooting at the original parent scope and the map from the old block to the
    * new block
    */
-  static std::pair<Stmt, Map<Block, Block>> Rewrite(const Stmt& scope_stmt,
-                                                    const Buffer& old_buffer,
-                                                    const Buffer& new_buffer,
-                                                    const IndexMap& index_map) {
+  static std::pair<Stmt, Map<Block, Block>> Rewrite(
+      const Block& scope_stmt, const Buffer& old_buffer, const Buffer& new_buffer,
+      const IndexMap& index_map, const IndexMap& inverse, const PrimExpr& padding_predicate,
+      const Optional<IndexMap>& pad_value) {
+    auto plan = TransformLayoutPlanner::Plan(scope_stmt, old_buffer, new_buffer, index_map, inverse,
+                                             padding_predicate, pad_value);
+
     arith::Analyzer analyzer;
-    TransformLayoutRewriter rewriter(old_buffer, new_buffer, index_map, &analyzer);
-    Stmt result = rewriter(scope_stmt);
+    TransformLayoutRewriter rewriter(old_buffer, new_buffer, index_map, plan, &analyzer);
+    Block result = Downcast<Block>(rewriter(scope_stmt));
+    if (auto plan_ptr = std::get_if<TransformLayoutPlanner::ProloguePlan>(&plan)) {
+      auto write_ptr = result.CopyOnWrite();
+      write_ptr->body = SeqStmt({plan_ptr->prologue, write_ptr->body});
+    }
     return {result, rewriter.block_sref_reuse_};
   }
 
  private:
   TransformLayoutRewriter(const Buffer& old_buffer, const Buffer& new_buffer,
-                          const IndexMap& index_map, arith::Analyzer* analyzer)
+                          const IndexMap& index_map,
+                          const TransformLayoutPlanner::TransformPlan& plan,
+                          arith::Analyzer* analyzer)
       : IRMutatorWithAnalyzer(analyzer),
         old_buffer_(old_buffer),
         new_buffer_(new_buffer),
         index_map_(index_map),
+        plan_(plan),
         buffer_data_to_buffer_{{new_buffer->data, new_buffer}} {}
 
   void RewriteBufferAccess(Buffer* buffer, Array<PrimExpr>* indices) {
@@ -61,6 +706,31 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
   using Parent::VisitExpr_;
   using Parent::VisitStmt_;
 
+  Stmt VisitStmt(const Stmt& stmt) final {
+    Stmt output = Parent::VisitStmt(stmt);
+    if (auto plan_ptr = std::get_if<TransformLayoutPlanner::EpiloguePlan>(&plan_)) {
+      if (plan_ptr->insert_after.same_as(stmt)) {
+        return SeqStmt({output, plan_ptr->new_block});
+      }
+    }
+    return output;
+  }
+
+  Stmt VisitStmt_(const ForNode* op) final {
+    // Some replacements may include the original string, such as
+    // replacing `loop` with `{loop, post_proc}`.  In this case, avoid
+    // infinite recursion.
+
+    For node = GetRef<For>(op);
+    if (auto plan_ptr = std::get_if<TransformLayoutPlanner::ReplacementPlan>(&plan_)) {
+      auto it = plan_ptr->replacements.find(node);
+      if (it != plan_ptr->replacements.end()) {
+        return VisitStmt((*it).second);
+      }
+    }
+    return Parent::VisitStmt_(op);
+  }
+
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
     BufferLoad buffer_load = Downcast<BufferLoad>(Parent::VisitExpr_(op));
     if (buffer_load->buffer.same_as(old_buffer_)) {
@@ -97,6 +767,13 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
     auto* n = block.CopyOnWrite();
     RewriteAccessRegion(&n->reads, infered_access_regions[0]);
     RewriteAccessRegion(&n->writes, infered_access_regions[1]);
+    n->alloc_buffers.MutateByApply([this](const Buffer& buffer) {
+      if (buffer.same_as(old_buffer_)) {
+        return new_buffer_;
+      } else {
+        return buffer;
+      }
+    });
     block_sref_reuse_.Set(GetRef<Block>(op), block);
     return std::move(block);
   }
@@ -104,6 +781,7 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
   const Buffer& old_buffer_;
   const Buffer& new_buffer_;
   const IndexMap& index_map_;
+  const TransformLayoutPlanner::TransformPlan& plan_;
   Map<Var, Buffer> buffer_data_to_buffer_;
   Map<Block, Block> block_sref_reuse_;
 };
@@ -132,8 +810,158 @@ class BufferIsSubregionError : public ScheduleError {
   Buffer buffer_;
 };
 
+class TransformationPaddingIndexMapError : public ScheduleError {
+ public:
+  TransformationPaddingIndexMapError(IRModule mod, IndexMap pad_value)
+      : mod_(mod), pad_value_(pad_value) {}
+
+  String FastErrorString() const final {
+    std::ostringstream ss;
+    ss << "ScheduleError: The IndexMap specifying pad_value has "
+       << pad_value_->final_indices.size() << " outputs, should only have one output";
+    return ss.str();
+  }
+
+  String DetailRenderTemplate() const final {
+    std::ostringstream ss;
+    ss << "ScheduleError: Pad value is specified as " << pad_value_ << " which has "
+       << pad_value_->final_indices.size() << " outputs, but should only have one output";
+    return ss.str();
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {}; }
+
+ private:
+  IRModule mod_;
+  IndexMap pad_value_;
+};
+
+class TransformationPaddingTypeError : public ScheduleError {
+ public:
+  TransformationPaddingTypeError(IRModule mod, Buffer buffer, IndexMap pad_value)
+      : mod_(mod), buffer_(buffer), pad_value_(pad_value) {
+    ICHECK_EQ(pad_value_->final_indices.size(), 1);
+    pad_value_dtype_ = pad_value_->final_indices[0].dtype();
+  }
+
+  String FastErrorString() const final {
+    std::ostringstream ss;
+    ss << "ScheduleError: Type mismatch " << buffer_->dtype << " vs " << pad_value_dtype_;
+    return ss.str();
+  }
+
+  String DetailRenderTemplate() const final {
+    std::ostringstream ss;
+    ss << "ScheduleError: Buffer " << buffer_->name << " has elements of type " << buffer_->dtype
+       << ", but the transformation fills padding with " << pad_value_ << ", which is of type "
+       << pad_value_dtype_;
+    return ss.str();
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {}; }
+
+ private:
+  IRModule mod_;
+  Buffer buffer_;
+  IndexMap pad_value_;
+  DataType pad_value_dtype_;
+};
+
+class TransformationPaddingExpressionError : public ScheduleError {
+ public:
+  static void Check(IRModule mod, Buffer buffer, IndexMap pad_value) {
+    Visitor visitor(buffer);
+    ICHECK_EQ(pad_value->final_indices.size(), 1)
+        << "Internal error: Should be caught by ScheduleError checks prior to this point";
+    visitor(pad_value->final_indices[0]);
+    if (visitor.illegal_load) {
+      throw TransformationPaddingExpressionError(mod, buffer, pad_value,
+                                                 visitor.illegal_load.value());
+    }
+  }
+
+ private:
+  struct Visitor : ExprVisitor {
+    explicit Visitor(const Buffer& buffer) : buffer_(buffer) {}
+
+    void VisitExpr_(const BufferLoadNode* op) final {
+      if (!op->buffer.same_as(buffer_)) {
+        illegal_load = GetRef<BufferLoad>(op);
+      }
+      ExprVisitor::VisitExpr_(op);
+    }
+
+    const Buffer& buffer_;
+    Optional<BufferLoad> illegal_load;
+  };
+
+  TransformationPaddingExpressionError(IRModule mod, Buffer buffer, IndexMap pad_value,
+                                       BufferLoad illegal_load)
+      : mod_(mod), buffer_(buffer), pad_value_(pad_value), illegal_load_(illegal_load) {}
+
+  String FastErrorString() const final {
+    std::ostringstream ss;
+    ss << "ScheduleError: Pad value may not contain load load from " << illegal_load_->buffer->name;
+    return ss.str();
+  }
+
+  String DetailRenderTemplate() const final {
+    std::ostringstream ss;
+    ss << "ScheduleError: Pad value may only contain BufferLoad from the transformed buffer "
+       << buffer_->name << ", but pad_value " << pad_value_ << " contains expression "
+       << illegal_load_;
+    return ss.str();
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {}; }
+
+  IRModule mod_;
+  Buffer buffer_;
+  IndexMap pad_value_;
+  BufferLoad illegal_load_;
+};
+
+class TransformationIntroducesPaddingError : public ScheduleError {
+ public:
+  TransformationIntroducesPaddingError(IRModule mod, Buffer buffer, IndexMap index_map,
+                                       PrimExpr padding_predicate)
+      : mod_(std::move(mod)),
+        buffer_(std::move(buffer)),
+        index_map_(std::move(index_map)),
+        padding_predicate_(std::move(padding_predicate)) {}
+
+  String FastErrorString() const final {
+    std::ostringstream ss;
+    ss << "ScheduleError: Transformation would introduce padding at " << padding_predicate_ << ".";
+    return ss.str();
+  }
+
+  String DetailRenderTemplate() const final {
+    auto new_shape = index_map_->MapShape(buffer_->shape);
+    std::ostringstream os;
+    os << "The transformation " << index_map_ << " applied on buffer " << buffer_->name
+       << " of shape " << buffer_->shape << " would result in shape " << new_shape
+       << ".  However, this would introduce padding wherever " << padding_predicate_ << " is true.";
+    return os.str();
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {}; }
+
+ private:
+  IRModule mod_;
+  Buffer buffer_;
+  IndexMap index_map_;
+  PrimExpr padding_predicate_;
+};
+
 void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
-                     BufferIndexType buffer_index_type, const IndexMap& index_map) {
+                     BufferIndexType buffer_index_type, const IndexMap& index_map,
+                     const Optional<IndexMap>& pad_value) {
+  // Step 1: Input handling and error checking
   const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref);
   Buffer old_buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block_ptr), buffer_index, buffer_index_type);
@@ -141,33 +969,48 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_
   if (defining_site_sref.defined() && !is_alloc) {
     throw BufferIsSubregionError(self->mod, old_buffer);
   }
+  if (pad_value) {
+    if (pad_value.value()->final_indices.size() != 1) {
+      throw TransformationPaddingIndexMapError(self->mod, pad_value.value());
+    }
+    if (pad_value.value()->final_indices[0]->dtype != old_buffer->dtype) {
+      throw TransformationPaddingTypeError(self->mod, old_buffer, pad_value.value());
+    }
+
+    TransformationPaddingExpressionError::Check(self->mod, old_buffer, pad_value.value());
+  }
 
   StmtSRef scope_sref = defining_site_sref.defined()
                             ? defining_site_sref.value()
                             : GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
   const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref);
 
-  // Step 1: Infer the shape of the new buffer
-  ObjectPtr<BufferNode> new_buffer_node = make_object<BufferNode>(*(old_buffer.get()));
-  new_buffer_node->shape = index_map->MapShape(old_buffer->shape);
-  Buffer new_buffer{new_buffer_node};
+  auto [inverse, padding_predicate] = [&]() {
+    Array<Range> region;
+    for (const auto& dim : old_buffer->shape) {
+      region.push_back(Range::FromMinExtent(0, dim));
+    }
+    return index_map.NonSurjectiveInverse(region);
+  }();
+
+  bool has_padding = !is_zero(padding_predicate);
+  if (has_padding && !pad_value.defined()) {
+    throw TransformationIntroducesPaddingError(self->mod, old_buffer, index_map, padding_predicate);
+  }
 
-  // Step 2: Rewrite access indices and regions of the buffer
-  auto [new_stmt, block_sref_reuse] = TransformLayoutRewriter::Rewrite(
-      GetRef<Block>(scope_block), old_buffer, new_buffer, index_map);
+  // Step 2: Infer the shape of the new buffer
+  Buffer new_buffer = old_buffer;
+  new_buffer.CopyOnWrite()->shape = index_map->MapShape(old_buffer->shape);
+
+  // Step 3: Rewrite BufferLoad/BufferStore access indices, block read/write regions, and block
+  // alloc_buffers.
+  auto [new_stmt, block_sref_reuse] =
+      TransformLayoutRewriter::Rewrite(GetRef<Block>(scope_block), old_buffer, new_buffer,
+                                       index_map, inverse, padding_predicate, pad_value);
   Block new_scope_block = Downcast<Block>(new_stmt);
 
-  // Step 3: Rewrite alloc_buffer of the block or buffer_map of the PrimFunc.
-  if (defining_site_sref.defined()) {
-    auto* n = new_scope_block.CopyOnWrite();
-    n->alloc_buffers.MutateByApply([&old_buffer, &new_buffer](const Buffer& buffer) {
-      if (buffer.same_as(old_buffer)) {
-        return new_buffer;
-      }
-      return buffer;
-    });
-    block_sref_reuse.Set(GetRef<Block>(scope_block), new_scope_block);
-  } else {
+  // Step 4: Rewrite buffer_map of the PrimFunc if necessary.
+  if (!defining_site_sref.defined()) {
     GlobalVar g_var;
     GetRootPrimFunc(self->mod, scope_block, &g_var);
     IRModuleNode* new_mod = self->mod.CopyOnWrite();
@@ -502,17 +1345,20 @@ struct TransformLayoutTraits : public UnpackedInstTraits<TransformLayoutTraits>
 
  private:
   static constexpr size_t kNumInputs = 1;
-  static constexpr size_t kNumAttrs = 3;
+  static constexpr size_t kNumAttrs = 4;
   static constexpr size_t kNumDecisions = 0;
 
   static void UnpackedApplyToSchedule(Schedule sch, BlockRV block_rv, Integer buffer_index,
-                                      Integer buffer_index_type, IndexMap index_map) {
+                                      Integer buffer_index_type, IndexMap index_map,
+                                      Optional<IndexMap> pad_value) {
     return sch->TransformLayout(block_rv, buffer_index.IntValue(),
-                                static_cast<BufferIndexType>(buffer_index_type->value), index_map);
+                                static_cast<BufferIndexType>(buffer_index_type->value), index_map,
+                                pad_value);
   }
 
   static String UnpackedAsPython(Array<String> outputs, String block_rv, Integer buffer_index,
-                                 Integer buffer_index_type, IndexMap index_map) {
+                                 Integer buffer_index_type, IndexMap index_map,
+                                 Optional<IndexMap> pad_value) {
     PythonAPICall py("transform_layout");
     py.Input("block", block_rv);
 
@@ -522,6 +1368,8 @@ struct TransformLayoutTraits : public UnpackedInstTraits<TransformLayoutTraits>
     py.Input("buffer", os.str());
 
     py.Input("index_map", index_map->ToPythonString());
+    py.Input("pad_value", pad_value ? pad_value.value()->ToPythonString() : "None");
+
     return py.Str();
   }
 
@@ -532,6 +1380,7 @@ struct TransformLayoutTraits : public UnpackedInstTraits<TransformLayoutTraits>
     attrs_record.push_back(attrs[0]);
     attrs_record.push_back(attrs[1]);
     attrs_record.push_back(String(::tvm::SaveJSON(attrs[2])));
+    attrs_record.push_back(attrs[3]);
     return std::move(attrs_record);
   }
 
@@ -541,6 +1390,7 @@ struct TransformLayoutTraits : public UnpackedInstTraits<TransformLayoutTraits>
     attrs.push_back(attrs_record[0]);
     attrs.push_back(attrs_record[1]);
     attrs.push_back(::tvm::LoadJSON(Downcast<String>(attrs_record[2])));
+    attrs.push_back(attrs_record[3]);
     return attrs;
   }
 
diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc
index d72f67fb7c2d..2f27dbb9fbf1 100644
--- a/src/tir/schedule/schedule.cc
+++ b/src/tir/schedule/schedule.cc
@@ -248,9 +248,11 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleUnannotate")
 /******** (FFI) Layout transformation ********/
 TVM_REGISTER_GLOBAL("tir.schedule.ScheduleTransformLayout")
     .set_body_typed([](Schedule self, const BlockRV& block_rv, int buffer_index,
-                       int buffer_index_type, const IndexMap& index_map) {
+                       int buffer_index_type, const IndexMap& index_map,
+                       const Optional<IndexMap>& pad_value) {
       return self->TransformLayout(block_rv, buffer_index,
-                                   static_cast<BufferIndexType>(buffer_index_type), index_map);
+                                   static_cast<BufferIndexType>(buffer_index_type), index_map,
+                                   pad_value);
     });
 TVM_REGISTER_GLOBAL("tir.schedule.ScheduleTransformBlockLayout")
     .set_body_method<Schedule>(&ScheduleNode::TransformBlockLayout);
diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc
index a31950d33115..9ff793dc39dd 100644
--- a/src/tir/schedule/traced_schedule.cc
+++ b/src/tir/schedule/traced_schedule.cc
@@ -487,14 +487,17 @@ void TracedScheduleNode::Unannotate(const BlockRV& block_rv, const String& ann_k
 
 void TracedScheduleNode::TransformLayout(const BlockRV& block_rv, int buffer_index,
                                          BufferIndexType buffer_index_type,
-                                         const IndexMap& index_map) {
-  ConcreteScheduleNode::TransformLayout(block_rv, buffer_index, buffer_index_type, index_map);
+                                         const IndexMap& index_map,
+                                         const Optional<IndexMap>& pad_value) {
+  ConcreteScheduleNode::TransformLayout(block_rv, buffer_index, buffer_index_type, index_map,
+                                        pad_value);
   static const InstructionKind& kind = InstructionKind::Get("TransformLayout");
   trace_->Append(
-      /*inst=*/Instruction(/*kind=*/kind,
-                           /*inputs=*/{block_rv},
-                           /*attrs=*/{Integer(buffer_index), Integer(buffer_index_type), index_map},
-                           /*outputs=*/{}));
+      /*inst=*/Instruction(
+          /*kind=*/kind,
+          /*inputs=*/{block_rv},
+          /*attrs=*/{Integer(buffer_index), Integer(buffer_index_type), index_map, pad_value},
+          /*outputs=*/{}));
 }
 
 void TracedScheduleNode::TransformBlockLayout(const BlockRV& block_rv, const IndexMap& index_map) {
diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h
index ad44cc6ae552..0e83b35f44e9 100644
--- a/src/tir/schedule/traced_schedule.h
+++ b/src/tir/schedule/traced_schedule.h
@@ -103,7 +103,7 @@ class TracedScheduleNode : public ConcreteScheduleNode {
   void Unannotate(const BlockRV& block_rv, const String& ann_key) override;
   /******** Schedule: Layout transformation ********/
   void TransformLayout(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type,
-                       const IndexMap& index_map) override;
+                       const IndexMap& index_map, const Optional<IndexMap>& pad_value) override;
   void TransformBlockLayout(const BlockRV& block_rv, const IndexMap& index_map) override;
   void SetAxisSeparator(const BlockRV& block_rv, int buffer_index,
                         BufferIndexType buffer_index_type,
diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py
index 0332df7fd312..8ed350cc4c46 100644
--- a/tests/python/unittest/test_tir_schedule_transform_layout.py
+++ b/tests/python/unittest/test_tir_schedule_transform_layout.py
@@ -329,5 +329,415 @@ def test_transform_block_layout_fail_mixed_iter_type(use_block_name):
         )
 
 
+class BasePaddingCompare(tvm.testing.CompareBeforeAfter):
+    pad_value = tvm.testing.parameter(None)
+
+    transformed_buffer = tvm.testing.parameter("A")
+
+    @pytest.fixture
+    def transform(self, pad_value, transformed_buffer):
+        def transform(mod):
+            sch = tir.Schedule(mod)
+            sch.transform_layout(
+                "block", transformed_buffer, lambda i: [i // 4, i % 4], pad_value=pad_value
+            )
+            return sch.mod
+
+        return transform
+
+
+class TestNoPadding(BasePaddingCompare):
+    """Transformations without padding do not depend on pad_value."""
+
+    pad_value = tvm.testing.parameter(None, 42)
+
+    def before():
+        A = T.alloc_buffer(16, "int32")
+        for i in T.serial(16):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                A[vi] = 0
+
+    def expected():
+        A = T.alloc_buffer([4, 4], "int32")
+        for i in T.serial(16):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                A[vi // 4, vi % 4] = 0
+
+
+class TestNoPaddingMultipleUsage(BasePaddingCompare):
+    """Transformations without padding do not depend on pad_value.
+
+    Like TestNoPadding, but the buffer A shows up in multiple
+    locations.  To remain internally consistent, all instances of the
+    buffer should be rewritten.
+    """
+
+    pad_value = tvm.testing.parameter(None, 42)
+
+    def before():
+        A = T.alloc_buffer(16, "int32")
+        for i in T.serial(16):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                A[vi] = 0
+
+        B = T.alloc_buffer(16, "int32")
+        for i in T.serial(16):
+            with T.block("other"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi]
+
+    def expected():
+        A = T.alloc_buffer([4, 4], "int32")
+        for i in T.serial(16):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                A[vi // 4, vi % 4] = 0
+
+        B = T.alloc_buffer(16, "int32")
+        for i in T.serial(16):
+            with T.block("other"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi // 4, vi % 4]
+
+
+class TestNoPaddingOpaqueBlock(BasePaddingCompare):
+    """Transformations without padding do not depend on pad_value.
+
+    Like TestNoPadding, but buffer access is done in an opaque block.
+    """
+
+    pad_value = tvm.testing.parameter(None, 42)
+
+    def before():
+        A = T.alloc_buffer(16, "int32")
+        for i in T.serial(16):
+            with T.block("block"):
+                A[i] = 0
+
+    def expected():
+        A = T.alloc_buffer([4, 4], "int32")
+        for i in T.serial(16):
+            with T.block("block"):
+                A[i // 4, i % 4] = 0
+
+
+class TestErrorIfPaddingForbidden(BasePaddingCompare):
+    """Unless padding is explicitly enabled, should raise error"""
+
+    def before():
+        A = T.alloc_buffer(14, "int32")
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                A[vi] = 0
+
+    expected = tvm.tir.schedule.schedule.ScheduleError
+
+
+class TestErrorOnWrongPaddingType(BasePaddingCompare):
+    """The padding must have the same dtype as the buffer"""
+
+    pad_value = tvm.testing.parameter(0.5)
+
+    def before():
+        A = T.alloc_buffer(14, "int32")
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                A[vi] = 0
+
+    expected = tvm.tir.schedule.schedule.ScheduleError
+
+
+class TestPaddedTransformIfThenElse(BasePaddingCompare):
+    """Use if_then_else to represent padding, if possible.
+
+    For a block that is a producer of the pre-transformation buffer,
+    which visits all indices according to a row-major traversal, and
+    which has no effect other than producing the transformed buffer,
+    transform the loop iterators to be a row-major traversal of the
+    post-transformation buffer, with padding represented by
+    `T.if_then_else`.
+    """
+
+    pad_value = tvm.testing.parameter(0)
+    transformed_buffer = tvm.testing.parameter("B")
+
+    def before(A: T.Buffer[14, "int32"]):
+        B = T.alloc_buffer(14, "int32")
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi]
+
+    def expected(A: T.Buffer[14, "int32"]):
+        B = T.alloc_buffer([4, 4], "int32")
+        for i, j in T.grid(4, 4):
+            with T.block("block"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                B[vi, vj] = T.if_then_else(vi == 3 and 2 <= vj, 0, A[vi * 4 + vj], dtype="int32")
+
+
+class TestPaddedTransformWithoutLoop(BasePaddingCompare):
+    """Handle padded writes without a loop
+
+    The statement being replaced may be something other than a
+    for-loop, such as if a loop has already been unrolled.
+    """
+
+    pad_value = tvm.testing.parameter(0)
+
+    def before(A: T.Buffer[14, "int32"]):
+        with T.block("root"):
+            T.reads()
+            T.writes()
+            with T.block("block"):
+                A[0] = 0
+
+    def expected(A: T.Buffer[(4, 4), "int32"]):
+        with T.block("block"):
+            A[0, 0] = 0
+
+        for i, j in T.grid(4, 4):
+            with T.block("buffer_A_padding"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.where(i == 3 and 2 <= j)
+                A[vi, vj] = 0
+
+
+class TestPaddedTransformIfThenElseReduction(BasePaddingCompare):
+    """Like TestPaddedTransformIfThenElse, but with a reduction axis"""
+
+    pad_value = tvm.testing.parameter(0)
+    transformed_buffer = tvm.testing.parameter("B")
+
+    def before(A: T.Buffer[(14, 32), "int32"]):
+        B = T.alloc_buffer(14, "int32")
+        for i, k in T.grid(14, 32):
+            with T.block("block"):
+                vi, vk = T.axis.remap("SR", [i, k])
+                with T.init():
+                    B[vi] = 0
+                B[vi] = B[vi] + A[vi, vk]
+
+    def expected(A: T.Buffer[(14, 32), "int32"]):
+        B = T.alloc_buffer([4, 4], "int32")
+        for i, j, k in T.grid(4, 4, 32):
+            with T.block("block"):
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    B[vi, vj] = T.if_then_else(vi == 3 and 2 <= vj, 0, 0, dtype="int32")
+                B[vi, vj] = T.if_then_else(
+                    vi == 3 and 2 <= vj, 0, B[vi, vj] + A[vi * 4 + vj, vk], dtype="int32"
+                )
+
+
+class TestPaddedTransformIfThenElseReductionOpaque(BasePaddingCompare):
+    """Like TestPaddedTransformIfThenElseReduction, but with opaque blocks"""
+
+    pad_value = tvm.testing.parameter(0)
+    transformed_buffer = tvm.testing.parameter("B")
+
+    def before(A: T.Buffer[(14, 32), "int32"]):
+        B = T.alloc_buffer(14, "int32")
+        for i in T.serial(14):
+            B[i] = 0
+            for k in T.serial(32):
+                with T.block("block"):
+                    B[i] = B[i] + A[i, k]
+
+    def expected(A: T.Buffer[(14, 32), "int32"]):
+        B = T.alloc_buffer([4, 4], "int32")
+        for i, j in T.grid(4, 4):
+            B[i, j] = T.if_then_else(i == 3 and 2 <= j, 0, 0, dtype="int32")
+            for k in T.serial(32):
+                with T.block("block"):
+                    B[i, j] = T.if_then_else(
+                        i == 3 and 2 <= j, 0, B[i, j] + A[i * 4 + j, k], dtype="int32"
+                    )
+
+
+class TestPaddedTransformPostProcIfRequiredDueToSideEffects(BasePaddingCompare):
+    """Set the transformation padding in a post-processing block.
+
+    Like TestPaddedTransformIfThenElse, but the block that produces B
+    also has the effect of setting `C`.
+    """
+
+    pad_value = tvm.testing.parameter(0)
+    transformed_buffer = tvm.testing.parameter("B")
+
+    def before(A: T.Buffer[14, "int32"]):
+        B = T.alloc_buffer(14, "int32")
+        C = T.alloc_buffer(14, "int32")
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi]
+                C[vi] = 0
+
+    def expected(A: T.Buffer[14, "int32"]):
+        B = T.alloc_buffer([4, 4], "int32")
+        C = T.alloc_buffer(14, "int32")
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                B[vi // 4, vi % 4] = A[vi]
+                C[vi] = 0
+
+        for i, j in T.grid(4, 4):
+            with T.block("block_pad_B"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.where(i == 3 and 2 <= j)
+                B[vi, vj] = 0
+
+
+class TestPaddedTransformOfInputCreatesAssumption(BasePaddingCompare):
+    """Transformation of an input buffer places T.assume locally"""
+
+    pad_value = tvm.testing.parameter(42)
+
+    def before(A: T.Buffer[14, "int32"], B: T.Buffer[14, "int32"]):
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi]
+
+    def expected(A: T.Buffer[(4, 4), "int32"], B: T.Buffer[14, "int32"]):
+        for i, j in T.grid(4, 4):
+            with T.block("buffer_A_assumption"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.assume(not (vi == 3 and 2 <= vj) or A[vi, vj] == 42)
+
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi // 4, vi % 4]
+
+
+class TestPaddedTransformNonConstantValue(tvm.testing.CompareBeforeAfter):
+    """Allow an expression to specify the pad value.
+
+    Like TestPaddedTransformIfThenElse, but the pad value depends on
+    the indices.
+    """
+
+    @pytest.fixture
+    def transform(self):
+        def transform(mod):
+            sch = tir.Schedule(mod)
+            sch.transform_layout(
+                "block",
+                "B",
+                lambda i: [i // 4, i % 4],
+                pad_value=lambda i, j: i + j,
+            )
+            return sch.mod
+
+        return transform
+
+    def before(A: T.Buffer[14, "int32"]):
+        B = T.alloc_buffer(14, "int32")
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi]
+
+    def expected(A: T.Buffer[14, "int32"]):
+        B = T.alloc_buffer([4, 4], "int32")
+        for i, j in T.grid(4, 4):
+            with T.block("block"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                B[vi, vj] = T.if_then_else(
+                    vi == 3 and 2 <= vj, vi + vj, A[vi * 4 + vj], dtype="int32"
+                )
+
+
+@pytest.mark.xfail(reason="Not yet implemented")
+class TestPaddedTransformRepeatedBufferElement(tvm.testing.CompareBeforeAfter):
+    """Allow an expression to specify the pad value.
+
+    Like TestPaddedTransformOfInputCreatesAssumption, but the pad
+    value depends on another portion of the buffer.  In this case, the
+    padding at the end of A contains repeated elements from the
+    beginning of A.
+    """
+
+    @pytest.fixture
+    def transform(self):
+        def transform(mod):
+            sch = tir.Schedule(mod)
+
+            A = sch.get(sch.get_block("block")).reads[0].buffer
+            sch.transform_layout(
+                "block",
+                "A",
+                lambda i: [i // 4, i % 4],
+                pad_value=lambda i, j: A[(4 * i + j) % 14],
+            )
+            return sch.mod
+
+        return transform
+
+    def before(A: T.Buffer[14, "int32"]):
+        B = T.alloc_buffer(14, "int32")
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi]
+
+    def expected(A: T.Buffer[(4, 4), "int32"]):
+        for i, j in T.grid(4, 4):
+            with T.block("buffer_A_assumption"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.assume(
+                    not (vi == 3 and 2 <= vj)
+                    or A[vi, vj] == A[((4 * vi + j) % 14) // 4, ((4 * vi + j) % 14) % 4]
+                )
+
+        B = T.alloc_buffer(14, "int32")
+        for i in T.grid(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi // 4, vi % 4]
+
+
+class TestPadValueMayNotReferenceOtherBuffer(tvm.testing.CompareBeforeAfter):
+    """Allow an expression to specify the pad value.
+
+    Like TestPaddedTransformRepeatedBufferElement, but the pad value depends on
+    a different buffer, which is not allowed.
+    """
+
+    @pytest.fixture
+    def transform(self):
+        def transform(mod):
+            sch = tir.Schedule(mod)
+
+            A = sch.get(sch.get_block("block")).reads[0].buffer
+            other = tir.decl_buffer(1, A.dtype, name="other")
+            sch.transform_layout(
+                "block",
+                "A",
+                lambda i: [i // 4, i % 4],
+                pad_value=lambda i, j: other[0],
+            )
+            return sch.mod
+
+        return transform
+
+    def before(A: T.Buffer[14, "int32"]):
+        B = T.alloc_buffer(14, "int32")
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi]
+
+    expected = tvm.tir.schedule.schedule.ScheduleError
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From e30ac71bdecea3625c150a49591c886e60a48479 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 19 Sep 2022 11:50:20 -0500
Subject: [PATCH 205/704] [Arith][TIR] IntSetAnalyzer, delay intersection of
 IntSet until use (#12821)

Follow-up from https://github.com/apache/tvm/pull/11970, to improve
performance.  In the initial implementation, the `analyzer->int_set`
would compute the intersection of all scope-based constraints when
entering the scope, even if they weren't actually used.  This commit
delays the call to `Intersect` until required, following the same
behavior as `ConstIntBound`.
---
 src/arith/int_set.cc | 126 ++++++++++++++++++-------------------------
 1 file changed, 52 insertions(+), 74 deletions(-)

diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc
index 35b12bb35238..7d601d9a8bae 100644
--- a/src/arith/int_set.cc
+++ b/src/arith/int_set.cc
@@ -362,8 +362,13 @@ using namespace tir;
 // We might use better set analysis in the future to replace the intervalset.
 class IntervalSetEvaluator : public ExprFunctor<IntervalSet(const PrimExpr&)> {
  public:
-  IntervalSetEvaluator(Analyzer* analyzer, const Map<Var, IntSet>& dom_map, bool eval_vec = false)
-      : analyzer_(analyzer), dom_map_(dom_map), eval_vec_(eval_vec) {}
+  IntervalSetEvaluator(Analyzer* analyzer, const Map<Var, IntSet>& dom_map,
+                       const std::vector<std::pair<Var, IntSet>>* dom_constraints = nullptr,
+                       bool eval_vec = false)
+      : analyzer_(analyzer),
+        dom_map_(dom_map),
+        dom_constraints_(dom_constraints),
+        eval_vec_(eval_vec) {}
 
   IntervalSet Eval(const PrimExpr& val) { return this->VisitExpr(val); }
   // evaluate and relax the set
@@ -383,18 +388,40 @@ class IntervalSetEvaluator : public ExprFunctor<IntervalSet(const PrimExpr&)> {
 
   IntervalSet VisitExpr_(const VarNode* op) final {
     Var var = GetRef<Var>(op);
+
+    Array<IntSet> values;
+    if (dom_constraints_) {
+      for (const auto& constraint : *dom_constraints_) {
+        if (var.same_as(constraint.first)) {
+          values.push_back(constraint.second);
+        }
+      }
+    }
+
     auto it = dom_map_.find(var);
     if (it != dom_map_.end()) {
-      IntervalSet res = ToIntervalSet((*it).second);
-      if (res->min_value.same_as(var) && res->max_value.same_as(var)) {
-        return res;
-      }
-      // recursively evaluate mapped result
-      // in case the domain contains variables to be relaxed.
-      return Eval(res);
-    } else {
+      values.push_back((*it).second);
+    }
+
+    if (values.empty()) {
       return IntervalSet::SinglePoint(var);
     }
+
+    IntSet intersection = [&]() {
+      if (values.size() == 1) {
+        return values.front();
+      } else {
+        return Intersect(values);
+      }
+    }();
+
+    IntervalSet res = ToIntervalSet(intersection);
+    if (res->min_value.same_as(var) && res->max_value.same_as(var)) {
+      return res;
+    }
+    // recursively evaluate mapped result
+    // in case the domain contains variables to be relaxed.
+    return Eval(res);
   }
 
   IntervalSet VisitExpr_(const AddNode* op) final { return VisitBinaryExpr_<Add>(op); }
@@ -517,6 +544,7 @@ class IntervalSetEvaluator : public ExprFunctor<IntervalSet(const PrimExpr&)> {
   // analyzer
   Analyzer* analyzer_;
   const Map<Var, IntSet>& dom_map_;
+  const std::vector<std::pair<Var, IntSet>>* dom_constraints_;
   bool eval_vec_{false};
 };
 
@@ -529,7 +557,7 @@ class IntSetAnalyzer::Impl {
   }
 
   IntSet Eval(const PrimExpr& expr) const {
-    return IntervalSetEvaluator(analyzer_, GetCurrentBounds(), true).Eval(expr);
+    return IntervalSetEvaluator(analyzer_, dom_map_, &dom_constraints_, true).Eval(expr);
   }
 
   void Bind(const Var& var, const Range& range, bool allow_override) {
@@ -541,10 +569,6 @@ class IntSetAnalyzer::Impl {
   std::function<void()> EnterConstraint(const PrimExpr& constraint);
 
  private:
-  // Get the current variable bounds, including both global bounds and
-  // scope-dependent bounds.
-  Map<Var, IntSet> GetCurrentBounds() const;
-
   // Utility function to split a boolean condition into the domain
   // bounds implied by that condition.
   static std::vector<std::pair<Var, IntSet>> DetectBoundInfo(const PrimExpr& cond);
@@ -556,9 +580,11 @@ class IntSetAnalyzer::Impl {
   // ranges)
   Map<Var, IntSet> dom_map_;
 
-  // Map of variables to implicit scope-dependent bounds (e.g. inside
-  // the body of an if-statement)
-  Map<Var, IntSet> constraints_;
+  // List of implicit scope-dependent bounds (e.g. inside the body of
+  // an if-statement).  Maintained as a list of constraints, rather
+  // than as a `Map<Var,IntSet>`, to avoid computing an Intersection
+  // until required.
+  std::vector<std::pair<Var, IntSet>> dom_constraints_;
 };
 
 IntSetAnalyzer::IntSetAnalyzer(Analyzer* parent) : impl_(new Impl(parent)) {}
@@ -603,29 +629,6 @@ void IntSetAnalyzer::Impl::Bind(const Var& var, const PrimExpr& expr, bool can_o
   Update(var, Eval(expr), can_override);
 }
 
-Map<Var, IntSet> IntSetAnalyzer::Impl::GetCurrentBounds() const {
-  // If either constraints_ or dom_map_ is empty, return the other to
-  // avoid constructing a new map.
-  if (constraints_.empty()) {
-    return dom_map_;
-  } else if (dom_map_.empty()) {
-    return constraints_;
-  }
-
-  // If neither is empty, construct a merged domain map with
-  // information from both sources.
-  Map<Var, IntSet> merged = dom_map_;
-  for (const auto& pair : constraints_) {
-    auto it = merged.find(pair.first);
-    if (it == merged.end()) {
-      merged.Set(pair.first, pair.second);
-    } else {
-      merged.Set(pair.first, Intersect({pair.second, (*it).second}));
-    }
-  }
-  return merged;
-}
-
 std::vector<std::pair<Var, IntSet>> IntSetAnalyzer::Impl::DetectBoundInfo(
     const PrimExpr& constraint) {
   PVar<Var> x;
@@ -665,41 +668,16 @@ std::function<void()> IntSetAnalyzer::EnterConstraint(const PrimExpr& constraint
 }
 
 std::function<void()> IntSetAnalyzer::Impl::EnterConstraint(const PrimExpr& constraint) {
-  Map<Var, IntSet> cached_values;
-
   auto bounds = DetectBoundInfo(constraint);
 
   if (bounds.size() == 0) return nullptr;
 
-  // Collect the current values of each var that is changes by this
-  // constraint.
-  for (const auto& pair : bounds) {
-    auto it = constraints_.find(pair.first);
-    if (it == constraints_.end()) {
-      cached_values.Set(pair.first, IntSet());
-    } else {
-      cached_values.Set(pair.first, (*it).second);
-    }
-  }
-
-  // Update all constraints
-  for (const auto& pair : bounds) {
-    auto it = constraints_.find(pair.first);
-    if (it == constraints_.end()) {
-      constraints_.Set(pair.first, pair.second);
-    } else {
-      constraints_.Set(pair.first, Intersect({pair.second, (*it).second}));
-    }
-  }
-
-  auto frecover = [cached_values, this]() {
-    for (const auto& it : cached_values) {
-      if (it.second.defined()) {
-        constraints_.Set(it.first, it.second);
-      } else {
-        constraints_.erase(it.first);
-      }
-    }
+  size_t old_size = dom_constraints_.size();
+  dom_constraints_.insert(dom_constraints_.end(), bounds.begin(), bounds.end());
+  size_t new_size = dom_constraints_.size();
+  auto frecover = [old_size, new_size, this]() {
+    ICHECK_EQ(dom_constraints_.size(), new_size);
+    dom_constraints_.resize(old_size);
   };
   return frecover;
 }
@@ -960,13 +938,13 @@ Map<Var, IntSet> ConvertDomMap(const std::unordered_map<const VarNode*, IntSet>&
 
 IntSet EvalSet(PrimExpr e, const Map<Var, IntSet>& dom_map) {
   Analyzer ana;
-  return IntervalSetEvaluator(&ana, dom_map, false).Eval(e);
+  return IntervalSetEvaluator(&ana, dom_map, {}, false).Eval(e);
 }
 
 IntSet IntSet::Vector(PrimExpr x) {
   Analyzer ana;
   Map<Var, IntSet> dmap;
-  return IntervalSetEvaluator(&ana, dmap, true).Eval(x);
+  return IntervalSetEvaluator(&ana, dmap, {}, true).Eval(x);
 }
 
 IntSet EvalSet(PrimExpr e, const Map<IterVar, IntSet>& dom_map) {

From da7f65d9d152397f8f7e73b21c6310f976e64bfd Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Mon, 19 Sep 2022 10:01:29 -0700
Subject: [PATCH 206/704] [Hexagon] Create test examples to show
 parallelization (#12654)

* [Hexagon] Create test examples to show parallelization working on Hexagon workloads.

* Increase max size of tvm_rpc_android buffer size.

* Reformat tests to be parameterized.

* Comment out tests to speedup CI.
---
 .../contrib/test_hexagon/test_parallel_hvx.py | 230 ++++++++++++++++++
 .../test_hexagon/test_parallel_scalar.py      | 159 ++++++++++++
 2 files changed, 389 insertions(+)
 create mode 100644 tests/python/contrib/test_hexagon/test_parallel_hvx.py
 create mode 100644 tests/python/contrib/test_hexagon/test_parallel_scalar.py

diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx.py b/tests/python/contrib/test_hexagon/test_parallel_hvx.py
new file mode 100644
index 000000000000..a34f5b8e261b
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx.py
@@ -0,0 +1,230 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Test parallelizing HVX workloads and compare them to single thread examples. 
+"""
+import numpy as np
+import tvm
+
+from tvm.script import tir as T
+from numpy.random import default_rng
+
+TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n    -Single Thread: {} ms \n    -Parallel: {} ms\n    -Speedup: {}x\n"
+
+
+def get_vrmpy_shape_dtypes(operations):
+    return ((operations, 128), "uint8", (operations, 128), "uint8", (operations, 32), "int32")
+
+
+def get_vmpy_vadd_shape_dtype(operations):
+    return ((operations, 128), "uint8", (operations, 128), "uint8", (operations, 128), "int16")
+
+
+def vmpy_expected_producer(shape, a, b):
+    expected = np.zeros(shape, dtype="int16")
+    for n in range(shape[0]):
+        for i in range(0, 128, 2):
+            expected[n, i // 2] = np.int16(a[n, i]) * np.int16(b[n, i])
+        for i in range(1, 128, 2):
+            expected[n, i // 2 + 64] = np.int16(a[n, i]) * np.int16(b[n, i])
+    return expected
+
+
+def vadd_expected_producer(shape, a, b):
+    expected = np.zeros(shape, dtype="int16")
+    for n in range(shape[0]):
+        for i in range(0, 128, 2):
+            expected[n, i // 2] = np.int16(a[n, i]) + np.int16(b[n, i])
+        for i in range(1, 128, 2):
+            expected[n, i // 2 + 64] = np.int16(a[n, i]) + np.int16(b[n, i])
+    return expected
+
+
+def vrmpy_expected_producer(shape, a, b):
+    expected = np.zeros(shape, dtype="int32")
+    for n in range(shape[0]):
+        for i in range(32):
+            for r in range(4):
+                expected[n, i] = expected[n, i] + np.uint32(a[n, i * 4 + r]) * np.uint32(
+                    b[n, i * 4 + r]
+                )
+    return expected
+
+
+def get_vmpy_operator(operations):
+    @T.prim_func
+    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [operations, 128], dtype="uint8")
+        B = T.match_buffer(b, [operations, 128], dtype="uint8")
+        C = T.match_buffer(c, [operations, 128], dtype="int16")
+        for n in T.grid(operations):
+            with T.block("C"):
+                vn = T.axis.remap("S", [n])
+                C[vn, T.ramp(0, 1, 128)] = T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vmpybusv.128B"),
+                    T.uint32(2),
+                    T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    dtype="int16x128",
+                )
+
+    return operator
+
+
+def get_vadd_operator(operations):
+    @T.prim_func
+    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [operations, 128], dtype="uint8")
+        B = T.match_buffer(b, [operations, 128], dtype="uint8")
+        C = T.match_buffer(c, [operations, 128], dtype="int16")
+        for n in T.grid(operations):
+            with T.block("C"):
+                vn = T.axis.remap("S", [n])
+                C[vn, T.ramp(0, 1, 128)] = T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vaddubh.128B"),
+                    T.uint32(2),
+                    T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    dtype="int16x128",
+                )
+
+    return operator
+
+
+def get_vrmpy_operator(operations):
+    @T.prim_func
+    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [operations, 128], dtype="uint8")
+        B = T.match_buffer(b, [operations, 128], dtype="uint8")
+        C = T.match_buffer(c, [operations, 32], dtype="int32")
+        for n in T.grid(operations):
+            with T.block("C"):
+                vn = T.axis.remap("S", [n])
+                C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
+                    T.uint32(2),
+                    T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    dtype="int32x32",
+                )
+
+    return operator
+
+
+def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch):
+    a_shape, a_dtype, b_shape, b_dtype, c_shape, c_dtype = shape_dtypes
+
+    target_hexagon = tvm.target.hexagon("v68")
+    func_tir = tvm.build(
+        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
+    )
+    module = hexagon_session.load_module(func_tir)
+
+    rng = default_rng()
+    a = rng.integers(0, 16, a_shape, dtype=a_dtype)
+    b = rng.integers(0, 16, b_shape, dtype=b_dtype)
+    c = np.zeros(c_shape, dtype=c_dtype)
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
+    b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device)
+    c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
+
+    # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise.
+    number = 1
+    repeat = 1
+
+    timer = module.time_evaluator(
+        "__tvm_main__", hexagon_session.device, number=number, repeat=repeat
+    )
+    runtime = timer(a_hexagon, b_hexagon, c_hexagon)
+    tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output_producer(c_shape, a, b))
+
+    return round(runtime.mean * 1000, 6)
+
+
+class TestMatMulVec:
+
+    (
+        operation_name,
+        operator_producer,
+        shape_dtypes_producer,
+        expected_output_producer,
+    ) = tvm.testing.parameters(
+        ("vrmpy", get_vrmpy_operator, get_vrmpy_shape_dtypes, vrmpy_expected_producer),
+        ("vmpy", get_vmpy_operator, get_vmpy_vadd_shape_dtype, vmpy_expected_producer),
+        ("vadd", get_vadd_operator, get_vmpy_vadd_shape_dtype, vadd_expected_producer),
+    )
+
+    # Experimentally best split factor but all multiples of 4 perform pretty well.
+    # This is because there are 4 HVX untis available on the device and pipelining
+    # works best with parallels of the number of available HVX.
+    split_factor = tvm.testing.parameter(4)
+
+    # Removed most of these to speedup CI.
+    operation_count = tvm.testing.parameter(
+        128,
+        # 256,
+        # 512,
+        # 1024,  # Single thread runs faster since L2 cache can handle the entire request quickly
+        # 2048,
+        # 4096,  # Significant performance degredation once the inputs and outputs cannot all fit in L2
+        # 8192,
+        # 16384,
+    )
+
+    @tvm.testing.requires_hexagon
+    def test(
+        self,
+        hexagon_session,
+        operation_count,
+        operation_name,
+        operator_producer,
+        shape_dtypes_producer,
+        expected_output_producer,
+        split_factor,
+    ):
+
+        sch = tvm.tir.Schedule(operator_producer(operation_count))
+        single_thread_runtime = evaluate(
+            hexagon_session, shape_dtypes_producer(operation_count), expected_output_producer, sch
+        )
+
+        sch = tvm.tir.Schedule(operator_producer(operation_count))
+        block = sch.get_block("C")
+        b = sch.get_loops(block)
+        bo, _ = sch.split(b[0], factors=[split_factor, None])
+        sch.parallel(bo)
+
+        parallel_runtime = evaluate(
+            hexagon_session, shape_dtypes_producer(operation_count), expected_output_producer, sch
+        )
+
+        speedup = round(single_thread_runtime / parallel_runtime, 2)
+
+        print(
+            TEST_OUTPUT_TEMPLATE.format(
+                operation_name, operation_count, single_thread_runtime, parallel_runtime, speedup
+            )
+        )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_parallel_scalar.py b/tests/python/contrib/test_hexagon/test_parallel_scalar.py
new file mode 100644
index 000000000000..b3d07ae978ba
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_parallel_scalar.py
@@ -0,0 +1,159 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test parallelism for multiple different scalar workloads. """
+
+import numpy as np
+import tvm
+
+from tvm.script import tir as T
+from numpy.random import default_rng
+
+TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n    -Single Thread: {} ms \n    -Parallel: {} ms\n    -Speedup: {}x\n"
+
+
+def get_add_operator(operations):
+    @T.prim_func
+    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [operations], dtype="float64")
+        B = T.match_buffer(b, [operations], dtype="float64")
+        C = T.match_buffer(c, [operations], dtype="float64")
+        for n in T.grid(operations):
+            with T.block("C"):
+                vn = T.axis.remap("S", [n])
+                C[vn] = A[vn] + B[vn]
+
+    return operator
+
+
+def get_multiply_operator(operations):
+    @T.prim_func
+    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [operations], dtype="float64")
+        B = T.match_buffer(b, [operations], dtype="float64")
+        C = T.match_buffer(c, [operations], dtype="float64")
+        for n in T.grid(operations):
+            with T.block("C"):
+                vn = T.axis.remap("S", [n])
+                C[vn] = A[vn] * B[vn]
+
+    return operator
+
+
+def get_sub_operator(operations):
+    @T.prim_func
+    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, [operations], dtype="float64")
+        B = T.match_buffer(b, [operations], dtype="float64")
+        C = T.match_buffer(c, [operations], dtype="float64")
+        for n in T.grid(operations):
+            with T.block("C"):
+                vn = T.axis.remap("S", [n])
+                C[vn] = A[vn] - B[vn]
+
+    return operator
+
+
+def evaluate(hexagon_session, operations, expected, sch):
+    shape = operations
+    dtype = "float64"
+
+    target_hexagon = tvm.target.hexagon("v68")
+    func_tir = tvm.build(
+        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
+    )
+    module = hexagon_session.load_module(func_tir)
+
+    rng = default_rng()
+    a = rng.random(shape, dtype=dtype)
+    b = rng.random(shape, dtype=dtype)
+    c = np.zeros(shape, dtype=dtype)
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
+    b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device)
+    c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
+
+    # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise.
+    number = 1
+    repeat = 1
+
+    timer = module.time_evaluator(
+        "__tvm_main__", hexagon_session.device, number=number, repeat=repeat
+    )
+    runtime = timer(a_hexagon, b_hexagon, c_hexagon)
+
+    tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected(a, b))
+
+    return round(runtime.mean * 1000, 6)
+
+
+class TestMatMulVec:
+
+    (operation_name, operator_producer, expected_output_producer,) = tvm.testing.parameters(
+        ("add", get_add_operator, (lambda a, b: a + b)),
+        ("mul", get_multiply_operator, (lambda a, b: a * b)),
+        ("sub", get_sub_operator, (lambda a, b: a - b)),
+    )
+
+    # Removed most of these to speedup CI.
+    operations = tvm.testing.parameter(
+        128,
+        # 256,
+        # 512,
+        # 1024,  # Single thread runs faster since L2 cache can handle the entire request quickly
+        # 2048,
+        # 4096,  # Significant performance degredation once the inputs and outputs cannot all fit in L2
+        # 8192,
+        # 16384,
+    )
+
+    split_factor = tvm.testing.parameter(4)
+
+    @tvm.testing.requires_hexagon
+    def test_add(
+        self,
+        hexagon_session,
+        operation_name,
+        operator_producer,
+        expected_output_producer,
+        operations,
+        split_factor,
+    ):
+
+        sch = tvm.tir.Schedule(operator_producer(operations))
+        single_thread_runtime = evaluate(hexagon_session, operations, expected_output_producer, sch)
+
+        sch = tvm.tir.Schedule(operator_producer(operations))
+        block = sch.get_block("C")
+        b = sch.get_loops(block)
+        bo, _ = sch.split(b[0], factors=[split_factor, None])
+        sch.parallel(bo)
+        parallel_runtime = evaluate(hexagon_session, operations, expected_output_producer, sch)
+
+        speedup = round(single_thread_runtime / parallel_runtime, 2)
+        print(
+            TEST_OUTPUT_TEMPLATE.format(
+                operation_name, operations, single_thread_runtime, parallel_runtime, speedup
+            )
+        )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From f19046caba1a87fb21a832dff74cd80699703576 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Mon, 19 Sep 2022 10:04:16 -0700
Subject: [PATCH 207/704] [MetaSchedule] Support padding for irregular shapes
 for CUDA tensor core (#12759)

* [MetaSchedule] Support padding for irregular shapes for CUDA tensor core

* fix

* Fix test
---
 python/tvm/tir/schedule/analysis.py           |   7 +-
 python/tvm/tir/schedule/transform.py          |   8 +-
 .../multi_level_tiling_tensor_core.cc         |   3 +-
 src/tir/schedule/analysis.h                   |   8 +-
 src/tir/schedule/analysis/analysis.cc         |  53 +++++--
 src/tir/schedule/transform.cc                 |  10 +-
 src/tir/schedule/transform.h                  |   3 +-
 ...test_meta_schedule_schedule_rule_mlt_tc.py | 149 +++++++++++++++++-
 .../unittest/test_tir_schedule_analysis.py    |  29 +++-
 9 files changed, 242 insertions(+), 28 deletions(-)

diff --git a/python/tvm/tir/schedule/analysis.py b/python/tvm/tir/schedule/analysis.py
index cdb4aa9cfa20..90c585ac8ce1 100644
--- a/python/tvm/tir/schedule/analysis.py
+++ b/python/tvm/tir/schedule/analysis.py
@@ -68,7 +68,7 @@ class TensorizeInfo(Object):
 
 
 def get_tensorize_loop_mapping(
-    sch: Schedule, block: BlockRV, desc_func: PrimFunc
+    sch: Schedule, block: BlockRV, desc_func: PrimFunc, allow_padding: bool = False
 ) -> Optional[TensorizeInfo]:
     """Establish a mapping between loops in a target block and an intrinsic description
 
@@ -80,13 +80,14 @@ def get_tensorize_loop_mapping(
         The target block to match against
     desc_func : PrimFunc
         The prim func describing the computation to be tensorized
-
+    allow_padding : bool
+        Whether to allow padding the block iters to match the intrinsic description
     Returns
     -------
     tensorize_info : Optional[TensorizeInfo]
         TensorizeInfo structure if a valid mapping is found, None otherwise
     """
-    return _ffi_api.GetTensorizeLoopMapping(sch, block, desc_func)  # type: ignore
+    return _ffi_api.GetTensorizeLoopMapping(sch, block, desc_func, allow_padding)  # type: ignore
 
 
 @tvm._ffi.register_object("tir.schedule.AutoTensorizeMappingInfo")
diff --git a/python/tvm/tir/schedule/transform.py b/python/tvm/tir/schedule/transform.py
index 5dbc06846d52..e40b55d4d6b2 100644
--- a/python/tvm/tir/schedule/transform.py
+++ b/python/tvm/tir/schedule/transform.py
@@ -21,7 +21,9 @@
 from . import _ffi_api
 
 
-def tile_with_tensor_intrin(sch: Schedule, block: BlockRV, intrin_name: str) -> Optional[LoopRV]:
+def tile_with_tensor_intrin(
+    sch: Schedule, block: BlockRV, intrin_name: str, allow_padding: bool = False
+) -> Optional[LoopRV]:
     """Tile a subset of loops in the block according to the given tensor intrinsic.
 
     Parameters
@@ -32,6 +34,8 @@ def tile_with_tensor_intrin(sch: Schedule, block: BlockRV, intrin_name: str) ->
         The block whose subset of loops will be tiled
     intrin_name : str
         The name of a tensor intrinsic, must be registerd via TensorIntrin.register(...) beforehand
+    allow_padding : bool
+        Whether to allow padding when tiling
 
     Returns
     -------
@@ -39,4 +43,4 @@ def tile_with_tensor_intrin(sch: Schedule, block: BlockRV, intrin_name: str) ->
         LoopRV corresponding to the outermost loop of a block tiled according to the given intrin
         NullOpt if no valid loop mapping is found
     """
-    return _ffi_api.TileWithTensorIntrin(sch, block, intrin_name)  # type: ignore
+    return _ffi_api.TileWithTensorIntrin(sch, block, intrin_name, allow_padding)  # type: ignore
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 6759b59a3245..290a85b2579b 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -515,7 +515,8 @@ Optional<LoopRV> MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin(
   state->sch->TransformBlockLayout(state->tensor_core_reindex_B, index_map);
   state->sch->TransformBlockLayout(state->block_rv, index_map);
 
-  return tir::TileWithTensorIntrin(state->sch, state->block_rv, intrin_name);
+  return tir::TileWithTensorIntrin(state->sch, state->block_rv, intrin_name,
+                                   /*allow_padding=*/true);
 }
 
 inline std::vector<State> MultiLevelTilingTensorCoreNode::TransformForTensorization(
diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h
index ca45bcac6b34..57165fd08ad4 100644
--- a/src/tir/schedule/analysis.h
+++ b/src/tir/schedule/analysis.h
@@ -731,10 +731,15 @@ class TensorizeInfoNode : public Object {
   Map<tir::StmtSRef, tir::For> loop_map;
   /*! \brief Maps loops in an intrinsic description to its index, outer to inner */
   Map<tir::For, Integer> desc_loop_indexer;
+  /*! \brief Optional padded extents of the block iters when padding is needed to match the
+   * intrinsic description
+   */
+  Optional<Array<Integer>> block_iter_paddings;
 
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("loop_map", &loop_map);
     v->Visit("desc_loop_indexer", &desc_loop_indexer);
+    v->Visit("block_iter_paddings", &block_iter_paddings);
   }
 
   static constexpr const char* _type_key = "tir.schedule.TensorizeInfo";
@@ -751,11 +756,12 @@ class TensorizeInfo : public ObjectRef {
  * \param self The schedule state to be tensorized
  * \param block_sref The target block to match against
  * \param desc_func The prim func describing the computation to be tensorized
+ * \param allow_padding Whether to allow padding the block iters to match the intrinsic description
  * \return TensorizeInfo structure if a valid mapping is found, NullOpt otherwise
  */
 Optional<TensorizeInfo> GetTensorizeLoopMapping(const tir::ScheduleState& self,
                                                 const tir::StmtSRef& block_sref,
-                                                const tir::PrimFunc& desc_func);
+                                                const tir::PrimFunc& desc_func, bool allow_padding);
 
 /*！\brief Necessary information used to perform transformations for tensorization */
 class AutoTensorizeMappingInfoNode : public Object {
diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index e39f7b25543c..294826a1f6b9 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -1697,7 +1697,8 @@ TensorIntrinDescInfo ExtractTensorIntrinDescInfo(arith::Analyzer* analyzer,
 
 Optional<TensorizeInfo> GetTensorizeLoopMapping(const tir::ScheduleState& self,
                                                 const tir::StmtSRef& block_sref,
-                                                const tir::PrimFunc& desc_func) {
+                                                const tir::PrimFunc& desc_func,
+                                                bool allow_padding) {
   arith::Analyzer analyzer;
   const tir::BlockRealize& block = tir::GetBlockRealize(self, block_sref);
   // Step 1. Analyze desc_func, extract its block, loops and loop vars
@@ -1730,6 +1731,8 @@ Optional<TensorizeInfo> GetTensorizeLoopMapping(const tir::ScheduleState& self,
   const int n_desc_vars = desc_block->iter_values.size();
   const int offset = n_block_vars - n_desc_vars;
 
+  std::unordered_map<int, int> block_index_to_padding;  // padding of each block iter if necessary
+
   if (offset < 0) {
     return NullOpt;
   }
@@ -1780,10 +1783,11 @@ Optional<TensorizeInfo> GetTensorizeLoopMapping(const tir::ScheduleState& self,
 
     // Step 3.2. Find the corresponding iter_value of the target block with a matching iterator type
     PrimExpr block_bind;
-    for (int i = next_block_ind; i >= 0; --i) {
-      if (iter_types_block[i] == iter_type_desc) {
-        next_block_ind = i - 1;
-        block_bind = block->iter_values[i];
+    int current_block_ind = next_block_ind;
+    for (; current_block_ind >= 0; --current_block_ind) {
+      if (iter_types_block[current_block_ind] == iter_type_desc) {
+        next_block_ind = current_block_ind - 1;
+        block_bind = block->iter_values[current_block_ind];
         break;
       }
     }
@@ -1800,15 +1804,30 @@ Optional<TensorizeInfo> GetTensorizeLoopMapping(const tir::ScheduleState& self,
 
       PrimExpr residual = analyzer.Simplify(block_bind - block_loops[i]->loop_var);
       if (UsesVar(residual,
-                  [&block_loop_vars](const VarNode* var) { return block_loop_vars.count(var); }))
+                  [&block_loop_vars](const VarNode* var) { return block_loop_vars.count(var); })) {
         continue;
+      }
+      // padding is allowed only when the block has trivial bindings
+      if (allow_padding && !is_zero(residual)) {
+        allow_padding = false;
+      }
 
       const IntImmNode* int_block_extent = block_loops[i]->extent.as<IntImmNode>();
 
       // Check divisibility
-      if (!int_block_extent || int_block_extent->value % int_desc_extent->value != 0) {
+      if (!int_block_extent) {
         return NullOpt;
       }
+      int64_t remainder = int_block_extent->value % int_desc_extent->value;
+      if (remainder != 0) {
+        if (allow_padding) {
+          // If the block loop is not divisible by the desc loop, we pad the block loop to make it
+          // divisible if padding is allowed.
+          block_index_to_padding[current_block_ind] = int_desc_extent->value - remainder;
+        } else {
+          return NullOpt;
+        }
+      }
 
       ret->loop_map.Set(block_loop_sref, GetRef<tir::For>(desc_loop));
       break;
@@ -1818,13 +1837,29 @@ Optional<TensorizeInfo> GetTensorizeLoopMapping(const tir::ScheduleState& self,
   for (int i = 0, n = desc_loops.size(); i < n; ++i) {
     ret->desc_loop_indexer.Set(GetRef<tir::For>(desc_loops[i]), Integer(i));
   }
+  if (!block_index_to_padding.empty()) {
+    if (!allow_padding) {
+      return NullOpt;
+    }
+    Array<Integer> paddings;
+    for (int i = 0, n = block->block->iter_vars.size(); i < n; ++i) {
+      const IterVar& iter_var = block->block->iter_vars[i];
+      if (auto it = block_index_to_padding.find(i); it != block_index_to_padding.end()) {
+        paddings.push_back(IntImm(iter_var->var.dtype(), it->second));
+      } else {
+        paddings.push_back(IntImm(iter_var->var.dtype(), 0));
+      }
+    }
+    ret->block_iter_paddings = std::move(paddings);
+  }
+
   return TensorizeInfo(ret);
 }
 
 TVM_REGISTER_GLOBAL("tir.schedule.IsSpatialPrimFunc").set_body_typed(IsSpatialPrimFunc);
 TVM_REGISTER_GLOBAL("tir.schedule.GetTensorizeLoopMapping")
-    .set_body_typed([](Schedule sch, BlockRV block, PrimFunc desc_func) {
-      return GetTensorizeLoopMapping(sch->state(), sch->GetSRef(block), desc_func);
+    .set_body_typed([](Schedule sch, BlockRV block, PrimFunc desc_func, bool allow_padding) {
+      return GetTensorizeLoopMapping(sch->state(), sch->GetSRef(block), desc_func, allow_padding);
     });
 
 /******** Auto Tensorization ********/
diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc
index dfbd3dbcbcc4..b00005c58061 100644
--- a/src/tir/schedule/transform.cc
+++ b/src/tir/schedule/transform.cc
@@ -288,11 +288,15 @@ void LeafBlockRemovalPlan(const ScheduleState& self, const StmtSRef& leaf_block_
 }
 
 Optional<LoopRV> TileWithTensorIntrin(const tir::Schedule& sch, const tir::BlockRV& block_rv,
-                                      const String& intrin_name) {
-  Optional<tir::TensorizeInfo> opt_tensorize_info = GetTensorizeLoopMapping(
-      sch->state(), sch->GetSRef(block_rv), tir::TensorIntrin::Get(intrin_name)->desc);
+                                      const String& intrin_name, bool allow_padding) {
+  Optional<tir::TensorizeInfo> opt_tensorize_info =
+      GetTensorizeLoopMapping(sch->state(), sch->GetSRef(block_rv),
+                              tir::TensorIntrin::Get(intrin_name)->desc, allow_padding);
   if (!opt_tensorize_info) return NullOpt;
   const tir::TensorizeInfoNode* info = opt_tensorize_info.value().get();
+  if (info->block_iter_paddings.defined()) {
+    sch->PadEinsum(block_rv, info->block_iter_paddings.value());
+  }
   // Construct a mapping from tir loops back to LoopRVs
   Map<tir::StmtSRef, LoopRV> loop2rv;
   {
diff --git a/src/tir/schedule/transform.h b/src/tir/schedule/transform.h
index 4de3685e2482..2bba13e2bd1c 100644
--- a/src/tir/schedule/transform.h
+++ b/src/tir/schedule/transform.h
@@ -193,11 +193,12 @@ void LeafBlockRemovalPlan(const ScheduleState& self, const StmtSRef& leaf_block_
  * \param block_rv The block whose subset of loops will be tiled
  * \param intrin_name The name of a tensor intrinsic, must be registerd via
  * TensorIntrin.register(...) beforehand
+ * \param allow_padding Whether to allow padding when tiling
  * \return LoopRV corresponding to the outermost loop of a
  * block tiled according to the given intrin, NullOpt if a valid loop mapping is not found
  */
 Optional<tir::LoopRV> TileWithTensorIntrin(const tir::Schedule& sch, const tir::BlockRV& block_rv,
-                                           const String& intrin_name);
+                                           const String& intrin_name, bool allow_padding = false);
 
 /******** Block mutation ********/
 
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
index fbb74090b1e5..f7a5ce997edf 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
@@ -16,6 +16,7 @@
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 import tvm
+import tvm.testing
 from tvm import meta_schedule as ms
 from tvm import te
 from tvm.meta_schedule.testing import te_workload
@@ -947,11 +948,145 @@ def test_matmul_relu_non_tensorizable():
     tvm.ir.assert_structural_equal(mod, sch.mod["main"])
 
 
+def test_padded_matmul_relu():
+    # fmt: off
+    @T.prim_func
+    def padded_matmul_relu_0(A: T.Buffer[(127, 127), "float16"], B: T.Buffer[(127, 127), "float16"], compute: T.Buffer[(127, 127), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        C_reindex_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+        C_reindex_shared_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator")
+        A_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared")
+        B_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared")
+        A_reindex_shared_wmma_matrix_a = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_a")
+        B_reindex_shared_wmma_matrix_b = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_b")
+        for ax0_0_0_ax1_0_0_fused in T.thread_binding(8, thread="blockIdx.y"):
+            for ax0_0_1_ax1_0_1_fused in T.thread_binding(2, thread="blockIdx.x"):
+                for ax0_0_2_ax1_0_2_fused in T.thread_binding(2, thread="threadIdx.y"):
+                    for ax2_0_0 in T.serial(1):
+                        for ax0_ax1_fused in T.serial(4096):
+                            with T.block("A_reindex_shared"):
+                                v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0_ax1_fused // 128)
+                                v1 = T.axis.spatial(128, ax0_ax1_fused % 128)
+                                T.reads(A[v0, v1])
+                                T.writes(A_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8})
+                                A_reindex_shared[v0, v1] = T.if_then_else(v0 < 127 and v1 < 127, A[v0, v1], T.float16(0), dtype="float16")
+                        for ax0_ax1_fused in T.serial(4096):
+                            with T.block("B_reindex_shared"):
+                                v0 = T.axis.spatial(128, ax0_ax1_fused // 32)
+                                v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0_ax1_fused % 32)
+                                T.reads(B[v0, v1])
+                                T.writes(B_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":1})
+                                B_reindex_shared[v0, v1] = T.if_then_else(v0 < 127 and v1 < 127, B[v0, v1], T.float16(0), dtype="float16")
+                        for ax2_0_1 in T.serial(4):
+                            for ax0_0, ax1_0 in T.grid(2, 2):
+                                with T.block("A_reindex_shared_wmma.matrix_a_o"):
+                                    v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0)
+                                    v1_o = T.axis.spatial(8, ax2_0_1 * 2 + ax1_0)
+                                    T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("A_reindex_shared_wmma.matrix_a"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0, ax1_0 in T.grid(2, 1):
+                                with T.block("B_reindex_shared_wmma.matrix_b_o"):
+                                    v0_o = T.axis.spatial(8, ax2_0_1 * 2 + ax0_0)
+                                    v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused)
+                                    T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"})
+                                    for ax0_1, ax1_1 in T.grid(16, 16):
+                                        with T.block("B_reindex_shared_wmma.matrix_b"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                            T.reads(B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 1, 2, 2, 1):
+                                with T.block("C_o"):
+                                    v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0_3 * 2 + ax0_0_4)
+                                    v1_o = T.axis.spatial(8, ax1_0_4 + ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused + ax1_0_3)
+                                    v2_o = T.axis.reduce(8, ax2_0_0 * 8 + ax2_0_1 * 2 + ax2_0_2)
+                                    T.reads(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
+                                    with T.init():
+                                        for ax0_1, ax1_1 in T.grid(16, 16):
+                                            with T.block("C_init"):
+                                                v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1])
+                                                T.reads()
+                                                T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init])
+                                                C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0)
+                                    for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16):
+                                        with T.block("C"):
+                                            v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1])
+                                            T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i])
+                                            T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                            C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32")
+                    for ax0_0, ax1_0 in T.grid(2, 1):
+                        with T.block("C_reindex_shared_wmma.accumulator_o"):
+                            v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0)
+                            v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused)
+                            T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.writes(C_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"})
+                            for ax0_1, ax1_1 in T.grid(16, 16):
+                                with T.block("C_reindex_shared_wmma.accumulator"):
+                                    v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                    T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                for ax0, ax1 in T.grid(32, 32):
+                    with T.block("C_reindex_shared"):
+                        T.where(ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0 < 127 and ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax1 < 127)
+                        v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0)
+                        v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax1)
+                        T.reads(C_reindex_shared[v0, v1])
+                        T.writes(compute[v0, v1])
+                        T.block_attr({"meta_schedule.cooperative_fetch":4})
+                        compute[v0, v1] = T.max(C_reindex_shared[v0, v1], T.float32(0))
+    # fmt: on
+
+    decision_0 = [
+        ("SamplePerfectTile", [4, 1, 1, 1, 2]),
+        ("SamplePerfectTile", [2, 2, 2, 1, 1]),
+        ("SamplePerfectTile", [1, 4, 2]),
+        ("SampleCategorical", 3),
+        ("SampleCategorical", 3),
+        ("SampleCategorical", 0),
+    ]
+
+    mod = te.create_prim_func(
+        te_workload.matmul_relu(
+            n=127,
+            m=127,
+            k=127,
+            in_dtype="float16",
+            out_dtype="float32",
+        )
+    )
+    actual = ms.TuneContext(
+        mod=mod,
+        target=tvm.target.Target("cuda"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="shared")]
+        + get_rules("cuda", ms.schedule_rule.AutoInline),
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[padded_matmul_relu_0],
+        expected_decisions=[decision_0],
+    )
+
+
 if __name__ == "__main__":
-    test_matmul_relu()
-    test_matmul_relu_with_fallback()
-    test_conv2d()
-    test_conv2d_more_intrin()
-    test_matmul_relu_pipeline()
-    test_matmul_relu_global()
-    test_matmul_relu_non_tensorizable()
+    tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py
index 378e5183b49c..807420ece3ba 100644
--- a/tests/python/unittest/test_tir_schedule_analysis.py
+++ b/tests/python/unittest/test_tir_schedule_analysis.py
@@ -21,7 +21,10 @@
 import tvm.testing
 from tvm.tir.function import TensorIntrin
 from tvm.tir.tensor_intrin.x86 import dot_product_16x4_u8i8i32_desc
-from tvm.tir.tensor_intrin.cuda import WMMA_SYNC_16x16x16_f16f16f32_INTRIN
+from tvm.tir.tensor_intrin.cuda import (
+    WMMA_SYNC_16x16x16_f16f16f16_INTRIN,
+    WMMA_SYNC_16x16x16_f16f16f32_INTRIN,
+)
 
 
 from tvm.tir import Evaluate, For, ForKind, IndexMap, Var, decl_buffer, floordiv, floormod, Schedule
@@ -301,6 +304,30 @@ def matmul_16x16x16xf16f16f16_desc(
         assert s.get(desc_loop_to_sref[desc_loops[2]]) == s.get(i2)
 
 
+def test_get_tensorize_loop_mapping_padding_matmul():
+    matmul = create_prim_func(
+        te_workload.matmul_relu(
+            n=127,
+            m=256,
+            k=65,
+            in_dtype="float16",
+            out_dtype="float16",
+        )
+    )
+    s = Schedule(matmul)
+    block = s.get_block("C")
+
+    desc = TensorIntrin.get(WMMA_SYNC_16x16x16_f16f16f16_INTRIN).desc
+    info = get_tensorize_loop_mapping(s, block, desc, allow_padding=True)
+    assert info is not None
+    expected_padding = [1, 0, 15]
+    actual_padding = info.block_iter_paddings
+    assert actual_padding is not None
+    assert len(actual_padding) == len(expected_padding)
+    for actual, expected in zip(actual_padding, expected_padding):
+        assert actual == expected
+
+
 def check_index_map(workload, block_name, intrin_name, expected_index_map):
     s = Schedule(workload)
     block = s.get_block(block_name)

From 79c48f38878788b46d3acd1945469ae97e508d7d Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 19 Sep 2022 13:02:53 -0500
Subject: [PATCH 208/704] [TIR][Bugfix] Correct handling of buffer argument
 when scheduling (#12816)

Follow-up from https://github.com/apache/tvm/pull/11269, which allowed
schedule arguments of the buffer to be transformed to be specified as
a string, or as a `tir::Buffer`.  The string handling worked
correctly, but the `tir::Buffer` object was handled incorrectly.  This
commit corrects the handling of `tir::Buffer` arguments when
scheduling, and adds a unit test to validate this behavior.
---
 python/tvm/tir/schedule/schedule.py           |  6 +--
 .../test_tir_schedule_set_axis_separator.py   | 41 +++++++++++++------
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index b8f696b7a134..27171aca411b 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -2390,7 +2390,7 @@ def iter_buffers():
         if isinstance(buffer, str):
             possible_buffers = {}
             # String lookup requires ensuring that the name is unique
-            for buffer_index, buffer_index_type, buf in iter_buffers():
+            for buffer_index_type, buffer_index, buf in iter_buffers():
                 if buf.name == buffer:
                     possible_buffers[buf] = (buffer_index_type, buffer_index)
 
@@ -2398,12 +2398,12 @@ def iter_buffers():
             assert (
                 len(possible_buffers) == 1
             ), f"Multiple buffers named '{buffer}' in block '{block_name}'"
-            buffer_obj, (buffer_index, buffer_index_type) = next(iter(possible_buffers.items()))
+            buffer_obj, (buffer_index_type, buffer_index) = next(iter(possible_buffers.items()))
 
         elif isinstance(buffer, Buffer):
             # Buffer lookup has unique id, can break out early
             found = False
-            for buffer_index, buffer_index_type, buffer_obj in iter_buffers():
+            for buffer_index_type, buffer_index, buffer_obj in iter_buffers():
                 if buffer_obj.same_as(buffer):
                     found = True
                     break
diff --git a/tests/python/unittest/test_tir_schedule_set_axis_separator.py b/tests/python/unittest/test_tir_schedule_set_axis_separator.py
index b432fbb61066..327df33408f2 100644
--- a/tests/python/unittest/test_tir_schedule_set_axis_separator.py
+++ b/tests/python/unittest/test_tir_schedule_set_axis_separator.py
@@ -102,18 +102,25 @@ def element_wise_subregion_match_set_axis_separator(A: T.Buffer[(128, 128), "flo
 
 # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
 
-use_sugared_transform = tvm.testing.parameter(
-    by_dict={"set_axis_separators": False, "transform_layout_sugared": True}
-)
+argument_style = tvm.testing.parameter('set_axis_separators',
+                                      'transform_layout_named',
+                                      'transform_layout_buffer_object',
+                                      )
 
-def test_set_axis_separator(use_sugared_transform):
+
+def test_set_axis_separator(argument_style):
     func = element_wise
     s = tir.Schedule(func, debug_mask='all')
 
-    if use_sugared_transform:
+    if argument_style=='set_axis_separators':
         s.set_axis_separator(s.get_block("B"), ("write",0), [1])
-    else:
+    elif argument_style=='transform_layout_named':
         s.transform_layout(block='B', buffer='B', index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j])
+    elif argument_style =='transform_layout_buffer_object':
+        B = s.get(s.get_block('B')).writes[0].buffer
+        s.transform_layout(block='B', buffer=B, index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j])
+    else:
+        raise ValueError(f'Unexpected argument_style: {argument_style}')
 
     tvm.ir.assert_structural_equal(element_wise_set_axis_separator, s.mod["main"])
     verify_trace_roundtrip(sch=s, mod=func)
@@ -128,28 +135,38 @@ def test_set_scope_fail_on_index_out_of_bound():
         s.set_axis_separator(s.get_block("B"), ("read",-1),[1])
 
 
-def test_set_axis_separator_input_buffer(use_sugared_transform):
+def test_set_axis_separator_input_buffer(argument_style):
     func = element_wise
     s = tir.Schedule(func, debug_mask='all')
 
-    if use_sugared_transform:
+    if argument_style=='set_axis_separators':
+        s.set_axis_separator(s.get_block("B"), ("read",0), [1])
+    elif argument_style=='transform_layout_named':
         s.transform_layout(block='B', buffer='A', index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j])
+    elif argument_style =='transform_layout_buffer_object':
+        A = s.get(s.get_block('B')).reads[0].buffer
+        s.transform_layout(block='B', buffer=A, index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j])
     else:
-        s.set_axis_separator(s.get_block("B"), ("read",0), [1])
+        raise ValueError(f'Unexpected argument_style: {argument_style}')
 
 
     tvm.ir.assert_structural_equal(element_wise_set_axis_separator_input_buffer, s.mod["main"])
     verify_trace_roundtrip(sch=s, mod=func)
 
 
-def test_set_axis_separator_subregion(use_sugared_transform):
+def test_set_axis_separator_subregion(argument_style):
     func = element_wise_subregion_match
     s = tir.Schedule(func, debug_mask='all')
 
-    if use_sugared_transform:
+    if argument_style=='set_axis_separators':
+        s.set_axis_separator(s.get_block("B"), ("write",0), [1])
+    elif argument_style=='transform_layout_named':
         s.transform_layout(block='B', buffer='B', index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j])
+    elif argument_style =='transform_layout_buffer_object':
+        B = s.get(s.get_block('B')).writes[0].buffer
+        s.transform_layout(block='B', buffer=B, index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j])
     else:
-        s.set_axis_separator(s.get_block("B"), ("write",0), [1])
+        raise ValueError(f'Unexpected argument_style: {argument_style}')
 
     tvm.ir.assert_structural_equal(element_wise_subregion_match_set_axis_separator, s.mod["main"])
     verify_trace_roundtrip(sch=s, mod=func)

From f9b692765adf19a2bd3e5cf7abab8c1c74714f81 Mon Sep 17 00:00:00 2001
From: yanghaku <36074633+yanghaku@users.noreply.github.com>
Date: Tue, 20 Sep 2022 06:32:58 +0800
Subject: [PATCH 209/704] [BugFix][LLVM] Fix the bug that the generated
 systemlib cannot register ```__tvm_module_ctx``` symbol sometimes (#12817)

[BugFix][LLVM] Fix the bug that the generated systemlib cannot register '__tvm_module_ctx' symbol sometimes.
---
 src/target/llvm/codegen_cpu.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc
index 57ee62e152db..eb5c92e663fa 100644
--- a/src/target/llvm/codegen_cpu.cc
+++ b/src/target/llvm/codegen_cpu.cc
@@ -178,9 +178,9 @@ void CodeGenCPU::Init(const std::string& module_name, LLVMTarget* llvm_target, b
         llvm::Function::Create(ftype_tvm_parallel_barrier_, llvm::Function::ExternalLinkage,
                                "TVMBackendParallelBarrier", module_.get());
   }
-  InitGlobalContext(dynamic_lookup);
   target_c_runtime_ = target_c_runtime;
   is_system_lib_ = system_lib;
+  InitGlobalContext(dynamic_lookup);
 }
 
 void CodeGenCPU::AddFunction(const PrimFunc& f) {

From a75dcabd3f5306ed1c792c0877becab219004ed8 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Mon, 19 Sep 2022 17:20:53 -0700
Subject: [PATCH 210/704] [MetaSchedule] PyDatabase Complete Function Reload
 Support (#12838)

* Save for PR.

* Fix database default query function call.

* Add test.

* Fix lint.

* Remove unused import.

* Differentiate override class.

* Reuse outer class functions.

* Fix lint.
---
 include/tvm/meta_schedule/database.h          |  70 ++++++
 python/tvm/meta_schedule/database/database.py |  81 +++++++
 src/meta_schedule/database/database.cc        |   6 +
 .../unittest/test_meta_schedule_database.py   | 211 +++++++++++++++++-
 4 files changed, 365 insertions(+), 3 deletions(-)

diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h
index fa488a38ce0a..4092fdae36dd 100644
--- a/include/tvm/meta_schedule/database.h
+++ b/include/tvm/meta_schedule/database.h
@@ -28,6 +28,7 @@
 #include <tvm/runtime/object.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/target/target.h>
+#include <tvm/tir/schedule/schedule.h>
 #include <tvm/tir/schedule/trace.h>
 
 namespace tvm {
@@ -267,6 +268,33 @@ class PyDatabaseNode : public DatabaseNode {
    * \return An Array of all the tuning records in the database.
    */
   using FGetAllTuningRecords = runtime::TypedPackedFunc<Array<TuningRecord>()>;
+  /*!
+   * \brief The function type of `QueryTuningRecord` method.
+   * \param mod The IRModule to be searched for.
+   * \param target The target to be searched for.
+   * \param workload_name The name of the workload to be searched for.
+   * \return The best record of the given workload; NullOpt if not found.
+   */
+  using FQueryTuningRecord = runtime::TypedPackedFunc<Optional<TuningRecord>(
+      const IRModule&, const Target&, const String&)>;
+  /*!
+   * \brief The function type of `QuerySchedule` method.
+   * \param mod The IRModule to be searched for.
+   * \param target The target to be searched for.
+   * \param workload_name The name of the workload to be searched for.
+   * \return The schedule in the best schedule of the given workload; NullOpt if not found.
+   */
+  using FQuerySchedule = runtime::TypedPackedFunc<Optional<tir::Schedule>(
+      const IRModule&, const Target&, const String&)>;
+  /*!
+   * \brief The function type of `QueryIRModule` method.
+   * \param mod The IRModule to be searched for.
+   * \param target The target to be searched for.
+   * \param workload_name The name of the workload to be searched for.
+   * \return The IRModule in the best IRModule of the given workload; NullOpt if not found.
+   */
+  using FQueryIRModule =
+      runtime::TypedPackedFunc<Optional<IRModule>(const IRModule&, const Target&, const String&)>;
   /*!
    * \brief The function type of `Size` method.
    * \return The size of the database.
@@ -283,6 +311,12 @@ class PyDatabaseNode : public DatabaseNode {
   FGetTopK f_get_top_k;
   /*! \brief The packed function to the `GetAllTuningRecords` function. */
   FGetAllTuningRecords f_get_all_tuning_records;
+  /*! \brief The packed function to the `QueryTuningRecord` function. */
+  FQueryTuningRecord f_query_tuning_record;
+  /*! \brief The packed function to the `QuerySchedule` function. */
+  FQuerySchedule f_query_schedule;
+  /*! \brief The packed function to the `QueryIRModule` function. */
+  FQueryIRModule f_query_ir_module;
   /*! \brief The packed function to the `Size` function. */
   FSize f_size;
 
@@ -295,6 +329,9 @@ class PyDatabaseNode : public DatabaseNode {
     // `f_commit_tuning_record` is not visited
     // `f_get_top_k` is not visited
     // `f_get_all_tuning_records` is not visited
+    // `f_query_tuning_record` is not visited
+    // `f_query_schedule` is not visited
+    // `f_query_ir_module` is not visited
     // `f_size` is not visited
   }
 
@@ -325,6 +362,33 @@ class PyDatabaseNode : public DatabaseNode {
     return f_get_all_tuning_records();
   }
 
+  Optional<TuningRecord> QueryTuningRecord(const IRModule& mod, const Target& target,
+                                           const String& workload_name) final {
+    if (f_query_tuning_record == nullptr) {
+      return DatabaseNode::QueryTuningRecord(mod, target, workload_name);
+    } else {
+      return f_query_tuning_record(mod, target, workload_name);
+    }
+  }
+
+  Optional<tir::Schedule> QuerySchedule(const IRModule& mod, const Target& target,
+                                        const String& workload_name) final {
+    if (f_query_schedule == nullptr) {
+      return DatabaseNode::QuerySchedule(mod, target, workload_name);
+    } else {
+      return f_query_schedule(mod, target, workload_name);
+    }
+  }
+
+  Optional<IRModule> QueryIRModule(const IRModule& mod, const Target& target,
+                                   const String& workload_name) final {
+    if (f_query_ir_module == nullptr) {
+      return DatabaseNode::QueryIRModule(mod, target, workload_name);
+    } else {
+      return f_query_ir_module(mod, target, workload_name);
+    }
+  }
+
   int64_t Size() final {
     ICHECK(f_size != nullptr) << "PyDatabase's Size method not implemented!";
     return f_size();
@@ -380,6 +444,9 @@ class Database : public runtime::ObjectRef {
    * \param f_commit_tuning_record The packed function of `CommitTuningRecord`.
    * \param f_get_top_k The packed function of `GetTopK`.
    * \param f_get_all_tuning_records The packed function of `GetAllTuningRecords`.
+   * \param f_query_tuning_record The packed function of `QueryTuningRecord`.
+   * \param f_query_schedule The packed function of `QuerySchedule`.
+   * \param f_query_ir_module The packed function of `QueryIRModule`.
    * \param f_size The packed function of `Size`.
    * \return The created database.
    */
@@ -388,6 +455,9 @@ class Database : public runtime::ObjectRef {
                                      PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record,
                                      PyDatabaseNode::FGetTopK f_get_top_k,
                                      PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records,
+                                     PyDatabaseNode::FQueryTuningRecord f_query_tuning_record,
+                                     PyDatabaseNode::FQuerySchedule f_query_schedule,
+                                     PyDatabaseNode::FQueryIRModule f_query_ir_module,
                                      PyDatabaseNode::FSize f_size);
   /*! \return The current Database in the scope. */
   static Optional<Database> Current();
diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py
index 7a1338f46b20..75b78b118eea 100644
--- a/python/tvm/meta_schedule/database/database.py
+++ b/python/tvm/meta_schedule/database/database.py
@@ -378,6 +378,9 @@ def __init__(
         f_commit_tuning_record: Callable = None,
         f_get_top_k: Callable = None,
         f_get_all_tuning_records: Callable = None,
+        f_query_tuning_record: Callable = None,
+        f_query_schedule: Callable = None,
+        f_query_ir_module: Callable = None,
         f_size: Callable = None,
     ):
         """Constructor."""
@@ -389,6 +392,9 @@ def __init__(
             f_commit_tuning_record,
             f_get_top_k,
             f_get_all_tuning_records,
+            f_query_tuning_record,
+            f_query_schedule,
+            f_query_ir_module,
             f_size,
         )
 
@@ -409,6 +415,9 @@ class PyDatabase:
             "commit_tuning_record",
             "get_top_k",
             "get_all_tuning_records",
+            "query_tuning_record",
+            "query_schedule",
+            "query_ir_module",
             "__len__",
         ],
     }
@@ -478,6 +487,78 @@ def get_all_tuning_records(self) -> List[TuningRecord]:
         """
         raise NotImplementedError
 
+    def query_tuning_record(
+        self, mod: IRModule, target: Target, workload_name: Optional[str] = None
+    ) -> Optional[TuningRecord]:
+        """Query a tuning record from the database.
+
+        Parameters
+        ----------
+        mod : IRModule
+            The IRModule to be searched for.
+        target : Target
+            The target to be searched for.
+        workload_name : Optional[str]
+            The workload name to be searched for.
+
+        Returns
+        -------
+        record : Optional[TuningRecord]
+            The tuning record corresponding to the given workload.
+        """
+        # Using self._outer to replace the self pointer
+        return _ffi_api.DatabaseQueryTuningRecord(  # type: ignore # pylint: disable=no-member
+            self._outer(), mod, target, workload_name  # type: ignore # pylint: disable=no-member
+        )
+
+    def query_schedule(
+        self, mod: IRModule, target: Target, workload_name: Optional[str] = None
+    ) -> Optional[Schedule]:
+        """Query a schedule from the database.
+
+        Parameters
+        ----------
+        mod : IRModule
+            The IRModule to be searched for.
+        target : Target
+            The target to be searched for.
+        workload_name : Optional[str]
+            The workload name to be searched for.
+
+        Returns
+        -------
+        schedule : Optional[Schedule]
+            The schedule corresponding to the given workload.
+        """
+        # Using self._outer to replace the self pointer
+        return _ffi_api.DatabaseQuerySchedule(  # type: ignore # pylint: disable=no-member
+            self._outer(), mod, target, workload_name  # type: ignore # pylint: disable=no-member
+        )
+
+    def query_ir_module(
+        self, mod: IRModule, target: Target, workload_name: Optional[str] = None
+    ) -> Optional[IRModule]:
+        """Query an IRModule from the database.
+
+        Parameters
+        ----------
+        mod : IRModule
+            The IRModule to be searched for.
+        target : Target
+            The target to be searched for.
+        workload_name : Optional[str]
+            The workload name to be searched for.
+
+        Returns
+        -------
+        mod : Optional[IRModule]
+            The IRModule corresponding to the given workload.
+        """
+        # Using self._outer to replace the self pointer
+        return _ffi_api.DatabaseQueryIRModule(  # type: ignore # pylint: disable=no-member
+            self._outer(), mod, target, workload_name  # type: ignore # pylint: disable=no-member
+        )
+
     def __len__(self) -> int:
         """Get the number of records in the database.
 
diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc
index d082ff7a3901..0976e158aaf0 100644
--- a/src/meta_schedule/database/database.cc
+++ b/src/meta_schedule/database/database.cc
@@ -217,6 +217,9 @@ Database Database::PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload,
                               PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record,
                               PyDatabaseNode::FGetTopK f_get_top_k,
                               PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records,
+                              PyDatabaseNode::FQueryTuningRecord f_query_tuning_record,
+                              PyDatabaseNode::FQuerySchedule f_query_schedule,
+                              PyDatabaseNode::FQueryIRModule f_query_ir_module,
                               PyDatabaseNode::FSize f_size) {
   ObjectPtr<PyDatabaseNode> n = make_object<PyDatabaseNode>();
   n->f_has_workload = f_has_workload;
@@ -224,6 +227,9 @@ Database Database::PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload,
   n->f_commit_tuning_record = f_commit_tuning_record;
   n->f_get_top_k = f_get_top_k;
   n->f_get_all_tuning_records = f_get_all_tuning_records;
+  n->f_query_tuning_record = f_query_tuning_record;
+  n->f_query_schedule = f_query_schedule;
+  n->f_query_ir_module = f_query_ir_module;
   n->f_size = f_size;
   return Database(n);
 }
diff --git a/tests/python/unittest/test_meta_schedule_database.py b/tests/python/unittest/test_meta_schedule_database.py
index e6342f1c3536..777c5589a141 100644
--- a/tests/python/unittest/test_meta_schedule_database.py
+++ b/tests/python/unittest/test_meta_schedule_database.py
@@ -18,11 +18,13 @@
 """Test Meta Schedule Database"""
 import os.path as osp
 import tempfile
-from typing import Callable
+from typing import Callable, Optional, List
 
 import tvm
 import tvm.testing
+from tvm.target import Target
 from tvm import meta_schedule as ms
+from tvm.meta_schedule.database import TuningRecord, Workload
 from tvm import tir
 from tvm.ir.module import IRModule
 from tvm.script import tir as T
@@ -106,6 +108,123 @@ def _equal_record(a: ms.database.TuningRecord, b: ms.database.TuningRecord):
         assert str(arg0.as_json()) == str(arg1.as_json())
 
 
+@ms.utils.derived_object
+class PyMemoryDatabaseDefault(ms.database.PyDatabase):
+    def __init__(self):
+        super().__init__()
+        self.tuning_records_: List[TuningRecord] = []
+        self.workloads_: List[Workload] = []
+
+    def has_workload(self, mod: IRModule) -> bool:
+        for workload in self.workloads_:
+            if tvm.ir.structural_equal(mod, workload.mod):
+                return True
+
+    def commit_workload(self, mod: IRModule) -> ms.database.Workload:
+        if self.has_workload(mod):
+            for workload in self.workloads_:
+                if tvm.ir.structural_equal(mod, workload.mod):
+                    return workload
+        else:
+            workload = ms.database.Workload(mod)
+            self.workloads_.append(workload)
+            return workload
+
+    def commit_tuning_record(self, record: TuningRecord) -> None:
+        self.tuning_records_.append(record)
+
+    def get_all_tuning_records(self) -> List[TuningRecord]:
+        return self.tuning_records_
+
+    def get_top_k(self, workload: ms.database.Workload, top_k: int) -> List[TuningRecord]:
+        return sorted(
+            list(
+                filter(
+                    lambda x: tvm.ir.structural_equal(workload.mod, x.workload.mod),
+                    self.tuning_records_,
+                )
+            ),
+            key=lambda x: sum(x.run_secs) / len(x.run_secs) if x.run_secs else 1e9,
+        )[:top_k]
+
+    def __len__(self) -> int:
+        return len(self.tuning_records_)
+
+
+@ms.utils.derived_object
+class PyMemoryDatabaseOverride(ms.database.PyDatabase):
+    def __init__(self):
+        super().__init__()
+        self.tuning_records_: List[TuningRecord] = []
+        self.workloads_: List[Workload] = []
+
+    def has_workload(self, mod: IRModule) -> bool:
+        for workload in self.workloads_:
+            if tvm.ir.structural_equal(mod, workload.mod):
+                return True
+
+    def commit_workload(self, mod: IRModule) -> ms.database.Workload:
+        if self.has_workload(mod):
+            for workload in self.workloads_:
+                if tvm.ir.structural_equal(mod, workload.mod):
+                    return workload
+        else:
+            workload = ms.database.Workload(mod)
+            self.workloads_.append(workload)
+            return workload
+
+    def commit_tuning_record(self, record: TuningRecord) -> None:
+        self.tuning_records_.append(record)
+
+    def get_all_tuning_records(self) -> List[TuningRecord]:
+        return self.tuning_records_
+
+    def get_top_k(self, workload: ms.database.Workload, top_k: int) -> List[TuningRecord]:
+        return sorted(
+            list(
+                filter(
+                    lambda x: tvm.ir.structural_equal(workload.mod, x.workload.mod),
+                    self.tuning_records_,
+                )
+            ),
+            key=lambda x: sum(x.run_secs) / len(x.run_secs) if x.run_secs else 1e9,
+        )[:top_k]
+
+    def __len__(self) -> int:
+        return len(self.tuning_records_)
+
+    def query_tuning_record(
+        self, mod: IRModule, target: Target, workload_name: Optional[str] = None
+    ) -> Optional[TuningRecord]:
+        if self.has_workload(mod):
+            records = self.get_top_k(self.commit_workload(mod), 2)
+            if len(records) == 1:
+                return records[0]
+            elif len(records) == 2:
+                return records[1]  # return the 2nd best if there are two records
+        return None
+
+    def query_schedule(
+        self, mod: IRModule, target: Target, workload_name: Optional[str] = None
+    ) -> Optional[Schedule]:
+        record = self.query_tuning_record(mod, target, workload_name)
+        if record is not None:
+            sch = Schedule(record.workload.mod)
+            record.trace.apply_to_schedule(sch, remove_postproc=False)
+            return sch
+        return None
+
+    def query_ir_module(
+        self, mod: IRModule, target: Target, workload_name: Optional[str] = None
+    ) -> Optional[IRModule]:
+        record = self.query_tuning_record(mod, target, workload_name)
+        if record is not None:
+            sch = Schedule(record.workload.mod)
+            record.trace.apply_to_schedule(sch, remove_postproc=False)
+            return sch.mod
+        return None
+
+
 def test_meta_schedule_tuning_record_round_trip():
     mod: IRModule = Matmul
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -302,10 +421,10 @@ def test_meta_schedule_database_union():
     db_2 = ms.database.MemoryDatabase()
     trace = _create_schedule(mod, _schedule_matmul).trace
 
-    def query(db):
+    def query(db):  # pylint: disable=invalid-name
         return db.query_tuning_record(mod=mod, target=target, workload_name="main").run_secs
 
-    def commit_record(db, run_sec):
+    def commit_record(db, run_sec):  # pylint: disable=invalid-name
         db.commit_tuning_record(
             ms.database.TuningRecord(
                 trace,
@@ -331,5 +450,91 @@ def commit_record(db, run_sec):
     assert run_secs.value == 1.0
 
 
+def test_meta_schedule_pydatabase_default_query():
+
+    mod: IRModule = Matmul
+    target = tvm.target.Target("llvm")
+    arg_info = ms.arg_info.ArgInfo.from_prim_func(func=mod["main"])
+    db = PyMemoryDatabaseDefault()  # pylint: disable=invalid-name
+    sch = _create_schedule(mod, _schedule_matmul)
+    trace = sch.trace
+
+    def query(db, mod, target, kind):  # pylint: disable=invalid-name
+        return db.query(mod=mod, target=target, workload_name="main", kind=kind)
+
+    def commit_record(trace, db, run_sec):  # pylint: disable=invalid-name
+        db.commit_tuning_record(
+            ms.database.TuningRecord(
+                trace,
+                workload=db.commit_workload(mod),
+                run_secs=[run_sec],
+                target=target,
+                args_info=arg_info,
+            )
+        )
+
+    commit_record(trace, db, 1.0)
+    record = query(db, mod, target, "record")
+    assert record is not None and record.run_secs[0].value == 1.0
+    sch_res = query(db, mod, target, "schedule")
+    assert sch_res is not None and tvm.ir.structural_equal(sch_res.mod, sch.mod)
+    mod_res = query(db, mod, target, "ir_module")
+    assert mod_res is not None and tvm.ir.structural_equal(mod_res, sch.mod)
+
+    commit_record(Schedule(mod).trace, db, 0.2)  # Empty Trace
+    record = query(db, mod, target, "record")
+    assert record is not None and record.run_secs[0].value == 0.2
+    sch_res = query(db, mod, target, "schedule")
+    assert sch_res is not None and tvm.ir.structural_equal(sch_res.mod, mod)
+    mod_res = query(db, mod, target, "ir_module")
+    assert mod_res is not None and tvm.ir.structural_equal(mod_res, mod)
+
+
+def test_meta_schedule_pydatabase_override_query():
+
+    mod: IRModule = Matmul
+    target = tvm.target.Target("llvm")
+    arg_info = ms.arg_info.ArgInfo.from_prim_func(func=mod["main"])
+    db = PyMemoryDatabaseOverride()  # pylint: disable=invalid-name
+    sch = _create_schedule(mod, _schedule_matmul)
+    trace = sch.trace
+
+    def query(db, mod, target, kind):  # pylint: disable=invalid-name
+        return db.query(mod=mod, target=target, workload_name="main", kind=kind)
+
+    def commit_record(trace, db, run_sec):  # pylint: disable=invalid-name
+        db.commit_tuning_record(
+            ms.database.TuningRecord(
+                trace,
+                workload=db.commit_workload(mod),
+                run_secs=[run_sec],
+                target=target,
+                args_info=arg_info,
+            )
+        )
+
+    commit_record(trace, db, 1.14)
+    record = query(db, mod, target, "record")
+    assert record is not None and record.run_secs[0].value == 1.14
+    sch_res = query(db, mod, target, "schedule")
+    assert sch_res is not None and tvm.ir.structural_equal(sch_res.mod, sch.mod)
+    mod_res = query(db, mod, target, "ir_module")
+    assert mod_res is not None and tvm.ir.structural_equal(mod_res, sch.mod)
+
+    commit_record(Schedule(mod).trace, db, 0.514)  # Empty Trace
+    record = query(db, mod, target, "record")
+    assert record is not None and record.run_secs[0].value == 1.14  # Override to 2nd best
+    sch_res = query(db, mod, target, "schedule")
+    assert sch_res is not None and tvm.ir.structural_equal(sch_res.mod, sch.mod)
+    mod_res = query(db, mod, target, "ir_module")
+    assert mod_res is not None and tvm.ir.structural_equal(mod_res, sch.mod)
+
+
+def test_meta_schedule_pydatabase_current():
+    db = PyMemoryDatabaseDefault()  # pylint: disable=invalid-name
+    with db:  # pylint: disable=not-context-manager
+        assert ms.database.Database.current() == db
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From e18b48bed82da917e8e9a754217135bb5901f2a6 Mon Sep 17 00:00:00 2001
From: YudongChen <529641713@qq.com>
Date: Tue, 20 Sep 2022 11:07:46 +0800
Subject: [PATCH 211/704] [Fix] naming outputs of graph nodes by
 op_name:output_index (#12809)

to avoid fuzziness when the num of outputs per node is greater than 1. (#12672)

Co-authored-by: victor.chen <victor.chen@enflame-tech.com>
---
 src/runtime/graph_executor/graph_executor.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc
index fc7e82bed4e2..d805abfc658a 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -97,7 +97,9 @@ void GraphExecutor::Init(const std::string& graph_json, tvm::runtime::Module mod
   for (size_t i = 0; i < outputs_.size(); i++) {
     const uint32_t nid = outputs_[i].node_id;
     std::string& name = nodes_[nid].name;
-    output_map_[name] = i;
+    std::stringstream ss;
+    ss << name << ":" << i;
+    output_map_[ss.str()] = i;
   }
 }
 

From 18909a4c135cb8df5125cf1e417de7a35e02e705 Mon Sep 17 00:00:00 2001
From: "Sevin F. Varoglu" <sfvaroglu@octoml.ai>
Date: Tue, 20 Sep 2022 18:10:23 +0300
Subject: [PATCH 212/704] [ONNX] Upgrade onnx and onnxruntime (#12729)

Upgrade onnx and onnxruntime to latest
---
 docker/install/ubuntu_install_onnx.sh      |  4 +-
 tests/python/frontend/onnx/test_forward.py | 64 ++++++++++++++++++++++
 2 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh
index 7bd15afd7eb3..d775875bc7c5 100755
--- a/docker/install/ubuntu_install_onnx.sh
+++ b/docker/install/ubuntu_install_onnx.sh
@@ -27,8 +27,8 @@ set -o pipefail
 # https://github.com/onnx/onnx/pull/2834).  When updating the CI image
 # to onnx>=1.9, onnxoptimizer should also be installed.
 pip3 install \
-    onnx==1.10.2 \
-    onnxruntime==1.9.0 \
+    onnx==1.12.0 \
+    onnxruntime==1.12.1 \
     onnxoptimizer==0.2.7
 
 # torch depends on a number of other packages, but unhelpfully, does
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 893815de7e5c..17a0513844ba 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -5233,6 +5233,10 @@ def verify_eyelike(indata, dynamic=False):
     "test_bernoulli_double_expanded",
     "test_bernoulli_seed",
     "test_bernoulli_seed_expanded",
+    "test_blackmanwindow",
+    "test_blackmanwindow_expanded",
+    "test_blackmanwindow_symmetric",
+    "test_blackmanwindow_symmetric_expanded",
     "test_cast_DOUBLE_to_FLOAT16",
     "test_cast_FLOAT_to_STRING",
     "test_cast_STRING_to_FLOAT",
@@ -5262,19 +5266,61 @@ def verify_eyelike(indata, dynamic=False):
     "test_cumsum_2d_negative_axis",
     "test_det_2d",
     "test_det_nd",
+    "test_dft",
+    "test_dft_axis",
+    "test_dft_inverse",
     "test_dropout_default",
     "test_dropout_default_mask",
     "test_dropout_default_mask_ratio",
     "test_dropout_default_ratio",
+    "test_gridsample",
+    "test_gridsample_aligncorners_true",
+    "test_gridsample_bicubic",
+    "test_gridsample_bilinear",
+    "test_gridsample_border_padding",
+    "test_gridsample_nearest",
+    "test_gridsample_reflection_padding",
+    "test_gridsample_zeros_padding",
     "test_gru_batchwise",
+    "test_hammingwindow",
+    "test_hammingwindow_expanded",
+    "test_hammingwindow_symmetric",
+    "test_hammingwindow_symmetric_expanded",
+    "test_hannwindow",
+    "test_hannwindow_expanded",
+    "test_hannwindow_symmetric",
+    "test_hannwindow_symmetric_expanded",
+    "test_identity_opt",
     "test_identity_sequence",
+    "test_if_opt",
     "test_if_seq",
+    "test_layer_normalization_2d_axis0",
+    "test_layer_normalization_2d_axis1",
+    "test_layer_normalization_2d_axis_negative_1",
+    "test_layer_normalization_2d_axis_negative_2",
+    "test_layer_normalization_3d_axis0_epsilon",
+    "test_layer_normalization_3d_axis1_epsilon",
+    "test_layer_normalization_3d_axis2_epsilon",
+    "test_layer_normalization_3d_axis_negative_1_epsilon",
+    "test_layer_normalization_3d_axis_negative_2_epsilon",
+    "test_layer_normalization_3d_axis_negative_3_epsilon",
+    "test_layer_normalization_4d_axis0",
+    "test_layer_normalization_4d_axis1",
+    "test_layer_normalization_4d_axis2",
+    "test_layer_normalization_4d_axis3",
+    "test_layer_normalization_4d_axis_negative_1",
+    "test_layer_normalization_4d_axis_negative_2",
+    "test_layer_normalization_4d_axis_negative_3",
+    "test_layer_normalization_4d_axis_negative_4",
+    "test_layer_normalization_default_axis",
     "test_loop11",
     "test_loop13_seq",
+    "test_loop16_seq_none",
     "test_lstm_batchwise",
     "test_maxpool_with_argmax_2d_precomputed_pads",
     "test_maxpool_with_argmax_2d_precomputed_strides",
     "test_maxunpool_export_with_output_shape",
+    "test_melweightmatrix",
     # This test fails llvm with a lowering error:
     "test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded",
     "test_optional_has_element",
@@ -5290,8 +5336,24 @@ def verify_eyelike(indata, dynamic=False):
     "test_reduce_sum_keepdims_random",
     "test_reduce_sum_negative_axes_keepdims_example",
     "test_reduce_sum_negative_axes_keepdims_random",
+    "test_roialign_aligned_true",
+    "test_scatter_elements_with_duplicate_indices",
+    "test_scatternd_add",
+    "test_scatternd_multiply",
     "test_sequence_insert_at_back",
     "test_sequence_insert_at_front",
+    "test_sequence_map_add_1_sequence_1_tensor",
+    "test_sequence_map_add_1_sequence_1_tensor_expanded",
+    "test_sequence_map_add_2_sequences",
+    "test_sequence_map_add_2_sequences_expanded",
+    "test_sequence_map_extract_shapes",
+    "test_sequence_map_extract_shapes_expanded",
+    "test_sequence_map_identity_1_sequence",
+    "test_sequence_map_identity_1_sequence_1_tensor",
+    "test_sequence_map_identity_1_sequence_1_tensor_expanded",
+    "test_sequence_map_identity_1_sequence_expanded",
+    "test_sequence_map_identity_2_sequences",
+    "test_sequence_map_identity_2_sequences_expanded",
     "test_simple_rnn_batchwise",
     "test_simple_rnn_defaults",
     "test_simple_rnn_with_initial_bias",
@@ -5299,6 +5361,8 @@ def verify_eyelike(indata, dynamic=False):
     "test_split_variable_parts_2d",
     "test_split_variable_parts_default_axis",
     "test_split_zero_size_splits",
+    "test_stft",
+    "test_stft_with_window",
     "test_strnormalizer_export_monday_casesensintive_lower",
     "test_strnormalizer_export_monday_casesensintive_nochangecase",
     "test_strnormalizer_export_monday_casesensintive_upper",

From ecd003c742da85d4945c7d02e9301e07ad413136 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 20 Sep 2022 08:10:59 -0700
Subject: [PATCH 213/704] [ci] Lint PR title/body for @ tags (#12840)

This ensures that no users are tagged directly in PR titles or
descriptions, which lets us finally turn this on
https://github.blog/changelog/2022-08-23-new-options-for-controlling-the-default-commit-message-when-merging-a-pull-request/
---
 ci/scripts/check_pr.py     | 18 ++++++++----------
 tests/python/ci/test_ci.py | 12 ++++++++++++
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/ci/scripts/check_pr.py b/ci/scripts/check_pr.py
index 9af5ec5580a3..8be5c0ee46a8 100755
--- a/ci/scripts/check_pr.py
+++ b/ci/scripts/check_pr.py
@@ -69,19 +69,17 @@ def trailing_period(s: str):
 title_checks = [
     Check(check=non_empty, error_fn=lambda d: "PR must have a title but title was empty"),
     Check(check=trailing_period, error_fn=lambda d: "PR must not end in a tailing '.'"),
-    # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done
-    # Check(
-    #     check=usernames,
-    #     error_fn=lambda d: f"PR title must not tag anyone but found these usernames: {d}",
-    # ),
+    Check(
+        check=usernames,
+        error_fn=lambda d: f"PR title must not tag anyone but found these usernames: {d}",
+    ),
 ]
 body_checks = [
     Check(check=non_empty, error_fn=lambda d: "PR must have a body but body was empty"),
-    # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done
-    # Check(
-    #     check=usernames,
-    #     error_fn=lambda d: f"PR body must not tag anyone but found these usernames: {d}",
-    # ),
+    Check(
+        check=usernames,
+        error_fn=lambda d: f"PR body must not tag anyone but found these usernames: {d}",
+    ),
 ]
 
 
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index 4b8c5d9ad444..8c7c9f6bb409 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -1327,6 +1327,18 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec
         expected="non_empty: FAILED",
         expected_code=1,
     ),
+    user_title=dict(
+        title="[something] a change @someon",
+        body="hello",
+        expected="usernames: FAILED: PR title must not tag",
+        expected_code=1,
+    ),
+    user_body=dict(
+        title="[something] a change",
+        body="hello\n\n cc @someone",
+        expected="usernames: FAILED: PR body must not tag",
+        expected_code=1,
+    ),
 )
 def test_pr_linter(title, body, expected, expected_code):
     """

From d9f7cf3539bc9e94f7b5b2c343536388e1b7fd26 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 20 Sep 2022 09:48:23 -0700
Subject: [PATCH 214/704] Revert "[ci] Lint PR title/body for @ tags (#12840)"
 (#12848)

This reverts commit ecd003c742da85d4945c7d02e9301e07ad413136.

The check needs to ignore @ s in some cases, such as within code blocks.
---
 ci/scripts/check_pr.py     | 18 ++++++++++--------
 tests/python/ci/test_ci.py | 12 ------------
 2 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/ci/scripts/check_pr.py b/ci/scripts/check_pr.py
index 8be5c0ee46a8..9af5ec5580a3 100755
--- a/ci/scripts/check_pr.py
+++ b/ci/scripts/check_pr.py
@@ -69,17 +69,19 @@ def trailing_period(s: str):
 title_checks = [
     Check(check=non_empty, error_fn=lambda d: "PR must have a title but title was empty"),
     Check(check=trailing_period, error_fn=lambda d: "PR must not end in a tailing '.'"),
-    Check(
-        check=usernames,
-        error_fn=lambda d: f"PR title must not tag anyone but found these usernames: {d}",
-    ),
+    # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done
+    # Check(
+    #     check=usernames,
+    #     error_fn=lambda d: f"PR title must not tag anyone but found these usernames: {d}",
+    # ),
 ]
 body_checks = [
     Check(check=non_empty, error_fn=lambda d: "PR must have a body but body was empty"),
-    Check(
-        check=usernames,
-        error_fn=lambda d: f"PR body must not tag anyone but found these usernames: {d}",
-    ),
+    # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done
+    # Check(
+    #     check=usernames,
+    #     error_fn=lambda d: f"PR body must not tag anyone but found these usernames: {d}",
+    # ),
 ]
 
 
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index 8c7c9f6bb409..4b8c5d9ad444 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -1327,18 +1327,6 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec
         expected="non_empty: FAILED",
         expected_code=1,
     ),
-    user_title=dict(
-        title="[something] a change @someon",
-        body="hello",
-        expected="usernames: FAILED: PR title must not tag",
-        expected_code=1,
-    ),
-    user_body=dict(
-        title="[something] a change",
-        body="hello\n\n cc @someone",
-        expected="usernames: FAILED: PR body must not tag",
-        expected_code=1,
-    ),
 )
 def test_pr_linter(title, body, expected, expected_code):
     """

From 5dfa8da00ec658934f3fc0df8eb9f41a167e1545 Mon Sep 17 00:00:00 2001
From: Adam Straw <astraw@octoml.ai>
Date: Tue, 20 Sep 2022 12:38:04 -0700
Subject: [PATCH 215/704] [Hexagon] 2-Stage Pipeline; Lower Async TIR
 primitives to Hexagon User DMA (#12785)

* [Hexagon] 2-Stage Pipeline; Lower Async TIR primitives to HexagonUserDMA

* save queue ID in `copy`, inspect in `wait` transform; add comments

* improve testing; parameters for shape, scope, dtype

* add log statements and adjust comments to clarify pass behavior

* generalize use_async_copy for pass enable

* use DLOG instead of LOG

* trigger ci

* trigger ci again
---
 include/tvm/tir/builtin.h                     |  10 +
 include/tvm/tir/transform.h                   |   5 +
 src/driver/driver_api.cc                      |  12 +-
 src/runtime/hexagon/hexagon_device_api.cc     |  25 +++
 src/tir/op/builtin.cc                         |   6 +
 src/tir/transforms/lower_async_dma.cc         | 194 ++++++++++++++++++
 src/tir/transforms/lower_tvm_builtin.cc       |  30 +++
 .../test_software_pipeline_async.py           |  86 ++++++++
 ...est_tir_transform_inject_ptx_async_copy.py |   4 +-
 ..._tir_transform_inject_software_pipeline.py |   2 +-
 10 files changed, 367 insertions(+), 7 deletions(-)
 create mode 100644 src/tir/transforms/lower_async_dma.cc
 create mode 100644 tests/python/contrib/test_hexagon/test_software_pipeline_async.py

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index 12290a97c840..a1a97595bfd8 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -720,6 +720,16 @@ TVM_DLL const Op& texture2d_load();
  */
 TVM_DLL const Op& mem_copy();
 
+/*!
+ * \brief Initiate a non-blocking DMA copy from source to destination
+ */
+TVM_DLL const Op& dma_copy();
+
+/*!
+ * \brief Wait until the number of DMAs in flight is less than or equal to some maximum
+ */
+TVM_DLL const Op& dma_wait();
+
 /*!
  * \brief Provide a true statement that can be used for simplifications
  *
diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
index fd4261e4a4e3..a4caeee43604 100644
--- a/include/tvm/tir/transform.h
+++ b/include/tvm/tir/transform.h
@@ -485,6 +485,11 @@ TVM_DLL Pass TextureFlatten();
  */
 TVM_DLL Pass LowerVtcmAlloc();
 
+/*!
+ * \brief Lower Async TIR primitives to DMA copy and wait builtins
+ */
+TVM_DLL Pass LowerAsyncDMA();
+
 /*!
  * \brief Implements a Common Subexpression Elimination (CSE) for TIR
  *        which introduces let-in bindings for duplicated sub-expressions.
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index e528686d967d..1a617dcd494d 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -50,7 +50,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_storage_rewrite", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.is_entry_func", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array<Array<ObjectRef>>);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool);
-TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_ptx_async_copy", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool);
 
 using runtime::PackedFunc;
 using runtime::TVMArgs;
@@ -225,6 +225,11 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
   }
   // LowerVtcmAlloc must occur after any transformations that modify memory allocation locations
   pass_list.push_back(tir::transform::LowerVtcmAlloc());
+  bool use_async_copy = pass_ctx->GetConfig<Bool>("tir.use_async_copy", Bool(false)).value();
+
+  if (use_async_copy) {
+    pass_list.push_back(tir::transform::LowerAsyncDMA());
+  }
   pass_list.push_back(tir::transform::UnrollLoop());
 
   // Add user-defined phase-2 passes
@@ -543,10 +548,9 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target)
   mixed_pass_list.push_back(tir::transform::InferFragment());
   mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce());
 
-  bool use_ptx_async_copy =
-      pass_ctx->GetConfig<Bool>("tir.use_ptx_async_copy", Bool(false)).value();
+  bool use_async_copy = pass_ctx->GetConfig<Bool>("tir.use_async_copy", Bool(false)).value();
 
-  if (use_ptx_async_copy) {
+  if (use_async_copy) {
     mixed_pass_list.push_back(tir::transform::InjectPTXAsyncCopy());
   }
 
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 463d9799b082..84232a614428 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -33,6 +33,7 @@
 
 #include "../workspace_pool.h"
 #include "hexagon_common.h"
+#include "hexagon_user_dma.h"
 
 namespace tvm {
 namespace runtime {
@@ -206,6 +207,30 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM
   *rv = static_cast<int32_t>(0);
 });
 
+TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy").set_body([](TVMArgs args, TVMRetValue* rv) {
+  int queue_id = args[0];
+  ICHECK(queue_id == 0 && "Hexagon supports just a single asynchronous queue for DMA");
+  void* dst = args[1];
+  void* src = args[2];
+  int size = args[3];
+  ICHECK(size > 0);
+
+  int ret = DMA_RETRY;
+  do {
+    ret = HexagonUserDMA::Get().Copy(dst, src, size);
+  } while (ret == DMA_RETRY);
+  *rv = static_cast<int32_t>(ret);
+});
+
+TVM_REGISTER_GLOBAL("device_api.hexagon.dma_wait").set_body([](TVMArgs args, TVMRetValue* rv) {
+  int queue_id = args[0];
+  ICHECK(queue_id == 0 && "Hexagon supports just a single asynchronous queue for DMA");
+  int inflight = args[1];
+  ICHECK(inflight >= 0);
+  HexagonUserDMA::Get().Wait(inflight);
+  *rv = static_cast<int32_t>(0);
+});
+
 TVM_REGISTER_GLOBAL("device_api.hexagon.alloc_nd").set_body([](TVMArgs args, TVMRetValue* rv) {
   int32_t device_type = args[0];
   int32_t device_id = args[1];
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 9642f8e39f39..1e2d790c76e1 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -288,6 +288,12 @@ TIR_DEFINE_BUILTIN_FUNC(texture2d_load)
 TIR_DEFINE_BUILTIN_FUNC(mem_copy).set_attr<TCallEffectKind>("TCallEffectKind",
                                                             Integer(CallEffectKind::kOpaque));
 
+TIR_DEFINE_BUILTIN_FUNC(dma_copy).set_attr<TCallEffectKind>("TCallEffectKind",
+                                                            Integer(CallEffectKind::kOpaque));
+
+TIR_DEFINE_BUILTIN_FUNC(dma_wait).set_attr<TCallEffectKind>("TCallEffectKind",
+                                                            Integer(CallEffectKind::kOpaque));
+
 TIR_DEFINE_BUILTIN_FUNC(assume)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kEmbedInfo))
     .set_num_inputs(1);
diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc
new file mode 100644
index 000000000000..78d363f67c02
--- /dev/null
+++ b/src/tir/transforms/lower_async_dma.cc
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file lower_async_dma.cc
+ */
+
+#include <tvm/arith/analyzer.h>
+#include <tvm/tir/stmt_functor.h>
+#include <tvm/tir/transform.h>
+
+#include "ir_utils.h"
+
+namespace tvm {
+namespace tir {
+
+class AsyncDMALowerer : public StmtExprMutator {
+ public:
+  AsyncDMALowerer() {}
+
+  Stmt VisitStmt_(const AttrStmtNode* op) final {
+    // Convert this, for example:
+    // attr [0] "async_wait_queue_scope" = 0;
+    // attr [0] "async_wait_inflight_count" = 0;
+    //
+    // To this:
+    // @tir.dma_wait(
+    //   0, /* queue id */
+    //   0, /* in flight count */
+    //   dtype=int32
+    // )
+    if (op->attr_key == tir::attr::async_wait_queue_scope) {
+      // get queue ID
+      auto queue_id_node = op->value.as<IntImmNode>();
+      ICHECK(queue_id_node);
+      int queue_id = queue_id_node->value;
+
+      // abort if we have not seen this queue ID in `copy` transform
+      if (queue_ids.find(queue_id) == queue_ids.end()) {
+        DLOG(INFO) << "AsyncDMALowerer exiting because the queue ID observed in the "
+                      "`async_wait_queue_scope` transform has not been previously observed in the "
+                      "`async_commit_queue_scope` transform";
+        return StmtExprMutator::VisitStmt_(op);
+      }
+
+      auto async_wait = op->body.as<AttrStmtNode>();
+      if (!async_wait || async_wait->attr_key != tir::attr::async_wait_inflight_count) {
+        DLOG(INFO) << "AsyncDMALowerer exiting because the body of the `AttrStmtNode` with key "
+                      "`async_wait_queue_scope` does not contain an `AttrStmtNode` with key "
+                      "`async_wait_inflight_count`";
+        return StmtExprMutator::VisitStmt_(op);
+      }
+
+      auto call_dma_wait =
+          Evaluate(Call(DataType::Int(32), builtin::dma_wait(), {queue_id, async_wait->value}));
+
+      // concatenate the call with the body and return
+      return SeqStmt({call_dma_wait, async_wait->body});
+
+      // Convert this, for example:
+      // attr [0] "async_commit_queue_scope" = 0;
+      // attr [0] "async_scope" = 1;
+      // for (ax0: int32, 0, 128) {
+      //   A_global[ax0] = A[ax0]
+      // }
+      //
+      // To this:
+      // @tir.dma_copy(
+      //   0, /* queue id */
+      //   @tir.address_of(A_global[0], dtype=handle),
+      //   @tir.address_of(A[0], dtype=handle),
+      //   128, /* size */
+      //   dtype=int32
+      // )
+    } else if (op->attr_key == tir::attr::async_commit_queue_scope) {
+      // get queue ID
+      auto queue_id_node = op->value.as<IntImmNode>();
+      ICHECK(queue_id_node);
+      int queue_id = queue_id_node->value;
+
+      // save queue ID for inspection in `wait` transform
+      queue_ids.insert(queue_id);
+
+      // walk the graph to verify this is a mem copy ...
+      // 1) async_commit_queue_scope contains async_scope
+      auto async_scope = op->body.as<AttrStmtNode>();
+      if (!async_scope || async_scope->attr_key != tir::attr::async_scope) {
+        DLOG(INFO) << "AsyncDMALowerer exiting because the body of the `AttrStmtNode` with key "
+                      "`async_commit_queue_scope` does not contain an `AttrStmtNode` with key "
+                      "`async_scope`";
+        return StmtExprMutator::VisitStmt_(op);
+      }
+
+      // 2) async_scope contains single for loop
+      auto for_loop = async_scope->body.as<ForNode>();
+      if (!for_loop) {
+        DLOG(INFO) << "AsyncDMALowerer exiting because the body of the `AttrStmtNode` with key "
+                      "`async_scope` does not contain a single `ForNode`";
+        return StmtExprMutator::VisitStmt_(op);
+      }
+
+      // 3) for loop contains buffer store with single index
+      auto bufferstorenode = for_loop->body.as<BufferStoreNode>();
+      if (!bufferstorenode || bufferstorenode->indices.size() != 1) {
+        DLOG(INFO)
+            << "AsyncDMALowerer exiting because the body of the `ForNode` does not contain a "
+               "single `BufferStoreNode` with a single index variable";
+        return StmtExprMutator::VisitStmt_(op);
+      }
+
+      // 4) buffer store value is a buffer load with single index
+      auto bufferloadnode = bufferstorenode->value.as<BufferLoadNode>();
+      if (!bufferloadnode || bufferloadnode->indices.size() != 1) {
+        DLOG(INFO) << "AsyncDMALowerer exiting because the value of the `BufferStoreNode` is not a "
+                      "single `BufferLoadNode` with a single index variable";
+        return StmtExprMutator::VisitStmt_(op);
+      }
+
+      // get store buffer; assert it exists and is contiguous given it uses a single index
+      auto bufferstore = bufferstorenode->buffer.as<BufferNode>();
+      ICHECK(bufferstore && bufferstore->strides.empty());
+
+      // get load buffer; assert it exists and is contiguous given it uses a single index
+      auto bufferload = bufferloadnode->buffer.as<BufferNode>();
+      ICHECK(bufferload && bufferload->strides.empty());
+
+      // we will be replacing the entire for loop including its index
+      // with a DMA copy instrinsic that spans the entire index space of the for loop
+      // so we will need to replace the for loop index with value zero in the buffer indices
+      // thus we eliminate the index from the expression so the DMA copy receives the buffer range
+      // base address
+      Map<Var, PrimExpr> loop_var_remap = {{for_loop->loop_var, IntImm(DataType::Int(32), 0)}};
+
+      // map loop variable to zero for the store index & simplify
+      Array<PrimExpr> store_index = bufferstorenode->indices;
+      store_index.MutateByApply([&](PrimExpr expr) {
+        arith::Analyzer analyzer;
+        return analyzer.Simplify(Substitute(std::move(expr), loop_var_remap));
+      });
+
+      // map loop variable to zero for the load index & simplify
+      Array<PrimExpr> load_index = bufferloadnode->indices;
+      load_index.MutateByApply([&](PrimExpr expr) {
+        arith::Analyzer analyzer;
+        return analyzer.Simplify(Substitute(std::move(expr), loop_var_remap));
+      });
+
+      return Evaluate(Call(DataType::Int(32), builtin::dma_copy(),
+                           {queue_id,
+                            Call(DataType::Handle(), builtin::address_of(),
+                                 {BufferLoad(bufferstorenode->buffer, store_index)}),
+                            Call(DataType::Handle(), builtin::address_of(),
+                                 {BufferLoad(bufferloadnode->buffer, load_index)}),
+                            for_loop->extent * bufferloadnode->dtype.bytes()}));
+    }
+    return StmtExprMutator::VisitStmt_(op);
+  }
+
+ private:
+  std::set<int> queue_ids;
+};
+
+namespace transform {
+
+Pass LowerAsyncDMA() {
+  auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) {
+    auto fptr = f.CopyOnWrite();
+    fptr->body = AsyncDMALowerer()(std::move(fptr->body));
+    return f;
+  };
+  return CreatePrimFuncPass(pass_func, 0, "tir.LowerAsyncDMA", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.LowerAsyncDMA").set_body_typed(LowerAsyncDMA);
+}  // namespace transform
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc
index 9d0087cc7a0b..f79682ef7ecc 100644
--- a/src/tir/transforms/lower_tvm_builtin.cc
+++ b/src/tir/transforms/lower_tvm_builtin.cc
@@ -317,6 +317,10 @@ class BuiltinLower : public StmtExprMutator {
       return make_zero(op->dtype);
     } else if (op->op.same_as(builtin::mem_copy())) {
       return MakeMemCopy(op);
+    } else if (op->op.same_as(builtin::dma_copy())) {
+      return MakeDMACopy(op);
+    } else if (op->op.same_as(builtin::dma_wait())) {
+      return MakeDMAWait(op);
     } else {
       return StmtExprMutator::VisitExpr_(op);
     }
@@ -335,6 +339,32 @@ class BuiltinLower : public StmtExprMutator {
     return VisitExpr(call_packed);
   }
 
+  PrimExpr MakeDMACopy(const CallNode* op) {
+    PrimExpr queue_id = op->args[0];
+    PrimExpr dst = op->args[1];
+    PrimExpr src = op->args[2];
+    PrimExpr size = op->args[3];
+
+    std::string fdevapi_prefix =
+        "device_api." + std::string(runtime::DeviceName(device_type_.as<IntImmNode>()->value));
+
+    Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(),
+                            {StringImm(fdevapi_prefix + ".dma_copy"), queue_id, dst, src, size});
+    return VisitExpr(call_packed);
+  }
+
+  PrimExpr MakeDMAWait(const CallNode* op) {
+    PrimExpr queue_id = op->args[0];
+    PrimExpr inflight = op->args[1];
+
+    std::string fdevapi_prefix =
+        "device_api." + std::string(runtime::DeviceName(device_type_.as<IntImmNode>()->value));
+
+    Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(),
+                            {StringImm(fdevapi_prefix + ".dma_wait"), queue_id, inflight});
+    return VisitExpr(call_packed);
+  }
+
   // call shape
   PrimExpr MakeShape(const CallNode* op) {
     // if args.size() == 0, it represents a scalar shape ()
diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
new file mode 100644
index 000000000000..6bcca90ec9d3
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+import pytest
+import numpy as np
+
+import tvm
+from tvm import tir
+from tvm.contrib.hexagon.session import Session
+from tvm.script import tir as T
+
+outer = tvm.testing.parameter(8, 16)
+inner = tvm.testing.parameter(64, 128)
+scope = tvm.testing.parameter("global", "global.vtcm")
+dtype = tvm.testing.parameter("uint8", "float16")
+
+
+@tvm.testing.fixture
+def compute(outer, inner, dtype):
+    @T.prim_func
+    def plus_one_primfunc(A: T.Buffer[(outer, inner), dtype], B: T.Buffer[(outer, inner), dtype]):
+        for i in T.serial(outer):
+            for j in T.serial(inner):
+                with T.block("compute"):
+                    with T.block():
+                        B[i, j] = A[i, j] + T.cast(1, dtype)
+
+    def plus_one_ref(a):
+        return a + 1
+
+    return plus_one_primfunc, plus_one_ref
+
+
+@tvm.testing.requires_hexagon
+def test_software_pipeline_with_cache_read(hexagon_launcher, compute, outer, inner, dtype, scope):
+    sch = tir.Schedule(compute[0])
+    root = sch.get_block("root")
+    compute_block = sch.get_block("compute")
+    cache_read_block = sch.cache_read(compute_block, 0, scope)
+
+    i, _ = sch.get_loops(compute_block)
+    sch.compute_at(cache_read_block, i)
+    sch.annotate(i, "software_pipeline_stage", [0, 1])
+    sch.annotate(i, "software_pipeline_order", [0, 1])
+    sch.annotate(i, "software_pipeline_async_stages", [0])
+
+    a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
+    b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
+    ref = compute[1](a_np)
+
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
+        func = tvm.build(
+            sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
+        )
+
+    with hexagon_launcher.start_session() as hexagon_session:
+        dev = hexagon_session.device
+        a = tvm.nd.array(a_np, device=dev)
+        b = tvm.nd.array(b_np, device=dev)
+        mod = hexagon_session.load_module(func)
+        mod(a, b)
+
+        if "int" in dtype:
+            np.testing.assert_equal(b.numpy(), ref)
+        else:
+            np.testing.assert_allclose(b.numpy(), ref, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(sys.argv))
diff --git a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
index 1a906b2fb66e..7062d5129713 100644
--- a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
+++ b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
@@ -138,7 +138,7 @@ def test_inject_async_copy():
         if not tvm.testing.is_ampere_or_newer():
             continue
 
-        with tvm.transform.PassContext(config={"tir.use_ptx_async_copy": 1}):
+        with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
             mod = tvm.build(tvm.IRModule.from_expr(f), target="cuda")
 
         A_np = np.random.rand(32, 128).astype(dtype)
@@ -166,7 +166,7 @@ def test_inject_async_copy_shared_dyn():
     if not tvm.testing.is_ampere_or_newer():
         return
 
-    with tvm.transform.PassContext(config={"tir.use_ptx_async_copy": 1}):
+    with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
         mod = tvm.build(tvm.IRModule.from_expr(f), target="cuda")
 
     A_np = np.random.rand(32, 128).astype("float16")
diff --git a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
index edaeb7c9b639..49255e0f2094 100644
--- a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
+++ b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
@@ -1390,7 +1390,7 @@ def index_map(i, j):
 
 def build_and_run(sch):
     if tvm.testing.is_ampere_or_newer():
-        with tvm.transform.PassContext(config={"tir.use_ptx_async_copy": 1}):
+        with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
             f = tvm.build(sch.mod["main"], target="cuda")
 
         dev = tvm.device("cuda", 0)

From 534378b935aa08b77e7529ec183133a24f121ae4 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Tue, 20 Sep 2022 15:49:46 -0500
Subject: [PATCH 216/704] [Containers] Add Array::Map (#12692)

* [Containers] Add Array::Map

Previously, an in-place mutation could be applied to an array using
`Array::MutateByApply`, but this couldn't be used for transformations
that return a new array, or for transformations that return a new
type.

The commit adds `Array::Map`, which can map to any `ObjectRef`
subclass.  For mappings that return the same type, this is done by
delegating to `Array::MutateByApply`, to take advantage of the same
copy-on-write behavior.

* [Refactor] Use Array::Map where possible

With the new `Array::Map` functionality, many places that previously
used explicit loops or `tvm::tir::MutateArray` can be cleaned.

* Merge the Map and MutateInPlace implementations

* Fix off-by-one error in MapHelper

* Updated with unit tests for Array::Map conversions

* Improved comments explaining the copy-on-write in MapHelper
---
 include/tvm/runtime/container/array.h         | 198 ++++++++++++++----
 src/ir/type_functor.cc                        |   9 +-
 src/te/operation/create_primfunc.cc           |   2 +-
 src/tir/analysis/device_constraint_utils.cc   |   5 +-
 src/tir/ir/buffer.cc                          |   4 +-
 src/tir/ir/expr.cc                            |   3 +-
 src/tir/ir/expr_functor.cc                    |  14 +-
 src/tir/ir/functor_common.h                   |   3 +-
 src/tir/ir/index_map.cc                       |   5 +-
 src/tir/ir/specialize.cc                      |  19 +-
 src/tir/ir/stmt_functor.cc                    |   3 +-
 .../schedule/primitive/decompose_padding.cc   |  15 +-
 src/tir/schedule/transform.cc                 |   8 +-
 src/tir/transforms/inject_virtual_thread.cc   |   4 +-
 src/tir/transforms/lower_match_buffer.cc      |   8 +-
 src/tir/transforms/renew_defs.cc              |  37 ++--
 src/tir/transforms/vectorize_loop.cc          |   6 +-
 tests/cpp/container_test.cc                   | 135 ++++++++++++
 18 files changed, 353 insertions(+), 125 deletions(-)

diff --git a/include/tvm/runtime/container/array.h b/include/tvm/runtime/container/array.h
index 26f4e545deb7..11bacb18e92c 100644
--- a/include/tvm/runtime/container/array.h
+++ b/include/tvm/runtime/container/array.h
@@ -26,10 +26,12 @@
 
 #include <algorithm>
 #include <memory>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
 #include "./base.h"
+#include "./optional.h"
 
 namespace tvm {
 namespace runtime {
@@ -248,6 +250,23 @@ class ArrayNode : public Object, public InplaceArrayBase<ArrayNode, ObjectRef> {
   friend ObjectPtr<ArrayNode> make_object<>();
 };
 
+/*! \brief Helper struct for type-checking
+ *
+ * is_valid_iterator<T,IterType>::value will be true if IterType can
+ * be dereferenced into a type that can be stored in an Array<T>, and
+ * false otherwise.
+ */
+template <typename T, typename IterType>
+struct is_valid_iterator
+    : std::bool_constant<std::is_base_of_v<
+          T, std::remove_cv_t<std::remove_reference_t<decltype(*std::declval<IterType>())>>>> {};
+
+template <typename T, typename IterType>
+struct is_valid_iterator<Optional<T>, IterType> : is_valid_iterator<T, IterType> {};
+
+template <typename T, typename IterType>
+inline constexpr bool is_valid_iterator_v = is_valid_iterator<T, IterType>::value;
+
 /*!
  * \brief Array, container representing a contiguous sequence of ObjectRefs.
  *
@@ -574,54 +593,39 @@ class Array : public ObjectRef {
   /*! \return The underlying ArrayNode */
   ArrayNode* GetArrayNode() const { return static_cast<ArrayNode*>(data_.get()); }
 
+  /*!
+   * \brief Helper function to apply a map function onto the array.
+   *
+   * \param fmap The transformation function T -> U.
+   *
+   * \tparam F The type of the mutation function.
+   *
+   * \tparam U The type of the returned array, inferred from the
+   * return type of F.  If overridden by the user, must be something
+   * that is convertible from the return type of F.
+   *
+   * \note This function performs copy on write optimization.  If
+   * `fmap` returns an object of type `T`, and all elements of the
+   * array are mapped to themselves, then the returned array will be
+   * the same as the original, and reference counts of the elements in
+   * the array will not be incremented.
+   *
+   * \return The transformed array.
+   */
+  template <typename F, typename U = std::invoke_result_t<F, T>>
+  Array<U> Map(F fmap) const {
+    return Array<U>(MapHelper(data_, fmap));
+  }
+
   /*!
    * \brief Helper function to apply fmutate to mutate an array.
    * \param fmutate The transformation function T -> T.
    * \tparam F the type of the mutation function.
    * \note This function performs copy on write optimization.
    */
-  template <typename F>
+  template <typename F, typename = std::enable_if_t<std::is_same_v<T, std::invoke_result_t<F, T>>>>
   void MutateByApply(F fmutate) {
-    if (data_ == nullptr) {
-      return;
-    }
-    struct StackFrame {
-      ArrayNode* p;
-      ObjectRef* itr;
-      int64_t i;
-      int64_t size;
-    };
-    std::unique_ptr<StackFrame> s = std::make_unique<StackFrame>();
-    s->p = GetArrayNode();
-    s->itr = s->p->MutableBegin();
-    s->i = 0;
-    s->size = s->p->size_;
-    if (!data_.unique()) {
-      // Loop invariant: keeps iterating when
-      // 1) data is not unique
-      // 2) no elements are actually mutated yet
-      for (; s->i < s->size; ++s->i, ++s->itr) {
-        T new_elem = fmutate(DowncastNoCheck<T>(*s->itr));
-        // do nothing when there is no mutation
-        if (new_elem.same_as(*s->itr)) {
-          continue;
-        }
-        // loop invariant breaks when the first real mutation happens
-        // we copy the elements into a new unique array
-        ObjectPtr<ArrayNode> copy = ArrayNode::CopyFrom(s->p->capacity_, s->p);
-        s->itr = copy->MutableBegin() + (s->i++);
-        *s->itr++ = std::move(new_elem);
-        data_ = std::move(copy);
-        // make sure `data_` is unique and break
-        break;
-      }
-    }
-    // when execution comes to this line, it is guaranteed that either
-    //    1) i == size
-    // or 2) data_.unique() is true
-    for (; s->i < s->size; ++s->i, ++s->itr) {
-      *s->itr = std::move(fmutate(std::move(DowncastNoCheck<T>(std::move(*s->itr)))));
-    }
+    data_ = MapHelper(std::move(data_), fmutate);
   }
 
   /*!
@@ -706,6 +710,118 @@ class Array : public ObjectRef {
     }
     return static_cast<ArrayNode*>(data_.get());
   }
+
+  /*! \brief Helper method for mutate/map
+   *
+   * A helper function used internally by both `Array::Map` and
+   * `Array::MutateInPlace`.  Given an array of data, apply the
+   * mapping function to each element, returning the collected array.
+   * Applies both mutate-in-place and copy-on-write optimizations, if
+   * possible.
+   *
+   * \param data A pointer to the ArrayNode containing input data.
+   * Passed by value to allow for mutate-in-place optimizations.
+   *
+   * \param fmap The mapping function
+   *
+   * \tparam F The type of the mutation function.
+   *
+   * \tparam U The output type of the mutation function.  Inferred
+   * from the callable type given.  Must inherit from ObjectRef.
+   *
+   * \return The mapped array.  Depending on whether mutate-in-place
+   * or copy-on-write optimizations were applicable, may be the same
+   * underlying array as the `data` parameter.
+   */
+  template <typename F, typename U = std::invoke_result_t<F, T>>
+  static ObjectPtr<Object> MapHelper(ObjectPtr<Object> data, F fmap) {
+    if (data == nullptr) {
+      return nullptr;
+    }
+
+    ICHECK(data->IsInstance<ArrayNode>());
+
+    constexpr bool is_same_output_type = std::is_same_v<T, U>;
+
+    if constexpr (is_same_output_type) {
+      if (data.unique()) {
+        // Mutate-in-place path.  Only allowed if the output type U is
+        // the same as type T, we have a mutable this*, and there are
+        // no other shared copies of the array.
+        auto arr = static_cast<ArrayNode*>(data.get());
+        for (auto it = arr->MutableBegin(); it != arr->MutableEnd(); it++) {
+          T mapped = fmap(DowncastNoCheck<T>(std::move(*it)));
+          *it = std::move(mapped);
+        }
+        return data;
+      }
+    }
+
+    constexpr bool compatible_types = is_valid_iterator_v<T, U*> || is_valid_iterator_v<U, T*>;
+
+    ObjectPtr<ArrayNode> output = nullptr;
+    auto arr = static_cast<ArrayNode*>(data.get());
+
+    auto it = arr->begin();
+    if constexpr (compatible_types) {
+      // Copy-on-write path, if the output Array<U> might be
+      // represented by the same underlying array as the existing
+      // Array<T>.  Typically, this is for functions that map `T` to
+      // `T`, but can also apply to functions that map `T` to
+      // `Optional<T>`, or that map `T` to a subclass or superclass of
+      // `T`.
+      bool all_identical = true;
+      for (; it != arr->end(); it++) {
+        U mapped = fmap(DowncastNoCheck<T>(*it));
+        if (!mapped.same_as(*it)) {
+          // At least one mapped element is different than the
+          // original.  Therefore, prepare the output array,
+          // consisting of any previous elements that had mapped to
+          // themselves (if any), and the element that didn't map to
+          // itself.
+          all_identical = false;
+          output = ArrayNode::CreateRepeated(arr->size(), U());
+          output->InitRange(0, arr->begin(), it);
+          output->SetItem(it - arr->begin(), std::move(mapped));
+          it++;
+          break;
+        }
+      }
+      if (all_identical) {
+        return data;
+      }
+    } else {
+      // Path for incompatible types.  The constexpr check for
+      // compatible types isn't strictly necessary, as the first
+      // mapped.same_as(*it) would return false, but we might as well
+      // avoid it altogether.
+      output = ArrayNode::CreateRepeated(arr->size(), U());
+    }
+
+    // Normal path for incompatible types, or post-copy path for
+    // copy-on-write instances.
+    //
+    // If the types are incompatible, then at this point `output` is
+    // empty, and `it` points to the first element of the input.
+    //
+    // If the types were compatible, then at this point `output`
+    // contains zero or more elements that mapped to themselves
+    // followed by the first element that does not map to itself, and
+    // `it` points to the element just after the first element that
+    // does not map to itself.  Because at least one element has been
+    // changed, we no longer have the opportunity to avoid a copy, so
+    // we don't need to check the result.
+    //
+    // In both cases, `it` points to the next element to be processed,
+    // so we can either start or resume the iteration from that point,
+    // with no further checks on the result.
+    for (; it != arr->end(); it++) {
+      U mapped = fmap(DowncastNoCheck<T>(*it));
+      output->SetItem(it - arr->begin(), std::move(mapped));
+    }
+
+    return output;
+  }
 };
 
 /*!
diff --git a/src/ir/type_functor.cc b/src/ir/type_functor.cc
index 51d5d3778c10..36838b62aabc 100644
--- a/src/ir/type_functor.cc
+++ b/src/ir/type_functor.cc
@@ -97,14 +97,7 @@ Type TypeMutator::VisitType(const Type& t) {
 Array<Type> TypeMutator::MutateArray(Array<Type> arr) {
   // The array will do copy on write
   // If no changes are made, the original array will be returned.
-  for (size_t i = 0; i < arr.size(); ++i) {
-    Type ty = arr[i];
-    Type new_ty = VisitType(ty);
-    if (!ty.same_as(new_ty)) {
-      arr.Set(i, new_ty);
-    }
-  }
-  return arr;
+  return arr.Map([this](const Type& ty) { return VisitType(ty); });
 }
 
 Type TypeMutator::VisitType_(const TypeVarNode* op) { return GetRef<TypeVar>(op); }
diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index 4c1358f42519..fb325684e65b 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -256,7 +256,7 @@ BlockRealize GenerateBlockFromTensors(const te::ComputeOp& compute_op,
     // TensorIR will not allow Tensor data structure
     if (value->IsInstance<ArrayNode>()) {
       const auto array_value = Downcast<Array<ObjectRef>>(value);
-      annotations.Set(key, MutateArray(array_value, mutate_attr));
+      annotations.Set(key, array_value.Map(mutate_attr));
     } else {
       annotations.Set(key, mutate_attr(value));
     }
diff --git a/src/tir/analysis/device_constraint_utils.cc b/src/tir/analysis/device_constraint_utils.cc
index 1309681513a9..32b59ce54b69 100644
--- a/src/tir/analysis/device_constraint_utils.cc
+++ b/src/tir/analysis/device_constraint_utils.cc
@@ -393,9 +393,8 @@ class ApplyDeviceConstraintsMutator : public StmtExprMutator {
   }
 
   template <typename T>
-  Array<T> VisitItems(Array<T> items) {
-    items.MutateByApply([this](const T& item) { return VisitItem(item.get()); });  // copy-on-write
-    return items;
+  Array<T> VisitItems(const Array<T>& items) {
+    return items.Map([this](T item) -> T { return VisitItem(item.get()); });
   }
 
   Stmt VisitStmt_(const BlockNode* block_node) final {
diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc
index cae4109a6026..0dfda954b818 100644
--- a/src/tir/ir/buffer.cc
+++ b/src/tir/ir/buffer.cc
@@ -461,8 +461,8 @@ Buffer Buffer::MakeSlice(Array<PrimExpr> begins, Array<PrimExpr> extents) const
   ICHECK(n != nullptr);
   arith::Analyzer ana;
   begins = SimplifyArray(&ana, begins);
-  Array<PrimExpr> elem_offset = n->ElemOffset(begins);
-  elem_offset.MutateByApply([&](const PrimExpr& expr) { return ana.Simplify(expr); });
+  Array<PrimExpr> elem_offset =
+      n->ElemOffset(begins).Map([&](const PrimExpr& expr) { return ana.Simplify(expr); });
 
   Array<PrimExpr> strides = n->strides;
   if (strides.size() == 0) {
diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc
index 59db4ea410fd..daae7eaf68f5 100644
--- a/src/tir/ir/expr.cc
+++ b/src/tir/ir/expr.cc
@@ -994,8 +994,7 @@ Array<PrimExpr> CommReducerNode::operator()(Array<PrimExpr> a, Array<PrimExpr> b
     value_map.Set(lhs[i], a[i]);
     value_map.Set(rhs[i], b[i]);
   }
-  auto ret = this->result;
-  ret.MutateByApply([&value_map](const PrimExpr& e) { return Substitute(e, value_map); });
+  auto ret = this->result.Map([&value_map](const PrimExpr& e) { return Substitute(e, value_map); });
   return ret;
 }
 
diff --git a/src/tir/ir/expr_functor.cc b/src/tir/ir/expr_functor.cc
index c8dc84695b4f..da02e0316f48 100644
--- a/src/tir/ir/expr_functor.cc
+++ b/src/tir/ir/expr_functor.cc
@@ -132,7 +132,7 @@ PrimExpr ExprMutator::VisitExpr_(const LoadNode* op) {
 
 PrimExpr ExprMutator::VisitExpr_(const BufferLoadNode* op) {
   auto fmutate = [this](const PrimExpr& e) { return this->VisitExpr(e); };
-  Array<PrimExpr> indices = MutateArray(op->indices, fmutate);
+  Array<PrimExpr> indices = op->indices.Map(fmutate);
   if (indices.same_as(op->indices)) {
     return GetRef<PrimExpr>(op);
   } else {
@@ -142,7 +142,7 @@ PrimExpr ExprMutator::VisitExpr_(const BufferLoadNode* op) {
 
 PrimExpr ExprMutator::VisitExpr_(const ProducerLoadNode* op) {
   auto fmutate = [this](const PrimExpr& e) { return this->VisitExpr(e); };
-  Array<PrimExpr> indices = MutateArray(op->indices, fmutate);
+  Array<PrimExpr> indices = op->indices.Map(fmutate);
   if (indices.same_as(op->indices)) {
     return GetRef<PrimExpr>(op);
   } else {
@@ -162,7 +162,7 @@ PrimExpr ExprMutator::VisitExpr_(const LetNode* op) {
 
 PrimExpr ExprMutator::VisitExpr_(const CallNode* op) {
   auto fmutate = [this](const PrimExpr& e) { return this->VisitExpr(e); };
-  Array<PrimExpr> args = MutateArray(op->args, fmutate);
+  Array<PrimExpr> args = op->args.Map(fmutate);
 
   if (args.same_as(op->args)) {
     return GetRef<PrimExpr>(op);
@@ -218,11 +218,11 @@ PrimExpr ExprMutator::VisitExpr_(const ReduceNode* op) {
       return IterVar(Range::FromMinExtent(min, extent), v->var, v->iter_type, v->thread_tag);
     }
   };
-  Array<IterVar> axis = MutateArray(op->axis, fitervar);
+  Array<IterVar> axis = op->axis.Map(fitervar);
 
   auto fexpr = [this](const PrimExpr& e) { return this->VisitExpr(e); };
-  Array<PrimExpr> source = MutateArray(op->source, fexpr);
-  Array<PrimExpr> init = MutateArray(op->init, fexpr);
+  Array<PrimExpr> source = op->source.Map(fexpr);
+  Array<PrimExpr> init = op->init.Map(fexpr);
 
   PrimExpr condition = this->VisitExpr(op->condition);
 
@@ -285,7 +285,7 @@ PrimExpr ExprMutator::VisitExpr_(const BroadcastNode* op) {
 
 PrimExpr ExprMutator::VisitExpr_(const ShuffleNode* op) {
   auto fexpr = [this](const PrimExpr& e) { return this->VisitExpr(e); };
-  auto vectors = MutateArray(op->vectors, fexpr);
+  auto vectors = op->vectors.Map(fexpr);
   if (vectors.same_as(op->vectors)) {
     return GetRef<PrimExpr>(op);
   } else {
diff --git a/src/tir/ir/functor_common.h b/src/tir/ir/functor_common.h
index 8b5a361a37c6..b9bb43ca6ba6 100644
--- a/src/tir/ir/functor_common.h
+++ b/src/tir/ir/functor_common.h
@@ -38,8 +38,7 @@ inline void VisitArray(const Array<T>& arr, F fvisit) {
 
 template <typename T, typename F>
 inline Array<T> MutateArray(Array<T> arr, F fmutate) {
-  arr.MutateByApply(fmutate);
-  return arr;
+  return arr.Map(fmutate);
 }
 
 }  // namespace tir
diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc
index 64c5d5d5ddde..2ffc5079246b 100644
--- a/src/tir/ir/index_map.cc
+++ b/src/tir/ir/index_map.cc
@@ -185,9 +185,8 @@ Array<PrimExpr> IndexMapNode::MapIndices(const Array<PrimExpr>& indices,
     analyzer = &local_analyzer;
   }
 
-  Array<PrimExpr> output = final_indices;
-  output.MutateByApply(
-      [&](const PrimExpr& index) { return analyzer->Simplify(Substitute(index, vmap)); });
+  Array<PrimExpr> output = final_indices.Map(
+      [&](PrimExpr index) { return analyzer->Simplify(Substitute(std::move(index), vmap)); });
 
   return output;
 }
diff --git a/src/tir/ir/specialize.cc b/src/tir/ir/specialize.cc
index 520e3ee03c92..ea68015bc73b 100644
--- a/src/tir/ir/specialize.cc
+++ b/src/tir/ir/specialize.cc
@@ -115,8 +115,7 @@ class PrimFuncSpecializer : public StmtExprMutator {
  private:
   Stmt VisitStmt_(const BlockNode* op) final {
     // Step.0. Define buffer mappings which is allocated inside the block
-    Array<Buffer> alloc_buffers = MutateArray(
-        op->alloc_buffers,
+    Array<Buffer> alloc_buffers = op->alloc_buffers.Map(
         std::bind(&PrimFuncSpecializer::MutateAllocBuffer, this, std::placeholders::_1));
 
     // Step.1. Recursively visit block body
@@ -124,11 +123,9 @@ class PrimFuncSpecializer : public StmtExprMutator {
     op = stmt.as<BlockNode>();
     ICHECK(op != nullptr);
 
-    Array<BufferRegion> reads = MutateArray(
-        op->reads,
+    Array<BufferRegion> reads = op->reads.Map(
         std::bind(&PrimFuncSpecializer::MutateBufferRegion, this, std::placeholders::_1));
-    Array<BufferRegion> writes = MutateArray(
-        op->writes,
+    Array<BufferRegion> writes = op->writes.Map(
         std::bind(&PrimFuncSpecializer::MutateBufferRegion, this, std::placeholders::_1));
 
     if (alloc_buffers.same_as(op->alloc_buffers) && reads.same_as(op->reads)) {
@@ -200,10 +197,9 @@ class PrimFuncSpecializer : public StmtExprMutator {
 
  private:
   Buffer MutateBuffer(const Buffer& buffer) {
-    Array<PrimExpr> shape =
-        MutateArray(buffer->shape, [this](const PrimExpr& e) { return VisitExpr(e); });
+    Array<PrimExpr> shape = buffer->shape.Map([this](const PrimExpr& e) { return VisitExpr(e); });
     Array<PrimExpr> strides =
-        MutateArray(buffer->strides, [this](const PrimExpr& e) { return VisitExpr(e); });
+        buffer->strides.Map([this](const PrimExpr& e) { return VisitExpr(e); });
 
     PrimExpr elem_offset = VisitExpr(buffer->elem_offset);
 
@@ -242,9 +238,8 @@ class PrimFuncSpecializer : public StmtExprMutator {
 
   BufferRegion MutateBufferRegion(const BufferRegion& buffer_region) {
     auto it = buffer_map_.find(buffer_region->buffer);
-    Array<Range> region =
-        MutateArray(buffer_region->region,
-                    std::bind(&PrimFuncSpecializer::MutateRange, this, std::placeholders::_1));
+    Array<Range> region = buffer_region->region.Map(
+        std::bind(&PrimFuncSpecializer::MutateRange, this, std::placeholders::_1));
     if (it == buffer_map_.end() && region.same_as(buffer_region->region)) {
       return buffer_region;
     } else {
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index c75eb52f9296..c2e2489cba92 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -183,9 +183,8 @@ class StmtMutator::Internal {
       return arr;
     } else {
       bool allow_cow = false;
-      Array<T> copy = arr;
       std::swap(allow_cow, self->allow_copy_on_write_);
-      copy.MutateByApply(fmutate);
+      Array<T> copy = arr.Map(fmutate);
       std::swap(allow_cow, self->allow_copy_on_write_);
       return copy;
     }
diff --git a/src/tir/schedule/primitive/decompose_padding.cc b/src/tir/schedule/primitive/decompose_padding.cc
index 93fb88e66619..c41760876722 100644
--- a/src/tir/schedule/primitive/decompose_padding.cc
+++ b/src/tir/schedule/primitive/decompose_padding.cc
@@ -212,16 +212,15 @@ static std::pair<Stmt, BlockRealize> CreateConstBlock(const BlockRealizeNode* re
 
   // create new write region
   ICHECK_EQ(block->writes.size(), 1U);
-  BufferRegion write_region =
-      BufferRegion(block->writes[0]->buffer,
-                   MutateArray(block->writes[0]->region, [rewrite_expr](const Range& r) {
-                     return Range::FromMinExtent(rewrite_expr(r->min), rewrite_expr(r->extent));
-                   }));
+  BufferRegion write_region = BufferRegion(
+      block->writes[0]->buffer, block->writes[0]->region.Map([rewrite_expr](const Range& r) {
+        return Range::FromMinExtent(rewrite_expr(r->min), rewrite_expr(r->extent));
+      }));
 
   // create block to fill const pad values
   BufferStore store = Downcast<BufferStore>(block->body);
   store.CopyOnWrite()->value = info.pad_value;
-  store.CopyOnWrite()->indices = MutateArray(store->indices, rewrite_expr);
+  store.CopyOnWrite()->indices = store->indices.Map(rewrite_expr);
   Block new_block(/*iter_vars=*/new_iter_vars, /*reads=*/{}, /*writes=*/{write_region},
                   /*name_hint=*/block->name_hint + "_pad_const", /*body=*/std::move(store));
 
@@ -307,7 +306,7 @@ static std::pair<Stmt, BlockRealize> CreateInBoundBlock(const BlockRealizeNode*
     return analyzer->Simplify(Substitute(e, repl_dict));
   };
   auto rewrite_region = [rewrite_expr](const Region& region) {
-    return MutateArray(region, [rewrite_expr](const Range& r) {
+    return region.Map([rewrite_expr](const Range& r) {
       return Range::FromMinExtent(rewrite_expr(r->min), rewrite_expr(r->extent));
     });
   };
@@ -324,7 +323,7 @@ static std::pair<Stmt, BlockRealize> CreateInBoundBlock(const BlockRealizeNode*
   // create new block realize node
   BufferStore store = Downcast<BufferStore>(block->body);
   store.CopyOnWrite()->value = rewrite_expr(info.in_bound_value);
-  store.CopyOnWrite()->indices = MutateArray(store->indices, rewrite_expr);
+  store.CopyOnWrite()->indices = store->indices.Map(rewrite_expr);
   Block new_block(/*iter_vars=*/new_iter_vars, /*reads=*/reads, /*writes=*/writes,
                   /*name_hint=*/block->name_hint, /*body=*/std::move(store));
   PrimExpr new_predicate = rewrite_expr(info.in_bound_predicate);
diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc
index b00005c58061..d99cc199fe5f 100644
--- a/src/tir/schedule/transform.cc
+++ b/src/tir/schedule/transform.cc
@@ -177,12 +177,12 @@ Stmt ReplaceBufferMutator::VisitStmt_(const BlockNode* block) {
   };
 
   // Step 1. Mutate `match_buffers`. If an old buffer appears as a source of MatchBufferRegion,
-  Array<MatchBufferRegion> match_buffers = MutateArray(block->match_buffers, f_mutate_match_buffer);
+  Array<MatchBufferRegion> match_buffers = block->match_buffers.Map(f_mutate_match_buffer);
   // Step 2. Mutate the read/write region.
-  Array<BufferRegion> reads = MutateArray(block->reads, f_mutate_read_write_region);
-  Array<BufferRegion> writes = MutateArray(block->writes, f_mutate_read_write_region);
+  Array<BufferRegion> reads = block->reads.Map(f_mutate_read_write_region);
+  Array<BufferRegion> writes = block->writes.Map(f_mutate_read_write_region);
   // Step 3. Mutate `alloc_buffers` for the old buffer allocated in this block.
-  Array<Buffer> alloc_buffers = MutateArray(block->alloc_buffers, f_mutate_alloc_buffers);
+  Array<Buffer> alloc_buffers = block->alloc_buffers.Map(f_mutate_alloc_buffers);
   // Step 4. Recursively mutate the block.
   Block mutated_block = Downcast<Block>(StmtMutator::VisitStmt_(block));
 
diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc
index 455140c75c13..f49b6b2ace8e 100644
--- a/src/tir/transforms/inject_virtual_thread.cc
+++ b/src/tir/transforms/inject_virtual_thread.cc
@@ -400,8 +400,8 @@ class VTInjector : public arith::IRMutatorWithAnalyzer {
 
     PrimExpr condition = this->VisitExpr(op->condition);
 
-    Array<PrimExpr> extents = op->extents;
-    extents.MutateByApply([this](const PrimExpr& extent) { return this->VisitExpr(extent); });
+    Array<PrimExpr> extents =
+        op->extents.Map([this](const PrimExpr& extent) { return this->VisitExpr(extent); });
 
     if (visit_touched_var_ && !vt_loop_injected_) {
       return InjectVTLoop(GetRef<Stmt>(op), true);
diff --git a/src/tir/transforms/lower_match_buffer.cc b/src/tir/transforms/lower_match_buffer.cc
index 5bde5cb90e2b..9b915da6290b 100644
--- a/src/tir/transforms/lower_match_buffer.cc
+++ b/src/tir/transforms/lower_match_buffer.cc
@@ -51,10 +51,10 @@ class MatchBufferLower : public StmtExprMutator {
     Stmt stmt = StmtExprMutator ::VisitStmt_(op);
     op = stmt.as<BlockNode>();
     ICHECK(op != nullptr);
-    Array<BufferRegion> reads = MutateArray(
-        op->reads, std::bind(&MatchBufferLower::VisitBufferRegion, this, std::placeholders::_1));
-    Array<BufferRegion> writes = MutateArray(
-        op->writes, std::bind(&MatchBufferLower::VisitBufferRegion, this, std::placeholders::_1));
+    Array<BufferRegion> reads =
+        op->reads.Map(std::bind(&MatchBufferLower::VisitBufferRegion, this, std::placeholders::_1));
+    Array<BufferRegion> writes = op->writes.Map(
+        std::bind(&MatchBufferLower::VisitBufferRegion, this, std::placeholders::_1));
 
     if (reads.same_as(op->reads) && writes.same_as(op->writes) && op->match_buffers.empty()) {
       return stmt;
diff --git a/src/tir/transforms/renew_defs.cc b/src/tir/transforms/renew_defs.cc
index c717dc9b98f2..a185916a9a4c 100644
--- a/src/tir/transforms/renew_defs.cc
+++ b/src/tir/transforms/renew_defs.cc
@@ -96,18 +96,16 @@ class RenewDefMutator : public StmtExprMutator {
 
   Stmt VisitStmt_(const BlockNode* op) final {
     // Step 0. Re-define Itervars
-    Array<IterVar> iter_vars = MutateArray(
-        op->iter_vars, std::bind(&RenewDefMutator::VisitIterVar, this, std::placeholders::_1));
+    Array<IterVar> iter_vars =
+        op->iter_vars.Map(std::bind(&RenewDefMutator::VisitIterVar, this, std::placeholders::_1));
 
     // Step 1. Re-define buffers allocate under the block
-    Array<Buffer> alloc_buffers = MutateArray(
-        op->alloc_buffers,
+    Array<Buffer> alloc_buffers = op->alloc_buffers.Map(
         std::bind(&RenewDefMutator::VisitBuffer, this, std::placeholders::_1, /*define=*/true));
 
     // Step 2. Re-define match_buffers
-    Array<MatchBufferRegion> match_buffers =
-        MutateArray(op->match_buffers,
-                    std::bind(&RenewDefMutator::VisitMatchBuffer, this, std::placeholders::_1));
+    Array<MatchBufferRegion> match_buffers = op->match_buffers.Map(
+        std::bind(&RenewDefMutator::VisitMatchBuffer, this, std::placeholders::_1));
 
     // Step 3. Visit body
     Stmt stmt = StmtExprMutator::VisitStmt_(op);
@@ -115,10 +113,10 @@ class RenewDefMutator : public StmtExprMutator {
     ICHECK(op);
 
     // Step 4. Revisit access region
-    Array<BufferRegion> reads = MutateArray(
-        op->reads, std::bind(&RenewDefMutator::VisitBufferRegion, this, std::placeholders::_1));
-    Array<BufferRegion> writes = MutateArray(
-        op->writes, std::bind(&RenewDefMutator::VisitBufferRegion, this, std::placeholders::_1));
+    Array<BufferRegion> reads =
+        op->reads.Map(std::bind(&RenewDefMutator::VisitBufferRegion, this, std::placeholders::_1));
+    Array<BufferRegion> writes =
+        op->writes.Map(std::bind(&RenewDefMutator::VisitBufferRegion, this, std::placeholders::_1));
 
     // Step 5. Regenerate block. Since the defs are changed, we need to create a new block
     auto n = make_object<BlockNode>(*op);
@@ -203,9 +201,9 @@ class RenewDefMutator : public StmtExprMutator {
     // update data
     Var data = Downcast<Var>(redefine_if_is_var(buffer->data));
     // update shape
-    Array<PrimExpr> shape = MutateArray(buffer->shape, redefine_if_is_var);
+    Array<PrimExpr> shape = buffer->shape.Map(redefine_if_is_var);
     // update strides
-    Array<PrimExpr> strides = MutateArray(buffer->strides, redefine_if_is_var);
+    Array<PrimExpr> strides = buffer->strides.Map(redefine_if_is_var);
     // update elem_offset
     PrimExpr elem_offset = redefine_if_is_var(buffer->elem_offset);
 
@@ -242,10 +240,10 @@ class RenewDefMutator : public StmtExprMutator {
       return Downcast<Buffer>((*it).second);
     }
     Var data = Downcast<Var>(VisitExpr(buffer->data));
-    Array<PrimExpr> shape = MutateArray(
-        buffer->shape, std::bind(&RenewDefMutator::VisitExpr, this, std::placeholders::_1));
-    Array<PrimExpr> strides = MutateArray(
-        buffer->strides, std::bind(&RenewDefMutator::VisitExpr, this, std::placeholders::_1));
+    Array<PrimExpr> shape =
+        buffer->shape.Map(std::bind(&RenewDefMutator::VisitExpr, this, std::placeholders::_1));
+    Array<PrimExpr> strides =
+        buffer->strides.Map(std::bind(&RenewDefMutator::VisitExpr, this, std::placeholders::_1));
     PrimExpr elem_offset = VisitExpr(buffer->elem_offset);
 
     auto n = make_object<BufferNode>(*buffer.get());
@@ -276,9 +274,8 @@ class RenewDefMutator : public StmtExprMutator {
 
   BufferRegion VisitBufferRegion(const BufferRegion& buffer_region) {
     Buffer buffer = VisitBuffer(buffer_region->buffer);
-    Array<Range> region =
-        MutateArray(buffer_region->region,
-                    std::bind(&RenewDefMutator::VisitRange, this, std::placeholders::_1));
+    Array<Range> region = buffer_region->region.Map(
+        std::bind(&RenewDefMutator::VisitRange, this, std::placeholders::_1));
     if (buffer.same_as(buffer_region->buffer) && region.same_as(buffer_region->region)) {
       return buffer_region;
     } else {
diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc
index 5c5a47e86a9a..3cc17847e69b 100644
--- a/src/tir/transforms/vectorize_loop.cc
+++ b/src/tir/transforms/vectorize_loop.cc
@@ -379,8 +379,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     auto load = GetRef<BufferLoad>(op);
 
     auto fmutate = [this](const PrimExpr& index) { return this->VisitExpr(index); };
-    Array<PrimExpr> indices = op->indices;
-    indices.MutateByApply(fmutate);
+    Array<PrimExpr> indices = op->indices.Map(fmutate);
 
     if (!indices.same_as(op->indices)) {
       auto writer = load.CopyOnWrite();
@@ -428,8 +427,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     auto store = GetRef<BufferStore>(op);
 
     auto fmutate = [this](const PrimExpr& index) { return this->VisitExpr(index); };
-    Array<PrimExpr> indices = op->indices;
-    indices.MutateByApply(fmutate);
+    Array<PrimExpr> indices = op->indices.Map(fmutate);
 
     PrimExpr value = this->VisitExpr(op->value);
 
diff --git a/tests/cpp/container_test.cc b/tests/cpp/container_test.cc
index f6c4fb4b67d6..d75a510d0c95 100644
--- a/tests/cpp/container_test.cc
+++ b/tests/cpp/container_test.cc
@@ -173,6 +173,141 @@ TEST(Array, Mutate) {
   ICHECK(list2[1].same_as(z));
 }
 
+TEST(Array, MutateInPlaceForUniqueReference) {
+  using namespace tvm;
+  Var x("x");
+  Array<Var> arr{x, x};
+  ICHECK(arr.unique());
+  auto* before = arr.get();
+
+  arr.MutateByApply([](Var) { return Var("y"); });
+  auto* after = arr.get();
+  ICHECK_EQ(before, after);
+}
+
+TEST(Array, CopyWhenMutatingNonUniqueReference) {
+  using namespace tvm;
+  Var x("x");
+  Array<Var> arr{x, x};
+  Array<Var> arr2 = arr;
+
+  ICHECK(!arr.unique());
+  auto* before = arr.get();
+
+  arr.MutateByApply([](Var) { return Var("y"); });
+  auto* after = arr.get();
+  ICHECK_NE(before, after);
+}
+
+TEST(Array, Map) {
+  // Basic functionality
+  using namespace tvm;
+  Var x("x");
+  Var y("y");
+  Array<Var> var_arr{x, y};
+  Array<PrimExpr> expr_arr = var_arr.Map([](Var var) -> PrimExpr { return var + 1; });
+
+  ICHECK_NE(var_arr.get(), expr_arr.get());
+  ICHECK(expr_arr[0]->IsInstance<AddNode>());
+  ICHECK(expr_arr[1]->IsInstance<AddNode>());
+  ICHECK(expr_arr[0].as<AddNode>()->a.same_as(x));
+  ICHECK(expr_arr[1].as<AddNode>()->a.same_as(y));
+}
+
+TEST(Array, MapToSameTypeWithoutCopy) {
+  // If the applied map doesn't alter the contents, we can avoid a
+  // copy.
+  using namespace tvm;
+  Var x("x");
+  Var y("y");
+  Array<Var> var_arr{x, y};
+  Array<Var> var_arr2 = var_arr.Map([](Var var) { return var; });
+
+  ICHECK_EQ(var_arr.get(), var_arr2.get());
+}
+
+TEST(Array, MapToSameTypeWithCopy) {
+  // If the applied map does alter the contents, we need to make a
+  // copy.  The loop in this test is to validate correct behavior
+  // regardless of where the first discrepancy occurs.
+  using namespace tvm;
+  Var x("x");
+  Var y("y");
+  Var z("z");
+  Var replacement("replacement");
+  for (size_t i = 0; i < 2; i++) {
+    Array<Var> var_arr{x, y, z};
+    Var to_replace = var_arr[i];
+    Array<Var> var_arr2 =
+        var_arr.Map([&](Var var) { return var.same_as(to_replace) ? replacement : var; });
+
+    ICHECK_NE(var_arr.get(), var_arr2.get());
+
+    // The original array is unchanged
+    ICHECK_EQ(var_arr.size(), 3);
+    ICHECK(var_arr[0].same_as(x));
+    ICHECK(var_arr[1].same_as(y));
+
+    // The returned array has one of the elements replaced.
+    ICHECK_EQ(var_arr2.size(), 3);
+    ICHECK(var_arr2[i].same_as(replacement));
+    ICHECK(i == 0 || var_arr2[0].same_as(x));
+    ICHECK(i == 1 || var_arr2[1].same_as(y));
+    ICHECK(i == 2 || var_arr2[2].same_as(z));
+  }
+}
+
+TEST(Array, MapToSuperclassWithoutCopy) {
+  // If a map is converting to a superclass, and the mapping function
+  // array doesn't change the value other than a cast, we can avoid a
+  // copy.
+  using namespace tvm;
+  Var x("x");
+  Var y("y");
+  Array<Var> var_arr{x, y};
+  Array<PrimExpr> expr_arr = var_arr.Map([](Var var) { return PrimExpr(var); });
+
+  ICHECK_EQ(var_arr.get(), expr_arr.get());
+}
+
+TEST(Array, MapToSubclassWithoutCopy) {
+  // If a map is converting to a subclass, and the mapped array
+  // happens to only contain instances of that subclass, we can
+  // able to avoid a copy.
+  using namespace tvm;
+  Var x("x");
+  Var y("y");
+  Array<PrimExpr> expr_arr{x, y};
+  Array<Var> var_arr = expr_arr.Map([](PrimExpr expr) -> Var { return Downcast<Var>(expr); });
+
+  ICHECK_EQ(var_arr.get(), expr_arr.get());
+}
+
+TEST(Array, MapToOptionalWithoutCopy) {
+  // Optional<T> and T both have the same T::ContainerType, just with
+  // different interfaces for handling `T::data_ == nullptr`.
+  using namespace tvm;
+  Var x("x");
+  Var y("y");
+  Array<Var> var_arr{x, y};
+  Array<Optional<Var>> opt_arr = var_arr.Map([](Var var) { return Optional<Var>(var); });
+
+  ICHECK_EQ(var_arr.get(), opt_arr.get());
+}
+
+TEST(Array, MapFromOptionalWithoutCopy) {
+  // Optional<T> and T both have the same T::ContainerType, just with
+  // different interfaces for handling `T::data_ == nullptr`.
+  using namespace tvm;
+  Var x("x");
+  Var y("y");
+  Array<Optional<Var>> opt_arr{x, y};
+  Array<Var> var_arr =
+      opt_arr.Map([](Optional<Var> var) { return var.value_or(Var("undefined")); });
+
+  ICHECK_EQ(var_arr.get(), opt_arr.get());
+}
+
 TEST(Array, Iterator) {
   using namespace tvm;
   Array<PrimExpr> array{1, 2, 3};

From 52dbf102cdba1186e517977ee02aaa7bbe46d0df Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Tue, 20 Sep 2022 14:55:16 -0700
Subject: [PATCH 217/704] Fix caffe, boost install in Python venvs by creating
 python3.X link (#12828)

* Fix caffe, boost install in Python venvs by creating python3.X link.

* Use getsitepackages()
---
 docker/install/ubuntu_install_boost.sh  | 3 +++
 docker/install/ubuntu_install_caffe.sh  | 4 ++--
 docker/install/ubuntu_install_python.sh | 7 +++++++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/docker/install/ubuntu_install_boost.sh b/docker/install/ubuntu_install_boost.sh
index e226bbc5d96e..1c04c8a8ff20 100755
--- a/docker/install/ubuntu_install_boost.sh
+++ b/docker/install/ubuntu_install_boost.sh
@@ -24,6 +24,9 @@ cleanup() {
 
 trap cleanup 0
 
+# NOTE: by default, tvm-venv python is used. Install boost on the system.
+PATH=${PATH/${TVM_VENV}\/bin:/}
+
 curl -LO https://boostorg.jfrog.io/artifactory/main/release/1.67.0/source/boost_1_67_0.tar.gz
 BOOST_HASH=8c247e040303a97895cee9c9407ef205e2c3ab09f0b8320997835ad6221dff23a87231629498ccfd0acca473f74e9ec27b8bd774707b062228df1e5f72d44c92
 echo "$BOOST_HASH" boost_1_67_0.tar.gz | sha512sum -c
diff --git a/docker/install/ubuntu_install_caffe.sh b/docker/install/ubuntu_install_caffe.sh
index 4d9763b69aa3..1e42270e267a 100755
--- a/docker/install/ubuntu_install_caffe.sh
+++ b/docker/install/ubuntu_install_caffe.sh
@@ -65,5 +65,5 @@ cd / && rm -rf /caffe_src
 
 PYCAFFE_ROOT=${CAFFE_HOME}/python
 echo "${CAFFE_HOME}/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig
-VENV_SITE_PACKAGE=$(pip3 show numpy | grep "Location:" | cut -d ' ' -f 2)
-ln -s ${PYCAFFE_ROOT}/caffe ${VENV_SITE_PACKAGE}/caffe
+site_packages=$("${TVM_VENV}/bin/python3" -c 'import site; print(site.getsitepackages()[0])')
+ln -s ${PYCAFFE_ROOT}/caffe "${site_packages}/caffe"
diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh
index 66a80e1fdc52..fb31c41dccea 100755
--- a/docker/install/ubuntu_install_python.sh
+++ b/docker/install/ubuntu_install_python.sh
@@ -65,6 +65,13 @@ mkdir -p "${venv_dir}"
 python3 -mvenv "${TVM_VENV}"
 . "${TVM_VENV}/bin/activate"
 
+# NOTE: Only in python3.9 does venv guarantee it creates the python3.X binary.
+# This is needed so that cmake's find_package(PythonInterp) works inside the venv.
+# See https://bugs.python.org/issue39656
+if [ ! -e "${TVM_VENV}/bin/python${PYTHON_VERSION}" ]; then
+    ln -s "${TVM_VENV}/bin/python" "${TVM_VENV}/bin/python${PYTHON_VERSION}"
+fi
+
 # Update pip to match version used to produce requirements-hashed.txt. This step
 # is necessary so that pip's dependency solver is recent.
 pip_spec=$(cat /install/python/bootstrap/lockfiles/constraints-${PYTHON_VERSION}.txt | grep 'pip==')

From fa5045bf6923c94758e15a7fad7c0904440a4698 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 21 Sep 2022 09:32:12 +0900
Subject: [PATCH 218/704] [Metaschedule] MultiLevelTiling for wide vector
 architectures (#12845)

* [Metaschedule] Introduce MultiLevelTiling for wide vector architecture

* update test

* format

* cpplint
---
 include/tvm/meta_schedule/schedule_rule.h     |  15 +++
 .../meta_schedule/schedule_rule/__init__.py   |   1 +
 .../schedule_rule/multi_level_tiling.py       |  37 ++++++
 .../schedule_rule/multi_level_tiling.cc       |  35 +++--
 .../schedule_rule/multi_level_tiling.h        |   3 +
 .../multi_level_tiling_wide_vector.cc         | 120 ++++++++++++++++++
 .../test_meta_schedule_schedule_rule_mlt.py   | 108 +++++++++++++++-
 7 files changed, 307 insertions(+), 12 deletions(-)
 create mode 100644 src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc

diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h
index 55704cf4a97d..2c9da1df9dae 100644
--- a/include/tvm/meta_schedule/schedule_rule.h
+++ b/include/tvm/meta_schedule/schedule_rule.h
@@ -187,6 +187,21 @@ class ScheduleRule : public runtime::ObjectRef {
       Optional<Array<Integer>> vector_load_lens, Optional<Map<String, ObjectRef>> reuse_read,
       Optional<Map<String, ObjectRef>> reuse_write, bool use_software_pipeline);
 
+  /*!
+   * \brief Extension of MultiLevelTiling for backends with wide vectors.
+   * The loop over the innermost spatial axis of the output buffer is always vectorized with the
+   * maximum vector length.
+   * \param structure The tiling structure. 'SSRSRS' is recommended.
+   * \param vector_length_in_bits The length of a vector register in bits.
+   * \param max_innermost_factor The maximum size of the innermost factor. NullOpt means no limit
+   * \param reuse_read Data reuse configuration for reading. NullOpt means no reuse.
+   * \param reuse_write Data reuse configuration for writing. NullOpt means no reuse.
+   * \return The schedule rule created
+   */
+  TVM_DLL static ScheduleRule MultiLevelTilingWideVector(
+      String structure, Integer vector_length_in_bits, Optional<Integer> max_innermost_factor,
+      Optional<Map<String, ObjectRef>> reuse_read, Optional<Map<String, ObjectRef>> reuse_write);
+
   /*!
    * \brief Create a rule: add-rfactor to some blocks if needed
    * \param max_jobs_per_core The maximum number of jobs to be launched per CPU core. It sets the
diff --git a/python/tvm/meta_schedule/schedule_rule/__init__.py b/python/tvm/meta_schedule/schedule_rule/__init__.py
index dd0119b0a7f8..a015d0eb1ab2 100644
--- a/python/tvm/meta_schedule/schedule_rule/__init__.py
+++ b/python/tvm/meta_schedule/schedule_rule/__init__.py
@@ -28,6 +28,7 @@
     MultiLevelTilingWithIntrin,
     ReuseType,
     MultiLevelTilingTensorCore,
+    MultiLevelTilingWideVector,
 )
 from .parallel_vectorize_unroll import ParallelizeVectorizeUnroll
 from .random_compute_location import RandomComputeLocation
diff --git a/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py b/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
index 6703bc5716e9..e91382dd017a 100644
--- a/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
+++ b/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
@@ -187,3 +187,40 @@ def __init__(
             reuse_write.as_dict() if reuse_write is not None else None,
             use_software_pipeline,
         )
+
+
+@register_object("meta_schedule.MultiLevelTilingWideVector")
+class MultiLevelTilingWideVector(ScheduleRule):
+    """Extension of MultiLevelTiling for backends with wide vectors. The loop over the innermost
+    spatial axis of the output buffer is always vectorized with the maximum vector length.
+
+    Parameters
+    ----------
+    structure : str
+        The tiling structure. 'SSRSRS' is recommended.
+    vector_length_in_bits: int
+        The length of a vector register in bits.
+    max_innermost_factor : Optional[int]
+        The maximum size of the innermost factor. None means no limit
+    reuse_read : Optional[ReuseType]
+        Data reuse configuration for reading. None means no reuse.
+    reuse_write : Optional[ReuseType]
+        Data reuse configuration for writing. None means no reuse.
+    """
+
+    def __init__(
+        self,
+        structure: str,
+        vector_length_in_bits: int,
+        max_innermost_factor: Optional[int] = None,
+        reuse_read: Optional[ReuseType] = None,
+        reuse_write: Optional[ReuseType] = None,
+    ) -> None:
+        self.__init_handle_by_constructor__(
+            _ffi_api.ScheduleRuleMultiLevelTilingWideVector,  # type: ignore # pylint: disable=no-member
+            structure,
+            vector_length_in_bits,
+            max_innermost_factor,
+            reuse_read.as_dict() if reuse_read is not None else None,
+            reuse_write.as_dict() if reuse_write is not None else None,
+        )
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
index 1625a27b9aaf..2ae6714f55d8 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -166,6 +166,17 @@ std::vector<State> MultiLevelTilingNode::AddWriteReuse(State state) const {
   return results;
 }
 
+Array<tir::LoopRV> MultiLevelTilingNode::SplitLoop(const Schedule& sch, BlockRV block, LoopRV loop,
+                                                   int n_tiles) const {
+  Array<tir::ExprRV> factors = sch->SamplePerfectTile(
+      /*loop=*/loop,
+      /*n=*/n_tiles,
+      /*max_innermost_factor=*/max_innermost_factor);
+  Array<tir::LoopRV> splits = sch->Split(/*loop=*/loop,
+                                         /*factors=*/{factors.begin(), factors.end()});
+  return splits;
+}
+
 std::vector<State> MultiLevelTilingNode::TileLoopNest(State state) const {
   Schedule& sch = state->sch;
   const BlockRV& block_rv = state->block_rv;
@@ -179,6 +190,7 @@ std::vector<State> MultiLevelTilingNode::TileLoopNest(State state) const {
   for (int i = 0, n = loops.size(); i < n; ++i) {
     LoopRV loop = loops[i];
     const std::vector<int>* idx = nullptr;
+
     if (iter_types[i] == IterVarType::kDataPar) {
       idx = &s_indices_;
       if (spatial_loop_product != -1) {
@@ -193,17 +205,18 @@ std::vector<State> MultiLevelTilingNode::TileLoopNest(State state) const {
     } else {
       continue;
     }
-    // Do the split
-    int n_tiles = idx->size();
-    Array<tir::ExprRV> factors = sch->SamplePerfectTile(
-        /*loop=*/loop,
-        /*n=*/n_tiles,
-        /*max_innermost_factor=*/max_innermost_factor);
-    Array<tir::LoopRV> splits = sch->Split(/*loop=*/loop,
-                                           /*factors=*/{factors.begin(), factors.end()});
-    // Put every tile to its slot
-    for (int j = 0; j < n_tiles; ++j) {
-      tiles[idx->at(j)].push_back(splits[j]);
+
+    const int n_tiles = idx->size();
+
+    if (n_tiles == 1) {
+      tiles[idx->at(0)].push_back(loop);
+    } else {
+      auto splits = SplitLoop(sch, block_rv, loop, n_tiles);
+
+      // Put every tile to its slot
+      for (int j = 0; j < n_tiles; ++j) {
+        tiles[idx->at(j)].push_back(splits[j]);
+      }
     }
   }
   // Step 3. Reorder to organize the tiles
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.h b/src/meta_schedule/schedule_rule/multi_level_tiling.h
index 47da878c3be0..8f55e8e7e4e4 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.h
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.h
@@ -161,6 +161,9 @@ class MultiLevelTilingNode : public ScheduleRuleNode {
  protected:
   virtual std::vector<State> ApplySubRules(std::vector<State> states);
 
+  virtual Array<tir::LoopRV> SplitLoop(const tir::Schedule& sch, tir::BlockRV block,
+                                       tir::LoopRV loop, int n_tiles) const;
+
   // Annotate a block to use cooperative fetching
   void AnnotateCooperativeFetching(tir::Schedule* sch, const tir::BlockRV& block) const;
 
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc
new file mode 100644
index 000000000000..f5ec009a9b28
--- /dev/null
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "../../tir/schedule/analysis.h"
+#include "../../tir/schedule/transform.h"
+#include "../utils.h"
+#include "multi_level_tiling.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+using tir::BlockRV;
+using tir::LoopRV;
+using tir::Schedule;
+
+/*!
+ * \brief Extension of MultiLevelTiling for backends with wide vectors.
+ * The loop over the innermost spatial axis of the output buffer is always vectorized with the
+ * maximum vector length.
+ */
+class MultiLevelTilingWideVectorNode : public MultiLevelTilingNode {
+ public:
+  size_t vector_length_in_bits;
+
+  static constexpr const char* _type_key = "meta_schedule.MultiLevelTilingWideVector";
+  TVM_DECLARE_FINAL_OBJECT_INFO(MultiLevelTilingWideVectorNode, MultiLevelTilingNode);
+
+ protected:
+  Array<tir::LoopRV> SplitLoop(const Schedule& sch, BlockRV block, LoopRV loop, int n_tiles) const;
+};
+
+Array<tir::LoopRV> MultiLevelTilingWideVectorNode::SplitLoop(const Schedule& sch, BlockRV block_rv,
+                                                             LoopRV loop_rv, int n_tiles) const {
+  const tir::ForNode* loop = TVM_SREF_TO_FOR(sch->GetSRef(loop_rv));
+  const tir::StmtSRef block_sref = sch->GetSRef(block_rv);
+  const tir::BlockNode* block_node = block_sref->StmtAs<tir::BlockNode>();
+  const tir::BlockRealize block_realize = tir::GetBlockRealize(sch->state(), block_sref);
+  ICHECK(block_node && block_node->writes.size() == 1);
+
+  const auto out_dtype = block_node->writes[0]->buffer->dtype;
+  const int vec_len = vector_length_in_bits / out_dtype.bits();
+
+  // Determine if this loop is over the innermost axis of the output buffer.
+  // In the example below, we look for a loop whose loop var is bound to the axis co.
+
+  // for (i0, 0, 1) {
+  //    for (i1, 0, 56) {
+  //      for (i2, 0, 56) {
+  //        for (i3, 0, 64) {
+  //          for (i4, 0, 3) {
+  //            for (i5, 0, 3) {
+  //              for (i6, 0, 64) {
+  //                block conv2d_nhwc(...) {
+  //                  ...
+  //                  bind(co, i3)
+  //                  ...
+  //                  writes([conv2d_nhwc[n, h, w, co]])
+  //                  ...
+  //                  conv2d_nhwc[n, h, w, co] = ...
+  // }
+  const size_t innermost_axis = block_node->writes[0]->region.size() - 1;
+  const PrimExpr innermost_iter_value = block_realize->iter_values[innermost_axis];
+
+  if (!arith::Analyzer().CanProve(loop->loop_var == innermost_iter_value)) {
+    // If this is not the innermost spatial loop, split the loop in the normal way.
+    return MultiLevelTilingNode::SplitLoop(sch, block_rv, loop_rv, n_tiles);
+  } else {
+    // We split the innermost spatial loop in a way that always uses the maximum vector length.
+    const int64_t* extent_int = tir::GetLoopIntExtent(loop);
+    if (extent_int && *extent_int > vec_len) {
+      Array<tir::LoopRV> inner_splits = sch->Split(/*loop=*/loop_rv,
+                                                   /*factors=*/{NullOpt, PrimExpr(vec_len)});
+      Array<tir::ExprRV> outer_factors = sch->SamplePerfectTile(
+          /*loop=*/inner_splits[0],
+          /*n=*/n_tiles - 1,
+          /*max_innermost_factor=*/max_innermost_factor);
+      Array<tir::LoopRV> outer_splits = sch->Split(
+          /*loop=*/inner_splits[0], /*factors=*/{outer_factors.begin(), outer_factors.end()});
+      outer_splits.push_back(inner_splits[1]);
+      return outer_splits;
+    } else {
+      Array<tir::ExprRV> factors(n_tiles - 1, PrimExpr(1));
+      factors.push_back(loop->extent);
+      return sch->Split(/*loop=*/loop_rv,
+                        /*factors=*/{factors.begin(), factors.end()});
+    }
+  }
+}
+
+ScheduleRule ScheduleRule::MultiLevelTilingWideVector(
+    String structure, Integer vector_length_in_bits, Optional<Integer> max_innermost_factor,
+    Optional<Map<String, ObjectRef>> reuse_read, Optional<Map<String, ObjectRef>> reuse_write) {
+  auto node = MultiLevelTilingInitCommon<MultiLevelTilingWideVectorNode>(
+      structure, NullOpt, max_innermost_factor, NullOpt, reuse_read, reuse_write);
+  node->vector_length_in_bits = vector_length_in_bits->value;
+  return ScheduleRule(node);
+}
+
+TVM_REGISTER_NODE_TYPE(MultiLevelTilingWideVectorNode);
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleMultiLevelTilingWideVector")
+    .set_body_typed(ScheduleRule::MultiLevelTilingWideVector);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
index 939ccbe54fa6..d9d078106333 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
@@ -16,7 +16,7 @@
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 from tvm import meta_schedule as ms
-from tvm import te
+from tvm import te, target
 from tvm.meta_schedule.testing import te_workload
 from tvm.meta_schedule.testing.schedule_rule import get_rules
 from tvm.meta_schedule.testing.space_generation import check_sketches
@@ -521,9 +521,115 @@ def sum_with_trivial_block_iter(
     assert not sch.trace.simplified(remove_postproc=True).insts
 
 
+def test_multi_level_tiling_hexagon():
+    @T.prim_func
+    def cpu_conv2d_nhwc(
+        inputs: T.Buffer[(1, 56, 56, 64), "float16"],
+        weight: T.Buffer[(3, 3, 64, 64), "float16"],
+        conv2d_nhwc: T.Buffer[(1, 56, 56, 64), "float16"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        PadInput = T.alloc_buffer([1, 58, 58, 64], dtype="float16")
+        for i0, i1, i2, i3 in T.grid(1, 58, 58, 64):
+            with T.block("PadInput"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1])
+                T.writes(PadInput[i0_1, i1_1, i2_1, i3_1])
+                PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(
+                    1 <= i1_1 and i1_1 < 57 and 1 <= i2_1 and i2_1 < 57,
+                    inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1],
+                    T.float16(0),
+                    dtype="float16",
+                )
+        for (
+            i0_0,
+            i1_0,
+            i2_0,
+            i3_0,
+            i4_0,
+            i5_0,
+            i6_0,
+            i0_1_1,
+            i1_1_1,
+            i2_1_1,
+            i3_1_1,
+            i4_1,
+            i5_1,
+            i6_1,
+            i0_2,
+            i1_2,
+            i2_2,
+            i3_2,
+        ) in T.grid(1, 1, 2, 1, 3, 3, 16, 1, 14, 2, 1, 1, 1, 4, 1, 4, 14, 64):
+            with T.block("conv2d_nhwc"):
+                n = T.axis.spatial(1, i0_1_1 + i0_2 + i0_0)
+                h = T.axis.spatial(56, i1_0 * 56 + i1_1_1 * 4 + i1_2)
+                w = T.axis.spatial(56, i2_0 * 28 + i2_1_1 * 14 + i2_2)
+                co = T.axis.spatial(64, i3_0 * 64 + i3_1_1 * 64 + i3_2)
+                rh = T.axis.reduce(3, i4_1 + i4_0)
+                rw = T.axis.reduce(3, i5_0 + i5_1)
+                rc = T.axis.reduce(64, i6_0 * 4 + i6_1)
+                T.reads(PadInput[n, h + rh, w + rw, co // 64 * 64 + rc], weight[rh, rw, rc, co])
+                T.writes(conv2d_nhwc[n, h, w, co])
+                T.block_attr({"meta_schedule.tiling_structure": "SRSRS"})
+                with T.init():
+                    conv2d_nhwc[n, h, w, co] = T.float16(0)
+                conv2d_nhwc[n, h, w, co] = (
+                    conv2d_nhwc[n, h, w, co]
+                    + PadInput[n, h + rh, w + rw, co // 64 * 64 + rc] * weight[rh, rw, rc, co]
+                )
+
+    target_hexagon = target.hexagon("v69", num_cores=4)
+
+    I = 64
+    O = 64
+    H = 56
+    W = 56
+
+    mod = te.create_prim_func(
+        te_workload.conv2d_nhwc(1, H, W, I, O, 3, 1, 1, 1, in_dtype="float16", out_dtype="float16")
+    )
+
+    actual = ms.TuneContext(
+        mod=mod,
+        target=Target(target_hexagon, host=target_hexagon),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[
+            ms.schedule_rule.MultiLevelTilingWideVector(
+                structure="SRSRS",
+                vector_length_in_bits=1024,
+                max_innermost_factor=64,
+                reuse_read=None,
+                reuse_write=None,
+            )
+        ],
+        task_name="test",
+    ).generate_design_space()
+
+    decision_0 = [
+        ("SamplePerfectTile", [1, 1, 1]),
+        ("SamplePerfectTile", [1, 14, 4]),
+        ("SamplePerfectTile", [2, 2, 14]),
+        ("SamplePerfectTile", [3, 1]),
+        ("SamplePerfectTile", [3, 1]),
+        ("SamplePerfectTile", [16, 4]),
+    ]
+
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[cpu_conv2d_nhwc],
+        expected_decisions=[decision_0],
+    )
+
+
 if __name__ == "__main__":
     test_cpu_matmul()
     test_cpu_matmul_relu()
     test_cuda_matmul()
     test_cuda_matmul_relu()
     test_cuda_sum_with_trivial_block_iter()
+    test_multi_level_tiling_hexagon()

From d4e3207cca1bae532e6e616eca2e80191e45b437 Mon Sep 17 00:00:00 2001
From: "yin.changsheng" <yin.changsheng@intellif.com>
Date: Wed, 21 Sep 2022 11:03:14 +0800
Subject: [PATCH 219/704] [TIR] Enhance RemoveNoOp pass to remove negative loop
 (#12836)

---
 src/tir/transforms/remove_no_op.cc               | 11 +++++++++++
 .../unittest/test_tir_transform_remove_no_op.py  | 16 +++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/tir/transforms/remove_no_op.cc b/src/tir/transforms/remove_no_op.cc
index ce0d9b87c433..8728817aad57 100644
--- a/src/tir/transforms/remove_no_op.cc
+++ b/src/tir/transforms/remove_no_op.cc
@@ -31,6 +31,7 @@
 
 #include <unordered_map>
 
+#include "../../arith/const_fold.h"
 #include "ir_utils.h"
 
 namespace tvm {
@@ -87,7 +88,14 @@ class NoOpRemover : public StmtMutator {
     }
   }
   Stmt VisitStmt_(const ForNode* op) final {
+    var_range_map_[op->loop_var.get()] = arith::IntSet::FromMinExtent(op->min, op->extent);
+    auto extent_range = arith::EvalSet(op->extent, var_range_map_);
+    if (!arith::is_neg_inf(extent_range.max()) && !arith::is_pos_inf(extent_range.max()) &&
+        analyzer_.CanProve(extent_range.max() <= 0)) {
+      return Evaluate(0);
+    }
     Stmt stmt = StmtMutator::VisitStmt_(op);
+    var_range_map_.erase(op->loop_var.get());
     op = stmt.as<ForNode>();
     if (is_zero(op->extent)) {
       return Evaluate(0);
@@ -162,6 +170,9 @@ class NoOpRemover : public StmtMutator {
     }
     return stmt.defined() ? stmt : Evaluate(0);
   }
+
+  std::unordered_map<const VarNode*, arith::IntSet> var_range_map_;
+  arith::Analyzer analyzer_;
 };
 
 Stmt RemoveNoOp(Stmt stmt) { return NoOpRemover()(std::move(stmt)); }
diff --git a/tests/python/unittest/test_tir_transform_remove_no_op.py b/tests/python/unittest/test_tir_transform_remove_no_op.py
index e80d46193507..820e32eb7e72 100644
--- a/tests/python/unittest/test_tir_transform_remove_no_op.py
+++ b/tests/python/unittest/test_tir_transform_remove_no_op.py
@@ -16,6 +16,8 @@
 # under the License.
 import tvm
 from tvm import te
+from tvm.script import tir as T
+import tvm.testing
 
 
 def nop():
@@ -68,5 +70,17 @@ def test_remove_no_op():
     assert isinstance(ret, tvm.tir.Evaluate)
 
 
+def test_remove_no_op_with_invalid_extent():
+    @T.prim_func
+    def main(A: T.Buffer[(16), "int32"], B: T.Buffer[(16), "int32"]) -> None:
+        for i in T.serial(16):
+            for j in T.serial(i - 20):
+                B[i] = A[i] + j
+
+    mod = tvm.ir.module.IRModule.from_expr(main)
+    ret = tvm.tir.transform.RemoveNoOp()(mod)["main"].body
+    assert isinstance(ret, tvm.tir.Evaluate)
+
+
 if __name__ == "__main__":
-    test_remove_no_op()
+    tvm.testing.main()

From b051cad9f40671675d7101ac510b6f733cff0bc2 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Tue, 20 Sep 2022 22:51:05 -0700
Subject: [PATCH 220/704] [FIX,PROFILING] Fix gpu timer name and lookup
 (#12849)

* [FIX,PROFILING] Fix gpu timer name and lookup

In the switch from gpu to cuda naming, the cuda timer was passed over.
Renaming it to "profiling.timer.cuda" so it is correctly picked up by
the timing mechanisms.

* warn if timer impl does not exist
---
 src/runtime/cuda/cuda_device_api.cc | 18 ++++++++++--------
 src/runtime/profiling.cc            | 13 +++++++++++++
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index b4d7b41b7f4a..71788e52999a 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -252,9 +252,11 @@ TVM_REGISTER_GLOBAL("device_api.cuda_host").set_body([](TVMArgs args, TVMRetValu
   *rv = static_cast<void*>(ptr);
 });
 
-class GPUTimerNode : public TimerNode {
+class CUDATimerNode : public TimerNode {
  public:
   virtual void Start() {
+    // This initial cudaEventRecord is sometimes pretty slow (~100us). Does
+    // cudaEventRecord do some stream synchronization?
     CUDA_CALL(cudaEventRecord(start_, CUDAThreadEntry::ThreadLocal()->stream));
   }
   virtual void Stop() { CUDA_CALL(cudaEventRecord(stop_, CUDAThreadEntry::ThreadLocal()->stream)); }
@@ -264,27 +266,27 @@ class GPUTimerNode : public TimerNode {
     CUDA_CALL(cudaEventElapsedTime(&milliseconds, start_, stop_));
     return milliseconds * 1e6;
   }
-  virtual ~GPUTimerNode() {
+  virtual ~CUDATimerNode() {
     CUDA_CALL(cudaEventDestroy(start_));
     CUDA_CALL(cudaEventDestroy(stop_));
   }
-  GPUTimerNode() {
+  CUDATimerNode() {
     CUDA_CALL(cudaEventCreate(&start_));
     CUDA_CALL(cudaEventCreate(&stop_));
   }
 
-  static constexpr const char* _type_key = "GPUTimerNode";
-  TVM_DECLARE_FINAL_OBJECT_INFO(GPUTimerNode, TimerNode);
+  static constexpr const char* _type_key = "CUDATimerNode";
+  TVM_DECLARE_FINAL_OBJECT_INFO(CUDATimerNode, TimerNode);
 
  private:
   cudaEvent_t start_;
   cudaEvent_t stop_;
 };
 
-TVM_REGISTER_OBJECT_TYPE(GPUTimerNode);
+TVM_REGISTER_OBJECT_TYPE(CUDATimerNode);
 
-TVM_REGISTER_GLOBAL("profiling.timer.gpu").set_body_typed([](Device dev) {
-  return Timer(make_object<GPUTimerNode>());
+TVM_REGISTER_GLOBAL("profiling.timer.cuda").set_body_typed([](Device dev) {
+  return Timer(make_object<CUDATimerNode>());
 });
 
 TVM_DLL String GetCudaFreeMemory() {
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index 2c92633c34fc..168441d1708d 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -89,9 +89,22 @@ TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](Device dev) {
   return Timer(make_object<CPUTimerNode>());
 });
 
+// keep track of which timers are not defined but we have already warned about
+std::set<DLDeviceType> seen_devices;
+std::mutex seen_devices_lock;
+
 Timer Timer::Start(Device dev) {
   auto f = Registry::Get(std::string("profiling.timer.") + DeviceName(dev.device_type));
   if (f == nullptr) {
+    {
+      std::lock_guard<std::mutex> lock(seen_devices_lock);
+      if (seen_devices.find(dev.device_type) == seen_devices.end()) {
+        LOG(WARNING)
+            << "No timer implementation for " << DeviceName(dev.device_type)
+            << ", using default timer instead. It may be inaccurate or have extra overhead.";
+        seen_devices.insert(dev.device_type);
+      }
+    }
     Timer t = DefaultTimer(dev);
     t->Start();
     return t;

From fdc6894b7dae096d0ec983292aa0a2a475843f56 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 21 Sep 2022 08:04:53 -0500
Subject: [PATCH 221/704] [TVMScript][Fix] Correct round-trip of explicit root
 block (#12673)

* [TVMScript][Fix] Correct round-trip of explicit root block

Prior to this commit, when converting TIR to TVMScript, the root
`tir::Block` is typically hidden.  When parsing, however,
`tvm::tir::ScriptComplete` will wrap the function body in a root block
if the primfunc if the contains at least one block and does not
already have a root block.  As a result, if the root block is the only
block present, it would be stripped by a round-trip.

This commit tightens the condition for hiding the root `tir::Block`
when converting to TVMScript, so that it is printed in cases where
the autocompleter would reinsert it when parsing.
---
 include/tvm/tir/stmt_functor.h                | 32 ++++++++++++
 src/printer/tvmscript_printer.cc              | 50 ++++++++++++++++---
 src/tir/ir/script/script_complete.cc          | 37 ++++++++++----
 .../unittest/test_tvmscript_roundtrip.py      | 21 ++++++++
 4 files changed, 123 insertions(+), 17 deletions(-)

diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index 49b1f28e5d83..2fc3b9678b40 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -427,6 +427,38 @@ TVM_DLL void PreOrderVisit(const ObjectRef& stmt_or_expr,
  * \return The renewed func.
  */
 TVM_DLL PrimFunc RenewDefs(const PrimFunc& func);
+
+/*!
+ * \brief Check if the statement contains the specified node type.
+ *
+ * This utility potentially walks the entire statement, and should
+ * therefore not be used if it could otherwise be merged with another
+ * pass.
+ *
+ * \param stmt The statement to be searched
+ * \return Whether stmt contains Node
+ */
+template <typename Node, typename = std::enable_if_t<std::is_base_of_v<StmtNode, Node>>>
+bool ContainsNode(const Stmt& stmt) {
+  struct Visitor : StmtVisitor {
+    // Early bail-out, if we already found the node.
+    void VisitStmt(const Stmt& stmt) {
+      if (contains_node) {
+        return;
+      }
+      StmtVisitor::VisitStmt(stmt);
+    }
+
+    void VisitStmt_(const Node* block) override { contains_node = true; }
+
+    bool contains_node{false};
+  };
+
+  Visitor visitor;
+  visitor(stmt);
+  return visitor.contains_node;
+}
+
 }  // namespace tir
 }  // namespace tvm
 
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index 20720373589f..936ac7580f28 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -1664,19 +1664,53 @@ Doc TVMScriptPrinter::PrintPrimFunc(const PrimFunc& primFunc) {
   }
   // print body
   body << "# body" << Doc::NewLine();
-  if (op->body->IsInstance<BlockRealizeNode>() &&
-      op->body.as<BlockRealizeNode>()->iter_values.empty()) {
-    const BlockNode* block = op->body.as<BlockRealizeNode>()->block.get();
-    if (block->annotations.empty() && !ContainsOptionalInfo(GetRef<Stmt>(block))) {
-      // Skip print root block
-      body << "# with " << tir_prefix_ << ".block(\"root\")" << Doc::NewLine();
-      body << PrintBlockBody(block);
+
+  Optional<Block> elided_root_block_body = [&]() -> Optional<Block> {
+    auto block_realize = op->body.as<BlockRealizeNode>();
+    if (!block_realize || block_realize->iter_values.size()) {
+      return NullOpt;
+    }
+
+    const auto& block = block_realize->block;
+    if (block->annotations.size() || ContainsOptionalInfo(block)) {
+      return NullOpt;
+    }
+
+    // The autocomplete might recognize the body itself as being a
+    // root block, and fail to insert it.
+    bool autocomplete_would_insert_root_block = [&]() -> bool {
+      if (block->alloc_buffers.size()) {
+        return true;
+      }
+
+      auto* block_realize = block->body.as<BlockRealizeNode>();
+      if (block_realize && block_realize->block->iter_vars.size()) {
+        return true;
+      }
+      if (!block_realize && ContainsNode<BlockRealizeNode>(block->body)) {
+        return true;
+      }
+      return false;
+    }();
+
+    if (autocomplete_would_insert_root_block) {
+      return block;
     } else {
-      body << PrintBody(op->body);
+      return NullOpt;
     }
+  }();
+
+  if (elided_root_block_body) {
+    // Skip printing of root block in cases where tvm::tir::ScriptComplete
+    // would re-insert it.
+    body << "# with " << tir_prefix_ << ".block(\"root\")" << Doc::NewLine();
+    body << PrintBlockBody(elided_root_block_body.value().get());
   } else {
+    // If this is a non-root block, or is an unskippable root block,
+    // just print it without skipping.
     body << PrintBody(op->body);
   }
+
   // print func attrs
   Doc header_attr;
   if (primFunc->attrs.defined()) {
diff --git a/src/tir/ir/script/script_complete.cc b/src/tir/ir/script/script_complete.cc
index b11ca6650a14..c44083108d45 100644
--- a/src/tir/ir/script/script_complete.cc
+++ b/src/tir/ir/script/script_complete.cc
@@ -105,16 +105,35 @@ PrimFunc ScriptComplete(PrimFunc func, const Array<Buffer>& root_allocates) {
   for (const auto& alloc : root_allocates) {
     buffer_var_map.Set(alloc->data, alloc);
   }
-  bool contain_root = root_allocates.empty() && func->body->IsInstance<BlockRealizeNode>() &&
-                      Downcast<BlockRealize>(func->body)->block->iter_vars.empty();
-  ScriptCompleter script_completer(&buffer_var_map);
-  // generate surrounding loops automatically
-  Stmt res = script_completer(func->body);
-  // generate root block automatically
-  if ((script_completer.contains_block || root_allocates.size()) && !contain_root) {
-    res = Block({}, {}, {}, "root", res, NullOpt, root_allocates);
-    res = BlockRealize({}, Bool(true), Downcast<Block>(res));
+
+  Stmt res = func->body;
+
+  // Generate root block automatically.  This is done before
+  // ScriptCompleter, in order to fill the root block's T.reads() and
+  // T.writes() annotations, as if it had been explicitly written.
+  bool should_insert_root = [&]() -> bool {
+    if (root_allocates.size()) {
+      return true;
+    }
+    auto* block_realize = func->body.as<BlockRealizeNode>();
+    if (block_realize && block_realize->block->iter_vars.size()) {
+      return true;
+    }
+    if (!block_realize && ContainsNode<BlockRealizeNode>(func->body)) {
+      return true;
+    }
+    return false;
+  }();
+
+  if (should_insert_root) {
+    Block root_block({}, {}, {}, "root", std::move(res), NullOpt, root_allocates);
+    res = BlockRealize({}, Bool(true), std::move(root_block));
   }
+
+  // generate surrounding loops automatically
+  ScriptCompleter script_completer(&buffer_var_map);
+  res = script_completer(std::move(res));
+
   if (func->body.same_as(res)) {
     return func;
   } else {
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index 1f5871b488e2..e139d2111bee 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3142,6 +3142,25 @@ def func_root_attr():
     return func_root_attr
 
 
+def func_trivial_root_block():
+    @T.prim_func
+    def func(A: T.Buffer[1, "int32"]):
+        with T.block("root"):
+            A[0] = 0
+
+    return func
+
+
+def func_nested_root_block():
+    @T.prim_func
+    def func(A: T.Buffer[1, "int32"]):
+        with T.block("root"):
+            with T.block("block"):
+                A[0] = 0
+
+    return func
+
+
 def func_T_ptr_let_statement():
     @T.prim_func
     def func_T_ptr_let_statement(
@@ -3418,6 +3437,8 @@ def func() -> None:
     func_with_target_spec_by_config,
     func_with_target_spec_by_str,
     func_root_attr,
+    func_trivial_root_block,
+    func_nested_root_block,
     func_T_ptr_let_statement,
     func_T_ptr_allocate,
     llvm_intrin_call,

From da0e5e3be2834b214ca7035fb50d9d378ecc5c52 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 21 Sep 2022 11:13:49 -0500
Subject: [PATCH 222/704] [Utils] Disable automatic move constructor for
 tvm::With (#12822)

* [Utils] Move constructor for tvm::With

Previously, `tvm::With` had the default compiler-provided move
constructors.  If these were used (e.g. by storing a `With` into a
vector), the `ExitWithScope` would be called multiple times.  This
commit explicitly removes the copy constructor/assignment, and
explicitly implements move constructor/assignment.

* Update PR to remove move With's constructor/assignment altogether
---
 include/tvm/support/with.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/tvm/support/with.h b/include/tvm/support/with.h
index d28e9f3a6894..5959affafdb3 100644
--- a/include/tvm/support/with.h
+++ b/include/tvm/support/with.h
@@ -68,6 +68,15 @@ class With {
   /*! \brief destructor, leaves the scope of the context. */
   ~With() DMLC_THROW_EXCEPTION { ctx_.ExitWithScope(); }
 
+  // Disable copy and move construction.  `With` is intended only for
+  // use in nested contexts that are exited in the reverse order of
+  // entry.  Allowing context to be copied or moved would break this
+  // expectation.
+  With(const With& other) = delete;
+  With& operator=(const With& other) = delete;
+  With(With&& other) = delete;
+  With& operator=(With&& other) = delete;
+
   ContextType* get() { return &ctx_; }
   const ContextType* get() const { return &ctx_; }
 

From 3c8a94bd4eedb43d5402ec41755a4f57a90ff4fe Mon Sep 17 00:00:00 2001
From: chengven027-intellif <darkvan_wen@hotmail.com>
Date: Thu, 22 Sep 2022 02:57:30 +0800
Subject: [PATCH 223/704] [frontend][torch] Support aten::relu6 operator
 (#12855)

support aten::relu6 operator
---
 python/tvm/relay/frontend/pytorch.py          | 5 +++++
 tests/python/frontend/pytorch/test_forward.py | 9 +++++++++
 2 files changed, 14 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 7c52393b8468..b0e594d99312 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -840,6 +840,10 @@ def relu(self, inputs, input_types):
             return qnn_torch.quantized_relu(data, input_zero_point)
         return _op.nn.relu(data)
 
+    def relu6(self, inputs, input_types):
+        data = inputs[0]
+        return _op.tensor.clip(data, 0.0, 6.0)
+
     def prelu(self, inputs, input_types):
         # Reference: https://pytorch.org/docs/stable/generated/torch.nn.PReLU.html#torch.nn.PReLU
         data = inputs[0]
@@ -3477,6 +3481,7 @@ def create_convert_map(self):
             "aten::where": self.where,
             "aten::topk": self.topk,
             "aten::relu": self.relu,
+            "aten::relu6": self.relu6,
             "aten::prelu": self.prelu,
             "aten::leaky_relu": self.leaky_relu,
             "aten::elu": self.elu,
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 2d0a476e372d..0525c5fd8e7d 100755
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -698,6 +698,15 @@ def test_forward_relu():
     verify_model(torch.nn.ReLU().eval(), input_data=input_data)
 
 
+@tvm.testing.uses_gpu
+def test_forward_relu6():
+    """test_forward_relu6"""
+    torch.set_grad_enabled(False)
+    input_shape = [10, 10]
+    input_data = torch.rand(input_shape).float()
+    verify_model(torch.nn.ReLU6().eval(), input_data=input_data)
+
+
 @tvm.testing.uses_gpu
 def test_forward_prelu():
     """test_forward_prelu"""

From c0c7569529cb258c151acf101e6e4650c726d34d Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Wed, 21 Sep 2022 11:58:46 -0700
Subject: [PATCH 224/704] Allow failures in pr_comment_bot for now (#12860)

Allow failures in pr_comment_bot for now.
---
 .github/workflows/pr_comment_bot.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr_comment_bot.yml b/.github/workflows/pr_comment_bot.yml
index 89416df928b8..1ac33c77d2e5 100644
--- a/.github/workflows/pr_comment_bot.yml
+++ b/.github/workflows/pr_comment_bot.yml
@@ -49,7 +49,7 @@ jobs:
           if [[ "$URL" == *"PR-"* ]]; then
             echo "PR status, sending comment"
             PR_NUMBER=$(echo $URL | sed 's/.*PR-//g' | sed 's/\/.*//g')
-            python ci/scripts/github_pr_comment.py --pr "$PR_NUMBER"
+            python ci/scripts/github_pr_comment.py --pr "$PR_NUMBER" || /bin/true
           else
             echo "Not a PR status, skipping"
           fi

From 7aef584c0f8fb3b516afde3fb5fac9c2d0969c0a Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Wed, 21 Sep 2022 15:14:28 -0700
Subject: [PATCH 225/704] [Hybrid] Fix sys version check (#12837)

This is a follow-up to #12769 The check for sys version of python 3.9 is not correct.
Fixed #12814
---
 python/tvm/te/hybrid/parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/te/hybrid/parser.py b/python/tvm/te/hybrid/parser.py
index 4956aaf0be32..ec103ac18811 100644
--- a/python/tvm/te/hybrid/parser.py
+++ b/python/tvm/te/hybrid/parser.py
@@ -374,7 +374,7 @@ def visit_Attribute(self, node):
 
     def visit_Subscript(self, node):
         args = self.visit(node.slice)
-        if sys.version_info > (3, 8):
+        if sys.version_info >= (3, 9):
             if not isinstance(node.slice, ast.Tuple):
                 args = [args]
 

From 39f71ae2881f5c647aa8e98e4f6d87ed84a28688 Mon Sep 17 00:00:00 2001
From: Oleksandr Viazlo <oleksandr.viazlo@axelera.ai>
Date: Thu, 22 Sep 2022 10:21:17 +0200
Subject: [PATCH 226/704] [frontend][pytorch] Add a new test case for torch
 aten::fill_ operator implementation (#12857)

Fix aten::fill_ torch operator implementation by adding constant folding on the fill value.
Add new test case for torch aten::fill_ operator implementation.
---
 python/tvm/relay/frontend/pytorch.py          |  8 ++++++--
 tests/python/frontend/pytorch/test_forward.py | 10 ++++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index b0e594d99312..e35e23b3381c 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -43,7 +43,7 @@
 from .common import infer_shape as _infer_shape
 from .common import infer_value as _infer_value
 from .common import infer_value_simulated as _infer_value_simulated
-from .common import lstm_cell, try_infer_value, unbind
+from .common import lstm_cell, try_infer_value, unbind, fold_constant
 from .pytorch_utils import is_version_greater_than, getattr_attr_name
 
 __all__ = ["from_pytorch"]
@@ -672,7 +672,9 @@ def full_impl(self, data, fill_value, dtype):
                 tmp.append(_op.cast(_op.expand_dims(dim, axis=0), "int64"))
             size = _op.concatenate(tmp, axis=0)
 
-        out = _op.full(_expr.const(fill_value, dtype=dtype), size, dtype=dtype)
+        if not isinstance(fill_value, _expr.Constant):
+            fill_value = _expr.const(fill_value, dtype=dtype)
+        out = _op.full(fill_value, size, dtype=dtype)
         if need_reshape:
             out = _op.reshape(out, new_shape)
         return out
@@ -805,6 +807,8 @@ def new_full(self, inputs, input_types):
     def fill_(self, inputs, input_types):
         data = inputs[0]
         fill_value = inputs[1]
+        if not isinstance(fill_value, (bool, int, float, complex)):
+            fill_value = fold_constant(fill_value)
         return self.full_impl(self.infer_shape(data), fill_value, input_types[0])
 
     def linspace(self, inputs, input_types):
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 0525c5fd8e7d..5236b763faf0 100755
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -3341,6 +3341,16 @@ def test_func(x):
     verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()])
 
 
+def test_forward_fill_with_div():
+    """test_forward_fill_with_div"""
+
+    def test_func(x):
+        y = torch.div(torch.tensor(6.0), torch.tensor(2.0))
+        return x.fill_(y)
+
+    verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()])
+
+
 @tvm.testing.uses_gpu
 def test_forward_linspace():
     """test_forward_linspace"""

From fe75f00991f60d4483d2d14f7ec23bb6fda956a9 Mon Sep 17 00:00:00 2001
From: Alexey Voronov <avoronov.icemist@gmail.com>
Date: Thu, 22 Sep 2022 13:52:28 +0300
Subject: [PATCH 227/704] [AutoTVM] Introducing multi_filter into ConfigSpace
 autotvm (#12545)

* [AutoTVM] Introducing multi_filter into ConfigSpace autotvm

Co-authored-by: Andrey Malyshev elvin.nnov@gmail.com
Co-authored-by: Egor Churaev egor.churaev@gmail.com

* update multi_filter for adreno conv's
---
 python/tvm/autotvm/task/space.py              | 330 +++++++++++++++++-
 python/tvm/autotvm/tuner/ga_tuner.py          | 108 +++---
 python/tvm/autotvm/tuner/index_based_tuner.py |  73 ++--
 python/tvm/autotvm/tuner/model_based_tuner.py |  40 +--
 .../tvm/autotvm/tuner/sa_model_optimizer.py   |  39 +--
 python/tvm/autotvm/tuner/tuner.py             |   1 +
 python/tvm/autotvm/utils.py                   |  32 --
 python/tvm/topi/adreno/conv2d_nchw.py         |  10 +-
 python/tvm/topi/adreno/conv2d_nhwc.py         |  10 +-
 .../tvm/topi/adreno/conv2d_winograd_common.py |   7 +-
 .../tvm/topi/adreno/depthwise_conv2d_nchw.py  |   9 +
 .../tvm/topi/adreno/depthwise_conv2d_nhwc.py  |   9 +
 .../test_topi_conv2d_hwnc_tensorcore.py       |   4 +-
 .../python/unittest/test_autotvm_ga_tuner.py  |  89 +++++
 .../unittest/test_autotvm_index_tuner.py      |  77 +++-
 tests/python/unittest/test_autotvm_space.py   | 167 ++++++++-
 16 files changed, 758 insertions(+), 247 deletions(-)
 create mode 100644 tests/python/unittest/test_autotvm_ga_tuner.py

diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index 4d6b23162a25..22812f907bb3 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -30,6 +30,7 @@
 import functools
 import math
 from collections import namedtuple, OrderedDict
+from random import randrange
 import numpy as np
 
 from tvm.te import schedule, thread_axis
@@ -665,6 +666,8 @@ def __init__(self):
         self.space_map = OrderedDict()  # name -> space
         self._collect = True
         self._length = None
+        self._range_length = None
+        self._dims = None
         self._entity_map = OrderedDict()  # name -> entity
         self._constraints = []
         self.errors = []
@@ -672,6 +675,8 @@ def __init__(self):
         self.flop = 0
         self.cost = None
         self.is_fallback = False
+        self._shared_filter = None
+        self._shared_filter_cache = None
 
     @staticmethod
     def axis(var):
@@ -714,18 +719,19 @@ def define_split(self, name, axis, policy="factors", **kwargs):
                 the total number of axis after split (`int`).
             ``no_tail``:
                 should we only include divisible numbers as split factors (`bool`).
-            `candidate``:
+            ``candidate``:
                 (policy=candidate) manual candidate list (`List`).
 
         Examples
         --------
         >>> # use custom candidates
-        >>> cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4], [4, 1, 4]])
+        >>> cfg.define_split('tile_x', x, policy='candidate', num_outputs=3,
+        >>>   candidate=[[1, 4, 4], [4, 1, 4]])
 
         >>> # use a filter that only accepts the split scheme whose inner most tile is less then 4
-        >>> cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 4)
+        >>> cfg.define_split('tile_y', y, policy='factors', num_outputs=3,
+        >>>   filter=lambda x: x.size[-1] <= 4)
         """
-
         axes = [axis]
         return self._add_new_transform(SplitSpace, name, axes, policy, **kwargs)
 
@@ -822,11 +828,300 @@ def valid(self):
         """
         return not bool(self.errors)
 
+    def is_index_valid(self, index):
+        """Checks if the index satisfies the multi_filter condition
+
+        Parameters
+        ----------
+        index: int
+            index from the range of the space
+
+        Returns
+        -------
+        valid: bool
+            whether the index meets all the constraints
+        """
+        assert 0 <= index < self.range_length
+        if self._shared_filter is None:
+            return True
+        if self._shared_filter_cache is None:
+            self._make_shared_filter_cache()
+        return self._shared_filter_cache[index]
+
+    def multi_filter(self, filter):  # pylint: disable=redefined-builtin
+        """The filter can restrict combination of parameters in difference to the knob filter,
+        that restricts only single parameter
+
+        Parameters
+        ----------
+        filter: function
+            predicate with one argument (Callable[[int], bool])
+
+        .. note::
+
+            Using this filter causes additional restrictions on the use of __len__.
+            Normally, it define the count of valid indexes and the range of space, but when
+            multi_filter enabled, it requires to use __len__ for getting the count of valid
+            indexes or range_length for the range of space. It is recommended to use:
+            ``is_index_valid``, ``get_next_index``, ``get_rand_index`` to bypass the space
+
+        Examples
+        --------
+        >>> # Pre-requisites
+        >>> candidates = [[16, 64], [32, 32], [64, 16]]
+        >>> filter = lambda v: v.size[0] != 16
+        >>> multi_filter = lambda e: (e["tile_x"].size[0] + e["tile_y"].size[0]) <= 64
+
+        >>> # Case 1 - without filtering
+        >>> cfg.define_split("tile_x", x, num_outputs=2, policy="candidate", candidate=candidates)
+        >>> cfg.define_split("tile_y", y, num_outputs=2, policy="candidate", candidate=candidates)
+        >>> # [('tile_x', [16, 64]), ('tile_y', [16, 64])],None,0
+        >>> # [('tile_x', [32, 32]), ('tile_y', [16, 64])],None,1
+        >>> # [('tile_x', [64, 16]), ('tile_y', [16, 64])],None,2
+        >>> # [('tile_x', [16, 64]), ('tile_y', [32, 32])],None,3
+        >>> # [('tile_x', [32, 32]), ('tile_y', [32, 32])],None,4
+        >>> # [('tile_x', [64, 16]), ('tile_y', [32, 32])],None,5
+        >>> # [('tile_x', [16, 64]), ('tile_y', [64, 16])],None,6
+        >>> # [('tile_x', [32, 32]), ('tile_y', [64, 16])],None,7
+        >>> # [('tile_x', [64, 16]), ('tile_y', [64, 16])],None,8
+
+        >>> # Case 2 - with filter
+        >>> cfg.define_split("tile_x", x, num_outputs=2, policy="candidate", candidate=candidates,
+        >>>   filter=filter)
+        >>> cfg.define_split("tile_y", y, num_outputs=2, policy="candidate", candidate=candidates,
+        >>>   filter=filter)
+        >>> # [('tile_x', [32, 32]), ('tile_y', [32, 32])],None,0
+        >>> # [('tile_x', [64, 16]), ('tile_y', [32, 32])],None,1
+        >>> # [('tile_x', [32, 32]), ('tile_y', [64, 16])],None,2
+        >>> # [('tile_x', [64, 16]), ('tile_y', [64, 16])],None,3
+
+        >>> # Case 3 - with filter and multi_filter
+        >>> cfg.define_split("tile_x", x, num_outputs=2, policy="candidate", candidate=candidates,
+        >>>   filter=filter)
+        >>> cfg.define_split("tile_y", y, num_outputs=2, policy="candidate", candidate=candidates,
+        >>>   filter=filter)
+        >>> cfg.multi_filter(filter=multi_filter)
+        >>> # [('tile_x', [32, 32]), ('tile_y', [32, 32])],None,0
+        """
+        if self._collect:
+            self.clear_cache()
+            self._shared_filter = filter
+
+    @property
+    def range_length(self):
+        """Length of the index range in the space"""
+        if self._range_length is None:
+            self._range_length = int(np.prod([len(x) for x in self.space_map.values()]))
+        return self._range_length
+
+    @property
+    def dims(self):
+        """Dimensions in the space"""
+        if self._dims is None:
+            self._dims = [len(x) for x in self.space_map.values()]
+        return self._dims
+
+    def subrange_length(self, start, end):
+        """Returns the number of valid indexes within the limited range from [start, end]
+
+        Parameters
+        ----------
+        start: int
+            start of subrange, inclusive
+        end: int
+            end of subrange, exclusive
+
+        Returns
+        -------
+        count: int
+            number of valid indexes
+        """
+        assert 0 <= start <= end <= self.range_length
+        if self._shared_filter is None:
+            return end - start
+        if self._shared_filter_cache is None:
+            self._make_shared_filter_cache()
+        return self._shared_filter_cache[start:end].count(True)
+
+    def get_rand_index(self, start=None, end=None, to_exclude=None):
+        """Returns a random valid index unlisted to exclusion
+
+        Parameters
+        ----------
+        start: int, optional
+            specifying at which position to start, inclusive
+        end: int, optional
+            specifying at which position to end, exclusive
+        to_exclude: list, optional
+            determines unsuitable values
+
+        Returns
+        -------
+        rand: int
+            random index in the space
+
+        .. note::
+
+            Excluding all valid space indexes will lead to an infinite loop.
+
+        """
+        start = start or 0
+        end = end or self.range_length
+        while True:
+            index = randrange(start, end)
+            if self.is_index_valid(index) and index not in (to_exclude or []):
+                return index
+
+    def get_next_index(self, index, n=1, start=None, end=None):
+        """Returns the nth valid next index or None if out of range
+
+        Parameters
+        ----------
+        index: int
+            specifying at which position to start, inclusive
+        n: int, optional
+            step by using to find the next index, for the opposite
+            direction a negative number should be used
+        start: list, optional
+            start of subrange, inclusive
+        end: list, optional
+            end of subrange, exclusive
+
+        Returns
+        -------
+        next: int
+            next index in the space
+        """
+        assert n != 0
+        start = start or 0
+        end = end or self.range_length
+        if self._shared_filter is None:
+            index += n
+            if start <= index < end:
+                return index
+            return None
+        trend = 1 if n > 0 else -1
+        counter = abs(n)
+        while counter != 0:
+            index += trend
+            if index < start or index >= end:
+                return None
+            if self.is_index_valid(index):
+                counter -= 1
+        return index
+
+    def clear_cache(self):
+        """Clears the cache of index validity"""
+        del self._shared_filter_cache
+        self._dims = None
+        self._length = None
+        self._range_length = None
+        self._shared_filter_cache = None
+
+    def _make_shared_filter_cache(self):
+        def apply(t):
+            entities = OrderedDict()
+            for name, space in self.space_map.items():
+                entities[name] = space[t % len(space)]
+                t //= len(space)
+            return bool(self._shared_filter(entities))
+
+        self._shared_filter_cache = tuple(apply(i) for i in range(self.range_length))
+        self._length = self._shared_filter_cache.count(True)
+
+    def point2knob(self, point):
+        """Convert point form (single integer) to knob (vector)
+
+        Parameters
+        ----------
+        point: int
+            point to convert
+
+        Returns
+        -------
+        knob: list
+            knob representation of the point
+        """
+        knob = []
+        for dim in self.dims:
+            knob.append(point % dim)
+            point //= dim
+        return knob
+
+    def knob2point(self, knob):
+        """Convert knob form (vector) to point form (single integer)
+
+        Parameters
+        ----------
+        knob: list
+            knob to convert
+
+        Returns
+        -------
+        point: int
+            point of the knob representation
+        """
+        point = 0
+        for j, k in enumerate(knob):
+            point += int(np.prod(self.dims[:j])) * k
+        return point
+
+    def sample_ints(self, m):
+        """
+        Sample m different integer numbers from [0, self.range_length) without replacement
+        This function is an alternative of `np.random.choice` when self.range_length > 2 ^ 32, in
+        which case numpy does not work.
+
+        Parameters
+        ----------
+        m: int
+            The number of sampled int
+
+        Returns
+        -------
+        ints: an numpy array of size m
+        """
+        assert m <= len(self)
+        vis = set()
+        while len(vis) < m:
+            new = randrange(0, self.range_length)
+            if self.is_index_valid(new):
+                vis.add(new)
+        return np.fromiter(vis, int, len(vis))
+
+    def random_walk(self, point):
+        """random walk as local transition
+
+        Parameters
+        ----------
+        point: int
+            index of the ConfigEntity
+
+        Returns
+        -------
+        new_point: int
+            new neighborhood index
+        """
+        # transform to knob form
+        old_knob = self.point2knob(point)
+        new_knob = old_knob.copy()
+        new_point = self.knob2point(new_knob)
+        # mutate
+        while new_knob == old_knob or not self.is_index_valid(new_point):
+            from_i = np.random.randint(len(old_knob))
+            to_v = np.random.randint(self.dims[from_i])
+            new_knob[from_i] = to_v
+            new_point = self.knob2point(new_knob)
+        # transform to index form
+        return new_point
+
     def _add_new_transform(self, space_class, name, axes, policy, **kwargs):
         """Add a new transform space in template"""
         # if we do not have tuned info (_collect == True) but defined KNOB value
         # for "default" scheduling before call of _add_new_transform, in this case
         # no need to create new space and override previously pointed KNOB values
+        if kwargs.get("filter"):
+            self.clear_cache()
         if self._collect and not (self.is_fallback and name in self._entity_map):
             # convert schedule axis to space definition axis
             axes = [x if isinstance(x, (VirtualAxis, Axis)) else self.axis(x) for x in axes]
@@ -839,8 +1134,11 @@ def _add_new_transform(self, space_class, name, axes, policy, **kwargs):
         return [Axis(None, i) for i in range(space_class.get_num_output(axes, policy, **kwargs))]
 
     def __len__(self):
-        if self._length is None:
-            self._length = int(np.prod([len(x) for x in self.space_map.values()]))
+        """Returns the number of valid indexes in the space"""
+        if self._shared_filter is None:
+            return self.range_length
+        if self._shared_filter_cache is None:
+            self._make_shared_filter_cache()
         return self._length
 
     def get(self, index):
@@ -850,9 +1148,21 @@ def get(self, index):
         ----------
         index: int
             index in the space
+
+        Returns
+        -------
+        config: ConfigEntity
+            config corresponds to the index
         """
-        if index < 0 or index >= len(self):
-            raise IndexError("Index out of range: size {}, got index {}".format(len(self), index))
+        if index < 0 or index >= self.range_length:
+            raise IndexError(
+                "Index out of range: size {}, got index {}".format(self.range_length, index)
+            )
+        if not self.is_index_valid(index):
+            raise IndexError(
+                "Index does not correspond to the multi-filter condition, got index {}. "
+                "Use is_index_valid to pre-check".format(index)
+            )
         entities = OrderedDict()
         t = index
         for name, space in self.space_map.items():
@@ -876,7 +1186,9 @@ def __getitem__(self, name):
         return self._entity_map[name]
 
     def __repr__(self):
-        res = "ConfigSpace (len=%d, space_map=\n" % len(self)
+        res = "ConfigSpace (len={}, range_length={}, space_map=\n".format(
+            len(self), self.range_length
+        )
         for i, (name, space) in enumerate(self.space_map.items()):
             res += "  %2d %s: %s\n" % (i, name, space)
         return res + ")"
diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py
index 2ecd120e8504..ad5b87ac5d70 100644
--- a/python/tvm/autotvm/tuner/ga_tuner.py
+++ b/python/tvm/autotvm/tuner/ga_tuner.py
@@ -21,7 +21,6 @@
 import numpy as np
 
 from .tuner import Tuner
-from .model_based_tuner import knob2point, point2knob
 
 
 class GATuner(Tuner):
@@ -49,41 +48,24 @@ def __init__(self, task, pop_size=100, elite_num=3, mutation_prob=0.1):
 
         assert elite_num <= pop_size, "The number of elites must be less than population size"
 
-        # space info
-        self.space = task.config_space
-        self.dim_keys = []
-        self.dims = []
-        for k, v in self.space.space_map.items():
-            self.dim_keys.append(k)
-            self.dims.append(len(v))
-
-        self.visited = set([])
+        # random initialization
+        self.pop_size = min(self.pop_size, len(self.space))
+        self.elite_num = min(self.pop_size, self.elite_num)
+        self.visited = set(self.space.sample_ints(self.pop_size))
 
         # current generation
-        self.genes = []
+        self.genes = [self.space.point2knob(idx) for idx in self.visited]
         self.scores = []
         self.elites = []
         self.elite_scores = []
         self.trial_pt = 0
 
-        # random initialization
-        self.pop_size = min(self.pop_size, len(self.space))
-        self.elite_num = min(self.pop_size, self.elite_num)
-        for _ in range(self.pop_size):
-            tmp_gene = point2knob(np.random.randint(len(self.space)), self.dims)
-            while knob2point(tmp_gene, self.dims) in self.visited:
-                tmp_gene = point2knob(np.random.randint(len(self.space)), self.dims)
-
-            self.genes.append(tmp_gene)
-            self.visited.add(knob2point(tmp_gene, self.dims))
-
     def next_batch(self, batch_size):
         ret = []
-        for _ in range(batch_size):
+        while len(ret) < batch_size and self.has_next():
             gene = self.genes[self.trial_pt % self.pop_size]
             self.trial_pt += 1
-            ret.append(self.space.get(knob2point(gene, self.dims)))
-
+            ret.append(self.space.get(self.space.knob2point(gene)))
         return ret
 
     def update(self, inputs, results):
@@ -95,47 +77,43 @@ def update(self, inputs, results):
                 self.scores.append(0.0)
 
         if len(self.scores) >= len(self.genes) and len(self.visited) < len(self.space):
-            genes = self.genes + self.elites
-            scores = np.array(self.scores[: len(self.genes)] + self.elite_scores)
-
-            # reserve elite
-            self.elites, self.elite_scores = [], []
-            elite_indexes = np.argpartition(scores, -self.elite_num)[-self.elite_num :]
-            for ind in elite_indexes:
-                self.elites.append(genes[ind])
-                self.elite_scores.append(scores[ind])
-
-            # cross over
-            indices = np.arange(len(genes))
-            scores += 1e-8
-            scores /= np.max(scores)
-            probs = scores / np.sum(scores)
-            tmp_genes = []
-            for _ in range(self.pop_size):
-                p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs)
-                p1, p2 = genes[p1], genes[p2]
-                point = np.random.randint(len(self.dims))
-                tmp_gene = p1[:point] + p2[point:]
-                tmp_genes.append(tmp_gene)
-
-            # mutation
             next_genes = []
-            for tmp_gene in tmp_genes:
-                for j, dim in enumerate(self.dims):
-                    if np.random.random() < self.mutation_prob:
-                        tmp_gene[j] = np.random.randint(dim)
-
-                if len(self.visited) < len(self.space):
-                    while knob2point(tmp_gene, self.dims) in self.visited:
-                        j = np.random.randint(len(self.dims))
-                        tmp_gene[j] = np.random.randint(
-                            self.dims[j]  # pylint: disable=invalid-sequence-index
-                        )
-                    next_genes.append(tmp_gene)
-                    self.visited.add(knob2point(tmp_gene, self.dims))
-                else:
-                    break
-
+            # There is no reason to crossover or mutate since the size of the unvisited
+            # is no larger than the size of the population.
+            if len(self.space) - len(self.visited) <= self.pop_size:
+                for idx in range(self.space.range_length):
+                    if self.space.is_index_valid(idx) and idx not in self.visited:
+                        next_genes.append(self.space.point2knob(idx))
+                        self.visited.add(idx)
+            else:
+                genes = self.genes + self.elites
+                scores = np.array(self.scores[: len(self.genes)] + self.elite_scores)
+
+                # reserve elite
+                self.elites, self.elite_scores = [], []
+                elite_indexes = np.argpartition(scores, -self.elite_num)[-self.elite_num :]
+                for ind in elite_indexes:
+                    self.elites.append(genes[ind])
+                    self.elite_scores.append(scores[ind])
+
+                indices = np.arange(len(genes))
+                scores += 1e-8
+                scores /= np.max(scores)
+                probs = scores / np.sum(scores)
+                while len(next_genes) < self.pop_size:
+                    # cross over
+                    p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs)
+                    p1, p2 = genes[p1], genes[p2]
+                    point = np.random.randint(len(self.space.dims))
+                    tmp_gene = p1[:point] + p2[point:]
+                    # mutation
+                    for j, dim in enumerate(self.space.dims):
+                        if np.random.random() < self.mutation_prob:
+                            tmp_gene[j] = np.random.randint(dim)
+
+                    if self.space.is_index_valid(self.space.knob2point(tmp_gene)):
+                        next_genes.append(tmp_gene)
+                        self.visited.add(self.space.knob2point(tmp_gene))
             self.genes = next_genes
             self.trial_pt = 0
             self.scores = []
diff --git a/python/tvm/autotvm/tuner/index_based_tuner.py b/python/tvm/autotvm/tuner/index_based_tuner.py
index 972de65154c9..881728bc9b34 100644
--- a/python/tvm/autotvm/tuner/index_based_tuner.py
+++ b/python/tvm/autotvm/tuner/index_based_tuner.py
@@ -17,8 +17,6 @@
 # pylint: disable=abstract-method
 """Grid search tuner and random tuner"""
 
-import numpy as np
-
 from .tuner import Tuner
 
 
@@ -32,7 +30,7 @@ class IndexBaseTuner(Tuner):
         The tuning task
 
     range_idx: Optional[Tuple[int, int]]
-        A tuple of index range that this tuner can select from
+        A tuple of index range that this tuner can select from [begin_idx, end_idx]
     """
 
     def __init__(self, task, range_idx=None):
@@ -41,17 +39,19 @@ def __init__(self, task, range_idx=None):
             range_idx, tuple
         ), "range_idx must be None or (int, int)"
 
-        self.range_length = len(self.task.config_space)
-        self.index_offset = 0
-        if range_idx is not None:
-            assert range_idx[1] > range_idx[0], "Index range must be positive"
-            assert range_idx[0] >= 0, "Start index must be positive"
-            self.range_length = range_idx[1] - range_idx[0] + 1
-            self.index_offset = range_idx[0]
-        self.counter = 0
+        self.visited = []
+        self.begin_idx, self.end_idx = range_idx or (0, self.space.range_length - 1)
+        assert self.begin_idx >= 0, "Start index must be positive"
+        self.end_idx += 1  # Further end_idx is exclusive
+        assert (
+            self.end_idx <= self.space.range_length
+        ), "Finish index must be less the space range length "
+        self.range_length = self.end_idx - self.begin_idx
+        assert self.range_length > 0, "Index range must be positive"
+        self.visited_max = self.space.subrange_length(self.begin_idx, self.end_idx)
 
     def has_next(self):
-        return self.counter < self.range_length
+        return len(self.visited) < self.visited_max
 
     def load_history(self, data_set, min_seed_records=500):
         pass
@@ -60,14 +60,23 @@ def load_history(self, data_set, min_seed_records=500):
 class GridSearchTuner(IndexBaseTuner):
     """Enumerate the search space in a grid search order"""
 
+    def __init__(self, task, range_idx=None):
+        super(GridSearchTuner, self).__init__(task, range_idx)
+
+        self.index = self.begin_idx
+        if not self.space.is_index_valid(self.index):
+            self.index = self.space.get_next_index(
+                self.index, start=self.begin_idx, end=self.end_idx
+            )
+
     def next_batch(self, batch_size):
         ret = []
-        for _ in range(batch_size):
-            if self.counter >= self.range_length:
-                break
-            index = self.counter + self.index_offset
-            ret.append(self.task.config_space.get(index))
-            self.counter = self.counter + 1
+        while len(ret) < batch_size and self.has_next():
+            self.visited.append(self.index)
+            ret.append(self.space.get(self.index))
+            self.index = self.space.get_next_index(
+                self.index, start=self.begin_idx, end=self.end_idx
+            )
         return ret
 
 
@@ -83,32 +92,10 @@ class RandomTuner(IndexBaseTuner):
         A tuple of index range to random
     """
 
-    def __init__(self, task, range_idx=None):
-        super(RandomTuner, self).__init__(task, range_idx)
-
-        # Use a dict to mimic a range(n) list without storing rand_state[i] = i entries so that
-        # we can generate non-repetitive random indices.
-        self.rand_state = {}
-        self.rand_max = self.range_length
-        self.visited = []
-
     def next_batch(self, batch_size):
         ret = []
-        for _ in range(batch_size):
-            if self.rand_max == 0:
-                break
-
-            # Random an indirect index.
-            index_ = np.random.randint(self.rand_max)
-            self.rand_max -= 1
-
-            # Use the indirect index to get a direct index.
-            index = self.rand_state.get(index_, index_) + self.index_offset
-            ret.append(self.task.config_space.get(index))
+        while len(ret) < batch_size and self.has_next():
+            index = self.space.get_rand_index(self.begin_idx, self.end_idx, to_exclude=self.visited)
             self.visited.append(index)
-
-            # Update the direct index map.
-            self.rand_state[index_] = self.rand_state.get(self.rand_max, self.rand_max)
-            self.rand_state.pop(self.rand_max, None)
-            self.counter += 1
+            ret.append(self.space.get(index))
         return ret
diff --git a/python/tvm/autotvm/tuner/model_based_tuner.py b/python/tvm/autotvm/tuner/model_based_tuner.py
index f07e7fb4eb76..0841e9a76528 100644
--- a/python/tvm/autotvm/tuner/model_based_tuner.py
+++ b/python/tvm/autotvm/tuner/model_based_tuner.py
@@ -207,9 +207,6 @@ def __init__(self, task, cost_model, model_optimizer, plan_size, diversity_filte
         self.task = task
         self.target = task.target
         self.plan_size = plan_size
-        self.space = task.config_space
-        self.space_len = len(task.config_space)
-        self.dims = [len(x) for x in self.space.space_map.values()]
 
         self.cost_model = cost_model
         self.model_optimizer = model_optimizer
@@ -233,29 +230,19 @@ def __init__(self, task, cost_model, model_optimizer, plan_size, diversity_filte
 
     def next_batch(self, batch_size):
         ret = []
-
-        counter = 0
-        while counter < batch_size:
-            if len(self.visited) >= len(self.space):
-                break
-
+        while len(ret) < batch_size and self.has_next():
             while self.trial_pt < len(self.trials):
                 index = self.trials[self.trial_pt]
-                if index not in self.visited:
+                if index not in self.visited and self.space.is_index_valid(index):
                     break
                 self.trial_pt += 1
 
             if self.trial_pt >= len(self.trials) - int(0.05 * self.plan_size):
                 # if the trial list is empty or
                 # the tuner is doing the last 5% trials (e-greedy), choose randomly
-                index = np.random.randint(len(self.space))
-                while index in self.visited:
-                    index = np.random.randint(len(self.space))
-
+                index = self.space.get_rand_index(to_exclude=self.visited)
             ret.append(self.space.get(index))
             self.visited.add(index)
-
-            counter += 1
         return ret
 
     def update(self, inputs, results):
@@ -274,8 +261,8 @@ def update(self, inputs, results):
             # However, adding the index to visited again here enables us
             # to also use this update function to resume tuning progress in
             # case of interruption.
+            assert self.space.is_index_valid(index)
             self.visited.add(index)
-
         # if we have enough new training samples
         if len(self.xs) >= self.plan_size * (self.train_ct + 1) and self.flops_max > 1e-6:
             self.cost_model.fit(self.xs, self.ys, self.plan_size)
@@ -284,7 +271,7 @@ def update(self, inputs, results):
                     self.cost_model, self.plan_size * self.diversity_filter_ratio, self.visited
                 )
                 scores = self.cost_model.predict(candidate)
-                knobs = [point2knob(x, self.dims) for x in candidate]
+                knobs = [self.space.point2knob(x) for x in candidate]
                 pick_index = submodular_pick(0 * scores, knobs, self.plan_size, knob_weight=1)
                 maximums = np.array(candidate)[pick_index]
             else:
@@ -322,23 +309,6 @@ def has_next(self):
         return len(self.visited) < len(self.space)
 
 
-def point2knob(p, dims):
-    """convert point form (single integer) to knob form (vector)"""
-    knob = []
-    for dim in dims:
-        knob.append(p % dim)
-        p //= dim
-    return knob
-
-
-def knob2point(knob, dims):
-    """convert knob form (vector) to point form (single integer)"""
-    p = 0
-    for j, k in enumerate(knob):
-        p += int(np.prod(dims[:j])) * k
-    return p
-
-
 def submodular_pick(scores, knobs, n_pick, knob_weight=1.0):
     """Run greedy optimization to pick points with regard to both score and diversity.
     DiversityScore = knob_weight * number of unique knobs in the selected set
diff --git a/python/tvm/autotvm/tuner/sa_model_optimizer.py b/python/tvm/autotvm/tuner/sa_model_optimizer.py
index 401eda8c276f..a50f148f2eb2 100644
--- a/python/tvm/autotvm/tuner/sa_model_optimizer.py
+++ b/python/tvm/autotvm/tuner/sa_model_optimizer.py
@@ -25,8 +25,7 @@
 
 import numpy as np
 
-from ..utils import sample_ints
-from .model_based_tuner import ModelOptimizer, knob2point, point2knob
+from .model_based_tuner import ModelOptimizer
 
 logger = logging.getLogger("autotvm")
 
@@ -60,10 +59,7 @@ def __init__(
         log_interval=50,
     ):
         super(SimulatedAnnealingOptimizer, self).__init__()
-
         self.task = task
-        self.dims = [len(x) for x in self.task.config_space.space_map.values()]
-
         self.n_iter = n_iter
         self.temp = temp
         self.persistent = persistent
@@ -84,7 +80,7 @@ def find_maximums(self, model, num, exclusive):
         if self.persistent and self.points is not None:
             points = self.points
         else:
-            points = np.array(sample_ints(0, len(self.task.config_space), self.parallel_size))
+            points = self.task.config_space.sample_ints(self.parallel_size)
 
         scores = model.predict(points)
 
@@ -113,7 +109,7 @@ def find_maximums(self, model, num, exclusive):
         while k < n_iter and k < k_last_modify + early_stop:
             new_points = np.empty_like(points)
             for i, p in enumerate(points):
-                new_points[i] = random_walk(p, self.dims)
+                new_points[i] = self.task.config_space.random_walk(p)
 
             new_scores = model.predict(new_points)
 
@@ -157,32 +153,3 @@ def find_maximums(self, model, num, exclusive):
             self.points = points
 
         return [x[1] for x in heap_items]
-
-
-def random_walk(p, dims):
-    """random walk as local transition
-
-    Parameters
-    ----------
-    p: int
-        index of the ConfigEntity
-    dims: Array of int
-        sizes of each dimension
-
-    Returns
-    -------
-    new_p: int
-        new neighborhood index
-    """
-    # transform to knob form
-    old = point2knob(p, dims)
-    new = list(old)
-
-    # mutate
-    while new == old:
-        from_i = np.random.randint(len(old))
-        to_v = np.random.randint(dims[from_i])
-        new[from_i] = to_v
-
-    # transform to index form
-    return knob2point(new, dims)
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 848265ce17ca..a758a5d4cd9c 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -43,6 +43,7 @@ def __init__(self, task, **kwargs):
         self.recorder = None
 
         self.task = task
+        self.space = self.task.config_space
 
         # keep the current best
         self.best_config = None
diff --git a/python/tvm/autotvm/utils.py b/python/tvm/autotvm/utils.py
index ec3f18daa6c9..75db5208adbe 100644
--- a/python/tvm/autotvm/utils.py
+++ b/python/tvm/autotvm/utils.py
@@ -19,8 +19,6 @@
 import logging
 import time
 
-from random import randrange
-
 import numpy as np
 import tvm.arith
 from tvm.tir import expr
@@ -57,36 +55,6 @@ def get_rank(values):
     return ranks
 
 
-def sample_ints(low, high, m):
-    """
-    Sample m different integer numbers from [low, high) without replacement
-    This function is an alternative of `np.random.choice` when (high - low) > 2 ^ 32, in
-    which case numpy does not work.
-
-    Parameters
-    ----------
-    low: int
-        low point of sample range
-    high: int
-        high point of sample range
-    m: int
-        The number of sampled int
-
-    Returns
-    -------
-    ints: an array of size m
-    """
-    vis = set()
-    assert m <= high - low
-    while len(vis) < m:
-        new = randrange(low, high)
-        while new in vis:
-            new = randrange(low, high)
-        vis.add(new)
-
-    return list(vis)
-
-
 def pool_map(func, args, batch_size, verbose=False, pool=None):
     """A wrapper of multiprocessing.pool.Pool.map to support small-batch mapping
     for large argument list. This can reduce memory usage
diff --git a/python/tvm/topi/adreno/conv2d_nchw.py b/python/tvm/topi/adreno/conv2d_nchw.py
index 082f71364af8..b1f229ebe5dc 100644
--- a/python/tvm/topi/adreno/conv2d_nchw.py
+++ b/python/tvm/topi/adreno/conv2d_nchw.py
@@ -260,7 +260,15 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output):
     cfg.define_split("tile_rx", rx, num_outputs=2)
     cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
     cfg.define_knob("unroll_explicit", [0, 1])
-
+    cfg.multi_filter(
+        filter=lambda entity: (  # pylint: disable=chained-comparison
+            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1]
+        )
+        <= 24
+        and 32
+        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2])
+        < 1024
+    )
     if cfg.is_fallback:
         get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], conv.shape[3])
     ##### space definition end #####
diff --git a/python/tvm/topi/adreno/conv2d_nhwc.py b/python/tvm/topi/adreno/conv2d_nhwc.py
index 993b63252531..644978743b4d 100644
--- a/python/tvm/topi/adreno/conv2d_nhwc.py
+++ b/python/tvm/topi/adreno/conv2d_nhwc.py
@@ -258,7 +258,15 @@ def schedule_conv2d_NHWC(cfg, s, output):
     cfg.define_split("tile_rx", rx, num_outputs=2)
     cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
     cfg.define_knob("unroll_explicit", [0, 1])
-
+    cfg.multi_filter(
+        filter=lambda entity: (  # pylint: disable=chained-comparison
+            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1]
+        )
+        <= 24
+        and 32
+        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2])
+        < 1024
+    )
     if cfg.is_fallback:
         get_default_conv2d_config(cfg, conv.shape[3], conv.shape[1], conv.shape[2])
     ##### space definition end #####
diff --git a/python/tvm/topi/adreno/conv2d_winograd_common.py b/python/tvm/topi/adreno/conv2d_winograd_common.py
index 501773ad46fa..8c62f11c2fe5 100644
--- a/python/tvm/topi/adreno/conv2d_winograd_common.py
+++ b/python/tvm/topi/adreno/conv2d_winograd_common.py
@@ -440,10 +440,9 @@ def schedule_conv2d_winograd(cfg, s, output, pre_computed):
         and entry.size[1] <= 16,
     )
     cfg.define_split("tile_rc", rcc, num_outputs=2)
-    # TODO: Uncomment the following lines when multi_filter will be introduced
-    # cfg.multi_filter(
-    # filter=lambda entity: entity["tile_y"].size[2] * entity["tile_x"].size[2] in range(32,1024)
-    # )
+    cfg.multi_filter(
+        filter=lambda entity: 32 <= (entity["tile_y"].size[2] * entity["tile_x"].size[2]) < 1024
+    )
     ##### space definition end #####
 
     # batch gemm
diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py
index eb998bdbcd6e..8549399fb0d0 100644
--- a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py
+++ b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py
@@ -214,6 +214,15 @@ def schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, output):
     cfg.define_split("tile_rx", rx, num_outputs=2)
     cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
     cfg.define_knob("unroll_explicit", [0, 1])
+    cfg.multi_filter(
+        filter=lambda entity: (  # pylint: disable=chained-comparison
+            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1]
+        )
+        <= 32
+        and 32
+        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2])
+        < 1024
+    )
 
     if cfg.is_fallback:
         get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], conv.shape[3])
diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py
index c27f2a9eae7c..82e128443e85 100644
--- a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py
+++ b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py
@@ -211,6 +211,15 @@ def schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, output):
     cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
     cfg.define_knob("unroll_explicit", [0, 1])
 
+    cfg.multi_filter(
+        filter=lambda entity: (  # pylint: disable=chained-comparison
+            entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1]
+        )
+        <= 32
+        and 32
+        <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2])
+        < 1024
+    )
     if cfg.is_fallback:
         get_default_conv2d_config(cfg, conv.shape[3], conv.shape[1], conv.shape[2])
     ##### space definition end #####
diff --git a/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py b/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py
index 5448a54fae6b..1dbff816699e 100644
--- a/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py
+++ b/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py
@@ -175,8 +175,8 @@ def get_mod():
 
     space = task.config_space
 
-    idx1 = np.random.randint(len(space))
-    idx2 = np.random.randint(len(space))
+    idx1 = space.get_rand_index()
+    idx2 = space.get_rand_index()
 
     cfg = space.get(idx1)
     sch, arg_bufs = task.instantiate(cfg)
diff --git a/tests/python/unittest/test_autotvm_ga_tuner.py b/tests/python/unittest/test_autotvm_ga_tuner.py
new file mode 100644
index 000000000000..625c6c66b6f2
--- /dev/null
+++ b/tests/python/unittest/test_autotvm_ga_tuner.py
@@ -0,0 +1,89 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test genetic algorithm tuner"""
+
+from tvm.testing.autotvm import DummyRunner, get_sample_task
+from tvm import autotvm
+
+
+def test_ga_tuner():
+    """Test GATuner"""
+    # Test population size smaller than space size tuning configuration
+    task, _ = get_sample_task()
+    tuner = autotvm.tuner.GATuner(task, pop_size=32)
+    valid_indexes = list(
+        filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length))
+    )
+    assert tuner.visited.issubset(valid_indexes)
+    assert tuner.pop_size == len(tuner.visited) == len(tuner.genes)
+    assert len(tuner.space) == 64
+
+    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
+    tuner.tune(n_trial=len(tuner.space), measure_option=measure_option)
+    assert tuner.visited.issubset(valid_indexes)
+
+    # Test population size bigger than space size tuning configuration
+    task, _ = get_sample_task()
+    tuner = autotvm.tuner.GATuner(task, pop_size=100)
+    valid_indexes = list(
+        filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length))
+    )
+    assert tuner.visited.issubset(valid_indexes)
+    assert tuner.pop_size == len(tuner.visited) == len(tuner.genes)
+    assert len(tuner.space) == 64
+
+    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
+    tuner.tune(n_trial=len(tuner.space), measure_option=measure_option)
+    assert tuner.visited.issubset(valid_indexes)
+
+    # Test population size smaller than multi-filtered space size tuning configuration
+    task, _ = get_sample_task()
+    task.config_space.multi_filter(
+        filter=lambda entity: 8 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
+    )
+    tuner = autotvm.tuner.GATuner(task, pop_size=32)
+    valid_indexes = list(
+        filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length))
+    )
+    assert tuner.visited.issubset(valid_indexes)
+    assert tuner.pop_size == len(tuner.visited) == len(tuner.genes)
+    assert len(tuner.space) == 43
+
+    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
+    tuner.tune(n_trial=len(tuner.space), measure_option=measure_option)
+    assert tuner.visited.issubset(valid_indexes)
+
+    # Test population size bigger than multi-filtered space size tuning configuration
+    task, _ = get_sample_task()
+    task.config_space.multi_filter(
+        filter=lambda entity: 8 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
+    )
+    tuner = autotvm.tuner.GATuner(task, pop_size=100)
+    valid_indexes = list(
+        filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length))
+    )
+    assert tuner.visited.issubset(valid_indexes)
+    assert tuner.pop_size == len(tuner.visited) == len(tuner.genes)
+    assert len(tuner.space) == 43
+
+    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
+    tuner.tune(n_trial=len(tuner.space), measure_option=measure_option)
+    assert tuner.visited.issubset(valid_indexes)
+
+
+if __name__ == "__main__":
+    test_ga_tuner()
diff --git a/tests/python/unittest/test_autotvm_index_tuner.py b/tests/python/unittest/test_autotvm_index_tuner.py
index be89ee2506fc..514577f1c986 100644
--- a/tests/python/unittest/test_autotvm_index_tuner.py
+++ b/tests/python/unittest/test_autotvm_index_tuner.py
@@ -19,10 +19,9 @@
 import multiprocessing
 from tvm.testing.autotvm import DummyRunner, get_sample_task
 from tvm import autotvm
-from tvm.autotvm.tuner import GridSearchTuner, RandomTuner
 
 
-def test_gridsearch_tuner():
+def test_grid_search_tuner():
     """Test GridSearchTuner"""
 
     task, _ = get_sample_task()
@@ -30,28 +29,60 @@ def test_gridsearch_tuner():
 
     # When no range index, range_length should be the length of config space
     tuner = autotvm.tuner.GridSearchTuner(task)
-    assert tuner.range_length == len(task.config_space)
-    assert tuner.index_offset == 0
+    assert tuner.begin_idx == 0
+    assert tuner.end_idx == 64
+    assert tuner.index == 0
+    assert tuner.range_length == 64
+    assert tuner.visited_max == 64
 
     # With range index, range_length should be the length of the specified range
     tuner = autotvm.tuner.GridSearchTuner(task, range_idx=(8, 15))
+    assert tuner.begin_idx == 8
+    assert tuner.end_idx == 16
+    assert tuner.index == 8
     assert tuner.range_length == 8
-    assert tuner.index_offset == 8
+    assert tuner.visited_max == 8
 
     # Tuner should only focus on the specified range
     tuner.tune(n_trial=8, measure_option=measure_option)
-    assert tuner.counter == 8
+    assert len(tuner.visited) == 8
+    assert not tuner.has_next()
+
+    # With multi-filter
+    task, _ = get_sample_task()
+    task.config_space.multi_filter(
+        filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
+    )
+
+    tuner = autotvm.tuner.GridSearchTuner(task)
+    assert tuner.begin_idx == 0
+    assert tuner.end_idx == 64
+    assert tuner.index == 5
+    assert tuner.range_length == 64
+    assert tuner.visited_max == 34
+
+    # With range index, range_length should be the length of the specified range
+    tuner = autotvm.tuner.GridSearchTuner(task, range_idx=(8, 15))
+    assert tuner.begin_idx == 8
+    assert tuner.end_idx == 16
+    assert tuner.index == 12
+    assert tuner.range_length == 8
+    assert tuner.visited_max == 4
+
+    # Tuner should only focus on the specified range
+    tuner.tune(n_trial=8, measure_option=measure_option)
+    assert len(tuner.visited) == 4
     assert not tuner.has_next()
 
 
 def grid_search_spawn():
     assert multiprocessing.get_spawn_method(False) == "spawn"
-    test_gridsearch_tuner()
+    test_grid_search_tuner()
 
 
 def test_grid_search_tuner_spawn():
     ctx = multiprocessing.get_context("spawn")
-    p = ctx.Process(target=test_gridsearch_tuner)
+    p = ctx.Process(target=test_grid_search_tuner)
     p.start()
     p.join()
 
@@ -63,20 +94,38 @@ def test_random_tuner():
     measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner())
 
     tuner = autotvm.tuner.RandomTuner(task, range_idx=(8, 15))
+    assert tuner.begin_idx == 8
+    assert tuner.end_idx == 16
+    assert tuner.range_length == 8
+    assert tuner.visited_max == 8
+
+    # Tuner should only focus on the specified range and should visit all indices
+    tuner.tune(n_trial=8, measure_option=measure_option)
+    assert len(tuner.visited) == 8
+    assert not tuner.has_next()
+    for idx in tuner.visited:
+        assert 8 <= idx <= 15
+
+    # With multi-filter
+    task, _ = get_sample_task()
+    task.config_space.multi_filter(
+        filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
+    )
+    tuner = autotvm.tuner.RandomTuner(task, range_idx=(8, 15))
+    assert tuner.begin_idx == 8
+    assert tuner.end_idx == 16
     assert tuner.range_length == 8
-    assert tuner.index_offset == 8
+    assert tuner.visited_max == 4
 
     # Tuner should only focus on the specified range and should visit all indices
     tuner.tune(n_trial=8, measure_option=measure_option)
-    assert tuner.counter == 8
+    assert len(tuner.visited) == 4
     assert not tuner.has_next()
-    visited = set()
     for idx in tuner.visited:
-        assert idx not in visited
         assert 8 <= idx <= 15
 
 
 if __name__ == "__main__":
-    test_gridsearch_tuner()
-    test_gridsearch_tuner_spawn()
+    test_grid_search_tuner()
+    test_grid_search_tuner_spawn()
     test_random_tuner()
diff --git a/tests/python/unittest/test_autotvm_space.py b/tests/python/unittest/test_autotvm_space.py
index d9f2b528e429..eb783a9f8bcd 100644
--- a/tests/python/unittest/test_autotvm_space.py
+++ b/tests/python/unittest/test_autotvm_space.py
@@ -16,12 +16,11 @@
 # under the License.
 """Test space definition primitives"""
 
-import tvm
 from tvm import te
 from tvm.autotvm.task.space import ConfigSpace, FallbackConfigEntity
 
 
-def gemm_func(cfg, N):
+def gemm_func(cfg, N, filter_y=None, filter_x=None):
     A = te.placeholder((N, N), name="A")
     B = te.placeholder((N, N), name="B")
 
@@ -32,8 +31,8 @@ def gemm_func(cfg, N):
 
     y, x = s[C].op.axis
 
-    cfg.define_split("tile_y", cfg.axis(y), num_outputs=2)
-    cfg.define_split("tile_x", cfg.axis(x), num_outputs=2)
+    cfg.define_split("tile_y", cfg.axis(y), num_outputs=2, filter=filter_y)
+    cfg.define_split("tile_x", cfg.axis(x), num_outputs=2, filter=filter_x)
 
     return s, [A, B, C]
 
@@ -42,7 +41,7 @@ def test_split():
     cfg = ConfigSpace()
 
     gemm_func(cfg, 128)
-    assert len(cfg) == 64
+    assert cfg.range_length == 64
     assert len(cfg.space_map["tile_y"]) == 8
 
     # test policy
@@ -102,5 +101,163 @@ def count4(n):
         pass
 
 
+def _raises_exception(f):
+    try:
+        f()
+    except Exception:
+        return True
+    return False
+
+
+def test_multi_filter():
+    # create config without multi_filter
+    cfg = ConfigSpace()
+    gemm_func(cfg, 128)
+    # create config with multi_filter
+    cfg_mf = ConfigSpace()
+    gemm_func(cfg_mf, 128)
+    cfg_mf.multi_filter(
+        filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
+    )
+    # test len
+    assert len(cfg) == 64
+    assert len(cfg_mf) == 34
+    # test range_length
+    assert cfg.range_length == 64
+    assert cfg_mf.range_length == 64
+    # test dims
+    assert cfg.dims == [8, 8]
+    assert cfg_mf.dims == [8, 8]
+    # test is_index_valid
+    assert cfg.is_index_valid(0) is True
+    assert cfg.is_index_valid(15) is True
+    assert cfg_mf.is_index_valid(0) is False
+    assert cfg_mf.is_index_valid(15) is True
+    # test get
+    assert _raises_exception(lambda: cfg.get(0)) is False
+    assert _raises_exception(lambda: cfg.get(15)) is False
+    assert _raises_exception(lambda: cfg_mf.get(0)) is True
+    assert _raises_exception(lambda: cfg_mf.get(15)) is False
+    # test subrange_length
+    assert cfg.subrange_length(0, 64) == 64
+    assert cfg.subrange_length(0, 32) == 32
+    assert cfg.subrange_length(16, 32) == 16
+    assert cfg.subrange_length(16, 16) == 0
+    assert _raises_exception(lambda: cfg.subrange_length(0, 128))
+    assert _raises_exception(lambda: cfg.subrange_length(-64, 64))
+    assert _raises_exception(lambda: cfg.subrange_length(64, 0))
+    assert cfg_mf.subrange_length(0, 64) == 34
+    assert cfg_mf.subrange_length(0, 32) == 17
+    assert cfg_mf.subrange_length(16, 32) == 10
+    assert cfg_mf.subrange_length(16, 16) == 0
+    assert _raises_exception(lambda: cfg_mf.subrange_length(0, 128))
+    assert _raises_exception(lambda: cfg_mf.subrange_length(-64, 64))
+    assert _raises_exception(lambda: cfg_mf.subrange_length(64, 0))
+    # test point2knob
+    assert cfg.point2knob(0) == [0, 0]
+    assert cfg.point2knob(4) == [4, 0]
+    assert cfg.point2knob(8) == [0, 1]
+    assert cfg.point2knob(12) == [4, 1]
+    assert cfg_mf.point2knob(0) == [0, 0]
+    assert cfg_mf.point2knob(4) == [4, 0]
+    assert cfg_mf.point2knob(8) == [0, 1]
+    assert cfg_mf.point2knob(12) == [4, 1]
+    # test knob2point
+    assert cfg.knob2point([0, 0]) == 0
+    assert cfg.knob2point([4, 0]) == 4
+    assert cfg.knob2point([0, 1]) == 8
+    assert cfg.knob2point([4, 1]) == 12
+    assert cfg_mf.knob2point([0, 0]) == 0
+    assert cfg_mf.knob2point([4, 0]) == 4
+    assert cfg_mf.knob2point([0, 1]) == 8
+    assert cfg_mf.knob2point([4, 1]) == 12
+    # get_rand_index
+    cfg_valid_indexes = list(filter(lambda idx: cfg.is_index_valid(idx), range(cfg.range_length)))
+    assert cfg.get_rand_index() in cfg_valid_indexes
+    assert cfg.get_rand_index(start=15, end=16) == 15
+    assert 10 <= cfg.get_rand_index(start=10, end=20) < 20
+    assert cfg.get_rand_index(to_exclude=cfg_valid_indexes[:-1]) == cfg_valid_indexes[-1:][0]
+    cfg_mf_valid_indexes = list(
+        filter(lambda idx: cfg_mf.is_index_valid(idx), range(cfg_mf.range_length))
+    )
+    assert cfg_mf.get_rand_index() in cfg_mf_valid_indexes
+    assert cfg_mf.get_rand_index(start=15, end=16) == 15
+    assert 10 <= cfg_mf.get_rand_index(start=10, end=20) < 20
+    assert (
+        cfg_mf.get_rand_index(to_exclude=cfg_mf_valid_indexes[:-1]) == cfg_mf_valid_indexes[-1:][0]
+    )
+    # get_next_index
+    assert cfg.get_next_index(0) == 1
+    assert cfg.get_next_index(0, 1) == 1
+    assert cfg.get_next_index(0, 2) == 2
+    assert cfg.get_next_index(0, -1) is None
+    assert cfg.get_next_index(0, -2) is None
+    assert cfg.get_next_index(63) is None
+    assert cfg.get_next_index(63, 1) is None
+    assert cfg.get_next_index(63, 2) is None
+    assert cfg.get_next_index(63, -1) == 62
+    assert cfg.get_next_index(63, -2) == 61
+    assert cfg.get_next_index(60, 1, end=63) == 61
+    assert cfg.get_next_index(63, -1, start=60) == 62
+    assert cfg_mf.get_next_index(0) == 5
+    assert cfg_mf.get_next_index(0, 1) == 5
+    assert cfg_mf.get_next_index(0, 2) == 6
+    assert cfg_mf.get_next_index(0, -1) is None
+    assert cfg_mf.get_next_index(0, -2) is None
+    assert cfg_mf.get_next_index(63) is None
+    assert cfg_mf.get_next_index(63, 1) is None
+    assert cfg_mf.get_next_index(63, 2) is None
+    assert cfg_mf.get_next_index(63, -1) == 58
+    assert cfg_mf.get_next_index(63, -2) == 57
+    assert cfg_mf.get_next_index(60, 1, end=63) is None
+    assert cfg_mf.get_next_index(63, -1, start=60) is None
+    # test sample_ints
+    cfg_ints = cfg.sample_ints(5)
+    assert len(cfg_ints) == 5
+    assert set(cfg_ints).issubset(cfg_valid_indexes)
+    cfg_mf_ints = cfg_mf.sample_ints(5)
+    assert len(cfg_mf_ints) == 5
+    assert set(cfg_mf_ints).issubset(cfg_mf_valid_indexes)
+    # test random_walk
+    cfg_walk = cfg.random_walk(15)
+    assert cfg_walk != 15
+    assert cfg_walk in cfg_valid_indexes
+    cfg_mf_walk = cfg_mf.random_walk(15)
+    assert cfg_mf_walk != 15
+    assert cfg_mf_walk in cfg_mf_valid_indexes
+
+
+def test_filter_and_multi_filter():
+    # test the order: filter -> multi_filter
+    cfg = ConfigSpace()
+    gemm_func(cfg, 128, filter_y=lambda y: y.size[-1] < 64)
+    # after adding filter
+    assert len(cfg) == 48
+    assert cfg.range_length == 48
+    cfg.multi_filter(
+        filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
+    )
+    # after adding multi_filter
+    assert len(cfg) == 27
+    assert cfg.range_length == 48
+
+    # test the order: multi_filter -> filter
+    cfg = ConfigSpace()
+    s, (A, B, C) = gemm_func(cfg, 128, filter_y=None)
+    cfg.multi_filter(
+        filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024
+    )
+    # after adding multi_filter
+    assert len(cfg) == 34
+    assert cfg.range_length == 64
+    y, x = s[C].op.axis
+    cfg.define_split("tile_y", cfg.axis(y), num_outputs=2, filter=lambda y: y.size[-1] < 64)
+    # after adding filter
+    assert len(cfg) == 27
+    assert cfg.range_length == 48
+
+
 if __name__ == "__main__":
     test_split()
+    test_multi_filter()
+    test_filter_and_multi_filter()

From 195ae72b5c6f0df68fac41f7808d125d155a6345 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 23 Sep 2022 03:26:05 +0900
Subject: [PATCH 228/704] [TOPI] Fix dtype legalize logic for CPU dot product
 instruction (#12865)

The logic in `python/tvm/topi/generic/conv2d.py#L480-L499` is supposed to legalize the input dtype to be able to apply target-specific intrinsics that only support one of int8 or uint8. For example, the x86 VNNI instruction only supports uint8 activation.

But the logic is incorrect (two cases are flipped) and leads to incorrect result in the following case:

* The input activation is int8, and we want to use the x86 VNNI intrinsic which only supports uint8 activations.
* The input activation is uint8, and we want to use the ARM `sdot` intrinsic which only supports int8 activations.

The first case also applies to the Hexagon `vrmpy` intrinsic. I found this bug while testing `vrmpy` conv2d on int8 input.

To test this on CI, we need to be running on a cascadelake or ARM v8.2 (with dot product support) instance. I cannot find a way to detect such cpu feature from a python script. `try / catch` doesn't work because the error is raised from LLVM (`LLVM ERROR: Do not know how to split the result of this operator`) that I don't know how to catch. So for now the test is skipped.
---
 python/tvm/topi/generic/conv2d.py    | 15 +++---
 tests/python/relay/test_op_level2.py | 75 ++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py
index 1cb69d593d1a..48b2a2f97146 100644
--- a/python/tvm/topi/generic/conv2d.py
+++ b/python/tvm/topi/generic/conv2d.py
@@ -477,7 +477,7 @@ def conv2d_alter_int8_common(
     pt, pl, pb, pr = get_pad_tuple(padding, (kh, kw))
 
     if data_tensor.dtype != data_dtype:
-        # How to convert data to int8
+        # How to convert data to uint8
         # Original --> C = A (conv) B
         # A and B are int8
         #   C = (A + 128 - 128) (conv) B
@@ -485,18 +485,20 @@ def conv2d_alter_int8_common(
         # where A' = A + 128
         # and 128 (conv) B is basically a reduce on CRS axis for weights.
         #
-        # How to convert data to uint8
+        # How to convert data to int8
         #   C = (A - 128 + 128) (conv) B
         #   C = (A' conv B) + 128 (conv) B
         # where A' = A - 128
-        if data_dtype == "int8":
-            # shift data to int8
+        if data_dtype == "uint8":
+            # shift data to uint8
             before_shift = relay.add
             after_shift = relay.subtract
+            pad_value = 128
         else:
-            # shift data to uint8
+            # shift data to int8
             before_shift = relay.subtract
             after_shift = relay.add
+            pad_value = -128
 
         if attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO":
             adjust_shift = relay.sum(relay.cast(kernel, dtype="int32"), axis=(0, 1, 2))
@@ -514,7 +516,8 @@ def conv2d_alter_int8_common(
 
         # Do external padding as pad value has to be 128.
         if any(padding):
-            data = relay.nn.pad(data, pad_width=pad_width, pad_value=128)
+            data = relay.nn.pad(data, pad_width=pad_width, pad_value=pad_value)
+
         new_attrs["padding"] = (0, 0)
 
         # Multiply 128 to adjust shift.
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 84b72e4cffd2..6a895aaf0518 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -2137,5 +2137,80 @@ def get_subgraph(dtype):
             np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5)
 
 
+@pytest.mark.skip("Requires cascadelake or ARM v8.2")
+def test_conv2d_int8_alter_dtype():
+    def get_conv2d_nchw(
+        d_shape,
+        w_shape,
+        data_dtype,
+    ):
+        out_dtype = "int32"
+        strides = (1, 1)
+        padding = (1, 1)
+        data = relay.var("data", shape=d_shape, dtype=data_dtype)
+        weight = relay.var("weight", shape=w_shape, dtype="int8")
+        out_channel = w_shape[0]
+        return relay.nn.conv2d(
+            data=data,
+            weight=weight,
+            kernel_size=w_shape[2:],
+            channels=out_channel,
+            padding=padding,
+            strides=strides,
+            out_dtype=out_dtype,
+        )
+
+    I, O, H, W = 64, 64, 56, 56
+    kH = kW = 3
+
+    data_shape = (1, I, H, W)
+    weight_shape = (O, I, kH, kW)
+    bias_shape = (1, weight_shape[0], 1, 1)
+
+    bias = relay.var("bias", shape=bias_shape, dtype="int32")
+    bias_np = np.random.randint(low=-127, high=128, size=bias_shape).astype("int32")
+    weight_np = np.random.uniform(-128, 127, size=weight_shape).astype("int8")
+
+    for data_dtype, target, dot_product_instr in [
+        ("uint8", "llvm --device arm_cpu -mattr=+v8.2a,+dotprod", "sdot"),
+        ("int8", "llvm -mcpu=cascadelake", "vpdpbusd"),
+    ]:
+        conv2d = get_conv2d_nchw(data_shape, weight_shape, data_dtype)
+        bias_add = relay.add(conv2d, bias)
+        mod = tvm.IRModule.from_expr(bias_add)
+
+        if data_dtype == "uint8":
+            data_np = np.random.uniform(0, 255, size=data_shape).astype("uint8")
+        else:
+            data_np = np.random.uniform(-128, 127, size=data_shape).astype("int8")
+
+        params = {"weight": weight_np, "bias": bias_np}
+
+        ref = (
+            relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
+            .evaluate()(*[data_np, weight_np, bias_np])
+            .numpy()
+        )
+
+        dev = tvm.cpu(0)
+
+        with tvm.transform.PassContext(
+            opt_level=3,
+        ):
+            lib = relay.build(mod, target=target, params=params)
+
+        assert dot_product_instr in lib.lib.get_source("asm")
+
+        rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+
+        rt_mod.set_input("data", data_np)
+
+        rt_mod.run()
+
+        out = rt_mod.get_output(0).numpy()
+
+        np.testing.assert_equal(out, ref)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 86f9580498e7d4b5c826e7ae55b05f2a4e35a95c Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 22 Sep 2022 13:19:14 -0700
Subject: [PATCH 229/704] [Relay] Fix handling of TransfromLayout in TE
 compiler cache (#12874)

---
 src/relay/backend/te_compiler_cache.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index a8eb6a58105f..17eac443ffe3 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -374,7 +374,7 @@ class ScheduleBuilder : public ExprVisitor {
             TuningRecord record = opt_record.value();
             for (const Instruction& inst : record->trace->insts) {
               if (inst->kind.same_as(kind_transform_layout)) {
-                ICHECK_EQ(inst->attrs.size(), 3);
+                ICHECK_EQ(inst->attrs.size(), 4);
                 MetaScheduleLayoutRewriter::LayoutQueuePush(Downcast<IndexMap>(inst->attrs[2]));
               }
             }

From 4e783a6087fd236c588cde30e0ac99daa15afe61 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 22 Sep 2022 13:20:40 -0700
Subject: [PATCH 230/704] [TOPI] Add layer norm operator (#12864)

* [TOPI] Add one-pass layer norm using tuple reduction

* Add reducer pattern for LowerCrossThreadReduction

* lint

* update docs
---
 include/tvm/topi/nn/layer_norm.h              | 117 ++++++++++++++
 include/tvm/topi/reduction.h                  |  23 +++
 python/tvm/topi/nn/__init__.py                |   1 +
 python/tvm/topi/nn/layer_norm.py              |  46 ++++++
 python/tvm/topi/testing/__init__.py           |   1 +
 python/tvm/topi/testing/layer_norm_python.py  |  53 +++++++
 src/tir/schedule/primitive/reduction.cc       |   9 ++
 src/topi/nn.cc                                |   6 +
 .../topi/python/test_topi_layer_norm.py       |  62 ++++++++
 ..._transform_lower_cross_thread_reduction.py | 149 ++++++++++++++++++
 10 files changed, 467 insertions(+)
 create mode 100644 include/tvm/topi/nn/layer_norm.h
 create mode 100644 python/tvm/topi/nn/layer_norm.py
 create mode 100644 python/tvm/topi/testing/layer_norm_python.py
 create mode 100644 tests/python/topi/python/test_topi_layer_norm.py

diff --git a/include/tvm/topi/nn/layer_norm.h b/include/tvm/topi/nn/layer_norm.h
new file mode 100644
index 000000000000..93e5582ef184
--- /dev/null
+++ b/include/tvm/topi/nn/layer_norm.h
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \brief layer normalization op constructions
+ * \file nn/layer_norm.h
+ */
+#ifndef TVM_TOPI_NN_LAYER_NORM_H_
+#define TVM_TOPI_NN_LAYER_NORM_H_
+
+#include <tvm/te/operation.h>
+#include <tvm/topi/tags.h>
+
+#include <string>
+
+namespace tvm {
+namespace topi {
+namespace nn {
+
+using namespace tvm::te;
+
+/*!
+ * \brief Layer normalization.
+ * \param data N-D tensor with shape [d_0, d_1, ..., d_{N-1}]
+ * \param gamma K-D tensor with shape [r_0, r_1, ..., r_{K-1}] where K == len(axis) and
+ *              d_{axis_k} == r_k
+ * \param beta Optional, K-D tensor with shape [r_0, r_1, ..., r_{K-1}] where
+ *             d_{axis_k} == r_k
+ * \param axis The axis to normalize over.
+ * \param epsilon The epsilon value to avoid division by zero.
+ * \param name The name of the operation.
+ * \param tag The tag to mark the operation.
+ * \return The normalized tensor, with the same shape as data.
+ */
+inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor& beta,
+                         const Array<Integer>& axis, double epsilon,
+                         std::string name = "T_layer_norm", std::string tag = kInjective) {
+  // sum x and x^2
+  auto ndim = data->shape.size();
+  ICHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
+  auto real_axis = GetRealAxis(static_cast<int>(ndim), axis);
+  auto reduce_axes = MakeReduceAxes(real_axis, data);
+  auto target_shape =
+      MakeReduceTargetShape(real_axis, data, /*keepdims=*/false, /*atleast1d=*/true);
+  auto func = MakeTupleSumReducer();
+
+  auto compute = [ndim, &real_axis, &reduce_axes, &func, &data](const Array<Var>& indices) {
+    Array<PrimExpr> eval_range;
+    int arg_counter = 0;
+    int red_counter = 0;
+
+    for (size_t i = 0; i < ndim; ++i) {
+      if (std::find(real_axis.begin(), real_axis.end(), i) != real_axis.end()) {
+        // real_axis contains i
+        eval_range.push_back(reduce_axes[red_counter]);
+        red_counter++;
+      } else {
+        eval_range.push_back(indices[arg_counter]);
+        arg_counter++;
+      }
+    }
+    auto square = [](const PrimExpr& x) { return x * x; };
+    return func({data(eval_range), square(data(eval_range))}, reduce_axes, nullptr);
+  };
+
+  auto temp_x_x2 =
+      tvm::te::compute(target_shape, compute, data->op->name + "_red_temp", kCommReduce);
+
+  auto temp_x = temp_x_x2[0];
+  auto temp_x2 = temp_x_x2[1];
+
+  auto reduce_extent = make_const(data->dtype, 1);
+  for (int i : real_axis) {
+    reduce_extent *= data->shape[i];
+  }
+  auto layer_norm_func = [&](const Array<Var>& indices) {
+    Array<Var> reduce_indices, non_reduce_indices;
+    for (int i = 0, n = static_cast<int>(indices.size()); i < n; ++i) {
+      if (std::find(real_axis.begin(), real_axis.end(), i) != real_axis.end()) {
+        reduce_indices.push_back(indices[i]);
+      } else {
+        non_reduce_indices.push_back(indices[i]);
+      }
+    }
+    auto mean = temp_x(non_reduce_indices) / reduce_extent;
+    auto var = temp_x2(non_reduce_indices) / reduce_extent - mean * mean;
+    auto layer_norm = (data(indices) - mean) * tvm::rsqrt(var + make_const(var->dtype, epsilon));
+    layer_norm = topi::multiply(layer_norm, gamma(reduce_indices));
+    if (beta.defined()) {
+      layer_norm = topi::add(layer_norm, beta(reduce_indices));
+    }
+    return layer_norm;
+  };
+  return tvm::te::compute(data->shape, layer_norm_func, name, tag);
+}
+
+}  // namespace nn
+}  // namespace topi
+}  // namespace tvm
+
+#endif  // TVM_TOPI_NN_LAYER_NORM_H_
diff --git a/include/tvm/topi/reduction.h b/include/tvm/topi/reduction.h
index d4e420d80b02..5e79bd429d6f 100644
--- a/include/tvm/topi/reduction.h
+++ b/include/tvm/topi/reduction.h
@@ -570,6 +570,29 @@ inline Tensor prod(const Tensor& data, const Array<Integer>& axis, bool keepdims
   return CommReduce(data, axis, ProdOp, keepdims, atleast1d);
 }
 
+/*!
+ * \brief Create communitive reducer summing over tuples
+ */
+inline FCommReduce MakeTupleSumReducer() {
+  auto fcombine = [](Array<Var> lhs, Array<Var> rhs) {
+    Array<PrimExpr> result;
+    ICHECK_EQ(lhs.size(), rhs.size());
+    result.reserve(lhs.size());
+    for (size_t i = 0; i < lhs.size(); ++i) {
+      result.push_back(lhs[i] + rhs[i]);
+    }
+    return result;
+  };
+  auto fidentity = [](std::vector<DataType> types) {
+    Array<PrimExpr> result;
+    for (size_t i = 0; i < types.size(); ++i) {
+      result.push_back(tvm::tir::make_const(types[i], 0));
+    }
+    return result;
+  };
+  return MakeCommReducer(fcombine, fidentity, "tuple_sum");
+}
+
 }  // namespace topi
 }  // namespace tvm
 #endif  // TVM_TOPI_REDUCTION_H_
diff --git a/python/tvm/topi/nn/__init__.py b/python/tvm/topi/nn/__init__.py
index 1dd922d76819..8f081242fa10 100644
--- a/python/tvm/topi/nn/__init__.py
+++ b/python/tvm/topi/nn/__init__.py
@@ -38,6 +38,7 @@
 from .bnn import *
 from .qnn import *
 from .upsampling import *
+from .layer_norm import layer_norm
 from .local_response_norm import *
 from .bitserial_conv2d import *
 from .bitserial_dense import *
diff --git a/python/tvm/topi/nn/layer_norm.py b/python/tvm/topi/nn/layer_norm.py
new file mode 100644
index 000000000000..3bdeaaac61a5
--- /dev/null
+++ b/python/tvm/topi/nn/layer_norm.py
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Layer normalization operator."""
+from .. import cpp
+
+
+def layer_norm(data, gamma, beta, axis, epsilon=1e-5):
+    """Layer normalization operator.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        N-D with shape (d_0, d_1, ..., d_{N-1})
+
+    gamma: tvm.te.Tensor
+        K-D with shape (r_0, r_1, ..., r_{K-1}) where K == len(axis) and d_{axis_k} == r_k
+
+    beta: tvm.te.Tensor
+        Optional, K-D with shape (r_0, r_1, ..., r_{K-1}) where K == len(axis) and d_{axis_k} == r_k
+
+    axis : list of int
+        Axis over the normalization applied
+
+    epsilon : float
+        The epsilon value to avoid division by zero.
+
+    Returns
+    -------
+    result : tvm.te.Tensor
+        N-D with shape (d_0, d_1, ..., d_{N-1})
+    """
+    return cpp.nn.layer_norm(data, gamma, beta, axis, epsilon)
diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py
index 2f091cba10b7..2922c30b505c 100644
--- a/python/tvm/topi/testing/__init__.py
+++ b/python/tvm/topi/testing/__init__.py
@@ -43,6 +43,7 @@
 from .reorg_python import reorg_python
 from .roi_align_python import roi_align_nchw_python, roi_align_nhwc_python
 from .roi_pool_python import roi_pool_nchw_python
+from .layer_norm_python import layer_norm_python
 from .lrn_python import lrn_python
 from .l2_normalize_python import l2_normalize_python
 from .gather_python import gather_python
diff --git a/python/tvm/topi/testing/layer_norm_python.py b/python/tvm/topi/testing/layer_norm_python.py
new file mode 100644
index 000000000000..6b3b00146983
--- /dev/null
+++ b/python/tvm/topi/testing/layer_norm_python.py
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals
+"""Layer normalization in python"""
+import numpy as np
+
+
+def layer_norm_python(data, gamma, beta, axis, epsilon=1e-5):
+    """Layer normalization operator in Python.
+
+    Parameters
+    ----------
+    data : numpy.ndarray
+        N-D with shape (d_0, d_1, ..., d_{N-1})
+
+    gamma: numpy.ndarray
+        K-D with shape (r_0, r_1, ..., r_{K-1}) where K == len(axis) and d_{axis_k} == r_k
+
+    beta: numpy.ndarray
+        Optional, K-D with shape (r_0, r_1, ..., r_{K-1}) where K == len(axis) and d_{axis_k} == r_k
+
+    axis : int or tuple of ints
+        Axis over the normalization applied
+
+    epsilon : float
+        The epsilon value to avoid division by zero.
+
+    Returns
+    -------
+    result : np.ndarray
+        N-D with shape (d_0, d_1, ..., d_{N-1})
+    """
+    mean = np.mean(data, axis, keepdims=True)
+    var = np.var(data, axis, keepdims=True)
+    result = (data - mean) / np.sqrt(var + epsilon)
+    result *= gamma
+    if beta is not None:
+        result += beta
+    return result
diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc
index dd2bcf727c40..bb43df1ce914 100644
--- a/src/tir/schedule/primitive/reduction.cc
+++ b/src/tir/schedule/primitive/reduction.cc
@@ -330,6 +330,15 @@ struct ReducerRegistry {
                 [](const Array<PrimExpr>& values) {
                   return Array<PrimExpr>{min_value(values[0]->dtype)};
                 }),
+            CreateReducerGetter(
+                /*n_buffers=*/2,
+                [](const Array<Var>& x, const Array<Var>& y) {
+                  return Array<PrimExpr>{x[0] + y[0], x[1] + y[1]};
+                },
+                [](const Array<PrimExpr>& values) {
+                  return Array<PrimExpr>{make_const(values[0]->dtype, 0),
+                                         make_const(values[1]->dtype, 0)};
+                }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
                 [](const Array<Var>& x, const Array<Var>& y) {
diff --git a/src/topi/nn.cc b/src/topi/nn.cc
index 2950aee4e90d..35dbf3a03e4f 100644
--- a/src/topi/nn.cc
+++ b/src/topi/nn.cc
@@ -29,6 +29,7 @@
 #include <tvm/topi/nn/dense.h>
 #include <tvm/topi/nn/dilate.h>
 #include <tvm/topi/nn/flatten.h>
+#include <tvm/topi/nn/layer_norm.h>
 #include <tvm/topi/nn/local_response_norm.h>
 #include <tvm/topi/nn/mapping.h>
 #include <tvm/topi/nn/pooling.h>
@@ -157,5 +158,10 @@ TVM_REGISTER_GLOBAL("topi.nn.binary_dense").set_body([](TVMArgs args, TVMRetValu
   *rv = nn::binary_dense(args[0], args[1]);
 });
 
+/* Ops from nn/layer_norm.h */
+TVM_REGISTER_GLOBAL("topi.nn.layer_norm").set_body([](TVMArgs args, TVMRetValue* rv) {
+  *rv = nn::layer_norm(args[0], args[1], args[2], args[3], static_cast<double>(args[4]));
+});
+
 }  // namespace topi
 }  // namespace tvm
diff --git a/tests/python/topi/python/test_topi_layer_norm.py b/tests/python/topi/python/test_topi_layer_norm.py
new file mode 100644
index 000000000000..ead05470be3b
--- /dev/null
+++ b/tests/python/topi/python/test_topi_layer_norm.py
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test code for layer_norm."""
+import numpy as np
+import pytest
+import tvm
+from tvm import te
+from tvm import topi
+from tvm.topi.utils import get_const_tuple
+import tvm.topi.testing
+
+import tvm.testing
+
+
+_layer_norm_schedule = {
+    "generic": topi.generic.schedule_injective,
+}
+
+
+# only test on llvm because schedule is missing
+@tvm.testing.parametrize_targets("llvm")
+@pytest.mark.parametrize("shape,axis", [([4, 16], (1,)), ([4, 16, 16], (1, 2))])
+def test_layer_norm(target, dev, shape, axis, episilon=1e-5, dtype="float32", rtol=1e-5, atol=1e-5):
+    data = te.placeholder(shape, dtype=dtype, name="data")
+    scale_shape = [shape[dim] for dim in axis]
+    gamma = te.placeholder(scale_shape, dtype=dtype, name="gamma")
+    beta = te.placeholder(scale_shape, dtype=dtype, name="beta")
+    B = topi.nn.layer_norm(data, gamma, beta, axis, episilon)
+
+    data_np = np.random.uniform(size=shape).astype(dtype)
+    gamma_np = np.random.uniform(size=scale_shape).astype(dtype)
+    beta_np = np.random.uniform(size=scale_shape).astype(dtype)
+    b_np = tvm.topi.testing.layer_norm_python(data_np, gamma_np, beta_np, axis, episilon)
+
+    with tvm.target.Target(target):
+        s_func = tvm.topi.testing.dispatch(target, _layer_norm_schedule)
+        s = s_func([B])
+    data_tvm = tvm.nd.array(data_np, dev)
+    gamma_tvm = tvm.nd.array(gamma_np, dev)
+    beta_tvm = tvm.nd.array(beta_np, dev)
+    b_tvm = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
+    f = tvm.build(s, [data, gamma, beta, B], target)
+    f(data_tvm, gamma_tvm, beta_tvm, b_tvm)
+    tvm.testing.assert_allclose(b_tvm.asnumpy(), b_np, rtol=rtol, atol=atol)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
index ff1353d2265e..8c139b710e23 100644
--- a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
+++ b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
@@ -1002,6 +1002,151 @@ def lowered_argmin_split_init_update_reordered(
                 argmin_v1[i] = cross_thread_argmin_v1[0]
 
 
+@T.prim_func
+def layer_norm_tuple_sum(
+    data: T.Buffer[(128, 768), "float32"],
+    gamma: T.Buffer[768, "float32"],
+    bias: T.Buffer[768, "float32"],
+    T_layer_norm: T.Buffer[(128, 768), "float32"],
+) -> None:
+    data_red_temp_v0 = T.alloc_buffer([128], dtype="float32")
+    data_red_temp_v1 = T.alloc_buffer([128], dtype="float32")
+    for i0_fused in T.thread_binding(128, thread="blockIdx.x"):
+        for i1_0 in T.serial(24):
+            for i1_1 in T.thread_binding(32, thread="threadIdx.x"):
+                with T.block("data_red_temp"):
+                    ax0 = T.axis.spatial(128, i0_fused)
+                    k1 = T.axis.reduce(768, i1_0 * 32 + i1_1)
+                    T.reads(data[ax0, k1])
+                    T.writes(data_red_temp_v0[ax0], data_red_temp_v1[ax0])
+                    with T.init():
+                        data_red_temp_v0[ax0] = T.float32(0)
+                        data_red_temp_v1[ax0] = T.float32(0)
+                    v_data_red_temp_v0: T.float32 = data_red_temp_v0[ax0] + data[ax0, k1]
+                    v_data_red_temp_v1: T.float32 = (
+                        data_red_temp_v1[ax0] + data[ax0, k1] * data[ax0, k1]
+                    )
+                    data_red_temp_v0[ax0] = v_data_red_temp_v0
+                    data_red_temp_v1[ax0] = v_data_red_temp_v1
+    for i0_i1_fused_0 in T.thread_binding(384, thread="blockIdx.x"):
+        for i0_i1_fused_1 in T.thread_binding(256, thread="threadIdx.x"):
+            with T.block("T_layer_norm"):
+                ax0 = T.axis.spatial(128, (i0_i1_fused_0 * 256 + i0_i1_fused_1) // 768)
+                ax1 = T.axis.spatial(768, (i0_i1_fused_0 * 256 + i0_i1_fused_1) % 768)
+                T.reads(
+                    data[ax0, ax1],
+                    data_red_temp_v0[ax0],
+                    data_red_temp_v1[ax0],
+                    gamma[ax1],
+                    bias[ax1],
+                )
+                T.writes(T_layer_norm[ax0, ax1])
+                T_layer_norm[ax0, ax1] = (
+                    data[ax0, ax1] - data_red_temp_v0[ax0] * T.float32(0.0013020833333333333)
+                ) * T.rsqrt(
+                    data_red_temp_v1[ax0] * T.float32(0.0013020833333333333)
+                    - data_red_temp_v0[ax0]
+                    * T.float32(0.0013020833333333333)
+                    * (data_red_temp_v0[ax0] * T.float32(0.0013020833333333333))
+                    + T.float32(1.0000000000000001e-05),
+                    dtype="float32",
+                ) * gamma[
+                    ax1
+                ] + bias[
+                    ax1
+                ]
+
+
+@T.prim_func
+def lowered_layer_norm_tuple_sum(
+    data: T.Buffer[(128, 768), "float32"],
+    gamma: T.Buffer[768, "float32"],
+    bias: T.Buffer[768, "float32"],
+    T_layer_norm: T.Buffer[(128, 768), "float32"],
+) -> None:
+    # with T.block("root")
+    data_red_temp_v0 = T.alloc_buffer([128], dtype="float32")
+    data_red_temp_v1 = T.alloc_buffer([128], dtype="float32")
+    cross_thread_data_red_temp_v0 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local")
+    cross_thread_data_red_temp_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local")
+    in_thread_data_red_temp_v0 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local")
+    in_thread_data_red_temp_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local")
+    for i0_fused in T.thread_binding(128, thread="blockIdx.x"):
+        for i1_1 in T.thread_binding(32, thread="threadIdx.x"):
+            with T.block("data_red_temp_in_thread_init"):
+                T.reads()
+                T.writes(in_thread_data_red_temp_v0[0], in_thread_data_red_temp_v1[0])
+                in_thread_data_red_temp_v0[0] = T.float32(0)
+                in_thread_data_red_temp_v1[0] = T.float32(0)
+            for i1_0 in T.serial(24):
+                with T.block("data_red_temp_in_thread"):
+                    ax0 = T.axis.spatial(128, i0_fused)
+                    k1 = T.axis.reduce(768, i1_0 * 32 + i1_1)
+                    T.reads(data[ax0, k1])
+                    T.writes(in_thread_data_red_temp_v0[0], in_thread_data_red_temp_v1[0])
+                    v_data_red_temp_v0: T.float32 = in_thread_data_red_temp_v0[0] + data[ax0, k1]
+                    v_data_red_temp_v1: T.float32 = (
+                        in_thread_data_red_temp_v1[0] + data[ax0, k1] * data[ax0, k1]
+                    )
+                    in_thread_data_red_temp_v0[0] = v_data_red_temp_v0
+                    in_thread_data_red_temp_v1[0] = v_data_red_temp_v1
+            with T.block("data_red_temp_cross_thread"):
+                T.reads(in_thread_data_red_temp_v0[0], in_thread_data_red_temp_v1[0])
+                T.writes(cross_thread_data_red_temp_v0[0], cross_thread_data_red_temp_v1[0])
+                T.attr(
+                    T.comm_reducer(
+                        lambda x0, x1, y0, y1: (x0 + y0, x1 + y1), [T.float32(0), T.float32(0)]
+                    ),
+                    "reduce_scope",
+                    T.reinterpret(T.uint64(0), dtype="handle"),
+                )
+                T.evaluate(
+                    T.tvm_thread_allreduce(
+                        T.uint32(2),
+                        in_thread_data_red_temp_v0[0],
+                        in_thread_data_red_temp_v1[0],
+                        True,
+                        cross_thread_data_red_temp_v0[0],
+                        cross_thread_data_red_temp_v1[0],
+                        i1_1,
+                        dtype="handle",
+                    )
+                )
+            with T.block("data_red_temp_write_back"):
+                ax0 = T.axis.spatial(128, i0_fused)
+                T.reads(cross_thread_data_red_temp_v0[0], cross_thread_data_red_temp_v1[0])
+                T.writes(data_red_temp_v0[ax0], data_red_temp_v1[ax0])
+                data_red_temp_v0[ax0] = cross_thread_data_red_temp_v0[0]
+                data_red_temp_v1[ax0] = cross_thread_data_red_temp_v1[0]
+    for i0_i1_fused_0 in T.thread_binding(384, thread="blockIdx.x"):
+        for i0_i1_fused_1 in T.thread_binding(256, thread="threadIdx.x"):
+            with T.block("T_layer_norm"):
+                ax0 = T.axis.spatial(128, (i0_i1_fused_0 * 256 + i0_i1_fused_1) // 768)
+                ax1 = T.axis.spatial(768, (i0_i1_fused_0 * 256 + i0_i1_fused_1) % 768)
+                T.reads(
+                    data[ax0, ax1],
+                    data_red_temp_v0[ax0],
+                    data_red_temp_v1[ax0],
+                    gamma[ax1],
+                    bias[ax1],
+                )
+                T.writes(T_layer_norm[ax0, ax1])
+                T_layer_norm[ax0, ax1] = (
+                    data[ax0, ax1] - data_red_temp_v0[ax0] * T.float32(0.0013020833333333333)
+                ) * T.rsqrt(
+                    data_red_temp_v1[ax0] * T.float32(0.0013020833333333333)
+                    - data_red_temp_v0[ax0]
+                    * T.float32(0.0013020833333333333)
+                    * (data_red_temp_v0[ax0] * T.float32(0.0013020833333333333))
+                    + T.float32(1.0000000000000001e-05),
+                    dtype="float32",
+                ) * gamma[
+                    ax1
+                ] + bias[
+                    ax1
+                ]
+
+
 # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
 
 
@@ -1087,5 +1232,9 @@ def test_lower_te():
     )  # LowerCrossThreadReduction should do nothing on TE
 
 
+def test_layer_norm_tuple_sum():
+    _check(layer_norm_tuple_sum, lowered_layer_norm_tuple_sum)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 0b074d8f06dc411bcf779e8b59645626228bf5ce Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 22 Sep 2022 14:38:41 -0700
Subject: [PATCH 231/704] Fix clang warnings (#12876)

---
 include/tvm/tir/stmt_functor.h                   | 2 +-
 src/relay/transforms/annotate_texture_storage.cc | 4 ++--
 src/script/printer/doc.cc                        | 2 +-
 src/tir/schedule/primitive/pad_einsum.cc         | 5 +++--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index 2fc3b9678b40..17530380e665 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -442,7 +442,7 @@ template <typename Node, typename = std::enable_if_t<std::is_base_of_v<StmtNode,
 bool ContainsNode(const Stmt& stmt) {
   struct Visitor : StmtVisitor {
     // Early bail-out, if we already found the node.
-    void VisitStmt(const Stmt& stmt) {
+    void VisitStmt(const Stmt& stmt) final {
       if (contains_node) {
         return;
       }
diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc
index c9cf45e06929..6904c6b5d7cc 100644
--- a/src/relay/transforms/annotate_texture_storage.cc
+++ b/src/relay/transforms/annotate_texture_storage.cc
@@ -444,7 +444,7 @@ class RewriteVDStorageScopes : public transform::DeviceAwareExprMutator {
       c->virtual_device_ =
           VirtualDevice(virtual_device->device_type(), virtual_device->virtual_device_id,
                         virtual_device->target, storage_scope_[GetRef<Expr>(vn)][Expr()][0]);
-      return c;
+      return std::move(c);
     }
     return GetRef<Var>(vn);
   }
@@ -520,7 +520,7 @@ class RewriteVDStorageScopes : public transform::DeviceAwareExprMutator {
                                  virtual_device->target, memory_scope),
                    true);
     }
-    return new_call;
+    return std::move(new_call);
   }
 
  private:
diff --git a/src/script/printer/doc.cc b/src/script/printer/doc.cc
index f3b431bd62db..1ca7ced8e8a7 100644
--- a/src/script/printer/doc.cc
+++ b/src/script/printer/doc.cc
@@ -30,7 +30,7 @@ ExprDoc ExprDocNode::Attr(String attr) const { return AttrAccessDoc(GetRef<ExprD
 ExprDoc ExprDocNode::Attr(TracedObject<String> attr) const {
   auto doc = AttrAccessDoc(GetRef<ExprDoc>(this), attr.Get());
   doc->source_paths.push_back(attr.GetPath());
-  return doc;
+  return std::move(doc);
 }
 
 ExprDoc ExprDocNode::operator[](Array<Doc> indices) const {
diff --git a/src/tir/schedule/primitive/pad_einsum.cc b/src/tir/schedule/primitive/pad_einsum.cc
index 7a7b88d686f9..2190dc69d33d 100644
--- a/src/tir/schedule/primitive/pad_einsum.cc
+++ b/src/tir/schedule/primitive/pad_einsum.cc
@@ -227,6 +227,8 @@ class PadEinsumRewriter : public ReplaceBufferMutator {
         producer_predicate_(producer_predicate),
         padded_iter_extents_(padded_iter_extents),
         analyzer_(analyzer) {}
+  using ReplaceBufferMutator::VisitExpr_;
+  using ReplaceBufferMutator::VisitStmt_;
 
   Stmt VisitStmt_(const ForNode* op) final {
     For new_for = Downcast<For>(ReplaceBufferMutator::VisitStmt_(op));
@@ -371,8 +373,7 @@ void PadEinsum(ScheduleState self, const StmtSRef& block_sref, const Array<Integ
   Map<Buffer, Buffer> buffer_remap;  // mapping from buffers to new buffers with padded shapes
 
   // Utility function to pad a buffer with the new shape
-  auto f_pad_buffer = [&padded_iter_extents, &buffer_remap](Buffer buffer,
-                                                            const Array<Var>& indices) -> Buffer {
+  auto f_pad_buffer = [&padded_iter_extents](Buffer buffer, const Array<Var>& indices) -> Buffer {
     Array<PrimExpr> new_shape;
     for (const Var& index : indices) {
       new_shape.push_back(padded_iter_extents.at(index));

From ce8ac3e78454462025c03197767af764387df4ff Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 22 Sep 2022 20:43:10 -0700
Subject: [PATCH 232/704] [TIR] Allow missing TensorIntrin in registry lookup
 (#12875)

Added an option to allow missing tensor intrin.
---
 include/tvm/tir/function.h                           |  7 +++++--
 python/tvm/tir/function.py                           | 12 ++++++++----
 .../schedule_rule/multi_level_tiling_tensor_core.cc  |  2 +-
 .../schedule_rule/multi_level_tiling_with_intrin.cc  |  2 +-
 src/tir/ir/function.cc                               | 12 +++++++++---
 src/tir/schedule/concrete_schedule.cc                |  4 ++--
 src/tir/schedule/transform.cc                        |  2 +-
 tests/python/unittest/test_tir_schedule_tensorize.py |  7 +++++++
 8 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h
index bb6287c69b16..d793d84fc677 100644
--- a/include/tvm/tir/function.h
+++ b/include/tvm/tir/function.h
@@ -234,10 +234,13 @@ class TensorIntrin : public ObjectRef {
   /*!
    * \brief Look up TensorIntrin by name. Raises an exception if not found.
    * \param name The name of the TensorIntrin.
+   * \param allow_missing Whether to allow missing tensor intrin. If false, an exception is raised
+   *    if the tensor intrin is not found.
    * \return The TensorIntrin with the specified name.
-   * \throws This method throws an exception if the TensorIntrin does not exist.
+   * \throws This method throws an exception if the TensorIntrin does not exist and allow_missing is
+   * false.
    */
-  TVM_DLL static TensorIntrin Get(String name);
+  TVM_DLL static Optional<TensorIntrin> Get(String name, bool allow_missing = false);
 
   TVM_DEFINE_OBJECT_REF_METHODS(TensorIntrin, ObjectRef, TensorIntrinNode)
 };
diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py
index df39f8aebf71..dd684bc4f1ae 100644
--- a/python/tvm/tir/function.py
+++ b/python/tvm/tir/function.py
@@ -245,7 +245,7 @@ def register(name: str, desc: PrimFunc, impl: PrimFunc, override: bool = False):
         )  # type: ignore
 
     @staticmethod
-    def get(name: str):
+    def get(name: str, allow_missing: bool = False) -> Optional["TensorIntrin"]:
         """Look up a tensor intrinsic by its name.
 
         Parameters
@@ -253,12 +253,16 @@ def get(name: str):
         name : str
             The name of the TensorIntrin to look up.
 
+        allow_missing : bool
+            Whether to allow missing tensor intrin. If False, raise an error if the tensor intrin
+        doesn't exist.
+
         Returns
         -------
-        result : TensorIntrin
-            The TensorIntrin with the specified name.
+        result : Optional[TensorIntrin]
+            The TensorIntrin with the specified name, or None if not found.
         """
-        return _ffi_api.TensorIntrinGet(name)  # pylint: type: ignore
+        return _ffi_api.TensorIntrinGet(name, allow_missing)  # pylint: type: ignore
 
 
 @tvm._ffi.register_object("tir.IndexMap")
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 290a85b2579b..fbf9aa19b711 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -184,7 +184,7 @@ Array<Schedule> MultiLevelTilingTensorCoreNode::Apply(const Schedule& sch,
     TensorCoreIntrinGroup intrin_group = intrin_groups[i];
     Optional<tir::AutoTensorizeMappingInfo> mapping_info = tir::GetAutoTensorizeMappingInfo(
         sch->state(), sch->GetSRef(block_rv),
-        tir::TensorIntrin::Get(intrin_groups[i].compute_intrin)->desc);
+        tir::TensorIntrin::Get(intrin_groups[i].compute_intrin).value()->desc);
     if (mapping_info.defined()) {
       intrin_group_to_mapping_info.emplace(i, mapping_info.value());
     }
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
index b953d1ad4b50..8485e697eb24 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
@@ -47,7 +47,7 @@ Optional<tir::BlockRV> TileForIntrin(tir::Schedule sch, tir::BlockRV block,
 class MultiLevelTilingWithIntrinNode : public MultiLevelTilingNode {
  protected:
   Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) final {
-    auto desc_func = tir::TensorIntrin::Get(intrin_name)->desc;
+    auto desc_func = tir::TensorIntrin::Get(intrin_name).value()->desc;
     if (!CheckAutoTensorizeApplicable(sch, block_rv, desc_func)) {
       TVM_PY_LOG(INFO, logging_func) << "The workload cannot be tensorized.";
       return {sch};
diff --git a/src/tir/ir/function.cc b/src/tir/ir/function.cc
index 028ff1220785..c609ad158e34 100644
--- a/src/tir/ir/function.cc
+++ b/src/tir/ir/function.cc
@@ -97,11 +97,17 @@ void TensorIntrin::Register(String name, TensorIntrin intrin, bool override) {
   manager->reg.Set(name, intrin);
 }
 
-TensorIntrin TensorIntrin::Get(String name) {
+Optional<TensorIntrin> TensorIntrin::Get(String name, bool allow_missing) {
   const TensorIntrinManager* manager = TensorIntrinManager::Global();
   auto it = manager->reg.find(name);
-  CHECK(it != manager->reg.end()) << "ValueError: TensorIntrin '" << name << "' is not registered";
-  return manager->reg.at(name);
+  if (it == manager->reg.end()) {
+    if (allow_missing) {
+      return NullOpt;
+    } else {
+      LOG(FATAL) << "ValueError: TensorIntrin '" << name << "' is not registered";
+    }
+  }
+  return (*it).second;
 }
 
 TVM_REGISTER_NODE_TYPE(TensorIntrinNode);
diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc
index 4558ad04baed..8cfbadf65012 100644
--- a/src/tir/schedule/concrete_schedule.cc
+++ b/src/tir/schedule/concrete_schedule.cc
@@ -675,14 +675,14 @@ BlockRV ConcreteScheduleNode::Blockize(const LoopRV& loop_rv) {
 
 void ConcreteScheduleNode::Tensorize(const LoopRV& loop_rv, const String& intrin) {
   TVM_TIR_SCHEDULE_BEGIN();
-  tir::Tensorize(state_, this->GetSRef(loop_rv), tir::TensorIntrin::Get(intrin));
+  tir::Tensorize(state_, this->GetSRef(loop_rv), tir::TensorIntrin::Get(intrin).value());
   this->state_->DebugVerify();
   TVM_TIR_SCHEDULE_END("tensorize", this->error_render_level_);
 }
 
 void ConcreteScheduleNode::Tensorize(const BlockRV& block_rv, const String& intrin) {
   TVM_TIR_SCHEDULE_BEGIN();
-  tir::Tensorize(state_, this->GetSRef(block_rv), tir::TensorIntrin::Get(intrin));
+  tir::Tensorize(state_, this->GetSRef(block_rv), tir::TensorIntrin::Get(intrin).value());
   this->state_->DebugVerify();
   TVM_TIR_SCHEDULE_END("tensorize", this->error_render_level_);
 }
diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc
index d99cc199fe5f..7a720fe3eae2 100644
--- a/src/tir/schedule/transform.cc
+++ b/src/tir/schedule/transform.cc
@@ -291,7 +291,7 @@ Optional<LoopRV> TileWithTensorIntrin(const tir::Schedule& sch, const tir::Block
                                       const String& intrin_name, bool allow_padding) {
   Optional<tir::TensorizeInfo> opt_tensorize_info =
       GetTensorizeLoopMapping(sch->state(), sch->GetSRef(block_rv),
-                              tir::TensorIntrin::Get(intrin_name)->desc, allow_padding);
+                              tir::TensorIntrin::Get(intrin_name).value()->desc, allow_padding);
   if (!opt_tensorize_info) return NullOpt;
   const tir::TensorizeInfoNode* info = opt_tensorize_info.value().get();
   if (info->block_iter_paddings.defined()) {
diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py
index 828dad2fc036..f04de8e0051f 100644
--- a/tests/python/unittest/test_tir_schedule_tensorize.py
+++ b/tests/python/unittest/test_tir_schedule_tensorize.py
@@ -646,5 +646,12 @@ def fetch_to_shared(block, idx):
         verify_trace_roundtrip(sch=sch, mod=func)
 
 
+def test_tensor_intrin_look_up():
+    intrin_name = 'non_existent_intrin'
+    assert tir.TensorIntrin.get(intrin_name, allow_missing=True) is None
+    with pytest.raises(ValueError):
+        tir.TensorIntrin.get(intrin_name)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 9ce95a9abe3db43b4a4187111c9e2ad0d6bf3dbd Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 22 Sep 2022 20:43:22 -0700
Subject: [PATCH 233/704] [TIR] Fix wmma index in CUDA tensor intrins (#12879)

---
 python/tvm/tir/tensor_intrin/cuda.py | 77 ++++++++++++++++++++++------
 1 file changed, 62 insertions(+), 15 deletions(-)

diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py
index a309b091285b..3374d18dff80 100644
--- a/python/tvm/tir/tensor_intrin/cuda.py
+++ b/python/tvm/tir/tensor_intrin/cuda.py
@@ -489,10 +489,13 @@ def mma_store_impl(a: T.handle, c: T.handle) -> None:
 ######## WMMA intrinsics ########
 
 
-def get_wmma_fragment_index(buffer, m_dim, n_dim):
+def get_wmma_fragment_index(buffer, stride, m_dim, n_dim):
     """Compute wmma fragment index using elem_offset of the buffer"""
-    frag_size = lift(m_dim * n_dim)
-    return buffer.elem_offset // frag_size + (buffer.elem_offset % frag_size) // n_dim
+    frag_index_m = buffer.elem_offset // stride // m_dim
+    frag_index_n = buffer.elem_offset % stride // n_dim
+
+    num_fragments_per_row = stride // n_dim
+    return frag_index_m * num_fragments_per_row + frag_index_n
 
 
 def get_wmma_load_intrin(
@@ -526,6 +529,8 @@ def wmma_load_desc(a: T.handle, c: T.handle) -> None:
     def wmma_load_impl(a: T.handle, c: T.handle) -> None:
         s1 = T.var("int32")
         s0 = T.var("int32")
+        d1 = T.var("int32")
+        d0 = T.var("int32")
         A = T.match_buffer(
             a,
             (m_dim, n_dim),
@@ -536,7 +541,13 @@ def wmma_load_impl(a: T.handle, c: T.handle) -> None:
             strides=[s1, s0],
         )
         C = T.match_buffer(
-            c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=wmma_fragment_scope
+            c,
+            (m_dim, n_dim),
+            dtype,
+            align=64,
+            offset_factor=16,
+            scope=wmma_fragment_scope,
+            strides=[d1, d0],
         )
         with T.block("root"):
             T.reads(A[0:m_dim, 0:n_dim])
@@ -547,7 +558,7 @@ def wmma_load_impl(a: T.handle, c: T.handle) -> None:
                     m_dim,
                     n_dim,
                     k_dim,
-                    get_wmma_fragment_index(C, m_dim, n_dim),
+                    get_wmma_fragment_index(C, d1, m_dim, n_dim),
                     A.access_ptr("r"),
                     s1,
                     layout,
@@ -579,8 +590,16 @@ def wmma_fill_desc(c: T.handle) -> None:
 
     @T.prim_func
     def wmma_fill_impl(c: T.handle) -> None:
+        d1 = T.var("int32")
+        d0 = T.var("int32")
         C = T.match_buffer(
-            c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator"
+            c,
+            (m_dim, n_dim),
+            dtype,
+            align=64,
+            offset_factor=16,
+            scope="wmma.accumulator",
+            strides=[d1, d0],
         )
         with T.block("root"):
             T.reads()
@@ -591,7 +610,7 @@ def wmma_fill_impl(c: T.handle) -> None:
                     m_dim,
                     n_dim,
                     k_dim,
-                    get_wmma_fragment_index(C, m_dim, n_dim),
+                    get_wmma_fragment_index(C, d1, m_dim, n_dim),
                     T.float32(0),
                     dtype="handle",
                 )
@@ -623,8 +642,16 @@ def wmma_store_desc(a: T.handle, c: T.handle) -> None:
     def wmma_store_impl(a: T.handle, c: T.handle) -> None:
         s1 = T.var("int32")
         s0 = T.var("int32")
+        d1 = T.var("int32")
+        d0 = T.var("int32")
         A = T.match_buffer(
-            a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator"
+            a,
+            (m_dim, n_dim),
+            dtype,
+            align=64,
+            offset_factor=16,
+            scope="wmma.accumulator",
+            strides=[d1, d0],
         )
         C = T.match_buffer(
             c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=scope, strides=[s1, s0]
@@ -638,7 +665,7 @@ def wmma_store_impl(a: T.handle, c: T.handle) -> None:
                     m_dim,
                     n_dim,
                     k_dim,
-                    get_wmma_fragment_index(A, m_dim, n_dim),
+                    get_wmma_fragment_index(A, d1, m_dim, n_dim),
                     C.access_ptr("w"),
                     s1,
                     "row_major",
@@ -696,8 +723,21 @@ def wmma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
 
     @T.prim_func
     def wmma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None:
+        a1 = T.var("int32")
+        a0 = T.var("int32")
+        b1 = T.var("int32")
+        b0 = T.var("int32")
+        c1 = T.var("int32")
+        c0 = T.var("int32")
+
         A = T.match_buffer(
-            a, (m_dim, k_dim), in_dtype, align=64, offset_factor=16, scope="wmma.matrix_a"
+            a,
+            (m_dim, k_dim),
+            in_dtype,
+            align=64,
+            offset_factor=16,
+            scope="wmma.matrix_a",
+            strides=[a1, a0],
         )
         B = T.match_buffer(
             b,
@@ -706,9 +746,16 @@ def wmma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None:
             align=64,
             offset_factor=16,
             scope="wmma.matrix_b",
+            strides=[b1, b0],
         )
         C = T.match_buffer(
-            c, (m_dim, n_dim), out_dtype, align=64, offset_factor=16, scope="wmma.accumulator"
+            c,
+            (m_dim, n_dim),
+            out_dtype,
+            align=64,
+            offset_factor=16,
+            scope="wmma.accumulator",
+            strides=[c1, c0],
         )
 
         with T.block("root"):
@@ -717,13 +764,13 @@ def wmma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None:
             T.evaluate(
                 T.tvm_mma_sync(
                     C.data,
-                    get_wmma_fragment_index(C, m_dim, n_dim),
+                    get_wmma_fragment_index(C, c1, m_dim, n_dim),
                     A.data,
-                    get_wmma_fragment_index(A, m_dim, k_dim),
+                    get_wmma_fragment_index(A, a1, m_dim, k_dim),
                     B.data,
-                    get_wmma_fragment_index(B, b_shape_0, b_shape_1),
+                    get_wmma_fragment_index(B, b1, b_shape_0, b_shape_1),
                     C.data,
-                    get_wmma_fragment_index(C, m_dim, n_dim),
+                    get_wmma_fragment_index(C, c1, m_dim, n_dim),
                     dtype="handle",
                 )
             )

From d80ce6b1ba5439dbe0437be6e37121844f87a113 Mon Sep 17 00:00:00 2001
From: AndrewZhaoLuo <andrew.zhao.luo@gmail.com>
Date: Fri, 23 Sep 2022 00:33:19 -0700
Subject: [PATCH 234/704] [EZ][Release] Update gather PRs Script (#12862)

Update internal path to account for directory structure
change in TVM repository, with the introduction of `ci`
directory.
---
 tests/scripts/release/gather_prs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/scripts/release/gather_prs.py b/tests/scripts/release/gather_prs.py
index 0720a87d042b..5fbfa2278feb 100644
--- a/tests/scripts/release/gather_prs.py
+++ b/tests/scripts/release/gather_prs.py
@@ -25,11 +25,12 @@
 from typing import Callable, Dict, List, Any
 
 REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
-sys.path.append(str(REPO_ROOT / "tests" / "scripts"))
+sys.path.append(str(REPO_ROOT / "ci" / "scripts"))
 
 from git_utils import git, GitHubRepo
 from github_tag_teams import tags_from_title
 
+
 GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
 
 
From e8aeb4adf3525837db5f24965104640163b38f0e Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Fri, 23 Sep 2022 16:57:18 +0100
Subject: [PATCH 235/704] [CI] Add Zephyr-SDK binaries to PATH env. in
 ci_cortexm (#12884)

In recent test rounds with updated images, it seems Zephyr-SDK
binaries such as various QEMU related files are missing from
$PATH, which makes Zephyr tests to fail with, e.g.
"qemu-system-i386: command not found".

This PR adds those missing binaries to $PATH.

Co-authored-by: Gustavo Romero <Gustavo.Romero@linaro.org>
---
 docker/Dockerfile.ci_cortexm | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm
index db02792efda9..8e8d2c0a4f9e 100644
--- a/docker/Dockerfile.ci_cortexm
+++ b/docker/Dockerfile.ci_cortexm
@@ -79,6 +79,7 @@ COPY install/ubuntu_init_zephyr_project.sh /install/ubuntu_init_zephyr_project.s
 COPY install/ubuntu_install_zephyr_sdk.sh /install/ubuntu_install_zephyr_sdk.sh
 RUN bash /install/ubuntu_install_zephyr.sh
 ENV ZEPHYR_BASE=/opt/zephyrproject/zephyr
+ENV PATH /opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH
 
 # FreeRTOS deps
 COPY install/ubuntu_install_freertos.sh /install/ubuntu_install_freertos.sh

From eba75e4640d68989cd850ef66bdac0061e873d92 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Fri, 23 Sep 2022 13:30:52 -0700
Subject: [PATCH 236/704] [METASCHEDULE] Mark work_dir as not optional in docs
 (#12888)

---
 python/tvm/meta_schedule/tune.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py
index 20eccc30a113..b1cc0f67bd5f 100644
--- a/python/tvm/meta_schedule/tune.py
+++ b/python/tvm/meta_schedule/tune.py
@@ -259,7 +259,7 @@ def tune_extracted_tasks(
         The list of extracted tasks.
     config : TuneConfig
         The search strategy config.
-    work_dir : Optional[str]
+    work_dir : str
         The working directory to save intermediate results.
     builder : Optional[Builder]
         The builder to use.
@@ -380,7 +380,7 @@ def tune_tir(
         The target to tune for.
     config : TuneConfig
         The search strategy config.
-    work_dir : Optional[str]
+    work_dir : str
         The working directory to save intermediate results.
     builder : Optional[Builder]
         The builder to use.
@@ -499,7 +499,7 @@ def tune_te(
         The search strategy config.
     task_name : str
         The name of the task.
-    work_dir : Optional[str]
+    work_dir : str
         The working directory to save intermediate results.
     builder : Optional[Builder]
         The builder to use.
@@ -569,7 +569,7 @@ def tune_relay(
         The associated parameters of the program
     task_name : str
         The name of the task.
-    work_dir : Optional[str]
+    work_dir : str
         The working directory to save intermediate results.
     builder : Optional[Builder]
         The builder to use.

From 428269f80ca869a2fdee09af1683989448cd6bd4 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Fri, 23 Sep 2022 15:38:56 -0700
Subject: [PATCH 237/704] [FIX,PROFILING] Fix PAPI docs (#12861)

The VM requires arguements to not be wrapped in an array. Passing the
arguments unwrapped now. Also added relevant imports.
---
 docs/how_to/profile/papi.rst | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/docs/how_to/profile/papi.rst b/docs/how_to/profile/papi.rst
index 78d512c9888b..02643451aa09 100644
--- a/docs/how_to/profile/papi.rst
+++ b/docs/how_to/profile/papi.rst
@@ -62,6 +62,12 @@ is an example:
 
 .. code:: python
 
+    import tvm
+    from tvm import relay
+    from tvm.relay.testing import mlp
+    from tvm.runtime import profiler_vm
+    import numpy as np
+
     target = "llvm"
     dev = tvm.cpu()
     mod, params = mlp.get_workload(1)
@@ -71,7 +77,7 @@ is an example:
 
     data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
     report = vm.profile(
-        [data],
+        data,
         func_name="main",
         collectors=[tvm.runtime.profiling.PAPIMetricCollector()],
     )
@@ -94,7 +100,7 @@ You can also change which metrics are collected:
 .. code:: python
 
     report = vm.profile(
-        [data],
+        data,
         func_name="main",
         collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: ["PAPI_FP_OPS"])],
     )

From fbb500e92f4f3104710ae4119871e6c105de4dc4 Mon Sep 17 00:00:00 2001
From: multiverstack-intellif <39256082+multiverstack@users.noreply.github.com>
Date: Sat, 24 Sep 2022 21:08:46 +0800
Subject: [PATCH 238/704] [TIR][Schedule] Relax cache read/write's restriction
 and fix unexpected behavior (#12766)

[TIR][Schedule] Relax cache read/write's restriction and fix unexpected behavior.

Co-authored-by: Min Chen <chen.min@intellif.com>
---
 .../schedule/primitive/cache_read_write.cc    | 76 +++++++++++++------
 src/tir/schedule/state.cc                     |  1 +
 .../test_tir_schedule_cache_read_write.py     | 63 ++++++++++++++-
 3 files changed, 114 insertions(+), 26 deletions(-)

diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index a221733eb394..c76e6abaebb5 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -382,9 +382,16 @@ class CacheLocDetector : public StmtVisitor {
   static void Detect(const ScheduleState& self, const StmtSRef& block_sref,
                      const StmtSRef& scope_sref, CacheStageInfo* info) {
     std::vector<StmtSRef> related_blocks;
-    for (const Dependency& def : self->GetBlockScope(scope_sref)->GetDepsBySrc(block_sref)) {
-      if (def->kind == DepKind::kRAW) {
-        related_blocks.push_back(def->dst);
+    // If consumer is specified, skip detecting the others
+    if (info->consumer_blocks.size() > 0) {
+      for (StmtSRef consumer : info->consumer_blocks) {
+        related_blocks.emplace_back(consumer);
+      }
+    } else {
+      for (const Dependency& def : self->GetBlockScope(scope_sref)->GetDepsBySrc(block_sref)) {
+        if (def->kind == DepKind::kRAW) {
+          related_blocks.push_back(def->dst);
+        }
       }
     }
     if (!related_blocks.empty()) {
@@ -416,29 +423,24 @@ class CacheLocDetector : public StmtVisitor {
 
   void VisitStmt_(const SeqStmtNode* seq_stmt) final {
     bool previous_visited_block = visited_block_;
-    bool previous_visited_related = visited_related_;
-    visited_block_ = visited_related_ = false;
+    visited_block_ = false;
 
-    int pos = -1;
     for (size_t i = 0; i < seq_stmt->size(); ++i) {
       if (loc_pos_ != -1) {
         break;
       }
       VisitStmt(seq_stmt->seq[i]);
       // `pos` can be assigned only once when we visited `block_sref`
-      if (visited_block_ && visited_related_ && pos == -1) {
+      if (visited_block_ && visited_related_ && loc_pos_ == -1) {
         // The offset of insert position from the block
-        pos = i;
+        loc_pos_ = i;
+        return;
+      } else if (visited_related_) {
+        // If meet the target consumer, stop searching
+        visited_block_ = visited_block_ || previous_visited_block;
+        return;
       }
     }
-    visited_block_ = visited_block_ || previous_visited_block;
-    visited_related_ = visited_related_ || previous_visited_related;
-    // Only we visited the writing block and any one of the related blocks
-    // That means that we have found the lowest ancestor
-    // of the block and any one of the related ones
-    if (visited_block_ && visited_related_ && loc_pos_ == -1) {
-      loc_pos_ = pos;
-    }
   }
 
   void VisitStmt_(const BlockNode* block) final {
@@ -446,11 +448,12 @@ class CacheLocDetector : public StmtVisitor {
     if (block == scope_sref_->stmt) {
       // The block vistied is the current parent scope
       StmtVisitor::VisitStmt_(block);
-      // Handling cache_read for input buffer
-      if (visited_block_ && visited_related_ && !loc_sref_.defined()) {
+      // Handling cases when insert outside any loop or cache_read for input buffer
+      if (visited_related_ && !loc_sref_.defined()) {
         loc_sref_ = self_->stmt2ref.at(block);
-        if (loc_pos_ == -1) {
-          loc_pos_ = 1;
+        // Handling cache_read for input buffer
+        if (visited_block_ == false && loc_pos_ == -1) {
+          loc_pos_ = 0;
         }
       }
       return;
@@ -980,6 +983,33 @@ class ReIndexRewriter : public StmtExprMutator {
   Region region_;
 };
 
+void CheckRegionCover(const ScheduleState& self, StmtSRef scope_root) {
+  class NotRegionCoverError : public ScheduleError {
+   public:
+    explicit NotRegionCoverError(IRModule mod, Block block) : mod_(mod), block_(block) {}
+    IRModule mod() const final { return mod_; }
+    String FastErrorString() const final {
+      return "ScheduleError: The scope root's region cover is not complete.";
+    }
+    String DetailRenderTemplate() const final {
+      return R"(The scope {0} 's region cover is not complete.
+The region cover property require to hold for every of its child blocks
+)";
+    }
+    Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
+    IRModule mod_;
+    Block block_;
+  };
+  BlockScope scope = self->GetBlockScope(scope_root);
+  for (const auto& kv : scope->dst2deps) {
+    const StmtSRef& consumer_block_sref = kv.first;
+    if (!self->block_info.at(consumer_block_sref).region_cover) {
+      const BlockNode* block = TVM_SREF_TO_BLOCK(scope_root);
+      throw NotRegionCoverError(self->mod, GetRef<Block>(block));
+    }
+  }
+}
+
 /******** Implementation ********/
 
 StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buffer_index,
@@ -1002,7 +1032,9 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff
   const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   Buffer read_buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block), read_buffer_index, BufferIndexType::kRead);
-  StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true);
+  StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
+  // Check required region cover for cache_read
+  CheckRegionCover(self, scope_sref);
   const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref);
 
   // Step 2. Create CacheStageInfo
@@ -1075,7 +1107,7 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu
   const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
   Buffer write_buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block), write_buffer_index, BufferIndexType::kWrite);
-  StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true);
+  StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
 
   // Step 2. Creating CacheStageInfo
   CacheStageInfo info;
diff --git a/src/tir/schedule/state.cc b/src/tir/schedule/state.cc
index 6d4a42236f57..27056124d9e1 100644
--- a/src/tir/schedule/state.cc
+++ b/src/tir/schedule/state.cc
@@ -346,6 +346,7 @@ class BlockInfoCollector : private StmtVisitor {
               if (!ProducerCoversConsumer(buffer->shape, produced_region, consumed_region,
                                           &analyzer_)) {
                 region_cover = false;
+                self_->block_info.at(consumer_block_sref).region_cover = region_cover;
                 break;
               }
             }
diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py b/tests/python/unittest/test_tir_schedule_cache_read_write.py
index cf4836e5361e..334fb988d775 100644
--- a/tests/python/unittest/test_tir_schedule_cache_read_write.py
+++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py
@@ -223,6 +223,24 @@ def func_with_block_predicate() -> None:
             B[ax] = A[ax] + 1.0
 
 
+@T.prim_func
+def inplace_func(data_io: T.Buffer[(64), "int32"]):
+    data_1d = T.alloc_buffer([64], dtype="int32")
+    for i0 in T.serial(64):
+        with T.block("copy_in"):
+            v0 = T.axis.remap("S", [i0])
+            data_1d[v0] = data_io[v0]
+    for i0 in T.serial(1):
+        with T.block("ext_call"):
+            T.reads(data_1d[:64])
+            T.writes(data_1d[:64])
+            T.evaluate(T.call_extern("call_impl", data_1d.data, dtype=""))
+    for i0 in T.serial(64):
+        with T.block("copy_out"):
+            v0 = T.axis.remap("S", [i0])
+            data_io[v0] = data_1d[v0]
+
+
 ########## Expected function after cache_read ##########
 
 
@@ -414,15 +432,15 @@ def cache_read_multi_consumer_target() -> None:
             with T.block("A"):
                 vi = T.axis.S(128, i * 16 + j)
                 A[vi] = 1.0
-        for j in T.grid(16):
-            with T.block("A"):
-                vi = T.axis.S(128, i * 16 + j)
-                A_global[vi] = A[vi]
         for j in T.grid(16):
             with T.block("B"):
                 vi = T.axis.S(128, i * 16 + j)
                 B[vi] = A[vi] + 1.0
 
+    for i in T.grid(128):
+        with T.block("A"):
+            vi = T.axis.S(128, i)
+            A_global[vi] = A[vi]
     for i in T.grid(128):
         with T.block("C"):
             vi = T.axis.S(128, i)
@@ -501,6 +519,35 @@ def cache_read_shape_int64(var_A: T.handle, var_C: T.handle) -> None:
             C[vi, vj] = B[vi, vj] + T.float32(1)
 
 
+@T.prim_func
+def cache_read_inplace(data_io: T.Buffer[64, "int32"]) -> None:
+    data_1d = T.alloc_buffer([64], dtype="int32")
+    data_io_local = T.alloc_buffer([64], dtype="int32", scope="local")
+    for ax0 in T.serial(64):
+        with T.block("data_io_local"):
+            v0 = T.axis.spatial(64, ax0)
+            T.reads(data_io[v0])
+            T.writes(data_io_local[v0])
+            data_io_local[v0] = data_io[v0]
+    for i0 in T.serial(64):
+        with T.block("copy_in"):
+            v0 = T.axis.spatial(64, i0)
+            T.reads(data_io_local[v0])
+            T.writes(data_1d[v0])
+            data_1d[v0] = data_io_local[v0]
+    for i0 in T.serial(1):
+        with T.block("ext_call"):
+            T.reads(data_1d[0:64])
+            T.writes(data_1d[0:64])
+            T.evaluate(T.call_extern("call_impl", data_1d.data, dtype=""))
+    for i0 in T.serial(64):
+        with T.block("copy_out"):
+            v0 = T.axis.spatial(64, i0)
+            T.reads(data_1d[v0])
+            T.writes(data_io[v0])
+            data_io[v0] = data_1d[v0]
+
+
 ########## Expected function after cache_write ##########
 
 
@@ -876,6 +923,14 @@ def test_cache_read_fail_invalid_storage_scope(use_block_name):
         sch.cache_read(block_b, 0, "test_scope")
 
 
+def test_inplace_cache_read():
+    sch = tvm.tir.Schedule(inplace_func, debug_mask="all")
+    block = sch.get_block("copy_in")
+    sch.cache_read(block, 0, "local", [block])
+    tvm.ir.assert_structural_equal(cache_read_inplace, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=inplace_func)
+
+
 ########## Testcases for cache_write ##########
 
 
From 71f25b3d6c851046e925ef6a2d2626626084913a Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Sat, 24 Sep 2022 18:28:02 -0500
Subject: [PATCH 239/704] [IR] Use TVM_DEFINE_OBJECT_REF_METHODS macro for Op
 (#12893)

Previously, the `get()` method wasn't defined, and returned a
`RelayExprNode` instead of a `Op::ContainerType*`.
---
 include/tvm/ir/op.h | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/include/tvm/ir/op.h b/include/tvm/ir/op.h
index 683170026451..6e6b8bee5fc3 100644
--- a/include/tvm/ir/op.h
+++ b/include/tvm/ir/op.h
@@ -164,15 +164,6 @@ class OpNode : public RelayExprNode {
  */
 class Op : public RelayExpr {
  public:
-  /*! \brief default constructor  */
-  Op() {}
-  /*! \brief constructor from node pointer */
-  explicit Op(ObjectPtr<Object> n) : RelayExpr(n) {}
-  /*!
-   * \brief access the internal node container
-   * \return the pointer to the internal node container
-   */
-  inline const OpNode* operator->() const;
   /*!
    * \brief Get additional registered attribute about operators.
    *  If nothing has been registered, an empty OpAttrMap will be returned.
@@ -196,8 +187,7 @@ class Op : public RelayExpr {
    */
   TVM_DLL static const Op& Get(const String& op_name);
 
-  /*! \brief specify container node */
-  using ContainerType = OpNode;
+  TVM_DEFINE_OBJECT_REF_METHODS(Op, RelayExpr, OpNode)
 
  private:
   /*!
@@ -370,7 +360,6 @@ class OpAttrMap : public AttrRegistryMap<Op, ValueType> {
       ::tvm::OpRegEntry::RegisterOrGet(OpName).set_name()
 
 // implementations
-inline const OpNode* Op::operator->() const { return static_cast<const OpNode*>(get()); }
 
 template <typename ValueType>
 inline OpAttrMap<ValueType> Op::GetAttrMap(const String& key) {

From a61c1ad0f03b53a4b5a3cc3e4a60d6daafe2b1e2 Mon Sep 17 00:00:00 2001
From: wrongtest <wrongtest0@gmail.com>
Date: Mon, 26 Sep 2022 08:47:09 +0800
Subject: [PATCH 240/704] [TIR] Fix plan buffer allocation location for loop
 carried dependencies (#12757)

* Fix plan buffer allocation location for loop carried dependencies

* fix testcase region annotation issue

* fix typo in ut
---
 .../analysis/buffer_access_lca_detector.cc    | 106 +++++++++++++++--
 ..._plan_update_buffer_allocation_location.py | 109 +++++++++++++++++-
 2 files changed, 200 insertions(+), 15 deletions(-)

diff --git a/src/tir/analysis/buffer_access_lca_detector.cc b/src/tir/analysis/buffer_access_lca_detector.cc
index 7197e1ba83c5..64d10fae2ff1 100644
--- a/src/tir/analysis/buffer_access_lca_detector.cc
+++ b/src/tir/analysis/buffer_access_lca_detector.cc
@@ -99,23 +99,32 @@ class LCADetector : public StmtExprVisitor {
     }
 
     ancestor_scopes_.push_back(current_scope);
+    loop_scope_map_.insert({op->loop_var.get(), current_scope});
     StmtExprVisitor::VisitStmt_(op);
     ancestor_scopes_.pop_back();
+    loop_scope_map_.erase(op->loop_var.get());
   }
 
-  void VisitStmt_(const BlockNode* op) final {
+  void VisitStmt_(const BlockRealizeNode* op) final {
+    const BlockNode* block = op->block.get();
     int n = ancestor_scopes_.size();
-    for (const Buffer& buf : op->alloc_buffers) {
+    for (const Buffer& buf : block->alloc_buffers) {
       buffer_var_map_.emplace(buf->data.get(), buf.get());
     }
 
     const ScopeInfo* parent_scope = ancestor_scopes_.back();
-    auto* current_scope = arena_.make<ScopeInfo>(parent_scope, op, n);
+    auto* current_scope = arena_.make<ScopeInfo>(parent_scope, block, n);
 
     ancestor_scopes_.push_back(current_scope);
+
+    // For each accessed buffer of the block, update the buffer's lca to
+    // the lowest inclusive stmt position, which should dominate all loops
+    // related to the accessed opaque block iter vars in buffer indices.
+    UpdateDominateScopeOfOpaqueIter(op);
+
     // Update match_buffers
-    for (const MatchBufferRegion& match_buffer : op->match_buffers) {
-      UpdateBufferLCA(match_buffer->source->buffer.get());
+    for (const MatchBufferRegion& match_buffer : block->match_buffers) {
+      UpdateBufferLCA(match_buffer->source->buffer.get(), ancestor_scopes_.back());
       match_buffers_.insert(match_buffer->buffer.get());
     }
 
@@ -123,6 +132,80 @@ class LCADetector : public StmtExprVisitor {
     ancestor_scopes_.pop_back();
   }
 
+  void UpdateDominateScopeOfOpaqueIter(const BlockRealizeNode* block_realize) {
+    // map opaque iter var to the scope which dominate all loop carried dependencies.
+    std::unordered_map<const VarNode*, const ScopeInfo*> itervar_to_dom_scope;
+
+    // function to collect `itervar_to_dom_scope`, the result scope for each block
+    // iter var should be above all loop scopes the opaque iter var binding relates to.
+    auto do_collect_itervar_scope = [this, &itervar_to_dom_scope](const IterVar& itervar,
+                                                                  const PrimExpr& binding) {
+      PostOrderVisit(binding, [this, &itervar_to_dom_scope, &itervar](const ObjectRef& obj) {
+        if (const VarNode* loop_var = obj.as<VarNode>()) {
+          auto it = loop_scope_map_.find(loop_var);
+          if (it == loop_scope_map_.end()) {
+            return;
+          }
+          const ScopeInfo* scope = it->second->parent_scope_info;
+          // find the highest loop scope the iter var binding has related to.
+          auto dom_scope_it = itervar_to_dom_scope.find(itervar->var.get());
+          if (dom_scope_it == itervar_to_dom_scope.end()) {
+            itervar_to_dom_scope.insert(dom_scope_it, {itervar->var.get(), scope});
+          } else if (scope->depth < dom_scope_it->second->depth) {
+            dom_scope_it->second = scope;
+          }
+        }
+      });
+    };
+
+    // function to update lca scope of the buffer with loop carried dependent buffer accesses.
+    // the result scope should be above all loop scopes the accessed opaque block iter vars
+    // relate to, which is record in `itervar_to_dom_scope`.
+    auto do_update = [this, &itervar_to_dom_scope](const BufferRegion& region) {
+      const Buffer& buffer = region->buffer;
+      const ScopeInfo* scope = ancestor_scopes_.back();
+
+      auto handle_itervar = [&itervar_to_dom_scope, &scope](const ObjectRef& obj) {
+        if (const VarNode* iter_var = obj.as<VarNode>()) {
+          auto dom_scope_it = itervar_to_dom_scope.find(iter_var);
+          if (dom_scope_it == itervar_to_dom_scope.end()) {
+            return;
+          }
+          // find the highest loop scope the accessed buffer index has
+          // loop carried dependencies to (via opaque iter var binding).
+          if (dom_scope_it->second->depth < scope->depth) {
+            scope = dom_scope_it->second;
+          }
+        }
+      };
+
+      // visit region min and max to find the lowest legal lca scope
+      for (const Range& range : region->region) {
+        PostOrderVisit(range->min, handle_itervar);
+        PostOrderVisit(range->min + range->extent - 1, handle_itervar);
+      }
+      UpdateBufferLCA(buffer.get(), scope);
+    };
+
+    // do collect and update
+    const Block& block = block_realize->block;
+    for (size_t i = 0; i < block_realize->iter_values.size(); ++i) {
+      const IterVar& iter_var = block->iter_vars[i];
+      if (iter_var->iter_type != IterVarType::kDataPar &&
+          iter_var->iter_type != IterVarType::kCommReduce) {
+        do_collect_itervar_scope(iter_var, block_realize->iter_values[i]);
+      }
+    }
+    if (!itervar_to_dom_scope.empty()) {
+      for (const auto& read : block->reads) {
+        do_update(read);
+      }
+      for (const auto& write : block->writes) {
+        do_update(write);
+      }
+    }
+  }
+
   void VisitStmt_(const AttrStmtNode* op) final {
     if (op->attr_key == attr::thread_extent) {
       const auto* iter = op->node.as<IterVarNode>();
@@ -136,17 +219,18 @@ class LCADetector : public StmtExprVisitor {
   }
 
   void VisitExpr_(const BufferLoadNode* op) final {
-    UpdateBufferLCA(op->buffer.get());
+    UpdateBufferLCA(op->buffer.get(), ancestor_scopes_.back());
     StmtExprVisitor::VisitExpr_(op);
   }
 
   void VisitStmt_(const BufferStoreNode* op) final {
-    UpdateBufferLCA(op->buffer.get());
+    UpdateBufferLCA(op->buffer.get(), ancestor_scopes_.back());
     StmtExprVisitor::VisitStmt_(op);
   }
 
   void VisitStmt_(const BufferRealizeNode* op) final {
     buffer_var_map_.emplace(op->buffer->data.get(), op->buffer.get());
+    UpdateBufferLCA(op->buffer.get(), ancestor_scopes_.back());
     StmtExprVisitor::VisitStmt_(op);
   }
 
@@ -165,16 +249,16 @@ class LCADetector : public StmtExprVisitor {
   void VisitBufferVar(const VarNode* op) {
     auto it = buffer_var_map_.find(op);
     if (it != buffer_var_map_.end()) {
-      UpdateBufferLCA(it->second);
+      UpdateBufferLCA(it->second, ancestor_scopes_.back());
     }
   }
 
-  void UpdateBufferLCA(const BufferNode* buffer) {
+  void UpdateBufferLCA(const BufferNode* buffer, const ScopeInfo* scope) {
     buffer_var_map_.emplace(buffer->data.get(), buffer);
     if (match_buffers_.find(buffer) == match_buffers_.end()) {
       // Ingore buffer created by block match_buffer
       const ScopeInfo*& lca = buffer_lca_[buffer];
-      lca = LowestCommonAncestor(lca, ancestor_scopes_.back());
+      lca = LowestCommonAncestor(lca, scope);
     }
   }
 
@@ -229,6 +313,8 @@ class LCADetector : public StmtExprVisitor {
   std::unordered_set<const BufferNode*> match_buffers_ = {};
   /*! \brief The ForNodes/BlockNodes which contain immediate `blockIdx` launch. */
   std::vector<const ScopeInfo*> blockidx_scopes_ = {};
+  /*! \brief The map from loop var to the corresponding scope. */
+  std::unordered_map<const VarNode*, const ScopeInfo*> loop_scope_map_ = {};
   /*! \brief Internal arena. */
   support::Arena arena_;
 };
diff --git a/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py b/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py
index c22f5f82ee10..34d82f86a422 100644
--- a/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py
+++ b/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+import tvm.testing
 from tvm import te
 from tvm.script import tir as T
 
@@ -242,9 +243,107 @@ def test_lower_te():
     )  # PlanAndUpdateBufferAllocationLocation should do nothing on TE
 
 
+def test_loop_carried_dependency():
+    """The buffer allocation should be above opaque iter var's loop scopes
+    such that buffer accesses with loop carried dependencies are covered."""
+
+    @T.prim_func
+    def before(A: T.Buffer[(8, 8, 8), "int32"], B: T.Buffer[(8, 8, 8), "int32"]):
+        C = T.alloc_buffer([8, 8, 8], dtype="int32")
+        for i in T.serial(8):
+            for j in T.serial(8):
+                for k in T.serial(8):
+                    with T.block("b0"):
+                        vi, vj, vk = T.axis.remap("SSS", [i, j, k])
+                        C[vi, vj, vk] = A[vi, vj, vk] + 1
+                for k in T.serial(8):
+                    with T.block("b1"):
+                        vi, vk = T.axis.remap("SS", [i, k])
+                        vj = T.axis.opaque(8, j)
+                        B[vi, vj, vk] = C[vi, vj, vk] + T.if_then_else(
+                            0 < vj, C[vi, vj - 1, vk], 0, dtype="int32"
+                        )
+
+    @T.prim_func
+    def after(A: T.Buffer[(8, 8, 8), "int32"], B: T.Buffer[(8, 8, 8), "int32"]) -> None:
+        for i in T.serial(8):
+            with T.block():
+                T.reads(A[i, 0:8, 0:8])
+                T.writes(B[i, 0:8, 0:8])
+                C = T.alloc_buffer([8, 8, 8], dtype="int32")
+                for j in T.serial(8):
+                    for k in T.serial(8):
+                        with T.block("b0"):
+                            vi, vj, vk = T.axis.remap("SSS", [i, j, k])
+                            C[vi, vj, vk] = A[vi, vj, vk] + 1
+                    for k in T.serial(8):
+                        with T.block("b1"):
+                            vi, vk = T.axis.remap("SS", [i, k])
+                            vj = T.axis.opaque(8, j)
+                            B[vi, vj, vk] = C[vi, vj, vk] + T.if_then_else(
+                                0 < vj, C[vi, vj - 1, vk], 0, dtype="int32"
+                            )
+
+    _check(before, after)
+
+
+def test_1D_cascade_op_rolling_buffer():
+    """The intermediate buffer must be allocated above rolling buffer's rolling loop,
+    which is marked as opaque in consumer block's iter mappings."""
+
+    @T.prim_func
+    def before(A: T.Buffer[(4, 16), "int32"], C: T.Buffer[(4, 8), "int32"]):
+        B = T.alloc_buffer((4, 6), "int32")
+        for c in T.serial(4):
+            for i in T.serial(0, 2):
+                for j in T.serial(0, 6):
+                    for k in T.serial(3):
+                        with T.block("P1"):
+                            T.where(i < 1 or j >= 2)
+                            cc, vi, vj, vk = T.axis.remap("SSSR", [c, i, j, k])
+                            if vk == 0:
+                                B[cc, T.floormod(vi * 4 + vj, 6)] = 0
+                            B[cc, T.floormod(vi * 4 + vj, 6)] = (
+                                B[cc, T.floormod(vi * 4 + vj, 6)] + A[cc, vi * 4 + vj + vk]
+                            )
+                for j in T.serial(0, 4):
+                    for k in T.serial(3):
+                        with T.block("P2"):
+                            vi = T.axis.opaque(2, i)
+                            cc, vj, vk = T.axis.remap("SSR", [c, j, k])
+                            if vk == 0:
+                                C[cc, vi * 4 + vj] = 0
+                            C[cc, vi * 4 + vj] = (
+                                C[cc, vi * 4 + vj] + B[cc, T.floormod(vi * 4 + vj + vk, 6)]
+                            )
+
+    @T.prim_func
+    def after(A: T.Buffer[(4, 16), "int32"], C: T.Buffer[(4, 8), "int32"]):
+        for c in T.serial(4):
+            with T.block():
+                T.reads(A[c, 0:12], C[c, 0:8])
+                T.writes(C[c, 0:8])
+                B = T.alloc_buffer([4, 6], dtype="int32")
+                for i in T.serial(2):
+                    for j, k in T.grid(6, 3):
+                        with T.block("P1"):
+                            T.where(i < 1 or j >= 2)
+                            cc, vi, vj, vk = T.axis.remap("SSSR", [c, i, j, k])
+                            if vk == 0:
+                                B[cc, (vi * 4 + vj) % 6] = 0
+                            B[cc, (vi * 4 + vj) % 6] = (
+                                B[cc, (vi * 4 + vj) % 6] + A[cc, vi * 4 + vj + vk]
+                            )
+                    for j, k in T.grid(4, 3):
+                        with T.block("P2"):
+                            vi = T.axis.opaque(2, i)
+                            cc, vj, vk = T.axis.remap("SSR", [c, j, k])
+                            if vk == 0:
+                                C[cc, vi * 4 + vj] = 0
+                            C[cc, vi * 4 + vj] = C[cc, vi * 4 + vj] + B[cc, (vi * 4 + vj + vk) % 6]
+
+    _check(before, after)
+
+
 if __name__ == "__main__":
-    test_elementwise()
-    test_locate_buffer_allocation()
-    test_match_buffer_allocation()
-    test_opaque_access()
-    test_lower_te()
+    tvm.testing.main()

From c8423a6843edec5e85003a33d260f2214fd16c42 Mon Sep 17 00:00:00 2001
From: Yuanjing Shi <yuanjing@octoml.ai>
Date: Sun, 25 Sep 2022 22:50:20 -0700
Subject: [PATCH 241/704] [Meta Schedule][XGBoost] Update the custom callback
 function of xgboost in meta schedule (#12141)

* update the custom callback function of xgboost

* fix lint

* fix ci

* fix lint

* add unit test

* remote unused code

* fix lint

* add decorator

* address comment

* fix lint

* address comments

* fix mypy

* fix lint

* remove unused comments

* address comments

* Fix xgboost unit test import.

Co-authored-by: Xiyou Zhou <xiyou@octoml.ai>
---
 .../tvm/meta_schedule/cost_model/xgb_model.py | 169 +++++++++++-------
 .../unittest/test_meta_schedule_cost_model.py |  85 +++++++++
 2 files changed, 194 insertions(+), 60 deletions(-)

diff --git a/python/tvm/meta_schedule/cost_model/xgb_model.py b/python/tvm/meta_schedule/cost_model/xgb_model.py
index 8de034758b4b..1171e081b90a 100644
--- a/python/tvm/meta_schedule/cost_model/xgb_model.py
+++ b/python/tvm/meta_schedule/cost_model/xgb_model.py
@@ -35,7 +35,26 @@
 from ..utils import cpu_count, derived_object, shash2hex
 from .metric import max_curve
 
+
+def optional_xgboost_callback(cls):
+    """Decorator for importing TraningCallback from xgboost"""
+    # pylint:disable = import-outside-toplevel
+    try:
+        from xgboost.callback import TrainingCallback  # type: ignore
+    # pylint:enable = import-outside-toplevel
+    except ImportError:
+
+        class TrainingCallback:  # type: ignore
+            pass
+
+    class OptXGBoostCustomCallback(cls, TrainingCallback):  # type: ignore
+        pass
+
+    return OptXGBoostCustomCallback
+
+
 if TYPE_CHECKING:
+
     import xgboost as xgb  # type: ignore
 
     from ..tune_context import TuneContext
@@ -579,14 +598,12 @@ def avg_peak_score(ys_pred: np.ndarray, d_train: "xgb.DMatrix"):  # type: ignore
             num_boost_round=10000,
             obj=obj,
             callbacks=[
-                custom_callback(
+                XGBoostCustomCallback(
                     early_stopping_rounds=self.early_stopping_rounds,
                     verbose_eval=self.verbose_eval,
-                    fevals=[
-                        rmse,
-                        avg_peak_score,
-                    ],
+                    fevals=[rmse, avg_peak_score],
                     evals=[(self.d_train.dmatrix, "tr")],
+                    cvfolds=None,
                 )
             ],
         )
@@ -640,52 +657,83 @@ def average_peak_score(ys_pred: np.ndarray):
         return eval_result
 
 
-def custom_callback(
-    early_stopping_rounds: int,
-    verbose_eval: int,
-    fevals: List[Callable],
-    evals: List[Tuple["xgb.DMatrix", str]],
-    focused_metric: str = "tr-p-rmse",
-):
-    """Callback function for xgboost to support multiple custom evaluation functions"""
-    sort_key = make_metric_sorter(focused_metric=focused_metric)
-
-    state: Dict[str, Any] = {}
-
-    def init(env: "xgb.core.CallbackEnv"):
-        """Internal function"""
-        booster: "xgb.Booster" = env.model
+@optional_xgboost_callback
+class XGBoostCustomCallback:
+    """Custom callback class for xgboost to support multiple custom evaluation functions"""
 
-        state["best_iteration"] = 0
-        state["best_score"] = float("inf")
+    def __init__(
+        self,
+        early_stopping_rounds: int,
+        verbose_eval: int,
+        fevals: List[Callable],
+        evals: List[Tuple["xgb.DMatrix", str]],
+        focused_metric: str = "tr-p-rmse",
+        cvfolds: List["xgb.training.CVPack"] = None,
+    ):
+        self.early_stopping_rounds = early_stopping_rounds
+        self.verbose_eval = verbose_eval
+        self.fevals = fevals
+        self.evals = evals
+        self.state: Dict[str, Any] = {}
+        self.focused_metric = focused_metric
+        self.sort_key = make_metric_sorter(focused_metric=focused_metric)
+        self.cvfolds = cvfolds
+        if cvfolds is not None:
+            self.aggregated_cv = None
+
+    def __call__(self, env: "xgb.core.CallbackEnv"):
+        # Compatibility with xgboost < 1.3
+        return self.after_iteration(env.model, env.iteration, env.evaluation_result_list)
+
+    def init(self, model: "xgb.Booster"):
+        """Internal function for intialization"""
+        booster: "xgb.Booster" = model
+        self.state["best_iteration"] = 0
+        self.state["best_score"] = float("inf")
         if booster is None:
-            assert env.cvfolds is not None
+            assert self.cvfolds is not None
             return
         if booster.attr("best_score") is not None:
-            state["best_score"] = float(booster.attr("best_score"))
-            state["best_iteration"] = int(booster.attr("best_iteration"))
-            state["best_msg"] = booster.attr("best_msg")
+            self.state["best_score"] = float(booster.attr("best_score"))
+            self.state["best_iteration"] = int(booster.attr("best_iteration"))
+            self.state["best_msg"] = booster.attr("best_msg")
         else:
-            booster.set_attr(best_iteration=str(state["best_iteration"]))
-            booster.set_attr(best_score=str(state["best_score"]))
+            booster.set_attr(best_iteration=str(self.state["best_iteration"]))
+            booster.set_attr(best_score=str(self.state["best_score"]))
 
-    def callback(env: "xgb.core.CallbackEnv"):
+    def after_iteration(
+        self, model: "xgb.Booster", epoch: int, evals_log: Dict
+    ):  # pylint: disable = unused-argument
+        """Internal function for after_iteration"""
         # pylint:disable = import-outside-toplevel
+        try:
+            from xgboost.callback import _fmt_metric  # type: ignore
+        except ImportError:
+            # Compatibility with xgboost >= 1.6
+
+            def _fmt_metric(value, show_stdv=True):
+                if len(value) == 2:
+                    return f"{value[0]}:{value[1]:.5f}"
+                if len(value) == 3:
+                    if show_stdv:
+                        return f"{value[0]}:{value[1]:.5f}+{value[2]:.5f}"
+                    return f"{value[0]}:{value[1]:.5f}"
+                raise ValueError("wrong metric value", value)
+
         import xgboost as xgb
-        from xgboost.callback import _fmt_metric  # type: ignore
-        from xgboost.core import EarlyStopException  # type: ignore
+        from xgboost import rabit  # type: ignore
 
         try:
             from xgboost.training import aggcv  # type: ignore
         except ImportError:
             from xgboost.callback import _aggcv as aggcv  # type: ignore
-        # pylint:enable = import-outside-toplevel
 
-        if not state:
-            init(env)
-        booster: xgb.Booster = env.model
-        iteration: int = env.iteration
-        cvfolds: List[xgb.training.CVPack] = env.cvfolds
+        # pylint:enable = import-outside-toplevel
+        if not self.state:
+            self.init(model)
+        booster: xgb.Booster = model
+        iteration: int = epoch
+        cvfolds: List[xgb.training.CVPack] = self.cvfolds
         ##### Evaluation #####
         # `eval_result` is a list of (key, score)
         eval_result: List[Tuple[str, float]] = []
@@ -697,13 +745,13 @@ def callback(env: "xgb.core.CallbackEnv"):
                         for key, value in map(
                             lambda x: x.split(":"),
                             booster.eval_set(
-                                evals=evals,
+                                evals=self.evals,
                                 iteration=iteration,
                                 feval=feval,
                             ).split()[1:],
                         )
                     ]
-                    for feval in fevals
+                    for feval in self.fevals
                 )
             )
         else:
@@ -719,14 +767,14 @@ def callback(env: "xgb.core.CallbackEnv"):
                             for fold in cvfolds
                         )
                     ]
-                    for feval in fevals
+                    for feval in self.fevals
                 )
             )
         eval_result = list(eval_result)
-        eval_result.sort(key=sort_key)
+        eval_result.sort(key=self.sort_key)
 
         ##### Print eval result #####
-        if verbose_eval and iteration % verbose_eval == 0:
+        if self.verbose_eval and iteration % self.verbose_eval == 0:
             info = []
             for key, score in eval_result:
                 if "null" not in key:
@@ -736,30 +784,31 @@ def callback(env: "xgb.core.CallbackEnv"):
         ##### Choose score and do early stopping #####
         score = None
         for key, _score in eval_result:
-            if key == focused_metric:
+            if key == self.focused_metric:
                 score = _score
                 break
         assert score is not None
 
-        best_score = state["best_score"]
-        best_iteration = state["best_iteration"]
+        best_score = self.state["best_score"]
+        best_iteration = self.state["best_iteration"]
         if score < best_score:
             tab = "\t"  # to work with f-string
-            msg = f"[{env.iteration}] {tab.join([_fmt_metric(x) for x in eval_result])}"
-            state["best_msg"] = msg
-            state["best_score"] = score
-            state["best_iteration"] = env.iteration
+            msg = f"[{epoch}] {tab.join([_fmt_metric(x) for x in eval_result])}"
+            self.state["best_msg"] = msg
+            self.state["best_score"] = score
+            self.state["best_iteration"] = epoch
             # save the property to attributes, so they will occur in checkpoint.
-            if env.model is not None:
-                env.model.set_attr(
-                    best_score=str(state["best_score"]),
-                    best_iteration=str(state["best_iteration"]),
-                    best_msg=state["best_msg"],
+            if model is not None:
+                model.set_attr(
+                    best_score=str(self.state["best_score"]),
+                    best_iteration=str(self.state["best_iteration"]),
+                    best_msg=self.state["best_msg"],
                 )
-        elif env.iteration - best_iteration >= early_stopping_rounds:
-            best_msg = state["best_msg"]
-            if verbose_eval and env.rank == 0:
-                logger.debug("XGB stopped. Best iteration: %s ", best_msg)
-            raise EarlyStopException(best_iteration)
+        elif epoch - best_iteration >= self.early_stopping_rounds:
+            best_msg = self.state["best_msg"]
 
-    return callback
+            if self.verbose_eval and rabit.get_rank() == 0:
+                logger.debug("XGB stopped. Best iteration: %s ", best_msg)
+            return True  # instead of raising EarlyStopException, returning True to end the training
+        # False to indicate training should not stop.
+        return False
diff --git a/tests/python/unittest/test_meta_schedule_cost_model.py b/tests/python/unittest/test_meta_schedule_cost_model.py
index d1d558181324..94b7bce246f4 100644
--- a/tests/python/unittest/test_meta_schedule_cost_model.py
+++ b/tests/python/unittest/test_meta_schedule_cost_model.py
@@ -27,6 +27,7 @@
 import tvm
 import tvm.testing
 from tvm.meta_schedule.cost_model import PyCostModel, RandomModel, XGBModel
+from tvm.meta_schedule.cost_model.xgb_model import XGBoostCustomCallback, PackSum
 from tvm.meta_schedule.feature_extractor import RandomFeatureExtractor
 from tvm.meta_schedule.runner import RunnerResult
 from tvm.meta_schedule.search_strategy import MeasureCandidate
@@ -228,5 +229,89 @@ def test_meta_schedule_xgb_model_reupdate():
     model.predict(TuneContext(), [_dummy_candidate() for i in range(predict_sample_count)])
 
 
+def test_meta_schedule_xgb_model_callback():
+    import xgboost as xgb
+    from itertools import chain as itertools_chain
+    from functools import partial
+
+    extractor = RandomFeatureExtractor()
+    model = XGBModel(extractor=extractor, num_warmup_samples=10)
+    update_sample_count = 20
+    predict_sample_count = 30
+
+    model.update(
+        TuneContext(),
+        [_dummy_candidate() for i in range(update_sample_count)],
+        [_dummy_result() for i in range(update_sample_count)],
+    )
+    model.predict(TuneContext(), [_dummy_candidate() for i in range(predict_sample_count)])
+    with tempfile.NamedTemporaryFile() as path:
+        # Backup and train on new TrainingCallBack api
+        random_state = model.extractor.random_state  # save feature extractor's random state
+
+        model.save(path.name)
+
+        old_booster = model.booster
+        xs = [
+            x.numpy().astype("float32")
+            for x in extractor.extract_from(
+                TuneContext(),
+                [_dummy_candidate() for i in range(predict_sample_count)],
+            )
+        ]
+        d_test = PackSum(xs=xs, ys=None)
+        pred1 = old_booster.predict(d_test.dmatrix)
+
+        # Load and train on deprecated TrainingCallBack api
+        model.extractor.random_state = random_state  # load feature extractor's random state
+        model.load(path.name)
+        d_train = PackSum(
+            xs=list(itertools_chain.from_iterable([g.features for g in model.data.values()])),
+            ys=np.concatenate(
+                [g.min_cost / g.costs for g in model.data.values()],
+                axis=0,
+            ),
+        )
+
+        def obj(ys_pred: np.ndarray, d_train1: "xgb.DMatrix"):  # type: ignore # pylint: disable = unused-argument
+            return d_train.obj_square_error(ys_pred)
+
+        def rmse(ys_pred: np.ndarray, d_train1: "xgb.DMatrix"):  # type: ignore # pylint: disable = unused-argument
+            return d_train.rmse(ys_pred)
+
+        def avg_peak_score(ys_pred: np.ndarray, d_train1: "xgb.DMatrix"):  # type: ignore # pylint: disable = unused-argument
+            return d_train.average_peak_score(ys_pred, model.average_peak_n)
+
+        new_booster = xgb.train(
+            model.config.to_dict(),
+            d_train.dmatrix,
+            num_boost_round=10000,
+            obj=obj,
+            callbacks=[
+                partial(
+                    XGBoostCustomCallback(
+                        early_stopping_rounds=model.early_stopping_rounds,
+                        verbose_eval=model.verbose_eval,
+                        fevals=[rmse, avg_peak_score],
+                        evals=[(d_train.dmatrix, "tr")],
+                        cvfolds=None,
+                    )
+                )
+            ],
+        )
+
+        xs = [
+            x.numpy().astype("float32")
+            for x in extractor.extract_from(
+                TuneContext(),
+                [_dummy_candidate() for i in range(predict_sample_count)],
+            )
+        ]
+        d_test = PackSum(xs=xs, ys=None)
+        pred2 = new_booster.predict(d_test.dmatrix)
+
+    assert np.allclose(pred1, pred2, rtol=1e-3, atol=1e-3)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 46ea2ed42ee41225141c5ed522900d340b08944d Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Sun, 25 Sep 2022 22:50:51 -0700
Subject: [PATCH 242/704] [MetaSchedule][UX] User Interface for Jupyter
 Notebook (#12866)

* Add features for jupyter notebook.

* Fix workload import warnings.

* Enable output clearing for cli.

* Fix test.

* Fix lint.

* Change to separate cleaning function.
---
 .../meta_schedule/testing/relay_workload.py   | 18 +++----
 python/tvm/meta_schedule/utils.py             | 18 ++++++-
 .../task_scheduler/gradient_based.cc          | 50 ++++++++++++++-----
 src/meta_schedule/utils.h                     | 32 ++++++++++++
 .../unittest/test_meta_schedule_tune_relay.py | 18 +++----
 5 files changed, 104 insertions(+), 32 deletions(-)

diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py
index 98bb99512020..9dcff2ace583 100644
--- a/python/tvm/meta_schedule/testing/relay_workload.py
+++ b/python/tvm/meta_schedule/testing/relay_workload.py
@@ -61,23 +61,23 @@ def _get_network(
         assert layout is None or layout in ["NCHW", "NHWC"]
 
         if name in ["resnet_18", "resnet_50"]:
-            model = getattr(models, name.replace("_", ""))(pretrained=False)
+            model = getattr(models, name.replace("_", ""))(weights=None)
         elif name == "wide_resnet_50":
-            model = getattr(models, "wide_resnet50_2")(pretrained=False)
+            model = getattr(models, "wide_resnet50_2")(weights=None)
         elif name == "resnext_50":
-            model = getattr(models, "resnext50_32x4d")(pretrained=False)
+            model = getattr(models, "resnext50_32x4d")(weights=None)
         elif name == "mobilenet_v2":
-            model = getattr(models, name)(pretrained=False)
+            model = getattr(models, name)(weights=None)
         elif name == "mobilenet_v3":
-            model = getattr(models, name + "_large")(pretrained=False)
+            model = getattr(models, name + "_large")(weights=None)
         elif name == "inception_v3":
-            model = getattr(models, name)(pretrained=False, aux_logits=False)
+            model = getattr(models, name)(weights=None, aux_logits=False)
         elif name == "densenet_121":
-            model = getattr(models, name.replace("_", ""))(pretrained=False)
+            model = getattr(models, name.replace("_", ""))(weights=None)
         elif name == "resnet3d_18":
-            model = models.video.r3d_18(pretrained=False)
+            model = models.video.r3d_18(weights=None)
         elif name == "vgg_16":
-            model = getattr(models, name.replace("_", ""))(pretrained=False)
+            model = getattr(models, name.replace("_", ""))(weights=None)
 
         dtype = "float32"
         input_data = torch.randn(input_shape).type(  # pylint: disable=no-member
diff --git a/python/tvm/meta_schedule/utils.py b/python/tvm/meta_schedule/utils.py
index 26bf20670955..7b7c4a68653d 100644
--- a/python/tvm/meta_schedule/utils.py
+++ b/python/tvm/meta_schedule/utils.py
@@ -371,11 +371,27 @@ def make_logging_func(logger: logging.Logger) -> Optional[Callable]:
     }
 
     def logging_func(level: int, msg: str):
-        level2log[level](msg)
+        def clear_notebook_output():
+            from IPython.display import clear_output  # type: ignore # pylint: disable=import-outside-toplevel
+
+            clear_output(wait=True)
+
+        if level < 0:
+            clear_notebook_output()
+        else:
+            level2log[level](msg)
 
     return logging_func
 
 
+@register_func("meta_schedule.using_ipython")
+def _check_ipython_env():
+    try:
+        return get_ipython().__class__.__name__ == "ZMQInteractiveShell"  # type: ignore
+    except NameError:
+        return False
+
+
 def parameterize_config(config: Dict[str, Any], params: Dict[str, str]) -> Dict[str, Any]:
     """Parameterize the given configuration.
 
diff --git a/src/meta_schedule/task_scheduler/gradient_based.cc b/src/meta_schedule/task_scheduler/gradient_based.cc
index 73d191f593fe..506bb620e1d8 100644
--- a/src/meta_schedule/task_scheduler/gradient_based.cc
+++ b/src/meta_schedule/task_scheduler/gradient_based.cc
@@ -61,22 +61,43 @@ class GradientBasedNode final : public TaskSchedulerNode {
     int total_trials = 0;
     double total_latency = 0.0;
     support::TablePrinter p;
-    p.Row() << "ID"
-            << "Name"
-            << "FLOP"
-            << "Weight"
-            << "Speed (GFLOPS)"
-            << "Latency (us)"
-            << "Weighted Latency (us)"
-            << "Trials"
-            << "Terminated";
+
+    if (using_ipython()) {
+      p.Row() << "ID"
+              << "Name"
+              << "FLOP"
+              << "Weight"
+              << "GFLOPS"
+              << "Latency (us)"
+              << "Wtd. Latency"
+              << "Trials"
+              << "Terminated";
+    } else {
+      p.Row() << "ID"
+              << "Name"
+              << "FLOP"
+              << "Weight"
+              << "Speed (GFLOPS)"
+              << "Latency (us)"
+              << "Weighted Latency (us)"
+              << "Trials"
+              << "Terminated";
+    }
+
     p.Separator();
+
     for (int i = 0; i < n_tasks; ++i) {
       const TaskRecord& record = task_records_[i];
       auto row = p.Row();
       int trials = record.trials;
+      String task_name = record.task->task_name.value();
+      if (using_ipython() && task_name.length() > 23) {
+        std::string temp = task_name.c_str();
+        temp = temp.substr(0, 20) + "...";
+        task_name = String(temp);
+      }
       row << /*id=*/i                                     //
-          << /*name=*/record.task->task_name.value()      //
+          << /*name=*/task_name                           //
           << /*flops=*/static_cast<int64_t>(record.flop)  //
           << /*weight=*/static_cast<int>(record.weight);
       double latency = 1e9;
@@ -101,9 +122,10 @@ class GradientBasedNode final : public TaskSchedulerNode {
       }
     }
     p.Separator();
-    os << p.AsStr()                                  //
-       << "\nTotal trials: " << total_trials         //
-       << "\nTotal latency (us): " << total_latency  //
+    os << p.AsStr()                                                    //
+       << "\nProgress: " << total_trials / (max_trials * 0.01) << "%"  //
+       << "\nTotal Trials: " << total_trials << " / " << max_trials    //
+       << "\nTotal latency (us): " << total_latency                    //
        << "\n";
     return os.str();
   }
@@ -112,6 +134,7 @@ class GradientBasedNode final : public TaskSchedulerNode {
     int n_tasks = task_records_.size();
     // Round robin
     if (num_rounds_already_ == 0) {
+      TVM_PY_LOG_CLEAR_SCREEN(this->logging_func);
       TVM_PY_LOG(INFO, this->logging_func) << "\n" << this->TuningStatistics();
     }
     if (num_rounds_already_ < n_tasks) {
@@ -178,6 +201,7 @@ class GradientBasedNode final : public TaskSchedulerNode {
     }
     record.best_time_cost_history.push_back(best_time_cost);
     record.trials += results.size();
+    TVM_PY_LOG_CLEAR_SCREEN(this->logging_func);
     TVM_PY_LOG(INFO, this->logging_func)
         << "[Updated] Task #" << task_id << ": " << record.task->task_name << "\n"
         << this->TuningStatistics();
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index cf9a32917031..f0b736081670 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -59,6 +59,7 @@
   ::tvm::meta_schedule::PyLogMessage(__FILE__, __LINE__, logging_func,   \
                                      PyLogMessage::Level::logging_level) \
       .stream()
+#define TVM_PY_LOG_CLEAR_SCREEN(logging_func) clear_logging(__FILE__, __LINE__, logging_func)
 
 namespace tvm {
 namespace meta_schedule {
@@ -66,10 +67,13 @@ namespace meta_schedule {
 /*!
  * \brief Class to accumulate an log message on the python side. Do not use directly, instead use
  * TVM_PY_LOG(DEBUG), TVM_PY_LOG(INFO), TVM_PY_LOG(WARNING), TVM_PY_ERROR(ERROR).
+ * \sa TVM_PY_LOG
+ * \sa TVM_PY_LOG_CLEAR_SCREEN
  */
 class PyLogMessage {
  public:
   enum class Level : int32_t {
+    CLEAR = -10,
     DEBUG = 10,
     INFO = 20,
     WARNING = 30,
@@ -81,6 +85,8 @@ class PyLogMessage {
       : file_(file), lineno_(lineno), logging_func_(logging_func), logging_level_(logging_level) {}
 
   TVM_NO_INLINE ~PyLogMessage() {
+    ICHECK(logging_level_ != Level::CLEAR)
+        << "Cannot use CLEAR as logging level in TVM_PY_LOG, please use TVM_PY_LOG_CLEAR_SCREEN.";
     if (this->logging_func_.defined()) {
       logging_func_(static_cast<int>(logging_level_), stream_.str());
     } else {
@@ -107,6 +113,32 @@ class PyLogMessage {
   Level logging_level_;
 };
 
+/*!
+ * \brief Whether the tuning is running on ipython kernel.
+ * \return A boolean indicating whether ipython kernel is used.
+ */
+inline bool using_ipython() {
+  bool flag = false;
+  const auto* f_using_ipython = runtime::Registry::Get("meta_schedule.using_ipython");
+  if (f_using_ipython->defined()) flag = (*f_using_ipython)();
+  return flag;
+}
+
+/*!
+ * \brief A helper function to clear logging output for ipython kernel and console.
+ * \param file The file name.
+ * \param lineno The line number.
+ * \param logging_func The logging function.
+ */
+inline void clear_logging(const char* file, int lineno, PackedFunc logging_func) {
+  if (logging_func.defined() && using_ipython()) {
+    logging_func(static_cast<int>(PyLogMessage::Level::CLEAR), "");
+  } else {
+    // this would clear all logging output in the console
+    runtime::detail::LogMessage(file, lineno).stream() << "\033c\033[3J\033[2J\033[0m\033[H";
+  }
+}
+
 /*! \brief The type of the random state */
 using TRandState = support::LinearCongruentialEngine::TRandState;
 
diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py
index 0267352fd697..5cc4f8f6a404 100644
--- a/tests/python/unittest/test_meta_schedule_tune_relay.py
+++ b/tests/python/unittest/test_meta_schedule_tune_relay.py
@@ -115,11 +115,11 @@ def main(placeholder: T.Buffer[(1, 2, 16, 16, 4), "float32"], T_layout_trans: T.
 @pytest.mark.parametrize(
     "model_name, input_shape, target, layout",
     [
-        ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC"),
+        ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=12", "NHWC"),
         ("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NHWC"),
-        ("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC"),
+        ("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=12", "NHWC"),
         ("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NHWC"),
-        ("bert_base", [1, 64], "llvm --num-cores=16", None),
+        ("bert_base", [1, 64], "llvm --num-cores=12", None),
         ("bert_base", [1, 64], "nvidia/geforce-rtx-3070", None),
     ],
 )
@@ -242,7 +242,7 @@ def print_results(self) -> None:
 
     input_name = "data"
     dev = tvm.cpu()
-    target = Target("llvm --num-cores=16")
+    target = Target("llvm --num-cores=12")
     data = tvm.nd.array(data_sample, dev)
 
     database = TestDummyDatabase()
@@ -250,7 +250,7 @@ def print_results(self) -> None:
     database.commit_workload(tvmgen_default_fused_layout_transform_1)
     database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc)
 
-    with database, tvm.transform.PassContext(
+    with database, tvm.transform.PassContext(  # pylint: disable=not-context-manager
         opt_level=3,
         config={"relay.backend.use_meta_schedule": True},
     ):
@@ -295,7 +295,7 @@ def test_meta_schedule_relay_lowering():
 
     input_name = "data"
     dev = tvm.cpu()
-    target = Target("llvm --num-cores=16")
+    target = Target("llvm --num-cores=12")
     data = tvm.nd.array(data_sample, dev)
 
     with tempfile.TemporaryDirectory() as work_dir:
@@ -542,11 +542,11 @@ def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV):
 
 
 if __name__ == """__main__""":
-    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", None)
+    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=12", None)
     test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NCHW")
-    test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16", None)
+    test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=12", None)
     test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", None)
-    test_meta_schedule_tune_relay("bert_base", [1, 64], "llvm --num-cores=16", None)
+    test_meta_schedule_tune_relay("bert_base", [1, 64], "llvm --num-cores=12", None)
     test_meta_schedule_tune_relay("bert_base", [1, 64], "nvidia/geforce-rtx-3070", None)
     test_meta_schedule_te2primfunc_argument_order()
     test_meta_schedule_relay_lowering()

From cc6e01edc6390de19c72f9283a6d4fa178672836 Mon Sep 17 00:00:00 2001
From: chengven027-intellif <darkvan_wen@hotmail.com>
Date: Mon, 26 Sep 2022 17:13:22 +0800
Subject: [PATCH 243/704] [frontend][pytorch]support aten::zero_ operator
 (#12872)

support aten::zero_ operator
---
 python/tvm/relay/frontend/pytorch.py          | 5 +++++
 tests/python/frontend/pytorch/test_forward.py | 7 +++++++
 2 files changed, 12 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index e35e23b3381c..c1bf69502ba8 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -738,6 +738,10 @@ def zeros(self, inputs, input_types):
             dtype = self.default_dtype
         return self.full_impl(data, 0, dtype)
 
+    def zero_(self, inputs, input_types):
+        data = inputs[0]
+        return self.full_impl(self.infer_shape(data), 0, input_types[0])
+
     def zeros_like(self, inputs, input_types):
         data = inputs[0]
         out = _op.zeros_like(data)
@@ -3462,6 +3466,7 @@ def create_convert_map(self):
             "aten::ones": self.ones,
             "aten::ones_like": self.ones_like,
             "aten::zeros": self.zeros,
+            "aten::zero_": self.zero_,
             "aten::zeros_like": self.zeros_like,
             "aten::new_ones": self.new_ones,
             "aten::full": self.full,
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 5236b763faf0..33c70a4d74a4 100755
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -3257,6 +3257,13 @@ def forward(self, *args):
     verify_model(Zeros1().float().eval(), input_data=[])
 
 
+def test_forward_zero_():
+    def test_func(x):
+        return x.zero_()
+
+    verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()])
+
+
 @tvm.testing.uses_gpu
 def test_forward_zeros_like():
     """test_forward_zeros_like"""

From 87085b0e0dad2a422993472e35431d4f22fd69d8 Mon Sep 17 00:00:00 2001
From: chengven027-intellif <darkvan_wen@hotmail.com>
Date: Mon, 26 Sep 2022 17:14:33 +0800
Subject: [PATCH 244/704] [frontend][pytorch]Support aten::Tensor_split
 operator (#12871)

Support aten::Tensor_split operator
---
 python/tvm/relay/frontend/pytorch.py          | 54 +++++++++++++++++++
 tests/python/frontend/pytorch/test_forward.py | 22 ++++++++
 2 files changed, 76 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index c1bf69502ba8..1b86b120dfcc 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -559,6 +559,59 @@ def split_with_sizes(self, inputs, input_types):
 
         return _op.split(data, indices, dim)
 
+    def tensor_split(self, inputs, input_types):
+        # Reference: https://pytorch.org/docs/stable/generated/torch.tensor_split.html
+        import torch
+
+        if not isinstance(inputs[1], (int, list, tuple, torch.Tensor)):
+            msg = "indices_or_sections type %s could not be parsed in tensor_split op" % (
+                type(inputs[1])
+            )
+            raise AssertionError(msg)
+
+        if isinstance(inputs[1], torch.Tensor) and not (
+            list(inputs[1].shape) == [] or list(inputs[1].shape) == 1
+        ):
+            msg = "indices_or_sections must be a zero-dimensional or one-dimensional long tensor"
+            raise AssertionError(msg)
+
+        if isinstance(inputs[1], int) or (
+            isinstance(inputs[1], torch.Tensor) and list(inputs[1].shape) == []
+        ):
+            data = inputs[0]
+            n = int(inputs[1])
+            dim = int(inputs[2])
+
+            split_size = int(self.infer_shape(data)[dim] / n)
+            split_rest = int(self.infer_shape(data)[dim] % n)
+
+            indices = []
+            split_index = split_size
+            if split_rest == 0:
+                for i in range(n - 1):
+                    indices.append(split_index)
+                    split_index += split_size
+            else:
+                for i in range(split_rest):
+                    indices.append(split_index + 1)
+                    split_index = (i + 1) * (split_index + 1)
+                for i in range(n - split_rest - 1):
+                    split_index += split_size
+                    indices.append(split_index)
+
+            return _op.split(data, indices, dim)
+        else:
+            data = inputs[0]
+            sections = inputs[1]
+            dim = int(inputs[2])
+
+            if isinstance(sections, tuple):
+                sections = list(sections)
+            elif isinstance(sections, torch.Tensor):
+                sections = sections.cpu().numpy().tolist()
+
+            return _op.split(data, sections, dim)
+
     def select(self, inputs, input_types):
         data = inputs[0]
         dim = int(inputs[1])
@@ -3484,6 +3537,7 @@ def create_convert_map(self):
             "aten::slice": self.slice,
             "aten::narrow": self.narrow,
             "aten::split": self.split,
+            "aten::tensor_split": self.tensor_split,
             "aten::split_with_sizes": self.split_with_sizes,
             "aten::select": self.select,
             "aten::take": self.take,
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 33c70a4d74a4..3c8bd5efd80d 100755
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -959,6 +959,28 @@ def forward(self, *args):
     verify_model(Split([2, 3, 5], 1).float().eval(), input_data=input_data)
 
 
+@tvm.testing.uses_gpu
+def test_forward_tensor_split():
+    """test_forward_tensor_split"""
+    torch.set_grad_enabled(False)
+    input_shape = [4, 10]
+
+    class Tensor_Split(Module):
+        def __init__(self, split_size_or_sections, dim):
+            super().__init__()
+            self.split_size_or_sections = split_size_or_sections
+            self.dim = dim
+
+        def forward(self, *args):
+            return torch.tensor_split(args[0], self.split_size_or_sections, self.dim)
+
+    input_data = torch.rand(input_shape).float()
+    verify_model(Tensor_Split(2, 0).float().eval(), input_data=input_data)
+    verify_model(Tensor_Split(torch.tensor(3), 1).float().eval(), input_data=input_data)
+    verify_model(Tensor_Split([2, 3, 5], 1).float().eval(), input_data=input_data)
+    verify_model(Tensor_Split((2, 3, 5), 1).float().eval(), input_data=input_data)
+
+
 @tvm.testing.uses_gpu
 def test_forward_avgpool1d():
     """test_forward_avgpool1d"""

From 4ef1465d409655322cbeacbbb1b64e7791b7bf8a Mon Sep 17 00:00:00 2001
From: Florin Blanaru <florin.blanaru96@gmail.com>
Date: Mon, 26 Sep 2022 13:37:33 +0300
Subject: [PATCH 245/704] [skip ci] Temporarily disable comments bot (#12903)

[skip ci] Disable comment bot
---
 .github/{workflows => disabled_workflows}/pr_comment_bot.yml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename .github/{workflows => disabled_workflows}/pr_comment_bot.yml (100%)

diff --git a/.github/workflows/pr_comment_bot.yml b/.github/disabled_workflows/pr_comment_bot.yml
similarity index 100%
rename from .github/workflows/pr_comment_bot.yml
rename to .github/disabled_workflows/pr_comment_bot.yml

From b6a660be5860a851725b417565ecf71bfa343bc7 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Mon, 26 Sep 2022 09:38:43 -0700
Subject: [PATCH 246/704] [BUILD] Re-enable ccache by default (#12839)

* [BUILD] Re-enable ccache by default

Previously ccache was disabled because of possible issues with hexagon.
Re-enabling it to provide a best effort attempt at using it.

* set tvm_option, set variables correctly

* clean up comment, fatal error if launcher is defined with USE_CCACHE=ON

* add ccache to libinfo

* more libinfo

* add launcher to summary, move ccache to seperate file

* Update cmake/utils/Summary.cmake

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>

* correct name for Summary.cmake

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>
---
 CMakeLists.txt               |  6 +++++
 cmake/config.cmake           | 11 ++++++++
 cmake/modules/LibInfo.cmake  |  1 +
 cmake/utils/CCache.cmake     | 52 ++++++++++++++++++++++++++++++++++++
 cmake/utils/Summary.cmake    |  1 +
 docs/install/from_source.rst |  2 ++
 src/support/libinfo.cc       |  1 +
 7 files changed, 74 insertions(+)
 create mode 100644 cmake/utils/CCache.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c355238b8c8..188f9fb1c7a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,7 @@ tvm_option(USE_PAPI "Use Performance Application Programming Interface (PAPI) to
 tvm_option(USE_GTEST "Use GoogleTest for C++ sanity tests" AUTO)
 tvm_option(USE_CUSTOM_LOGGING "Use user-defined custom logging, tvm::runtime::detail::LogFatalImpl and tvm::runtime::detail::LogMessageImpl must be implemented" OFF)
 tvm_option(USE_ALTERNATIVE_LINKER "Use 'mold' or 'lld' if found when invoking compiler to link artifact" AUTO)
+tvm_option(USE_CCACHE "Use ccache if found when invoking compiler" AUTO)
 
 # 3rdparty libraries
 tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include")
@@ -460,6 +461,11 @@ if(USE_PIPELINE_EXECUTOR)
   list(APPEND RUNTIME_SRCS ${RUNTIME_PIPELINE_SRCS})
 endif(USE_PIPELINE_EXECUTOR)
 
+# Caches the build.
+# Note that ccache-3.x doesn't support nvcc well, so CUDA kernels may never hit the cache and still
+# need to be re-compiled every time. Using ccache 4.0+ can resolve this issue.
+include(cmake/utils/CCache.cmake)
+
 # Module rules
 include(cmake/modules/VTA.cmake)
 include(cmake/modules/StandaloneCrt.cmake)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 18725de844b2..7067af42e9f1 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -352,6 +352,17 @@ set(USE_LIBBACKTRACE AUTO)
 # runtime functions to be unavailable to the program.
 set(BUILD_STATIC_RUNTIME OFF)
 
+# Caches the build so that building is faster when switching between branches.
+# If you switch branches, build and then encounter a linking error, you may
+# need to regenerate the build tree through "make .." (the cache will
+# still provide significant speedups).
+# Possible values:
+# - AUTO: search for path to ccache, disable if not found.
+# - ON: enable ccache by searching for the path to ccache, report an error if not found
+# - OFF: disable ccache
+# - /path/to/ccache: use specific path to ccache
+set(USE_CCACHE AUTO)
+
 # Whether to enable PAPI support in profiling. PAPI provides access to hardware
 # counters while profiling.
 # Possible values:
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index 6bc8f6b46390..73d3a9dbbe10 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -117,6 +117,7 @@ function(add_lib_info src_file)
     TVM_INFO_USE_CLML="${USE_CLML}"
     TVM_INFO_USE_CLML_GRAPH_EXECUTOR="${USE_CLML_GRAPH_EXECUTOR}"
     TVM_INFO_USE_UMA="${USE_UMA}"
+    TVM_INFO_USE_CCACHE="${USE_CCACHE}"
   )
 
 endfunction()
diff --git a/cmake/utils/CCache.cmake b/cmake/utils/CCache.cmake
new file mode 100644
index 000000000000..f38a36b5dee8
--- /dev/null
+++ b/cmake/utils/CCache.cmake
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(USE_CCACHE) # True for AUTO, ON, /path/to/ccache
+  if(DEFINED CXX_COMPILER_LAUNCHER OR DEFINED C_COMPILER_LAUNCHER)
+    if("${USE_CCACHE}" STREQUAL "AUTO")
+      message(STATUS "CXX_COMPILER_LAUNCHER or C_COMPILER_LAUNCHER already defined, not using ccache")
+    elseif("${USE_CCACHE}" MATCHES ${IS_TRUE_PATTERN})
+      message(FATAL_ERROR "CXX_COMPILER_LAUNCHER or C_COMPILER_LAUNCHER is already defined, refusing to override with ccache. Either unset or disable ccache.")
+    endif()
+  else()
+    if("${USE_CCACHE}" STREQUAL "AUTO") # Auto mode
+      find_program(CCACHE_FOUND "ccache")
+      if(CCACHE_FOUND)
+        message(STATUS "Found the path to ccache, enabling ccache")
+        set(PATH_TO_CCACHE "ccache")
+      else()
+        message(STATUS "Didn't find the path to CCACHE, disabling ccache")
+      endif(CCACHE_FOUND)
+    elseif("${USE_CCACHE}" MATCHES ${IS_TRUE_PATTERN})
+      find_program(CCACHE_FOUND "ccache")
+      if(CCACHE_FOUND)
+        message(STATUS "Found the path to ccache, enabling ccache")
+        set(PATH_TO_CCACHE "ccache")
+      else()
+        message(FATAL_ERROR "Cannot find ccache. Set USE_CCACHE mode to AUTO or OFF to build without ccache. USE_CCACHE=" "${USE_CCACHE}")
+      endif(CCACHE_FOUND)
+    else() # /path/to/ccache
+      set(PATH_TO_CCACHE "${USE_CCACHE}")
+      message(STATUS "Setting ccache path to " "${PATH_TO_CCACHE}")
+    endif()
+    # Set the flag for ccache
+    if(DEFINED PATH_TO_CCACHE)
+      set(CXX_COMPILER_LAUNCHER "${PATH_TO_CCACHE}")
+      set(C_COMPILER_LAUNCHER "${PATH_TO_CCACHE}")
+    endif()
+  endif()
+endif(USE_CCACHE)
diff --git a/cmake/utils/Summary.cmake b/cmake/utils/Summary.cmake
index 1b973f253a00..e3ea925a9ae1 100644
--- a/cmake/utils/Summary.cmake
+++ b/cmake/utils/Summary.cmake
@@ -42,6 +42,7 @@ macro(print_summary)
     message(STATUS "  C++ compiler ID       : ${CMAKE_CXX_COMPILER_ID}")
     message(STATUS "  C++ compiler version  : ${CMAKE_CXX_COMPILER_VERSION}")
     message(STATUS "  CXX flags             : ${CMAKE_CXX_FLAGS}")
+    message(STATUS "  CXX launcher          : ${CXX_COMPILER_LAUNCHER}")
     message(STATUS "  Linker flags          : ${CMAKE_SHARED_LINKER_FLAGS}")
     message(STATUS "  Build type            : ${CMAKE_BUILD_TYPE}")
     get_directory_property(READABLE_COMPILE_DEFS DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS)
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index e5622b40a173..63d8aab33623 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -141,6 +141,8 @@ The configuration of TVM can be modified by editing `config.cmake` and/or by pas
   - On supported platforms, the `Ccache compiler wrapper <https://ccache.dev/>`_ may be helpful for
     reducing TVM's build time.  There are several ways to enable CCache in TVM builds:
 
+    - Leave `USE_CCACHE=AUTO` in `build/config.cmake`. CCache will be used if it is found.
+
     - Ccache's Masquerade mode. This is typically enabled during the Ccache installation process.
       To have TVM use Ccache in masquerade, simply specify the appropriate C/C++ compiler
       paths when configuring TVM's build system.  For example:
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index 4b2f6034730d..a7d8e6a1ae2d 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -318,6 +318,7 @@ TVM_DLL Map<String, String> GetLibInfo() {
       {"USE_CLML", TVM_INFO_USE_CLML},
       {"USE_CLML_GRAPH_EXECUTOR", TVM_INFO_USE_CLML_GRAPH_EXECUTOR},
       {"USE_UMA", TVM_INFO_USE_UMA},
+      {"USE_CCACHE", TVM_INFO_USE_CCACHE},
   };
   return result;
 }

From 8711ba44b9bebc54bb4bc3c3f456ee3ce3d40eed Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Mon, 26 Sep 2022 09:52:02 -0700
Subject: [PATCH 247/704] [TVMScript] Import TIR methods into the IRBuilder
 (#12900)

This PR introduces remaining TIR methods into IRBuilder

Co-authored-by: yongwww <yongcale@gmail.com>
---
 include/tvm/script/ir_builder/tir/ir.h        |   8 +
 python/tvm/script/ir_builder/tir/ir.py        | 396 +++++++++++++++++-
 src/script/ir_builder/tir/ir.cc               |  11 +
 .../unittest/test_tvmscript_ir_builder_tir.py |  15 +
 4 files changed, 428 insertions(+), 2 deletions(-)

diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
index dd289b691502..7460099f9448 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -435,6 +435,14 @@ void Prefetch(Buffer buffer, Array<Range> bounds);
  */
 void Evaluate(PrimExpr value);
 
+/*!
+ * \brief The pointer declaration function.
+ * \param dtype The data type of the pointer.
+ * \param storage_scope The storage scope of the pointer.
+ * \return The pointer.
+ */
+PrimExpr Ptr(runtime::DataType dtype, String storage_scope = "global");
+
 #define TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(FuncName, DType)                             \
   inline PrimExpr FuncName(Optional<PrimExpr> expr = NullOpt) {                        \
     DataType dtype = DType;                                                            \
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index 625e1291ff20..4ec1511f2907 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -17,24 +17,35 @@
 # pylint: disable=missing-docstring
 """IRBuilder for TIR"""
 
+import inspect
+import functools
 from numbers import Integral
-from typing import Any, Dict, List, Optional, Union, Tuple
+from typing import Any, Callable, Dict, List, Optional, Union, Tuple
 import numpy as np  # type: ignore
 
 from tvm.ir import Range, Type
 from tvm.runtime import convert, ndarray
+from tvm.target.codegen import llvm_lookup_intrinsic_id
 from tvm.tir import (
     Buffer,
     BufferLoad,
     BufferRegion,
+    Cast,
+    CommReducer,
     IntImm,
     IterVar,
     Let,
     PrimExpr,
+    Select,
+    Shuffle,
     StringImm,
+    type_annotation,
     Var,
 )
+from tvm.tir import Broadcast as broadcast
 from tvm.tir import Ramp as ramp
+from tvm.tir import op as _tir_op
+from tvm.tir.generic import cast
 
 from . import _ffi_api, frame
 
@@ -1501,7 +1512,7 @@ def void(expr: Optional[PrimExpr] = None) -> PrimExpr:
     return _ffi_api.Void(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
-def var(dtype, name="") -> Var:
+def var(dtype: str, name: str = "") -> Var:
     """Construct a new tir.Var.
 
     Parameters
@@ -1520,6 +1531,268 @@ def var(dtype, name="") -> Var:
     return Var(name, dtype)  # pylint: disable=no-member
 
 
+def ptr(dtype: str, storage_scope: str = "global") -> Var:
+    """The pointer declaration function.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type of the pointer.
+
+    storage_scope : str
+        The storage scope of the pointer.
+
+    Returns
+    -------
+    res : Var
+        The pointer.
+    """
+    return _ffi_api.Ptr(dtype, storage_scope)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def min(a: PrimExpr, b: PrimExpr) -> PrimExpr:  # pylint: disable=redefined-builtin
+    """Compute the minimum value of two expressions.
+
+    Parameters
+    ----------
+    a : PrimExpr
+        The left hand operand
+
+    b : PrimExpr
+        The right hand operand
+
+    Returns
+    -------
+    res : PrimExpr
+        The result expression.
+    """
+    return _ffi_api.min(a, b)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def max(a: PrimExpr, b: PrimExpr) -> PrimExpr:  # pylint: disable=redefined-builtin
+    """Compute the maximum value of two expressions.
+
+    Parameters
+    ----------
+    a : PrimExpr
+        The left hand operand
+
+    b : PrimExpr
+        The right hand operand
+
+    Returns
+    -------
+    res : PrimExpr
+        The result expression.
+    """
+    return _ffi_api.max(a, b)  # type: ignore[attr-defined] # pylint: disable=no-member
+
+
+def iter_var(v: Union[Var, str], dom: Range, iter_type: str, thread_tag: str) -> IterVar:
+    """The iteration variable.
+
+    Parameters
+    ----------
+    var : Union[Var, str]
+        The internal variable that is used for iteration.
+
+    dom : Range
+        The domain of the iteration.
+
+    iter_type : str
+        The iteration type.
+
+    thread_tag : str
+        The thread type tag.
+
+    Returns
+    -------
+    res : IterVar
+        The iteration variable.
+    """
+    iter_type = getattr(IterVar, iter_type)
+    return IterVar(dom, v, iter_type, thread_tag)
+
+
+def comm_reducer(combiner: Callable, identity: List[PrimExpr]) -> CommReducer:
+    """
+    Create a CommReducer from lambda inputs/outputs and the identities
+
+    Parameters
+    ----------
+    combiner : Callable
+        A binary function which takes two PrimExpr as input to return a PrimExpr.
+
+    identity : List[PrimExpr]
+        A list of types of output PrimExpr.
+
+    Returns
+    -------
+    res : CommReducer
+        The CommReducer.
+    """
+    params = inspect.signature(combiner).parameters
+    num_args = len(params)
+    args = []
+    for name, i in zip(params.keys(), identity + identity):
+        if isinstance(i, int):
+            args.append(Var(name, "int32"))
+        else:
+            args.append(Var(name, i.dtype))
+    res = combiner(*args)
+    if not isinstance(res, tuple):
+        res = (res,)
+    return CommReducer(args[: num_args // 2], args[num_args // 2 :], res, identity)
+
+
+def _op_wrapper(func):
+    @functools.wraps(func)
+    def wrapped(*args, **kwargs):
+        if "dtype" in kwargs:
+            kwargs.pop("dtype")
+        return func(*args, **kwargs)
+
+    return wrapped
+
+
+def _dtype_forward(func):
+    @functools.wraps(func)
+    def wrapped(*args, **kwargs):
+        if "dtype" in kwargs:
+            args = (kwargs.pop("dtype"),) + args
+        return func(*args, **kwargs)
+
+    return wrapped
+
+
+# pylint: disable=invalid-name
+
+buffer_var = ptr
+abs = _op_wrapper(_tir_op.abs)  # pylint: disable=redefined-builtin
+fabs = abs
+acos = _op_wrapper(_tir_op.acos)
+acosh = _op_wrapper(_tir_op.acosh)
+address_of = _op_wrapper(_tir_op.address_of)
+asin = _op_wrapper(_tir_op.asin)
+asinh = _op_wrapper(_tir_op.asinh)
+atan = _op_wrapper(_tir_op.atan)
+atan2 = _op_wrapper(_tir_op.atan2)
+atanh = _op_wrapper(_tir_op.atanh)
+ceil = _op_wrapper(_tir_op.ceil)
+clz = _op_wrapper(_tir_op.clz)
+copysign = _op_wrapper(_tir_op.copysign)
+cos = _op_wrapper(_tir_op.cos)
+cosh = _op_wrapper(_tir_op.cosh)
+erf = _op_wrapper(_tir_op.erf)
+exp = _op_wrapper(_tir_op.exp)
+exp2 = _op_wrapper(_tir_op.exp2)
+exp10 = _op_wrapper(_tir_op.exp10)
+floor = _op_wrapper(_tir_op.floor)
+ceildiv = _op_wrapper(_tir_op.ceildiv)
+floordiv = _op_wrapper(_tir_op.floordiv)
+floormod = _op_wrapper(_tir_op.floormod)
+fmod = _op_wrapper(_tir_op.fmod)
+hypot = _op_wrapper(_tir_op.hypot)
+if_then_else = _op_wrapper(_tir_op.if_then_else)
+infinity = _op_wrapper(_tir_op.infinity)
+isfinite = _op_wrapper(_tir_op.isfinite)
+isinf = _op_wrapper(_tir_op.isinf)
+isnan = _op_wrapper(_tir_op.isnan)
+isnullptr = _op_wrapper(_tir_op.isnullptr)
+ldexp = _op_wrapper(_tir_op.ldexp)
+likely = _op_wrapper(_tir_op.likely)
+log = _op_wrapper(_tir_op.log)
+log1p = _op_wrapper(_tir_op.log1p)
+log2 = _op_wrapper(_tir_op.log2)
+log10 = _op_wrapper(_tir_op.log10)
+lookup_param = _op_wrapper(_tir_op.lookup_param)
+max_value = _op_wrapper(_tir_op.max_value)
+min_value = _op_wrapper(_tir_op.min_value)
+nearbyint = _op_wrapper(_tir_op.nearbyint)
+nextafter = _op_wrapper(_tir_op.nextafter)
+popcount = _op_wrapper(_tir_op.popcount)
+power = _op_wrapper(_tir_op.power)
+q_multiply_shift = _op_wrapper(_tir_op.q_multiply_shift)
+ret = _op_wrapper(_tir_op.ret)
+reinterpret = _dtype_forward(_tir_op.reinterpret)
+round = _op_wrapper(_tir_op.round)  # pylint: disable=redefined-builtin
+rsqrt = _op_wrapper(_tir_op.rsqrt)
+shift_left = _op_wrapper(_tir_op.shift_left)
+shift_right = _op_wrapper(_tir_op.shift_right)
+sigmoid = _op_wrapper(_tir_op.sigmoid)
+sin = _op_wrapper(_tir_op.sin)
+sinh = _op_wrapper(_tir_op.sinh)
+sqrt = _op_wrapper(_tir_op.sqrt)
+tan = _op_wrapper(_tir_op.tan)
+tanh = _op_wrapper(_tir_op.tanh)
+trunc = _op_wrapper(_tir_op.trunc)
+truncdiv = _op_wrapper(_tir_op.truncdiv)
+truncmod = _op_wrapper(_tir_op.truncmod)
+tvm_access_ptr = _op_wrapper(_tir_op.tvm_access_ptr)
+tvm_throw_last_error = _op_wrapper(_tir_op.tvm_throw_last_error)
+tvm_stack_alloca = _op_wrapper(_tir_op.tvm_stack_alloca)
+tvm_stack_make_shape = _op_wrapper(_tir_op.tvm_stack_make_shape)
+tvm_stack_make_array = _op_wrapper(_tir_op.tvm_stack_make_array)
+call_packed = _op_wrapper(_tir_op.call_packed)
+call_cpacked = _op_wrapper(_tir_op.call_cpacked)
+call_packed_lowered = _op_wrapper(_tir_op.call_packed_lowered)
+call_cpacked_lowered = _op_wrapper(_tir_op.call_cpacked_lowered)
+call_extern = _dtype_forward(_tir_op.call_extern)
+call_intrin = _dtype_forward(_tir_op.call_intrin)
+call_llvm_intrin = _dtype_forward(_tir_op.call_llvm_intrin)
+call_llvm_pure_intrin = _dtype_forward(_tir_op.call_llvm_pure_intrin)
+call_pure_extern = _dtype_forward(_tir_op.call_pure_extern)
+tvm_access_ptr = _op_wrapper(_tir_op.tvm_access_ptr)
+tvm_tuple = _op_wrapper(_tir_op.tvm_tuple)
+tvm_struct_set = _op_wrapper(_tir_op.tvm_struct_set)
+tvm_struct_get = _tir_op.tvm_struct_get
+tvm_thread_allreduce = _op_wrapper(_tir_op.tvm_thread_allreduce)
+tvm_load_matrix_sync = _op_wrapper(_tir_op.tvm_load_matrix_sync)
+tvm_mma_sync = _op_wrapper(_tir_op.tvm_mma_sync)
+tvm_bmma_sync = _op_wrapper(_tir_op.tvm_bmma_sync)
+tvm_fill_fragment = _op_wrapper(_tir_op.tvm_fill_fragment)
+tvm_store_matrix_sync = _op_wrapper(_tir_op.tvm_store_matrix_sync)
+ptx_mma = _dtype_forward(_tir_op.ptx_mma)
+ptx_mma_sp = _dtype_forward(_tir_op.ptx_mma_sp)
+ptx_ldmatrix = _dtype_forward(_tir_op.ptx_ldmatrix)
+ptx_cp_async = _dtype_forward(_tir_op.ptx_cp_async)
+ptx_wait_group = _op_wrapper(_tir_op.ptx_wait_group)
+ptx_commit_group = _op_wrapper(_tir_op.ptx_commit_group)
+mma_store = _dtype_forward(_tir_op.mma_store)
+mma_fill = _dtype_forward(_tir_op.mma_fill)
+vectorlow = _dtype_forward(_tir_op.vectorlow)
+vectorhigh = _dtype_forward(_tir_op.vectorhigh)
+vectorcombine = _dtype_forward(_tir_op.vectorcombine)
+assume = _op_wrapper(_tir_op.assume)
+undef = _op_wrapper(_tir_op.undef)
+tvm_call_packed = call_packed
+tvm_call_cpacked = call_cpacked
+tvm_call_packed_lowered = call_packed_lowered
+tvm_call_cpacked_lowered = call_cpacked_lowered
+TVMBackendAllocWorkspace = _op_wrapper(_tir_op.TVMBackendAllocWorkspace)
+TVMBackendFreeWorkspace = _op_wrapper(_tir_op.TVMBackendFreeWorkspace)
+
+
+class inline:
+    """Inline function for meta-programming.
+
+    Parameters
+    ----------
+    value: Any
+        The value to be inlined.
+    """
+
+    def __init__(self, value: Any) -> None:
+        self.value = value
+
+    def __iter__(self):
+        def f():
+            for i in self.value:
+                yield inline(i)
+
+        return f()
+
+
 # pylint: enable=invalid-name
 
 
@@ -1581,4 +1854,123 @@ def var(dtype, name="") -> Var:
     "handle",
     "void",
     "var",
+    "ptr",
+    "min",
+    "max",
+    "iter_var",
+    "comm_reducer",
+    "buffer_var",
+    "abs",
+    "fabs",
+    "acos",
+    "acosh",
+    "address_of",
+    "asin",
+    "asinh",
+    "atan",
+    "atan2",
+    "atanh",
+    "ceil",
+    "clz",
+    "copysign",
+    "cos",
+    "cosh",
+    "erf",
+    "exp",
+    "exp2",
+    "exp10",
+    "floor",
+    "ceildiv",
+    "floordiv",
+    "floormod",
+    "fmod",
+    "hypot",
+    "if_then_else",
+    "infinity",
+    "isfinite",
+    "isinf",
+    "isnan",
+    "isnullptr",
+    "ldexp",
+    "likely",
+    "log",
+    "log1p",
+    "log2",
+    "log10",
+    "lookup_param",
+    "max_value",
+    "min_value",
+    "nearbyint",
+    "nextafter",
+    "popcount",
+    "power",
+    "q_multiply_shift",
+    "ret",
+    "reinterpret",
+    "round",
+    "rsqrt",
+    "shift_left",
+    "shift_right",
+    "sigmoid",
+    "sin",
+    "sinh",
+    "sqrt",
+    "tan",
+    "tanh",
+    "trunc",
+    "truncdiv",
+    "truncmod",
+    "tvm_access_ptr",
+    "tvm_throw_last_error",
+    "tvm_stack_alloca",
+    "tvm_stack_make_shape",
+    "tvm_stack_make_array",
+    "call_packed",
+    "call_cpacked",
+    "call_packed_lowered",
+    "call_cpacked_lowered",
+    "call_extern",
+    "call_intrin",
+    "call_llvm_intrin",
+    "call_llvm_pure_intrin",
+    "call_pure_extern",
+    "tvm_access_ptr",
+    "tvm_tuple",
+    "tvm_struct_set",
+    "tvm_struct_get",
+    "tvm_thread_allreduce",
+    "tvm_load_matrix_sync",
+    "tvm_mma_sync",
+    "tvm_bmma_sync",
+    "tvm_fill_fragment",
+    "tvm_store_matrix_sync",
+    "ptx_mma",
+    "ptx_mma_sp",
+    "ptx_ldmatrix",
+    "ptx_cp_async",
+    "ptx_wait_group",
+    "ptx_commit_group",
+    "mma_store",
+    "mma_fill",
+    "vectorlow",
+    "vectorhigh",
+    "vectorcombine",
+    "assume",
+    "undef",
+    "tvm_call_packed",
+    "tvm_call_cpacked",
+    "tvm_call_packed_lowered",
+    "tvm_call_cpacked_lowered",
+    "TVMBackendAllocWorkspace",
+    "TVMBackendFreeWorkspace",
+    "inline",
+    "llvm_lookup_intrinsic_id",
+    "Cast",
+    "Let",
+    "Select",
+    "Shuffle",
+    "type_annotation",
+    "broadcast",
+    "ramp",
+    "cast",
 ]
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index 28c3d69861fa..6be6e2619fea 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -534,6 +534,10 @@ DeclBufferFrame DeclBuffer(Array<PrimExpr> shape, DataType dtype, String buffer_
 
 void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); }
 
+PrimExpr Ptr(runtime::DataType dtype, String storage_scope) {
+  return tvm::tir::Var("", tvm::PointerType(PrimType(dtype), storage_scope));
+}
+
 using tvm::script::ir_builder::details::Namer;
 
 TVM_STATIC_IR_FUNCTOR(Namer, vtable)
@@ -632,6 +636,8 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.BufferStore").set_body_typed(BufferSt
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Prefetch").set_body_typed(Prefetch);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate);
 
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.Ptr").set_body_typed(Ptr);
+
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int16").set_body_typed(Int16);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32").set_body_typed(Int32);
@@ -650,6 +656,11 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x16").set_body_typed(Int32x16);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Boolean").set_body_typed(Boolean);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Handle").set_body_typed(Handle);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Void").set_body_typed(Void);
+
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.min")
+    .set_body_typed([](PrimExpr a, PrimExpr b) -> PrimExpr { return tvm::min(a, b); });
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.max")
+    .set_body_typed([](PrimExpr a, PrimExpr b) -> PrimExpr { return tvm::max(a, b); });
 }  // namespace tir
 }  // namespace ir_builder
 }  // namespace script
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index 40e13a2fbe2f..dbc9b594fb87 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -476,5 +476,20 @@ def test_ir_builder_tir_decl_buffer():
     assert_structural_equal(ir_actual, ir_expected, map_free_vars=True)
 
 
+def test_ir_builder_tir_inline():
+    with IRBuilder() as ib:
+        m, n = T.inline(1), T.inline(2)
+        a, b = T.inline([3, 4])
+        T.evaluate(m.value + n.value + a.value + b.value)
+    # the evaluate generated by IRBuilder
+    eval_actual = ib.get()
+
+    # the expected evaluate
+    eval_expected = tir.Evaluate(10)
+
+    # Check if the generated ir is expected
+    assert_structural_equal(eval_actual, eval_expected, map_free_vars=True)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From fd268137237d2f6fbff4aa4517449284330c3cd8 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 26 Sep 2022 13:55:06 -0500
Subject: [PATCH 248/704] [TVMScript] Infer T.match_buffer parameters for
 region (#12890)

* [TVMScript] Infer T.match_buffer parameters for region

When using `T.match_buffer` to define a view into another buffer,
default shape and dtype parameters can be inferred.

* Updated unit test for new behavior

The test intentionally triggers a failed match based on mismatched
`elem_offset`.  Therefore, the test now needs to explicitly pass an
`elem_offset` to trigger the failure, as this now defaults to having a
`Var` for `match_buffer` calls that represent views.
---
 python/tvm/script/tir/special_stmt.py         | 68 ++++++++++++++-----
 .../unittest/test_tir_lower_match_buffer.py   |  4 +-
 .../unittest/test_tvmscript_syntax_sugar.py   | 25 +++++++
 3 files changed, 79 insertions(+), 18 deletions(-)

diff --git a/python/tvm/script/tir/special_stmt.py b/python/tvm/script/tir/special_stmt.py
index 15502055b7fc..7cbf47441053 100644
--- a/python/tvm/script/tir/special_stmt.py
+++ b/python/tvm/script/tir/special_stmt.py
@@ -121,8 +121,8 @@ class MatchBuffer(SpecialStmt):
     def __init__(self):
         def match_buffer(
             param,
-            shape,
-            dtype="float32",
+            shape=None,
+            dtype=None,
             data=None,
             strides=None,
             elem_offset=None,
@@ -146,28 +146,64 @@ def match_buffer(
                 offset_factor, "offset_factor", self.context.report_error, self.node.span
             )
             buffer_name: str = self.node.lhs[0].id.name
-            buffer = tvm.tir.decl_buffer(
-                shape,
-                dtype,
-                buffer_name,
-                data,
-                strides,
-                elem_offset,
-                scope,
-                align,
-                offset_factor,
-                buffer_type,
-                axis_separators,
-                span=span,
-            )
+
             if isinstance(param, tvm.tir.Var):
+                if shape is None:
+                    self.context.report_error(
+                        "Shape must be specified when binding input param",
+                        self.node.rhs.span,
+                    )
+
+                if dtype is None:
+                    dtype = "float32"
+
+                buffer = tvm.tir.decl_buffer(
+                    shape,
+                    dtype,
+                    buffer_name,
+                    data,
+                    strides,
+                    elem_offset,
+                    scope,
+                    align,
+                    offset_factor,
+                    buffer_type,
+                    axis_separators,
+                    span=span,
+                )
                 if param not in self.context.func_params:
                     self.context.report_error(
                         "Can not bind non-input param to buffer", self.node.rhs.params[0].span
                     )
                 self.context.func_buffer_map[param] = buffer
+
             elif isinstance(param, BufferSlice):
                 buffer_region = param.as_buffer_region()
+
+                if shape is None:
+                    shape = [dim.extent for dim in buffer_region.region]
+
+                if dtype is None:
+                    dtype = buffer_region.buffer.dtype
+
+                if elem_offset is None and offset_factor == 0:
+                    offset_factor = 1
+
+                buffer = tvm.tir.decl_buffer(
+                    shape,
+                    dtype,
+                    buffer_name,
+                    data,
+                    strides,
+                    elem_offset,
+                    scope,
+                    align,
+                    offset_factor,
+                    buffer_type,
+                    axis_separators,
+                    span=span,
+                )
+
                 self.context.current_block_scope().match_buffers.append(
                     tvm.tir.MatchBufferRegion(buffer, buffer_region)
                 )
diff --git a/tests/python/unittest/test_tir_lower_match_buffer.py b/tests/python/unittest/test_tir_lower_match_buffer.py
index 93b7caf9cdde..6120cf2b673c 100644
--- a/tests/python/unittest/test_tir_lower_match_buffer.py
+++ b/tests/python/unittest/test_tir_lower_match_buffer.py
@@ -464,7 +464,7 @@ def fail_match_load(a: T.handle) -> None:
         with T.block():
             T.reads(A[i, j])
             T.writes([])
-            sub_A = T.match_buffer(A[i, j], ())
+            sub_A = T.match_buffer(A[i, j], (), elem_offset=0)
             T.evaluate(sub_A[()])
 
 
@@ -475,7 +475,7 @@ def fail_match_store(a: T.handle) -> None:
         with T.block():
             T.reads([])
             T.writes(A[i, j])
-            sub_A = T.match_buffer(A[i, j], ())
+            sub_A = T.match_buffer(A[i, j], (), elem_offset=0)
             sub_A[()] = 1
 
 
diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py
index d955ec0a8c80..2a2f7354d7cd 100644
--- a/tests/python/unittest/test_tvmscript_syntax_sugar.py
+++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py
@@ -251,6 +251,31 @@ def test_match_buffer_int64():
     assert_structural_equal(original, after_roundtrip, True)
 
 
+def test_match_buffer_region_has_implicit_shape_dtype():
+    @T.prim_func
+    def explicit_shape_dtype(A: T.Buffer[(16, 64), "int32"]):
+        with T.block():
+            B = T.match_buffer(A[8:16, 32:64], shape=(8, 32), dtype="int32")
+            T.evaluate(0)
+
+    @T.prim_func
+    def implicit_shape_dtype(A: T.Buffer[(16, 64), "int32"]):
+        with T.block():
+            B = T.match_buffer(A[8:16, 32:64])
+            T.evaluate(0)
+
+    assert_structural_equal(explicit_shape_dtype, implicit_shape_dtype)
+
+
+def test_match_buffer_input_requires_shape_arg():
+    with pytest.raises(tvm.error.DiagnosticError):
+
+        @T.prim_func
+        def func(a: T.handle):
+            A = T.match_buffer(a, dtype="int32")
+            T.evaluate(0)
+
+
 def test_letstmt_bufferload_without_type_annotation():
     # Variable assignment of PrimExpr types uses the dtype of the
     # PrimExpr to determine the variable's dtype.  Parsing of

From e1f3f90588aa2d9bb71e0ca8ebc5baab865e054d Mon Sep 17 00:00:00 2001
From: Tasmia Rahman <89925728+trahman-quic@users.noreply.github.com>
Date: Mon, 26 Sep 2022 15:37:47 -0500
Subject: [PATCH 249/704] [TOPI][Hexagon] Implement quantize op for hexagon
 (#12820)

* [TOPI][Hexagon] Implement quantize op for hexagon

* Fix lint issue
---
 python/tvm/topi/hexagon/qnn/__init__.py       |   2 +
 python/tvm/topi/hexagon/qnn/quantize.py       |  80 ++++++++++++
 python/tvm/topi/hexagon/utils.py              |   5 +
 .../contrib/test_hexagon/infrastructure.py    |   4 +-
 .../test_hexagon/topi/test_quantize.py        | 121 ++++++++++++++++++
 5 files changed, 210 insertions(+), 2 deletions(-)
 create mode 100755 python/tvm/topi/hexagon/qnn/quantize.py
 create mode 100755 tests/python/contrib/test_hexagon/topi/test_quantize.py

diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
index ef9c025ba5b2..2616b9315a9b 100644
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -23,3 +23,5 @@
     dequantize_compute,
     dequantize_schedule,
 )
+
+from .quantize import quantize_compute, tir_quantize_schedule
diff --git a/python/tvm/topi/hexagon/qnn/quantize.py b/python/tvm/topi/hexagon/qnn/quantize.py
new file mode 100755
index 000000000000..ff03aac0a862
--- /dev/null
+++ b/python/tvm/topi/hexagon/qnn/quantize.py
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Compute and schedule for hexagon quantize
+Please note the following assumptions made by the implementation:
+1) The input and output data will be multiple of crouton layout
+2) And the supported layout is NHWC
+3) The input layout will be nhwc-4h2w32c2w-2d and
+   output layout will be nhwc-8h8w32c-2d"""
+
+
+from tvm import te
+from tvm import tir
+from ..utils import get_layout_transform_fn, saturate
+
+
+def quantize_compute(tensor_A: te.Tensor, scale: float, zero_point: int, dtype: str):
+    """Compute for quantize"""
+    scale_recip = 1 / scale
+
+    return te.compute(
+        tensor_A.shape,
+        lambda n, h, w, c: saturate(
+            ((tensor_A[n, h, w, c] * scale_recip).astype("int32") + zero_point),
+            dtype,
+        ).astype(dtype),
+        name="quantize",
+    )
+
+
+def tir_quantize_schedule(
+    out_M: te.Tensor,
+    tensor_A: te.Tensor,
+    input_layout: str,
+    output_layout: str,
+):
+    """Schedule for output layout nhwc-8h8w32c-2d"""
+    func = te.create_prim_func([tensor_A, out_M])
+
+    s = tir.Schedule(func)
+
+    block = s.get_block("quantize")
+
+    input_transformed_layout = get_layout_transform_fn(input_layout)
+    s.transform_layout(block, buffer=tensor_A.name, index_map=input_transformed_layout)
+
+    output_transformed_layout = get_layout_transform_fn(output_layout)
+    s.transform_layout(block, buffer=out_M.name, index_map=output_transformed_layout)
+
+    # Fixed chunk size is 2048 byte
+    # For uint8 the layout for fixed chunk is 8x8x32
+    # where each element is 1 bytes
+    # Split and reorder is done to iterate over the fixed chunk
+    # Channel is split by a factor of 32
+    # Width is split by a factor of 8
+    # Height is split by a factor of 8
+    n, h, w, c = s.get_loops(block)
+
+    h_o, h_i = s.split(h, [None, 8])
+    w_o, w_i = s.split(w, [None, 8])
+    c_o, c_i = s.split(c, [None, 32])
+    wio, wii = s.split(w_i, [None, 4])
+
+    s.reorder(n, h_o, w_o, c_o, h_i, wio, wii, c_i)
+
+    return s
diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py
index 9939e5b6fbb7..dab9aa3f74ab 100644
--- a/python/tvm/topi/hexagon/utils.py
+++ b/python/tvm/topi/hexagon/utils.py
@@ -294,3 +294,8 @@ def within_range(val, dtype):
         fixed_point_value = int(round(flp * scale_f[0]))
 
     return fixed_point_value, exp_scale_factor
+
+
+def saturate(x: te.Tensor, dtype: str):
+    """Saturate value for the specified data type"""
+    return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype)))
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
index 71960b649ea2..1058e1dd8117 100644
--- a/tests/python/contrib/test_hexagon/infrastructure.py
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -334,8 +334,8 @@ def quantize_np(arr_np: numpy.ndarray, dtype: str):
         qmax = 255
         qmin = 0
     elif dtype == "int8":
-        qmax = 128
-        qmin = -127
+        qmax = 127
+        qmin = -128
     else:
         raise RuntimeError(f"Unsupported quantized data type '{dtype}'")
     fmin = numpy.amin(arr_np)
diff --git a/tests/python/contrib/test_hexagon/topi/test_quantize.py b/tests/python/contrib/test_hexagon/topi/test_quantize.py
new file mode 100755
index 000000000000..2c1718d29465
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/test_quantize.py
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+import numpy as np
+
+import tvm
+from tvm import te
+import tvm.topi.hexagon.qnn as s1
+from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
+
+
+@tvm.testing.fixture
+def expected_output_np(input_np, output_dtype):
+    global scale, zero_point
+    quant_np, scale, zero_point = quantize_np(input_np, output_dtype)
+    return quant_np
+
+
+@tvm.testing.fixture
+def input_np(input_shape, input_dtype):
+    return np.random.random(input_shape).astype(input_dtype)
+
+
+@tvm.testing.fixture
+def transformed_input_np(input_np, input_crouton_layout):
+    return transform_numpy(input_np, "nhwc", input_crouton_layout)
+
+
+@tvm.testing.fixture
+def transformed_expected_output_np(expected_output_np, output_layout):
+    return transform_numpy(expected_output_np, "nhwc", output_layout)
+
+
+class TestQuantize:
+    input_crouton_layout, output_layout, input_dtype = tvm.testing.parameters(
+        ("nhwc-4h2w32c2w-2d", "nhwc-8h8w32c-2d", "float32"),
+    )
+
+    output_dtype = tvm.testing.parameter("uint8", "int8")
+
+    input_shape = tvm.testing.parameter(
+        (1, 8, 8, 32), (1, 16, 16, 32), (1, 16, 16, 128), (1, 64, 64, 64)
+    )
+
+    @tvm.testing.requires_hexagon
+    def test_quantize(
+        self,
+        input_dtype,
+        output_dtype,
+        input_np,
+        transformed_input_np,
+        input_shape,
+        expected_output_np,
+        transformed_expected_output_np,
+        input_crouton_layout,
+        output_layout,
+        hexagon_session,
+    ):
+        target_hexagon = tvm.target.hexagon("v69")
+        A = te.placeholder(input_shape, name="A", dtype=input_dtype)
+
+        M = s1.quantize_compute(A, scale, zero_point, output_dtype)
+
+        tir_schedule = s1.tir_quantize_schedule(M, A, input_crouton_layout, output_layout)
+
+        sch = tir_schedule.mod
+
+        input_axis_separator = [4]
+        output_axis_separator = [4]
+
+        with tvm.transform.PassContext(opt_level=3):
+            func = tvm.build(
+                sch,
+                [A, M],
+                tvm.target.Target(target_hexagon, host=target_hexagon),
+                name="quantize",
+            )
+
+        A_data_nd = allocate_hexagon_array(
+            hexagon_session.device,
+            data=transformed_input_np,
+            dtype=input_dtype,
+            axis_separators=input_axis_separator,
+            mem_scope="global.vtcm",
+        )
+
+        M_data_nd = allocate_hexagon_array(
+            hexagon_session.device,
+            tensor_shape=transformed_expected_output_np.shape,
+            dtype=output_dtype,
+            axis_separators=output_axis_separator,
+            mem_scope="global.vtcm",
+        )
+
+        mod = hexagon_session.load_module(func)
+        mod(A_data_nd, M_data_nd)
+
+        b, h, w, c = expected_output_np.shape
+
+        # convert nd to np and reshape to fixed chunk size layout
+        M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32])
+
+        np.testing.assert_allclose(transformed_expected_output_np, M_data_np, atol=1)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From f25a702a1fb2aef6ded6fbb0384720dadacbd8ef Mon Sep 17 00:00:00 2001
From: Tasmia Rahman <89925728+trahman-quic@users.noreply.github.com>
Date: Mon, 26 Sep 2022 15:40:03 -0500
Subject: [PATCH 250/704] [TOPI][Hexagon] Add schedule and test for maxpool
 uint8 layout (#12826)

* [TOPI][Hexagon] Add schedule and test for maxpool uint8 layout for hexagon

* Fix lint issue
---
 .../tvm/topi/hexagon/slice_ops/max_pool2d.py  |  55 +++++----
 .../topi/test_max_pool2d_slice.py             | 105 +++++++++++-------
 2 files changed, 100 insertions(+), 60 deletions(-)

diff --git a/python/tvm/topi/hexagon/slice_ops/max_pool2d.py b/python/tvm/topi/hexagon/slice_ops/max_pool2d.py
index 4bf958c11694..d56879e45b84 100644
--- a/python/tvm/topi/hexagon/slice_ops/max_pool2d.py
+++ b/python/tvm/topi/hexagon/slice_ops/max_pool2d.py
@@ -73,8 +73,10 @@ def max_pool2d_compute(A, out_shape, kernel, stride, dilation):
     return Max
 
 
-def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str):
-    """Schedule for input and output layout nhwc-8h2w32c2w"""
+def STIR_schedule_nhwc_8h2w32c2w_nhwc_8h8w32c(
+    outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str
+):
+    """Schedule for input and output layout nhwc-8h2w32c2w and nhwc-8h8w32c"""
     func = te.create_prim_func([ins, outs])
     s = tir.Schedule(func)
 
@@ -93,10 +95,14 @@ def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: st
 
     Max = s.get_block("max")
 
-    input_transform_fn = get_layout_transform_fn(input_layout)
-    output_transform_fn = get_layout_transform_fn(output_layout)
+    if input_layout in (
+        "nhwc-8h2w32c2w-2d",
+        "nhwc-8h8w32c-2d",
+    ):
+        input_transform_fn = get_layout_transform_fn(input_layout)
+        s.transform_layout(Max, ("read", 0), input_transform_fn)
 
-    s.transform_layout(Max, ("read", 0), input_transform_fn)
+    output_transform_fn = get_layout_transform_fn(output_layout)
     s.transform_layout(Max, ("write", 0), output_transform_fn)
 
     # pylint: disable=line-too-long
@@ -120,13 +126,21 @@ def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: st
         rw,
     ) = s.get_loops(Max)
 
-    # Restructure the loops from NHWC to nhwc_8h2w32c2w, with loops for 'max's reduction
+    # Restructure the loops from NHWC to nhwc_8h2w32c2w or nhwc_8h8w32c, with loops for 'max's reduction
     # axes at the very end.
-    ho, hi = s.split(h, [None, 8])
-    wo, wi = s.split(w, [None, 4])
-    wio, wii = s.split(wi, [None, 2])
-    co, ci = s.split(c, [None, 32])
-    s.reorder(n, ho, wo, co, hi, wio, ci, wii, rh, rw)
+    # nhwc_8h2w32c2w layout is for float16 and nhwc-8h8w32c-2d layout is for uint8/int8
+    if output_layout == "nhwc-8h2w32c2w-2d":
+        ho, hi = s.split(h, [None, 8])
+        wo, wi = s.split(w, [None, 4])
+        wio, wii = s.split(wi, [None, 2])
+        co, ci = s.split(c, [None, 32])
+        s.reorder(n, ho, wo, co, hi, wio, ci, wii, rh, rw)
+    elif output_layout == "nhwc-8h8w32c-2d":
+        ho, hi = s.split(h, [None, 8])
+        wo, wi = s.split(w, [None, 8])
+        co, ci = s.split(c, [None, 32])
+
+        s.reorder(n, ho, wo, co, hi, wi, ci, rh, rw)
 
     # TODO: Enable vectorization.
     # Hexagon v69's HVX units support SIMD operations on 64-element float16 vectors.
@@ -154,10 +168,10 @@ def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: st
     return s
 
 
-def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str):
-    """Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w"""
+def STIR_schedule_n11c(outs, ins, output_layout: str, input_layout: str):
+    """Schedule for output layout: n11c-1024c, n11c-2048c-2d;"""
 
-    # NOTE: This function is a variation of the STIR_schedule_nhwc_8h2w32c2w
+    # NOTE: This function is a variation of the STIR_schedule_maxpool2d
     # functions.  Most of that function's code comments apply to this function
     # as well, but are ommited for brevity.
 
@@ -181,7 +195,10 @@ def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str):
         rh,
         rw,
     ) = s.get_loops(Max)
-    co, ci = s.split(c, [None, 1024])
+    if output_layout == "n11c-1024c-2d":
+        co, ci = s.split(c, [None, 1024])
+    else:
+        co, ci = s.split(c, [None, 2048])
     # s.vectorize(ci)
 
     return s
@@ -189,8 +206,8 @@ def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str):
 
 def max_pool2d_STIR_schedule(outs, ins, output_layout: str, input_layout: str):
     """STIR based schedule"""
-    if output_layout == "nhwc-8h2w32c2w-2d":
-        return STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout)
-    if output_layout == "n11c-1024c-2d":
-        return STIR_schedule_n11c_1024c(outs, ins, output_layout, input_layout)
+    if output_layout == "nhwc-8h2w32c2w-2d" or "nhwc-8h8w32c-2d":
+        return STIR_schedule_nhwc_8h2w32c2w_nhwc_8h8w32c(outs, ins, output_layout, input_layout)
+    if output_layout == "n11c-1024c-2d" or "n11c-2048c-2d":
+        return STIR_schedule_n11c(outs, ins, output_layout, input_layout)
     raise RuntimeError(f"Unexpected layout '{output_layout}'")
diff --git a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
index f827f025af17..de60ffc6df4d 100644
--- a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
@@ -50,6 +50,31 @@ def transformed_input_np_padded(input_np_padded, input_layout):
     return transform_numpy(input_np_padded, "nhwc", input_layout)
 
 
+(input_layout, dtype) = tvm.testing.parameters(
+    ("nhwc-8h2w32c2w-2d", "float16"),
+    ("nhwc-8h8w32c-2d", "uint8"),
+)
+
+
+@tvm.testing.fixture
+def output_layout(output_shape, dtype):
+    o_b, o_h, o_w, o_c = output_shape
+    if dtype == "float16":
+        if o_h == 1 and o_w == 1:
+            return "n11c-1024c-2d"
+        else:
+            assert o_h % 8 == 0 and o_w % 4 == 0, "Invalid output shape"
+            return "nhwc-8h2w32c2w-2d"
+    elif dtype == "int8" or "uint8":
+        if o_h == 1 and o_w == 1:
+            return "n11c-2048c-2d"
+        else:
+            assert o_h % 8 == 0 and o_w % 8 == 0, "Invalid output shape"
+            return "nhwc-8h8w32c-2d"
+    else:
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+
 class TestmaxPool2dSlice:
     _param_descs = [
         "out_shape",  # output_shape
@@ -59,8 +84,6 @@ class TestmaxPool2dSlice:
         "pad",  # padding
         "ceil",  # ceil_mode
         "cnt_padded",  # count_include_pad
-        "out_layout",  # output_layout
-        None,  # dtype
         None,  # input_tensor_populator
     ]
 
@@ -73,8 +96,6 @@ class TestmaxPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -85,8 +106,6 @@ class TestmaxPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -97,8 +116,6 @@ class TestmaxPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         # Test non-one stride and dilation
@@ -110,8 +127,6 @@ class TestmaxPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -122,8 +137,6 @@ class TestmaxPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -134,8 +147,6 @@ class TestmaxPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         # Test non-zero padding
@@ -147,8 +158,6 @@ class TestmaxPool2dSlice:
             [1, 1, 1, 1],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -159,8 +168,6 @@ class TestmaxPool2dSlice:
             [1, 2, 3, 4],
             False,
             True,
-            "nhwc-8h2w32c2w-2d",
-            "float16",
             TensorContentRandom(),
         ),
         # Test n11c-1024c-2d layout which will require input and output to have different layout
@@ -172,8 +179,6 @@ class TestmaxPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "n11c-1024c-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -184,8 +189,6 @@ class TestmaxPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "n11c-1024c-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -196,8 +199,6 @@ class TestmaxPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "n11c-1024c-2d",
-            "float16",
             TensorContentRandom(),
         ),
         (
@@ -208,19 +209,14 @@ class TestmaxPool2dSlice:
             [0, 0, 0, 0],
             False,
             True,
-            "n11c-1024c-2d",
-            "float16",
             TensorContentRandom(),
         ),
     ]
 
     _param_ids = get_multitest_ids(_multitest_params, _param_descs)
 
-    input_layout = tvm.testing.parameter(
-        "nhwc-8h2w32c2w-2d",
-    )
-
-    # NOTE: input_layout is always assumed to be "nhwc-8h2w32c2w-2d"
+    # NOTE: input_layout is always assumed to be "nhwc-8h2w32c2w-2d" for float16
+    # and "nhwc-8h8w32c-2d" for uint8
     (
         output_shape,
         kernel,
@@ -229,8 +225,6 @@ class TestmaxPool2dSlice:
         padding,
         ceil_mode,
         count_include_pad,
-        output_layout,
-        dtype,
         input_tensor_populator,
     ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids)
 
@@ -283,15 +277,32 @@ def input_shape(self, output_shape, kernel, padding, stride, dilation, output_la
         return [o_b, in_h, in_w, o_c]
 
     @tvm.testing.fixture
-    def input_shape_padded(self, input_shape, padding, output_layout):
+    def input_shape_padded(self, dtype, input_shape, padding, output_layout):
         # Input shape is adjusted to account for 'padding'. Also, due to the physical
         # layout of the buffer, height and width are adjusted so that they are a
         # multiple of 8 and 4 respectively.
-        # NOTE: Input layout is always assumed to be nhwc-8h2w32c2w-2d.
+        # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and
+        # for int8/uint8, it's nhwc-8h8w32c-2d.
+        # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple
+        # of 8. However, the width should be a multiple of 4 for the first case and 8 for
+        # the second case.
+
+        height_mult = 8
+        if dtype == "float16":
+            width_mult = 4  # input layout : nhwc-8h2w32c2w-2d
+        elif dtype in ("uint8", "int8"):
+            width_mult = 8  # input layout : nhwc-8h8w32c-2d
+        else:
+            raise RuntimeError(f"Unsupport dtype '{dtype}'")
+
         pad_before_h, pad_before_w = padding[:2]
         pad_after_h, pad_after_w = padding[2:]
-        padded_input_height = ((input_shape[1] + pad_before_h + pad_after_h + 7) // 8) * 8
-        padded_input_width = ((input_shape[2] + pad_before_w + pad_after_w + 3) // 4) * 4
+        padded_input_height = (
+            (input_shape[1] + pad_before_h + pad_after_h + height_mult - 1) // height_mult
+        ) * height_mult
+        padded_input_width = (
+            (input_shape[2] + pad_before_w + pad_after_w + width_mult - 1) // width_mult
+        ) * width_mult
         return [input_shape[0], padded_input_height, padded_input_width, input_shape[3]]
 
     @tvm.testing.fixture
@@ -340,9 +351,12 @@ def test_max_pool2d_slice(
         sch = tir_schedule.mod
 
         input_axis_separator = [4]
-        if output_layout == "nhwc-8h2w32c2w-2d":
-            output_axis_separator = [4]
-        elif output_layout == "n11c-1024c-2d":
+        if output_layout in (
+            "nhwc-8h2w32c2w-2d",
+            "nhwc-8h8w32c-2d",
+            "n11c-1024c-2d",
+            "n11c-2048c-2d",
+        ):
             output_axis_separator = [4]
         else:
             raise RuntimeError(f"Unexpected layout '{output_layout}'")
@@ -374,12 +388,21 @@ def test_max_pool2d_slice(
         b, h, w, c = output_shape
         if output_layout == "nhwc-8h2w32c2w-2d":
             output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2])
+        elif output_layout == "nhwc-8h8w32c-2d":
+            output_np = output_arr.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32])
+        elif output_layout == "n11c-2048c-2d":
+            output_np = output_arr.numpy().reshape([b, 1, 1, c // 2048, 2048])
         elif output_layout == "n11c-1024c-2d":
             output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024])
         else:
             raise RuntimeError(f"Unexpected layout '{output_layout}'")
 
-        np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-3, atol=1e-3)
+        if dtype == "float16":
+            np.testing.assert_allclose(
+                output_np, transformed_expected_output_np, rtol=1e-3, atol=1e-3
+            )
+        elif dtype == "uint8":
+            np.testing.assert_allclose(output_np, transformed_expected_output_np, atol=1)
 
 
 if __name__ == "__main__":

From d4fb957ae1caf34604f03d9348ee9b3d3acb4709 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Mon, 26 Sep 2022 15:14:10 -0700
Subject: [PATCH 251/704] [microTVM][ARM] Improve dense DSP micro kernel
 (#12908)

Fix micro kernel
---
 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py
index ffc48eaabd59..f1c0e3ea8d6d 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py
@@ -207,7 +207,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
   int16_t bb_pad[{bb_pad_size}];
   int32_t retcode = 0;
 
-  if ( {M} < 16 || {N} < 16 ) {{
+  if ( {M} < 2 && {N} < 2 ) {{
     retcode = gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride);
     goto out;
   }}
@@ -313,7 +313,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
   int16_t bb_pad[{bb_pad_size}];
   int32_t retcode = 0;
 
-  if ( {M} < 16 || {N} < 16 ) {{
+  if ( {M} < 2 && {N} < 2 ) {{
     retcode = gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride);
     goto out;
   }}
@@ -393,7 +393,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
     int A_stride, int B_stride, int C_stride) {{
   int32_t retcode = 0;
 
-  if ( {M} < 2 || {N} < 2 ) {{
+  if ( {M} < 2 && {N} < 2 ) {{
     retcode = gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride);
     goto out;
   }}
@@ -471,7 +471,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
     int A_stride, int B_stride, int C_stride) {{
   int32_t retcode = 0;
 
-  if ( {M} < 2 || {N} < 2 ) {{
+  if ( {M} < 2 && {N} < 2 ) {{
     retcode = gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride);
     goto out;
   }}

From 830ebc4ec8d588bc84c283c45b22dbee1340b95d Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 26 Sep 2022 17:30:59 -0500
Subject: [PATCH 252/704] [TIR] Refactor IndexMap::Inverse in terms of
 NonSurjectiveInverse (#12904)

The two implementations were largely identical, and had
implementations that drifted apart, resulting in bugs such as
https://github.com/apache/tvm/issues/12852.  This commit removes this
duplication by writing `Inverse` in terms of `NonSurjectiveInverse`.
The merged version of `NonSurjectiveInverse` contains bugfix
https://github.com/apache/tvm/pull/11841, that were previously present
only in `Inverse`.
---
 include/tvm/tir/index_map.h | 16 +++-----
 src/tir/ir/index_map.cc     | 74 +++++++++++--------------------------
 2 files changed, 28 insertions(+), 62 deletions(-)

diff --git a/include/tvm/tir/index_map.h b/include/tvm/tir/index_map.h
index 8a176cb3cee8..e1b323462cda 100644
--- a/include/tvm/tir/index_map.h
+++ b/include/tvm/tir/index_map.h
@@ -73,10 +73,10 @@ class IndexMapNode : public Object {
   /*!
    * \brief The inverse index map.
    *
-   * When this is defined, IndexMap::Inverse will return the pre-defined inverse index map.
-   * Otherwise, the inverse index map will be computed on the fly.
-   * It is the user's responsibility to ensure the correctness of the pre-defined inverse index
-   * map.
+   * When this is defined, IndexMap::Inverse will return the
+   * pre-defined inverse index map.  Otherwise, the inverse index map
+   * will be computed on the fly.  It is the user's responsibility to
+   * ensure the correctness of the pre-defined inverse index map.
    *
    * \note ObjectRef is used here instead of IndexMap to avoid circular reference.
    */
@@ -190,12 +190,8 @@ class IndexMap : public ObjectRef {
    * The range of the input indices is required in order to ensure
    * that the transformation is bijective over the input domain.
    *
-   * TODO(Lunderberg): Look into allowing non-bijective
-   * transformations.  If injective, the inverse mapping could still
-   * be generated with some predicate (see NonSurjectiveInverse).  If
-   * non-injective, could simplify the implementation of other
-   * optimizations (e.g. double buffering as a map `lambda *indices:
-   * [buffer_loop%2, *indices]`).
+   * If the user has supplied an `inverse_index_map`, that map is
+   * assumed to be correct and bijective, and is returned.
    */
   IndexMap Inverse(Array<Range> initial_ranges) const;
 
diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc
index 2ffc5079246b..2c5349ab9941 100644
--- a/src/tir/ir/index_map.cc
+++ b/src/tir/ir/index_map.cc
@@ -54,6 +54,14 @@ IndexMap IndexMap::FromFunc(int ndim, runtime::TypedPackedFunc<Array<PrimExpr>(A
 }
 
 std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initial_ranges) const {
+  if ((*this)->inverse_index_map.defined()) {
+    // return the pre-defined inverse index map if exists.  In this
+    // case, the user-defined inverse is assumed to be correct and
+    // bijective.
+    PrimExpr padding_predicate = Bool(false);
+    return {Downcast<IndexMap>((*this)->inverse_index_map.value()), padding_predicate};
+  }
+
   // Dummy variables to represent the inverse's inputs.
   Array<Var> output_vars;
   for (size_t i = 0; i < (*this)->final_indices.size(); i++) {
@@ -92,8 +100,15 @@ std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initia
 
   // Unpack the map to an array, maintaining the same parameter order.
   Array<PrimExpr> inverse_exprs;
-  for (const auto& index : (*this)->initial_indices) {
-    inverse_exprs.push_back(analyzer.Simplify(inverse_exprs_map.at(index)));
+  for (int i = 0, n = (*this)->initial_indices.size(); i < n; ++i) {
+    Var index = (*this)->initial_indices[i];
+    PrimExpr expr;
+    if (is_one(initial_ranges[i]->extent) && !inverse_exprs_map.count(index)) {
+      expr = initial_ranges[i]->min;
+    } else {
+      expr = inverse_exprs_map.at(index);
+    }
+    inverse_exprs.push_back(analyzer.Simplify(expr));
   }
 
   PrimExpr padding_predicate = padded_iter_map->padding_predicate;
@@ -117,57 +132,12 @@ std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initia
 }
 
 IndexMap IndexMap::Inverse(Array<Range> initial_ranges) const {
-  if ((*this)->inverse_index_map.defined()) {
-    // return the pre-defined inverse index map if exists.
-    return Downcast<IndexMap>((*this)->inverse_index_map.value());
-  }
-  // Dummy variables to represent the inverse's inputs.
-  Array<Var> output_vars;
-  for (size_t i = 0; i < (*this)->final_indices.size(); i++) {
-    PrimExpr index = (*this)->final_indices[i];
-    // TODO(Lunderberg): Better names for these variables.  A variable
-    // that is passed through unmodified (`index` is an element of
-    // `initial_indices`) should use that input index's name.  A pair
-    // of output indices variables split from a single input index
-    // should be named (X.outer,X.inner).
-    std::stringstream ss;
-    ss << "axis" << i;
-    Var var_index(ss.str(), index.dtype());
-    output_vars.push_back(var_index);
-  }
-
-  // Dummy ranges for the extent of each input.
-  Map<Var, Range> input_iters;
-  ICHECK_EQ((*this)->initial_indices.size(), initial_ranges.size());
-  for (size_t i = 0; i < initial_ranges.size(); i++) {
-    input_iters.Set((*this)->initial_indices[i], initial_ranges[i]);
-  }
-
-  // Unpack the output indices into linear combinations of the initial
-  // indices.
+  auto [inverse, padding_predicate] = NonSurjectiveInverse(std::move(initial_ranges));
   arith::Analyzer analyzer;
-  auto iter_map = DetectIterMap((*this)->final_indices, input_iters, /* predicate = */ 1,
-                                /* check_level = */ arith::IterMapLevel::Bijective, &analyzer,
-                                /* simplify_trivial_iterators = */ false);
-  CHECK(iter_map->indices.size()) << "Index transformation was not bijective.";
-
-  // Determine expressions for the input variables, in terms of the
-  // output variables.
-  Map<Var, PrimExpr> inverse_exprs_map = InverseAffineIterMap(
-      iter_map->indices, Array<PrimExpr>(output_vars.begin(), output_vars.end()));
-
-  // Unpack the map to an array, maintaining the same parameter order.
-  Array<PrimExpr> inverse_exprs;
-  for (int i = 0, n = (*this)->initial_indices.size(); i < n; ++i) {
-    Var index = (*this)->initial_indices[i];
-    if (is_one(initial_ranges[i]->extent) && !inverse_exprs_map.count(index)) {
-      inverse_exprs.push_back(initial_ranges[i]->min);
-    } else {
-      inverse_exprs.push_back(inverse_exprs_map.at(index));
-    }
-  }
-
-  return IndexMap(output_vars, inverse_exprs);
+  CHECK(analyzer.CanProve(!padding_predicate))
+      << "Bijective inverse should not contain padding, but inverse of " << *this << " over range "
+      << initial_ranges << " resulted in a padding predicate of " << padding_predicate;
+  return inverse;
 }
 
 Array<PrimExpr> IndexMapNode::MapIndices(const Array<PrimExpr>& indices,

From 5ddd35c37724bec8c4e89d911b31d4ecd6e41caa Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Mon, 26 Sep 2022 13:32:22 -1000
Subject: [PATCH 253/704] [Relay][TE] Add default param name if needed (#12912)

#10516 used the Relay parameter name when lowering to TE. However, this creates an issue when the parameter name is empty. This is legal in Relay, but results in errors during code generation. For example, this is the generated CUDA kernel for bias add:

```
extern "C" __global__ void __launch_bounds__(1024) fused_raf_op_tvm_add_kernel0(
    float* __restrict__ T_add,
    float* __restrict__ , /* Name is missing and it results in compile errors. */
    float* __restrict__ _1) {
    T_add[((((int)blockIdx.x) * 1024) + ((int)threadIdx.x))] = ([((((int)blockIdx.x) * 1024) + ((int)threadIdx.x))] + _1[((((((int)blockIdx.x) * 16) + (((int)threadIdx.x) >> 6)) % 54) / 9)]);
}
```

This PR adds "placeholder" back as a default to make sure no empty string will be passed when lowering to TE.
---
 src/relay/backend/te_compiler_cache.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index 17eac443ffe3..6f55402baded 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -131,8 +131,9 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
     for (Var param : relay_func->params) {
       Array<tvm::te::Tensor> inputs;
       for (const auto& ttype : FlattenTupleType(param->checked_type())) {
-        tvm::te::Tensor tensor =
-            tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype, param->vid->name_hint);
+        auto name_hint = param->vid->name_hint;
+        tvm::te::Tensor tensor = tvm::te::placeholder(
+            GetShape(ttype->shape), ttype->dtype, (name_hint == "") ? "placeholder" : name_hint);
         inputs.push_back(tensor);
         fn_inputs_.push_back(tensor);
       }

From 4d5ed073250aabf1dab50001aa4c85ec505062a7 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Mon, 26 Sep 2022 16:32:57 -0700
Subject: [PATCH 254/704] [TIR] Fix GetProducer/Consumer for duplicating dep
 edges (#12910)

* [TIR] Fix GetProducer/Consumer for duplicating dep edges

* preserve result ordering
---
 src/tir/schedule/primitive/get_block_loop.cc  | 10 +++-
 .../unittest/test_tir_schedule_utilities.py   | 52 +++++++++++++++++++
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/src/tir/schedule/primitive/get_block_loop.cc b/src/tir/schedule/primitive/get_block_loop.cc
index cbdb99c6444f..ecbadce470b9 100644
--- a/src/tir/schedule/primitive/get_block_loop.cc
+++ b/src/tir/schedule/primitive/get_block_loop.cc
@@ -81,10 +81,13 @@ Array<StmtSRef> GetProducers(const ScheduleState& self, const StmtSRef& block_sr
   StmtSRef scope_root = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
   Array<Dependency> edges = self->GetBlockScope(scope_root)->GetDepsByDst(block_sref);
   Array<StmtSRef> results;
+  std::unordered_set<StmtSRef, ObjectPtrHash, ObjectPtrEqual> result_set;
   results.reserve(edges.size());
   for (const Dependency& edge : edges) {
-    if (edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) {
+    if ((edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) &&
+        !result_set.count(edge->src)) {
       results.push_back(edge->src);
+      result_set.emplace(edge->src);
     }
   }
   return results;
@@ -94,10 +97,13 @@ Array<StmtSRef> GetConsumers(const ScheduleState& self, const StmtSRef& block_sr
   StmtSRef scope_root = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
   Array<Dependency> edges = self->GetBlockScope(scope_root)->GetDepsBySrc(block_sref);
   Array<StmtSRef> results;
+  std::unordered_set<StmtSRef, ObjectPtrHash, ObjectPtrEqual> result_set;
   results.reserve(edges.size());
   for (const Dependency& edge : edges) {
-    if (edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) {
+    if ((edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) &&
+        !result_set.count(edge->dst)) {
       results.push_back(edge->dst);
+      result_set.emplace(edge->dst);
     }
   }
   return results;
diff --git a/tests/python/unittest/test_tir_schedule_utilities.py b/tests/python/unittest/test_tir_schedule_utilities.py
index 41844a868e6b..33ef0e221563 100644
--- a/tests/python/unittest/test_tir_schedule_utilities.py
+++ b/tests/python/unittest/test_tir_schedule_utilities.py
@@ -124,6 +124,38 @@ def vector_add_2(
                 B[vi] = A[vi]
 
 
+@T.prim_func
+def tuple_reduction(data: T.Buffer[(4, 32), "float32"], T_add: T.Buffer[(4,), "float32"]) -> None:
+    # function attr dict
+    T.func_attr({"global_symbol": "main", "tir.noalias": True})
+    # body
+    with T.block("root"):
+        T.reads()
+        T.writes()
+        data_red_temp_v0 = T.alloc_buffer([4], dtype="float32")
+        data_red_temp_v1 = T.alloc_buffer([4], dtype="float32")
+        for i0, i1 in T.grid(4, 32):
+            with T.block("data_red_temp"):
+                ax0, k1 = T.axis.remap("SR", [i0, i1])
+                T.reads(data[ax0, k1])
+                T.writes(data_red_temp_v0[ax0], data_red_temp_v1[ax0])
+                with T.init():
+                    data_red_temp_v0[ax0] = T.float32(0)
+                    data_red_temp_v1[ax0] = T.float32(0)
+                v_data_red_temp_v0: T.float32 = data_red_temp_v0[ax0] + data[ax0, k1]
+                v_data_red_temp_v1: T.float32 = (
+                    data_red_temp_v1[ax0] + data[ax0, k1] * data[ax0, k1]
+                )
+                data_red_temp_v0[ax0] = v_data_red_temp_v0
+                data_red_temp_v1[ax0] = v_data_red_temp_v1
+        for i0 in range(4):
+            with T.block("T_add"):
+                (ax0,) = T.axis.remap("S", [i0])
+                T.reads(data_red_temp_v0[ax0], data_red_temp_v1[ax0])
+                T.writes(T_add[ax0])
+                T_add[ax0] = data_red_temp_v0[ax0] + data_red_temp_v1[ax0]
+
+
 # pylint: enable=no-member,invalid-name,unused-variable
 
 use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True})
@@ -261,6 +293,16 @@ def test_get_producers(use_block_name):
     verify_trace_roundtrip(sch, mod=matmul_relu)
 
 
+def test_get_producers_multiple_buffer_depdencies(use_block_name):
+    sch = tir.Schedule(mod=tuple_reduction, debug_mask="all")
+    block = "T_add" if use_block_name else sch.get_block("T_add")
+    (producer,) = sch.get_producers(block)
+    assert tvm.ir.structural_equal(
+        sch.get_sref(producer).stmt,
+        sch.get_sref(sch.get_block("data_red_temp")).stmt,
+    )
+
+
 def test_get_consumers(use_block_name):
     sch = tir.Schedule(mod=matmul_relu, debug_mask="all")
     block = "matmul" if use_block_name else sch.get_block("matmul")
@@ -272,6 +314,16 @@ def test_get_consumers(use_block_name):
     verify_trace_roundtrip(sch, mod=matmul_relu)
 
 
+def test_get_consumers_multiple_buffer_depdencies(use_block_name):
+    sch = tir.Schedule(mod=tuple_reduction, debug_mask="all")
+    block = "data_red_temp" if use_block_name else sch.get_block("data_red_temp")
+    (consumer,) = sch.get_consumers(block)
+    assert tvm.ir.structural_equal(
+        sch.get_sref(consumer).stmt,
+        sch.get_sref(sch.get_block("T_add")).stmt,
+    )
+
+
 def test_annotate_unannotate_loop():
     sch = tir.Schedule(mod=matmul_relu, debug_mask="all")
     matmul = sch.get_block("matmul")

From f64e933246ba7837f691979b5d78c0449297d4b2 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Tue, 27 Sep 2022 01:02:05 -0500
Subject: [PATCH 255/704] [LLVM] Emit fp16/fp32 builtins directly into target
 module (#12877)

For conversions between `_Float16` and `float`, LLVM uses runtime functions
`__extendhfsf2` and `__truncsfhf2`.  On X86 up until version 14, LLVM used
`uint16_t` for representing `_Float16`. Starting with LLVM 15, half-
precision values can be passed in XMM registers (i.e. as floating-point).
This happens when the compilation target has SSE2 enabled (either directly,
or by enabling a feature that implies SSE2).
Because the names of the conversion functions remain unchanged, it is
impossible for TVM to provide them in the runtime, and have them work in
both cases. To solve this issue, emit these functions directly into the
target module after detecting whether or not to use floating-point ABI.
To allow the linker to remove potential duplicates (or if they are unused),
they are weak and reside in a separate section.
---
 src/runtime/builtin_fp16.cc                   |   3 -
 src/target/llvm/codegen_llvm.cc               | 227 ++++++++++++++++++
 src/target/llvm/codegen_llvm.h                |   8 +
 .../unittest/test_target_codegen_llvm.py      |   7 +-
 .../unittest/test_target_codegen_x86.py       |  74 ++++--
 5 files changed, 298 insertions(+), 21 deletions(-)

diff --git a/src/runtime/builtin_fp16.cc b/src/runtime/builtin_fp16.cc
index 4b175fb3ff60..d229491a4c7b 100644
--- a/src/runtime/builtin_fp16.cc
+++ b/src/runtime/builtin_fp16.cc
@@ -48,7 +48,4 @@ TVM_DLL float __gnu_h2f_ieee(uint16_t a) {
 }
 
 #endif
-
-TVM_DLL uint16_t __truncsfhf2(float v) { return __gnu_f2h_ieee(v); }
-TVM_DLL float __extendhfsf2(uint16_t v) { return __gnu_h2f_ieee(v); }
 }
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 305358d079d0..ca9d577f64f6 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -34,6 +34,11 @@
 #else
 #include <llvm/Support/Dwarf.h>
 #endif
+#if TVM_LLVM_VERSION >= 60
+#include <llvm/CodeGen/TargetSubtargetInfo.h>
+#else
+#include <llvm/Target/TargetSubtargetInfo.h>
+#endif
 #include <llvm/IR/Argument.h>
 #include <llvm/IR/Attributes.h>
 #include <llvm/IR/BasicBlock.h>
@@ -167,6 +172,45 @@ void CodeGenLLVM::InitTarget() {
       LOG(WARNING) << "Set native vector bits to be 128 for " << arch_name;
     }
   }
+
+#if TVM_LLVM_VERSION >= 60
+  bool use_float16_abi = false;
+#if TVM_LLVM_VERSION >= 150
+  // For conversions between _Float16 and float, LLVM uses runtime functions
+  // __extendhfsf2 and __truncsfhf2.  On X86 up until version 14, LLVM used
+  // "uint16_t" for representing _Float16. Starting with LLVM 15, half-precision
+  // values can be passed in XMM registers (i.e. as floating-point). This happens
+  // when the compilation target has SSE2 enabled (either directly, or by enabling
+  // a feature that implies SSE2).
+  // Because the names of the conversion functions remain unchanged, it is impossible
+  // for TVM to provide them in the runtime, and have them work in both cases.
+  // To alleviate this issue, emit these functions directly into the target module
+  // after detecting whether or not to use floating-point ABI. To allow the linker
+  // to remove potential duplicates (or if they are unused), they are weak and
+  // reside in a separate section (ELF).
+  llvm::Triple::ArchType arch_type = tm->getTargetTriple().getArch();
+  if (arch_type == llvm::Triple::x86 || arch_type == llvm::Triple::x86_64) {
+    // Detect if SSE2 is enabled. This determines whether float16 ABI is used.
+    std::stringstream os;
+    const char fname[] = "test_sse2";
+    os << "target triple = \"" << llvm_target_->GetTargetTriple() << "\"\n"
+       << "define void @" << fname << "() #0 { ret void } attributes #0 = { \"target-cpu\"=\""
+       << llvm_target_->GetCPU() << "\" ";
+    if (auto&& fs = llvm_target_->GetTargetFeatureString(); !fs.empty()) {
+      os << "\"target-features\"=\"" << fs << "\" ";
+    }
+    os << "}\n";
+    auto mod = llvm_target_->GetInstance().ParseIR(os.str());
+    auto* test_sse2 = mod->getFunction(fname);
+    ICHECK_NE(test_sse2, nullptr) << "Module creation error";
+    use_float16_abi = tm->getSubtargetImpl(*test_sse2)->checkFeatures("+sse2");
+  }
+#endif  // TVM_LLVM_VERSION >= 150
+
+  // Call this function only with LLVM >= 6.0. The code it emits uses "dso_local"
+  // which was introduced in LLVM 6.
+  EmitFloat16ConversionBuiltins(use_float16_abi);
+#endif  // TVM_LLVM_VERSION >= 60
 }
 
 void CodeGenLLVM::AddFunction(const PrimFunc& f) { this->AddFunctionInternal(f, false); }
@@ -949,6 +993,189 @@ void CodeGenLLVM::SetTargetAttributes(llvm::Function* func) {
   }
 }
 
+void CodeGenLLVM::EmitFloat16ConversionBuiltins(bool use_float16_abi) {
+  // The LLVM IR for these function was obtained by compiling
+  //
+  // For integer ABI:
+  // __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(a);
+  // __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(a);
+  // For floating-point ABI:
+  // __truncXfYf2__<float, uint32_t, 23, _Float16, uint16_t, 10>(a);
+  // __extendXfYf2__<_Float16, uint16_t, 10, float, uint32_t, 23>(a);
+
+  static const char trunc_body[] =  // __truncsfhf2
+      "  %v0 = bitcast float %a0 to i32\n"
+      "  %v1 = and i32 %v0, 2147483647\n"
+      "  %v2 = add nsw i32 %v1, -947912704\n"
+      "  %v3 = add nsw i32 %v1, -1199570944\n"
+      "  %v4 = icmp ult i32 %v2, %v3\n"
+      "  br i1 %v4, label %b1, label %b5\n"
+      "b1:\n"
+      "  %v5 = lshr i32 %v0, 13\n"
+      "  %v6 = and i32 %v5, 65535\n"
+      "  %v7 = add nuw nsw i32 %v6, -114688\n"
+      "  %v8 = and i32 %v0, 8191\n"
+      "  %v9 = icmp ugt i32 %v8, 4096\n"
+      "  br i1 %v9, label %b2, label %b3\n"
+      "b2:\n"
+      "  %v10 = add nuw nsw i32 %v6, -114687\n"
+      "  br label %b13\n"
+      "b3:\n"
+      "  %v11 = icmp eq i32 %v8, 4096\n"
+      "  br i1 %v11, label %b4, label %b13\n"
+      "b4:\n"
+      "  %v12 = and i32 %v7, 65535\n"
+      "  %v13 = and i32 %v5, 1\n"
+      "  %v14 = add nuw nsw i32 %v12, %v13\n"
+      "  br label %b13\n"
+      "b5:\n"
+      "  %v15 = icmp ugt i32 %v1, 2139095040\n"
+      "  br i1 %v15, label %b6, label %b7\n"
+      "b6:\n"
+      "  %v16 = lshr i32 %v0, 13\n"
+      "  %v17 = and i32 %v16, 511\n"
+      "  %v18 = or i32 %v17, 32256\n"
+      "  br label %b13\n"
+      "b7:\n"
+      "  %v19 = icmp ugt i32 %v1, 1199570943\n"
+      "  br i1 %v19, label %b13, label %b8\n"
+      "b8:\n"
+      "  %v20 = icmp ult i32 %v1, 754974720\n"
+      "  br i1 %v20, label %b13, label %b9\n"
+      "b9:\n"
+      "  %v21 = lshr i32 %v1, 23\n"
+      "  %v22 = sub nsw i32 113, %v21\n"
+      "  %v23 = and i32 %v0, 8388607\n"
+      "  %v24 = or i32 %v23, 8388608\n"
+      "  %v25 = add nsw i32 %v21, -81\n"
+      "  %v26 = shl i32 %v24, %v25\n"
+      "  %v27 = icmp ne i32 %v26, 0\n"
+      "  %v28 = lshr i32 %v24, %v22\n"
+      "  %v29 = zext i1 %v27 to i32\n"
+      "  %v30 = lshr i32 %v28, 13\n"
+      "  %v31 = and i32 %v28, 8191\n"
+      "  %v32 = or i32 %v31, %v29\n"
+      "  %v33 = icmp ugt i32 %v32, 4096\n"
+      "  br i1 %v33, label %b10, label %b11\n"
+      "b10:\n"
+      "  %v34 = add nuw nsw i32 %v30, 1\n"
+      "  br label %b13\n"
+      "b11:\n"
+      "  %v35 = icmp eq i32 %v32, 4096\n"
+      "  br i1 %v35, label %b12, label %b13\n"
+      "b12:\n"
+      "  %v36 = and i32 %v30, 1\n"
+      "  %v37 = add nuw nsw i32 %v36, %v30\n"
+      "  br label %b13\n"
+      "b13:\n"
+      "  %v38 = phi i32 [ %v18, %b6 ], [ %v10, %b2 ], [ %v14, %b4 ], [ %v7, %b3 ],\n"
+      "                 [ 31744, %b7 ], [ 0, %b8 ], [ %v34, %b10 ], [ %v37, %b12 ],\n"
+      "                 [ %v30, %b11 ]\n"
+      "  %v39 = lshr i32 %v0, 16\n"
+      "  %v40 = and i32 %v39, 32768\n"
+      "  %v41 = or i32 %v38, %v40\n"
+      "  %vlast = trunc i32 %v41 to i16\n";
+
+  static const char extend_body[] =  // __extendhfsf2
+      "  %v1 = and i16 %vinp, 32767\n"
+      "  %v2 = zext i16 %v1 to i32\n"
+      "  %v3 = add nsw i16 %v1, -1024\n"
+      "  %v4 = icmp ult i16 %v3, 30720\n"
+      "  br i1 %v4, label %b1, label %b2\n"
+      "b1:\n"
+      "  %v5 = shl nuw nsw i32 %v2, 13\n"
+      "  %v6 = add nuw nsw i32 %v5, 939524096\n"
+      "  br label %b6\n"
+      "b2:\n"
+      "  %v7 = icmp ugt i16 %v1, 31743\n"
+      "  br i1 %v7, label %b3, label %b4\n"
+      "b3:\n"
+      "  %v8 = shl nuw nsw i32 %v2, 13\n"
+      "  %v9 = or i32 %v8, 2139095040\n"
+      "  br label %b6\n"
+      "b4:\n"
+      "  %v10 = icmp eq i16 %v1, 0\n"
+      "  br i1 %v10, label %b6, label %b5\n"
+      "b5:\n"
+      "  %v11 = icmp ult i16 %v1, 256\n"
+      "  %v12 = lshr i32 %v2, 8\n"
+      "  %v13 = select i1 %v11, i32 %v2, i32 %v12\n"
+      "  %v14 = select i1 %v11, i32 32, i32 24\n"
+      "  %v15 = icmp ult i32 %v13, 16\n"
+      "  %v16 = lshr i32 %v13, 4\n"
+      "  %v17 = add nsw i32 %v14, -4\n"
+      "  %v18 = select i1 %v15, i32 %v13, i32 %v16\n"
+      "  %v19 = select i1 %v15, i32 %v14, i32 %v17\n"
+      "  %v20 = icmp ult i32 %v18, 4\n"
+      "  %v21 = lshr i32 %v18, 2\n"
+      "  %v22 = add nsw i32 %v19, -2\n"
+      "  %v23 = select i1 %v20, i32 %v18, i32 %v21\n"
+      "  %v24 = select i1 %v20, i32 %v19, i32 %v22\n"
+      "  %v25 = icmp ult i32 %v23, 2\n"
+      "  %v26 = sub nsw i32 0, %v23\n"
+      "  %v27 = select i1 %v25, i32 %v26, i32 -2\n"
+      "  %v28 = add nsw i32 %v27, %v24\n"
+      "  %v29 = add nsw i32 %v28, -8\n"
+      "  %v30 = shl i32 %v2, %v29\n"
+      "  %v31 = xor i32 %v30, 8388608\n"
+      "  %v32 = shl i32 %v28, 23\n"
+      "  %v33 = sub i32 1124073472, %v32\n"
+      "  %v34 = or i32 %v31, %v33\n"
+      "  br label %b6\n"
+      "b6:\n"
+      "  %v35 = phi i32 [ %v6, %b1 ], [ %v9, %b3 ], [ %v34, %b5 ], [ 0, %b4 ]\n"
+      "  %v36 = and i16 %vinp, -32768\n"
+      "  %v37 = zext i16 %v36 to i32\n"
+      "  %v38 = shl nuw i32 %v37, 16\n"
+      "  %v39 = or i32 %v35, %v38\n"
+      "  %v40 = bitcast i32 %v39 to float\n"
+      "  ret float %v40\n"
+      "}\n";
+
+  std::string short_type = use_float16_abi ? "half" : "i16";
+
+  std::string short_cast_in, short_cast_out;
+  if (use_float16_abi) {
+    short_cast_in = "  %vinp = bitcast half %a0 to i16\n";
+    short_cast_out = "  %vres = bitcast i16 %vlast to half\n";
+  } else {
+    // No-ops that preserve the i16 values.
+    short_cast_in = "  %vinp = add i16 %a0, 0\n";
+    short_cast_out = "  %vres = add i16 %vlast, 0\n";
+  }
+
+  llvm::Triple triple(llvm_target_->GetTargetTriple());
+
+  static const char elf_section_name[] = ".text.tvm.fp16.conv";
+  std::string section = triple.getObjectFormat() == llvm::Triple::ELF
+                            ? std::string("section \"") + elf_section_name + "\" "
+                            : "";
+
+  std::string trunc_header = "define weak dso_local " + short_type +
+                             " @__truncsfhf2(float %a0) local_unnamed_addr #0 " + section +
+                             "{\nb0:\n";
+  std::string trunc_return = "  ret " + short_type + " %vres\n}\n";
+
+  std::string extend_header = "define weak dso_local float @__extendhfsf2(" + short_type +
+                              " %a0) local_unnamed_addr #0 " + section + "{\nb0:\n";
+
+  // truncate = trunc_header + trunc_body + short_cast_out + trunc_return
+  // extend   = extend_header + short_cast_in + extend_body
+
+  std::string attributes = "attributes #0 = { nounwind readnone \"target-cpu\"=\"" +
+                           llvm_target_->GetCPU() + "\" \"target-features\"=\"" +
+                           llvm_target_->GetTargetFeatureString() + "\" }\n";
+
+  auto data_layout = llvm_target_->GetOrCreateTargetMachine()->createDataLayout();
+  std::string module_ir = "target triple = \"" + llvm_target_->GetTargetTriple() + "\"\n" +
+                          "target datalayout = \"" + data_layout.getStringRepresentation() +
+                          "\"\n" + trunc_header + trunc_body + short_cast_out + trunc_return +
+                          extend_header + short_cast_in + extend_body + attributes;
+
+  auto builtins_module = llvm_target_->GetInstance().ParseIR(module_ir);
+  link_modules_.push_back(std::move(builtins_module));
+}
+
 llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
   if (op->op.same_as(builtin_call_llvm_intrin_) || op->op.same_as(builtin_call_llvm_pure_intrin_)) {
     ICHECK_GE(op->args.size(), 2U);
diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h
index e6321be647aa..7a8daf2e761f 100644
--- a/src/target/llvm/codegen_llvm.h
+++ b/src/target/llvm/codegen_llvm.h
@@ -395,6 +395,14 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
    * \param func The function to set attributes on.
    */
   void SetTargetAttributes(llvm::Function* func);
+  /*!
+   * \brief Emit LLVM IR for conversion functions __extendhfsf2 and __truncsfhf2
+   *        into the current llvm::Module.
+   *
+   * \param use_float16_abi Whether to use floating-point or integer ABI.
+   */
+  void EmitFloat16ConversionBuiltins(bool use_float16_abi);
+
   /*!
    * \brief Get the number of elements in the given vector value.
    * \param vec The value, must be of a vector type.
diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py
index c57648382827..e179d17101a3 100644
--- a/tests/python/unittest/test_target_codegen_llvm.py
+++ b/tests/python/unittest/test_target_codegen_llvm.py
@@ -18,11 +18,11 @@
 import ctypes
 import json
 import math
+import numpy as np
+import pytest
 import re
 import sys
 
-import numpy as np
-import pytest
 import tvm
 import tvm.testing
 from tvm import te
@@ -854,7 +854,8 @@ def make_call_extern(caller, callee):
     }
     mod = tvm.IRModule(functions=functions)
     ir_text = tvm.build(mod, None, target="llvm").get_source("ll")
-    matches = re.findall(r"^define[^@]*@([a-zA-Z_][a-zA-Z0-9_]*)", ir_text, re.MULTILINE)
+    # Skip functions whose names start with _.
+    matches = re.findall(r"^define[^@]*@([a-zA-Z][a-zA-Z0-9_]*)", ir_text, re.MULTILINE)
     assert matches == sorted(matches)
 
 
diff --git a/tests/python/unittest/test_target_codegen_x86.py b/tests/python/unittest/test_target_codegen_x86.py
index ec42e0a4d749..af91ed4520fd 100644
--- a/tests/python/unittest/test_target_codegen_x86.py
+++ b/tests/python/unittest/test_target_codegen_x86.py
@@ -14,27 +14,25 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import tvm
-from tvm import te
+import numpy as np
+import platform
+import pytest
 import re
+import textwrap
 
+import tvm
+from tvm import te
 
-def test_fp16_to_fp32():
-    if tvm.target.codegen.llvm_version_major() < 6:
-        print(
-            "Skipping due to LLVM version being {} < 6".format(
-                tvm.target.codegen.llvm_version_major()
-            )
-        )
-        return
+llvm_version = tvm.target.codegen.llvm_version_major()
+machine = platform.machine()
 
-    import platform
+if machine not in ["i386", "x86_64", "AMD64", "amd64"]:
+    pytest.skip(f"Requires x86_64/i386, but machine is {machine}", allow_module_level=True)
 
-    machine = platform.machine()
-    if machine not in ["x86_64", "i386", "AMD64"]:
-        print("Skipping test because the platform is: {} ".format(machine))
-        return
 
+@tvm.testing.requires_llvm
+@pytest.mark.skipif(llvm_version < 6, reason=f"Requires LLVM 6+, got {llvm_version}")
+def test_fp16_to_fp32():
     def fp16_to_fp32(target, width, match=None, not_match=None):
         elements = 64
         n = tvm.runtime.convert(elements)
@@ -63,5 +61,51 @@ def fp16_to_fp32(target, width, match=None, not_match=None):
     fp16_to_fp32("llvm", 9, not_match="vcvtph2ps")
 
 
+is_32bit = platform.architecture()[0] == "32bit"
+
+
+@tvm.testing.requires_llvm
+@pytest.mark.skipif(is_32bit, reason=f"Fails in CI due to architecture mismatch in JIT")
+@pytest.mark.parametrize("feature_string", ["-sse2", "+sse2"])
+def test_fp16_fp32_conversions(feature_string):
+    relay_model = textwrap.dedent(
+        """
+        #[version = "0.0.5"]
+        def @main(%inp : Tensor[(3), float32], %cst : Tensor[(3), float32]) {
+            %1 = cast(%inp, dtype="float16");
+            %2 = cast(%cst, dtype="float16");
+            %3 = add(%1, %2);
+            %4 = cast(%3, dtype="float32");
+            %4
+        }
+        """
+    )
+
+    ir_mod = tvm.parser.fromtext(relay_model)
+
+    arch = "i386" if machine == "i386" else "x86_64"
+    aot_factory = tvm.relay.build(
+        ir_mod,
+        params={"cst": np.array([1.0, 2.0, 3.0], dtype="float32")},
+        target=f"llvm --mtriple={arch} --mattr={feature_string}",
+        executor=tvm.relay.backend.Executor(
+            "aot", {"interface-api": "packed", "unpacked-api": False}
+        ),
+    )
+
+    mod_name = aot_factory["list_module_names"]()[0]
+    executor = aot_factory[mod_name]
+    mod = executor(tvm.cpu(0))
+
+    inp = tvm.nd.array(np.array([1.1, 2.1, 3.1], dtype="float32"), device=tvm.cpu(0))
+
+    mod.get_function("set_input")(0, inp)
+    mod.get_function("run")()
+    out = mod.get_function("get_output")(0)
+
+    expected = np.array([2.1, 4.1, 6.1], dtype="float32")
+    np.testing.assert_allclose(out.asnumpy(), expected, rtol=1e-3)
+
+
 if __name__ == "__main__":
     test_fp16_to_fp32()

From b61f633e10b02ac3e767ad268562a4dd2c178de5 Mon Sep 17 00:00:00 2001
From: Yaoda Zhou <judaplus@sjtu.edu.cn>
Date: Tue, 27 Sep 2022 14:20:59 +0800
Subject: [PATCH 256/704] [TVM PyTorch Integration] optimized_torch & as_torch
 how-to guide (#12318)

* how-to use optmized_torch

* as_torch

* format

* one more comment

* improve doc

* improve code

* fix text

* SSR

* CPU model

* whitespace

* improve document

* small edit

* retrigger ci

* using_as_torch polish

* using_optimized_torch

* fix errors

* one more author

* small edit

* polish as_torch

* save progress

* more edit

* small edit

Co-authored-by: juda <yzhou@octoml.ai>
---
 .../work_with_pytorch/using_as_torch.py       | 159 ++++++++++++++++++
 .../using_optimized_torch.py                  | 149 ++++++++++++++++
 python/tvm/contrib/torch/as_torch.py          |   9 +-
 python/tvm/contrib/torch/optimize_torch.py    |   4 +-
 4 files changed, 316 insertions(+), 5 deletions(-)
 create mode 100644 gallery/how_to/work_with_pytorch/using_as_torch.py
 create mode 100644 gallery/how_to/work_with_pytorch/using_optimized_torch.py

diff --git a/gallery/how_to/work_with_pytorch/using_as_torch.py b/gallery/how_to/work_with_pytorch/using_as_torch.py
new file mode 100644
index 000000000000..e17a29e277ea
--- /dev/null
+++ b/gallery/how_to/work_with_pytorch/using_as_torch.py
@@ -0,0 +1,159 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Wrap Your TVMScript as PyTorch Module
+======================
+**Author**: 
+`Yaoda Zhou <https://github.com/juda>`_
+
+This article is a tutorial on wrapping the TVMScript code as the PyTorch module.
+Using the decorator `as_torch`, users can wrap TVMScript code into a PyTorch nn.Module naturally.
+"""
+
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
+# Import PyTorch, as well as necessary libraries
+import torch
+import torch.nn.functional as F
+import torch.utils.benchmark as benchmark
+
+import tvm
+from tvm.contrib.torch import as_torch
+from tvm.script import tir as T
+
+######################################################################
+# Write your own PyTorch operator by TVMScript
+# -------------------------------
+# PyTorch is a very popular machine learning framework which contains
+# optimized implementations of most commonly used operators.
+# Nevertheless, sometimes you might want to write your own operators in PyTorch.
+# In that case, the performance of such custom operators might not be satisfactory for your needs.
+#
+# For example, suppose that we are going to define a 1-d depthwise convolution operator.
+# Assume the number of in_channel and out_channel are both 70,
+# the width is 80 and the kernel size is 20,
+# then the 1-d depthwise conv could be written in PyTorch in one line:
+
+in_channel = 70
+out_channel = 70
+width = 80
+kernel_size = 20
+
+
+def torch_depthwise(inputs, filters):
+    return F.conv1d(inputs, filters.view(out_channel, 1, kernel_size), groups=out_channel)
+
+
+# We can run this function as:
+
+inputs = torch.randn(in_channel, width)
+filters = torch.randn(out_channel, kernel_size)
+ret_torch = torch_depthwise(inputs, filters)
+
+
+# The `torch_depthwise` function, in a plain Python code, could be written as:
+
+
+def vanilla_depthwise(input, weight):
+    ret = torch.zeros(out_channel, width - kernel_size + 1)
+    for j in range(out_channel):
+        for i in range(width - kernel_size + 1):
+            for k in range(kernel_size):
+                ret[j, i] += weight[j, k] * input[j, i + k]
+    return ret
+
+
+# Then, we plan to optimize the `depthwise` function by leveraging the power of TVM.
+# TVM community proposes an embedded Domain Specific Language in Python called TVMScript,
+# which serves as the high-level frontend for TVM's Tensor IR.
+# The depthwise 1D convolution code above can be translated to TVMScript as follows.
+# We provide an `as_torch` decorator, which converts the TVMScript code to PyTorch's nn.Module automatically.
+
+
+@as_torch
+@T.prim_func
+def tvm_depthwise(
+    A: T.Buffer((70, 80), "float32"),
+    B: T.Buffer((70, 20), "float32"),
+    C: T.Buffer((70, 61), "float32"),
+) -> None:
+    for j, i, k in T.grid(70, 61, 20):
+        with T.block():
+            vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+            with T.init():
+                C[vj, vi] = T.float32(0)
+            C[vj, vi] += B[vj, vk] * A[vj, vi + vk]
+
+
+# We can build the TVMScript code by calling the `tune` method in default setting.
+# Without providing extra information, the model will be tuned for CPU.
+
+tvm_depthwise.tune()
+
+# We can print out the tuned TVMScript code to see how the program is transformed, as
+
+print(tvm_depthwise.script())
+
+# We can verify that the two outputs are the same:
+
+ret_tvm = torch.zeros(out_channel, width - kernel_size + 1)
+tvm_depthwise(inputs, filters, ret_tvm)
+
+testing.assert_allclose(ret_torch.cpu().numpy(), ret_tvm.cpu().numpy(), atol=1e-5, rtol=1e-5)
+
+
+######################################################################
+# Benchmark
+# -------------------------------
+
+results = []
+for i in range(5):
+    inputs = torch.randn(out_channel, width)
+    filters = torch.randn(out_channel, kernel_size)
+    res = torch.zeros(out_channel, width - kernel_size + 1)
+    sub_label = f"[test {i}]"
+    results.append(
+        benchmark.Timer(
+            stmt="tvm_depthwise(inputs, filters, res)",
+            setup="from __main__ import tvm_depthwise",
+            globals={"inputs": inputs, "filters": filters, "res": res},
+            sub_label=sub_label,
+            description="TVMScript",
+        ).blocked_autorange()
+    )
+    results.append(
+        benchmark.Timer(
+            stmt="torch_depthwise(inputs, filters)",
+            setup="from __main__ import torch_depthwise",
+            globals={
+                "inputs": inputs,
+                "filters": filters,
+            },
+            sub_label=sub_label,
+            description="PyTorch",
+        ).blocked_autorange()
+    )
+compare = benchmark.Compare(results)
+compare.print()
+
+# In author's environment, the average inference time of `tvm_depthwise` is 120.0 us,
+# while the average inference time of `torch_depthwise` is 196.0 us (PyTorch version is 1.11.0),
+# showing the speedup of around 38%.
diff --git a/gallery/how_to/work_with_pytorch/using_optimized_torch.py b/gallery/how_to/work_with_pytorch/using_optimized_torch.py
new file mode 100644
index 000000000000..aa68d9e68ec6
--- /dev/null
+++ b/gallery/how_to/work_with_pytorch/using_optimized_torch.py
@@ -0,0 +1,149 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Compile PyTorch Models
+======================
+**Author**: 
+`Yaoda Zhou <https://github.com/juda>`_
+
+This article is a tutorial to optimize PyTorch models by using decorator `optimize_torch`.
+To follow this tutorial, PyTorch, as well as TorchVision, should be installed.
+"""
+
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
+# Import PyTorch
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# Import library for profiling
+import torch.utils.benchmark as benchmark
+from torchvision.models import resnet18
+
+# Import `optimize_torch` function
+from tvm.contrib.torch import optimize_torch
+from tvm.meta_schedule import TuneConfig
+
+######################################################################
+# Define a simple module written by PyTorch
+# ------------------------------
+
+
+class SimpleModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2d(1, 20, 5)
+        self.conv2 = nn.Conv2d(20, 20, 5)
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        return F.relu(self.conv2(x))
+
+
+######################################################################
+# Optimize SimpleModel by TVM MetaSchedule
+# ------------------------------
+# We provide the `optimize_torch` function, which has the similar usage as `torch.jit.trace`.
+# The PyTorch model to optimize, along with its example input, are provided by users.
+# The PyTorch module will be tuned by TVM for the target hardware.
+# Without providing extra information, the model will be tuned for CPU.
+
+simple_model = SimpleModel()
+example_input = torch.randn(20, 1, 10, 10)
+model_optimized_by_tvm = optimize_torch(simple_model, example_input)
+
+######################################################################
+# Save/Load module
+# ------------------------------
+# We can save and load our tuned module like the standard `nn.Module`.
+
+# Let us run our tuned module.
+ret1 = model_optimized_by_tvm(example_input)
+
+torch.save(model_optimized_by_tvm, "model_optimized.pt")
+model_loaded = torch.load("model_optimized.pt")
+
+# We load the module and run it again.
+ret2 = model_loaded(example_input)
+
+# We will show 2 results:
+# (1) we can safely load and save model by showing the result of model
+# after save and load operations is still the same as original one;
+# (2) the model we optimize returns the same result as the original PyTorch model.
+
+ret3 = simple_model(example_input)
+testing.assert_allclose(ret1.detach().numpy(), ret2.detach().numpy(), atol=1e-5, rtol=1e-5)
+testing.assert_allclose(ret1.detach().numpy(), ret3.detach().numpy(), atol=1e-5, rtol=1e-5)
+
+######################################################################
+# Optimize resnet18
+# ------------------------------
+# In the following, we will show that our approach is able to
+# accelerate common models, such as resnet18.
+
+# We will tune our model for the GPU.
+target_cuda = "nvidia/geforce-rtx-3070"
+
+# For PyTorch users, the code could be written as usual, except for
+# applying "optimize_torch" function on the resnet18 model.
+
+resnet18_tvm = optimize_torch(
+    resnet18().cuda().eval(), [torch.rand(1, 3, 224, 224).cuda()], target=target_cuda
+)
+
+# TorchScript also provides a built-in "optimize_for_inference" function to accelerate the inference.
+resnet18_torch = torch.jit.optimize_for_inference(torch.jit.script(resnet18().cuda().eval()))
+
+
+######################################################################
+# Compare the performance between two approaches.
+# ------------------------------
+
+results = []
+for i in range(5):
+    test_input = torch.rand(1, 3, 224, 224).cuda()
+    sub_label = f"[test {i}]"
+    results.append(
+        benchmark.Timer(
+            stmt="resnet18_tvm(test_input)",
+            setup="from __main__ import resnet18_tvm",
+            globals={"test_input": test_input},
+            sub_label=sub_label,
+            description="tuning by meta",
+        ).blocked_autorange()
+    )
+    results.append(
+        benchmark.Timer(
+            stmt="resnet18_torch(test_input)",
+            setup="from __main__ import resnet18_torch",
+            globals={"test_input": test_input},
+            sub_label=sub_label,
+            description="tuning by jit",
+        ).blocked_autorange()
+    )
+
+compare = benchmark.Compare(results)
+compare.print()
+
+# In author's environment, the average inference time of `resnet18_tvm` is 620.0 us,
+# while the average inference time of `resnet18_torch` is 980.0 us (PyTorch version is 1.11.0),
+# showing the speedup of around 38%.
diff --git a/python/tvm/contrib/torch/as_torch.py b/python/tvm/contrib/torch/as_torch.py
index 3a2b4dda9ea9..a8cd895a6c5e 100644
--- a/python/tvm/contrib/torch/as_torch.py
+++ b/python/tvm/contrib/torch/as_torch.py
@@ -21,7 +21,7 @@
 # pylint: disable=missing-class-docstring
 # pylint: disable=missing-function-docstring
 """
-as_torch: a decorator, which is used to wrap the TVMscript code to `torch.nn.module`.
+as_torch: a decorator, which is used to wrap the TVMScript code to `torch.nn.module`.
 """
 import tempfile
 from typing import Callable, List, Union
@@ -50,7 +50,7 @@ def __init__(
 
     def tune(self, config: TuneConfig = None, target: Union[str, Target] = None):
         """
-        Tune the TVMscript code.
+        Tune the TVMScript code.
 
         Parameters
         ----------
@@ -80,6 +80,9 @@ def tune(self, config: TuneConfig = None, target: Union[str, Target] = None):
             self.ir_module = sch.mod
             self.build(target)
 
+    def script(self):
+        return self.ir_module.script()
+
     def build(self, target=None):
         runtime_module = tvm.build(self.ir_module, target=target)
         func = tvm.get_global_func("tvmtorch.save_runtime_mod")
@@ -105,7 +108,7 @@ def as_torch(func: Union[tvm.ir.module.IRModule, tvm.tir.function.PrimFunc, Call
     Parameters
     ----------
     func: Optional[tvm.ir.module.IRModule, tvm.tir.function.PrimFunc, Callable]
-        The function written by TVMscript.
+        The function written by TVMScript.
 
     Returns
     -------
diff --git a/python/tvm/contrib/torch/optimize_torch.py b/python/tvm/contrib/torch/optimize_torch.py
index 282e6c5dc84f..821a3b1f71d5 100644
--- a/python/tvm/contrib/torch/optimize_torch.py
+++ b/python/tvm/contrib/torch/optimize_torch.py
@@ -40,7 +40,6 @@
 from tvm.ir.module import IRModule
 from tvm.ir.transform import PassContext
 from tvm.meta_schedule import TuneConfig, default_config
-from tvm.meta_schedule.apply_history_best import ApplyHistoryBest
 from tvm.meta_schedule.relay_integration import extract_task_from_relay
 from tvm.meta_schedule.tune import tune_extracted_tasks
 from tvm.meta_schedule.utils import autotvm_silencer
@@ -114,12 +113,13 @@ def tune_relay_auto(
         )
     database = tune_extracted_tasks(extracted_tasks, config, work_dir)
     relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend]
-    with target, autotvm_silencer(), ApplyHistoryBest(database):
+    with target, autotvm_silencer(), database:
         with PassContext(
             opt_level=3,
             config={
                 "relay.backend.use_meta_schedule": True,
                 "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda",
+                "relay.backend.tir_converter": "default",
             },
         ):
             return relay_build(mod, target=target, params=params)

From 7a4c10c44a9255ac2fa52ce7e3a83f718d60823f Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Tue, 27 Sep 2022 03:56:42 -0500
Subject: [PATCH 257/704] [TIR][Transform] Remove num_unpacked_args from
 MakePackedAPI (#12892)

Other than a single unit test, there are no usages of this parameter
in TVM, and it significantly complicates the logic of `MakePackedAPI`.
Similar functionality can be had by using `MakeUnpackedAPI` instead.
---
 include/tvm/tir/transform.h                   |   9 +-
 python/tvm/tir/transform/transform.py         |  11 +-
 src/driver/driver_api.cc                      |   2 +-
 src/tir/transforms/make_packed_api.cc         | 109 ++++++------------
 .../test_tir_transform_make_packed_api.py     |   5 +-
 5 files changed, 46 insertions(+), 90 deletions(-)

diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
index a4caeee43604..6aa1aca69970 100644
--- a/include/tvm/tir/transform.h
+++ b/include/tvm/tir/transform.h
@@ -192,16 +192,13 @@ TVM_DLL Pass InstrumentBoundCheckers();
  *   - Map the values in the api_args to Var that is required by body.
  *   - Insert assertions to check type/value of the passed arguments.
  *
- * \param num_unpacked_args Number of arguments that
- *         are processed in plain form instead of packed form.
- *
  * \note
  *  The function signature have two cases
  *
- *  let num_packed_args = len(api_args) - num_unpacked_args;
+ *  let num_packed_args = len(api_args);
  *
  *  if num_packed_args is zero:
- *     f(api_arg_0, api_arg_1, .., api_arg_n) where n == len(api_args)
+ *     f()
  *
  *  if num_packed_args is not zero:
  *       f(TVMArg* packed_args, int* packed_arg_type_ids, int num_packed_args,
@@ -212,7 +209,7 @@ TVM_DLL Pass InstrumentBoundCheckers();
  *
  * \return The pass.
  */
-TVM_DLL Pass MakePackedAPI(int num_unpacked_args);
+TVM_DLL Pass MakePackedAPI();
 
 /*!
  * \brief Transform the high-level PrimFunc to a C signature that can be used
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
index 324471c71891..3c1ca196f1b0 100644
--- a/python/tvm/tir/transform/transform.py
+++ b/python/tvm/tir/transform/transform.py
@@ -387,22 +387,15 @@ def LowerCustomDatatypes():
     return _ffi_api.LowerCustomDatatypes()  # type: ignore
 
 
-def MakePackedAPI(num_unpacked_params: int = -1):
+def MakePackedAPI():
     """Transform the PrimFuncs in the module to a packed func API.
 
-    Parameters
-    ----------
-    num_unpacked_params : int
-        Number of parameters that we hope to directly pass via normal arguments
-        following the PackedFunc input signature. If it is specified as -1 or it
-        is less than the number of arguments, the pass will packed arguments still.
-
     Returns
     -------
     fpass : tvm.transform.Pass
         The result pass
     """
-    return _ffi_api.MakePackedAPI(num_unpacked_params)  # type: ignore
+    return _ffi_api.MakePackedAPI()  # type: ignore
 
 
 def MakeUnpackedAPI():
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index 1a617dcd494d..b460557da034 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -561,7 +561,7 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target)
   if (unpacked_api) {
     mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI());
   } else {
-    mixed_pass_list.push_back(tir::transform::MakePackedAPI(-1));
+    mixed_pass_list.push_back(tir::transform::MakePackedAPI());
   }
   mixed_pass_list.push_back(tir::transform::SplitHostDevice());
 
diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc
index 4f8ad1223cd2..bf7ff09c86c7 100644
--- a/src/tir/transforms/make_packed_api.cc
+++ b/src/tir/transforms/make_packed_api.cc
@@ -139,7 +139,7 @@ inline Stmt MakeAssertEQ(PrimExpr lhs, PrimExpr rhs, std::string msg) {
   return AssertStmt(lhs == rhs, tvm::tir::StringImm(msg), Evaluate(0));
 }
 
-PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
+PrimFunc MakePackedAPI(PrimFunc&& func) {
   auto global_symbol = func->GetAttr<String>(tvm::attr::kGlobalSymbol);
   ICHECK(global_symbol) << "MakePackedAPI: Expect PrimFunc to have the global_symbol attribute";
 
@@ -152,14 +152,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
   auto* func_ptr = func.CopyOnWrite();
   const Stmt nop = Evaluate(0);
   int num_args = static_cast<int>(func_ptr->params.size());
-  ICHECK_LE(num_unpacked_args, num_args);
-  bool pack_args = (num_unpacked_args == -1) || (num_args > num_unpacked_args);
-  if (num_unpacked_args == -1) {
-    // reset to zero
-    num_unpacked_args = 0;
-  }
-  ICHECK_GE(num_unpacked_args, 0);
-  int num_packed_args = num_args - num_unpacked_args;
+
   // Data field definitions
   // The packed fields
   Var v_packed_args("args", DataType::Handle());
@@ -170,7 +163,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
   Var v_out_ret_tcode("out_ret_tcode", PointerType(PrimType(DataType::Int(32))));
   Var v_resource_handle("resource_handle", DataType::Handle());
   // The arguments of the function.
-  Array<Var> args;
+
   // The device context
   Var device_id("dev_id");
   Integer device_type(target_device_type);
@@ -194,14 +187,6 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
     }
     return res;
   };
-  // ---------------------------
-  // start of logics
-  // add signature for packed arguments.
-  if (pack_args) {
-    args.push_back(v_packed_args);
-    args.push_back(buf_packed_arg_type_ids->data);
-    args.push_back(v_num_packed_args);
-  }
 
   // Need to re-declare vars, in case some arguments also appears in the buffer.
   std::vector<std::pair<Var, Var>> var_def;
@@ -219,7 +204,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
 
     // Pluck the device API context out based on name
     if (param->name_hint == kDeviceContextVar) {
-      num_packed_args--;
+      num_args--;
       v_resource_handle = param;
       continue;
     }
@@ -232,44 +217,34 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
       var_def.emplace_back(v_arg, param);
     }
 
-    if (i < num_packed_args) {
-      // Value loads
-      seq_init.emplace_back(LetStmt(v_arg, f_arg_value(v_arg.dtype(), i), nop));
-      // type code checks
-      Var tcode(v_arg->name_hint + ".code", DataType::Int(32));
-      seq_init.emplace_back(
-          LetStmt(tcode, BufferLoad(buf_packed_arg_type_ids, {IntImm(DataType::Int(32), i)}), nop));
-      DataType t = v_arg.dtype();
-      if (t.is_handle()) {
-        std::ostringstream msg;
-        msg << name_hint << ": Expect arg[" << i << "] to be pointer";
-        seq_check.emplace_back(AssertStmt(tcode == kTVMOpaqueHandle || tcode == kTVMNDArrayHandle ||
-                                              tcode == kTVMDLTensorHandle || tcode == kTVMNullptr,
-                                          tvm::tir::StringImm(msg.str()), nop));
-      } else if (t.is_int() || t.is_uint()) {
-        std::ostringstream msg;
-        msg << name_hint << ": Expect arg[" << i << "] to be int";
-        seq_check.emplace_back(AssertStmt(tcode == kDLInt, tvm::tir::StringImm(msg.str()), nop));
-      } else {
-        ICHECK(t.is_float());
-        std::ostringstream msg;
-        msg << name_hint << ": Expect arg[" << i << "] to be float";
-        seq_check.emplace_back(AssertStmt(tcode == kDLFloat, tvm::tir::StringImm(msg.str()), nop));
-      }
+    // Value loads
+    seq_init.emplace_back(LetStmt(v_arg, f_arg_value(v_arg.dtype(), i), nop));
+    // type code checks
+    Var tcode(v_arg->name_hint + ".code", DataType::Int(32));
+    seq_init.emplace_back(
+        LetStmt(tcode, BufferLoad(buf_packed_arg_type_ids, {IntImm(DataType::Int(32), i)}), nop));
+    DataType t = v_arg.dtype();
+    if (t.is_handle()) {
+      std::ostringstream msg;
+      msg << name_hint << ": Expect arg[" << i << "] to be pointer";
+      seq_check.emplace_back(AssertStmt(tcode == kTVMOpaqueHandle || tcode == kTVMNDArrayHandle ||
+                                            tcode == kTVMDLTensorHandle || tcode == kTVMNullptr,
+                                        tvm::tir::StringImm(msg.str()), nop));
+    } else if (t.is_int() || t.is_uint()) {
+      std::ostringstream msg;
+      msg << name_hint << ": Expect arg[" << i << "] to be int";
+      seq_check.emplace_back(AssertStmt(tcode == kDLInt, tvm::tir::StringImm(msg.str()), nop));
     } else {
-      args.push_back(v_arg);
+      ICHECK(t.is_float());
+      std::ostringstream msg;
+      msg << name_hint << ": Expect arg[" << i << "] to be float";
+      seq_check.emplace_back(AssertStmt(tcode == kDLFloat, tvm::tir::StringImm(msg.str()), nop));
     }
   }
 
-  // allow return value if the function is packed.
-  if (pack_args) {
-    args.push_back(v_out_ret_value);
-    args.push_back(v_out_ret_tcode);
-    args.push_back(v_resource_handle);
-  }
-
-  size_t expected_nargs = num_unpacked_args + (pack_args ? 6 : 0);
-  ICHECK_EQ(args.size(), expected_nargs);
+  Array<Var> args{v_packed_args,     buf_packed_arg_type_ids->data,
+                  v_num_packed_args, v_out_ret_value,
+                  v_out_ret_tcode,   v_resource_handle};
 
   // Arg definitions are defined before buffer binding to avoid the use before
   // def errors.
@@ -286,9 +261,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
     binder.BindDLTensor(kv.second, device_type, device_id, kv.first, kv.first->name_hint);
   }
 
-  if (num_unpacked_args == 0) {
-    func = WithAttr(std::move(func), tvm::attr::kCallingConv, Integer(CallingConv::kCPackedFunc));
-  }
+  func = WithAttr(std::move(func), tvm::attr::kCallingConv, Integer(CallingConv::kCPackedFunc));
 
   Stmt body = RewriteReturn(func_ptr->body, v_out_ret_value, v_out_ret_tcode);
   body = AttrStmt(make_zero(DataType::Int(32)), attr::compute_scope,
@@ -307,16 +280,11 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
     }
   }
 
-  if (pack_args) {
-    std::ostringstream num_args_error;
-    num_args_error << name_hint << ": num_args should be " << num_packed_args;
-    std::vector<Stmt> arg_assert = {
-        MakeAssertEQ(v_num_packed_args, num_packed_args, num_args_error.str())};
-    func_ptr->body =
-        MergeNest({arg_assert, seq_init, binder.init_nest(), seq_check, binder.asserts()}, body);
-  } else {
-    func_ptr->body = MergeNest({seq_init, binder.init_nest(), seq_check, binder.asserts()}, body);
-  }
+  std::ostringstream num_args_error;
+  num_args_error << name_hint << ": num_args should be " << num_args;
+  std::vector<Stmt> arg_assert = {MakeAssertEQ(v_num_packed_args, num_args, num_args_error.str())};
+  func_ptr->body =
+      MergeNest({arg_assert, seq_init, binder.init_nest(), seq_check, binder.asserts()}, body);
   func_ptr->params = args;
 
   Array<Var> undefined = UndefinedVars(func_ptr->body, func_ptr->params);
@@ -339,9 +307,8 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
 
 namespace transform {
 
-Pass MakePackedAPI(int num_unpacked_args) {
-  // packed arguments anyway while `num_unpacked_args` is -1
-  auto pass_func = [num_unpacked_args](IRModule m, PassContext ctx) {
+Pass MakePackedAPI() {
+  auto pass_func = [](IRModule m, PassContext ctx) {
     IRModuleNode* mptr = m.CopyOnWrite();
     std::vector<std::pair<GlobalVar, PrimFunc>> updates;
 
@@ -350,7 +317,7 @@ Pass MakePackedAPI(int num_unpacked_args) {
         PrimFunc func = GetRef<PrimFunc>(n);
         if (func->GetAttr<Integer>(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) ==
             CallingConv::kDefault) {
-          auto updated_func = MakePackedAPI(std::move(func), num_unpacked_args);
+          auto updated_func = MakePackedAPI(std::move(func));
           updates.push_back({kv.first, updated_func});
         }
       }
@@ -365,7 +332,7 @@ Pass MakePackedAPI(int num_unpacked_args) {
   return tvm::transform::CreateModulePass(pass_func, 0, "tir.MakePackedAPI", {});
 }
 
-TVM_REGISTER_GLOBAL("tir.transform.MakePackedAPI").set_body_typed(MakePackedAPI);
+TVM_REGISTER_GLOBAL("tir.transform.MakePackedAPI").set_body_typed([]() { return MakePackedAPI(); });
 }  // namespace transform
 }  // namespace tir
 }  // namespace tvm
diff --git a/tests/python/unittest/test_tir_transform_make_packed_api.py b/tests/python/unittest/test_tir_transform_make_packed_api.py
index 047c95b6134f..e78ed98d8569 100644
--- a/tests/python/unittest/test_tir_transform_make_packed_api.py
+++ b/tests/python/unittest/test_tir_transform_make_packed_api.py
@@ -39,9 +39,8 @@ def test_makeapi():
         )
     )(mod)
 
-    num_unpacked_args = 2
-    f = tvm.tir.transform.MakePackedAPI(num_unpacked_args)(mod)["main"]
-    assert len(f.params) == 8
+    f = tvm.tir.transform.MakePackedAPI()(mod)["main"]
+    assert len(f.params) == 6
 
 
 def _find_assignment(stmt, var_name):

From 7dbc68d1087e2ade75314f8b0525e30fc5c6b801 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 27 Sep 2022 06:26:37 -0700
Subject: [PATCH 258/704] [ONNX] Fix test_roi_align failure (#12906)

RoiAlign-16 introduces coordinate_transformation_mode, which should be set to 'output_half_pixel' to omit the pixel shift for the input (for a backward-compatible behavior). This PR should fix the failure in https://ci.tlcpack.ai/job/docker-images-ci/job/docker-image-run-tests/231/testReport/junit/cython.tests.python.frontend.onnx/test_forward/Test___frontend__GPU_3_of_6___test_roi_align_cuda_/

Co-authored-by: Sevin F. Varoglu <sfvaroglu@octoml.ai>
Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                                | 20 ++++++++++----------
 ci/jenkins/Jenkinsfile.j2                  | 20 ++++++++++----------
 tests/python/frontend/onnx/test_forward.py |  1 +
 3 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index a61ab1cd69a2..c49eb66711c7 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -49,16 +49,16 @@
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20220908-060034-62bdc91b1'
-ci_gpu = 'tlcpack/ci-gpu:20220908-060034-62bdc91b1'
-ci_cpu = 'tlcpack/ci-cpu:20220908-060034-62bdc91b1'
-ci_minimal = 'tlcpack/ci-minimal:20220908-060034-62bdc91b1'
-ci_wasm = 'tlcpack/ci-wasm:20220908-060034-62bdc91b1'
-ci_i386 = 'tlcpack/ci-i386:20220908-060034-62bdc91b1'
-ci_cortexm = 'tlcpack/ci-cortexm:20220909-090211-cb08a1251'
-ci_arm = 'tlcpack/ci-arm:20220908-060034-62bdc91b1'
-ci_hexagon = 'tlcpack/ci-hexagon:20220908-060034-62bdc91b1'
-ci_riscv = 'tlcpack/ci-riscv:20220908-060034-62bdc91b1'
+ci_lint = 'tlcpack/ci-lint:20220925-060158-71f25b3d6'
+ci_gpu = 'tlcpack/ci-gpu:20220925-060158-71f25b3d6'
+ci_cpu = 'tlcpack/ci-cpu:20220925-060158-71f25b3d6'
+ci_minimal = 'tlcpack/ci-minimal:20220925-060158-71f25b3d6'
+ci_wasm = 'tlcpack/ci-wasm:20220925-060158-71f25b3d6'
+ci_i386 = 'tlcpack/ci-i386:20220925-060158-71f25b3d6'
+ci_cortexm = 'tlcpack/ci-cortexm:20220925-060158-71f25b3d6'
+ci_arm = 'tlcpack/ci-arm:20220925-060158-71f25b3d6'
+ci_hexagon = 'tlcpack/ci-hexagon:20220925-060158-71f25b3d6'
+ci_riscv = 'tlcpack/ci-riscv:20220925-060158-71f25b3d6'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index 6ba0c2df8efd..2fcbc9e7e042 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -51,16 +51,16 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 {% import 'ci/jenkins/macros.j2' as m with context -%}
 
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20220908-060034-62bdc91b1'
-ci_gpu = 'tlcpack/ci-gpu:20220908-060034-62bdc91b1'
-ci_cpu = 'tlcpack/ci-cpu:20220908-060034-62bdc91b1'
-ci_minimal = 'tlcpack/ci-minimal:20220908-060034-62bdc91b1'
-ci_wasm = 'tlcpack/ci-wasm:20220908-060034-62bdc91b1'
-ci_i386 = 'tlcpack/ci-i386:20220908-060034-62bdc91b1'
-ci_cortexm = 'tlcpack/ci-cortexm:20220909-090211-cb08a1251'
-ci_arm = 'tlcpack/ci-arm:20220908-060034-62bdc91b1'
-ci_hexagon = 'tlcpack/ci-hexagon:20220908-060034-62bdc91b1'
-ci_riscv = 'tlcpack/ci-riscv:20220908-060034-62bdc91b1'
+ci_lint = 'tlcpack/ci-lint:20220925-060158-71f25b3d6'
+ci_gpu = 'tlcpack/ci-gpu:20220925-060158-71f25b3d6'
+ci_cpu = 'tlcpack/ci-cpu:20220925-060158-71f25b3d6'
+ci_minimal = 'tlcpack/ci-minimal:20220925-060158-71f25b3d6'
+ci_wasm = 'tlcpack/ci-wasm:20220925-060158-71f25b3d6'
+ci_i386 = 'tlcpack/ci-i386:20220925-060158-71f25b3d6'
+ci_cortexm = 'tlcpack/ci-cortexm:20220925-060158-71f25b3d6'
+ci_arm = 'tlcpack/ci-arm:20220925-060158-71f25b3d6'
+ci_hexagon = 'tlcpack/ci-hexagon:20220925-060158-71f25b3d6'
+ci_riscv = 'tlcpack/ci-riscv:20220925-060158-71f25b3d6'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 17a0513844ba..da6f5785023d 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -4481,6 +4481,7 @@ def verify_roi_align(
 
         node = helper.make_node(
             "RoiAlign",
+            coordinate_transformation_mode="output_half_pixel",
             inputs=["X", "rois", "batch_indices"],
             outputs=["Y"],
             mode=mode,

From 77d8eef5148da6517e471b52fec61ab40ea2436d Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Tue, 27 Sep 2022 11:39:19 -0500
Subject: [PATCH 259/704] [Runtime][Bugfix] Added type-checking for
 Array::insert (#12691)

Prior to this commit, the following code would compile and run without
error.  This occurs because the typed `Array<T>::insert` calls the
untyped `ArrayNode::InitRange`, with no type-checking done before the
call.

```c++
Var x("x");
Var y("y");
Array<Var> var_arr{x, y};
Array<PrimExpr> expr_arr{x + 1, y + 2};
// Erroneously inserts static-type PrimExpr, runtime-type Add, even
// though neither PrimExpr is a type of Var.
var_arr.insert(var_arr.begin(), expr_arr.begin(), expr_arr.end());
```

After this commit, a `static_assert` in `Array<T>::insert` and in
`Array<T>::Array(IterType,IterTYpe)` restricts the iterators, such
that they must dereference to `T`, `Optional<T>`, a subclass of `T`,
or `Optional<U>` where `U` is a subclass of `T`.

The public method `ArrayNode::SetItem` exposes a similar issue.  In
the future, we may want to make it be private, accessed only through
type-safe method in `Array<T>::Set`.
---
 include/tvm/runtime/container/array.h | 5 +++++
 src/te/schedule/schedule_lang.cc      | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/tvm/runtime/container/array.h b/include/tvm/runtime/container/array.h
index 11bacb18e92c..1b735e73c386 100644
--- a/include/tvm/runtime/container/array.h
+++ b/include/tvm/runtime/container/array.h
@@ -325,6 +325,8 @@ class Array : public ObjectRef {
    */
   template <typename IterType>
   Array(IterType first, IterType last) {
+    static_assert(is_valid_iterator_v<T, IterType>,
+                  "IterType cannot be inserted into a tvm::Array<T>");
     Assign(first, last);
   }
 
@@ -481,6 +483,9 @@ class Array : public ObjectRef {
    */
   template <typename IterType>
   void insert(iterator position, IterType first, IterType last) {
+    static_assert(is_valid_iterator_v<T, IterType>,
+                  "IterType cannot be inserted into a tvm::Array<T>");
+
     if (first == last) {
       return;
     }
diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc
index 0fcd6133c4a2..e8f4f65eb651 100644
--- a/src/te/schedule/schedule_lang.cc
+++ b/src/te/schedule/schedule_lang.cc
@@ -200,7 +200,7 @@ Stage& Stage::env_threads(Array<IterVar> threads) {
   ICHECK_EQ(self->env_threads.size(), 0U) << "Already set env_threads";
   Array<IterVar>& leaf_vars = self->leaf_iter_vars;
   Array<IterVar>& all_vars = self->all_iter_vars;
-  std::vector<ObjectRef> temp;
+  std::vector<IterVar> temp;
   for (IterVar iv : threads) {
     temp.push_back(iv);
   }

From 9a673faa74ed7cd715a4e011716bcce3fd2158b6 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 27 Sep 2022 09:41:34 -0700
Subject: [PATCH 260/704] [ci] Initialize git during deploys (#12909)

We rely on some utilities scripts in the deploy steps so they also need
a git checkout.

Fixes issues like https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4345/pipeline

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                 | 14 ++++++++------
 ci/jenkins/Deploy.groovy.j2 | 10 ++++++----
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index c49eb66711c7..e964ac79a3ce 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-09-16T08:47:49.743918
+// Generated at 2022-09-26T10:48:49.577077
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -4246,7 +4246,8 @@ def deploy() {
       node('CPU') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docker") {
           timeout(time: max_time, unit: 'MINUTES') {
-            try {
+            init_git()
+                    try {
                       withCredentials([string(
                         credentialsId: 'dockerhub-tlcpackstaging-key',
                         variable: 'DOCKERHUB_KEY',
@@ -4290,10 +4291,11 @@ def deploy() {
       node('CPU') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/tag-images") {
           timeout(time: max_time, unit: 'MINUTES') {
-            withCredentials([string(
-                        credentialsId: 'dockerhub-tlcpack-key',
-                        variable: 'TLCPACK_TOKEN',
-                      )]) {
+            init_git()
+                    withCredentials([string(
+                      credentialsId: 'dockerhub-tlcpack-key',
+                      variable: 'TLCPACK_TOKEN',
+                    )]) {
                       try {
                         sh(
                           script: 'echo $TLCPACK_TOKEN | docker login --username octomldriazati --password-stdin',
diff --git a/ci/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2
index 9812e1113598..798af6736e1e 100644
--- a/ci/jenkins/Deploy.groovy.j2
+++ b/ci/jenkins/Deploy.groovy.j2
@@ -99,6 +99,7 @@ def deploy() {
           feature_flag="env.DEPLOY_DOCKER_IMAGES == 'yes' && rebuild_docker_images && upstream_revision != null",
           ws="tvm/deploy-docker",
         ) %}
+          init_git()
           try {
             withCredentials([string(
               credentialsId: 'dockerhub-tlcpackstaging-key',
@@ -130,10 +131,11 @@ def deploy() {
           feature_flag="env.DOCS_DEPLOY_ENABLED == 'yes'",
           ws="tvm/tag-images",
         ) %}
-            withCredentials([string(
-              credentialsId: 'dockerhub-tlcpack-key',
-              variable: 'TLCPACK_TOKEN',
-            )]) {
+          init_git()
+          withCredentials([string(
+            credentialsId: 'dockerhub-tlcpack-key',
+            variable: 'TLCPACK_TOKEN',
+          )]) {
             try {
               sh(
                 script: 'echo $TLCPACK_TOKEN | docker login --username octomldriazati --password-stdin',

From 332b1469b71cf7eb5e40ec385eb9664f3959643a Mon Sep 17 00:00:00 2001
From: Venkat Rasagna Komatireddy
 <89959097+rasagna-quic@users.noreply.github.com>
Date: Tue, 27 Sep 2022 22:50:13 +0530
Subject: [PATCH 261/704] [Hexagon] depth_to_space slice op (#12669)

hexagon slice depth_to_space op
---
 python/tvm/topi/hexagon/slice_ops/__init__.py |   1 +
 .../topi/hexagon/slice_ops/depth_to_space.py  |  43 ++++++
 .../test_hexagon/topi/test_depth_to_space.py  | 136 ++++++++++++++++++
 3 files changed, 180 insertions(+)
 create mode 100644 python/tvm/topi/hexagon/slice_ops/depth_to_space.py
 create mode 100644 tests/python/contrib/test_hexagon/topi/test_depth_to_space.py

diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py
index b96156dc46d2..5f86e706af50 100644
--- a/python/tvm/topi/hexagon/slice_ops/__init__.py
+++ b/python/tvm/topi/hexagon/slice_ops/__init__.py
@@ -35,3 +35,4 @@
 from .relu import relu_compute, relu_stir_schedule
 from .tanh import tanh_te_compute, tanhf16_schedule
 from .dwconv2d import *
+from .depth_to_space import d2s_compute, d2s_schedule
diff --git a/python/tvm/topi/hexagon/slice_ops/depth_to_space.py b/python/tvm/topi/hexagon/slice_ops/depth_to_space.py
new file mode 100644
index 000000000000..aa14a97f5ee9
--- /dev/null
+++ b/python/tvm/topi/hexagon/slice_ops/depth_to_space.py
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Compute and schedule for depth to space slice op
+"""
+
+from tvm import te, tir, topi
+from ..utils import get_layout_transform_fn
+
+
+def d2s_compute(inp, block_size, layout, mode):
+    """depth_to_space compute"""
+    return topi.nn.depth_to_space(inp, block_size=block_size, layout=layout, mode=mode)
+
+
+def d2s_schedule(inp, out, input_layout, output_layout):
+    """Schedule for depth to space: top level function"""
+    if (input_layout != output_layout) or (
+        output_layout not in ("nhwc-8h2w32c2w-2d", "nhwc-8h8w32c-2d")
+    ):
+        raise RuntimeError(
+            f"Unexpected input_layout, output_layout '{input_layout, output_layout}'"
+        )
+    d2s_func = te.create_prim_func([inp, out])
+    sch = tir.Schedule(d2s_func, debug_mask="all")
+    compute = sch.get_block("depth_to_space")
+    sch.transform_layout(compute, inp.name, get_layout_transform_fn(input_layout))
+    sch.transform_layout(compute, out.name, get_layout_transform_fn(output_layout))
+    return sch
diff --git a/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
new file mode 100644
index 000000000000..f74d13f641d5
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=line-too-long, redefined-outer-name
+
+"""Test depth_to_space slice op for hexagon"""
+
+import numpy as np
+import pytest
+
+import tvm
+from tvm import te
+import tvm.testing
+from tvm.topi.hexagon.slice_ops.depth_to_space import d2s_compute, d2s_schedule
+from tvm.topi.testing import depth_to_space_python
+
+from ..infrastructure import allocate_hexagon_array, transform_numpy
+
+
+d2s_fp16_tests = (
+    ((1, 8, 8, 256), 2, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+    ((1, 8, 8, 1024), 4, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+    ((1, 16, 16, 256), 2, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+    ((1, 16, 16, 1024), 4, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+    ((1, 8, 8, 256), 2, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+    ((1, 8, 8, 1024), 4, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+    ((1, 16, 16, 256), 2, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+    ((1, 16, 16, 1024), 4, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+)
+
+d2s_uint8_tests = (
+    ((1, 8, 8, 256), 2, "CDR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
+    ((1, 8, 8, 1024), 4, "CDR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
+    ((1, 8, 8, 256), 2, "DCR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
+    ((1, 8, 8, 1024), 4, "DCR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
+)
+
+
+class TestD2SSlice:
+    """Test class that defines the Depth to Space slice test"""
+
+    (input_shape, block_size, mode, dtype, input_layout, output_layout,) = tvm.testing.parameters(
+        *d2s_fp16_tests,
+        *d2s_uint8_tests,
+    )
+
+    working_scope = tvm.testing.parameter("global.vtcm")
+
+    @tvm.testing.fixture
+    def input_np(self, input_shape, dtype):
+        return np.random.uniform(size=input_shape).astype(dtype)
+
+    @tvm.testing.fixture
+    def transformed_input_np(self, input_np, input_layout):
+        return transform_numpy(input_np, "nhwc", input_layout)
+
+    @tvm.testing.fixture
+    def ref_output_np(self, input_np, block_size, mode):
+        a_np = np.transpose(input_np, axes=[0, 3, 1, 2])
+        ref_np = depth_to_space_python(a_np, block_size, mode=mode)
+        ref_np = np.transpose(ref_np, axes=[0, 2, 3, 1])
+        return ref_np
+
+    @tvm.testing.fixture
+    def transformed_ref_output_np(self, ref_output_np, output_layout):
+        return transform_numpy(ref_output_np, "nhwc", output_layout)
+
+    @tvm.testing.requires_hexagon
+    def test_d2s_slice(
+        self,
+        input_shape,
+        block_size,
+        mode,
+        dtype,
+        input_layout,
+        output_layout,
+        hexagon_session,
+        working_scope,
+        transformed_input_np,
+        transformed_ref_output_np,
+    ):
+        """Top level testing function for depth to space"""
+        Input = te.placeholder(input_shape, name="Input", dtype=dtype)
+
+        Output = d2s_compute(Input, block_size, "NHWC", mode)
+
+        target_hexagon = tvm.target.hexagon("v69")
+        target = tvm.target.Target(target_hexagon, host=target_hexagon)
+
+        tir_s = d2s_schedule(Input, Output, input_layout, output_layout)
+
+        input_data = allocate_hexagon_array(
+            hexagon_session.device,
+            data=transformed_input_np,
+            axis_separators=[4],
+            mem_scope=working_scope,
+        )
+        output_data = allocate_hexagon_array(
+            hexagon_session.device,
+            tensor_shape=transformed_ref_output_np.shape,
+            dtype=transformed_ref_output_np.dtype,
+            axis_separators=[4],
+            mem_scope=working_scope,
+        )
+        with tvm.transform.PassContext(opt_level=3):
+            runtime_module = tvm.build(
+                tir_s.mod, [Input, Output], target=target, name="depth_to_space"
+            )
+        mod = hexagon_session.load_module(runtime_module)
+
+        mod(input_data, output_data)
+        output_np = output_data.numpy()
+
+        tvm.testing.assert_allclose(
+            output_np,
+            transformed_ref_output_np,
+            1e-3,
+            1e-3,
+        )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 5a807e27c04a4dcb80ab35e8e601f5a9d5e78986 Mon Sep 17 00:00:00 2001
From: Janet Schneider <janetsc@octoml.ai>
Date: Tue, 27 Sep 2022 10:35:44 -0700
Subject: [PATCH 262/704] [Hexagon] [runtime] Add thread manager to resource
 management (#12905)

* Add thread manager to Acquire/ReleaseResources

* Change logging to debug logs

* Fix lint

* Increase pipe size
---
 src/runtime/hexagon/hexagon_device_api.h      | 26 ++++++++++++++++---
 .../hexagon/hexagon_device_api_tests.cc       | 11 ++++++++
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index b8861238771b..4f544faffba1 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -31,6 +31,7 @@
 
 #include "hexagon_buffer.h"
 #include "hexagon_buffer_manager.h"
+#include "hexagon_thread_manager.h"
 
 namespace tvm {
 namespace runtime {
@@ -54,17 +55,26 @@ class HexagonDeviceAPI final : public DeviceAPI {
   void AcquireResources() {
     CHECK_EQ(runtime_hexbuffs, nullptr);
     runtime_hexbuffs = std::make_unique<HexagonBufferManager>();
-    LOG(INFO) << "runtime_hexbuffs created";
+    DLOG(INFO) << "runtime_hexbuffs created";
     mgr = runtime_hexbuffs.get();
+
+    CHECK_EQ(runtime_threads, nullptr);
+    runtime_threads = std::make_unique<HexagonThreadManager>(threads, stack_size, pipe_size);
+    DLOG(INFO) << "runtime_threads created";
   }
 
   //! \brief Ensures all runtime resources are freed
   void ReleaseResources() {
+    CHECK(runtime_threads) << "runtime_threads was not created in AcquireResources";
+    runtime_threads.reset();
+    DLOG(INFO) << "runtime_threads reset";
+
+    CHECK(runtime_hexbuffs) << "runtime_hexbuffs was not created in AcquireResources";
     if (runtime_hexbuffs && !runtime_hexbuffs->empty()) {
-      LOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources";
+      DLOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources";
     }
     mgr = &hexbuffs;
-    LOG(INFO) << "runtime_hexbuffs reset";
+    DLOG(INFO) << "runtime_hexbuffs reset";
     runtime_hexbuffs.reset();
   }
 
@@ -139,6 +149,10 @@ class HexagonDeviceAPI final : public DeviceAPI {
    */
   void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final;
 
+  HexagonThreadManager* ThreadManager() {
+    return runtime_threads ? runtime_threads.get() : nullptr;
+  }
+
  protected:
   //! Standard Device API interface to copy data from one storage to another.
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
@@ -164,6 +178,12 @@ class HexagonDeviceAPI final : public DeviceAPI {
 
   //! \brief Current buffer manager
   HexagonBufferManager* mgr;
+
+  //! \brief Thread manager
+  std::unique_ptr<HexagonThreadManager> runtime_threads;
+  const unsigned threads{6};
+  const unsigned pipe_size{1000};
+  const unsigned stack_size{0x4000};  // 16KB
 };
 }  // namespace hexagon
 }  // namespace runtime
diff --git a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
index 1827c4059dea..b54e40e87958 100644
--- a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
@@ -164,3 +164,14 @@ TEST_F(HexagonDeviceAPITest, leak_resources) {
   hexapi->FreeDataSpace(hex_dev, pre_runtime_buf);
   hexapi->AcquireResources();
 }
+
+// Ensure thread manager is properly configured and destroyed
+// in Acquire/Release
+TEST_F(HexagonDeviceAPITest, thread_manager) {
+  HexagonThreadManager* threads = hexapi->ThreadManager();
+  CHECK(threads != nullptr);
+  hexapi->ReleaseResources();
+  threads = hexapi->ThreadManager();
+  CHECK(threads == nullptr);
+  hexapi->AcquireResources();
+}

From 82e6fc41f8069fdaf98991faee31e21f77e2cf8c Mon Sep 17 00:00:00 2001
From: Mohamad Katanbaf <mtkatanbaf@gmail.com>
Date: Tue, 27 Sep 2022 13:55:45 -0700
Subject: [PATCH 263/704] [microTVM] add the option to open a saved micro
 project for debugging (#12495)

* add the option to open a saved project for debugging.

* addressing comments

Co-authored-by: Mohamad <mkatanbaf@users.noreply.github.com>
---
 python/tvm/micro/build.py              | 26 ++++++++++--
 python/tvm/micro/session.py            | 56 ++++++++++++++++++--------
 python/tvm/micro/testing/evaluation.py | 39 ++++++++++++------
 3 files changed, 88 insertions(+), 33 deletions(-)

diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py
index 795a61edcbb3..92574ce2f8c2 100644
--- a/python/tvm/micro/build.py
+++ b/python/tvm/micro/build.py
@@ -20,7 +20,6 @@
 import json
 import logging
 import os
-import pathlib
 import contextlib
 import enum
 
@@ -115,23 +114,40 @@ class AutoTvmModuleLoader:
 
     Parameters
     ----------
-    template_project_dir : Union[pathlib.Path, str]
+    template_project_dir : Union[os.PathLike, str]
         project template path
 
     project_options : dict
         project generation option
+
+    project_dir: str
+        if use_existing is False: The path to save the generated microTVM Project.
+        if use_existing is True: The path to a generated microTVM Project for debugging.
+
+    use_existing: bool
+        skips the project generation and opens transport to the project at the project_dir address.
     """
 
     def __init__(
-        self, template_project_dir: Union[pathlib.Path, str], project_options: dict = None
+        self,
+        template_project_dir: Union[os.PathLike, str],
+        project_options: dict = None,
+        project_dir: Union[os.PathLike, str] = None,
+        use_existing: bool = False,
     ):
         self._project_options = project_options
+        self._use_existing = use_existing
 
-        if isinstance(template_project_dir, (pathlib.Path, str)):
+        if isinstance(template_project_dir, (os.PathLike, str)):
             self._template_project_dir = str(template_project_dir)
         elif not isinstance(template_project_dir, str):
             raise TypeError(f"Incorrect type {type(template_project_dir)}.")
 
+        if isinstance(project_dir, (os.PathLike, str)):
+            self._project_dir = str(project_dir)
+        else:
+            self._project_dir = None
+
     @contextlib.contextmanager
     def __call__(self, remote_kw, build_result):
         with open(build_result.filename, "rb") as build_file:
@@ -147,6 +163,8 @@ def __call__(self, remote_kw, build_result):
                 build_result_bin,
                 self._template_project_dir,
                 json.dumps(self._project_options),
+                self._project_dir,
+                self._use_existing,
             ],
         )
         system_lib = remote.get_function("runtime.SystemLib")()
diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py
index 967eaee62958..8a51f1082dda 100644
--- a/python/tvm/micro/session.py
+++ b/python/tvm/micro/session.py
@@ -20,7 +20,10 @@
 import json
 import logging
 import sys
-
+import os
+import pathlib
+import shutil
+from typing import Union
 from ..error import register_error
 from .._ffi import get_global_func, register_func
 from ..contrib import graph_executor
@@ -259,6 +262,8 @@ def compile_and_create_micro_session(
     mod_src_bytes: bytes,
     template_project_dir: str,
     project_options: dict = None,
+    project_dir: Union[os.PathLike, str] = None,
+    use_existing: bool = False,
 ):
     """Compile the given libraries and sources into a MicroBinary, then invoke create_micro_session.
 
@@ -275,25 +280,44 @@ def compile_and_create_micro_session(
 
     project_options: dict
         Options for the microTVM API Server contained in template_project_dir.
-    """
 
-    temp_dir = utils.tempdir()
-    # Keep temp directory for generate project
-    temp_dir.set_keep_for_debug(True)
-    model_library_format_path = temp_dir / "model.tar.gz"
-    with open(model_library_format_path, "wb") as mlf_f:
-        mlf_f.write(mod_src_bytes)
+    project_dir: Union[os.PathLike, str]
+        if use_existing is False: The path to save the generated microTVM Project.
+        if use_existing is True: The path to a generated microTVM Project for debugging.
 
-    try:
-        template_project = project.TemplateProject.from_directory(template_project_dir)
-        generated_project = template_project.generate_project_from_mlf(
-            model_library_format_path,
-            str(temp_dir / "generated-project"),
+    use_existing: bool
+        skips the project generation and opens transport to the project at the project_dir address.
+    """
+
+    if use_existing:
+        project_dir = pathlib.Path(project_dir)
+        assert project_dir.is_dir(), f"{project_dir} does not exist."
+        build_dir = project_dir / "generated-project" / "build"
+        shutil.rmtree(build_dir)
+        generated_project = project.GeneratedProject.from_directory(
+            project_dir / "generated-project",
             options=json.loads(project_options),
         )
-    except Exception as exception:
-        logging.error("Project Generate Error: %s", str(exception))
-        raise exception
+    else:
+        if project_dir:
+            temp_dir = utils.tempdir(custom_path=project_dir, keep_for_debug=True)
+        else:
+            temp_dir = utils.tempdir()
+
+        model_library_format_path = temp_dir / "model.tar.gz"
+        with open(model_library_format_path, "wb") as mlf_f:
+            mlf_f.write(mod_src_bytes)
+
+        try:
+            template_project = project.TemplateProject.from_directory(template_project_dir)
+            generated_project = template_project.generate_project_from_mlf(
+                model_library_format_path,
+                str(temp_dir / "generated-project"),
+                options=json.loads(project_options),
+            )
+        except Exception as exception:
+            logging.error("Project Generate Error: %s", str(exception))
+            raise exception
 
     generated_project.build()
     generated_project.flash()
diff --git a/python/tvm/micro/testing/evaluation.py b/python/tvm/micro/testing/evaluation.py
index c8a90ff5b40f..1d80ed5568b2 100644
--- a/python/tvm/micro/testing/evaluation.py
+++ b/python/tvm/micro/testing/evaluation.py
@@ -27,6 +27,7 @@
 from pathlib import Path
 from contextlib import ExitStack
 import tempfile
+import shutil
 
 import tvm
 from tvm.relay.op.contrib import cmsisnn
@@ -53,6 +54,7 @@ def tune_model(
         "project_type": "host_driven",
         **(project_options or {}),
     }
+
     module_loader = tvm.micro.AutoTvmModuleLoader(
         template_project_dir=tvm.micro.get_microtvm_template_projects(platform),
         project_options=project_options,
@@ -99,6 +101,7 @@ def create_aot_session(
     timeout_override=None,
     use_cmsis_nn=False,
     project_options=None,
+    use_existing=False,
 ):
     """AOT-compiles and uploads a model to a microcontroller, and returns the RPC session"""
 
@@ -125,21 +128,31 @@ def create_aot_session(
     parameter_size = len(tvm.runtime.save_param_dict(lowered.get_params()))
     print(f"Model parameter size: {parameter_size}")
 
-    project = tvm.micro.generate_project(
-        str(tvm.micro.get_microtvm_template_projects(platform)),
-        lowered,
-        build_dir / "project",
-        {
-            f"{platform}_board": board,
-            "project_type": "host_driven",
-            # {} shouldn't be the default value for project options ({}
-            # is mutable), so we use this workaround
-            **(project_options or {}),
-        },
-    )
+    project_options = {
+        f"{platform}_board": board,
+        "project_type": "host_driven",
+        # {} shouldn't be the default value for project options ({}
+        # is mutable), so we use this workaround
+        **(project_options or {}),
+    }
+
+    if use_existing:
+        shutil.rmtree(build_dir / "project" / "build")
+        project = tvm.micro.GeneratedProject.from_directory(
+            build_dir / "project",
+            options=project_options,
+        )
+
+    else:
+        project = tvm.micro.generate_project(
+            str(tvm.micro.get_microtvm_template_projects(platform)),
+            lowered,
+            build_dir / "project",
+            project_options,
+        )
+
     project.build()
     project.flash()
-
     return tvm.micro.Session(project.transport(), timeout_override=timeout_override)
 
 
From a07a46ed19edcb41ef72d47299dee0dbb336260e Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Tue, 27 Sep 2022 17:08:09 -0400
Subject: [PATCH 264/704] [TIR] add unit-tests for upcoming primfunc-slicing
 (#12794)

[TIR] Add disabled primfunc-slice unit tests

Add unit tests (initially disabled) to motivate upcoming work on
semi-automated slicing of primfuncs.  (I.e., extracting some
subtree of a primfunc body's TIR into a separate primfunc.)
---
 tests/python/unittest/test_slice_tir.py | 216 ++++++++++++++++++++++++
 1 file changed, 216 insertions(+)
 create mode 100644 tests/python/unittest/test_slice_tir.py

diff --git a/tests/python/unittest/test_slice_tir.py b/tests/python/unittest/test_slice_tir.py
new file mode 100644
index 000000000000..03cd8f67d6b2
--- /dev/null
+++ b/tests/python/unittest/test_slice_tir.py
@@ -0,0 +1,216 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+import tvm.testing
+from tvm.script import tir as T
+import pytest
+
+# ---------------------------------------------------------------------------------------------------
+# ABOUT THIS FILE:
+# ---------------------------------------------------------------------------------------------------
+# We (cconvey / OctoML) are working on a sequence of PRs to allow a single TIR primfunc's
+# AST to be sliced into multiple partitiones, where each partition will be converted into
+# a new TIR primfunc. (See https://en.wikipedia.org/wiki/Program_slicing).
+#
+# The unit tests below provide a roadmap for that sequence of PRs; each PR should allow
+# one more of these tests to pass.
+#
+# NOTE: These unit tests may change as work progresses.  They aren't meant to
+# indicate hard requirements.
+
+# NOTE! The `tvm.testing.CompareBeforeAfter` class provides TWO useful mechanisms for
+# these tests:
+#
+# (a) It lets us specify code snippets which are valid Python, but which aren't YET
+#     recognized as valid TVMScript.  This allows unit tests for new constructs,
+#     e.g. 'call_tir(...)' to simply be disabled rather than fully commented out.
+#
+# (b) It lets us structurally compare the TIR bodies of two primfuncs.
+#
+#     Note that some of the tests below will require the structural comparison of
+#     two entire IRModules, not just primfuncs.  This will require adding functionality
+#     to the `CompareBeforeAfter` class, or implementing that level of comparison within
+#     the individual unit tests.
+#
+# Some of the unit tests below which require whole-IRModule comparison.  For expedience
+# we simply comment out the (early draft) bodies of those unit tests, rather than
+# hacking their structure to get the benefits of (a).
+
+
+# ---------------------------------------------------------------------------------------------------
+# 'CALL_TIR' (AND RELATED) CAVEATS:
+# ---------------------------------------------------------------------------------------------------
+# (c) "call_tir" is a placeholder name.
+#     The TVM "Relax" effort also defines a node named "call_tir", which is likely
+#     become something different from what we're calling "call_tir" here.  So
+#     we may rename *this* "call_tir" during implementation.
+#
+# (d) For "call_tir" calls, the syntax/semantics for passing buffer regions is still
+#     an active area of development.  So that detail of these unit tests is likely
+#     to change.
+#
+# (e) The specific string "extract_as_subroutine" used to annotate some IR Blocks,
+#     i.e., `T.annotate("extract_as_subroutine", ...)`, may change as work progresses.
+
+
+# ---------------------------------------------------------------------------------------------------
+# step 1: Simply passes Python / TVMScript parsing.
+# ---------------------------------------------------------------------------------------------------
+#
+#   The only requirement for this test is that the TVMScript parser
+#   doesn't raise an error when encountering `T.call_tir(foo)`,
+#   where "foo" is a syntactically valid TVMScript function name.
+#
+#   NOTE! The role of this unit test should evolve as follows:
+#   1) Initially the test should fail, because we haven't yet changed the TVMScript
+#      parser to support 'call_tir'.
+#
+#   2) Initial TVMScript support for 'call_tir' will be minimal, essentially ignoring
+#      it.  This test should pass once that change is made.
+#
+#   3) As support for 'call_tir' becomes more complete, this test should once again
+#      fail, because the specified callee doesn't exist.  This test should be updated
+#      to once again expect failure.
+@pytest.mark.xfail(reason="Awaiting TVMScript support for 'call_tir' token.", strict=True)
+class TestParseCallTIR(tvm.testing.CompareBeforeAfter):
+    """
+    Simply confirm that the TIR node `call_tir` doesn't interfere with
+    the successful parsing of the TVMScript.
+    """
+
+    def before():
+        T.call_tir(add_one)
+        T.evalute(0)
+
+    def expected():
+        T.evaluate(0)
+
+    # Provide a trivial 'transform' pass to satisfy the requirements of
+    # tvm.testing.CompareBeforeAfter.
+    transform = tvm.tir.transform.prim_func_pass(lambda func, _mod, _ctx: func, 0)
+
+
+# ---------------------------------------------------------------------------------------------------
+# step 2: transform annotated block ==> separate primfuncs + call_tir
+#
+# NOTE: This early-draft version of the unit test contains pseudocode to compare entire IRModule
+# objects, analogously to how tvm.testing.CompareBeforeAfter compares two primfuncs.
+# TVM's testing infrastructure currently has no such functionality, and it will need to be added
+# (or approximated) to make this unit test useable.
+# ---------------------------------------------------------------------------------------------------
+@pytest.mark.xfail(
+    reason="Awaiting TVMScript support for 'call_tir' and T.annotation(\"extract_as_subroutine\").",
+    strict=True,
+)
+class TestAnnotateAndSliceTIR(tvm.testing.CompareBeforeAfter):
+    # def test_annotate_and_slice():
+    #    @tvm.script.ir_module
+    #    class irmod_before:
+    #        @T.prim_func
+    #        def main(A: T.Buffer[(1,), "int8"):
+    #            #A = T.match_buffer(a, (1,), "int8")
+    #            A[0] = 0
+    #            with T.block("block_foo"): # optional: give this block a name, perhaps for testing?
+    #                # NOTE: nice to have: human control over name used for the generated callee
+    #                T.annotate("extract_as_subroutine", "add_one")
+    #                A[0] += 1
+    #                return 42
+    #
+    #    @tvm.script.ir_module
+    #    class irmod_after:
+    #        @T.prim_func
+    #        def main():
+    #            A = T.buffer[[1], "int8"]
+    #            A[0] = 0
+    #            with T.block("block_foo"):
+    #                call_tir(add_one, A)
+    #
+    #        @T.prim_func
+    #        def add_one(X: T.buffer[[1], "int8"]):
+    #            X[0] += 1
+    pass
+
+
+# ---------------------------------------------------------------------------------------------------
+# step 3: transform call_tir ==> packed call
+# ---------------------------------------------------------------------------------------------------
+@pytest.mark.xfail(
+    reason="Awaiting TVMScript support for lowering of 'T.call_tir' to 'T.call_packed'.",
+    strict=True,
+)
+class TestLowerCallTir(tvm.testing.CompareBeforeAfter):
+    # @tvm.script.ir_module
+    # class test_lower_before:
+    #    @T.prim_func
+    #    def main():
+    #        A = T.buffer[[1], "int8"]
+    #        A[0] = 0
+    #        with T.block():
+    #            call_tir(add_one, A)
+    #
+    #    @T.prim_func
+    #    def add_one(X: T.buffer[[1], "int8"]):
+    #        X[0] += 1
+    #
+    # @tvm.script.ir_module
+    # class test_lower_after:
+    #    @T.prim_func
+    #    def main():
+    #        A = T.buffer[[1], "int8"]
+    #        A[0] = 0
+    #        with T.block():
+    #            # TODO: figure out the right TVMScript thing to do here
+    #            call_packed(add_one, A)  # not sure about this function / interface
+    #
+    #    @T.prim_func
+    #    def add_one(X: T.buffer[[1], "int8"]):
+    #        X[0] += 1
+    #
+    # TODO(cconvey): additional test logic needed.
+    # NOTE(lunderberg): Will also need a `transform` defined here.
+    #      I think we'll want it to occur in `tvm.tir.transform.MakePackedAPI`.
+    pass
+
+
+# ---------------------------------------------------------------------------------------------------
+# step 4: end-to-end functionality
+# ---------------------------------------------------------------------------------------------------
+
+
+@pytest.mark.xfail(reason="Awaiting end-to-end support for Primfunc slicing.", strict=True)
+class TestPrimfuncSlicingEndToEnd(tvm.testing.CompareBeforeAfter):
+    # @tvm.script.ir_module
+    # class test_annotate_before:
+    #    @T.prim_func
+    #    def main():
+    #        A = T.buffer[[1], "int8"]
+    #        A[0] = 0
+    #        with T.block(): # optional: give this block a name, perhaps for testing?
+    #            # NOTE: nice to have: human control over name used for the generated callee
+    #            T.annotate("extract_as_subroutine", "add_one")
+    #            A[0] += 1
+    #        assert(A[0] == 1)
+    #
+    # TODO(cconvey): additional test logic needed:
+    #     Starting with the IRModule shown above, end up with a running test that
+    #     module actually increments A[0] on Hexagon and x86-64 Linux.
+    #
+    # NOTE(lunderberg): We can use the function calls currently generated by `SplitHostDevice` as a template
+    #     (see https://github.com/apache/tvm/blob/9a673faa74ed7cd715a4e011716bcce3fd2158b6/src/tir/transforms/split_host_device.cc#L336).
+    #     Overall, we'll want to output a Call node with the operation builtin::tvm_call_packed().
+    pass

From bec9f16d42fc11ac97e0f01af007551398b025a2 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Tue, 27 Sep 2022 16:51:08 -0500
Subject: [PATCH 265/704] [TIR][Transform] Clear buffer_map during
 MakeUnpackedAPI (#12891)

* [TIR][Transform] Clear buffer_map during MakeUnpackedAPI

This mimics the behavior in `MakePackedAPI`, and is assumed to be the
case for some codegens.

* Remove read of buffer_map  in ethosu.tir_to_cs_translator

This previously relied on `MakeUnpackedAPI` preserving the
`PrimFunc::buffer_map`, even after it had been used for lowering.  It
now reads from the `BufferLoad` and `BufferStore` nodes to determine
buffer shapes.

* Added more documentation for MakePackedAPI/MakeUnpackedAPI
---
 .../relay/backend/contrib/ethosu/tir/utils.py | 30 +++++++++++++++
 .../contrib/ethosu/tir_to_cs_translator.py    | 37 +++++++++++++------
 python/tvm/tir/transform/transform.py         | 30 +++++++++++++++
 src/tir/transforms/make_unpacked_api.cc       |  7 +---
 4 files changed, 88 insertions(+), 16 deletions(-)

diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/utils.py b/python/tvm/relay/backend/contrib/ethosu/tir/utils.py
index a823667234df..396735a07c4c 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/utils.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/utils.py
@@ -158,6 +158,36 @@ def get_outer_loops(stmt, layout):
     return None
 
 
+def collect_buffer_map(stmt):
+    """Collect a map of Var -> Buffer
+
+    Generate a map from a buffer's backing `tir.Var` to the
+    `tir.Buffer` object that uses it.  If multiple such buffers exist,
+    return the first occurrence.
+
+    Parameters
+    ----------
+    stmt : tvm.tir.Stmt
+        The statement to get the BufferLoads from.
+
+    Returns
+    -------
+    buffer_map : Dict[Var, Buffer]
+        The map from buffer var to the buffers that use it.
+    """
+    buffer_map = {}
+
+    def _visit(node):
+        if isinstance(node, (tvm.tir.BufferLoad, tvm.tir.BufferStore)):
+            buf = node.buffer
+            if buf.data not in buffer_map:
+                buffer_map[buf.data] = buf
+
+    tvm.tir.stmt_functor.post_order_visit(stmt, _visit)
+
+    return buffer_map
+
+
 def get_loads(stmt):
     """Get the BufferLoad statements.
 
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py
index f5c8994bec77..19f009d284ab 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py
@@ -29,6 +29,7 @@
 from tvm.relay.backend.contrib.ethosu import util
 from tvm.relay.backend.contrib.ethosu import vela_api
 from tvm.relay.backend.contrib.ethosu.tir import spec
+from tvm.relay.backend.contrib.ethosu.tir import utils as tir_utils
 
 
 class BufferType(Enum):
@@ -254,26 +255,40 @@ def extract_param_base_addresses(mod, buffer_info, scratch_region_map) -> List[u
     assert len(mod.functions.items()) == 1
     primfunc = mod.functions.items()[0][1]
 
+    buffer_map = tir_utils.collect_buffer_map(primfunc.body)
+
     base_addresses = list()
     idx = 0
+
     for param in primfunc.params:
         # constants are pooled together and handled specially
         # this will change after tir.allocate_const.
         # For now, we are skipping generating buffer addresses here
         if buffer_info[param].btype == BufferType.constant:
             continue
-        buffer = primfunc.buffer_map[param]
-        dtype = buffer.dtype
-        element_size_bytes = np.iinfo(dtype).bits // 8
-        size_bytes = element_size_bytes * np.prod(list(buffer.shape))
-        base_addresses.append(
-            util.BaseAddress(
-                param.name.replace("-", "_"),
-                idx,
-                _get_region(buffer_info[param].btype, param, scratch_region_map),
-                size_bytes,
+
+        if param in buffer_map:
+            buffer = buffer_map[param]
+            dtype = buffer.dtype
+            element_size_bytes = np.iinfo(dtype).bits // 8
+            size_bytes = element_size_bytes * np.prod(list(buffer.shape))
+            base_addresses.append(
+                util.BaseAddress(
+                    param.name.replace("-", "_"),
+                    idx,
+                    _get_region(buffer_info[param].btype, param, scratch_region_map),
+                    size_bytes,
+                )
+            )
+        else:
+            base_addresses.append(
+                util.BaseAddress(
+                    param.name.replace("-", "_"),
+                    idx,
+                    _get_region(buffer_info[param].btype, param, scratch_region_map),
+                    0,
+                )
             )
-        )
         idx += 1
 
     return base_addresses
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
index 3c1ca196f1b0..d95d15c0dfbe 100644
--- a/python/tvm/tir/transform/transform.py
+++ b/python/tvm/tir/transform/transform.py
@@ -390,6 +390,26 @@ def LowerCustomDatatypes():
 def MakePackedAPI():
     """Transform the PrimFuncs in the module to a packed func API.
 
+    Prior to this pass, the PrimFunc may have Buffer arguments defined
+    in the `PrimFuncNode::buffer_map`.  This pass consumes the
+    `buffer_map`, using it to generate `TVMArgs` and `TVMRetValue*`
+    arguments that implement the `PackedFunc` API.
+
+    For static shapes, the `BufferNode::shape`, `BufferNode::strides`,
+    and `BufferNode::elem_offset` member variables are used to
+    generate runtime checks on the corresponding member variables in
+    the user-provided `DLTensor*` or `tvm.nd.array` argument.  (e.g. A
+    PrimFunc that accepts a buffer of shape `[16,32]` validates that
+    the `DLTensor::shape` array is `[16,32]`.)
+
+    For dynamic Buffers, in which one or more of these `BufferNode` member
+    variables use `tir.Var` that are not defined by other PrimFunc
+    parameters, these are instead used to define the variables based on
+    the corresponding `DLTensor` members.  (e.g. A PrimFunc that accepts a
+    buffer of shape `[tir.Var("n"), tir.Var("m")]`, when passed a
+    `DLTensor` of shape `[16,32]`, will define `n = 16` and `n=32`, based
+    on the argument's shape.
+
     Returns
     -------
     fpass : tvm.transform.Pass
@@ -401,6 +421,16 @@ def MakePackedAPI():
 def MakeUnpackedAPI():
     """Transform the PrimFuncs in the module to a C API compatible with internal calls.
 
+    Prior to this pass, the PrimFunc may have Buffer arguments defined in
+    the `PrimFuncNode::buffer_map`.  This pass consumes the `buffer_map`,
+    using it to generate `T*` arguments (e.g. `float32*`) that can be
+    directly called by a C API.
+
+    For static shapes, no runtime validation is performed to confirm that
+    the argument buffer's shape matches the expected shape.  For dynamic
+    shapes, `MakeUnpackedAPI` requires that the dynamic parameters be
+    passed as separate `tir.Var` parameters.
+
     Returns
     -------
     fpass : tvm.transform.Pass
diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc
index c57daeabbe1d..87e8f38895cd 100644
--- a/src/tir/transforms/make_unpacked_api.cc
+++ b/src/tir/transforms/make_unpacked_api.cc
@@ -59,16 +59,13 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) {
 
   // Collect variables and buffers to map between
   Array<Var> args;
-  Map<Var, Buffer> new_buffer_map;
+
   for (const Var& param : func->params) {
     // Ideally all func params should have Buffers defined in the buffer_map
     // We should look to insert buffer_maps for all PrimFuncs that are returned
     // to the core compiler.
     if (func->buffer_map.find(param) != func->buffer_map.end()) {
       args.push_back(func->buffer_map[param]->data);
-      // Rewiring the buffer_var to map to Buffers for low-level passes
-      // retain information about the buffer.
-      new_buffer_map.Set(func->buffer_map[param]->data, func->buffer_map[param]);
     } else {
       args.push_back(param);
     }
@@ -82,7 +79,7 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) {
   func_ptr->body = MergeNest(device_init, func_ptr->body);
   func_ptr->params = args;
   func_ptr->ret_type = PrimType(DataType::Int(32));
-  func_ptr->buffer_map = new_buffer_map;
+  func_ptr->buffer_map = Map<Var, Buffer>();
 
   // return the function.
   return std::move(func);

From c89a8baeeb0e76eb67a38651cc8a5829195f9a6b Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Tue, 27 Sep 2022 16:11:14 -0700
Subject: [PATCH 266/704] [usmp] Also remap VarNode to USMP-allocated buffer
 (#12880)

Before this patch, ConvertPoolAllocationsToOffsets would generate TIR
like the following:

  let dense_let: Pointer(global int32) = @tir.address_of(global_workspace_37_buffer_var[69952], dtype=handle)
  for (k.outer: int32, 0, 64) {
    @tir.call_extern("gemm_1x1x1_update_UKVNAEBL", ..., dense, ...)
  }

  T_multiply[ax1] = @tir.q_multiply_shift(((dense: Buffer(dense_let,
      int32, [10], [], align=32)[ax1], ...)

This caused CodegenSourceBase to later fail with this error:
  "src/target/source/codegen_source_base.cc", line 67
  Check failed: (it != var_idmap_.end()) is false: Find undefined
    Variable dense

After this patch, "dense" in the call_extern is changed to read "dense_let."
---
 src/tir/usmp/analysis/extract_buffer_info.cc  | 20 ++--
 .../convert_pool_allocations_to_offsets.cc    | 10 ++
 ...orm_convert_pool_allocations_to_offsets.py | 93 +++++++++++++++++++
 3 files changed, 114 insertions(+), 9 deletions(-)

diff --git a/src/tir/usmp/analysis/extract_buffer_info.cc b/src/tir/usmp/analysis/extract_buffer_info.cc
index 74d428f6dddf..268058945750 100644
--- a/src/tir/usmp/analysis/extract_buffer_info.cc
+++ b/src/tir/usmp/analysis/extract_buffer_info.cc
@@ -429,15 +429,17 @@ void BufferInfoExtractor::VisitExpr_(const VarNode* op) {
 
 Array<Var> static GetMatchedBuffers(const PrimFunc& func) {
   Array<Var> buffer_vars;
-  for (unsigned int i = 0; i < func->params.size() - 1; i++) {
-    Var param = func->params[i];
-    buffer_vars.push_back(func->buffer_map[param]->data);
-  }
-  Var last_param = func->params.back();
-  // Checks whether last var is present in the buffer map
-  // because it could be the resource handle
-  if (func->buffer_map.find(last_param) != func->buffer_map.end()) {
-    buffer_vars.push_back(func->buffer_map[last_param]->data);
+  if (func->params.size() > 0) {
+    for (unsigned int i = 0; i < func->params.size() - 1; i++) {
+      Var param = func->params[i];
+      buffer_vars.push_back(func->buffer_map[param]->data);
+    }
+    Var last_param = func->params.back();
+    // Checks whether last var is present in the buffer map
+    // because it could be the resource handle
+    if (func->buffer_map.find(last_param) != func->buffer_map.end()) {
+      buffer_vars.push_back(func->buffer_map[last_param]->data);
+    }
   }
   return buffer_vars;
 }
diff --git a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc
index 601e34719632..56aba654b59e 100644
--- a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc
+++ b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc
@@ -96,6 +96,7 @@ class PoolAllocationToOffsetConverter : public StmtExprMutator {
  private:
   PrimExpr VisitExpr_(const CallNode* op) override;
   Stmt VisitStmt_(const AllocateNode* op) override;
+  PrimExpr VisitExpr_(const VarNode* op) override;
   PrimExpr VisitExpr_(const BufferLoadNode* op) override;
   Stmt VisitStmt_(const BufferStoreNode* op) override;
 
@@ -395,6 +396,15 @@ PrimExpr PoolAllocationToOffsetConverter::VisitExpr_(const BufferLoadNode* op) {
   return std::move(load);
 }
 
+PrimExpr PoolAllocationToOffsetConverter::VisitExpr_(const VarNode* op) {
+  auto it = allocate_var_to_let_var_.find(GetRef<Var>(op));
+  if (it != allocate_var_to_let_var_.end()) {
+    return (*it).second;
+  }
+
+  return StmtExprMutator::VisitExpr_(op);
+}
+
 Buffer PoolAllocationToOffsetConverter::GetRemappedBuffer(Buffer original) {
   {
     auto it = original_buf_to_let_buf_.find(original);
diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
index fdda400a779f..31cc6e07dec3 100644
--- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
+++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
@@ -600,5 +600,98 @@ def test_resnet_subgraph():
         tvm.ir.assert_structural_equal(actual_func, ref_func)
 
 
+@tvm.script.ir_module
+class TensorIntrinStructure:
+    @T.prim_func
+    def tensor_intrin_primfunc() -> None:
+        dense_data = T.allocate([10], "int32", "global")
+        T.evaluate(
+            T.call_extern(
+                "intrin_function",
+                T.tvm_access_ptr(
+                    T.type_annotation(dtype="int32"), dense_data, 0, 1, 2, dtype="handle"
+                ),
+                dtype="int32",
+            )
+        )
+
+        dense = T.buffer_decl([10], "int32", data=dense_data)
+        dense[0] = T.q_multiply_shift(dense[0], 1608879842, 31, -7, dtype="int32")
+
+    @T.prim_func
+    def __tvm_main__(input: T.handle, output: T.handle) -> None:
+        T.evaluate(T.call_extern("tensor_intrin_primfunc", dtype="int32"))
+
+
+@tvm.script.ir_module
+class TensorIntrinStructurePlanned:
+    @T.prim_func
+    def tensor_intrin_primfunc(global_workspace_1_var: T.Ptr[T.uint8]) -> None:
+        global_workspace_1_buffer_var = T.match_buffer(
+            global_workspace_1_var, [40], dtype="uint8", strides=[1], elem_offset=0, align=16
+        )
+        T.preflattened_buffer(
+            global_workspace_1_buffer_var, [40], dtype="uint8", strides=[1], elem_offset=0, align=16
+        )
+        dense_let = T.buffer_decl([10], "int32")
+        with T.let(dense_let.data, T.address_of(global_workspace_1_buffer_var[0], dtype="handle")):
+            T.evaluate(
+                T.call_extern(
+                    "intrin_function",
+                    T.tvm_access_ptr(
+                        T.type_annotation(dtype="int32"), dense_let.data, 0, 1, 2, dtype="handle"
+                    ),
+                    dtype="int32",
+                )
+            )
+            dense_let[0] = T.q_multiply_shift(dense_let[0], 1608879842, 31, -7, dtype="int32")
+
+    @T.prim_func
+    def __tvm_main__(
+        input: T.handle, global_workspace_1_var: T.Ptr[T.uint8], output: T.handle
+    ) -> None:
+        global_workspace_1_buffer_var = T.match_buffer(
+            global_workspace_1_var, [40], dtype="uint8", strides=[1], elem_offset=0, align=16
+        )
+        T.evaluate(
+            T.call_extern(
+                "tensor_intrin_primfunc", global_workspace_1_buffer_var.data, dtype="int32"
+            )
+        )
+
+
+def test_tensor_intrin():
+    target = Target("c")
+    global_workspace_pool = WorkspacePoolInfo(
+        "global_workspace",
+        [target],
+    )
+
+    tir_mod = TensorIntrinStructure
+    tir_mod = _assign_targets_to_primfuncs_irmodule(tir_mod, target)
+    tir_mod = assign_poolinfos_to_allocates_in_irmodule(tir_mod, [global_workspace_pool])
+    main_func = tir_mod["__tvm_main__"]
+    buffer_analysis = tvm.tir.usmp.analysis.extract_buffer_info(main_func, tir_mod)
+    buffer_info_map = buffer_analysis.buffer_info_stmts
+
+    fcreate_array_bi = tvm.get_global_func("tir.usmp.CreateArrayBufferInfo")
+    buffer_info_arr = fcreate_array_bi(buffer_info_map)
+    fusmp_algo_greedy_by_size = tvm.get_global_func("tir.usmp.algo.greedy_by_size")
+    buffer_pool_allocations = fusmp_algo_greedy_by_size(
+        buffer_info_arr, buffer_analysis.memory_pressure
+    )
+    fassign_stmt_pool_allocations = tvm.get_global_func("tir.usmp.AssignStmtPoolAllocations")
+    pool_allocations = fassign_stmt_pool_allocations(buffer_info_map, buffer_pool_allocations)
+    tir_mod_with_offsets = tvm.tir.usmp.transform.convert_pool_allocations_to_offsets(
+        pool_allocations, emit_tvmscript_printable=True
+    )(tir_mod)
+
+    expected = TensorIntrinStructurePlanned
+
+    for gv, ref_func in expected.functions.items():
+        actual_func = tir_mod_with_offsets[gv.name_hint]
+        tvm.ir.assert_structural_equal(actual_func, ref_func)
+
+
 if __name__ == "__main__":
     pytest.main([__file__] + sys.argv[1:])

From 178f82dc481bf31961206412c22dd5519a245b49 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Tue, 27 Sep 2022 16:49:31 -0700
Subject: [PATCH 267/704] [TOPI] Implement Einsum with reduction axes (#12913)

* [TOPI] Implement Einsum with reduction axes

* address comments
---
 include/tvm/topi/einsum.h                    | 889 +------------------
 src/relay/op/tensor/math.cc                  |   2 +-
 src/topi/einsum.cc                           | 353 ++++++++
 src/topi/transform.cc                        |   4 -
 tests/python/topi/python/test_topi_einsum.py |  36 +-
 5 files changed, 397 insertions(+), 887 deletions(-)
 create mode 100644 src/topi/einsum.cc

diff --git a/include/tvm/topi/einsum.h b/include/tvm/topi/einsum.h
index a0c4039909ad..5e7813f8431b 100644
--- a/include/tvm/topi/einsum.h
+++ b/include/tvm/topi/einsum.h
@@ -49,568 +49,6 @@ namespace topi {
 using namespace tvm::te;
 using namespace topi::detail;
 
-/*!
- * \brief Compute the stride of the given shape.
- *
- * \param shape for the operation.
- *
- * \return the stride of the shape.
- */
-inline Array<PrimExpr> GetStride(const Array<PrimExpr> shape) {
-  size_t ndim = shape.size();
-  int prod = 1;
-  Array<PrimExpr> stride = Array<PrimExpr>(ndim, -1);
-  for (int i = ndim - 1; i >= 0; i--) {
-    stride.Set(i, if_then_else(shape[i] > 1, prod, 0));
-    prod = prod * GetConstInt(shape[i]);
-  }
-  return stride;
-}
-
-/*!
- * \brief Pad the shape with 1.
- *
- * \param shape the input shape to be padded
- * \param odim the padding size of the objective shape.
- *
- * \return the padded shape.
- */
-inline Array<PrimExpr> Pad(const Array<PrimExpr> shape, int odim) {
-  int ndim = shape.size();
-  CHECK_GE(odim, ndim);
-  Array<PrimExpr> ret(static_cast<size_t>(odim), 1);
-  for (int idim = 0; idim < ndim; ++idim) {
-    ret.Set(idim, shape[idim]);
-  }
-  return ret;
-}
-
-/*!
- * \brief Parse the subscripts for one operand into an output of 'ndim' labels.
- *
- * \param subscripts the subscripts for to be parsed.
- * \param length subscripts[0: length] represents the current operand.
- * \param ndim the ndim of current operand.
- * \param iop the index of the operand.
- * \param op_labels the parsing result.
- *        For Example:
- *           subscripts="abbcbc",  ndim=6 -> op_labels=[97, 98, -1, 99, -3, -2].
- *           subscripts="ab...bc", ndim=6 -> op_labels=[97, 98, 0, 0, -3, 99].
- * \param label_counts Count the number the label appears.
- * \param min_label Save the minimal label according to ASCII.
- * \param max_label Save the maximal label according to ASCII.
- *
- * \return 0.
- */
-inline int ParseOperandSubscripts(const char* subscripts, int length, int ndim, int iop,
-                                  char* op_labels, char* label_counts, int* min_label,
-                                  int* max_label) {
-  int i;
-  int idim = 0;
-  int ellipsis = -1;
-
-  /* Process all labels for this operand */
-  for (i = 0; i < length; ++i) {
-    int label = subscripts[i];
-
-    /* A proper label for an axis. */
-    if (label > 0 && isalpha(label)) {
-      /* Check we don't exceed the operator dimensions. */
-      CHECK(idim < ndim) << "einstein sum subscripts string contains "
-                         << "too many subscripts for operand " << iop;
-
-      op_labels[idim++] = label;
-      if (label < *min_label) {
-        *min_label = label;
-      }
-      if (label > *max_label) {
-        *max_label = label;
-      }
-      label_counts[label]++;
-    } else if (label == '.') {
-      /* The beginning of the ellipsis. */
-      /* Check it's a proper ellipsis. */
-      CHECK(
-          !(ellipsis != -1 || i + 2 >= length || subscripts[++i] != '.' || subscripts[++i] != '.'))
-          << "einstein sum subscripts string contains a "
-          << "'.' that is not part of an ellipsis ('...') "
-          << "in operand " << iop;
-
-      ellipsis = idim;
-    } else {
-      CHECK(label == ' ') << "invalid subscript '" << static_cast<char>(label)
-                          << "' in einstein sum "
-                          << "subscripts string, subscripts must "
-                          << "be letters";
-    }
-  }
-
-  /* No ellipsis found, labels must match dimensions exactly. */
-  if (ellipsis == -1) {
-    CHECK(idim == ndim) << "operand has more dimensions than subscripts "
-                        << "given in einstein sum, but no '...' ellipsis "
-                        << "provided to broadcast the extra dimensions.";
-  } else if (idim < ndim) {
-    /* Ellipsis found, may have to add broadcast dimensions. */
-    /* Move labels after ellipsis to the end. */
-    for (i = 0; i < idim - ellipsis; ++i) {
-      op_labels[ndim - i - 1] = op_labels[idim - i - 1];
-    }
-    /* Set all broadcast dimensions to zero. */
-    for (i = 0; i < ndim - idim; ++i) {
-      op_labels[ellipsis + i] = 0;
-    }
-  }
-
-  /*
-   * Find any labels duplicated for this operand, and turn them
-   * into negative offsets to the axis to merge with.
-   *
-   * In C, the char type may be signed or unsigned, but with
-   * twos complement arithmetic the char is ok either way here, and
-   * later where it matters the char is cast to a signed char.
-   */
-  for (idim = 0; idim < ndim - 1; ++idim) {
-    int label = op_labels[idim];
-    /* If it is a proper label, find any duplicates of it. */
-    if (label > 0) {
-      /* Search for the next matching label. */
-      char* next = reinterpret_cast<char*>(memchr(op_labels + idim + 1, label, ndim - idim - 1));
-
-      while (next != nullptr) {
-        /* The offset from next to op_labels[idim] (negative). */
-        *next = static_cast<char>((op_labels + idim) - next);
-        /* Search for the next matching label. */
-        next = reinterpret_cast<char*>(memchr(next + 1, label, op_labels + ndim - 1 - next));
-      }
-    }
-  }
-  return 0;
-}
-
-/*!
- * \brief Parse the subscripts for the output into an output that includes 'ndim_broadcast'
- *        unlabeled dimensions.
- *
- * \param subscripts the subscripts for to be parsed.
- * \param length subscripts[0: length] represents the output operand.
- * \param ndim_broadcast the broadcast dimension number.
- * \param label_counts Count the number the label appears.
- * \param out_labels similar to the op_labels in ParseOperandSubscripts, for each
- *        dimension, the ASCII code of the corresponding label. zero for the broadcasting dim.
- *
- * \return the total number of output dimensions or -1 if there is an error.
- */
-inline int ParseOutputSubscripts(const char* subscripts, int length, int ndim_broadcast,
-                                 const char* label_counts, char* out_labels) {
-  int i, bdim;
-  int ndim = 0;
-  int ellipsis = 0;
-
-  /* Process all the output labels. */
-  for (i = 0; i < length; ++i) {
-    int label = subscripts[i];
-
-    /* A proper label for an axis. */
-    if (label > 0 && isalpha(label)) {
-      /* Check that it doesn't occur again. */
-      CHECK(memchr(subscripts + i + 1, label, length - i - 1) == nullptr)
-          << "einstein sum subscripts string includes "
-          << "output subscript '" << static_cast<char>(label) << "' multiple times";
-
-      /* Check that it was used in the inputs. */
-      CHECK(label_counts[label] != 0)
-          << "einstein sum subscripts string included "
-          << "output subscript '" << static_cast<char>(label) << "' which never appeared "
-          << "in an input";
-
-      /* Check that there is room in out_labels for this label. */
-      CHECK(ndim < NPY_MAXDIMS) << "einstein sum subscripts string contains "
-                                << "too many subscripts in the output";
-
-      out_labels[ndim++] = label;
-    } else if (label == '.') {
-      /* The beginning of the ellipsis. */
-      /* Check it is a proper ellipsis. */
-      CHECK(!(ellipsis || i + 2 >= length || subscripts[++i] != '.' || subscripts[++i] != '.'))
-          << "einstein sum subscripts string "
-          << "contains a '.' that is not part of "
-          << "an ellipsis ('...') in the output";
-
-      /* Check there is room in out_labels for broadcast dims. */
-      CHECK(ndim + ndim_broadcast <= NPY_MAXDIMS) << "einstein sum subscripts string contains "
-                                                  << "too many subscripts in the output";
-
-      ellipsis = 1;
-      for (bdim = 0; bdim < ndim_broadcast; ++bdim) {
-        out_labels[ndim++] = 0;
-      }
-    } else {
-      CHECK(label == ' ') << "invalid subscript '" << static_cast<char>(label)
-                          << "' in einstein sum "
-                          << "subscripts string, subscripts must "
-                          << "be letters";
-    }
-  }
-
-  /* If no ellipsis was found there should be no broadcast dimensions. */
-  CHECK(!(!ellipsis && ndim_broadcast > 0)) << "output has more dimensions than subscripts "
-                                            << "given in einstein sum, but no '...' ellipsis "
-                                            << "provided to broadcast the extra dimensions.";
-
-  return ndim;
-}
-
-/*!
- * \brief If any dimensions are combined, create a view that combines them.
- *        Shows in newshape and newstride.
- *
- * \param op the operand tensor.
- * \param iop the index of the operand.
- * \param labels the op_labels fot the operand. Like [97, 98, -2] for "aba".
- * \param newshape The combined shape.
- * \param newstride The combined stride.
- *
- * For example:
- *  "aba -> ab",              shape = [2,3,2] stride = [6,2,1]
- *  op_labels = [97, 98, -2], newshape = [2,3], newstride = [7,2]
- */
-inline void GetCombinedDimsView(const Tensor& op, int iop, char* labels, Array<PrimExpr>* newshape,
-                                Array<PrimExpr>* newstride) {
-  int idim, ndim, icombine, combineoffset;
-  int icombinemap[NPY_MAXDIMS];
-  int newdim;
-
-  Array<PrimExpr> shape = op->shape;
-  Array<PrimExpr> stride = GetStride(shape);
-  ndim = op.ndim();
-  newdim = newshape->size();
-
-  /* Initialize the dimensions and strides to zero */
-  for (idim = 0; idim < newdim; ++idim) {
-    newshape->Set(idim, 0);
-    newstride->Set(idim, 0);
-  }
-
-  /* Copy the dimensions and strides, except when collapsing */
-  icombine = 0;
-  for (idim = 0; idim < ndim; ++idim) {
-    /*
-     * The char type may be either signed or unsigned, we
-     * need it to be signed here.
-     */
-    int label = (signed char)labels[idim];
-    /* If this label says to merge axes, get the actual label */
-    if (label < 0) {
-      combineoffset = label;
-      label = labels[idim + label];
-    } else {
-      combineoffset = 0;
-      if (icombine != idim) {
-        labels[icombine] = labels[idim];
-      }
-      icombinemap[idim] = icombine;
-    }
-    /* If the label is 0, it's an unlabeled broadcast dimension */
-    if (label == 0) {
-      newshape->Set(icombine, shape[idim]);
-      newstride->Set(icombine, stride[idim]);
-    } else {
-      /* Update the combined axis dimensions and strides */
-      int i = icombinemap[idim + combineoffset];
-      CHECK(!((combineoffset < 0) &&
-              GetConstInt((*newshape)[i] != 0 && (*newshape)[i] != shape[idim])))
-          << "dimensions in operand " << iop << " for collapsing index '" << label
-          << "' don't match (" << GetConstInt((*newshape)[i]) << " != " << shape[idim] << ")";
-      newshape->Set(i, shape[idim]);
-      newstride->Set(i, (*newstride)[i] + stride[idim]);
-    }
-
-    /* If the label didn't say to combine axes, increment dest i */
-    if (combineoffset == 0) {
-      icombine++;
-    }
-  }
-}
-
-/*!
- * \brief Prepare the operand axes to match each stride or shape pair.
- *
- * \param ndim the ndim of the operand tensor.
- * \param iop the index of the operand.
- * \param labels the op_labels fot the operand. [97, 98, -1, 99, -3, -2] for "abbcbc".
- * \param axes The matched axes to be calculated.
- * \param ndim_iter the dimension of iterating. Subscripts "ab, bc -> ac" ndim_iter = 3.
- * \param iter_labels output_labels with the iterating label. ['a', 'c', 'b'] for the case above.
- */
-inline static int PrepareOpAxes(int ndim, int iop, char* labels, int* axes, int ndim_iter,
-                                char* iter_labels) {
-  int i, label, ibroadcast;
-
-  ibroadcast = ndim - 1;
-  for (i = ndim_iter - 1; i >= 0; --i) {
-    label = iter_labels[i];
-    /*
-     * If it's an unlabeled broadcast dimension, choose
-     * the next broadcast dimension from the operand.
-     */
-    if (label == 0) {
-      while (ibroadcast >= 0 && labels[ibroadcast] != 0) {
-        --ibroadcast;
-      }
-      /*
-       * If we used up all the operand broadcast dimensions,
-       * extend it with a "newaxis"
-       */
-      if (ibroadcast < 0) {
-        axes[i] = -1;
-      } else {
-        /* Otherwise map to the broadcast axis */
-        axes[i] = ibroadcast;
-        --ibroadcast;
-      }
-    } else {
-      /* It's a labeled dimension, find the matching one */
-      char* match = reinterpret_cast<char*>(memchr(labels, label, ndim));
-      /* If the op doesn't have the label, broadcast it */
-      if (match == nullptr) {
-        axes[i] = -1;
-      } else {
-        /* Otherwise use it */
-        axes[i] = match - labels;
-      }
-    }
-  }
-  return 0;
-}
-
-/*!
- * \brief Count SubString.
- * \param str the object string
- * \param sub the pattern string
- *
- * \return number of substring
- */
-inline int CountSubstring(const std::string& str, const std::string& sub) {
-  int count = 0;
-  std::string::size_type pos = 0;
-  while ((pos = str.find(sub, pos)) != std::string::npos) {
-    ++count;
-    pos += sub.length();
-  }
-  return count;
-}
-
-/*!
- * \brief Transfer string to.
- * \param str input string.
- *
- * \return bitset.
- */
-inline std::bitset<LABELRANGE> Str2Set(const std::string& str) {
-  std::bitset<LABELRANGE> ret;
-  for (const char& c : str) {
-    ret.set(static_cast<int>(c));
-  }
-  return ret;
-}
-
-/*!
- * \brief Split str according to substring.
- * \param str input string.
- * \param sub the split pattern string.
- *
- * \return vector contains the splited substring.
- */
-inline std::vector<std::string> Split(const std::string& str, const std::string& sub) {
-  std::string::size_type pos = 0;
-  std::string::size_type start = 0;
-  std::vector<std::string> ret;
-  while ((pos = str.find(sub, start)) != std::string::npos) {
-    ret.push_back(str.substr(start, pos - start));
-    start = pos + sub.length();
-  }
-  ret.push_back(str.substr(start));
-  return ret;
-}
-
-/*!
- * \brief Parse the input subscripts into a vector of strings.
- * \param subscripts input subscripts.
- * \param operands operand tensors.
- *
- * \return vector of strings, vector[0] represents the input part, vector[1] represents the output.
- * if no output, the vector[1] is NULL.
- * "ab, bc -> ac" => ["ab,bc", "ac"]
- */
-inline std::tuple<std::string, std::string> ParseEinsumInput(
-    std::string subscripts, const std::vector<Array<PrimExpr>>& operands) {
-  const std::string einsum_symbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
-  std::bitset<LABELRANGE> einsum_symbols_set;
-  for (const char& c : einsum_symbols) {
-    einsum_symbols_set.set(c);
-  }
-
-  CHECK_NE(operands.size(), 0U) << "No input operands";
-
-  auto end_pos = std::remove(subscripts.begin(), subscripts.end(), ' ');
-  subscripts.erase(end_pos, subscripts.end());
-
-  // Ensure all characters are valid
-  for (const char& c : subscripts) {
-    if (c == '.' || c == ',' || c == '-' || c == '>') {
-      continue;
-    }
-    CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol.";
-  }
-
-  // Check for proper "->"
-  if (subscripts.find('-') != std::string::npos || subscripts.find('>') != std::string::npos) {
-    bool invalid = (std::count(subscripts.begin(), subscripts.end(), '-') > 1 ||
-                    std::count(subscripts.begin(), subscripts.end(), '>') > 1);
-    CHECK(!invalid && CountSubstring(subscripts, "->") == 1)
-        << "Subscripts can only contain one '->'.";
-  }
-
-  // Parse ellipses
-  if (subscripts.find('.') != std::string::npos) {
-    std::string used = subscripts;
-    used.erase(
-        std::remove_if(used.begin(), used.end(),
-                       [](const char& c) { return c == '.' || c == ',' || c == '-' || c == '>'; }),
-        used.end());
-
-    std::bitset<LABELRANGE> used_set = Str2Set(used);
-    std::string ellipse_inds = "";
-    for (const char& c : einsum_symbols) {
-      if (!used_set.test(static_cast<int>(c))) {
-        ellipse_inds.append(1, c);
-      }
-    }
-    int longest = 0;
-    std::string input_tmp, output_sub;
-    std::vector<std::string> split_subscripts;
-    bool out_sub;
-
-    if (subscripts.find("->") != std::string::npos) {
-      std::vector<std::string> tmp = Split(subscripts, "->");
-      input_tmp = tmp[0];
-      output_sub = tmp[1];
-      split_subscripts = Split(input_tmp, ",");
-      out_sub = true;
-    } else {
-      split_subscripts = Split(subscripts, ",");
-      out_sub = false;
-    }
-
-    size_t size_split_subscripts = split_subscripts.size();
-    subscripts = "";
-    for (size_t i = 0; i < size_split_subscripts; ++i) {
-      const std::string& sub = split_subscripts[i];
-      if (sub.find('.') != std::string::npos) {
-        CHECK_EQ(std::count(sub.begin(), sub.end(), '.'), 3) << "Invalid Ellipses";
-        CHECK_EQ(CountSubstring(sub, "..."), 1) << "Invalid Ellipses";
-
-        // Take into account numerical values
-        int ellipse_count = 0;
-        if (operands[i].size() == 0) {
-          ellipse_count = 0;
-        } else {
-          ellipse_count = std::max(operands[i].size(), static_cast<size_t>(1));
-          ellipse_count -= sub.length() - 3;
-        }
-
-        if (ellipse_count > longest) {
-          longest = ellipse_count;
-        }
-
-        CHECK_GE(ellipse_count, 0) << "Ellipses lengths do not match.";
-        if (ellipse_count == 0) {
-          split_subscripts[i].erase(sub.find("..."), 3);
-        } else {
-          std::string rep_inds = ellipse_inds.substr(ellipse_inds.length() - ellipse_count);
-          split_subscripts[i].replace(sub.find("..."), 3, rep_inds);
-        }
-      }
-      subscripts += split_subscripts[i];
-      if (i + 1 < size_split_subscripts) {
-        subscripts += ",";
-      }
-    }
-    std::string out_ellipse;
-    if (longest == 0) {
-      out_ellipse = "";
-    } else {
-      out_ellipse = ellipse_inds.substr(ellipse_inds.length() - longest);
-    }
-
-    if (out_sub) {
-      output_sub.replace(output_sub.find("..."), 3, out_ellipse);
-      subscripts += "->" + output_sub;
-    } else {
-      // Special care for outputless ellipses
-      std::bitset<LABELRANGE> out_ellipse_set = Str2Set(out_ellipse);
-      std::string tmp_subscripts = subscripts, output_subscript = "";
-      size_t len_tmp_subscripts = tmp_subscripts.length();
-      std::sort(tmp_subscripts.begin(), tmp_subscripts.end());
-      for (size_t i = 0; i < len_tmp_subscripts; ++i) {
-        const char& c = tmp_subscripts[i];
-        if (c == ',') {
-          continue;
-        }
-        CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol.";
-        if ((i == 0 || tmp_subscripts[i - 1] != c) &&
-            (i == len_tmp_subscripts - 1 || tmp_subscripts[i + 1] != c) &&
-            !out_ellipse_set.test(c)) {
-          output_subscript.append(1, c);
-        }
-      }
-      subscripts += "->" + out_ellipse + output_subscript;
-    }
-  }
-
-  // Build output string if does not exist
-  std::tuple<std::string, std::string> ret;
-  if (subscripts.find("->") != std::string::npos) {
-    std::vector<std::string> tmp(2);
-    tmp = Split(subscripts, "->");
-    ret = std::make_tuple(tmp[0], tmp[1]);
-  } else {
-    std::string first = subscripts;
-    std::string second = "";
-    // Build output subscripts
-    std::string tmp_subscripts = subscripts;
-    size_t len_tmp_subscripts = tmp_subscripts.length();
-    std::sort(tmp_subscripts.begin(), tmp_subscripts.end());
-    for (size_t i = 0; i < len_tmp_subscripts; ++i) {
-      const char& c = tmp_subscripts[i];
-      if (c == ',') {
-        continue;
-      }
-      CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol.";
-      if ((i == 0 || tmp_subscripts[i - 1] != c) &&
-          (i == len_tmp_subscripts - 1 || tmp_subscripts[i + 1] != c)) {
-        second.append(1, c);
-      }
-    }
-    ret = std::make_tuple(first, second);
-  }
-
-  // Make sure output subscripts are in the input
-  std::bitset<LABELRANGE> input_subscripts_set = Str2Set(std::get<0>(ret));
-  for (const char& c : std::get<1>(ret)) {
-    CHECK(input_subscripts_set.test(c))
-        << "Output character " << c << " did not appear in the input";
-  }
-
-  // Make sure number operands is equivalent to the number of terms
-  CHECK_EQ(std::count(std::get<0>(ret).begin(), std::get<0>(ret).end(), ',') + 1, operands.size())
-      << "Number of einsum subscripts must be equal to the "
-      << "number of operands.";
-
-  return ret;
-}
-
 /*!
  * \brief Compute the shape of the output.
  * \param subscripts input subscripts.
@@ -618,54 +56,8 @@ inline std::tuple<std::string, std::string> ParseEinsumInput(
  *
  * \return the shape of the output.
  */
-inline Array<PrimExpr> NumpyEinsumShape(const std::string subscripts,
-                                        const std::vector<Array<PrimExpr>>& operands) {
-  // Parsing
-  std::tuple<std::string, std::string> parsed_subscripts = ParseEinsumInput(subscripts, operands);
-
-  // Build a few useful list and sets
-  std::vector<std::string> input_list = Split(std::get<0>(parsed_subscripts), ",");
-  size_t isize = input_list.size();
-
-  // Get length of each unique dimension and ensure all dimensions are correct
-  int dimension_dict[LABELRANGE];
-  memset(dimension_dict, -1, sizeof(dimension_dict));
-  for (size_t i = 0; i < isize; ++i) {
-    const std::string& term = input_list[i];
-    const Array<PrimExpr>& sh = operands[i];
-    CHECK_EQ(sh.size(), term.length())
-        << "Einstein sum subscript " << input_list[i] << " does not contain the "
-        << "correct number of indices for operand " << i << ".";
-    size_t len_term = term.length();
-    for (size_t j = 0; j < len_term; ++j) {
-      int64_t dim = GetConstInt(sh[j]);
-      const char& c = term[j];
-
-      if (dimension_dict[static_cast<int>(c)] != -1) {
-        // For broadcasting cases we always want the largest dim size
-        if (dimension_dict[static_cast<int>(c)] == 1) {
-          dimension_dict[static_cast<int>(c)] = dim;
-        }
-        CHECK(dim == 1 || dim == dimension_dict[static_cast<int>(c)])
-            << "Size of label '" << c << "' for operand  " << i << " ("
-            << dimension_dict[static_cast<int>(c)] << ") does not match previous terms (" << dim
-            << ").";
-      } else {
-        dimension_dict[static_cast<int>(c)] = dim;
-      }
-    }
-  }
-
-  // Get oshape
-  const std::string& output_str = std::get<1>(parsed_subscripts);
-  size_t odim = output_str.size();
-  Array<PrimExpr> oshape(odim, -1);
-  for (size_t i = 0; i < odim; ++i) {
-    oshape.Set(i, dimension_dict[static_cast<int>(output_str[i])]);
-  }
-  // Neglecting oshape assign check temporally
-  return oshape;
-}
+Array<PrimExpr> InferEinsumShape(const std::string& subscripts,
+                                 const std::vector<Array<PrimExpr>>& operands);
 
 /*!
  * \brief Evaluates the Einstein summation convention on the operands.
@@ -678,265 +70,26 @@ inline Array<PrimExpr> NumpyEinsumShape(const std::string subscripts,
  *
  * \return The calculation based on the Einstein summation convention.
  */
-inline Tensor einsum(const std::string& subscripts_str, const Array<Tensor> inputs,
-                     std::string name = "T_einsum", std::string tag = kEinsum) {
-  bool back = false;
-  const char* subscripts = subscripts_str.data();
-  const char* head = subscripts;
-  const int nop = inputs.size();
-
-  /* Step 1: Parse the subscripts string into label_counts and op_labels */
-  int iop, idim, min_label = LABELRANGE - 1, max_label = 0;
-  char label_counts[LABELRANGE], op_labels[NPY_MAXARGS][NPY_MAXDIMS];
-  memset(label_counts, 0, sizeof(label_counts));
-  for (iop = 0; iop < nop; ++iop) {
-    int length = static_cast<int>(strcspn(subscripts, ",-"));
-
-    CHECK(!(iop == nop - 1 && subscripts[length] == ','))
-        << "more operands provided to einstein sum function "
-        << "than specified in the subscripts string";
-    CHECK(!(iop < nop - 1 && subscripts[length] != ','))
-        << "fewer operands provided to einstein sum function "
-        << "than specified in the subscripts string";
-    CHECK_EQ(ParseOperandSubscripts(subscripts, length, inputs[iop + back].ndim(), iop,
-                                    op_labels[iop], label_counts, &min_label, &max_label),
-             0);
-
-    /* Move subscripts to the start of the labels for the next op */
-    subscripts += length;
-
-    if (iop < nop - 1) {
-      CHECK_LT(subscripts - head, subscripts_str.length()) << "subscripts out of range";
-      subscripts++;
-    }
-  }
-  /*
-   * Find the number of broadcast dimensions, which is the maximum
-   * number of labels == 0 in an op_labels array.
+Tensor einsum(const std::string& subscripts_str, const Array<Tensor> inputs,
+              std::string name = "T_einsum", std::string tag = kEinsum);
+
+struct EinsumEquation {
+  /*!
+   * \brief Create EinsumEquation from a string.
+   * The result will be converted to the explicit mode of Einsum if it is in implicit mode.
+   * \return The created EinsumEquation.
    */
-  int ndim_broadcast = 0;
-  for (iop = 0; iop < nop; ++iop) {
-    int count_zeros = 0;
-    int ndim;
-    char* labels = op_labels[iop];
-
-    ndim = inputs[iop + back].ndim();
-    for (idim = 0; idim < ndim; ++idim) {
-      if (labels[idim] == 0) {
-        ++count_zeros;
-      }
-    }
-
-    if (count_zeros > ndim_broadcast) {
-      ndim_broadcast = count_zeros;
-    }
-  }
-
-  /*
-   * If there is no output signature, fill output_labels and ndim_output
-   * using each label that appeared once, in alphabetical order.
-   */
-  int label, ndim_output;
-  char output_labels[NPY_MAXDIMS];
-  if (subscripts[0] == '\0') {
-    /* If no output was specified, always broadcast left, as usual. */
-    for (ndim_output = 0; ndim_output < ndim_broadcast; ++ndim_output) {
-      output_labels[ndim_output] = 0;
-    }
-    for (label = min_label; label <= max_label; ++label) {
-      if (label_counts[label] == 1) {
-        CHECK(ndim_output < NPY_MAXDIMS) << "einstein sum subscript string has too many "
-                                         << "distinct labels";
-        output_labels[ndim_output++] = label;
-      }
-    }
-  } else {
-    CHECK(subscripts[0] == '-' && subscripts[1] == '>') << "einstein sum subscript string does not "
-                                                        << "contain proper '->' output specified";
-    subscripts += 2;
-
-    /* Parse the output subscript string. */
-    ndim_output = ParseOutputSubscripts(subscripts, strlen(subscripts), ndim_broadcast,
-                                        label_counts, output_labels);
-    CHECK_GE(ndim_output, 0);
-  }
-
-  /*
-   * Step 2:
-   * Process all the input ops, combining dimensions into their
-   * diagonal where specified.
-   */
-  std::vector<Array<PrimExpr>> opshape(nop), opstride_true(nop);
-  for (iop = 0; iop < nop; ++iop) {
-    char* labels = op_labels[iop];
-    int combine, ndim;
-
-    ndim = inputs[iop + back].ndim();
-
-    /*
-     * Check whether any dimensions need to be combined
-     *
-     * The char type may be either signed or unsigned, we
-     * need it to be signed here.
-     */
-    combine = 0;
-    for (idim = 0; idim < ndim; ++idim) {
-      if ((signed char)labels[idim] < 0) {
-        combine++;
-      }
-    }
-    /* If any dimensions are combined, create a view which combines them */
-    if (combine) {
-      Array<PrimExpr> tshape(static_cast<size_t>(ndim - combine), -1);
-      Array<PrimExpr> tstride(static_cast<size_t>(ndim - combine), -1);
-      GetCombinedDimsView(inputs[iop + back], iop, labels, &tshape, &tstride);
-      opshape[iop] = tshape;
-      opstride_true[iop] = tstride;
-    } else {
-      /* No combining needed */
-      opshape[iop] = inputs[iop + back]->shape;
-      opstride_true[iop] = GetStride(opshape[iop]);
-    }
-  }
-  /*
-   * Step 3:
-   * Set up the labels for the iterator (output + combined labels).
-   * Can just share the output_labels memory, because iter_labels
-   * is output_labels with some more labels appended.
-   */
-  char* iter_labels = output_labels;
-  int ndim_iter = ndim_output;
-  for (label = min_label; label <= max_label; ++label) {
-    if (label_counts[label] > 0 && memchr(output_labels, label, ndim_output) == nullptr) {
-      CHECK(ndim_iter < NPY_MAXDIMS) << "too many subscripts in einsum";
-      iter_labels[ndim_iter++] = label;
-    }
-  }
-  /* Step 4: Set up the op_axes for the iterator */
-  Array<PrimExpr> itershape(static_cast<size_t>(ndim_iter), -1);
-  std::vector<Array<PrimExpr>> iterstride(nop + 1,
-                                          Array<PrimExpr>(static_cast<size_t>(ndim_iter), 0));
-
-  // output_shape
-  std::vector<Array<PrimExpr>> operands;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    operands.push_back(inputs[i]->shape);
-  }
-  Array<PrimExpr> oshape = NumpyEinsumShape(subscripts_str, operands);
-  Array<PrimExpr> ostride_true = GetStride(oshape);
-  Array<PrimExpr> reduceshape;
-  std::vector<Array<PrimExpr>> remainshape(nop);
-  int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS];
-  int* op_axes[NPY_MAXARGS];
-  for (iop = 0; iop < nop; ++iop) {
-    op_axes[iop] = op_axes_arrays[iop];
-    CHECK_GE(PrepareOpAxes(opshape[iop].size(), iop, op_labels[iop], op_axes[iop], ndim_iter,
-                           iter_labels),
-             0);
-    for (idim = 0; idim < ndim_iter; idim++) {
-      if (op_axes[iop][idim] != -1) {
-        iterstride[iop].Set(idim, opstride_true[iop][op_axes[iop][idim]]);
-        if (GetConstInt(itershape[idim]) != -1) {
-          if (GetConstInt(itershape[idim]) == 1) {
-            itershape.Set(idim, opshape[iop][op_axes[iop][idim]]);
-          }
-        } else {
-          itershape.Set(idim, opshape[iop][op_axes[iop][idim]]);
-        }
-      }
-    }
-  }
-  for (idim = 0; idim < ndim_output; ++idim) {
-    iterstride[nop].Set(idim, ostride_true[idim]);
-  }
-  reduceshape = Array<PrimExpr>(static_cast<size_t>(ndim_iter - ndim_output), 0);
-  for (idim = ndim_output; idim < ndim_iter; ++idim) {
-    reduceshape.Set(idim - ndim_output, itershape[idim]);
-  }
-  for (iop = 0; iop < nop; iop++) {
-    Array<Integer> rsh;
-    for (idim = 0; idim < ndim_iter; idim++) {
-      if (op_axes_arrays[iop][idim] == -1) {
-        rsh.push_back(GetConstInt(itershape[idim]));
-      } else {
-        if (GetConstInt(itershape[idim] != opshape[iop][op_axes_arrays[iop][idim]])) {
-          rsh.push_back(GetConstInt(itershape[idim]));
-        }
-      }
-    }
-    remainshape[iop] = Array<PrimExpr>(rsh.begin(), rsh.end());
-  }
-  // exclude the 0-dim case
-  if (ndim_iter == 0) {
-    ndim_iter = 1;
-  }
-  itershape = Pad(itershape, ndim_iter);
-  for (iop = 0; iop <= nop; ++iop) {
-    iterstride[iop] = Pad(iterstride[iop], ndim_iter);
-  }
-  // oshape = Pad(oshape, ndim_iter);
-  reduceshape = Pad(reduceshape, ndim_iter);
-  for (iop = 0; iop < nop; ++iop) {
-    opshape[iop] = Pad(opshape[iop], ndim_iter);
-    remainshape[iop] = Pad(remainshape[iop], ndim_iter);
-  }
-  // ostride and rstride
-  Array<Array<PrimExpr>> ostride;
-  Array<Array<PrimExpr>> rstride;
-
-  for (iop = 0; iop < nop; ++iop) {
-    Array<PrimExpr> otmp(static_cast<size_t>(ndim_iter), 0);
-    Array<PrimExpr> rtmp(static_cast<size_t>(ndim_iter), 0);
-    for (idim = 0; idim < ndim_iter; ++idim) {
-      otmp.Set(idim, idim < ndim_output ? iterstride[iop][idim] : 1);
-      rtmp.Set(idim, idim < ndim_iter - ndim_output ? iterstride[iop][idim + ndim_output] : 1);
-    }
-    ostride.push_back(otmp);
-    rstride.push_back(rtmp);
-  }
-
-  // func: input indices => return cooresponding value
-  auto func = [inputs, oshape, ostride, reduceshape, ndim_iter, rstride,
-               nop](const Array<Var>& input_indices) -> PrimExpr {
-    for (int rdim = 0; rdim < ndim_iter; ++rdim) {
-      if (GetConstInt(reduceshape[rdim]) == 0) {
-        return 0;  //
-      }
-    }
-    Array<PrimExpr> ridx = UnravelIndex(0, reduceshape);
-
-    PrimExpr sum = 0;
-    bool rec_flag = false;
-    do {
-      PrimExpr tmp = 1;
-      for (int iop = 0; iop < nop; ++iop) {
-        if (iop != -1) {
-          PrimExpr k = 0;
-
-          for (size_t i = 0; i < input_indices.size(); ++i) {
-            k += input_indices[i] * ostride[iop][i];
-          }
-          for (size_t i = 0; i < ridx.size(); ++i) {
-            k += ridx[i] * rstride[iop][i];
-          }
-          Array<PrimExpr> temp_indices = UnravelIndex(k, inputs[iop]->shape);
-          tmp = tmp * inputs[iop](temp_indices);
-        }
-      }
-      sum += tmp;
-      ridx.Set(ridx.size() - 1, ridx[ridx.size() - 1] + 1);
-      for (int i = static_cast<int>(ridx.size() - 1);
-           (i > 0) && GetConstInt(ridx[i] >= reduceshape[i]); --i) {
-        ridx.Set(i, ridx[i] - reduceshape[i]);
-        ridx.Set(i - 1, ridx[i - 1] + 1);
-      }
-      rec_flag = GetConstInt(ridx[0] < reduceshape[0]);
-    } while (rec_flag);
-    return sum;
-  };
-
-  return compute(oshape, func, name, tag);
-}
+  static EinsumEquation FromString(const std::string& equation);
+  using Label = char;
+  using Subscript = std::vector<Label>;
+  // Special label value for ellipsis. The value is chosen to be less than any other letters so make
+  // sorting easier.
+  static constexpr Label kEllipsis = '\0';
+  // The input subscripts for each operand of the Einsum operator.
+  std::vector<Subscript> inputs;
+  // The output subscript of the Einsum equation.
+  Subscript output;
+};
 
 }  // namespace topi
 }  // namespace tvm
diff --git a/src/relay/op/tensor/math.cc b/src/relay/op/tensor/math.cc
index 246fba62cc66..6d1dabb497e0 100644
--- a/src/relay/op/tensor/math.cc
+++ b/src/relay/op/tensor/math.cc
@@ -77,7 +77,7 @@ bool EinsumRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   }
 
   // Calculate output shape
-  Array<IndexExpr> oshape = topi::NumpyEinsumShape(param->equation, input_shapes);
+  Array<IndexExpr> oshape = topi::InferEinsumShape(param->equation, input_shapes);
 
   auto rtype = TensorType(oshape, dtype);
   reporter->Assign(types[1], rtype);
diff --git a/src/topi/einsum.cc b/src/topi/einsum.cc
new file mode 100644
index 000000000000..892a17e58d7f
--- /dev/null
+++ b/src/topi/einsum.cc
@@ -0,0 +1,353 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file topi/einsum.cc
+ * \brief Einstein summation op
+ */
+#include <tvm/topi/broadcast.h>
+#include <tvm/topi/einsum.h>
+
+namespace tvm {
+namespace topi {
+
+EinsumEquation EinsumEquation::FromString(const std::string& equation) {
+  EinsumEquation result;
+  Subscript current;
+  bool has_arrow = false;
+  bool has_ellipsis = false;
+
+  for (int i = 0, n = equation.size(); i < n; ++i) {
+    switch (equation[i]) {
+      case ' ':
+        // Ignore spaces
+        break;
+      case '-':
+        // Arrow
+        CHECK(!has_arrow) << "Equation can only have one arrow";
+        CHECK(i + 1 < n && equation[i + 1] == '>')
+            << "Cannot parse the Einsum equation: invalid arrow";
+        i++;
+        has_arrow = true;
+        [[fallthrough]];
+      case ',':
+        // Delimiter between inputs, push current and start a new one
+        result.inputs.emplace_back(current);
+        current.clear();
+        has_ellipsis = false;
+        break;
+      case '.':
+        // Ellipsis
+        CHECK(!has_ellipsis) << "Ellipsis can only appear once for each input and output";
+        CHECK(i + 2 < n && equation[i + 1] == '.' && equation[i + 2] == '.')
+            << "Cannot parse the Einsum equation: invalid ellipsis";
+        current.push_back(kEllipsis);
+        has_ellipsis = true;
+        i += 2;
+        break;
+      default:
+        // Default case: current character is a subscript label
+        CHECK(std::isalpha(equation[i])) << "Cannot parse the Einsum equation: invalid character "
+                                         << equation[i] << " in equation " << equation;
+        current.emplace_back(equation[i]);
+        break;
+    }
+  }
+
+  if (has_arrow) {
+    // If there is an arrow, the last subscript is the output
+    result.output = current;
+  } else {
+    // Otherwise, the equation is in implicit mode, and the last subscript is an input
+    result.inputs.emplace_back(current);
+  }
+
+  // Convert the equation to explicit mode if it is in implicit mode
+  if (!has_arrow) {
+    // The output of the implicit mode is all repeated labels sorted in alphabetical order and the
+    // ellipsis in the leftmost if it exists in the inputs.
+    std::map<char, int> label_counts;
+    for (const Subscript& subscript : result.inputs) {
+      for (char label : subscript) {
+        label_counts[label]++;
+      }
+    }
+    for (auto [label, count] : label_counts) {
+      if (label == kEllipsis || count == 1) {
+        result.output.emplace_back(label);
+      }
+    }
+  }
+  return result;
+}
+
+PrimExpr GetBroadcastedExtent(const PrimExpr& extent1, const PrimExpr& extent2) {
+  int64_t extent1_value = GetConstInt(extent1);
+  int64_t extent2_value = GetConstInt(extent2);
+  if (extent1_value == extent2_value) {
+    return extent1;
+  } else if (extent1_value == 1 || extent2_value == 1) {
+    return Integer(std::max(extent1_value, extent2_value));
+  }
+  LOG(FATAL) << "Cannot broadcast extents " << extent1 << " and " << extent2;
+  throw;
+}
+
+PrimExpr GetIndexForBroadcastedDim(const Var& index, const PrimExpr& extent,
+                                   const PrimExpr& broadcasted_extent) {
+  if (GetConstInt(extent) == GetConstInt(broadcasted_extent)) {
+    return index;
+  } else {
+    return Integer(0);
+  }
+}
+
+/*! \brief The compute builder for Einsum */
+class EinsumBuilder {
+ public:
+  /*!
+   * \brief The constructor
+   * \param equation The Einsum equation
+   * \param input_shapes The shapes of the input tensors
+   */
+  EinsumBuilder(EinsumEquation equation, Array<Array<PrimExpr>> input_shapes)
+      : equation_(equation), input_shapes_(input_shapes) {}
+
+  /*!
+   * \brief Run the shape inference
+   * \return The inferred shape of the output
+   */
+  Array<PrimExpr> InferShape() {
+    CHECK_EQ(equation_.inputs.size(), input_shapes_.size())
+        << "Number of operands does not match the "
+           "equation";
+
+    std::vector<Array<PrimExpr>>
+        ellipis_shapes;  // the sub-shape covered by the ellipsis for each operand
+
+    // Step 1: Collect the broadcasted extent for each label
+    for (int operand_index = 0; operand_index < static_cast<int>(input_shapes_.size());
+         ++operand_index) {
+      const EinsumEquation::Subscript subscript = equation_.inputs[operand_index];
+      const Array<PrimExpr>& input_shape = input_shapes_[operand_index];
+
+      int current_dim = 0;
+      for (auto label : subscript) {
+        if (label == EinsumEquation::kEllipsis) {
+          // Find the sub-shape covered by the ellipsis
+          int ellipsis_ndim =
+              static_cast<int>(input_shape.size()) - static_cast<int>(subscript.size()) + 1;
+          ellipis_shapes.emplace_back(input_shape.begin() + current_dim,
+                                      input_shape.begin() + current_dim + ellipsis_ndim);
+          current_dim += ellipsis_ndim;
+        } else {
+          const PrimExpr& extent = input_shape[current_dim++];
+          auto it = label_to_extent_.find(label);
+          if (it == label_to_extent_.end()) {
+            label_to_extent_[label] = extent;
+          } else {
+            it->second = GetBroadcastedExtent(it->second, extent);
+          }
+        }
+      }
+      ICHECK_EQ(current_dim, input_shape.size());
+    }
+
+    // Step 2: Infer the shape of the ellipsis if exists
+    // The ellipsis may cover different number of dimensions for each operand, these sub-shapes
+    // need to be broadcasted to the shape with the maximum number of dimensions
+    Array<PrimExpr> ellipsis_shape;
+    if (ellipis_shapes.size()) {
+      ellipsis_shape = *std::max_element(
+          ellipis_shapes.begin(), ellipis_shapes.end(),
+          [](const Array<PrimExpr>& a, const Array<PrimExpr>& b) { return a.size() < b.size(); });
+      for (const Array<PrimExpr>& shape : ellipis_shapes) {
+        auto common_shape = detail::BroadcastShape(ellipsis_shape, shape).common_shape;
+        ellipsis_shape = Array<PrimExpr>(common_shape.begin(), common_shape.end());
+      }
+    }
+
+    // Step 3: Infer output shape based on infered extent for each label
+    for (auto label : equation_.output) {
+      if (label == EinsumEquation::kEllipsis) {
+        output_shape_.insert(output_shape_.end(), ellipsis_shape.begin(), ellipsis_shape.end());
+      } else {
+        output_shape_.push_back(label_to_extent_[label]);
+      }
+    }
+    ellipsis_shape_ = std::move(ellipsis_shape);
+    return output_shape_;
+  }
+
+  PrimExpr BuildOutputExpr(const Array<Tensor> inputs, const Array<Var>& indices) {
+    std::unordered_map<EinsumEquation::Label, Var> label_to_index;
+    Array<Var> ellipsis_indices;
+    Array<IterVar> reduce_axes;
+
+    PrepareOutputIndicesMapping(indices, &label_to_index, &ellipsis_indices);
+    PrepareReductionIndicesMapping(indices, &label_to_index, &ellipsis_indices, &reduce_axes);
+
+    auto zero = make_zero(inputs[0]->dtype);
+
+    PrimExpr result = zero;
+    for (int i = 0, n = static_cast<int>(inputs.size()); i < n; ++i) {
+      auto term = inputs[i](GetIndicesForOperand(i, label_to_index, ellipsis_indices));
+      if (i == 0) {
+        result = term;
+      } else {
+        result = result * term;
+      }
+    }
+    if (reduce_axes.size() > 0) {
+      result = sum(result, reduce_axes, {zero});
+    }
+    return result;
+  }
+
+ private:
+  /*!
+   * \brief Prepare mapping from label (including ellipsis) to the output indices
+   */
+  void PrepareOutputIndicesMapping(const Array<Var>& indices,
+                                   std::unordered_map<EinsumEquation::Label, Var>* label_to_index,
+                                   Array<Var>* ellipsis_indices) {
+    int i = 0;
+    for (auto label : equation_.output) {
+      if (label == EinsumEquation::kEllipsis) {
+        auto ellipsis_ndim = ellipsis_shape_.value().size();
+        *ellipsis_indices = Array<Var>(indices.begin() + i, indices.begin() + i + ellipsis_ndim);
+        i += ellipsis_ndim;
+      } else {
+        label_to_index->emplace(label, indices[i++]);
+      }
+    }
+    ICHECK_EQ(i, indices.size());
+  }
+
+  /*!
+   * \brief Create reduction axes and prepare mapping from reduction label (including ellipsis if
+   * necessary) to the reduction axes
+   */
+  void PrepareReductionIndicesMapping(
+      const Array<Var>& indices, std::unordered_map<EinsumEquation::Label, Var>* label_to_index,
+      Array<Var>* ellipsis_indices, Array<IterVar>* reduction_axes) {
+    // Collect labels that need to be reduced, which is the union(input_labels) - output_labels
+    std::set<char> reduction_labels;
+    for (const EinsumEquation::Subscript& subscript : equation_.inputs) {
+      reduction_labels.insert(subscript.begin(), subscript.end());
+    }
+    for (auto label : equation_.output) {
+      reduction_labels.erase(label);
+    }
+
+    // Create reduction axes.The order of the reduction axes is not specified in the Einsum
+    // equation. Here we sort them alphabetically, with the ellipsis axes at the
+    // beginning if exists.
+    for (auto label : reduction_labels) {
+      if (label == EinsumEquation::kEllipsis) {
+        // Ellipsis
+        auto ellipsis_shape = ellipsis_shape_.value();
+        for (int i = 0; i < static_cast<int>(ellipsis_shape.size()); ++i) {
+          reduction_axes->push_back(
+              IterVar(Range(0, ellipsis_shape[i]), Var("k"), IterVarType::kCommReduce));
+          ellipsis_indices->push_back(reduction_axes->back()->var);
+        }
+      } else {
+        // Normal label
+        reduction_axes->push_back(IterVar(Range(0, label_to_extent_[label]),
+                                          Var(std::string(1, label)), IterVarType::kCommReduce));
+        label_to_index->emplace(label, reduction_axes->back()->var);
+      }
+    }
+  }
+
+  Array<PrimExpr> GetIndicesForOperand(
+      int operand_index, const std::unordered_map<EinsumEquation::Label, Var>& label_to_index,
+      const Array<Var>& ellipsis_indices) {
+    const EinsumEquation::Subscript& subscript = equation_.inputs[operand_index];
+    Array<PrimExpr> indices;  // the indices for the operand
+    const Array<PrimExpr> input_shape = input_shapes_[operand_index];
+
+    int i = 0;  // index of the operand shape
+    for (char label : subscript) {
+      if (label == EinsumEquation::kEllipsis) {
+        // Ellipsis
+        Array<PrimExpr> ellipsis_shape = ellipsis_shape_.value();
+        int ellipsis_ndim =
+            static_cast<int>(input_shape.size()) - static_cast<int>(subscript.size()) + 1;
+        // use last 'ellipsis_ndim' axes
+        for (int j = static_cast<int>(ellipsis_indices.size()) - ellipsis_ndim;
+             j < static_cast<int>(ellipsis_indices.size()); ++j) {
+          indices.push_back(
+              GetIndexForBroadcastedDim(ellipsis_indices[j], input_shape[i++], ellipsis_shape[j]));
+        }
+      } else {
+        // Normal label
+        indices.push_back(GetIndexForBroadcastedDim(label_to_index.at(label), input_shape[i++],
+                                                    label_to_extent_.at(label)));
+      }
+    }
+    ICHECK_EQ(i, input_shape.size());
+    ICHECK_EQ(indices.size(), input_shape.size());
+    return indices;
+  }
+
+  EinsumEquation equation_;
+  Array<Array<PrimExpr>> input_shapes_;
+
+  // intermediate results of shape inference
+
+  // The output shape
+  Array<PrimExpr> output_shape_;
+  // The extent of each label with broadcast rules applied
+  std::unordered_map<EinsumEquation::Label, PrimExpr> label_to_extent_;
+  // The shape of the ellipsis if ellipsis is used. The shape covered by the
+  // ellipsis in each operand might be different from this, this is the common
+  // shape among them according to the broadcast rules.
+  Optional<Array<PrimExpr>> ellipsis_shape_;
+};
+
+Tensor einsum(const std::string& subscripts_str, const Array<Tensor> inputs, std::string name,
+              std::string tag) {
+  EinsumEquation equation = EinsumEquation::FromString(subscripts_str);
+  Array<Array<PrimExpr>> input_shapes;
+  for (const Tensor& input : inputs) {
+    input_shapes.push_back(input->shape);
+  }
+  EinsumBuilder einsum_builder = EinsumBuilder(equation, input_shapes);
+  auto output_shape = einsum_builder.InferShape();
+  return te::compute(
+      output_shape,
+      [&](const Array<Var>& indices) { return einsum_builder.BuildOutputExpr(inputs, indices); },
+      name, tag);
+}
+
+Array<PrimExpr> InferEinsumShape(const std::string& subscripts,
+                                 const std::vector<Array<PrimExpr>>& operands) {
+  EinsumEquation equation = EinsumEquation::FromString(subscripts);
+  EinsumBuilder einsum_builder = EinsumBuilder(equation, operands);
+  return einsum_builder.InferShape();
+}
+
+TVM_REGISTER_GLOBAL("topi.einsum").set_body([](TVMArgs args, TVMRetValue* rv) {
+  *rv = einsum(args[0], args[1]);
+});
+
+}  // namespace topi
+}  // namespace tvm
diff --git a/src/topi/transform.cc b/src/topi/transform.cc
index 56e799f52563..0ea1392e5daf 100644
--- a/src/topi/transform.cc
+++ b/src/topi/transform.cc
@@ -173,10 +173,6 @@ TVM_REGISTER_GLOBAL("topi.tensordot").set_body([](TVMArgs args, TVMRetValue* rv)
   }
 });
 
-TVM_REGISTER_GLOBAL("topi.einsum").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = einsum(args[0], args[1]);
-});
-
 TVM_REGISTER_GLOBAL("topi.strided_slice").set_body([](TVMArgs args, TVMRetValue* rv) {
   Tensor x = args[0];
   Array<PrimExpr> begin = args[1];
diff --git a/tests/python/topi/python/test_topi_einsum.py b/tests/python/topi/python/test_topi_einsum.py
index 994d5438e661..d6dc43e4da00 100644
--- a/tests/python/topi/python/test_topi_einsum.py
+++ b/tests/python/topi/python/test_topi_einsum.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import numpy as np
+import pytest
 import tvm
 import tvm.testing
 from tvm import te
@@ -59,20 +60,27 @@ def verify_einsum(subscripts, shapes):
     tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
 
 
-def test_einsum():
-    verify_einsum("ii", [(5, 5)])
-    verify_einsum("ii->i", [(5, 5)])
-    verify_einsum("ij->i", [(5, 5)])
-    verify_einsum("...j->...", [(5, 5)])
-    verify_einsum("...j, j", [(5, 5), (5,)])
-    verify_einsum("..., ...", [(), (2, 3)])
-    verify_einsum("ijk, jil->kl", [(3, 4, 5), (4, 3, 2)])
-    verify_einsum("ij, ij -> i", [(1, 4), (2, 4)])
-    verify_einsum("...ij, ...jk -> ...ik", [(1, 4), (4, 2)])
-    verify_einsum("...ij, ...ik -> ...jk", [(1, 1, 1, 4), (1, 1, 1, 3)])
-    verify_einsum("ij,jk->ik", [(2, 3), (3, 4)])
-    verify_einsum("ij,jk,km->im", [(2, 3), (3, 4), (4, 5)])
+@pytest.mark.parametrize(
+    "equation,inputs",
+    [
+        ("ii", [(5, 5)]),
+        ("ii->i", [(5, 5)]),
+        ("ij->i", [(5, 5)]),
+        ("...j->...", [(5, 5)]),
+        ("...j, j", [(5, 5), (5,)]),
+        ("..., ...", [(), (2, 3)]),
+        ("ijk, jil->kl", [(3, 4, 5), (4, 3, 2)]),
+        ("ij, ij -> i", [(1, 4), (2, 4)]),
+        ("...ij, ...jk -> ...ik", [(1, 4), (4, 2)]),
+        ("...ij, ...ik -> ...jk", [(1, 1, 1, 4), (1, 1, 1, 3)]),
+        ("...ik, ...jk, ...hk -> i...jh", [(3, 4, 4), (1, 5, 3, 8, 4), (2, 5, 3, 6, 4)]),
+        ("ij,jk->ik", [(2, 3), (3, 4)]),
+        ("ij,jk,km->im", [(2, 3), (3, 4), (4, 5)]),
+    ],
+)
+def test_einsum(equation, inputs):
+    verify_einsum(equation, inputs)
 
 
 if __name__ == "__main__":
-    test_einsum()
+    tvm.testing.main()

From d1c9febeca8d9e2d938f1bedfe4daf1da46373e6 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 28 Sep 2022 16:35:56 +0100
Subject: [PATCH 268/704] [ETHOSN] Remove support for 22.05 version of the
 driver stack (#12770)

After the upgrade of the driver stack to 22.08 (3.1.0), this commit removes support for the 22.05 (3.0.1) version.
---
 python/tvm/relay/op/contrib/ethosn.py         |  2 +-
 .../contrib/test_ethosn/test_networks.py      | 28 ++++---------------
 .../python/contrib/test_ethosn/test_split.py  | 13 ---------
 .../contrib/test_ethosn/test_topologies.py    | 17 ++++-------
 4 files changed, 11 insertions(+), 49 deletions(-)

diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index c8003c8da4d5..6a318c602fd2 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -102,7 +102,7 @@ def partition_for_ethosn(mod, params=None, **opts):
         raise ValueError("When targeting Ethos(TM)-N78, -variant=n78 should be set.")
 
     api_version = ethosn_api_version()
-    supported_api_versions = ["3.0.1", "3.1.0"]
+    supported_api_versions = ["3.1.0"]
     if all(api_version != LooseVersion(exp_ver) for exp_ver in supported_api_versions):
         raise ValueError(
             f"Driver stack version {api_version} is unsupported. "
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index 75f3479a5a9c..2e6b52927769 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -23,14 +23,11 @@
 pytest.importorskip("tflite")
 pytest.importorskip("tensorflow")
 
-from distutils.version import LooseVersion
-
 import tflite.Model
 
 from tvm import relay
 from tvm.testing import requires_ethosn
 from tvm.contrib import download
-from tvm.relay.op.contrib.ethosn import ethosn_api_version
 import tvm.relay.testing.tf as tf_testing
 
 from . import infrastructure as tei
@@ -125,10 +122,7 @@ def test_mobilenet_v1():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    if ethosn_api_version() == LooseVersion("3.1.0"):
-        _compile_hash = {"c37fec1f214c7f93ce49ee4e3b587969"}
-    else:
-        _compile_hash = {"50186822915909303e813205db80e032"}
+    _compile_hash = {"c37fec1f214c7f93ce49ee4e3b587969"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
@@ -150,10 +144,7 @@ def test_resnet_50_int8():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    if ethosn_api_version() == LooseVersion("3.1.0"):
-        _compile_hash = {"12d65aec33594c88b6d0d31dcd5144e6", "6a64d69ccb36dfb6b30dd2abdba4b005"}
-    else:
-        _compile_hash = {"9245965b2c01e7f3d9b478e38a186eb4", "4225fa951c145bb1e48e28cad6a3bdd4"}
+    _compile_hash = {"12d65aec33594c88b6d0d31dcd5144e6", "6a64d69ccb36dfb6b30dd2abdba4b005"}
     _test_image_network(
         model_url="https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/"
         "models/Quantized/resnet_50_quantized.tflite",
@@ -174,10 +165,7 @@ def test_inception_v3():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    if ethosn_api_version() == LooseVersion("3.1.0"):
-        _compile_hash = {"cff892eb15944756f22dad4b83c756d2"}
-    else:
-        _compile_hash = {"a5a2b5d2b618de754bf9a01033a020c0"}
+    _compile_hash = {"cff892eb15944756f22dad4b83c756d2"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/tflite_11_05_08/inception_v3_quant.tgz",
@@ -198,10 +186,7 @@ def test_inception_v4():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    if ethosn_api_version() == LooseVersion("3.1.0"):
-        _compile_hash = {"2eeae331898f8e94c74868e190077837"}
-    else:
-        _compile_hash = {"61b4ade41898d7cb2451dbdc3340aced"}
+    _compile_hash = {"2eeae331898f8e94c74868e190077837"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/inception_v4_299_quant_20181026.tgz",
@@ -222,10 +207,7 @@ def test_ssd_mobilenet_v1():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    if ethosn_api_version() == LooseVersion("3.1.0"):
-        _compile_hash = {"ec2b78852192058f88b64d45c26620d5", "f68cbeaaba03874ea735ce3f5eab9227"}
-    else:
-        _compile_hash = {"789906c7d8ac787809b303d82781fc9d", "6b699f94795785d31b39940a5cf84a81"}
+    _compile_hash = {"ec2b78852192058f88b64d45c26620d5", "f68cbeaaba03874ea735ce3f5eab9227"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip",
diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py
index a6155065a54c..57335feadbba 100644
--- a/tests/python/contrib/test_ethosn/test_split.py
+++ b/tests/python/contrib/test_ethosn/test_split.py
@@ -17,15 +17,12 @@
 
 """Split tests for Arm(R) Ethos(TM)-N"""
 
-from distutils.version import LooseVersion
-
 import numpy as np
 import pytest
 
 import tvm
 from tvm import relay
 from tvm.testing import requires_ethosn
-from tvm.relay.op.contrib.ethosn import ethosn_api_version
 
 from . import infrastructure as tei
 
@@ -47,11 +44,6 @@ def _get_model(shape, dtype, splits, axis):
 )
 def test_split(dtype, shape, splits, axis):
     """Compare Split output with TVM."""
-    if ethosn_api_version() == LooseVersion("3.0.1"):
-        pytest.skip(
-            "Split is not supported by the 3.0.1 version of the driver stack.",
-        )
-
     np.random.seed(0)
 
     outputs = []
@@ -89,11 +81,6 @@ def test_split(dtype, shape, splits, axis):
 )
 def test_split_failure(shape, dtype, splits, axis, err_msg):
     """Check Split error messages."""
-    if ethosn_api_version() == LooseVersion("3.0.1"):
-        pytest.skip(
-            "Split is not supported by the 3.0.1 version of the driver stack.",
-        )
-
     model = _get_model(shape, dtype, splits, axis)
     mod = tei.make_ethosn_partition(model)
     tei.test_error(mod, {}, err_msg)
diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py
index 47a01154d0b2..6425eb0faba3 100644
--- a/tests/python/contrib/test_ethosn/test_topologies.py
+++ b/tests/python/contrib/test_ethosn/test_topologies.py
@@ -17,15 +17,13 @@
 
 """Arm(R) Ethos(TM)-N tests for complex network topologies."""
 
-from distutils.version import LooseVersion
-
 import numpy as np
 import pytest
 
 import tvm
 from tvm import relay
 from tvm.testing import requires_ethosn
-from tvm.relay.op.contrib.ethosn import Available, ethosn_available, ethosn_api_version
+from tvm.relay.op.contrib.ethosn import Available, ethosn_available
 
 from . import infrastructure as tei
 
@@ -80,8 +78,8 @@ def get_model(input_shape, dtype, var_names):
         model = get_model(inputs["a"].shape, dtype, iter(inputs))
         mod = tei.make_module(model, [])
 
-        expected_host_ops = 1 if ethosn_api_version() == LooseVersion("3.0.1") else 0
-        npu_partitions = 2 if ethosn_api_version() == LooseVersion("3.0.1") else 1
+        expected_host_ops = 0
+        npu_partitions = 1
 
         # Mock inference is only supported when the whole graph is offloaded to the NPU
         if ethosn_available() == Available.SW_ONLY:
@@ -282,8 +280,8 @@ def get_model(shape, dtype, splits, axis):
         model = get_model(shape, dtype, splits, axis)
         mod = tei.make_module(model, {})
 
-        expected_host_ops = 1 if ethosn_api_version() == LooseVersion("3.0.1") else 0
-        npu_partitions = 2 if ethosn_api_version() == LooseVersion("3.0.1") else 1
+        expected_host_ops = 0
+        npu_partitions = 1
 
         # Mock inference is only supported when the whole graph is offloaded to the NPU
         if ethosn_available() == Available.SW_ONLY:
@@ -317,11 +315,6 @@ def test_output_tuple_propagation(dtype):
     """This tests the case where the output tuple must be inferred
     as having dummy tensor information."""
 
-    if ethosn_api_version() == LooseVersion("3.0.1"):
-        pytest.skip(
-            "Split is not supported by the 3.0.1 version of the driver stack.",
-        )
-
     def get_model(dtype):
         a = relay.var("a", shape=(1, 4, 4, 16), dtype=dtype)
         split = relay.op.split(a, indices_or_sections=4, axis=2)

From 17e4644019cd87b2fccab171875f109b322db8e3 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 28 Sep 2022 15:46:23 -0500
Subject: [PATCH 269/704] [TIR][MetaSchedule] Add regression test for
 layout_rewrite extent=1 (#12916)

* [TIR][MetaSchedule] Add regression test for layout_rewrite extent=1

Adds a regression test for using the `layout_rewrite` post-proc on a
buffer with an extent of one in at least one dimension, issue
https://github.com/apache/tvm/issues/12852.  This bug was resolved as
part of the refactor in https://github.com/apache/tvm/pull/12904, but
didn't have a regression test at that point.

* Identified segfault and added test case
---
 src/meta_schedule/postproc/rewrite_layout.cc  |   2 +
 ...t_meta_schedule_postproc_rewrite_layout.py | 156 ++++++++++++------
 2 files changed, 110 insertions(+), 48 deletions(-)

diff --git a/src/meta_schedule/postproc/rewrite_layout.cc b/src/meta_schedule/postproc/rewrite_layout.cc
index 998b22b57463..881c5ca7516b 100644
--- a/src/meta_schedule/postproc/rewrite_layout.cc
+++ b/src/meta_schedule/postproc/rewrite_layout.cc
@@ -56,6 +56,8 @@ class BufferReadPosCollector : public StmtExprVisitor {
   }
 
   void VisitExpr_(const BufferLoadNode* op) final {
+    CHECK(cur_realize_.defined()) << "BufferLoad occurred outside of any block";
+
     const Buffer& buffer = op->buffer;
     if (buffers_.count(buffer.get())) {
       Map<Var, PrimExpr> subst_map;
diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py
index b3e112e0e704..e0ed68b69ce0 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py
@@ -38,54 +38,114 @@ def _create_context(mod, target) -> TuneContext:
     )
 
 
-@T.prim_func
-def tir_matmul(
-    A: T.Buffer[(16, 16), "float32"],
-    B: T.Buffer[(16, 16), "float32"],
-    C: T.Buffer[(16, 16), "float32"],
-) -> None:
-    T.func_attr({"layout_free_buffers": [1]})
-    for i0, j, k0, i1, k1 in T.grid(4, 16, 4, 4, 4):
-        with T.block("matmul"):
-            vi = T.axis.S(16, i0 * 4 + i1)
-            vj = T.axis.S(16, j)
-            vk = T.axis.R(16, k0 * 4 + k1)
-            with T.init():
-                C[vi, vj] = T.float32(0)
-            C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
-
-
-@T.prim_func
-def rewritten_tir_matmul(
-    A: T.Buffer[(16, 16), "float32"],
-    B: T.Buffer[(16, 16), "float32"],
-    C: T.Buffer[(16, 16), "float32"],
-) -> None:
-    T.func_attr({"layout_free_buffers": [1]})
-    B_reindex = T.alloc_buffer([16, 4, 4], dtype="float32")
-    for ax0, ax1 in T.grid(16, 16):
-        with T.block("layout_rewrite"):
-            i0, i1 = T.axis.remap("SS", [ax0, ax1])
-            T.block_attr({"meta_schedule.layout_rewrite_preproc": True})
-            B_reindex[i1, i0 // 4, i0 % 4] = B[i0, i1]
-    for i0, j, k0, i1, k1 in T.grid(4, 16, 4, 4, 4):
-        with T.block("matmul"):
-            vi = T.axis.spatial(16, i0 * 4 + i1)
-            vj = T.axis.spatial(16, j)
-            vk = T.axis.reduce(16, k0 * 4 + k1)
-            with T.init():
-                C[vi, vj] = T.float32(0)
-            C[vi, vj] = C[vi, vj] + A[vi, vk] * B_reindex[vj, vk // 4, vk % 4]
-
-
-def test_layout_rewrite():
-    target = _target()
-    ctx = _create_context(tir_matmul, target)
-    sch = tvm.tir.Schedule(tir_matmul, debug_mask="all")
-    sch.enter_postproc()
-    assert ctx.postprocs[0].apply(sch)
-    tvm.ir.assert_structural_equal(sch.mod["main"], rewritten_tir_matmul)
+class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
+    def transform(self):
+        def inner(mod):
+            target = Target("cuda", host="llvm")
+            ctx = TuneContext(
+                mod=mod,
+                target=target,
+                postprocs=[
+                    RewriteLayout(),
+                ],
+                task_name="test",
+            )
+            sch = tvm.tir.Schedule(mod, debug_mask="all")
+            sch.enter_postproc()
+            assert ctx.postprocs[0].apply(sch)
+            return sch.mod
+
+        return inner
+
+
+class TestTIRMatmul(BaseBeforeAfter):
+    """Main functionality test
+
+    A new block should be inserted to transform the layout, with the
+    compute block operating on the temporary transformed buffer.
+    """
+
+    def before(
+        A: T.Buffer[(16, 16), "float32"],
+        B: T.Buffer[(16, 16), "float32"],
+        C: T.Buffer[(16, 16), "float32"],
+    ) -> None:
+        T.func_attr({"layout_free_buffers": [1]})
+        for i0, j, k0, i1, k1 in T.grid(4, 16, 4, 4, 4):
+            with T.block("matmul"):
+                vi = T.axis.S(16, i0 * 4 + i1)
+                vj = T.axis.S(16, j)
+                vk = T.axis.R(16, k0 * 4 + k1)
+                with T.init():
+                    C[vi, vj] = T.float32(0)
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+    def expected(
+        A: T.Buffer[(16, 16), "float32"],
+        B: T.Buffer[(16, 16), "float32"],
+        C: T.Buffer[(16, 16), "float32"],
+    ) -> None:
+        T.func_attr({"layout_free_buffers": [1]})
+        B_reindex = T.alloc_buffer([16, 4, 4], dtype="float32")
+        for ax0, ax1 in T.grid(16, 16):
+            with T.block("layout_rewrite"):
+                i0, i1 = T.axis.remap("SS", [ax0, ax1])
+                T.block_attr({"meta_schedule.layout_rewrite_preproc": True})
+                B_reindex[i1, i0 // 4, i0 % 4] = B[i0, i1]
+        for i0, j, k0, i1, k1 in T.grid(4, 16, 4, 4, 4):
+            with T.block("matmul"):
+                vi = T.axis.spatial(16, i0 * 4 + i1)
+                vj = T.axis.spatial(16, j)
+                vk = T.axis.reduce(16, k0 * 4 + k1)
+                with T.init():
+                    C[vi, vj] = T.float32(0)
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B_reindex[vj, vk // 4, vk % 4]
+
+
+class TestRewrittenBuffersMustOccurWithinBlock(BaseBeforeAfter):
+    """Buffers must occur within a Block"""
+
+    def before(
+        A: T.Buffer[(16, 16), "float32"],
+    ) -> None:
+        T.func_attr({"layout_free_buffers": [0]})
+        for i, j in T.grid(16, 16):
+            T.evaluate(A[i, j])
+
+    expected = tvm.TVMError
+
+
+class TestExtentOne(BaseBeforeAfter):
+    """Buffers with dimensions of extent 1 can be transformed
+
+    Regression test for a previous bug, in which the removal of
+    trivial variables resulted in an error in `IndexMap::Inverse`.
+    """
+
+    def before(
+        A: T.Buffer[(16, 1), "float32"],
+    ) -> None:
+        T.func_attr({"layout_free_buffers": [0]})
+        for i, j in T.grid(16, 1):
+            with T.block("block"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.evaluate(A[vi, vj])
+
+    def expected(A: T.Buffer[(16, 1), "float32"]):
+        T.func_attr({"layout_free_buffers": [0]})
+
+        A_global = T.alloc_buffer([16], dtype="float32")
+        for ax0, ax1 in T.grid(16, 1):
+            with T.block("A_global"):
+                v0, v1 = T.axis.remap("SS", [ax0, ax1])
+                T.block_attr({"meta_schedule.layout_rewrite_preproc": True})
+                A_global[v0] = A[v0, v1]
+
+        for i, j in T.grid(16, 1):
+            with T.block("block"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.evaluate(A_global[vi])
 
 
 if __name__ == "__main__":
-    test_layout_rewrite()
+    tvm.testing.main()

From e3a6cb6a1bc1764e261321713ab275461b36dcc0 Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Thu, 29 Sep 2022 04:14:50 +0700
Subject: [PATCH 270/704] [microTVM] Generalize depthwise_conv2d schedule
 (#12856)

* Method without SMLAD

* Remove kernel packing without decreasing speed

* Finish removing weights reorg

* Unit tests for larger kernels

* Prototype int16 depthwise schedule

* Bugfixes and unit tests

* Formatting and linting

* Linting fix

* Address comments from code review

* Fix accidental winograd bug

* Clarifying comment about Relay constant assertion

* Another round of code review comments
---
 python/tvm/relay/op/strategy/arm_cpu.py       |  22 +-
 python/tvm/topi/arm_cpu/conv2d_alter_op.py    |  38 +++-
 .../arm_cpu/mprofile/dsp/depthwise_conv2d.py  | 126 ++---------
 .../mprofile/dsp/micro_kernel/common.py       |  15 ++
 .../micro_kernel/multi_channel_convolve.py    | 210 ++++++++++++++++++
 .../dsp/micro_kernel/quad_channel_convolve.py | 180 ---------------
 .../strategy/arm_cpu/test_depthwise_conv2d.py |  43 ++--
 7 files changed, 327 insertions(+), 307 deletions(-)
 create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
 delete mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 2d9ef99ba8a6..947beb396ae2 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -236,20 +236,24 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                     name="depthwise_conv2d_nhwc.arm_cpu",
                 )
 
-            # Optimized special case depthwiseConv2D operation. Requires a 3x3 kernel, a
-            # NHWC layout, a HWOI kernel layout (which we rearrange), no dilation, int8 inputs,
-            # int32 output, the same number of input and output channels, and for that channel
-            # count to be divisible by 4. Additional work could remove these restrictions.
+            # Optimized special case depthwiseConv2D operation. Requires NHWC layout,
+            # a HWOI kernel layout (which we rearrange to a custom layout) no dilation,
+            # int8/16 inputs, int32 output, and the same number of input and output channels.
+            # The int8 implementation DOES need the DSP unit (for SXTB16), but it is not
+            # possible to use the DSP unit to speed up a NHWC depthwise convolution (though
+            # an NCHW convolution would benefit).
 
             elif (
-                target.features.has_dsp
-                and kernel.shape[0] == kernel.shape[1] == 3
-                and dilation_w == dilation_h == 1
+                dilation_w == dilation_h == 1
                 and kernel.shape[3] == 1  # channel_multiplier == 1
-                and data.dtype == "int8"
                 and out_type.dtype == "int32"
-                and data.shape[3] % 4 == 0
+                and (
+                    (data.shape[3] % 4 == 0 and data.dtype == "int8" and target.features.has_dsp)
+                    or (data.shape[3] % 2 == 0 and data.dtype == "int16")
+                )
                 and (padding != "SAME" or data.shape[1] % stride_h == data.shape[2] % stride_w == 0)
+                # Ideally we should check that kernel is a Relay constant, but strategy functions
+                # don't have access to the data needed to check this.
             ):
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nhwc_dsp),
diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 90461f0c1c99..d4878f4b6908 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -19,6 +19,8 @@
 
 import logging
 
+import numpy as np
+
 import tvm
 from tvm import te
 from tvm import relay
@@ -31,6 +33,7 @@
 from .conv2d_int8 import is_int8_hw_support
 from .arm_utils import get_tiling_B_interleaved_t
 from ..generic.conv2d import conv2d_alter_int8_common
+from .mprofile.dsp.micro_kernel.common import num_simd_lanes_per_word
 
 logger = logging.getLogger("topi")
 
@@ -121,7 +124,40 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
 
     idxd = tvm.tir.indexdiv
 
-    # We don't perform layout alteration for NHWC layout with real data types
+    if topi_tmpl == "depthwise_conv2d_nhwc_dsp.arm_cpu":
+        assert data_layout == "NHWC" and kernel_layout == "HWOI"
+
+        # We are not able to check if inputs[1] (the kernel) is a constant in the
+        # strategy function, so as a stopgap solution we use an assert here.
+        assert isinstance(
+            inputs[1], relay.Constant
+        ), "depthwise_conv2d_nhwc_dsp.arm_cpu requires kernel be a relay Constant"
+
+        channels = get_const_tuple(data.shape)[3]
+        KH, KW, _, _ = get_const_tuple(kernel.shape)
+        simd_lanes = num_simd_lanes_per_word(data.dtype)
+
+        HWOI_kernel_np = inputs[1].data.numpy()
+        CHWc_kernel_np = np.zeros((channels // simd_lanes, KH, KW, simd_lanes), dtype=kernel.dtype)
+        for i in range(channels // simd_lanes):
+            CHWc_kernel_np[i] = HWOI_kernel_np[:, :, simd_lanes * i : simd_lanes * (i + 1), 0]
+        reshaped_new_kernel = CHWc_kernel_np.reshape((KH, KW, channels, 1))
+
+        # Store the same config for the altered operator (workload)
+        new_data = data
+        new_kernel = te.placeholder((KH, KW, channels, 1), dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, dilation, out_dtype],
+            "depthwise_conv2d_nhwc_dsp.arm_cpu",
+        )
+        dispatch_ctx.update(target, new_workload, cfg)
+        return relay.nn.conv2d(
+            inputs[0],
+            relay.Constant(tvm.nd.array(reshaped_new_kernel)),
+            **new_attrs,
+        )
+
+    # Only microTVM does layout alteration for NHWC layout with real data types
     if data_layout == "NHWC" and data_dtype not in ["uint8", "int8"]:
         return None
 
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
index 162bf65a21f9..b8da15dadf13 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
@@ -19,84 +19,15 @@
 import random
 import string
 
-from tvm import te
-from tvm.topi.utils import traverse_inline, get_const_tuple
+from tvm import te, topi
+from tvm.topi.utils import traverse_inline
 from tvm.topi.nn.pad import pad
-from tvm import tir
 
-from .micro_kernel.quad_channel_convolve import (
-    intrin_quad_channel_convolve,
-    quad_channel_convolve_impl,
+from .micro_kernel.multi_channel_convolve import (
+    intrin_multi_channel_convolve,
+    multi_channel_convolve_impl,
 )
-
-# For depthwise_conv2d, kernels are normally given in HWOI format,
-# which when input_channels = output channels, we will call HWC.
-# This is bad, as we want "related" parts of the kernel to be next
-# to each other, so we can use __SMLAD later.
-#
-# Consider a 3x3 int8 kernel with no bias vector, with eight
-# channels. Let us specify entries in the kernel as H_W_C - i.e.
-# where 0_2_3 represents the rightmost position in the first row
-# of channel 4/8 (4 because of zero indexing). Each [ ] represents
-# a 32-bit integer. We currently store the kernel as:
-#
-# 0 ................................31
-# [ 0_0_0 || 0_0_1 || 0_0_2 || 0_0_3 ] [ 0_0_4 || 0_0_5 || 0_0_6 || 0_0_7 ]
-# [ 0_1_0 || 0_1_1 || 0_1_2 || 0_1_3 ] [ 0_1_4 || 0_1_5 || 0_1_6 || 0_1_7 ]
-# [ 0_2_0 || 0_2_1 || 0_2_2 || 0_2_3 ] [ 0_2_4 || 0_2_5 || 0_2_6 || 0_2_7 ]
-# [ 1_0_0 || 1_0_1 || 1_0_2 || 1_0_3 ] [ 1_0_4 || 1_0_5 || 1_0_6 || 1_0_7 ]
-# [ 1_1_0 || 1_1_1 || 1_1_2 || 1_1_3 ] [ 1_1_4 || 1_1_5 || 1_1_6 || 1_1_7 ]
-# [ 1_2_0 || 1_2_1 || 1_2_2 || 1_2_3 ] [ 1_2_4 || 1_2_5 || 1_2_6 || 1_2_7 ]
-# [ 2_0_0 || 2_0_1 || 2_0_2 || 2_0_3 ] [ 2_0_4 || 2_0_5 || 2_0_6 || 2_0_7 ]
-# [ 2_1_0 || 2_1_1 || 2_1_2 || 2_1_3 ] [ 2_1_4 || 2_1_5 || 2_1_6 || 2_1_7 ]
-# [ 2_2_0 || 2_2_1 || 2_2_2 || 2_2_3 ] [ 2_2_4 || 2_2_5 || 2_2_6 || 2_2_7 ]
-#
-# Let 0x00 be all zeros. We rearrange into:
-#
-# 0 ................................31
-# [ 0_0_0 || 0_0_1 || 0_1_0 || 0_1_1 ] [ 0_0_2 || 0_0_3 || 0_1_2 || 0_1_3 ]
-# [ 0_2_0 || 0_2_1 || 1_0_0 || 1_0_1 ] [ 0_2_2 || 0_2_3 || 1_0_2 || 1_0_3 ]
-# [ 1_1_0 || 1_1_1 || 1_2_0 || 1_2_1 ] [ 1_1_2 || 1_1_3 || 1_2_2 || 1_2_3 ]
-# [ 2_0_0 || 2_0_1 || 2_1_0 || 2_1_1 ] [ 2_0_2 || 2_0_3 || 2_1_2 || 2_1_3 ]
-# [ 2_2_0 || 2_2_1 || 0x000 || 0x000 ] [ 2_2_2 || 2_2_3 || 0x000 || 0x000 ]
-# [ 0_0_4 || 0_0_5 || 0_1_4 || 0_1_5 ] [ 0_0_6 || 0_0_7 || 0_1_6 || 0_1_7 ]
-# [ 0_2_4 || 0_2_5 || 1_0_4 || 1_0_5 ] [ 0_2_6 || 0_2_7 || 1_0_6 || 1_0_7 ]
-# [ 1_1_4 || 1_1_5 || 1_2_4 || 1_2_5 ] [ 1_1_6 || 1_1_7 || 1_2_6 || 1_2_7 ]
-# [ 2_0_4 || 2_0_5 || 2_1_4 || 2_1_5 ] [ 2_0_6 || 2_0_7 || 2_1_6 || 2_1_7 ]
-# [ 2_2_4 || 2_2_5 || 0x000 || 0x000 ] [ 2_2_6 || 2_2_7 || 0x000 || 0x000 ]
-#
-# This saves us six operations comapred to the original ordering, as we
-# do not need halfword packing instructions.
-#
-# This kernel re-arranging function will be used for 3x3 kernels (as that
-# is all this DSP implementation currently supports) but would work with
-# any M*N kernel such that M*N is odd.
-
-
-def _rearrange_kernel(kernel):
-    # Kernel must be HWC format.
-    kernel_h, kernel_w, channels, _ = get_const_tuple(kernel.shape)
-    assert channels % 4 == 0
-
-    # This restriction could be removed by only using tir.if_then_else to add padding
-    # zeros if (kernel_w * kernel_h) % 2 == 1, and filling completely otherwise.
-    assert (kernel_w * kernel_h) % 2 == 1
-
-    def fcompute(c_o, pos, c_i):
-        channel = (2 * (pos % 2)) + (c_i % 2) + (4 * c_o)
-        true_pos_index = 2 * (pos // 2) + (c_i // 2)
-
-        return tir.if_then_else(
-            true_pos_index < (kernel_h * kernel_w),
-            kernel[true_pos_index // kernel_w, true_pos_index % kernel_w, channel, 0],
-            tir.const(0, "int8"),
-        )
-
-    return te.compute(
-        (channels // 4, kernel_h * kernel_w + 1, 4),
-        fcompute,
-        name="packed_kernel",
-    )
+from .micro_kernel.common import num_simd_lanes_per_word
 
 
 def depthwise_conv2d_nhwc_dsp_compute(_cfg, data, kernel, strides, padding, dilation, out_dtype):
@@ -120,10 +51,7 @@ def depthwise_conv2d_nhwc_dsp_compute(_cfg, data, kernel, strides, padding, dila
 
     batch_size, height, width, channels = data.shape
     kernel_h, kernel_w, _, _ = kernel.shape
-
-    # We require that the number of channels be divisible by 4. This restriction could
-    # be removed with strip mining if people cared.
-    assert channels % 4 == 0
+    simd_lanes = num_simd_lanes_per_word(data.dtype)
 
     # We don't support different numbers of input and output channels.
     assert channels == kernel.shape[2]
@@ -133,11 +61,6 @@ def depthwise_conv2d_nhwc_dsp_compute(_cfg, data, kernel, strides, padding, dila
     # round until we compute activations.
     assert out_dtype == "int32"
 
-    # This can pretty easily be generalized in the future. Likely worth doing, and this
-    # function was written to make doing so easy. Should only require adding more calls
-    # to QUAD_CHANNEL_REARRANGE_SUM.
-    assert kernel_w == kernel_h == 3
-
     # Padding the data requires COPYING THE ENTIRE INPUT TENSOR, which
     # is slow and bad. We should really implement a strip mining
     # routine to avoid this, but TVM has terrible support for that.
@@ -188,18 +111,14 @@ def depthwise_conv2d_nhwc_dsp_compute(_cfg, data, kernel, strides, padding, dila
         raise RuntimeError()
     _, padded_h, padded_w, _ = padded_data.shape
 
-    packed_kernel = _rearrange_kernel(kernel)
     kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
     kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
+    reshaped_kernel = topi.reshape(kernel, (channels // simd_lanes, kernel_h, kernel_w, simd_lanes))
     return te.compute(
         (batch_size, output_h, output_w, channels),
         lambda h, i, j, k: te.sum(
             padded_data[h, (i * stride_h) + kh_i, (j * stride_w) + kw_i, k].astype("int32")
-            * packed_kernel[
-                k // 4,
-                (2 * ((3 * kh_i + kw_i) // 2)) + ((k % 4) // 2),
-                (2 * ((kh_i + kw_i) % 2)) + (k % 2),
-            ].astype("int32"),
+            * reshaped_kernel[k // simd_lanes, kh_i, kw_i, k % simd_lanes].astype("int32"),
             axis=(kh_i, kw_i),
         ),
         name="depthwise_conv2d",
@@ -212,33 +131,36 @@ def depthwise_conv2d_nhwc_dsp_schedule(_cfg, outs):
     """Schedule function for v7e-m DSP instructions of conv2d."""
     schedule = te.create_schedule([x.op for x in outs])
 
-    def _callback(op):
-        if "depthwise_conv2d_nhwc" not in op.tag:
+    def _callback(operator):
+        if "depthwise_conv2d_nhwc" not in operator.tag:
             return
 
         # extract tensors
-        output = op.output(0)
+        output = operator.output(0)
         padded_data = output.op.input_tensors[0]
-        packed_kernel = output.op.input_tensors[1]
-        kernel = packed_kernel.op.input_tensors[0]
+        reshaped_kernel = output.op.input_tensors[1]
+        in_dtype = padded_data.dtype
 
-        _, _, padded_w, channels = padded_data.shape
-        kernel_h, kernel_w, _, _ = kernel.shape
+        _, padded_h, padded_w, channels = padded_data.shape
+        _, kernel_h, kernel_w, _ = reshaped_kernel.shape
         suffix = "".join(random.choices(string.ascii_uppercase, k=8))
 
         b_ax, y_ax, x_ax, c_ax = schedule[output].op.axis
         ky_ax, kx_ax = schedule[output].op.reduce_axis
-        c_ax_o, c_ax_i = schedule[output].split(c_ax, factor=4)
+        simd_lanes = num_simd_lanes_per_word(in_dtype)
+        c_ax_o, c_ax_i = schedule[output].split(c_ax, factor=simd_lanes)
         schedule[output].reorder(b_ax, c_ax_o, y_ax, x_ax, ky_ax, kx_ax, c_ax_i)
 
-        quad_channel_convolve = intrin_quad_channel_convolve(
-            padded_w, channels, kernel_h, kernel_w, suffix
+        multi_channel_convolve = intrin_multi_channel_convolve(
+            in_dtype, padded_h, padded_w, channels, kernel_h, kernel_w, suffix
         )
-        schedule[output].tensorize(ky_ax, quad_channel_convolve)
+        schedule[output].tensorize(ky_ax, multi_channel_convolve)
         schedule[output].pragma(
             b_ax,
             "import_c",
-            quad_channel_convolve_impl(padded_w, channels, kernel_h, kernel_w, suffix),
+            multi_channel_convolve_impl(
+                in_dtype, padded_h, padded_w, channels, kernel_h, kernel_w, suffix
+            ),
         )
 
     traverse_inline(schedule, outs[-1].op, _callback)
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
index df54c101773e..0398844315a7 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
@@ -29,3 +29,18 @@
 #include <tvm/runtime/crt/error_codes.h>
 
 """
+
+MICRO_WORD_LENGTH_BITS = 32
+
+
+def num_simd_lanes_per_word(dtype: str) -> int:
+    """Takes a dtype, and returns how many of that dtype fit into a single microcontroller word.
+
+    >>> num_simd_lanes_per_word("int8")
+    4
+    >>> num_simd_lanes_per_word("int16")
+    2
+    """
+    assert dtype.startswith("int")
+    dtype_width = int(dtype[3:])
+    return MICRO_WORD_LENGTH_BITS // dtype_width
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
new file mode 100644
index 000000000000..992d90578046
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
@@ -0,0 +1,210 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""This is a special intrinsic used for depthwise convolution using Cortex-M DSP instructions
+(v7e-m). It takes as inputs an int8 HWC data tensor and an int8 CHWc kernel. This intrinsic "lays"
+the kernel on top of the data tensors starting from a given pointer, performs signed sixteen-bit
+multiplies on each pair of values, and sums all the products in an int32 accumlator. This process is
+repeated four times giving four int32 outputs - one per channel."""
+
+import textwrap
+
+from tvm import te, tir
+from .common import num_simd_lanes_per_word
+
+
+def _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix):
+    """Gets the C function name of the tensorized function."""
+    return f"kernel_convolve_{in_dtype}_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}"
+
+
+def intrin_multi_channel_convolve(
+    in_dtype, _tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix
+):
+    """Defines a v7e-m DSP-accelerated multi-channel convolution. Works on two
+    channels if in_dtype==int16, and four channels if in_dtype==int8."""
+    simd_lanes = num_simd_lanes_per_word(in_dtype)
+
+    overlap_dims = (kernel_h, kernel_w, simd_lanes)
+    data_slice = te.placeholder(overlap_dims, name="data_slice", dtype=in_dtype)
+    kernel_slice = te.placeholder(overlap_dims, name="kernel_slice", dtype=in_dtype)
+
+    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
+    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
+
+    output_slice = te.compute(
+        (simd_lanes,),
+        lambda k: te.sum(
+            data_slice[kh_i, kw_i, k].astype("int32") * kernel_slice[kh_i, kw_i, k].astype("int32"),
+            axis=(kh_i, kw_i),
+        ),
+        name="c",
+    )
+
+    data_buf = tir.decl_buffer(
+        data_slice.shape,
+        data_slice.dtype,
+        name="data",
+        offset_factor=1,
+        strides=[tensor_w * channels, channels, 1],
+    )
+    kernel_buf = tir.decl_buffer(
+        kernel_slice.shape,
+        kernel_slice.dtype,
+        name="kernel",
+        offset_factor=1,
+        strides=[kernel_w * simd_lanes, simd_lanes, 1],
+    )
+    output_buf = tir.decl_buffer(
+        output_slice.shape, output_slice.dtype, name="output", offset_factor=1, strides=[1]
+    )
+
+    def intrin_func(ins, outs):
+        builder = tir.ir_builder.create()
+        builder.emit(
+            tir.call_extern(
+                "int32",
+                _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix),
+                outs[0].access_ptr("w"),
+                ins[0].access_ptr("r"),
+                ins[1].access_ptr("r"),
+            )
+        )
+        return builder.get()
+
+    return te.decl_tensor_intrin(
+        output_slice.op,
+        intrin_func,
+        binds={data_slice: data_buf, kernel_slice: kernel_buf, output_slice: output_buf},
+    )
+
+
+def multi_channel_convolve_impl(in_dtype, *args) -> str:
+    """Generates C code for a fast multi-channel convolution function for ARM Cortex-M. This is done
+    by calling a sub-function depending on the input data type, as since v7e-m has no quad multiply
+    accumulate instruction, the int8 and int16 cases work differently."""
+    if in_dtype == "int8":
+        return _quad_int8_channel_convolve_impl(*args)
+    if in_dtype == "int16":
+        return _dual_int16_channel_convolve_impl(*args)
+
+    raise NotImplementedError(f"No Cortex-M {in_dtype} depthwise_conv2d implementation exists!")
+
+
+def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
+    return textwrap.dedent(
+        (
+            f"""
+        #include <stdint.h>
+        #include <arm_nnsupportfunctions.h>
+
+        // __SXTB16(_ROR(X, Y)) is combined into one assembly instruction
+
+        #define TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP( \
+            arranged_kernel, \
+            tensor_c3210, \
+            sum_c0, sum_c1, sum_c2, sum_c3) {{ \
+          \
+          uint32_t kernel_c3210 = *arranged_kernel++; \
+          \
+          uint32_t tensor_c20 = __SXTB16(tensor_c3210); \
+          uint32_t kernel_c20 = __SXTB16(kernel_c3210); \
+          sum_c0 = __builtin_arm_smlabb(tensor_c20, kernel_c20, sum_c0); \
+          sum_c2 = __builtin_arm_smlatt(tensor_c20, kernel_c20, sum_c2); \
+          \
+          uint32_t tensor_c31 = __SXTB16(__ROR(tensor_c3210, 8)); \
+          uint32_t kernel_c31 = __SXTB16(__ROR(kernel_c3210, 8)); \
+          sum_c1 = __builtin_arm_smlabb(tensor_c31, kernel_c31, sum_c1); \
+          sum_c3 = __builtin_arm_smlatt(tensor_c31, kernel_c31, sum_c3); \
+        }}
+
+        /* We do four channels at once to get this speed boost. */
+        #ifdef __cplusplus
+        extern "C"
+        #endif
+        int32_t {_get_func_name("int8", tensor_w, channels, kernel_h, kernel_w, suffix)}(
+            uint32_t *out,
+            uint32_t *tensor,
+            uint32_t *kernel) {{
+
+          uint32_t sum_c0 = 0;
+          uint32_t sum_c1 = 0;
+          uint32_t sum_c2 = 0;
+          uint32_t sum_c3 = 0;
+
+          #pragma GCC unroll 3
+          for (int i = 0; i < {kernel_h}; i++) {{
+            #pragma GCC unroll 3
+            for (int j = 0; j < {kernel_w}; j++) {{
+              TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP(
+                kernel,
+                *(tensor + j * {channels // 4} + i * {tensor_w * (channels // 4)}),
+                sum_c0, sum_c1, sum_c2, sum_c3)
+            }}
+          }}
+
+          out[0] = sum_c0;
+          out[1] = sum_c1;
+          out[2] = sum_c2;
+          out[3] = sum_c3;
+          return 0;
+        }}
+
+        #undef TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP
+        """
+        )
+    )
+
+
+def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
+    return textwrap.dedent(
+        (
+            f"""
+        #include <stdint.h>
+
+        /* We do four channels at once to get this speed boost. */
+        #ifdef __cplusplus
+        extern "C"
+        #endif
+        int32_t {_get_func_name("int16", tensor_w, channels, kernel_h, kernel_w, suffix)}(
+            uint32_t *out,
+            uint32_t *tensor,
+            uint32_t *kernel) {{
+
+          uint32_t sum_c0 = 0;
+          uint32_t sum_c1 = 0;
+
+          #pragma GCC unroll 3
+          for (int i = 0; i < {kernel_h}; i++) {{
+            #pragma GCC unroll 3
+            for (int j = 0; j < {kernel_w}; j++) {{
+              uint32_t tensor_c10 = *(tensor + j * {channels // 2}
+                + i * {tensor_w * (channels // 2)});
+              uint32_t kernel_c10 = *kernel++;
+              sum_c0 = __builtin_arm_smlabb(tensor_c10, kernel_c10, sum_c0);
+              sum_c1 = __builtin_arm_smlatt(tensor_c10, kernel_c10, sum_c1);
+            }}
+          }}
+
+          out[0] = sum_c0;
+          out[1] = sum_c1;
+          return 0;
+        }}
+
+        #undef TVMGEN_DUAL_INT16_CHANNEL_REARRANGE_SUM
+        """
+        )
+    )
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py
deleted file mode 100644
index 960ef8fadc0e..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""This is a special intrinsic used for depthwise convolution using Cortex-M DSP instructions
-(v7e-m). It takes as inputs an int8 HWC data tensor and an int8 CHWc kernel. This intrinsic "lays"
-the kernel on top of the data tensors starting from a given pointer, performs signed sixteen-bit
-multiplies on each pair of values, and sums all the products in an int32 accumlator. This process is
-repeated four times giving four int32 outputs - one per channel."""
-
-import textwrap
-
-from tvm import te, tir
-
-
-def intrin_quad_channel_convolve(tensor_w, channels, kernel_h, kernel_w, suffix):
-    """Defines a v7e-m DSP-accelerated four-channel convolution."""
-    data_slice = te.placeholder((kernel_h, kernel_w, 4), name="a", dtype="int8")
-
-    if kernel_h * kernel_w % 2 == 1:
-        kernel_length = kernel_h * kernel_w + 1
-    else:
-        kernel_length = kernel_h * kernel_w
-    kernel_slice = te.placeholder((kernel_length, 4), name="b", dtype="int8")
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-
-    output_slice = te.compute(
-        (4,),
-        lambda k: te.sum(
-            data_slice[kh_i, kw_i, k].astype("int32")
-            * kernel_slice[
-                (2 * ((3 * kh_i + kw_i) // 2)) + ((k % 4) // 2),
-                (2 * ((kh_i + kw_i) % 2)) + (k % 2),
-            ].astype("int32"),
-            axis=(kh_i, kw_i),
-        ),
-        name="c",
-    )
-
-    data_buf = tir.decl_buffer(
-        data_slice.shape,
-        data_slice.dtype,
-        name="data",
-        offset_factor=1,
-        strides=[tensor_w * channels, channels, 1],
-    )
-    kernel_buf = tir.decl_buffer(
-        kernel_slice.shape, kernel_slice.dtype, name="kernel", offset_factor=1, strides=[4, 1]
-    )
-    output_buf = tir.decl_buffer(
-        output_slice.shape, output_slice.dtype, name="output", offset_factor=1, strides=[1]
-    )
-
-    def intrin_func(ins, outs):
-        builder = tir.ir_builder.create()
-        builder.emit(
-            tir.call_extern(
-                "int32",
-                f"kernel_convolve_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}",
-                outs[0].access_ptr("w"),
-                ins[0].access_ptr("r"),
-                ins[1].access_ptr("r"),
-            )
-        )
-        return builder.get()
-
-    return te.decl_tensor_intrin(
-        output_slice.op,
-        intrin_func,
-        binds={data_slice: data_buf, kernel_slice: kernel_buf, output_slice: output_buf},
-    )
-
-
-def quad_channel_convolve_impl(tensor_w, channels, kernel_h, kernel_w, suffix):
-    """Emits C code for quad_channel_convolve. Note that while intrin_quad_channel_convolve supports
-    any kernel size, this function only supports 3x3 kernels (this could be fixed with work)."""
-    assert kernel_h == kernel_w == 3
-
-    return textwrap.dedent(
-        (
-            f"""
-        #include <stdint.h>
-        #include <arm_nnsupportfunctions.h>
-
-        // __SXTB16(_ROR(X, Y)) is combined into one assembly instruction
-
-        #define TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP( \
-            arranged_kernel, \
-            tensor_v0_c3210, tensor_v1_c3210, \
-            sum0, sum1, sum2, sum3) {{ \
-          \
-          uint32_t tensor_v0_c20 = __SXTB16(tensor_v0_c3210); \
-          uint32_t tensor_v0_c31 = __SXTB16(__ROR(tensor_v0_c3210, 8)); \
-          uint32_t tensor_v1_c20 = __SXTB16(tensor_v1_c3210); \
-          uint32_t tensor_v1_c31 = __SXTB16(__ROR(tensor_v1_c3210, 8)); \
-          \
-          uint32_t kernel_v1c1_v1c0_v0c1_v0c0 = *arranged_kernel++; \
-          uint32_t kernel_v1c3_v1c2_v0c3_v0c2 = *arranged_kernel++; \
-          \
-          uint32_t kernel_v10_c0 = __SXTB16(kernel_v1c1_v1c0_v0c1_v0c0); \
-          uint32_t kernel_v10_c1 = __SXTB16(__ROR(kernel_v1c1_v1c0_v0c1_v0c0, 8)); \
-          uint32_t kernel_v10_c2 = __SXTB16(kernel_v1c3_v1c2_v0c3_v0c2); \
-          uint32_t kernel_v10_c3 = __SXTB16(__ROR(kernel_v1c3_v1c2_v0c3_v0c2, 8)); \
-          \
-          uint32_t tensor_v10_c0 = __PKHBT(tensor_v0_c20, tensor_v1_c20, 16); \
-          uint32_t tensor_v10_c1 = __PKHBT(tensor_v0_c31, tensor_v1_c31, 16); \
-          uint32_t tensor_v10_c2 = __PKHTB(tensor_v1_c20, tensor_v0_c20, 16); \
-          uint32_t tensor_v10_c3 = __PKHTB(tensor_v1_c31, tensor_v0_c31, 16); \
-          \
-          sum_c0 = __SMLAD(tensor_v10_c0, kernel_v10_c0, sum_c0); \
-          sum_c1 = __SMLAD(tensor_v10_c1, kernel_v10_c1, sum_c1); \
-          sum_c2 = __SMLAD(tensor_v10_c2, kernel_v10_c2, sum_c2); \
-          sum_c3 = __SMLAD(tensor_v10_c3, kernel_v10_c3, sum_c3); \
-        }}
-
-        /* We do four channels at once to get this speed boost. */
-        #ifdef __cplusplus
-        extern "C"
-        #endif
-        int32_t kernel_convolve_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}(
-            uint32_t *out,
-            uint32_t *tensor,
-            uint32_t *packed_kernel) {{
-
-          uint32_t sum_c0 = 0;
-          uint32_t sum_c1 = 0;
-          uint32_t sum_c2 = 0;
-          uint32_t sum_c3 = 0;
-
-          TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP(
-            packed_kernel,
-            *tensor,
-            *(tensor + {channels // 4}),
-            sum_c0, sum_c1, sum_c2, sum_c3)
-          TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP(
-            packed_kernel,
-            *(tensor + {(2) * channels // 4}),
-            *(tensor + {tensor_w * (channels // 4)}),
-            sum_c0, sum_c1, sum_c2, sum_c3)
-          TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP(
-            packed_kernel,
-            *(tensor + {(tensor_w + 1) * (channels // 4)}),
-            *(tensor + {(tensor_w + 2) * (channels // 4)}),
-            sum_c0, sum_c1, sum_c2, sum_c3)
-          TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP(
-            packed_kernel,
-            *(tensor + {(2 * tensor_w) * (channels // 4)}),
-            *(tensor + {(2 * tensor_w + 1) * (channels // 4)}),
-            sum_c0, sum_c1, sum_c2, sum_c3)
-          TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP(
-            packed_kernel,
-            *(tensor + {(2 * tensor_w + 2) * (channels // 4)}),
-            0,
-            sum_c0, sum_c1, sum_c2, sum_c3)
-
-          out[0] = sum_c0;
-          out[1] = sum_c1;
-          out[2] = sum_c2;
-          out[3] = sum_c3;
-          return 0;
-        }}
-
-        #undef TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP
-        """
-        )
-    )
diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
index 18c5082f2a0c..15ea2a31d864 100644
--- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
@@ -150,24 +150,37 @@ class TestDepthwiseConv2d_NHWC_HWOI(BasicDepthwiseConv2dTests):
 class TestDepthwiseConv2d_NHWC_HWOI_DSP(BasicDepthwiseConv2dTests):
     """This test is for depthwise_conv2d_nhwc_dsp.arm_cpu schedule."""
 
-    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
-        # The LLVM implementation doesn't support "SAME" and "VALID" padding,
-        # so padding must be explicitly specified.
-        # Depthwise_conv2d parameters from MobileNetV1 0.25x
-        ((1, 48, 48, 8), (3, 3), 8, (1, 1), 1, 1),
-        ((1, 48, 48, 16), (3, 3), 16, (2, 2), (1, 1, 0, 0), 1),
-        ((1, 24, 24, 32), (3, 3), 32, (1, 1), 1, 1),
-        ((1, 24, 24, 32), (3, 3), 32, (2, 2), (1, 1, 0, 0), 1),
-        ((1, 12, 12, 64), (3, 3), 64, (1, 1), 1, 1),
-        ((1, 12, 12, 64), (3, 3), 64, (2, 2), (1, 1, 0, 0), 1),
-        ((1, 6, 6, 128), (3, 3), 128, (1, 1), 1, 1),
-        ((1, 6, 6, 128), (3, 3), 128, (2, 2), (1, 1, 0, 0), 1),
-        ((1, 3, 3, 256), (3, 3), 256, (1, 1), 1, 1),
+    # Tests that work with both int8 and int16 data types. Tuple elements are:
+    # data_shape, kernel_size, num_filter, strides, padding
+    dtype_parameterized_tests = [
+        # Depthwise_conv2d parameters from MobileNetV1 0.25x. The LLVM implementation doesn't
+        # support "SAME" and "VALID" padding, so padding must be explicitly specified.
+        ((1, 48, 48, 8), (3, 3), 8, (1, 1), 1),
+        ((1, 48, 48, 16), (3, 3), 16, (2, 2), (1, 1, 0, 0)),
+        ((1, 24, 24, 32), (3, 3), 32, (1, 1), 1),
+        ((1, 24, 24, 32), (3, 3), 32, (2, 2), (1, 1, 0, 0)),
+        ((1, 12, 12, 64), (3, 3), 64, (1, 1), 1),
+        ((1, 12, 12, 64), (3, 3), 64, (2, 2), (1, 1, 0, 0)),
+        ((1, 6, 6, 128), (3, 3), 128, (1, 1), 1),
+        ((1, 6, 6, 128), (3, 3), 128, (2, 2), (1, 1, 0, 0)),
+        ((1, 3, 3, 256), (3, 3), 256, (1, 1), 1),
         # Asymmetric height and width
-        ((1, 25, 5, 64), (3, 3), 64, (1, 1), 1, 1),
+        ((1, 25, 5, 64), (3, 3), 64, (1, 1), 1),
+        # Larger kernel
+        ((1, 24, 24, 8), (5, 5), 8, (1, 1), 1),
+        # Asymmetric kernel
+        ((1, 24, 24, 8), (3, 5), 8, (1, 1), 1),
+    ]
+
+    data_shape, kernel_size, num_filter, strides, padding, dtype = tvm.testing.parameters(
+        # Make a copy of each parameterized test for int8 and one for int16
+        *map(lambda t: t + ("int8",), dtype_parameterized_tests),
+        *map(lambda t: t + ("int16",), dtype_parameterized_tests),
+        # Test the int16 implementation with channel numbers not divisible by four
+        ((1, 48, 48, 6), (3, 3), 6, (1, 1), 1, "int16"),
     )
+    dilation = tvm.testing.parameter(1)
     data_layout = tvm.testing.parameter("NHWC")
-    dtype = tvm.testing.parameter("int8")
     kernel_layout = tvm.testing.parameter("HWOI")
     schedule_name = tvm.testing.parameter("depthwise_conv2d_nhwc_dsp.arm_cpu")
 

From 9d1fe6d8d109062de46842fa049d78cd752b8e1b Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Wed, 28 Sep 2022 15:39:55 -0700
Subject: [PATCH 271/704] [Target] Add Ampere GPUs CUDA tags (#12930)

* [Target] Add Ampere GPUs CUDA tags

* Update tag.cc
---
 src/target/tag.cc | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/target/tag.cc b/src/target/tag.cc
index b59a23d8dcc5..0747769b1e04 100644
--- a/src/target/tag.cc
+++ b/src/target/tag.cc
@@ -111,6 +111,11 @@ TVM_REGISTER_CUDA_TAG("nvidia/tesla-c2075", "sm_20", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/tesla-c2050", "sm_20", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/tesla-c2070", "sm_20", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/nvidia-a100", "sm_80", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-a40", "sm_86", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-a30", "sm_80", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-a10", "sm_86", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-a16", "sm_86", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-a2", "sm_86", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/nvidia-t4", "sm_75", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/nvidia-v100", "sm_70", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/tesla-p100", "sm_60", 49152, 65536);
@@ -205,9 +210,13 @@ TVM_REGISTER_CUDA_TAG("nvidia/nvidia-nvs-310", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/nvs-5400m", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/nvs-5200m", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/nvs-4200m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3090-ti", "sm_86", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3090", "sm_86", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3080-ti", "sm_86", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3080", "sm_86", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3070-ti", "sm_86", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3070", "sm_86", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3060", "sm_86", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/nvidia-titan-rtx", "sm_75", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2080-ti", "sm_75", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2080", "sm_75", 49152, 65536);

From 8af43d3c11b13108eec0cea784961cf7b855c10f Mon Sep 17 00:00:00 2001
From: Janet Schneider <janet.schneider@gmail.com>
Date: Wed, 28 Sep 2022 17:02:13 -0700
Subject: [PATCH 272/704] [Hexagon] [runtime] Add user DMA to device API
 resource management (#12918)

---
 src/runtime/hexagon/hexagon_device_api.cc     |  4 +-
 src/runtime/hexagon/hexagon_device_api.h      | 20 +++++++-
 src/runtime/hexagon/hexagon_user_dma.cc       | 12 +++--
 src/runtime/hexagon/hexagon_user_dma.h        | 21 +++-----
 .../hexagon/hexagon_device_api_tests.cc       | 13 ++++-
 .../hexagon/hexagon_user_dma_tests.cc         | 48 ++++++++++---------
 6 files changed, 72 insertions(+), 46 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 84232a614428..06254fba4585 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -217,7 +217,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy").set_body([](TVMArgs args, TVM
 
   int ret = DMA_RETRY;
   do {
-    ret = HexagonUserDMA::Get().Copy(dst, src, size);
+    ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(dst, src, size);
   } while (ret == DMA_RETRY);
   *rv = static_cast<int32_t>(ret);
 });
@@ -227,7 +227,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.dma_wait").set_body([](TVMArgs args, TVM
   ICHECK(queue_id == 0 && "Hexagon supports just a single asynchronous queue for DMA");
   int inflight = args[1];
   ICHECK(inflight >= 0);
-  HexagonUserDMA::Get().Wait(inflight);
+  HexagonDeviceAPI::Global()->UserDMA()->Wait(inflight);
   *rv = static_cast<int32_t>(0);
 });
 
diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index 4f544faffba1..555ca0fa51a8 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -32,6 +32,7 @@
 #include "hexagon_buffer.h"
 #include "hexagon_buffer_manager.h"
 #include "hexagon_thread_manager.h"
+#include "hexagon_user_dma.h"
 
 namespace tvm {
 namespace runtime {
@@ -61,10 +62,18 @@ class HexagonDeviceAPI final : public DeviceAPI {
     CHECK_EQ(runtime_threads, nullptr);
     runtime_threads = std::make_unique<HexagonThreadManager>(threads, stack_size, pipe_size);
     DLOG(INFO) << "runtime_threads created";
+
+    CHECK_EQ(runtime_dma, nullptr);
+    runtime_dma = std::make_unique<HexagonUserDMA>();
+    DLOG(INFO) << "runtime_dma created";
   }
 
   //! \brief Ensures all runtime resources are freed
   void ReleaseResources() {
+    CHECK(runtime_dma) << "runtime_dma was not created in AcquireResources";
+    runtime_dma.reset();
+    DLOG(INFO) << "runtime_dma reset";
+
     CHECK(runtime_threads) << "runtime_threads was not created in AcquireResources";
     runtime_threads.reset();
     DLOG(INFO) << "runtime_threads reset";
@@ -150,7 +159,13 @@ class HexagonDeviceAPI final : public DeviceAPI {
   void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final;
 
   HexagonThreadManager* ThreadManager() {
-    return runtime_threads ? runtime_threads.get() : nullptr;
+    CHECK(runtime_threads) << "runtime_threads has not been created";
+    return runtime_threads.get();
+  }
+
+  HexagonUserDMA* UserDMA() {
+    CHECK(runtime_dma) << "runtime_dma has not been created";
+    return runtime_dma.get();
   }
 
  protected:
@@ -184,6 +199,9 @@ class HexagonDeviceAPI final : public DeviceAPI {
   const unsigned threads{6};
   const unsigned pipe_size{1000};
   const unsigned stack_size{0x4000};  // 16KB
+
+  //! \brief User DMA manager
+  std::unique_ptr<HexagonUserDMA> runtime_dma;
 };
 }  // namespace hexagon
 }  // namespace runtime
diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc
index 8d45b7590bc4..ab464c150319 100644
--- a/src/runtime/hexagon/hexagon_user_dma.cc
+++ b/src/runtime/hexagon/hexagon_user_dma.cc
@@ -21,6 +21,8 @@
 
 #include <algorithm>
 
+#include "hexagon_device_api.h"
+
 namespace tvm {
 namespace runtime {
 namespace hexagon {
@@ -116,13 +118,15 @@ HexagonUserDMA::~HexagonUserDMA() {
 }
 
 int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) {
+  HexagonUserDMA* user_dma = HexagonDeviceAPI::Global()->UserDMA();
+
   // One DMA transfer can copy at most DESC_LENGTH_MASK bytes.
   // Make the common case quick.
   if (length <= DESC_LENGTH_MASK) {
     // sync DMA -> `Copy` and then `Wait(0)`
-    int ret_val = HexagonUserDMA::Get().Copy(dst, src, length);
+    int ret_val = user_dma->Copy(dst, src, length);
     if (ret_val != DMA_SUCCESS) return ret_val;
-    HexagonUserDMA::Get().Wait(0);
+    user_dma->Wait(0);
     return DMA_SUCCESS;
   }
 
@@ -133,9 +137,9 @@ int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) {
     // Ensure there is no overflow while updating i
     uint32_t cur_len = std::min<uint32_t>(length - i, DESC_LENGTH_MASK);
     // sync DMA -> `Copy` and then `Wait(0)`
-    int ret_val = HexagonUserDMA::Get().Copy(&cast_dst[i], &cast_src[i], cur_len);
+    int ret_val = user_dma->Copy(&cast_dst[i], &cast_src[i], cur_len);
     if (ret_val != DMA_SUCCESS) return ret_val;
-    HexagonUserDMA::Get().Wait(0);
+    user_dma->Wait(0);
     // 2 cases for new val for i:
     // 1. length - i <= DESC_LENGTH_MASK (<= MAX_UINT)
     //    new_i = i + (length - i) = length, no more iter
diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h
index aa00df79c4d0..f8838ee2dcc9 100644
--- a/src/runtime/hexagon/hexagon_user_dma.h
+++ b/src/runtime/hexagon/hexagon_user_dma.h
@@ -37,6 +37,13 @@ namespace hexagon {
 
 class HexagonUserDMA {
  public:
+  HexagonUserDMA();
+  ~HexagonUserDMA();
+  HexagonUserDMA(const HexagonUserDMA&) = delete;
+  HexagonUserDMA& operator=(const HexagonUserDMA&) = delete;
+  HexagonUserDMA(HexagonUserDMA&&) = delete;
+  HexagonUserDMA& operator=(HexagonUserDMA&&) = delete;
+
   /*!
    * \brief Initiate DMA to copy memory from source to destination address
    * \param dst Destination address
@@ -59,21 +66,7 @@ class HexagonUserDMA {
    */
   uint32_t Poll();
 
-  //! \brief HexagonUserDMA uses the singleton pattern
-  static HexagonUserDMA& Get() {
-    static HexagonUserDMA* hud = new HexagonUserDMA();
-    return *hud;
-  }
-
  private:
-  // HexagonUserDMA uses the singleton pattern
-  HexagonUserDMA();
-  ~HexagonUserDMA();
-  HexagonUserDMA(const HexagonUserDMA&) = delete;
-  HexagonUserDMA& operator=(const HexagonUserDMA&) = delete;
-  HexagonUserDMA(HexagonUserDMA&&) = delete;
-  HexagonUserDMA& operator=(HexagonUserDMA&&) = delete;
-
   //! \brief Initializes the Hexagon User DMA engine
   unsigned int Init();
 
diff --git a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
index b54e40e87958..d0f962cfcee5 100644
--- a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
@@ -171,7 +171,16 @@ TEST_F(HexagonDeviceAPITest, thread_manager) {
   HexagonThreadManager* threads = hexapi->ThreadManager();
   CHECK(threads != nullptr);
   hexapi->ReleaseResources();
-  threads = hexapi->ThreadManager();
-  CHECK(threads == nullptr);
+  EXPECT_THROW(hexapi->ThreadManager(), InternalError);
+  hexapi->AcquireResources();
+}
+
+// Ensure thread manager is properly configured and destroyed
+// in Acquire/Release
+TEST_F(HexagonDeviceAPITest, user_dma) {
+  HexagonUserDMA* user_dma = hexapi->UserDMA();
+  CHECK(user_dma != nullptr);
+  hexapi->ReleaseResources();
+  EXPECT_THROW(hexapi->UserDMA(), InternalError);
   hexapi->AcquireResources();
 }
diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
index bf7a23712d7d..fb46cb3fd976 100644
--- a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
@@ -19,13 +19,14 @@
 
 #include <gtest/gtest.h>
 
-#include "../src/runtime/hexagon/hexagon_user_dma.h"
+#include "../src/runtime/hexagon/hexagon_device_api.h"
 
 using namespace tvm::runtime;
 using namespace tvm::runtime::hexagon;
 
 class HexagonUserDMATest : public ::testing::Test {
   void SetUp() override {
+    user_dma = HexagonDeviceAPI::Global()->UserDMA();
     src = malloc(length);
     dst = malloc(length);
     ASSERT_NE(src, nullptr);
@@ -44,6 +45,7 @@ class HexagonUserDMATest : public ::testing::Test {
   }
 
  public:
+  HexagonUserDMA* user_dma;
   int ret{0};
   void* src{nullptr};
   void* dst{nullptr};
@@ -53,29 +55,29 @@ class HexagonUserDMATest : public ::testing::Test {
 };
 
 TEST_F(HexagonUserDMATest, wait) {
-  HexagonUserDMA::Get().Wait(0);
-  HexagonUserDMA::Get().Wait(10);
+  user_dma->Wait(0);
+  user_dma->Wait(10);
 }
 
-TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(HexagonUserDMA::Get().Poll(), 0); }
+TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(user_dma->Poll(), 0); }
 
 TEST_F(HexagonUserDMATest, bad_copy) {
   uint64_t bigaddr = 0x100000000;
   void* src64 = reinterpret_cast<void*>(bigaddr);
   void* dst64 = reinterpret_cast<void*>(bigaddr);
   uint32_t biglength = 0x1000000;
-  ASSERT_NE(HexagonUserDMA::Get().Copy(dst64, src, length), DMA_SUCCESS);
-  ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src64, length), DMA_SUCCESS);
-  ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src, biglength), DMA_SUCCESS);
+  ASSERT_NE(user_dma->Copy(dst64, src, length), DMA_SUCCESS);
+  ASSERT_NE(user_dma->Copy(dst, src64, length), DMA_SUCCESS);
+  ASSERT_NE(user_dma->Copy(dst, src, biglength), DMA_SUCCESS);
 }
 
 TEST_F(HexagonUserDMATest, sync_dma) {
   // kick off 1 DMA
-  ret = HexagonUserDMA::Get().Copy(dst, src, length);
+  ret = user_dma->Copy(dst, src, length);
   ASSERT_EQ(ret, DMA_SUCCESS);
 
   // wait for DMA to complete
-  HexagonUserDMA::Get().Wait(0);
+  user_dma->Wait(0);
 
   // verify
   for (uint32_t i = 0; i < length; ++i) {
@@ -86,12 +88,12 @@ TEST_F(HexagonUserDMATest, sync_dma) {
 TEST_F(HexagonUserDMATest, async_dma_wait) {
   // kick off 10x duplicate DMAs
   for (uint32_t i = 0; i < 10; ++i) {
-    ret = HexagonUserDMA::Get().Copy(dst, src, length);
+    ret = user_dma->Copy(dst, src, length);
     ASSERT_EQ(ret, DMA_SUCCESS);
   }
 
   // wait for at least 1 DMA to complete
-  HexagonUserDMA::Get().Wait(9);
+  user_dma->Wait(9);
 
   // verify
   for (uint32_t i = 0; i < length; ++i) {
@@ -99,18 +101,18 @@ TEST_F(HexagonUserDMATest, async_dma_wait) {
   }
 
   // empty the DMA queue
-  HexagonUserDMA::Get().Wait(0);
+  user_dma->Wait(0);
 }
 
 TEST_F(HexagonUserDMATest, async_dma_poll) {
   // kick off 10x duplicate DMAs
   for (uint32_t i = 0; i < 10; ++i) {
-    ret = HexagonUserDMA::Get().Copy(dst, src, length);
+    ret = user_dma->Copy(dst, src, length);
     ASSERT_EQ(ret, DMA_SUCCESS);
   }
 
   // poll until at least 1 DMA is complete
-  while (HexagonUserDMA::Get().Poll() == 10) {
+  while (user_dma->Poll() == 10) {
   };
 
   // verify
@@ -119,7 +121,7 @@ TEST_F(HexagonUserDMATest, async_dma_poll) {
   }
 
   // empty the DMA queue
-  HexagonUserDMA::Get().Wait(0);
+  user_dma->Wait(0);
 }
 
 // TODO: Run non-pipelined case with sync DMA and execution time vs. pipelined case
@@ -128,26 +130,26 @@ TEST_F(HexagonUserDMATest, pipeline) {
   uint32_t pipeline_length = length / pipeline_depth;
 
   for (uint32_t i = 0; i < pipeline_depth; ++i) {
-    ret |= HexagonUserDMA::Get().Copy(dst_char + i * pipeline_length,
-                                      src_char + i * pipeline_length, pipeline_length);
+    ret |= user_dma->Copy(dst_char + i * pipeline_length, src_char + i * pipeline_length,
+                          pipeline_length);
   }
 
-  HexagonUserDMA::Get().Wait(3);
+  user_dma->Wait(3);
   for (uint32_t i = 0; i < pipeline_length; ++i) {
     dst_char[i]++;
   }
 
-  HexagonUserDMA::Get().Wait(2);
+  user_dma->Wait(2);
   for (uint32_t i = pipeline_length; i < 2 * pipeline_length; ++i) {
     dst_char[i]++;
   }
 
-  HexagonUserDMA::Get().Wait(1);
+  user_dma->Wait(1);
   for (uint32_t i = 2 * pipeline_length; i < 3 * pipeline_length; ++i) {
     dst_char[i]++;
   }
 
-  HexagonUserDMA::Get().Wait(0);
+  user_dma->Wait(0);
   for (uint32_t i = 3 * pipeline_length; i < 4 * pipeline_length; ++i) {
     dst_char[i]++;
   }
@@ -165,8 +167,8 @@ TEST_F(HexagonUserDMATest, overflow_ring_buffer) {
 
   for (uint32_t i = 0; i < number_of_dmas; ++i) {
     do {
-      ret = HexagonUserDMA::Get().Copy(dst_char + i * length_of_each_dma,
-                                       src_char + i * length_of_each_dma, length_of_each_dma);
+      ret = user_dma->Copy(dst_char + i * length_of_each_dma, src_char + i * length_of_each_dma,
+                           length_of_each_dma);
     } while (ret == DMA_RETRY);
     ASSERT_EQ(ret, DMA_SUCCESS);
   }

From 68f9509b0cece96b57581c3c21a145581b5a0365 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Wed, 28 Sep 2022 19:11:17 -0700
Subject: [PATCH 273/704] [TIR] Fix int64 dtype mismatch in Reindex (#12934)

---
 .../schedule/primitive/cache_read_write.cc    | 16 +++---
 .../unittest/test_tir_schedule_reindex.py     | 51 +++++++++++++++++++
 2 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index c76e6abaebb5..e9583adbbaa9 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -196,14 +196,16 @@ Block MakeReIndexStage(const Block& block, CacheStageInfo* info,
   // Step 1: Create block iters, access regions of the reindex block, and accessing indices to the
   // reindex buffer.
   for (const IterVar& iter : block->iter_vars) {
-    Var var("v" + std::to_string(new_block_iters.size()));
+    Var var("v" + std::to_string(new_block_iters.size()), iter->var->dtype);
     bool used = covered.count(iter->var);
-    new_block_iters.push_back(IterVar(/*dom=*/used ? iter->dom : Range::FromMinExtent(0, 1),
-                                      /*var=*/var,
-                                      /*IterVarType=*/kDataPar));
+    new_block_iters.push_back(
+        IterVar(/*dom=*/used ? iter->dom
+                             : Range::FromMinExtent(IntImm(var->dtype, 0), IntImm(var->dtype, 1)),
+                /*var=*/var,
+                /*IterVarType=*/kDataPar));
     if (used) {
       reindex_indices.push_back(var);
-      reindex_region.push_back(Range::FromMinExtent(var, 1));
+      reindex_region.push_back(Range::FromMinExtent(var, IntImm(var->dtype, 1)));
     }
     block_var_replace_map[iter->var] = var;
   }
@@ -254,7 +256,7 @@ Block MakeReIndexStage(const Block& block, CacheStageInfo* info,
   std::vector<Var> loop_vars;         // loop variables
   std::vector<PrimExpr> iter_values;  // bindings in block realize
   for (int i = 0; i < static_cast<int>(block->iter_vars.size()); ++i) {
-    Var loop_var("ax" + std::to_string(loop_vars.size()));
+    Var loop_var("ax" + std::to_string(loop_vars.size()), block->iter_vars[i]->var->dtype);
     loop_vars.push_back(loop_var);
     iter_values.push_back(loop_var);
   }
@@ -920,7 +922,7 @@ class ReIndexRewriter : public StmtExprMutator {
       for (const IterVar& iter : block->iter_vars) {
         if (covered_.count(iter->var)) {
           indices_.push_back(iter->var);
-          region_.push_back(Range::FromMinExtent(iter->var, 1));
+          region_.push_back(Range::FromMinExtent(iter->var, IntImm(iter->var->dtype, 1)));
         }
       }
       Block stmt = Downcast<Block>(StmtExprMutator::VisitStmt_(block));
diff --git a/tests/python/unittest/test_tir_schedule_reindex.py b/tests/python/unittest/test_tir_schedule_reindex.py
index c6776b0c8a3e..47b8b5cb88f4 100644
--- a/tests/python/unittest/test_tir_schedule_reindex.py
+++ b/tests/python/unittest/test_tir_schedule_reindex.py
@@ -168,6 +168,48 @@ def multiple_read(A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128, 128), "f
             B[vi, vj] = A[vj, vi] + A[vi, vj]
 
 
+@T.prim_func
+def mixed_dtype(
+    p0: T.Buffer[(T.int64(2), 1280), "float16"],
+    p1: T.Buffer[(1280, 1280), "float16"],
+    T_matmul_NT: T.Buffer[(T.int64(2), 1280), "float16"],
+) -> None:
+    for i0, i1, i2 in T.grid(T.int64(2), 1280, 1280):
+        with T.block("T_matmul_NT"):
+            i = T.axis.spatial(T.int64(2), i0)
+            j, k = T.axis.remap("SR", [i1, i2])
+            T.reads(p0[i, k], p1[j, k])
+            T.writes(T_matmul_NT[i, j])
+            with T.init():
+                T_matmul_NT[i, j] = T.float16(0)
+            T_matmul_NT[i, j] = T_matmul_NT[i, j] + p0[i, k] * p1[j, k]
+
+
+@T.prim_func
+def mixed_dtype_reindex_write(
+    p0: T.Buffer[(T.int64(2), 1280), "float16"],
+    p1: T.Buffer[(1280, 1280), "float16"],
+    T_matmul_NT: T.Buffer[(T.int64(2), 1280), "float16"],
+) -> None:
+    T_matmul_NT_reindex = T.alloc_buffer([T.int64(2), 1280], dtype="float16")
+    for i0, i1, i2 in T.grid(T.int64(2), 1280, 1280):
+        with T.block("T_matmul_NT"):
+            i = T.axis.spatial(T.int64(2), i0)
+            j, k = T.axis.remap("SR", [i1, i2])
+            T.reads(p0[i, k], p1[j, k])
+            T.writes(T_matmul_NT_reindex[i, j])
+            with T.init():
+                T_matmul_NT_reindex[i, j] = T.float16(0)
+            T_matmul_NT_reindex[i, j] = T_matmul_NT_reindex[i, j] + p0[i, k] * p1[j, k]
+    for ax0, ax1, ax2 in T.grid(T.int64(2), 1280, 1):
+        with T.block("T_matmul_NT_reindex"):
+            v0 = T.axis.spatial(T.int64(2), ax0)
+            v1, v2 = T.axis.remap("SS", [ax1, ax2])
+            T.reads(T_matmul_NT_reindex[v0, v1])
+            T.writes(T_matmul_NT[v0, v1])
+            T_matmul_NT[v0, v1] = T_matmul_NT_reindex[v0, v1]
+
+
 use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True})
 use_buffer_name = tvm.testing.parameter(by_dict={"buffer_index": False, "buffer_name": True})
 
@@ -207,5 +249,14 @@ def test_reindex_fail_multiple_read(use_block_name, use_buffer_name):
         sch.reindex(block, buf)
 
 
+def test_reindex_mixed_dtype(use_block_name, use_buffer_name):
+    sch = tir.Schedule(mixed_dtype)
+    block = "T_matmul_NT" if use_block_name else sch.get_block("T_matmul_NT")
+    buf = "T_matmul_NT" if use_buffer_name else ("write", 0)
+    sch.reindex(block, buf)
+    tvm.ir.assert_structural_equal(mixed_dtype_reindex_write, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=mixed_dtype)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 8c88aab77863e43985318c575a2fa648fc783338 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@octoml.ai>
Date: Thu, 29 Sep 2022 03:03:17 -0400
Subject: [PATCH 274/704] [Bugfix][CMake] Update the minimum CMake version to
 3.18 (#12682)

TVM has recently switched to C++17. CUDA support with C++17 requires CMake past version 3.18, according to vinx13. However, the current CMake version check in `CMakeLists.txt` is not checking for a sufficiently high CMake version; this PR updates it.

Bug that prompted this: When building with CMake version 3.16 I had the following error: `CUDA_STANDARD is set to invalid value '17'`. Upgrading to the latest CMake (3.24) fixed it.
---
 CMakeLists.txt               | 2 +-
 docs/install/from_source.rst | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 188f9fb1c7a8..e961c6d2d992 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.2)
+cmake_minimum_required(VERSION 3.18)
 project(tvm C CXX)
 
 # Utility functions
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 63d8aab33623..33328c586760 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -64,7 +64,7 @@ The minimal building requirements for the ``TVM`` libraries are:
       - Clang 5.0
       - Apple Clang 9.3
       - Visual Studio 2019 (v16.7)
-   - CMake 3.10 or higher
+   - CMake 3.18 or higher
    - We highly recommend to build with LLVM to enable all the features.
    - If you want to use CUDA, CUDA toolkit version >= 8.0 is required. If you are upgrading from an older version, make sure you purge the older version and reboot after installation.
    - On macOS, you may want to install `Homebrew <https://brew.sh>`_ to easily install and manage dependencies.
@@ -78,6 +78,9 @@ linux operating systems, execute (in a terminal):
     sudo apt-get update
     sudo apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
 
+
+Note that the version of CMake on apt may not be sufficiently up to date; it may be necessary to install it directly from `Kitware's third-party APT repository <https://apt.kitware.com/>`_.
+
 Use Homebrew to install the required dependencies for macOS running either the Intel or M1 processors. You must follow the post-installation steps specified by
 Homebrew to ensure the dependencies are correctly installed and configured:
 

From 5f132fd6c125a266764f9d0f37e2e5694bad5a55 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 29 Sep 2022 09:40:02 +0100
Subject: [PATCH 275/704] [ETHOSN] Support conversion of add/mul to requantize
 where possible (#12887)

* [ETHOSN] Support conversion of add/mul to requantize where possible

Add/mul operations that correspond to identity operations can be
converted to a simple reinterpret quantize operation. This conversion
takes place in the convert equivalents pass similar to the depthwise
counter-part.

In addtion, an issue was noticed that would cause unsupported
operations to raise an error rather than not being offloaded. This has
been fixed by allowing the conversion to return Null when the conversion
is not supported.
---
 python/tvm/relay/op/contrib/ethosn.py         |  82 +++--
 src/relay/backend/contrib/ethosn/codegen.cc   |  39 +++
 .../backend/contrib/ethosn/codegen_ethosn.h   |   1 +
 .../contrib/ethosn/convert_equivalent.cc      | 324 +++++++++++++-----
 .../backend/contrib/ethosn/ethosn_api.cc      |  36 ++
 src/relay/backend/contrib/ethosn/ethosn_api.h |  16 +
 .../contrib/test_ethosn/test_addition.py      |  70 +++-
 .../test_ethosn/test_convert_equivalents.py   | 318 ++++++++++++++++-
 .../contrib/test_ethosn/test_multiply.py      | 102 ++++--
 .../contrib/test_ethosn/test_networks.py      |  10 +-
 10 files changed, 832 insertions(+), 166 deletions(-)

diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index 6a318c602fd2..80cc1ca3b202 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -215,7 +215,7 @@ def qnn_mul_pattern():
         input_is_right = gen_mul_inputs(is_constant(), wildcard())
         return input_is_left | input_is_right
 
-    def qnn_add_pattern():
+    def qnn_add_pattern(has_constant_input=False):
         add_op = is_op("qnn.add")
         gen_add_inputs = lambda x, y: add_op(
             x,
@@ -227,11 +227,13 @@ def qnn_add_pattern():
             is_constant(),
             is_constant(),
         )
-        two_inputs = gen_add_inputs(wildcard(), wildcard())
-        input_is_left = gen_add_inputs(wildcard(), is_constant())
-        input_is_right = gen_add_inputs(is_constant(), wildcard())
 
-        return input_is_left | input_is_right | two_inputs
+        if has_constant_input:
+            input_is_left = gen_add_inputs(wildcard(), is_constant())
+            input_is_right = gen_add_inputs(is_constant(), wildcard())
+            return input_is_left | input_is_right
+        else:
+            return gen_add_inputs(wildcard(), wildcard())
 
     def qnn_conv2d_transpose_pattern():
         pattern = is_op("qnn.conv2d_transpose")(
@@ -299,16 +301,24 @@ def check_leaky_relu(extract):
 
         return _ethosn.leaky_relu(extract)
 
-    def check_mul(extract):
-        """Check if Mul is supported."""
+    def check_mul_to_reinterpret_quantize(extract):
+        """Check if Mul is supported by converting to reinterpret quantize"""
         if not ethosn_available():
             return False
-        # Do not support scalar constants for now
-        check_scalar = lambda i: isinstance(i, tvm.relay.Constant) and len(i.data.shape) == 0
-        if check_scalar(extract.args[0]) or check_scalar(extract.args[1]):
+
+        converted_extract = _ethosn.ConvertQnnMultiplyToReinterpretQuantize(extract)
+        if converted_extract:
+            return _ethosn.reinterpret_quantize(converted_extract)
+        return False
+
+    def check_mul_to_depthwise(extract):
+        """Check if Mul is supported by converting to a depthwise operation."""
+        if not ethosn_available():
             return False
-        extract = _ethosn.ConvertQnnMultiply(extract)
-        return _ethosn.conv2d(extract)
+        converted_extract = _ethosn.ConvertQnnMultiplyToDepthwise(extract)
+        if converted_extract:
+            return _ethosn.conv2d(converted_extract)
+        return False
 
     def check_requantize(extract):
         """Check if requantize is supported."""
@@ -328,19 +338,40 @@ def check_add(extract):
         """Check if an addition is supported by Ethos-N."""
         if not ethosn_available():
             return False
-        # Do not support scalar constants for now
-        check_scalar = lambda i: isinstance(i, tvm.relay.Constant) and len(i.data.shape) == 0
-        if check_scalar(extract.args[0]) or check_scalar(extract.args[1]):
-            return False
 
-        inputs = extract.args[0:2]
-        if any([isinstance(i, tvm.relay.Constant) for i in inputs]):
-            extract = _ethosn.ConvertQnnAdd(extract)
-            return _ethosn.conv2d(extract)
         return _ethosn.addition(extract)
 
+    def check_add_to_reinterpret_quantize(extract):
+        """Check if addition can be converted to a reinterpret quantize operation."""
+        if not ethosn_available():
+            return False
+        converted_extract = _ethosn.ConvertQnnAddToReinterpretQuantize(extract)
+        if converted_extract:
+            return _ethosn.reinterpret_quantize(converted_extract)
+        return False
+
+    def check_add_to_depthwise(extract):
+        """Check if addition can be converted to a depthwise operation."""
+        if not ethosn_available():
+            return False
+        converted_extract = _ethosn.ConvertQnnAddToDepthwise(extract)
+        if converted_extract:
+            return _ethosn.conv2d(converted_extract)
+        return False
+
     return [
-        ("ethos-n.qnn_mul", qnn_mul_pattern(), check_mul),
+        (
+            "ethos-n.qnn_mul_to_reinterpret_quantize",
+            qnn_mul_pattern(),
+            check_mul_to_reinterpret_quantize,
+        ),
+        ("ethos-n.qnn_mul_to_depthwise", qnn_mul_pattern(), check_mul_to_depthwise),
+        (
+            "ethos-n.qnn_add_to_reinterpret_quantize",
+            qnn_add_pattern(True),
+            check_add_to_reinterpret_quantize,
+        ),
+        ("ethos-n.qnn_add_to_depthwise", qnn_add_pattern(True), check_add_to_depthwise),
         ("ethos-n.qnn_add", qnn_add_pattern(), check_add),
         ("ethos-n.qnn_conv2d", qnn_conv_pattern(), check_conv2d),
         ("ethos-n.qnn_conv2d_transpose", qnn_conv2d_transpose_pattern(), check_conv2d_transpose),
@@ -355,15 +386,6 @@ def check_add(extract):
     ]
 
 
-def _is_ethosn_composite(node):
-    if isinstance(node, tvm.relay.expr.Call) and isinstance(node.op, tvm.relay.Function):
-        if "Composite" in node.op.attrs:
-            comp_name = node.op.attrs["Composite"]
-            return comp_name.split(".")[0] == "ethos-n"
-
-    return False
-
-
 @tvm.ir.register_op_attr("nn.max_pool2d", "target.ethos-n")
 def max_pool2d(expr):
     """Check if a max pool2d is supported by Ethos-N."""
diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index c7109b754d2b..46420775ae5b 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -152,6 +152,10 @@ void InferTensorsVisitor::InferCall(const CallNode* cn) {
     RequantizeParams params;
     err += EthosnAPI::Requantize(cn->op.as<FunctionNode>()->body, &params);
     tensor_table_[cn->args[0]] = {params.input_info};
+  } else if (IsEthosnFunc(call, "ethos-n.qnn_reinterpret_quantize")) {
+    ReinterpretQuantizationParams params;
+    err += EthosnAPI::ReinterpretQuantize(cn->op.as<FunctionNode>()->body, &params);
+    tensor_table_[cn->args[0]] = {params.input_info};
   } else if (IsEthosnFunc(call, "ethos-n.qnn_resize")) {
     ResizeParams params;
     err += EthosnAPI::Resize(cn->op.as<FunctionNode>()->body, &params);
@@ -333,6 +337,9 @@ sl::TensorsAndId ConstructNetworkVisitor::HandleCall(const CallNode* cn) {
   } else if (IsEthosnFunc(call, "ethos-n.qnn_requantize")) {
     if ((err = MakeRequantizeLayer(call, &tensor))) ReportFatalError(call, err);
     return MakeOps(tensor);
+  } else if (IsEthosnFunc(call, "ethos-n.qnn_reinterpret_quantize")) {
+    if ((err = MakeReinterpretQuantizeLayer(call, &tensor))) ReportFatalError(call, err);
+    return MakeOps(tensor);
   } else if (IsEthosnFunc(call, "ethos-n.qnn_resize")) {
     if ((err = MakeResizeLayer(call, &tensor))) ReportFatalError(call, err);
     return MakeOps(tensor);
@@ -654,6 +661,24 @@ EthosnError ConstructNetworkVisitor::MakeRequantizeLayer(const Call& call,
   return EthosnError();
 }
 
+EthosnError ConstructNetworkVisitor::MakeReinterpretQuantizeLayer(
+    const Call& call, sl::TensorAndId<sl::Operand>* out) {
+  ReinterpretQuantizationParams params;
+  params.input_info = GetTensorInfo(tensor_table_, call);
+  if (auto err = EthosnAPI::ReinterpretQuantize(call->op.as<FunctionNode>()->body, &params)) {
+    return err;
+  }
+
+  auto input = operand_table_[call->args[0]][0];
+
+  try {
+    *out = AddReinterpretQuantization(network_, *input, params.reinterpret_quantize_info);
+  } catch (const sl::NotSupportedException& e) {
+    return EthosnError(e.what());
+  }
+  return EthosnError();
+}
+
 EthosnError ConstructNetworkVisitor::MakeResizeLayer(const Call& call,
                                                      sl::TensorAndId<sl::Operand>* out) {
   ResizeParams params;
@@ -1022,6 +1047,20 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.requantize")
       err += EthosnError(reason);
     });
 
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.reinterpret_quantize")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      ReinterpretQuantizationParams params;
+      auto err = EthosnAPI::ReinterpretQuantize(call, &params);
+      err += EthosnCompiler::SupportedSetup();
+      char reason[kReasonMaxLength];
+      reason[0] = '\0';
+      *rv = !err && EthosnCompiler::GetSupported()->IsReinterpretQuantizationSupported(
+                        params.reinterpret_quantize_info, params.input_info, &params.output_info,
+                        reason, sizeof(reason));
+      err += EthosnError(reason);
+    });
+
 TVM_REGISTER_GLOBAL("relay.ethos-n.support.resize")
     .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
       Call call = args[0];
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index a653b0b8dc97..ab853599aa2d 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -213,6 +213,7 @@ class ConstructNetworkVisitor : public MixedModeVisitor, private ErrorReportingP
   EthosnError MakeReluLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeLeakyReLULayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeRequantizeLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
+  EthosnError MakeReinterpretQuantizeLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeResizeLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
 
   /*! \brief A look-up table from Expr to layers. */
diff --git a/src/relay/backend/contrib/ethosn/convert_equivalent.cc b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
index 91c924b1b04f..7f4e1a3c5045 100644
--- a/src/relay/backend/contrib/ethosn/convert_equivalent.cc
+++ b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
@@ -39,37 +39,63 @@ namespace relay {
 namespace contrib {
 namespace ethosn {
 
+/*!
+ * \brief Helper class to extract inputs and quantization information from binary
+ * elementwise operations ready to convert.
+ */
+class BinaryElementwiseParams {
+ public:
+  static BinaryElementwiseParams ExtractBinaryElementwiseParams(const Call& call) {
+    auto params = BinaryElementwiseParams();
+    params.input1 = call->args[0];
+    params.input2 = call->args[1];
+    params.input1_scale = call->args[2];
+    params.input1_zero_point = call->args[3];
+    params.input2_scale = call->args[4];
+    params.input2_zero_point = call->args[5];
+    // Reverse the inputs if the constant is first input
+    if (call->args[0]->IsInstance<ConstantNode>()) {
+      params.input1 = call->args[1];
+      params.input2 = call->args[0];
+      params.input1_scale = call->args[4];
+      params.input1_zero_point = call->args[5];
+      params.input2_scale = call->args[2];
+      params.input2_zero_point = call->args[3];
+    }
+    params.output_scale = call->args[6];
+    params.output_zero_point = call->args[7];
+    return params;
+  }
+
+  Expr input1;
+  Expr input2;
+  Expr input1_scale;
+  Expr input1_zero_point;
+  Expr input2_scale;
+  Expr input2_zero_point;
+  Expr output_scale;
+  Expr output_zero_point;
+};
+
 /*!
  * \brief Converts qnn.mul to mathematically equivalent
  * qnn.conv2d depthwise operation.
+ *
+ * \param expr The expression to attempt to convert.
+ *
+ * \return Null if conversion is not supported else the converted expression.
  */
-Expr ConvertQnnMultiply(const Expr& expr) {
+Optional<Expr> ConvertQnnMultiplyToDepthwise(const Expr& expr) {
   Call call = Downcast<Call>(expr);
+  const auto params = BinaryElementwiseParams::ExtractBinaryElementwiseParams(call);
 
-  Expr input1 = call->args[0];
-  Expr input2 = call->args[1];
-  Expr input1_scale = call->args[2];
-  Expr input1_zero_point = call->args[3];
-  Expr input2_scale = call->args[4];
-  Expr input2_zero_point = call->args[5];
-  // Reverse the inputs if the constant is first input
-  if (call->args[0]->IsInstance<ConstantNode>()) {
-    input1 = call->args[1];
-    input2 = call->args[0];
-    input1_scale = call->args[4];
-    input1_zero_point = call->args[5];
-    input2_scale = call->args[2];
-    input2_zero_point = call->args[3];
+  Constant input_constant = Downcast<Constant>(params.input2);
+  TensorType input_constant_tt = Downcast<TensorType>(input_constant->checked_type());
+  TensorType input_tt = Downcast<TensorType>(call->checked_type());
+  int channels = Downcast<IntImm>(input_tt->shape.back())->value;
+  if (channels != Downcast<IntImm>(input_constant_tt->Size())->value) {
+    return NullOpt;
   }
-  Expr output_scale = call->args[6];
-  Expr output_zero_point = call->args[7];
-
-  const auto* input_constant = input2.as<ConstantNode>();
-  ICHECK(input_constant) << "Expected ConstantNode but got " << input2->GetTypeKey();
-  Type input_constant_type = input_constant->checked_type();
-  const auto* input_constant_tt = input_constant_type.as<TensorTypeNode>();
-  ICHECK(input_constant) << "Expected TensorTypeNode but got " << input_constant_type->GetTypeKey();
-  int channels = input_constant_tt->shape.back().as<IntImmNode>()->value;
 
   runtime::NDArray input_data = input_constant->data;
   runtime::NDArray kernel_data_hwoi =
@@ -77,62 +103,53 @@ Expr ConvertQnnMultiply(const Expr& expr) {
   kernel_data_hwoi.CopyFrom(input_data);
   Constant kernel = Constant(kernel_data_hwoi, input_constant->span);
 
-  Type output_type = expr->checked_type();
-  auto output_tt = output_type.as<TensorTypeNode>();
-  ICHECK(output_tt) << "Expected TensorTypeNode but got " << output_type->GetTypeKey();
+  TensorType output_tt = Downcast<TensorType>(expr->checked_type());
   DataType output_dtype = output_tt->dtype;
 
-  Expr conv2d = qnn::MakeQnnConv2D(
-      input1, kernel, input1_zero_point, input2_zero_point, input1_scale, input2_scale, {1, 1},
-      {0, 0, 0, 0}, {1, 1}, channels, channels, {1, 1}, "NHWC", "HWOI", "NHWC", DataType::Int(32));
+  Expr conv2d =
+      qnn::MakeQnnConv2D(params.input1, kernel, params.input1_zero_point, params.input2_zero_point,
+                         params.input1_scale, params.input2_scale, {1, 1}, {0, 0, 0, 0}, {1, 1},
+                         channels, channels, {1, 1}, "NHWC", "HWOI", "NHWC", DataType::Int(32));
   Constant bias_data = MakeConstantZeros(DataType::Int(32), {channels});
   Expr bias_add = MakeBiasAdd(conv2d, bias_data, 3);
-  Expr requantize = qnn::MakeRequantize(bias_add, input1_scale, input1_zero_point, output_scale,
-                                        output_zero_point, -1, "None", "None", output_dtype);
+  Expr requantize = qnn::MakeRequantize(bias_add, params.input1_scale, params.input1_zero_point,
+                                        params.output_scale, params.output_zero_point, -1, "None",
+                                        "None", output_dtype);
 
-  return InferType(requantize);
+  try {
+    requantize = InferType(requantize);
+    return requantize;
+  } catch (tvm::Error& e) {
+    // Conversion produced an invalid op.
+    return NullOpt;
+  }
 }
 
-TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnMultiply")
-    .set_body_typed(ConvertQnnMultiply);
-
 /*!
  * \brief Converts qnn.add to a mathematically equivalent
  * qnn.conv2d depthwise operation.
+ *
+ * \param expr The expression to attempt to convert.
+ *
+ * \return Null if conversion is not supported else the converted expression.
  */
-Expr ConvertQnnAdd(const Expr& expr) {
+Optional<Expr> ConvertQnnAddToDepthwise(const Expr& expr) {
   Call call = Downcast<Call>(expr);
+  const auto params = BinaryElementwiseParams::ExtractBinaryElementwiseParams(call);
 
-  Expr input1 = call->args[0];
-  Expr input2 = call->args[1];
-  Expr input1_scale = call->args[2];
-  Expr input1_zero_point = call->args[3];
-  Expr input2_scale = call->args[4];
-  Expr input2_zero_point = call->args[5];
-  // Reverse the inputs if the constant is first input
-  if (call->args[0]->IsInstance<ConstantNode>()) {
-    input1 = call->args[1];
-    input2 = call->args[0];
-    input1_scale = call->args[4];
-    input1_zero_point = call->args[5];
-    input2_scale = call->args[2];
-    input2_zero_point = call->args[3];
+  Constant input_constant = Downcast<Constant>(params.input2);
+  TensorType input_constant_tt = Downcast<TensorType>(input_constant->checked_type());
+  TensorType input_tt = Downcast<TensorType>(call->checked_type());
+  int channels = Downcast<IntImm>(input_tt->shape.back())->value;
+  if (channels != Downcast<IntImm>(input_constant_tt->Size())->value) {
+    return NullOpt;
   }
-  Expr output_scale = call->args[6];
-  Expr output_zero_point = call->args[7];
-
-  const auto* input_constant = input2.as<ConstantNode>();
-  ICHECK(input_constant) << "Expected ConstantNode but got " << input2->GetTypeKey();
-  Type input_constant_type = input_constant->checked_type();
-  const auto* input_constant_tt = input_constant_type.as<TensorTypeNode>();
-  ICHECK(input_constant) << "Expected TensorTypeNode but got " << input_constant_type->GetTypeKey();
-  int channels = input_constant_tt->shape.back().as<IntImmNode>()->value;
 
   // Create the identity kernel. The kernel data is constructed such that it produces an identity
   // operation in the quantized space. Therefore, the input is not scaled in any way which allows
   // us to later use the bias to perform the addition.
-  float input_scale_value = GetScalarFromConstant<float>(input1_scale);
-  float output_scale_value = GetScalarFromConstant<float>(output_scale);
+  float input_scale_value = GetScalarFromConstant<float>(params.input1_scale);
+  float output_scale_value = GetScalarFromConstant<float>(params.output_scale);
   float identity_kernel_scale_ub = std::min(output_scale_value / input_scale_value, 1.f);
   float identity_kernel_scale_lb = (1.f / 255.f);
   float identity_kernel_scale_target = (identity_kernel_scale_ub + identity_kernel_scale_lb) / 2.f;
@@ -153,25 +170,131 @@ Expr ConvertQnnAdd(const Expr& expr) {
       MakeConstantScalar(DataType::Float(32), input_scale_value * identity_kernel_scale_value);
   Constant bias_zero_point = MakeConstantScalar(DataType::Int(32), 0);
   Expr requantize_bias =
-      qnn::MakeRequantize(input2, input2_scale, input2_zero_point, bias_scale, bias_zero_point, -1,
-                          "None", "None", DataType::Int(32));
+      qnn::MakeRequantize(params.input2, params.input2_scale, params.input2_zero_point, bias_scale,
+                          bias_zero_point, -1, "None", "None", DataType::Int(32));
   Expr reshape_bias = MakeReshape(requantize_bias, {channels});
-  Constant bias = Downcast<Constant>(FoldConstantExpr(reshape_bias));
+
+  try {
+    reshape_bias = FoldConstantExpr(reshape_bias);
+  } catch (tvm::Error& e) {
+    // Conversion produced an invalid op.
+    return NullOpt;
+  }
+  Constant bias = Downcast<Constant>(reshape_bias);
 
   // Make depthwise conv2d operation
-  Expr conv2d =
-      qnn::MakeQnnConv2D(input1, identity_kernel, input1_zero_point, identity_kernel_zero_point,
-                         input1_scale, identity_kernel_scale, {1, 1}, {0, 0, 0, 0}, {1, 1},
-                         channels, channels, {1, 1}, "NHWC", "HWOI", "NHWC", DataType::Int(32));
+  Expr conv2d = qnn::MakeQnnConv2D(params.input1, identity_kernel, params.input1_zero_point,
+                                   identity_kernel_zero_point, params.input1_scale,
+                                   identity_kernel_scale, {1, 1}, {0, 0, 0, 0}, {1, 1}, channels,
+                                   channels, {1, 1}, "NHWC", "HWOI", "NHWC", DataType::Int(32));
   Expr bias_add = MakeBiasAdd(conv2d, bias, 3);
-  Expr requantize =
-      qnn::MakeRequantize(bias_add, input1_scale, input1_zero_point, output_scale,
-                          output_zero_point, -1, "None", "None", input_constant_tt->dtype);
+  Expr requantize = qnn::MakeRequantize(bias_add, params.input1_scale, params.input1_zero_point,
+                                        params.output_scale, params.output_zero_point, -1, "None",
+                                        "None", input_constant_tt->dtype);
 
-  return InferType(requantize);
+  try {
+    return InferType(requantize);
+  } catch (tvm::Error& e) {
+    // Conversion produced an invalid op.
+    return NullOpt;
+  }
 }
 
-TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnAdd").set_body_typed(ConvertQnnAdd);
+/*!
+ * \brief Converts qnn.mul to a mathematically equivalent qnn.requantize operation.
+ * When converting to support library API, a reinterpret quantize operation will be created.
+ *
+ * \param expr The expression to attempt to convert.
+ *
+ * \return Null if conversion is not supported else the converted expression.
+ */
+Optional<Expr> ConvertQnnMultiplyToReinterpretQuantize(const Expr& expr) {
+  Call call = Downcast<Call>(expr);
+  const auto params = BinaryElementwiseParams::ExtractBinaryElementwiseParams(call);
+
+  Constant input_constant = Downcast<Constant>(params.input2);
+  TensorType input_constant_tt = Downcast<TensorType>(input_constant->checked_type());
+  if (Downcast<IntImm>(input_constant_tt->Size())->value != 1) {
+    return NullOpt;
+  }
+
+  float input_scale_value = GetScalarFromConstant<float>(params.input1_scale);
+  float constant_scale_value = GetScalarFromConstant<float>(params.input2_scale);
+  int constant_zero_point_value = GetScalarFromConstant<int>(params.input2_zero_point);
+  float new_output_scale_value = input_scale_value * constant_scale_value *
+                                 (ToScalar(input_constant->data) - constant_zero_point_value);
+  Constant new_output_scale = MakeConstantScalar(DataType::Float(32), new_output_scale_value);
+
+  if (std::abs(new_output_scale_value - GetScalarFromConstant<float>(params.output_scale)) >
+      0.004f) {
+    // Multiply does not represent an identity operation so don't convert.
+    return NullOpt;
+  }
+
+  DataType output_data_type = Downcast<TensorType>(call->checked_type())->dtype;
+
+  // A requantize operation is used to represent the identity reinterperet quantize op in
+  // the support library at this stage. That is requantize is used here as a means for
+  // passing the quantization information to the API conversion layer.
+  Expr requantize = qnn::MakeRequantize(
+      params.input1, params.input1_scale, params.input1_zero_point, params.output_scale,
+      params.output_zero_point, -1, "None", "None", output_data_type);
+
+  try {
+    return InferType(requantize);
+  } catch (tvm::Error& e) {
+    // Conversion produced an invalid op.
+    return NullOpt;
+  }
+}
+
+/*!
+ * \brief Converts qnn.mul to a mathematically equivalent qnn.requantize operation.
+ * When converting to support library API, a reinterpret quantize operation will be created.
+ *
+ * \param expr The expression to attempt to convert.
+ *
+ * \return Null if conversion is not supported else the converted expression.
+ */
+Optional<Expr> ConvertQnnAddToReinterpretQuantize(const Expr& expr) {
+  Call call = Downcast<Call>(expr);
+  const auto params = BinaryElementwiseParams::ExtractBinaryElementwiseParams(call);
+
+  Constant input_constant = Downcast<Constant>(params.input2);
+  TensorType input_constant_tt = Downcast<TensorType>(input_constant->checked_type());
+  if (Downcast<IntImm>(input_constant_tt->Size())->value != 1) {
+    return NullOpt;
+  }
+
+  float input_scale = GetScalarFromConstant<float>(params.input1_scale);
+  int input_zero_point = GetScalarFromConstant<int>(params.input1_zero_point);
+  float scalar_scale = GetScalarFromConstant<float>(params.input2_scale);
+  int scalar_zero_point = GetScalarFromConstant<int>(params.input2_zero_point);
+  int output_zero_point_value = GetScalarFromConstant<int>(params.output_zero_point);
+  float scalar_value = (ToScalar(input_constant->data) - scalar_zero_point) * scalar_scale;
+
+  float new_output_zero_point_value = input_zero_point - (scalar_value / input_scale);
+  if (new_output_zero_point_value - output_zero_point_value > 1.0f) {
+    // Add does not represent an identity operation so don't convert
+    return NullOpt;
+  }
+
+  DataType output_data_type = Downcast<TensorType>(call->checked_type())->dtype;
+
+  // A requantize operation is used to represent the identity reinterperet quantize op in
+  // the support library at this stage. That is requantize is used here as a means for
+  // passing the quantization information to the API conversion layer.
+  Expr requantize = qnn::MakeRequantize(
+      params.input1, params.input1_scale, params.input1_zero_point, params.output_scale,
+      params.output_zero_point, -1, "None", "None", output_data_type);
+
+  try {
+    return InferType(requantize);
+  } catch (tvm::Error& e) {
+    // Conversion produced an invalid op.
+    return NullOpt;
+  }
+}
 
 class ConvertEquivalentsMutator : public MixedModeMutator {
  public:
@@ -184,29 +307,34 @@ class ConvertEquivalentsMutator : public MixedModeMutator {
     Function func = Downcast<Function>(call->op);
     Function new_func = Function(func);
     auto composite_name = func->GetAttr<String>(attr::kComposite);
-    if (composite_name == "ethos-n.qnn_mul") {
-      Expr new_func_body = ConvertQnnMultiply(func->body);
-      new_func = WithFields(func, func->params, new_func_body);
-      new_func = WithAttr(std::move(new_func), attr::kComposite, String("ethos-n.qnn_conv2d"));
-    } else if (composite_name == "ethos-n.qnn_add" && CheckCanConvertAdd(func->body)) {
-      Expr new_func_body = ConvertQnnAdd(func->body);
-      new_func = WithFields(func, func->params, new_func_body);
-      new_func = WithAttr(std::move(new_func), attr::kComposite, String("ethos-n.qnn_conv2d"));
+
+    Optional<Expr> optional_new_func_body;
+    String new_composite_name = "";
+    if (composite_name == "ethos-n.qnn_mul_to_reinterpret_quantize") {
+      optional_new_func_body = ConvertQnnMultiplyToReinterpretQuantize(func->body);
+      new_composite_name = "ethos-n.qnn_reinterpret_quantize";
+    } else if (composite_name == "ethos-n.qnn_mul_to_depthwise") {
+      optional_new_func_body = ConvertQnnMultiplyToDepthwise(func->body);
+      new_composite_name = "ethos-n.qnn_conv2d";
+    } else if (composite_name == "ethos-n.qnn_add_to_reinterpret_quantize") {
+      optional_new_func_body = ConvertQnnAddToReinterpretQuantize(func->body);
+      new_composite_name = "ethos-n.qnn_reinterpret_quantize";
+    } else if (composite_name == "ethos-n.qnn_add_to_depthwise") {
+      optional_new_func_body = ConvertQnnAddToDepthwise(func->body);
+      new_composite_name = "ethos-n.qnn_conv2d";
+    }
+
+    if (new_composite_name != "") {
+      ICHECK(optional_new_func_body)
+          << "Operation " << composite_name
+          << " was marked as having a valid conversion, but it could not be converted.";
+      new_func = WithFields(func, func->params, optional_new_func_body.value());
+      new_func = WithAttr(std::move(new_func), attr::kComposite, new_composite_name);
     }
 
     Call new_call = WithFields(call, new_func);
     return Downcast<Expr>(new_call);
   }
-
- private:
-  /*!
-   * \brief Check whether add can be converted to depthwise, or whether
-   * it should be offloaded as a normal add operation.
-   */
-  bool CheckCanConvertAdd(const Expr& expr) {
-    Call call = Downcast<Call>(expr);
-    return call->args[0]->IsInstance<ConstantNode>() || call->args[1]->IsInstance<ConstantNode>();
-  }
 };
 
 tvm::transform::Pass ConvertEquivalents() {
@@ -229,6 +357,18 @@ tvm::transform::Pass ConvertEquivalents() {
       pass_func, 0, "relay.backend.contrib.ethos-n.ConvertEquivalents", {"InferType"});
 }
 
+TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnMultiplyToDepthwise")
+    .set_body_typed(ConvertQnnMultiplyToDepthwise);
+
+TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnAddToDepthwise")
+    .set_body_typed(ConvertQnnAddToDepthwise);
+
+TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnMultiplyToReinterpretQuantize")
+    .set_body_typed(ConvertQnnMultiplyToReinterpretQuantize);
+
+TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnAddToReinterpretQuantize")
+    .set_body_typed(ConvertQnnAddToReinterpretQuantize);
+
 TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertEquivalents")
     .set_body_typed(ConvertEquivalents);
 
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc
index ce57cc23419a..dbcdecd8f382 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.cc
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc
@@ -809,6 +809,42 @@ EthosnError EthosnAPI::Requantize(const Expr& expr, RequantizeParams* params) {
   return err;
 }
 
+EthosnError EthosnAPI::ReinterpretQuantize(const Expr& expr,
+                                           ReinterpretQuantizationParams* params) {
+  Call call = Downcast<Call>(expr);
+  const auto* input_ttype = call->args[0]->checked_type().as<TensorTypeNode>();
+  sl::TensorShape input_tensor_shape = {1, 1, 1, 1};
+  sl::DataType input_data_type;
+  EthosnError err = Tvm2Npu(input_ttype->shape, &input_tensor_shape);
+  err += Tvm2Npu(input_ttype->dtype, &input_data_type);
+
+  const auto* output_ttype = call->checked_type().as<TensorTypeNode>();
+  sl::TensorShape output_tensor_shape = {1, 1, 1, 1};
+  sl::DataType output_data_type;
+  err += Tvm2Npu(output_ttype->shape, &output_tensor_shape);
+  err += Tvm2Npu(output_ttype->dtype, &output_data_type);
+
+  float input_sc, output_sc;
+  int input_zp, output_zp;
+  err += AsConstant(call->args[1], &input_sc);
+  err += AsConstant(call->args[2], &input_zp);
+  err += AsConstant(call->args[3], &output_sc);
+  err += AsConstant(call->args[4], &output_zp);
+
+  sl::QuantizationInfo input_q_info;
+  err += Tvm2Npu(input_zp, input_sc, &input_q_info);
+  params->input_info =
+      sl::TensorInfo(input_tensor_shape, input_data_type, sl::DataFormat::NHWC, input_q_info);
+
+  sl::QuantizationInfo reinterpret_quantize_q_info;
+  err += Tvm2Npu(output_zp, output_sc, &reinterpret_quantize_q_info);
+  params->reinterpret_quantize_info = sl::ReinterpretQuantizationInfo(reinterpret_quantize_q_info);
+
+  params->output_info = sl::TensorInfo(output_tensor_shape, output_data_type, sl::DataFormat::NHWC,
+                                       reinterpret_quantize_q_info);
+  return err;
+}
+
 EthosnError EthosnAPI::Resize(const Expr& expr, ResizeParams* params) {
   Call requantize = Downcast<Call>(expr);
   Call resize = Downcast<Call>(requantize->args[0]);
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.h b/src/relay/backend/contrib/ethosn/ethosn_api.h
index 167106c3d06d..3d704f2757c6 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.h
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.h
@@ -157,6 +157,12 @@ struct RequantizeParams {
   sl::TensorInfo output_info;
 };
 
+struct ReinterpretQuantizationParams {
+  sl::ReinterpretQuantizationInfo reinterpret_quantize_info;
+  sl::TensorInfo input_info;
+  sl::TensorInfo output_info;
+};
+
 struct ResizeParams {
   sl::ResizeInfo resize_info;
   sl::TensorInfo input_info;
@@ -261,6 +267,16 @@ class EthosnAPI {
   static EthosnError Relu(const Expr& expr, ReluParams* params);
   /*! \brief Extract the Support Library requantize params from a Relay qnn.requantize call */
   static EthosnError Requantize(const Expr& expr, RequantizeParams* params);
+
+  /*!
+   * \brief Extact the Support Library reinterpret quantization params from a Relay qnn.requantize
+   * call.
+   *
+   * \note This is used for the conversion from add and mul to a reinterpret quantization operator.
+   * This is effectively an identity operation, as not the same as 'requantize'.
+   */
+  static EthosnError ReinterpretQuantize(const Expr& expr, ReinterpretQuantizationParams* params);
+
   /*! \brief Extract the Support Library resize params from a Relay resize call */
   static EthosnError Resize(const Expr& expr, ResizeParams* params);
 
diff --git a/tests/python/contrib/test_ethosn/test_addition.py b/tests/python/contrib/test_ethosn/test_addition.py
index 72981182e17f..11d8b8d1cd56 100644
--- a/tests/python/contrib/test_ethosn/test_addition.py
+++ b/tests/python/contrib/test_ethosn/test_addition.py
@@ -37,6 +37,7 @@ def _get_model(
     dtype,
     lhs_is_constant=False,
     rhs_is_constant=False,
+    constant_data=None,
 ):
     """Return a model and any parameters it may have"""
 
@@ -45,13 +46,14 @@ def _get_model(
     data_max = iinfo.max
 
     if lhs_is_constant:
-        a_data = np.random.randint(data_min, data_max + 1, size=lhs_shape, dtype=dtype)
+        a_data = np.array(constant_data, dtype=dtype).reshape(lhs_shape)
         a = relay.const(a_data, dtype=dtype)
     else:
         a = relay.var("a", shape=lhs_shape, dtype=dtype)
 
     if rhs_is_constant:
-        b_data = np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype)
+        b_data = np.array(constant_data, dtype=dtype).reshape(rhs_shape)
+        np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype)
         b = relay.const(b_data, dtype=dtype)
     else:
         b = relay.var("b", shape=rhs_shape, dtype=dtype)
@@ -117,13 +119,15 @@ def test_addition(dtype, shape):
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 @pytest.mark.parametrize(
-    "lhs_shape,rhs_shape",
+    "lhs_shape,lhs_is_constant,rhs_shape,rhs_is_constant",
     [
-        ((1, 4, 4, 8), (1, 1, 1, 8)),
-        ((1, 16, 12, 4), (4,)),
+        ((1, 4, 4, 8), False, (1, 1, 1, 8), True),
+        ((4,), True, (1, 16, 12, 4), False),
+        ((1, 1, 1, 8), True, (1, 4, 4, 8), False),
+        ((1, 16, 12, 4), False, (4,), True),
     ],
 )
-def test_addition_to_depthwise_rhs_constant(dtype, lhs_shape, rhs_shape):
+def test_addition_to_depthwise(dtype, lhs_shape, lhs_is_constant, rhs_shape, rhs_is_constant):
     """Compare addition to depthwise with TVM."""
     np.random.seed(0)
 
@@ -132,6 +136,9 @@ def test_addition_to_depthwise_rhs_constant(dtype, lhs_shape, rhs_shape):
     data_max = iinfo.max
     lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype)
 
+    constant_shape = lhs_shape if lhs_is_constant else rhs_shape
+    constant_data = np.random.randint(data_min, data_max + 1, size=constant_shape, dtype=dtype)
+
     model = _get_model(
         lhs_shape,
         rhs_shape,
@@ -142,11 +149,16 @@ def test_addition_to_depthwise_rhs_constant(dtype, lhs_shape, rhs_shape):
         out_zp,
         out_sc,
         dtype,
-        lhs_is_constant=False,
-        rhs_is_constant=True,
+        lhs_is_constant=lhs_is_constant,
+        rhs_is_constant=rhs_is_constant,
+        constant_data=constant_data,
     )
+    input_shape = rhs_shape if lhs_is_constant else lhs_shape
+    input_name = "b" if lhs_is_constant else "a"
     inputs = {
-        "a": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=lhs_shape, dtype=dtype))
+        input_name: tvm.nd.array(
+            np.random.randint(data_min, data_max + 1, size=input_shape, dtype=dtype)
+        )
     }
     outputs = []
     for npu in [False, True]:
@@ -156,21 +168,40 @@ def test_addition_to_depthwise_rhs_constant(dtype, lhs_shape, rhs_shape):
 
 
 @requires_ethosn
-@pytest.mark.parametrize("dtype", ["uint8", "int8"])
 @pytest.mark.parametrize(
-    "lhs_shape,rhs_shape",
+    "lhs_shape,lhs_is_constant,rhs_shape,rhs_is_constant",
     [
-        ((1, 8), (1, 20, 15, 8)),
+        ((1, 2, 8, 4), False, None, True),
+        ((1, 5, 6, 7), False, (1, 1, 1, 1), True),
+        (None, True, (1, 2, 8, 4), False),
+        ((1, 1, 1, 1), True, (1, 5, 6, 7), False),
     ],
 )
-def test_addition_to_depthwise_lhs_constant(dtype, lhs_shape, rhs_shape):
+def test_addition_to_reinterpret_quantize(lhs_shape, lhs_is_constant, rhs_shape, rhs_is_constant):
     """Compare addition to depthwise with TVM."""
     np.random.seed(0)
 
+    dtype = "uint8"
     iinfo = np.iinfo(dtype)
     data_min = iinfo.min
     data_max = iinfo.max
-    lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype)
+
+    # Add can only be offloaded as a reinterpret quantize operation if
+    # it is an identity operation. We must choose the quantization and
+    # constant data carefully to maske sure that this is the case.
+    if lhs_is_constant:
+        rhs_zp = 128
+        rhs_sc = 0.0078125
+        lhs_zp = 0
+        lhs_sc = 0.003921568859368563
+    else:
+        lhs_zp = 128
+        lhs_sc = 0.0078125
+        rhs_zp = 0
+        rhs_sc = 0.003921568859368563
+    out_zp = 0
+    out_sc = 0.007814894430339336
+    constant_data = 255
 
     model = _get_model(
         lhs_shape,
@@ -182,11 +213,16 @@ def test_addition_to_depthwise_lhs_constant(dtype, lhs_shape, rhs_shape):
         out_zp,
         out_sc,
         dtype,
-        lhs_is_constant=True,
-        rhs_is_constant=False,
+        lhs_is_constant=lhs_is_constant,
+        rhs_is_constant=rhs_is_constant,
+        constant_data=constant_data,
     )
+    input_shape = rhs_shape if lhs_is_constant else lhs_shape
+    input_name = "b" if lhs_is_constant else "a"
     inputs = {
-        "b": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype))
+        input_name: tvm.nd.array(
+            np.random.randint(data_min, data_max + 1, size=input_shape, dtype=dtype)
+        )
     }
     outputs = []
     for npu in [False, True]:
diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
index c8d1b5729d83..77777293729c 100644
--- a/tests/python/contrib/test_ethosn/test_convert_equivalents.py
+++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
@@ -74,7 +74,7 @@ def before():
             relay.const(output_sc, "float32"),
             relay.const(output_zp, "int32"),
         )
-        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_mul")
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_mul_to_depthwise")
         return tei.make_ethosn_partition(composite)
 
     def expected():
@@ -117,6 +117,165 @@ def expected():
     _assert_structural_equal(mod["ethos-n_0"], expected_mod["ethos-n_0"])
 
 
+@requires_ethosn
+@pytest.mark.parametrize(
+    "dtype,shape,constant_shape",
+    [("int8", (1, 4, 4), (4,)), ("int16", (1, 16, 12, 4), (1, 1, 1, 4))],
+)
+def test_unsupported_multiply_to_depthwise(dtype, shape, constant_shape):
+    """Check that unsupported variants of multiply to depthwise are not converted."""
+    np.random.seed(0)
+
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    input_zp = np.random.randint(data_min, data_max)
+    input_sc = np.random.random() * 2
+    input2_zp = np.random.randint(data_min, data_max)
+    input2_sc = np.random.random() * 2
+    output_zp, output_sc = tei.get_conv2d_qnn_params(
+        dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[-1]
+    )
+    x = relay.var("x", shape=shape, dtype=dtype)
+    y_data = np.random.randint(data_min, data_max + 1, size=constant_shape, dtype=dtype)
+
+    def before():
+        y = relay.const(y_data, dtype=dtype)
+        expr = relay.qnn.op.mul(
+            x,
+            y,
+            relay.const(input_sc, "float32"),
+            relay.const(input_zp, "int32"),
+            relay.const(input2_sc, "float32"),
+            relay.const(input2_zp, "int32"),
+            relay.const(output_sc, "float32"),
+            relay.const(output_zp, "int32"),
+        )
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_mul_to_depthwise")
+        return tei.make_ethosn_partition(composite)
+
+    mod = before()
+
+    error_regex = (
+        r'Operation "ethos-n.qnn_mul_to_depthwise" was marked '
+        r"as having a valid conversion, but it could not be converted."
+    )
+
+    with pytest.raises(tvm.TVMError, match=error_regex):
+        mod = ConvertEquivalents()(mod)
+
+
+@requires_ethosn
+@pytest.mark.parametrize(
+    "shape,constant_shape",
+    [((1, 4, 4, 8), (1, 1, 1, 1)), ((1, 16, 12, 4), None)],
+)
+@pytest.mark.parametrize("reverse_inputs", [True, False])
+def test_multiply_to_reinterpret_quantize(shape, constant_shape, reverse_inputs):
+    """Check that multiply is correctly converted to a reinterpret quantize operation."""
+    np.random.seed(0)
+
+    dtype = "uint8"
+
+    # Multiply can only be offloaded as a reinterpret quantize operation if
+    # it is an identity option. We must choose the quantization and constant
+    # data carefully to make sure that this is the case.
+    input_zp = 0
+    input_sc = 0.007814894430339336
+    input2_zp = 0
+    input2_sc = 0.5
+    output_zp = 0
+    output_sc = 0.9963990449905396
+    constant_data = 255
+
+    x = relay.var("x", shape=shape, dtype=dtype)
+    y_data = np.array(constant_data, dtype=dtype).reshape(constant_shape)
+
+    def before():
+        y = relay.const(y_data, dtype=dtype)
+        expr = relay.qnn.op.mul(
+            y if reverse_inputs else x,
+            x if reverse_inputs else y,
+            relay.const(input2_sc if reverse_inputs else input_sc, "float32"),
+            relay.const(input2_zp if reverse_inputs else input_zp, "int32"),
+            relay.const(input_sc if reverse_inputs else input2_sc, "float32"),
+            relay.const(input_zp if reverse_inputs else input2_zp, "int32"),
+            relay.const(output_sc, "float32"),
+            relay.const(output_zp, "int32"),
+        )
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_mul_to_reinterpret_quantize")
+        return tei.make_ethosn_partition(composite)
+
+    def expected():
+        expr = relay.qnn.op.requantize(
+            x,
+            relay.const(input_sc, "float32"),
+            relay.const(input_zp if reverse_inputs else input_zp, "int32"),
+            relay.const(output_sc, "float32"),
+            relay.const(output_zp, "int32"),
+            out_dtype=dtype,
+        )
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_reinterpret_quantize")
+        return tei.make_ethosn_partition(composite)
+
+    mod = before()
+    mod = ConvertEquivalents()(mod)
+    expected_mod = expected()
+    _assert_structural_equal(mod["ethos-n_0"], expected_mod["ethos-n_0"])
+
+
+@requires_ethosn
+@pytest.mark.parametrize(
+    "dtype,shape,constant_shape",
+    [("int16", (1, 16, 12, 4), None)],
+)
+def test_unsupported_multiply_to_reinterpret_quantize(dtype, shape, constant_shape):
+    """
+    Check that unsupported variants of multiply conversion to reinterpret
+    quantize are not converted.
+    """
+    np.random.seed(0)
+
+    # Multiply can only be offloaded as a reinterpret quantize operation if
+    # it is an identity option. We must choose the quantization and constant
+    # data carefully to make sure that this is the case.
+    input_zp = 0
+    input_sc = 0.007814894430339336
+    input2_zp = 0
+    input2_sc = 0.5
+    output_zp = 0
+    output_sc = 0.9963990449905396
+    constant_data = 255
+
+    x = relay.var("x", shape=shape, dtype=dtype)
+    y_data = np.array(constant_data, dtype=dtype).reshape(constant_shape)
+
+    def before():
+        y = relay.const(y_data, dtype=dtype)
+        expr = relay.qnn.op.mul(
+            x,
+            y,
+            relay.const(input_sc, "float32"),
+            relay.const(input_zp, "int32"),
+            relay.const(input2_sc, "float32"),
+            relay.const(input2_zp, "int32"),
+            relay.const(output_sc, "float32"),
+            relay.const(output_zp, "int32"),
+        )
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_mul_to_reinterpret_quantize")
+        return tei.make_ethosn_partition(composite)
+
+    mod = before()
+
+    error_regex = (
+        r'Operation "ethos-n.qnn_mul_to_reinterpret_quantize" was marked '
+        r"as having a valid conversion, but it could not be converted."
+    )
+
+    with pytest.raises(tvm.TVMError, match=error_regex):
+        mod = ConvertEquivalents()(mod)
+
+
 @requires_ethosn
 @pytest.mark.parametrize("reverse_inputs", [True, False])
 def test_add_to_depthwise(reverse_inputs):
@@ -148,7 +307,7 @@ def before():
             output_scale=relay.const(out_sc, "float32"),
             output_zero_point=relay.const(out_zp, "int32"),
         )
-        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_add")
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_add_to_depthwise")
         return tei.make_ethosn_partition(composite)
 
     class ConversionChecker(ExprVisitor):
@@ -176,3 +335,158 @@ def visit_call(self, call):
     mod = before()
     mod = ConvertEquivalents()(mod)
     mod = ConversionChecker().visit(mod["ethos-n_0"].body.op)
+
+
+@requires_ethosn
+@pytest.mark.parametrize(
+    "dtype,lhs_shape,rhs_shape", [("uint8", (1, 4, 4), (1, 1, 4)), ("int16", (1, 4, 4, 4), (4,))]
+)
+def test_unsupported_add_to_depthwise(dtype, lhs_shape, rhs_shape):
+    """Check that unsupported variants of add are not converted."""
+    np.random.seed(0)
+
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+    lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype)
+
+    x = relay.var("x", shape=lhs_shape, dtype=dtype)
+    y_data = np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype)
+
+    def before():
+        y = relay.const(y_data)
+        expr = relay.qnn.op.add(
+            lhs=x,
+            rhs=y,
+            lhs_scale=relay.const(lhs_sc, "float32"),
+            lhs_zero_point=relay.const(lhs_zp, "int32"),
+            rhs_scale=relay.const(rhs_sc, "float32"),
+            rhs_zero_point=relay.const(rhs_zp, "int32"),
+            output_scale=relay.const(out_sc, "float32"),
+            output_zero_point=relay.const(out_zp, "int32"),
+        )
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_add_to_depthwise")
+        return tei.make_ethosn_partition(composite)
+
+    mod = before()
+
+    error_regex = (
+        r'Operation "ethos-n.qnn_add_to_depthwise" was marked '
+        r"as having a valid conversion, but it could not be converted."
+    )
+
+    with pytest.raises(tvm.TVMError, match=error_regex):
+        mod = ConvertEquivalents()(mod)
+
+
+@requires_ethosn
+@pytest.mark.parametrize(
+    "shape,constant_shape",
+    [
+        ((1, 4, 4, 8), (1, 1, 1, 1)),
+        ((1, 16, 12, 4), None),
+    ],
+)
+@pytest.mark.parametrize("reverse_inputs", [True, False])
+def test_add_to_reinterpret_quantize(shape, constant_shape, reverse_inputs):
+    """Check that add is correctly converted to a reinterpret quantize operation."""
+    np.random.seed(0)
+
+    dtype = "uint8"
+
+    # Add can only be offloaded as a reinterpret quantize operation if
+    # it is an identity option. We must choose the quantization and constant
+    # data carefully to make sure that this is the case.
+    input_zp = 128
+    input_sc = 0.0078125
+    input2_zp = 0
+    input2_sc = 0.003921568859368563
+    output_zp = 0
+    output_sc = 0.007814894430339336
+    constant_data = 255
+
+    x = relay.var("x", shape=shape, dtype=dtype)
+    y_data = np.array(constant_data, dtype=dtype).reshape(constant_shape)
+
+    def before():
+        y = relay.const(y_data, dtype=dtype)
+        expr = relay.qnn.op.add(
+            y if reverse_inputs else x,
+            x if reverse_inputs else y,
+            relay.const(input2_sc if reverse_inputs else input_sc, "float32"),
+            relay.const(input2_zp if reverse_inputs else input_zp, "int32"),
+            relay.const(input_sc if reverse_inputs else input2_sc, "float32"),
+            relay.const(input_zp if reverse_inputs else input2_zp, "int32"),
+            relay.const(output_sc, "float32"),
+            relay.const(output_zp, "int32"),
+        )
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_add_to_reinterpret_quantize")
+        return tei.make_ethosn_partition(composite)
+
+    def expected():
+        expr = relay.qnn.op.requantize(
+            x,
+            relay.const(input_sc, "float32"),
+            relay.const(input_zp if reverse_inputs else input_zp, "int32"),
+            relay.const(output_sc, "float32"),
+            relay.const(output_zp, "int32"),
+            out_dtype=dtype,
+        )
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_reinterpret_quantize")
+        return tei.make_ethosn_partition(composite)
+
+    mod = before()
+    mod = ConvertEquivalents()(mod)
+    expected_mod = expected()
+    _assert_structural_equal(mod["ethos-n_0"], expected_mod["ethos-n_0"])
+
+
+@requires_ethosn
+@pytest.mark.parametrize(
+    "dtype,shape,constant_shape",
+    [
+        ("int16", (1, 16, 12, 4), None),
+    ],
+)
+def test_unsupported_add_to_reinterpret_quantize(dtype, shape, constant_shape):
+    """Check that unsupported variants of add to reinterpret quantize are not converted."""
+    np.random.seed(0)
+
+    # Add can only be offloaded as a reinterpret quantize operation if
+    # it is an identity option. We must choose the quantization and constant
+    # data carefully to make sure that this is the case.
+    input_zp = 128
+    input_sc = 0.0078125
+    input2_zp = 0
+    input2_sc = 0.003921568859368563
+    output_zp = 0
+    output_sc = 0.007814894430339336
+    constant_data = 255
+
+    x = relay.var("x", shape=shape, dtype=dtype)
+    y_data = np.array(constant_data, dtype=dtype).reshape(constant_shape)
+
+    def before():
+        y = relay.const(y_data, dtype=dtype)
+        expr = relay.qnn.op.add(
+            x,
+            y,
+            relay.const(input_sc, "float32"),
+            relay.const(input_zp, "int32"),
+            relay.const(input2_sc, "float32"),
+            relay.const(input2_zp, "int32"),
+            relay.const(output_sc, "float32"),
+            relay.const(output_zp, "int32"),
+        )
+        composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_add_to_reinterpret_quantize")
+        return tei.make_ethosn_partition(composite)
+
+    mod = before()
+
+    error_regex = (
+        r'Operation "ethos-n.qnn_add_to_reinterpret_quantize" was marked '
+        r"as having a valid conversion, but it could not be converted."
+    )
+
+    with pytest.raises(tvm.TVMError, match=error_regex):
+        mod = ConvertEquivalents()(mod)
diff --git a/tests/python/contrib/test_ethosn/test_multiply.py b/tests/python/contrib/test_ethosn/test_multiply.py
index cb95a97db529..41c06092447a 100644
--- a/tests/python/contrib/test_ethosn/test_multiply.py
+++ b/tests/python/contrib/test_ethosn/test_multiply.py
@@ -38,13 +38,17 @@ def _get_model(
     output_sc,
     dtype,
     reverse_inputs=False,
+    constant_data=None,
 ):
     iinfo = np.iinfo(dtype)
     data_min = iinfo.min
     data_max = iinfo.max
 
     x = relay.var("x", shape=shape, dtype=dtype)
-    y_data = np.random.randint(data_min, data_max + 1, size=constant_shape, dtype=dtype)
+    if constant_data:
+        y_data = np.array(constant_data, dtype=dtype).reshape(constant_shape)
+    else:
+        y_data = np.random.randint(data_min, data_max + 1, size=constant_shape, dtype=dtype)
     y = relay.const(y_data, dtype=dtype)
 
     out = relay.qnn.op.mul(
@@ -64,11 +68,12 @@ def _get_model(
 @requires_ethosn
 @pytest.mark.parametrize("dtype", ["uint8", "int8"])
 @pytest.mark.parametrize(
-    "shape,constant_shape", [((1, 4, 4, 8), (1, 1, 1, 8)), ((1, 16, 12, 4), (4,))]
+    "shape,constant_shape",
+    [((1, 4, 4, 8), (1, 1, 1, 8)), ((1, 16, 12, 4), (4,))],
 )
 @pytest.mark.parametrize("reverse_inputs", [False, True])
-def test_multiply(dtype, shape, constant_shape, reverse_inputs):
-    """Compare Multiply output with TVM."""
+def test_multiply_to_depthwise(dtype, shape, constant_shape, reverse_inputs):
+    """Compare Multiply -> Depthwise conversion output with TVM."""
 
     np.random.seed(0)
 
@@ -104,6 +109,53 @@ def test_multiply(dtype, shape, constant_shape, reverse_inputs):
     tei.verify(outputs, dtype, 1)
 
 
+@requires_ethosn
+@pytest.mark.parametrize(
+    "shape,constant_shape", [((1, 4, 5, 8), (1, 1, 1, 1)), ((1, 3, 7, 10), None)]
+)
+@pytest.mark.parametrize("reverse_inputs", [False, True])
+def test_multiply_to_reinterpret_quantize(shape, constant_shape, reverse_inputs):
+    """Compare Multiply -> Reinterpret Quantize conversion output with TVM."""
+    np.random.seed(0)
+
+    dtype = "uint8"
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+
+    # Multiply can only be offloaded as a reinterpret quantize operation if
+    # it is an identity option. We must choose the quantization and constant
+    # data carefully to make sure that this is the case.
+    input_zp = 0
+    input_sc = 0.007814894430339336
+    input2_zp = 0
+    input2_sc = 0.5
+    output_zp = 0
+    output_sc = 0.9963990449905396
+    constant_data = 255
+
+    model, params = _get_model(
+        shape,
+        constant_shape,
+        input_zp,
+        input_sc,
+        input2_zp,
+        input2_sc,
+        output_zp,
+        output_sc,
+        dtype,
+        reverse_inputs,
+        constant_data,
+    )
+    inputs = {"x": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=shape, dtype=dtype))}
+    outputs = []
+    for npu in [False, True]:
+        mod = tei.make_module(model, params)
+        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+
+    tei.verify(outputs, dtype, 1)
+
+
 @requires_ethosn
 def test_multiply_multiple_inputs_unsupported():
     """Check multiply operator with two inputs is not offloaded."""
@@ -151,14 +203,19 @@ def test_multiply_multiple_inputs_unsupported():
 
 
 @requires_ethosn
-def test_multiply_unsupported_datatype():
-    """Check multiply operator with unsupported datatype is not offloaded."""
+@pytest.mark.parametrize(
+    "dtype,shape,constant_shape",
+    [
+        ("int16", (1, 4, 5, 6), (1, 1, 1, 6)),
+        ("int8", (1, 1, 3), (1, 1, 1, 3)),
+        ("int8", (1, 2, 4, 8), (1, 2, 4, 8)),
+    ],
+)
+def test_multiply_unsupported(dtype, shape, constant_shape):
+    """Check multiply operator with unsupported attributes is not offloaded."""
 
     np.random.seed(0)
 
-    shape = (1, 4, 5, 6)
-    dtype = "int16"
-
     iinfo = np.iinfo(dtype)
     data_min = iinfo.min
     data_max = iinfo.max
@@ -167,20 +224,21 @@ def test_multiply_unsupported_datatype():
     input2_zp = np.random.randint(data_min, data_max)
     input2_sc = np.random.random() * 2
     output_zp, output_sc = tei.get_conv2d_qnn_params(
-        dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[3]
+        dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[-1]
     )
 
-    x = relay.var("x", shape=shape, dtype=dtype)
-    y = relay.var("y", shape=shape, dtype=dtype)
-    model = relay.qnn.op.mul(
-        x,
-        y,
-        relay.const(input_sc, "float32"),
-        relay.const(input_zp, "int32"),
-        relay.const(input2_sc, "float32"),
-        relay.const(input2_zp, "int32"),
-        relay.const(output_sc, "float32"),
-        relay.const(output_zp, "int32"),
+    model, params = _get_model(
+        shape,
+        constant_shape,
+        input_zp,
+        input_sc,
+        input2_zp,
+        input2_sc,
+        output_zp,
+        output_sc,
+        dtype,
+        reverse_inputs=False,
+        constant_data=False,
     )
 
     expected_host_ops = 1
@@ -189,7 +247,7 @@ def test_multiply_unsupported_datatype():
         mod = tei.make_module(model, {})
         tei.build(
             mod,
-            {},
+            params,
             npu=npu,
             expected_host_ops=expected_host_ops,
             npu_partitions=npu_partitions,
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index 2e6b52927769..54ca44805171 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -144,7 +144,11 @@ def test_resnet_50_int8():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    _compile_hash = {"12d65aec33594c88b6d0d31dcd5144e6", "6a64d69ccb36dfb6b30dd2abdba4b005"}
+    _compile_hash = {
+        "6b130a99397715156d5fb833809a92d2",
+        "6e5fcbab831607b9da1039aff4e56871",
+        "41acecca37b2735bd580f6ec38d8c2e0",
+    }
     _test_image_network(
         model_url="https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/"
         "models/Quantized/resnet_50_quantized.tflite",
@@ -152,8 +156,8 @@ def test_resnet_50_int8():
         input_dict={"input": (1, 224, 224, 3)},
         compile_hash=_compile_hash,
         output_count=1,
-        host_ops=10,
-        npu_partitions=2,
+        host_ops=9,
+        npu_partitions=3,
     )
 
 
From 5634a1a17a3d337728bdc375183c9aee71c40b29 Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Thu, 29 Sep 2022 15:31:00 +0530
Subject: [PATCH 276/704] [CODEGEN][OPENCL] Compatibility for OpenCL version
 3.0 (#12938)

---
 src/target/source/codegen_opencl.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc
index e8d47b720bf6..73a064bc80f5 100644
--- a/src/target/source/codegen_opencl.cc
+++ b/src/target/source/codegen_opencl.cc
@@ -139,7 +139,8 @@ std::string CodeGenOpenCL::Finish() {
     // For now we rely on OpenCL preprocessor directives to utilize the correct behavior
     // depending on the OpenCL version detected at OpenCL compile time.
     decl_stream << "#ifdef __OPENCL_VERSION__\n"
-                << "#if __OPENCL_VERSION__ == CL_VERSION_2_0\n"
+                << "#if __OPENCL_VERSION__ == CL_VERSION_2_0"
+                << " || __OPENCL_VERSION__ == CL_VERSION_3_0 \n"
                 << "#define READ_IMAGEH(image, sampler, coord) "
                 << "read_imageh(image, sampler, coord)\n"
                 << "#define READ_IMAGEF(image, sampler, coord) "

From 0d8c9cef7212e62c18814f1632613fb04de6d290 Mon Sep 17 00:00:00 2001
From: Andrey Malyshev <elvin.nnov@gmail.com>
Date: Thu, 29 Sep 2022 16:50:59 +0400
Subject: [PATCH 277/704] [Relay] Extend split for blocked ConvertLayout pass
 (#12886)

* [Relay] Extend split for blocked ConvertLayout pass

* Fix lint hits

* Fix spelling
---
 src/relay/op/tensor/transform.cc              | 24 ++++++++-
 .../relay/test_pass_convert_op_layout.py      | 49 +++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index deb05e887775..985222307ad9 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -2982,10 +2982,32 @@ InferCorrectLayoutOutput SplitInferCorrectLayout(const Attrs& attrs,
 
   // If new_in_layouts are defined, this code tries to modify the layout.
   if (new_in_layouts.defined() && old_in_layouts.defined()) {
+    bool divisible = true;
     const auto& sp_dim = old_in_layouts[0][axis];
     auto new_index = new_in_layouts[0].IndexOf(sp_dim);
     param->axis = new_index;
-    ret = new_in_layouts[0];
+    int factor = new_in_layouts[0].FactorOf(sp_dim);
+    if (factor > 1) {
+      if (!param->indices_or_sections.as<IntImmNode>()) {
+        auto ios = Downcast<Array<Integer>>(param->indices_or_sections);
+        Array<Integer> new_ios;
+        for (const auto& v : ios) {
+          const IntImmNode* vint = v.as<IntImmNode>();
+          new_ios.push_back(vint->value / factor);
+          if (vint->value % factor) {
+            divisible = false;
+          }
+        }
+        if (divisible) {
+          param->indices_or_sections = new_ios;
+        }
+      }
+    }
+    if (divisible) {
+      ret = new_in_layouts[0];
+    } else {
+      ret = old_in_layouts[0];
+    }
   } else if (old_in_layouts.defined()) {
     ret = old_in_layouts[0];
   }
diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py
index 3d5af83b8c43..223926a8779c 100644
--- a/tests/python/relay/test_pass_convert_op_layout.py
+++ b/tests/python/relay/test_pass_convert_op_layout.py
@@ -1760,9 +1760,58 @@ def expected():
 
         assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a)
 
+    def _test_conv_split_convert_layout_blocking():
+        def before():
+            x = relay.var("x", shape=(1, 512, 38, 38))
+            weight = relay.var("weight", shape=(512, 512, 3, 3))
+            y = relay.nn.conv2d(
+                x,
+                weight,
+                channels=512,
+                kernel_size=(3, 3),
+                data_layout="NCHW",
+                kernel_layout="OIHW",
+            )
+            y = relay.nn.relu(y)
+            y = relay.op.split(y, indices_or_sections=[256], axis=1).astuple()
+            a = relay.TupleGetItem(y, 0)
+            b = relay.TupleGetItem(y, 1)
+            out = relay.Tuple([a, b])
+            return relay.Function(analysis.free_vars(out), out)
+
+        def expected():
+            x = relay.var("x", shape=(1, 512, 38, 38))
+            weight = relay.var("weight", shape=(512, 512, 3, 3))
+            weight = relay.layout_transform(weight, "OIHW", "OIHW4o")
+            x = relay.layout_transform(x, "NCHW", "NCHW4c")
+            y = relay.op.nn.contrib_conv2d_nchwc(
+                x,
+                weight,
+                channels=512,
+                kernel_size=(3, 3),
+                padding=(0, 0),
+                data_layout="NCHW4c",
+                kernel_layout="OIHW4o",
+            )
+            y = relay.nn.relu(y)
+            y = relay.op.split(y, indices_or_sections=[64], axis=1).astuple()
+            a = relay.TupleGetItem(y, 0)
+            b = relay.TupleGetItem(y, 1)
+            a = relay.layout_transform(a, "NCHW4c", "NCHW")
+            b = relay.layout_transform(b, "NCHW4c", "NCHW")
+            out = relay.Tuple([a, b])
+            return relay.Function(analysis.free_vars(out), out)
+
+        a = before()
+        a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NCHW4c", "OIHW4o"]}))
+        b = run_opt_pass(expected(), transform.InferType())
+
+        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a)
+
     _test_conv_split_convert_layout1()
     _test_conv_split_convert_layout2()
     _test_conv_split_convert_layout3()
+    _test_conv_split_convert_layout_blocking()
 
 
 def test_conv_strided_slice_axes_convert_layout():

From 9a451411650905866822cc79bd3365942d5dd9d1 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 29 Sep 2022 08:39:18 -0500
Subject: [PATCH 278/704] [TIR] Use buffer's dtype when converting pad_value to
 TIR (#12925)

[TIR] Use buffer's dtype when converting pad_value to TIR

Previously, a python int/float would be converted to a TIR
int32/float32 in `tvm.runtime.convert`.  These types may not match the
buffer's type, even though it is the closest Python representation of
the buffer's type.  Therefore, the `pad_value` should be
preferentially converted to the matching type of the buffer.
---
 python/tvm/tir/schedule/schedule.py           | 10 ++++-
 .../test_tir_schedule_transform_layout.py     | 41 +++++++++++++------
 2 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index 27171aca411b..2268196b5898 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -2443,7 +2443,7 @@ def transform_layout(
         block: Union[BlockRV, str],
         buffer: Union[Tuple[str, int], str, Buffer],
         index_map: Union[IndexMap, Callable],
-        pad_value: Optional[Union[int, float, IndexMap, Callable]] = None,
+        pad_value: Optional[Union[int, float, PrimExpr, IndexMap, Callable]] = None,
     ) -> None:
         """Apply a transformation represented by IndexMap to buffer
 
@@ -2572,6 +2572,14 @@ def two_elementwise_transformed_intermediate_buffer(a: T.handle, c: T.handle) ->
         elif callable(pad_value):
             pad_value = IndexMap.from_func(pad_value, ndim=len(index_map.final_indices))
         elif not isinstance(pad_value, IndexMap):
+            # Explicitly convert python int/float arguments to the
+            # buffer's type.  If the default `tvm.runtime.convert`
+            # behavior is applied, these would be converted to
+            # int32/float32, which may not match the buffer's type.
+            if isinstance(pad_value, int):
+                pad_value = IntImm(buffer_obj.dtype, pad_value)
+            elif isinstance(pad_value, float):
+                pad_value = FloatImm(buffer_obj.dtype, pad_value)
             pad_value = IndexMap.from_func(
                 lambda *indices: pad_value, ndim=len(index_map.final_indices)
             )
diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py
index 8ed350cc4c46..0b0146ee43fa 100644
--- a/tests/python/unittest/test_tir_schedule_transform_layout.py
+++ b/tests/python/unittest/test_tir_schedule_transform_layout.py
@@ -440,7 +440,7 @@ def before():
 class TestErrorOnWrongPaddingType(BasePaddingCompare):
     """The padding must have the same dtype as the buffer"""
 
-    pad_value = tvm.testing.parameter(0.5)
+    pad_value = tvm.testing.parameter(tir.IntImm("int8", 0))
 
     def before():
         A = T.alloc_buffer(14, "int32")
@@ -465,20 +465,35 @@ class TestPaddedTransformIfThenElse(BasePaddingCompare):
 
     pad_value = tvm.testing.parameter(0)
     transformed_buffer = tvm.testing.parameter("B")
+    dtype = tvm.testing.parameter("int32", "int8")
+
+    @tvm.testing.fixture
+    def before(self, dtype):
+        @T.prim_func
+        def func(A: T.Buffer[14, dtype]):
+            B = T.alloc_buffer(14, dtype)
+            for i in T.serial(14):
+                with T.block("block"):
+                    vi = T.axis.remap("S", [i])
+                    B[vi] = A[vi]
 
-    def before(A: T.Buffer[14, "int32"]):
-        B = T.alloc_buffer(14, "int32")
-        for i in T.serial(14):
-            with T.block("block"):
-                vi = T.axis.remap("S", [i])
-                B[vi] = A[vi]
+        return func
 
-    def expected(A: T.Buffer[14, "int32"]):
-        B = T.alloc_buffer([4, 4], "int32")
-        for i, j in T.grid(4, 4):
-            with T.block("block"):
-                vi, vj = T.axis.remap("SS", [i, j])
-                B[vi, vj] = T.if_then_else(vi == 3 and 2 <= vj, 0, A[vi * 4 + vj], dtype="int32")
+    @tvm.testing.fixture
+    def expected(self, dtype, pad_value):
+        pad_value = tir.IntImm(dtype, pad_value)
+
+        @T.prim_func
+        def func(A: T.Buffer[14, dtype]):
+            B = T.alloc_buffer([4, 4], dtype)
+            for i, j in T.grid(4, 4):
+                with T.block("block"):
+                    vi, vj = T.axis.remap("SS", [i, j])
+                    B[vi, vj] = T.if_then_else(
+                        vi == 3 and 2 <= vj, pad_value, A[vi * 4 + vj], dtype=dtype
+                    )
+
+        return func
 
 
 class TestPaddedTransformWithoutLoop(BasePaddingCompare):

From 3e3d900c66ba8caad6b491c60d1331be470f9cba Mon Sep 17 00:00:00 2001
From: Valery Chernov <black.chervi@gmail.com>
Date: Thu, 29 Sep 2022 16:22:35 +0000
Subject: [PATCH 279/704] [Virtual Machine] Implementation of
 'set_output_zero_copy' (#11358)

There is python API function 'set_output' which save external outputs in VM outputs_ field (map) for specified func name. It looks like 'set_input' method.
During 'invoke' outputs_ are saved in register_file. For this the register indices of output tensors are found from code_ field. I observed in tests for different models that AllocTensor and AllocADT ops are used for result tensors. Let's consider these two cases: result index is destination for AllocTensor op or AllocADT op. At the first case instead of construction new NDArray the outside output tensor is used. At the second one the fields of AllocADT are analyzed and register indices are extracted. During tests I observed that ReshapeTensor operation is rarely used as final one (SqueezeNet-v1.0 and DUC). Mechanism for replacement by external output tensors was also implemented for this op.
---
 include/tvm/runtime/vm/vm.h   |  91 +++++++++++++-
 python/tvm/runtime/vm.py      |  36 ++++++
 src/runtime/vm/vm.cc          | 215 ++++++++++++++++++++++++++++++----
 tests/python/relay/test_vm.py |  92 +++++++++++++--
 4 files changed, 401 insertions(+), 33 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index 5a72a99fa635..6fa91832a731 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -226,6 +226,16 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    */
   ObjectRef Invoke(const std::string& name, const std::vector<ObjectRef>& args);
 
+  /*!
+   * \brief Invoke a VM function.
+   * \param func The function.
+   * \param input_args The input arguments to the function.
+   * \param output_args The pre-allocated output arguments of the function.
+   * \return The object(s) representing the result.
+   */
+  ObjectRef Invoke(const VMFunction& func, const std::vector<ObjectRef>& input_args,
+                   const std::vector<ObjectRef>& output_args);
+
   /*!
    * \brief Invoke a PackedFunction
    *
@@ -249,7 +259,7 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
             const std::vector<AllocatorType>& alloc_types);
 
   /*! \brief Run VM dispatch loop. */
-  void RunLoop();
+  void RunLoop(const std::vector<Index>& output_tensor_reg_indices = {});
 
   /*! \brief Get device from the device list based on a given device index. */
   Device GetDevice(Index device_index) const;
@@ -281,6 +291,32 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    */
   void SetOneInput(std::string name, const TVMArgValue& tag, const TVMArgValue& tensor);
 
+  /*!
+   * \brief Set pre-allocated output tensors to a function.
+   * It is native implementation of 'set_outputs' python method.
+   * It is used in scenario when output tensors are allocated outside each invocation.
+   * Note: it sets set_outputs_enabled_[name] true and fill outputs_[name]
+   * but after invocation the first is switched off and the second is cleared
+   * \param name The function name
+   * \param args outputs to the function.
+   */
+  void SetOutputs(std::string name, TVMArgs args);
+
+  /*!
+   * \brief Preparation part of Invoke method before RunLoop.
+   * \param func the function.
+   * \param args input args
+   */
+  void PrintInfoAndSetInputArgs(const VMFunction& func, const std::vector<ObjectRef>& args);
+
+  /*!
+   * \brief Set pre-allocated outputs to register for specified function.
+   * \param func_name The function's name.
+   * \param outputs set of output tensors.
+   */
+  void SetOutputTensorsToRegister(const std::string& func_name,
+                                  const std::vector<ObjectRef>& outputs);
+
   /*!
    * \brief Internal hook for profiling the start of an op.
    *
@@ -339,6 +375,51 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   void SetInputTensorWithIndex(std::vector<ObjectRef>& tensors,  // NOLINT(*)
                                const TVMArgValue& tensor, int index, Device dev);
 
+  /*!
+   * \brief Convert tensor from TVMArgValue to ObjectRef.
+   * DLTensor and NDArray types are supported.
+   * \param tensor given arg value containing tensor.
+   * \return tensor in ObjectRef format
+   */
+  ObjectRef TensorFromTVMArgValueToObjectRef(const TVMArgValue& tensor) const;
+
+  /*!
+   * \brief Get index of outputs in register_file from func code
+   * \return result register index
+   */
+  Index GetResultRegisterIndex() const;
+
+  /*!
+   * \brief Calculate the index of operation which destination is result
+   * \param res_index is the index of op returning result
+   */
+  void CalculatePreResultOpIndex(Index res_index);
+
+  /*!
+   * \brief Get indices from register_file for output tensors.
+   * It helps to replace output tensors allocated in RunLoop by
+   * tensors pre-allocated outside. Scenario is when `set_output` is used
+   * \return indices from register_file for output tensors.
+   */
+  std::vector<Index> GetOutputTensorRegIndices();
+
+  /*!
+   * \brief Write new allocated tensor to register_file of frame.
+   * \param instr current instruction containing shape and storage info.
+   */
+  void WriteAllocatedTensor(const Instruction& instr);
+
+  /*!
+   * \brief 'set_outputs_enabled' is assumed true for using this method.
+   * It is expected that result register has already contained tensor from outside,
+   * new memory is not allocated and write, but expected shape and data type are checked.
+   * For other register WriteAllocatedTensor method is used.
+   * \param instr current instruction containing shape and storage info.
+   */
+  void WriteAllocatedTensorFromOutside(const Instruction& instr);
+
+  bool FindIndex(const std::vector<Index>& indices, Index val) const;
+
  protected:
   /*! \brief The virtual machine's packed function table. */
   std::vector<PackedFunc> packed_funcs_;
@@ -356,6 +437,14 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   ObjectPtr<Executable> exec_;
   /*! \brief The function name to inputs mapping. */
   std::unordered_map<std::string, std::vector<ObjectRef>> inputs_;
+  /*! \brief The function name to flag enabling scenario with set outputs. */
+  std::unordered_map<std::string, bool> set_outputs_enabled_;
+  /*! \brief The index of operation which destination is result. */
+  Index preresult_op_index_ = -1;
+  /*! \brief The function name to indices of output tensors in register file. */
+  std::unordered_map<std::string, std::vector<Index>> output_tensor_reg_indices_;
+  /*! \brief The function name to pre-allocated outputs mapping. */
+  std::unordered_map<std::string, std::vector<ObjectRef>> outputs_;
   /*!
    * \brief The "physical" devices the VM can execute primitives on. All "device indexes"
    * are w.r.t. this vector. Each entry in this vector must match the corresponding entry
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 615f66fdcc1c..20778c40fd51 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -399,6 +399,7 @@ def __init__(self, exe, device, memory_cfg=None):
         self._get_input_index = self.module["get_input_index"]
         self._set_input = self.module["set_input"]
         self._set_one_input = self.module["set_one_input"]
+        self._set_outputs = self.module["set_outputs"]
         self._setup_device(device, memory_cfg)
 
     def _setup_device(self, dev, memory_cfg):
@@ -560,6 +561,41 @@ def invoke_stateful(self, func_name, *args, **kwargs):
             self.set_input(func_name, *args, **kwargs)
         self._invoke_stateful(func_name)
 
+    def invoke_with_outputs(self, func_name, input_args, output_args):
+        # TODO(vvchernov): consider scenario then output tensors set once
+        """Invoke a function with pre-allocated output tensors.
+        The output tensors should be set every invocation.
+        input_args can be None if set_input method was used before.
+
+        This invoke method allows to avoid excess copying if memory for output tensors
+        was allocated before inference.
+
+        Parameters
+        ----------
+        func_name : str
+            The name of the function.
+
+        input_args: dict of str to tvm.runtime.NDArray or np.ndarray
+            Named arguments to the function.
+
+        output_args : list[tvm.runtime.NDArray] or list[DLTensor]
+            The output tensors of the function.
+        """
+        if input_args:
+            func_params = self._exec.get_function_params(func_name)
+            new_args = [None] * len(func_params)
+            cnt = 0
+            for k in input_args:
+                if k in func_params:
+                    idx = func_params.index(k)
+                    new_args[idx] = input_args[k]
+                    cnt += 1
+            assert cnt == len(func_params)
+        cargs = convert(new_args)
+        self._set_input(func_name, *cargs)
+        self._set_outputs(func_name, *output_args)
+        self._invoke(func_name)
+
     def get_outputs(self):
         """Get the outputs from a call to :py:func`invoke_stateful`.
 
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 6f52f4b83c81..aaf4675733a8 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -143,8 +143,16 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
       } else {
         auto it = inputs_.find(func_name);
         ICHECK(it != inputs_.end()) << "Input has not been set for function " << func_name;
-        const std::vector<ObjectRef>& func_args = it->second;
-        *rv = Invoke(func, func_args);
+        const std::vector<ObjectRef>& input_args = it->second;
+        if (set_outputs_enabled_.count(func_name) && set_outputs_enabled_[func_name]) {
+          ICHECK(outputs_.count(func_name))
+              << "Outputs have not been set for function " << func_name;
+          *rv = Invoke(func, input_args, outputs_[func_name]);
+          outputs_[func_name].clear();
+          set_outputs_enabled_[func_name] = false;
+        } else {
+          *rv = Invoke(func, input_args);
+        }
       }
     });
   } else if (name == "invoke_stateful") {
@@ -224,6 +232,9 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
                                 << "(func_name, index or name, tensor)";
       SetOneInput(args[0], args[1], args[2]);
     });
+  } else if (name == "set_outputs") {
+    return PackedFunc(
+        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetOutputs(args[0], args); });
   } else if (name == "load_late_bound_consts") {
     return PackedFunc([this](TVMArgs args, TVMRetValue* rv) {
       CHECK_EQ(args.size(), 1);
@@ -272,6 +283,62 @@ void VirtualMachine::SetOneInput(std::string func_name, const TVMArgValue& tag,
   SetInputTensorWithIndex(inputs_[func_name], tensor, inp_index, dev);
 }
 
+void VirtualMachine::SetOutputs(std::string func_name, TVMArgs args) {
+  set_outputs_enabled_[func_name] = true;
+  size_t outputs_size = args.size();
+  // First args is func_name
+  ICHECK_GT(outputs_size, 1) << "There is no output arguments set";
+
+  std::vector<ObjectRef> func_args(outputs_size - 1);
+  for (size_t i = 1; i < outputs_size; ++i) {
+    // TODO(vvchernov): device?
+    func_args[i - 1] = TensorFromTVMArgValueToObjectRef(args[i]);
+  }
+  outputs_.erase(func_name);
+  outputs_.emplace(func_name, func_args);
+}
+
+void VirtualMachine::PrintInfoAndSetInputArgs(const VMFunction& func,
+                                              const std::vector<ObjectRef>& args) {
+  VLOG(2) << "Executing Function: " << std::endl << func;
+  for (int i = 0; i < static_cast<int>(devices_.size()); ++i) {
+    VLOG(2) << "Device " << i << " has device type " << devices_[i].device_type << " and device id "
+            << devices_[i].device_id
+            << (i == exec_->host_device_index ? " (using as host device)" : "");
+  }
+
+  InvokeGlobal(func, args);
+}
+
+void VirtualMachine::SetOutputTensorsToRegister(const std::string& func_name,
+                                                const std::vector<ObjectRef>& outputs) {
+  size_t size = outputs.size();
+
+  if (output_tensor_reg_indices_[func_name].empty()) {
+    output_tensor_reg_indices_[func_name] = GetOutputTensorRegIndices();
+  }
+  auto& reg_indices = output_tensor_reg_indices_[func_name];
+  ICHECK_EQ(reg_indices.size(), size)
+      << "Number of outside output tensors should be equal to model outputs number";
+  size_t i = 0;
+  for (auto it = reg_indices.begin(); it != reg_indices.end(); ++it, ++i) {
+    WriteRegister(*it, outputs[i]);
+  }
+}
+
+ObjectRef VirtualMachine::TensorFromTVMArgValueToObjectRef(const TVMArgValue& output_tensor) const {
+  if (output_tensor.type_code() == kTVMDLTensorHandle) {
+    DLTensor* dl_tensor = output_tensor;
+    return NDArray::FromExternalDLTensor(*dl_tensor);
+  } else if (output_tensor.type_code() == kTVMNDArrayHandle) {
+    return output_tensor.AsObjectRef<tvm::runtime::NDArray>();
+  } else {
+    LOG(FATAL) << "It supports tensor of DLTensor or NDArray type only! Given type is "
+               << output_tensor.type_code();
+  }
+  return ObjectRef();
+}
+
 int64_t VirtualMachine::GetInputIndexFromVMFunction(const std::string& func_name,
                                                     const std::string& input_name) const {
   const auto& vm_func = CheckAndGetVMFunction(func_name);
@@ -359,14 +426,7 @@ void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector<Obje
 }
 
 ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<ObjectRef>& args) {
-  VLOG(2) << "Executing Function: " << std::endl << func;
-  for (int i = 0; i < static_cast<int>(devices_.size()); ++i) {
-    VLOG(2) << "Device " << i << " has device type " << devices_[i].device_type << " and device id "
-            << devices_[i].device_id
-            << (i == exec_->host_device_index ? " (using as host device)" : "");
-  }
-
-  InvokeGlobal(func, args);
+  PrintInfoAndSetInputArgs(func, args);
   RunLoop();
   return return_register_;
 }
@@ -380,6 +440,14 @@ ObjectRef VirtualMachine::Invoke(const std::string& name, const std::vector<Obje
   return Invoke(exec_->functions[func_index], args);
 }
 
+ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<ObjectRef>& input_args,
+                                 const std::vector<ObjectRef>& output_args) {
+  PrintInfoAndSetInputArgs(func, input_args);
+  SetOutputTensorsToRegister(func.name, output_args);
+  RunLoop(output_tensor_reg_indices_[func.name]);
+  return return_register_;
+}
+
 void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count,
                                   Index output_size, const std::vector<ObjectRef>& args) {
   size_t arity = 0;
@@ -518,7 +586,45 @@ int64_t VirtualMachine::LoadScalarInt(Index r) const {
   return result;
 }
 
-void VirtualMachine::RunLoop() {
+Index VirtualMachine::GetResultRegisterIndex() const {
+  Index op_index = 0;
+  while (code_[op_index].op != Opcode::Ret) {
+    ++op_index;
+  }
+
+  return code_[op_index].result;
+}
+
+void VirtualMachine::CalculatePreResultOpIndex(Index res_index) {
+  if (preresult_op_index_ == -1) {
+    preresult_op_index_ = 0;
+    while (code_[preresult_op_index_].dst != res_index) {
+      ++preresult_op_index_;
+    }
+  }
+}
+
+std::vector<Index> VirtualMachine::GetOutputTensorRegIndices() {
+  std::vector<Index> reg_indices;
+  Index res_index = GetResultRegisterIndex();
+  CalculatePreResultOpIndex(res_index);
+  auto& preres_instr = code_[preresult_op_index_];
+  auto op_code = preres_instr.op;
+  if (op_code == Opcode::AllocTensor) {
+    reg_indices.emplace_back(res_index);
+  } else if (op_code == Opcode::AllocADT) {
+    for (Index i = 0; i < preres_instr.num_fields; ++i) {
+      reg_indices.push_back(preres_instr.datatype_fields[i]);
+    }
+  } else if (op_code == Opcode::ReshapeTensor) {
+    reg_indices.push_back(preres_instr.reshape_tensor.tensor);
+  } else {
+    LOG(FATAL) << "Operation " << size_t(op_code) << " is not supported for set_outputs method";
+  }
+  return reg_indices;
+}
+
+void VirtualMachine::RunLoop(const std::vector<Index>& output_tensor_reg_indices) {
   ICHECK(this->exec_);
   ICHECK(this->code_);
   pc_ = 0;
@@ -666,21 +772,11 @@ void VirtualMachine::RunLoop() {
       }
       case Opcode::AllocTensor: {
         OpStartHook(instr);
-        auto shape = std::vector<int64_t>(instr.alloc_tensor.ndim);
-
-        for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) {
-          shape[i] = instr.alloc_tensor.shape[i];
+        if (!output_tensor_reg_indices.empty() && FindIndex(output_tensor_reg_indices, instr.dst)) {
+          WriteAllocatedTensorFromOutside(instr);
+        } else {
+          WriteAllocatedTensor(instr);
         }
-
-        auto storage_obj = ReadRegister(instr.alloc_tensor.storage);
-        auto offset = LoadScalarInt(instr.alloc_tensor.offset);
-        auto storage = Downcast<Storage>(storage_obj);
-        auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype);
-        VLOG(2) << "allocated "
-                << RuntimeObject2String(obj, GetDevice(exec_->host_device_index),
-                                        /*show_contents=*/false);
-
-        WriteRegister(instr.dst, obj);
         OpStopHook();
         pc_++;
         goto main_loop;
@@ -825,6 +921,75 @@ void VirtualMachine::RunLoop() {
   }
 }
 
+void VirtualMachine::WriteAllocatedTensor(const Instruction& instr) {
+  auto shape = std::vector<int64_t>(instr.alloc_tensor.ndim);
+
+  for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) {
+    shape[i] = instr.alloc_tensor.shape[i];
+  }
+
+  auto storage_obj = ReadRegister(instr.alloc_tensor.storage);
+  auto offset = LoadScalarInt(instr.alloc_tensor.offset);
+  auto storage = Downcast<Storage>(storage_obj);
+  auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype);
+  VLOG(2) << "allocated "
+          << RuntimeObject2String(obj, GetDevice(exec_->host_device_index),
+                                  /*show_contents=*/false);
+
+  WriteRegister(instr.dst, obj);
+}
+
+void VirtualMachine::WriteAllocatedTensorFromOutside(const Instruction& instr) {
+  // External tensor(s) has been already written to the register (instr.dst)
+  auto ex_arr = Downcast<NDArray>(ReadRegister(instr.dst));
+  auto ex_shape = ex_arr.Shape();
+  auto ex_size = ex_shape.size();
+  auto ex_dtype = ex_arr->dtype;
+
+  auto in_size = instr.alloc_tensor.ndim;
+  auto in_dtype = instr.alloc_tensor.dtype;
+  ICHECK_EQ(TypeEqual(in_dtype, ex_dtype), true)
+      << "Data types mismatching for internal and external output tensors";
+
+  bool size_check = false;
+  if (ex_size != in_size) {
+    size_check = true;
+  } else {
+    for (size_t i = 0; i < in_size; ++i) {
+      if (ex_shape[i] != instr.alloc_tensor.shape[i]) {
+        size_check = true;
+        break;
+      }
+    }
+  }
+
+  if (size_check) {
+    // Match element number
+    size_t in_el_num = 1, ex_el_num = 1;
+    for (size_t i = 0; i < ex_size; ++i) {
+      ex_el_num *= ex_shape[i];
+    }
+    for (size_t i = 0; i < in_size; ++i) {
+      in_el_num *= instr.alloc_tensor.shape[i];
+    }
+    ICHECK_EQ(in_el_num, ex_el_num)
+        << "Element number mismatching of internal and external output tensors";
+    if (code_[preresult_op_index_].op == Opcode::ReshapeTensor) {
+      int64_t* dims = instr.alloc_tensor.shape;
+      std::vector<int64_t> ref_shape(dims, dims + int64_t(in_size));
+      auto reshaped_tensor = ex_arr.CreateView(ref_shape, ex_dtype);
+      WriteRegister(instr.dst, reshaped_tensor);
+    } else {
+      LOG(FATAL) << "Internal and external output tensor shapes are mismatched";
+    }
+  }
+}
+
+bool VirtualMachine::FindIndex(const std::vector<Index>& indices, Index val) const {
+  auto it = std::find(indices.begin(), indices.end(), val);
+  return it != indices.end();
+}
+
 runtime::Module CreateVirtualMachine(Executable* exec) {
   auto vm = make_object<VirtualMachine>();
   vm->LoadExecutable(GetObjectPtr<Executable>(exec));
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index 0b62db85c904..45e305c9a195 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -846,26 +846,38 @@ def relay_ext_test(func):
     assert "shape_func" in opt_mod.astext(False)
 
 
-def test_vm_rpc():
+def prepare_vm_model(path, tensor_shape):
     """
-    This test checks to make sure you can export a VMExecutable,
-    upload it to a remote machine using RPC and then execute it
-    on the other machine.
+    Virtual Machine is compiled for simple topology and
+    exported as library to given path
     """
     target = tvm.target.Target("llvm --host=llvm")
 
     # Build a IRModule.
-    x = relay.var("x", shape=(10, 1))
+    x = relay.var("x", shape=tensor_shape)
     f = relay.Function([x], x + x)
     mod = IRModule.from_expr(f)
 
     # Compile to VMExecutable.
     vm_exec = vm.compile(mod, target=target)
 
+    # Export to Disk
+    vm_exec.mod.export_library(path)
+
+
+def test_vm_rpc():
+    """
+    This test checks to make sure you can export a VMExecutable,
+    upload it to a remote machine using RPC and then execute it
+    on the other machine.
+    """
+    # Shape for input and output tensors
+    shape = (10, 1)
+
     # Export to Disk
     temp = utils.tempdir()
     path = temp.relpath("vm_library.so")
-    vm_exec.mod.export_library(path)
+    prepare_vm_model(path, shape)
 
     # Use local rpc server for testing.
     # Server must use popen so it doesn't inherit the current process state. It
@@ -881,7 +893,7 @@ def check_remote(server):
         device = remote.cpu()
         # Build a VM out of the executable and context.
         vm_factory = runtime.vm.VirtualMachine(rexec, device)
-        np_input = np.random.uniform(size=(10, 1)).astype("float32")
+        np_input = np.random.uniform(size=shape).astype("float32")
         input_tensor = tvm.nd.array(np_input, device)
         # Invoke its "main" function.
         out = vm_factory.invoke("main", input_tensor)
@@ -891,6 +903,72 @@ def check_remote(server):
     check_remote(rpc.Server("127.0.0.1"))
 
 
+def test_vm_invoke_with_outputs_rpc():
+    """
+    This test checks to make sure you can export a VMExecutable,
+    upload it to a remote machine using RPC and then execute it
+    on the other machine with preallocated outputs.
+    """
+    # Shape for input and output tensors
+    shape = (3, 2)
+
+    # Export to Disk
+    temp = utils.tempdir()
+    path = temp.relpath("vm_library.so")
+    prepare_vm_model(path, shape)
+
+    # Use local rpc server for testing.
+    # Server must use popen so it doesn't inherit the current process state. It
+    # will crash otherwise.
+    def check_remote_invoke_with_outputs(server):
+        remote = rpc.connect(server.host, server.port, session_timeout=10)
+
+        # Upload the serialized Executable.
+        remote.upload(path)
+        # Get a handle to remote Executable.
+        rexec = remote.load_module("vm_library.so")
+
+        device = remote.cpu()
+        # Build a VM out of the executable and context.
+        vm_factory = runtime.vm.VirtualMachine(rexec, device)
+        np_input = np.random.uniform(size=shape).astype("float32")
+        input_tensor = tvm.nd.array(np_input, device)
+        np_output = np.empty(shape, dtype="float32")
+        output_tensor = tvm.nd.array(np_output, device)
+        # Invoke its "main" function.
+        vm_factory.invoke_with_outputs(
+            "main", input_args={"x": input_tensor}, output_args=[output_tensor]
+        )
+        # Check the result.
+        np.testing.assert_allclose(output_tensor.numpy(), np_input + np_input)
+
+    check_remote_invoke_with_outputs(rpc.Server("127.0.0.1"))
+
+
+def test_vm_invoke_with_outputs():
+    target = tvm.target.Target("llvm")
+    shape = (3, 2)
+
+    # Build a IRModule.
+    x = relay.var("x", shape=shape)
+    f = relay.Function([x], x + x)
+    mod = IRModule.from_expr(f)
+
+    # Compile to VMExecutable.
+    vm_exec = vm.compile(mod, target=target)
+    vm_factory = runtime.vm.VirtualMachine(vm_exec, tvm.cpu())
+    np_input = np.random.uniform(size=shape).astype("float32")
+    input_tensor = tvm.nd.array(np_input)
+    np_output = np.empty(shape, dtype="float32")
+    output_tensor = tvm.nd.array(np_output)
+    # Invoke
+    vm_factory.invoke_with_outputs(
+        "main", input_args={"x": input_tensor}, output_args=[output_tensor]
+    )
+    # Check the result.
+    np.testing.assert_allclose(output_tensor.numpy(), np_input + np_input)
+
+
 def test_get_output_single():
     target = tvm.target.Target("llvm")
 

From ea01e3ffb428e02093503b8a4bcc9ed8ac9a77c3 Mon Sep 17 00:00:00 2001
From: albert qing <2628869@qq.com>
Date: Fri, 30 Sep 2022 01:32:27 +0800
Subject: [PATCH 280/704] [TIR] Preserve loop annotations in
 inject_software_pipeline pass (#12937)

---
 .../transforms/inject_software_pipeline.cc    | 30 ++++++---
 ..._tir_transform_inject_software_pipeline.py | 67 +++++++++++++++++++
 2 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/src/tir/transforms/inject_software_pipeline.cc b/src/tir/transforms/inject_software_pipeline.cc
index 2d97aa1a1158..08d57c53d1c2 100644
--- a/src/tir/transforms/inject_software_pipeline.cc
+++ b/src/tir/transforms/inject_software_pipeline.cc
@@ -308,9 +308,10 @@ class PipelineRewriter : public StmtExprMutator {
       const std::unordered_set<Buffer, ObjectPtrHash, ObjectPtrEqual>& double_buffers,
       const Array<Buffer> pipeline_allocs, const For& pipeline_loop,
       const PipelineInfo& pipeline_info,
-      const std::unordered_map<const VarNode*, FragmentInfo>& fragment_info) {
+      const std::unordered_map<const VarNode*, FragmentInfo>& fragment_info,
+      const Map<String, ObjectRef> preserved_annotations) {
     PipelineRewriter rewriter(buffer_data_to_buffer, double_buffers, pipeline_allocs, pipeline_loop,
-                              pipeline_info, fragment_info);
+                              pipeline_info, fragment_info, preserved_annotations);
     return rewriter.BuildPipeline();
   }
 
@@ -319,14 +320,16 @@ class PipelineRewriter : public StmtExprMutator {
                    const std::unordered_set<Buffer, ObjectPtrHash, ObjectPtrEqual>& double_buffers,
                    const Array<Buffer>& pipeline_allocs, const For& pipeline_loop,
                    const PipelineInfo& pipeline_info,
-                   const std::unordered_map<const VarNode*, FragmentInfo>& fragment_info)
+                   const std::unordered_map<const VarNode*, FragmentInfo>& fragment_info,
+                   const Map<String, ObjectRef> preserved_annotations)
 
       : buffer_data_to_buffer_(std::move(buffer_data_to_buffer)),
         double_buffers_(double_buffers),
         pipeline_allocs_(pipeline_allocs),
         pipeline_loop_(pipeline_loop),
         pipeline_info_(pipeline_info),
-        fragment_info_(fragment_info) {}
+        fragment_info_(fragment_info),
+        preserved_annotations_(preserved_annotations) {}
 
   Stmt BuildPipeline() {
     // Step 1: Analyze accesses to the buffers in the pipeline and compute the number of versions
@@ -903,7 +906,8 @@ class PipelineRewriter : public StmtExprMutator {
 
     if (!is_unit_loop) {
       new_loop = For(Downcast<Var>(new_loop_var), pipeline_loop_->min, extent,
-                     unroll_loop ? ForKind::kUnrolled : pipeline_loop_->kind, std::move(new_loop));
+                     unroll_loop ? ForKind::kUnrolled : pipeline_loop_->kind, std::move(new_loop),
+                     NullOpt, preserved_annotations_);
     }
 
     // Update producer heads in the global async states.
@@ -937,6 +941,7 @@ class PipelineRewriter : public StmtExprMutator {
   Map<Buffer, Buffer> buffer_remap_;
   Array<Block> ordered_stmts_;
   std::map<int, AsyncStateGlobal> async_states;
+  Map<String, ObjectRef> preserved_annotations_;
 };
 
 /*!
@@ -1100,6 +1105,15 @@ class PipelineInjector : private StmtExprMutator {
       }
     }
 
+    Map<String, ObjectRef> preserved_annotations;
+    for (const auto& kv : op->annotations) {
+      const String& key = kv.first;
+      if (kv.first != attr::software_pipeline_stage && kv.first != attr::software_pipeline_order &&
+          kv.first != attr::software_pipeline_async_stages) {
+        preserved_annotations.Set(key, kv.second);
+      }
+    }
+
     for (size_t i = 0; i < pipeline_stages.size(); i++) {
       int stage = static_cast<int>(pipeline_stages[i]->value);
       bool is_async = pipeline_async_stages.find(stage) != pipeline_async_stages.end();
@@ -1112,9 +1126,9 @@ class PipelineInjector : private StmtExprMutator {
     ValidatePipelineBody(pipeline_info, original_order);
 
     // Step 4: Rewrite the pipeline body.
-    Stmt pipeline =
-        PipelineRewriter::Rewrite(buffer_data_to_buffer_, double_buffers, pipeline_allocs,
-                                  GetRef<For>(op), pipeline_info, fragment_info_);
+    Stmt pipeline = PipelineRewriter::Rewrite(buffer_data_to_buffer_, double_buffers,
+                                              pipeline_allocs, GetRef<For>(op), pipeline_info,
+                                              fragment_info_, preserved_annotations);
 
     if (const auto* realize = op->body.as<BlockRealizeNode>()) {
       const auto& block = realize->block;
diff --git a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
index 49255e0f2094..9334a4d9e827 100644
--- a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
+++ b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
@@ -151,6 +151,69 @@ def transformed_simple_compute(
                 C[tx, 15] = B[1, tx, 0] + T.float32(1)
 
 
+@T.prim_func
+def simple_compute_with_other_annotation(
+    A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]
+):
+    for tx in T.thread_binding(0, 16, thread="threadIdx.x"):
+        for i in T.serial(
+            0,
+            16,
+            annotations={
+                "software_pipeline_stage": [0, 1],
+                "software_pipeline_order": [0, 1],
+                "pragma_loop_partition_hint": True,
+            },
+        ):
+            with T.block("compute"):
+                T.reads(A[tx, i])
+                T.writes(C[tx, i])
+                B = T.alloc_buffer((16, 1), dtype="float32", scope="shared")
+                with T.block():
+                    T.reads(A[tx, i])
+                    T.writes(B[tx, 0])
+                    B[tx, 0] = A[tx, i] * T.float32(2)
+                with T.block():
+                    T.reads(B[tx, 0])
+                    T.writes(C[tx, i])
+                    C[tx, i] = B[tx, 0] + T.float32(1)
+
+
+@T.prim_func
+def transformed_simple_compute_with_other_annotation(
+    A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]
+) -> None:
+    for tx in T.thread_binding(0, 16, thread="threadIdx.x"):
+        with T.block():
+            T.reads([A[tx, 0:16]])
+            T.writes([C[tx, 0:16]])
+            B = T.alloc_buffer([2, 16, 1], dtype="float32", scope="shared")
+            with T.block():
+                T.reads([A[tx, 0]])
+                T.writes([B[0, tx, 0]])
+                B[0, tx, 0] = A[tx, 0] * T.float32(2)
+            with T.block():
+                T.reads([A[tx, 1:16], B[0:2, tx, 0]])
+                T.writes([B[0:2, tx, 0], C[tx, 0:15]])
+                for i in T.serial(
+                    0,
+                    15,
+                    annotations={"pragma_loop_partition_hint": True},
+                ):
+                    with T.block():
+                        T.reads([A[tx, i + 1]])
+                        T.writes([B[(i + 1) % 2, tx, 0]])
+                        B[(i + 1) % 2, tx, 0] = A[tx, i + 1] * T.float32(2)
+                    with T.block():
+                        T.reads([B[i % 2, tx, 0]])
+                        T.writes([C[tx, i]])
+                        C[tx, i] = B[i % 2, tx, 0] + T.float32(1)
+            with T.block():
+                T.reads([B[1, tx, 0]])
+                T.writes([C[tx, 15]])
+                C[tx, 15] = B[1, tx, 0] + T.float32(1)
+
+
 @T.prim_func
 def three_stage_compute(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]):
     for tx in T.thread_binding(0, 16, thread="threadIdx.x"):
@@ -1000,6 +1063,10 @@ def test_simple_compute():
     _check(gen_simple_compute(1), transformed_simple_compute)
 
 
+def test_simple_compute_with_other_annotation():
+    _check(simple_compute_with_other_annotation, transformed_simple_compute_with_other_annotation)
+
+
 def test_trivial_pipeline():
     _check(trivial_pipeline, transformed_trivial_pipeline)
 

From 2379917985919ed3918dc12cad47f469f245be7a Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Thu, 29 Sep 2022 13:35:21 -0400
Subject: [PATCH 281/704] [MetaSchedule] Add Script for TorchBench Model Tuning
 & Benchmarking (#12914)

This PR adds a script to tune and benchmark TorchBench models, using torchdynamo and the pytorch importer in TVM.
---
 .../testing/torchbench/__init__.py            |  16 +
 .../meta_schedule/testing/torchbench/run.py   | 609 ++++++++++++++++++
 .../meta_schedule/testing/torchbench/utils.py | 103 +++
 3 files changed, 728 insertions(+)
 create mode 100644 python/tvm/meta_schedule/testing/torchbench/__init__.py
 create mode 100644 python/tvm/meta_schedule/testing/torchbench/run.py
 create mode 100644 python/tvm/meta_schedule/testing/torchbench/utils.py

diff --git a/python/tvm/meta_schedule/testing/torchbench/__init__.py b/python/tvm/meta_schedule/testing/torchbench/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/python/tvm/meta_schedule/testing/torchbench/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/python/tvm/meta_schedule/testing/torchbench/run.py b/python/tvm/meta_schedule/testing/torchbench/run.py
new file mode 100644
index 000000000000..f6984d1c9d10
--- /dev/null
+++ b/python/tvm/meta_schedule/testing/torchbench/run.py
@@ -0,0 +1,609 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+This script is for benchmarking TVM performance on models from TorchBench.
+It uses the TorchDynamo as the frontend to ingest models into TVM, and it also
+leverages the benchmark util from TorchDynamo.
+
+TorchDynamo (https://github.com/pytorch/torchdynamo) and TorchBench
+(https://github.com/pytorch/benchmark) need to be in the parent directory of TVM.
+We need a local clone of these repos because torchbench and the benchmark runner
+in TorchDynamo isn't designed to be used as a Python package.
+
+To setup the environment, run the following commands in the parent directory of TVM and with
+the appropriate Python environment:
+```bash
+# torchdynamo requires nightly pytorch. If it fails to find the specified version, try
+# installing the latest nightly pytorch.
+pip3 install --pre \
+    --extra-index-url https://download.pytorch.org/whl/nightly/cu116 \
+    torch==1.13.0.dev20220926 \
+    torchvision==0.14.0.dev20220926 \
+    torchtext==0.14.0.dev20220926
+
+git clone https://github.com/pytorch/torchdynamo
+pushd torchdynamo
+git checkout c537639f9712621dc04ca09908796dbbe86c354b
+pip install -e .
+popd
+
+sudo apt install git-lfs  # git lfs is used for TorchBench
+git clone https://github.com/pytorch/benchmark
+pushd benchmark
+python install.py --continue_on_fail  # fambench_xlmr might fail to install
+popd
+```
+
+To run a benchmark, the script can be run under 'tune' mode by
+```bash
+python python/tvm/meta_schedule/testing/torchbench/run.py \
+    --mode tune \
+    --model resnet50 \
+    --target "nvidia/geforce-rtx-3070" \
+    --work-dir ../workdir \
+    --num-trials 20000 \
+    --rpc-host <rpc tracker host for tuning> \
+    --rpc-port <rpc tracker port for tuning> \
+    --rpc-key <rpc key> \
+```
+
+All available target tags (like nvidia/geforce-rtx-3070) can be found at
+https://github.com/apache/tvm/blob/main/src/target/tag.cc
+
+Then the script can be run under 'eval' mode to actual benchmark the performance,
+using the tuning database under the work directory. This can be executed on a different
+machine than the one executes tuning (the database json files need to be inside
+of the work directory).
+```bash
+python python/tvm/meta_schedule/testing/torchbench/run.py \
+    --mode eval \
+    --model resnet50 \
+    --target "nvidia/geforce-rtx-3070" \
+    --work-dir ../workdir \
+    --num-trials 0
+```
+
+Alternatively, both tuning and evaluation can be done in a single run on the same machine,
+by
+```bash
+python python/tvm/meta_schedule/testing/torchbench/run.py \
+    --mode all \
+    --model resnet50 \
+    --target "llvm -num-cores 6" \
+    --work-dir ../workdir \
+    --num-trials 0
+```
+"""
+
+# pylint: disable=logging-format-interpolation
+
+import argparse
+import functools
+import logging
+import warnings
+from enum import Enum
+from typing import Callable, List, Tuple
+
+import numpy as np  # type: ignore
+import torch  # type: ignore
+from scipy.stats import ttest_ind  # type: ignore
+
+import tvm
+import tvm.relay
+from tvm import meta_schedule as ms
+from tvm.contrib.graph_executor import GraphModule
+from tvm.meta_schedule.testing.torchbench.utils import (
+    load_torchdynamo_benchmark_runner,
+    same,
+    timed,
+)
+from tvm.runtime.vm import VirtualMachine
+from tvm.support import describe
+
+# Needs to be imported after the .utils is executed
+import torchdynamo  # type: ignore  # isort: skip, pylint: disable=wrong-import-order
+
+
+class RunMode(Enum):
+    """
+    The running mode of this script. Available values are:
+    - tune: Only tune the model and create the tuning database.
+    - eval: Only benchmark model using pre-existing tuning database.
+    - all: Run both tuning and benchmark
+    """
+
+    ALL = "all"
+    TUNE = "tune"
+    EVAL = "eval"
+
+    @property
+    def should_tune(self):
+        """
+        Returns whether it should tune the model.
+        """
+        return self != RunMode.EVAL
+
+    @property
+    def should_eval(self):
+        """
+        Returns whether it should actually benchmark the model.
+        """
+        return self != RunMode.TUNE
+
+
+class ResultComparisonMetric(Enum):
+    """
+    This changes how it compares the resultl with the expected value during
+    accuracy check.
+    - cosine: Use the cosine similarity. It should be greater than 0.99.
+    - allclose-1e-4: Use the max element-wise absolute difference. It should be less than 1e-4.
+    """
+
+    COSINE = "cosine"
+    ALLCLOSE = "allclose-1e-4"
+
+
+def parse_args():
+    """
+    Parse arguments
+    """
+    args = argparse.ArgumentParser()
+
+    args.add_argument(
+        "--mode",
+        type=RunMode,
+        default=RunMode.ALL,
+        help=RunMode.__doc__,
+    )
+    args.add_argument(
+        "--batch-size",
+        type=int,
+        default=None,
+        help="The batch size of model input. Use TorchBench's default value if not specified.",
+    )
+    args.add_argument(
+        "--result-metric",
+        type=ResultComparisonMetric,
+        default=ResultComparisonMetric.ALLCLOSE,
+        help=ResultComparisonMetric.__doc__,
+    )
+    args.add_argument(
+        "--benchmark-repeat",
+        type=int,
+        default=10,
+        help="The number of times to repeat the benchmark measurement.",
+    )
+    args.add_argument(
+        "--benchmark-warmup-rounds",
+        type=int,
+        default=5,
+        help="The number of rounds to warmup before starting to measure the performance.",
+    )
+
+    # Model selection
+    args.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="""
+        The name of model to run. It should a directory name under 
+        https://github.com/pytorch/benchmark/tree/main/torchbenchmark/models.
+        """,
+    )
+
+    # Tuning-related config
+    args.add_argument(
+        "--target",
+        type=tvm.target.Target,
+        required=True,
+        help="The target to tune and run benchmark for.",
+    )
+    args.add_argument(
+        "--work-dir",
+        type=str,
+        required=True,
+        help="""
+        The working directory to save intermediate results and store databases for compilation.
+        """,
+    )
+    args.add_argument(
+        "--cache-dir",
+        type=str,
+        default=None,
+        help="""
+        The directory to cache the generated network.
+        If not specified, the cache will be disabled.
+        """,
+    )
+    args.add_argument(
+        "--num-trials",
+        type=int,
+        required=True,
+        help="The max number of trials to run MetaSchedule.",
+    )
+    args.add_argument(
+        "--max-trials-per-task",
+        type=int,
+        default=None,
+        help="""
+        The max number of trials to run per task extracted in MetaSchedule. 
+        By default it's the same as --num-trials.
+        """,
+    )
+    args.add_argument(
+        "--backend",
+        type=str,
+        choices=["graph", "vm"],
+        default="graph",
+        help="The backend to use for relay compilation(graph / vm).",
+    )
+    # TODO(@yelite): Add a layout arg to transform the network after
+    # ingesting into Relay and before feeding into MetaSchedule.
+
+    # Evaluator-related config
+    args.add_argument(
+        "--number",
+        type=int,
+        default=3,
+        help="The number of times to run the model for taking average in a single measurement.",
+    )
+    args.add_argument(
+        "--repeat",
+        type=int,
+        default=1,
+        help="The number of times to repeat the measurement.",
+    )
+    args.add_argument(
+        "--min-repeat-ms",
+        type=int,
+        default=100,
+        help="""
+        Minimum repeat time in ms. The number of runs will be increased if the actual
+        repeat time is lowered than this.
+        """,
+    )
+    args.add_argument(
+        "--adaptive-training",
+        action="store_true",
+        help="Whether to use adpative training for cost model.",
+    )
+    args.add_argument(
+        "--cpu-flush",
+        action="store_true",
+        help="Whether to perform CPU cache flush.",
+    )
+
+    # RPC-related args
+    args.add_argument(
+        "--rpc-host",
+        type=str,
+        help="Host of the RPC Tracker for tuning. Use LocalRunner if not provided",
+    )
+    args.add_argument(
+        "--rpc-port",
+        type=int,
+        help="Port of the RPC Tracker for tuning",
+    )
+    args.add_argument(
+        "--rpc-key",
+        type=str,
+        help="Key of the RPC Tracker for tuning",
+    )
+
+    parsed = args.parse_args()
+    return parsed
+
+
+logging.basicConfig(
+    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+)
+logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
+ARGS = parse_args()
+IS_CUDA = ARGS.target.kind.name == "cuda"
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+logger.setLevel(logging.INFO)
+
+
+runner = load_torchdynamo_benchmark_runner(  # pylint: disable=invalid-name
+    IS_CUDA, cosine_similarity=ARGS.result_metric == ResultComparisonMetric.COSINE
+)
+
+
+def get_metaschedule_runner() -> ms.runner.PyRunner:
+    """
+    Get the Runner for MetaSchedule.
+
+    It returns RPCRunner if --rpc-host is given, otherwise it returns LocalRunner
+    """
+    if ARGS.rpc_host is not None:
+        assert ARGS.rpc_port is not None, "Missing rpc_port"
+        assert ARGS.rpc_key is not None, "Missing rpc_key"
+        return ms.runner.RPCRunner(
+            rpc_config=ms.runner.RPCConfig(
+                tracker_host=ARGS.rpc_host,
+                tracker_port=ARGS.rpc_port,
+                tracker_key=ARGS.rpc_key,
+                session_timeout_sec=600,
+            ),
+            evaluator_config=ms.runner.EvaluatorConfig(
+                number=ARGS.number,
+                repeat=ARGS.repeat,
+                min_repeat_ms=ARGS.min_repeat_ms,
+                enable_cpu_cache_flush=ARGS.cpu_flush,
+            ),
+            alloc_repeat=1,
+        )
+    else:
+        warnings.warn("Falling back to Metaschedule LocalRunner because --rpc-host isn't provided.")
+        return ms.runner.LocalRunner()
+
+
+def get_tune_config() -> ms.TuneConfig:
+    """
+    Get the TuneConfig.
+    """
+    if ARGS.mode.should_tune:
+        max_trials_per_task = ARGS.max_trials_per_task
+        max_trials_global = ARGS.num_trials
+    else:
+        max_trials_per_task = 0
+        max_trials_global = 0
+
+    if max_trials_per_task is None:
+        max_trials_per_task = max_trials_global
+
+    return ms.TuneConfig(
+        strategy="evolutionary",
+        num_trials_per_iter=64,
+        max_trials_per_task=max_trials_per_task,
+        max_trials_global=max_trials_global,
+        adaptive_training=ARGS.adaptive_training,
+    )
+
+
+def get_graph_executor_forward(mod: GraphModule, device: tvm.runtime.Device) -> Callable:
+    """
+    Get the forward function for graph executor, in order to integrate with TorchDynamo.
+    """
+
+    def forward(*args):
+        if IS_CUDA:
+            torch.cuda.synchronize()
+        args = tuple(arg.contiguous() for arg in args)
+        for idx, arg in enumerate(args, 0):
+            mod.set_input(
+                f"inp_{idx}",
+                tvm.nd.from_dlpack(arg),
+            )
+        mod.run()
+        device.sync()
+        result = [torch.from_dlpack(mod.get_output(i)) for i in range(mod.get_num_outputs())]
+        return result
+
+    return forward
+
+
+def get_vm_forward(virtual_machine: VirtualMachine, device: tvm.runtime.Device) -> Callable:
+    """
+    Get the forward function for VM, in order to integrate with TorchDynamo.
+    """
+
+    def forward(*args):
+        if IS_CUDA:
+            torch.cuda.synchronize()
+        args = tuple(tvm.nd.from_dlpack(arg.contiguous()) for arg in args)
+        result = virtual_machine.invoke("main", *args)
+        device.sync()
+
+        if isinstance(result, tvm.nd.NDArray):
+            result = [result]
+        return [torch.from_dlpack(m) for m in result]
+
+    return forward
+
+
+def create_tvm_task_collection_backend(tasks: List[ms.ExtractedTask]) -> Callable:
+    """
+    This torchdynamo backend only collects the extracted tasks from Metaschedule.
+    It doesn't tune the model.
+    """
+
+    def backend(graph_module, example_inputs):
+        jit_mod = torch.jit.trace(graph_module, example_inputs)
+        shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
+        ir_mod, params = tvm.relay.frontend.from_pytorch(jit_mod, shape_list)
+
+        extracted_tasks = ms.extract_task_from_relay(ir_mod, ARGS.target, params)
+        logger.info("Extracted %d tasks", len(extracted_tasks))
+        tasks.extend(extracted_tasks)
+
+        return graph_module.forward
+
+    return backend
+
+
+def create_tvm_compilation_backend(database: ms.database.Database) -> Callable:
+    """
+    This torchdynamo backend compiles the model using history best record from the
+    Metaschedule database.
+    """
+
+    def backend(graph_module, example_inputs):
+        # pylint: disable=import-outside-toplevel
+        from tvm.ir.transform import PassContext
+
+        # pylint: enable=import-outside-toplevel
+
+        jit_mod = torch.jit.trace(graph_module, example_inputs)
+        shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
+        ir_mod, params = tvm.relay.frontend.from_pytorch(jit_mod, shape_list)
+
+        relay_build = {"graph": tvm.relay.build, "vm": tvm.relay.vm.compile}[ARGS.backend]
+        with ARGS.target, ms.utils.autotvm_silencer(), database:
+            with PassContext(
+                opt_level=3,
+                config={
+                    "relay.backend.use_meta_schedule": True,
+                    "relay.backend.use_meta_schedule_dispatch": not IS_CUDA,
+                    "relay.backend.tir_converter": "default",
+                },
+            ):
+                lib = relay_build(ir_mod, target=ARGS.target, params=params)
+
+        device = tvm.cuda(0) if IS_CUDA else tvm.cpu(0)
+
+        if ARGS.backend == "graph":
+            mod = GraphModule(lib["default"](device))
+            return get_graph_executor_forward(mod, device)
+        elif ARGS.backend == "vm":
+            vm = VirtualMachine(lib, device)  # pylint: disable=invalid-name
+            return get_vm_forward(vm, device)
+        else:
+            raise RuntimeError(f"Unknown backend {ARGS.backend}")
+
+    return backend
+
+
+def format_time(seconds: float) -> str:
+    """
+    Format elapsed time based on its value.
+    """
+    if seconds > 1:
+        return f"{seconds:.3g}s"
+    else:
+        return f"{seconds * 1000:.3g}ms"
+
+
+def is_output_correct(output: torch.Tensor, expected: torch.Tensor) -> bool:
+    """
+    Check whether the output is correct.
+    """
+    comparison_metric = ARGS.result_metric
+    if comparison_metric == ResultComparisonMetric.COSINE:
+        return same(expected, output, cosine_similarity=True)
+    elif comparison_metric == ResultComparisonMetric.ALLCLOSE:
+        return same(expected, output, tol=1e-4)
+    else:
+        raise RuntimeError(f"Unknown comparison metric {comparison_metric}")
+
+
+def performance_experiment(
+    model_iter_fn: Callable, model: torch.nn.Module, example_inputs: Tuple[torch.Tensor]
+) -> str:
+    """
+    Performs the actual benchmarking
+    Simplified from https://github.com/pytorch/torchdynamo/blob/c537639f9712621dc04ca09908796dbbe86c354b/benchmarks/common.py#L494 pylint: disable=line-too-long
+    """
+    timings = np.zeros((ARGS.benchmark_repeat, 2), np.float64)
+
+    is_correct = True
+
+    frozen_model_iter_fn = torchdynamo.run(model_iter_fn)
+
+    for _ in range(ARGS.benchmark_warmup_rounds):
+        frozen_model_iter_fn(model, example_inputs)
+        model_iter_fn(model, example_inputs)
+
+    for rep in range(ARGS.benchmark_repeat):
+        # interleave the runs to handle frequency scaling and load changes
+        timings[rep, 0], expected_output = timed(
+            model, model_iter_fn, example_inputs, return_result=True
+        )
+        timings[rep, 1], actual_output = timed(
+            model, frozen_model_iter_fn, example_inputs, return_result=True
+        )
+        is_correct = is_correct and is_output_correct(expected_output, actual_output)
+
+    pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue
+    median = np.median(timings, axis=0)
+    speedup = median[0] / median[1]
+    logger.info(
+        f"eager:{format_time(median[0])} "
+        f"optimized:{format_time(median[1])} "
+        f"speedup:{speedup:.3f}x p:{pvalue:.3f}"
+    )
+    if not is_correct:
+        logger.error("Result is incorrect.")
+        logger.error(f"Expected (PyTorch eager): {expected_output}")
+        logger.error(f"Actual (Optimized): {actual_output}")
+
+    return ""
+
+
+def get_torch_device_type(target: tvm.target.Target) -> str:
+    if target.kind.name == "llvm":
+        return "cpu"
+    elif target.kind.name == "cuda":
+        return "cuda"
+    else:
+        raise RuntimeError(f"Unsupported target {target}")
+
+
+def main():
+    """
+    Entry point of the benchmark
+    """
+    describe()
+
+    if not ARGS.mode.should_tune:
+        ms_database = ms.default_config.database(None, ARGS.work_dir)
+        if len(ms_database) == 0:
+            raise RuntimeError(
+                "Script is runnig in eval mode while the tuning database is empty. "
+                "Please tune the model first."
+            )
+
+    if IS_CUDA and ARGS.cpu_flush:
+        warnings.warn(
+            "Benchmark is running on CUDA, while --cpu-flush is turned on. "
+            "This flag will have no effect on CUDA."
+        )
+
+    try:
+        _, name, model, example_inputs, batch_size = runner.load_model(
+            get_torch_device_type(ARGS.target),
+            ARGS.model,
+            batch_size=ARGS.batch_size,
+        )
+        logger.info(
+            f"batch size: {batch_size} input shape: {[input.shape for input in example_inputs]}"
+        )
+    except NotImplementedError:
+        logging.exception(f"{ARGS.model} failed to load")
+        return
+
+    tuning_tasks: List[ms.ExtractedTask] = []
+    task_collect_ctx = torchdynamo.optimize(create_tvm_task_collection_backend(tuning_tasks))
+    task_collect_ctx(runner.model_iter_fn)(model, example_inputs)
+
+    database = ms.tune_extracted_tasks(
+        extracted_tasks=tuning_tasks,
+        config=get_tune_config(),
+        work_dir=ARGS.work_dir,
+        runner=get_metaschedule_runner(),  # type: ignore
+    )
+
+    if ARGS.mode.should_eval:
+        torchdynamo.reset()
+        model_compile_ctx = torchdynamo.optimize(create_tvm_compilation_backend(database))
+        experiment = functools.partial(performance_experiment, runner.model_iter_fn)
+        runner.run_one_model(name, model, example_inputs, model_compile_ctx, experiment)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/tvm/meta_schedule/testing/torchbench/utils.py b/python/tvm/meta_schedule/testing/torchbench/utils.py
new file mode 100644
index 000000000000..f5a745ea008a
--- /dev/null
+++ b/python/tvm/meta_schedule/testing/torchbench/utils.py
@@ -0,0 +1,103 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Helper functions for running TorchBench through the benchmark functions
+from TorchDynamo.
+"""
+
+import os
+import sys
+from dataclasses import dataclass
+
+import torch  # type: ignore
+
+
+def find_torchdynamo() -> str:
+    """
+    Find the directory of TorchDynamo repo.
+
+    It can't directly import the benchmark runner in TorchDynamo
+    becuase it isn't designed to be used as a Python package.
+    """
+    candidates = [
+        "torchdynamo",
+        "../torchdynamo",
+        "../../torchdynamo",
+    ]
+    for library_dir in candidates:
+        if os.path.exists(f"{library_dir}/benchmarks"):
+            return library_dir
+
+    raise RuntimeError(
+        """
+        Cannot find directory for torchdynamo.
+        You need to clone https://github.com/pytorch/torchdynamo to the parent directory of cwd.
+        """
+    )
+
+
+DYNAMO_DIR = find_torchdynamo()
+sys.path.append(DYNAMO_DIR)
+sys.path.append(f"{DYNAMO_DIR}/benchmarks")
+
+# pylint: disable=wrong-import-position, unused-import
+from benchmarks.common import same, timed  # type: ignore
+from torchbench import TorchBenchmarkRunner  # type: ignore
+
+# pylint: disable=wrong-import-position, unused-import
+
+
+def load_torchdynamo_benchmark_runner(
+    is_cuda: bool, cosine_similarity: bool = False
+) -> TorchBenchmarkRunner:
+    """
+    Load the benchmark runner from TorchDynamo.
+    """
+
+    @dataclass
+    class RunnerArgs:
+        """
+        This class simulates the parsed args required by the benchmark code from TorchDynamo.
+        """
+
+        ci: bool = False  # Whether runs in CI mode. pylint: disable=invalid-name
+        training: bool = False  # Whether it benchmarks training workload.
+        use_eval_mode: bool = True  # Whether the model should be in eval mode.
+        dynamic_shapes: bool = False  # Whether runs the model in dynamic shape mode.
+        float16: bool = False  # Whether to cast model and inputs to float16
+        float32: bool = False  # Whether to cast model and inputs to float32
+
+        accuracy: bool = False  # Whether to perform a accuracy test
+        performance: bool = True  # Whether to perform a performance test
+
+        cosine: bool = False  # Whether to use consine similarity to check if output is correct.
+
+    args = RunnerArgs(cosine=cosine_similarity)
+
+    runner = TorchBenchmarkRunner()
+    runner.args = args
+    runner.model_iter_fn = runner.forward_pass
+
+    if is_cuda:
+        # pylint: disable=import-outside-toplevel
+        import benchmarks.common  # type: ignore
+
+        # pylint: enable=import-outside-toplevel
+
+        benchmarks.common.synchronize = torch.cuda.synchronize
+
+    return runner

From 595f0b3975a51976b24e6c39cf2a7de369c33c30 Mon Sep 17 00:00:00 2001
From: rahul <rahulutkoor4887@gmail.com>
Date: Fri, 30 Sep 2022 01:43:10 +0530
Subject: [PATCH 282/704] [HEXAGON][QHL] Clippling the inputs of HVX version of
 QHL Sigmoid operation (#12919)

* [HEXAGON][QHL] HVX version of QHL(Qualcomm Hexagon Library) sigmoid generates incorrect output if the input falls outside of [-8.0, 8.0]. To fix this, we need to clip the input to sigmoid in the range between >-8.0 and <8.0.

* setting vectorize attribute for tir.sigmoid to enable vectorization at TIR level

* Asserting if sigmoid/vmin/vmax are not generated

* formatting the test file

Co-authored-by: quic_rutkoor <quic_rutkoor@quicinc.com>
---
 src/target/llvm/intrin_rule_hexagon.cc        |  10 +-
 src/tir/op/op.cc                              |   2 +-
 .../contrib/test_hexagon/test_sigmoid.py      | 117 ++++++++++++++++++
 3 files changed, 127 insertions(+), 2 deletions(-)
 create mode 100644 tests/python/contrib/test_hexagon/test_sigmoid.py

diff --git a/src/target/llvm/intrin_rule_hexagon.cc b/src/target/llvm/intrin_rule_hexagon.cc
index c96245e1399c..7c4b38c1d702 100644
--- a/src/target/llvm/intrin_rule_hexagon.cc
+++ b/src/target/llvm/intrin_rule_hexagon.cc
@@ -182,10 +182,18 @@ TVM_REGISTER_OP("tir.sigmoid")
         useqhl = tstring.find("+hvx-qfloat") != std::string::npos;
       }
 
+      PrimExpr MinBound = tir::make_const(x.dtype(), -8);
+      PrimExpr MaxBound = tir::make_const(x.dtype(), 8);
+      const PrimExpr v1 = tir::Max(x, MinBound);
+      const PrimExpr v2 = tir::Min(v1, MaxBound);
+
+      Array<tvm::PrimExpr> new_args = {v2};
+      const tir::Call new_call = tir::Call(call->dtype, call->op, new_args);
+
       // Enable QHL library for FP16 data type
       if (x->dtype.is_float16() && x->dtype.lanes() > 1 && useqhl) {
         std::string tvm_wrapper("tvm_vect_qhmath_hvx_sigmoid_ahf");
-        return TVMExternCall(call, tvm_wrapper);
+        return TVMExternCall(new_call.get(), tvm_wrapper);
       }
 #endif
       PrimExpr one = tir::make_const(x.dtype(), 1);
diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc
index 509badbebb92..3f7f05fe8e64 100644
--- a/src/tir/op/op.cc
+++ b/src/tir/op/op.cc
@@ -866,7 +866,7 @@ TIR_REGISTER_PURE_UNARY_OP("tir.erf");
 
 TIR_REGISTER_PURE_UNARY_OP("tir.tanh").set_attr<TVectorizable>("TVectorizable", true);
 
-TIR_REGISTER_PURE_UNARY_OP("tir.sigmoid");
+TIR_REGISTER_PURE_UNARY_OP("tir.sigmoid").set_attr<TVectorizable>("TVectorizable", true);
 
 TIR_REGISTER_PURE_UNARY_OP("tir.sqrt").set_attr<TVectorizable>("TVectorizable", true);
 
diff --git a/tests/python/contrib/test_hexagon/test_sigmoid.py b/tests/python/contrib/test_hexagon/test_sigmoid.py
new file mode 100644
index 000000000000..9aad35ee76c1
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_sigmoid.py
@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+from tvm import te
+from tvm import tir
+from tvm import topi
+from tvm.contrib.hexagon.build import HexagonLauncher
+
+from .infrastructure import allocate_hexagon_array, transform_numpy
+
+
+def sigmoid_compute(Input):
+    return topi.sigmoid(Input)
+
+
+def sigmoid_stir_schedule(Input, Output):
+    sigmoid_func = te.create_prim_func([Input, Output])
+    sch = tir.Schedule(sigmoid_func, debug_mask="all")
+    block = sch.get_block("compute")
+
+    (n,) = sch.get_loops(block)
+    sch.vectorize(n)
+    return sch
+
+
+@tvm.testing.fixture
+def input_np(in_shape, dtype, min_val, max_val):
+    return np.random.uniform(low=min_val, high=max_val, size=in_shape).astype(dtype)
+
+
+@tvm.testing.fixture
+def ref_output_np(input_np):
+    output_np = 1 / (1 + np.exp(-input_np))
+    return output_np
+
+
+class BaseSigmoid:
+    (in_shape, dtype, min_val, max_val,) = tvm.testing.parameters(
+        ((64,), "float16", -8.0, 8.0),
+        ((64,), "float16", -6.0, 7.0),
+        ((64,), "float16", -10.0, 15.0),
+        ((64,), "float16", -10.0, 0.0),
+        ((64,), "float16", 0.0, 10.0),
+    )
+
+
+class TestSigmoid(BaseSigmoid):
+    @tvm.testing.requires_hexagon
+    def test_sigmoid(
+        self,
+        in_shape,
+        dtype,
+        input_np,
+        ref_output_np,
+        target,
+        hexagon_session,
+    ):
+        InputTensor = te.placeholder(in_shape, name="InputTensor", dtype=dtype)
+
+        OutputTensor = sigmoid_compute(InputTensor)
+
+        target_hexagon = tvm.target.hexagon("v69")
+        target = tvm.target.Target(target_hexagon, host=target_hexagon)
+
+        tir_s = sigmoid_stir_schedule(InputTensor, OutputTensor)
+
+        input_data = allocate_hexagon_array(
+            hexagon_session.device,
+            data=input_np,
+        )
+        output_data = allocate_hexagon_array(
+            hexagon_session.device,
+            tensor_shape=ref_output_np.shape,
+            dtype=ref_output_np.dtype,
+        )
+
+        func_name = "sigmoid"
+        with tvm.transform.PassContext(opt_level=3):
+            runtime_module = tvm.build(tir_s.mod, target=target, name=func_name)
+
+        assert "hvx_sigmoid" in runtime_module.get_source("asm")
+        assert "vmin" in runtime_module.get_source("asm")
+        assert "vmax" in runtime_module.get_source("asm")
+        mod = hexagon_session.load_module(runtime_module)
+
+        mod(input_data, output_data)
+        output_np = output_data.numpy()
+
+        tvm.testing.assert_allclose(
+            output_np,
+            ref_output_np,
+            1e-3,
+            1e-3,
+        )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 25a54fb7919f077d2ef9b75639d26e1d5321e189 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 29 Sep 2022 22:53:00 -0700
Subject: [PATCH 283/704] [TIR] Remove unused iters from the result of reindex
 (#12946)

The existence of unused block iters make it difficult for block layout transformation. Previous it relies the flag `simplify_trivial_iter=true` in affine analysis, which is unsafe because simplifying trivial iterators during scheduling stage lose the information of iterators.
---
 .../multi_level_tiling_tensor_core.cc         | 18 ++++++++++++++----
 .../schedule/primitive/cache_read_write.cc    | 19 +++++++++++++------
 .../unittest/test_tir_schedule_reindex.py     | 14 ++++++--------
 3 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index fbf9aa19b711..2ec78c1918e9 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -487,6 +487,9 @@ Optional<LoopRV> MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin(
 
   std::unordered_set<tir::Buffer, ObjectPtrHash, ObjectPtrEqual> visited_buffers;
 
+  Map<tir::Buffer, tir::IndexMap> buffer_sub_index_map;  // cache of the sub index map associated
+                                                         // with each buffer
+
   auto f_transform_buffer_layout = [&](tir::BufferIndexType index_type, int buffer_index) {
     const tir::Buffer& lhs_buffer = tir::GetNthAccessBuffer(
         state->sch->state(), block_before_reindex, buffer_index, index_type);
@@ -499,6 +502,7 @@ Optional<LoopRV> MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin(
     const tir::BufferRegion& reindexed_buffer_region = tir::GetNthAccessBufferRegion(
         state->sch->state(), GetRef<tir::Block>(block), buffer_index, index_type);
     auto sub_index_map = f_get_sub_index_map(lhs_buffer, reindexed_buffer_region->region);
+    buffer_sub_index_map.Set(lhs_buffer, sub_index_map);
     state->sch->TransformLayout(state->block_rv, buffer_index, index_type, sub_index_map, NullOpt);
   };
 
@@ -510,11 +514,17 @@ Optional<LoopRV> MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin(
   }
 
   // Transform the layout of current block and reindex blocks
-  state->sch->TransformBlockLayout(state->tensor_core_reindex_store, index_map);
-  state->sch->TransformBlockLayout(state->tensor_core_reindex_A, index_map);
-  state->sch->TransformBlockLayout(state->tensor_core_reindex_B, index_map);
+  auto f_transform_reindex_block_layout = [&](const BlockRV& block_rv,
+                                              tir::BufferIndexType buffer_type) {
+    tir::Buffer buffer =
+        tir::GetNthAccessBuffer(state->sch->state(), state->sch->Get(block_rv), 0, buffer_type);
+    const auto& sub_index_map = buffer_sub_index_map.at(buffer);
+    state->sch->TransformBlockLayout(block_rv, sub_index_map);
+  };
+  f_transform_reindex_block_layout(state->tensor_core_reindex_store, tir::BufferIndexType::kWrite);
+  f_transform_reindex_block_layout(state->tensor_core_reindex_A, tir::BufferIndexType::kRead);
+  f_transform_reindex_block_layout(state->tensor_core_reindex_B, tir::BufferIndexType::kRead);
   state->sch->TransformBlockLayout(state->block_rv, index_map);
-
   return tir::TileWithTensorIntrin(state->sch, state->block_rv, intrin_name,
                                    /*allow_padding=*/true);
 }
diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index e9583adbbaa9..489308ae8c0f 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -195,14 +195,18 @@ Block MakeReIndexStage(const Block& block, CacheStageInfo* info,
 
   // Step 1: Create block iters, access regions of the reindex block, and accessing indices to the
   // reindex buffer.
-  for (const IterVar& iter : block->iter_vars) {
+  std::unordered_set<int> skipped_block_iters;
+  for (int i = 0, n = block->iter_vars.size(); i < n; ++i) {
+    const IterVar& iter = block->iter_vars[i];
     Var var("v" + std::to_string(new_block_iters.size()), iter->var->dtype);
     bool used = covered.count(iter->var);
-    new_block_iters.push_back(
-        IterVar(/*dom=*/used ? iter->dom
-                             : Range::FromMinExtent(IntImm(var->dtype, 0), IntImm(var->dtype, 1)),
-                /*var=*/var,
-                /*IterVarType=*/kDataPar));
+    if (used) {
+      new_block_iters.push_back(IterVar(/*dom=*/used ? iter->dom : Range::FromMinExtent(0, 1),
+                                        /*var=*/var,
+                                        /*IterVarType=*/kDataPar));
+    } else {
+      skipped_block_iters.insert(i);
+    }
     if (used) {
       reindex_indices.push_back(var);
       reindex_region.push_back(Range::FromMinExtent(var, IntImm(var->dtype, 1)));
@@ -256,6 +260,9 @@ Block MakeReIndexStage(const Block& block, CacheStageInfo* info,
   std::vector<Var> loop_vars;         // loop variables
   std::vector<PrimExpr> iter_values;  // bindings in block realize
   for (int i = 0; i < static_cast<int>(block->iter_vars.size()); ++i) {
+    if (skipped_block_iters.count(i)) {
+      continue;
+    }
     Var loop_var("ax" + std::to_string(loop_vars.size()), block->iter_vars[i]->var->dtype);
     loop_vars.push_back(loop_var);
     iter_values.push_back(loop_var);
diff --git a/tests/python/unittest/test_tir_schedule_reindex.py b/tests/python/unittest/test_tir_schedule_reindex.py
index 47b8b5cb88f4..60dcefba631a 100644
--- a/tests/python/unittest/test_tir_schedule_reindex.py
+++ b/tests/python/unittest/test_tir_schedule_reindex.py
@@ -96,11 +96,9 @@ def conv2d_nhwc_reindex_weight(
                 T.float32(0),
                 dtype="float32",
             )
-    for ax0, ax1, ax2, ax3, ax4, ax5, ax6 in T.grid(1, 1, 1, 64, 7, 7, 3):
+    for ax3, ax4, ax5, ax6 in T.grid(64, 7, 7, 3):
         with T.block("weight_reindex"):
-            v0, v1, v2, v3, v4, v5, v6 = T.axis.remap(
-                "SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6]
-            )
+            v3, v4, v5, v6 = T.axis.remap("SSSS", [ax3, ax4, ax5, ax6])
             T.reads(weight[v4, v5, v6, v3])
             T.writes(weight_reindex[v3, v4, v5, v6])
             weight_reindex[v3, v4, v5, v6] = weight[v4, v5, v6, v3]
@@ -152,9 +150,9 @@ def matmul_reindex_write(
             with T.init():
                 C_reindex[i, j] = T.float32(0)
             C_reindex[i, j] = C_reindex[i, j] + A[i, k] * B[k, j]
-    for i0, i1, i2 in T.grid(512, 512, 1):
+    for i0, i1 in T.grid(512, 512):
         with T.block("C_reindex"):
-            v0, v1, v2 = T.axis.remap("SSS", [i0, i1, i2])
+            v0, v1 = T.axis.remap("SS", [i0, i1])
             T.reads(C_reindex[v0, v1])
             T.writes(C[v0, v1])
             C[v0, v1] = C_reindex[v0, v1]
@@ -201,10 +199,10 @@ def mixed_dtype_reindex_write(
             with T.init():
                 T_matmul_NT_reindex[i, j] = T.float16(0)
             T_matmul_NT_reindex[i, j] = T_matmul_NT_reindex[i, j] + p0[i, k] * p1[j, k]
-    for ax0, ax1, ax2 in T.grid(T.int64(2), 1280, 1):
+    for ax0, ax1 in T.grid(T.int64(2), 1280):
         with T.block("T_matmul_NT_reindex"):
             v0 = T.axis.spatial(T.int64(2), ax0)
-            v1, v2 = T.axis.remap("SS", [ax1, ax2])
+            (v1,) = T.axis.remap("S", [ax1])
             T.reads(T_matmul_NT_reindex[v0, v1])
             T.writes(T_matmul_NT[v0, v1])
             T_matmul_NT[v0, v1] = T_matmul_NT_reindex[v0, v1]

From 77c8b6e1634c83950630ae32cf8a171f6f27d264 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 29 Sep 2022 22:53:26 -0700
Subject: [PATCH 284/704] [Support] Add fallback definition of ccache in
 libinfo (#12945)

fallback definition is needed if it's not defined in `config.cmake`, otherwise it's compilation error
---
 src/support/libinfo.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index a7d8e6a1ae2d..46b12ba25303 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -227,6 +227,10 @@
 #define TVM_CXX_COMPILER_PATH ""
 #endif
 
+#ifndef TVM_INFO_USE_CCACHE
+#define TVM_INFO_USE_CCACHE "NOT-FOUND"
+#endif
+
 namespace tvm {
 
 /*!

From 4e4089edda7f3cd888178f4ad325d7824717ce8e Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Thu, 29 Sep 2022 22:54:01 -0700
Subject: [PATCH 285/704] [MetaSchedule] Fix XGBoost Import Issue (#12936)

Previous upgrade introduced a import of xgboost in meta_schedule, removed in current version by using a function to return the call back class.

We've recently introduced a XGBoost Model upgrade to support new xgboost version of callback class in https://github.com/apache/tvm/pull/12141. However, in this PR it uses a function called `optional_xgboost_callback` that works to avoid compatibility issue (xgboost 1.5.2 v.s. 1.6.0). In this specific function, it tries to import the newly introduced xgboost callback class and create a new class using it as base class. This actually imported xgboost when meta_schedule is imported, which is not ideal because xgboost is not a dependency of tvm and meta_schedule, it should only be required when xgboost cost model is employed. This PR fixes the problem by moving the class and the function mentioned above under a function that returns this class when needed. In this way we avoided unwanted import of xgboost in meta_schedule.
---
 .../tvm/meta_schedule/cost_model/xgb_model.py | 348 +++++++++---------
 .../unittest/test_meta_schedule_cost_model.py |  45 ++-
 2 files changed, 214 insertions(+), 179 deletions(-)

diff --git a/python/tvm/meta_schedule/cost_model/xgb_model.py b/python/tvm/meta_schedule/cost_model/xgb_model.py
index 1171e081b90a..59774b534e55 100644
--- a/python/tvm/meta_schedule/cost_model/xgb_model.py
+++ b/python/tvm/meta_schedule/cost_model/xgb_model.py
@@ -22,7 +22,7 @@
 import tempfile
 from collections import OrderedDict
 from itertools import chain as itertools_chain
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple, Callable
 
 import numpy as np  # type: ignore
 
@@ -36,26 +36,10 @@
 from .metric import max_curve
 
 
-def optional_xgboost_callback(cls):
-    """Decorator for importing TraningCallback from xgboost"""
-    # pylint:disable = import-outside-toplevel
-    try:
-        from xgboost.callback import TrainingCallback  # type: ignore
-    # pylint:enable = import-outside-toplevel
-    except ImportError:
-
-        class TrainingCallback:  # type: ignore
-            pass
-
-    class OptXGBoostCustomCallback(cls, TrainingCallback):  # type: ignore
-        pass
-
-    return OptXGBoostCustomCallback
-
-
 if TYPE_CHECKING:
 
     import xgboost as xgb  # type: ignore
+    from xgboost.callback import TrainingCallback  # type: ignore
 
     from ..tune_context import TuneContext
 
@@ -346,7 +330,7 @@ def __init__(
         extractor: FeatureExtractor,
         # xgboost model config
         config: XGBConfig = XGBConfig(),
-        # behavior of randomness
+        # random result before enough samples
         num_warmup_samples: int = 100,
         # evaluation
         early_stopping_rounds: int = 50,
@@ -598,7 +582,7 @@ def avg_peak_score(ys_pred: np.ndarray, d_train: "xgb.DMatrix"):  # type: ignore
             num_boost_round=10000,
             obj=obj,
             callbacks=[
-                XGBoostCustomCallback(
+                _get_custom_call_back(
                     early_stopping_rounds=self.early_stopping_rounds,
                     verbose_eval=self.verbose_eval,
                     fevals=[rmse, avg_peak_score],
@@ -657,158 +641,194 @@ def average_peak_score(ys_pred: np.ndarray):
         return eval_result
 
 
-@optional_xgboost_callback
-class XGBoostCustomCallback:
-    """Custom callback class for xgboost to support multiple custom evaluation functions"""
-
-    def __init__(
-        self,
-        early_stopping_rounds: int,
-        verbose_eval: int,
-        fevals: List[Callable],
-        evals: List[Tuple["xgb.DMatrix", str]],
-        focused_metric: str = "tr-p-rmse",
-        cvfolds: List["xgb.training.CVPack"] = None,
-    ):
-        self.early_stopping_rounds = early_stopping_rounds
-        self.verbose_eval = verbose_eval
-        self.fevals = fevals
-        self.evals = evals
-        self.state: Dict[str, Any] = {}
-        self.focused_metric = focused_metric
-        self.sort_key = make_metric_sorter(focused_metric=focused_metric)
-        self.cvfolds = cvfolds
-        if cvfolds is not None:
-            self.aggregated_cv = None
-
-    def __call__(self, env: "xgb.core.CallbackEnv"):
-        # Compatibility with xgboost < 1.3
-        return self.after_iteration(env.model, env.iteration, env.evaluation_result_list)
-
-    def init(self, model: "xgb.Booster"):
-        """Internal function for intialization"""
-        booster: "xgb.Booster" = model
-        self.state["best_iteration"] = 0
-        self.state["best_score"] = float("inf")
-        if booster is None:
-            assert self.cvfolds is not None
-            return
-        if booster.attr("best_score") is not None:
-            self.state["best_score"] = float(booster.attr("best_score"))
-            self.state["best_iteration"] = int(booster.attr("best_iteration"))
-            self.state["best_msg"] = booster.attr("best_msg")
-        else:
-            booster.set_attr(best_iteration=str(self.state["best_iteration"]))
-            booster.set_attr(best_score=str(self.state["best_score"]))
+def _get_custom_call_back(
+    early_stopping_rounds: int,
+    verbose_eval: int,
+    fevals: List[Callable],
+    evals: List[Tuple["xgb.DMatrix", str]],
+    focused_metric: str = "tr-p-rmse",
+    cvfolds: List["xgb.training.CVPack"] = None,
+) -> "TrainingCallback":
+    """Get a customized callback function for XGBoost. Work around xgboost import."""
 
-    def after_iteration(
-        self, model: "xgb.Booster", epoch: int, evals_log: Dict
-    ):  # pylint: disable = unused-argument
-        """Internal function for after_iteration"""
+    def optional_xgboost_callback(cls):
+        """Decorator for importing TraningCallback from xgboost"""
         # pylint:disable = import-outside-toplevel
         try:
-            from xgboost.callback import _fmt_metric  # type: ignore
+            from xgboost.callback import TrainingCallback  # type: ignore
+        # pylint:enable = import-outside-toplevel
         except ImportError:
-            # Compatibility with xgboost >= 1.6
 
-            def _fmt_metric(value, show_stdv=True):
-                if len(value) == 2:
-                    return f"{value[0]}:{value[1]:.5f}"
-                if len(value) == 3:
-                    if show_stdv:
-                        return f"{value[0]}:{value[1]:.5f}+{value[2]:.5f}"
-                    return f"{value[0]}:{value[1]:.5f}"
-                raise ValueError("wrong metric value", value)
+            class TrainingCallback:  # type: ignore
+                pass
 
-        import xgboost as xgb
-        from xgboost import rabit  # type: ignore
+        class OptXGBoostCustomCallback(cls, TrainingCallback):  # type: ignore
+            pass
 
-        try:
-            from xgboost.training import aggcv  # type: ignore
-        except ImportError:
-            from xgboost.callback import _aggcv as aggcv  # type: ignore
+        return OptXGBoostCustomCallback
 
-        # pylint:enable = import-outside-toplevel
-        if not self.state:
-            self.init(model)
-        booster: xgb.Booster = model
-        iteration: int = epoch
-        cvfolds: List[xgb.training.CVPack] = self.cvfolds
-        ##### Evaluation #####
-        # `eval_result` is a list of (key, score)
-        eval_result: List[Tuple[str, float]] = []
-        if cvfolds is None:
-            eval_result = list(
-                itertools_chain.from_iterable(
-                    [
-                        (key, float(value))
-                        for key, value in map(
-                            lambda x: x.split(":"),
-                            booster.eval_set(
-                                evals=self.evals,
-                                iteration=iteration,
-                                feval=feval,
-                            ).split()[1:],
-                        )
-                    ]
-                    for feval in self.fevals
-                )
-            )
-        else:
-            eval_result = list(
-                itertools_chain.from_iterable(
-                    [
-                        (key, score)
-                        for key, score, _std in aggcv(
-                            fold.eval(
-                                iteration=iteration,
-                                feval=feval,
+    @optional_xgboost_callback
+    class XGBoostCustomCallback:
+        """Custom callback class for xgboost to support multiple custom evaluation functions"""
+
+        def __init__(
+            self,
+            early_stopping_rounds: int,
+            verbose_eval: int,
+            fevals: List[Callable],
+            evals: List[Tuple["xgb.DMatrix", str]],
+            focused_metric: str = "tr-p-rmse",
+            cvfolds: List["xgb.training.CVPack"] = None,
+        ):
+            self.early_stopping_rounds = early_stopping_rounds
+            self.verbose_eval = verbose_eval
+            self.fevals = fevals
+            self.evals = evals
+            self.state: Dict[str, Any] = {}
+            self.focused_metric = focused_metric
+            self.sort_key = make_metric_sorter(focused_metric=focused_metric)
+            self.cvfolds = cvfolds
+            if cvfolds is not None:
+                self.aggregated_cv = None
+
+        def __call__(self, env: "xgb.core.CallbackEnv"):
+            # Compatibility with xgboost < 1.3
+            return self.after_iteration(env.model, env.iteration, env.evaluation_result_list)
+
+        def init(self, model: "xgb.Booster"):
+            """Internal function for intialization"""
+            booster: "xgb.Booster" = model
+            self.state["best_iteration"] = 0
+            self.state["best_score"] = float("inf")
+            if booster is None:
+                assert self.cvfolds is not None
+                return
+            if booster.attr("best_score") is not None:
+                self.state["best_score"] = float(booster.attr("best_score"))
+                self.state["best_iteration"] = int(booster.attr("best_iteration"))
+                self.state["best_msg"] = booster.attr("best_msg")
+            else:
+                booster.set_attr(best_iteration=str(self.state["best_iteration"]))
+                booster.set_attr(best_score=str(self.state["best_score"]))
+
+        def after_iteration(
+            self, model: "xgb.Booster", epoch: int, evals_log: Dict
+        ):  # pylint: disable = unused-argument
+            """Internal function for after_iteration"""
+            # pylint:disable = import-outside-toplevel
+            try:
+                from xgboost.callback import _fmt_metric  # type: ignore
+            except ImportError:
+                # Compatibility with xgboost >= 1.6
+
+                def _fmt_metric(value, show_stdv=True):
+                    if len(value) == 2:
+                        return f"{value[0]}:{value[1]:.5f}"
+                    if len(value) == 3:
+                        if show_stdv:
+                            return f"{value[0]}:{value[1]:.5f}+{value[2]:.5f}"
+                        return f"{value[0]}:{value[1]:.5f}"
+                    raise ValueError("wrong metric value", value)
+
+            import xgboost as xgb
+            from xgboost import rabit  # type: ignore
+
+            try:
+                from xgboost.training import aggcv  # type: ignore
+            except ImportError:
+                from xgboost.callback import _aggcv as aggcv  # type: ignore
+
+            # pylint:enable = import-outside-toplevel
+            if not self.state:
+                self.init(model)
+            booster: xgb.Booster = model
+            iteration: int = epoch
+            cvfolds: List[xgb.training.CVPack] = self.cvfolds
+            ##### Evaluation #####
+            # `eval_result` is a list of (key, score)
+            eval_result: List[Tuple[str, float]] = []
+            if cvfolds is None:
+                eval_result = list(
+                    itertools_chain.from_iterable(
+                        [
+                            (key, float(value))
+                            for key, value in map(
+                                lambda x: x.split(":"),
+                                booster.eval_set(
+                                    evals=self.evals,
+                                    iteration=iteration,
+                                    feval=feval,
+                                ).split()[1:],
                             )
-                            for fold in cvfolds
-                        )
-                    ]
-                    for feval in self.fevals
+                        ]
+                        for feval in self.fevals
+                    )
                 )
-            )
-        eval_result = list(eval_result)
-        eval_result.sort(key=self.sort_key)
-
-        ##### Print eval result #####
-        if self.verbose_eval and iteration % self.verbose_eval == 0:
-            info = []
-            for key, score in eval_result:
-                if "null" not in key:
-                    info.append(f"{key}: {score:.6f}")
-            logger.debug("XGB iter %3d: %s", iteration, "\t".join(info))
-
-        ##### Choose score and do early stopping #####
-        score = None
-        for key, _score in eval_result:
-            if key == self.focused_metric:
-                score = _score
-                break
-        assert score is not None
-
-        best_score = self.state["best_score"]
-        best_iteration = self.state["best_iteration"]
-        if score < best_score:
-            tab = "\t"  # to work with f-string
-            msg = f"[{epoch}] {tab.join([_fmt_metric(x) for x in eval_result])}"
-            self.state["best_msg"] = msg
-            self.state["best_score"] = score
-            self.state["best_iteration"] = epoch
-            # save the property to attributes, so they will occur in checkpoint.
-            if model is not None:
-                model.set_attr(
-                    best_score=str(self.state["best_score"]),
-                    best_iteration=str(self.state["best_iteration"]),
-                    best_msg=self.state["best_msg"],
+            else:
+                eval_result = list(
+                    itertools_chain.from_iterable(
+                        [
+                            (key, score)
+                            for key, score, _std in aggcv(
+                                fold.eval(
+                                    iteration=iteration,
+                                    feval=feval,
+                                )
+                                for fold in cvfolds
+                            )
+                        ]
+                        for feval in self.fevals
+                    )
                 )
-        elif epoch - best_iteration >= self.early_stopping_rounds:
-            best_msg = self.state["best_msg"]
-
-            if self.verbose_eval and rabit.get_rank() == 0:
-                logger.debug("XGB stopped. Best iteration: %s ", best_msg)
-            return True  # instead of raising EarlyStopException, returning True to end the training
-        # False to indicate training should not stop.
-        return False
+            eval_result = list(eval_result)
+            eval_result.sort(key=self.sort_key)
+
+            ##### Print eval result #####
+            if self.verbose_eval and iteration % self.verbose_eval == 0:
+                info = []
+                for key, score in eval_result:
+                    if "null" not in key:
+                        info.append(f"{key}: {score:.6f}")
+                logger.debug("XGB iter %3d: %s", iteration, "\t".join(info))
+
+            ##### Choose score and do early stopping #####
+            score = None
+            for key, _score in eval_result:
+                if key == self.focused_metric:
+                    score = _score
+                    break
+            assert score is not None
+
+            best_score = self.state["best_score"]
+            best_iteration = self.state["best_iteration"]
+            if score < best_score:
+                tab = "\t"  # to work with f-string
+                msg = f"[{epoch}] {tab.join([_fmt_metric(x) for x in eval_result])}"
+                self.state["best_msg"] = msg
+                self.state["best_score"] = score
+                self.state["best_iteration"] = epoch
+                # save the property to attributes, so they will occur in checkpoint.
+                if model is not None:
+                    model.set_attr(
+                        best_score=str(self.state["best_score"]),
+                        best_iteration=str(self.state["best_iteration"]),
+                        best_msg=self.state["best_msg"],
+                    )
+            elif epoch - best_iteration >= self.early_stopping_rounds:
+                best_msg = self.state["best_msg"]
+
+                if self.verbose_eval and rabit.get_rank() == 0:
+                    logger.debug("XGB stopped. Best iteration: %s ", best_msg)
+                # instead of raising EarlyStopException, returning True to end the training
+                return True
+            # False to indicate training should not stop.
+            return False
+
+    return XGBoostCustomCallback(
+        early_stopping_rounds=early_stopping_rounds,
+        verbose_eval=verbose_eval,
+        fevals=fevals,
+        evals=evals,
+        focused_metric=focused_metric,
+        cvfolds=cvfolds,
+    )
diff --git a/tests/python/unittest/test_meta_schedule_cost_model.py b/tests/python/unittest/test_meta_schedule_cost_model.py
index 94b7bce246f4..c47897eabb3e 100644
--- a/tests/python/unittest/test_meta_schedule_cost_model.py
+++ b/tests/python/unittest/test_meta_schedule_cost_model.py
@@ -15,27 +15,27 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-docstring
+from typing import List
+
 import os
 import re
 import shutil
-import sys
 import tempfile
-from typing import List
-
+from functools import partial
+import unittest
 import numpy as np
-import pytest
+
 import tvm
 import tvm.testing
+from tvm.script import tir as T
+from tvm.tir.schedule.schedule import Schedule
 from tvm.meta_schedule.cost_model import PyCostModel, RandomModel, XGBModel
-from tvm.meta_schedule.cost_model.xgb_model import XGBoostCustomCallback, PackSum
+from tvm.meta_schedule.cost_model.xgb_model import _get_custom_call_back, PackSum
 from tvm.meta_schedule.feature_extractor import RandomFeatureExtractor
 from tvm.meta_schedule.runner import RunnerResult
 from tvm.meta_schedule.search_strategy import MeasureCandidate
 from tvm.meta_schedule.tune_context import TuneContext
 from tvm.meta_schedule.utils import derived_object
-from tvm.script import tir as T
-from tvm.tir.schedule.schedule import Schedule
-
 
 # pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,missing-docstring
 @tvm.script.ir_module
@@ -196,13 +196,15 @@ def test_meta_schedule_xgb_model_reload():
     assert (res1 == res2).all()
     assert old_data_size == new_data_size
     assert len(old_data) == len(new_data)
-    for (k1, g1), (k2, g2) in zip(old_data.items(), new_data.items()):
+    for (k1, g1), (k2, g2) in zip(  # pylint: disable=invalid-name
+        old_data.items(), new_data.items()
+    ):
         assert k1 == k2
         assert k1 == g1.group_hash
         assert k2 == g2.group_hash
         assert (g1.costs == g2.costs).all()
         assert len(g1.features) == len(g2.features)
-        for f1, f2 in zip(g1.features, g2.features):
+        for f1, f2 in zip(g1.features, g2.features):  # pylint: disable=invalid-name
             assert (f1 == f2).all()
 
 
@@ -229,10 +231,23 @@ def test_meta_schedule_xgb_model_reupdate():
     model.predict(TuneContext(), [_dummy_candidate() for i in range(predict_sample_count)])
 
 
-def test_meta_schedule_xgb_model_callback():
+def xgb_version_check():
+
+    # pylint: disable=import-outside-toplevel
+    import xgboost as xgb
+    from packaging import version
+
+    # pylint: enable=import-outside-toplevel
+    return version.parse(xgb.__version__) >= version.parse("1.6.0")
+
+
+@unittest.skipIf(xgb_version_check(), "test not supported for xgboost version after 1.6.0")
+def test_meta_schedule_xgb_model_callback_as_function():
+    # pylint: disable=import-outside-toplevel
     import xgboost as xgb
     from itertools import chain as itertools_chain
-    from functools import partial
+
+    # pylint: enable=import-outside-toplevel
 
     extractor = RandomFeatureExtractor()
     model = XGBModel(extractor=extractor, num_warmup_samples=10)
@@ -252,7 +267,7 @@ def test_meta_schedule_xgb_model_callback():
         model.save(path.name)
 
         old_booster = model.booster
-        xs = [
+        xs = [  # pylint: disable=invalid-name
             x.numpy().astype("float32")
             for x in extractor.extract_from(
                 TuneContext(),
@@ -289,7 +304,7 @@ def avg_peak_score(ys_pred: np.ndarray, d_train1: "xgb.DMatrix"):  # type: ignor
             obj=obj,
             callbacks=[
                 partial(
-                    XGBoostCustomCallback(
+                    _get_custom_call_back(
                         early_stopping_rounds=model.early_stopping_rounds,
                         verbose_eval=model.verbose_eval,
                         fevals=[rmse, avg_peak_score],
@@ -300,7 +315,7 @@ def avg_peak_score(ys_pred: np.ndarray, d_train1: "xgb.DMatrix"):  # type: ignor
             ],
         )
 
-        xs = [
+        xs = [  # pylint: disable=invalid-name
             x.numpy().astype("float32")
             for x in extractor.extract_from(
                 TuneContext(),

From e9eb0bc66005a33786dd6ad2eb6c91c5eb047368 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Fri, 30 Sep 2022 00:58:20 -0500
Subject: [PATCH 286/704] [LLVM] Change CHECK_NE(x, nullptr) to CHECK(x !=
 nullptr), NFC (#12943)

The CHECK_NE can cause compilation issues in some cases.
---
 src/target/llvm/codegen_llvm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index ca9d577f64f6..6a50dd4534c2 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -202,7 +202,7 @@ void CodeGenLLVM::InitTarget() {
     os << "}\n";
     auto mod = llvm_target_->GetInstance().ParseIR(os.str());
     auto* test_sse2 = mod->getFunction(fname);
-    ICHECK_NE(test_sse2, nullptr) << "Module creation error";
+    ICHECK(test_sse2 != nullptr) << "Module creation error";
     use_float16_abi = tm->getSubtargetImpl(*test_sse2)->checkFeatures("+sse2");
   }
 #endif  // TVM_LLVM_VERSION >= 150

From dedf6393f1575ba7614cea93d1bac89331cb310a Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Fri, 30 Sep 2022 00:58:41 -0500
Subject: [PATCH 287/704] [Hexagon] Change NULL to nullptr, NFC (#12944)

Change the `NULL` to `nullptr`.
---
 src/runtime/hexagon/rpc/android/session.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/hexagon/rpc/android/session.cc b/src/runtime/hexagon/rpc/android/session.cc
index 7c8b81445323..59f1a2a12267 100644
--- a/src/runtime/hexagon/rpc/android/session.cc
+++ b/src/runtime/hexagon/rpc/android/session.cc
@@ -121,7 +121,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.hexagon.create_hexagon_session")
           new HexagonTransportChannel(hexagon_rpc_URI CDSP_DOMAIN, remote_stack_size_bytes,
                                       static_cast<uint32_t>(hexagon_rpc_receive_buf_size_bytes));
       std::unique_ptr<RPCChannel> channel(hexagon_channel);
-      auto ep = RPCEndpoint::Create(std::move(channel), session_name, "", NULL);
+      auto ep = RPCEndpoint::Create(std::move(channel), session_name, "", nullptr);
       auto sess = CreateClientSession(ep);
       *rv = CreateRPCSessionModule(sess);
     });

From d4bf9ecf5524d265916ac7b860b0027f5eee5c49 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Fri, 30 Sep 2022 06:17:42 -0500
Subject: [PATCH 288/704] [Target] Add target_device_type attribute to override
 default device_type (#12509)

Implement Target::GetTargetDeviceType (C++) or get_target_device_type
(python) to get the device type (kDL...) for a given target.

The attribute "target_device_type" can be used to override the default
device type associated with the target kind.
---
 include/tvm/target/compilation_config.h       |  2 +-
 include/tvm/target/target.h                   |  6 ++--
 include/tvm/target/target_kind.h              | 15 +++++----
 include/tvm/target/virtual_device.h           | 10 +++---
 python/tvm/micro/model_library_format.py      | 32 +++++++++----------
 python/tvm/relay/build_module.py              |  4 +--
 python/tvm/relay/collage/collage.py           |  2 +-
 python/tvm/target/target.py                   |  4 +++
 src/auto_scheduler/search_policy/utils.h      | 15 ++++-----
 src/auto_scheduler/search_task.cc             |  4 +--
 src/driver/driver_api.cc                      |  8 +++--
 src/relay/backend/build_module.cc             |  4 +--
 src/relay/backend/contrib/uma/targets.cc      |  2 +-
 src/relay/backend/interpreter.cc              |  6 ++--
 src/relay/backend/vm/compiler.cc              |  6 ++--
 src/runtime/vulkan/vulkan_device.h            |  2 +-
 src/target/compilation_config.cc              | 23 ++++++-------
 src/target/spirv/spirv_support.cc             |  2 +-
 src/target/target.cc                          | 14 ++++++--
 src/target/virtual_device.cc                  |  8 ++---
 src/tir/analysis/verify_memory.cc             |  2 +-
 src/tir/transforms/make_packed_api.cc         |  2 +-
 src/tir/transforms/make_unpacked_api.cc       |  2 +-
 .../relay/collage/demo_collage_partitioner.py | 10 +++---
 tests/python/unittest/test_target_target.py   |  4 ++-
 tests/scripts/release/PRERELEASE_NOTES.md     | 24 ++++++++++++++
 26 files changed, 128 insertions(+), 85 deletions(-)
 create mode 100644 tests/scripts/release/PRERELEASE_NOTES.md

diff --git a/include/tvm/target/compilation_config.h b/include/tvm/target/compilation_config.h
index 53b7df88b8ad..eab34de1fb9a 100644
--- a/include/tvm/target/compilation_config.h
+++ b/include/tvm/target/compilation_config.h
@@ -78,7 +78,7 @@ class CompilationConfigNode : public Object {
    * It is possible to have multiple primitive targets for the same device type. However given
    * primitive targets left and right where:
    *  - left appears before right in the array
-   *  - left->kind->device_type == right->kind->device_type
+   *  - left->GetTargetDeviceType() == right->GetTargetDeviceType()
    * then:
    *  - right.IsExternalCodegenFor(left) must be true
    * In this way the \p FindPrimitiveTargetForDeviceOrFail method will find the 'most general'
diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h
index 6ad213f1263f..df6951685a27 100644
--- a/include/tvm/target/target.h
+++ b/include/tvm/target/target.h
@@ -68,6 +68,8 @@ class TargetNode : public Object {
   TVM_DLL Map<String, ObjectRef> Export() const;
   /*! \return The Optional<Target> typed target host of the TargetNode */
   TVM_DLL Optional<Target> GetHost() const;
+  /*! \return The device type for this target */
+  TVM_DLL int GetTargetDeviceType() const;
 
   /*!
    * \brief Returns a human readable representation of \p Target which includes all fields,
@@ -230,11 +232,11 @@ class Target : public ObjectRef {
    * with \p that target. In particular:
    *  - \p this has a true ::tvm::attr::kIsExternalCodegen attribute
    *  - \p that does not have a true ::tvm::attr::kIsExternalCodegen attribute
-   *  - \p this and \p that have the same kind->device_type
+   *  - \p this and \p that have the same GetTargetDeviceType()
    *
    * After partitioning, the external codegen compilation path may use \p that to guide it's
    * compilation to a \p runtime::Module. Given \p this, an appropriate \p that can be
-   * found using \p CompilationConfig::FindPrimitiveTargetOrFail(this->kind->device_type).
+   * found using \p CompilationConfig::FindPrimitiveTargetOrFail(this->GetTargetDeviceType()).
    *
    * The \p CollagePartition pass uses this method to guide it's search over candidate partitions
    * using external codegen.
diff --git a/include/tvm/target/target_kind.h b/include/tvm/target/target_kind.h
index 63c92fedbd6e..19bcce3116b2 100644
--- a/include/tvm/target/target_kind.h
+++ b/include/tvm/target/target_kind.h
@@ -92,7 +92,7 @@ class TargetKindNode : public Object {
   /*! \brief Name of the target kind */
   String name;
   /*! \brief Device type of target kind */
-  int device_type;
+  int default_device_type;
   /*! \brief Default keys of the target */
   Array<String> default_keys;
   /*! \brief Function used to preprocess on target creation */
@@ -102,7 +102,7 @@ class TargetKindNode : public Object {
 
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
-    v->Visit("device_type", &device_type);
+    v->Visit("default_device_type", &default_device_type);
     v->Visit("default_keys", &default_keys);
   }
 
@@ -211,7 +211,7 @@ class TargetKindRegEntry {
    * \brief Set DLPack's device_type the target
    * \param device_type Device type
    */
-  inline TargetKindRegEntry& set_device_type(int device_type);
+  inline TargetKindRegEntry& set_default_device_type(int device_type);
   /*!
    * \brief Set DLPack's device_type the target
    * \param keys The default keys
@@ -363,8 +363,8 @@ inline TargetKindRegEntry& TargetKindRegEntry::set_attr(const String& attr_name,
   return *this;
 }
 
-inline TargetKindRegEntry& TargetKindRegEntry::set_device_type(int device_type) {
-  kind_->device_type = device_type;
+inline TargetKindRegEntry& TargetKindRegEntry::set_default_device_type(int device_type) {
+  kind_->default_device_type = device_type;
   return *this;
 }
 
@@ -463,14 +463,15 @@ constexpr const char* kRelayToTIR = "RelayToTIR";
   TVM_STR_CONCAT(TVM_TARGET_KIND_REGISTER_VAR_DEF, __COUNTER__) = \
       ::tvm::TargetKindRegEntry::RegisterOrGet(TargetKindName)    \
           .set_name()                                             \
-          .set_device_type(DeviceType)                            \
+          .set_default_device_type(DeviceType)                    \
           .add_attr_option<Array<String>>("keys")                 \
           .add_attr_option<String>("tag")                         \
           .add_attr_option<String>("device")                      \
           .add_attr_option<String>("model")                       \
           .add_attr_option<Array<String>>("libs")                 \
           .add_attr_option<Target>("host")                        \
-          .add_attr_option<Integer>("from_device")
+          .add_attr_option<Integer>("from_device")                \
+          .add_attr_option<Integer>("target_device_type")
 
 }  // namespace tvm
 
diff --git a/include/tvm/target/virtual_device.h b/include/tvm/target/virtual_device.h
index 37f4b23b12c2..c26ae5befe66 100644
--- a/include/tvm/target/virtual_device.h
+++ b/include/tvm/target/virtual_device.h
@@ -63,7 +63,7 @@ using MemoryScope = String;
  *
  * Some or all of these fields may be unconstrained, signaling that device planning is free to
  * choose a value consistent with the whole program. However if a \p target is given then the \p
- * device_type must equal \p target->kind->device_type.
+ * device_type must equal \p target->GetTargetDeviceType().
  *
  * Note that currently we assume if a function returns its result on a particular (virtual) device
  * then the function body is also executed on that device. See the overview comment in
@@ -167,8 +167,8 @@ class VirtualDeviceNode : public AttrsNode<VirtualDeviceNode> {
  private:
   /*!
    * \brief The \p DLDeviceType (represented as an int) of the virtual device. If \p target is
-   * known then this will be equal to \p target->kind->device_type. If \p target is null then the
-   * target is to be determined later.
+   * known then this will be equal to \p target->GetTargetDeviceType(). If \p target is null then
+   * the target is to be determined later.
    *
    * This is needed to support the legacy "on_device" and "device_copy" calls which only allow
    * a \p DLDeviceTypes (as an integer) to be given.
@@ -263,7 +263,7 @@ class VirtualDevice : public ObjectRef {
   /*!
    * \brief Construct a virtual device.
    * \param device_type The device type for the virtual device, or \p kInvalidDeviceType if
-   * unconstrained.  If \p target is defined then must match its \p target->kind->device_type.
+   * unconstrained.  If \p target is defined then must match its \p target->GetTargetDeviceType().
    * \param virtual_device_id The device id for the virtual device, or -1 if unconstrained.
    * \param target The target describing how to compile for the virtual device, or null if
    * unconstrained.
@@ -304,7 +304,7 @@ class VirtualDevice : public ObjectRef {
 
   /*! \brief Returns the \p VirtualDevice for \p target. */
   static VirtualDevice ForTarget(Target target) {
-    DLDeviceType device_type = static_cast<DLDeviceType>(target->kind->device_type);
+    DLDeviceType device_type = static_cast<DLDeviceType>(target->GetTargetDeviceType());
     return VirtualDevice(device_type, /*virtual_device_id=*/0, std::move(target));
   }
 
diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py
index e220fa1ca543..1ba9f5e73395 100644
--- a/python/tvm/micro/model_library_format.py
+++ b/python/tvm/micro/model_library_format.py
@@ -226,12 +226,12 @@ def _build_function_memory_map(function_metadata):
         for target in dict(finfo.workspace_sizes).keys():
             workspace_size = finfo.workspace_sizes[target]
             target_entry = {
-                "device": int(target.kind.device_type),
+                "device": int(target.get_target_device_type()),
                 "workspace_size_bytes": int(workspace_size),
             }
             target_local_entries[func_name].append(target_entry)
-            if workspace_size >= device_max_workspace.get(int(target.kind.device_type), 0):
-                device_max_workspace[int(target.kind.device_type)] = workspace_size
+            if workspace_size >= device_max_workspace.get(int(target.get_target_device_type()), 0):
+                device_max_workspace[int(target.get_target_device_type())] = workspace_size
 
     for func_name, target_entries_ in target_local_entries.items():
         func_entry = {
@@ -252,28 +252,28 @@ def _create_empty_entry(target_device_type):
 
     for target in dict(main_func_metadata.workspace_sizes).keys():
         main_func_local_workspace = main_func_metadata.workspace_sizes[target]
-        target_main_entries[int(target.kind.device_type)] = _create_empty_entry(
-            int(target.kind.device_type)
+        target_main_entries[int(target.get_target_device_type())] = _create_empty_entry(
+            int(target.get_target_device_type())
         )
-        target_main_entries[int(target.kind.device_type)]["workspace_size_bytes"] = int(
-            device_max_workspace.get(int(target.kind.device_type), 0)
+        target_main_entries[int(target.get_target_device_type())]["workspace_size_bytes"] = int(
+            device_max_workspace.get(int(target.get_target_device_type()), 0)
         ) + int(main_func_local_workspace)
 
     for target in dict(main_func_metadata.constant_sizes).keys():
-        if int(target.kind.device_type) not in target_main_entries.keys():
-            target_main_entries[int(target.kind.device_type)] = _create_empty_entry(
-                int(target.kind.device_type)
+        if int(target.get_target_device_type()) not in target_main_entries.keys():
+            target_main_entries[int(target.get_target_device_type())] = _create_empty_entry(
+                int(target.get_target_device_type())
             )
-        target_main_entries[int(target.kind.device_type)]["constants_size_bytes"] = int(
+        target_main_entries[int(target.get_target_device_type())]["constants_size_bytes"] = int(
             main_func_metadata.constant_sizes[target]
         )
 
     for target in dict(main_func_metadata.io_sizes).keys():
-        if int(target.kind.device_type) not in target_main_entries.keys():
-            target_main_entries[int(target.kind.device_type)] = _create_empty_entry(
-                int(target.kind.device_type)
+        if int(target.get_target_device_type()) not in target_main_entries.keys():
+            target_main_entries[int(target.get_target_device_type())] = _create_empty_entry(
+                int(target.get_target_device_type())
             )
-        target_main_entries[int(target.kind.device_type)]["io_size_bytes"] = int(
+        target_main_entries[int(target.get_target_device_type())]["io_size_bytes"] = int(
             main_func_metadata.io_sizes[target]
         )
 
@@ -483,7 +483,7 @@ def _eval_shape(param_name, buffer_shape):
     memory_map = {}
     for target in targets:
         # TODO(mbs): The device type is not unique, better would be to use target.kind.name
-        target_device_type = target.kind.device_type
+        target_device_type = target.get_target_device_type()
         ir_mod = ir_module_by_target[target]
         printer = get_global_func("tir.ModelLibraryFormatPrinter")(False, None, False)
         with open(src_dir / f"tir-{target_device_type}.txt", "w") as f:
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 6cdc79ceb587..112e5558fef9 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -664,10 +664,10 @@ def create_executor(kind="debug", mod=None, device=None, target="llvm", params=N
     if mod is None:
         mod = IRModule()
     if device is not None:
-        assert device.device_type == raw_targets[0].kind.device_type
+        assert device.device_type == raw_targets[0].get_target_device_type()
     else:
         # Derive the default device from the first target.
-        device = _nd.device(raw_targets[0].kind.device_type, 0)
+        device = _nd.device(raw_targets[0].get_target_device_type(), 0)
 
     if params is not None:
         mod = IRModule.from_expr(bind_params_by_name(mod["main"], params))
diff --git a/python/tvm/relay/collage/collage.py b/python/tvm/relay/collage/collage.py
index 4dd59d56b485..632ab1746f51 100644
--- a/python/tvm/relay/collage/collage.py
+++ b/python/tvm/relay/collage/collage.py
@@ -82,7 +82,7 @@ def vm_estimate_seconds(device, the_vm, func_name, args):
 def estimate_seconds(mod, target):
     """Returns the mean execution time of "main" in mod on target with params. The module
     may contain "Primitive" functions, possibly with "Compiler" attributes."""
-    device = tvm.device(target.kind.device_type)
+    device = tvm.device(target.get_target_device_type())
 
     try:
         # Build the module.
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 1e9e2e698c44..7081f992afd9 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -234,6 +234,10 @@ def get_kind_attr(self, attr_name):
         """
         return _ffi_api.TargetKindGetAttr(self.kind, attr_name)
 
+    def get_target_device_type(self):
+        """Returns the device_type for this target."""
+        return _ffi_api.TargetGetDeviceType(self)
+
     @staticmethod
     def list_kinds():
         """Returns the list of available target names."""
diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h
index ffd4bf4f486d..44b60de1d7ad 100644
--- a/src/auto_scheduler/search_policy/utils.h
+++ b/src/auto_scheduler/search_policy/utils.h
@@ -48,27 +48,24 @@ namespace auto_scheduler {
 
 /*! \brief Return whether the search task is targeting a CPU. */
 inline bool IsCPUTask(const SearchTask& task) {
-  return (task)->target->kind->device_type == kDLCPU;
+  return (task)->target->GetTargetDeviceType() == kDLCPU;
 }
 
 /*! \brief Return whether the search task is targeting a GPU. */
 inline bool IsGPUTask(const SearchTask& task) {
-  return (task)->target->kind->device_type == kDLCUDA ||
-         (task)->target->kind->device_type == kDLOpenCL ||
-         (task)->target->kind->device_type == kDLVulkan ||
-         (task)->target->kind->device_type == kDLMetal ||
-         (task)->target->kind->device_type == kDLROCM ||
-         (task)->target->kind->device_type == kOpenGL;
+  int device_type = (task)->target->GetTargetDeviceType();
+  return device_type == kDLCUDA || device_type == kDLOpenCL || device_type == kDLVulkan ||
+         device_type == kDLMetal || device_type == kDLROCM || device_type == kOpenGL;
 }
 
 /*! \brief Return whether the search task is targeting a CUDA GPU. */
 inline bool IsCUDATask(const SearchTask& task) {
-  return (task)->target->kind->device_type == kDLCUDA;
+  return (task)->target->GetTargetDeviceType() == kDLCUDA;
 }
 
 /*! \brief Return whether the search task is targeting a OpenCL GPU. */
 inline bool IsOpenCLTask(const SearchTask& task) {
-  return (task)->target->kind->device_type == kDLOpenCL;
+  return (task)->target->GetTargetDeviceType() == kDLOpenCL;
 }
 
 /*! \brief Argsort. Order: largest to smallest */
diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc
index 262340099cc7..5c8c678e8ce4 100755
--- a/src/auto_scheduler/search_task.cc
+++ b/src/auto_scheduler/search_task.cc
@@ -54,7 +54,7 @@ HardwareParams::HardwareParams(int num_cores, int vector_unit_bytes, int cache_l
 HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target,
                                                             const Target& target_host) {
   // There is no use of target_host so no updates here in the function.
-  const auto device_type = target->kind->device_type;
+  const auto device_type = target->GetTargetDeviceType();
   if (device_type == kDLCPU) {
     return HardwareParams(tvm::runtime::threading::MaxConcurrency(), 64, 64, 0, 0, 0, 0, 0);
   } else if (device_type == kDLCUDA || device_type == kDLROCM) {
@@ -91,7 +91,7 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
     int max_vthread_extent = warp_size / 4;
     return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block,
                           max_threads_per_block, max_vthread_extent, warp_size);
-  } else if (target->kind->device_type == kDLOpenCL) {
+  } else if (target->GetTargetDeviceType() == kDLOpenCL) {
     if (target->GetAttr<String>("device", "") == "mali") {
       // We cannot use device API to get hardware attributes like CUDA,
       // because like Mali target is normally on the remote machine.
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index b460557da034..b0af0fb65e16 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -72,7 +72,7 @@ bool ShouldAnnotateEntryFunc(const IRModule mod) {
 
 /*! \return The default host target for a given device target */
 Target DefaultTargetHost(Target target) {
-  if (target.defined() && target->kind->device_type == kDLCPU) {
+  if (target.defined() && target->GetTargetDeviceType() == kDLCPU) {
     return target;
   } else {
     if (LLVMEnabled()) {
@@ -423,7 +423,8 @@ runtime::Module TIRToRuntime(const Map<Target, IRModule>& inputs_arg,
 
   if (!target_host.defined()) {
     for (const auto& it : inputs) {
-      if (it.first->kind->device_type == kDLCPU || it.first->kind->device_type == kDLMicroDev) {
+      if (it.first->GetTargetDeviceType() == kDLCPU ||
+          it.first->GetTargetDeviceType() == kDLMicroDev) {
         target_host = it.first;
         break;
       }
@@ -460,7 +461,8 @@ runtime::Module TIRToRuntime(const Map<Target, IRModule>& inputs_arg,
       // unless they're supposed to. Here if we overrode the target host
       // to allow lowering previously we check that it's meant to be placed
       // back into the host Module.
-      bool overrides_host_target = target->kind->device_type == target_host->kind->device_type;
+      bool overrides_host_target =
+          target->GetTargetDeviceType() == target_host->GetTargetDeviceType();
       bool non_host_target_kind = target->kind != target_host->kind;
       if (overrides_host_target && non_host_target_kind) {
         device_modules.push_back(codegen::Build(host_mod, it.first));
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index 7b39cb444360..bca524794a20 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -359,7 +359,7 @@ class RelayBuildModule : public runtime::ModuleNode {
     if (backend::IsAutoSchedulerEnabled() && config_->optional_homogeneous_target.defined()) {
       Pass major_pass = transform::AutoSchedulerLayoutRewrite();
       bool enable_layout_rewrite_targets =
-          config_->optional_homogeneous_target->kind->device_type == kDLCPU ||
+          config_->optional_homogeneous_target->GetTargetDeviceType() == kDLCPU ||
           config_->optional_homogeneous_target->GetAttr<String>("device", "") == "mali";
       if (enable_layout_rewrite_targets && pass_ctx.PassEnabled(major_pass->Info())) {
         With<Target> tctx(config_->optional_homogeneous_target);
@@ -373,7 +373,7 @@ class RelayBuildModule : public runtime::ModuleNode {
     if (backend::IsMetaScheduleEnabled() && config_->optional_homogeneous_target.defined()) {
       Pass major_pass = transform::MetaScheduleLayoutRewrite();
       bool enable_layout_rewrite_targets =
-          config_->optional_homogeneous_target->kind->device_type == kDLCPU ||
+          config_->optional_homogeneous_target->GetTargetDeviceType() == kDLCPU ||
           config_->optional_homogeneous_target->GetAttr<String>("device", "") == "mali";
       if (enable_layout_rewrite_targets && pass_ctx.PassEnabled(major_pass->Info())) {
         With<Target> tctx(config_->optional_homogeneous_target);
diff --git a/src/relay/backend/contrib/uma/targets.cc b/src/relay/backend/contrib/uma/targets.cc
index a17f6694f79f..ed2cc047cf2f 100644
--- a/src/relay/backend/contrib/uma/targets.cc
+++ b/src/relay/backend/contrib/uma/targets.cc
@@ -50,7 +50,7 @@ TVM_REGISTER_GLOBAL("relay.backend.contrib.uma.RegisterTarget")
       auto target_kind =
           ::tvm::TargetKindRegEntry::RegisterOrGet(target_name)
               .set_name()
-              .set_device_type(kDLCPU)
+              .set_default_device_type(kDLCPU)
               .add_attr_option<Array<String>>("keys")
               .add_attr_option<String>("tag")
               .add_attr_option<String>("device")
diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc
index 65a0fdc94824..1019ecf358b1 100644
--- a/src/relay/backend/interpreter.cc
+++ b/src/relay/backend/interpreter.cc
@@ -477,7 +477,7 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
 
     // TODO(mbs): Take this from the host_virtual_device.
     Device shape_device;
-    shape_device.device_type = static_cast<DLDeviceType>(prim_shape_target->kind->device_type);
+    shape_device.device_type = static_cast<DLDeviceType>(prim_shape_target->GetTargetDeviceType());
     shape_device.device_id = 0;
 
     // 'Compile' the TIR shape function to appropriate callable form.
@@ -1017,7 +1017,7 @@ TypedPackedFunc<ObjectRef(Array<Expr>)> EvalFunction(IRModule mod, Expr expr, De
           << PrettyPrint(mod) << "and expression:" << std::endl
           << PrettyPrint(expr);
 
-  ICHECK_EQ(device.device_type, target->kind->device_type);
+  ICHECK_EQ(device.device_type, target->GetTargetDeviceType());
   Array<Target> raw_targets = {target};
   CompilationConfig config(transform::PassContext::Current(), raw_targets);
 
@@ -1106,7 +1106,7 @@ TypedPackedFunc<ObjectRef(Array<Expr>)> EvalFunction(IRModule mod, Expr expr, De
 ObjectRef Eval(Expr expr, Map<GlobalTypeVar, TypeData> type_definitions,
                std::unordered_set<String> import_set, Device device, Target target,
                Map<String, ObjectRef> attrs) {
-  ICHECK_EQ(device.device_type, target->kind->device_type);
+  ICHECK_EQ(device.device_type, target->GetTargetDeviceType());
   Array<Target> raw_targets = {target};
   CompilationConfig config(transform::PassContext::Current(), raw_targets);
 
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index a8bd3df32a90..b807f4195947 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -1067,7 +1067,7 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) {
   if (backend::IsAutoSchedulerEnabled() && config_->optional_homogeneous_target.defined()) {
     Pass major_pass = transform::AutoSchedulerLayoutRewrite();
     bool enable_layout_rewrite_targets =
-        config_->optional_homogeneous_target->kind->device_type == kDLCPU ||
+        config_->optional_homogeneous_target->GetTargetDeviceType() == kDLCPU ||
         config_->optional_homogeneous_target->GetAttr<String>("device", "") == "mali";
     if (enable_layout_rewrite_targets && pass_ctx.PassEnabled(major_pass->Info())) {
       With<Target> tctx(config_->optional_homogeneous_target);
@@ -1081,7 +1081,7 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) {
   if (backend::IsMetaScheduleEnabled() && config_->optional_homogeneous_target.defined()) {
     Pass major_pass = transform::MetaScheduleLayoutRewrite();
     bool enable_layout_rewrite_targets =
-        config_->optional_homogeneous_target->kind->device_type == kDLCPU ||
+        config_->optional_homogeneous_target->GetTargetDeviceType() == kDLCPU ||
         config_->optional_homogeneous_target->GetAttr<String>("device", "") == "mali";
     if (enable_layout_rewrite_targets && pass_ctx.PassEnabled(major_pass->Info())) {
       With<Target> tctx(config_->optional_homogeneous_target);
@@ -1164,7 +1164,7 @@ void VMCompiler::Codegen() {
   // Only the PrimFuncs will appear in per_target_modules, and there may legitimately be none.
   Map<Target, IRModule> per_tvm_target_modules = tec::GetPerTargetModules(context_.module);
   for (const auto& kv : per_tvm_target_modules) {
-    ICHECK(kv.first->kind->device_type != kDLExtDev);
+    ICHECK(kv.first->GetTargetDeviceType() != kDLExtDev);
   }
 
   // Retrieve all external runtime modules accumulated by external codegen (both function-at-a-time
diff --git a/src/runtime/vulkan/vulkan_device.h b/src/runtime/vulkan/vulkan_device.h
index a1257a732aff..59ebf430e6e6 100644
--- a/src/runtime/vulkan/vulkan_device.h
+++ b/src/runtime/vulkan/vulkan_device.h
@@ -67,7 +67,7 @@ struct VulkanQueueInsertDebugUtilsLabelFunctions {
  * \brief Stores the capabilities/limits queried from the physical device.
  *
  * The member variables here have a 1-1 mapping to Target parameters,
- * if target->kind->device_type==kDLVulkan.  A separate struct is used
+ * if target->GetTargetDeviceType()==kDLVulkan.  A separate struct is used
  * to maintain the boundary between the Vulkan runtime in
  * libtvm_runtime.so, and the Target object in libtvm.so.
  */
diff --git a/src/target/compilation_config.cc b/src/target/compilation_config.cc
index 5e001921b076..a7f708f12a15 100644
--- a/src/target/compilation_config.cc
+++ b/src/target/compilation_config.cc
@@ -42,13 +42,13 @@ Target CompilationConfigNode::FindPrimitiveTargetForDeviceOrFail(DLDeviceType de
   ICHECK_GT(device_type, 0) << "Invalid device type";
   auto itr = std::find_if(
       primitive_targets.begin(), primitive_targets.end(),
-      [device_type](const Target& target) { return target->kind->device_type == device_type; });
+      [device_type](const Target& target) { return target->GetTargetDeviceType() == device_type; });
   if (itr == primitive_targets.end()) {
     std::stringstream msg;
     msg << "No target is specified for device type " << device_type
         << ". The available device types and targets are:" << std::endl;
     for (const auto& target : primitive_targets) {
-      msg << "  " << target->kind->device_type << "-> " << target->ToDebugString() << std::endl;
+      msg << "  " << target->GetTargetDeviceType() << "-> " << target->ToDebugString() << std::endl;
     }
     LOG(FATAL) << msg.str();
   }
@@ -137,7 +137,7 @@ void CompilationConfigNode::Init(const transform::PassContext& pass_ctx,
   auto hosting_itr = std::find_if(raw_targets.begin(), raw_targets.end(), [](const Target& target) {
     // TODO(tvm-team): The kDLHexagon device can act as a host. We can remove kDLHexagon
     // here once we refactored kDLHexagon to kDLCPU.
-    return target->kind->device_type == kDLCPU || target->kind->device_type == kDLHexagon;
+    return target->GetTargetDeviceType() == kDLCPU || target->GetTargetDeviceType() == kDLHexagon;
   });
 
   // Any targets with their host field set?
@@ -149,23 +149,24 @@ void CompilationConfigNode::Init(const transform::PassContext& pass_ctx,
     // targets.
     host_target = Target((*has_host_itr)->GetHost().value(), /*host=*/Target());
     VLOG(1) << "The target " << (*has_host_itr)->ToDebugString() << " supplies a host target "
-            << host_target->ToDebugString() << " of device type " << host_target->kind->device_type;
+            << host_target->ToDebugString() << " of device type "
+            << host_target->GetTargetDeviceType();
   } else if (hosting_itr != raw_targets.end()) {
     // RULE B: If any raw target is for a device which could be a host then use the first such as
     // the host.
     host_target = Target(*hosting_itr, /*host=*/Target());
     VLOG(1) << "Using target " << host_target->ToDebugString() << " of CPU-like device type "
-            << host_target->kind->device_type << " as the host target";
+            << host_target->GetTargetDeviceType() << " as the host target";
   } else {
     // RULE C: Otherwise, create a default CPU host target.
     host_target = MakeDefaultCPUTarget();
     VLOG(1) << "Created a default target " << host_target->ToDebugString() << " of device type "
-            << host_target->kind->device_type << " for the host target";
+            << host_target->GetTargetDeviceType() << " for the host target";
   }
   ICHECK(host_target.defined());
   ICHECK(!host_target->host.defined());
 
-  if (host_target->kind->device_type != kDLCPU) {
+  if (host_target->GetTargetDeviceType() != kDLCPU) {
     // I think we're on thin ice here until we've audited the code base for assumed CPU hosts.
     VLOG(1) << "The host target is not a CPU. This is probably not going to work.";
   }
@@ -174,7 +175,7 @@ void CompilationConfigNode::Init(const transform::PassContext& pass_ctx,
   // Establish the host VirtualDevice.
   //
   host_virtual_device = virtual_device_cache_.Unique(
-      VirtualDevice(static_cast<DLDeviceType>(host_target->kind->device_type),
+      VirtualDevice(static_cast<DLDeviceType>(host_target->GetTargetDeviceType()),
                     /*virtual_device_id=*/0, host_target));
   ICHECK(host_virtual_device.defined());
   ICHECK(host_virtual_device->target.defined());
@@ -205,7 +206,7 @@ void CompilationConfigNode::Init(const transform::PassContext& pass_ctx,
   std::unordered_set<DLDeviceType> primitive_target_device_types;
   std::unordered_set<std::string> kind_names;
   for (const auto& target : primitive_targets) {
-    primitive_target_device_types.emplace(static_cast<DLDeviceType>(target->kind->device_type));
+    primitive_target_device_types.emplace(static_cast<DLDeviceType>(target->GetTargetDeviceType()));
     CHECK(kind_names.emplace(target->kind->name).second) << "Multiple targets have been given"
                                                             "for the same device kind '"
                                                          << target->kind->name << "'";
@@ -213,7 +214,7 @@ void CompilationConfigNode::Init(const transform::PassContext& pass_ctx,
   for (DLDeviceType device_type : primitive_target_device_types) {
     Target first_primitive_target;
     for (const auto& current_primitive_target : primitive_targets) {
-      if (current_primitive_target->kind->device_type != device_type) {
+      if (current_primitive_target->GetTargetDeviceType() != device_type) {
         continue;
       }
       if (!first_primitive_target.defined()) {
@@ -290,7 +291,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       p->stream << "Primitive targets:";
       for (const auto& target : node->primitive_targets) {
         p->stream << std::endl
-                  << "  " << target->kind->device_type << " |-> " << target->ToDebugString();
+                  << "  " << target->GetTargetDeviceType() << " |-> " << target->ToDebugString();
       }
       p->stream << std::endl
                 << "Default primitive virtual device: " << node->default_primitive_virtual_device;
diff --git a/src/target/spirv/spirv_support.cc b/src/target/spirv/spirv_support.cc
index a91a2a3384e0..81b5cd8b8a6a 100644
--- a/src/target/spirv/spirv_support.cc
+++ b/src/target/spirv/spirv_support.cc
@@ -32,7 +32,7 @@ namespace tvm {
 namespace codegen {
 
 SPIRVSupport::SPIRVSupport(tvm::Target target) {
-  ICHECK_EQ(target->kind->device_type, kDLVulkan)
+  ICHECK_EQ(target->GetTargetDeviceType(), kDLVulkan)
       << "SPIRVSupport can only be checked for vulkan device type";
 
   if (target->GetAttr<Integer>("vulkan_api_version")) {
diff --git a/src/target/target.cc b/src/target/target.cc
index e3e9354a61bb..cbebd0e10c46 100644
--- a/src/target/target.cc
+++ b/src/target/target.cc
@@ -622,7 +622,7 @@ bool Target::IsExternalCodegen() const {
 }
 
 bool Target::IsExternalCodegenFor(const Target& that) const {
-  return get()->kind->device_type == that->kind->device_type && IsExternalCodegen() &&
+  return get()->GetTargetDeviceType() == that->GetTargetDeviceType() && IsExternalCodegen() &&
          !that.IsExternalCodegen();
 }
 
@@ -665,6 +665,13 @@ Optional<Target> TargetNode::GetHost() const {
   return GetRef<Optional<Target>>(this->host.as<TargetNode>());
 }
 
+int TargetNode::GetTargetDeviceType() const {
+  if (Optional<Integer> device_type = GetAttr<Integer>("target_device_type")) {
+    return Downcast<Integer>(device_type)->value;
+  }
+  return kind->default_device_type;
+}
+
 String TargetNode::ToDebugString() const {
   std::ostringstream os;
   os << "Target(";
@@ -974,7 +981,7 @@ std::unordered_map<String, ObjectRef> TargetInternal::QueryDevice(int device_id,
                                                                   const TargetNode* target) {
   std::unordered_map<String, ObjectRef> output;
 
-  Device device{static_cast<DLDeviceType>(target->kind->device_type), device_id};
+  Device device{static_cast<DLDeviceType>(target->GetTargetDeviceType()), device_id};
 
   auto api = runtime::DeviceAPI::Get(device, true);
   if (!api) {
@@ -1042,6 +1049,9 @@ TVM_REGISTER_GLOBAL("target.TargetExitScope").set_body_typed(TargetInternal::Exi
 TVM_REGISTER_GLOBAL("target.TargetCurrent").set_body_typed(Target::Current);
 TVM_REGISTER_GLOBAL("target.TargetExport").set_body_typed(TargetInternal::Export);
 TVM_REGISTER_GLOBAL("target.WithHost").set_body_typed(TargetInternal::WithHost);
+TVM_REGISTER_GLOBAL("target.TargetGetDeviceType").set_body_typed([](const Target& target) {
+  return target->GetTargetDeviceType();
+});
 TVM_REGISTER_GLOBAL("target.TargetGetFeature")
     .set_body_typed([](const Target& target, const String& feature_key) {
       return target->GetFeature<ObjectRef>(feature_key);
diff --git a/src/target/virtual_device.cc b/src/target/virtual_device.cc
index ef01a2afda10..39bb11ff157b 100644
--- a/src/target/virtual_device.cc
+++ b/src/target/virtual_device.cc
@@ -68,9 +68,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 VirtualDevice::VirtualDevice(DLDeviceType device_type, int virtual_device_id, Target target,
                              MemoryScope memory_scope) {
-  ICHECK(!target.defined() || device_type == target->kind->device_type)
-      << "target " << target->ToDebugString() << " has device type " << target->kind->device_type
-      << " but virtual device has device type " << device_type;
+  ICHECK(!target.defined() || device_type == target->GetTargetDeviceType())
+      << "target " << target->ToDebugString() << " has device type "
+      << target->GetTargetDeviceType() << " but virtual device has device type " << device_type;
   auto node = make_object<VirtualDeviceNode>();
   node->device_type_int = device_type;
   node->virtual_device_id = virtual_device_id;
@@ -151,7 +151,7 @@ VirtualDevice VirtualDevice::Default(const VirtualDevice& lhs, const VirtualDevi
     defaulted_target = lhs->target;
   } else {
     // We can only default to the rhs's target if it is consistent with the device type
-    if (rhs->target.defined() && rhs->target->kind->device_type == defaulted_device_type) {
+    if (rhs->target.defined() && rhs->target->GetTargetDeviceType() == defaulted_device_type) {
       defaulted_target = rhs->target;
     }
     // else: leave as null
diff --git a/src/tir/analysis/verify_memory.cc b/src/tir/analysis/verify_memory.cc
index 6ee30e04704a..80d6897011d5 100644
--- a/src/tir/analysis/verify_memory.cc
+++ b/src/tir/analysis/verify_memory.cc
@@ -186,7 +186,7 @@ std::vector<String> VerifyMemory_(const PrimFunc& func) {
 
   if (func->GetAttr<Integer>(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) ==
       CallingConv::kDefault) {
-    MemoryAccessVerifier v(func, target.value()->kind->device_type);
+    MemoryAccessVerifier v(func, target.value()->GetTargetDeviceType());
     v.Run();
     return v.Errors();
   } else {
diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc
index bf7ff09c86c7..5b9bac03aba9 100644
--- a/src/tir/transforms/make_packed_api.cc
+++ b/src/tir/transforms/make_packed_api.cc
@@ -145,7 +145,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func) {
 
   auto target = func->GetAttr<Target>(tvm::attr::kTarget);
   ICHECK(target.defined()) << "MakePackedAPI: Require the target attribute";
-  int target_device_type = target.value()->kind->device_type;
+  int target_device_type = target.value()->GetTargetDeviceType();
 
   std::string name_hint = global_symbol.value();
 
diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc
index 87e8f38895cd..e44eb34068a6 100644
--- a/src/tir/transforms/make_unpacked_api.cc
+++ b/src/tir/transforms/make_unpacked_api.cc
@@ -50,7 +50,7 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) {
   auto* func_ptr = func.CopyOnWrite();
 
   // Setup device context
-  int target_device_type = target.value()->kind->device_type;
+  int target_device_type = target.value()->GetTargetDeviceType();
   Integer device_type(target_device_type);
   Integer device_id(0);
   PrimExpr node = StringImm("default");
diff --git a/tests/python/relay/collage/demo_collage_partitioner.py b/tests/python/relay/collage/demo_collage_partitioner.py
index 76db459d4c8e..c5a18c3832fa 100644
--- a/tests/python/relay/collage/demo_collage_partitioner.py
+++ b/tests/python/relay/collage/demo_collage_partitioner.py
@@ -280,7 +280,7 @@ def collage(model):
             logging.info("-------------- BEGIN PARTITIONED --------------")
             logging.info(partitioned_model["mod"])
             logging.info("-------------- END PARTITIONED ----------------")
-            dev = tvm.device(CUDA.kind.device_type)
+            dev = tvm.device(CUDA.get_target_device_type())
             compile_and_benchmark("collage", partitioned_model, targets, dev, tmp_dir)
 
 
@@ -309,7 +309,7 @@ def just_tensorrt(model):
         targets = []
         targets.append(CUDA)
         targets.append(trt_target)
-        dev = tvm.device(CUDA.kind.device_type)
+        dev = tvm.device(CUDA.get_target_device_type())
         compile_and_benchmark("just_tensorrt", partitioned_model, targets, dev, tmp_dir)
 
 
@@ -333,7 +333,7 @@ def just_cutlass(model):
             targets = []
             targets.append(CUDA)
             targets.append(tvm.target.Target(f"cutlass -tmp_dir={tmp_dir}", HOST))
-            dev = tvm.device(CUDA.kind.device_type)
+            dev = tvm.device(CUDA.get_target_device_type())
             compile_and_benchmark("just_cutlass", partitioned_model, targets, dev, tmp_dir)
 
 
@@ -346,7 +346,7 @@ def just_tvm(model):
     tmp_dir = tempfile.mkdtemp()
     autotvm_tune_module(model["mod"], CUDA, TUNING_LOG)
     with optional_tuning_records(TUNING_LOG):
-        dev = tvm.device(CUDA.kind.device_type)
+        dev = tvm.device(CUDA.get_target_device_type())
         compile_and_benchmark("just_tvm", model, CUDA, dev, tmp_dir)
 
 
@@ -360,7 +360,7 @@ def tvm_with_libs(model):
     cuda_target = tvm.target.Target("cuda -libs=cudnn,cublas", HOST)
     autotvm_tune_module(model["mod"], cuda_target, TUNING_LOG)
     with optional_tuning_records(TUNING_LOG):
-        dev = tvm.device(cuda_target.kind.device_type)
+        dev = tvm.device(cuda_target.get_target_device_type())
         compile_and_benchmark("tvm_with_libs", model, cuda_target, dev, tmp_dir)
 
 
diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py
index d0dfa3942f16..2b0f1b2dd7a0 100644
--- a/tests/python/unittest/test_target_target.py
+++ b/tests/python/unittest/test_target_target.py
@@ -58,7 +58,9 @@ def test_all_targets_device_type_verify():
         if tgt.kind.name not in tvm._ffi.runtime_ctypes.Device.STR2MASK:
             raise KeyError("Cannot find target kind: %s in Device.STR2MASK" % tgt.kind.name)
 
-        assert tgt.kind.device_type == tvm._ffi.runtime_ctypes.Device.STR2MASK[tgt.kind.name]
+        assert (
+            tgt.get_target_device_type() == tvm._ffi.runtime_ctypes.Device.STR2MASK[tgt.kind.name]
+        )
 
 
 def test_target_dispatch():
diff --git a/tests/scripts/release/PRERELEASE_NOTES.md b/tests/scripts/release/PRERELEASE_NOTES.md
new file mode 100644
index 000000000000..933d8d272023
--- /dev/null
+++ b/tests/scripts/release/PRERELEASE_NOTES.md
@@ -0,0 +1,24 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+Notable changes since last release
+----------------------------------
+
+* PR12509:
+   - Changed `TargetKind::device_type` to `TargetKind::default_device_type`.
+   - Introduced "target_default_device" attribute that overrides the default device.
+   - Added `Target::GetTargetDeviceType` to return the effective device type for the target.

From bf5637dc32ba18836ea1a89f057e2504a66a9d37 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 1 Oct 2022 23:41:40 -0400
Subject: [PATCH 289/704] [DOCS][COMMUNITY] Elaborate Independence Principle
 for Project Participation (#12962)

* [DOCS][COMMUNITY] Elaborate Hat and Independence Principle

This PR adds an elaboration of hat and independence principle
for project participation. This is a principle that by default
applies to all apache projects.

See also other reference materials
- http://theapacheway.com/hats/
- https://community.apache.org/projectIndependence.html

* Update docs/contribute/committer_guide.rst

Co-authored-by: Gustavo Romero <gromero@users.noreply.github.com>

Co-authored-by: Gustavo Romero <gromero@users.noreply.github.com>
---
 docs/contribute/committer_guide.rst | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/contribute/committer_guide.rst b/docs/contribute/committer_guide.rst
index d0924400543e..73506e619ab2 100644
--- a/docs/contribute/committer_guide.rst
+++ b/docs/contribute/committer_guide.rst
@@ -63,6 +63,19 @@ Here are some example applications of this principle:
   (as an RFC or a discuss thread).
 
 
+Independent Project Management
+------------------------------
+
+Everyone is presumed to be wearing their Apache committer hat when participating in the project.
+That is, committers should act - in the context of the project activities - in the best interests of the project.
+Separating your hat between committer and any other roles you may have is important in all aspects.
+
+In the context of project participation, it can be helpful to state which hat you are wearing in cases where that
+can cause confusion, especially in cases where you are not wearing committer hat. Two examples:
+
+- "Wearing [foo] hat: [message when serving as foo's role and not as committer]".
+- "Wearing Apache TVM hat: [messages when serving as committer]".
+
 Shepherd a Pull Request
 -----------------------
 

From c3357f6820496c0ed5052fc634867ab52082d3d6 Mon Sep 17 00:00:00 2001
From: Ruihang Lai <ruihangl@cs.cmu.edu>
Date: Sun, 2 Oct 2022 02:52:46 -0400
Subject: [PATCH 290/704] [Relay][Op] Register some forgotten op in Python side
 (#12963)

It is noticed that many Attrs declared in this [file](https://github.com/apache/tvm/blob/main/include/tvm/relay/attrs/transform.h) are not registered [on Python side](https://github.com/apache/tvm/blob/main/python/tvm/relay/op/op_attrs.py). This will cause these Attrs having type `Object` when running with Python.

Therefore, this PR registers the missing operators in `op_attrs.py`, to make them have right types during runtime.
---
 python/tvm/relay/op/op_attrs.py | 60 +++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py
index 7e8367abbb2f..b76097722c07 100644
--- a/python/tvm/relay/op/op_attrs.py
+++ b/python/tvm/relay/op/op_attrs.py
@@ -622,3 +622,63 @@ class FixedPointMultiplyAttrs(Attrs):
 @tvm._ffi.register_object("relay.attrs.TriluAttrs")
 class TriluAttrs(Attrs):
     """Attributes used in trilu operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.SlidingWindowAttrs")
+class SlidingWindowAttrs(Attrs):
+    """Attributes used in sliding_window operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.DynExpandDimsAttrs")
+class DynExpandDimsAttrs(Attrs):
+    """Attributes used in dynamic expand_dims operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.ScatterAddAttrs")
+class ScatterAddAttrs(Attrs):
+    """Attributes used in scatter_add operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.ScatterNDAttrs")
+class ScatterNDAttrs(Attrs):
+    """Attributes used in scatter_nd operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.GatherNDAttrs")
+class GatherNDAttrs(Attrs):
+    """Attributes used in gather_nd operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.AutoSchedulerLayoutTransformAttrs")
+class AutoSchedulerLayoutTransformAttrs(Attrs):
+    """Attributes used in AutoSchedulerLayoutTransform operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.MetaScheduleLayoutTransformAttrs")
+class MetaScheduleLayoutTransformAttrs(Attrs):
+    """Attributes used in MetaScheduleLayoutTransform operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.MatrixSetDiagAttrs")
+class MatrixSetDiagAttrs(Attrs):
+    """Attributes used in matrix_set_diag operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.ScanopAttrs")
+class ScanopAttrs(Attrs):
+    """Attributes used in cumsum and cumprod operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.UniqueAttrs")
+class UniqueAttrs(Attrs):
+    """Attributes used in unique operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.EinsumAttrs")
+class EinsumAttrs(Attrs):
+    """Attributes used in einsum operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.StftAttrs")
+class StftAttrs(Attrs):
+    """Attributes used in stft operators"""

From fa17da22c73fb9e95c27e4c28130835b628caf6b Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Mon, 3 Oct 2022 22:33:45 +0900
Subject: [PATCH 291/704] [Hexagon] Support template-free meta schedule tuning
 (#12854)

* [Metaschedule] Support template-free tuning on Hexagon

* enable multi threading

* update tests

* black
---
 python/tvm/meta_schedule/default_config.py    |  57 ++++-
 python/tvm/meta_schedule/tune.py              |  29 ++-
 .../test_hexagon/test_meta_schedule.py        | 212 +++++++++++++++++-
 3 files changed, 291 insertions(+), 7 deletions(-)

diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py
index ac4028ec50f8..c701fd6568e0 100644
--- a/python/tvm/meta_schedule/default_config.py
+++ b/python/tvm/meta_schedule/default_config.py
@@ -174,10 +174,12 @@ def schedule_rules(  # pylint: disable=redefined-outer-name
         return sch_rules()
     if sch_rules is not None:
         raise TypeError(f"Expected `sch_rules` to be None or callable, but gets: {sch_rules}")
-    if target.kind.name in ["llvm", "hexagon"]:
+    if target.kind.name == "llvm":
         return _DefaultLLVM.schedule_rules()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.schedule_rules()
+    if target.kind.name == "hexagon":
+        return _DefaultHexagon.schedule_rules()
     raise ValueError(f"Unsupported target: {target}")
 
 
@@ -190,10 +192,12 @@ def postproc(  # pylint: disable=redefined-outer-name
         return postproc()
     if postproc is not None:
         raise TypeError(f"Expected `postproc` to be None or callable, but gets: {postproc}")
-    if target.kind.name in ["llvm", "hexagon"]:
+    if target.kind.name == "llvm":
         return _DefaultLLVM.postprocs()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.postprocs()
+    if target.kind.name == "hexagon":
+        return _DefaultHexagon.postprocs()
     raise ValueError(f"Unsupported target: {target}")
 
 
@@ -277,6 +281,55 @@ def mutator_probs() -> Dict[Mutator, float]:
         }
 
 
+class _DefaultHexagon:
+    """Default tuning configuration for Hexagon."""
+
+    @staticmethod
+    def schedule_rules() -> List[ScheduleRule]:
+        from tvm.meta_schedule import schedule_rule as M
+
+        return [
+            M.AutoInline(
+                into_producer=False,
+                into_consumer=True,
+                inline_const_tensor=True,
+                disallow_if_then_else=True,
+                require_injective=True,
+                require_ordered=True,
+                disallow_op=["tir.exp"],
+            ),
+            M.MultiLevelTilingWideVector(
+                structure="SRSRS",
+                vector_length_in_bits=1024,
+                max_innermost_factor=128,
+                reuse_read=None,
+                reuse_write=M.ReuseType(
+                    req="may",
+                    levels=[1, 2],
+                    scope="global",
+                ),
+            ),
+            M.ParallelizeVectorizeUnroll(
+                max_jobs_per_core=16,
+                max_vectorize_extent=128,
+                unroll_max_steps=[0, 16, 64, 512],
+                unroll_explicit=True,
+            ),
+        ]
+
+    @staticmethod
+    def postprocs() -> List[Postproc]:
+        from tvm.meta_schedule import postproc as M
+
+        return [
+            M.DisallowDynamicLoop(),
+            M.RewriteParallelVectorizeUnroll(),
+            M.RewriteReductionBlock(),
+            # TODO(masahi): Fix RewriteLayout for link-params=True case
+            # M.RewriteLayout(),
+        ]
+
+
 class _DefaultCUDA:
     """Default tuning configuration for CUDA."""
 
diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py
index b1cc0f67bd5f..96b554d4e659 100644
--- a/python/tvm/meta_schedule/tune.py
+++ b/python/tvm/meta_schedule/tune.py
@@ -554,6 +554,7 @@ def tune_relay(
     postprocs: Optional[FnPostproc] = None,
     mutator_probs: Optional[FnMutatorProb] = None,
     num_threads: Optional[int] = None,
+    executor=None,
 ) -> Union[Module, vm.Executable]:
     """Tune a Relay IRModule with a given target.
 
@@ -581,6 +582,9 @@ def tune_relay(
         The callbacks used during tuning.
     backend : str = "graph"
         The backend to use for relay compilation(graph / vm).
+    executor : relay.backend.Executor
+        The executor to be passed to relay.build(...). In particular, its link-params
+        attribute affects task extration and workload database look up.
 
     Returns
     -------
@@ -596,8 +600,23 @@ def tune_relay(
     target = default_config.target(target)
     # pylint: enable=protected-access,
     # parse the tuning contexts
+
+    if executor is None:
+        executor = relay.backend.Executor("graph")
+
+    if "link-params" in executor.attrs:
+        link_params = executor.attrs["link-params"]
+    else:
+        link_params = False
+
     with Profiler.timeit("TaskExtraction"):
-        extracted_tasks = extract_task_from_relay(mod, target, params)
+        pass_config = {
+            "relay.FuseOps.link_params": link_params,
+            "relay.backend.use_meta_schedule": True,
+            "relay.backend.tir_converter": "default",
+        }
+        extracted_tasks = extract_task_from_relay(mod, target, params, pass_config=pass_config)
+
     database = tune_extracted_tasks(
         extracted_tasks,
         config,
@@ -613,7 +632,7 @@ def tune_relay(
         mutator_probs=mutator_probs,
         num_threads=num_threads,
     )
-    relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend]
+
     with Profiler.timeit("PostTuningCompilation"):
         with target, autotvm_silencer(), database:
             with PassContext(
@@ -624,4 +643,8 @@ def tune_relay(
                     "relay.backend.tir_converter": "default",
                 },
             ):
-                return relay_build(mod, target=target, params=params)
+                if backend == "graph":
+                    return relay.build(mod, target=target, params=params, executor=executor)
+
+                # Executor is not supported by VM
+                return relay.vm.compile(mod, target=target, params=params)
diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py
index 96d18c9b3076..74f3ab673ec8 100644
--- a/tests/python/contrib/test_hexagon/test_meta_schedule.py
+++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py
@@ -21,15 +21,20 @@
 import tempfile
 
 import tvm.testing
-from tvm import te
+import tvm.topi.testing
+from tvm import te, relay
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.arg_info import TensorInfo
 from tvm.meta_schedule.builder import BuilderInput
+from tvm.meta_schedule import postproc, schedule_rule
 from tvm.script import tir as T
 from tvm.tir import FloatImm
 from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
 from tvm.meta_schedule.runner import RunnerInput
 from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+from tvm.relay.backend import Executor
+from tvm.topi.utils import get_const_tuple
+from tvm.meta_schedule.testing import te_workload
 
 MATMUL_N = 16
 MATMUL_M = 32
@@ -166,7 +171,6 @@ def verify_dense(sch, target, M, N, K, hexagon_session):
     print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
 
 
-@pytest.mark.skip(reason="xgboost not installed on CI")
 @tvm.testing.requires_hexagon
 def test_vrmpy_dense(hexagon_launcher):
     if hexagon_launcher._serial_number == "simulator":
@@ -209,3 +213,207 @@ def schedule_dense_for_tune(sch):
 
     with hexagon_launcher.start_session() as session:
         verify_dense(sch, target, M, N, K, session)
+
+
+# This is an example of a schedule found by vrmpy auto tensorization.
+# It gets 440 GFLOPS on SD888.
+@tvm.script.ir_module
+class Module_vrmpy_auto_tensorize:
+    @T.prim_func
+    def main(
+        X: T.Buffer[(128, 768), "uint8"],
+        packedW: T.Buffer[(24, 192, 32, 4), "uint8"],
+        compute: T.Buffer[(128, 768), "int32"],
+    ) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        for i0_0_i1_0_0_fused in T.parallel(
+            512, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}
+        ):
+            for i0_1_init, i1_0_1_init, i0_2_init, i1_0_2_init in T.grid(2, 3, 1, 1):
+                with T.block("compute_o_init"):
+                    i = T.axis.spatial(128, i0_0_i1_0_0_fused // 8 * 2 + i0_1_init + i0_2_init)
+                    j_o = T.axis.spatial(24, i1_0_2_init + i0_0_i1_0_0_fused % 8 * 3 + i1_0_1_init)
+                    T.reads()
+                    T.writes(compute[i, j_o * 32 : j_o * 32 + 32])
+                    for i1_1 in T.vectorized(32):
+                        with T.block("compute_init"):
+                            j_i_init = T.axis.spatial(32, i1_1)
+                            T.reads()
+                            T.writes(compute[i, j_o * 32 + j_i_init])
+                            compute[i, j_o * 32 + j_i_init] = 0
+            for i2_0_0, i0_1, i1_0_1, i2_0_1, i0_2, i1_0_2 in T.grid(32, 2, 3, 6, 1, 1):
+                with T.block("compute_o_update"):
+                    i = T.axis.spatial(128, i0_0_i1_0_0_fused // 8 * 2 + i0_1 + i0_2)
+                    j_o = T.axis.spatial(24, i1_0_2 + i0_0_i1_0_0_fused % 8 * 3 + i1_0_1)
+                    k_o = T.axis.reduce(192, i2_0_0 * 6 + i2_0_1)
+                    T.reads(
+                        compute[i, j_o * 32 : j_o * 32 + 32],
+                        X[i, k_o * 4 : k_o * 4 + 4],
+                        packedW[j_o, k_o, 0:32, 0:4],
+                    )
+                    T.writes(compute[i, j_o * 32 : j_o * 32 + 32])
+                    A = T.match_buffer(
+                        X[i, k_o * 4 : k_o * 4 + 4], [4], dtype="uint8", offset_factor=1
+                    )
+                    B = T.match_buffer(
+                        packedW[j_o, k_o, 0:32, 0:4], [32, 4], dtype="uint8", offset_factor=1
+                    )
+                    C = T.match_buffer(
+                        compute[i, j_o * 32 : j_o * 32 + 32], [32], dtype="int32", offset_factor=1
+                    )
+                    A_u8x4: T.uint8x4 = A[0:4]
+                    A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                    B_i32x32: T.int32x32 = T.reinterpret(B[0, 0:128], dtype="int32x32")
+                    C[0:32] = T.call_llvm_pure_intrin(
+                        4390, T.uint32(3), C[0:32], B_i32x32, A_i32, dtype="int32x32"
+                    )
+
+
+@tvm.testing.requires_hexagon
+def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v68")
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+
+    M, N, K = 128, 768, 768
+    workload = te.create_prim_func(dense(M, N, K))
+
+    sch_rules = [
+        schedule_rule.MultiLevelTilingWithIntrin(
+            VRMPY_u8u8i32_INTRIN,
+            structure="SRSRS",
+            tile_binds=None,
+            max_innermost_factor=64,
+            vector_load_lens=None,
+            reuse_read=None,
+            reuse_write=schedule_rule.ReuseType(
+                req="may",
+                levels=[1, 2],
+                scope="global",
+            ),
+        ),
+        schedule_rule.ParallelizeVectorizeUnroll(
+            max_jobs_per_core=16,
+            max_vectorize_extent=128,
+            unroll_max_steps=[0, 16, 64, 512],
+            unroll_explicit=True,
+        ),
+    ]
+
+    postprocs = [
+        postproc.RewriteParallelVectorizeUnroll(),
+        postproc.RewriteReductionBlock(),
+        postproc.RewriteTensorize(vectorize_init_loop=True),
+    ]
+
+    if True:
+        with tempfile.TemporaryDirectory() as work_dir:
+            config = ms.TuneConfig(
+                strategy="replay_trace",
+                num_trials_per_iter=8,
+                max_trials_per_task=8,
+                max_trials_global=8,
+            )
+
+            sch = ms.tune_tir(
+                mod=workload,
+                target=target,
+                config=config,
+                work_dir=work_dir,
+                sch_rules=lambda: sch_rules,
+                postprocs=lambda: postprocs,
+                builder=get_hexagon_local_builder(),
+                runner=get_hexagon_rpc_runner(hexagon_launcher, number=10),
+            )
+    else:
+        sch = tvm.tir.Schedule(Module_vrmpy_auto_tensorize, debug_mask="all")
+
+    with hexagon_launcher.start_session() as session:
+        verify_dense(sch, target, M, N, K, session)
+
+
+@tvm.testing.requires_hexagon
+def test_conv2d_relay_auto_schedule(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v69")
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    I, O, H, W = 64, 64, 56, 56
+    kH = kW = 3
+
+    strides = (1, 1)
+    padding = (1, 1)
+
+    d_shape = (1, H, W, I)
+    w_shape = (kH, kW, I, O)
+    bias_shape = (1, 1, 1, w_shape[3])
+    out_channel = w_shape[3]
+
+    data = relay.var("data", shape=d_shape, dtype="float16")
+    weight = relay.var("weight", shape=w_shape, dtype="float16")
+    bias = relay.var("bias", shape=bias_shape, dtype="float16")
+    conv2d = relay.nn.conv2d(
+        data=data,
+        weight=weight,
+        kernel_size=(kH, kW),
+        channels=out_channel,
+        padding=padding,
+        strides=strides,
+        out_dtype="float16",
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+    )
+    mod = tvm.IRModule.from_expr(conv2d + bias)
+
+    data_np = np.random.randn(*d_shape).astype("float16")
+    weight_np = np.random.randn(*w_shape).astype("float16")
+    bias_np = np.random.randn(*bias_shape).astype("float16")
+    params = {"weight": weight_np, "bias": bias_np}
+
+    target_llvm = tvm.target.Target("llvm")
+
+    with tvm.transform.PassContext(
+        opt_level=3,
+    ):
+        lib_ref = relay.build(mod, target=target_llvm, params=params)
+
+    rt_mod_ref = tvm.contrib.graph_executor.GraphModule(lib_ref["default"](tvm.cpu(0)))
+
+    rt_mod_ref.set_input("data", data_np)
+
+    rt_mod_ref.run()
+
+    ref = rt_mod_ref.get_output(0).numpy()
+
+    config = ms.TuneConfig(
+        strategy="replay_trace",
+        num_trials_per_iter=8,
+        max_trials_per_task=8,
+        max_trials_global=8,
+    )
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        executor = Executor("graph", {"link-params": True})
+        lib = ms.tune_relay(
+            mod=mod,
+            params=params,
+            target=target,
+            config=config,
+            work_dir=work_dir,
+            builder=get_hexagon_local_builder(),
+            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
+            executor=executor,
+        )
+
+    with hexagon_launcher.start_session() as session:
+        rt_mod = session.get_executor_from_factory(lib)
+
+        rt_mod.set_input("data", data_np)
+
+        rt_mod.run()
+
+        out = rt_mod.get_output(0).numpy()
+        print(np.max(np.abs(ref - out)), np.mean(np.abs(ref - out)))

From f121e5e355059bc78921b107256468abf6a10bf6 Mon Sep 17 00:00:00 2001
From: Janet Schneider <janetsc@octoml.ai>
Date: Mon, 3 Oct 2022 14:13:12 -0700
Subject: [PATCH 292/704] [Hexagon] [runtime] VTCM Allocator (#12947)

Adds a VTCM Memory Pool class, which allocates the largest contiguous buffer possible within 1 page upon construction.

Allocations and free space are maintained in two lists.  Buffers that align on 2k size boundaries will choose the smallest open buffer which will satisfy the request.  Non-aligned buffers will be allocated from the end of the free space.

HexagonBuffer will use this pool to service VTCM scope requests, replacing the individual calls to allocated the memory on separate pages.

The pool is created and destroyed in the device API Acquire/ReleaseResources.

Adds unit tests to exercise edge cases.
---
 src/runtime/hexagon/hexagon_buffer.cc         |  41 ++---
 src/runtime/hexagon/hexagon_device_api.cc     |   1 -
 src/runtime/hexagon/hexagon_device_api.h      |  23 ++-
 src/runtime/hexagon/hexagon_vtcm_pool.cc      | 152 ++++++++++++++++++
 src/runtime/hexagon/hexagon_vtcm_pool.h       | 100 ++++++++++++
 .../hexagon/hexagon_device_api_tests.cc       |  12 +-
 .../hexagon/hexagon_vtcm_pool_tests.cc        | 124 ++++++++++++++
 .../topi/test_conv2d_fp16_intrin.py           |   2 +-
 8 files changed, 420 insertions(+), 35 deletions(-)
 create mode 100644 src/runtime/hexagon/hexagon_vtcm_pool.cc
 create mode 100644 src/runtime/hexagon/hexagon_vtcm_pool.h
 create mode 100644 tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc

diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc
index 3ba1b5be3d3d..861a8d9f4f7a 100644
--- a/src/runtime/hexagon/hexagon_buffer.cc
+++ b/src/runtime/hexagon/hexagon_buffer.cc
@@ -24,8 +24,8 @@
 #include <string>
 #include <utility>
 
-#include "HAP_compute_res.h"
 #include "hexagon_common.h"
+#include "hexagon_device_api.h"
 
 namespace tvm {
 namespace runtime {
@@ -57,35 +57,26 @@ struct DDRAllocation : public Allocation {
 
 struct VTCMAllocation : public Allocation {
   VTCMAllocation(size_t nbytes, size_t alignment) : Allocation(nbytes, alignment) {
-    compute_res_attr_t res_info;
-    HEXAGON_SAFE_CALL(HAP_compute_res_attr_init(&res_info));
-
-    // allocate nbytes of vtcm on a single page
-    HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param(&res_info, /*vtcm_size = */ nbytes,
-                                                          /*b_single_page = */ 0));
-
-    // TODO(HWE): Investigate why a non-zero timeout results in
-    // hanging, both in the simulator and on hardware.
-    context_id_ = HAP_compute_res_acquire(&res_info, /*timeout = */ 0);
-
-    if (context_id_) {
-      data_ = HAP_compute_res_attr_get_vtcm_ptr(&res_info);
-      if (!data_) {
-        LOG(ERROR) << "ERROR: HAP_compute_res_acquire returned nullptr when allocating VTCM.";
-        HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_));
-        return;
-      }
-    } else {
-      LOG(FATAL) << "FATAL: HAP_compute_res_acquire failed to acquire requested VTCM resource.";
-      throw std::runtime_error(
-          "HAP_compute_res_acquire failed to acquire requested VTCM resource.");
+    // TODO(HWE): Handle alignments greater than 2k
+    CHECK(alignment <= 0x800) << "VTCMAllocation called for invalid alignment";
+    if ((nbytes & 0x7FF) && ((alignment & 0x7FF) == 0)) {
+      // Caller has requested 2k alignment, but the size is not a multiple of 2k
+      // Adjust size to be a multiple of 2k so that we will allocate from the front of the pool
+      nbytes = nbytes >> 11;
+      nbytes = nbytes << 11;
+      nbytes += 0x800;
+      DLOG(INFO) << "VTCMAllocation size adjusted for alignment " << allocation_nbytes_ << " to "
+                 << nbytes;
+      allocation_nbytes_ = nbytes;
     }
+    data_ = HexagonDeviceAPI::Global()->VtcmPool()->Allocate(allocation_nbytes_);
+    DLOG(INFO) << "VTCMAllocation " << data_ << " " << allocation_nbytes_ << " " << alignment;
   }
   ~VTCMAllocation() {
-    HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_));
+    DLOG(INFO) << "~VTCMAllocation " << data_ << " " << allocation_nbytes_;
+    HexagonDeviceAPI::Global()->VtcmPool()->Free(data_, allocation_nbytes_);
     data_ = nullptr;
   }
-  unsigned int context_id_{0};
 };
 
 template <HexagonBuffer::StorageScope S>
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 06254fba4585..db3c847a55e8 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -33,7 +33,6 @@
 
 #include "../workspace_pool.h"
 #include "hexagon_common.h"
-#include "hexagon_user_dma.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index 555ca0fa51a8..1c802f353062 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -33,6 +33,7 @@
 #include "hexagon_buffer_manager.h"
 #include "hexagon_thread_manager.h"
 #include "hexagon_user_dma.h"
+#include "hexagon_vtcm_pool.h"
 
 namespace tvm {
 namespace runtime {
@@ -54,37 +55,37 @@ class HexagonDeviceAPI final : public DeviceAPI {
 
   //! \brief Ensures resource managers are in a good state for the runtime
   void AcquireResources() {
+    CHECK_EQ(runtime_vtcm, nullptr);
+    runtime_vtcm = std::make_unique<HexagonVtcmPool>();
+
     CHECK_EQ(runtime_hexbuffs, nullptr);
     runtime_hexbuffs = std::make_unique<HexagonBufferManager>();
-    DLOG(INFO) << "runtime_hexbuffs created";
     mgr = runtime_hexbuffs.get();
 
     CHECK_EQ(runtime_threads, nullptr);
     runtime_threads = std::make_unique<HexagonThreadManager>(threads, stack_size, pipe_size);
-    DLOG(INFO) << "runtime_threads created";
 
     CHECK_EQ(runtime_dma, nullptr);
     runtime_dma = std::make_unique<HexagonUserDMA>();
-    DLOG(INFO) << "runtime_dma created";
   }
 
   //! \brief Ensures all runtime resources are freed
   void ReleaseResources() {
     CHECK(runtime_dma) << "runtime_dma was not created in AcquireResources";
     runtime_dma.reset();
-    DLOG(INFO) << "runtime_dma reset";
 
     CHECK(runtime_threads) << "runtime_threads was not created in AcquireResources";
     runtime_threads.reset();
-    DLOG(INFO) << "runtime_threads reset";
 
     CHECK(runtime_hexbuffs) << "runtime_hexbuffs was not created in AcquireResources";
     if (runtime_hexbuffs && !runtime_hexbuffs->empty()) {
-      DLOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources";
+      LOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources";
     }
     mgr = &hexbuffs;
-    DLOG(INFO) << "runtime_hexbuffs reset";
     runtime_hexbuffs.reset();
+
+    CHECK(runtime_vtcm) << "runtime_vtcm was not created in AcquireResources";
+    runtime_vtcm.reset();
   }
 
   /*! \brief Currently unimplemented interface to specify the active
@@ -168,6 +169,11 @@ class HexagonDeviceAPI final : public DeviceAPI {
     return runtime_dma.get();
   }
 
+  HexagonVtcmPool* VtcmPool() {
+    CHECK(runtime_vtcm) << "runtime_vtcm has not been created";
+    return runtime_vtcm.get();
+  }
+
  protected:
   //! Standard Device API interface to copy data from one storage to another.
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
@@ -202,6 +208,9 @@ class HexagonDeviceAPI final : public DeviceAPI {
 
   //! \brief User DMA manager
   std::unique_ptr<HexagonUserDMA> runtime_dma;
+
+  //! \brief VTCM memory manager
+  std::unique_ptr<HexagonVtcmPool> runtime_vtcm;
 };
 }  // namespace hexagon
 }  // namespace runtime
diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.cc b/src/runtime/hexagon/hexagon_vtcm_pool.cc
new file mode 100644
index 000000000000..1f02e2748ff6
--- /dev/null
+++ b/src/runtime/hexagon/hexagon_vtcm_pool.cc
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "hexagon_vtcm_pool.h"
+
+#include "HAP_compute_res.h"
+#include "hexagon_common.h"
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+HexagonVtcmPool::HexagonVtcmPool() {
+  compute_res_attr_t res_info;
+  HEXAGON_SAFE_CALL(HAP_compute_res_attr_init(&res_info));
+
+  // TODO(HWE): get the max  and min size programmatically
+  const unsigned int max_size = 4 * 1024 * 1024;
+  const unsigned int min_size = 1024 * 1024;
+
+  // allocate nbytes of vtcm on a single page
+  HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param_v2(&res_info,
+                                                           /*vtcm_size = */ max_size,
+                                                           /*min_page_size = */ 1,
+                                                           /*min_vtcm_size = */ min_size));
+
+  // TODO(HWE): Investigate why a non-zero timeout results in
+  // hanging, both in the simulator and on hardware.
+  context_id_ = HAP_compute_res_acquire(&res_info, /*timeout = */ 0);
+  CHECK(context_id_) << "HAP_compute_res_acquire failed to acquire requested VTCM resource.";
+  HEXAGON_SAFE_CALL(HAP_compute_res_attr_get_vtcm_ptr_v2(&res_info, &vtcm_data_, &vtcm_size_));
+  CHECK(vtcm_data_ != nullptr) << "HAP_compute_res_acquire returned nullptr when allocating VTCM.";
+  CHECK(vtcm_size_ >= min_size)
+      << "HAP_compute_res_acquire failed to allocate minimum amount of VTCM";
+  free_.emplace_back(std::pair<char*, size_t>(static_cast<char*>(vtcm_data_), vtcm_size_));
+  // DebugDump();
+}
+
+HexagonVtcmPool::~HexagonVtcmPool() { HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_)); }
+
+void* HexagonVtcmPool::Allocate(size_t nbytes) {
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  CHECK(!free_.empty()) << "No free VTCM";
+
+  // If this is not aligned on a 2k block, allocate from the end to avoid fragmentation
+  if (nbytes & size_t(0x7FF)) {
+    DLOG(INFO) << "VTCM nbytes requested: " << nbytes << " allocate from the end";
+    auto last_free_entry = free_.rbegin();
+    CHECK(last_free_entry->second >= nbytes)
+        << "Not enough contiguous VTCM space at the end to allocate";
+    char* ptr = last_free_entry->first + (last_free_entry->second - nbytes);
+    allocations_.emplace_back(std::pair<char*, size_t>(ptr, nbytes));
+    last_free_entry->second -= nbytes;
+    // DebugDump();
+    return ptr;
+  }
+
+  auto entry_to_allocate = free_.begin();
+  for (auto it = free_.begin(); it != free_.end(); it++) {
+    if ((it->second < entry_to_allocate->second) && (it->second >= nbytes)) {
+      entry_to_allocate = it;
+      if (entry_to_allocate->second == nbytes) {
+        break;
+      }
+    }
+  }
+  CHECK(entry_to_allocate->second >= nbytes) << "Not enough contiguous VTCM space to allocate";
+  char* ptr = entry_to_allocate->first;
+  allocations_.emplace(allocations_.end(), std::pair<char*, size_t>(ptr, nbytes));
+
+  if (entry_to_allocate->second == nbytes) {
+    free_.erase(entry_to_allocate);
+  } else {
+    entry_to_allocate->first = entry_to_allocate->first + nbytes;
+    entry_to_allocate->second = entry_to_allocate->second - nbytes;
+  }
+  // DebugDump();
+  return ptr;
+}
+
+void HexagonVtcmPool::Free(void* ptr, size_t nbytes) {
+  char* ptr_to_free = static_cast<char*>(ptr);
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  auto it = std::find_if(allocations_.begin(), allocations_.end(),
+                         [&](auto entry) { return entry.first == ptr_to_free; });
+  CHECK(it != allocations_.end()) << "Attempted to free a pointer that had not been allocated";
+  CHECK(it->second == nbytes) << "Attempted to free a different size than was allocated";
+  allocations_.erase(it);
+
+  it = std::lower_bound(free_.begin(), free_.end(), std::pair<char*, size_t>(ptr_to_free, nbytes),
+                        [](auto p, auto q) { return p.first <= q.first; });
+  if (it == free_.end()) {
+    // Insert an entry at the end
+    it = free_.emplace(it, std::pair<char*, size_t>(ptr_to_free, nbytes));
+  } else {
+    CHECK(ptr_to_free != it->first) << "Attempting to free a pointer that was already free";
+    CHECK(ptr_to_free + nbytes <= it->first)
+        << "free_ is in an inconsistent state, freed block overlaps with next";
+    if (ptr_to_free + nbytes == it->first) {
+      // Make this entry bigger
+      it->first = ptr_to_free;
+      it->second += nbytes;
+    } else {
+      // Insert an entry before this
+      it = free_.emplace(it, std::pair<char*, size_t>(ptr_to_free, nbytes));
+    }
+  }
+
+  // Check for overlap with the previous entry
+  if (it != free_.begin()) {
+    auto it_prev = it;
+    it_prev--;
+    CHECK(it_prev->first + it_prev->second <= ptr_to_free)
+        << "free_ is in an inconsistent state, freed block overlaps with previous";
+    if (it_prev->first + it_prev->second == ptr_to_free) {
+      it_prev->second += it->second;
+      free_.erase(it);
+    }
+  }
+  // DebugDump();
+}
+
+void HexagonVtcmPool::DebugDump() {
+  LOG(INFO) << "VTCM list state";
+  for (auto entry : allocations_) {
+    LOG(INFO) << "VTCM alloc: " << static_cast<void*>(entry.first) << " " << entry.second;
+  }
+  for (auto entry : free_) {
+    LOG(INFO) << "VTCM  free: " << static_cast<void*>(entry.first) << " " << entry.second;
+  }
+}
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.h b/src/runtime/hexagon/hexagon_vtcm_pool.h
new file mode 100644
index 000000000000..e1292e4e10d7
--- /dev/null
+++ b/src/runtime/hexagon/hexagon_vtcm_pool.h
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_VTCM_POOL_H_
+#define TVM_RUNTIME_HEXAGON_HEXAGON_VTCM_POOL_H_
+
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <utility>
+#include <vector>
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+class HexagonVtcmPool {
+ public:
+  //! \brief Allocates all of VTCM memory, and manages allocations from the runtime
+  HexagonVtcmPool();
+
+  //! \brief Destruction deallocates the underlying VTCM allocation.
+  ~HexagonVtcmPool();
+
+  //! \brief Prevent copy construction of HexagonVtcmPool.
+  HexagonVtcmPool(const HexagonVtcmPool&) = delete;
+
+  //! \brief Prevent copy assignment with HexagonVtcmPool.
+  HexagonVtcmPool& operator=(const HexagonVtcmPool&) = delete;
+
+  //! \brief Prevent move construction.
+  HexagonVtcmPool(HexagonVtcmPool&&) = delete;
+
+  //! \brief Prevent move assignment.
+  HexagonVtcmPool& operator=(HexagonVtcmPool&&) = delete;
+
+  /* \brief Allocate memory from the VTCM manager
+   *
+   * \param nbytes The number of bytes to allocate.
+   */
+  void* Allocate(size_t nbytes);
+
+  /* \brief Copy data from a Hexagon Buffer an external buffer.
+   *
+   * \param ptr The pointer to the buffer to be freed.
+   *
+   * \param nbytes The number of bytes to be freed.
+   */
+  void Free(void* ptr, size_t nbytes);
+
+  //! \brief Returns the total number of bytes in this pool
+  size_t TotalBytes() { return reinterpret_cast<size_t>(vtcm_size_); }
+
+ private:
+  //! \brief Context for HAP_compute_res_*
+  unsigned int vtcm_size_;
+
+  //! \brief Context for HAP_compute_res_*
+  void* vtcm_data_;
+
+  //! \brief Context for HAP_compute_res_*
+  unsigned int context_id_{0};
+
+  //! \brief List of allocations
+  std::vector<std::pair<char*, size_t>> allocations_;
+
+  //! \brief List of free segments
+  std::vector<std::pair<char*, size_t>> free_;
+
+  //! \brief Mutext to protect access to the lists
+  std::mutex mutex_;
+
+  //! \brief Debug only dump of the state of the lists
+  void DebugDump();
+};
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_HEXAGON_HEXAGON_VTCM_POOL_H_
diff --git a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
index d0f962cfcee5..2139aa78f7ae 100644
--- a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
@@ -175,7 +175,7 @@ TEST_F(HexagonDeviceAPITest, thread_manager) {
   hexapi->AcquireResources();
 }
 
-// Ensure thread manager is properly configured and destroyed
+// Ensure user DMA manager is properly configured and destroyed
 // in Acquire/Release
 TEST_F(HexagonDeviceAPITest, user_dma) {
   HexagonUserDMA* user_dma = hexapi->UserDMA();
@@ -184,3 +184,13 @@ TEST_F(HexagonDeviceAPITest, user_dma) {
   EXPECT_THROW(hexapi->UserDMA(), InternalError);
   hexapi->AcquireResources();
 }
+
+// Ensure VTCM pool is properly configured and destroyed
+// in Acquire/Release
+TEST_F(HexagonDeviceAPITest, vtcm_pool) {
+  HexagonVtcmPool* vtcm_pool = hexapi->VtcmPool();
+  CHECK(vtcm_pool != nullptr);
+  hexapi->ReleaseResources();
+  EXPECT_THROW(hexapi->VtcmPool(), InternalError);
+  hexapi->AcquireResources();
+}
diff --git a/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc b/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
new file mode 100644
index 000000000000..766b414cd0a5
--- /dev/null
+++ b/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../src/runtime/hexagon/hexagon_device_api.h"
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::hexagon;
+
+class HexagonVtcmPoolTest : public ::testing::Test {
+  void SetUp() override { vtcm_pool = HexagonDeviceAPI::Global()->VtcmPool(); }
+  void TearDown() override {}
+
+ public:
+  HexagonVtcmPool* vtcm_pool;
+};
+
+TEST_F(HexagonVtcmPoolTest, basic) {
+  void* ptr;
+  size_t max_bytes = vtcm_pool->TotalBytes();
+  size_t two_k_block = 2048;
+  size_t one_k_block = 1024;
+  size_t one_byte_block = 1;
+  ptr = vtcm_pool->Allocate(max_bytes);
+  vtcm_pool->Free(ptr, max_bytes);
+  ptr = vtcm_pool->Allocate(two_k_block);
+  vtcm_pool->Free(ptr, two_k_block);
+  ptr = vtcm_pool->Allocate(one_k_block);
+  vtcm_pool->Free(ptr, one_k_block);
+  ptr = vtcm_pool->Allocate(one_byte_block);
+  vtcm_pool->Free(ptr, one_byte_block);
+}
+
+TEST_F(HexagonVtcmPoolTest, no_free_vtcm) {
+  void* ptr;
+  size_t max_bytes = vtcm_pool->TotalBytes();
+  ptr = vtcm_pool->Allocate(max_bytes);
+  EXPECT_THROW(vtcm_pool->Allocate(1), InternalError);
+  vtcm_pool->Free(ptr, max_bytes);
+}
+
+TEST_F(HexagonVtcmPoolTest, not_enough_free_vtcm) {
+  void* ptr;
+  size_t max_bytes = vtcm_pool->TotalBytes();
+  size_t two_k_block = 2048;
+  ptr = vtcm_pool->Allocate(max_bytes - two_k_block);
+  EXPECT_THROW(vtcm_pool->Allocate(two_k_block * 2), InternalError);
+  vtcm_pool->Free(ptr, max_bytes - two_k_block);
+}
+
+TEST_F(HexagonVtcmPoolTest, free_with_wrong_size) {
+  void* ptr;
+  size_t two_k_block = 2048;
+  ptr = vtcm_pool->Allocate(two_k_block * 2);
+  EXPECT_THROW(vtcm_pool->Free(ptr, two_k_block), InternalError);
+  vtcm_pool->Free(ptr, two_k_block * 2);
+}
+
+TEST_F(HexagonVtcmPoolTest, free_alloc_combinations) {
+  void* ptr1;
+  void* ptr2;
+  void* ptr3;
+  void* ptr4;
+  void* new_ptr;
+  size_t two_k_block = 2048;
+  size_t max_less_3_blocks = vtcm_pool->TotalBytes() - (3 * two_k_block);
+  ptr1 = vtcm_pool->Allocate(two_k_block);
+  ptr2 = vtcm_pool->Allocate(two_k_block);
+  ptr3 = vtcm_pool->Allocate(two_k_block);
+  ptr4 = vtcm_pool->Allocate(max_less_3_blocks);
+
+  // Make sure pointers are 2k apart from each other
+  CHECK(static_cast<char*>(ptr1) + two_k_block == static_cast<char*>(ptr2));
+  CHECK(static_cast<char*>(ptr2) + two_k_block == static_cast<char*>(ptr3));
+  CHECK(static_cast<char*>(ptr3) + two_k_block == static_cast<char*>(ptr4));
+
+  // Free 2, realloc it, make sure it is the same as before
+  vtcm_pool->Free(ptr2, two_k_block);
+  new_ptr = vtcm_pool->Allocate(two_k_block);
+  CHECK(new_ptr == ptr2);
+
+  // Free 1 and 2, re-alloc and make sure they are the same
+  vtcm_pool->Free(ptr1, two_k_block);
+  vtcm_pool->Free(ptr2, two_k_block);
+  new_ptr = vtcm_pool->Allocate(two_k_block);
+  CHECK(new_ptr == ptr1);
+  new_ptr = vtcm_pool->Allocate(two_k_block);
+  CHECK(new_ptr == ptr2);
+
+  // Exercise different deletion scenarios
+  vtcm_pool->Free(ptr2, two_k_block);
+  vtcm_pool->Free(ptr3, two_k_block);
+  vtcm_pool->Free(ptr4, max_less_3_blocks);
+  vtcm_pool->Free(ptr1, two_k_block);
+
+  ptr1 = vtcm_pool->Allocate(two_k_block);
+  ptr2 = vtcm_pool->Allocate(two_k_block);
+  ptr3 = vtcm_pool->Allocate(two_k_block);
+  vtcm_pool->Free(ptr1, two_k_block);
+  vtcm_pool->Free(ptr3, two_k_block);
+  vtcm_pool->Free(ptr2, two_k_block);
+
+  // Make sure at the end we have the full amount
+  // available again
+  ptr4 = vtcm_pool->Allocate(max_less_3_blocks);
+  vtcm_pool->Free(ptr4, max_less_3_blocks);
+}
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
index e8efdb369590..e7946d04608e 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
@@ -195,7 +195,7 @@ class TestConv2dIntrin:
     inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"])
 
     @tvm.testing.requires_hexagon
-    def test_conv2d(self, act_shape, wgt_shape, inp_stride, inp_offset, hexagon_session):
+    def DISABLED_test_conv2d(self, act_shape, wgt_shape, inp_stride, inp_offset, hexagon_session):
         """Test conv2d intrinsic implementation"""
         assert act_shape[3] == wgt_shape[2]
 

From f3d3ecebe189af77ba3ac5163882591aaf67d8b3 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 4 Oct 2022 06:40:29 +0900
Subject: [PATCH 293/704] [Hexagon] vrmpy tensorization for e2e compilation of
 int8 models (#12911)

* [Hexagon] Support vrmpy tensorization for conv2d and dense schedules

* update

* clean up

* migrate tests to test_launcher.py

* remove vrmpy test files

* use generic int8 conv2d schedule

* clean up

* doc update

* pylint fix

* parametrize dtype in test

* doc update

* add missing paralleization for dense

* more pylint

* fixed for fp32 dense
---
 python/tvm/relay/op/strategy/hexagon.py       |  37 ++++-
 python/tvm/topi/generic/conv2d.py             |  11 +-
 python/tvm/topi/hexagon/__init__.py           |   2 +
 python/tvm/topi/hexagon/conv2d.py             |  49 +++++-
 python/tvm/topi/hexagon/conv2d_alter_op.py    | 111 +++++++++++++
 python/tvm/topi/hexagon/dense.py              |  73 ++++++++-
 python/tvm/topi/hexagon/dense_alter_op.py     | 147 +++++++++++++++++
 python/tvm/topi/hexagon/injective.py          |   3 +-
 python/tvm/topi/hexagon/tensor_intrin.py      |  86 ++++++++++
 .../contrib/test_hexagon/test_launcher.py     | 149 +++++++++++++++++-
 10 files changed, 662 insertions(+), 6 deletions(-)
 create mode 100644 python/tvm/topi/hexagon/conv2d_alter_op.py
 create mode 100644 python/tvm/topi/hexagon/dense_alter_op.py

diff --git a/python/tvm/relay/op/strategy/hexagon.py b/python/tvm/relay/op/strategy/hexagon.py
index 13c808f96b95..693352d650ba 100644
--- a/python/tvm/relay/op/strategy/hexagon.py
+++ b/python/tvm/relay/op/strategy/hexagon.py
@@ -30,7 +30,7 @@ def batch_matmul_strategy_hexagon(attrs, inputs, out_type, target):
     """batch_matmul strategy for Hexagon"""
     strategy = _op.OpStrategy()
     strategy.add_implementation(
-        wrap_compute_batch_matmul(topi.nn.batch_matmul),
+        wrap_compute_batch_matmul(topi.nn.batch_matmul, need_out_dtype=True),
         wrap_topi_schedule(topi.hexagon.schedule_batch_matmul),
         name="batch_matmul.hexagon",
     )
@@ -187,3 +187,38 @@ def schedule_reduce_hexagon(attrs, outs, target):
     """Schedule reduction ops for Hexagon"""
     with target:
         return topi.hexagon.schedule_reduce(outs)
+
+
+@conv2d_NCHWc_strategy.register("hexagon")
+def conv2d_NCHWc_strategy_hexagon(attrs, inputs, out_type, target):
+    """conv2d_NCHWc_ hexagon strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_conv2d(
+            topi.hexagon.conv2d_NCHWc_int8, need_data_layout=True, need_out_layout=True
+        ),
+        wrap_topi_schedule(topi.hexagon.schedule_conv2d_NCHWc_int8),
+        name="conv2d_NCHWc_int8.hexagon",
+    )
+    return strategy
+
+
+@dense_pack_strategy.register("hexagon")
+def dense_pack_strategy_hexagon(attrs, inputs, out_type, target):
+    """dense_pack hexagon strategy"""
+    strategy = _op.OpStrategy()
+
+    if (
+        inputs[0].dtype == "uint8"
+        and inputs[1].dtype == "uint8"
+        and out_type.dtype == "int32"
+        and attrs["weight_layout"] == "NC32n4c"
+    ):
+        strategy.add_implementation(
+            wrap_compute_dense(topi.hexagon.dense.dense_u8u8i32_vrmpy_compute),
+            wrap_topi_schedule(topi.hexagon.dense.dense_u8u8i32_vrmpy_schedule),
+            name="dense_uint8.hexagon",
+            plevel=12,
+        )
+
+    return strategy
diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py
index 48b2a2f97146..76cd9a7d69d1 100644
--- a/python/tvm/topi/generic/conv2d.py
+++ b/python/tvm/topi/generic/conv2d.py
@@ -139,7 +139,16 @@ def schedule_conv_NCHWc_cpu_common_int8(
     More details - https://software.intel.com/en-us/articles/
     lower-numerical-precision-deep-learning-inference-and-training
     """
-    reg_n, unroll_kw = cfg["tile_ow"].size[-1], cfg["unroll_kw"].val
+    if isinstance(cfg["tile_ow"], int):
+        reg_n = cfg["tile_ow"]
+    else:
+        reg_n = cfg["tile_ow"].size[-1]
+
+    if isinstance(cfg["unroll_kw"], (int, bool)):
+        unroll_kw = cfg["unroll_kw"]
+    else:
+        unroll_kw = cfg["unroll_kw"].val
+
     _, _, _, _, ic_bn = get_const_tuple(data_vec.shape)
     _, _, _, _, oc_bn = get_const_tuple(conv_out.shape)
 
diff --git a/python/tvm/topi/hexagon/__init__.py b/python/tvm/topi/hexagon/__init__.py
index 295152d11631..b94526e5b919 100644
--- a/python/tvm/topi/hexagon/__init__.py
+++ b/python/tvm/topi/hexagon/__init__.py
@@ -29,3 +29,5 @@
 from .resize2d import *
 from .tensor_intrin import *
 from .qnn import *
+from .dense_alter_op import *
+from .conv2d_alter_op import *
diff --git a/python/tvm/topi/hexagon/conv2d.py b/python/tvm/topi/hexagon/conv2d.py
index d8f44d663843..aa1b7e57e464 100644
--- a/python/tvm/topi/hexagon/conv2d.py
+++ b/python/tvm/topi/hexagon/conv2d.py
@@ -14,11 +14,15 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
+# pylint: disable=invalid-name
 """Schedule for conv2d"""
 
 import tvm
+from tvm import te
+from .. import nn
 from ..utils import traverse_inline
+from .tensor_intrin import dot_vrmpy
+from ..generic import conv2d as conv2d_generic
 
 
 def schedule_conv2d_nhwc(outs):
@@ -86,3 +90,46 @@ def _callback(op):
 
     traverse_inline(s, outs[0].op, _callback)
     return s
+
+
+def conv2d_NCHWc_int8(
+    data, kernel, stride, padding, dilation, layout, out_layout, out_dtype="int32"
+):
+    """Compute definition for int8 conv2d in NCHWc layout"""
+    n_elems = int(kernel.shape[-1])
+    return nn.conv2d_NCHWc_int8(
+        data, kernel, stride, padding, dilation, layout, out_layout, out_dtype, n_elems=n_elems
+    )
+
+
+def schedule_conv2d_NCHWc_int8(outs):
+    """Schedule for int8 conv2d in NCHWc layout using vrmpy tensorization"""
+    s = te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if "conv2d_NCHWc_int8" in op.tag:
+            conv_out = op.output(0)
+            kernel_vec = conv_out.op.input_tensors[1]
+            data_vec = conv_out.op.input_tensors[0]
+            out_width = conv_out.shape[3]
+
+            reg_n = 1
+            for n in range(31, 0, -1):
+                if out_width % n == 0:
+                    reg_n = n
+                    break
+
+            cfg = {"tile_ow": reg_n, "unroll_kw": False}
+            args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]]
+            intrin = dot_vrmpy(data_vec.dtype, kernel_vec.dtype)
+
+            conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(
+                *args,
+                int32_lanes=32,
+                int8_elems=4,
+                intrin=intrin,
+                inline_fused=True,
+            )
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
diff --git a/python/tvm/topi/hexagon/conv2d_alter_op.py b/python/tvm/topi/hexagon/conv2d_alter_op.py
new file mode 100644
index 000000000000..201b6f804352
--- /dev/null
+++ b/python/tvm/topi/hexagon/conv2d_alter_op.py
@@ -0,0 +1,111 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+"""Conv2d alter op functions for Hexagon"""
+
+from tvm import relay
+from ..utils import get_const_tuple
+from .. import nn
+from ..nn import conv2d_alter_layout
+from ..generic.conv2d import conv2d_alter_int8_common
+
+
+@conv2d_alter_layout.register("hexagon")
+def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
+    """Convert nn.conv2d into nn.contrib_conv2d_nchwc if vrmpy is applicable."""
+    new_attrs = {k: attrs[k] for k in attrs.keys()}
+
+    data_layout = attrs["data_layout"]
+    kernel_layout = attrs["kernel_layout"]
+    data_tensor, kernel_tensor = tinfos
+    out_channel, in_channel, _, _ = get_const_tuple(kernel_tensor.shape)
+
+    if (
+        "int8" in data_tensor.dtype
+        and "int8" in kernel_tensor.dtype
+        and out_channel % 32 == 0
+        and in_channel % 4 == 0
+        and data_layout == "NCHW"
+        and kernel_layout == "OIHW"
+    ):
+        out_channel, in_channel, _, _ = get_const_tuple(kernel_tensor.shape)
+
+        n_elems = 4
+        oc_bn = 32
+        ic_bn = min(in_channel, 32)
+
+        new_attrs = {k: attrs[k] for k in attrs.keys()}
+
+        new_attrs["channels"] = out_channel
+        new_attrs["data_layout"] = "NCHW%dc" % ic_bn
+        new_attrs["kernel_layout"] = "OIHW{:n}i{:n}o{:n}i".format(ic_bn // n_elems, oc_bn, n_elems)
+        new_attrs["out_layout"] = "NCHW%dc" % oc_bn
+
+        return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
+
+    return None
+
+
+@nn.conv2d_legalize.register("hexagon")
+def _conv2d_legalize(attrs, inputs, arg_types):
+    """Legalize conv2d op for vrmpy tensorization.
+
+    If the inputs are signed or unsigned int8, the input and output channels are padded to be
+    a multiple of 4 and 32 respectively.
+
+    If the input data types are (int8, int8), they are converted to (uint8, int8) and
+    the vector-by-vector variant of vrmpy is applied.
+    If the input data types are (uint8, uint8), the more efficient vector-by-scalar variant of vrmpy
+    is applied.
+
+    Unlike the nn.dense case (see dense_alter_op.py), we do not convert (uint8, int8) to
+    (uint8, uint8). That would introduce another convolution by a constant (128 or 1) filter,
+    to compensate for the dtype legalization. In the nn.dense case, such compensation factor is
+    just a sum over the K axis.
+    """
+    data_layout = attrs["data_layout"]
+    kernel_layout = attrs["kernel_layout"]
+
+    output_tensor = arg_types[2]
+
+    data, kernel = inputs
+
+    if data_layout != "NCHW" or kernel_layout != "OIHW":
+        return None
+
+    data_tensor, kernel_tensor = arg_types[0], arg_types[1]
+
+    if "int8" in data_tensor.dtype and "int8" in data_tensor.dtype:
+        output_tensor = arg_types[2]
+        data, kernel = inputs
+        desired_data_dtype = "uint8"
+        in_channel_vector_length = 4
+        out_channel_vector_length = 32
+
+        return conv2d_alter_int8_common(
+            data,
+            data_tensor,
+            kernel,
+            kernel_tensor,
+            output_tensor,
+            attrs,
+            desired_data_dtype,
+            in_channel_vector_length,
+            out_channel_vector_length,
+        )
+
+    return None
diff --git a/python/tvm/topi/hexagon/dense.py b/python/tvm/topi/hexagon/dense.py
index afe53f515fa9..02ad141ecb5a 100644
--- a/python/tvm/topi/hexagon/dense.py
+++ b/python/tvm/topi/hexagon/dense.py
@@ -14,10 +14,14 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
+# pylint: disable=invalid-name
 """Schedule for dense operator"""
 
 import tvm
+from tvm.topi.utils import traverse_inline
+from tvm import te
+from .. import tag
+from .tensor_intrin import dot_vrmpy
 
 
 def schedule_dense(outs):
@@ -38,3 +42,70 @@ def schedule_dense(outs):
     s = tvm.te.create_schedule([x.op for x in outs])
     tvm.te.schedule.AutoInlineInjective(s)
     return s
+
+
+def dense_u8u8i32_vrmpy_compute(X, packed_w, bias, out_dtype):
+    """Compute for uint8 x uint8 -> int32 dense using vrmpy"""
+    assert X.dtype == "uint8" and packed_w.dtype == "uint8" and out_dtype == "int32"
+    m, k = X.shape
+    n_o, _, n_i, _ = packed_w.shape
+    assert n_i == 32
+    ak = te.reduce_axis((0, k), name="k")
+
+    C = te.compute(
+        (m, n_o * n_i),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * packed_w[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
+                "int32"
+            ),
+            axis=ak,
+        ),
+        tag="dense_u8u8i32_vrmpy",
+        name="compute",
+    )
+
+    if bias is not None:
+        C = te.compute(C.shape, lambda i, j: C[i, j] + bias[j], tag=tag.BROADCAST)
+
+    return C
+
+
+def dense_u8u8i32_vrmpy_schedule(outs):
+    """Schedule for vrmpy dense"""
+    s = te.create_schedule([x.op for x in outs])
+    # O: The output of the fused op
+    O = outs[0]
+
+    def _schedule_dense(s, C, O):
+        (a_k,) = C.op.reduce_axis
+        a_y = C.op.axis[-2]
+        a_yo, a_yi = s[C].split(a_y, factor=32)
+        a_xo, a_xi = s[C].split(C.op.axis[-1], factor=32)
+        a_ko, a_ki = s[C].split(a_k, factor=4)
+
+        s[C].reorder(a_yo, a_xo, a_yi, a_ko, a_xi, a_ki)
+
+        pc = dot_vrmpy("uint8", "uint8")
+        s[C].tensorize(a_xi, pc)
+        s[C].parallel(s[C].fuse(a_yo, a_xo))
+
+        if C != O:
+            a_y = O.op.axis[-2]
+            a_yo, a_yi = s[O].split(a_y, factor=32)
+            a_xo, a_xi = s[O].split(O.op.axis[-1], factor=32)
+
+            s[O].reorder(a_yo, a_xo, a_yi, a_xi)
+            s[O].vectorize(a_xi)
+            s[C].compute_at(s[O], a_yi)
+            s[O].parallel(s[O].fuse(a_yo, a_xo))
+
+    def _callback(op):
+        if "u8u8i32_vrmpy" in op.tag:
+            # C: The output of GEMM
+            C = op.output(0)
+            _schedule_dense(s, C, O)
+
+    traverse_inline(s, outs[0].op, _callback)
+
+    return s
diff --git a/python/tvm/topi/hexagon/dense_alter_op.py b/python/tvm/topi/hexagon/dense_alter_op.py
new file mode 100644
index 000000000000..cb5feb56d68e
--- /dev/null
+++ b/python/tvm/topi/hexagon/dense_alter_op.py
@@ -0,0 +1,147 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+"""Dense alter op functions for ARM"""
+
+import tvm
+from tvm import relay
+from .. import nn
+from ..nn import dense_alter_layout
+
+
+def check_vrmpy_applicable(x, y):
+    return (
+        "int8" in x.dtype and "int8" in y.dtype and y.shape[-2] % 32 == 0 and y.shape[-1] % 4 == 0
+    )
+
+
+@dense_alter_layout.register(["hexagon"])
+def _alter_dense_layout(attrs, inputs, tinfos, out_type):
+    data_tensor, weight_tensor = tinfos
+    out_dtype = out_type.dtype
+
+    if check_vrmpy_applicable(data_tensor, weight_tensor):
+        weight_layout = "NC32n4c"
+        return relay.nn.contrib_dense_pack(inputs[0], inputs[1], weight_layout, None, out_dtype)
+    else:
+        return None
+
+
+def vrmpy_legalize(x, w, arg_types, op, attrs):
+    """
+    Legalizes int8 inputs to dense for vrmpy.
+    X'_u8 = X_s8 + 128
+    X_s8 * W_s8 = (X'_u8 - 128) * (W'_u8 - 128)
+                = X'_u8 * W'_u8 - X'_u8 * 128 - 128 * W'_u8 + 128 * 128
+    X_u8 * W_s8 = X_u8 * (W'_u8 - 128)
+                = X'_u8 * W'_u8 - X_u8 * 128
+    """
+    if not check_vrmpy_applicable(arg_types[0], arg_types[1]):
+        return None
+
+    def cast_to_uint8(x):
+        x = relay.cast(x, "int32")
+        x = relay.add(x, relay.const(128, "int32"))
+        return relay.cast(x, "uint8")
+
+    if arg_types[0].dtype == "int8" and arg_types[1].dtype == "int8":
+        x = cast_to_uint8(x)
+        w = cast_to_uint8(w)
+
+        W_u8x128 = relay.const(-128, "int32") * relay.sum(relay.cast(w, "int32"), axis=[-1])
+        X_u8x128 = relay.const(-128, "int32") * relay.sum(relay.cast(x, "int32"), axis=[-1])
+        X_u8x128 = relay.expand_dims(X_u8x128, axis=1)
+
+        out = op(x, w, **attrs)
+
+        out += W_u8x128
+        out += X_u8x128
+
+        k_dim = int(arg_types[0].shape[-1])
+        return out + relay.const(128 * 128 * k_dim, "int32")
+
+    if arg_types[0].dtype == "uint8" and arg_types[1].dtype == "int8":
+        w = cast_to_uint8(w)
+
+        X_u8x128 = relay.expand_dims(
+            relay.const(-128, "int32") * relay.sum(relay.cast(x, "int32"), axis=[-1]), axis=1
+        )
+
+        out = op(x, w, **attrs)
+
+        return out + X_u8x128
+
+    return None
+
+
+@nn.dense_legalize.register("hexagon")
+def _dense_legalize(attrs, inputs, arg_types):
+    """Legalize dense op for HVX vectorization and vrmpy tensorization.
+
+    Given a workload with a matrix X of shape (M, K) and a matrix Y of (N, K),
+    we first pad the N dimension to be a multiple of the output vector length.
+
+    And if the inputs are signed or unsigned int8 and the Y matrix can be packed into the
+    NK32n4k layout, we convert both inputs to uint8 to apply the most efficient variant of vrmpy.
+    """
+    new_attrs = {k: attrs[k] for k in attrs.keys()}
+    # Collect the input tensors.
+    x_tensor, y_tensor = arg_types[0], arg_types[1]
+    dtype = x_tensor.dtype
+
+    # Collect the output tensor.
+    output_tensor = arg_types[2]
+
+    # Collect the input exprs.
+    x, y = inputs
+
+    N, _ = y_tensor.shape
+
+    if dtype == "float16":
+        vec_len = 64
+    elif "int8" in dtype:
+        vec_len = 32
+    else:
+        return None
+
+    if N % vec_len != 0:
+        N_padded = ((N + vec_len) // vec_len) * vec_len
+        dn = N_padded - N
+
+        y_ = relay.nn.pad(y, pad_width=((0, dn), (0, 0)))
+
+        # If units is explicitly specified, it is used to compute the output shape.
+        # We need to update units after padding to prevent a type error.
+        if attrs["units"] is not None:
+            new_attrs["units"] = N + dn
+
+        arg_types = [
+            arg_types[0],
+            tvm.ir.tensor_type.TensorType([N + dn, arg_types[1].shape[1]], arg_types[1].dtype),
+        ]
+
+        vrmpy_out = vrmpy_legalize(x, y_, arg_types, relay.nn.dense, new_attrs)
+
+        if vrmpy_out is None:
+            out_ = relay.nn.dense(x, y_, **new_attrs)
+        else:
+            out_ = vrmpy_out
+
+        out = relay.strided_slice(out_, begin=[0, 0], end=[x.value for x in output_tensor.shape])
+        return out
+
+    return vrmpy_legalize(inputs[0], inputs[1], arg_types, relay.nn.dense, attrs)
diff --git a/python/tvm/topi/hexagon/injective.py b/python/tvm/topi/hexagon/injective.py
index b1d1e1541961..bd06cb8ecd16 100644
--- a/python/tvm/topi/hexagon/injective.py
+++ b/python/tvm/topi/hexagon/injective.py
@@ -42,8 +42,9 @@ def schedule_injective(outs):
     # Fuse axes and vectorize inner elements
     for x in outs:
         fused = s[x].fuse(*x.op.axis)
-        _, inner = s[x].split(fused, factor=128 // np.dtype(x.dtype).itemsize)
+        outer, inner = s[x].split(fused, factor=128 // np.dtype(x.dtype).itemsize)
         s[x].vectorize(inner)
+        s[x].parallel(outer)
     return s
 
 
diff --git a/python/tvm/topi/hexagon/tensor_intrin.py b/python/tvm/topi/hexagon/tensor_intrin.py
index bdc63854328b..adea4690d4a7 100644
--- a/python/tvm/topi/hexagon/tensor_intrin.py
+++ b/python/tvm/topi/hexagon/tensor_intrin.py
@@ -14,10 +14,12 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+# pylint: disable=invalid-name
 """Optimized implementation of q_multiply_shift based on LLVM intrinsics"""
 
 import tvm
 from tvm.ir import register_intrin_lowering
+from tvm import te
 
 
 def _q_multiply_shift_hexagon(op):
@@ -69,3 +71,87 @@ def _q_multiply_shift_hexagon(op):
 register_intrin_lowering(
     "tir.q_multiply_shift", target="hexagon", f=_q_multiply_shift_hexagon, level=99
 )
+
+
+def dot_vrmpy(x_ty, y_ty):
+    """Generates vrmpy instruciton for tensorization."""
+    int32_lanes = 32
+    num_int8_elements = 4  # 4 int8 elements in int32
+    data = te.placeholder((num_int8_elements,), dtype=x_ty, name="data")
+    kernel = te.placeholder((int32_lanes, num_int8_elements), dtype=y_ty, name="kernel")
+    k = te.reduce_axis((0, num_int8_elements), name="k")
+    C = te.compute(
+        (int32_lanes,),
+        lambda i: te.sum(data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k),
+        name="C",
+    )
+
+    a_buffer = tvm.tir.decl_buffer(
+        data.shape, dtype=x_ty, name="a_buffer", offset_factor=1, strides=[1]
+    )
+    b_buffer = tvm.tir.decl_buffer(
+        kernel.shape, dtype=y_ty, name="b_buffer", offset_factor=1, strides=[te.var("ldw"), 1]
+    )
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            ib = tvm.tir.ir_builder.create()
+            if index == 1:
+                ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x32")))
+                return ib.get()
+
+            vec_zero = tvm.tir.const(0, "int32x32")
+
+            if x_ty == "uint8" and y_ty == "uint8":
+                a_uint8 = ins[0].vload([0], "uint8x4")
+                re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_uint8)
+                vec_b = ins[1].vload([0, 0], "uint8x128")
+
+                vrmpy_inst_name = "llvm.hexagon.V6.vrmpyub.acc.128B"
+
+                vec_bi32 = tvm.tir.call_intrin("int32x32", "tir.reinterpret", vec_b)
+
+                quad_reduction = tvm.tir.call_llvm_pure_intrin(
+                    "int32x32",
+                    vrmpy_inst_name,
+                    tvm.tir.const(3, "uint32"),
+                    vec_zero,
+                    vec_bi32,
+                    re_int32,
+                )
+            elif x_ty == "uint8" and y_ty == "int8":
+                a_uint8 = ins[0].vload([0], "uint8x4")
+                re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_uint8)
+                vec_b = ins[1].vload([0, 0], "int8x128")
+
+                vrmpy_inst_name = "llvm.hexagon.V6.vrmpybusv.acc.128B"
+
+                vec_bi32 = tvm.tir.call_intrin("int32x32", "tir.reinterpret", vec_b)
+
+                quad_reduction = tvm.tir.call_llvm_pure_intrin(
+                    "int32x32",
+                    vrmpy_inst_name,
+                    tvm.tir.const(3, "uint32"),
+                    vec_zero,
+                    re_int32.astype("int32x32"),
+                    vec_bi32,
+                )
+            else:
+                raise ValueError(f"Only (u8, u8) or (u8, i8) dtype pairs are supported by vrmpy.")
+
+            if index == 0:
+                ib.emit(outs[0].vstore(0, quad_reduction))
+            else:
+                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], "int32x32")))
+            return ib.get()
+
+        # body, reset, update
+        return _instr(0), _instr(1), _instr(2)
+
+    buffer_params = {"offset_factor": 1}
+    return te.decl_tensor_intrin(
+        C.op,
+        _intrin_func,
+        binds={data: a_buffer, kernel: b_buffer},
+        default_buffer_params=buffer_params,
+    )
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 9321ddf71d3b..7431871524aa 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -14,8 +14,9 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
+# pylint: disable=invalid-name,missing-function-docstring,redefined-outer-name
 """ Test rpc based launcher for hexagon """
+import pytest
 
 import numpy as np
 
@@ -424,5 +425,151 @@ def test_aot_executor_multiple_conv2d(hexagon_session: Session, aot_host_target,
     tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
 
 
+data_dtype = tvm.testing.parameter("int8", "uint8")
+weight_dtype = tvm.testing.parameter("int8", "uint8")
+
+
+@tvm.testing.requires_hexagon
+def test_conv2d_relay_vrmpy(hexagon_session, data_dtype, weight_dtype):
+    if data_dtype == "int8" and weight_dtype == "uint8":
+        pytest.skip("(i8, u8) input pair is not supported")
+
+    def get_conv2d_nchw(d_shape, w_shape, padding, strides=(1, 1)):
+        out_dtype = "int32"
+
+        data = relay.var("data", shape=d_shape, dtype=data_dtype)
+        weight = relay.var("weight", shape=w_shape, dtype=weight_dtype)
+        out_channel = w_shape[0]
+        return relay.nn.conv2d(
+            data=data,
+            weight=weight,
+            kernel_size=w_shape[2:],
+            channels=out_channel,
+            padding=padding,
+            strides=strides,
+            out_dtype=out_dtype,
+        )
+
+    target_hexagon = tvm.target.hexagon("v68")
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    I, O, H, W = 64, 256, 56, 56
+    kH = kW = 3
+    padding = (1, 1)
+    strides = (1, 1)
+
+    data_shape = (1, I, H, W)
+    weight_shape = (O, I, kH, kW)
+    bias_shape = (weight_shape[0],)
+
+    bias = relay.var("bias", shape=bias_shape, dtype="int32")
+
+    conv2d = get_conv2d_nchw(
+        data_shape,
+        weight_shape,
+        padding,
+        strides=strides,
+    )
+    bias_add = relay.nn.bias_add(conv2d, bias)
+    mod = tvm.IRModule.from_expr(bias_add)
+
+    if data_dtype == "uint8":
+        data_np = np.random.uniform(0, 255, size=data_shape).astype("uint8")
+    else:
+        data_np = np.random.uniform(-128, 127, size=data_shape).astype("int8")
+
+    if weight_dtype == "uint8":
+        weight_np = np.random.uniform(0, 255, size=weight_shape).astype("uint8")
+    else:
+        weight_np = np.random.uniform(-128, 127, size=weight_shape).astype("int8")
+
+    bias_np = np.random.randint(low=-127, high=128, size=bias_shape).astype("int32")
+    params = {"weight": weight_np, "bias": bias_np}
+
+    ref = (
+        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
+        .evaluate()(*[data_np, weight_np, bias_np])
+        .numpy()
+    )
+
+    with tvm.transform.PassContext(
+        opt_level=3,
+    ):
+        executor = relay.backend.Executor("graph", {"link-params": True})
+        lib = relay.build(mod, target=target, params=params, executor=executor)
+
+    asm = lib.lib.get_source("asm")
+    assert "vrmpy" in asm
+
+    rt_mod = hexagon_session.get_executor_from_factory(lib)
+
+    rt_mod.set_input("data", data_np)
+
+    rt_mod.run()
+
+    out = rt_mod.get_output(0).numpy()
+
+    np.testing.assert_equal(out, ref)
+
+
+@tvm.testing.requires_hexagon
+def test_dense_relay_vrmpy(hexagon_session, data_dtype, weight_dtype):
+    if data_dtype == "int8" and weight_dtype == "uint8":
+        pytest.skip("(i8, u8) input pair is not supported")
+
+    target_hexagon = tvm.target.hexagon("v68")
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+
+    M = 128
+    N = 1000
+    K = 2048
+    data_shape = (M, K)
+    weight_shape = (N, K)
+
+    data = relay.var("data", shape=data_shape, dtype=data_dtype)
+    weight = relay.var("weight", shape=weight_shape, dtype=weight_dtype)
+
+    dense = relay.nn.dense(data, weight, out_dtype="int32")
+
+    if data_dtype == "uint8":
+        data_np = np.random.uniform(0, 255, size=data_shape).astype("uint8")
+    else:
+        data_np = np.random.uniform(-128, 127, size=data_shape).astype("int8")
+
+    if weight_dtype == "uint8":
+        weight_np = np.random.uniform(0, 255, size=weight_shape).astype("uint8")
+    else:
+        weight_np = np.random.uniform(-128, 127, size=weight_shape).astype("int8")
+
+    bias_np = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32")
+
+    params = {"weight": weight_np, "bias": bias_np}
+
+    bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32")
+    bias_add = relay.nn.bias_add(dense, bias)
+    mod = tvm.IRModule.from_expr(bias_add)
+
+    with tvm.transform.PassContext(
+        opt_level=3,
+    ):
+        executor = relay.backend.Executor("graph", {"link-params": True})
+        lib = relay.build(mod, target=target, params=params, executor=executor)
+
+    asm = lib.lib.get_source("asm")
+    assert "vrmpy" in asm
+
+    rt_mod = hexagon_session.get_executor_from_factory(lib)
+
+    rt_mod.set_input("data", data_np)
+
+    rt_mod.run()
+
+    out = rt_mod.get_output(0).numpy()
+
+    ref = np.dot(data_np.astype("int32"), weight_np.transpose().astype("int32"))
+    ref += bias_np
+
+    np.testing.assert_equal(out, ref)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 4e260d183f06732f59ae3e73423c8805f5b8f145 Mon Sep 17 00:00:00 2001
From: LiangW-intellif <114222082+LiangW526@users.noreply.github.com>
Date: Tue, 4 Oct 2022 14:45:50 +0800
Subject: [PATCH 294/704] [BugFix][Pattern] Fixed a bug in PatternGrouper
 (#12901)

* [BugFix][Pattern] Fixed a bug in PatternGrouper

This commit adds a duplicate check for make_input in PatternGrouper::CreateGroup,
fixes a bug where the partitioned function created by CreateGroup would have
an unused redundant variable when multiple operators in the partition share an input.

* Fix for test_type_check and test_same_input_to_binary_op
---
 .../cmsisnn/scalar_to_tensor_constant.cc      |  6 ++--
 src/relay/ir/dataflow_matcher.cc              |  5 ++++
 tests/python/relay/test_dataflow_pattern.py   | 30 +++++++++++++++++++
 .../python/relay/test_pass_merge_composite.py | 14 ++++-----
 4 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc b/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc
index 40fd773eb209..0e2036505b6f 100644
--- a/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc
+++ b/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc
@@ -117,7 +117,8 @@ class ScalarToTensorConstantMutator : public MixedModeMutator {
   // operand tensor in a binary op (add or multiply supported via CMSIS-NN path). This applies only
   // to 1st and 2nd arguments of the ops.
   Call ReplaceScalarWithTensorVariable(Call call) {
-    if (!WorthyOfScalarToTensorReplacement(call)) {
+    // Returns if the operands of the binary operator come from the same input.
+    if (!WorthyOfScalarToTensorReplacement(call) || call->args.size() < 2) {
       return call;
     }
     Array<Expr> new_args(call->args);
@@ -146,7 +147,8 @@ class ScalarToTensorConstantMutator : public MixedModeMutator {
   // operand tensor in a binary op (add or multiply supported via CMSIS-NN path). This applies only
   // to 1st and 2nd arguments of the ops.
   Call ReplaceScalarWithTensorConstant(Call call, Function func) {
-    if (!WorthyOfScalarToTensorReplacement(func)) {
+    // Returns if the operands of the binary operator come from the same input.
+    if (!WorthyOfScalarToTensorReplacement(func) || call->args.size() < 2) {
       return call;
     }
     Array<Expr> new_args(call->args);
diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index 42fec9e27af2..7518380de3b1 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -646,6 +646,11 @@ void PatternGrouper::CreateGroup(const Expr& expr) {
     auto make_input = [&](const Expr& input) {
       if (fuzzy_matches.count(input) == 0 && input.as<OpNode>() == nullptr &&
           input.as<FunctionNode>() == nullptr && !EmbedConst(input, node->ref())) {
+        // Avoid adding parameters repeatedly because multiple operatorss in the partition
+        // may use the same input.
+        if (inputs.find(input) != inputs.end()) {
+          return;
+        }
         inputs[input] =
             Var("FunctionVar_" + std::to_string(graph_number_) + "_" + std::to_string(var_number),
                 NullValue<Type>());
diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py
index ba066e9a438f..24a405b0f6fd 100644
--- a/tests/python/relay/test_dataflow_pattern.py
+++ b/tests/python/relay/test_dataflow_pattern.py
@@ -1840,5 +1840,35 @@ def @main(%data: Tensor[(16, 16, 32, 32), float16], %weight: Tensor[(32, 16, 3,
     tvm.ir.assert_structural_equal(actual_mod, expected_mod)
 
 
+def test_partition_parallel_branch_with_same_input():
+    """In this example, conv2d's two consumer(add and multiply) on two different branches are
+    merged into one partition, make sure that the partitioned function has no redundant parameters"""
+    # Pattern
+    path1 = is_op("multiply")(wildcard(), wildcard())
+    path2 = is_op("add")(wildcard(), wildcard())
+    pattern = is_op("add")(path1, path2)
+
+    i = relay.Var("input")
+    w = relay.Var("weight")
+    l = relay.Var("left")
+    r = relay.Var("right")
+
+    conv2d = relay.op.nn.conv2d(i, w)
+    branch1 = relay.multiply(l, conv2d)
+    branch2 = relay.add(conv2d, r)
+    add = relay.add(branch1, branch2)
+
+    lf = relay.Var("leftf")
+    mf = relay.Var("midf")
+    rf = relay.Var("rightf")
+    f = relay.Function([lf, mf, rf], (lf * mf) + (mf + rf)).with_attr(
+        "PartitionedFromPattern", "multiply_add_add_"
+    )
+
+    partitioned = pattern.partition(add)
+    reference = f(l, conv2d, r)
+    assert tvm.ir.structural_equal(partitioned, reference)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/relay/test_pass_merge_composite.py b/tests/python/relay/test_pass_merge_composite.py
index 1ec8f978d822..06cb1ecde78f 100644
--- a/tests/python/relay/test_pass_merge_composite.py
+++ b/tests/python/relay/test_pass_merge_composite.py
@@ -915,14 +915,13 @@ def expected_false():
         b = relay.var("b", shape=(8,))
 
         x0 = relay.var("x")
-        y0 = relay.var("y")
 
-        add = relay.op.add(y0, y0)
+        add = relay.op.add(x0, x0)
         relu = relay.nn.relu(add)
-        func = relay.Function([x0, y0], relu)
+        func = relay.Function([x0], relu)
         func = func.with_attr("PartitionedFromPattern", "add_nn.relu_")
         func = func.with_attr("Composite", "add_relu")
-        call = relay.Call(func, [x, x])
+        call = relay.Call(func, [x])
 
         conv = relay.nn.conv2d(
             call, w, kernel_size=(3, 3), kernel_layout="OIHW", data_layout="NHWC"
@@ -937,14 +936,13 @@ def expected_true():
         b = relay.var("b", shape=(8,))
 
         x0 = relay.var("x")
-        y0 = relay.var("y")
 
-        add = relay.op.add(y0, y0)
+        add = relay.op.add(x0, x0)
         relu = relay.nn.relu(add)
-        func = relay.Function([x0, y0], relu)
+        func = relay.Function([x0], relu)
         func = func.with_attr("PartitionedFromPattern", "add_nn.relu_")
         func = func.with_attr("Composite", "add_relu")
-        call = relay.Call(func, [x, x])
+        call = relay.Call(func, [x])
 
         x2 = relay.var("x")
         w1 = relay.var("w")

From de6d8067754d746d88262c530b5241b5577b9aae Mon Sep 17 00:00:00 2001
From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com>
Date: Tue, 4 Oct 2022 10:09:34 +0100
Subject: [PATCH 295/704] [CMSIS-NN] Support for int16 conv2d (#12950)

Support for int16 Conv2d via CMSIS-NN

-Pattern matching and RelayToTIR introduce int16 support
-Added new context buffer size APIs for int16 Conv2d
-Added int16 variants to integration and buffer size tests
---
 python/tvm/relay/op/contrib/cmsisnn.py        |  47 ++++--
 .../backend/contrib/cmsisnn/buffer_size.cc    |  80 +++++++++-
 .../backend/contrib/cmsisnn/buffer_size.h     |  36 ++++-
 .../backend/contrib/cmsisnn/relay_to_tir.cc   |  52 ++++---
 .../backend/contrib/cmsisnn/tir_to_runtime.cc |   4 +-
 .../contrib/cmsisnn/buffer_size_test.cc       |  86 +++++++++--
 .../contrib/test_cmsisnn/test_conv2d.py       | 142 +++++++++++-------
 tests/python/contrib/test_cmsisnn/utils.py    |  65 ++++----
 8 files changed, 374 insertions(+), 138 deletions(-)

diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index b887fafd7e00..8964937469c4 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -121,20 +121,14 @@ def check_qnn_conv2d(pattern):
             requantize = pattern
         requantize_input = requantize.args[0]
         bias_add = None
-        bias_dtype = "int32"
         if str(requantize_input.op.name) == "nn.bias_add":
             bias_add = requantize_input
             conv2d = bias_add.args[0]
-            bias_dtype = bias_add.args[1].checked_type.dtype
         else:
             conv2d = requantize_input
         conv2d_input = conv2d.args[0]
         conv2d_weight = conv2d.args[1]
 
-        # kernel zero_point should be 0
-        kernel_zp = conv2d.args[3].data.numpy()
-        kernel_zp = [kernel_zp] if kernel_zp.ndim == 0 else kernel_zp
-
         # check if depthwise Conv2D
         kernel_layout = conv2d.attrs.kernel_layout
         pos_o = kernel_layout.index("O")
@@ -145,12 +139,43 @@ def check_qnn_conv2d(pattern):
         ):
             is_depthwise = True
 
+        # check if dtypes are supported for the following entities
+        # (input_dtype, weight_dtype, bias_dtype, out_dtype, pattern_dtype)
+        are_dtypes_valid = False
+        conv2d_input_dtype = conv2d_input.checked_type.dtype
+        if bias_add:
+            bias_dtype = bias_add.args[1].checked_type.dtype
+        else:
+            # this is only to enable to following check that validates all sorts of dtypes
+            bias_dtype = "int32" if conv2d_input_dtype == "int8" else "int64"
+        valid_dtypes = None
+        if conv2d_input_dtype == "int8":
+            valid_dtypes = ("int8", "int8", "int32", "int32", "int8")
+        elif conv2d_input_dtype == "int16":
+            valid_dtypes = ("int16", "int8", "int64", "int64", "int16")
+
+        if (
+            conv2d_input_dtype,
+            conv2d_weight.checked_type.dtype,
+            bias_dtype,
+            conv2d.attrs.out_dtype,
+            pattern.checked_type.dtype,
+        ) == valid_dtypes:
+            are_dtypes_valid = True
+
+        # input_zero_point should be 0 when int16
+        valid_input_zp = True
+        if conv2d_input_dtype == "int16" and conv2d.args[2].data.numpy().item(0) != 0:
+            valid_input_zp = False
+
+        # kernel zero_point should be 0
+        kernel_zp = conv2d.args[3].data.numpy()
+        kernel_zp = [kernel_zp] if kernel_zp.ndim == 0 else kernel_zp
+
+        # combination of all checks to decide if pattern is eligible for partitioning
         ret = (
-            conv2d.attrs.out_dtype == "int32"
-            and conv2d_input.checked_type.dtype == "int8"
-            and conv2d_weight.checked_type.dtype == "int8"
-            and pattern.checked_type.dtype == "int8"
-            and bias_dtype == "int32"
+            are_dtypes_valid
+            and valid_input_zp
             and all([zp == 0 for zp in kernel_zp])
             and (not is_depthwise or bias_add is not None)
         )
diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.cc b/src/relay/backend/contrib/cmsisnn/buffer_size.cc
index 25f4d054e810..d5ac80cdfc26 100644
--- a/src/relay/backend/contrib/cmsisnn/buffer_size.cc
+++ b/src/relay/backend/contrib/cmsisnn/buffer_size.cc
@@ -29,10 +29,27 @@ namespace relay {
 namespace contrib {
 namespace cmsisnn {
 
-int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
-                     int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
-                     int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h,
-                     int32_t filter_w, int32_t filter_h) {
+int Conv2dBufferSize(bool is_int16, Target target, int32_t padding_w, int32_t padding_h,
+                     int32_t input_n, int32_t input_h, int32_t input_c, int32_t output_h,
+                     int32_t output_w, int32_t stride_w, int32_t stride_h, int32_t dilation_w,
+                     int32_t dilation_h, int32_t filter_w, int32_t filter_h) {
+  int size = -1;
+  if (is_int16) {
+    size = Conv2dBufferSizeInt16(target, padding_w, padding_h, input_n, input_h, input_c, output_h,
+                                 output_w, stride_w, stride_h, dilation_w, dilation_h, filter_w,
+                                 filter_h);
+  } else {
+    size = Conv2dBufferSizeInt8(target, padding_w, padding_h, input_n, input_h, input_c, output_h,
+                                output_w, stride_w, stride_h, dilation_w, dilation_h, filter_w,
+                                filter_h);
+  }
+  return size;
+}
+
+int Conv2dBufferSizeInt8(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
+                         int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
+                         int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h,
+                         int32_t filter_w, int32_t filter_h) {
   bool is1x1 = (padding_w == 0) && (padding_h == 0) && (input_c % 4 == 0) && (stride_w == 1) &&
                (stride_h == 1) && (filter_w == 1) && (filter_h == 1) && (dilation_w == 1) &&
                (dilation_h == 1);
@@ -62,9 +79,38 @@ int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_
   return 0;
 }
 
-int DepthwiseConv2dBufferSize(Target target, int32_t input_n, int32_t input_c, int32_t output_c,
-                              int32_t filter_w, int32_t filter_h, int32_t dilation_w,
-                              int32_t dilation_h) {
+int Conv2dBufferSizeInt16(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
+                          int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
+                          int32_t stride_w, int32_t stride_h, int32_t dilation_w,
+                          int32_t dilation_h, int32_t filter_w, int32_t filter_h) {
+  bool has_mve = target->GetFeature<Bool>("has_mve").value_or(Bool(false));
+  bool has_dsp = target->GetFeature<Bool>("has_dsp").value_or(Bool(false));
+
+  if (has_dsp && !has_mve) {
+    if ((filter_w * filter_h * input_c < 512) && dilation_w == 1 && dilation_h == 1) {
+      return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
+    }
+  }
+  return 0;
+}
+
+int DepthwiseConv2dBufferSize(bool is_int16, Target target, int32_t input_n, int32_t input_c,
+                              int32_t output_c, int32_t filter_w, int32_t filter_h,
+                              int32_t dilation_w, int32_t dilation_h, int32_t depth_multiplier) {
+  int size = -1;
+  if (is_int16) {
+    size = DepthwiseConv2dBufferSizeInt16(target, input_n, input_c, output_c, filter_w, filter_h,
+                                          dilation_w, dilation_h, depth_multiplier);
+  } else {
+    size = DepthwiseConv2dBufferSizeInt8(target, input_n, input_c, output_c, filter_w, filter_h,
+                                         dilation_w, dilation_h, depth_multiplier);
+  }
+  return size;
+}
+
+int DepthwiseConv2dBufferSizeInt8(Target target, int32_t input_n, int32_t input_c, int32_t output_c,
+                                  int32_t filter_w, int32_t filter_h, int32_t dilation_w,
+                                  int32_t dilation_h, int32_t depth_multiplier) {
   bool has_mve = target->GetFeature<Bool>("has_mve").value_or(Bool(false));
   bool has_dsp = target->GetFeature<Bool>("has_dsp").value_or(Bool(false));
 
@@ -78,6 +124,26 @@ int DepthwiseConv2dBufferSize(Target target, int32_t input_n, int32_t input_c, i
   return 0;
 }
 
+int DepthwiseConv2dBufferSizeInt16(Target target, int32_t input_n, int32_t input_c,
+                                   int32_t output_c, int32_t filter_w, int32_t filter_h,
+                                   int32_t dilation_w, int32_t dilation_h,
+                                   int32_t depth_multiplier) {
+  bool has_mve = target->GetFeature<Bool>("has_mve").value_or(Bool(false));
+  bool has_dsp = target->GetFeature<Bool>("has_dsp").value_or(Bool(false));
+
+  if (depth_multiplier == 1 && dilation_w == 1 && dilation_h == 1 &&
+      filter_w * filter_h * input_c < 512) {
+    if (has_dsp) {
+      if (has_mve) {
+        return 4 * input_c * filter_w * filter_h * (int32_t)sizeof(int16_t) + 8;
+      } else {
+        return input_c * filter_w * filter_h * (int32_t)sizeof(int16_t);
+      }
+    }
+  }
+  return 0;
+}
+
 int AvgPoolBufferSize(Target target, int32_t input_c) {
   bool has_mve = target->GetFeature<Bool>("has_mve").value_or(Bool(false));
   bool has_dsp = target->GetFeature<Bool>("has_dsp").value_or(Bool(false));
diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.h b/src/relay/backend/contrib/cmsisnn/buffer_size.h
index 9dae17c0a220..5cf8c309cc5e 100644
--- a/src/relay/backend/contrib/cmsisnn/buffer_size.h
+++ b/src/relay/backend/contrib/cmsisnn/buffer_size.h
@@ -41,6 +41,7 @@ namespace cmsisnn {
  * See:
  * https://github.com/ARM-software/CMSIS_5/blob/8c60448c0e1e50e426180b26db9bc31ddf774361/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L108-L127
  *
+ * \param is_int16 - type of conv2d
  * \param target - CMSIS-NN Target
  * \param padding_w - Width padding
  * \param padding_h - Height padding
@@ -56,16 +57,27 @@ namespace cmsisnn {
  *
  * \return Size of buffer to allocate for convolution
  */
-int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
-                     int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
-                     int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h,
-                     int32_t filter_w, int32_t filter_h);
+int Conv2dBufferSize(bool is_int16, Target target, int32_t padding_w, int32_t padding_h,
+                     int32_t input_n, int32_t input_h, int32_t input_c, int32_t output_h,
+                     int32_t output_w, int32_t stride_w, int32_t stride_h, int32_t dilation_w,
+                     int32_t dilation_h, int32_t filter_w, int32_t filter_h);
+
+int Conv2dBufferSizeInt8(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
+                         int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
+                         int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h,
+                         int32_t filter_w, int32_t filter_h);
+
+int Conv2dBufferSizeInt16(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
+                          int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
+                          int32_t stride_w, int32_t stride_h, int32_t dilation_w,
+                          int32_t dilation_h, int32_t filter_w, int32_t filter_h);
 
 /*!
  * \brief Calculates the appropriate buffer size for CMSIS-NN Depthwise Convolutions
  * See:
  * https://github.com/ARM-software/CMSIS_5/blob/325443e52637b6c7eedbd160d238a6c462e89c9f/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c#L115-L129
  *
+ * \param is_int16 - type of conv2d
  * \param target - CMSIS-NN Target
  * \param input_n - Input batch size
  * \param input_c - Input channels
@@ -74,12 +86,22 @@ int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_
  * \param filter_h - Filter height
  * \param dilation_w - Dilation width
  * \param dilation_h - Dilation height
+ * \param depth_multiplier - Depth Multiplier for Depthwise Convolution
  *
  * \return Size of buffer to allocate for depthwise convolution
  */
-int DepthwiseConv2dBufferSize(Target target, int32_t input_n, int32_t input_c, int32_t output_c,
-                              int32_t filter_w, int32_t filter_h, int32_t dilation_w,
-                              int32_t dilation_h);
+int DepthwiseConv2dBufferSize(bool is_int16, Target target, int32_t input_n, int32_t input_c,
+                              int32_t output_c, int32_t filter_w, int32_t filter_h,
+                              int32_t dilation_w, int32_t dilation_h, int32_t depth_multiplier);
+
+int DepthwiseConv2dBufferSizeInt8(Target target, int32_t input_n, int32_t input_c, int32_t output_c,
+                                  int32_t filter_w, int32_t filter_h, int32_t dilation_w,
+                                  int32_t dilation_h, int32_t depth_multiplier);
+
+int DepthwiseConv2dBufferSizeInt16(Target target, int32_t input_n, int32_t input_c,
+                                   int32_t output_c, int32_t filter_w, int32_t filter_h,
+                                   int32_t dilation_w, int32_t dilation_h,
+                                   int32_t depth_multiplier);
 
 /*!
  * \brief Calculates the appropriate buffer size for CMSIS-NN Average Pooling
diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
index a5cdfd570fea..da51e6b762dd 100644
--- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
+++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
@@ -93,17 +93,17 @@ class RelayToTIRVisitor : public MixedModeMutator {
                                const Map<tir::Var, tir::Buffer>& buffer_map,
                                tvm::Array<PrimExpr> call_extern_args,
                                PrimExpr context_buffer_var = PrimExpr(),
-                               int context_buffer_size = 0) {
+                               int context_buffer_size = 0, int num_bits = 8) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set(tvm::attr::kGlobalSymbol, global_var->name_hint);
     dict_attrs.Set(tvm::attr::kTarget, target_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
-        tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
+        tvm::tir::Call(DataType::Int(num_bits), tir::builtin::call_extern(), call_extern_args));
 
     if (context_buffer_size) {
-      body = tir::Allocate(Downcast<tir::Var>(context_buffer_var), DataType::Int(8),
+      body = tir::Allocate(Downcast<tir::Var>(context_buffer_var), DataType::Int(num_bits),
                            {context_buffer_size}, tir::const_true(), body);
     }
 
@@ -133,6 +133,22 @@ class RelayToTIRVisitor : public MixedModeMutator {
     } else {
       conv2d_call = requantize_input;
     }
+    int32_t dtype_bits = conv2d_call->args[0]->type_as<TensorTypeNode>()->dtype.bits();
+
+    // Determine bitwidth of buffers based on input dtype
+    int32_t input_bits = 8;
+    int32_t filter_bits = 8;
+    int32_t bias_bits = 32;
+    int32_t output_bits = 8;
+    int32_t context_buffer_bits = 8;
+    bool is_int16 = false;
+    if (dtype_bits == 16) {
+      is_int16 = true;
+      input_bits = 16;
+      bias_bits = 64;
+      output_bits = 16;
+      context_buffer_bits = 16;
+    }
 
     // TIR variables are created in the order they appear in the Relay partitioned function
     // %1 = qnn.conv2d(%input, %weight_const_0, input_zero_point_scalar,
@@ -145,14 +161,14 @@ class RelayToTIRVisitor : public MixedModeMutator {
     const int filter_scale_pos = 3;
     const int input_scale_pos = bias_add_call ? 5 : 4;
     BufferCreator buffer_creator;
-    tir::Var input = buffer_creator.CreateBufferVar("input", DataType::Handle(8));
-    tir::Var filter = buffer_creator.CreateBufferVar("filter", DataType::Handle(8));
+    tir::Var input = buffer_creator.CreateBufferVar("input", DataType::Handle(input_bits));
+    tir::Var filter = buffer_creator.CreateBufferVar("filter", DataType::Handle(filter_bits));
     tir::Var multiplier = buffer_creator.CreateBufferVar("multiplier", DataType::Handle(32));
     if (bias_add_call) {
-      buffer_creator.CreateBufferVar("bias", DataType::Handle(32));
+      buffer_creator.CreateBufferVar("bias", DataType::Handle(bias_bits));
     }
     tir::Var shift = buffer_creator.CreateBufferVar("shift", DataType::Handle(32));
-    tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(8));
+    tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(output_bits));
 
     // Relay function contains input_scale and filter_scale as function parameters at the following
     // locations in the global partitioned function for Conv2D
@@ -217,10 +233,10 @@ class RelayToTIRVisitor : public MixedModeMutator {
     scalar_args.push_back(ToArg(depth_multiplier));
 
     // original filter_layout for depthwise is HWOI
-    std::string cmsisnn_api = "arm_convolve_wrapper_s8";
+    std::string cmsisnn_api = is_int16 ? "arm_convolve_wrapper_s16" : "arm_convolve_wrapper_s8";
     bool is_depthwise = depth_multiplier != -1;
     if (is_depthwise) {
-      cmsisnn_api = "arm_depthwise_conv_wrapper_s8";
+      cmsisnn_api = is_int16 ? "arm_depthwise_conv_wrapper_s16" : "arm_depthwise_conv_wrapper_s8";
       int filter_pos_h = kernel_layout.find("H");
       int filter_pos_w = kernel_layout.find("W");
       Array<PrimExpr> depthwise_filter_shape{1, filter_shape[filter_pos_h],
@@ -242,18 +258,20 @@ class RelayToTIRVisitor : public MixedModeMutator {
     Target target = CreateTarget(transform::PassContext::Current());
     size_t context_buffer_size;
     if (is_depthwise) {
-      context_buffer_size = DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w,
-                                                      filter_h, dilation_w, dilation_h);
+      context_buffer_size =
+          DepthwiseConv2dBufferSize(is_int16, target, input_n, input_c, output_c, filter_w,
+                                    filter_h, dilation_w, dilation_h, depth_multiplier);
     } else {
-      context_buffer_size = Conv2dBufferSize(target, padding_w, padding_h, input_n, input_h,
-                                             input_c, output_h, output_w, stride_w, stride_h,
-                                             dilation_w, dilation_h, filter_w, filter_h);
+      context_buffer_size = Conv2dBufferSize(is_int16, target, padding_w, padding_h, input_n,
+                                             input_h, input_c, output_h, output_w, stride_w,
+                                             stride_h, dilation_w, dilation_h, filter_w, filter_h);
     }
 
     if (context_buffer_size) {
       String context_buffer_name = "context_buffer_" + std::to_string(context_buffer_id_++);
-      context_buffer_var = tir::Var(context_buffer_name,
-                                    PointerType(PrimType(DataType::Int(8)), "global.workspace"));
+      context_buffer_var =
+          tir::Var(context_buffer_name,
+                   PointerType(PrimType(DataType::Int(context_buffer_bits)), "global.workspace"));
     }
     tvm::Array<PrimExpr> context_buffer_args = {context_buffer_var, ToArg(context_buffer_size)};
 
@@ -266,7 +284,7 @@ class RelayToTIRVisitor : public MixedModeMutator {
 
     CreatePrimFuncForExtern(global_var, buffer_creator.GetPrimFuncParams(),
                             buffer_creator.GetBufferMap(), call_ext_args, context_buffer_var,
-                            context_buffer_size);
+                            context_buffer_size, context_buffer_bits);
   }
 
   void EmitFullyConnected(const GlobalVar& global_var, const Expr& expr) {
diff --git a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
index 50fa3821b7fa..ae9f195ca509 100644
--- a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
+++ b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
@@ -111,7 +111,9 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
         cmsis_func_name == "arm_elementwise_add_s8") {
       CodeGenC::VisitExpr_(op, os);
     } else if (cmsis_func_name == "arm_convolve_wrapper_s8" ||
-               cmsis_func_name == "arm_depthwise_conv_wrapper_s8") {
+               cmsis_func_name == "arm_convolve_wrapper_s16" ||
+               cmsis_func_name == "arm_depthwise_conv_wrapper_s8" ||
+               cmsis_func_name == "arm_depthwise_conv_wrapper_s16") {
       EmitConv2D(op);
     } else if (cmsis_func_name == "arm_fully_connected_s8") {
       EmitFullyConnected(op);
diff --git a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc
index d8870fa71525..2094b70eb872 100644
--- a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc
+++ b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc
@@ -46,10 +46,10 @@ static const Target kNoExt("cmsis-nn -mcpu=cortex-m55 -mattr=+nodsp,+nomve");
 
 class CMSISNNCalculatedBufferSize : public testing::TestWithParam<std::array<int32_t, 3>> {};
 
-TEST(CMSISNNConv2dBufferSize, Conv1x1) {
+TEST(CMSISNNConv2dBufferSizeInt8, Conv1x1) {
   int32_t any = fake_parameters(gen);
   auto conv2d_1x1 = [=](Target target, int32_t input_c) {
-    return Conv2dBufferSize(target, 0, 0, any, any, input_c, any, any, 1, 1, 1, 1, 1, 1);
+    return Conv2dBufferSizeInt8(target, 0, 0, any, any, input_c, any, any, 1, 1, 1, 1, 1, 1);
   };
 
   ASSERT_EQ(conv2d_1x1(kNoExt, 4), 0);
@@ -71,7 +71,7 @@ TEST(CMSISNNConv2dBufferSize, Conv1x1) {
   ASSERT_EQ(conv2d_1x1(kHasMVE, 32), 0);
 }
 
-TEST(CMSISNNConv2dBufferSize, Conv1xN) {
+TEST(CMSISNNConv2dBufferSizeInt8, Conv1xN) {
   int32_t any = fake_parameters(gen);
   int32_t input_c = fake_parameters(gen);
   int32_t filter_w = fake_parameters(gen);
@@ -79,8 +79,8 @@ TEST(CMSISNNConv2dBufferSize, Conv1xN) {
   int32_t calculated_buffer = (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
 
   auto conv2d_1xn = [=](Target target, int32_t output_w) {
-    return Conv2dBufferSize(target, any, any, 1, 1, input_c, 1, output_w, any, any, 1, 1, filter_w,
-                            filter_h);
+    return Conv2dBufferSizeInt8(target, any, any, 1, 1, input_c, 1, output_w, any, any, 1, 1,
+                                filter_w, filter_h);
   };
 
   ASSERT_EQ(conv2d_1xn(kNoExt, 4), calculated_buffer);
@@ -102,7 +102,7 @@ TEST(CMSISNNConv2dBufferSize, Conv1xN) {
   ASSERT_EQ(conv2d_1xn(kHasMVE, 32), 0);
 }
 
-TEST(CMSISNNConv2dBufferSize, Default) {
+TEST(CMSISNNConv2dBufferSizeInt8, Default) {
   int32_t any = fake_parameters(gen);
 
   int32_t input_c = fake_parameters(gen);
@@ -114,8 +114,8 @@ TEST(CMSISNNConv2dBufferSize, Default) {
   int32_t calculated_buffer_mve = 4 * col_length * 8 * (int32_t)sizeof(int8_t);
 
   auto conv2d = [=](Target target, int32_t output_w) {
-    return Conv2dBufferSize(target, any, any, 1, 1, input_c, 1, output_w, any, any, any, any,
-                            filter_w, filter_h);
+    return Conv2dBufferSizeInt8(target, any, any, 1, 1, input_c, 1, output_w, any, any, any, any,
+                                filter_w, filter_h);
   };
 
   ASSERT_EQ(conv2d(kNoExt, 4), calculated_buffer);
@@ -137,13 +137,39 @@ TEST(CMSISNNConv2dBufferSize, Default) {
   ASSERT_EQ(conv2d(kHasMVE, 32), calculated_buffer_mve);
 }
 
-TEST(CMSISNNDepthwiseConv2dBufferSize, UnEvenChannels) {
+TEST(CMSISNNConv2dBufferSizeInt16, Default) {
+  int32_t any = fake_parameters(gen);
+
+  auto conv2d_int16_buffer = [=](Target target, int32_t input_c, int32_t filter_w,
+                                 int32_t filter_h) {
+    return Conv2dBufferSizeInt16(target, any, any, 1, 1, input_c, any, any, any, any, 1, 1,
+                                 filter_w, filter_h);
+  };
+
+  auto calculated_buffer = [=](int32_t input_c, int32_t filter_w, int32_t filter_h) {
+    return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
+  };
+
+  ASSERT_EQ(conv2d_int16_buffer(kNoExt, 3, 5, 5), 0);
+  ASSERT_EQ(conv2d_int16_buffer(kNoExt, 32, 3, 3), 0);
+
+  ASSERT_EQ(conv2d_int16_buffer(kHasDSP, 3, 3, 3), calculated_buffer(3, 3, 3));
+  ASSERT_EQ(conv2d_int16_buffer(kHasDSP, 12, 5, 5), calculated_buffer(12, 5, 5));
+  ASSERT_EQ(conv2d_int16_buffer(kHasDSP, 24, 5, 5), 0);
+
+  ASSERT_EQ(conv2d_int16_buffer(kHasMVE, 3, 3, 3), 0);
+  ASSERT_EQ(conv2d_int16_buffer(kHasMVE, 12, 5, 5), 0);
+  ASSERT_EQ(conv2d_int16_buffer(kHasMVE, 24, 5, 5), 0);
+}
+
+TEST(CMSISNNDepthwiseConv2dBufferSizeInt8, UnEvenChannels) {
   int32_t filter_w = fake_parameters(gen);
   int32_t filter_h = fake_parameters(gen);
   int32_t input_n = 1;
 
   auto depthwise_conv2d_with_channels = [=](Target target, int32_t input_c, int32_t output_c) {
-    return DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w, filter_h, 1, 1);
+    return DepthwiseConv2dBufferSizeInt8(target, input_n, input_c, output_c, filter_w, filter_h, 1,
+                                         1, 1);
   };
 
   ASSERT_EQ(depthwise_conv2d_with_channels(kNoExt, 4, 6), 0);
@@ -154,14 +180,14 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, UnEvenChannels) {
   ASSERT_EQ(depthwise_conv2d_with_channels(kHasMVE, 8, 7), 0);
 }
 
-TEST(CMSISNNDepthwiseConv2dBufferSize, MultipleBatches) {
+TEST(CMSISNNDepthwiseConv2dBufferSizeInt8, MultipleBatches) {
   int32_t input_output_c = fake_parameters(gen);
   int32_t filter_w = fake_parameters(gen);
   int32_t filter_h = fake_parameters(gen);
 
   auto depthwise_conv2d_with_batch = [=](Target target, int32_t input_n) {
-    return DepthwiseConv2dBufferSize(target, input_n, input_output_c, input_output_c, filter_w,
-                                     filter_h, 1, 1);
+    return DepthwiseConv2dBufferSizeInt8(target, input_n, input_output_c, input_output_c, filter_w,
+                                         filter_h, 1, 1, 1);
   };
 
   ASSERT_EQ(depthwise_conv2d_with_batch(kNoExt, 4), 0);
@@ -172,7 +198,7 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, MultipleBatches) {
   ASSERT_EQ(depthwise_conv2d_with_batch(kHasMVE, 7), 0);
 }
 
-TEST(CMSISNNDepthwiseConv2dBufferSize, Default) {
+TEST(CMSISNNDepthwiseConv2dBufferSizeInt8, Default) {
   int32_t input_output_c = fake_parameters(gen);
   int32_t filter_w = fake_parameters(gen);
   int32_t filter_h = fake_parameters(gen);
@@ -183,8 +209,8 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, Default) {
   int32_t dsp_calculated_buffer = (input_output_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
 
   auto depthwise_conv2d = [=](Target target) {
-    return DepthwiseConv2dBufferSize(target, input_n, input_output_c, input_output_c, filter_w,
-                                     filter_h, 1, 1);
+    return DepthwiseConv2dBufferSizeInt8(target, input_n, input_output_c, input_output_c, filter_w,
+                                         filter_h, 1, 1, 1);
   };
 
   ASSERT_EQ(depthwise_conv2d(kNoExt), 0);
@@ -195,6 +221,34 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, Default) {
   ASSERT_EQ(depthwise_conv2d(kHasMVE), mve_calculated_buffer);
 }
 
+TEST(CMSISNNDepthwiseConv2dBufferSizeInt16, Default) {
+  int32_t any = fake_parameters(gen);
+
+  auto depthwise_int16_buffer = [=](Target target, int32_t input_c, int32_t filter_w,
+                                    int32_t filter_h) {
+    return DepthwiseConv2dBufferSizeInt16(target, any, input_c, any, filter_w, filter_h, 1, 1, 1);
+  };
+
+  auto dsp_only_buffer = [=](int32_t input_c, int32_t filter_w, int32_t filter_h) {
+    return (input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
+  };
+
+  auto dsp_mve_buffer = [=](int32_t input_c, int32_t filter_w, int32_t filter_h) {
+    return (4 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t) + 8;
+  };
+
+  ASSERT_EQ(depthwise_int16_buffer(kNoExt, 3, 5, 5), 0);
+  ASSERT_EQ(depthwise_int16_buffer(kNoExt, 32, 3, 3), 0);
+
+  ASSERT_EQ(depthwise_int16_buffer(kHasDSP, 3, 3, 3), dsp_only_buffer(3, 3, 3));
+  ASSERT_EQ(depthwise_int16_buffer(kHasDSP, 12, 5, 5), dsp_only_buffer(12, 5, 5));
+  ASSERT_EQ(depthwise_int16_buffer(kHasDSP, 24, 5, 5), 0);
+
+  ASSERT_EQ(depthwise_int16_buffer(kHasMVE, 3, 3, 3), dsp_mve_buffer(3, 3, 3));
+  ASSERT_EQ(depthwise_int16_buffer(kHasMVE, 12, 5, 5), dsp_mve_buffer(12, 5, 5));
+  ASSERT_EQ(depthwise_int16_buffer(kHasMVE, 24, 5, 5), 0);
+}
+
 TEST(CMSISNNAvgPoolBufferSize, Default) {
   int32_t input_c = fake_parameters(gen);
   int32_t calculated_buffer = (input_c * sizeof(int32_t));
diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py
index d33d71261613..66ff5d793880 100644
--- a/tests/python/contrib/test_cmsisnn/test_conv2d.py
+++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py
@@ -36,6 +36,7 @@
     get_range_for_dtype_str,
     get_same_padding,
     get_conv2d_qnn_params,
+    get_kernel_bias_dtype,
     make_qnn_relu,
     assert_partitioned_function,
     assert_no_external_function,
@@ -59,8 +60,9 @@ def make_model(
     groups,
     dtype,
     kernel_dtype,
+    bias_dtype,
     out_channels,
-    weight_format,
+    kernel_layout,
     enable_bias,
     relu_type,
     input_op=None,
@@ -71,8 +73,8 @@ def make_model(
     else:
         op = relay.var("input", shape=shape, dtype=dtype)
 
-    h_index = weight_format.index("H")
-    w_index = weight_format.index("W")
+    h_index = kernel_layout.index("H")
+    w_index = kernel_layout.index("W")
     kernel_h = kernel_shape[h_index]
     kernel_w = kernel_shape[w_index]
     p = (0, 0, 0, 0)
@@ -80,7 +82,7 @@ def make_model(
         p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
 
     rng = np.random.default_rng(12321)
-    weight = tvm.nd.array(
+    kernel = tvm.nd.array(
         rng.integers(
             np.iinfo(kernel_dtype).min,
             high=np.iinfo(kernel_dtype).max,
@@ -88,27 +90,27 @@ def make_model(
             dtype=kernel_dtype,
         )
     )
-    weight_const = relay.const(weight, kernel_dtype)
+    kernel_const = relay.const(kernel, kernel_dtype)
     conv2d_kernel_sc = kernel_scale[0] if out_channels == 1 else kernel_scale
     conv = relay.qnn.op.conv2d(
         op,
-        weight_const,
+        kernel_const,
         input_zero_point=relay.const(input_zero_point, "int32"),
         kernel_zero_point=relay.const(kernel_zero_point, "int32"),
         input_scale=relay.const(input_scale, "float32"),
         kernel_scale=relay.const(conv2d_kernel_sc, "float32"),
         kernel_size=(kernel_h, kernel_w),
         data_layout="NHWC",
-        kernel_layout=weight_format,
+        kernel_layout=kernel_layout,
         dilation=dilation,
         strides=strides,
         groups=groups,
         channels=out_channels,
         padding=p,
-        out_dtype="int32",
+        out_dtype=bias_dtype,
     )
-    bias = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype="int32"))
-    bias_const = relay.const(bias, "int32")
+    bias = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype=bias_dtype))
+    bias_const = relay.const(bias, bias_dtype)
     last_op = relay.nn.bias_add(conv, bias_const, axis=3) if enable_bias else conv
     requant_input_sc = [sc * input_scale for sc in kernel_scale]
     requant_input_sc = requant_input_sc[0] if out_channels == 1 else requant_input_sc
@@ -121,7 +123,7 @@ def make_model(
         out_dtype=dtype,
     )
     last_op = make_qnn_relu(last_op, relu_type, output_scale, output_zero_point, dtype)
-    params = {"w": weight, "b": bias}
+    params = {"w": kernel, "b": bias}
     return last_op, params
 
 
@@ -150,7 +152,7 @@ def test_conv2d_number_primfunc_args(
     dilation = (1, 1)
     dtype = "int8"
     groups = 1
-    weight_format = "HWIO"
+    kernel_layout = "HWIO"
     kernel_h = kernel_size[0]
     kernel_w = kernel_size[1]
     kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
@@ -158,6 +160,8 @@ def test_conv2d_number_primfunc_args(
     in_min, in_max = get_range_for_dtype_str(dtype)
     relu_type = "RELU"
 
+    kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype)
+
     output_scale, output_zero_point = get_conv2d_qnn_params(
         kernel_shape,
         input_scale,
@@ -165,7 +169,7 @@ def test_conv2d_number_primfunc_args(
         kernel_scale,
         kernel_zero_point,
         input_dtype=dtype,
-        weights_dtype=dtype,
+        kernel_dtype=kernel_dtype,
         output_dtype=dtype,
     )
 
@@ -183,9 +187,10 @@ def test_conv2d_number_primfunc_args(
         dilation,
         groups,
         dtype,
-        dtype,
+        kernel_dtype,
+        bias_dtype,
         out_channels,
-        weight_format,
+        kernel_layout,
         enable_bias,
         relu_type,
     )
@@ -220,6 +225,7 @@ def test_conv2d_number_primfunc_args(
 
 
 @tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize("dtype", ["int8", "int16"])
 @pytest.mark.parametrize("padding", ["SAME", "VALID"])
 @pytest.mark.parametrize("relu_type", ["RELU"])
 @pytest.mark.parametrize("enable_bias", [True, False])
@@ -230,7 +236,8 @@ def test_conv2d_number_primfunc_args(
 @pytest.mark.parametrize(
     "compiler_cpu, cpu_flags", [("cortex-m55", "+nomve"), ("cortex-m55", ""), ("cortex-m7", "")]
 )
-def test_conv2d_symmetric_padding_int8(
+def test_conv2d_symmetric_padding(
+    dtype,
     padding,
     enable_bias,
     relu_type,
@@ -249,15 +256,18 @@ def test_conv2d_symmetric_padding_int8(
     kernel_size = (3, 3)
     strides = (1, 1)
     dilation = (1, 1)
-    dtype = "int8"
     groups = 1
-    weight_format = "HWIO"
+    # input_zero_point is not handled by TFLM when int16
+    input_zero_point = input_zero_point if dtype == "int8" else 0
+    kernel_layout = "HWIO"
     kernel_h = kernel_size[0]
     kernel_w = kernel_size[1]
     kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
     kernel_zero_point = 0
     in_min, in_max = get_range_for_dtype_str(dtype)
 
+    kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype)
+
     output_scale, output_zero_point = get_conv2d_qnn_params(
         kernel_shape,
         input_scale,
@@ -265,7 +275,7 @@ def test_conv2d_symmetric_padding_int8(
         kernel_scale,
         kernel_zero_point,
         input_dtype=dtype,
-        weights_dtype=dtype,
+        kernel_dtype=kernel_dtype,
         output_dtype=dtype,
     )
 
@@ -283,9 +293,10 @@ def test_conv2d_symmetric_padding_int8(
         dilation,
         groups,
         dtype,
-        dtype,
+        kernel_dtype,
+        bias_dtype,
         out_channels,
-        weight_format,
+        kernel_layout,
         enable_bias,
         relu_type,
     )
@@ -321,7 +332,7 @@ def test_conv2d_symmetric_padding_int8(
     "input_zero_point, input_scale, kernel_scale, out_channels",
     [(10, 0.0128, [0.11, 0.22], 2), (-64, 1, [1, 0.0256, 1.37], 3)],
 )
-def test_conv2d_asymmetric_padding_int8(
+def test_conv2d_asymmetric_padding(
     padding,
     enable_bias,
     relu_type,
@@ -335,19 +346,22 @@ def test_conv2d_asymmetric_padding_int8(
     use_unpacked_api = True
     test_runner = AOT_USMP_CORSTONE300_RUNNER
 
+    dtype = "int8"
     ifm_shape = (1, 25, 25, 12)
     kernel_size = (5, 5)
     strides = (2, 2)
     dilation = (1, 1)
-    dtype = "int8"
     groups = 1
-    weight_format = "HWIO"
+    input_zero_point = input_zero_point if dtype == "int8" else 0
+    kernel_layout = "HWIO"
     kernel_h = kernel_size[0]
     kernel_w = kernel_size[1]
     kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
     kernel_zero_point = 0
     in_min, in_max = get_range_for_dtype_str(dtype)
 
+    kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype)
+
     output_scale, output_zero_point = get_conv2d_qnn_params(
         kernel_shape,
         input_scale,
@@ -355,7 +369,7 @@ def test_conv2d_asymmetric_padding_int8(
         kernel_scale,
         kernel_zero_point,
         input_dtype=dtype,
-        weights_dtype=dtype,
+        kernel_dtype=kernel_dtype,
         output_dtype=dtype,
     )
 
@@ -373,9 +387,10 @@ def test_conv2d_asymmetric_padding_int8(
         dilation,
         groups,
         dtype,
-        dtype,
+        kernel_dtype,
+        bias_dtype,
         out_channels,
-        weight_format,
+        kernel_layout,
         enable_bias,
         relu_type,
     )
@@ -434,13 +449,14 @@ def test_pad_conv2d_fusion_int8(
     kernel_scale = [0.11, 0.22]
     out_channels = 2
     groups = 1
-    weight_format = "HWIO"
+    kernel_layout = "HWIO"
     kernel_h = kernel_size[0]
     kernel_w = kernel_size[1]
     kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
     kernel_zero_point = 0
     in_min, in_max = get_range_for_dtype_str(dtype)
 
+    kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype)
     output_scale, output_zero_point = get_conv2d_qnn_params(
         kernel_shape,
         input_scale,
@@ -448,7 +464,7 @@ def test_pad_conv2d_fusion_int8(
         kernel_scale,
         kernel_zero_point,
         input_dtype=dtype,
-        weights_dtype=dtype,
+        kernel_dtype=kernel_dtype,
         output_dtype=dtype,
     )
 
@@ -474,9 +490,10 @@ def test_pad_conv2d_fusion_int8(
         dilation,
         groups,
         dtype,
-        dtype,
+        kernel_dtype,
+        bias_dtype,
         out_channels,
-        weight_format,
+        kernel_layout,
         enable_bias,
         relu_type,
         input_op=pad,
@@ -545,13 +562,15 @@ def test_invalid_pad_conv2d_fusion_int8(
     kernel_scale = [0.11, 0.22]
     out_channels = 2
     groups = 1
-    weight_format = "HWIO"
+    kernel_layout = "HWIO"
     kernel_h = kernel_size[0]
     kernel_w = kernel_size[1]
     kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
     kernel_zero_point = 0
     in_min, in_max = get_range_for_dtype_str(dtype)
 
+    kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype)
+
     output_scale, output_zero_point = get_conv2d_qnn_params(
         kernel_shape,
         input_scale,
@@ -559,7 +578,7 @@ def test_invalid_pad_conv2d_fusion_int8(
         kernel_scale,
         kernel_zero_point,
         input_dtype=dtype,
-        weights_dtype=dtype,
+        kernel_dtype=kernel_dtype,
         output_dtype=dtype,
     )
 
@@ -585,9 +604,10 @@ def test_invalid_pad_conv2d_fusion_int8(
         dilation,
         groups,
         dtype,
-        dtype,
+        kernel_dtype,
+        bias_dtype,
         out_channels,
-        weight_format,
+        kernel_layout,
         enable_bias,
         relu_type,
         input_op=pad,
@@ -675,6 +695,7 @@ def test_conv2d_int8_tflite(ifm_shape, kernel_shape, strides, dilation, padding,
 
 
 @tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize("dtype", ["int8", "int16"])
 @pytest.mark.parametrize("ifm_shape", [(1, 28, 28, 12), (1, 64, 100, 4)])
 @pytest.mark.parametrize("kernel_size", [(3, 3)])
 @pytest.mark.parametrize("padding", ["SAME", "VALID"])
@@ -691,7 +712,8 @@ def test_conv2d_int8_tflite(ifm_shape, kernel_shape, strides, dilation, padding,
 @pytest.mark.parametrize(
     "compiler_cpu, cpu_flags", [("cortex-m55", "+nomve"), ("cortex-m55", ""), ("cortex-m7", "")]
 )
-def test_depthwise_int8(
+def test_depthwise(
+    dtype,
     ifm_shape,
     kernel_size,
     padding,
@@ -711,9 +733,9 @@ def test_depthwise_int8(
     interface_api = "c"
     use_unpacked_api = True
 
-    dtype = "int8"
     groups = 1
-    weight_format = "HWIO"
+    input_zero_point = input_zero_point if dtype == "int8" else 0
+    kernel_layout = "HWIO"
     kernel_h = kernel_size[0]
     kernel_w = kernel_size[1]
     kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
@@ -721,12 +743,14 @@ def test_depthwise_int8(
     in_min, in_max = get_range_for_dtype_str(dtype)
 
     groups = ifm_shape[3]
-    weight_format = "HWOI"
+    kernel_layout = "HWOI"
     kernel_shape = (kernel_h, kernel_w, ifm_shape[3], depth_multiplier)
     out_channels = ifm_shape[3] * depth_multiplier
     ks_len = len(kernel_scale)
     kernel_scale = [kernel_scale[i % ks_len] for i in range(out_channels)]
 
+    kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype)
+
     output_scale, output_zero_point = get_conv2d_qnn_params(
         kernel_shape,
         input_scale,
@@ -734,7 +758,7 @@ def test_depthwise_int8(
         kernel_scale,
         kernel_zero_point,
         input_dtype=dtype,
-        weights_dtype=dtype,
+        kernel_dtype=kernel_dtype,
         output_dtype=dtype,
         is_depthwise=True,
     )
@@ -753,9 +777,10 @@ def test_depthwise_int8(
         dilation,
         groups,
         dtype,
-        dtype,
+        kernel_dtype,
+        bias_dtype,
         out_channels,
-        weight_format,
+        kernel_layout,
         enable_bias,
         relu_type,
     )
@@ -823,7 +848,8 @@ def test_relay_conv2d_cmsisnn_depthwise_int8(
 
     ifm_shape = (1, 24, 24, 1)
     groups = ifm_shape[3]
-    weight_format = "HWIO"
+    input_zero_point = input_zero_point if dtype == "int8" else 0
+    kernel_layout = "HWIO"
     (kernel_h, kernel_w) = (3, 3)
     kernel_shape = (kernel_h, kernel_w, ifm_shape[3], depth_multiplier)
     out_channels = ifm_shape[3] * depth_multiplier
@@ -832,6 +858,8 @@ def test_relay_conv2d_cmsisnn_depthwise_int8(
     kernel_zero_point = 0
     kernel_scale = [kernel_scale[i % ks_len] for i in range(out_channels)]
 
+    kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype)
+
     output_scale, output_zero_point = get_conv2d_qnn_params(
         kernel_shape,
         input_scale,
@@ -839,7 +867,7 @@ def test_relay_conv2d_cmsisnn_depthwise_int8(
         kernel_scale,
         kernel_zero_point,
         input_dtype=dtype,
-        weights_dtype=dtype,
+        kernel_dtype=kernel_dtype,
         output_dtype=dtype,
         is_depthwise=True,
     )
@@ -858,9 +886,10 @@ def test_relay_conv2d_cmsisnn_depthwise_int8(
         dilation,
         groups,
         dtype,
-        dtype,
+        kernel_dtype,
+        bias_dtype,
         out_channels,
-        weight_format,
+        kernel_layout,
         enable_bias,
         relu_type,
     )
@@ -914,19 +943,24 @@ def test_relay_conv2d_cmsisnn_depthwise_int8(
 
 
 def parameterize_for_invalid_model(test):
-    """Generates non int8 inputs"""
-    in_dtype = ["uint8", "int8"]
+    """Generates non-int8 non-int16 inputs"""
+    in_dtype = ["uint8", "int8", "int16"]
     kernel_dtype = ["uint8", "int8"]
     kernel_zero_point = [-33, 10, 0]
-    all_combinations = itertools.product(in_dtype, kernel_dtype, kernel_zero_point)
+    input_zero_point = [64, 0]
+    all_combinations = itertools.product(
+        in_dtype, kernel_dtype, kernel_zero_point, input_zero_point
+    )
     all_combinations = filter(
         lambda parameters: not (
-            parameters[0] == "int8" and parameters[1] == "int8" and parameters[2] == 0
+            (parameters[0] == "int8" or (parameters[0] == "int16" and parameters[3] == 0))
+            and parameters[1] == "int8"
+            and parameters[2] == 0
         ),
         all_combinations,
     )
     return pytest.mark.parametrize(
-        ["in_dtype", "kernel_dtype", "kernel_zero_point"],
+        ["in_dtype", "kernel_dtype", "kernel_zero_point", "input_zero_point"],
         all_combinations,
     )(test)
 
@@ -937,16 +971,17 @@ def test_invalid_parameters(
     in_dtype,
     kernel_dtype,
     kernel_zero_point,
+    input_zero_point,
 ):
     """Tests Depthwise op for non int8 inputs"""
     ifm_shape = (1, 28, 28, 12)
     out_channels = 2
     input_scale = 1
-    input_zero_point = 24
     kernel_scale = [0.11, 0.0237]
 
     kernel_layout = "HWIO"
     kernel_shape = [3, 3, ifm_shape[3], out_channels]
+    _, bias_dtype = get_kernel_bias_dtype(in_dtype)
     output_scale, output_zero_point = get_conv2d_qnn_params(
         kernel_shape,
         input_scale,
@@ -973,8 +1008,9 @@ def test_invalid_parameters(
         groups=1,
         dtype=in_dtype,
         kernel_dtype=kernel_dtype,
+        bias_dtype=bias_dtype,
         out_channels=out_channels,
-        weight_format=kernel_layout,
+        kernel_layout=kernel_layout,
         enable_bias=True,
         relu_type="NONE",
     )
diff --git a/tests/python/contrib/test_cmsisnn/utils.py b/tests/python/contrib/test_cmsisnn/utils.py
index 9fdb89289aff..f3a6b0c1343b 100644
--- a/tests/python/contrib/test_cmsisnn/utils.py
+++ b/tests/python/contrib/test_cmsisnn/utils.py
@@ -137,39 +137,52 @@ def get_same_padding(in_shape, kernel, dilation, stride):
     return [pad_top, pad_left, pad_bottom, pad_right]
 
 
+def get_kernel_bias_dtype(input_dtype):
+    """
+    Returns (kernel_dtype, bias_dtype) based on input's dtype.
+    """
+    # uint8 corresponds to an invalid case, so returning int types
+    # does not cause tests to break
+    if input_dtype in ("int8", "uint8"):
+        return ("int8", "int32")
+    elif input_dtype == "int16":
+        return ("int8", "int64")
+    raise ValueError("Invalid dtype provided to get_kernel_bias_dtype()")
+
+
 def get_conv2d_qnn_params(
-    weight_shape: List[int],
+    kernel_shape: List[int],
     input_scale: float,
     input_zp: int,
-    weights_scale: Union[float, List[float]],
-    weights_zp: int,
+    kernel_scale: Union[float, List[float]],
+    kernel_zp: int,
     input_dtype: str = "int8",
-    weights_dtype: str = "int8",
+    kernel_dtype: str = "int8",
     output_dtype: str = "int8",
     is_depthwise: bool = False,
 ) -> Tuple[float, int]:
     """
     Calculate the output quantization parameters for convolution based on the input and
-    weights quantization paramters and the data types.
+    kernel quantization paramters and the data types.
 
     Parameters
     ----------
-    weight_shape : List[int]
-        shape of the weights
+    kernel_shape : List[int]
+        shape of the kernel
     input_scale : float
         scale of the input tensor
     input_zp : int
         zero point of the input tensor
-    weights_scale : Union[float, List[float]]
-        scale(s) of the weights tensor
-    weights_zp : int
-        zero point of the weights tensor
+    kernel_scale : Union[float, List[float]]
+        scale(s) of the kernel tensor
+    kernel_zp : int
+        zero point of the kernel tensor
     is_depthwise : bool
         whether it is a depthwise convolution
     input_dtype : str
         data type of the input tensor
-    weights_dtype : str
-        data type of the weights tensor
+    kernel_dtype : str
+        data type of the kernel tensor
     output_dtype : str
         data type of the output tensor
 
@@ -184,27 +197,27 @@ def get_conv2d_qnn_params(
     input_max = input_scale * (input_dtype_max - input_zp)
     input_min = input_scale * (input_dtype_min - input_zp)
 
-    weights_dtype_min, weights_dtype_max = get_range_for_dtype_str(weights_dtype)
-    weights_sc_max = np.max(weights_scale)
-    weights_max = weights_sc_max * (weights_dtype_max - weights_zp)
+    kernel_dtype_min, kernel_dtype_max = get_range_for_dtype_str(kernel_dtype)
+    kernel_sc_max = np.max(kernel_scale)
+    kernel_max = kernel_sc_max * (kernel_dtype_max - kernel_zp)
 
-    weights_sc_min = np.min(weights_scale)
-    weights_min = weights_sc_min * (weights_dtype_min - weights_zp)
+    kernel_sc_min = np.min(kernel_scale)
+    kernel_min = kernel_sc_min * (kernel_dtype_min - kernel_zp)
 
-    weights_h = weight_shape[1]
-    weights_w = weight_shape[2]
-    channels = weight_shape[3]
-    num_elements = weights_h * weights_w * channels
+    kernel_h = kernel_shape[1]
+    kernel_w = kernel_shape[2]
+    channels = kernel_shape[3]
+    num_elements = kernel_h * kernel_w * channels
     # Adjust the result if it is a depthwise convolution
     if is_depthwise:
         num_elements = num_elements / channels
 
     # The smallest and largest possible values in the unquantized output tensor
     output_limits = [
-        weights_max * input_max * num_elements,
-        weights_min * input_max * num_elements,
-        weights_min * input_min * num_elements,
-        weights_max * input_min * num_elements,
+        kernel_max * input_max * num_elements,
+        kernel_min * input_max * num_elements,
+        kernel_min * input_min * num_elements,
+        kernel_max * input_min * num_elements,
     ]
 
     output_max = max(output_limits)

From 1ea1a0bc888b9b4c8d8978c04a9af79b0ba85ee9 Mon Sep 17 00:00:00 2001
From: Adam Straw <astraw@octoml.ai>
Date: Tue, 4 Oct 2022 08:15:27 -0700
Subject: [PATCH 296/704] [Hexagon] 3-stage pipeline; multi queue async DMA for
 cache read / write (#12954)

* [Hexagon] 3-stage pipeline; multi queue async DMA for cache rd / wr

* add cache_write (no cache_read) schedule to python test
---
 src/runtime/hexagon/hexagon_device_api.cc     |   6 +-
 src/runtime/hexagon/hexagon_user_dma.cc       |  24 ++--
 src/runtime/hexagon/hexagon_user_dma.h        |  15 +--
 src/runtime/hexagon/ring_buffer.h             |  41 ++++++-
 .../hexagon/hexagon_user_dma_tests.cc         | 104 +++++++++++++-----
 .../cpp-runtime/hexagon/ring_buffer_tests.cc  |  30 ++++-
 .../test_software_pipeline_async.py           |  43 ++++++--
 7 files changed, 196 insertions(+), 67 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index db3c847a55e8..7c251721b749 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -208,7 +208,6 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM
 
 TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy").set_body([](TVMArgs args, TVMRetValue* rv) {
   int queue_id = args[0];
-  ICHECK(queue_id == 0 && "Hexagon supports just a single asynchronous queue for DMA");
   void* dst = args[1];
   void* src = args[2];
   int size = args[3];
@@ -216,17 +215,16 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy").set_body([](TVMArgs args, TVM
 
   int ret = DMA_RETRY;
   do {
-    ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(dst, src, size);
+    ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(queue_id, dst, src, size);
   } while (ret == DMA_RETRY);
   *rv = static_cast<int32_t>(ret);
 });
 
 TVM_REGISTER_GLOBAL("device_api.hexagon.dma_wait").set_body([](TVMArgs args, TVMRetValue* rv) {
   int queue_id = args[0];
-  ICHECK(queue_id == 0 && "Hexagon supports just a single asynchronous queue for DMA");
   int inflight = args[1];
   ICHECK(inflight >= 0);
-  HexagonDeviceAPI::Global()->UserDMA()->Wait(inflight);
+  HexagonDeviceAPI::Global()->UserDMA()->Wait(queue_id, inflight);
   *rv = static_cast<int32_t>(0);
 });
 
diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc
index ab464c150319..619338e39688 100644
--- a/src/runtime/hexagon/hexagon_user_dma.cc
+++ b/src/runtime/hexagon/hexagon_user_dma.cc
@@ -32,7 +32,7 @@ unsigned int HexagonUserDMA::Init() {
   return status;
 }
 
-int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) {
+int HexagonUserDMA::Copy(int queue_id, void* dst, void* src, uint32_t length) {
   // length limited to 24 bits
   if (length > DESC_LENGTH_MASK) {
     return DMA_FAILURE;
@@ -54,7 +54,7 @@ int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) {
   uint32_t dst32 = static_cast<uint32_t>(dst64);
 
   // get pointer to next descriptor
-  dma_desc_2d_t* dma_desc = descriptors_->Next();
+  dma_desc_2d_t* dma_desc = descriptors_->Next(queue_id);
   if (!dma_desc) {
     return DMA_RETRY;
   }
@@ -87,17 +87,17 @@ int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) {
   return DMA_SUCCESS;
 }
 
-void HexagonUserDMA::Wait(uint32_t max_dmas_in_flight) {
+void HexagonUserDMA::Wait(int queue_id, uint32_t max_dmas_in_flight) {
   // wait (forever) until max DMAs in flight <= actual DMAs in flight
-  while (DMAsInFlight() > max_dmas_in_flight) {
+  while (DMAsInFlight(queue_id) > max_dmas_in_flight) {
   }
 }
 
-uint32_t HexagonUserDMA::Poll() { return DMAsInFlight(); }
+uint32_t HexagonUserDMA::Poll(int queue_id) { return DMAsInFlight(queue_id); }
 
-uint32_t HexagonUserDMA::DMAsInFlight() {
+uint32_t HexagonUserDMA::DMAsInFlight(int queue_id) {
   dmpoll();  // update DMA engine status
-  return descriptors_->InFlight();
+  return descriptors_->InFlight(queue_id);
 }
 
 HexagonUserDMA::HexagonUserDMA() {
@@ -109,7 +109,7 @@ HexagonUserDMA::HexagonUserDMA() {
     unsigned int done = dma_desc_get_done(dma_desc);
     return (done != DESC_DONE_COMPLETE);
   };
-  descriptors_ = new RingBuffer<dma_desc_2d_t>(MAX_DMA_DESCRIPTORS, desc_in_flight);
+  descriptors_ = new QueuedRingBuffer<dma_desc_2d_t>(MAX_DMA_DESCRIPTORS, desc_in_flight);
 }
 
 HexagonUserDMA::~HexagonUserDMA() {
@@ -124,9 +124,9 @@ int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) {
   // Make the common case quick.
   if (length <= DESC_LENGTH_MASK) {
     // sync DMA -> `Copy` and then `Wait(0)`
-    int ret_val = user_dma->Copy(dst, src, length);
+    int ret_val = user_dma->Copy(SYNC_DMA_QUEUE, dst, src, length);
     if (ret_val != DMA_SUCCESS) return ret_val;
-    user_dma->Wait(0);
+    user_dma->Wait(SYNC_DMA_QUEUE, 0);
     return DMA_SUCCESS;
   }
 
@@ -137,9 +137,9 @@ int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) {
     // Ensure there is no overflow while updating i
     uint32_t cur_len = std::min<uint32_t>(length - i, DESC_LENGTH_MASK);
     // sync DMA -> `Copy` and then `Wait(0)`
-    int ret_val = user_dma->Copy(&cast_dst[i], &cast_src[i], cur_len);
+    int ret_val = user_dma->Copy(SYNC_DMA_QUEUE, &cast_dst[i], &cast_src[i], cur_len);
     if (ret_val != DMA_SUCCESS) return ret_val;
-    user_dma->Wait(0);
+    user_dma->Wait(SYNC_DMA_QUEUE, 0);
     // 2 cases for new val for i:
     // 1. length - i <= DESC_LENGTH_MASK (<= MAX_UINT)
     //    new_i = i + (length - i) = length, no more iter
diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h
index f8838ee2dcc9..01e143d255b4 100644
--- a/src/runtime/hexagon/hexagon_user_dma.h
+++ b/src/runtime/hexagon/hexagon_user_dma.h
@@ -34,6 +34,7 @@ namespace hexagon {
 #define DMA_FAILURE -1
 #define DMA_RETRY 1
 #define MAX_DMA_DESCRIPTORS 100
+#define SYNC_DMA_QUEUE -1
 
 class HexagonUserDMA {
  public:
@@ -51,36 +52,36 @@ class HexagonUserDMA {
    * \param length Length in bytes to copy
    * \returns Status: DMA_SUCCESS or DMA_FAILURE
    */
-  int Copy(void* dst, void* src, uint32_t length);
+  int Copy(int queue_id, void* dst, void* src, uint32_t length);
 
   /*!
    * \brief Wait until the number of DMAs in flight is less than or equal to some maximum
    * \param max_dmas_in_flight Maximum number of DMAs allowed to be in flight
    * to satisfy the `Wait` e.g. use `Wait(0)` to wait on "all" outstanding DMAs to complete
    */
-  void Wait(uint32_t max_dmas_in_flight);
+  void Wait(int queue_id, uint32_t max_dmas_in_flight);
 
   /*!
    * \brief Poll the number of DMAs in flight
    * \returns Number of DMAs in flight
    */
-  uint32_t Poll();
+  uint32_t Poll(int queue_id);
 
  private:
   //! \brief Initializes the Hexagon User DMA engine
   unsigned int Init();
 
   //! \brief Calculates and returns the number of DMAs in flight
-  uint32_t DMAsInFlight();
+  uint32_t DMAsInFlight(int queue_id);
 
   //! \brief Tracks whether the very first DMA has been executed
-  bool first_dma_{true};
+  bool first_dma_ = true;
 
   //! \brief Tracks the tail DMA descriptor
-  void* tail_dma_desc_{nullptr};
+  void* tail_dma_desc_ = nullptr;
 
   //! \brief Storage for all DMA descriptors
-  RingBuffer<dma_desc_2d_t>* descriptors_{nullptr};
+  QueuedRingBuffer<dma_desc_2d_t>* descriptors_ = nullptr;
 };
 
 }  // namespace hexagon
diff --git a/src/runtime/hexagon/ring_buffer.h b/src/runtime/hexagon/ring_buffer.h
index d21b2b9953c2..4294ded8f52a 100644
--- a/src/runtime/hexagon/ring_buffer.h
+++ b/src/runtime/hexagon/ring_buffer.h
@@ -21,6 +21,7 @@
 #define TVM_RUNTIME_HEXAGON_RING_BUFFER_H_
 
 #include <functional>
+#include <vector>
 
 #include "hexagon_common.h"
 
@@ -72,19 +73,51 @@ class RingBuffer {
   }
 
   //! \brief Pointer to the ring buffer
-  T* ring_buff_ptr_{nullptr};
+  T* ring_buff_ptr_ = nullptr;
 
   //! \brief Size of the ring buffer in number of Ts
-  const uint32_t ring_buff_size_;
+  const uint32_t ring_buff_size_ = 0;
 
   //! \brief Function that determines whether a T is in flight
   const std::function<bool(T*)> in_flight_;
 
   //! \brief Tracks the ID of the next T to be added to the ring buffer
-  uint32_t id_next_{0};
+  uint32_t id_next_ = 0;
 
   //! \brief Tracks the ID of the oldest T in flight
-  uint32_t id_oldest_{0};
+  uint32_t id_oldest_ = 0;
+};
+
+//! \brief Separates a single RingBuffer into multiple virtual queues with each queue having a
+//! unique integer ID; queues allow for indepent users of the same RingBuffer while mainting overall
+//! FIFO ordering among all queues
+template <class T>
+class QueuedRingBuffer : RingBuffer<T> {
+ public:
+  QueuedRingBuffer(uint32_t ring_buff_size, std::function<bool(T*)> in_flight)
+      : RingBuffer<T>(ring_buff_size, in_flight) {}
+
+  //! \brief Returns pointer to next T; add the queue ID for tracking
+  T* Next(int queue_id) {
+    queue_ids_.push_back(queue_id);
+    return RingBuffer<T>::Next();
+  }
+
+  //! \brief Returns the number of Ts in flight for a given queue ID
+  uint32_t InFlight(int queue_id) {
+    uint32_t in_flight = 0;
+    // look at the queue IDs for the RingBuffer entries in flight
+    for (size_t i = queue_ids_.size() - RingBuffer<T>::InFlight(); i < queue_ids_.size(); ++i) {
+      // increment return value if in flight queue ID matches
+      if (queue_ids_[i] == queue_id) {
+        in_flight++;
+      }
+    }
+    return in_flight;
+  }
+
+ private:
+  std::vector<int> queue_ids_;
 };
 
 }  // namespace hexagon
diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
index fb46cb3fd976..b76c7c652e6a 100644
--- a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
@@ -46,38 +46,39 @@ class HexagonUserDMATest : public ::testing::Test {
 
  public:
   HexagonUserDMA* user_dma;
-  int ret{0};
-  void* src{nullptr};
-  void* dst{nullptr};
-  char* src_char{nullptr};
-  char* dst_char{nullptr};
-  uint32_t length{0x4000};  // 16KB
+  int ret = 0;
+  int queue_id = 0;
+  void* src = nullptr;
+  void* dst = nullptr;
+  char* src_char = nullptr;
+  char* dst_char = nullptr;
+  uint32_t length = 0x4000;  // 16KB
 };
 
 TEST_F(HexagonUserDMATest, wait) {
-  user_dma->Wait(0);
-  user_dma->Wait(10);
+  user_dma->Wait(queue_id, 0);
+  user_dma->Wait(queue_id, 10);
 }
 
-TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(user_dma->Poll(), 0); }
+TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(user_dma->Poll(queue_id), 0); }
 
 TEST_F(HexagonUserDMATest, bad_copy) {
   uint64_t bigaddr = 0x100000000;
   void* src64 = reinterpret_cast<void*>(bigaddr);
   void* dst64 = reinterpret_cast<void*>(bigaddr);
   uint32_t biglength = 0x1000000;
-  ASSERT_NE(user_dma->Copy(dst64, src, length), DMA_SUCCESS);
-  ASSERT_NE(user_dma->Copy(dst, src64, length), DMA_SUCCESS);
-  ASSERT_NE(user_dma->Copy(dst, src, biglength), DMA_SUCCESS);
+  ASSERT_NE(user_dma->Copy(queue_id, dst64, src, length), DMA_SUCCESS);
+  ASSERT_NE(user_dma->Copy(queue_id, dst, src64, length), DMA_SUCCESS);
+  ASSERT_NE(user_dma->Copy(queue_id, dst, src, biglength), DMA_SUCCESS);
 }
 
 TEST_F(HexagonUserDMATest, sync_dma) {
   // kick off 1 DMA
-  ret = user_dma->Copy(dst, src, length);
+  ret = user_dma->Copy(queue_id, dst, src, length);
   ASSERT_EQ(ret, DMA_SUCCESS);
 
   // wait for DMA to complete
-  user_dma->Wait(0);
+  user_dma->Wait(queue_id, 0);
 
   // verify
   for (uint32_t i = 0; i < length; ++i) {
@@ -88,12 +89,12 @@ TEST_F(HexagonUserDMATest, sync_dma) {
 TEST_F(HexagonUserDMATest, async_dma_wait) {
   // kick off 10x duplicate DMAs
   for (uint32_t i = 0; i < 10; ++i) {
-    ret = user_dma->Copy(dst, src, length);
+    ret = user_dma->Copy(queue_id, dst, src, length);
     ASSERT_EQ(ret, DMA_SUCCESS);
   }
 
   // wait for at least 1 DMA to complete
-  user_dma->Wait(9);
+  user_dma->Wait(queue_id, 9);
 
   // verify
   for (uint32_t i = 0; i < length; ++i) {
@@ -101,18 +102,18 @@ TEST_F(HexagonUserDMATest, async_dma_wait) {
   }
 
   // empty the DMA queue
-  user_dma->Wait(0);
+  user_dma->Wait(queue_id, 0);
 }
 
 TEST_F(HexagonUserDMATest, async_dma_poll) {
   // kick off 10x duplicate DMAs
   for (uint32_t i = 0; i < 10; ++i) {
-    ret = user_dma->Copy(dst, src, length);
+    ret = user_dma->Copy(queue_id, dst, src, length);
     ASSERT_EQ(ret, DMA_SUCCESS);
   }
 
   // poll until at least 1 DMA is complete
-  while (user_dma->Poll() == 10) {
+  while (user_dma->Poll(queue_id) == 10) {
   };
 
   // verify
@@ -121,35 +122,34 @@ TEST_F(HexagonUserDMATest, async_dma_poll) {
   }
 
   // empty the DMA queue
-  user_dma->Wait(0);
+  user_dma->Wait(queue_id, 0);
 }
 
-// TODO: Run non-pipelined case with sync DMA and execution time vs. pipelined case
 TEST_F(HexagonUserDMATest, pipeline) {
   uint32_t pipeline_depth = 4;
   uint32_t pipeline_length = length / pipeline_depth;
 
   for (uint32_t i = 0; i < pipeline_depth; ++i) {
-    ret |= user_dma->Copy(dst_char + i * pipeline_length, src_char + i * pipeline_length,
+    ret |= user_dma->Copy(queue_id, dst_char + i * pipeline_length, src_char + i * pipeline_length,
                           pipeline_length);
   }
 
-  user_dma->Wait(3);
+  user_dma->Wait(queue_id, 3);
   for (uint32_t i = 0; i < pipeline_length; ++i) {
     dst_char[i]++;
   }
 
-  user_dma->Wait(2);
+  user_dma->Wait(queue_id, 2);
   for (uint32_t i = pipeline_length; i < 2 * pipeline_length; ++i) {
     dst_char[i]++;
   }
 
-  user_dma->Wait(1);
+  user_dma->Wait(queue_id, 1);
   for (uint32_t i = 2 * pipeline_length; i < 3 * pipeline_length; ++i) {
     dst_char[i]++;
   }
 
-  user_dma->Wait(0);
+  user_dma->Wait(queue_id, 0);
   for (uint32_t i = 3 * pipeline_length; i < 4 * pipeline_length; ++i) {
     dst_char[i]++;
   }
@@ -161,14 +161,60 @@ TEST_F(HexagonUserDMATest, pipeline) {
   }
 }
 
+TEST_F(HexagonUserDMATest, pipeline_write_queue) {
+  int write_queue = queue_id + 1;
+  uint32_t pipeline_depth = 4;
+  uint32_t pipeline_length = length / pipeline_depth;
+
+  for (uint32_t i = 0; i < pipeline_depth; ++i) {
+    ret |= user_dma->Copy(queue_id, dst_char + i * pipeline_length, src_char + i * pipeline_length,
+                          pipeline_length);
+  }
+
+  user_dma->Wait(queue_id, 3);
+  for (uint32_t i = 0; i < pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+  ret |= user_dma->Copy(write_queue, src_char, dst_char, pipeline_length);
+
+  user_dma->Wait(queue_id, 2);
+  for (uint32_t i = pipeline_length; i < 2 * pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+  ret |= user_dma->Copy(write_queue, src_char + pipeline_length, dst_char + pipeline_length,
+                        pipeline_length);
+
+  user_dma->Wait(queue_id, 1);
+  for (uint32_t i = 2 * pipeline_length; i < 3 * pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+  ret |= user_dma->Copy(write_queue, src_char + 2 * pipeline_length, dst_char + 2 * pipeline_length,
+                        pipeline_length);
+
+  user_dma->Wait(queue_id, 0);
+  for (uint32_t i = 3 * pipeline_length; i < 4 * pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+  ret |= user_dma->Copy(write_queue, src_char + 3 * pipeline_length, dst_char + 3 * pipeline_length,
+                        pipeline_length);
+  user_dma->Wait(write_queue, 0);
+
+  // verify
+  ASSERT_EQ(ret, DMA_SUCCESS);
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(2, dst_char[i]);
+    ASSERT_EQ(2, src_char[i]);
+  }
+}
+
 TEST_F(HexagonUserDMATest, overflow_ring_buffer) {
   uint32_t number_of_dmas = 0x400;  // 1k
   uint32_t length_of_each_dma = length / number_of_dmas;
 
   for (uint32_t i = 0; i < number_of_dmas; ++i) {
     do {
-      ret = user_dma->Copy(dst_char + i * length_of_each_dma, src_char + i * length_of_each_dma,
-                           length_of_each_dma);
+      ret = user_dma->Copy(queue_id, dst_char + i * length_of_each_dma,
+                           src_char + i * length_of_each_dma, length_of_each_dma);
     } while (ret == DMA_RETRY);
     ASSERT_EQ(ret, DMA_SUCCESS);
   }
@@ -177,4 +223,4 @@ TEST_F(HexagonUserDMATest, overflow_ring_buffer) {
   for (uint32_t i = 0; i < length; ++i) {
     ASSERT_EQ(src_char[i], dst_char[i]);
   }
-}
\ No newline at end of file
+}
diff --git a/tests/cpp-runtime/hexagon/ring_buffer_tests.cc b/tests/cpp-runtime/hexagon/ring_buffer_tests.cc
index cd40dca87b02..8cf363bae0b3 100644
--- a/tests/cpp-runtime/hexagon/ring_buffer_tests.cc
+++ b/tests/cpp-runtime/hexagon/ring_buffer_tests.cc
@@ -42,7 +42,7 @@ class RingBufferTest : public ::testing::Test {
   int inflight = 43;
   uint32_t size = 4;
   uint32_t half = size / 2;
-  RingBuffer<int>* ring_buff;
+  RingBuffer<int>* ring_buff = nullptr;
 };
 
 TEST_F(RingBufferTest, zero_size_ring_buffer) {
@@ -188,3 +188,31 @@ TEST_F(RingBufferTest, half_in_flight_blocked) {
   ASSERT_EQ(ring_buff->Next(), nullptr);
   ASSERT_EQ(ring_buff->InFlight(), size);
 }
+
+class QueuedRingBufferTest : public RingBufferTest {
+  void SetUp() override { queued_ring_buff = new QueuedRingBuffer<int>(size, in_flight); }
+  void TearDown() override { delete queued_ring_buff; }
+
+ public:
+  QueuedRingBuffer<int>* queued_ring_buff = nullptr;
+};
+
+TEST_F(QueuedRingBufferTest, two_queues) {
+  int* q0 = queued_ring_buff->Next(0);
+  *q0 = inflight;
+  ASSERT_EQ(queued_ring_buff->InFlight(0), 1);
+  ASSERT_EQ(queued_ring_buff->InFlight(1), 0);
+
+  int* q1 = queued_ring_buff->Next(1);
+  *q1 = inflight;
+  ASSERT_EQ(queued_ring_buff->InFlight(0), 1);
+  ASSERT_EQ(queued_ring_buff->InFlight(1), 1);
+
+  *q0 = finished;
+  ASSERT_EQ(queued_ring_buff->InFlight(0), 0);
+  ASSERT_EQ(queued_ring_buff->InFlight(1), 1);
+
+  *q1 = finished;
+  ASSERT_EQ(queued_ring_buff->InFlight(0), 0);
+  ASSERT_EQ(queued_ring_buff->InFlight(1), 0);
+}
diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
index 6bcca90ec9d3..7a53a1fc9b53 100644
--- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
+++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
@@ -26,8 +26,9 @@
 
 outer = tvm.testing.parameter(8, 16)
 inner = tvm.testing.parameter(64, 128)
-scope = tvm.testing.parameter("global", "global.vtcm")
 dtype = tvm.testing.parameter("uint8", "float16")
+scope = tvm.testing.parameter("global", "global.vtcm")
+sched = tvm.testing.parameter("cache_read", "cache_write", "cache_read_write")
 
 
 @tvm.testing.fixture
@@ -46,18 +47,40 @@ def plus_one_ref(a):
     return plus_one_primfunc, plus_one_ref
 
 
-@tvm.testing.requires_hexagon
-def test_software_pipeline_with_cache_read(hexagon_launcher, compute, outer, inner, dtype, scope):
+@tvm.testing.fixture
+def schedule(compute, sched, scope):
     sch = tir.Schedule(compute[0])
-    root = sch.get_block("root")
-    compute_block = sch.get_block("compute")
-    cache_read_block = sch.cache_read(compute_block, 0, scope)
 
+    compute_block = sch.get_block("compute")
     i, _ = sch.get_loops(compute_block)
-    sch.compute_at(cache_read_block, i)
-    sch.annotate(i, "software_pipeline_stage", [0, 1])
-    sch.annotate(i, "software_pipeline_order", [0, 1])
-    sch.annotate(i, "software_pipeline_async_stages", [0])
+
+    if sched == "cache_read":
+        cache_read_block = sch.cache_read(compute_block, 0, scope)
+        sch.compute_at(cache_read_block, i)
+        sch.annotate(i, "software_pipeline_stage", [0, 1])
+        sch.annotate(i, "software_pipeline_order", [0, 1])
+        sch.annotate(i, "software_pipeline_async_stages", [0])
+    elif sched == "cache_write":
+        cache_write_block = sch.cache_write(compute_block, 0, scope)
+        sch.reverse_compute_at(cache_write_block, i)
+        sch.annotate(i, "software_pipeline_stage", [0, 1])
+        sch.annotate(i, "software_pipeline_order", [0, 1])
+        sch.annotate(i, "software_pipeline_async_stages", [1])
+    elif sched == "cache_read_write":
+        cache_read_block = sch.cache_read(compute_block, 0, scope)
+        sch.compute_at(cache_read_block, i)
+        cache_write_block = sch.cache_write(compute_block, 0, scope)
+        sch.reverse_compute_at(cache_write_block, i)
+        sch.annotate(i, "software_pipeline_stage", [0, 1, 2])
+        sch.annotate(i, "software_pipeline_order", [0, 1, 2])
+        sch.annotate(i, "software_pipeline_async_stages", [0, 2])
+
+    return sch
+
+
+@tvm.testing.requires_hexagon
+def test_async_software_pipeline(hexagon_launcher, compute, schedule, outer, inner, dtype, scope):
+    sch = schedule
 
     a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
     b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)

From 61a7632b556b4920c358288a9f4fffc96c61221c Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Tue, 4 Oct 2022 11:40:00 -0700
Subject: [PATCH 297/704] [microTVM] Use default Project Options in template
 projects and add Makefile for Arduino template project (#12818)

* add default options

* add updat

* add more options

* add makefile for arduino

* fixed flash command

* add arduino-cli flag option

* cleanup

* lint

* lint and test added

* lint

* fix test

* fix flash command

* fix test, add board to makefile

* fix test

* fix flash and build command

* add cmsis support

* address comments

* lint

* address comments, refactor two more options

* change build.extra_flags to be empty when no flag added

* fix test

* address comments

* add test for project options

* update test
---
 .../template_project/Makefile.template        |  64 +++++
 .../template_project/microtvm_api_server.py   | 271 ++++++++++++------
 .../reference-vm/base-box/base_box_test.sh    |   6 +-
 .../template_project/microtvm_api_server.py   |  80 ++----
 cmake/modules/Arduino.cmake                   |   1 +
 .../how_to/work_with_microtvm/micro_aot.py    |   2 +-
 .../work_with_microtvm/micro_autotune.py      |   6 +-
 .../work_with_microtvm/micro_reference_vm.py  |   4 +-
 .../how_to/work_with_microtvm/micro_tflite.py |   2 +-
 .../how_to/work_with_microtvm/micro_train.py  |   2 +-
 python/tvm/micro/project_api/server.py        |  84 ++++++
 python/tvm/micro/testing/evaluation.py        |   4 +-
 tests/lint/check_file_type.py                 |   2 +
 tests/micro/arduino/README.md                 |   4 +-
 .../micro/arduino/test_arduino_rpc_server.py  |   2 +-
 tests/micro/arduino/test_utils.py             |   2 +-
 tests/micro/common/test_tvmc.py               |  14 +-
 .../test_arduino_microtvm_api_server.py       |  26 +-
 tests/micro/project_api/test_project_api.py   |  92 ++++++
 tests/micro/zephyr/README.md                  |   6 +-
 tests/micro/zephyr/test_utils.py              |   2 +-
 tests/micro/zephyr/test_zephyr.py             |   6 +-
 tests/micro/zephyr/test_zephyr_aot_exec.py    |   2 +-
 .../python/unittest/test_micro_project_api.py |  35 +++
 tests/scripts/task_python_microtvm.sh         |   4 +-
 25 files changed, 532 insertions(+), 191 deletions(-)
 create mode 100644 apps/microtvm/arduino/template_project/Makefile.template
 rename {apps/microtvm/arduino/template_project/tests => tests/micro/project_api}/test_arduino_microtvm_api_server.py (89%)
 create mode 100644 tests/micro/project_api/test_project_api.py

diff --git a/apps/microtvm/arduino/template_project/Makefile.template b/apps/microtvm/arduino/template_project/Makefile.template
new file mode 100644
index 000000000000..f067991865bd
--- /dev/null
+++ b/apps/microtvm/arduino/template_project/Makefile.template
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+FQBN			    ?= <FQBN>
+VERBOSE_FLAG	    ?= <VERBOSE_FLAG>
+BUILD_DIR		    := $(subst :,.,build)
+PORT			    ?=
+ARUINO_CLI_CMD	    ?= <ARUINO_CLI_CMD>
+BOARD			    := <BOARD>
+BUILD_EXTRA_FLAGS   := <BUILD_EXTRA_FLAGS>
+
+SRC        	:= $(wildcard *.ino)
+BIN        	:= $(BUILD_DIR)/$(SRC).bin
+ELF        	:= $(BUILD_DIR)/$(SRC).elf
+
+$(info FQBN                 `fully qualified board name` => [${FQBN}])
+$(info BUILD_DIR            `build directory for this project` => [${BUILD_DIR}])
+$(info SRC                  `Arduino .ino file for this project` => [${SRC}])
+$(info BIN                  `generated binary file path` => [${BIN}])
+$(info PORT                 `board's port` => [${PORT}])
+$(info BOARD                `board name` => [${BOARD}])
+$(info BUILD_EXTRA_FLAGS    `build extra flags including header include directories and other compiler flags` => [${BUILD_EXTRA_FLAGS}])
+
+all: $(ELF) flash
+.PHONY: all
+
+build: $(ELF)
+.PHONY: build
+
+$(ELF): $(SRC)
+	$(ARUINO_CLI_CMD) compile --fqbn $(FQBN) \
+    --build-path $(BUILD_DIR) \
+    --build-properties $(BUILD_EXTRA_FLAGS) \
+    $(VERBOSE_FLAG)
+
+flash:
+	if [ -z $(PORT) ] ; then \
+	echo "---> ERROR: Please set the device port environment variable PORT"; \
+	else $(ARUINO_CLI_CMD) upload --fqbn $(FQBN) --input-dir $(BUILD_DIR) --port $(PORT) $(VERBOSE_FLAG); \
+	fi
+
+info:
+	$(info --------------------------------------INFO--------------------------------------)
+	$(info This makefile is for building and flashing an Arduino project with TVM.)
+	$(info To build run: `make build`)
+	$(info To upload the sketch run: `make flash PORT=<Arduino board port path>`)
+	$(info --------------------------------------INFO--------------------------------------)
+
+clean:
+	rm -rf build
diff --git a/apps/microtvm/arduino/template_project/microtvm_api_server.py b/apps/microtvm/arduino/template_project/microtvm_api_server.py
index 46b717fba480..cb0022b3beee 100644
--- a/apps/microtvm/arduino/template_project/microtvm_api_server.py
+++ b/apps/microtvm/arduino/template_project/microtvm_api_server.py
@@ -26,7 +26,6 @@
 import tempfile
 import time
 from string import Template
-
 from packaging import version
 
 from tvm.micro.project_api import server
@@ -46,6 +45,8 @@
 
 ARDUINO_CLI_CMD = shutil.which("arduino-cli")
 
+MAKEFILE_FILENAME = "Makefile"
+
 # Data structure to hold the information microtvm_api_server.py needs
 # to communicate with each of these boards.
 try:
@@ -55,59 +56,43 @@
     raise FileNotFoundError(f"Board file {{{BOARDS}}} does not exist.")
 
 
+def get_cmsis_path(cmsis_path: pathlib.Path) -> pathlib.Path:
+    """Returns CMSIS dependency path"""
+    if cmsis_path:
+        return pathlib.Path(cmsis_path)
+    if os.environ.get("CMSIS_PATH"):
+        return pathlib.Path(os.environ.get("CMSIS_PATH"))
+    assert False, "'cmsis_path' option not passed!"
+
+
 class BoardAutodetectFailed(Exception):
     """Raised when no attached hardware is found matching the requested board"""
 
 
 PROJECT_TYPES = ["example_project", "host_driven"]
 
-PROJECT_OPTIONS = [
-    server.ProjectOption(
-        "arduino_board",
-        required=["build", "flash", "open_transport"],
-        choices=list(BOARD_PROPERTIES),
-        type="str",
-        help="Name of the Arduino board to build for.",
-    ),
+PROJECT_OPTIONS = server.default_project_options(
+    project_type={"choices": tuple(PROJECT_TYPES)},
+    board={"choices": list(BOARD_PROPERTIES), "optional": ["flash", "open_transport"]},
+    warning_as_error={"optional": ["build", "flash"]},
+) + [
     server.ProjectOption(
         "arduino_cli_cmd",
-        required=(
-            ["generate_project", "build", "flash", "open_transport"]
-            if not ARDUINO_CLI_CMD
-            else None
-        ),
+        required=(["generate_project", "flash", "open_transport"] if not ARDUINO_CLI_CMD else None),
         optional=(
             ["generate_project", "build", "flash", "open_transport"] if ARDUINO_CLI_CMD else None
         ),
-        default=ARDUINO_CLI_CMD,
         type="str",
+        default=ARDUINO_CLI_CMD,
         help="Path to the arduino-cli tool.",
     ),
     server.ProjectOption(
         "port",
         optional=["flash", "open_transport"],
         type="int",
+        default=None,
         help="Port to use for connecting to hardware.",
     ),
-    server.ProjectOption(
-        "project_type",
-        required=["generate_project"],
-        choices=tuple(PROJECT_TYPES),
-        type="str",
-        help="Type of project to generate.",
-    ),
-    server.ProjectOption(
-        "verbose",
-        optional=["build", "flash"],
-        type="bool",
-        help="Run arduino-cli compile and upload with verbose output.",
-    ),
-    server.ProjectOption(
-        "warning_as_error",
-        optional=["build", "flash"],
-        type="bool",
-        help="Treat warnings as errors and raise an Exception.",
-    ),
 ]
 
 
@@ -313,7 +298,79 @@ def _find_modified_include_path(self, project_dir, file_path, include_path):
         # It's probably a standard C/C++ header
         return include_path
 
+    CMSIS_INCLUDE_HEADERS = [
+        "arm_nn_math_types.h",
+        "arm_nn_tables.h",
+        "arm_nn_types.h",
+        "arm_nnfunctions.h",
+        "arm_nnsupportfunctions.h",
+    ]
+
+    def _cmsis_required(self, project_path: pathlib.Path) -> bool:
+        """Check if CMSIS dependency is required."""
+        project_path = pathlib.Path(project_path)
+        for path in (project_path / "src" / "model").iterdir():
+            if path.is_file():
+                # Encoding is for reading C generated code which also includes hex numbers
+                with open(path, "r", encoding="ISO-8859-1") as lib_f:
+                    lib_content = lib_f.read()
+                if any(header in lib_content for header in self.CMSIS_INCLUDE_HEADERS):
+                    return True
+        return False
+
+    def _copy_cmsis(self, project_path: pathlib.Path, cmsis_path: str):
+        """Copy CMSIS header files to project.
+        Note: We use this CMSIS package:https://www.arduino.cc/reference/en/libraries/arduino_cmsis-dsp/
+        However, the latest release does not include header files that are copied in this function.
+        """
+        (project_path / "include" / "cmsis").mkdir()
+        cmsis_path = get_cmsis_path(cmsis_path)
+        for item in self.CMSIS_INCLUDE_HEADERS:
+            shutil.copy2(
+                cmsis_path / "CMSIS" / "NN" / "Include" / item,
+                project_path / "include" / "cmsis" / item,
+            )
+
+    def _populate_makefile(
+        self,
+        makefile_template_path: pathlib.Path,
+        makefile_path: pathlib.Path,
+        board: str,
+        verbose: bool,
+        arduino_cli_cmd: str,
+        build_extra_flags: str,
+    ):
+        """Generate Makefile from template."""
+        flags = {
+            "FQBN": self._get_fqbn(board),
+            "VERBOSE_FLAG": "--verbose" if verbose else "",
+            "ARUINO_CLI_CMD": self._get_arduino_cli_cmd(arduino_cli_cmd),
+            "BOARD": board,
+            "BUILD_EXTRA_FLAGS": build_extra_flags,
+        }
+
+        with open(makefile_path, "w") as makefile_f:
+            with open(makefile_template_path, "r") as makefile_template_f:
+                for line in makefile_template_f:
+                    SUBST_TOKEN_RE = re.compile(r"<([A-Z_]+)>")
+                    outs = []
+                    for i, m in enumerate(re.split(SUBST_TOKEN_RE, line)):
+                        if i % 2 == 1:
+                            m = flags[m]
+                        outs.append(m)
+                    line = "".join(outs)
+                    makefile_f.write(line)
+
     def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options):
+        # List all used project options
+        board = options["board"]
+        verbose = options.get("verbose")
+        project_type = options["project_type"]
+        arduino_cli_cmd = options.get("arduino_cli_cmd")
+        cmsis_path = options.get("cmsis_path")
+        compile_definitions = options.get("compile_definitions")
+        extra_files_tar = options.get("extra_files_tar")
+
         # Reference key directories with pathlib
         project_dir = pathlib.Path(project_dir)
         project_dir.mkdir()
@@ -323,11 +380,11 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
         # Copies files from the template folder to project_dir
         shutil.copy2(API_SERVER_DIR / "microtvm_api_server.py", project_dir)
         shutil.copy2(BOARDS, project_dir / BOARDS.name)
-        self._copy_project_files(API_SERVER_DIR, project_dir, options["project_type"])
+        self._copy_project_files(API_SERVER_DIR, project_dir, project_type)
 
         # Copy standalone_crt into src folder
         self._copy_standalone_crt(source_dir, standalone_crt_dir)
-        self._remove_unused_components(source_dir, options["project_type"])
+        self._remove_unused_components(source_dir, project_type)
 
         # Populate crt-config.h
         crt_config_dir = project_dir / "src" / "standalone_crt" / "crt_config"
@@ -341,7 +398,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
         shutil.copy2(model_library_format_path, project_dir / MODEL_LIBRARY_FORMAT_RELPATH)
 
         # For AOT, template model.h with metadata to minimize space usage
-        if options["project_type"] == "example_project":
+        if project_type == "example_project":
             self._template_model_header(source_dir, metadata)
 
         self._change_cpp_file_extensions(source_dir)
@@ -349,8 +406,45 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
         # Recursively change includes
         self._convert_includes(project_dir, source_dir)
 
-    def _get_arduino_cli_cmd(self, options: dict):
-        arduino_cli_cmd = options.get("arduino_cli_cmd", ARDUINO_CLI_CMD)
+        # create include directory
+        (project_dir / "include").mkdir()
+
+        # Populate extra_files
+        if extra_files_tar:
+            with tarfile.open(extra_files_tar, mode="r:*") as tf:
+                tf.extractall(project_dir)
+
+        build_extra_flags = '"build.extra_flags='
+        if extra_files_tar:
+            build_extra_flags += "-I./include "
+
+        if compile_definitions:
+            for item in compile_definitions:
+                build_extra_flags += f"{item} "
+
+        if self._cmsis_required(project_dir):
+            build_extra_flags += f"-I./include/cmsis "
+            self._copy_cmsis(project_dir, cmsis_path)
+
+        build_extra_flags += '"'
+
+        # Check if build_extra_flags is empty
+        if build_extra_flags == '"build.extra_flags="':
+            build_extra_flags = '""'
+
+        # Populate Makefile
+        self._populate_makefile(
+            API_SERVER_DIR / f"{MAKEFILE_FILENAME}.template",
+            project_dir / MAKEFILE_FILENAME,
+            board,
+            verbose,
+            arduino_cli_cmd,
+            build_extra_flags,
+        )
+
+    def _get_arduino_cli_cmd(self, arduino_cli_cmd: str):
+        if not arduino_cli_cmd:
+            arduino_cli_cmd = ARDUINO_CLI_CMD
         assert arduino_cli_cmd, "'arduino_cli_cmd' command not passed and not found by default!"
         return arduino_cli_cmd
 
@@ -368,9 +462,8 @@ def _get_platform_version(self, arduino_cli_path: str) -> float:
         return version.parse(str_version)
 
     # This will only be run for build and upload
-    def _check_platform_version(self, options):
+    def _check_platform_version(self, cli_command: str, warning_as_error: bool):
         if not self._version:
-            cli_command = self._get_arduino_cli_cmd(options)
             self._version = self._get_platform_version(cli_command)
 
         if self._version < MIN_ARDUINO_CLI_VERSION:
@@ -378,33 +471,24 @@ def _check_platform_version(self, options):
                 f"Arduino CLI version too old: found {self._version}, "
                 f"need at least {str(MIN_ARDUINO_CLI_VERSION)}."
             )
-            if options.get("warning_as_error") is not None and options["warning_as_error"]:
+            if warning_as_error is not None and warning_as_error:
                 raise server.ServerError(message=message)
             _LOG.warning(message)
 
-    def _get_fqbn(self, options):
-        o = BOARD_PROPERTIES[options["arduino_board"]]
+    def _get_fqbn(self, board: str):
+        o = BOARD_PROPERTIES[board]
         return f"{o['package']}:{o['architecture']}:{o['board']}"
 
     def build(self, options):
-        self._check_platform_version(options)
-        BUILD_DIR.mkdir()
-
-        compile_cmd = [
-            self._get_arduino_cli_cmd(options),
-            "compile",
-            "./project/",
-            "--fqbn",
-            self._get_fqbn(options),
-            "--build-path",
-            BUILD_DIR.resolve(),
-        ]
-
-        if options.get("verbose"):
-            compile_cmd.append("--verbose")
+        # List all used project options
+        arduino_cli_cmd = options.get("arduino_cli_cmd")
+        warning_as_error = options.get("warning_as_error")
 
+        cli_command = self._get_arduino_cli_cmd(arduino_cli_cmd)
+        self._check_platform_version(cli_command, warning_as_error)
+        compile_cmd = ["make", "build"]
         # Specify project to compile
-        subprocess.run(compile_cmd, check=True)
+        subprocess.run(compile_cmd, check=True, cwd=API_SERVER_DIR)
 
     POSSIBLE_BOARD_LIST_HEADERS = ("Port", "Protocol", "Type", "Board Name", "FQBN", "Core")
 
@@ -440,13 +524,13 @@ def _parse_connected_boards(self, tabular_str):
                 device[col_name] = str_row[column.start() : column.end()].strip()
             yield device
 
-    def _auto_detect_port(self, options):
-        list_cmd = [self._get_arduino_cli_cmd(options), "board", "list"]
+    def _auto_detect_port(self, arduino_cli_cmd: str, board: str) -> str:
+        list_cmd = [self._get_arduino_cli_cmd(arduino_cli_cmd), "board", "list"]
         list_cmd_output = subprocess.run(
             list_cmd, check=True, stdout=subprocess.PIPE
         ).stdout.decode("utf-8")
 
-        desired_fqbn = self._get_fqbn(options)
+        desired_fqbn = self._get_fqbn(board)
         for device in self._parse_connected_boards(list_cmd_output):
             if device["fqbn"] == desired_fqbn:
                 return device["port"]
@@ -454,40 +538,46 @@ def _auto_detect_port(self, options):
         # If no compatible boards, raise an error
         raise BoardAutodetectFailed()
 
-    def _get_arduino_port(self, options):
+    def _get_arduino_port(self, arduino_cli_cmd: str, board: str, port: int):
         if not self._port:
-            if "port" in options and options["port"]:
-                self._port = options["port"]
+            if port:
+                self._port = port
             else:
-                self._port = self._auto_detect_port(options)
+                self._port = self._auto_detect_port(arduino_cli_cmd, board)
 
         return self._port
 
+    def _get_board_from_makefile(self, makefile_path: pathlib.Path) -> str:
+        """Get Board from generated Makefile."""
+        with open(makefile_path) as makefile_f:
+            line = makefile_f.readline()
+            if "BOARD" in line:
+                board = re.sub(r"\s", "", line).split(":=")[1]
+                return board
+        raise RuntimeError("Board was not found in Makefile: {}".format(makefile_path))
+
     FLASH_TIMEOUT_SEC = 60
     FLASH_MAX_RETRIES = 5
 
     def flash(self, options):
-        self._check_platform_version(options)
-        port = self._get_arduino_port(options)
-
-        upload_cmd = [
-            self._get_arduino_cli_cmd(options),
-            "upload",
-            "./project",
-            "--fqbn",
-            self._get_fqbn(options),
-            "--input-dir",
-            BUILD_DIR.resolve(),
-            "--port",
-            port,
-        ]
-
-        if options.get("verbose"):
-            upload_cmd.append("--verbose")
-
+        # List all used project options
+        arduino_cli_cmd = options.get("arduino_cli_cmd")
+        warning_as_error = options.get("warning_as_error")
+        port = options.get("port")
+        board = options.get("board")
+        if not board:
+            board = self._get_board_from_makefile(API_SERVER_DIR / MAKEFILE_FILENAME)
+
+        cli_command = self._get_arduino_cli_cmd(arduino_cli_cmd)
+        self._check_platform_version(cli_command, warning_as_error)
+        port = self._get_arduino_port(cli_command, board, port)
+
+        upload_cmd = ["make", "flash", f"PORT={port}"]
         for _ in range(self.FLASH_MAX_RETRIES):
             try:
-                subprocess.run(upload_cmd, check=True, timeout=self.FLASH_TIMEOUT_SEC)
+                subprocess.run(
+                    upload_cmd, check=True, timeout=self.FLASH_TIMEOUT_SEC, cwd=API_SERVER_DIR
+                )
                 break
 
             # We only catch timeout errors - a subprocess.CalledProcessError
@@ -507,11 +597,18 @@ def open_transport(self, options):
         import serial
         import serial.tools.list_ports
 
+        # List all used project options
+        arduino_cli_cmd = options.get("arduino_cli_cmd")
+        port = options.get("port")
+        board = options.get("board")
+        if not board:
+            board = self._get_board_from_makefile(API_SERVER_DIR / MAKEFILE_FILENAME)
+
         # Zephyr example doesn't throw an error in this case
         if self._serial is not None:
             return
 
-        port = self._get_arduino_port(options)
+        port = self._get_arduino_port(arduino_cli_cmd, board, port)
 
         # It takes a moment for the Arduino code to finish initializing
         # and start communicating over serial
diff --git a/apps/microtvm/reference-vm/base-box/base_box_test.sh b/apps/microtvm/reference-vm/base-box/base_box_test.sh
index a8a55a0f40ae..09779bb048e0 100755
--- a/apps/microtvm/reference-vm/base-box/base_box_test.sh
+++ b/apps/microtvm/reference-vm/base-box/base_box_test.sh
@@ -28,15 +28,15 @@ platform=$1
 board=$2
 
 if [ "${platform}" == "zephyr" ]; then
-    pytest tests/micro/zephyr --zephyr-board=${board}
+    pytest tests/micro/zephyr --board=${board}
 fi
 
 if [ "${platform}" == "arduino" ]; then
-    pytest tests/micro/arduino/test_arduino_workflow.py --arduino-board=${board}
+    pytest tests/micro/arduino/test_arduino_workflow.py --board=${board}
     if [ $board == "nano33ble" ]; then
         # https://github.com/apache/tvm/issues/8730
         echo "NOTE: skipped test_arduino_rpc_server.py on $board -- known failure"
     else
-        pytest tests/micro/arduino/test_arduino_rpc_server.py --arduino-board=${board}
+        pytest tests/micro/arduino/test_arduino_rpc_server.py --board=${board}
     fi
 fi
diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
index 5a0bc7309c63..568d958fb033 100644
--- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py
+++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
@@ -200,8 +200,8 @@ def _get_board_mem_size_bytes(options):
         pathlib.Path(get_zephyr_base(options))
         / "boards"
         / "arm"
-        / options["zephyr_board"]
-        / (options["zephyr_board"] + ".yaml")
+        / options["board"]
+        / (options["board"] + ".yaml")
     )
     try:
         with open(board_file_path) as f:
@@ -216,7 +216,7 @@ def _get_board_mem_size_bytes(options):
 
 
 def _get_recommended_heap_size_bytes(options):
-    prop = BOARD_PROPERTIES[options["zephyr_board"]]
+    prop = BOARD_PROPERTIES[options["board"]]
     if "recommended_heap_size_bytes" in prop:
         return prop["recommended_heap_size_bytes"]
     return DEFAULT_HEAP_SIZE_BYTES
@@ -297,50 +297,37 @@ def _get_nrf_device_args(options):
         if d.is_dir():
             PROJECT_TYPES.append(d.name)
 
-
-PROJECT_OPTIONS = [
-    server.ProjectOption(
-        "extra_files_tar",
-        optional=["generate_project"],
-        type="str",
-        help="If given, during generate_project, uncompress the tarball at this path into the project dir.",
-    ),
+PROJECT_OPTIONS = server.default_project_options(
+    project_type={"choices": tuple(PROJECT_TYPES)},
+    board={"choices": list(BOARD_PROPERTIES)},
+    verbose={"optional": ["generate_project"]},
+) + [
     server.ProjectOption(
         "gdbserver_port",
-        help=("If given, port number to use when running the local gdbserver."),
         optional=["open_transport"],
         type="int",
+        default=None,
+        help=("If given, port number to use when running the local gdbserver."),
     ),
     server.ProjectOption(
         "nrfjprog_snr",
         optional=["open_transport"],
         type="int",
+        default=None,
         help=("When used with nRF targets, serial # of the attached board to use, from nrfjprog."),
     ),
     server.ProjectOption(
         "openocd_serial",
         optional=["open_transport"],
         type="int",
+        default=None,
         help=("When used with OpenOCD targets, serial # of the attached board to use."),
     ),
-    server.ProjectOption(
-        "project_type",
-        choices=tuple(PROJECT_TYPES),
-        required=["generate_project"],
-        type="str",
-        help="Type of project to generate.",
-    ),
-    server.ProjectOption(
-        "verbose",
-        optional=["generate_project"],
-        type="bool",
-        help="Run build with verbose output.",
-    ),
     server.ProjectOption(
         "west_cmd",
         optional=["generate_project"],
-        default=WEST_CMD,
         type="str",
+        default=WEST_CMD,
         help=(
             "Path to the west tool. If given, supersedes both the zephyr_base "
             "option and ZEPHYR_BASE environment variable."
@@ -350,57 +337,36 @@ def _get_nrf_device_args(options):
         "zephyr_base",
         required=(["generate_project", "open_transport"] if not ZEPHYR_BASE else None),
         optional=(["generate_project", "open_transport"] if ZEPHYR_BASE else ["build"]),
-        default=ZEPHYR_BASE,
         type="str",
+        default=ZEPHYR_BASE,
         help="Path to the zephyr base directory.",
     ),
-    server.ProjectOption(
-        "zephyr_board",
-        required=["generate_project"],
-        choices=list(BOARD_PROPERTIES),
-        type="str",
-        help="Name of the Zephyr board to build for.",
-    ),
     server.ProjectOption(
         "config_main_stack_size",
         optional=["generate_project"],
         type="int",
+        default=None,
         help="Sets CONFIG_MAIN_STACK_SIZE for Zephyr board.",
     ),
-    server.ProjectOption(
-        "warning_as_error",
-        optional=["generate_project"],
-        type="bool",
-        help="Treat warnings as errors and raise an Exception.",
-    ),
-    server.ProjectOption(
-        "compile_definitions",
-        optional=["generate_project"],
-        type="str",
-        help="Extra definitions added project compile.",
-    ),
-    server.ProjectOption(
-        "cmsis_path",
-        optional=["generate_project"],
-        type="str",
-        help="Path to the CMSIS directory.",
-    ),
     server.ProjectOption(
         "arm_fvp_path",
         optional=["generate_project", "open_transport"],
         type="str",
+        default=None,
         help="Path to the FVP binary to invoke.",
     ),
     server.ProjectOption(
         "use_fvp",
         optional=["generate_project"],
         type="bool",
+        default=False,
         help="Run on the FVP emulator instead of hardware.",
     ),
     server.ProjectOption(
         "heap_size_bytes",
         optional=["generate_project"],
         type="int",
+        default=None,
         help="Sets the value for HEAP_SIZE_BYTES passed to K_HEAP_DEFINE() to service TVM memory allocation requests.",
     ),
 ]
@@ -456,7 +422,7 @@ def server_info_query(self, tvm_version):
     }
 
     def _create_prj_conf(self, project_dir, options):
-        zephyr_board = options["zephyr_board"]
+        zephyr_board = options["board"]
         with open(project_dir / "prj.conf", "w") as f:
             f.write(
                 "# For UART used from main().\n"
@@ -549,15 +515,15 @@ def _generate_cmake_args(self, mlf_extracted_path, options) -> str:
         if options.get("west_cmd"):
             cmake_args += f"set(WEST {options['west_cmd']})\n"
 
-        if self._is_qemu(options["zephyr_board"], options.get("use_fvp")):
+        if self._is_qemu(options["board"], options.get("use_fvp")):
             # Some boards support more than one emulator, so ensure QEMU is set.
             cmake_args += f"set(EMU_PLATFORM qemu)\n"
 
-        if self._is_fvp(options["zephyr_board"], options.get("use_fvp")):
+        if self._is_fvp(options["board"], options.get("use_fvp")):
             cmake_args += "set(EMU_PLATFORM armfvp)\n"
             cmake_args += "set(ARMFVP_FLAGS -I)\n"
 
-        cmake_args += f"set(BOARD {options['zephyr_board']})\n"
+        cmake_args += f"set(BOARD {options['board']})\n"
 
         enable_cmsis = self._cmsis_required(mlf_extracted_path)
         if enable_cmsis:
@@ -567,7 +533,7 @@ def _generate_cmake_args(self, mlf_extracted_path, options) -> str:
         return cmake_args
 
     def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options):
-        zephyr_board = options["zephyr_board"]
+        zephyr_board = options["board"]
 
         # Check Zephyr version
         version = self._get_platform_version(get_zephyr_base(options))
diff --git a/cmake/modules/Arduino.cmake b/cmake/modules/Arduino.cmake
index f603e5d5df2b..bc06fab7a408 100644
--- a/cmake/modules/Arduino.cmake
+++ b/cmake/modules/Arduino.cmake
@@ -28,6 +28,7 @@ if(USE_MICRO)
       "apps/microtvm/arduino/template_project/src/host_driven *.c -> arduino/src/host_driven"
       "apps/microtvm/arduino/template_project/src/host_driven *.ino -> arduino/src/host_driven"
       "apps/microtvm/arduino/template_project/crt_config *.h -> arduino/crt_config"
+      "apps/microtvm/arduino/template_project Makefile.template -> arduino"
     )
 
     foreach(job_spec IN LISTS ARDUINO_FILE_COPY_JOBS)
diff --git a/gallery/how_to/work_with_microtvm/micro_aot.py b/gallery/how_to/work_with_microtvm/micro_aot.py
index 9a177559e116..9bfe4c39a967 100644
--- a/gallery/how_to/work_with_microtvm/micro_aot.py
+++ b/gallery/how_to/work_with_microtvm/micro_aot.py
@@ -133,7 +133,7 @@
 
 if use_physical_hw:
     template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr"))
-    project_options = {"project_type": "host_driven", "zephyr_board": BOARD}
+    project_options = {"project_type": "host_driven", "board": BOARD}
 
 temp_dir = tvm.contrib.utils.tempdir()
 generated_project_dir = temp_dir / "project"
diff --git a/gallery/how_to/work_with_microtvm/micro_autotune.py b/gallery/how_to/work_with_microtvm/micro_autotune.py
index 58c52508b7c2..4c57717df889 100644
--- a/gallery/how_to/work_with_microtvm/micro_autotune.py
+++ b/gallery/how_to/work_with_microtvm/micro_autotune.py
@@ -152,7 +152,7 @@
     module_loader = tvm.micro.AutoTvmModuleLoader(
         template_project_dir=pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")),
         project_options={
-            "zephyr_board": BOARD,
+            "board": BOARD,
             "west_cmd": "west",
             "verbose": False,
             "project_type": "host_driven",
@@ -219,7 +219,7 @@
         lowered,
         temp_dir / "project",
         {
-            "zephyr_board": BOARD,
+            "board": BOARD,
             "west_cmd": "west",
             "verbose": False,
             "project_type": "host_driven",
@@ -262,7 +262,7 @@
         lowered_tuned,
         temp_dir / "project",
         {
-            "zephyr_board": BOARD,
+            "board": BOARD,
             "west_cmd": "west",
             "verbose": False,
             "project_type": "host_driven",
diff --git a/gallery/how_to/work_with_microtvm/micro_reference_vm.py b/gallery/how_to/work_with_microtvm/micro_reference_vm.py
index b87a7265649f..80ab0edf8fae 100644
--- a/gallery/how_to/work_with_microtvm/micro_reference_vm.py
+++ b/gallery/how_to/work_with_microtvm/micro_reference_vm.py
@@ -143,7 +143,7 @@
 .. code-block:: bash
 
     $ cd apps/microtvm/reference-vm/zephyr
-    $ poetry run python3 ../../../../tests/micro/zephyr/test_zephyr.py --zephyr-board=stm32f746g_disco
+    $ poetry run python3 ../../../../tests/micro/zephyr/test_zephyr.py --board=stm32f746g_disco
 
 If you do not have physical hardware attached, but wish to run the tests using the
 local QEMU emulator running within the VM, run the following commands instead:
@@ -152,7 +152,7 @@
 
     $ cd /Users/yourusername/path/to/tvm
     $ cd apps/microtvm/reference-vm/zephyr/
-    $ poetry run pytest ../../../../tests/micro/zephyr/test_zephyr.py --zephyr-board=qemu_x86
+    $ poetry run pytest ../../../../tests/micro/zephyr/test_zephyr.py --board=qemu_x86
 
 
diff --git a/gallery/how_to/work_with_microtvm/micro_tflite.py b/gallery/how_to/work_with_microtvm/micro_tflite.py
index dfe33eedac75..7bbc5fc228cc 100644
--- a/gallery/how_to/work_with_microtvm/micro_tflite.py
+++ b/gallery/how_to/work_with_microtvm/micro_tflite.py
@@ -291,7 +291,7 @@
 
 if use_physical_hw:
     template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr"))
-    project_options = {"project_type": "host_driven", "zephyr_board": BOARD}
+    project_options = {"project_type": "host_driven", "board": BOARD}
 
 # Create a temporary directory
 
diff --git a/gallery/how_to/work_with_microtvm/micro_train.py b/gallery/how_to/work_with_microtvm/micro_train.py
index f75c0b05eb1d..44e0dd5cb730 100644
--- a/gallery/how_to/work_with_microtvm/micro_train.py
+++ b/gallery/how_to/work_with_microtvm/micro_train.py
@@ -478,7 +478,7 @@ def representative_dataset():
     mod,
     f"{FOLDER}/models/project",
     {
-        "arduino_board": "nano33ble",
+        "board": "nano33ble",
         "arduino_cli_cmd": "/content/bin/arduino-cli",
         "project_type": "example_project",
     },
diff --git a/python/tvm/micro/project_api/server.py b/python/tvm/micro/project_api/server.py
index ed26733cfc3c..5aed3a896241 100644
--- a/python/tvm/micro/project_api/server.py
+++ b/python/tvm/micro/project_api/server.py
@@ -64,6 +64,11 @@ def __new__(cls, name, **kw):
 
         return super().__new__(cls, **kw)
 
+    def replace(self, attributes):
+        """Update attributes associated to the project option."""
+        updated_option = self
+        return updated_option._replace(**attributes)
+
 
 ServerInfo = collections.namedtuple(
     "ServerInfo", ("platform_name", "is_template", "model_library_format_path", "project_options")
@@ -759,6 +764,85 @@ def write_with_timeout(fd, data, timeout_sec):  # pylint: disable=invalid-name
     return num_written
 
 
+def default_project_options(**kw) -> typing.List[ProjectOption]:
+    """Get default Project Options
+
+    Attributes of any default option can be updated. Here is an example
+    when attribute `optional` from `verbose` option needs to be updates:
+
+        default_project_options(verbose={"optional": ["build"]})
+
+    This will update the `optional` attribute of `verbose` ProjectOption
+    to be `["build"]`.
+
+    Returns
+    -------
+    options: List[ProjectOption]
+        A list of default ProjectOption with modifications.
+    """
+    options = [
+        ProjectOption(
+            "verbose",
+            optional=["generate_project"],
+            type="bool",
+            default=False,
+            help="Run build with verbose output.",
+        ),
+        ProjectOption(
+            "project_type",
+            required=["generate_project"],
+            type="str",
+            help="Type of project to generate.",
+        ),
+        ProjectOption(
+            "board",
+            required=["generate_project"],
+            type="str",
+            help="Name of the board to build for.",
+        ),
+        ProjectOption(
+            "cmsis_path",
+            optional=["generate_project"],
+            type="str",
+            default=None,
+            help="Path to the CMSIS directory.",
+        ),
+        ProjectOption(
+            "warning_as_error",
+            optional=["generate_project"],
+            type="bool",
+            default=False,
+            help="Treat warnings as errors and raise an Exception.",
+        ),
+        ProjectOption(
+            "compile_definitions",
+            optional=["generate_project"],
+            type="str",
+            default=None,
+            help="Extra definitions added project compile.",
+        ),
+        ProjectOption(
+            "extra_files_tar",
+            optional=["generate_project"],
+            type="str",
+            default=None,
+            help="If given, during generate_project, "
+            "uncompress the tarball at this path into the project dir.",
+        ),
+    ]
+    for name, config in kw.items():
+        option_found = False
+        for ind, option in enumerate(options):
+            if option.name == name:
+                options[ind] = option.replace(config)
+                option_found = True
+                break
+        if not option_found:
+            raise ValueError("Option {} was not found in default ProjectOptions.".format(name))
+
+    return options
+
+
 def main(handler: ProjectAPIHandler, argv: typing.List[str] = None):
     """Start a Project API server.
 
diff --git a/python/tvm/micro/testing/evaluation.py b/python/tvm/micro/testing/evaluation.py
index 1d80ed5568b2..c16b97f61df3 100644
--- a/python/tvm/micro/testing/evaluation.py
+++ b/python/tvm/micro/testing/evaluation.py
@@ -50,7 +50,7 @@ def tune_model(
     assert isinstance(params, dict)
 
     project_options = {
-        f"{platform}_board": board,
+        "board": board,
         "project_type": "host_driven",
         **(project_options or {}),
     }
@@ -129,7 +129,7 @@ def create_aot_session(
     print(f"Model parameter size: {parameter_size}")
 
     project_options = {
-        f"{platform}_board": board,
+        "board": board,
         "project_type": "host_driven",
         # {} shouldn't be the default value for project options ({}
         # is mutable), so we use this workaround
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index 51a80431d37f..162e4a1cc7a1 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -149,6 +149,8 @@
     "apps/microtvm/zephyr/template_project/qemu-hack/qemu-system-riscv64",
     "apps/microtvm/zephyr/template_project/fvp-hack/FVP_Corstone_SSE-300_Ethos-U55",
     "apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay",
+    # microTVM Arduino runtime
+    "apps/microtvm/arduino/template_project/Makefile.template",
     # microTVM Virtual Machines
     "apps/microtvm/poetry.lock",
     "apps/microtvm/reference-vm/Vagrantfile",
diff --git a/tests/micro/arduino/README.md b/tests/micro/arduino/README.md
index 0b039ba6de7c..2b37599849f7 100644
--- a/tests/micro/arduino/README.md
+++ b/tests/micro/arduino/README.md
@@ -22,14 +22,14 @@ all of the appropriate TVM dependencies installed. You can run the test with:
 
 ```
 $ cd tvm/tests/micro/arduino
-$ pytest --arduino-board=spresense
+$ pytest --board=spresense
 ```
 
 Most of these tests require a supported Arduino board to be connected.
 If you don't want to run these tests, you can pass the flag
 `--test-build-only` to only test project generation and compilation.
 
-To see the list of supported values for `--arduino-board`, run:
+To see the list of supported values for `--board`, run:
 ```
 $ pytest --help
 ```
diff --git a/tests/micro/arduino/test_arduino_rpc_server.py b/tests/micro/arduino/test_arduino_rpc_server.py
index 3440fde8f7e6..e3d97bfdf915 100644
--- a/tests/micro/arduino/test_arduino_rpc_server.py
+++ b/tests/micro/arduino/test_arduino_rpc_server.py
@@ -44,7 +44,7 @@ def _make_session(model, arduino_board, arduino_cli_cmd, workspace_dir, mod, bui
         mod,
         workspace_dir / "project",
         {
-            "arduino_board": arduino_board,
+            "board": arduino_board,
             "arduino_cli_cmd": arduino_cli_cmd,
             "project_type": "host_driven",
             "verbose": bool(build_config.get("debug")),
diff --git a/tests/micro/arduino/test_utils.py b/tests/micro/arduino/test_utils.py
index 42ad7d40f35b..b27d4bb7aa10 100644
--- a/tests/micro/arduino/test_utils.py
+++ b/tests/micro/arduino/test_utils.py
@@ -84,7 +84,7 @@ def make_kws_project(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
         mod,
         workspace_dir / "project",
         {
-            "arduino_board": board,
+            "board": board,
             "arduino_cli_cmd": arduino_cli_cmd,
             "project_type": "example_project",
             "verbose": bool(build_config.get("debug")),
diff --git a/tests/micro/common/test_tvmc.py b/tests/micro/common/test_tvmc.py
index bd11b579e654..b2321f7d86a3 100644
--- a/tests/micro/common/test_tvmc.py
+++ b/tests/micro/common/test_tvmc.py
@@ -101,16 +101,13 @@ def test_tvmc_model_build_only(platform, board, output_dir):
         platform,
         "--project-option",
         "project_type=host_driven",
+        f"board={board}",
     ]
-    if platform == "zephyr":
-        create_project_cmd.append(f"{platform}_board={board}")
 
     cmd_result = _run_tvmc(create_project_cmd)
     assert cmd_result == 0, "tvmc micro failed in step: create-project"
 
     build_cmd = ["micro", "build", project_dir, platform]
-    if platform == "arduino":
-        build_cmd += ["--project-option", f"{platform}_board={board}"]
     cmd_result = _run_tvmc(build_cmd)
     assert cmd_result == 0, "tvmc micro failed in step: build"
     shutil.rmtree(output_dir)
@@ -168,23 +165,18 @@ def test_tvmc_model_run(platform, board, output_dir):
         platform,
         "--project-option",
         "project_type=host_driven",
+        f"board={board}",
     ]
-    if platform == "zephyr":
-        create_project_cmd.append(f"{platform}_board={board}")
 
     cmd_result = _run_tvmc(create_project_cmd)
     assert cmd_result == 0, "tvmc micro failed in step: create-project"
 
     build_cmd = ["micro", "build", project_dir, platform]
-    if platform == "arduino":
-        build_cmd += ["--project-option", f"{platform}_board={board}"]
     cmd_result = _run_tvmc(build_cmd)
 
     assert cmd_result == 0, "tvmc micro failed in step: build"
 
     flash_cmd = ["micro", "flash", project_dir, platform]
-    if platform == "arduino":
-        flash_cmd += ["--project-option", f"{platform}_board={board}"]
     cmd_result = _run_tvmc(flash_cmd)
     assert cmd_result == 0, "tvmc micro failed in step: flash"
 
@@ -194,8 +186,6 @@ def test_tvmc_model_run(platform, board, output_dir):
         "micro",
         project_dir,
     ]
-    if platform == "arduino":
-        run_cmd += ["--project-option", f"{platform}_board={board}"]
     run_cmd += ["--fill-mode", "random"]
     cmd_result = _run_tvmc(run_cmd)
     assert cmd_result == 0, "tvmc micro failed in step: run"
diff --git a/apps/microtvm/arduino/template_project/tests/test_arduino_microtvm_api_server.py b/tests/micro/project_api/test_arduino_microtvm_api_server.py
similarity index 89%
rename from apps/microtvm/arduino/template_project/tests/test_arduino_microtvm_api_server.py
rename to tests/micro/project_api/test_arduino_microtvm_api_server.py
index e74e3de55d32..ad9bd4a56a2d 100644
--- a/apps/microtvm/arduino/template_project/tests/test_arduino_microtvm_api_server.py
+++ b/tests/micro/project_api/test_arduino_microtvm_api_server.py
@@ -23,16 +23,17 @@
 from packaging import version
 import pytest
 
+import tvm
 from tvm.micro.project_api import server
 
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, tvm.micro.get_microtvm_template_projects("arduino"))
 import microtvm_api_server
 
 sys.path.pop(0)
 
 
 class TestGenerateProject:
-    DEFAULT_OPTIONS = {"arduino_cli_cmd": "arduino-cli", "arduino_board": "nano33ble"}
+    DEFAULT_OPTIONS = {"arduino_cli_cmd": "arduino-cli", "board": "nano33ble"}
 
     def _set_pathlib_path_exists(self, value):
         with mock.patch.object(Path, "exists") as mock_exists:
@@ -122,8 +123,7 @@ def test_auto_detect_port(self, mock_run):
         handler._get_fqbn = mock.MagicMock(return_value="arduino:mbed_nano:nano33")
         mock_run.return_value.stdout = bytes(self.BOARD_CONNECTED_V18, "utf-8")
         assert (
-            handler._auto_detect_port({**self.DEFAULT_OPTIONS, "arduino_board": "nano33"})
-            == "/dev/ttyACM1"
+            handler._auto_detect_port({**self.DEFAULT_OPTIONS, "board": "nano33"}) == "/dev/ttyACM1"
         )
 
     BAD_CLI_VERSION = "arduino-cli  Version: 0.7.1 Commit: 7668c465 Date: 2019-12-31T18:24:32Z\n"
@@ -133,13 +133,17 @@ def test_auto_detect_port(self, mock_run):
     def test_auto_detect_port(self, mock_run):
         handler = microtvm_api_server.Handler()
         mock_run.return_value.stdout = bytes(self.GOOD_CLI_VERSION, "utf-8")
-        handler._check_platform_version(self.DEFAULT_OPTIONS)
+        arduino_cli_cmd = self.DEFAULT_OPTIONS.get("arduino_cli_cmd")
+        warning_as_error = self.DEFAULT_OPTIONS.get("warning_as_error")
+
+        cli_command = handler._get_arduino_cli_cmd(arduino_cli_cmd)
+        handler._check_platform_version(cli_command=cli_command, warning_as_error=warning_as_error)
         assert handler._version == version.parse("0.21.1")
 
         handler = microtvm_api_server.Handler()
         mock_run.return_value.stdout = bytes(self.BAD_CLI_VERSION, "utf-8")
         with pytest.raises(server.ServerError) as error:
-            handler._check_platform_version({"warning_as_error": True})
+            handler._check_platform_version(cli_command=cli_command, warning_as_error=True)
         mock_run.reset_mock()
 
     @mock.patch("subprocess.run")
@@ -147,7 +151,7 @@ def test_flash_retry(self, mock_run):
         mock_run.return_value.stdout = bytes(self.GOOD_CLI_VERSION, "utf-8")
 
         def side_effect(cmd, *args, **kwargs):
-            if cmd[1] == "upload":
+            if cmd[1] == "flash":
                 raise subprocess.TimeoutExpired(cmd, kwargs["timeout"])
             return mock.DEFAULT
 
@@ -178,7 +182,7 @@ def test_flash(self, mock_run):
         # Test we checked version then called upload
         assert mock_run.call_count == 2
         assert mock_run.call_args_list[0][0] == (["arduino-cli", "version"],)
-        assert mock_run.call_args_list[1][0][0][0:2] == ["arduino-cli", "upload"]
+        assert mock_run.call_args_list[1][0][0][0:2] == ["make", "flash"]
         mock_run.reset_mock()
 
         # Test exception raised when `arduino-cli upload` returns error code
@@ -188,4 +192,8 @@ def test_flash(self, mock_run):
 
         # Version information should be cached and not checked again
         mock_run.assert_called_once()
-        assert mock_run.call_args[0][0][0:2] == ["arduino-cli", "upload"]
+        assert mock_run.call_args[0][0][0:2] == ["make", "flash"]
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/micro/project_api/test_project_api.py b/tests/micro/project_api/test_project_api.py
new file mode 100644
index 000000000000..b85f6c09536f
--- /dev/null
+++ b/tests/micro/project_api/test_project_api.py
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.micro.project_api import server
+from tvm.relay.backend import Runtime
+from tvm.micro.testing import get_target
+
+API_GENERATE_PROJECT = "generate_project"
+API_BUILD = "build"
+API_FLASH = "flash"
+API_OPEN_TRANSPORT = "open_transport"
+
+PLATFORM_ARDUINO = "arduino"
+PLATFORM_ZEPHYR = "zephyr"
+
+
+platform = tvm.testing.parameter(PLATFORM_ARDUINO, PLATFORM_ZEPHYR)
+
+
+@tvm.testing.requires_micro
+def test_default_options_exist(platform):
+    sys.path.insert(0, tvm.micro.get_microtvm_template_projects(platform))
+    import microtvm_api_server
+
+    platform_options = microtvm_api_server.PROJECT_OPTIONS
+    default_options = server.default_project_options()
+
+    option_names = []
+    for option in platform_options:
+        option_names.append(option.name)
+
+    for option in default_options:
+        assert option.name in option_names
+
+
+@tvm.testing.requires_micro
+def test_project_minimal_options(platform):
+    """Test template project with minimum projectOptions"""
+    shape = (10,)
+    dtype = "int8"
+    x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype))
+    xx = relay.multiply(x, x)
+    z = relay.add(xx, relay.const(np.ones(shape=shape, dtype=dtype)))
+    func = relay.Function([x], z)
+    ir_mod = tvm.IRModule.from_expr(func)
+
+    if platform == "arduino":
+        board = "due"
+    elif platform == "zephyr":
+        board = "qemu_x86"
+
+    runtime = Runtime("crt", {"system-lib": True})
+    target = get_target(platform, board)
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        mod = tvm.relay.build(ir_mod, target=target, runtime=runtime)
+
+    project_options = {
+        "project_type": "host_driven",
+        "board": board,
+    }
+
+    temp_dir = tvm.contrib.utils.tempdir()
+    project = tvm.micro.generate_project(
+        tvm.micro.get_microtvm_template_projects(platform),
+        mod,
+        temp_dir / "project",
+        project_options,
+    )
+    project.build()
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/micro/zephyr/README.md b/tests/micro/zephyr/README.md
index 09376e42f8bb..d41045c3752f 100644
--- a/tests/micro/zephyr/README.md
+++ b/tests/micro/zephyr/README.md
@@ -32,11 +32,11 @@ device) using:
 
 ```
 $ cd tvm/tests/micro/zephyr
-$ pytest test_zephyr.py --zephyr-board=qemu_x86       # For QEMU emulation
-$ pytest test_zephyr.py --zephyr-board=nrf5340dk_nrf5340_cpuapp  # For nRF5340DK
+$ pytest test_zephyr.py --board=qemu_x86       # For QEMU emulation
+$ pytest test_zephyr.py --board=nrf5340dk_nrf5340_cpuapp  # For nRF5340DK
 ```
 
-To see the list of supported values for `--zephyr-board`, run:
+To see the list of supported values for `--board`, run:
 ```
 $ pytest test_zephyr.py --help
 ```
diff --git a/tests/micro/zephyr/test_utils.py b/tests/micro/zephyr/test_utils.py
index 52f0eef36359..695bf2e9caae 100644
--- a/tests/micro/zephyr/test_utils.py
+++ b/tests/micro/zephyr/test_utils.py
@@ -90,7 +90,7 @@ def build_project(
             "project_type": "aot_standalone_demo",
             "west_cmd": west_cmd,
             "verbose": bool(build_config.get("debug")),
-            "zephyr_board": zephyr_board,
+            "board": zephyr_board,
             "compile_definitions": [
                 # TODO(mehrdadh): It fails without offset.
                 f"-DWORKSPACE_SIZE={workspace_size + 128}",
diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py
index 8d9a73704d8e..0ef1dd2ce211 100644
--- a/tests/micro/zephyr/test_zephyr.py
+++ b/tests/micro/zephyr/test_zephyr.py
@@ -59,7 +59,7 @@ def _make_session(temp_dir, zephyr_board, west_cmd, mod, build_config, use_fvp):
         "project_type": "host_driven",
         "west_cmd": west_cmd,
         "verbose": bool(build_config.get("debug")),
-        "zephyr_board": zephyr_board,
+        "board": zephyr_board,
         "arm_fvp_path": "/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4/FVP_Corstone_SSE-300_Ethos-U55",
         "use_fvp": bool(use_fvp),
     }
@@ -456,7 +456,7 @@ def test_autotune_conv2d(workspace_dir, board, west_cmd, microtvm_debug, use_fvp
         config_main_stack_size = 1536
 
     project_options = {
-        "zephyr_board": board,
+        "board": board,
         "west_cmd": west_cmd,
         "verbose": 1,
         "project_type": "host_driven",
@@ -577,7 +577,7 @@ def test_schedule_build_with_cmsis_dependency(
         "project_type": "host_driven",
         "west_cmd": west_cmd,
         "verbose": bool(build_config.get("debug")),
-        "zephyr_board": board,
+        "board": board,
         "cmsis_path": os.getenv("CMSIS_PATH"),
         "use_fvp": bool(use_fvp),
     }
diff --git a/tests/micro/zephyr/test_zephyr_aot_exec.py b/tests/micro/zephyr/test_zephyr_aot_exec.py
index 2f00d855a46d..9ebba8ec08cb 100644
--- a/tests/micro/zephyr/test_zephyr_aot_exec.py
+++ b/tests/micro/zephyr/test_zephyr_aot_exec.py
@@ -51,7 +51,7 @@ def _make_session(workspace_dir, zephyr_board, west_cmd, mod, build_config, use_
         "project_type": "host_driven",
         "west_cmd": west_cmd,
         "verbose": bool(build_config.get("debug")),
-        "zephyr_board": zephyr_board,
+        "board": zephyr_board,
         "arm_fvp_path": "/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4/FVP_Corstone_SSE-300_Ethos-U55",
         "use_fvp": bool(use_fvp),
     }
diff --git a/tests/python/unittest/test_micro_project_api.py b/tests/python/unittest/test_micro_project_api.py
index 569393c06094..8e4fe6700e00 100644
--- a/tests/python/unittest/test_micro_project_api.py
+++ b/tests/python/unittest/test_micro_project_api.py
@@ -494,5 +494,40 @@ def _request_reply(request):
     }
 
 
+@tvm.testing.requires_micro
+def test_default_project_options():
+    from tvm.micro import project_api
+
+    default_options = project_api.server.default_project_options()
+    names = []
+    for option in default_options:
+        names.append(option.name)
+        if option.name == "verbose":
+            assert "generate_project" in option.optional
+        if option.name in ["project_type", "board"]:
+            assert "generate_project" in option.required
+        if option.name == "warning_as_error":
+            assert "generate_project" in option.optional
+
+    for name in ["verbose", "project_type", "board", "cmsis_path", "warning_as_error"]:
+        assert name in names
+
+
+@tvm.testing.requires_micro
+def test_modified_project_options():
+    from tvm.micro import project_api
+
+    modified_options = project_api.server.default_project_options(
+        verbose={"optional": ["flash"], "required": ["build"]},
+        board={"choices": ["board1", "board2"]},
+    )
+    for option in modified_options:
+        if option.name == "verbose":
+            assert option.optional == ["flash"]
+            assert option.required == ["build"]
+        if option.name == "board":
+            assert option.choices == ["board1", "board2"]
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh
index a2ef53a123bf..e8907c99e303 100755
--- a/tests/scripts/task_python_microtvm.sh
+++ b/tests/scripts/task_python_microtvm.sh
@@ -31,7 +31,6 @@ run_pytest ctypes python-microtvm-zephyr-mps2_an521 tests/micro/zephyr --board=m
 run_pytest ctypes python-microtvm-zephyr-mps3_an547 tests/micro/zephyr --board=mps3_an547 --use-fvp
 
 # Arduino
-run_pytest ctypes python-microtvm-arduino apps/microtvm/arduino/template_project/tests
 run_pytest ctypes python-microtvm-arduino-nano33ble tests/micro/arduino --board=nano33ble --test-build-only
 run_pytest ctypes python-microtvm-arduino-due tests/micro/arduino --board=due --test-build-only
 
@@ -42,6 +41,9 @@ run_pytest ctypes python-microtvm-stm32 tests/micro/stm32
 run_pytest ctypes python-microtvm-common-qemu_x86 tests/micro/common --platform=zephyr --board=qemu_x86
 run_pytest ctypes python-microtvm-common-due tests/micro/common --platform=arduino --board=due --test-build-only
 
+# Project API
+run_pytest ctypes python-microtvm-project_api tests/micro/project_api
+
 # Tutorials
 python3 gallery/how_to/work_with_microtvm/micro_tflite.py
 python3 gallery/how_to/work_with_microtvm/micro_autotune.py

From 3a125375a3df820b2d0ade61591b0edbc335046d Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Tue, 4 Oct 2022 13:51:31 -0500
Subject: [PATCH 298/704] [Hexagon] Don't print simulator protocol messages
 (#12958)

They flood the standard error, and are only useful when debugging protocol
issues. Remove them, they are easy to add back if needed.
---
 src/runtime/hexagon/rpc/simulator/session.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/runtime/hexagon/rpc/simulator/session.cc b/src/runtime/hexagon/rpc/simulator/session.cc
index 8943c0b4bfe5..54a9ff8c5884 100644
--- a/src/runtime/hexagon/rpc/simulator/session.cc
+++ b/src/runtime/hexagon/rpc/simulator/session.cc
@@ -681,7 +681,6 @@ Message SimulatorRPCChannel::SendMsg(Message msg) {
   };
 
   Message_ msg_ = {msg};
-  LOG(INFO) << "Sending message: " << msg_.str();
 
   WriteToProcess(message_buffer_v_, &msg, sizeof msg);
   run();

From a661864f85369fbb0ca85f72dca47b2b1c347149 Mon Sep 17 00:00:00 2001
From: Havisha Panda <97978678+hpanda-naut@users.noreply.github.com>
Date: Tue, 4 Oct 2022 17:59:32 -0400
Subject: [PATCH 299/704] [skip ci] Modify issue templates to align with Issue
 Tracking RFC (#12898)

* Issue Templates Edits

Made changes to the issue templates to align with https://github.com/apache/tvm-rfcs/blob/main/rfcs/0093_Issue_Triage.md

* Added label tags' placeholder

* added back default labels

* edited format

* added the link to the label tag document

* added an example for Triage section
---
 .github/ISSUE_TEMPLATE/bug-report.md       | 11 +++++---
 .github/ISSUE_TEMPLATE/ci-image.md         | 29 ----------------------
 .github/ISSUE_TEMPLATE/ci-problem.md       | 13 +++++++---
 .github/ISSUE_TEMPLATE/documentation.md    | 11 +++++---
 .github/ISSUE_TEMPLATE/feature-tracking.md | 11 +++++---
 .github/ISSUE_TEMPLATE/flaky-test.md       | 10 ++++++--
 6 files changed, 41 insertions(+), 44 deletions(-)
 delete mode 100644 .github/ISSUE_TEMPLATE/ci-image.md

diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
index 532f5f408b35..22771333683b 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -1,9 +1,8 @@
 ---
 name: "\U0001F41B Bug report"
-about: To help the developer act on the issues, please include a description of your environment, preferably a minimum script to reproduce the problem.
+about: To help the developer act on the issues, please include a description of your environment, preferably a minimum script to reproduce the problem. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
 title: "[Bug] "
-labels: "type: bug"
-
+labels: "needs-triage, type: bug"
 ---
 
 Thanks for participating in the TVM community! We use https://discuss.tvm.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking.  You are always welcomed to post on the forum first :smile_cat:
@@ -25,3 +24,9 @@ Any environment details, such as: Operating System, TVM version, etc
 ### Steps to reproduce
 
 Preferably a minimal script to cause the issue to occur.
+
+### Triage
+
+Please refer to the list of label tags linked above to find the relevant tags and add them here in a bullet format (example below).
+
+* needs-triage
diff --git a/.github/ISSUE_TEMPLATE/ci-image.md b/.github/ISSUE_TEMPLATE/ci-image.md
deleted file mode 100644
index d5abd8f20f80..000000000000
--- a/.github/ISSUE_TEMPLATE/ci-image.md
+++ /dev/null
@@ -1,29 +0,0 @@
----
-name: "\U0001F40B Update CI Docker Image"
-about: Provide information on CI Docker Images requiring updates
-title: "[CI Image] "
-
----
-
-Thanks for participating in the TVM community! We use https://discuss.tvm.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking.  You are always welcomed to post on the forum first :smile_cat:
-
-Issues that are inactive for a period of time may get closed. We adopt this policy so that we won't lose track of actionable issues that may fall at the bottom of the pile. Feel free to reopen a new one if you feel there is an additional problem that needs attention when an old one gets closed.
-
-- [ ] S0. Reason: For example, a blocked PR or a feature issue
-
-- [ ] S1. Tag of nightly build: TAG. Docker hub: https://hub.docker.com/layers/tlcpackstaging/ci_cpu/...
-
-- [ ] S2. The nightly is built on TVM commit: TVM_COMMIT. Detailed info can be found here: https://ci.tlcpack.ai/blue/organizations/jenkins/docker-images-ci%2Fdaily-docker-image-rebuild/detail/daily-docker-image-rebuild/....
-
-- [ ] S3. Testing the nightly image on ci-docker-staging: https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/ci-docker-staging/...
-
-- [ ] S4. Retag TAG to VERSION:
-```
-docker pull tlcpackstaging/IMAGE_NAME:TAG
-docker tag tlcpackstaging/IMAGE_NAME:TAG tlcpack/IMAGE_NAME:VERSION
-docker push tlcpack/IMAGE_NAME:VERSION
-```
-
-- [ ] S5. Check if the new tag is really there: https://hub.docker.com/u/tlcpack
-
-- [ ] S6. Submit a PR updating the IMAGE_NAME version on Jenkins
diff --git a/.github/ISSUE_TEMPLATE/ci-problem.md b/.github/ISSUE_TEMPLATE/ci-problem.md
index c917472c186a..0e91c5650e05 100644
--- a/.github/ISSUE_TEMPLATE/ci-problem.md
+++ b/.github/ISSUE_TEMPLATE/ci-problem.md
@@ -1,9 +1,8 @@
 ---
 name: "\U0000274C CI Problem"
-about: To help the developers act on these problems, please give us as many details of the CI failure as possible.
+about: To help the developers act on these problems, please give us as many details of the CI failure as possible. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
 title: "[CI Problem] "
-labels: "needs-triage"
-
+labels: "needs-triage, type: ci"
 ---
 
 Thanks for participating in the TVM community! We use https://discuss.tvm.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking.  You are always welcomed to post on the forum first :smile_cat:
@@ -20,4 +19,10 @@ Provide a link to the specific run that has failed.
 
 ### Flakiness
 
-Have you seen this multiple times in this branch or in other branches?
\ No newline at end of file
+Have you seen this multiple times in this branch or in other branches?
+
+### Triage
+
+Please refer to the list of label tags linked above to find the relevant tags and add them here in a bullet format (example below).
+
+* needs-triage
diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md
index a1c1facb7e1b..f41d1238b0ec 100644
--- a/.github/ISSUE_TEMPLATE/documentation.md
+++ b/.github/ISSUE_TEMPLATE/documentation.md
@@ -1,9 +1,8 @@
 ---
 name: "\U0001F4C4 Documentation"
-about: Use this template to suggest additions and changes to the documentation.
+about: Use this template to suggest additions and changes to the documentation. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
 title: "[Docs] "
-labels: "type: doc"
-
+labels: "needs-triage, type: doc"
 ---
 
 Thanks for participating in the TVM community! We use https://discuss.tvm.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking.  You are always welcomed to post on the forum first :smile_cat:
@@ -20,3 +19,9 @@ If an RFC/discuss post exists, link it here.
 
 Otherwise, specify what actions should be taken to provide additional clarity/readability/reproducibility to the document. Include code snippets from the previous documentation if applicable.
 
+### Triage
+
+Please refer to the list of label tags linked above to find the relevant tags and add them here in a bullet format (example below).
+
+* needs-triage
+
diff --git a/.github/ISSUE_TEMPLATE/feature-tracking.md b/.github/ISSUE_TEMPLATE/feature-tracking.md
index 8dd0648f69d4..2113f4cbcff9 100644
--- a/.github/ISSUE_TEMPLATE/feature-tracking.md
+++ b/.github/ISSUE_TEMPLATE/feature-tracking.md
@@ -1,9 +1,8 @@
 ---
 name: "\U0001F527 Feature Tracking"
-about: List clear, small actionable items so we can track the progress of the change.
+about: List clear, small actionable items so we can track the progress of the change. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
 title: "[Tracking Issue] "
-labels: type:rfc-tracking
-
+labels: "needs-triage, type:rfc-tracking"
 ---
 
 Thanks for participating in the TVM community! We use https://discuss.tvm.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking.  You are always welcomed to post on the forum first :smile_cat:
@@ -12,3 +11,9 @@ Issues that are inactive for a period of time may get closed. We adopt this poli
 
 ### This issue is to track progress for FEATURE NAME
 - [ ] P1. Title of this piece of the feature (PR link if available)
+
+### Triage
+
+Please refer to the list of label tags linked above to find the relevant tags and add them here in a bullet format (example below).
+
+* needs-triage
diff --git a/.github/ISSUE_TEMPLATE/flaky-test.md b/.github/ISSUE_TEMPLATE/flaky-test.md
index 1e8d267f8ec1..8d8238613523 100644
--- a/.github/ISSUE_TEMPLATE/flaky-test.md
+++ b/.github/ISSUE_TEMPLATE/flaky-test.md
@@ -1,8 +1,8 @@
 ---
 name: "\U00002744 Flaky Test"
-about: Report a flaky test, make sure to include links to CI runs, a sample failure log, and the name of the test(s)
+about: Report a flaky test, make sure to include links to CI runs, a sample failure log, and the name of the test(s). Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
 title: "[Flaky Test] "
-labels: "test: flaky"
+labels: "needs-triage, test: flaky"
 ---
 
 Thanks for participating in the TVM community! We use https://discuss.tvm.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking. You are always welcomed to post on the forum first :smile_cat:
@@ -16,3 +16,9 @@ These tests were found to be flaky (intermittently failing on `main` or failed i
 ### Jenkins Links
 
 - Please provide link(s) to failed CI runs. If runs are for a PR, explain why your PR did not break the test (e.g. did not touch that part of the codebase)
+
+### Triage
+
+Please refer to the list of label tags linked above to find the relevant tags and add them here in a bullet format (example below).
+
+* needs-triage

From af01526ae2214c3dded4514a025eca2771ff6e24 Mon Sep 17 00:00:00 2001
From: Havisha Panda <97978678+hpanda-naut@users.noreply.github.com>
Date: Tue, 4 Oct 2022 19:37:57 -0400
Subject: [PATCH 300/704] [skip ci] Edits to the Bug & Flaky test Issue
 templates to reduce word count (#12985)

* Update bug-report.md

* [skip ci] Edits to the Bug & Flaky test Issue templates to reduce word count
---
 .github/ISSUE_TEMPLATE/bug-report.md | 2 +-
 .github/ISSUE_TEMPLATE/ci-problem.md | 2 +-
 .github/ISSUE_TEMPLATE/flaky-test.md | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
index 22771333683b..b541eb3a317c 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -1,6 +1,6 @@
 ---
 name: "\U0001F41B Bug report"
-about: To help the developer act on the issues, please include a description of your environment, preferably a minimum script to reproduce the problem. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
+about: Please include a description of your environment, preferably a minimum script to reproduce the problem. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
 title: "[Bug] "
 labels: "needs-triage, type: bug"
 ---
diff --git a/.github/ISSUE_TEMPLATE/ci-problem.md b/.github/ISSUE_TEMPLATE/ci-problem.md
index 0e91c5650e05..73e485fbcac0 100644
--- a/.github/ISSUE_TEMPLATE/ci-problem.md
+++ b/.github/ISSUE_TEMPLATE/ci-problem.md
@@ -2,7 +2,7 @@
 name: "\U0000274C CI Problem"
 about: To help the developers act on these problems, please give us as many details of the CI failure as possible. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
 title: "[CI Problem] "
-labels: "needs-triage, type: ci"
+labels: "needs-triage, type:ci"
 ---
 
 Thanks for participating in the TVM community! We use https://discuss.tvm.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking.  You are always welcomed to post on the forum first :smile_cat:
diff --git a/.github/ISSUE_TEMPLATE/flaky-test.md b/.github/ISSUE_TEMPLATE/flaky-test.md
index 8d8238613523..d20da597d1de 100644
--- a/.github/ISSUE_TEMPLATE/flaky-test.md
+++ b/.github/ISSUE_TEMPLATE/flaky-test.md
@@ -1,6 +1,6 @@
 ---
 name: "\U00002744 Flaky Test"
-about: Report a flaky test, make sure to include links to CI runs, a sample failure log, and the name of the test(s). Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
+about: Report flaky tests, make sure to include link to CI runs, a sample failure log, and the name of the test(s). Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
 title: "[Flaky Test] "
 labels: "needs-triage, test: flaky"
 ---

From a997c23e94bd6e24038be394477b9afa602c5013 Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Wed, 5 Oct 2022 16:15:36 +0530
Subject: [PATCH 301/704] [CODEGEN][OPENCL] Sampler definition should be at
 outermost scope (#12951)

Not all OpenCL compiers happy with inline sample definitions.

Specification: "The image read functions take a sampler argument. The sampler can be passed as an
argument to the kernel using clSetKernelArg, or can be declared in the outermost scope of
kernel functions, or it can be a constant variable of type sampler_t declared in the program source."
---
 src/target/source/codegen_opencl.cc | 13 ++++++++++++-
 src/target/source/codegen_opencl.h  |  1 +
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc
index 73a064bc80f5..cd898043eeb5 100644
--- a/src/target/source/codegen_opencl.cc
+++ b/src/target/source/codegen_opencl.cc
@@ -89,6 +89,17 @@ void CodeGenOpenCL::InitFuncState(const PrimFunc& f) {
 
 void CodeGenOpenCL::PrintFuncPrefix() { stream << "__kernel void"; }
 
+void CodeGenOpenCL::PreFunctionBody(const PrimFunc& f) {
+  for (Var arg : f->params) {
+    auto ptr_type = arg->type_annotation.as<PointerTypeNode>();
+    if (ptr_type && runtime::IsTextureStorage(std::string(ptr_type->storage_scope))) {
+      this->stream << "  const sampler_t image_sampler = "
+                      "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n";
+      return;
+    }
+  }
+}
+
 std::string CodeGenOpenCL::Finish() {
   // inject extension enable pragma for fp16 and fp64
   if (enable_fp16_) {
@@ -433,7 +444,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
     }
     this->PrintExpr(op->args[0], ss);
     ss << ", ";
-    ss << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, ";
+    ss << "image_sampler, ";
     ss << "((int2)(";
     this->PrintExpr(op->args[1], ss);
     ss << ", ";
diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h
index a7f4483ee2a9..af6de1531017 100644
--- a/src/target/source/codegen_opencl.h
+++ b/src/target/source/codegen_opencl.h
@@ -42,6 +42,7 @@ class CodeGenOpenCL final : public CodeGenC {
   // override print thread tag.
   void InitFuncState(const PrimFunc& f) final;
   void PrintFuncPrefix() final;                                              // NOLINT(*)
+  void PreFunctionBody(const PrimFunc& f) final;                             // NOLINT(*)
   void BindThreadIndex(const IterVar& iv) final;                             // NOLINT(*)
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  // NOLINT(*)
   void PrintStorageSync(const CallNode* op) final;                           // NOLINT(*)

From 9618e6a457ee82830a8fce26d5a46c3444c93120 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 5 Oct 2022 21:46:21 +0900
Subject: [PATCH 302/704] [TIR] Use IndexMap to transform NDArray (#12949)

I've hit a weird use case where I want to manually transform `runtime::NDArray` (attached to `AllocateConst` node) according to the index map used in `transform_layout`. This is needed to support `AllocateConst` node in Metaschedule `RewriteLayout` postproc.

I can define it as a free function in the file where it is actually used. Having it available as part of the `IndexMap` interface makes it convenient to expose this to python and unit-test it. Let me know if this is a reasonable API addition.
---
 include/tvm/tir/index_map.h             |  8 ++++
 python/tvm/tir/function.py              | 16 +++++++
 src/tir/ir/index_map.cc                 | 57 ++++++++++++++++++++++++
 tests/python/unittest/test_index_map.py | 59 +++++++++++++++++++++++++
 4 files changed, 140 insertions(+)

diff --git a/include/tvm/tir/index_map.h b/include/tvm/tir/index_map.h
index e1b323462cda..35a74d294fbb 100644
--- a/include/tvm/tir/index_map.h
+++ b/include/tvm/tir/index_map.h
@@ -136,6 +136,14 @@ class IndexMapNode : public Object {
    */
   Array<PrimExpr> MapShape(const Array<PrimExpr>& shape, arith::Analyzer* analyzer = nullptr) const;
 
+  /* \brief Map an NDArray according to this index map
+   *
+   * \param arr_src The NDArray whose layout is transformed by this index map.
+   *
+   * \returns The transformed NDArray.
+   */
+  runtime::NDArray MapNDArray(runtime::NDArray arr_src) const;
+
   /*!
    * \brief Convert to string representation in Python.
    * \return The stringified lambda expression in Python.
diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py
index dd684bc4f1ae..4628ae36265f 100644
--- a/python/tvm/tir/function.py
+++ b/python/tvm/tir/function.py
@@ -28,6 +28,7 @@
 from .buffer import Buffer
 from .expr import Var, PrimExpr
 from . import _ffi_api
+from ..runtime.ndarray import NDArray
 
 
 @tvm._ffi.register_object("tir.PrimFunc")
@@ -515,6 +516,21 @@ def map_shape(self, shape: List[PrimExpr]) -> List[PrimExpr]:
         """
         return _ffi_api.IndexMapMapShape(self, shape)
 
+    def map_ndarray(self, arr_src: NDArray) -> NDArray:
+        """Apply thie index map to transform the layout of the input NDArray
+
+        Parameters
+        ----------
+        arr_src : runtime.NDArray
+            The NDArray to be transformed
+
+        Returns
+        -------
+        arr_dst : runtime.NDArray
+            The transformed NDArray
+        """
+        return _ffi_api.IndexMapMapNDArray(self, arr_src)
+
     def inverse(self, shape: List[Union[Range, PrimExpr]]) -> "IndexMap":
         """Return the inverse of the map
 
diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc
index 2c5349ab9941..6d982b510a26 100644
--- a/src/tir/ir/index_map.cc
+++ b/src/tir/ir/index_map.cc
@@ -208,6 +208,60 @@ Array<PrimExpr> IndexMapNode::MapShape(const Array<PrimExpr>& shape,
   return output;
 }
 
+runtime::NDArray IndexMapNode::MapNDArray(runtime::NDArray arr_src) const {
+  auto shape = arr_src.Shape();
+  ICHECK(shape.size() == initial_indices.size())
+      << "The rank of the input array should be " << initial_indices.size() << " but got "
+      << shape.size();
+  size_t size_1d = 1;
+  Array<PrimExpr> orig_shape;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    size_1d *= shape[i];
+    orig_shape.push_back(PrimExpr(static_cast<int>((shape[i]))));
+  }
+  auto dst_shape = MapShape(orig_shape);
+
+  std::vector<int64_t> dst_shape_int;
+  for (size_t i = 0; i < dst_shape.size(); ++i) {
+    dst_shape_int.push_back(dst_shape[i].as<IntImmNode>()->value);
+  }
+
+  auto elem_bytes = (arr_src->dtype.bits / 8) * arr_src->dtype.lanes;
+  std::vector<uint8_t> bytes_src(size_1d * elem_bytes);
+  arr_src.CopyToBytes(bytes_src.data(), bytes_src.size());
+
+  std::vector<uint8_t> bytes_dst(bytes_src.size());
+
+  for (size_t i = 0; i < size_1d; ++i) {
+    // Convert a linear coordinate to an N-d coordinate tuple
+    // z * height * width + y * width + x -> (z, y, x)
+    Array<PrimExpr> src_indices;
+    auto div_factor = size_1d;
+    auto src_linear_index = i;
+    for (auto s : shape) {
+      div_factor /= s;
+      src_indices.push_back(PrimExpr(static_cast<int>((src_linear_index / div_factor))));
+      src_linear_index %= div_factor;
+    }
+    auto dst_indices = MapIndices(src_indices);
+
+    // Convert an N-d coordinate to a linear coordinate
+    // (z, y, x) -> z * height * width + y * width + x
+    size_t dst_linear_index = 0;
+    auto mul_factor = size_1d;
+    for (size_t j = 0; j < dst_indices.size(); ++j) {
+      mul_factor /= dst_shape_int[j];
+      dst_linear_index += dst_indices[j].as<IntImmNode>()->value * mul_factor;
+    }
+    std::copy(bytes_src.begin() + i * elem_bytes, bytes_src.begin() + (i + 1) * elem_bytes,
+              bytes_dst.begin() + dst_linear_index * elem_bytes);
+  }
+
+  auto arr_dst = runtime::NDArray::Empty(dst_shape_int, arr_src->dtype, arr_src->device);
+  arr_dst.CopyFromBytes(bytes_dst.data(), bytes_dst.size());
+  return arr_dst;
+}
+
 /*!
  * \brief Auxilarry function to comvert an index map to lambda expression in Python.
  * \param initial_indices The initial indices in the index map.
@@ -289,6 +343,9 @@ TVM_REGISTER_GLOBAL("tir.IndexMapMapShape").set_body_typed([](IndexMap map, Arra
 });
 TVM_REGISTER_GLOBAL("tir.IndexMapInverse").set_body_method(&IndexMap::Inverse);
 
+TVM_REGISTER_GLOBAL("tir.IndexMapMapNDArray")
+    .set_body_typed([](IndexMap map, runtime::NDArray arr) { return map->MapNDArray(arr); });
+
 TVM_REGISTER_GLOBAL("tir.IndexMapNonSurjectiveInverse")
     .set_body_typed([](IndexMap forward, Array<Range> initial_ranges) {
       auto result = forward.NonSurjectiveInverse(initial_ranges);
diff --git a/tests/python/unittest/test_index_map.py b/tests/python/unittest/test_index_map.py
index a86880b0f4a8..804d04d0b052 100644
--- a/tests/python/unittest/test_index_map.py
+++ b/tests/python/unittest/test_index_map.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import numpy as np
 
 import pytest
 import tvm
@@ -202,5 +203,63 @@ def expected_inverse(i0, i1, i2, i3):
     assert expected_map.is_equivalent_to(inverse_map)
 
 
+def test_map_ndarray():
+    index_map = IndexMap.from_func(lambda i: [i // 4, i % 4])
+
+    inp = np.arange(16).astype("int8")
+
+    out = index_map.map_ndarray(tvm.nd.array(inp)).numpy()
+
+    ref = np.zeros(out.shape).astype("int8")
+
+    for i in range(16):
+        ref[i // 4, i % 4] = inp[i]
+
+    np.testing.assert_equal(ref, out)
+
+    index_map = IndexMap.from_func(lambda i0, i1, i2, i3: (i3, i0, i1, i2))
+
+    inp = np.random.randn(10, 10, 10, 10).astype("float16")
+
+    out = index_map.map_ndarray(tvm.nd.array(inp)).numpy()
+
+    ref = np.transpose(inp, (3, 0, 1, 2))
+
+    np.testing.assert_equal(ref, out)
+
+    index_map = IndexMap.from_func(
+        lambda i0, i1, i2, i3: (
+            floordiv(i3, 32),
+            i0,
+            floordiv(i2, 8),
+            floordiv(floormod(i3, 32), 16),
+            i1,
+            floormod(i2, 8),
+            floormod(i3, 16),
+        )
+    )
+
+    kH = kW = 3
+    I = 64
+    O = 64
+    inp = np.random.randn(kH, kW, I, O).astype("float32")
+    arr = tvm.nd.array(inp)
+    out = index_map.map_ndarray(arr).numpy()
+
+    ref = np.zeros(out.shape).astype("float32")
+
+    for i0 in range(kH):
+        for i1 in range(kW):
+            for i2 in range(I):
+                for i3 in range(O):
+                    v = inp[i0, i1, i2, i3]
+                    ref[i3 // 32, i0, i2 // 8, (i3 % 32) // 16, i1, i2 % 8, i3 % 16] = v
+
+    np.testing.assert_equal(ref, out)
+
+    inverse_map = index_map.inverse(inp.shape)
+    np.testing.assert_equal(inverse_map.map_ndarray(index_map.map_ndarray(arr)).numpy(), inp)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 393d5f7fd20acf18e7e38be3903ce45756a2cd13 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Wed, 5 Oct 2022 10:05:14 -0700
Subject: [PATCH 303/704] [FIX,PROFILER] Fix PAPI test and test in CI (#12953)

Fix papi test. Can't test in CI because performance counters aren't available on cloud machines.
---
 tests/python/unittest/test_runtime_profiling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py
index ed1841b21616..7afcc5250d6f 100644
--- a/tests/python/unittest/test_runtime_profiling.py
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -127,7 +127,7 @@ def test_papi(target, dev):
 
     data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
     report = vm.profile(
-        [data],
+        data,
         func_name="main",
         collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})],
     )

From 2e257f037681766f0bf31f40a62b81691bbcbc8e Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 5 Oct 2022 13:18:08 -0700
Subject: [PATCH 304/704] [Hexagon] Do not use `target` test fixture in Hexagon
 tests (#12981)

* remove target from test functions

* refactor target_hexagon

* refactor target

* fix permission

* cleanup

* fix target

* remove target fixture from test_2d_physical_buffers

* fix target fixture in test_hexagon/conv2d tests

* address comments
---
 .../contrib/test_hexagon/infrastructure.py    |  8 ++++++-
 .../test_hexagon/test_2d_physical_buffers.py  | 16 +++----------
 .../contrib/test_hexagon/test_autotvm.py      | 11 +++++----
 .../test_benchmark_elemwise_add.py            |  8 +++----
 .../test_hexagon/test_benchmark_maxpool2d.py  | 12 +++-------
 .../test_hexagon/test_cache_read_write.py     |  5 ++--
 .../test_hexagon/test_fixed_point_multiply.py |  8 +++----
 .../contrib/test_hexagon/test_launcher.py     | 17 ++++++--------
 .../contrib/test_hexagon/test_memory_alloc.py | 11 +++------
 .../test_hexagon/test_meta_schedule.py        | 23 ++++++-------------
 .../contrib/test_hexagon/test_models.py       |  5 ++--
 .../contrib/test_hexagon/test_parallel_hvx.py | 11 ++++-----
 .../test_parallel_hvx_load_vtcm.py            | 16 +++++--------
 .../test_hexagon/test_parallel_scalar.py      | 11 ++++-----
 .../contrib/test_hexagon/test_sigmoid.py      |  8 ++-----
 .../test_software_pipeline_async.py           |  7 +++---
 .../contrib/test_hexagon/test_thread_pool.py  | 10 ++++----
 .../test_hexagon/test_vtcm_bandwidth.py       | 12 ++++------
 .../topi/test_add_subtract_multiply.py        | 10 +++++---
 .../test_hexagon/topi/test_argmax_slice.py    |  6 ++---
 .../topi/test_avg_pool2d_slice.py             | 12 ++++++----
 .../test_hexagon/topi/test_batch_matmul.py    | 11 ++++-----
 .../test_hexagon/topi/test_cast_slice.py      | 14 +++++------
 .../contrib/test_hexagon/topi/test_clip.py    |  9 +++-----
 .../topi/test_conv2d_fp16_intrin.py           |  7 +++---
 .../test_hexagon/topi/test_conv2d_nchw.py     |  6 ++---
 .../test_hexagon/topi/test_conv2d_nhwc.py     | 10 ++++----
 .../test_hexagon/topi/test_conv2d_slice.py    |  8 ++-----
 .../topi/test_conv2d_transpose.py             | 11 ++++-----
 .../contrib/test_hexagon/topi/test_dense.py   |  9 ++++----
 .../test_hexagon/topi/test_depth_to_space.py  |  7 ++----
 .../topi/test_depthwise_conv2d.py             |  7 +++---
 .../topi/test_dequantize_slice.py             | 11 +++++----
 .../test_hexagon/topi/test_dwconv2d_slice.py  |  8 ++-----
 .../topi/test_max_pool2d_slice.py             |  9 ++------
 .../contrib/test_hexagon/topi/test_pad.py     |  7 +++---
 .../contrib/test_hexagon/topi/test_pooling.py | 12 +++++-----
 .../test_hexagon/topi/test_quantize.py        | 11 +++++----
 .../contrib/test_hexagon/topi/test_reduce.py  |  8 +++----
 .../test_hexagon/topi/test_relu_slice.py      |  9 ++------
 .../contrib/test_hexagon/topi/test_reshape.py | 10 +++-----
 .../test_hexagon/topi/test_resize2d.py        |  5 ++--
 .../contrib/test_hexagon/topi/test_softmax.py |  9 ++++----
 .../test_hexagon/topi/test_softmax_slice.py   |  3 ++-
 .../test_hexagon/topi/test_tanh_slice.py      |  6 ++---
 45 files changed, 184 insertions(+), 250 deletions(-)
 mode change 100755 => 100644 tests/python/contrib/test_hexagon/test_2d_physical_buffers.py

diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
index 1058e1dd8117..6f7e1904da2f 100644
--- a/tests/python/contrib/test_hexagon/infrastructure.py
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -128,7 +128,7 @@ def get_packed_filter_shape(logical_shape_oihw):
     return physical_shape_oihw8i32o4i
 
 
-def build_and_run(inputs, func, target, target_host, *args, **kwargs):
+def build_and_run(inputs, func, target: str, target_host: str, *args, **kwargs):
     """build and run the function func"""
     schedule, placeholders, binds = func(*args, **kwargs)
 
@@ -351,3 +351,9 @@ def quantize_np(arr_np: numpy.ndarray, dtype: str):
     zero_point = numpy.rint((fmax * qmin - fmin * qmax) / (fmax - fmin)).astype("int32")
     quant_np = (arr_np / scale + zero_point).astype(dtype)
     return quant_np, scale, zero_point
+
+
+def get_hexagon_target(cpu_ver: str) -> tvm.target.Target:
+    """Creates a Hexagon target"""
+    target = tvm.target.hexagon(cpu_ver)
+    return tvm.target.Target(target, host=target)
diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
old mode 100755
new mode 100644
index cba6ddc4433a..7804ae2e4898
--- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
+++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
@@ -32,7 +32,7 @@
 from tvm.contrib.hexagon.pytest_plugin import requires_hexagon_toolchain
 from tvm.tir.stmt_functor import post_order_visit
 
-from .infrastructure import allocate_hexagon_array
+from .infrastructure import allocate_hexagon_array, get_hexagon_target
 
 # Disabling invalid name as pylint assumes global variables as constants and
 # expects them to be all upper-case. Since these are used as
@@ -79,19 +79,9 @@
 
 
 @tvm.testing.fixture
-def target_host(target):
+def target_host():
     """Return tvm target.Target with host attached"""
-    target = tvm.target.Target(target)
-
-    if target.kind.name == "hexagon":
-        # Shouldn't have to modify the target here, current
-        # workaround.  In the future, should move the parameter
-        # handling from tvm.target to target_kind.cc.
-        target = tvm.target.hexagon("v68", link_params=True)
-        host = target
-    else:
-        host = None
-    return tvm.target.Target(target, host=host)
+    return get_hexagon_target("v68")
 
 
 # Disabling redefined-outer-name for the whole file as there isn't any easy
diff --git a/tests/python/contrib/test_hexagon/test_autotvm.py b/tests/python/contrib/test_hexagon/test_autotvm.py
index 513d5bdbab7a..da60e20c3bf4 100644
--- a/tests/python/contrib/test_hexagon/test_autotvm.py
+++ b/tests/python/contrib/test_hexagon/test_autotvm.py
@@ -19,8 +19,6 @@
 
 import contextlib
 import os
-import sys
-
 import pytest
 
 import tvm
@@ -28,6 +26,8 @@
 from tvm import autotvm, te
 from tvm.autotvm.tuner import GATuner, XGBTuner
 
+from .infrastructure import get_hexagon_target
+
 
 @autotvm.template("demo_template")
 def demo_template():
@@ -143,12 +143,13 @@ def test_autotvm(hexagon_session):
             ),
         ),
     }
-    target_hexagon = tvm.target.hexagon("v68")
     task = autotvm.task.create(
-        "demo_template", args=[], target=target_hexagon, target_host=target_hexagon
+        "demo_template",
+        args=[],
+        target=get_hexagon_target("v68"),
     )
     tune_tasks([task], **options)
 
 
 if __name__ == "__main__":
-    sys.exit(pytest.main(sys.argv))
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py b/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
index b15219ebc00e..3dcb9a880e00 100644
--- a/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
+++ b/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
@@ -23,22 +23,20 @@
 
 import numpy as np
 import pytest
+
 import tvm.script
 import tvm.testing
 from tvm.contrib.hexagon.build import HexagonLauncherRPC
 from tvm.script import tir as T
 
 from . import benchmark_util as bu
+from .infrastructure import get_hexagon_target
 
 _SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_bencharks_flag_and_reason()
 
 # This is a fixed detail of the v68 architecture.
 HVX_VECTOR_BYTES = 128
 
-_HEXAGON_TARGET = tvm.target.hexagon("v69", link_params=True)
-
-_SUPER_TARGET = tvm.target.Target(_HEXAGON_TARGET, host=_HEXAGON_TARGET)
-
 # NOTE on server ports:
 # These tests use different port numbers for the RPC server (7070 + ...).
 # The reason is that an RPC session cannot be gracefully closed without
@@ -219,7 +217,7 @@ def _benchmark_hexagon_elementwise_add_kernel(
                     input2,
                     output,
                 ],
-                _SUPER_TARGET,
+                get_hexagon_target("v69"),
                 name=_PRIMFUNC_NAME,
             )
 
diff --git a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
index 41169494417a..b2de2c7e95f0 100644
--- a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
+++ b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
@@ -42,7 +42,6 @@
       primfuncs and demonstrate more coding strategies.
 """
 
-import sys
 import pytest
 import numpy as np
 import copy
@@ -51,15 +50,11 @@
 import tvm.testing
 from tvm import te, topi, tir
 from tvm.topi import testing
-from tvm.script import tir as T
-from tvm.tir import IndexMap
-from tvm.relay.backend import Executor, Runtime
 from tvm.contrib.hexagon.session import Session
 from typing import List
 
-from .infrastructure import allocate_hexagon_array
+from .infrastructure import allocate_hexagon_array, get_hexagon_target
 from . import benchmark_util as bu
-from .benchmark_util import benchmark_group
 
 _SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_bencharks_flag_and_reason()
 
@@ -246,10 +241,9 @@ def test_maxpool2d_nhwc(
                     block="tensor", buffer="placeholder", index_map=_int8_nhwc_8h8w32c_map
                 )
 
-                target_hexagon = tvm.target.hexagon("v69", link_params=True)
-                # func = tvm.build(sch.mod, target=tvm.target.Target(target_hexagon, host=target_hexagon))
                 built_module = tvm.build(
-                    sch.mod, target=tvm.target.Target(target_hexagon, host=target_hexagon)
+                    sch.mod,
+                    target=get_hexagon_target("v69"),
                 )
 
                 # Save a local copy of the Hexagon object code (in the form of a .so file)
diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py
index 896db8b59c5c..af5e7a398870 100644
--- a/tests/python/contrib/test_hexagon/test_cache_read_write.py
+++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py
@@ -24,6 +24,8 @@
 from tvm.contrib.hexagon.session import Session
 from tvm.script import tir as T
 
+from .infrastructure import get_hexagon_target
+
 
 def intrin_mem_copy(shape, dtype, dst_scope, src_scope):
     """Define and return tensor intrinsic for mem copy"""
@@ -76,11 +78,10 @@ def verify(hexagon_session: Session, schedule, x_tensor, y_tensor, z_tensor, siz
     """Verify correctness with reference from numpy"""
     print(tvm.lower(schedule, [x_tensor, y_tensor, z_tensor]))
 
-    target_hexagon = tvm.target.hexagon("v68", link_params=True)
     func = tvm.build(
         schedule,
         [x_tensor, y_tensor, z_tensor],
-        tvm.target.Target(target_hexagon, host=target_hexagon),
+        get_hexagon_target("v68"),
         name="dmacpy",
     )
 
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
index 8ee04a649990..ee03599ff1f4 100644
--- a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
@@ -14,14 +14,15 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import re
+import numpy as np
 
 import tvm.testing
 from tvm import relay
 from tvm.relay.backend import Executor
 from tvm.contrib.hexagon.session import Session
 
-import re
-import numpy as np
+from .infrastructure import get_hexagon_target
 
 
 @tvm.testing.requires_hexagon
@@ -37,13 +38,12 @@ def test_vmpy_intrinsic_presence():
     relay_mod = tvm.IRModule.from_expr(y)
 
     params = {}
-    target_hexagon = tvm.target.hexagon("v68")
     executor = Executor("graph", {"link-params": True})
 
     with tvm.transform.PassContext(opt_level=3):
         hexagon_lowered = tvm.relay.build(
             relay_mod,
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            get_hexagon_target("v68"),
             executor=executor,
             params=params,
         )
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 7431871524aa..565999c32957 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -25,6 +25,8 @@
 from tvm.contrib.hexagon.session import Session
 from tvm.relay.backend import Executor, Runtime
 
+from .infrastructure import get_hexagon_target
+
 
 @tvm.testing.requires_hexagon
 def test_add(hexagon_session: Session):
@@ -37,11 +39,10 @@ def test_add(hexagon_session: Session):
     )
     sched = tvm.te.create_schedule(compute_c.op)
 
-    target_hexagon = tvm.target.hexagon("v68", link_params=True)
     func = tvm.build(
         sched,
         [placeholder_a, placeholder_b, compute_c],
-        tvm.target.Target(target_hexagon, host=target_hexagon),
+        get_hexagon_target("v68"),
         name="add",
     )
 
@@ -68,11 +69,10 @@ def test_add_vtcm(hexagon_session: Session):
     )
     sched = tvm.te.create_schedule(compute_c.op)
 
-    target_hexagon = tvm.target.hexagon("v68", link_params=True)
     func = tvm.build(
         sched,
         [placeholder_a, placeholder_b, compute_c],
-        tvm.target.Target(target_hexagon, host=target_hexagon),
+        get_hexagon_target("v68"),
         name="add",
     )
 
@@ -117,11 +117,10 @@ def test_matmul(self, hexagon_session, size_m, size_n, size_k):
         )
         schedule = te.create_schedule(compute_z.op)
 
-        target_hexagon = tvm.target.hexagon("v68", link_params=True)
         func = tvm.build(
             schedule,
             [placeholder_x, placeholder_y, compute_z],
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            get_hexagon_target("v68"),
         )
 
         mod = hexagon_session.load_module(func)
@@ -173,7 +172,6 @@ def test_graph_executor(hexagon_session: Session):
     relay_mod = tvm.IRModule.from_expr(f)
     relay_mod = relay.transform.InferType()(relay_mod)
 
-    target_hexagon = tvm.target.hexagon("v68")
     runtime = Runtime("cpp")
     executor = Executor("graph")
 
@@ -185,7 +183,7 @@ def test_graph_executor(hexagon_session: Session):
     with tvm.transform.PassContext(opt_level=3):
         lowered = tvm.relay.build(
             relay_mod,
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            get_hexagon_target("v68"),
             runtime=runtime,
             executor=executor,
         )
@@ -243,14 +241,13 @@ def test_graph_executor_multiple_conv2d(hexagon_session: Session):
     relay_mod = tvm.IRModule.from_expr(f)
     relay_mod = relay.transform.InferType()(relay_mod)
 
-    target_hexagon = tvm.target.hexagon("v68")
     runtime = Runtime("cpp")
     executor = Executor("graph")
 
     with tvm.transform.PassContext(opt_level=3):
         lowered = tvm.relay.build(
             relay_mod,
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            get_hexagon_target("v68"),
             runtime=runtime,
             executor=executor,
         )
diff --git a/tests/python/contrib/test_hexagon/test_memory_alloc.py b/tests/python/contrib/test_hexagon/test_memory_alloc.py
index fd948ea524f2..a6d011eddd5a 100644
--- a/tests/python/contrib/test_hexagon/test_memory_alloc.py
+++ b/tests/python/contrib/test_hexagon/test_memory_alloc.py
@@ -15,20 +15,14 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
 import os.path
-import sys
-import tempfile
 
 import numpy as np
-import pytest
 
 import tvm
 from tvm.script import tir as T
 
-from .infrastructure import allocate_hexagon_array
-
-_HEXAGON_TARGET = tvm.target.hexagon("v69", link_params=True)
+from .infrastructure import allocate_hexagon_array, get_hexagon_target
 
 
 @tvm.testing.fixture
@@ -63,7 +57,8 @@ def test_global_axis_separator(
         self, hexagon_session, generated_func, shape, dtype, scope, axis_separators
     ):
         mod1 = tvm.build(
-            generated_func, target=tvm.target.Target(_HEXAGON_TARGET, host=_HEXAGON_TARGET)
+            generated_func,
+            target=get_hexagon_target("v69"),
         )
         mod2 = hexagon_session.load_module(mod1)
 
diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py
index 74f3ab673ec8..8b07122c2a17 100644
--- a/tests/python/contrib/test_hexagon/test_meta_schedule.py
+++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py
@@ -33,8 +33,8 @@
 from tvm.meta_schedule.runner import RunnerInput
 from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
 from tvm.relay.backend import Executor
-from tvm.topi.utils import get_const_tuple
-from tvm.meta_schedule.testing import te_workload
+
+from .infrastructure import get_hexagon_target
 
 MATMUL_N = 16
 MATMUL_M = 32
@@ -61,14 +61,12 @@ def test_builder_runner(hexagon_launcher):
     if hexagon_launcher._serial_number == "simulator":
         pytest.skip(msg="Tuning on simulator not supported.")
 
-    target_hexagon = tvm.target.hexagon("v68", link_params=True)
-    target = tvm.target.Target(target_hexagon, host=target_hexagon)
     mod = MatmulModule
 
     builder = get_hexagon_local_builder()
     runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0)
 
-    (builder_result,) = builder.build([BuilderInput(mod, target)])
+    (builder_result,) = builder.build([BuilderInput(mod, get_hexagon_target("v68"))])
     assert builder_result.artifact_path is not None
     assert builder_result.error_msg is None
 
@@ -177,8 +175,6 @@ def test_vrmpy_dense(hexagon_launcher):
         pytest.skip(msg="Tuning on simulator not supported.")
 
     do_tune = True
-    target_hexagon = tvm.target.hexagon("v68")
-    target = tvm.target.Target(target_hexagon, host=target_hexagon)
 
     M, N, K = 128, 768, 768
     workload = te.create_prim_func(dense(M, N, K))
@@ -212,7 +208,7 @@ def schedule_dense_for_tune(sch):
             )
 
     with hexagon_launcher.start_session() as session:
-        verify_dense(sch, target, M, N, K, session)
+        verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
 
 
 # This is an example of a schedule found by vrmpy auto tensorization.
@@ -274,9 +270,6 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
     if hexagon_launcher._serial_number == "simulator":
         pytest.skip(msg="Tuning on simulator not supported.")
 
-    target_hexagon = tvm.target.hexagon("v68")
-    target = tvm.target.Target(target_hexagon, host=target_hexagon)
-
     M, N, K = 128, 768, 768
     workload = te.create_prim_func(dense(M, N, K))
 
@@ -319,7 +312,7 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
 
             sch = ms.tune_tir(
                 mod=workload,
-                target=target,
+                target=get_hexagon_target("v68"),
                 config=config,
                 work_dir=work_dir,
                 sch_rules=lambda: sch_rules,
@@ -331,7 +324,7 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
         sch = tvm.tir.Schedule(Module_vrmpy_auto_tensorize, debug_mask="all")
 
     with hexagon_launcher.start_session() as session:
-        verify_dense(sch, target, M, N, K, session)
+        verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
 
 
 @tvm.testing.requires_hexagon
@@ -339,8 +332,6 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher):
     if hexagon_launcher._serial_number == "simulator":
         pytest.skip(msg="Tuning on simulator not supported.")
 
-    target_hexagon = tvm.target.hexagon("v69")
-    target = tvm.target.Target(target_hexagon, host=target_hexagon)
     I, O, H, W = 64, 64, 56, 56
     kH = kW = 3
 
@@ -400,7 +391,7 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher):
         lib = ms.tune_relay(
             mod=mod,
             params=params,
-            target=target,
+            target=get_hexagon_target("v69"),
             config=config,
             work_dir=work_dir,
             builder=get_hexagon_local_builder(),
diff --git a/tests/python/contrib/test_hexagon/test_models.py b/tests/python/contrib/test_hexagon/test_models.py
index 78d7116d0853..95e5191a8619 100644
--- a/tests/python/contrib/test_hexagon/test_models.py
+++ b/tests/python/contrib/test_hexagon/test_models.py
@@ -25,6 +25,8 @@
 from tvm.contrib.hexagon.session import Session
 from tvm.relay.backend import Executor, Runtime
 
+from .infrastructure import get_hexagon_target
+
 
 def get_mobilenet():
     """Download and import mobilenet model with ONNX"""
@@ -43,7 +45,6 @@ def test_mobilenet(hexagon_session: Session):
     dtype = "float32"
     onnx_model = get_mobilenet()
 
-    target_hexagon = tvm.target.hexagon("v68")
     target_llvm = tvm.target.Target("llvm")
     runtime = Runtime("cpp")
     executor = Executor("graph", {"link-params": True})
@@ -58,7 +59,7 @@ def test_mobilenet(hexagon_session: Session):
     with tvm.transform.PassContext(opt_level=3):
         hexagon_lowered = tvm.relay.build(
             relay_mod,
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            get_hexagon_target("v68"),
             runtime=runtime,
             executor=executor,
             params=params,
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx.py b/tests/python/contrib/test_hexagon/test_parallel_hvx.py
index a34f5b8e261b..6ebe03d4e6b1 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx.py
@@ -19,10 +19,12 @@
 Test parallelizing HVX workloads and compare them to single thread examples. 
 """
 import numpy as np
-import tvm
+from numpy.random import default_rng
 
+import tvm
 from tvm.script import tir as T
-from numpy.random import default_rng
+
+from .infrastructure import get_hexagon_target
 
 TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n    -Single Thread: {} ms \n    -Parallel: {} ms\n    -Speedup: {}x\n"
 
@@ -132,10 +134,7 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
 def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch):
     a_shape, a_dtype, b_shape, b_dtype, c_shape, c_dtype = shape_dtypes
 
-    target_hexagon = tvm.target.hexagon("v68")
-    func_tir = tvm.build(
-        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
-    )
+    func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v68"))
     module = hexagon_session.load_module(func_tir)
 
     rng = default_rng()
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
index 5dcb4b18b845..6e43298a4eb5 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -18,10 +18,12 @@
 """ Test different strategies for loading data into vtcm before running HVX workloads. """
 
 import numpy as np
-import tvm
+from numpy.random import default_rng
 
+import tvm
 from tvm.script import tir as T
-from numpy.random import default_rng
+
+from .infrastructure import get_hexagon_target
 
 TEST_OUTPUT_TEMPLATE = "Test with {} MB of data to load... \n    -No VTCM: {} Gops \n    -Basic VTCM: {} Gops \n    -Vectorized: {} Gops\n    -Vectorized and Parallelized: {} Gops\n    -Preallocated and Vectorized: {} Gops\n    -Preallocated, Vectorized, and Parallelized: {} Gops\n    -Single DMA: {} Gops\n    -Preloaded: {} Gops\n"
 
@@ -299,10 +301,7 @@ def evaluate_result(operations, tag, time, result, expected_output):
 
 
 def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global"):
-    target_hexagon = tvm.target.hexagon("v69")
-    func_tir = tvm.build(
-        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
-    )
+    func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69"))
     module = hexagon_session.load_module(func_tir)
 
     a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope=mem_scope)
@@ -322,10 +321,7 @@ def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global")
 
 
 def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations):
-    target_hexagon = tvm.target.hexagon("v69")
-    func_tir = tvm.build(
-        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
-    )
+    func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69"))
     module = hexagon_session.load_module(func_tir)
 
     a_vtcm = np.zeros((a.size), dtype="uint8")
diff --git a/tests/python/contrib/test_hexagon/test_parallel_scalar.py b/tests/python/contrib/test_hexagon/test_parallel_scalar.py
index b3d07ae978ba..fd3eef1b195b 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_scalar.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_scalar.py
@@ -18,10 +18,12 @@
 """ Test parallelism for multiple different scalar workloads. """
 
 import numpy as np
-import tvm
+from numpy.random import default_rng
 
+import tvm
 from tvm.script import tir as T
-from numpy.random import default_rng
+
+from .infrastructure import get_hexagon_target
 
 TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n    -Single Thread: {} ms \n    -Parallel: {} ms\n    -Speedup: {}x\n"
 
@@ -75,10 +77,7 @@ def evaluate(hexagon_session, operations, expected, sch):
     shape = operations
     dtype = "float64"
 
-    target_hexagon = tvm.target.hexagon("v68")
-    func_tir = tvm.build(
-        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
-    )
+    func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v68"))
     module = hexagon_session.load_module(func_tir)
 
     rng = default_rng()
diff --git a/tests/python/contrib/test_hexagon/test_sigmoid.py b/tests/python/contrib/test_hexagon/test_sigmoid.py
index 9aad35ee76c1..1ff5bf3db340 100644
--- a/tests/python/contrib/test_hexagon/test_sigmoid.py
+++ b/tests/python/contrib/test_hexagon/test_sigmoid.py
@@ -25,7 +25,7 @@
 from tvm import topi
 from tvm.contrib.hexagon.build import HexagonLauncher
 
-from .infrastructure import allocate_hexagon_array, transform_numpy
+from .infrastructure import allocate_hexagon_array, get_hexagon_target
 
 
 def sigmoid_compute(Input):
@@ -71,16 +71,12 @@ def test_sigmoid(
         dtype,
         input_np,
         ref_output_np,
-        target,
         hexagon_session,
     ):
         InputTensor = te.placeholder(in_shape, name="InputTensor", dtype=dtype)
 
         OutputTensor = sigmoid_compute(InputTensor)
 
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
-
         tir_s = sigmoid_stir_schedule(InputTensor, OutputTensor)
 
         input_data = allocate_hexagon_array(
@@ -95,7 +91,7 @@ def test_sigmoid(
 
         func_name = "sigmoid"
         with tvm.transform.PassContext(opt_level=3):
-            runtime_module = tvm.build(tir_s.mod, target=target, name=func_name)
+            runtime_module = tvm.build(tir_s.mod, target=get_hexagon_target("v69"), name=func_name)
 
         assert "hvx_sigmoid" in runtime_module.get_source("asm")
         assert "vmin" in runtime_module.get_source("asm")
diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
index 7a53a1fc9b53..25be8b8e2849 100644
--- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
+++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
@@ -24,6 +24,8 @@
 from tvm.contrib.hexagon.session import Session
 from tvm.script import tir as T
 
+from .infrastructure import get_hexagon_target
+
 outer = tvm.testing.parameter(8, 16)
 inner = tvm.testing.parameter(64, 128)
 dtype = tvm.testing.parameter("uint8", "float16")
@@ -86,11 +88,8 @@ def test_async_software_pipeline(hexagon_launcher, compute, schedule, outer, inn
     b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
     ref = compute[1](a_np)
 
-    target_hexagon = tvm.target.hexagon("v68", link_params=True)
     with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        func = tvm.build(
-            sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
-        )
+        func = tvm.build(sch.mod["main"], target=get_hexagon_target("v68"))
 
     with hexagon_launcher.start_session() as hexagon_session:
         dev = hexagon_session.device
diff --git a/tests/python/contrib/test_hexagon/test_thread_pool.py b/tests/python/contrib/test_hexagon/test_thread_pool.py
index c943dbbb22e5..2fc82cf49984 100644
--- a/tests/python/contrib/test_hexagon/test_thread_pool.py
+++ b/tests/python/contrib/test_hexagon/test_thread_pool.py
@@ -26,6 +26,8 @@
 from tvm.contrib.hexagon.session import Session
 from tvm.script import tir as T
 
+from .infrastructure import get_hexagon_target
+
 
 @tvm.script.ir_module
 class ElemwiseSumIRModule:
@@ -73,9 +75,9 @@ def benchmark_func(mod, name, args, hexagon_session):
 @tvm.testing.requires_hexagon
 def test_speedup(hexagon_session: Session, capsys):
     """Test speedup"""
-    target_hexagon = tvm.target.hexagon("v68", link_params=True)
     func = tvm.build(
-        ElemwiseSumIRModule, target=tvm.target.Target(target_hexagon, host=target_hexagon)
+        ElemwiseSumIRModule,
+        target=get_hexagon_target("v68"),
     )
     mod = hexagon_session.load_module(func)
     args = generate_add_test_data(hexagon_session)
@@ -89,9 +91,9 @@ def test_speedup(hexagon_session: Session, capsys):
 @tvm.testing.requires_hexagon
 def test_elemwise_sum_parallel(hexagon_session: Session):
     """Test parallel elementwise sum"""
-    target_hexagon = tvm.target.hexagon("v68", link_params=True)
     func = tvm.build(
-        ElemwiseSumIRModule, target=tvm.target.Target(target_hexagon, host=target_hexagon)
+        ElemwiseSumIRModule,
+        target=get_hexagon_target("v68"),
     )
     mod = hexagon_session.load_module(func)
 
diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
index 83daf2458737..307d3a96bf15 100644
--- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
+++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
@@ -18,11 +18,12 @@
 """Test theoretical bandwith for data transfers to VTCM for different strategies."""
 
 import numpy as np
-from tests.python.contrib.test_hexagon.infrastructure import allocate_hexagon_array
-import tvm
+from numpy.random import default_rng
 
+import tvm
 from tvm.script import tir as T
-from numpy.random import default_rng
+
+from .infrastructure import get_hexagon_target
 
 MB = 1024**2
 KB = 1024
@@ -81,10 +82,7 @@ def operator(a: T.handle, a_v: T.handle) -> None:
 def evaluate(hexagon_session, sch, size):
     a_shape = size
 
-    target_hexagon = tvm.target.hexagon("v69")
-    func_tir = tvm.build(
-        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
-    )
+    func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69"))
     module = hexagon_session.load_module(func_tir)
 
     rng = default_rng()
diff --git a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
index fe70745143a9..711d725e842f 100755
--- a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
+++ b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
@@ -23,7 +23,12 @@
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.topi.hexagon.qnn as qn
-from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
+from ..infrastructure import (
+    allocate_hexagon_array,
+    transform_numpy,
+    quantize_np,
+    get_hexagon_target,
+)
 
 
 @tvm.testing.fixture
@@ -285,7 +290,6 @@ def test_transform(
         op_name,
     ):
         output_shape = expected_output_np.shape
-        target_hexagon = tvm.target.hexagon("v69")
         A = te.placeholder(input_shape_A, name="A", dtype=dtype)
         B = te.placeholder(input_shape_B, name="B", dtype=dtype)
         if dtype == "float16":
@@ -336,7 +340,7 @@ def test_transform(
             func = tvm.build(
                 sch,
                 [A, B, M],
-                tvm.target.Target(target_hexagon, host=target_hexagon),
+                get_hexagon_target("v69"),
                 name="slice_op_with_transform",
             )
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py
index 32d7a5097384..5ed86a1fcc92 100644
--- a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py
@@ -22,7 +22,7 @@
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.contrib.hexagon
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
 class TestArgMaxSlice:
@@ -79,8 +79,6 @@ def test_argmax_slice(
         working_scope,
     ):
         """Top level testing function for argmax"""
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
         argmax_input = te.placeholder(input_shape, name="A", dtype=dtype)
         output = sl.argmax.argmax_compute(argmax_input, in_axis)
         argmax_func = te.create_prim_func([argmax_input, output])
@@ -101,7 +99,7 @@ def test_argmax_slice(
         with tvm.transform.PassContext(opt_level=3):
             tir_irm = tvm.lower(tir_s.mod, [argmax_input, output], name="argmax")
             runtime_module = tvm.build(
-                tir_irm, [argmax_input, output], target=target, name="argmax"
+                tir_irm, [argmax_input, output], target=get_hexagon_target("v69"), name="argmax"
             )
         mod = hexagon_session.load_module(runtime_module)
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py
index 743519901542..6f6a7d762747 100644
--- a/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py
@@ -21,12 +21,15 @@
 
 from tvm import te
 import tvm.testing
-from tvm.topi import testing
-from tvm.contrib.hexagon.build import HexagonLauncher
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.topi.hexagon.qnn as qn
-from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
+from ..infrastructure import (
+    allocate_hexagon_array,
+    transform_numpy,
+    quantize_np,
+    get_hexagon_target,
+)
 from ..pytest_util import (
     get_multitest_ids,
     create_populated_numpy_ndarray,
@@ -401,13 +404,12 @@ def test_avg_pool2d_slice(
         schedule_args,
         hexagon_session: Session,
     ):
-        target_hexagon = tvm.target.hexagon("v69")
         in_data = transformed_input_np_padded
 
         with tvm.transform.PassContext(opt_level=3):
             func = tvm.build(
                 *schedule_args,
-                tvm.target.Target(target_hexagon, host=target_hexagon),
+                get_hexagon_target("v69"),
                 name="avg_pool2d",
             )
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
index c64477343943..f3273ea8b65b 100644
--- a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
+++ b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
@@ -27,6 +27,7 @@
 from tvm.topi.utils import get_const_tuple
 from tvm.contrib.hexagon.session import Session
 
+from ..infrastructure import get_hexagon_target
 
 dtype = tvm.testing.parameter(
     "float32",
@@ -63,8 +64,7 @@ def get_ref_data():
         # get the test data
         a_np, b_np, c_np = get_ref_data()
 
-        target_hexagon = tvm.target.hexagon("v68")
-        with tvm.target.Target(target_hexagon):
+        with tvm.target.Target(get_hexagon_target("v68")):
             fcompute = topi.nn.batch_matmul
             fschedule = topi.hexagon.schedule_batch_matmul
             out = fcompute(x, y)
@@ -74,7 +74,7 @@ def get_ref_data():
         func = tvm.build(
             s,
             [x, y, out],
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            get_hexagon_target("v68"),
             name="batch_matmul",
         )
         mod = hexagon_session.load_module(func)
@@ -115,8 +115,7 @@ def get_ref_data():
         # get the test data
         a_np, b_np, c_np = get_ref_data()
 
-        target_hexagon = tvm.target.hexagon("v68")
-        with tvm.target.Target(target_hexagon):
+        with tvm.target.Target(get_hexagon_target("v68")):
             fcompute = topi.nn.batch_matmul
             fschedule = topi.hexagon.schedule_batch_matmul
             out = fcompute(x, y)
@@ -125,7 +124,7 @@ def get_ref_data():
         func = tvm.build(
             s,
             [x, y, out],
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            get_hexagon_target("v68"),
             name="batch_matmul_int8",
         )
         mod = hexagon_session.load_module(func)
diff --git a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py b/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
index 1b235a4daf52..326370eb72d7 100644
--- a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
@@ -22,7 +22,7 @@
 import tvm.testing
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
 class TestCastF16F32Slice2d:
@@ -77,8 +77,6 @@ def test_cast_fp16_fp32_slice(
         if hexagon_session._launcher._serial_number != "simulator":
             pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957")
 
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
         cast_input = te.placeholder(input_shape, name="A", dtype=dtype)
         cast_output = sl.cast_f16_f32_compute(cast_input)
         cast_func = te.create_prim_func([cast_input, cast_output])
@@ -98,7 +96,9 @@ def test_cast_fp16_fp32_slice(
         )
         with tvm.transform.PassContext(opt_level=3):
             tir_irm = tvm.lower(tir_s.mod, [cast_input, cast_output], name="cast_f16_f32")
-            runtime_module = tvm.build(tir_irm, target=target, name="cast_f16_f32")
+            runtime_module = tvm.build(
+                tir_irm, target=get_hexagon_target("v69"), name="cast_f16_f32"
+            )
         mod = hexagon_session.load_module(runtime_module)
 
         mod(input_data, output_data)
@@ -163,8 +163,6 @@ def test_cast_fp32_fp16_slice(
         if hexagon_session._launcher._serial_number != "simulator":
             pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957")
 
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
         cast_input = te.placeholder(input_shape, name="A", dtype=dtype)
         cast_output = sl.cast_f32_f16_compute(cast_input)
         cast_func = te.create_prim_func([cast_input, cast_output])
@@ -184,7 +182,9 @@ def test_cast_fp32_fp16_slice(
         )
         with tvm.transform.PassContext(opt_level=3):
             tir_irm = tvm.lower(tir_s.mod, [cast_input, cast_output], name="cast_f32_f16")
-            runtime_module = tvm.build(tir_irm, target=target, name="cast_f32_f16")
+            runtime_module = tvm.build(
+                tir_irm, target=get_hexagon_target("v69"), name="cast_f32_f16"
+            )
         mod = hexagon_session.load_module(runtime_module)
 
         mod(input_data, output_data)
diff --git a/tests/python/contrib/test_hexagon/topi/test_clip.py b/tests/python/contrib/test_hexagon/topi/test_clip.py
index ac6890171dba..3f8f5077c758 100755
--- a/tests/python/contrib/test_hexagon/topi/test_clip.py
+++ b/tests/python/contrib/test_hexagon/topi/test_clip.py
@@ -19,13 +19,11 @@
 
 import numpy as np
 
-from tvm import te, topi
+from tvm import te
 
 import tvm.testing
-from tvm.topi import testing
-from tvm.contrib.hexagon.build import HexagonLauncher
 import tvm.topi.hexagon.slice_ops as sl
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 input_layout = tvm.testing.parameter(
     "nhwc-8h2w32c2w-2d",
@@ -73,7 +71,6 @@ def test_clip_slice(
         hexagon_session,
     ):
         # establish target and input placeholder
-        target_hexagon = tvm.target.hexagon("v69")
         A = te.placeholder(input_shape, name="A", dtype=dtype)
 
         # get the compute function and schedule
@@ -86,7 +83,7 @@ def test_clip_slice(
         with tvm.transform.PassContext(opt_level=3):
             func = tvm.build(
                 tir_schedule.mod,
-                target=tvm.target.Target(target_hexagon, host=target_hexagon),
+                target=get_hexagon_target("v69"),
                 name="clip",
             )
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
index e7946d04608e..3f88a6e432b7 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
@@ -23,6 +23,8 @@
 import tvm.contrib.hexagon
 from tvm.topi.testing import conv2d_nhwc_python
 
+from ..infrastructure import get_hexagon_target
+
 
 def build_conv2d(target):
     """Build and the return the conv2d module that calls the intrinsic implementation"""
@@ -199,9 +201,6 @@ def DISABLED_test_conv2d(self, act_shape, wgt_shape, inp_stride, inp_offset, hex
         """Test conv2d intrinsic implementation"""
         assert act_shape[3] == wgt_shape[2]
 
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
-
         # Currently, input offset does not affect the output shape
         def get_out_shape(ash, wsh, inp_stride):
             assert ash[3] == wsh[2]
@@ -217,7 +216,7 @@ def get_out_shape(ash, wsh, inp_stride):
         act = np.random.rand(*act_shape).astype("float16")
         wgt = np.random.rand(*wgt_shape).astype("float16")
 
-        module = build_conv2d(target)
+        module = build_conv2d(get_hexagon_target("v68"))
 
         mod = hexagon_session.load_module(module)
         output = tvm.nd.array(
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
index 01c20601b685..0b94d6e781a7 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
@@ -26,6 +26,7 @@
 from tvm.topi.utils import get_const_tuple
 from tvm.topi.nn.utils import get_pad_tuple
 
+from ..infrastructure import get_hexagon_target
 
 dtype = tvm.testing.parameter("float32")
 random_seed = tvm.testing.parameter(0)
@@ -105,7 +106,6 @@ def test_conv2d_nchw(
         add_bias,
         apply_relu,
     ):
-        target_hexagon = tvm.target.hexagon("v68")
 
         pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
         padding_sum = pad_top + pad_left + pad_bottom + pad_right
@@ -129,7 +129,7 @@ def test_conv2d_nchw(
             gap_size = np.nextafter(c_np.max(), np.inf, dtype=c_np.dtype) - c_np.max()
             tol = {"rtol": 1e-3, "atol": num_values_summed * gap_size / 2}
 
-        with tvm.target.Target(target_hexagon):
+        with tvm.target.Target(get_hexagon_target("v68")):
             fcompute = topi.nn.conv2d_nchw
             fschedule = topi.hexagon.schedule_conv2d_nchw
             C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
@@ -153,7 +153,7 @@ def test_conv2d_nchw(
         func = tvm.build(
             s,
             [A, W, bias, C],
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            get_hexagon_target("v68"),
             name=func_name,
         )
         mod = hexagon_session.load_module(func)
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
index 9acffff358e8..2068f1e6e6fc 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
@@ -25,6 +25,8 @@
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
 
+from ..infrastructure import get_hexagon_target
+
 dtype = tvm.testing.parameter("float32")
 
 
@@ -57,14 +59,12 @@ def test_conv2d_nhwc(
         padding,
         dilation,
     ):
-        target_hexagon = tvm.target.hexagon("v68")
-
         a_np, w_np, b_np = ref_data
 
         A = te.placeholder(a_np.shape, name="A", dtype=dtype)
         W = te.placeholder(w_np.shape, name="W", dtype=dtype)
 
-        with tvm.target.Target(target_hexagon):
+        with tvm.target.Target(get_hexagon_target("v68")):
             fcompute = topi.nn.conv2d_nhwc
             fschedule = topi.hexagon.schedule_conv2d_nhwc
             B = fcompute(A, W, stride, padding, dilation, dtype)
@@ -81,9 +81,7 @@ def test_conv2d_nhwc(
             padding,
             dilation,
         )
-        func = tvm.build(
-            s, [A, W, B], tvm.target.Target(target_hexagon, host=target_hexagon), name=func_name
-        )
+        func = tvm.build(s, [A, W, B], get_hexagon_target("v68"), name=func_name)
         mod = hexagon_session.load_module(func)
 
         dev = hexagon_session.device
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py
index a03c35cb9e78..242265169fb8 100755
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py
@@ -25,7 +25,7 @@
 from tvm.topi.hexagon.slice_ops.conv2d import conv2d_compute, conv2d_schedule
 from tvm.topi.testing import conv2d_nhwc_python
 
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 input_layout = tvm.testing.parameter(
     "nhwc-8h2w32c2w-2d",
@@ -274,7 +274,6 @@ def test_conv2d(
         input_np_padded,
         weights_np_transformed,
         expected_output_np,
-        target,
         working_scope,
         hexagon_session,
     ):
@@ -287,9 +286,6 @@ def test_conv2d(
             input_tensor, weights, out_shape, stride, dilation, dtype, output_name
         )
 
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
-
         tir_schedule = conv2d_schedule(
             output_tensor,
             [input_tensor, weights],
@@ -303,7 +299,7 @@ def test_conv2d(
         with tvm.transform.PassContext(opt_level=3):
             runtime_module = tvm.build(
                 tir_schedule.mod,
-                target=target,
+                target=get_hexagon_target("v69"),
                 name=func_name,
             )
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
index 8536603a3c20..40c8efa1cec2 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
@@ -16,6 +16,7 @@
 # under the License.
 """Test code for transposed convolution."""
 import numpy as np
+
 import tvm
 from tvm.contrib.hexagon.session import Session
 import tvm.testing
@@ -24,6 +25,7 @@
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
 
+from ..infrastructure import get_hexagon_target
 
 # TODO Should add kernal to tvm.testing.fixture
 
@@ -79,9 +81,6 @@ def test_conv2d(
         output_padding,
         random_seed,
     ):
-
-        target_hexagon = tvm.target.hexagon("v68")
-
         in_height, in_width = in_size
         kernel_height, kernel_width = (1, 1)
         stride_height, stride_width = stride
@@ -116,7 +115,7 @@ def get_ref_data():
             output_padding,
         )
 
-        with tvm.target.Target(target_hexagon):
+        with tvm.target.Target(get_hexagon_target("v68")):
             fcompute = topi.nn.conv2d_transpose_nchw
             fschedule = topi.hexagon.schedule_conv2d_transpose_nchw
             B = fcompute(*fcompute_args)
@@ -131,8 +130,8 @@ def get_ref_data():
             b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
             c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
 
-            func1 = tvm.build(s1, [A, W, B], tvm.target.Target(target_hexagon, host=target_hexagon))
-            func2 = tvm.build(s2, [A, W, C], tvm.target.Target(target_hexagon, host=target_hexagon))
+            func1 = tvm.build(s1, [A, W, B], get_hexagon_target("v68"))
+            func2 = tvm.build(s2, [A, W, C], get_hexagon_target("v68"))
 
             mod1 = hexagon_session.load_module(func1)
             mod2 = hexagon_session.load_module(func2)
diff --git a/tests/python/contrib/test_hexagon/topi/test_dense.py b/tests/python/contrib/test_hexagon/topi/test_dense.py
index 929108bb1492..c76006ac08c2 100644
--- a/tests/python/contrib/test_hexagon/topi/test_dense.py
+++ b/tests/python/contrib/test_hexagon/topi/test_dense.py
@@ -26,6 +26,8 @@
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
 
+from ..infrastructure import get_hexagon_target
+
 random_seed = tvm.testing.parameter(0)
 
 use_bias = tvm.testing.parameter(True, False)
@@ -94,15 +96,12 @@ def test_dense(
     fcompute = topi.nn.dense
     fschedule = topi.hexagon.schedule_dense
 
-    target_hexagon = tvm.target.hexagon("v68")
-    with tvm.target.Target(target_hexagon):
+    with tvm.target.Target(get_hexagon_target("v68")):
         D = fcompute(A, B, C if use_bias else None, out_dtype)
         D = topi.nn.relu(D)
         s = fschedule([D])
 
-    func = tvm.build(
-        s, [A, B, C, D], tvm.target.Target(target_hexagon, host=target_hexagon), name="dense"
-    )
+    func = tvm.build(s, [A, B, C, D], get_hexagon_target("v68"), name="dense")
     mod = hexagon_session.load_module(func)
 
     dev = hexagon_session.device
diff --git a/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
index f74d13f641d5..3de9ec13497a 100644
--- a/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
+++ b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
@@ -27,7 +27,7 @@
 from tvm.topi.hexagon.slice_ops.depth_to_space import d2s_compute, d2s_schedule
 from tvm.topi.testing import depth_to_space_python
 
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
 d2s_fp16_tests = (
@@ -97,9 +97,6 @@ def test_d2s_slice(
 
         Output = d2s_compute(Input, block_size, "NHWC", mode)
 
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
-
         tir_s = d2s_schedule(Input, Output, input_layout, output_layout)
 
         input_data = allocate_hexagon_array(
@@ -117,7 +114,7 @@ def test_d2s_slice(
         )
         with tvm.transform.PassContext(opt_level=3):
             runtime_module = tvm.build(
-                tir_s.mod, [Input, Output], target=target, name="depth_to_space"
+                tir_s.mod, [Input, Output], target=get_hexagon_target("v69"), name="depth_to_space"
             )
         mod = hexagon_session.load_module(runtime_module)
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
index 5e09e691f743..063541cc21a0 100644
--- a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
+++ b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
@@ -28,6 +28,7 @@
 from tvm.topi.utils import get_const_tuple
 from tvm.topi.nn.utils import get_pad_tuple
 
+from ..infrastructure import get_hexagon_target
 
 random_seed = tvm.testing.parameter(0)
 
@@ -175,8 +176,6 @@ def test_conv2d(
         dilation,
         ref_data,
     ):
-        target_hexagon = tvm.target.hexagon("v68")
-
         # Transform the padding argument from 'str' to 'tuple' to
         # match the "workload" tuple in TopHub.  Which padding_args to
         # use for each layout chosen to reproduce previous behavior.
@@ -216,7 +215,7 @@ def test_conv2d(
                 out_dtype,
             )
 
-        with tvm.target.Target(target_hexagon):
+        with tvm.target.Target(get_hexagon_target("v68")):
             # Declare, build schedule
             if layout == "NCHW":
                 fcompute = topi.nn.depthwise_conv2d_nchw
@@ -236,7 +235,7 @@ def test_conv2d(
             f = tvm.build(
                 s,
                 [Input, Filter, Scale, Shift, C],
-                tvm.target.Target(target_hexagon, host=target_hexagon),
+                get_hexagon_target("v68"),
             )
             mod = hexagon_session.load_module(f)
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py b/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py
index e9b3dd132692..6ed217180aba 100644
--- a/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py
@@ -23,7 +23,12 @@
 import tvm.testing
 from tvm import te
 from tvm.topi.hexagon import qnn
-from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
+from ..infrastructure import (
+    allocate_hexagon_array,
+    transform_numpy,
+    quantize_np,
+    get_hexagon_target,
+)
 
 
 class TestDequantizeSlice2d:
@@ -78,8 +83,6 @@ def test_dequant_qnn(
         """
         Top level testing function for dequantize
         """
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
 
         dequant_input = te.placeholder(input_shape, name="A", dtype=dtype)
 
@@ -104,7 +107,7 @@ def test_dequant_qnn(
         )
         with tvm.transform.PassContext(opt_level=3):
             tir_irm = tvm.lower(tir_s.mod, [dequant_input, dequant_output], name="dequantize")
-            runtime_module = tvm.build(tir_irm, target=target, name="dequantize")
+            runtime_module = tvm.build(tir_irm, target=get_hexagon_target("v69"), name="dequantize")
         mod = hexagon_session.load_module(runtime_module)
 
         mod(input_data, output_data)
diff --git a/tests/python/contrib/test_hexagon/topi/test_dwconv2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_dwconv2d_slice.py
index f8419e9c55d8..3e43718afd8d 100644
--- a/tests/python/contrib/test_hexagon/topi/test_dwconv2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_dwconv2d_slice.py
@@ -26,7 +26,7 @@
 from tvm.topi.testing import depthwise_conv2d_python_nhwc
 from tvm.topi.hexagon.slice_ops.dwconv2d import dwconv2d_compute, dwconv2d_schedule
 
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
 @tvm.testing.fixture
@@ -258,7 +258,6 @@ def test_dwconv2d(
         input_np_padded,
         weights_np_transformed,
         expected_output_np,
-        target,
         working_scope,
         hexagon_session,
     ):
@@ -271,9 +270,6 @@ def test_dwconv2d(
         def transform_weights(height, width, in_channel, out_channel):
             return [out_channel // 32, height, width, in_channel, out_channel % 32]
 
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
-
         tir_schedule = dwconv2d_schedule(
             output_tensor, [input_tensor, weights], in_out_layout, transform_weights
         )
@@ -282,7 +278,7 @@ def transform_weights(height, width, in_channel, out_channel):
         with tvm.transform.PassContext(opt_level=3, config={"tir.disable_assert": True}):
             runtime_module = tvm.build(
                 tir_schedule.mod,
-                target=target,
+                target=get_hexagon_target("v69"),
                 name=func_name,
             )
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
index de60ffc6df4d..f2ee76863cb6 100644
--- a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
@@ -21,14 +21,10 @@
 
 from tvm import te
 import tvm.testing
-from tvm.topi import testing
-from tvm.contrib.hexagon.build import HexagonLauncher
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.hexagon.slice_ops as sl
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 from ..pytest_util import (
-    get_numpy_dtype_info,
-    get_test_id,
     get_multitest_ids,
     create_populated_numpy_ndarray,
     TensorContentRandom,
@@ -341,7 +337,6 @@ def test_max_pool2d_slice(
         expected_output_np,
         hexagon_session: Session,
     ):
-        target_hexagon = tvm.target.hexagon("v69")
         A = te.placeholder(input_shape_padded, name="A", dtype=dtype)
 
         M = sl.max_pool2d_compute(A, output_shape, kernel, stride, dilation)
@@ -365,7 +360,7 @@ def test_max_pool2d_slice(
             func = tvm.build(
                 sch,
                 [A, M],
-                tvm.target.Target(target_hexagon, host=target_hexagon),
+                get_hexagon_target("v69"),
                 name="max_pool2d",
             )
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_pad.py b/tests/python/contrib/test_hexagon/topi/test_pad.py
index 631cb979dcbd..06b939bf6409 100644
--- a/tests/python/contrib/test_hexagon/topi/test_pad.py
+++ b/tests/python/contrib/test_hexagon/topi/test_pad.py
@@ -22,6 +22,8 @@
 from tvm.contrib.hexagon.session import Session
 from tvm.topi.utils import get_const_tuple
 
+from ..infrastructure import get_hexagon_target
+
 
 @tvm.testing.requires_hexagon
 def test_nn_pad(hexagon_session: Session):
@@ -34,12 +36,11 @@ def test_nn_pad(hexagon_session: Session):
 
     C = topi.nn.pad(A, [0, 1, 1, 0], [0, 1, 1, 0], pad_value=0)
 
-    target_hexagon = tvm.target.hexagon("v68")
-    with tvm.target.Target(target_hexagon):
+    with tvm.target.Target(get_hexagon_target("v68")):
         fschedule = topi.hexagon.schedule_pad
         s = fschedule(C)
 
-    func = tvm.build(s, [A, C], tvm.target.Target(target_hexagon, host=target_hexagon), name="pad")
+    func = tvm.build(s, [A, C], get_hexagon_target("v68"), name="pad")
     mod = hexagon_session.load_module(func)
 
     dev = hexagon_session.device
diff --git a/tests/python/contrib/test_hexagon/topi/test_pooling.py b/tests/python/contrib/test_hexagon/topi/test_pooling.py
index 45e558e1b6dd..ecc998875296 100644
--- a/tests/python/contrib/test_hexagon/topi/test_pooling.py
+++ b/tests/python/contrib/test_hexagon/topi/test_pooling.py
@@ -25,6 +25,8 @@
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
 
+from ..infrastructure import get_hexagon_target
+
 
 class TestAdaptivePool:
     dshape, out_size, pool_type, layout = tvm.testing.parameters(
@@ -68,15 +70,14 @@ def test_adaptive_pool(self, hexagon_session: Session, dshape, out_size, pool_ty
             assert len(out_size) == 3
             out = topi.nn.adaptive_pool3d(data, out_size, pool_type, layout)
 
-        target_hexagon = tvm.target.hexagon("v68")
-        with tvm.target.Target(target_hexagon):
+        with tvm.target.Target(get_hexagon_target("v68")):
             fschedule = topi.hexagon.schedule_adaptive_pool
             s = fschedule(out)
 
         func = tvm.build(
             s,
             [data, out],
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            get_hexagon_target("v68"),
             name="adaptive-pool",
         )
         mod = hexagon_session.load_module(func)
@@ -166,12 +167,11 @@ def verify_poolnd(
 
     np.testing.assert_equal(tuple(output_shape), tuple(ref_np.shape))
 
-    target_hexagon = tvm.target.hexagon("v68")
-    with tvm.target.Target(target_hexagon):
+    with tvm.target.Target(get_hexagon_target("v68")):
         fschedule = topi.hexagon.schedule_pool
         s = fschedule(B, layout)
 
-    func = tvm.build(s, [A, B], tvm.target.Target(target_hexagon, host=target_hexagon), name="pool")
+    func = tvm.build(s, [A, B], get_hexagon_target("v68"), name="pool")
     mod = hexagon_session.load_module(func)
 
     dev = hexagon_session.device
diff --git a/tests/python/contrib/test_hexagon/topi/test_quantize.py b/tests/python/contrib/test_hexagon/topi/test_quantize.py
index 2c1718d29465..0b6e1dfa0e73 100755
--- a/tests/python/contrib/test_hexagon/topi/test_quantize.py
+++ b/tests/python/contrib/test_hexagon/topi/test_quantize.py
@@ -14,13 +14,17 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import pytest
 import numpy as np
 
 import tvm
 from tvm import te
 import tvm.topi.hexagon.qnn as s1
-from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
+from ..infrastructure import (
+    allocate_hexagon_array,
+    transform_numpy,
+    quantize_np,
+    get_hexagon_target,
+)
 
 
 @tvm.testing.fixture
@@ -70,7 +74,6 @@ def test_quantize(
         output_layout,
         hexagon_session,
     ):
-        target_hexagon = tvm.target.hexagon("v69")
         A = te.placeholder(input_shape, name="A", dtype=input_dtype)
 
         M = s1.quantize_compute(A, scale, zero_point, output_dtype)
@@ -86,7 +89,7 @@ def test_quantize(
             func = tvm.build(
                 sch,
                 [A, M],
-                tvm.target.Target(target_hexagon, host=target_hexagon),
+                get_hexagon_target("v69"),
                 name="quantize",
             )
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_reduce.py b/tests/python/contrib/test_hexagon/topi/test_reduce.py
index a844e1d51206..8fc0b6d901ab 100644
--- a/tests/python/contrib/test_hexagon/topi/test_reduce.py
+++ b/tests/python/contrib/test_hexagon/topi/test_reduce.py
@@ -22,6 +22,7 @@
 from tvm import te
 from tvm.contrib.hexagon.session import Session
 
+from ..infrastructure import get_hexagon_target
 
 in_shape, axis, keepdims, reduce_type, dtype = tvm.testing.parameters(
     ((32,), 0, False, "argmax", "float32"),
@@ -125,14 +126,11 @@ def test_reduce_map(
     else:
         raise NotImplementedError
 
-    target_hexagon = tvm.target.hexagon("v68")
-    with tvm.target.Target(target_hexagon):
+    with tvm.target.Target(get_hexagon_target("v68")):
         fschedule = topi.hexagon.schedule_reduce
         s = fschedule(B)
 
-    func = tvm.build(
-        s, [A, B], tvm.target.Target(target_hexagon, host=target_hexagon), name=reduce_type
-    )
+    func = tvm.build(s, [A, B], get_hexagon_target("v68"), name=reduce_type)
     mod = hexagon_session.load_module(func)
 
     dev = hexagon_session.device
diff --git a/tests/python/contrib/test_hexagon/topi/test_relu_slice.py b/tests/python/contrib/test_hexagon/topi/test_relu_slice.py
index c08d4a5545f1..fd04cca061da 100644
--- a/tests/python/contrib/test_hexagon/topi/test_relu_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_relu_slice.py
@@ -22,9 +22,8 @@
 import tvm.testing
 from tvm.topi.hexagon.slice_ops.relu import relu_compute, relu_stir_schedule
 from tvm import te
-from tvm.contrib.hexagon.build import HexagonLauncher
 
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
 @tvm.testing.fixture
@@ -75,7 +74,6 @@ def test_relu(
         output_layout,
         transformed_input_np,
         transformed_ref_output_np,
-        target,
         working_scope,
         hexagon_session,
     ):
@@ -83,9 +81,6 @@ def test_relu(
 
         OutputTensor = relu_compute(InputTensor)
 
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
-
         tir_s = relu_stir_schedule(InputTensor, OutputTensor, input_layout, output_layout)
 
         input_data = allocate_hexagon_array(
@@ -104,7 +99,7 @@ def test_relu(
 
         func_name = "relu"
         with tvm.transform.PassContext(opt_level=3):
-            runtime_module = tvm.build(tir_s.mod, target=target, name=func_name)
+            runtime_module = tvm.build(tir_s.mod, target=get_hexagon_target("v69"), name=func_name)
 
         mod = hexagon_session.load_module(runtime_module)
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_reshape.py b/tests/python/contrib/test_hexagon/topi/test_reshape.py
index 7df29a02abff..38b8a9cf9a82 100644
--- a/tests/python/contrib/test_hexagon/topi/test_reshape.py
+++ b/tests/python/contrib/test_hexagon/topi/test_reshape.py
@@ -21,11 +21,9 @@
 import tvm
 import tvm.testing
 import tvm.topi.hexagon.slice_ops as sl
-from tvm import te, topi
-from tvm.contrib.hexagon.build import HexagonLauncher
-from tvm.topi import testing
+from tvm import te
 
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
 def reshape_helper(
@@ -40,8 +38,6 @@ def reshape_helper(
     hexagon_session,
 ):
 
-    target_hexagon = tvm.target.hexagon("v69")
-    target = tvm.target.Target(target_hexagon, host=target_hexagon)
     A = te.placeholder(input_shape, name="A", dtype=data_type)
     if func == "reshape":
         D = fcompute(A, output_shape)
@@ -56,7 +52,7 @@ def reshape_helper(
         input_layout,
     )
     with tvm.transform.PassContext(opt_level=3):
-        runtime_module = tvm.build(tir_s.mod, target=target, name=func)
+        runtime_module = tvm.build(tir_s.mod, target=get_hexagon_target("v69"), name=func)
 
     mod = hexagon_session.load_module(runtime_module)
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_resize2d.py b/tests/python/contrib/test_hexagon/topi/test_resize2d.py
index 1ef9f50977c5..80cfba5c6c9e 100755
--- a/tests/python/contrib/test_hexagon/topi/test_resize2d.py
+++ b/tests/python/contrib/test_hexagon/topi/test_resize2d.py
@@ -21,7 +21,7 @@
 from tvm import te
 from tvm.topi.testing import resize2d_python
 import tvm.topi.hexagon as s1
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
 @tvm.testing.fixture
@@ -123,7 +123,6 @@ def test_resize2d(
         method,
         hexagon_session,
     ):
-        target_hexagon = tvm.target.hexagon("v69")
         A = te.placeholder(input_shape, name="A", dtype=dtype)
 
         M = s1.resize2d_compute(
@@ -153,7 +152,7 @@ def test_resize2d(
             func = tvm.build(
                 sch,
                 [A, M],
-                tvm.target.Target(target_hexagon, host=target_hexagon),
+                get_hexagon_target("v69"),
                 name="resize2d",
             )
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax.py b/tests/python/contrib/test_hexagon/topi/test_softmax.py
index d1c78842b5ff..91f348494d6d 100644
--- a/tests/python/contrib/test_hexagon/topi/test_softmax.py
+++ b/tests/python/contrib/test_hexagon/topi/test_softmax.py
@@ -26,6 +26,8 @@
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
 
+from ..infrastructure import get_hexagon_target
+
 dtype = tvm.testing.parameter(
     "float16",
     "float32",
@@ -78,14 +80,11 @@ def get_ref_data(shape):
     # get the test data
     a_np, b_np = get_ref_data(shape)
 
-    target_hexagon = tvm.target.hexagon("v68")
-    with tvm.target.Target(target_hexagon):
+    with tvm.target.Target(get_hexagon_target("v68")):
         fschedule = topi.hexagon.schedule_softmax
         s = fschedule(B)
 
-    func = tvm.build(
-        s, [A, B], tvm.target.Target(target_hexagon, host=target_hexagon), name="softmax"
-    )
+    func = tvm.build(s, [A, B], get_hexagon_target("v68"), name="softmax")
     mod = hexagon_session.load_module(func)
 
     dev = hexagon_session.device
diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py
index 91b51cb5cc75..1329fda7aa4a 100644
--- a/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py
@@ -20,7 +20,8 @@
 from tvm import te
 from tvm.topi.testing import softmax_python
 import tvm.topi.hexagon.slice_ops as sl
-from ..infrastructure import allocate_hexagon_array
+
+from ..infrastructure import allocate_hexagon_array, get_hexagon_target
 
 
 def transform_numpy(arr_np, layout):
diff --git a/tests/python/contrib/test_hexagon/topi/test_tanh_slice.py b/tests/python/contrib/test_hexagon/topi/test_tanh_slice.py
index d488d7dd46dd..02c587b9809c 100644
--- a/tests/python/contrib/test_hexagon/topi/test_tanh_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_tanh_slice.py
@@ -23,7 +23,7 @@
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.contrib.hexagon
-from ..infrastructure import allocate_hexagon_array, transform_numpy
+from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 # pylint: disable=invalid-name
 
@@ -71,8 +71,6 @@ def test_tanh(
     ):
         """Top Level testing function for tanh fp16 op"""
 
-        target_hexagon = tvm.target.hexagon("v69")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
         A = te.placeholder(input_shape, name="A", dtype=dtype)
         M = sl.tanh_te_compute(A)
         tanhf16_func = te.create_prim_func([A, M])
@@ -92,7 +90,7 @@ def test_tanh(
         )
         with tvm.transform.PassContext(opt_level=3):
             tir_irm = tvm.lower(tir_s.mod, [A, M], name="tanhf16")
-            runtime_module = tvm.build(tir_irm, target=target, name="tanhf16")
+            runtime_module = tvm.build(tir_irm, target=get_hexagon_target("v69"), name="tanhf16")
         mod = hexagon_session.load_module(runtime_module)
 
         mod(A_data, M_data)

From 2860a50ff97a2791c876bdca2685d4f35f0e2e03 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 5 Oct 2022 13:48:24 -0700
Subject: [PATCH 305/704] [Hexagon] Fix RPC session close by adding shutdown
 PackedFunc (#12960)

* Add shutdown function to RPC module

* address comments
---
 python/tvm/contrib/hexagon/session.py |  2 ++
 src/runtime/rpc/rpc_endpoint.cc       |  5 +++++
 src/runtime/rpc/rpc_endpoint.h        | 18 +++++++++++++++---
 src/runtime/rpc/rpc_module.cc         |  4 ++++
 src/runtime/rpc/rpc_session.h         |  5 +++++
 tests/scripts/task_python_hexagon.sh  |  2 +-
 6 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index e242a95aa8b8..b69382fe1290 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -106,6 +106,8 @@ def __exit__(self, exc_type, exc_value, exc_traceback):
             )
         finally:
             # close session to the tracker
+            shutdown_func = self._rpc._sess.get_function("CloseRPCConnection")
+            shutdown_func()
             del self._rpc
 
     @property
diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc
index 07b90058ce99..46710587abe0 100644
--- a/src/runtime/rpc/rpc_endpoint.cc
+++ b/src/runtime/rpc/rpc_endpoint.cc
@@ -623,6 +623,9 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
 
 RPCCode RPCEndpoint::HandleUntilReturnEvent(bool client_mode, RPCSession::FEncodeReturn setreturn) {
   RPCCode code = RPCCode::kCallFunc;
+
+  CHECK(channel_) << "Expected connection to server " << name_
+                  << " to be active, but the connection was previously closed";
   while (code != RPCCode::kReturn && code != RPCCode::kShutdown && code != RPCCode::kCopyAck) {
     while (writer_.bytes_available() != 0) {
       writer_.ReadWithCallback(
@@ -1126,6 +1129,8 @@ class RPCClientSession : public RPCSession, public DeviceAPI {
 
   bool IsLocalSession() const final { return false; }
 
+  void Shutdown() final { endpoint_->Shutdown(); }
+
  private:
   uint64_t GetRPCMaxTransferSize() {
     if (rpc_chunk_max_size_bytes_ > 0) {
diff --git a/src/runtime/rpc/rpc_endpoint.h b/src/runtime/rpc/rpc_endpoint.h
index d8e2dece73c5..4e4a09c6f333 100644
--- a/src/runtime/rpc/rpc_endpoint.h
+++ b/src/runtime/rpc/rpc_endpoint.h
@@ -68,8 +68,22 @@ enum class TrackerCode : int {
  */
 class RPCEndpoint {
  public:
-  /*! \brief virtual destructor */
+  /*! \brief virtual destructor
+   * Closes the connection if the connection hasn't already been closed.
+   */
   ~RPCEndpoint();
+
+  /*!
+   *  \brief Shutdown RPC connection.
+   *
+   *  Shutdown has no effect if the connection has already been shut down.
+   *  Shutdown will wait for all output currently queued from the RPC connection (i.e. The user
+   * doesn't need to wait for completion before calling Shutdown.) Any further use of objects that
+   * depended on the endpoint (e.g. A tvm.nd.array allocated on the remote RPC session) may throw an
+   * exception when used.
+   */
+  void Shutdown();
+
   /*!
    *  \brief The server loop that server runs to handle RPC calls.
    */
@@ -177,8 +191,6 @@ class RPCEndpoint {
   RPCCode HandleUntilReturnEvent(bool client_mode, RPCSession::FEncodeReturn setreturn);
   // Initalization
   void Init();
-  // Shutdown
-  void Shutdown();
   // Internal channel.
   std::unique_ptr<RPCChannel> channel_;
 
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index a3f41e063226..1578fce994f6 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -177,6 +177,10 @@ class RPCModuleNode final : public ModuleNode {
   const char* type_key() const final { return "rpc"; }
 
   PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) final {
+    if (name == "CloseRPCConnection") {
+      return PackedFunc([this](TVMArgs, TVMRetValue*) { sess_->Shutdown(); });
+    }
+
     if (module_handle_ == nullptr) {
       return WrapRemoteFunc(sess_->GetFunction(name));
     } else {
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index d78b3219bf3d..60d067e49d3f 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -253,6 +253,11 @@ class RPCSession {
    */
   static std::shared_ptr<RPCSession> Get(int table_index);
 
+  /*!
+   * \brief Shutdown RPC connection.
+   */
+  virtual void Shutdown() {}
+
  protected:
   /*!
    * \brief Send an exception to the callback.
diff --git a/tests/scripts/task_python_hexagon.sh b/tests/scripts/task_python_hexagon.sh
index f7c0a43c48e8..ba125b161a56 100755
--- a/tests/scripts/task_python_hexagon.sh
+++ b/tests/scripts/task_python_hexagon.sh
@@ -51,7 +51,7 @@ export ANDROID_SERIAL_NUMBER=${device_serial}
 if [ "${device_serial}" == "simulator" ]; then
     run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon
 else
-    run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon --tx $num_of_devices*popen --dist=load
+    run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon -n=$num_of_devices
 fi
 
 if [[ "${device_serial}" == "simulator" ]]; then

From d023ef47fcd680e23b9c1d69ae474a48dd132108 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Wed, 5 Oct 2022 15:14:24 -0700
Subject: [PATCH 306/704] [TIR, MetaSchedule] Preserve unit block iters for
 auto-tensorization (#12974)

* Update schedule primitives (ReIndex, TransformBlockLayout) to preserve unit iters. Added test cases.
* Allow workloads with unit dimensions to be detected during auto-tensorization pattern marching. This allows padding to be added for tensorizing such workloads.
---
 src/tir/ir/stmt.cc                            |   2 +-
 src/tir/schedule/analysis.h                   |  12 +
 src/tir/schedule/analysis/analysis.cc         |   9 +
 src/tir/schedule/ir_comparator.cc             |   4 +-
 .../schedule/primitive/cache_read_write.cc    |  23 +-
 .../primitive/layout_transformation.cc        |  48 +--
 src/tir/schedule/transform.cc                 |  25 +-
 src/tir/schedule/transform.h                  |   9 +-
 ...test_meta_schedule_schedule_rule_mlt_tc.py | 356 +++++++++---------
 .../unittest/test_tir_schedule_analysis.py    |   5 +-
 .../unittest/test_tir_schedule_reindex.py     |  90 ++++-
 .../test_tir_schedule_transform_layout.py     |  73 +++-
 12 files changed, 410 insertions(+), 246 deletions(-)

diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc
index e21d014fe185..8f2a7b4ffe5b 100644
--- a/src/tir/ir/stmt.cc
+++ b/src/tir/ir/stmt.cc
@@ -831,7 +831,7 @@ BufferRegion BufferRegion::FromPoint(Buffer buffer, Array<PrimExpr> indices) {
       region.push_back(
           Range::FromMinExtent(ramp_index->base, ramp_index->stride * ramp_index->lanes));
     } else {
-      region.push_back(Range::FromMinExtent(index, 1));
+      region.push_back(Range::FromMinExtent(index, make_const(index.dtype(), 1)));
     }
   }
   return BufferRegion(buffer, region);
diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h
index 57165fd08ad4..7df991826728 100644
--- a/src/tir/schedule/analysis.h
+++ b/src/tir/schedule/analysis.h
@@ -724,6 +724,18 @@ Optional<Array<Var>> CheckTrivialBufferIndices(const T& buffer_access) {
   return indices;
 }
 
+/*!
+ * \brief Simplify non-trivial expressions
+ * \param expr The expression to be simplified
+ * \param analyzer The analyzer
+ * \return The simplified expression
+ *
+ * During scheduling, we often need preserve block iters in trivial expressions that can be
+ * simplified to constant values for further scheduling and analysis because simplifing away the
+ * block iters may result in loss of information for further analysis.
+ */
+PrimExpr SimplifyNonTrivialExpr(const PrimExpr& expr, arith::Analyzer* analyzer);
+
 /*! \brief Necessary information used for tensorization */
 class TensorizeInfoNode : public Object {
  public:
diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index 294826a1f6b9..384d006562f0 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -1646,6 +1646,15 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self,   //
   }
 }
 
+PrimExpr SimplifyNonTrivialExpr(const PrimExpr& expr, arith::Analyzer* analyzer) {
+  auto simplified = analyzer->Simplify(expr);
+  if (simplified->IsInstance<IntImmNode>()) {
+    return expr;
+  } else {
+    return simplified;
+  }
+}
+
 TVM_REGISTER_NODE_TYPE(TensorizeInfoNode);
 
 /*! \brief Auxiliary data structure of information extracted from tensor intrin description */
diff --git a/src/tir/schedule/ir_comparator.cc b/src/tir/schedule/ir_comparator.cc
index bfd394f24de7..648305d3655d 100644
--- a/src/tir/schedule/ir_comparator.cc
+++ b/src/tir/schedule/ir_comparator.cc
@@ -444,8 +444,8 @@ bool AutoTensorizeComparator::CompareBufferAccess(const T* lhs, const T* rhs) {
       return false;
     }
     std::vector<PrimExpr> lhs_indices;
-    for (const auto& index : lhs->indices) {
-      lhs_indices.push_back(analyzer_.Simplify(index));
+    for (const PrimExpr& index : lhs->indices) {
+      lhs_indices.push_back(SimplifyNonTrivialExpr(index, &analyzer_));
     }
 
     auto is_scalar_access = [](const Array<PrimExpr>& indices, PrimExpr index) {
diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index 489308ae8c0f..e03b1058d4ef 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -188,8 +188,6 @@ Block MakeReIndexStage(const Block& block, CacheStageInfo* info,
   Array<IterVar> new_block_iters;
   // the substition map from the original block iter to the iters of the reindex block
   std::unordered_map<Var, PrimExpr, ObjectPtrHash, ObjectEqual> block_var_replace_map;
-  // block access region of reindexed buffer and target buffer
-  Region reindex_region, target_region;
   // indices to access the reindex buffer and the target buffer
   Array<PrimExpr> reindex_indices, target_indices;
 
@@ -201,7 +199,7 @@ Block MakeReIndexStage(const Block& block, CacheStageInfo* info,
     Var var("v" + std::to_string(new_block_iters.size()), iter->var->dtype);
     bool used = covered.count(iter->var);
     if (used) {
-      new_block_iters.push_back(IterVar(/*dom=*/used ? iter->dom : Range::FromMinExtent(0, 1),
+      new_block_iters.push_back(IterVar(/*dom=*/iter->dom,
                                         /*var=*/var,
                                         /*IterVarType=*/kDataPar));
     } else {
@@ -209,16 +207,11 @@ Block MakeReIndexStage(const Block& block, CacheStageInfo* info,
     }
     if (used) {
       reindex_indices.push_back(var);
-      reindex_region.push_back(Range::FromMinExtent(var, IntImm(var->dtype, 1)));
     }
     block_var_replace_map[iter->var] = var;
   }
 
   // Step 2: Replace the original block iters with the new block iters
-  BufferRegion buffer_region = buffer_index_type == BufferIndexType::kWrite
-                                   ? block->writes[buffer_index]
-                                   : block->reads[buffer_index];
-  target_region = Substitute(buffer_region->region, block_var_replace_map);
   for (const PrimExpr& index : original_indices) {
     target_indices.push_back(Substitute(index, block_var_replace_map));
   }
@@ -232,13 +225,9 @@ Block MakeReIndexStage(const Block& block, CacheStageInfo* info,
   Array<PrimExpr> dst_indices{nullptr};
 
   if (buffer_index_type == BufferIndexType::kWrite) {
-    src_region = reindex_region;
-    dst_region = target_region;
     src_indices = reindex_indices;
     dst_indices = target_indices;
   } else {
-    src_region = target_region;
-    dst_region = reindex_region;
     src_indices = target_indices;
     dst_indices = reindex_indices;
   }
@@ -246,11 +235,9 @@ Block MakeReIndexStage(const Block& block, CacheStageInfo* info,
   // Create the body block
   Block new_block(
       /*iter_vars=*/new_block_iters,
-      /*reads=*/
-      {BufferRegion(info->read_buffer, src_region)},
-      /*writes=*/
-      {BufferRegion(info->write_buffer, dst_region)},
-      /*name_hint=*/buffer_region->buffer->name + "_reindex",
+      /*reads=*/{BufferRegion::FromPoint(info->read_buffer, src_indices)},
+      /*writes=*/{BufferRegion::FromPoint(info->write_buffer, dst_indices)},
+      /*name_hint=*/info->write_buffer->name + "_reindex",
       /*body=*/
       BufferStore(info->write_buffer, BufferLoad(info->read_buffer, src_indices), dst_indices));
 
@@ -1169,7 +1156,7 @@ StmtSRef ReIndex(ScheduleState self, const StmtSRef& block_sref, int buffer_inde
     analyzer.Bind(iter->var, iter->dom);
   }
   original_indices.MutateByApply(
-      [&analyzer](const PrimExpr& expr) { return analyzer.Simplify(expr); });
+      [&analyzer](const PrimExpr& expr) { return SimplifyNonTrivialExpr(expr, &analyzer); });
 
   // Collect block iters appearing in the original_indices
   std::unordered_set<Var, ObjectPtrHash, ObjectPtrEqual> covered;
diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc
index 025723e1793d..9d36a5f7e5c4 100644
--- a/src/tir/schedule/primitive/layout_transformation.cc
+++ b/src/tir/schedule/primitive/layout_transformation.cc
@@ -699,7 +699,9 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
 
   void RewriteBufferAccess(Buffer* buffer, Array<PrimExpr>* indices) {
     *buffer = new_buffer_;
-    *indices = index_map_->MapIndices(*indices, analyzer_);
+    *indices = index_map_->MapIndices(*indices);
+    (*indices).MutateByApply(
+        [&](const PrimExpr& e) { return SimplifyNonTrivialExpr(e, analyzer_); });
   }
 
   using Parent = arith::IRMutatorWithAnalyzer;
@@ -1113,7 +1115,7 @@ class IndexMapNotApplicableToBlockIterError : public ScheduleError {
 
   IRModule mod() const final { return mod_; }
 
-  Array<ObjectRef> LocationsOfInterest() const final { return {}; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
 
  private:
   IRModule mod_;
@@ -1194,22 +1196,14 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
   Array<PrimExpr> transformed_block_iters = index_map->MapIndices(block_vars);
   Array<PrimExpr> new_block_iter_range = index_map->MapShape(block_iter_range_array);
 
-  auto iter_map = arith::DetectIterMap(
-      /*indices=*/transformed_block_iters, /*input_iters=*/block_iter_dom, /*predicate=*/Bool(true),
-      /*check_level=*/arith::IterMapLevel::Bijective, &analyzer,
-      /*simplify_trivial_iterators=*/true);
-  if (iter_map->indices.empty()) {
-    throw NotBijectiveAffineIndexMapError(self->mod, index_map);
-  }
-
   // Step 5: Create the new block after transformation.
 
   // Step 5.1: Create new block iters. After applying the IndexMap f to block iters ax_0, ..., ax_n,
   // create block iter each expression in f(ax_0, ..., ax_n).
   Array<IterVar> new_block_iters;  // new block iters
   Array<PrimExpr> new_block_vars;  // iter_var->var of new block iters
-  for (size_t i = 0; i < index_map->final_indices.size(); ++i) {
-    Var new_block_var{"v" + std::to_string(i), DataType::Int(32)};
+  for (size_t i = 0; i < transformed_block_iters.size(); ++i) {
+    Var new_block_var{"v" + std::to_string(i), transformed_block_iters[i]->dtype};
     new_block_vars.push_back(new_block_var);
     IterVarType iter_type = DetectNewBlockIterType(transformed_block_iters[i], block_iter_type);
     if (iter_type == kOpaque) {
@@ -1221,18 +1215,28 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
 
   // Step 5.2: Update the block body. Use the inverse map f^{-1} to replace the original block iters
   // in the body.
+  Map<Var, PrimExpr> inverse_subst_map;
+  // Construct the inverse map
+  {
+    Array<Range> initial_ranges;
+    for (const PrimExpr& extent : block_iter_range_array) {
+      initial_ranges.push_back(Range::FromMinExtent(make_const(extent.dtype(), 0), extent));
+    }
+    IndexMap inverse_index_map{nullptr};
+    try {
+      inverse_index_map = index_map.Inverse(initial_ranges);
+    } catch (...) {
+      throw NotBijectiveAffineIndexMapError(self->mod, index_map);
+    }
 
-  auto inverse_map = arith::InverseAffineIterMap(iter_map->indices, new_block_vars);
-  // Trivial block iters will be simplified in DetectIterMap, they should be mapped to constant
-  // zero.
-  for (const auto& iter_var : block_ptr->iter_vars) {
-    if (inverse_map.find(iter_var->var) == inverse_map.end()) {
-      ICHECK(is_one(iter_var->dom->extent));
-      inverse_map.Set(iter_var->var, 0);
+    Array<PrimExpr> inversed_new_block_vars = inverse_index_map->MapIndices(
+        new_block_vars);  // old block vars written in terms of new block vars
+
+    for (int i = 0, n = block_vars.size(); i < n; ++i) {
+      inverse_subst_map.Set(Downcast<Var>(block_vars[i]), inversed_new_block_vars[i]);
     }
   }
-
-  Block new_block = Downcast<Block>(Substitute(GetRef<Block>(block_ptr), inverse_map));
+  Block new_block = Downcast<Block>(Substitute(GetRef<Block>(block_ptr), inverse_subst_map));
   new_block.CopyOnWrite()->iter_vars = new_block_iters;
   new_block = Downcast<Block>(BlockBufferAccessSimplifier::Simplify(new_block, &analyzer));
 
@@ -1241,7 +1245,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
   // Make new loop vars
   Array<PrimExpr> new_loop_vars;
   for (int i = 0; i < static_cast<int>(new_block_iters.size()); ++i) {
-    new_loop_vars.push_back(Var("ax" + std::to_string(i), DataType::Int(32)));
+    new_loop_vars.push_back(Var("ax" + std::to_string(i), new_block_iters[i]->var.dtype()));
   }
 
   // Make new block realize
diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc
index 7a720fe3eae2..e91c5d142c04 100644
--- a/src/tir/schedule/transform.cc
+++ b/src/tir/schedule/transform.cc
@@ -359,14 +359,25 @@ void BlockBufferAccessSimplifier::SimplifyAccessRegion(Array<BufferRegion>* old_
   auto fmutate = [this](const BufferRegion& buffer_region) {
     std::vector<Range> new_buffer_region;
     for (const auto& range : buffer_region->region) {
-      new_buffer_region.push_back(Range::FromMinExtent(analyzer_->Simplify(range->min),
-                                                       analyzer_->Simplify(range->extent)));
+      if (is_one(range->extent) && range->min->IsInstance<VarNode>()) {
+        new_buffer_region.push_back(Range::FromMinExtent(
+            SimplifyNonTrivialExpr(range->min, analyzer_), make_const(range->min.dtype(), 1)));
+      } else {
+        new_buffer_region.push_back(
+            Range::FromMinExtent(SimplifyNonTrivialExpr(range->min, analyzer_),
+                                 SimplifyNonTrivialExpr(range->extent, analyzer_)));
+      }
     }
     return BufferRegion(buffer_region->buffer, new_buffer_region);
   };
   (*old_access_regions).MutateByApply(fmutate);
 }
 
+void BlockBufferAccessSimplifier::SimplifyBufferIndices(Array<PrimExpr>* indices) {
+  (*indices).MutateByApply(
+      [this](const PrimExpr& expr) { return SimplifyNonTrivialExpr(expr, analyzer_); });
+}
+
 Stmt BlockBufferAccessSimplifier::VisitStmt_(const BlockNode* op) {
   Block block = Downcast<Block>(arith::IRMutatorWithAnalyzer::VisitStmt_(op));
   auto* n = block.CopyOnWrite();
@@ -376,13 +387,15 @@ Stmt BlockBufferAccessSimplifier::VisitStmt_(const BlockNode* op) {
 }
 
 Stmt BlockBufferAccessSimplifier::VisitStmt_(const BufferStoreNode* op) {
-  auto node = Downcast<BufferStore>(arith::IRMutatorWithAnalyzer::VisitStmt_(op));
-  return VisitBufferAccess(std::move(node));
+  BufferStore node = Downcast<BufferStore>(arith::IRMutatorWithAnalyzer::VisitStmt_(op));
+  SimplifyBufferIndices(&node.CopyOnWrite()->indices);
+  return std::move(node);
 }
 
 PrimExpr BlockBufferAccessSimplifier::VisitExpr_(const BufferLoadNode* op) {
-  auto node = Downcast<BufferLoad>(arith::IRMutatorWithAnalyzer::VisitExpr_(op));
-  return VisitBufferAccess(std::move(node));
+  BufferLoad node = Downcast<BufferLoad>(arith::IRMutatorWithAnalyzer::VisitExpr_(op));
+  SimplifyBufferIndices(&node.CopyOnWrite()->indices);
+  return std::move(node);
 }
 
 }  // namespace tir
diff --git a/src/tir/schedule/transform.h b/src/tir/schedule/transform.h
index 2bba13e2bd1c..3593d6b9a444 100644
--- a/src/tir/schedule/transform.h
+++ b/src/tir/schedule/transform.h
@@ -226,16 +226,11 @@ class BlockBufferAccessSimplifier : public arith::IRMutatorWithAnalyzer {
   using IRMutatorWithAnalyzer::VisitStmt_;
 
   void SimplifyAccessRegion(Array<BufferRegion>* old_access_regions);
+  void SimplifyBufferIndices(Array<PrimExpr>* indices);
+
   Stmt VisitStmt_(const BlockNode* op) final;
   Stmt VisitStmt_(const BufferStoreNode* op) final;
   PrimExpr VisitExpr_(const BufferLoadNode* op) final;
-
-  template <typename Node>
-  Node VisitBufferAccess(Node node) {
-    node.CopyOnWrite()->indices.MutateByApply(
-        [this](const PrimExpr& expr) { return analyzer_->Simplify(expr); });
-    return node;
-  }
 };
 
 }  // namespace tir
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
index f7a5ce997edf..a53c1062b98d 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
@@ -365,30 +365,30 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3,
                 T.reads(inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1])
                 T.writes(PadInput[i0_1, i1_1, i2_1, i3_1])
                 PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 17 and 1 <= i2_1 and i2_1 < 17, inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float16(0), dtype="float16")
-        for ax0_0_ax1_0_0_ax2_0_0_fused in T.thread_binding(2, thread="blockIdx.y"):
-            for ax0_1_ax1_0_1_ax2_0_1_fused in T.thread_binding(16, thread="blockIdx.x"):
-                for ax0_2_ax1_0_2_ax2_0_2_fused in T.thread_binding(1, thread="threadIdx.y"):
-                    for ax3_0_0 in T.serial(1):
+        for ax0_0_0_ax1_0_0_fused in T.thread_binding(2, thread="blockIdx.y"):
+            for ax0_0_1_ax1_0_1_fused in T.thread_binding(16, thread="blockIdx.x"):
+                for ax0_0_2_ax1_0_2_fused in T.thread_binding(1, thread="threadIdx.y"):
+                    for ax2_0_0 in T.serial(1):
                         for ax0_ax1_fused in T.serial(4608):
                             with T.block("PadInput_reindex_shared"):
-                                v0 = T.axis.spatial(256, ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0_ax1_fused // 288)
+                                v0 = T.axis.spatial(256, ax0_0_1_ax1_0_1_fused * 16 + ax0_ax1_fused // 288)
                                 v1 = T.axis.spatial(288, ax0_ax1_fused % 288)
-                                T.reads(PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32])
+                                T.reads(PadInput[v0 // 256, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32])
                                 T.writes(PadInput_reindex_shared[v0, v1])
                                 T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":2})
-                                PadInput_reindex_shared[v0, v1] = PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32]
+                                PadInput_reindex_shared[v0, v1] = PadInput[v0 // 256, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32]
                         for ax0_ax1_fused in T.serial(4608):
                             with T.block("weight_reindex_shared"):
                                 v0 = T.axis.spatial(288, ax0_ax1_fused // 16)
-                                v1 = T.axis.spatial(32, ax0_0_ax1_0_0_ax2_0_0_fused * 16 + ax0_ax1_fused % 16)
+                                v1 = T.axis.spatial(32, ax0_0_0_ax1_0_0_fused * 16 + ax0_ax1_fused % 16)
                                 T.reads(weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1])
                                 T.writes(weight_reindex_shared[v0, v1])
                                 T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8})
                                 weight_reindex_shared[v0, v1] = weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1]
-                        for ax3_0_1 in T.serial(18):
+                        for ax2_0_1 in T.serial(18):
                             for ax0_0, ax1_0 in T.grid(1, 1):
                                 with T.block("PadInput_reindex_shared_wmma.matrix_a_o"):
-                                    v0_o, v1_o = T.axis.remap("SS", [ax0_1_ax1_0_1_ax2_0_1_fused, ax3_0_1])
+                                    v0_o, v1_o = T.axis.remap("SS", [ax0_0_1_ax1_0_1_fused, ax2_0_1])
                                     T.reads(PadInput_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
                                     T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
                                     T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"})
@@ -400,7 +400,7 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3,
                                             PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
                             for ax0_0, ax1_0 in T.grid(1, 1):
                                 with T.block("weight_reindex_shared_wmma.matrix_b_o"):
-                                    v0_o, v1_o = T.axis.remap("SS", [ax3_0_1, ax0_0_ax1_0_0_ax2_0_0_fused])
+                                    v0_o, v1_o = T.axis.remap("SS", [ax2_0_1, ax0_0_0_ax1_0_0_fused])
                                     T.reads(weight_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
                                     T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
                                     T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"})
@@ -410,32 +410,31 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3,
                                             T.reads(weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                             T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                             weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
-                            for ax0_3, ax1_0_3, ax2_0_3, ax3_0_2, ax0_4, ax1_0_4, ax2_0_4 in T.grid(1, 1, 1, 1, 1, 1, 1):
+                            for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 1, 1, 1, 1):
                                 with T.block("conv2d_nhwc_o"):
-                                    v0 = T.axis.spatial(1, 0)
-                                    v1_o = T.axis.spatial(16, ax1_0_4 + ax0_1_ax1_0_1_ax2_0_1_fused + ax1_0_3)
-                                    v2_o = T.axis.spatial(2, ax0_0_ax1_0_0_ax2_0_0_fused + ax2_0_3 + ax2_0_4)
-                                    v3_o = T.axis.reduce(18, ax3_0_0 * 18 + ax3_0_1 + ax3_0_2)
-                                    T.reads(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 : v1_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], weight_reindex_shared_wmma_matrix_b[v3_o * 16 : v3_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16])
-                                    T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16])
+                                    v0_o = T.axis.spatial(16, ax0_0_4 + ax0_0_1_ax1_0_1_fused + ax0_0_3)
+                                    v1_o = T.axis.spatial(2, ax0_0_0_ax1_0_0_fused + ax1_0_3 + ax1_0_4)
+                                    v2_o = T.axis.reduce(18, ax2_0_0 * 18 + ax2_0_1 + ax2_0_2)
+                                    T.reads(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], weight_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
                                     T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
                                     with T.init():
-                                        for ax1_1, ax2_1 in T.grid(16, 16):
+                                        for ax0_1, ax1_1 in T.grid(16, 16):
                                             with T.block("conv2d_nhwc_init"):
-                                                v1_i_init, v2_i_init = T.axis.remap("SS", [ax1_1, ax2_1])
+                                                v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1])
                                                 T.reads()
-                                                T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init])
-                                                conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init] = T.float32(0)
-                                    for ax1_1, ax2_1, ax3_1 in T.grid(16, 16, 16):
+                                                T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init])
+                                                conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0)
+                                    for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16):
                                         with T.block("conv2d_nhwc"):
-                                            v1_i, v2_i, v3_i = T.axis.remap("SSR", [ax1_1, ax2_1, ax3_1])
-                                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i], PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i])
-                                            T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i])
+                                            v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1])
+                                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], weight_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i])
+                                            T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                             T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
-                                            conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] + T.cast(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], "float32") * T.cast(weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i], "float32")
+                                            conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(weight_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32")
                     for ax0_0, ax1_0 in T.grid(1, 1):
                         with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"):
-                            v0_o, v1_o = T.axis.remap("SS", [ax0_1_ax1_0_1_ax2_0_1_fused, ax0_0_ax1_0_0_ax2_0_0_fused])
+                            v0_o, v1_o = T.axis.remap("SS", [ax0_0_1_ax1_0_1_fused, ax0_0_0_ax1_0_0_fused])
                             T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
                             T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
                             T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"})
@@ -447,15 +446,14 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3,
                                     conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
                 for ax0, ax1 in T.grid(16, 16):
                     with T.block("conv2d_nhwc_reindex_shared"):
-                        v0 = T.axis.spatial(256, ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0)
-                        v1 = T.axis.spatial(32, ax0_0_ax1_0_0_ax2_0_0_fused * 16 + ax1)
+                        v0 = T.axis.spatial(256, ax0_0_1_ax1_0_1_fused * 16 + ax0)
+                        v1 = T.axis.spatial(32, ax0_0_0_ax1_0_0_fused * 16 + ax1)
                         T.reads(conv2d_nhwc_reindex_shared[v0, v1])
-                        T.writes(conv2d_nhwc[0, v0 // 16, v0 % 16, v1])
+                        T.writes(conv2d_nhwc[v0 // 256, v0 // 16, v0 % 16, v1])
                         T.block_attr({"meta_schedule.cooperative_fetch":3})
-                        conv2d_nhwc[0, v0 // 16, v0 % 16, v1] = conv2d_nhwc_reindex_shared[v0, v1]
+                        conv2d_nhwc[v0 // 256, v0 // 16, v0 % 16, v1] = conv2d_nhwc_reindex_shared[v0, v1]
     # fmt: on
     decision_0 = [
-        ("SamplePerfectTile", [1, 1, 1, 1, 1]),
         ("SamplePerfectTile", [1, 16, 1, 1, 1]),
         ("SamplePerfectTile", [2, 1, 1, 1, 1]),
         ("SamplePerfectTile", [1, 18, 1]),
@@ -490,145 +488,8 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3,
         expected_decisions=[decision_0],
     )
 
-
-def test_conv2d_more_intrin():
-    # test adding inapplicable tensor intrinsics doesn't change the search space
-    # fmt: off
-    @T.prim_func
-    def conv2d_more_intrin_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3, 3, 32, 32), "float16"], conv2d_nhwc: T.Buffer[(1, 16, 16, 32), "float32"]) -> None:
-        # function attr dict
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        # body
-        # with T.block("root")
-        PadInput = T.alloc_buffer([1, 18, 18, 32], dtype="float16")
-        conv2d_nhwc_reindex_shared = T.alloc_buffer([256, 32], dtype="float32", scope="shared")
-        conv2d_nhwc_reindex_shared_wmma_accumulator = T.alloc_buffer([256, 32], dtype="float32", scope="wmma.accumulator")
-        PadInput_reindex_shared = T.alloc_buffer([256, 288], dtype="float16", scope="shared")
-        weight_reindex_shared = T.alloc_buffer([288, 32], dtype="float16", scope="shared")
-        PadInput_reindex_shared_wmma_matrix_a = T.alloc_buffer([256, 288], dtype="float16", scope="wmma.matrix_a")
-        weight_reindex_shared_wmma_matrix_b = T.alloc_buffer([288, 32], dtype="float16", scope="wmma.matrix_b")
-        for i0, i1, i2, i3 in T.grid(1, 18, 18, 32):
-            with T.block("PadInput"):
-                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
-                T.reads(inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1])
-                T.writes(PadInput[i0_1, i1_1, i2_1, i3_1])
-                PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 17 and 1 <= i2_1 and i2_1 < 17, inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float16(0), dtype="float16")
-        for ax0_0_ax1_0_0_ax2_0_0_fused in T.thread_binding(4, thread="blockIdx.y"):
-            for ax0_1_ax1_0_1_ax2_0_1_fused in T.thread_binding(4, thread="blockIdx.x"):
-                for ax0_2_ax1_0_2_ax2_0_2_fused in T.thread_binding(1, thread="threadIdx.y"):
-                    for ax3_0_0 in T.serial(3):
-                        for ax0_ax1_fused in T.serial(1536):
-                            with T.block("PadInput_reindex_shared"):
-                                v0 = T.axis.spatial(256, ax0_0_ax1_0_0_ax2_0_0_fused * 64 + ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0_ax1_fused // 96)
-                                v1 = T.axis.spatial(288, ax3_0_0 * 96 + ax0_ax1_fused % 96)
-                                T.reads(PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32])
-                                T.writes(PadInput_reindex_shared[v0, v1])
-                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8})
-                                PadInput_reindex_shared[v0, v1] = PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32]
-                        for ax0_ax1_fused in T.serial(3072):
-                            with T.block("weight_reindex_shared"):
-                                v0 = T.axis.spatial(288, ax3_0_0 * 96 + ax0_ax1_fused // 32)
-                                v1 = T.axis.spatial(32, ax0_ax1_fused % 32)
-                                T.reads(weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1])
-                                T.writes(weight_reindex_shared[v0, v1])
-                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8})
-                                weight_reindex_shared[v0, v1] = weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1]
-                        for ax3_0_1 in T.serial(2):
-                            for ax0_0, ax1_0 in T.grid(1, 3):
-                                with T.block("PadInput_reindex_shared_wmma.matrix_a_o"):
-                                    v0_o = T.axis.spatial(16, ax0_0_ax1_0_0_ax2_0_0_fused * 4 + ax0_1_ax1_0_1_ax2_0_1_fused)
-                                    v1_o = T.axis.spatial(18, ax3_0_0 * 6 + ax3_0_1 * 3 + ax1_0)
-                                    T.reads(PadInput_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
-                                    T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
-                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"})
-                                    for ax0_1, ax1_1 in T.grid(16, 16):
-                                        with T.block("PadInput_reindex_shared_wmma.matrix_a"):
-                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
-                                            T.reads(PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
-                                            T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
-                                            PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
-                            for ax0_0, ax1_0 in T.grid(3, 2):
-                                with T.block("weight_reindex_shared_wmma.matrix_b_o"):
-                                    v0_o = T.axis.spatial(18, ax3_0_0 * 6 + ax3_0_1 * 3 + ax0_0)
-                                    v1_o = T.axis.spatial(2, ax1_0)
-                                    T.reads(weight_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
-                                    T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
-                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"})
-                                    for ax0_1, ax1_1 in T.grid(16, 16):
-                                        with T.block("weight_reindex_shared_wmma.matrix_b"):
-                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
-                                            T.reads(weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
-                                            T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
-                                            weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
-                            for ax0_3, ax1_0_3, ax2_0_3, ax3_0_2, ax0_4, ax1_0_4, ax2_0_4 in T.grid(1, 1, 2, 3, 1, 1, 1):
-                                with T.block("conv2d_nhwc_o"):
-                                    v0 = T.axis.spatial(1, 0)
-                                    v1_o = T.axis.spatial(16, ax1_0_4 + ax0_0_ax1_0_0_ax2_0_0_fused * 4 + ax0_1_ax1_0_1_ax2_0_1_fused + ax1_0_3)
-                                    v2_o = T.axis.spatial(2, ax2_0_4 + ax2_0_3)
-                                    v3_o = T.axis.reduce(18, ax3_0_0 * 6 + ax3_0_1 * 3 + ax3_0_2)
-                                    T.reads(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 : v1_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], weight_reindex_shared_wmma_matrix_b[v3_o * 16 : v3_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16])
-                                    T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16])
-                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
-                                    with T.init():
-                                        for ax1_1, ax2_1 in T.grid(16, 16):
-                                            with T.block("conv2d_nhwc_init"):
-                                                v1_i_init, v2_i_init = T.axis.remap("SS", [ax1_1, ax2_1])
-                                                T.reads()
-                                                T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init])
-                                                conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init] = T.float32(0)
-                                    for ax1_1, ax2_1, ax3_1 in T.grid(16, 16, 16):
-                                        with T.block("conv2d_nhwc"):
-                                            v1_i, v2_i, v3_i = T.axis.remap("SSR", [ax1_1, ax2_1, ax3_1])
-                                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i], PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i])
-                                            T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i])
-                                            T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
-                                            conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] + T.cast(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], "float32") * T.cast(weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i], "float32")
-                    for ax0_0, ax1_0 in T.grid(1, 2):
-                        with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"):
-                            v0_o = T.axis.spatial(16, ax0_0_ax1_0_0_ax2_0_0_fused * 4 + ax0_1_ax1_0_1_ax2_0_1_fused)
-                            v1_o = T.axis.spatial(2, ax1_0)
-                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
-                            T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
-                            T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"})
-                            for ax0_1, ax1_1 in T.grid(16, 16):
-                                with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator"):
-                                    v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
-                                    T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
-                                    T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
-                                    conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
-                for ax0, ax1 in T.grid(16, 32):
-                    with T.block("conv2d_nhwc_reindex_shared"):
-                        v0 = T.axis.spatial(256, ax0_0_ax1_0_0_ax2_0_0_fused * 64 + ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0)
-                        v1 = T.axis.spatial(32, ax1)
-                        T.reads(conv2d_nhwc_reindex_shared[v0, v1])
-                        T.writes(conv2d_nhwc[0, v0 // 16, v0 % 16, v1])
-                        T.block_attr({"meta_schedule.cooperative_fetch":3})
-                        conv2d_nhwc[0, v0 // 16, v0 % 16, v1] = conv2d_nhwc_reindex_shared[v0, v1]
-    # fmt: on
-    decision_0 = [
-        ("SamplePerfectTile", [1, 1, 1, 1, 1]),
-        ("SamplePerfectTile", [4, 4, 1, 1, 1]),
-        ("SamplePerfectTile", [1, 1, 1, 2, 1]),
-        ("SamplePerfectTile", [3, 2, 3]),
-        ("SampleCategorical", 2),
-        ("SampleCategorical", 3),
-        ("SampleCategorical", 3),
-    ]
-
-    mod = te.create_prim_func(
-        te_workload.conv2d_nhwc(
-            N=1,
-            H=16,
-            W=16,
-            CI=32,
-            CO=32,
-            kernel_size=3,
-            stride=1,
-            padding=1,
-            in_dtype="float16",
-            out_dtype="float32",
-        )
-    )
+    # Test adding inapplicable tensor intrinsics doesn't change the search space
+    # This test case uses the same workload, decision and the expected sketch as above
     actual = ms.TuneContext(
         mod=mod,
         target=tvm.target.Target("cuda"),
@@ -643,7 +504,7 @@ def conv2d_more_intrin_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T
     check_sketches(
         mod,
         sketches=actual,
-        expected_mods=[conv2d_more_intrin_0],
+        expected_mods=[conv2d_0],
         expected_decisions=[decision_0],
     )
 
@@ -1088,5 +949,154 @@ def padded_matmul_relu_0(A: T.Buffer[(127, 127), "float16"], B: T.Buffer[(127, 1
     )
 
 
+def test_conv_1x1():
+    # fmt: off
+    @T.prim_func
+    def conv2d_1x1_0(inputs: T.Buffer[(1, 16, 16, 64), "float16"], weight: T.Buffer[(1, 1, 64, 64), "float16"], conv2d_nhwc: T.Buffer[(1, 16, 16, 64), "float32"]) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        conv2d_nhwc_reindex_shared = T.alloc_buffer([256, 64], dtype="float32", scope="shared")
+        conv2d_nhwc_reindex_shared_wmma_accumulator = T.alloc_buffer([256, 64], dtype="float32", scope="wmma.accumulator")
+        PadInput_reindex_shared = T.alloc_buffer([256, 64], dtype="float16", scope="shared")
+        weight_reindex_shared = T.alloc_buffer([1, 1, 64, 64], dtype="float16", scope="shared")
+        PadInput_reindex_shared_wmma_matrix_a = T.alloc_buffer([256, 64], dtype="float16", scope="wmma.matrix_a")
+        weight_reindex_shared_wmma_matrix_b = T.alloc_buffer([1, 1, 64, 64], dtype="float16", scope="wmma.matrix_b")
+        for ax2_0_0_ax3_0_0_fused in T.thread_binding(16, thread="blockIdx.y"):
+            for ax2_0_1_ax3_0_1_fused in T.thread_binding(2, thread="blockIdx.x"):
+                for ax2_0_2_ax3_0_2_fused in T.thread_binding(2, thread="threadIdx.y"):
+                    for ax0_0, ax1_0, ax4_0_0 in T.grid(1, 1, 1):
+                        for ax0_ax1_fused in T.serial(1024):
+                            with T.block("PadInput_reindex_shared"):
+                                v0 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused // 2 * 32 + ax2_0_1_ax3_0_1_fused * 16 + ax0_ax1_fused // 64)
+                                v1 = T.axis.spatial(64, ax0_ax1_fused % 64)
+                                T.reads(inputs[v0 // 256, v0 // 16, v0 % 16, v1])
+                                T.writes(PadInput_reindex_shared[v0, v1])
+                                T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":1})
+                                PadInput_reindex_shared[v0, v1] = inputs[v0 // 256, v0 // 16, v0 % 16, v1]
+                        for ax0_ax1_ax2_ax3_fused in T.serial(2048):
+                            with T.block("weight_reindex_shared"):
+                                v0 = T.axis.spatial(1, 0)
+                                v1 = T.axis.spatial(1, 0)
+                                v2 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused // 32)
+                                v3 = T.axis.spatial(64, ax2_0_0_ax3_0_0_fused % 2 * 32 + ax0_ax1_ax2_ax3_fused % 32)
+                                T.reads(weight[v0, v1, v2, v3])
+                                T.writes(weight_reindex_shared[v0, v1, v2, v3])
+                                T.block_attr({"buffer_dim_align":[[0, 2, 32, 8]], "meta_schedule.cooperative_fetch":4})
+                                weight_reindex_shared[v0, v1, v2, v3] = weight[v0, v1, v2, v3]
+                        for ax0_1, ax1_1, ax4_0_1 in T.grid(1, 1, 1):
+                            for ax0_0_1, ax1_0_1 in T.grid(1, 4):
+                                with T.block("PadInput_reindex_shared_wmma.matrix_a_o"):
+                                    v0_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused // 2 * 2 + ax2_0_1_ax3_0_1_fused)
+                                    v1_o = T.axis.spatial(4, ax1_0_1)
+                                    T.reads(PadInput_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"})
+                                    for ax0_1_1, ax1_1_1 in T.grid(16, 16):
+                                        with T.block("PadInput_reindex_shared_wmma.matrix_a"):
+                                            v0_i, v1_i = T.axis.remap("SS", [ax0_1_1, ax1_1_1])
+                                            T.reads(PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                            PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                            for ax0, ax1, ax2_0, ax3_0 in T.grid(1, 1, 4, 1):
+                                with T.block("weight_reindex_shared_wmma.matrix_b_o"):
+                                    v0 = T.axis.spatial(1, 0)
+                                    v1 = T.axis.spatial(1, 0)
+                                    v2_o = T.axis.spatial(4, ax2_0)
+                                    v3_o = T.axis.spatial(4, ax2_0_0_ax3_0_0_fused % 2 * 2 + ax2_0_2_ax3_0_2_fused)
+                                    T.reads(weight_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                    T.writes(weight_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"})
+                                    for ax2_1, ax3_1 in T.grid(16, 16):
+                                        with T.block("weight_reindex_shared_wmma.matrix_b"):
+                                            v2_i, v3_i = T.axis.remap("SS", [ax2_1, ax3_1])
+                                            T.reads(weight_reindex_shared[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i])
+                                            T.writes(weight_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i])
+                                            weight_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i] = weight_reindex_shared[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i]
+                            for ax2_0_3, ax3_0_3, ax0_2, ax1_2, ax4_0_2, ax2_0_4, ax3_0_4 in T.grid(1, 1, 1, 1, 4, 1, 1):
+                                with T.block("conv2d_nhwc_o"):
+                                    v0 = T.axis.reduce(1, 0)
+                                    v1 = T.axis.reduce(1, 0)
+                                    v2_o = T.axis.spatial(16, ax2_0_4 + ax2_0_0_ax3_0_0_fused // 2 * 2 + ax2_0_1_ax3_0_1_fused + ax2_0_3)
+                                    v3_o = T.axis.spatial(4, ax3_0_4 + ax2_0_0_ax3_0_0_fused % 2 * 2 + ax2_0_2_ax3_0_2_fused + ax3_0_3)
+                                    v4_o = T.axis.reduce(4, ax4_0_0 * 4 + ax4_0_1 * 4 + ax4_0_2)
+                                    T.reads(PadInput_reindex_shared_wmma_matrix_a[v2_o * 16 : v2_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], weight_reindex_shared_wmma_matrix_b[v0, v1, v4_o * 16 : v4_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                    T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                    T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1})
+                                    with T.init():
+                                        for ax2_1, ax3_1 in T.grid(16, 16):
+                                            with T.block("conv2d_nhwc_init"):
+                                                v2_i_init, v3_i_init = T.axis.remap("SS", [ax2_1, ax3_1])
+                                                T.reads()
+                                                T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i_init, v3_o * 16 + v3_i_init])
+                                                conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i_init, v3_o * 16 + v3_i_init] = T.float32(0)
+                                    for ax2_1, ax3_1, ax4_1 in T.grid(16, 16, 16):
+                                        with T.block("conv2d_nhwc"):
+                                            v2_i, v3_i, v4_i = T.axis.remap("SSR", [ax2_1, ax3_1, ax4_1])
+                                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i], PadInput_reindex_shared_wmma_matrix_a[v2_o * 16 + v2_i, v4_o * 16 + v4_i], weight_reindex_shared_wmma_matrix_b[v0, v1, v4_o * 16 + v4_i, v3_o * 16 + v3_i])
+                                            T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i])
+                                            T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                            conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] + T.cast(PadInput_reindex_shared_wmma_matrix_a[v2_o * 16 + v2_i, v4_o * 16 + v4_i], "float32") * T.cast(weight_reindex_shared_wmma_matrix_b[v0, v1, v4_o * 16 + v4_i, v3_o * 16 + v3_i], "float32")
+                    for ax0_0, ax1_0 in T.grid(1, 1):
+                        with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"):
+                            v0_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused // 2 * 2 + ax2_0_1_ax3_0_1_fused)
+                            v1_o = T.axis.spatial(4, ax2_0_0_ax3_0_0_fused % 2 * 2 + ax2_0_2_ax3_0_2_fused)
+                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"})
+                            for ax0_1, ax1_1 in T.grid(16, 16):
+                                with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator"):
+                                    v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                    T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                    conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                for ax0, ax1 in T.grid(16, 32):
+                    with T.block("conv2d_nhwc_reindex_shared"):
+                        v0 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused // 2 * 32 + ax2_0_1_ax3_0_1_fused * 16 + ax0)
+                        v1 = T.axis.spatial(64, ax2_0_0_ax3_0_0_fused % 2 * 32 + ax1)
+                        T.reads(conv2d_nhwc_reindex_shared[v0, v1])
+                        T.writes(conv2d_nhwc[v0 // 256, v0 // 16, v0 % 16, v1])
+                        T.block_attr({"meta_schedule.cooperative_fetch":2})
+                        conv2d_nhwc[v0 // 256, v0 // 16, v0 % 16, v1] = conv2d_nhwc_reindex_shared[v0, v1]
+    # fmt: on
+
+    decision_0 = [
+        ("SamplePerfectTile", [1, 1, 1]),
+        ("SamplePerfectTile", [1, 1, 1]),
+        ("SamplePerfectTile", [8, 2, 1, 1, 1]),
+        ("SamplePerfectTile", [2, 1, 2, 1, 1]),
+        ("SamplePerfectTile", [1, 1, 4]),
+        ("SampleCategorical", 1),
+        ("SampleCategorical", 0),
+        ("SampleCategorical", 2),
+    ]
+
+    mod = te.create_prim_func(
+        te_workload.conv2d_nhwc(
+            1,
+            16,
+            16,
+            64,
+            64,
+            1,
+            1,
+            0,
+            in_dtype="float16",
+            out_dtype="float32",
+        )
+    )
+    actual = ms.TuneContext(
+        mod=mod,
+        target=tvm.target.Target("cuda"),
+        space_generator=ms.space_generator.PostOrderApply(),
+        sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="shared")]
+        + get_rules("cuda", ms.schedule_rule.AutoInline),
+    ).generate_design_space()
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[conv2d_1x1_0],
+        expected_decisions=[decision_0],
+    )
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py
index 807420ece3ba..e0667da6fe92 100644
--- a/tests/python/unittest/test_tir_schedule_analysis.py
+++ b/tests/python/unittest/test_tir_schedule_analysis.py
@@ -360,8 +360,7 @@ def test_get_auto_tensorize_mapping_info_conv2d_unit_batch():
         conv2d,
         "conv2d_nhwc",
         WMMA_SYNC_16x16x16_f16f16f32_INTRIN,
-        # unit iter is not mapped
-        lambda n, h, w, c, rh, rw, rc: (n, h * 16 + w, c, rh * 192 + rw * 64 + rc),
+        lambda n, h, w, c, rh, rw, rc: (n * 256 + h * 16 + w, c, rh * 192 + rw * 64 + rc),
     )
 
 
@@ -388,7 +387,7 @@ def test_get_auto_tensorize_mapping_info_batch_matmul(b, m, n, k):
                 k,
             ),
         ),
-        (1, 32, 32, None),
+        (1, 32, 32, lambda n, m, k: (n, m, k)),
     ],
 )
 def test_get_auto_tensorize_mapping_info_matmul(n, m, k, expected):
diff --git a/tests/python/unittest/test_tir_schedule_reindex.py b/tests/python/unittest/test_tir_schedule_reindex.py
index 60dcefba631a..53bc726ceaf3 100644
--- a/tests/python/unittest/test_tir_schedule_reindex.py
+++ b/tests/python/unittest/test_tir_schedule_reindex.py
@@ -76,6 +76,37 @@ def conv2d_nhwc(
             )
 
 
+@T.prim_func
+def conv2d_nhwc_reindex_data(
+    Input: T.Buffer[(1, 224, 224, 3), "float32"],
+    Weight: T.Buffer[(7, 7, 3, 64), "float32"],
+    Conv2d_nhwc: T.Buffer[(1, 112, 112, 64), "float32"],
+) -> None:
+    PadInput = T.alloc_buffer([1, 230, 230, 3], dtype="float32")
+    ReindexInput = T.alloc_buffer([1, 112, 112, 7, 7, 3], dtype="float32")
+    for i0, i1, i2, i3 in T.grid(1, 230, 230, 3):
+        with T.block("PadInput"):
+            i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+            PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(
+                ((((i1_1 >= 3) and (i1_1 < 227)) and (i2_1 >= 3)) and (i2_1 < 227)),
+                Input[i0_1, (i1_1 - 3), (i2_1 - 3), i3_1],
+                T.float32(0),
+                dtype="float32",
+            )
+    for i0, i1, i2, i3, i4, i5 in T.grid(1, 112, 112, 7, 7, 3):
+        with T.block("ReindexInput"):
+            n, h, w, rh, rw, rc = T.axis.remap("SSSSSS", [i0, i1, i2, i3, i4, i5])
+            ReindexInput[n, h, w, rh, rw, rc] = PadInput[n, ((h * 2) + rh), ((w * 2) + rw), rc]
+    for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 112, 112, 64, 7, 7, 3):
+        with T.block("conv2d_nhwc"):
+            n, h, w, co, rh, rw, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6])
+            with T.init():
+                Conv2d_nhwc[n, h, w, co] = T.float32(0)
+            Conv2d_nhwc[n, h, w, co] = Conv2d_nhwc[n, h, w, co] + (
+                ReindexInput[n, h, w, rh, rw, rc] * Weight[rh, rw, rc, co]
+            )
+
+
 @T.prim_func
 def conv2d_nhwc_reindex_weight(
     var_inputs: T.handle, var_weight: T.handle, var_conv2d_nhwc: T.handle
@@ -208,6 +239,45 @@ def mixed_dtype_reindex_write(
             T_matmul_NT[v0, v1] = T_matmul_NT_reindex[v0, v1]
 
 
+@T.prim_func
+def matmul_unit_dim(
+    A: T.Buffer[(1, 512), "float32"],
+    B: T.Buffer[(512, 1), "float32"],
+    C: T.Buffer[(1, 1), "float32"],
+) -> None:
+    for i0, i1, i2 in T.grid(1, 1, 512):
+        with T.block("matmul"):
+            i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+            T.reads(C[i, j], A[i, k], B[k, j])
+            T.writes(C[i, j])
+            with T.init():
+                C[i, j] = T.float32(0)
+            C[i, j] = C[i, j] + A[i, k] * B[k, j]
+
+
+@T.prim_func
+def matmul_unit_dim_reindex_write(
+    A: T.Buffer[(1, 512), "float32"],
+    B: T.Buffer[(512, 1), "float32"],
+    C: T.Buffer[(1, 1), "float32"],
+) -> None:
+    C_reindex = T.alloc_buffer([1, 1], dtype="float32")
+    for i0, i1, i2 in T.grid(1, 1, 512):
+        with T.block("matmul"):
+            i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+            T.reads(C_reindex[i, j], A[i, k], B[k, j])
+            T.writes(C_reindex[i, j])
+            with T.init():
+                C_reindex[i, j] = T.float32(0)
+            C_reindex[i, j] = C_reindex[i, j] + A[i, k] * B[k, j]
+    for i0, i1 in T.grid(1, 1):
+        with T.block("C_reindex"):
+            v0, v1 = T.axis.remap("SS", [i0, i1])
+            T.reads(C_reindex[v0, v1])
+            T.writes(C[v0, v1])
+            C[v0, v1] = C_reindex[v0, v1]
+
+
 use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True})
 use_buffer_name = tvm.testing.parameter(by_dict={"buffer_index": False, "buffer_name": True})
 
@@ -221,7 +291,7 @@ def test_reindex_read_basic(use_block_name, use_buffer_name):
     verify_trace_roundtrip(sch=sch, mod=transpose_elementwise)
 
 
-def test_conv2d_reindex_read(use_block_name, use_buffer_name):
+def test_conv2d_reindex_weight(use_block_name, use_buffer_name):
     sch = tir.Schedule(conv2d_nhwc)
     block = "conv2d_nhwc" if use_block_name else sch.get_block("conv2d_nhwc")
     buf = "Weight" if use_buffer_name else ("read", 1)
@@ -230,6 +300,15 @@ def test_conv2d_reindex_read(use_block_name, use_buffer_name):
     verify_trace_roundtrip(sch=sch, mod=conv2d_nhwc)
 
 
+def test_conv2d_reindex_data(use_block_name, use_buffer_name):
+    sch = tir.Schedule(conv2d_nhwc)
+    block = "conv2d_nhwc" if use_block_name else sch.get_block("conv2d_nhwc")
+    buf = "PadInput" if use_buffer_name else ("read", 0)
+    sch.reindex(block, buf)
+    tvm.ir.assert_structural_equal(conv2d_nhwc_reindex_data, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=conv2d_nhwc)
+
+
 def test_matmul_reindex_write(use_block_name, use_buffer_name):
     sch = tir.Schedule(matmul)
     block = "matmul" if use_block_name else sch.get_block("matmul")
@@ -256,5 +335,14 @@ def test_reindex_mixed_dtype(use_block_name, use_buffer_name):
     verify_trace_roundtrip(sch=sch, mod=mixed_dtype)
 
 
+def test_matmul_unit_dim_reindex_write(use_block_name, use_buffer_name):
+    sch = tir.Schedule(matmul_unit_dim)
+    block = "matmul" if use_block_name else sch.get_block("matmul")
+    buf = "C" if use_buffer_name else ("write", 0)
+    sch.reindex(block, buf)
+    tvm.ir.assert_structural_equal(matmul_unit_dim_reindex_write, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=matmul_unit_dim)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py
index 0b0146ee43fa..174e9eb25cc0 100644
--- a/tests/python/unittest/test_tir_schedule_transform_layout.py
+++ b/tests/python/unittest/test_tir_schedule_transform_layout.py
@@ -152,21 +152,27 @@ def conv2d_nhwc_transformed(
                 T.float32(0),
                 dtype="float32",
             )
-    for ax0, ax_1, ax_2 in T.grid(12544, 64, 147):
+    for ax0, ax1, ax2 in T.grid(12544, 64, 147):
         with T.block("conv2d_nhwc"):
-            bv0, bv1, bv2 = T.axis.remap("SSR", [ax0, ax_1, ax_2])
-            T.reads(
-                PadInput[0, bv0 // 112 * 2 + bv2 // 21, bv0 % 112 * 2 + bv2 % 21 // 3, bv2 % 3],
-                Weight[bv2 // 21, bv2 % 21 // 3, bv2 % 3, bv1],
-            )
-            T.writes(Conv2d_nhwc[0, bv0 // 112, bv0 % 112, bv1])
+            v0, v1, v2 = T.axis.remap("SSR", [ax0, ax1, ax2])
+            T.reads(PadInput[v0 // 12544, v0 // 112 * 2 + v2 // 21, v0 % 112 * 2 + v2 % 21 // 3, v2 % 3], Weight[v2 // 21, v2 % 21 // 3, v2 % 3, v1])
+            T.writes(Conv2d_nhwc[v0 // 12544, v0 // 112, v0 % 112, v1])
             with T.init():
-                Conv2d_nhwc[0, bv0 // 112, bv0 % 112, bv1] = T.float32(0)
-            Conv2d_nhwc[0, bv0 // 112, bv0 % 112, bv1] = (
-                Conv2d_nhwc[0, bv0 // 112, bv0 % 112, bv1]
-                + PadInput[0, bv0 // 112 * 2 + bv2 // 21, bv0 % 112 * 2 + bv2 % 21 // 3, bv2 % 3]
-                * Weight[bv2 // 21, bv2 % 21 // 3, bv2 % 3, bv1]
-            )
+                Conv2d_nhwc[v0 // 12544, v0 // 112, v0 % 112, v1] = T.float32(0)
+            Conv2d_nhwc[v0 // 12544, v0 // 112, v0 % 112, v1] = Conv2d_nhwc[v0 // 12544, v0 // 112, v0 % 112, v1] + PadInput[v0 // 12544, v0 // 112 * 2 + v2 // 21, v0 % 112 * 2 + v2 % 21 // 3, v2 % 3] * Weight[v2 // 21, v2 % 21 // 3, v2 % 3, v1]
+
+
+@T.prim_func
+def two_elementwise_unit_dim(A: T.Buffer[(1, 128), "float32"], C: T.Buffer[(1, 128), "float32"]) -> None:
+    B = T.alloc_buffer((1, 128), "float32")
+    for i, j in T.grid(1, 128):
+        with T.block("B"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            B[vi, vj] = A[vi, vj] * 2.0
+    for i, j in T.grid(1, 128):
+        with T.block("C"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            C[vi, vj] = B[vi, vj] + 1.0
 
 # pylint: enable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,unexpected-keyword-arg,too-many-nested-blocks
 # fmt: on
@@ -225,6 +231,24 @@ def test_two_elementwise_transform_output_buffer(use_block_name):
     verify_trace_roundtrip(sch=sch, mod=two_elementwise)
 
 
+def test_two_elementwise_unit_dim(use_block_name):
+    sch = tir.Schedule(two_elementwise_unit_dim, debug_mask="all")
+    index_map = lambda i, j: (i, j)
+
+    if use_block_name:
+        sch.transform_layout(
+            index_map=index_map,
+            block="B",
+            buffer="B",
+        )
+    else:
+        block = sch.get_block("B")
+        sch.transform_layout(block, ("write", 0), index_map)
+
+    tvm.ir.assert_structural_equal(two_elementwise_unit_dim, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=two_elementwise_unit_dim)
+
+
 def test_simplify():
     sch = tir.Schedule(two_elementwise, debug_mask="all")
 
@@ -312,6 +336,29 @@ def test_transform_block_layout_conv2d_nhwc(use_block_name):
     verify_trace_roundtrip(sch=sch, mod=conv2d_nhwc)
 
 
+def test_transform_block_layout_unit_dim(use_block_name):
+    sch = tir.Schedule(two_elementwise_unit_dim, debug_mask="all")
+    block = "B" if use_block_name else sch.get_block("B")
+    sch.transform_block_layout(block, lambda i, j: (j, i))
+
+    @T.prim_func
+    def two_elementwise_unit_dim_transformed(
+        A: T.Buffer[(1, 128), "float32"], C: T.Buffer[(1, 128), "float32"]
+    ) -> None:
+        B = T.alloc_buffer((1, 128), "float32")
+        for j, i in T.grid(128, 1):
+            with T.block("B"):
+                vj, vi = T.axis.remap("SS", [j, i])
+                B[vi, vj] = A[vi, vj] * 2.0
+        for i, j in T.grid(1, 128):
+            with T.block("C"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                C[vi, vj] = B[vi, vj] + 1.0
+
+    tvm.ir.assert_structural_equal(two_elementwise_unit_dim_transformed, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=two_elementwise_unit_dim)
+
+
 def test_transform_block_layout_fail_non_affine(use_block_name):
     sch = tir.Schedule(elementwise, debug_mask="all")
     block = "B" if use_block_name else sch.get_block("B")

From 59b945742d3a5f51a0f1f3e4638cc0ab356bc5ee Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Wed, 5 Oct 2022 17:03:26 -0700
Subject: [PATCH 307/704] [FIX,AUTOSCHEDULER,METASCHEDULE] Handle negative
 extents in featurization (#12990)

Both auto_scheduler and metaschedule featurization did not lower bound
loop extents by zero, leading to negative bounds. When multiplied with
element size integer underflow occurred before converting to floating
point.
---
 python/tvm/auto_scheduler/feature.py             |  8 ++++----
 src/auto_scheduler/feature.cc                    |  7 +++++--
 .../feature_extractor/per_store_feature.cc       |  4 +++-
 .../unittest/test_auto_scheduler_feature.py      | 11 +++++++++++
 ...hedule_feature_extractor_per_store_feature.py | 16 ++++++++++++++++
 5 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/python/tvm/auto_scheduler/feature.py b/python/tvm/auto_scheduler/feature.py
index 09d54a92fd64..491f8b7643b6 100644
--- a/python/tvm/auto_scheduler/feature.py
+++ b/python/tvm/auto_scheduler/feature.py
@@ -260,7 +260,7 @@ def features_from_primfunc(
     cache_line_bytes: int = 64,
     max_n_bufs: Optional[int] = None,
     log_scale: bool = False,
-) -> np.ndarray:
+) -> Optional[np.ndarray]:
     """Extract performance features from a PrimFunc.
 
     Parameters
@@ -284,7 +284,7 @@ def features_from_primfunc(
 
     Returns
     -------
-    np.ndarray
+    Optional[np.ndarray]
         Output features, one row per store into a unique buffer statement in `func`.
     """
     return _ffi_api.FeaturesFromPrimFunc(
@@ -297,7 +297,7 @@ def named_features_from_primfunc(
     cache_line_bytes: int = 64,
     max_n_bufs: Optional[int] = None,
     log_scale: bool = False,
-) -> Dict[str, np.ndarray]:
+) -> Optional[Dict[str, np.ndarray]]:
     """Extract performance features and associated names from a PrimFunc.
 
     Parameters
@@ -321,7 +321,7 @@ def named_features_from_primfunc(
 
     Returns
     -------
-    Dict[str, np.ndarray]
+    Optional[Dict[str, np.ndarray]]
         Mapping from feature name to features. One element per store into a
         unique buffer statement in `func`.
     """
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index e079018151a7..0b5a157c8813 100644
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -38,6 +38,7 @@
 #include <tvm/tir/transform.h>
 
 #include <algorithm>
+#include <cassert>
 #include <cmath>
 #include <numeric>
 #include <unordered_map>
@@ -880,6 +881,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
 
         ComputeRegion(acc.indices, &local_analyzer, &tmp_region);
         int64_t touched_size = ElementProduct(tmp_region);
+        touched_size = std::max<int64_t>(0, touched_size);
         buffer_regions_map[t].push_back(
             std::make_tuple(acc.acc_type, touched_size, buffer_dtypes.at(t).bytes()));
         mem_bytes += touched_size * buffer_dtypes.at(t).bytes();
@@ -917,8 +919,9 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
         lines = 1.0f;
         unique_lines = 1.0f;
       } else {
-        unique_bytes =
-            std::get<1>(for_touch_regions_[for_loop_stack_.front()][t].front()) * ele_bytes;
+        unique_bytes = static_cast<float>(
+                           std::get<1>(for_touch_regions_[for_loop_stack_.front()][t].front())) *
+                       ele_bytes;
 
         stride = 0;
         int64_t reduce_ratio = 1;
diff --git a/src/meta_schedule/feature_extractor/per_store_feature.cc b/src/meta_schedule/feature_extractor/per_store_feature.cc
index 571bb2face00..698de010b75e 100644
--- a/src/meta_schedule/feature_extractor/per_store_feature.cc
+++ b/src/meta_schedule/feature_extractor/per_store_feature.cc
@@ -836,6 +836,7 @@ void Feature::SetRegion(const LoopNest& loop_nest, IntVec* for_touched_bytes,
       // while others are discarded
       int64_t numel;
       feature.access_shape = utils::RelaxAndUnion(feature.multi_indices, &numel, analyzer);
+      numel = std::max<int64_t>(0, numel);
       feature.loop_accessed_numel[i][buffer] = numel;
       touched_bytes += numel * buffer->dtype.bytes();
       (*buffer_touched_under_loop)[loop][buffer].push_back(numel);
@@ -976,7 +977,8 @@ void Feature::SubFeature::SetFeature(const LoopNest& loop_nest, int64_t cache_li
     this->lines = 1;
     this->unique_lines = 1;
   } else {
-    this->unique_bytes = this->loop_accessed_numel.front().at(buffer) * dtype_bytes;
+    this->unique_bytes =
+        static_cast<double>(this->loop_accessed_numel.front().at(buffer)) * dtype_bytes;
     this->lines = static_cast<double>(loop_nest.prod) / this->prod_non_strided_loop_extent *
                   std::min(1.0, 1.0 * this->min_stride * dtype_bytes / cache_line_bytes);
     this->lines = std::max(1.0, this->lines);
diff --git a/tests/python/unittest/test_auto_scheduler_feature.py b/tests/python/unittest/test_auto_scheduler_feature.py
index 2a058cdbc05c..6f9493fcbf99 100644
--- a/tests/python/unittest/test_auto_scheduler_feature.py
+++ b/tests/python/unittest/test_auto_scheduler_feature.py
@@ -262,6 +262,17 @@ def test_dense_lowered():
     assert total_bytes_loaded > 2 * 128 * 128 * 4  # 4 bytes per float32
 
 
+@T.prim_func
+def negative_extent(A: T.Buffer[(1,), "float32"]):
+    for j in range(0, -1):
+        A[j] = A[j] + 1.0
+
+
+def test_negative_extent():
+    features = auto_scheduler.feature.named_features_from_primfunc(negative_extent)
+    assert features["B0.unique_bytes"] == 0
+
+
 if __name__ == "__main__":
     test_cpu_matmul()
     test_cpu_fusion()
diff --git a/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py b/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
index 17ea8b9d3bb5..cad140b8deb5 100644
--- a/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
+++ b/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
@@ -1588,5 +1588,21 @@ def test_cpu_layout_transform():
     )
 
 
+@T.prim_func
+def negative_extent(A: T.Buffer[(1,), "float32"]):
+    for j in range(0, -1):
+        A[j] = A[j] + 1.0
+
+
+def test_negative_extent():
+    extractor = ms.feature_extractor.PerStoreFeature()
+    (features,) = extractor.extract_from(
+        _make_context(tvm.target.Target("llvm")),
+        candidates=[_make_candidate(lambda: tir.Schedule(negative_extent))],
+    )
+    named_features = dict(zip(_feature_names(), list(features.numpy()[0, :])))
+    assert named_features["B0.unique_bytes"] == 0
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 3d5e440fff581e30b614037997feb81bf587cefe Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 5 Oct 2022 19:40:13 -0700
Subject: [PATCH 308/704] [ci] Disable flaky ethosu + roofline tests (#12956)

These are all segfaulting in main (see #12955, #12933, and #12841) so
they need to be skipped until a fix is merged.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 python/tvm/testing/utils.py                   | 22 ++++++++++++++-----
 .../contrib/test_ethosu/test_codegen.py       |  2 +-
 .../test_replace_depthwise_conv2d.py          |  5 ++++-
 tests/python/unittest/test_roofline.py        |  1 +
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index ad1e003d6e3f..2babe442ca3c 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -1752,13 +1752,13 @@ def fetch_model_from_url(
     return tvmc_model.mod, tvmc_model.params
 
 
-def xfail_parameterizations(*xfail_params, reason):
+def _mark_parameterizations(*params, marker_fn, reason):
     """
-    Mark tests with a nodeid parameters that exactly matches one in params as
-    xfail. Useful for quickly marking tests as xfail when they have a large
+    Mark tests with a nodeid parameters that exactly matches one in params.
+    Useful for quickly marking tests as xfail when they have a large
     combination of parameters.
     """
-    xfail_params = set(xfail_params)
+    params = set(params)
 
     def decorator(func):
         @functools.wraps(func)
@@ -1766,8 +1766,10 @@ def wrapper(request, *args, **kwargs):
             if "[" in request.node.name and "]" in request.node.name:
                 # Strip out the test name and the [ and ] brackets
                 params_from_name = request.node.name[len(request.node.originalname) + 1 : -1]
-                if params_from_name in xfail_params:
-                    pytest.xfail(reason=f"xfail on nodeid {request.node.nodeid}: " + reason)
+                if params_from_name in params:
+                    marker_fn(
+                        reason=f"{marker_fn.__name__} on nodeid {request.node.nodeid}: " + reason
+                    )
 
             return func(request, *args, **kwargs)
 
@@ -1776,6 +1778,14 @@ def wrapper(request, *args, **kwargs):
     return decorator
 
 
+def xfail_parameterizations(*xfail_params, reason):
+    return _mark_parameterizations(*xfail_params, marker_fn=pytest.xfail, reason=reason)
+
+
+def skip_parameterizations(*skip_params, reason):
+    return _mark_parameterizations(*skip_params, marker_fn=pytest.skip, reason=reason)
+
+
 def main():
     test_file = inspect.getsourcefile(sys._getframe(1))
     sys.exit(pytest.main([test_file] + sys.argv[1:]))
diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py
index 28ea48f00932..89c87325baaf 100644
--- a/tests/python/contrib/test_ethosu/test_codegen.py
+++ b/tests/python/contrib/test_ethosu/test_codegen.py
@@ -368,7 +368,7 @@ def binary_elementwise(lhs, rhs):
     )
 
 
-@pytest.mark.xfail(strict=False, reason="See https://github.com/apache/tvm/issues/10487")
+@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/12634")
 @pytest.mark.parametrize(
     "accel_type",
     ACCEL_TYPES,
diff --git a/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py
index 80aa74b8434d..32f75621fde0 100644
--- a/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py
+++ b/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py
@@ -75,7 +75,10 @@
         ],
     ],
 )
-def test_depthwise_conv2d_single(trial):
+@tvm.testing.skip_parameterizations(
+    "trial3", reason="See https://github.com/apache/tvm/issues/12841"
+)
+def test_depthwise_conv2d_single(request, trial):
     def _get_func(
         ifm_shape,
         channels,
diff --git a/tests/python/unittest/test_roofline.py b/tests/python/unittest/test_roofline.py
index e37f6e085bf6..61e6e06aa8f3 100644
--- a/tests/python/unittest/test_roofline.py
+++ b/tests/python/unittest/test_roofline.py
@@ -35,6 +35,7 @@
 
 
 @tvm.testing.parametrize_targets("llvm", "cuda")
+@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/12955")
 def test_estimate_peak_flops(target, dev):
     server = rpc.Server(key="roofline_flops")
     remote = rpc.connect("127.0.0.1", server.port, key="roofline_flops")

From 2d50979606a664f01d73d986be7562c5c2f8cd7c Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 6 Oct 2022 02:12:41 -0500
Subject: [PATCH 309/704] [TVMScript] Allow T.bool type annotations (#12975)

Previously, `T.bool` could be used to define a boolean value (e.g. `b
= T.bool(1)`), but required using `T.boolean` when writing a type
annotation.  This can cause both user confusion, as all other
TVMScript type annotations directly match their string representation
in `tvm.runtime.DataType`, and failure to roundtrip, because the
tvmscript printer will generate the non-existence `T.bool` type
annotations.

This commit add `tvm.script.tir.ty.bool`, which can be used in
TVMScript as `T.bool`. The previous
`tvm.script.tir.ty.boolean` (`T.boolean` in TVMScript) is maintained
for backwards compatibility.
---
 python/tvm/script/tir/__init__.py             |  1 +
 python/tvm/script/tir/ty.py                   | 12 ++++-
 .../unittest/test_tvmscript_roundtrip.py      | 46 +++++++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/python/tvm/script/tir/__init__.py b/python/tvm/script/tir/__init__.py
index 2f2b4bbc2520..d7db182f9d20 100644
--- a/python/tvm/script/tir/__init__.py
+++ b/python/tvm/script/tir/__init__.py
@@ -18,6 +18,7 @@
 
 # Type system
 from .ty import void, boolean, handle, Ptr, Tuple, Buffer
+from .ty import bool  # pylint: disable=redefined-builtin
 
 from .prim_func import prim_func
 
diff --git a/python/tvm/script/tir/ty.py b/python/tvm/script/tir/ty.py
index 4548102a9ee9..b8323dd4a167 100644
--- a/python/tvm/script/tir/ty.py
+++ b/python/tvm/script/tir/ty.py
@@ -206,7 +206,17 @@ def __getitem__(self, args):
             _name = _dtype + _size + _lanes
             globals()[_name] = ConcreteType(_name)
 
-boolean = ConcreteType("bool")
+
+# All other DataType annotations are represented with the same string
+# as is used by `tvm.runtime.DataType`.  This does redefine the Python
+# built-in bool, but only within the context of `tvm.script.tir.ty`
+# and `tvm.script.tir` modules.  The `T.boolean` alias is maintained
+# for backwards compatibility.
+
+bool = ConcreteType("bool")  # pylint: disable=redefined-builtin
+boolean = bool
+
+
 handle = ConcreteType("handle")
 void = VoidType()
 Ptr = GenericPtrType()
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index e139d2111bee..3b72a5ae8a92 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3409,6 +3409,47 @@ def func() -> None:
     return func
 
 
+def boolean_argument():
+    @T.prim_func
+    def func(a: T.boolean) -> None:
+        T.evaluate(a)
+
+    return func
+
+
+def bool_argument():
+    @T.prim_func
+    def func(a: T.bool) -> None:
+        T.evaluate(a)
+
+    return func
+
+
+def bool_variable_annotation():
+    @T.prim_func
+    def func() -> None:
+        a: T.bool = T.call_extern("dummy", dtype="bool")
+        T.evaluate(0)
+
+    return func
+
+
+def bool_primitive():
+    @T.prim_func
+    def func() -> None:
+        T.evaluate(T.bool(True))
+
+    return func
+
+
+def bool_cast():
+    @T.prim_func
+    def func() -> None:
+        T.evaluate(T.bool(T.int32(0)))
+
+    return func
+
+
 ir_generator = tvm.testing.parameter(
     opt_gemm_normalize,
     opt_gemm_lower,
@@ -3454,6 +3495,11 @@ def func() -> None:
     allocate_and_decl_buffer,
     float_infinity,
     minimal_i32_literal,
+    boolean_argument,
+    bool_argument,
+    bool_variable_annotation,
+    bool_primitive,
+    bool_cast,
 )
 
 
From 7fc35da3b9f111899b846f2602cef198b80c926b Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 6 Oct 2022 19:25:19 +0900
Subject: [PATCH 310/704] [TEST] CPU feature detection for x86 and ARM dot
 product instructions (#12980)

* introduce requires_arm_dot

* introduce requires_cascadelake

* lint

* requires_cascadelake -> requires_vnni

* Update tests/python/integration/test_meta_schedule_auto_tensorize.py

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>

* Update python/tvm/testing/utils.py

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>

* Revert "requires_cascadelake -> requires_vnni"

This reverts commit 6931ca2b7cccfd19a3a9cf0dd8fe37b6896fff08.

* check for intel in requires_cascadelake

* black

* fix

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>
---
 python/tvm/testing/utils.py                   | 45 +++++++++++++
 .../test_meta_schedule_auto_tensorize.py      |  7 +-
 tests/python/relay/test_op_level1.py          |  2 +-
 tests/python/relay/test_op_level10.py         |  2 +-
 tests/python/relay/test_op_level2.py          | 65 ++++++++++---------
 .../unittest/test_meta_schedule_tune_relay.py |  9 +--
 6 files changed, 91 insertions(+), 39 deletions(-)

diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 2babe442ca3c..1c4dcba29d6c 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -77,6 +77,7 @@ def test_something():
 import textwrap
 import time
 import shutil
+import subprocess
 
 from pathlib import Path
 from typing import Optional, Callable, Union, List, Tuple
@@ -981,6 +982,50 @@ def _corstone300_compile_time_check():
 requires_vitis_ai = Feature("vitis_ai", "Vitis AI", cmake_flag="USE_VITIS_AI")
 
 
+def _arm_dot_supported():
+    arch = platform.machine()
+
+    if arch not in ["arm64", "aarch64"]:
+        return False
+
+    if sys.platform.startswith("darwin"):
+        cpu_info = subprocess.check_output("sysctl -a", shell=True).strip().decode()
+        for line in cpu_info.split("\n"):
+            if line.startswith("hw.optional.arm.FEAT_DotProd"):
+                return bool(int(line.split(":", 1)[1]))
+    elif sys.platform.startswith("linux"):
+        return True
+
+    return False
+
+
+def _is_intel():
+    # Only linux is supported for now.
+    if sys.platform.startswith("linux"):
+        with open("/proc/cpuinfo", "r") as content:
+            return "Intel" in content.read()
+
+    return False
+
+
+def _has_vnni():
+    arch = platform.machine()
+    # Only linux is supported for now.
+    if arch == "x86_64" and sys.platform.startswith("linux"):
+        with open("/proc/cpuinfo", "r") as content:
+            return "avx512_vnni" in content.read()
+
+    return False
+
+
+requires_arm_dot = Feature("arm_dot", "ARM dot product", run_time_check=_arm_dot_supported)
+
+
+requires_cascadelake = Feature(
+    "cascadelake", "x86 CascadeLake", run_time_check=lambda: _has_vnni() and _is_intel()
+)
+
+
 def _cmake_flag_enabled(flag):
     flag = tvm.support.libinfo()[flag]
 
diff --git a/tests/python/integration/test_meta_schedule_auto_tensorize.py b/tests/python/integration/test_meta_schedule_auto_tensorize.py
index 7227ef0c7b79..fd28f7928301 100644
--- a/tests/python/integration/test_meta_schedule_auto_tensorize.py
+++ b/tests/python/integration/test_meta_schedule_auto_tensorize.py
@@ -284,7 +284,7 @@ def _test_bert_int8(target, sch_rules, postprocs):
     print(runtime.benchmark(dev, number=1, repeat=50).mean)
 
 
-@pytest.mark.skip("Requires cascadelake")
+@tvm.testing.requires_cascadelake
 def test_vnni_dense():
     _test_dense(
         "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm -mcpu=cascadelake -num-cores 4"
@@ -305,7 +305,7 @@ def test_dp4a_dense():
     # )
 
 
-@pytest.mark.skip("Requires cascadelake")
+@tvm.testing.requires_cascadelake
 def test_vnni_conv2d():
     _test_conv2d(
         "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm -mcpu=cascadelake -num-cores 4"
@@ -326,7 +326,8 @@ def test_dp4a_conv2d():
     # )
 
 
-@pytest.mark.skip("Requires cascadelake")
+@tvm.testing.requires_cascadelake
+@pytest.mark.skip_if(tvm.testing.IS_IN_CI, reason="Slow on CI")
 def test_vnni_bert_int8():
     _test_bert_int8("llvm -mcpu=cascadelake -num-cores 4", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI)
 
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 4ce422ae8893..3436bdd9f28d 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -744,7 +744,7 @@ def test_bitserial_dense():
     assert yy.checked_type == relay.TensorType((m, 32), "int16")
 
 
-@pytest.mark.skip("Requires cascadelake")
+@tvm.testing.requires_cascadelake
 def test_dense_vnni():
     data_shape = (32, 96)
     weight_shape = (128, 96)
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 8c30ab27ce18..5134ab156b3d 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -473,7 +473,7 @@ def test_batch_matmul(executor_kind):
     verify_batch_matmul_with_inputs(executor_kind, x, x, x_np, x_np, (10, 27, 27))
 
 
-@pytest.mark.skip("Requires cascadelake")
+@tvm.testing.requires_cascadelake
 def test_batch_matmul_vnni():
     x_shape = (16, 32, 96)
     y_shape = (16, 128, 96)
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 6a895aaf0518..7efec2db03b9 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -2137,8 +2137,7 @@ def get_subgraph(dtype):
             np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5)
 
 
-@pytest.mark.skip("Requires cascadelake or ARM v8.2")
-def test_conv2d_int8_alter_dtype():
+def _test_conv2d_int8_alter_dtype(data_dtype, target, dot_product_instr):
     def get_conv2d_nchw(
         d_shape,
         w_shape,
@@ -2171,45 +2170,51 @@ def get_conv2d_nchw(
     bias_np = np.random.randint(low=-127, high=128, size=bias_shape).astype("int32")
     weight_np = np.random.uniform(-128, 127, size=weight_shape).astype("int8")
 
-    for data_dtype, target, dot_product_instr in [
-        ("uint8", "llvm --device arm_cpu -mattr=+v8.2a,+dotprod", "sdot"),
-        ("int8", "llvm -mcpu=cascadelake", "vpdpbusd"),
-    ]:
-        conv2d = get_conv2d_nchw(data_shape, weight_shape, data_dtype)
-        bias_add = relay.add(conv2d, bias)
-        mod = tvm.IRModule.from_expr(bias_add)
+    conv2d = get_conv2d_nchw(data_shape, weight_shape, data_dtype)
+    bias_add = relay.add(conv2d, bias)
+    mod = tvm.IRModule.from_expr(bias_add)
 
-        if data_dtype == "uint8":
-            data_np = np.random.uniform(0, 255, size=data_shape).astype("uint8")
-        else:
-            data_np = np.random.uniform(-128, 127, size=data_shape).astype("int8")
+    if data_dtype == "uint8":
+        data_np = np.random.uniform(0, 255, size=data_shape).astype("uint8")
+    else:
+        data_np = np.random.uniform(-128, 127, size=data_shape).astype("int8")
 
-        params = {"weight": weight_np, "bias": bias_np}
+    params = {"weight": weight_np, "bias": bias_np}
 
-        ref = (
-            relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
-            .evaluate()(*[data_np, weight_np, bias_np])
-            .numpy()
-        )
+    ref = (
+        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
+        .evaluate()(*[data_np, weight_np, bias_np])
+        .numpy()
+    )
 
-        dev = tvm.cpu(0)
+    dev = tvm.cpu(0)
+
+    with tvm.transform.PassContext(
+        opt_level=3,
+    ):
+        lib = relay.build(mod, target=target, params=params)
 
-        with tvm.transform.PassContext(
-            opt_level=3,
-        ):
-            lib = relay.build(mod, target=target, params=params)
+    assert dot_product_instr in lib.lib.get_source("asm")
 
-        assert dot_product_instr in lib.lib.get_source("asm")
+    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
 
-        rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+    rt_mod.set_input("data", data_np)
+
+    rt_mod.run()
+
+    out = rt_mod.get_output(0).numpy()
+
+    np.testing.assert_equal(out, ref)
 
-        rt_mod.set_input("data", data_np)
 
-        rt_mod.run()
+@tvm.testing.requires_arm_dot
+def test_conv2d_int8_alter_dtype_arm():
+    _test_conv2d_int8_alter_dtype("uint8", "llvm --device arm_cpu -mattr=+v8.2a,+dotprod", "sdot")
 
-        out = rt_mod.get_output(0).numpy()
 
-        np.testing.assert_equal(out, ref)
+@tvm.testing.requires_cascadelake
+def test_conv2d_int8_alter_dtype_vnni():
+    _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=cascadelake", "vpdpbusd")
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py
index 5cc4f8f6a404..91101dd6b6c0 100644
--- a/tests/python/unittest/test_meta_schedule_tune_relay.py
+++ b/tests/python/unittest/test_meta_schedule_tune_relay.py
@@ -23,6 +23,7 @@
 import numpy as np  # type: ignore
 import pytest
 import tvm
+import tvm.testing
 from tvm import meta_schedule as ms
 from tvm import relay
 from tvm._ffi import register_func
@@ -441,9 +442,9 @@ def manual_tir_common(do_tune=False):
         )
         config = ms.TuneConfig(
             strategy="replay_trace",
-            num_trials_per_iter=64,
-            max_trials_per_task=20000,
-            max_trials_global=20000,
+            num_trials_per_iter=8,
+            max_trials_per_task=8,
+            max_trials_global=8,
         )
 
         with tempfile.TemporaryDirectory() as work_dir:
@@ -503,7 +504,7 @@ def schedule_fn(sch) -> bool:
     np.testing.assert_equal(out, ref)
 
 
-@pytest.mark.skip("Requires cascadelake")
+@tvm.testing.requires_cascadelake
 def test_tune_relay_manual_tir_vnni():
     manual_tir_common(do_tune=False)
 

From 1b9e20a807e7bfa369755822342b8830b853fa86 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 6 Oct 2022 16:57:23 +0100
Subject: [PATCH 311/704] [ETHOSN] Transpose fully connected weights (#12970)

* [ETHOSN] Transpose fully connected weights

The NPU driver stack expects weights in IO (HWIO) format, however, Relay
uses an OI representation. Although the shape of the weight tensor was
correctly changed during codegen, the values in the weights tensor were
not being transposed. This lead to an output mismatch when the output
"units" was > 1. The tests didn't catch this due to using a weights
tensor of all 1's.

Change-Id: I51b2bcd14b677280ef3b6a6845d56b7dfacc7d6a

* Address comments

* Refactor use of weight transpose to common file between contrib
codegens.
* Make function areguments more explicit.
* Update network hashes.

Change-Id: Ib53bc7d2837b62908b92fd09062cbe9a8bb4ab30

* Fix lint

Change-Id: I6a1d9ffa8e3a747b7c77c9b27aa1b1c0d4c5cbff

* Fix cmsis-nn weights transpose

Change-Id: Ie89e429da222ffe17bc8faf831bf59217008a68a

* Address comments

Change-Id: Ie5ded2db3024b9e2c5095f01adea65798fc1da55
---
 cmake/modules/contrib/CMSISNN.cmake           |  4 +-
 cmake/modules/contrib/EthosN.cmake            |  3 +-
 .../contrib/cmsisnn/generate_constants.cc     | 20 ++-----
 .../backend/contrib/constant_transforms.cc    | 58 ++++++++++++++++++
 .../backend/contrib/constant_transforms.h     | 59 +++++++++++++++++++
 src/relay/backend/contrib/ethosn/codegen.cc   |  4 +-
 .../contrib/ethosn/convert_equivalent.cc      |  1 +
 .../backend/contrib/ethosn/ethosn_api.cc      | 49 ++++++---------
 src/relay/backend/contrib/ethosn/ethosn_api.h | 13 +---
 .../test_ethosn/test_fullyconnected.py        | 56 ++++++++++--------
 .../contrib/test_ethosn/test_networks.py      |  4 +-
 11 files changed, 180 insertions(+), 91 deletions(-)
 create mode 100644 src/relay/backend/contrib/constant_transforms.cc
 create mode 100644 src/relay/backend/contrib/constant_transforms.h

diff --git a/cmake/modules/contrib/CMSISNN.cmake b/cmake/modules/contrib/CMSISNN.cmake
index 73ecd5916df3..eef12fdd778e 100644
--- a/cmake/modules/contrib/CMSISNN.cmake
+++ b/cmake/modules/contrib/CMSISNN.cmake
@@ -18,6 +18,8 @@
 if(USE_CMSISNN)
   add_definitions(-DTVM_USE_CMSISNN)
   message(STATUS "Build with CMSIS-NN support")
-  tvm_file_glob(GLOB RELAY_CONTRIB_CMSISNN_SRCS src/relay/backend/contrib/cmsisnn/*.cc)
+  tvm_file_glob(GLOB RELAY_CONTRIB_CMSISNN_SRCS
+    src/relay/backend/contrib/cmsisnn/*.cc
+    src/relay/backend/contrib/constant_transforms.cc)
   list(APPEND COMPILER_SRCS ${RELAY_CONTRIB_CMSISNN_SRCS})
 endif(USE_CMSISNN)
diff --git a/cmake/modules/contrib/EthosN.cmake b/cmake/modules/contrib/EthosN.cmake
index dbf5549180aa..b230acfc380d 100644
--- a/cmake/modules/contrib/EthosN.cmake
+++ b/cmake/modules/contrib/EthosN.cmake
@@ -35,7 +35,8 @@ if(NOT USE_ETHOSN STREQUAL "OFF")
     list(APPEND RUNTIME_SRCS ${ETHOSN_RUNTIME_CONTRIB_SRC})
 
     tvm_file_glob(GLOB COMPILER_ETHOSN_SRCS
-                  src/relay/backend/contrib/ethosn/*)
+                  src/relay/backend/contrib/ethosn/*
+                  src/relay/backend/contrib/constant_transforms.cc)
     list(APPEND COMPILER_SRCS ${COMPILER_ETHOSN_SRCS})
 
     list(APPEND TVM_LINKER_LIBS ${ETHOSN_COMPILER_LIBRARY}
diff --git a/src/relay/backend/contrib/cmsisnn/generate_constants.cc b/src/relay/backend/contrib/cmsisnn/generate_constants.cc
index 297e6b7acea3..e08b61c457f9 100644
--- a/src/relay/backend/contrib/cmsisnn/generate_constants.cc
+++ b/src/relay/backend/contrib/cmsisnn/generate_constants.cc
@@ -31,6 +31,7 @@
 #include "../../../op/make_op.h"
 #include "../../../qnn/utils.h"
 #include "../../../transforms/pattern_utils.h"
+#include "../constant_transforms.h"
 #include "convolutions.h"
 
 namespace tvm {
@@ -64,22 +65,9 @@ class GenerateConstantsMutator : public MixedModeMutator {
     attrs->out_dtype = std::move(conv2d_attrs->out_dtype);
     *new_attrs = tvm::Attrs{attrs};
 
-    std::string kernel_layout = conv2d_attrs->kernel_layout.c_str();
-    int pos_o = kernel_layout.find("O");
-    int pos_h = kernel_layout.find("H");
-    int pos_w = kernel_layout.find("W");
-    int pos_i = kernel_layout.find("I");
-
-    IRModule kernel_module;
-    auto func_body = MakeTranspose(
-        kernel_expr, {Integer(pos_o), Integer(pos_h), Integer(pos_w), Integer(pos_i)});
-    auto kernel_func =
-        Function(FreeVars(func_body), func_body, Type(), FreeTypeVars(func_body, kernel_module));
-    GlobalVar kernel_var("main");
-    kernel_module->Add(kernel_var, kernel_func);
-    kernel_module = relay::transform::FoldConstant()(kernel_module);
-    kernel_func = Downcast<Function>(kernel_module->Lookup("main"));
-    return kernel_func->body;
+    Constant conv2d_kernel = Downcast<Constant>(kernel_expr);
+    conv2d_kernel = TransposeWeights(conv2d_kernel, conv2d_attrs->kernel_layout, "OHWI");
+    return conv2d_kernel;
   }
 
   /*!  * \brief Performs weight transpose and substitutes existing constants in the composite
diff --git a/src/relay/backend/contrib/constant_transforms.cc b/src/relay/backend/contrib/constant_transforms.cc
new file mode 100644
index 000000000000..6041d37451aa
--- /dev/null
+++ b/src/relay/backend/contrib/constant_transforms.cc
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "constant_transforms.h"
+
+#include <string>
+
+#include "../../transforms/pattern_utils.h"
+#include "../../transforms/simplify_expr.h"
+
+/*!
+ * \file src/relay/backend/contrib/constant_transforms.cc
+ * \brief Transforms applied to constant operations during codegen for BYOC backends.
+ */
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+
+Expr FoldConstantExpr(const Expr& expr, bool fold_qnn) {
+  auto mod = IRModule::FromExpr(expr);
+  mod = transform::FoldConstant(fold_qnn)(mod);
+  auto entry_func = Downcast<Function>(mod->Lookup("main"));
+  return expr.as<FunctionNode>() == nullptr ? entry_func->body : entry_func;
+}
+
+Constant TransposeWeights(const Constant& data, const std::string& source_layout,
+                          const std::string& target_layout) {
+  Array<Integer> transpose_matrix;
+  for (const char& c : target_layout) {
+    int pos = source_layout.find(c);
+    transpose_matrix.push_back(pos);
+  }
+  Expr transpose = MakeTranspose(data, transpose_matrix);
+  transpose = InferType(FoldConstantExpr(transpose));
+  Constant transposed_data = Downcast<Constant>(transpose);
+  return transposed_data;
+}
+
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/contrib/constant_transforms.h b/src/relay/backend/contrib/constant_transforms.h
new file mode 100644
index 000000000000..39a9dc1d53d4
--- /dev/null
+++ b/src/relay/backend/contrib/constant_transforms.h
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/contrib/constant_transforms.h
+ * \brief Transforms applied to constant operations during codegen for BYOC backends.
+ */
+
+#ifndef TVM_RELAY_BACKEND_CONTRIB_CONSTANT_TRANSFORMS_H_
+#define TVM_RELAY_BACKEND_CONTRIB_CONSTANT_TRANSFORMS_H_
+
+#include <tvm/relay/expr.h>
+
+#include <string>
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+
+/*!
+ * \brief Apply constant folding on an expression.
+ *
+ * \param expr The expression to fold.
+ * \param fold_qnn Whether to fold constants for QNN operations.
+ * \returns The new folded expression.
+ */
+Expr FoldConstantExpr(const Expr& expr, bool fold_qnn = true);
+
+/*!
+ *\brief Transpose weights from `source_layout` to `target_layout`
+ *
+ * \param data The constant expression to transpose.
+ * \param source_layout The current layout of the constant e.g. "OHWI".
+ * \param target_layout The target layout of the constant e.g. "HWIO".
+ */
+Constant TransposeWeights(const Constant& data, const std::string& source_layout,
+                          const std::string& target_layout);
+
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_BACKEND_CONTRIB_CONSTANT_TRANSFORMS_H_
diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index 46420775ae5b..d2281f782615 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -412,8 +412,8 @@ EthosnError ConstructNetworkVisitor::MakeFullyConnectedLayer(const Call& call,
     return err;
   }
 
-  auto weights = AddConstant(network_, params.weights_info, params.raw_weights).tensor;
-  auto bias = AddConstant(network_, params.bias_info, params.raw_bias).tensor;
+  auto weights = AddConstant(network_, params.weights_info, params.raw_weights->data).tensor;
+  auto bias = AddConstant(network_, params.bias_info, params.raw_bias->data).tensor;
   try {
     auto input =
         AddReshape(network_, *operand_table_[call->args[0]][0], params.input_info.m_Dimensions)
diff --git a/src/relay/backend/contrib/ethosn/convert_equivalent.cc b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
index 7f4e1a3c5045..14d94192c84e 100644
--- a/src/relay/backend/contrib/ethosn/convert_equivalent.cc
+++ b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
@@ -32,6 +32,7 @@
 #include "../../../qnn/utils.h"
 #include "../../../transforms/pattern_utils.h"
 #include "../../../transforms/simplify_expr.h"
+#include "../constant_transforms.h"
 #include "ethosn_api.h"
 
 namespace tvm {
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc
index dbcdecd8f382..c0f8767a8c65 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.cc
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc
@@ -41,6 +41,7 @@
 #include "../../../op/make_op.h"
 #include "../../../transforms/pattern_utils.h"
 #include "../../../transforms/simplify_expr.h"
+#include "../constant_transforms.h"
 #include "ethosn_support_library/Support.hpp"
 #include "ethosn_support_library/SupportQueries.hpp"
 #include "tvm/relay/qnn/attrs.h"
@@ -197,7 +198,10 @@ EthosnError EthosnAPI::QnnFullyConnected(const Expr& expr, FullyConnectedParams*
   sl::QuantizationInfo output_q_info;
   err += Tvm2Npu(input_zero_point, input_scale, &data_q_info);
   err += Tvm2Npu(kernel_zero_point, kernel_scale, &weights_q_info);
-  err += Tvm2Npu(0, data_q_info.GetScale() * weights_q_info.GetScale(), &bias_q_info);
+  std::valarray<float> bias_scales = data_q_info.GetScale() * weights_q_info.GetScales();
+  const int bias_zero_point = 0;
+  const unsigned int bias_axis = 3;
+  err += Tvm2Npu(bias_zero_point, bias_scales, bias_axis, &bias_q_info);
   err += Tvm2Npu(output_zero_point, output_scale, &output_q_info);
 
   // Create fc info
@@ -213,27 +217,30 @@ EthosnError EthosnAPI::QnnFullyConnected(const Expr& expr, FullyConnectedParams*
                                       data_data_type, sl::DataFormat::NHWC, data_q_info);
 
   // Create weights info
-  const auto* weights_dtype = dense->args[1]->checked_type().as<TensorTypeNode>();
+  Constant weights_data = Downcast<Constant>(dense->args[1]);
+  weights_data = TransposeWeights(weights_data, "OI", "IO");
+  const auto* weights_ttype = weights_data->checked_type().as<TensorTypeNode>();
   sl::TensorShape weights_tensor_shape;
   sl::DataType weights_data_type;
   sl::DataFormat weights_data_format;
   // Ignore the error here because weights don't have a batch axis
-  Tvm2Npu(weights_dtype->shape, &weights_tensor_shape);
-  err += Tvm2Npu(weights_dtype->dtype, &weights_data_type);
+  Tvm2Npu(weights_ttype->shape, &weights_tensor_shape);
+  err += Tvm2Npu(weights_ttype->dtype, &weights_data_type);
   err += Tvm2Npu("HWIO", &weights_data_format);
-  params->weights_info = sl::TensorInfo({1, 1, weights_tensor_shape[1], weights_tensor_shape[0]},
+  // Weights tensor shape is 1, 1, I, O
+  params->weights_info = sl::TensorInfo({1, 1, weights_tensor_shape[0], weights_tensor_shape[1]},
                                         weights_data_type, weights_data_format, weights_q_info);
-  params->raw_weights = dense->args[1].as<ConstantNode>()->data->data;
+  params->raw_weights = weights_data->data;
 
   // Create bias info
   params->bias_info =
-      sl::TensorInfo({1, 1, 1, weights_tensor_shape[0]}, sl::DataType::INT32_QUANTIZED,
+      sl::TensorInfo({1, 1, 1, weights_tensor_shape[1]}, sl::DataType::INT32_QUANTIZED,
                      sl::DataFormat::NHWC, bias_q_info);
-  params->raw_bias = bias_add->args[1].as<ConstantNode>()->data->data;
+  params->raw_bias = bias_add->args[1].as<ConstantNode>()->data;
 
   sl::TensorInfo output_tensor_info;
   err += Tvm2Npu(requantize->checked_type(), &output_tensor_info);
-  output_tensor_info.m_Dimensions = {data_tensor_shape[0], 1, 1, weights_tensor_shape[0]};
+  output_tensor_info.m_Dimensions = {data_tensor_shape[0], 1, 1, weights_tensor_shape[1]};
   output_tensor_info.m_QuantizationInfo = output_q_info;
   params->output_info = output_tensor_info;
 
@@ -449,21 +456,6 @@ EthosnError EthosnAPI::Mean(const Expr& expr, MeanParams* params) {
   return err;
 }
 
-Constant TransposeWeights(const Constant& data, const std::string& input_layout) {
-  int pos_h = input_layout.find("H");
-  int pos_w = input_layout.find("W");
-  int pos_i = input_layout.find("I");
-  int pos_o = input_layout.find("O");
-
-  // Currently the expected target layout is HWIO only.
-  Array<Integer> target_shape = {pos_h, pos_w, pos_i, pos_o};
-
-  Expr transpose = MakeTranspose(data, target_shape);
-  transpose = InferType(FoldConstantExpr(transpose));
-  Constant transposed_data = Downcast<Constant>(transpose);
-  return transposed_data;
-}
-
 EthosnError EthosnAPI::QnnConv2dTranspose(const Expr& expr, QnnConv2dTransposeParams* params) {
   Call requantize = Downcast<Call>(expr);
   Call bias;
@@ -530,7 +522,7 @@ EthosnError EthosnAPI::QnnConv2dTranspose(const Expr& expr, QnnConv2dTransposePa
   // Create weights info
   Constant weights_data = Downcast<Constant>(conv2d_transpose->args[1]);
   if (conv_attr->kernel_layout != "HWIO") {
-    weights_data = TransposeWeights(weights_data, conv_attr->kernel_layout);
+    weights_data = TransposeWeights(weights_data, conv_attr->kernel_layout, "HWIO");
   }
   const auto* weights_ttype = weights_data->checked_type().as<TensorTypeNode>();
   sl::TensorShape weights_tensor_shape;
@@ -1080,13 +1072,6 @@ EthosnError EthosnAPI::AsConstant(const Expr& expr, T* out) {
   return EthosnError();
 }
 
-Expr FoldConstantExpr(const Expr& expr, bool fold_qnn) {
-  auto mod = IRModule::FromExpr(expr);
-  mod = transform::FoldConstant(fold_qnn)(mod);
-  auto entry_func = Downcast<Function>(mod->Lookup("main"));
-  return expr.as<FunctionNode>() == nullptr ? entry_func->body : entry_func;
-}
-
 }  // namespace ethosn
 }  // namespace contrib
 }  // namespace relay
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.h b/src/relay/backend/contrib/ethosn/ethosn_api.h
index 3d704f2757c6..d640a02312ec 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.h
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.h
@@ -66,8 +66,8 @@ struct FullyConnectedParams {
   sl::TensorInfo weights_info;
   sl::TensorInfo bias_info;
   sl::TensorInfo output_info;
-  void* raw_weights = nullptr;
-  void* raw_bias = nullptr;
+  runtime::NDArray raw_weights;
+  runtime::NDArray raw_bias;
 };
 
 struct MaxPool2DParams {
@@ -324,15 +324,6 @@ class EthosnAPI {
   static EthosnError AsConstant(const Expr& expr, std::valarray<float>* out);
 };
 
-/*!
- * \brief Apply constant folding on an expression.
- *
- * \param expr The expression to fold.
- * \param fold_qnn Whether to fold constants for QNN operations.
- * \returns The new folded expression.
- */
-Expr FoldConstantExpr(const Expr& expr, bool fold_qnn = true);
-
 }  // namespace ethosn
 }  // namespace contrib
 }  // namespace relay
diff --git a/tests/python/contrib/test_ethosn/test_fullyconnected.py b/tests/python/contrib/test_ethosn/test_fullyconnected.py
index d38b2528c7bb..e84464f90217 100644
--- a/tests/python/contrib/test_ethosn/test_fullyconnected.py
+++ b/tests/python/contrib/test_ethosn/test_fullyconnected.py
@@ -19,9 +19,11 @@
 
 import numpy as np
 import pytest
+
 import tvm
 from tvm import relay
 from tvm.testing import requires_ethosn
+
 from . import infrastructure as tei
 
 
@@ -30,7 +32,11 @@ def _get_model(
 ):
     """Return a model an any parameters it may have"""
     a = relay.var("a", shape=shape, dtype=dtype)
-    weights_array = tvm.nd.array(np.ones(weight_shape, dtype))
+    weights_array = tvm.nd.array(
+        np.random.randint(
+            np.iinfo(dtype).min, high=np.iinfo(dtype).max, size=weight_shape, dtype=dtype
+        )
+    )
     weights = relay.const(weights_array, dtype)
     dense = relay.qnn.op.dense(
         a,
@@ -66,26 +72,24 @@ def _get_model(
         ((1, 1280), 1000),
     ],
 )
-@pytest.mark.parametrize(
-    "dtype,input_zp,input_sc,kernel_zp,kernel_sc",
-    [
-        ("uint8", 71, 0.580, 176, 1.498),
-        ("uint8", 166, 1.724, 138, 0.180),
-        ("int8", 71, 0.580, 0, 1.498),
-        ("int8", 120, 1.724, 0, 0.180),
-    ],
-)
-def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_zp, kernel_sc):
+@pytest.mark.parametrize("dtype", ["uint8", "int8"])
+def test_fullyconnected(shape, out_channels, dtype):
     """Compare Fully Connected output with TVM."""
 
     np.random.seed(0)
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+
     inputs = {
-        "a": tvm.nd.array(
-            np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype)
-        ),
+        "a": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=shape, dtype=dtype)),
     }
-
     outputs = []
+
+    input_zp = np.random.randint(data_min, data_max)
+    input_sc = np.random.random() * 2
+    kernel_zp = np.random.randint(data_min, data_max)
+    kernel_sc = np.random.random() * 2
     output_zp, output_sc = tei.get_conv2d_qnn_params(
         dtype,
         input_zp,
@@ -96,18 +100,18 @@ def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_z
         shape[1],
         1,
     )
+    model, params = _get_model(
+        shape,
+        (out_channels, shape[1]),
+        input_zp,
+        input_sc,
+        kernel_zp,
+        kernel_sc,
+        output_zp,
+        output_sc,
+        dtype,
+    )
     for npu in [False, True]:
-        model, params = _get_model(
-            shape,
-            (out_channels, shape[1]),
-            input_zp,
-            input_sc,
-            kernel_zp,
-            kernel_sc,
-            output_zp,
-            output_sc,
-            dtype,
-        )
         mod = tei.make_module(model, params)
         outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
     tei.verify(outputs, dtype, 1)
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index 54ca44805171..5bd133ba20bb 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -145,7 +145,7 @@ def test_resnet_50_int8():
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
     _compile_hash = {
-        "6b130a99397715156d5fb833809a92d2",
+        "f16dc9caa8e696bc5da8a5c6a644eb72",
         "6e5fcbab831607b9da1039aff4e56871",
         "41acecca37b2735bd580f6ec38d8c2e0",
     }
@@ -190,7 +190,7 @@ def test_inception_v4():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    _compile_hash = {"2eeae331898f8e94c74868e190077837"}
+    _compile_hash = {"c00c119506b34c8e87f81aa009b42431"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/inception_v4_299_quant_20181026.tgz",

From 6147c31b366f4d342633884694a22ddf963178e5 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 6 Oct 2022 16:58:22 +0100
Subject: [PATCH 312/704] [ETHOSN] Remove backwards copy in the runtime
 (#12968)

The runtime had a backwards copy which was important historically to
work around a bug in the NPU driver stack. It should now be safe to
replace with a standard copy operation.

Some cleanup has also been completed to remove unused functionality
from the runtime.

Change-Id: I8b800b6a57dca40ea5b9538a8874489e530caa61
---
 src/runtime/contrib/ethosn/ethosn_device.cc | 64 ++++++---------------
 1 file changed, 17 insertions(+), 47 deletions(-)

diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc
index 900ae65afcc3..612f4b4cec39 100644
--- a/src/runtime/contrib/ethosn/ethosn_device.cc
+++ b/src/runtime/contrib/ethosn/ethosn_device.cc
@@ -74,19 +74,6 @@ bool WaitForInference(dl::Inference* inference, int timeout) {
   return true;
 }
 
-template <typename T>
-void CopyOutput(dl::Buffer* source_buffers[], std::vector<DLTensor*>* outputs) {
-  for (DLTensor* tensor : *outputs) {
-    dl::Buffer* source_buffer = source_buffers[0];
-    T* dest_pointer = static_cast<T*>(tensor->data);
-    size_t size = source_buffer->GetSize();
-    uint8_t* source_buffer_data = source_buffer->Map();
-    std::copy_backward(source_buffer_data, source_buffer_data + size, dest_pointer + size);
-    source_buffer->Unmap();
-    source_buffers++;
-  }
-}
-
 void CreateBuffers(std::vector<std::shared_ptr<dl::Buffer>>* fm,
                    const std::vector<DLTensor*>& tensors, const std::vector<uint32_t>& tensor_sizes,
                    bool input) {
@@ -108,65 +95,48 @@ bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
   size_t n_inputs = input_order.size();
   size_t n_outputs = output_order.size();
   std::vector<DLTensor*> inputs(n_inputs);
-  for (uint8_t i = 0; i < n_inputs; i++) {
+  for (size_t i = 0; i < n_inputs; i++) {
     inputs[i] = args[input_order[i]];
   }
   std::vector<DLTensor*> outputs(n_outputs);
   size_t output_offset = n_inputs;
-  for (uint8_t i = 0; i < n_outputs; i++) {
+  for (size_t i = 0; i < n_outputs; i++) {
     outputs[i] = args[output_order[i] + output_offset];
   }
 
   // Set up input buffers
-  std::vector<std::shared_ptr<dl::Buffer>> ifm(inputs.size());
+  std::vector<std::shared_ptr<dl::Buffer>> ifm(n_inputs);
   CreateBuffers(&ifm, inputs, input_sizes, true);
 
   // Set up output buffers
-  std::vector<std::shared_ptr<dl::Buffer>> ofm(outputs.size());
+  std::vector<std::shared_ptr<dl::Buffer>> ofm(n_outputs);
   CreateBuffers(&ofm, outputs, output_sizes, false);
 
   // Raw pointers for the inference
-  dl::Buffer* ifm_raw[inputs.size()];
-  for (size_t i = 0; i < inputs.size(); i++) {
+  dl::Buffer* ifm_raw[n_inputs];
+  for (size_t i = 0; i < n_inputs; i++) {
     ifm_raw[i] = ifm[i].get();
   }
-  dl::Buffer* ofm_raw[outputs.size()];
-  for (size_t i = 0; i < outputs.size(); i++) {
+  dl::Buffer* ofm_raw[n_outputs];
+  for (size_t i = 0; i < n_outputs; i++) {
     ofm_raw[i] = ofm[i].get();
   }
 
   // Execute the inference.
   std::unique_ptr<dl::Inference> result(
-      npu->ScheduleInference(ifm_raw, sizeof(ifm_raw) / sizeof(ifm_raw[0]), ofm_raw,
-                             sizeof(ofm_raw) / sizeof(ofm_raw[0])));
+      npu->ScheduleInference(ifm_raw, n_inputs, ofm_raw, n_outputs));
   bool inferenceCompleted = WaitForInference(result.get(), 60);
   if (inferenceCompleted) {
-    switch ((outputs)[0]->dtype.bits) {
-      case 8: {
-        dl::Buffer** ofms = &ofm_raw[0];
-        for (DLTensor* tensor : outputs) {
-          dl::Buffer* source_buffer = (*ofms++);
-          uint8_t* source_buffer_data = source_buffer->Map();
-          uint8_t* dest_pointer = static_cast<uint8_t*>(tensor->data);
-          if (source_buffer_data != dest_pointer) {
-            CopyOutput<uint8_t>(ofm_raw, &outputs);
-            break;
-          }
-          source_buffer->Unmap();
-        }
-        break;
-      }
-      case 16:
-        CopyOutput<uint16_t>(ofm_raw, &outputs);
-        break;
-      case 32:
-        CopyOutput<uint32_t>(ofm_raw, &outputs);
-        break;
-      default:
-        break;
+    for (size_t i = 0; i < n_outputs; i++) {
+      DLTensor* tensor = outputs[i];
+      dl::Buffer* source_buffer = ofm_raw[i];
+      uint8_t* dest_buffer = static_cast<uint8_t*>(tensor->data);
+      size_t size = source_buffer->GetSize();
+      uint8_t* source_buffer_data = source_buffer->Map();
+      std::copy(source_buffer_data, source_buffer_data + size, dest_buffer);
+      source_buffer->Unmap();
     }
   }
-
   return inferenceCompleted;
 }
 

From 5e24aa2b7884c034c2369b937255999ef9f202db Mon Sep 17 00:00:00 2001
From: Janet Schneider <janetsc@octoml.ai>
Date: Thu, 6 Oct 2022 11:11:33 -0700
Subject: [PATCH 313/704] [Hexagon] [runtime] Query for total and available
 VTCM (#12992)

Update HexagonVtcmPool to use compute_resource_query_VTCM to query for the total and available blocks in VTCM.  Ensures at least 1MB is available.

Also fixes two typos in the header.
---
 src/runtime/hexagon/hexagon_vtcm_pool.cc | 19 +++++++++++++------
 src/runtime/hexagon/hexagon_vtcm_pool.h  |  4 ++--
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.cc b/src/runtime/hexagon/hexagon_vtcm_pool.cc
index 1f02e2748ff6..107f4df0d187 100644
--- a/src/runtime/hexagon/hexagon_vtcm_pool.cc
+++ b/src/runtime/hexagon/hexagon_vtcm_pool.cc
@@ -29,15 +29,22 @@ HexagonVtcmPool::HexagonVtcmPool() {
   compute_res_attr_t res_info;
   HEXAGON_SAFE_CALL(HAP_compute_res_attr_init(&res_info));
 
-  // TODO(HWE): get the max  and min size programmatically
-  const unsigned int max_size = 4 * 1024 * 1024;
-  const unsigned int min_size = 1024 * 1024;
+  unsigned int total_block_size;
+  unsigned int avail_block_size;
+  compute_res_vtcm_page_t total_block_layout;
+  compute_res_vtcm_page_t avail_block_layout;
+
+  HEXAGON_SAFE_CALL(compute_resource_query_VTCM(/* application_id = */ 0, &total_block_size,
+                                                &total_block_layout, &avail_block_size,
+                                                &avail_block_layout));
+  DLOG(INFO) << "HexagonVtcmPool total " << total_block_size << " avail " << avail_block_size;
+  CHECK(avail_block_size >= (1024 * 1024)) << "Less than 1MB VTCM available";
 
   // allocate nbytes of vtcm on a single page
   HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param_v2(&res_info,
-                                                           /*vtcm_size = */ max_size,
+                                                           /*vtcm_size = */ total_block_size,
                                                            /*min_page_size = */ 1,
-                                                           /*min_vtcm_size = */ min_size));
+                                                           /*min_vtcm_size = */ avail_block_size));
 
   // TODO(HWE): Investigate why a non-zero timeout results in
   // hanging, both in the simulator and on hardware.
@@ -45,7 +52,7 @@ HexagonVtcmPool::HexagonVtcmPool() {
   CHECK(context_id_) << "HAP_compute_res_acquire failed to acquire requested VTCM resource.";
   HEXAGON_SAFE_CALL(HAP_compute_res_attr_get_vtcm_ptr_v2(&res_info, &vtcm_data_, &vtcm_size_));
   CHECK(vtcm_data_ != nullptr) << "HAP_compute_res_acquire returned nullptr when allocating VTCM.";
-  CHECK(vtcm_size_ >= min_size)
+  CHECK(vtcm_size_ >= avail_block_size)
       << "HAP_compute_res_acquire failed to allocate minimum amount of VTCM";
   free_.emplace_back(std::pair<char*, size_t>(static_cast<char*>(vtcm_data_), vtcm_size_));
   // DebugDump();
diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.h b/src/runtime/hexagon/hexagon_vtcm_pool.h
index e1292e4e10d7..1c44a455196c 100644
--- a/src/runtime/hexagon/hexagon_vtcm_pool.h
+++ b/src/runtime/hexagon/hexagon_vtcm_pool.h
@@ -71,10 +71,10 @@ class HexagonVtcmPool {
   size_t TotalBytes() { return reinterpret_cast<size_t>(vtcm_size_); }
 
  private:
-  //! \brief Context for HAP_compute_res_*
+  //! \brief Total size of VTCM pool
   unsigned int vtcm_size_;
 
-  //! \brief Context for HAP_compute_res_*
+  //! \brief Pointer to the beginning of the pool
   void* vtcm_data_;
 
   //! \brief Context for HAP_compute_res_*

From 87c466cdee1a7df1bd147179eb7c81bb7a45d045 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Thu, 6 Oct 2022 21:46:26 +0300
Subject: [PATCH 314/704] [Textures] Improve error reporting (#12986)

---
 src/tir/transforms/texture_flatten.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc
index 3c35b73bc8d7..3f8f0efd1f20 100644
--- a/src/tir/transforms/texture_flatten.cc
+++ b/src/tir/transforms/texture_flatten.cc
@@ -107,7 +107,8 @@ class TextureFlattener : public TextureLoweringBase {
       ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported";
       int vec_length = static_cast<int>(op->bounds.back()->extent.as<IntImmNode>()->value);
       ICHECK(vec_length == 4 || vec_length == 1)
-          << "Inner dimension of texture must be vector of length 1 or 4 (RGBA)";
+          << "Inner dimension of texture must be vector of length 1 or 4 (RGBA), was: "
+          << vec_length;
 
       struct ShapeFromRange {
         const Array<Range>& bounds;

From e375c311dac3c4ec0636c1bd0e203c3cd70f7f23 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 6 Oct 2022 13:51:02 -0500
Subject: [PATCH 315/704] [Arith][IndexMap] Correct MapShape result for small
 vectorized dims (#12927)

Prior to this commit, `IndexMap::MapShape` could produce incorrect
results when the split factor is greater than the size of the
dimension being split.  For example, a buffer of shape `[N]` mapped
transformed with `lambda i: [i//4, i%4]` should result in shape
`[ceildiv(N,4), 4]`.  However, for `N<4`, the transformed shape was
instead `[1, N%4]`.  This results in unexpected shapes when attempting
to prepare a buffer for vectorized access.

This commit preferentially uses the result of `arith::DetectIterMap`
to determine the mapped buffer shape, similar to what is done when
computing the inverse.  The old method of `MapShape`, which relied on
`arith::EvalSet`, is maintained for transformations that aren't
recognized by `arith::DetectIterMap`.
---
 src/tir/ir/index_map.cc                 | 45 +++++++++++++++++++------
 tests/python/unittest/test_index_map.py |  9 ++++-
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc
index 6d982b510a26..a25ecdd04079 100644
--- a/src/tir/ir/index_map.cc
+++ b/src/tir/ir/index_map.cc
@@ -169,23 +169,48 @@ Array<Range> IndexMapNode::MapRanges(const Array<Range>& ranges, arith::Analyzer
     input_iters.Set(initial_indices[i], ranges[i]);
   }
 
-  std::unordered_map<const VarNode*, arith::IntSet> dom_map;
-  for (size_t i = 0; i < initial_indices.size(); i++) {
-    dom_map[initial_indices[i].get()] = arith::IntSet::FromRange(ranges[i]);
-  }
-
   arith::Analyzer local_analyzer;
   if (!analyzer) {
     analyzer = &local_analyzer;
   }
 
+  auto iter_map = DetectIterMap(final_indices, input_iters, /* predicate = */ 1,
+                                /*check_level=*/arith::IterMapLevel::NoCheck, analyzer,
+                                /*simplify_trivial_iterators=*/false);
   Array<Range> output;
-  for (const auto& final_index : final_indices) {
-    auto int_set = arith::EvalSet(final_index, dom_map);
-    output.push_back(Range::FromMinExtent(analyzer->Simplify(int_set.min()),
-                                          analyzer->Simplify(int_set.max() - int_set.min() + 1)));
-  }
+  if (iter_map->indices.size()) {
+    // Preferred route, requires the map to be expressible as an
+    // affine sum.  Since the terms are orthogonal, the extent of the
+    // sum is the extent of the largest term.
+    for (const auto& index : iter_map->indices) {
+      Optional<PrimExpr> extent = NullOpt;
+      for (const auto& term : index->args) {
+        PrimExpr term_extent = term->extent * term->scale;
+        if (extent.defined()) {
+          extent = tvm::max(extent.value(), term_extent);
+        } else {
+          extent = term_extent;
+        }
+      }
+      output.push_back(Range::FromMinExtent(index->base, extent.value_or(1)));
+    }
 
+  } else {
+    // Fall-back method, more general but can ignore intended padding.
+    // For example, [N] mapped through i=>[i//4,i%4] should have shape
+    // [ceildiv(N,4), 4].  However, for N<4, this method instead
+    // results in a shape [1, N].
+    std::unordered_map<const VarNode*, arith::IntSet> dom_map;
+    for (size_t i = 0; i < initial_indices.size(); i++) {
+      dom_map[initial_indices[i].get()] = arith::IntSet::FromRange(ranges[i]);
+    }
+
+    for (const auto& final_index : final_indices) {
+      auto int_set = arith::EvalSet(final_index, dom_map);
+      output.push_back(Range::FromMinExtent(analyzer->Simplify(int_set.min()),
+                                            analyzer->Simplify(int_set.max() - int_set.min() + 1)));
+    }
+  }
   return output;
 }
 
diff --git a/tests/python/unittest/test_index_map.py b/tests/python/unittest/test_index_map.py
index 804d04d0b052..6882c2b42634 100644
--- a/tests/python/unittest/test_index_map.py
+++ b/tests/python/unittest/test_index_map.py
@@ -104,7 +104,7 @@ def test_nonbijective_inverse_gives_error():
             forward=lambda i: [i // 4, i % 4],
             inverse=lambda i, j: [4 * i + j],
             pre_shape=[dynamic_N],
-            post_shape=[(dynamic_N - 1) // 4 + 1, 4],
+            post_shape=[(dynamic_N - dynamic_N % (-4)) // 4, 4],
             padding=lambda i, j: tvm.tir.And(
                 dynamic_N % (-4) != 0,
                 tvm.tir.And(i == dynamic_N // 4, j >= dynamic_N % 4),
@@ -162,6 +162,13 @@ def test_nonbijective_inverse_gives_error():
             post_shape=[8, 4, 4],
             padding=lambda j, i, k: tvm.tir.And(i == 0, j * 4 + k < 5),
         ),
+        "outer_loop_extent_one": dict(
+            forward=lambda i: [i // 4, i % 4],
+            inverse=lambda i, j: [i * 4 + j],
+            pre_shape=[3],
+            post_shape=[1, 4],
+            padding=lambda i, j: 3 <= j,
+        ),
     }
 )
 

From 7ff42a1d4e0a2f890c7eb2095389bed1ead28af2 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 6 Oct 2022 13:52:12 -0500
Subject: [PATCH 316/704] [TIR] Require dtype.is_float() inside FloatImm
 (#12928)

* [TIR] Require dtype.is_float() inside FloatImm

Previously, a `tir::FloatImm` could have a dtype passed to it that was
not a floating point value.  This commit introduces a check, similar
to what is already done in `tir::IntImm`, that requires the dtype to
satisfy `DataType::is_float()`.

* Relaxed test to allow bfloat16 or custom datatypes in FloatImm
---
 src/ir/expr.cc                                |  3 +++
 tests/python/unittest/test_tir_constructor.py | 11 +++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/ir/expr.cc b/src/ir/expr.cc
index c926cc56e89a..f097f8f36352 100644
--- a/src/ir/expr.cc
+++ b/src/ir/expr.cc
@@ -119,6 +119,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 FloatImm::FloatImm(DataType dtype, double value, Span span) {
   ICHECK_EQ(dtype.lanes(), 1) << "ValueError: FloatImm can only take scalar.";
 
+  ICHECK(dtype.is_float() || dtype.is_bfloat16() || dtype.code() >= DataType::kCustomBegin)
+      << "ValueError: FloatImm supports only float, but " << dtype << " was supplied.";
+
   // check range for float32 and float16 since they have specified range.
   if (!std::isinf(value) && !std::isnan(value)) {
     if (dtype.bits() == 32) {
diff --git a/tests/python/unittest/test_tir_constructor.py b/tests/python/unittest/test_tir_constructor.py
index dcd642c3b9ec..6f591efc2d2d 100644
--- a/tests/python/unittest/test_tir_constructor.py
+++ b/tests/python/unittest/test_tir_constructor.py
@@ -14,6 +14,9 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
+import pytest
+
 import tvm
 from tvm import te
 
@@ -189,6 +192,10 @@ def test_stmt_constructor():
     assert isinstance(x, tvm.tir.Prefetch)
 
 
+def test_float_constructor_requires_float_dtype():
+    with pytest.raises(tvm.TVMError):
+        tvm.tir.FloatImm("int32", 1.0)
+
+
 if __name__ == "__main__":
-    test_expr_constructor()
-    test_stmt_constructor()
+    tvm.testing.main()

From 50df4abf3bcc90e002669bfb10dd5a934b2f7246 Mon Sep 17 00:00:00 2001
From: Yangzijun <73699744+Yangzijun510@users.noreply.github.com>
Date: Fri, 7 Oct 2022 02:53:12 +0800
Subject: [PATCH 317/704] [Relay] optimize dumpir time (#12792)

Add string before doc, slow. Add string after doc, fast
---
 src/printer/relay_text_printer.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/printer/relay_text_printer.cc b/src/printer/relay_text_printer.cc
index 35daf588fbeb..76cac28b07f7 100644
--- a/src/printer/relay_text_printer.cc
+++ b/src/printer/relay_text_printer.cc
@@ -389,6 +389,7 @@ Doc RelayTextPrinter::VisitExpr_(const IfNode* op) {
 
 Doc RelayTextPrinter::VisitExpr_(const LetNode* op) {
   int n = 0;
+  size_t l = doc_stack_.size();
   Expr let = GetRef<Let>(op);
   while (auto let_node = let.as<LetNode>()) {
     Doc doc;
@@ -399,11 +400,15 @@ Doc RelayTextPrinter::VisitExpr_(const LetNode* op) {
     ++n;
   }
   Doc doc = PrintScope(let);
+  Doc doc_last;
+  for (int i = 0; i < n; ++i) {
+    doc_last << doc_stack_[l + i];
+  }
+  doc_last << doc;
   for (int i = 0; i < n; ++i) {
-    doc = doc_stack_.back() << doc;
     doc_stack_.pop_back();
   }
-  return doc;
+  return doc_last;
 }
 
 Doc RelayTextPrinter::PrintFunc(const Doc& prefix, const relay::Function& fn) {

From 78df9c1d79c3d2e583bb7b8b472f2eb535b2f2a0 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Thu, 6 Oct 2022 12:56:10 -0700
Subject: [PATCH 318/704] [LOGGING] Optionally print backtrace on segfault
 (#12959)

Add BACKTRACE_ON_SEGFAULT cmake option to install a signal handler to print a backtrace on segfault. Disabled by default because existing signal handlers may be overriden by this signal handler. Creating a backtrace may allocate, which should not happen in a signal handler, but the program is crashing so it shouldn't make any difference.

Hopefully this will help diagnose segfaults like in #12955.
---
 CMakeLists.txt                               |  1 +
 cmake/config.cmake                           |  4 +++
 cmake/modules/LibInfo.cmake                  |  1 +
 cmake/modules/Logging.cmake                  |  5 ++++
 src/runtime/logging.cc                       | 29 ++++++++++++++++++++
 src/support/libinfo.cc                       |  1 +
 tests/scripts/task_config_build_arm.sh       |  1 +
 tests/scripts/task_config_build_cortexm.sh   |  1 +
 tests/scripts/task_config_build_cpu.sh       |  1 +
 tests/scripts/task_config_build_gpu.sh       |  1 +
 tests/scripts/task_config_build_gpu_other.sh |  1 +
 tests/scripts/task_config_build_hexagon.sh   |  1 +
 tests/scripts/task_config_build_i386.sh      |  1 +
 13 files changed, 48 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e961c6d2d992..47d30a89d2d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,6 +67,7 @@ tvm_option(USE_ETHOSN "Build with Arm(R) Ethos(TM)-N" OFF)
 tvm_option(USE_CMSISNN "Build with Arm CMSIS-NN" OFF)
 tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON)
 tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" AUTO)
+tvm_option(BACKTRACE_ON_SEGFAULT "Install a signal handler to print a backtrace on segfault" OFF)
 tvm_option(BUILD_STATIC_RUNTIME "Build static version of libtvm_runtime" OFF)
 tvm_option(USE_PAPI "Use Performance Application Programming Interface (PAPI) to read performance counters" OFF)
 tvm_option(USE_GTEST "Use GoogleTest for C++ sanity tests" AUTO)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 7067af42e9f1..0b72047603f0 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -340,6 +340,10 @@ set(USE_BNNS OFF)
 # - OFF: disable libbacktrace
 set(USE_LIBBACKTRACE AUTO)
 
+# Whether to install a signal handler to print a backtrace on segfault. This
+# may replace existing signal handlers specified by other libraries.
+set(BACKTRACE_ON_SEGFAULT OFF)
+
 # Whether to build static libtvm_runtime.a, the default is to build the dynamic
 # version: libtvm_runtime.so.
 #
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index 73d3a9dbbe10..5e60dec3eede 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -118,6 +118,7 @@ function(add_lib_info src_file)
     TVM_INFO_USE_CLML_GRAPH_EXECUTOR="${USE_CLML_GRAPH_EXECUTOR}"
     TVM_INFO_USE_UMA="${USE_UMA}"
     TVM_INFO_USE_CCACHE="${USE_CCACHE}"
+    TVM_INFO_BACKTRACE_ON_SEGFAULT="${BACKTRACE_ON_SEGFAULT}"
   )
 
 endfunction()
diff --git a/cmake/modules/Logging.cmake b/cmake/modules/Logging.cmake
index a4ebabd4d5e0..6b20fba1fda6 100644
--- a/cmake/modules/Logging.cmake
+++ b/cmake/modules/Logging.cmake
@@ -53,3 +53,8 @@ else()
   target_compile_definitions(tvm_objs PRIVATE TVM_USE_LIBBACKTRACE=0)
   target_compile_definitions(tvm_runtime_objs PRIVATE TVM_USE_LIBBACKTRACE=0)
 endif()
+
+if(BACKTRACE_ON_SEGFAULT)
+  target_compile_definitions(tvm_objs PRIVATE TVM_BACKTRACE_ON_SEGFAULT)
+  target_compile_definitions(tvm_runtime_objs PRIVATE TVM_BACKTRACE_ON_SEGFAULT)
+endif()
diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc
index c6c756d85c7e..d62051f7cee0 100644
--- a/src/runtime/logging.cc
+++ b/src/runtime/logging.cc
@@ -33,6 +33,13 @@
 #include <unordered_map>
 #include <vector>
 
+#if TVM_BACKTRACE_ON_SEGFAULT
+#include <signal.h>
+
+#include <csignal>
+#include <cstring>
+#endif
+
 namespace tvm {
 namespace runtime {
 namespace {
@@ -117,6 +124,28 @@ int BacktraceFullCallback(void* data, uintptr_t pc, const char* filename, int li
   }
   return 0;
 }
+
+#if TVM_BACKTRACE_ON_SEGFAULT
+void backtrace_handler(int sig) {
+  // Technically we shouldn't do any allocation in a signal handler, but
+  // Backtrace may allocate. What's the worst it could do? We're already
+  // crashing.
+  std::cerr << "!!!!!!! TVM encountered a Segfault !!!!!!!\n" << Backtrace() << std::endl;
+
+  // Re-raise signal with default handler
+  struct sigaction act;
+  std::memset(&act, 0, sizeof(struct sigaction));
+  act.sa_flags = SA_RESETHAND;
+  act.sa_handler = SIG_DFL;
+  sigaction(sig, &act, nullptr);
+  raise(sig);
+}
+
+__attribute__((constructor)) void install_signal_handler(void) {
+  // this may override already installed signal handlers
+  std::signal(SIGSEGV, backtrace_handler);
+}
+#endif
 }  // namespace
 
 std::string Backtrace() {
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index 46b12ba25303..40138b1b4d89 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -323,6 +323,7 @@ TVM_DLL Map<String, String> GetLibInfo() {
       {"USE_CLML_GRAPH_EXECUTOR", TVM_INFO_USE_CLML_GRAPH_EXECUTOR},
       {"USE_UMA", TVM_INFO_USE_UMA},
       {"USE_CCACHE", TVM_INFO_USE_CCACHE},
+      {"BACKTRACE_ON_SEGFAULT", TVM_INFO_BACKTRACE_ON_SEGFAULT},
   };
   return result;
 }
diff --git a/tests/scripts/task_config_build_arm.sh b/tests/scripts/task_config_build_arm.sh
index a01c1ed6d082..516e6ac86791 100755
--- a/tests/scripts/task_config_build_arm.sh
+++ b/tests/scripts/task_config_build_arm.sh
@@ -34,5 +34,6 @@ echo set\(USE_VTA_FSIM ON\) >> config.cmake
 echo set\(USE_ARM_COMPUTE_LIB ON\) >> config.cmake
 echo set\(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR "/opt/acl"\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
+echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(USE_UMA ON\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_cortexm.sh b/tests/scripts/task_config_build_cortexm.sh
index 35dbd82110cd..7292f1c31e0d 100755
--- a/tests/scripts/task_config_build_cortexm.sh
+++ b/tests/scripts/task_config_build_cortexm.sh
@@ -32,6 +32,7 @@ echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-10\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
+echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 
diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index 8d5a2a95bb89..e3d8aa9a1d1b 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -55,6 +55,7 @@ echo set\(USE_CMSISNN OFF\) >> config.cmake
 echo set\(USE_VITIS_AI ON\) >> config.cmake
 echo set\(USE_VERILATOR ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE ON\) >> config.cmake
+echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(USE_ETHOSU OFF\) >> config.cmake
 echo set\(USE_UMA ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 5163a16da3cd..ca5f3e935c08 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -45,6 +45,7 @@ echo set\(USE_BLAS openblas\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
 echo set\(USE_TENSORRT_CODEGEN ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
+echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_gpu_other.sh b/tests/scripts/task_config_build_gpu_other.sh
index 9943d9b2514e..6fb10d44508a 100755
--- a/tests/scripts/task_config_build_gpu_other.sh
+++ b/tests/scripts/task_config_build_gpu_other.sh
@@ -31,5 +31,6 @@ echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE OFF\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
+echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_hexagon.sh b/tests/scripts/task_config_build_hexagon.sh
index 2f84bed23a30..101260b764e0 100755
--- a/tests/scripts/task_config_build_hexagon.sh
+++ b/tests/scripts/task_config_build_hexagon.sh
@@ -32,5 +32,6 @@ echo set\(CMAKE_CXX_COMPILER "/opt/sccache/clang++"\) >> config.cmake
 echo set\(USE_HEXAGON "ON"\) >> config.cmake
 echo set\(USE_HEXAGON_SDK "${HEXAGON_SDK_ROOT}"\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
+echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 echo set\(USE_HEXAGON_QHL ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_i386.sh b/tests/scripts/task_config_build_i386.sh
index a570e9801ad3..5e94f864ec90 100755
--- a/tests/scripts/task_config_build_i386.sh
+++ b/tests/scripts/task_config_build_i386.sh
@@ -34,6 +34,7 @@ echo set\(USE_VTA_FSIM ON\) >> config.cmake
 echo set\(USE_VTA_TSIM ON\) >> config.cmake
 echo set\(USE_VERILATOR ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
+echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(USE_UMA OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 

From bb2cec1941db6f5a67b85550faa9279c4c14e2a3 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Thu, 6 Oct 2022 17:05:05 -0700
Subject: [PATCH 319/704] [Fix,MetaSchedule] Correct log usage in arithmetic
 intensity feature (#12806)

In meta schedule's featurization, arithmetic intensity was incorrectly calculated as log(FLOPs) / log(bytes). This change corrects it to log(FLOPs/bytes). Note that this is the same issue as in #12079.
---
 .../feature_extractor/per_store_feature.cc    | 21 +++++++--
 ...ule_feature_extractor_per_store_feature.py | 46 +++++++++++--------
 2 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/src/meta_schedule/feature_extractor/per_store_feature.cc b/src/meta_schedule/feature_extractor/per_store_feature.cc
index 698de010b75e..422f21abe17a 100644
--- a/src/meta_schedule/feature_extractor/per_store_feature.cc
+++ b/src/meta_schedule/feature_extractor/per_store_feature.cc
@@ -1042,6 +1042,17 @@ struct Feature {
   /*!
    * \brief See the wiki page [1] for details
    *
+   * Arithmetic intensity is FLOPs/unique bytes of memory touched. A value is computed
+   * for each set of loop nests starting with just the innermost loop and
+   * reaching to include all loops. There are a variable number of loops, so
+   * n_samples are taken from the curve of arithmetic intensity vs flops. This
+   * biases the values towards larger loops.
+   *
+   * Note that the denominator is unique bytes of memory touched. Repeated
+   * access to the same byte of memory counts as only a single byte touched.
+   *
+   * Values are scaled by log2(x + 1).
+   *
    * [1] https://en.wikipedia.org/wiki/Roofline_model
    */
   std::vector<double> arith_intensity_curve;
@@ -1060,7 +1071,7 @@ struct Feature {
     std::vector<double> memory_bytes;
     memory_bytes.resize(n_loops);
     for (int i = 0; i < n_loops; ++i) {
-      memory_bytes[n_loops - 1 - i] = std::log2(for_touched_bytes[i]);
+      memory_bytes[n_loops - 1 - i] = for_touched_bytes[i];
     }
     // Calculate `compute_ops` and `cur_compute_ops`
     std::vector<double> compute_ops;
@@ -1072,7 +1083,7 @@ struct Feature {
       if (const int64_t* extent = GetLoopIntExtent(loops[i])) {
         total_compute_ops *= *extent;
       }
-      compute_ops.push_back(std::log2(total_compute_ops));
+      compute_ops.push_back(total_compute_ops);
     }
     // Fill the feature set
     if (total_compute_ops <= 0 || compute_ops.empty()) {
@@ -1081,7 +1092,7 @@ struct Feature {
       }
       return;
     }
-    total_compute_ops = compute_ops.back();  // i.e. total_compute_ops = log2(total_compute_ops)
+    total_compute_ops = compute_ops.back();
     int p = 0;
     for (int i = 0; i < n_samples; ++i) {
       double& result = arith_intensity_curve[i];
@@ -1094,13 +1105,13 @@ struct Feature {
       }
       CHECK_LT(p, n_loops);
       if (p == 0) {
-        result = compute_ops[p] / memory_bytes[p];
+        result = slog(compute_ops[p] / memory_bytes[p]);
       } else {
         double base = compute_ops[p - 1] / memory_bytes[p - 1];
         double slope =
             (compute_ops[p] / memory_bytes[p] - compute_ops[p - 1] / memory_bytes[p - 1]) /
             (compute_ops[p] - compute_ops[p - 1]);
-        result = base + slope * (cur_compute_ops - compute_ops[p - 1]);
+        result = slog(base + slope * (cur_compute_ops - compute_ops[p - 1]));
       }
     }
   }
diff --git a/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py b/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
index cad140b8deb5..701e1826b38a 100644
--- a/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
+++ b/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
@@ -365,19 +365,22 @@ def _create_schedule():
         atol=1e-5,
     )
     # Group 3: Arithmetic intensity
+    # arithmetic intensity = flops/bytes touched = 2*512*512*512/(3 * 4 * 512*512)
+    #                             add and multiply ^     3 arrays ^   ^ 4 bytes per f32
+    # = 85.3 but log2 is used so values should be around 6.4
     assert_allclose(
         actual=f[147:157],
         desired=[
-            0.7097842693328857,
-            0.7408391237258911,
-            0.8750449419021606,
-            0.9449487924575806,
-            1.0148526430130005,
-            1.0847564935684204,
-            1.113688349723816,
-            1.1394684314727783,
-            1.2119636535644531,
-            1.2971993684768677,
+            3.812599,
+            4.464822,
+            4.912349,
+            5.253426,
+            5.529086,
+            5.76043,
+            5.959752,
+            6.134849,
+            6.290977,
+            6.431846,
         ],
         rtol=1e-5,
         atol=1e-5,
@@ -1357,19 +1360,22 @@ def _create_schedule():
         atol=1e-5,
     )
     # Group 3: Arithmetic intensity
+    # Arithmetic intensity is high here because of repeated use of a shared
+    # buffer. Multiple accesses to the same memory location are counted as a
+    # single byte, skewing these numbers towards higher intensity.
     assert_allclose(
         actual=f[147:157],
         desired=[
-            0.7097842504665767,
-            0.7548801745187567,
-            0.8775907547541741,
-            0.9957389916154509,
-            1.2446737395193135,
-            1.493608487423176,
-            1.7093103019954263,
-            1.8031580276850985,
-            1.9841832691827785,
-            2.204648076869754,
+            11.98533,
+            12.977811,
+            13.562714,
+            13.977722,
+            14.299632,
+            14.562654,
+            14.785038,
+            14.977677,
+            15.147597,
+            15.299596,
         ],
         rtol=1e-5,
         atol=1e-5,

From 6780c9f87db6620409f8f58c2c2925c7bd7b6681 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Thu, 6 Oct 2022 23:48:07 -0700
Subject: [PATCH 320/704] [MetaSchedule] Tuning API cleanup & ergonomics
 (#12895)

---
 include/tvm/meta_schedule/measure_callback.h  |   7 +-
 include/tvm/meta_schedule/mutator.h           |  15 +-
 include/tvm/meta_schedule/postproc.h          |   9 +
 include/tvm/meta_schedule/schedule_rule.h     |  18 +-
 include/tvm/meta_schedule/search_strategy.h   |  32 +-
 include/tvm/meta_schedule/space_generator.h   |  53 +-
 include/tvm/meta_schedule/task_scheduler.h    | 282 ++++----
 include/tvm/meta_schedule/tune_context.h      |  87 +--
 include/tvm/support/random_engine.h           |  53 +-
 python/tvm/contrib/torch/as_torch.py          |  69 +-
 python/tvm/contrib/torch/optimize_torch.py    | 179 ++---
 python/tvm/meta_schedule/__init__.py          |  19 +-
 python/tvm/meta_schedule/builder/builder.py   |  44 +-
 .../meta_schedule/builder/local_builder.py    |   6 +-
 .../meta_schedule/cost_model/cost_model.py    |  46 +-
 .../tvm/meta_schedule/cost_model/mlp_model.py |   6 +-
 .../meta_schedule/cost_model/random_model.py  |   3 +-
 .../tvm/meta_schedule/cost_model/xgb_model.py |  22 +-
 python/tvm/meta_schedule/database/database.py |  87 ++-
 python/tvm/meta_schedule/default_config.py    | 454 ------------
 .../feature_extractor/feature_extractor.py    |  26 +-
 .../random_feature_extractor.py               |   8 +-
 python/tvm/meta_schedule/logging.py           | 259 +++++++
 .../measure_callback/__init__.py              |   7 +-
 .../measure_callback/echo_statistics.py       |  30 -
 .../measure_callback/measure_callback.py      |  16 +-
 python/tvm/meta_schedule/mutator/mutator.py   |  44 +-
 python/tvm/meta_schedule/postproc/postproc.py |  36 +-
 python/tvm/meta_schedule/profiler.py          |   4 -
 python/tvm/meta_schedule/relay_integration.py | 361 ++++++++--
 .../tvm/meta_schedule/runner/local_runner.py  |   4 +-
 python/tvm/meta_schedule/runner/rpc_runner.py |   4 +-
 python/tvm/meta_schedule/runner/runner.py     |  37 +-
 .../schedule_rule/schedule_rule.py            |  35 +
 .../search_strategy/evolutionary_search.py    |  10 -
 .../search_strategy/replay_func.py            |  11 +-
 .../search_strategy/replay_trace.py           |  12 +-
 .../search_strategy/search_strategy.py        |  86 ++-
 .../space_generator/post_order_apply.py       |  26 +-
 .../space_generator/schedule_fn.py            |  20 +-
 .../space_generator/space_generator.py        | 130 +++-
 .../space_generator/space_generator_union.py  |  20 +-
 .../task_scheduler/gradient_based.py          |  49 +-
 .../task_scheduler/round_robin.py             |  77 +-
 .../task_scheduler/task_scheduler.py          | 282 ++++----
 .../testing/dataset_extract_tasks.py          |   4 +-
 .../testing/dataset_sample_candidates.py      |  17 +-
 .../meta_schedule/testing/relay_workload.py   |  52 +-
 .../meta_schedule/testing/schedule_rule.py    |  36 -
 .../meta_schedule/testing/space_generation.py |  49 +-
 python/tvm/meta_schedule/testing/tlcbench.py  |   5 +-
 .../meta_schedule/testing/torchbench/run.py   | 135 ++--
 python/tvm/meta_schedule/testing/tune_onnx.py |  52 +-
 .../tvm/meta_schedule/testing/tune_relay.py   |  44 +-
 python/tvm/meta_schedule/testing/tune_te.py   |  41 +-
 python/tvm/meta_schedule/tir_integration.py   | 155 ++++
 python/tvm/meta_schedule/tune.py              | 678 ++----------------
 python/tvm/meta_schedule/tune_context.py      | 201 ++----
 python/tvm/meta_schedule/utils.py             | 143 +---
 python/tvm/tir/tensor_intrin/cuda.py          |   7 +-
 .../measure_callback/add_to_database.cc       |   6 +-
 .../measure_callback/echo_statistics.cc       | 134 ----
 .../measure_callback/measure_callback.cc      |  10 +
 .../measure_callback/update_cost_model.cc     |  11 +-
 src/meta_schedule/mutator/mutator.cc          |  30 +
 src/meta_schedule/postproc/postproc.cc        |  48 ++
 .../postproc/rewrite_cooperative_fetch.cc     |   2 +-
 .../schedule_rule/cross_thread_reduction.cc   |   4 +-
 .../schedule_rule/multi_level_tiling.cc       |   4 +-
 .../schedule_rule/multi_level_tiling.h        |   2 +-
 .../multi_level_tiling_tensor_core.cc         |  10 +-
 .../multi_level_tiling_with_intrin.cc         |   6 +-
 .../schedule_rule/schedule_rule.cc            | 154 ++++
 .../search_strategy/evolutionary_search.cc    | 141 ++--
 .../search_strategy/replay_func.cc            |  98 +--
 .../search_strategy/replay_trace.cc           |  97 +--
 .../search_strategy/search_strategy.cc        |   5 +-
 .../space_generator/post_order_apply.cc       |  35 +-
 .../space_generator/schedule_fn.cc            |  11 +-
 .../space_generator/space_generator.cc        |  96 +++
 .../space_generator/space_generator_union.cc  |  16 +-
 .../task_scheduler/gradient_based.cc          | 218 ++----
 .../task_scheduler/round_robin.cc             |  23 +-
 .../task_scheduler/task_scheduler.cc          | 405 +++++++----
 src/meta_schedule/tune_context.cc             | 180 +----
 src/meta_schedule/utils.h                     |  57 +-
 src/relay/backend/te_compiler.cc              |   2 +-
 src/relay/backend/te_compiler_cache.cc        |  14 +-
 src/relay/backend/utils.h                     |   7 +
 src/tir/schedule/concrete_schedule.cc         |   2 +-
 .../test_hexagon/test_meta_schedule.py        | 115 +--
 ...to_tensorize.py => test_auto_tensorize.py} | 191 ++---
 .../python/integration/test_legacy_tuning.py  | 380 ++++++++++
 tests/python/integration/test_tuning.py       | 433 ++---------
 .../unittest/test_meta_schedule_cost_model.py |  17 +-
 ..._meta_schedule_custom_rule_winograd_cpu.py |   4 -
 ...meta_schedule_custom_rule_winograd_cuda.py |   3 -
 .../test_meta_schedule_measure_callback.py    |  18 +-
 ...chedule_mutator_mutate_compute_location.py |  17 +-
 ...t_meta_schedule_mutator_mutate_parallel.py |  17 +-
 ..._schedule_mutator_mutate_thread_binding.py |  17 +-
 ..._meta_schedule_mutator_mutate_tile_size.py |  15 +-
 ...est_meta_schedule_mutator_mutate_unroll.py |  17 +-
 .../test_meta_schedule_post_order_apply.py    |  50 +-
 ...schedule_postproc_disallow_dynamic_loop.py |  21 +-
 ...dule_postproc_rewrite_cooperative_fetch.py |  22 +-
 ...t_meta_schedule_postproc_rewrite_layout.py |  83 ++-
 ...hedule_postproc_rewrite_reduction_block.py |  21 +-
 ...eta_schedule_postproc_rewrite_tensorize.py |  26 +-
 ...schedule_postproc_rewrite_unbound_block.py |  26 +-
 ..._meta_schedule_postproc_verify_gpu_code.py |  23 +-
 ...> test_meta_schedule_relay_integration.py} | 208 +++++-
 ...meta_schedule_schedule_rule_add_rfactor.py |  23 +-
 ...t_meta_schedule_schedule_rule_auto_bind.py |  33 +-
 ...meta_schedule_schedule_rule_auto_inline.py |  43 +-
 ...le_schedule_rule_cross_thread_reduction.py |  51 +-
 .../test_meta_schedule_schedule_rule_mlt.py   |  61 +-
 ..._meta_schedule_schedule_rule_mlt_intrin.py |  19 +-
 ...test_meta_schedule_schedule_rule_mlt_tc.py |  80 ++-
 ...schedule_rule_parallel_vectorize_unroll.py |  21 +-
 ...e_schedule_rule_random_compute_location.py |  13 +-
 .../test_meta_schedule_search_strategy.py     |  45 +-
 .../unittest/test_meta_schedule_space_cpu.py  | 106 +--
 .../unittest/test_meta_schedule_space_cuda.py | 116 +--
 .../test_meta_schedule_space_generator.py     |   7 +-
 .../test_meta_schedule_task_scheduler.py      | 155 ++--
 .../unittest/test_meta_schedule_tune_relay.py | 554 --------------
 .../unittest/test_meta_schedule_tune_te.py    |  52 --
 .../unittest/test_meta_schedule_tune_tir.py   | 115 ++-
 .../test_meta_schedule_vnni_integration.py    | 249 +++++++
 .../unittest/test_tir_schedule_trace.py       |   1 -
 131 files changed, 4873 insertions(+), 5127 deletions(-)
 delete mode 100644 python/tvm/meta_schedule/default_config.py
 create mode 100644 python/tvm/meta_schedule/logging.py
 delete mode 100644 python/tvm/meta_schedule/measure_callback/echo_statistics.py
 delete mode 100644 python/tvm/meta_schedule/testing/schedule_rule.py
 create mode 100644 python/tvm/meta_schedule/tir_integration.py
 delete mode 100644 src/meta_schedule/measure_callback/echo_statistics.cc
 rename tests/python/integration/{test_meta_schedule_auto_tensorize.py => test_auto_tensorize.py} (73%)
 create mode 100644 tests/python/integration/test_legacy_tuning.py
 rename tests/python/unittest/{test_meta_schedule_integration.py => test_meta_schedule_relay_integration.py} (55%)
 delete mode 100644 tests/python/unittest/test_meta_schedule_tune_relay.py
 delete mode 100644 tests/python/unittest/test_meta_schedule_tune_te.py
 create mode 100644 tests/python/unittest/test_meta_schedule_vnni_integration.py

diff --git a/include/tvm/meta_schedule/measure_callback.h b/include/tvm/meta_schedule/measure_callback.h
index 151582d4c9ce..30d1c2cd3ee0 100644
--- a/include/tvm/meta_schedule/measure_callback.h
+++ b/include/tvm/meta_schedule/measure_callback.h
@@ -122,11 +122,6 @@ class MeasureCallback : public runtime::ObjectRef {
    * \return The measure callback created.
    */
   TVM_DLL static MeasureCallback RemoveBuildArtifact();
-  /*!
-   * \brief Create a measure callback that echos the statistics of the tuning process to the console
-   * \return The measure callback created.
-   */
-  TVM_DLL static MeasureCallback EchoStatistics();
   /*!
    * \brief Create a measure callback that updates the cost model with measurement result.
    * \return The measure callback created.
@@ -140,6 +135,8 @@ class MeasureCallback : public runtime::ObjectRef {
    */
   TVM_DLL static MeasureCallback PyMeasureCallback(PyMeasureCallbackNode::FApply f_apply,
                                                    PyMeasureCallbackNode::FAsString f_as_string);
+  /*! \brief The default list of measure callbacks. */
+  TVM_DLL static Array<MeasureCallback, void> Default();
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(MeasureCallback, ObjectRef, MeasureCallbackNode);
 };
 
diff --git a/include/tvm/meta_schedule/mutator.h b/include/tvm/meta_schedule/mutator.h
index 2b580e75e019..08a8248dfdbc 100644
--- a/include/tvm/meta_schedule/mutator.h
+++ b/include/tvm/meta_schedule/mutator.h
@@ -127,10 +127,17 @@ class Mutator : public runtime::ObjectRef {
    * \param f_as_string The packed function of `AsString`.
    * \return The mutator created.
    */
-  TVM_DLL static Mutator PyMutator(FInitializeWithTuneContext f_initialize_with_tune_context,  //
-                                   FApply f_apply,                                             //
-                                   FClone f_clone,                                             //
-                                   FAsString f_as_string);
+  TVM_DLL static Mutator PyMutator(FInitializeWithTuneContext f_initialize_with_tune_context,
+                                   FApply f_apply, FClone f_clone, FAsString f_as_string);
+  /*! \brief Create default mutators for LLVM */
+  TVM_DLL static Map<Mutator, FloatImm, void> DefaultLLVM();
+  /*! \brief Create default mutators for CUDA */
+  TVM_DLL static Map<Mutator, FloatImm, void> DefaultCUDA();
+  /*! \brief Create default mutators for CUDA with TensorCore */
+  TVM_DLL static Map<Mutator, FloatImm, void> DefaultCUDATensorCore();
+  /*! \brief Create default mutators for Hexagon */
+  TVM_DLL static Map<Mutator, FloatImm, void> DefaultHexagon();
+
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Mutator, ObjectRef, MutatorNode);
 };
 
diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h
index 4fafb9557631..a680a647956c 100644
--- a/include/tvm/meta_schedule/postproc.h
+++ b/include/tvm/meta_schedule/postproc.h
@@ -150,6 +150,15 @@ class Postproc : public runtime::ObjectRef {
    * \return The postprocessor created
    */
   TVM_DLL static Postproc RewriteLayout();
+  /*! \brief Create default postprocessors for LLVM */
+  TVM_DLL static Array<Postproc, void> DefaultLLVM();
+  /*! \brief Create default postprocessors for CUDA */
+  TVM_DLL static Array<Postproc, void> DefaultCUDA();
+  /*! \brief Create default postprocessors for CUDA with TensorCore */
+  TVM_DLL static Array<Postproc, void> DefaultCUDATensorCore();
+  /*! \brief Create default postprocessors for Hexagon */
+  TVM_DLL static Array<Postproc, void> DefaultHexagon();
+
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Postproc, ObjectRef, PostprocNode);
 };
 
diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h
index 2c9da1df9dae..3bc30e09c74a 100644
--- a/include/tvm/meta_schedule/schedule_rule.h
+++ b/include/tvm/meta_schedule/schedule_rule.h
@@ -140,8 +140,8 @@ class ScheduleRule : public runtime::ObjectRef {
                                                Optional<Map<String, ObjectRef>> reuse_write);
 
   /*!
-   * \brief Extension of MultiLevelTiling for auto-tensorizing with a single intrinsic.
-   * \param intrin_name The name of a tensor intrinsic, must be registerd via
+   * \brief Extension of MultiLevelTiling for auto-tensorization with a single intrinsic.
+   * \param intrin_name The name of a tensor intrinsic, must be registered via
    * TensorIntrin.register(...) beforehand
    * \param structure The tiling structure. Recommended:
    * - 'SSRSRS' on CPU
@@ -162,12 +162,12 @@ class ScheduleRule : public runtime::ObjectRef {
       Optional<Map<String, ObjectRef>> reuse_read, Optional<Map<String, ObjectRef>> reuse_write);
 
   /*!
-   * \brief Extension of MultiLevelTiling for auto-tensorizing with multiple groups of candidate
+   * \brief Extension of MultiLevelTiling for auto-tensorization with multiple groups of candidate
    * tensor core intrinsics
    * \param intrin_groups A list of groups of tensor core intrinsics. The map should contains key
    * "init", "load_a", "load_b", "compute", "store", which represent the tensor intrin for
    * initialization, loading operand A, loading operand B, tensor core computation, storing the
-   * result. The value of the map should be names of tensor intrinsics, must be registerd via
+   * result. The value of the map should be names of tensor intrinsics, must be registered via
    * TensorIntrin.register(...) beforehand
    * \param structure The tiling structure. Recommended:
    * - 'SSSRRSRS' on GPU
@@ -261,6 +261,16 @@ class ScheduleRule : public runtime::ObjectRef {
       FApply f_apply,                                             //
       FClone f_clone,                                             //
       FAsString f_as_string);
+
+  /*! \brief Create default schedule rules for LLVM */
+  TVM_DLL static Array<ScheduleRule, void> DefaultLLVM();
+  /*! \brief Create default schedule rules for CUDA */
+  TVM_DLL static Array<ScheduleRule, void> DefaultCUDA();
+  /*! \brief Create default postprocessors for CUDA with TensorCore */
+  TVM_DLL static Array<ScheduleRule, void> DefaultCUDATensorCore();
+  /*! \brief Create default schedule rules for Hexagon */
+  TVM_DLL static Array<ScheduleRule, void> DefaultHexagon();
+
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(ScheduleRule, ObjectRef, ScheduleRuleNode);
 };
 
diff --git a/include/tvm/meta_schedule/search_strategy.h b/include/tvm/meta_schedule/search_strategy.h
index efd3dc24524a..c2399eef0824 100644
--- a/include/tvm/meta_schedule/search_strategy.h
+++ b/include/tvm/meta_schedule/search_strategy.h
@@ -88,6 +88,8 @@ class SearchStrategyNode : public runtime::Object {
 
   /*!
    * \brief Pre-tuning for the search strategy.
+   * \param max_trials The maximum number of trials.
+   * \param num_trials_per_iter The number of trials per iteration.
    * \param design_spaces The design spaces used during tuning process.
    * \param database The database used during tuning process.
    * \param cost_model The cost model used during tuning process.
@@ -95,7 +97,8 @@ class SearchStrategyNode : public runtime::Object {
    *  initialization. Because the search strategy is stateful, we can always call pretuning
    *  and reset the search strategy.
    */
-  virtual void PreTuning(const Array<tir::Schedule>& design_spaces,
+  virtual void PreTuning(int max_trials, int num_trials_per_iter,
+                         const Array<tir::Schedule>& design_spaces,
                          const Optional<Database>& database,
                          const Optional<CostModel>& cost_model) = 0;
 
@@ -143,10 +146,10 @@ class SearchStrategy : public runtime::ObjectRef {
   using FInitializeWithTuneContext = runtime::TypedPackedFunc<void(const TuneContext&)>;
   /*!
    * \brief The function type of `PreTuning` method.
-   * \param design_spaces The design spaces for pre-tuning.
    */
   using FPreTuning = runtime::TypedPackedFunc<void(
-      const Array<tir::Schedule>&, const Optional<Database>&, const Optional<CostModel>&)>;
+      int max_trials, int num_trials_per_iter, const Array<tir::Schedule>&,
+      const Optional<Database>&, const Optional<CostModel>&)>;
   /*! \brief The function type of `PostTuning` method. */
   using FPostTuning = runtime::TypedPackedFunc<void()>;
   /*!
@@ -185,24 +188,15 @@ class SearchStrategy : public runtime::ObjectRef {
 
   /*!
    * \brief Constructor of replay trace search strategy.
-   * \param num_trials_per_iter The number of trials per iteration, i.e., the batch size.
-   * \param max_trials_per_task The total number of trials for trace replaying.
    * \param max_fail_count The max number of failures during trace replaying.
    */
-  TVM_DLL static SearchStrategy ReplayTrace(int num_trials_per_iter, int max_trials_per_task,
-                                            int max_fail_count);
+  TVM_DLL static SearchStrategy ReplayTrace(int max_fail_count);
 
-  /*!
-   * \brief Constructor of replay func search strategy.
-   * \param num_trials_per_iter The number of trials per iteration, i.e., the batch size.
-   * \param max_trials_per_task The total number of trials for func replaying.
-   */
-  TVM_DLL static SearchStrategy ReplayFunc(int num_trials_per_iter, int max_trials_per_task);
+  /*! \brief Constructor of replay func search strategy. */
+  TVM_DLL static SearchStrategy ReplayFunc();
 
   /*!
    * \brief Constructor of evolutionary search strategy.
-   * \param num_trials_per_iter The number of trials per iteration, i.e., the batch size.
-   * \param max_trials_per_task The total number of trials for evolutionary search.
    * \param population_size The initial sample population.
    * \param init_measured_ratio The ratio of measures samples in initial population.
    * \param init_min_unmeasured The minimal size of unmeasured population in the initial sampling.
@@ -211,9 +205,7 @@ class SearchStrategy : public runtime::ObjectRef {
    * \param genetic_max_fail_count The maximum number to try evolving the given trace.
    * \param eps_greedy The ratio to select samples in a greedy fashion via their predicted score.
    */
-  TVM_DLL static SearchStrategy EvolutionarySearch(int num_trials_per_iter,     //
-                                                   int max_trials_per_task,     //
-                                                   int population_size,         //
+  TVM_DLL static SearchStrategy EvolutionarySearch(int population_size,         //
                                                    double init_measured_ratio,  //
                                                    int init_min_unmeasured,     //
                                                    int genetic_num_iters,       //
@@ -257,8 +249,8 @@ class PySearchStrategyNode : public SearchStrategyNode {
   }
 
   void InitializeWithTuneContext(const TuneContext& context) final;
-  void PreTuning(const Array<tir::Schedule>& design_spaces, const Optional<Database>& database,
-                 const Optional<CostModel>& cost_model) final;
+  void PreTuning(int max_trials, int num_trials_per_iter, const Array<tir::Schedule>& design_spaces,
+                 const Optional<Database>& database, const Optional<CostModel>& cost_model) final;
   void PostTuning() final;
   Optional<Array<MeasureCandidate>> GenerateMeasureCandidates() final;
   void NotifyRunnerResults(const Array<MeasureCandidate>& measure_candidates,
diff --git a/include/tvm/meta_schedule/space_generator.h b/include/tvm/meta_schedule/space_generator.h
index 1e29e757a15c..f746eb809194 100644
--- a/include/tvm/meta_schedule/space_generator.h
+++ b/include/tvm/meta_schedule/space_generator.h
@@ -20,10 +20,14 @@
 #define TVM_META_SCHEDULE_SPACE_GENERATOR_H_
 
 #include <tvm/ir/module.h>
+#include <tvm/meta_schedule/mutator.h>
+#include <tvm/meta_schedule/postproc.h>
+#include <tvm/meta_schedule/schedule_rule.h>
 #include <tvm/node/reflection.h>
 #include <tvm/runtime/container/array.h>
 #include <tvm/runtime/object.h>
 #include <tvm/runtime/packed_func.h>
+#include <tvm/target/target.h>
 #include <tvm/tir/schedule/schedule.h>
 
 namespace tvm {
@@ -71,6 +75,19 @@ class SpaceGenerator;
 */
 class SpaceGeneratorNode : public runtime::Object {
  public:
+  /*! \brief The schedule rules. */
+  Optional<Array<ScheduleRule>> sch_rules;
+  /*! \brief The postprocessors. */
+  Optional<Array<Postproc>> postprocs;
+  /*! \brief The probability of using certain mutator. */
+  Optional<Map<Mutator, FloatImm>> mutator_probs;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("sch_rules", &sch_rules);
+    v->Visit("postprocs", &postprocs);
+    v->Visit("mutator_probs", &mutator_probs);
+  }
+
   /*! \brief Default destructor */
   virtual ~SpaceGeneratorNode() = default;
 
@@ -79,7 +96,7 @@ class SpaceGeneratorNode : public runtime::Object {
    * \param context The tuning context for initialization.
    * \note This method is supposed to be called only once before every other method.
    */
-  virtual void InitializeWithTuneContext(const TuneContext& context) = 0;
+  virtual void InitializeWithTuneContext(const TuneContext& context);
 
   /*!
    * \brief Generate design spaces given a module.
@@ -127,12 +144,17 @@ class SpaceGenerator : public runtime::ObjectRef {
  public:
   /*!
    * \brief Create a design space generator with customized methods on the python-side.
+   * \param sch_rules The schedule rules.
+   * \param postprocs The postprocessors.
+   * \param mutator_probs The probability of using certain mutator.
    * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`.
    * \param f_generate_design_space The packed function of `GenerateDesignSpace`.
    * \param f_clone The packed function of `Clone`.
    * \return The design space generator created.
    */
   TVM_DLL static SpaceGenerator PySpaceGenerator(
+      Optional<Array<ScheduleRule>> sch_rules, Optional<Array<Postproc>> postprocs,
+      Optional<Map<Mutator, FloatImm>> mutator_probs,
       FInitializeWithTuneContext f_initialize_with_tune_context,
       FGenerateDesignSpace f_generate_design_space, FClone f_clone);
   /*!
@@ -141,19 +163,39 @@ class SpaceGenerator : public runtime::ObjectRef {
    * 1) void(Schedule)
    * 2) Schedule(Schedule)
    * 3) Array<Schedule>(Schedule)
+   * \param sch_rules The schedule rules.
+   * \param postprocs The postprocessors.
+   * \param mutator_probs The probability of using certain mutator.
    */
-  TVM_DLL static SpaceGenerator ScheduleFn(PackedFunc schedule_fn);
+  TVM_DLL static SpaceGenerator ScheduleFn(PackedFunc schedule_fn,
+                                           Optional<Array<ScheduleRule>> sch_rules,
+                                           Optional<Array<Postproc>> postprocs,
+                                           Optional<Map<Mutator, FloatImm>> mutator_probs);
   /*!
    * \brief Create a design space generator that is union of multiple design space generators.
    * \param space_generators An array of design space generators to be unioned.
+   * \param sch_rules The schedule rules.
+   * \param postprocs The postprocessors.
+   * \param mutator_probs The probability of using certain mutator.
    * \return The design space generator created.
    */
-  TVM_DLL static SpaceGenerator SpaceGeneratorUnion(Array<SpaceGenerator, void> space_generators);
+  TVM_DLL static SpaceGenerator SpaceGeneratorUnion(Array<SpaceGenerator, void> space_generators,
+                                                    Optional<Array<ScheduleRule>> sch_rules,
+                                                    Optional<Array<Postproc>> postprocs,
+                                                    Optional<Map<Mutator, FloatImm>> mutator_probs);
   /*!
    * \brief Create a design space generator that generates design spaces by applying schedule
-   * rules to blocks in post-DFS order. \return The design space generator created.
+   * rules to blocks in post-DFS order.
+   * \param f_block_filter The filter function to filter blocks to be applied with schedule rules.
+   * \param sch_rules The schedule rules.
+   * \param postprocs The postprocessors.
+   * \param mutator_probs The probability of using certain mutator.
+   * \return The design space generator created.
    */
-  TVM_DLL static SpaceGenerator PostOrderApply(runtime::PackedFunc f_block_filter = nullptr);
+  TVM_DLL static SpaceGenerator PostOrderApply(runtime::PackedFunc f_block_filter,
+                                               Optional<Array<ScheduleRule>> sch_rules,
+                                               Optional<Array<Postproc>> postprocs,
+                                               Optional<Map<Mutator, FloatImm>> mutator_probs);
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(SpaceGenerator, ObjectRef, SpaceGeneratorNode);
 };
 
@@ -171,6 +213,7 @@ class PySpaceGeneratorNode : public SpaceGeneratorNode {
   FClone f_clone;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
+    SpaceGeneratorNode::VisitAttrs(v);
     // `f_initialize_with_tune_context` is not visited
     // `f_generate_design_space` is not visited
     // `f_clone` is not visited
diff --git a/include/tvm/meta_schedule/task_scheduler.h b/include/tvm/meta_schedule/task_scheduler.h
index 385816e790e2..17d82558fb82 100644
--- a/include/tvm/meta_schedule/task_scheduler.h
+++ b/include/tvm/meta_schedule/task_scheduler.h
@@ -21,7 +21,6 @@
 
 #include <tvm/meta_schedule/builder.h>
 #include <tvm/meta_schedule/cost_model.h>
-#include <tvm/meta_schedule/database.h>
 #include <tvm/meta_schedule/measure_callback.h>
 #include <tvm/meta_schedule/runner.h>
 #include <tvm/meta_schedule/tune_context.h>
@@ -32,9 +31,64 @@
 #include <tvm/runtime/packed_func.h>
 #include <tvm/support/random_engine.h>
 
+#include <string>
+#include <vector>
+
 namespace tvm {
 namespace meta_schedule {
 
+class TaskRecordNode : public runtime::Object {
+ public:
+  /*! \brief The tune context of the task. */
+  TuneContext ctx{nullptr};
+  /*! \brief The weight of the task */
+  double task_weight{1.0};
+  /*! \brief The FLOP count of the task */
+  double flop{1.0};
+  /*! \brief Whether the tuning task has been stopped or finished. */
+  bool is_terminated = false;
+  /*! \brief Builder errors happens in the task */
+  int build_error_count = 0;
+  /*! \brief Runner errors happens in the task */
+  int run_error_count = 0;
+  /*! \brief The latency of each run, in milliseconds. */
+  std::vector<double> latency_ms = {};
+  /*! \brief The measure candidates. */
+  Optional<Array<MeasureCandidate>> measure_candidates = NullOpt;
+  /*! \brief The building results. */
+  Optional<Array<BuilderResult>> builder_results = NullOpt;
+  /*! \brief Packed functions to fetch the runner results asynchronously. */
+  Optional<Array<RunnerFuture>> runner_futures = NullOpt;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("ctx", &ctx);
+    v->Visit("task_weight", &task_weight);
+    v->Visit("flop", &flop);
+    v->Visit("is_terminated", &is_terminated);
+    v->Visit("build_error_count", &build_error_count);
+    v->Visit("run_error_count", &run_error_count);
+    // `latency_ms` is not visited
+    v->Visit("measure_candidates", &measure_candidates);
+    v->Visit("builder_results", &builder_results);
+    v->Visit("runner_futures", &runner_futures);
+  }
+
+  static constexpr const char* _type_key = "meta_schedule.TaskRecord";
+  TVM_DECLARE_FINAL_OBJECT_INFO(TaskRecordNode, Object);
+};
+
+/*!
+ * \brief Managed reference to TaskRecordNode.
+ * \sa TaskRecordNode
+ */
+class TaskRecord : public runtime::ObjectRef {
+ public:
+  /*! \brief Constructor */
+  explicit TaskRecord(TuneContext task, double task_weight);
+
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(TaskRecord, ObjectRef, TaskRecordNode);
+};
+
 /*!
  * \brief The abstract interface of task schedulers.
  * \note The relationship between SpaceGenerator and other classes are as follows:
@@ -73,66 +127,77 @@ namespace meta_schedule {
 */
 class TaskSchedulerNode : public runtime::Object {
  public:
-  /*! \brief The tasks to be tuned */
-  Array<TuneContext> tasks;
-  /*! \brief The builder of the scheduler. */
-  Builder builder{nullptr};
-  /*! \brief The runner of the scheduler. */
-  Runner runner{nullptr};
-  /*! \brief The database of the scheduler. */
-  Optional<Database> database;
-  /*! \brief The cost model of the scheduler. */
-  Optional<CostModel> cost_model;
+  /*! \brief The tuning task's logging function. */
+  PackedFunc logger;
+  /*! \brief Records for each task */
+  Array<TaskRecord> tasks_;
   /*! \brief The list of measure callbacks of the scheduler. */
-  Array<MeasureCallback> measure_callbacks;
-  /*! \brief The maximum number of trials allowed. */
-  int max_trials;
-  /*! \brief The number of trials already conducted. */
-  int num_trials_already;
-  /*! \brief The tuning task's logging function. t*/
-  PackedFunc logging_func;
+  Array<MeasureCallback> measure_callbacks_;
+  /*! \brief The database used in tuning */
+  Optional<Database> database_;
+  /*! \brief The cost model used in tuning */
+  Optional<CostModel> cost_model_;
+  /*! \brief The number of remaining tasks to be tuned. */
+  int remaining_tasks_;
 
   /*! \brief The default destructor. */
   virtual ~TaskSchedulerNode() = default;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("tasks", &tasks);
-    v->Visit("builder", &builder);
-    v->Visit("runner", &runner);
-    v->Visit("database", &database);
-    v->Visit("cost_model", &cost_model);
-    v->Visit("measure_callbacks", &measure_callbacks);
-    v->Visit("max_trials", &max_trials);
-    v->Visit("num_trials_already", &num_trials_already);
-    // `logging_func` is not visited
+    // `logger` is not visited
+    v->Visit("tasks_", &tasks_);
+    v->Visit("measure_callbacks_", &measure_callbacks_);
+    v->Visit("database_", &database_);
+    v->Visit("cost_model_", &cost_model_);
+    v->Visit("remaining_tasks_", &remaining_tasks_);
   }
 
-  /*! \brief Auto-tuning. */
-  virtual void Tune();
-
-  /*!
-   * \brief Initialize modules of the given task.
-   * \param task_id The task id to be initialized.
-   */
-  virtual void InitializeTask(int task_id);
-
   /*!
-   * \brief Touch the task and update its status
-   * \param task_id The task id to be checked.
+   * \brief Fetch the next task id.
+   * \return The next task id.
    */
-  virtual void TouchTask(int task_id);
-
+  virtual int NextTaskId() = 0;
   /*!
    * \brief Wait until the task is finished.
    * \param task_id The task id to be joined.
+   * \return The results from the runner.
    */
   virtual Array<RunnerResult> JoinRunningTask(int task_id);
-
   /*!
-   * \brief Fetch the next task id.
-   * \return The next task id.
+   * \brief Jointly tune a given list of tasks.
+   * \param tasks The tasks to be tuned
+   * \param task_weights The weight of each task
+   * \param max_trials_global The maximum number of trials to be performed globally
+   * \param max_trials_per_task The maximum number of trials to be performed for each task
+   * \param num_trials_per_iter The number of trials to be performed in each iteration
+   * \param builder The MetaSchedule builder
+   * \param runner The MetaSchedule runner
+   * \param measure_callbacks The callbacks to be called after each measurement
+   * \param database The database used in tuning
+   * \param cost_model The cost model used in tuning
    */
-  virtual int NextTaskId() = 0;
+  virtual void Tune(Array<TuneContext> tasks,                  //
+                    Array<FloatImm> task_weights,              //
+                    int max_trials_global,                     //
+                    int max_trials_per_task,                   //
+                    int num_trials_per_iter,                   //
+                    Builder builder,                           //
+                    Runner runner,                             //
+                    Array<MeasureCallback> measure_callbacks,  //
+                    Optional<Database> database,               //
+                    Optional<CostModel> cost_model);
+  /*!
+   * \brief Terminate a task
+   * \param task_id The id of the task to be terminated
+   */
+  void TerminateTask(int task_id);
+  /*!
+   * \brief Touch the task and update its status
+   * \param task_id The task id to be checked.
+   */
+  void TouchTask(int task_id);
+  /*! \brief Returns a human-readable string of the tuning statistics. */
+  std::string TuningStatistics() const;
 
   static constexpr const char* _type_key = "meta_schedule.TaskScheduler";
   TVM_DECLARE_BASE_OBJECT_INFO(TaskSchedulerNode, Object);
@@ -143,55 +208,48 @@ class TaskScheduler;
 /*! \brief The task scheduler with customized methods on the python-side. */
 class PyTaskSchedulerNode : public TaskSchedulerNode {
  public:
-  /*! \brief The function type of `Tune` method. */
-  using FTune = runtime::TypedPackedFunc<void()>;
-
-  /*! \brief The function type of `InitializeTask` method. */
-  using FInitializeTask = runtime::TypedPackedFunc<void(int)>;
-
   /*!
-   * \brief The function type of `TouchTask` method.
-   * \param task_id The task id to be checked.
-   * \return Whether the task is running.
+   * \brief The function type of `NextTaskId` method.
+   * \return The next task id.
    */
-  using FTouchTask = runtime::TypedPackedFunc<void(int)>;
-
+  using FNextTaskId = runtime::TypedPackedFunc<int()>;
   /*!
    * \brief The function type of `JoinRunningTask` method.
    * \param task_id The task id to be joined.
    */
   using FJoinRunningTask = runtime::TypedPackedFunc<Array<RunnerResult>(int)>;
+  /*! \brief The function type of `Tune` method. */
+  using FTune = runtime::TypedPackedFunc<void(Array<TuneContext> tasks,                  //
+                                              Array<FloatImm> task_weights,              //
+                                              int max_trials_global,                     //
+                                              int max_trials_per_task,                   //
+                                              int num_trials_per_iter,                   //
+                                              Builder builder,                           //
+                                              Runner runner,                             //
+                                              Array<MeasureCallback> measure_callbacks,  //
+                                              Optional<Database> database,               //
+                                              Optional<CostModel> cost_model)>;
 
-  /*!
-   * \brief The function type of `NextTaskId` method.
-   * \return The next task id.
-   */
-  using FNextTaskId = runtime::TypedPackedFunc<int()>;
-
-  /*! \brief The packed function to the `Tune` function. */
-  FTune f_tune;
-  /*! \brief The packed function to the `InitializeTask` function. */
-  FInitializeTask f_initialize_task;
-  /*! \brief The packed function to the `TouchTask` function. */
-  FTouchTask f_touch_task;
-  /*! \brief The packed function to the `JoinRunningTask` function. */
-  FJoinRunningTask f_join_running_task;
   /*! \brief The packed function to the `NextTaskId` function. */
   FNextTaskId f_next_task_id;
+  /*! \brief The packed function to the `JoinRunningTask` function. */
+  FJoinRunningTask f_join_running_task;
+  /*! \brief The packed function to the `Tune` function. */
+  FTune f_tune;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
-    // `f_tune` is not visited
-    // `f_initialize_task` is not visited
-    // `f_touch_task` is not visited
-    // `f_join_running_task` is not visited
+    TaskSchedulerNode::VisitAttrs(v);
     // `f_next_task_id` is not visited
+    // `f_join_running_task` is not visited
+    // `f_tune` is not visited
   }
 
-  void Tune() final;
-  void InitializeTask(int task_id) final;
-  void TouchTask(int task_id) final;
-  Array<RunnerResult> JoinRunningTask(int task_id) final;
   int NextTaskId() final;
+  Array<RunnerResult> JoinRunningTask(int task_id) final;
+  void Tune(Array<TuneContext> tasks, Array<FloatImm> task_weights, int max_trials_global,
+            int max_trials_per_task, int num_trials_per_iter, Builder builder, Runner runner,
+            Array<MeasureCallback> measure_callbacks, Optional<Database> database,
+            Optional<CostModel> cost_model) final;
 
   static constexpr const char* _type_key = "meta_schedule.PyTaskScheduler";
   TVM_DECLARE_FINAL_OBJECT_INFO(PyTaskSchedulerNode, TaskSchedulerNode);
@@ -205,83 +263,31 @@ class TaskScheduler : public runtime::ObjectRef {
  public:
   /*!
    * \brief Create a task scheduler that fetches tasks in a round-robin fashion.
-   * \param tasks The tasks to be tuned.
-   * \param builder The builder of the scheduler.
-   * \param runner The runner of the scheduler.
-   * \param database The database of the scheduler.
-   * \param max_trials The maximum number of trials.
-   * \param cost_model The cost model of the scheduler.
-   * \param measure_callbacks The measure callbacks of the scheduler.
-   * \param logging_func The tuning task's logging function.
+   * \param logger The tuning task's logging function.
    * \return The task scheduler created.
    */
-  TVM_DLL static TaskScheduler RoundRobin(Array<TuneContext> tasks,                            //
-                                          Builder builder,                                     //
-                                          Runner runner,                                       //
-                                          Optional<Database> database,                         //
-                                          Optional<CostModel> cost_model,                      //
-                                          Optional<Array<MeasureCallback>> measure_callbacks,  //
-                                          int max_trials,                                      //
-                                          PackedFunc logging_func);
+  TVM_DLL static TaskScheduler RoundRobin(PackedFunc logger);
   /*!
    * \brief Create a task scheduler that fetches tasks in a gradient based fashion.
-   * \param tasks The tasks to be tuned.
-   * \param task_weights The weights of each task.
-   * \param builder The builder of the scheduler.
-   * \param runner The runner of the scheduler.
-   * \param database The database of the scheduler.
-   * \param max_trials The maximum number of trials.
-   * \param cost_model The cost model of the scheduler.
-   * \param measure_callbacks The measure callbacks of the scheduler.
-   * \param logging_func The tuning task's logging function.
+   * \param logger The tuning task's logging function.
    * \param alpha The parameter alpha to control gradient computation.
    * \param window_size The parameter to control backward window size.
    * \param seed The random seed.
    * \return The task scheduler created.
    */
-  TVM_DLL static TaskScheduler GradientBased(Array<TuneContext> tasks,
-                                             Array<FloatImm> task_weights,                        //
-                                             Builder builder,                                     //
-                                             Runner runner,                                       //
-                                             Optional<Database> database,                         //
-                                             Optional<CostModel> cost_model,                      //
-                                             Optional<Array<MeasureCallback>> measure_callbacks,  //
-                                             int max_trials,                                      //
-                                             PackedFunc logging_func,                             //
-                                             double alpha,                                        //
-                                             int window_size,                                     //
+  TVM_DLL static TaskScheduler GradientBased(PackedFunc logger, double alpha, int window_size,
                                              support::LinearCongruentialEngine::TRandState seed);
   /*!
    * \brief Create a task scheduler with customized methods on the python-side.
-   * \param tasks The tasks to be tuned.
-   * \param builder The builder of the scheduler.
-   * \param runner The runner of the scheduler.
-   * \param database The database of the scheduler.
-   * \param max_trials The maximum number of trials.
-   * \param cost_model The cost model of the scheduler.
-   * \param measure_callbacks The measure callbacks of the scheduler.
-   * \param logging_func The tuning task's logging function.
-   * \param f_tune The packed function of `Tune`.
-   * \param f_initialize_task The packed function of `InitializeTask`.
-   * \param f_touch_task The packed function of `TouchTask`.
-   * \param f_join_running_task The packed function of `JoinRunningTask`.
+   * \param logger The tuning task's logging function.
    * \param f_next_task_id The packed function of `NextTaskId`.
+   * \param f_join_running_task The packed function of `JoinRunningTask`.
+   * \param f_tune The packed function of `Tune`.
    * \return The task scheduler created.
    */
   TVM_DLL static TaskScheduler PyTaskScheduler(
-      Array<TuneContext> tasks,                                   //
-      Builder builder,                                            //
-      Runner runner,                                              //
-      Optional<Database> database,                                //
-      Optional<CostModel> cost_model,                             //
-      Optional<Array<MeasureCallback>> measure_callbacks,         //
-      int max_trials,                                             //
-      PackedFunc logging_func,                                    //
-      PyTaskSchedulerNode::FTune f_tune,                          //
-      PyTaskSchedulerNode::FInitializeTask f_initialize_task,     //
-      PyTaskSchedulerNode::FTouchTask f_touch_task,               //
-      PyTaskSchedulerNode::FJoinRunningTask f_join_running_task,  //
-      PyTaskSchedulerNode::FNextTaskId f_next_task_id);
+      PackedFunc logger, PyTaskSchedulerNode::FNextTaskId f_next_task_id,
+      PyTaskSchedulerNode::FJoinRunningTask f_join_running_task, PyTaskSchedulerNode::FTune f_tune);
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(TaskScheduler, ObjectRef, TaskSchedulerNode);
 };
 
diff --git a/include/tvm/meta_schedule/tune_context.h b/include/tvm/meta_schedule/tune_context.h
index 4e2f00fb5a0c..15f3cba30b95 100644
--- a/include/tvm/meta_schedule/tune_context.h
+++ b/include/tvm/meta_schedule/tune_context.h
@@ -22,10 +22,7 @@
 #include <tvm/ir/expr.h>
 #include <tvm/ir/module.h>
 #include <tvm/meta_schedule/builder.h>
-#include <tvm/meta_schedule/mutator.h>
-#include <tvm/meta_schedule/postproc.h>
 #include <tvm/meta_schedule/runner.h>
-#include <tvm/meta_schedule/schedule_rule.h>
 #include <tvm/meta_schedule/search_strategy.h>
 #include <tvm/meta_schedule/space_generator.h>
 #include <tvm/node/reflection.h>
@@ -48,6 +45,8 @@ class TuneContext;
 /*! \brief The auto tuning context. */
 class TuneContextNode : public runtime::Object {
  public:
+  using TRandState = support::LinearCongruentialEngine::TRandState;
+
   /*! \brief The workload to be tuned. */
   Optional<IRModule> mod;
   /*! \brief The target to be tuned for. */
@@ -56,74 +55,35 @@ class TuneContextNode : public runtime::Object {
   Optional<SpaceGenerator> space_generator;
   /*! \brief The search strategy. */
   Optional<SearchStrategy> search_strategy;
-  /*! \brief The schedule rules. */
-  Array<ScheduleRule> sch_rules;
-  /*! \brief The postprocessors. */
-  Array<Postproc> postprocs;
-  /*! \brief The probability of using certain mutator. */
-  Map<Mutator, FloatImm> mutator_probs;
   /*! \brief The name of the tuning task. */
   Optional<String> task_name;
-  /*! \brief The tuning task's logging function. t*/
-  PackedFunc logging_func;
-  /*! \brief The random state. */
-  support::LinearCongruentialEngine::TRandState rand_state;
   /*! \brief The number of threads to be used. */
   int num_threads;
-
-  /*! \brief Whether the tuning task has been stopped or finished. */
-  bool is_terminated;  // TODO(@junrushao1994): move to TaskScheduler
-  /*! \brief The measure candidates. */
-  Optional<Array<MeasureCandidate>> measure_candidates;
-  /*! \brief The building results. */
-  Optional<Array<BuilderResult>> builder_results;
-  /*! \brief Packed functions to fetch the runner results asynchronously. */
-  Optional<Array<RunnerFuture>> runner_futures;
+  /*! \brief The random state. */
+  TRandState rand_state;
+  /*! \brief The tuning task's logging function. t*/
+  PackedFunc logger;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     v->Visit("mod", &mod);
     v->Visit("target", &target);
     v->Visit("space_generator", &space_generator);
     v->Visit("search_strategy", &search_strategy);
-    v->Visit("sch_rules", &sch_rules);
-    v->Visit("postprocs", &postprocs);
-    v->Visit("mutator_probs", &mutator_probs);
     v->Visit("task_name", &task_name);
-    // `logging_func` is not visited
-    v->Visit("rand_state", &rand_state);
     v->Visit("num_threads", &num_threads);
-    v->Visit("is_terminated", &is_terminated);
-    v->Visit("measure_candidates", &measure_candidates);
-    v->Visit("builder_results", &builder_results);
-    v->Visit("runner_futures", &runner_futures);
+    v->Visit("rand_state", &rand_state);
+    // `logger` is not visited
   }
-
-  /*! \brief Initialize members that needs initialization with tune context. */
+  /*!
+   * \brief Initialize members that needs initialization with tune context.
+   */
   void Initialize();
   /*!
    * \brief Clone the tune context.
    * \return The cloned tune context.
    */
   TuneContext Clone() const;
-  /*! \brief Set the measure candidates from the SearchStrategy */
-  void _SetMeasureCandidates(const Array<MeasureCandidate>& candidates);
-  /*!
-   * \brief Send the measure candidates to builder.
-   * \param builder The builder to send the candidates to.
-   */
-  void _SendToBuilder(const Builder& builder);
-  /*!
-   * \brief Send the built measure candidates to runner.
-   * \param runner The runner to send the candidates to.
-   */
-  void _SendToRunner(const Runner& runner);
-  /*!
-   * \brief Join the running tasks.
-   * \returns The results from the runner
-   */
-  Array<RunnerResult> _Join();
-  /*! \brief Set `measure_candidates`, `builder_results` and `runner_futures` to null. */
-  void _ClearMeasureState();
+
   static constexpr const char* _type_key = "meta_schedule.TuneContext";
   TVM_DECLARE_FINAL_OBJECT_INFO(TuneContextNode, Object);
 };
@@ -134,31 +94,22 @@ class TuneContextNode : public runtime::Object {
  */
 class TuneContext : public runtime::ObjectRef {
  public:
+  using TRandState = support::LinearCongruentialEngine::TRandState;
   /*!
    * \brief Constructor.
    * \param mod The workload to be tuned.
    * \param target The target to be tuned for.
    * \param space_generator The design space generator.
    * \param search_strategy The search strategy.
-   * \param sch_rules The schedule rules.
-   * \param postprocs The postprocessors.
-   * \param mutator_probs The probability of using certain mutator.
    * \param task_name The name of the tuning task.
-   * \param logging_func The tuning task's logging function.
-   * \param rand_state The random state.
    * \param num_threads The number of threads to be used.
+   * \param rand_state The random state.
+   * \param logger The tuning task's logging function.
    */
-  TVM_DLL explicit TuneContext(Optional<IRModule> mod,                                    //
-                               Optional<Target> target,                                   //
-                               Optional<SpaceGenerator> space_generator,                  //
-                               Optional<SearchStrategy> search_strategy,                  //
-                               Optional<Array<ScheduleRule>> sch_rules,                   //
-                               Optional<Array<Postproc>> postprocs,                       //
-                               Optional<Map<Mutator, FloatImm>> mutator_probs,            //
-                               Optional<String> task_name,                                //
-                               PackedFunc logging_func,                                   //
-                               support::LinearCongruentialEngine::TRandState rand_state,  //
-                               int num_threads);
+  TVM_DLL explicit TuneContext(Optional<IRModule> mod, Optional<Target> target,
+                               Optional<SpaceGenerator> space_generator,
+                               Optional<SearchStrategy> search_strategy, Optional<String> task_name,
+                               int num_threads, TRandState rand_state, PackedFunc logger);
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(TuneContext, ObjectRef, TuneContextNode);
 };
 
diff --git a/include/tvm/support/random_engine.h b/include/tvm/support/random_engine.h
index d9a8a583ce9c..109a98b3d14a 100644
--- a/include/tvm/support/random_engine.h
+++ b/include/tvm/support/random_engine.h
@@ -16,19 +16,16 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 /*!
  * \file random_engine.h
  * \brief Random number generator. It provides a generic interface consistent with
  * `std::uniform_random_bit_generator`
  */
-
 #ifndef TVM_SUPPORT_RANDOM_ENGINE_H_
 #define TVM_SUPPORT_RANDOM_ENGINE_H_
-
 #include <tvm/runtime/logging.h>
 
-#include <cstdint>  // for uint64_t
+#include <cstdint>
 #include <random>
 
 namespace tvm {
@@ -46,32 +43,18 @@ namespace support {
 
 class LinearCongruentialEngine {
  public:
-  /*!
-   * \brief The result type is defined as uint64_t here to avoid overflow.
-   * \note The type name is not in Google style because it is used in STL's distribution inferface.
-   */
-  using result_type = uint64_t;
   using TRandState = int64_t;
-
+  /*! \brief The result type. */
+  using result_type = uint64_t;
   /*! \brief The multiplier */
   static constexpr TRandState multiplier = 48271;
-
   /*! \brief The increment */
   static constexpr TRandState increment = 0;
-
   /*! \brief The modulus */
   static constexpr TRandState modulus = 2147483647;
-
-  /*!
-   * \brief The minimum possible value of random state here.
-   * \note The function name is uncapilized because it is used in STL's distribution inferface.
-   */
+  /*! \brief The minimum possible value of random state here. */
   static constexpr result_type min() { return 0; }
-
-  /*!
-   * \brief The maximum possible value of random state here.
-   * \note The function name is uncapilized because it is used in STL's distribution inferface.
-   */
+  /*! \brief The maximum possible value of random state here. */
   static constexpr result_type max() { return modulus - 1; }
 
   /*!
@@ -94,20 +77,32 @@ class LinearCongruentialEngine {
     (*rand_state_ptr_) = ((*rand_state_ptr_) * multiplier + increment) % modulus;
     return *rand_state_ptr_;
   }
-
   /*!
-   * \brief Change the start random state of RNG with the seed of a new random state value.
-   * \param rand_state The random state given in result_type.
+   * \brief Normalize the random seed to the range of [1, modulus - 1].
+   * \param rand_state The random seed.
+   * \return The normalized random seed.
    */
-  void Seed(TRandState rand_state) {
+  static TRandState NormalizeSeed(TRandState rand_state) {
     if (rand_state == -1) {
       rand_state = DeviceRandom();
-    } else if (rand_state == 0) {
+    } else {
+      rand_state %= modulus;
+    }
+    if (rand_state == 0) {
       rand_state = 1;
     }
-    ICHECK(rand_state >= 0) << "The random state should be nonnegative";
+    if (rand_state < 0) {
+      LOG(FATAL) << "ValueError: Random seed must be non-negative";
+    }
+    return rand_state;
+  }
+  /*!
+   * \brief Change the start random state of RNG with the seed of a new random state value.
+   * \param rand_state The random state given in result_type.
+   */
+  void Seed(TRandState rand_state) {
     ICHECK(rand_state_ptr_ != nullptr);
-    *rand_state_ptr_ = rand_state % modulus;
+    *rand_state_ptr_ = NormalizeSeed(rand_state);
   }
 
   /*!
diff --git a/python/tvm/contrib/torch/as_torch.py b/python/tvm/contrib/torch/as_torch.py
index a8cd895a6c5e..2412519ea9c5 100644
--- a/python/tvm/contrib/torch/as_torch.py
+++ b/python/tvm/contrib/torch/as_torch.py
@@ -24,15 +24,19 @@
 as_torch: a decorator, which is used to wrap the TVMScript code to `torch.nn.module`.
 """
 import tempfile
-from typing import Callable, List, Union
+from typing import Callable, List, Optional, Union
+
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 
 import torch
 import torch.utils.dlpack
-
 import tvm
-from tvm.meta_schedule.tune import TuneConfig, tune_tir
+from tvm import meta_schedule as ms
 from tvm.target.target import Target
-from tvm.tir.schedule.schedule import Schedule
+from tvm.tir import PrimFunc
 
 
 # python wrapper for OperatorModule
@@ -48,7 +52,24 @@ def __init__(
         self.rt_module = None  # runtime module
         self.ir_module = module  # IR modules
 
-    def tune(self, config: TuneConfig = None, target: Union[str, Target] = None):
+    def tune(
+        self,
+        target: Union[str, Target] = "cpu",
+        max_trials_global: int = 32,
+        *,
+        num_trials_per_iter: int = 32,
+        builder: ms.Builder.BuilderType = "local",
+        runner: ms.Runner.RunnerType = "local",
+        database: ms.Database.DatabaseType = "json",
+        cost_model: ms.CostModel.CostModelType = "xgb",
+        measure_callbacks: ms.MeasureCallback.CallbackListType = "default",
+        task_scheduler: ms.TaskScheduler.TaskSchedulerType = "round-robin",
+        space: ms.SpaceGenerator.SpaceGeneratorType = "post-order-apply",
+        strategy: ms.SearchStrategy.SearchStrategyType = "replay_trace",
+        task_name: str = "main",
+        num_threads: Union[Literal["physical", "logical"], int] = "physical",
+        seed: Optional[int] = None,
+    ) -> None:
         """
         Tune the TVMScript code.
 
@@ -60,23 +81,29 @@ def tune(self, config: TuneConfig = None, target: Union[str, Target] = None):
         target : Optional[str, Target]
             The target to tune for.
         """
-        if config is None:
-            config = TuneConfig(
-                # Default setting
-                strategy="replay_trace",
-                num_trials_per_iter=32,
-                max_trials_per_task=32,
-                max_trials_global=32,
-            )
-        if target is None:
-            target = Target("llvm --num-cores=16")
+        if target == "cpu":
+            target = f"llvm --num-cores {ms.utils.cpu_count(logical=False)}"
+
         with tempfile.TemporaryDirectory() as work_dir:
-            sch: Schedule = tune_tir(
+            database = ms.tir_integration.tune_tir(
                 mod=self.ir_module,
                 target=target,
-                config=config,
                 work_dir=work_dir,
+                max_trials_global=max_trials_global,
+                num_trials_per_iter=num_trials_per_iter,
+                builder=builder,
+                runner=runner,
+                database=database,
+                cost_model=cost_model,
+                measure_callbacks=measure_callbacks,
+                task_scheduler=task_scheduler,
+                space=space,
+                strategy=strategy,
+                task_name=task_name,
+                num_threads=num_threads,
+                seed=seed,
             )
+            sch = ms.tir_integration.compile_tir(database, self.ir_module, target)
             self.ir_module = sch.mod
             self.build(target)
 
@@ -117,11 +144,11 @@ def as_torch(func: Union[tvm.ir.module.IRModule, tvm.tir.function.PrimFunc, Call
         which is the subclass of the original nn.Module.
 
     """
-    if isinstance(func, (tvm.ir.module.IRModule, tvm.tir.function.PrimFunc)):
+    if isinstance(func, (tvm.ir.module.IRModule, PrimFunc)):
         return OperatorModuleWrapper(func)
-    if isinstance(func, Callable):
+    if callable(func):
 
-        def func_get_param(*args, **kargs):
-            return OperatorModuleWrapper(func(*args, **kargs))
+        def func_get_param(*args, **kwargs):
+            return OperatorModuleWrapper(func(*args, **kwargs))
 
         return func_get_param
diff --git a/python/tvm/contrib/torch/optimize_torch.py b/python/tvm/contrib/torch/optimize_torch.py
index 821a3b1f71d5..347ea89f92ee 100644
--- a/python/tvm/contrib/torch/optimize_torch.py
+++ b/python/tvm/contrib/torch/optimize_torch.py
@@ -28,28 +28,17 @@
 import base64
 import contextlib
 import tempfile
-from typing import Dict, Optional, Tuple, Union
-import warnings
+from typing import Optional, Tuple, Union
 
 import torch
 import torch.utils.dlpack
-
 import tvm
+from tvm import meta_schedule as ms
 from tvm import relay
 from tvm._ffi import get_global_func, register_func
-from tvm.ir.module import IRModule
-from tvm.ir.transform import PassContext
-from tvm.meta_schedule import TuneConfig, default_config
-from tvm.meta_schedule.relay_integration import extract_task_from_relay
-from tvm.meta_schedule.tune import tune_extracted_tasks
-from tvm.meta_schedule.utils import autotvm_silencer
-from tvm.runtime import vm
-from tvm.runtime.module import Module
-from tvm.runtime.ndarray import NDArray
-from tvm.target.target import Target
-
-
-# The python wrapper for GraphExecutorFactory
+from tvm.target import Target
+
+
 class GraphExecutorFactoryWrapper(torch.nn.Module):
     def __init__(self, module: tvm.runtime.Module):
         super().__init__()
@@ -62,75 +51,32 @@ def forward(self, *torch_inputs: Tuple[torch.Tensor]):
         return ret
 
 
-def llvm_target():
-    return "llvm -num-cores"
-
-
 @register_func("script_torch.save_to_base64")
 def save_to_base64(obj) -> bytes:
     with tempfile.NamedTemporaryFile(suffix=".so") as tmpfile:
         obj.export_library(tmpfile.name)
-        with open(tmpfile.name, "rb") as tfile:
-            return base64.b64encode(tfile.read())
-
-
-def tune_relay_auto(
-    mod: IRModule,
-    target: Union[str, Target],
-    config: TuneConfig,
-    work_dir: str,
-    backend: str = "graph",
-    params: Optional[Dict[str, NDArray]] = None,
-) -> Union[Module, vm.Executable]:
-    """A wrapper of `tune_relay` but provide a default setting for the config.
-
-    Parameters
-    ----------
-    mod : IRModule
-        The module to tune.
-    target : Union[str, Target]
-        The target to tune for.
-    config : TuneConfig
-        The search strategy config.
-    params : Optional[Dict[str, tvm.runtime.NDArray]]
-        The associated parameters of the program
-    work_dir : Optional[str]
-        The working directory to save intermediate results.
-    backend : str = "graph"
-        The backend to use for relay compilation(graph / vm).
-
-    Returns
-    -------
-    lib : Union[Module, tvm.runtime.vm.Executable]
-        The built runtime module or vm Executable for the given relay workload.
-    """
-    target = default_config.target(target)
-    extracted_tasks = extract_task_from_relay(mod, target, params)
-    if config is None:
-        config = TuneConfig(
-            num_trials_per_iter=16,
-            max_trials_global=16 * len(extracted_tasks),
-        )
-    database = tune_extracted_tasks(extracted_tasks, config, work_dir)
-    relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend]
-    with target, autotvm_silencer(), database:
-        with PassContext(
-            opt_level=3,
-            config={
-                "relay.backend.use_meta_schedule": True,
-                "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda",
-                "relay.backend.tir_converter": "default",
-            },
-        ):
-            return relay_build(mod, target=target, params=params)
+        with open(tmpfile.name, "rb") as temp_file:
+            return base64.b64encode(temp_file.read())
 
 
 def optimize_torch(
     func,
     example_inputs,
-    tuning_config=None,
-    target=None,
+    *,
+    max_trials_global: int,
     work_dir=None,
+    target: Union[str, Target] = "cpu",
+    max_trials_per_task: Optional[int] = None,
+    num_trials_per_iter: int = 64,
+    builder: ms.Builder.BuilderType = "local",
+    runner: ms.Runner.RunnerType = "local",
+    database: ms.Database.DatabaseType = "json",
+    cost_model: ms.CostModel.CostModelType = "xgb",
+    measure_callbacks: ms.MeasureCallback.CallbackListType = "default",
+    task_scheduler: ms.TaskScheduler.TaskSchedulerType = "gradient",
+    space: ms.SpaceGenerator.SpaceGeneratorType = "post-order-apply",
+    strategy: ms.SearchStrategy.SearchStrategyType = "evolutionary",
+    seed: Optional[int] = None,
 ):
     """Load PyTorch model that could be traced by TorchScript, then optimize it via MetaSchedule.
 
@@ -139,22 +85,37 @@ def optimize_torch(
     func : callable or torch.nn.Module
         A Python function or nn.Module that could run by TorchScript's trace.
         (ie: torch.jit.trace(model, input))
-
     example_inputs : tuple or torch.Tensor
         Inputs to `torch.jit.trace`.
-
-    tuning_config : tvm.meta_schedule.TuneConfig
-        The configuration for tuning by MetaSchedule.
-        If user doesn't set the config, the tuning will run with a default setting.
-        Here, the total number of trials is proportional
-        to the number of tunable tasks in the input module.
-
+    max_trials_global : int
+        The maximum number of trials to run globally.
+    work_dir : Optional[str]
+        The working directory to save intermediate results.
     target : Optional[Union[str, Target]]
         The target of the compilation.
         If user doesn't set the target, the module will be built for the CPU target.
-
-    work_dir : Optional[str]
-        The working directory to save intermediate results.
+    max_trials_per_task : Optional[int]
+        The maximum number of trials to run per task.
+    num_trials_per_iter : int
+        The number of trials to run per iteration
+    builder : Builder.BuilderType
+        The builder.
+    runner : Runner.RunnerType
+        The runner.
+    database : Database.DatabaseType
+        The database.
+    cost_model : CostModel.CostModelType
+        The cost model.
+    measure_callbacks : MeasureCallback.CallbackListType
+        The measure callbacks.
+    task_scheduler : TaskScheduler.TaskSchedulerType
+        The task scheduler.
+    space : SpaceGenerator.SpaceGeneratorType
+        The space generator to use.
+    strategy : SearchStrategy.SearchStrategyType
+        The search strategy to use.
+    seed : Optional[int]
+        The random seed to use.
 
     Returns
     -------
@@ -163,33 +124,47 @@ def optimize_torch(
         which is the subclass of the original nn.Module.
     """
 
-    if target is None:
-        target = llvm_target()
-
-    if tuning_config is None:
-        warning_msg = (
-            "Using the default tuning parameters.",
-            "The default number of trials is set to a small value to let tuning finish quickly.",
-            "For optimal performance, it is recommended to provide",
-            "the `tuning_config` argument with a bigger number of trials.",
-        )
-        warnings.warn(" ".join(warning_msg), stacklevel=2)
+    if target == "cpu":
+        target = f"llvm --num-cores {ms.utils.cpu_count(logical=False)}"
+    if not isinstance(target, Target):
+        target = Target(target)
 
     # If `func` is already a traced module this statement makes no effect
     jit_mod = torch.jit.trace(func, example_inputs)
-
     if isinstance(example_inputs, torch.Tensor):
         example_inputs = [example_inputs]
-
     shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
     mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)  # IRmodule
+
     if work_dir:
         context_manager = contextlib.nullcontext(work_dir)
     else:
         context_manager = tempfile.TemporaryDirectory()
-    with context_manager as work_dir_path:
-        executor_factory = tune_relay_auto(
-            mod=mod, params=params, config=tuning_config, target=target, work_dir=work_dir_path
+    with context_manager as work_dir:  # pylint: disable=redefined-argument-from-local
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            params=params,
+            target=target,
+            work_dir=work_dir,
+            max_trials_global=max_trials_global,
+            max_trials_per_task=max_trials_per_task,
+            num_trials_per_iter=num_trials_per_iter,
+            builder=builder,
+            runner=runner,
+            database=database,
+            cost_model=cost_model,
+            measure_callbacks=measure_callbacks,
+            task_scheduler=task_scheduler,
+            space=space,
+            strategy=strategy,
+            seed=seed,
+        )
+        executor_factory = ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            target=target,
+            params=params,
+            backend="graph",
         )
 
     save_runtime_mod = get_global_func("tvmtorch.save_runtime_mod")
diff --git a/python/tvm/meta_schedule/__init__.py b/python/tvm/meta_schedule/__init__.py
index cf348d49f4e2..c92ed47d8a2a 100644
--- a/python/tvm/meta_schedule/__init__.py
+++ b/python/tvm/meta_schedule/__init__.py
@@ -20,24 +20,35 @@
     builder,
     cost_model,
     database,
-    default_config,
     feature_extractor,
     measure_callback,
     mutator,
     postproc,
+    relay_integration,
     runner,
     schedule_rule,
     search_strategy,
     space_generator,
+    tir_integration,
 )
+from .builder import Builder
+from .cost_model import CostModel
+from .database import Database
 from .extracted_task import ExtractedTask
+from .feature_extractor import FeatureExtractor
+from .measure_callback import MeasureCallback
+from .mutator import Mutator
+from .postproc import Postproc
 from .profiler import Profiler
 from .relay_integration import (
-    extract_task_from_relay,
     is_meta_schedule_dispatch_enabled,
     is_meta_schedule_enabled,
 )
-from .search_strategy import MeasureCandidate
-from .tune import TuneConfig, tune_extracted_tasks, tune_relay, tune_te, tune_tir
+from .runner import Runner
+from .schedule_rule import ScheduleRule
+from .search_strategy import MeasureCandidate, SearchStrategy
+from .space_generator import SpaceGenerator
+from .tir_integration import tune_tir
+from .tune import tune_tasks
 from .tune_context import TuneContext
 from .utils import derived_object
diff --git a/python/tvm/meta_schedule/builder/builder.py b/python/tvm/meta_schedule/builder/builder.py
index a2254f243380..fcab906e6207 100644
--- a/python/tvm/meta_schedule/builder/builder.py
+++ b/python/tvm/meta_schedule/builder/builder.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Meta Schedule builders that translate IRModule to runtime.Module, and then export"""
-from typing import Callable, Dict, List, Optional
+from typing import Callable, Dict, List, Optional, Union
 
 # isort: off
 from typing_extensions import Literal
@@ -112,6 +112,8 @@ def __init__(
 class Builder(Object):
     """The abstract builder interface."""
 
+    BuilderType = Union["Builder", Literal["local"]]
+
     def build(self, build_inputs: List[BuilderInput]) -> List[BuilderResult]:
         """Build the given inputs.
 
@@ -126,6 +128,33 @@ def build(self, build_inputs: List[BuilderInput]) -> List[BuilderResult]:
         """
         return _ffi_api.BuilderBuild(self, build_inputs)  # type: ignore # pylint: disable=no-member
 
+    @staticmethod
+    def create(  # pylint: disable=keyword-arg-before-vararg
+        kind: Literal["local"] = "local",
+        *args,
+        **kwargs,
+    ) -> "Builder":
+        """Create a Builder.
+
+        Parameters
+        ----------
+        kind : Literal["local"]
+            The kind of the builder. For now, only "local" is supported.
+
+        Returns
+        -------
+        builder : Builder
+            The builder created.
+        """
+        from . import LocalBuilder  # pylint: disable=import-outside-toplevel
+
+        if kind == "local":
+            return LocalBuilder(*args, **kwargs)  # type: ignore
+        raise ValueError(f"Unknown Builder: {kind}")
+
+
+create = Builder.create  # pylint: disable=invalid-name
+
 
 @register_object("meta_schedule.PyBuilder")
 class _PyBuilder(Builder):
@@ -168,16 +197,3 @@ def build(self, build_inputs: List[BuilderInput]) -> List[BuilderResult]:
             The results of building the given inputs.
         """
         raise NotImplementedError
-
-
-def create(  # pylint: disable=keyword-arg-before-vararg
-    kind: Literal["local"] = "local",
-    *args,
-    **kwargs,
-) -> Builder:
-    """Create a Builder."""
-    from . import LocalBuilder  # pylint: disable=import-outside-toplevel
-
-    if kind == "local":
-        return LocalBuilder(*args, **kwargs)  # type: ignore
-    raise ValueError(f"Unknown Builder: {kind}")
diff --git a/python/tvm/meta_schedule/builder/local_builder.py b/python/tvm/meta_schedule/builder/local_builder.py
index e81ccfe808ff..6e282d8cb62d 100644
--- a/python/tvm/meta_schedule/builder/local_builder.py
+++ b/python/tvm/meta_schedule/builder/local_builder.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """Local builder that compile on the local host"""
-import logging
 import os
 import tempfile
 from typing import Callable, Dict, List, Optional, Union
@@ -26,10 +25,11 @@
 from tvm.target import Target
 
 from ...contrib.popen_pool import MapResult, PopenPoolExecutor, StatusKind
+from ..logging import get_logger
 from ..utils import cpu_count, derived_object, get_global_func_with_default_on_worker
 from .builder import BuilderInput, BuilderResult, PyBuilder
 
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+logger = get_logger(__name__)  # pylint: disable=invalid-name
 
 
 T_BUILD = Callable[  # pylint: disable=invalid-name
@@ -137,7 +137,7 @@ def __init__(
         super().__init__()
 
         if max_workers is None:
-            max_workers = cpu_count(logical=False)
+            max_workers = cpu_count(logical=True)
         logger.info("LocalBuilder: max_workers = %d", max_workers)
 
         self.max_workers = max_workers
diff --git a/python/tvm/meta_schedule/cost_model/cost_model.py b/python/tvm/meta_schedule/cost_model/cost_model.py
index d3b660d837dd..54a4d7a34391 100644
--- a/python/tvm/meta_schedule/cost_model/cost_model.py
+++ b/python/tvm/meta_schedule/cost_model/cost_model.py
@@ -16,23 +16,30 @@
 # under the License.
 """Meta Schedule CostModel."""
 import ctypes
-from typing import Callable, List
+from typing import Callable, List, Union
+
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 
 import numpy as np  # type: ignore
 from tvm._ffi import register_object
-from tvm.meta_schedule.utils import _get_default_str
 from tvm.runtime import Object
 
 from .. import _ffi_api
 from ..runner import RunnerResult
 from ..search_strategy import MeasureCandidate
 from ..tune_context import TuneContext
+from ..utils import _get_default_str
 
 
 @register_object("meta_schedule.CostModel")
 class CostModel(Object):
     """Cost model."""
 
+    CostModelType = Union["CostModel", Literal["xgb", "mlp", "random"]]
+
     def load(self, path: str) -> None:
         """Load the cost model from given file location.
 
@@ -97,6 +104,41 @@ def predict(self, context: TuneContext, candidates: List[MeasureCandidate]) -> n
         )
         return results
 
+    @staticmethod
+    def create(
+        kind: Literal["xgb", "mlp", "random"],
+        *args,
+        **kwargs,
+    ) -> "CostModel":
+        """Create a CostModel.
+
+        Parameters
+        ----------
+        kind : Literal["xgb", "mlp", "random"]
+            The kind of the cost model. Can be "xgb", "mlp", or "random".
+
+        Returns
+        -------
+        cost_model : CostModel
+            The created cost model.
+        """
+        from . import RandomModel, XGBModel  # pylint: disable=import-outside-toplevel
+
+        if kind == "xgb":
+            return XGBModel(*args, **kwargs)  # type: ignore
+        if kind == "random":
+            return RandomModel(*args, **kwargs)  # type: ignore
+        if kind == "mlp":
+            from .mlp_model import (  # type: ignore  # pylint: disable=import-outside-toplevel
+                MLPModel,
+            )
+
+            return MLPModel(*args, **kwargs)  # type: ignore
+        raise ValueError(f"Unknown CostModel: {kind}")
+
+
+create = CostModel.create  # pylint: disable=invalid-name
+
 
 @register_object("meta_schedule.PyCostModel")
 class _PyCostModel(CostModel):
diff --git a/python/tvm/meta_schedule/cost_model/mlp_model.py b/python/tvm/meta_schedule/cost_model/mlp_model.py
index e7f07f0a4542..8bd050b689bf 100644
--- a/python/tvm/meta_schedule/cost_model/mlp_model.py
+++ b/python/tvm/meta_schedule/cost_model/mlp_model.py
@@ -19,7 +19,6 @@
 Segment Sum MLP cost model
 """
 import glob
-import logging
 import math
 import os
 import random
@@ -38,14 +37,13 @@
 from ..cost_model import PyCostModel
 from ..database import JSONDatabase
 from ..feature_extractor import FeatureExtractor, PerStoreFeature
+from ..logging import get_logger
 from ..runner import RunnerResult
 from ..search_strategy import MeasureCandidate
 from ..tune_context import TuneContext
 from ..utils import derived_object, shash2hex
 
-logging.basicConfig()
-logger = logging.getLogger("mlp_model")  # pylint: disable=invalid-name
-logger.setLevel(logging.INFO)
+logger = get_logger("mlp_model")  # pylint: disable=invalid-name
 
 # pylint: disable=no-member,import-outside-toplevel
 
diff --git a/python/tvm/meta_schedule/cost_model/random_model.py b/python/tvm/meta_schedule/cost_model/random_model.py
index bc178f76ac90..19516bee0d4f 100644
--- a/python/tvm/meta_schedule/cost_model/random_model.py
+++ b/python/tvm/meta_schedule/cost_model/random_model.py
@@ -19,12 +19,11 @@
 """
 from typing import List, Optional, Tuple, Union
 
-from tvm.meta_schedule.utils import derived_object  # type: ignore
-
 from ..cost_model import PyCostModel
 from ..runner import RunnerResult
 from ..search_strategy import MeasureCandidate
 from ..tune_context import TuneContext
+from ..utils import derived_object  # type: ignore
 
 
 @derived_object
diff --git a/python/tvm/meta_schedule/cost_model/xgb_model.py b/python/tvm/meta_schedule/cost_model/xgb_model.py
index 59774b534e55..0a2786c6abe0 100644
--- a/python/tvm/meta_schedule/cost_model/xgb_model.py
+++ b/python/tvm/meta_schedule/cost_model/xgb_model.py
@@ -14,15 +14,12 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""
-XGBoost-based cost model
-"""
-import logging
+"""XGBoost-based cost model"""
 import os
 import tempfile
 from collections import OrderedDict
 from itertools import chain as itertools_chain
-from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple, Callable
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, Optional, Tuple
 
 import numpy as np  # type: ignore
 
@@ -30,21 +27,20 @@
 from ...runtime import NDArray
 from ..cost_model import PyCostModel
 from ..feature_extractor import FeatureExtractor
+from ..logging import get_logger
 from ..runner import RunnerResult
 from ..search_strategy import MeasureCandidate
 from ..utils import cpu_count, derived_object, shash2hex
 from .metric import max_curve
 
-
 if TYPE_CHECKING:
-
     import xgboost as xgb  # type: ignore
     from xgboost.callback import TrainingCallback  # type: ignore
 
     from ..tune_context import TuneContext
 
 
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+logger = get_logger(__name__)  # pylint: disable=invalid-name
 
 
 def make_metric_sorter(focused_metric):
@@ -302,7 +298,7 @@ class XGBModel(PyCostModel):
     average_peak_n : int
         The number to calculate average peak score.
     adaptive_training : bool
-        Whether use adpative training to reduce tuning time.
+        Whether use adaptive training to reduce tuning time.
     """
 
     # feature extractor
@@ -327,7 +323,7 @@ def __init__(
         self,
         *,
         # feature extractor
-        extractor: FeatureExtractor,
+        extractor: FeatureExtractor.FeatureExtractorType = "per-store-feature",
         # xgboost model config
         config: XGBConfig = XGBConfig(),
         # random result before enough samples
@@ -339,6 +335,8 @@ def __init__(
         adaptive_training: bool = True,
     ):
         super().__init__()
+        if not isinstance(extractor, FeatureExtractor):
+            extractor = FeatureExtractor.create(extractor)
         # feature extractor
         self.extractor = extractor
         # model-related
@@ -652,7 +650,7 @@ def _get_custom_call_back(
     """Get a customized callback function for XGBoost. Work around xgboost import."""
 
     def optional_xgboost_callback(cls):
-        """Decorator for importing TraningCallback from xgboost"""
+        """Decorator for importing TrainingCallback from xgboost"""
         # pylint:disable = import-outside-toplevel
         try:
             from xgboost.callback import TrainingCallback  # type: ignore
@@ -696,7 +694,7 @@ def __call__(self, env: "xgb.core.CallbackEnv"):
             return self.after_iteration(env.model, env.iteration, env.evaluation_result_list)
 
         def init(self, model: "xgb.Booster"):
-            """Internal function for intialization"""
+            """Internal function for initialization"""
             booster: "xgb.Booster" = model
             self.state["best_iteration"] = 0
             self.state["best_score"] = float("inf")
diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py
index 75b78b118eea..e21ce29ed699 100644
--- a/python/tvm/meta_schedule/database/database.py
+++ b/python/tvm/meta_schedule/database/database.py
@@ -164,6 +164,8 @@ def from_json(json_obj: Any, workload: Workload) -> "TuningRecord":
 class Database(Object):
     """The abstract database interface."""
 
+    DatabaseType = Union["Database", Literal["json", "memory"]]
+
     def has_workload(self, mod: IRModule) -> bool:
         """Check if the database has the given workload.
         Parameters
@@ -361,6 +363,56 @@ def current() -> Optional["Database"]:
         """Get the current database under scope."""
         return _ffi_api.DatabaseCurrent()  # type: ignore # pylint: disable=no-member
 
+    @staticmethod
+    def create(  # pylint: disable=keyword-arg-before-vararg
+        kind: Union[
+            Literal[
+                "json",
+                "memory",
+                "union",
+                "ordered_union",
+            ],
+            Callable[[Schedule], bool],
+        ] = "json",
+        *args,
+        **kwargs,
+    ) -> "Database":
+        """Create a Database.
+
+        Parameters
+        ----------
+        kind : str = "json" | "memory" | "union" | "ordered_union" | Callable[[Schedule], bool]
+            The kind of the database to be created. The following kinds are supported:
+            "json", "memory", "union", "ordered_union", and a custom schedule function.
+
+        Returns
+        -------
+        database : Database
+            The created database.
+        """
+        from . import (  # pylint: disable=import-outside-toplevel
+            JSONDatabase,
+            MemoryDatabase,
+            OrderedUnionDatabase,
+            ScheduleFnDatabase,
+            UnionDatabase,
+        )
+
+        if callable(kind):
+            return ScheduleFnDatabase(kind, *args, **kwargs)  # type: ignore
+        if kind == "json":
+            return JSONDatabase(*args, **kwargs)
+        if kind == "memory":
+            return MemoryDatabase(*args, **kwargs)  # type: ignore
+        if kind == "union":
+            return UnionDatabase(*args, **kwargs)  # type: ignore
+        if kind == "ordered_union":
+            return OrderedUnionDatabase(*args, **kwargs)  # type: ignore
+        raise ValueError(f"Unknown Database: {kind}")
+
+
+create = Database.create  # pylint: disable=invalid-name
+
 
 @register_object("meta_schedule.PyDatabase")
 class _PyDatabase(Database):
@@ -568,38 +620,3 @@ def __len__(self) -> int:
             The number of records in the database
         """
         raise NotImplementedError
-
-
-def create(  # pylint: disable=keyword-arg-before-vararg
-    kind: Union[
-        Literal[
-            "json",
-            "memory",
-            "union",
-            "ordered_union",
-        ],
-        Callable[[Schedule], bool],
-    ] = "json",
-    *args,
-    **kwargs,
-) -> Database:
-    """Create a Database."""
-    from . import (  # pylint: disable=import-outside-toplevel
-        JSONDatabase,
-        MemoryDatabase,
-        OrderedUnionDatabase,
-        ScheduleFnDatabase,
-        UnionDatabase,
-    )
-
-    if callable(kind):
-        return ScheduleFnDatabase(kind, *args, **kwargs)  # type: ignore
-    if kind == "json":
-        return JSONDatabase(*args, **kwargs)
-    if kind == "memory":
-        return MemoryDatabase(*args, **kwargs)  # type: ignore
-    if kind == "union":
-        return UnionDatabase(*args, **kwargs)  # type: ignore
-    if kind == "ordered_union":
-        return OrderedUnionDatabase(*args, **kwargs)  # type: ignore
-    raise ValueError(f"Unknown Database: {kind}")
diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py
deleted file mode 100644
index c701fd6568e0..000000000000
--- a/python/tvm/meta_schedule/default_config.py
+++ /dev/null
@@ -1,454 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-outside-toplevel
-"""Pre-configured Defaults for MetaSchedule search rules"""
-import logging
-from os import path as osp
-from typing import Callable, Dict, List, Optional, Union
-
-from tvm.ir import IRModule
-from tvm.target import Target
-from tvm.tir import PrimFunc
-
-from .builder import Builder, LocalBuilder
-from .cost_model import CostModel, XGBModel
-from .database import Database, JSONDatabase
-from .feature_extractor import PerStoreFeature
-from .measure_callback import MeasureCallback
-from .mutator import Mutator
-from .postproc import Postproc
-from .runner import LocalRunner, Runner
-from .schedule_rule import ScheduleRule
-from .space_generator import PostOrderApply, SpaceGenerator
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-
-FnSpaceGenerator = Callable[[], SpaceGenerator]
-FnScheduleRule = Callable[[], List[ScheduleRule]]
-FnPostproc = Callable[[], List[Postproc]]
-FnMutatorProb = Callable[[], Dict[Mutator, float]]
-
-
-def mod(mod: Union[PrimFunc, IRModule]) -> IRModule:  # pylint: disable=redefined-outer-name
-    """Normalize the input to an IRModule"""
-    if isinstance(mod, PrimFunc):
-        mod = mod.with_attr("global_symbol", "main")
-        mod = mod.with_attr("tir.noalias", True)
-        mod = IRModule({"main": mod})
-    if not isinstance(mod, IRModule):
-        raise TypeError(f"Expected `mod` to be PrimFunc or IRModule, but gets: {mod}")
-    func_names = mod.get_global_vars()
-    (func_name,) = func_names
-    if len(func_names) == 1 and func_name.name_hint != "main":
-        mod = IRModule({"main": mod[func_name]})
-    return mod
-
-
-def target(target: Union[str, Target]) -> Target:  # pylint: disable=redefined-outer-name
-    """Normalize the input to tvm.target.Target"""
-    if isinstance(target, str):
-        target = Target(target)
-    if not isinstance(target, Target):
-        raise TypeError(f"Expected `target` to be str or Target, but gets: {target}")
-    return target
-
-
-def builder(builder: Optional[Builder]) -> Builder:  # pylint: disable=redefined-outer-name
-    """Normalize the input to tvm.meta_schedule.Builder"""
-    if builder is None:
-        builder = LocalBuilder()  # type: ignore
-    if not isinstance(builder, Builder):
-        raise TypeError(f"Expected `builder` to be Builder, but gets: {builder}")
-    return builder
-
-
-def runner(runner: Optional[Runner]) -> Runner:  # pylint: disable=redefined-outer-name
-    """Normalize the input to tvm.meta_schedule.Runner"""
-    if runner is None:
-        runner = LocalRunner()  # type: ignore
-    if not isinstance(runner, Runner):
-        raise TypeError(f"Expected `runner` to be Runner, but gets: {runner}")
-    return runner
-
-
-def database(
-    database: Union[None, Database],  # pylint: disable=redefined-outer-name
-    path: str,
-) -> Database:
-    """Normalize the input to tvm.meta_schedule.Database"""
-    if database is None:
-        path_workload = osp.join(path, "database_workload.json")
-        path_tuning_record = osp.join(path, "database_tuning_record.json")
-        logger.info(
-            "Creating JSONDatabase. Workload at: %s. Tuning records at: %s",
-            path_workload,
-            path_tuning_record,
-        )
-        database = JSONDatabase(
-            path_workload=path_workload,
-            path_tuning_record=path_tuning_record,
-        )
-    if not isinstance(database, Database):
-        raise TypeError(f"Expected `database` to be Database, but gets: {database}")
-    return database
-
-
-def callbacks(  # pylint: disable=redefined-outer-name
-    measure_callbacks: Optional[List[MeasureCallback]],
-) -> List[MeasureCallback]:
-    """Normalize the input to List[tvm.meta_schedule.MeasureCallback]"""
-    if measure_callbacks is None:
-        from tvm.meta_schedule import measure_callback as M
-
-        return [
-            M.AddToDatabase(),
-            M.RemoveBuildArtifact(),
-            M.EchoStatistics(),
-            M.UpdateCostModel(),
-        ]
-    if not isinstance(measure_callbacks, (list, tuple)):
-        raise TypeError(
-            f"Expected `measure_callbacks` to be List[MeasureCallback], "
-            f"but gets: {measure_callbacks}"
-        )
-    measure_callbacks = list(measure_callbacks)
-    for i, callback in enumerate(measure_callbacks):
-        if not isinstance(callback, MeasureCallback):
-            raise TypeError(
-                f"Expected `measure_callbacks` to be List[MeasureCallback], "
-                f"but measure_callbacks[{i}] is: {callback}"
-            )
-    return measure_callbacks
-
-
-def cost_model(
-    cost_model: Optional[CostModel],  # pylint: disable=redefined-outer-name
-    adpative_training: Optional[bool],
-) -> CostModel:
-    """Normalize the input to tvm.meta_schedule.CostModel"""
-    if cost_model is None:
-        return XGBModel(  # type: ignore
-            extractor=PerStoreFeature(),
-            adaptive_training=adpative_training is None or adpative_training,
-        )
-    if not isinstance(cost_model, CostModel):
-        raise TypeError(f"Expected `cost_model` to be CostModel, but gets: {cost_model}")
-    return cost_model
-
-
-def space_generator(
-    space_generator: Optional[FnSpaceGenerator],  # pylint: disable=redefined-outer-name
-) -> SpaceGenerator:
-    """Normalize the input to tvm.meta_schedule.SpaceGenerator"""
-    if space_generator is None:
-        return PostOrderApply()
-    if callable(space_generator):
-        space_generator = space_generator()
-    if not isinstance(space_generator, SpaceGenerator):
-        raise TypeError(
-            f"Expected `space_generator` to return SpaceGenerator, " f"but gets: {space_generator}"
-        )
-    return space_generator
-
-
-def schedule_rules(  # pylint: disable=redefined-outer-name
-    sch_rules: Optional[FnScheduleRule],
-    target: Target,
-) -> List[ScheduleRule]:
-    """Normalize the input to List[tvm.meta_schedule.ScheduleRule]"""
-    if callable(sch_rules):
-        return sch_rules()
-    if sch_rules is not None:
-        raise TypeError(f"Expected `sch_rules` to be None or callable, but gets: {sch_rules}")
-    if target.kind.name == "llvm":
-        return _DefaultLLVM.schedule_rules()
-    if target.kind.name in ["cuda", "rocm", "vulkan"]:
-        return _DefaultCUDA.schedule_rules()
-    if target.kind.name == "hexagon":
-        return _DefaultHexagon.schedule_rules()
-    raise ValueError(f"Unsupported target: {target}")
-
-
-def postproc(  # pylint: disable=redefined-outer-name
-    postproc: Optional[FnPostproc],
-    target: Target,
-) -> List[Postproc]:
-    """Normalize the input to List[tvm.meta_schedule.Postproc]"""
-    if callable(postproc):
-        return postproc()
-    if postproc is not None:
-        raise TypeError(f"Expected `postproc` to be None or callable, but gets: {postproc}")
-    if target.kind.name == "llvm":
-        return _DefaultLLVM.postprocs()
-    if target.kind.name in ["cuda", "rocm", "vulkan"]:
-        return _DefaultCUDA.postprocs()
-    if target.kind.name == "hexagon":
-        return _DefaultHexagon.postprocs()
-    raise ValueError(f"Unsupported target: {target}")
-
-
-def mutator_probs(  # pylint: disable=redefined-outer-name
-    mutator_probs: Optional[FnMutatorProb],
-    target: Target,
-) -> Dict[Mutator, float]:
-    """Normalize the input to Dict[tvm.meta_schedule.Mutator, float]"""
-    if callable(mutator_probs):
-        return mutator_probs()
-    if mutator_probs is not None:
-        raise TypeError(
-            f"Expected `mutator_probs` to be None or callable, but gets: {mutator_probs}"
-        )
-    if target.kind.name in ["llvm", "hexagon"]:
-        return _DefaultLLVM.mutator_probs()
-    if target.kind.name in ["cuda", "rocm", "vulkan"]:
-        return _DefaultCUDA.mutator_probs()
-    raise ValueError(f"Unsupported target: {target}")
-
-
-class _DefaultLLVM:
-    """Default tuning configuration for LLVM."""
-
-    @staticmethod
-    def schedule_rules() -> List[ScheduleRule]:
-        from tvm.meta_schedule import schedule_rule as M
-
-        return [
-            M.AutoInline(
-                into_producer=False,
-                into_consumer=True,
-                inline_const_tensor=True,
-                disallow_if_then_else=True,
-                require_injective=True,
-                require_ordered=True,
-                disallow_op=["tir.exp"],
-            ),
-            M.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64),
-            M.MultiLevelTiling(
-                structure="SSRSRS",
-                tile_binds=None,
-                max_innermost_factor=64,
-                vector_load_lens=None,
-                reuse_read=None,
-                reuse_write=M.ReuseType(
-                    req="may",
-                    levels=[1, 2],
-                    scope="global",
-                ),
-            ),
-            M.ParallelizeVectorizeUnroll(
-                max_jobs_per_core=16,
-                max_vectorize_extent=64,
-                unroll_max_steps=[0, 16, 64, 512],
-                unroll_explicit=True,
-            ),
-            M.RandomComputeLocation(),
-        ]
-
-    @staticmethod
-    def postprocs() -> List[Postproc]:
-        from tvm.meta_schedule import postproc as M
-
-        return [
-            M.DisallowDynamicLoop(),
-            M.RewriteParallelVectorizeUnroll(),
-            M.RewriteReductionBlock(),
-            M.RewriteLayout(),
-        ]
-
-    @staticmethod
-    def mutator_probs() -> Dict[Mutator, float]:
-        from tvm.meta_schedule import mutator as M
-
-        return {
-            M.MutateTileSize(): 0.9,
-            M.MutateComputeLocation(): 0.05,
-            M.MutateUnroll(): 0.03,
-            M.MutateParallel(max_jobs_per_core=16): 0.02,
-        }
-
-
-class _DefaultHexagon:
-    """Default tuning configuration for Hexagon."""
-
-    @staticmethod
-    def schedule_rules() -> List[ScheduleRule]:
-        from tvm.meta_schedule import schedule_rule as M
-
-        return [
-            M.AutoInline(
-                into_producer=False,
-                into_consumer=True,
-                inline_const_tensor=True,
-                disallow_if_then_else=True,
-                require_injective=True,
-                require_ordered=True,
-                disallow_op=["tir.exp"],
-            ),
-            M.MultiLevelTilingWideVector(
-                structure="SRSRS",
-                vector_length_in_bits=1024,
-                max_innermost_factor=128,
-                reuse_read=None,
-                reuse_write=M.ReuseType(
-                    req="may",
-                    levels=[1, 2],
-                    scope="global",
-                ),
-            ),
-            M.ParallelizeVectorizeUnroll(
-                max_jobs_per_core=16,
-                max_vectorize_extent=128,
-                unroll_max_steps=[0, 16, 64, 512],
-                unroll_explicit=True,
-            ),
-        ]
-
-    @staticmethod
-    def postprocs() -> List[Postproc]:
-        from tvm.meta_schedule import postproc as M
-
-        return [
-            M.DisallowDynamicLoop(),
-            M.RewriteParallelVectorizeUnroll(),
-            M.RewriteReductionBlock(),
-            # TODO(masahi): Fix RewriteLayout for link-params=True case
-            # M.RewriteLayout(),
-        ]
-
-
-class _DefaultCUDA:
-    """Default tuning configuration for CUDA."""
-
-    @staticmethod
-    def schedule_rules() -> List[ScheduleRule]:
-        from tvm.meta_schedule import schedule_rule as M
-
-        return [
-            M.MultiLevelTiling(
-                structure="SSSRRSRS",
-                tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"],
-                max_innermost_factor=64,
-                vector_load_lens=[1, 2, 3, 4, 8, 16],
-                reuse_read=M.ReuseType(
-                    req="must",
-                    levels=[4],
-                    scope="shared",
-                ),
-                reuse_write=M.ReuseType(
-                    req="must",
-                    levels=[3],
-                    scope="local",
-                ),
-            ),
-            M.AutoInline(
-                into_producer=True,
-                into_consumer=True,
-                inline_const_tensor=True,
-                disallow_if_then_else=False,
-                require_injective=False,
-                require_ordered=False,
-                disallow_op=None,
-            ),
-            M.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]),
-            M.ParallelizeVectorizeUnroll(
-                max_jobs_per_core=-1,  # disable parallelize
-                max_vectorize_extent=-1,  # disable vectorize
-                unroll_max_steps=[0, 16, 64, 512, 1024],
-                unroll_explicit=True,
-            ),
-            M.AutoBind(
-                max_threadblocks=256,
-                thread_extents=[32, 64, 128, 256, 512, 1024],
-            ),
-        ]
-
-    @staticmethod
-    def postprocs() -> List[Postproc]:
-        from tvm.meta_schedule import postproc as M
-
-        return [
-            M.DisallowDynamicLoop(),
-            M.RewriteCooperativeFetch(),
-            M.RewriteUnboundBlock(),
-            M.RewriteParallelVectorizeUnroll(),
-            M.RewriteReductionBlock(),
-            M.VerifyGPUCode(),
-        ]
-
-    @staticmethod
-    def mutator_probs() -> Dict[Mutator, float]:
-        from tvm.meta_schedule import mutator as M
-
-        return {
-            M.MutateTileSize(): 0.9,
-            M.MutateUnroll(): 0.08,
-            M.MutateThreadBinding(): 0.02,
-        }
-
-
-class _DefaultCUDATensorCore:
-    """Default tuning configuration for CUDA TensorCore."""
-
-    @staticmethod
-    def schedule_rules():
-        from tvm.meta_schedule import schedule_rule as M
-        from tvm.tir.tensor_intrin.cuda import get_wmma_intrin_group
-
-        return [
-            M.MultiLevelTilingTensorCore(
-                intrin_groups=[
-                    get_wmma_intrin_group(
-                        store_scope="shared",
-                        in_dtype=in_dtype,
-                        out_dtype=out_dtype,
-                        trans_b=trans_b,
-                    )
-                    for (in_dtype, out_dtype) in [("float16", "float16"), ("int8", "int32")]
-                    for trans_b in [False, True]
-                ],
-                structure="SSSRRSRS",
-                tile_binds=["blockIdx.y", "blockIdx.x", "threadIdx.y"],
-                max_innermost_factor=4,
-                vector_load_lens=[1, 2, 3, 4, 8, 16],
-                reuse_read=M.ReuseType(req="must", levels=[4], scope="shared"),
-                reuse_write=M.ReuseType(
-                    req="must",
-                    levels=[2],
-                    scope="shared",
-                ),
-                use_software_pipeline=False,
-            ),
-            *_DefaultCUDA.schedule_rules(),
-        ]
-
-    @staticmethod
-    def postprocs() -> List[Postproc]:
-        from tvm.meta_schedule import postproc as M
-
-        return [
-            M.DisallowDynamicLoop(),
-            M.RewriteCooperativeFetch(),
-            M.RewriteUnboundBlock(),
-            M.RewriteParallelVectorizeUnroll(),
-            M.RewriteReductionBlock(),
-            M.RewriteTensorize(),
-            M.VerifyGPUCode(),
-        ]
-
-    @staticmethod
-    def mutator_probs() -> Dict[Mutator, float]:
-        return _DefaultCUDA.mutator_probs()
diff --git a/python/tvm/meta_schedule/feature_extractor/feature_extractor.py b/python/tvm/meta_schedule/feature_extractor/feature_extractor.py
index 04064b1cce35..c14c97e0f526 100644
--- a/python/tvm/meta_schedule/feature_extractor/feature_extractor.py
+++ b/python/tvm/meta_schedule/feature_extractor/feature_extractor.py
@@ -15,22 +15,29 @@
 # specific language governing permissions and limitations
 # under the License.
 """Meta Schedule FeatureExtractor."""
-from typing import Callable, List
+from typing import Callable, List, Union
+
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 
 from tvm._ffi import register_object
 from tvm.runtime import Object
 from tvm.runtime.ndarray import NDArray
 
 from .. import _ffi_api
-from ..utils import _get_default_str
-from ..tune_context import TuneContext
 from ..search_strategy import MeasureCandidate
+from ..tune_context import TuneContext
+from ..utils import _get_default_str
 
 
 @register_object("meta_schedule.FeatureExtractor")
 class FeatureExtractor(Object):
     """Extractor for features from measure candidates for use in cost model."""
 
+    FeatureExtractorType = Union[Literal["per-store-feature"], "FeatureExtractor"]
+
     def extract_from(
         self, context: TuneContext, candidates: List[MeasureCandidate]
     ) -> List[NDArray]:
@@ -53,6 +60,19 @@ def extract_from(
         )
         return result
 
+    @staticmethod
+    def create(
+        kind: Literal["per-store-feature"],
+        *args,
+        **kwargs,
+    ) -> "FeatureExtractor":
+        """Create a CostModel."""
+        from . import PerStoreFeature  # pylint: disable=import-outside-toplevel
+
+        if kind == "per-store-feature":
+            return PerStoreFeature(*args, **kwargs)  # type: ignore
+        raise ValueError(f"Unknown CostModel: {kind}")
+
 
 @register_object("meta_schedule.PyFeatureExtractor")
 class _PyFeatureExtractor(FeatureExtractor):
diff --git a/python/tvm/meta_schedule/feature_extractor/random_feature_extractor.py b/python/tvm/meta_schedule/feature_extractor/random_feature_extractor.py
index d805648bfbfd..18b84c364ad4 100644
--- a/python/tvm/meta_schedule/feature_extractor/random_feature_extractor.py
+++ b/python/tvm/meta_schedule/feature_extractor/random_feature_extractor.py
@@ -15,16 +15,18 @@
 # specific language governing permissions and limitations
 # under the License.
 """Random Feature Extractor."""
-from typing import List, Union, Tuple
+from typing import List, Tuple, Union
 
 import numpy as np  # type: ignore
 from tvm.runtime.ndarray import NDArray, array
 
-from ..tune_context import TuneContext
-from ..search_strategy import MeasureCandidate
 from ..feature_extractor import PyFeatureExtractor
+from ..search_strategy import MeasureCandidate
+from ..tune_context import TuneContext
+from ..utils import derived_object
 
 
+@derived_object
 class RandomFeatureExtractor(PyFeatureExtractor):
     """Random Feature Extractor
 
diff --git a/python/tvm/meta_schedule/logging.py b/python/tvm/meta_schedule/logging.py
new file mode 100644
index 000000000000..9d673266a3f2
--- /dev/null
+++ b/python/tvm/meta_schedule/logging.py
@@ -0,0 +1,259 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Logging interface in MetaSchedule"""
+import logging
+import logging.config
+import os
+import os.path as osp
+from logging import Logger
+from typing import Any, Callable, Dict, List, Optional
+
+
+def get_logger(name: str) -> Logger:
+    """Create or get a logger by its name. This is essentially a wrapper of python's native logger.
+
+    Parameters
+    ----------
+    name : str
+        The name of the logger.
+
+    Returns
+    -------
+    logger : Logger
+        The logger instance.
+    """
+    return logging.getLogger(name)
+
+
+def get_logging_func(logger: Logger) -> Optional[Callable[[int, str], None]]:
+    """Get the logging function.
+
+    Parameters
+    ----------
+    logger : Logger
+        The logger instance.
+    Returns
+    -------
+    result : Optional[Callable]
+        The function to do the specified level of logging.
+    """
+    if logger is None:
+        return None
+
+    level2log = {
+        logging.DEBUG: logger.debug,
+        logging.INFO: logger.info,
+        logging.WARNING: logger.warning,
+        logging.ERROR: logger.error,
+        # logging.FATAL not included
+    }
+
+    def logging_func(level: int, msg: str):
+        if level < 0:
+            from IPython.display import (  # type: ignore # pylint: disable=import-outside-toplevel
+                clear_output,
+            )
+
+            clear_output(wait=True)
+        else:
+            level2log[level](msg)
+
+    return logging_func
+
+
+def create_loggers(
+    log_dir: str,
+    params: List[Dict[str, Any]],
+    logger_config: Optional[Dict[str, Any]] = None,
+    disable_existing_loggers: bool = False,
+):
+    """Create loggers from configuration"""
+    if logger_config is None:
+        config = {}
+    else:
+        config = logger_config
+
+    config.setdefault("loggers", {})
+    config.setdefault("handlers", {})
+    config.setdefault("formatters", {})
+
+    global_logger_name = "tvm.meta_schedule"
+    global_logger = logging.getLogger(global_logger_name)
+    if global_logger.level is logging.NOTSET:
+        global_logger.setLevel(logging.INFO)
+
+    config["loggers"].setdefault(
+        global_logger_name,
+        {
+            "level": logging._levelToName[global_logger.level],  # pylint: disable=protected-access
+            "handlers": [handler.get_name() for handler in global_logger.handlers]
+            + [global_logger_name + ".console", global_logger_name + ".file"],
+            "propagate": False,
+        },
+    )
+    config["loggers"].setdefault(
+        "{logger_name}",
+        {
+            "level": "INFO",
+            "handlers": [
+                "{logger_name}.file",
+            ],
+            "propagate": False,
+        },
+    )
+    config["handlers"].setdefault(
+        global_logger_name + ".console",
+        {
+            "class": "logging.StreamHandler",
+            "stream": "ext://sys.stdout",
+            "formatter": "tvm.meta_schedule.standard_formatter",
+        },
+    )
+    config["handlers"].setdefault(
+        global_logger_name + ".file",
+        {
+            "class": "logging.FileHandler",
+            "filename": "{log_dir}/" + __name__ + ".task_scheduler.log",
+            "mode": "a",
+            "level": "INFO",
+            "formatter": "tvm.meta_schedule.standard_formatter",
+        },
+    )
+    config["handlers"].setdefault(
+        "{logger_name}.file",
+        {
+            "class": "logging.FileHandler",
+            "filename": "{log_dir}/{logger_name}.log",
+            "mode": "a",
+            "level": "INFO",
+            "formatter": "tvm.meta_schedule.standard_formatter",
+        },
+    )
+    config["formatters"].setdefault(
+        "tvm.meta_schedule.standard_formatter",
+        {
+            "format": "%(asctime)s.%(msecs)03d %(levelname)s %(message)s",
+            "datefmt": "%Y-%m-%d %H:%M:%S",
+        },
+    )
+
+    # set up dictConfig loggers
+    p_config = {"version": 1, "disable_existing_loggers": disable_existing_loggers}
+    for k, v in config.items():
+        if k in ["formatters", "handlers", "loggers"]:
+            p_config[k] = _batch_parameterize_config(v, params)  # type: ignore
+        else:
+            p_config[k] = v
+    logging.config.dictConfig(p_config)
+
+    # check global logger
+    if global_logger.level not in [logging.DEBUG, logging.INFO]:
+        global_logger.warning(
+            "Logging level set to %s, please set to logging.INFO"
+            " or logging.DEBUG to view full log.",
+            logging._levelToName[global_logger.level],  # pylint: disable=protected-access
+        )
+    global_logger.info("Logging directory: %s", log_dir)
+
+
+def _batch_parameterize_config(
+    config: Dict[str, Any],
+    params: List[Dict[str, str]],
+) -> Dict[str, Any]:
+    """Parameterize the given configuration with multiple parameters sets.
+
+    Parameters
+    ----------
+    config : Dict[str, Any]
+        The given config dict.
+    Params : List[Dict[str, str]]
+        List of the given multiple parameters sets.
+
+    Returns
+    -------
+    result : Dict[str, Any]
+        The parameterized configuration.
+    """
+    results = {}
+    for name, cfg in config.items():
+        for p in params:
+            p_name = name.format(**p)
+            if p_name not in results:
+                p_cfg = _parameterize_config(cfg, p)
+                results[p_name] = p_cfg
+    return results
+
+
+def _parameterize_config(
+    config: Dict[str, Any],
+    params: Dict[str, str],
+) -> Dict[str, Any]:
+    """Parameterize the given configuration.
+
+    Parameters
+    ----------
+    config : Dict[str, Any]
+        The given config dict.
+    Params : Dict[str, str]
+        The given parameters.
+
+    Returns
+    -------
+    result : Dict[str, Any]
+        The parameterized configuration.
+    """
+    result = {}
+    for k, v in config.items():
+        if isinstance(k, str):
+            k = k.format(**params)
+        if isinstance(v, str):
+            v = v.format(**params)
+        elif isinstance(v, dict):
+            v = _parameterize_config(v, params)
+        elif isinstance(v, list):
+            v = [t.format(**params) for t in v]
+        result[k] = v
+    return result
+
+
+def get_loggers_from_work_dir(
+    work_dir: str,
+    task_names: List[str],
+) -> List[Logger]:
+    """Create loggers from work directory
+
+    Parameters
+    ----------
+    work_dir : str
+        The work directory.
+    task_names : List[str]
+        The list of task names.
+
+    Returns
+    -------
+    loggers : List[Logger]
+        The list of loggers.
+    """
+    log_dir = osp.join(work_dir, "logs")
+    os.makedirs(log_dir, exist_ok=True)
+    pattern = __name__ + ".task_{i:0" + f"{len(str(len(task_names) - 1))}" + "d}_{name}"
+    loggers = [pattern.format(i=i, name=name) for i, name in enumerate(task_names)]
+    create_loggers(
+        log_dir=log_dir,
+        params=[{"log_dir": log_dir, "logger_name": logger} for logger in loggers],
+    )
+    return [get_logger(logger) for logger in loggers]
diff --git a/python/tvm/meta_schedule/measure_callback/__init__.py b/python/tvm/meta_schedule/measure_callback/__init__.py
index f697e7733e7e..f43aee7d875c 100644
--- a/python/tvm/meta_schedule/measure_callback/__init__.py
+++ b/python/tvm/meta_schedule/measure_callback/__init__.py
@@ -14,11 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""
-The tvm.meta_schedule.measure_callback package.
-"""
-from .measure_callback import MeasureCallback, PyMeasureCallback
+"""The tvm.meta_schedule.measure_callback package."""
 from .add_to_database import AddToDatabase
-from .echo_statistics import EchoStatistics
+from .measure_callback import MeasureCallback, PyMeasureCallback
 from .remove_build_artifact import RemoveBuildArtifact
 from .update_cost_model import UpdateCostModel
diff --git a/python/tvm/meta_schedule/measure_callback/echo_statistics.py b/python/tvm/meta_schedule/measure_callback/echo_statistics.py
deleted file mode 100644
index 867409f88174..000000000000
--- a/python/tvm/meta_schedule/measure_callback/echo_statistics.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""A callback that echos the statistics of the tuning process to the console"""
-from tvm._ffi import register_object
-
-from .. import _ffi_api
-from .measure_callback import MeasureCallback
-
-
-@register_object("meta_schedule.EchoStatistics")
-class EchoStatistics(MeasureCallback):
-    def __init__(self) -> None:
-        """A callback that echos the statistics of the tuning process to the console"""
-        self.__init_handle_by_constructor__(
-            _ffi_api.MeasureCallbackEchoStatistics,  # type: ignore # pylint: disable=no-member
-        )
diff --git a/python/tvm/meta_schedule/measure_callback/measure_callback.py b/python/tvm/meta_schedule/measure_callback/measure_callback.py
index d9e412ed5605..d4a10c1e4009 100644
--- a/python/tvm/meta_schedule/measure_callback/measure_callback.py
+++ b/python/tvm/meta_schedule/measure_callback/measure_callback.py
@@ -16,7 +16,12 @@
 # under the License.
 """Meta Schedule MeasureCallback."""
 
-from typing import Callable, List, TYPE_CHECKING
+from typing import TYPE_CHECKING, Callable, List, Union
+
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 
 from tvm._ffi import register_object
 from tvm.runtime import Object
@@ -35,6 +40,8 @@
 class MeasureCallback(Object):
     """Rules to apply after measure results is available."""
 
+    CallbackListType = Union[List["MeasureCallback"], "MeasureCallback", Literal["default"]]
+
     def apply(
         self,
         task_scheduler: "TaskScheduler",
@@ -67,6 +74,13 @@ def apply(
             runner_results,
         )
 
+    @staticmethod
+    def create(kind: Literal["default"]) -> List["MeasureCallback"]:
+        """Create a list of measure callbacks."""
+        if kind == "default":
+            return _ffi_api.MeasureCallbackDefault()  # type: ignore # pylint: disable=no-member
+        raise ValueError(f"Unknown kind of MeasureCallback list: {kind}")
+
 
 @register_object("meta_schedule.PyMeasureCallback")
 class _PyMeasureCallback(MeasureCallback):
diff --git a/python/tvm/meta_schedule/mutator/mutator.py b/python/tvm/meta_schedule/mutator/mutator.py
index c5286aced7d8..188cb30c5b69 100644
--- a/python/tvm/meta_schedule/mutator/mutator.py
+++ b/python/tvm/meta_schedule/mutator/mutator.py
@@ -15,7 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 """Meta Schedule Mutator."""
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import TYPE_CHECKING, Callable, Dict, Optional
+
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 
 from tvm._ffi import register_object
 from tvm.runtime import Object
@@ -68,6 +73,43 @@ def clone(self) -> "Mutator":
         """
         return _ffi_api.MutatorClone(self)  # type: ignore # pylint: disable=no-member
 
+    @staticmethod
+    def create(
+        kind: Literal[
+            "llvm",
+            "cuda",
+            "cuda-tensorcore",
+            "hexagon",
+        ]
+    ) -> Dict["Mutator", float]:
+        """Create a list of default mutators.
+
+        Parameters
+        ----------
+        kind : Literal["llvm", "cuda", "cuda-tensorcore", "hexagon"]
+            The kind of mutators.
+
+        Returns
+        -------
+        mutators : List[Mutator]
+            The list of mutators.
+        """
+        funcs = {
+            # pylint: disable=no-member
+            "llvm": _ffi_api.MutatorDefaultLLVM,  # type: ignore
+            "cuda": _ffi_api.MutatorDefaultCUDA,  # type: ignore
+            "cuda-tensorcore": _ffi_api.MutatorDefaultCUDATensorCore,  # type: ignore
+            "hexagon": _ffi_api.MutatorDefaultHexagon,  # type: ignore
+            # pylint: enable=no-member
+        }
+        for k, v in funcs.items():
+            if k == kind:
+                return v()
+        raise ValueError(f"Unsupported kind {kind} for mutator creation.")
+
+
+create = Mutator.create  # pylint: disable=invalid-name
+
 
 @register_object("meta_schedule.PyMutator")
 class _PyMutator(Mutator):
diff --git a/python/tvm/meta_schedule/postproc/postproc.py b/python/tvm/meta_schedule/postproc/postproc.py
index 6eec2965ceeb..67a0d27e8261 100644
--- a/python/tvm/meta_schedule/postproc/postproc.py
+++ b/python/tvm/meta_schedule/postproc/postproc.py
@@ -15,8 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 """Meta Schedule Postproc."""
+from typing import TYPE_CHECKING, Callable, List
 
-from typing import TYPE_CHECKING, Callable
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 
 from tvm._ffi import register_object
 from tvm.runtime import Object
@@ -70,6 +74,36 @@ def clone(self) -> "Postproc":
         """
         return _ffi_api.PostprocClone(self)  # type: ignore # pylint: disable=no-member
 
+    @staticmethod
+    def create(kind: Literal["llvm", "cuda", "cuda-tensorcore", "hexagon"]) -> List["Postproc"]:
+        """Create a list of default postprocessors.
+
+        Parameters
+        ----------
+        kind : Literal["llvm", "cuda", "cuda-tensorcore", "hexagon"]
+            The kind of the postprocessors.
+
+        Returns
+        -------
+        postprocs : List[Mutator]
+            The list of postprocessors.
+        """
+        funcs = {
+            # pylint: disable=no-member
+            "llvm": _ffi_api.PostprocDefaultLLVM,  # type: ignore
+            "cuda": _ffi_api.PostprocDefaultCUDA,  # type: ignore
+            "cuda-tensorcore": _ffi_api.PostprocDefaultCUDATensorCore,  # type: ignore
+            "hexagon": _ffi_api.PostprocDefaultHexagon,  # type: ignore
+            # pylint: enable=no-member
+        }
+        for k, v in funcs.items():
+            if k == kind:
+                return v()
+        raise ValueError(f"Unsupported kind {kind} for postproc creation.")
+
+
+create = Postproc.create  # pylint: disable=invalid-name
+
 
 @register_object("meta_schedule.PyPostproc")
 class _PyPostproc(Postproc):
diff --git a/python/tvm/meta_schedule/profiler.py b/python/tvm/meta_schedule/profiler.py
index 206c2429d802..7446578a38d7 100644
--- a/python/tvm/meta_schedule/profiler.py
+++ b/python/tvm/meta_schedule/profiler.py
@@ -15,8 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """A context manager that profiles tuning time cost for different parts."""
-
-import logging
 from contextlib import contextmanager
 from typing import Dict, Optional
 
@@ -25,8 +23,6 @@
 
 from . import _ffi_api
 
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-
 
 @register_object("meta_schedule.Profiler")
 class Profiler(Object):
diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py
index 24009ab07fcf..af992dd4bc8b 100644
--- a/python/tvm/meta_schedule/relay_integration.py
+++ b/python/tvm/meta_schedule/relay_integration.py
@@ -15,8 +15,14 @@
 # specific language governing permissions and limitations
 # under the License.
 """MetaSchedule-Relay integration"""
-from typing import Any, Dict, List, Optional
+from contextlib import contextmanager
+from types import MappingProxyType
+from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union
 
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 import numpy as np  # type: ignore
 from tvm import nd
 from tvm._ffi import get_global_func
@@ -24,19 +30,88 @@
 from tvm.runtime import NDArray
 from tvm.target import Target
 
+from .builder import Builder
+from .cost_model import CostModel
+from .database import Database
 from .extracted_task import ExtractedTask
-from .utils import autotvm_silencer
+from .logging import get_loggers_from_work_dir
+from .measure_callback import MeasureCallback
+from .profiler import Profiler
+from .runner import Runner
+from .search_strategy import SearchStrategy
+from .space_generator import SpaceGenerator
+from .task_scheduler import TaskScheduler
+from .tune import tune_tasks
+from .tune_context import TuneContext
+from .utils import fork_seed
+
+if TYPE_CHECKING:
+    from tvm import relay
+
+_extract_task = get_global_func(  # pylint: disable=invalid-name
+    "relay.backend.MetaScheduleExtractTask",
+    allow_missing=True,
+)
+
+
+@contextmanager
+def _autotvm_silencer():
+    """A context manager that silences autotvm warnings."""
+    from tvm import autotvm  # pylint: disable=import-outside-toplevel
+
+    silent = autotvm.GLOBAL_SCOPE.silent
+    autotvm.GLOBAL_SCOPE.silent = True
+    try:
+        yield
+    finally:
+        autotvm.GLOBAL_SCOPE.silent = silent
 
 
-def extract_task_from_relay(
+def _normalize_params(
     mod: IRModule,
-    target: Target,
-    params: Optional[Dict[str, NDArray]] = None,
+    target: Union[Target, str],
+    params: Optional[Dict[str, NDArray]],
+    pass_config: Mapping[str, Any],
+    executor: Optional["relay.backend.Executor"],
+) -> Tuple[
+    IRModule,
+    Target,
+    Dict[str, NDArray],
+    Dict[str, Any],
+    Optional["relay.backend.Executor"],
+]:
+    from tvm import relay  # pylint: disable=import-outside-toplevel
+
+    if isinstance(mod, relay.Function):
+        mod = IRModule.from_expr(mod)
+    if not isinstance(target, Target):
+        target = Target(target)
+    if params is None:
+        params = {}
+    relay_params = {}
+    for name, param in params.items():
+        if isinstance(param, np.ndarray):
+            param = nd.array(param)
+        relay_params[name] = param
+    if executor is not None:
+        mod = mod.with_attr("executor", executor)
+    pass_config = dict(pass_config)
+    return mod, target, relay_params, pass_config, executor
+
+
+def extract_tasks(
+    mod: IRModule,
+    target: Union[Target, str],
+    params: Optional[Dict[str, NDArray]],
     *,
     opt_level: int = 3,
-    pass_config: Optional[Dict[str, Any]] = None,
-    disabled_pass: Optional[List[str]] = None,
-    tir_converter: str = "default",
+    pass_config: Mapping[str, Any] = MappingProxyType(
+        {
+            "relay.backend.use_meta_schedule": True,
+            "relay.backend.tir_converter": "default",
+        }
+    ),
+    executor: Optional["relay.backend.Executor"] = None,
 ) -> List[ExtractedTask]:
     """Extract tuning tasks from a relay program.
 
@@ -49,18 +124,11 @@ def extract_task_from_relay(
     params : Optional[Dict[str, tvm.runtime.NDArray]]
         The associated parameters of the program
     opt_level : int
-        The optimization level of the compiler
-    pass_config : Optional[Dict[str, Any]]
-        The pass config of the compiler
-    disabled_pass : Optional[List[str]]
-        The list of disabled passes of the compiler
-    tir_converter : str
-        The filter function to filter out the extracted tasks. Builtin filters:
-          - "default"
-          - "allow_extern"
-        The converter is a PackedFunc registered as f"relay.backend.tir_converter.{tir_converter}",
-        with the signature below:
-            (args: List[te.Tensor], constants: List[NDArray]) -> Optional[tir.PrimFunc]
+        The optimization level of the compilation
+    pass_config : Mapping[str, Any]
+        The pass configuration
+    executor : Optional[relay.backend.Executor]
+        The executor to use
 
     Returns
     -------
@@ -69,47 +137,229 @@ def extract_task_from_relay(
     """
     # pylint: disable=import-outside-toplevel
     from tvm import autotvm
-    from tvm.relay import Function as RelayFunc
 
     # pylint: enable=import-outside-toplevel
+    mod, target, params, pass_config, _ = _normalize_params(
+        mod, target, params, pass_config, executor
+    )
+    if target.kind.name != "cuda" and isinstance(
+        autotvm.DispatchContext.current, autotvm.FallbackContext
+    ):
+        tophub_context = autotvm.tophub.context(target)
+    else:
+        tophub_context = autotvm.utils.EmptyContext()
+    with Profiler.timeit("TaskExtraction"):
+        with target, _autotvm_silencer(), tophub_context:
+            with transform.PassContext(
+                opt_level=opt_level,
+                config=pass_config,
+            ):
+                return list(_extract_task(mod, target, params))
+
+
+def extracted_tasks_to_tune_contexts(
+    extracted_tasks: List[ExtractedTask],
+    work_dir: str,
+    space: SpaceGenerator.SpaceGeneratorType = "post-order-apply",
+    strategy: SearchStrategy.SearchStrategyType = "evolutionary",
+    num_threads: Union[Literal["physical", "logical"], int] = "physical",
+    seed: Optional[int] = None,
+) -> Tuple[List[TuneContext], List[float]]:
+    """Convert ExtractedTask to TuneContext.
+
+    Parameters
+    ----------
+    tasks : List[ExtractedTask]
+        The tasks to be converted
+    work_dir : str
+        The working directory to store logs and databases
+    space : SpaceGenerator.SpaceGeneratorType
+        The space generator to use.
+    strategy : SearchStrategy.SearchStrategyType
+        The search strategy to use.
+    num_threads : Union[Literal["physical", "logical"], int]
+        The number of threads to use in multi-threaded search algorithm.
+    seed : Optional[int]
+        The random seed to use.
+
+    Returns
+    -------
+    tasks : List[TuneContext]
+        The converted tasks
+    task_weights : List[float]
+        The weights of the tasks
+    """
+    tasks: List[TuneContext] = []
+    task_weights: List[float] = []
+    for task, logger, rand_state in zip(
+        extracted_tasks,
+        get_loggers_from_work_dir(work_dir, [t.task_name for t in extracted_tasks]),
+        fork_seed(seed, n=len(extracted_tasks)),
+    ):
+        tasks.append(
+            TuneContext(
+                mod=task.dispatched[0],
+                target=task.target,
+                space_generator=space,
+                search_strategy=strategy,
+                task_name=task.task_name,
+                logger=logger,
+                rand_state=rand_state,
+                num_threads=num_threads,
+            ).clone()
+        )
+        task_weights.append(task.weight)
+    return tasks, task_weights
+
+
+def tune_relay(
+    mod: IRModule,
+    params: Dict[str, NDArray],
+    target: Union[str, Target],
+    work_dir: str,
+    max_trials_global: int,
+    *,
+    max_trials_per_task: Optional[int] = None,
+    num_trials_per_iter: int = 64,
+    builder: Builder.BuilderType = "local",
+    runner: Runner.RunnerType = "local",
+    database: Database.DatabaseType = "json",
+    cost_model: CostModel.CostModelType = "xgb",
+    measure_callbacks: MeasureCallback.CallbackListType = "default",
+    task_scheduler: TaskScheduler.TaskSchedulerType = "gradient",
+    space: SpaceGenerator.SpaceGeneratorType = "post-order-apply",
+    strategy: SearchStrategy.SearchStrategyType = "evolutionary",
+    seed: Optional[int] = None,
+) -> Database:
+    """Tune a Relay program.
+
+    Parameters
+    ----------
+    mod : Union[IRModule, tir.PrimFunc]
+        The module or function to tune
+    params : Optional[Dict[str, tvm.runtime.NDArray]]
+        The associated parameters of the program
+    target : Union[Target, str]
+        The compilation target
+    work_dir : str
+        The working directory to store the tuning records
+    max_trials_global : int
+        The maximum number of trials to run
+    max_trials_per_task : Optional[int]
+        The maximum number of trials to run for each task
+    num_trials_per_iter : int
+        The number of trials to run per iteration
+    builder : BuilderType
+        The builder to use
+    runner : RunnerType
+        The runner to use
+    database : DatabaseType
+        The database to use
+    cost_model : CostModelType
+        The cost model to use
+    measure_callbacks : CallbackListType
+        The measure callbacks to use
+    task_scheduler : TaskSchedulerType
+        The task scheduler to use
+    space : SpaceGeneratorType
+        The space generator to use
+    strategy : SearchStrategyType
+        The search strategy to use
+    seed : Optional[int]
+        The random seed
 
-    extract_task_func = get_global_func(
-        "relay.backend.MetaScheduleExtractTask",
-        allow_missing=False,
+    Returns
+    -------
+    database : Database
+        The database that contains the tuning records
+    """
+    tasks, task_weights = extracted_tasks_to_tune_contexts(
+        extracted_tasks=extract_tasks(mod, target, params),
+        work_dir=work_dir,
+        space=space,
+        strategy=strategy,
+        seed=seed,
+    )
+    return tune_tasks(
+        tasks=tasks,
+        task_weights=task_weights,
+        work_dir=work_dir,
+        max_trials_global=max_trials_global,
+        max_trials_per_task=max_trials_per_task,
+        num_trials_per_iter=num_trials_per_iter,
+        builder=builder,
+        runner=runner,
+        database=database,
+        cost_model=cost_model,
+        measure_callbacks=measure_callbacks,
+        task_scheduler=task_scheduler,
     )
 
-    if isinstance(mod, RelayFunc):
-        mod = IRModule.from_expr(mod)
-    if not isinstance(target, Target):
-        target = Target(target)
-    if disabled_pass is None:
-        disabled_pass = []
-    if pass_config is None:
-        pass_config = {
+
+def compile_relay(
+    database: Database,
+    mod: IRModule,
+    target: Union[Target, str],
+    params: Optional[Dict[str, NDArray]],
+    *,
+    backend: Literal["graph", "vm"] = "graph",
+    opt_level: int = 3,
+    pass_config: Mapping[str, Any] = MappingProxyType(
+        {
             "relay.backend.use_meta_schedule": True,
-            "relay.backend.tir_converter": tir_converter,
+            "relay.backend.tir_converter": "default",
         }
-    if params is None:
-        params = {}
-    relay_params = {}
-    for name, param in params.items():
-        if isinstance(param, np.ndarray):
-            param = nd.array(param)
-        relay_params[name] = param
+    ),
+    executor: Optional["relay.backend.Executor"] = None,
+):
+    """Compile a relay program with a MetaSchedule database.
 
-    with target, autotvm_silencer(), transform.PassContext(
-        opt_level=opt_level,
-        config=pass_config,
-        disabled_pass=disabled_pass,
-    ):
-        if target.kind.name != "cuda" and isinstance(
-            autotvm.DispatchContext.current, autotvm.FallbackContext
-        ):
-            tophub_context = autotvm.tophub.context(target)
-        else:
-            tophub_context = autotvm.utils.EmptyContext()
-        with tophub_context:
-            return list(extract_task_func(mod, target, relay_params))
+    Parameters
+    ----------
+    database : Database
+        The database to use
+    mod : IRModule
+        The Relay program to be compiled
+    target : tvm.target.Target
+        The compilation target
+    params : Optional[Dict[str, tvm.runtime.NDArray]]
+        The associated parameters of the program
+    backend : str
+        The backend to use. Builtin backends:
+            - "graph"
+            - "vm"
+    opt_level : int
+        The optimization level of the compilation
+    pass_config : Mapping[str, Any]
+        The pass configuration
+    executor : Optional[relay.backend.Executor]
+        The executor to use in relay.build. It is not supported by RelayVM.
+
+    Returns
+    -------
+    lib : Union[Module, tvm.runtime.vm.Executable]
+        The built runtime module or vm Executable for the given relay workload.
+    """
+    # pylint: disable=import-outside-toplevel
+    from tvm import relay
+
+    # pylint: enable=import-outside-toplevel
+    mod, target, params, pass_config, executor = _normalize_params(
+        mod, target, params, pass_config, executor
+    )
+    pass_config.setdefault("relay.backend.use_meta_schedule_dispatch", target.kind.name != "cuda")
+    with Profiler.timeit("PostTuningCompilation"):
+        with target, _autotvm_silencer(), database:
+            with transform.PassContext(
+                opt_level=opt_level,
+                config=pass_config,
+            ):
+                if backend == "graph":
+                    return relay.build(mod, target=target, params=params, executor=executor)
+                elif backend == "vm":
+                    return relay.vm.compile(mod, target=target, params=params)
+                else:
+                    raise ValueError(f"Unknown backend: {backend}")
 
 
 def is_meta_schedule_enabled() -> bool:
@@ -134,7 +384,8 @@ def is_meta_schedule_dispatch_enabled() -> bool:
     enabled: bool
         Whether the meta schedule is enabled
     """
-    return transform.PassContext.current().config.get(
+    result = transform.PassContext.current().config.get(
         "relay.backend.use_meta_schedule_dispatch",
-        False,
+        0,
     )
+    return bool(result & 1)
diff --git a/python/tvm/meta_schedule/runner/local_runner.py b/python/tvm/meta_schedule/runner/local_runner.py
index 2d3214f53b6b..dfd4764607fb 100644
--- a/python/tvm/meta_schedule/runner/local_runner.py
+++ b/python/tvm/meta_schedule/runner/local_runner.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """Local Runner"""
-import logging
 from contextlib import contextmanager
 from typing import Callable, List, Optional, Union
 
@@ -23,6 +22,7 @@
 
 from ...contrib.popen_pool import PopenPoolExecutor
 from ...runtime import Device, Module
+from ..logging import get_logger
 from ..profiler import Profiler
 from ..utils import derived_object, get_global_func_with_default_on_worker
 from .config import EvaluatorConfig
@@ -34,7 +34,7 @@
     run_evaluator_common,
 )
 
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+logger = get_logger(__name__)  # pylint: disable=invalid-name
 
 
 T_ALLOC_ARGUMENT = Callable[  # pylint: disable=invalid-name
diff --git a/python/tvm/meta_schedule/runner/rpc_runner.py b/python/tvm/meta_schedule/runner/rpc_runner.py
index aa6f3daaac60..9bdf715756cc 100644
--- a/python/tvm/meta_schedule/runner/rpc_runner.py
+++ b/python/tvm/meta_schedule/runner/rpc_runner.py
@@ -16,7 +16,6 @@
 # under the License.
 """RPC Runner"""
 import concurrent.futures
-import logging
 import os.path as osp
 from contextlib import contextmanager
 from typing import Callable, List, Optional, Union
@@ -25,6 +24,7 @@
 from tvm.rpc import RPCSession
 from tvm.runtime import Device, Module
 
+from ..logging import get_logger
 from ..profiler import Profiler
 from ..utils import (
     cpu_count,
@@ -41,7 +41,7 @@
     run_evaluator_common,
 )
 
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+logger = get_logger(__name__)  # pylint: disable=invalid-name
 
 
 T_CREATE_SESSION = Callable[  # pylint: disable=invalid-name
diff --git a/python/tvm/meta_schedule/runner/runner.py b/python/tvm/meta_schedule/runner/runner.py
index 539e47f15c41..1753d8b4abf9 100644
--- a/python/tvm/meta_schedule/runner/runner.py
+++ b/python/tvm/meta_schedule/runner/runner.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Runners"""
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, Union
 
 # isort: off
 from typing_extensions import Literal
@@ -167,6 +167,8 @@ def result(self) -> RunnerResult:
 class Runner(Object):
     """The abstract runner interface"""
 
+    RunnerType = Union["Runner", Literal["local", "rpc"]]
+
     def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
         """Run the built artifact and get runner futures.
 
@@ -182,6 +184,24 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
         """
         return _ffi_api.RunnerRun(self, runner_inputs)  # type: ignore # pylint: disable=no-member
 
+    @staticmethod
+    def create(  # pylint: disable=keyword-arg-before-vararg
+        kind: Literal["local", "rpc"] = "local",
+        *args,
+        **kwargs,
+    ) -> "Runner":
+        """Create a Runner."""
+        from . import LocalRunner, RPCRunner  # pylint: disable=import-outside-toplevel
+
+        if kind == "local":
+            return LocalRunner(*args, **kwargs)  # type: ignore
+        elif kind == "rpc":
+            return RPCRunner(*args, **kwargs)  # type: ignore
+        raise ValueError(f"Unknown Runner: {kind}")
+
+
+create = Runner.create  # pylint: disable=invalid-name
+
 
 @register_object("meta_schedule.PyRunner")
 class _PyRunner(Runner):
@@ -228,18 +248,3 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
             The runner futures.
         """
         raise NotImplementedError
-
-
-def create(  # pylint: disable=keyword-arg-before-vararg
-    kind: Literal["local", "rpc"] = "local",
-    *args,
-    **kwargs,
-) -> Runner:
-    """Create a Runner."""
-    from . import LocalRunner, RPCRunner  # pylint: disable=import-outside-toplevel
-
-    if kind == "local":
-        return LocalRunner(*args, **kwargs)  # type: ignore
-    elif kind == "rpc":
-        return RPCRunner(*args, **kwargs)  # type: ignore
-    raise ValueError(f"Unknown Runner: {kind}")
diff --git a/python/tvm/meta_schedule/schedule_rule/schedule_rule.py b/python/tvm/meta_schedule/schedule_rule/schedule_rule.py
index 2c8e223611aa..19cb1d8a55ec 100644
--- a/python/tvm/meta_schedule/schedule_rule/schedule_rule.py
+++ b/python/tvm/meta_schedule/schedule_rule/schedule_rule.py
@@ -20,6 +20,11 @@
 """
 from typing import TYPE_CHECKING, Callable, List
 
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
+
 from tvm._ffi import register_object
 from tvm.runtime import Object
 from tvm.tir.schedule import BlockRV, Schedule
@@ -76,6 +81,36 @@ def clone(self) -> "ScheduleRule":
         """
         return _ffi_api.ScheduleRuleClone(self)  # type: ignore # pylint: disable=no-member
 
+    @staticmethod
+    def create(kind: Literal["llvm", "cuda", "cuda-tensorcore", "hexagon"]) -> List["ScheduleRule"]:
+        """Create a list of schedule rules for the given kind.
+
+        Parameters
+        ----------
+        kind : Literal["llvm", "cuda", "cuda-tensorcore", "hexagon"]
+            The kind of the schedule rules.
+
+        Returns
+        -------
+        rules : List[ScheduleRule]
+            The list of schedule rules.
+        """
+        funcs = {
+            # pylint: disable=no-member
+            "llvm": _ffi_api.ScheduleRuleDefaultLLVM,  # type: ignore
+            "cuda": _ffi_api.ScheduleRuleDefaultCUDA,  # type: ignore
+            "cuda-tensorcore": _ffi_api.ScheduleRuleDefaultCUDATensorCore,  # type: ignore
+            "hexagon": _ffi_api.ScheduleRuleDefaultHexagon,  # type: ignore
+            # pylint: enable=no-member
+        }
+        for k, v in funcs.items():
+            if k == kind:
+                return v()
+        raise ValueError(f"Unsupported kind {kind} for schedule rule creation.")
+
+
+create = ScheduleRule.create  # pylint: disable=invalid-name
+
 
 @register_object("meta_schedule.PyScheduleRule")
 class _PyScheduleRule(ScheduleRule):
diff --git a/python/tvm/meta_schedule/search_strategy/evolutionary_search.py b/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
index f54fc53935f0..2851ebe7b1d1 100644
--- a/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
+++ b/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
@@ -29,10 +29,6 @@ class EvolutionarySearch(SearchStrategy):
 
     Parameters
     ----------
-    num_trials_per_iter : int
-        Number of trials per iteration.
-    max_trials_per_task : int
-        Total number of trials.
     population_size : int
         The initial population of traces from measured samples and randomly generated samples.
     init_measured_ratio : int
@@ -49,8 +45,6 @@ class EvolutionarySearch(SearchStrategy):
         The ratio of greedy selected samples in the final picks.
     """
 
-    num_trials_per_iter: int
-    max_trials_per_task: int
     population_size: int
     init_measured_ratio: int
     init_min_unmeasured: int
@@ -62,8 +56,6 @@ class EvolutionarySearch(SearchStrategy):
     def __init__(
         self,
         *,
-        num_trials_per_iter: int,
-        max_trials_per_task: int,
         population_size: int = 2048,
         init_measured_ratio: float = 0.2,
         init_min_unmeasured: int = 50,
@@ -75,8 +67,6 @@ def __init__(
         """Constructor"""
         self.__init_handle_by_constructor__(
             _ffi_api.SearchStrategyEvolutionarySearch,  # type: ignore # pylint: disable=no-member
-            num_trials_per_iter,
-            max_trials_per_task,
             population_size,
             init_measured_ratio,
             init_min_unmeasured,
diff --git a/python/tvm/meta_schedule/search_strategy/replay_func.py b/python/tvm/meta_schedule/search_strategy/replay_func.py
index d89e2b133cde..f4660014241a 100644
--- a/python/tvm/meta_schedule/search_strategy/replay_func.py
+++ b/python/tvm/meta_schedule/search_strategy/replay_func.py
@@ -35,17 +35,8 @@ class ReplayFunc(SearchStrategy):
         Total number of trials for one task
     """
 
-    num_trials_per_iter: int
-    max_trials_per_task: int
-
-    def __init__(
-        self,
-        num_trials_per_iter: int,
-        max_trials_per_task: int,
-    ):
+    def __init__(self):
         """Constructor"""
         self.__init_handle_by_constructor__(
             _ffi_api.SearchStrategyReplayFunc,  # type: ignore # pylint: disable=no-member
-            num_trials_per_iter,
-            max_trials_per_task,
         )
diff --git a/python/tvm/meta_schedule/search_strategy/replay_trace.py b/python/tvm/meta_schedule/search_strategy/replay_trace.py
index 36dbb8734e57..e24ad5a5219a 100644
--- a/python/tvm/meta_schedule/search_strategy/replay_trace.py
+++ b/python/tvm/meta_schedule/search_strategy/replay_trace.py
@@ -29,25 +29,15 @@ class ReplayTrace(SearchStrategy):
 
     Parameters
     ----------
-    num_trials_per_iter : int
-        Number of trials per iteration.
-    max_trials_per_task : int
-        Total number of trials for one task
     max_fail_count : int
         Max number of failures during trace replaying.
     """
 
-    num_trials_per_iter: int
-    max_trials_per_task: int
     max_fail_count: int
 
-    def __init__(
-        self, num_trials_per_iter: int, max_trials_per_task: int, max_fail_count: int = 100
-    ):
+    def __init__(self, max_fail_count: int = 100):
         """Constructor"""
         self.__init_handle_by_constructor__(
             _ffi_api.SearchStrategyReplayTrace,  # type: ignore # pylint: disable=no-member
-            num_trials_per_iter,
-            max_trials_per_task,
             max_fail_count,
         )
diff --git a/python/tvm/meta_schedule/search_strategy/search_strategy.py b/python/tvm/meta_schedule/search_strategy/search_strategy.py
index 276e65713325..3b72cc8d1ac6 100644
--- a/python/tvm/meta_schedule/search_strategy/search_strategy.py
+++ b/python/tvm/meta_schedule/search_strategy/search_strategy.py
@@ -18,7 +18,7 @@
 Meta Schedule search strategy that generates the measure
 candidates for measurement.
 """
-from typing import TYPE_CHECKING, Callable, List, Optional
+from typing import TYPE_CHECKING, Callable, List, Optional, Union
 
 # isort: off
 from typing_extensions import Literal
@@ -76,10 +76,16 @@ def __init__(
 
 @register_object("meta_schedule.SearchStrategy")
 class SearchStrategy(Object):
-    """
-    Search strategy is the class that generates the measure candidates. It has to be pre-tuned
-    before usage and post-tuned after usage.
-    """
+    """Search strategy is the class that generates the measure candidates."""
+
+    SearchStrategyType = Union[
+        "SearchStrategy",
+        Literal[
+            "replay-func",
+            "replay-trace",
+            "evolutionary",
+        ],
+    ]
 
     def _initialize_with_tune_context(self, context: "TuneContext") -> None:
         """Initialize the search strategy with tuning context.
@@ -95,6 +101,8 @@ def _initialize_with_tune_context(self, context: "TuneContext") -> None:
 
     def pre_tuning(
         self,
+        max_trials: int,
+        num_trials_per_iter: int,
         design_spaces: List[Schedule],
         database: Optional["Database"] = None,
         cost_model: Optional["CostModel"] = None,
@@ -103,6 +111,10 @@ def pre_tuning(
 
         Parameters
         ----------
+        max_trials : int
+            The maximum number of trials.
+        num_trials_per_iter : int
+            The number of trials per iteration.
         design_spaces : List[Schedule]
             The design spaces used during tuning process.
         database : Optional[Database] = None
@@ -112,6 +124,8 @@ def pre_tuning(
         """
         _ffi_api.SearchStrategyPreTuning(  # type: ignore # pylint: disable=no-member
             self,
+            max_trials,
+            num_trials_per_iter,
             design_spaces,
             database,
             cost_model,
@@ -161,6 +175,34 @@ def clone(self) -> "SearchStrategy":
         """
         return _ffi_api.SearchStrategyClone(self)  # type: ignore # pylint: disable=no-member
 
+    @staticmethod
+    def create(  # pylint: disable=keyword-arg-before-vararg
+        kind: Literal[
+            "evolutionary",
+            "replay-trace",
+            "replay-func",
+        ] = "evolutionary",
+        *args,
+        **kwargs,
+    ) -> "SearchStrategy":
+        """Create a search strategy."""
+        from . import (  # pylint: disable=import-outside-toplevel
+            EvolutionarySearch,
+            ReplayFunc,
+            ReplayTrace,
+        )
+
+        if kind == "evolutionary":
+            return EvolutionarySearch(*args, **kwargs)
+        if kind == "replay-trace":
+            return ReplayTrace(*args, **kwargs)
+        if kind == "replay-func":
+            return ReplayFunc(*args, **kwargs)  # type: ignore
+        raise ValueError(f"Unknown SearchStrategy: {kind}")
+
+
+create = SearchStrategy.create  # pylint: disable=invalid-name
+
 
 @register_object("meta_schedule.PySearchStrategy")
 class _PySearchStrategy(SearchStrategy):
@@ -223,7 +265,14 @@ def _initialize_with_tune_context(self, context: "TuneContext") -> None:
         """
         raise NotImplementedError
 
-    def pre_tuning(self, design_spaces: List[Schedule]) -> None:
+    def pre_tuning(
+        self,
+        max_trials: int,
+        num_trials_per_iter: int,
+        design_spaces: List[Schedule],
+        database: Optional["Database"] = None,
+        cost_model: Optional["CostModel"] = None,
+    ) -> None:
         """Pre-tuning for the search strategy.
 
         Parameters
@@ -272,28 +321,3 @@ def clone(self) -> SearchStrategy:
             The cloned search strategy.
         """
         raise NotImplementedError
-
-
-def create(  # pylint: disable=keyword-arg-before-vararg
-    kind: Literal[
-        "evolutionary",
-        "replay_trace",
-        "replay_func",
-    ] = "evolutionary",
-    *args,
-    **kwargs,
-) -> SearchStrategy:
-    """Create a search strategy."""
-    from . import (  # pylint: disable=import-outside-toplevel
-        EvolutionarySearch,
-        ReplayFunc,
-        ReplayTrace,
-    )
-
-    if kind == "evolutionary":
-        return EvolutionarySearch(*args, **kwargs)
-    if kind == "replay_trace":
-        return ReplayTrace(*args, **kwargs)
-    if kind == "replay_func":
-        return ReplayFunc(*args, **kwargs)
-    raise ValueError(f"Unknown SearchStrategy: {kind}")
diff --git a/python/tvm/meta_schedule/space_generator/post_order_apply.py b/python/tvm/meta_schedule/space_generator/post_order_apply.py
index 6e2a2c52b1a1..930e8a51dc61 100644
--- a/python/tvm/meta_schedule/space_generator/post_order_apply.py
+++ b/python/tvm/meta_schedule/space_generator/post_order_apply.py
@@ -15,11 +15,16 @@
 # specific language governing permissions and limitations
 # under the License.
 """Post Order Apply Space Generator."""
-
-
 from tvm._ffi import register_object
-from .space_generator import SpaceGenerator
+
 from .. import _ffi_api
+from .space_generator import (
+    MutatorProbType,
+    PostprocType,
+    ScheduleRuleType,
+    SpaceGenerator,
+    _normalize_rules,
+)
 
 
 @register_object("meta_schedule.PostOrderApply")
@@ -37,8 +42,19 @@ class PostOrderApply(SpaceGenerator):
         all blocks will have schedules generated.
     """
 
-    def __init__(self, f_block_filter=None):
+    def __init__(
+        self,
+        f_block_filter=None,
+        sch_rules: ScheduleRuleType = "from-target",
+        postprocs: PostprocType = "from-target",
+        mutator_probs: MutatorProbType = "from-target",
+    ):
         """Constructor"""
+        sch_rules, postprocs, mutator_probs = _normalize_rules(sch_rules, postprocs, mutator_probs)
         self.__init_handle_by_constructor__(
-            _ffi_api.SpaceGeneratorPostOrderApply, f_block_filter  # type: ignore # pylint: disable=no-member
+            _ffi_api.SpaceGeneratorPostOrderApply,  # type: ignore # pylint: disable=no-member
+            f_block_filter,
+            sch_rules,
+            postprocs,
+            mutator_probs,
         )
diff --git a/python/tvm/meta_schedule/space_generator/schedule_fn.py b/python/tvm/meta_schedule/space_generator/schedule_fn.py
index d6b063dcb263..65956e843679 100644
--- a/python/tvm/meta_schedule/space_generator/schedule_fn.py
+++ b/python/tvm/meta_schedule/space_generator/schedule_fn.py
@@ -18,7 +18,13 @@
 from tvm._ffi import register_object
 
 from .. import _ffi_api
-from .space_generator import SpaceGenerator
+from .space_generator import (
+    MutatorProbType,
+    PostprocType,
+    ScheduleRuleType,
+    SpaceGenerator,
+    _normalize_rules,
+)
 
 
 @register_object("meta_schedule.ScheduleFn")
@@ -30,7 +36,13 @@ class ScheduleFn(SpaceGenerator):
     - 3) [Schedule] -> List[Schedule]
     """
 
-    def __init__(self, sch_fn: SpaceGenerator.ScheduleFnType):
+    def __init__(
+        self,
+        sch_fn: SpaceGenerator.ScheduleFnType,
+        sch_rules: ScheduleRuleType = "from-target",
+        postprocs: PostprocType = "from-target",
+        mutator_probs: MutatorProbType = "from-target",
+    ):
         """Constructor.
 
         Parameters
@@ -41,7 +53,11 @@ def __init__(self, sch_fn: SpaceGenerator.ScheduleFnType):
             - 2) [Schedule] -> Schedule
             - 3) [Schedule] -> List[Schedule]
         """
+        sch_rules, postprocs, mutator_probs = _normalize_rules(sch_rules, postprocs, mutator_probs)
         self.__init_handle_by_constructor__(
             _ffi_api.SpaceGeneratorScheduleFn,  # type: ignore # pylint: disable=no-member
             sch_fn,
+            sch_rules,
+            postprocs,
+            mutator_probs,
         )
diff --git a/python/tvm/meta_schedule/space_generator/space_generator.py b/python/tvm/meta_schedule/space_generator/space_generator.py
index 23c0361645b5..f6212a360a87 100644
--- a/python/tvm/meta_schedule/space_generator/space_generator.py
+++ b/python/tvm/meta_schedule/space_generator/space_generator.py
@@ -18,7 +18,7 @@
 Meta Schedule design space generators that generates design
 space for generation of measure candidates.
 """
-from typing import TYPE_CHECKING, Callable, List, Optional, Union
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
 
 # isort: off
 from typing_extensions import Literal
@@ -32,6 +32,9 @@
 from .. import _ffi_api
 
 if TYPE_CHECKING:
+    from ..mutator import Mutator
+    from ..postproc import Postproc
+    from ..schedule_rule import ScheduleRule
     from ..tune_context import TuneContext
 
 
@@ -45,6 +48,16 @@ class SpaceGenerator(Object):
         Callable[[Schedule], List[Schedule]],  # Multiple outputs
     ]
 
+    SpaceGeneratorType = Union[
+        "SpaceGenerator",
+        ScheduleFnType,
+        Literal["post-order-apply", "union"],
+    ]
+
+    sch_rules: Optional[List["ScheduleRule"]]
+    postprocs: Optional[List["Postproc"]]
+    mutator_probs: Optional[Dict["Mutator", float]]
+
     def _initialize_with_tune_context(self, context: "TuneContext") -> None:
         """Initialize the design space generator with tuning context.
 
@@ -82,8 +95,91 @@ def clone(self) -> "SpaceGenerator":
         """
         return _ffi_api.SpaceGeneratorClone(self)  # type: ignore # pylint: disable=no-member
 
+    @staticmethod
+    def create(  # pylint: disable=keyword-arg-before-vararg
+        kind: Union[
+            Literal["post-order-apply", "union"],
+            ScheduleFnType,
+        ] = "post-order-apply",
+        *args,
+        **kwargs,
+    ) -> "SpaceGenerator":
+        """Create a design space generator."""
+        from . import (  # pylint: disable=import-outside-toplevel
+            PostOrderApply,
+            ScheduleFn,
+            SpaceGeneratorUnion,
+        )
+
+        if callable(kind):
+
+            def create_schedule_fn(
+                func,
+                sch_rules=[],
+                postprocs=[],
+                mutator_probs={},
+            ):  # pylint: disable=dangerous-default-value
+                return ScheduleFn(func, sch_rules, postprocs, mutator_probs)
+
+            return create_schedule_fn(kind, *args, **kwargs)  # type: ignore
+        if kind == "post-order-apply":
+            return PostOrderApply(*args, **kwargs)
+        if kind == "union":
+            return SpaceGeneratorUnion(*args, **kwargs)
+        raise ValueError(f"Unknown SpaceGenerator: {kind}")
+
 
 ScheduleFnType = SpaceGenerator.ScheduleFnType
+ScheduleRuleType = Union[
+    List["ScheduleRule"],
+    Literal["llvm", "cuda", "cuda-tensorcore", "hexagon", "from-target"],
+]
+PostprocType = Union[
+    List["Postproc"],
+    Literal["llvm", "cuda", "cuda-tensorcore", "hexagon", "from-target"],
+]
+MutatorProbType = Union[
+    Dict["Mutator", float],
+    Literal["llvm", "cuda", "cuda-tensorcore", "hexagon", "from-target"],
+]
+create = SpaceGenerator.create  # pylint: disable=invalid-name
+
+
+def _normalize_rules(
+    sch_rules: ScheduleRuleType,
+    postprocs: PostprocType,
+    mutator_probs: MutatorProbType,
+) -> Tuple[
+    Optional[List["ScheduleRule"]],
+    Optional[List["Postproc"]],
+    Optional[Dict["Mutator", float]],
+]:
+    # pylint: disable=import-outside-toplevel
+    from ..mutator import Mutator
+    from ..postproc import Postproc
+    from ..schedule_rule import ScheduleRule
+
+    # pylint: enable=import-outside-toplevel
+    assert sch_rules is not None
+    assert postprocs is not None
+    assert mutator_probs is not None
+
+    if isinstance(sch_rules, str):
+        if sch_rules == "from-target":
+            sch_rules = None
+        else:
+            sch_rules = ScheduleRule.create(sch_rules)
+    if isinstance(postprocs, str):
+        if postprocs == "from-target":
+            postprocs = None
+        else:
+            postprocs = Postproc.create(postprocs)
+    if isinstance(mutator_probs, str):
+        if mutator_probs == "from-target":
+            mutator_probs = None
+        else:
+            mutator_probs = Mutator.create(mutator_probs)
+    return sch_rules, postprocs, mutator_probs  # type: ignore
 
 
 @register_object("meta_schedule.PySpaceGenerator")
@@ -97,14 +193,21 @@ class _PySpaceGenerator(SpaceGenerator):
 
     def __init__(
         self,
+        sch_rules: ScheduleRuleType = "from-target",
+        postprocs: PostprocType = "from-target",
+        mutator_probs: MutatorProbType = "from-target",
         f_initialize_with_tune_context: Optional[Callable] = None,
         f_generate_design_space: Optional[Callable] = None,
         f_clone: Optional[Callable] = None,
     ):
         """Constructor."""
+        sch_rules, postprocs, mutator_probs = _normalize_rules(sch_rules, postprocs, mutator_probs)
 
         self.__init_handle_by_constructor__(
             _ffi_api.SpaceGeneratorPySpaceGenerator,  # type: ignore # pylint: disable=no-member
+            sch_rules,
+            postprocs,
+            mutator_probs,
             f_initialize_with_tune_context,
             f_generate_design_space,
             f_clone,
@@ -121,6 +224,7 @@ class PySpaceGenerator:
 
     _tvm_metadata = {
         "cls": _PySpaceGenerator,
+        "fields": ["sch_rules", "postprocs", "mutator_probs"],
         "methods": ["_initialize_with_tune_context", "generate_design_space", "clone"],
     }
 
@@ -158,27 +262,3 @@ def clone(self) -> SpaceGenerator:
             The cloned design space generator.
         """
         raise NotImplementedError
-
-
-def create(  # pylint: disable=keyword-arg-before-vararg
-    kind: Union[
-        Literal["post_order_apply", "union"],
-        ScheduleFnType,
-    ] = "post_order_apply",
-    *args,
-    **kwargs,
-) -> SpaceGenerator:
-    """Create a design space generator."""
-    from . import (  # pylint: disable=import-outside-toplevel
-        PostOrderApply,
-        ScheduleFn,
-        SpaceGeneratorUnion,
-    )
-
-    if callable(kind):
-        return ScheduleFn(kind, *args, **kwargs)  # type: ignore
-    if kind == "post_order_apply":
-        return PostOrderApply(*args, **kwargs)
-    if kind == "union":
-        return SpaceGeneratorUnion(*args, **kwargs)
-    raise ValueError(f"Unknown SpaceGenerator: {kind}")
diff --git a/python/tvm/meta_schedule/space_generator/space_generator_union.py b/python/tvm/meta_schedule/space_generator/space_generator_union.py
index 5541ab0b5026..e3d8f441d1ef 100644
--- a/python/tvm/meta_schedule/space_generator/space_generator_union.py
+++ b/python/tvm/meta_schedule/space_generator/space_generator_union.py
@@ -20,14 +20,26 @@
 from tvm._ffi import register_object
 
 from .. import _ffi_api
-from .space_generator import SpaceGenerator
+from .space_generator import (
+    MutatorProbType,
+    PostprocType,
+    ScheduleRuleType,
+    SpaceGenerator,
+    _normalize_rules,
+)
 
 
 @register_object("meta_schedule.SpaceGeneratorUnion")
 class SpaceGeneratorUnion(SpaceGenerator):
     """Union of design space generators."""
 
-    def __init__(self, space_generators: List[SpaceGenerator]):
+    def __init__(
+        self,
+        space_generators: List[SpaceGenerator],
+        sch_rules: ScheduleRuleType = "from-target",
+        postprocs: PostprocType = "from-target",
+        mutator_probs: MutatorProbType = "from-target",
+    ):
         """Constructor.
 
         Parameters
@@ -35,7 +47,11 @@ def __init__(self, space_generators: List[SpaceGenerator]):
         space_generators : List[SpaceGenerator]
             The list of design space generators to be unioned.
         """
+        sch_rules, postprocs, mutator_probs = _normalize_rules(sch_rules, postprocs, mutator_probs)
         self.__init_handle_by_constructor__(
             _ffi_api.SpaceGeneratorSpaceGeneratorUnion,  # type: ignore # pylint: disable=no-member
             space_generators,
+            sch_rules,
+            postprocs,
+            mutator_probs,
         )
diff --git a/python/tvm/meta_schedule/task_scheduler/gradient_based.py b/python/tvm/meta_schedule/task_scheduler/gradient_based.py
index 20d32dd1c59f..963de8711e10 100644
--- a/python/tvm/meta_schedule/task_scheduler/gradient_based.py
+++ b/python/tvm/meta_schedule/task_scheduler/gradient_based.py
@@ -15,24 +15,13 @@
 # specific language governing permissions and limitations
 # under the License.
 """Gradient Based Task Scheduler"""
-import logging
-from typing import TYPE_CHECKING, List, Optional
-
 from tvm._ffi import register_object
 
 from .. import _ffi_api
-from ..builder import Builder
-from ..cost_model import CostModel
-from ..database import Database
-from ..measure_callback import MeasureCallback
-from ..runner import Runner
-from ..utils import make_logging_func
+from ..logging import get_logger, get_logging_func
 from .task_scheduler import TaskScheduler
 
-if TYPE_CHECKING:
-    from ..tune_context import TuneContext
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+logger = get_logger(__name__)  # pylint: disable=invalid-name
 
 
 @register_object("meta_schedule.GradientBased")
@@ -41,15 +30,7 @@ class GradientBased(TaskScheduler):
 
     def __init__(
         self,
-        tasks: List["TuneContext"],
-        task_weights: List[float],
-        builder: Builder,
-        runner: Runner,
         *,
-        database: Database,
-        cost_model: Optional[CostModel] = None,
-        measure_callbacks: Optional[List[MeasureCallback]] = None,
-        max_trials: int,
         alpha: float = 0.2,
         window_size: int = 3,
         seed: int = -1,
@@ -58,22 +39,6 @@ def __init__(
 
         Parameters
         ----------
-        tasks : List[TuneContext]
-            List of tasks to schedule.
-        task_weights : List[float]
-            The weights of each task.
-        builder : Builder
-            The builder.
-        runner : Runner
-            The runner.
-        database : Database
-            The database.
-        cost_model : CostModel, default None.
-            The cost model of the scheduler.
-        measure_callbacks : Optional[List[MeasureCallback]] = None
-            The list of measure callbacks of the scheduler.
-        max_trials : int
-            The maximum number of trials to run.
         alpha : float = 0.2
             The parameter alpha in gradient computation.
         window_size : int = 3
@@ -83,15 +48,7 @@ def __init__(
         """
         self.__init_handle_by_constructor__(
             _ffi_api.TaskSchedulerGradientBased,  # type: ignore # pylint: disable=no-member
-            tasks,
-            task_weights,
-            builder,
-            runner,
-            database,
-            cost_model,
-            measure_callbacks,
-            max_trials,
-            make_logging_func(logger),
+            get_logging_func(logger),
             alpha,
             window_size,
             seed,
diff --git a/python/tvm/meta_schedule/task_scheduler/round_robin.py b/python/tvm/meta_schedule/task_scheduler/round_robin.py
index ed395643bbaa..e5c7f14af424 100644
--- a/python/tvm/meta_schedule/task_scheduler/round_robin.py
+++ b/python/tvm/meta_schedule/task_scheduler/round_robin.py
@@ -15,87 +15,22 @@
 # specific language governing permissions and limitations
 # under the License.
 """Round Robin Task Scheduler"""
-
-import logging
-from typing import TYPE_CHECKING, List, Optional
-
 from tvm._ffi import register_object
-from tvm.meta_schedule.measure_callback.measure_callback import MeasureCallback
 
 from .. import _ffi_api
-from ..builder import Builder
-from ..cost_model import CostModel
-from ..database import Database
-from ..runner import Runner
-from ..utils import make_logging_func
+from ..logging import get_logger, get_logging_func
 from .task_scheduler import TaskScheduler
 
-if TYPE_CHECKING:
-    from ..tune_context import TuneContext
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+logger = get_logger(__name__)  # pylint: disable=invalid-name
 
 
 @register_object("meta_schedule.RoundRobin")
 class RoundRobin(TaskScheduler):
-    """Round Robin Task Scheduler
-
-    Parameters
-    ----------
-    tasks: List[TuneContext]
-        The list of tune context to process.
-    builder: Builder
-        The builder of the scheduler.
-    runner: Runner
-        The runner of the scheduler.
-    database: Database
-        The database of the scheduler.
-    measure_callbacks: Optional[List[MeasureCallback]] = None
-        The list of measure callbacks of the scheduler.
-    """
-
-    def __init__(
-        self,
-        tasks: List["TuneContext"],
-        task_weights: List[float],
-        builder: Builder,
-        runner: Runner,
-        *,
-        database: Database,
-        cost_model: Optional[CostModel] = None,
-        measure_callbacks: Optional[List[MeasureCallback]] = None,
-        max_trials: int,
-    ) -> None:
-        """Constructor.
+    """Round Robin Task Scheduler"""
 
-        Parameters
-        ----------
-        tasks : List[TuneContext]
-            List of tasks to schedule.
-        task_weights : List[float]
-            List of weights for each task. Not used in round robin.
-        builder : Builder
-            The builder.
-        runner : Runner
-            The runner.
-        database : Database
-            The database.
-        cost_model : Optional[CostModel]
-            The cost model.
-        measure_callbacks: Optional[List[MeasureCallback]]
-            The list of measure callbacks of the scheduler.
-        max_trials : int
-            The maximum number of trials.
-        """
-        del task_weights
+    def __init__(self) -> None:
+        """Constructor."""
         self.__init_handle_by_constructor__(
             _ffi_api.TaskSchedulerRoundRobin,  # type: ignore # pylint: disable=no-member
-            tasks,
-            builder,
-            runner,
-            database,
-            cost_model,
-            measure_callbacks,
-            max_trials,
-            make_logging_func(logger),
+            get_logging_func(logger),
         )
diff --git a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py
index 29a5f18dfb8a..f06f4d911fa8 100644
--- a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py
+++ b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py
@@ -15,9 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Auto-tuning Task Scheduler"""
-
-import logging
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, Union
 
 # isort: off
 from typing_extensions import Literal
@@ -28,53 +26,44 @@
 from tvm.runtime import Object
 
 from .. import _ffi_api
-from ..builder import Builder
+from ..builder import Builder, BuilderResult
 from ..cost_model import CostModel
 from ..database import Database
+from ..logging import get_logger, get_logging_func
 from ..measure_callback import MeasureCallback
 from ..runner import Runner, RunnerResult
+from ..search_strategy import MeasureCandidate
 from ..tune_context import TuneContext
-from ..utils import make_logging_func
 
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+logger = get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@register_object("meta_schedule.TaskRecord")
+class TaskRecord(Object):
+    """The running record of a task."""
+
+    ctx: TuneContext
+    task_weight: float
+    flop: float
+    is_terminated: bool
+    build_error_count: int
+    run_error_count: int
+    measure_candidates: List[MeasureCandidate]
+    builder_results: List[BuilderResult]
+    runner_results: List[RunnerResult]
 
 
 @register_object("meta_schedule.TaskScheduler")
 class TaskScheduler(Object):
-    """The abstract task scheduler interface.
-
-    Parameters
-    ----------
-    tasks: List[TuneContext]
-        The list of tune context to process.
-    builder: Builder
-        The builder of the scheduler.
-    runner: Runner
-        The runner of the scheduler.
-    database: Database
-        The database of the scheduler.
-    max_trials : int
-        The maximum number of trials allowed.
-    cost_model : Optional[CostModel]
-        The cost model used for search.
-    measure_callbacks: List[MeasureCallback] = None
-        The list of measure callbacks of the scheduler.
-    num_trials_already : int
-        The number of trials already conducted.
-    """
+    """The abstract task scheduler interface."""
 
-    tasks: List[TuneContext]
-    builder: Builder
-    runner: Runner
-    database: Database
-    max_trials: int
-    cost_model: Optional[CostModel]
-    measure_callbacks: List[MeasureCallback]
-    num_trials_already: int
+    tasks_: List[TaskRecord]
+    measure_callbacks_: List[MeasureCallback]
+    database_: Optional[Database]
+    cost_model_: Optional[CostModel]
+    remaining_tasks_: int
 
-    def tune(self) -> None:
-        """Auto-tuning."""
-        _ffi_api.TaskSchedulerTune(self)  # type: ignore # pylint: disable=no-member
+    TaskSchedulerType = Union["TaskScheduler", Literal["gradient", "round-robin"]]
 
     def next_task_id(self) -> int:
         """Fetch the next task id.
@@ -101,15 +90,68 @@ def join_running_task(self, task_id: int) -> List[RunnerResult]:
         """
         return _ffi_api.TaskSchedulerJoinRunningTask(self, task_id)  # type: ignore # pylint: disable=no-member
 
-    def initialize_task(self, task_id: int) -> None:
-        """Initialize modules of the given task.
+    def tune(
+        self,
+        tasks: List[TuneContext],
+        task_weights: List[float],
+        max_trials_global: int,
+        max_trials_per_task: int,
+        num_trials_per_iter: int,
+        builder: Builder,
+        runner: Runner,
+        measure_callbacks: List[MeasureCallback],
+        database: Optional[Database],
+        cost_model: Optional[CostModel],
+    ) -> None:
+        """Auto-tuning.
+
+        Parameters
+        ----------
+        tasks : List[TuneContext]
+            The list of tuning contexts as tasks.
+        task_weights : List[float]
+            The list of task weights.
+        max_trials_global : int
+            The maximum number of trials globally.
+        max_trials_per_task : int
+            The maximum number of trials per task.
+        num_trials_per_iter : int
+            The number of trials per iteration.
+        builder : Builder
+            The builder.
+        runner : Runner
+            The runner.
+        measure_callbacks : List[MeasureCallback]
+            The list of measure callbacks.
+        database : Optional[Database]
+            The database.
+        cost_model : Optional[CostModel]
+            The cost model.
+        """
+        task_weights = [float(w) for w in task_weights]
+        _ffi_api.TaskSchedulerTune(  # type: ignore # pylint: disable=no-member
+            self,
+            tasks,
+            task_weights,
+            max_trials_global,
+            max_trials_per_task,
+            num_trials_per_iter,
+            builder,
+            runner,
+            measure_callbacks,
+            database,
+            cost_model,
+        )
+
+    def terminate_task(self, task_id: int) -> None:
+        """Terminate the task
 
         Parameters
         ----------
         task_id : int
-            The task id to be initialized.
+            The task id to be terminated.
         """
-        _ffi_api.TaskSchedulerInitializeTask(self, task_id)  # type: ignore # pylint: disable=no-member
+        _ffi_api.TaskSchedulerTerminateTask(self, task_id)  # type: ignore # pylint: disable=no-member
 
     def touch_task(self, task_id: int) -> None:
         """Touch the task and update its status
@@ -121,6 +163,37 @@ def touch_task(self, task_id: int) -> None:
         """
         _ffi_api.TaskSchedulerTouchTask(self, task_id)  # type: ignore # pylint: disable=no-member
 
+    def tuning_statistics(self) -> str:
+        """Returns a human-readable string of the tuning statistics.
+
+        Returns
+        -------
+        tuning_statistics : str
+            The tuning statistics.
+        """
+        return _ffi_api.TaskSchedulerTuningStatistics(self)  # type: ignore # pylint: disable=no-member
+
+    @staticmethod
+    def create(  # pylint: disable=keyword-arg-before-vararg
+        kind: Literal["round-robin", "gradient"] = "gradient",
+        *args,
+        **kwargs,
+    ) -> "TaskScheduler":
+        """Create a task scheduler."""
+        from . import (  # pylint: disable=import-outside-toplevel
+            GradientBased,
+            RoundRobin,
+        )
+
+        if kind == "round-robin":
+            return RoundRobin(*args, **kwargs)  # type: ignore
+        if kind == "gradient":
+            return GradientBased(*args, **kwargs)
+        raise ValueError(f"Unknown TaskScheduler name: {kind}")
+
+
+create = TaskScheduler.create  # pylint: disable=invalid-name
+
 
 @register_object("meta_schedule.PyTaskScheduler")
 class _PyTaskScheduler(TaskScheduler):
@@ -133,36 +206,18 @@ class _PyTaskScheduler(TaskScheduler):
 
     def __init__(
         self,
-        tasks: List[TuneContext],
-        builder: Builder,
-        runner: Runner,
-        database: Database,
-        max_trials: int,
-        cost_model: Optional[CostModel] = None,
-        measure_callbacks: Optional[List[MeasureCallback]] = None,
-        f_tune: Callable = None,
-        f_initialize_task: Callable = None,
-        f_touch_task: Callable = None,
-        f_join_running_task: Callable = None,
-        f_next_task_id: Callable = None,
+        f_next_task_id: Callable,
+        f_join_running_task: Callable,
+        f_tune: Callable,
     ):
         """Constructor."""
 
         self.__init_handle_by_constructor__(
             _ffi_api.TaskSchedulerPyTaskScheduler,  # type: ignore # pylint: disable=no-member
-            tasks,
-            builder,
-            runner,
-            database,
-            max_trials,
-            cost_model,
-            measure_callbacks,
-            make_logging_func(logger),
-            f_tune,
-            f_initialize_task,
-            f_touch_task,
-            f_join_running_task,
+            get_logging_func(logger),
             f_next_task_id,
+            f_join_running_task,
+            f_tune,
         )
 
 
@@ -176,47 +231,39 @@ class PyTaskScheduler:
 
     _tvm_metadata = {
         "cls": _PyTaskScheduler,
-        "fields": [
-            "tasks",
-            "builder",
-            "runner",
-            "database",
-            "cost_model",
-            "measure_callbacks",
-            "max_trials",
-        ],
-        "methods": [
-            "tune",
-            "initialize_task",
-            "touch_task",
-            "join_running_task",
-            "next_task_id",
-        ],
+        "fields": [],
+        "methods": ["next_task_id", "join_running_task", "tune"],
     }
 
-    def __init__(
+    def __init__(self):
+        ...
+
+    def tune(
         self,
         tasks: List[TuneContext],
+        task_weights: List[float],
+        max_trials_global: int,
+        max_trials_per_task: int,
         builder: Builder,
         runner: Runner,
-        *,
-        database: Optional[Database] = None,
-        cost_model: Optional[CostModel] = None,
-        measure_callbacks: Optional[List[MeasureCallback]] = None,
-        max_trials: int,
-    ):
-        self.tasks = tasks
-        self.builder = builder
-        self.runner = runner
-        self.database = database
-        self.cost_model = cost_model
-        self.measure_callbacks = measure_callbacks
-        self.max_trials = max_trials
-
-    def tune(self) -> None:
+        measure_callbacks: List[MeasureCallback],
+        database: Optional[Database],
+        cost_model: Optional[CostModel],
+    ) -> None:
         """Auto-tuning."""
         # Using self._outer to replace the self pointer
-        _ffi_api.TaskSchedulerTune(self._outer())  # type: ignore # pylint: disable=no-member
+        _ffi_api.TaskSchedulerTune(  # type: ignore # pylint: disable=no-member
+            self._outer(),  # type: ignore # pylint: disable=no-member
+            tasks,
+            task_weights,
+            max_trials_global,
+            max_trials_per_task,
+            builder,
+            runner,
+            measure_callbacks,
+            database,
+            cost_model,
+        )
 
     def next_task_id(self) -> int:
         """Fetch the next task id.
@@ -238,40 +285,3 @@ def join_running_task(self, task_id: int) -> List[RunnerResult]:
         """
         # Using self._outer to replace the self pointer
         return _ffi_api.TaskSchedulerJoinRunningTask(self._outer(), task_id)  # type: ignore # pylint: disable=no-member
-
-    def initialize_task(self, task_id: int) -> None:
-        """Initialize modules of the given task.
-
-        Parameters
-        ----------
-        task_id : int
-            The task id to be initialized.
-        """
-        # Using self._outer to replace the self pointer
-        _ffi_api.TaskSchedulerInitializeTask(self._outer(), task_id)  # type: ignore # pylint: disable=no-member
-
-    def touch_task(self, task_id: int) -> None:
-        """Touch the task and update its status
-
-        Parameters
-        ----------
-        task_id : int
-            The task id to be checked.
-        """
-        # Using self._outer to replace the self pointer
-        _ffi_api.TaskSchedulerTouchTask(self._outer(), task_id)  # type: ignore # pylint: disable=no-member
-
-
-def create(  # pylint: disable=keyword-arg-before-vararg
-    kind: Literal["round-robin", "gradient"] = "gradient",
-    *args,
-    **kwargs,
-) -> "TaskScheduler":
-    """Create a task scheduler."""
-    from . import GradientBased, RoundRobin  # pylint: disable=import-outside-toplevel
-
-    if kind == "round-robin":
-        return RoundRobin(*args, **kwargs)
-    if kind == "gradient":
-        return GradientBased(*args, **kwargs)
-    raise ValueError(f"Unknown TaskScheduler name: {kind}")
diff --git a/python/tvm/meta_schedule/testing/dataset_extract_tasks.py b/python/tvm/meta_schedule/testing/dataset_extract_tasks.py
index 1795996a3717..5d71d088a379 100644
--- a/python/tvm/meta_schedule/testing/dataset_extract_tasks.py
+++ b/python/tvm/meta_schedule/testing/dataset_extract_tasks.py
@@ -21,8 +21,8 @@
 import json
 import os
 
-from tqdm import tqdm  # type: ignore
 import tvm
+from tqdm import tqdm  # type: ignore
 from tvm import meta_schedule as ms
 from tvm.ir import save_json
 from tvm.meta_schedule.testing.relay_workload import _load_cache
@@ -60,7 +60,7 @@ def extract_and_save_tasks(cache_file):
     mod, params_bytearray, _ = _load_cache(args.model_cache_dir, cache_file)
     params = load_param_dict(params_bytearray)
     try:
-        extracted_tasks = ms.extract_task_from_relay(mod, target=args.target, params=params)
+        extracted_tasks = ms.relay_integration.extract_tasks(mod, target=args.target, params=params)
     except tvm.error.TVMError as error:
         print(str(error))
         return
diff --git a/python/tvm/meta_schedule/testing/dataset_sample_candidates.py b/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
index 35b872e7351e..39a12b494108 100644
--- a/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
+++ b/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
@@ -22,8 +22,8 @@
 import os
 from typing import List
 
-from tqdm import tqdm  # type: ignore
 import tvm
+from tqdm import tqdm  # type: ignore
 from tvm import meta_schedule as ms
 from tvm.ir import load_json
 from tvm.target import Target
@@ -117,25 +117,20 @@ def sample_candidates(task, task_name, model_name):
     evolve_with_cost_model = tvm.get_global_func(
         "meta_schedule.SearchStrategyEvolutionarySearchEvolveWithCostModel"
     )
-    strategy = ms.search_strategy.EvolutionarySearch(
-        num_trials_per_iter=args.num_trials_per_iter,
-        max_trials_per_task=args.max_trials_per_task,
-        init_measured_ratio=0.0,
-    )
+    strategy = ms.search_strategy.EvolutionarySearch(init_measured_ratio=0.0)
     target = Target(args.target)
     context = ms.TuneContext(
         mod=task,
         target=target,
-        space_generator=ms.space_generator.PostOrderApply(),
+        space_generator="post-order-apply",
         search_strategy=strategy,
-        sch_rules=ms.default_config.schedule_rules(None, target),
-        postprocs=ms.default_config.postproc(None, target),
-        mutator_probs=ms.default_config.mutator_probs(None, target),
         task_name=task_name,
     )
     context.initialize()
     context.pre_tuning(
-        context.generate_design_space(),
+        max_trials=args.max_trials_per_task,
+        num_trials_per_iter=args.num_trials_per_iter,
+        design_spaces=context.generate_design_space(),
         database=database,
         cost_model=ms.cost_model.RandomModel(),  # type: ignore
     )
diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py
index 9dcff2ace583..6d1cd7f1604c 100644
--- a/python/tvm/meta_schedule/testing/relay_workload.py
+++ b/python/tvm/meta_schedule/testing/relay_workload.py
@@ -24,9 +24,9 @@
 
 import tvm
 import tvm.relay.testing
+from tvm import meta_schedule as ms
 from tvm import relay
 from tvm.ir import IRModule
-from tvm.meta_schedule import ExtractedTask, extract_task_from_relay
 from tvm.runtime import NDArray, load_param_dict, save_param_dict
 from tvm.target import Target
 
@@ -34,15 +34,17 @@
 
 
 def _get_network(
-    args: Tuple[str, List[int], str]
+    args: Tuple[str, List[int], Optional[str]]
 ) -> Tuple[IRModule, bytearray, Tuple[str, List[int], str]]:
     name: str
     input_shape: List[int]
-    layout: str
+    layout: Optional[str]
     name, input_shape, layout = args
 
-    mod: IRModule
+    if layout == "None":
+        layout = None
 
+    mod: IRModule
     if name in [
         "resnet_18",
         "resnet_50",
@@ -60,24 +62,30 @@ def _get_network(
 
         assert layout is None or layout in ["NCHW", "NHWC"]
 
+        params: Dict[str, Any] = {}
         if name in ["resnet_18", "resnet_50"]:
-            model = getattr(models, name.replace("_", ""))(weights=None)
+            model = getattr(models, name.replace("_", ""))
         elif name == "wide_resnet_50":
-            model = getattr(models, "wide_resnet50_2")(weights=None)
+            model = getattr(models, "wide_resnet50_2")
         elif name == "resnext_50":
-            model = getattr(models, "resnext50_32x4d")(weights=None)
+            model = getattr(models, "resnext50_32x4d")
         elif name == "mobilenet_v2":
-            model = getattr(models, name)(weights=None)
+            model = getattr(models, name)
         elif name == "mobilenet_v3":
-            model = getattr(models, name + "_large")(weights=None)
+            model = getattr(models, name + "_large")
         elif name == "inception_v3":
-            model = getattr(models, name)(weights=None, aux_logits=False)
+            model = getattr(models, name)
+            params["aux_logits"] = False
         elif name == "densenet_121":
-            model = getattr(models, name.replace("_", ""))(weights=None)
+            model = getattr(models, name.replace("_", ""))
         elif name == "resnet3d_18":
-            model = models.video.r3d_18(weights=None)
+            model = models.video.r3d_18
         elif name == "vgg_16":
-            model = getattr(models, name.replace("_", ""))(weights=None)
+            model = getattr(models, name.replace("_", ""))
+        try:
+            model = model(**params, weights=None)
+        except TypeError:
+            model = model(**params, pretrained=False)
 
         dtype = "float32"
         input_data = torch.randn(input_shape).type(  # pylint: disable=no-member
@@ -90,7 +98,7 @@ def _get_network(
         shape_list = [(input_name, input_shape)]
         mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
         passes = [relay.transform.RemoveUnusedFunctions()]
-        if layout == "NHWC":
+        if layout is None or layout == "NHWC":
             # PyTorch is imported as NCHW by default
             passes.append(
                 relay.transform.ConvertLayout(
@@ -251,10 +259,7 @@ def extract_from_relay(
     input_shape: List[int],
     *,
     cache_dir: Optional[str] = None,
-    opt_level: int = 3,
-    pass_config: Optional[Dict[str, Any]] = None,
-    disabled_pass: Optional[List[str]] = None,
-) -> List[ExtractedTask]:
+) -> List[ms.ExtractedTask]:
     """Extract the tasks from a network.
 
     Parameters
@@ -272,12 +277,6 @@ def extract_from_relay(
     cache_dir : Optional[str]
         The directory to cache the generated network.
         If not specified, the cache will be disabled.
-    opt_level : int
-        The optimization level of the compiler.
-    pass_config : Optional[Dict[str, Any]]
-        The pass config of the compiler.
-    disabled_pass : Optional[List[str]]
-        The disabled pass of the compiler.
 
     Returns
     -------
@@ -287,13 +286,10 @@ def extract_from_relay(
     filename = f'tasks-{target.kind.name}-{name}-{",".join(str(i) for i in input_shape)}.json'
     extracted_tasks = _load_cache(cache_dir, filename)
     if extracted_tasks is None:
-        extracted_tasks = extract_task_from_relay(
+        extracted_tasks = ms.relay_integration.extract_tasks(
             mod=mod,
             target=target,
             params=params,
-            opt_level=opt_level,
-            pass_config=pass_config,
-            disabled_pass=disabled_pass,
         )
         extracted_tasks = list(extracted_tasks)
         _save_cache(cache_dir, filename, extracted_tasks)
diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py
deleted file mode 100644
index f14e90b6f0b2..000000000000
--- a/python/tvm/meta_schedule/testing/schedule_rule.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Default schedule rules"""
-from typing import List, Tuple, Union
-
-from tvm.meta_schedule import default_config
-from tvm.meta_schedule.schedule_rule import ScheduleRule
-
-
-def get_rules(kind: str, types: Union[type, Tuple[type, ...]]) -> List[ScheduleRule]:
-    """Get default schedule rules"""
-    # pylint: disable=protected-access
-    if kind == "llvm":
-        rules = default_config._DefaultLLVM.schedule_rules()
-    elif kind == "cuda":
-        rules = default_config._DefaultCUDA.schedule_rules()
-    elif kind == "tensor_core":
-        rules = default_config._DefaultCUDATensorCore.schedule_rules()
-    else:
-        raise NotImplementedError(f"{kind} is not supported")
-    # pylint: enable=protected-access
-    return [rule for rule in rules if isinstance(rule, types)]
diff --git a/python/tvm/meta_schedule/testing/space_generation.py b/python/tvm/meta_schedule/testing/space_generation.py
index f85faca13f7a..5ac20f8fdf2f 100644
--- a/python/tvm/meta_schedule/testing/space_generation.py
+++ b/python/tvm/meta_schedule/testing/space_generation.py
@@ -15,24 +15,51 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union
 
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
+
+from tvm import meta_schedule as ms
 from tvm.ir import IRModule, structural_equal
+from tvm.target import Target
 from tvm.tir import Schedule
 from tvm.tir.schedule import Trace
 from tvm.tir.schedule.testing import verify_trace_roundtrip
 
 
-def check_trace(spaces: List[Schedule], expected: List[List[str]]):
-    expected_traces = {"\n".join(t) for t in expected}
-    actual_traces = set()
-    for space in spaces:
-        trace = Trace(space.trace.insts, {})
-        trace = trace.simplified(remove_postproc=True)
-        str_trace = "\n".join(t[2:] for t in str(trace).strip().splitlines()[2:] if t != "  pass")
-        actual_traces.add(str_trace)
-        assert str_trace in expected_traces, "\n" + str_trace
-    assert len(expected_traces) == len(actual_traces)
+def get_rules(
+    kind: Literal["llvm", "cuda", "cuda-tensorcore", "hexagon"],
+    types: Union[type, Tuple[type, ...]],
+) -> List[ms.ScheduleRule]:
+    """Get default schedule rules"""
+    rules = ms.ScheduleRule.create(kind)
+    return [rule for rule in rules if isinstance(rule, types)]
+
+
+def generate_design_space(
+    kind: Literal["llvm", "cuda", "cuda-tensorcore", "hexagon"],
+    mod: IRModule,
+    target: Target,
+    types: Union[type, Tuple[type, ...]],
+    sch_rules: Optional[List[ms.ScheduleRule]] = None,
+) -> List[Schedule]:
+    if sch_rules is None:
+        sch_rules = get_rules(kind, types)
+    else:
+        assert types is None
+    return ms.TuneContext(
+        mod=mod,
+        target=target,
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=sch_rules,
+            postprocs=[],
+            mutator_probs={},
+        ),
+        task_name="test",
+    ).generate_design_space()
 
 
 def _find_match_sketch_id(
diff --git a/python/tvm/meta_schedule/testing/tlcbench.py b/python/tvm/meta_schedule/testing/tlcbench.py
index 108d83ba9de9..2e9f9f52b1fc 100644
--- a/python/tvm/meta_schedule/testing/tlcbench.py
+++ b/python/tvm/meta_schedule/testing/tlcbench.py
@@ -17,14 +17,14 @@
 # pylint: disable=invalid-name,import-outside-toplevel
 # type: ignore
 """Model loader for TLCBench."""
+import logging
 import multiprocessing
 import os
-import logging
+
 import tvm
 from tvm import relay
 from tvm.contrib.download import download_testdata
 
-
 log = logging.getLogger(__name__)
 
 
@@ -64,7 +64,6 @@ def deserialize_relay(json_path, params_path):
 
     with open(params_path, "rb") as fi:
         params = relay.load_param_dict(fi.read())
-
     return mod, params
 
 
diff --git a/python/tvm/meta_schedule/testing/torchbench/run.py b/python/tvm/meta_schedule/testing/torchbench/run.py
index f6984d1c9d10..fe939b2c9ba9 100644
--- a/python/tvm/meta_schedule/testing/torchbench/run.py
+++ b/python/tvm/meta_schedule/testing/torchbench/run.py
@@ -54,7 +54,7 @@
     --mode tune \
     --model resnet50 \
     --target "nvidia/geforce-rtx-3070" \
-    --work-dir ../workdir \
+    --work-dir /path/to/work/dir/ \
     --num-trials 20000 \
     --rpc-host <rpc tracker host for tuning> \
     --rpc-port <rpc tracker port for tuning> \
@@ -73,7 +73,7 @@
     --mode eval \
     --model resnet50 \
     --target "nvidia/geforce-rtx-3070" \
-    --work-dir ../workdir \
+    --work-dir /path/to/work/dir/ \
     --num-trials 0
 ```
 
@@ -84,13 +84,11 @@
     --mode all \
     --model resnet50 \
     --target "llvm -num-cores 6" \
-    --work-dir ../workdir \
+    --work-dir /path/to/work/dir/ \
     --num-trials 0
 ```
 """
-
 # pylint: disable=logging-format-interpolation
-
 import argparse
 import functools
 import logging
@@ -100,10 +98,9 @@
 
 import numpy as np  # type: ignore
 import torch  # type: ignore
-from scipy.stats import ttest_ind  # type: ignore
-
 import tvm
 import tvm.relay
+from scipy.stats import ttest_ind  # type: ignore
 from tvm import meta_schedule as ms
 from tvm.contrib.graph_executor import GraphModule
 from tvm.meta_schedule.testing.torchbench.utils import (
@@ -147,10 +144,10 @@ def should_eval(self):
 
 class ResultComparisonMetric(Enum):
     """
-    This changes how it compares the resultl with the expected value during
+    This changes how it compares the results with the expected value during
     accuracy check.
     - cosine: Use the cosine similarity. It should be greater than 0.99.
-    - allclose-1e-4: Use the max element-wise absolute difference. It should be less than 1e-4.
+    - allclose-1e-4: Use the max elementwise absolute difference. It should be less than 1e-4.
     """
 
     COSINE = "cosine"
@@ -220,15 +217,6 @@ def parse_args():
         The working directory to save intermediate results and store databases for compilation.
         """,
     )
-    args.add_argument(
-        "--cache-dir",
-        type=str,
-        default=None,
-        help="""
-        The directory to cache the generated network.
-        If not specified, the cache will be disabled.
-        """,
-    )
     args.add_argument(
         "--num-trials",
         type=int,
@@ -279,7 +267,7 @@ def parse_args():
     args.add_argument(
         "--adaptive-training",
         action="store_true",
-        help="Whether to use adpative training for cost model.",
+        help="Whether to use adaptive training for cost model.",
     )
     args.add_argument(
         "--cpu-flush",
@@ -309,7 +297,8 @@ def parse_args():
 
 
 logging.basicConfig(
-    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
 )
 logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
 ARGS = parse_args()
@@ -320,11 +309,12 @@ def parse_args():
 
 
 runner = load_torchdynamo_benchmark_runner(  # pylint: disable=invalid-name
-    IS_CUDA, cosine_similarity=ARGS.result_metric == ResultComparisonMetric.COSINE
+    IS_CUDA,
+    cosine_similarity=ARGS.result_metric == ResultComparisonMetric.COSINE,
 )
 
 
-def get_metaschedule_runner() -> ms.runner.PyRunner:
+def get_meta_schedule_runner() -> ms.runner.PyRunner:
     """
     Get the Runner for MetaSchedule.
 
@@ -349,33 +339,10 @@ def get_metaschedule_runner() -> ms.runner.PyRunner:
             alloc_repeat=1,
         )
     else:
-        warnings.warn("Falling back to Metaschedule LocalRunner because --rpc-host isn't provided.")
+        warnings.warn("Falling back to MetaSchedule LocalRunner because --rpc-host isn't provided.")
         return ms.runner.LocalRunner()
 
 
-def get_tune_config() -> ms.TuneConfig:
-    """
-    Get the TuneConfig.
-    """
-    if ARGS.mode.should_tune:
-        max_trials_per_task = ARGS.max_trials_per_task
-        max_trials_global = ARGS.num_trials
-    else:
-        max_trials_per_task = 0
-        max_trials_global = 0
-
-    if max_trials_per_task is None:
-        max_trials_per_task = max_trials_global
-
-    return ms.TuneConfig(
-        strategy="evolutionary",
-        num_trials_per_iter=64,
-        max_trials_per_task=max_trials_per_task,
-        max_trials_global=max_trials_global,
-        adaptive_training=ARGS.adaptive_training,
-    )
-
-
 def get_graph_executor_forward(mod: GraphModule, device: tvm.runtime.Device) -> Callable:
     """
     Get the forward function for graph executor, in order to integrate with TorchDynamo.
@@ -419,7 +386,7 @@ def forward(*args):
 
 def create_tvm_task_collection_backend(tasks: List[ms.ExtractedTask]) -> Callable:
     """
-    This torchdynamo backend only collects the extracted tasks from Metaschedule.
+    This torchdynamo backend only collects the extracted tasks from MetaSchedule.
     It doesn't tune the model.
     """
 
@@ -428,7 +395,11 @@ def backend(graph_module, example_inputs):
         shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
         ir_mod, params = tvm.relay.frontend.from_pytorch(jit_mod, shape_list)
 
-        extracted_tasks = ms.extract_task_from_relay(ir_mod, ARGS.target, params)
+        extracted_tasks = ms.relay_integration.extract_tasks(
+            mod=ir_mod,
+            target=ARGS.target,
+            params=params,
+        )
         logger.info("Extracted %d tasks", len(extracted_tasks))
         tasks.extend(extracted_tasks)
 
@@ -440,31 +411,21 @@ def backend(graph_module, example_inputs):
 def create_tvm_compilation_backend(database: ms.database.Database) -> Callable:
     """
     This torchdynamo backend compiles the model using history best record from the
-    Metaschedule database.
+    MetaSchedule database.
     """
 
     def backend(graph_module, example_inputs):
-        # pylint: disable=import-outside-toplevel
-        from tvm.ir.transform import PassContext
-
-        # pylint: enable=import-outside-toplevel
-
         jit_mod = torch.jit.trace(graph_module, example_inputs)
         shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
         ir_mod, params = tvm.relay.frontend.from_pytorch(jit_mod, shape_list)
 
-        relay_build = {"graph": tvm.relay.build, "vm": tvm.relay.vm.compile}[ARGS.backend]
-        with ARGS.target, ms.utils.autotvm_silencer(), database:
-            with PassContext(
-                opt_level=3,
-                config={
-                    "relay.backend.use_meta_schedule": True,
-                    "relay.backend.use_meta_schedule_dispatch": not IS_CUDA,
-                    "relay.backend.tir_converter": "default",
-                },
-            ):
-                lib = relay_build(ir_mod, target=ARGS.target, params=params)
-
+        lib = ms.relay_integration.compile_relay(
+            database=database,
+            mod=ir_mod,
+            target=ARGS.target,
+            params=params,
+            backend=ARGS.backend,
+        )
         device = tvm.cuda(0) if IS_CUDA else tvm.cpu(0)
 
         if ARGS.backend == "graph":
@@ -503,7 +464,9 @@ def is_output_correct(output: torch.Tensor, expected: torch.Tensor) -> bool:
 
 
 def performance_experiment(
-    model_iter_fn: Callable, model: torch.nn.Module, example_inputs: Tuple[torch.Tensor]
+    model_iter_fn: Callable,
+    model: torch.nn.Module,
+    example_inputs: Tuple[torch.Tensor],
 ) -> str:
     """
     Performs the actual benchmarking
@@ -560,11 +523,11 @@ def main():
     """
     describe()
 
+    database = ms.database.JSONDatabase(work_dir=ARGS.work_dir)
     if not ARGS.mode.should_tune:
-        ms_database = ms.default_config.database(None, ARGS.work_dir)
-        if len(ms_database) == 0:
+        if len(database) == 0:
             raise RuntimeError(
-                "Script is runnig in eval mode while the tuning database is empty. "
+                "Script is running in eval mode while the tuning database is empty. "
                 "Please tune the model first."
             )
 
@@ -573,6 +536,7 @@ def main():
             "Benchmark is running on CUDA, while --cpu-flush is turned on. "
             "This flag will have no effect on CUDA."
         )
+        ARGS.cpu_flush = False
 
     try:
         _, name, model, example_inputs, batch_size = runner.load_model(
@@ -587,16 +551,27 @@ def main():
         logging.exception(f"{ARGS.model} failed to load")
         return
 
-    tuning_tasks: List[ms.ExtractedTask] = []
-    task_collect_ctx = torchdynamo.optimize(create_tvm_task_collection_backend(tuning_tasks))
-    task_collect_ctx(runner.model_iter_fn)(model, example_inputs)
-
-    database = ms.tune_extracted_tasks(
-        extracted_tasks=tuning_tasks,
-        config=get_tune_config(),
-        work_dir=ARGS.work_dir,
-        runner=get_metaschedule_runner(),  # type: ignore
-    )
+    if ARGS.mode.should_tune:
+        extracted_tasks: List[ms.ExtractedTask] = []
+        task_collect_ctx = torchdynamo.optimize(create_tvm_task_collection_backend(extracted_tasks))
+        task_collect_ctx(runner.model_iter_fn)(model, example_inputs)
+        tasks, task_weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
+            extracted_tasks=extracted_tasks,
+            work_dir=ARGS.work_dir,
+        )
+        database = ms.tune.tune_tasks(
+            tasks=tasks,
+            task_weights=task_weights,
+            work_dir=ARGS.work_dir,
+            max_trials_global=ARGS.num_trials,
+            max_trials_per_task=ARGS.num_trials_per_task,
+            runner=get_meta_schedule_runner(),  # type: ignore
+            database=database,
+            cost_model=ms.cost_model.XGBModel(  # type: ignore
+                extractor=ms.feature_extractor.PerStoreFeature(),
+                adaptive_training=ARGS.adaptive_training,
+            ),
+        )
 
     if ARGS.mode.should_eval:
         torchdynamo.reset()
diff --git a/python/tvm/meta_schedule/testing/tune_onnx.py b/python/tvm/meta_schedule/testing/tune_onnx.py
index 6d473ed3237c..a7c177afdca4 100644
--- a/python/tvm/meta_schedule/testing/tune_onnx.py
+++ b/python/tvm/meta_schedule/testing/tune_onnx.py
@@ -15,18 +15,19 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-docstring
-from distutils.util import strtobool
 import argparse
 import json
 import logging
-import onnx  # type: ignore
+from distutils.util import strtobool
 
+import onnx  # type: ignore
 import tvm
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.relay.frontend import from_onnx
 from tvm.support import describe
-from .tune_utils import generate_input_data, create_timer
+
+from .tune_utils import create_timer, generate_input_data
 
 
 def _parse_args():
@@ -126,7 +127,7 @@ def _parse_args():
 logging.basicConfig(
     format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
 )
-logging.getLogger("tvm.meta_schedule").setLevel(logging.INFO)
+logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
 ARGS = _parse_args()
 
 
@@ -146,33 +147,38 @@ def main():
         item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape
     }
 
-    runner = ms.runner.RPCRunner(
-        rpc_config=ARGS.rpc_config,
-        evaluator_config=ms.runner.EvaluatorConfig(
-            number=ARGS.number,
-            repeat=ARGS.repeat,
-            min_repeat_ms=ARGS.min_repeat_ms,
-            enable_cpu_cache_flush=ARGS.cpu_flush,
-        ),
-        alloc_repeat=1,
-    )
-
     with ms.Profiler() as profiler:
-        lib = ms.tune_relay(
+        database = ms.relay_integration.tune_relay(
             mod=mod,
             target=ARGS.target,
-            config=ms.TuneConfig(
-                strategy="evolutionary",
-                num_trials_per_iter=64,
-                max_trials_per_task=ARGS.num_trials,
-                max_trials_global=ARGS.num_trials,
+            params=params,
+            work_dir=ARGS.work_dir,
+            max_trials_global=ARGS.num_trials,
+            num_trials_per_iter=64,
+            runner=ms.runner.RPCRunner(  # type: ignore
+                rpc_config=ARGS.rpc_config,
+                evaluator_config=ms.runner.EvaluatorConfig(
+                    number=ARGS.number,
+                    repeat=ARGS.repeat,
+                    min_repeat_ms=ARGS.min_repeat_ms,
+                    enable_cpu_cache_flush=ARGS.cpu_flush,
+                ),
+                alloc_repeat=1,
+            ),
+            cost_model=ms.cost_model.XGBModel(  # type: ignore
+                extractor=ms.feature_extractor.PerStoreFeature(),
                 adaptive_training=ARGS.adaptive_training,
             ),
-            runner=runner,  # type: ignore
-            work_dir=ARGS.work_dir,
+            strategy=ms.search_strategy.EvolutionarySearch(),
+        )
+        lib = ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            target=ARGS.target,
             params=params,
             backend=ARGS.backend,
         )
+
     print("Tuning Time:")
     print(profiler.table())
 
diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py
index 7c5977495db5..de1668c1dd16 100644
--- a/python/tvm/meta_schedule/testing/tune_relay.py
+++ b/python/tvm/meta_schedule/testing/tune_relay.py
@@ -131,7 +131,7 @@ def _parse_args():
 logging.basicConfig(
     format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
 )
-logging.getLogger("tvm.meta_schedule").setLevel(logging.INFO)
+logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
 ARGS = _parse_args()
 
 
@@ -164,30 +164,34 @@ def main():
         print(f"  input_shape: {item['shape']}")
         print(f"  input_dtype: {item['dtype']}")
 
-    runner = ms.runner.RPCRunner(
-        rpc_config=ARGS.rpc_config,
-        evaluator_config=ms.runner.EvaluatorConfig(
-            number=ARGS.number,
-            repeat=ARGS.repeat,
-            min_repeat_ms=ARGS.min_repeat_ms,
-            enable_cpu_cache_flush=ARGS.cpu_flush,
-        ),
-        alloc_repeat=1,
-    )
-
     with ms.Profiler() as profiler:
-        lib = ms.tune_relay(
+        database = ms.relay_integration.tune_relay(
             mod=mod,
             target=ARGS.target,
-            config=ms.TuneConfig(
-                strategy="evolutionary",
-                num_trials_per_iter=64,
-                max_trials_per_task=ARGS.num_trials,
-                max_trials_global=ARGS.num_trials,
+            work_dir=ARGS.work_dir,
+            max_trials_global=ARGS.num_trials,
+            num_trials_per_iter=64,
+            params=params,
+            runner=ms.runner.RPCRunner(  # type: ignore
+                rpc_config=ARGS.rpc_config,
+                evaluator_config=ms.runner.EvaluatorConfig(
+                    number=ARGS.number,
+                    repeat=ARGS.repeat,
+                    min_repeat_ms=ARGS.min_repeat_ms,
+                    enable_cpu_cache_flush=ARGS.cpu_flush,
+                ),
+                alloc_repeat=1,
+            ),
+            cost_model=ms.cost_model.XGBModel(  # type: ignore
+                extractor=ms.feature_extractor.PerStoreFeature(),
                 adaptive_training=ARGS.adaptive_training,
             ),
-            runner=runner,  # type: ignore
-            work_dir=ARGS.work_dir,
+            strategy=ms.search_strategy.EvolutionarySearch(),
+        )
+        lib = ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            target=ARGS.target,
             params=params,
             backend=ARGS.backend,
         )
diff --git a/python/tvm/meta_schedule/testing/tune_te.py b/python/tvm/meta_schedule/testing/tune_te.py
index d54d92048ee6..16f9be674f39 100644
--- a/python/tvm/meta_schedule/testing/tune_te.py
+++ b/python/tvm/meta_schedule/testing/tune_te.py
@@ -15,14 +15,14 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-docstring
-from distutils.util import strtobool
 import argparse
 import logging
+from distutils.util import strtobool
 from typing import Optional
 
 import tvm
-from tvm import tir
 from tvm import meta_schedule as ms
+from tvm import tir
 from tvm.meta_schedule.testing.te_workload import create_te_workload
 from tvm.support import describe
 
@@ -106,37 +106,36 @@ def _parse_args():
 logging.basicConfig(
     format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
 )
-logging.getLogger("tvm.meta_schedule").setLevel(logging.INFO)
+logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
 ARGS = _parse_args()
 
 
 def main():
     describe()
     print(f"Workload: {ARGS.workload}")
-    runner = ms.runner.RPCRunner(
-        rpc_config=ARGS.rpc_config,
-        evaluator_config=ms.runner.EvaluatorConfig(
-            number=ARGS.number,
-            repeat=ARGS.repeat,
-            min_repeat_ms=ARGS.min_repeat_ms,
-            enable_cpu_cache_flush=ARGS.cpu_flush,
-        ),
-        alloc_repeat=1,
-    )
     with ms.Profiler() as profiler:
-        sch: Optional[tir.Schedule] = ms.tune_tir(
+        sch: Optional[tir.Schedule] = ms.tir_integration.tune_tir(
             mod=create_te_workload(ARGS.workload, 0),
             target=ARGS.target,
-            config=ms.TuneConfig(
-                strategy="evolutionary",
-                num_trials_per_iter=64,
-                max_trials_per_task=ARGS.num_trials,
-                max_trials_global=ARGS.num_trials,
+            work_dir=ARGS.work_dir,
+            max_trials_global=ARGS.num_trials,
+            num_trials_per_iter=64,
+            runner=ms.runner.RPCRunner(  # type: ignore
+                rpc_config=ARGS.rpc_config,
+                evaluator_config=ms.runner.EvaluatorConfig(
+                    number=ARGS.number,
+                    repeat=ARGS.repeat,
+                    min_repeat_ms=ARGS.min_repeat_ms,
+                    enable_cpu_cache_flush=ARGS.cpu_flush,
+                ),
+                alloc_repeat=1,
+            ),
+            cost_model=ms.cost_model.XGBModel(  # type: ignore
+                extractor=ms.feature_extractor.PerStoreFeature(),
                 adaptive_training=ARGS.adaptive_training,
             ),
-            runner=runner,  # type: ignore
+            strategy=ms.search_strategy.EvolutionarySearch(),
             task_name=ARGS.workload,
-            work_dir=ARGS.work_dir,
         )
 
     print("Tuning Time:")
diff --git a/python/tvm/meta_schedule/tir_integration.py b/python/tvm/meta_schedule/tir_integration.py
new file mode 100644
index 000000000000..975987ebcb67
--- /dev/null
+++ b/python/tvm/meta_schedule/tir_integration.py
@@ -0,0 +1,155 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""MetaSchedule-TIR integration"""
+from typing import Optional, Union
+
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
+from tvm import ir, tir
+from tvm.target import Target
+
+from .builder import Builder
+from .cost_model import CostModel
+from .database import Database
+from .logging import get_loggers_from_work_dir
+from .measure_callback import MeasureCallback
+from .runner import Runner
+from .search_strategy import SearchStrategy
+from .space_generator import SpaceGenerator
+from .task_scheduler import TaskScheduler
+from .tune import tune_tasks
+from .tune_context import TuneContext, _normalize_mod
+from .utils import fork_seed
+
+
+def tune_tir(
+    mod: Union[ir.IRModule, tir.PrimFunc],
+    target: Union[str, Target],
+    work_dir: str,
+    max_trials_global: int,
+    *,
+    num_trials_per_iter: int = 64,
+    builder: Builder.BuilderType = "local",
+    runner: Runner.RunnerType = "local",
+    database: Database.DatabaseType = "json",
+    cost_model: CostModel.CostModelType = "xgb",
+    measure_callbacks: MeasureCallback.CallbackListType = "default",
+    task_scheduler: TaskScheduler.TaskSchedulerType = "round-robin",
+    space: SpaceGenerator.SpaceGeneratorType = "post-order-apply",
+    strategy: SearchStrategy.SearchStrategyType = "evolutionary",
+    task_name: str = "main",
+    num_threads: Union[Literal["physical", "logical"], int] = "physical",
+    seed: Optional[int] = None,
+) -> Database:
+    """Tune a TIR function.
+
+    Parameters
+    ----------
+    mod : Union[ir.IRModule, tir.PrimFunc]
+        The TIR function to tune.
+    target : Union[str, Target]
+        The target to tune for.
+    work_dir : str
+        The working directory.
+    max_trials_global : int
+        The maximum number of trials to run globally.
+    num_trials_per_iter : int
+        The number of trials to run per iteration
+    builder : Builder.BuilderType
+        The builder.
+    runner : Runner.RunnerType
+        The runner.
+    database : Database.DatabaseType
+        The database.
+    cost_model : CostModel.CostModelType
+        The cost model.
+    measure_callbacks : MeasureCallback.CallbackListType
+        The measure callbacks.
+    task_scheduler : TaskScheduler.TaskSchedulerType
+        The task scheduler.
+    space : SpaceGenerator.SpaceGeneratorType
+        The space generator.
+    strategy : SearchStrategy.SearchStrategyType
+        The search strategy.
+    task_name : str
+        The name of the task.
+    num_threads : Union[Literal["physical", "logical"], int]
+        The number of threads to use.
+    seed : Optional[int]
+        The seed for the random number generator.
+
+    Returns
+    -------
+    database : Database
+        The database with all tuning records
+    """
+    (logger,) = get_loggers_from_work_dir(work_dir, [task_name])
+    (seed,) = fork_seed(seed, n=1)
+    return tune_tasks(
+        tasks=[
+            TuneContext(
+                mod=mod,
+                target=target,
+                space_generator=space,
+                search_strategy=strategy,
+                task_name=task_name,
+                logger=logger,
+                rand_state=seed,
+                num_threads=num_threads,
+            ).clone()
+        ],
+        task_weights=[1.0],
+        work_dir=work_dir,
+        max_trials_global=max_trials_global,
+        max_trials_per_task=max_trials_global,
+        num_trials_per_iter=num_trials_per_iter,
+        builder=builder,
+        runner=runner,
+        database=database,
+        cost_model=cost_model,
+        measure_callbacks=measure_callbacks,
+        task_scheduler=task_scheduler,
+    )
+
+
+def compile_tir(
+    database: Database,
+    mod: Union[ir.IRModule, tir.PrimFunc],
+    target: Union[Target, str],
+) -> tir.Schedule:
+    """Compile a TIR to tir.Schedule, according to the records in the database.
+
+    Parameters
+    ----------
+    database : Database
+        The database of tuning records.
+    mod : Union[ir.IRModule, tir.PrimFunc]
+        The TIR function to tune.
+    target : Union[str, Target]
+        The target to tune for.
+
+    Returns
+    -------
+    sch : tir.Schedule
+        The best schedule found in the database.
+    """
+    mod = _normalize_mod(mod)
+    if not isinstance(target, Target):
+        target = Target(target)
+    return database.query_schedule(mod, target, workload_name="main")
diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py
index 96b554d4e659..f7a2d4dc376f 100644
--- a/python/tvm/meta_schedule/tune.py
+++ b/python/tvm/meta_schedule/tune.py
@@ -14,637 +14,99 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""User-facing Tuning API"""
-# pylint: disable=import-outside-toplevel
-import logging
-import logging.config
-import os
-from os import path as osp
-from typing import Any, Callable, Dict, List, NamedTuple, Optional, Union
+"""The core tuning API"""
+from typing import List, Optional
 
-from tvm.ir import IRModule
-from tvm.ir.transform import PassContext
-from tvm.runtime import Module, NDArray, vm
-from tvm.target import Target
-from tvm.te import Tensor, create_prim_func
-from tvm.tir import PrimFunc, Schedule
-
-from . import default_config
 from .builder import Builder
 from .cost_model import CostModel
-from .database import Database, TuningRecord
-from .extracted_task import ExtractedTask
+from .database import Database
 from .measure_callback import MeasureCallback
-from .mutator import Mutator
-from .postproc import Postproc
-from .profiler import Profiler
 from .runner import Runner
-from .schedule_rule import ScheduleRule
-from .search_strategy import EvolutionarySearch, ReplayFunc, ReplayTrace
-from .space_generator import PostOrderApply, SpaceGenerator
-from .task_scheduler import GradientBased, RoundRobin
+from .task_scheduler import TaskScheduler
 from .tune_context import TuneContext
-from .utils import autotvm_silencer, batch_parameterize_config
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-
-FnSpaceGenerator = Callable[[], SpaceGenerator]
-FnScheduleRule = Callable[[], List[ScheduleRule]]
-FnPostproc = Callable[[], List[Postproc]]
-FnMutatorProb = Callable[[], Dict[Mutator, float]]
-
-
-class TuneConfig(NamedTuple):
-    """Configuration for tuning
-
-    Parameters
-    ----------
-    max_trials_global: int
-        Maximum number of trials to run.
-    num_trials_per_iter: int
-        Number of trials to run per iteration.
-    max_trials_per_task: Optional[int]
-        Maximum number of trials to run per task. If None, use `max_trials_global`.
-    task_scheduler: str = "gradient"
-        Task scheduler to use.
-        Valid options are: round_robin, gradient.
-    strategy: str = "evolutionary"
-        Search strategy to use.
-        Valid options are: evolutionary, replay_func, replay_trace.
-    task_scheduler_config: Optional[Dict[str, Any]] = None
-        Configuration for task scheduler.
-    search_strategy_config: Optional[Dict[str, Any]] = None
-        Configuration for search strategy.
-    logger_config: Optional[Dict[str, Any]] = None
-        Configuration for logger.
-    adaptive_training: Optional[bool] = None
-        Whether adpative training is enabled for cost model.
-    """
-
-    max_trials_global: int
-    num_trials_per_iter: int
-    max_trials_per_task: Optional[int] = None
-    task_scheduler: str = "gradient"
-    strategy: str = "evolutionary"
-    task_scheduler_config: Optional[Dict[str, Any]] = None
-    search_strategy_config: Optional[Dict[str, Any]] = None
-    logger_config: Optional[Dict[str, Any]] = None
-    adaptive_training: Optional[bool] = None
-
-    def create_strategy(self):
-        """Create search strategy from configuration"""
-        cls_tbl = {
-            "evolutionary": EvolutionarySearch,
-            "replay_func": ReplayFunc,
-            "replay_trace": ReplayTrace,
-        }
-        if self.strategy not in cls_tbl:
-            raise ValueError(
-                f"Invalid search strategy: {self.strategy}. "
-                "Valid options are: {}".format(", ".join(cls_tbl.keys()))
-            )
-        # `max_trials_per_task` defaults to `max_trials_global`
-        max_trials_per_task = self.max_trials_per_task
-        if max_trials_per_task is None:
-            max_trials_per_task = self.max_trials_global
-        # `search_strategy_config` defaults to empty dict
-        config = self.search_strategy_config
-        if config is None:
-            config = {}
-        return cls_tbl[self.strategy](
-            num_trials_per_iter=self.num_trials_per_iter,
-            max_trials_per_task=max_trials_per_task,
-            **config,
-        )
-
-    def create_task_scheduler(self, **kwargs):
-        """Create task scheduler from configuration"""
-        cls_tbl = {
-            "round_robin": RoundRobin,
-            "gradient": GradientBased,
-        }
-        if self.task_scheduler not in cls_tbl:
-            raise ValueError(
-                f"Invalid task scheduler: {self.task_scheduler}. "
-                "Valid options are: {}".format(", ".join(cls_tbl.keys()))
-            )
-        # `task_scheduler_config` defaults to empty dict
-        config = self.task_scheduler_config
-        if config is None:
-            config = {}
-        return cls_tbl[self.task_scheduler](
-            max_trials=self.max_trials_global,
-            **kwargs,
-            **config,
-        )
-
-    def create_loggers(
-        self,
-        log_dir: str,
-        params: List[Dict[str, Any]],
-        disable_existing_loggers: bool = False,
-    ):
-        """Create loggers from configuration"""
-        if self.logger_config is None:
-            config = {}
-        else:
-            config = self.logger_config
-
-        config.setdefault("loggers", {})
-        config.setdefault("handlers", {})
-        config.setdefault("formatters", {})
-
-        global_logger_name = "tvm.meta_schedule"
-        global_logger = logging.getLogger(global_logger_name)
-        if global_logger.level is logging.NOTSET:
-            global_logger.setLevel(logging.INFO)
-
-        config["loggers"].setdefault(
-            global_logger_name,
-            {
-                "level": logging._levelToName[  # pylint: disable=protected-access
-                    global_logger.level
-                ],
-                "handlers": [handler.get_name() for handler in global_logger.handlers]
-                + [global_logger_name + ".console", global_logger_name + ".file"],
-                "propagate": False,
-            },
-        )
-        config["loggers"].setdefault(
-            "{logger_name}",
-            {
-                "level": "INFO",
-                "handlers": [
-                    "{logger_name}.file",
-                ],
-                "propagate": False,
-            },
-        )
-        config["handlers"].setdefault(
-            global_logger_name + ".console",
-            {
-                "class": "logging.StreamHandler",
-                "stream": "ext://sys.stdout",
-                "formatter": "tvm.meta_schedule.standard_formatter",
-            },
-        )
-        config["handlers"].setdefault(
-            global_logger_name + ".file",
-            {
-                "class": "logging.FileHandler",
-                "filename": "{log_dir}/" + __name__ + ".task_scheduler.log",
-                "mode": "a",
-                "level": "INFO",
-                "formatter": "tvm.meta_schedule.standard_formatter",
-            },
-        )
-        config["handlers"].setdefault(
-            "{logger_name}.file",
-            {
-                "class": "logging.FileHandler",
-                "filename": "{log_dir}/{logger_name}.log",
-                "mode": "a",
-                "level": "INFO",
-                "formatter": "tvm.meta_schedule.standard_formatter",
-            },
-        )
-        config["formatters"].setdefault(
-            "tvm.meta_schedule.standard_formatter",
-            {
-                "format": "%(asctime)s.%(msecs)03d %(levelname)s %(message)s",
-                "datefmt": "%Y-%m-%d %H:%M:%S",
-            },
-        )
-
-        # set up dictConfig loggers
-        p_config = {"version": 1, "disable_existing_loggers": disable_existing_loggers}
-        for k, v in config.items():
-            if k in ["formatters", "handlers", "loggers"]:
-                p_config[k] = batch_parameterize_config(v, params)  # type: ignore
-            else:
-                p_config[k] = v
-        logging.config.dictConfig(p_config)
-
-        # check global logger
-        if global_logger.level not in [logging.DEBUG, logging.INFO]:
-            global_logger.warning(
-                "Logging level set to %s, please set to logging.INFO"
-                " or logging.DEBUG to view full log.",
-                logging._levelToName[global_logger.level],  # pylint: disable=protected-access
-            )
-        global_logger.info("Logging directory: %s", log_dir)
 
 
-def tune_extracted_tasks(
-    extracted_tasks: List[ExtractedTask],
-    config: TuneConfig,
-    work_dir: str,
+def tune_tasks(
     *,
-    builder: Optional[Builder] = None,
-    runner: Optional[Runner] = None,
-    database: Optional[Database] = None,
-    cost_model: Optional[CostModel] = None,
-    measure_callbacks: Optional[List[MeasureCallback]] = None,
-    space: Optional[FnSpaceGenerator] = None,
-    sch_rules: Optional[FnScheduleRule] = None,
-    postprocs: Optional[FnPostproc] = None,
-    mutator_probs: Optional[FnMutatorProb] = None,
-    num_threads: Optional[int] = None,
+    tasks: List[TuneContext],
+    task_weights: List[float],
+    work_dir: str,
+    max_trials_global: int,
+    max_trials_per_task: Optional[int] = None,
+    num_trials_per_iter: int = 64,
+    builder: Builder.BuilderType = "local",
+    runner: Runner.RunnerType = "local",
+    database: Database.DatabaseType = "json",
+    cost_model: CostModel.CostModelType = "xgb",
+    measure_callbacks: MeasureCallback.CallbackListType = "default",
+    task_scheduler: TaskScheduler.TaskSchedulerType = "gradient",
 ) -> Database:
-    """Tune extracted tasks with a given target.
+    """Tune a list of tasks. Using a task scheduler.
 
     Parameters
     ----------
-    extracted_tasks : List[ExtractedTask]
-        The list of extracted tasks.
-    config : TuneConfig
-        The search strategy config.
+    tasks : List[TuneContext]
+        The list of tasks to tune.
+    task_weights : List[float]
+        The weight of each task.
     work_dir : str
-        The working directory to save intermediate results.
-    builder : Optional[Builder]
-        The builder to use.
-    runner : Optional[Runner]
-        The runner to use.
-    database : Optional[Database]
-        The database to use.
-    cost_model : Optional[CostModel]
-        The cost model to use.
-    measure_callbacks : Optional[List[MeasureCallback]]
-        The callbacks used during tuning.
-    task_scheduler : Optional[TaskScheduler]
-        The task scheduler to use.
-    space : Optional[FnSpaceGenerator]
-        The space generator to use.
-    sch_rules : Optional[FnScheduleRule]
-        The search rules to use.
-    postprocs : Optional[FnPostproc]
-        The postprocessors to use.
-    mutator_probs : Optional[FnMutatorProb]
-        The probability distribution to use different mutators.
-    num_threads : Optional[int]
-        The number of threads to use.
+        The working directory.
+    max_trials_global : int
+        The maximum number of trials to run globally.
+    max_trials_per_task : Optional[int]
+        The maximum number of trials to run per task.
+    num_trials_per_iter : int
+        The number of trials to run per iteration
+    builder : Builder.BuilderType
+        The builder.
+    runner : Runner.RunnerType
+        The runner.
+    database : Database.DatabaseType
+        The database.
+    cost_model : CostModel.CostModelType
+        The cost model.
+    measure_callbacks : MeasureCallback.CallbackListType
+        The measure callbacks.
+    task_scheduler : TaskScheduler.TaskSchedulerType
+        The task scheduler.
 
     Returns
     -------
     database : Database
-        The database containing all the tuning results.
-
+        The database with all tuning records
     """
-    # pylint: disable=protected-access
-    # logging directory is set to `work_dir/logs` by default
-    log_dir = osp.join(work_dir, "logs")
-    os.makedirs(log_dir, exist_ok=True)
-    max_width = len(str(len(extracted_tasks) - 1))
-    logger_name_pattern = __name__ + ".task_{task_id:0" + f"{max_width}" + "d}_{task_name}"
-
-    config.create_loggers(
-        log_dir=log_dir,
-        params=[
-            {
-                "log_dir": log_dir,
-                "logger_name": logger_name_pattern.format(task_id=i, task_name=task.task_name),
-            }
-            for i, task in enumerate(extracted_tasks)
-        ],
-    )
-
-    logger.info("Working directory: %s", work_dir)
-    database = default_config.database(database, work_dir)
-    builder = default_config.builder(builder)
-    runner = default_config.runner(runner)
-    cost_model = default_config.cost_model(cost_model, config.adaptive_training)
-    measure_callbacks = default_config.callbacks(measure_callbacks)
-    # parse the tuning contexts
-    tune_contexts = []
-    for i, task in enumerate(extracted_tasks):
-        assert len(task.dispatched) == 1, "Only size 1 dispatched task list is supported for now"
-        tune_contexts.append(
-            TuneContext(
-                mod=default_config.mod(task.dispatched[0]),
-                target=task.target,
-                space_generator=default_config.space_generator(space),
-                search_strategy=config.create_strategy(),
-                sch_rules=default_config.schedule_rules(sch_rules, task.target),
-                postprocs=default_config.postproc(postprocs, task.target),
-                mutator_probs=default_config.mutator_probs(mutator_probs, task.target),
-                task_name=task.task_name,
-                logger=logging.getLogger(
-                    logger_name_pattern.format(task_id=i, task_name=task.task_name)
-                ),
-                num_threads=num_threads,
-            )
+    if len(tasks) != len(task_weights):
+        raise ValueError(
+            f"Length of tasks ({len(tasks)}) and task_weights ({len(task_weights)}) do not match."
         )
-    # parse the task scheduler
-    # pylint: enable=protected-access
-    task_scheduler = config.create_task_scheduler(
-        tasks=tune_contexts,
-        task_weights=[float(t.weight) for t in extracted_tasks],
+    if max_trials_per_task is None:
+        max_trials_per_task = max_trials_global
+    if not isinstance(builder, Builder):
+        builder = Builder.create(builder)
+    if not isinstance(runner, Runner):
+        runner = Runner.create(runner)
+    if database == "json":
+        database = Database.create(database, work_dir=work_dir)
+    elif not isinstance(database, Database):
+        database = Database.create(database)
+    if not isinstance(cost_model, CostModel):
+        cost_model = CostModel.create(cost_model)
+    if isinstance(measure_callbacks, MeasureCallback):
+        measure_callbacks = [measure_callbacks]
+    elif measure_callbacks == "default":
+        measure_callbacks = MeasureCallback.create(measure_callbacks)
+    if not isinstance(task_scheduler, TaskScheduler):
+        task_scheduler = TaskScheduler.create(task_scheduler)
+    task_scheduler.tune(
+        tasks=tasks,
+        task_weights=task_weights,
+        max_trials_global=max_trials_global,
+        max_trials_per_task=max_trials_per_task,
+        num_trials_per_iter=num_trials_per_iter,
         builder=builder,
         runner=runner,
-        database=database,
-        cost_model=cost_model,
         measure_callbacks=measure_callbacks,
-    )
-    if config.max_trials_global > 0:
-        task_scheduler.tune()
-        cost_model.save(osp.join(work_dir, "cost_model.xgb"))
-    return database
-
-
-def tune_tir(
-    mod: Union[IRModule, PrimFunc],
-    target: Union[str, Target],
-    config: TuneConfig,
-    work_dir: str,
-    *,
-    builder: Optional[Builder] = None,
-    runner: Optional[Runner] = None,
-    database: Optional[Database] = None,
-    cost_model: Optional[CostModel] = None,
-    measure_callbacks: Optional[List[MeasureCallback]] = None,
-    space: Optional[FnSpaceGenerator] = None,
-    blocks: Optional[List[str]] = None,
-    sch_rules: Optional[FnScheduleRule] = None,
-    postprocs: Optional[FnPostproc] = None,
-    mutator_probs: Optional[FnMutatorProb] = None,
-    task_name: str = "main",
-    num_threads: Optional[int] = None,
-) -> Optional[Schedule]:
-    """Tune a TIR IRModule with a given target.
-
-    Parameters
-    ----------
-    mod : Union[IRModule, PrimFunc]
-        The module to tune.
-    target : Union[str, Target]
-        The target to tune for.
-    config : TuneConfig
-        The search strategy config.
-    work_dir : str
-        The working directory to save intermediate results.
-    builder : Optional[Builder]
-        The builder to use.
-    runner : Optional[Runner]
-        The runner to use.
-    database : Optional[Database]
-        The database to use.
-    cost_model : Optional[CostModel]
-        The cost model to use.
-    measure_callbacks : Optional[List[MeasureCallback]]
-        The callbacks used during tuning.
-    space : Optional[FnSpaceGenerator]
-        The space generator to use.
-    blocks : Optional[List[str]]
-        A list of block names specifying blocks to be tuned. Note that if
-        the list is not None, blocks outside this list will not be tuned.
-        Only one of this argument and space may be provided.
-    sch_rules : Optional[FnScheduleRule]
-        The search rules to use.
-    postprocs : Optional[FnPostproc]
-        The postprocessors to use.
-    mutator_probs : Optional[FnMutatorProb]
-        The probability distribution to use different mutators.
-    task_name : str
-        The name of the function to extract schedules from.
-    num_threads : Optional[int]
-        The number of threads to use
-
-    Returns
-    -------
-    sch : Optional[Schedule]
-        The tuned schedule.
-    """
-    # logging directory is set to `work_dir/logs` by default
-    log_dir = osp.join(work_dir, "logs")
-    os.makedirs(log_dir, exist_ok=True)
-
-    config.create_loggers(
-        log_dir=log_dir,
-        params=[{"log_dir": log_dir, "logger_name": __name__ + f".task_{task_name}"}],
-    )
-
-    if blocks is not None:
-        assert space is None, "Can not specify blocks to tune when a search space is given."
-        # Create a filter function to identify named blocks.
-        def _f_block_filter(block, target_names) -> bool:
-            return block.name_hint in target_names
-
-        # Create a space generator that targets specific blocks.
-        space = PostOrderApply(f_block_filter=lambda block: _f_block_filter(block, blocks))
-
-    # pylint: disable=protected-access
-    mod = default_config.mod(mod)
-    target = default_config.target(target)
-    # pylint: enable=protected-access
-    database = tune_extracted_tasks(
-        extracted_tasks=[
-            ExtractedTask(
-                task_name=task_name,
-                mod=mod,
-                dispatched=[mod],
-                target=target,
-                weight=1,
-            ),
-        ],
-        config=config,
-        work_dir=work_dir,
-        builder=builder,
-        runner=runner,
-        database=database,
-        cost_model=cost_model,
-        measure_callbacks=measure_callbacks,
-        space=space,
-        sch_rules=sch_rules,
-        postprocs=postprocs,
-        mutator_probs=mutator_probs,
-        num_threads=num_threads,
-    )
-    with Profiler.timeit("PostTuningCompilation"):
-        bests: List[TuningRecord] = database.get_top_k(database.commit_workload(mod), top_k=1)
-        if not bests:
-            return None
-        assert len(bests) == 1
-        sch = Schedule(mod)
-        bests[0].trace.apply_to_schedule(sch, remove_postproc=False)
-    return sch
-
-
-def tune_te(
-    tensors: List[Tensor],
-    target: Union[str, Target],
-    config: TuneConfig,
-    work_dir: str,
-    *,
-    task_name: str = "main",
-    builder: Optional[Builder] = None,
-    runner: Optional[Runner] = None,
-    database: Optional[Database] = None,
-    cost_model: Optional[CostModel] = None,
-    measure_callbacks: Optional[List[MeasureCallback]] = None,
-    space: Optional[FnSpaceGenerator] = None,
-    sch_rules: Optional[FnScheduleRule] = None,
-    postprocs: Optional[FnPostproc] = None,
-    mutator_probs: Optional[FnMutatorProb] = None,
-    num_threads: Optional[int] = None,
-) -> Optional[Schedule]:
-    """Tune a TE compute DAG with a given target.
-
-    Parameters
-    ----------
-    tensor : List[Tensor]
-        The list of input/output tensors of the TE compute DAG.
-    target : Union[str, Target]
-        The target to tune for.
-    config : TuneConfig
-        The search strategy config.
-    task_name : str
-        The name of the task.
-    work_dir : str
-        The working directory to save intermediate results.
-    builder : Optional[Builder]
-        The builder to use.
-    runner : Optional[Runner]
-        The runner to use.
-    database : Optional[Database]
-        The database to use.
-    measure_callbacks : Optional[List[MeasureCallback]]
-        The callbacks used during tuning.
-
-    Returns
-    -------
-    sch : Optional[Schedule]
-        The tuned schedule.
-    """
-    with Profiler.timeit("CreatePrimFunc"):
-        func = create_prim_func(tensors)
-    return tune_tir(
-        mod=func,
-        target=target,
-        config=config,
-        work_dir=work_dir,
-        task_name=task_name,
-        builder=builder,
-        runner=runner,
-        database=database,
-        cost_model=cost_model,
-        measure_callbacks=measure_callbacks,
-        space=space,
-        sch_rules=sch_rules,
-        postprocs=postprocs,
-        mutator_probs=mutator_probs,
-        num_threads=num_threads,
-    )
-
-
-def tune_relay(
-    mod: IRModule,
-    target: Union[str, Target],
-    config: TuneConfig,
-    work_dir: str,
-    *,
-    backend: str = "graph",
-    params: Optional[Dict[str, NDArray]] = None,
-    builder: Optional[Builder] = None,
-    runner: Optional[Runner] = None,
-    database: Optional[Database] = None,
-    cost_model: Optional[CostModel] = None,
-    measure_callbacks: Optional[List[MeasureCallback]] = None,
-    space: Optional[FnSpaceGenerator] = None,
-    sch_rules: Optional[FnScheduleRule] = None,
-    postprocs: Optional[FnPostproc] = None,
-    mutator_probs: Optional[FnMutatorProb] = None,
-    num_threads: Optional[int] = None,
-    executor=None,
-) -> Union[Module, vm.Executable]:
-    """Tune a Relay IRModule with a given target.
-
-    Parameters
-    ----------
-    mod : IRModule
-        The module to tune.
-    target : Union[str, Target]
-        The target to tune for.
-    config : TuneConfig
-        The search strategy config.
-    params : Optional[Dict[str, tvm.runtime.NDArray]]
-        The associated parameters of the program
-    task_name : str
-        The name of the task.
-    work_dir : str
-        The working directory to save intermediate results.
-    builder : Optional[Builder]
-        The builder to use.
-    runner : Optional[Runner]
-        The runner to use.
-    database : Optional[Database]
-        The database to use.
-    measure_callbacks : Optional[List[MeasureCallback]]
-        The callbacks used during tuning.
-    backend : str = "graph"
-        The backend to use for relay compilation(graph / vm).
-    executor : relay.backend.Executor
-        The executor to be passed to relay.build(...). In particular, its link-params
-        attribute affects task extration and workload database look up.
-
-    Returns
-    -------
-    lib : Union[Module, tvm.runtime.vm.Executable]
-        The built runtime module or vm Executable for the given relay workload.
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    from .relay_integration import extract_task_from_relay
-
-    # pylint: disable=protected-access, enable=import-outside-toplevel
-    target = default_config.target(target)
-    # pylint: enable=protected-access,
-    # parse the tuning contexts
-
-    if executor is None:
-        executor = relay.backend.Executor("graph")
-
-    if "link-params" in executor.attrs:
-        link_params = executor.attrs["link-params"]
-    else:
-        link_params = False
-
-    with Profiler.timeit("TaskExtraction"):
-        pass_config = {
-            "relay.FuseOps.link_params": link_params,
-            "relay.backend.use_meta_schedule": True,
-            "relay.backend.tir_converter": "default",
-        }
-        extracted_tasks = extract_task_from_relay(mod, target, params, pass_config=pass_config)
-
-    database = tune_extracted_tasks(
-        extracted_tasks,
-        config,
-        work_dir,
-        builder=builder,
-        runner=runner,
         database=database,
         cost_model=cost_model,
-        measure_callbacks=measure_callbacks,
-        space=space,
-        sch_rules=sch_rules,
-        postprocs=postprocs,
-        mutator_probs=mutator_probs,
-        num_threads=num_threads,
     )
-
-    with Profiler.timeit("PostTuningCompilation"):
-        with target, autotvm_silencer(), database:
-            with PassContext(
-                opt_level=3,
-                config={
-                    "relay.backend.use_meta_schedule": True,
-                    "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda",
-                    "relay.backend.tir_converter": "default",
-                },
-            ):
-                if backend == "graph":
-                    return relay.build(mod, target=target, params=params, executor=executor)
-
-                # Executor is not supported by VM
-                return relay.vm.compile(mod, target=target, params=params)
+    return database
diff --git a/python/tvm/meta_schedule/tune_context.py b/python/tvm/meta_schedule/tune_context.py
index 29cd94110c0c..38a46ebe757e 100644
--- a/python/tvm/meta_schedule/tune_context.py
+++ b/python/tvm/meta_schedule/tune_context.py
@@ -16,38 +16,49 @@
 # under the License.
 """Meta Schedule tuning context."""
 
-import logging
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
+
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
 
 from tvm import IRModule
 from tvm._ffi import register_object
-from tvm.meta_schedule.utils import cpu_count, make_logging_func
 from tvm.runtime import Object
 from tvm.target import Target
 from tvm.tir import PrimFunc, Schedule
 
 from . import _ffi_api
+from .logging import Logger, get_logger, get_logging_func
+from .utils import cpu_count
 
 if TYPE_CHECKING:
     from .cost_model import CostModel
     from .database import Database
-    from .mutator import Mutator
-    from .postproc import Postproc
     from .runner import RunnerResult
-    from .schedule_rule import ScheduleRule
     from .search_strategy import MeasureCandidate, SearchStrategy
-    from .space_generator import ScheduleFn, ScheduleFnType, SpaceGenerator
-    from .tune import TuneConfig
+    from .space_generator import SpaceGenerator
+
+
+def _normalize_mod(mod: Union[PrimFunc, IRModule]) -> IRModule:
+    """Normalize the input to an IRModule"""
+    if isinstance(mod, PrimFunc):
+        mod = mod.with_attr("global_symbol", "main")
+        mod = mod.with_attr("tir.noalias", True)
+        mod = IRModule({"main": mod})
+    if not isinstance(mod, IRModule):
+        raise TypeError(f"Expected `mod` to be PrimFunc or IRModule, but gets: {mod}")
+    func_names = mod.get_global_vars()
+    (func_name,) = func_names
+    if len(func_names) == 1 and func_name.name_hint != "main":
+        mod = IRModule({"main": mod[func_name]})
+    return mod
 
 
 @register_object("meta_schedule.TuneContext")
 class TuneContext(Object):
-    """
-    The tune context class is designed to contain all resources for a tuning task.
-
-    Different tuning tasks are separated in different TuneContext classes, but different classes in
-    the same task can interact with each other through tune context. Most classes have a function
-    to initialize with a tune context.
+    """The tune context class is designed to contain all resources for a tuning task.
 
     Parameters
     ----------
@@ -57,22 +68,9 @@ class TuneContext(Object):
         The target to be optimized for.
     space_generator : Union[None, ScheduleFnType, SpaceGenerator] = None
         The design space generator.
-    search_strategy : Union[None, TuneConfig, SearchStrategy] = None
+    search_strategy : Union[None, SearchStrategy] = None
         The search strategy.
         if None, the strategy is left blank.
-        If TuneConfig, the strategy is initialized with the TuneConfig.create_strategy().
-    sch_rules: Union[None, str, List[ScheduleRule]] = None,
-        The schedule rules.
-        If None, use an empty list of rules.
-        if "default", use target-default rules.
-    postprocs: Union[None, str, List[Postproc"]] = None,
-        The postprocessors.
-        If None, use an empty list of rules.
-        if "default", use target-default rules.
-    mutator_probs: Union[None, str, Dict[Mutator, float]]
-        Mutators and their probability mass.
-        If None, use an empty list of rules.
-        if "default", use target-default rules.
     task_name : Optional[str] = None
         The name of the tuning task.
     logger : logging.Logger
@@ -82,24 +80,14 @@ class TuneContext(Object):
         Need to be in integer in [1, 2^31-1], -1 means using random number.
     num_threads : int = None
         The number of threads to be used, None means using the logical cpu count.
-
-    Note
-    ----
-    In most cases, mod and target should be available in the tuning context. They are "Optional"
-    because we allow the user to customize the tuning context, along with other classes, sometimes
-    without mod and target. E.g., we can have a stand alone search strategy that generates measure
-    candidates without initializing with the tune context.
     """
 
     mod: Optional[IRModule]
     target: Optional[Target]
     space_generator: Optional["SpaceGenerator"]
     search_strategy: Optional["SearchStrategy"]
-    sch_rules: List["ScheduleRule"]
-    postprocs: List["Postproc"]
-    mutator_probs: Optional[Dict["Mutator", float]]
     task_name: str
-    logger: Optional[logging.Logger]
+    logger: Optional[Logger]
     rand_state: int
     num_threads: int
 
@@ -107,114 +95,57 @@ def __init__(
         self,
         mod: Optional[IRModule] = None,
         *,
-        target: Optional[Target] = None,
-        space_generator: Union[None, "ScheduleFnType", "ScheduleFn", "SpaceGenerator"] = None,
-        search_strategy: Union[None, "SearchStrategy", "TuneConfig"] = None,
-        sch_rules: Union[None, str, List["ScheduleRule"]] = None,
-        postprocs: Union[None, str, List["Postproc"]] = None,
-        mutator_probs: Union[None, str, Dict["Mutator", float]] = None,
+        target: Union[Target, str, None] = None,
+        space_generator: Union["SpaceGenerator.SpaceGeneratorType", None] = None,
+        search_strategy: Union["SearchStrategy.SearchStrategyType", None] = None,
         task_name: str = "main",
-        logger: Optional[logging.Logger] = None,
         rand_state: int = -1,
-        num_threads: Optional[int] = None,
+        num_threads: Union[int, Literal["physical", "logical"]] = "physical",
+        logger: Optional[Logger] = None,
     ):
         # pylint: disable=import-outside-toplevel
-        from . import default_config
-        from .space_generator import ScheduleFn
-        from .tune import TuneConfig
+        import tvm.tir.tensor_intrin  # pylint: disable=unused-import
+
+        from .search_strategy import SearchStrategy
+        from .space_generator import SpaceGenerator
 
         # pylint: enable=import-outside-toplevel
         if isinstance(mod, PrimFunc):
-            mod = IRModule.from_expr(mod)
-        if callable(space_generator):
-            space_generator = ScheduleFn(space_generator)
-        if isinstance(search_strategy, TuneConfig):
-            search_strategy = search_strategy.create_strategy()
-        if isinstance(sch_rules, str):
-            if sch_rules == "default":
-                if target is None:
-                    raise ValueError("target is required when sch_rules is 'default'")
-                sch_rules = default_config.schedule_rules(None, target)
-            else:
-                raise ValueError("sch_rules should be a list of ScheduleRule or 'default'")
-        if isinstance(postprocs, str):
-            if postprocs == "default":
-                if target is None:
-                    raise ValueError("target is required when postprocs is 'default'")
-                postprocs = default_config.postproc(None, target)
-            else:
-                raise ValueError("postprocs should be a list of Postproc or 'default'")
-        if isinstance(mutator_probs, str):
-            if mutator_probs == "default":
-                if target is None:
-                    raise ValueError("target is required when mutator_probs is 'default'")
-                mutator_probs = default_config.mutator_probs(None, target)
+            mod = _normalize_mod(mod)
+        if target is not None:
+            if not isinstance(target, Target):
+                target = Target(target)
+        if space_generator is not None:
+            if not isinstance(space_generator, SpaceGenerator):
+                space_generator = SpaceGenerator.create(space_generator)
+        if search_strategy is not None:
+            if not isinstance(search_strategy, SearchStrategy):
+                search_strategy = SearchStrategy.create(search_strategy)
         if logger is None:
-            self.logger = logging.getLogger(__name__)
-        else:
-            self.logger = None
-        if num_threads is None:
-            num_threads = cpu_count(logical=False)
+            logger = get_logger(__name__)
+        if not isinstance(num_threads, int):
+            if num_threads == "physical":
+                num_threads = cpu_count(logical=False)
+            elif num_threads == "logical":
+                num_threads = cpu_count(logical=True)
+            else:
+                raise ValueError(
+                    f"Invalid num_threads: {num_threads}, "
+                    "should be either an integer, 'physical', or 'logical'"
+                )
         self.__init_handle_by_constructor__(
             _ffi_api.TuneContext,  # type: ignore # pylint: disable=no-member
             mod,
             target,
             space_generator,
             search_strategy,
-            sch_rules,
-            postprocs,
-            mutator_probs,
             task_name,
-            make_logging_func(logger),
-            rand_state,
             num_threads,
+            rand_state,
+            get_logging_func(logger),
         )
         _ffi_api.TuneContextInitialize(self)  # type: ignore # pylint: disable=no-member
 
-    def _set_measure_candidates(self, candidates):
-        """Set candidates in a tuning context.
-
-        Parameters
-        ----------
-        candidates : List[MeasureCandidate]
-            A list of measure candidates for the tuning context.
-        """
-        _ffi_api.TuneContextSetMeasureCandidates(self, candidates)  # type: ignore # pylint: disable=no-member
-
-    def _send_to_builder(self, builder):
-        """Send candidates to builder.
-
-        Parameters
-        ----------
-        builder : Builder
-            The builder for building the candidates.
-        """
-        _ffi_api.TuneContextSendToBuilder(self, builder)  # type: ignore # pylint: disable=no-member
-
-    def _send_to_runner(self, runner):
-        """Send candidates to runner.
-
-        Parameters
-        ----------
-        runner : Runner
-            The runner for running the candidates.
-        """
-        _ffi_api.TuneContextSendToRunner(self, runner)  # type: ignore # pylint: disable=no-member
-
-    def _join(self):
-        """Join the runner processes.
-
-        Returns
-        -------
-        result : List[RunnerResult]
-            The runner results.
-        """
-        return _ffi_api.TuneContextJoin(self)  # type: ignore # pylint: disable=no-member
-
-    def _clear_measure_state(self):
-        """Clear the measure states."""
-        _ffi_api.TuneContextClearMeasureState(self)  # type: ignore # pylint: disable=no-member
-
     def generate_design_space(self) -> List[Schedule]:
         """Generate design spaces given a module.
 
@@ -236,6 +167,8 @@ def generate_design_space(self) -> List[Schedule]:
 
     def pre_tuning(
         self,
+        max_trials: int,
+        num_trials_per_iter: int = 64,
         design_spaces: Optional[List[Schedule]] = None,
         database: Optional["Database"] = None,
         cost_model: Optional["CostModel"] = None,
@@ -246,6 +179,10 @@ def pre_tuning(
 
         Parameters
         ----------
+        max_trials : int
+            The maximum number of trials to be executed.
+        num_trials_per_iter : int = 64
+            The number of trials to be executed per iteration.
         design_spaces : Optional[List[Schedule]]
             The design spaces used during tuning process.
             If None, use the outcome of `self.generate_design_space()`.
@@ -278,7 +215,13 @@ def pre_tuning(
         if cost_model is None:
             if isinstance(self.search_strategy, EvolutionarySearch):
                 cost_model = RandomModel()  # type: ignore
-        return self.search_strategy.pre_tuning(design_spaces, database, cost_model)
+        return self.search_strategy.pre_tuning(
+            max_trials,
+            num_trials_per_iter,
+            design_spaces,
+            database,
+            cost_model,
+        )
 
     def post_tuning(self) -> None:
         """A method to be called for SearchStrategy to do necessary cleanup after tuning.
diff --git a/python/tvm/meta_schedule/utils.py b/python/tvm/meta_schedule/utils.py
index 7b7c4a68653d..eb3c6437603c 100644
--- a/python/tvm/meta_schedule/utils.py
+++ b/python/tvm/meta_schedule/utils.py
@@ -16,12 +16,11 @@
 # under the License.
 """Utilities for meta schedule"""
 import ctypes
-import logging
 import os
 import shutil
-from contextlib import contextmanager
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, List, Optional, Union
 
+import numpy as np  # type: ignore
 import psutil  # type: ignore
 from tvm._ffi import get_global_func, register_func
 from tvm.error import TVMError
@@ -86,7 +85,7 @@ def method(*args, **kwargs):
     assert isinstance(cls.__base__, type)
     assert hasattr(
         cls, "_tvm_metadata"
-    ), "Please use the user-facing method overiding class, i.e., PyRunner."
+    ), "Please use the user-facing method overriding class, i.e., PyRunner."
 
     base = cls.__base__
     metadata = getattr(base, "_tvm_metadata")
@@ -114,7 +113,10 @@ def __init__(self, *args, **kwargs):
 
         def __getattr__(self, name: str):
             """Bridge the attribute function."""
-            return self._inst.__getattribute__(name)
+            try:
+                return self._inst.__getattribute__(name)
+            except AttributeError:
+                return super(TVMDerivedObject, self).__getattr__(name)
 
         def __setattr__(self, name, value):
             if name not in ["_inst", "key", "handle"]:
@@ -157,14 +159,6 @@ def _cpu_count_impl(logical: bool = True) -> int:
     return psutil.cpu_count(logical=logical) or 1
 
 
-@register_func("meta_schedule._process_error_message")
-def _process_error_message(error_msg: str) -> str:
-    error_msg_lines = str(error_msg).splitlines()
-    if len(error_msg_lines) >= 50:
-        return "\n".join(error_msg_lines[:25] + ["..."] + error_msg_lines[-25:])
-    return error_msg
-
-
 def cpu_count(logical: bool = True) -> int:
     """Return the number of logical or physical CPUs in the system
 
@@ -193,6 +187,14 @@ def cpu_count(logical: bool = True) -> int:
     return _cpu_count_impl(logical)
 
 
+@register_func("meta_schedule.using_ipython")
+def _using_ipython():
+    try:
+        return get_ipython().__class__.__name__ == "ZMQInteractiveShell"  # type: ignore
+    except NameError:
+        return False
+
+
 def get_global_func_with_default_on_worker(
     name: Union[None, str, Callable],
     default: Callable,
@@ -335,114 +337,7 @@ def _to_hex_address(handle: ctypes.c_void_p) -> str:
     return hex(ctypes.cast(handle, ctypes.c_void_p).value)
 
 
-@contextmanager
-def autotvm_silencer():
-    """A context manager that silences autotvm warnings."""
-    from tvm import autotvm  # pylint: disable=import-outside-toplevel
-
-    silent = autotvm.GLOBAL_SCOPE.silent
-    autotvm.GLOBAL_SCOPE.silent = True
-    try:
-        yield
-    finally:
-        autotvm.GLOBAL_SCOPE.silent = silent
-
-
-def make_logging_func(logger: logging.Logger) -> Optional[Callable]:
-    """Get the logging function.
-    Parameters
-    ----------
-    logger : logging.Logger
-        The logger instance.
-    Returns
-    -------
-    result : Optional[Callable]
-        The function to do the specified level of logging.
-    """
-    if logger is None:
-        return None
-
-    level2log = {
-        logging.DEBUG: logger.debug,
-        logging.INFO: logger.info,
-        logging.WARNING: logger.warning,
-        logging.ERROR: logger.error,
-        # logging.FATAL not included
-    }
-
-    def logging_func(level: int, msg: str):
-        def clear_notebook_output():
-            from IPython.display import clear_output  # type: ignore # pylint: disable=import-outside-toplevel
-
-            clear_output(wait=True)
-
-        if level < 0:
-            clear_notebook_output()
-        else:
-            level2log[level](msg)
-
-    return logging_func
-
-
-@register_func("meta_schedule.using_ipython")
-def _check_ipython_env():
-    try:
-        return get_ipython().__class__.__name__ == "ZMQInteractiveShell"  # type: ignore
-    except NameError:
-        return False
-
-
-def parameterize_config(config: Dict[str, Any], params: Dict[str, str]) -> Dict[str, Any]:
-    """Parameterize the given configuration.
-
-    Parameters
-    ----------
-    config : Dict[str, Any]
-        The given config dict.
-    Params : Dict[str, str]
-        The given parameters.
-
-    Returns
-    -------
-    result : Dict[str, Any]
-        The parameterized configuration.
-    """
-    result = {}
-    for k, v in config.items():
-        if isinstance(k, str):
-            k = k.format(**params)
-        if isinstance(v, str):
-            v = v.format(**params)
-        elif isinstance(v, dict):
-            v = parameterize_config(v, params)
-        elif isinstance(v, list):
-            v = [t.format(**params) for t in v]
-        result[k] = v
-    return result
-
-
-def batch_parameterize_config(
-    config: Dict[str, Any], params: List[Dict[str, str]]
-) -> Dict[str, Any]:
-    """Parameterize the given configuration with multiple parameters sets.
-
-    Parameters
-    ----------
-    config : Dict[str, Any]
-        The given config dict.
-    Params : List[Dict[str, str]]
-        List of the given multiple parameters sets.
-
-    Returns
-    -------
-    result : Dict[str, Any]
-        The parameterized configuration.
-    """
-    results = {}
-    for name, cfg in config.items():
-        for p in params:
-            p_name = name.format(**p)
-            if p_name not in results:
-                p_cfg = parameterize_config(cfg, p)
-                results[p_name] = p_cfg
-    return results
+def fork_seed(seed: Optional[int], n: int) -> List[int]:
+    # fmt: off
+    return np.random.RandomState(seed=seed).randint(1, 2 ** 30, size=n).tolist()
+    # fmt: on
diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py
index 3374d18dff80..86dd2eee5cd7 100644
--- a/python/tvm/tir/tensor_intrin/cuda.py
+++ b/python/tvm/tir/tensor_intrin/cuda.py
@@ -16,13 +16,14 @@
 # under the License.
 # pylint: disable=invalid-name,missing-function-docstring
 """Intrinsics for tensorization on NVIDIA GPU."""
-from typing import Tuple, Dict
+from typing import Dict, Tuple
+
 from tvm.script import tir as T
 from tvm.tir.function import PrimFunc
-from .. import IntImm, Cast
+
 from ..._ffi import register_func
 from ...runtime import convert
-from .. import TensorIntrin
+from .. import Cast, IntImm, TensorIntrin
 
 
 def shared_16x16_to_ldmatrix_32x8_layout(i, j):
diff --git a/src/meta_schedule/measure_callback/add_to_database.cc b/src/meta_schedule/measure_callback/add_to_database.cc
index 26399276c933..68a4b93ea96f 100644
--- a/src/meta_schedule/measure_callback/add_to_database.cc
+++ b/src/meta_schedule/measure_callback/add_to_database.cc
@@ -27,12 +27,12 @@ class AddToDatabaseNode : public MeasureCallbackNode {
              const Array<MeasureCandidate>& measure_candidates,
              const Array<BuilderResult>& builder_results,
              const Array<RunnerResult>& runner_results) final {
-    if (!task_scheduler->database.defined()) {
+    if (!task_scheduler->database_.defined()) {
       return;
     }
     auto _ = Profiler::TimedScope("MeasureCallback/AddToDatabase");
-    TuneContext task = task_scheduler->tasks[task_id];
-    Database database = task_scheduler->database.value();
+    TuneContext task = task_scheduler->tasks_[task_id]->ctx;
+    Database database = task_scheduler->database_.value();
     Workload workload = database->CommitWorkload(task->mod.value());
     Target target = task->target.value();
     ICHECK_EQ(runner_results.size(), measure_candidates.size());
diff --git a/src/meta_schedule/measure_callback/echo_statistics.cc b/src/meta_schedule/measure_callback/echo_statistics.cc
deleted file mode 100644
index fb1064266566..000000000000
--- a/src/meta_schedule/measure_callback/echo_statistics.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-#include <sstream>
-
-#include "../utils.h"
-
-namespace tvm {
-namespace meta_schedule {
-
-constexpr const double kMaxTime = 1e10;
-
-std::string GetTaskName(const TuneContext& task, int task_id) {
-  std::ostringstream os;
-  os << "Task #" << task_id << ": " << task->task_name;
-  return os.str();
-}
-
-struct TaskInfo {
-  std::string name;
-  double flop = 0.0;
-  int trials = -1;
-  int best_round = -1;
-  double best_ms = kMaxTime;
-  double best_gflops = 0.0;
-  int error_count = 0;
-  PackedFunc logging_func;
-
-  explicit TaskInfo(const String& name, PackedFunc logging_func)
-      : name(name), logging_func(logging_func) {}
-
-  void Update(double run_ms) {
-    ++trials;
-    if (run_ms < best_ms) {
-      best_ms = run_ms;
-      best_round = trials;
-      best_gflops = flop / run_ms / 1e6;
-    }
-    TVM_PY_LOG(INFO, logging_func) << "[" << name << "] Trial #" << trials   //
-                                   << std::fixed << std::setprecision(4)     //
-                                   << ": GFLOPs: " << (flop / run_ms / 1e6)  //
-                                   << ". Time: " << run_ms << " ms"          //
-                                   << ". Best GFLOPs: " << best_gflops;
-  }
-
-  void UpdateError(std::string err, const MeasureCandidate& candidate) {
-    static const auto* f_proc = runtime::Registry::Get("meta_schedule._process_error_message");
-    ICHECK(f_proc != nullptr);
-    err = (*f_proc)(err).operator std::string();
-    ++error_count;
-    ++trials;
-    TVM_PY_LOG(INFO, logging_func)
-        << "[" << name << "] Trial #" << trials  //
-        << std::fixed << std::setprecision(4)    //
-        << ": Error in building: " << err << "\n"
-        << tir::AsTVMScript(candidate->sch->mod()) << "\n"
-        << Concat(candidate->sch->trace().value()->AsPython(false), "\n");
-  }
-};
-
-class EchoStatisticsNode : public MeasureCallbackNode {
- public:
-  void Apply(const TaskScheduler& task_scheduler, int task_id,
-             const Array<MeasureCandidate>& measure_candidates,
-             const Array<BuilderResult>& builder_results,
-             const Array<RunnerResult>& runner_results) final {
-    auto _ = Profiler::TimedScope("MeasureCallback/EchoStatistics");
-    if (this->task_info.empty()) {
-      SetupTaskInfo(task_scheduler->tasks);
-    }
-    ICHECK_EQ(measure_candidates.size(), builder_results.size());
-    ICHECK_EQ(measure_candidates.size(), runner_results.size());
-    int n = measure_candidates.size();
-    TuneContext task = task_scheduler->tasks[task_id];
-    TaskInfo& info = this->task_info[task_id];
-    std::string task_name = GetTaskName(task, task_id);
-    for (int i = 0; i < n; ++i) {
-      MeasureCandidate candidate = measure_candidates[i];
-      BuilderResult builder_result = builder_results[i];
-      RunnerResult runner_result = runner_results[i];
-      if (Optional<String> err = builder_result->error_msg) {
-        info.UpdateError(err.value(), candidate);
-      } else if (Optional<String> err = runner_result->error_msg) {
-        info.UpdateError(err.value(), candidate);
-      } else {
-        ICHECK(runner_result->run_secs.defined());
-        info.Update(GetRunMsMedian(runner_result));
-      }
-    }
-  }
-
-  void SetupTaskInfo(const Array<TuneContext>& tasks) {
-    task_info.reserve(tasks.size());
-    int task_id = 0;
-    for (const TuneContext& task : tasks) {
-      task_info.push_back(TaskInfo(GetTaskName(task, task_id), task->logging_func));
-      TaskInfo& info = task_info.back();
-      info.flop = tir::EstimateTIRFlops(task->mod.value());
-      ++task_id;
-    }
-  }
-
-  std::vector<TaskInfo> task_info;
-
-  static constexpr const char* _type_key = "meta_schedule.EchoStatistics";
-  TVM_DECLARE_FINAL_OBJECT_INFO(EchoStatisticsNode, MeasureCallbackNode);
-};
-
-MeasureCallback MeasureCallback::EchoStatistics() {
-  ObjectPtr<EchoStatisticsNode> n = make_object<EchoStatisticsNode>();
-  return MeasureCallback(n);
-}
-
-TVM_REGISTER_NODE_TYPE(EchoStatisticsNode);
-TVM_REGISTER_GLOBAL("meta_schedule.MeasureCallbackEchoStatistics")
-    .set_body_typed(MeasureCallback::EchoStatistics);
-
-}  // namespace meta_schedule
-}  // namespace tvm
diff --git a/src/meta_schedule/measure_callback/measure_callback.cc b/src/meta_schedule/measure_callback/measure_callback.cc
index ebe63e7b76f1..f16fb73c520c 100644
--- a/src/meta_schedule/measure_callback/measure_callback.cc
+++ b/src/meta_schedule/measure_callback/measure_callback.cc
@@ -39,6 +39,14 @@ MeasureCallback MeasureCallback::PyMeasureCallback(PyMeasureCallbackNode::FApply
   return MeasureCallback(n);
 }
 
+Array<MeasureCallback, void> MeasureCallback::Default() {
+  return {
+      MeasureCallback::AddToDatabase(),
+      MeasureCallback::RemoveBuildArtifact(),
+      MeasureCallback::UpdateCostModel(),
+  };
+}
+
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<PyMeasureCallbackNode>([](const ObjectRef& n, ReprPrinter* p) {
       const auto* self = n.as<PyMeasureCallbackNode>();
@@ -55,6 +63,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.MeasureCallbackApply")
     .set_body_method<MeasureCallback>(&MeasureCallbackNode::Apply);
 TVM_REGISTER_GLOBAL("meta_schedule.MeasureCallbackPyMeasureCallback")
     .set_body_typed(MeasureCallback::PyMeasureCallback);
+TVM_REGISTER_GLOBAL("meta_schedule.MeasureCallbackDefault")
+    .set_body_typed(MeasureCallback::Default);
 
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/measure_callback/update_cost_model.cc b/src/meta_schedule/measure_callback/update_cost_model.cc
index 8851345c43b0..0563699ba6b9 100644
--- a/src/meta_schedule/measure_callback/update_cost_model.cc
+++ b/src/meta_schedule/measure_callback/update_cost_model.cc
@@ -28,11 +28,12 @@ class UpdateCostModelNode : public MeasureCallbackNode {
              const Array<BuilderResult>& builder_results,
              const Array<RunnerResult>& runner_results) final {
     auto _ = Profiler::TimedScope("MeasureCallback/UpdateCostModel");
-    TuneContext task = task_scheduler->tasks[task_id];
-    ICHECK(task_scheduler->cost_model.defined())
-        << "Cost model must be defined for the task scheduler!";
+    const TaskRecord& task = task_scheduler->tasks_[task_id];
+    if (!task_scheduler->cost_model_.defined()) {
+      return;
+    }
+    CostModel cost_model = task_scheduler->cost_model_.value();
     ICHECK(task->measure_candidates.defined()) << "Task's measure candidates must be present!";
-    CostModel cost_model = task_scheduler->cost_model.value();
     ICHECK_EQ(measure_candidates.size(), builder_results.size());
     ICHECK_EQ(runner_results.size(), builder_results.size());
     int n = builder_results.size();
@@ -46,7 +47,7 @@ class UpdateCostModelNode : public MeasureCallbackNode {
         pruned_runner_result.push_back(runner_results[i]);
       }
     }
-    cost_model->Update(task, pruned_candidate, pruned_runner_result);
+    cost_model->Update(task->ctx, pruned_candidate, pruned_runner_result);
   }
 
   static constexpr const char* _type_key = "meta_schedule.UpdateCostModel";
diff --git a/src/meta_schedule/mutator/mutator.cc b/src/meta_schedule/mutator/mutator.cc
index 25312ab61f99..8e9bfc8bde4b 100644
--- a/src/meta_schedule/mutator/mutator.cc
+++ b/src/meta_schedule/mutator/mutator.cc
@@ -51,6 +51,31 @@ Mutator Mutator::PyMutator(
   return Mutator(n);
 }
 
+Map<Mutator, FloatImm> Mutator::DefaultLLVM() {
+  return Map<Mutator, FloatImm>{
+      {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)},
+      {Mutator::MutateComputeLocation(), FloatImm(DataType::Float(64), 0.05)},
+      {Mutator::MutateUnroll(), FloatImm(DataType::Float(64), 0.03)},
+      {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(DataType::Float(64), 0.02)}};
+}
+
+Map<Mutator, FloatImm> Mutator::DefaultCUDA() {
+  return Map<Mutator, FloatImm>{
+      {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)},
+      {Mutator::MutateUnroll(), FloatImm(DataType::Float(64), 0.08)},
+      {Mutator::MutateThreadBinding(), FloatImm(DataType::Float(64), 0.02)}};
+}
+
+Map<Mutator, FloatImm> Mutator::DefaultCUDATensorCore() { return Mutator::DefaultCUDA(); }
+
+Map<Mutator, FloatImm> Mutator::DefaultHexagon() {
+  return Map<Mutator, FloatImm>{
+      {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)},
+      {Mutator::MutateComputeLocation(), FloatImm(DataType::Float(64), 0.05)},
+      {Mutator::MutateUnroll(), FloatImm(DataType::Float(64), 0.03)},
+      {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(DataType::Float(64), 0.02)}};
+}
+
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<PyMutatorNode>([](const ObjectRef& n, ReprPrinter* p) {
       const auto* self = n.as<PyMutatorNode>();
@@ -72,6 +97,11 @@ TVM_REGISTER_GLOBAL("meta_schedule.MutatorApply")
     });
 TVM_REGISTER_GLOBAL("meta_schedule.MutatorClone").set_body_method<Mutator>(&MutatorNode::Clone);
 TVM_REGISTER_GLOBAL("meta_schedule.MutatorPyMutator").set_body_typed(Mutator::PyMutator);
+TVM_REGISTER_GLOBAL("meta_schedule.MutatorDefaultLLVM").set_body_typed(Mutator::DefaultLLVM);
+TVM_REGISTER_GLOBAL("meta_schedule.MutatorDefaultCUDA").set_body_typed(Mutator::DefaultCUDA);
+TVM_REGISTER_GLOBAL("meta_schedule.MutatorDefaultCUDATensorCore")
+    .set_body_typed(Mutator::DefaultCUDATensorCore);
+TVM_REGISTER_GLOBAL("meta_schedule.MutatorDefaultHexagon").set_body_typed(Mutator::DefaultHexagon);
 
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc
index 957d6e7364e4..acc157e36e94 100644
--- a/src/meta_schedule/postproc/postproc.cc
+++ b/src/meta_schedule/postproc/postproc.cc
@@ -50,6 +50,48 @@ Postproc Postproc::PyPostproc(
   return Postproc(n);
 }
 
+Array<Postproc> Postproc::DefaultLLVM() {
+  return Array<Postproc>{
+      Postproc::DisallowDynamicLoop(),
+      Postproc::RewriteParallelVectorizeUnroll(),
+      Postproc::RewriteReductionBlock(),
+      Postproc::RewriteLayout(),
+  };
+}
+
+Array<Postproc> Postproc::DefaultCUDA() {
+  return Array<Postproc>{
+      Postproc::DisallowDynamicLoop(),
+      Postproc::RewriteCooperativeFetch(),
+      Postproc::RewriteUnboundBlock(/*max_threadblocks=*/256),
+      Postproc::RewriteParallelVectorizeUnroll(),
+      Postproc::RewriteReductionBlock(),
+      Postproc::VerifyGPUCode(),
+  };
+}
+
+Array<Postproc> Postproc::DefaultCUDATensorCore() {
+  return Array<Postproc>{
+      Postproc::DisallowDynamicLoop(),
+      Postproc::RewriteCooperativeFetch(),
+      Postproc::RewriteUnboundBlock(/*max_threadblocks=*/256),
+      Postproc::RewriteParallelVectorizeUnroll(),
+      Postproc::RewriteReductionBlock(),
+      Postproc::RewriteTensorize(/*vectorize_init_loop=*/false),
+      Postproc::VerifyGPUCode(),
+  };
+}
+
+Array<Postproc> Postproc::DefaultHexagon() {
+  return Array<Postproc>{
+      Postproc::DisallowDynamicLoop(),
+      Postproc::RewriteParallelVectorizeUnroll(),  //
+      Postproc::RewriteReductionBlock(),
+      // TODO(masahi): Fix RewriteLayout for link-params=True case
+      // Postproc::RewriteLayout(),
+  };
+}
+
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<PyPostprocNode>([](const ObjectRef& n, ReprPrinter* p) {
       const auto* self = n.as<PyPostprocNode>();
@@ -67,6 +109,12 @@ TVM_REGISTER_GLOBAL("meta_schedule.PostprocInitializeWithTuneContext")
 TVM_REGISTER_GLOBAL("meta_schedule.PostprocApply").set_body_method<Postproc>(&PostprocNode::Apply);
 TVM_REGISTER_GLOBAL("meta_schedule.PostprocClone").set_body_method<Postproc>(&PostprocNode::Clone);
 TVM_REGISTER_GLOBAL("meta_schedule.PostprocPyPostproc").set_body_typed(Postproc::PyPostproc);
+TVM_REGISTER_GLOBAL("meta_schedule.PostprocDefaultLLVM").set_body_typed(Postproc::DefaultLLVM);
+TVM_REGISTER_GLOBAL("meta_schedule.PostprocDefaultCUDA").set_body_typed(Postproc::DefaultCUDA);
+TVM_REGISTER_GLOBAL("meta_schedule.PostprocDefaultCUDATensorCore")
+    .set_body_typed(Postproc::DefaultCUDATensorCore);
+TVM_REGISTER_GLOBAL("meta_schedule.PostprocDefaultHexagon")
+    .set_body_typed(Postproc::DefaultHexagon);
 
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc
index ac9f45ca8ef4..427653b06c2a 100644
--- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc
+++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc
@@ -97,7 +97,7 @@ class RewriteCooperativeFetchNode : public PostprocNode {
     if (Optional<Integer> v = context->target.value()->GetAttr<Integer>("thread_warp_size")) {
       this->thread_warp_size_ = v.value()->value;
     } else {
-      TVM_PY_LOG(INFO, context->logging_func) << "'thread_warp_size' is not defined in the target";
+      TVM_PY_LOG(INFO, context->logger) << "'thread_warp_size' is not defined in the target";
     }
   }
 
diff --git a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
index f2fc67f74cc7..e8d821636fd3 100644
--- a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
+++ b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc
@@ -32,12 +32,12 @@ class CrossThreadReductionNode : public ScheduleRuleNode {
     Optional<Integer> opt_warp_size = target->GetAttr<Integer>("thread_warp_size");
 
     if (!opt_max_threads_per_block.defined()) {
-      TVM_PY_LOG(WARNING, context->logging_func)
+      TVM_PY_LOG(WARNING, context->logger)
           << "Target does not have attribute \"max_threads_per_block\", therefore the "
              "rule CrossThreadReduction will not be applied";
     }
     if (!opt_warp_size.defined()) {
-      TVM_PY_LOG(WARNING, context->logging_func)
+      TVM_PY_LOG(WARNING, context->logger)
           << "Target does not have attribute \"thread_warp_size\", therefore the rule "
              "CrossThreadReduction will not be applied";
     }
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
index 2ae6714f55d8..d9c46015eac3 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -84,10 +84,10 @@ void MultiLevelTilingNode::InitializeWithTuneContext(const TuneContext& context)
     if (Optional<Integer> v = context->target.value()->GetAttr<Integer>("thread_warp_size")) {
       this->thread_warp_size_ = v.value()->value;
     } else {
-      TVM_PY_LOG(INFO, context->logging_func) << "'thread_warp_size' is not defined in the target";
+      TVM_PY_LOG(INFO, context->logger) << "'thread_warp_size' is not defined in the target";
     }
   }
-  logging_func = context->logging_func;
+  logger = context->logger;
 }
 
 // Entry of the mega rule; Inherited from ScheduleRuleNode
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.h b/src/meta_schedule/schedule_rule/multi_level_tiling.h
index 8f55e8e7e4e4..98b4634af106 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.h
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.h
@@ -193,7 +193,7 @@ class MultiLevelTilingNode : public ScheduleRuleNode {
   /*! \brief The maximum number of threads to be used size of a thread warp */
   int max_threads_per_block_;
   /*! \brief The logging function */
-  PackedFunc logging_func;
+  PackedFunc logger;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     v->Visit("structure", &structure);
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 2ec78c1918e9..e8a03c722656 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -209,12 +209,12 @@ Array<Schedule> MultiLevelTilingTensorCoreNode::Apply(const Schedule& sch,
   }
   Array<Schedule> results;
   for (auto&& state : ApplySubRules(initial_states)) {
-    TVM_PY_LOG(INFO, logging_func) << "Sketch " << results.size() << ": tensorizing with "
-                                   << state.as<TensorCoreStateNode>()->intrin_group.compute_intrin;
+    TVM_PY_LOG(INFO, logger) << "Sketch " << results.size() << ": tensorizing with "
+                             << state.as<TensorCoreStateNode>()->intrin_group.compute_intrin;
     results.push_back(std::move(state->sch));
   }
   if (results.empty()) {
-    TVM_PY_LOG(INFO, logging_func) << "The workload cannot be tensorized.";
+    TVM_PY_LOG(INFO, logger) << "The workload cannot be tensorized.";
     return {original_sch};
   }
   return results;
@@ -293,8 +293,8 @@ std::vector<State> MultiLevelTilingTensorCoreNode::AddReadReuseTensorCore(
     } else if (dtype.is_int() && dtype.bits() == 8) {
       sch->StorageAlign(cache_read, 0, -2, 32, 16);
     } else {
-      TVM_PY_LOG(WARNING, logging_func) << "StorageAlign is not applied for data type " << dtype
-                                        << ", shared memory accesses might be inefficient.";
+      TVM_PY_LOG(WARNING, logger) << "StorageAlign is not applied for data type " << dtype
+                                  << ", shared memory accesses might be inefficient.";
     }
   }
   return {state};
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
index 8485e697eb24..428a1206a4ca 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
@@ -49,17 +49,17 @@ class MultiLevelTilingWithIntrinNode : public MultiLevelTilingNode {
   Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) final {
     auto desc_func = tir::TensorIntrin::Get(intrin_name).value()->desc;
     if (!CheckAutoTensorizeApplicable(sch, block_rv, desc_func)) {
-      TVM_PY_LOG(INFO, logging_func) << "The workload cannot be tensorized.";
+      TVM_PY_LOG(INFO, logger) << "The workload cannot be tensorized.";
       return {sch};
     }
 
     auto res = MultiLevelTilingNode::Apply(sch->Copy(), block_rv);
 
     if (res.empty()) {
-      TVM_PY_LOG(INFO, logging_func) << "The workload cannot be tensorized.";
+      TVM_PY_LOG(INFO, logger) << "The workload cannot be tensorized.";
       return {sch};
     }
-    TVM_PY_LOG(INFO, logging_func) << "Tensorizing with " << intrin_name;
+    TVM_PY_LOG(INFO, logger) << "Tensorizing with " << intrin_name;
     return res;
   }
 
diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc
index 416b43f46d56..8333833bfafa 100644
--- a/src/meta_schedule/schedule_rule/schedule_rule.cc
+++ b/src/meta_schedule/schedule_rule/schedule_rule.cc
@@ -51,6 +51,152 @@ ScheduleRule ScheduleRule::PyScheduleRule(
   return ScheduleRule(n);
 }
 
+Array<ScheduleRule> ScheduleRule::DefaultLLVM() {
+  return {
+      ScheduleRule::AutoInline(
+          /*into_producer=*/false,
+          /*into_consumer=*/true,
+          /*inline_const_tensor=*/true,
+          /*disallow_if_then_else=*/true,
+          /*require_injective=*/true,
+          /*require_ordered=*/true,
+          /*disallow_op=*/Array<String>{"tir.exp"}),
+      ScheduleRule::AddRFactor(
+          /*max_jobs_per_core=*/16,
+          /*max_innermost_factor=*/Integer(64)),
+      ScheduleRule::MultiLevelTiling(
+          /*structure=*/"SSRSRS",
+          /*tile_binds=*/NullOpt,
+          /*max_innermost_factor=*/Integer(64),
+          /*vector_load_lens=*/NullOpt,
+          /*reuse_read=*/NullOpt,
+          /*reuse_write=*/
+          Map<String, ObjectRef>{{"req", String("may")},
+                                 {"levels", Array<Integer>{1, 2}},
+                                 {"scope", String("global")}}),
+      ScheduleRule::ParallelizeVectorizeUnroll(
+          /*max_jobs_per_core=*/16,
+          /*max_vectorize_extent=*/64,
+          /*unroll_max_steps=*/Array<Integer>{0, 16, 64, 512},
+          /*unroll_explicit=*/true),
+      ScheduleRule::RandomComputeLocation(),
+  };
+}
+
+Array<ScheduleRule> ScheduleRule::DefaultCUDA() {
+  return {
+      ScheduleRule::MultiLevelTiling(
+          /*structure=*/"SSSRRSRS",
+          /*tile_binds=*/Array<String>{"blockIdx.x", "vthread.x", "threadIdx.x"},
+          /*max_innermost_factor=*/Integer(64),
+          /*vector_load_lens=*/Array<Integer>{1, 2, 3, 4, 8, 16},
+          /*reuse_read=*/
+          Map<String, ObjectRef>{{"req", String("must")},
+                                 {"levels", Array<Integer>{4}},  //
+                                 {"scope", String("shared")}},
+          /*reuse_write=*/
+          Map<String, ObjectRef>{{"req", String("must")},
+                                 {"levels", Array<Integer>{3}},  //
+                                 {"scope", String("local")}}),
+      ScheduleRule::AutoInline(
+          /*into_producer=*/true,
+          /*into_consumer=*/true,
+          /*inline_const_tensor=*/true,
+          /*disallow_if_then_else=*/false,
+          /*require_injective=*/false,
+          /*require_ordered=*/false,
+          /*disallow_op=*/Array<String>{}),
+      ScheduleRule::CrossThreadReduction(
+          /*thread_extents=*/Array<Integer>{4, 8, 16, 32, 64, 128, 256, 512}),
+      ScheduleRule::ParallelizeVectorizeUnroll(
+          /*max_jobs_per_core=*/-1,
+          /*max_vectorize_extent=*/-1,
+          /*unroll_max_steps=*/Array<Integer>{0, 16, 64, 512, 1024},
+          /*unroll_explicit=*/true),
+      ScheduleRule::AutoBind(
+          /*max_threadblocks=*/256,
+          /*thread_extents*/ Array<Integer>{32, 64, 128, 256, 512, 1024}),
+  };
+}
+
+Array<ScheduleRule> ScheduleRule::DefaultCUDATensorCore() {
+  Array<Map<String, String>> intrin_groups = {
+      {
+          {"init", "wmma_fill_16x16x16_f16"},
+          {"load_a", "wmma_load_16x16x16_f16_a"},
+          {"load_b", "wmma_load_16x16x16_f16_b"},
+          {"compute", "wmma_sync_16x16x16_f16f16f16"},
+          {"store", "wmma_store_16x16x16_f16_shared"},
+      },
+      {
+          {"init", "wmma_fill_16x16x16_f16"},
+          {"load_a", "wmma_load_16x16x16_f16_a"},
+          {"load_b", "wmma_load_16x16x16_f16_b_trans"},
+          {"compute", "wmma_sync_16x16x16_f16f16f16_trans"},
+          {"store", "wmma_store_16x16x16_f16_shared"},
+      },
+      {
+          {"init", "wmma_fill_16x16x16_s32"},
+          {"load_a", "wmma_load_16x16x16_s8_a"},
+          {"load_b", "wmma_load_16x16x16_s8_b"},
+          {"compute", "wmma_sync_16x16x16_s8s8s32"},
+          {"store", "wmma_store_16x16x16_s32_shared"},
+      },
+      {
+          {"init", "wmma_fill_16x16x16_s32"},
+          {"load_a", "wmma_load_16x16x16_s8_a"},
+          {"load_b", "wmma_load_16x16x16_s8_b_trans"},
+          {"compute", "wmma_sync_16x16x16_s8s8s32_trans"},
+          {"store", "wmma_store_16x16x16_s32_shared"},
+      },
+  };
+  Array<ScheduleRule> results{ScheduleRule::MultiLevelTilingTensorCore(
+      /*intrin_groups=*/intrin_groups,
+      /*structure=*/"SSSRRSRS",
+      /*tile_binds=*/Array<String>{"blockIdx.x", "vthread.x", "threadIdx.x"},
+      /*max_innermost_factor=*/Integer(4),
+      /*vector_load_lens=*/Array<Integer>{1, 2, 3, 4, 8, 16},
+      /*reuse_read=*/
+      Map<String, ObjectRef>{{"req", String("must")},
+                             {"levels", Array<Integer>{4}},  //
+                             {"scope", String("shared")}},
+      /*reuse_write=*/
+      Map<String, ObjectRef>{{"req", String("must")},
+                             {"levels", Array<Integer>{2}},  //
+                             {"scope", String("shared")}},
+      /*use_software_pipeline=*/false)};
+  Array<ScheduleRule> append = ScheduleRule::DefaultCUDA();
+  results.insert(results.end(), append.begin(), append.end());
+  return results;
+}
+
+Array<ScheduleRule> ScheduleRule::DefaultHexagon() {
+  return {
+      ScheduleRule::AutoInline(
+          /*into_producer=*/false,
+          /*into_consumer=*/true,
+          /*inline_const_tensor=*/true,
+          /*disallow_if_then_else=*/true,
+          /*require_injective=*/true,
+          /*require_ordered=*/true,
+          /*disallow_op=*/Array<String>{"tir.exp"}),
+      ScheduleRule::MultiLevelTilingWideVector(
+          /*structure=*/"SRSRS",
+          /*vector_length_in_bits=*/1024,
+          /*max_innermost_factor=*/Integer(128),
+          /*reuse_read=*/NullOpt,
+          /*reuse_write=*/
+          Map<String, ObjectRef>{{"req", String("may")},
+                                 {"levels", Array<Integer>{1, 2}},
+                                 {"scope", String("global")}}),
+      ScheduleRule::ParallelizeVectorizeUnroll(
+          /*max_jobs_per_core=*/16,
+          /*max_vectorize_extent=*/128,
+          /*unroll_max_steps=*/Array<Integer>{0, 16, 64, 512},
+          /*unroll_explicit=*/true),
+  };
+}
+
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<PyScheduleRuleNode>([](const ObjectRef& n, ReprPrinter* p) {
       const auto* self = n.as<PyScheduleRuleNode>();
@@ -71,6 +217,14 @@ TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleClone")
     .set_body_method<ScheduleRule>(&ScheduleRuleNode::Clone);
 TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRulePyScheduleRule")
     .set_body_typed(ScheduleRule::PyScheduleRule);
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleDefaultLLVM")
+    .set_body_typed(ScheduleRule::DefaultLLVM);
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleDefaultCUDA")
+    .set_body_typed(ScheduleRule::DefaultCUDA);
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleDefaultCUDATensorCore")
+    .set_body_typed(ScheduleRule::DefaultCUDATensorCore);
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleDefaultHexagon")
+    .set_body_typed(ScheduleRule::DefaultHexagon);
 
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/search_strategy/evolutionary_search.cc b/src/meta_schedule/search_strategy/evolutionary_search.cc
index 5930704eb0d1..df67d371929b 100644
--- a/src/meta_schedule/search_strategy/evolutionary_search.cc
+++ b/src/meta_schedule/search_strategy/evolutionary_search.cc
@@ -238,14 +238,18 @@ class EvolutionarySearchNode : public SearchStrategyNode {
   struct State {
     /*! \brief The search strategy itself */
     EvolutionarySearchNode* self;
-    /*! \brief The design spaces. Decisions are not used so traces only. */
-    Array<tir::Trace> design_spaces;
+    /*! \brief The number of total trials. */
+    int max_trials;
+    /*! \brief The number of trials per iteration. */
+    int num_trials_per_iter;
     /*! \brief `[st, ed)` are the indices of the next batch of candidates. */
     int st;
     /*! \brief `[st, ed)` are the indices of the next batch of candidates. */
     int ed;
     /*! \brief The counter of returning empty results. */
     int num_empty_iters;
+    /*! \brief The design spaces. Decisions are not used so traces only. */
+    Array<tir::Trace> design_spaces;
     /*! \brief Pre thread data including module to be tuned and random state. */
     std::vector<PerThreadData> per_thread_data_;
     /*!
@@ -260,14 +264,19 @@ class EvolutionarySearchNode : public SearchStrategyNode {
     /*! \brief The token registered for the given workload in database. */
     Workload token_{nullptr};
 
-    explicit State(EvolutionarySearchNode* self, Array<tir::Trace> design_spaces, Database database,
-                   CostModel cost_model)
+    explicit State(EvolutionarySearchNode* self, int max_trials, int num_trials_per_iter,
+                   Array<Schedule> design_space_schedules, Database database, CostModel cost_model)
         : self(self),
-          design_spaces(design_spaces),
+          max_trials(max_trials),
+          num_trials_per_iter(num_trials_per_iter),
           st(0),
-          ed(self->num_trials_per_iter),
+          ed(num_trials_per_iter),
           num_empty_iters(0) {
-      const TuneContextNode* ctx = self->context_;
+      design_spaces.reserve(design_spaces.size());
+      for (const Schedule& space : design_space_schedules) {
+        design_spaces.push_back(space->trace().value()->Simplified(true));
+      }
+      const TuneContextNode* ctx = self->ctx_;
       IRModule mod = ctx->mod.value();
       this->per_thread_data_.resize(ctx->num_threads);
       for (PerThreadData& data : this->per_thread_data_) {
@@ -316,17 +325,17 @@ class EvolutionarySearchNode : public SearchStrategyNode {
   };
 
   /*! \brief The tuning context of the evolutionary search strategy. */
-  const TuneContextNode* context_{nullptr};
+  const TuneContextNode* ctx_{nullptr};
+  /*! \brief The postprocessors */
+  Array<Postproc> postprocs_;
+  /*! \brief The mutators and their probability. */
+  Map<Mutator, FloatImm> mutator_probs_;
   /*! \brief The random state. To be initialized with TuneContext. */
   TRandState rand_state_;
   /*! \brief The state of the search strategy. */
   std::unique_ptr<State> state_ = nullptr;
 
   /*** Configuration: global ***/
-  /*! \brief The number of trials per iteration. */
-  int num_trials_per_iter;
-  /*! \brief The number of total trials. */
-  int max_trials_per_task;
   /*! \brief The population size in the evolutionary search. */
   int population_size;
   /*!
@@ -356,8 +365,6 @@ class EvolutionarySearchNode : public SearchStrategyNode {
     // `state_` is not visited
 
     /*** Configuration: global ***/
-    v->Visit("max_trials_per_task", &max_trials_per_task);
-    v->Visit("num_trials_per_iter", &num_trials_per_iter);
     v->Visit("population_size", &population_size);
     v->Visit("num_empty_iters_before_early_stop", &num_empty_iters_before_early_stop);
     /*** Configuration: the initial population ***/
@@ -374,23 +381,25 @@ class EvolutionarySearchNode : public SearchStrategyNode {
   static constexpr const char* _type_key = "meta_schedule.EvolutionarySearch";
   TVM_DECLARE_FINAL_OBJECT_INFO(EvolutionarySearchNode, SearchStrategyNode);
 
-  void InitializeWithTuneContext(const TuneContext& context) final {
-    CHECK(context.defined()) << "TuneContext must be defined!";
-    CHECK(context->num_threads > 0) << "Number of threads has to be larger than 0.";
-    CHECK(context->target.defined()) << "Target must be defined!";
-    this->context_ = context.get();
-    this->rand_state_ = ForkSeed(&context->rand_state);
-    for (const auto& kv : context->mutator_probs) {
-      double mass = kv.second->value;
-      TVM_META_SCHEDULE_CHECK_PROB_RANGE(mass, "mutator_probs");
-    }
+  void InitializeWithTuneContext(const TuneContext& ctx) final {
+    CHECK(ctx->num_threads > 0) << "ValueError: `TuneContext.num_threads` must be > 0";
+    CHECK(ctx->space_generator.defined())
+        << "ValueError: `TuneContext.space_generator` must be defined";
+    CHECK(ctx->space_generator.value()->postprocs.defined())
+        << "ValueError: `TuneContext.space_generator.postprocs` must be defined";
+    CHECK(ctx->space_generator.value()->mutator_probs.defined())
+        << "ValueError: `TuneContext.space_generator.mutator_probs` must be defined";
+    this->ctx_ = ctx.get();
+    this->postprocs_ = ctx->space_generator.value()->postprocs.value();
+    this->mutator_probs_ = ctx->space_generator.value()->mutator_probs.value();
+    this->rand_state_ = ForkSeed(&ctx->rand_state);
     this->state_.reset();
   }
 
-  void PreTuning(const Array<Schedule>& design_spaces, const Optional<Database>& database,
-                 const Optional<CostModel>& cost_model) final {
+  void PreTuning(int max_trials, int num_trials_per_iter, const Array<Schedule>& design_spaces,
+                 const Optional<Database>& database, const Optional<CostModel>& cost_model) final {
     ICHECK(!design_spaces.empty());
-    CHECK(this->context_ != nullptr) << "ValueError: Did you forget to initialize the TuneContext?";
+    CHECK(this->ctx_ != nullptr) << "ValueError: Did you forget to initialize the TuneContext?";
     CHECK(database.defined())
         << "ValueError: Database is not supplied in PreTuning. Evolutionary"
            "search algorithm requires a database to be present, so that it "
@@ -401,23 +410,15 @@ class EvolutionarySearchNode : public SearchStrategyNode {
            "algorithm expects a cost model to filter out potentially less efficient kernels. If "
            "you do not expect a cost model to help, please use "
            "`tvm.meta_schedule.cost_model.RandomModel`";
-    if (this->state_ != nullptr) {
-      TVM_PY_LOG(WARNING, this->context_->logging_func)
-          << "EvolutionarySearch is already initialized.";
-      this->state_.reset();
-    }
-    ICHECK(this->state_ == nullptr);
-    Array<tir::Trace> design_space_traces;
-    design_space_traces.reserve(design_spaces.size());
-    for (const Schedule& space : design_spaces) {
-      design_space_traces.push_back(space->trace().value()->Simplified(true));
-    }
-    this->state_ =
-        std::make_unique<State>(this, design_space_traces, database.value(), cost_model.value());
+    CHECK(this->state_ == nullptr)
+        << "ValueError: `PreTuning` is already invoked without corresponding `PostTuning`.";
+    this->state_ = std::make_unique<State>(this, max_trials, num_trials_per_iter, design_spaces,
+                                           database.value(), cost_model.value());
   }
 
   void PostTuning() final {
-    ICHECK(this->state_ != nullptr);
+    CHECK(this->state_ != nullptr) << "ValueError: `PostTuning` is invoked without corresponding "
+                                      "`PreTuning`, or `PostTuning` is already invoked.";
     this->state_.reset();
   }
 
@@ -434,8 +435,6 @@ class EvolutionarySearchNode : public SearchStrategyNode {
 
   SearchStrategy Clone() const final {
     ObjectPtr<EvolutionarySearchNode> n = make_object<EvolutionarySearchNode>();
-    n->max_trials_per_task = this->max_trials_per_task;
-    n->num_trials_per_iter = this->num_trials_per_iter;
     n->population_size = this->population_size;
     n->num_empty_iters_before_early_stop = this->num_empty_iters_before_early_stop;
     n->init_measured_ratio = this->init_measured_ratio;
@@ -444,7 +443,7 @@ class EvolutionarySearchNode : public SearchStrategyNode {
     n->genetic_mutate_prob = this->genetic_mutate_prob;
     n->genetic_max_fail_count = this->genetic_max_fail_count;
     n->eps_greedy = this->eps_greedy;
-    n->context_ = this->context_;
+    n->ctx_ = this->ctx_;
     n->rand_state_ = this->rand_state_;
     n->state_ = nullptr;  // cleared the state
     return SearchStrategy(n);
@@ -460,7 +459,7 @@ std::vector<Schedule> EvolutionarySearchNode::State::PickBestFromDatabase(int nu
     measured_traces.push_back(record->trace);
   }
   int actual_num = measured_traces.size();
-  ThreadedTraceApply pp(self->context_->postprocs);
+  ThreadedTraceApply pp(self->postprocs_);
   std::vector<Schedule> results(actual_num, Schedule{nullptr});
   auto f_proc_measured = [this, &measured_traces, &results, &pp](int thread_id,
                                                                  int trace_id) -> void {
@@ -477,13 +476,13 @@ std::vector<Schedule> EvolutionarySearchNode::State::PickBestFromDatabase(int nu
       throw;
     }
   };
-  support::parallel_for_dynamic(0, actual_num, self->context_->num_threads, f_proc_measured);
+  support::parallel_for_dynamic(0, actual_num, self->ctx_->num_threads, f_proc_measured);
   return results;
 }
 
 std::vector<Schedule> EvolutionarySearchNode::State::SampleInitPopulation(int num) {
   auto _ = Profiler::TimedScope("EvoSearch/SampleInitPopulation");
-  ThreadedTraceApply pp(self->context_->postprocs);
+  ThreadedTraceApply pp(self->postprocs_);
   std::vector<Schedule> out_schs;
   while (static_cast<int>(out_schs.size()) < self->init_min_unmeasured) {
     std::vector<Schedule> results(num, Schedule{nullptr});
@@ -499,14 +498,14 @@ std::vector<Schedule> EvolutionarySearchNode::State::SampleInitPopulation(int nu
         result = sch.value();
       }
     };
-    support::parallel_for_dynamic(0, num, self->context_->num_threads, f_proc_unmeasured);
+    support::parallel_for_dynamic(0, num, self->ctx_->num_threads, f_proc_unmeasured);
     for (int i = 0; i < num; i++) {
       if (results[i].defined()) {
         out_schs.push_back(results[i]);
       }
     }
-    TVM_PY_LOG(INFO, self->context_->logging_func) << "Sample-Init-Population summary:\n"
-                                                   << pp.SummarizeFailures();
+    TVM_PY_LOG(INFO, self->ctx_->logger) << "Sample-Init-Population summary:\n"
+                                         << pp.SummarizeFailures();
   }
   return out_schs;
 }
@@ -524,7 +523,7 @@ std::vector<Schedule> EvolutionarySearchNode::State::EvolveWithCostModel(
   for (int iter = 0;; ++iter) {
     // Predict normalized score with the cost model,
     std::vector<double> scores =
-        PredictNormalizedScore(population, GetRef<TuneContext>(self->context_), this->cost_model_);
+        PredictNormalizedScore(population, GetRef<TuneContext>(self->ctx_), this->cost_model_);
 
     {
       auto _ = Profiler::TimedScope("EvoSearch/Evolve/Misc");
@@ -545,12 +544,12 @@ std::vector<Schedule> EvolutionarySearchNode::State::EvolveWithCostModel(
       }
       // Set threaded samplers, with probability from predicated normalized throughput
       for (PerThreadData& data : this->per_thread_data_) {
-        data.Set(scores, self->genetic_mutate_prob, self->context_->mutator_probs);
+        data.Set(scores, self->genetic_mutate_prob, self->mutator_probs_);
       }
     }
     {
       auto _ = Profiler::TimedScope("EvoSearch/Evolve/Mutation");
-      ThreadedTraceApply pp(self->context_->postprocs);
+      ThreadedTraceApply pp(self->postprocs_);
       ConcurrentBitmask cbmask(self->population_size);
       std::vector<Schedule> next_population(self->population_size, Schedule{nullptr});
       // The worker function
@@ -589,13 +588,12 @@ std::vector<Schedule> EvolutionarySearchNode::State::EvolveWithCostModel(
           result = population.at(sampled_trace_id);
         }
       };
-      support::parallel_for_dynamic(0, self->population_size, self->context_->num_threads,
+      support::parallel_for_dynamic(0, self->population_size, self->ctx_->num_threads,
                                     f_find_candidate);
 
       population.swap(next_population);
-      TVM_PY_LOG(INFO, self->context_->logging_func)
-          << "Evolve iter #" << iter << " done. Summary:\n"
-          << pp.SummarizeFailures();
+      TVM_PY_LOG(INFO, self->ctx_->logger) << "Evolve iter #" << iter << " done. Summary:\n"
+                                           << pp.SummarizeFailures();
     }
   }
   // Return the best states from the heap, sorting from higher score to lower ones
@@ -622,7 +620,7 @@ std::vector<Schedule> EvolutionarySearchNode::State::EvolveWithCostModel(
         os << std::fixed << std::setprecision(4) << heap.heap.at(i).score;
       }
     }
-    TVM_PY_LOG(INFO, self->context_->logging_func)
+    TVM_PY_LOG(INFO, self->ctx_->logger)
         << "Scores of the best " << n << " candidates:" << os.str();
     return results;
   }
@@ -673,33 +671,32 @@ std::vector<Schedule> EvolutionarySearchNode::State::PickWithEpsGreedy(
 }
 
 Optional<Array<MeasureCandidate>> EvolutionarySearchNode::State::GenerateMeasureCandidates() {
-  if (st >= self->max_trials_per_task) {
+  if (st >= max_trials) {
     return NullOpt;
   }
-  int sample_num = self->num_trials_per_iter;
-  if (ed > self->max_trials_per_task) {
-    sample_num = self->max_trials_per_task - st;
-    ed = self->max_trials_per_task;
+  int sample_num = num_trials_per_iter;
+  if (ed > max_trials) {
+    sample_num = max_trials - st;
+    ed = max_trials;
   }
   ICHECK_LT(st, ed);
   int pop = self->population_size;
   std::vector<Schedule> inits;
   inits.reserve(pop);
 
-  TVM_PY_LOG(INFO, self->context_->logging_func) << "Generating candidates......";
+  TVM_PY_LOG(INFO, self->ctx_->logger) << "Generating candidates......";
   std::vector<Schedule> measured = PickBestFromDatabase(pop * self->init_measured_ratio);
-  TVM_PY_LOG(INFO, self->context_->logging_func)
+  TVM_PY_LOG(INFO, self->ctx_->logger)
       << "Picked top " << measured.size() << " candidate(s) from database";
   std::vector<Schedule> unmeasured = SampleInitPopulation(pop - measured.size());
-  TVM_PY_LOG(INFO, self->context_->logging_func)
-      << "Sampled " << unmeasured.size() << " candidate(s)";
+  TVM_PY_LOG(INFO, self->ctx_->logger) << "Sampled " << unmeasured.size() << " candidate(s)";
   inits.insert(inits.end(), measured.begin(), measured.end());
   inits.insert(inits.end(), unmeasured.begin(), unmeasured.end());
   std::vector<Schedule> bests = EvolveWithCostModel(inits, sample_num);
-  TVM_PY_LOG(INFO, self->context_->logging_func)
+  TVM_PY_LOG(INFO, self->ctx_->logger)
       << "Got " << bests.size() << " candidate(s) with evolutionary search";
   std::vector<Schedule> picks = PickWithEpsGreedy(unmeasured, bests, sample_num);
-  TVM_PY_LOG(INFO, self->context_->logging_func)
+  TVM_PY_LOG(INFO, self->ctx_->logger)
       << "Sending " << picks.size() << " candidates(s) for measurement";
   if (picks.empty()) {
     ++this->num_empty_iters;
@@ -716,9 +713,7 @@ void EvolutionarySearchNode::State::NotifyRunnerResults(
   ed += results.size();
 }
 
-SearchStrategy SearchStrategy::EvolutionarySearch(int num_trials_per_iter,     //
-                                                  int max_trials_per_task,     //
-                                                  int population_size,         //
+SearchStrategy SearchStrategy::EvolutionarySearch(int population_size,         //
                                                   double init_measured_ratio,  //
                                                   int init_min_unmeasured,     //
                                                   int genetic_num_iters,       //
@@ -729,8 +724,6 @@ SearchStrategy SearchStrategy::EvolutionarySearch(int num_trials_per_iter,     /
   TVM_META_SCHEDULE_CHECK_PROB_RANGE(genetic_mutate_prob, "Mutation probability");
   TVM_META_SCHEDULE_CHECK_PROB_RANGE(eps_greedy, "Greedy pick probability");
   ObjectPtr<EvolutionarySearchNode> n = make_object<EvolutionarySearchNode>();
-  n->num_trials_per_iter = num_trials_per_iter;
-  n->max_trials_per_task = max_trials_per_task;
   n->population_size = population_size;
   n->num_empty_iters_before_early_stop = 5;
   n->init_measured_ratio = init_measured_ratio;
diff --git a/src/meta_schedule/search_strategy/replay_func.cc b/src/meta_schedule/search_strategy/replay_func.cc
index 6914ab2f0f0a..7bb4a02ab299 100644
--- a/src/meta_schedule/search_strategy/replay_func.cc
+++ b/src/meta_schedule/search_strategy/replay_func.cc
@@ -28,65 +28,69 @@ class ReplayFuncNode : public SearchStrategyNode {
   struct State {
     /*! \brief The search strategy itself */
     ReplayFuncNode* self;
+    /*! \brief The number of total trials. */
+    int max_trials;
+    /*! \brief The number of trials per iteration. */
+    int num_trials_per_iter;
     /*! \brief `[st, ed)` are the indices of the next batch of candidates. */
     int st;
     /*! \brief `[st, ed)` are the indices of the next batch of candidates. */
     int ed;
 
-    explicit State(ReplayFuncNode* self) : self(self), st(0), ed(self->num_trials_per_iter) {
-      const TuneContextNode* ctx = self->context_;
-      ICHECK(ctx);
+    explicit State(ReplayFuncNode* self, int max_trials, int num_trials_per_iter)
+        : self(self),
+          max_trials(max_trials),
+          num_trials_per_iter(num_trials_per_iter),
+          st(0),
+          ed(num_trials_per_iter) {
+      CHECK(self->mod_.defined() && self->space_generator_.defined())
+          << "ValueError: The search strategy has not been initialized.";
     }
 
     inline Optional<Array<MeasureCandidate>> GenerateMeasureCandidates();
     inline void NotifyRunnerResults(const Array<RunnerResult>& results);
   };
 
-  /*! \brief The number of trials per iteration. */
-  int num_trials_per_iter;
-  /*! \brief The number of total trials. */
-  int max_trials_per_task;
-
-  /*! \brief The tuning context of the search strategy. */
-  const TuneContextNode* context_{nullptr};
   /*! \brief The random state. -1 means using random number. */
   TRandState rand_state_ = -1;
+  /*! \brief The IRModule to be scheduled from TuneContext. */
+  Optional<IRModule> mod_ = NullOpt;
+  /*! \brief The space generator from TuneContext. */
+  Optional<SpaceGenerator> space_generator_ = NullOpt;
   /*! \brief The state of the search strategy. */
   std::unique_ptr<State> state_ = nullptr;
 
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("num_trials_per_iter", &num_trials_per_iter);
-    v->Visit("max_trials_per_task", &max_trials_per_task);
-    // `context_` is not visited.
-    // `rand_state_` is not visited
-    // `state_` is not visited
-  }
+  void VisitAttrs(tvm::AttrVisitor* v) {}
 
   static constexpr const char* _type_key = "meta_schedule.ReplayFunc";
   TVM_DECLARE_FINAL_OBJECT_INFO(ReplayFuncNode, SearchStrategyNode);
 
-  void InitializeWithTuneContext(const TuneContext& context) final {
-    CHECK(context->space_generator.defined())
+  void InitializeWithTuneContext(const TuneContext& ctx) final {
+    CHECK(ctx->mod.defined()) << "ValueError: TuneContext.mod is not defined";
+    CHECK(ctx->space_generator.defined())
         << "ValueError: TuneContext.space_generator is not defined";
-    CHECK(context->mod.defined()) << "ValueError: TuneContext.mod is not defined";
-    this->context_ = context.get();
-    this->rand_state_ = ForkSeed(&context->rand_state);
+    if (!ctx->space_generator.value()->postprocs.defined()) {
+      TVM_PY_LOG(WARNING, ctx->logger)
+          << "`postprocs` is not defined in " << ctx->space_generator.value()
+          << ". Please explicitly set `postprocs` to an empty list if you don't want to "
+             "apply any post-processing.";
+    }
+    this->rand_state_ = ForkSeed(&ctx->rand_state);
+    this->mod_ = ctx->mod;
+    this->space_generator_ = ctx->space_generator;
     this->state_.reset();
   }
 
-  void PreTuning(const Array<tir::Schedule>& design_spaces, const Optional<Database>& database,
-                 const Optional<CostModel>& cost_model) final {
-    CHECK(this->context_ != nullptr) << "ValueError: Did you forget to initialize the TuneContext?";
-    if (this->state_ != nullptr) {
-      TVM_PY_LOG(WARNING, this->context_->logging_func) << "ReplayFunc is already initialized.";
-      this->state_.reset();
-    }
-    ICHECK(this->state_ == nullptr);
-    this->state_ = std::make_unique<State>(this);
+  void PreTuning(int max_trials, int num_trials_per_iter, const Array<tir::Schedule>& design_spaces,
+                 const Optional<Database>& database, const Optional<CostModel>& cost_model) final {
+    CHECK(this->state_ == nullptr)
+        << "ValueError: `PreTuning` is already invoked without corresponding `PostTuning`.";
+    this->state_ = std::make_unique<State>(this, max_trials, num_trials_per_iter);
   }
 
   void PostTuning() final {
-    ICHECK(this->state_ != nullptr);
+    CHECK(this->state_ != nullptr) << "ValueError: `PostTuning` is invoked without corresponding "
+                                      "`PreTuning`, or `PostTuning` is already invoked.";
     this->state_.reset();
   }
 
@@ -103,32 +107,30 @@ class ReplayFuncNode : public SearchStrategyNode {
 
   SearchStrategy Clone() const final {
     ObjectPtr<ReplayFuncNode> n = make_object<ReplayFuncNode>();
-    n->num_trials_per_iter = this->num_trials_per_iter;
-    n->max_trials_per_task = this->max_trials_per_task;
-    n->context_ = this->context_;
-    n->rand_state_ = this->rand_state_;
-    n->state_ = nullptr;  // cleared the state
+    n->rand_state_ = -1;
+    n->mod_ = NullOpt;
+    n->space_generator_ = NullOpt;
+    n->state_ = nullptr;
     return SearchStrategy(n);
   }
 };
 
 inline Optional<Array<MeasureCandidate>> ReplayFuncNode::State::GenerateMeasureCandidates() {
-  if (st >= self->max_trials_per_task) {
+  if (st >= max_trials) {
     return NullOpt;
   }
-  ed = std::min(ed, self->max_trials_per_task);
+  ed = std::min(ed, max_trials);
   Array<MeasureCandidate> result;
-  const TuneContextNode* ctx = self->context_;
-  ICHECK(ctx);
-  IRModule mod = ctx->mod.value();
+  IRModule mod = self->mod_.value();
+  Array<Postproc> postprocs = self->space_generator_.value()->postprocs.value_or({});
   for (int i = st; i < ed; i++) {
     for (;;) {
-      Array<tir::Schedule> schs = ctx->space_generator.value()->GenerateDesignSpace(mod);
+      Array<tir::Schedule> schs = self->space_generator_.value()->GenerateDesignSpace(mod);
       int design_space_index = tir::SampleInt(&self->rand_state_, 0, schs.size());
       tir::Schedule sch = schs[design_space_index];
       sch->EnterPostproc();
       bool failed = false;
-      for (const Postproc& proc : ctx->postprocs) {
+      for (const Postproc& proc : postprocs) {
         if (!proc->Apply(sch)) {
           failed = true;
           break;
@@ -145,14 +147,12 @@ inline Optional<Array<MeasureCandidate>> ReplayFuncNode::State::GenerateMeasureC
 }
 
 inline void ReplayFuncNode::State::NotifyRunnerResults(const Array<RunnerResult>& results) {
-  st += self->num_trials_per_iter;
-  ed += self->num_trials_per_iter;
+  st += num_trials_per_iter;
+  ed += num_trials_per_iter;
 }
 
-SearchStrategy SearchStrategy::ReplayFunc(int num_trials_per_iter, int max_trials_per_task) {
+SearchStrategy SearchStrategy::ReplayFunc() {
   ObjectPtr<ReplayFuncNode> n = make_object<ReplayFuncNode>();
-  n->num_trials_per_iter = num_trials_per_iter;
-  n->max_trials_per_task = max_trials_per_task;
   return SearchStrategy(n);
 }
 
diff --git a/src/meta_schedule/search_strategy/replay_trace.cc b/src/meta_schedule/search_strategy/replay_trace.cc
index bd553bf037d1..d76ee220a858 100644
--- a/src/meta_schedule/search_strategy/replay_trace.cc
+++ b/src/meta_schedule/search_strategy/replay_trace.cc
@@ -30,6 +30,10 @@ class ReplayTraceNode : public SearchStrategyNode {
     ReplayTraceNode* self;
     /*! \brief The design spaces. */
     Array<tir::Trace> design_spaces;
+    /*! \brief The number of total trials. */
+    int max_trials;
+    /*! \brief The number of trials per iteration. */
+    int num_trials_per_iter;
     /*! \brief `[st, ed)` are the indices of the next batch of candidates. */
     int st;
     /*! \brief `[st, ed)` are the indices of the next batch of candidates. */
@@ -38,13 +42,17 @@ class ReplayTraceNode : public SearchStrategyNode {
     /*! \brief The module to be tuned. */
     Array<IRModule> per_thread_mod_{nullptr};
 
-    explicit State(ReplayTraceNode* self, Array<tir::Trace> design_spaces)
-        : self(self), design_spaces(design_spaces), st(0), ed(self->num_trials_per_iter) {
-      const TuneContextNode* ctx = self->context_;
-      ICHECK(ctx);
-      IRModule mod = ctx->mod.value();
-      this->per_thread_mod_.reserve(ctx->num_threads);
-      for (int i = 0; i < ctx->num_threads; i++) {
+    explicit State(ReplayTraceNode* self, Array<tir::Trace> design_spaces, int max_trials,
+                   int num_trials_per_iter)
+        : self(self),
+          design_spaces(design_spaces),
+          max_trials(max_trials),
+          num_trials_per_iter(num_trials_per_iter),
+          st(0),
+          ed(num_trials_per_iter) {
+      IRModule mod = self->mod_.value();
+      this->per_thread_mod_.reserve(self->num_threads_);
+      for (int i = 0; i < self->num_threads_; i++) {
         this->per_thread_mod_.push_back(DeepCopyIRModule(mod));
       }
     }
@@ -53,54 +61,61 @@ class ReplayTraceNode : public SearchStrategyNode {
     inline void NotifyRunnerResults(const Array<RunnerResult>& results);
   };
 
-  /*! \brief The number of trials per iteration. */
-  int num_trials_per_iter;
-  /*! \brief The number of total trials. */
-  int max_trials_per_task;
   /*! \brief The max number of failures during trace replaying. */
   int max_fail_count;
 
-  /*! \brief The tuning context of the search strategy. */
-  const TuneContextNode* context_{nullptr};
   /*! \brief The random state. -1 means using random number. */
   TRandState rand_state_ = -1;
+  /*! \brief The IRModule to be scheduled from TuneContext. */
+  Optional<IRModule> mod_ = NullOpt;
+  /*! \brief The number of threads to be used. */
+  int num_threads_ = -1;
+  /*! \brief The postprocessors. */
+  Array<Postproc> postprocs_ = {};
   /*! \brief The state of the search strategy. */
   std::unique_ptr<State> state_ = nullptr;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("num_trials_per_iter", &num_trials_per_iter);
-    v->Visit("max_trials_per_task", &max_trials_per_task);
     v->Visit("max_fail_count", &max_fail_count);
-    // `context_` is not visited.
     // `rand_state_` is not visited
+    // `mod_` is not visited
+    // `num_threads_` is not visited
+    // `postprocs_` is not visited
     // `state_` is not visited
   }
 
   static constexpr const char* _type_key = "meta_schedule.ReplayTrace";
   TVM_DECLARE_FINAL_OBJECT_INFO(ReplayTraceNode, SearchStrategyNode);
 
-  void InitializeWithTuneContext(const TuneContext& context) final {
-    CHECK(context->mod.defined()) << "ValueError: TuneContext.mod is not defined";
-    this->context_ = context.get();
-    this->rand_state_ = ForkSeed(&context->rand_state);
+  void InitializeWithTuneContext(const TuneContext& ctx) final {
+    CHECK(ctx->mod.defined()) << "ValueError: TuneContext.mod is not defined";
+    CHECK(ctx->space_generator.defined())
+        << "ValueError: TuneContext.space_generator is not defined";
+    if (!ctx->space_generator.value()->postprocs.defined()) {
+      TVM_PY_LOG(WARNING, ctx->logger)
+          << "`postprocs` is not defined in " << ctx->space_generator.value()
+          << ". Please explicitly set `postprocs` to an empty list if you don't want to "
+             "apply any post-processing.";
+    }
+    this->rand_state_ = ForkSeed(&ctx->rand_state);
+    this->mod_ = ctx->mod;
+    this->num_threads_ = ctx->num_threads;
+    this->postprocs_ = ctx->space_generator.value()->postprocs.value_or({});
     this->state_.reset();
   }
 
-  void PreTuning(const Array<tir::Schedule>& design_spaces, const Optional<Database>& database,
-                 const Optional<CostModel>& cost_model) final {
+  void PreTuning(int max_trials, int num_trials_per_iter, const Array<tir::Schedule>& design_spaces,
+                 const Optional<Database>& database, const Optional<CostModel>& cost_model) final {
     ICHECK(!design_spaces.empty());
-    CHECK(this->context_ != nullptr) << "ValueError: Did you forget to initialize the TuneContext?";
-    if (this->state_ != nullptr) {
-      TVM_PY_LOG(WARNING, this->context_->logging_func) << "RelayTrace is already initialized.";
-      this->state_.reset();
-    }
-    ICHECK(this->state_ == nullptr);
+    CHECK(this->state_ == nullptr)
+        << "ValueError: `PreTuning` is already invoked without corresponding `PostTuning`.";
     Array<tir::Trace> design_space_traces;
     design_space_traces.reserve(design_spaces.size());
     for (const tir::Schedule& space : design_spaces) {
       design_space_traces.push_back(space->trace().value()->Simplified(true));
     }
-    this->state_ = std::make_unique<State>(this, design_space_traces);
+    this->state_ =
+        std::make_unique<State>(this, design_space_traces, max_trials, num_trials_per_iter);
   }
 
   void PostTuning() final {
@@ -121,10 +136,7 @@ class ReplayTraceNode : public SearchStrategyNode {
 
   SearchStrategy Clone() const final {
     ObjectPtr<ReplayTraceNode> n = make_object<ReplayTraceNode>();
-    n->num_trials_per_iter = this->num_trials_per_iter;
-    n->max_trials_per_task = this->max_trials_per_task;
     n->max_fail_count = this->max_fail_count;
-    n->context_ = this->context_;
     n->rand_state_ = this->rand_state_;
     n->state_ = nullptr;  // cleared the state
     return SearchStrategy(n);
@@ -132,16 +144,14 @@ class ReplayTraceNode : public SearchStrategyNode {
 };
 
 inline Optional<Array<MeasureCandidate>> ReplayTraceNode::State::GenerateMeasureCandidates() {
-  if (st >= self->max_trials_per_task) {
+  if (st >= max_trials) {
     return NullOpt;
   }
-  ed = std::min(ed, self->max_trials_per_task);
+  ed = std::min(ed, max_trials);
   ICHECK_LT(st, ed);
-  const TuneContextNode* ctx = self->context_;
-  ICHECK(ctx);
-  std::vector<TRandState> per_thread_rand_state = ForkSeed(&self->rand_state_, ctx->num_threads);
+  std::vector<TRandState> per_thread_rand_state = ForkSeed(&self->rand_state_, self->num_threads_);
   Array<MeasureCandidate> per_task_result(ed - st, MeasureCandidate{nullptr});
-  ThreadedTraceApply pp(ctx->postprocs);
+  ThreadedTraceApply pp(self->postprocs_);
   auto f_worker = [this, &per_thread_rand_state, &per_task_result, &pp](int thread_id,
                                                                         int task_id) -> void {
     TRandState& rand_state = per_thread_rand_state[thread_id];
@@ -159,7 +169,7 @@ inline Optional<Array<MeasureCandidate>> ReplayTraceNode::State::GenerateMeasure
       }
     }
   };
-  support::parallel_for_dynamic(0, ed - st, ctx->num_threads, f_worker);
+  support::parallel_for_dynamic(0, ed - st, self->num_threads_, f_worker);
   Array<MeasureCandidate> filtered;
   filtered.reserve(ed - st);
   for (MeasureCandidate result : per_task_result)
@@ -170,15 +180,12 @@ inline Optional<Array<MeasureCandidate>> ReplayTraceNode::State::GenerateMeasure
 }
 
 inline void ReplayTraceNode::State::NotifyRunnerResults(const Array<RunnerResult>& results) {
-  st += self->num_trials_per_iter;
-  ed += self->num_trials_per_iter;
+  st += num_trials_per_iter;
+  ed += num_trials_per_iter;
 }
 
-SearchStrategy SearchStrategy::ReplayTrace(int num_trials_per_iter, int max_trials_per_task,
-                                           int max_fail_count) {
+SearchStrategy SearchStrategy::ReplayTrace(int max_fail_count) {
   ObjectPtr<ReplayTraceNode> n = make_object<ReplayTraceNode>();
-  n->num_trials_per_iter = num_trials_per_iter;
-  n->max_trials_per_task = max_trials_per_task;
   n->max_fail_count = max_fail_count;
   return SearchStrategy(n);
 }
diff --git a/src/meta_schedule/search_strategy/search_strategy.cc b/src/meta_schedule/search_strategy/search_strategy.cc
index 81c7fda315b4..641457226d11 100644
--- a/src/meta_schedule/search_strategy/search_strategy.cc
+++ b/src/meta_schedule/search_strategy/search_strategy.cc
@@ -34,11 +34,12 @@ void PySearchStrategyNode::InitializeWithTuneContext(const TuneContext& context)
   f_initialize_with_tune_context(context);
 }
 
-void PySearchStrategyNode::PreTuning(const Array<tir::Schedule>& design_spaces,
+void PySearchStrategyNode::PreTuning(int max_trials, int num_trials_per_iter,
+                                     const Array<tir::Schedule>& design_spaces,
                                      const Optional<Database>& database,
                                      const Optional<CostModel>& cost_model) {
   ICHECK(f_pre_tuning != nullptr) << "PySearchStrategy's PreTuning method not implemented!";
-  f_pre_tuning(design_spaces, database, cost_model);
+  f_pre_tuning(max_trials, num_trials_per_iter, design_spaces, database, cost_model);
 }
 
 void PySearchStrategyNode::PostTuning() {
diff --git a/src/meta_schedule/space_generator/post_order_apply.cc b/src/meta_schedule/space_generator/post_order_apply.cc
index 991e4fa08047..8eb2760dc791 100644
--- a/src/meta_schedule/space_generator/post_order_apply.cc
+++ b/src/meta_schedule/space_generator/post_order_apply.cc
@@ -89,31 +89,27 @@ class BlockCollector : public tir::StmtVisitor {
  * */
 class PostOrderApplyNode : public SpaceGeneratorNode {
  public:
-  /*! \brief The random state. -1 means using random number. */
-  TRandState rand_state_ = -1;
-  /*! \brief The schedule rules to be applied in order. */
-  Array<ScheduleRule> sch_rules_{nullptr};
-  /*! \brief The logging function to use. */
-  PackedFunc logging_func;
-  /*! \brief Optional block names to target. If not specified all blocks will have spaces generated.
+  /*!
+   * \brief Optional block names to target. If not specified all blocks will have spaces generated.
    */
   runtime::PackedFunc f_block_filter_ = nullptr;
+  /*! \brief The random state. -1 means using random number. */
+  TRandState rand_state_ = -1;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
+    SpaceGeneratorNode::VisitAttrs(v);
     // `rand_state_` is not visited
     // `sch_rules_` is not visited
   }
 
   void InitializeWithTuneContext(const TuneContext& context) final {
+    SpaceGeneratorNode::InitializeWithTuneContext(context);
     this->rand_state_ = ForkSeed(&context->rand_state);
-    CHECK(context->sch_rules.defined())
-        << "ValueError: Schedules rules not given in PostOrderApply!";
-    this->sch_rules_ = context->sch_rules;
-    this->logging_func = context->logging_func;
   }
 
   Array<tir::Schedule> GenerateDesignSpace(const IRModule& mod) final {
     using ScheduleAndUnvisitedBlocks = std::pair<tir::Schedule, Array<tir::BlockRV>>;
+    CHECK(sch_rules.defined()) << "ValueError: `sch_rules` is not set in PostOrderApply";
     tir::Schedule sch = tir::Schedule::Traced(
         /*mod=*/mod,
         /*rand_state=*/ForkSeed(&this->rand_state_),
@@ -126,7 +122,7 @@ class PostOrderApplyNode : public SpaceGeneratorNode {
     // always concat multiple schedule rules as one
     Array<tir::BlockRV> all_blocks = BlockCollector::Collect(sch, f_block_filter_);
     Array<Optional<ScheduleRule>> rules{NullOpt};
-    rules.insert(rules.end(), sch_rules_.begin(), sch_rules_.end());
+    rules.insert(rules.end(), sch_rules.value().begin(), sch_rules.value().end());
     for (Optional<ScheduleRule> sch_rule : rules) {
       if (sch_rule.defined()) {
         for (const tir::Schedule& sch : result) {
@@ -191,19 +187,22 @@ class PostOrderApplyNode : public SpaceGeneratorNode {
 
   SpaceGenerator Clone() const final {
     ObjectPtr<PostOrderApplyNode> n = make_object<PostOrderApplyNode>(*this);
-    n->sch_rules_ = Array<ScheduleRule>();
-    for (const ScheduleRule& sch_rule : this->sch_rules_) {
-      n->sch_rules_.push_back(sch_rule->Clone());
-    }
+    CloneRules(this, n.get());
     return SpaceGenerator(n);
   }
   static constexpr const char* _type_key = "meta_schedule.PostOrderApply";
   TVM_DECLARE_FINAL_OBJECT_INFO(PostOrderApplyNode, SpaceGeneratorNode);
 };
 
-SpaceGenerator SpaceGenerator::PostOrderApply(runtime::PackedFunc f_block_filter) {
+SpaceGenerator SpaceGenerator::PostOrderApply(runtime::PackedFunc f_block_filter,
+                                              Optional<Array<ScheduleRule>> sch_rules,
+                                              Optional<Array<Postproc>> postprocs,
+                                              Optional<Map<Mutator, FloatImm>> mutator_probs) {
   ObjectPtr<PostOrderApplyNode> n = make_object<PostOrderApplyNode>();
-  n->f_block_filter_ = f_block_filter;
+  n->sch_rules = std::move(sch_rules);
+  n->postprocs = std::move(postprocs);
+  n->mutator_probs = std::move(mutator_probs);
+  n->f_block_filter_ = std::move(f_block_filter);
   return SpaceGenerator(n);
 }
 
diff --git a/src/meta_schedule/space_generator/schedule_fn.cc b/src/meta_schedule/space_generator/schedule_fn.cc
index adea139b1cd4..48fbc82aba02 100644
--- a/src/meta_schedule/space_generator/schedule_fn.cc
+++ b/src/meta_schedule/space_generator/schedule_fn.cc
@@ -30,10 +30,12 @@ class ScheduleFnNode : public SpaceGeneratorNode {
   runtime::PackedFunc schedule_fn_;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
+    SpaceGeneratorNode::VisitAttrs(v);
     // `schedule_fn_` is not visited.
   }
 
   void InitializeWithTuneContext(const TuneContext& context) final {
+    SpaceGeneratorNode::InitializeWithTuneContext(context);
     this->rand_state_ = ForkSeed(&context->rand_state);
   }
 
@@ -74,6 +76,7 @@ class ScheduleFnNode : public SpaceGeneratorNode {
 
   SpaceGenerator Clone() const final {
     ObjectPtr<ScheduleFnNode> n = make_object<ScheduleFnNode>(*this);
+    CloneRules(this, n.get());
     return SpaceGenerator(n);
   }
 
@@ -81,8 +84,14 @@ class ScheduleFnNode : public SpaceGeneratorNode {
   TVM_DECLARE_FINAL_OBJECT_INFO(ScheduleFnNode, SpaceGeneratorNode);
 };
 
-SpaceGenerator SpaceGenerator::ScheduleFn(PackedFunc schedule_fn) {
+SpaceGenerator SpaceGenerator::ScheduleFn(PackedFunc schedule_fn,
+                                          Optional<Array<ScheduleRule>> sch_rules,
+                                          Optional<Array<Postproc>> postprocs,
+                                          Optional<Map<Mutator, FloatImm>> mutator_probs) {
   ObjectPtr<ScheduleFnNode> n = make_object<ScheduleFnNode>();
+  n->sch_rules = std::move(sch_rules);
+  n->postprocs = std::move(postprocs);
+  n->mutator_probs = std::move(mutator_probs);
   n->schedule_fn_ = std::move(schedule_fn);
   return SpaceGenerator(n);
 }
diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc
index 6fc31ed896f2..53107bafb2c0 100644
--- a/src/meta_schedule/space_generator/space_generator.cc
+++ b/src/meta_schedule/space_generator/space_generator.cc
@@ -21,6 +21,97 @@
 namespace tvm {
 namespace meta_schedule {
 
+String GetRuleKindFromTarget(const Target& target) {
+  if (target->kind->name == "llvm") {
+    return "llvm";
+  }
+  if (target->kind->name == "hexagon") {
+    return "hexagon";
+  }
+  if (target->kind->name == "cuda") {
+    if (Optional<String> opt_sm = target->GetAttr<String>("arch")) {
+      std::string sm = opt_sm.value();
+      if (support::StartsWith(sm, "sm_")) {
+        sm = sm.substr(3);
+        try {
+          if (std::stoi(sm) >= 75) {
+            return "cuda_tensorcore";
+          }
+        } catch (const std::invalid_argument& e) {
+          LOG(WARNING) << "ValueError: Unable to parse `target.arch`: " << sm
+                       << ". Details: " << e.what();
+        }
+      }
+    }
+    return "cuda";
+  }
+  if (target->kind->name == "rocm") {
+    return "cuda";
+  }
+  if (target->kind->name == "vulkan") {
+    return "cuda";
+  }
+  LOG(FATAL) << "Unsupported target: " << target;
+  throw;
+}
+
+void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) {
+  if (context->target.defined() &&  //
+      !(sch_rules.defined() &&      //
+        postprocs.defined() &&      //
+        mutator_probs.defined())) {
+    String kind = GetRuleKindFromTarget(context->target.value());
+    Array<ScheduleRule> default_sch_rules;
+    Array<Postproc> default_postprocs;
+    Map<Mutator, FloatImm> default_mutator_probs;
+    if (kind == "llvm") {
+      default_sch_rules = ScheduleRule::DefaultLLVM();
+      default_postprocs = Postproc::DefaultLLVM();
+      default_mutator_probs = Mutator::DefaultLLVM();
+    } else if (kind == "cuda") {
+      default_sch_rules = ScheduleRule::DefaultCUDA();
+      default_postprocs = Postproc::DefaultCUDA();
+      default_mutator_probs = Mutator::DefaultCUDA();
+    } else if (kind == "cuda_tensorcore") {
+      default_sch_rules = ScheduleRule::DefaultCUDATensorCore();
+      default_postprocs = Postproc::DefaultCUDATensorCore();
+      default_mutator_probs = Mutator::DefaultCUDATensorCore();
+    } else if (kind == "hexagon") {
+      default_sch_rules = ScheduleRule::DefaultHexagon();
+      default_postprocs = Postproc::DefaultHexagon();
+      default_mutator_probs = Mutator::DefaultHexagon();
+    } else {
+      LOG(FATAL) << "Unsupported kind: " << kind;
+      throw;
+    }
+    if (!sch_rules.defined()) {
+      sch_rules = default_sch_rules;
+    }
+    if (!postprocs.defined()) {
+      postprocs = default_postprocs;
+    }
+    if (!mutator_probs.defined()) {
+      mutator_probs = default_mutator_probs;
+    }
+  }
+  if (sch_rules.defined()) {
+    for (ScheduleRule i : sch_rules.value()) {
+      i->InitializeWithTuneContext(context);
+    }
+  }
+  if (postprocs.defined()) {
+    for (Postproc i : postprocs.value()) {
+      i->InitializeWithTuneContext(context);
+    }
+  }
+  if (mutator_probs.defined()) {
+    for (const auto& kv : mutator_probs.value()) {
+      Mutator mutator = kv.first;
+      mutator->InitializeWithTuneContext(context);
+    }
+  }
+}
+
 void PySpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) {
   ICHECK(f_initialize_with_tune_context != nullptr)
       << "PySpaceGenerator's InitializeWithTuneContext method not implemented!";
@@ -39,9 +130,14 @@ SpaceGenerator PySpaceGeneratorNode::Clone() const {
 }
 
 SpaceGenerator SpaceGenerator::PySpaceGenerator(
+    Optional<Array<ScheduleRule>> sch_rules, Optional<Array<Postproc>> postprocs,
+    Optional<Map<Mutator, FloatImm>> mutator_probs,
     FInitializeWithTuneContext f_initialize_with_tune_context,
     FGenerateDesignSpace f_generate_design_space, FClone f_clone) {
   ObjectPtr<PySpaceGeneratorNode> n = make_object<PySpaceGeneratorNode>();
+  n->sch_rules = sch_rules;
+  n->postprocs = postprocs;
+  n->mutator_probs = mutator_probs;
   n->f_initialize_with_tune_context = std::move(f_initialize_with_tune_context);
   n->f_generate_design_space = std::move(f_generate_design_space);
   n->f_clone = std::move(f_clone);
diff --git a/src/meta_schedule/space_generator/space_generator_union.cc b/src/meta_schedule/space_generator/space_generator_union.cc
index 771d0c187f97..819a4ee5f795 100644
--- a/src/meta_schedule/space_generator/space_generator_union.cc
+++ b/src/meta_schedule/space_generator/space_generator_union.cc
@@ -27,10 +27,13 @@ class SpaceGeneratorUnionNode : public SpaceGeneratorNode {
   /*! \brief The array of design space generators unioned, could be recursive. */
   Array<SpaceGenerator> space_generators;
 
-  void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("space_generators", &space_generators); }
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    SpaceGeneratorNode::VisitAttrs(v);
+    v->Visit("space_generators", &space_generators);
+  }
 
   void InitializeWithTuneContext(const TuneContext& context) final {
-    // Initialize each space generator.
+    SpaceGeneratorNode::InitializeWithTuneContext(context);
     for (const SpaceGenerator& space_generator : space_generators) {
       space_generator->InitializeWithTuneContext(context);
     }
@@ -53,6 +56,7 @@ class SpaceGeneratorUnionNode : public SpaceGeneratorNode {
     for (const SpaceGenerator& space_generator : this->space_generators) {
       n->space_generators.push_back(space_generator->Clone());
     }
+    CloneRules(this, n.get());
     return SpaceGenerator(n);
   }
 
@@ -65,8 +69,14 @@ class SpaceGeneratorUnionNode : public SpaceGeneratorNode {
  * \param space_generators Array of the design space generators to be unioned.
  * \return The design space generator created.
  */
-SpaceGenerator SpaceGenerator::SpaceGeneratorUnion(Array<SpaceGenerator> space_generators) {
+SpaceGenerator SpaceGenerator::SpaceGeneratorUnion(Array<SpaceGenerator> space_generators,
+                                                   Optional<Array<ScheduleRule>> sch_rules,
+                                                   Optional<Array<Postproc>> postprocs,
+                                                   Optional<Map<Mutator, FloatImm>> mutator_probs) {
   ObjectPtr<SpaceGeneratorUnionNode> n = make_object<SpaceGeneratorUnionNode>();
+  n->sch_rules = std::move(sch_rules);
+  n->postprocs = std::move(postprocs);
+  n->mutator_probs = std::move(mutator_probs);
   n->space_generators = std::move(space_generators);
   return SpaceGenerator(n);
 }
diff --git a/src/meta_schedule/task_scheduler/gradient_based.cc b/src/meta_schedule/task_scheduler/gradient_based.cc
index 506bb620e1d8..bae52573a0f9 100644
--- a/src/meta_schedule/task_scheduler/gradient_based.cc
+++ b/src/meta_schedule/task_scheduler/gradient_based.cc
@@ -21,236 +21,122 @@
 namespace tvm {
 namespace meta_schedule {
 
-struct TaskRecord {
-  TuneContext task;
-  double weight;
-  double flop;
-  std::vector<double> best_time_cost_history;  // in ms
-  int trials;
-};
-
 /*! \brief The gradient based task scheduler. */
 class GradientBasedNode final : public TaskSchedulerNode {
  public:
-  // Parameters used in gradient computation
   double alpha;
   int window_size;
+  support::LinearCongruentialEngine::TRandState rand_state;
 
-  std::vector<TaskRecord> task_records_;
-  std::vector<double> best_time_cost_per_task_;  // in ms
-  int num_rounds_already_;
-  support::LinearCongruentialEngine::TRandState rand_state_;
+  int round_robin_rounds_;
+  std::vector<std::vector<double>> best_latency_history_;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     TaskSchedulerNode::VisitAttrs(v);
     v->Visit("alpha", &alpha);
     v->Visit("window_size", &window_size);
-    // `task_records_` is not visited.
-    // `best_time_cost_per_task_` is not visited.
+    // `rand_state` is not visited.
     // `num_rounds_already_` is not visited.
-    // `rand_state_` is not visited.
+    // `best_latency_history_` is not visited.
   }
 
   static constexpr const char* _type_key = "meta_schedule.GradientBased";
   TVM_DECLARE_FINAL_OBJECT_INFO(GradientBasedNode, TaskSchedulerNode);
 
  public:
-  std::string TuningStatistics() const {
-    std::ostringstream os;
-    int n_tasks = task_records_.size();
-    int total_trials = 0;
-    double total_latency = 0.0;
-    support::TablePrinter p;
-
-    if (using_ipython()) {
-      p.Row() << "ID"
-              << "Name"
-              << "FLOP"
-              << "Weight"
-              << "GFLOPS"
-              << "Latency (us)"
-              << "Wtd. Latency"
-              << "Trials"
-              << "Terminated";
-    } else {
-      p.Row() << "ID"
-              << "Name"
-              << "FLOP"
-              << "Weight"
-              << "Speed (GFLOPS)"
-              << "Latency (us)"
-              << "Weighted Latency (us)"
-              << "Trials"
-              << "Terminated";
-    }
-
-    p.Separator();
-
-    for (int i = 0; i < n_tasks; ++i) {
-      const TaskRecord& record = task_records_[i];
-      auto row = p.Row();
-      int trials = record.trials;
-      String task_name = record.task->task_name.value();
-      if (using_ipython() && task_name.length() > 23) {
-        std::string temp = task_name.c_str();
-        temp = temp.substr(0, 20) + "...";
-        task_name = String(temp);
-      }
-      row << /*id=*/i                                     //
-          << /*name=*/task_name                           //
-          << /*flops=*/static_cast<int64_t>(record.flop)  //
-          << /*weight=*/static_cast<int>(record.weight);
-      double latency = 1e9;
-      if (trials > 0) {
-        latency = record.best_time_cost_history.back();
-      }
-      if (latency >= 1e9) {
-        row << /*speed=*/"N/A" << /*latency=*/"N/A" << /*weighted_latency=*/"N/A";
-      } else {
-        latency *= 1000.0;
-        double speed = record.flop / latency / 1000.0;
-        double weighted_latency = latency * record.weight;
-        row << /*speed=*/speed << /*latency=*/latency << /*weighted_latency=*/weighted_latency;
-        total_latency += weighted_latency;
-        total_trials += trials;
-      }
-      row << trials;
-      if (tasks[i]->is_terminated) {
-        row << "Y";
-      } else {
-        row << "";
-      }
-    }
-    p.Separator();
-    os << p.AsStr()                                                    //
-       << "\nProgress: " << total_trials / (max_trials * 0.01) << "%"  //
-       << "\nTotal Trials: " << total_trials << " / " << max_trials    //
-       << "\nTotal latency (us): " << total_latency                    //
-       << "\n";
-    return os.str();
+  void Tune(Array<TuneContext> tasks, Array<FloatImm> task_weights, int max_trials_global,
+            int max_trials_per_task, int num_trials_per_iter, Builder builder, Runner runner,
+            Array<MeasureCallback> measure_callbacks, Optional<Database> database,
+            Optional<CostModel> cost_model) final {
+    int n_tasks = tasks.size();
+    round_robin_rounds_ = 0;
+    best_latency_history_.resize(n_tasks, std::vector<double>());
+    TaskSchedulerNode::Tune(tasks, task_weights, max_trials_global, max_trials_per_task,
+                            num_trials_per_iter, builder, runner, measure_callbacks, database,
+                            cost_model);
   }
 
   int NextTaskId() final {
-    int n_tasks = task_records_.size();
-    // Round robin
-    if (num_rounds_already_ == 0) {
-      TVM_PY_LOG_CLEAR_SCREEN(this->logging_func);
-      TVM_PY_LOG(INFO, this->logging_func) << "\n" << this->TuningStatistics();
+    int n_tasks = this->tasks_.size();
+    // Step 1. Check if it's in round robin mode.
+    if (round_robin_rounds_ == 0) {
+      TVM_PY_LOG(INFO, this->logger) << "\n" << this->TuningStatistics();
     }
-    if (num_rounds_already_ < n_tasks) {
-      return num_rounds_already_++;
+    if (round_robin_rounds_ < n_tasks) {
+      return round_robin_rounds_++;
     }
-    if (num_rounds_already_ == n_tasks) {
+    if (round_robin_rounds_ == n_tasks) {
       for (int i = 0; i < n_tasks; ++i) {
         this->JoinRunningTask(i);
       }
+      ++round_robin_rounds_;
     }
-    ++num_rounds_already_;
-    // Check running tasks
+    // Step 2. Collect the tasks that are not terminated yet
     std::vector<int> tasks_alive;
-    tasks_alive.reserve(n_tasks);
-    for (int i = 0; i < n_tasks; ++i) {
-      this->TouchTask(i);
-      if (!tasks[i]->is_terminated) {
-        tasks_alive.push_back(i);
+    {
+      tasks_alive.reserve(n_tasks);
+      for (int i = 0; i < n_tasks; ++i) {
+        this->TouchTask(i);
+        if (!this->tasks_[i]->is_terminated) {
+          tasks_alive.push_back(i);
+        }
+      }
+      if (tasks_alive.empty()) {
+        return -1;
       }
     }
-    if (tasks_alive.empty()) {
-      return -1;
-    }
+    // Step 3. Calculate the gradient of each task alive
     std::vector<double> grad;
     grad.reserve(n_tasks);
     for (int task_id : tasks_alive) {
-      const TaskRecord& record = task_records_[task_id];
-      const int w = this->window_size;
-      int n = record.best_time_cost_history.size();
+      const std::vector<double>& best_latency = this->best_latency_history_.at(task_id);
+      int n = best_latency.size();
       ICHECK_GE(n, 1);
-      double best = record.best_time_cost_history[n - 1];
+      double task_weight = this->tasks_[task_id]->task_weight;
+      int w = this->window_size;
+      double best = best_latency[n - 1];
       if (best < 1e9) {
-        double g1 = (n >= 1 + w) ? (record.best_time_cost_history[n - 1 - w] - best) / w : 0.0;
+        double g1 = (n >= 1 + w) ? (best_latency[n - 1 - w] - best) / w : 0.0;
         double g2 = best / n;
         double g = alpha * g1 + (1 - alpha) * g2;
-        grad.push_back(g * record.weight);
+        grad.push_back(g * task_weight);
       } else {
         // If the best time cost is unavailable, it means some task is not valid. Skip it.
         grad.push_back(-1e9);
       }
     }
+    // Step 4. Select the task with the largest gradient
     auto max_grad = std::max_element(grad.begin(), grad.end());
     auto min_grad = std::min_element(grad.begin(), grad.end());
     int task_id = -1;
     if (*max_grad == *min_grad) {
-      task_id = tasks_alive[tir::SampleInt(&rand_state_, 0, tasks_alive.size())];
+      task_id = tasks_alive[tir::SampleInt(&this->rand_state, 0, tasks_alive.size())];
     } else {
       task_id = tasks_alive[std::distance(grad.begin(), max_grad)];
     }
-    if (tasks[task_id]->runner_futures.defined()) {
+    if (this->tasks_[task_id]->runner_futures.defined()) {
       JoinRunningTask(task_id);
     }
     return task_id;
   }
 
   Array<RunnerResult> JoinRunningTask(int task_id) final {
-    TaskRecord& record = task_records_[task_id];
     Array<RunnerResult> results = TaskSchedulerNode::JoinRunningTask(task_id);
-    double& best_time_cost = this->best_time_cost_per_task_[task_id];
-    for (const RunnerResult& result : results) {
-      if (!result->error_msg.defined()) {
-        best_time_cost = std::min(best_time_cost, GetRunMsMedian(result));
-      }
-    }
-    record.best_time_cost_history.push_back(best_time_cost);
-    record.trials += results.size();
-    TVM_PY_LOG_CLEAR_SCREEN(this->logging_func);
-    TVM_PY_LOG(INFO, this->logging_func)
-        << "[Updated] Task #" << task_id << ": " << record.task->task_name << "\n"
-        << this->TuningStatistics();
+    TaskRecordNode* task = this->tasks_[task_id].get();
+    this->best_latency_history_.at(task_id).push_back(
+        *std::min_element(task->latency_ms.begin(),  //
+                          task->latency_ms.end()));
     return results;
   }
 };
 
-TaskScheduler TaskScheduler::GradientBased(Array<TuneContext> tasks,                            //
-                                           Array<FloatImm> task_weights,                        //
-                                           Builder builder,                                     //
-                                           Runner runner,                                       //
-                                           Optional<Database> database,                         //
-                                           Optional<CostModel> cost_model,                      //
-                                           Optional<Array<MeasureCallback>> measure_callbacks,  //
-                                           int max_trials,                                      //
-                                           PackedFunc logging_func,                             //
-                                           double alpha,                                        //
-                                           int window_size,                                     //
+TaskScheduler TaskScheduler::GradientBased(PackedFunc logger, double alpha, int window_size,
                                            support::LinearCongruentialEngine::TRandState seed) {
-  CHECK_EQ(tasks.size(), task_weights.size())
-      << "The size of `tasks` should have the same as `task_weights`.";
-  int n_tasks = tasks.size();
-  std::vector<TaskRecord> task_records;
-  task_records.reserve(n_tasks);
-  for (int i = 0; i < n_tasks; ++i) {
-    task_records.push_back(TaskRecord{
-        /*task=*/tasks[i],
-        /*weights=*/task_weights[i]->value,
-        /*flop=*/std::max(1.0, tir::EstimateTIRFlops(tasks[i]->mod.value())),
-        /*best_time_cost_history=*/{},
-        /*trials=*/0,
-    });
-  }
   ObjectPtr<GradientBasedNode> n = make_object<GradientBasedNode>();
-  n->tasks = tasks;
-  n->builder = builder;
-  n->runner = runner;
-  n->database = database;
-  n->max_trials = max_trials;
-  n->cost_model = cost_model;
-  n->measure_callbacks = measure_callbacks.value_or({});
-  n->logging_func = logging_func;
-  n->num_trials_already = 0;
+  n->logger = logger;
   n->alpha = alpha;
   n->window_size = window_size;
-  n->task_records_ = std::move(task_records);
-  n->best_time_cost_per_task_ = std::vector<double>(n_tasks, 1e100);
-  n->num_rounds_already_ = 0;
-  support::LinearCongruentialEngine(&n->rand_state_).Seed(seed);
+  n->rand_state = support::LinearCongruentialEngine::NormalizeSeed(seed);
   return TaskScheduler(n);
 }
 
diff --git a/src/meta_schedule/task_scheduler/round_robin.cc b/src/meta_schedule/task_scheduler/round_robin.cc
index ea22878840af..d09f2c2ba791 100644
--- a/src/meta_schedule/task_scheduler/round_robin.cc
+++ b/src/meta_schedule/task_scheduler/round_robin.cc
@@ -37,13 +37,13 @@ class RoundRobinNode final : public TaskSchedulerNode {
 
  protected:
   int NextTaskId() final {
-    int n_tasks = this->tasks.size();
+    int n_tasks = this->tasks_.size();
     for (int i = 0; i < n_tasks; ++i) {
       this->TouchTask(i);
     }
     for (int i = 0; i < n_tasks; ++i) {
       task_id = (task_id + 1) % n_tasks;
-      TuneContext task = tasks[task_id];
+      TaskRecordNode* task = this->tasks_[task_id].get();
       if (!task->is_terminated) {
         if (task->runner_futures.defined()) {
           JoinRunningTask(task_id);
@@ -55,24 +55,9 @@ class RoundRobinNode final : public TaskSchedulerNode {
   }
 };
 
-TaskScheduler TaskScheduler::RoundRobin(Array<TuneContext> tasks,                            //
-                                        Builder builder,                                     //
-                                        Runner runner,                                       //
-                                        Optional<Database> database,                         //
-                                        Optional<CostModel> cost_model,                      //
-                                        Optional<Array<MeasureCallback>> measure_callbacks,  //
-                                        int max_trials,                                      //
-                                        PackedFunc logging_func) {
+TaskScheduler TaskScheduler::RoundRobin(PackedFunc logger) {
   ObjectPtr<RoundRobinNode> n = make_object<RoundRobinNode>();
-  n->tasks = tasks;
-  n->builder = builder;
-  n->runner = runner;
-  n->database = database;
-  n->max_trials = max_trials;
-  n->cost_model = cost_model;
-  n->measure_callbacks = measure_callbacks.value_or({});
-  n->logging_func = logging_func;
-  n->num_trials_already = 0;
+  n->logger = logger;
   n->task_id = -1;
   return TaskScheduler(n);
 }
diff --git a/src/meta_schedule/task_scheduler/task_scheduler.cc b/src/meta_schedule/task_scheduler/task_scheduler.cc
index ea233648f4f5..21efde26d993 100644
--- a/src/meta_schedule/task_scheduler/task_scheduler.cc
+++ b/src/meta_schedule/task_scheduler/task_scheduler.cc
@@ -21,83 +21,225 @@
 namespace tvm {
 namespace meta_schedule {
 
-void TaskSchedulerNode::InitializeTask(int task_id) {
+TaskRecord::TaskRecord(TuneContext ctx, double task_weight) {
+  ObjectPtr<TaskRecordNode> n = runtime::make_object<TaskRecordNode>();
+  n->ctx = ctx;
+  n->task_weight = task_weight;
+  n->flop = 1.0;
   auto _ = Profiler::TimedScope("InitializeTask");
-  TuneContext task = this->tasks[task_id];
-  TVM_PY_LOG(INFO, this->logging_func)
-      << "Initializing Task #" << task_id << ": " << task->task_name;
-  TVM_PY_LOG(INFO, task->logging_func)
-      << "Initializing Task #" << task_id << ": " << task->task_name;
-  CHECK(task->mod.defined()) << "ValueError: Require `context.mod`, but it is not defined";
-  CHECK(task->space_generator.defined())
+  CHECK(ctx->mod.defined()) << "ValueError: Require `context.mod`, but it is not defined";
+  CHECK(ctx->space_generator.defined())
       << "ValueError: Require `context.space_generator`, but it is not defined";
-  CHECK(task->search_strategy.defined())
+  CHECK(ctx->search_strategy.defined())
       << "ValueError: Require `context.search_strategy`, but it is not defined";
-  TVM_PY_LOG(INFO, task->logging_func) << "\n" << tir::AsTVMScript(task->mod);
-  task->Initialize();
-  Array<tir::Schedule> design_spaces =
-      task->space_generator.value()->GenerateDesignSpace(task->mod.value());
-  TVM_PY_LOG(INFO, task->logging_func)
-      << "Total " << design_spaces.size() << " design space(s) generated";
-  for (int i = 0, n = design_spaces.size(); i < n; ++i) {
-    tir::Schedule sch = design_spaces[i];
-    tir::Trace trace = sch->trace().value();
-    trace = trace->Simplified(true);
-    TVM_PY_LOG(INFO, task->logging_func) << "Design space #" << i << ":\n"
-                                         << tir::AsTVMScript(sch->mod()) << "\n"
-                                         << Concat(trace->AsPython(false), "\n");
+  TVM_PY_LOG(INFO, ctx->logger) << "\n" << tir::AsTVMScript(ctx->mod);
+  ctx->Initialize();
+  n->flop = std::max(1.0, tir::EstimateTIRFlops(ctx->mod.value()));
+  this->data_ = std::move(n);
+}
+
+void SendToBuilder(TaskRecordNode* self, const Builder& builder) {
+  auto _ = Profiler::TimedScope("SendToBuilder");
+  Array<MeasureCandidate> candidates = self->measure_candidates.value();
+  Target target = self->ctx->target.value();
+  Array<BuilderInput> inputs;
+  inputs.reserve(candidates.size());
+  for (const MeasureCandidate& candidate : candidates) {
+    inputs.push_back(BuilderInput(candidate->sch->mod(), target));
   }
-  task->search_strategy.value()->PreTuning(design_spaces, database, cost_model);
+  self->builder_results = builder->Build(inputs);
 }
 
-void TaskSchedulerNode::Tune() {
-  int n_tasks = this->tasks.size();
-  for (int task_id = 0; task_id < n_tasks; ++task_id) {
-    InitializeTask(task_id);
+void SendToRunner(TaskRecordNode* self, const Runner& runner) {
+  auto _ = Profiler::TimedScope("SendToRunner");
+  Array<MeasureCandidate> candidates = self->measure_candidates.value();
+  Array<BuilderResult> builder_results = self->builder_results.value();
+  Target target = self->ctx->target.value();
+  ICHECK_EQ(candidates.size(), builder_results.size());
+  int n = candidates.size();
+  int n_build_errors = 0;
+  Array<RunnerInput> inputs;
+  inputs.reserve(n);
+  for (int i = 0; i < n; ++i) {
+    const MeasureCandidate& candidate = candidates[i];
+    const BuilderResult& builder_result = builder_results[i];
+    if (builder_result->error_msg.defined()) {
+      ++n_build_errors;
+      continue;
+    }
+    inputs.push_back(RunnerInput(/*artifact_path=*/builder_result->artifact_path.value(),
+                                 /*device_type=*/target->kind->name,
+                                 /*args_info=*/candidate->args_info));
+  }
+  Array<RunnerFuture> futures = runner->Run(inputs);
+  if (n_build_errors == 0) {
+    self->runner_futures = futures;
+    return;
+  }
+  Array<RunnerFuture> results;
+  results.reserve(n);
+  for (int i = 0, j = 0; i < n; ++i) {
+    const BuilderResult& builder_result = builder_results[i];
+    if (builder_result->error_msg.defined()) {
+      results.push_back(RunnerFuture(
+          /*f_done=*/[]() -> bool { return true; },
+          /*f_result=*/
+          [msg = builder_result->error_msg]() -> RunnerResult {
+            return RunnerResult(NullOpt, msg);
+          }));
+    } else {
+      results.push_back(futures[j++]);
+    }
+  }
+  self->runner_futures = results;
+}
+
+void TaskCleanUp(TaskRecordNode* self, int task_id, const Array<RunnerResult>& results) {
+  ICHECK_EQ(self->builder_results.value().size(), results.size());
+  ICHECK_EQ(self->runner_futures.value().size(), results.size());
+  int n = results.size();
+  std::string name = self->ctx->task_name.value();
+  const PackedFunc& logger = self->ctx->logger;
+  for (int i = 0; i < n; ++i) {
+    const BuilderResult& builder_result = self->builder_results.value()[i];
+    const MeasureCandidate& candidate = self->measure_candidates.value()[i];
+    const RunnerResult& runner_result = results[i];
+    Optional<String> error_msg = NullOpt;
+    int trials = self->latency_ms.size() + 1;
+    double run_ms = 1e9;
+    if ((error_msg = builder_result->error_msg)) {
+      ++self->build_error_count;
+    } else if ((error_msg = runner_result->error_msg)) {
+      ++self->run_error_count;
+    } else {
+      run_ms = GetRunMsMedian(runner_result);
+    }
+    self->latency_ms.push_back(run_ms);
+    if (error_msg) {
+      const tir::Schedule& sch = candidate->sch;
+      std::string err = error_msg.value();
+      TVM_PY_LOG(INFO, logger) << std::fixed << std::setprecision(4)  //
+                               << "[Task #" << task_id << ": " << name << "] Trial #" << trials
+                               << ": Error in building:\n"
+                               << err << "\n"
+                               << tir::AsTVMScript(sch->mod()) << "\n"
+                               << Concat(sch->trace().value()->AsPython(false), "\n");
+    } else {
+      double best_ms = *std::min_element(self->latency_ms.begin(), self->latency_ms.end());
+      TVM_PY_LOG(INFO, logger) << std::fixed << std::setprecision(4)  //
+                               << "[Task #" << task_id << ": " << name << "] Trial #" << trials
+                               << ": GFLOPs: " << (self->flop / run_ms / 1e6)
+                               << ". Time: " << (run_ms * 1e3) << " us"
+                               << ". Best GFLOPs: " << (self->flop / best_ms / 1e6);
+    }
   }
-  int running_tasks = tasks.size();
-  for (int task_id; num_trials_already < max_trials && (task_id = NextTaskId()) != -1;) {
-    TVM_PY_LOG(INFO, this->logging_func)
-        << "Scheduler picks Task #" << task_id << ": " << tasks[task_id]->task_name;
-    TuneContext task = tasks[task_id];
+  self->measure_candidates = NullOpt;
+  self->builder_results = NullOpt;
+  self->runner_futures = NullOpt;
+}
+
+void TaskSchedulerNode::Tune(Array<TuneContext> ctxs, Array<FloatImm> task_weights,
+                             int max_trials_global, int max_trials_per_task,
+                             int num_trials_per_iter, Builder builder, Runner runner,
+                             Array<MeasureCallback> measure_callbacks, Optional<Database> database,
+                             Optional<CostModel> cost_model) {
+  CHECK_EQ(ctxs.size(), task_weights.size()) << "ValueError: `task_weights` must have the same "
+                                                "length as `ctxs`";
+  int n_tasks = this->remaining_tasks_ = ctxs.size();
+  this->measure_callbacks_ = measure_callbacks;
+  this->database_ = database;
+  this->cost_model_ = cost_model;
+  this->tasks_.clear();
+  this->tasks_.reserve(n_tasks);
+  for (int i = 0; i < n_tasks; ++i) {
+    const TuneContext& ctx = ctxs[i];
+    double weight = task_weights[i]->value;
+    TVM_PY_LOG(INFO, this->logger) << "Initializing Task #" << i << ": " << ctx->task_name;
+    TVM_PY_LOG(INFO, ctx->logger) << "Initializing Task #" << i << ": " << ctx->task_name;
+    this->tasks_.push_back(TaskRecord(ctx, weight));
+    Array<tir::Schedule> design_spaces =
+        ctx->space_generator.value()->GenerateDesignSpace(ctx->mod.value());
+    TVM_PY_LOG(INFO, ctx->logger) << "Total " << design_spaces.size()
+                                  << " design space(s) generated";
+    for (int i = 0, n = design_spaces.size(); i < n; ++i) {
+      tir::Schedule sch = design_spaces[i];
+      tir::Trace trace = sch->trace().value();
+      trace = trace->Simplified(true);
+      TVM_PY_LOG(INFO, ctx->logger) << "Design space #" << i << ":\n"
+                                    << tir::AsTVMScript(sch->mod()) << "\n"
+                                    << Concat(trace->AsPython(false), "\n");
+    }
+    ctx->search_strategy.value()->PreTuning(max_trials_per_task, num_trials_per_iter, design_spaces,
+                                            database, cost_model);
+  }
+
+  int num_trials_already = 0;
+  for (int task_id; num_trials_already < max_trials_global && (task_id = NextTaskId()) != -1;) {
+    TVM_PY_LOG(INFO, this->logger)
+        << "TaskScheduler picks Task #" << task_id << ": " << tasks_[task_id]->ctx->task_name;
+    TaskRecordNode* task = tasks_[task_id].get();
     ICHECK(!task->is_terminated);
     ICHECK(!task->runner_futures.defined());
-    if (Optional<Array<MeasureCandidate>> candidates =
-            task->search_strategy.value()->GenerateMeasureCandidates()) {
+    if (static_cast<int>(task->latency_ms.size()) >= max_trials_per_task) {
+      TerminateTask(task_id);
+      continue;
+    }
+    if (Optional<Array<MeasureCandidate>> candidates = task->measure_candidates =
+            task->ctx->search_strategy.value()->GenerateMeasureCandidates()) {
       int num_candidates = candidates.value().size();
-      task->_SetMeasureCandidates(candidates.value());
       num_trials_already += num_candidates;
-      TVM_PY_LOG(INFO, this->logging_func)
-          << "Sending " << num_candidates << " sample(s) to builder";
-      task->_SendToBuilder(this->builder);
-      TVM_PY_LOG(INFO, this->logging_func)
-          << "Sending " << num_candidates << " sample(s) to runner";
-      task->_SendToRunner(this->runner);
+      TVM_PY_LOG(INFO, this->logger) << "Sending " << num_candidates << " sample(s) to builder";
+      SendToBuilder(task, builder);
+      TVM_PY_LOG(INFO, this->logger) << "Sending " << num_candidates << " sample(s) to runner";
+      SendToRunner(task, runner);
     } else {
-      ICHECK(!task->is_terminated);
-      task->is_terminated = true;
-      --running_tasks;
-      TVM_PY_LOG(INFO, this->logging_func)
-          << "Task #" << task_id << " has finished. Remaining task(s): " << running_tasks;
+      TerminateTask(task_id);
     }
   }
   for (int task_id = 0; task_id < n_tasks; ++task_id) {
-    TuneContext task = tasks[task_id];
+    TaskRecordNode* task = this->tasks_[task_id].get();
     if (!task->is_terminated) {
       if (task->runner_futures.defined()) {
         JoinRunningTask(task_id);
       }
-      task->is_terminated = true;
-      --running_tasks;
-      TVM_PY_LOG(INFO, this->logging_func)
-          << "Task #" << task_id << " has finished. Remaining task(s): " << running_tasks;
+      TerminateTask(task_id);
     }
-    task->search_strategy.value()->PostTuning();
+    task->ctx->search_strategy.value()->PostTuning();
   }
 }
 
+Array<RunnerResult> TaskSchedulerNode::JoinRunningTask(int task_id) {
+  TaskRecordNode* task = this->tasks_[task_id].get();
+  ICHECK(task->runner_futures.defined());
+  Array<RunnerResult> results;
+  {
+    auto _ = Profiler::TimedScope("JoinRunnerFutures");
+    Array<RunnerFuture> futures = task->runner_futures.value();
+    results.reserve(futures.size());
+    for (RunnerFuture future : futures) {
+      results.push_back(future->Result());
+    }
+  }
+  ICHECK(task->measure_candidates.defined());
+  task->ctx->search_strategy.value()->NotifyRunnerResults(task->measure_candidates.value(),
+                                                          results);
+  ICHECK(task->builder_results.defined());
+  ICHECK_EQ(results.size(), task->measure_candidates.value().size());
+  ICHECK_EQ(results.size(), task->builder_results.value().size());
+  for (const MeasureCallback& callback : this->measure_callbacks_) {
+    callback->Apply(GetRef<TaskScheduler>(this), task_id, task->measure_candidates.value(),
+                    task->builder_results.value(), results);
+  }
+  TaskCleanUp(task, task_id, results);
+  TVM_PY_LOG_CLEAR_SCREEN(this->logger);
+  TVM_PY_LOG(INFO, this->logger) << "[Updated] Task #" << task_id << ": " << task->ctx->task_name
+                                 << "\n"
+                                 << this->TuningStatistics();
+  return results;
+}
+
 void TaskSchedulerNode::TouchTask(int task_id) {
-  TuneContext task = tasks[task_id];
+  TaskRecordNode* task = this->tasks_[task_id].get();
   if (!task->is_terminated && task->runner_futures.defined()) {
     for (const RunnerFuture future : task->runner_futures.value()) {
       if (!future->Done()) {
@@ -108,39 +250,85 @@ void TaskSchedulerNode::TouchTask(int task_id) {
   }
 }
 
-Array<RunnerResult> TaskSchedulerNode::JoinRunningTask(int task_id) {
-  TuneContext task = tasks[task_id];
-  Array<RunnerResult> results = task->_Join();
-  for (const MeasureCallback& callback : this->measure_callbacks) {
-    callback->Apply(GetRef<TaskScheduler>(this), task_id, task->measure_candidates.value(),
-                    task->builder_results.value(), results);
-  }
-  task->_ClearMeasureState();
-  return results;
+void TaskSchedulerNode::TerminateTask(int task_id) {
+  TaskRecordNode* task = this->tasks_[task_id].get();
+  ICHECK(!task->is_terminated);
+  task->is_terminated = true;
+  --this->remaining_tasks_;
+  TVM_PY_LOG_CLEAR_SCREEN(this->logger);
+  TVM_PY_LOG(INFO, this->logger) << "Task #" << task_id
+                                 << " has finished. Remaining task(s): " << this->remaining_tasks_
+                                 << "\n"
+                                 << this->TuningStatistics();
 }
 
-void PyTaskSchedulerNode::Tune() {
-  if (f_tune == nullptr) {
-    TaskSchedulerNode::Tune();
-  } else {
-    f_tune();
+std::string TaskSchedulerNode::TuningStatistics() const {
+  std::ostringstream os;
+  int n_tasks = this->tasks_.size();
+  int total_trials = 0;
+  double total_latency = 0.0;
+  support::TablePrinter p;
+  p.Row() << "ID"
+          << "Name"
+          << "FLOP"
+          << "Weight"
+          << "Speed (GFLOPS)"
+          << "Latency (us)"
+          << "Weighted Latency (us)"
+          << "Trials"
+          << "Done";
+  p.Separator();
+  for (int i = 0; i < n_tasks; ++i) {
+    const TaskRecordNode* task = this->tasks_[i].get();
+    auto row = p.Row();
+    int trials = task->latency_ms.size();
+    row << /*id=*/i << /*name=*/task->ctx->task_name.value()  //
+        << /*flops=*/static_cast<int64_t>(task->flop)
+        << /*weight=*/static_cast<int>(task->task_weight);
+    double latency_ms = 1e9;
+    if (!task->latency_ms.empty()) {
+      latency_ms = *std::min_element(task->latency_ms.begin(), task->latency_ms.end());
+    }
+    if (latency_ms >= 1e9) {
+      row << /*speed=*/"N/A" << /*latency=*/"N/A" << /*weighted_latency=*/"N/A";
+    } else {
+      latency_ms *= 1000.0;
+      double speed = task->flop / latency_ms / 1000.0;
+      double weighted_latency = latency_ms * task->task_weight;
+      row << /*speed=*/speed << /*latency=*/latency_ms << /*weighted_latency=*/weighted_latency;
+      total_latency += weighted_latency;
+      total_trials += trials;
+    }
+    row << trials;
+    if (task->is_terminated) {
+      row << "Y";
+    } else {
+      row << "";
+    }
   }
+  p.Separator();
+  os << p.AsStr()                                  //
+     << "\nTotal trials: " << total_trials         //
+     << "\nTotal latency (us): " << total_latency  //
+     << "\n";
+  return os.str();
 }
 
-void PyTaskSchedulerNode::InitializeTask(int task_id) {
-  if (f_initialize_task == nullptr) {
-    TaskSchedulerNode::InitializeTask(task_id);
-  } else {
-    f_initialize_task(task_id);
-  }
+TaskScheduler TaskScheduler::PyTaskScheduler(
+    PackedFunc logger, PyTaskSchedulerNode::FNextTaskId f_next_task_id,
+    PyTaskSchedulerNode::FJoinRunningTask f_join_running_task, PyTaskSchedulerNode::FTune f_tune) {
+  CHECK(f_next_task_id != nullptr) << "ValueError: next_task_id is not defined";
+  ObjectPtr<PyTaskSchedulerNode> n = make_object<PyTaskSchedulerNode>();
+  n->logger = logger;
+  n->f_next_task_id = f_next_task_id;
+  n->f_join_running_task = f_join_running_task;
+  n->f_tune = f_tune;
+  return TaskScheduler(n);
 }
 
-void PyTaskSchedulerNode::TouchTask(int task_id) {
-  if (f_touch_task == nullptr) {
-    return TaskSchedulerNode::TouchTask(task_id);
-  } else {
-    return f_touch_task(task_id);
-  }
+int PyTaskSchedulerNode::NextTaskId() {
+  CHECK(f_next_task_id != nullptr) << "PyTaskScheduler's NextTaskId method not implemented!";
+  return f_next_task_id();
 }
 
 Array<RunnerResult> PyTaskSchedulerNode::JoinRunningTask(int task_id) {
@@ -151,61 +339,38 @@ Array<RunnerResult> PyTaskSchedulerNode::JoinRunningTask(int task_id) {
   }
 }
 
-int PyTaskSchedulerNode::NextTaskId() {
-  ICHECK(f_next_task_id != nullptr) << "PyTaskScheduler's NextTaskId method not implemented!";
-  return f_next_task_id();
-}
-
-TaskScheduler TaskScheduler::PyTaskScheduler(
-    Array<TuneContext> tasks,                                   //
-    Builder builder,                                            //
-    Runner runner,                                              //
-    Optional<Database> database,                                //
-    Optional<CostModel> cost_model,                             //
-    Optional<Array<MeasureCallback>> measure_callbacks,         //
-    int max_trials,                                             //
-    PackedFunc logging_func,                                    //
-    PyTaskSchedulerNode::FTune f_tune,                          //
-    PyTaskSchedulerNode::FInitializeTask f_initialize_task,     //
-    PyTaskSchedulerNode::FTouchTask f_touch_task,               //
-    PyTaskSchedulerNode::FJoinRunningTask f_join_running_task,  //
-    PyTaskSchedulerNode::FNextTaskId f_next_task_id) {
-  ObjectPtr<PyTaskSchedulerNode> n = make_object<PyTaskSchedulerNode>();
-  n->tasks = tasks;
-  n->builder = builder;
-  n->runner = runner;
-  n->database = database;
-  n->max_trials = max_trials;
-  n->cost_model = cost_model;
-  if (measure_callbacks.defined()) {
-    n->measure_callbacks = measure_callbacks.value();
+void PyTaskSchedulerNode::Tune(Array<TuneContext> tasks, Array<FloatImm> task_weights,
+                               int max_trials_global, int max_trials_per_task,
+                               int num_trials_per_iter, Builder builder, Runner runner,
+                               Array<MeasureCallback> measure_callbacks,
+                               Optional<Database> database, Optional<CostModel> cost_model) {
+  if (f_tune == nullptr) {
+    TaskSchedulerNode::Tune(tasks, task_weights, max_trials_global, max_trials_per_task,
+                            num_trials_per_iter, builder, runner, measure_callbacks, database,
+                            cost_model);
   } else {
-    n->measure_callbacks = {};
+    f_tune(tasks, task_weights, max_trials_global, max_trials_per_task, num_trials_per_iter,
+           builder, runner, measure_callbacks, database, cost_model);
   }
-  n->logging_func = logging_func;
-  n->num_trials_already = 0;
-  n->f_tune = f_tune;
-  n->f_initialize_task = f_initialize_task;
-  n->f_touch_task = f_touch_task;
-  n->f_join_running_task = f_join_running_task;
-  n->f_next_task_id = f_next_task_id;
-  return TaskScheduler(n);
 }
 
+TVM_REGISTER_NODE_TYPE(TaskRecordNode);
 TVM_REGISTER_OBJECT_TYPE(TaskSchedulerNode);
 TVM_REGISTER_NODE_TYPE(PyTaskSchedulerNode);
 TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerPyTaskScheduler")
     .set_body_typed(TaskScheduler::PyTaskScheduler);
 TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerTune")
     .set_body_method<TaskScheduler>(&TaskSchedulerNode::Tune);
-TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerInitializeTask")
-    .set_body_method<TaskScheduler>(&TaskSchedulerNode::InitializeTask);
-TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerTouchTask")
-    .set_body_method<TaskScheduler>(&TaskSchedulerNode::TouchTask);
 TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerJoinRunningTask")
     .set_body_method<TaskScheduler>(&TaskSchedulerNode::JoinRunningTask);
 TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerNextTaskId")
     .set_body_method<TaskScheduler>(&TaskSchedulerNode::NextTaskId);
+TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerTerminateTask")
+    .set_body_method<TaskScheduler>(&TaskSchedulerNode::TerminateTask);
+TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerTouchTask")
+    .set_body_method<TaskScheduler>(&TaskSchedulerNode::TouchTask);
+TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerTuningStatistics")
+    .set_body_method<TaskScheduler>(&TaskSchedulerNode::TuningStatistics);
 
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/tune_context.cc b/src/meta_schedule/tune_context.cc
index ee24624fe9e4..768c95857184 100644
--- a/src/meta_schedule/tune_context.cc
+++ b/src/meta_schedule/tune_context.cc
@@ -23,58 +23,32 @@
 namespace tvm {
 namespace meta_schedule {
 
-TuneContext::TuneContext(Optional<IRModule> mod,                                    //
-                         Optional<Target> target,                                   //
-                         Optional<SpaceGenerator> space_generator,                  //
-                         Optional<SearchStrategy> search_strategy,                  //
-                         Optional<Array<ScheduleRule>> sch_rules,                   //
-                         Optional<Array<Postproc>> postprocs,                       //
-                         Optional<Map<Mutator, FloatImm>> mutator_probs,            //
-                         Optional<String> task_name,                                //
-                         PackedFunc logging_func,                                   //
-                         support::LinearCongruentialEngine::TRandState rand_state,  //
-                         int num_threads) {
+TuneContext::TuneContext(Optional<IRModule> mod, Optional<Target> target,
+                         Optional<SpaceGenerator> space_generator,
+                         Optional<SearchStrategy> search_strategy, Optional<String> task_name,
+                         int num_threads, TRandState rand_state, PackedFunc logger) {
+  CHECK(rand_state == -1 || rand_state >= 0) << "ValueError: Invalid random state: " << rand_state;
   ObjectPtr<TuneContextNode> n = make_object<TuneContextNode>();
   n->mod = mod;
   n->target = target;
   n->space_generator = space_generator;
   n->search_strategy = search_strategy;
-  n->sch_rules = sch_rules.value_or({});
-  n->postprocs = postprocs.value_or({});
-  n->mutator_probs = mutator_probs.value_or({});
   n->task_name = task_name;
-  n->logging_func = logging_func;
-  support::LinearCongruentialEngine(&n->rand_state).Seed(rand_state);
   n->num_threads = num_threads;
-  n->is_terminated = false;
-  n->runner_futures = NullOpt;
-  n->measure_candidates = NullOpt;
+  n->rand_state = support::LinearCongruentialEngine::NormalizeSeed(rand_state);
+  n->logger = logger;
   data_ = std::move(n);
 }
 
 TuneContext TuneContextNode::Clone() const {
   ObjectPtr<TuneContextNode> n = make_object<TuneContextNode>(*this);
-  if (this->sch_rules.defined()) {
-    n->sch_rules = Array<ScheduleRule>();
-    for (const ScheduleRule& sch_rule : this->sch_rules) {
-      n->sch_rules.push_back(sch_rule->Clone());
-    }
-  }
-  if (this->postprocs.defined()) {
-    n->postprocs = Array<Postproc>();
-    for (const Postproc& postproc : this->postprocs) {
-      n->postprocs.push_back(postproc->Clone());
-    }
+  if (this->space_generator.defined()) {
+    n->space_generator = this->space_generator.value()->Clone();
   }
-  if (this->mutator_probs.defined()) {
-    n->mutator_probs = Map<Mutator, FloatImm>();
-    for (const auto& kv : this->mutator_probs) {
-      n->mutator_probs.Set(kv.first->Clone(), kv.second);
-    }
+  if (this->search_strategy.defined()) {
+    n->search_strategy = this->search_strategy.value()->Clone();
   }
-  if (this->space_generator.defined()) n->space_generator = this->space_generator.value()->Clone();
-  if (this->search_strategy.defined()) n->search_strategy = this->search_strategy.value()->Clone();
-  n->rand_state = support::LinearCongruentialEngine(&n->rand_state).ForkSeed();
+  n->rand_state = ForkSeed(&n->rand_state);
   n->Initialize();
   return TuneContext(n);
 }
@@ -86,136 +60,22 @@ void TuneContextNode::Initialize() {
   if (this->search_strategy.defined()) {
     this->search_strategy.value()->InitializeWithTuneContext(GetRef<TuneContext>(this));
   }
-  for (const ScheduleRule& sch_rule : sch_rules) {
-    sch_rule->InitializeWithTuneContext(GetRef<TuneContext>(this));
-  }
-  for (const Postproc& postproc : postprocs) {
-    postproc->InitializeWithTuneContext(GetRef<TuneContext>(this));
-  }
-  for (const auto& kv : mutator_probs) {
-    kv.first->InitializeWithTuneContext(GetRef<TuneContext>(this));
-  }
-}
-
-void TuneContextNode::_SetMeasureCandidates(const Array<MeasureCandidate>& candidates) {
-  this->measure_candidates = candidates;
-}
-
-void TuneContextNode::_SendToBuilder(const Builder& builder) {
-  auto _ = Profiler::TimedScope("SendToBuilder");
-  Array<MeasureCandidate> candidates = this->measure_candidates.value();
-  Target target = this->target.value();
-  Array<BuilderInput> inputs;
-  inputs.reserve(candidates.size());
-  for (const MeasureCandidate& candidate : candidates) {
-    inputs.push_back(BuilderInput(candidate->sch->mod(), target));
-  }
-  this->builder_results = builder->Build(inputs);
-}
-
-void TuneContextNode::_SendToRunner(const Runner& runner) {
-  auto _ = Profiler::TimedScope("SendToRunner");
-  Array<MeasureCandidate> candidates = this->measure_candidates.value();
-  Array<BuilderResult> builder_results = this->builder_results.value();
-  Target target = this->target.value();
-  ICHECK_EQ(candidates.size(), builder_results.size());
-  int n = candidates.size();
-  int n_build_errors = 0;
-  Array<RunnerInput> inputs;
-  inputs.reserve(n);
-  for (int i = 0; i < n; ++i) {
-    const MeasureCandidate& candidate = candidates[i];
-    const BuilderResult& builder_result = builder_results[i];
-    if (builder_result->error_msg.defined()) {
-      ++n_build_errors;
-      continue;
-    }
-    inputs.push_back(RunnerInput(/*artifact_path=*/builder_result->artifact_path.value(),
-                                 /*device_type=*/target->kind->name,
-                                 /*args_info=*/candidate->args_info));
-  }
-  Array<RunnerFuture> futures = runner->Run(inputs);
-  if (n_build_errors == 0) {
-    this->runner_futures = futures;
-    return;
-  }
-  Array<RunnerFuture> results;
-  results.reserve(n);
-  for (int i = 0, j = 0; i < n; ++i) {
-    const BuilderResult& builder_result = builder_results[i];
-    if (builder_result->error_msg.defined()) {
-      results.push_back(RunnerFuture(
-          /*f_done=*/[]() -> bool { return true; },
-          /*f_result=*/
-          [msg = builder_result->error_msg]() -> RunnerResult {
-            return RunnerResult(NullOpt, msg);
-          }));
-    } else {
-      results.push_back(futures[j++]);
-    }
-  }
-  this->runner_futures = results;
-}
-
-Array<RunnerResult> TuneContextNode::_Join() {
-  ICHECK(this->runner_futures.defined());
-  Array<RunnerFuture> futures = this->runner_futures.value();
-  int n = futures.size();
-  Array<RunnerResult> results;
-  {
-    auto _ = Profiler::TimedScope("JoinRunnerFutures");
-    results.reserve(n);
-    for (RunnerFuture future : futures) {
-      results.push_back(future->Result());
-    }
-  }
-  if (this->search_strategy.defined()) {
-    this->search_strategy.value()->NotifyRunnerResults(this->measure_candidates.value(), results);
-  }
-  ICHECK(this->measure_candidates.defined());
-  ICHECK(this->builder_results.defined());
-  ICHECK_EQ(results.size(), this->measure_candidates.value().size());
-  ICHECK_EQ(results.size(), this->builder_results.value().size());
-  return results;
-}
-
-void TuneContextNode::_ClearMeasureState() {
-  this->measure_candidates = NullOpt;
-  this->builder_results = NullOpt;
-  this->runner_futures = NullOpt;
 }
 
 TVM_REGISTER_NODE_TYPE(TuneContextNode);
-
 TVM_REGISTER_GLOBAL("meta_schedule.TuneContext")
-    .set_body_typed([](Optional<IRModule> mod,                                    //
-                       Optional<Target> target,                                   //
-                       Optional<SpaceGenerator> space_generator,                  //
-                       Optional<SearchStrategy> search_strategy,                  //
-                       Optional<Array<ScheduleRule>> sch_rules,                   //
-                       Optional<Array<Postproc>> postprocs,                       //
-                       Optional<Map<Mutator, FloatImm>> mutator_probs,            //
-                       Optional<String> task_name,                                //
-                       PackedFunc logging_func,                                   //
-                       support::LinearCongruentialEngine::TRandState rand_state,  //
-                       int num_threads) -> TuneContext {
-      return TuneContext(mod, target, space_generator, search_strategy, sch_rules, postprocs,
-                         mutator_probs, task_name, logging_func, rand_state, num_threads);
+    .set_body_typed([](Optional<IRModule> mod, Optional<Target> target,
+                       Optional<SpaceGenerator> space_generator,
+                       Optional<SearchStrategy> search_strategy, Optional<String> task_name,
+                       int num_threads, TRandState rand_state, PackedFunc logger) -> TuneContext {
+      return TuneContext(mod, target, space_generator, search_strategy, task_name, num_threads,
+                         rand_state, logger);
     });
-
 TVM_REGISTER_GLOBAL("meta_schedule._SHash2Hex").set_body_typed(SHash2Hex);
 TVM_REGISTER_GLOBAL("meta_schedule.TuneContextInitialize")
     .set_body_method<TuneContext>(&TuneContextNode::Initialize);
-TVM_REGISTER_GLOBAL("meta_schedule.TuneContextSetMeasureCandidates")
-    .set_body_method<TuneContext>(&TuneContextNode::_SetMeasureCandidates);
-TVM_REGISTER_GLOBAL("meta_schedule.TuneContextSendToBuilder")
-    .set_body_method<TuneContext>(&TuneContextNode::_SendToBuilder);
-TVM_REGISTER_GLOBAL("meta_schedule.TuneContextSendToRunner")
-    .set_body_method<TuneContext>(&TuneContextNode::_SendToRunner);
-TVM_REGISTER_GLOBAL("meta_schedule.TuneContextJoin")
-    .set_body_method<TuneContext>(&TuneContextNode::_Join);
-TVM_REGISTER_GLOBAL("meta_schedule.TuneContextClearMeasureState")
-    .set_body_method<TuneContext>(&TuneContextNode::_ClearMeasureState);
+TVM_REGISTER_GLOBAL("meta_schedule.TuneContextClone")
+    .set_body_method<TuneContext>(&TuneContextNode::Clone);
 
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index f0b736081670..41d8ffde558c 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -44,6 +44,7 @@
 
 #include <algorithm>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "../printer/text_printer.h"
@@ -55,8 +56,8 @@
 #include "../tir/schedule/primitive.h"
 #include "../tir/schedule/utils.h"
 
-#define TVM_PY_LOG(logging_level, logging_func)                          \
-  ::tvm::meta_schedule::PyLogMessage(__FILE__, __LINE__, logging_func,   \
+#define TVM_PY_LOG(logging_level, logger)                                \
+  ::tvm::meta_schedule::PyLogMessage(__FILE__, __LINE__, logger,         \
                                      PyLogMessage::Level::logging_level) \
       .stream()
 #define TVM_PY_LOG_CLEAR_SCREEN(logging_func) clear_logging(__FILE__, __LINE__, logging_func)
@@ -81,14 +82,18 @@ class PyLogMessage {
     // FATAL not included
   };
 
-  explicit PyLogMessage(const char* file, int lineno, PackedFunc logging_func, Level logging_level)
-      : file_(file), lineno_(lineno), logging_func_(logging_func), logging_level_(logging_level) {}
+  explicit PyLogMessage(const char* file, int lineno, PackedFunc logger, Level logging_level)
+      : file_(file), lineno_(lineno), logger_(logger), logging_level_(logging_level) {
+    if (this->logger_ != nullptr) {
+      stream_ << "" << file_ << ":" << lineno_ << " ";
+    }
+  }
 
   TVM_NO_INLINE ~PyLogMessage() {
     ICHECK(logging_level_ != Level::CLEAR)
         << "Cannot use CLEAR as logging level in TVM_PY_LOG, please use TVM_PY_LOG_CLEAR_SCREEN.";
-    if (this->logging_func_.defined()) {
-      logging_func_(static_cast<int>(logging_level_), stream_.str());
+    if (this->logger_ != nullptr) {
+      logger_(static_cast<int>(logging_level_), stream_.str());
     } else {
       if (logging_level_ == Level::INFO) {
         runtime::detail::LogMessage(file_, lineno_).stream() << stream_.str();
@@ -109,7 +114,7 @@ class PyLogMessage {
   const char* file_;
   int lineno_;
   std::ostringstream stream_;
-  PackedFunc logging_func_;
+  PackedFunc logger_;
   Level logging_level_;
 };
 
@@ -120,7 +125,9 @@ class PyLogMessage {
 inline bool using_ipython() {
   bool flag = false;
   const auto* f_using_ipython = runtime::Registry::Get("meta_schedule.using_ipython");
-  if (f_using_ipython->defined()) flag = (*f_using_ipython)();
+  if (f_using_ipython) {
+    flag = (*f_using_ipython)();
+  }
   return flag;
 }
 
@@ -459,6 +466,40 @@ struct SortTuningRecordByMeanRunSecs {
   }
 };
 
+/*!
+ * \brief The helper function to clone schedule rules, postprocessors, and mutators.
+ * \param src The source space generator.
+ * \param dst The destination space generator.
+ */
+inline void CloneRules(const SpaceGeneratorNode* src, SpaceGeneratorNode* dst) {
+  if (src->sch_rules.defined()) {
+    Array<ScheduleRule> original = src->sch_rules.value();
+    Array<ScheduleRule> sch_rules;
+    sch_rules.reserve(original.size());
+    for (const ScheduleRule& sch_rule : original) {
+      sch_rules.push_back(sch_rule->Clone());
+    }
+    dst->sch_rules = std::move(sch_rules);
+  }
+  if (src->postprocs.defined()) {
+    Array<Postproc> original = src->postprocs.value();
+    Array<Postproc> postprocs;
+    postprocs.reserve(original.size());
+    for (const Postproc& postproc : original) {
+      postprocs.push_back(postproc->Clone());
+    }
+    dst->postprocs = std::move(postprocs);
+  }
+  if (src->mutator_probs.defined()) {
+    Map<Mutator, FloatImm> original = src->mutator_probs.value();
+    Map<Mutator, FloatImm> mutator_probs;
+    for (const auto& kv : original) {
+      mutator_probs.Set(kv.first->Clone(), kv.second);
+    }
+    dst->mutator_probs = std::move(mutator_probs);
+  }
+}
+
 }  // namespace meta_schedule
 }  // namespace tvm
 
diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc
index 8fa8610c0fca..b4373c6f5f1e 100644
--- a/src/relay/backend/te_compiler.cc
+++ b/src/relay/backend/te_compiler.cc
@@ -547,7 +547,7 @@ TECompiler& TECompiler::Global() {
 }
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_auto_scheduler", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_meta_schedule", Bool);
-TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_meta_schedule_dispatch", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_meta_schedule_dispatch", Integer);
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.tir_converter", String);
 
 TVM_REGISTER_GLOBAL("relay.backend._TECompilerGlobal").set_body_typed([]() {
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index 6f55402baded..27738615c7eb 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -48,6 +48,7 @@
 #include <utility>
 #include <vector>
 
+#include "../../printer/text_printer.h"
 #include "../../te/operation/create_primfunc.h"
 #include "../op/memory/memory.h"
 #include "../transforms/meta_schedule_layout_rewrite.h"
@@ -387,7 +388,18 @@ class ScheduleBuilder : public ExprVisitor {
             mod = tir::transform::RemoveWeightLayoutRewriteBlock()(std::move(mod));
             prim_func = Downcast<PrimFunc>(mod->Lookup("main"));
           } else {
-            LOG(WARNING) << "Cannot find workload: " << prim_fn_var->name_hint;
+            int dispatch = backend::UseMetaScheduleDispatch();
+            // (dispatch & 2): controls whether to print TVMScript for missing TIR
+            // (dispatch & 4): controls whether to raise fatal errors for missing TIR
+            if (dispatch & 2) {
+              LOG(WARNING) << "Cannot find workload: " << prim_fn_var->name_hint << "\n"
+                           << tir::AsTVMScript(f.value());
+            } else {
+              LOG(WARNING) << "Cannot find workload: " << prim_fn_var->name_hint;
+            }
+            if (dispatch & 4) {
+              LOG(FATAL);
+            }
           }
         }
       }
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h
index 6c65a081f156..00c75921f2f2 100644
--- a/src/relay/backend/utils.h
+++ b/src/relay/backend/utils.h
@@ -632,6 +632,13 @@ inline bool IsMetaScheduleEnabled() {
       .value();
 }
 
+/*! \brief Consider MetaSchedule's dispatch option. */
+inline int UseMetaScheduleDispatch() {
+  return transform::PassContext::Current()
+      ->GetConfig<Integer>("relay.backend.use_meta_schedule_dispatch", Integer(0))
+      .value()
+      ->value;
+}
 /*!
  * \brief Method in TECompiler to convert TE compute to scheduleable TIR
  * \param args The arguments of the TE compute
diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc
index 8cfbadf65012..1d9272cf2dd5 100644
--- a/src/tir/schedule/concrete_schedule.cc
+++ b/src/tir/schedule/concrete_schedule.cc
@@ -225,7 +225,7 @@ Schedule ConcreteScheduleNode::Copy() {
 /******** Schedule: Schedule: Sampling ********/
 
 void ConcreteScheduleNode::Seed(support::LinearCongruentialEngine::TRandState seed) {
-  support::LinearCongruentialEngine(&rand_state_).Seed(seed);
+  this->rand_state_ = support::LinearCongruentialEngine::NormalizeSeed(seed);
 }
 
 support::LinearCongruentialEngine::TRandState ConcreteScheduleNode::ForkSeed() {
diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py
index 8b07122c2a17..e8caa9f04e87 100644
--- a/tests/python/contrib/test_hexagon/test_meta_schedule.py
+++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py
@@ -16,23 +16,25 @@
 # under the License.
 
 """ Test rpc based launcher for hexagon """
-import pytest
-import numpy as np
 import tempfile
 
+import numpy as np
+import pytest
 import tvm.testing
 import tvm.topi.testing
-from tvm import te, relay
 from tvm import meta_schedule as ms
+from tvm import relay, te
+from tvm.contrib.hexagon.meta_schedule import (
+    get_hexagon_local_builder,
+    get_hexagon_rpc_runner,
+)
+from tvm.meta_schedule import postproc, schedule_rule
 from tvm.meta_schedule.arg_info import TensorInfo
 from tvm.meta_schedule.builder import BuilderInput
-from tvm.meta_schedule import postproc, schedule_rule
+from tvm.meta_schedule.runner import RunnerInput
 from tvm.script import tir as T
 from tvm.tir import FloatImm
 from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
-from tvm.meta_schedule.runner import RunnerInput
-from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
-from tvm.relay.backend import Executor
 
 from .infrastructure import get_hexagon_target
 
@@ -43,7 +45,9 @@
 @tvm.script.ir_module
 class MatmulModule:
     @T.prim_func
-    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
+    def main(  # type: ignore  # pylint: disable=no-self-argument
+        a: T.handle, b: T.handle, c: T.handle
+    ) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         A = T.match_buffer(a, (16, 16), "float32")
         B = T.match_buffer(b, (16, 16), "float32")
@@ -52,7 +56,7 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-s
             with T.block("matmul"):
                 vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                 with T.init():
-                    C[vi, vj] = 0.0
+                    C[vi, vj] = 0.0  # type: ignore
                 C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
 
 
@@ -186,26 +190,28 @@ def test_vrmpy_dense(hexagon_launcher):
         schedule_dense(sch, block, M, do_tune)
     else:
         with tempfile.TemporaryDirectory() as work_dir:
-            config = ms.TuneConfig(
-                strategy="replay_trace",
-                num_trials_per_iter=8,
-                max_trials_per_task=8,
-                max_trials_global=8,
-            )
 
             def schedule_dense_for_tune(sch):
                 block = sch.get_block("compute")
                 return schedule_dense(sch, block, None, True)
 
-            sch = ms.tune_tir(
+            target = get_hexagon_target("v69")
+            database = ms.tir_integration.tune_tir(
                 mod=workload,
                 target=target,
-                config=config,
                 work_dir=work_dir,
-                space=ms.space_generator.ScheduleFn(schedule_dense_for_tune),
+                max_trials_global=8,
+                space=ms.space_generator.ScheduleFn(
+                    schedule_dense_for_tune,
+                    sch_rules=[],
+                    postprocs=[],
+                    mutator_probs=[],
+                ),
+                strategy="replay-trace",
                 builder=get_hexagon_local_builder(),
                 runner=get_hexagon_rpc_runner(hexagon_launcher, number=10),
             )
+            sch = ms.tir_integration.compile_tir(database, workload, target)
 
     with hexagon_launcher.start_session() as session:
         verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
@@ -216,10 +222,10 @@ def schedule_dense_for_tune(sch):
 @tvm.script.ir_module
 class Module_vrmpy_auto_tensorize:
     @T.prim_func
-    def main(
-        X: T.Buffer[(128, 768), "uint8"],
-        packedW: T.Buffer[(24, 192, 32, 4), "uint8"],
-        compute: T.Buffer[(128, 768), "int32"],
+    def main(  # type: ignore
+        X: T.Buffer[(128, 768), "uint8"],  # type: ignore
+        packedW: T.Buffer[(24, 192, 32, 4), "uint8"],  # type: ignore
+        compute: T.Buffer[(128, 768), "int32"],  # type: ignore
     ) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         for i0_0_i1_0_0_fused in T.parallel(
@@ -230,37 +236,37 @@ def main(
                     i = T.axis.spatial(128, i0_0_i1_0_0_fused // 8 * 2 + i0_1_init + i0_2_init)
                     j_o = T.axis.spatial(24, i1_0_2_init + i0_0_i1_0_0_fused % 8 * 3 + i1_0_1_init)
                     T.reads()
-                    T.writes(compute[i, j_o * 32 : j_o * 32 + 32])
+                    T.writes(compute[i, j_o * 32 : j_o * 32 + 32])  # type: ignore
                     for i1_1 in T.vectorized(32):
                         with T.block("compute_init"):
                             j_i_init = T.axis.spatial(32, i1_1)
                             T.reads()
                             T.writes(compute[i, j_o * 32 + j_i_init])
-                            compute[i, j_o * 32 + j_i_init] = 0
+                            compute[i, j_o * 32 + j_i_init] = 0  # type: ignore
             for i2_0_0, i0_1, i1_0_1, i2_0_1, i0_2, i1_0_2 in T.grid(32, 2, 3, 6, 1, 1):
                 with T.block("compute_o_update"):
                     i = T.axis.spatial(128, i0_0_i1_0_0_fused // 8 * 2 + i0_1 + i0_2)
                     j_o = T.axis.spatial(24, i1_0_2 + i0_0_i1_0_0_fused % 8 * 3 + i1_0_1)
                     k_o = T.axis.reduce(192, i2_0_0 * 6 + i2_0_1)
                     T.reads(
-                        compute[i, j_o * 32 : j_o * 32 + 32],
-                        X[i, k_o * 4 : k_o * 4 + 4],
-                        packedW[j_o, k_o, 0:32, 0:4],
+                        compute[i, j_o * 32 : j_o * 32 + 32],  # type: ignore
+                        X[i, k_o * 4 : k_o * 4 + 4],  # type: ignore
+                        packedW[j_o, k_o, 0:32, 0:4],  # type: ignore
                     )
-                    T.writes(compute[i, j_o * 32 : j_o * 32 + 32])
+                    T.writes(compute[i, j_o * 32 : j_o * 32 + 32])  # type: ignore
                     A = T.match_buffer(
-                        X[i, k_o * 4 : k_o * 4 + 4], [4], dtype="uint8", offset_factor=1
+                        X[i, k_o * 4 : k_o * 4 + 4], [4], dtype="uint8", offset_factor=1  # type: ignore
                     )
                     B = T.match_buffer(
                         packedW[j_o, k_o, 0:32, 0:4], [32, 4], dtype="uint8", offset_factor=1
                     )
                     C = T.match_buffer(
-                        compute[i, j_o * 32 : j_o * 32 + 32], [32], dtype="int32", offset_factor=1
+                        compute[i, j_o * 32 : j_o * 32 + 32], [32], dtype="int32", offset_factor=1  # type: ignore
                     )
-                    A_u8x4: T.uint8x4 = A[0:4]
-                    A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
-                    B_i32x32: T.int32x32 = T.reinterpret(B[0, 0:128], dtype="int32x32")
-                    C[0:32] = T.call_llvm_pure_intrin(
+                    A_u8x4: T.uint8x4 = A[0:4]  # type: ignore
+                    A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")  # type: ignore
+                    B_i32x32: T.int32x32 = T.reinterpret(B[0, 0:128], dtype="int32x32")  # type: ignore
+                    C[0:32] = T.call_llvm_pure_intrin(  # type: ignore
                         4390, T.uint32(3), C[0:32], B_i32x32, A_i32, dtype="int32x32"
                     )
 
@@ -303,23 +309,20 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
 
     if True:
         with tempfile.TemporaryDirectory() as work_dir:
-            config = ms.TuneConfig(
-                strategy="replay_trace",
+            target = get_hexagon_target("v68")
+            database = ms.tir_integration.tune_tir(
+                mod=workload,
+                target=target,
+                max_trials_global=8,
                 num_trials_per_iter=8,
                 max_trials_per_task=8,
-                max_trials_global=8,
-            )
-
-            sch = ms.tune_tir(
-                mod=workload,
-                target=get_hexagon_target("v68"),
-                config=config,
                 work_dir=work_dir,
                 sch_rules=lambda: sch_rules,
                 postprocs=lambda: postprocs,
                 builder=get_hexagon_local_builder(),
                 runner=get_hexagon_rpc_runner(hexagon_launcher, number=10),
             )
+            sch = ms.tir_integration.compile_tir(database, workload, target)
     else:
         sch = tvm.tir.Schedule(Module_vrmpy_auto_tensorize, debug_mask="all")
 
@@ -358,6 +361,7 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher):
         kernel_layout="HWIO",
     )
     mod = tvm.IRModule.from_expr(conv2d + bias)
+    mod = mod.with_attr("executor", relay.backend.Executor("graph", {"link-params": True}))
 
     data_np = np.random.randn(*d_shape).astype("float16")
     weight_np = np.random.randn(*w_shape).astype("float16")
@@ -379,24 +383,25 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher):
 
     ref = rt_mod_ref.get_output(0).numpy()
 
-    config = ms.TuneConfig(
-        strategy="replay_trace",
-        num_trials_per_iter=8,
-        max_trials_per_task=8,
-        max_trials_global=8,
-    )
-
     with tempfile.TemporaryDirectory() as work_dir:
-        executor = Executor("graph", {"link-params": True})
-        lib = ms.tune_relay(
+        target = get_hexagon_target("v69")
+        database = ms.relay_integration.tune_relay(
             mod=mod,
             params=params,
-            target=get_hexagon_target("v69"),
-            config=config,
+            target=target,
+            max_trials_global=8,
+            max_trials_per_task=8,
+            num_trials_per_iter=8,
+            strategy=ms.search_strategy.ReplayTrace(),
             work_dir=work_dir,
             builder=get_hexagon_local_builder(),
             runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
-            executor=executor,
+        )
+        lib = ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            params=params,
+            target=target,
         )
 
     with hexagon_launcher.start_session() as session:
diff --git a/tests/python/integration/test_meta_schedule_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py
similarity index 73%
rename from tests/python/integration/test_meta_schedule_auto_tensorize.py
rename to tests/python/integration/test_auto_tensorize.py
index fd28f7928301..3fdf027a490d 100644
--- a/tests/python/integration/test_meta_schedule_auto_tensorize.py
+++ b/tests/python/integration/test_auto_tensorize.py
@@ -24,23 +24,14 @@
 import tvm.topi.testing
 from tvm import meta_schedule as ms
 from tvm import relay
-from tvm.meta_schedule import postproc, schedule_rule
-from tvm.meta_schedule.relay_integration import extract_task_from_relay
+from tvm.meta_schedule.testing import relay_workload
 from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base
-from tvm.meta_schedule.tune import tune_extracted_tasks
 from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN
 from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN
 from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
 
-CONFIG = ms.TuneConfig(
-    strategy="evolutionary",
-    num_trials_per_iter=32,
-    max_trials_per_task=32,
-    max_trials_global=20000,
-)
-
 SCH_RULES_FOR_VNNI = [
-    schedule_rule.AutoInline(
+    ms.schedule_rule.AutoInline(
         into_producer=False,
         into_consumer=True,
         inline_const_tensor=True,
@@ -49,62 +40,62 @@
         require_ordered=True,
         disallow_op=["tir.exp"],
     ),
-    schedule_rule.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64),
-    schedule_rule.MultiLevelTilingWithIntrin(
+    ms.schedule_rule.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64),
+    ms.schedule_rule.MultiLevelTilingWithIntrin(
         VNNI_INTRIN,
         structure="SSRSRS",
         tile_binds=None,
         max_innermost_factor=64,
         vector_load_lens=None,
         reuse_read=None,
-        reuse_write=schedule_rule.ReuseType(
+        reuse_write=ms.schedule_rule.ReuseType(
             req="may",
             levels=[1, 2],
             scope="global",
         ),
     ),
-    schedule_rule.MultiLevelTiling(
+    ms.schedule_rule.MultiLevelTiling(
         structure="SSRSRS",
         tile_binds=None,
         max_innermost_factor=64,
         vector_load_lens=None,
         reuse_read=None,
-        reuse_write=schedule_rule.ReuseType(
+        reuse_write=ms.schedule_rule.ReuseType(
             req="may",
             levels=[1, 2],
             scope="global",
         ),
     ),
-    schedule_rule.ParallelizeVectorizeUnroll(
+    ms.schedule_rule.ParallelizeVectorizeUnroll(
         max_jobs_per_core=16,
         max_vectorize_extent=64,
         unroll_max_steps=[0, 16, 64, 512],
         unroll_explicit=True,
     ),
-    schedule_rule.RandomComputeLocation(),
+    ms.schedule_rule.RandomComputeLocation(),
 ]
 
 
-def get_sch_rules_for_dp4a(intrin):
+def _get_sch_rules_for_dp4a(intrin):
     return [
-        schedule_rule.MultiLevelTilingWithIntrin(
+        ms.schedule_rule.MultiLevelTilingWithIntrin(
             intrin,
             structure="SSSRRSRS",
             tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"],
             max_innermost_factor=64,
             vector_load_lens=[1, 2, 3, 4],
-            reuse_read=schedule_rule.ReuseType(
+            reuse_read=ms.schedule_rule.ReuseType(
                 req="must",
                 levels=[4],
                 scope="shared",
             ),
-            reuse_write=schedule_rule.ReuseType(
+            reuse_write=ms.schedule_rule.ReuseType(
                 req="must",
                 levels=[3],
                 scope="local",
             ),
         ),
-        schedule_rule.AutoInline(
+        ms.schedule_rule.AutoInline(
             into_producer=True,
             into_consumer=True,
             inline_const_tensor=True,
@@ -113,8 +104,8 @@ def get_sch_rules_for_dp4a(intrin):
             require_ordered=False,
             disallow_op=None,
         ),
-        schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]),
-        schedule_rule.ParallelizeVectorizeUnroll(
+        ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]),
+        ms.schedule_rule.ParallelizeVectorizeUnroll(
             max_jobs_per_core=-1,  # disable parallelize
             max_vectorize_extent=-1,  # disable vectorize
             unroll_max_steps=[0, 16, 64, 512, 1024],
@@ -123,24 +114,24 @@ def get_sch_rules_for_dp4a(intrin):
     ]
 
 
-SCH_RULES_FOR_DP4A = get_sch_rules_for_dp4a(DP4A_INTRIN)
-SCH_RULES_FOR_SDOT4 = get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN)
+SCH_RULES_FOR_DP4A = _get_sch_rules_for_dp4a(DP4A_INTRIN)
+SCH_RULES_FOR_SDOT4 = _get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN)
 
 POSTPROCS_FOR_VNNI = [
-    postproc.DisallowDynamicLoop(),
-    postproc.RewriteParallelVectorizeUnroll(),
-    postproc.RewriteReductionBlock(),
-    postproc.RewriteTensorize(vectorize_init_loop=True),
+    ms.postproc.DisallowDynamicLoop(),
+    ms.postproc.RewriteParallelVectorizeUnroll(),
+    ms.postproc.RewriteReductionBlock(),
+    ms.postproc.RewriteTensorize(vectorize_init_loop=True),
 ]
 
 POSTPROCS_FOR_DP4A = [
-    postproc.DisallowDynamicLoop(),
-    postproc.RewriteCooperativeFetch(),
-    postproc.RewriteUnboundBlock(),
-    postproc.RewriteParallelVectorizeUnroll(),
-    postproc.RewriteReductionBlock(),
-    postproc.RewriteTensorize(),
-    postproc.VerifyGPUCode(),
+    ms.postproc.DisallowDynamicLoop(),
+    ms.postproc.RewriteCooperativeFetch(),
+    ms.postproc.RewriteUnboundBlock(),
+    ms.postproc.RewriteParallelVectorizeUnroll(),
+    ms.postproc.RewriteReductionBlock(),
+    ms.postproc.RewriteTensorize(),
+    ms.postproc.VerifyGPUCode(),
 ]
 
 
@@ -148,33 +139,33 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos
     """Test tuning."""
     tgt = "cuda" if "nvidia" in target else target
     dev = tvm.device(tgt, 0)
-
     ref = (
         relay.create_executor("vm", mod=relay_mod, device=dev, target=tgt)
         .evaluate()(*[data_np, weight_np])
         .numpy()
     )
-
     params = {"weight": weight_np}
-
-    extracted_tasks = extract_task_from_relay(relay_mod, target, params)
-
     tune_tasks = list(
         filter(
             lambda task: op_name in task.task_name,
-            extracted_tasks,
+            ms.relay_integration.extracted_task_from_relay(relay_mod, target, params),
         )
     )
-
     with tempfile.TemporaryDirectory() as work_dir:
-        database = tune_extracted_tasks(
-            tune_tasks,
-            CONFIG,
+        tasks, task_weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
+            extracted_tasks=tune_tasks,
             work_dir=work_dir,
-            sch_rules=lambda: sch_rules,
-            postprocs=lambda: postprocs,
+            space=ms.space_generator.PostOrderApply(
+                sch_rules=sch_rules,
+                postprocs=postprocs,
+            ),
+        )
+        database = ms.tune.tune_tasks(
+            tasks=tasks,
+            task_weights=task_weights,
+            work_dir=work_dir,
+            max_trials_global=20000,
         )
-
     with database, tvm.transform.PassContext(
         opt_level=3,
         config={"relay.backend.use_meta_schedule": True},
@@ -186,12 +177,9 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos
         assert "vpdpbusd" in asm
 
     runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
     runtime.set_input("data", data_np)
     runtime.run()
-
     out = runtime.get_output(0).numpy()
-
     np.testing.assert_equal(out, ref)
 
 
@@ -243,28 +231,28 @@ def _test_conv2d(data_dtype, sch_rules, postprocs, target):
     tune_and_test(relay_mod, data_np, weight_np, "conv2d", target, sch_rules, postprocs)
 
 
-def _test_bert_int8(target, sch_rules, postprocs):
-    relay_mod, params, input_info = load_quantized_bert_base()
-
+def _test_bert_int8(relay_mod, params, input_info, target, sch_rules, postprocs):
     relay_mod = relay.transform.FastMath()(relay_mod)
-
-    extracted_tasks = extract_task_from_relay(relay_mod, target, params)
-
     tune_tasks = [
         task
-        for task in extracted_tasks
+        for task in ms.relay_integration.extract_tasks(relay_mod, target, params)
         if "dense" in task.task_name or "batch_matmul" in task.task_name
     ]
-
     with tempfile.TemporaryDirectory() as work_dir:
-        database = tune_extracted_tasks(
-            tune_tasks,
-            CONFIG,
+        tasks, task_weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
+            extracted_tasks=tune_tasks,
             work_dir=work_dir,
-            sch_rules=lambda: sch_rules,
-            postprocs=lambda: postprocs,
+            space=ms.space_generator.PostOrderApply(
+                sch_rules=sch_rules,
+                postprocs=postprocs,
+            ),
+        )
+        database = ms.tune.tune_tasks(
+            tasks=tasks,
+            task_weights=task_weights,
+            work_dir=work_dir,
+            max_trials_global=20000,
         )
-
     with database, tvm.transform.PassContext(
         opt_level=3,
         config={"relay.backend.use_meta_schedule": True},
@@ -273,14 +261,11 @@ def _test_bert_int8(target, sch_rules, postprocs):
 
     dev = tvm.device("cuda" if "nvidia" in target else target, 0)
     runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
     inputs = []
-
     for name, shape in input_info:
         arr = np.random.uniform(1, 10, size=shape).astype("int64")
         runtime.set_input(name, arr)
         inputs.append(arr)
-
     print(runtime.benchmark(dev, number=1, repeat=50).mean)
 
 
@@ -295,7 +280,6 @@ def test_vnni_dense():
 @tvm.testing.requires_gpu
 def test_dp4a_dense():
     _test_dense("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070")
-
     # Uncomment to test on vulkan or rocm target
     # _test_dense(
     #     "int8", sch_rules_for_dp4a, postprocs_for_dp4a, "vulkan -from_device=0"
@@ -316,7 +300,6 @@ def test_vnni_conv2d():
 @tvm.testing.requires_gpu
 def test_dp4a_conv2d():
     _test_conv2d("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070")
-
     # Uncomment to test on vulkan or rocm target
     # _test_conv2d(
     #     "int8", sch_rules_for_dp4a, postprocs_for_dp4a, "vulkan -from_device=0"
@@ -329,17 +312,46 @@ def test_dp4a_conv2d():
 @tvm.testing.requires_cascadelake
 @pytest.mark.skip_if(tvm.testing.IS_IN_CI, reason="Slow on CI")
 def test_vnni_bert_int8():
-    _test_bert_int8("llvm -mcpu=cascadelake -num-cores 4", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI)
+    relay_mod, params, input_info = load_quantized_bert_base()
+    _test_bert_int8(
+        relay_mod,
+        params,
+        input_info,
+        "llvm -mcpu=cascadelake -num-cores 4",
+        SCH_RULES_FOR_VNNI,
+        POSTPROCS_FOR_VNNI,
+    )
 
 
 @tvm.testing.requires_gpu
 @pytest.mark.skip("Slow on CI")
 def test_dp4a_bert_int8():
-    _test_bert_int8("nvidia/geforce-rtx-3070", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A)
-
+    relay_mod, params, input_info = load_quantized_bert_base()
+    _test_bert_int8(
+        relay_mod,
+        params,
+        input_info,
+        "nvidia/geforce-rtx-3070",
+        SCH_RULES_FOR_DP4A,
+        POSTPROCS_FOR_DP4A,
+    )
     # Uncomment to test on vulkan or rocm target
-    # _test_bert_int8("vulkan -from_device=0", sch_rules_for_dp4a, postprocs_for_dp4a)
-    # _test_bert_int8("rocm", sch_rules_for_sdot4, postprocs_for_dp4a)
+    # _test_bert_int8(
+    #     relay_mod,
+    #     params,
+    #     input_info,
+    #     "vulkan -from_device=0",
+    #     sch_rules_for_dp4a,
+    #     postprocs_for_dp4a,
+    # )
+    # _test_bert_int8(
+    #     relay_mod,
+    #     params,
+    #     input_info,
+    #     "rocm",
+    #     sch_rules_for_sdot4,
+    #     postprocs_for_dp4a,
+    # )
 
 
 @tvm.testing.requires_gpu
@@ -356,14 +368,12 @@ def test_cuda_tensor_core(model_name, input_shape):
         data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
     else:
         data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)
-
     mod, params, (input_name, _, _) = relay_workload.get_network(model_name, input_shape)
     seq = tvm.transform.Sequential(
         [
             relay.transform.ToMixedPrecision(),
         ]
     )
-
     with tvm.transform.PassContext(opt_level=3):
         mod = seq(mod)
 
@@ -377,18 +387,19 @@ def convert_layout(mod):
 
     with tempfile.TemporaryDirectory() as work_dir:
         with ms.Profiler() as profiler:
-            rt_mod1: tvm.runtime.Module = ms.tune_relay(
-                mod=convert_layout(mod),
-                params=params,
+            converted_mod = convert_layout(mod)
+            database = ms.relay_integration.tune_relay(
+                mod=converted_mod,
                 target=target,
-                config=ms.TuneConfig(
-                    num_trials_per_iter=32,
-                    max_trials_per_task=200,
-                    max_trials_global=3000,
-                ),
-                sch_rules=ms.default_config._DefaultCUDATensorCore.schedule_rules,
-                postprocs=ms.default_config._DefaultCUDATensorCore.postprocs,
                 work_dir=work_dir,
+                max_trials_global=3000,
+                params=params,
+            )
+            rt_mod1 = ms.relay_integration.compile_relay(
+                database=database,
+                mod=converted_mod,
+                target=target,
+                params=params,
             )
         print(profiler.table())
 
diff --git a/tests/python/integration/test_legacy_tuning.py b/tests/python/integration/test_legacy_tuning.py
new file mode 100644
index 000000000000..04c5f85ce5d4
--- /dev/null
+++ b/tests/python/integration/test_legacy_tuning.py
@@ -0,0 +1,380 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Test the tuner
+"""
+import logging
+import multiprocessing as mp
+import textwrap
+
+import tvm
+import tvm.relay
+import tvm.testing
+from tvm import autotvm, te
+from tvm.autotvm.measure import measure_methods
+from tvm.autotvm.tuner import RandomTuner
+from tvm.contrib import tar
+from tvm.ir.instrument import pass_instrument
+from tvm.ir.transform import PassContext
+from tvm.target import Target
+from tvm.tir.analysis import _ffi_api as _analysis_ffi_api
+
+
+def setup_module():
+    """Setup the module used for testing."""
+
+    @autotvm.template("testing/conv2d_no_batching")
+    def conv2d_no_batching(  # pylint: disable=unused-variable
+        batch_size, input_h, input_w, channels_in, channels_out, kernel_h, kernel_w
+    ):
+        """An example template for testing"""
+        assert batch_size == 1, "Only consider batch_size = 1 in this template"
+
+        data = te.placeholder((batch_size, channels_in, input_h, input_w), name="data")
+        kernel = te.placeholder((channels_out, channels_in, kernel_h, kernel_w), name="kernel")
+
+        axis_rc = te.reduce_axis((0, channels_in), name="rc")
+        axis_ry = te.reduce_axis((0, kernel_h), name="ry")
+        axis_rx = te.reduce_axis((0, kernel_w), name="rx")
+
+        conv = te.compute(
+            (batch_size, channels_out, input_h - kernel_h + 1, input_w - kernel_w + 1),
+            lambda nn, ff, yy, xx: te.sum(
+                data[nn, axis_rc, yy + axis_ry, xx + axis_rx]
+                * kernel[ff, axis_rc, axis_ry, axis_rx],
+                axis=[axis_rc, axis_ry, axis_rx],
+            ),
+            tag="conv2d_nchw",
+        )
+
+        schedule = te.create_schedule([conv.op])
+
+        output = conv
+        cache_write_ol = schedule.cache_write(conv, "local")
+
+        # create cache stage
+        cache_read_aa = schedule.cache_read(data, "shared", [cache_write_ol])
+        cache_read_ww = schedule.cache_read(kernel, "shared", [cache_write_ol])
+        cache_read_al = schedule.cache_read(cache_read_aa, "local", [cache_write_ol])
+        cache_read_wl = schedule.cache_read(cache_read_ww, "local", [cache_write_ol])
+
+        # tile and bind spatial axes
+        axis_n, axis_f, axis_y, axis_x = schedule[output].op.axis
+        cfg = autotvm.get_config()
+        cfg.define_split("tile_f", cfg.axis(axis_f), num_outputs=4)
+        cfg.define_split("tile_y", cfg.axis(axis_y), num_outputs=4)
+        cfg.define_split("tile_x", cfg.axis(axis_x), num_outputs=4)
+        axis_bf, axis_vf, axis_tf, axis_fi = cfg["tile_f"].apply(schedule, output, axis_f)
+        axis_by, axis_vy, axis_ty, axis_yi = cfg["tile_y"].apply(schedule, output, axis_y)
+        axis_bx, axis_vx, axis_tx, axis_xi = cfg["tile_x"].apply(schedule, output, axis_x)
+        kernel_scope = axis_n  # this is the scope to attach global config inside this kernel
+
+        schedule[output].bind(axis_bf, te.thread_axis("blockIdx.z"))
+        schedule[output].bind(axis_by, te.thread_axis("blockIdx.y"))
+        schedule[output].bind(axis_bx, te.thread_axis("blockIdx.x"))
+        schedule[output].bind(axis_vf, te.thread_axis("vthread"))
+        schedule[output].bind(axis_vy, te.thread_axis("vthread"))
+        schedule[output].bind(axis_vx, te.thread_axis("vthread"))
+        schedule[output].bind(axis_tf, te.thread_axis("threadIdx.z"))
+        schedule[output].bind(axis_ty, te.thread_axis("threadIdx.y"))
+        schedule[output].bind(axis_tx, te.thread_axis("threadIdx.x"))
+        schedule[output].reorder(
+            axis_n,
+            axis_bf,
+            axis_by,
+            axis_bx,
+            axis_vf,
+            axis_vy,
+            axis_vx,
+            axis_tf,
+            axis_ty,
+            axis_tx,
+            axis_fi,
+            axis_yi,
+            axis_xi,
+        )
+        schedule[cache_write_ol].compute_at(schedule[output], axis_tx)
+
+        # tile and bind reduction axes
+        axis_n, axis_f, axis_y, axis_x = schedule[cache_write_ol].op.axis
+        axis_rc, axis_ry, axis_rx = schedule[cache_write_ol].op.reduce_axis
+        cfg.define_split("tile_rc", cfg.axis(axis_rc), num_outputs=3)
+        cfg.define_split("tile_ry", cfg.axis(axis_ry), num_outputs=3)
+        cfg.define_split("tile_rx", cfg.axis(axis_rx), num_outputs=3)
+        axis_rco, axis_rcm, axis_rci = cfg["tile_rc"].apply(schedule, cache_write_ol, axis_rc)
+        axis_ryo, axis_rym, axis_ryi = cfg["tile_rx"].apply(schedule, cache_write_ol, axis_ry)
+        axis_rxo, axis_rxm, axis_rxi = cfg["tile_ry"].apply(schedule, cache_write_ol, axis_rx)
+        schedule[cache_write_ol].reorder(
+            axis_rco,
+            axis_ryo,
+            axis_rxo,
+            axis_rcm,
+            axis_rym,
+            axis_rxm,
+            axis_rci,
+            axis_ryi,
+            axis_rxi,
+            axis_n,
+            axis_f,
+            axis_y,
+            axis_x,
+        )
+
+        schedule[cache_read_aa].compute_at(schedule[cache_write_ol], axis_rxo)
+        schedule[cache_read_ww].compute_at(schedule[cache_write_ol], axis_rxo)
+        schedule[cache_read_al].compute_at(schedule[cache_write_ol], axis_rxm)
+        schedule[cache_read_wl].compute_at(schedule[cache_write_ol], axis_rxm)
+
+        # cooperative fetching
+        for load in [cache_read_aa, cache_read_ww]:
+            axis_n, axis_f, axis_y, axis_x = schedule[load].op.axis
+            fused = schedule[load].fuse(axis_n, axis_f, axis_y, axis_x)
+            axis_tz, fused = schedule[load].split(fused, nparts=cfg["tile_f"].size[2])
+            axis_ty, fused = schedule[load].split(fused, nparts=cfg["tile_y"].size[2])
+            axis_tx, fused = schedule[load].split(fused, nparts=cfg["tile_x"].size[2])
+            schedule[load].bind(axis_tz, te.thread_axis("threadIdx.z"))
+            schedule[load].bind(axis_ty, te.thread_axis("threadIdx.y"))
+            schedule[load].bind(axis_tx, te.thread_axis("threadIdx.x"))
+
+        # tune unroll
+        cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+        cfg.define_knob("unroll_explicit", [0, 1])
+        schedule[output].pragma(
+            kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val
+        )
+        schedule[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
+
+        return schedule, [data, kernel, conv]
+
+
+def teardown_module():
+    """Remove the module from the autotvm task tables."""
+    # TODO(areusch): Tasks should not be registered into a global.
+    del autotvm.task.task.TASK_TABLE["testing/conv2d_no_batching"]
+
+
+def get_sample_task(target=tvm.target.cuda(), target_host=None):
+    """return a sample task for testing"""
+    target, target_host = Target.canon_target_and_host(target, target_host)
+    task = autotvm.task.create(
+        "testing/conv2d_no_batching", args=(1, 7, 7, 512, 512, 3, 3), target=target
+    )
+    return task, target
+
+
+def run_test_with_all_multiprocessing(func, *args, **kwargs):
+    """Check all multiprocessing methods work for the tuning test.
+
+    In the past fork() had the most support at detriment to spawn() and forkserver().
+    As fork() is unavailable or unsafe on some platforms it is good to check all
+    available methods.
+    """
+    for multiprocessing_method in mp.get_all_start_methods():
+        old_start_method = mp.get_start_method()
+        try:
+            mp.set_start_method(multiprocessing_method, force=True)
+            func(*args, **kwargs)
+        finally:
+            mp.set_start_method(old_start_method, force=True)
+
+
+@tvm.testing.parametrize_targets("cuda", "opencl")
+def test_tuning_gpu(target):
+    """Test gpu tuning."""
+
+    def runner(target):
+        # init task
+        task, target = get_sample_task(target, None)
+        logging.info("task config space: %s", task.config_space)
+
+        measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())
+
+        results = []
+
+        tuner = RandomTuner(task)
+        tuner.tune(
+            n_trial=20,
+            measure_option=measure_option,
+            callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
+        )
+
+        assert len(results) == 20
+
+        successful_results = [
+            r
+            for r in results
+            if r.error_no == autotvm.MeasureErrorNo.NO_ERROR
+            # We filter records before building if we know they won't work ahead of time.
+            # We can't guarantee we get one good record so we count these as success too
+            or r.error_no == autotvm.MeasureErrorNo.INSTANTIATION_ERROR
+        ]
+        assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
+
+    run_test_with_all_multiprocessing(runner, target)
+
+
+@tvm.testing.parametrize_targets("cuda", "opencl")
+def test_tuning_gpu_inherits_pass_context(target):
+    """Autotvm tuner inherits PassContexts but also adds a gpu verification pass by default.
+
+    Test that using PassContext inherits passes properly but also runs gpu verification pass.
+    """
+
+    @pass_instrument
+    class PassInstrumentChecker:
+        """Pass Instrument that simply sees if it's been run."""
+
+        def __init__(self):
+            self.has_been_run = False
+
+        def run_after_pass(self, *_):
+            self.has_been_run = True
+
+    class GPUVerifyPassMocked:
+        """Context manager that mocks tir.analysis.verify_gpu_code meant
+        to verify the pass has been run. This is done by patching the ffi func handles."""
+
+        FFI_FUNC_HANDLE = "tir.analysis.verify_gpu_code"
+        FUNC_NAME = "verify_gpu_code"
+
+        def __init__(self) -> None:
+            self.old_impl = tvm._ffi.get_global_func(self.FFI_FUNC_HANDLE)
+            self.has_been_run = False
+
+        def gpu_verify_pass_mocked(self):
+            """Get the replacement for the gpu verification pass."""
+
+            def _gpu_verify_pass_mocked(*args, **kwargs):
+                self.has_been_run = True
+                return self.old_impl(*args, **kwargs)
+
+            return _gpu_verify_pass_mocked
+
+        def __enter__(self):
+            tvm._ffi.register_func(
+                self.FFI_FUNC_HANDLE, self.gpu_verify_pass_mocked(), override=True
+            )
+
+            # Also overwrite the python bindings
+            setattr(
+                _analysis_ffi_api, self.FUNC_NAME, tvm._ffi.get_global_func(self.FFI_FUNC_HANDLE)
+            )
+
+        def __exit__(self, *args, **kwargs):
+            # Restore FFI status back to normal
+            tvm._ffi.register_func(self.FFI_FUNC_HANDLE, self.old_impl, override=True)
+            setattr(_analysis_ffi_api, self.FUNC_NAME, self.old_impl)
+
+    class OverwrittenBuildFunc(measure_methods._WrappedBuildFunc):
+        """BuildFunc that mocks and patches as necessary to test proper passes are run."""
+
+        def __call__(self, measure_input, tmp_dir, **kwargs):
+            instrument = PassInstrumentChecker()
+            mocked_pass_checker = GPUVerifyPassMocked()
+            with mocked_pass_checker:
+                with PassContext(instruments=[instrument]):
+                    regular_result = super().__call__(measure_input, tmp_dir, **kwargs)
+
+                    # Check instrument has been run, meaning context was inherited by builder
+                    assert instrument.has_been_run
+
+                    # But also check the gpu verification pass has been run
+                    # (which was not in the inherited ctx)
+                    assert mocked_pass_checker.has_been_run
+
+                    return regular_result
+
+    class MockedLocalBuilder(measure_methods.LocalBuilder):
+        """As measure_methods.LocalBuilder but overwrites the PassContext for testing."""
+
+        def __init__(
+            self,
+            timeout=10,
+            n_parallel=None,
+            build_kwargs=None,
+            build_func="default",
+            do_fork=False,
+            runtime=None,
+        ):
+            # pylint: disable=too-many-function-args
+            super().__init__(timeout, n_parallel, build_kwargs, build_func, do_fork, runtime)
+
+            self.build_func = OverwrittenBuildFunc(tar.tar, runtime)
+
+    def runner(target):
+        task, target = get_sample_task(target, None)
+        logging.info("task config space: %s", task.config_space)
+
+        # Note: we use the MockedLocalBuilder here instead of autotvm.LocalBuilder()
+        measure_option = autotvm.measure_option(MockedLocalBuilder(), autotvm.LocalRunner())
+
+        results = []
+
+        tuner = RandomTuner(task)
+        tuner.tune(
+            n_trial=1,
+            measure_option=measure_option,
+            callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
+        )
+
+        assert len(results) == 1
+
+    run_test_with_all_multiprocessing(runner, target)
+
+
+def test_tuning_cpu():
+    """Test tuning on cpu."""
+
+    def runner():
+        ir_mod = tvm.parser.fromtext(
+            textwrap.dedent(
+                """
+            #[version = "0.0.5"]
+            def @main(%a : Tensor[(1, 3, 32, 32), float32], %b : Tensor[(3, 3, 5, 5), float32]) {
+                nn.conv2d(%a, %b, data_layout="NCHW", kernel_layout="OIHW")
+            }
+            """
+            )
+        )
+        tasks = autotvm.task.relay_integration.extract_from_program(
+            ir_mod, {}, tvm.target.create("llvm")
+        )
+        assert len(tasks) == 1, f"Extracted != 1 task from program: {tasks!r}"
+
+        task = tasks[0]
+
+        measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())
+
+        results = []
+
+        tuner = RandomTuner(task)
+        tuner.tune(
+            n_trial=20,
+            measure_option=measure_option,
+            callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
+        )
+
+        assert len(results) == 20
+
+        successful_results = [r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR]
+        assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
+
+    run_test_with_all_multiprocessing(runner)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py
index 04c5f85ce5d4..af5143908108 100644
--- a/tests/python/integration/test_tuning.py
+++ b/tests/python/integration/test_tuning.py
@@ -14,367 +14,86 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""
-Test the tuner
-"""
+# pylint: disable=missing-docstring
 import logging
-import multiprocessing as mp
-import textwrap
+import tempfile
+from typing import List, Optional
 
+import numpy as np  # type: ignore
+import pytest
 import tvm
-import tvm.relay
-import tvm.testing
-from tvm import autotvm, te
-from tvm.autotvm.measure import measure_methods
-from tvm.autotvm.tuner import RandomTuner
-from tvm.contrib import tar
-from tvm.ir.instrument import pass_instrument
-from tvm.ir.transform import PassContext
-from tvm.target import Target
-from tvm.tir.analysis import _ffi_api as _analysis_ffi_api
-
-
-def setup_module():
-    """Setup the module used for testing."""
-
-    @autotvm.template("testing/conv2d_no_batching")
-    def conv2d_no_batching(  # pylint: disable=unused-variable
-        batch_size, input_h, input_w, channels_in, channels_out, kernel_h, kernel_w
-    ):
-        """An example template for testing"""
-        assert batch_size == 1, "Only consider batch_size = 1 in this template"
-
-        data = te.placeholder((batch_size, channels_in, input_h, input_w), name="data")
-        kernel = te.placeholder((channels_out, channels_in, kernel_h, kernel_w), name="kernel")
-
-        axis_rc = te.reduce_axis((0, channels_in), name="rc")
-        axis_ry = te.reduce_axis((0, kernel_h), name="ry")
-        axis_rx = te.reduce_axis((0, kernel_w), name="rx")
-
-        conv = te.compute(
-            (batch_size, channels_out, input_h - kernel_h + 1, input_w - kernel_w + 1),
-            lambda nn, ff, yy, xx: te.sum(
-                data[nn, axis_rc, yy + axis_ry, xx + axis_rx]
-                * kernel[ff, axis_rc, axis_ry, axis_rx],
-                axis=[axis_rc, axis_ry, axis_rx],
-            ),
-            tag="conv2d_nchw",
-        )
-
-        schedule = te.create_schedule([conv.op])
-
-        output = conv
-        cache_write_ol = schedule.cache_write(conv, "local")
-
-        # create cache stage
-        cache_read_aa = schedule.cache_read(data, "shared", [cache_write_ol])
-        cache_read_ww = schedule.cache_read(kernel, "shared", [cache_write_ol])
-        cache_read_al = schedule.cache_read(cache_read_aa, "local", [cache_write_ol])
-        cache_read_wl = schedule.cache_read(cache_read_ww, "local", [cache_write_ol])
-
-        # tile and bind spatial axes
-        axis_n, axis_f, axis_y, axis_x = schedule[output].op.axis
-        cfg = autotvm.get_config()
-        cfg.define_split("tile_f", cfg.axis(axis_f), num_outputs=4)
-        cfg.define_split("tile_y", cfg.axis(axis_y), num_outputs=4)
-        cfg.define_split("tile_x", cfg.axis(axis_x), num_outputs=4)
-        axis_bf, axis_vf, axis_tf, axis_fi = cfg["tile_f"].apply(schedule, output, axis_f)
-        axis_by, axis_vy, axis_ty, axis_yi = cfg["tile_y"].apply(schedule, output, axis_y)
-        axis_bx, axis_vx, axis_tx, axis_xi = cfg["tile_x"].apply(schedule, output, axis_x)
-        kernel_scope = axis_n  # this is the scope to attach global config inside this kernel
-
-        schedule[output].bind(axis_bf, te.thread_axis("blockIdx.z"))
-        schedule[output].bind(axis_by, te.thread_axis("blockIdx.y"))
-        schedule[output].bind(axis_bx, te.thread_axis("blockIdx.x"))
-        schedule[output].bind(axis_vf, te.thread_axis("vthread"))
-        schedule[output].bind(axis_vy, te.thread_axis("vthread"))
-        schedule[output].bind(axis_vx, te.thread_axis("vthread"))
-        schedule[output].bind(axis_tf, te.thread_axis("threadIdx.z"))
-        schedule[output].bind(axis_ty, te.thread_axis("threadIdx.y"))
-        schedule[output].bind(axis_tx, te.thread_axis("threadIdx.x"))
-        schedule[output].reorder(
-            axis_n,
-            axis_bf,
-            axis_by,
-            axis_bx,
-            axis_vf,
-            axis_vy,
-            axis_vx,
-            axis_tf,
-            axis_ty,
-            axis_tx,
-            axis_fi,
-            axis_yi,
-            axis_xi,
-        )
-        schedule[cache_write_ol].compute_at(schedule[output], axis_tx)
-
-        # tile and bind reduction axes
-        axis_n, axis_f, axis_y, axis_x = schedule[cache_write_ol].op.axis
-        axis_rc, axis_ry, axis_rx = schedule[cache_write_ol].op.reduce_axis
-        cfg.define_split("tile_rc", cfg.axis(axis_rc), num_outputs=3)
-        cfg.define_split("tile_ry", cfg.axis(axis_ry), num_outputs=3)
-        cfg.define_split("tile_rx", cfg.axis(axis_rx), num_outputs=3)
-        axis_rco, axis_rcm, axis_rci = cfg["tile_rc"].apply(schedule, cache_write_ol, axis_rc)
-        axis_ryo, axis_rym, axis_ryi = cfg["tile_rx"].apply(schedule, cache_write_ol, axis_ry)
-        axis_rxo, axis_rxm, axis_rxi = cfg["tile_ry"].apply(schedule, cache_write_ol, axis_rx)
-        schedule[cache_write_ol].reorder(
-            axis_rco,
-            axis_ryo,
-            axis_rxo,
-            axis_rcm,
-            axis_rym,
-            axis_rxm,
-            axis_rci,
-            axis_ryi,
-            axis_rxi,
-            axis_n,
-            axis_f,
-            axis_y,
-            axis_x,
-        )
-
-        schedule[cache_read_aa].compute_at(schedule[cache_write_ol], axis_rxo)
-        schedule[cache_read_ww].compute_at(schedule[cache_write_ol], axis_rxo)
-        schedule[cache_read_al].compute_at(schedule[cache_write_ol], axis_rxm)
-        schedule[cache_read_wl].compute_at(schedule[cache_write_ol], axis_rxm)
-
-        # cooperative fetching
-        for load in [cache_read_aa, cache_read_ww]:
-            axis_n, axis_f, axis_y, axis_x = schedule[load].op.axis
-            fused = schedule[load].fuse(axis_n, axis_f, axis_y, axis_x)
-            axis_tz, fused = schedule[load].split(fused, nparts=cfg["tile_f"].size[2])
-            axis_ty, fused = schedule[load].split(fused, nparts=cfg["tile_y"].size[2])
-            axis_tx, fused = schedule[load].split(fused, nparts=cfg["tile_x"].size[2])
-            schedule[load].bind(axis_tz, te.thread_axis("threadIdx.z"))
-            schedule[load].bind(axis_ty, te.thread_axis("threadIdx.y"))
-            schedule[load].bind(axis_tx, te.thread_axis("threadIdx.x"))
-
-        # tune unroll
-        cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-        cfg.define_knob("unroll_explicit", [0, 1])
-        schedule[output].pragma(
-            kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val
-        )
-        schedule[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-        return schedule, [data, kernel, conv]
-
-
-def teardown_module():
-    """Remove the module from the autotvm task tables."""
-    # TODO(areusch): Tasks should not be registered into a global.
-    del autotvm.task.task.TASK_TABLE["testing/conv2d_no_batching"]
-
-
-def get_sample_task(target=tvm.target.cuda(), target_host=None):
-    """return a sample task for testing"""
-    target, target_host = Target.canon_target_and_host(target, target_host)
-    task = autotvm.task.create(
-        "testing/conv2d_no_batching", args=(1, 7, 7, 512, 512, 3, 3), target=target
+from tvm import meta_schedule as ms
+from tvm import relay
+from tvm.contrib import graph_executor
+from tvm.meta_schedule.testing.relay_workload import get_network
+from tvm.target.target import Target
+
+logging.basicConfig(
+    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
+
+
+@pytest.mark.skip("Integration test")
+@pytest.mark.parametrize(
+    "model_name, input_shape, target, layout",
+    [
+        ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC"),
+        ("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3090-ti", "NHWC"),
+    ],
+)
+def test_meta_schedule_tune_relay(
+    model_name: str,
+    input_shape: List[int],
+    target: str,
+    layout: Optional[str],
+):
+    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
+    if model_name.startswith("bert"):
+        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
+    else:
+        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)
+
+    mod, params, (input_name, _, _) = get_network(
+        name=model_name,
+        input_shape=input_shape,
+        layout=layout,
     )
-    return task, target
-
-
-def run_test_with_all_multiprocessing(func, *args, **kwargs):
-    """Check all multiprocessing methods work for the tuning test.
-
-    In the past fork() had the most support at detriment to spawn() and forkserver().
-    As fork() is unavailable or unsafe on some platforms it is good to check all
-    available methods.
-    """
-    for multiprocessing_method in mp.get_all_start_methods():
-        old_start_method = mp.get_start_method()
-        try:
-            mp.set_start_method(multiprocessing_method, force=True)
-            func(*args, **kwargs)
-        finally:
-            mp.set_start_method(old_start_method, force=True)
-
-
-@tvm.testing.parametrize_targets("cuda", "opencl")
-def test_tuning_gpu(target):
-    """Test gpu tuning."""
-
-    def runner(target):
-        # init task
-        task, target = get_sample_task(target, None)
-        logging.info("task config space: %s", task.config_space)
-
-        measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())
-
-        results = []
-
-        tuner = RandomTuner(task)
-        tuner.tune(
-            n_trial=20,
-            measure_option=measure_option,
-            callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
-        )
-
-        assert len(results) == 20
-
-        successful_results = [
-            r
-            for r in results
-            if r.error_no == autotvm.MeasureErrorNo.NO_ERROR
-            # We filter records before building if we know they won't work ahead of time.
-            # We can't guarantee we get one good record so we count these as success too
-            or r.error_no == autotvm.MeasureErrorNo.INSTANTIATION_ERROR
-        ]
-        assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
-
-    run_test_with_all_multiprocessing(runner, target)
-
-
-@tvm.testing.parametrize_targets("cuda", "opencl")
-def test_tuning_gpu_inherits_pass_context(target):
-    """Autotvm tuner inherits PassContexts but also adds a gpu verification pass by default.
-
-    Test that using PassContext inherits passes properly but also runs gpu verification pass.
-    """
-
-    @pass_instrument
-    class PassInstrumentChecker:
-        """Pass Instrument that simply sees if it's been run."""
-
-        def __init__(self):
-            self.has_been_run = False
 
-        def run_after_pass(self, *_):
-            self.has_been_run = True
-
-    class GPUVerifyPassMocked:
-        """Context manager that mocks tir.analysis.verify_gpu_code meant
-        to verify the pass has been run. This is done by patching the ffi func handles."""
-
-        FFI_FUNC_HANDLE = "tir.analysis.verify_gpu_code"
-        FUNC_NAME = "verify_gpu_code"
-
-        def __init__(self) -> None:
-            self.old_impl = tvm._ffi.get_global_func(self.FFI_FUNC_HANDLE)
-            self.has_been_run = False
-
-        def gpu_verify_pass_mocked(self):
-            """Get the replacement for the gpu verification pass."""
-
-            def _gpu_verify_pass_mocked(*args, **kwargs):
-                self.has_been_run = True
-                return self.old_impl(*args, **kwargs)
-
-            return _gpu_verify_pass_mocked
-
-        def __enter__(self):
-            tvm._ffi.register_func(
-                self.FFI_FUNC_HANDLE, self.gpu_verify_pass_mocked(), override=True
-            )
-
-            # Also overwrite the python bindings
-            setattr(
-                _analysis_ffi_api, self.FUNC_NAME, tvm._ffi.get_global_func(self.FFI_FUNC_HANDLE)
+    target = Target(target)
+    with tempfile.TemporaryDirectory() as work_dir:
+        with ms.Profiler() as profiler:
+            database = ms.relay_integration.tune_relay(
+                mod=mod,
+                target=target,
+                params=params,
+                work_dir=work_dir,
+                max_trials_global=2048,
             )
-
-        def __exit__(self, *args, **kwargs):
-            # Restore FFI status back to normal
-            tvm._ffi.register_func(self.FFI_FUNC_HANDLE, self.old_impl, override=True)
-            setattr(_analysis_ffi_api, self.FUNC_NAME, self.old_impl)
-
-    class OverwrittenBuildFunc(measure_methods._WrappedBuildFunc):
-        """BuildFunc that mocks and patches as necessary to test proper passes are run."""
-
-        def __call__(self, measure_input, tmp_dir, **kwargs):
-            instrument = PassInstrumentChecker()
-            mocked_pass_checker = GPUVerifyPassMocked()
-            with mocked_pass_checker:
-                with PassContext(instruments=[instrument]):
-                    regular_result = super().__call__(measure_input, tmp_dir, **kwargs)
-
-                    # Check instrument has been run, meaning context was inherited by builder
-                    assert instrument.has_been_run
-
-                    # But also check the gpu verification pass has been run
-                    # (which was not in the inherited ctx)
-                    assert mocked_pass_checker.has_been_run
-
-                    return regular_result
-
-    class MockedLocalBuilder(measure_methods.LocalBuilder):
-        """As measure_methods.LocalBuilder but overwrites the PassContext for testing."""
-
-        def __init__(
-            self,
-            timeout=10,
-            n_parallel=None,
-            build_kwargs=None,
-            build_func="default",
-            do_fork=False,
-            runtime=None,
-        ):
-            # pylint: disable=too-many-function-args
-            super().__init__(timeout, n_parallel, build_kwargs, build_func, do_fork, runtime)
-
-            self.build_func = OverwrittenBuildFunc(tar.tar, runtime)
-
-    def runner(target):
-        task, target = get_sample_task(target, None)
-        logging.info("task config space: %s", task.config_space)
-
-        # Note: we use the MockedLocalBuilder here instead of autotvm.LocalBuilder()
-        measure_option = autotvm.measure_option(MockedLocalBuilder(), autotvm.LocalRunner())
-
-        results = []
-
-        tuner = RandomTuner(task)
-        tuner.tune(
-            n_trial=1,
-            measure_option=measure_option,
-            callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
-        )
-
-        assert len(results) == 1
-
-    run_test_with_all_multiprocessing(runner, target)
-
-
-def test_tuning_cpu():
-    """Test tuning on cpu."""
-
-    def runner():
-        ir_mod = tvm.parser.fromtext(
-            textwrap.dedent(
-                """
-            #[version = "0.0.5"]
-            def @main(%a : Tensor[(1, 3, 32, 32), float32], %b : Tensor[(3, 3, 5, 5), float32]) {
-                nn.conv2d(%a, %b, data_layout="NCHW", kernel_layout="OIHW")
-            }
-            """
+            rt_mod1 = ms.relay_integration.compile_relay(
+                database=database,
+                mod=mod,
+                target=target,
+                params=params,
             )
-        )
-        tasks = autotvm.task.relay_integration.extract_from_program(
-            ir_mod, {}, tvm.target.create("llvm")
-        )
-        assert len(tasks) == 1, f"Extracted != 1 task from program: {tasks!r}"
-
-        task = tasks[0]
-
-        measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())
-
-        results = []
-
-        tuner = RandomTuner(task)
-        tuner.tune(
-            n_trial=20,
-            measure_option=measure_option,
-            callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
-        )
-
-        assert len(results) == 20
-
-        successful_results = [r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR]
-        assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
-
-    run_test_with_all_multiprocessing(runner)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
+        print(profiler.table())
+        # Compile without meta-schedule for correctness check
+        with tvm.transform.PassContext(opt_level=0):
+            rt_mod2 = relay.build(mod, target=target, params=params)
+
+        def get_output(data, lib):
+            module = graph_executor.GraphModule(lib["default"](dev))
+            module.set_input(input_name, data)
+            module.run()
+            return module.get_output(0).numpy()
+
+        # Check correctness
+        actual_output = get_output(data, rt_mod1)
+        expected_output = get_output(data, rt_mod2)
+        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
+
+
+if __name__ == """__main__""":
+    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC")
+    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3090-ti", None)
diff --git a/tests/python/unittest/test_meta_schedule_cost_model.py b/tests/python/unittest/test_meta_schedule_cost_model.py
index c47897eabb3e..ed5229a20af5 100644
--- a/tests/python/unittest/test_meta_schedule_cost_model.py
+++ b/tests/python/unittest/test_meta_schedule_cost_model.py
@@ -15,27 +15,27 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-docstring
-from typing import List
-
 import os
 import re
 import shutil
 import tempfile
-from functools import partial
 import unittest
-import numpy as np
+from functools import partial
+from typing import List
 
+import numpy as np
 import tvm
 import tvm.testing
-from tvm.script import tir as T
-from tvm.tir.schedule.schedule import Schedule
 from tvm.meta_schedule.cost_model import PyCostModel, RandomModel, XGBModel
-from tvm.meta_schedule.cost_model.xgb_model import _get_custom_call_back, PackSum
+from tvm.meta_schedule.cost_model.xgb_model import PackSum, _get_custom_call_back
 from tvm.meta_schedule.feature_extractor import RandomFeatureExtractor
 from tvm.meta_schedule.runner import RunnerResult
 from tvm.meta_schedule.search_strategy import MeasureCandidate
 from tvm.meta_schedule.tune_context import TuneContext
 from tvm.meta_schedule.utils import derived_object
+from tvm.script import tir as T
+from tvm.tir.schedule.schedule import Schedule
+
 
 # pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,missing-docstring
 @tvm.script.ir_module
@@ -244,9 +244,10 @@ def xgb_version_check():
 @unittest.skipIf(xgb_version_check(), "test not supported for xgboost version after 1.6.0")
 def test_meta_schedule_xgb_model_callback_as_function():
     # pylint: disable=import-outside-toplevel
-    import xgboost as xgb
     from itertools import chain as itertools_chain
 
+    import xgboost as xgb
+
     # pylint: enable=import-outside-toplevel
 
     extractor = RandomFeatureExtractor()
diff --git a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cpu.py b/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cpu.py
index 69408a2e901a..ac18bab81006 100644
--- a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cpu.py
+++ b/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cpu.py
@@ -168,10 +168,6 @@ def test_conv2d_winograd_cpu():
         target=target,
         task_name="Custom Search Space Task",
         space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=ms.default_config.schedule_rules(
-            None,
-            target,
-        ),
     )
     post_order_apply = context.space_generator
     (sch,) = post_order_apply.generate_design_space(mod)
diff --git a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py b/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py
index 958baabedb6d..89a04a9464ce 100644
--- a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py
+++ b/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py
@@ -286,9 +286,6 @@ def test_conv2d_winograd_cuda():
         target=Target("nvidia/geforce-rtx-3090", host="llvm"),
         task_name="Custom Search Space Task",
         space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=ms.default_config.schedule_rules(  # pylint: disable=protected-access
-            None, Target("cuda")
-        ),
     )
     post_order_apply = context.space_generator
     (sch,) = post_order_apply.generate_design_space(mod)
diff --git a/tests/python/unittest/test_meta_schedule_measure_callback.py b/tests/python/unittest/test_meta_schedule_measure_callback.py
index fba8c883e501..20596e8e8c4d 100644
--- a/tests/python/unittest/test_meta_schedule_measure_callback.py
+++ b/tests/python/unittest/test_meta_schedule_measure_callback.py
@@ -73,14 +73,7 @@ def apply(
 
     measure_callback = FancyMeasureCallback()
     measure_callback.apply(
-        ms.task_scheduler.RoundRobin(
-            tasks=[],
-            task_weights=[],
-            builder=DummyBuilder(),
-            runner=DummyRunner(),
-            database=ms.database.MemoryDatabase(),
-            max_trials=1,
-        ),
+        ms.task_scheduler.RoundRobin(),
         0,
         [ms.MeasureCandidate(Schedule(Matmul), None)],
         [ms.builder.BuilderResult("test_build", None)],
@@ -104,14 +97,7 @@ def apply(
     measure_callback = FailingMeasureCallback()
     with pytest.raises(ValueError, match="test"):
         measure_callback.apply(
-            ms.task_scheduler.RoundRobin(
-                tasks=[],
-                task_weights=[],
-                builder=DummyBuilder(),
-                runner=DummyRunner(),
-                database=ms.database.MemoryDatabase(),
-                max_trials=1,
-            ),
+            ms.task_scheduler.RoundRobin(),
             0,
             [ms.MeasureCandidate(Schedule(Matmul), None)],
             [ms.builder.BuilderResult("test_build", None)],
diff --git a/tests/python/unittest/test_meta_schedule_mutator_mutate_compute_location.py b/tests/python/unittest/test_meta_schedule_mutator_mutate_compute_location.py
index 3d4a9966cb90..4147a9fbab86 100644
--- a/tests/python/unittest/test_meta_schedule_mutator_mutate_compute_location.py
+++ b/tests/python/unittest/test_meta_schedule_mutator_mutate_compute_location.py
@@ -15,8 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.mutator import MutateComputeLocation, Mutator
+from tvm import meta_schedule as ms
 from tvm.script import tir as T
 from tvm.target import Target
 from tvm.tir import Schedule
@@ -61,15 +60,17 @@ def _sch(decision: int) -> Schedule:
     return sch
 
 
-def _make_mutator(target: Target) -> Mutator:
-    ctx = TuneContext(
+def _make_mutator(target: Target) -> ms.Mutator:
+    ctx = ms.TuneContext(
         mod=add,
         target=target,
-        mutator_probs={
-            MutateComputeLocation(): 1.0,
-        },
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[],
+            mutator_probs={ms.mutator.MutateComputeLocation(): 1.0},
+        ),
     )
-    return list(ctx.mutator_probs.keys())[0]
+    return list(ctx.space_generator.mutator_probs.keys())[0]
 
 
 def test_mutate_compute_location_add():
diff --git a/tests/python/unittest/test_meta_schedule_mutator_mutate_parallel.py b/tests/python/unittest/test_meta_schedule_mutator_mutate_parallel.py
index b517c3ed490a..728f522335bf 100644
--- a/tests/python/unittest/test_meta_schedule_mutator_mutate_parallel.py
+++ b/tests/python/unittest/test_meta_schedule_mutator_mutate_parallel.py
@@ -17,8 +17,7 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 from typing import List
 
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.mutator import MutateParallel, Mutator
+from tvm import meta_schedule as ms
 from tvm.script import tir as T
 from tvm.target import Target
 from tvm.tir import Schedule
@@ -79,15 +78,17 @@ def _sch(decisions: List[List[int]], ann_val: int) -> Schedule:
     return sch
 
 
-def _make_mutator(target: Target, max_jobs_per_core: int) -> Mutator:
-    ctx = TuneContext(
+def _make_mutator(target: Target, max_jobs_per_core: int) -> ms.Mutator:
+    ctx = ms.TuneContext(
         mod=matmul,
         target=target,
-        mutator_probs={
-            MutateParallel(max_jobs_per_core): 1.0,
-        },
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[],
+            mutator_probs={ms.mutator.MutateParallel(max_jobs_per_core): 1.0},
+        ),
     )
-    return list(ctx.mutator_probs.keys())[0]
+    return list(ctx.space_generator.mutator_probs.keys())[0]
 
 
 def test_mutate_parallel_matmul():
diff --git a/tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py b/tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py
index 1dc7588edd7d..d3a431af0687 100644
--- a/tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py
+++ b/tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py
@@ -15,8 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.mutator import MutateThreadBinding, Mutator
+from tvm import meta_schedule as ms
 from tvm.script import tir as T
 from tvm.target import Target
 from tvm.tir import Schedule
@@ -62,15 +61,17 @@ def _sch() -> Schedule:
     return sch
 
 
-def _make_mutator(target: Target) -> Mutator:
-    ctx = TuneContext(
+def _make_mutator(target: Target) -> ms.Mutator:
+    ctx = ms.TuneContext(
         mod=element_wise,
         target=target,
-        mutator_probs={
-            MutateThreadBinding(): 1.0,
-        },
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[],
+            mutator_probs={ms.mutator.MutateThreadBinding(): 1.0},
+        ),
     )
-    return list(ctx.mutator_probs.keys())[0]
+    return list(ctx.space_generator.mutator_probs.keys())[0]
 
 
 def test_mutate_thread_binding():
diff --git a/tests/python/unittest/test_meta_schedule_mutator_mutate_tile_size.py b/tests/python/unittest/test_meta_schedule_mutator_mutate_tile_size.py
index 00b190a75de7..0600c0b79194 100644
--- a/tests/python/unittest/test_meta_schedule_mutator_mutate_tile_size.py
+++ b/tests/python/unittest/test_meta_schedule_mutator_mutate_tile_size.py
@@ -19,8 +19,7 @@
 from functools import reduce
 from typing import List
 
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.mutator import MutateTileSize, Mutator
+from tvm import meta_schedule as ms
 from tvm.script import tir as T
 from tvm.target import Target
 from tvm.tir import Schedule
@@ -67,13 +66,17 @@ def _sch(decisions: List[List[int]]) -> Schedule:
     return sch
 
 
-def _make_mutator(target: Target) -> Mutator:
-    ctx = TuneContext(
+def _make_mutator(target: Target) -> ms.Mutator:
+    ctx = ms.TuneContext(
         mod=matmul,
         target=target,
-        mutator_probs={MutateTileSize(): 1.0},
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[],
+            mutator_probs={ms.mutator.MutateTileSize(): 1.0},
+        ),
     )
-    return list(ctx.mutator_probs.keys())[0]
+    return list(ctx.space_generator.mutator_probs.keys())[0]
 
 
 def test_mutate_tile_size_matmul():
diff --git a/tests/python/unittest/test_meta_schedule_mutator_mutate_unroll.py b/tests/python/unittest/test_meta_schedule_mutator_mutate_unroll.py
index 7bed83f52232..a59a7e655b09 100644
--- a/tests/python/unittest/test_meta_schedule_mutator_mutate_unroll.py
+++ b/tests/python/unittest/test_meta_schedule_mutator_mutate_unroll.py
@@ -17,8 +17,7 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 from typing import List
 
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.mutator import MutateUnroll, Mutator
+from tvm import meta_schedule as ms
 from tvm.script import tir as T
 from tvm.target import Target
 from tvm.tir import Schedule
@@ -84,15 +83,17 @@ def _sch(decisions: List[List[int]]) -> Schedule:
     return sch
 
 
-def _make_mutator(target: Target) -> Mutator:
-    ctx = TuneContext(
+def _make_mutator(target: Target) -> ms.Mutator:
+    ctx = ms.TuneContext(
         mod=matmul,
         target=target,
-        mutator_probs={
-            MutateUnroll(): 1.0,
-        },
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[],
+            mutator_probs={ms.mutator.MutateUnroll(): 1.0},
+        ),
     )
-    return list(ctx.mutator_probs.keys())[0]
+    return list(ctx.space_generator.mutator_probs.keys())[0]
 
 
 def test_mutate_unroll_matmul():
diff --git a/tests/python/unittest/test_meta_schedule_post_order_apply.py b/tests/python/unittest/test_meta_schedule_post_order_apply.py
index b40ba2869d1c..9026feb9e08e 100644
--- a/tests/python/unittest/test_meta_schedule_post_order_apply.py
+++ b/tests/python/unittest/test_meta_schedule_post_order_apply.py
@@ -243,8 +243,11 @@ def test_meta_schedule_post_order_apply():
         mod=mod,
         target=Target("llvm"),
         task_name="Test Task",
-        space_generator=PostOrderApply(),
-        sch_rules=[WowSoFancyScheduleRule()],
+        space_generator=PostOrderApply(
+            sch_rules=[WowSoFancyScheduleRule()],
+            postprocs=[],
+            mutator_probs={},
+        ),
     )
     post_order_apply = context.space_generator
     schs = post_order_apply.generate_design_space(mod)
@@ -259,8 +262,11 @@ def test_meta_schedule_post_order_apply_double():
         mod=mod,
         target=Target("llvm"),
         task_name="Double Rules Task",
-        space_generator=PostOrderApply(),
-        sch_rules=[DoubleScheduleRule()],
+        space_generator=PostOrderApply(
+            sch_rules=[DoubleScheduleRule()],
+            postprocs=[],
+            mutator_probs={},
+        ),
     )
     post_order_apply = context.space_generator
     schs = post_order_apply.generate_design_space(mod)
@@ -276,8 +282,11 @@ def test_meta_schedule_post_order_apply_multiple():
         mod=mod,
         target=Target("llvm"),
         task_name="Double Rules Task",
-        space_generator=PostOrderApply(),
-        sch_rules=[DoubleScheduleRule(), ReorderScheduleRule()],
+        space_generator=PostOrderApply(
+            sch_rules=[DoubleScheduleRule(), ReorderScheduleRule()],
+            postprocs=[],
+            mutator_probs={},
+        ),
     )
     post_order_apply = context.space_generator
     schs = post_order_apply.generate_design_space(mod)
@@ -293,8 +302,11 @@ def test_meta_schedule_post_order_apply_duplicate_matmul():
         mod=mod,
         target=Target("llvm"),
         task_name="Duplicate Matmul Task",
-        space_generator=PostOrderApply(),
-        sch_rules=[WowSoFancyScheduleRule()],
+        space_generator=PostOrderApply(
+            sch_rules=[WowSoFancyScheduleRule()],
+            postprocs=[],
+            mutator_probs={},
+        ),
     )
     post_order_apply = context.space_generator
     with pytest.raises(
@@ -346,8 +358,11 @@ def correct_trace(a, b, c, d):
         mod=mod,
         target=Target("llvm"),
         task_name="Remove Block Task",
-        space_generator=PostOrderApply(),
-        sch_rules=[RemoveBlock(), TrinityDoubleRule()],
+        space_generator=PostOrderApply(
+            sch_rules=[RemoveBlock(), TrinityDoubleRule()],
+            postprocs=[],
+            mutator_probs={},
+        ),
     )
     post_order_apply = context.space_generator
     schs = post_order_apply.generate_design_space(mod)
@@ -373,8 +388,11 @@ def test_meta_schedule_custom_search_space():
         mod=mod,
         target=Target("llvm"),
         task_name="Custom Search Space Task",
-        space_generator=PostOrderApply(),
-        sch_rules=[],
+        space_generator=PostOrderApply(
+            sch_rules=[],
+            postprocs=[],
+            mutator_probs={},
+        ),
     )
     post_order_apply = context.space_generator
     post_order_apply.generate_design_space(mod)
@@ -401,8 +419,12 @@ def _get_sch(filter_fn):
             mod=mod,
             target=Target("llvm"),
             task_name="Custom Search Space Task",
-            space_generator=PostOrderApply(f_block_filter=filter_fn),
-            sch_rules=[TrinityDoubleRule()],
+            space_generator=PostOrderApply(
+                f_block_filter=filter_fn,
+                sch_rules=[TrinityDoubleRule()],
+                postprocs=[],
+                mutator_probs={},
+            ),
         )
         post_order_apply = context.space_generator
         schs = post_order_apply.generate_design_space(mod)
diff --git a/tests/python/unittest/test_meta_schedule_postproc_disallow_dynamic_loop.py b/tests/python/unittest/test_meta_schedule_postproc_disallow_dynamic_loop.py
index 92c669ca1feb..5dc2500d1b2d 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_disallow_dynamic_loop.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_disallow_dynamic_loop.py
@@ -17,9 +17,8 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 
 import tvm
+from tvm import meta_schedule as ms
 from tvm import tir
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.postproc import DisallowDynamicLoop
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -28,13 +27,17 @@ def _target() -> Target:
     return Target("cuda", host="llvm")
 
 
-def _create_context(mod, target) -> TuneContext:
-    ctx = TuneContext(
+def _create_context(mod, target) -> ms.TuneContext:
+    ctx = ms.TuneContext(
         mod=mod,
         target=target,
-        postprocs=[
-            DisallowDynamicLoop(),
-        ],
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[
+                ms.postproc.DisallowDynamicLoop(),
+            ],
+            mutator_probs={},
+        ),
         task_name="test",
     )
     return ctx
@@ -83,14 +86,14 @@ def test_postproc_disallow_dynamic_loops():
     mod = Matmul
     ctx = _create_context(mod, target=_target())
     sch = tir.Schedule(mod, debug_mask="all")
-    assert ctx.postprocs[0].apply(sch)
+    assert ctx.space_generator.postprocs[0].apply(sch)
 
 
 def test_postproc_disallow_dynamic_loops_fail():
     mod = DynamicLoop
     ctx = _create_context(mod, target=_target())
     sch = tir.Schedule(mod, debug_mask="all")
-    assert not ctx.postprocs[0].apply(sch)
+    assert not ctx.space_generator.postprocs[0].apply(sch)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py
index e55f693e72d3..c82bc697c993 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py
@@ -18,9 +18,8 @@
 
 import tvm
 import tvm.testing
+from tvm import meta_schedule as ms
 from tvm import tir
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.postproc import RewriteCooperativeFetch
 from tvm.meta_schedule.testing import te_workload
 from tvm.script import tir as T
 from tvm.target import Target
@@ -31,13 +30,17 @@ def _target() -> Target:
     return Target("cuda", host="llvm")
 
 
-def _create_context(mod, target) -> TuneContext:
-    ctx = TuneContext(
+def _create_context(mod, target) -> ms.TuneContext:
+    ctx = ms.TuneContext(
         mod=mod,
         target=target,
-        postprocs=[
-            RewriteCooperativeFetch(),
-        ],
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[
+                ms.postproc.RewriteCooperativeFetch(),
+            ],
+            mutator_probs={},
+        ),
         task_name="test",
     )
     return ctx
@@ -246,7 +249,7 @@ def test_rewrite_cooperative_fetch():
     # pylint: enable=line-too-long,invalid-name
     # fmt: on
     sch.enter_postproc()
-    assert ctx.postprocs[0].apply(sch)
+    assert ctx.space_generator.postprocs[0].apply(sch)
     tvm.ir.assert_structural_equal(sch.mod, AfterRewrite0)
 
 
@@ -291,8 +294,7 @@ def test_rewrite_warp_execution():
     # pylint: enable=line-too-long,invalid-name
     # fmt: on
     sch.enter_postproc()
-    assert ctx.postprocs[0].apply(sch)
-    print(sch.mod["main"].script())
+    assert ctx.space_generator.postprocs[0].apply(sch)
     tvm.ir.assert_structural_equal(sch.mod, WarpExecutionAfterRewrite)
 
 
diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py
index e0ed68b69ce0..91a51c8e9033 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py
@@ -15,10 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
-
 import tvm
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.postproc import RewriteLayout
+import tvm.testing
+from tvm import meta_schedule as ms
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -27,32 +26,41 @@ def _target() -> Target:
     return Target("cuda", host="llvm")
 
 
-def _create_context(mod, target) -> TuneContext:
-    return TuneContext(
+def _create_context(mod, target) -> ms.TuneContext:
+    ctx = ms.TuneContext(
         mod=mod,
         target=target,
-        postprocs=[
-            RewriteLayout(),
-        ],
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[
+                ms.postproc.RewriteLayout(),
+            ],
+            mutator_probs={},
+        ),
         task_name="test",
     )
+    return ctx
 
 
 class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
     def transform(self):
         def inner(mod):
             target = Target("cuda", host="llvm")
-            ctx = TuneContext(
+            ctx = ms.TuneContext(
                 mod=mod,
                 target=target,
-                postprocs=[
-                    RewriteLayout(),
-                ],
+                space_generator=ms.space_generator.PostOrderApply(
+                    sch_rules=[],
+                    postprocs=[
+                        ms.postproc.RewriteLayout(),
+                    ],
+                    mutator_probs={},
+                ),
                 task_name="test",
             )
             sch = tvm.tir.Schedule(mod, debug_mask="all")
             sch.enter_postproc()
-            assert ctx.postprocs[0].apply(sch)
+            assert ctx.space_generator.postprocs[0].apply(sch)
             return sch.mod
 
         return inner
@@ -147,5 +155,54 @@ def expected(A: T.Buffer[(16, 1), "float32"]):
                 T.evaluate(A_global[vi])
 
 
+@T.prim_func
+def tir_matmul(
+    A: T.Buffer[(16, 16), "float32"],
+    B: T.Buffer[(16, 16), "float32"],
+    C: T.Buffer[(16, 16), "float32"],
+) -> None:
+    T.func_attr({"layout_free_buffers": [1]})
+    for i0, j, k0, i1, k1 in T.grid(4, 16, 4, 4, 4):
+        with T.block("matmul"):
+            vi = T.axis.S(16, i0 * 4 + i1)
+            vj = T.axis.S(16, j)
+            vk = T.axis.R(16, k0 * 4 + k1)
+            with T.init():
+                C[vi, vj] = T.float32(0)
+            C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@T.prim_func
+def rewritten_tir_matmul(
+    A: T.Buffer[(16, 16), "float32"],
+    B: T.Buffer[(16, 16), "float32"],
+    C: T.Buffer[(16, 16), "float32"],
+) -> None:
+    T.func_attr({"layout_free_buffers": [1]})
+    B_reindex = T.alloc_buffer([16, 4, 4], dtype="float32")
+    for ax0, ax1 in T.grid(16, 16):
+        with T.block("layout_rewrite"):
+            i0, i1 = T.axis.remap("SS", [ax0, ax1])
+            T.block_attr({"meta_schedule.layout_rewrite_preproc": True})
+            B_reindex[i1, i0 // 4, i0 % 4] = B[i0, i1]
+    for i0, j, k0, i1, k1 in T.grid(4, 16, 4, 4, 4):
+        with T.block("matmul"):
+            vi = T.axis.spatial(16, i0 * 4 + i1)
+            vj = T.axis.spatial(16, j)
+            vk = T.axis.reduce(16, k0 * 4 + k1)
+            with T.init():
+                C[vi, vj] = T.float32(0)
+            C[vi, vj] = C[vi, vj] + A[vi, vk] * B_reindex[vj, vk // 4, vk % 4]
+
+
+def test_layout_rewrite():
+    target = _target()
+    ctx = _create_context(tir_matmul, target)
+    sch = tvm.tir.Schedule(tir_matmul, debug_mask="all")
+    sch.enter_postproc()
+    assert ctx.space_generator.postprocs[0].apply(sch)
+    tvm.ir.assert_structural_equal(sch.mod["main"], rewritten_tir_matmul)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_reduction_block.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_reduction_block.py
index 24d1229b3ac6..7e499424058d 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_reduction_block.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_reduction_block.py
@@ -17,9 +17,8 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 
 import tvm
+from tvm import meta_schedule as ms
 from tvm import tir
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.postproc import RewriteReductionBlock
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -28,13 +27,17 @@ def _target() -> Target:
     return Target("cuda", host="llvm")
 
 
-def _create_context(mod, target) -> TuneContext:
-    ctx = TuneContext(
+def _create_context(mod, target) -> ms.TuneContext:
+    ctx = ms.TuneContext(
         mod=mod,
         target=target,
-        postprocs=[
-            RewriteReductionBlock(),
-        ],
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[
+                ms.postproc.RewriteReductionBlock(),
+            ],
+            mutator_probs={},
+        ),
         task_name="test",
     )
     return ctx
@@ -200,7 +203,7 @@ def test_rewrite_tiled_matmul():
     ctx = _create_context(mod, target)
     sch = tir.Schedule(mod, debug_mask="all")
     sch.enter_postproc()
-    assert ctx.postprocs[0].apply(sch)
+    assert ctx.space_generator.postprocs[0].apply(sch)
     tvm.ir.assert_structural_equal(sch.mod, Matmul_after_rewrite)
 
 
@@ -210,7 +213,7 @@ def test_rewrite_softmax():
     ctx = _create_context(mod, target)
     sch = tir.Schedule(mod, debug_mask="all")
     sch.enter_postproc()
-    assert ctx.postprocs[0].apply(sch)
+    assert ctx.space_generator.postprocs[0].apply(sch)
     # The module should not be rewritten
     tvm.ir.assert_structural_equal(sch.mod, Softmax_cross_thread_reduction)
 
diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py
index fc624cd5a68f..8f9d287621e2 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py
@@ -16,7 +16,7 @@
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 import tvm
-from tvm.meta_schedule import TuneContext, postproc
+from tvm import meta_schedule as ms
 from tvm.script import tir as T
 from tvm.tir.tensor_intrin import arm_cpu, cuda, rocm, x86
 
@@ -450,11 +450,15 @@ def main(
                             compute[v0, v1] = compute_local[v0, v1]
 
 
-def _create_context(mod, target, postprocs):
-    ctx = TuneContext(
+def _create_context(mod, target, postprocs) -> ms.TuneContext:
+    ctx = ms.TuneContext(
         mod=mod,
         target=target,
-        postprocs=postprocs,
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=postprocs,
+            mutator_probs={},
+        ),
         task_name="test",
     )
     return ctx
@@ -467,14 +471,14 @@ def test_rewrite_tensorize_conv2d_nchwc_vnni():
         mod,
         target,
         [
-            postproc.RewriteReductionBlock(),
-            postproc.RewriteTensorize(True),
+            ms.postproc.RewriteReductionBlock(),
+            ms.postproc.RewriteTensorize(True),
         ],
     )
     sch = tvm.tir.Schedule(mod, debug_mask="all")
     sch.enter_postproc()
 
-    for proc in ctx.postprocs:
+    for proc in ctx.space_generator.postprocs:
         proc.apply(sch)
 
     tvm.ir.assert_structural_equal(sch.mod, Conv2dNCHWcVNNIModuleTensorized)
@@ -487,15 +491,15 @@ def test_rewrite_tensorize_dense_dp4a():
         mod,
         target,
         [
-            postproc.RewriteCooperativeFetch(),
-            postproc.RewriteReductionBlock(),
-            postproc.RewriteTensorize(),
+            ms.postproc.RewriteCooperativeFetch(),
+            ms.postproc.RewriteReductionBlock(),
+            ms.postproc.RewriteTensorize(),
         ],
     )
     sch = tvm.tir.Schedule(mod, debug_mask="all")
     sch.enter_postproc()
 
-    for proc in ctx.postprocs:
+    for proc in ctx.space_generator.postprocs:
         proc.apply(sch)
 
     tvm.ir.assert_structural_equal(sch.mod, DenseDP4ATensorized)
diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_unbound_block.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_unbound_block.py
index ebc435a02e8b..b01447ad4a9e 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_unbound_block.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_unbound_block.py
@@ -17,25 +17,25 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 
 import tvm
+from tvm import meta_schedule as ms
 from tvm import tir
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.postproc import RewriteUnboundBlock
 from tvm.script import tir as T
 from tvm.target import Target
-from tvm.tir.schedule.schedule import Schedule
 
 
 def _target() -> Target:
     return Target("cuda --max_threads_per_block=1024", host="llvm")
 
 
-def _create_context(mod, target) -> TuneContext:
-    ctx = TuneContext(
+def _create_context(mod, target) -> ms.TuneContext:
+    ctx = ms.TuneContext(
         mod=mod,
         target=target,
-        postprocs=[
-            RewriteUnboundBlock(),
-        ],
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[ms.postproc.RewriteUnboundBlock()],
+            mutator_probs={},
+        ),
         task_name="test",
     )
     return ctx
@@ -363,7 +363,7 @@ def test_rewrite_cooperative_fetch():
     ctx = _create_context(mod, target)
     sch = tir.Schedule(mod, debug_mask="all")
     sch.enter_postproc()
-    assert ctx.postprocs[0].apply(sch)
+    assert ctx.space_generator.postprocs[0].apply(sch)
     tvm.ir.assert_structural_equal(sch.mod, After_cooperative_fetch)
 
 
@@ -373,7 +373,7 @@ def test_rewrite_norm_bmn():
     ctx = _create_context(mod, target)
     sch = tir.Schedule(mod, debug_mask="all")
     sch.enter_postproc()
-    assert ctx.postprocs[0].apply(sch)
+    assert ctx.space_generator.postprocs[0].apply(sch)
     tvm.ir.assert_structural_equal(sch.mod, After_norm_bmn)
 
 
@@ -383,7 +383,7 @@ def test_rewrite_cuda_loop_split_no_reduction():
     ctx = _create_context(mod, target)
     sch = tir.Schedule(mod, debug_mask="all")
     sch.enter_postproc()
-    assert ctx.postprocs[0].apply(sch)
+    assert ctx.space_generator.postprocs[0].apply(sch)
     tvm.ir.assert_structural_equal(sch.mod, Bert_fused_reshape_transpose_reshape_after_rub)
 
 
@@ -393,7 +393,7 @@ def test_rewrite_cuda_loop_split_no_reduction_large():
     ctx = _create_context(mod, target)
     sch = tir.Schedule(mod, debug_mask="all")
     sch.enter_postproc()
-    assert ctx.postprocs[0].apply(sch)
+    assert ctx.space_generator.postprocs[0].apply(sch)
     tvm.ir.assert_structural_equal(sch.mod, Bert_fused_reshape_transpose_reshape_after_rub_large)
 
 
@@ -403,7 +403,7 @@ def test_rewrite_cuda_loop_split_for_kind():
     ctx = _create_context(mod, target)
     sch = tir.Schedule(mod, debug_mask="all")
     sch.enter_postproc()
-    assert ctx.postprocs[0].apply(sch)
+    assert ctx.space_generator.postprocs[0].apply(sch)
     tvm.ir.assert_structural_equal(sch.mod["main"], after_unrolled_loop)
 
 
diff --git a/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py b/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py
index e7632561c05c..86a88af40309 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py
@@ -15,15 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
-
-import sys
-
 import pytest
 import tvm
 import tvm.testing
+from tvm import meta_schedule as ms
 from tvm import tir
-from tvm.meta_schedule import TuneContext
-from tvm.meta_schedule.postproc import VerifyGPUCode
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -32,16 +28,17 @@ def _target() -> Target:
     return Target("nvidia/geforce-rtx-3080")
 
 
-def _create_context(mod, target) -> TuneContext:
-    ctx = TuneContext(
+def _create_context(mod, target) -> ms.TuneContext:
+    return ms.TuneContext(
         mod=mod,
         target=target,
-        postprocs=[
-            VerifyGPUCode(),
-        ],
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[ms.postproc.VerifyGPUCode()],
+            mutator_probs={},
+        ),
         task_name="test",
     )
-    return ctx
 
 
 # pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument,not-callable,misplaced-comparison-constant
@@ -786,7 +783,7 @@ def GMMCUDATensorCore(
 def test_postproc_check_pass(mod):
     ctx = _create_context(mod, target=_target())
     sch = tir.Schedule(mod, debug_mask="all")
-    assert ctx.postprocs[0].apply(sch)
+    assert ctx.space_generator.postprocs[0].apply(sch)
 
 
 @pytest.mark.parametrize(
@@ -801,7 +798,7 @@ def test_postproc_check_pass(mod):
 def test_postproc_check_fail(mod):
     ctx = _create_context(mod, target=_target())
     sch = tir.Schedule(mod, debug_mask="all")
-    assert not ctx.postprocs[0].apply(sch)
+    assert not ctx.space_generator.postprocs[0].apply(sch)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py
similarity index 55%
rename from tests/python/unittest/test_meta_schedule_integration.py
rename to tests/python/unittest/test_meta_schedule_relay_integration.py
index 366a2e4887ed..cf61df0c6ba8 100644
--- a/tests/python/unittest/test_meta_schedule_integration.py
+++ b/tests/python/unittest/test_meta_schedule_relay_integration.py
@@ -23,9 +23,13 @@
 from tvm import meta_schedule as ms
 from tvm import relay, te, tir
 from tvm._ffi import register_func
+from tvm.contrib import graph_executor
+from tvm.ir.transform import PassContext
 from tvm.meta_schedule.testing.relay_workload import get_network
 from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base
+from tvm.meta_schedule.tune_context import _normalize_mod
 from tvm.script import tir as T
+from tvm.target import Target
 
 # pylint: disable=no-member,line-too-long,too-many-nested-blocks,unbalanced-tuple-unpacking,no-self-argument,missing-docstring,invalid-name
 
@@ -60,15 +64,14 @@ def test_meta_schedule_dynamic_loop_extent():
     a = relay.var("a", shape=(1, 8, 8, 512), dtype="float32")
     b = relay.nn.adaptive_avg_pool2d(a, (7, 7), "NHWC")
     mod = IRModule({"main": relay.Function([a], b)})
-    extracted_tasks = ms.extract_task_from_relay(mod, target="llvm", params={})
+    extracted_tasks = ms.relay_integration.extract_tasks(mod, target="llvm", params={})
     assert not extracted_tasks
 
 
-@pytest.mark.xfail(strict=True, reason="See https://github.com/apache/tvm/issues/12732")
 @requires_torch
 def test_meta_schedule_integration_extract_from_resnet():
     mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
-    extracted_tasks = ms.extract_task_from_relay(mod, target="llvm", params=params)
+    extracted_tasks = ms.relay_integration.extract_tasks(mod, target="llvm", params=params)
     expected_task_names = [
         "fused_" + s
         for s in [
@@ -186,7 +189,7 @@ def test_meta_schedule_integration_extract_from_bert_base():
         ),
     }
     mod, params, _ = get_network(name="bert_base", input_shape=[1, 64])
-    extracted_tasks = ms.extract_task_from_relay(mod, target="llvm", params=params)
+    extracted_tasks = ms.relay_integration.extract_tasks(mod, target="llvm", params=params)
     assert len(extracted_tasks) == len(expected)
     for t in extracted_tasks:
         prim_func = None
@@ -199,7 +202,6 @@ def test_meta_schedule_integration_extract_from_bert_base():
         assert expected_shape == shape, t.task_name
 
 
-@pytest.mark.xfail(strict=True, reason="See https://github.com/apache/tvm/issues/12732")
 @requires_torch
 def test_meta_schedule_integration_extract_from_resnet_with_filter_func():
     @register_func("relay.backend.tir_converter.remove_purely_spatial", override=True)
@@ -229,11 +231,14 @@ def traverse(t):
         return create_prim_func(args)
 
     mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
-    extracted_tasks = ms.extract_task_from_relay(
+    extracted_tasks = ms.relay_integration.extract_tasks(
         mod,
         target="llvm",
         params=params,
-        tir_converter="remove_purely_spatial",
+        pass_config={
+            "relay.backend.use_meta_schedule": True,
+            "relay.backend.tir_converter": "remove_purely_spatial",
+        },
     )
     expected_task_names = [
         "fused_" + s
@@ -266,32 +271,34 @@ def traverse(t):
 
 @pytest.mark.skip("Too slow on CI")
 def extract_task_qbert():
-    mod, params, _ = load_quantized_bert_base(batch_size=1, seq_len=128)
-    target = "llvm -mcpu=cascadelake"
-    extracted_tasks = ms.extract_task_from_relay(mod, target, params)
-    tune_tasks = list(
-        filter(
-            lambda task: "dense" in task.task_name or "batch_matmul" in task.task_name,
-            extracted_tasks,
+    def _test(mod, params, target):
+        extracted_tasks = ms.relay_integration.extract_tasks(mod, target, params)
+        tune_tasks = list(
+            filter(
+                lambda task: "dense" in task.task_name or "batch_matmul" in task.task_name,
+                extracted_tasks,
+            )
         )
-    )
-    # three int8 dense, two int8 bmm, and one fp32 dense
-    assert len(tune_tasks) == 6
+        # three int8 dense, two int8 bmm, and one fp32 dense
+        assert len(tune_tasks) == 6
+
+        for task in tune_tasks:
+            relay_func = list(task.mod.functions.values())[0]
+            out_type = relay_func.body.checked_type
 
-    for task in tune_tasks:
-        relay_func = list(task.mod.functions.values())[0]
-        out_type = relay_func.body.checked_type
+            if out_type.dtype == "float32":
+                continue
 
-        if out_type.dtype == "float32":
-            continue
+            sch = tvm.tir.Schedule(_normalize_mod(task.dispatched[0]))
+            block = sch.get_block("compute")
+            annotations = sch.get(block).annotations
 
-        mod = ms.default_config.mod(task.dispatched[0])
-        sch = tvm.tir.Schedule(mod)
-        block = sch.get_block("compute")
-        annotations = sch.get(block).annotations
+            assert "schedule_rule" in annotations
+            assert "vnni" in annotations["schedule_rule"]
+        ...
 
-        assert "schedule_rule" in annotations
-        assert "vnni" in annotations["schedule_rule"]
+    mod, params, _ = load_quantized_bert_base(batch_size=1, seq_len=128)
+    _test(mod, params, target="llvm -mcpu=cascadelake")
 
 
 @tvm.testing.skip_if_32bit(reason="Apparently the LLVM version on i386 image is too old")
@@ -322,7 +329,7 @@ def test_extract_task_arm_conv2d_nchwc():
     params = {"weight": weight_np, "bias": bias_np}
 
     target = "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon"
-    extracted_tasks = ms.extract_task_from_relay(relay_mod, target, params)
+    extracted_tasks = ms.relay_integration.extract_tasks(relay_mod, target, params)
     tune_tasks = list(
         filter(
             lambda task: "conv2d" in task.task_name,
@@ -339,5 +346,148 @@ def test_extract_task_arm_conv2d_nchwc():
     assert list(out_type.shape) == [1, 8, 130, 130, 4]
 
 
+def test_meta_schedule_te2primfunc_argument_order():
+    # pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument
+    # fmt: off
+    @tvm.script.ir_module
+    class _fused_layout_transform:
+        @T.prim_func
+        def main( # type: ignore
+            placeholder: T.Buffer[(1, 3, 16, 16), "float32"], # type: ignore
+            T_layout_trans: T.Buffer[(1, 1, 16, 16, 3), "float32"], # type: ignore
+        ) -> None: # type: ignore
+            # function attr dict
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            # body
+            # with T.block("root")
+            for i0, i1, i2, i3, i4 in T.grid(1, 1, 16, 16, 3):
+                with T.block("T_layout_trans"):
+                    ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                    T.reads(placeholder[ax0, ax1 * 3 + ax4, ax2, ax3])
+                    T.writes(T_layout_trans[ax0, ax1, ax2, ax3, ax4])
+                    T_layout_trans[ax0, ax1, ax2, ax3, ax4] = T.if_then_else(
+                        ax0 < 1 and ax1 * 3 + ax4 < 3 and ax2 < 16 and ax3 < 16, # type: ignore
+                        placeholder[ax0, ax1 * 3 + ax4, ax2, ax3],
+                        T.float32(0),
+                        dtype="float32",
+                    )
+
+    @tvm.script.ir_module
+    class _fused_layout_transform_1:
+        @T.prim_func
+        def main(placeholder: T.Buffer[(1, 2, 16, 16, 4), "float32"], T_layout_trans: T.Buffer[(1, 8, 16, 16), "float32"]) -> None: # type: ignore
+            # function attr dict
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            # body
+            # with T.block("root")
+            for i0, i1, i2, i3 in T.grid(1, 8, 16, 16):
+                with T.block("T_layout_trans"):
+                    ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                    T.reads(placeholder[ax0, ax1 // 4, ax2, ax3, ax1 % 4]) # type: ignore
+                    T.writes(T_layout_trans[ax0, ax1, ax2, ax3])
+                    T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < 1 and ax1 < 8 and ax2 < 16 and ax3 < 16, placeholder[ax0, ax1 // 4, ax2, ax3, ax1 % 4], T.float32(0), dtype="float32") # type: ignore
+
+    @tvm.script.ir_module
+    class _fused_nn_contrib_conv2d_NCHWc:
+        @T.prim_func
+        def main(placeholder: T.Buffer[(1, 1, 16, 16, 3), "float32"], placeholder_1: T.Buffer[(2, 1, 5, 5, 3, 4), "float32"], conv2d_NCHWc: T.Buffer[(1, 2, 16, 16, 4), "float32"]) -> None: # type: ignore
+            # function attr dict
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            # body
+            # with T.block("root")
+            data_pad = T.alloc_buffer([1, 1, 20, 20, 3], dtype="float32")
+            for i0, i1, i2, i3, i4 in T.grid(1, 1, 20, 20, 3):
+                with T.block("data_pad"):
+                    i0_1, i1_1, i2_1, i3_1, i4_1 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                    T.reads(placeholder[i0_1, i1_1, i2_1 - 2, i3_1 - 2, i4_1])
+                    T.writes(data_pad[i0_1, i1_1, i2_1, i3_1, i4_1])
+                    data_pad[i0_1, i1_1, i2_1, i3_1, i4_1] = T.if_then_else(2 <= i2_1 and i2_1 < 18 and 2 <= i3_1 and i3_1 < 18, placeholder[i0_1, i1_1, i2_1 - 2, i3_1 - 2, i4_1], T.float32(0), dtype="float32") # type: ignore # pylint: disable=R1716
+            for i0, i1, i2, i3, i4, i5, i6, i7 in T.grid(1, 2, 16, 16, 4, 3, 5, 5):
+                with T.block("conv2d_NCHWc"):
+                    n, oc_chunk, oh, ow, oc_block, ic, kh, kw = T.axis.remap("SSSSSRRR", [i0, i1, i2, i3, i4, i5, i6, i7])
+                    T.reads(data_pad[n, ic // 3, oh + kh, ow + kw, ic % 3], placeholder_1[oc_chunk, ic // 3, kh, kw, ic % 3, oc_block]) # type: ignore
+                    T.writes(conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block])
+                    T.block_attr({"workload":["conv2d_NCHWc.x86", ["TENSOR", [1, 1, 16, 16, 3], "float32"], ["TENSOR", [2, 1, 5, 5, 3, 4], "float32"], [1, 1], [2, 2, 2, 2], [1, 1], "NCHW3c", "NCHW4c", "float32"]})
+                    with T.init():
+                        conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = T.float32(0)
+                    conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] + data_pad[n, ic // 3, oh + kh, ow + kw, ic % 3] * placeholder_1[oc_chunk, ic // 3, kh, kw, ic % 3, oc_block] # type: ignore
+
+    # fmt: on
+    # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument
+
+    def _create_database():
+        database = ms.database.create("memory")
+
+        def _commit(mod):
+            workload = database.commit_workload(mod)
+            database.commit_tuning_record(
+                ms.database.TuningRecord(
+                    tir.schedule.Trace([], {}),
+                    workload=workload,
+                    run_secs=[0.1],
+                )
+            )
+
+        _commit(_fused_layout_transform)
+        _commit(_fused_layout_transform_1)
+        _commit(_fused_nn_contrib_conv2d_NCHWc)
+        return database
+
+    data_shape = (1, 3, 16, 16)
+    weight_shape = (8, 3, 5, 5)
+
+    def _create_relay_mod():
+        data = relay.var("data", relay.TensorType(data_shape, "float32"))
+        weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
+        y = relay.nn.conv2d(
+            data,
+            weight,
+            padding=(2, 2),
+            kernel_size=(5, 5),
+            kernel_layout="OIHW",
+            out_dtype="float32",
+        )
+        f = relay.Function([data, weight], y)
+        mod = tvm.IRModule.from_expr(f)
+        mod = relay.transform.InferType()(mod)
+        return mod
+
+    mod = _create_relay_mod()
+    dev = tvm.cpu()
+    target = Target("llvm --num-cores=16")
+    params = {
+        "weight": np.random.rand(*weight_shape).astype("float32"),
+    }
+    data = tvm.nd.array(
+        np.random.rand(*data_shape).astype("float32"),
+        dev,
+    )
+
+    with target, _create_database(), PassContext(
+        opt_level=3,
+        config={
+            "relay.backend.use_meta_schedule": True,
+            "relay.backend.use_meta_schedule_dispatch": 7,
+            "relay.backend.tir_converter": "default",
+        },
+    ):
+        rt_mod1 = relay.build(mod, target=target, params=params)
+
+    # Compile without meta-schedule for correctness check
+    with tvm.transform.PassContext(opt_level=0):
+        rt_mod2 = relay.build(mod, target=target, params=params)
+
+    def get_output(data, lib):
+        module = graph_executor.GraphModule(lib["default"](dev))
+        module.set_input("data", data)
+        module.run()
+        return module.get_output(0).numpy()
+
+    # Check correctness
+    actual_output = get_output(data, rt_mod1)
+    expected_output = get_output(data, rt_mod2)
+    assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py b/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py
index 70b49944ba0f..7f56683588ba 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py
@@ -17,7 +17,10 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing import te_workload
-from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+)
 from tvm.script import tir as T
 from tvm.target import Target
 from tvm.te import create_prim_func
@@ -104,13 +107,12 @@ def cpu_matmul_2(
         ("SamplePerfectTile", [4, 128]),
     ]
     mod = create_prim_func(te_workload.matmul(n=4, m=4, k=512))
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="llvm",
         mod=mod,
         target=Target("llvm --num-cores=32"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[ms.schedule_rule.AddRFactor()],
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.AddRFactor,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -269,13 +271,12 @@ def argmax_2(
         ("SamplePerfectTile", [8, 16]),
     ]
     mod = argmax
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="llvm",
         mod=mod,
         target=Target("llvm --num-cores=32"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[ms.schedule_rule.AddRFactor()],
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.AddRFactor,
+    )
     check_sketches(
         mod,
         sketches=actual,
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
index a50292df7ae3..f0eee4138daa 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
@@ -16,8 +16,10 @@
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.schedule_rule import get_rules
-from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+)
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -80,13 +82,12 @@ def elementwise_0(
         ("SampleCategorical", 5),
     ]
     mod = element_wise
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3080", host="llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.AutoBind),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.AutoBind,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -114,13 +115,12 @@ def reduction_loop_only_0(
                         C[()] = T.min(C[()], A[k0] / B[k0])
 
     mod = reduction_loop_only
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3080", host="llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.AutoBind),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.AutoBind,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -145,13 +145,12 @@ def zero_dim_add_0(
                     C[()] = A[()] + B[()]
 
     mod = zero_dim_add
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3080", host="llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.AutoBind),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.AutoBind,
+    )
     check_sketches(
         mod,
         sketches=actual,
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
index c0801c9d7b5e..c17209e2cb77 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
@@ -17,7 +17,7 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 import tvm
 from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.schedule_rule import get_rules
+from tvm.meta_schedule.testing.space_generation import generate_design_space
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -338,74 +338,63 @@ def main(T_full: T.Buffer[(1, 12, 4096), "int64"]) -> None:
 # fmt: on
 
 
-def _create_context(mod, target, rule):
-    ctx = ms.TuneContext(
-        mod=mod,
-        target=target,
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[rule],
-        task_name="test",
-    )
-    return ctx
-
-
 def test_inline_consumer_chain():
     mod = Conv2DBiasBnReLU
     target = Target("llvm")
-    ctx = _create_context(
+    (space,) = generate_design_space(
+        kind="llvm",
         mod=mod,
         target=target,
-        rule=get_rules("llvm", ms.schedule_rule.AutoInline)[0],
+        types=ms.schedule_rule.AutoInline,
     )
-    (space,) = ctx.space_generator.generate_design_space(mod=mod)
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=Conv2DBiasBnReLUInlined)
 
 
 def test_inline_into_cache():
     mod = MultiLevelTiledConv2D
     target = Target("cuda", host="llvm")
-    ctx = _create_context(
+    (space,) = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=target,
-        rule=get_rules("cuda", ms.schedule_rule.AutoInline)[0],
+        types=ms.schedule_rule.AutoInline,
     )
-    (space,) = ctx.space_generator.generate_design_space(mod=mod)
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=MultiLevelTiledConv2DAfterInline)
 
 
 def test_inline_into_multiple_consumers():
     mod = SoftmaxBeforeInline
     target = Target("cuda", host="llvm")
-    ctx = _create_context(
+    (space,) = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=target,
-        rule=get_rules("cuda", ms.schedule_rule.AutoInline)[0],
+        types=ms.schedule_rule.AutoInline,
     )
-    (space,) = ctx.space_generator.generate_design_space(mod=mod)
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=SoftmaxAfterInline)
 
 
 def test_inline_pure_spatial():
     mod = BeforePureSpatial
     target = Target("llvm")
-    ctx = _create_context(
+    (space,) = generate_design_space(
+        kind="llvm",
         mod=mod,
         target=target,
-        rule=get_rules("llvm", ms.schedule_rule.AutoInline)[0],
+        types=ms.schedule_rule.AutoInline,
     )
-    (space,) = ctx.space_generator.generate_design_space(mod=mod)
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=AfterPureSpatial)
 
 
 def test_inline_constant_tensor():
     mod = ConstConsumer
     target = Target("cuda", host="llvm")
-    ctx = _create_context(
+    (space,) = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=target,
-        rule=get_rules("cuda", ms.schedule_rule.AutoInline)[0],
+        types=ms.schedule_rule.AutoInline,
     )
-    (space,) = ctx.space_generator.generate_design_space(mod=mod)
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=ConstConsumer)
 
 
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
index 718b264bddd2..c851c9bec3b5 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py
@@ -19,8 +19,10 @@
 import tvm
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing import te_workload
-from tvm.meta_schedule.testing.schedule_rule import get_rules
-from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+)
 from tvm.script import tir as T
 from tvm.target import Target
 from tvm.te import create_prim_func
@@ -280,13 +282,12 @@ def softmax_mn_3(
         ("SampleCategorical", 7),
     ]
     mod = create_prim_func(te_workload.softmax_mn(n=256, m=256))
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3090", host="llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.CrossThreadReduction,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -476,13 +477,12 @@ def softmax_mn_after_inline_3(
     ]
 
     mod = Softmax_mn_after_inline
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3090", host="llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.CrossThreadReduction,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -552,13 +552,12 @@ def batch_norm_bmn_1(A: T.Buffer[(1, 512, 512), "float32"], D: T.Buffer[1, "floa
     ]
 
     mod = create_prim_func(te_workload.norm_bmn(B=1, M=512, N=512))
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3090", host="llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.CrossThreadReduction,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -670,13 +669,12 @@ def argmax_1(
     ]
 
     mod = argmax
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3090", host="llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.CrossThreadReduction,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -745,13 +743,12 @@ def argmax_1(
     ]
 
     mod = argmax_32
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3090", host="llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.CrossThreadReduction,
+    )
     check_sketches(
         mod,
         sketches=actual,
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
index d9d078106333..28e6f295e78f 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
@@ -16,10 +16,12 @@
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 from tvm import meta_schedule as ms
-from tvm import te, target
+from tvm import target, te
 from tvm.meta_schedule.testing import te_workload
-from tvm.meta_schedule.testing.schedule_rule import get_rules
-from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+)
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -128,13 +130,12 @@ def cpu_matmul_2(
     ]
 
     mod = te.create_prim_func(te_workload.matmul(512, 512, 512))
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="llvm",
         mod=mod,
         target=Target("llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("llvm", ms.schedule_rule.MultiLevelTiling),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.MultiLevelTiling,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -253,13 +254,12 @@ def cpu_matmul_relu_2(
         ("SamplePerfectTile", [64, 8]),
     ]
     mod = te.create_prim_func(te_workload.matmul_relu(512, 512, 512))
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="llvm",
         mod=mod,
         target=Target("llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("llvm", ms.schedule_rule.MultiLevelTiling),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.MultiLevelTiling,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -360,13 +360,12 @@ def cuda_matmul_0(
         ("SampleCategorical", 0),
     ]
     mod = te.create_prim_func(te_workload.matmul(512, 512, 512))
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3080"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.MultiLevelTiling),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.MultiLevelTiling,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -479,13 +478,12 @@ def cuda_matmul_relu_0(
         ("SampleCategorical", 3),
     ]
     mod = te.create_prim_func(te_workload.matmul_relu(512, 512, 512))
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3080"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.MultiLevelTiling),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.MultiLevelTiling,
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -511,13 +509,12 @@ def sum_with_trivial_block_iter(
 
     # Expect nothing to happen - the rule is not supposed to be applied in this case
     mod = sum_with_trivial_block_iter
-    (sch,) = ms.TuneContext(
+    (sch,) = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("nvidia/geforce-rtx-3080"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=get_rules("cuda", ms.schedule_rule.MultiLevelTiling),
-        task_name="test",
-    ).generate_design_space()
+        types=ms.schedule_rule.MultiLevelTiling,
+    )
     assert not sch.trace.simplified(remove_postproc=True).insts
 
 
@@ -593,10 +590,11 @@ def cpu_conv2d_nhwc(
         te_workload.conv2d_nhwc(1, H, W, I, O, 3, 1, 1, 1, in_dtype="float16", out_dtype="float16")
     )
 
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target(target_hexagon, host=target_hexagon),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[
             ms.schedule_rule.MultiLevelTilingWideVector(
                 structure="SRSRS",
@@ -606,8 +604,7 @@ def cpu_conv2d_nhwc(
                 reuse_write=None,
             )
         ],
-        task_name="test",
-    ).generate_design_space()
+    )
 
     decision_0 = [
         ("SamplePerfectTile", [1, 1, 1]),
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py
index 38ddb137e108..e70f7cb2c618 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py
@@ -18,7 +18,10 @@
 from tvm import meta_schedule as ms
 from tvm import te
 from tvm.ir import assert_structural_equal
-from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+)
 from tvm.script import tir as T
 from tvm.target import Target
 from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN
@@ -226,10 +229,11 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac
 
     mod = conv2d_nchwc
     target = Target("llvm -mcpu=cascadelake -num-cores=4")
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="llvm",
         mod=mod,
         target=Target(target),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[
             ms.schedule_rule.MultiLevelTilingWithIntrin(
                 VNNI_INTRIN,
@@ -241,7 +245,7 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac
                 reuse_write=ms.schedule_rule.ReuseType(req="may", levels=[1, 2], scope="global"),
             ),
         ],
-    ).generate_design_space()
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -266,10 +270,11 @@ def _dense(m, n, k, in_dtype, out_dtype):
         return te.create_prim_func([X, W, matmul])
 
     mod = _dense(m, n, k, in_dtype, out_dtype)
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=Target("cuda"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[
             ms.schedule_rule.MultiLevelTilingWithIntrin(
                 DP4A_INTRIN,
@@ -281,7 +286,7 @@ def _dense(m, n, k, in_dtype, out_dtype):
                 reuse_write=ms.schedule_rule.ReuseType(req="must", levels=[3], scope="local"),
             )
         ],
-    ).generate_design_space()
+    )
     if expected_mods is None:
         assert expected_decisions is None
         assert len(actual) == 1
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
index a53c1062b98d..0e4bd6bf302a 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
@@ -20,8 +20,11 @@
 from tvm import meta_schedule as ms
 from tvm import te
 from tvm.meta_schedule.testing import te_workload
-from tvm.meta_schedule.testing.schedule_rule import get_rules
-from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+    get_rules,
+)
 from tvm.script import tir as T
 from tvm.tir.tensor_intrin.cuda import get_wmma_intrin_group
 
@@ -186,13 +189,16 @@ def matmul_relu_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "f
             out_dtype="float32",
         )
     )
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=tvm.target.Target("cuda"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[multi_level_tiling_tensor_core()]
-        + get_rules("cuda", ms.schedule_rule.AutoInline),
-    ).generate_design_space()
+        types=None,
+        sch_rules=[
+            multi_level_tiling_tensor_core(),
+        ]
+        + get_rules(kind="cuda", types=ms.schedule_rule.AutoInline),
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -324,10 +330,11 @@ def matmul_relu_fallback_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128,
             out_dtype="float32",
         )
     )
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=tvm.target.Target("cuda"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[
             multi_level_tiling_tensor_core(),
         ]
@@ -338,7 +345,7 @@ def matmul_relu_fallback_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128,
                 ms.schedule_rule.AutoInline,
             ),
         ),
-    ).generate_design_space()
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -475,12 +482,15 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3,
             out_dtype="float32",
         )
     )
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=tvm.target.Target("cuda"),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules=[multi_level_tiling_tensor_core()],
-    ).generate_design_space()
+        types=None,
+        sch_rules=[
+            multi_level_tiling_tensor_core(),
+        ],
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -490,17 +500,18 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3,
 
     # Test adding inapplicable tensor intrinsics doesn't change the search space
     # This test case uses the same workload, decision and the expected sketch as above
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=tvm.target.Target("cuda"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[
             multi_level_tiling_tensor_core(
                 in_dtype="float16",
                 out_dtype=["float16", "float32"],
             ),
         ],
-    ).generate_design_space()
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -638,16 +649,17 @@ def matmul_relu_pipeline_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128,
             out_dtype="float32",
         )
     )
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=tvm.target.Target("cuda"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[
             multi_level_tiling_tensor_core(
                 use_software_pipeline=True,
             ),
         ],
-    ).generate_design_space()
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -775,13 +787,14 @@ def matmul_relu_global_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 1
             out_dtype="float32",
         )
     )
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=tvm.target.Target("cuda"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="global")]
         + get_rules("cuda", ms.schedule_rule.AutoInline),
-    ).generate_design_space()
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -799,13 +812,14 @@ def test_matmul_relu_non_tensorizable():
             k=128,
         )
     )
-    (sch,) = ms.TuneContext(
+    (sch,) = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=tvm.target.Target("cuda"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="global")]
         + get_rules("cuda", ms.schedule_rule.AutoInline),
-    ).generate_design_space()
+    )
     tvm.ir.assert_structural_equal(mod, sch.mod["main"])
 
 
@@ -934,13 +948,14 @@ def padded_matmul_relu_0(A: T.Buffer[(127, 127), "float16"], B: T.Buffer[(127, 1
             out_dtype="float32",
         )
     )
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=tvm.target.Target("cuda"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="shared")]
         + get_rules("cuda", ms.schedule_rule.AutoInline),
-    ).generate_design_space()
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -1083,13 +1098,14 @@ def conv2d_1x1_0(inputs: T.Buffer[(1, 16, 16, 64), "float16"], weight: T.Buffer[
             out_dtype="float32",
         )
     )
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="cuda",
         mod=mod,
         target=tvm.target.Target("cuda"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="shared")]
         + get_rules("cuda", ms.schedule_rule.AutoInline),
-    ).generate_design_space()
+    )
     check_sketches(
         mod,
         sketches=actual,
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py b/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py
index 8076fcaa8bd4..520dfbfb1cc5 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py
@@ -17,7 +17,10 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 import tvm
 from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+)
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -252,10 +255,11 @@ def Matmul_0(
     ]
 
     mod = Matmul
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="llvm",
         mod=mod,
         target=Target("llvm --num-cores=32"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[
             ms.schedule_rule.ParallelizeVectorizeUnroll(
                 max_jobs_per_core=16,
@@ -264,8 +268,7 @@ def Matmul_0(
                 unroll_explicit=True,
             ),
         ],
-        task_name="test",
-    ).generate_design_space()
+    )
     check_sketches(
         mod,
         sketches=actual,
@@ -276,10 +279,11 @@ def Matmul_0(
 
 def test_parallel_vectorize_unroll_spatial():
     mod = PureSpatial
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="llvm",
         mod=mod,
         target=Target("llvm --num-cores=32"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[
             ms.schedule_rule.ParallelizeVectorizeUnroll(
                 max_jobs_per_core=-1,
@@ -288,8 +292,7 @@ def test_parallel_vectorize_unroll_spatial():
                 unroll_explicit=True,
             ),
         ],
-        task_name="test",
-    ).generate_design_space()
+    )
     assert len(actual) == 1
     trace = actual[0].trace.simplified(remove_postproc=True)
     assert not trace.insts
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py b/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py
index fc52aa199cc1..7c9433cedf50 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py
@@ -17,7 +17,10 @@
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 import tvm
 from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.space_generation import check_sketches
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+)
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -87,13 +90,13 @@ def add_0(
     ]
 
     mod = Add
-    actual = ms.TuneContext(
+    actual = generate_design_space(
+        kind="llvm",
         mod=mod,
         target=Target("llvm"),
-        space_generator=ms.space_generator.PostOrderApply(),
+        types=None,
         sch_rules=[ms.schedule_rule.RandomComputeLocation()],
-        task_name="test",
-    ).generate_design_space()
+    )
     check_sketches(
         mod,
         sketches=actual,
diff --git a/tests/python/unittest/test_meta_schedule_search_strategy.py b/tests/python/unittest/test_meta_schedule_search_strategy.py
index 7433f001c0eb..e34554420600 100644
--- a/tests/python/unittest/test_meta_schedule_search_strategy.py
+++ b/tests/python/unittest/test_meta_schedule_search_strategy.py
@@ -84,14 +84,16 @@ def test_meta_schedule_replay_func(
 
     context = ms.TuneContext(
         mod=Matmul,
-        space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_matmul),
-        search_strategy=TestClass(
-            num_trials_per_iter=num_trials_per_iter, max_trials_per_task=max_trials_per_task
-        ),
+        space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_matmul, postprocs=[]),
+        search_strategy=TestClass(),
     )
     strategy = context.search_strategy
     spaces = context.space_generator.generate_design_space(context.mod)
-    strategy.pre_tuning(spaces)
+    strategy.pre_tuning(
+        max_trials=max_trials_per_task,
+        num_trials_per_iter=num_trials_per_iter,
+        design_spaces=spaces,
+    )
     (correct_sch,) = ms.space_generator.ScheduleFn(sch_fn=_schedule_matmul).generate_design_space(
         Matmul
     )
@@ -135,10 +137,13 @@ def _schedule_matmul_small(sch: Schedule):
         mod=Matmul,
         space_generator=ms.space_generator.ScheduleFn(
             sch_fn=_schedule_matmul_small,
+            sch_rules=[],
+            postprocs=[],
+            mutator_probs={
+                DummyMutator(): 1.0,
+            },
         ),
         search_strategy=ms.search_strategy.EvolutionarySearch(
-            num_trials_per_iter=num_trials_per_iter,
-            max_trials_per_task=max_trials_per_task,
             population_size=5,
             init_measured_ratio=0.1,
             init_min_unmeasured=50,
@@ -147,15 +152,14 @@ def _schedule_matmul_small(sch: Schedule):
             genetic_max_fail_count=10,
             eps_greedy=0.9,
         ),
-        mutator_probs={
-            DummyMutator(): 1.0,
-        },
         target=tvm.target.Target("llvm"),
         num_threads=1,  # because we are using a mutator from the python side
     )
     strategy = context.search_strategy
     strategy.pre_tuning(
-        context.space_generator.generate_design_space(context.mod),
+        max_trials=max_trials_per_task,
+        num_trials_per_iter=num_trials_per_iter,
+        design_spaces=context.space_generator.generate_design_space(context.mod),
         database=ms.database.MemoryDatabase(),
         cost_model=ms.cost_model.RandomModel(),
     )
@@ -197,8 +201,6 @@ def _schedule_matmul_empty(sch: Schedule):
     context = ms.TuneContext(
         mod=Matmul,
         search_strategy=ms.search_strategy.EvolutionarySearch(
-            num_trials_per_iter=num_trials_per_iter,
-            max_trials_per_task=max_trials_per_task,
             population_size=5,
             init_measured_ratio=0.1,
             init_min_unmeasured=50,
@@ -209,16 +211,20 @@ def _schedule_matmul_empty(sch: Schedule):
         ),
         space_generator=ms.space_generator.ScheduleFn(
             sch_fn=_schedule_matmul_empty,
+            sch_rules=[],
+            postprocs=[],
+            mutator_probs={
+                DummyMutator(): 1.0,
+            },
         ),
-        mutator_probs={
-            DummyMutator(): 1.0,
-        },
         target=tvm.target.Target("llvm"),
         num_threads=1,
     )
     strategy = context.search_strategy
     strategy.pre_tuning(
-        context.space_generator.generate_design_space(context.mod),
+        max_trials=max_trials_per_task,
+        num_trials_per_iter=num_trials_per_iter,
+        design_spaces=context.space_generator.generate_design_space(context.mod),
         database=ms.database.MemoryDatabase(),
         cost_model=ms.cost_model.RandomModel(),
     )
@@ -246,4 +252,7 @@ def _schedule_matmul_empty(sch: Schedule):
 
 
 if __name__ == "__main__":
-    tvm.testing.main()
+    test_meta_schedule_replay_func(ms.search_strategy.ReplayFunc)
+    test_meta_schedule_replay_func(ms.search_strategy.ReplayTrace)
+    test_meta_schedule_evolutionary_search()
+    test_meta_schedule_evolutionary_search_early_stop()
diff --git a/tests/python/unittest/test_meta_schedule_space_cpu.py b/tests/python/unittest/test_meta_schedule_space_cpu.py
index 25dc14fd5cb7..47f3e6d4cc51 100644
--- a/tests/python/unittest/test_meta_schedule_space_cpu.py
+++ b/tests/python/unittest/test_meta_schedule_space_cpu.py
@@ -16,7 +16,11 @@
 # under the License.
 """Tests for MetaSchedule search space on CPU"""
 from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.space_generation import check_sketches, print_sketches
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    print_sketches,
+    generate_design_space,
+)
 from tvm.meta_schedule.testing.te_workload import create_te_workload
 from tvm.script import tir as T
 from tvm.target import Target
@@ -26,6 +30,15 @@ def _target():
     return Target("aws/cpu/c5.9xlarge")
 
 
+def _design_space(mod):
+    return generate_design_space(
+        kind="llvm",
+        mod=mod,
+        target=_target(),
+        types=ms.ScheduleRule,
+    )
+
+
 def test_cpu_c1d():
     # fmt: off
     @T.prim_func
@@ -161,12 +174,7 @@ def c1d_2(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 12
     ]
 
     mod = create_te_workload("C1D", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -337,12 +345,7 @@ def c2d_2(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7,
     ]
 
     mod = create_te_workload("C2D", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -534,12 +537,7 @@ def c3d_2(inputs: T.Buffer[(1, 16, 224, 224, 3), "float32"], weight: T.Buffer[(7
     ]
 
     mod = create_te_workload("C3D", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -727,12 +725,7 @@ def cap_2(inputs: T.Buffer[(1, 16, 16, 4, 4, 32), "float32"], weight: T.Buffer[(
         ("SampleComputeLocation", -1),
     ]
     mod = create_te_workload("CAP", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -887,12 +880,7 @@ def dep_2(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T.
         ("SampleComputeLocation", 5),
     ]
     mod = create_te_workload("DEP", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -1065,12 +1053,7 @@ def dil_2(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7,
         ("SampleComputeLocation", 1),
     ]
     mod = create_te_workload("DIL", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -1187,12 +1170,7 @@ def gmm_2(X: T.Buffer[(1, 128, 128), "float32"], Y: T.Buffer[(1, 128, 128), "flo
         ("SampleCategorical", 1),
     ]
     mod = create_te_workload("GMM", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -1361,12 +1339,7 @@ def grp_2(inputs: T.Buffer[(1, 56, 56, 64), "float32"], weight: T.Buffer[(3, 3,
         ("SampleComputeLocation", 9),
     ]
     mod = create_te_workload("GRP", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -1521,12 +1494,7 @@ def t2d_2(inputs: T.Buffer[(1, 4, 4, 512), "float32"], weight: T.Buffer[(4, 4, 5
         ("SampleComputeLocation", -2),
     ]
     mod = create_te_workload("T2D", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -1646,12 +1614,7 @@ def nrm_2(A: T.Buffer[(1, 256, 256), "float32"], D: T.Buffer[1, "float32"]) -> N
         ("SampleComputeLocation", -1),
     ]
     mod = create_te_workload("NRM", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -2220,12 +2183,7 @@ def sfm_8(A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256
         ("SampleComputeLocation", 0),
     ]
     mod = create_te_workload("SFM", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -2404,12 +2362,7 @@ def cbr_2(data: T.Buffer[(1, 224, 224, 3), "float32"], kernel: T.Buffer[(7, 7, 3
         ("SampleComputeLocation", 1),
     ]
     mod = create_te_workload("CBR", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -2588,12 +2541,7 @@ def tbg_2(query: T.Buffer[(1, 128, 12, 64), "float32"], value: T.Buffer[(1, 128,
         ("SampleComputeLocation", -2),
     ]
     mod = create_te_workload("TBG", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py
index ffa2b57ba8ec..f0f6e91ea655 100644
--- a/tests/python/unittest/test_meta_schedule_space_cuda.py
+++ b/tests/python/unittest/test_meta_schedule_space_cuda.py
@@ -15,9 +15,14 @@
 # specific language governing permissions and limitations
 # under the License.
 """Tests for MetaSchedule search space on CUDA"""
-from tvm import te, topi, autotvm
+from tvm import autotvm
 from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.space_generation import check_sketches, print_sketches
+from tvm import te, topi
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+    print_sketches,
+)
 from tvm.meta_schedule.testing.te_workload import create_te_workload
 from tvm.script import tir as T
 from tvm.target import Target
@@ -27,6 +32,15 @@ def _target():
     return Target("nvidia/geforce-rtx-3070")
 
 
+def _design_space(mod):
+    return generate_design_space(
+        kind="cuda",
+        mod=mod,
+        target=_target(),
+        types=ms.ScheduleRule,
+    )
+
+
 def _conv2d_winograd_nchw():
     data = te.placeholder((1, 64, 224, 224), name="data", dtype="float32")
     kernel = te.placeholder((6, 6, 64, 64), name="kernel", dtype="float32")
@@ -119,12 +133,7 @@ def c1d_0(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 12
     ]
 
     mod = create_te_workload("C1D", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -208,12 +217,7 @@ def c2d_0(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7,
     ]
 
     mod = create_te_workload("C2D", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -303,12 +307,7 @@ def c3d_0(inputs: T.Buffer[(1, 16, 224, 224, 3), "float32"], weight: T.Buffer[(7
         ("SampleCategorical", 1),
     ]
     mod = create_te_workload("C3D", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -404,12 +403,7 @@ def cap_0(inputs: T.Buffer[(1, 16, 16, 4, 4, 32), "float32"], weight: T.Buffer[(
         ("SampleCategorical", 2),
     ]
     mod = create_te_workload("CAP", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -492,12 +486,7 @@ def dep_0(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T.
         ("SampleCategorical", 1),
     ]
     mod = create_te_workload("DEP", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -580,12 +569,7 @@ def dil_0(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7,
         ("SampleCategorical", 3),
     ]
     mod = create_te_workload("DIL", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -661,12 +645,7 @@ def gmm_0(X: T.Buffer[(1, 128, 128), "float32"], Y: T.Buffer[(1, 128, 128), "flo
         ("SampleCategorical", 4),
     ]
     mod = create_te_workload("GMM", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -750,12 +729,7 @@ def grp_0(inputs: T.Buffer[(1, 56, 56, 64), "float32"], weight: T.Buffer[(3, 3,
         ("SampleCategorical", 1),
     ]
     mod = create_te_workload("GRP", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -840,12 +814,7 @@ def t2d_0(inputs: T.Buffer[(1, 4, 4, 512), "float32"], weight: T.Buffer[(4, 4, 5
         ("SampleCategorical", 2),
     ]
     mod = create_te_workload("T2D", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -923,12 +892,7 @@ def nrm_1(A: T.Buffer[(1, 256, 256), "float32"], D: T.Buffer[1, "float32"]) -> N
         ("SampleCategorical", 4),
     ]
     mod = create_te_workload("NRM", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -1135,12 +1099,7 @@ def sfm_3(A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256
         ("SampleCategorical", 0),
     ]
     mod = create_te_workload("SFM", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -1225,12 +1184,7 @@ def cbr_0(data: T.Buffer[(1, 224, 224, 3), "float32"], kernel: T.Buffer[(7, 7, 3
         ("SampleCategorical", 3),
     ]
     mod = create_te_workload("CBR", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -1309,12 +1263,7 @@ def tbg_0(query: T.Buffer[(1, 128, 12, 64), "float32"], value: T.Buffer[(1, 128,
         ("SampleCategorical", 4),
     ]
     mod = create_te_workload("TBG", 0)
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
@@ -1459,12 +1408,7 @@ def winograd_nchw_conv2d(data: T.Buffer[(1, 64, 224, 224), "float32"], kernel: T
         ("SampleCategorical", 4),
     ]
     mod = _conv2d_winograd_nchw()
-    actual = ms.TuneContext(
-        mod=mod,
-        target=_target(),
-        space_generator=ms.space_generator.PostOrderApply(),
-        sch_rules="default",
-    ).generate_design_space()
+    actual = _design_space(mod)
     check_sketches(
         mod,
         sketches=actual,
diff --git a/tests/python/unittest/test_meta_schedule_space_generator.py b/tests/python/unittest/test_meta_schedule_space_generator.py
index 9201fe16e849..ef2be381c694 100644
--- a/tests/python/unittest/test_meta_schedule_space_generator.py
+++ b/tests/python/unittest/test_meta_schedule_space_generator.py
@@ -18,7 +18,6 @@
 # pylint: disable=missing-function-docstring
 
 import math
-import sys
 
 import pytest
 import tvm
@@ -94,7 +93,11 @@ def test_meta_schedule_design_space_generator_union():
 def test_meta_schedule_design_space_generator_NIE():
     @derived_object
     class TestPySpaceGenerator(PySpaceGenerator):
-        pass
+        def __init__(self):
+            super().__init__()
+            self.sch_rules = []
+            self.postprocs = []
+            self.mutator_probs = {}
 
     with pytest.raises(
         TVMError, match="PySpaceGenerator's InitializeWithTuneContext method not implemented!"
diff --git a/tests/python/unittest/test_meta_schedule_task_scheduler.py b/tests/python/unittest/test_meta_schedule_task_scheduler.py
index 3edd81ee9a11..33a019e3c555 100644
--- a/tests/python/unittest/test_meta_schedule_task_scheduler.py
+++ b/tests/python/unittest/test_meta_schedule_task_scheduler.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """ Test Meta Schedule Task Scheduler """
-
 import random
 import weakref
 from typing import Set
@@ -23,19 +22,11 @@
 import pytest
 import tvm
 import tvm.testing
-from tvm.support import libinfo
 from tvm import meta_schedule as ms
-from tvm._ffi.base import TVMError
 from tvm.meta_schedule.testing.dummy_object import DummyBuilder, DummyRunner
 from tvm.script import tir as T
 from tvm.tir import Schedule
 
-# from tvm.meta_schedule import TuneContext, measure_callback
-# from tvm.meta_schedule.search_strategy import ReplayTrace
-# from tvm.meta_schedule.space_generator import ScheduleFn
-# from tvm.meta_schedule.task_scheduler import GradientBased, PyTaskScheduler, RoundRobin
-# from tvm.meta_schedule.utils import derived_object
-
 # pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,missing-docstring
 
 
@@ -131,9 +122,10 @@ class MyTaskScheduler(ms.task_scheduler.PyTaskScheduler):
     done: Set = set()
 
     def next_task_id(self) -> int:
-        while len(self.done) != len(self.tasks):
-            x = random.randint(0, len(self.tasks) - 1)
-            task = self.tasks[x]
+        tasks = self._outer().tasks_
+        while len(self.done) != len(tasks):
+            x = random.randint(0, len(tasks) - 1)
+            task = tasks[x]
             if not task.is_terminated:
                 """Calling base func via following route:
                 Python side:
@@ -157,28 +149,28 @@ def test_meta_schedule_task_scheduler_single():
     num_trials_per_iter = 3
     max_trials_per_task = 10
     database = ms.database.MemoryDatabase()
-    round_robin = ms.task_scheduler.RoundRobin(
+    round_robin = ms.task_scheduler.RoundRobin()
+    round_robin.tune(
         [
             ms.TuneContext(
                 MatmulModule,
                 target=tvm.target.Target("llvm"),
-                space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_matmul),
-                search_strategy=ms.search_strategy.ReplayTrace(
-                    num_trials_per_iter,
-                    max_trials_per_task,
-                ),
+                space_generator=_schedule_matmul,
+                search_strategy=ms.search_strategy.ReplayTrace(),
                 task_name="Test",
                 rand_state=42,
             )
         ],
         [1.0],
+        max_trials_global=num_trials_per_iter,
+        max_trials_per_task=max_trials_per_task,
+        num_trials_per_iter=64,
         builder=DummyBuilder(),
         runner=DummyRunner(),
         database=database,
         measure_callbacks=[ms.measure_callback.AddToDatabase()],
-        max_trials=max_trials_per_task,
+        cost_model=None,
     )
-    round_robin.tune()
     assert len(database) == max_trials_per_task
 
 
@@ -189,48 +181,42 @@ def test_meta_schedule_task_scheduler_multiple():
         ms.TuneContext(
             MatmulModule,
             target=tvm.target.Target("llvm"),
-            space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_matmul),
-            search_strategy=ms.search_strategy.ReplayTrace(
-                num_trials_per_iter,
-                max_trials_per_task,
-            ),
+            space_generator=_schedule_matmul,
+            search_strategy=ms.search_strategy.ReplayTrace(),
             task_name="Matmul",
             rand_state=42,
         ),
         ms.TuneContext(
             MatmulReluModule,
             target=tvm.target.Target("llvm"),
-            space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_matmul),
-            search_strategy=ms.search_strategy.ReplayTrace(
-                num_trials_per_iter,
-                max_trials_per_task,
-            ),
+            space_generator=_schedule_matmul,
+            search_strategy=ms.search_strategy.ReplayTrace(),
             task_name="MatmulRelu",
             rand_state=0xDEADBEEF,
         ),
         ms.TuneContext(
             BatchMatmulModule,
             target=tvm.target.Target("llvm"),
-            space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_batch_matmul),
-            search_strategy=ms.search_strategy.ReplayTrace(
-                num_trials_per_iter,
-                max_trials_per_task,
-            ),
+            space_generator=_schedule_batch_matmul,
+            search_strategy=ms.search_strategy.ReplayTrace(),
             task_name="BatchMatmul",
             rand_state=0x114514,
         ),
     ]
     database = ms.database.MemoryDatabase()
-    round_robin = ms.task_scheduler.RoundRobin(
+    round_robin = ms.task_scheduler.RoundRobin()
+    round_robin.tune(
         tasks,
         [1.0, 1.0, 1.0],
         builder=DummyBuilder(),
         runner=DummyRunner(),
         database=database,
         measure_callbacks=[ms.measure_callback.AddToDatabase()],
-        max_trials=max_trials_per_task * len(tasks),
+        max_trials_global=max_trials_per_task * len(tasks),
+        max_trials_per_task=max_trials_per_task,
+        num_trials_per_iter=num_trials_per_iter,
+        cost_model=None,
     )
-    round_robin.tune()
     assert len(database) == max_trials_per_task * len(tasks)
     for task in tasks:
         assert (
@@ -249,82 +235,60 @@ def test_meta_schedule_task_scheduler_NIE():  # pylint: disable=invalid-name
     class NIETaskScheduler(ms.task_scheduler.PyTaskScheduler):
         pass
 
-    with pytest.raises(TVMError, match="PyTaskScheduler's NextTaskId method not implemented!"):
-        scheduler = NIETaskScheduler(
-            tasks=[],
-            builder=DummyBuilder(),
-            runner=DummyRunner(),
-            database=ms.database.MemoryDatabase(),
-            max_trials=1,
-        )
+    with pytest.raises(ValueError, match="next_task_id is not defined"):
+        scheduler = NIETaskScheduler()
         scheduler.next_task_id()
 
 
 def test_meta_schedule_task_scheduler_avoid_cyclic():  # pylint: disable=invalid-name
-    database = ms.database.MemoryDatabase()
-    scheduler = MyTaskScheduler(
-        [],
-        builder=DummyBuilder(),
-        runner=DummyRunner(),
-        database=database,
-        measure_callbacks=[
-            ms.measure_callback.AddToDatabase(),
-        ],
-        max_trials=10,
-    )
+    scheduler = MyTaskScheduler()
     test = weakref.ref(scheduler)  # test if it can be destructed successfully
     del scheduler
     assert test() is None
 
 
 def test_meta_schedule_task_scheduler_override_next_task_id_only():  # pylint: disable=invalid-name
-    num_trials_per_iter = 6
     max_trials_per_task = 101
     tasks = [
         ms.TuneContext(
             MatmulModule,
             target=tvm.target.Target("llvm"),
-            space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_matmul),
-            search_strategy=ms.search_strategy.ReplayTrace(
-                num_trials_per_iter,
-                max_trials_per_task,
-            ),
+            space_generator=_schedule_matmul,
+            search_strategy=ms.search_strategy.ReplayTrace(),
             task_name="Matmul",
             rand_state=42,
         ),
         ms.TuneContext(
             MatmulReluModule,
             target=tvm.target.Target("llvm"),
-            space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_matmul),
-            search_strategy=ms.search_strategy.ReplayTrace(
-                num_trials_per_iter,
-                max_trials_per_task,
-            ),
+            space_generator=_schedule_matmul,
+            search_strategy=ms.search_strategy.ReplayTrace(),
             task_name="MatmulRelu",
             rand_state=0xDEADBEEF,
         ),
         ms.TuneContext(
             BatchMatmulModule,
             target=tvm.target.Target("llvm"),
-            space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_batch_matmul),
-            search_strategy=ms.search_strategy.ReplayTrace(
-                num_trials_per_iter,
-                max_trials_per_task,
-            ),
+            space_generator=_schedule_batch_matmul,
+            search_strategy=ms.search_strategy.ReplayTrace(),
             task_name="BatchMatmul",
             rand_state=0x114514,
         ),
     ]
     database = ms.database.MemoryDatabase()
-    scheduler = MyTaskScheduler(
+    scheduler = MyTaskScheduler()
+    scheduler.tune(
         tasks,
+        task_weights=[1.0] * len(tasks),
         builder=DummyBuilder(),
         runner=DummyRunner(),
         database=database,
         measure_callbacks=[ms.measure_callback.AddToDatabase()],
-        max_trials=max_trials_per_task * len(tasks),
+        max_trials_global=max_trials_per_task * len(tasks),
+        max_trials_per_task=max_trials_per_task,
+        num_trials_per_iter=6,
+        cost_model=None,
     )
-    scheduler.tune()
     assert len(database) == max_trials_per_task * len(tasks)
     for task in tasks:
         assert (
@@ -339,55 +303,47 @@ def test_meta_schedule_task_scheduler_override_next_task_id_only():  # pylint: d
 
 
 def test_meta_schedule_task_scheduler_multiple_gradient_based():
-    num_trials_per_iter = 6
     max_trials_per_task = 101
     tasks = [
         ms.TuneContext(
             MatmulModule,
             target=tvm.target.Target("llvm"),
-            space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_matmul),
-            search_strategy=ms.search_strategy.ReplayTrace(
-                num_trials_per_iter,
-                max_trials_per_task,
-            ),
+            space_generator=_schedule_matmul,
+            search_strategy=ms.search_strategy.ReplayTrace(),
             task_name="Matmul",
             rand_state=42,
         ),
         ms.TuneContext(
             MatmulReluModule,
             target=tvm.target.Target("llvm"),
-            space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_matmul),
-            search_strategy=ms.search_strategy.ReplayTrace(
-                num_trials_per_iter,
-                max_trials_per_task,
-            ),
+            space_generator=_schedule_matmul,
+            search_strategy=ms.search_strategy.ReplayTrace(),
             task_name="MatmulRelu",
             rand_state=0xDEADBEEF,
         ),
         ms.TuneContext(
             BatchMatmulModule,
             target=tvm.target.Target("llvm"),
-            space_generator=ms.space_generator.ScheduleFn(sch_fn=_schedule_batch_matmul),
-            search_strategy=ms.search_strategy.ReplayTrace(
-                num_trials_per_iter,
-                max_trials_per_task,
-            ),
+            space_generator=_schedule_batch_matmul,
+            search_strategy=ms.search_strategy.ReplayTrace(),
             task_name="BatchMatmul",
             rand_state=0x114514,
         ),
     ]
     database = ms.database.MemoryDatabase()
-    gradient_based = ms.task_scheduler.GradientBased(
+    gradient_based = ms.task_scheduler.GradientBased()
+    gradient_based.tune(
         tasks,
         task_weights=[1.0, 1.0, 1.0],
         builder=DummyBuilder(),
         runner=DummyRunner(),
         database=database,
         measure_callbacks=[ms.measure_callback.AddToDatabase()],
-        seed=0x20220214,
-        max_trials=max_trials_per_task * len(tasks),
+        max_trials_global=max_trials_per_task * len(tasks),
+        max_trials_per_task=max_trials_per_task,
+        num_trials_per_iter=6,
+        cost_model=None,
     )
-    gradient_based.tune()
     assert len(database) == max_trials_per_task * len(tasks)
     for task in tasks:
         assert (
@@ -397,4 +353,9 @@ def test_meta_schedule_task_scheduler_multiple_gradient_based():
 
 
 if __name__ == "__main__":
-    tvm.testing.main()
+    test_meta_schedule_task_scheduler_single()
+    test_meta_schedule_task_scheduler_multiple()
+    test_meta_schedule_task_scheduler_NIE()
+    test_meta_schedule_task_scheduler_avoid_cyclic()
+    test_meta_schedule_task_scheduler_override_next_task_id_only()
+    test_meta_schedule_task_scheduler_multiple_gradient_based()
diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py
deleted file mode 100644
index 91101dd6b6c0..000000000000
--- a/tests/python/unittest/test_meta_schedule_tune_relay.py
+++ /dev/null
@@ -1,554 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-import logging
-import tempfile
-from os import path as osp
-from typing import List, Optional
-
-import numpy as np  # type: ignore
-import pytest
-import tvm
-import tvm.testing
-from tvm import meta_schedule as ms
-from tvm import relay
-from tvm._ffi import register_func
-from tvm.contrib import graph_executor
-from tvm.ir import IRModule
-from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.script import tir as T
-from tvm.target.target import Target
-from tvm.tir.schedule import BlockRV, Schedule
-from tvm.tir.schedule.trace import Trace
-from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
-
-logging.basicConfig(
-    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
-)
-logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
-
-# pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument
-# fmt: off
-
-@tvm.script.ir_module
-class tvmgen_default_fused_layout_transform:
-    @T.prim_func
-    def main( # type: ignore
-        placeholder: T.Buffer[(1, 3, 16, 16), "float32"], # type: ignore
-        T_layout_trans: T.Buffer[(1, 1, 16, 16, 3), "float32"], # type: ignore
-    ) -> None: # type: ignore
-        # function attr dict
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        # body
-        # with T.block("root")
-        for i0, i1, i2, i3, i4 in T.grid(1, 1, 16, 16, 3):
-            with T.block("T_layout_trans"):
-                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
-                T.reads(placeholder[ax0, ax1 * 3 + ax4, ax2, ax3])
-                T.writes(T_layout_trans[ax0, ax1, ax2, ax3, ax4])
-                T_layout_trans[ax0, ax1, ax2, ax3, ax4] = T.if_then_else(
-                    ax0 < 1 and ax1 * 3 + ax4 < 3 and ax2 < 16 and ax3 < 16, # type: ignore
-                    placeholder[ax0, ax1 * 3 + ax4, ax2, ax3],
-                    T.float32(0),
-                    dtype="float32",
-                )
-
-
-@tvm.script.ir_module
-class tvmgen_default_fused_nn_contrib_conv2d_NCHWc:
-    @T.prim_func
-    def main(placeholder: T.Buffer[(1, 1, 16, 16, 3), "float32"], placeholder_1: T.Buffer[(2, 1, 5, 5, 3, 4), "float32"], conv2d_NCHWc: T.Buffer[(1, 2, 16, 16, 4), "float32"]) -> None: # type: ignore
-        # function attr dict
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        # body
-        # with T.block("root")
-        data_pad = T.alloc_buffer([1, 1, 20, 20, 3], dtype="float32")
-        for i0, i1, i2, i3, i4 in T.grid(1, 1, 20, 20, 3):
-            with T.block("data_pad"):
-                i0_1, i1_1, i2_1, i3_1, i4_1 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
-                T.reads(placeholder[i0_1, i1_1, i2_1 - 2, i3_1 - 2, i4_1])
-                T.writes(data_pad[i0_1, i1_1, i2_1, i3_1, i4_1])
-                data_pad[i0_1, i1_1, i2_1, i3_1, i4_1] = T.if_then_else(2 <= i2_1 and i2_1 < 18 and 2 <= i3_1 and i3_1 < 18, placeholder[i0_1, i1_1, i2_1 - 2, i3_1 - 2, i4_1], T.float32(0), dtype="float32") # type: ignore # pylint: disable=R1716
-        for i0, i1, i2, i3, i4, i5, i6, i7 in T.grid(1, 2, 16, 16, 4, 3, 5, 5):
-            with T.block("conv2d_NCHWc"):
-                n, oc_chunk, oh, ow, oc_block, ic, kh, kw = T.axis.remap("SSSSSRRR", [i0, i1, i2, i3, i4, i5, i6, i7])
-                T.reads(data_pad[n, ic // 3, oh + kh, ow + kw, ic % 3], placeholder_1[oc_chunk, ic // 3, kh, kw, ic % 3, oc_block]) # type: ignore
-                T.writes(conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block])
-                T.block_attr({"workload":["conv2d_NCHWc.x86", ["TENSOR", [1, 1, 16, 16, 3], "float32"], ["TENSOR", [2, 1, 5, 5, 3, 4], "float32"], [1, 1], [2, 2, 2, 2], [1, 1], "NCHW3c", "NCHW4c", "float32"]})
-                with T.init():
-                    conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = T.float32(0)
-                conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] + data_pad[n, ic // 3, oh + kh, ow + kw, ic % 3] * placeholder_1[oc_chunk, ic // 3, kh, kw, ic % 3, oc_block] # type: ignore
-
-@tvm.script.ir_module
-class tvmgen_default_fused_layout_transform_1:
-    @T.prim_func
-    def main(placeholder: T.Buffer[(1, 2, 16, 16, 4), "float32"], T_layout_trans: T.Buffer[(1, 8, 16, 16), "float32"]) -> None: # type: ignore
-        # function attr dict
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        # body
-        # with T.block("root")
-        for i0, i1, i2, i3 in T.grid(1, 8, 16, 16):
-            with T.block("T_layout_trans"):
-                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
-                T.reads(placeholder[ax0, ax1 // 4, ax2, ax3, ax1 % 4]) # type: ignore
-                T.writes(T_layout_trans[ax0, ax1, ax2, ax3])
-                T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < 1 and ax1 < 8 and ax2 < 16 and ax3 < 16, placeholder[ax0, ax1 // 4, ax2, ax3, ax1 % 4], T.float32(0), dtype="float32") # type: ignore
-
-# fmt: on
-# pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument
-
-
-@pytest.mark.skip("Integration test")
-@pytest.mark.parametrize(
-    "model_name, input_shape, target, layout",
-    [
-        ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=12", "NHWC"),
-        ("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NHWC"),
-        ("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=12", "NHWC"),
-        ("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NHWC"),
-        ("bert_base", [1, 64], "llvm --num-cores=12", None),
-        ("bert_base", [1, 64], "nvidia/geforce-rtx-3070", None),
-    ],
-)
-def test_meta_schedule_tune_relay(
-    model_name: str,
-    input_shape: List[int],
-    target: str,
-    layout: Optional[str],
-):
-    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
-    if model_name.startswith("bert"):
-        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
-    else:
-        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)
-
-    mod, params, (input_name, _, _) = get_network(
-        name=model_name,
-        input_shape=input_shape,
-        layout=layout,
-    )
-
-    target = Target(target)
-    with tempfile.TemporaryDirectory() as work_dir:
-        with ms.Profiler() as profiler:
-            rt_mod1: tvm.runtime.Module = ms.tune_relay(
-                mod=mod,
-                params=params,
-                target=target,
-                config=ms.TuneConfig(
-                    strategy="evolutionary",
-                    num_trials_per_iter=32,
-                    max_trials_per_task=20000,
-                    max_trials_global=20000,
-                ),
-                work_dir=work_dir,
-            )
-        print(profiler.table())
-        # Compile without meta-schedule for correctness check
-        with tvm.transform.PassContext(opt_level=0):
-            rt_mod2 = relay.build(mod, target=target, params=params)
-
-        def get_output(data, lib):
-            module = graph_executor.GraphModule(lib["default"](dev))
-            module.set_input(input_name, data)
-            module.run()
-            return module.get_output(0).numpy()
-
-        # Check correctness
-        actual_output = get_output(data, rt_mod1)
-        expected_output = get_output(data, rt_mod2)
-        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
-
-
-def test_meta_schedule_te2primfunc_argument_order():
-    @ms.derived_object
-    class TestDummyDatabase(ms.database.PyDatabase):
-        def __init__(self):
-            super().__init__()
-            self.records = []
-            self.workload_reg = []
-
-        def has_workload(self, mod: IRModule) -> ms.database.Workload:
-            for workload in self.workload_reg:
-                if tvm.ir.structural_equal(workload.mod, mod):
-                    return True
-            # The database has already put in all correct workloads
-            raise ValueError(
-                "The workload searched for is not in given database!"
-                + " Incorrect TIR was generated from TE subgraph."
-            )
-
-        def commit_tuning_record(self, record: ms.database.TuningRecord) -> None:
-            self.records.append(record)
-
-        def commit_workload(self, mod: IRModule) -> ms.database.Workload:
-            for workload in self.workload_reg:
-                if tvm.ir.structural_equal(workload.mod, mod):
-                    return workload
-            workload = ms.database.Workload(mod)
-            self.workload_reg.append(workload)
-            return workload
-
-        def get_top_k(
-            self,
-            workload: ms.database.Workload,
-            top_k: int,
-        ) -> List[ms.database.TuningRecord]:
-            return list(
-                filter(
-                    lambda x: x.workload == workload,
-                    sorted(self.records, key=lambda x: sum(x.run_secs) / len(x.run_secs)),
-                )
-            )[: int(top_k)]
-
-        def __len__(self) -> int:
-            return len(self.records)
-
-        def print_results(self) -> None:
-            print("\n".join([str(r) for r in self.records]))
-
-    data_shape = (1, 3, 16, 16)
-    weight_shape = (8, 3, 5, 5)
-    data = relay.var("data", relay.TensorType(data_shape, "float32"))
-    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
-    y = relay.nn.conv2d(
-        data,
-        weight,
-        padding=(2, 2),
-        kernel_size=(5, 5),
-        kernel_layout="OIHW",
-        out_dtype="float32",
-    )
-    f = relay.Function([data, weight], y)
-    mod = tvm.IRModule.from_expr(f)
-    mod = relay.transform.InferType()(mod)
-
-    data_sample = np.random.rand(*data_shape).astype("float32")
-    weight_sample = np.random.rand(*weight_shape).astype("float32")
-    params = {mod["main"].params[1].name_hint: weight_sample}
-
-    input_name = "data"
-    dev = tvm.cpu()
-    target = Target("llvm --num-cores=12")
-    data = tvm.nd.array(data_sample, dev)
-
-    database = TestDummyDatabase()
-    database.commit_workload(tvmgen_default_fused_layout_transform)
-    database.commit_workload(tvmgen_default_fused_layout_transform_1)
-    database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc)
-
-    with database, tvm.transform.PassContext(  # pylint: disable=not-context-manager
-        opt_level=3,
-        config={"relay.backend.use_meta_schedule": True},
-    ):
-        rt_mod1 = relay.build(mod, target=target, params=params)
-
-    # Compile without meta-schedule for correctness check
-    with tvm.transform.PassContext(opt_level=0):
-        rt_mod2 = relay.build(mod, target=target, params=params)
-
-    def get_output(data, lib):
-        module = graph_executor.GraphModule(lib["default"](dev))
-        module.set_input(input_name, data)
-        module.run()
-        return module.get_output(0).numpy()
-
-    # Check correctness
-    actual_output = get_output(data, rt_mod1)
-    expected_output = get_output(data, rt_mod2)
-    assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
-
-
-def test_meta_schedule_relay_lowering():
-    data_shape = (1, 3, 16, 16)
-    weight_shape = (8, 3, 5, 5)
-    data = relay.var("data", relay.TensorType(data_shape, "float32"))
-    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
-    y = relay.nn.conv2d(
-        data,
-        weight,
-        padding=(2, 2),
-        kernel_size=(5, 5),
-        kernel_layout="OIHW",
-        out_dtype="float32",
-    )
-    f = relay.Function([data, weight], y)
-    mod = tvm.IRModule.from_expr(f)
-    mod = relay.transform.InferType()(mod)
-
-    data_sample = np.random.rand(*data_shape).astype("float32")
-    weight_sample = np.random.rand(*weight_shape).astype("float32")
-    params = {mod["main"].params[1].name_hint: weight_sample}
-
-    input_name = "data"
-    dev = tvm.cpu()
-    target = Target("llvm --num-cores=12")
-    data = tvm.nd.array(data_sample, dev)
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        database = ms.database.JSONDatabase(
-            osp.join(work_dir, "workload.json"), osp.join(work_dir, "records.json")
-        )
-        database.commit_tuning_record(
-            ms.database.TuningRecord(
-                Trace([], {}),
-                database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc),
-                [0.0],
-                target=target,
-                args_info=[],
-            )
-        )
-        with database, tvm.transform.PassContext(
-            opt_level=3,
-            config={"relay.backend.use_meta_schedule": True},
-        ):
-            rt_mod1 = relay.build(mod, target=target, params=params)
-
-        # Compile without meta-schedule for correctness check
-        with tvm.transform.PassContext(opt_level=0):
-            rt_mod2 = relay.build(mod, target=target, params=params)
-
-        def get_output(data, lib):
-            module = graph_executor.GraphModule(lib["default"](dev))
-            module.set_input(input_name, data)
-            module.run()
-            return module.get_output(0).numpy()
-
-        # Check correctness
-        actual_output = get_output(data, rt_mod1)
-        expected_output = get_output(data, rt_mod2)
-        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
-
-
-def schedule_dense(dense_block, M, do_tune, sch):  # pylint: disable=invalid-name
-    """
-    Manually schedule a dense block, created from TE compute op via CreatePrimFunc,
-    using VNNI instruction.
-    """
-    post_blocks = sch.get_consumers(dense_block)
-
-    if len(post_blocks) > 0:
-        # Fuse all intermediate post ops into the last op.
-        # This is equivalent to the traverse_inline function used in TE schedules.
-        while True:
-            next_post_blocks = []
-            for post_block in post_blocks:
-                next_consumers = sch.get_consumers(post_block)
-
-                if len(next_consumers) > 0:
-                    sch.compute_inline(post_block)
-
-                next_post_blocks += next_consumers
-
-            if len(next_post_blocks) == 0:
-                assert len(post_blocks) == 1
-                outer_block = post_blocks[0]
-                a_y, a_x = sch.get_loops(outer_block)[-2:]
-                break
-
-            post_blocks = next_post_blocks
-    else:
-        a_y, a_x, _ = sch.get_loops(dense_block)[-3:]
-        outer_block = dense_block
-
-    if do_tune:
-        y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
-        a_yo, a_yi = sch.split(a_y, factors=y_factors)
-    else:
-        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 64)])
-
-    a_xo, a_xi = sch.split(a_x, factors=[None, 16])
-    sch.reorder(a_yo, a_xo, a_yi, a_xi)
-    fused = sch.fuse(a_yo, a_xo)
-
-    if outer_block != dense_block:
-        # Handle the case when dense is fused with post ops.
-        sch.vectorize(a_xi)
-        sch.compute_at(dense_block, a_yi)
-
-    a_xi, a_k = sch.get_loops(dense_block)[-2:]
-    a_ko, a_ki = sch.split(a_k, factors=[None, 4])
-    sch.reorder(a_ko, a_xi, a_ki)
-
-    # We need to parallelize before decompose_reduction, otherwise the so-called "Compact dataflow"
-    # condition is violated.
-    sch.parallel(fused)
-    dec = sch.decompose_reduction(dense_block, a_ko)
-
-    init_loop = sch.get_loops(dec)[-1]
-    sch.vectorize(init_loop)
-
-    sch.tensorize(a_xi, VNNI_INTRIN)
-
-
-def manual_tir_common(do_tune=False):
-    M, N, K = 1024, 1024, 1024  # pylint: disable=invalid-name
-    data_shape = (M, K)
-    weight_shape = (N, K)
-
-    data_dtype = "uint8"
-    data = relay.var("data", shape=data_shape, dtype=data_dtype)
-    weight = relay.var("weight", shape=weight_shape, dtype="int8")
-    bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32")
-
-    # dense is tuned by the TIR schedule above, bmm is scheduled by TE (topi/x86/batch_matmul.py)
-    dense = relay.nn.dense(data, weight, out_dtype="int32")
-    bias_add = relay.nn.bias_add(dense, bias) + relay.const(1, dtype="int32")
-    out = relay.nn.batch_matmul(
-        relay.cast(relay.expand_dims(bias_add, 0), "uint8"),
-        relay.cast(relay.expand_dims(bias_add, 0), "int8"),
-        out_dtype="int32",
-    )
-
-    relay_mod = tvm.IRModule.from_expr(out)
-
-    target = "llvm -mcpu=cascadelake -num-cores 4"
-    dev = tvm.device(target, 0)
-
-    data = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
-    weight_np = np.random.uniform(1, 10, size=weight_shape).astype("int8")
-    bias_np = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32")
-
-    ref = (
-        relay.create_executor("vm", mod=relay_mod, device=dev, target=target)
-        .evaluate()(*[data, weight_np, bias_np])
-        .numpy()
-    )
-
-    params = {"weight": weight_np, "bias": bias_np}
-
-    if do_tune:
-        extracted_tasks = ms.extract_task_from_relay(relay_mod, target, params)
-        # Filter out tasks that we don't intend to schedule / tune with TIR.
-        tune_tasks = list(
-            filter(
-                lambda task: "dense" in task.task_name,
-                extracted_tasks,
-            )
-        )
-        config = ms.TuneConfig(
-            strategy="replay_trace",
-            num_trials_per_iter=8,
-            max_trials_per_task=8,
-            max_trials_global=8,
-        )
-
-        with tempfile.TemporaryDirectory() as work_dir:
-            # postprocs=lambda: [] is important to prevent default post processors from
-            # tampering with the manual schedule.
-            database = ms.tune_extracted_tasks(
-                tune_tasks,
-                config,
-                work_dir=work_dir,
-                postprocs=lambda: [],
-            )
-    else:
-
-        def schedule_fn(sch) -> bool:
-            if "dense" not in sch.mod.attrs["task_name"]:
-                return False
-
-            block = sch.get_block("compute")
-
-            # Looks up schedule_rule annotation.
-            # See the comment in test_tune_relay_manual_tir_vnni().
-            schedule_rule = sch.get(block).annotations["schedule_rule"]
-
-            assert "dense_vnni" in schedule_rule
-
-            schedule_dense(block, M, False, sch)
-
-            return True
-
-        database = ms.database.ScheduleFnDatabase(schedule_fn)
-
-    with database, tvm.transform.PassContext(
-        opt_level=3,
-        config={"relay.backend.use_meta_schedule": True},
-    ):
-        # pylint: disable=W0105
-        """
-        The log should say
-        Warning: Cannot find workload: tvmgen_default_fused_expand_dims
-        Warning: Cannot find workload: tvmgen_default_fused_cast
-        Warning: Cannot find workload: tvmgen_default_fused_cast_1
-        Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul
-
-        This means batch matmul and others are scheduled by TE, and dense (the one not warned)
-        is found in the meta schedule tuning database during compilation
-        """
-        # pylint: enable=W0105
-        lib = relay.build(relay_mod, target=target, params=params)
-
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-
-    runtime.set_input("data", data)
-    runtime.run()
-
-    out = runtime.get_output(0).numpy()
-
-    np.testing.assert_equal(out, ref)
-
-
-@tvm.testing.requires_cascadelake
-def test_tune_relay_manual_tir_vnni():
-    manual_tir_common(do_tune=False)
-
-    # pylint: disable=W0105
-    """
-    We can inject and apply a custom TIR scheduling to a TE compute of interest, using
-    the "schedule_rule" annotation. For example, in topi/x86/dense.py we have the following
-    declaration for int8 dense targeting the VNNI instruction.
-
-    C = te.compute(
-        ...
-        attrs={"schedule_rule": "meta_schedule.dense_vnni"},
-    )
-
-    When the MetaSchedule encounters a TensorIR block with the "schedule_rule" annotation,
-    it looks up the packed func registry for a function that is associated with the given schedule
-    rule key ("meta_schedule.dense_vnni" in this example). The signature of such custom schedule
-    functions must be
-
-       (tir.schedule.Schedule, tir.schedule.BlockRV) -> [tir.schedule.Schedule].
-
-    The BlockRV argument corresponds to the TE compute annotated with "schedule_rule".
-
-    The relevant code is in meta_schedule/space_generator/post_order_apply.cc.
-
-    """
-    # pylint: enable=W0105
-
-    def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV):
-        schedule_dense(dense_block, None, True, sch)
-        return [sch]
-
-    register_func("meta_schedule.dense_vnni", schedule_rule_dense_vnni)
-
-    manual_tir_common(do_tune=True)
-
-
-if __name__ == """__main__""":
-    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=12", None)
-    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NCHW")
-    test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=12", None)
-    test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", None)
-    test_meta_schedule_tune_relay("bert_base", [1, 64], "llvm --num-cores=12", None)
-    test_meta_schedule_tune_relay("bert_base", [1, 64], "nvidia/geforce-rtx-3070", None)
-    test_meta_schedule_te2primfunc_argument_order()
-    test_meta_schedule_relay_lowering()
-    test_tune_relay_manual_tir_vnni()
diff --git a/tests/python/unittest/test_meta_schedule_tune_te.py b/tests/python/unittest/test_meta_schedule_tune_te.py
deleted file mode 100644
index d294b2ddd6e8..000000000000
--- a/tests/python/unittest/test_meta_schedule_tune_te.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-import logging
-import tempfile
-
-import pytest
-from tvm.meta_schedule import TuneConfig, tune_te
-from tvm.meta_schedule.testing import te_workload
-from tvm.target.target import Target
-from tvm.tir import Schedule
-
-logging.basicConfig()
-logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
-
-
-def test_tune_matmul():
-    with tempfile.TemporaryDirectory() as work_dir:
-        sch: Schedule = tune_te(
-            tensors=te_workload.batch_matmul_nkkm(B=1, N=128, M=128, K=128),
-            target=Target("llvm --num-cores=16"),
-            config=TuneConfig(
-                strategy="replay_trace",
-                num_trials_per_iter=1,
-                max_trials_per_task=1,
-                max_trials_global=1,
-            ),
-            work_dir=work_dir,
-        )
-        if sch is None:
-            print("No valid schedule found!")
-        else:
-            print(sch.mod.script())
-            print(sch.trace)
-
-
-if __name__ == """__main__""":
-    test_tune_matmul()
diff --git a/tests/python/unittest/test_meta_schedule_tune_tir.py b/tests/python/unittest/test_meta_schedule_tune_tir.py
index 6ab5f9b8c5c4..aa45120c2316 100644
--- a/tests/python/unittest/test_meta_schedule_tune_tir.py
+++ b/tests/python/unittest/test_meta_schedule_tune_tir.py
@@ -17,17 +17,14 @@
 # pylint: disable=missing-docstring,no-member,invalid-name,unused-variable
 import logging
 import tempfile
-import numpy as np
 
+import numpy as np
 import pytest
 import tvm
-
+import tvm.testing
 from tvm import meta_schedule as ms
-from tvm.meta_schedule import TuneContext, TuneConfig, tune_tir
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.meta_schedule.testing.local_rpc import LocalRPC
-from tvm.meta_schedule.schedule_rule import PyScheduleRule
-from tvm.meta_schedule.utils import derived_object
 from tvm.script import tir as T
 from tvm.target import Target
 from tvm.tir.schedule import BlockRV, Schedule
@@ -64,77 +61,42 @@ def two_step(a: T.handle, c: T.handle) -> None:
             C[vi, vj] = B[vi, vj] + 3.0
 
 
-@pytest.mark.skip("Integration test")
+@tvm.testing.requires_llvm
 def test_tune_matmul_cpu():
     with tempfile.TemporaryDirectory() as work_dir:
-        sch: Schedule = tune_tir(
+        target = Target("llvm --num-cores=16")
+        database = ms.tir_integration.tune_tir(
             mod=matmul,
-            target=Target("llvm --num-cores=16"),
-            config=TuneConfig(
-                strategy="replay_trace",
-                num_trials_per_iter=32,
-                max_trials_per_task=32,
-                max_trials_global=32,
-            ),
+            target=target,
             work_dir=work_dir,
+            max_trials_global=32,
+            num_trials_per_iter=16,
         )
+        sch = ms.tir_integration.compile_tir(database, matmul, target)
         if sch is None:
             print("No valid schedule found!")
         else:
-            print(sch.mod.script())
-            print(sch.trace)
-
-
-@pytest.mark.skip("Integration test")
-def test_tune_block_cpu():
-    @derived_object
-    class RemoveBlock(PyScheduleRule):
-        def _initialize_with_tune_context(self, context: TuneContext) -> None:
-            pass
-
-        def apply(self, sch: Schedule, block: BlockRV):
-            if sch.get(block).name_hint == "root":
-                return [sch]
-            sch = sch.copy()
-            sch.compute_inline(block)
-            return [sch]
-
-    with tempfile.TemporaryDirectory() as work_dir:
-        sch: Schedule = tune_tir(
-            mod=two_step,
-            target=Target("llvm --num-cores=16"),
-            config=TuneConfig(
-                strategy="replay_trace",
-                num_trials_per_iter=32,
-                max_trials_per_task=32,
-                max_trials_global=32,
-            ),
-            work_dir=work_dir,
-            blocks=["A"],
-            sch_rules=lambda *args: [RemoveBlock()],
-        )
-        assert sch is not None
+            sch.mod.show()
+            sch.trace.show()
 
 
-@pytest.mark.skip("Integration test")
+@tvm.testing.requires_cuda
 def test_tune_matmul_cuda():
     with tempfile.TemporaryDirectory() as work_dir:
-        sch: Schedule = tune_tir(
+        target = Target("nvidia/geforce-rtx-3070")
+        database = ms.tir_integration.tune_tir(
             mod=matmul,
-            target=Target("nvidia/geforce-rtx-3070"),
-            config=TuneConfig(
-                strategy="replay_trace",
-                num_trials_per_iter=32,
-                max_trials_per_task=32,
-                max_trials_global=32,
-            ),
+            target=target,
             work_dir=work_dir,
+            max_trials_global=32,
+            num_trials_per_iter=16,
         )
+        sch = ms.tir_integration.compile_tir(database, matmul, target)
         if sch is None:
             print("No valid schedule found!")
         else:
-            print(sch.mod.script())
-            print(sch.trace)
+            sch.mod.show()
+            sch.trace.show()
 
 
 def test_tune_run_module_via_rpc():
@@ -179,6 +141,43 @@ def f_timer(rt_mod, dev, input_data):
         tvm.testing.assert_allclose(result.numpy(), c_np, rtol=1e-3)
 
 
+def test_tune_block_cpu():
+    @ms.derived_object
+    class RemoveBlock(ms.schedule_rule.PyScheduleRule):
+        def _initialize_with_tune_context(self, context: ms.TuneContext) -> None:
+            pass
+
+        def apply(self, sch: Schedule, block: BlockRV):
+            if sch.get(block).name_hint == "root":
+                return [sch]
+            sch = sch.copy()
+            sch.compute_inline(block)
+            return [sch]
+
+        def clone(self) -> "RemoveBlock":
+            return RemoveBlock()
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        target = Target("llvm --num-cores=16")
+        database = ms.tir_integration.tune_tir(
+            mod=two_step,
+            target=target,
+            work_dir=work_dir,
+            max_trials_global=32,
+            num_trials_per_iter=16,
+            space=ms.space_generator.PostOrderApply(
+                f_block_filter=lambda block: block.name_hint == "A",
+                sch_rules=[RemoveBlock()],
+                postprocs=[],
+                mutator_probs={},
+            ),
+        )
+        sch = ms.tir_integration.compile_tir(database, two_step, target)
+        assert sch is not None
+        sch.mod.show()
+        sch.trace.show()
+
+
 if __name__ == """__main__""":
     test_tune_matmul_cpu()
     test_tune_matmul_cuda()
diff --git a/tests/python/unittest/test_meta_schedule_vnni_integration.py b/tests/python/unittest/test_meta_schedule_vnni_integration.py
new file mode 100644
index 000000000000..2cd609863056
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_vnni_integration.py
@@ -0,0 +1,249 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-docstring
+import logging
+import tempfile
+from typing import Optional
+
+import numpy as np  # type: ignore
+import pytest
+import tvm
+from tvm import meta_schedule as ms
+from tvm import relay
+from tvm._ffi import register_func
+from tvm.tir.schedule import BlockRV, Schedule
+from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
+
+logging.basicConfig(
+    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
+
+
+def _schedule_dense(m: Optional[int], do_tune: bool):
+    """Manually schedule a dense block, created from TE compute op via CreatePrimFunc,
+    using VNNI instruction.
+    """
+
+    def schedule_fn(sch, dense_block: Optional[BlockRV] = None) -> bool:
+        if "dense" not in sch.mod.attrs["task_name"]:
+            return False
+        if dense_block is None:
+            dense_block = sch.get_block("compute")
+            assert "dense_vnni" in sch.get(dense_block).annotations["schedule_rule"]
+
+        post_blocks = sch.get_consumers(dense_block)
+        if len(post_blocks) > 0:
+            # Fuse all intermediate post ops into the last op.
+            # This is equivalent to the traverse_inline function used in TE schedules.
+            while True:
+                next_post_blocks = []
+                for post_block in post_blocks:
+                    next_consumers = sch.get_consumers(post_block)
+                    if len(next_consumers) > 0:
+                        sch.compute_inline(post_block)
+                    next_post_blocks += next_consumers
+                if len(next_post_blocks) == 0:
+                    assert len(post_blocks) == 1
+                    outer_block = post_blocks[0]
+                    a_y, a_x = sch.get_loops(outer_block)[-2:]
+                    break
+                post_blocks = next_post_blocks
+        else:
+            a_y, a_x, _ = sch.get_loops(dense_block)[-3:]
+            outer_block = dense_block
+        if do_tune:
+            y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
+            a_yo, a_yi = sch.split(a_y, factors=y_factors)
+        else:
+            a_yo, a_yi = sch.split(a_y, factors=[None, min(m, 64)])
+        a_xo, a_xi = sch.split(a_x, factors=[None, 16])
+        sch.reorder(a_yo, a_xo, a_yi, a_xi)
+        fused = sch.fuse(a_yo, a_xo)
+        if outer_block != dense_block:
+            # Handle the case when dense is fused with post ops.
+            sch.vectorize(a_xi)
+            sch.compute_at(dense_block, a_yi)
+        a_xi, a_k = sch.get_loops(dense_block)[-2:]
+        a_ko, a_ki = sch.split(a_k, factors=[None, 4])
+        sch.reorder(a_ko, a_xi, a_ki)
+        # We need to parallelize before decompose_reduction, otherwise the so-called "Compact dataflow"
+        # condition is violated.
+        sch.parallel(fused)
+        dec = sch.decompose_reduction(dense_block, a_ko)
+        init_loop = sch.get_loops(dec)[-1]
+        sch.vectorize(init_loop)
+        sch.tensorize(a_xi, VNNI_INTRIN)
+        return True
+
+    return schedule_fn
+
+
+def _relay_dense(m, n, k):
+    data = relay.var("data", shape=(m, k), dtype="uint8")
+    weight = relay.var("weight", shape=(n, k), dtype="int8")
+    bias = relay.var("bias", shape=(n,), dtype="int32")
+    # dense is tuned by the TIR schedule above, bmm is scheduled by TE (topi/x86/batch_matmul.py)
+    dense = relay.nn.dense(data, weight, out_dtype="int32")
+    bias_add = relay.nn.bias_add(dense, bias) + relay.const(1, dtype="int32")
+    out = relay.nn.batch_matmul(
+        relay.cast(relay.expand_dims(bias_add, 0), "uint8"),
+        relay.cast(relay.expand_dims(bias_add, 0), "int8"),
+        out_dtype="int32",
+    )
+    relay_mod = tvm.IRModule.from_expr(out)
+    data = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
+    params = {
+        "weight": np.random.uniform(1, 10, size=(n, k)).astype("int8"),
+        "bias": np.random.uniform(1, 10, size=(n,)).astype("int32"),
+    }
+
+    def f_check(lib, dev):
+        ref = (
+            relay.create_executor(
+                "vm",
+                mod=relay_mod,
+                device=dev,
+                target="llvm",
+            )
+            .evaluate()(data, params["weight"], params["bias"])
+            .numpy()
+        )
+        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+        runtime.set_input("data", data)
+        runtime.run()
+        out = runtime.get_output(0).numpy()
+        np.testing.assert_equal(out, ref)
+
+    return relay_mod, params, f_check
+
+
+@pytest.mark.skip("Requires cascadelake")
+def test_vnni_schedule_fn_database():
+    m, n, k = 1024, 1024, 1024
+    target = tvm.target.Target("llvm -mcpu=cascadelake -num-cores 4")
+    dev = tvm.cpu(0)
+    relay_mod, params, f_check = _relay_dense(m, n, k)
+
+    with ms.database.ScheduleFnDatabase(
+        _schedule_dense(
+            m=m,
+            do_tune=False,
+        )
+    ), tvm.transform.PassContext(
+        opt_level=3,
+        config={"relay.backend.use_meta_schedule": True},
+    ):
+        # pylint: disable=W0105
+        """The log should say
+        Warning: Cannot find workload: tvmgen_default_fused_expand_dims
+        Warning: Cannot find workload: tvmgen_default_fused_cast
+        Warning: Cannot find workload: tvmgen_default_fused_cast_1
+        Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul
+
+        This means batch matmul and others are scheduled by TE, and dense (the one not warned)
+        is found in the meta schedule tuning database during compilation
+        """
+        # pylint: enable=W0105
+        lib = relay.build(relay_mod, target=target, params=params)
+    f_check(lib, dev)
+
+
+@pytest.mark.skip("Requires cascadelake")
+def test_vnni_schedule_fn_tune():
+    # pylint: disable=W0105
+    """
+    We can inject and apply a custom TIR scheduling to a TE compute of interest, using
+    the "schedule_rule" annotation. For example, in topi/x86/dense.py we have the following
+    declaration for int8 dense targeting the VNNI instruction.
+
+    C = te.compute(
+        ...
+        attrs={"schedule_rule": "meta_schedule.dense_vnni"},
+    )
+
+    When the MetaSchedule encounters a TensorIR block with the "schedule_rule" annotation,
+    it looks up the packed func registry for a function that is associated with the given schedule
+    rule key ("meta_schedule.dense_vnni" in this example). The signature of such custom schedule
+    functions must be
+
+       (tir.schedule.Schedule, tir.schedule.BlockRV) -> [tir.schedule.Schedule].
+
+    The BlockRV argument corresponds to the TE compute annotated with "schedule_rule".
+
+    The relevant code is in meta_schedule/space_generator/post_order_apply.cc.
+    """
+
+    def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV):
+        _schedule_dense(m=None, do_tune=True)(sch, dense_block)
+        return [sch]
+
+    register_func("meta_schedule.dense_vnni", schedule_rule_dense_vnni)
+
+    m, n, k = 1024, 1024, 1024
+    target = tvm.target.Target("llvm -mcpu=cascadelake -num-cores 4")
+    dev = tvm.cpu(0)
+    relay_mod, params, f_check = _relay_dense(m, n, k)
+
+    extracted_tasks = ms.relay_integration.extract_tasks(relay_mod, target, params)
+    with tempfile.TemporaryDirectory() as work_dir:
+        # postprocs=lambda: [] is important to prevent default post processors from
+        # tampering with the manual schedule.
+        tasks = ms.relay_integration.extracted_tasks_to_tune_contexts(
+            list(
+                filter(
+                    lambda task: "dense" in task.task_name,
+                    extracted_tasks,
+                )
+            ),
+            work_dir=work_dir,
+            space=ms.space_generator.PostOrderApply(
+                f_block_filter=None,
+                sch_rules=None,
+                postprocs=[],
+                mutator_probs=None,
+            ),
+        )
+        database = ms.relay_integration.tune_tasks(
+            tasks=tasks,
+            task_weights=[1.0] * len(tasks),
+            work_dir=work_dir,
+            max_trials_global=20000,
+        )
+    with database, tvm.transform.PassContext(
+        opt_level=3,
+        config={"relay.backend.use_meta_schedule": True},
+    ):
+        # pylint: disable=W0105
+        """The log should say
+        Warning: Cannot find workload: tvmgen_default_fused_expand_dims
+        Warning: Cannot find workload: tvmgen_default_fused_cast
+        Warning: Cannot find workload: tvmgen_default_fused_cast_1
+        Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul
+
+        This means batch matmul and others are scheduled by TE, and dense (the one not warned)
+        is found in the meta schedule tuning database during compilation
+        """
+        # pylint: enable=W0105
+        lib = relay.build(relay_mod, target=target, params=params)
+    f_check(lib, dev)
+
+
+if __name__ == """__main__""":
+    test_vnni_schedule_fn_database()
+    test_vnni_schedule_fn_tune()
diff --git a/tests/python/unittest/test_tir_schedule_trace.py b/tests/python/unittest/test_tir_schedule_trace.py
index 8a5155bcba43..916db184e09b 100644
--- a/tests/python/unittest/test_tir_schedule_trace.py
+++ b/tests/python/unittest/test_tir_schedule_trace.py
@@ -282,7 +282,6 @@ def test_trace_simplified_2():
         )
     )
     trace = trace.simplified(remove_postproc=False)
-    print(trace.show())
     assert str(trace) == "\n".join(
         (
             "# from tvm import tir",

From 7804a9886cb8f19f7c680cb96a393e10047240fd Mon Sep 17 00:00:00 2001
From: Adam Straw <astraw@octoml.ai>
Date: Fri, 7 Oct 2022 06:13:39 -0700
Subject: [PATCH 321/704] [Hexagon] disable cache_write schedule type from sw
 pipeline test (#13004)

---
 .../contrib/test_hexagon/test_software_pipeline_async.py     | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
index 25be8b8e2849..943d4262f9da 100644
--- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
+++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
@@ -30,7 +30,10 @@
 inner = tvm.testing.parameter(64, 128)
 dtype = tvm.testing.parameter("uint8", "float16")
 scope = tvm.testing.parameter("global", "global.vtcm")
-sched = tvm.testing.parameter("cache_read", "cache_write", "cache_read_write")
+# TODO(Straw) Add back "cache_write" schedule type once we have upstreamed
+# buffer dependency analysis in InjectSoftwarePipeline pass
+# to insert approprite TIR "wait" attributes for this schedule
+sched = tvm.testing.parameter("cache_read", "cache_read_write")
 
 
 @tvm.testing.fixture

From fc333f90e83c0dd912a3957a1f55749322cb87b9 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 7 Oct 2022 10:37:08 -0500
Subject: [PATCH 322/704] [TIR][Arith] Prove conditionals by transitively
 applying knowns (#12863)

This commit adds a new sub-analyzer, `TransitiveComparisonAnalyzer`,
which attempts to apply multiple known comparisons to prove an
unknown.  For example, `a <= b` and `b <= c` imply that `a <= c`.
These simplifications are necessary for simplifying conditionals
resulting from padded layout
transformations (https://github.com/apache/tvm/issues/12261).

While some of these conditions may be proven using
`ConstIntBoundAnalyzer` or `IntSetAnalyzer`, each has some
limitations.  `ConstIntBoundAnalyzer` can only compare against a
constant, `IntSetAnalyzer` internally calls `RewriteSimplifier` which
can result in infinite recursion, and neither can handle not-equal
conditions because it would require tracking multiple intervals per
expression.  Therefore, introducing a new sub-analyzer for these
simplifications.

* Change mutable reference to mutable pointer

* Remove nullptr default on Impl unique_ptr

In g++ 7, defining a default constructor attempts to define the
destructor, which fails because `Impl` is an incomplete type.  As far
as I should tell, the destructor should only be defined at the point
where `~TransitiveComparisonAnalyzer` is defined, at which point
`Impl` has a full definition.  This issue does not occur in g++ 10.

* Require opt-in for CPU-intensive simplifications

* Document the intent of using bitflags

* Rename "Feature" to "Extension"

* Use TVM_DLL on new public member functions

* Remove duplicate BaseBeforeAfter.transform definition

* Explicitly enable extension for unit tests that require it

* Fix accidentally duplicate test case

* Improve TryCompareFromLHS documentation

* Update wording to distinguish `knowns_` and `scoped_knowns_`

* Better documentation for Key enum

* Document the normalization of LT/GT

* Removed unused PrimExpr temp

* Call out modifications of the `compared_to_x` contents

* Pointed to `Comparison::Comparison` for normalization details

* Updated to clarify right/RHS.

* Rename TryCompareFromLHS to DFSFromLHS
---
 include/tvm/arith/analyzer.h                  | 114 ++-
 src/arith/analyzer.cc                         |   3 +
 src/arith/canonical_simplify.cc               |  10 +-
 src/arith/rewrite_simplify.cc                 |  79 +-
 src/arith/rewrite_simplify.h                  |  28 +-
 src/arith/transitive_comparison_analyzer.cc   | 791 ++++++++++++++++++
 src/tir/transforms/simplify.cc                |  34 +-
 .../unittest/test_tir_transform_simplify.py   | 141 +++-
 8 files changed, 1172 insertions(+), 28 deletions(-)
 create mode 100644 src/arith/transitive_comparison_analyzer.cc

diff --git a/include/tvm/arith/analyzer.h b/include/tvm/arith/analyzer.h
index ceb9f574f2c9..79b01d0cc859 100644
--- a/include/tvm/arith/analyzer.h
+++ b/include/tvm/arith/analyzer.h
@@ -273,7 +273,41 @@ class RewriteSimplifier {
    *
    * \return an exit function that must be called to cleanup the constraint can be nullptr.
    */
-  std::function<void()> EnterConstraint(const PrimExpr& constraint);
+  TVM_DLL std::function<void()> EnterConstraint(const PrimExpr& constraint);
+
+  /*! \brief Flags to enable more computationally-intensive simplifications
+   *
+   * These simplifications may be required for specific schedules, but
+   * would impose too high a compile-time cost to enable by default.
+   * They can be enabled on an as-needed basis by calling
+   * `RewriteSimplifier::SetEnabledExtensions` prior to using
+   * `RewriteSimplifier::operator()`.
+   *
+   * Flags are defined as powers of two to allow future expansion.  To
+   * enable multiple extensions, a user should pass a bitwise OR of the
+   * flags for each desired extension.
+   */
+  enum Extension {
+    // No extensions enabled
+    kNone = 0,
+
+    /* When simplifying an inequality, attempt to use scope-based knowns.
+     *
+     * Example:
+     * if_then_else(i<j && j<k, i<k, false) => if_then_else(i<j && j<k, true, false)
+     */
+    kTransitivelyProveInequalities = (1 << 0),
+  };
+
+  /*! \brief Enable an optional extension or extensions
+   *
+   * \param flags A bitwise OR of all optional extensions that should
+   * be enabled.
+   */
+  TVM_DLL void SetEnabledExtensions(Extension flags);
+
+  /*! \brief Return the currently enabled extensions */
+  TVM_DLL Extension GetEnabledExtensions() const;
 
  private:
   friend class Analyzer;
@@ -317,6 +351,82 @@ class CanonicalSimplifier {
   Impl* impl_;
 };
 
+/*! \brief Structure for representing result of known
+ *
+ * Values are assigned to allow these flags to be used in bitwise
+ * operations.
+ */
+enum class CompareResult : int {
+  kInconsistent = 0,
+  kEQ = 1,
+  kLT = 2,
+  kLE = 3,
+  kGT = 4,
+  kGE = 5,
+  kNE = 6,
+  kUnknown = 7
+};
+
+inline constexpr CompareResult operator&(CompareResult lhs, CompareResult rhs) {
+  return CompareResult(static_cast<int>(lhs) & static_cast<int>(rhs));
+}
+inline constexpr CompareResult operator|(CompareResult lhs, CompareResult rhs) {
+  return CompareResult(static_cast<int>(lhs) | static_cast<int>(rhs));
+}
+
+/*!
+ * \brief Using previously specified knowns, compare the expressions provided
+ *
+ * Given known expressions [(a OP b), (b OP c), ..., (y OP z)], search
+ * for a known result for `(a OP z)`.
+ */
+class TransitiveComparisonAnalyzer {
+ public:
+  /* \brief Using previously specified knowns, compare the expressions provided
+   *
+   * \param lhs The left-hand side of the comparison
+   *
+   * \param rhs The right-hand side of the comparison
+   *
+   * \return The most specific result that can be proven about the
+   * comparison.  If nothing can be proven, returns kUnknown.
+   */
+  TVM_DLL CompareResult TryCompare(const PrimExpr& lhs, const PrimExpr& rhs);
+
+  /*! \brief Bind a variable as being equal to a known expression
+   *
+   * \param var The variable of interest.
+   * \param expr The bound expression
+   * \param allow_override Whether to allow override of existing information.
+   */
+  TVM_DLL void Bind(const Var& var, const PrimExpr& expr, bool allow_override = false);
+
+  /*! \brief Bind a variable as being within a specified range
+   *
+   * \param var The variable of interest.
+   * \param range The known range
+   * \param allow_override Whether to allow override of existing information.
+   */
+  TVM_DLL void Bind(const Var& var, const Range& range, bool allow_override = false);
+
+  /*!
+   * \brief Update the internal state to enter constraint.
+   * \param constraint A constraint expression.
+   *
+   * \return an exit function that must be called to cleanup the constraint can be nullptr.
+   */
+  TVM_DLL std::function<void()> EnterConstraint(const PrimExpr& constraint);
+
+ private:
+  friend class Analyzer;
+  friend class ConstraintContext;
+  TransitiveComparisonAnalyzer();
+  TVM_DLL ~TransitiveComparisonAnalyzer();
+  class Impl;
+  /*! \brief Internal impl */
+  std::unique_ptr<Impl> impl_;
+};
+
 /*!
  * \brief Constraint context.
  *
@@ -437,6 +547,8 @@ class TVM_DLL Analyzer {
   CanonicalSimplifier canonical_simplify;
   /*! \brief sub-analyzer: int set */
   IntSetAnalyzer int_set;
+  /*! \brief sub-analyzer transitive comparisons */
+  TransitiveComparisonAnalyzer transitive_comparisons;
   /*! \brief constructor */
   Analyzer();
   /*!
diff --git a/src/arith/analyzer.cc b/src/arith/analyzer.cc
index ad52a6578b24..921f8ac7094b 100644
--- a/src/arith/analyzer.cc
+++ b/src/arith/analyzer.cc
@@ -45,6 +45,7 @@ void Analyzer::Bind(const Var& var, const PrimExpr& expr, bool allow_override) {
   this->rewrite_simplify.Update(var, new_expr, allow_override);
   this->canonical_simplify.Update(var, new_expr, allow_override);
   this->int_set.Update(var, this->int_set(new_expr), allow_override);
+  this->transitive_comparisons.Bind(var, expr, allow_override);
 }
 
 void Analyzer::Bind(const Var& var, const Range& range, bool allow_override) {
@@ -54,6 +55,7 @@ void Analyzer::Bind(const Var& var, const Range& range, bool allow_override) {
   } else {
     this->const_int_bound.Bind(var, range, allow_override);
     this->int_set.Bind(var, range, allow_override);
+    this->transitive_comparisons.Bind(var, range, allow_override);
   }
   // skip modular_set
   // skip rewrite simplify
@@ -72,6 +74,7 @@ void ConstraintContext::EnterWithScope() {
   recovery_functions_.push_back(analyzer_->modular_set.EnterConstraint(constraint_));
   recovery_functions_.push_back(analyzer_->rewrite_simplify.EnterConstraint(constraint_));
   recovery_functions_.push_back(analyzer_->int_set.EnterConstraint(constraint_));
+  recovery_functions_.push_back(analyzer_->transitive_comparisons.EnterConstraint(constraint_));
 }
 
 void ConstraintContext::ExitWithScope() {
diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc
index f5d2667aa64e..39d626aaf2b4 100644
--- a/src/arith/canonical_simplify.cc
+++ b/src/arith/canonical_simplify.cc
@@ -891,7 +891,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const DivNode* op) {
           lhs.CopyOnWrite()->AddToSelf(pconst->value / cval);
         } else {
           // if 0 <= extra < cval, it means the extra can be eliminated.
-          if (TryCompare(temp, cval) != kLT) {
+          if (TryCompare(temp, cval) != CompareResult::kLT) {
             lhs.CopyOnWrite()->AddToSelf(SplitDivConst(ToSplitExpr(temp), cval, kTruncDiv), 1);
           }
         }
@@ -945,7 +945,8 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
         lhs.CopyOnWrite()->AddToSelf(floordiv(pconst->value, cval));
       } else {
         // if 0 <= extra < cval, it means the extra can be eliminated.
-        if (!(TryCompare(temp, cval) == kLT && analyzer_->CanProveGreaterEqual(temp, 0))) {
+        if (!(TryCompare(temp, cval) == CompareResult::kLT &&
+              analyzer_->CanProveGreaterEqual(temp, 0))) {
           lhs.CopyOnWrite()->AddToSelf(SplitDivConst(ToSplitExpr(temp), cval, kFloorDiv), 1);
         }
       }
@@ -1052,7 +1053,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ModNode* op) {
           return truncmod(temp, c1.Eval());
         } else {
           // If temp < cval && temp >=0 then can remove the mod.
-          if (TryCompare(temp, cval) == kLT) {
+          if (TryCompare(temp, cval) == CompareResult::kLT) {
             return temp;
           } else {
             // contonue to use logic below.
@@ -1113,7 +1114,8 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
         return floormod(temp, c1.Eval());
       } else {
         // If temp < cval && temp >=0 then can remove the mod.
-        if (TryCompare(temp, cval) == kLT && analyzer_->CanProveGreaterEqual(temp, 0)) {
+        if (TryCompare(temp, cval) == CompareResult::kLT &&
+            analyzer_->CanProveGreaterEqual(temp, 0)) {
           return temp;
         } else {
           // contonue to use logic below.
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index 2f7b88dfc508..019b8cd5d353 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -71,42 +71,70 @@ using namespace tir;
 // handled by CanonicalSimplifier.
 //
 
+CompareResult RewriteSimplifier::Impl::TryCompare(const PrimExpr& x, const PrimExpr& y) {
+  CompareResult output = CompareResult::kUnknown;
+
+  auto is_finished = [&output]() {
+    return output == CompareResult::kEQ || output == CompareResult::kLT ||
+           output == CompareResult::kGT;
+  };
+
+  output = CompareResult(output & TryCompareUsingConstIntBounds(x, y));
+
+  if (is_finished()) return output;
+
+  if (enabled_extensions_ & kTransitivelyProveInequalities) {
+    output = CompareResult(output & TryCompareUsingKnownInequalities(x, y));
+  }
+
+  return output;
+}
+
+CompareResult RewriteSimplifier::Impl::TryCompareUsingConstIntBounds(const PrimExpr& x,
+                                                                     const PrimExpr y) {
+  return TryCompare(x - y, 0);
+}
+
+CompareResult RewriteSimplifier::Impl::TryCompareUsingKnownInequalities(const PrimExpr& x,
+                                                                        const PrimExpr& y) {
+  return analyzer_->transitive_comparisons.TryCompare(x, y);
+}
+
 // try to prove x equals val
-RewriteSimplifier::Impl::CompareResult RewriteSimplifier::Impl::TryCompare(const PrimExpr& x,
-                                                                           int64_t val) {
+CompareResult RewriteSimplifier::Impl::TryCompare(const PrimExpr& x, int64_t val) {
   PrimExpr diff = this->VisitExpr(x);
   if (const auto* ptr = diff.as<IntImmNode>()) {
     if (ptr->value == val) {
-      return kEQ;
+      return CompareResult::kEQ;
     } else if (ptr->value > val) {
-      return kGT;
+      return CompareResult::kGT;
     } else if (ptr->value < val) {
-      return kLT;
+      return CompareResult::kLT;
     }
   }
   ConstIntBound dbound = analyzer_->const_int_bound(diff);
   if (dbound->min_value == val && dbound->max_value == val) {
-    return kEQ;
+    return CompareResult::kEQ;
   }
   if (dbound->min_value > val) {
-    return kGT;
+    return CompareResult::kGT;
   }
   if (dbound->max_value < val) {
-    return kLT;
+    return CompareResult::kLT;
   }
   if (dbound->min_value >= val) {
-    return kGE;
+    return CompareResult::kGE;
   }
   if (dbound->max_value <= val) {
-    return kLE;
+    return CompareResult::kLE;
   }
   if (val == 0) {
     ModularSet dmod = analyzer_->modular_set(diff);
     if (dmod->base != 0) {
-      return kNE;
+      return CompareResult::kNE;
     }
   }
-  return kUnknown;
+  return CompareResult::kUnknown;
 }
 
 void RewriteSimplifier::Impl::Update(const Var& var, const PrimExpr& info, bool can_override) {
@@ -254,6 +282,12 @@ std::function<void()> RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c
   return frecover;
 }
 
+void RewriteSimplifier::Impl::SetEnabledExtensions(Extension flags) { enabled_extensions_ = flags; }
+
+RewriteSimplifier::Extension RewriteSimplifier::Impl::GetEnabledExtensions() const {
+  return enabled_extensions_;
+}
+
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const SubNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<SubNode>();
@@ -1333,10 +1367,11 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const EQNode* op) {
   }
 
   if (IsIndexType(op->a.dtype())) {
-    CompareResult result = TryCompare(op->a - op->b, 0);
-    if (result == kEQ) {
+    CompareResult result = TryCompare(op->a, op->b);
+    if (result == CompareResult::kEQ) {
       return make_const(op->dtype, true);
-    } else if (result == kNE || result == kGT || result == kLT) {
+    } else if (result == CompareResult::kNE || result == CompareResult::kGT ||
+               result == CompareResult::kLT) {
       return make_const(op->dtype, false);
     }
     TVM_TRY_REWRITE(x - c1 == 0, x == c1);
@@ -1382,11 +1417,12 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LTNode* op) {
   }
 
   if (IsIndexType(op->a.dtype())) {
-    CompareResult result = TryCompare(op->a - op->b, 0);
-    if (result == kLT) {
+    CompareResult result = TryCompare(op->a, op->b);
+    if (result == CompareResult::kLT) {
       return make_const(op->dtype, true);
     }
-    if (result == kEQ || result == kGT || result == kGE) {
+    if (result == CompareResult::kEQ || result == CompareResult::kGT ||
+        result == CompareResult::kGE) {
       return make_const(op->dtype, false);
     }
 
@@ -1742,6 +1778,13 @@ std::function<void()> RewriteSimplifier::EnterConstraint(const PrimExpr& constra
   return impl_->EnterConstraint(constraint);
 }
 
+void RewriteSimplifier::SetEnabledExtensions(Extension flags) {
+  impl_->SetEnabledExtensions(flags);
+}
+RewriteSimplifier::Extension RewriteSimplifier::GetEnabledExtensions() const {
+  return impl_->GetEnabledExtensions();
+}
+
 RewriteSimplifier::RewriteSimplifier(Analyzer* parent) : impl_(new Impl(parent)) {}
 
 RewriteSimplifier::~RewriteSimplifier() { delete impl_; }
diff --git a/src/arith/rewrite_simplify.h b/src/arith/rewrite_simplify.h
index 6007b6416742..00c60e21ee42 100644
--- a/src/arith/rewrite_simplify.h
+++ b/src/arith/rewrite_simplify.h
@@ -77,9 +77,17 @@ class RewriteSimplifier::Impl : public IRMutatorWithAnalyzer {
 
   std::function<void()> EnterConstraint(const PrimExpr& constraint);
 
+  /*! \brief Enable an optional extension or extensions
+   *
+   * \param flags A bitwise OR of all optional extensions that should
+   * be enabled.
+   */
+  void SetEnabledExtensions(Extension flags);
+
+  /*! \brief Return the currently enabled extensions */
+  Extension GetEnabledExtensions() const;
+
  protected:
-  /*! \brief internal structure for comparison. */
-  enum CompareResult { kUnknown, kEQ, kGT, kGE, kLT, kLE, kNE };
   // counter to record recursive rewrite depth.
   int recur_depth_{0};
   // internal variable map
@@ -87,6 +95,9 @@ class RewriteSimplifier::Impl : public IRMutatorWithAnalyzer {
 
   std::vector<PrimExpr> literal_constraints_;
 
+  // Optionally enabled extensions
+  Extension enabled_extensions_{kNone};
+
   // maximum number of recursion allowed during a single pass.
   static const constexpr int kMaxRecurDepth = 5;
 
@@ -98,6 +109,14 @@ class RewriteSimplifier::Impl : public IRMutatorWithAnalyzer {
    */
   CompareResult TryCompare(const PrimExpr& x, int64_t val);
 
+  /*! Try to compare x against y
+   *
+   * \param x The lhs of the comparison
+   * \param y The rhs of the comparison
+   * \return comparison result.
+   */
+  CompareResult TryCompare(const PrimExpr& x, const PrimExpr& y);
+
   /*!
    * \brief Internal function to check whether or not to inline let.
    * \param op The let expr.
@@ -115,6 +134,9 @@ class RewriteSimplifier::Impl : public IRMutatorWithAnalyzer {
   Optional<PrimExpr> TryMatchLiteralConstraint(const PrimExpr& expr) const;
 
  private:
+  CompareResult TryCompareUsingKnownInequalities(const PrimExpr& x, const PrimExpr& y);
+  CompareResult TryCompareUsingConstIntBounds(const PrimExpr& x, const PrimExpr y);
+
   // Whether x >= val
   bool CanProveGreaterEqual(const PrimExpr& x, int64_t val) {
     return analyzer_->CanProveGreaterEqual(x, val);
@@ -124,7 +146,7 @@ class RewriteSimplifier::Impl : public IRMutatorWithAnalyzer {
   // Whether x == val
   bool CanProveEqual(const PrimExpr& x, int64_t val) {
     // TODO(tqchen) refer back to super-analyzer.
-    return TryCompare(x, val) == kEQ;
+    return TryCompare(x, val) == CompareResult::kEQ;
   }
 
   // Recursive rewrite x
diff --git a/src/arith/transitive_comparison_analyzer.cc b/src/arith/transitive_comparison_analyzer.cc
new file mode 100644
index 000000000000..9a835f7fdec8
--- /dev/null
+++ b/src/arith/transitive_comparison_analyzer.cc
@@ -0,0 +1,791 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * \file tvm/arith/transitive_comparison_analyzer.cc
+ */
+
+#include <tvm/arith/analyzer.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/expr.h>
+
+#include <optional>
+#include <vector>
+
+#include "constraint_extract.h"
+#include "pattern_match.h"
+
+namespace tvm {
+namespace arith {
+
+using namespace tir;
+
+class TransitiveComparisonAnalyzer::Impl {
+ public:
+  /* \brief Using previously specified knowns, compare the expressions provided
+   *
+   * \param lhs The left-hand side of the comparison
+   *
+   * \param rhs The right-hand side of the comparison
+   *
+   * \return The most specific result that can be proven about the
+   * comparison.  If nothing can be proven, returns kUnknown.
+   */
+  CompareResult TryCompare(const PrimExpr& lhs, const PrimExpr& rhs) const;
+
+  /*! \brief Bind a variable as being equal to a known expression
+   *
+   * \param var The variable of interest.
+   * \param expr The bound expression
+   * \param allow_override Whether to allow override of existing information.
+   */
+  void Bind(const tir::Var& var, const PrimExpr& expr, bool allow_override = false);
+
+  /*! \brief Bind a variable as being within a specified range
+   *
+   * \param var The variable of interest.
+   * \param range The known range
+   * \param allow_override Whether to allow override of existing information.
+   */
+  void Bind(const tir::Var& var, const Range& expr, bool allow_override = false);
+
+  /*!
+   * \brief Update the internal state to enter constraint.
+   * \param constraint A constraint expression.
+   *
+   * \return An exit function that must be called to cleanup.  May be
+   * `nullptr`, if no cleanup is required.
+   */
+  std::function<void()> EnterConstraint(const PrimExpr& expr);
+
+ private:
+  /* \brief Internal representation of a PrimExpr
+   *
+   * The Key enum serves two purposes.
+   *
+   * 1. Providing efficiency, as compared to a PrimExpr.  Two keys are
+   *    equal if and only if the corresponding PrimExprs would satisfy
+   *    ExprDeepEqual.  This allows two expressions to be checked for
+   *    equivalency, without requiring a call to ExprDeepEqual for
+   *    each comparison.
+   *
+   * 2. Providing type-safety, as compared to using `size_t` directly.
+   *    Requiring an explicit conversion from an integer to a Key
+   *    prevents accidental comparisons, especially if both loop
+   *    iterators and Keys are used in the same scope.
+   *
+   * A Key should only be obtained using the methods `ExprToKey` and
+   * `ExprToPreviousKey`.
+   */
+  enum class Key : size_t {};
+
+  /*! \brief Convert an expression to internal representation
+   *
+   * If the expression has previously been converted to the internal
+   * representation, returns the same Key as has been used previously.
+   * Otherwise, generate and return a new Key.
+   *
+   * \param expr The PrimExpr to be converted
+   *
+   * \returns The Key representing the expression
+   *
+   * \see ExprToPreviousKey
+   */
+  Key ExprToKey(const PrimExpr& expr);
+
+  /*! \brief Convert an expression to internal representation
+   *
+   * If the expression has previously been converted to the internal
+   * representation, returns the same Key as has been used previously.
+   * Otherwise, return `std::nullopt`.
+   *
+   * \param expr The PrimExpr to be converted
+   *
+   * \returns The Key representing the expression, if one exists.
+   *
+   * \see ExprToKey
+   */
+  std::optional<Key> ExprToPreviousKey(const PrimExpr& expr) const;
+
+  /*! \brief The mapping from expression to Key
+   *
+   * Should not be used directly.  Instead, use the helper functions
+   * `ExprToKey` and `ExprToPreviousKey`.
+   *
+   * \see ExprToKey
+   * \see ExprToPreviousKey
+   */
+  std::unordered_map<PrimExpr, Key, StructuralHash, StructuralEqual> expr_to_key;
+
+  /*! \brief Internal representation of a comparison operator */
+  struct Comparison {
+    /*! \brief Construct a comparison that represents `lhs OP rhs +
+     * offset`, where the operation is specified by the CompareResult.
+     */
+    Comparison(Key lhs, Key rhs, int64_t offset, CompareResult result);
+
+    /*! \brief Utility function to validate that all GT and LT results
+     *  have been normalized out
+     */
+    bool IsNormalized() const;
+
+    /*! \brief Move the specified expression to the LHS.
+     *
+     * \param new_lhs The argument that should be moved to the LHS of the
+     * comparison.
+     *
+     * \return If possible, returns a comparison that is equivalent to
+     * the current comparison, but with the specified LHS.  If not
+     * possible, returns nullopt.
+     */
+    std::optional<Comparison> WithLHS(Key new_lhs) const;
+
+    /*! \brief Create the negation of the current comparison */
+    Comparison Negated() const;
+
+    /*! \brief Check the this comparison implies
+     *
+     * Returns true if this comparison being true implies that the
+     * other comparison must also be true.  Returns false if the other
+     * comparison cannot be shown to be true.
+     */
+    bool Implies(const Comparison& other) const;
+
+    // The LHS of the comparison
+    Key lhs_;
+
+    // The RHS of the comparison, not including any constant offset.
+    Key rhs_;
+
+    // Additive offset on rhs
+    int64_t offset_{0};
+
+    // The comparison operator.
+    CompareResult result_{CompareResult::kInconsistent};
+  };
+
+  /*! \brief Generate a Comparison representing the given expression */
+  std::optional<Comparison> FromExpr(const PrimExpr& expr);
+
+  /*! \brief Utility function used by Bind and EnterConstraint
+   *
+   * \param expr The comparison expression, to be converted into
+   * internal Comparison objects.
+   *
+   * \param vec The vector to which the Comparison objects should be
+   * appended.
+   */
+  void AddKnown(const PrimExpr& expr, std::vector<Comparison>* vec);
+
+  /*! \brief Attempt to compare the expressions, starting at the lhs.
+   *
+   * Perform a depth-first search through the space of known
+   * expressions, starting at the LHS of a comparison.  In this
+   * search, each expression is a node of a graph, and each known
+   * comparison is an edge of the graph.
+   *
+   * For example, suppose we have previous knowns of (A<=B), (B<=C+1)
+   * and (C<=D-5).  The expressions [A,B,C,D] are the nodes of the
+   * search space.  Each comparison is an edge connecting two
+   * expressions, such as (B<=C+1) connecting the expressions B and D.
+   * If we are attempting to compare expressions A and D, a search
+   * starting at expression A could follow each edge until reaching
+   * expression D, then combine the comparisons that compose the path
+   * into the expression A<=D-4.
+   *
+   * \param lhs The left-hand side of the comparison
+   *
+   * \param rhs The right-hand side of the comparison
+   *
+   * \return The result of the comparison
+   */
+  CompareResult DFSFromLHS(Key lhs_key, Key rhs_key, int64_t offset, const PrimExpr& lhs,
+                           const PrimExpr& rhs) const;
+
+  /*! \brief Previous Range bindings
+   *
+   * Tracked separatedly to handle the `allow_override` option used by
+   * all sub-analyzers when binding variables.
+   */
+  Map<Var, Range> prev_bindings_;
+
+  /*! \brief Known comparisons based on definitionally-true statements
+   *
+   * For example, a Let binding, or the range of an iterator.  These
+   * known statements are always true, based on the definition site of
+   * the variable.  e.g. A loop iterator may never exceed the bounds
+   * of its loop.
+   */
+  std::vector<Comparison> knowns_;
+
+  /*! \brief Known comparisons based on scoped conditions
+   *
+   * For example, the condition of an IfThenElse.  These known
+   * statements may only be used within the scope of the statement
+   * from which they were derived.  e.g. After exiting an IfThenElse,
+   * the condition may no longer be true.
+   */
+  std::vector<Comparison> scoped_knowns_;
+};
+
+namespace {
+
+// Internal utility, return the CompareResult resulting from swapping
+// the left-hand side with the right-hand side.
+CompareResult Reverse(CompareResult res) {
+  switch (res) {
+    case CompareResult::kInconsistent:
+      return CompareResult::kInconsistent;
+    case CompareResult::kEQ:
+      return CompareResult::kEQ;
+    case CompareResult::kLT:
+      return CompareResult::kGT;
+    case CompareResult::kLE:
+      return CompareResult::kGE;
+    case CompareResult::kGT:
+      return CompareResult::kLT;
+    case CompareResult::kGE:
+      return CompareResult::kLE;
+    case CompareResult::kNE:
+      return CompareResult::kNE;
+    case CompareResult::kUnknown:
+      return CompareResult::kUnknown;
+    default:
+      LOG(FATAL) << "Invalid CompareResult: " << static_cast<int>(res);
+      return CompareResult::kInconsistent;
+  }
+}
+
+// Internal utility, return the CompareResult resulting from negating
+// the comparison.
+CompareResult Negate(CompareResult res) {
+  switch (res) {
+    case CompareResult::kInconsistent:
+      return CompareResult::kInconsistent;
+    case CompareResult::kUnknown:
+      return CompareResult::kUnknown;
+    default:
+      return CompareResult(~static_cast<int>(res) & static_cast<int>(CompareResult::kUnknown));
+  }
+}
+
+// Internal utility, extract constant offsets out of the two sides of
+// a comparison.  Given lhs and rhs, return a tuple of three elements
+// (lhs_inner, rhs_inner, offset), such that (lhs OP rhs) and
+// (lhs_inner OP rhs_inner + offset) are equivalent.
+std::tuple<PrimExpr, PrimExpr, int64_t> ExtractOffsets(const PrimExpr& lhs, const PrimExpr& rhs) {
+  auto extract_offset = [](const PrimExpr& expr) -> std::pair<PrimExpr, int64_t> {
+    PVar<PrimExpr> x;
+    PVar<IntImm> c;
+    if ((x + c).Match(expr)) {
+      return {x.Eval(), c.Eval()->value};
+    } else if ((x - c).Match(expr)) {
+      return {x.Eval(), -c.Eval()->value};
+    } else if (c.Match(expr)) {
+      return {0, c.Eval()->value};
+    } else {
+      return {expr, 0};
+    }
+  };
+
+  auto lhs_split = extract_offset(lhs);
+  auto rhs_split = extract_offset(rhs);
+  return {lhs_split.first, rhs_split.first, rhs_split.second - lhs_split.second};
+}
+
+}  // namespace
+
+std::optional<TransitiveComparisonAnalyzer::Impl::Comparison>
+TransitiveComparisonAnalyzer::Impl::FromExpr(const PrimExpr& expr) {
+  CompareResult res;
+  PVar<PrimExpr> x, y;
+  if ((x <= y).Match(expr)) {
+    res = CompareResult::kLE;
+  } else if ((x >= y).Match(expr)) {
+    res = CompareResult::kGE;
+  } else if ((x < y).Match(expr)) {
+    res = CompareResult::kLT;
+  } else if ((x > y).Match(expr)) {
+    res = CompareResult::kGT;
+  } else if ((x == y).Match(expr)) {
+    res = CompareResult::kEQ;
+  } else if ((x != y).Match(expr)) {
+    res = CompareResult::kNE;
+  } else {
+    return std::nullopt;
+  }
+
+  PrimExpr lhs_expr = x.Eval();
+  PrimExpr rhs_expr = y.Eval();
+
+  if (lhs_expr.as<IntImmNode>() && rhs_expr.as<IntImmNode>()) {
+    return std::nullopt;
+  }
+
+  auto [lhs, rhs, offset] = ExtractOffsets(lhs_expr, rhs_expr);
+  Key lhs_key = ExprToKey(lhs);
+  Key rhs_key = ExprToKey(rhs);
+
+  return Comparison(lhs_key, rhs_key, offset, res);
+}
+
+TransitiveComparisonAnalyzer::Impl::Comparison::Comparison(Key lhs, Key rhs, int64_t offset,
+                                                           CompareResult result)
+    : lhs_(lhs), rhs_(rhs), offset_(offset), result_(result) {
+  // Normalize the comparison to remove LT and GT expressions,
+  // reducing the number of operators that must be handled later.  By
+  // eliminating LT and GT, instead of eliminating LE or GE, a
+  // potential off-by-one error is avoided.
+  //
+  // For floating-point numbers, (x < y + c1) and (y < z + c2) implies
+  // that (x < z + (c1 + c2)).  For integer types, which the
+  // TransitiveComparisonAnalyzer is intended for use with integers,
+  // LT or GT can give a tighter constraint, though with a less
+  // convenient symmetry.
+  //
+  // i < j + c1, j < k + c2
+  // i <= j + c1 - 1, j <= k + c2 - 1
+  // i + 1 - c1 <= j, j <= k + c2 - 1
+  // i + 1 - c1 <= k + c2 - 1
+  // i <= k + c1 + c2 - 2
+  // i < k + (c1 + c2 - 1)
+  //
+  // By always working with LE and GE comparisons, we avoid needing to
+  // handle the offset of one that would be introduced by LT and GT at
+  // all points of use.  The only point of use for LT and GT is when
+  // normalizing comparisons (i.e. this constructor).
+
+  if (result_ == CompareResult::kLT) {
+    result_ = CompareResult::kLE;
+    offset_ -= 1;
+  }
+  if (result_ == CompareResult::kGT) {
+    result_ = CompareResult::kGE;
+    offset_ += 1;
+  }
+}
+
+std::optional<TransitiveComparisonAnalyzer::Impl::Key>
+TransitiveComparisonAnalyzer::Impl::ExprToPreviousKey(const PrimExpr& expr) const {
+  auto it = expr_to_key.find(expr);
+  if (it != expr_to_key.end()) {
+    return it->second;
+  } else {
+    return std::nullopt;
+  }
+}
+
+TransitiveComparisonAnalyzer::Impl::Key TransitiveComparisonAnalyzer::Impl::ExprToKey(
+    const PrimExpr& expr) {
+  if (auto prev = ExprToPreviousKey(expr)) {
+    return prev.value();
+  } else {
+    Key new_key = Key(expr_to_key.size());
+    expr_to_key[expr] = new_key;
+    return new_key;
+  }
+}
+
+bool TransitiveComparisonAnalyzer::Impl::Comparison::IsNormalized() const {
+  // These < and > should be removed during normalization.  See the
+  // `Comparison::Comparison` constructor for further details.
+  return result_ != CompareResult::kLT && result_ != CompareResult::kGT;
+}
+
+std::optional<TransitiveComparisonAnalyzer::Impl::Comparison>
+TransitiveComparisonAnalyzer::Impl::Comparison::WithLHS(Key new_lhs) const {
+  if (new_lhs == lhs_) {
+    return *this;
+  } else if (new_lhs == rhs_) {
+    return Comparison(rhs_, lhs_, -offset_, Reverse(result_));
+  } else {
+    return std::nullopt;
+  }
+}
+
+TransitiveComparisonAnalyzer::Impl::Comparison
+TransitiveComparisonAnalyzer::Impl::Comparison::Negated() const {
+  return Comparison(lhs_, rhs_, offset_, Negate(result_));
+}
+
+bool TransitiveComparisonAnalyzer::Impl::Comparison::Implies(
+    const TransitiveComparisonAnalyzer::Impl::Comparison& other) const {
+  ICHECK(lhs_ == other.lhs_);
+  ICHECK(rhs_ == other.rhs_);
+  ICHECK(IsNormalized());
+  ICHECK(other.IsNormalized());
+
+  if (result_ == other.result_ && offset_ == other.offset_) {
+    // if c1 == c2, x != y + c1 => x != y + c2
+    // if c1 == c2, x == y + c1 => x == y + c2
+    return true;
+  }
+
+  if (other.result_ == CompareResult::kLE && offset_ <= other.offset_) {
+    if (result_ == CompareResult::kEQ || result_ == CompareResult::kLE) {
+      // if c1 <= c2, x <= y + c1 => x <= y + c2
+      // if c1 <= c2, x == y + c1 => x <= y + c2
+      return true;
+    }
+  }
+
+  if (other.result_ == CompareResult::kGE && offset_ >= other.offset_) {
+    if (result_ == CompareResult::kEQ || result_ == CompareResult::kGE) {
+      // if c1 >= c2, x == y + c1 => x >= y + c2
+      // if c1 >= c2, x >= y + c1 => x >= y + c2
+      return true;
+    }
+  }
+
+  if (other.result_ == CompareResult::kNE) {
+    if (result_ == CompareResult::kEQ && offset_ != other.offset_) {
+      // if c1 != c2, x == y + c1 => x != y + c2
+      return true;
+    }
+
+    if (result_ == CompareResult::kLE && offset_ < other.offset_) {
+      // if c1 < c2, x <= y + c1 => x < y + c2 => x != y + c2
+      return true;
+    }
+
+    if (result_ == CompareResult::kGE && offset_ > other.offset_) {
+      // if c1 != c2, x >= y + c1 => x > y + c2 => x != y + c2
+      return true;
+    }
+  }
+
+  return false;
+}
+
+TransitiveComparisonAnalyzer::TransitiveComparisonAnalyzer() : impl_(std::make_unique<Impl>()) {}
+TransitiveComparisonAnalyzer::~TransitiveComparisonAnalyzer() {}
+
+CompareResult TransitiveComparisonAnalyzer::TryCompare(const PrimExpr& lhs, const PrimExpr& rhs) {
+  return impl_->TryCompare(lhs, rhs);
+}
+
+void TransitiveComparisonAnalyzer::Bind(const Var& var, const PrimExpr& expr, bool allow_override) {
+  impl_->Bind(var, expr, allow_override);
+}
+void TransitiveComparisonAnalyzer::Bind(const Var& var, const Range& range, bool allow_override) {
+  impl_->Bind(var, range, allow_override);
+}
+
+std::function<void()> TransitiveComparisonAnalyzer::EnterConstraint(const PrimExpr& constraint) {
+  return impl_->EnterConstraint(constraint);
+}
+
+void TransitiveComparisonAnalyzer::Impl::AddKnown(const PrimExpr& expr,
+                                                  std::vector<Comparison>* vec) {
+  for (const auto& subexpr : ExtractConstraints(expr)) {
+    if (tir::SideEffect(expr) <= tir::CallEffectKind::kPure) {
+      if (auto cmp = FromExpr(subexpr)) {
+        vec->push_back(cmp.value());
+      }
+    }
+  }
+}
+
+void TransitiveComparisonAnalyzer::Impl::Bind(const tir::Var& var, const Range& range,
+                                              bool allow_override) {
+  auto it = prev_bindings_.find(var);
+  if (it != prev_bindings_.end()) {
+    ExprDeepEqual expr_equal;
+    bool differs_from_previous = !expr_equal(range->min, (*it).second->min) ||
+                                 !expr_equal(range->extent, (*it).second->extent);
+    if (differs_from_previous) {
+      ICHECK(allow_override) << "Binding of variable " << var << " as " << range
+                             << " conflicts with previous binding as " << (*it).second;
+      if (auto key = ExprToPreviousKey(var)) {
+        knowns_.erase(std::remove_if(knowns_.begin(), knowns_.end(),
+                                     [&](const auto& known) { return known.lhs_ == key.value(); }),
+                      knowns_.end());
+      }
+    }
+  }
+
+  prev_bindings_.Set(var, range);
+
+  if (is_const_int(range->extent, 1)) {
+    AddKnown(var == range->min, &knowns_);
+  } else {
+    AddKnown(var >= range->min, &knowns_);
+    AddKnown(var < range->min + range->extent, &knowns_);
+  }
+}
+
+void TransitiveComparisonAnalyzer::Impl::Bind(const tir::Var& var, const PrimExpr& expr,
+                                              bool allow_override) {
+  Bind(var, Range::FromMinExtent(expr, 1), allow_override);
+}
+
+std::function<void()> TransitiveComparisonAnalyzer::Impl::EnterConstraint(const PrimExpr& expr) {
+  size_t old_literal_size = scoped_knowns_.size();
+  AddKnown(expr, &scoped_knowns_);
+  size_t new_literal_size = scoped_knowns_.size();
+
+  auto frecover = [old_literal_size, new_literal_size, this]() {
+    ICHECK_EQ(scoped_knowns_.size(), new_literal_size);
+    scoped_knowns_.erase(scoped_knowns_.begin() + old_literal_size, scoped_knowns_.end());
+  };
+  return frecover;
+}
+
+CompareResult TransitiveComparisonAnalyzer::Impl::TryCompare(const PrimExpr& lhs_expr,
+                                                             const PrimExpr& rhs_expr) const {
+  // Currently only supports integer checks
+  if (!lhs_expr.dtype().is_int() || !rhs_expr.dtype().is_int()) {
+    return CompareResult::kUnknown;
+  }
+
+  // Bail out early if possible.  This int check should have been
+  // constant-folded earlier, so this check shouldn't occur.
+  auto* x_int = lhs_expr.as<IntImmNode>();
+  auto* y_int = rhs_expr.as<IntImmNode>();
+  if (x_int && y_int) {
+    if (x_int->value < y_int->value) {
+      return CompareResult::kLT;
+    } else if (x_int->value > y_int->value) {
+      return CompareResult::kGT;
+    } else {
+      return CompareResult::kEQ;
+    }
+  }
+
+  auto [lhs, rhs, offset] = ExtractOffsets(lhs_expr, rhs_expr);
+  auto lhs_key = ExprToPreviousKey(lhs);
+  auto rhs_key = ExprToPreviousKey(rhs);
+
+  if (!lhs_key.has_value() || !rhs_key.has_value()) {
+    return CompareResult::kUnknown;
+  }
+
+  auto from_lhs = DFSFromLHS(lhs_key.value(), rhs_key.value(), offset, lhs, rhs);
+  auto from_rhs = Reverse(DFSFromLHS(rhs_key.value(), lhs_key.value(), -offset, rhs, lhs));
+  auto output = from_lhs & from_rhs;
+
+  return output;
+}
+
+CompareResult TransitiveComparisonAnalyzer::Impl::DFSFromLHS(Key lhs_key_input, Key rhs_key_input,
+                                                             int64_t offset_input,
+                                                             const PrimExpr& lhs_input,
+                                                             const PrimExpr& rhs_input) const {
+  Key lhs_key = lhs_key_input;
+  Key rhs_key = rhs_key_input;
+  int64_t offset = offset_input;
+
+  // Everything in `to_visit` has lhs as its lhs.
+  std::unordered_set<Key> seen;
+  std::unordered_set<Key> to_visit;
+  std::unordered_map<Key, std::vector<Comparison>> compared_to_x;
+
+  // Utility function to add a new known statement
+  auto declare_known = [&](Comparison cmp) {
+    std::vector<Comparison>& knowns = compared_to_x[cmp.rhs_];
+
+    // The comparison adds no new information, no modification
+    // required.
+    for (auto& prev_known : knowns) {
+      if (prev_known.Implies(cmp)) {
+        return;
+      }
+    }
+
+    // New information may require visiting a new expression.
+    if (cmp.rhs_ != rhs_key && !seen.count(cmp.rhs_)) {
+      to_visit.insert(cmp.rhs_);
+      seen.insert(cmp.rhs_);
+    }
+
+    // This comparison is a stronger version of a previous constraint.
+    // Therefore, replace the old version entirely.
+    for (auto& prev_known : knowns) {
+      if (cmp.Implies(prev_known)) {
+        prev_known = cmp;
+        return;
+      }
+    }
+
+    // Neither a superset nor a subset of previously known
+    // constraints, must be tracked separately.
+    knowns.push_back(cmp);
+  };
+
+  // Initialize the search based on any known (in)equalities that use
+  // the LHS of the comparison.
+  for (const auto& known : knowns_) {
+    if (auto normalized = known.WithLHS(lhs_key)) {
+      declare_known(normalized.value());
+    }
+  }
+  for (const auto& known : scoped_knowns_) {
+    if (auto normalized = known.WithLHS(lhs_key)) {
+      declare_known(normalized.value());
+    }
+  }
+
+  // Walk through the space of all comparisons that can be made with
+  // LHS.
+  while (to_visit.size()) {
+    Key middle_key = *to_visit.begin();
+    to_visit.erase(to_visit.begin());
+
+    std::vector<Comparison>& prev_knowns_using_middle = compared_to_x.at(middle_key);
+    ICHECK(compared_to_x.count(middle_key));
+
+    std::vector<Comparison> new_knowns_using_lhs;
+
+    auto attempt_transitive = [&](Comparison cmp) {
+      ICHECK(cmp.IsNormalized());
+
+      Key right_key = cmp.rhs_;
+
+      if (right_key == lhs_key) {
+        return;
+      }
+
+      for (const auto& prev : prev_knowns_using_middle) {
+        CompareResult new_result = CompareResult::kUnknown;
+        int64_t new_offset = prev.offset_ + cmp.offset_;
+
+        if (prev.result_ == CompareResult::kEQ) {
+          // x == y + c1 && y OP z + c2, x OP z + (c1 + c2)
+          new_result = cmp.result_;
+        } else if (cmp.result_ == CompareResult::kEQ) {
+          // x OP y + c1 && y == z + c2, x OP z + (c1 + c2)
+          new_result = prev.result_;
+        } else if (prev.result_ == cmp.result_ &&
+                   (prev.result_ == CompareResult::kLE || prev.result_ == CompareResult::kGE)) {
+          // x <= y + c1 && y <= z + c2, x <= z + (c1 + c2)
+          // x >= y + c1 && y >= z + c2, x >= z + (c1 + c2)
+          //
+          // This condition is much simpler to write than the
+          // equivalent handling of < or of >, which is why the
+          // inequalities are normalized to <= and to >=.  See
+          // `TransitiveComparisonAnalyzer::Impl::Comparison::Comparison`
+          // for further details.
+          new_result = prev.result_;
+        }
+
+        if (new_result != CompareResult::kUnknown) {
+          Comparison new_known(lhs_key, right_key, new_offset, new_result);
+          new_knowns_using_lhs.push_back(new_known);
+        }
+      }
+    };
+
+    // Attempt to prove a new comparison using one of the original
+    // known comparisons.  We want to find a known such that
+    // `(LHS OP1 middle) && (middle OP2 right)` can be simplified
+    // into `(LHS OP3 right)`.
+    //
+    // Note: The right side is this step is not necessarily the RHS of
+    // the comparison we're trying to prove, as we may need to find
+    // intermediate comparisons first.  For example, if we know that
+    // `a<=b`, `b<=c`, and `c<=d`, and we wish to prove that `a<=d`,
+    // we must first combine `a<=b` and `b<=c` into `a<=c`.  During
+    // this first step, `b` is the "middle" and `c` is the "right".
+    // The next step can then combind `a<=c` and `c<=d` into `a<=d`.
+    for (const auto& known : knowns_) {
+      if (auto cmp = known.WithLHS(middle_key)) {
+        attempt_transitive(cmp.value());
+      }
+    }
+
+    for (const auto& known : scoped_knowns_) {
+      if (auto cmp = known.WithLHS(middle_key)) {
+        attempt_transitive(cmp.value());
+      }
+    }
+
+    // Collect together all new knowns, marking new nodes for visiting
+    // as needed.
+    for (const auto& new_known : new_knowns_using_lhs) {
+      declare_known(new_known);
+    }
+  }
+
+  // It's possible that we don't have any transitive comparisons that
+  // can prove something about LHS and RHS.
+  auto it = compared_to_x.find(rhs_key);
+  if (it == compared_to_x.end()) {
+    return CompareResult::kUnknown;
+  }
+
+  const std::vector<Comparison>& known_between_lhs_and_rhs = it->second;
+
+  // Just because we found a comparison involving LHS and RHS doesn't
+  // mean that it's useful.  e.g. Knowing that `x < y` doesn't let us
+  // prove whether `x + 5 < y`.
+  CompareResult result = CompareResult::kUnknown;
+  for (const auto& known : known_between_lhs_and_rhs) {
+    switch (known.result_) {
+      case CompareResult::kInconsistent:
+        result = CompareResult::kInconsistent;
+        break;
+
+      case CompareResult::kEQ:
+        if (offset == known.offset_) {
+          result = result & CompareResult::kEQ;
+        } else {
+          result = result & CompareResult::kNE;
+        }
+        break;
+
+      case CompareResult::kLE:
+        if (known.offset_ < offset) {
+          result = result & CompareResult::kLT;
+        } else if (known.offset_ <= offset) {
+          result = result & CompareResult::kLE;
+        }
+        break;
+
+      case CompareResult::kGE:
+        if (known.offset_ > offset) {
+          result = result & CompareResult::kGT;
+        } else if (known.offset_ >= offset) {
+          result = result & CompareResult::kGE;
+        }
+        break;
+
+      case CompareResult::kNE:
+        if (offset == known.offset_) {
+          result = result & CompareResult::kNE;
+        }
+        break;
+
+      case CompareResult::kUnknown:
+        break;
+
+      case CompareResult::kGT:
+      case CompareResult::kLT:
+        LOG(FATAL) << "Internal error, normalized comparisons should only include <= and >=";
+        return CompareResult::kInconsistent;
+
+      default:
+        LOG(FATAL) << "Invalid CompareResult: " << static_cast<int>(known.result_);
+        return CompareResult::kInconsistent;
+    }
+  }
+
+  return result;
+}
+
+}  // namespace arith
+}  // namespace tvm
diff --git a/src/tir/transforms/simplify.cc b/src/tir/transforms/simplify.cc
index 1a61bf23432a..2a7c0f5a3585 100644
--- a/src/tir/transforms/simplify.cc
+++ b/src/tir/transforms/simplify.cc
@@ -36,6 +36,34 @@ namespace arith {
 
 using namespace tir;
 
+struct SimplifyConfigNode : public tvm::AttrsNode<SimplifyConfigNode> {
+  bool transitively_prove_inequalities;
+
+  TVM_DECLARE_ATTRS(SimplifyConfigNode, "tir.transform.SimplifyConfig") {
+    TVM_ATTR_FIELD(transitively_prove_inequalities)
+        .describe(
+            "If true, simplify conditionals with transitive combinations of scoped constraints")
+        .set_default(false);
+  }
+
+  RewriteSimplifier::Extension GetEnabledExtensions() const {
+    RewriteSimplifier::Extension flags = RewriteSimplifier::kNone;
+    if (transitively_prove_inequalities) {
+      flags =
+          RewriteSimplifier::Extension(flags | RewriteSimplifier::kTransitivelyProveInequalities);
+    }
+    return flags;
+  }
+};
+
+class SimplifyConfig : public Attrs {
+ public:
+  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(SimplifyConfig, Attrs, SimplifyConfigNode);
+};
+
+TVM_REGISTER_NODE_TYPE(SimplifyConfigNode);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.Simplify", SimplifyConfig);
+
 class StmtSimplifier : public IRMutatorWithAnalyzer {
  public:
   explicit StmtSimplifier(Analyzer* analyzer) : IRMutatorWithAnalyzer(analyzer) {}
@@ -159,8 +187,12 @@ namespace transform {
 
 Pass Simplify() {
   auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) {
-    auto* n = f.CopyOnWrite();
     arith::Analyzer analyzer;
+    auto cfg = ctx->GetConfig<arith::SimplifyConfig>("tir.Simplify")
+                   .value_or(AttrsWithDefaultValues<arith::SimplifyConfig>());
+    analyzer.rewrite_simplify.SetEnabledExtensions(cfg->GetEnabledExtensions());
+
+    auto* n = f.CopyOnWrite();
     n->body = arith::StmtSimplifier(&analyzer).Simplify(std::move(n->body));
     return f;
   };
diff --git a/tests/python/unittest/test_tir_transform_simplify.py b/tests/python/unittest/test_tir_transform_simplify.py
index 4ac502b21191..0a1263f70287 100644
--- a/tests/python/unittest/test_tir_transform_simplify.py
+++ b/tests/python/unittest/test_tir_transform_simplify.py
@@ -137,7 +137,20 @@ def sls(n, d):
 
 
 class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
-    transform = tvm.tir.transform.Simplify()
+    transitively_prove_inequalities = False
+
+    def transform(self):
+        def inner(mod):
+            config = {
+                "tir.Simplify": {
+                    "transitively_prove_inequalities": self.transitively_prove_inequalities,
+                }
+            }
+            with tvm.transform.PassContext(config=config):
+                mod = tvm.tir.transform.Simplify()(mod)
+            return mod
+
+        return inner
 
 
 class TestLoadStoreNoop(BaseBeforeAfter):
@@ -547,5 +560,131 @@ def before(A: T.Buffer[16, "float32"]):
     expected = before
 
 
+class TestRemoveTransitivelyProvableCondition(BaseBeforeAfter):
+    """Remove comparisons that may be proven using multiple others
+
+    For example, the `0 < i` and `i <= j` conditions can be used to prove
+    that `0 < j`.
+    """
+
+    transitively_prove_inequalities = True
+
+    i, j, k = [tvm.tir.Var(name, "int32") for name in "ijk"]
+    zero = tvm.tir.IntImm("int32", 0)
+
+    test_case = tvm.testing.parameter(
+        (tvm.tir.all(zero < i, i <= j), zero < j, True),
+        # Transitive comparisons from LT
+        (tvm.tir.all(i < j, j < k), i < k, True),
+        (tvm.tir.all(i < j, j == k), i < k, True),
+        (tvm.tir.all(i < j, j <= k), i < k, True),
+        (tvm.tir.all(i < j, j > k), i < k, False),
+        (tvm.tir.all(i < j, j >= k), i < k, False),
+        (tvm.tir.all(i < j, j != k), i < k, False),
+        # Transitive comparisons from LE
+        (tvm.tir.all(i <= j, j < k), i < k, True),
+        (tvm.tir.all(i <= j, j == k), i == k, False),
+        (tvm.tir.all(i <= j, j == k), i <= k, True),
+        (tvm.tir.all(i <= j, j <= k), i <= k, True),
+        (tvm.tir.all(i <= j, j <= k), i < k, False),
+        (tvm.tir.all(i <= j, j > k), i < k, False),
+        (tvm.tir.all(i <= j, j >= k), i < k, False),
+        (tvm.tir.all(i <= j, j != k), i < k, False),
+        # Transitive comparisons from GT
+        (tvm.tir.all(i > j, j > k), i > k, True),
+        (tvm.tir.all(i > j, j == k), i > k, True),
+        (tvm.tir.all(i > j, j >= k), i > k, True),
+        (tvm.tir.all(i > j, j < k), i > k, False),
+        (tvm.tir.all(i > j, j <= k), i > k, False),
+        (tvm.tir.all(i > j, j != k), i > k, False),
+        # Transitive comparisons from GE
+        (tvm.tir.all(i >= j, j > k), i > k, True),
+        (tvm.tir.all(i >= j, j == k), i == k, False),
+        (tvm.tir.all(i >= j, j == k), i >= k, True),
+        (tvm.tir.all(i >= j, j >= k), i >= k, True),
+        (tvm.tir.all(i >= j, j >= k), i > k, False),
+        (tvm.tir.all(i >= j, j < k), i > k, False),
+        (tvm.tir.all(i >= j, j <= k), i > k, False),
+        (tvm.tir.all(i >= j, j != k), i > k, False),
+        # GT or LT may be used to prove NE
+        (tvm.tir.all(i == j, j != k), i != k, True),
+        (tvm.tir.all(i == j, j < k), i != k, True),
+        (tvm.tir.all(i == j, j > k), i != k, True),
+        (tvm.tir.all(i == j, j != k), i < k, False),
+        (tvm.tir.all(i == j, j != k), i > k, False),
+        # Because these are integers, x<y is equivalent to x <= y-1,
+        # and may be used in equivalent simplifications.
+        (tvm.tir.all(i <= j - 1, j < k), i < k, True),
+        (tvm.tir.all(i <= j - 1, j == k), i < k, True),
+        (tvm.tir.all(i <= j - 1, j <= k), i < k, True),
+        (tvm.tir.all(i <= j - 1, j > k), i < k, False),
+        (tvm.tir.all(i <= j - 1, j >= k), i < k, False),
+        (tvm.tir.all(i <= j - 1, j != k), i < k, False),
+        # Either or both inequalities may have an additive offset.
+        (tvm.tir.all(i <= j + 5, j <= k + 7), i <= k + 12, True),
+        (tvm.tir.all(i <= j + 5, j <= k + 7), i <= k + 11, False),
+        # For floats, x < y + c1 and y < z + c2 implies that x < z + (c1 + c2).
+        # Because this simplification applies to integers, transitive
+        # application of LT or GT can give a tighter constraint.
+        #
+        # i < j + c1, j < k + c2
+        # i <= j + c1 - 1, j <= k + c2 - 1
+        # i + 1 - c1 <= j, j <= k + c2 - 1
+        # i + 1 - c1 <= k + c2 - 1
+        # i <= k + c1 + c2 - 2
+        # i < k + (c1 + c2 - 1)
+        #
+        (tvm.tir.all(i < j + 5, j < k + 7), i < k + 11, True),
+        (tvm.tir.all(i < j + 5, j < k + 7), i < k + 10, False),
+    )
+
+    @tvm.testing.fixture
+    def before(self, test_case):
+        priors, postulate, _ = test_case
+
+        @T.prim_func
+        def func(A: T.Buffer[1, "bool"]):
+            if priors:
+                A[0] = postulate
+
+        return func
+
+    @tvm.testing.fixture
+    def expected(self, test_case):
+        priors, postulate, provable = test_case
+
+        analyzer = tvm.arith.Analyzer()
+        priors = analyzer.canonical_simplify(priors)
+
+        if provable:
+
+            @T.prim_func
+            def func(A: T.Buffer[1, "bool"]):
+                if priors:
+                    A[0] = True
+
+            return func
+
+        else:
+            postulate = analyzer.canonical_simplify(postulate)
+
+            @T.prim_func
+            def func(A: T.Buffer[1, "bool"]):
+                if priors:
+                    A[0] = postulate
+
+            return func
+
+
+class TestSuppressTransitivelyProvableCondition(BaseBeforeAfter):
+    transitively_prove_inequalities = False
+
+    def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
+        if i < j and j < k:
+            A[0] = i < k
+
+    expected = before
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From d9e01475af1253ca3fb52d7ad91165407ca8e740 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 7 Oct 2022 10:43:16 -0500
Subject: [PATCH 323/704] [UnitTest][TIR] Support IRModule comparisons in
 CompareBeforeAfter (#12920)

A follow-up commit from https://github.com/apache/tvm/pull/12264.
This allows the before/expected fixtures generated by
`tvm.testing.CompareBeforeAfter` to be `IRModule` instances as well as
`PrimFunc`.  This is intended to allow testing that requires comparing
more than one function (e.g. hoisting/fusing a PrimFunc).

* Prevent circular fixture references
---
 python/tvm/testing/utils.py                   | 105 ++++++++++++------
 .../unittest/test_tvm_testing_before_after.py |  49 +++++++-
 2 files changed, 117 insertions(+), 37 deletions(-)

diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 1c4dcba29d6c..f89d5e636913 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -1914,10 +1914,7 @@ def __init_subclass__(cls):
             cls.transform = cls._normalize_transform(cls.transform)
 
     @classmethod
-    def _normalize_before(cls, func):
-        if hasattr(func, "_pytestfixturefunction"):
-            return func
-
+    def _normalize_ir_module(cls, func):
         if isinstance(func, tvm.tir.PrimFunc):
 
             def inner(self):
@@ -1930,6 +1927,22 @@ def inner(self):
                 # pylint: disable=unused-argument
                 return func(self)
 
+        elif inspect.isclass(func):
+
+            def inner(self):
+                # pylint: disable=unused-argument
+                func_dict = {}
+                for name, method in func.__dict__.items():
+                    if name.startswith("_"):
+                        pass
+                    elif isinstance(method, tvm.ir.function.BaseFunc):
+                        func_dict[name] = method
+                    else:
+                        source_code = "@T.prim_func\n" + textwrap.dedent(inspect.getsource(method))
+                        prim_func = tvm.script.from_source(source_code)
+                        func_dict[name] = prim_func
+                return tvm.IRModule(func_dict)
+
         else:
 
             def inner(self):
@@ -1939,50 +1952,64 @@ def inner(self):
 
         return pytest.fixture(inner)
 
+    @classmethod
+    def _normalize_before(cls, func):
+        if hasattr(func, "_pytestfixturefunction"):
+            return func
+        else:
+            return cls._normalize_ir_module(func)
+
     @classmethod
     def _normalize_expected(cls, func):
         if hasattr(func, "_pytestfixturefunction"):
             return func
 
-        if isinstance(func, tvm.tir.PrimFunc) or (
-            inspect.isclass(func) and issubclass(func, Exception)
-        ):
+        elif inspect.isclass(func) and issubclass(func, Exception):
 
             def inner(self):
                 # pylint: disable=unused-argument
                 return func
 
-        elif cls._is_method(func):
-
-            def inner(self):
-                # pylint: disable=unused-argument
-                return func(self)
+            return pytest.fixture(inner)
 
         else:
-
-            def inner(self):
-                # pylint: disable=unused-argument
-                source_code = "@T.prim_func\n" + textwrap.dedent(inspect.getsource(func))
-                return tvm.script.from_source(source_code)
-
-        return pytest.fixture(inner)
+            return cls._normalize_ir_module(func)
 
     @classmethod
     def _normalize_transform(cls, transform):
+        def apply(module_transform):
+            def inner(obj):
+                if isinstance(obj, tvm.IRModule):
+                    return module_transform(obj)
+                elif isinstance(obj, tvm.tir.PrimFunc):
+                    mod = tvm.IRModule({"main": obj})
+                    mod = module_transform(mod)
+                    return mod["main"]
+                else:
+                    raise TypeError(f"Expected IRModule or PrimFunc, but received {type(obj)}")
+
+            return inner
+
         if hasattr(transform, "_pytestfixturefunction"):
-            return transform
 
-        if isinstance(transform, tvm.ir.transform.Pass):
+            if not hasattr(cls, "_transform_orig"):
+                cls._transform_orig = transform
+
+            def inner(self, _transform_orig):
+                # pylint: disable=unused-argument
+                return apply(_transform_orig)
+
+        elif isinstance(transform, tvm.ir.transform.Pass):
 
             def inner(self):
                 # pylint: disable=unused-argument
-                return transform
+                return apply(transform)
 
         elif cls._is_method(transform):
 
             def inner(self):
                 # pylint: disable=unused-argument
-                return transform(self)
+                return apply(transform(self))
 
         else:
 
@@ -2000,36 +2027,42 @@ def _is_method(func):
     def test_compare(self, before, expected, transform):
         """Unit test to compare the expected TIR PrimFunc to actual"""
 
-        before_mod = tvm.IRModule.from_expr(before)
+        def pprint(name, obj):
+            script = obj.script()
+            if isinstance(obj, tvm.IRModule):
+                return script.replace("class Module", f"class {name}")
+            else:
+                return script.replace("def func", f"def {name}")
 
         if inspect.isclass(expected) and issubclass(expected, Exception):
             with pytest.raises(expected):
-                after_mod = transform(before_mod)
+                after = transform(before)
 
                 # This portion through pytest.fail isn't strictly
                 # necessary, but gives a better error message that
                 # includes the before/after.
-                after = after_mod["main"]
-                script = tvm.IRModule({"after": after, "before": before}).script()
+                before_str = pprint("before", before)
+                after_str = pprint("after", after)
+
                 pytest.fail(
                     msg=(
                         f"Expected {expected.__name__} to be raised from transformation, "
-                        f"instead received TIR\n:{script}"
+                        f"instead received TIR\n:{before_str}\n{after_str}"
                     )
                 )
 
-        elif isinstance(expected, tvm.tir.PrimFunc):
-            after_mod = transform(before_mod)
-            after = after_mod["main"]
+        elif isinstance(expected, (tvm.tir.PrimFunc, tvm.ir.IRModule)):
+            after = transform(before)
 
             try:
                 tvm.ir.assert_structural_equal(after, expected)
             except ValueError as err:
-                script = tvm.IRModule(
-                    {"expected": expected, "after": after, "before": before}
-                ).script()
+                before_str = pprint("before", before)
+                after_str = pprint("after", after)
+                expected_str = pprint("expected", expected)
                 raise ValueError(
-                    f"TIR after transformation did not match expected:\n{script}"
+                    f"TIR after transformation did not match expected:\n"
+                    f"{before_str}\n{after_str}\n{expected_str}"
                 ) from err
 
         else:
@@ -2037,5 +2070,5 @@ def test_compare(self, before, expected, transform):
                 f"tvm.testing.CompareBeforeAfter requires the `expected` fixture "
                 f"to return either `Exception`, an `Exception` subclass, "
                 f"or an instance of `tvm.tir.PrimFunc`.  "
-                f"Instead, received {type(exception)}."
+                f"Instead, received {type(expected)}."
             )
diff --git a/tests/python/unittest/test_tvm_testing_before_after.py b/tests/python/unittest/test_tvm_testing_before_after.py
index 613d66ccdb2b..946493922ed5 100644
--- a/tests/python/unittest/test_tvm_testing_before_after.py
+++ b/tests/python/unittest/test_tvm_testing_before_after.py
@@ -18,7 +18,7 @@
 
 import tvm
 import tvm.testing
-from tvm.script import tir as T
+from tvm.script import tir as T, ir_module
 
 
 class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
@@ -79,5 +79,52 @@ def func(A: T.Buffer[n, "float32"]):
     expected = before
 
 
+class TestBeforeAfterIRModule(BaseBeforeAfter):
+    """The preferred form for writing TIR unit tests
+
+    All evaluation is done at test-time, with the minimal amount of
+    additional lines.  The `@tvm.testing.fixture`, `@ir_module`, and
+    `@T.prim_func` annotations are handled by
+    `tvm.testing.CompareBeforeAfter`.
+    """
+
+    class before:
+        def func_A(A: T.Buffer[16, "float32"]):
+            for i in T.serial(16):
+                A[i] = 0.0
+
+        def func_B(A: T.Buffer[16, "int32"]):
+            for i in T.serial(16):
+                A[i] = 42
+
+    expected = before
+
+
+class TestBeforeAfterIRModuleExplicitFixture(BaseBeforeAfter):
+    """Like TestBeforeAfterIRModule, but with an explicit fixture
+
+    If the IRModule depends on additional fixtures, this form can be
+    used.
+    """
+
+    @tvm.testing.fixture
+    def before(self):
+        @ir_module
+        class mod:
+            @T.prim_func
+            def func_A(A: T.Buffer[16, "float32"]):
+                for i in T.serial(16):
+                    A[i] = 0.0
+
+            @T.prim_func
+            def func_B(A: T.Buffer[16, "int32"]):
+                for i in T.serial(16):
+                    A[i] = 42
+
+        return mod
+
+    expected = before
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 20aa0cf2f77f89c1f6840bb486f3c7742c6612f7 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 7 Oct 2022 13:14:20 -0700
Subject: [PATCH 324/704] [ci] Re-enable roofline test (#13007)

After #12959, this re-enables the test disabled in #12955 to get a
backtrace next time it fails.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 tests/python/unittest/test_roofline.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/python/unittest/test_roofline.py b/tests/python/unittest/test_roofline.py
index 61e6e06aa8f3..e37f6e085bf6 100644
--- a/tests/python/unittest/test_roofline.py
+++ b/tests/python/unittest/test_roofline.py
@@ -35,7 +35,6 @@
 
 
 @tvm.testing.parametrize_targets("llvm", "cuda")
-@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/12955")
 def test_estimate_peak_flops(target, dev):
     server = rpc.Server(key="roofline_flops")
     remote = rpc.connect("127.0.0.1", server.port, key="roofline_flops")

From 189338c919c0876cf5909b99fb125ee2a7fbe2c6 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Sat, 8 Oct 2022 10:00:58 +0900
Subject: [PATCH 325/704] [MetaSchedule] Support RewriteLayout postproc on
 AllocateConst  (#12991)

* [Metaschedule] Support AllocateConst in RewriteLayout

* fix

* convert constant back to original before trace apply

* improved CreatePrimFunc change

* use IndexMap to transform constant in RemoveLayoutRewriteBlock

* wip

* wip

* add comments

* add test

* enable RewriteLayout on Hexagon

* lint

* improve RewriteLayout postproc

* fix compiler warning

* fixed CreatePrimFunc test

* black

* Fix after MS API change

* fixed auto tensorize test after MS API PR

* add missing Clone method to MultilevelTilingWideVector

* fixed hexagon dense MS test

* fixed the rest of Hexagon MS tests
---
 python/tvm/meta_schedule/relay_integration.py |  14 +-
 src/meta_schedule/postproc/postproc.cc        |   5 +-
 src/meta_schedule/postproc/rewrite_layout.cc  |  56 ++++--
 .../multi_level_tiling_wide_vector.cc         |   6 +
 src/relay/backend/te_compiler_cache.cc        | 147 ++++++++++++++-
 .../meta_schedule_layout_rewrite.cc           |  11 +-
 src/te/operation/create_primfunc.cc           |  24 ++-
 .../remove_weight_layout_rewrite_block.cc     | 172 ++++++++++++++++--
 .../test_hexagon/test_meta_schedule.py        |  88 +++++++--
 .../python/integration/test_auto_tensorize.py |   5 +-
 .../test_meta_schedule_relay_integration.py   |  78 ++++++++
 .../test_meta_schedule_vnni_integration.py    |   4 +-
 .../unittest/test_te_create_primfunc.py       |   2 +
 13 files changed, 539 insertions(+), 73 deletions(-)

diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py
index af992dd4bc8b..b3d8d582ba2b 100644
--- a/python/tvm/meta_schedule/relay_integration.py
+++ b/python/tvm/meta_schedule/relay_integration.py
@@ -93,8 +93,15 @@ def _normalize_params(
         if isinstance(param, np.ndarray):
             param = nd.array(param)
         relay_params[name] = param
-    if executor is not None:
+
+    if executor is None:
+        executor = relay.backend.Executor("graph")
+
+    if mod.get_attr("executor") is None:
         mod = mod.with_attr("executor", executor)
+    else:
+        executor = mod.get_attr("executor")
+
     pass_config = dict(pass_config)
     return mod, target, relay_params, pass_config, executor
 
@@ -384,8 +391,7 @@ def is_meta_schedule_dispatch_enabled() -> bool:
     enabled: bool
         Whether the meta schedule is enabled
     """
-    result = transform.PassContext.current().config.get(
+    return transform.PassContext.current().config.get(
         "relay.backend.use_meta_schedule_dispatch",
-        0,
+        False,
     )
-    return bool(result & 1)
diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc
index acc157e36e94..acd783b1860d 100644
--- a/src/meta_schedule/postproc/postproc.cc
+++ b/src/meta_schedule/postproc/postproc.cc
@@ -85,10 +85,9 @@ Array<Postproc> Postproc::DefaultCUDATensorCore() {
 Array<Postproc> Postproc::DefaultHexagon() {
   return Array<Postproc>{
       Postproc::DisallowDynamicLoop(),
-      Postproc::RewriteParallelVectorizeUnroll(),  //
+      Postproc::RewriteParallelVectorizeUnroll(),
       Postproc::RewriteReductionBlock(),
-      // TODO(masahi): Fix RewriteLayout for link-params=True case
-      // Postproc::RewriteLayout(),
+      Postproc::RewriteLayout(),
   };
 }
 
diff --git a/src/meta_schedule/postproc/rewrite_layout.cc b/src/meta_schedule/postproc/rewrite_layout.cc
index 881c5ca7516b..3aed6680e30d 100644
--- a/src/meta_schedule/postproc/rewrite_layout.cc
+++ b/src/meta_schedule/postproc/rewrite_layout.cc
@@ -16,6 +16,8 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include <unordered_set>
+
 #include "../utils.h"
 
 namespace tvm {
@@ -106,25 +108,53 @@ class BufferReadPosCollector : public StmtExprVisitor {
   BlockRealize cur_realize_;
 };
 
+class LayoutFreeBufferCollector : public StmtVisitor {
+ public:
+  void VisitStmt_(const BlockNode* block) final {
+    StmtVisitor::VisitStmt_(block);
+    if (Optional<ObjectRef> ann = block->annotations.Get("layout_free_placeholders")) {
+      for (Buffer buffer : Downcast<Array<Buffer>>(ann)) {
+        buffers.insert(buffer);
+      }
+    }
+  }
+
+  std::unordered_set<Buffer, ObjectPtrHash, ObjectPtrEqual> buffers;
+};
+
+Array<Buffer> CollectLayoutFreeBuffers(const PrimFuncNode* func) {
+  // Only rewrite PrimFuncs with attr "layout_free_buffers"
+  Array<Integer> layout_free_buffer_index =
+      func->GetAttr(attr::layout_free_buffers, Array<Integer>()).value();
+
+  Array<Buffer> layout_free_buffers;
+  for (const Integer& index : layout_free_buffer_index) {
+    ICHECK(static_cast<size_t>(index->value) < func->params.size());
+    const Var& param = func->params[index->value];
+    layout_free_buffers.push_back(func->buffer_map.at(param));
+  }
+
+  LayoutFreeBufferCollector collector;
+  collector(func->body);
+
+  for (auto buf : collector.buffers) {
+    layout_free_buffers.push_back(buf);
+  }
+  return layout_free_buffers;
+}
+
 bool RewriteLayout(const Schedule& sch) {
   std::vector<std::pair<StmtSRef, String>> results;
-  for (const auto& kv : sch->mod()->functions) {
-    const GlobalVar& g_var = kv.first;
+  for (const auto& [g_var, base_func] : sch->mod()->functions) {
     const String& func_name = g_var->name_hint;
-    const auto* prim_func = kv.second.as<PrimFuncNode>();
+    const auto* prim_func = base_func.as<PrimFuncNode>();
     // Only consider PrimFunc
     if (prim_func == nullptr) {
       continue;
     }
-    // Only rewrite PrimFuncs with attr "layout_free_buffers"
-    Array<Integer> layout_free_buffer_index =
-        prim_func->GetAttr(attr::layout_free_buffers, Array<Integer>()).value();
-
-    Array<Buffer> layout_free_buffers;
-    for (const Integer& index : layout_free_buffer_index) {
-      const Var& param = prim_func->params[index->value];
-      layout_free_buffers.push_back(prim_func->buffer_map.at(param));
-    }
+
+    Array<Buffer> layout_free_buffers = CollectLayoutFreeBuffers(prim_func);
+
     // Collect Buffer read positions
     BufferReadPosCollector collector(layout_free_buffers);
     collector(prim_func->body);
@@ -132,7 +162,7 @@ bool RewriteLayout(const Schedule& sch) {
     const auto& index_maps = collector.GetBufferIndexMap();
     // Check all buffers are collected
     if (locations.size() != layout_free_buffers.size() ||
-        index_maps.size() != layout_free_buffer_index.size()) {
+        index_maps.size() != layout_free_buffers.size()) {
       return false;
     }
 
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc
index f5ec009a9b28..d4c4a10fdd72 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc
@@ -42,6 +42,12 @@ class MultiLevelTilingWideVectorNode : public MultiLevelTilingNode {
   TVM_DECLARE_FINAL_OBJECT_INFO(MultiLevelTilingWideVectorNode, MultiLevelTilingNode);
 
  protected:
+  ScheduleRule Clone() const final {
+    ObjectPtr<MultiLevelTilingWideVectorNode> n =
+        make_object<MultiLevelTilingWideVectorNode>(*this);
+    return ScheduleRule(n);
+  }
+
   Array<tir::LoopRV> SplitLoop(const Schedule& sch, BlockRV block, LoopRV loop, int n_tiles) const;
 };
 
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index 27738615c7eb..ad99cb41aa8e 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -38,6 +38,7 @@
 #include <tvm/tir/function.h>
 #include <tvm/tir/index_map.h>
 #include <tvm/tir/schedule/schedule.h>
+#include <tvm/tir/stmt_functor.h>
 #include <tvm/tir/transform.h>
 #include <tvm/topi/tags.h>
 
@@ -304,6 +305,66 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
 
 int LowerToTECompute::const_index = 0;
 
+using namespace tvm::tir;
+
+class LayoutFreeConstantCollector : public StmtVisitor {
+ public:
+  Array<runtime::NDArray> constants;
+
+ private:
+  void VisitStmt_(const BlockNode* op) final {
+    StmtVisitor::VisitStmt_(op);
+    if (Optional<ObjectRef> ann = op->annotations.Get("layout_free_placeholders")) {
+      for (Buffer buffer : Downcast<Array<Buffer>>(ann)) {
+        layout_free_buffer_vars_.insert(buffer->data.get());
+      }
+    }
+  }
+
+  void VisitStmt_(const AllocateConstNode* op) final {
+    StmtVisitor::VisitStmt_(op);
+    if (auto it = layout_free_buffer_vars_.find(op->buffer_var.get());
+        it != layout_free_buffer_vars_.end()) {
+      constants.push_back(op->data.value());
+    }
+  }
+
+  std::unordered_set<const tir::VarNode*> layout_free_buffer_vars_;
+};
+
+using NDArrayMap =
+    std::unordered_map<runtime::NDArray, runtime::NDArray, ObjectPtrHash, ObjectPtrEqual>;
+
+// Replace constants in AllocateConst nodes according to the given mapping
+class AllocateConstReplaceConstant : public StmtExprMutator {
+ public:
+  explicit AllocateConstReplaceConstant(const NDArrayMap& constant_map)
+      : constant_map_(constant_map) {}
+
+  static PrimFunc Rewrite(PrimFunc f, const NDArrayMap& constant_map) {
+    AllocateConstReplaceConstant rewriter(constant_map);
+    PrimFuncNode* n = f.CopyOnWrite();
+    n->body = rewriter(std::move(n->body));
+    return f;
+  }
+
+ private:
+  Stmt VisitStmt_(const AllocateConstNode* op) final {
+    if (auto it = constant_map_.find(op->data.value()); it != constant_map_.end()) {
+      auto rewriten_constant = it->second;
+      Array<PrimExpr> rewritten_extents;
+      for (auto s : rewriten_constant.Shape()) {
+        rewritten_extents.push_back(PrimExpr(static_cast<int>(s)));
+      }
+      return AllocateConst(op->buffer_var, op->dtype, rewritten_extents, rewriten_constant,
+                           op->body, op->annotations, op->span);
+    }
+    return StmtExprMutator::VisitStmt_(op);
+  }
+
+  NDArrayMap constant_map_;
+};
+
 // Construct a schedule for a given Relay primitive function and target.
 class ScheduleBuilder : public ExprVisitor {
  public:
@@ -368,19 +429,99 @@ class ScheduleBuilder : public ExprVisitor {
           constants.push_back(const_node->data);
         }
         if (Optional<PrimFunc> f = tir_converter(te_args, constants)) {
+          IRModule query_mod = backend::PrimFuncToIRModule(f.value());
           if (Optional<TuningRecord> opt_record = database_.value()->QueryTuningRecord(
-                  /*mod=*/backend::PrimFuncToIRModule(f.value()),
+                  /*mod=*/query_mod,
                   /*target=*/target_,
                   /*workload_name=*/prim_fn_var->name_hint)) {
+            LayoutFreeConstantCollector const_collector;
+            const_collector(f.value()->body);
+
             static InstructionKind kind_transform_layout = InstructionKind::Get("TransformLayout");
             TuningRecord record = opt_record.value();
             for (const Instruction& inst : record->trace->insts) {
               if (inst->kind.same_as(kind_transform_layout)) {
                 ICHECK_EQ(inst->attrs.size(), 4);
-                MetaScheduleLayoutRewriter::LayoutQueuePush(Downcast<IndexMap>(inst->attrs[2]));
+                auto index_map = Downcast<IndexMap>(inst->attrs[2]);
+
+                if (!const_collector.constants.empty()) {
+                  // In this case, RewriteLayout is acting on an AllocateConst node.
+                  // After tuning, we reach this code path twice: First by
+                  // the Relay MetaScheduleLayoutRewrite pass, and next by the final
+                  // compilation (Relay to TE schedule lowering).
+                  //
+                  // Due to Relay MetaScheduleLayoutRewrite and FoldConstant passes,
+                  // the Relay subgraph for which we query the database during the
+                  // final compilation has its weight tensor transformed according to
+                  // the index map, determined during tuning. For example,
+                  //
+                  // fn (%p0: Tensor[(1, 56, 56, 64), float32]) {
+                  //   %0 = nn.conv2d(%p0, meta[relay.Constant][0],
+                  //                       /*ty=Tensor[(4, 2, 2, 3, 3, 32, 8), float32]*/, ...);
+                  //   add(%0, meta[relay.Constant][1])
+                  // }
+                  //
+                  // Note that the database does not have an entry corresponding to such subgraphs,
+                  // since an input subgraph to the tuning system always has its weight tensor in
+                  // the original layout, e.g.
+                  //
+                  // fn (%p0: Tensor[(1, 56, 56, 64), float32]) {
+                  //   %0 = nn.conv2d(%p0, meta[relay.Constant][0],
+                  //                       /*ty=Tensor[(3, 3, 64, 64), float32]*/, ...);
+                  //   add(%0, meta[relay.Constant][1])
+                  // }
+                  //
+                  // Thus, in both of the two cases where we reach this code path, we need careful
+                  // logic to make sure that (1) the database lookup during the final compilation
+                  // succeeds and (2) the application of a schedule trace is well defined.
+
+                  ICHECK(const_collector.constants.size() == 1)
+                      << "Only one layout-free constant is supported by RewriteLayout for now";
+                  auto constant = const_collector.constants[0];
+
+                  if (constant.Shape().size() == index_map->initial_indices.size()) {
+                    // This is the first case, reached during the MetaScheduleLayoutRewrite pass.
+                    //
+                    // A layout-free constant having the same rank as an input to the index map
+                    // is assumed to be transformed by this index map.
+                    // TODO(masahi): If there are multiple layout-free constants in one
+                    // TIR mod (e.g. conv2d -> conv2d fusion), this assumption does not hold.
+                    // We need to determine which constant the given index map acts on.
+                    //
+                    // We know that, during the final compilation, we will query the database
+                    // for a subgraph that the tuner has never seen. We workaround this problem
+                    // by adding a dummy entry to the database. The dummy entry is carefully
+                    // constructed so that the lookup during the final compilation would succeed.
+                    runtime::NDArray rewritten_constant = index_map->MapNDArray(constant);
+                    auto f_dummy = AllocateConstReplaceConstant::Rewrite(
+                        f.value(), {{constant, rewritten_constant}});
+                    auto workload_dummy =
+                        database_.value()->CommitWorkload(backend::PrimFuncToIRModule(f_dummy));
+                    TuningRecord rec_dummy(record->trace, workload_dummy, record->run_secs,
+                                           record->target, record->args_info);
+                    database_.value()->CommitTuningRecord(rec_dummy);
+                  } else {
+                    // The constant is already transformed, so this is the second case, reached
+                    // during the final compilation.
+                    //
+                    // The schedule trace is supposed to be applied to the weight in its original
+                    // layout. But as explained above, the Relay subgraph we get in this case
+                    // has its weight tensor transformed according to the corresponding index map.
+                    // So effectively, we undo the layout transformation on the weight to restore
+                    // the original PrimFunc that the schedule trace is supposed to act on.
+                    ICHECK(index_map->inverse_index_map);
+                    auto inverse_map = Downcast<IndexMap>(index_map->inverse_index_map.value());
+                    ICHECK(constant.Shape().size() == inverse_map->initial_indices.size());
+                    runtime::NDArray orig_constant = inverse_map->MapNDArray(constant);
+                    auto f_ = AllocateConstReplaceConstant::Rewrite(f.value(),
+                                                                    {{constant, orig_constant}});
+                    query_mod = backend::PrimFuncToIRModule(f_);
+                  }
+                }
+                MetaScheduleLayoutRewriter::LayoutQueuePush(index_map);
               }
             }
-            Schedule sch = Schedule::Traced(record->workload->mod, /*seed=*/-1, /*debug_mask=*/0,
+            Schedule sch = Schedule::Traced(query_mod, /*seed=*/-1, /*debug_mask=*/0,
                                             tir::ScheduleErrorRenderLevel::kDetail);
             record->trace->ApplyToSchedule(sch, /*remove_postproc=*/false);
             IRModule mod = sch->mod();
diff --git a/src/relay/transforms/meta_schedule_layout_rewrite.cc b/src/relay/transforms/meta_schedule_layout_rewrite.cc
index 8a70f224c611..3e7d7f7cb1a1 100644
--- a/src/relay/transforms/meta_schedule_layout_rewrite.cc
+++ b/src/relay/transforms/meta_schedule_layout_rewrite.cc
@@ -95,8 +95,15 @@ class MetaScheduleFuncMutator : public ExprMutator {
           ICHECK_EQ(call->args.size(), 2);
           tir::IndexMap index_map = layout_queue_.front();
           layout_queue_.pop_front();
-          Var var = Downcast<Var>(call->args[1]);
-          Array<PrimExpr> shape = Downcast<TensorType>(var->type_annotation)->shape;
+          Array<PrimExpr> shape;
+          if (call->args[1]->IsInstance<VarNode>()) {
+            Var var = Downcast<Var>(call->args[1]);
+            shape = Downcast<TensorType>(var->type_annotation)->shape;
+          } else if (const ConstantNode* cnst = call->args[1].as<ConstantNode>()) {
+            shape = cnst->tensor_type()->shape;
+          } else {
+            LOG(FATAL) << "Unexpected input " << call->args[1];
+          }
           Attrs attrs{nullptr};
           TVM_RELAY_LAYOUT_WITH_ORIGINAL_SHAPE(call->attrs, Conv2DAttrs, shape, attrs);
           TVM_RELAY_LAYOUT_WITH_ORIGINAL_SHAPE(call->attrs, Conv2DWinogradAttrs, shape, attrs);
diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index fb325684e65b..c222de81f2ad 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -108,14 +108,16 @@ class LayoutFreePlaceholdersNormalizer : public StmtMutator {
   Stmt VisitStmt_(const BlockNode* _block) final {
     Block block = Downcast<Block>(StmtMutator::VisitStmt_(_block));
     if (Optional<ObjectRef> ann = block->annotations.Get(topi_attr)) {
-      Array<Buffer> buffers = Downcast<Array<Buffer>>(ann);
-      for (Buffer buffer : buffers) {
+      Array<Buffer> new_buffers;
+      for (Buffer buffer : Downcast<Array<Buffer>>(ann)) {
         auto it = buffer2index_.find(buffer);
         if (it != buffer2index_.end()) {
           layout_free_buffer_indices_.insert(it->second);
+        } else {
+          new_buffers.push_back(buffer);
         }
       }
-      block.CopyOnWrite()->annotations.erase(topi_attr);
+      block.CopyOnWrite()->annotations.Set(topi_attr, new_buffers);
     }
     return std::move(block);
   }
@@ -473,10 +475,11 @@ PrimFunc GenerateAndCompletePrimFunc(const Array<te::Tensor>& arg_list,
   const auto* complete = runtime::Registry::Get("script.Complete");
   ICHECK(complete);
   func = (*complete)(std::move(func), info->root_alloc);
-  return LayoutFreePlaceholdersNormalizer().Process(std::move(func));
+  return func;
 }
 
-PrimFunc CreatePrimFunc(const Array<te::Tensor>& arg_list) {
+PrimFunc CreatePrimFuncWithConstants(const Array<te::Tensor>& arg_list,
+                                     const Array<runtime::NDArray>& constants) {
   // Infomations used in CreatePrimFunc and its sub-functions.
   CreateFuncInfo info(arg_list);
   // Root body stmts.
@@ -494,14 +497,15 @@ PrimFunc CreatePrimFunc(const Array<te::Tensor>& arg_list) {
   for (const te::Operation& op : order) {
     RewriteStageToBlock(op, &info, &root_stmts, &analyzer);
   }
+
   // Step 4. Create func and complete prim func.
-  return GenerateAndCompletePrimFunc(arg_list, root_stmts, &info);
+  auto func = GenerateAndCompletePrimFunc(arg_list, root_stmts, &info);
+  func = tir::BindParams(func, constants);
+  return LayoutFreePlaceholdersNormalizer().Process(std::move(func));
 }
 
-PrimFunc CreatePrimFuncWithConstants(const Array<te::Tensor>& arg_list,
-                                     const Array<runtime::NDArray>& constants) {
-  PrimFunc func = CreatePrimFunc(arg_list);
-  return tir::BindParams(func, constants);
+PrimFunc CreatePrimFunc(const Array<te::Tensor>& arg_list) {
+  return CreatePrimFuncWithConstants(arg_list, {});
 }
 
 TVM_REGISTER_GLOBAL("te.CreatePrimFunc").set_body_typed(CreatePrimFunc);
diff --git a/src/tir/transforms/remove_weight_layout_rewrite_block.cc b/src/tir/transforms/remove_weight_layout_rewrite_block.cc
index 8b2759579b72..86f6700f2289 100644
--- a/src/tir/transforms/remove_weight_layout_rewrite_block.cc
+++ b/src/tir/transforms/remove_weight_layout_rewrite_block.cc
@@ -22,32 +22,25 @@
  * \brief Remove weight layout rewrite block before benchmark
  */
 
+#include <tvm/tir/index_map.h>
 #include <tvm/tir/op.h>
 #include <tvm/tir/stmt_functor.h>
 #include <tvm/tir/transform.h>
 
+#include <unordered_set>
+
 namespace tvm {
 namespace tir {
 
-class WeightLayoutRewriteBlockRemover : public StmtMutator {
+class RemoveLayoutRewriteBlock : public StmtMutator {
  public:
-  static PrimFunc Remove(PrimFunc f) {
-    WeightLayoutRewriteBlockRemover remover;
+  static std::tuple<PrimFunc, Map<Buffer, Buffer>, std::unordered_map<const VarNode*, IndexMap>>
+  Rewrite(PrimFunc f) {
+    RemoveLayoutRewriteBlock rewriter;
+
     PrimFuncNode* n = f.CopyOnWrite();
-    n->body = remover(std::move(n->body));
-    Map<tir::Var, Buffer> buffer_map;
-    for (const auto& kv : f->buffer_map) {
-      Var param = kv.first;
-      Buffer buffer = kv.second;
-      auto it = remover.buf_map_.find(buffer);
-      if (it != remover.buf_map_.end()) {
-        buffer_map.Set(param, (*it).second);
-      } else {
-        buffer_map.Set(param, buffer);
-      }
-    }
-    n->buffer_map = std::move(buffer_map);
-    return f;
+    n->body = rewriter(std::move(n->body));
+    return std::make_tuple(f, rewriter.buf_map_, rewriter.buffer_var_to_index_map_);
   }
 
  private:
@@ -94,6 +87,14 @@ class WeightLayoutRewriteBlockRemover : public StmtMutator {
     n->body = std::move(Evaluate(0));
     n->reads = {};
     n->writes = {};
+
+    Array<Var> load_indices;
+    for (auto ind : load->indices) {
+      ICHECK(ind->IsInstance<VarNode>());
+      load_indices.push_back(Downcast<Var>(ind));
+    }
+    buffer_var_to_index_map_[load->buffer->data.get()] = IndexMap(load_indices, store->indices);
+
     return Stmt(n);
   }
 
@@ -102,7 +103,144 @@ class WeightLayoutRewriteBlockRemover : public StmtMutator {
   Map<Buffer, Buffer> buf_map_;
   /*! \brief The buffer map from original layout buffer to rewritten buffer */
   std::unordered_set<Buffer, ObjectPtrHash, ObjectPtrEqual> rewritten_buffers_;
+  /*! \brief Maps a buffer load to an index map associated with the load / store
+    in a layout rewrite block. */
+  std::unordered_map<const VarNode*, IndexMap> buffer_var_to_index_map_;
 };
+
+// After RemoveLayoutRewriteBlock, the body of a compute update block references a
+// non-existant buffer. For example, fused_constant_2_global below is originally a
+// cache_read buffer, whose allocation is removed by RemoveLayoutRewriteBlock:
+//
+// constant fused_constant_2[float32 * 3 * 3 * 64 * 64]
+// conv2d_nhwc[nn, yy, xx, ff] += ... * fused_constant_2_global[ry,
+//                                                              floordiv(rc, 32),
+//                                                              floordiv(ff, 16),
+//                                                              rx,
+//                                                              floormod(rc, 32),
+//                                                              floormod(ff, 16)]))
+//
+// When cache_read is reading from AllocateConstant, we need to replace the reference
+// to fused_constant_2_global with the corresponding transformed AllocateConstant.
+// To do that, we manually rewrite the original constant using the associated index map,
+// and let the body of the compute block to load from the rewritten constant.
+//
+// After this transformation, the example above looks like:
+//
+// constant fused_constant_2[float32 * 3 * 2 * 4 * 3 * 32 * 16]
+// conv2d_nhwc[nn, yy, xx, ff] += ... * fused_constant_2[ry,
+//                                                       floordiv(rc, 32),
+//                                                       floordiv(ff, 16),
+//                                                       rx,
+//                                                       floormod(rc, 32),
+//                                                       floormod(ff, 16)]))
+
+using BufferVarMap = std::unordered_map<const tir::VarNode*, const tir::VarNode*>;
+
+class AllocateConstRewrite : public StmtExprMutator {
+ public:
+  AllocateConstRewrite(const BufferVarMap& buffer_var_map,
+                       const std::unordered_map<const VarNode*, IndexMap>& buffer_var_to_index_map)
+      : buffer_var_map_(buffer_var_map), buffer_var_to_index_map_(buffer_var_to_index_map) {}
+
+ private:
+  Stmt VisitStmt_(const BlockNode* op) final {
+    Block block = Downcast<Block>(StmtMutator::VisitStmt_(op));
+    auto n = CopyOnWrite(block.get());
+    Array<BufferRegion> new_reads;
+    for (auto read_region : op->reads) {
+      if (auto it = new_load_buf_.find(read_region->buffer->data.get());
+          it != new_load_buf_.end()) {
+        new_reads.push_back(BufferRegion(it->second, read_region->region));
+      } else {
+        new_reads.push_back(read_region);
+      }
+    }
+    n->reads = new_reads;
+    return Stmt(n);
+  }
+
+  Stmt VisitStmt_(const AllocateConstNode* alloc) final {
+    if (auto it = buffer_var_to_index_map_.find(alloc->buffer_var.get());
+        it != buffer_var_to_index_map_.end()) {
+      auto new_body = StmtMutator::VisitStmt(alloc->body);
+      auto rewritten_ndarray = it->second->MapNDArray(alloc->data.value());
+      Array<PrimExpr> rewritten_extents;
+      for (auto s : rewritten_ndarray.Shape()) {
+        rewritten_extents.push_back(PrimExpr(static_cast<int>(s)));
+      }
+      return AllocateConst(alloc->buffer_var, alloc->dtype, rewritten_extents, rewritten_ndarray,
+                           new_body, alloc->annotations, alloc->span);
+    }
+    return StmtMutator::VisitStmt_(alloc);
+  }
+
+  PrimExpr VisitExpr_(const BufferLoadNode* op) final {
+    if (auto it = buffer_var_map_.find(op->buffer->data.get()); it != buffer_var_map_.end()) {
+      auto new_buffer =
+          Buffer(GetRef<Var>(it->second), op->buffer->dtype, op->buffer->shape, op->buffer->strides,
+                 op->buffer->elem_offset, it->second->name_hint, op->buffer->data_alignment,
+                 op->buffer->offset_factor, op->buffer->buffer_type);
+      new_load_buf_[op->buffer->data.get()] = new_buffer;
+      return BufferLoad(new_buffer, op->indices);
+    }
+    return ExprMutator::VisitExpr_(op);
+  }
+
+  /*! \brief Maps a buffer store to a load in a layout rewrite block */
+  BufferVarMap buffer_var_map_;
+  /*! \brief Maps a buffer load to an index map associated with the load / store
+    in a layout rewrite block. */
+  std::unordered_map<const VarNode*, IndexMap> buffer_var_to_index_map_;
+  /*! \brief Maps load buffer variables to newly created buffers */
+  std::unordered_map<const VarNode*, Buffer> new_load_buf_;
+};
+
+class CollectAllocateConstBufferVars : public StmtVisitor {
+ public:
+  void VisitStmt_(const AllocateConstNode* alloc) final {
+    StmtVisitor::VisitStmt_(alloc);
+    constant_buf_var.insert(alloc->buffer_var.get());
+  }
+
+  std::unordered_set<const VarNode*> constant_buf_var;
+};
+
+class WeightLayoutRewriteBlockRemover : public StmtMutator {
+ public:
+  static PrimFunc Remove(PrimFunc f) {
+    CollectAllocateConstBufferVars collector;
+    collector(f->body);
+
+    auto [f_, buf_map, buffer_var_to_index_map] = RemoveLayoutRewriteBlock().Rewrite(f);
+
+    BufferVarMap buffer_var_map;
+    for (const auto& [load_buf, store_buf] : buf_map) {
+      if (collector.constant_buf_var.find(load_buf->data.get()) !=
+          collector.constant_buf_var.end()) {
+        buffer_var_map[store_buf->data.get()] = load_buf->data.get();
+      }
+    }
+
+    PrimFuncNode* n = f_.CopyOnWrite();
+
+    AllocateConstRewrite rewriter(buffer_var_map, buffer_var_to_index_map);
+    n->body = rewriter(std::move(n->body));
+
+    Map<tir::Var, Buffer> buffer_map;
+    for (const auto& [param, buffer] : f_->buffer_map) {
+      auto it = buf_map.find(buffer);
+      if (it != buf_map.end()) {
+        buffer_map.Set(param, (*it).second);
+      } else {
+        buffer_map.Set(param, buffer);
+      }
+    }
+    n->buffer_map = std::move(buffer_map);
+    return f_;
+  }
+};
+
 namespace transform {
 
 Pass RemoveWeightLayoutRewriteBlock() {
diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py
index e8caa9f04e87..6e12f4b205d1 100644
--- a/tests/python/contrib/test_hexagon/test_meta_schedule.py
+++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py
@@ -205,7 +205,7 @@ def schedule_dense_for_tune(sch):
                     schedule_dense_for_tune,
                     sch_rules=[],
                     postprocs=[],
-                    mutator_probs=[],
+                    mutator_probs={},
                 ),
                 strategy="replay-trace",
                 builder=get_hexagon_local_builder(),
@@ -307,6 +307,7 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
         postproc.RewriteTensorize(vectorize_init_loop=True),
     ]
 
+    # Make this to False to compile and run the best tuned schedule
     if True:
         with tempfile.TemporaryDirectory() as work_dir:
             target = get_hexagon_target("v68")
@@ -315,10 +316,13 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
                 target=target,
                 max_trials_global=8,
                 num_trials_per_iter=8,
-                max_trials_per_task=8,
                 work_dir=work_dir,
-                sch_rules=lambda: sch_rules,
-                postprocs=lambda: postprocs,
+                space=ms.space_generator.PostOrderApply(
+                    f_block_filter=None,
+                    sch_rules=sch_rules,
+                    postprocs=postprocs,
+                    mutator_probs={},
+                ),
                 builder=get_hexagon_local_builder(),
                 runner=get_hexagon_rpc_runner(hexagon_launcher, number=10),
             )
@@ -368,20 +372,70 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher):
     bias_np = np.random.randn(*bias_shape).astype("float16")
     params = {"weight": weight_np, "bias": bias_np}
 
-    target_llvm = tvm.target.Target("llvm")
+    ref = (
+        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
+        .evaluate()(*[data_np, weight_np, bias_np])
+        .numpy()
+    )
 
-    with tvm.transform.PassContext(
-        opt_level=3,
-    ):
-        lib_ref = relay.build(mod, target=target_llvm, params=params)
+    with tempfile.TemporaryDirectory() as work_dir:
+        target = get_hexagon_target("v69")
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            params=params,
+            target=target,
+            max_trials_global=8,
+            strategy="replay-trace",
+            work_dir=work_dir,
+            builder=get_hexagon_local_builder(),
+            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
+        )
+        lib = ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            params=params,
+            target=target,
+        )
 
-    rt_mod_ref = tvm.contrib.graph_executor.GraphModule(lib_ref["default"](tvm.cpu(0)))
+    with hexagon_launcher.start_session() as session:
+        rt_mod = session.get_executor_from_factory(lib)
+
+        rt_mod.set_input("data", data_np)
+
+        rt_mod.run()
+
+        out = rt_mod.get_output(0).numpy()
+        # Fairly loose check since fp16 results between x86 and Hexagon have
+        # non-trivial difference.
+        assert np.mean(np.abs(ref - out)) < 0.5
 
-    rt_mod_ref.set_input("data", data_np)
 
-    rt_mod_ref.run()
+@tvm.testing.requires_hexagon
+def test_dense_relay_auto_schedule(hexagon_launcher):
+    """
+    This is for testing RewriteLayout postproc. Without this postproc,
+    dense on Hexagon is extremely slow.
+    """
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v69")
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+
+    data_shape = (128, 128)
+    weight_shape = (128, 128)
+
+    data = relay.var("data", shape=data_shape, dtype="float16")
+    weight = relay.var("weight", shape=weight_shape, dtype="float16")
+    dense = relay.nn.dense(data, weight)
+    mod = tvm.IRModule.from_expr(dense)
+    mod = mod.with_attr("executor", relay.backend.Executor("graph", {"link-params": True}))
+
+    weight_np = np.random.randn(*weight_shape).astype("float32")
 
-    ref = rt_mod_ref.get_output(0).numpy()
+    data_np = np.random.randn(*data_shape).astype("float32")
+    params = {"weight": weight_np}
+    ref = np.dot(data_np, weight_np.transpose())
 
     with tempfile.TemporaryDirectory() as work_dir:
         target = get_hexagon_target("v69")
@@ -390,9 +444,7 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher):
             params=params,
             target=target,
             max_trials_global=8,
-            max_trials_per_task=8,
-            num_trials_per_iter=8,
-            strategy=ms.search_strategy.ReplayTrace(),
+            strategy="replay-trace",
             work_dir=work_dir,
             builder=get_hexagon_local_builder(),
             runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
@@ -412,4 +464,6 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher):
         rt_mod.run()
 
         out = rt_mod.get_output(0).numpy()
-        print(np.max(np.abs(ref - out)), np.mean(np.abs(ref - out)))
+        # Fairly loose check since fp16 results between x86 and Hexagon have
+        # non-trivial difference.
+        assert np.mean(np.abs(ref - out)) < 0.1
diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py
index 3fdf027a490d..13b7b50b7afe 100644
--- a/tests/python/integration/test_auto_tensorize.py
+++ b/tests/python/integration/test_auto_tensorize.py
@@ -148,7 +148,7 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos
     tune_tasks = list(
         filter(
             lambda task: op_name in task.task_name,
-            ms.relay_integration.extracted_task_from_relay(relay_mod, target, params),
+            ms.relay_integration.extract_tasks(relay_mod, target, params),
         )
     )
     with tempfile.TemporaryDirectory() as work_dir:
@@ -164,7 +164,7 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos
             tasks=tasks,
             task_weights=task_weights,
             work_dir=work_dir,
-            max_trials_global=20000,
+            max_trials_global=32,
         )
     with database, tvm.transform.PassContext(
         opt_level=3,
@@ -251,6 +251,7 @@ def _test_bert_int8(relay_mod, params, input_info, target, sch_rules, postprocs)
             tasks=tasks,
             task_weights=task_weights,
             work_dir=work_dir,
+            max_trials_per_task=32,
             max_trials_global=20000,
         )
     with database, tvm.transform.PassContext(
diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py
index cf61df0c6ba8..4047f44ac365 100644
--- a/tests/python/unittest/test_meta_schedule_relay_integration.py
+++ b/tests/python/unittest/test_meta_schedule_relay_integration.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Integration test for MetaSchedule"""
+import tempfile
 import numpy as np
 import pytest
 import tvm
@@ -489,5 +490,82 @@ def get_output(data, lib):
     assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
 
 
+def test_rewrite_layout_link_params():
+    I, O, H, W = 64, 64, 56, 56
+    kH = kW = 3
+
+    strides = (1, 1)
+    padding = (1, 1)
+
+    data_shape = (1, H, W, I)
+    w_shape = (kH, kW, I, O)
+    bias_shape = (1, 1, 1, O)
+
+    data = relay.var("data", shape=data_shape, dtype="float32")
+    weight = relay.var("weight1", shape=w_shape, dtype="float32")
+    bias = relay.var("bias", shape=bias_shape, dtype="float32")
+
+    conv = relay.nn.conv2d(
+        data=data,
+        weight=weight,
+        kernel_size=(kH, kW),
+        channels=O,
+        padding=padding,
+        strides=strides,
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype="float32",
+    )
+
+    mod = tvm.IRModule.from_expr(conv + bias)
+
+    weight_np = np.random.randn(*w_shape).astype("float32")
+    bias_np = np.random.randn(*bias_shape).astype("float32")
+
+    params = {"weight1": weight_np, "bias": bias_np}
+
+    data_np = np.random.randn(*data_shape).astype("float32")
+
+    ref = (
+        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
+        .evaluate()(*[data_np, weight_np, bias_np])
+        .numpy()
+    )
+
+    link_params = True
+
+    target = "llvm --num-cores=4"
+
+    executor = relay.backend.Executor("graph", {"link-params": link_params})
+    mod = mod.with_attr("executor", executor)
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            target=target,
+            params=params,
+            work_dir=work_dir,
+            max_trials_global=4,
+            strategy="replay-trace",
+        )
+
+        lib = ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            target=target,
+            params=params,
+        )
+
+    dev = tvm.device(target, 0)
+    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+
+    runtime.set_input("data", data_np)
+    runtime.run()
+
+    out = runtime.get_output(0).numpy()
+
+    np.testing.assert_allclose(ref, out, rtol=1e-4, atol=1e-4)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_meta_schedule_vnni_integration.py b/tests/python/unittest/test_meta_schedule_vnni_integration.py
index 2cd609863056..710ea96d9f5c 100644
--- a/tests/python/unittest/test_meta_schedule_vnni_integration.py
+++ b/tests/python/unittest/test_meta_schedule_vnni_integration.py
@@ -133,7 +133,7 @@ def f_check(lib, dev):
     return relay_mod, params, f_check
 
 
-@pytest.mark.skip("Requires cascadelake")
+@tvm.testing.requires_cascadelake
 def test_vnni_schedule_fn_database():
     m, n, k = 1024, 1024, 1024
     target = tvm.target.Target("llvm -mcpu=cascadelake -num-cores 4")
@@ -164,7 +164,7 @@ def test_vnni_schedule_fn_database():
     f_check(lib, dev)
 
 
-@pytest.mark.skip("Requires cascadelake")
+@tvm.testing.requires_cascadelake
 def test_vnni_schedule_fn_tune():
     # pylint: disable=W0105
     """
diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py
index d3f444ec081f..d10fd2d23d47 100644
--- a/tests/python/unittest/test_te_create_primfunc.py
+++ b/tests/python/unittest/test_te_create_primfunc.py
@@ -382,12 +382,14 @@ def expected_layout_attr(
     for i0, i1, i2 in T.grid(128, 128, 128):
         with T.block("C"):
             x, y, k = T.axis.remap("SSR", [i0, i1, i2])
+            T.block_attr({"layout_free_placeholders": []})
             with T.init():
                 C[x, y] = T.float32(0)
             C[x, y] = C[x, y] + A[x, k] * B[y, k]
     for i0, i1 in T.grid(128, 128):
         with T.block("D"):
             x, y = T.axis.remap("SS", [i0, i1])
+            T.block_attr({"layout_free_placeholders": [C]})
             D[x, y] = C[x, y] + T.float32(1)
 
 
From d92d47ad7ffcf47c7355cd3f83590b6e241b2420 Mon Sep 17 00:00:00 2001
From: wufeng15226 <47237320+wufeng15226@users.noreply.github.com>
Date: Sat, 8 Oct 2022 13:22:40 +0800
Subject: [PATCH 326/704] [docs] fixed codebase_walkthrough document bug
 (#13008)

When I was studying the "TVM Codebase Walkthrough by Example" document, I found that the code didn't work, so I fixed it.

Bind the iteration axis to threads in the GPU.
---
 docs/dev/tutorial/codebase_walkthrough.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/dev/tutorial/codebase_walkthrough.rst b/docs/dev/tutorial/codebase_walkthrough.rst
index efc8b32832c0..726e253057d0 100644
--- a/docs/dev/tutorial/codebase_walkthrough.rst
+++ b/docs/dev/tutorial/codebase_walkthrough.rst
@@ -93,11 +93,14 @@ This function is mapped to the C++ function in ``include/tvm/schedule.h``.
 
 ``Schedule`` and ``Stage`` are defined in ``tvm/python/te/schedule.py``, ``include/tvm/te/schedule.h``, and ``src/te/schedule/schedule_ops.cc``.
 
-To keep it simple, we call ``tvm.build(...)`` on the default schedule created by ``create_schedule()`` function above.
+To keep it simple, we call ``tvm.build(...)`` on the default schedule created by ``create_schedule()`` function above, and we must add necessary thread bindings to make it runnable on GPU.
 
 ::
 
    target = "cuda"
+   bx, tx = s[C].split(C.op.axis[0], factor=64)
+   s[C].bind(bx, tvm.te.thread_axis("blockIdx.x"))
+   s[C].bind(tx, tvm.te.thread_axis("threadIdx.x"))
    fadd = tvm.build(s, [A, B, C], target)
 
 ``tvm.build()``, defined in ``python/tvm/driver/build_module.py``, takes a schedule, input and output ``Tensor``, and a target, and returns a :py:class:`tvm.runtime.Module` object. A :py:class:`tvm.runtime.Module` object contains a compiled function which can be invoked with function call syntax.

From f21b5ca4aedd1cc814f43076d3a5a4bf2c7b516a Mon Sep 17 00:00:00 2001
From: multiverstack <39256082+multiverstack-intellif@users.noreply.github.com>
Date: Mon, 10 Oct 2022 17:33:05 +0800
Subject: [PATCH 327/704] [TIR][Schedule] Add cache_inplace primitive to cache
 opaque buffer (#12939)

---
 include/tvm/tir/schedule/schedule.h           |  10 +
 python/tvm/tir/schedule/schedule.py           |  89 +++++++
 src/tir/schedule/concrete_schedule.cc         |  13 +
 src/tir/schedule/concrete_schedule.h          |   2 +
 src/tir/schedule/primitive.h                  |  14 +-
 .../schedule/primitive/cache_read_write.cc    | 242 +++++++++++++++++-
 src/tir/schedule/schedule.cc                  |   2 +
 src/tir/schedule/traced_schedule.cc           |  16 ++
 src/tir/schedule/traced_schedule.h            |   2 +
 .../test_tir_schedule_cache_read_write.py     |  58 +++++
 10 files changed, 440 insertions(+), 8 deletions(-)

diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h
index 049f063240df..9ec2841ebd5e 100644
--- a/include/tvm/tir/schedule/schedule.h
+++ b/include/tvm/tir/schedule/schedule.h
@@ -403,6 +403,16 @@ class ScheduleNode : public runtime::Object {
    */
   virtual BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index,
                              const String& storage_scope) = 0;
+  /*!
+   * \brief Create 2 blocks that read&write a buffer region into a read/write cache.
+   * It requires the the target block both read & write the target buffer.
+   * \param block_rv The target block operates on the target buffer.
+   * \param read_buffer_index The index of the buffer in block's read region.
+   * \param storage_scope The target storage scope
+   * \return The cache stage blocks, cache read block together with cache write block.
+   */
+  virtual Array<BlockRV> CacheInplace(const BlockRV& block_rv, int read_buffer_index,
+                                      const String& storage_scope) = 0;
   /*!
    * \brief Create a block that read/write a buffer region into a read/write cache with reindexing.
    * The layout of the cache will be the same as by the iterators of the block that reads/writes the
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index 2268196b5898..4814271f4023 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -1189,6 +1189,95 @@ def after_cache_write(a: T.handle, b: T.handle) -> None:
             self, block, write_buffer_index, storage_scope
         )
 
+    @type_checked
+    def cache_inplace(
+        self,
+        block: Union[BlockRV, str],
+        read_buffer_index: Union[int, str, Buffer],
+        storage_scope: str,
+    ) -> List[BlockRV]:
+        """Create blocks that reads & write a buffer region into a cache block.
+        It requires the the target block both read & write the target buffer.
+        Mainly for inplace operation.
+
+        Parameters
+        ----------
+        block : Union[BlockRV, str]
+            The target block operates on the target buffer.
+
+        read_buffer_index: int
+            The index of the buffer in block's read region, the unique
+            name of a read buffer in the block, or a Buffer object
+            that is within the blocks read region.
+
+        storage_scope: str
+            The target storage scope.
+
+
+        Returns
+        -------
+        cached_blocks : List[BlockRV]
+            The blocks of the cache stage, read cache first, write cache second
+
+        Examples
+        --------
+        Before cache_inplace, in TensorIR, the IR is:
+
+        .. code-block:: python
+
+            @T.prim_func
+            def before_cache_inplace(data_io: T.Buffer[(64), "int32"]):
+                for i0 in T.serial(1):
+                    with T.block("A"):
+                        T.reads(data_io[:64])
+                        T.writes(data_io[:64])
+                        T.evaluate(T.call_extern("call_impl", data_io.data, dtype=""))
+
+        Create the schedule and cache_inplace:
+
+        .. code-block:: python
+
+            sch = tir.Schedule(before_cache_inplace)
+            block_a = sch.get_block("A")
+            sch.cache_inplace(block_a, 0, "local")
+            print(sch.mod["main"].script())
+
+        After applying cache_inplace, the IR becomes:
+
+        .. code-block:: python
+
+            @T.prim_func
+            def cache_inplace(data_io: T.Buffer[64, "int32"]) -> None:
+                data_io_local = T.alloc_buffer([64], dtype="int32", scope="local")
+                for i0 in T.serial(1):
+                    for ax0 in T.serial(64):
+                        with T.block("data_io_local"):
+                            v0 = T.axis.spatial(64, ax0)
+                            T.reads(data_io[v0])
+                            T.writes(data_io_local[v0])
+                            data_io_local[v0] = data_io[v0]
+                    with T.block("A"):
+                        T.reads(data_io_local[0 : 64])
+                        T.writes(data_io_local[0 : 64])
+                        T.evaluate(T.call_extern("call_impl", data_io_local.data, dtype=""))
+                    for ax0 in T.serial(64):
+                        with T.block("data_io_local"):
+                            v0 = T.axis.spatial(64, ax0)
+                            T.reads(data_io_local[v0])
+                            T.writes(data_io[v0])
+                            data_io[v0] = data_io_local[v0]
+
+        """
+        block = self._normalize_block_arg(block)
+
+        if not isinstance(read_buffer_index, int):
+            _, read_buffer_index, _ = self._normalize_buffer_arg(
+                block, read_buffer_index, required_buffer_type="read"
+            )
+        return _ffi_api.ScheduleCacheInplace(  # type: ignore # pylint: disable=no-member
+            self, block, read_buffer_index, storage_scope
+        )
+
     @type_checked
     def reindex(
         self,
diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc
index 1d9272cf2dd5..3960087cf745 100644
--- a/src/tir/schedule/concrete_schedule.cc
+++ b/src/tir/schedule/concrete_schedule.cc
@@ -561,6 +561,19 @@ BlockRV ConcreteScheduleNode::CacheWrite(const BlockRV& block_rv, int write_buff
   return CreateRV<BlockRV>(result);
 }
 
+Array<BlockRV> ConcreteScheduleNode::CacheInplace(const BlockRV& block_rv, int write_buffer_index,
+                                                  const String& storage_scope) {
+  Array<StmtSRef> results;
+  TVM_TIR_SCHEDULE_BEGIN();
+  results = tir::CacheInplace(state_, this->GetSRef(block_rv), write_buffer_index, storage_scope);
+  TVM_TIR_SCHEDULE_END("cache-buffer", this->error_render_level_);
+  this->state_->DebugVerify();
+  Array<BlockRV> return_blocks;
+  return_blocks.push_back(CreateRV<BlockRV>(results[0]));
+  return_blocks.push_back(CreateRV<BlockRV>(results[1]));
+  return return_blocks;
+}
+
 BlockRV ConcreteScheduleNode::ReIndex(const BlockRV& block_rv, int buffer_index,
                                       BufferIndexType buffer_index_type) {
   StmtSRef result{nullptr};
diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h
index 59a9e3752859..bfdc082d4ce6 100644
--- a/src/tir/schedule/concrete_schedule.h
+++ b/src/tir/schedule/concrete_schedule.h
@@ -116,6 +116,8 @@ class ConcreteScheduleNode : public ScheduleNode {
                     const Array<BlockRV> consumer_blocks = {}) override;
   BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index,
                      const String& storage_scope) override;
+  Array<BlockRV> CacheInplace(const BlockRV& block_rv, int read_buffer_index,
+                              const String& storage_scope) override;
   BlockRV ReIndex(const BlockRV& block_rv, int buffer_index,
                   BufferIndexType buffer_index_type) override;
   /******** Schedule: Compute location ********/
diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h
index 21388ff132ae..88331fb5b9d3 100644
--- a/src/tir/schedule/primitive.h
+++ b/src/tir/schedule/primitive.h
@@ -267,6 +267,18 @@ TVM_DLL StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int r
  */
 TVM_DLL StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_buffer_index,
                             const String& storage_scope);
+/*!
+ *!
+ * \brief Create 2 blocks that read&write a buffer region into a read/write cache.
+ * It requires the the target block both read & write the target buffer.
+ * \param self The state of the schedule
+ * \param block_sref The target block operates on the target buffer.
+ * \param read_buffer_index The index of the buffer in block's read region.
+ * \param storage_scope The target storage scope
+ * \return The cache stage blocks, cache read block together with cache write block.
+ */
+TVM_DLL Array<StmtSRef> CacheInplace(ScheduleState self, const StmtSRef& block_sref,
+                                     int read_buffer_index, const String& storage_scope);
 /*!
  *!
  * \brief Create a block that read/write a buffer region into a read/write cache with reindexing.
@@ -275,7 +287,7 @@ TVM_DLL StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int
  * 1) There is only one block who reads/writes the target buffer
  * 2) There is only one buffer load/store of this buffer in the block
  * \param self The state of the schedule
- * \param block_rv The block operates on the target buffer.
+ * \param block_sref The block operates on the target buffer.
  * \param buffer_index The index of the buffer in block's read or write region.
  * \param buffer_index_type The type of the buffer index, kRead or kWrite.
  * \return The reindex stage block.
diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index e03b1058d4ef..58d622268c78 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -66,7 +66,7 @@ struct CacheStageInfo {
   /*! \brief The buffer to be written. */
   Buffer write_buffer;
   /*! \brief The buffer allocation to be inserted into the block signature. */
-  Buffer alloc;
+  Optional<Buffer> alloc;
   /*! \brief The AST node whose body is where the cache stage should be inserted. */
   StmtSRef loc_sref;
   /*! \brief The index to insert the cache_read/cache_write stage. */
@@ -494,6 +494,92 @@ class CacheLocDetector : public StmtVisitor {
   int loc_pos_{-1};
 };
 
+/*! \brief Detect the insertion position of the new cache stage */
+class CacheInplaceLocDetector : public StmtVisitor {
+ public:
+  /*!
+   * \brief Detect the insertion position of the cache stage, and write the position into the
+   * CacheStageInfo \param self The state of the schedule \param block_sref The sref of the unique
+   * block of the buffer being applied cache_inplace \param scope_sref The sref
+   * of the scope block of the cached block \param info The cache stage info.
+   */
+  static void Detect(const ScheduleState& self, const StmtSRef& block_sref,
+                     const StmtSRef& scope_sref, CacheStageInfo* info) {
+    CacheInplaceLocDetector detector(self, block_sref, scope_sref);
+    detector(GetRef<Stmt>(scope_sref->stmt));
+    info->loc_sref = detector.loc_sref_;
+    info->loc_pos = detector.loc_pos_;
+  }
+
+ private:
+  /*!
+   * \brief Constructor
+   * \param self The state of the schedule
+   * \param block_sref The sref of the unique writer block of the buffer being applied cache_inplace
+   * \param scope_sref The sref of the scope block of the cached block
+   */
+  CacheInplaceLocDetector(const ScheduleState self, const StmtSRef& block_sref,
+                          const StmtSRef& scope_sref)
+      : self_(self), block_sref_(block_sref), scope_sref_(scope_sref) {}
+
+  void VisitStmt_(const SeqStmtNode* seq_stmt) final {
+    for (size_t i = 0; i < seq_stmt->size(); ++i) {
+      if (loc_pos_ != -1) {
+        break;
+      }
+      VisitStmt(seq_stmt->seq[i]);
+      // `pos` can be assigned only once when we visited `block_sref`
+      if (visited_block_ && loc_pos_ == -1) {
+        // The offset of insert position from the block
+        loc_pos_ = i;
+        return;
+      }
+    }
+  }
+
+  void VisitStmt_(const BlockNode* block) final {
+    // Only visit the current scope under buffer writer's parent block
+    if (block == scope_sref_->stmt) {
+      // The block vistied is the current parent scope
+      StmtVisitor::VisitStmt_(block);
+      // Handling cases when insert outside any loop
+      if (visited_block_ && !loc_sref_.defined()) {
+        loc_sref_ = self_->stmt2ref.at(block);
+        // Handling for input buffer
+        if (loc_pos_ == -1) {
+          loc_pos_ = 0;
+        }
+      }
+    } else if (block_sref_->stmt == block) {
+      visited_block_ = true;
+    }
+  }
+
+  void VisitStmt_(const ForNode* loop) final {
+    StmtVisitor::VisitStmt_(loop);
+    if (visited_block_ && !loc_sref_.defined()) {
+      loc_sref_ = self_->stmt2ref.at(loop);
+      if (loc_pos_ == -1) {
+        loc_pos_ = 0;
+      }
+    }
+  }
+
+ private:
+  /*! \brief The schedule class */
+  const ScheduleState self_;
+  /*! \brief The dominate block which write the buffer */
+  const StmtSRef& block_sref_;
+  /*! \brief The parent scope of the dominate block */
+  const StmtSRef& scope_sref_;
+  /*! \brief The flag whether we have visited the target block */
+  bool visited_block_{false};
+  /*! \brief The AST node whose body is where the cache stage should be inserted */
+  StmtSRef loc_sref_{nullptr};
+  /*! \brief The index to insert the cache_read/cache_write stage */
+  int loc_pos_{-1};
+};
+
 /*! \brief Mutator for CacheRead. */
 class CacheReadRewriter : public StmtExprMutator {
  public:
@@ -557,8 +643,11 @@ class CacheReadRewriter : public StmtExprMutator {
     if (block == scope_sref_->stmt) {
       // If so, put buffer allocation on the parent scope
       ObjectPtr<BlockNode> n = make_object<BlockNode>(*stmt.as<BlockNode>());
-      n->alloc_buffers.push_back(info_->alloc);
-      stmt = Block(n);
+      // In cache_inplace case, alloc_buffer may be already exits.
+      if (info_->alloc.defined()) {
+        n->alloc_buffers.push_back(info_->alloc.value());
+        stmt = Block(n);
+      }
     } else {
       // Otherwise, update read regions and match_buffers
       // Only make this change if the block is one of the specified consumers.
@@ -664,8 +753,11 @@ class CacheWriteRewriter : public StmtExprMutator {
     // Put buffer allocation on the parent scope
     if (block == scope_sref_->stmt) {
       ObjectPtr<BlockNode> n = make_object<BlockNode>(*stmt.as<BlockNode>());
-      n->alloc_buffers.push_back(info_->alloc);
-      stmt = Block(n);
+      // In cache_inplace case, alloc_buffer may be already exits.
+      if (info_->alloc.defined()) {
+        n->alloc_buffers.push_back(info_->alloc.value());
+        stmt = Block(n);
+      }
     } else {
       // Since cache_write changes the block, we need to update the buffer it writes
       auto writes = ReplaceBuffer(block->writes, info_->write_buffer, info_->read_buffer);
@@ -892,7 +984,7 @@ class ReIndexRewriter : public StmtExprMutator {
   explicit ReIndexRewriter(const StmtSRef& block_sref, CacheStageInfo* info,
                            const std::unordered_set<Var, ObjectPtrHash, ObjectPtrEqual>& covered)
       : block_sref_(block_sref), info_(info), covered_(covered) {
-    new_buffer_ = info->alloc;
+    new_buffer_ = info->alloc.value();
     old_buffer_ = info->read_buffer.same_as(new_buffer_) ? info->write_buffer : info->read_buffer;
   }
 
@@ -904,7 +996,7 @@ class ReIndexRewriter : public StmtExprMutator {
       // Insert cache stage into the loop
       ObjectPtr<BlockNode> n = make_object<BlockNode>(*stmt.as<BlockNode>());
       n->body = InsertCacheStage(n->body, info_->loc_pos, info_->cache_stage);
-      n->alloc_buffers.push_back(info_->alloc);
+      n->alloc_buffers.push_back(info_->alloc.value());
       stmt = Block(n);
       info_->block_reuse.Set(old_stmt, stmt);
       return std::move(stmt);
@@ -1140,6 +1232,113 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu
   return result_block_sref;
 }
 
+/*! \brief The schedule error that the target block doesn't both read&write target buffer. */
+class NotReadWriteError : public ScheduleError {
+ public:
+  NotReadWriteError(IRModule mod, Block block, Buffer buffer)
+      : mod_(std::move(mod)), block_(std::move(block)), buffer_(std::move(buffer)) {}
+  String FastErrorString() const final {
+    return "ScheduleError: The target block does not both read & write target buffer.";
+  }
+
+  String DetailRenderTemplate() const final {
+    return "The target block {0} does not both read & write target buffer {1}.";
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {block_, buffer_}; }
+  IRModule mod_;
+  Block block_;
+  Buffer buffer_;
+};
+
+Array<StmtSRef> CacheInplace(ScheduleState self, const StmtSRef& block_sref, int read_buffer_index,
+                             const String& storage_scope) {
+  /*!
+   * Do cache read then cache write
+   */
+
+  // Check 0. Check the input storage scope.
+  CheckStorageScope(self, storage_scope);
+
+  // Check 1. Check index, get the target buffer and the parent scope
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
+  Buffer buffer =
+      GetNthAccessBuffer(self, GetRef<Block>(block), read_buffer_index, BufferIndexType::kRead);
+  StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
+
+  // Check 3. Check required region cover for cache_read
+  CheckRegionCover(self, scope_sref);
+
+  // Check 4. Check if target block both read & write target buffer.
+  const BlockNode* rw_block = TVM_SREF_TO_BLOCK(block_sref);
+  Optional<BufferRegion> read_region = GetBufferRegionFromBuffer(rw_block->reads, buffer);
+  Optional<BufferRegion> write_region = GetBufferRegionFromBuffer(rw_block->writes, buffer);
+  if (!read_region.defined() || !write_region.defined()) {
+    throw NotReadWriteError(self->mod, GetRef<Block>(rw_block), buffer);
+  }
+
+  Array<StmtSRef> results_block_sref;
+  Buffer new_buffer = WithScope(buffer, storage_scope);
+
+  // Do cache read
+  // Cache read step 0. Create CacheStageInfo
+  CacheStageInfo info;
+  info.read_buffer = buffer;
+  // Create the corresponding buffer to be written for cache_read
+  info.write_buffer = new_buffer;
+  // Create the corresponding buffer allocation
+  info.alloc = info.write_buffer;
+  // Indicate which buffers should consume the cache.
+  info.consumer_blocks.push_back(block_sref);
+
+  // Cache read step 1. Detect insert position
+  CacheInplaceLocDetector::Detect(self, block_sref, scope_sref, &info);
+
+  // Cache read step 2. Making new cache stage block and rewrite readers.
+  Block cache_read_stage = MakeCacheStage(/*cache_region=*/read_region.value(), /*info=*/&info,
+                                          /*storage_scope=*/storage_scope);
+  Stmt new_scope = CacheReadRewriter::Rewrite(/*scope_sref=*/scope_sref, /*info=*/&info);
+
+  // Cache read step 3. Replacing and updating flags for cache read.
+  self->Replace(scope_sref, new_scope, info.block_reuse);
+  StmtSRef result_block_sref = self->stmt2ref.at(cache_read_stage.get());
+  BlockInfo& block_info_read = self->block_info[result_block_sref];
+  block_info_read.affine_binding = CalculateAffineFlag(self, result_block_sref);
+  results_block_sref.push_back(result_block_sref);
+
+  // Do cache write
+  // Cache write step 0. Update cache stage info for cache_read.
+  info.read_buffer = new_buffer;
+  // Create the corresponding buffer to be written, i.e. result of cache_write
+  info.write_buffer = buffer;
+  // Create the corresponding buffer allocation
+  info.alloc = nullptr;
+  info.consumer_blocks.clear();
+
+  // Cache write step 1. Detect insert position
+  CacheInplaceLocDetector::Detect(self, block_sref, scope_sref, &info);
+  // insert after target block for cache write
+  info.loc_pos += 1;
+
+  // Cache write step 2. Making new cache stage block and rewrite readers.
+  Block cache_write_stage = MakeCacheStage(/*cache_region=*/write_region.value(), /*info=*/&info,
+                                           /*storage_scope=*/storage_scope);
+  new_scope = CacheWriteRewriter::Rewrite(/*scope_sref=*/scope_sref,
+                                          /*writer_block_sref=*/block_sref, /*info=*/&info);
+
+  // Cache write step 4. Replacing and updating flags for cache write.
+  self->Replace(scope_sref, new_scope, info.block_reuse);
+  result_block_sref = self->stmt2ref.at(cache_write_stage.get());
+  BlockInfo& block_info_write = self->block_info[result_block_sref];
+  block_info_write.affine_binding = CalculateAffineFlag(self, result_block_sref);
+  block_info_write.region_cover = true;
+  block_info_write.scope->stage_pipeline = false;
+  results_block_sref.push_back(result_block_sref);
+
+  return results_block_sref;
+}
+
 StmtSRef ReIndex(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
                  BufferIndexType buffer_index_type) {
   const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref);
@@ -1276,6 +1475,34 @@ struct CacheWriteTraits : public UnpackedInstTraits<CacheWriteTraits> {
   friend struct ::tvm::tir::UnpackedInstTraits;
 };
 
+struct CacheInplaceTraits : public UnpackedInstTraits<CacheInplaceTraits> {
+  static constexpr const char* kName = "CacheInplace";
+  static constexpr bool kIsPure = false;
+
+ private:
+  static constexpr size_t kNumInputs = 1;
+  static constexpr size_t kNumAttrs = 2;
+  static constexpr size_t kNumDecisions = 0;
+
+  static Array<BlockRV> UnpackedApplyToSchedule(Schedule sch, BlockRV block,
+                                                Integer read_buffer_index, String storage_scope) {
+    return sch->CacheInplace(block, read_buffer_index->value, storage_scope);
+  }
+
+  static String UnpackedAsPython(Array<String> outputs, String block, Integer read_buffer_index,
+                                 String storage_scope) {
+    PythonAPICall py("cache_inplace");
+    py.Input("block", block);
+    py.Input("read_buffer_index", read_buffer_index->value);
+    py.Input("storage_scope", storage_scope);
+    py.OutputList(outputs);
+    return py.Str();
+  }
+
+  template <typename>
+  friend struct ::tvm::tir::UnpackedInstTraits;
+};
+
 struct ReIndexTraits : public UnpackedInstTraits<ReIndexTraits> {
   static constexpr const char* kName = "ReIndex";
   static constexpr bool kIsPure = false;
@@ -1309,6 +1536,7 @@ struct ReIndexTraits : public UnpackedInstTraits<ReIndexTraits> {
 
 TVM_REGISTER_INST_KIND_TRAITS(CacheReadTraits);
 TVM_REGISTER_INST_KIND_TRAITS(CacheWriteTraits);
+TVM_REGISTER_INST_KIND_TRAITS(CacheInplaceTraits);
 TVM_REGISTER_INST_KIND_TRAITS(ReIndexTraits);
 }  // namespace tir
 }  // namespace tvm
diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc
index 2f27dbb9fbf1..280d0af92a8c 100644
--- a/src/tir/schedule/schedule.cc
+++ b/src/tir/schedule/schedule.cc
@@ -179,6 +179,8 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleCacheRead")
     .set_body_method<Schedule>(&ScheduleNode::CacheRead);
 TVM_REGISTER_GLOBAL("tir.schedule.ScheduleCacheWrite")
     .set_body_method<Schedule>(&ScheduleNode::CacheWrite);
+TVM_REGISTER_GLOBAL("tir.schedule.ScheduleCacheInplace")
+    .set_body_method<Schedule>(&ScheduleNode::CacheInplace);
 TVM_REGISTER_GLOBAL("tir.schedule.ScheduleReIndex")
     .set_body_typed([](Schedule self, const BlockRV& block_rv, int buffer_index,
                        int buffer_index_type) {
diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc
index 9ff793dc39dd..b67b008feda4 100644
--- a/src/tir/schedule/traced_schedule.cc
+++ b/src/tir/schedule/traced_schedule.cc
@@ -307,6 +307,22 @@ BlockRV TracedScheduleNode::CacheWrite(const BlockRV& block_rv, int write_buffer
   return result;
 }
 
+Array<BlockRV> TracedScheduleNode::CacheInplace(const BlockRV& block_rv, int read_buffer_index,
+                                                const String& storage_scope) {
+  Array<BlockRV> result =
+      ConcreteScheduleNode::CacheInplace(block_rv, read_buffer_index, storage_scope);
+  Array<ObjectRef> results;
+  for (const BlockRV& r : result) {
+    results.push_back(r);
+  }
+  static const InstructionKind& kind = InstructionKind::Get("CacheInplace");
+  trace_->Append(/*inst=*/Instruction(/*kind=*/kind,
+                                      /*inputs=*/{block_rv},
+                                      /*attrs=*/{Integer(read_buffer_index), storage_scope},
+                                      /*outputs=*/results));
+  return result;
+}
+
 BlockRV TracedScheduleNode::ReIndex(const BlockRV& block_rv, int buffer_index,
                                     BufferIndexType buffer_index_type) {
   BlockRV result = ConcreteScheduleNode::ReIndex(block_rv, buffer_index, buffer_index_type);
diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h
index 0e83b35f44e9..016de60726b9 100644
--- a/src/tir/schedule/traced_schedule.h
+++ b/src/tir/schedule/traced_schedule.h
@@ -76,6 +76,8 @@ class TracedScheduleNode : public ConcreteScheduleNode {
                     const Array<BlockRV> consumer_blocks = {}) final;
   BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index,
                      const String& storage_scope) final;
+  Array<BlockRV> CacheInplace(const BlockRV& block_rv, int read_buffer_index,
+                              const String& storage_scope) final;
   BlockRV ReIndex(const BlockRV& block_rv, int buffer_index,
                   BufferIndexType buffer_index_type) final;
   /******** Schedule: Compute location ********/
diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py b/tests/python/unittest/test_tir_schedule_cache_read_write.py
index 334fb988d775..a237a5b75839 100644
--- a/tests/python/unittest/test_tir_schedule_cache_read_write.py
+++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py
@@ -241,6 +241,15 @@ def inplace_func(data_io: T.Buffer[(64), "int32"]):
             data_io[v0] = data_1d[v0]
 
 
+@T.prim_func
+def inplace_call(data_io: T.Buffer[(64), "int32"]):
+    for i0 in T.serial(1):
+        with T.block("ext_call"):
+            T.reads(data_io[:64])
+            T.writes(data_io[:64])
+            T.evaluate(T.call_extern("call_impl", data_io.data, dtype=""))
+
+
 ########## Expected function after cache_read ##########
 
 
@@ -548,6 +557,42 @@ def cache_read_inplace(data_io: T.Buffer[64, "int32"]) -> None:
             data_io[v0] = data_1d[v0]
 
 
+@T.prim_func
+def cache_inplace_buffer(data_io: T.Buffer[64, "int32"]) -> None:
+    data_io_local = T.alloc_buffer([64], dtype="int32", scope="local")
+    data_io_global = T.alloc_buffer([64], dtype="int32")
+    data_io_global_1 = T.alloc_buffer([64], dtype="int32")
+    for ax0 in T.serial(64):
+        with T.block("data_io_global"):
+            v0 = T.axis.spatial(64, ax0)
+            T.reads(data_io[v0])
+            T.writes(data_io_global[v0])
+            data_io_global[v0] = data_io[v0]
+    for i0 in T.serial(1):
+        for ax0 in T.serial(64):
+            with T.block("data_io_local"):
+                v0 = T.axis.spatial(64, ax0)
+                T.reads(data_io_global[v0])
+                T.writes(data_io_local[v0])
+                data_io_local[v0] = data_io_global[v0]
+        with T.block("ext_call"):
+            T.reads(data_io_local[0:64])
+            T.writes(data_io_local[0:64])
+            T.evaluate(T.call_extern("call_impl", data_io_local.data, dtype=""))
+        for ax0 in T.serial(64):
+            with T.block("data_io_local"):
+                v0 = T.axis.spatial(64, ax0)
+                T.reads(data_io_local[v0])
+                T.writes(data_io_global_1[v0])
+                data_io_global_1[v0] = data_io_local[v0]
+    for ax0 in T.serial(64):
+        with T.block("data_io_global"):
+            v0 = T.axis.spatial(64, ax0)
+            T.reads(data_io_global_1[v0])
+            T.writes(data_io[v0])
+            data_io[v0] = data_io_global_1[v0]
+
+
 ########## Expected function after cache_write ##########
 
 
@@ -931,6 +976,19 @@ def test_inplace_cache_read():
     verify_trace_roundtrip(sch=sch, mod=inplace_func)
 
 
+def test_cache_inplace():
+    # cache_inplace could introduce WAR, which is expected but stage pipeline property changes
+    debug_mask = tvm.tir.schedule.state.ScheduleDebugMask.VERIFY_SREF_TREE
+    sch = tvm.tir.Schedule(inplace_call, debug_mask=debug_mask)
+    block = sch.get_block("ext_call")
+    blocks = sch.cache_inplace(block, 0, "local")
+    block = sch.cache_read(blocks[0], 0, "global", [blocks[0]])
+    block = sch.cache_write(blocks[1], 0, "global")
+
+    tvm.ir.assert_structural_equal(cache_inplace_buffer, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=inplace_call, debug_mask=debug_mask)
+
+
 ########## Testcases for cache_write ##########
 
 
From aedbe45b6aa4651619bc7e93f088e76d03e83cfa Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 11 Oct 2022 04:49:19 +0900
Subject: [PATCH 328/704] [Node] Expose StructuralEqual/Hash handler
 implemenation to header (#13001)

When we use `StructuralEqual/Hash`, the actual equality testing / hashing work are deferred to `RemapVarSEqualHandler` and `VarCountingSHashHandler` class respectively, defined in `.cc` files. To customize equality / hash behavior, I want to be able to derive from these classes outside of `structural_equal.cc` or `structural_hash.cc`. This is the first step toward replacing my previous attempt https://github.com/apache/tvm/pull/12706 to allow ignoring NDArray raw data in `StructuralEqual/Hash`.

So I propose to expose them as the "default" handler, while still hiding their implementation details. This lets me define a custom hasher that ignores NDArray content simply by:

```
class SHashHandlerIgnoreNDArray : public SHashHandlerDefault {
 protected:
  void DispatchSHash(const ObjectRef& object, bool map_free_vars) override {
    ICHECK(object.defined());
    if (auto ndarray = object.as<runtime::NDArray::Container>()) {
      SHashReducer hash_reduce(this, map_free_vars);
      NDArrayHash(ndarray, &hash_reduce, false);
    } else {
      SHashHandlerDefault::DispatchSHash(object, map_free_vars);
    }
  }
};
```
---
 include/tvm/node/structural_equal.h |  42 +++++++++++
 include/tvm/node/structural_hash.h  |  38 ++++++++++
 src/node/structural_equal.cc        | 109 +++++++++++++++++++---------
 src/node/structural_hash.cc         |  64 +++++++++++-----
 4 files changed, 200 insertions(+), 53 deletions(-)

diff --git a/include/tvm/node/structural_equal.h b/include/tvm/node/structural_equal.h
index b51021fe4076..371b8f9c7bd9 100644
--- a/include/tvm/node/structural_equal.h
+++ b/include/tvm/node/structural_equal.h
@@ -324,5 +324,47 @@ class SEqualReducer {
   bool map_free_vars_ = false;
 };
 
+/*! \brief The default handler for equality testing.
+ *
+ * Users can derive from this class and override the DispatchSEqualReduce method,
+ * to customize equality testing.
+ */
+class SEqualHandlerDefault : public SEqualReducer::Handler {
+ public:
+  SEqualHandlerDefault(bool assert_mode, Optional<ObjectPathPair>* first_mismatch);
+  virtual ~SEqualHandlerDefault();
+
+  bool SEqualReduce(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars,
+                    const Optional<ObjectPathPair>& current_paths) override;
+  void DeferFail(const ObjectPathPair& mismatch_paths) override;
+  ObjectRef MapLhsToRhs(const ObjectRef& lhs) override;
+  void MarkGraphNode() override;
+
+  /*!
+   * \brief The entry point for equality testing
+   * \param lhs The left operand.
+   * \param rhs The right operand.
+   * \param map_free_vars Whether or not to remap variables if possible.
+   * \return The equality result.
+   */
+  virtual bool Equal(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars);
+
+ protected:
+  /*!
+   * \brief The dispatcher for equality testing of intermediate objects
+   * \param lhs The left operand.
+   * \param rhs The right operand.
+   * \param map_free_vars Whether or not to remap variables if possible.
+   * \param current_paths Optional paths to `lhs` and `rhs` objects, for error traceability.
+   * \return The equality result.
+   */
+  virtual bool DispatchSEqualReduce(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars,
+                                    const Optional<ObjectPathPair>& current_paths);
+
+ private:
+  class Impl;
+  Impl* impl;
+};
+
 }  // namespace tvm
 #endif  // TVM_NODE_STRUCTURAL_EQUAL_H_
diff --git a/include/tvm/node/structural_hash.h b/include/tvm/node/structural_hash.h
index a30a2c59d0d1..8b8a403326c4 100644
--- a/include/tvm/node/structural_hash.h
+++ b/include/tvm/node/structural_hash.h
@@ -200,6 +200,44 @@ class SHashReducer {
   bool map_free_vars_;
 };
 
+/*! \brief The default handler for hash key computation
+ *
+ * Users can derive from this class and override the DispatchSHash method,
+ * to customize hashing.
+ */
+class SHashHandlerDefault : public SHashReducer::Handler {
+ public:
+  SHashHandlerDefault();
+  virtual ~SHashHandlerDefault();
+
+  void SHashReduceHashedValue(size_t hashed_value) override;
+  void SHashReduce(const ObjectRef& key, bool map_free_vars) override;
+  void SHashReduceFreeVar(const runtime::Object* var, bool map_free_vars) override;
+  bool LookupHashedValue(const ObjectRef& key, size_t* hashed_value) override;
+  void MarkGraphNode() override;
+
+  /*!
+   * \brief The entry point for hashing
+   * \param object The object to be hashed.
+   * \param map_free_vars Whether or not to remap variables if possible.
+   * \return The hash result.
+   */
+  virtual size_t Hash(const ObjectRef& object, bool map_free_vars);
+
+ protected:
+  /*!
+   * \brief The dispatcher for hashing of intermediate objects
+   * \param object An intermediate object to be hashed.
+   * \param map_free_vars Whether or not to remap variables if possible.
+   * \return The hash result.
+   */
+  virtual void DispatchSHash(const ObjectRef& object, bool map_free_vars);
+
+ private:
+  class Impl;
+  Impl* impl;
+};
+
 class SEqualReducer;
 struct NDArrayContainerTrait {
   static constexpr const std::nullptr_t VisitAttrs = nullptr;
diff --git a/src/node/structural_equal.cc b/src/node/structural_equal.cc
index 01874c0536ae..2f49d9ef5629 100644
--- a/src/node/structural_equal.cc
+++ b/src/node/structural_equal.cc
@@ -198,13 +198,13 @@ bool SEqualReducer::ObjectAttrsEqual(const ObjectRef& lhs, const ObjectRef& rhs,
  *  The order of SEqual being called is the same as the order as if we
  *  eagerly do recursive calls in SEqualReduce.
  */
-class RemapVarSEqualHandler : public SEqualReducer::Handler {
+class SEqualHandlerDefault::Impl {
  public:
-  explicit RemapVarSEqualHandler(bool assert_mode, Optional<ObjectPathPair>* first_mismatch)
-      : assert_mode_(assert_mode), first_mismatch_(first_mismatch) {}
+  Impl(SEqualHandlerDefault* parent, bool assert_mode, Optional<ObjectPathPair>* first_mismatch)
+      : parent_(parent), assert_mode_(assert_mode), first_mismatch_(first_mismatch) {}
 
   bool SEqualReduce(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars,
-                    const Optional<ObjectPathPair>& current_paths) final {
+                    const Optional<ObjectPathPair>& current_paths) {
     // We cannot use check lhs.same_as(rhs) to check equality.
     // if we choose to enable var remapping.
     //
@@ -239,17 +239,17 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler {
     return CheckResult(run(), lhs, rhs, current_paths);
   }
 
-  void DeferFail(const ObjectPathPair& mismatch_paths) final {
+  void DeferFail(const ObjectPathPair& mismatch_paths) {
     pending_tasks_.emplace_back(Task::ForceFailTag{}, mismatch_paths);
   }
 
-  void MarkGraphNode() final {
+  void MarkGraphNode() {
     // need to push to pending tasks in this case
     ICHECK(!allow_push_to_stack_ && !task_stack_.empty());
     task_stack_.back().graph_equal = true;
   }
 
-  ObjectRef MapLhsToRhs(const ObjectRef& lhs) final {
+  ObjectRef MapLhsToRhs(const ObjectRef& lhs) {
     auto it = equal_map_lhs_.find(lhs);
     if (it != equal_map_lhs_.end()) return it->second;
     return ObjectRef(nullptr);
@@ -279,7 +279,35 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler {
     return RunTasks();
   }
 
+  // The default equal as registered in the structural equal vtable.
+  bool DispatchSEqualReduce(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars,
+                            const Optional<ObjectPathPair>& current_paths) {
+    auto compute = [=]() {
+      ICHECK(lhs.defined() && rhs.defined() && lhs->type_index() == rhs->type_index());
+      // skip entries that already have equality maps.
+      auto it = equal_map_lhs_.find(lhs);
+      if (it != equal_map_lhs_.end()) {
+        return it->second.same_as(rhs);
+      }
+      if (equal_map_rhs_.count(rhs)) return false;
+
+      SEqualReducer reducer = GetReducer(lhs, rhs, map_free_vars, current_paths);
+      return vtable_->SEqualReduce(lhs.get(), rhs.get(), reducer);
+    };
+    return CheckResult(compute(), lhs, rhs, current_paths);
+  }
+
  protected:
+  SEqualReducer GetReducer(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars,
+                           const Optional<ObjectPathPair>& current_paths) {
+    if (!IsPathTracingEnabled()) {
+      return SEqualReducer(parent_, nullptr, map_free_vars);
+    } else {
+      PathTracingData tracing_data = {current_paths.value(), lhs, rhs, first_mismatch_};
+      return SEqualReducer(parent_, &tracing_data, map_free_vars);
+    }
+  }
+
   // Check the result.
   bool CheckResult(bool result, const ObjectRef& lhs, const ObjectRef& rhs,
                    const Optional<ObjectPathPair>& current_paths) {
@@ -335,7 +363,8 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler {
         // which populates the pending tasks.
         ICHECK_EQ(pending_tasks_.size(), 0U);
         allow_push_to_stack_ = false;
-        if (!DispatchSEqualReduce(entry.lhs, entry.rhs, entry.map_free_vars, entry.current_paths))
+        if (!parent_->DispatchSEqualReduce(entry.lhs, entry.rhs, entry.map_free_vars,
+                                           entry.current_paths))
           return false;
         allow_push_to_stack_ = true;
         // Push pending tasks in reverse order, so earlier tasks get to
@@ -349,31 +378,6 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler {
     return true;
   }
 
-  // The default equal as registered in the structural equal vtable.
-  bool DispatchSEqualReduce(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars,
-                            const Optional<ObjectPathPair>& current_paths) {
-    auto compute = [=]() {
-      ICHECK(lhs.defined() && rhs.defined() && lhs->type_index() == rhs->type_index());
-      // skip entries that already have equality maps.
-      auto it = equal_map_lhs_.find(lhs);
-      if (it != equal_map_lhs_.end()) {
-        return it->second.same_as(rhs);
-      }
-      if (equal_map_rhs_.count(rhs)) return false;
-
-      // Run reduce check for free nodes.
-      if (!IsPathTracingEnabled()) {
-        return vtable_->SEqualReduce(lhs.get(), rhs.get(),
-                                     SEqualReducer(this, nullptr, map_free_vars));
-      } else {
-        PathTracingData tracing_data = {current_paths.value(), lhs, rhs, first_mismatch_};
-        return vtable_->SEqualReduce(lhs.get(), rhs.get(),
-                                     SEqualReducer(this, &tracing_data, map_free_vars));
-      }
-    };
-    return CheckResult(compute(), lhs, rhs, current_paths);
-  }
-
  private:
   /*! \brief Pending reduce tasks. */
   struct Task {
@@ -407,6 +411,8 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler {
 
   bool IsPathTracingEnabled() const { return first_mismatch_ != nullptr; }
 
+  // The owner of this impl
+  SEqualHandlerDefault* parent_;
   // list of pending tasks to be pushed to the stack.
   std::vector<Task> pending_tasks_;
   // Internal task stack to executed the task.
@@ -425,22 +431,53 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler {
   std::unordered_map<ObjectRef, ObjectRef, ObjectPtrHash, ObjectPtrEqual> equal_map_rhs_;
 };
 
+SEqualHandlerDefault::SEqualHandlerDefault(bool assert_mode,
+                                           Optional<ObjectPathPair>* first_mismatch) {
+  impl = new Impl(this, assert_mode, first_mismatch);
+}
+
+SEqualHandlerDefault::~SEqualHandlerDefault() { delete impl; }
+
+bool SEqualHandlerDefault::SEqualReduce(const ObjectRef& lhs, const ObjectRef& rhs,
+                                        bool map_free_vars,
+                                        const Optional<ObjectPathPair>& current_paths) {
+  return impl->SEqualReduce(lhs, rhs, map_free_vars, current_paths);
+}
+
+void SEqualHandlerDefault::DeferFail(const ObjectPathPair& mismatch_paths) {
+  impl->DeferFail(mismatch_paths);
+}
+
+ObjectRef SEqualHandlerDefault::MapLhsToRhs(const ObjectRef& lhs) { return impl->MapLhsToRhs(lhs); }
+
+void SEqualHandlerDefault::MarkGraphNode() { impl->MarkGraphNode(); }
+
+bool SEqualHandlerDefault::Equal(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars) {
+  return impl->Equal(lhs, rhs, map_free_vars);
+}
+
+bool SEqualHandlerDefault::DispatchSEqualReduce(const ObjectRef& lhs, const ObjectRef& rhs,
+                                                bool map_free_vars,
+                                                const Optional<ObjectPathPair>& current_paths) {
+  return impl->DispatchSEqualReduce(lhs, rhs, map_free_vars, current_paths);
+}
+
 TVM_REGISTER_GLOBAL("node.StructuralEqual")
     .set_body_typed([](const ObjectRef& lhs, const ObjectRef& rhs, bool assert_mode,
                        bool map_free_vars) {
-      return RemapVarSEqualHandler(assert_mode, nullptr).Equal(lhs, rhs, map_free_vars);
+      return SEqualHandlerDefault(assert_mode, nullptr).Equal(lhs, rhs, map_free_vars);
     });
 
 TVM_REGISTER_GLOBAL("node.GetFirstStructuralMismatch")
     .set_body_typed([](const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars) {
       Optional<ObjectPathPair> first_mismatch;
-      bool equal = RemapVarSEqualHandler(false, &first_mismatch).Equal(lhs, rhs, map_free_vars);
+      bool equal = SEqualHandlerDefault(false, &first_mismatch).Equal(lhs, rhs, map_free_vars);
       ICHECK(equal == !first_mismatch.defined());
       return first_mismatch;
     });
 
 bool StructuralEqual::operator()(const ObjectRef& lhs, const ObjectRef& rhs) const {
-  return RemapVarSEqualHandler(false, nullptr).Equal(lhs, rhs, false);
+  return SEqualHandlerDefault(false, nullptr).Equal(lhs, rhs, false);
 }
 
 }  // namespace tvm
diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc
index b40b1751fb78..a355e44028b6 100644
--- a/src/node/structural_hash.cc
+++ b/src/node/structural_hash.cc
@@ -55,8 +55,10 @@ void ReflectionVTable::SHashReduce(const Object* self, SHashReducer reducer) con
 // In particular, when we traverse unordered_map, we should first sort
 // the entries by keys(or hash of keys) before traversing.
 
-class VarCountingSHashHandler : public SHashReducer::Handler {
+class SHashHandlerDefault::Impl {
  public:
+  explicit Impl(SHashHandlerDefault* parent) : parent_(parent) {}
+
   /*! \brief Pending reduce tasks. */
   struct Task {
     /*!
@@ -81,15 +83,13 @@ class VarCountingSHashHandler : public SHashReducer::Handler {
         : object(object), reduced_hash(reduced_hash), map_free_vars(map_free_vars) {}
   };
 
-  VarCountingSHashHandler() {}
-
-  void MarkGraphNode() final {
+  void MarkGraphNode() {
     // need to push to pending tasks in this case
     ICHECK(!allow_push_to_stack_ && !task_stack_.empty());
     task_stack_.back().graph_node_hash = true;
   }
 
-  bool LookupHashedValue(const ObjectRef& key, size_t* hash_value) final {
+  bool LookupHashedValue(const ObjectRef& key, size_t* hash_value) {
     auto it = hash_memo_.find(key);
     if (it != hash_memo_.end()) {
       hash_value[0] = it->second;
@@ -98,11 +98,11 @@ class VarCountingSHashHandler : public SHashReducer::Handler {
     return false;
   }
 
-  void SHashReduceHashedValue(size_t hashed_value) final {
+  void SHashReduceHashedValue(size_t hashed_value) {
     pending_tasks_.emplace_back(Task(ObjectRef(nullptr), hashed_value, false));
   }
 
-  void SHashReduceFreeVar(const runtime::Object* var, bool map_free_vars) final {
+  void SHashReduceFreeVar(const runtime::Object* var, bool map_free_vars) {
     ICHECK(!hash_memo_.count(GetRef<ObjectRef>(var)));
     if (map_free_vars) {
       // use counter value.
@@ -115,7 +115,7 @@ class VarCountingSHashHandler : public SHashReducer::Handler {
     }
   }
 
-  void SHashReduce(const ObjectRef& object, bool map_free_vars) final {
+  void SHashReduce(const ObjectRef& object, bool map_free_vars) {
     // Directly push the result
     // Note: it is still important to push the result to pendng tasks
     // so that the reduction order of hash values stays the same.
@@ -151,6 +151,11 @@ class VarCountingSHashHandler : public SHashReducer::Handler {
     return ret;
   }
 
+  void DispatchSHash(const ObjectRef& object, bool map_free_vars) {
+    ICHECK(object.defined());
+    vtable_->SHashReduce(object.get(), SHashReducer(parent_, map_free_vars));
+  }
+
  protected:
   /*!
    * \brief Pop the top entry of the task stack and push the hash into the result stack.
@@ -219,7 +224,7 @@ class VarCountingSHashHandler : public SHashReducer::Handler {
           ICHECK_EQ(pending_tasks_.size(), 0U);
           allow_push_to_stack_ = false;
           // dispatch hash, reduce to the current slot.
-          this->DispatchSHash(entry.object, entry.map_free_vars);
+          parent_->DispatchSHash(entry.object, entry.map_free_vars);
           allow_push_to_stack_ = true;
           // Move pending tasks to the stack until the marked point.
           while (pending_tasks_.size() != 0) {
@@ -231,13 +236,9 @@ class VarCountingSHashHandler : public SHashReducer::Handler {
     }
   }
 
-  // The default equal as registered in the structural equal vtable.
-  void DispatchSHash(const ObjectRef& object, bool map_free_vars) {
-    ICHECK(object.defined());
-    vtable_->SHashReduce(object.get(), SHashReducer(this, map_free_vars));
-  }
-
  private:
+  // The owner of this impl
+  SHashHandlerDefault* parent_;
   // free var counter.
   size_t free_var_counter_{0};
   // graph node counter.
@@ -256,14 +257,43 @@ class VarCountingSHashHandler : public SHashReducer::Handler {
   std::unordered_map<ObjectRef, size_t, ObjectPtrHash, ObjectPtrEqual> hash_memo_;
 };
 
+SHashHandlerDefault::SHashHandlerDefault() { impl = new Impl(this); }
+SHashHandlerDefault::~SHashHandlerDefault() { delete impl; }
+
+void SHashHandlerDefault::SHashReduceHashedValue(size_t hashed_value) {
+  return impl->SHashReduceHashedValue(hashed_value);
+}
+
+void SHashHandlerDefault::SHashReduce(const ObjectRef& key, bool map_free_vars) {
+  impl->SHashReduce(key, map_free_vars);
+}
+
+void SHashHandlerDefault::SHashReduceFreeVar(const runtime::Object* var, bool map_free_vars) {
+  impl->SHashReduceFreeVar(var, map_free_vars);
+}
+
+bool SHashHandlerDefault::LookupHashedValue(const ObjectRef& key, size_t* hashed_value) {
+  return impl->LookupHashedValue(key, hashed_value);
+}
+
+void SHashHandlerDefault::MarkGraphNode() { impl->MarkGraphNode(); }
+
+size_t SHashHandlerDefault::Hash(const ObjectRef& object, bool map_free_vars) {
+  return impl->Hash(object, map_free_vars);
+}
+
+void SHashHandlerDefault::DispatchSHash(const ObjectRef& key, bool map_free_vars) {
+  impl->DispatchSHash(key, map_free_vars);
+}
+
 TVM_REGISTER_GLOBAL("node.StructuralHash")
     .set_body_typed([](const ObjectRef& object, bool map_free_vars) -> int64_t {
-      size_t hashed_value = VarCountingSHashHandler().Hash(object, map_free_vars);
+      size_t hashed_value = SHashHandlerDefault().Hash(object, map_free_vars);
       return static_cast<int64_t>(hashed_value);
     });
 
 size_t StructuralHash::operator()(const ObjectRef& object) const {
-  return VarCountingSHashHandler().Hash(object, false);
+  return SHashHandlerDefault().Hash(object, false);
 }
 
 // SEQualReduce traits for runtime containers.

From f8666a9469f7c984283596ce796ef05489459b5e Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Mon, 10 Oct 2022 13:29:08 -0700
Subject: [PATCH 329/704] [ci] Template build steps (#12983)

This uses templating for the build steps and fixes a few instances of
missing skip markers and timeout wrappers. This also adds a retry to
JUnit S3 uploads which was missing before.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                | 254 +++++++++++++++++-----------
 ci/jenkins/Build.groovy.j2 | 328 +++++++++++++++++--------------------
 ci/jenkins/macros.j2       |  47 ++++--
 3 files changed, 345 insertions(+), 284 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index e964ac79a3ce..78addc9b2c93 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-09-26T10:48:49.577077
+// Generated at 2022-10-04T13:17:33.929159
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -731,13 +731,15 @@ stage('Build') {
     SKIP_SLOW_TESTS = "${skip_slow_tests}"
   }
   parallel(
-    'BUILD: GPU': {
+
+  'BUILD: GPU': {
     if (!skip_ci) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-gpu") {
           init_git()
           docker_init(ci_gpu)
-          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
           make("${ci_gpu} --no-gpu", 'build', '-j2')
           sh(
             script: """
@@ -775,18 +777,22 @@ stage('Build') {
             """,
             label: 'Upload artifacts to S3',
           )
-
+          }
         }
       }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: GPU')
     }
   },
+
   'BUILD: CPU': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cpu") {
           init_git()
           docker_init(ci_cpu)
-          sh (
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
             script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
             label: 'Create CPU cmake config',
           )
@@ -809,11 +815,10 @@ stage('Build') {
             label: 'Upload artifacts to S3',
           )
 
-          timeout(time: max_time, unit: 'MINUTES') {
-            ci_setup(ci_cpu)
-            // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
-            // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch
-            sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test')
+          ci_setup(ci_cpu)
+          // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
+          // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch
+          sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test')
           }
         }
       }
@@ -821,13 +826,15 @@ stage('Build') {
       Utils.markStageSkippedForConditional('BUILD: CPU')
     }
   },
+
   'BUILD: CPU MINIMAL': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cpu-minimal") {
           init_git()
           docker_init(ci_minimal)
-          sh (
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
             script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
             label: 'Create CPU minimal cmake config',
           )
@@ -845,31 +852,32 @@ stage('Build') {
             """,
             label: 'Upload artifacts to S3',
           )
-
+          }
         }
       }
     } else {
       Utils.markStageSkippedForConditional('BUILD: CPU MINIMAL')
     }
   },
+
   'BUILD: WASM': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-wasm") {
           init_git()
           docker_init(ci_wasm)
-          sh (
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
             script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
             label: 'Create WASM cmake config',
           )
           make(ci_wasm, 'build', '-j2')
           cpp_unittest(ci_wasm)
-          timeout(time: max_time, unit: 'MINUTES') {
-            ci_setup(ci_wasm)
-            sh (
-              script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh",
-              label: 'Run WASM lint and tests',
-            )
+          ci_setup(ci_wasm)
+          sh (
+            script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh",
+            label: 'Run WASM lint and tests',
+          )
           }
         }
       }
@@ -877,13 +885,15 @@ stage('Build') {
       Utils.markStageSkippedForConditional('BUILD: WASM')
     }
   },
+
   'BUILD: i386': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-i386") {
           init_git()
           docker_init(ci_i386)
-          sh (
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
             script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
             label: 'Create i386 cmake config',
           )
@@ -905,20 +915,22 @@ stage('Build') {
             """,
             label: 'Upload artifacts to S3',
           )
-
+          }
         }
       }
     } else {
       Utils.markStageSkippedForConditional('BUILD: i386')
     }
   },
+
   'BUILD: arm': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('ARM-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-arm") {
           init_git()
           docker_init(ci_arm)
-          sh (
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
             script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
             label: 'Create ARM cmake config',
           )
@@ -938,20 +950,22 @@ stage('Build') {
             """,
             label: 'Upload artifacts to S3',
           )
-
+          }
         }
       }
-     } else {
+    } else {
       Utils.markStageSkippedForConditional('BUILD: arm')
     }
   },
+
   'BUILD: Cortex-M': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cortexm") {
           init_git()
           docker_init(ci_cortexm)
-          sh (
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
             script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
             label: 'Create Cortex-M cmake config',
           )
@@ -970,20 +984,22 @@ stage('Build') {
             """,
             label: 'Upload artifacts to S3',
           )
-
+          }
         }
       }
-     } else {
+    } else {
       Utils.markStageSkippedForConditional('BUILD: Cortex-M')
     }
   },
+
   'BUILD: Hexagon': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-hexagon") {
           init_git()
           docker_init(ci_hexagon)
-          sh (
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
             script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
             label: 'Create Hexagon cmake config',
           )
@@ -1006,20 +1022,22 @@ stage('Build') {
             """,
             label: 'Upload artifacts to S3',
           )
-
+          }
         }
       }
-     } else {
+    } else {
       Utils.markStageSkippedForConditional('BUILD: Hexagon')
     }
   },
+
   'BUILD: RISC-V': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-riscv") {
           init_git()
           docker_init(ci_riscv)
-          sh (
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
             script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
             label: 'Create RISC-V cmake config',
           )
@@ -1038,13 +1056,14 @@ stage('Build') {
             """,
             label: 'Upload artifacts to S3',
           )
-
+          }
         }
       }
-     } else {
+    } else {
       Utils.markStageSkippedForConditional('BUILD: RISC-V')
     }
   },
+
   )
 }
 }
@@ -1118,7 +1137,8 @@ def shard_run_unittest_GPU_1_of_3() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1181,7 +1201,8 @@ def shard_run_unittest_GPU_2_of_3() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1240,7 +1261,8 @@ def shard_run_unittest_GPU_3_of_3() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1298,7 +1320,8 @@ def shard_run_integration_CPU_1_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1355,7 +1378,8 @@ def shard_run_integration_CPU_2_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1412,7 +1436,8 @@ def shard_run_integration_CPU_3_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1469,7 +1494,8 @@ def shard_run_integration_CPU_4_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1527,7 +1553,8 @@ def shard_run_python_i386_1_of_3() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1584,7 +1611,8 @@ def shard_run_python_i386_2_of_3() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1640,7 +1668,8 @@ def shard_run_python_i386_3_of_3() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1697,7 +1726,8 @@ def shard_run_test_Hexagon_1_of_8() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1752,7 +1782,8 @@ def shard_run_test_Hexagon_2_of_8() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1807,7 +1838,8 @@ def shard_run_test_Hexagon_3_of_8() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1862,7 +1894,8 @@ def shard_run_test_Hexagon_4_of_8() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1917,7 +1950,8 @@ def shard_run_test_Hexagon_5_of_8() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -1972,7 +2006,8 @@ def shard_run_test_Hexagon_6_of_8() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2027,7 +2062,8 @@ def shard_run_test_Hexagon_7_of_8() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2082,7 +2118,8 @@ def shard_run_test_Hexagon_8_of_8() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2139,7 +2176,8 @@ def shard_run_integration_aarch64_1_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2195,7 +2233,8 @@ def shard_run_integration_aarch64_2_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2251,7 +2290,8 @@ def shard_run_integration_aarch64_3_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2307,7 +2347,8 @@ def shard_run_integration_aarch64_4_of_4() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2363,7 +2404,8 @@ def shard_run_topi_GPU_1_of_3() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2418,7 +2460,8 @@ def shard_run_topi_GPU_2_of_3() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2473,7 +2516,8 @@ def shard_run_topi_GPU_3_of_3() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2529,7 +2573,8 @@ def shard_run_frontend_GPU_1_of_6() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2584,7 +2629,8 @@ def shard_run_frontend_GPU_2_of_6() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2639,7 +2685,8 @@ def shard_run_frontend_GPU_3_of_6() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2694,7 +2741,8 @@ def shard_run_frontend_GPU_4_of_6() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2749,7 +2797,8 @@ def shard_run_frontend_GPU_5_of_6() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2804,7 +2853,8 @@ def shard_run_frontend_GPU_6_of_6() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2865,7 +2915,8 @@ def shard_run_topi_aarch64_1_of_2() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2924,7 +2975,8 @@ def shard_run_topi_aarch64_2_of_2() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -2980,7 +3032,8 @@ def shard_run_frontend_aarch64_1_of_2() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3035,7 +3088,8 @@ def shard_run_frontend_aarch64_2_of_2() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3096,7 +3150,8 @@ def shard_run_test_Cortex_M_1_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3151,7 +3206,8 @@ def shard_run_test_Cortex_M_2_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3206,7 +3262,8 @@ def shard_run_test_Cortex_M_3_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3261,7 +3318,8 @@ def shard_run_test_Cortex_M_4_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3316,7 +3374,8 @@ def shard_run_test_Cortex_M_5_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3371,7 +3430,8 @@ def shard_run_test_Cortex_M_6_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3426,7 +3486,8 @@ def shard_run_test_Cortex_M_7_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3481,7 +3542,8 @@ def shard_run_test_Cortex_M_8_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3536,7 +3598,8 @@ def shard_run_test_Cortex_M_9_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3591,7 +3654,8 @@ def shard_run_test_Cortex_M_10_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3646,7 +3710,8 @@ def shard_run_test_Cortex_M_11_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3701,7 +3766,8 @@ def shard_run_test_Cortex_M_12_of_12() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3758,7 +3824,8 @@ def shard_run_test_RISC_V_1_of_1() {
           sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_RISC_V --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_RISC_V --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -3803,7 +3870,8 @@ def run_unittest_minimal() {
             sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_CPU_MINIMAL --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_CPU_MINIMAL --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -4013,7 +4081,8 @@ stage('Test') {
               sh(
                 script: """
                   set -eux
-                  aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_CPU --recursive
+                  . ci/scripts/retry.sh
+                  retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_CPU --recursive
                 """,
                 label: 'Upload JUnits to S3',
               )
@@ -4064,7 +4133,8 @@ stage('Test') {
               sh(
                 script: """
                   set -eux
-                  aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_CPU --recursive
+                  . ci/scripts/retry.sh
+                  retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_CPU --recursive
                 """,
                 label: 'Upload JUnits to S3',
               )
@@ -4110,14 +4180,14 @@ stage('Test') {
             )
           }
           sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              md5sum docs.tgz
-              retry 3 aws s3 cp --no-progress docs.tgz s3://${s3_prefix}/docs/docs.tgz
-            """,
-            label: 'Upload artifacts to S3',
-          )
+      script: """
+        set -eux
+        . ci/scripts/retry.sh
+        md5sum docs.tgz
+        retry 3 aws s3 cp --no-progress docs.tgz s3://${s3_prefix}/docs/docs.tgz
+      """,
+      label: 'Upload artifacts to S3',
+    )
 
           sh(
             script: "aws s3 cp --no-progress _docs s3://${s3_prefix}/docs --recursive",
diff --git a/ci/jenkins/Build.groovy.j2 b/ci/jenkins/Build.groovy.j2
index a083fe88ad80..49cffacdc16e 100644
--- a/ci/jenkins/Build.groovy.j2
+++ b/ci/jenkins/Build.groovy.j2
@@ -80,185 +80,157 @@ stage('Build') {
     SKIP_SLOW_TESTS = "${skip_slow_tests}"
   }
   parallel(
-    'BUILD: GPU': {
-    if (!skip_ci) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-gpu') }}) {
-          init_git()
-          docker_init(ci_gpu)
-          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
-          make("${ci_gpu} --no-gpu", 'build', '-j2')
-          {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
 
-          // compiler test
-          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
-          make("${ci_gpu} --no-gpu", 'build2', '-j2')
-          {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }}
-        }
-      }
-    }
-  },
-  'BUILD: CPU': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-cpu') }}) {
-          init_git()
-          docker_init(ci_cpu)
-          sh (
-            script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
-            label: 'Create CPU cmake config',
-          )
-          make(ci_cpu, 'build', '-j2')
-          {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
-          timeout(time: max_time, unit: 'MINUTES') {
-            ci_setup(ci_cpu)
-            // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
-            // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch
-            sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test')
-          }
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('BUILD: CPU')
-    }
-  },
-  'BUILD: CPU MINIMAL': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-cpu-minimal') }}) {
-          init_git()
-          docker_init(ci_minimal)
-          sh (
-            script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
-            label: 'Create CPU minimal cmake config',
-          )
-          make(ci_minimal, 'build', '-j2')
-          {{ m.upload_artifacts(tag='cpu-minimal', filenames=tvm_lib) }}
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('BUILD: CPU MINIMAL')
-    }
-  },
-  'BUILD: WASM': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-wasm') }}) {
-          init_git()
-          docker_init(ci_wasm)
-          sh (
-            script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
-            label: 'Create WASM cmake config',
-          )
-          make(ci_wasm, 'build', '-j2')
-          cpp_unittest(ci_wasm)
-          timeout(time: max_time, unit: 'MINUTES') {
-            ci_setup(ci_wasm)
-            sh (
-              script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh",
-              label: 'Run WASM lint and tests',
-            )
-          }
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('BUILD: WASM')
-    }
-  },
-  'BUILD: i386': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-i386') }}) {
-          init_git()
-          docker_init(ci_i386)
-          sh (
-            script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
-            label: 'Create i386 cmake config',
-          )
-          make(ci_i386, 'build', '-j2')
-          {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim) }}
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('BUILD: i386')
-    }
-  },
-  'BUILD: arm': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('ARM-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-arm') }}) {
-          init_git()
-          docker_init(ci_arm)
-          sh (
-            script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
-            label: 'Create ARM cmake config',
-          )
-          make(ci_arm, 'build', '-j4')
-          {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib) }}
-        }
-      }
-     } else {
-      Utils.markStageSkippedForConditional('BUILD: arm')
-    }
-  },
-  'BUILD: Cortex-M': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-cortexm') }}) {
-          init_git()
-          docker_init(ci_cortexm)
-          sh (
-            script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
-            label: 'Create Cortex-M cmake config',
-          )
-          make(ci_cortexm, 'build', '-j2')
-          {{ m.upload_artifacts(tag='cortexm', filenames=tvm_lib, folders=microtvm_template_projects) }}
-        }
-      }
-     } else {
-      Utils.markStageSkippedForConditional('BUILD: Cortex-M')
-    }
-  },
-  'BUILD: Hexagon': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-hexagon') }}) {
-          init_git()
-          docker_init(ci_hexagon)
-          sh (
-            script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
-            label: 'Create Hexagon cmake config',
-          )
-          make(ci_hexagon, 'build', '-j2')
-          sh (
-            script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
-            label: 'Build Hexagon API',
-          )
-          {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib, folders=hexagon_api) }}
-        }
-      }
-     } else {
-      Utils.markStageSkippedForConditional('BUILD: Hexagon')
-    }
-  },
-  'BUILD: RISC-V': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-riscv') }}) {
-          init_git()
-          docker_init(ci_riscv)
-          sh (
-            script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
-            label: 'Create RISC-V cmake config',
-          )
-          make(ci_riscv, 'build', '-j2')
-          {{ m.upload_artifacts(tag='riscv', filenames=tvm_lib, folders=microtvm_template_projects) }}
-        }
-      }
-     } else {
-      Utils.markStageSkippedForConditional('BUILD: RISC-V')
-    }
-  },
+  {% call m.build_step(
+      name='BUILD: GPU',
+      node='CPU-SMALL',
+      condition='!skip_ci',
+      ws='tvm/build-gpu',
+      docker_image='ci_gpu',
+    ) %}
+    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
+    make("${ci_gpu} --no-gpu", 'build', '-j2')
+    {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
+
+    // compiler test
+    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
+    make("${ci_gpu} --no-gpu", 'build2', '-j2')
+    {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }}
+  {% endcall %}
+
+  {% call m.build_step(
+      name='BUILD: CPU',
+      node='CPU-SMALL',
+      condition='!skip_ci && is_docs_only_build != 1',
+      ws='tvm/build-cpu',
+      docker_image='ci_cpu',
+    ) %}
+    sh (
+      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
+      label: 'Create CPU cmake config',
+    )
+    make(ci_cpu, 'build', '-j2')
+    {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
+    ci_setup(ci_cpu)
+    // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
+    // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch
+    sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test')
+  {% endcall %}
+
+  {% call m.build_step(
+      name='BUILD: CPU MINIMAL',
+      node='CPU-SMALL',
+      condition='!skip_ci && is_docs_only_build != 1',
+      ws='tvm/build-cpu-minimal',
+      docker_image='ci_minimal',
+    ) %}
+    sh (
+      script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
+      label: 'Create CPU minimal cmake config',
+    )
+    make(ci_minimal, 'build', '-j2')
+    {{ m.upload_artifacts(tag='cpu-minimal', filenames=tvm_lib) }}
+  {% endcall %}
+
+  {% call m.build_step(
+      name='BUILD: WASM',
+      node='CPU-SMALL',
+      condition='!skip_ci && is_docs_only_build != 1',
+      ws='tvm/build-wasm',
+      docker_image='ci_wasm',
+    ) %}
+    sh (
+      script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
+      label: 'Create WASM cmake config',
+    )
+    make(ci_wasm, 'build', '-j2')
+    cpp_unittest(ci_wasm)
+    ci_setup(ci_wasm)
+    sh (
+      script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh",
+      label: 'Run WASM lint and tests',
+    )
+  {% endcall %}
+
+  {% call m.build_step(
+      name='BUILD: i386',
+      node='CPU-SMALL',
+      condition='!skip_ci && is_docs_only_build != 1',
+      ws='tvm/build-i386',
+      docker_image='ci_i386',
+    ) %}
+    sh (
+      script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
+      label: 'Create i386 cmake config',
+    )
+    make(ci_i386, 'build', '-j2')
+    {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim) }}
+  {% endcall %}
+
+  {% call m.build_step(
+      name='BUILD: arm',
+      node='ARM-SMALL',
+      condition='!skip_ci && is_docs_only_build != 1',
+      ws='tvm/build-arm',
+      docker_image='ci_arm',
+    ) %}
+    sh (
+      script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
+      label: 'Create ARM cmake config',
+    )
+    make(ci_arm, 'build', '-j4')
+    {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib) }}
+  {% endcall %}
+
+  {% call m.build_step(
+      name='BUILD: Cortex-M',
+      node='CPU-SMALL',
+      condition='!skip_ci && is_docs_only_build != 1',
+      ws='tvm/build-cortexm',
+      docker_image='ci_cortexm',
+    ) %}
+    sh (
+      script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
+      label: 'Create Cortex-M cmake config',
+    )
+    make(ci_cortexm, 'build', '-j2')
+    {{ m.upload_artifacts(tag='cortexm', filenames=tvm_lib, folders=microtvm_template_projects) }}
+  {% endcall %}
+
+  {% call m.build_step(
+      name='BUILD: Hexagon',
+      node='CPU-SMALL',
+      condition='!skip_ci && is_docs_only_build != 1',
+      ws='tvm/build-hexagon',
+      docker_image='ci_hexagon',
+    ) %}
+    sh (
+      script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
+      label: 'Create Hexagon cmake config',
+    )
+    make(ci_hexagon, 'build', '-j2')
+    sh (
+      script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
+      label: 'Build Hexagon API',
+    )
+    {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib, folders=hexagon_api) }}
+  {% endcall %}
+
+  {% call m.build_step(
+      name='BUILD: RISC-V',
+      node='CPU-SMALL',
+      condition='!skip_ci && is_docs_only_build != 1',
+      ws='tvm/build-riscv',
+      docker_image='ci_riscv',
+    ) %}
+    sh (
+      script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
+      label: 'Create RISC-V cmake config',
+    )
+    make(ci_riscv, 'build', '-j2')
+    {{ m.upload_artifacts(tag='riscv', filenames=tvm_lib, folders=microtvm_template_projects) }}
+  {% endcall %}
+
   )
 }
 }
diff --git a/ci/jenkins/macros.j2 b/ci/jenkins/macros.j2
index e6e69097b076..618b1d9d6b09 100644
--- a/ci/jenkins/macros.j2
+++ b/ci/jenkins/macros.j2
@@ -23,7 +23,8 @@
 sh(
             script: """
               set -eux
-              aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/{{ test_dir_name }} --recursive
+              . ci/scripts/retry.sh
+              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/{{ test_dir_name }} --recursive
             """,
             label: 'Upload JUnits to S3',
           )
@@ -88,6 +89,24 @@ def {{ method_name }}() {
 {% endfor %}
 {% endmacro %}
 
+{% macro build_step(name, condition, node, docker_image, ws) %}
+  '{{ name }}': {
+    if ({{ condition }}) {
+      node('{{ node }}') {
+        ws({{ per_exec_ws(ws) }}) {
+          init_git()
+          docker_init({{ docker_image }})
+          timeout(time: max_time, unit: 'MINUTES') {
+            {{ caller() | trim | indent(width=6) }}
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('{{ name }}')
+    }
+  },
+{% endmacro %}
+
 {% macro test_step_body(name, node, ws, docker_image, platform) %}
 {% set test_dir_name = name.replace(":", "").replace(" ", "-").replace("-", "_")|string %}
   if (!skip_ci && is_docs_only_build != 1) {
@@ -158,19 +177,19 @@ def {{ method_name }}() {
 
 {% macro upload_artifacts(tag, filenames, folders=None) %}
 sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              {% for filename in filenames %}
-              md5sum {{ filename }}
-              retry 3 aws s3 cp --no-progress {{ filename }} s3://${s3_prefix}/{{ tag }}/{{ filename }}
-              {% endfor %}
-              {% for folder in (folders or []) %}
-              retry 3 aws s3 cp --no-progress {{ folder }} s3://${s3_prefix}/{{ tag }}/{{ folder }} --recursive
-              {% endfor %}
-            """,
-            label: 'Upload artifacts to S3',
-          )
+      script: """
+        set -eux
+        . ci/scripts/retry.sh
+        {% for filename in filenames %}
+        md5sum {{ filename }}
+        retry 3 aws s3 cp --no-progress {{ filename }} s3://${s3_prefix}/{{ tag }}/{{ filename }}
+        {% endfor %}
+        {% for folder in (folders or []) %}
+        retry 3 aws s3 cp --no-progress {{ folder }} s3://${s3_prefix}/{{ tag }}/{{ folder }} --recursive
+        {% endfor %}
+      """,
+      label: 'Upload artifacts to S3',
+    )
 {% endmacro %}
 
 {% macro download_artifacts(tag, filenames, folders=None) %}

From 84c50ef0455d126f071f7bb958f30acc784d825a Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Mon, 10 Oct 2022 13:36:45 -0700
Subject: [PATCH 330/704] [ci][docs] Add external network resources to S3
 (#12989)

This sends requests for any web resource that isn't in a TVM/DMLC
controlled location to a mirror in S3 to improve reliability (to avoid
failures like in
https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-12973/3/pipeline/)

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 tests/scripts/request_hook/request_hook.py | 40 +++++++++++++++++-----
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py
index 46448f0a38a8..dd1adf0dedd9 100644
--- a/tests/scripts/request_hook/request_hook.py
+++ b/tests/scripts/request_hook/request_hook.py
@@ -26,21 +26,45 @@
 # To update this list, run the workflow <HERE> with the URL to download and the SHA512 of the file
 BASE = "https://tvm-ci-resources.s3.us-west-2.amazonaws.com"
 URL_MAP = {
-    "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip": f"{BASE}/oneflow/resnet18.zip",
-    "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_cora.torch": f"{BASE}/gcn_cora.torch",
-    "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg": f"{BASE}/vta_cat.jpg",
-    "https://people.linaro.org/~tom.gall/sine_model.tflite": f"{BASE}/sine_model.tflite",
-    "https://pjreddie.com/media/files/yolov3-tiny.weights?raw=true": f"{BASE}/yolov3-tiny.weights",
-    "https://pjreddie.com/media/files/yolov3.weights": f"{BASE}/yolov3.weights",
     "http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec": f"{BASE}/mxnet-val_256_q90.rec",
+    "http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel": f"{BASE}/bvlc_alexnet.caffemodel",
+    "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel": f"{BASE}/bvlc_googlenet.caffemodel",
     "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz": f"{BASE}/tf-mobilenet_v1_1.0_224.tgz",
     "http://images.cocodataset.org/zips/val2017.zip": f"{BASE}/cocodataset-val2017.zip",
     "https://bj.bcebos.com/x2paddle/models/paddle_resnet50.tar": f"{BASE}/bcebos-paddle_resnet50.tar",
     "https://data.deepai.org/stanfordcars.zip": f"{BASE}/deepai-stanfordcars.zip",
-    "http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel": f"{BASE}/bvlc_alexnet.caffemodel",
-    "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel": f"{BASE}/bvlc_googlenet.caffemodel",
+    "https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel": f"{BASE}/2022-10-05/MobileNet.mlmodel",
+    "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth": f"{BASE}/2022-10-05/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth",
+    "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth": f"{BASE}/2022-10-05/mobilenet_v2-b0353104.pth",
+    "https://download.pytorch.org/models/resnet18-f37072fd.pth": f"{BASE}/2022-10-05/resnet18-f37072fd.pth",
+    "https://gist.github.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/93672b029103648953c4e5ad3ac3aadf346a4cdc/super_resolution_0.2.onnx": f"{BASE}/2022-10-05/super_resolution_0.2.onnx",
+    "https://gist.githubusercontent.com/zhreshold/4d0b62f3d01426887599d4f7ede23ee5/raw/596b27d23537e5a1b5751d2b0481ef172f58b539/imagenet1000_clsid_to_human.txt": f"{BASE}/2022-10-05/imagenet1000_clsid_to_human.txt",
     "https://github.com/dmlc/web-data/blob/main/darknet/data/dog.jpg": f"{BASE}/dog.jpg",
+    "https://github.com/dmlc/web-data/blob/main/gluoncv/detection/street_small.jpg?raw=true": f"{BASE}/2022-10-05/small_street_raw.jpg",
+    "https://github.com/dmlc/web-data/raw/main/gluoncv/detection/street_small.jpg": f"{BASE}/2022-10-05/gluon-small-stree.jpg",
+    "https://github.com/JonathanCMitchell/mobilenet_v2_keras/releases/download/v1.1/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5": f"{BASE}/2022-10-05/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5",
     "https://github.com/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/vision/classification/mnist/model/mnist-1.onnx": f"{BASE}/onnx/mnist-1.onnx",
+    "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v2-7.onnx": f"{BASE}/2022-10-05/resnet50-v2-7.onnx",
+    "https://github.com/pjreddie/darknet/blob/master/cfg/yolov3-tiny.cfg?raw=true": f"{BASE}/2022-10-05/yolov3-tiny-raw.cfg",
+    "https://github.com/uwsampl/web-data/raw/main/vta/models/synset.txt": f"{BASE}/2022-10-05/synset.txt",
+    "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_cora.torch": f"{BASE}/gcn_cora.torch",
+    "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg": f"{BASE}/vta_cat.jpg",
+    "https://objects.githubusercontent.com/github-production-release-asset-2e65be/130932608/4b196a8a-4e2d-11e8-9a11-be3c41846711?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221004%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221004T170456Z&X-Amz-Expires=300&X-Amz-Signature=0602b68e8864b9b01c9142eee22aed3543fe98a5482686eec33d98e2617a2295&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=130932608&response-content-disposition=attachment%3B%20filename%3Dmobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5&response-content-type=application%2Foctet-stream": f"{BASE}/2022-10-05/aws-mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5",
+    "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip": f"{BASE}/oneflow/resnet18.zip",
+    "https://people.linaro.org/~tom.gall/sine_model.tflite": f"{BASE}/sine_model.tflite",
+    "https://pjreddie.com/media/files/yolov3-tiny.weights?raw=true": f"{BASE}/yolov3-tiny.weights",
+    "https://pjreddie.com/media/files/yolov3.weights": f"{BASE}/yolov3.weights",
+    "https://raw.githubusercontent.com/Cadene/pretrained-models.pytorch/master/data/imagenet_classes.txt": f"{BASE}/2022-10-05/imagenet_classes.txt",
+    "https://raw.githubusercontent.com/Cadene/pretrained-models.pytorch/master/data/imagenet_synsets.txt": f"{BASE}/2022-10-05/imagenet_synsets.txt",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/gluoncv/detection/street_small.jpg": f"{BASE}/2022-10-05/small_street.jpg",
+    "https://raw.githubusercontent.com/dmlc/web-data/master/gluoncv/detection/street_small.jpg": f"{BASE}/2022-10-05/street_small.jpg",
+    "https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/lite/java/demo/app/src/main/assets/labels_mobilenet_quant_v1_224.txt": f"{BASE}/2022-10-05/labels_mobilenet_quant_v1_224.txt",
+    "https://raw.githubusercontent.com/tlc-pack/tophub/main/tophub/mali_v0.06.log": f"{BASE}/2022-10-05/mali_v0.06.log",
+    "https://s3.amazonaws.com/model-server/inputs/kitten.jpg": f"{BASE}/2022-10-05/kitten.jpg",
+    "https://s3.amazonaws.com/onnx-model-zoo/synset.txt": f"{BASE}/2022-10-05/synset-s3.txt",
+    "https://storage.googleapis.com/download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz": f"{BASE}/2022-10-05/mobilenet_v2_1.0_224_quant.tgz",
+    "https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/mobilenet_2_5_128_tf.h5": f"{BASE}/2022-10-05/mobilenet_2_5_128_tf.h5",
+    "https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5": f"{BASE}/2022-10-05/resnet50_weights_tf_dim_ordering_tf_kernels.h5",
 }
 
 
From fcbcd156c7dd995b24b363dbddca5a8766648693 Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Tue, 11 Oct 2022 15:07:20 +0700
Subject: [PATCH 331/704] [microTVM] Add Cortex-M DSP schedules for optimal
 conv2d layouts (#12969)

* Rewrite conv2D to tensorize with tensordot

* Functional conv2D tensordot implementation

* Add stupid hack to work around TVM bug

* Unit testing for conv2d schedule

* Connect new implementations to Arm strategy

* Separate into new tensordot conv2d schedule

* Separate testing infrastructure

* Prototype depthwise implementation

* Unit testing for depthwise_conv2d

* Linting and documentation

* Enforce SIMD alignment in strategy

* Prevent black from butchering our formatting

* Address code review comments

* Fix alignment strategy bug

* Fix linting

* Remove unconventional offset behavior

* Replace math.prod function to support Python 3.7

* Fix CI tests
---
 python/tvm/relay/op/strategy/arm_cpu.py       |  67 ++++-
 python/tvm/topi/arm_cpu/conv2d.py             |  16 ++
 python/tvm/topi/arm_cpu/depthwise_conv2d.py   |  19 +-
 .../mprofile/dsp/micro_kernel/tensordot.py    | 155 ++++++++++
 .../arm_cpu/mprofile/dsp/tensordot_conv2ds.py | 271 ++++++++++++++++++
 python/tvm/topi/utils.py                      |  30 +-
 .../strategy/arm_cpu/test_conv2d_nhwc.py      |  36 ++-
 .../strategy/arm_cpu/test_depthwise_conv2d.py |  27 ++
 8 files changed, 606 insertions(+), 15 deletions(-)
 create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
 create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 947beb396ae2..e56e7ba12e94 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Definition of ARM CPU operator strategy."""
+from functools import reduce
 import logging
 
 # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
@@ -71,6 +72,32 @@ def schedule_pool_arm_cpu(attrs, outs, target):
         return topi.generic.schedule_pool(outs, layout)
 
 
+def _get_padding_width(padding):
+    assert isinstance(padding, tuple)
+    if len(padding) == 2:
+        _, (pad_left, pad_right) = padding
+    else:
+        _, pad_left, _, pad_right = padding
+    return pad_left + pad_right
+
+
+def _is_simd_aligned(dtype, dimensions, padding=None):
+    if padding:
+        assert len(dimensions) == len(padding)
+        padded_dims = (sum(x) for x in zip(dimensions, padding))
+    else:
+        padded_dims = dimensions
+
+    # Multiply all elements of padded_dims together. We can't use math.prod, as it
+    # does not exist in Python 3.7.
+    size = reduce(lambda x, y: x * y, padded_dims)
+    return (
+        (dtype == "int8" and size % 4 == 0)
+        or (dtype == "int16" and size % 2 == 0)
+        or (dtype == "int32")
+    )
+
+
 @conv2d_strategy.register("arm_cpu")
 def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     """conv2d arm cpu strategy"""
@@ -159,7 +186,21 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 name="conv2d_hwcn.generic",
             )
         elif layout == "NHWC":
-            if target.features.has_dsp and kernel_layout == "HWOI":
+            data_width_padding = _get_padding_width(padding)
+            if (
+                target.features.has_dsp
+                and dilation_w == dilation_h == 1
+                and kernel_layout == "OHWI"
+                # Check SIMD alignment
+                and _is_simd_aligned(data.dtype, data.shape[2:], padding=(data_width_padding, 0))
+                and _is_simd_aligned(kernel.dtype, kernel.shape[2:])
+            ):
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_ohwi_dsp),
+                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_ohwi_dsp),
+                    name="conv2d_nhwc_ohwi_dsp.arm_cpu",
+                )
+            elif target.features.has_dsp and kernel_layout == "HWOI":
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_dsp),
                     wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_dsp),
@@ -199,13 +240,25 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
         if layout == "NCHW":
             assert kernel_layout == "OIHW" or re.match(r"OIHW\d*o", kernel_layout)
-            # ARM conv2d depthwise schedule
             if kernel_layout == "OIHW":
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw),
-                    wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw),
-                    name="depthwise_conv2d_nchw.arm_cpu",
-                )
+                data_width_padding = _get_padding_width(padding)
+                if (
+                    target.features.has_dsp
+                    and dilation_w == dilation_h == 1
+                    and _is_simd_aligned(data.dtype, data.shape[3:], padding=(data_width_padding,))
+                    and _is_simd_aligned(kernel.dtype, kernel.shape[3:])
+                ):
+                    strategy.add_implementation(
+                        wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw_oihw_dsp),
+                        wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_oihw_dsp),
+                        name="depthwise_conv2d_nchw_oihw_dsp.arm_cpu",
+                    )
+                else:
+                    strategy.add_implementation(
+                        wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw),
+                        wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw),
+                        name="depthwise_conv2d_nchw.arm_cpu",
+                    )
 
             # TODO:
             # This schedule has incorrect result on some hardware platforms (like NV Jetson TX2)
diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py
index ab489161a8fa..bb29de8fa27b 100644
--- a/python/tvm/topi/arm_cpu/conv2d.py
+++ b/python/tvm/topi/arm_cpu/conv2d.py
@@ -37,6 +37,10 @@
     conv2d_nhwc_dsp_compute,
     conv2d_nhwc_dsp_schedule,
 )
+from .mprofile.dsp.tensordot_conv2ds import (
+    conv2d_nhwc_ohwi_dsp_compute,
+    tensordot_conv2ds_schedule,
+)
 
 
 @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu")
@@ -518,3 +522,15 @@ def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
 def schedule_conv2d_nhwc_dsp(cfg, outs):
     """Create schedule for conv2d_nhwc_dsp"""
     return conv2d_nhwc_dsp_schedule(cfg, outs)
+
+
+@autotvm.register_topi_compute("conv2d_nhwc_ohwi_dsp.arm_cpu")
+def conv2d_nhwc_ohwi_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """Compute conv2d_nhwc_ohwi with v7e-m DSP instructions and the tensordot kernel."""
+    return conv2d_nhwc_ohwi_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype)
+
+
+@autotvm.register_topi_schedule("conv2d_nhwc_ohwi_dsp.arm_cpu")
+def schedule_conv2d_nhwc_ohwi_dsp(cfg, outs):
+    """Create schedule for conv2d_nhwc_ohwi."""
+    return tensordot_conv2ds_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
index 333db3d5e014..58cd11e8cc09 100644
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -27,11 +27,14 @@
 from ..nn.utils import get_pad_tuple
 from .tensor_intrin import smlal_int16_int32
 from .arm_utils import is_aarch64_arm
-
 from .mprofile.dsp.depthwise_conv2d import (
     depthwise_conv2d_nhwc_dsp_compute,
     depthwise_conv2d_nhwc_dsp_schedule,
 )
+from .mprofile.dsp.tensordot_conv2ds import (
+    depthwise_conv2d_nchw_oihw_dsp_compute,
+    tensordot_conv2ds_schedule,
+)
 
 
 @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu")
@@ -718,3 +721,17 @@ def depthwise_conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out
 def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs):
     """Create schedule for conv2d_nhwc_dsp"""
     return depthwise_conv2d_nhwc_dsp_schedule(cfg, outs)
+
+
+@autotvm.register_topi_compute("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
+def depthwise_conv2d_nchw_oihw_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """Compute depthwise_conv2d_nchw_oihw with v7e-m DSP instructions and the tensordot kernel."""
+    return depthwise_conv2d_nchw_oihw_dsp_compute(
+        cfg, data, kernel, strides, padding, dilation, out_dtype
+    )
+
+
+@autotvm.register_topi_schedule("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
+def schedule_depthwise_conv2d_nchw_oihw_dsp(cfg, outs):
+    """Create schedule for depthwise_conv2d_nchw_oihw."""
+    return tensordot_conv2ds_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
new file mode 100644
index 000000000000..0fdffc06cf4f
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
@@ -0,0 +1,155 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Computes a "jumpy tensordot" operator, which can be used to tensorize many common operators
+including regular conv2d, depthwise conv2d, and grouped conv2d provided the data and kernel layouts
+are the optimal ones. When groups=1, the optimal data layout is NHWC and kernel layout is OHWI. When
+this is a depthwise convolution, the optimal data layout is NCHW and kernel layout is OIHW."""
+
+import textwrap
+
+from tvm import te, tir
+
+from .common import num_simd_lanes_per_word
+
+
+def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix):
+    """Gets the C function name of the tensordot function."""
+    return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}"
+
+
+def make_intrin_tensordot(slices, strides, tensordot_params):
+    """Helper function for constructing tensordot intrinsic. We can't construct the whole thing here
+    (as multiple schedules use tensordot and each must build the intrinstic differently) but we can
+    build part here to simplify the code."""
+
+    # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params
+    data, kernel, output = slices
+    data_strides, kernel_strides = strides
+
+    data_buf = tir.decl_buffer(
+        data.shape, data.dtype, name="data", offset_factor=1, strides=data_strides
+    )
+    kernel_buf = tir.decl_buffer(
+        kernel.shape,
+        kernel.dtype,
+        name="kernel",
+        offset_factor=1,
+        strides=kernel_strides,
+    )
+    output_buf = tir.decl_buffer(
+        output.shape, output.dtype, name="output", offset_factor=1, strides=[1]
+    )
+
+    def intrin_func(ins, outs):
+        builder = tir.ir_builder.create()
+        builder.emit(
+            tir.call_extern(
+                "int32",
+                _get_func_name(*tensordot_params),
+                outs[0].access_ptr("w"),
+                ins[0].access_ptr("r"),
+                ins[1].access_ptr("r"),
+            )
+        )
+        return builder.get()
+
+    return te.decl_tensor_intrin(
+        output.op,
+        intrin_func,
+        binds={data: data_buf, kernel: kernel_buf, output: output_buf},
+    )
+
+
+def tensordot_impl(in_dtype: str, tensor_h: int, jump: int, tensor_w: int, suffix: str) -> str:
+    """Generates C code for taking the dot products of two `tensor_h` * `tensor_w` tensors. Also has
+    a `jump` argument that advances the pointer of one tensor by that many words after each row. The
+    `jump` and `tensor_w` values must be word-aligned for the input data type, as non-word-aligned
+    memory access is slow on the Cortex-M series. Depending on the input datatype, the code may
+    contain DSP instructions for Arm v7e-m. C code contains DSP instructions for Arm v7e-m. See
+    the below pseudocode for reference:
+
+    tensordot(out_ptr, dat_ptr, ker_ptr) {
+        sum = 0;
+        for (i = 0; i < tensor_h; i++) {
+            for (j = 0; j < tensor_w; j++) {
+                sum += (*dat_ptr++) * (*ker_ptr++);
+            }
+            dat_ptr += jump;
+        }
+        *out_ptr = sum;
+    }
+    """
+
+    simd_lanes = num_simd_lanes_per_word(in_dtype)
+    assert tensor_w % simd_lanes == 0
+    assert jump % simd_lanes == 0
+
+    if in_dtype == "int8":
+        inner_loop = """
+              uint32_t tensor_c20 = __SXTB16(tensor_batch);
+              uint32_t kernel_c20 = __SXTB16(kernel_batch);
+              sum = __SMLAD(tensor_c20, kernel_c20, sum);
+
+              uint32_t tensor_c31 = __SXTB16(__ROR(tensor_batch, 8));
+              uint32_t kernel_c31 = __SXTB16(__ROR(kernel_batch, 8));
+              sum = __SMLAD(tensor_c31, kernel_c31, sum);"""
+
+    elif in_dtype == "int16":
+        inner_loop = """
+              sum = __SMLAD(tensor_batch, kernel_batch, sum);"""
+
+    elif in_dtype == "int32":
+        inner_loop = """
+              // Compiles to a single MAC instruction
+              sum += tensor_batch * kernel_batch;"""
+
+    else:
+        raise ValueError(f"No tensordot implementation exists for dtype '{in_dtype}'!")
+
+    function_name = _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix)
+    return textwrap.dedent(
+        (
+            f"""
+        #include <stdint.h>
+        #include <arm_nnsupportfunctions.h>
+
+        #ifdef __cplusplus
+        extern "C"
+        #endif
+        __STATIC_FORCEINLINE int32_t {function_name}(
+            uint32_t *out,
+            uint32_t *tensor,
+            uint32_t *kernel) {{
+
+          uint32_t sum = 0;
+
+          #pragma GCC unroll {tensor_h}
+          for (int i = 0; i < {tensor_h}; i++) {{
+            #pragma GCC unroll {tensor_w // simd_lanes}
+            for (int j = 0; j < {tensor_w // simd_lanes}; j++) {{
+              uint32_t tensor_batch = *tensor++;
+              uint32_t kernel_batch = *kernel++;
+              {inner_loop.strip()}
+            }}
+            tensor += {jump // simd_lanes};
+          }}
+          out[0] = sum;
+          return 0;
+        }}
+        """
+        )
+    )
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py b/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py
new file mode 100644
index 000000000000..ccd0c8e3ef32
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py
@@ -0,0 +1,271 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Implementations of several conv2d variations, all tensorized using tensordot and optimized for
+Cortex-M DSP. Currently contains a standard conv2d and depthwise conv2d implementation, but could be
+extended to add a grouped conv2d operator. Due to the way we tensorize, this schedule ONLY works
+when the data and kernel layouts are NCHWxc and OIHWxi respectively, where x is the number of
+input channels divided by the number of groups."""
+
+import random
+import string
+from typing import Union, Tuple
+
+from tvm import te
+from tvm.tir import indexdiv, indexmod
+from tvm.topi.utils import traverse_inline
+from tvm.topi.nn.pad import pad
+
+from .micro_kernel.tensordot import (
+    make_intrin_tensordot,
+    tensordot_impl,
+)
+
+
+def _unpack_2d_argument(argument: Union[int, Tuple]) -> Tuple:
+    if isinstance(argument, int):
+        return (argument, argument)
+    assert len(argument) == 2
+    return argument
+
+
+def _check_no_dilation(dilation: Union[int, Tuple]) -> None:
+    """Takes a dilation argument as an integer or tuple, and makes sure both dimensions are 1.
+    Dilation prevents us from using DSP instructions, so this schedule can't work (aside from the
+    niche case where dilation_h == stride_h and dilation_w == stride_w, which is rare enough we
+    probably don't need to support it)."""
+
+    dilation_h, dilation_w = _unpack_2d_argument(dilation)
+    assert dilation_h == dilation_w == 1
+
+
+def _unpack_padding(padding: Tuple) -> Tuple:
+    assert isinstance(padding, tuple)
+    if len(padding) == 2:
+        (pad_up, pad_down), (pad_left, pad_right) = padding
+    else:
+        pad_up, pad_left, pad_down, pad_right = padding
+    return pad_up, pad_left, pad_down, pad_right
+
+
+def _pad_if_needed(data: te.tensor.Tensor, layout: str, padding: Tuple) -> te.tensor.Tensor:
+    """Performs padding on a te.tensor.Tensor object if necessary. If padding = (0, 0, 0, 0), the
+    input tensor is returned unmodified. We only care about tuples here - "VALID" and "SAME" padding
+    will be converted by the importer TFLite importer if present."""
+
+    pad_up, pad_left, pad_down, pad_right = padding
+    if not any(padding):
+        return data
+
+    # We want to pad the "H" and "W" columns, and their position depends on the layout
+    pad_before, pad_after = [0, 0, 0, 0], [0, 0, 0, 0]
+    pad_before[layout.index("H")] = pad_up
+    pad_before[layout.index("W")] = pad_left
+    pad_after[layout.index("H")] = pad_down
+    pad_after[layout.index("W")] = pad_right
+    return pad(data, pad_before, pad_after, name="padded_data")
+
+
+def _compute_output_dim(
+    data_dim: int, kernel_dim: int, pad_before: int, pad_after: int, stride: int
+) -> int:
+    """Computes an output dimension of a convolution, given the data dimension, kernel dimension,
+    padding, and stride along that axis. Note that when stride > 1, this division will often not
+    be perfectly even."""
+    return (data_dim + pad_before + pad_after - kernel_dim) // stride + 1
+
+
+def _get_suffix() -> str:
+    """Returns a random eight-character string to append to C function names. Prevents accidental
+    re-definition of functions if the same operator appears twice in a Relay graph."""
+    return "".join(random.choices(string.ascii_uppercase, k=8))
+
+
+def conv2d_nhwc_ohwi_dsp_compute(_cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """Standard conv2d schedule that can be tensorized using tensordot."""
+
+    stride_h, stride_w = _unpack_2d_argument(strides)
+    pad_up, pad_left, pad_down, pad_right = _unpack_padding(padding)
+    _check_no_dilation(dilation)
+
+    batch_size, data_h, data_w, in_channels = data.shape
+    output_channels, kernel_h, kernel_w, _ = kernel.shape
+    assert kernel.shape[3] == in_channels
+
+    output_h = _compute_output_dim(data_h, kernel_h, pad_up, pad_down, stride_h)
+    output_w = _compute_output_dim(data_w, kernel_w, pad_left, pad_right, stride_w)
+
+    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
+    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
+    kc_i = te.reduce_axis((0, in_channels), name="rc")
+
+    padded_data = _pad_if_needed(data, "NHWC", (pad_up, pad_left, pad_down, pad_right))
+    return te.compute(
+        (batch_size, output_h, output_w, output_channels),
+        lambda n, y, x, c: te.sum(
+            padded_data[n, y * stride_h + kh_i, x * stride_w + kw_i, kc_i].astype(out_dtype)
+            * kernel[c, kh_i, kw_i, kc_i].astype(out_dtype),
+            axis=(kh_i, kw_i, kc_i),
+        ),
+        name="conv2d",
+        tag="conv2d_nhwc_ohwi_dsp",
+    )
+
+
+def _make_conv2d_tensorization(padded_data, kernel):
+    _, _, padded_w, in_channels = padded_data.shape
+    _, kernel_h, kernel_w, _ = kernel.shape
+    in_dtype = padded_data.dtype
+    suffix = _get_suffix()
+    assert in_dtype == kernel.dtype
+
+    data_slice = te.placeholder((kernel_h, kernel_w, in_channels), name="a", dtype=in_dtype)
+    kernel_slice = te.placeholder((kernel_h, kernel_w, in_channels), name="b", dtype=in_dtype)
+
+    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
+    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
+    kc_i = te.reduce_axis((0, in_channels), name="kc_i")
+
+    output_slice = te.compute(
+        (1,),
+        lambda k: te.sum(
+            data_slice[kh_i, kw_i, kc_i].astype("int32")
+            * kernel_slice[kh_i, kw_i, kc_i].astype("int32"),
+            axis=[kh_i, kw_i, kc_i],
+        ),
+        name="c",
+    )
+
+    # TVM has a really strange bug where the outer reduction axis (kh_i) having length 1 causes the
+    # decl_buffer strides check to fail. height_stride is a dark magic workaround for this.
+    height_stride = in_channels * padded_w if kernel_h > 1 else in_channels
+    jump = (padded_w - kernel_w) * in_channels
+    tensordot_params = (in_dtype, kernel_h, jump, kernel_w * in_channels, suffix)
+    intrin_tensordot = make_intrin_tensordot(
+        (data_slice, kernel_slice, output_slice),
+        ([height_stride, in_channels, 1], [kernel_w * in_channels, in_channels, 1]),
+        tensordot_params,
+    )
+
+    tensordot_code = tensordot_impl(*tensordot_params)
+    return (intrin_tensordot, tensordot_code)
+
+
+def depthwise_conv2d_nchw_oihw_dsp_compute(
+    _cfg, data, kernel, strides, padding, dilation, out_dtype
+):
+    """Depthwise conv2d schedule that can be tensorized using tensordot."""
+
+    stride_h, stride_w = _unpack_2d_argument(strides)
+    pad_up, pad_left, pad_down, pad_right = _unpack_padding(padding)
+    _check_no_dilation(dilation)
+
+    batch_size, in_channels, data_h, data_w = data.shape
+    _, c_mul, kernel_h, kernel_w = kernel.shape
+    output_channels = in_channels * c_mul
+    assert kernel.shape[0] == in_channels
+
+    output_h = _compute_output_dim(data_h, kernel_h, pad_up, pad_down, stride_h)
+    output_w = _compute_output_dim(data_w, kernel_w, pad_left, pad_right, stride_w)
+
+    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
+    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
+
+    padded_data = _pad_if_needed(data, "NCHW", (pad_up, pad_left, pad_down, pad_right))
+    return te.compute(
+        (batch_size, output_channels, output_h, output_w),
+        lambda n, c, y, x: te.sum(
+            padded_data[
+                n,
+                indexdiv(c, c_mul),
+                y * stride_h + kh_i,
+                x * stride_w + kw_i,
+            ].astype(out_dtype)
+            * kernel[indexdiv(c, c_mul), indexmod(c, c_mul), kh_i, kw_i].astype(out_dtype),
+            axis=(kh_i, kw_i),
+        ),
+        name="depthwise_conv2d",
+        tag="depthwise_conv2d_nchw_oihw_dsp",
+    )
+
+
+def _make_depthwise_conv2d_tensorization(padded_data, kernel):
+    _, _, _, padded_w = padded_data.shape
+    _, _, kernel_h, kernel_w = kernel.shape
+
+    in_dtype = padded_data.dtype
+    suffix = _get_suffix()
+    assert in_dtype == kernel.dtype
+
+    data_slice = te.placeholder((kernel_h, kernel_w), name="a", dtype=in_dtype)
+    kernel_slice = te.placeholder((kernel_h, kernel_w), name="b", dtype=in_dtype)
+
+    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
+    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
+
+    output_slice = te.compute(
+        (1,),
+        lambda k: te.sum(
+            data_slice[kh_i, kw_i].astype("int32") * kernel_slice[kh_i, kw_i].astype("int32"),
+            axis=[kh_i, kw_i],
+        ),
+        name="c",
+    )
+
+    jump = padded_w - kernel_w
+    tensordot_params = (in_dtype, kernel_h, jump, kernel_w, suffix)
+    intrin_tensordot = make_intrin_tensordot(
+        (data_slice, kernel_slice, output_slice),
+        ([padded_w, 1], [kernel_w, 1]),
+        tensordot_params,
+    )
+
+    tensordot_code = tensordot_impl(*tensordot_params)
+    return (intrin_tensordot, tensordot_code)
+
+
+def tensordot_conv2ds_schedule(_cfg, outs):
+    """Schedule function using v7e-m DSP instructions for all the conv2d operators in this file. We
+    use one schedule function for them all, because they are tensorized with the same kernel."""
+
+    schedule = te.create_schedule([x.op for x in outs])
+
+    def _callback(operator):
+        if "conv2d" in operator.tag:
+            output = operator.output(0)
+            padded_data = output.op.input_tensors[0]
+            kernel = output.op.input_tensors[1]
+
+            if operator.tag == "conv2d_nhwc_ohwi_dsp":
+                b_ax, y_ax, x_ax, co_ax = schedule[output].op.axis
+                kh_ax, kw_ax, ci_ax = schedule[output].op.reduce_axis
+                schedule[output].reorder(b_ax, y_ax, x_ax, co_ax, kh_ax, kw_ax, ci_ax)
+                intrin, code = _make_conv2d_tensorization(padded_data, kernel)
+
+            elif operator.tag == "depthwise_conv2d_nchw_oihw_dsp":
+                b_ax, y_ax, x_ax, co_ax = schedule[output].op.axis
+                kh_ax, kw_ax = schedule[output].op.reduce_axis
+                schedule[output].reorder(b_ax, co_ax, y_ax, x_ax, kh_ax, kw_ax)
+                intrin, code = _make_depthwise_conv2d_tensorization(padded_data, kernel)
+
+            else:
+                raise ValueError(f"Cannot tensorize {operator.tag} with tensordot!")
+
+            schedule[output].tensorize(kh_ax, intrin)
+            schedule[output].pragma(b_ax, "import_c", code)
+
+    traverse_inline(schedule, outs[-1].op, _callback)
+    return schedule
diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py
index f1c6fb5aa4f4..f6ca03d32742 100644
--- a/python/tvm/topi/utils.py
+++ b/python/tvm/topi/utils.py
@@ -22,9 +22,8 @@
 
 import numpy as np
 import tvm
-from tvm import te
+from tvm import relay, te
 from tvm.tir import bijective_layout, layout
-
 from . import cpp, tag
 
 
@@ -432,6 +431,33 @@ def get_shape(src_shape, src_layout, dst_layout):
     return get_const_tuple(tuple([src_shape[i.value] for i in dst_indices]))
 
 
+def change_constant_shape(src, src_layout, dst_layout):
+    """Makes a copy of a Relay constant, reshaping it to a new data layout.
+
+    Parameter
+    ---------
+    src : relay.Constant
+        The Constant to be reformatted.
+
+    src_layout : str
+        The current layout of the Relay constant. Must be alphabetic (e.g. NHWC
+        or OIHW, but not NCHW2c).
+
+    dst_layout : str
+        The desired layout of new the Relay constant. Must be alphabetic (e.g. NHWC
+        or OIHW, but not NCHW2c).
+
+    Returns
+    -------
+    dst_shape : relay.Constant
+        A copy of the Constant with the new layout.
+    """
+    assert src_layout.isalpha() and dst_layout.isalpha()
+    axis_order = [src_layout.index(c) for c in dst_layout]
+    reshaped = np.transpose(src.data.numpy(), axis_order)
+    return relay.Constant(tvm.nd.array(reshaped))
+
+
 def within_index(b, e, s, i):
     """Return a boolean value that indicates if i is within the given index.
 
diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py b/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py
index f5ae6f51dbd7..f5de3b51b67d 100644
--- a/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py
+++ b/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py
@@ -22,6 +22,7 @@
 from tvm import relay
 from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
 from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
+from tvm.topi.utils import change_constant_shape
 
 
 class BasicConv2dTests:
@@ -61,11 +62,7 @@ def test_conv2d(
         ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))
 
         input1 = relay.var("input", relay.TensorType(ishape, dtype))
-
-        if kernel_layout == "HWOI":
-            weight1 = relay.const(np.moveaxis(weight_data, 2, -1))
-        elif kernel_layout == "HWIO":
-            weight1 = relay.const(weight_data)
+        weight1 = change_constant_shape(weight0, "HWIO", kernel_layout)
 
         out1 = relay.op.nn.conv2d(
             input1,
@@ -150,5 +147,34 @@ class TestConv2d_HWIO(BasicConv2dTests):
     schedule_name = tvm.testing.parameter("conv2d_nhwc_spatial_pack.arm_cpu")
 
 
+class TestConv2d_Tensordot(BasicConv2dTests):
+    data_shape, kernel_size, num_filter, strides, padding = tvm.testing.parameters(
+        # Disabled because these kernels are not an integral number of words
+        # ((1, 32, 32, 1), (3, 3), 12, 1, 0),
+        # ((1, 32, 10, 3), (3, 3), 16, 1, 0),
+        # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1)),
+        ((4, 16, 16, 8), (5, 5), 8, 2, (0, 3, 3, 0)),
+        ((4, 16, 16, 8), (5, 5), 16, 2, (0, 3, 3, 0)),
+        ((4, 16, 16, 8), (5, 5), 8, 2, 0),
+        ((4, 16, 16, 8), (5, 5), 16, 2, 0),
+        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0),
+        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0),
+        ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1)),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0)),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0),
+        ((1, 49, 10, 1), (10, 4), 64, (2, 2), (4, 1, 5, 1)),
+        ((1, 16, 16, 8), (3, 3), 16, 2, (0, 0, 1, 1)),
+        ((1, 16, 16, 8), (3, 3), 16, 2, (1, 1, 2, 2)),
+        ((1, 16, 16, 8), (5, 5), 16, 2, (3, 3, 2, 2)),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0),
+        ((1, 16, 16, 32), (1, 1), 64, 1, 0),
+    )
+    dilation = tvm.testing.parameter(1)
+    dtype = tvm.testing.parameter("int8", "int16", "int32")
+    kernel_layout = tvm.testing.parameter("OHWI")
+    schedule_name = tvm.testing.parameter("conv2d_nhwc_ohwi_dsp.arm_cpu")
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
index 15ea2a31d864..36059c798cbb 100644
--- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
@@ -185,5 +185,32 @@ class TestDepthwiseConv2d_NHWC_HWOI_DSP(BasicDepthwiseConv2dTests):
     schedule_name = tvm.testing.parameter("depthwise_conv2d_nhwc_dsp.arm_cpu")
 
 
+class TestDepthwiseConv2d_Tensordot(BasicDepthwiseConv2dTests):
+    data_shape, kernel_size, num_filter, strides, padding, dtype = tvm.testing.parameters(
+        # Currently, our schedule requires kernel_w be divisible by the number of simd lanes given
+        # its dtype. This means 3x3 and 5x5 kernels do not work on int16 or int8 for now. If you had
+        # to, you could hack around this by padding the data and kernel.
+        ((1, 8, 48, 48), (3, 3), 8, (1, 1), 1, "int32"),
+        ((1, 16, 48, 48), (3, 3), 16, (2, 2), (1, 1, 0, 0), "int32"),
+        ((1, 32, 24, 24), (3, 3), 32, (1, 1), 1, "int32"),
+        ((1, 32, 24, 24), (3, 3), 32, (2, 2), (1, 1, 0, 0), "int32"),
+        ((1, 64, 12, 12), (3, 3), 64, (1, 1), 1, "int32"),
+        ((1, 64, 12, 12), (3, 3), 64, (2, 2), (1, 1, 0, 0), "int32"),
+        ((1, 128, 6, 6), (3, 3), 128, (1, 1), 1, "int32"),
+        ((1, 128, 6, 6), (3, 3), 128, (2, 2), (1, 1, 0, 0), "int32"),
+        ((1, 256, 3, 3), (3, 3), 256, (1, 1), 1, "int32"),
+        ((1, 64, 25, 5), (3, 3), 64, (1, 1), 1, "int32"),
+        ((1, 8, 24, 24), (5, 5), 8, (1, 1), 1, "int32"),
+        ((1, 8, 24, 24), (3, 5), 8, (1, 1), 1, "int32"),
+        # These "evenly divisible" kernels work on smaller dtypes.
+        ((1, 8, 48, 48), (3, 2), 8, 1, 0, "int16"),
+        ((1, 8, 48, 48), (4, 4), 8, 1, 0, "int8"),
+    )
+    dilation = tvm.testing.parameter(1)
+    data_layout = tvm.testing.parameter("NCHW")
+    kernel_layout = tvm.testing.parameter("OIHW")
+    schedule_name = tvm.testing.parameter("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 0b034d77bd611ca7642b7211fa34110a87271f25 Mon Sep 17 00:00:00 2001
From: billishyahao <yahao.he@gmail.com>
Date: Wed, 12 Oct 2022 00:05:30 +0800
Subject: [PATCH 332/704] [Doc] Fix the typo in the debugging doc (#13032)

Fix the typo in the debugging doc
---
 docs/dev/how_to/debugging_tvm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/dev/how_to/debugging_tvm.rst b/docs/dev/how_to/debugging_tvm.rst
index 0ad44fdd17ce..8e3161077053 100644
--- a/docs/dev/how_to/debugging_tvm.rst
+++ b/docs/dev/how_to/debugging_tvm.rst
@@ -17,7 +17,7 @@
 
 .. _debugging-tvm:
 
-Debuggging TVM
+Debugging TVM
 ==============
 
 **NOTE**: This page is a work in-progress. Everyone is welcomed to add suggestions and tips via

From cf96072a049bcf3787cf52f92573a8103883ebe1 Mon Sep 17 00:00:00 2001
From: AndrewZhaoLuo <andrew.zhao.luo@gmail.com>
Date: Tue, 11 Oct 2022 09:40:51 -0700
Subject: [PATCH 333/704] [COMMUNITY] Add andrewzhaoluo key for release 
 (#13025)

As per https://tvm.apache.org/docs/contribute/release_process.html#id3
---
 KEYS | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/KEYS b/KEYS
index 41a620e796bc..9abdcd817a72 100644
--- a/KEYS
+++ b/KEYS
@@ -464,3 +464,62 @@ aLLvg9K8HxeWTvQvowCAyFJo4NfIrS/7gMm5JcWMAqVFJ+IVxZNxZUIYV0VBC/AN
 rSSBN90DWxIgPhlAqgO0ofkbPSVwF/9i7nd3
 =XBuV
 -----END PGP PUBLIC KEY BLOCK-----
+pub   rsa4096 2022-10-10 [SC]
+      1B63BD2FFF5E515DA1BEF393C9A56ABD5CCA3EB8
+uid           [ultimate] Andrew Zhao Luo <andrewzhaoluo@apache.org>
+sig 3        C9A56ABD5CCA3EB8 2022-10-10  Andrew Zhao Luo <andrewzhaoluo@apache.org>
+sub   rsa4096 2022-10-10 [E]
+sig          C9A56ABD5CCA3EB8 2022-10-10  Andrew Zhao Luo <andrewzhaoluo@apache.org>
+
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQINBGNEkfMBEAC4sFi6Msfxtv4pahjdp+nmfwprEemP0inI4yiuT9m5eEzc4/0/
+EwPHw4Kwx4SQypxSJXqMnxSI97w/54LW0Gob9hzRwcCLCe4zPR9YnJQ0JQo5yrjE
+zo9JvgyIGtGhM6rUTSMcCIO3eYb4Ogwe99DoWMz4w2NA9wB3nzkA2zL4VpaM3Ou/
+F54xlMLE7ht0EralHcHZUuSmvpzm43lScE6LwypFecNvfoBdiJJ5rGxbFMKJRGeF
+GGCZPuLy3EBrPbHe8cjWfgBKNj3XTl8B8YO5oEIZZPCpo3ML8DuD1Mf93PCk8Hd/
+yr9U7VMOXrEEZpTjGOEl3oL+5VVUFFBDs2tuxeBMExC90sEkoXDtJCRkSlHHBWZy
+tzTqwN9GcLe1N5YKwEphhnmN7tp5rLEJXdUasJ4RlWQHZqVJddDELLb595FYgZMe
+2dBXKbXrn8NvJBJf5yeLkSJh9gdkdtXwX/YN4D70LYKLz9+XZqhg3iPLAdrY+xVN
+lHtCZDKLSHpNHWPcqnBIcOre6ucBJu52S3ZoVtH/CCQrBkuVpXWSjnL1wCw1Djyx
+cNSVSVR9/yZQTgcWQh1zmErQEC9mmUrhTJ19IJ4bpseWgyhIETzuSYZ0Xm7c3eT9
+FrogP/D/uCWwfb5DJUIIFBh3fkEMCSEpc4TPIjulpJL3i3FaLLN4hYURKwARAQAB
+tCpBbmRyZXcgWmhhbyBMdW8gPGFuZHJld3poYW9sdW9AYXBhY2hlLm9yZz6JAk4E
+EwEKADgWIQQbY70v/15RXaG+85PJpWq9XMo+uAUCY0SR8wIbAwULCQgHAgYVCgkI
+CwIEFgIDAQIeAQIXgAAKCRDJpWq9XMo+uFYoD/4oALh+pdsnFwbyx1ycf3lLExwE
+vMrmr+hMrodQqoqQ7bt+anYzA78v0HD3U7zsLSqhIhYE34Ib3fB5Rv7z6DcNP6pl
+RUH7QU4DOyePRPRx/xYz5R3OkqVKrV7RUdzgXOn+5mujTJiRYzRbNyexg88dJVWK
+eQCiNyW9j8M/+5a/+gWjehxyvSmoSv1fEFUDV7hjIinSApWyMm0Q8tzoqxPmuaTE
+ll15VkgWx2t3bjQtfPCeft1eZ7Tb84k/PRN7JRFVEZYul4MtRSrJTDO1E5ewZ2qH
+PV/2JQPQMFKpaDEMseoGk6/O9I76sLjTIjQ7mfFOnEBiMph2BCF/cpMufxLnE2WC
+GyTCGVB+BPgvQ9kvD6rFTAHyiWetHA5Z0v/TYUYAPYIATk4N0Hop/2fx4NK4vWeX
+ehjvgPzp65vRPAHiIh7JJM4yt95yMdSpo7sUuduefyMf5FgzBpjaXTb2nI5NsUOr
+Ohh6MjaZWt1tZoNj7X81IILJJk4HDkDLpTsi8dDLPRzuHw7iNb9U0bn6cSqFW8JZ
+M+U1t6jpdJ9hEDlBiJPZbH3Ndky+ZyDoQQ6zp2mGbgkrT6soFzIi2zQ55qEpnMNM
+QpxR17BTJAJO6JEPIHhdovU/VDg8ho7blbhNFY/L8o72Q4RAnLW36rRBx+dsExHn
+Gn6OvtU24FhEfPlWyrkCDQRjRJHzARAAvdG8QPkyHtnV4SyAgaMp6lIm31OglXQO
+LFue4Xnv/UsUzXY8am281dnF7IbccnmxFxxlJq32lIav+L77I5wQUd/DuY3zj37b
+RddyskOuK7m1skMXnBgJFUlfwE9H6ypr+HPy05VAnp5zsqelXhvIoJmioTFysmgi
+IFZTUfV9RPp6ohO18r4Vdgyn0a/p+hCoNuxdjlZUSZ4WgY3b+11d+wcudUu2zfwc
+LSuXpsp30+tox5vcn82fANux0fnxbpc8Ic00XlEQCeUphF9NxhBPGnPRQV12rBpT
+eo+bOUp2UN3dEPgnYGWLBt8uuxVOr5XE1AwwlIokSdoS8zGVR8JPk+32PEW07Q4R
+8t0J/MFacFlvHHpWkBBStXU2pzzLs+AX5qO7s6XekqpXdb261vSEd86jH6ndqIo0
+KSSPlUmBi4FAKHKZIUhdSM0waR9CJQfYUWGqXLJpaKqKTojqIuXQWh4S343H9IRg
+n5nbihuiko8UrrzofNBb0TXfPOnYYjCB3cFTVQzIFl05aNGs5HQGLX0wbqD7+kfP
+m79b6p5SWLoNLmGNLj0dDcBelw+nAPhbOIn1rohwdPPJt5gU05BPv++X6CzmqFEA
+pVx01HXnbX62P2HT2V5YavLPw/R0FrXOB4ZWKH/tg+BPMBqS+E5eifadvVvKH/8w
+rc0Q1UwYxB8AEQEAAYkCNgQYAQoAIBYhBBtjvS//XlFdob7zk8mlar1cyj64BQJj
+RJHzAhsMAAoJEMmlar1cyj64hjMP/juNX8sFXlNCyR/HHKHwpfzn+nj6vVz3RgJi
+OFhf7HYAKh37yMizF3pN7ueyV55BBiiISQNbxf5eLh6yCJ2NGkun+mTKPow5CAyB
+yFS/z6zmlGduL+L8flI0Pao0UJgryhDUYkNrR5/PkZ4ksPKyI3sLlaoOPvIQAlk2
+aw1BI8RzTo05Y9OHralpFV0Nvufjvc9R0Q0934216M7NNK8nUSxXWeztM0yBHEIi
+V+/XY821F+yO2aBhHqnpQeJ1+6bc3UB7sbt8xA91rJ40Kw7TS4FGbTzQyXKRBMKY
+LoZVF61lRUoAFY4Fh+dRKAEel8ZnBhyHEyh5NCUWkHJNWxpPnl/XIVJZ3BbFbtfT
+W/CeWBEAkrJnCl5CfpXUyZRWYk2uwR1tA7apV+zJpaPwojnY5s+2IhMPrTdkxsNR
+zA4jpYkRVEwqy4LuLLbiVnTPba6y8DBiQ4by1m1CKJJJ09BMUKff5v1xSerONLBM
+uEKTrz3MJLLh1sZWkTO04K2VarbWoCygydcrxc9PNOuISq2mn+g2kzVhnUG45YnQ
+RRveMKZ6+uqGzsSYwp+lHNNso0ey94qgwy4qubT++rLZZ5eVqBSUWCsGoEayDBOQ
+v9YZLKL6qfuWuYN7rDdY1c82kPPmjaSkpXiPP7q6v8vUOGhnMFOAUNfxwpXP5Hs9
+/lFRrVmO
+=rAtV
+-----END PGP PUBLIC KEY BLOCK-----

From afeab6e8a320b3a4aedbbe0b25886501146c1942 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Tue, 11 Oct 2022 09:44:17 -0700
Subject: [PATCH 334/704] [TVMScript] AST, Source and diagnostics for Parser
 (#12978)

This PR introduces AST, Source and diagnostics for Parser
---
 python/tvm/script/_parser/__init__.py         |  18 +
 python/tvm/script/_parser/_core.py            |  19 +
 python/tvm/script/_parser/core/__init__.py    |  18 +
 python/tvm/script/_parser/core/diagnostics.py | 241 +++++++++
 python/tvm/script/_parser/core/doc.py         | 462 ++++++++++++++++++
 .../{printer => _parser/core}/doc_core.py     |   0
 python/tvm/script/_parser/core/utils.py       |  61 +++
 .../unittest/test_tvmscript_parser_source.py  |  86 ++++
 8 files changed, 905 insertions(+)
 create mode 100644 python/tvm/script/_parser/__init__.py
 create mode 100644 python/tvm/script/_parser/_core.py
 create mode 100644 python/tvm/script/_parser/core/__init__.py
 create mode 100644 python/tvm/script/_parser/core/diagnostics.py
 create mode 100644 python/tvm/script/_parser/core/doc.py
 rename python/tvm/script/{printer => _parser/core}/doc_core.py (100%)
 create mode 100644 python/tvm/script/_parser/core/utils.py
 create mode 100644 tests/python/unittest/test_tvmscript_parser_source.py

diff --git a/python/tvm/script/_parser/__init__.py b/python/tvm/script/_parser/__init__.py
new file mode 100644
index 000000000000..d885b405257b
--- /dev/null
+++ b/python/tvm/script/_parser/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the Licens.
+"""The parser"""
+from . import _core
diff --git a/python/tvm/script/_parser/_core.py b/python/tvm/script/_parser/_core.py
new file mode 100644
index 000000000000..a2dcc5b531dc
--- /dev/null
+++ b/python/tvm/script/_parser/_core.py
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the Licens.
+"""The core parser infra"""
+# pylint: disable=unused-import
+from .core import doc, utils
diff --git a/python/tvm/script/_parser/core/__init__.py b/python/tvm/script/_parser/core/__init__.py
new file mode 100644
index 000000000000..ae1521006d9b
--- /dev/null
+++ b/python/tvm/script/_parser/core/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The core parser infra"""
+from . import diagnostics, doc, doc_core, utils
diff --git a/python/tvm/script/_parser/core/diagnostics.py b/python/tvm/script/_parser/core/diagnostics.py
new file mode 100644
index 000000000000..b077d221424c
--- /dev/null
+++ b/python/tvm/script/_parser/core/diagnostics.py
@@ -0,0 +1,241 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""TVM Script Parser Source and diagnostics"""
+
+import inspect
+import re
+import sys
+from typing import Union
+
+from tvm.ir import IRModule, SourceName, Span, diagnostics
+
+from . import doc
+
+
+class Source:
+    """Source code class for TVMScript.
+
+    It is constructed by source code str or doc AST tree.
+
+    Parameters
+    ----------
+    source_name : str
+        The filename of the file where the source code locates.
+
+    start_line : int
+        The first line number of the source code.
+
+    start_column : int
+        The first column number of the first line of the source code.
+
+    source : str
+        The source code str of source code.
+
+    full_source : str
+        The complete source code of the file where the source code locates.
+    """
+
+    source_name: str
+    start_line: int
+    start_column: int
+    source: str
+    full_source: str
+
+    def __init__(self, program: Union[str, doc.AST]):
+        if isinstance(program, str):
+            self.source_name = "<str>"
+            self.start_line = 1
+            self.start_column = 0
+            self.source = program
+            self.full_source = program
+            return
+
+        self.source_name = inspect.getsourcefile(program)  # type: ignore
+        lines, self.start_line = getsourcelines(program)  # type: ignore
+        if lines:
+            self.start_column = len(lines[0]) - len(lines[0].lstrip())
+        else:
+            self.start_column = 0
+        if self.start_column and lines:
+            self.source = "\n".join([l[self.start_column :].rstrip() for l in lines])
+        else:
+            self.source = "".join(lines)
+        try:
+            # It will cause a problem when running in Jupyter Notebook.
+            # `mod` will be <module '__main__'>, which is a built-in module
+            # and `getsource` will throw a TypeError
+            mod = inspect.getmodule(program)
+            if mod:
+                self.full_source = inspect.getsource(mod)
+            else:
+                self.full_source = self.source
+        except TypeError:
+            # It's a work around for Jupyter problem.
+            # Since `findsource` is an internal API of inspect, we just use it
+            # as a fallback method.
+            src, _ = inspect.findsource(program)  # type: ignore
+            self.full_source = "".join(src)
+
+    def as_ast(self) -> doc.AST:
+        """Parse the source code into AST.
+
+        Returns
+        -------
+        res : doc.AST
+            The AST of source code.
+        """
+        return doc.parse(self.source)
+
+
+_getfile = inspect.getfile  # pylint: disable=invalid-name
+_findsource = inspect.findsource  # pylint: disable=invalid-name
+
+
+def _patched_inspect_getfile(obj):
+    """Work out which source or compiled file an object was defined in."""
+    if not inspect.isclass(obj):
+        return _getfile(obj)
+    mod = getattr(obj, "__module__", None)
+    if mod is not None:
+        file = getattr(sys.modules[mod], "__file__", None)
+        if file is not None:
+            return file
+    for _, member in inspect.getmembers(obj):
+        if inspect.isfunction(member):
+            if obj.__qualname__ + "." + member.__name__ == member.__qualname__:
+                return inspect.getfile(member)
+    raise TypeError(f"Source for {obj:!r} not found")
+
+
+def findsource(obj):
+    """Return the entire source file and starting line number for an object."""
+    import linecache  # pylint: disable=import-outside-toplevel
+
+    if not inspect.isclass(obj):
+        return _findsource(obj)
+
+    file = inspect.getsourcefile(obj)
+    if file:
+        linecache.checkcache(file)
+    else:
+        file = inspect.getfile(obj)
+        if not (file.startswith("<") and file.endswith(">")):
+            raise OSError("source code not available")
+
+    module = inspect.getmodule(obj, file)
+    if module:
+        lines = linecache.getlines(file, module.__dict__)
+    else:
+        lines = linecache.getlines(file)
+    if not lines:
+        raise OSError("could not get source code")
+    qual_names = obj.__qualname__.replace(".<locals>", "<locals>").split(".")
+    pattern_list = []
+    for name in qual_names:
+        if name.endswith("<locals>"):
+            pattern_list.append(re.compile(r"^(\s*)def\s*" + name[:-8] + r"\b"))
+        else:
+            pattern_list.append(re.compile(r"^(\s*)class\s*" + name + r"\b"))
+    for i, line in enumerate(lines):
+        match = pattern_list[0].match(line)
+        if match:
+            pattern_list.pop(0)
+        if not pattern_list:
+            return lines, i
+    raise OSError("could not find class definition")
+
+
+def getsourcelines(obj):
+    """Extract the block of code at the top of the given list of lines."""
+    obj = inspect.unwrap(obj)
+    lines, l_num = findsource(obj)
+    return inspect.getblock(lines[l_num:]), l_num + 1
+
+
+inspect.getfile = _patched_inspect_getfile
+
+
+class Diagnostics:
+    """Diagnostics class for error reporting in parser.
+
+    Parameters
+    ----------
+    source : Source
+        The source code.
+
+    ctx : diagnostics.DiagnosticContext
+        The diagnostic context for diagnostics.
+    """
+
+    source: Source
+    ctx: diagnostics.DiagnosticContext
+
+    def __init__(self, source: Source):
+        mod = IRModule()
+        mod.source_map.add(source.source_name, source.full_source)
+        self.source = source
+        self.ctx = diagnostics.DiagnosticContext(mod, diagnostics.get_renderer())
+
+    def _emit(self, node: doc.AST, message: str, level: diagnostics.DiagnosticLevel) -> None:
+        """Emit a diagnostic.
+
+        Parameters
+        ----------
+        node : doc.AST
+            The node with diagnostic information.
+
+        message : str
+            The diagnostic message.
+
+        level : diagnostics.DiagnosticLevel
+            The diagnostic level.
+        """
+        lineno = node.lineno or self.source.start_line
+        col_offset = node.col_offset or self.source.start_column
+        end_lineno = node.end_lineno or lineno
+        end_col_offset = node.end_col_offset or col_offset
+        lineno += self.source.start_line - 1
+        end_lineno += self.source.start_line - 1
+        col_offset += self.source.start_column + 1
+        end_col_offset += self.source.start_column + 1
+        self.ctx.emit(
+            diagnostics.Diagnostic(
+                level=level,
+                span=Span(
+                    source_name=SourceName(self.source.source_name),
+                    line=lineno,
+                    end_line=end_lineno,
+                    column=col_offset,
+                    end_column=end_col_offset,
+                ),
+                message=message,
+            )
+        )
+
+    def error(self, node: doc.AST, message: str) -> None:
+        """Emit a diagnostic error.
+
+        Parameters
+        ----------
+        node : doc.AST
+            The node with diagnostic error.
+
+        message : str
+            The diagnostic message.
+        """
+        self._emit(node, message, diagnostics.DiagnosticLevel.ERROR)
+        self.ctx.render()
diff --git a/python/tvm/script/_parser/core/doc.py b/python/tvm/script/_parser/core/doc.py
new file mode 100644
index 000000000000..5ea83749eadf
--- /dev/null
+++ b/python/tvm/script/_parser/core/doc.py
@@ -0,0 +1,462 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""TVM Script Parser doc AST"""
+
+import ast
+import inspect
+import sys
+import typing
+from collections import defaultdict
+
+from . import doc_core as doc
+from .doc_core import *  # pylint: disable=unused-import,wildcard-import,redefined-builtin,W0614
+
+FnToDoc = typing.Callable[[ast.AST], doc.AST]
+FnFromDoc = typing.Callable[[doc.AST], ast.AST]
+
+
+class Entry:
+    """Mapping entry between python AST node type str and doc AST.
+
+    Parameters
+    ----------
+    to_doc : typing.Optional[FnToDoc]
+        The callable methods for converting python AST node to doc AST.
+
+    from_doc : typing.Optional[FnFromDoc]
+        The callable methods for converting doc AST to python AST node.
+    """
+
+    to_doc: typing.Optional[FnToDoc]
+    from_doc: typing.Optional[FnFromDoc]
+
+    def __init__(self):
+        self.to_doc = None
+        self.from_doc = None
+
+
+class Registry:
+    """Registration map from python AST node type str to methods of conversion
+    between python AST node and doc AST node.
+
+    Parameters
+    ----------
+    _inst : typing.Optional["Registry"]
+        The instance of Registry.
+
+    table : typing.Dict[str, Entry]
+        The registration map from python AST node type str to methods of conversion
+        between python AST node and doc AST node.
+    """
+
+    _inst: typing.Optional["Registry"] = None
+    table: typing.Dict[str, Entry]
+
+    def __init__(self):
+        self.table = defaultdict(Entry)
+
+
+def register_to_doc(name: str):
+    """Register the to_doc method for python AST node type.
+
+    Parameters
+    ----------
+    name : str
+        The type of python AST node.
+
+    Returns
+    -------
+    f : Callable[[FnToDoc], None]
+        The function of registering the to_doc method for python AST node type.
+    """
+
+    def f(to_doc: FnToDoc):  # pylint: disable=redefined-outer-name
+        reg = Registry._inst  # pylint: disable=protected-access
+        reg.table[name].to_doc = to_doc
+
+    return f
+
+
+def register_from_doc(name: str):
+    """Register the from_doc method for python AST node type.
+
+    Parameters
+    ----------
+    name : str
+        The type of python AST node.
+
+    Returns
+    -------
+    f : Callable[[FnFromDoc], None]
+        The function of registering the from_doc method for python AST node type.
+    """
+
+    def f(to_doc: FnFromDoc):  # pylint: disable=redefined-outer-name
+        reg = Registry._inst  # pylint: disable=protected-access
+        reg.table[name].from_doc = to_doc
+
+    return f
+
+
+def _is_atomic_type(node):
+    return (
+        node is None
+        or node in [..., True, False]
+        or isinstance(
+            node,
+            (
+                int,
+                float,
+                str,
+                bool,
+                bytes,
+                complex,
+            ),
+        )
+    )
+
+
+def _get_registry_entry(cls_name, attr):
+    cls_name = cls_name.split(".")[-1]
+    reg = Registry._inst  # pylint: disable=protected-access
+    if cls_name in reg.table:
+        entry = reg.table[cls_name]
+        return getattr(entry, attr, None)
+    return None
+
+
+def from_doc(node):
+    """Get original python AST node from doc AST node.
+
+    Parameters
+    ----------
+    node : doc.AST
+        The doc AST node.
+
+    Returns
+    -------
+    res : ast.AST
+        The corresponding AST node.
+    """
+    if _is_atomic_type(node):
+        return node
+    if isinstance(node, tuple):
+        return tuple(from_doc(n) for n in node)
+    if isinstance(node, list):
+        return [from_doc(n) for n in node]
+    func = _get_registry_entry(node.__class__.__name__, "from_doc")
+    if not func:
+        raise NotImplementedError(f"from_doc is not implemented for: {node.__class__.__name__}")
+    return func(node)
+
+
+def to_doc(node):
+    """Get doc AST node from python AST node.
+
+    Parameters
+    ----------
+    node : ast.AST
+        The AST node.
+
+    Returns
+    -------
+    res : doc.AST
+        The corresponding doc AST node.
+    """
+    if _is_atomic_type(node):
+        return node
+    if isinstance(node, tuple):
+        return tuple(to_doc(n) for n in node)
+    if isinstance(node, list):
+        return [to_doc(n) for n in node]
+    func = _get_registry_entry(node.__class__.__name__, "to_doc")
+    if not func:
+        raise NotImplementedError(f"to_doc is not implemented for: {node.__class__.__name__}")
+    return func(node)
+
+
+def parse(
+    source: str,
+    filename: str = "<unknown>",
+    mode: str = "exec",
+) -> doc.AST:
+    """Parse TVMScript source code str to doc AST.
+
+    Its interface is consistent with python built-in ast.parse.
+    And it will parse by python 3.8 first if possible,
+    or it will parse with python version in current environment.
+
+    Parameters
+    ----------
+    source : str
+        The TVMScript source code.
+
+    filename : str
+        The optional filename of the file where source code locates.
+
+    mode : str
+        The parsing mode for ast.parse.
+
+    Returns
+    -------
+    res : doc.AST
+        The parsed doc AST.
+    """
+    try:
+        program = ast.parse(  # pylint: disable=unexpected-keyword-arg
+            source=source,
+            filename=filename,
+            mode=mode,
+            feature_version=(3, 8),
+        )
+    except:  # pylint: disable=bare-except
+        program = ast.parse(
+            source=source,
+            filename=filename,
+            mode=mode,
+        )
+    return to_doc(program)
+
+
+class NodeVisitor:
+    """Node visitor for doc AST"""
+
+    def visit(self, node: doc.AST) -> None:
+        if isinstance(node, (list, tuple)):
+            for item in node:
+                self.visit(item)
+            return
+        if not isinstance(node, doc.AST):
+            return
+        getattr(
+            self,
+            "visit_" + node.__class__.__name__.split(".")[-1],
+            self.generic_visit,
+        )(node)
+
+    def generic_visit(self, node: doc.AST) -> None:
+        for field in node.__class__._FIELDS:  # pylint: disable=protected-access
+            value = getattr(node, field, None)
+            if value is None:
+                pass
+            elif isinstance(value, (doc.AST, list, tuple)):
+                self.visit(value)
+
+
+class NodeTransformer:
+    """Node transformer for doc AST"""
+
+    def visit(self, node: doc.AST) -> doc.AST:
+        if isinstance(node, list):
+            return [self.visit(item) for item in node]
+        if isinstance(node, tuple):
+            return tuple(self.visit(item) for item in node)
+        if not isinstance(node, doc.AST):
+            return node
+        return getattr(
+            self,
+            "visit_" + node.__class__.__name__.split(".")[-1],
+            self.generic_visit,
+        )(node)
+
+    def generic_visit(self, node: doc.AST) -> doc.AST:
+        kv: typing.Dict[str, typing.Any] = {}
+        for field in node.__class__._FIELDS:  # pylint: disable=protected-access
+            value = getattr(node, field, None)
+            if value is None:
+                pass
+            elif isinstance(value, (doc.AST, list, tuple)):
+                value = self.visit(value)
+            kv[field] = value
+        return node.__class__(**kv)
+
+
+def _register_default():
+    class DefaultTranslator:
+        def __init__(self, doc_cls, func, fields):
+            self.doc_cls = doc_cls  # getattr(doc, name)
+            self.func = func
+            self.fields = fields
+
+        def __call__(self, node):
+            kv = {attr: self.func(getattr(node, attr, None)) for attr in self.fields}
+            return self.doc_cls(**kv)
+
+    Registry._inst = Registry()  # pylint: disable=protected-access
+    for cls_name in dir(doc):
+        doc_cls = getattr(doc, cls_name)
+        if not hasattr(ast, cls_name):
+            continue
+        if inspect.isclass(doc_cls) and issubclass(doc_cls, doc.AST):
+            assert "." not in cls_name
+            register_to_doc(cls_name)(
+                DefaultTranslator(
+                    getattr(doc, cls_name),
+                    to_doc,
+                    doc_cls._FIELDS,  # pylint: disable=protected-access
+                )
+            )
+            register_from_doc(cls_name)(
+                DefaultTranslator(
+                    getattr(ast, cls_name),
+                    from_doc,
+                    doc_cls._FIELDS,  # pylint: disable=protected-access
+                )
+            )
+
+
+def _py_version() -> typing.Tuple[int, int]:
+    return (sys.version_info.major, sys.version_info.minor)
+
+
+def _register_constant_handling():
+    if _py_version() not in [(3, 6), (3, 7)]:
+        return
+
+    def as_constant(f) -> doc.Constant:
+        def to_doc_func(x: ast.AST) -> doc.Constant:
+            return doc.Constant(
+                value=getattr(x, f) if isinstance(f, str) else f(x),
+                kind=None,
+                s=None,
+                n=None,
+                lineno=x.lineno,
+                col_offset=x.col_offset,
+                end_lineno=x.lineno,
+                end_col_offset=x.col_offset,
+            )
+
+        return to_doc_func
+
+    register_to_doc("Str")(as_constant("s"))
+    register_to_doc("NameConstant")(as_constant("value"))
+    register_to_doc("Num")(as_constant("n"))
+    register_to_doc("Bytes")(as_constant("s"))
+    register_to_doc("Ellipsis")(as_constant(lambda _: ...))
+
+
+def _register_subscription_handling():
+    if _py_version() >= (3, 9):
+        return
+
+    def subscript_to_doc(x: ast.Subscript) -> doc.Subscript:
+        if isinstance(x.slice, ast.Slice):
+            return doc.Subscript(
+                value=to_doc(x.value),
+                slice=doc.Slice(
+                    lower=to_doc(x.slice.lower),
+                    upper=to_doc(x.slice.upper),
+                    step=to_doc(x.slice.step),
+                    lineno=getattr(x.slice, "lineno", None),
+                    col_offset=getattr(x.slice, "col_offset", None),
+                    end_lineno=getattr(x.slice, "end_lineno", None),
+                    end_col_offset=getattr(x.slice, "end_col_offset", None),
+                ),
+                ctx=to_doc(x.ctx),
+                lineno=getattr(x, "lineno", None),
+                col_offset=getattr(x, "col_offset", None),
+                end_lineno=getattr(x, "end_lineno", None),
+                end_col_offset=getattr(x, "end_col_offset", None),
+            )
+        if isinstance(x.slice, ast.ExtSlice):
+            return doc.Subscript(
+                value=to_doc(x.value),
+                slice=doc.Tuple(
+                    elts=[to_doc(i) for i in x.slice.dims],
+                    ctx=doc.Load(
+                        lineno=None,
+                        col_offset=None,
+                        end_lineno=None,
+                        end_col_offset=None,
+                    ),
+                    lineno=getattr(x, "lineno", None),
+                    col_offset=getattr(x, "col_offset", None),
+                    end_lineno=getattr(x, "end_lineno", None),
+                    end_col_offset=getattr(x, "end_col_offset", None),
+                ),
+                ctx=to_doc(x.ctx),
+                lineno=getattr(x, "lineno", None),
+                col_offset=getattr(x, "col_offset", None),
+                end_lineno=getattr(x, "end_lineno", None),
+                end_col_offset=getattr(x, "end_col_offset", None),
+            )
+        if isinstance(x.slice, ast.Index):
+            return doc.Subscript(
+                value=to_doc(x.value),
+                slice=to_doc(x.slice.value),
+                ctx=to_doc(x.ctx),
+                lineno=getattr(x, "lineno", None),
+                col_offset=getattr(x, "col_offset", None),
+                end_lineno=getattr(x, "end_lineno", None),
+                end_col_offset=getattr(x, "end_col_offset", None),
+            )
+        raise TypeError(f"Unknown subscript type: {type(x.slice)}")
+
+    def subscript_from_doc(x: doc.Subscript) -> ast.Subscript:
+        if isinstance(x.slice, doc.Slice):
+            result = ast.Subscript(
+                value=from_doc(x.value),
+                slice=from_doc(x.slice),
+                ctx=from_doc(x.ctx),
+            )
+        elif isinstance(x.slice, doc.Tuple):
+            result = ast.Subscript(
+                value=from_doc(x.value),
+                slice=ast.ExtSlice(
+                    dims=[from_doc(i) for i in x.slice.elts],
+                ),
+                ctx=from_doc(x.ctx),
+            )
+        else:
+            result = ast.Subscript(
+                value=from_doc(x.value),
+                slice=ast.Index(value=from_doc(x.slice)),
+                ctx=from_doc(x.ctx),
+            )
+        result.lineno = x.lineno
+        result.col_offset = x.col_offset
+        result.end_lineno = x.end_lineno
+        result.end_col_offset = x.end_col_offset
+        return result
+
+    register_to_doc("Subscript")(subscript_to_doc)
+    register_from_doc("Subscript")(subscript_from_doc)
+
+
+def _register_index_handling():
+    if _py_version() >= (3, 9):
+        return
+
+    def index_to_doc(x: ast.Index) -> doc.Expr:
+        return to_doc(x.value)
+
+    def index_from_doc(x: doc.Expr) -> ast.Index:
+        result = ast.Index(value=from_doc(x), ctx=from_doc(x.ctx))
+        result.lineno = x.lineno
+        result.col_offset = x.col_offset
+        result.end_lineno = x.end_lineno
+        result.end_col_offset = x.end_col_offset
+        return result
+
+    register_to_doc("Index")(index_to_doc)
+    register_from_doc("Index")(index_from_doc)
+
+
+_register_default()
+_register_constant_handling()
+_register_subscription_handling()
+_register_index_handling()
diff --git a/python/tvm/script/printer/doc_core.py b/python/tvm/script/_parser/core/doc_core.py
similarity index 100%
rename from python/tvm/script/printer/doc_core.py
rename to python/tvm/script/_parser/core/doc_core.py
diff --git a/python/tvm/script/_parser/core/utils.py b/python/tvm/script/_parser/core/utils.py
new file mode 100644
index 000000000000..65e7166bfcc2
--- /dev/null
+++ b/python/tvm/script/_parser/core/utils.py
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""TVM Script Parser utils"""
+
+import inspect
+from typing import Any, Callable, Dict
+
+
+def inspect_function_capture(func: Callable) -> Dict[str, Any]:
+    """Capture function non-locals and global variables.
+
+    Parameters
+    ----------
+    func : Callable
+        The function to inspect.
+
+    Returns
+    -------
+    res : Dict[str, Any]
+        The function variables map with non-local or global variables.
+    """
+    captured = {
+        **inspect.getclosurevars(func).nonlocals,
+        **func.__globals__,  # type: ignore
+    }
+    return captured
+
+
+def inspect_class_capture(cls: type) -> Dict[str, Any]:
+    """Capture class non-locals and global variables.
+
+    Parameters
+    ----------
+    cls : type
+        The class to inspect.
+
+    Returns
+    -------
+    res : Dict[str, Any]
+        The class variables map with non-local or global variables.
+    """
+    result: Dict[str, Any] = {}
+    for _, v in cls.__dict__.items():
+        if inspect.isfunction(v):
+            func_vars = inspect_function_capture(v)
+            result.update(**func_vars)
+    return result
diff --git a/tests/python/unittest/test_tvmscript_parser_source.py b/tests/python/unittest/test_tvmscript_parser_source.py
new file mode 100644
index 000000000000..cb93a2dcf62b
--- /dev/null
+++ b/tests/python/unittest/test_tvmscript_parser_source.py
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Unittests for tvm.script.parser.core"""
+import pytest
+import inspect
+import tvm.testing
+from tvm.script._parser.core.diagnostics import Source
+from tvm.script._parser.core import doc_core as doc
+from tvm.script import tir as T
+
+
+def matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, [128, 128])
+    B = T.match_buffer(b, [128, 128])
+    C = T.match_buffer(c, [128, 128])
+    for i, j, k in T.grid(128, 128, 128):
+        with T.block("update"):
+            vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+            C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]
+
+
+def test_source_base():
+    source = Source(matmul)
+    assert (
+        source.source_name == inspect.getsourcefile(matmul)
+        and source.start_line is not None
+        and source.start_column == 0
+        and source.source == inspect.getsource(matmul)
+        and source.full_source == inspect.getsource(inspect.getmodule(matmul))
+    )
+
+
+def test_source_ast():
+    source = Source(matmul)
+    mod = source.as_ast()
+    assert isinstance(mod, doc.Module)
+    func_def = mod.body[0]
+    assert isinstance(func_def, doc.FunctionDef)
+    assert func_def.name == "matmul"
+    func_args = func_def.args
+    assert (
+        len(func_args.args) == 3
+        and func_args.args[0].arg == "a"
+        and func_args.args[1].arg == "b"
+        and func_args.args[2].arg == "c"
+    )
+    func_body = func_def.body
+    assert len(func_body) == 4
+    func_assigns = func_body[:3]
+    assert (
+        isinstance(func_assigns[0], doc.Assign)
+        and func_assigns[0].targets[0].id == "A"
+        and isinstance(func_assigns[1], doc.Assign)
+        and func_assigns[1].targets[0].id == "B"
+        and isinstance(func_assigns[2], doc.Assign)
+        and func_assigns[2].targets[0].id == "C"
+    )
+    func_for = func_body[3]
+    assert (
+        len(func_for.target.elts) == 3
+        and func_for.target.elts[0].id == "i"
+        and func_for.target.elts[1].id == "j"
+        and func_for.target.elts[2].id == "k"
+    )
+    for_body = func_for.body
+    assert len(for_body) == 1
+    for_block = for_body[0]
+    assert isinstance(for_block, doc.With) and len(for_block.body) == 2
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From cd077575f365cc2e7fb20b263d7e533b202518e4 Mon Sep 17 00:00:00 2001
From: AndrewZhaoLuo <andrew.zhao.luo@gmail.com>
Date: Tue, 11 Oct 2022 11:55:27 -0700
Subject: [PATCH 335/704] Update to 0.11.dev0 (#13036)

Bumps up version in main branch to 0.11.dev0 given we are finalizing 0.10.0 release: #13026
---
 conda/recipe/meta.yaml              | 2 +-
 docs/conf.py                        | 2 +-
 include/tvm/runtime/c_runtime_api.h | 2 +-
 python/tvm/_ffi/libinfo.py          | 2 +-
 version.py                          | 2 +-
 web/package.json                    | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml
index 22a512493478..519b84c570d7 100644
--- a/conda/recipe/meta.yaml
+++ b/conda/recipe/meta.yaml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-{% set version = '0.10.dev0' %}
+{% set version = '0.11.dev0' %}
 {% set pkg_name = 'tvm' %}
 {% set cuda_tag = cuda_version | replace('.', '') %} # [cuda]
 {% set pkg_name = pkg_name + '-cu' + cuda_tag %} # [cuda]
diff --git a/docs/conf.py b/docs/conf.py
index d645958ca6db..0767ccf82e70 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -427,7 +427,7 @@ def force_gc(gallery_conf, fname):
     "header_dropdown": header_dropdown,
     "header_logo": header_logo,
     "header_logo_link": header_logo_link,
-    "version_prefixes": ["main", "v0.8.0/", "v0.9.0/"],
+    "version_prefixes": ["main", "v0.8.0/", "v0.9.0/", "v0.10.0/"],
 }
 
 # add additional overrides
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 8acc298e7890..03c662ca1922 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -66,7 +66,7 @@
 #endif
 
 // TVM version
-#define TVM_VERSION "0.10.dev0"
+#define TVM_VERSION "0.11.dev0"
 
 // TVM Runtime is DLPack compatible.
 #include <dlpack/dlpack.h>
diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py
index 44ce10bd5a2f..fdd888a470b9 100644
--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -220,4 +220,4 @@ def find_include_path(name=None, search_path=None, optional=False):
 # We use the version of the incoming release for code
 # that is under development.
 # The following line is set by tvm/python/update_version.py
-__version__ = "0.10.dev0"
+__version__ = "0.11.dev0"
diff --git a/version.py b/version.py
index c9bf76c784df..6381c0a856a9 100644
--- a/version.py
+++ b/version.py
@@ -44,7 +44,7 @@
 # Two tag formats are supported:
 # - vMAJ.MIN.PATCH (e.g. v0.8.0) or
 # - vMAJ.MIN.devN (e.g. v0.8.dev0)
-__version__ = "0.10.dev0"
+__version__ = "0.11.dev0"
 
 # ---------------------------------------------------
 
diff --git a/web/package.json b/web/package.json
index fb36c0e35d34..825056a8e37b 100644
--- a/web/package.json
+++ b/web/package.json
@@ -2,7 +2,7 @@
   "name": "tvmjs",
   "displayName": "TVM Wasm JS runtime",
   "license": "Apache-2.0",
-  "version": "0.10.0-dev0",
+  "version": "0.11.0-dev0",
   "scripts": {
     "prepwasm": "make && python3 tests/python/prepare_test_libs.py",
     "build": "tsc -b && make rmtypedep",
@@ -29,4 +29,4 @@
     "typescript": "^3.8.3",
     "ws": "^7.2.5"
   }
-}
+}
\ No newline at end of file

From df0ffc3b50921d938ee27ef6ae3914e37e2c8584 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Tue, 11 Oct 2022 12:30:18 -0700
Subject: [PATCH 336/704] [FIX,LOWERING] Add attrs from Relay Functions to
 PrimFuncs in Metaschedule Lowering (#13034)

Attrs were not propogated from Relay functions to the corresponding
PrimFunc when lowering with MetaSchedule enabled. These attrs are not
seperately copied in the MetaSchedule lowering flow.
---
 src/relay/backend/te_compiler_cache.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index ad99cb41aa8e..a1a4bedfb8b0 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -528,6 +528,9 @@ class ScheduleBuilder : public ExprVisitor {
             ICHECK_EQ(mod->functions.size(), 1);
             mod = tir::transform::RemoveWeightLayoutRewriteBlock()(std::move(mod));
             prim_func = Downcast<PrimFunc>(mod->Lookup("main"));
+            // Need to copy attrs from relay function over to prim func. Most notably the structural
+            // hash.
+            prim_func = WithAttrs(prim_func, relay_func->attrs->dict);
           } else {
             int dispatch = backend::UseMetaScheduleDispatch();
             // (dispatch & 2): controls whether to print TVMScript for missing TIR

From 05d30f250295b5dead90c16e7858e157abf89eea Mon Sep 17 00:00:00 2001
From: AndrewZhaoLuo <andrew.zhao.luo@gmail.com>
Date: Tue, 11 Oct 2022 12:55:58 -0700
Subject: [PATCH 337/704] [Docs] Update Release Process Docs -- Add Ver. Bump
 Step (#13039)

Add a step to bump the version numbers on main.
---
 docs/contribute/release_process.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/contribute/release_process.rst b/docs/contribute/release_process.rst
index 685b56cf1310..a0ebda650524 100644
--- a/docs/contribute/release_process.rst
+++ b/docs/contribute/release_process.rst
@@ -145,6 +145,13 @@ Create GPG signature as well as the hash of the file,
 	shasum -a 512 apache-tvm-src-v0.6.0.rc0.tar.gz > apache-tvm-src-v0.6.0.rc0.tar.gz.sha512
 
 
+Update TVM Version on Main 
+--------------------------
+
+After cutting a release candidate, make sure to update the version numbers throughout `main`. For example if we are 
+releasing `v0.10.0` we want to bump the version numbers throughout the codebase from `v0.10.dev0` to `v0.11.dev0`. An 
+example of how to do this can be found here: `https://github.com/apache/tvm/pull/12190 <https://github.com/apache/tvm/pull/12190>`_. 
+
 Upload the Release Candidate
 ----------------------------
 

From 4f975c7506f4b8bd3f42d3f3e0f29ed9b3dbb8f4 Mon Sep 17 00:00:00 2001
From: Janet Schneider <janetsc@octoml.ai>
Date: Tue, 11 Oct 2022 16:47:50 -0400
Subject: [PATCH 338/704] [Hexagon] [runtime] Support VTCM alignments of 128 or
 2k (#12999)

In order to simplify the dynamic VTCM memory pool, it will support the following alignments:

2k boundaries, allocated from the front of the pool in the smallest open spot possible.
128 byte boundaries, allocated from the end of the pool.

Made this support explicit, modified test to hit these boundaries.
---
 src/runtime/hexagon/hexagon_buffer.cc         | 22 +++--
 src/runtime/hexagon/hexagon_vtcm_pool.cc      |  1 +
 .../hexagon/hexagon_vtcm_pool_tests.cc        | 95 +++++++++++++++----
 3 files changed, 89 insertions(+), 29 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc
index 861a8d9f4f7a..c58026e83cfe 100644
--- a/src/runtime/hexagon/hexagon_buffer.cc
+++ b/src/runtime/hexagon/hexagon_buffer.cc
@@ -57,14 +57,20 @@ struct DDRAllocation : public Allocation {
 
 struct VTCMAllocation : public Allocation {
   VTCMAllocation(size_t nbytes, size_t alignment) : Allocation(nbytes, alignment) {
-    // TODO(HWE): Handle alignments greater than 2k
-    CHECK(alignment <= 0x800) << "VTCMAllocation called for invalid alignment";
-    if ((nbytes & 0x7FF) && ((alignment & 0x7FF) == 0)) {
-      // Caller has requested 2k alignment, but the size is not a multiple of 2k
-      // Adjust size to be a multiple of 2k so that we will allocate from the front of the pool
-      nbytes = nbytes >> 11;
-      nbytes = nbytes << 11;
-      nbytes += 0x800;
+    // For simplicity, the current VTCM dynamic pool supports the following alignments: less than
+    // or equal to 128 (0x80), and 2k (0x800)
+    CHECK((alignment <= 0x80) || (alignment == 0x800))
+        << "VTCMAllocation called for invalid alignment " << alignment;
+
+    if (alignment == 0x800) {
+      // Adjust size to be a multiple of 2k so that we will allocate from the front of the pool.
+      nbytes = (nbytes + 0x7ff) & -0x800;
+    } else if (alignment <= 0x80) {
+      // Adjust size to be a multiple of 128 so that we will allocate from the back of the pool
+      // in 128 byte increments.
+      nbytes = (nbytes + 0x7f) & -0x80;
+    }
+    if (allocation_nbytes_ != nbytes) {
       DLOG(INFO) << "VTCMAllocation size adjusted for alignment " << allocation_nbytes_ << " to "
                  << nbytes;
       allocation_nbytes_ = nbytes;
diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.cc b/src/runtime/hexagon/hexagon_vtcm_pool.cc
index 107f4df0d187..63c815a6efca 100644
--- a/src/runtime/hexagon/hexagon_vtcm_pool.cc
+++ b/src/runtime/hexagon/hexagon_vtcm_pool.cc
@@ -64,6 +64,7 @@ void* HexagonVtcmPool::Allocate(size_t nbytes) {
   std::lock_guard<std::mutex> lock(mutex_);
 
   CHECK(!free_.empty()) << "No free VTCM";
+  CHECK(nbytes >= 0x80) << "Minimum VTCM alloation must be 128 bytes - nbytes " << nbytes;
 
   // If this is not aligned on a 2k block, allocate from the end to avoid fragmentation
   if (nbytes & size_t(0x7FF)) {
diff --git a/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc b/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
index 766b414cd0a5..5c017b58a3a2 100644
--- a/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
@@ -25,50 +25,66 @@ using namespace tvm::runtime;
 using namespace tvm::runtime::hexagon;
 
 class HexagonVtcmPoolTest : public ::testing::Test {
-  void SetUp() override { vtcm_pool = HexagonDeviceAPI::Global()->VtcmPool(); }
+  void SetUp() override {
+    vtcm_pool = HexagonDeviceAPI::Global()->VtcmPool();
+    max_bytes = vtcm_pool->TotalBytes();
+  }
   void TearDown() override {}
 
  public:
   HexagonVtcmPool* vtcm_pool;
+  size_t max_bytes;
+  size_t two_k_block = 2048;
+  size_t one_k_block = 1024;
+  size_t min_bytes = 128;
 };
 
 TEST_F(HexagonVtcmPoolTest, basic) {
   void* ptr;
-  size_t max_bytes = vtcm_pool->TotalBytes();
-  size_t two_k_block = 2048;
-  size_t one_k_block = 1024;
-  size_t one_byte_block = 1;
+  void* ptr2;
+
   ptr = vtcm_pool->Allocate(max_bytes);
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7FF) == 0)
+      << "Must be multiple of 2k " << ptr << " " << max_bytes;
   vtcm_pool->Free(ptr, max_bytes);
+
   ptr = vtcm_pool->Allocate(two_k_block);
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7FF) == 0)
+      << "Must be multiple of 2k " << ptr << " " << two_k_block;
   vtcm_pool->Free(ptr, two_k_block);
+
   ptr = vtcm_pool->Allocate(one_k_block);
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7F) == 0)
+      << "Must be multiple of 128 " << ptr << " " << one_k_block;
   vtcm_pool->Free(ptr, one_k_block);
-  ptr = vtcm_pool->Allocate(one_byte_block);
-  vtcm_pool->Free(ptr, one_byte_block);
+
+  ptr = vtcm_pool->Allocate(min_bytes);
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7F) == 0)
+      << "Must be multiple of 128 " << ptr << " " << min_bytes;
+
+  ptr2 = vtcm_pool->Allocate(one_k_block);
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7F) == 0)
+      << "Must be multiple of 128 " << ptr2 << " " << one_k_block;
+  vtcm_pool->Free(ptr, min_bytes);
+  vtcm_pool->Free(ptr2, one_k_block);
+
+  EXPECT_THROW(ptr = vtcm_pool->Allocate(1), InternalError);
 }
 
 TEST_F(HexagonVtcmPoolTest, no_free_vtcm) {
-  void* ptr;
-  size_t max_bytes = vtcm_pool->TotalBytes();
-  ptr = vtcm_pool->Allocate(max_bytes);
-  EXPECT_THROW(vtcm_pool->Allocate(1), InternalError);
+  void* ptr = vtcm_pool->Allocate(max_bytes);
+  EXPECT_THROW(vtcm_pool->Allocate(min_bytes), InternalError);
   vtcm_pool->Free(ptr, max_bytes);
 }
 
 TEST_F(HexagonVtcmPoolTest, not_enough_free_vtcm) {
-  void* ptr;
-  size_t max_bytes = vtcm_pool->TotalBytes();
-  size_t two_k_block = 2048;
-  ptr = vtcm_pool->Allocate(max_bytes - two_k_block);
+  void* ptr = vtcm_pool->Allocate(max_bytes - two_k_block);
   EXPECT_THROW(vtcm_pool->Allocate(two_k_block * 2), InternalError);
   vtcm_pool->Free(ptr, max_bytes - two_k_block);
 }
 
 TEST_F(HexagonVtcmPoolTest, free_with_wrong_size) {
-  void* ptr;
-  size_t two_k_block = 2048;
-  ptr = vtcm_pool->Allocate(two_k_block * 2);
+  void* ptr = vtcm_pool->Allocate(two_k_block * 2);
   EXPECT_THROW(vtcm_pool->Free(ptr, two_k_block), InternalError);
   vtcm_pool->Free(ptr, two_k_block * 2);
 }
@@ -79,7 +95,6 @@ TEST_F(HexagonVtcmPoolTest, free_alloc_combinations) {
   void* ptr3;
   void* ptr4;
   void* new_ptr;
-  size_t two_k_block = 2048;
   size_t max_less_3_blocks = vtcm_pool->TotalBytes() - (3 * two_k_block);
   ptr1 = vtcm_pool->Allocate(two_k_block);
   ptr2 = vtcm_pool->Allocate(two_k_block);
@@ -119,6 +134,44 @@ TEST_F(HexagonVtcmPoolTest, free_alloc_combinations) {
 
   // Make sure at the end we have the full amount
   // available again
-  ptr4 = vtcm_pool->Allocate(max_less_3_blocks);
-  vtcm_pool->Free(ptr4, max_less_3_blocks);
+  ptr4 = vtcm_pool->Allocate(max_bytes);
+  vtcm_pool->Free(ptr4, max_bytes);
+}
+
+// Test alignment edge cases allocating through HexagonBuffer
+TEST_F(HexagonVtcmPoolTest, vtcm_alignment) {
+  std::unique_ptr<HexagonBufferManager> test_hexbuffs = std::make_unique<HexagonBufferManager>();
+  void* ptr;
+
+  // Invalid alignments
+  EXPECT_THROW(test_hexbuffs->AllocateHexagonBuffer(min_bytes, 128 + 1, String("global")),
+               InternalError);
+  EXPECT_THROW(test_hexbuffs->AllocateHexagonBuffer(min_bytes, 2048 + 1, String("global")),
+               InternalError);
+
+  // Valid alignments, sizes need to be adjusted
+  ptr = test_hexbuffs->AllocateHexagonBuffer(1, 128, String("global"));
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7F) == 0) << "Must be multiple of 128 " << ptr;
+
+  ptr = test_hexbuffs->AllocateHexagonBuffer(127, 128, String("global"));
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7F) == 0) << "Must be multiple of 128 " << ptr;
+
+  ptr = test_hexbuffs->AllocateHexagonBuffer(129, 128, String("global"));
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7F) == 0) << "Must be multiple of 128 " << ptr;
+
+  ptr = test_hexbuffs->AllocateHexagonBuffer(1, 2048, String("global"));
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7FF) == 0) << "Must be multiple of 2k " << ptr;
+
+  ptr = test_hexbuffs->AllocateHexagonBuffer(2047, 2048, String("global"));
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7FF) == 0) << "Must be multiple of 2k " << ptr;
+
+  ptr = test_hexbuffs->AllocateHexagonBuffer(2049, 2048, String("global"));
+  CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7FF) == 0) << "Must be multiple of 2k " << ptr;
+
+  test_hexbuffs.reset();
+
+  // Make sure at the end we have the full amount
+  // available again
+  ptr = vtcm_pool->Allocate(max_bytes);
+  vtcm_pool->Free(ptr, max_bytes);
 }

From cc235f88a4d1624933965d7b6512f21f4b1d48ef Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Tue, 11 Oct 2022 14:39:36 -0700
Subject: [PATCH 339/704] [TIR] Update ir_comparator message to be more clear
 about what is being compared (#13038)

Update ir_comparator message to be more clear about what is being compared. This would be more useful when debugging tensorize mismatches.
---
 src/tir/schedule/ir_comparator.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/tir/schedule/ir_comparator.cc b/src/tir/schedule/ir_comparator.cc
index 648305d3655d..93cb488eaf56 100644
--- a/src/tir/schedule/ir_comparator.cc
+++ b/src/tir/schedule/ir_comparator.cc
@@ -41,7 +41,9 @@ class TensorIntrinMismatchError : public ScheduleError {
 
   String DetailRenderTemplate() const final {
     std::ostringstream os;
-    os << "The stmt {0} doesn't match the tensor intrin\n " << rhs_stmt_;
+    os << "The stmt {0} doesn't match the tensor intrin\nThe pattern attempting to be matched:\n"
+       << lhs_stmt_ << "\nDoes not match the tensorize description:\n"
+       << rhs_stmt_;
     for (const auto& msg : error_messages_) {
       os << msg << std::endl;
     }

From a752b74956db5c9201e503d70ddf36977eafdf40 Mon Sep 17 00:00:00 2001
From: "Ehsan M. Kermani" <6980212+ehsanmok@users.noreply.github.com>
Date: Tue, 11 Oct 2022 23:05:57 -0700
Subject: [PATCH 340/704] [Conda] Specify python version in build-environment
 (#13047)

The current [build-environment.yaml](https://github.com/apache/tvm/blob/main/conda/build-environment.yaml) doesn't specify the python version for `tvm-build` conda environment and as a result, any python version might be selected depending on the available base conda. In my case, the base conda is 3.10 so tvm won't work given [tvm is unsupported for python >= 3.9](https://github.com/apache/tvm/issues/8577).

To validate run on `tvm-build` env

```
conda env update -f conda/build-environment.yaml
```
---
 conda/build-environment.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda/build-environment.yaml b/conda/build-environment.yaml
index 9581ddf1ea6b..a1b43eb6ef0c 100644
--- a/conda/build-environment.yaml
+++ b/conda/build-environment.yaml
@@ -25,6 +25,7 @@ channels:
 
 # The packages to install to the environment
 dependencies:
+  - python=3.7 # or 3.8. See https://github.com/apache/tvm/issues/8577 for more details on >= 3.9
   - conda-build
   - git
   - llvmdev >=11

From 48829756e1394df7d3a05d312f0caf7dc322d50a Mon Sep 17 00:00:00 2001
From: LiangW <114222082+liangW-intellif@users.noreply.github.com>
Date: Wed, 12 Oct 2022 15:08:39 +0800
Subject: [PATCH 341/704] [TIR] Fix the stride calculation in
 InjectRollingBuffer (#13016)

This commit modified the calculation of rolling_buffer stride in
InjectRollingBuffer pass, so that the stride is calculated correctly
when fractional strides exist.

The InjectRollingBuffer pass seems to miscalculate rolling_buffer strides when the bound overlap is large and fractional strides exist.
The original test case `test_upscale` does not expose this problem because there is no positive bound overlap in the intermediate buffer, so I modified the pass and the test case as well.

To demonstrate this change, here is an example:
```
    @T.prim_func
    def pool_1d_upscale(a: T.handle, c: T.handle) -> None:
        # function attr dict
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "pool_1d_upscale", "tir.noalias": True})

        B = T.buffer_decl([1, 14], dtype="int8")
        # B = T.match_buffer(b,  [1, 14], dtype="int8")
        A = T.match_buffer(a,  [1, 14], dtype="int8")
        C = T.match_buffer(c,  [1, 24], dtype="int8")
        T.realize(C[0:1, 0:24], "")
        for ax0_outer in T.serial(0, 5):
            T.realize(B[0:1, T.floordiv(ax0_outer*5, 2): T.floordiv(ax0_outer*5, 2) + 5], "")
            T.attr(B, "rolling_buffer_scope", True)
            for ax0 in T.serial(0, 5):
                if T.likely(((T.floordiv(ax0_outer * 5, 2) + ax0) < 14), dtype='bool'):
                    B[0, (T.floordiv(ax0_outer * 5, 2) + ax0)] = A[0, (T.floordiv(ax0_outer * 5, 2) + ax0)]
            for ax0_1 in T.serial(0, 5):
                for rv0 in T.serial(0, 3):
                    if T.likely(((ax0_outer * 5 + ax0_1) < 24), dtype='bool'):
                        C[0, (ax0_outer * 5 + ax0_1)] = T.max(C[0, (ax0_outer * 5) + ax0_1], B[0, (T.floordiv((ax0_outer * 5) + ax0_1, 2) + rv0)])

```
The rolling_buffer stride computed at `inject_rolling_buffer.cc:137` would always be overwritten by the assignment on `inject_rolling_buffer.cc:156`, leat to bound_overlaps to be computed as {0, 0}.
After pass InjectRollingBuffer, the IR becomes as below, each overlapping element is recomputed.

```
    @T.prim_func
    def pool_1d_upscale(A: T.Buffer[(1, 14), "int8"], C: T.Buffer[(1, 24), "int8"]) -> None:
        # function attr dict
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "pool_1d_upscale", "tir.noalias": True})
        # buffer definition
        B = T.buffer_decl([1, 14], dtype="int8")
        # body
        T.realize(C[0:1, 0:24], "")
        T.realize(B[0:1, 0:5], "")
        for ax0_outer in T.serial(5):
            for ax0 in T.serial(5):
                if T.likely(ax0_outer * 5 // 2 + ax0 < 14, dtype="bool"):
                    B[0, (ax0_outer * 5 // 2 + ax0) % 5] = A[0, ax0_outer * 5 // 2 + ax0]
            for ax0_1, rv0 in T.grid(5, 3):
                if T.likely(ax0_outer * 5 + ax0_1 < 24, dtype="bool"):
                    C[0, ax0_outer * 5 + ax0_1] = T.max(C[0, ax0_outer * 5 + ax0_1], B[0, ((ax0_outer * 5 + ax0_1) // 2 + rv0) % 5])
```

After my modifications , the bound_overlaps would be correctly computed as {0, 2}, the IR becomes as below, which avoids recomputing some elements.

```
    @T.prim_func
    def pool_1d_upscale(A: T.Buffer[(1, 14), "int8"], C: T.Buffer[(1, 24), "int8"]) -> None:
        # function attr dict
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "pool_1d_upscale", "tir.noalias": True})
        # buffer definition
        B = T.buffer_decl([1, 14], dtype="int8")
        # body
        T.realize(C[0:1, 0:24], "")
        T.realize(B[0:1, 0:5], "")
        for ax0_outer in T.serial(5):
            for ax0 in T.serial(5):
                if T.likely(ax0_outer * 5 // 2 + ax0 < 14, dtype="bool"):
                    # Avoid recomputing some elements
                    if T.likely(ax0_outer < 1 or ax0 >= 2, dtype="bool"):
                        B[0, (ax0_outer * 5 // 2 + ax0) % 5] = A[0, ax0_outer * 5 // 2 + ax0]
            for ax0_1, rv0 in T.grid(5, 3):
                if T.likely(ax0_outer * 5 + ax0_1 < 24, dtype="bool"):
                    C[0, ax0_outer * 5 + ax0_1] = T.max(C[0, ax0_outer * 5 + ax0_1], B[0, ((ax0_outer * 5 + ax0_1) // 2 + rv0) % 5])
```
---
 src/tir/transforms/inject_rolling_buffer.cc   |  6 +++--
 ...est_tir_transform_inject_rolling_buffer.py | 25 ++++++++++++++-----
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/tir/transforms/inject_rolling_buffer.cc b/src/tir/transforms/inject_rolling_buffer.cc
index 43bf3b53f8e6..410efba9e215 100644
--- a/src/tir/transforms/inject_rolling_buffer.cc
+++ b/src/tir/transforms/inject_rolling_buffer.cc
@@ -126,15 +126,16 @@ class RollingBufferInjector : public StmtExprMutator {
         // We use the bound information of the BufferRealize to calculate
         // how we can legally roll
         auto stride{0};
+        auto divisor{1};
         Optional<Var> iter_var{};
         for (auto bound : buffer_realize->bounds) {
+          divisor = 1;
           if (auto floor_div = bound->min.as<FloorDivNode>()) {
             // Handle the case of fractional strides
             // They take this form: floordiv(hh.outer, 2)
             // Strip the floordiv and keep track of the divisor
-            auto divisor{Downcast<IntImm>(floor_div->b)->value};
+            divisor = Downcast<IntImm>(floor_div->b)->value;
             bound = Range::FromMinExtent(floor_div->a, bound->extent, bound->span);
-            stride = std::ceil(stride / divisor);
           }
           if (bound->min.as<IntImmNode>()) {
             // If the bound is an int, we can't roll over it
@@ -155,6 +156,7 @@ class RollingBufferInjector : public StmtExprMutator {
             iter_var = GetRef<Var>(a);
             stride = b->value;
           }
+          stride = std::ceil(static_cast<float>(stride) / divisor);
           bound_iter_vars.push_back(iter_var);
           if (iter_var) {
             bound_overlaps.push_back(Downcast<IntImm>(bound->extent)->value - stride);
diff --git a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py
index 65a586b8ecfd..2e2b03ba721c 100644
--- a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py
+++ b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py
@@ -118,15 +118,28 @@ def test_implied_split():
     _verify_schedule(sch, [A], pool_b)
 
 
-def test_upscale():
-    A = te.placeholder((1, 12, 12, 16), name="A", dtype="int8")
-    pool = topi.nn.pool2d(A, (1, 1), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    upscale = te.compute((1, 24, 24, 16), lambda nn, hh, ww, cc: pool[nn, hh // 2, ww // 2, cc])
+@pytest.mark.parametrize("kernel_shape", [(1, 1), (3, 3)])
+def test_upscale(kernel_shape):
+    output_shape = (1, 24, 24, 16)
+    input_shape = (
+        output_shape[0],
+        output_shape[1] // 2 + 2 * (kernel_shape[0] - 1),
+        output_shape[2] // 2 + 2 * (kernel_shape[1] - 1),
+        output_shape[3],
+    )
+    A = te.placeholder(input_shape, name="A", dtype="int8")
+    pool_a = topi.nn.pool2d(A, kernel_shape, (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
+    pool_b = topi.nn.pool2d(
+        pool_a, kernel_shape, (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC"
+    )
+    upscale = te.compute((1, 24, 24, 16), lambda nn, hh, ww, cc: pool_b[nn, hh // 2, ww // 2, cc])
 
     sch = tvm.te.create_schedule([upscale.op])
     oi, ii = _tile_nd(sch, upscale, (1, 5, 5, 16))
-    sch[pool].compute_at(sch[upscale], oi[-1])
-    sch[pool].rolling_buffer()
+    sch[pool_b].compute_at(sch[upscale], oi[-1])
+    sch[pool_b].rolling_buffer()
+    sch[pool_a].compute_at(sch[upscale], oi[-1])
+    sch[pool_a].rolling_buffer()
 
     _verify_schedule(sch, [A], upscale)
 

From b8cfc4ca8257bfb582e8d15ef27c70e53bb2816e Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Wed, 12 Oct 2022 08:58:00 +0100
Subject: [PATCH 342/704] [CI] Update Compute Library to v22.08 (#13035)

This updates Compute Library from v21.11 to v22.08.

Changelog for this version can be found at:
https://arm-software.github.io/ComputeLibrary/v22.08/
---
 docker/install/ubuntu_download_arm_compute_lib_binaries.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/install/ubuntu_download_arm_compute_lib_binaries.sh b/docker/install/ubuntu_download_arm_compute_lib_binaries.sh
index 051a94b71c0c..639332239c0b 100755
--- a/docker/install/ubuntu_download_arm_compute_lib_binaries.sh
+++ b/docker/install/ubuntu_download_arm_compute_lib_binaries.sh
@@ -28,7 +28,7 @@ if [ "$architecture_type" != "aarch64" ]; then
     gcc-aarch64-linux-gnu
 fi
 
-compute_lib_version="v21.11"
+compute_lib_version="v22.08"
 compute_lib_variant="arm64-v8a-neon"
 compute_lib_full_name="arm_compute-${compute_lib_version}-bin-linux-${compute_lib_variant}"
 compute_lib_base_url="https://github.com/ARM-software/ComputeLibrary/releases/download/${compute_lib_version}"

From 256075566b72e9c7ab42b3a82c58c0ee1960d731 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 13 Oct 2022 02:48:22 +0900
Subject: [PATCH 343/704] [MetaSchedule] Allow skipping exact NDArray rewrite
 in RemoveWeightLayoutRewriteBlock (#13052)

* Allow skipping exact NDArray rewrite in RemoveWeightLayoutRewriteBlock

* add doc

* add test
---
 include/tvm/tir/transform.h                   | 10 +++-
 .../meta_schedule/builder/local_builder.py    |  2 +-
 python/tvm/tir/transform/transform.py         | 16 ++++-
 src/meta_schedule/arg_info.cc                 |  3 +-
 .../feature_extractor/per_store_feature.cc    |  2 +-
 src/relay/backend/te_compiler_cache.cc        |  3 +-
 .../remove_weight_layout_rewrite_block.cc     | 59 +++++++++++++++----
 .../test_meta_schedule_relay_integration.py   | 43 +++++++-------
 8 files changed, 98 insertions(+), 40 deletions(-)

diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
index 6aa1aca69970..e31919fbd223 100644
--- a/include/tvm/tir/transform.h
+++ b/include/tvm/tir/transform.h
@@ -672,9 +672,17 @@ TVM_DLL Pass InjectPTXAsyncCopy();
 
 /*!
  * \brief Remove the weight layout rewrite block
+ * \param skip_ndarray_rewrite If True, exact rewrite of NDArray, according to the given index map,
+ *  will be skipped. Only the shape of the NDArray is transformed correctly, and the content of
+ *  the destination array will be filled with random values.
+ *
+ *  When this pass is called many times during MetaSchedule tuning, the raw data of NDArray,
+ *  before and after rewrite, does not matter. Since NDArray layout rewrite, using IndexMap's
+ *  MapNDArray, is currently slow, skipping the exact rewrite is sometimes necessary.
+ *
  * \return The pass.
  */
-TVM_DLL Pass RemoveWeightLayoutRewriteBlock();
+TVM_DLL Pass RemoveWeightLayoutRewriteBlock(bool skip_ndarray_rewrite = false);
 
 /*!
  * \brief Add the explicit local stage for the shared memory access on GPU.
diff --git a/python/tvm/meta_schedule/builder/local_builder.py b/python/tvm/meta_schedule/builder/local_builder.py
index 6e282d8cb62d..3ddca032ef76 100644
--- a/python/tvm/meta_schedule/builder/local_builder.py
+++ b/python/tvm/meta_schedule/builder/local_builder.py
@@ -257,7 +257,7 @@ def default_build(mod: IRModule, target: Target, _params: Optional[Dict[str, NDA
     from tvm.tir.transform import RemoveWeightLayoutRewriteBlock
 
     # pylint: enable=import-outside-toplevel
-    mod = RemoveWeightLayoutRewriteBlock()(mod)
+    mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod)
     return tvm_build(mod, target=target)
 
 
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
index d95d15c0dfbe..7b3a81acc525 100644
--- a/python/tvm/tir/transform/transform.py
+++ b/python/tvm/tir/transform/transform.py
@@ -964,14 +964,26 @@ def InjectPTXAsyncCopy():
     return _ffi_api.InjectPTXAsyncCopy()  # type: ignore
 
 
-def RemoveWeightLayoutRewriteBlock():
+def RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=False):
     """Remove weight layout rewrite block before benchmarking during tuning stage.
+
+    Parameters
+    ----------
+    skip_ndarray_rewrite : bool
+        If True, exact rewrite of NDArray, according to the given index map, will be skipped.
+        Only the shape of the NDArray is transformed correctly, and the content of the destination
+        array will be filled with random values.
+
+        When this pass is called many times during MetaSchedule tuning, the raw data of NDArray,
+        before and after rewrite, does not matter. Since NDArray layout rewrite, using IndexMap's
+        MapNDArray, is currently slow, skipping the exact rewrite is sometimes necessary.
+
     Returns
     -------
     fpass : tvm.transform.Pass
         The result pass
     """
-    return _ffi_api.RemoveWeightLayoutRewriteBlock()  # type: ignore
+    return _ffi_api.RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite)  # type: ignore
 
 
 def ManifestSharedMemoryLocalStage():
diff --git a/src/meta_schedule/arg_info.cc b/src/meta_schedule/arg_info.cc
index 84d861cb59c3..4663fd90762a 100644
--- a/src/meta_schedule/arg_info.cc
+++ b/src/meta_schedule/arg_info.cc
@@ -103,7 +103,8 @@ Array<ArgInfo> ArgInfo::FromPrimFunc(const tir::PrimFunc& func) {
 
 Array<ArgInfo> ArgInfo::FromEntryFunc(const IRModule& mod, bool remove_preproc) {
   if (remove_preproc) {
-    IRModule new_mod = tir::transform::RemoveWeightLayoutRewriteBlock()(mod);
+    IRModule new_mod =
+        tir::transform::RemoveWeightLayoutRewriteBlock(/*skip_ndarray_rewrite*/ true)(mod);
     return ArgInfo::FromPrimFunc(FindEntryFunc(new_mod));
   }
   return ArgInfo::FromPrimFunc(FindEntryFunc(mod));
diff --git a/src/meta_schedule/feature_extractor/per_store_feature.cc b/src/meta_schedule/feature_extractor/per_store_feature.cc
index 422f21abe17a..f0459785f352 100644
--- a/src/meta_schedule/feature_extractor/per_store_feature.cc
+++ b/src/meta_schedule/feature_extractor/per_store_feature.cc
@@ -301,7 +301,7 @@ Pass SimplifyForFeatureExtraction() {
  */
 Sequential PassListForPerStoreFeature() {
   return Sequential({
-      tir::transform::RemoveWeightLayoutRewriteBlock(),
+      tir::transform::RemoveWeightLayoutRewriteBlock(/*skip_ndarray_rewrite*/ true),
       tir::transform::SimplifyForFeatureExtraction(),
       tir::transform::LowerCrossThreadReduction(),
       tir::transform::LowerInitBlock(),
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index a1a4bedfb8b0..9a0a2bef9a47 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -526,7 +526,8 @@ class ScheduleBuilder : public ExprVisitor {
             record->trace->ApplyToSchedule(sch, /*remove_postproc=*/false);
             IRModule mod = sch->mod();
             ICHECK_EQ(mod->functions.size(), 1);
-            mod = tir::transform::RemoveWeightLayoutRewriteBlock()(std::move(mod));
+            mod = tir::transform::RemoveWeightLayoutRewriteBlock(/*skip_ndarray_rewrite*/ false)(
+                std::move(mod));
             prim_func = Downcast<PrimFunc>(mod->Lookup("main"));
             // Need to copy attrs from relay function over to prim func. Most notably the structural
             // hash.
diff --git a/src/tir/transforms/remove_weight_layout_rewrite_block.cc b/src/tir/transforms/remove_weight_layout_rewrite_block.cc
index 86f6700f2289..05b636f11403 100644
--- a/src/tir/transforms/remove_weight_layout_rewrite_block.cc
+++ b/src/tir/transforms/remove_weight_layout_rewrite_block.cc
@@ -34,13 +34,15 @@ namespace tir {
 
 class RemoveLayoutRewriteBlock : public StmtMutator {
  public:
-  static std::tuple<PrimFunc, Map<Buffer, Buffer>, std::unordered_map<const VarNode*, IndexMap>>
+  static std::tuple<PrimFunc, Map<Buffer, Buffer>, std::unordered_map<const VarNode*, IndexMap>,
+                    std::unordered_map<const VarNode*, Array<PrimExpr>>>
   Rewrite(PrimFunc f) {
     RemoveLayoutRewriteBlock rewriter;
 
     PrimFuncNode* n = f.CopyOnWrite();
     n->body = rewriter(std::move(n->body));
-    return std::make_tuple(f, rewriter.buf_map_, rewriter.buffer_var_to_index_map_);
+    return std::make_tuple(f, rewriter.buf_map_, rewriter.buffer_var_to_index_map_,
+                           rewriter.buffer_var_to_rewritten_shape_);
   }
 
  private:
@@ -95,6 +97,8 @@ class RemoveLayoutRewriteBlock : public StmtMutator {
     }
     buffer_var_to_index_map_[load->buffer->data.get()] = IndexMap(load_indices, store->indices);
 
+    buffer_var_to_rewritten_shape_[load->buffer->data.get()] = store->buffer->shape;
+
     return Stmt(n);
   }
 
@@ -106,6 +110,8 @@ class RemoveLayoutRewriteBlock : public StmtMutator {
   /*! \brief Maps a buffer load to an index map associated with the load / store
     in a layout rewrite block. */
   std::unordered_map<const VarNode*, IndexMap> buffer_var_to_index_map_;
+  /*! \brief Maps a buffer load to the shape of the corresponding rewritten buffer. */
+  std::unordered_map<const VarNode*, Array<PrimExpr>> buffer_var_to_rewritten_shape_;
 };
 
 // After RemoveLayoutRewriteBlock, the body of a compute update block references a
@@ -139,9 +145,15 @@ using BufferVarMap = std::unordered_map<const tir::VarNode*, const tir::VarNode*
 
 class AllocateConstRewrite : public StmtExprMutator {
  public:
-  AllocateConstRewrite(const BufferVarMap& buffer_var_map,
-                       const std::unordered_map<const VarNode*, IndexMap>& buffer_var_to_index_map)
-      : buffer_var_map_(buffer_var_map), buffer_var_to_index_map_(buffer_var_to_index_map) {}
+  AllocateConstRewrite(
+      const BufferVarMap& buffer_var_map,
+      const std::unordered_map<const VarNode*, IndexMap>& buffer_var_to_index_map,
+      const std::unordered_map<const VarNode*, Array<PrimExpr>>& buffer_var_to_rewritten_shape,
+      bool skip_ndarray_rewrite)
+      : buffer_var_map_(buffer_var_map),
+        buffer_var_to_index_map_(buffer_var_to_index_map),
+        buffer_var_to_rewritten_shape_(buffer_var_to_rewritten_shape),
+        skip_ndarray_rewrite_(skip_ndarray_rewrite) {}
 
  private:
   Stmt VisitStmt_(const BlockNode* op) final {
@@ -163,8 +175,10 @@ class AllocateConstRewrite : public StmtExprMutator {
   Stmt VisitStmt_(const AllocateConstNode* alloc) final {
     if (auto it = buffer_var_to_index_map_.find(alloc->buffer_var.get());
         it != buffer_var_to_index_map_.end()) {
+      ICHECK(buffer_var_to_rewritten_shape_.count(alloc->buffer_var.get()));
       auto new_body = StmtMutator::VisitStmt(alloc->body);
-      auto rewritten_ndarray = it->second->MapNDArray(alloc->data.value());
+      auto rewritten_ndarray = RewriteNDArray(
+          alloc->data.value(), it->second, buffer_var_to_rewritten_shape_[alloc->buffer_var.get()]);
       Array<PrimExpr> rewritten_extents;
       for (auto s : rewritten_ndarray.Shape()) {
         rewritten_extents.push_back(PrimExpr(static_cast<int>(s)));
@@ -187,13 +201,32 @@ class AllocateConstRewrite : public StmtExprMutator {
     return ExprMutator::VisitExpr_(op);
   }
 
+  runtime::NDArray RewriteNDArray(runtime::NDArray src, const IndexMap& index_map,
+                                  const Array<PrimExpr>& dst_shape) {
+    if (skip_ndarray_rewrite_) {
+      // Only the shape of the destination array needs to be correct.
+      std::vector<int64_t> dst_shape_int;
+      for (auto s : dst_shape) {
+        ICHECK(s->IsInstance<IntImmNode>());
+        dst_shape_int.push_back(s.as<IntImmNode>()->value);
+      }
+      return src.CreateView(dst_shape_int, src.DataType());
+    } else {
+      return index_map->MapNDArray(src);
+    }
+  }
+
   /*! \brief Maps a buffer store to a load in a layout rewrite block */
   BufferVarMap buffer_var_map_;
   /*! \brief Maps a buffer load to an index map associated with the load / store
     in a layout rewrite block. */
   std::unordered_map<const VarNode*, IndexMap> buffer_var_to_index_map_;
+  /*! \brief Maps a buffer load to the shape of the corresponding rewritten buffer. */
+  std::unordered_map<const VarNode*, Array<PrimExpr>> buffer_var_to_rewritten_shape_;
   /*! \brief Maps load buffer variables to newly created buffers */
   std::unordered_map<const VarNode*, Buffer> new_load_buf_;
+  /*! \brief Whether or not to skip rewriting of NDArray contents */
+  bool skip_ndarray_rewrite_;
 };
 
 class CollectAllocateConstBufferVars : public StmtVisitor {
@@ -208,11 +241,12 @@ class CollectAllocateConstBufferVars : public StmtVisitor {
 
 class WeightLayoutRewriteBlockRemover : public StmtMutator {
  public:
-  static PrimFunc Remove(PrimFunc f) {
+  static PrimFunc Remove(PrimFunc f, bool skip_ndarray_rewrite) {
     CollectAllocateConstBufferVars collector;
     collector(f->body);
 
-    auto [f_, buf_map, buffer_var_to_index_map] = RemoveLayoutRewriteBlock().Rewrite(f);
+    auto [f_, buf_map, buffer_var_to_index_map, buffer_var_to_rewritten_shape] =
+        RemoveLayoutRewriteBlock().Rewrite(f);
 
     BufferVarMap buffer_var_map;
     for (const auto& [load_buf, store_buf] : buf_map) {
@@ -224,7 +258,8 @@ class WeightLayoutRewriteBlockRemover : public StmtMutator {
 
     PrimFuncNode* n = f_.CopyOnWrite();
 
-    AllocateConstRewrite rewriter(buffer_var_map, buffer_var_to_index_map);
+    AllocateConstRewrite rewriter(buffer_var_map, buffer_var_to_index_map,
+                                  buffer_var_to_rewritten_shape, skip_ndarray_rewrite);
     n->body = rewriter(std::move(n->body));
 
     Map<tir::Var, Buffer> buffer_map;
@@ -243,9 +278,9 @@ class WeightLayoutRewriteBlockRemover : public StmtMutator {
 
 namespace transform {
 
-Pass RemoveWeightLayoutRewriteBlock() {
-  auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) {
-    return WeightLayoutRewriteBlockRemover::Remove(std::move(f));
+Pass RemoveWeightLayoutRewriteBlock(bool skip_ndarray_rewrite) {
+  auto pass_func = [skip_ndarray_rewrite](PrimFunc f, IRModule m, PassContext ctx) {
+    return WeightLayoutRewriteBlockRemover::Remove(std::move(f), skip_ndarray_rewrite);
   };
   return CreatePrimFuncPass(pass_func, 0, "tir.RemoveWeightLayoutRewriteBlock", {});
 }
diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py
index 4047f44ac365..d5c81bcc56ba 100644
--- a/tests/python/unittest/test_meta_schedule_relay_integration.py
+++ b/tests/python/unittest/test_meta_schedule_relay_integration.py
@@ -539,32 +539,33 @@ def test_rewrite_layout_link_params():
     executor = relay.backend.Executor("graph", {"link-params": link_params})
     mod = mod.with_attr("executor", executor)
 
-    with tempfile.TemporaryDirectory() as work_dir:
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            target=target,
-            params=params,
-            work_dir=work_dir,
-            max_trials_global=4,
-            strategy="replay-trace",
-        )
+    for strategy in ["replay-trace", "evolutionary"]:
+        with tempfile.TemporaryDirectory() as work_dir:
+            database = ms.relay_integration.tune_relay(
+                mod=mod,
+                target=target,
+                params=params,
+                work_dir=work_dir,
+                max_trials_global=4,
+                strategy=strategy,
+            )
 
-        lib = ms.relay_integration.compile_relay(
-            database=database,
-            mod=mod,
-            target=target,
-            params=params,
-        )
+            lib = ms.relay_integration.compile_relay(
+                database=database,
+                mod=mod,
+                target=target,
+                params=params,
+            )
 
-    dev = tvm.device(target, 0)
-    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+        dev = tvm.device(target, 0)
+        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
 
-    runtime.set_input("data", data_np)
-    runtime.run()
+        runtime.set_input("data", data_np)
+        runtime.run()
 
-    out = runtime.get_output(0).numpy()
+        out = runtime.get_output(0).numpy()
 
-    np.testing.assert_allclose(ref, out, rtol=1e-4, atol=1e-4)
+        np.testing.assert_allclose(ref, out, rtol=1e-4, atol=1e-4)
 
 
 if __name__ == "__main__":

From b18f6425f951f5e9e039f497898be7d3227ddc93 Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Wed, 12 Oct 2022 14:57:28 -0400
Subject: [PATCH 344/704] [Hexagon] [runtime] Manage RPC and runtime buffers
 separately (#13028)

Creates HexagonPageAllocator to manage allocations for the RPC server

Adds new Device APIs for RPC buffer management

Allocations are tracked by two separate buffer managers:

- rpc_hexbuffs is used exclusively for RPC buffers
- runtime_hexbuffs is used exclusively for runtime buffers

This will fix the throw on shutdown in the simulator.
---
 src/runtime/hexagon/hexagon_buffer_manager.h  | 13 +++++
 src/runtime/hexagon/hexagon_device_api.cc     | 48 ++++++++++++++++---
 src/runtime/hexagon/hexagon_device_api.h      | 36 +++++++++-----
 src/runtime/hexagon/rpc/hexagon/rpc_server.cc | 32 ++++++++++++-
 .../hexagon/hexagon_device_api_tests.cc       | 27 +++++++----
 5 files changed, 127 insertions(+), 29 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_buffer_manager.h b/src/runtime/hexagon/hexagon_buffer_manager.h
index a698b0ecb163..eecf96a6db07 100644
--- a/src/runtime/hexagon/hexagon_buffer_manager.h
+++ b/src/runtime/hexagon/hexagon_buffer_manager.h
@@ -25,6 +25,7 @@
 #include <memory>
 #include <unordered_map>
 #include <utility>
+#include <vector>
 
 #include "hexagon_buffer.h"
 
@@ -85,6 +86,18 @@ class HexagonBufferManager {
     return hexagon_buffer_map_.empty();
   }
 
+  //! \brief Returns a vector of currently allocated pointers, owned by the manager.
+  // Note - this should only be used by the device API to keep track of what
+  // was in the manager when HexagonDeviceAPI::ReleaseResources is called.
+  std::vector<void*> current_allocations() {
+    std::vector<void*> allocated;
+    std::lock_guard<std::mutex> lock(map_mutex_);
+    for (const auto& [data_ptr, buffer] : hexagon_buffer_map_) {
+      allocated.push_back(data_ptr);
+    }
+    return allocated;
+  }
+
  private:
   //! \brief Contains the HexagonBuffer objects managed by this class.
   std::unordered_map<void*, std::unique_ptr<HexagonBuffer>> hexagon_buffer_map_;
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 7c251721b749..3574ab50182c 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -90,18 +90,23 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap
 
   const size_t typesize = (dtype.bits / 8) * dtype.lanes;
 
+  CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with "
+                          << "HexagonDeviceAPI::AllocDataSpace before initializing resources.  "
+                          << "Please call HexagonDeviceAPI::AcquireResources";
+
   if (ndim == 0) {
     // Allocate storage for a single scalar value.
-    return mgr->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
+    return runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
   } else if (ndim == 1) {
     // Allocate a single, contiguous memory region.
     size_t nbytes = shape[0] * typesize;
-    return mgr->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
+    return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
   } else if (ndim == 2) {
     // Allocate the region(s) needed for Hexagon's indirect-tensor format.
     size_t nallocs = shape[0];
     size_t nbytes = shape[1] * typesize;
-    return mgr->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope);
+    return runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment,
+                                                   mem_scope);
   } else {
     return nullptr;  // unreachable
   }
@@ -115,13 +120,34 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignme
   if (alignment < kHexagonAllocAlignment) {
     alignment = kHexagonAllocAlignment;
   }
-  return mgr->AllocateHexagonBuffer(nbytes, alignment, String("global"));
+  CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with "
+                          << "HexagonDeviceAPI::AllocDataSpace before initializing resources.  "
+                          << "Please call HexagonDeviceAPI::AcquireResources";
+  return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global"));
 }
 
 void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
   CHECK(ptr) << "buffer pointer is null";
   CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type;
-  mgr->FreeHexagonBuffer(ptr);
+  if (runtime_hexbuffs) {
+    runtime_hexbuffs->FreeHexagonBuffer(ptr);
+  } else {
+    auto it = std::find(released_runtime_buffers.begin(), released_runtime_buffers.end(), ptr);
+    CHECK(it != released_runtime_buffers.end()) << "Attempted to free Hexagon data with "
+                                                << "HexagonDeviceAPI::FreeDataSpace that was not "
+                                                << "allocated during the session.";
+  }
+}
+
+void* HexagonDeviceAPI::AllocRpcBuffer(size_t nbytes, size_t alignment) {
+  CHECK(nbytes) << "number of bytes is zero";
+  CHECK(alignment) << "alignment is zero";
+  return rpc_hexbuffs.AllocateHexagonBuffer(nbytes, alignment, String("global"));
+}
+
+void HexagonDeviceAPI::FreeRpcBuffer(void* ptr) {
+  CHECK(ptr) << "buffer pointer is null";
+  rpc_hexbuffs.FreeHexagonBuffer(ptr);
 }
 
 // WorkSpace: runtime allocations for Hexagon
@@ -137,7 +163,10 @@ void* HexagonDeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_
 
 void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) {
   CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type;
-  CHECK(mgr->count(data) != 0)
+  CHECK(runtime_hexbuffs) << "Attempted to free Hexagon workspace with "
+                          << "HexagonDeviceAPI::FreeWorkspace outside of a session.  "
+                          << "Please call HexagonDeviceAPI::AcquireResources";
+  CHECK(runtime_hexbuffs->count(data) != 0)
       << "Attempt made to free unknown or already freed workspace allocation";
   dmlc::ThreadLocalStore<HexagonWorkspacePool>::Get()->FreeWorkspace(dev, data);
 }
@@ -160,8 +189,13 @@ void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHan
   CHECK_EQ(from->byte_offset, 0);
   CHECK_EQ(to->byte_offset, 0);
   CHECK_EQ(GetDataSize(*from), GetDataSize(*to));
+  CHECK(runtime_hexbuffs) << "Attempted to copy Hexagon data with "
+                          << "HexagonDeviceAPI::CopyDataFromTo before initializing resources.  "
+                          << "Please call HexagonDeviceAPI::AcquireResources";
 
-  auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* { return mgr->find(ptr); };
+  auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* {
+    return runtime_hexbuffs->find(ptr);
+  };
 
   HexagonBuffer* hex_from_buf = lookup_hexagon_buffer(from->data);
   HexagonBuffer* hex_to_buf = lookup_hexagon_buffer(to->data);
diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index 1c802f353062..8d2795e7a04e 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -48,7 +48,7 @@ class HexagonDeviceAPI final : public DeviceAPI {
   static HexagonDeviceAPI* Global();
 
   //! \brief Constructor
-  HexagonDeviceAPI() { mgr = &hexbuffs; }
+  HexagonDeviceAPI() {}
 
   //! \brief Destructor
   ~HexagonDeviceAPI() {}
@@ -60,7 +60,7 @@ class HexagonDeviceAPI final : public DeviceAPI {
 
     CHECK_EQ(runtime_hexbuffs, nullptr);
     runtime_hexbuffs = std::make_unique<HexagonBufferManager>();
-    mgr = runtime_hexbuffs.get();
+    released_runtime_buffers.clear();
 
     CHECK_EQ(runtime_threads, nullptr);
     runtime_threads = std::make_unique<HexagonThreadManager>(threads, stack_size, pipe_size);
@@ -78,10 +78,10 @@ class HexagonDeviceAPI final : public DeviceAPI {
     runtime_threads.reset();
 
     CHECK(runtime_hexbuffs) << "runtime_hexbuffs was not created in AcquireResources";
-    if (runtime_hexbuffs && !runtime_hexbuffs->empty()) {
-      LOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources";
+    if (!runtime_hexbuffs->empty()) {
+      DLOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources";
+      released_runtime_buffers = runtime_hexbuffs->current_allocations();
     }
-    mgr = &hexbuffs;
     runtime_hexbuffs.reset();
 
     CHECK(runtime_vtcm) << "runtime_vtcm was not created in AcquireResources";
@@ -106,6 +106,12 @@ class HexagonDeviceAPI final : public DeviceAPI {
   //! \brief Free the allocated HexagonBuffer.
   void FreeDataSpace(Device dev, void* ptr) final;
 
+  //! \brief Hexagon-only interface to allocate buffers used for the RPC server
+  void* AllocRpcBuffer(size_t nbytes, size_t alignment);
+
+  //! \brief Hexagon-only interface to free buffers used for the RPC server
+  void FreeRpcBuffer(void* ptr);
+
   /*! \brief Request a dynamically allocated HexagonBuffer from a workspace pool.
    *  \returns The underlying allocation pointer.
    */
@@ -190,15 +196,21 @@ class HexagonDeviceAPI final : public DeviceAPI {
            (DLDeviceType(dev.device_type) == kDLCPU);
   }
 
-  //! \brief Manages underlying HexagonBuffer allocations
-  // runtime_hexbuffs is used for runtime allocations.  It is created
-  // with a call to AcquireResources, and destroyed on ReleaseResources.
-  // hexbuffs is used for all allocations outside of the session lifetime.
-  HexagonBufferManager hexbuffs;
+  //! \brief Manages RPC HexagonBuffer allocations
+  // rpc_hexbuffs is used only in Alloc/FreeRpcBuffer.  It is static because it lives for the
+  // lifetime of the static Device API.
+  HexagonBufferManager rpc_hexbuffs;
+
+  //! \brief Manages runtime HexagonBuffer allocations
+  // runtime_hexbuffs is used for runtime allocations, separate from rpc_hexbuffs.  It is created
+  // with a call to AcquireResources, and destroyed on ReleaseResources.  The buffers in this
+  // manager are scoped to the lifetime of a user application session.
   std::unique_ptr<HexagonBufferManager> runtime_hexbuffs;
 
-  //! \brief Current buffer manager
-  HexagonBufferManager* mgr;
+  //! \brief Keeps a list of released runtime HexagonBuffer allocations
+  // ReleaseResources can be called when there are still buffers in runtime_hexbuffs.  This list
+  // stores the buffers that were released.
+  std::vector<void*> released_runtime_buffers;
 
   //! \brief Thread manager
   std::unique_ptr<HexagonThreadManager> runtime_threads;
diff --git a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
index 22a54043cd9f..29c3a1bdfe6d 100644
--- a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
@@ -38,6 +38,7 @@ extern "C" {
 #include "../../../library_module.h"
 #include "../../../minrpc/minrpc_server.h"
 #include "../../hexagon/hexagon_common.h"
+#include "../../hexagon/hexagon_device_api.h"
 #include "hexagon_rpc.h"
 
 namespace tvm {
@@ -145,6 +146,35 @@ class HexagonIOHandler {
   uint32_t write_buffer_available_length_;
 };
 
+// Internal allocator that redirects alloc to TVM's C API.
+template <typename TIOHandler>
+class HexagonPageAllocator {
+ public:
+  using ArenaPageHeader = tvm::support::ArenaPageHeader;
+
+  explicit HexagonPageAllocator(TIOHandler* io) : io_(io) {}
+
+  ArenaPageHeader* allocate(size_t min_size) {
+    size_t npages = ((min_size + kPageSize - 1) / kPageSize);
+    void* data;
+
+    data = HexagonDeviceAPI::Global()->AllocRpcBuffer(npages * kPageSize, kPageAlign);
+
+    ArenaPageHeader* header = static_cast<ArenaPageHeader*>(data);
+    header->size = npages * kPageSize;
+    header->offset = sizeof(ArenaPageHeader);
+    return header;
+  }
+
+  void deallocate(ArenaPageHeader* page) { HexagonDeviceAPI::Global()->FreeRpcBuffer(page); }
+
+  static const constexpr int kPageSize = 2 << 10;
+  static const constexpr int kPageAlign = 8;
+
+ private:
+  TIOHandler* io_;
+};
+
 class HexagonRPCServer {
  public:
   explicit HexagonRPCServer(uint8_t* receive_buffer, size_t receive_buffer_size_bytes)
@@ -185,7 +215,7 @@ class HexagonRPCServer {
 
  private:
   HexagonIOHandler io_;
-  MinRPCServer<HexagonIOHandler> rpc_server_;
+  MinRPCServer<HexagonIOHandler, HexagonPageAllocator> rpc_server_;
 };
 
 }  // namespace hexagon
diff --git a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
index 2139aa78f7ae..e262a16ada5c 100644
--- a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
@@ -147,22 +147,31 @@ TEST_F(HexagonDeviceAPITest, DISABLED_alloc_free_diff_dev) {
   EXPECT_THROW(hexapi->FreeDataSpace(cpu_dev, buf), InternalError);
 }
 
-// Alloc a non-runtime buffer
-// Alloc a runtime buffer
-// "Release" resources for runtime
-// Verify the runtime buffer cannot be freed, but the non-runtime buffer can
-// This test should be run last
-TEST_F(HexagonDeviceAPITest, leak_resources) {
+// Ensure runtime buffer manager is properly configured and destroyed
+// in Acquire/Release
+TEST_F(HexagonDeviceAPITest, runtime_buffer_manager) {
   hexapi->ReleaseResources();
-  void* pre_runtime_buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8);
-  CHECK(pre_runtime_buf != nullptr);
+  EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8), InternalError);
   hexapi->AcquireResources();
   void* runtime_buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8);
   CHECK(runtime_buf != nullptr);
   hexapi->ReleaseResources();
+  hexapi->FreeDataSpace(hex_dev, runtime_buf);
+  hexapi->AcquireResources();
   EXPECT_THROW(hexapi->FreeDataSpace(hex_dev, runtime_buf), InternalError);
-  hexapi->FreeDataSpace(hex_dev, pre_runtime_buf);
+}
+
+// Ensure RPC buffer manager is always available
+TEST_F(HexagonDeviceAPITest, rpc_buffer_manager) {
+  void* rpc_buf;
+  rpc_buf = hexapi->AllocRpcBuffer(nbytes, alignment);
+  CHECK(rpc_buf != nullptr);
+  hexapi->ReleaseResources();
+  hexapi->FreeRpcBuffer(rpc_buf);
+  rpc_buf = hexapi->AllocRpcBuffer(nbytes, alignment);
+  CHECK(rpc_buf != nullptr);
   hexapi->AcquireResources();
+  hexapi->FreeRpcBuffer(rpc_buf);
 }
 
 // Ensure thread manager is properly configured and destroyed

From 90c666f860f840050a07e05e3abf196d7401cd81 Mon Sep 17 00:00:00 2001
From: Yuchao Zhang <16538059+Lucien0@users.noreply.github.com>
Date: Thu, 13 Oct 2022 04:29:29 +0800
Subject: [PATCH 345/704] [Relay][Pass] ConcretizeCastLikeRewrite for
 SimplifyExpr (#12923)

* Add pass concretize cast like

* Fix simplify consecutive cast testcase issue

* Fix typo
---
 src/relay/transforms/simplify_expr.cc         | 31 +++++++++++++++++++
 tests/python/relay/test_pass_simplify_expr.py | 13 +++++++-
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index 463f76995436..7a4a9bd9fae8 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -565,6 +565,36 @@ class ConcretizeBroadcastToLikeRewrite : public ConcretizeLikeRewrite {
   }
 };
 
+/*!
+ * \brief Converts cast_like operator to cast. Not inheriting from ConcretizeLikeRewrite
+ * because even if shape is not static, still can concretize.
+ */
+class ConcretizeCastLikeRewrite : public DFPatternRewrite {
+ public:
+  ConcretizeCastLikeRewrite() {
+    data_pat_ = IsWildcard();
+    like_pat_ = IsWildcard();
+    pattern_ = IsOp("cast_like")({data_pat_, like_pat_});
+  }
+
+  Expr Callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
+    const CallNode* call_node = pre.as<CallNode>();
+    ICHECK(call_node);
+
+    if (!call_node->checked_type().as<TensorTypeNode>()) {
+      return post;
+    }
+
+    const TensorTypeNode* like_ty = pre->checked_type().as<TensorTypeNode>();
+    return MakeCast(node_map[data_pat_][0], like_ty->dtype);
+  }
+
+ protected:
+  DFPattern data_pat_;
+  DFPattern like_pat_;
+};
+
 /*! \brief Eliminates expressions that are equivalent to identity. */
 class EliminateIdentityRewrite : public DFPatternRewrite {
  public:
@@ -762,6 +792,7 @@ Expr SimplifyExpr(const Expr& expr, const IRModule& mod) {
   composer.AddRewrite<ConcretizeReshapeLikeRewrite>();
   composer.AddRewrite<ConcretizeCollapseSumLikeRewrite>();
   composer.AddRewrite<ConcretizeBroadcastToLikeRewrite>();
+  composer.AddRewrite<ConcretizeCastLikeRewrite>();
   composer.AddRewrite<SimplifyRSqrt>();
   composer.AddRewrite<EliminateIdentityRewrite>();
   composer.AddRewrite<SimplifyReshape>();
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index dcd58602b0ac..16d5efe10c44 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -442,7 +442,7 @@ def test_simplify_consecutive_cast():
     expr1 = relay.cast(x, "int32")
     expr2 = relay.cast_like(expr1, y)
     actual = run_opt_pass(expr2, relay.transform.SimplifyExpr())
-    expected = run_infer_type(expr2)
+    expected = run_infer_type(relay.cast(expr1, "float32"))
     assert tvm.ir.structural_equal(actual, expected)
 
 
@@ -517,6 +517,17 @@ def test_concretize_broadcast_to_like():
     assert tvm.ir.structural_equal(actual, expected)
 
 
+def test_concretize_cast_like():
+    dim_any = tvm.tir.Any()
+    data = relay.var("data", shape=(3, dim_any, 5), dtype="float32")
+    dtype_like = relay.var("dtype_like", shape=(dim_any, 3, 3), dtype="int32")
+    expr = relay.cast_like(data, dtype_like)
+
+    expected = run_infer_type(relay.cast(data, "int32"))
+    actual = run_opt_pass(expr, relay.transform.SimplifyExpr())
+    assert tvm.ir.structural_equal(actual, expected)
+
+
 def test_concretize_multiple():
     x = relay.var("x", shape=(2, 3), dtype="float32")
     y = relay.var("y", shape=(3,), dtype="float32")

From 61c9742ea79d0057290502379a81a5487c77790d Mon Sep 17 00:00:00 2001
From: Adam Straw <astraw@octoml.ai>
Date: Wed, 12 Oct 2022 18:52:07 -0700
Subject: [PATCH 346/704] [Hexagon] Enable multi input Async DMA; same queue /
 stage (#13037)

* [Hexagon] Enable multi input Async DMA; same queue / stage

* add option to merge (or separate) async_commit_queue_scope attrs

* move merge_async_commit_queue_scope option select inside pass
---
 src/driver/driver_api.cc                      |   1 +
 .../transforms/inject_software_pipeline.cc    |  51 +++--
 src/tir/transforms/lower_async_dma.cc         |   7 +-
 .../test_software_pipeline_async.py           | 180 +++++++++++++-----
 4 files changed, 173 insertions(+), 66 deletions(-)

diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index b0af0fb65e16..5f8c8742695d 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -51,6 +51,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.is_entry_func", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array<Array<ObjectRef>>);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool);
 
 using runtime::PackedFunc;
 using runtime::TVMArgs;
diff --git a/src/tir/transforms/inject_software_pipeline.cc b/src/tir/transforms/inject_software_pipeline.cc
index 08d57c53d1c2..51523a37399b 100644
--- a/src/tir/transforms/inject_software_pipeline.cc
+++ b/src/tir/transforms/inject_software_pipeline.cc
@@ -309,9 +309,10 @@ class PipelineRewriter : public StmtExprMutator {
       const Array<Buffer> pipeline_allocs, const For& pipeline_loop,
       const PipelineInfo& pipeline_info,
       const std::unordered_map<const VarNode*, FragmentInfo>& fragment_info,
-      const Map<String, ObjectRef> preserved_annotations) {
+      const Map<String, ObjectRef> preserved_annotations, bool merge_async_commit_queue_scope) {
     PipelineRewriter rewriter(buffer_data_to_buffer, double_buffers, pipeline_allocs, pipeline_loop,
-                              pipeline_info, fragment_info, preserved_annotations);
+                              pipeline_info, fragment_info, preserved_annotations,
+                              merge_async_commit_queue_scope);
     return rewriter.BuildPipeline();
   }
 
@@ -321,7 +322,8 @@ class PipelineRewriter : public StmtExprMutator {
                    const Array<Buffer>& pipeline_allocs, const For& pipeline_loop,
                    const PipelineInfo& pipeline_info,
                    const std::unordered_map<const VarNode*, FragmentInfo>& fragment_info,
-                   const Map<String, ObjectRef> preserved_annotations)
+                   const Map<String, ObjectRef> preserved_annotations,
+                   bool merge_async_commit_queue_scope)
 
       : buffer_data_to_buffer_(std::move(buffer_data_to_buffer)),
         double_buffers_(double_buffers),
@@ -329,7 +331,8 @@ class PipelineRewriter : public StmtExprMutator {
         pipeline_loop_(pipeline_loop),
         pipeline_info_(pipeline_info),
         fragment_info_(fragment_info),
-        preserved_annotations_(preserved_annotations) {}
+        preserved_annotations_(preserved_annotations),
+        merge_async_commit_queue_scope_(merge_async_commit_queue_scope) {}
 
   Stmt BuildPipeline() {
     // Step 1: Analyze accesses to the buffers in the pipeline and compute the number of versions
@@ -762,11 +765,19 @@ class PipelineRewriter : public StmtExprMutator {
               << "Predicates in the same stage are expected to be identical";
           group_bodies.push_back(new_blocks[i].block->body);
         }
-        auto body = group_bodies.size() > 1 ? SeqStmt(group_bodies) : group_bodies[0];
-        auto commit_queue_scope = AttrStmt(make_zero(DataType::Int(32)),
-                                           tir::attr::async_commit_queue_scope, stage_id, body);
-        auto new_block = MakeBlock(commit_queue_scope, buffer_data_to_buffer_);
-        stmts.push_back(BlockRealize({}, predicate, new_block));
+
+        if (merge_async_commit_queue_scope_ && group_bodies.size() > 1) {
+          auto merged_bodies = SeqStmt(group_bodies);
+          group_bodies.clear();
+          group_bodies.push_back(merged_bodies);
+        }
+
+        for (auto body : group_bodies) {
+          auto commit_queue_scope = AttrStmt(make_zero(DataType::Int(32)),
+                                             tir::attr::async_commit_queue_scope, stage_id, body);
+          auto new_block = MakeBlock(commit_queue_scope, buffer_data_to_buffer_);
+          stmts.push_back(BlockRealize({}, predicate, new_block));
+        }
       }
     }
 
@@ -842,7 +853,8 @@ class PipelineRewriter : public StmtExprMutator {
         auto& local_state = async_states_local[stage];
 
         int commit_group_id = -1;
-        if (local_state.commit_groups.empty() || local_state.consumed) {
+        if (local_state.commit_groups.empty() || local_state.consumed ||
+            !merge_async_commit_queue_scope_) {
           // consumed == true means there is already a consumer stage waiting for an
           // eariler async operation of this stage. In such cases, we make multiple commit_queue
           // for this stage.
@@ -942,6 +954,7 @@ class PipelineRewriter : public StmtExprMutator {
   Array<Block> ordered_stmts_;
   std::map<int, AsyncStateGlobal> async_states;
   Map<String, ObjectRef> preserved_annotations_;
+  bool merge_async_commit_queue_scope_ = true;
 };
 
 /*!
@@ -980,8 +993,8 @@ void BuildDependencyGraph(
 
 class PipelineInjector : private StmtExprMutator {
  public:
-  static Stmt Inject(const PrimFunc& func) {
-    PipelineInjector injector;
+  static Stmt Inject(const PrimFunc& func, bool merge_async_commit_queue_scope) {
+    PipelineInjector injector(merge_async_commit_queue_scope);
     for (const auto& kv : func->buffer_map) {
       const Buffer& buffer = kv.second;
       injector.buffer_data_to_buffer_.Set(buffer->data, buffer);
@@ -991,7 +1004,8 @@ class PipelineInjector : private StmtExprMutator {
   }
 
  private:
-  PipelineInjector() = default;
+  explicit PipelineInjector(bool merge_async_commit_queue_scope)
+      : merge_async_commit_queue_scope_(merge_async_commit_queue_scope) {}
 
   /*!
    * \brief Check the pipeline satisfies the following conditions:
@@ -1126,9 +1140,9 @@ class PipelineInjector : private StmtExprMutator {
     ValidatePipelineBody(pipeline_info, original_order);
 
     // Step 4: Rewrite the pipeline body.
-    Stmt pipeline = PipelineRewriter::Rewrite(buffer_data_to_buffer_, double_buffers,
-                                              pipeline_allocs, GetRef<For>(op), pipeline_info,
-                                              fragment_info_, preserved_annotations);
+    Stmt pipeline = PipelineRewriter::Rewrite(
+        buffer_data_to_buffer_, double_buffers, pipeline_allocs, GetRef<For>(op), pipeline_info,
+        fragment_info_, preserved_annotations, merge_async_commit_queue_scope_);
 
     if (const auto* realize = op->body.as<BlockRealizeNode>()) {
       const auto& block = realize->block;
@@ -1197,6 +1211,7 @@ class PipelineInjector : private StmtExprMutator {
   Map<Var, Buffer> buffer_data_to_buffer_;
   std::unordered_map<const VarNode*, FragmentInfo> fragment_info_;
   std::unordered_set<Buffer, ObjectPtrHash, ObjectPtrEqual> double_buffers;
+  bool merge_async_commit_queue_scope_ = true;
 };
 
 }  // namespace software_pipeline
@@ -1210,7 +1225,9 @@ namespace transform {
 Pass InjectSoftwarePipeline() {
   auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) {
     auto* fptr = f.CopyOnWrite();
-    fptr->body = software_pipeline::PipelineInjector::Inject(f);
+    bool merge_async_commit_queue_scope =
+        ctx->GetConfig<Bool>("tir.merge_async_commit_queue_scope", Bool(true)).value();
+    fptr->body = software_pipeline::PipelineInjector::Inject(f, merge_async_commit_queue_scope);
     fptr->body = ConvertSSA(std::move(fptr->body));
     return f;
   };
diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc
index 78d363f67c02..417e9d61f263 100644
--- a/src/tir/transforms/lower_async_dma.cc
+++ b/src/tir/transforms/lower_async_dma.cc
@@ -94,9 +94,6 @@ class AsyncDMALowerer : public StmtExprMutator {
       ICHECK(queue_id_node);
       int queue_id = queue_id_node->value;
 
-      // save queue ID for inspection in `wait` transform
-      queue_ids.insert(queue_id);
-
       // walk the graph to verify this is a mem copy ...
       // 1) async_commit_queue_scope contains async_scope
       auto async_scope = op->body.as<AttrStmtNode>();
@@ -161,6 +158,10 @@ class AsyncDMALowerer : public StmtExprMutator {
         return analyzer.Simplify(Substitute(std::move(expr), loop_var_remap));
       });
 
+      // now that we are about to perform the `copy` transform
+      // save queue ID for inspection in `wait` transform
+      queue_ids.insert(queue_id);
+
       return Evaluate(Call(DataType::Int(32), builtin::dma_copy(),
                            {queue_id,
                             Call(DataType::Handle(), builtin::address_of(),
diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
index 943d4262f9da..a883a9a251e3 100644
--- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
+++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
@@ -30,81 +30,169 @@
 inner = tvm.testing.parameter(64, 128)
 dtype = tvm.testing.parameter("uint8", "float16")
 scope = tvm.testing.parameter("global", "global.vtcm")
+# TODO(Joseph) Turn on "multi_input_diffQ" compute type once we have upstreamed
+# changes in the InjectSoftwarePipeline pass to alleviate this restriction:
+# 'A dependency on multiple async stages is not supported'
+comp_type = tvm.testing.parameter("single_input", "multi_input_sameQ")
 # TODO(Straw) Add back "cache_write" schedule type once we have upstreamed
 # buffer dependency analysis in InjectSoftwarePipeline pass
 # to insert approprite TIR "wait" attributes for this schedule
-sched = tvm.testing.parameter("cache_read", "cache_read_write")
+sched_type = tvm.testing.parameter("cache_read", "cache_read_write")
 
 
 @tvm.testing.fixture
-def compute(outer, inner, dtype):
-    @T.prim_func
-    def plus_one_primfunc(A: T.Buffer[(outer, inner), dtype], B: T.Buffer[(outer, inner), dtype]):
-        for i in T.serial(outer):
-            for j in T.serial(inner):
-                with T.block("compute"):
-                    with T.block():
-                        B[i, j] = A[i, j] + T.cast(1, dtype)
+def data(comp_type, outer, inner, dtype):
+    out_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
+    a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
+    if comp_type == "single_input":
+        return out_np, a_np
+    else:
+        b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
+        return out_np, a_np, b_np
+
+
+@tvm.testing.fixture
+def compute(comp_type, outer, inner, dtype):
+    if comp_type == "single_input":
+
+        @T.prim_func
+        def a_plus_1_primfunc(
+            A: T.Buffer[(outer, inner), dtype], OUT: T.Buffer[(outer, inner), dtype]
+        ):
+            for i in T.serial(outer):
+                for j in T.serial(inner):
+                    with T.block("compute"):
+                        with T.block():
+                            OUT[i, j] = A[i, j] + T.cast(1, dtype)
+
+        return a_plus_1_primfunc
+    else:
+
+        @T.prim_func
+        def a_plus_b_plus_1_primfunc(
+            A: T.Buffer[(outer, inner), dtype],
+            B: T.Buffer[(outer, inner), dtype],
+            OUT: T.Buffer[(outer, inner), dtype],
+        ):
+            for i in T.serial(outer):
+                for j in T.serial(inner):
+                    with T.block("compute"):
+                        with T.block():
+                            OUT[i, j] = A[i, j] + B[i, j] + T.cast(1, dtype)
+
+        return a_plus_b_plus_1_primfunc
+
+
+@tvm.testing.fixture
+def reference(comp_type):
+    if comp_type == "single_input":
+
+        def a_plus_1_ref(a):
+            return a + 1
+
+        return a_plus_1_ref
+    else:
 
-    def plus_one_ref(a):
-        return a + 1
+        def a_plus_b_plus_1_ref(a, b):
+            return a + b + 1
 
-    return plus_one_primfunc, plus_one_ref
+        return a_plus_b_plus_1_ref
 
 
 @tvm.testing.fixture
-def schedule(compute, sched, scope):
-    sch = tir.Schedule(compute[0])
+def schedule(comp_type, compute, sched_type, scope):
+    sch = tir.Schedule(compute)
 
     compute_block = sch.get_block("compute")
     i, _ = sch.get_loops(compute_block)
 
-    if sched == "cache_read":
-        cache_read_block = sch.cache_read(compute_block, 0, scope)
-        sch.compute_at(cache_read_block, i)
-        sch.annotate(i, "software_pipeline_stage", [0, 1])
-        sch.annotate(i, "software_pipeline_order", [0, 1])
-        sch.annotate(i, "software_pipeline_async_stages", [0])
-    elif sched == "cache_write":
-        cache_write_block = sch.cache_write(compute_block, 0, scope)
-        sch.reverse_compute_at(cache_write_block, i)
+    if "read" in sched_type:
+        cache_read_a = sch.cache_read(compute_block, 0, scope)
+        sch.compute_at(cache_read_a, i)
+
+        if "multi_input" in comp_type:
+            cache_read_b = sch.cache_read(compute_block, 1, scope)
+            sch.compute_at(cache_read_b, i)
+
+    if "write" in sched_type:
+        cache_write_out = sch.cache_write(compute_block, 0, scope)
+        sch.reverse_compute_at(cache_write_out, i)
+
+    if "read" in sched_type and "write" in sched_type:
+        if comp_type == "single_input":
+            sch.annotate(i, "software_pipeline_stage", [0, 1, 2])
+            sch.annotate(i, "software_pipeline_order", [0, 1, 2])
+            sch.annotate(i, "software_pipeline_async_stages", [0, 2])
+        elif comp_type == "multi_input_sameQ":
+            sch.annotate(i, "software_pipeline_stage", [0, 0, 1, 2])
+            sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3])
+            sch.annotate(i, "software_pipeline_async_stages", [0, 2])
+        elif comp_type == "multi_input_diffQ":
+            sch.annotate(i, "software_pipeline_stage", [0, 1, 2, 3])
+            sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3])
+            sch.annotate(i, "software_pipeline_async_stages", [0, 1, 2])
+
+    elif "read" in sched_type:
+        if comp_type == "single_input":
+            sch.annotate(i, "software_pipeline_stage", [0, 1])
+            sch.annotate(i, "software_pipeline_order", [0, 1])
+            sch.annotate(i, "software_pipeline_async_stages", [0])
+        elif comp_type == "multi_input_sameQ":
+            sch.annotate(i, "software_pipeline_stage", [0, 0, 1])
+            sch.annotate(i, "software_pipeline_order", [0, 1, 2])
+            sch.annotate(i, "software_pipeline_async_stages", [0])
+        elif comp_type == "multi_input_diffQ":
+            sch.annotate(i, "software_pipeline_stage", [0, 1, 2])
+            sch.annotate(i, "software_pipeline_order", [0, 1, 2])
+            sch.annotate(i, "software_pipeline_async_stages", [0, 1])
+
+    elif "write" in sched_type:
         sch.annotate(i, "software_pipeline_stage", [0, 1])
         sch.annotate(i, "software_pipeline_order", [0, 1])
         sch.annotate(i, "software_pipeline_async_stages", [1])
-    elif sched == "cache_read_write":
-        cache_read_block = sch.cache_read(compute_block, 0, scope)
-        sch.compute_at(cache_read_block, i)
-        cache_write_block = sch.cache_write(compute_block, 0, scope)
-        sch.reverse_compute_at(cache_write_block, i)
-        sch.annotate(i, "software_pipeline_stage", [0, 1, 2])
-        sch.annotate(i, "software_pipeline_order", [0, 1, 2])
-        sch.annotate(i, "software_pipeline_async_stages", [0, 2])
 
     return sch
 
 
-@tvm.testing.requires_hexagon
-def test_async_software_pipeline(hexagon_launcher, compute, schedule, outer, inner, dtype, scope):
-    sch = schedule
+@tvm.testing.fixture
+def verify(dtype):
+    def check(out, ref):
+        if "int" in dtype:
+            np.testing.assert_equal(out.numpy(), ref)
+        else:
+            np.testing.assert_allclose(out.numpy(), ref, rtol=1e-3, atol=1e-3)
+
+    return check
 
-    a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-    b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-    ref = compute[1](a_np)
 
-    with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        func = tvm.build(sch.mod["main"], target=get_hexagon_target("v68"))
+@tvm.testing.requires_hexagon
+def test_async_software_pipeline(hexagon_launcher, comp_type, data, reference, schedule, verify):
+    out_np = data[0]
+    a_np = data[1]
+    if comp_type == "single_input":
+        ref = reference(a_np)
+    else:
+        b_np = data[2]
+        ref = reference(a_np, b_np)
+
+    with tvm.transform.PassContext(
+        config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": False}
+    ):
+        # tvm.lower(schedule.mod["main"]).show()
+        func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68"))
 
     with hexagon_launcher.start_session() as hexagon_session:
         dev = hexagon_session.device
-        a = tvm.nd.array(a_np, device=dev)
-        b = tvm.nd.array(b_np, device=dev)
         mod = hexagon_session.load_module(func)
-        mod(a, b)
-
-        if "int" in dtype:
-            np.testing.assert_equal(b.numpy(), ref)
+        out = tvm.nd.array(out_np, device=dev)
+        a = tvm.nd.array(a_np, device=dev)
+        if comp_type == "single_input":
+            mod(a, out)
         else:
-            np.testing.assert_allclose(b.numpy(), ref, rtol=1e-3, atol=1e-3)
+            b = tvm.nd.array(b_np, device=dev)
+            mod(a, b, out)
+
+        verify(out, ref)
 
 
 if __name__ == "__main__":

From 46c16eed838ceef4cb336ef6a2a50674f9938088 Mon Sep 17 00:00:00 2001
From: Alan MacDonald <alanmacd@users.noreply.github.com>
Date: Thu, 13 Oct 2022 10:46:52 -0700
Subject: [PATCH 347/704] [microTVM][Windows] Enable building TVM on Windows
 with USE_MICRO (#12851)

* initial version of building TVM on windows with USE_MICRO enabled

* fix rand() lint issue

* a few more lint cleanups

* move ssize.h to an easier-to-use location; remove unused code;

* lint

* change to using CMake add_library()

* move ssize.h back to original location

* lint

* added new line
---
 CMakeLists.txt                                |  9 ++++--
 cmake/modules/StandaloneCrt.cmake             | 30 +++++++++++++++++--
 include/tvm/runtime/crt/logging.h             |  4 +++
 include/tvm/runtime/crt/platform.h            |  4 +++
 include/tvm/runtime/crt/rpc_common/session.h  | 13 ++++++++
 .../tvm/runtime/crt/rpc_common/write_stream.h |  2 ++
 src/runtime/micro/micro_session.cc            |  9 ++++++
 7 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 47d30a89d2d1..3a21d22f78f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -564,10 +564,15 @@ include(cmake/modules/contrib/PAPI.cmake)
 if(USE_MICRO)
   # NOTE: cmake doesn't track dependencies at the file level across subdirectories. For the
   # Unix Makefiles generator, need to add these explicit target-level dependency)
-  add_dependencies(tvm host_standalone_crt)
-  add_dependencies(tvm_runtime host_standalone_crt)
   add_dependencies(tvm_runtime zephyr)
   add_dependencies(tvm_runtime arduino)
+  if(MSVC)
+    target_link_libraries(tvm PRIVATE host_standalone_crt )
+    target_link_libraries(tvm_runtime PRIVATE host_standalone_crt)
+  else()
+    add_dependencies(tvm host_standalone_crt)
+    add_dependencies(tvm_runtime host_standalone_crt)
+  endif()
 endif()
 
 if(USE_CPP_RPC)
diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake
index e7c132651ca4..5703058d3c3d 100644
--- a/cmake/modules/StandaloneCrt.cmake
+++ b/cmake/modules/StandaloneCrt.cmake
@@ -15,9 +15,32 @@
 # specific language governing permissions and limitations
 # under the License.
 
+
 if(USE_MICRO)
+
+if(MSVC)
+
+  # When building for Windows, use standard CMake for compatibility with
+  # Visual Studio build tools and not require Make to be on the system.
+
+  set(CRT_CONFIG, "src/runtime/micro/crt_config.h")
+
+  add_library(host_standalone_crt
+              STATIC
+              3rdparty/libcrc/src/crcccitt.c
+              src/runtime/crt/microtvm_rpc_common/frame_buffer.cc
+              src/runtime/crt/microtvm_rpc_common/framing.cc
+              src/runtime/crt/microtvm_rpc_common/session.cc
+              src/runtime/crt/microtvm_rpc_common/write_stream.cc)
+
+  target_include_directories(host_standalone_crt
+                             PRIVATE
+                             3rdparty/libcrc/include
+                             src/runtime/micro)
+
+else()
+
   message(STATUS "Build standalone CRT for microTVM")
-  tvm_file_glob(GLOB crt_srcs src/runtime/crt/**)
 
   function(tvm_crt_define_targets)
     # Build an isolated build directory, separate from the TVM tree.
@@ -45,6 +68,7 @@ if(USE_MICRO)
          "src/runtime/crt/microtvm_rpc_server *.cc -> src/runtime/crt/microtvm_rpc_server"
          "src/runtime/minrpc *.h -> src/runtime/minrpc"
          "src/support generic_arena.h -> src/support"
+         "src/support ssize.h -> src/support"
          "src/runtime/crt crt_config-template.h -> template"
          )
 
@@ -149,4 +173,6 @@ if(USE_MICRO)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${TVM_CRT_LINKER_LIB})
   endif()
 
-endif(USE_MICRO)
+endif()
+
+endif()
diff --git a/include/tvm/runtime/crt/logging.h b/include/tvm/runtime/crt/logging.h
index e955739ee80e..6cedf1b6df2b 100644
--- a/include/tvm/runtime/crt/logging.h
+++ b/include/tvm/runtime/crt/logging.h
@@ -37,7 +37,11 @@
 extern "C" {
 #endif
 
+#if defined(_MSC_VER)
+void TVMLogf(const char* fmt, ...);
+#else
 void __attribute__((format(printf, 1, 2))) TVMLogf(const char* fmt, ...);
+#endif
 
 #define LOG(level, x, ...)          \
   if (TVM_CRT_LOG_LEVEL >= level) { \
diff --git a/include/tvm/runtime/crt/platform.h b/include/tvm/runtime/crt/platform.h
index bb916afacde1..1bc610e6cc53 100644
--- a/include/tvm/runtime/crt/platform.h
+++ b/include/tvm/runtime/crt/platform.h
@@ -40,7 +40,11 @@ extern "C" {
  *
  * \param code An error code.
  */
+#if defined(_MSC_VER)
+__declspec(noreturn) void TVMPlatformAbort(tvm_crt_error_t code);
+#else
 void __attribute__((noreturn)) TVMPlatformAbort(tvm_crt_error_t code);
+#endif
 
 /*! \brief Called by the microTVM RPC server to implement TVMLogf.
  *
diff --git a/include/tvm/runtime/crt/rpc_common/session.h b/include/tvm/runtime/crt/rpc_common/session.h
index eee1de6072d2..9bea4b05e7eb 100644
--- a/include/tvm/runtime/crt/rpc_common/session.h
+++ b/include/tvm/runtime/crt/rpc_common/session.h
@@ -43,11 +43,24 @@ enum class MessageType : uint8_t {
   kNormal = 0x10,
 };
 
+#if defined(_MSC_VER)
+
+#pragma pack(push, 1)
+typedef struct SessionHeader {
+  uint16_t session_id;
+  MessageType message_type;
+} SessionHeader;
+#pragma pack(pop)
+
+#else
+
 typedef struct SessionHeader {
   uint16_t session_id;
   MessageType message_type;
 } __attribute__((packed)) SessionHeader;
 
+#endif
+
 /*!
  * \brief CRT communication session management class.
  * Assumes the following properties provided by the underlying transport:
diff --git a/include/tvm/runtime/crt/rpc_common/write_stream.h b/include/tvm/runtime/crt/rpc_common/write_stream.h
index cdc579585993..f72ba021def6 100644
--- a/include/tvm/runtime/crt/rpc_common/write_stream.h
+++ b/include/tvm/runtime/crt/rpc_common/write_stream.h
@@ -30,6 +30,8 @@
 #include <sys/types.h>
 #include <tvm/runtime/crt/error_codes.h>
 
+#include "../../../../../src/support/ssize.h"
+
 namespace tvm {
 namespace runtime {
 namespace micro_rpc {
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index 6911c2021ac1..23ed09119a69 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -176,11 +176,20 @@ class MicroTransportChannel : public RPCChannel {
     // confusion.
     unsigned int seed = random_seed.load();
     if (seed == 0) {
+#if defined(_MSC_VER)
       seed = (unsigned int)time(nullptr);
+      srand(seed);
+#else
+      seed = (unsigned int)time(nullptr);
+#endif
     }
     uint8_t initial_nonce = 0;
     for (int i = 0; i < kNumRandRetries && initial_nonce == 0; ++i) {
+#if defined(_MSC_VER)
+      initial_nonce = rand();  // NOLINT(runtime/threadsafe_fn)
+#else
       initial_nonce = rand_r(&seed);
+#endif
     }
     random_seed.store(seed);
     ICHECK_NE(initial_nonce, 0) << "rand() does not seem to be producing random values";

From f06896f38f0898c7cdd17d62fd05318d10e20979 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 13 Oct 2022 10:56:33 -0700
Subject: [PATCH 348/704] [Relay] Fix simplifying consecutive casts when
 intermediate type has fewer bits (#13056)

* [Relay] Fix simplifying consecutive casts when intermediate type has fewer bits

* fix
---
 src/relay/transforms/simplify_expr.cc         | 2 +-
 tests/python/relay/aot/test_crt_aot_usmp.py   | 2 +-
 tests/python/relay/test_pass_simplify_expr.py | 7 +++++++
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index 7a4a9bd9fae8..cf594a09a266 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -146,7 +146,7 @@ class SimplifyConsecutiveCast : public DFPatternRewrite {
       // BFloat cast cannot be omitted
       return false;
     }
-    if (origin.code() < cast.code()) {
+    if (origin.code() < cast.code() && origin.bits() <= cast.bits()) {
       // Loosely have a hiearchy to datatypes
       // e.g. int --> uint --> float has increasing range of numbers they can represent
       return true;
diff --git a/tests/python/relay/aot/test_crt_aot_usmp.py b/tests/python/relay/aot/test_crt_aot_usmp.py
index b79350d172ac..75613d81e145 100644
--- a/tests/python/relay/aot/test_crt_aot_usmp.py
+++ b/tests/python/relay/aot/test_crt_aot_usmp.py
@@ -303,7 +303,7 @@ def test_byoc_microtvm(merge_compiler_regions):
     "model_url, usmp_algo, workspace_size, constant_size",
     [
         (MOBILENET_V1_URL, "greedy_by_size", 4845696, 8468008),
-        (MOBILENET_V1_URL, "greedy_by_conflicts", 4444288, 8468008),
+        (MOBILENET_V1_URL, "greedy_by_conflicts", 4845696, 8468008),
         (MOBILENET_V1_URL, "hill_climb", 3240064, 8468008),
     ],
 )
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index 16d5efe10c44..e84d238aaa75 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -445,6 +445,13 @@ def test_simplify_consecutive_cast():
     expected = run_infer_type(relay.cast(expr1, "float32"))
     assert tvm.ir.structural_equal(actual, expected)
 
+    x = relay.var("x", shape=(3, 4), dtype="int64")
+    expr1 = relay.cast(x, "bool")
+    expr2 = relay.cast(expr1, "int32")
+    actual = run_opt_pass(expr2, relay.transform.SimplifyExpr())
+    expected = run_infer_type(expr2)
+    assert tvm.ir.structural_equal(actual, expected)
+
 
 def test_concretize_reshape_like():
     data = relay.var("data", shape=(2, 3, 4), dtype="float32")

From 29a8f06066cdfb38df63f25d8ceb951107df69bb Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 13 Oct 2022 15:08:43 -0500
Subject: [PATCH 349/704] [Arith] Optional rewriting and simplification into
 AND of ORs (#12972)

Previously, `RewriteSimplifier` inspected the top branch of each `And`
and `Or` node, but didn't perform simplifications that would require
inspection across branches nested structures.  This commit introduces
`SimplifyAsAndOfOrs`, which converts to an internal representation as
conjunctive normal form, performs simplifications while in that form,
then converts back to a `PrimExpr`.

This utility is designed for data-propagation simplifications as part
of https://github.com/apache/tvm/issues/12261.  To avoid increasing
the CPU load unnecesarily, this utility is only used on an opt-in
basis, either using `RewriteSimplifer::SetEnabledFeatures` when called
directly, or using `PassContext` when used as part of
`tir::transform::Simplify`.

* [Arith][UnitTest] Unittests displaying desired behavior

* Corrected example in analyzer.h

* Added underscore for private members

* Rename Cleanup() to RemoveTrueFalse()

* Added comments for conversion to internal representation

* Updated comment in test case

* Preferentially order clauses according to first occurrence

* Added more unit tests
---
 include/tvm/arith/analyzer.h                  |   8 +
 src/arith/conjunctive_normal_form.cc          | 430 ++++++++++++++++++
 src/arith/conjunctive_normal_form.h           |  49 ++
 src/arith/rewrite_simplify.cc                 |  11 +
 src/arith/rewrite_simplify.h                  |   4 +
 src/tir/transforms/simplify.cc                |   9 +-
 .../unittest/test_tir_transform_simplify.py   | 130 ++++++
 7 files changed, 640 insertions(+), 1 deletion(-)
 create mode 100644 src/arith/conjunctive_normal_form.cc
 create mode 100644 src/arith/conjunctive_normal_form.h

diff --git a/include/tvm/arith/analyzer.h b/include/tvm/arith/analyzer.h
index 79b01d0cc859..b80d75a17058 100644
--- a/include/tvm/arith/analyzer.h
+++ b/include/tvm/arith/analyzer.h
@@ -297,6 +297,14 @@ class RewriteSimplifier {
      * if_then_else(i<j && j<k, i<k, false) => if_then_else(i<j && j<k, true, false)
      */
     kTransitivelyProveInequalities = (1 << 0),
+
+    /* When simplifying a boolean expression, convert to an AND of ORs
+     * (conjunctive normal form).
+     *
+     * Example:
+     *   (a && b) || c => (a || c) && (b || c)
+     */
+    kConvertBooleanToAndOfOrs = (1 << 1),
   };
 
   /*! \brief Enable an optional extension or extensions
diff --git a/src/arith/conjunctive_normal_form.cc b/src/arith/conjunctive_normal_form.cc
new file mode 100644
index 000000000000..19d6a234e6ad
--- /dev/null
+++ b/src/arith/conjunctive_normal_form.cc
@@ -0,0 +1,430 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/arith/conjunctive_normal_form.cc
+ */
+
+#include "conjunctive_normal_form.h"
+
+#include <tvm/arith/analyzer.h>
+#include <tvm/tir/expr.h>
+
+#include <optional>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "pattern_match.h"
+#include "rewrite_simplify.h"
+
+namespace tvm {
+namespace arith {
+
+namespace {
+/* \brief A utility for simplifying expressions using conjunctive/disjuctive normal forms */
+class AndOfOrs {
+ public:
+  /*! \brief Construct the simplifier
+   *
+   * Convert a PrimExpr to the internal representation.
+   *
+   * \param expr The PrimExpr to be simplified.
+   */
+  explicit AndOfOrs(const PrimExpr& expr);
+
+  /*! \brief Convert internal representation to PrimExpr */
+  PrimExpr AsPrimExpr() const;
+
+  /*! \brief Simplify the internal representation */
+  void Simplify(Analyzer* analyzer);
+
+ private:
+  /*! \brief Internal utility, simplify within each group of expressions
+   *
+   * For each pair of values within a chunk, attempt to simplify them into
+   * a single expression.
+   *
+   * For example,
+   *    before = (a == 5) && ((b < 10) || (b > 10))
+   *    after  = (a == 5) && ((b != 10) || false)
+   */
+  void SimplifyWithinChunks(Analyzer* analyzer);
+
+  /*! \brief Internal utility, simplify across groups of expressions
+   *
+   * For each pair of chunks, if the two chunks differ by only a single
+   * term, attempt to simplify those differing terms.
+   *
+   * For example,
+   *    before = ((a == 5) || (b <= 10)) && ((a == 5) || (b >= 10))
+   *    after  = ((a == 5) || (b == 10)) && ((a == 5) || true)
+   */
+  void SimplifyAcrossChunks(Analyzer* analyzer);
+
+  /*! \brief Remove instances of true/false from internal representation
+   *
+   * To avoid invalidating iterators, `SimplifyWithinChunks` and
+   * `SimplifyAcrossChunks` may replace keys, but may not remove keys
+   * from the internal representation.  For example, `(a < 5) && (a <
+   * 10)` would be simplified to `(a < 5) && true`.  The
+   * `RemoveTrueFalse` function removes these leftover instances of
+   * true/false.
+   */
+  void RemoveTrueFalse();
+
+  /*! \brief Internal utility function used to convert to internal form */
+  static void VisitAndExpressions(const PrimExpr& expr,
+                                  std::function<void(const PrimExpr&)> callback);
+  /*! \brief Internal utility function used to convert to internal form */
+  static void VisitOrExpressions(const PrimExpr& expr,
+                                 std::function<void(const PrimExpr&)> callback);
+
+  /* \brief Type-safe wrapper class that represents an PrimExpr
+   *
+   * Because integer indices are used frequently through this class,
+   * maintaining a separation between integer indices used to access
+   * specific elements of the internal representation, and unique
+   * identifiers used to represent expressions PrimExpr, is useful.
+   */
+  enum class Key : size_t {};
+
+  /*! \brief Convert a PrimExpr to a Key */
+  Key GetKey(const PrimExpr& expr);
+
+  /*! \brief Convert a Key to a PrimExpr */
+  PrimExpr GetExpr(Key key) const;
+
+  /*! \brief Attempt to simplify (a && b)
+   *
+   * If successful, will overwrite the parameters `a` and `b` with the
+   * simplified form.
+   */
+  void TrySimplifyOr(Key* a, Key* b, Analyzer* analyzer);
+
+  /*! \brief Attempt to simplify (a || b)
+   *
+   * If successful, will overwrite the parameters `a` and `b` with the
+   * simplified form.
+   */
+  void TrySimplifyAnd(Key* a, Key* b, Analyzer* analyzer);
+
+  /*! \brief The internal representation
+   *
+   * `chunks[i][j]` is the j-th expression in the i-th OR-group.
+   */
+  std::vector<std::vector<Key>> chunks_;
+
+  /*! \brief Mapping from internal Key to PrimExpr */
+  std::unordered_map<Key, PrimExpr, StructuralHash, StructuralEqual> key_to_expr_;
+
+  /*! \brief Mapping from PrimExpr to internal Key */
+  std::unordered_map<PrimExpr, Key, StructuralHash, StructuralEqual> expr_to_key_;
+
+  /*! \brief Cached key representing tir::Bool(true) */
+  Key key_true_;
+
+  /*! \brief Cached key representing tir::Bool(false) */
+  Key key_false_;
+};
+
+AndOfOrs::AndOfOrs(const PrimExpr& expr)
+    : key_true_(GetKey(Bool(true))), key_false_(GetKey(Bool(false))) {
+  VisitAndExpressions(expr, [&](const PrimExpr& outer_expr) {
+    std::vector<Key> or_components;
+    VisitOrExpressions(outer_expr, [&](const PrimExpr& inner_expr) {
+      Key key = GetKey(inner_expr);
+      bool is_duplicate = std::any_of(or_components.begin(), or_components.end(),
+                                      [&](Key prev) { return prev == key; });
+      if (!is_duplicate) {
+        or_components.push_back(key);
+      }
+    });
+
+    bool is_permutation =
+        std::any_of(chunks_.begin(), chunks_.end(), [&](const std::vector<Key>& prev_components) {
+          return or_components.size() == prev_components.size() &&
+                 std::is_permutation(prev_components.begin(), prev_components.end(),
+                                     or_components.begin());
+        });
+    if (!is_permutation) {
+      chunks_.push_back(std::move(or_components));
+    }
+  });
+}
+
+void AndOfOrs::VisitAndExpressions(const PrimExpr& expr,
+                                   std::function<void(const PrimExpr&)> callback) {
+  PVar<PrimExpr> x, y, z;
+  if ((x && y).Match(expr)) {
+    // These are separate AND conditions, recurse into them in case
+    // they contain AND internally.
+    VisitAndExpressions(x.Eval(), callback);
+    VisitAndExpressions(y.Eval(), callback);
+  } else if ((x || y).Match(expr)) {
+    // This may be the bottom-most breakdown, but either x or y may
+    // themselves contain AND.  (e.g. (A && B) || (C && D) should be
+    // split into (A || C), (A || D), (B || C), and (B || D).)
+    // Recurse into each, then reconstruct an OR condition.
+    VisitAndExpressions(x.Eval(), [&](const PrimExpr& x_part) {
+      VisitAndExpressions(y.Eval(), [&](const PrimExpr& y_part) { callback(x_part || y_part); });
+    });
+  } else {
+    // This is bottom-most breakdown.
+    callback(expr);
+  }
+}
+
+void AndOfOrs::VisitOrExpressions(const PrimExpr& expr,
+                                  std::function<void(const PrimExpr&)> callback) {
+  PVar<PrimExpr> x, y, z;
+  if ((x || y).Match(expr)) {
+    // These are separate OR conditions, recurse into them in case
+    // they contain OR internally.
+    VisitOrExpressions(x.Eval(), callback);
+    VisitOrExpressions(y.Eval(), callback);
+  } else if ((x && y).Match(expr)) {
+    // This may be the bottom-most breakdown, but either x or y may
+    // themselves contain OR.  (e.g. (A || B) && (C || D) should be
+    // split into (A && C), (A && D), (B && C), and (B && D).)
+    // Recurse into each, then reconstruct an AND condition.
+    VisitOrExpressions(x.Eval(), [&](const PrimExpr& x_part) {
+      VisitOrExpressions(y.Eval(), [&](const PrimExpr& y_part) { callback(x_part && y_part); });
+    });
+  } else {
+    // This is bottom-most breakdown.
+    callback(expr);
+  }
+}
+
+AndOfOrs::Key AndOfOrs::GetKey(const PrimExpr& expr) {
+  auto it = expr_to_key_.find(expr);
+  if (it != expr_to_key_.end()) {
+    return it->second;
+  }
+
+  Key key{expr_to_key_.size()};
+  expr_to_key_[expr] = key;
+  key_to_expr_[key] = expr;
+  return key;
+}
+
+PrimExpr AndOfOrs::GetExpr(AndOfOrs::Key key) const {
+  auto it = key_to_expr_.find(key);
+  ICHECK(it != key_to_expr_.end());
+  return it->second;
+}
+
+PrimExpr AndOfOrs::AsPrimExpr() const {
+  PrimExpr expr = Bool(true);
+  for (const auto& chunk : chunks_) {
+    PrimExpr chunk_expr = Bool(false);
+    for (Key j : chunk) {
+      chunk_expr = chunk_expr || GetExpr(j);
+    }
+    expr = expr && chunk_expr;
+  }
+  return expr;
+}
+
+void AndOfOrs::TrySimplifyOr(Key* a_ptr, Key* b_ptr, Analyzer* analyzer) {
+  Key& a = *a_ptr;
+  Key& b = *b_ptr;
+  PrimExpr joint = GetExpr(a) || GetExpr(b);
+  PrimExpr simplified = analyzer->Simplify(joint);
+  if (!ExprDeepEqual()(simplified, joint)) {
+    if (auto* simplified_or = simplified.as<OrNode>()) {
+      a = GetKey(simplified_or->a);
+      b = GetKey(simplified_or->b);
+    } else {
+      a = GetKey(simplified);
+      b = key_false_;
+    }
+  }
+}
+
+void AndOfOrs::TrySimplifyAnd(Key* a_ptr, Key* b_ptr, Analyzer* analyzer) {
+  Key& a = *a_ptr;
+  Key& b = *b_ptr;
+  PrimExpr joint = GetExpr(a) && GetExpr(b);
+  PrimExpr simplified = analyzer->Simplify(joint);
+  if (!ExprDeepEqual()(simplified, joint)) {
+    if (auto* simplified_and = simplified.as<AndNode>()) {
+      a = GetKey(simplified_and->a);
+      b = GetKey(simplified_and->b);
+    } else {
+      a = GetKey(simplified);
+      b = key_true_;
+    }
+  }
+}
+
+void AndOfOrs::Simplify(Analyzer* analyzer) {
+  SimplifyWithinChunks(analyzer);
+  RemoveTrueFalse();
+  SimplifyAcrossChunks(analyzer);
+  RemoveTrueFalse();
+}
+
+void AndOfOrs::SimplifyWithinChunks(Analyzer* analyzer) {
+  for (auto& chunk : chunks_) {
+    for (size_t expr_i = 0; expr_i < chunk.size(); expr_i++) {
+      for (size_t expr_j = expr_i + 1; expr_j < chunk.size(); expr_j++) {
+        Key& key_i = chunk[expr_i];
+        Key& key_j = chunk[expr_j];
+
+        TrySimplifyOr(&key_i, &key_j, analyzer);
+      }
+    }
+  }
+}
+
+void AndOfOrs::SimplifyAcrossChunks(Analyzer* analyzer) {
+  for (size_t i_and = 0; i_and < chunks_.size(); i_and++) {
+    for (size_t j_and = i_and + 1; j_and < chunks_.size(); j_and++) {
+      auto& i_chunk = chunks_[i_and];
+      auto& j_chunk = chunks_[j_and];
+
+      if (i_chunk.size() == 1 && j_chunk.size() == 1) {
+        auto& key_i = i_chunk[0];
+        auto& key_j = j_chunk[0];
+        TrySimplifyAnd(&key_i, &key_j, analyzer);
+        continue;
+      }
+      std::unordered_set<Key> j_set(j_chunk.begin(), j_chunk.end());
+
+      std::optional<size_t> i_distinct_index;
+      for (size_t i = 0; i < i_chunk.size(); i++) {
+        if (!j_set.count(i_chunk[i])) {
+          i_distinct_index = i;
+          break;
+        }
+      }
+
+      if (!i_distinct_index.has_value()) {
+        // I = (i_0 || i_1 || ... || i_N)
+        // J = (i_0 || i_1 || ... || i_N || j_0 || ... || j_N)
+        // I && J == I == I && true
+
+        j_chunk = {key_true_};
+        continue;
+      }
+
+      std::unordered_set<Key> i_set(i_chunk.begin(), i_chunk.end());
+
+      std::optional<size_t> j_distinct_index;
+      for (size_t j = 0; j < j_chunk.size(); j++) {
+        if (!i_set.count(j_chunk[j])) {
+          j_distinct_index = j;
+          break;
+        }
+      }
+
+      if (!j_distinct_index.has_value()) {
+        // I = (i_0 || ... || i_N || j_0 || ... || j_N)
+        // J = (j_0 || ... || j_N)
+        // I && J == J == true && J
+
+        i_chunk = {key_true_};
+        continue;
+      }
+
+      if (i_chunk.size() == j_chunk.size()) {
+        size_t num_shared_exprs = 0;
+        for (const auto& j_key : j_chunk) {
+          if (i_set.count(j_key)) {
+            ++num_shared_exprs;
+          }
+        }
+
+        if (num_shared_exprs + 1 == i_chunk.size()) {
+          // All but one of the expressions are shared.  If the AND
+          // of the distinct expressions can be simplified, we can
+          // replace.
+          //
+          // (A or B) and (A or C) => A or (B and C)
+          auto& key_i = i_chunk[i_distinct_index.value()];
+          auto& key_j = j_chunk[j_distinct_index.value()];
+          TrySimplifyAnd(&key_i, &key_j, analyzer);
+        }
+      }
+    }
+  }
+}
+
+void AndOfOrs::RemoveTrueFalse() {
+  for (auto& chunk : chunks_) {
+    // Any occurrence of True inside an OR makes the entire expression True.
+    if (std::any_of(chunk.begin(), chunk.end(), [&](Key key) { return key == key_true_; })) {
+      chunk = {key_true_};
+    } else {
+      // Any occurrence of False inside an OR can be removed
+      chunk.erase(
+          std::remove_if(chunk.begin(), chunk.end(), [&](Key key) { return key == key_false_; }),
+          chunk.end());
+    }
+  }
+
+  // Any occurence of False inside an AND makes the entire expression False.
+  if (std::any_of(chunks_.begin(), chunks_.end(),
+                  [&](const std::vector<Key>& chunk) { return chunk.size() == 0; })) {
+    chunks_ = {{}};
+  } else {
+    // Any occurrence of True inside an AND can be removed.
+    chunks_.erase(std::remove_if(chunks_.begin(), chunks_.end(),
+                                 [&](const std::vector<Key>& chunk) {
+                                   return chunk.size() == 1 && chunk[0] == key_true_;
+                                 }),
+                  chunks_.end());
+  }
+}
+
+// Helper utility for temporarily disabling the
+// kConvertBooleanToAndOfOrs flag on an analyzer, to prevent infinite
+// recursion.
+class DisableAndOfOrRecursion {
+ public:
+  explicit DisableAndOfOrRecursion(Analyzer* analyzer)
+      : analyzer_(analyzer), cached_flags_(analyzer->rewrite_simplify.GetEnabledExtensions()) {
+    auto new_flags = static_cast<RewriteSimplifier::Extension>(
+        cached_flags_ & (~RewriteSimplifier::kConvertBooleanToAndOfOrs));
+    analyzer->rewrite_simplify.SetEnabledExtensions(new_flags);
+  }
+  ~DisableAndOfOrRecursion() { analyzer_->rewrite_simplify.SetEnabledExtensions(cached_flags_); }
+
+  DisableAndOfOrRecursion(const DisableAndOfOrRecursion&) = delete;
+  DisableAndOfOrRecursion& operator=(const DisableAndOfOrRecursion&) = delete;
+
+ private:
+  Analyzer* analyzer_;
+  RewriteSimplifier::Extension cached_flags_;
+};
+
+}  // namespace
+
+PrimExpr SimplifyAsAndOfOrs(const PrimExpr& expr, Analyzer* analyzer) {
+  DisableAndOfOrRecursion context(analyzer);
+  AndOfOrs repr(analyzer->Simplify(expr));
+  repr.Simplify(analyzer);
+  return repr.AsPrimExpr();
+}
+
+}  // namespace arith
+}  // namespace tvm
diff --git a/src/arith/conjunctive_normal_form.h b/src/arith/conjunctive_normal_form.h
new file mode 100644
index 000000000000..84ee972d030e
--- /dev/null
+++ b/src/arith/conjunctive_normal_form.h
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file conjunctive_normal_form.h
+ *
+ * \brief Centralized location for simplifying into specific forms
+ */
+
+#ifndef TVM_ARITH_CONJUNCTIVE_NORMAL_FORM_H_
+#define TVM_ARITH_CONJUNCTIVE_NORMAL_FORM_H_
+
+#include <tvm/tir/expr.h>
+
+namespace tvm {
+namespace arith {
+
+class Analyzer;
+
+/*! \brief Convert boolean expression to AND of ORs and simplify
+ *
+ * \param expr The PrimExpr to be simplified
+ *
+ * \param analyzer The analyzer with which to simplify
+ *
+ * \return The simplified expression
+ */
+PrimExpr SimplifyAsAndOfOrs(const PrimExpr& expr, Analyzer* analyzer);
+
+}  // namespace arith
+}  // namespace tvm
+
+#endif  // TVM_ARITH_CONJUNCTIVE_NORMAL_FORM_H_
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index 019b8cd5d353..5e565d7e36c6 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -31,6 +31,7 @@
 #include <algorithm>
 
 #include "../target/datatype/registry.h"
+#include "conjunctive_normal_form.h"
 #include "const_fold.h"
 #include "constraint_extract.h"
 #include "pattern_match.h"
@@ -1558,8 +1559,13 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NotNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<AndNode>();
+
   if (auto const_res = TryConstFold<And>(op->a, op->b)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
+  if ((enabled_extensions_ & RewriteSimplifier::kConvertBooleanToAndOfOrs) &&
+      !recursively_visiting_boolean_) {
+    return SimplifyAsAndOfOrs(ret, analyzer_);
+  }
 
   // Pattern var to match any expression
   PVar<PrimExpr> x, y;
@@ -1596,9 +1602,14 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
+
   op = ret.as<OrNode>();
   if (auto const_res = TryConstFold<Or>(op->a, op->b)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
+  if ((enabled_extensions_ & RewriteSimplifier::kConvertBooleanToAndOfOrs) &&
+      !recursively_visiting_boolean_) {
+    return SimplifyAsAndOfOrs(ret, analyzer_);
+  }
 
   // Pattern var to match any expression
   PVar<PrimExpr> x, y;
diff --git a/src/arith/rewrite_simplify.h b/src/arith/rewrite_simplify.h
index 00c60e21ee42..02c54902153a 100644
--- a/src/arith/rewrite_simplify.h
+++ b/src/arith/rewrite_simplify.h
@@ -98,6 +98,10 @@ class RewriteSimplifier::Impl : public IRMutatorWithAnalyzer {
   // Optionally enabled extensions
   Extension enabled_extensions_{kNone};
 
+  /*! Whether the simplifier is current
+   */
+  bool recursively_visiting_boolean_{false};
+
   // maximum number of recursion allowed during a single pass.
   static const constexpr int kMaxRecurDepth = 5;
 
diff --git a/src/tir/transforms/simplify.cc b/src/tir/transforms/simplify.cc
index 2a7c0f5a3585..894dfb8ca09f 100644
--- a/src/tir/transforms/simplify.cc
+++ b/src/tir/transforms/simplify.cc
@@ -38,12 +38,17 @@ using namespace tir;
 
 struct SimplifyConfigNode : public tvm::AttrsNode<SimplifyConfigNode> {
   bool transitively_prove_inequalities;
+  bool convert_boolean_to_and_of_ors;
 
   TVM_DECLARE_ATTRS(SimplifyConfigNode, "tir.transform.SimplifyConfig") {
     TVM_ATTR_FIELD(transitively_prove_inequalities)
         .describe(
             "If true, simplify conditionals with transitive combinations of scoped constraints")
         .set_default(false);
+
+    TVM_ATTR_FIELD(convert_boolean_to_and_of_ors)
+        .describe("If true, simplify conditionals into an AND of ORs")
+        .set_default(false);
   }
 
   RewriteSimplifier::Extension GetEnabledExtensions() const {
@@ -52,6 +57,9 @@ struct SimplifyConfigNode : public tvm::AttrsNode<SimplifyConfigNode> {
       flags =
           RewriteSimplifier::Extension(flags | RewriteSimplifier::kTransitivelyProveInequalities);
     }
+    if (convert_boolean_to_and_of_ors) {
+      flags = RewriteSimplifier::Extension(flags | RewriteSimplifier::kConvertBooleanToAndOfOrs);
+    }
     return flags;
   }
 };
@@ -202,6 +210,5 @@ Pass Simplify() {
 TVM_REGISTER_GLOBAL("tir.transform.Simplify").set_body_typed(Simplify);
 
 }  // namespace transform
-
 }  // namespace tir
 }  // namespace tvm
diff --git a/tests/python/unittest/test_tir_transform_simplify.py b/tests/python/unittest/test_tir_transform_simplify.py
index 0a1263f70287..2eb9c3546ee5 100644
--- a/tests/python/unittest/test_tir_transform_simplify.py
+++ b/tests/python/unittest/test_tir_transform_simplify.py
@@ -138,12 +138,14 @@ def sls(n, d):
 
 class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
     transitively_prove_inequalities = False
+    convert_boolean_to_and_of_ors = False
 
     def transform(self):
         def inner(mod):
             config = {
                 "tir.Simplify": {
                     "transitively_prove_inequalities": self.transitively_prove_inequalities,
+                    "convert_boolean_to_and_of_ors": self.convert_boolean_to_and_of_ors,
                 }
             }
             with tvm.transform.PassContext(config=config):
@@ -686,5 +688,133 @@ def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
     expected = before
 
 
+class TestRewriteAsAndOfOrs(BaseBeforeAfter):
+    """If enabled, rewrite boolean expressions into AND of OR"""
+
+    convert_boolean_to_and_of_ors = True
+
+    def before(A: T.Buffer[3, "bool"]):
+        T.evaluate(A[0] or (A[1] and A[2]))
+
+    def expected(A: T.Buffer[3, "bool"]):
+        T.evaluate((A[0] or A[1]) and (A[0] or A[2]))
+
+
+class TestSuppressRewriteAsAndOfOrs(BaseBeforeAfter):
+    """Only rewrite into AND of OR when allowed"""
+
+    convert_boolean_to_and_of_ors = False
+
+    def before(A: T.Buffer[3, "bool"]):
+        T.evaluate(A[0] or (A[1] and A[2]))
+
+    expected = before
+
+
+class TestRewriteAsAndOfOrsWithTopLevelAnd(BaseBeforeAfter):
+    """The expression being rewritten may start with an AND
+
+    Like TestRewriteAsAndOfOrs, but with an AndNode as the outermost
+    booelan operator.  Even though it is primarily OR nodes that are
+    being rewritten, the call to SimplifyAsAndOfOrs should apply to
+    the outermost AndNode or OrNode in order to enable better
+    simplification.
+    """
+
+    convert_boolean_to_and_of_ors = True
+
+    def before(A: T.Buffer[4, "bool"]):
+        T.evaluate((A[0] or A[1]) and (A[1] or (A[0] and A[2] and A[3])))
+
+    def expected(A: T.Buffer[4, "bool"]):
+        # If the simplification is applied to the OrNode, then a
+        # redundant `(A[1] or A[0])` would't be canceled out.  When
+        # applying SimplifyAsAndOfOrs to the top-level AndNode, the
+        # internal representation is `[[0,1], [1,0], [1,2], [1,3]]`, and
+        # the redundant `[1,0]` can be removed.
+        #
+        # If the simplification were only applied when encountering an
+        # OrNode, the internal representation would be `[[0,1]]` during
+        # the first call and `[[1,0], [1,2], [1,3]]` during the second
+        # call.  As a result, the `[0,1]` and `[1,0]` representations
+        # wouldn't occur within the same call, and the redundant `[1,0]`
+        # wouldn't be removed.
+        T.evaluate((A[0] or A[1]) and (A[1] or A[2]) and (A[1] or A[3]))
+
+
+class TestRewriteAsAndOfOrsWithSimplificationBetweenGroups(BaseBeforeAfter):
+    """Apply rewrite rules between OR groups that differ by a single element
+
+    The expression `(k==20 and k!=30)` could be rewritten into `(k==20)`.
+    However, by default these two terms must appear as part of an explict part
+    of the simplified expression.  The AndOfOr simplification checks for
+    rewrite patterns of the form `(A or B) and (A or C)`, where `(B and C)` can
+    simplify to a single expression `D`.  These can be rewritten to `(A or D)`.
+    """
+
+    convert_boolean_to_and_of_ors = True
+
+    def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
+        A[0] = (i == 0 or j == 10 or k == 20) and (i == 0 or j == 10 or k != 30)
+
+    def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
+        A[0] = i == 0 or j == 10 or k == 20
+
+
+class TestRewriteAsAndOfOrsWithSimplificationBetweenReorderedGroups(BaseBeforeAfter):
+    """Rewrite rules between OR groups do not depend on order
+
+    Like TestRewriteAsAndOfOrsWithSimplificationBetweenGroups, but the groups
+    are ordered differently.  If this removes a group entirely, the result is
+    ordered according to the first group in the expression.
+    """
+
+    convert_boolean_to_and_of_ors = True
+
+    def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
+        A[0] = (i == 0 or j == 10 or k == 20) and (j == 10 or k != 30 or i == 0)
+
+    def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
+        A[0] = i == 0 or j == 10 or k == 20
+
+
+class TestRewriteAsAndOfOrUsingSimplificationAcrossAnd(BaseBeforeAfter):
+    """Apply AndNode rewrites to non-adjacent expressions
+
+    The RewriteSimplifier rules only check for simplifications between
+    left/right branches of an And/Or node.  Simplifications that would require
+    rearranging components in a chain of And/Or nodes are not performed.
+    """
+
+    convert_boolean_to_and_of_ors = True
+
+    def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
+        A[0] = (k == 20) and ((i == 0 or j == 10) and (k != 30))
+
+    def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
+        A[0] = (k == 20) and (i == 0 or j == 10)
+
+
+class TestRewriteAsAndOfOrUsingSimplificationWithinOr(BaseBeforeAfter):
+    """Rewrite rules between OR groups do not depend on order
+
+    The RewriteSimplifier rules only check for simplifications between
+    left/right branches of an And/Or node.  Simplifications that would require
+    rearranging components in a chain of And/Or nodes are not performed.
+
+    This test validates that `(i == 20) or (i != 30)` can be rewritten to
+    `(i != 30)`, even when there's an intervening clause between the
+    clauses being simplified.
+    """
+
+    convert_boolean_to_and_of_ors = True
+
+    def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
+        A[0] = (i == 20) or (j == 0) or (i != 30)
+
+    def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
+        A[0] = (i != 30) or (j == 0)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From b389d4dac45bb1fd502940d87126d0a89e15188e Mon Sep 17 00:00:00 2001
From: Yuanjing Shi <yuanjing@octoml.ai>
Date: Thu, 13 Oct 2022 14:16:29 -0700
Subject: [PATCH 350/704] [Torch] Fix torch contrib issues (#13061)

* fix torch contrib

* adding warning for USE_PT_TVMDSOOP flag

* fix import

* Fix broken as torch

* cleanup

Co-authored-by: Lite Ye <yelite958@gmail.com>
---
 apps/pt_tvmdsoop/tests/test_as_torch.py    | 20 +++++++++-----------
 python/tvm/contrib/torch/as_torch.py       |  7 +++++--
 python/tvm/contrib/torch/optimize_torch.py |  4 +++-
 python/tvm/meta_schedule/__init__.py       |  1 +
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/apps/pt_tvmdsoop/tests/test_as_torch.py b/apps/pt_tvmdsoop/tests/test_as_torch.py
index a13d669e7f36..0243e86edebd 100644
--- a/apps/pt_tvmdsoop/tests/test_as_torch.py
+++ b/apps/pt_tvmdsoop/tests/test_as_torch.py
@@ -25,7 +25,6 @@
 import torch.nn
 
 import tvm
-from tvm.meta_schedule.tune import TuneConfig
 from tvm.target.target import Target
 import tvm.testing
 from tvm.contrib.torch import as_torch
@@ -86,14 +85,6 @@ def func_with_part_access_region(a: T.handle, b: T.handle, c: T.handle) -> None:
                 C[vi, vj] = B[vi, vj] + T.float32(1)
 
 
-config = TuneConfig(
-    strategy="replay_trace",
-    num_trials_per_iter=128,
-    max_trials_per_task=128,
-    max_trials_global=128,
-)
-
-
 @as_torch
 @tvm.script.ir_module
 class MyModule:
@@ -232,7 +223,11 @@ def test_tvmscript_torch_loop_split():
 
     result = torch.sum(x.cpu(), dim=1).numpy()
 
-    loop_split.tune(config, Target("nvidia/geforce-rtx-3070"))
+    loop_split.tune(
+        "nvidia/geforce-rtx-3070",
+        max_trials_global=128,
+        strategy="replay-trace",
+    )
     loop_split(x, y)
 
     tvm.testing.assert_allclose(y.cpu().numpy(), result, atol=1e-5, rtol=1e-5)
@@ -246,7 +241,10 @@ def test_tvmscript_torch_elementwise_with_root():
     result = a1 + 2
 
     func = elementwise_with_root(128, 128, "float32")
-    func.tune(config)
+    func.tune(
+        max_trials_global=128,
+        strategy="replay-trace",
+    )
     func(a1, a2, a3)
 
     tvm.testing.assert_allclose(a3.numpy(), result.numpy(), atol=1e-5, rtol=1e-5)
diff --git a/python/tvm/contrib/torch/as_torch.py b/python/tvm/contrib/torch/as_torch.py
index 2412519ea9c5..918ce3ff3b6a 100644
--- a/python/tvm/contrib/torch/as_torch.py
+++ b/python/tvm/contrib/torch/as_torch.py
@@ -65,7 +65,7 @@ def tune(
         measure_callbacks: ms.MeasureCallback.CallbackListType = "default",
         task_scheduler: ms.TaskScheduler.TaskSchedulerType = "round-robin",
         space: ms.SpaceGenerator.SpaceGeneratorType = "post-order-apply",
-        strategy: ms.SearchStrategy.SearchStrategyType = "replay_trace",
+        strategy: ms.SearchStrategy.SearchStrategyType = "replay-trace",
         task_name: str = "main",
         num_threads: Union[Literal["physical", "logical"], int] = "physical",
         seed: Optional[int] = None,
@@ -112,7 +112,10 @@ def script(self):
 
     def build(self, target=None):
         runtime_module = tvm.build(self.ir_module, target=target)
-        func = tvm.get_global_func("tvmtorch.save_runtime_mod")
+        func = tvm.get_global_func("tvmtorch.save_runtime_mod", allow_missing=True)
+
+        if func is None:
+            raise ValueError('as_torch requires the flag /"USE_PT_TVMDSOOP/" set in config.cmake')
         func(runtime_module)
 
         self.rt_module = torch.classes.tvm_torch.OperatorModuleWrapper()
diff --git a/python/tvm/contrib/torch/optimize_torch.py b/python/tvm/contrib/torch/optimize_torch.py
index 347ea89f92ee..cbba590e85dc 100644
--- a/python/tvm/contrib/torch/optimize_torch.py
+++ b/python/tvm/contrib/torch/optimize_torch.py
@@ -167,7 +167,9 @@ def optimize_torch(
             backend="graph",
         )
 
-    save_runtime_mod = get_global_func("tvmtorch.save_runtime_mod")
+    save_runtime_mod = get_global_func("tvmtorch.save_runtime_mod", allow_missing=True)
+    if save_runtime_mod is None:
+        raise ValueError('optimize_torch requires the flag /"USE_PT_TVMDSOOP/" set in config.cmake')
     save_runtime_mod(executor_factory.module)
 
     return GraphExecutorFactoryWrapper(torch.classes.tvm_torch.GraphExecutorFactoryWrapper())
diff --git a/python/tvm/meta_schedule/__init__.py b/python/tvm/meta_schedule/__init__.py
index c92ed47d8a2a..04acdc9d4a75 100644
--- a/python/tvm/meta_schedule/__init__.py
+++ b/python/tvm/meta_schedule/__init__.py
@@ -48,6 +48,7 @@
 from .schedule_rule import ScheduleRule
 from .search_strategy import MeasureCandidate, SearchStrategy
 from .space_generator import SpaceGenerator
+from .task_scheduler import TaskScheduler
 from .tir_integration import tune_tir
 from .tune import tune_tasks
 from .tune_context import TuneContext

From f232272e75350a9b41e76155f72da2cc627a8e03 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Fri, 14 Oct 2022 09:32:51 -0700
Subject: [PATCH 351/704] [TIR] Refactor NarrowDataType into DataTypeLegalizer
 (#13049)

---
 include/tvm/tir/stmt_functor.h        |  51 +++++++
 src/tir/ir/data_type_rewriter.cc      | 195 ++++++++++++++++++++++++++
 src/tir/transforms/narrow_datatype.cc | 154 ++------------------
 tests/cpp/data_type_rewriter_test.cc  | 140 ++++++++++++++++++
 4 files changed, 398 insertions(+), 142 deletions(-)
 create mode 100644 src/tir/ir/data_type_rewriter.cc
 create mode 100644 tests/cpp/data_type_rewriter_test.cc

diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index 17530380e665..60973577ac92 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -459,6 +459,57 @@ bool ContainsNode(const Stmt& stmt) {
   return visitor.contains_node;
 }
 
+/*!
+ * \brief Legalize the data types of expressions to make sure they are consistent with other
+ * parts of the program.
+ *
+ * It enforces the following rules:
+ * - The data type of the index variable in a loop must be consistent with the data type of the loop
+ *  bounds.
+ * - The data type of the binary and ternary expressions must be consistent with the data types of
+ * each of their operands.
+ * - The data type of the bounds and binding values of block iter vars must be consistent with the
+ * data type of the block iter vars.
+ *
+ * Usually we enforce the consistency of data types when constructing the IR nodes. However, such
+ * inconsistency may happen as a result of IR mutation in some passes. This class can be used as
+ * base class of such passes to ensure the consistency of data types.
+ */
+class DataTypeLegalizer : public StmtExprMutator {
+ public:
+  Stmt VisitStmt_(const ForNode* op) override;
+
+  Stmt VisitStmt_(const AttrStmtNode* op) override;
+  Stmt VisitStmt_(const BlockRealizeNode* op) override;
+  Stmt VisitStmt_(const BlockNode* op) override;
+  PrimExpr VisitExpr_(const SelectNode* op) override;
+  PrimExpr VisitExpr_(const RampNode* op) override;
+  PrimExpr VisitExpr_(const AddNode* op) override;
+  PrimExpr VisitExpr_(const SubNode* op) override;
+  PrimExpr VisitExpr_(const MulNode* op) override;
+  PrimExpr VisitExpr_(const DivNode* op) override;
+  PrimExpr VisitExpr_(const ModNode* op) override;
+  PrimExpr VisitExpr_(const FloorDivNode* op) override;
+  PrimExpr VisitExpr_(const FloorModNode* op) override;
+  PrimExpr VisitExpr_(const MinNode* op) override;
+  PrimExpr VisitExpr_(const MaxNode* op) override;
+  PrimExpr VisitExpr_(const EQNode* op) override;
+  PrimExpr VisitExpr_(const NENode* op) override;
+  PrimExpr VisitExpr_(const LTNode* op) override;
+  PrimExpr VisitExpr_(const LENode* op) override;
+  PrimExpr VisitExpr_(const GTNode* op) override;
+  PrimExpr VisitExpr_(const GENode* op) override;
+  PrimExpr VisitExpr_(const CallNode* op) override;
+
+  using StmtExprMutator::VisitExpr_;
+  using StmtExprMutator::VisitStmt_;
+
+ protected:
+  // a map from IterVar before rewrite to that after rewrite,
+  // ensures one old IterVar maps to exactly one new IterVar
+  std::unordered_map<const IterVarNode*, IterVar> ivmap_;
+};
+
 }  // namespace tir
 }  // namespace tvm
 
diff --git a/src/tir/ir/data_type_rewriter.cc b/src/tir/ir/data_type_rewriter.cc
new file mode 100644
index 000000000000..afa28d92589f
--- /dev/null
+++ b/src/tir/ir/data_type_rewriter.cc
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file data_type_rewriter.cc
+ * \brief Rewrite the data type of expressions.
+ */
+
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/op.h>
+#include <tvm/tir/stmt_functor.h>
+
+#include "./functor_common.h"
+
+namespace tvm {
+namespace tir {
+
+Stmt DataTypeLegalizer::VisitStmt_(const ForNode* op) {
+  Stmt s = StmtExprMutator::VisitStmt_(op);
+  op = s.as<ForNode>();
+  ICHECK(op != nullptr) << "Expected type to be ForNode, but get " << s->GetTypeKey();
+  PrimExpr e = VisitExpr(op->loop_var);
+  Var var = Downcast<Var>(e);
+  return For(var, cast(var.dtype(), op->min), cast(var.dtype(), op->extent), op->kind, op->body,
+             op->thread_binding, op->annotations);
+}
+
+Stmt DataTypeLegalizer::VisitStmt_(const BlockRealizeNode* op) {
+  BlockRealize realize = Downcast<BlockRealize>(StmtExprMutator::VisitStmt_(op));
+  Array<PrimExpr> new_iter_values;
+  bool changed = false;
+  for (int i = 0; i < static_cast<int>(op->iter_values.size()); ++i) {
+    auto dtype = realize->block->iter_vars[i]->var->dtype;
+    if (op->iter_values[i]->dtype != dtype) {
+      new_iter_values.push_back(cast(dtype, realize->iter_values[i]));
+      changed = true;
+    } else {
+      new_iter_values.push_back(realize->iter_values[i]);
+    }
+  }
+  if (changed) {
+    realize.CopyOnWrite()->iter_values = std::move(new_iter_values);
+  }
+  return std::move(realize);
+}
+
+Stmt DataTypeLegalizer::VisitStmt_(const BlockNode* op) {
+  Block new_block = Downcast<Block>(StmtExprMutator::VisitStmt_(op));
+  Array<IterVar> new_iter_vars = MutateArray(new_block->iter_vars, [this](const IterVar& iter) {
+    auto dtype = iter->var.dtype();
+    if (iter->dom->min->dtype != dtype || iter->dom->extent->dtype != dtype) {
+      IterVar new_iter = iter;
+      new_iter.CopyOnWrite()->dom =
+          Range(cast(dtype, iter->dom->min), cast(dtype, iter->dom->extent));
+      return new_iter;
+    } else {
+      return iter;
+    }
+  });
+  if (!op->iter_vars.same_as(new_iter_vars)) {
+    new_block.CopyOnWrite()->iter_vars = std::move(new_iter_vars);
+  }
+  return std::move(new_block);
+}
+
+Stmt DataTypeLegalizer::VisitStmt_(const AttrStmtNode* op) {
+  if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) {
+    Stmt s = StmtExprMutator::VisitStmt_(op);
+    op = s.as<AttrStmtNode>();
+    ICHECK(op != nullptr) << "Expected type to be AttrStmtNode"
+                          << ", but get " << s->GetTypeKey();
+    const IterVarNode* iv = op->node.as<IterVarNode>();
+    ICHECK(iv != nullptr) << "Expected type to be IterVarNode"
+                          << ", but get " << op->node->GetTypeKey();
+    PrimExpr e = VisitExpr(iv->var);
+    Var var = Downcast<Var>(e);
+    if (ivmap_.find(iv) == ivmap_.end()) {
+      Range dom = iv->dom;
+      if (dom.defined()) {
+        PrimExpr extend = dom->extent;
+        ICHECK(extend.dtype().is_int() && var.dtype().is_int());
+        if (var.dtype().bits() != extend.dtype().bits()) {
+          DataType dtype = var.dtype();
+          dom = Range(cast(dtype, dom->min), cast(dtype, extend), dom->span);
+        }
+      }
+      ivmap_[iv] = IterVar(dom, var, iv->iter_type, iv->thread_tag);
+    }
+    return AttrStmt(ivmap_[iv], op->attr_key, cast(var.dtype(), op->value), op->body);
+  }
+  return StmtExprMutator::VisitStmt_(op);
+}
+
+PrimExpr DataTypeLegalizer::VisitExpr_(const SelectNode* op) {
+  PrimExpr condition = this->VisitExpr(op->condition);
+  PrimExpr true_value = this->VisitExpr(op->true_value);
+  PrimExpr false_value = this->VisitExpr(op->false_value);
+  if (condition.same_as(op->condition) && true_value.same_as(op->true_value) &&
+      false_value.same_as(op->false_value) && true_value.dtype() == false_value.dtype()) {
+    return GetRef<PrimExpr>(op);
+  } else {
+    int bits = std::max(true_value.dtype().bits(), false_value.dtype().bits());
+    DataType dtype = true_value.dtype().with_bits(bits);
+    if (true_value.dtype() != dtype) true_value = cast(dtype, true_value);
+    if (false_value.dtype() != dtype) false_value = cast(dtype, false_value);
+    return Select(condition, true_value, false_value);
+  }
+}
+
+PrimExpr DataTypeLegalizer::VisitExpr_(const RampNode* op) {
+  PrimExpr base = VisitExpr(op->base);
+  PrimExpr stride = VisitExpr(op->stride);
+  if (base.same_as(op->base) && stride.same_as(op->stride) && base.dtype() == stride.dtype()) {
+    return GetRef<PrimExpr>(op);
+  } else {
+    ICHECK(base.dtype().is_int() && stride.dtype().is_int());
+    int bits = std::max(base.dtype().bits(), stride.dtype().bits());
+    DataType dtype = base.dtype().with_bits(bits);
+    if (base.dtype() != dtype) base = cast(dtype, base);
+    if (stride.dtype() != dtype) stride = cast(dtype, stride);
+    return Ramp(base, stride, op->lanes);
+  }
+}
+
+#define DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)                 \
+  PrimExpr DataTypeLegalizer::VisitExpr_(const OP* op) {                  \
+    PrimExpr a = this->VisitExpr(op->a);                                  \
+    PrimExpr b = this->VisitExpr(op->b);                                  \
+    if (op->a.same_as(a) && op->b.same_as(b) && a.dtype() == b.dtype()) { \
+      return GetRef<PrimExpr>(op);                                        \
+    } else {                                                              \
+      return FUNC(a, b);                                                  \
+    }                                                                     \
+  }
+
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(AddNode, operator+);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(SubNode, operator-);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MulNode, operator*);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(DivNode, div);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(ModNode, truncmod);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(FloorDivNode, floordiv);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(FloorModNode, floormod);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MinNode, min);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MaxNode, max);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(EQNode, operator==);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(NENode, operator!=);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(LENode, operator<=);
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(LTNode, operator<);  // NOLINT(*)
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(GTNode, operator>);  // NOLINT(*)
+DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(GENode, operator>=);
+
+#undef DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH
+
+PrimExpr DataTypeLegalizer::VisitExpr_(const CallNode* op) {
+  PrimExpr e = StmtExprMutator::VisitExpr_(op);
+  op = e.as<CallNode>();
+  static const Op& builtin_pow_ = Op::Get("tir.pow");
+  ICHECK(op != nullptr) << "Expected type to be CallNode"
+                        << ", but get " << e->GetTypeKey();
+  if (op->op.same_as(builtin::shift_right())) {
+    return op->args[0] >> op->args[1];
+  } else if (op->op.same_as(builtin::shift_left())) {
+    return op->args[0] << op->args[1];
+  } else if (op->op.same_as(builtin::bitwise_and())) {
+    return op->args[0] & op->args[1];
+  } else if (op->op.same_as(builtin::bitwise_or())) {
+    return op->args[0] | op->args[1];
+  } else if (op->op.same_as(builtin::bitwise_xor())) {
+    return op->args[0] ^ op->args[1];
+  } else if (op->op.same_as(builtin_pow_)) {
+    return pow(op->args[0], op->args[1]);
+  } else if (op->op.same_as(builtin::if_then_else())) {
+    return if_then_else(op->args[0], op->args[1], op->args[2]);
+  }
+  return e;
+}
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/transforms/narrow_datatype.cc b/src/tir/transforms/narrow_datatype.cc
index 047295180712..7f9c76f5257d 100644
--- a/src/tir/transforms/narrow_datatype.cc
+++ b/src/tir/transforms/narrow_datatype.cc
@@ -187,7 +187,9 @@ class DataTypeVisitor final : public StmtExprVisitor {
   arith::ConstIntBoundAnalyzer::BoundMapType bound_;
 };
 
-class DataTypeRewriter : public StmtExprMutator {
+class DataTypeRewriter : public DataTypeLegalizer {
+  using Parent = DataTypeLegalizer;
+
  public:
   explicit DataTypeRewriter(int target_bits) : visitor_(target_bits) {}
 
@@ -253,19 +255,8 @@ class DataTypeRewriter : public StmtExprMutator {
     return indices;
   }
 
-  Stmt VisitStmt_(const ForNode* op) final {
-    Stmt s = StmtExprMutator::VisitStmt_(op);
-    op = s.as<ForNode>();
-    ICHECK(op != nullptr) << "Expected type to be ForNode"
-                          << ", but get " << s->GetTypeKey();
-    PrimExpr e = VisitExpr(op->loop_var);
-    Var var = Downcast<Var>(e);
-    return For(var, cast(var.dtype(), op->min), cast(var.dtype(), op->extent), op->kind, op->body,
-               op->thread_binding, op->annotations);
-  }
-
   Stmt VisitStmt_(const IfThenElseNode* op) final {
-    IfThenElse updated = Downcast<IfThenElse>(StmtExprMutator::VisitStmt_(op));
+    IfThenElse updated = Downcast<IfThenElse>(Parent::VisitStmt_(op));
     is_condition_ = true;
     PrimExpr cond = VisitExpr(op->condition);
     is_condition_ = false;
@@ -275,34 +266,6 @@ class DataTypeRewriter : public StmtExprMutator {
     return std::move(updated);
   }
 
-  Stmt VisitStmt_(const AttrStmtNode* op) final {
-    if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) {
-      Stmt s = StmtExprMutator::VisitStmt_(op);
-      op = s.as<AttrStmtNode>();
-      ICHECK(op != nullptr) << "Expected type to be AttrStmtNode"
-                            << ", but get " << s->GetTypeKey();
-      const IterVarNode* iv = op->node.as<IterVarNode>();
-      ICHECK(iv != nullptr) << "Expected type to be IterVarNode"
-                            << ", but get " << op->node->GetTypeKey();
-      PrimExpr e = VisitExpr(iv->var);
-      Var var = Downcast<Var>(e);
-      if (ivmap_.find(iv) == ivmap_.end()) {
-        Range dom = iv->dom;
-        if (dom.defined()) {
-          PrimExpr extend = dom->extent;
-          if (extend.dtype().is_int() && var.dtype().is_int() &&
-              var.dtype().bits() != extend.dtype().bits()) {
-            DataType dtype = var.dtype();
-            dom = Range(cast(dtype, dom->min), cast(dtype, extend), dom->span);
-          }
-        }
-        ivmap_[iv] = IterVar(dom, var, iv->iter_type, iv->thread_tag);
-      }
-      return AttrStmt(ivmap_[iv], op->attr_key, cast(var.dtype(), op->value), op->body);
-    }
-    return StmtExprMutator::VisitStmt_(op);
-  }
-
   PrimExpr VisitExpr_(const VarNode* op) final {
     if (visitor_.vmap.find(op) != visitor_.vmap.end()) {
       if (vmap_.find(op) == vmap_.end()) {
@@ -310,42 +273,7 @@ class DataTypeRewriter : public StmtExprMutator {
       }
       return vmap_[op];
     }
-    return StmtExprMutator::VisitExpr_(op);
-  }
-
-  PrimExpr VisitExpr_(const SelectNode* op) final {
-    PrimExpr condition = this->VisitExpr(op->condition);
-    PrimExpr true_value = this->VisitExpr(op->true_value);
-    PrimExpr false_value = this->VisitExpr(op->false_value);
-    if (condition.same_as(op->condition) && true_value.same_as(op->true_value) &&
-        false_value.same_as(op->false_value)) {
-      return GetRef<PrimExpr>(op);
-    } else {
-      if (op->true_value.dtype().is_int() && op->false_value.dtype().is_int()) {
-        int bits = std::max(true_value.dtype().bits(), false_value.dtype().bits());
-        DataType dtype = true_value.dtype().with_bits(bits);
-        if (true_value.dtype() != dtype) true_value = cast(dtype, true_value);
-        if (false_value.dtype() != dtype) false_value = cast(dtype, false_value);
-      }
-      return Select(condition, true_value, false_value);
-    }
-  }
-
-  PrimExpr VisitExpr_(const RampNode* op) final {
-    PrimExpr base = VisitExpr(op->base);
-    PrimExpr stride = VisitExpr(op->stride);
-    if (base.same_as(op->base) && stride.same_as(op->stride)) {
-      return GetRef<PrimExpr>(op);
-    } else {
-      if (base.dtype().is_int()) {
-        ICHECK(stride.dtype().is_int()) << "Ramp base is int but stride is " << stride.dtype();
-        int bits = std::max(base.dtype().bits(), stride.dtype().bits());
-        DataType dtype = base.dtype().with_bits(bits);
-        if (base.dtype() != dtype) base = cast(dtype, base);
-        if (stride.dtype() != dtype) stride = cast(dtype, stride);
-      }
-      return Ramp(base, stride, op->lanes);
-    }
+    return Parent::VisitExpr_(op);
   }
 
   PrimExpr VisitExpr_(const SizeVarNode* op) final {
@@ -355,7 +283,7 @@ class DataTypeRewriter : public StmtExprMutator {
       }
       return vmap_[op];
     }
-    return StmtExprMutator::VisitExpr_(op);
+    return Parent::VisitExpr_(op);
   }
 
   PrimExpr VisitExpr_(const IntImmNode* op) final {
@@ -364,29 +292,20 @@ class DataTypeRewriter : public StmtExprMutator {
         return IntImm(visitor_.vmap[op], op->value);
       }
     }
-    return StmtExprMutator::VisitExpr_(op);
+    return Parent::VisitExpr_(op);
   }
 
   PrimExpr VisitExpr_(const CastNode* op) final {
     if (is_index_ && visitor_.vmap.find(op) != visitor_.vmap.end()) {
-      PrimExpr e = StmtExprMutator::VisitExpr_(op);
+      PrimExpr e = Parent::VisitExpr_(op);
       const CastNode* new_op = e.as<CastNode>();
       ICHECK(new_op != nullptr) << "Expected type to be CastNode"
                                 << ", but get " << e->GetTypeKey();
       return Cast(visitor_.vmap[op], new_op->value);
     }
-    return StmtExprMutator::VisitExpr_(op);
+    return Parent::VisitExpr_(op);
   }
 
-  PrimExpr VisitExpr_(const AddNode* op) final;
-  PrimExpr VisitExpr_(const SubNode* op) final;
-  PrimExpr VisitExpr_(const MulNode* op) final;
-  PrimExpr VisitExpr_(const DivNode* op) final;
-  PrimExpr VisitExpr_(const ModNode* op) final;
-  PrimExpr VisitExpr_(const FloorDivNode* op) final;
-  PrimExpr VisitExpr_(const FloorModNode* op) final;
-  PrimExpr VisitExpr_(const MinNode* op) final;
-  PrimExpr VisitExpr_(const MaxNode* op) final;
   PrimExpr VisitExpr_(const EQNode* op) final;
   PrimExpr VisitExpr_(const NENode* op) final;
   PrimExpr VisitExpr_(const LTNode* op) final;
@@ -401,28 +320,12 @@ class DataTypeRewriter : public StmtExprMutator {
   // a map from Var before rewrite to that after rewrite,
   // ensures one old Var maps to exactly one new Var
   std::unordered_map<const VarNode*, Var> vmap_;
-  // a map from IterVar before rewrite to that after rewrite,
-  // ensures one old IterVar maps to exactly one new IterVar
-  std::unordered_map<const IterVarNode*, IterVar> ivmap_;
   // indicator of index expr to rewrite
   bool is_index_{false};
   // indicator of condition
   bool is_condition_{false};
-  // cached ops
-  const Op& builtin_pow_ = Op::Get("tir.pow");
 };
 
-#define DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC) \
-  PrimExpr DataTypeRewriter::VisitExpr_(const OP* op) {   \
-    PrimExpr a = this->VisitExpr(op->a);                  \
-    PrimExpr b = this->VisitExpr(op->b);                  \
-    if (a.same_as(op->a) && b.same_as(op->b)) {           \
-      return GetRef<PrimExpr>(op);                        \
-    } else {                                              \
-      return FUNC(a, b);                                  \
-    }                                                     \
-  }
-
 #define DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)                          \
   PrimExpr DataTypeRewriter::VisitExpr_(const OP* op) {                             \
     bool is_index = is_index_;                                                      \
@@ -430,25 +333,11 @@ class DataTypeRewriter : public StmtExprMutator {
     if (rewrite) {                                                                  \
       is_index_ = true;                                                             \
     }                                                                               \
-    PrimExpr a = this->VisitExpr(op->a);                                            \
-    PrimExpr b = this->VisitExpr(op->b);                                            \
+    auto result = Parent::VisitExpr_(op);                                           \
     is_index_ = is_index;                                                           \
-    if (a.same_as(op->a) && b.same_as(op->b)) {                                     \
-      return GetRef<PrimExpr>(op);                                                  \
-    } else {                                                                        \
-      return FUNC(a, b);                                                            \
-    }                                                                               \
+    return std::move(result);                                                       \
   }
 
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(AddNode, operator+);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(SubNode, operator-);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MulNode, operator*);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(DivNode, div);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(ModNode, truncmod);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(FloorDivNode, floordiv);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(FloorModNode, floormod);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MinNode, min);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MaxNode, max);
 DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(EQNode, operator==);
 DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(NENode, operator!=);
 DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(LENode, operator<=);
@@ -465,26 +354,7 @@ PrimExpr DataTypeRewriter::VisitExpr_(const CallNode* op) {
     is_condition_ = is_condition;
     return if_then_else(cond, VisitExpr(op->args[1]), VisitExpr(op->args[2]));
   }
-
-  PrimExpr e = StmtExprMutator::VisitExpr_(op);
-  op = e.as<CallNode>();
-  ICHECK(op != nullptr) << "Expected type to be CallNode"
-                        << ", but get " << e->GetTypeKey();
-  if (op->op.same_as(builtin::shift_right())) {
-    return op->args[0] >> op->args[1];
-  } else if (op->op.same_as(builtin::shift_left())) {
-    return op->args[0] << op->args[1];
-  } else if (op->op.same_as(builtin::bitwise_and())) {
-    return op->args[0] & op->args[1];
-  } else if (op->op.same_as(builtin::bitwise_or())) {
-    return op->args[0] | op->args[1];
-  } else if (op->op.same_as(builtin::bitwise_xor())) {
-    return op->args[0] ^ op->args[1];
-  } else if (op->op.same_as(builtin_pow_)) {
-    return pow(op->args[0], op->args[1]);
-  }
-
-  return e;
+  return Parent::VisitExpr_(op);
 }
 
 Stmt NarrowDataType(Stmt stmt, int target_bits) { return DataTypeRewriter(target_bits)(stmt); }
diff --git a/tests/cpp/data_type_rewriter_test.cc b/tests/cpp/data_type_rewriter_test.cc
new file mode 100644
index 000000000000..d1ac9d782ce5
--- /dev/null
+++ b/tests/cpp/data_type_rewriter_test.cc
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/op.h>
+#include <tvm/tir/stmt_functor.h>
+
+using namespace tvm;
+using namespace tvm::tir;
+using namespace tvm::runtime;
+
+using BinaryOpTypes =
+    ::testing::Types<Add, Sub, Mul, Div, Mod, FloorDiv, FloorMod, Min, Max, EQ, NE, LT, LE, GT, GE>;
+
+template <typename T>
+class DataTypeLegalizerBinaryOp : public ::testing::Test {};
+
+TYPED_TEST_SUITE(DataTypeLegalizerBinaryOp, BinaryOpTypes);
+
+TYPED_TEST(DataTypeLegalizerBinaryOp, Basic) {
+  using RefType = TypeParam;
+  using NodeType = typename RefType::ContainerType;
+  auto node = make_object<NodeType>();
+  node->a = Var("a", DataType::Int(32));
+  node->b = IntImm(DataType::Int(64), 2);
+  DataTypeLegalizer legalizer;
+  auto new_expr = Downcast<RefType>(legalizer(RefType(node)));
+  auto target_dtype = DataType::Int(64);
+  ASSERT_EQ(new_expr->a.dtype(), target_dtype);
+  ASSERT_EQ(new_expr->b.dtype(), target_dtype);
+}
+
+TEST(DataTypeLegalizer, Select) {
+  auto node = make_object<SelectNode>();
+  node->condition = Var("cond", DataType::Bool());
+  node->true_value = Var("a", DataType::Int(64));
+  node->false_value = IntImm(DataType::Int(32), 2);
+  DataTypeLegalizer legalizer;
+  Select new_select = Downcast<Select>(legalizer(Select(node)));
+  auto target_dtype = DataType::Int(64);
+  ASSERT_EQ(new_select->true_value.dtype(), target_dtype);
+  ASSERT_EQ(new_select->false_value.dtype(), target_dtype);
+  ASSERT_EQ(new_select.dtype(), target_dtype);
+  ASSERT_EQ(new_select->condition.dtype(), node->condition.dtype());
+}
+TEST(DataTypeLegalizer, IfThenElse) {
+  auto cond = Var("cond", DataType::Bool());
+  PrimExpr call = Call(DataType::Int(32), builtin::if_then_else(),
+                       {cond, Var("a", DataType::Int(64)), IntImm(DataType::Int(32), 2)});
+  DataTypeLegalizer legalizer;
+  Call new_call = Downcast<Call>(legalizer(call));
+  auto target_dtype = DataType::Int(64);
+  ASSERT_EQ(new_call->args[1].dtype(), target_dtype);
+  ASSERT_EQ(new_call->args[2].dtype(), target_dtype);
+  ASSERT_EQ(new_call->dtype, target_dtype);
+}
+
+TEST(DataTypeLegalizer, Block) {
+  auto block_node = make_object<BlockNode>();
+  auto iter_var_node = make_object<IterVarNode>();
+  iter_var_node->var = Var("i", DataType::Int(32));
+  iter_var_node->dom =
+      Range::FromMinExtent(IntImm(DataType::Int(64), 0), IntImm(DataType::Int(64), 10));
+  iter_var_node->iter_type = IterVarType::kDataPar;
+  block_node->iter_vars = {IterVar(iter_var_node)};
+  block_node->reads = {};
+  block_node->writes = {};
+  block_node->name_hint = "block";
+  block_node->body = Evaluate(Integer(0));
+  auto block_realize_node = make_object<BlockRealizeNode>();
+  auto loop_var = Var("i", DataType::Int(32));
+  block_realize_node->iter_values = {loop_var};
+  block_realize_node->predicate = const_true();
+  block_realize_node->block = Block(block_node);
+  auto for_node = make_object<ForNode>();
+  for_node->loop_var = loop_var;
+  for_node->min = IntImm(DataType::Int(64), 0);
+  for_node->extent = IntImm(DataType::Int(64), 10);
+  for_node->kind = ForKind::kSerial;
+  for_node->body = BlockRealize(block_realize_node);
+  Stmt stmt = For(for_node);
+
+  DataTypeLegalizer legalizer;
+  DataType target_dtype = loop_var->dtype;
+  Stmt new_stmt = legalizer(stmt);
+  const ForNode* new_for = new_stmt.as<ForNode>();
+  ASSERT_EQ(new_for->loop_var.dtype(), target_dtype);
+  ASSERT_EQ(new_for->min.dtype(), target_dtype);
+  ASSERT_EQ(new_for->extent.dtype(), target_dtype);
+  const BlockRealizeNode* new_block_realize = new_for->body.as<BlockRealizeNode>();
+  ASSERT_EQ(new_block_realize->iter_values[0].dtype(), target_dtype);
+  const BlockNode* new_block = new_block_realize->block.as<BlockNode>();
+  ASSERT_EQ(new_block->iter_vars[0]->dom->min.dtype(), target_dtype);
+  ASSERT_EQ(new_block->iter_vars[0]->dom->extent.dtype(), target_dtype);
+  ASSERT_EQ(new_block->iter_vars[0]->var.dtype(), target_dtype);
+}
+
+TEST(DataTypeLegalizer, For) {
+  auto node = make_object<ForNode>();
+  node->body = Evaluate(Integer(0));
+  node->loop_var = Var("i", DataType::Int(32));
+  node->min = IntImm(DataType::Int(64), 0);
+  node->extent = IntImm(DataType::Int(64), 10);
+  DataTypeLegalizer legalizer;
+  For new_for = Downcast<For>(legalizer(For(node)));
+  ASSERT_EQ(new_for->min.dtype(), DataType::Int(32));
+  ASSERT_EQ(new_for->extent.dtype(), DataType::Int(32));
+  ASSERT_EQ(new_for->loop_var.dtype(), DataType::Int(32));
+}
+
+TEST(DataTypeLegalizer, Ramp) {
+  auto node = make_object<RampNode>();
+  node->base = IntImm(DataType::Int(64), 0);
+  node->stride = IntImm(DataType::Int(32), 1);
+  int lanes = 4;
+  node->lanes = lanes;
+  DataTypeLegalizer legalizer;
+  Ramp new_ramp = Downcast<Ramp>(legalizer(Ramp(node)));
+  DataType target_dtype = DataType::Int(64);
+  ASSERT_EQ(new_ramp->base.dtype(), target_dtype);
+  ASSERT_EQ(new_ramp->stride.dtype(), target_dtype);
+  ASSERT_EQ(new_ramp->dtype, target_dtype.with_lanes(lanes));
+}

From 493458e552bd6b0cb29e7e453dc3ee4cd649ad57 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 14 Oct 2022 13:04:06 -0500
Subject: [PATCH 352/704] [TE] Raise error for non-bijective transformation
 (#12926)

This is a fix for a bug introduced in
https://github.com/apache/tvm/pull/12904.  Prior to then, an
exception was raised when the transformation wouldn't be bijective
over the transformed buffer's shape.  The PR replaced the bijective
check done as part of `DetectIterMap` with a check done on the
returned `padding_predicate`.  However, this check was not equivalent,
and some transformations could erroneously apply, rather than
raising an exception as being non-bijective.

This commit re-enables the bijectivity check in `DetectIterMap`, and
adds a test case for this behavior.
---
 src/arith/iter_affine_map.cc                  |  4 ++-
 src/tir/ir/index_map.cc                       | 33 +++++++++++--------
 .../python/unittest/test_transform_layout.py  | 13 ++++++++
 3 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
index 182eada24d96..d41db2ff135e 100644
--- a/src/arith/iter_affine_map.cc
+++ b/src/arith/iter_affine_map.cc
@@ -1739,7 +1739,9 @@ class IterMapToExprNormalizer : public ExprMutator {
 bool IterMapRewriter::CanProveDivisible(const PrimExpr& lhs, const PrimExpr& rhs) {
   const auto* clhs = lhs.as<IntImmNode>();
   const auto* crhs = rhs.as<IntImmNode>();
-  if (clhs && crhs) {
+  if (crhs && crhs->value == 0) {
+    return false;
+  } else if (clhs && crhs) {
     return clhs->value % crhs->value == 0;
   }
 
diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc
index a25ecdd04079..e1cc9dbdd093 100644
--- a/src/tir/ir/index_map.cc
+++ b/src/tir/ir/index_map.cc
@@ -53,19 +53,21 @@ IndexMap IndexMap::FromFunc(int ndim, runtime::TypedPackedFunc<Array<PrimExpr>(A
   return IndexMap(initial_indices, func(initial_indices), std::move(inverse_index_map));
 }
 
-std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initial_ranges) const {
-  if ((*this)->inverse_index_map.defined()) {
+std::pair<IndexMap, PrimExpr> IndexMapInverseImpl(const IndexMap& self,
+                                                  const Array<Range>& initial_ranges,
+                                                  arith::IterMapLevel check_level) {
+  if (self->inverse_index_map.defined()) {
     // return the pre-defined inverse index map if exists.  In this
     // case, the user-defined inverse is assumed to be correct and
     // bijective.
     PrimExpr padding_predicate = Bool(false);
-    return {Downcast<IndexMap>((*this)->inverse_index_map.value()), padding_predicate};
+    return {Downcast<IndexMap>(self->inverse_index_map.value()), padding_predicate};
   }
 
   // Dummy variables to represent the inverse's inputs.
   Array<Var> output_vars;
-  for (size_t i = 0; i < (*this)->final_indices.size(); i++) {
-    PrimExpr index = (*this)->final_indices[i];
+  for (size_t i = 0; i < self->final_indices.size(); i++) {
+    PrimExpr index = self->final_indices[i];
     // TODO(Lunderberg): Better names for these variables.  A variable
     // that is passed through unmodified (`index` is an element of
     // `initial_indices`) should use that input index's name.  A pair
@@ -79,16 +81,16 @@ std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initia
 
   // Dummy ranges for the extent of each input.
   Map<Var, Range> input_iters;
-  ICHECK_EQ((*this)->initial_indices.size(), initial_ranges.size());
+  ICHECK_EQ(self->initial_indices.size(), initial_ranges.size());
   for (size_t i = 0; i < initial_ranges.size(); i++) {
-    input_iters.Set((*this)->initial_indices[i], initial_ranges[i]);
+    input_iters.Set(self->initial_indices[i], initial_ranges[i]);
   }
 
   // Unpack the output indices into linear combinations of the initial
   // indices.
   arith::Analyzer analyzer;
-  auto padded_iter_map = DetectIterMap((*this)->final_indices, input_iters, /* predicate = */ 1,
-                                       /*check_level=*/arith::IterMapLevel::NoCheck, &analyzer,
+  auto padded_iter_map = DetectIterMap(self->final_indices, input_iters, /* predicate = */ 1,
+                                       /*check_level=*/check_level, &analyzer,
                                        /*simplify_trivial_iterators=*/false);
   CHECK(padded_iter_map->errors.empty()) << "Could not parse mapping as sum of iterators.  "
                                          << "Error: " << padded_iter_map->errors[0];
@@ -100,8 +102,8 @@ std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initia
 
   // Unpack the map to an array, maintaining the same parameter order.
   Array<PrimExpr> inverse_exprs;
-  for (int i = 0, n = (*this)->initial_indices.size(); i < n; ++i) {
-    Var index = (*this)->initial_indices[i];
+  for (int i = 0, n = self->initial_indices.size(); i < n; ++i) {
+    Var index = self->initial_indices[i];
     PrimExpr expr;
     if (is_one(initial_ranges[i]->extent) && !inverse_exprs_map.count(index)) {
       expr = initial_ranges[i]->min;
@@ -116,7 +118,7 @@ std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initia
   padding_predicate = Substitute(padding_predicate, inverse_exprs_map);
 
   {
-    auto output_ranges = (*this)->MapRanges(initial_ranges);
+    auto output_ranges = self->MapRanges(initial_ranges);
     ICHECK_EQ(output_ranges.size(), output_vars.size());
 
     arith::Analyzer analyzer;
@@ -131,8 +133,13 @@ std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initia
   return {IndexMap(output_vars, inverse_exprs), padding_predicate};
 }
 
+std::pair<IndexMap, PrimExpr> IndexMap::NonSurjectiveInverse(Array<Range> initial_ranges) const {
+  return IndexMapInverseImpl(*this, initial_ranges, arith::IterMapLevel::NoCheck);
+}
+
 IndexMap IndexMap::Inverse(Array<Range> initial_ranges) const {
-  auto [inverse, padding_predicate] = NonSurjectiveInverse(std::move(initial_ranges));
+  auto [inverse, padding_predicate] =
+      IndexMapInverseImpl(*this, initial_ranges, arith::IterMapLevel::Bijective);
   arith::Analyzer analyzer;
   CHECK(analyzer.CanProve(!padding_predicate))
       << "Bijective inverse should not contain padding, but inverse of " << *this << " over range "
diff --git a/tests/python/unittest/test_transform_layout.py b/tests/python/unittest/test_transform_layout.py
index 18b37741765f..375fe4a24d57 100755
--- a/tests/python/unittest/test_transform_layout.py
+++ b/tests/python/unittest/test_transform_layout.py
@@ -575,5 +575,18 @@ def test_size_one_buffer(shape, transform):
     s[B].transform_layout(transform)
 
 
+def test_non_divisible_transform_raises_error():
+    A = te.placeholder([1, 3, 8, 8])
+    B = te.compute(A.shape, lambda *indices: A[indices])
+    s = te.create_schedule(B.op)
+
+    transform = lambda n, c, h, w: [n, c // 4, h, w, c % 4]
+    # Error occurs here, because the transformation would introduce
+    # padding.  Padded transforms are supported in TIR-based
+    # schedules.
+    with pytest.raises(tvm.TVMError):
+        s[B].transform_layout(transform)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 605876e638a7303acb63ef8a9dd9ebfefc30ae24 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Sat, 15 Oct 2022 03:14:23 +0900
Subject: [PATCH 353/704] [TEST] Fix the broken VNNI MetaSchedule test (#13067)

* fixed MS vnni template-based tuning test

* enable local testing of dp4a auto tensorize

* fix rocm and vk auto-tensorize test

* tweaking to see why IS_IN_CI isn't working

* skip dp4a auto tensorize test since IS_IN_CI is not working

* fix broken hexagon test after onnx model update
---
 tests/python/contrib/test_hexagon/test_models.py |  4 ++--
 tests/python/integration/test_auto_tensorize.py  | 16 ++++++++--------
 .../test_meta_schedule_vnni_integration.py       | 11 ++++++-----
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_models.py b/tests/python/contrib/test_hexagon/test_models.py
index 95e5191a8619..f4495f849fab 100644
--- a/tests/python/contrib/test_hexagon/test_models.py
+++ b/tests/python/contrib/test_hexagon/test_models.py
@@ -51,7 +51,7 @@ def test_mobilenet(hexagon_session: Session):
 
     data_in = np.random.rand(1, 3, 224, 224).astype(dtype=dtype)
 
-    input_name = "input"
+    input_name = "data"
     shape_dict = {input_name: data_in.shape}
     relay_mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, freeze_params=True)
     inputs = {input_name: data_in}
@@ -98,7 +98,7 @@ def test_mobilenet_aot(hexagon_session: Session, aot_host_target, aot_target, en
 
     data_in = np.random.rand(1, 3, 224, 224).astype(dtype=dtype)
 
-    input_name = "input"
+    input_name = "data"
     shape_dict = {input_name: data_in.shape}
     relay_mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, freeze_params=True)
     inputs = {input_name: data_in}
diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py
index 13b7b50b7afe..8c06e147c01f 100644
--- a/tests/python/integration/test_auto_tensorize.py
+++ b/tests/python/integration/test_auto_tensorize.py
@@ -283,10 +283,10 @@ def test_dp4a_dense():
     _test_dense("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070")
     # Uncomment to test on vulkan or rocm target
     # _test_dense(
-    #     "int8", sch_rules_for_dp4a, postprocs_for_dp4a, "vulkan -from_device=0"
+    #     "int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "vulkan -from_device=0"
     # )
     # _test_dense(
-    #     "int8", sch_rules_for_sdot4, postprocs_for_dp4a, "rocm"
+    #     "int8", SCH_RULES_FOR_SDOT4, POSTPROCS_FOR_DP4A, "rocm"
     # )
 
 
@@ -303,10 +303,10 @@ def test_dp4a_conv2d():
     _test_conv2d("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070")
     # Uncomment to test on vulkan or rocm target
     # _test_conv2d(
-    #     "int8", sch_rules_for_dp4a, postprocs_for_dp4a, "vulkan -from_device=0"
+    #     "int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "vulkan -from_device=0"
     # )
     # _test_conv2d(
-    #     "int8", sch_rules_for_sdot4, postprocs_for_dp4a, "rocm"
+    #     "int8", SCH_RULES_FOR_SDOT4, POSTPROCS_FOR_DP4A, "rocm"
     # )
 
 
@@ -342,16 +342,16 @@ def test_dp4a_bert_int8():
     #     params,
     #     input_info,
     #     "vulkan -from_device=0",
-    #     sch_rules_for_dp4a,
-    #     postprocs_for_dp4a,
+    #     SCH_RULES_FOR_DP4A,
+    #     POSTPROCS_FOR_DP4A,
     # )
     # _test_bert_int8(
     #     relay_mod,
     #     params,
     #     input_info,
     #     "rocm",
-    #     sch_rules_for_sdot4,
-    #     postprocs_for_dp4a,
+    #     SCH_RULES_FOR_SDOT4
+    #     POSTPROCS_FOR_DP4A,
     # )
 
 
diff --git a/tests/python/unittest/test_meta_schedule_vnni_integration.py b/tests/python/unittest/test_meta_schedule_vnni_integration.py
index 710ea96d9f5c..d0bfc913eca6 100644
--- a/tests/python/unittest/test_meta_schedule_vnni_integration.py
+++ b/tests/python/unittest/test_meta_schedule_vnni_integration.py
@@ -41,7 +41,7 @@ def _schedule_dense(m: Optional[int], do_tune: bool):
     """
 
     def schedule_fn(sch, dense_block: Optional[BlockRV] = None) -> bool:
-        if "dense" not in sch.mod.attrs["task_name"]:
+        if sch.mod.attrs is not None and "dense" not in sch.mod.attrs["task_name"]:
             return False
         if dense_block is None:
             dense_block = sch.get_block("compute")
@@ -204,7 +204,7 @@ def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV):
     with tempfile.TemporaryDirectory() as work_dir:
         # postprocs=lambda: [] is important to prevent default post processors from
         # tampering with the manual schedule.
-        tasks = ms.relay_integration.extracted_tasks_to_tune_contexts(
+        tasks, weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
             list(
                 filter(
                     lambda task: "dense" in task.task_name,
@@ -214,15 +214,16 @@ def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV):
             work_dir=work_dir,
             space=ms.space_generator.PostOrderApply(
                 f_block_filter=None,
-                sch_rules=None,
+                sch_rules="from-target",
                 postprocs=[],
-                mutator_probs=None,
+                mutator_probs="from-target",
             ),
         )
         database = ms.relay_integration.tune_tasks(
             tasks=tasks,
-            task_weights=[1.0] * len(tasks),
+            task_weights=weights,
             work_dir=work_dir,
+            max_trials_per_task=32,
             max_trials_global=20000,
         )
     with database, tvm.transform.PassContext(

From 44c35dcd96a3183a8ff95bd323f0e8321b48f44c Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Fri, 14 Oct 2022 14:59:57 -0700
Subject: [PATCH 354/704] [TVMScript] Fix parsing int64 loop with optional loop
 start (#13068)

* [TVMScript] Fix parsing int64 loop with optional loop start

* Update scope_handler.py

* fix range

* fix
---
 python/tvm/ir/expr.py                         |  7 ++--
 python/tvm/script/tir/scope_handler.py        | 42 +++++++++----------
 .../unittest/test_tvmscript_syntax_sugar.py   | 26 ++++++++++++
 3 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/python/tvm/ir/expr.py b/python/tvm/ir/expr.py
index 43cba1a83530..e16cd5ea9e2f 100644
--- a/python/tvm/ir/expr.py
+++ b/python/tvm/ir/expr.py
@@ -19,6 +19,7 @@
 
 from .base import Node
 from . import _ffi_api
+from ..runtime import const, convert
 
 
 class BaseExpr(Node):
@@ -118,9 +119,9 @@ class Range(Node):
 
     def __init__(self, begin, end=None, span=None):
         if end is None:
-            self.__init_handle_by_constructor__(_ffi_api.Range, 0, begin, span)
-        else:
-            self.__init_handle_by_constructor__(_ffi_api.Range, begin, end, span)
+            end = convert(begin)
+            begin = const(0, dtype=end.dtype, span=span)
+        self.__init_handle_by_constructor__(_ffi_api.Range, begin, end, span)
 
     @staticmethod
     def from_min_extent(min_value, extent, span=None):
diff --git a/python/tvm/script/tir/scope_handler.py b/python/tvm/script/tir/scope_handler.py
index 1d2550eecde2..69a414890655 100644
--- a/python/tvm/script/tir/scope_handler.py
+++ b/python/tvm/script/tir/scope_handler.py
@@ -23,7 +23,7 @@
 import tvm.tir
 from tvm.runtime import Object, String, convert
 from tvm.ir import Span, Range
-from tvm.tir import Stmt, PrimExpr, IterVar, Var, Buffer, BufferRegion, ForKind, IntImm
+from tvm.tir import Stmt, PrimExpr, IterVar, Var, Buffer, BufferRegion, ForKind
 
 from .node import BufferSlice
 
@@ -605,7 +605,7 @@ def exit_scope(
 
     def create_loop_info(
         self,
-        begin: PrimExpr,
+        begin: Optional[PrimExpr],
         end: PrimExpr,
         kind: ForKind,
         thread_binding: Optional[str] = None,
@@ -616,8 +616,8 @@ def create_loop_info(
 
         Parameters
         ----------
-        begin : PrimExpr
-            The beginning value.
+        begin : Optional[PrimExpr]
+            The beginning value. If None, it will be set to 0.
 
         end : PrimExpr
             The endding value.
@@ -639,11 +639,17 @@ def create_loop_info(
         for : For
             The constructed For.
         """
-        begin, end = [convert(_) for _ in [begin, end]]
+        end = convert(end)
+        if begin is None:
+            begin = tvm.tir.const(0, end.dtype)
+        else:
+            begin = convert(begin)
         assert self.context and self.node, "call 'exit_scope' before 'enter_scope'"
-        extent = end if begin == 0 else self.context.analyzer.simplify(end - begin)
-        if begin == 0 and isinstance(extent, PrimExpr):
-            begin = IntImm(extent.dtype, 0, begin.span)
+        extent = (
+            end
+            if self.context.analyzer.can_prove_equal(begin, 0)
+            else self.context.analyzer.simplify(end - begin)
+        )
         self.annotations: Mapping[str, Object] = {}
         if annotations is not None:
             self.annotations = {
@@ -665,8 +671,7 @@ def serial(
             annotations: Optional[Mapping[str, Object]] = None,
         ):
             if end is None:
-                end = begin
-                begin = 0
+                end, begin = begin, end
             self.create_loop_info(begin, end, ForKind.SERIAL, annotations=annotations)
 
         super().__init__(serial)
@@ -683,8 +688,7 @@ def parallel(
             annotations: Optional[Mapping[str, Object]] = None,
         ):
             if end is None:
-                end = begin
-                begin = 0
+                end, begin = begin, end
             self.create_loop_info(begin, end, ForKind.PARALLEL, annotations=annotations)
 
         super().__init__(parallel)
@@ -701,8 +705,7 @@ def vectorized(
             annotations: Optional[Mapping[str, Object]] = None,
         ):
             if end is None:
-                end = begin
-                begin = 0
+                end, begin = begin, end
             self.create_loop_info(begin, end, ForKind.VECTORIZED, annotations=annotations)
 
         super().__init__(vectorized)
@@ -719,8 +722,7 @@ def unroll(
             annotations: Optional[Mapping[str, Object]] = None,
         ):
             if end is None:
-                end = begin
-                begin = 0
+                end, begin = begin, end
             self.create_loop_info(begin, end, ForKind.UNROLLED, annotations=annotations)
 
         super().__init__(unroll)
@@ -744,8 +746,7 @@ def thread_binding(
                 else:
                     raise ValueError("Thread cannot be None for thread_binding")
             if end is None:
-                end = begin
-                begin = 0
+                end, begin = begin, end
             thread_iter_var = IterVar(None, None, IterVar.ThreadIndex, thread)
             self.create_loop_info(
                 begin,
@@ -771,8 +772,7 @@ def for_range(
             annotations: Optional[Mapping[str, Object]] = None,
         ):
             if end is None:
-                end = begin
-                begin = 0
+                end, begin = begin, end
             self.create_loop_info(begin, end, ForKind.SERIAL, annotations=annotations)
 
         super().__init__(for_range)
@@ -788,6 +788,6 @@ class Grid(ForScopeHandler):
     def __init__(self):
         def grid(*extents: List[PrimExpr]):
             for extent in extents:
-                self.create_loop_info(0, extent, ForKind.SERIAL)
+                self.create_loop_info(None, extent, ForKind.SERIAL)
 
         super().__init__(grid)
diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py
index 2a2f7354d7cd..849b0fc03d92 100644
--- a/tests/python/unittest/test_tvmscript_syntax_sugar.py
+++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py
@@ -387,5 +387,31 @@ def mma_sync_m16n16k16_desc_manual(a: T.handle, b: T.handle, c: T.handle) -> Non
     #         A[i] = A[ind]
 
 
+def test_int64_loop():
+    @T.prim_func
+    def int64_grid(
+        A: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+        B: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+    ) -> None:
+        for i, j in T.grid(T.int64(128), T.int64(128)):
+            with T.block("C"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                B[vi, vj] = A[vi, vj] + 1.0
+
+    @T.prim_func
+    def int64_grid_expanded(
+        A: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+        B: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+    ) -> None:
+        for i in range(T.int64(0), T.int64(128)):
+            for j in range(T.int64(0), T.int64(128)):
+                with T.block("C"):
+                    vi = T.axis.spatial(T.int64(128), i)
+                    vj = T.axis.spatial(T.int64(128), j)
+                    B[vi, vj] = A[vi, vj] + 1.0
+
+    assert_structural_equal(int64_grid, int64_grid_expanded)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 342ffb91d6687cfcb89c7c5aa4c624d44f640906 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Fri, 14 Oct 2022 15:03:08 -0700
Subject: [PATCH 355/704] [Hexagon]Register fast softmax schedule with default
 schedule (#13083)

* fast math schedule

* lint
---
 python/tvm/relay/op/strategy/hexagon.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/op/strategy/hexagon.py b/python/tvm/relay/op/strategy/hexagon.py
index 693352d650ba..c1d64f2fe143 100644
--- a/python/tvm/relay/op/strategy/hexagon.py
+++ b/python/tvm/relay/op/strategy/hexagon.py
@@ -83,14 +83,14 @@ def conv2d_strategy_hexagon(attrs, inputs, out_type, target):
             strategy.add_implementation(
                 wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
                 wrap_topi_schedule(topi.hexagon.schedule_depthwise_conv2d_nchw),
-                name="depthwise_conv2d_nchw.generic",
+                name="depthwise_conv2d_nchw.hexagon",
             )
         elif layout == "NHWC":
             assert kernel_layout == "HWOI"
             strategy.add_implementation(
                 wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                 wrap_topi_schedule(topi.hexagon.schedule_depthwise_conv2d_nhwc),
-                name="depthwise_conv2d_nhwc.generic",
+                name="depthwise_conv2d_nhwc.hexagon",
             )
         else:
             raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
@@ -222,3 +222,15 @@ def dense_pack_strategy_hexagon(attrs, inputs, out_type, target):
         )
 
     return strategy
+
+
+@fast_softmax_strategy.register("hexagon")
+def fast_softmax_strategy_cpu(attrs, inputs, out_type, target):
+    """fast_softmax hexagon strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_softmax(topi.nn.fast_softmax),
+        wrap_topi_schedule(topi.hexagon.schedule_softmax),
+        name="fast_softmax.hexagon",
+    )
+    return strategy

From 5eab64885ad49fbbde4f4a825b5b1bd7a451790c Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Fri, 14 Oct 2022 15:33:32 -0700
Subject: [PATCH 356/704] [ROOFLINE] Add support for different dtypes (#13003)

* [ROOFLINE] Add support for different dtypes

Support different dtypes in roofline analysis. Only x86 support for now,
but the interface is there to add support on cuda.

* whoops

* add div, mad*2 to flops count

* remove skips (from bad rebase)

* match features by name format
---
 python/tvm/utils/roofline/__init__.py  |  64 +++---
 python/tvm/utils/roofline/cuda.py      | 167 ++++++++++++----
 python/tvm/utils/roofline/registry.py  |  46 ++++-
 python/tvm/utils/roofline/x86.py       | 261 ++++++++++++++++---------
 tests/python/unittest/test_roofline.py | 119 +++++------
 5 files changed, 423 insertions(+), 234 deletions(-)

diff --git a/python/tvm/utils/roofline/__init__.py b/python/tvm/utils/roofline/__init__.py
index 0affb0704997..3b0144cb90e8 100644
--- a/python/tvm/utils/roofline/__init__.py
+++ b/python/tvm/utils/roofline/__init__.py
@@ -15,20 +15,20 @@
 # specific language governing permissions and limitations
 # under the License.
 """Utilities for computing an approximate roofline model"""
-from typing import Dict, Union, Optional
+from typing import Dict, Optional, Union
+
 import numpy as np
 
-from ... import auto_scheduler, relay, tir, nd, IRModule, build, topi, transform, get_global_func
-from ...target import Target
-from ...runtime import profiler_vm, profiling, Device, num_threads
-from ...script import tir as T
-from ...ir.instrument import pass_instrument
+from ... import IRModule, auto_scheduler, build, get_global_func, nd, relay, tir, topi, transform
+from ...contrib import utils
 from ...ir.expr import GlobalVar
+from ...ir.instrument import pass_instrument
 from ...rpc.base import RPC_SESS_MASK
 from ...rpc.client import RPCSession
-from ...contrib import utils
-
-from . import registry, cuda, x86
+from ...runtime import Device, num_threads, profiler_vm, profiling
+from ...script import tir as T
+from ...target import Target
+from . import cuda, registry, x86
 
 
 def _create_args(mod: IRModule, dev: Device, func_name: str = "main", remote=None):
@@ -131,14 +131,9 @@ def roofline_from_existing(
         :py:func:`roofline_analysis` for more information on which metrics
         are included.
     """
-    with target:
-        peak_bandwidth = registry.estimate_peak_bandwidth(target, dev, remote)
-        peak_flops = registry.estimate_peak_flops(target, dev, remote)
-
-    ridge_point = peak_flops / peak_bandwidth
 
     all_features = {
-        prim.attrs["hash"]: (name, auto_scheduler.feature.named_features_from_primfunc(prim))
+        prim.attrs["hash"]: (name, prim, auto_scheduler.feature.named_features_from_primfunc(prim))
         for name, prim in tir_functions.items()
         if isinstance(prim, tir.PrimFunc) and "hash" in prim.attrs.keys()
     }
@@ -146,28 +141,17 @@ def roofline_from_existing(
     new_calls = []
     for call in report.calls:
         if "Hash" in call.keys() and call["Hash"] in all_features:
-            _, features = all_features[call["Hash"]]
-
-            flops = np.sum(features["float_addsub"] + features["float_mul"] + features["float_mad"])
-            loaded_bytes = 0.0
-            # assume no more than 100 buffers
-            for i in range(100):
-                if str(target.kind) == "cuda":
-                    # autoscheduler features do not take into account that 1.
-                    # global and shared memory have very different performance
-                    # characteristics -- both are included in the same bytes
-                    # touched count 2. multiple threads accessing the same byte
-                    # of memory does not use the same amount of bandwidth as
-                    # multiple threads accessing different bytes of memory. We
-                    # use unique bytes accessed here to avoid these two issues,
-                    # but this does bias results towards being more compute
-                    # bound.
-                    key = f"B{i}.unique_bytes"
-                else:
-                    key = f"B{i}.bytes"
-                if not key in features.keys():
-                    break
-                loaded_bytes += np.sum(features[key])
+            _, prim, features = all_features[call["Hash"]]
+
+            with target:
+                flops, peak_flops, flops_name = registry.estimate_peak_flops(
+                    prim, features, target, dev, remote
+                )
+                loaded_bytes, peak_bandwidth, bandwidth_name = registry.estimate_peak_bandwidth(
+                    prim, features, target, dev, remote
+                )
+            ridge_point = peak_flops / peak_bandwidth
+
             runtime = call["Duration (us)"].microseconds * 1e-6
             arith_inten = flops / loaded_bytes
             call = dict(call)
@@ -188,8 +172,10 @@ def roofline_from_existing(
         else:
             new_calls.append(call)
     new_configuration = dict(report.configuration.items())
-    new_configuration["Estimated Peak FLOP/s"] = profiling.Ratio(peak_flops)
-    new_configuration["Estimated Peak Bandwidth (byte/second)"] = profiling.Ratio(peak_bandwidth)
+    new_configuration[f"Estimated Peak FLOP/s ({flops_name})"] = profiling.Ratio(peak_flops)
+    new_configuration[
+        f"Estimated Peak Bandwidth ({bandwidth_name}, byte/second)"
+    ] = profiling.Ratio(peak_bandwidth)
     return profiling.Report(new_calls, report.device_metrics, new_configuration)
 
 
diff --git a/python/tvm/utils/roofline/cuda.py b/python/tvm/utils/roofline/cuda.py
index f5a3f5e1dde9..b6e8ae066459 100644
--- a/python/tvm/utils/roofline/cuda.py
+++ b/python/tvm/utils/roofline/cuda.py
@@ -15,25 +15,31 @@
 # specific language governing permissions and limitations
 # under the License.
 """Estimation of peak flops and memory bandwidth for cuda devices"""
-from typing import Optional
-from ...script import tir as T
-from ... import nd, build, transform
-from ...runtime import Device
-from ...target import Target
+import functools
+import re
+from typing import Dict, Optional, Tuple
+
+import numpy as np
+
+from ... import build, nd, transform
+from ...contrib import nvcc, utils
 from ...rpc.base import RPC_SESS_MASK
 from ...rpc.client import RPCSession
+from ...runtime import Device
+from ...script import tir as T
+from ...target import Target
+from ...tir import PrimFunc
 from . import registry
-from ...contrib import utils, nvcc
 
 
-@registry.estimate_peak_flops.register("cuda")
+@functools.lru_cache(maxsize=None)
 def estimate_peak_flops_tensorcore(
     target: Target,
     dev: Device,
     remote: Optional[RPCSession],
     mat_dtype: str = "float16",
     acc_dtype: str = "float32",
-) -> float:
+) -> Tuple[float, float, str]:
     """Estimate the peak FLOP/s of a cuda device with tensorcores.
 
     This estimate should only be used to compare with operators that can use
@@ -64,12 +70,11 @@ def estimate_peak_flops_tensorcore(
 
     Returns
     -------
-    float
+    peak_flops : float
         Approximate sustained FLOP/s of this target/device combo assuming
         mma instructions. Addition and multiplications are each counted as
         separate FLOPs.
     """
-    assert str(target.kind) == "cuda", "Only CUDA devices have tensorcores"
 
     @T.prim_func
     def peak_flops_tensorcore_tir(
@@ -161,6 +166,56 @@ def peak_flops_tensorcore_tir(
     return n * 16 * 16 * 16 * 2 * sms * 8 / times.min
 
 
+@registry.estimate_peak_flops.register("cuda")
+def estimate_peak_flops(
+    func: PrimFunc,  # pylint: disable=unused-argument
+    features: Dict[str, np.ndarray],
+    target: Target,
+    dev: Device,
+    remote: Optional[RPCSession],
+) -> Tuple[float, float, str]:
+    """Estimate the peak FLOP/s of a cuda device.
+
+    Parameters
+    ----------
+    func : PrimFunc
+        Function to estimate peak flops for. Used to check if a specific kind
+        intrinsic or dtype could be used with this function.
+    features : Dict[str, np.ndarry]
+        Features extracted from `func`. Used to check if a specific kind
+        intrinsic or dtype could be used with this function.
+    target : Target
+        Target to run on. This should be as specific to the actual hardware as
+        possible.
+    dev : Device
+        Device to run on.
+    remote : Optional[RPCSession]
+      Remote session used to upload artifacts for runtime evaluation. Must be
+      the same session used to create `dev`.
+
+    Returns
+    -------
+    flops : float
+        Estimated number of flops used by `func`.
+    peak_flops : float
+        Approximate sustained FLOP/s of this target/device combo. Addition and
+        multiplications are each counted as separate FLOPs.
+    name : str
+        Dtype/intrinsic used by `func` to achieve peak flops.
+    """
+    assert nvcc.have_tensorcore(
+        dev.compute_version
+    ), "CUDA roofline only works with devices that have tensorcores"
+    flops = np.sum(
+        features["float_addsub"]
+        + features["float_mul"]
+        + features["float_mad"] * 2
+        + features["float_divmod"]
+    )
+    peak_flops = estimate_peak_flops_tensorcore(target, dev, remote)
+    return flops, peak_flops, "float16 tensorcore"
+
+
 @T.prim_func
 def peak_bandwidth_tir(a: T.handle, b: T.handle, blocks: T.int32, warp_size: T.int32) -> None:
     # pylint: disable=invalid-name, missing-function-docstring
@@ -178,37 +233,13 @@ def peak_bandwidth_tir(a: T.handle, b: T.handle, blocks: T.int32, warp_size: T.i
                     B[i, l, j] += A[i, k, l, j]
 
 
-@registry.estimate_peak_bandwidth.register("cuda")
-def estimate_peak_bandwidth(
+@functools.lru_cache(maxsize=None)
+def estimate_peak_bandwidth_global_mem(
     target: Target,
     dev: Device,
     remote: Optional[RPCSession] = None,
-) -> float:
-    """Estimate peak memory bandwidth of a target/device combo.
-
-    Peak bandwidth is estimated by running a small experiment on the underlying
-    hardware. The peak bandwidth measurement assumes that vector instructions
-    are being used to load the data.
-
-    Parameters
-    ----------
-    target : Target
-        Target to use for measurement. This target should be as specific to the
-        underlying hardware as possible.
-    dev : Device
-        Device to measure peak bandwidth on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    float
-        Peak memory bandwidth in bytes/seconds.
-    """
-    assert nvcc.have_tensorcore(
-        dev.compute_version
-    ), "CUDA roofline only works with devices that have tensorcores"
+) -> Tuple[float, float, str]:
+    """Estimate peak bandwidth of global memory. See estimate_peak_bandwidth"""
     warp_size = dev.warp_size
     # These sizes seem large enough to give the card time to hit a fixpoint on memory bandwidth
     blocks = 1024
@@ -234,3 +265,63 @@ def estimate_peak_bandwidth(
     b = nd.empty((blocks, 4, warp_size), dtype="float32", device=dev)
     times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(a, b)
     return a.numpy().size * 4 / times.min  # 4 bytes per float32
+
+
+@registry.estimate_peak_bandwidth.register("cuda")
+def estimate_peak_bandwidth(
+    func: PrimFunc,  # pylint: disable=unused-argument
+    features: Dict[str, np.ndarray],
+    target: Target,
+    dev: Device,
+    remote: Optional[RPCSession] = None,
+) -> Tuple[float, float, str]:
+    """Estimate peak memory bandwidth of a target/device combo.
+
+    Peak bandwidth is estimated by running a small experiment on the underlying
+    hardware. The peak bandwidth measurement assumes that vector instructions
+    are being used to load the data.
+
+    Parameters
+    ----------
+    func : PrimFunc
+        Function to estimate peak bandwidth for. Used to check if a specific
+        kind of memory could be used with this function.
+    features : Dict[str, np.ndarry]
+        Features extracted from `func`. Used to check if a specific kind of
+        memory could be used with this function.
+    target : Target
+        Target to use for measurement. This target should be as specific to the
+        underlying hardware as possible.
+    dev : Device
+        Device to measure peak bandwidth on.
+    remote : Optional[RPCSession]
+      Remote session used to upload artifacts for runtime evaluation. Must be
+      the same session used to create `dev`.
+
+    Returns
+    -------
+    loaded_bytes : float
+        Estimated bytes loaded by `func`.
+    peak_bandwidth : float
+        Peak memory bandwidth in bytes/seconds.
+    name : str
+        Name of the memory being used.
+    """
+    # autoscheduler features do not take into account that 1.
+    # global and shared memory have very different performance
+    # characteristics -- both are included in the same bytes
+    # touched count 2. multiple threads accessing the same byte
+    # of memory does not use the same amount of bandwidth as
+    # multiple threads accessing different bytes of memory. We
+    # use unique bytes accessed here to avoid these two issues,
+    # but this does bias results towards being more compute
+    # bound.
+    loaded_bytes = sum(
+        [
+            np.sum(x)
+            for (k, x) in features.items()
+            if re.match(r"^B[0-9]+\.unique_bytes$", k) is not None
+        ]
+    )
+    peak_bandwidth = estimate_peak_bandwidth_global_mem(target, dev, remote)
+    return loaded_bytes, peak_bandwidth, "global"
diff --git a/python/tvm/utils/roofline/registry.py b/python/tvm/utils/roofline/registry.py
index b3ea522be899..9358529b38ec 100644
--- a/python/tvm/utils/roofline/registry.py
+++ b/python/tvm/utils/roofline/registry.py
@@ -15,18 +15,24 @@
 # specific language governing permissions and limitations
 # under the License.
 """Definition of generic functions for estimating peak flops and bandwidth"""
-from typing import Optional
-from ...target import Target, generic_func
-from ...runtime import Device
+from typing import Dict, Optional, Tuple
+
+import numpy as np
+
 from ...rpc.client import RPCSession
+from ...runtime import Device
+from ...target import Target, generic_func
+from ...tir import PrimFunc
 
 
 @generic_func
 def estimate_peak_bandwidth(
+    func: PrimFunc,
+    features: Dict[str, np.ndarray],
     target: Target,
     dev: Device,
     remote: Optional[RPCSession] = None,
-) -> float:
+) -> Tuple[float, float, str]:
     """Estimate peak memory bandwidth of a target/device combo.
 
     Peak bandwidth is estimated by running a small experiment on the underlying
@@ -35,6 +41,12 @@ def estimate_peak_bandwidth(
 
     Parameters
     ----------
+    func : PrimFunc
+        Function to estimate peak bandwidth for. Used to check if a specific
+        kind of memory could be used with this function.
+    features : Dict[str, np.ndarry]
+        Features extracted from `func`. Used to check if a specific kind of
+        memory could be used with this function.
     target : Target
         Target to use for measurement. This target should be as specific to the
         underlying hardware as possible.
@@ -46,18 +58,24 @@ def estimate_peak_bandwidth(
 
     Returns
     -------
-    float
+    loaded_bytes : float
+        Estimated bytes loaded by `func`.
+    peak_bandwidth : float
         Peak memory bandwidth in bytes/seconds.
+    name : str
+        Name of the memory being used.
     """
     raise NotImplementedError()
 
 
 @generic_func
 def estimate_peak_flops(
+    func: PrimFunc,
+    features: Dict[str, np.ndarray],
     target: Target,
     dev: Device,
     remote: Optional[RPCSession],
-) -> float:
+) -> Tuple[float, float, str]:
     """
     Estimate the maximum number of FLOP/s this target/device combo is capable
     of reaching by running a test program. This is a generic function that
@@ -65,6 +83,12 @@ def estimate_peak_flops(
 
     Parameters
     ----------
+    func : PrimFunc
+        Function to estimate peak flops for. Used to check if a specific kind
+        intrinsic or dtype could be used with this function.
+    features : Dict[str, np.ndarry]
+        Features extracted from `func`. Used to check if a specific kind
+        intrinsic or dtype could be used with this function.
     target : Target
         Target to run on. This should be as specific to the actual hardware as
         possible to make sure that LLVM generates the best vector code.
@@ -76,8 +100,12 @@ def estimate_peak_flops(
 
     Returns
     -------
-    float
-        Approximate sustained FLOP/s of this target/device combo. Each FMA
-        operation counts as two FLOPs.
+    flops : float
+        Estimated number of flops used by `func`.
+    peak_flops : float
+        Approximate sustained FLOP/s of this target/device combo assuming
+        vectorized FMA instructions. Each FMA operation counts as two FLOPs.
+    name : str
+        Dtype/intrinsic used by `func` to achieve peak flops.
     """
     raise NotImplementedError()
diff --git a/python/tvm/utils/roofline/x86.py b/python/tvm/utils/roofline/x86.py
index d4a0e511848f..8ed7ac418f0c 100644
--- a/python/tvm/utils/roofline/x86.py
+++ b/python/tvm/utils/roofline/x86.py
@@ -15,15 +15,20 @@
 # specific language governing permissions and limitations
 # under the License.
 """Estimate peak flops and bandwidth for x86 devices"""
-from typing import Optional
+import functools
+import re
+from typing import Dict, Optional, Tuple
 
-from ... import nd, build, topi, transform, get_global_func
-from ...target import Target
-from ...runtime import Device, num_threads
-from ...script import tir as T
+import numpy as np
+
+from ... import build, get_global_func, nd, topi, transform
+from ...contrib import utils
 from ...rpc.base import RPC_SESS_MASK
 from ...rpc.client import RPCSession
-from ...contrib import utils
+from ...runtime import DataType, Device, num_threads
+from ...script import tir as T
+from ...target import Target
+from ...tir import PrimFunc
 from . import registry
 
 
@@ -44,7 +49,7 @@ def _detect_vec_width_registers(
     Returns
     -------
     vec_width: int
-        Width of a vector register on `target`.
+        Width of a vector register on `target` in bytes.
     num_vector_registers: int
         Number of vector registers on `target`.
     """
@@ -57,7 +62,7 @@ def _detect_vec_width_registers(
             and target.keys[0] == "cpu"
         ):
             with target:
-                vec_width = topi.x86.utils.get_simd_32bit_lanes()  # in number of float32s
+                vec_width = topi.x86.utils.get_simd_32bit_lanes() * 4  # in number of bytes
         else:
             raise RuntimeError(f"Cannot determine vector width for target {target}")
     if num_vector_registers is None:
@@ -68,66 +73,41 @@ def _detect_vec_width_registers(
     return vec_width, num_vector_registers
 
 
-@T.prim_func
-def peakflops_fma_tir(
-    a: T.handle,
-    vec_width: T.int32,
-    iters: T.int32,
-    num_vector_registers: T.int32,
-    threads: T.int32,
-) -> None:
-    # pylint: disable=invalid-name, missing-function-docstring
-    A = T.match_buffer(a, [threads, num_vector_registers, vec_width], "float32")
-    for t in T.parallel(threads):
-        for _j in range(iters):
-            for l in T.unroll(num_vector_registers):
-                # We want to use as few registers as possible, so we perform
-                # all operations on the same element
-                for k in T.vectorized(vec_width):
-                    A[t, l, k] = A[t, l, k] * A[t, l, k] + A[t, l, k]
-
-
-@registry.estimate_peak_flops.register("cpu")
-def estimate_peak_fma_flops(
+@functools.lru_cache(maxsize=None)
+def estimate_peak_fma_vector_flops(
     target: Target,
     dev: Device,
     remote: Optional[RPCSession],
+    dtype: DataType,
     vec_width: Optional[int] = None,
     num_vector_registers: Optional[int] = None,
-) -> float:
+):
+    """Estimate peak flops assuming vector fma instructions and no explicit
+    intrinsics. See estimate_peak_fma_flops.
     """
-    Estimate the maximum number of FLOP/s this target/device combo is capable
-    of reaching by running a test program. This assumes vectorized f32 FMA
-    (fused-multiply-add) instructions.
-
 
-    Parameters
-    ----------
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible to make sure that LLVM generates the best vector code.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-    vec_width : Optional[int]
-        Vector width of SIMD units on the underlying hardware. Will try to
-        infer if no value is provided.
-    num_vector_registers : Optional[int]
-        Number of vector registers on the underlying hardware. Will try to
-        infer if no value is provided.
+    @T.prim_func
+    def peakflops_fma_tir(
+        a: T.handle,
+        vec_width: T.int32,
+        iters: T.int32,
+        num_vector_registers: T.int32,
+        threads: T.int32,
+    ) -> None:
+        # pylint: disable=invalid-name, missing-function-docstring
+        A = T.match_buffer(a, [threads, num_vector_registers, vec_width], dtype)
+        for t in T.parallel(threads):
+            for _j in range(iters):
+                for l in T.unroll(num_vector_registers):
+                    # We want to use as few registers as possible, so we perform
+                    # all operations on the same element
+                    for k in T.vectorized(vec_width):
+                        A[t, l, k] = A[t, l, k] * A[t, l, k] + A[t, l, k]
 
-    Returns
-    -------
-    float
-        Approximate sustained FLOP/s of this target/device combo assuming
-        vectorized f32 FMA instructions. Each FMA operation counts as two FLOPs.
-    """
-    assert str(target.kind) == "llvm", "Only llvm targets are supported"
     vec_width, num_vector_registers = _detect_vec_width_registers(
         target, vec_width, num_vector_registers
     )
+    vec_width //= DataType(dtype).bits // 8
     iters = 1000000
     nthreads = num_threads()
     specialized = peakflops_fma_tir.specialize(
@@ -155,12 +135,82 @@ def estimate_peak_fma_flops(
         random_fill = get_global_func("tvm.contrib.random.random_fill")
     assert random_fill, "Please make sure USE_RANDOM is ON in config.cmake"
 
-    a = nd.empty((nthreads, num_vector_registers, vec_width), dtype="float32", device=dev)
+    a = nd.empty((nthreads, num_vector_registers, vec_width), dtype=dtype, device=dev)
     random_fill(a)
     times = f.time_evaluator(f.entry_name, dev, repeat=100, number=1)(a)
     flops = 2 * vec_width * num_vector_registers * nthreads * iters  # fma is two flops
-    flop_s = flops / times.min
-    return flop_s
+    return flops / times.min
+
+
+@registry.estimate_peak_flops.register("cpu")
+def estimate_peak_fma_flops(
+    func: PrimFunc,
+    features: Dict[str, np.ndarray],
+    target: Target,
+    dev: Device,
+    remote: Optional[RPCSession],
+    vec_width: Optional[int] = None,
+    num_vector_registers: Optional[int] = None,
+) -> Tuple[float, float, str]:
+    """
+    Estimate the maximum number of FLOP/s this target/device combo is capable
+    of reaching by running a test program. This assumes vectorized FMA
+    (fused-multiply-add) instructions.
+
+
+    Parameters
+    ----------
+    func : PrimFunc
+        Function to estimate peak flops for. Used to check if a specific kind
+        intrinsic or dtype could be used with this function.
+    features : Dict[str, np.ndarry]
+        Features extracted from `func`. Used to check if a specific kind
+        intrinsic or dtype could be used with this function.
+    target : Target
+        Target to run on. This should be as specific to the actual hardware as
+        possible to make sure that LLVM generates the best vector code.
+    dev : Device
+        Device to run on.
+    remote : Optional[RPCSession]
+      Remote session used to upload artifacts for runtime evaluation. Must be
+      the same session used to create `dev`.
+    vec_width : Optional[int]
+        Vector width of SIMD units on the underlying hardware. Will try to
+        infer if no value is provided.
+    num_vector_registers : Optional[int]
+        Number of vector registers on the underlying hardware. Will try to
+        infer if no value is provided.
+
+    Returns
+    -------
+    flops : float
+        Estimated number of flops used by `func`.
+    peak_flops : float
+        Approximate sustained FLOP/s of this target/device combo assuming
+        vectorized FMA instructions. Each FMA operation counts as two FLOPs.
+    name : str
+        Dtype/intrinsic used by `func` to achieve peak flops.
+    """
+    # assume that the first argument's dtype is the one we want
+    dtype = list(func.buffer_map.values())[0].dtype
+    if "int" in dtype:
+        flops = np.sum(
+            features["int_addsub"]
+            + features["int_mul"]
+            + features["int_mad"] * 2
+            + features["int_divmod"]
+        )
+    else:
+        flops = np.sum(
+            features["float_addsub"]
+            + features["float_mul"]
+            + features["float_mad"] * 2
+            + features["float_divmod"]
+        )
+    peak_flops = estimate_peak_fma_vector_flops(
+        target, dev, remote, dtype, vec_width, num_vector_registers
+    )
+    return flops, peak_flops, f"{dtype} FMA"
 
 
 @T.prim_func
@@ -181,43 +231,14 @@ def peak_bandwidth_tir(a: T.handle, b: T.handle, threads: T.int32, vec_width: T.
                     B[i, l, j] += A[i, k, l, j]
 
 
-@registry.estimate_peak_bandwidth.register("cpu")
-def estimate_peak_bandwidth(
+@functools.lru_cache(maxsize=None)
+def estimate_peak_bandwidth_dram(
     target: Target,
     dev: Device,
     remote: Optional[RPCSession],
     vec_width: Optional[int] = None,
 ) -> float:
-    """Estimate peak memory bandwidth of a target/device combo.
-
-    Peak bandwidth is estimated by running a small experiment on the underlying
-    hardware. The peak bandwidth measurement assumes that vector instructions
-    are being used to load the data.
-
-    Parameters
-    ----------
-    target : Target
-        Target to use for measurement. This target should be as specific to the
-        underlying hardware as possible.
-    dev : Device
-        Device to measure peak bandwidth on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-    vec_width : Optional[int]
-        Vector unit width, determined from target if not supplied.
-
-    Returns
-    -------
-    float
-        Peak memory bandwidth in bytes/seconds.
-    """
-    # Ideally we'd be able to use this code to measure peak bandwidth of the
-    # different cache levels. If we could just generate load commands, then we
-    # could use those in a tight loop. Instead we need some code that is
-    # limited on the cache bandwidth. With the L1 cache we need an operation
-    # that has a very low arithmetic intensity and we haven't come up with one
-    # yet.
+    """Estimate peak bandwidth for DRAM. See estimate_peak_bandwidth."""
     vec_width, _ = _detect_vec_width_registers(target, vec_width, 1)
     specialized = peak_bandwidth_tir.specialize(
         {
@@ -252,3 +273,59 @@ def estimate_peak_bandwidth(
     random_fill(b)
     times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(a, b, threads)
     return a.numpy().size * 4 / times.min  # 4 bytes per float32
+
+
+@registry.estimate_peak_bandwidth.register("cpu")
+def estimate_peak_bandwidth(
+    func: PrimFunc,  # pylint: disable=unused-argument
+    features: Dict[str, np.ndarray],
+    target: Target,
+    dev: Device,
+    remote: Optional[RPCSession],
+    vec_width: Optional[int] = None,
+) -> Tuple[float, float, str]:
+    """Estimate peak memory bandwidth of a target/device combo.
+
+    Peak bandwidth is estimated by running a small experiment on the underlying
+    hardware. The peak bandwidth measurement assumes that vector instructions
+    are being used to load the data.
+
+    Parameters
+    ----------
+    func : PrimFunc
+        Function to estimate peak bandwidth for. Used to check if a specific
+        kind of memory could be used with this function.
+    features : Dict[str, np.ndarry]
+        Features extracted from `func`. Used to check if a specific kind of
+        memory could be used with this function.
+    target : Target
+        Target to use for measurement. This target should be as specific to the
+        underlying hardware as possible.
+    dev : Device
+        Device to measure peak bandwidth on.
+    remote : Optional[RPCSession]
+      Remote session used to upload artifacts for runtime evaluation. Must be
+      the same session used to create `dev`.
+    vec_width : Optional[int]
+        Vector unit width, determined from target if not supplied.
+
+    Returns
+    -------
+    loaded_bytes : float
+        Estimated bytes loaded by `func`.
+    peak_bandwidth : float
+        Peak memory bandwidth in bytes/seconds.
+    name : str
+        Name of the memory being used.
+    """
+    # Ideally we'd be able to use this code to measure peak bandwidth of the
+    # different cache levels. If we could just generate load commands, then we
+    # could use those in a tight loop. Instead we need some code that is
+    # limited on the cache bandwidth. With the L1 cache we need an operation
+    # that has a very low arithmetic intensity and we haven't come up with one
+    # yet.
+    peak_bandwidth = estimate_peak_bandwidth_dram(target, dev, remote, vec_width)
+    loaded_bytes = sum(
+        [np.sum(x) for (k, x) in features.items() if re.match(r"^B[0-9]+\.bytes$", k) is not None]
+    )
+    return loaded_bytes, peak_bandwidth, "DRAM"
diff --git a/tests/python/unittest/test_roofline.py b/tests/python/unittest/test_roofline.py
index e37f6e085bf6..a8bf4df497f6 100644
--- a/tests/python/unittest/test_roofline.py
+++ b/tests/python/unittest/test_roofline.py
@@ -14,81 +14,88 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import numpy as np
-import pytest
-from io import StringIO
 import csv
-import os
 import json
+import os
 import platform
+from io import StringIO
+
+import numpy as np
+import pytest
 
 import tvm.testing
 import tvm.utils
-from tvm.runtime import profiler_vm
-from tvm import relay
-from tvm.relay.testing import mlp
-from tvm.contrib.debugger import debug_executor
-from tvm import rpc
+from tvm import relay, rpc
 from tvm.contrib import utils
+from tvm.contrib.debugger import debug_executor
+from tvm.relay.testing import mlp
+from tvm.runtime import profiler_vm
 from tvm.runtime.profiling import Report
 from tvm.script import tir as T
 
 
-@tvm.testing.parametrize_targets("llvm", "cuda")
-def test_estimate_peak_flops(target, dev):
-    server = rpc.Server(key="roofline_flops")
-    remote = rpc.connect("127.0.0.1", server.port, key="roofline_flops")
-    dev = remote.device(target)
+@tvm.testing.requires_llvm
+@pytest.mark.parametrize("dtype", ["float32", "int8", "int32"])
+def test_estimate_peak_flops_cpu(dtype):
+    server = rpc.Server(key="roofline_flops_cpu")
+    remote = rpc.connect("127.0.0.1", server.port, key="roofline_flops_cpu")
+    target = tvm.target.Target("llvm -mattr=+fma,+avx2")
+    dev = remote.device(str(target))
     # This test uses vectorized instructions so we need a target that supports them
-    if target == "llvm":
-        target = "llvm -mattr=+fma,+avx2"
-    target = tvm.target.Target(target)
-    with target:
-        flops = tvm.utils.roofline.registry.estimate_peak_flops(target, dev, remote)
-    if str(target.kind) == "llvm":
-        # Assume we can achieve 1 GFLOP/s per thread, which is 1 FLOP per cycle on a 1GHz cpu.
-        assert (
-            flops > 10**9 and flops < 10**14
-        ), f"FLOP/s should be between 10^9 and 10^14, but it is {flops}"
-    elif str(target.kind) == "cuda":
-        # should be able to hit a TFLOP/s with tensor cores
-        assert (
-            flops > 10**12 and flops < 10**14
-        ), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"
-    else:
-        raise RuntimeError("Unsupported target " + str(target))
+    flops = tvm.utils.roofline.x86.estimate_peak_fma_vector_flops(target, dev, remote, "float32")
+    # Assume we can achieve 1 GFLOP/s per thread, which is 1 FLOP per cycle on a 1GHz cpu.
+    assert (
+        flops > 10**9 and flops < 10**14
+    ), f"FLOP/s should be between 10^9 and 10^14, but it is {flops}"
+
+
+@tvm.testing.requires_cuda
+def test_estimate_peak_flops_gpu():
+    server = rpc.Server(key="roofline_flops_gpu")
+    remote = rpc.connect("127.0.0.1", server.port, key="roofline_flops_gpu")
+    target = tvm.target.Target("cuda")
+    dev = remote.device(str(target))
+    # This test uses vectorized instructions so we need a target that supports them
+    flops = tvm.utils.roofline.cuda.estimate_peak_flops_tensorcore(target, dev, remote)
+    # should be able to hit a TFLOP/s with tensor cores
+    assert (
+        flops > 10**12 and flops < 10**14
+    ), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"
 
 
 @tvm.testing.skip_if_32bit(reason="Cannot allocate enough memory on i386")
-@tvm.testing.parametrize_targets("llvm", "cuda")
-def test_estimate_peak_bandwidth(target, dev):
-    server = rpc.Server(key="roofline_bandwidth")
-    remote = rpc.connect("127.0.0.1", server.port, key="roofline_bandwidth")
-    dev = remote.device(target)
+@tvm.testing.requires_llvm
+def test_estimate_peak_bandwidth_cpu():
+    server = rpc.Server(key="roofline_bandwidth_cpu")
+    remote = rpc.connect("127.0.0.1", server.port, key="roofline_bandwidth_cpu")
+    target = tvm.target.Target("llvm -mattr=+fma,+avx2")
+    dev = remote.device(str(target))
+    # This test uses vectorized instructions so we need a target that supports them
+    bandwidth = tvm.utils.roofline.x86.estimate_peak_bandwidth_dram(target, dev, remote)
+    # Assume we can achieve 1 GB/s. DDR2 should transfer somewhere around 6
+    # GB/s, so this should leave enough wiggle room.
+    assert (
+        bandwidth > 10**9 and bandwidth < 10**12
+    ), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"
+
+
+@tvm.testing.requires_cuda
+def test_estimate_peak_bandwidth_gpu():
+    server = rpc.Server(key="roofline_bandwidth_gpu")
+    remote = rpc.connect("127.0.0.1", server.port, key="roofline_bandwidth_gpu")
+    target = tvm.target.Target("cuda")
+    dev = remote.device(str(target))
     # This test uses vectorized instructions so we need a target that supports them
-    if target == "llvm":
-        target = "llvm -mattr=+fma,+avx2"
-    target = tvm.target.Target(target)
-    with target:
-        bandwidth = tvm.utils.roofline.registry.estimate_peak_bandwidth(target, dev, remote)
-    if str(target.kind) == "llvm":
-        # Assume we can achieve 1 GB/s. DDR2 should transfer somewhere around 6
-        # GB/s, so this should leave enough wiggle room.
-        assert (
-            bandwidth > 10**9 and bandwidth < 10**12
-        ), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"
-    elif str(target.kind) == "cuda":
-        # should be able to hit a 100 GB/s on a GPU. GTX 280 hits 140 GB/s and
-        # it is really old.
-        assert (
-            bandwidth > 10**11 and bandwidth < 10**13
-        ), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"
-    else:
-        raise RuntimeError("Unsupported target " + str(target))
+    bandwidth = tvm.utils.roofline.cuda.estimate_peak_bandwidth_global_mem(target, dev, remote)
+    # should be able to hit a 100 GB/s on a GPU. GTX 280 hits 140 GB/s and
+    # it is really old.
+    assert (
+        bandwidth > 10**11 and bandwidth < 10**13
+    ), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"
 
 
 @tvm.testing.skip_if_32bit(reason="Cannot allocate enough memory on i386")
-@tvm.testing.parametrize_targets("llvm -mattr=+fma+avx2", "cuda")
+@tvm.testing.parametrize_targets("llvm -mattr=+fma,+avx2", "cuda")
 def test_roofline_analysis(target, dev):
     a = relay.var("a", relay.TensorType((512, 512), "float32"))
     b = relay.var("b", relay.TensorType((512, 512), "float32"))

From 5ed94eefad23df17cc998f058fb2dede6ce9f7ae Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Fri, 14 Oct 2022 16:47:21 -0700
Subject: [PATCH 357/704] [Node] Fix structural equal path tracing pointer
 usage (#13082)

---
 src/node/structural_equal.cc | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/node/structural_equal.cc b/src/node/structural_equal.cc
index 2f49d9ef5629..0a9a0ec0bbb7 100644
--- a/src/node/structural_equal.cc
+++ b/src/node/structural_equal.cc
@@ -291,23 +291,19 @@ class SEqualHandlerDefault::Impl {
       }
       if (equal_map_rhs_.count(rhs)) return false;
 
-      SEqualReducer reducer = GetReducer(lhs, rhs, map_free_vars, current_paths);
-      return vtable_->SEqualReduce(lhs.get(), rhs.get(), reducer);
+      if (!IsPathTracingEnabled()) {
+        return vtable_->SEqualReduce(lhs.get(), rhs.get(),
+                                     SEqualReducer(parent_, nullptr, map_free_vars));
+      } else {
+        PathTracingData tracing_data = {current_paths.value(), lhs, rhs, first_mismatch_};
+        return vtable_->SEqualReduce(lhs.get(), rhs.get(),
+                                     SEqualReducer(parent_, &tracing_data, map_free_vars));
+      }
     };
     return CheckResult(compute(), lhs, rhs, current_paths);
   }
 
  protected:
-  SEqualReducer GetReducer(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars,
-                           const Optional<ObjectPathPair>& current_paths) {
-    if (!IsPathTracingEnabled()) {
-      return SEqualReducer(parent_, nullptr, map_free_vars);
-    } else {
-      PathTracingData tracing_data = {current_paths.value(), lhs, rhs, first_mismatch_};
-      return SEqualReducer(parent_, &tracing_data, map_free_vars);
-    }
-  }
-
   // Check the result.
   bool CheckResult(bool result, const ObjectRef& lhs, const ObjectRef& rhs,
                    const Optional<ObjectPathPair>& current_paths) {

From 71f32ca4e8e6f33da55cd6c39c5019caadcdc78e Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Fri, 14 Oct 2022 20:24:50 -0700
Subject: [PATCH 358/704] [MetaSchedule][UX] Support Interactive Performance
 Table Printing in Notebook (#13006)

* Support interactive table printing.

* Rebase.

* Fix jupyter outputs.

* Fix CI.

* fix CI.

* Change file to filename.

* Address issues.
---
 include/tvm/meta_schedule/task_scheduler.h    |  4 +--
 python/tvm/meta_schedule/logging.py           | 24 +++++++------
 .../task_scheduler/task_scheduler.py          | 12 ++-----
 python/tvm/meta_schedule/utils.py             | 34 +++++++++++++++++-
 .../task_scheduler/gradient_based.cc          |  3 +-
 .../task_scheduler/task_scheduler.cc          | 29 ++++++++-------
 src/meta_schedule/utils.h                     | 36 +++++++++++--------
 7 files changed, 93 insertions(+), 49 deletions(-)

diff --git a/include/tvm/meta_schedule/task_scheduler.h b/include/tvm/meta_schedule/task_scheduler.h
index 17d82558fb82..f4fc491286dd 100644
--- a/include/tvm/meta_schedule/task_scheduler.h
+++ b/include/tvm/meta_schedule/task_scheduler.h
@@ -196,8 +196,8 @@ class TaskSchedulerNode : public runtime::Object {
    * \param task_id The task id to be checked.
    */
   void TouchTask(int task_id);
-  /*! \brief Returns a human-readable string of the tuning statistics. */
-  std::string TuningStatistics() const;
+  /*! \brief Print out a human-readable format of the tuning statistics. */
+  void PrintTuningStatistics();
 
   static constexpr const char* _type_key = "meta_schedule.TaskScheduler";
   TVM_DECLARE_BASE_OBJECT_INFO(TaskSchedulerNode, Object);
diff --git a/python/tvm/meta_schedule/logging.py b/python/tvm/meta_schedule/logging.py
index 9d673266a3f2..53353e3aa907 100644
--- a/python/tvm/meta_schedule/logging.py
+++ b/python/tvm/meta_schedule/logging.py
@@ -39,7 +39,7 @@ def get_logger(name: str) -> Logger:
     return logging.getLogger(name)
 
 
-def get_logging_func(logger: Logger) -> Optional[Callable[[int, str], None]]:
+def get_logging_func(logger: Logger) -> Optional[Callable[[int, str, int, str], None]]:
     """Get the logging function.
 
     Parameters
@@ -62,15 +62,15 @@ def get_logging_func(logger: Logger) -> Optional[Callable[[int, str], None]]:
         # logging.FATAL not included
     }
 
-    def logging_func(level: int, msg: str):
-        if level < 0:
+    def logging_func(level: int, filename: str, lineo: int, msg: str):
+        if level < 0:  # clear the output in notebook / console
             from IPython.display import (  # type: ignore # pylint: disable=import-outside-toplevel
                 clear_output,
             )
 
             clear_output(wait=True)
         else:
-            level2log[level](msg)
+            level2log[level](f"[{os.path.basename(filename)}:{lineo}] " + msg)
 
     return logging_func
 
@@ -94,12 +94,15 @@ def create_loggers(
     global_logger_name = "tvm.meta_schedule"
     global_logger = logging.getLogger(global_logger_name)
     if global_logger.level is logging.NOTSET:
-        global_logger.setLevel(logging.INFO)
+        global_logger.setLevel(logging.DEBUG)
+    console_logging_level = logging._levelToName[  # pylint: disable=protected-access
+        global_logger.level
+    ]
 
     config["loggers"].setdefault(
         global_logger_name,
         {
-            "level": logging._levelToName[global_logger.level],  # pylint: disable=protected-access
+            "level": logging.DEBUG,
             "handlers": [handler.get_name() for handler in global_logger.handlers]
             + [global_logger_name + ".console", global_logger_name + ".file"],
             "propagate": False,
@@ -108,7 +111,7 @@ def create_loggers(
     config["loggers"].setdefault(
         "{logger_name}",
         {
-            "level": "INFO",
+            "level": "DEBUG",
             "handlers": [
                 "{logger_name}.file",
             ],
@@ -121,6 +124,7 @@ def create_loggers(
             "class": "logging.StreamHandler",
             "stream": "ext://sys.stdout",
             "formatter": "tvm.meta_schedule.standard_formatter",
+            "level": console_logging_level,
         },
     )
     config["handlers"].setdefault(
@@ -129,7 +133,7 @@ def create_loggers(
             "class": "logging.FileHandler",
             "filename": "{log_dir}/" + __name__ + ".task_scheduler.log",
             "mode": "a",
-            "level": "INFO",
+            "level": "DEBUG",
             "formatter": "tvm.meta_schedule.standard_formatter",
         },
     )
@@ -139,14 +143,14 @@ def create_loggers(
             "class": "logging.FileHandler",
             "filename": "{log_dir}/{logger_name}.log",
             "mode": "a",
-            "level": "INFO",
+            "level": "DEBUG",
             "formatter": "tvm.meta_schedule.standard_formatter",
         },
     )
     config["formatters"].setdefault(
         "tvm.meta_schedule.standard_formatter",
         {
-            "format": "%(asctime)s.%(msecs)03d %(levelname)s %(message)s",
+            "format": "%(asctime)s [%(levelname)s] %(message)s",
             "datefmt": "%Y-%m-%d %H:%M:%S",
         },
     )
diff --git a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py
index f06f4d911fa8..d56d944474e9 100644
--- a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py
+++ b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py
@@ -163,15 +163,9 @@ def touch_task(self, task_id: int) -> None:
         """
         _ffi_api.TaskSchedulerTouchTask(self, task_id)  # type: ignore # pylint: disable=no-member
 
-    def tuning_statistics(self) -> str:
-        """Returns a human-readable string of the tuning statistics.
-
-        Returns
-        -------
-        tuning_statistics : str
-            The tuning statistics.
-        """
-        return _ffi_api.TaskSchedulerTuningStatistics(self)  # type: ignore # pylint: disable=no-member
+    def print_tuning_statistics(self) -> None:
+        """Print out a human-readable format of the tuning statistics."""
+        return _ffi_api.TaskSchedulerPrintTuningStatistics(self)  # type: ignore # pylint: disable=no-member
 
     @staticmethod
     def create(  # pylint: disable=keyword-arg-before-vararg
diff --git a/python/tvm/meta_schedule/utils.py b/python/tvm/meta_schedule/utils.py
index eb3c6437603c..401fdab08a26 100644
--- a/python/tvm/meta_schedule/utils.py
+++ b/python/tvm/meta_schedule/utils.py
@@ -188,13 +188,45 @@ def cpu_count(logical: bool = True) -> int:
 
 
 @register_func("meta_schedule.using_ipython")
-def _using_ipython():
+def _using_ipython() -> bool:
+    """Return whether the current process is running in an IPython shell.
+
+    Returns
+    -------
+    result : bool
+        Whether the current process is running in an IPython shell.
+    """
     try:
         return get_ipython().__class__.__name__ == "ZMQInteractiveShell"  # type: ignore
     except NameError:
         return False
 
 
+@register_func("meta_schedule.print_interactive_table")
+def print_interactive_table(data: str) -> None:
+    """Print the dataframe interactive table in notebook.
+
+    Parameters
+    ----------
+    data : str
+        The serialized performance table from MetaSchedule table printer.
+    """
+    import pandas as pd  # type: ignore # pylint: disable=import-outside-toplevel
+    from IPython.display import display  # type: ignore # pylint: disable=import-outside-toplevel
+
+    pd.set_option("display.max_rows", None)
+    pd.set_option("display.max_colwidth", None)
+    parsed = [
+        x.split("|")[1:] for x in list(filter(lambda x: set(x) != {"-"}, data.strip().split("\n")))
+    ]
+    display(
+        pd.DataFrame(
+            parsed[1:],
+            columns=parsed[0],
+        )
+    )
+
+
 def get_global_func_with_default_on_worker(
     name: Union[None, str, Callable],
     default: Callable,
diff --git a/src/meta_schedule/task_scheduler/gradient_based.cc b/src/meta_schedule/task_scheduler/gradient_based.cc
index bae52573a0f9..e0470337b536 100644
--- a/src/meta_schedule/task_scheduler/gradient_based.cc
+++ b/src/meta_schedule/task_scheduler/gradient_based.cc
@@ -60,7 +60,8 @@ class GradientBasedNode final : public TaskSchedulerNode {
     int n_tasks = this->tasks_.size();
     // Step 1. Check if it's in round robin mode.
     if (round_robin_rounds_ == 0) {
-      TVM_PY_LOG(INFO, this->logger) << "\n" << this->TuningStatistics();
+      TVM_PY_LOG_CLEAR_SCREEN(this->logger);
+      this->PrintTuningStatistics();
     }
     if (round_robin_rounds_ < n_tasks) {
       return round_robin_rounds_++;
diff --git a/src/meta_schedule/task_scheduler/task_scheduler.cc b/src/meta_schedule/task_scheduler/task_scheduler.cc
index 21efde26d993..69a70f63c5c0 100644
--- a/src/meta_schedule/task_scheduler/task_scheduler.cc
+++ b/src/meta_schedule/task_scheduler/task_scheduler.cc
@@ -232,9 +232,8 @@ Array<RunnerResult> TaskSchedulerNode::JoinRunningTask(int task_id) {
   }
   TaskCleanUp(task, task_id, results);
   TVM_PY_LOG_CLEAR_SCREEN(this->logger);
-  TVM_PY_LOG(INFO, this->logger) << "[Updated] Task #" << task_id << ": " << task->ctx->task_name
-                                 << "\n"
-                                 << this->TuningStatistics();
+  TVM_PY_LOG(INFO, this->logger) << "[Updated] Task #" << task_id << ": " << task->ctx->task_name;
+  this->PrintTuningStatistics();
   return results;
 }
 
@@ -257,12 +256,11 @@ void TaskSchedulerNode::TerminateTask(int task_id) {
   --this->remaining_tasks_;
   TVM_PY_LOG_CLEAR_SCREEN(this->logger);
   TVM_PY_LOG(INFO, this->logger) << "Task #" << task_id
-                                 << " has finished. Remaining task(s): " << this->remaining_tasks_
-                                 << "\n"
-                                 << this->TuningStatistics();
+                                 << " has finished. Remaining task(s): " << this->remaining_tasks_;
+  this->PrintTuningStatistics();
 }
 
-std::string TaskSchedulerNode::TuningStatistics() const {
+void TaskSchedulerNode::PrintTuningStatistics() {
   std::ostringstream os;
   int n_tasks = this->tasks_.size();
   int total_trials = 0;
@@ -307,11 +305,18 @@ std::string TaskSchedulerNode::TuningStatistics() const {
     }
   }
   p.Separator();
-  os << p.AsStr()                                  //
-     << "\nTotal trials: " << total_trials         //
+
+  os << "\nTotal trials: " << total_trials         //
      << "\nTotal latency (us): " << total_latency  //
      << "\n";
-  return os.str();
+
+  if (using_ipython()) {
+    print_interactive_table(p.AsStr());
+    std::cout << os.str() << std::endl << std::flush;
+    TVM_PY_LOG(DEBUG, this->logger) << "\n" << p.AsStr() << os.str();
+  } else {
+    TVM_PY_LOG(INFO, this->logger) << "\n" << p.AsStr() << os.str();
+  }
 }
 
 TaskScheduler TaskScheduler::PyTaskScheduler(
@@ -369,8 +374,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerTerminateTask")
     .set_body_method<TaskScheduler>(&TaskSchedulerNode::TerminateTask);
 TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerTouchTask")
     .set_body_method<TaskScheduler>(&TaskSchedulerNode::TouchTask);
-TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerTuningStatistics")
-    .set_body_method<TaskScheduler>(&TaskSchedulerNode::TuningStatistics);
+TVM_REGISTER_GLOBAL("meta_schedule.TaskSchedulerPrintTuningStatistics")
+    .set_body_method<TaskScheduler>(&TaskSchedulerNode::PrintTuningStatistics);
 
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index 41d8ffde558c..b14717f4b29e 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -82,36 +82,32 @@ class PyLogMessage {
     // FATAL not included
   };
 
-  explicit PyLogMessage(const char* file, int lineno, PackedFunc logger, Level logging_level)
-      : file_(file), lineno_(lineno), logger_(logger), logging_level_(logging_level) {
-    if (this->logger_ != nullptr) {
-      stream_ << "" << file_ << ":" << lineno_ << " ";
-    }
-  }
+  explicit PyLogMessage(const char* filename, int lineno, PackedFunc logger, Level logging_level)
+      : filename_(filename), lineno_(lineno), logger_(logger), logging_level_(logging_level) {}
 
   TVM_NO_INLINE ~PyLogMessage() {
     ICHECK(logging_level_ != Level::CLEAR)
         << "Cannot use CLEAR as logging level in TVM_PY_LOG, please use TVM_PY_LOG_CLEAR_SCREEN.";
     if (this->logger_ != nullptr) {
-      logger_(static_cast<int>(logging_level_), stream_.str());
+      logger_(static_cast<int>(logging_level_), std::string(filename_), lineno_, stream_.str());
     } else {
       if (logging_level_ == Level::INFO) {
-        runtime::detail::LogMessage(file_, lineno_).stream() << stream_.str();
+        runtime::detail::LogMessage(filename_, lineno_).stream() << stream_.str();
       } else if (logging_level_ == Level::WARNING) {
-        runtime::detail::LogMessage(file_, lineno_).stream() << "Warning: " << stream_.str();
+        runtime::detail::LogMessage(filename_, lineno_).stream() << "Warning: " << stream_.str();
       } else if (logging_level_ == Level::ERROR) {
-        runtime::detail::LogMessage(file_, lineno_).stream() << "Error: " << stream_.str();
+        runtime::detail::LogMessage(filename_, lineno_).stream() << "Error: " << stream_.str();
       } else if (logging_level_ == Level::DEBUG) {
-        runtime::detail::LogMessage(file_, lineno_).stream() << "Debug: " << stream_.str();
+        runtime::detail::LogMessage(filename_, lineno_).stream() << "Debug: " << stream_.str();
       } else {
-        runtime::detail::LogFatal(file_, lineno_).stream() << stream_.str();
+        runtime::detail::LogFatal(filename_, lineno_).stream() << stream_.str();
       }
     }
   }
   std::ostringstream& stream() { return stream_; }
 
  private:
-  const char* file_;
+  const char* filename_;
   int lineno_;
   std::ostringstream stream_;
   PackedFunc logger_;
@@ -131,6 +127,18 @@ inline bool using_ipython() {
   return flag;
 }
 
+/*!
+ * \brief Print out the performance table interactively in jupyter notebook.
+ * \param str The serialized performance table.
+ */
+inline void print_interactive_table(const String& data) {
+  const auto* f_print_interactive_table =
+      runtime::Registry::Get("meta_schedule.print_interactive_table");
+  ICHECK(f_print_interactive_table->defined())
+      << "Cannot find print_interactive_table function in registry.";
+  (*f_print_interactive_table)(data);
+}
+
 /*!
  * \brief A helper function to clear logging output for ipython kernel and console.
  * \param file The file name.
@@ -139,7 +147,7 @@ inline bool using_ipython() {
  */
 inline void clear_logging(const char* file, int lineno, PackedFunc logging_func) {
   if (logging_func.defined() && using_ipython()) {
-    logging_func(static_cast<int>(PyLogMessage::Level::CLEAR), "");
+    logging_func(static_cast<int>(PyLogMessage::Level::CLEAR), file, lineno, "");
   } else {
     // this would clear all logging output in the console
     runtime::detail::LogMessage(file, lineno).stream() << "\033c\033[3J\033[2J\033[0m\033[H";

From ec5c692148b77617105c6d18193c099429fcf42b Mon Sep 17 00:00:00 2001
From: Scott K Logan <logans@cottsay.net>
Date: Fri, 14 Oct 2022 21:21:18 -0700
Subject: [PATCH 359/704] Add include directory for OpenBLAS on RedHat (#13087)

RedHat distributions like Fedora and RHEL package the `cblas.h` header in /usr/include/openblas/cblas.h. Because `cblas.h` is included without the `openblas` directory, it is necessary to ensure that `/usr/include/openblas` is added to the include directory search paths.
---
 cmake/modules/contrib/BLAS.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cmake/modules/contrib/BLAS.cmake b/cmake/modules/contrib/BLAS.cmake
index 4840aaa0690d..542effb50463 100644
--- a/cmake/modules/contrib/BLAS.cmake
+++ b/cmake/modules/contrib/BLAS.cmake
@@ -20,6 +20,11 @@ if(USE_BLAS STREQUAL "openblas")
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
   list(APPEND RUNTIME_SRCS src/runtime/contrib/cblas/cblas.cc)
   message(STATUS "Using BLAS library " ${BLAS_LIBRARY})
+  find_path(BLAS_INCLUDE_DIR cblas.h PATH_SUFFIXES openblas)
+  if(BLAS_INCLUDE_DIR)
+    message(STATUS "Using BLAS header in " ${BLAS_INCLUDE_DIR})
+    include_directories(SYSTEM ${BLAS_INCLUDE_DIR})
+  endif()
 elseif(USE_BLAS STREQUAL "atlas" OR USE_BLAS STREQUAL "blas")
   find_library(BLAS_LIBRARY cblas)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})

From 5e862d4e41712d664883c4319464fdd45e10e52c Mon Sep 17 00:00:00 2001
From: Oleksandr Viazlo <oleksandr.viazlo@axelera.ai>
Date: Sat, 15 Oct 2022 08:21:00 +0200
Subject: [PATCH 360/704] [Frontend][PyTorch]Fix keywords to canonicalize scale
 and zero point access for FX-quantized graphs (#13071)

fix keywords to canonicalize scale and zero point access for FX-quantized graphs
---
 python/tvm/relay/frontend/qnn_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py
index 45cb8dedfd53..a4eb56c1048a 100644
--- a/python/tvm/relay/frontend/qnn_torch.py
+++ b/python/tvm/relay/frontend/qnn_torch.py
@@ -567,14 +567,14 @@ def get_full_attr_name(current):
     for node in graph.findAllNodes("prim::GetAttr", recurse=True):
         out_name = node.output().debugName()
 
-        if "_input_scale" in out_name or "_input_zero_point" in out_name:
+        if "_scale" in out_name or "_zero_point" in out_name:
             full_attr = get_full_attr_name(node)
             assert full_attr in params, "%s not found in param dict." % full_attr
             param_np = params[full_attr].numpy()
             new_const_node = graph.create("prim::Constant")
             new_const_node.insertBefore(node)
 
-            if "_input_scale" in out_name:
+            if "_scale" in out_name:
                 new_const_node.f_("value", param_np)
                 new_const_node.output().setType(torch._C.FloatType.get())
             else:

From 9299a29fd339b6b728487b32c0526f02a0181e3b Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Sat, 15 Oct 2022 19:45:46 -0700
Subject: [PATCH 361/704] [TVMScript] Evaluator, core parser, var table
 (#13088)

* [TVMScript] Evaluator, core parser, var table

This PR introduces Evaluator, core parser, var table as part of #12442

Co-authored-by: yongwww <yongcale@gmail.com>

* add module doc string

* apply code review suggestion

Co-authored-by: yongwww <yongcale@gmail.com>
---
 python/tvm/script/_parser/core/__init__.py    |   2 +-
 python/tvm/script/_parser/core/dispatch.py    | 156 +++++
 python/tvm/script/_parser/core/entry.py       |  48 ++
 python/tvm/script/_parser/core/evaluator.py   | 509 ++++++++++++++
 python/tvm/script/_parser/core/parser.py      | 647 ++++++++++++++++++
 .../test_tvmscript_parser_evaluator.py        |  63 ++
 6 files changed, 1424 insertions(+), 1 deletion(-)
 create mode 100644 python/tvm/script/_parser/core/dispatch.py
 create mode 100644 python/tvm/script/_parser/core/entry.py
 create mode 100644 python/tvm/script/_parser/core/evaluator.py
 create mode 100644 python/tvm/script/_parser/core/parser.py
 create mode 100644 tests/python/unittest/test_tvmscript_parser_evaluator.py

diff --git a/python/tvm/script/_parser/core/__init__.py b/python/tvm/script/_parser/core/__init__.py
index ae1521006d9b..94d8dab0322d 100644
--- a/python/tvm/script/_parser/core/__init__.py
+++ b/python/tvm/script/_parser/core/__init__.py
@@ -15,4 +15,4 @@
 # specific language governing permissions and limitations
 # under the License.
 """The core parser infra"""
-from . import diagnostics, doc, doc_core, utils
+from . import diagnostics, dispatch, doc, doc_core, entry, evaluator, parser, utils
diff --git a/python/tvm/script/_parser/core/dispatch.py b/python/tvm/script/_parser/core/dispatch.py
new file mode 100644
index 000000000000..f803be05de92
--- /dev/null
+++ b/python/tvm/script/_parser/core/dispatch.py
@@ -0,0 +1,156 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Parser dispatching infrastructure"""
+
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Type
+
+from .doc import AST
+
+if TYPE_CHECKING:
+    from .parser import Parser
+
+
+ParseMethod = Callable[["Parser", AST], None]
+ParseVTable: Dict[Tuple[str, str], ParseMethod] = {}
+
+OpMethod = Callable[..., Any]
+OpVTable: Dict[Tuple[Type, AST, int], OpMethod] = {}
+
+
+def register(token: str, type_name: str):
+    """Register a method for a dispatch token and type name.
+
+    Parameters
+    ----------
+    token : str
+        The token for IR, e.g., T for TIR and R for Relax.
+
+    type_name : str
+        The type name of AST node, e.g., FunctionDef, With, For.
+
+    Returns
+    -------
+    func : callable
+        The function to register dispatched method of parsing
+        corresponding token and AST node type.
+    """
+
+    def func(method: ParseMethod):
+        """Register a method in parser virtual table.
+
+        Parameters
+        ----------
+        method : ParseMethod
+            The dispatched method to be registered in parser virtual table.
+        """
+        ParseVTable[(token, type_name)] = method
+
+    return func
+
+
+def get(
+    token: str,
+    type_name: str,
+    default: Optional[ParseMethod] = None,
+) -> Optional[ParseMethod]:
+    """Get a registered method for a dispatch token and type name,
+    or return a default method if no registered methods with this dispatch token and type name.
+
+    Parameters
+    ----------
+    token : str
+        The token for IR, e.g., T for TIR and R for Relax.
+
+    type_name : str
+        The type name of AST node, e.g., FunctionDef, With, For.
+
+    default : Optional[ParseMethod]
+        The default method when no registered methods with this dispatch token and type name.
+
+    Returns
+    -------
+    func : Optional[ParseMethod]
+        The dispatched method of parsing corresponding token and AST node type.
+    """
+    return ParseVTable.get((token, type_name), default)
+
+
+def register_op(operand_type: Type, op_node_type: AST, operand_index: int):
+    """Register a method for a operand type, AST operator node and operand index.
+
+    Parameters
+    ----------
+    operand_type : Type
+        The type of operands, e.g., tir.PrimExpr, tir.IterVar.
+
+    op_node_type : AST
+        The doc AST operator node type, e.g., doc.Add, doc.Eq.
+
+    operand_index : int
+        The operand index, i.e., 0 for left operand and 1 for right operand.
+
+    Returns
+    -------
+    func : callable
+        The function to register dispatched method of parsing
+        corresponding a operand type, AST operator node and operand index.
+    """
+
+    def func(method: OpMethod):
+        """Register a method in parser operator virtual table.
+
+        Parameters
+        ----------
+        method : ParseMethod
+            The dispatched method to be registered in parser operator virtual table.
+        """
+        OpVTable[(operand_type, op_node_type, operand_index)] = method
+
+    return func
+
+
+def get_op(
+    operand_type: Type,
+    op_node_type: Type,
+    operand_index: int,
+    default: Optional[OpMethod] = None,
+) -> Optional[OpMethod]:
+    """Register a method for a operand type, AST operator node and operand index.
+
+    Parameters
+    ----------
+    operand_type : Type
+        The type of operands, e.g., tir.PrimExpr, tir.IterVar.
+
+    op_node_type : AST
+        The doc AST operator node type, e.g., doc.Add, doc.Eq.
+
+    operand_index : int
+        The operand index, i.e., 0 for left operand and 1 for right operand.
+
+
+    default : Optional[OpMethod]
+        The default method when no registered methods with this operand type,
+        AST operator node and operand index.
+
+    Returns
+    -------
+    func : Optional[OpMethod]
+        The function to register dispatched method of parsing
+        corresponding a operand type, AST operator node and operand index.
+    """
+    return OpVTable.get((operand_type, op_node_type, operand_index), default)
diff --git a/python/tvm/script/_parser/core/entry.py b/python/tvm/script/_parser/core/entry.py
new file mode 100644
index 000000000000..a0974c8fd419
--- /dev/null
+++ b/python/tvm/script/_parser/core/entry.py
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The entry point of TVM parser."""
+
+from typing import Any, Dict, Union
+
+from ...ir_builder import IRBuilder
+from . import doc
+from .diagnostics import Source
+from .parser import Parser
+
+
+def parse(program: Union[doc.AST, Any, str], extra_vars: Dict[str, Any] = None) -> Any:
+    """Register a method for a operand type, AST operator node and operand index.
+
+    Parameters
+    ----------
+    program : Union[doc.AST, Any, str]
+        The TVMScript code to parse.
+
+    extra_vars : Dict[str, Any]
+        The extra variable table for parsing.
+
+    Returns
+    -------
+    func : Any
+        The parsed TVMScript program.
+    """
+
+    source = Source(program)
+    parser = Parser(source)
+    with IRBuilder() as builder:
+        parser.parse(extra_vars=extra_vars)
+    return builder.get()
diff --git a/python/tvm/script/_parser/core/evaluator.py b/python/tvm/script/_parser/core/evaluator.py
new file mode 100644
index 000000000000..3a72a3c33106
--- /dev/null
+++ b/python/tvm/script/_parser/core/evaluator.py
@@ -0,0 +1,509 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""AST Evaluation"""
+
+import ast
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type, Union
+
+from . import dispatch, doc
+
+if TYPE_CHECKING:
+    from .parser import Parser
+
+DEFAULT_OP: Dict[Type, Callable[..., Any]] = {
+    doc.Add: lambda a, b: a + b,
+    doc.Sub: lambda a, b: a - b,
+    doc.Mult: lambda a, b: a * b,
+    doc.Div: lambda a, b: a / b,
+    doc.FloorDiv: lambda a, b: a // b,
+    doc.Mod: lambda a, b: a % b,
+    doc.LShift: lambda a, b: a << b,
+    doc.RShift: lambda a, b: a >> b,
+    doc.BitOr: lambda a, b: a | b,
+    doc.BitXor: lambda a, b: a ^ b,
+    doc.BitAnd: lambda a, b: a & b,
+    doc.MatMult: lambda a, b: a @ b,
+    doc.Pow: lambda a, b: a**b,
+    doc.Eq: lambda a, b: a == b,
+    doc.NotEq: lambda a, b: a != b,
+    doc.Lt: lambda a, b: a < b,
+    doc.LtE: lambda a, b: a <= b,
+    doc.Gt: lambda a, b: a > b,
+    doc.GtE: lambda a, b: a >= b,
+    doc.Is: lambda a, b: a is b,
+    doc.IsNot: lambda a, b: a is not b,
+    doc.In: lambda a, b: a in b,
+    doc.NotIn: lambda a, b: a not in b,
+    doc.And: lambda a, b: a and b,
+    doc.Or: lambda a, b: a or b,
+    doc.Invert: lambda a: ~a,
+    doc.Not: lambda a: not a,
+    doc.UAdd: lambda a: +a,
+    doc.USub: lambda a: -a,
+}
+
+
+class ExprEvaluator:
+    """Expression evaluator for TVMScript parser.
+
+    Parameters
+    ----------
+    parser : Parser
+        The parser bound with the evaluator.
+
+    value_table : Dict[str, Any]
+        The value table for expression evaluation.
+
+    new_value_count : int
+        The count for ntermediate result added during evaluation.
+    """
+
+    parser: "Parser"
+    value_table: Dict[str, Any]
+    new_value_count: int
+
+    def __init__(self, parser: "Parser", value_table: Dict[str, Any]) -> None:
+        super().__init__()
+        self.parser = parser
+        self.value_table = value_table
+        self.new_value_count = 0
+
+    @staticmethod
+    def eval(parser: "Parser", value_table: Dict[str, Any], node: doc.AST) -> Any:
+        """Expression evaluation for TVMScript parser.
+
+        Parameters
+        ----------
+        parser : Parser
+            The parser bound with the evaluator.
+
+        value_table : Dict[str, Any]
+            The value table for expression evaluation.
+
+        node : doc.AST
+            The root node of AST tree node of expression to evaluate.
+
+        Returns
+        -------
+        res : Any
+            The evaluation result.
+        """
+        self = ExprEvaluator(parser, value_table)
+        result = self._visit(node)  # pylint: disable=protected-access
+        if isinstance(result, doc.Name):
+            if result.id not in self.value_table:
+                self.parser.report_error(result, f"Undefined variable: {result.id}")
+            return self.value_table[result.id]
+        if isinstance(result, doc.Constant):
+            return result.value
+        raise TypeError(f"Unexpected result type: {type(result)}")
+
+    def _add_intermediate_result(self, value: Any) -> doc.Name:
+        """Add intermediate result during evaluation into value table.
+
+        Parameters
+        ----------
+        value : Any
+            The intermediate result.
+
+        Returns
+        -------
+        name : doc.Name
+            The doc AST name node with intermediate name for intermediate result.
+        """
+        name = f"__tvm_tmp_value_{self.new_value_count}"
+        self.new_value_count += 1
+        self.value_table[name] = value
+        lineno = 0
+        col_offset = 0
+        return doc.Name(
+            id=name,
+            ctx=doc.Load(
+                lineno=lineno,
+                col_offset=col_offset,
+                end_lineno=None,
+                end_col_offset=None,
+            ),
+            lineno=lineno,
+            col_offset=col_offset,
+            end_lineno=None,
+            end_col_offset=None,
+        )
+
+    def _visit(self, node: doc.AST) -> Any:
+        """General doc AST node visiting method for expression evaluation.
+
+        Parameters
+        ----------
+        node : doc.AST
+            The root node of AST tree node of expression to evaluate.
+
+        Returns
+        -------
+        res : Any
+            The evaluation result.
+        """
+        if isinstance(node, list):
+            return [self._visit(n) for n in node]
+        if isinstance(node, tuple):
+            return tuple(self._visit(n) for n in node)
+        assert isinstance(node, doc.AST)
+        if isinstance(node, doc.Name):
+            if node.id not in self.value_table:
+                self.parser.report_error(node, f"Undefined variable: {node.id}")
+            return node
+        if isinstance(
+            node,
+            (
+                doc.Constant,
+                doc.expr_context,
+                doc.operator,
+                doc.boolop,
+                doc.unaryop,
+                doc.cmpop,
+            ),
+        ):
+            return node
+        if not isinstance(node, (doc.expr, doc.slice)):
+            return node
+        if isinstance(node, doc.Lambda):
+            return self._eval_lambda(node)
+        fields = {}
+        for field in node.__class__._FIELDS:  # pylint: disable=protected-access
+            attr = getattr(node, field)
+            if isinstance(attr, (doc.AST, tuple, list)):
+                fields[field] = self._visit(attr)
+            else:
+                fields[field] = attr
+        try:
+            if isinstance(node, doc.BoolOp):
+                value = self._eval_bool_op(fields)
+            elif isinstance(node, doc.Compare):
+                value = self._eval_compare(fields)
+            elif isinstance(node, doc.UnaryOp):
+                value = self._eval_unary_op(fields)
+            elif isinstance(node, doc.BinOp):
+                value = self._eval_bin_op(fields)
+            elif isinstance(node, doc.Slice):
+                value = self._eval_slice(fields)
+            else:
+                value = self._eval_expr(node.__class__(**fields))
+        except Exception as e:  # pylint: disable=broad-except,invalid-name
+            self.parser.report_error(node, str(e))
+        return self._add_intermediate_result(value)
+
+    def _eval_lambda(self, node: doc.Lambda) -> Any:
+        """The doc AST lambda node evaluating method.
+
+        Parameters
+        ----------
+        node : doc.Lambda
+            The root node of AST tree node of expression to evaluate.
+
+        Returns
+        -------
+        res : Any
+            The evaluation result.
+        """
+        try:
+            value = self._eval_expr(node)
+        except Exception as e:  # pylint: disable=broad-except,invalid-name
+            self.parser.report_error(node, str(e))
+        return self._add_intermediate_result(value)
+
+    def _eval_bool_op(self, fields: Dict[str, Any]) -> Any:
+        """The doc AST boolean operator node evaluating method.
+
+        Parameters
+        ----------
+        fields : Dict[str, Any]
+            The dictionary of boolean operation information,
+            e.g., operator types, operand values.
+
+        Returns
+        -------
+        res : Any
+            The evaluation result.
+        """
+        op = fields["op"]
+        if not isinstance(op, (doc.And, doc.Or)):
+            raise TypeError(f"Unexpected operator: {op}")
+        value = self._eval_expr(fields["values"][0])
+        for rhs in fields["values"][1:]:
+            value = _eval_op(op, values=[value, self._eval_expr(rhs)])
+        return value
+
+    def _eval_compare(self, fields: Dict[str, Any]) -> Any:
+        """The doc AST comparison operation node evaluating method.
+
+        Parameters
+        ----------
+        fields : Dict[str, Any]
+            The dictionary of comparison operation information,
+            e.g., operator types, operand values.
+
+        Returns
+        -------
+        res : Any
+            The evaluation result.
+        """
+        value = self._eval_expr(fields["left"])
+        for op, rhs in zip(fields["ops"], fields["comparators"]):
+            value = _eval_op(op, values=[value, self._eval_expr(rhs)])
+        return value
+
+    def _eval_unary_op(self, fields: Dict[str, Any]) -> Any:
+        """The doc AST unary operation node evaluating method.
+
+        Parameters
+        ----------
+        fields : Dict[str, Any]
+            The dictionary of unary operation information,
+            e.g., operator types, operand values.
+
+        Returns
+        -------
+        res : Any
+            The evaluation result.
+        """
+        value = self._eval_expr(fields["operand"])
+        value = _eval_op(fields["op"], values=[value])
+        return value
+
+    def _eval_bin_op(self, fields: Dict[str, Any]) -> Any:
+        """The doc AST binary operation node evaluating method.
+
+        Parameters
+        ----------
+        fields : Dict[str, Any]
+            The dictionary of binary operation information,
+            e.g., operator types, operand values.
+
+        Returns
+        -------
+        res : Any
+            The evaluation result.
+        """
+        return _eval_op(
+            fields["op"],
+            values=[
+                self._eval_expr(fields["left"]),
+                self._eval_expr(fields["right"]),
+            ],
+        )
+
+    def _eval_slice(self, fields: Dict[str, Any]) -> slice:
+        """The doc AST slice node evaluating method.
+
+        Parameters
+        ----------
+        fields : Dict[str, Any]
+            The dictionary of slice information,
+            e.g., lower bound, upper bound, step.
+
+        Returns
+        -------
+        res : slice
+            The evaluation result.
+        """
+        lower, upper, step = fields["lower"], fields["upper"], fields["step"]
+
+        lower = self._eval_expr(lower) if lower is not None else None
+        upper = self._eval_expr(upper) if upper is not None else None
+        step = self._eval_expr(step) if step is not None else None
+
+        return slice(lower, upper, step)
+
+    def _eval_expr(self, v: Any) -> Any:
+        """The doc AST expression node evaluating method.
+
+        Parameters
+        ----------
+        v : Any
+            The root node of AST tree node of expression to evaluate.
+
+        Returns
+        -------
+        res : Any
+            The evaluation result.
+        """
+        return _eval_expr(v, self.value_table)
+
+
+def eval_expr(
+    parser: "Parser",
+    node: Union[doc.expr, doc.Expression],
+    dict_globals: Optional[Dict[str, Any]],
+) -> Any:
+    """Expression evaluation for TVMScript parser.
+
+    Parameters
+    ----------
+    parser : Parser
+        The parser bound with the evaluator.
+
+    node : Union[doc.expr, doc.Expression]
+        The root node of AST tree node of expression to evaluate.
+
+    dict_globals : Optional[Dict[str, Any]]
+        The optional global value table for expression evaluation.
+
+    Returns
+    -------
+    res : Any
+        The evaluation result.
+    """
+    value_table = {}
+    if dict_globals is not None:
+        value_table.update(dict_globals)
+    return ExprEvaluator.eval(parser, value_table, node)
+
+
+def eval_assign(
+    parser: "Parser",
+    target: doc.expr,
+    source: Any,
+) -> Dict[str, Any]:
+    """Expression assignment evaluation for TVMScript parser.
+
+    Parameters
+    ----------
+    parser : Parser
+        The parser bound with the evaluator.
+
+    target : doc.expr
+        The root node of AST tree node of assigned expression to evaluate.
+
+    source : Any
+        The source to be assigned with evaluated expression.
+
+    Returns
+    -------
+    res : Any
+        The evaluation result.
+    """
+    try:
+        return _eval_assign(target, source)
+    except Exception as e:  # pylint: disable=broad-except,invalid-name
+        parser.report_error(target, f"Failed to evaluate assignment: {str(e)}")
+        raise
+
+
+def _eval_expr(
+    node: Union[doc.expr, doc.Expression],
+    dict_globals: Optional[Dict[str, Any]],
+) -> Any:
+    """Expression evaluation implementation for TVMScript parser.
+
+    Parameters
+    ----------
+    node : Union[doc.expr, doc.Expression]
+        The root node of AST tree node of expression to evaluate.
+
+    dict_globals : Optional[Dict[str, Any]]
+        The optional global value table for expression evaluation.
+
+    Returns
+    -------
+    res : Any
+        The evaluation result.
+    """
+    node = doc.from_doc(node)
+    if isinstance(node, ast.expr):
+        node = ast.Expression(body=node)
+    assert isinstance(node, ast.Expression), "Expects an ast.Expression, but gets: " + str(node)
+    if dict_globals is None:
+        dict_globals = {}
+    node = ast.fix_missing_locations(node)
+    exe = compile(node, filename="<ast>", mode="eval")
+    return eval(exe, dict_globals)  # pylint: disable=eval-used
+
+
+def _eval_op(
+    op: doc.AST,
+    values: List[Any],
+):
+    """Operation expression evaluation implementation for TVMScript parser.
+
+    Parameters
+    ----------
+    op : doc.AST
+        The root node of AST tree node of operation expression to evaluate.
+
+    values : List[Any]
+        The list of values of operands.
+
+    Returns
+    -------
+    res : Any
+        The evaluation result.
+    """
+    op_type = type(op)  # pylint: disable=protected-access
+    for i, v in enumerate(values):
+        v_type = getattr(type(v), "_dispatch_type", None)
+        if v_type is None:
+            continue
+        f = dispatch.get_op(
+            operand_type=v_type, op_node_type=op_type, operand_index=i, default=None
+        )
+        if f is not None:
+            return f(*values)
+    return DEFAULT_OP[op_type](*values)
+
+
+def _eval_assign(
+    target: doc.expr,
+    source: Any,
+) -> Dict[str, Any]:
+    """Expression assignment evaluation implementation for TVMScript parser.
+
+    Parameters
+    ----------
+    target : doc.expr
+        The root node of AST tree node of assigned expression to evaluate.
+
+    source : Any
+        The source to be assigned with evaluated expression.
+
+    Returns
+    -------
+    res : Any
+        The evaluation result.
+    """
+    target = doc.from_doc(target)
+    assert isinstance(target, ast.expr)
+    RHS_VAR_NAME = "__tvm_rhs_var__"  # pylint: disable=invalid-name
+    rhs_var_name = RHS_VAR_NAME
+    dict_locals = {rhs_var_name: source}
+    mod = ast.fix_missing_locations(
+        ast.Module(
+            body=[
+                ast.Assign(
+                    targets=[target],
+                    value=ast.Name(
+                        id=rhs_var_name,
+                        ctx=ast.Load(),
+                    ),
+                )
+            ],
+            type_ignores=[],
+        )
+    )
+    exe = compile(mod, filename="<ast>", mode="exec")
+    exec(exe, {}, dict_locals)  # pylint: disable=exec-used
+    del dict_locals[rhs_var_name]
+    return dict_locals
diff --git a/python/tvm/script/_parser/core/parser.py b/python/tvm/script/_parser/core/parser.py
new file mode 100644
index 000000000000..daf95cb3cd1b
--- /dev/null
+++ b/python/tvm/script/_parser/core/parser.py
@@ -0,0 +1,647 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The core parser"""
+
+from collections import defaultdict
+from contextlib import contextmanager
+from typing import Any, Callable, Dict, List, Optional, Set, Union
+from tvm._ffi.base import TVMError
+
+from tvm.error import DiagnosticError
+
+from . import dispatch, doc
+from .diagnostics import Diagnostics, Source
+from .evaluator import eval_assign, eval_expr
+
+DEFAULT_VISIT = {
+    "Interactive",
+    "Module",
+    "Expression",
+    "Pass",
+}
+
+
+def _deferred(exit_f: Callable[[], None]):
+    """Created context with certain exit function.
+
+    Parameters
+    ----------
+    exit_f : Callable[[], None]
+        The function to call when exiting the context.
+
+    Returns
+    -------
+    res : Any
+        The created context.
+    """
+
+    @contextmanager
+    def context():
+        try:
+            yield
+        finally:
+            exit_f()
+
+    return context()
+
+
+class VarTableFrame:
+    """The variable table frame.
+    A frame of variable table stores the variables created in one block or scope.
+
+    Parameters
+    ----------
+    vars : Set[str]
+        The set of variable names in the variable table frame.
+    """
+
+    vars: Set[str]
+
+    def __init__(self):
+        self.vars = set()
+
+    def add(self, var: str):
+        """Add a new variable into variable table frame.
+
+        Parameters
+        ----------
+        var : str
+            The name of new variable.
+        """
+        if var in self.vars:
+            raise ValueError(f"Variable {var} already defined in current scope")
+        self.vars.add(var)
+
+    def pop_all(self, fn_pop: Callable[[str], None]):
+        """Pop out all variable in variable table frame.
+
+        Parameters
+        ----------
+        fn_pop : Callable[[str], None]
+            The methods to call when popping each variable.
+        """
+        for var in self.vars:
+            fn_pop(var)
+        self.vars.clear()
+
+
+class VarTable:
+    """The variable table.
+    A variable table stores the all variables when parsing TVMScript.
+
+    Parameters
+    ----------
+    frames : List[VarTableFrame]
+        The list or stack of variable table frame.
+
+    name2value : Dict[str, List[Any]]
+        The dictionary for variable table name-based query.
+    """
+
+    frames: List[VarTableFrame]
+    name2value: Dict[str, List[Any]]
+
+    def __init__(self):
+        self.frames = []
+        self.name2value = defaultdict(list)
+
+    def with_frame(self):
+        """Create a new variable table frame as with statement.
+
+        Returns
+        -------
+        res : Any
+            The context with new variable table frame.
+        """
+
+        def pop_frame():
+            frame = self.frames.pop()
+            frame.pop_all(lambda name: self.name2value[name].pop())
+
+        self.frames.append(VarTableFrame())
+        return _deferred(pop_frame)
+
+    def add(self, var: str, value: Any, allow_shadowing: bool = False):
+        """Add a new variable to variable table.
+
+        Parameters
+        ----------
+        var : str
+            The name of variable.
+
+        value : Any
+            The value of variable.
+
+        allow_shadowing : bool
+            The options of whether variable shadowing allwed for this variable.
+        """
+        # Skip if the key and value are equal to those in the var_table
+        if self.name2value[var] and self.name2value[var][-1] == value:
+            return
+        if allow_shadowing and var in self.frames[-1].vars:
+            # Shadowing
+            self.name2value[var][-1] = value
+        else:
+            self.frames[-1].add(var)
+            self.name2value[var].append(value)
+
+    def get(self) -> Dict[str, Any]:
+        """Get a variable dictionary of latest variables.
+
+        Returns
+        -------
+        res : Any
+            The variable dictionary copy of latest variables.
+        """
+        return {key: values[-1] for key, values in self.name2value.items() if values}
+
+    def exist(self, value: Any) -> bool:
+        """Check if any value exists in variable table.
+
+        Parameters
+        ----------
+        value : Any
+            The value of variable.
+
+        Returns
+        -------
+        res : bool
+            The existence of the value.
+        """
+        for v in self.name2value.values():
+            if v is value:
+                return True
+        return False
+
+
+def _dispatch_wrapper(func: dispatch.ParseMethod) -> dispatch.ParseMethod:
+    def _wrapper(self: "Parser", node: doc.AST) -> None:
+        try:
+            return func(self, node)
+        except DiagnosticError:
+            raise
+        except Exception as e:  # pylint: disable=broad-except,invalid-name
+            self.report_error(node, e)
+            raise
+
+    return _wrapper
+
+
+def _dispatch(self: "Parser", type_name: str) -> dispatch.ParseMethod:
+    for token in [self.dispatch_tokens[-1], "default"]:
+        func = dispatch.get(token=token, type_name=type_name, default=None)
+        if func is not None:
+            return _dispatch_wrapper(func)
+    return _dispatch_wrapper(lambda self, node: self.generic_visit(node))
+
+
+class Parser(doc.NodeVisitor):
+    """The TVMScript parser
+
+    Parameters
+    ----------
+    diag : Diagnostics
+        The diagnostics for error reporting.
+
+    dispatch_tokens : List[str]
+        The list of dispatching tokens to dispatching parsing method
+        of different IRs and different doc AST structure.
+
+    var_table : VarTable
+        The variable table for parsing.
+    """
+
+    diag: Diagnostics
+    dispatch_tokens: List[str]
+    var_table: VarTable
+
+    def __init__(self, source: Source) -> None:
+        self.diag = Diagnostics(source)
+        self.dispatch_tokens = ["default"]
+        self.var_table = VarTable()
+
+    def parse(self, extra_vars: Optional[Dict[str, Any]] = None) -> Any:
+        """The main parse method for parser.
+
+        Parameters
+        ----------
+        extra_vars : Optional[Dict[str, Any]]
+            The optional global value table for parsing.
+
+        Returns
+        -------
+        res : Any
+            The doc AST node visiting result.
+        """
+        if extra_vars is None:
+            extra_vars = {}
+        with self.var_table.with_frame():
+            for k, v in extra_vars.items():
+                self.var_table.add(k, v)
+            node = self.diag.source.as_ast()
+            self.visit(node)
+
+    def with_dispatch_token(self, token: str):
+        """Add a new dispatching token as with statement.
+
+        Parameters
+        ----------
+        token : str
+            The dispathing token.
+
+        Returns
+        -------
+        res : Any
+            The context with new dispatching token.
+        """
+
+        def pop_token():
+            self.dispatch_tokens.pop()
+
+        self.dispatch_tokens.append(token)
+        return _deferred(pop_token)
+
+    def eval_expr(
+        self,
+        node: Union[doc.Expression, doc.expr],
+        extra_vars: Optional[Dict[str, Any]] = None,
+    ) -> Any:
+        """Expression evaluation when parsing.
+
+        Parameters
+        ----------
+        node : Union[doc.expr, doc.Expression]
+            The root node of AST tree node of expression to evaluate.
+
+        extra_vars : Optional[Dict[str, Any]]
+            The optional global value table for expression evaluation.
+
+        Returns
+        -------
+        res : Any
+            The evaluation result.
+        """
+        var_values = self.var_table.get()
+        if extra_vars is not None:
+            for k, v in extra_vars.items():
+                var_values[k] = v
+        return eval_expr(self, node, var_values)
+
+    def _duplicate_lhs_check(self, target: doc.expr) -> Union[bool, Set[str]]:
+        """Check whether duplicate lhs exists in assignment.
+
+        Parameters
+        ----------
+        target : doc.expr
+            The doc AST expr node for lhs.
+
+        Returns
+        -------
+        res : Union[bool, Set[str]]
+            The result of true if duplicate lhs exists,
+            or the set of lhs names if no duplicate lhs exists.
+        """
+        if isinstance(target, (doc.Tuple, doc.List)):
+            vars: Set[str] = set()  # pylint: disable=redefined-builtin
+            for i in target.elts:
+                res = self._duplicate_lhs_check(i)
+                if isinstance(res, bool) and res:
+                    return True
+                assert isinstance(res, set)
+                if vars & res:
+                    return True
+                vars = vars.union(res)
+            return vars
+        elif isinstance(target, doc.Name):
+            return {target.id}
+        else:
+            self.report_error(target, "Invalid type in assign statement")
+            raise NotImplementedError
+
+    def eval_assign(
+        self,
+        target: doc.expr,
+        source: Any,
+        bind_value: Callable[["Parser", doc.expr, str, Any], Any],
+        allow_shadowing: bool = False,
+    ) -> Dict[str, Any]:
+        """Expression assignment evaluation when parsing.
+
+        Parameters
+        ----------
+        target : doc.expr
+            The root node of AST tree node of assigned expression to evaluate.
+
+        source : Any
+            The source to be assigned with evaluated expression.
+
+        bind_value : Callable[["Parser", doc.expr, str, Any], Any]
+            The value binding method when assigning the values to variables.
+
+        allow_shadowing : bool
+            The options of whether variable shadowing allwed for assignment.
+
+        Returns
+        -------
+        res : Dict[str, Any]
+            The dirctionary of assignment result.
+        """
+        if self._duplicate_lhs_check(target) is True:
+            self.report_error(target, "Duplicate vars assigned.")
+        var_values = eval_assign(self, target, source)
+        for k, v in var_values.items():
+            var = bind_value(self, target, k, v)
+            self.var_table.add(k, var, allow_shadowing)
+        return var_values
+
+    def report_error(
+        self, node: doc.AST, err: Union[Exception, str]
+    ) -> None:  # pylint: disable=no-self-use
+        """The error reporting when parsing.
+
+        Parameters
+        ----------
+        node : doc.AST
+            The doc AST node with errors.
+
+        err: Union[Exception, str]
+            The error to report.
+        """
+        # Only take the last line of the error message
+        if isinstance(err, TVMError):
+            msg = list(filter(None, str(err).split("\n")))[-1]
+        else:
+            msg = str(err)
+        self.diag.error(node, msg)
+
+    def visit(self, node: doc.AST) -> None:
+        """The general visiting method.
+
+        Parameters
+        ----------
+        node : doc.AST
+            The doc AST node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        if isinstance(node, (list, tuple)):
+            for item in node:
+                self.visit(item)
+            return
+        if not isinstance(node, doc.AST):
+            return
+        name = node.__class__.__name__.split(".")[-1]
+        if name in DEFAULT_VISIT:
+            func = self.generic_visit
+        else:
+            func = getattr(self, "visit_" + name, None)
+        if func is None:
+            raise NotImplementedError(f"Visitor of AST node is not implemented: {name}")
+        try:
+            func(node)
+        except DiagnosticError:
+            raise
+        except Exception as e:  # pylint: disable=broad-except,invalid-name
+            self.report_error(node, str(e))
+            raise
+
+    def visit_body(self, node: List[doc.stmt]) -> Any:
+        """The general body visiting method.
+
+        Parameters
+        ----------
+        node : List[doc.stmt]
+            The list of statements in body.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        for stmt in node:
+            self.visit(stmt)
+
+    def visit_tvm_annotation(self, node: doc.expr) -> Any:
+        """The general TVM annotation visiting method.
+
+        Parameters
+        ----------
+        node : doc.expr
+            The doc AST expr node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "tvm_annotation")(self, node)
+
+    def visit_FunctionDef(self, node: doc.FunctionDef) -> Any:  # pylint: disable=invalid-name
+        """The general function definition visiting method.
+
+        Parameters
+        ----------
+        node : doc.FunctionDef
+            The doc AST function definition node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        if not node.decorator_list:
+            self.report_error(node, "Function must be decorated")
+        # TODO: only the last decorator is parsed
+        decorator = self.eval_expr(node.decorator_list[-1])
+        if not hasattr(decorator, "dispatch_token"):
+            self.report_error(node, "The parser does not understand the decorator")
+        token = decorator.dispatch_token
+        func = dispatch.get(token=token, type_name="FunctionDef", default=None)
+        if func is None:
+            self.report_error(node, "The parser does not understand the decorator")
+        _dispatch_wrapper(func)(self, node)
+
+    def visit_ClassDef(self, node: doc.ClassDef) -> Any:  # pylint: disable=invalid-name
+        """The general class definition visiting method.
+
+        Parameters
+        ----------
+        node : doc.ClassDef
+            The doc AST class definition node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        func = dispatch.get(token="ir", type_name="ClassDef", default=None)
+        if func is None:
+            self.report_error(node, "The parser does not understand the decorator")
+        _dispatch_wrapper(func)(self, node)
+
+    def visit_arguments(self, node: doc.arguments) -> Any:
+        """The general arguments visiting method.
+
+        Parameters
+        ----------
+        node : doc.arguments
+            The doc AST arguments node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "arguments")(self, node)
+
+    def visit_For(self, node: doc.For) -> Any:  # pylint: disable=invalid-name
+        """The general for visiting method.
+
+        Parameters
+        ----------
+        node : doc.For
+            The doc AST for node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "For")(self, node)
+
+    def visit_While(self, node: doc.While) -> Any:  # pylint: disable=invalid-name
+        """The general while visiting method.
+
+        Parameters
+        ----------
+        node : doc.While
+            The doc AST while node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "While")(self, node)
+
+    def visit_With(self, node: doc.With) -> Any:  # pylint: disable=invalid-name
+        """The general with visiting method.
+
+        Parameters
+        ----------
+        node : doc.With
+            The doc AST with node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "With")(self, node)
+
+    def visit_Assign(self, node: doc.Assign) -> Any:  # pylint: disable=invalid-name
+        """The general assign visiting method.
+
+        Parameters
+        ----------
+        node : doc.Assign
+            The doc AST assign node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "Assign")(self, node)
+
+    def visit_Expr(self, node: doc.Expr) -> Any:  # pylint: disable=invalid-name
+        """The general expression visiting method.
+
+        Parameters
+        ----------
+        node : doc.Expr
+            The doc AST exprssion node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "Expr")(self, node)
+
+    def visit_If(self, node: doc.If) -> Any:  # pylint: disable=invalid-name
+        """The general if visiting method.
+
+        Parameters
+        ----------
+        node : doc.If
+            The doc AST if node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "If")(self, node)
+
+    def visit_AugAssign(self, node: doc.AugAssign) -> Any:  # pylint: disable=invalid-name
+        """The general augmented assignment visiting method.
+
+        Parameters
+        ----------
+        node : doc.AugAssign
+            The doc AST augmented assignment node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "AugAssign")(self, node)
+
+    def visit_Assert(self, node: doc.Assert) -> Any:  # pylint: disable=invalid-name
+        """The general assert visiting method.
+
+        Parameters
+        ----------
+        node : doc.Assert
+            The doc AST assert node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "Assert")(self, node)
+
+    def visit_Return(self, node: doc.Return) -> Any:  # pylint: disable=invalid-name
+        """The general return visiting method.
+
+        Parameters
+        ----------
+        node : doc.Return
+            The doc AST return node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "Return")(self, node)
diff --git a/tests/python/unittest/test_tvmscript_parser_evaluator.py b/tests/python/unittest/test_tvmscript_parser_evaluator.py
new file mode 100644
index 000000000000..4d6590306050
--- /dev/null
+++ b/tests/python/unittest/test_tvmscript_parser_evaluator.py
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Unittests for tvm.script.parser.evaluator"""
+import pytest
+import tvm.testing
+from tvm.script._parser.core.diagnostics import Source
+from tvm.script._parser.core.evaluator import ExprEvaluator
+
+
+def _calc(expr, extra_vars=None):
+    if extra_vars is None:
+        extra_vars = {}
+    source = Source(expr)
+    mod_ast = source.as_ast()
+    mod_body_ast = mod_ast.body
+    expr_stmt_ast = mod_body_ast[0]
+    expr_ast = expr_stmt_ast.value
+    return ExprEvaluator.eval(None, extra_vars, expr_ast)
+
+
+def test_evaluator_basic():
+    assert _calc("1, 3.14, True, 'str'") == (1, 3.14, True, "str")
+
+
+def test_evaluator_op():
+    assert _calc("1 + 2, 1 - 2, 1 * 2, 1 / 2") == (3, -1, 2, 0.5)
+
+
+def test_evaluator_value_table():
+    res = _calc("a + b, a - b, a * b, a / b", {"a": 1, "b": 2})
+    a, b = 1, 2
+    assert res == (a + b, a - b, a * b, a / b)
+
+
+def test_evaluator_func_call():
+    def func(a, b):
+        return a + b, a - b, a * b, a / b
+
+    assert _calc("func(1, 2)", {"func": func}) == func(1, 2)
+
+
+def test_evaluator_slice():
+    res = _calc("a, a[1:], a[:5], a[1: 5], a[1: 5: 2]", {"a": [1, 2, 3, 4, 5, 6]})
+    a = [1, 2, 3, 4, 5, 6]
+    assert res == (a, a[1:], a[:5], a[1:5], a[1:5:2])
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From cbca28da003cf4791a19f0c4f9e0c930ecee522d Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Mon, 17 Oct 2022 13:36:23 +0900
Subject: [PATCH 362/704] [MetaSchedule] Consolidate module hashing and
 equality testing (#13050)

Currently, MS uses `StructuralEqual/Hash` in task extraction / evo search / database. Sometimes, we want to use different hashing and equality testing methods, for example (1) to ignore NDArray (https://github.com/apache/tvm/pull/12706) or (2) to enable anchor-op only tuning (identify `conv2d` and `conv2d -> add` subgraphs as equal).

To enable such flexibility, this PR consolidate raw calls to `StructuralEqual/Hash` into one place, which for now is named `ModuleEquality`. Since hashing is also done for equality testing, I think it is appropriate to call the component responsible for hashing / equality test that way. But other suggestions are welcome.

Importantly, task extraction and database are now using the same hashing / equal method based on TIR mod, while previously task extraction was using a cache key-ed on relay mod.
---
 include/tvm/meta_schedule/database.h          | 59 ++++++++++++---
 python/tvm/meta_schedule/database/database.py |  2 +
 .../meta_schedule/database/json_database.py   |  8 +-
 .../meta_schedule/database/memory_database.py | 18 ++++-
 .../database/schedule_fn_database.py          | 15 +++-
 python/tvm/meta_schedule/relay_integration.py | 15 +++-
 python/tvm/meta_schedule/tune.py              |  9 ++-
 src/meta_schedule/database/database.cc        | 20 +++--
 src/meta_schedule/database/json_database.cc   | 22 ++++--
 src/meta_schedule/database/memory_database.cc | 13 ++--
 .../database/schedule_fn_database.cc          |  7 +-
 src/meta_schedule/module_equality.cc          | 45 ++++++++++++
 src/meta_schedule/module_equality.h           | 73 +++++++++++++++++++
 .../search_strategy/evolutionary_search.cc    | 33 +++++++--
 src/relay/backend/task_extraction.cc          | 34 ++++++---
 15 files changed, 318 insertions(+), 55 deletions(-)
 create mode 100644 src/meta_schedule/module_equality.cc
 create mode 100644 src/meta_schedule/module_equality.h

diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h
index 4092fdae36dd..bcdffe9ff33b 100644
--- a/include/tvm/meta_schedule/database.h
+++ b/include/tvm/meta_schedule/database.h
@@ -31,9 +31,13 @@
 #include <tvm/tir/schedule/schedule.h>
 #include <tvm/tir/schedule/trace.h>
 
+#include <memory>
+
 namespace tvm {
 namespace meta_schedule {
 
+class ModuleEquality;
+
 /*! \brief A workload, i.e. an IRModule and its structural hash. */
 class WorkloadNode : public runtime::Object {
  public:
@@ -94,9 +98,13 @@ struct WorkloadHash {
 
 /*! \brief The equality check for Workload */
 struct WorkloadEqual {
-  bool operator()(const Workload& a, const Workload& b) const {
-    return a->shash == b->shash && tvm::StructuralEqual()(a->mod, b->mod);
-  }
+  explicit WorkloadEqual(const ModuleEquality& mod_eq) : mod_eq_(mod_eq) {}
+
+  bool operator()(const Workload& a, const Workload& b) const;
+
+ private:
+  /*! \brief The module equality testing and hashing method */
+  const ModuleEquality& mod_eq_;
 };
 
 /*! \brief The class of measure candidates. */
@@ -168,8 +176,16 @@ class TuningRecord : public runtime::ObjectRef {
 /* \brief The abstract interface of database. */
 class DatabaseNode : public runtime::Object {
  public:
+  /*!
+   * \brief Constructor
+   * \param mod_eq_name A string to specify the module equality testing and hashing method.
+   *  It must be one of the followings:
+   *    - "structural": Use StructuralEqual/Hash
+   */
+  explicit DatabaseNode(String mod_eq_name = "structural");
+
   /*! \brief Default destructor */
-  virtual ~DatabaseNode() = default;
+  virtual ~DatabaseNode();
   /*!
    * \brief Check if the database has the given workload.
    * \param mod The IRModule to be searched for.
@@ -232,13 +248,31 @@ class DatabaseNode : public runtime::Object {
   virtual Optional<IRModule> QueryIRModule(const IRModule& mod, const Target& target,
                                            const String& workload_name);
 
+  /*! \brief Return a reference to the owned module equality method instance. */
+  const ModuleEquality& GetModuleEquality() const {
+    ICHECK(mod_eq_);
+    return *mod_eq_;
+  }
+
   static constexpr const char* _type_key = "meta_schedule.Database";
   TVM_DECLARE_BASE_OBJECT_INFO(DatabaseNode, runtime::Object);
+
+ private:
+  /*! \brief The module equality testing and hashing method */
+  std::unique_ptr<ModuleEquality> mod_eq_;
 };
 
 /*! \brief The database with customized methods on the python-side. */
 class PyDatabaseNode : public DatabaseNode {
  public:
+  /*!
+   * \brief Constructor
+   * \param mod_eq_name A string to specify the module equality testing and hashing method.
+   *  It must be one of the followings:
+   *    - "structural": Use StructuralEqual/Hash
+   */
+  explicit PyDatabaseNode(String mod_eq_name = "structural");
+
   /*!
    * \brief The function type of `HasWorkload` method.
    * \param mod The IRModule to be searched for.
@@ -404,23 +438,28 @@ class PyDatabaseNode : public DatabaseNode {
  */
 class Database : public runtime::ObjectRef {
  public:
-  /*! An in-memory database. */
-  TVM_DLL static Database MemoryDatabase();
+  /*!
+   * \brief An in-memory database.
+   * \param mod_eq_name A string to specify the module equality testing and hashing method.
+   */
+  TVM_DLL static Database MemoryDatabase(String mod_eq_name = "structural");
   /*!
    * \brief A database for injecting handcrafted schedule functions.
    * \param schedule_fn The function to do scheduling, which takes a TIR schedule,
    * and returns a boolean indicating if the schedule is successful.
+   * \param mod_eq_name A string to specify the module equality testing and hashing method.
    */
   TVM_DLL static Database ScheduleFnDatabase(
-      runtime::TypedPackedFunc<bool(tir::Schedule)> schedule_fn);
+      runtime::TypedPackedFunc<bool(tir::Schedule)> schedule_fn, String mod_eq_name = "structural");
   /*!
    * \brief Create a default database that uses JSON file for tuning records.
    * \param path_workload The path to the workload table.
    * \param path_tuning_record The path to the database table.
    * \param allow_missing Whether to create new file when the given path is not found.
+   * \param mod_eq_name A string to specify the module equality testing and hashing method.
    */
   TVM_DLL static Database JSONDatabase(String path_workload, String path_tuning_record,
-                                       bool allow_missing);
+                                       bool allow_missing, String mod_eq_name = "structural");
   /*!
    * \brief A database composed of multiple databases, allowing users to guide IR rewriting using
    * combined knowledge of those databases. To each query, it returns the best record among all the
@@ -448,6 +487,7 @@ class Database : public runtime::ObjectRef {
    * \param f_query_schedule The packed function of `QuerySchedule`.
    * \param f_query_ir_module The packed function of `QueryIRModule`.
    * \param f_size The packed function of `Size`.
+   * \param mod_eq_name A string to specify the module equality testing and hashing method.
    * \return The created database.
    */
   TVM_DLL static Database PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload,
@@ -458,7 +498,8 @@ class Database : public runtime::ObjectRef {
                                      PyDatabaseNode::FQueryTuningRecord f_query_tuning_record,
                                      PyDatabaseNode::FQuerySchedule f_query_schedule,
                                      PyDatabaseNode::FQueryIRModule f_query_ir_module,
-                                     PyDatabaseNode::FSize f_size);
+                                     PyDatabaseNode::FSize f_size,
+                                     String mod_eq_name = "structural");
   /*! \return The current Database in the scope. */
   static Optional<Database> Current();
   /*! \brief Entering the scope of the context manager */
diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py
index e21ce29ed699..b95cb1ddd7db 100644
--- a/python/tvm/meta_schedule/database/database.py
+++ b/python/tvm/meta_schedule/database/database.py
@@ -434,6 +434,7 @@ def __init__(
         f_query_schedule: Callable = None,
         f_query_ir_module: Callable = None,
         f_size: Callable = None,
+        module_equality: str = "structural",
     ):
         """Constructor."""
 
@@ -448,6 +449,7 @@ def __init__(
             f_query_schedule,
             f_query_ir_module,
             f_size,
+            module_equality,
         )
 
 
diff --git a/python/tvm/meta_schedule/database/json_database.py b/python/tvm/meta_schedule/database/json_database.py
index b36ac61ef2fb..aedc83ad89b3 100644
--- a/python/tvm/meta_schedule/database/json_database.py
+++ b/python/tvm/meta_schedule/database/json_database.py
@@ -26,7 +26,7 @@
 
 @register_object("meta_schedule.JSONDatabase")
 class JSONDatabase(Database):
-    """The class of tuning records.
+    """Database class backed by JSON.
 
     Parameters
     ----------
@@ -34,6 +34,10 @@ class JSONDatabase(Database):
         The path to the workload table.
     path_tuning_record : str
         The path to the tuning record table.
+    module_equality : Optional[str]
+        A string to specify the module equality testing and hashing method.
+        It must be one of the followings:
+          - "structural": Use StructuralEqual/Hash
     """
 
     path_workload: str
@@ -46,6 +50,7 @@ def __init__(
         *,
         work_dir: Optional[str] = None,
         allow_missing: bool = True,
+        module_equality: str = "structural",
     ) -> None:
         """Constructor.
 
@@ -77,4 +82,5 @@ def __init__(
             path_workload,
             path_tuning_record,
             allow_missing,
+            module_equality,
         )
diff --git a/python/tvm/meta_schedule/database/memory_database.py b/python/tvm/meta_schedule/database/memory_database.py
index f50e5a1afa94..e07f325d9d3d 100644
--- a/python/tvm/meta_schedule/database/memory_database.py
+++ b/python/tvm/meta_schedule/database/memory_database.py
@@ -23,9 +23,21 @@
 
 @register_object("meta_schedule.MemoryDatabase")
 class MemoryDatabase(Database):
-    """An in-memory database"""
+    """An in-memory database
 
-    def __init__(self) -> None:
+    Parameters
+    ----------
+    module_equality : Optional[str]
+        A string to specify the module equality testing and hashing method.
+        It must be one of the followings:
+          - "structural": Use StructuralEqual/Hash
+    """
+
+    def __init__(
+        self,
+        module_equality: str = "structural",
+    ) -> None:
         self.__init_handle_by_constructor__(
-            _ffi_api.DatabaseMemoryDatabase,  # type: ignore # pylint: disable=no-member
+            _ffi_api.DatabaseMemoryDatabase,  # type: ignore # pylint: disable=no-member,
+            module_equality,
         )
diff --git a/python/tvm/meta_schedule/database/schedule_fn_database.py b/python/tvm/meta_schedule/database/schedule_fn_database.py
index 2918f05799dc..273b84185287 100644
--- a/python/tvm/meta_schedule/database/schedule_fn_database.py
+++ b/python/tvm/meta_schedule/database/schedule_fn_database.py
@@ -26,13 +26,26 @@
 
 @register_object("meta_schedule.ScheduleFnDatabase")
 class ScheduleFnDatabase(Database):
-    """A database for injecting handcrafted schedule functions."""
+    """A database for injecting handcrafted schedule functions.
+
+    Parameters
+    ----------
+    schedule_fn : Callable[[Schedule], bool],
+        The function to do scheduling, which takes a TIR schedule, and returns
+        a boolean indicating if the schedule is committed to the database.
+    module_equality : Optional[str]
+        A string to specify the module equality testing and hashing method.
+        It must be one of the followings:
+          - "structural": Use StructuralEqual/Hash
+    """
 
     def __init__(
         self,
         schedule_fn: Callable[[Schedule], bool],
+        module_equality: str = "structural",
     ) -> None:
         self.__init_handle_by_constructor__(
             _ffi_api.DatabaseScheduleFnDatabase,  # type: ignore # pylint: disable=no-member
             schedule_fn,
+            module_equality,
         )
diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py
index b3d8d582ba2b..b9c34e509ab4 100644
--- a/python/tvm/meta_schedule/relay_integration.py
+++ b/python/tvm/meta_schedule/relay_integration.py
@@ -119,6 +119,7 @@ def extract_tasks(
         }
     ),
     executor: Optional["relay.backend.Executor"] = None,
+    module_equality: str = "structural",
 ) -> List[ExtractedTask]:
     """Extract tuning tasks from a relay program.
 
@@ -136,6 +137,10 @@ def extract_tasks(
         The pass configuration
     executor : Optional[relay.backend.Executor]
         The executor to use
+    module_equality : Optional[str]
+        A string to specify the module equality testing and hashing method.
+        It must be one of the followings:
+          - "structural": Use StructuralEqual/Hash
 
     Returns
     -------
@@ -161,7 +166,7 @@ def extract_tasks(
                 opt_level=opt_level,
                 config=pass_config,
             ):
-                return list(_extract_task(mod, target, params))
+                return list(_extract_task(mod, target, params, module_equality))
 
 
 def extracted_tasks_to_tune_contexts(
@@ -237,6 +242,7 @@ def tune_relay(
     space: SpaceGenerator.SpaceGeneratorType = "post-order-apply",
     strategy: SearchStrategy.SearchStrategyType = "evolutionary",
     seed: Optional[int] = None,
+    module_equality: str = "structural",
 ) -> Database:
     """Tune a Relay program.
 
@@ -274,6 +280,10 @@ def tune_relay(
         The search strategy to use
     seed : Optional[int]
         The random seed
+    module_equality : Optional[str]
+        A string to specify the module equality testing and hashing method.
+        It must be one of the followings:
+          - "structural": Use StructuralEqual/Hash
 
     Returns
     -------
@@ -281,7 +291,7 @@ def tune_relay(
         The database that contains the tuning records
     """
     tasks, task_weights = extracted_tasks_to_tune_contexts(
-        extracted_tasks=extract_tasks(mod, target, params),
+        extracted_tasks=extract_tasks(mod, target, params, module_equality=module_equality),
         work_dir=work_dir,
         space=space,
         strategy=strategy,
@@ -300,6 +310,7 @@ def tune_relay(
         cost_model=cost_model,
         measure_callbacks=measure_callbacks,
         task_scheduler=task_scheduler,
+        module_equality=module_equality,
     )
 
 
diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py
index f7a2d4dc376f..66cb60c32902 100644
--- a/python/tvm/meta_schedule/tune.py
+++ b/python/tvm/meta_schedule/tune.py
@@ -40,6 +40,7 @@ def tune_tasks(
     cost_model: CostModel.CostModelType = "xgb",
     measure_callbacks: MeasureCallback.CallbackListType = "default",
     task_scheduler: TaskScheduler.TaskSchedulerType = "gradient",
+    module_equality: str = "structural",
 ) -> Database:
     """Tune a list of tasks. Using a task scheduler.
 
@@ -69,6 +70,10 @@ def tune_tasks(
         The measure callbacks.
     task_scheduler : TaskScheduler.TaskSchedulerType
         The task scheduler.
+    module_equality : Optional[str]
+        A string to specify the module equality testing and hashing method.
+        It must be one of the followings:
+          - "structural": Use StructuralEqual/Hash
 
     Returns
     -------
@@ -86,9 +91,9 @@ def tune_tasks(
     if not isinstance(runner, Runner):
         runner = Runner.create(runner)
     if database == "json":
-        database = Database.create(database, work_dir=work_dir)
+        database = Database.create(database, work_dir=work_dir, module_equality=module_equality)
     elif not isinstance(database, Database):
-        database = Database.create(database)
+        database = Database.create(database, module_equality=module_equality)
     if not isinstance(cost_model, CostModel):
         cost_model = CostModel.create(cost_model)
     if isinstance(measure_callbacks, MeasureCallback):
diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc
index 0976e158aaf0..da1d1db8f1cc 100644
--- a/src/meta_schedule/database/database.cc
+++ b/src/meta_schedule/database/database.cc
@@ -16,6 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include "../module_equality.h"
 #include "../utils.h"
 
 namespace tvm {
@@ -25,8 +26,8 @@ namespace meta_schedule {
 
 Workload::Workload(IRModule mod) {
   ObjectPtr<WorkloadNode> n = runtime::make_object<WorkloadNode>();
-  n->shash = tvm::StructuralHash()(mod);
   n->mod = mod;
+  n->shash = ModuleEquality::Create("structural")->Hash(mod);
   data_ = std::move(n);
 }
 
@@ -59,12 +60,8 @@ Workload Workload::FromJSON(const ObjectRef& json_obj) {
       String b64_mod = Downcast<String>(json_array->at(1));
       std::string json_mod = Base64Decode(b64_mod);
       mod = Downcast<IRModule>(LoadJSON(json_mod));
+      std::stringstream(str_shash) >> shash;
     }
-    // Verify SHash(mod) == shash
-    shash = tvm::StructuralHash()(mod);
-    String recalc_shash = SHash2Str(shash);
-    CHECK_EQ(recalc_shash, str_shash) << "ValueError: Structural hash changed. Given: " << str_shash
-                                      << "; Recalculated: " << recalc_shash;
   } catch (const std::runtime_error& e) {  // includes tvm::Error and dmlc::Error
     LOG(FATAL) << "ValueError: Unable to parse the JSON object: " << json_obj
                << "\nThe error is: " << e.what();
@@ -85,6 +82,10 @@ TuningRecord::TuningRecord(tir::Trace trace, Workload workload, Optional<Array<F
   this->data_ = n;
 }
 
+bool WorkloadEqual::operator()(const Workload& a, const Workload& b) const {
+  return a->shash == b->shash && mod_eq_.Equal(a->mod, b->mod);
+}
+
 MeasureCandidate TuningRecordNode::AsMeasureCandidate() const {
   tir::Schedule sch =
       tir::Schedule::Traced(workload->mod, -1, 0, tir::ScheduleErrorRenderLevel::kDetail);
@@ -155,6 +156,8 @@ TuningRecord TuningRecord::FromJSON(const ObjectRef& json_obj, const Workload& w
 }
 
 /******** Database ********/
+DatabaseNode::DatabaseNode(String mod_eq_name) { mod_eq_ = ModuleEquality::Create(mod_eq_name); }
+DatabaseNode::~DatabaseNode() = default;
 
 Optional<TuningRecord> DatabaseNode::QueryTuningRecord(const IRModule& mod, const Target& target,
                                                        const String& workload_name) {
@@ -211,6 +214,7 @@ Optional<Database> Database::Current() {
 }
 
 /******** PyDatabase ********/
+PyDatabaseNode::PyDatabaseNode(String mod_eq_name) : DatabaseNode(mod_eq_name) {}
 
 Database Database::PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload,
                               PyDatabaseNode::FCommitWorkload f_commit_workload,
@@ -220,8 +224,8 @@ Database Database::PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload,
                               PyDatabaseNode::FQueryTuningRecord f_query_tuning_record,
                               PyDatabaseNode::FQuerySchedule f_query_schedule,
                               PyDatabaseNode::FQueryIRModule f_query_ir_module,
-                              PyDatabaseNode::FSize f_size) {
-  ObjectPtr<PyDatabaseNode> n = make_object<PyDatabaseNode>();
+                              PyDatabaseNode::FSize f_size, String mod_eq_name) {
+  ObjectPtr<PyDatabaseNode> n = make_object<PyDatabaseNode>(mod_eq_name);
   n->f_has_workload = f_has_workload;
   n->f_commit_workload = f_commit_workload;
   n->f_commit_tuning_record = f_commit_tuning_record;
diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc
index 91b96c82479f..aaa67600fc96 100644
--- a/src/meta_schedule/database/json_database.cc
+++ b/src/meta_schedule/database/json_database.cc
@@ -20,6 +20,7 @@
 #include <thread>
 #include <unordered_map>
 
+#include "../module_equality.h"
 #include "../utils.h"
 
 namespace tvm {
@@ -67,6 +68,10 @@ void JSONFileAppendLine(const String& path, const std::string& line) {
 /*! \brief The default database implementation, which mimics two database tables with two files. */
 class JSONDatabaseNode : public DatabaseNode {
  public:
+  explicit JSONDatabaseNode(String mod_eq_name = "structural")
+      : DatabaseNode(mod_eq_name),
+        workloads2idx_(/*bucket_count*/ 0, WorkloadHash(), WorkloadEqual(GetModuleEquality())) {}
+
   /*! \brief The path to the workload table */
   String path_workload;
   /*! \brief The path to the tuning record table */
@@ -88,13 +93,14 @@ class JSONDatabaseNode : public DatabaseNode {
 
  public:
   bool HasWorkload(const IRModule& mod) {
-    return workloads2idx_.find(Workload(mod, tvm::StructuralHash()(mod))) != workloads2idx_.end();
+    return workloads2idx_.find(Workload(mod, GetModuleEquality().Hash(mod))) !=
+           workloads2idx_.end();
   }
 
   Workload CommitWorkload(const IRModule& mod) {
     // Try to insert `mod` into `workloads_`
     auto [it, inserted] =
-        this->workloads2idx_.emplace(Workload(mod, tvm::StructuralHash()(mod)), -1);
+        this->workloads2idx_.emplace(Workload(mod, GetModuleEquality().Hash(mod)), -1);
     Workload workload = it->first;
     // If `mod` is new in `workloads2idx_`, append it to the workload file
     if (inserted) {
@@ -122,7 +128,7 @@ class JSONDatabaseNode : public DatabaseNode {
     results.reserve(top_k);
     int counter = 0;
     for (const TuningRecord& record : this->tuning_records_) {
-      if (WorkloadEqual()(record->workload, workload)) {
+      if (WorkloadEqual(GetModuleEquality())(record->workload, workload)) {
         results.push_back(record);
         if (++counter == top_k) {
           break;
@@ -144,10 +150,10 @@ class JSONDatabaseNode : public DatabaseNode {
   int64_t Size() { return tuning_records_.size(); }
 };
 
-Database Database::JSONDatabase(String path_workload, String path_tuning_record,
-                                bool allow_missing) {
+Database Database::JSONDatabase(String path_workload, String path_tuning_record, bool allow_missing,
+                                String mod_eq_name) {
   int num_threads = std::thread::hardware_concurrency();
-  ObjectPtr<JSONDatabaseNode> n = make_object<JSONDatabaseNode>();
+  ObjectPtr<JSONDatabaseNode> n = make_object<JSONDatabaseNode>(mod_eq_name);
   // Load `n->workloads2idx_` from `path_workload`
   std::vector<Workload> workloads;
   {
@@ -157,6 +163,10 @@ Database Database::JSONDatabase(String path_workload, String path_tuning_record,
     workloads.reserve(n_objs);
     for (int i = 0; i < n_objs; ++i) {
       Workload workload = Workload::FromJSON(json_objs[i]);
+      auto recalc_hash = n->GetModuleEquality().Hash(workload->mod);
+      CHECK_EQ(recalc_hash, workload->shash)
+          << "ValueError: Module hash changed. Given: " << workload->shash
+          << "; Recalculated: " << recalc_hash;
       n->workloads2idx_.emplace(workload, i);
       workloads.push_back(workload);
     }
diff --git a/src/meta_schedule/database/memory_database.cc b/src/meta_schedule/database/memory_database.cc
index b6c635555152..47f6a473d1e4 100644
--- a/src/meta_schedule/database/memory_database.cc
+++ b/src/meta_schedule/database/memory_database.cc
@@ -16,6 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include "../module_equality.h"
 #include "../utils.h"
 
 namespace tvm {
@@ -23,6 +24,8 @@ namespace meta_schedule {
 
 class MemoryDatabaseNode : public DatabaseNode {
  public:
+  explicit MemoryDatabaseNode(String mod_eq_name = "structural") : DatabaseNode(mod_eq_name) {}
+
   Array<TuningRecord> records;
   Array<Workload> workloads;
 
@@ -37,7 +40,7 @@ class MemoryDatabaseNode : public DatabaseNode {
  public:
   bool HasWorkload(const IRModule& mod) final {
     for (const auto& workload : workloads) {
-      if (StructuralEqual()(workload->mod, mod)) {
+      if (GetModuleEquality().Equal(workload->mod, mod)) {
         return true;
       }
     }
@@ -46,11 +49,11 @@ class MemoryDatabaseNode : public DatabaseNode {
 
   Workload CommitWorkload(const IRModule& mod) final {
     for (const auto& workload : workloads) {
-      if (StructuralEqual()(workload->mod, mod)) {
+      if (GetModuleEquality().Equal(workload->mod, mod)) {
         return workload;
       }
     }
-    Workload workload(mod, StructuralHash()(mod));
+    Workload workload(mod, GetModuleEquality().Hash(mod));
     workloads.push_back(workload);
     return workload;
   }
@@ -96,8 +99,8 @@ class MemoryDatabaseNode : public DatabaseNode {
   int64_t Size() final { return records.size(); }
 };
 
-Database Database::MemoryDatabase() {
-  ObjectPtr<MemoryDatabaseNode> n = make_object<MemoryDatabaseNode>();
+Database Database::MemoryDatabase(String mod_eq_name) {
+  ObjectPtr<MemoryDatabaseNode> n = make_object<MemoryDatabaseNode>(mod_eq_name);
   n->records.clear();
   n->workloads.clear();
   return Database(n);
diff --git a/src/meta_schedule/database/schedule_fn_database.cc b/src/meta_schedule/database/schedule_fn_database.cc
index 751721fe52d4..ed6d87f2448b 100644
--- a/src/meta_schedule/database/schedule_fn_database.cc
+++ b/src/meta_schedule/database/schedule_fn_database.cc
@@ -23,6 +23,8 @@ namespace meta_schedule {
 
 class ScheduleFnDatabaseNode : public DatabaseNode {
  public:
+  explicit ScheduleFnDatabaseNode(String mod_eq_name = "structural") : DatabaseNode(mod_eq_name) {}
+
   runtime::TypedPackedFunc<bool(tir::Schedule)> schedule_fn;
 
   void VisitAttrs(AttrVisitor* v) {
@@ -89,8 +91,9 @@ class ScheduleFnDatabaseNode : public DatabaseNode {
   }
 };
 
-Database Database::ScheduleFnDatabase(runtime::TypedPackedFunc<bool(tir::Schedule)> schedule_fn) {
-  ObjectPtr<ScheduleFnDatabaseNode> n = make_object<ScheduleFnDatabaseNode>();
+Database Database::ScheduleFnDatabase(runtime::TypedPackedFunc<bool(tir::Schedule)> schedule_fn,
+                                      String mod_eq_name) {
+  ObjectPtr<ScheduleFnDatabaseNode> n = make_object<ScheduleFnDatabaseNode>(mod_eq_name);
   n->schedule_fn = std::move(schedule_fn);
   return Database(n);
 }
diff --git a/src/meta_schedule/module_equality.cc b/src/meta_schedule/module_equality.cc
new file mode 100644
index 000000000000..084ae74bb09c
--- /dev/null
+++ b/src/meta_schedule/module_equality.cc
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "module_equality.h"
+
+#include <tvm/ir/module.h>
+#include <tvm/node/structural_equal.h>
+#include <tvm/node/structural_hash.h>
+
+#include <memory>
+
+namespace tvm {
+namespace meta_schedule {
+
+class ModuleEqualityStructural : public ModuleEquality {
+ public:
+  size_t Hash(IRModule mod) const { return tvm::StructuralHash()(mod); }
+  bool Equal(IRModule lhs, IRModule rhs) const { return tvm::StructuralEqual()(lhs, rhs); }
+};
+
+std::unique_ptr<ModuleEquality> ModuleEquality::Create(const std::string& mod_eq_name) {
+  if (mod_eq_name == "structural") {
+    return std::make_unique<ModuleEqualityStructural>();
+  }
+  LOG(FATAL) << "Unknown module equality " << mod_eq_name;
+  return nullptr;
+}
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/module_equality.h b/src/meta_schedule/module_equality.h
new file mode 100644
index 000000000000..3e6fb55d8a9b
--- /dev/null
+++ b/src/meta_schedule/module_equality.h
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_META_SCHEDULE_MODULE_EQUALITY_H_
+#define TVM_META_SCHEDULE_MODULE_EQUALITY_H_
+
+#include <tvm/ir/module.h>
+
+#include <memory>
+#include <string>
+
+namespace tvm {
+namespace meta_schedule {
+
+/*! \brief Method to compute hash and determine equality of modules  */
+class ModuleEquality {
+ public:
+  virtual ~ModuleEquality() = default;
+
+  virtual size_t Hash(IRModule mod) const = 0;
+  virtual bool Equal(IRModule lhs, IRModule rhs) const = 0;
+
+  /*!
+   * \brief Create a ModuleEquality instance
+   * \param mod_eq_name A string to specify the module equality testing and hashing method.
+   *  It must be one of the followings:
+   *    - "structural": Use StructuralEqual/Hash
+   * \return An owning pointer to the created instance
+   */
+  static std::unique_ptr<ModuleEquality> Create(const std::string& mod_eq_name);
+};
+
+/*! \brief Functor to compute hash a module using the provided method. */
+class ModuleHash {
+ public:
+  explicit ModuleHash(const ModuleEquality& mod_eq) : mod_eq_(mod_eq) {}
+  size_t operator()(const IRModule& mod) const { return mod_eq_.Hash(mod); }
+
+ private:
+  const ModuleEquality& mod_eq_;
+};
+
+/*! \brief Functor to determine equality of modules using the provided method. */
+class ModuleEqual {
+ public:
+  explicit ModuleEqual(const ModuleEquality& mod_eq) : mod_eq_(mod_eq) {}
+  bool operator()(const IRModule& lhs, const IRModule& rhs) const {
+    return mod_eq_.Equal(lhs, rhs);
+  }
+
+ private:
+  const ModuleEquality& mod_eq_;
+};
+
+}  // namespace meta_schedule
+}  // namespace tvm
+
+#endif  // TVM_META_SCHEDULE_MODULE_EQUALITY_H_
diff --git a/src/meta_schedule/search_strategy/evolutionary_search.cc b/src/meta_schedule/search_strategy/evolutionary_search.cc
index df67d371929b..2cc45e01bbaf 100644
--- a/src/meta_schedule/search_strategy/evolutionary_search.cc
+++ b/src/meta_schedule/search_strategy/evolutionary_search.cc
@@ -17,6 +17,7 @@
  * under the License.
  */
 
+#include "../module_equality.h"
 #include "../utils.h"
 
 #define TVM_META_SCHEDULE_CHECK_PROB_RANGE(p, name)                               \
@@ -33,6 +34,9 @@ using tir::Schedule;
 /*! \brief An auxiliary data structure to help deduplicate IRModules */
 class IRModuleSet {
  public:
+  explicit IRModuleSet(const ModuleEquality& mod_eq)
+      : tab_(/*bucket_count*/ 0, ItemHash(), ItemEqual(mod_eq)) {}
+
   /*! \brief Add an IRModule to the set */
   void Add(const IRModule& mod, size_t shash) { tab_.insert(Item{mod, shash}); }
   /*! \brief Check if the IRModule is in the set */
@@ -47,10 +51,16 @@ class IRModuleSet {
     size_t operator()(const Item& hash) const { return hash.shash; }
   };
   struct ItemEqual {
+    explicit ItemEqual(const ModuleEquality& mod_eq) : mod_eq_(mod_eq) {}
+    ItemEqual& operator=(const ItemEqual& other) { return *this; }
+
     bool operator()(const Item& lhs, const Item& rhs) const {
-      return lhs.shash == rhs.shash && StructuralEqual()(lhs.mod, rhs.mod);
+      return lhs.shash == rhs.shash && mod_eq_.Equal(lhs.mod, rhs.mod);
     }
+
+    const ModuleEquality& mod_eq_;
   };
+
   std::unordered_set<Item, ItemHash, ItemEqual> tab_;
 };
 
@@ -271,7 +281,8 @@ class EvolutionarySearchNode : public SearchStrategyNode {
           num_trials_per_iter(num_trials_per_iter),
           st(0),
           ed(num_trials_per_iter),
-          num_empty_iters(0) {
+          num_empty_iters(0),
+          measured_workloads_(database->GetModuleEquality()) {
       design_spaces.reserve(design_spaces.size());
       for (const Schedule& space : design_space_schedules) {
         design_spaces.push_back(space->trace().value()->Simplified(true));
@@ -322,6 +333,12 @@ class EvolutionarySearchNode : public SearchStrategyNode {
     /*! \brief An interface method to be called by it's counterpart in EvolutionarySearchNode */
     inline void NotifyRunnerResults(const Array<MeasureCandidate>& measure_candidates,
                                     const Array<RunnerResult>& results);
+    /*!
+     * \brief Compute the hash for the given module.
+     * \param mod The input TIR module.
+     * \return The calculated hash.
+     */
+    inline size_t ModuleHash(const IRModule& mod) const;
   };
 
   /*! \brief The tuning context of the evolutionary search strategy. */
@@ -512,7 +529,7 @@ std::vector<Schedule> EvolutionarySearchNode::State::SampleInitPopulation(int nu
 
 std::vector<Schedule> EvolutionarySearchNode::State::EvolveWithCostModel(
     std::vector<Schedule> population, int num) {
-  IRModuleSet exists;
+  IRModuleSet exists(database_->GetModuleEquality());
   {
     auto _ = Profiler::TimedScope("EvoSearch/Evolve/Misc/CopyMeasuredWorkloads");
     ICHECK_GT(num, 0);
@@ -531,7 +548,7 @@ std::vector<Schedule> EvolutionarySearchNode::State::EvolveWithCostModel(
       for (int i = 0, n = population.size(); i < n; ++i) {
         Schedule sch = population.at(i);
         IRModule mod = sch->mod();
-        size_t shash = StructuralHash()(mod);
+        size_t shash = ModuleHash(mod);
         double score = scores.at(i);
         if (!exists.Has(mod, shash)) {
           exists.Add(mod, shash);
@@ -661,7 +678,7 @@ std::vector<Schedule> EvolutionarySearchNode::State::PickWithEpsGreedy(
       }
     }
     IRModule mod = sch->mod();
-    size_t shash = StructuralHash()(mod);
+    size_t shash = ModuleHash(mod);
     if (!measured_workloads.Has(mod, shash)) {
       measured_workloads.Add(mod, shash);
       results.push_back(sch);
@@ -713,6 +730,10 @@ void EvolutionarySearchNode::State::NotifyRunnerResults(
   ed += results.size();
 }
 
+size_t EvolutionarySearchNode::State::ModuleHash(const IRModule& mod) const {
+  return database_->GetModuleEquality().Hash(mod);
+}
+
 SearchStrategy SearchStrategy::EvolutionarySearch(int population_size,         //
                                                   double init_measured_ratio,  //
                                                   int init_min_unmeasured,     //
@@ -754,7 +775,7 @@ Array<Schedule> EvolutionarySearchEvolveWithCostModel(EvolutionarySearch self,
   std::vector<Schedule> schs = self->state_->EvolveWithCostModel(population_vec, num);
   for (Schedule sch : schs) {
     IRModule mod = sch->mod();
-    size_t shash = StructuralHash()(mod);
+    size_t shash = self->state_->ModuleHash(mod);
     if (!self->state_->measured_workloads_.Has(mod, shash)) {
       self->state_->measured_workloads_.Add(mod, shash);
       result.push_back(sch);
diff --git a/src/relay/backend/task_extraction.cc b/src/relay/backend/task_extraction.cc
index 213841c621de..430b551a3b9e 100644
--- a/src/relay/backend/task_extraction.cc
+++ b/src/relay/backend/task_extraction.cc
@@ -22,6 +22,7 @@
 #include <tvm/relay/function.h>
 #include <tvm/target/target.h>
 
+#include "../../meta_schedule/module_equality.h"
 #include "../../te/operation/create_primfunc.h"
 #include "./te_compiler_cache.h"
 #include "./utils.h"
@@ -31,8 +32,11 @@ namespace relay {
 namespace backend {
 
 Array<meta_schedule::ExtractedTask> ExtractTask(IRModule mod, Target target,
-                                                Map<String, runtime::NDArray> params) {
+                                                Map<String, runtime::NDArray> params,
+                                                String mod_eq_name) {
   using meta_schedule::ExtractedTask;
+  using meta_schedule::ModuleEqual;
+  using meta_schedule::ModuleHash;
   backend::FTECompilerTIRConverter tir_converter = backend::GetTIRConverter();
   backend::BindParamsInModule(mod, params);
   // is_vm=true for backward compatibility
@@ -42,26 +46,36 @@ Array<meta_schedule::ExtractedTask> ExtractTask(IRModule mod, Target target,
   mod = transform::Sequential(pass_seqs)(std::move(mod));
 
   std::vector<ExtractedTask> tasks;
-  std::unordered_map<tec::CCacheKey, ExtractedTask> cache;
+
+  auto mod_eq = meta_schedule::ModuleEquality::Create(mod_eq_name);
+
+  std::unordered_map<IRModule, ExtractedTask, ModuleHash, ModuleEqual> cache(
+      /*bucket_count*/ 0, ModuleHash(*mod_eq), ModuleEqual(*mod_eq));
+
   PostOrderVisit(mod->Lookup("main"), [&target, &tasks, &cache, &tir_converter](const Expr& exp) {
     if (exp->IsInstance<FunctionNode>()) {
       Function relay_func = Downcast<Function>(exp);
       if (!relay_func->HasNonzeroAttr(attr::kPrimitive)) {
         return;
       }
-      tec::CCacheKey cache_key(relay_func, target);
-      auto it = cache.find(cache_key);
-      if (it != cache.end()) {
-        it->second->weight += 1;
-        return;
-      }
+
       auto [inputs_outputs, constants, fused_name] =
           tec::LowerTECompute(relay_func, target, /*return_inputs=*/true);
+
       if (Optional<tir::PrimFunc> f = tir_converter(inputs_outputs, constants)) {
+        IRModule tir_mod = PrimFuncToIRModule(f.value());
+
+        auto it = cache.find(tir_mod);
+        if (it != cache.end()) {
+          it->second->weight += 1;
+          return;
+        }
+
+        // Note that the cache is key-ed on the tir mod, rather than the relay mod
         IRModule relay_mod({{GlobalVar(fused_name), relay_func}});
-        ExtractedTask task(fused_name, relay_mod, target, {PrimFuncToIRModule(f.value())}, 1);
+        ExtractedTask task(fused_name, relay_mod, target, {tir_mod}, 1);
         tasks.push_back(task);
-        cache.emplace(cache_key, task);
+        cache.emplace(tir_mod, task);
       }
     }
   });

From e8ba1dc4ccd0ba08de9d48a2c5004ecb9f9ba43d Mon Sep 17 00:00:00 2001
From: Liam Sturge <50229489+Liam-Sturge@users.noreply.github.com>
Date: Mon, 17 Oct 2022 08:59:37 +0100
Subject: [PATCH 363/704] [CI] Update Docker Image tag to
 20221013-060115-61c9742ea (#13078)

Update all Docker image tags used in CI to tag 20221013-060115-61c9742ea
in order to bring Compute Library to v22.08.
---
 Jenkinsfile               | 20 ++++++++++----------
 ci/jenkins/Jenkinsfile.j2 | 20 ++++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 78addc9b2c93..d48e02cf13bf 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -49,16 +49,16 @@
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20220925-060158-71f25b3d6'
-ci_gpu = 'tlcpack/ci-gpu:20220925-060158-71f25b3d6'
-ci_cpu = 'tlcpack/ci-cpu:20220925-060158-71f25b3d6'
-ci_minimal = 'tlcpack/ci-minimal:20220925-060158-71f25b3d6'
-ci_wasm = 'tlcpack/ci-wasm:20220925-060158-71f25b3d6'
-ci_i386 = 'tlcpack/ci-i386:20220925-060158-71f25b3d6'
-ci_cortexm = 'tlcpack/ci-cortexm:20220925-060158-71f25b3d6'
-ci_arm = 'tlcpack/ci-arm:20220925-060158-71f25b3d6'
-ci_hexagon = 'tlcpack/ci-hexagon:20220925-060158-71f25b3d6'
-ci_riscv = 'tlcpack/ci-riscv:20220925-060158-71f25b3d6'
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221013-060115-61c9742ea'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index 2fcbc9e7e042..f480f08b2b48 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -51,16 +51,16 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 {% import 'ci/jenkins/macros.j2' as m with context -%}
 
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20220925-060158-71f25b3d6'
-ci_gpu = 'tlcpack/ci-gpu:20220925-060158-71f25b3d6'
-ci_cpu = 'tlcpack/ci-cpu:20220925-060158-71f25b3d6'
-ci_minimal = 'tlcpack/ci-minimal:20220925-060158-71f25b3d6'
-ci_wasm = 'tlcpack/ci-wasm:20220925-060158-71f25b3d6'
-ci_i386 = 'tlcpack/ci-i386:20220925-060158-71f25b3d6'
-ci_cortexm = 'tlcpack/ci-cortexm:20220925-060158-71f25b3d6'
-ci_arm = 'tlcpack/ci-arm:20220925-060158-71f25b3d6'
-ci_hexagon = 'tlcpack/ci-hexagon:20220925-060158-71f25b3d6'
-ci_riscv = 'tlcpack/ci-riscv:20220925-060158-71f25b3d6'
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221013-060115-61c9742ea'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images

From 3f0d3f24b79c850f3415a40b594a5cdfc163dbd8 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Mon, 17 Oct 2022 02:27:36 -0700
Subject: [PATCH 364/704] [AOT] Sanitize input/output name in runtime (#13046)

This PR adds name sanitization to input/output at AOT runtime module. This means when we use set_input/set_output with AOT, even if we use the original name of those input/outputs which were sanitized and changed in the codegen, it will map them to the correct input/output.

For example if your model has an input with name `input:0`, the AOT codegen would change this to `input_0` but at runtime if we try to set this input it does not exist. Now, we use the same sanitization at runtime
---
 include/tvm/runtime/name_transforms.h       | 43 ++++++++++++++++++++
 python/tvm/relay/backend/name_transforms.py | 11 ------
 python/tvm/relay/frontend/tflite.py         |  2 +-
 python/tvm/runtime/name_transforms.py       | 32 +++++++++++++++
 src/relay/backend/aot/aot_lower_main.cc     |  5 ++-
 src/relay/backend/aot_executor_codegen.cc   |  3 +-
 src/relay/backend/name_transforms.cc        | 12 +-----
 src/relay/backend/name_transforms.h         |  7 ----
 src/relay/transforms/partition_graph.cc     |  5 ++-
 src/runtime/aot_executor/aot_executor.cc    | 11 +++---
 src/runtime/name_transforms.cc              | 44 +++++++++++++++++++++
 src/target/source/interface_c.cc            |  1 +
 src/target/source/source_module.cc          |  9 +++--
 tests/cpp/name_transforms_test.cc           |  1 +
 tests/python/relay/aot/test_cpp_aot.py      | 41 ++++++++++++++++++-
 tests/python/relay/test_name_transforms.py  |  4 +-
 16 files changed, 184 insertions(+), 47 deletions(-)
 create mode 100644 include/tvm/runtime/name_transforms.h
 create mode 100644 python/tvm/runtime/name_transforms.py
 create mode 100644 src/runtime/name_transforms.cc

diff --git a/include/tvm/runtime/name_transforms.h b/include/tvm/runtime/name_transforms.h
new file mode 100644
index 000000000000..267dda4158c8
--- /dev/null
+++ b/include/tvm/runtime/name_transforms.h
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/runtime/name_transforms.h
+ * \brief Transformations which are applied on names to generate appropriately named.
+ *  These functions are used in both Runtime and Backend.
+ */
+#ifndef TVM_RUNTIME_NAME_TRANSFORMS_H_
+#define TVM_RUNTIME_NAME_TRANSFORMS_H_
+
+#include <string>
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Sanitize name for output into compiler artifacts
+ * \param name Original name
+ * \return Sanitized name
+ */
+std::string SanitizeName(const std::string& name);
+
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_NAME_TRANSFORMS_H_
diff --git a/python/tvm/relay/backend/name_transforms.py b/python/tvm/relay/backend/name_transforms.py
index 19208725a8b9..bbf51a8e24b1 100644
--- a/python/tvm/relay/backend/name_transforms.py
+++ b/python/tvm/relay/backend/name_transforms.py
@@ -97,14 +97,3 @@ def prefix_generated_name(names: Union[List[str], str]):
     """
 
     return _backend.PrefixGeneratedName(_preprocess_names(names))
-
-
-def sanitize_name(original_name: str):
-    """Sanitize name for output into compiler artifacts
-
-    Parameters
-    ----------
-    original_name : str
-        Original name to sanitize
-    """
-    return _backend.SanitizeName(original_name)
diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index a7e10ad72e55..1915eb9322ff 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -23,6 +23,7 @@
 import tvm
 from tvm import relay
 from tvm.ir import IRModule
+from tvm.runtime.name_transforms import sanitize_name
 
 from ... import nd as _nd
 from .. import analysis
@@ -30,7 +31,6 @@
 from .. import function as _function
 from .. import op as _op
 from .. import qnn as _qnn
-from ..backend.name_transforms import sanitize_name
 from .common import ExprTable
 from .common import infer_shape as _infer_shape
 from .common import lstm_cell, to_int_list, shape_of, try_infer_value
diff --git a/python/tvm/runtime/name_transforms.py b/python/tvm/runtime/name_transforms.py
new file mode 100644
index 000000000000..402a47f1a114
--- /dev/null
+++ b/python/tvm/runtime/name_transforms.py
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Name transformation functions shared in Backend and Runtime
+"""
+
+from . import _ffi_api
+
+
+def sanitize_name(original_name: str):
+    """Sanitize name for output into compiler artifacts
+
+    Parameters
+    ----------
+    original_name : str
+        Original name to sanitize
+    """
+    return _ffi_api.SanitizeName(original_name)
diff --git a/src/relay/backend/aot/aot_lower_main.cc b/src/relay/backend/aot/aot_lower_main.cc
index ce72595dc10b..51dd4b219313 100644
--- a/src/relay/backend/aot/aot_lower_main.cc
+++ b/src/relay/backend/aot/aot_lower_main.cc
@@ -23,6 +23,7 @@
  */
 #include "./aot_lower_main.h"
 
+#include <tvm/runtime/name_transforms.h>
 #include <tvm/tir/builtin.h>
 #include <tvm/tir/transform.h>
 
@@ -227,7 +228,7 @@ class AOTMainLowerer : public MixedModeVisitor {
 
     for (auto input : lowered_main_func->params) {
       input_vars_.push_back(input);
-      std::string input_name = SanitizeName(input->name_hint());
+      std::string input_name = tvm::runtime::SanitizeName(input->name_hint());
       // We don't want the compiler changing input names in the
       // event of a sanitization collision. Therefore, enforcing
       // the var created to use the input_name strictly.
@@ -518,7 +519,7 @@ class AOTMainLowerer : public MixedModeVisitor {
         return;
       }
       if (target_attr_map[target_kind.value()]) {
-        std::string context_name = SanitizeName(device_context_name);
+        std::string context_name = tvm::runtime::SanitizeName(device_context_name);
         tir::Var device_context_var("device_context_" + context_name, DataType::Handle());
 
         auto pair = target_contexts.find(target_kind.value());
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 6a9cadb6f770..786b3f81a5ae 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -29,6 +29,7 @@
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/runtime.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/name_transforms.h>
 #include <tvm/runtime/object.h>
 #include <tvm/tir/analysis.h>
 #include <tvm/tir/builtin.h>
@@ -534,7 +535,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {
         return;
       }
       if (target_attr_map[target_kind.value()]) {
-        std::string context_name = SanitizeName(device_context_name);
+        std::string context_name = tvm::runtime::SanitizeName(device_context_name);
         tir::Var device_context_var("device_context_" + context_name, DataType::Handle());
 
         auto pair = target_contexts.find(target_kind.value());
diff --git a/src/relay/backend/name_transforms.cc b/src/relay/backend/name_transforms.cc
index a2f24216ec24..4f364b811bcc 100644
--- a/src/relay/backend/name_transforms.cc
+++ b/src/relay/backend/name_transforms.cc
@@ -19,6 +19,7 @@
 
 #include "name_transforms.h"
 
+#include <tvm/runtime/name_transforms.h>
 #include <tvm/runtime/registry.h>
 
 #include <cctype>
@@ -84,22 +85,11 @@ std::string CombineNames(const Array<String>& names) {
   return combined_name;
 }
 
-std::string SanitizeName(const std::string& name) {
-  ICHECK(!name.empty()) << "Name is empty";
-
-  auto isNotAlnum = [](char c) { return !std::isalnum(c); };
-  std::string sanitized_input = name;
-  std::replace_if(sanitized_input.begin(), sanitized_input.end(), isNotAlnum, '_');
-
-  return sanitized_input;
-}
-
 TVM_REGISTER_GLOBAL("relay.backend.ToCFunctionStyle").set_body_typed(ToCFunctionStyle);
 TVM_REGISTER_GLOBAL("relay.backend.ToCVariableStyle").set_body_typed(ToCVariableStyle);
 TVM_REGISTER_GLOBAL("relay.backend.ToCConstantStyle").set_body_typed(ToCConstantStyle);
 TVM_REGISTER_GLOBAL("relay.backend.PrefixName").set_body_typed(PrefixName);
 TVM_REGISTER_GLOBAL("relay.backend.PrefixGeneratedName").set_body_typed(PrefixGeneratedName);
-TVM_REGISTER_GLOBAL("relay.backend.SanitizeName").set_body_typed(SanitizeName);
 
 }  // namespace backend
 }  // namespace relay
diff --git a/src/relay/backend/name_transforms.h b/src/relay/backend/name_transforms.h
index a30ba6b10825..f59280af2222 100644
--- a/src/relay/backend/name_transforms.h
+++ b/src/relay/backend/name_transforms.h
@@ -102,13 +102,6 @@ inline std::string PrefixGeneratedName(const Array<String>& names) {
   return "TVMGen_" + CombineNames(names);
 }
 
-/*!
- * \brief Sanitize name for output into compiler artifacts
- * \param name Original name
- * \return Sanitized name
- */
-std::string SanitizeName(const std::string& name);
-
 }  // namespace backend
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc
index e2df2e4272ad..f6cdf6d1ca18 100644
--- a/src/relay/transforms/partition_graph.cc
+++ b/src/relay/transforms/partition_graph.cc
@@ -36,6 +36,7 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/transform.h>
+#include <tvm/runtime/name_transforms.h>
 
 #include <unordered_map>
 #include <unordered_set>
@@ -507,7 +508,7 @@ class NameMangleExtFuncs : public MixedModeMutator {
       if (auto* fn = pair.second.as<FunctionNode>()) {
         auto func = GetRef<Function>(fn);
         if (func->GetAttr<String>(attr::kCompiler).defined()) {
-          auto fn_name_mangled = relay::backend::SanitizeName(mangle_fn_(pair.first->name_hint));
+          auto fn_name_mangled = tvm::runtime::SanitizeName(mangle_fn_(pair.first->name_hint));
           GlobalVar gvar = GlobalVar(fn_name_mangled);
           mangled_gvars_[pair.first->name_hint] = gvar;
         }
@@ -526,7 +527,7 @@ class NameMangleExtFuncs : public MixedModeMutator {
         if (func->GetAttr<String>(attr::kCompiler).defined()) {
           auto new_dict = func->attrs->dict;
           new_dict.Set(tvm::attr::kGlobalSymbol,
-                       String(relay::backend::SanitizeName(mangle_fn_(pair.first->name_hint))));
+                       String(tvm::runtime::SanitizeName(mangle_fn_(pair.first->name_hint))));
           func = WithFields(func, func->params, VisitExpr(func->body), func->ret_type,
                             func->type_params, DictAttrs(new_dict));
 
diff --git a/src/runtime/aot_executor/aot_executor.cc b/src/runtime/aot_executor/aot_executor.cc
index 985c857ed55f..7f7daabf3fc2 100644
--- a/src/runtime/aot_executor/aot_executor.cc
+++ b/src/runtime/aot_executor/aot_executor.cc
@@ -27,6 +27,7 @@
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/data_type.h>
+#include <tvm/runtime/name_transforms.h>
 
 #include <limits>
 #include <memory>
@@ -98,7 +99,7 @@ PackedFunc AotExecutor::GetFunction(const std::string& name,
   if (name == "set_input") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
       if (String::CanConvertFrom(args[0])) {
-        int in_idx = this->GetInputIndex(args[0].operator String());
+        int in_idx = this->GetInputIndex(tvm::runtime::SanitizeName(args[0].operator String()));
         if (in_idx >= 0) this->SetInput(in_idx, args[1]);
       } else {
         this->SetInput(args[0], args[1]);
@@ -107,7 +108,7 @@ PackedFunc AotExecutor::GetFunction(const std::string& name,
   } else if (name == "set_input_zero_copy") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
       if (String::CanConvertFrom(args[0])) {
-        int in_idx = this->GetInputIndex(args[0].operator String());
+        int in_idx = this->GetInputIndex(tvm::runtime::SanitizeName(args[0].operator String()));
         if (in_idx >= 0) this->SetInputZeroCopy(in_idx, args[1]);
       } else {
         this->SetInputZeroCopy(args[0], args[1]);
@@ -116,7 +117,7 @@ PackedFunc AotExecutor::GetFunction(const std::string& name,
   } else if (name == "set_output_zero_copy") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
       if (String::CanConvertFrom(args[0])) {
-        int out_idx = this->GetOutputIndex(args[0].operator String());
+        int out_idx = this->GetOutputIndex(tvm::runtime::SanitizeName(args[0].operator String()));
         if (out_idx >= 0) this->SetOutputZeroCopy(out_idx, args[1]);
       } else {
         this->SetOutputZeroCopy(args[0], args[1]);
@@ -134,7 +135,7 @@ PackedFunc AotExecutor::GetFunction(const std::string& name,
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
       int in_idx = 0;
       if (String::CanConvertFrom(args[0])) {
-        in_idx = this->GetInputIndex(args[0].operator String());
+        in_idx = this->GetInputIndex(tvm::runtime::SanitizeName(args[0].operator String()));
       } else {
         in_idx = args[0];
       }
@@ -153,7 +154,7 @@ PackedFunc AotExecutor::GetFunction(const std::string& name,
   } else if (name == "get_input_index") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
       CHECK(String::CanConvertFrom(args[0])) << "Input key is not a string";
-      *rv = this->GetInputIndex(args[0].operator String());
+      *rv = this->GetInputIndex(tvm::runtime::SanitizeName(args[0].operator String()));
     });
   } else {
     return PackedFunc();
diff --git a/src/runtime/name_transforms.cc b/src/runtime/name_transforms.cc
new file mode 100644
index 000000000000..608b88ac430e
--- /dev/null
+++ b/src/runtime/name_transforms.cc
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <tvm/runtime/logging.h>
+#include <tvm/runtime/name_transforms.h>
+#include <tvm/runtime/registry.h>
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+
+namespace tvm {
+namespace runtime {
+
+std::string SanitizeName(const std::string& name) {
+  ICHECK(!name.empty()) << "Name is empty";
+
+  auto isNotAlnum = [](char c) { return !std::isalnum(c); };
+  std::string sanitized_input = name;
+  std::replace_if(sanitized_input.begin(), sanitized_input.end(), isNotAlnum, '_');
+
+  return sanitized_input;
+}
+
+TVM_REGISTER_GLOBAL("runtime.SanitizeName").set_body_typed(SanitizeName);
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/target/source/interface_c.cc b/src/target/source/interface_c.cc
index fa38d9b9f4d1..ed7058f1f198 100644
--- a/src/target/source/interface_c.cc
+++ b/src/target/source/interface_c.cc
@@ -25,6 +25,7 @@
 #include <tvm/runtime/container/array.h>
 #include <tvm/runtime/container/string.h>
 #include <tvm/runtime/module.h>
+#include <tvm/runtime/name_transforms.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/tir/usmp/utils.h>
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index 88a7a99b4c25..ce5f5d5b5357 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -26,6 +26,7 @@
 #include <dmlc/memory_io.h>
 #include <tvm/runtime/metadata.h>
 #include <tvm/runtime/module.h>
+#include <tvm/runtime/name_transforms.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
@@ -507,7 +508,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode {
           } else {
             codegen_c_base_.PrintType(input_var.dtype(), call_args_ss);
           }
-          call_args_ss << " " << relay::backend::SanitizeName(input_var->name_hint) << ",";
+          call_args_ss << " " << tvm::runtime::SanitizeName(input_var->name_hint) << ",";
         }
         for (unsigned int i = 0; i < metadata_->outputs.size(); ++i) {
           call_args_ss << "void* output" << i << ",";
@@ -565,10 +566,10 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode {
       std::stringstream call_args_ss;
       if (metadata_->io_pool_allocations.empty()) {
         for (const auto& input : metadata_->inputs) {
-          call_args_ss << "inputs->" << relay::backend::SanitizeName(input->name_hint) << ",";
+          call_args_ss << "inputs->" << tvm::runtime::SanitizeName(input->name_hint) << ",";
         }
         for (const auto& output : metadata_->outputs) {
-          call_args_ss << "outputs->" << relay::backend::SanitizeName(output);
+          call_args_ss << "outputs->" << tvm::runtime::SanitizeName(output);
           call_args_ss << ",";
         }
       }
@@ -578,7 +579,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode {
         if (IsInternalWorkspaceBuffer(pool_var)) {
           call_args_ss << "&" << pool_name << ",";
         } else {
-          call_args_ss << "workspace_pools->" << relay::backend::SanitizeName(pool_name) << ",";
+          call_args_ss << "workspace_pools->" << tvm::runtime::SanitizeName(pool_name) << ",";
         }
       }
       for (const String& device : metadata_->devices) {
diff --git a/tests/cpp/name_transforms_test.cc b/tests/cpp/name_transforms_test.cc
index 09a5bbfb583a..12a2ce1d0761 100644
--- a/tests/cpp/name_transforms_test.cc
+++ b/tests/cpp/name_transforms_test.cc
@@ -21,6 +21,7 @@
 
 #include <gtest/gtest.h>
 #include <tvm/runtime/container/string.h>
+#include <tvm/runtime/name_transforms.h>
 
 using namespace tvm::relay::backend;
 using namespace tvm::runtime;
diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py
index b67bc90d34fd..89c34eaac8b6 100644
--- a/tests/python/relay/aot/test_cpp_aot.py
+++ b/tests/python/relay/aot/test_cpp_aot.py
@@ -118,7 +118,7 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(3, 3, 5, 5),
 
 @pytest.mark.parametrize("enable_usmp", [True, False])
 @pytest.mark.parametrize("target_kind", ["c", "llvm"])
-def test_mobilenet(enable_usmp, target_kind):
+def test_mobilenet(enable_usmp: bool, target_kind: str):
     """Full network test with Mobilenet"""
     ir_mod, params = testing.mobilenet.get_workload(batch_size=1)
     data_shape = [int(x) for x in ir_mod["main"].checked_type.arg_types[0].shape]
@@ -203,5 +203,44 @@ def test_pass_wrong_device_arg():
     # TODO write asserts for # and type of device.
 
 
+@pytest.mark.parametrize("target_kind", ["c", "llvm"])
+@pytest.mark.parametrize("input_name", ["input:0", "input@0", "input_0"])
+def test_aot_input_name_with_special_character(target_kind: str, input_name: str):
+    """Test name transforms in AOT for input names with special characters."""
+    dtype = "float32"
+    input_1 = relay.var(input_name, shape=(10, 5), dtype=dtype)
+    weight = relay.var("weight", shape=(1, 5), dtype=dtype)
+    output = relay.add(input_1, weight)
+    func = relay.Function([input_1, weight], output)
+
+    input_data = np.random.rand(10, 5).astype(dtype)
+    weight_data = np.random.rand(1, 5).astype(dtype)
+    expected_output = input_data + weight_data
+    params = {"weight": weight_data}
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        mod = tvm.relay.build(
+            tvm.IRModule.from_expr(func),
+            target=target_kind,
+            params=params,
+            executor=tvm.relay.backend.Executor("aot", {"interface-api": "packed"}),
+        )
+    temp_dir = tvm.contrib.utils.TempDirectory()
+    test_so_path = temp_dir / "test.so"
+    mod.export_library(test_so_path, cc="c++", options=["-std=gnu++17", "-g3", "-O0"])
+    # test both original name and transformed name
+    for name in ["input_0", input_name]:
+        loaded_mod = tvm.runtime.load_module(test_so_path)
+        runner = tvm.runtime.executor.AotModule(loaded_mod["default"](tvm.cpu(0)))
+        inputs = {name: input_data}
+        runner.set_input(**inputs)
+
+        input_ind = runner.get_input_index(name)
+        assert (runner.get_input(input_ind).asnumpy() == input_data).all()
+
+        runner.run()
+        assert (runner.get_output(0).asnumpy() == expected_output).all()
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/relay/test_name_transforms.py b/tests/python/relay/test_name_transforms.py
index 1c3435a6cc85..72976dc19c21 100644
--- a/tests/python/relay/test_name_transforms.py
+++ b/tests/python/relay/test_name_transforms.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import pytest
 
 from tvm import TVMError
 from tvm.relay.backend.name_transforms import (
@@ -22,9 +23,8 @@
     to_c_constant_style,
     prefix_name,
     prefix_generated_name,
-    sanitize_name,
 )
-import pytest
+from tvm.runtime.name_transforms import sanitize_name
 
 
 def test_to_c_function_style():

From b1c8c90384b88d67db15199cd78902b10fd8230e Mon Sep 17 00:00:00 2001
From: Havisha Panda <97978678+hpanda-naut@users.noreply.github.com>
Date: Mon, 17 Oct 2022 11:16:45 -0400
Subject: [PATCH 365/704] [skip ci] Added label tags links to the wiki page in
 issue templates to align with Issue Tracking RFC (#12988)

* [skip ci] Added links to label tag wiki page in issue templates to align with Issue Tracking RFC

* format changes
---
 .github/ISSUE_TEMPLATE/bug-report.md       | 4 ++--
 .github/ISSUE_TEMPLATE/ci-problem.md       | 4 ++--
 .github/ISSUE_TEMPLATE/documentation.md    | 4 ++--
 .github/ISSUE_TEMPLATE/feature-tracking.md | 4 ++--
 .github/ISSUE_TEMPLATE/flaky-test.md       | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
index b541eb3a317c..359a441ecf8f 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -1,6 +1,6 @@
 ---
 name: "\U0001F41B Bug report"
-about: Please include a description of your environment, preferably a minimum script to reproduce the problem. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
+about: Please include a description of your environment, preferably a minimum script to reproduce the problem. Find the list of label tags at https://tinyurl.com/5fnd5rkn.
 title: "[Bug] "
 labels: "needs-triage, type: bug"
 ---
@@ -27,6 +27,6 @@ Preferably a minimal script to cause the issue to occur.
 
 ### Triage
 
-Please refer to the list of label tags linked above to find the relevant tags and add them here in a bullet format (example below).
+Please refer to the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels) to find the relevant tags and add them below in a bullet format (example below).
 
 * needs-triage
diff --git a/.github/ISSUE_TEMPLATE/ci-problem.md b/.github/ISSUE_TEMPLATE/ci-problem.md
index 73e485fbcac0..b1b874e848f8 100644
--- a/.github/ISSUE_TEMPLATE/ci-problem.md
+++ b/.github/ISSUE_TEMPLATE/ci-problem.md
@@ -1,6 +1,6 @@
 ---
 name: "\U0000274C CI Problem"
-about: To help the developers act on these problems, please give us as many details of the CI failure as possible. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
+about: To help the developers act on these problems, please give us as many details of the CI failure as possible. Find the list of label tags at https://tinyurl.com/5fnd5rkn.
 title: "[CI Problem] "
 labels: "needs-triage, type:ci"
 ---
@@ -23,6 +23,6 @@ Have you seen this multiple times in this branch or in other branches?
 
 ### Triage
 
-Please refer to the list of label tags linked above to find the relevant tags and add them here in a bullet format (example below).
+Please refer to the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels) to find the relevant tags and add them below in a bullet format (example below).
 
 * needs-triage
diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md
index f41d1238b0ec..d6ae1180ee41 100644
--- a/.github/ISSUE_TEMPLATE/documentation.md
+++ b/.github/ISSUE_TEMPLATE/documentation.md
@@ -1,6 +1,6 @@
 ---
 name: "\U0001F4C4 Documentation"
-about: Use this template to suggest additions and changes to the documentation. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
+about: Use this template to suggest additions and changes to the documentation. Find the list of label tags at https://tinyurl.com/5fnd5rkn.
 title: "[Docs] "
 labels: "needs-triage, type: doc"
 ---
@@ -21,7 +21,7 @@ Otherwise, specify what actions should be taken to provide additional clarity/re
 
 ### Triage
 
-Please refer to the list of label tags linked above to find the relevant tags and add them here in a bullet format (example below).
+Please refer to the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels) to find the relevant tags and add them below in a bullet format (example below).
 
 * needs-triage
 
diff --git a/.github/ISSUE_TEMPLATE/feature-tracking.md b/.github/ISSUE_TEMPLATE/feature-tracking.md
index 2113f4cbcff9..af217215d605 100644
--- a/.github/ISSUE_TEMPLATE/feature-tracking.md
+++ b/.github/ISSUE_TEMPLATE/feature-tracking.md
@@ -1,6 +1,6 @@
 ---
 name: "\U0001F527 Feature Tracking"
-about: List clear, small actionable items so we can track the progress of the change. Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
+about: List clear, small actionable items so we can track the progress of the change. Find the list of label tags at https://tinyurl.com/5fnd5rkn.
 title: "[Tracking Issue] "
 labels: "needs-triage, type:rfc-tracking"
 ---
@@ -14,6 +14,6 @@ Issues that are inactive for a period of time may get closed. We adopt this poli
 
 ### Triage
 
-Please refer to the list of label tags linked above to find the relevant tags and add them here in a bullet format (example below).
+Please refer to the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels) to find the relevant tags and add them below in a bullet format (example below).
 
 * needs-triage
diff --git a/.github/ISSUE_TEMPLATE/flaky-test.md b/.github/ISSUE_TEMPLATE/flaky-test.md
index d20da597d1de..bf64c08bbeba 100644
--- a/.github/ISSUE_TEMPLATE/flaky-test.md
+++ b/.github/ISSUE_TEMPLATE/flaky-test.md
@@ -1,6 +1,6 @@
 ---
 name: "\U00002744 Flaky Test"
-about: Report flaky tests, make sure to include link to CI runs, a sample failure log, and the name of the test(s). Find the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels).
+about: Report flaky tests, make sure to include link to CI runs, a sample failure log, and the name of the test(s). Find the list of label tags at https://tinyurl.com/5fnd5rkn.
 title: "[Flaky Test] "
 labels: "needs-triage, test: flaky"
 ---
@@ -19,6 +19,6 @@ These tests were found to be flaky (intermittently failing on `main` or failed i
 
 ### Triage
 
-Please refer to the list of label tags linked above to find the relevant tags and add them here in a bullet format (example below).
+Please refer to the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels) to find the relevant tags and add them below in a bullet format (example below).
 
 * needs-triage

From 34c43d43ea90548f0798f709635ab1c76727cb6d Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 18 Oct 2022 03:17:39 +0900
Subject: [PATCH 366/704] [MetaSchedule] Introduce a variant of ModuleEquality
 to enable ignoring NDArray raw data (#13091)

A follow up to https://github.com/apache/tvm/pull/13050, also builds on https://github.com/apache/tvm/pull/13001. This PR enables the functionality in https://github.com/apache/tvm/pull/12706 without changing the existing `StructuralEqual/Hash`.

A question for discussion: Should this be the default ModuleEquality used by MS? It has no effect for the `link-params = False` case, and it simplifies the MS tuning API usage for the `link-params = True` case (Hexagon etc).
---
 include/tvm/meta_schedule/database.h          |  4 ++
 .../meta_schedule/database/json_database.py   |  2 +
 .../meta_schedule/database/memory_database.py |  2 +
 .../database/schedule_fn_database.py          |  2 +
 python/tvm/meta_schedule/relay_integration.py |  4 ++
 python/tvm/meta_schedule/tune.py              |  2 +
 src/meta_schedule/module_equality.cc          | 42 +++++++++++++
 src/meta_schedule/module_equality.h           |  2 +
 src/node/ndarray_hash_equal.h                 | 52 ++++++++++++++++
 src/node/structural_equal.cc                  | 35 +++++++++++
 src/node/structural_hash.cc                   | 51 ++++++----------
 .../test_meta_schedule_relay_integration.py   | 59 +++++++++++++++++++
 12 files changed, 224 insertions(+), 33 deletions(-)
 create mode 100644 src/node/ndarray_hash_equal.h

diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h
index bcdffe9ff33b..9eead8d5ec31 100644
--- a/include/tvm/meta_schedule/database.h
+++ b/include/tvm/meta_schedule/database.h
@@ -181,6 +181,8 @@ class DatabaseNode : public runtime::Object {
    * \param mod_eq_name A string to specify the module equality testing and hashing method.
    *  It must be one of the followings:
    *    - "structural": Use StructuralEqual/Hash
+   *    - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
+   *                        equality testing and hashing.
    */
   explicit DatabaseNode(String mod_eq_name = "structural");
 
@@ -270,6 +272,8 @@ class PyDatabaseNode : public DatabaseNode {
    * \param mod_eq_name A string to specify the module equality testing and hashing method.
    *  It must be one of the followings:
    *    - "structural": Use StructuralEqual/Hash
+   *    - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
+   *                        equality testing and hashing.
    */
   explicit PyDatabaseNode(String mod_eq_name = "structural");
 
diff --git a/python/tvm/meta_schedule/database/json_database.py b/python/tvm/meta_schedule/database/json_database.py
index aedc83ad89b3..f81d8913c18a 100644
--- a/python/tvm/meta_schedule/database/json_database.py
+++ b/python/tvm/meta_schedule/database/json_database.py
@@ -38,6 +38,8 @@ class JSONDatabase(Database):
         A string to specify the module equality testing and hashing method.
         It must be one of the followings:
           - "structural": Use StructuralEqual/Hash
+          - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
+                              equality testing and hashing.
     """
 
     path_workload: str
diff --git a/python/tvm/meta_schedule/database/memory_database.py b/python/tvm/meta_schedule/database/memory_database.py
index e07f325d9d3d..96b9bb5a0112 100644
--- a/python/tvm/meta_schedule/database/memory_database.py
+++ b/python/tvm/meta_schedule/database/memory_database.py
@@ -31,6 +31,8 @@ class MemoryDatabase(Database):
         A string to specify the module equality testing and hashing method.
         It must be one of the followings:
           - "structural": Use StructuralEqual/Hash
+          - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
+                              equality testing and hashing.
     """
 
     def __init__(
diff --git a/python/tvm/meta_schedule/database/schedule_fn_database.py b/python/tvm/meta_schedule/database/schedule_fn_database.py
index 273b84185287..7a0b433996c5 100644
--- a/python/tvm/meta_schedule/database/schedule_fn_database.py
+++ b/python/tvm/meta_schedule/database/schedule_fn_database.py
@@ -37,6 +37,8 @@ class ScheduleFnDatabase(Database):
         A string to specify the module equality testing and hashing method.
         It must be one of the followings:
           - "structural": Use StructuralEqual/Hash
+          - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
+                              equality testing and hashing.
     """
 
     def __init__(
diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py
index b9c34e509ab4..089f6e412e20 100644
--- a/python/tvm/meta_schedule/relay_integration.py
+++ b/python/tvm/meta_schedule/relay_integration.py
@@ -141,6 +141,8 @@ def extract_tasks(
         A string to specify the module equality testing and hashing method.
         It must be one of the followings:
           - "structural": Use StructuralEqual/Hash
+          - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
+                              equality testing and hashing.
 
     Returns
     -------
@@ -284,6 +286,8 @@ def tune_relay(
         A string to specify the module equality testing and hashing method.
         It must be one of the followings:
           - "structural": Use StructuralEqual/Hash
+          - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
+                              equality testing and hashing.
 
     Returns
     -------
diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py
index 66cb60c32902..07021eac3998 100644
--- a/python/tvm/meta_schedule/tune.py
+++ b/python/tvm/meta_schedule/tune.py
@@ -74,6 +74,8 @@ def tune_tasks(
         A string to specify the module equality testing and hashing method.
         It must be one of the followings:
           - "structural": Use StructuralEqual/Hash
+          - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
+                              equality testing and hashing.
 
     Returns
     -------
diff --git a/src/meta_schedule/module_equality.cc b/src/meta_schedule/module_equality.cc
index 084ae74bb09c..caa7da170bd6 100644
--- a/src/meta_schedule/module_equality.cc
+++ b/src/meta_schedule/module_equality.cc
@@ -24,6 +24,8 @@
 
 #include <memory>
 
+#include "../node/ndarray_hash_equal.h"
+
 namespace tvm {
 namespace meta_schedule {
 
@@ -33,9 +35,49 @@ class ModuleEqualityStructural : public ModuleEquality {
   bool Equal(IRModule lhs, IRModule rhs) const { return tvm::StructuralEqual()(lhs, rhs); }
 };
 
+class SEqualHandlerIgnoreNDArray : public SEqualHandlerDefault {
+ public:
+  SEqualHandlerIgnoreNDArray() : SEqualHandlerDefault(false, nullptr) {}
+
+ protected:
+  bool DispatchSEqualReduce(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars,
+                            const Optional<ObjectPathPair>& current_paths) {
+    if (auto lhs_ptr = lhs.as<runtime::NDArray::Container>(),
+        rhs_ptr = rhs.as<runtime::NDArray::Container>();
+        lhs_ptr && rhs_ptr) {
+      SEqualReducer reducer(this, nullptr, map_free_vars);
+      return NDArrayEqual(lhs_ptr, rhs_ptr, reducer, false);
+    }
+    return SEqualHandlerDefault::DispatchSEqualReduce(lhs, rhs, map_free_vars, current_paths);
+  }
+};
+
+class SHashHandlerIgnoreNDArray : public SHashHandlerDefault {
+ protected:
+  void DispatchSHash(const ObjectRef& object, bool map_free_vars) override {
+    ICHECK(object.defined());
+    if (auto ndarray = object.as<runtime::NDArray::Container>()) {
+      SHashReducer hash_reduce(this, map_free_vars);
+      NDArrayHash(ndarray, &hash_reduce, false);
+    } else {
+      SHashHandlerDefault::DispatchSHash(object, map_free_vars);
+    }
+  }
+};
+
+class ModuleEqualityIgnoreNDArray : public ModuleEquality {
+ public:
+  size_t Hash(IRModule mod) const { return SHashHandlerIgnoreNDArray().Hash(mod, false); }
+  bool Equal(IRModule lhs, IRModule rhs) const {
+    return SEqualHandlerIgnoreNDArray().Equal(lhs, rhs, false);
+  }
+};
+
 std::unique_ptr<ModuleEquality> ModuleEquality::Create(const std::string& mod_eq_name) {
   if (mod_eq_name == "structural") {
     return std::make_unique<ModuleEqualityStructural>();
+  } else if (mod_eq_name == "ignore-ndarray") {
+    return std::make_unique<ModuleEqualityIgnoreNDArray>();
   }
   LOG(FATAL) << "Unknown module equality " << mod_eq_name;
   return nullptr;
diff --git a/src/meta_schedule/module_equality.h b/src/meta_schedule/module_equality.h
index 3e6fb55d8a9b..8c99b563551b 100644
--- a/src/meta_schedule/module_equality.h
+++ b/src/meta_schedule/module_equality.h
@@ -40,6 +40,8 @@ class ModuleEquality {
    * \param mod_eq_name A string to specify the module equality testing and hashing method.
    *  It must be one of the followings:
    *    - "structural": Use StructuralEqual/Hash
+   *    - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
+   *                        equality testing and hashing.
    * \return An owning pointer to the created instance
    */
   static std::unique_ptr<ModuleEquality> Create(const std::string& mod_eq_name);
diff --git a/src/node/ndarray_hash_equal.h b/src/node/ndarray_hash_equal.h
new file mode 100644
index 000000000000..d674018fbdd2
--- /dev/null
+++ b/src/node/ndarray_hash_equal.h
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_NODE_NDARRAY_HASH_EQUAL_H_
+#define TVM_NODE_NDARRAY_HASH_EQUAL_H_
+
+#include <tvm/runtime/ndarray.h>
+
+namespace tvm {
+
+class SEqualReducer;
+class SHashReducer;
+
+/*!
+ * \brief Test two NDArrays for equality.
+ * \param lhs The left operand.
+ * \param rhs The right operand.
+ * \param equal A Reducer class to reduce the structural equality result of two objects.
+ * See tvm/node/structural_equal.h.
+ * \param compare_data Whether or not to consider ndarray raw data in the equality testing.
+ * \return The equality testing result.
+ */
+bool NDArrayEqual(const runtime::NDArray::Container* lhs, const runtime::NDArray::Container* rhs,
+                  SEqualReducer equal, bool compare_data);
+
+/*!
+ * \brief Hash NDArray.
+ * \param arr The NDArray to compute the hash for.
+ * \param hash_reduce A Reducer class to reduce the structural hash value.
+ * See tvm/node/structural_hash.h.
+ * \param hash_data Whether or not to hash ndarray raw data.
+ */
+void NDArrayHash(const runtime::NDArray::Container* arr, SHashReducer* hash_reduce, bool hash_data);
+
+}  // namespace tvm
+
+#endif  //  TVM_NODE_NDARRAY_HASH_EQUAL_H_
diff --git a/src/node/structural_equal.cc b/src/node/structural_equal.cc
index 0a9a0ec0bbb7..0290b7afe3fd 100644
--- a/src/node/structural_equal.cc
+++ b/src/node/structural_equal.cc
@@ -29,6 +29,8 @@
 
 #include <unordered_map>
 
+#include "ndarray_hash_equal.h"
+
 namespace tvm {
 
 TVM_REGISTER_OBJECT_TYPE(ObjectPathPairNode);
@@ -476,4 +478,37 @@ bool StructuralEqual::operator()(const ObjectRef& lhs, const ObjectRef& rhs) con
   return SEqualHandlerDefault(false, nullptr).Equal(lhs, rhs, false);
 }
 
+bool NDArrayEqual(const runtime::NDArray::Container* lhs, const runtime::NDArray::Container* rhs,
+                  SEqualReducer equal, bool compare_data) {
+  if (lhs == rhs) return true;
+
+  auto ldt = lhs->dl_tensor.dtype;
+  auto rdt = rhs->dl_tensor.dtype;
+  ICHECK_EQ(lhs->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor";
+  ICHECK_EQ(rhs->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor";
+  ICHECK(runtime::IsContiguous(lhs->dl_tensor)) << "Can only compare contiguous tensor";
+  ICHECK(runtime::IsContiguous(rhs->dl_tensor)) << "Can only compare contiguous tensor";
+
+  if (lhs->dl_tensor.ndim != rhs->dl_tensor.ndim) return false;
+  for (int i = 0; i < lhs->dl_tensor.ndim; ++i) {
+    if (!equal(lhs->dl_tensor.shape[i], rhs->dl_tensor.shape[i])) return false;
+  }
+  if (ldt.code == rdt.code && ldt.lanes == rdt.lanes && ldt.bits == rdt.bits) {
+    size_t data_size = runtime::GetDataSize(lhs->dl_tensor);
+    if (compare_data) {
+      return std::memcmp(lhs->dl_tensor.data, rhs->dl_tensor.data, data_size) == 0;
+    } else {
+      return true;
+    }
+  } else {
+    return false;
+  }
+}
+
+bool NDArrayContainerTrait::SEqualReduce(const runtime::NDArray::Container* lhs,
+                                         const runtime::NDArray::Container* rhs,
+                                         SEqualReducer equal) {
+  return NDArrayEqual(lhs, rhs, equal, true);
+}
+
 }  // namespace tvm
diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc
index a355e44028b6..1d1185cddc3d 100644
--- a/src/node/structural_hash.cc
+++ b/src/node/structural_hash.cc
@@ -35,6 +35,7 @@
 #include "../support/base64.h"
 #include "../support/str_escape.h"
 #include "../support/utils.h"
+#include "ndarray_hash_equal.h"
 
 namespace tvm {
 
@@ -359,41 +360,25 @@ struct ADTObjTrait {
 
 TVM_REGISTER_REFLECTION_VTABLE(runtime::ADTObj, ADTObjTrait);
 
-void NDArrayContainerTrait::SHashReduce(const runtime::NDArray::Container* key,
-                                        SHashReducer hash_reduce) {
-  ICHECK_EQ(key->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor";
-  ICHECK(runtime::IsContiguous(key->dl_tensor)) << "Can only hash contiguous tensor";
-  hash_reduce(runtime::DataType(key->dl_tensor.dtype));
-  hash_reduce(key->dl_tensor.ndim);
-  for (int i = 0; i < key->dl_tensor.ndim; ++i) {
-    hash_reduce(key->dl_tensor.shape[i]);
-  }
-  hash_reduce->SHashReduceHashedValue(runtime::String::HashBytes(
-      static_cast<const char*>(key->dl_tensor.data), runtime::GetDataSize(key->dl_tensor)));
+void NDArrayHash(const runtime::NDArray::Container* arr, SHashReducer* hash_reduce,
+                 bool hash_data) {
+  ICHECK_EQ(arr->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor";
+  ICHECK(runtime::IsContiguous(arr->dl_tensor)) << "Can only hash contiguous tensor";
+  (*hash_reduce)(runtime::DataType(arr->dl_tensor.dtype));
+  (*hash_reduce)(arr->dl_tensor.ndim);
+  for (int i = 0; i < arr->dl_tensor.ndim; ++i) {
+    (*hash_reduce)(arr->dl_tensor.shape[i]);
+  }
+  if (hash_data) {
+    (*hash_reduce)
+        ->SHashReduceHashedValue(runtime::String::HashBytes(
+            static_cast<const char*>(arr->dl_tensor.data), runtime::GetDataSize(arr->dl_tensor)));
+  }
 }
 
-bool NDArrayContainerTrait::SEqualReduce(const runtime::NDArray::Container* lhs,
-                                         const runtime::NDArray::Container* rhs,
-                                         SEqualReducer equal) {
-  if (lhs == rhs) return true;
-
-  auto ldt = lhs->dl_tensor.dtype;
-  auto rdt = rhs->dl_tensor.dtype;
-  ICHECK_EQ(lhs->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor";
-  ICHECK_EQ(rhs->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor";
-  ICHECK(runtime::IsContiguous(lhs->dl_tensor)) << "Can only compare contiguous tensor";
-  ICHECK(runtime::IsContiguous(rhs->dl_tensor)) << "Can only compare contiguous tensor";
-
-  if (lhs->dl_tensor.ndim != rhs->dl_tensor.ndim) return false;
-  for (int i = 0; i < lhs->dl_tensor.ndim; ++i) {
-    if (!equal(lhs->dl_tensor.shape[i], rhs->dl_tensor.shape[i])) return false;
-  }
-  if (ldt.code == rdt.code && ldt.lanes == rdt.lanes && ldt.bits == rdt.bits) {
-    size_t data_size = runtime::GetDataSize(lhs->dl_tensor);
-    return std::memcmp(lhs->dl_tensor.data, rhs->dl_tensor.data, data_size) == 0;
-  } else {
-    return false;
-  }
+void NDArrayContainerTrait::SHashReduce(const runtime::NDArray::Container* key,
+                                        SHashReducer hash_reduce) {
+  NDArrayHash(key, &hash_reduce, /*bool hash_data*/ true);
 }
 
 TVM_REGISTER_REFLECTION_VTABLE(runtime::NDArray::Container, NDArrayContainerTrait)
diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py
index d5c81bcc56ba..e9908cbfde14 100644
--- a/tests/python/unittest/test_meta_schedule_relay_integration.py
+++ b/tests/python/unittest/test_meta_schedule_relay_integration.py
@@ -568,5 +568,64 @@ def test_rewrite_layout_link_params():
         np.testing.assert_allclose(ref, out, rtol=1e-4, atol=1e-4)
 
 
+def test_module_equality_ignore_ndarray():
+    target = "llvm --num-cores=4"
+
+    data_shape = (128, 128)
+    weight_shape1 = (128, 128)
+    weight_shape2 = (128, 128)
+
+    data = relay.var("data", shape=data_shape, dtype="float32")
+    weight1 = relay.var("weight1", shape=weight_shape1, dtype="float32")
+    weight2 = relay.var("weight2", shape=weight_shape2, dtype="float32")
+    dense1 = relay.nn.dense(data, weight1)
+    dense2 = relay.nn.dense(dense1, weight2)
+    mod = tvm.IRModule.from_expr(dense2)
+
+    weight1_np = np.random.randn(*weight_shape1).astype("float32")
+    weight2_np = np.random.randn(*weight_shape2).astype("float32")
+
+    params = {"weight1": weight1_np, "weight2": weight2_np}
+
+    executor = relay.backend.Executor("graph", {"link-params": True})
+    mod = mod.with_attr("executor", executor)
+
+    # Without using ignore-ndarray for module equality, we get duplicated tasks
+    assert len(ms.relay_integration.extract_tasks(mod, target, params)) == 2
+
+    module_eqality = "ignore-ndarray"
+    extracted_tasks = ms.relay_integration.extract_tasks(
+        mod, target, params, module_equality=module_eqality
+    )
+
+    assert len(extracted_tasks) == 1
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        tasks, task_weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
+            extracted_tasks, work_dir, strategy="replay-trace"
+        )
+        database = ms.tune.tune_tasks(
+            tasks=tasks,
+            task_weights=task_weights,
+            work_dir=work_dir,
+            max_trials_global=4,
+            module_equality=module_eqality,
+        )
+        lib = ms.relay_integration.compile_relay(database, mod, target, params)
+
+    dev = tvm.device(target, 0)
+    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+
+    data_np = np.random.randn(*data_shape).astype("float32")
+
+    runtime.set_input("data", data_np)
+    runtime.run()
+
+    out = runtime.get_output(0).numpy()
+
+    ref = np.dot(np.dot(data_np, weight1_np.transpose()), weight2_np.transpose())
+    np.testing.assert_allclose(ref, out, rtol=1e-4, atol=1e-4)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 42d9c38a4d061f8128cd172faa1b94e2e0b546ea Mon Sep 17 00:00:00 2001
From: Mengyang Liu <49838178+liu-mengyang@users.noreply.github.com>
Date: Tue, 18 Oct 2022 02:21:25 +0800
Subject: [PATCH 367/704] [Doc] Fix typo in the document of installing
 from_source (#13090)

Fix typo in the document of installing from source in `tvm/docs/install/from_source.rst`: `ontain -> obtain a visual studio compiler`.
---
 docs/install/from_source.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 33328c586760..b92a921d61b6 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -240,7 +240,7 @@ If you are already using conda as your package manager and wish to directly buil
 
 Building on Windows
 ~~~~~~~~~~~~~~~~~~~
-TVM support build via MSVC using cmake. You will need to ontain a visual studio compiler.
+TVM support build via MSVC using cmake. You will need to obtain a visual studio compiler.
 The minimum required VS version is **Visual Studio Enterprise 2019** (NOTE: we test
 against GitHub Actions' `Windows 2019 Runner <https://github.com/actions/virtual-environments/blob/main/images/win/Windows2019-Readme.md>`_, so see that page for full details.
 We recommend following :ref:`build-with-conda` to obtain necessary dependencies and

From 69ba30e509a3a9ce755117495eba38f934c716b3 Mon Sep 17 00:00:00 2001
From: Wu SiYu <wu.siyu@hotmail.com>
Date: Tue, 18 Oct 2022 02:21:55 +0800
Subject: [PATCH 368/704] [TIR][FIX] Fix crash when using 'if' without 'else'
 in TVMScript (#13054)

This commit fix the tvm's crash during TVMScript's auto-tune when having a
'if' statement which doesn't have a 'else' part.
It's cause in tvm's tir processing (tvm.tir.analysis.estimate_flops), in
VisitStmt_() for IfThenElseNode it trying to visit the 'else' part whether
it is exist.
So in this commit, an if logic is added in VisitStmt_() for
IfThenElseNode, it will not visit the 'else' part if it is not exist.

Co-authored-by: SiYu Wu <wusiyu@buaa.edu.cn>
---
 src/tir/analysis/estimate_flops.cc                |  6 +++++-
 .../test_tir_analysis_estimate_tir_flops.py       | 15 +++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/tir/analysis/estimate_flops.cc b/src/tir/analysis/estimate_flops.cc
index 576476ae30aa..d8faf9bd1362 100644
--- a/src/tir/analysis/estimate_flops.cc
+++ b/src/tir/analysis/estimate_flops.cc
@@ -148,7 +148,11 @@ class FlopEstimator : private ExprFunctor<TResult(const PrimExpr& n)>,
 
   TResult VisitStmt_(const IfThenElseNode* branch) override {
     TResult cond = VisitExpr(branch->condition);
-    cond += VisitStmt(branch->then_case).MaxWith(VisitStmt(branch->else_case));
+    if (branch->else_case.defined()) {
+      cond += VisitStmt(branch->then_case).MaxWith(VisitStmt(branch->else_case));
+    } else {
+      cond += VisitStmt(branch->then_case);
+    }
     return cond;
   }
 
diff --git a/tests/python/unittest/test_tir_analysis_estimate_tir_flops.py b/tests/python/unittest/test_tir_analysis_estimate_tir_flops.py
index 68279043c64e..8c16c81388ed 100644
--- a/tests/python/unittest/test_tir_analysis_estimate_tir_flops.py
+++ b/tests/python/unittest/test_tir_analysis_estimate_tir_flops.py
@@ -62,5 +62,20 @@ def test_flops_with_let():
     assert flops == 8
 
 
+@T.prim_func
+def flops_with_if(a: T.Buffer[16, "float32"], b: T.Buffer[16, "float32"]):
+    for i in range(16):
+        if i % 2 == 0:
+            a[i] = b[i]
+        else:
+            if i % 3 == 0:
+                a[i] = b[i - 1] + b[i - 2]
+
+
+def test_flops_with_if():
+    flops = estimate_tir_flops(IRModule({"main": flops_with_if}))
+    assert flops == 16
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 5dd786b3c903d34faba8e6223eaafbe979e15c11 Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Mon, 17 Oct 2022 15:52:46 -0400
Subject: [PATCH 369/704] [Hexagon] [runtime] VTCM bugfix, runtime buffer
 clarification (#13066)

vtcm bugfix, runtime buffer clarification
---
 src/runtime/hexagon/hexagon_device_api.cc     |  4 +++
 src/runtime/hexagon/hexagon_vtcm_pool.cc      |  6 +++-
 .../hexagon/hexagon_vtcm_pool_tests.cc        | 33 ++++++++++++++++---
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 3574ab50182c..6d223017e270 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -132,6 +132,10 @@ void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
   if (runtime_hexbuffs) {
     runtime_hexbuffs->FreeHexagonBuffer(ptr);
   } else {
+    // Either AcquireResources was never called, or ReleaseResources was called.  Check the
+    // list of buffers that were still allocated at the time of release.  If this buffer is
+    // in that list, this is a no-op as it is being freed as part of another object teardown.
+    // If the pointer isn't in that list, we raise an exception as this is an unexpected Free.
     auto it = std::find(released_runtime_buffers.begin(), released_runtime_buffers.end(), ptr);
     CHECK(it != released_runtime_buffers.end()) << "Attempted to free Hexagon data with "
                                                 << "HexagonDeviceAPI::FreeDataSpace that was not "
diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.cc b/src/runtime/hexagon/hexagon_vtcm_pool.cc
index 63c815a6efca..6024550ba732 100644
--- a/src/runtime/hexagon/hexagon_vtcm_pool.cc
+++ b/src/runtime/hexagon/hexagon_vtcm_pool.cc
@@ -69,12 +69,16 @@ void* HexagonVtcmPool::Allocate(size_t nbytes) {
   // If this is not aligned on a 2k block, allocate from the end to avoid fragmentation
   if (nbytes & size_t(0x7FF)) {
     DLOG(INFO) << "VTCM nbytes requested: " << nbytes << " allocate from the end";
-    auto last_free_entry = free_.rbegin();
+    auto last_free_entry = free_.end();
+    last_free_entry--;
     CHECK(last_free_entry->second >= nbytes)
         << "Not enough contiguous VTCM space at the end to allocate";
     char* ptr = last_free_entry->first + (last_free_entry->second - nbytes);
     allocations_.emplace_back(std::pair<char*, size_t>(ptr, nbytes));
     last_free_entry->second -= nbytes;
+    if (last_free_entry->second == 0) {
+      free_.erase(last_free_entry);
+    }
     // DebugDump();
     return ptr;
   }
diff --git a/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc b/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
index 5c017b58a3a2..13c459be0c34 100644
--- a/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
@@ -71,6 +71,33 @@ TEST_F(HexagonVtcmPoolTest, basic) {
   EXPECT_THROW(ptr = vtcm_pool->Allocate(1), InternalError);
 }
 
+TEST_F(HexagonVtcmPoolTest, small_allocations) {
+  void* ptr1;
+  void* ptr2;
+  void* ptr3;
+  void* ptr4;
+
+  // Allocate small chunk from the back
+  ptr1 = vtcm_pool->Allocate(min_bytes);
+
+  // Allocate from the front
+  ptr2 = vtcm_pool->Allocate(two_k_block);
+
+  // Allocate the rest
+  ptr3 = vtcm_pool->Allocate(max_bytes - min_bytes - two_k_block);
+
+  // Should be no more memory left
+  EXPECT_THROW(ptr4 = vtcm_pool->Allocate(min_bytes), InternalError);
+
+  vtcm_pool->Free(ptr1, min_bytes);
+  vtcm_pool->Free(ptr2, two_k_block);
+  vtcm_pool->Free(ptr3, max_bytes - min_bytes - two_k_block);
+
+  // Make sure at the end we have the full amount available again
+  ptr4 = vtcm_pool->Allocate(max_bytes);
+  vtcm_pool->Free(ptr4, max_bytes);
+}
+
 TEST_F(HexagonVtcmPoolTest, no_free_vtcm) {
   void* ptr = vtcm_pool->Allocate(max_bytes);
   EXPECT_THROW(vtcm_pool->Allocate(min_bytes), InternalError);
@@ -132,8 +159,7 @@ TEST_F(HexagonVtcmPoolTest, free_alloc_combinations) {
   vtcm_pool->Free(ptr3, two_k_block);
   vtcm_pool->Free(ptr2, two_k_block);
 
-  // Make sure at the end we have the full amount
-  // available again
+  // Make sure at the end we have the full amount available again
   ptr4 = vtcm_pool->Allocate(max_bytes);
   vtcm_pool->Free(ptr4, max_bytes);
 }
@@ -170,8 +196,7 @@ TEST_F(HexagonVtcmPoolTest, vtcm_alignment) {
 
   test_hexbuffs.reset();
 
-  // Make sure at the end we have the full amount
-  // available again
+  // Make sure at the end we have the full amount available again
   ptr = vtcm_pool->Allocate(max_bytes);
   vtcm_pool->Free(ptr, max_bytes);
 }

From 4074127b713bb3ea3d34f18cddebcd5712fda5f8 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Mon, 17 Oct 2022 22:54:21 +0300
Subject: [PATCH 370/704] quic-sanirudh -> Reviewer (#13098)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index a5da6c8abc79..383bd9032683 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -173,6 +173,7 @@ We do encourage everyone to work anything they are interested in.
 - [Jon Soifer](https://github.com/soiferj): @soiferj
 - [Adam Straw](https://github.com/adstraw): @adstraw
 - [Chris Sullivan](https://github.com/csullivan): @csullivan
+- [Anirudh Sundar Subramaniam](https://github.com/quic-sanirudh): @quic-sanirudh
 - [Zhixun Tan](https://github.com/phisiart): @phisiart
 - [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch
 - [Jorn Tuyls](https://github.com/jtuyls): @jtuyls

From c14f5e1e5068a127efec1ca4f2159021b1f8a00d Mon Sep 17 00:00:00 2001
From: AndrewZhaoLuo <andrew.zhao.luo@gmail.com>
Date: Mon, 17 Oct 2022 12:57:25 -0700
Subject: [PATCH 371/704] [ONNX] Handle multiple imports (#13065)

* onnx get right import

* fixins
---
 python/tvm/relay/frontend/onnx.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 84a5fc3b8237..8c4c056221f6 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -5906,7 +5906,16 @@ def from_onnx(
     graph = model.graph
 
     try:
-        opset_in_model = model.opset_import[0].version if model.opset_import else 1
+        opset_in_model = 1
+        if model.opset_import:
+            # TODO: for now we only really support ai.onnx op set
+            # TODO: handle other namespaces well see https://github.com/apache/tvm/issues/10950
+            for opset_identifier in model.opset_import:
+                # As per https://github.com/onnx/onnx/blob/main/docs/IR.md
+                # All operator sets except the default one must specify the operator version
+                if str(opset_identifier.domain) in ["ai.onnx", ""]:
+                    opset_in_model = opset_identifier.version
+                    break
     except AttributeError:
         opset_in_model = 1
 

From f4e917960109ffbc4384c0f2c6e994dd0a3fbc24 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Mon, 17 Oct 2022 12:57:36 -0700
Subject: [PATCH 372/704] [TIR] Fix handling of int64 extent in blockize and
 tensorize (#13069)

* [TIR] Fix handling of int64 extent in blockize

* Fix handling of int64 extent in tensorize

* Update layout_transformation.cc
---
 src/arith/iter_affine_map.cc                  | 41 +++++----
 src/tir/schedule/ir_comparator.cc             | 12 +--
 .../schedule/primitive/blockize_tensorize.cc  |  2 +-
 .../primitive/layout_transformation.cc        | 11 ++-
 .../unittest/test_tir_schedule_blockize.py    | 35 ++++++++
 .../unittest/test_tir_schedule_tensorize.py   | 85 +++++++++++++++++++
 6 files changed, 161 insertions(+), 25 deletions(-)

diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
index d41db2ff135e..7529019abda8 100644
--- a/src/arith/iter_affine_map.cc
+++ b/src/arith/iter_affine_map.cc
@@ -868,10 +868,10 @@ class IterMapRewriter : public ExprMutator {
     IterSumExpr structured_form = expr, flattened_form = expr;
     flattened_form.CopyOnWrite()->args =
         Array<IterSplitExpr>(flattened_iters.rbegin(), flattened_iters.rend());
-    flattened_form.CopyOnWrite()->base = 0;
+    flattened_form.CopyOnWrite()->base = make_const(expr.dtype(), 0);
     structured_form.CopyOnWrite()->args =
         Array<IterSplitExpr>(grouped_iters.rbegin(), grouped_iters.rend());
-    structured_form.CopyOnWrite()->base = 0;
+    structured_form.CopyOnWrite()->base = make_const(expr.dtype(), 0);
     auto it = sum_fuse_map_.find(flattened_form);
     if (it != sum_fuse_map_.end()) {
       // old iter
@@ -1831,11 +1831,20 @@ class SubspaceDivider {
     IterSplitExpr GetInnerAsSplit() const { return GetAsSplit(inner, inner_extent); }
 
     static DivisionResult Inner(const IterMapExpr& iter, const PrimExpr& extent) {
-      return DivisionResult(IterSumExpr({}, 0), 1, iter, extent);
+      auto dtype = iter.dtype();
+      return DivisionResult(IterSumExpr({}, make_const(dtype, 0)), make_const(dtype, 1), iter,
+                            extent);
     }
 
     static DivisionResult Outer(const IterMapExpr& iter, const PrimExpr& extent) {
-      return DivisionResult(iter, extent, IterSumExpr({}, 0), 1);
+      auto dtype = iter.dtype();
+      return DivisionResult(iter, extent, IterSumExpr({}, make_const(dtype, 0)),
+                            make_const(dtype, 1));
+    }
+
+    // Special value to indicate the division is not possible
+    static DivisionResult Failure() {
+      return DivisionResult(IterSumExpr({}, 0), 0, IterSumExpr({}, 0), 0);
     }
 
    private:
@@ -1853,14 +1862,16 @@ class SubspaceDivider {
 
   // Divide an IterSumExpr
   DivisionResult DivideIterSumExpr(const IterSumExpr& expr, const PrimExpr& mark_extent) {
+    auto dtype = expr.dtype();
     if (expr->args.empty()) {
       // base
-      return DivisionResult(IterSumExpr({}, 0), 1, IterSumExpr({}, expr->base), 1);
+      return DivisionResult(IterSumExpr({}, make_const(dtype, 0)), make_const(dtype, 1),
+                            IterSumExpr({}, expr->base), make_const(dtype, 1));
     } else if (expr->args.size() == 1) {
       // arg + base, if arg=Y*E(X)+X, then arg+base = Y*E(X)+(X+base)
       if (!is_one(expr->args[0]->scale)) {
         unresolved_count_++;
-        return DivisionResult(IterSumExpr({}, 0), 0, IterSumExpr({}, 0), 0);
+        return DivisionResult::Failure();
       }
       DivisionResult res = DivideIterSplitExpr(expr->args[0]);
       if (!is_zero(expr->base)) res = AddBase(res, expr->base);
@@ -1869,7 +1880,7 @@ class SubspaceDivider {
     // arg1 + arg2 + ... + argn + base
     // then we can write it as Y*E(X)+X
     // if it starts with contiguous outer splits, followed by contiguous inner splits
-    PrimExpr extent = 1;
+    PrimExpr extent = make_const(dtype, 1);
     std::vector<IterSplitExpr> outer_args, inner_args;
     bool inner = true, scale_is_one = false;
     // we check in inverse order so we can visit from inner to outer
@@ -1881,7 +1892,7 @@ class SubspaceDivider {
       if (arg_division.IsInner()) {
         if (!inner) {
           unresolved_count_++;
-          return DivisionResult(IterSumExpr({}, 0), 0, IterSumExpr({}, 0), 0);
+          return DivisionResult::Failure();
         }
         new_arg = arg_division.GetInnerAsSplit();
         inner_args.push_back(new_arg);
@@ -1892,13 +1903,13 @@ class SubspaceDivider {
         inner = false;
       } else {
         unresolved_count_++;
-        return DivisionResult(IterSumExpr({}, 0), 0, IterSumExpr({}, 0), 0);
+        return DivisionResult::Failure();
       }
       extent *= new_arg->extent;
     }
     if (!scale_is_one) {
       unresolved_count_++;
-      return DivisionResult(IterSumExpr({}, 0), 0, IterSumExpr({}, 0), 0);
+      return DivisionResult::Failure();
     }
     bool need_predicate = !analyzer_->CanProveEqual(extent, mark_extent);
     const IterMark& outer_mark = MarkFromArgsAndBase(outer_args, 0);
@@ -1919,7 +1930,7 @@ class SubspaceDivider {
         return DivisionResult::Inner(inner_source, mark_extent);
       } else {
         unresolved_count_++;
-        return DivisionResult(IterSumExpr({}, 0), 0, IterSumExpr({}, 0), 0);
+        return DivisionResult::Failure();
       }
     }
     return DivisionResult(outer_source, outer_mark->extent, inner_source, inner_mark->extent);
@@ -1943,7 +1954,7 @@ class SubspaceDivider {
   // args are sorted from inner to outer
   static IterMark MarkFromArgsAndBase(const std::vector<IterSplitExpr>& args, PrimExpr base) {
     std::vector<IterSplitExpr> res;
-    PrimExpr extent = 1;
+    PrimExpr extent = make_const(base.dtype(), 1);
     for (const IterSplitExpr& it : args) {
       IterSplitExpr arg = it;
       arg.CopyOnWrite()->scale = extent;
@@ -2006,7 +2017,7 @@ class SubspaceDivider {
         }
         if (j == splits.size()) {
           unresolved_count_++;
-          return DivisionResult(IterSumExpr({}, 0), 0, IterSumExpr({}, 0), 0);
+          return DivisionResult::Failure();
         }
         used[j] = true;
         if (!encountered_boundary) {
@@ -2020,7 +2031,7 @@ class SubspaceDivider {
       }
       if (!encountered_boundary) {
         unresolved_count_++;
-        return DivisionResult(IterSumExpr({}, 0), 0, IterSumExpr({}, 0), 0);
+        return DivisionResult::Failure();
       }
       for (const IterSplitExpr& inner_iter : inner_iters) {
         IterSplitExpr new_iter = inner_iter;
@@ -2036,7 +2047,7 @@ class SubspaceDivider {
       }
     } else {
       unresolved_count_++;
-      return DivisionResult(IterSumExpr({}, 0), 0, IterSumExpr({}, 0), 0);
+      return DivisionResult::Failure();
     }
     return split_map_.at(expr);
   }
diff --git a/src/tir/schedule/ir_comparator.cc b/src/tir/schedule/ir_comparator.cc
index 93cb488eaf56..ea0ac0bc733d 100644
--- a/src/tir/schedule/ir_comparator.cc
+++ b/src/tir/schedule/ir_comparator.cc
@@ -72,9 +72,9 @@ bool TensorizeComparator::VisitStmt(const Stmt& n, const Stmt& other) {
 }
 
 bool TensorizeComparator::VisitExpr(const PrimExpr& n, const PrimExpr& other) {
-  bool equal =
-      n.same_as(other) || ((n->type_index() == other->type_index()) && n->dtype == other->dtype &&
-                           ExprComparator::VisitExpr(n, other));
+  bool equal = n.same_as(other) ||
+               ((n->type_index() == other->type_index()) &&
+                n.dtype().code() == other.dtype().code() && ExprComparator::VisitExpr(n, other));
   if (!equal && assert_mode_) {
     std::ostringstream os;
     os << "Expression mismatch: " << n << " vs " << other;
@@ -185,7 +185,7 @@ bool TensorizeComparator::VisitExpr_(const VarNode* op, const PrimExpr& other) {
   const auto* rhs = other.as<VarNode>();
   auto lhs = GetRef<Var>(op);
   if (lhs.same_as(other)) return true;
-  if (op->dtype != rhs->dtype) return false;
+  if (op->dtype.code() != rhs->dtype.code()) return false;
   auto it = equal_map_.find(lhs);
   return it != equal_map_.end() && it->second.same_as(other);
 }
@@ -208,7 +208,9 @@ bool TensorizeComparator::DefEqual(const Var& lhs, const Var& rhs) {
   if (it != equal_map_.end()) return it->second.same_as(rhs);
   // Otherwise remap lhs to rhs
   equal_map_[lhs] = rhs;
-  analyzer_.Bind(lhs, rhs);
+  // Cast if necessary. This allows the workload and the tensor intrin to have different dtypes in
+  // the indices.
+  analyzer_.Bind(lhs, cast(lhs.dtype(), rhs));
   return true;
 }
 
diff --git a/src/tir/schedule/primitive/blockize_tensorize.cc b/src/tir/schedule/primitive/blockize_tensorize.cc
index 7481a7c92494..98e30117e172 100644
--- a/src/tir/schedule/primitive/blockize_tensorize.cc
+++ b/src/tir/schedule/primitive/blockize_tensorize.cc
@@ -572,7 +572,7 @@ void Tensorize(ScheduleState self, const StmtSRef& sref, const TensorIntrin& int
     }
     for (int i = 0; i < static_cast<int>(old_region.size()); i++) {
       PrimExpr min = indices_base[i + offset];
-      PrimExpr extent = old_region[i]->extent;
+      PrimExpr extent = cast(min.dtype(), old_region[i]->extent);
       new_region.push_back(Range::FromMinExtent(min, extent));
     }
     match_buffer_regions.push_back(MatchBufferRegion(impl, BufferRegion(cur, new_region)));
diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc
index 9d36a5f7e5c4..e4c91dac582c 100644
--- a/src/tir/schedule/primitive/layout_transformation.cc
+++ b/src/tir/schedule/primitive/layout_transformation.cc
@@ -369,7 +369,8 @@ class TransformLayoutPlanner : private StmtExprVisitor {
           ss << "v_" << var->name_hint;
           Var virtual_var(ss.str(), var.dtype());
           new_iter_values.push_back(var);
-          new_iter_vars.push_back(IterVar(Range::FromMinExtent(0, dim), virtual_var, kDataPar));
+          new_iter_vars.push_back(
+              IterVar(Range::FromMinExtent(make_zero(dim.dtype()), dim), virtual_var, kDataPar));
           new_access_indices.push_back(virtual_var);
           loop_var_to_virtual_var.Set(var, virtual_var);
         }
@@ -990,7 +991,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_
   auto [inverse, padding_predicate] = [&]() {
     Array<Range> region;
     for (const auto& dim : old_buffer->shape) {
-      region.push_back(Range::FromMinExtent(0, dim));
+      region.push_back(Range::FromMinExtent(make_zero(dim.dtype()), dim));
     }
     return index_map.NonSurjectiveInverse(region);
   }();
@@ -1209,8 +1210,10 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
     if (iter_type == kOpaque) {
       throw OpaqueNewIterTypeError(self->mod, GetRef<Block>(block_ptr), transformed_block_iters[i]);
     }
-    new_block_iters.push_back(IterVar(/*dom=*/Range::FromMinExtent(0, new_block_iter_range[i]),
-                                      /*var=*/std::move(new_block_var), /*iter_type=*/iter_type));
+    auto dtype = new_block_var.dtype();
+    new_block_iters.push_back(IterVar(
+        /*dom=*/Range::FromMinExtent(make_zero(dtype), new_block_iter_range[i]),
+        /*var=*/std::move(new_block_var), /*iter_type=*/iter_type));
   }
 
   // Step 5.2: Update the block body. Use the inverse map f^{-1} to replace the original block iters
diff --git a/tests/python/unittest/test_tir_schedule_blockize.py b/tests/python/unittest/test_tir_schedule_blockize.py
index 6d13281320c0..12836cdb9e68 100644
--- a/tests/python/unittest/test_tir_schedule_blockize.py
+++ b/tests/python/unittest/test_tir_schedule_blockize.py
@@ -247,5 +247,40 @@ def after_rowsum_blockize(
     verify_trace_roundtrip(sch=s, mod=rowsum)
 
 
+def test_blockize_outer_int64_shape():
+    @T.prim_func
+    def single_elementwise_int64(
+        A: T.Buffer[(T.int64(16), T.int64(128)), "float32"],
+        B: T.Buffer[(T.int64(16), T.int64(128)), "float32"],
+    ) -> None:
+        for i0, j0, i1, j1 in T.grid(T.int64(1), T.int64(8), T.int64(16), T.int64(16)):
+            with T.block("B"):
+                vi = T.axis.S(T.int64(16), i0 * T.int64(16) + i1)
+                vj = T.axis.S(T.int64(128), j0 * T.int64(16) + j1)
+                B[vi, vj] = A[vi, vj] + 1.0
+
+    @T.prim_func
+    def after_single_elementwise_int64_blockize(
+        A: T.Buffer[(T.int64(16), T.int64(128)), "float32"],
+        B: T.Buffer[(T.int64(16), T.int64(128)), "float32"],
+    ) -> None:
+        for i0, j0 in T.grid(T.int64(1), T.int64(8)):
+            with T.block("B_o"):
+                vi_o = T.axis.spatial(T.int64(1), T.int64(0))
+                vj_o = T.axis.spatial(T.int64(8), j0)
+                for i1, j1 in T.grid(T.int64(16), T.int64(16)):
+                    with T.block("B"):
+                        vi_i, vj_i = T.axis.remap("SS", [i1, j1])
+                        B[vi_i, vj_o * T.int64(16) + vj_i] = A[
+                            vi_i, vj_o * T.int64(16) + vj_i
+                        ] + T.float32(1)
+
+    s = tir.Schedule(single_elementwise_int64, debug_mask="all")
+    _, _, i1, _ = s.get_loops(s.get_block("B"))
+    s.blockize(i1)
+    tvm.ir.assert_structural_equal(s.mod["main"], after_single_elementwise_int64_blockize)
+    verify_trace_roundtrip(sch=s, mod=single_elementwise_int64)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py
index f04de8e0051f..f30e91b892c5 100644
--- a/tests/python/unittest/test_tir_schedule_tensorize.py
+++ b/tests/python/unittest/test_tir_schedule_tensorize.py
@@ -653,5 +653,90 @@ def test_tensor_intrin_look_up():
         tir.TensorIntrin.get(intrin_name)
 
 
+def test_tensorize_matmul_mixed_dtype():
+    # fmt: off
+    @T.prim_func
+    def matmul_int64_shape(
+        A: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+        B: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+        C: T.Buffer[(T.int64(128), T.int64(128)), "float32"]
+    ) -> None:
+        for i_0, j_0 in T.grid(T.int64(8), T.int64(8)):
+            for i_1_init, j_1_init in T.grid(T.int64(16), T.int64(16)):
+                with T.block("init"):
+                    vi = T.axis.spatial(T.int64(128), i_0 * T.int64(16) + i_1_init)
+                    vj = T.axis.spatial(T.int64(128), j_0 * T.int64(16) + j_1_init)
+                    C[vi, vj] = T.float32(0)
+            for k_0, i_1, j_1, k_1 in T.grid(T.int64(8), T.int64(16), T.int64(16), T.int64(16)):
+                with T.block("update"):
+                    vi = T.axis.spatial(T.int64(128), i_0 * T.int64(16) + i_1)
+                    vj = T.axis.spatial(T.int64(128), j_0 * T.int64(16) + j_1)
+                    vk = T.axis.reduce(T.int64(128), k_0 * T.int64(16) + k_1)
+                    C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]
+
+    @T.prim_func
+    def tensorized_matmul_int64_shape(
+        A: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+        B: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+        C: T.Buffer[(T.int64(128), T.int64(128)), "float32"]
+    ) -> None:
+        for i_outer, j_outer in T.grid(T.int64(8), T.int64(8)):
+            for i_inner_init, j_inner_init in T.grid(T.int64(16), T.int64(16)):
+                with T.block("init"):
+                    vi = T.axis.spatial(T.int64(128), i_outer * T.int64(16) + i_inner_init)
+                    vj = T.axis.spatial(T.int64(128), j_outer * T.int64(16) + j_inner_init)
+                    C[vi, vj] = T.float32(0)
+            for k_outer in T.grid(T.int64(8)):
+                with T.block("update"):
+                    vi, vj, vk = T.axis.remap("SSR", [i_outer, j_outer, k_outer])
+                    T.reads(
+                        [
+                            C[vi * T.int64(16) : vi * T.int64(16) + T.int64(16), vj * T.int64(16) : vj * T.int64(16) + T.int64(16)],
+                            A[vi * T.int64(16) : vi * T.int64(16) + T.int64(16), vk * T.int64(16) : vk * T.int64(16) + T.int64(16)],
+                            B[vj * T.int64(16) : vj * T.int64(16) + T.int64(16), vk * T.int64(16) : vk * T.int64(16) + T.int64(16)],
+                        ]
+                    )
+                    T.writes(C[vi * T.int64(16) : vi * T.int64(16) + T.int64(16), vj * T.int64(16) : vj * T.int64(16) + T.int64(16)])
+                    A_elem_offset = T.var("int32")
+                    B_elem_offset = T.var("int32")
+                    C_elem_offset = T.var("int32")
+                    A_sub = T.match_buffer(
+                        A[vi * T.int64(16) : vi * T.int64(16) + T.int64(16), vk * T.int64(16) : vk * T.int64(16) + T.int64(16)],
+                        [16, 16],
+                        elem_offset=A_elem_offset,
+                    )
+                    B_sub = T.match_buffer(
+                        B[vj * T.int64(16) : vj * T.int64(16) + T.int64(16), vk * T.int64(16) : vk * T.int64(16) + T.int64(16)],
+                        [16, 16],
+                        elem_offset=B_elem_offset,
+                    )
+                    C_sub = T.match_buffer(
+                        C[vi * T.int64(16) : vi * T.int64(16) + T.int64(16), vj * T.int64(16) : vj * T.int64(16) + T.int64(16)],
+                        [16, 16],
+                        elem_offset=C_elem_offset,
+                    )
+                    T.evaluate(
+                        T.tvm_mma_sync(
+                            C_sub.data,
+                            T.floordiv(C_sub.elem_offset, 256),
+                            A_sub.data,
+                            T.floordiv(A_sub.elem_offset, 256),
+                            B_sub.data,
+                            T.floordiv(B_sub.elem_offset, 256),
+                            C_sub.data,
+                            T.floordiv(C_sub.elem_offset, 256),
+                            dtype="handle",
+                        )
+                    )
+    # fmt: on
+
+    s = tir.Schedule(matmul_int64_shape, debug_mask="all")
+    update = s.get_block("update")
+    ii = s.get_loops(update)[-3]
+    s.tensorize(ii, "test_mma_intrin")
+    tvm.ir.assert_structural_equal(s.mod["main"], tensorized_matmul_int64_shape)
+    verify_trace_roundtrip(sch=s, mod=matmul_int64_shape)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From e3d3a1a62edc5afc974aab0995a9f00f904ea695 Mon Sep 17 00:00:00 2001
From: "Ehsan M. Kermani" <6980212+ehsanmok@users.noreply.github.com>
Date: Mon, 17 Oct 2022 13:21:29 -0700
Subject: [PATCH 373/704] [Relay][Frontend][ONNX] Add LayerNormalization
 operator (#13074)

* [Relay][Frontend][ONNX] Add LayerNormalization operator

* Include mean in variance to reduce the number of expressions if already exists

* Fix lint

Co-authored-by: Ehsan M. Kermani <ehsanmok@users.noreply.github.com>
---
 python/tvm/relay/frontend/onnx.py          | 37 ++++++++++++++++++++--
 python/tvm/relay/op/reduce.py              |  7 ++--
 tests/python/frontend/onnx/test_forward.py | 19 -----------
 3 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 8c4c056221f6..ff7d5655e0d3 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -366,9 +366,11 @@ def flatten_to_nd(x, x_shape, nd=3):
 
 
 def layer_norm(x, eps, gamma, beta):
-    """Common function to handle layer norm"""
-    eps_dtype = infer_type(x).checked_type.dtype
+    """A common function to handle layer norm.
 
+    Use LayerNormalization for the actual onnx op.
+    """
+    eps_dtype = infer_type(x).checked_type.dtype
     u, s = _op.mean_variance(x, axis=-1, keepdims=True)
     output = _op.divide(
         _op.subtract(x, u),
@@ -944,6 +946,36 @@ def _impl_v1(cls, inputs, attr, params):
         return Gelu._impl_v1([inp], attr, params)
 
 
+class LayerNormalization(OnnxOpConverter):
+    """Operator converter for LayerNormalization from Microsoft onnxruntime contrib opset."""
+
+    @classmethod
+    def _impl_v17(cls, inputs, attr, params):
+        x = inputs[0]
+        gamma = inputs[1]
+        beta = inputs[2]
+        axis = attr.get("axis", -1)
+        eps = attr.get("epsilon", 1e-5)
+        # according to the onnx doc, given the int axis (default -1)
+        # to compute the mean and inv_stdev which are of dim [d[0], ..., d[axis-1], 1, ..., 1]
+        # the actual computation is over (axis, ..., rank(x) - 1) axes
+        # see https://github.com/onnx/onnx/blob/main/docs/Changelog.md#layernormalization-17
+        rank = len(infer_shape(x))
+        axis = tuple(range(axis, rank)) if axis >= 0 else tuple(range(rank + axis, rank))
+        dtype = infer_type(x).checked_type.dtype
+        mean = _op.mean(x, axis, keepdims=True)
+        var = _op.variance(x, axis, keepdims=True, with_mean=mean)
+        inv_stdev = _op.divide(
+            _op.const(1, dtype=dtype), _op.sqrt(_op.add(var, _op.const(eps, dtype=dtype)))
+        )
+        x_norm = _op.multiply(_op.subtract(x, mean), inv_stdev)
+        ln = _op.multiply(x_norm, gamma)
+        if beta is not None:
+            ln = _op.add(ln, beta)
+
+        return _expr.TupleWrapper(_expr.Tuple([ln, mean, inv_stdev]), 3)
+
+
 class EmbedLayerNormalization(OnnxOpConverter):
     """Operator converter for EmbedLayerNormalization from Microsoft onnxruntime contrib opset.
 
@@ -5336,6 +5368,7 @@ def _get_convert_map(opset):
         "Elu": Elu.get_converter(opset),
         "Gelu": Gelu.get_converter(opset),
         "BiasGelu": BiasGelu.get_converter(opset),
+        "LayerNormalization": LayerNormalization.get_converter(opset),
         # TODO: We need a better way to handle different domains, in case
         # of name collisions. EmbedLayerNormalization, SkipLayerNormalization, and Attention
         # are in the `com.microsoft` domain.
diff --git a/python/tvm/relay/op/reduce.py b/python/tvm/relay/op/reduce.py
index b3d71498ed8a..67dc82efaf87 100644
--- a/python/tvm/relay/op/reduce.py
+++ b/python/tvm/relay/op/reduce.py
@@ -322,7 +322,7 @@ def mean(data, axis=None, keepdims=False, exclude=False):
     return _make.mean(data, axis, keepdims, exclude)
 
 
-def variance(data, axis=None, keepdims=False, exclude=False, unbiased=False):
+def variance(data, axis=None, keepdims=False, exclude=False, unbiased=False, with_mean=None):
     """Computes the variance of data over given axes.
 
     Parameters
@@ -347,13 +347,16 @@ def variance(data, axis=None, keepdims=False, exclude=False, unbiased=False):
     unbiased : bool
         If this is set to True, the unbiased estimation will be used.
 
+    with_mean : Optional[relay.Expr]
+        To compute variance given an already computed mean
+
     Returns
     -------
     result : relay.Expr
         The computed result.
     """
     axis = [axis] if isinstance(axis, int) else axis
-    m = mean(data, axis, True, exclude)
+    m = mean(data, axis, True, exclude) if with_mean is None else with_mean
     return _make._variance(data, m, axis, keepdims, exclude, unbiased)
 
 
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index da6f5785023d..9fc00930af0e 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -5295,25 +5295,6 @@ def verify_eyelike(indata, dynamic=False):
     "test_identity_sequence",
     "test_if_opt",
     "test_if_seq",
-    "test_layer_normalization_2d_axis0",
-    "test_layer_normalization_2d_axis1",
-    "test_layer_normalization_2d_axis_negative_1",
-    "test_layer_normalization_2d_axis_negative_2",
-    "test_layer_normalization_3d_axis0_epsilon",
-    "test_layer_normalization_3d_axis1_epsilon",
-    "test_layer_normalization_3d_axis2_epsilon",
-    "test_layer_normalization_3d_axis_negative_1_epsilon",
-    "test_layer_normalization_3d_axis_negative_2_epsilon",
-    "test_layer_normalization_3d_axis_negative_3_epsilon",
-    "test_layer_normalization_4d_axis0",
-    "test_layer_normalization_4d_axis1",
-    "test_layer_normalization_4d_axis2",
-    "test_layer_normalization_4d_axis3",
-    "test_layer_normalization_4d_axis_negative_1",
-    "test_layer_normalization_4d_axis_negative_2",
-    "test_layer_normalization_4d_axis_negative_3",
-    "test_layer_normalization_4d_axis_negative_4",
-    "test_layer_normalization_default_axis",
     "test_loop11",
     "test_loop13_seq",
     "test_loop16_seq_none",

From 8ccc43445a50df1b8f3c886113c379cc132a90c4 Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Mon, 17 Oct 2022 14:45:27 -0700
Subject: [PATCH 374/704] [Hexagon] Async DMA pipelining test suite (#13005)

* [Hexagon] Add tests to show how to properly utilize async dma pipelining on hexagon.

* Formatting updates.

* Update comments and reformatting.

* Skip long tests in CI.
---
 .../test_hexagon/test_async_dma_pipeline.py   | 353 ++++++++++++++++++
 1 file changed, 353 insertions(+)
 create mode 100644 tests/python/contrib/test_hexagon/test_async_dma_pipeline.py

diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
new file mode 100644
index 000000000000..d05e0a6e9216
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -0,0 +1,353 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test different strategies for loading data into vtcm before running HVX workloads. """
+
+import numpy as np
+import tvm
+import pytest
+
+from tvm.script import tir as T
+from numpy.random import default_rng
+
+from tvm.tir.function import TensorIntrin
+
+VRMPY_SIZE_B = 128
+VRMPY_SIZE_INT32 = 32
+
+
+def conv_approximation(size_a, size_w):
+    a_shape = (size_a, VRMPY_SIZE_B)
+    w_shape = (size_w, VRMPY_SIZE_B)
+    out_shape = (size_a, VRMPY_SIZE_INT32)
+
+    @T.prim_func
+    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, a_shape, dtype="uint8")
+        W = T.match_buffer(b, w_shape, dtype="uint8")
+        C = T.match_buffer(c, out_shape, dtype="int32")
+        for n, i in T.grid(size_a, size_w):
+            with T.block("C"):
+                vn, vi = T.axis.remap("SR", [n, i])
+                T.reads(A[vn, 0:VRMPY_SIZE_B], W[vi, 0:VRMPY_SIZE_B], C[vn, 0:VRMPY_SIZE_INT32])
+                T.writes(C[vn, 0:VRMPY_SIZE_INT32])
+                with T.init():
+                    for x in T.serial(VRMPY_SIZE_INT32):
+                        C[vn, x] = 0
+                C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.acc.128B"),
+                    T.uint32(3),
+                    C[vn, T.ramp(0, 1, 32)],
+                    T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(W[vi, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    dtype="int32x32",
+                )
+        # Currently async DMA lowering does not add any wait to the end of schedules so
+        # for timing purposes we are manually adding a wait to ensure that all copies
+        # are complete when the schedule exits.
+        T.evaluate(
+            T.tvm_call_packed(
+                "device_api.hexagon.dma_wait",
+                0,  # QueueId
+                0,  # Wait for 0 in flight
+                dtype="int32",
+            )
+        )
+
+    return tvm.tir.Schedule(operator)
+
+
+def evaluate(hexagon_session, sch, a, b, size_a, expected_output, use_async_copy=0):
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    with tvm.transform.PassContext(config={"tir.use_async_copy": use_async_copy}):
+        func_tir = tvm.build(
+            sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
+        )
+    module = hexagon_session.load_module(func_tir)
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
+    b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device)
+    c_hexagon = tvm.runtime.ndarray.array(
+        np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32"), device=hexagon_session.device
+    )
+
+    if tvm.testing.utils.IS_IN_CI:
+        # Run with reduced number and repeat for CI
+        timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=1, repeat=1)
+    else:
+        timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=10, repeat=10)
+
+    time = timer(a_hexagon, b_hexagon, c_hexagon)
+    tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output)
+    return round(time.mean * 1000, 4)
+
+
+@tvm.testing.fixture
+def input_a(size_a):
+    return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8")
+
+
+@tvm.testing.fixture
+def input_w(size_w):
+    return default_rng().integers(0, 8, (size_w, VRMPY_SIZE_B), dtype="uint8")
+
+
+@tvm.testing.fixture
+def expected_output(size_a, size_w, input_a, input_w):
+    if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1):
+        pytest.skip("Skipping test since it takes too long in CI.")
+    expected_output = np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32")
+    for n in range(size_a):
+        for x in range(size_w):
+            for i in range(VRMPY_SIZE_INT32):
+                for r in range(4):
+                    expected_output[n, i] += np.uint32(input_a[n, i * 4 + r]) * np.uint32(
+                        input_w[x, i * 4 + r]
+                    )
+    return expected_output
+
+
+def get_single_dma_schedule(size_a, size_w):
+    a_shape = (size_a, VRMPY_SIZE_B)
+    w_shape = (size_w, VRMPY_SIZE_B)
+    out_shape = (size_a, VRMPY_SIZE_INT32)
+
+    a_bytes = size_a * VRMPY_SIZE_B
+    w_bytes = size_w * VRMPY_SIZE_B
+
+    @T.prim_func
+    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, a_shape, dtype="uint8", mem_scope="global")
+        W = T.match_buffer(b, w_shape, dtype="uint8", mem_scope="global")
+        C = T.match_buffer(c, out_shape, dtype="int32", mem_scope="global")
+        A_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", mem_scope="global.vtcm")
+        W_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", mem_scope="global.vtcm")
+        C_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", mem_scope="global.vtcm")
+        T.evaluate(
+            T.tvm_call_packed(
+                "device_api.hexagon.mem_copy_DLTensor",
+                T.tvm_stack_make_array(
+                    A_global_vtcm.data,
+                    T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
+                    0,
+                    2,
+                    A_global_vtcm.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.tvm_stack_make_array(
+                    A.data,
+                    T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
+                    0,
+                    2,
+                    A.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.cast(a_bytes, dtype="int"),
+                dtype="int32",
+            )
+        )
+        T.evaluate(
+            T.tvm_call_packed(
+                "device_api.hexagon.mem_copy_DLTensor",
+                T.tvm_stack_make_array(
+                    W_global_vtcm.data,
+                    T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"),
+                    0,
+                    2,
+                    W_global_vtcm.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.tvm_stack_make_array(
+                    W.data,
+                    T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"),
+                    0,
+                    2,
+                    W.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.cast(w_bytes, dtype="int"),
+                dtype="int32",
+            )
+        )
+        for n, i in T.grid(size_a, size_w):
+            with T.block("C"):
+                vn, vi = T.axis.remap("SR", [n, i])
+                T.reads(
+                    A_global_vtcm[vn, 0:VRMPY_SIZE_B],
+                    W_global_vtcm[vi, 0:VRMPY_SIZE_B],
+                    C_global_vtcm[vn, 0:VRMPY_SIZE_INT32],
+                )
+                T.writes(C_global_vtcm[vn, 0:VRMPY_SIZE_INT32])
+                with T.init():
+                    for x in T.serial(VRMPY_SIZE_INT32):
+                        C_global_vtcm[vn, x] = 0
+                C_global_vtcm[vn, T.ramp(0, 1, 32)] += T.call_llvm_intrin(
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
+                    T.uint32(2),
+                    T.reinterpret(A_global_vtcm[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(W_global_vtcm[vi, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    dtype="int32x32",
+                )
+        T.evaluate(
+            T.tvm_call_packed(
+                "device_api.hexagon.mem_copy_DLTensor",
+                T.tvm_stack_make_array(
+                    C.data,
+                    T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
+                    0,
+                    2,
+                    C.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.tvm_stack_make_array(
+                    C_global_vtcm.data,
+                    T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
+                    0,
+                    2,
+                    C_global_vtcm.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.cast(a_bytes, dtype="int"),
+                dtype="int32",
+            )
+        )
+
+    sch = tvm.tir.Schedule(operator)
+
+    return sch
+
+
+def get_fake_conv_vtcm_schedule(size_a, size_w, blocks=2):
+    sch = conv_approximation(size_a, size_w)
+
+    compute_block = sch.get_block("C")
+    sch.cache_read(compute_block, 1, "global.vtcm")
+
+    n = sch.get_loops(compute_block)[0]
+    no, _ = sch.split(n, [blocks, None])
+
+    cache_read_block_a = sch.cache_read(compute_block, 0, "global.vtcm")
+    sch.compute_at(cache_read_block_a, no)
+    sch.fuse(*sch.get_loops(cache_read_block_a)[1:])
+
+    cache_read_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
+    sch.reverse_compute_at(cache_read_block_c, no)
+    sch.fuse(*sch.get_loops(cache_read_block_c)[1:])
+
+    return sch
+
+
+def print_results(test_key, runtimes):
+    print(test_key)
+    for runtime in runtimes.items():
+        print("-{} took {} ms".format(runtime[0], runtime[1]))
+    print()
+
+
+class TestAsyncDMAPipeline:
+    # Removed most of these to speedup CI.
+    size_a = tvm.testing.parameter(
+        1024,
+        64 * 64,
+        128 * 128,
+    )
+
+    size_w = tvm.testing.parameter(
+        1 * 1,
+        3 * 3,
+        7 * 7,
+        9 * 9,
+    )
+
+    @tvm.testing.requires_hexagon
+    def test_loading_vtcm_for_vrmpy(
+        self,
+        hexagon_session,
+        size_a,
+        size_w,
+        input_a,
+        input_w,
+        expected_output,
+    ):
+
+        if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1):
+            pytest.skip("Skipping test since it takes too long in CI.")
+
+        sch = conv_approximation(size_a, size_w)
+        base_runtime = evaluate(hexagon_session, sch, input_a, input_w, size_a, expected_output)
+
+        sch = get_fake_conv_vtcm_schedule(size_a, size_w)
+        base_vtcm_runtime = evaluate(
+            hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1
+        )
+
+        sch = get_fake_conv_vtcm_schedule(size_a, size_w)
+        n = sch.get_loops(sch.get_block("C"))[0]
+        sch.annotate(n, "software_pipeline_stage", [0, 1, 2])
+        sch.annotate(n, "software_pipeline_order", [0, 1, 2])
+        sch.annotate(n, "software_pipeline_async_stages", [0])
+        async_input_runtime = evaluate(
+            hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1
+        )
+
+        sch = get_fake_conv_vtcm_schedule(size_a, size_w)
+        n = sch.get_loops(sch.get_block("C"))[0]
+        sch.annotate(n, "software_pipeline_stage", [0, 1, 2])
+        sch.annotate(n, "software_pipeline_order", [0, 1, 2])
+        sch.annotate(n, "software_pipeline_async_stages", [0, 2])
+        async_input_output_runtime = evaluate(
+            hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1
+        )
+
+        sch = get_fake_conv_vtcm_schedule(size_a, size_w)
+        n = sch.get_loops(sch.get_block("C"))[0]
+        sch.annotate(n, "software_pipeline_stage", [0, 1, 2])
+        sch.annotate(n, "software_pipeline_order", [0, 1, 2])
+        sch.annotate(n, "software_pipeline_async_stages", [2])
+        async_output_runtime = evaluate(
+            hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1
+        )
+
+        sch = get_single_dma_schedule(size_a, size_w)
+        single_dma_runtime = evaluate(
+            hexagon_session, sch, input_a, input_w, size_a, expected_output
+        )
+
+        # Total transfer size is equal to the size of A + W + C which is equal to 2 * size_a * 128 + size_w * 128
+        transfer_mb = round((2 * size_a * VRMPY_SIZE_B + size_w * VRMPY_SIZE_B) / 1e6, 2)
+
+        # Total number of operations can be calculated given the total number of vrmpy calls (size_a * size_w) * operations per vrmpy accumulate (128 multiplies + 3 adds for reduction per lane + 1 add for accumulate per lane)
+        complexity = round(size_a * size_w * (VRMPY_SIZE_B * 4) / 1e9, 3)
+        print_results(
+            f"Test with A.size: {size_a * VRMPY_SIZE_B}, W.size: {size_w * VRMPY_SIZE_B}, computational complexity of {complexity} GOPs, and total memory transfer of {transfer_mb} MB...",
+            {
+                "without_vtcm": base_runtime,
+                "synchronous_dma": single_dma_runtime,
+                "base_vtcm": base_vtcm_runtime,
+                "async_dma_input": async_input_runtime,
+                "async_dma_output": async_output_runtime,
+                "async_dma_input_output": async_input_output_runtime,
+            },
+        )

From 8d2e887dbb3dc693f58dee00ab9c382c85ca21a4 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Tue, 18 Oct 2022 00:50:36 +0300
Subject: [PATCH 375/704] [HotFix] Fix python import (#13099)

* [HotFix] Fix python import

Tuning doesn't work after #12969.
It reports the following error:

```
ImportError: cannot import name 'get_const_float' from partially initialized module 'tvm.topi.utils' (most likely due to a circular import)
```

In this commit I moved import relay to a function which used in a test.
And it helps to fix this circular import

* Fix lint
---
 python/tvm/topi/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py
index f6ca03d32742..91e29665cda3 100644
--- a/python/tvm/topi/utils.py
+++ b/python/tvm/topi/utils.py
@@ -22,7 +22,7 @@
 
 import numpy as np
 import tvm
-from tvm import relay, te
+from tvm import te
 from tvm.tir import bijective_layout, layout
 from . import cpp, tag
 
@@ -455,7 +455,7 @@ def change_constant_shape(src, src_layout, dst_layout):
     assert src_layout.isalpha() and dst_layout.isalpha()
     axis_order = [src_layout.index(c) for c in dst_layout]
     reshaped = np.transpose(src.data.numpy(), axis_order)
-    return relay.Constant(tvm.nd.array(reshaped))
+    return tvm.relay.Constant(tvm.nd.array(reshaped))
 
 
 def within_index(b, e, s, i):

From ee55333222c4e12de104322ed4b4c28a17dc6ecb Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Mon, 17 Oct 2022 17:39:56 -0700
Subject: [PATCH 376/704] [Hexagon] Add feature to copy logcat to
 --hexagon-debug and add new --sysmon-profile option to run sysmon profiler
 during the test (#13107)

* [Hexagon] Add feature to copy logcat to --hexagon-debug and add new --sysmon-profile option to run sysmon profiler during the test.

* Remove unused time import.

* Update debug options to be in hexagon launcher. Add clear logcat option.

* lint

* pylint issue

* More lint issues
---
 python/tvm/contrib/hexagon/build.py         | 137 +++++++++++++++++++-
 python/tvm/contrib/hexagon/pytest_plugin.py |  56 +++++++-
 2 files changed, 179 insertions(+), 14 deletions(-)

diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py
index 8960d110b85e..8105e6e716c0 100644
--- a/python/tvm/contrib/hexagon/build.py
+++ b/python/tvm/contrib/hexagon/build.py
@@ -145,7 +145,7 @@ def start_server(self):
         ...
 
     @abc.abstractmethod
-    def stop_server(self, cleanup=True):
+    def stop_server(self):
         """Stop the RPC server"""
         ...
 
@@ -348,7 +348,13 @@ class HexagonLauncherAndroid(HexagonLauncherRPC):
     ]
 
     def __init__(
-        self, serial_number: str, rpc_info: dict, workspace: Union[str, pathlib.Path] = None
+        self,
+        serial_number: str,
+        rpc_info: dict,
+        workspace: Union[str, pathlib.Path] = None,
+        hexagon_debug: bool = False,
+        clear_logcat: bool = False,
+        sysmon_profile: bool = False,
     ):
         """Configure a new HexagonLauncherAndroid
 
@@ -362,6 +368,12 @@ def __init__(
             is used as the base directory.
         workspace : str or pathlib.Path, optional
             Test workspace path on android.
+        hexagon_debug: bool, optional
+            Should the server run debug options.
+        clear_logcat: bool, optional
+            Should the server clear logcat before running.
+        sysmon_profile: bool, optional
+            Should the server run sysmon profiler in the background.
         """
         if not rpc_info.get("workspace_base"):
             rpc_info["workspace_base"] = self.ANDROID_HEXAGON_TEST_BASE_DIR
@@ -369,6 +381,10 @@ def __init__(
         adb_socket = rpc_info["adb_server_socket"] if rpc_info["adb_server_socket"] else "tcp:5037"
         self._adb_device_sub_cmd = ["adb", "-L", adb_socket, "-s", self._serial_number]
         self.forwarded_ports_ = []
+        self._hexagon_debug = hexagon_debug
+        self._clear_logcat = clear_logcat
+        self._sysmon_profile = sysmon_profile
+        self._sysmon_process = None
 
         super(HexagonLauncherAndroid, self).__init__(rpc_info, workspace)
 
@@ -504,16 +520,113 @@ def cleanup_directory(self):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
         subprocess.Popen(self._adb_device_sub_cmd + ["shell", f"rm -rf {self._workspace}"])
 
+    def _start_sysmon(self):
+        hexagon_sdk_root = os.environ.get("HEXAGON_SDK_ROOT", default="")
+        subprocess.call(
+            self._adb_device_sub_cmd
+            + ["push", f"{hexagon_sdk_root}/tools/utils/sysmon/sysMonApp", "/data/local/tmp/"]
+        )
+        sysmon_process = subprocess.Popen(
+            self._adb_device_sub_cmd
+            + [
+                "shell",
+                "/data/local/tmp/sysMonApp profiler --debugLevel 0 --samplePeriod 1 --q6 cdsp",
+            ],
+            stdin=subprocess.PIPE,
+        )
+        return sysmon_process
+
+    def _stop_sysmon(self):
+        if self._sysmon_process is not None:
+            self._sysmon_process.communicate(input=b"\n")
+            self._sysmon_process = None
+
+    def _retrieve_sysmon(self):
+        pathlib.Path("./sysmon_output/").mkdir(exist_ok=True)
+        subprocess.call(
+            self._adb_device_sub_cmd + ["pull", "/sdcard/sysmon_cdsp.bin", "./sysmon_output/"]
+        )
+        subprocess.call(self._adb_device_sub_cmd + ["root"])
+        hexagon_sdk_root = os.environ.get("HEXAGON_SDK_ROOT", default="")
+        subprocess.call(
+            f"{hexagon_sdk_root}/tools/utils/sysmon/parser_linux_v2/HTML_Parser/sysmon_parser "
+            + "./sysmon_output/sysmon_cdsp.bin --outdir ./sysmon_output/",
+            shell=True,
+        )
+
+    def _clear_debug_logs(self):
+        subprocess.call(self._adb_device_sub_cmd + ["shell", "logcat", "-c"])
+
+    def _retrieve_debug_logs(self):
+        run_start_time = subprocess.check_output(
+            self._adb_device_sub_cmd
+            + [
+                "shell",
+                "stat",
+                f"{self._workspace}/android_bash.sh | grep 'Change' | grep -oe '[0-9].*'",
+            ]
+        )
+        run_start_time = run_start_time[:-1].decode("UTF-8")
+        subprocess.call(
+            self._adb_device_sub_cmd
+            + [
+                "shell",
+                "logcat",
+                "-t",
+                f'"{run_start_time}"',
+                "-f",
+                f"{self._workspace}/logcat.txt",
+            ]
+        )
+        subprocess.call(self._adb_device_sub_cmd + ["pull", f"{self._workspace}/logcat.txt", "."])
+
+    def _print_cdsp_logs(self):
+        crash_count = 0
+        context_lines = 0
+        print_buffer = ""
+        try:
+            with open("./logcat.txt", "r") as f:
+                for line in f:
+                    if "Process on cDSP CRASHED" in line:
+                        if crash_count <= 5:
+                            print(print_buffer, "\n")
+                        context_lines = 40
+                        print_buffer = ""
+                        crash_count += 1
+                    if context_lines > 0 and "platform_qdi_driver" in line:
+                        context_lines -= 1
+                        print_buffer += line[80:]
+
+            if crash_count <= 5:
+                print(print_buffer, "\n")
+
+            print(
+                f"There were {crash_count} crashes on the cDSP during execution... "
+                + "Crash printing is limited to the first 5."
+            )
+        except FileNotFoundError:
+            print("Unable to parse logcat file.")
+
     def start_server(self):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
         self._copy_binaries()
+        if self._sysmon_profile:
+            self._sysmon_process = self._start_sysmon()
         self._run_server_script()
+        if self._clear_logcat:
+            self._clear_debug_logs()
 
-    def stop_server(self, cleanup=True):
+    def stop_server(self):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
+        if self._sysmon_profile and self._sysmon_process is not None:
+            self._stop_sysmon()
+            self._retrieve_sysmon()
+        if self._hexagon_debug:
+            self._retrieve_debug_logs()
+            self._print_cdsp_logs()
         self._cleanup_port_forwarding()
         self._terminate_remote()
-        if cleanup:
+        if not self._hexagon_debug:
             self.cleanup_directory()
 
 
@@ -618,7 +731,7 @@ def _start(self):
     def cleanup_directory(self):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
 
-    def stop_server(self, cleanup=True):
+    def stop_server(self):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
         self._server_process.terminate()
 
@@ -630,7 +743,17 @@ def _is_port_in_use(port: int) -> bool:
 
 
 # pylint: disable=invalid-name
-def HexagonLauncher(serial_number: str, rpc_info: dict, workspace: Union[str, pathlib.Path] = None):
+def HexagonLauncher(
+    serial_number: str,
+    rpc_info: dict,
+    workspace: Union[str, pathlib.Path] = None,
+    hexagon_debug: bool = False,
+    clear_logcat: bool = False,
+    sysmon_profile: bool = False,
+):
+    """Creates a HexagonLauncher"""
     if serial_number == "simulator":
         return HexagonLauncherSimulator(rpc_info, workspace)
-    return HexagonLauncherAndroid(serial_number, rpc_info, workspace)
+    return HexagonLauncherAndroid(
+        serial_number, rpc_info, workspace, hexagon_debug, clear_logcat, sysmon_profile
+    )
diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index 03f4a1a143c2..b99bfe7fa753 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -158,7 +158,13 @@ def adb_server_socket() -> str:
 
 @pytest.fixture(scope="session")
 def hexagon_server_process(
-    request, rpc_server_port_for_session, adb_server_socket, skip_rpc, hexagon_debug
+    request,
+    rpc_server_port_for_session,
+    adb_server_socket,
+    skip_rpc,
+    hexagon_debug,
+    sysmon_profile,
+    clear_logcat,
 ) -> HexagonLauncherRPC:
     """Initials and returns hexagon launcher if ANDROID_SERIAL_NUMBER is defined.
     This launcher is started only once per test session.
@@ -187,14 +193,20 @@ def hexagon_server_process(
             device_adr = read_device_list()[0]
         else:  # running in a subprocess here
             device_adr = workerinput["device_adr"]
-        launcher = HexagonLauncher(serial_number=device_adr, rpc_info=rpc_info)
+        launcher = HexagonLauncher(
+            serial_number=device_adr,
+            rpc_info=rpc_info,
+            hexagon_debug=hexagon_debug,
+            sysmon_profile=sysmon_profile,
+            clear_logcat=clear_logcat,
+        )
         try:
             if not skip_rpc:
                 launcher.start_server()
             yield {"launcher": launcher, "device_adr": device_adr}
         finally:
             if not skip_rpc:
-                launcher.stop_server(cleanup=(not hexagon_debug))
+                launcher.stop_server()
 
 
 def read_device_list():
@@ -222,6 +234,8 @@ def hexagon_launcher(
     tvm_tracker_port,
     adb_server_socket,
     hexagon_debug,
+    sysmon_profile,
+    clear_logcat,
 ) -> HexagonLauncherRPC:
     """Initials and returns hexagon launcher which reuses RPC info and Android serial number."""
     android_serial_num = android_serial_number()
@@ -235,19 +249,22 @@ def hexagon_launcher(
             "rpc_server_port": rpc_server_port,
             "adb_server_socket": adb_server_socket,
         }
-
     try:
         if android_serial_num == ["simulator"]:
             launcher = HexagonLauncher(serial_number=android_serial_num[0], rpc_info=rpc_info)
             launcher.start_server()
         else:
             launcher = HexagonLauncher(
-                serial_number=hexagon_server_process["device_adr"], rpc_info=rpc_info
+                serial_number=hexagon_server_process["device_adr"],
+                rpc_info=rpc_info,
+                hexagon_debug=hexagon_debug,
+                sysmon_profile=sysmon_profile,
+                clear_logcat=clear_logcat,
             )
         yield launcher
     finally:
         if android_serial_num == ["simulator"]:
-            launcher.stop_server(cleanup=(not hexagon_debug))
+            launcher.stop_server()
         elif not hexagon_debug:
             launcher.cleanup_directory()
 
@@ -304,7 +321,19 @@ def hexagon_debug(request) -> bool:
     return request.config.getoption("--hexagon-debug")
 
 
+@pytest.fixture(scope="session")
+def sysmon_profile(request) -> bool:
+    return request.config.getoption("--sysmon-profile")
+
+
+@pytest.fixture(scope="session")
+def clear_logcat(request) -> bool:
+    return request.config.getoption("--clear-logcat")
+
+
 def pytest_addoption(parser):
+    """Add pytest options."""
+
     parser.addoption("--gtest_args", action="store", default="")
 
     parser.addoption(
@@ -317,7 +346,20 @@ def pytest_addoption(parser):
         "--hexagon-debug",
         action="store_true",
         default=False,
-        help="If set true, it will keep the hexagon test directories on the target.",
+        help="If set true, it will keep the hexagon test directories on the target. "
+        + "Additionally logcat logs will be copied from device and cdsp errors printed out.",
+    )
+    parser.addoption(
+        "--sysmon-profile",
+        action="store_true",
+        default=False,
+        help="If set true, it will run sysmon profiler during the tests.",
+    )
+    parser.addoption(
+        "--clear-logcat",
+        action="store_true",
+        default=False,
+        help="If set true, it will clear logcat before execution.",
     )
 
 
From 468732c6b3aa50c4f99b69d7e58b7922bbaf53bf Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Mon, 17 Oct 2022 19:19:41 -0700
Subject: [PATCH 377/704] [PopenPool] Enable Stdout & Stderr Redirect in
 PopenPool & PopenWorker (#13112)

This PR enables redirection of the standard output stream and standard error stream to given subprocess handler.

This function especially improves the usage of `LocalRunner` because previously a sample candidate's exception during running could print long lines of output, making console logging / notebook logging obsolete. In the meantime, any failure of the candidates are still preserved in the per-task logging files (Schedule, trace & TIR available, doesn't include the stack trace though).

Credits to Hzfengsy's fix in https://github.com/mlc-ai/relax/pull/6.
---
 python/tvm/contrib/popen_pool.py              | 36 ++++++++++++++++---
 .../tvm/meta_schedule/runner/local_runner.py  |  2 ++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/python/tvm/contrib/popen_pool.py b/python/tvm/contrib/popen_pool.py
index 300bb25321ed..d16cf31bc7bf 100644
--- a/python/tvm/contrib/popen_pool.py
+++ b/python/tvm/contrib/popen_pool.py
@@ -97,14 +97,22 @@ class PopenWorker:
         The maximum number of times a process can be used before being recycled,
         i.e. killed and restarted. If `None`, the process will be reused until
         an operation times out.
+
+    stdout: Union[None, int, IO[Any]]
+        The standard output streams handler specified for the popen process.
+
+    stderr: Union[None, int, IO[Any]]
+        The standard error streams handler specified for the popen process.
     """
 
-    def __init__(self, initializer=None, initargs=(), maximum_uses=None):
+    def __init__(self, initializer=None, initargs=(), maximum_uses=None, stdout=None, stderr=None):
         self._proc = None
         self._initializer = initializer
         self._initargs = initargs
         self._maximum_uses = maximum_uses
         self._remaining_uses = None
+        self._stdout = stdout
+        self._stderr = stderr
 
         if self._initializer is not None and not callable(self._initializer):
             raise TypeError("initializer must be callable for PopenWorker")
@@ -166,10 +174,14 @@ def _start(self):
             os.set_handle_inheritable(worker_read_handle, True)
             os.set_handle_inheritable(worker_write_handle, True)
             cmd += [str(worker_read_handle), str(worker_write_handle)]
-            self._proc = subprocess.Popen(cmd, close_fds=False)
+            self._proc = subprocess.Popen(
+                cmd, close_fds=False, stdout=self._stdout, stderr=self._stderr
+            )
         else:
             cmd += [str(worker_read), str(worker_write)]
-            self._proc = subprocess.Popen(cmd, pass_fds=(worker_read, worker_write))
+            self._proc = subprocess.Popen(
+                cmd, pass_fds=(worker_read, worker_write), stdout=self._stdout, stderr=self._stderr
+            )
 
         # close worker side of the pipe
         os.close(worker_read)
@@ -319,6 +331,12 @@ class PopenPoolExecutor:
         i.e. killed and restarted. If `None`, processes will be reused until an
         operation times out.
 
+    stdout: Union[None, int, IO[Any]]
+        The standard output streams handler specified for the workers in the pool.
+
+    stderr: Union[None, int, IO[Any]]
+        The standard error streams handler specified for the workers in the pool.
+
     Note
     ----
     If max_workers is NONE then the number returned by
@@ -333,6 +351,8 @@ def __init__(
         initializer=None,
         initargs=(),
         maximum_process_uses=None,
+        stdout=None,
+        stderr=None,
     ):
         if max_workers is None:
             max_workers = os.cpu_count()
@@ -344,6 +364,8 @@ def __init__(
         self._initializer = initializer
         self._initargs = initargs
         self._maximum_process_uses = maximum_process_uses
+        self._stdout = stdout
+        self._stderr = stderr
 
         if self._initializer is not None and not callable(self._initializer):
             raise TypeError("initializer must be callable for PopenPoolExecutor")
@@ -363,7 +385,13 @@ def _worker_run(self, fn, args, kwargs):
         self._lock.acquire()
         tid = threading.get_ident()
         if tid not in self._worker_map:
-            proc = PopenWorker(self._initializer, self._initargs, self._maximum_process_uses)
+            proc = PopenWorker(
+                self._initializer,
+                self._initargs,
+                self._maximum_process_uses,
+                self._stdout,
+                self._stderr,
+            )
             self._worker_map[tid] = proc
         else:
             proc = self._worker_map[tid]
diff --git a/python/tvm/meta_schedule/runner/local_runner.py b/python/tvm/meta_schedule/runner/local_runner.py
index dfd4764607fb..6c83545584fd 100644
--- a/python/tvm/meta_schedule/runner/local_runner.py
+++ b/python/tvm/meta_schedule/runner/local_runner.py
@@ -17,6 +17,7 @@
 """Local Runner"""
 from contextlib import contextmanager
 from typing import Callable, List, Optional, Union
+import subprocess
 
 import tvm
 
@@ -277,6 +278,7 @@ def __init__(
             max_workers=1,  # one local worker
             timeout=timeout_sec,
             initializer=initializer,
+            stderr=subprocess.DEVNULL,  # suppress the stderr output
         )
         self._sanity_check()
 

From 9f047c0627522a575f9b57fd7f20f47408b000c1 Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Tue, 18 Oct 2022 11:32:43 +0530
Subject: [PATCH 378/704] [DOCKER][Adreno]Docker infra for Adreno target with
 CLML support (#12833)

* [DOCKER][Adreno] Docker infra for Adreno target with CLML support

New docker for Adreno that inherits gpu docker and amends android-sdk.

For CLML, we need to specify ADRENO_OPENCL via environment variable. CLML SDK can be downlaoded
from Qualcomm Developer Network by following
https://developer.qualcomm.com/blog/accelerate-your-models-our-opencl-ml-sdk
Adreno device is shared with host, hence networking is enabled for Adreno Docker.

* * CLML test cases incmpliance with pytest infra

* Update tests/scripts/task_build_adreno_bins.sh

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>

* * Addional container specific settings via CI invocation

* * review comments

* * trigger build

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>
---
 docker/Dockerfile.ci_adreno                   | 28 +++++++
 docker/bash.sh                                | 13 +++-
 docker/install/ubuntu_install_cmake_source.sh |  2 +-
 python/tvm/testing/utils.py                   | 13 +++-
 tests/python/contrib/test_clml/conftest.py    | 26 +++++++
 .../contrib/test_clml/infrastructure.py       | 43 +----------
 .../python/contrib/test_clml/test_network.py  | 52 ++++---------
 tests/python/contrib/test_clml/test_ops.py    | 74 +++++--------------
 .../test_conv2d_nchw_texture.py               | 42 +++++++----
 tests/scripts/ci.py                           | 34 ++++++++-
 tests/scripts/task_build_adreno_bins.sh       | 53 +++++++++++++
 tests/scripts/task_config_build_adreno.sh     | 31 ++++++++
 tests/scripts/task_python_adreno.sh           | 65 ++++++++++++++++
 13 files changed, 320 insertions(+), 156 deletions(-)
 create mode 100644 docker/Dockerfile.ci_adreno
 create mode 100644 tests/python/contrib/test_clml/conftest.py
 create mode 100755 tests/scripts/task_build_adreno_bins.sh
 create mode 100755 tests/scripts/task_config_build_adreno.sh
 create mode 100755 tests/scripts/task_python_adreno.sh

diff --git a/docker/Dockerfile.ci_adreno b/docker/Dockerfile.ci_adreno
new file mode 100644
index 000000000000..a08b2dfe8c64
--- /dev/null
+++ b/docker/Dockerfile.ci_adreno
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# CI docker GPU env
+FROM tlcpack/ci-gpu:20220908-060034-62bdc91b1
+
+COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
+
+# Android SDK
+COPY install/ubuntu_install_androidsdk.sh /install/ubuntu_install_androidsdk.sh
+RUN bash /install/ubuntu_install_androidsdk.sh
+ENV ANDROID_HOME=/opt/android-sdk-linux
+ENV ANDROID_NDK_HOME=/opt/android-sdk-linux/ndk/21.3.6528147
+ENV PATH /opt/android-sdk-linux/platform-tools:$PATH
diff --git a/docker/bash.sh b/docker/bash.sh
index 10d80478d3f7..3f8f3d8baba4 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -161,6 +161,7 @@ function parse_error() {
 break_joined_flag='if (( ${#1} == 2 )); then shift; else set -- -"${1#-i}" "${@:2}"; fi'
 
 DOCKER_ENV=( )
+DOCKER_FLAGS=( )
 
 while (( $# )); do
     case "$1" in
@@ -184,6 +185,11 @@ while (( $# )); do
             shift
             ;;
 
+        --net)
+            DOCKER_FLAGS+=( --net "$2" )
+            shift 2
+            ;;
+
         --mount)
             if [[ -n "$2" ]]; then
                 MOUNT_DIRS+=("$2")
@@ -212,6 +218,11 @@ while (( $# )); do
             shift 2
             ;;
 
+        --volume)
+            DOCKER_FLAGS+=( --volume "$2" )
+            shift 2
+            ;;
+
         --dry-run)
             DRY_RUN=true
             shift
@@ -284,7 +295,6 @@ fi
 
 source "$(dirname $0)/dev_common.sh" || exit 2
 
-DOCKER_FLAGS=( )
 DOCKER_MOUNT=( )
 DOCKER_DEVICES=( )
 
@@ -460,7 +470,6 @@ echo ""
 
 echo Running \'${COMMAND[@]+"${COMMAND[@]}"}\' inside ${DOCKER_IMAGE_NAME}...
 
-
 DOCKER_CMD=(${DOCKER_BINARY} run
             ${DOCKER_FLAGS[@]+"${DOCKER_FLAGS[@]}"}
             ${DOCKER_ENV[@]+"${DOCKER_ENV[@]}"}
diff --git a/docker/install/ubuntu_install_cmake_source.sh b/docker/install/ubuntu_install_cmake_source.sh
index 702130f07964..db0f990e0f8d 100755
--- a/docker/install/ubuntu_install_cmake_source.sh
+++ b/docker/install/ubuntu_install_cmake_source.sh
@@ -32,7 +32,7 @@ wget https://cmake.org/files/v${v}/cmake-${version}.tar.gz
 tar xvf cmake-${version}.tar.gz
 cd cmake-${version}
 ./bootstrap
-make -j"$(nproc)"
+make -j$(nproc)
 make install
 cd ..
 rm -rf cmake-${version} cmake-${version}.tar.gz
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index f89d5e636913..74ca326bca7e 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -900,8 +900,8 @@ def _any_gpu_exists():
     "OpenCL",
     cmake_flag="USE_OPENCL",
     target_kind_enabled="opencl",
-    target_kind_hardware="opencl",
-    parent_features="gpu",
+    target_kind_hardware="opencl" if "RPC_TARGET" not in os.environ else None,
+    parent_features="gpu" if "RPC_TARGET" not in os.environ else None,
 )
 
 # Mark a test as requiring the rocm runtime
@@ -934,6 +934,15 @@ def _any_gpu_exists():
     parent_features="gpu",
 )
 
+# Mark a test as requiring OpenCLML support in build.
+requires_openclml = Feature(
+    "OpenCLML",
+    "CLML",
+    cmake_flag="USE_CLML",
+    target_kind_enabled="opencl",
+)
+
+
 # Mark a test as requiring microTVM to run
 requires_micro = Feature("micro", "MicroTVM", cmake_flag="USE_MICRO")
 
diff --git a/tests/python/contrib/test_clml/conftest.py b/tests/python/contrib/test_clml/conftest.py
new file mode 100644
index 000000000000..a51fc8edf107
--- /dev/null
+++ b/tests/python/contrib/test_clml/conftest.py
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+import tvm
+import pytest
+from test_clml.infrastructure import Device
+
+
+@pytest.fixture(scope="session")
+def device():
+    return Device()
diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py
index 08b11525ecd2..12accda3fda5 100644
--- a/tests/python/contrib/test_clml/infrastructure.py
+++ b/tests/python/contrib/test_clml/infrastructure.py
@@ -73,12 +73,12 @@ class Device:
     """
 
     connection_type = "tracker"
-    host = "localhost"
-    port = 9150
+    host = os.getenv("TVM_TRACKER_HOST", "localhost")
+    port = int(os.getenv("TVM_TRACKER_PORT", 9090))
     target = "opencl"
     target_host = "llvm -mtriple=aarch64-linux-gnu"
     device_key = "android"
-    cross_compile = "aarch64-linux-android-g++"
+    cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++")
 
     def __init__(self):
         """Keep remote device for lifetime of object."""
@@ -100,43 +100,6 @@ def _get_remote(cls):
 
         return device
 
-    @classmethod
-    def load(cls, file_name):
-        """Load test config
-
-        Load the test configuration by looking for file_name relative
-        to the test_clml directory.
-        """
-        location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-        config_file = os.path.join(location, file_name)
-        if not os.path.exists(config_file):
-            warnings.warn("Config file doesn't exist, resuming CLML tests with default config.")
-            return
-        with open(config_file, mode="r") as config:
-            test_config = json.load(config)
-
-        cls.connection_type = test_config["connection_type"]
-        cls.host = test_config["host"]
-        cls.port = test_config["port"]
-        cls.target = test_config["target"]
-        cls.target_host = test_config["target_host"]
-        cls.device_key = test_config.get("device_key") or ""
-        cls.cross_compile = test_config.get("cross_compile") or ""
-
-
-def skip_runtime_test():
-    """Skip test if it requires the runtime and it's not present."""
-    # CLML codegen not present.
-    if not tvm.get_global_func("relay.ext.clml", True):
-        print("Skip because CLML codegen is not available.")
-        return True
-
-    # Remote device is in use or CLML runtime not present
-    # Note: Ensure that the device config has been loaded before this check
-    if not Device.connection_type != "local" and not clml.is_clml_runtime_enabled():
-        print("Skip because runtime isn't present or a remote device isn't being used.")
-        return True
-
 
 def skip_codegen_test():
     """Skip test if it requires the CLML codegen and it's not present."""
diff --git a/tests/python/contrib/test_clml/test_network.py b/tests/python/contrib/test_clml/test_network.py
index 95f3a45baf78..8d740d6dce4d 100644
--- a/tests/python/contrib/test_clml/test_network.py
+++ b/tests/python/contrib/test_clml/test_network.py
@@ -16,13 +16,13 @@
 # under the License.
 """OpenCL ML network tests."""
 
+import tvm
 import numpy as np
-import pytest
-from tvm import testing
 from tvm import relay
-
-import tvm
-from test_clml.infrastructure import skip_runtime_test, build_and_run, Device
+from tvm.relay import testing
+from tvm.contrib import utils
+from test_clml.infrastructure import build_and_run, Device
+import pytest
 
 
 def _build_and_run_network(mod, params, inputs, data, device, atol, rtol, tvm_log=""):
@@ -59,15 +59,9 @@ def get_bottom_top_model(model, layer_name):
     return mod, params, ref_output
 
 
-def test_mobilenet():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    dtype = "float16"
-
+@pytest.mark.parametrize("dtype", ["float16"])
+@tvm.testing.requires_openclml
+def test_mobilenet(device, dtype):
     def get_model():
         from tensorflow.keras.applications import MobileNet
         import tensorflow as tf
@@ -107,15 +101,9 @@ def get_model():
     tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5)
 
 
-def test_inception_v3():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    dtype = "float16"
-
+@pytest.mark.parametrize("dtype", ["float16"])
+@tvm.testing.requires_openclml
+def test_inception_v3(device, dtype):
     def get_model():
         from tensorflow.keras.applications import InceptionV3
         import tensorflow as tf
@@ -150,15 +138,9 @@ def get_model():
     tvm.testing.assert_allclose(opencl_sort[:5], clml_sort[:5], rtol=1e-5, atol=1e-5)
 
 
-def test_resnet50v2():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    dtype = "float16"
-
+@pytest.mark.parametrize("dtype", ["float16"])
+@tvm.testing.requires_openclml
+def test_resnet50v2(device, dtype):
     def get_model():
         from tensorflow.keras.applications import ResNet50V2
         import tensorflow as tf
@@ -202,9 +184,3 @@ def get_model():
     clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
 
     tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5)
-
-
-if __name__ == "__main__":
-    test_mobilenet()
-    test_resnet50v2()
-    test_inception_v3()
diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py
index d14a5ec6e90d..d2431d2dfd3b 100644
--- a/tests/python/contrib/test_clml/test_ops.py
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -16,21 +16,14 @@
 # under the License.
 """CLML integration conv2d tests."""
 
-import numpy as np
-
-np.random.seed(0)
-
 import tvm
-from tvm import testing
+import numpy as np
 from tvm import relay
+from tvm.relay import testing
 from tvm.ir import IRModule
-
-from test_clml.infrastructure import (
-    skip_runtime_test,
-    skip_codegen_test,
-    build_and_run,
-    Device,
-)
+from tvm.contrib import utils
+from test_clml.infrastructure import build_and_run, Device, skip_codegen_test
+import pytest
 
 
 def _get_conv_model(
@@ -98,17 +91,9 @@ def _get_conv_model(
     return out, params
 
 
-def test_conv2d():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    dtype = "float32"
-
+@pytest.mark.parametrize("dtype", ["float32"])
+@tvm.testing.requires_openclml
+def test_conv2d(device, dtype):
     trials = [
         # Normal convolution
         [3, 3, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False)],
@@ -168,17 +153,9 @@ def test_conv2d():
         )
 
 
-def test_batchnorm():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    dtype = "float32"
-
+@pytest.mark.parametrize("dtype", ["float16"])
+@tvm.testing.requires_openclml
+def _test_batchnorm(device, dtype):
     in_shape = (1, 8, 64, 64)
     channels = 8
 
@@ -211,14 +188,9 @@ def test_batchnorm():
     )
 
 
-def test_concat():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    dtype = "float16"
+@pytest.mark.parametrize("dtype", ["float16"])
+@tvm.testing.requires_openclml
+def test_concat(device, dtype):
     in_shape_1 = (1, 16, 16, 16)
     in_shape_2 = (1, 16, 16, 16)
     a = relay.var("input_1", shape=in_shape_1, dtype=dtype)
@@ -241,14 +213,9 @@ def test_concat():
     )
 
 
-def test_avgpool():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    dtype = "float16"
+@pytest.mark.parametrize("dtype", ["float16"])
+@tvm.testing.requires_openclml
+def test_avgpool(device, dtype):
     trials = [
         # input size         pool_size stride  paading
         [(1, 64, 147, 147), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
@@ -288,10 +255,3 @@ def test_avgpool():
         tvm.testing.assert_allclose(
             clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
         )
-
-
-if __name__ == "__main__":
-    test_conv2d()
-    # test_batchnorm()
-    test_avgpool()
-    test_concat()
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
index 504a2b4e3ed3..0513a2d3f663 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
@@ -479,7 +479,6 @@ def test_conv2d_winograd_conv(target, dtype):
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-@pytest.mark.skipif(tvm.testing.utils.IS_IN_CI, reason="failed due to nvidia libOpencl in the CI")
 def test_residual_block(target, dtype):
     """
     - some kind of residual block followed by convolution to have texture after residual block
@@ -569,20 +568,33 @@ def test_residual_block(target, dtype):
         "weight2": tvm.nd.array(filter_data2),
         "weight3": tvm.nd.array(filter_data3),
     }
-
-    static_memory_scope = [
-        "global",
-        "global.texture",
-        "global.texture-weight",
-        "global.texture-weight",
-        "global.texture",
-        "global.texture-weight",
-        "global",
-        "global.texture",
-        "global.texture-weight",
-        "",
-        "",
-    ]
+    if dtype == "float16":
+        static_memory_scope = [
+            "global",
+            "global.texture",
+            "global.texture-weight",
+            "global.texture-weight",
+            "global.texture",
+            "global.texture-weight",
+            "global",
+            "global.texture",
+            "global.texture-weight",
+            "",
+            "",
+        ]
+    else:
+        static_memory_scope = [
+            "global",
+            "global.texture",
+            "global.texture-weight",
+            "global.texture-weight",
+            "global.texture",
+            "global.texture-weight",
+            "global.texture",
+            "global.texture-weight",
+            "",
+            "",
+        ]
 
     build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
 
diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py
index 31b7316d88d8..02ef7b888b80 100755
--- a/tests/scripts/ci.py
+++ b/tests/scripts/ci.py
@@ -147,7 +147,14 @@ def gen_name(s: str) -> str:
     return f"{s}-{suffix}"
 
 
-def docker(name: str, image: str, scripts: List[str], env: Dict[str, str], interactive: bool):
+def docker(
+    name: str,
+    image: str,
+    scripts: List[str],
+    env: Dict[str, str],
+    interactive: bool,
+    additional_flags: Dict[str, str],
+):
     """
     Invoke a set of bash scripts through docker/bash.sh
 
@@ -169,6 +176,7 @@ def docker(name: str, image: str, scripts: List[str], env: Dict[str, str], inter
         "ci_arm",
         "ci_hexagon",
         "ci_riscv",
+        "ci_adreno",
     }
 
     if image in sccache_images and os.getenv("USE_SCCACHE", "1") == "1":
@@ -196,6 +204,10 @@ def docker(name: str, image: str, scripts: List[str], env: Dict[str, str], inter
         command.append("--env")
         command.append(f"{key}={value}")
 
+    for key, value in additional_flags.items():
+        command.append(key)
+        command.append(value)
+
     SCRIPT_DIR.mkdir(exist_ok=True)
 
     script_file = SCRIPT_DIR / f"{name}.sh"
@@ -345,6 +357,7 @@ def generate_command(
     help: str,
     precheck: Optional[Callable[[], None]] = None,
     post_build: Optional[List[str]] = None,
+    additional_flags: Dict[str, str] = {},
 ):
     """
     Helper to generate CLIs that:
@@ -411,6 +424,7 @@ def fn(
                 "VERBOSE": "true" if verbose else "false",
             },
             interactive=interactive,
+            additional_flags=additional_flags,
         )
 
     fn.__name__ = name
@@ -687,6 +701,24 @@ def add_subparser(
             ),
         },
     ),
+    generate_command(
+        name="adreno",
+        help="Run Adreno build and test(s)",
+        post_build=["./tests/scripts/task_build_adreno_bins.sh"],
+        additional_flags={
+            "--volume": os.environ.get("ADRENO_OPENCL", "") + ":/adreno-opencl",
+            "--env": "ADRENO_OPENCL=/adreno-opencl",
+            "--net": "host",
+        },
+        options={
+            "test": (
+                "run Adreno API/Python tests",
+                [
+                    "./tests/scripts/task_python_adreno.sh " + os.environ.get("ANDROID_SERIAL", ""),
+                ],
+            ),
+        },
+    ),
 ]
 
 
diff --git a/tests/scripts/task_build_adreno_bins.sh b/tests/scripts/task_build_adreno_bins.sh
new file mode 100755
index 000000000000..5d453251606a
--- /dev/null
+++ b/tests/scripts/task_build_adreno_bins.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -x
+
+output_directory=$(realpath ${PWD}/build-adreno-target)
+rm -rf ${output_directory}
+
+mkdir -p ${output_directory}
+cd ${output_directory}
+
+cp ../cmake/config.cmake .
+
+echo set\(USE_CLML ON\) >> config.cmake
+echo set\(USE_CLML_GRAPH_EXECUTOR "${ADRENO_OPENCL}"\) >> config.cmake
+echo set\(USE_RPC ON\) >> config.cmake
+echo set\(USE_CPP_RPC ON\) >> config.cmake
+echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake
+echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
+
+echo set\(ANDROID_ABI arm64-v8a\) >> config.cmake
+echo set\(ANDROID_PLATFORM android-28\) >> config.cmake
+echo set\(MACHINE_NAME aarch64-linux-gnu\) >> config.cmake
+
+cmake -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake" \
+      -DANDROID_ABI=arm64-v8a \
+      -DANDROID_PLATFORM=android-28 \
+      -DCMAKE_SYSTEM_VERSION=1 \
+      -DCMAKE_FIND_ROOT_PATH="${ADRENO_OPENCL}" \
+      -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+      -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+      -DCMAKE_CXX_COMPILER="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang++" \
+      -DCMAKE_C_COMPILER="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang" \
+      -DMACHINE_NAME="aarch64-linux-gnu" ..
+
+make -j$(nproc) tvm_rpc
diff --git a/tests/scripts/task_config_build_adreno.sh b/tests/scripts/task_config_build_adreno.sh
new file mode 100755
index 000000000000..d45c5e8b7dcf
--- /dev/null
+++ b/tests/scripts/task_config_build_adreno.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euxo pipefail
+
+BUILD_DIR=$1
+mkdir -p "$BUILD_DIR"
+cd "$BUILD_DIR"
+cp ../cmake/config.cmake .
+
+echo set\(USE_OPENCL ON\) >> config.cmake
+echo set\(USE_CLML ON\) >> config.cmake
+echo set\(USE_RPC ON\) >> config.cmake
+echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake
+echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
+echo set\(USE_LLVM ON\) >> config.cmake
diff --git a/tests/scripts/task_python_adreno.sh b/tests/scripts/task_python_adreno.sh
new file mode 100755
index 000000000000..2b131ec762be
--- /dev/null
+++ b/tests/scripts/task_python_adreno.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euxo pipefail
+
+export TVM_TEST_TARGETS="opencl"
+export TVM_RELAY_OPENCL_TEXTURE_TARGETS="opencl -device=adreno"
+
+source tests/scripts/setup-pytest-env.sh
+export PYTHONPATH=${PYTHONPATH}:${TVM_PATH}/apps/extension/python
+export LD_LIBRARY_PATH="build:${LD_LIBRARY_PATH:-}"
+export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-adreno
+
+export TVM_TRACKER_HOST=127.0.0.1
+export TVM_TRACKER_PORT=$(((RANDOM % 100) + 9100))
+export RPC_TARGET="adreno"
+export TVM_NDK_CC="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang"
+
+env PYTHONPATH=python python3 -m tvm.exec.rpc_tracker --host "${TVM_TRACKER_HOST}" --port "${TVM_TRACKER_PORT}" &
+TRACKER_PID=$!
+sleep 5   # Wait for tracker to bind
+
+export ANDROID_SERIAL=$1
+
+adb shell "mkdir -p /data/local/tmp/tvm_ci"
+adb push build-adreno-target/tvm_rpc /data/local/tmp/tvm_ci/tvm_rpc_ci
+adb push build-adreno-target/libtvm_runtime.so /data/local/tmp/tvm_ci
+
+adb reverse tcp:${TVM_TRACKER_PORT} tcp:${TVM_TRACKER_PORT}
+adb forward tcp:5000 tcp:5000
+adb forward tcp:5001 tcp:5001
+adb forward tcp:5002 tcp:5002
+env adb shell "cd /data/local/tmp/tvm_ci; killall -9 tvm_rpc_ci; sleep 2; LD_LIBRARY_PATH=/data/local/tmp/tvm_ci/ ./tvm_rpc_ci server --host=0.0.0.0 --port=5000 --port-end=5010 --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=android" &
+DEVICE_PID=$!
+sleep 5 # Wait for the device connections
+trap "{ kill ${TRACKER_PID}; kill ${DEVICE_PID}; }" 0
+
+# cleanup pycache
+find . -type f -path "*.pyc" | xargs rm -f
+# Test TVM
+make cython3
+
+# OpenCL texture test on Adreno
+run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture tests/python/relay/opencl_texture
+
+# Adreno CLML test
+run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-openclml tests/python/contrib/test_clml
+
+kill ${TRACKER_PID}
+kill ${DEVICE_PID}

From 48be4ff3449166a7fc1e9209bb3f165ebe769e34 Mon Sep 17 00:00:00 2001
From: AndrewZhaoLuo <andrew.zhao.luo@gmail.com>
Date: Tue, 18 Oct 2022 09:32:19 -0700
Subject: [PATCH 379/704] [Docs] Add instructions on downloads page updating on
 release process (#13106)

release process
---
 docs/contribute/release_process.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/contribute/release_process.rst b/docs/contribute/release_process.rst
index a0ebda650524..129d65fc9043 100644
--- a/docs/contribute/release_process.rst
+++ b/docs/contribute/release_process.rst
@@ -233,10 +233,12 @@ The website repository is located at `https://github.com/apache/tvm-site <https:
 
 	# add the docs and push
 	git add .
-	git commit -m"Add v0.9.0 docs"
+	git commit -m "Add v0.9.0 docs"
 	git push
 
 
+Afterwards, modify the `downloads page <https://tvm.apache.org/download>`_ to support the latest release. An example of how to do this is `here <https://github.com/apache/tvm-site/pull/38>`_.
+
 Post the Announcement
 ---------------------
 

From 64975a425fb52e4eb317d7ffa821151a5a77e829 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Tue, 18 Oct 2022 14:06:52 -0500
Subject: [PATCH 380/704] [skip ci][COMMUNITY] gigiblender -> Reviewer (#13122)

Add Florin Blanaru to the list of reviewers.
---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 383bd9032683..d5fea2181a11 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -92,6 +92,7 @@ We do encourage everyone to work anything they are interested in.
 - [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri
 - [Matthew Barrett](https://github.com/mbaret): @mbaret
 - [Arnaud Bergeron](https://github.com/abergeron): @abergeron
+- [Florin Blanaru](https://github.com/gigiblender): @gigiblender
 - [Matthew Brookhart](https://github.com/mbrookhart): @mbrookhart
 - [Yaxing Cai](https://github.com/cyx-6): @cyx-6
 - [Liangfu Chen](https://github.com/liangfu): @liangfu

From 6056e13db9a4f933ed1c481767a6fba6a5bb3203 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Tue, 18 Oct 2022 22:08:13 +0300
Subject: [PATCH 381/704] [Adreno] Fix winograd accuracy (#13117)

* [Adreno] Fix winograd accuracy

For some convolutions winograd didn't work in proper way. The issue was
in the layout after alter_op.

* Apply comments
---
 python/tvm/topi/adreno/conv2d_alter_op.py     |   3 +
 .../test_conv2d_nchw_texture.py               |  33 ++++++
 .../test_conv2d_nhwc_texture.py               | 101 ++++++++++++++++++
 3 files changed, 137 insertions(+)

diff --git a/python/tvm/topi/adreno/conv2d_alter_op.py b/python/tvm/topi/adreno/conv2d_alter_op.py
index 6cf749a62b27..cf72cc2a846e 100644
--- a/python/tvm/topi/adreno/conv2d_alter_op.py
+++ b/python/tvm/topi/adreno/conv2d_alter_op.py
@@ -143,8 +143,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         CO, _, KH, KW = get_const_tuple(kernel_tensor.shape)
 
         # pre-compute weight transformation in winograd
+        # alpha, alpha, CO, CI
         weight = relay.nn.contrib_conv2d_winograd_weight_transform(inputs[1], tile_size=tile_size)
         weight = relay.transpose(weight, axes=[2, 3, 0, 1])  # HWOI -> OIHW
+        # (oc, ic, h, w) -> (h, w, ic, oc)
+        new_attrs["kernel_layout"] = "HWIO"
         new_attrs["tile_size"] = tile_size
         new_attrs["channels"] = CO
 
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
index 0513a2d3f663..c73e411a700e 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
@@ -1041,3 +1041,36 @@ def test_conv2d_different_lowering_same_op(target, dtype):
     ]
 
     build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_winograd_non_rect(target, dtype):
+    input_shape = (1, 771, 36, 64)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    filter_shape = (128, 771, 3, 3)
+    B = relay.var("weight", shape=filter_shape, dtype=dtype)
+    D = relay.nn.conv2d(
+        A, B, padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3], out_dtype=dtype
+    )
+
+    mod = relay.Function([A, B], D)
+    np.random.seed(1)
+    initializer = relay.testing.init.Xavier()
+    filter_data = np.zeros(filter_shape).astype(dtype)
+    initializer("weight", filter_data)
+    params1 = {
+        "weight": tvm.nd.array(filter_data),
+    }
+
+    temp = utils.tempdir()
+    stat_file = temp.relpath("stat.log")
+    with open(stat_file, "w") as f:
+        f.write(
+            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256 -texture_spatial_limit=16384 -thread_warp_size=1", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 771, 36, 64], "{dtype}"], ["TENSOR", [128, 771, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 5399, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 16], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 8]], ["tile_rc", "sp", [-1, 193]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
+        )
+    graph = build_run_compare(
+        mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
+    )
+    matches = re.findall("winograd", graph)
+    assert len(matches) > 0
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
index 37c22137f035..0b89e3dc9c7f 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
@@ -581,3 +581,104 @@ def test_conv2d_vgg16_winograd_4d(target, dtype):
     )
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_winograd_conv(target, dtype):
+    input_shape = (1, 3, 3, 4)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    filter_shape3 = (3, 3, 4, 8)
+    bias_shape3 = (1, 1, 1, 8)
+    B3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+    D = relay.nn.conv2d(
+        A,
+        B3,
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        padding=[1, 1, 1, 1],
+        channels=8,
+        kernel_size=[3, 3],
+        out_dtype=dtype,
+    )
+
+    filter_shape4 = (3, 3, 8, 8)
+    bias_shape4 = (1, 1, 1, 8)
+    B4 = relay.var("weight4", shape=filter_shape4, dtype=dtype)
+    D = relay.nn.conv2d(
+        D,
+        B4,
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        padding=[1, 1, 1, 1],
+        channels=8,
+        kernel_size=[3, 3],
+        out_dtype=dtype,
+    )
+    mod = relay.Function([A, B3, B4], D)
+    np.random.seed(1)
+    initializer = relay.testing.init.Xavier()
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    bias_data3 = np.zeros(bias_shape3).astype(dtype)
+    filter_data4 = np.zeros(filter_shape4).astype(dtype)
+    bias_data4 = np.zeros(bias_shape4).astype(dtype)
+    initializer("weight", filter_data3)
+    initializer("bias", bias_data3)
+    initializer("weight", filter_data4)
+    initializer("bias", bias_data4)
+    params1 = {
+        "weight3": tvm.nd.array(filter_data3),
+        "weight4": tvm.nd.array(filter_data4),
+    }
+
+    temp = utils.tempdir()
+    stat_file = temp.relpath("stat.log")
+    with open(stat_file, "w") as f:
+        f.write(
+            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 3, 3, 4], "{dtype}"], ["TENSOR", [3, 3, 4, 8], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
+        )
+    graph = build_run_compare(
+        mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
+    )
+    matches = re.findall("winograd", graph)
+    assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_winograd_non_rect(target, dtype):
+    input_shape = (1, 36, 64, 771)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    filter_shape = (3, 3, 771, 128)
+    B = relay.var("weight", shape=filter_shape, dtype=dtype)
+    D = relay.nn.conv2d(
+        A,
+        B,
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        padding=[1, 1, 1, 1],
+        channels=128,
+        kernel_size=[3, 3],
+        out_dtype=dtype,
+    )
+
+    mod = relay.Function([A, B], D)
+    np.random.seed(1)
+    initializer = relay.testing.init.Xavier()
+    filter_data = np.zeros(filter_shape).astype(dtype)
+    initializer("weight", filter_data)
+    params1 = {
+        "weight": tvm.nd.array(filter_data),
+    }
+
+    temp = utils.tempdir()
+    stat_file = temp.relpath("stat.log")
+    with open(stat_file, "w") as f:
+        f.write(
+            f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256 -texture_spatial_limit=16384 -thread_warp_size=1", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 36, 64, 771], "{dtype}"], ["TENSOR", [3, 3, 771, 128], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 5399, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 16], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 8]], ["tile_rc", "sp", [-1, 193]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
+        )
+    graph = build_run_compare(
+        mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
+    )
+    matches = re.findall("winograd", graph)
+    assert len(matches) > 0

From 010d05c6804dd44fa28da27e5af787cceb74ed60 Mon Sep 17 00:00:00 2001
From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com>
Date: Tue, 18 Oct 2022 22:20:14 +0300
Subject: [PATCH 382/704] [QNN][Hexagon] Disable QNN canonicalization pass
 (#12398)

* [QNN] Disable QNN canonicalization pass.

This commit enables work of TVM without QNN canonicalization pass.
It adds new TOPI ops for QNN + simple compute/schedules.

* added dependence of the qnn::transform::Legalize pass launch on target.

* Added new dense topi operator for the pattern qnn.dense+bias+requantize

* Added support of axis attribute for QNN TOPI ops

* Fixed TOPI compute implementation for qnn.add

* Fixed issue with non zero padding value for qnn.conv2d

* Fixed Bias.add for qnn.conv2d

* Added support of depthwise qnn.conv2d topi operator

* Added support of 1D quantization params in qnn.dequantize

* Added support of qnn.concatenate

* Fixed out of range array access

* Added meta_schedule_original_shape attribute in QDenseAttr and
QConv2DAttr

* Added support of qnn.batch_matmul as a standalone op.

* Added per channel zp in qnn.dense and qnn.conv2d.

* Fixed corner cases like dense+bias+bias+rq.

* Added unit test.

* Removed rq_out_dtype and axis attributes declaration in QConv2DAttra and
QDenseAttrs.

* Changed target x86->Hexagon to disable QNN passes.

* Fixed issue with QDenseAttrs and QConv2dAttrs.

* Fixed build for Cortex-M.

* Removed QDenseAttrs and QConv2dAttrs

* Fix tests after rebase

* Address code review comments.

* [QNN] Add option to disabe QNN passes.

QNN passes are enabled by default. To disable use
disabled_pass=["qnn.Legalize"] in pass config.

* Revert changes of GetPassPrefix interface.
---
 include/tvm/runtime/data_type.h               |  10 +
 python/tvm/relay/backend/te_compiler.py       |  33 +-
 python/tvm/relay/qnn/op/_qnn.py               |  35 +-
 python/tvm/relay/qnn/op/qnn.py                |   7 -
 python/tvm/relay/qnn/strategy/__init__.py     |  23 +
 python/tvm/relay/qnn/strategy/generic.py      | 249 +++++++
 python/tvm/relay/qnn/strategy/hexagon.py      | 136 ++++
 python/tvm/te/__init__.py                     |   1 +
 python/tvm/tir/__init__.py                    |   1 +
 python/tvm/topi/hexagon/qnn/__init__.py       |   1 +
 python/tvm/topi/hexagon/qnn/nn.py             | 667 ++++++++++++++++++
 src/relay/backend/te_compiler_cache.cc        | 111 ++-
 src/relay/qnn/pass/legalize.cc                |   2 +-
 src/relay/transforms/fuse_ops.cc              |   4 +-
 .../test_wo_qnn_canonicalization.py           | 185 +++++
 15 files changed, 1431 insertions(+), 34 deletions(-)
 create mode 100644 python/tvm/relay/qnn/strategy/__init__.py
 create mode 100644 python/tvm/relay/qnn/strategy/generic.py
 create mode 100644 python/tvm/relay/qnn/strategy/hexagon.py
 create mode 100644 python/tvm/topi/hexagon/qnn/nn.py
 create mode 100644 tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py

diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h
index e0c3106e14fa..7f68ce2ad5bb 100644
--- a/include/tvm/runtime/data_type.h
+++ b/include/tvm/runtime/data_type.h
@@ -124,6 +124,16 @@ class DataType {
    * \return the result type.
    */
   DataType element_of() const { return with_lanes(1); }
+  /*!
+   * \brief Assignment operator.
+   */
+  DataType& operator=(const DataType& rhs) {
+    if (this == &rhs) {
+      return *this;
+    }
+    data_ = rhs.data_;
+    return *this;
+  }
   /*!
    * \brief Equal comparator.
    * \param other The data type to compare against.
diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py
index a2fbf555e12b..173f31ef08f9 100644
--- a/python/tvm/relay/backend/te_compiler.py
+++ b/python/tvm/relay/backend/te_compiler.py
@@ -281,25 +281,28 @@ def get_shape(shape):
 
 
 @tvm._ffi.register_func("relay.backend.lower_call")
-def lower_call(call, inputs, target):
+def lower_call(call, inputs, target, otype=None):
     """Lower the call expression to op implementation and tensor outputs."""
     assert isinstance(call.op, tvm.ir.Op)
     op = call.op
 
-    # Prepare the call_node->checked_type(). For the call node inputs, we ensure that
-    # the shape is Int32. Following code ensures the same for the output as well.
-    # TODO(@icemelon9): Support recursive tuple
-    ret_type = call.checked_type
-    if isinstance(ret_type, _ty.TensorType):
-        ret_type = _ty.TensorType(get_shape(ret_type.shape), ret_type.dtype)
-    elif isinstance(ret_type, _ty.TupleType):
-        new_fields = []
-        for field in ret_type.fields:
-            if isinstance(field, _ty.TensorType):
-                new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype))
-            else:
-                new_fields.append(field)
-        ret_type = _ty.TupleType(new_fields)
+    if otype is not None:
+        ret_type = otype
+    else:
+        # Prepare the call_node->checked_type(). For the call node inputs, we ensure that
+        # the shape is Int32. Following code ensures the same for the output as well.
+        # TODO(@icemelon9): Support recursive tuple
+        ret_type = call.checked_type
+        if isinstance(ret_type, _ty.TensorType):
+            ret_type = _ty.TensorType(get_shape(ret_type.shape), ret_type.dtype)
+        elif isinstance(ret_type, _ty.TupleType):
+            new_fields = []
+            for field in ret_type.fields:
+                if isinstance(field, _ty.TensorType):
+                    new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype))
+                else:
+                    new_fields.append(field)
+            ret_type = _ty.TupleType(new_fields)
 
     is_dyn = _ty.is_dynamic(call.checked_type)
     for arg in call.args:
diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py
index a059c293a0f8..4e54583a3be0 100644
--- a/python/tvm/relay/qnn/op/_qnn.py
+++ b/python/tvm/relay/qnn/op/_qnn.py
@@ -19,9 +19,10 @@
 
 from tvm import topi
 
+from .. import strategy
 from ...op.op import register_compute
 from ...op.op import register_injective_schedule
-from ...op.op import register_pattern, OpPattern
+from ...op.op import register_strategy, register_pattern, OpPattern
 
 
 @register_compute("qnn.simulated_quantize")
@@ -50,3 +51,35 @@ def simulated_dequantize_compute(attrs, inputs, output_type):
 
 register_injective_schedule("qnn.simulated_dequantize")
 register_pattern("qnn.simulated_dequantize", OpPattern.ELEMWISE)
+
+# qnn.quantize
+register_strategy("qnn.quantize", strategy.qnn_quantize_strategy)
+register_pattern("qnn.quantize", OpPattern.ELEMWISE)
+
+# qnn.dequantize
+register_strategy("qnn.dequantize", strategy.qnn_dequantize_strategy)
+register_pattern("qnn.dequantize", OpPattern.ELEMWISE)
+
+# qnn.requantize
+register_strategy("qnn.requantize", strategy.qnn_requantize_strategy)
+register_pattern("qnn.requantize", OpPattern.ELEMWISE)
+
+# qnn.add
+register_strategy("qnn.add", strategy.qnn_add_strategy)
+register_pattern("qnn.add", OpPattern.BROADCAST)
+
+# qnn.concatenate
+register_strategy("qnn.concatenate", strategy.qnn_concatenate_strategy)
+register_pattern("qnn.concatenate", OpPattern.INJECTIVE)
+
+# qnn.conv2d
+register_strategy("qnn.conv2d", strategy.qnn_conv2d_strategy)
+register_pattern("qnn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+# qnn.dense
+register_strategy("qnn.dense", strategy.qnn_dense_strategy)
+register_pattern("qnn.dense", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+# qnn.batch_matmul
+register_strategy("qnn.batch_matmul", strategy.qnn_batch_matmul_strategy)
+register_pattern("qnn.batch_matmul", OpPattern.OUT_ELEMWISE_FUSABLE)
diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
index 1f383851071b..78d6669413ca 100644
--- a/python/tvm/relay/qnn/op/qnn.py
+++ b/python/tvm/relay/qnn/op/qnn.py
@@ -29,8 +29,6 @@
 from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE
 from tvm.topi.x86.utils import target_has_sse41
 
-from ... import op as reg
-from ...op import OpPattern
 from . import _make, _requantize
 
 
@@ -1212,11 +1210,6 @@ def batch_matmul(x, y, x_zero_point, y_zero_point, x_scale, y_scale, out_dtype="
     return _make.batch_matmul(x, y, x_zero_point, y_zero_point, x_scale, y_scale, out_dtype)
 
 
-# register fuse pattern for qnn ops
-reg.register_pattern("qnn.quantize", OpPattern.OPAQUE)
-reg.register_pattern("qnn.dequantize", OpPattern.OPAQUE)
-
-
 def leaky_relu(x, alpha, input_scale, input_zero_point, output_scale, output_zero_point):
     """Quantized leaky relu.
 
diff --git a/python/tvm/relay/qnn/strategy/__init__.py b/python/tvm/relay/qnn/strategy/__init__.py
new file mode 100644
index 000000000000..05778c3e9f86
--- /dev/null
+++ b/python/tvm/relay/qnn/strategy/__init__.py
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: disable=wildcard-import
+"""QNN op strategies."""
+from __future__ import absolute_import as _abs
+
+from .generic import *
+from . import hexagon
diff --git a/python/tvm/relay/qnn/strategy/generic.py b/python/tvm/relay/qnn/strategy/generic.py
new file mode 100644
index 000000000000..57a364f7e057
--- /dev/null
+++ b/python/tvm/relay/qnn/strategy/generic.py
@@ -0,0 +1,249 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Definition of generic operator strategy."""
+
+from tvm.target import override_native_generic_func
+
+
+def wrap_topi_schedule(topi_schedule):
+    """Wrap TOPI schedule which doesn't use attrs"""
+
+    def wrapper(_attrs, outs, target):
+        with target:
+            return topi_schedule(outs)
+
+    return wrapper
+
+
+def wrap_topi_compute(topi_compute):
+    """Wrap TOPI compute which doesn't use attrs"""
+
+    def wrapper(_attrs, inputs, _out_type):
+        return [topi_compute(*inputs)]
+
+    return wrapper
+
+
+def wrap_compute_quantize(topi_compute):
+    """Wrap TOPI compute which use axis and out data type from attrs"""
+
+    def wrapper(attrs, inputs, _out_type):
+        axis = attrs.axis
+        out_dtype = attrs.out_dtype
+        args = [*inputs, axis, out_dtype]
+        return [topi_compute(*args)]
+
+    return wrapper
+
+
+def wrap_compute_dequantize(topi_compute):
+    """Wrap TOPI compute which use axis from attrs"""
+
+    def wrapper(attrs, inputs, _out_type):
+        args = [*inputs, attrs.axis]
+        return [topi_compute(*args)]
+
+    return wrapper
+
+
+def wrap_topi_qnn_conv2d(topi_compute):
+    """Wrap TOPI compute which use conv2d attrs and output data type"""
+
+    def wrapper(attrs, inputs, out_type):
+        out_dtype = out_type.dtype
+        oshape = out_type.shape
+        strides = attrs.strides
+        padding = attrs.padding
+        dilation = attrs.dilation
+        if len([*inputs]) == 11:
+            args = [*inputs, strides, padding, dilation, oshape, out_dtype]
+        elif len([*inputs]) == 10:
+            args = [  # QNN Conv2d params:
+                inputs[0],
+                inputs[1],
+                inputs[2],
+                inputs[3],
+                inputs[4],
+                inputs[5],
+                # Bias argument
+                None,
+                # Requantization params:
+                inputs[6],
+                inputs[7],
+                inputs[8],
+                inputs[9],
+                # Conv2d attrs:
+                strides,
+                padding,
+                dilation,
+                oshape,
+                out_dtype,
+            ]
+        else:
+            assert len([*inputs]) == 6
+            args = [  # QNN Conv2d params:
+                *inputs,
+                # Bias argument:
+                None,
+                # Requantization params:
+                None,
+                None,
+                None,
+                None,
+                strides,
+                padding,
+                dilation,
+                oshape,
+                out_dtype,
+            ]
+        return [topi_compute(*args)]
+
+    return wrapper
+
+
+def wrap_topi_qnn_dense(topi_compute):
+    """Wrap TOPI compute which use qnn.dense attrs"""
+
+    def wrapper(_attrs, inputs, out_type):
+        out_dtype = out_type.dtype
+        if len([*inputs]) == 11:
+            args = [*inputs, out_dtype]
+        elif len([*inputs]) == 10:
+            args = [  # QNN Dense params:
+                inputs[0],
+                inputs[1],
+                inputs[2],
+                inputs[3],
+                inputs[4],
+                inputs[5],
+                # Bias argument
+                None,
+                # Requantization params:
+                inputs[6],
+                inputs[7],
+                inputs[8],
+                inputs[9],
+                out_dtype,
+            ]
+        else:
+            assert len([*inputs]) == 6
+            args = [  # QNN Dense params:
+                *inputs,
+                # Bias argument:
+                None,
+                # Requantization params:
+                None,
+                None,
+                None,
+                None,
+                out_dtype,
+            ]
+        return [topi_compute(*args)]
+
+    return wrapper
+
+
+def wrap_topi_concatenate(topi_compute):
+    """Wrap TOPI compute which use qnn.concatenate attrs"""
+
+    def wrapper(attrs, inputs, out_type):
+        return [topi_compute(inputs, attrs.axis, out_type.dtype)]
+
+    return wrapper
+
+
+def wrap_topi_qnn_batch_matmul(topi_compute):
+    """Wrap TOPI compute which use qnn.batch_matmul attrs"""
+
+    def wrapper(attrs, inputs, _out_type):
+        assert len([*inputs]) == 6
+        args = [*inputs, attrs.transpose_a, attrs.transpose_b, attrs.out_dtype]
+        return [topi_compute(*args)]
+
+    return wrapper
+
+
+@override_native_generic_func("qnn_quantize_strategy")
+def qnn_quantize_strategy(attrs, inputs, out_type, target):
+    """qnn.quantize generic strategy"""
+    raise RuntimeError(
+        "qnn.quantize is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
+
+
+@override_native_generic_func("qnn_dequantize_strategy")
+def qnn_dequantize_strategy(attrs, inputs, out_type, target):
+    """qnn.dequantize generic strategy"""
+    raise RuntimeError(
+        "qnn.dequantize is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
+
+
+@override_native_generic_func("qnn_requantize_strategy")
+def qnn_requantize_strategy(attrs, inputs, out_type, target):
+    """qnn.requantize generic strategy"""
+    raise RuntimeError(
+        "qnn.requantize is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
+
+
+@override_native_generic_func("qnn_add_strategy")
+def qnn_add_strategy(attrs, inputs, out_type, target):
+    """qnn.add generic strategy"""
+    raise RuntimeError(
+        "qnn.add is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
+
+
+@override_native_generic_func("qnn_concatenate_strategy")
+def qnn_concatenate_strategy(attrs, inputs, out_type, target):
+    """qnn.concatenate generic strategy"""
+    raise RuntimeError(
+        "qnn.concatenate is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
+
+
+@override_native_generic_func("qnn_conv2d_strategy")
+def qnn_conv2d_strategy(attrs, inputs, out_type, target):
+    """qnn.conv2d generic strategy"""
+    raise RuntimeError(
+        "qnn.conv2d is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
+
+
+@override_native_generic_func("qnn_dense_strategy")
+def qnn_dense_strategy(attrs, inputs, out_type, target):
+    """qnn.dense generic strategy"""
+    raise RuntimeError(
+        "qnn.dense is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
+
+
+@override_native_generic_func("qnn_batch_matmul_strategy")
+def qnn_batch_matmul_strategy(attrs, inputs, out_type, target):
+    """qnn.batch_matmul generic strategy"""
+    raise RuntimeError(
+        "qnn.batch_matmul is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
diff --git a/python/tvm/relay/qnn/strategy/hexagon.py b/python/tvm/relay/qnn/strategy/hexagon.py
new file mode 100644
index 000000000000..c7f59cc096fc
--- /dev/null
+++ b/python/tvm/relay/qnn/strategy/hexagon.py
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Definition of Hexagon operator strategy."""
+# pylint: disable=unused-argument,wildcard-import,unused-wildcard-import
+
+from tvm import topi
+from .generic import *
+from ... import op as _op
+from ...op.strategy.generic import is_depthwise_conv2d
+
+
+@qnn_quantize_strategy.register("hexagon")
+def qnn_quantize_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.quantize strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_quantize(topi.hexagon.qnn_quantize),
+        wrap_topi_schedule(topi.hexagon.schedule_qnn_quantize),
+        name="qnn_quantize.hexagon",
+    )
+    return strategy
+
+
+@qnn_dequantize_strategy.register("hexagon")
+def qnn_dequantize_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.dequantize strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_dequantize(topi.hexagon.qnn_dequantize),
+        wrap_topi_schedule(topi.hexagon.schedule_qnn_dequantize),
+        name="qnn_dequantize.hexagon",
+    )
+    return strategy
+
+
+@qnn_requantize_strategy.register("hexagon")
+def qnn_requantize_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.requantize strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_quantize(topi.hexagon.qnn_requantize),
+        wrap_topi_schedule(topi.hexagon.schedule_qnn_requantize),
+        name="qnn_requantize.hexagon",
+    )
+    return strategy
+
+
+@qnn_add_strategy.register("hexagon")
+def qnn_add_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.add strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_topi_compute(topi.hexagon.qnn_add),
+        wrap_topi_schedule(topi.hexagon.schedule_qnn_add),
+        name="qnn_add.hexagon",
+    )
+    return strategy
+
+
+@qnn_concatenate_strategy.register("hexagon")
+def qnn_concatenate_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.concatenate strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_topi_concatenate(topi.hexagon.qnn_concatenate),
+        wrap_topi_schedule(topi.hexagon.schedule_qnn_concatenate),
+        name="qnn_concatenate.hexagon",
+    )
+    return strategy
+
+
+@qnn_conv2d_strategy.register("hexagon")
+def qnn_conv2d_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.conv2d strategy for Hexagon"""
+    data = inputs[0]
+    kernel = inputs[1]
+    data_layout = attrs.data_layout
+    kernel_layout = attrs.kernel_layout
+    groups = attrs.groups
+    strategy = _op.OpStrategy()
+    if groups == 1:
+        if data_layout == "NCHW" and kernel_layout == "OIHW":
+            strategy.add_implementation(
+                wrap_topi_qnn_conv2d(topi.hexagon.qnn_conv2d),
+                wrap_topi_schedule(topi.hexagon.schedule_qnn_conv2d),
+                name="qnn_conv2d.hexagon",
+            )
+    elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups):
+        if data_layout == "NCHW" and kernel_layout == "OIHW":
+            strategy.add_implementation(
+                wrap_topi_qnn_conv2d(topi.hexagon.qnn_depthwise_conv2d),
+                wrap_topi_schedule(topi.hexagon.schedule_qnn_depthwise_conv2d),
+                name="qnn_depthwise_conv2d.hexagon",
+            )
+    else:
+        raise RuntimeError("Unsupported strategy for group qnn.conv2d")
+
+    return strategy
+
+
+@qnn_dense_strategy.register("hexagon")
+def qnn_dense_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.dense strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_topi_qnn_dense(topi.hexagon.qnn_dense),
+        wrap_topi_schedule(topi.hexagon.schedule_qnn_dense),
+        name="qnn_dense.hexagon",
+    )
+    return strategy
+
+
+@qnn_batch_matmul_strategy.register("hexagon")
+def qnn_batch_matmul_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.batch_matmul strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_topi_qnn_batch_matmul(topi.hexagon.qnn_batch_matmul),
+        wrap_topi_schedule(topi.hexagon.schedule_qnn_batch_matmul),
+        name="qnn_batch_matmul.hexagon",
+    )
+    return strategy
diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py
index a52422f6c1d2..0907ea2ebf85 100644
--- a/python/tvm/te/__init__.py
+++ b/python/tvm/te/__init__.py
@@ -26,6 +26,7 @@
 from tvm.tir import isnan, isfinite, isinf
 from tvm.tir import div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod
 from tvm.tir import comm_reducer, min, max, sum
+from tvm.tir import add, subtract, multiply
 
 from .schedule import (
     Schedule,
diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 8e637d2d6564..2767f2d5f779 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -74,6 +74,7 @@
 from .op import comm_reducer, min, max, sum
 from .op import q_multiply_shift, shift_left, shift_right
 from .op import TVMBackendAllocWorkspace, TVMBackendFreeWorkspace
+from .generic import add, subtract, multiply
 
 from .schedule import StmtSRef, BlockScope, ScheduleState, Schedule, ScheduleError
 
diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
index 2616b9315a9b..bafc6846b6fb 100644
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -25,3 +25,4 @@
 )
 
 from .quantize import quantize_compute, tir_quantize_schedule
+from .nn import *
diff --git a/python/tvm/topi/hexagon/qnn/nn.py b/python/tvm/topi/hexagon/qnn/nn.py
new file mode 100644
index 000000000000..40cfd0ee96b1
--- /dev/null
+++ b/python/tvm/topi/hexagon/qnn/nn.py
@@ -0,0 +1,667 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Hexagon QNN operators"""
+# pylint: disable=invalid-name
+
+import tvm
+from tvm import te, topi
+from ...utils import get_const_tuple
+from ...nn.utils import get_pad_tuple
+from ...nn.pad import pad
+from ... import tag, nn
+from ...x86.concat import concatenate
+
+
+def clip_cast(val, dtype):
+    # clip + cast:
+    const_min = tvm.tir.min_value(dtype)
+    const_max = tvm.tir.max_value(dtype)
+    return te.max(tvm.te.min(val, const_max), const_min).astype(dtype)
+
+
+def get_qnn_param(param, indices, axis):
+    # Account scalar and 1D quantization parameters:
+    if len(param.shape) == 0:
+        return param
+
+    param_idx = tvm.tir.indexmod(indices[axis], topi.shape(param)[0])
+    return param[param_idx]
+
+
+def default_schedule(outs):
+    """Simple default schedule for QNN ops.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of dense in the format
+        of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs
+    s = tvm.te.create_schedule([x.op for x in outs])
+    tvm.te.schedule.AutoInlineInjective(s)
+    return s
+
+
+def qnn_quantize(data, output_scale, output_zero_point, axis, out_dtype):
+    """Compute for qnn.quantize
+
+    Q_output = clamp((round(input_tensor/output_scale) + output_zero_point),
+                     out_dtype::min,
+                     out_dtype::max)
+    """
+
+    assert len(output_scale.shape) == 0 or len(output_scale.shape) == 1
+    assert len(output_zero_point.shape) == 0 or len(output_zero_point.shape) == 1
+
+    def _compute(*indices):
+        value = data(*indices)
+        scale = get_qnn_param(output_scale, indices, axis)
+        zp = get_qnn_param(output_zero_point, indices, axis)
+
+        val = te.add(te.round(te.div(value, scale)), zp)
+        return clip_cast(val, out_dtype)
+
+    return te.compute(data.shape, _compute, tag=tag.ELEMWISE)
+
+
+def schedule_qnn_quantize(outs):
+    """Schedule for qnn.quantize
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.quantize
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
+
+
+def qnn_dequantize(data, input_scale, input_zero_point, axis):
+    """Compute for qnn.dequantize
+
+    fp_output = input_scale * (Q_input - input_zero_point)
+    """
+
+    def _compute(*indices):
+        value = data(*indices)
+        scale = get_qnn_param(input_scale, indices, axis)
+        zp = get_qnn_param(input_zero_point, indices, axis)
+
+        return te.multiply(scale, te.subtract(value, zp))
+
+    return te.compute(data.shape, _compute, tag=tag.ELEMWISE)
+
+
+def schedule_qnn_dequantize(outs):
+    """Schedule for qnn.dequantize
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.dequantize
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
+
+
+def qnn_requantize(data, input_scale, input_zp, output_scale, output_zp, axis, out_dtype):
+    """Compute for qnn.requantize
+
+    Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input))
+
+    TODO: support 'rounding' and 'compute_dtype' arguments.
+    """
+
+    def _compute(*indices):
+        value = data(*indices)
+
+        iscale = get_qnn_param(input_scale, indices, axis)
+        oscale = get_qnn_param(output_scale, indices, axis)
+
+        sub = te.subtract(value, input_zp)
+        mul = te.div(iscale, oscale)
+        val = te.add(te.round(te.multiply(mul, sub)), output_zp)
+
+        # clip + cast:
+        const_min = tvm.tir.min_value(out_dtype)
+        const_max = tvm.tir.max_value(out_dtype)
+        return te.max(tvm.te.min(val, const_max), const_min).astype(out_dtype)
+
+    return te.compute(data.shape, _compute)
+
+
+def schedule_qnn_requantize(outs):
+    """Schedule for qnn.requantize
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.requantize
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
+
+
+def qnn_add(
+    lhs, rhs, lhs_scale, lhs_zero_point, rhs_scale, rhs_zero_point, output_scale, output_zero_point
+):
+    """Compute for qnn.add
+
+    Q_output = zp_output + round((lhs_scale)/(scale_output) * (lhs_input - lhs_zp_input))
+                         + round((rhs_scale)/(scale_output) * (rhs_input - rhs_zp_input))
+
+    TODO: support 'axis' argument.
+    """
+
+    assert lhs.dtype == rhs.dtype
+    dtype = lhs.dtype
+
+    def _compute(*indices):
+        lvalue = lhs(*indices)
+        rvalue = rhs(*indices)
+        q_lv = te.round(
+            te.multiply(te.div(lhs_scale, output_scale), te.subtract(lvalue, lhs_zero_point))
+        ).astype("int32")
+        q_rv = te.round(
+            te.multiply(te.div(rhs_scale, output_scale), te.subtract(rvalue, rhs_zero_point))
+        ).astype("int32")
+        val = te.add(te.add(q_lv, q_rv), output_zero_point)
+
+        # clip + cast:
+        const_min = tvm.tir.min_value(dtype)
+        const_max = tvm.tir.max_value(dtype)
+        return te.max(tvm.te.min(val, const_max), const_min).astype(dtype)
+
+    return te.compute(lhs.shape, _compute)
+
+
+def schedule_qnn_add(outs):
+    """Schedule for qnn.add
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.add
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
+
+
+def requantize_tensor(tensor, i_scale, i_zp, o_scale, o_zp, out_dtype):
+    """Requantize tensor"""
+
+    def _compute(*indices):
+        value = tensor(*indices)
+        mul_value = te.round(
+            te.multiply(te.div(i_scale, o_scale), te.subtract(value, i_zp))
+        ).astype("int32")
+        rq_value = te.add(mul_value, o_zp)
+
+        return clip_cast(rq_value, out_dtype)
+
+    return te.compute(tensor.shape, _compute)
+
+
+def qnn_concatenate(data, axis, out_dtype):
+    """Compute for qnn.concatenate
+
+    Parameters
+    ----------
+    data: Array of Tensor
+          The computation graph description of qnn.concatenate
+          in the format of an array of tensors.
+
+    axis: int
+          The axis along which the tensors are concatenated.
+
+    out_dtype: string
+          Data type of output tensor
+
+    Returns
+    -------
+    out: Tensor
+        The computation for the op.
+    """
+
+    # Get output quantization parameters.
+    o_scale = data[-2]
+    o_zp = data[-1]
+
+    # Initially qnn.concatenate had 3 tuples: (1) tuple with input tensors, (2) tuple with input
+    # scales and (3) tuple with input zero points.
+    # Last 2 elements in data represent output scale and zero point.
+    num_of_tuples = 3
+    assert ((len(data) - 2) % num_of_tuples) == 0
+    args_num = (len(data) - 2) // num_of_tuples
+
+    args = []
+    for i in range(args_num):
+        # Get next tensor and its quantization parameters.
+        tensor = data[i]
+        i_scale = data[i + args_num]
+        i_zp = data[i + args_num * 2]
+
+        # Requantize tensors and add them to the list.
+        args.append(requantize_tensor(tensor, i_scale, i_zp, o_scale, o_zp, out_dtype))
+
+    # Call x86 implementation of concatenate.
+    return concatenate(args, axis)
+
+
+def schedule_qnn_concatenate(outs):
+    """Schedule for qnn.concatenate
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.add
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
+
+
+def qnn_conv2d(  # Conv2d inputs
+    data,
+    weight,
+    # Conv2d quantization params:
+    input_zero_point,
+    kernel_zero_point,
+    _input_scale,
+    _kernel_scale,
+    # bias
+    bias,
+    # Requantization params:
+    rq_input_scale,
+    rq_input_zero_point,
+    rq_output_scale,
+    rq_output_zero_point,
+    # Conv2d attributes:
+    strides,
+    padding,
+    dilation,
+    oshape,
+    odtype,
+):
+    """Compute for qnn.conv2d with NCHW layout.
+
+    Output data type should be specified through the 'odtype' parameter. qnn.conv2d leverages int32
+    type to store intermediate results. If 'odtype' differs from int32, you need to specify
+    requantization parameters.
+    """
+    in_channel = data.shape[1]  # NCHW layout
+    kernel_height = weight.shape[2]  # OIHW layout
+    kernel_width = weight.shape[3]  # OIHW layout
+
+    height_stride, width_stride = strides
+    dilation_h, dilation_w = dilation
+
+    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        get_const_tuple(padding), (dilated_kernel_h, dilated_kernel_w)
+    )
+
+    # Subtract zero point from input and then do padding with 0 value
+    data = te.compute(data.shape, lambda *indices: te.subtract(data(*indices), input_zero_point))
+
+    # DOPAD
+    if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0:
+        pad_before = (0, 0, pad_top, pad_left)
+        pad_after = (0, 0, pad_down, pad_right)
+        data_pad = pad(data, pad_before, pad_after, name="data_pad")
+    else:
+        data_pad = data
+
+    ic = te.reduce_axis((0, in_channel), name="ic")
+    kh = te.reduce_axis((0, kernel_height), name="kh")
+    kw = te.reduce_axis((0, kernel_width), name="kw")
+
+    # axis=0 in get_qnn_param means 'O' dimension in "OIHW" weights layout.
+    out = te.compute(
+        oshape,
+        lambda n, oc, oh, ow: te.sum(
+            data_pad[
+                n,
+                ic,
+                oh * height_stride + kh * dilation_h,
+                ow * width_stride + kw * dilation_w,
+            ].astype("int32")
+            * te.subtract(
+                weight[oc, ic, kh, kw], get_qnn_param(kernel_zero_point, (oc, ic, kh, kw), axis=0)
+            ).astype("int32"),
+            axis=[ic, kh, kw],
+        ),
+    )
+
+    # Add bias
+    if bias is not None:
+        assert len(out.shape) == len(bias.shape)
+        assert bias.shape[2] == 1 and bias.shape[3] == 1
+        out = te.compute(out.shape, lambda n, c, h, w: out[n, c, h, w] + bias[n, c, 0, 0])
+
+    # Requantize output of convolution
+    # Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input))
+    if rq_input_scale is not None and rq_output_scale is not None:
+        # Now supported only scalar and 1D quantization parameters
+        assert len(rq_input_scale.shape) == 0 or len(rq_input_scale.shape) == 1
+        assert len(rq_output_scale.shape) == 0 or len(rq_output_scale.shape) == 1
+        axis = -1
+        if len(rq_input_scale.shape) == 1 or len(rq_output_scale.shape) == 1:
+            axis = 1  # Axis param should correspond to 'C' dimension.
+
+        return qnn_requantize(
+            out,
+            rq_input_scale,
+            rq_input_zero_point,
+            rq_output_scale,
+            rq_output_zero_point,
+            axis,
+            odtype,
+        )
+
+    return out
+
+
+def schedule_qnn_conv2d(outs):
+    """Schedule for qnn.conv2d
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.conv2d
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
+
+
+def qnn_depthwise_conv2d(  # Conv2d inputs
+    data,
+    weight,
+    # Conv2d quantization params:
+    input_zero_point,
+    kernel_zero_point,
+    _input_scale,
+    _kernel_scale,
+    # bias
+    bias,
+    # Requantization params:
+    rq_input_scale,
+    rq_input_zero_point,
+    rq_output_scale,
+    rq_output_zero_point,
+    # Conv2d attributes:
+    strides,
+    padding,
+    dilation,
+    oshape,
+    odtype,
+):
+    """Compute for qnn.conv2d with NCHW layout
+
+    Output data type should be specified through the 'odtype' parameter. qdepthwise nn.conv2d
+    leverages int32 type to store intermediate results. If 'odtype' differs from int32, you need to
+    specify requantization parameters.
+    """
+    kernel_height = weight.shape[2]  # OIHW layout
+    kernel_width = weight.shape[3]  # OIHW layout
+
+    height_stride, width_stride = strides
+    dilation_h, dilation_w = dilation
+
+    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        get_const_tuple(padding), (dilated_kernel_h, dilated_kernel_w)
+    )
+
+    # Subtract zero point from input and then do padding with 0 value
+    data = te.compute(data.shape, lambda *indices: te.subtract(data(*indices), input_zero_point))
+
+    # DOPAD
+    if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0:
+        pad_before = (0, 0, pad_top, pad_left)
+        pad_after = (0, 0, pad_down, pad_right)
+        data_pad = pad(data, pad_before, pad_after, name="data_pad")
+    else:
+        data_pad = data
+
+    kh = te.reduce_axis((0, kernel_height), name="kh")
+    kw = te.reduce_axis((0, kernel_width), name="kw")
+
+    out = te.compute(
+        oshape,
+        lambda n, oc, oh, ow: te.sum(
+            data_pad[
+                n,
+                oc,
+                oh * height_stride + kh * dilation_h,
+                ow * width_stride + kw * dilation_w,
+            ].astype("int32")
+            * te.subtract(weight[oc, 0, kh, kw], kernel_zero_point).astype("int32"),
+            axis=[kh, kw],
+        ),
+    )
+
+    # Add bias
+    if bias is not None:
+        assert len(out.shape) == len(bias.shape)
+        assert bias.shape[2] == 1 and bias.shape[3] == 1
+        out = te.compute(out.shape, lambda n, c, h, w: out[n, c, h, w] + bias[n, c, 0, 0])
+
+    # Requantize output of convolution
+    # Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input))
+    if rq_input_scale is not None and rq_output_scale is not None:
+        # Now supported only scalar and 1D quantization parameters
+        assert len(rq_input_scale.shape) == 0 or len(rq_input_scale.shape) == 1
+        assert len(rq_output_scale.shape) == 0 or len(rq_output_scale.shape) == 1
+        axis = -1
+        if len(rq_input_scale.shape) == 1 or len(rq_output_scale.shape) == 1:
+            axis = 1  # Axis param should correspond to 'C' dimension.
+
+        return qnn_requantize(
+            out,
+            rq_input_scale,
+            rq_input_zero_point,
+            rq_output_scale,
+            rq_output_zero_point,
+            axis,
+            odtype,
+        )
+
+    return out
+
+
+def schedule_qnn_depthwise_conv2d(outs):
+    """Schedule for depthwise qnn.conv2d
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.conv2d
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
+
+
+def qnn_dense(
+    data,
+    weight,
+    # Dense quantization params:
+    input_zero_point,
+    kernel_zero_point,
+    _input_scale,
+    _kernel_scale,
+    # bias
+    bias,
+    # Requantization params:
+    rq_input_scale,
+    rq_input_zero_point,
+    rq_output_scale,
+    rq_output_zero_point,
+    out_dtype,
+):
+    """Compute for qnn.dense
+
+    Output data type should be specified through the 'odtype' parameter. qnn.dense leverages int32
+    type to store intermediate results. If 'odtype' differs from int32, you need to specify
+    requantization parameters.
+    """
+    M, K = get_const_tuple(data.shape)
+    N, _ = get_const_tuple(weight.shape)
+    k = te.reduce_axis((0, K), "k")
+    # This implementation uses "int32" dense output data type.
+    # axis=0 in get_qnn_param mean 'N' dimension in "NK" weights layout.
+    out = te.compute(
+        (M, N),
+        lambda m, n: te.sum(
+            te.subtract(data[m, k], input_zero_point).astype("int32")
+            * te.subtract(weight[n, k], get_qnn_param(kernel_zero_point, (n, k), axis=0)).astype(
+                "int32"
+            ),
+            axis=k,
+        ),
+    )
+
+    # Add bias
+    if bias is not None:
+        out = te.compute(out.shape, lambda n, c: out[n, c] + bias[c])
+
+    # Requantize output of dense
+    # Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input))
+    if rq_input_scale is not None and rq_output_scale is not None:
+        # Now supported only scalar and 1D quantization parameters
+        assert len(rq_input_scale.shape) == 0 or len(rq_input_scale.shape) == 1
+        assert len(rq_output_scale.shape) == 0 or len(rq_output_scale.shape) == 1
+        axis = -1
+        if len(rq_input_scale.shape) == 1 or len(rq_output_scale.shape) == 1:
+            axis = 1  # Axis param should correspond to 'N' dimension.
+
+        return qnn_requantize(
+            out,
+            rq_input_scale,
+            rq_input_zero_point,
+            rq_output_scale,
+            rq_output_zero_point,
+            axis,
+            out_dtype,
+        )
+
+    return out
+
+
+def schedule_qnn_dense(outs):
+    """Schedule for qnn.dense
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.dense
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
+
+
+def qnn_batch_matmul(
+    tensor_a,
+    tensor_b,
+    # batch_matmul quantization params:
+    a_zero_point,
+    b_zero_point,
+    _a_scale,
+    _b_scale,
+    # Attributes
+    transpose_a,
+    transpose_b,
+    out_dtype,
+):
+    """Compute for qnn.batch_matmul"""
+
+    # Preprocess tensor_a: subtract zp
+    a_sub_zp = te.compute(
+        tensor_a.shape, lambda *indices: te.subtract(tensor_a(*indices), a_zero_point)
+    )
+    # Preprocess tensor_b: subtract zp
+    b_sub_zp = te.compute(
+        tensor_b.shape, lambda *indices: te.subtract(tensor_b(*indices), b_zero_point)
+    )
+
+    return nn.batch_matmul(a_sub_zp, b_sub_zp, None, out_dtype, transpose_a, transpose_b)
+
+
+def schedule_qnn_batch_matmul(outs):
+    """Schedule for qnn.batch_matmul
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.batch_matmul
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index 9a0a2bef9a47..e7326ed5dd4d 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -123,6 +123,81 @@ Array<IndexExpr> GetShape(const Array<IndexExpr>& shape) {
   return res;
 }
 
+// Helper class that is used during lowering to TE.
+// It matches sequence of Ops and lower them into single TOPI operation. All supported patterns are
+// enumerated in "supported_patterns_".
+class QnnPatternMatcher {
+ public:
+  QnnPatternMatcher()
+      : qnn_conv2d_op_(Op::Get("qnn.conv2d")),
+        qnn_dense_op_(Op::Get("qnn.dense")),
+        qnn_requantize_op_(Op::Get("qnn.requantize")),
+        bias_add_op_(Op::Get("add")) {}
+
+  // Memoize visited operations
+  void Register(const CallNode* call_node) {
+    ICHECK(call_node->op.as<OpNode>());
+    Op op = Downcast<Op>(call_node->op);
+    if (op == qnn_conv2d_op_) {
+      registered_ops_.push_front(P_QConv2d);
+      ICHECK(anchor_op_ == nullptr);
+      anchor_op_ = call_node;
+    } else if (op == qnn_requantize_op_) {
+      registered_ops_.push_front(P_QRequantize);
+    } else if (op == bias_add_op_) {
+      registered_ops_.push_front(P_BiasAdd);
+    } else if (op == qnn_dense_op_) {
+      registered_ops_.push_front(P_QDense);
+      ICHECK(anchor_op_ == nullptr);
+      anchor_op_ = call_node;
+    } else {
+      registered_ops_.push_front(P_Opaque);
+    }
+  }
+
+  // Check whether given Op is a part of matched pattern.
+  bool find(const Op& op) {
+    if (registered_ops_.empty()) return false;
+
+    if (op == qnn_conv2d_op_ || op == qnn_requantize_op_ || op == bias_add_op_ ||
+        op == qnn_dense_op_) {
+      for (const auto& pat : supported_patterns_) {
+        auto it =
+            std::search(registered_ops_.begin(), registered_ops_.end(), pat.begin(), pat.end());
+        if (it != registered_ops_.end()) return true;
+      }
+    }
+    return false;
+  }
+
+  // returns whether given Op is last in the pattern sequence.
+  bool IsLeafOp(const Op& op) { return op == qnn_requantize_op_; }
+
+  const CallNode* GetAnchorOp() { return anchor_op_; }
+
+  void Clear() { registered_ops_.clear(); }
+
+ private:
+  const Op& qnn_conv2d_op_;
+  const Op& qnn_dense_op_;
+  const Op& qnn_requantize_op_;
+  const Op& bias_add_op_;
+
+  // Main (complicated) operation in the primitive (for example qnn.conv2d, qnn.dense etc.).
+  const CallNode* anchor_op_ = nullptr;
+
+  enum POper { P_QConv2d, P_QDense, P_BiasAdd, P_QRequantize, P_Opaque };
+
+  std::deque<POper> registered_ops_;
+
+  const std::vector<std::deque<POper>> supported_patterns_ = {
+      {P_QDense, P_BiasAdd, P_QRequantize},   // Pattern qnn.dense -> bias_add -> qnn.requantize
+      {P_QDense, P_QRequantize},              // Patter qnn.dense -> qnn.requantize
+      {P_QConv2d, P_BiasAdd, P_QRequantize},  // Pattern qnn.conv2d -> bias_add -> qnn.requantize
+      {P_QConv2d, P_QRequantize}              // Patter qnn.conv2d -> qnn.requantize
+  };
+};
+
 // Lowers Relay primitive Function to TE Compute
 class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor>> {
  public:
@@ -213,6 +288,8 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
     static auto flower_call = tvm::runtime::Registry::Get("relay.backend.lower_call");
     ICHECK(flower_call) << "relay.backend.lower_call is not registered.";
 
+    pattern_matcher_.Register(call_node);
+
     Array<te::Tensor> inputs;
     int count_tuple = 0;
     for (Expr arg : call_node->args) {
@@ -224,21 +301,35 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
       }
     }
 
-    if (count_tuple) {
-      ICHECK_EQ(call_node->args.size(), 1U)
-          << "Only functions with a single tuple input are allowed, but " << count_tuple
-          << " were provided.";
-    }
-
     ICHECK(call_node->op.as<OpNode>()) << "Primitive function only allows call into primitive ops";
     Op op = Downcast<Op>(call_node->op);
 
     // TODO(mbs): device_copy cleanup
     ICHECK_NE(op, device_copy_op_) << "device_copy cannot be lowered";
 
-    LoweredOutput lowered_out = (*flower_call)(GetRef<Call>(call_node), inputs, target_);
-    Array<te::Tensor> outputs = lowered_out->outputs;
-    op_implementations_[op.operator->()] = lowered_out->implementation;
+    Array<te::Tensor> outputs;
+
+    if (pattern_matcher_.find(op)) {
+      if (pattern_matcher_.IsLeafOp(op)) {
+        // Lower anchor op when pattern leaf op was reached
+        auto anchor_op = pattern_matcher_.GetAnchorOp();
+        LoweredOutput lowered_out =
+            (*flower_call)(GetRef<Call>(anchor_op), inputs, target_, call_node->checked_type());
+        outputs = lowered_out->outputs;
+        Op a_op = Downcast<Op>(anchor_op->op);
+        op_implementations_[a_op.operator->()] = lowered_out->implementation;
+
+        pattern_matcher_.Clear();
+      } else {
+        // Forward inputs as "outputs" for successor.
+        readable_name_stream_ << '_' << op->name;
+        return inputs;
+      }
+    } else {
+      LoweredOutput lowered_out = (*flower_call)(GetRef<Call>(call_node), inputs, target_);
+      outputs = lowered_out->outputs;
+      op_implementations_[op.operator->()] = lowered_out->implementation;
+    }
 
     if (outputs.size() != 1) {
       const auto* tuple_type = call_node->checked_type().as<TupleTypeNode>();
@@ -294,6 +385,8 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
   std::string candidate_name_;
 
  private:
+  QnnPatternMatcher pattern_matcher_;
+
   tvm::Target target_;
   std::ostringstream readable_name_stream_;
   // Index of the global constants
diff --git a/src/relay/qnn/pass/legalize.cc b/src/relay/qnn/pass/legalize.cc
index 33b9e59ab241..a5906cf5e694 100644
--- a/src/relay/qnn/pass/legalize.cc
+++ b/src/relay/qnn/pass/legalize.cc
@@ -34,7 +34,7 @@ Pass Legalize() {
   Array<Pass> pass_seqs;
   pass_seqs.push_back(relay::transform::Legalize("FTVMQnnLegalize"));
   pass_seqs.push_back(relay::transform::Legalize("FTVMQnnCanonicalize"));
-  relay::transform::Pass seq = relay::transform::Sequential(pass_seqs);
+  relay::transform::Pass seq = relay::transform::Sequential(pass_seqs, "qnn.Legalize");
   return seq;
 }
 
diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc
index dac5dc69ead5..afa60f1bb4e5 100644
--- a/src/relay/transforms/fuse_ops.cc
+++ b/src/relay/transforms/fuse_ops.cc
@@ -885,8 +885,10 @@ class FuseMutator : private MixedModeMutator {
   Expr Rewrite_(const CallNode* call, const Expr& post) {
     if (call->op.as<OpNode>()) {
       static auto fnoncomputational = Op::GetAttrMap<TNonComputational>("TNonComputational");
+      static auto fqnncanonicalize = Op::GetAttrMap<FTVMLegalize>("FTVMQnnCanonicalize");
 
-      if (fnoncomputational.get(Downcast<Op>(call->op), false)) {
+      Op op = Downcast<Op>(call->op);
+      if (fnoncomputational.get(op, false) && !fqnncanonicalize.count(op)) {
         return ExprMutator::VisitExpr_(call);
       }
 
diff --git a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
new file mode 100644
index 000000000000..24da1faac697
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import numpy as np
+
+import tvm.testing
+from tvm import relay
+from tvm.contrib.hexagon.session import Session
+from tvm.contrib import graph_executor
+from tvm.relay.backend import Executor
+
+
+@tvm.testing.requires_hexagon
+def test_no_qnn_pass():
+    x = relay.var("x", shape=(4, 8), dtype="float32")
+    op0 = relay.qnn.op.quantize(x, relay.const(2.0), relay.const(10), out_dtype="uint8")
+    op1 = relay.qnn.op.dequantize(op0, relay.const(0.5), relay.const(5))
+    mod = tvm.IRModule.from_expr(op1)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    # Default compilation flow
+    with tvm.transform.PassContext(opt_level=3):
+        opt_mod_1, _ = relay.optimize(mod, tvm.target.Target(target_hexagon, host=target_hexagon))
+
+    # Disable QNN legalization and canonicalization passes
+    with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
+        opt_mod_2, _ = relay.optimize(mod, tvm.target.Target(target_hexagon, host=target_hexagon))
+
+    # Check that QNN ops are absent with default compilation flow.
+    assert "qnn.quantize" not in opt_mod_1.astext(show_meta_data=False)
+    assert "qnn.dequantize" not in opt_mod_1.astext(show_meta_data=False)
+
+    # Check that QNN ops are present without "qnn.Legalize" passes.
+    assert "qnn.quantize" in opt_mod_2.astext(show_meta_data=False)
+    assert "qnn.dequantize" in opt_mod_2.astext(show_meta_data=False)
+
+
+def execute(executor, data_np, weight_np, bias_np=None):
+    executor.set_input("data", data_np)
+    executor.set_input("weight", weight_np)
+    if bias_np is not None:
+        executor.set_input("bias", bias_np)
+    executor.run()
+    return executor.get_output(0)
+
+
+@tvm.testing.requires_hexagon
+def test_qnn_conv2d_rq(hexagon_session: Session):
+    data_shape = [1, 8, 32, 32]
+    weight_shape = [16, 8, 3, 3]
+    data = relay.var("data", shape=data_shape, dtype="float32")
+    weight = relay.var("weight", shape=weight_shape, dtype="float32")
+    op0 = relay.qnn.op.quantize(data, relay.const(0.078), relay.const(0), out_dtype="int8")
+    op1 = relay.qnn.op.quantize(weight, relay.const(0.07), relay.const(0), out_dtype="int8")
+    op2 = relay.qnn.op.conv2d(
+        op0,
+        op1,
+        input_zero_point=relay.const(0),
+        kernel_zero_point=relay.const(0),
+        input_scale=relay.const(0.078),
+        kernel_scale=relay.const(0.07),
+        padding=[0, 0, 0, 0],
+        channels=16,
+        kernel_size=[3, 3],
+    )
+    op5 = relay.qnn.op.requantize(
+        op2,
+        input_scale=relay.const(0.05),
+        input_zero_point=relay.const(0),
+        output_scale=relay.const(0.21),
+        output_zero_point=relay.const(61),
+        out_dtype="int8",
+    )
+    relay_mod = tvm.IRModule.from_expr(op5)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    target_llvm = tvm.target.Target("llvm")
+    executor = Executor("graph", {"link-params": True})
+    with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
+        hexagon_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            executor=executor,
+        )
+
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            executor=executor,
+        )
+
+    data_np = np.random.rand(*data_shape) - 0.5
+    weight_np = np.random.rand(*weight_shape) - 0.5
+
+    hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered)
+    hexagon_output = execute(hx_m, data_np, weight_np)
+
+    dev = tvm.cpu(0)
+    llvm_m = graph_executor.GraphModule(llvm_lowered["default"](dev))
+    llvm_out = execute(llvm_m, data_np, weight_np)
+
+    np.testing.assert_equal(hexagon_output.numpy(), llvm_out.numpy())
+
+
+@tvm.testing.requires_hexagon
+def test_qnn_dense_bias_rq(hexagon_session: Session):
+    data_shape = [8, 8]
+    weight_shape = [16, 8]
+    bias_shape = [16]
+    data = relay.var("data", shape=data_shape, dtype="float32")
+    weight = relay.var("weight", shape=weight_shape, dtype="float32")
+    bias = relay.var("bias", shape=bias_shape, dtype="float32")
+
+    op0 = relay.qnn.op.quantize(data, relay.const(0.08), relay.const(0), out_dtype="int8")
+    op1 = relay.qnn.op.quantize(weight, relay.const(0.07), relay.const(0), out_dtype="int8")
+    op2 = relay.qnn.op.dense(
+        op0,
+        op1,
+        input_zero_point=relay.const(0),
+        kernel_zero_point=relay.const(0),
+        input_scale=relay.const(0.08),
+        kernel_scale=relay.const(0.07),
+        units=None,
+    )
+    op3 = relay.qnn.op.quantize(bias, relay.const(0.5), relay.const(0), out_dtype="int32")
+    op4 = relay.nn.bias_add(op2, op3)
+    op5 = relay.qnn.op.requantize(
+        op4,
+        input_scale=relay.const(0.05),
+        input_zero_point=relay.const(0),
+        output_scale=relay.const(0.212),
+        output_zero_point=relay.const(10),
+        out_dtype="int8",
+    )
+    relay_mod = tvm.IRModule.from_expr(op5)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    target_llvm = tvm.target.Target("llvm")
+    executor = Executor("graph", {"link-params": True})
+    with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
+        hexagon_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            executor=executor,
+        )
+
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            executor=executor,
+        )
+
+    data_np = np.random.rand(*data_shape) - 0.5
+    weight_np = np.random.rand(*weight_shape) - 0.5
+    bias_np = np.random.rand(*bias_shape)
+
+    hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered)
+    hexagon_output = execute(hx_m, data_np, weight_np, bias_np)
+
+    dev = tvm.cpu(0)
+    llvm_m = graph_executor.GraphModule(llvm_lowered["default"](dev))
+    llvm_out = execute(llvm_m, data_np, weight_np, bias_np)
+
+    np.testing.assert_equal(hexagon_output.numpy(), llvm_out.numpy())
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 3d22dbffd094181fa4cb71d776f6ed3f518cd1e2 Mon Sep 17 00:00:00 2001
From: Jiawei Liu <jaway.liu@gmail.com>
Date: Tue, 18 Oct 2022 14:26:46 -0500
Subject: [PATCH 383/704] [Relay] fix: add compute tag for trilu (#13120)

fix: add compute tag for trilu
---
 python/tvm/topi/transform.py         |  2 +-
 tests/python/relay/test_op_level3.py | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py
index 44263e131182..0347473f83b7 100644
--- a/python/tvm/topi/transform.py
+++ b/python/tvm/topi/transform.py
@@ -1058,4 +1058,4 @@ def _apply_trilu(*indices):
         value = data(*other_indices, row_index, col_index)
         return tvm.tir.Select(check_position, value, tvm.tir.const(0, data.dtype))
 
-    return te.compute(data.shape, _apply_trilu, name="trilu")
+    return te.compute(data.shape, _apply_trilu, name="trilu", tag=topi.tag.ELEMWISE)
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 400f7dcf0b42..9becfc12671d 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -2264,5 +2264,24 @@ def verify_trilu(data_shape, upper=True, k=0):
     verify_trilu((8, 6, 6), False, -2)
 
 
+def test_trilu_reduce():
+    data_i0 = np.ones((2, 2), dtype="int32")
+    k = 0
+
+    i0 = relay.var("i0", shape=[2, 2], dtype="int32")
+    i1 = relay.var("i1", shape=(), dtype="int64")
+    v0 = relay.trilu(i0, i1)
+    v1 = relay.argmin(v0, axis=[0])
+    f = relay.Function([i0, i1], v1)
+    tvm_res = (
+        relay.create_executor("graph", device=tvm.cpu(), target="llvm")
+        .evaluate(f)(data_i0, k)
+        .numpy()
+    )
+
+    np_res = np.triu(data_i0, k).argmin(axis=0)
+    tvm.testing.assert_allclose(tvm_res, np_res)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From e3b722b70d5b5ff9564c8da784a9306ce61cf698 Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Tue, 18 Oct 2022 18:19:29 -0400
Subject: [PATCH 384/704] [Hexagon] [runtime] Use malloc/free for RPC buffers
 (#13125)

Use malloc/free for RPC buffers
---
 src/runtime/hexagon/hexagon_device_api.cc       | 11 -----------
 src/runtime/hexagon/hexagon_device_api.h        | 17 +++--------------
 src/runtime/hexagon/rpc/hexagon/rpc_server.cc   |  6 +++---
 .../hexagon/hexagon_device_api_tests.cc         | 13 -------------
 4 files changed, 6 insertions(+), 41 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 6d223017e270..5f7867590743 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -143,17 +143,6 @@ void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
   }
 }
 
-void* HexagonDeviceAPI::AllocRpcBuffer(size_t nbytes, size_t alignment) {
-  CHECK(nbytes) << "number of bytes is zero";
-  CHECK(alignment) << "alignment is zero";
-  return rpc_hexbuffs.AllocateHexagonBuffer(nbytes, alignment, String("global"));
-}
-
-void HexagonDeviceAPI::FreeRpcBuffer(void* ptr) {
-  CHECK(ptr) << "buffer pointer is null";
-  rpc_hexbuffs.FreeHexagonBuffer(ptr);
-}
-
 // WorkSpace: runtime allocations for Hexagon
 struct HexagonWorkspacePool : public WorkspacePool {
   HexagonWorkspacePool()
diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index 8d2795e7a04e..7944878dd5dc 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -106,12 +106,6 @@ class HexagonDeviceAPI final : public DeviceAPI {
   //! \brief Free the allocated HexagonBuffer.
   void FreeDataSpace(Device dev, void* ptr) final;
 
-  //! \brief Hexagon-only interface to allocate buffers used for the RPC server
-  void* AllocRpcBuffer(size_t nbytes, size_t alignment);
-
-  //! \brief Hexagon-only interface to free buffers used for the RPC server
-  void FreeRpcBuffer(void* ptr);
-
   /*! \brief Request a dynamically allocated HexagonBuffer from a workspace pool.
    *  \returns The underlying allocation pointer.
    */
@@ -196,15 +190,10 @@ class HexagonDeviceAPI final : public DeviceAPI {
            (DLDeviceType(dev.device_type) == kDLCPU);
   }
 
-  //! \brief Manages RPC HexagonBuffer allocations
-  // rpc_hexbuffs is used only in Alloc/FreeRpcBuffer.  It is static because it lives for the
-  // lifetime of the static Device API.
-  HexagonBufferManager rpc_hexbuffs;
-
   //! \brief Manages runtime HexagonBuffer allocations
-  // runtime_hexbuffs is used for runtime allocations, separate from rpc_hexbuffs.  It is created
-  // with a call to AcquireResources, and destroyed on ReleaseResources.  The buffers in this
-  // manager are scoped to the lifetime of a user application session.
+  // runtime_hexbuffs is used for runtime allocations.  It is created with a call to
+  // AcquireResources, and destroyed on ReleaseResources.  The buffers in this manager are scoped
+  // to the lifetime of a user application session.
   std::unique_ptr<HexagonBufferManager> runtime_hexbuffs;
 
   //! \brief Keeps a list of released runtime HexagonBuffer allocations
diff --git a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
index 29c3a1bdfe6d..b4799d5d7127 100644
--- a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
@@ -23,10 +23,10 @@ extern "C" {
 #include <HAP_farf.h>
 #include <HAP_perf.h>
 #include <qurt_error.h>
-#include <qurt_hvx.h>
 }
 
 #include <dlfcn.h>
+#include <stdlib.h>
 #include <tvm/runtime/object.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
@@ -158,7 +158,7 @@ class HexagonPageAllocator {
     size_t npages = ((min_size + kPageSize - 1) / kPageSize);
     void* data;
 
-    data = HexagonDeviceAPI::Global()->AllocRpcBuffer(npages * kPageSize, kPageAlign);
+    data = malloc(npages * kPageSize);
 
     ArenaPageHeader* header = static_cast<ArenaPageHeader*>(data);
     header->size = npages * kPageSize;
@@ -166,7 +166,7 @@ class HexagonPageAllocator {
     return header;
   }
 
-  void deallocate(ArenaPageHeader* page) { HexagonDeviceAPI::Global()->FreeRpcBuffer(page); }
+  void deallocate(ArenaPageHeader* page) { free(page); }
 
   static const constexpr int kPageSize = 2 << 10;
   static const constexpr int kPageAlign = 8;
diff --git a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
index e262a16ada5c..0d193042a950 100644
--- a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
@@ -161,19 +161,6 @@ TEST_F(HexagonDeviceAPITest, runtime_buffer_manager) {
   EXPECT_THROW(hexapi->FreeDataSpace(hex_dev, runtime_buf), InternalError);
 }
 
-// Ensure RPC buffer manager is always available
-TEST_F(HexagonDeviceAPITest, rpc_buffer_manager) {
-  void* rpc_buf;
-  rpc_buf = hexapi->AllocRpcBuffer(nbytes, alignment);
-  CHECK(rpc_buf != nullptr);
-  hexapi->ReleaseResources();
-  hexapi->FreeRpcBuffer(rpc_buf);
-  rpc_buf = hexapi->AllocRpcBuffer(nbytes, alignment);
-  CHECK(rpc_buf != nullptr);
-  hexapi->AcquireResources();
-  hexapi->FreeRpcBuffer(rpc_buf);
-}
-
 // Ensure thread manager is properly configured and destroyed
 // in Acquire/Release
 TEST_F(HexagonDeviceAPITest, thread_manager) {

From 9c9f32536a767d20bd3fbf1beab0aeb909392bde Mon Sep 17 00:00:00 2001
From: Oleksandr Viazlo <oleksandr.viazlo@axelera.ai>
Date: Wed, 19 Oct 2022 01:36:31 +0200
Subject: [PATCH 385/704] Update Pytorch to version 1.12.0 and TorchVision to
 0.13.0 (#13126)

update Pytorch to version 1.12.0 and TorchVision to 0.13.0
---
 docker/install/ubuntu_install_onnx.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh
index d775875bc7c5..d1036b790664 100755
--- a/docker/install/ubuntu_install_onnx.sh
+++ b/docker/install/ubuntu_install_onnx.sh
@@ -36,6 +36,6 @@ pip3 install \
 pip3 install future
 
 pip3 install \
-    torch==1.11.0 \
-    torchvision==0.12.0 \
+    torch==1.12.0 \
+    torchvision==0.13.0 \
     --extra-index-url https://download.pytorch.org/whl/cpu

From 0b4836739c5fc1fafe16b4d4c27ccf24d2891dbe Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Tue, 18 Oct 2022 20:15:52 -0400
Subject: [PATCH 386/704] Skip stride check if shape is 1 in IsContiguous
 (#13121)

Skip checking stride if shape is 1 in IsContiguous

Skip stride check if shape[k] is 1, where the dimension is contiguous
regardless of the value of stride.

For example, PyTorch will normalize stride to 1 if shape is 1.
---
 include/tvm/runtime/ndarray.h |  9 +++++
 tests/cpp/ndarray_test.cc     | 73 +++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 tests/cpp/ndarray_test.cc

diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index d530ef587782..b6a4cfe453c1 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -365,6 +365,15 @@ static inline bool IsContiguous(const DLTensor& arr) {
   int64_t expected_stride = 1;
   for (int32_t i = arr.ndim; i != 0; --i) {
     int32_t k = i - 1;
+    if (arr.shape[k] == 1) {
+      // Skip stride check if shape[k] is 1, where the dimension is contiguous
+      // regardless of the value of stride.
+      //
+      // For example, PyTorch will normalize stride to 1 if shape is 1 when exporting
+      // to DLPack.
+      // More context: https://github.com/pytorch/pytorch/pull/83158
+      continue;
+    }
     if (arr.strides[k] != expected_stride) return false;
     expected_stride *= arr.shape[k];
   }
diff --git a/tests/cpp/ndarray_test.cc b/tests/cpp/ndarray_test.cc
new file mode 100644
index 000000000000..cd5c75410aae
--- /dev/null
+++ b/tests/cpp/ndarray_test.cc
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <dmlc/logging.h>
+#include <gtest/gtest.h>
+#include <tvm/runtime/ndarray.h>
+
+using namespace tvm;
+
+TEST(NDArrayTest, IsContiguous_ContiguousStride) {
+  auto array = runtime::NDArray::Empty({5, 10}, DataType::Float(32), {kDLCPU});
+  DLManagedTensor* managed_tensor = array.ToDLPack();
+
+  int64_t strides[] = {10, 1};
+  managed_tensor->dl_tensor.strides = strides;
+
+  ICHECK(runtime::IsContiguous(managed_tensor->dl_tensor));
+
+  managed_tensor->deleter(managed_tensor);
+}
+
+TEST(NDArrayTest, IsContiguous_NullStride) {
+  auto array = runtime::NDArray::Empty({5, 10}, DataType::Float(32), {kDLCPU});
+  DLManagedTensor* managed_tensor = array.ToDLPack();
+
+  managed_tensor->dl_tensor.strides = nullptr;
+
+  ICHECK(runtime::IsContiguous(managed_tensor->dl_tensor));
+
+  managed_tensor->deleter(managed_tensor);
+}
+
+TEST(NDArrayTest, IsContiguous_AnyStrideForSingular) {
+  auto array = runtime::NDArray::Empty({5, 1, 10}, DataType::Float(32), {kDLCPU});
+  DLManagedTensor* managed_tensor = array.ToDLPack();
+
+  int64_t strides[] = {10, 1, 1};  // strides[1] is normalized to 1 because shape[1] == 1.
+  managed_tensor->dl_tensor.strides = strides;
+
+  ICHECK(runtime::IsContiguous(managed_tensor->dl_tensor));
+
+  managed_tensor->dl_tensor.strides = nullptr;
+  managed_tensor->deleter(managed_tensor);
+}
+
+TEST(NDArrayTest, IsContiguous_UncontiguousStride) {
+  auto array = runtime::NDArray::Empty({5, 1, 10}, DataType::Float(32), {kDLCPU});
+  DLManagedTensor* managed_tensor = array.ToDLPack();
+
+  int64_t strides[] = {1, 1, 1};
+  managed_tensor->dl_tensor.strides = strides;
+
+  ICHECK(!runtime::IsContiguous(managed_tensor->dl_tensor));
+
+  managed_tensor->dl_tensor.strides = nullptr;
+  managed_tensor->deleter(managed_tensor);
+}

From a4840e7de38c5a2000917f2101f3ec4a374bcd39 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 19 Oct 2022 01:36:35 -0500
Subject: [PATCH 387/704] [ci] Lint for trailing newlines and spaces  (#13058)

This adds a short lint to ensure that all files have a single trailing newline and no trailing whitespaces. This PR is in two commits, one to add the check and another to fix currently offending files in the repo. See https://github.com/apache/tvm/pull/13058/commits/ba2c2e235e2a16d62fdeed959044c65012d9f942 for just the significant changes. Auto-corrections applied with

```
pre-commit run --all-files
```
---
 .gitattributes                                |   1 -
 .github/ISSUE_TEMPLATE/documentation.md       |   1 -
 .github/actions/setup/action.yml              |   1 -
 .github/dependabot.yml                        |   2 +-
 .github/workflows/main.yml                    |   4 +-
 3rdparty/libcrc/tab/gentab_ccitt.inc          |   1 -
 NEWS.md                                       |   2 -
 .../app/src/main/jni/Android.mk               |   2 +-
 .../app/src/main/res/layout/activity_main.xml |   2 +-
 apps/android_camera/models/requirements.txt   |   2 +-
 apps/android_deploy/app/build.gradle          |   4 +-
 .../android_deploy/app/download-models.gradle |   1 -
 apps/android_deploy/dev_tools/gen_keystore.sh |   4 +-
 apps/android_deploy/dev_tools/sign_apk.sh     |   4 +-
 apps/android_deploy/gradle.properties         |   2 +-
 apps/android_rpc/dev_tools/gen_keystore.sh    |   4 +-
 apps/android_rpc/dev_tools/sign_apk.sh        |   4 +-
 apps/bundle_deploy/README.md                  |   6 +-
 apps/cpp_rpc/CMakeLists.txt                   |   2 +-
 .../cmake/hexagon/CMakeLists.txt              |   1 -
 apps/ios_rpc/.gitignore                       |   1 -
 apps/ios_rpc/README.md                        |   4 +-
 .../AppIcon.appiconset/Contents.json          |   2 +-
 apps/microtvm/README.md                       |   2 -
 apps/microtvm/cmsisnn/README.md               |   2 +-
 apps/microtvm/cmsisnn/corstone300.ld          |   2 +-
 apps/microtvm/reference-vm/README.md          |   4 +-
 .../base-box/base_box_provision.sh            |   4 +-
 apps/microtvm/zephyr/README.md                |   1 -
 .../src/host_driven/fvp/semihost.c            |   2 +-
 apps/microtvm/zephyr_cmsisnn/CMakeLists.txt   |  18 +-
 apps/microtvm/zephyr_cmsisnn/README.md        |   2 +-
 apps/microtvm/zephyr_cmsisnn/model/labels.txt |   2 +-
 apps/microtvm/zephyr_cmsisnn/run_demo.sh      |   2 +-
 .../prepare_and_test_pt_tvm_class.sh          |   5 +-
 .../prepare_and_test_tfop_module.sh           |   5 +-
 apps/vta_rpc/start_rpc_server.sh              |   4 +-
 apps/wasm-standalone/README.md                |   4 +-
 ci/jenkins/.gitignore                         |   2 +-
 ci/scripts/git_skip_ci_globs.py               |   2 +
 cmake/modules/Git.cmake                       |   2 +-
 cmake/modules/contrib/BNNS.cmake              |   1 -
 cmake/modules/contrib/CODEGENC.cmake          |   1 -
 cmake/modules/contrib/DNNL.cmake              |   1 -
 cmake/modules/contrib/Posit.cmake             |   2 +-
 cmake/modules/contrib/TF_TVMDSOOP.cmake       |   9 +-
 cmake/modules/contrib/Verilator.cmake         |   1 -
 cmake/utils/FindEthosN.cmake                  |   2 +-
 cmake/utils/FindVulkan.cmake                  |   2 +-
 conda/condarc                                 |   8 +-
 docker/Dockerfile.ci_minimal                  |   2 +-
 docker/README.md                              |   6 +-
 docker/bash.sh                                |   2 +-
 .../install/ubuntu1804_manual_install_llvm.sh |   1 -
 docker/install/ubuntu_install_androidsdk.sh   |   5 +-
 docker/install/ubuntu_install_caffe2.sh       |   4 +-
 docker/install/ubuntu_install_cmsis.sh        |   1 -
 docker/install/ubuntu_install_coreml.sh       |   4 +-
 docker/install/ubuntu_install_dnnl.sh         |   4 +-
 docker/install/ubuntu_install_gluoncv.sh      |   4 +-
 docker/install/ubuntu_install_golang.sh       |   4 +-
 docker/install/ubuntu_install_gradle.sh       |   4 +-
 docker/install/ubuntu_install_hexagon.sh      |   4 +-
 docker/install/ubuntu_install_java.sh         |   4 +-
 docker/install/ubuntu_install_nodejs.sh       |   2 +-
 docker/install/ubuntu_install_opencl.sh       |   4 +-
 docker/install/ubuntu_install_rocm.sh         |   4 +-
 docker/install/ubuntu_install_universal.sh    |   2 +-
 .../install/ubuntu_install_vitis_ai_core.sh   |   4 +-
 .../ubuntu_install_vitis_ai_packages_ci.sh    |   4 +-
 docker/utils/apt-install-and-clear.sh         |   1 -
 docs/_static/img/README                       |   2 +-
 docs/arch/hybrid_script.rst                   |   2 +-
 docs/contribute/release_process.rst           |  10 +-
 docs/dev/how_to/relay_add_op.rst              |  56 +++----
 docs/errors.rst                               |   1 -
 docs/how_to/deploy/arm_compute_lib.rst        |   2 +-
 docs/how_to/deploy/index.rst                  |  14 +-
 docs/how_to/deploy/vitis_ai.rst               | 154 +++++++++---------
 docs/install/from_source.rst                  |   2 +-
 docs/reference/api/python/auto_scheduler.rst  |   1 -
 docs/reference/api/python/topi.rst            |   2 -
 docs/reference/langref/relay_pattern.rst      |   2 +-
 docs/topic/vta/.gitignore                     |   2 +-
 docs/topic/vta/dev/config.rst                 |   1 -
 docs/topic/vta/dev/hardware.rst               |   2 -
 docs/topic/vta/dev/index.rst                  |   2 +-
 docs/topic/vta/install.rst                    |   1 -
 .../tune_network_cuda.py                      |   2 +-
 .../tune_network_x86.py                       |   2 +-
 .../how_to/work_with_microtvm/micro_aot.py    |   4 +-
 .../work_with_pytorch/using_as_torch.py       |   2 +-
 .../using_optimized_torch.py                  |   2 +-
 gallery/tutorial/tvmc_python.py               |   2 +-
 jvm/conf/log4j.properties                     |   4 +-
 .../src/main/java/org/apache/tvm/Base.java    |   1 -
 .../tvm/rpc/ConnectProxyServerProcessor.java  |   4 +-
 .../test/java/org/apache/tvm/TestUtils.java   |   4 +-
 licenses/LICENSE.builtin_fp16.txt             |   2 +-
 licenses/LICENSE.cma.txt                      |   2 +-
 licenses/LICENSE.concurrentqueue.txt          |   2 +-
 licenses/LICENSE.libbacktrace.txt             |   6 +-
 licenses/LICENSE.picojson.txt                 |   2 +-
 mypy.ini                                      |   1 -
 .../meta_schedule/testing/torchbench/run.py   |   4 +-
 python/tvm/micro/contrib/stm32/emitter.py     |   8 +-
 rust/.rustfmt.toml                            |   1 -
 src/relay/collage/README.md                   |   2 +-
 src/runtime/crt/host/Makefile                 |   2 +-
 src/runtime/hexagon/README.md                 |   1 -
 tests/crt/contrib/stm32/Makefile              |   2 +-
 tests/lint/docker-format.sh                   |   2 +-
 tests/lint/pylint.sh                          |   1 -
 tests/lint/trailing_newlines.py               |  51 ++++++
 tests/lint/whitespace.sh                      |  39 +++++
 tests/python/ci/sample_prs/pr10786.json       |   2 +-
 tests/python/contrib/test_ethosu/infra.py     |   2 +-
 .../contrib/test_ethosu/test_create_tiles.py  |  14 +-
 .../test_ethosu/test_layout_optimizer.py      |   2 +-
 .../contrib/test_ethosu/test_lut_optimizer.py |   2 +-
 tests/python/contrib/test_hexagon/README.md   |   2 +-
 .../contrib/test_hexagon/conv2d/README.md     |   2 +-
 .../conv2d/test_conv2d_blocked.md             |  38 ++---
 .../test_hexagon/conv2d/test_conv2d_conv2d.md |  50 +++---
 .../contrib/test_hexagon/test_parallel_hvx.py |   2 +-
 .../relay/backend/test_pass_lower_te.py       |  20 +--
 .../relay/collage/demo_collage_partitioner.py |   2 +-
 tests/python/relay/collage/menangerie.py      |   4 +-
 tests/python/relay/collage/test_sub_graph.py  |  36 ++--
 tests/python/relay/test_ir_parser.py          |   2 +-
 .../relay/test_pass_collage_partition.py      |  22 +--
 .../relay/test_pass_dead_code_elimination.py  |   2 +-
 tests/python/relay/test_pass_plan_devices.py  |   6 +-
 tests/python/relay/test_target_hooks.py       |   4 +-
 .../transform/test_compiler_function_utils.py |   4 +-
 .../unittest/test_auto_scheduler_feature.py   |   2 +-
 ...est_tir_transform_compact_buffer_region.py |   2 +-
 tests/scripts/release/.gitignore              |   1 -
 tests/scripts/release/README.md               |   2 +-
 tests/scripts/task_config_build_cortexm.sh    |   1 -
 tests/scripts/task_config_build_i386.sh       |   1 -
 tests/scripts/task_config_build_jvm.sh        |   2 +-
 tests/scripts/task_cpp_unittest.sh            |   1 -
 tests/scripts/task_lint.sh                    |   3 +
 tests/scripts/task_python_integration.sh      |   2 +-
 vta/README.md                                 |   2 +-
 vta/tutorials/autotvm/README.txt              |   1 -
 web/Makefile                                  |   2 +-
 web/package.json                              |   2 +-
 web/src/compact.ts                            |   2 +-
 web/src/environment.ts                        |   2 +-
 web/src/index.ts                              |   2 +-
 web/src/support.ts                            |   2 +-
 web/tests/node/test_ndarray.js                |   1 -
 154 files changed, 466 insertions(+), 415 deletions(-)
 create mode 100755 tests/lint/trailing_newlines.py
 create mode 100755 tests/lint/whitespace.sh

diff --git a/.gitattributes b/.gitattributes
index 29e2373f30ff..1c7a460675f8 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1 @@
 Jenkinsfile linguist-generated=true
-
diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md
index d6ae1180ee41..49d0695b8e4d 100644
--- a/.github/ISSUE_TEMPLATE/documentation.md
+++ b/.github/ISSUE_TEMPLATE/documentation.md
@@ -24,4 +24,3 @@ Otherwise, specify what actions should be taken to provide additional clarity/re
 Please refer to the list of label tags [here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels) to find the relevant tags and add them below in a bullet format (example below).
 
 * needs-triage
-
diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
index 9a3917c656ec..b32ff90325d7 100644
--- a/.github/actions/setup/action.yml
+++ b/.github/actions/setup/action.yml
@@ -33,4 +33,3 @@ runs:
     run: |
       conda info
       conda list
-
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 38f8c629c3d5..946ba5338dbc 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -12,4 +12,4 @@ updates:
     directory: "/"
     schedule:
       interval: "monthly"
-    open-pull-requests-limit: 0
\ No newline at end of file
+    open-pull-requests-limit: 0
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 55fe5f1441cb..b03a1795ef9e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -60,7 +60,7 @@ jobs:
                        -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
                        -DCMAKE_BUILD_WITH_INSTALL_NAME_DIR=ON \
                        -DUSE_IOS_RPC=ON"
-          
+
           mkdir build-ios-simulator
           cd build-ios-simulator
           cmake .. ${CMAKE_FLAGS}
@@ -164,4 +164,4 @@ jobs:
         uses: actions/upload-artifact@v2
         with:
           name: android_camera-debug.apk
-          path: ./apps/android_camera/app/build/outputs/apk/debug/app-debug.apk
\ No newline at end of file
+          path: ./apps/android_camera/app/build/outputs/apk/debug/app-debug.apk
diff --git a/3rdparty/libcrc/tab/gentab_ccitt.inc b/3rdparty/libcrc/tab/gentab_ccitt.inc
index e3699471421a..8cd92ebc1794 100644
--- a/3rdparty/libcrc/tab/gentab_ccitt.inc
+++ b/3rdparty/libcrc/tab/gentab_ccitt.inc
@@ -267,4 +267,3 @@ const uint16_t crc_tabccitt[256] = {
 	0x0ED1u,
 	0x1EF0u
 };
-
diff --git a/NEWS.md b/NEWS.md
index 85dc0fbf363c..2b575f7aa214 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2770,5 +2770,3 @@ We also make major improvements in supporting new backends: ROCm for AMDGPUs and
 - DLPack integration support
 - AOT and module system
 - Basic code structure ready.
-
-
diff --git a/apps/android_camera/app/src/main/jni/Android.mk b/apps/android_camera/app/src/main/jni/Android.mk
index 4ff3da8f3327..513666a4ecb4 100644
--- a/apps/android_camera/app/src/main/jni/Android.mk
+++ b/apps/android_camera/app/src/main/jni/Android.mk
@@ -57,4 +57,4 @@ ifdef ADD_LDLIBS
 	LOCAL_LDLIBS += $(ADD_LDLIBS)
 endif
 
-include $(BUILD_SHARED_LIBRARY)
\ No newline at end of file
+include $(BUILD_SHARED_LIBRARY)
diff --git a/apps/android_camera/app/src/main/res/layout/activity_main.xml b/apps/android_camera/app/src/main/res/layout/activity_main.xml
index 80f9ac6902d6..14dba9cc01b5 100644
--- a/apps/android_camera/app/src/main/res/layout/activity_main.xml
+++ b/apps/android_camera/app/src/main/res/layout/activity_main.xml
@@ -20,4 +20,4 @@
     android:layout_width="match_parent"
     android:layout_height="match_parent"
     android:background="#000"
-    tools:context="org.apache.tvm.android.androidcamerademo.MainActivity" />
\ No newline at end of file
+    tools:context="org.apache.tvm.android.androidcamerademo.MainActivity" />
diff --git a/apps/android_camera/models/requirements.txt b/apps/android_camera/models/requirements.txt
index 1deff2b3548b..a44730c7ec37 100644
--- a/apps/android_camera/models/requirements.txt
+++ b/apps/android_camera/models/requirements.txt
@@ -1,4 +1,4 @@
 keras==2.9
 mxnet
 scipy
-tensorflow==2.9.1
\ No newline at end of file
+tensorflow==2.9.1
diff --git a/apps/android_deploy/app/build.gradle b/apps/android_deploy/app/build.gradle
index 2949775349bb..235bdcff5bfa 100644
--- a/apps/android_deploy/app/build.gradle
+++ b/apps/android_deploy/app/build.gradle
@@ -20,7 +20,7 @@ project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets'
 project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
 
 // Download default models(darknet framework extraction model compiled version);
-// if you wish to use your own models then place them in the "assets" directory 
+// if you wish to use your own models then place them in the "assets" directory
 // and comment out this line.
 apply from: "download-models.gradle"
 
@@ -101,4 +101,4 @@ dependencies {
     implementation 'com.android.support:design:28.0.0'
     implementation files('../../../jvm/core/target/tvm4j-core-0.0.1-SNAPSHOT.jar')
     testImplementation 'junit:junit:4.13.2'
-}
\ No newline at end of file
+}
diff --git a/apps/android_deploy/app/download-models.gradle b/apps/android_deploy/app/download-models.gradle
index 4d1620bfd953..38fd35a78653 100644
--- a/apps/android_deploy/app/download-models.gradle
+++ b/apps/android_deploy/app/download-models.gradle
@@ -78,4 +78,3 @@ tasks.whenTaskAdded { task ->
         task.dependsOn 'extractModels'
     }
 }
-
diff --git a/apps/android_deploy/dev_tools/gen_keystore.sh b/apps/android_deploy/dev_tools/gen_keystore.sh
index 56bdfd2200ee..31f3d2acf3b9 100644
--- a/apps/android_deploy/dev_tools/gen_keystore.sh
+++ b/apps/android_deploy/dev_tools/gen_keystore.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/apps/android_deploy/dev_tools/sign_apk.sh b/apps/android_deploy/dev_tools/sign_apk.sh
index 2ef58046f4ae..cd28998a7782 100644
--- a/apps/android_deploy/dev_tools/sign_apk.sh
+++ b/apps/android_deploy/dev_tools/sign_apk.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/apps/android_deploy/gradle.properties b/apps/android_deploy/gradle.properties
index 972e391a31ea..a7f4fa066909 100644
--- a/apps/android_deploy/gradle.properties
+++ b/apps/android_deploy/gradle.properties
@@ -17,4 +17,4 @@
 
 org.gradle.jvmargs=-Xmx4096M
 android.useAndroidX=true
-android.enableJetifier=true
\ No newline at end of file
+android.enableJetifier=true
diff --git a/apps/android_rpc/dev_tools/gen_keystore.sh b/apps/android_rpc/dev_tools/gen_keystore.sh
index 6a5d5bb6f6ca..6eeada70dc19 100755
--- a/apps/android_rpc/dev_tools/gen_keystore.sh
+++ b/apps/android_rpc/dev_tools/gen_keystore.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/apps/android_rpc/dev_tools/sign_apk.sh b/apps/android_rpc/dev_tools/sign_apk.sh
index 0541e893961a..e20e7ca6465a 100755
--- a/apps/android_rpc/dev_tools/sign_apk.sh
+++ b/apps/android_rpc/dev_tools/sign_apk.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/apps/bundle_deploy/README.md b/apps/bundle_deploy/README.md
index 619a2d7d05cc..dc7d29619a6e 100644
--- a/apps/bundle_deploy/README.md
+++ b/apps/bundle_deploy/README.md
@@ -48,13 +48,13 @@ This will:
 - Compile the model with Relay
 - Build a `bundle.so` shared object containing the model specification and
   parameters
-- Build a `demo_dynamic` executable that `dlopen`'s `bundle.so` (or `bundle_c.so` in 
+- Build a `demo_dynamic` executable that `dlopen`'s `bundle.so` (or `bundle_c.so` in
   terms of the MISRA-C runtime), instantiates the contained graph executor,
   and invokes the `GraphExecutor::Run` function on a cat image, then prints
   the output results.
 
 Type the following command to run the sample code with static linking.
-  
+
 ```bash
 make demo_static
 ```
@@ -63,5 +63,5 @@ This will:
 - Download the mobilenet0.25 model from the MXNet Gluon Model Zoo
 - Compile the model with Relay and outputs `model.o`
 - Build a `bundle_static.o` object containing the runtime functions
-- Build a `demo_static` executable which has static link to `bundle_static.o` and 
+- Build a `demo_static` executable which has static link to `bundle_static.o` and
   `model.o`, functions on a cat image, then prints the output results.
diff --git a/apps/cpp_rpc/CMakeLists.txt b/apps/cpp_rpc/CMakeLists.txt
index fc3aafcc4443..9f1180ee0fd3 100644
--- a/apps/cpp_rpc/CMakeLists.txt
+++ b/apps/cpp_rpc/CMakeLists.txt
@@ -70,5 +70,5 @@ if(BUILD_STATIC_RUNTIME)
 else()
   list(APPEND TVM_RPC_LINKER_LIBS tvm_runtime)
 endif()
- 
+
 target_link_libraries(tvm_rpc ${TVM_RPC_LINKER_LIBS})
diff --git a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
index fa17dcb4778b..5fae6b0a4099 100644
--- a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
+++ b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
@@ -100,4 +100,3 @@ add_library(h_tvm_runtime STATIC IMPORTED)
 set_target_properties(h_tvm_runtime PROPERTIES IMPORTED_LOCATION "${BINARY_DIR}/libtvm_runtime.a")
 
 target_link_libraries(launcher_rpc_skel -Wl,--whole-archive h_tvm_runtime -Wl,--no-whole-archive)
-
diff --git a/apps/ios_rpc/.gitignore b/apps/ios_rpc/.gitignore
index 829e4e719b69..5ac22469e6ac 100644
--- a/apps/ios_rpc/.gitignore
+++ b/apps/ios_rpc/.gitignore
@@ -1,2 +1 @@
 rpc_config.txt
-
diff --git a/apps/ios_rpc/README.md b/apps/ios_rpc/README.md
index c268d15d0179..2d9cc52dc0ad 100644
--- a/apps/ios_rpc/README.md
+++ b/apps/ios_rpc/README.md
@@ -79,7 +79,7 @@ You can get value of your `team_id` in the following ways:
   select target `tvmrpc`. At the bottom of this panel go to `Signing &
   Capabilities` tab and in the field `Team` select your local developer profile
   (`Your Name (Personal Team)`).
-  
+
   On the first run of the application you may see message `Could not launch
   "tvmrpc"` in the XCode and message `Untrusted Developer` on your device. In
   this case it will be necessary to check the certificate. Open
@@ -210,7 +210,7 @@ model and execute it on the target device. For this purpose we will use
 ```shell
 python3 tests/ios_rpc_test.py --host <host_ip_address> --port 9190 --mode "tracker"
 ```
-The output will be the same as in section 
+The output will be the same as in section
 [Standalone RPC](#standalone-rpc).
 
 ## Communication without Wi-Fi and speed up in case of slow Wi-Fi
diff --git a/apps/ios_rpc/tvmrpc/Assets.xcassets/AppIcon.appiconset/Contents.json b/apps/ios_rpc/tvmrpc/Assets.xcassets/AppIcon.appiconset/Contents.json
index 1d060ed28827..d7070bc5c02a 100644
--- a/apps/ios_rpc/tvmrpc/Assets.xcassets/AppIcon.appiconset/Contents.json
+++ b/apps/ios_rpc/tvmrpc/Assets.xcassets/AppIcon.appiconset/Contents.json
@@ -90,4 +90,4 @@
     "version" : 1,
     "author" : "xcode"
   }
-}
\ No newline at end of file
+}
diff --git a/apps/microtvm/README.md b/apps/microtvm/README.md
index 362bc407238e..1467c237c502 100644
--- a/apps/microtvm/README.md
+++ b/apps/microtvm/README.md
@@ -27,5 +27,3 @@ dependencies installed for running microTVM. To use it, run:
 $ poetry lock && poetry install
 $ poetry shell
 ```
-
-
diff --git a/apps/microtvm/cmsisnn/README.md b/apps/microtvm/cmsisnn/README.md
index f7c9ddfa74a8..befcda6bb063 100644
--- a/apps/microtvm/cmsisnn/README.md
+++ b/apps/microtvm/cmsisnn/README.md
@@ -28,7 +28,7 @@ If the demo is run in the ci_cpu Docker container provided with TVM, then the fo
 software will already be installed.
 
 If the demo is not run in the ci_cpu Docker container, then you will need the following:
-- Software required to build and run the demo (These can all be installed by running 
+- Software required to build and run the demo (These can all be installed by running
   tvm/docker/install/ubuntu_install_ethosu_driver_stack.sh.)
   - [Fixed Virtual Platform (FVP) based on Arm(R) Corstone(TM)-300 software](https://developer.arm.com/tools-and-software/open-source-software/arm-platforms-software/arm-ecosystem-fvps)
   - [cmake 3.19.5](https://github.com/Kitware/CMake/releases/)
diff --git a/apps/microtvm/cmsisnn/corstone300.ld b/apps/microtvm/cmsisnn/corstone300.ld
index e52b23da3360..2c5a0f7ef862 100644
--- a/apps/microtvm/cmsisnn/corstone300.ld
+++ b/apps/microtvm/cmsisnn/corstone300.ld
@@ -140,7 +140,7 @@ SECTIONS
     *(.rodata.tvm)
     . = ALIGN (16);
     *(.data.tvm);
-    . = ALIGN(16);    
+    . = ALIGN(16);
   } > DDR
 
   .text :
diff --git a/apps/microtvm/reference-vm/README.md b/apps/microtvm/reference-vm/README.md
index 3d419cd36463..6fe039a9fda9 100644
--- a/apps/microtvm/reference-vm/README.md
+++ b/apps/microtvm/reference-vm/README.md
@@ -30,8 +30,8 @@ For more information on how to use them, see the
 
 Each RTOS or platform (like Zephyr, Ardunio, etc) that integrates with microTVM
 can check-in installation scripts in the Reference VM in this directory to help
-the community collaborate. You should use the tools provided here to ensure a 
-uniform release process across all platforms. Typically, releases need to be 
+the community collaborate. You should use the tools provided here to ensure a
+uniform release process across all platforms. Typically, releases need to be
 created by TVM committers.
 
 Generally speaking, it's expected that any integrated platform with a regression
diff --git a/apps/microtvm/reference-vm/base-box/base_box_provision.sh b/apps/microtvm/reference-vm/base-box/base_box_provision.sh
index d96852b7f57a..d8b987973735 100755
--- a/apps/microtvm/reference-vm/base-box/base_box_provision.sh
+++ b/apps/microtvm/reference-vm/base-box/base_box_provision.sh
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 #
-#   Using this script we can reuse docker/install scripts to configure the reference 
+#   Using this script we can reuse docker/install scripts to configure the reference
 #   virtual machine similar to CI Cortex-M setup.
 #
 
@@ -42,7 +42,7 @@ cd ~
 sudo apt-get install -y ca-certificates
 
 # Install Arduino-CLI (specific version)
-# To keep in sync with the version 
+# To keep in sync with the version
 # defined in apps/microtvm/arduino/template_project/microtvm_api_server.py
 ARDUINO_CLI_VERSION="0.21.1"
 
diff --git a/apps/microtvm/zephyr/README.md b/apps/microtvm/zephyr/README.md
index 68e9975d4b1c..1003b65f824b 100644
--- a/apps/microtvm/zephyr/README.md
+++ b/apps/microtvm/zephyr/README.md
@@ -16,4 +16,3 @@
 <!--- under the License. -->
 
 This directory contains code to interface microTVM with the [Zephyr RTOS](https://zephyrproject.org/).
-
diff --git a/apps/microtvm/zephyr/template_project/src/host_driven/fvp/semihost.c b/apps/microtvm/zephyr/template_project/src/host_driven/fvp/semihost.c
index 64a43b02d933..2e03df096307 100644
--- a/apps/microtvm/zephyr/template_project/src/host_driven/fvp/semihost.c
+++ b/apps/microtvm/zephyr/template_project/src/host_driven/fvp/semihost.c
@@ -84,4 +84,4 @@ ssize_t semihost_write(void* unused_context, const uint8_t* data, size_t size) {
   write_req.size = size;
   uint32_t ret_val = semihost_cmd(0x05, &write_req);
   return size - ret_val;
-}
\ No newline at end of file
+}
diff --git a/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt b/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt
index dd3582f86f7d..0ca000d48e6a 100644
--- a/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt
+++ b/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT DEFINED CMSIS_PATH)
 endif()
 
 set(TVMC_COMMAND python3 -m tvm.driver.tvmc)
-set(TVMC_ARGS 
+set(TVMC_ARGS
     --target="cmsis-nn -mcpu=cortex-m55, c" # CMSIS-NN and C targets
     --runtime=crt # C Runtime
     --executor=aot # Ahead-of-Time Executor
@@ -41,14 +41,14 @@ set(TVM_RUNTIME
     ${CMAKE_CURRENT_BINARY_DIR}/runtime/src/runtime/crt/common/crt_backend_api.c
     ${CMAKE_CURRENT_BINARY_DIR}/runtime/src/runtime/crt/memory/stack_allocator.c
 )
-set(CODEGEN_OUTPUT 
+set(CODEGEN_OUTPUT
     ${CMAKE_CURRENT_BINARY_DIR}/codegen/host/src/default_lib0.c
     ${CMAKE_CURRENT_BINARY_DIR}/codegen/host/src/default_lib1.c
     ${CMAKE_CURRENT_BINARY_DIR}/codegen/host/src/default_lib2.c
 )
-set(DATA_FILES 
-    ${CMAKE_CURRENT_BINARY_DIR}/inputs.c 
-    ${CMAKE_CURRENT_BINARY_DIR}/outputs.c 
+set(DATA_FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/inputs.c
+    ${CMAKE_CURRENT_BINARY_DIR}/outputs.c
     ${CMAKE_CURRENT_BINARY_DIR}/labels.c
 )
 set(CMSIS_SOURCES
@@ -71,7 +71,7 @@ set(CMSIS_SOURCES
 
 add_custom_command(
     OUTPUT ${TVM_RUNTIME}
-    OUTPUT ${CODEGEN_OUTPUT} 
+    OUTPUT ${CODEGEN_OUTPUT}
     COMMAND ${TVMC_COMMAND} compile ${TVMC_ARGS} ${CMAKE_CURRENT_SOURCE_DIR}/model/cnn_s_quantized.tflite
     COMMAND tar xf ${CMAKE_CURRENT_BINARY_DIR}/module.tar
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
@@ -83,15 +83,15 @@ add_custom_command(
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
 )
 
-target_sources(app PRIVATE 
+target_sources(app PRIVATE
     src/main.c
     ${TVM_RUNTIME}
     ${CODEGEN_OUTPUT}
     ${DATA_FILES}
     ${CMSIS_SOURCES}
 )
-target_include_directories(app 
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include 
+target_include_directories(app
+    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
     PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/runtime/include ${CMAKE_CURRENT_BINARY_DIR}/codegen/host/include
     PUBLIC ${CMSIS_PATH}/CMSIS/NN/Include/ ${CMSIS_PATH}/CMSIS/DSP/Include
 )
diff --git a/apps/microtvm/zephyr_cmsisnn/README.md b/apps/microtvm/zephyr_cmsisnn/README.md
index e3e8f1c27876..df54acfbc736 100644
--- a/apps/microtvm/zephyr_cmsisnn/README.md
+++ b/apps/microtvm/zephyr_cmsisnn/README.md
@@ -30,7 +30,7 @@ Download the keyword spotting model to the `model` directory:
 wget \
     https://github.com/ARM-software/ML-zoo/blob/ee35139af86bdace5e502b09fe8b9da9cb1f06bb/models/keyword_spotting/cnn_small/tflite_int8/cnn_s_quantized.tflite \
     -O model/cnn_s_quantized.tflite
-``` 
+```
 
 Checkout [CMSIS_5](https://github.com/ARM-software/CMSIS_5.git) (default is `/opt/arm/ethosu/cmsis` to reflect `tlcpack/ci_cortexm`):
 ```
diff --git a/apps/microtvm/zephyr_cmsisnn/model/labels.txt b/apps/microtvm/zephyr_cmsisnn/model/labels.txt
index ba416458b011..f66507b143ec 100644
--- a/apps/microtvm/zephyr_cmsisnn/model/labels.txt
+++ b/apps/microtvm/zephyr_cmsisnn/model/labels.txt
@@ -9,4 +9,4 @@ right
 on
 off
 stop
-go
\ No newline at end of file
+go
diff --git a/apps/microtvm/zephyr_cmsisnn/run_demo.sh b/apps/microtvm/zephyr_cmsisnn/run_demo.sh
index 6588b561743b..5617e96e95f1 100755
--- a/apps/microtvm/zephyr_cmsisnn/run_demo.sh
+++ b/apps/microtvm/zephyr_cmsisnn/run_demo.sh
@@ -46,7 +46,7 @@ west zephyr-export
 west build
 west build -t run &> ${LOGDIR}/west.log &
 
-# Wait for "exit" keyword 
+# Wait for "exit" keyword
 until grep -m 1 "exit" ${LOGDIR}/west.log; do sleep 1 ; done
 
 # Check the log for correct output
diff --git a/apps/pt_tvmdsoop/prepare_and_test_pt_tvm_class.sh b/apps/pt_tvmdsoop/prepare_and_test_pt_tvm_class.sh
index 666f774017c8..ceb98857533a 100755
--- a/apps/pt_tvmdsoop/prepare_and_test_pt_tvm_class.sh
+++ b/apps/pt_tvmdsoop/prepare_and_test_pt_tvm_class.sh
@@ -22,12 +22,12 @@ echo "TVM_ROOT=${TVM_ROOT}"
 export PYTHONPATH=${TVM_ROOT}/python
 
 if [ ! -f $TVM_ROOT/build/libtvm.so ]; then
-    echo "$TVM_ROOT/build/libtvm.so missing" 
+    echo "$TVM_ROOT/build/libtvm.so missing"
     exit 1
 fi
 
 if [ ! -f $TVM_ROOT/build/libtvm_runtime.so ]; then
-    echo "$TVM_ROOT/build/libtvm_runtime.so missing" 
+    echo "$TVM_ROOT/build/libtvm_runtime.so missing"
     exit 1
 fi
 
@@ -43,4 +43,3 @@ if [ "$?" -eq 0 ]; then
 
     LD_LIBRARY_PATH=${TVM_ROOT}/build:./build:$LD_LIBRARY_PATH python3 -m pytest -v ./tests
 fi
-
diff --git a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh
index 2bde4f87c84e..fa4f1f9f9715 100644
--- a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh
+++ b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh
@@ -22,14 +22,13 @@ echo "TVM_ROOT=${TVM_ROOT}"
 export PYTHONPATH=${TVM_ROOT}/python
 
 python3 -c "import tvm; print(tvm.runtime.enabled('gpu'))" | grep -e 1
-if [ "$?" -eq 0 ]; then 
+if [ "$?" -eq 0 ]; then
     echo "Build TF_TVMDSOOP with gpu support and execute tests"
     CMAKE_OPTIONS="-DUSE_CUDA=ON -DPython3_EXECUTABLE=python3 -DTVM_ROOT=${TVM_ROOT}"
- 
+
     mkdir -p build
     cd build; cmake .. ${CMAKE_OPTIONS} && make
     cd ..
 
     LD_LIBRARY_PATH=${TVM_ROOT}/build:./build:$LD_LIBRARY_PATH python3 -m pytest -v ./tests
 fi
-
diff --git a/apps/vta_rpc/start_rpc_server.sh b/apps/vta_rpc/start_rpc_server.sh
index a6f80e27f139..46258f9d7962 100755
--- a/apps/vta_rpc/start_rpc_server.sh
+++ b/apps/vta_rpc/start_rpc_server.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/apps/wasm-standalone/README.md b/apps/wasm-standalone/README.md
index b8a977f6ae50..34c844368029 100644
--- a/apps/wasm-standalone/README.md
+++ b/apps/wasm-standalone/README.md
@@ -169,8 +169,8 @@ input image belongs to the class `tiger cat`
 
 Note: this example also works without WASI support. Please modify `wasm-graph/.cargo/config` to change the target to
 `wasm32-unknown-unknown` and uncomment the raw wasm engine in `wasm-runtime/src/graph.rs` to run in pure wasm32. SIMD
-may not be supported without WASI support. You may also need to delete ` -mattr=+simd128` in the 
-[build script](apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py).  
+may not be supported without WASI support. You may also need to delete ` -mattr=+simd128` in the
+[build script](apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py).
 
 ## Future Work
 
diff --git a/ci/jenkins/.gitignore b/ci/jenkins/.gitignore
index 187a72392cc8..3d2dbd4b6317 100644
--- a/ci/jenkins/.gitignore
+++ b/ci/jenkins/.gitignore
@@ -1 +1 @@
-/_venv
\ No newline at end of file
+/_venv
diff --git a/ci/scripts/git_skip_ci_globs.py b/ci/scripts/git_skip_ci_globs.py
index 6c0f9d017605..f51df055c980 100755
--- a/ci/scripts/git_skip_ci_globs.py
+++ b/ci/scripts/git_skip_ci_globs.py
@@ -35,6 +35,8 @@
     # microTVM
     "apps/microtvm/poetry.lock",
     "apps/microtvm/pyproject.toml",
+    "tests/lint/*",
+    "tests/scripts/task_lint.sh",
 ]
 
 
diff --git a/cmake/modules/Git.cmake b/cmake/modules/Git.cmake
index 23840023965d..46c0a9901c03 100644
--- a/cmake/modules/Git.cmake
+++ b/cmake/modules/Git.cmake
@@ -48,7 +48,7 @@ if (${GIT_FOUND})
     message(STATUS "Found TVM_GIT_COMMIT_TIME=${TVM_GIT_COMMIT_TIME}")
   else()
     set(TVM_GIT_COMMIT_TIME "NOT-FOUND")
-  endif() 
+  endif()
 else()
   message(WARNING "Git not found")
   set(TVM_GIT_COMMIT_HASH "NOT-FOUND")
diff --git a/cmake/modules/contrib/BNNS.cmake b/cmake/modules/contrib/BNNS.cmake
index 1adb3ba10231..95c034b1b9ec 100644
--- a/cmake/modules/contrib/BNNS.cmake
+++ b/cmake/modules/contrib/BNNS.cmake
@@ -27,4 +27,3 @@ if(USE_BNNS STREQUAL "ON")
   list(APPEND RUNTIME_SRCS ${BNNS_CONTRIB_SRC})
   message(STATUS "Build with BNNS JSON runtime: " ${EXTERN_LIBRARY_BNNS})
 endif()
-
diff --git a/cmake/modules/contrib/CODEGENC.cmake b/cmake/modules/contrib/CODEGENC.cmake
index 412fa3e8ffc5..b461176e6a84 100644
--- a/cmake/modules/contrib/CODEGENC.cmake
+++ b/cmake/modules/contrib/CODEGENC.cmake
@@ -17,4 +17,3 @@
 
 tvm_file_glob(GLOB CSOURCE_RELAY_CONTRIB_SRC src/relay/backend/contrib/codegen_c/*.cc)
 list(APPEND COMPILER_SRCS ${CSOURCE_RELAY_CONTRIB_SRC})
-
diff --git a/cmake/modules/contrib/DNNL.cmake b/cmake/modules/contrib/DNNL.cmake
index caa5a84e4492..7547af81eb1a 100644
--- a/cmake/modules/contrib/DNNL.cmake
+++ b/cmake/modules/contrib/DNNL.cmake
@@ -60,4 +60,3 @@ elseif(USE_DNNL STREQUAL "OFF")
 else()
   message(FATAL_ERROR "Invalid option: USE_DNNL=" ${USE_DNNL})
 endif()
-
diff --git a/cmake/modules/contrib/Posit.cmake b/cmake/modules/contrib/Posit.cmake
index d62e09f27910..b8d180ee4480 100644
--- a/cmake/modules/contrib/Posit.cmake
+++ b/cmake/modules/contrib/Posit.cmake
@@ -20,7 +20,7 @@ if(USE_BYODT_POSIT)
   if (NOT UNIVERSAL_PATH)
     message(FATAL_ERROR "Fail to get Universal path")
   endif(NOT UNIVERSAL_PATH)
-  
+
   include_directories(${UNIVERSAL_PATH}/include)
   list(APPEND COMPILER_SRCS "src/target/datatype/posit/posit-wrapper.cc")
 endif(USE_BYODT_POSIT)
diff --git a/cmake/modules/contrib/TF_TVMDSOOP.cmake b/cmake/modules/contrib/TF_TVMDSOOP.cmake
index 86b14740212c..f5f3f036690f 100644
--- a/cmake/modules/contrib/TF_TVMDSOOP.cmake
+++ b/cmake/modules/contrib/TF_TVMDSOOP.cmake
@@ -17,14 +17,14 @@
 
 if(NOT USE_TF_TVMDSOOP STREQUAL "OFF")
   find_package(Python3 COMPONENTS Interpreter)
-  
+
   execute_process(COMMAND ${Python3_EXECUTABLE} -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_compile_flags()))"
     OUTPUT_VARIABLE TF_COMPILE_FLAGS_STR
     RESULT_VARIABLE TF_STATUS)
   if (NOT ${TF_STATUS} EQUAL 0)
     message(FATAL_ERROR "Fail to get TensorFlow compile flags")
   endif()
-  
+
   if(NOT USE_CUDA STREQUAL "OFF")
     add_definitions(-DTF_TVMDSOOP_ENABLE_GPU)
   endif()
@@ -45,13 +45,12 @@ if(NOT USE_TF_TVMDSOOP STREQUAL "OFF")
   tvm_file_glob(GLOB_RECURSE TFTVM_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/src/contrib/tf_op/*.cc)
   add_library(${OP_LIBRARY_NAME} SHARED ${TFTVM_SRCS})
   set(TFTVM_LINK_FLAGS  -ltvm -L${CMAKE_CURRENT_BINARY_DIR})
-  
+
   if (NOT BUILD_TVMDSOOP_ONLY STREQUAL "ON")
-      add_dependencies(${OP_LIBRARY_NAME} tvm) 
+      add_dependencies(${OP_LIBRARY_NAME} tvm)
   endif()
 
   target_compile_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_COMPILE_FLAGS} ${TF_COMPILE_FLAGS})
   target_link_libraries(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_LINK_FLAGS} ${TF_LINK_FLAGS})
 
 endif()
-
diff --git a/cmake/modules/contrib/Verilator.cmake b/cmake/modules/contrib/Verilator.cmake
index e74e582969d1..61a2e309d06e 100644
--- a/cmake/modules/contrib/Verilator.cmake
+++ b/cmake/modules/contrib/Verilator.cmake
@@ -21,4 +21,3 @@ if(USE_VERILATOR STREQUAL "ON")
   list(APPEND COMPILER_SRCS ${VERILATOR_RELAY_CONTRIB_SRC})
   list(APPEND RUNTIME_SRCS ${VERILATOR_CONTRIB_SRC})
 endif()
-
diff --git a/cmake/utils/FindEthosN.cmake b/cmake/utils/FindEthosN.cmake
index 07eaf5ddeee1..591d49f82915 100644
--- a/cmake/utils/FindEthosN.cmake
+++ b/cmake/utils/FindEthosN.cmake
@@ -85,7 +85,7 @@ macro(find_ethosn use_ethosn)
     else()
       set(ETHOSN_DEFINITIONS -DETHOSN_API_VERSION=${USE_ETHOSN_API_VERSION})
     endif()
-  
+
     if(ETHOSN_COMPILER_LIBRARY)
       set(ETHOSN_FOUND TRUE)
     endif()
diff --git a/cmake/utils/FindVulkan.cmake b/cmake/utils/FindVulkan.cmake
index 4349125d3f37..a2a6fb60352d 100644
--- a/cmake/utils/FindVulkan.cmake
+++ b/cmake/utils/FindVulkan.cmake
@@ -43,7 +43,7 @@ macro(find_vulkan use_vulkan use_khronos_spirv)
      set(__vulkan_sdk "")
    endif()
 
-   
+
    if(IS_DIRECTORY ${use_khronos_spirv})
      set(__use_khronos_spirv ${use_khronos_spirv})
      message(STATUS "Custom khronos spirv PATH=" ${__use_khronos_spirv})
diff --git a/conda/condarc b/conda/condarc
index eef4967f90fe..b4592374efcb 100644
--- a/conda/condarc
+++ b/conda/condarc
@@ -20,23 +20,23 @@
 # remote_connect_timeout_secs (float)
 #   The number seconds conda will wait for your client to establish a
 #   connection to a remote url resource.
-# 
+#
 remote_connect_timeout_secs: 10
 
 # remote_max_retries (int)
 #   The maximum number of retries each HTTP connection should attempt.
-# 
+#
 remote_max_retries: 6
 
 # remote_backoff_factor (int)
 #   The factor determines the time HTTP connection should wait for
 #   attempt.
-# 
+#
 remote_backoff_factor: 5
 
 # remote_read_timeout_secs (float)
 #   Once conda has connected to a remote resource and sent an HTTP
 #   request, the read timeout is the number of seconds conda will wait for
 #   the server to send a response.
-# 
+#
 remote_read_timeout_secs: 60.0
diff --git a/docker/Dockerfile.ci_minimal b/docker/Dockerfile.ci_minimal
index b4ba758901b4..2a3da14f0fe1 100644
--- a/docker/Dockerfile.ci_minimal
+++ b/docker/Dockerfile.ci_minimal
@@ -58,4 +58,4 @@ RUN bash /install/ubuntu_install_redis.sh
 # sccache
 COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh
 RUN bash /install/ubuntu_install_sccache.sh
-ENV PATH /opt/sccache:$PATH
\ No newline at end of file
+ENV PATH /opt/sccache:$PATH
diff --git a/docker/README.md b/docker/README.md
index a05079d30881..c311e86d190a 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -80,7 +80,7 @@ You can also start an interactive session by typing
 The built docker images are prefixed by ``tvm.``, for example the command
 
 ````bash
-./docker/build.sh image_name 
+./docker/build.sh image_name
 ````
 
 produces the image ``tvm.ci_cpu`` that is displayed in the list of docker images
@@ -90,7 +90,7 @@ using the command ``docker images``. To run an interactive terminal, execute:
 ./docker/bash.sh tvm.ci_cpu
 ````
 
-or 
+or
 
 ````bash
 ./docker/bash.sh tvm.ci_cpu echo hello tvm world
@@ -98,7 +98,7 @@ or
 
 the same applies to the other images (``./docker/Dockerfile.*```).
 
-The command ``./docker/build.sh image_name COMMANDS`` is almost equivelant to 
+The command ``./docker/build.sh image_name COMMANDS`` is almost equivelant to
 ``./docker/bash.sh image_name COMMANDS`` but in the case of ``bash.sh``
 a build attempt is not done.
 
diff --git a/docker/bash.sh b/docker/bash.sh
index 3f8f3d8baba4..2af65b17f5ca 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -137,7 +137,7 @@ CONTAINER_NAME=
 # "${REPO_DIR}".  The consistent directory for Jenkins is currently
 # necessary to allow cmake build commands to run in CI after the build
 # steps.
-# TODO(https://github.com/apache/tvm/issues/11952): 
+# TODO(https://github.com/apache/tvm/issues/11952):
 # Figure out a better way to keep the same path
 # between build and testing stages.
 if [[ -n "${JENKINS_HOME:-}" ]]; then
diff --git a/docker/install/ubuntu1804_manual_install_llvm.sh b/docker/install/ubuntu1804_manual_install_llvm.sh
index f0e9abd1d9fd..bce4222b65c2 100755
--- a/docker/install/ubuntu1804_manual_install_llvm.sh
+++ b/docker/install/ubuntu1804_manual_install_llvm.sh
@@ -35,4 +35,3 @@ ninja install
 popd
 popd
 rm -rf llvm-project
-
diff --git a/docker/install/ubuntu_install_androidsdk.sh b/docker/install/ubuntu_install_androidsdk.sh
index a809ff349c41..bf2d80cd28ba 100755
--- a/docker/install/ubuntu_install_androidsdk.sh
+++ b/docker/install/ubuntu_install_androidsdk.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -83,4 +83,3 @@ for f in ${ANDROID_HOME}/ndk/21.3.6528147/* ; do
   ln --symbolic "$f" "/usr/bin/`basename $f`"
 done
 echo "export ANDROID_HOME=${ANDROID_HOME}" >> /etc/profile
-
diff --git a/docker/install/ubuntu_install_caffe2.sh b/docker/install/ubuntu_install_caffe2.sh
index fa091f950497..1a27bc44ad10 100755
--- a/docker/install/ubuntu_install_caffe2.sh
+++ b/docker/install/ubuntu_install_caffe2.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_cmsis.sh b/docker/install/ubuntu_install_cmsis.sh
index 9fcbcf61cefa..a41e3df0ae55 100755
--- a/docker/install/ubuntu_install_cmsis.sh
+++ b/docker/install/ubuntu_install_cmsis.sh
@@ -49,4 +49,3 @@ echo "$CMSIS_SHASUM" ${DOWNLOAD_PATH} | sha512sum -c
 tar -xf "${DOWNLOAD_PATH}" -C "${INSTALLATION_PATH}" --strip-components=1
 touch "${INSTALLATION_PATH}"/"${CMSIS_SHA}".sha
 echo "SUCCESS"
-
diff --git a/docker/install/ubuntu_install_coreml.sh b/docker/install/ubuntu_install_coreml.sh
index cbdc87666b4e..18802a89436b 100755
--- a/docker/install/ubuntu_install_coreml.sh
+++ b/docker/install/ubuntu_install_coreml.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_dnnl.sh b/docker/install/ubuntu_install_dnnl.sh
index 3654d140f55b..5aaf3be7fbbf 100755
--- a/docker/install/ubuntu_install_dnnl.sh
+++ b/docker/install/ubuntu_install_dnnl.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_gluoncv.sh b/docker/install/ubuntu_install_gluoncv.sh
index cb24ff74e9ff..d42705b86e2b 100755
--- a/docker/install/ubuntu_install_gluoncv.sh
+++ b/docker/install/ubuntu_install_gluoncv.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_golang.sh b/docker/install/ubuntu_install_golang.sh
index 4e8e743266c1..05c57c955bf2 100755
--- a/docker/install/ubuntu_install_golang.sh
+++ b/docker/install/ubuntu_install_golang.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_gradle.sh b/docker/install/ubuntu_install_gradle.sh
index 030be040c6e3..56dcb05a2014 100755
--- a/docker/install/ubuntu_install_gradle.sh
+++ b/docker/install/ubuntu_install_gradle.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_hexagon.sh b/docker/install/ubuntu_install_hexagon.sh
index e616c8a4977c..18b8a0f66587 100755
--- a/docker/install/ubuntu_install_hexagon.sh
+++ b/docker/install/ubuntu_install_hexagon.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_java.sh b/docker/install/ubuntu_install_java.sh
index d0ced98d89f1..5556f0d8fed5 100755
--- a/docker/install/ubuntu_install_java.sh
+++ b/docker/install/ubuntu_install_java.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_nodejs.sh b/docker/install/ubuntu_install_nodejs.sh
index 2bb8a115a0c4..b295d9e3e41d 100755
--- a/docker/install/ubuntu_install_nodejs.sh
+++ b/docker/install/ubuntu_install_nodejs.sh
@@ -29,4 +29,4 @@ apt-install-and-clear -y curl
 # The node install script fetched and executed here will update the
 # apt source list, hence the second apt-get update --fix-missing is necessary.
 curl -s -S -L https://deb.nodesource.com/setup_14.x | bash -
-apt-install-and-clear -y nodejs
\ No newline at end of file
+apt-install-and-clear -y nodejs
diff --git a/docker/install/ubuntu_install_opencl.sh b/docker/install/ubuntu_install_opencl.sh
index 705f4a65eedb..a1ae18471f85 100755
--- a/docker/install/ubuntu_install_opencl.sh
+++ b/docker/install/ubuntu_install_opencl.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_rocm.sh b/docker/install/ubuntu_install_rocm.sh
index 15cb1e143ac4..f8ed4d00fe68 100755
--- a/docker/install/ubuntu_install_rocm.sh
+++ b/docker/install/ubuntu_install_rocm.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_universal.sh b/docker/install/ubuntu_install_universal.sh
index a054aafdd5f7..07b38f1a5269 100755
--- a/docker/install/ubuntu_install_universal.sh
+++ b/docker/install/ubuntu_install_universal.sh
@@ -23,4 +23,4 @@ set -o pipefail
 git clone https://github.com/stillwater-sc/universal.git /opt/universal
 
 # Use specific versioning tag.
-(cd /opt/universal && git checkout e32899d551b53d758865fabd5fdd69eed35bfb0f)
\ No newline at end of file
+(cd /opt/universal && git checkout e32899d551b53d758865fabd5fdd69eed35bfb0f)
diff --git a/docker/install/ubuntu_install_vitis_ai_core.sh b/docker/install/ubuntu_install_vitis_ai_core.sh
index 48980d2e7ba2..2e395b45daaa 100755
--- a/docker/install/ubuntu_install_vitis_ai_core.sh
+++ b/docker/install/ubuntu_install_vitis_ai_core.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
index ccaf113cec58..569df12a37df 100755
--- a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
+++ b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/docker/utils/apt-install-and-clear.sh b/docker/utils/apt-install-and-clear.sh
index 1840c17b37bb..56ef98b00fdb 100755
--- a/docker/utils/apt-install-and-clear.sh
+++ b/docker/utils/apt-install-and-clear.sh
@@ -17,4 +17,3 @@
 # under the License.
 
 apt-get install $@ && apt-get clean
-
diff --git a/docs/_static/img/README b/docs/_static/img/README
index 414328cc729d..8e908dc1af2f 100644
--- a/docs/_static/img/README
+++ b/docs/_static/img/README
@@ -1,2 +1,2 @@
 The logo file in this repo is an exception due to the need of sphinx.
-By default we avoid to put large binary blobs into this repo.
\ No newline at end of file
+By default we avoid to put large binary blobs into this repo.
diff --git a/docs/arch/hybrid_script.rst b/docs/arch/hybrid_script.rst
index 33a65f268335..a4fce342f728 100644
--- a/docs/arch/hybrid_script.rst
+++ b/docs/arch/hybrid_script.rst
@@ -96,5 +96,5 @@ except ``popcount`` and ``sigmoid``. I implemented them manually.
 Casting
 ~~~~~~~
 
-You can cast values by using the keywords ``uint8``, ``uint16`` ``uint32``, ``uint64``, ``int8``, ``int16``, ``int32``, ``int64``, 
+You can cast values by using the keywords ``uint8``, ``uint16`` ``uint32``, ``uint64``, ``int8``, ``int16``, ``int32``, ``int64``,
 ``float16``, ``float32``, ``float64``.
diff --git a/docs/contribute/release_process.rst b/docs/contribute/release_process.rst
index 129d65fc9043..4b5c45fc84ba 100644
--- a/docs/contribute/release_process.rst
+++ b/docs/contribute/release_process.rst
@@ -57,7 +57,7 @@ It is recommended to open a Github issue to collect feedbacks for the release no
 Prepare the Release Candidate
 -----------------------------
 
-There may be some code changes necessary to the release branch before the release. Ensure all version numbers are up to date 
+There may be some code changes necessary to the release branch before the release. Ensure all version numbers are up to date
 
 
 Prepare the GPG Key
@@ -145,12 +145,12 @@ Create GPG signature as well as the hash of the file,
 	shasum -a 512 apache-tvm-src-v0.6.0.rc0.tar.gz > apache-tvm-src-v0.6.0.rc0.tar.gz.sha512
 
 
-Update TVM Version on Main 
+Update TVM Version on Main
 --------------------------
 
-After cutting a release candidate, make sure to update the version numbers throughout `main`. For example if we are 
-releasing `v0.10.0` we want to bump the version numbers throughout the codebase from `v0.10.dev0` to `v0.11.dev0`. An 
-example of how to do this can be found here: `https://github.com/apache/tvm/pull/12190 <https://github.com/apache/tvm/pull/12190>`_. 
+After cutting a release candidate, make sure to update the version numbers throughout `main`. For example if we are
+releasing `v0.10.0` we want to bump the version numbers throughout the codebase from `v0.10.dev0` to `v0.11.dev0`. An
+example of how to do this can be found here: `https://github.com/apache/tvm/pull/12190 <https://github.com/apache/tvm/pull/12190>`_.
 
 Upload the Release Candidate
 ----------------------------
diff --git a/docs/dev/how_to/relay_add_op.rst b/docs/dev/how_to/relay_add_op.rst
index 2a8c771dc63d..225c60071fd0 100644
--- a/docs/dev/how_to/relay_add_op.rst
+++ b/docs/dev/how_to/relay_add_op.rst
@@ -15,13 +15,13 @@
     specific language governing permissions and limitations
     under the License.
 
-.. _relay-add-op: 
+.. _relay-add-op:
 
 Adding an Operator to Relay
 ===========================
 
-In this document we will go over the steps needed to register a new TVM operator 
-in Relay. We will be following this PR which adds a `cumulative product`_ operation as an example.  
+In this document we will go over the steps needed to register a new TVM operator
+in Relay. We will be following this PR which adds a `cumulative product`_ operation as an example.
 The PR itself builds upon another PR which adds a `cumulative sum`_ operation.
 
 .. _cumulative product: https://github.com/apache/tvm/pull/7722
@@ -30,20 +30,20 @@ The PR itself builds upon another PR which adds a `cumulative sum`_ operation.
 Registering a new operator requires a few steps:
 
 1. Add an attribute node declaring fixed arguments which are known at compile time
-2. Write a type relation for your operation to integrate into Relay's type system. 
-3. Use the ``RELAY_REGISTER_OP`` macro in C++ to register the operator's arity, type, and other hints for the compiler 
-4. Write how the operator is computed 
+2. Write a type relation for your operation to integrate into Relay's type system.
+3. Use the ``RELAY_REGISTER_OP`` macro in C++ to register the operator's arity, type, and other hints for the compiler
+4. Write how the operator is computed
 5. Register the compute, schedule with the relay operator
 6. Define a C++ function to produce a call node for the operator and registering a Python API hook for the function
 7. Wrapping the above Python API hook in a neater interface
-8. Writing tests for the new relay operator 
+8. Writing tests for the new relay operator
 
 1. Defining an Attribute Node
 -----------------------------
-Attributes are fixed arguments which are supposed to be known at compile time. The stride and dilation of a convolution  
+Attributes are fixed arguments which are supposed to be known at compile time. The stride and dilation of a convolution
 operator would be an appropriate example of fields which might belong in an attribute node for a convolution operator.
 
-Attributes should be defined in a file within the folder `include/tvm/relay/attrs/`_. 
+Attributes should be defined in a file within the folder `include/tvm/relay/attrs/`_.
 
 .. _include/tvm/relay/attrs/: https://github.com/apache/tvm/tree/main/include/tvm/relay/attrs
 
@@ -78,7 +78,7 @@ Ultimately we want to create an operator whose interface can be seen clearly in
 
 A similiar interface exists for ``cumsum()``.
 
-Therefore, when defining our attributes in ``include/tvm/relay/attrs/transform.h`` we choose the axis, 
+Therefore, when defining our attributes in ``include/tvm/relay/attrs/transform.h`` we choose the axis,
 accumulation dtype, and exclusivity of the operation as appropriate fields for the struct.
 
 .. code:: c++
@@ -104,13 +104,13 @@ expressivity and granularity in expressing types in Relay, operators
 are typed using relations between input and output types. These relations
 are represented as functions that take in a list of input types and
 output types (any of these types may be incomplete) and return a list
-of input and output types that satisfies the relation. This includes shape 
+of input and output types that satisfies the relation. This includes shape
 information which can be determined statically at compile time. Essentially, a
 relation for an operator can enforce all the necessary typing rules
 (namely by inspecting the input types) in addition to computing the
 output type.
 
-Type relation for the cumulative product and sum operators can be found in 
+Type relation for the cumulative product and sum operators can be found in
 ``src/relay/op/tensor/transform.cc``:
 
 .. code:: c++
@@ -182,13 +182,13 @@ Once again we add this to ``src/relay/op/tensor/transform.cc``:
         .set_attr<TOpPattern>("TOpPattern", kOpaque);
 
 In this case the ``TOpPattern`` is a hint to the compiler on the pattern of computation the operator does, which might be
-useful for fusing operators. ``kOpaque`` tells TVM to not bother trying to fuse this operator. 
+useful for fusing operators. ``kOpaque`` tells TVM to not bother trying to fuse this operator.
 
 4. Defining the Compute of the Operation
 ----------------------------------------
 
-While we've now defined the interface for our operations we still need to define 
-how to perform the actual calculations for cumulative sum and product. 
+While we've now defined the interface for our operations we still need to define
+how to perform the actual calculations for cumulative sum and product.
 
 Writing this code is outside the scope of the tutorial. For now, we assume we
 have a well tested implementation for the operation's compute. For more details
@@ -206,13 +206,13 @@ representation where tensor expressions and topi will lower into.
 5. Hooking up Compute and Strategy with Relay
 ---------------------------------------------
 
-After you have implemented your compute function we now need to glue it to our 
-relay operation. Within TVM this means not only defining the computation, but also the schedule 
+After you have implemented your compute function we now need to glue it to our
+relay operation. Within TVM this means not only defining the computation, but also the schedule
 for an operation. A strategy is a method which picks which computation and which schedule
 to use. For example, for 2D convolutions we might recognize we are doing a depthwise convolution
-and dispatch to a more efficient computation and schedule as a result. In our case however we have 
-no such need except for dispatching between our CPU and GPU implementations. In 
-``python/tvm/relay/op/strategy/generic.py`` and ``python/tvm/relay/op/strategy/cuda.py`` we 
+and dispatch to a more efficient computation and schedule as a result. In our case however we have
+no such need except for dispatching between our CPU and GPU implementations. In
+``python/tvm/relay/op/strategy/generic.py`` and ``python/tvm/relay/op/strategy/cuda.py`` we
 add the following strategies:
 
 .. code:: python
@@ -259,8 +259,8 @@ add the following strategies:
             name="cumsum.cuda",
         )
         return strategy
-    
-    
+
+
     @cumprod_strategy.register(["cuda", "gpu"])
     def cumprod_strategy_cuda(attrs, inputs, out_type, target):
         """cumprod cuda strategy"""
@@ -271,7 +271,7 @@ add the following strategies:
             name="cumprod.cuda",
         )
         return strategy
-        
+
 Where in each strategy we define the compute we wrote and the schedule to use within ``add_implementation()``.
 We finally link the strategy and compute with the defined relay operator in ``python/tvm/relay/op/_transform.py``:
 
@@ -297,12 +297,12 @@ We finally link the strategy and compute with the defined relay operator in ``py
     _reg.register_strategy("cumprod", strategy.cumprod_strategy)
     _reg.register_shape_func("cumprod", False, elemwise_shape_func)
 
-The shape functions are used for determining output shape given a dynamically shaped tensor. In this 
+The shape functions are used for determining output shape given a dynamically shaped tensor. In this
 case we tell TVM the output shape will be the same as the input shape.
 
 6. Creating a Relay Call Node and Exposing a Python Hook
 --------------------------------------------------------
-We now have a working operation and now just need to properly call it 
+We now have a working operation and now just need to properly call it
 via a Relay Call Node. This step requires simply writing a function that takes
 the arguments to the operator (as Relay expressions) and
 returning a call node to the operator (i.e., the node that
@@ -314,7 +314,7 @@ are not supported, so it suffices to use ``Op::Get`` to fetch
 the operator's information from the operator registry and pass in
 the arguments to the call node, as below. In ``src/relay/op/tensor/transform.cc``:
 
-.. code:: c++ 
+.. code:: c++
 
     Expr MakeCumsum(Expr data, Integer axis, DataType dtype, Bool exclusive) {
         auto attrs = make_object<ScanopAttrs>();
@@ -346,7 +346,7 @@ in Python via ``relay.op._make.cumsum(...)`` and ``relay.op._make.cumsum(...)``.
 
 It is generally the convention in Relay, that functions exported
 through ``TVM_REGISTER_GLOBAL`` should be wrapped in a separate
-Python function rather than called directly in Python. For our 
+Python function rather than called directly in Python. For our
 operators we expose this cleaner interface in ``python/tvm/relay/op/transform.py``
 
 .. code:: python
@@ -383,7 +383,7 @@ before producing the call node:
 8. Writing Unit Tests!
 ----------------------
 This is self explanatory! Some example unit tests can be found in
-`tests/python/relay/test_op_level3.py`_ for our cumulative sum 
+`tests/python/relay/test_op_level3.py`_ for our cumulative sum
 and product operators.
 
 .. _tests/python/relay/test_op_level3.py: https://github.com/apache/tvm/blob/main/tests/python/relay/test_op_level3.py
diff --git a/docs/errors.rst b/docs/errors.rst
index 84ea5551601a..42ffc88c1b44 100644
--- a/docs/errors.rst
+++ b/docs/errors.rst
@@ -69,4 +69,3 @@ pieces of information:
 
 Without these details it is very difficult for the TVM developers to do very
 much to help you.
-
diff --git a/docs/how_to/deploy/arm_compute_lib.rst b/docs/how_to/deploy/arm_compute_lib.rst
index a7ec8b9501c7..31b815d05d19 100644
--- a/docs/how_to/deploy/arm_compute_lib.rst
+++ b/docs/how_to/deploy/arm_compute_lib.rst
@@ -34,7 +34,7 @@ Before installing Arm Compute Library, it is important to know what architecture
 to determine this is to use `lscpu` and look for the "Model name" of the CPU. You can then use this to
 determine the architecture by looking online.
 
-TVM only supports a single version of ACL, currently this is v21.08, there are two recommended ways to build and install 
+TVM only supports a single version of ACL, currently this is v21.08, there are two recommended ways to build and install
 the required libraries:
 
 * Use the script located at `docker/install/ubuntu_download_arm_compute_lib_binaries.sh`. You can use this
diff --git a/docs/how_to/deploy/index.rst b/docs/how_to/deploy/index.rst
index f28883446fd5..74bae0f9234e 100644
--- a/docs/how_to/deploy/index.rst
+++ b/docs/how_to/deploy/index.rst
@@ -36,7 +36,7 @@ Unlike traditional deep learning frameworks. TVM stack is divided into two major
 - TVM runtime, which runs on the target devices.
 
 In order to integrate the compiled module, we **do not** need to build entire
-TVM on the target device. You only need to build the TVM compiler stack on your 
+TVM on the target device. You only need to build the TVM compiler stack on your
 desktop and use that to cross-compile modules that are deployed on the target device.
 
 We only need to use a light-weight runtime API that can be integrated into various platforms.
@@ -77,9 +77,9 @@ architecture to be able to run the cross compiled model.
 Cross compile the TVM runtime for other architectures
 -----------------------------------------------------
 
-In the example :ref:`above <build-tvm-runtime-on-target-device>` the runtime library was 
-compiled on a Raspberry Pi. Producing the runtime library can be done much faster on 
-hosts that have high performace processors with ample resources (such as laptops, workstation) 
+In the example :ref:`above <build-tvm-runtime-on-target-device>` the runtime library was
+compiled on a Raspberry Pi. Producing the runtime library can be done much faster on
+hosts that have high performace processors with ample resources (such as laptops, workstation)
 compared to a target devices such as a Raspberry Pi. In-order to cross compile the runtime the toolchain
 for the target device must be installed. After installing the correct toolchain,
 the main difference compared to compiling natively is to pass some additional command
@@ -105,7 +105,7 @@ cross-compile for aarch64
         -DCMAKE_FIND_ROOT_PATH=/usr/aarch64-linux-gnu \
         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-        -DMACHINE_NAME=aarch64-linux-gnu 
+        -DMACHINE_NAME=aarch64-linux-gnu
 
     make -j$(nproc) runtime
 
@@ -135,7 +135,7 @@ cross-compile for RISC-V
         -DCMAKE_FIND_ROOT_PATH=/usr/riscv64-linux-gnu \
         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-        -DMACHINE_NAME=riscv64-linux-gnu 
+        -DMACHINE_NAME=riscv64-linux-gnu
 
     make -j$(nproc) runtime
 
@@ -147,7 +147,7 @@ The ``file`` command can be used to query the architecture of the produced runti
    file libtvm_runtime.so
    libtvm_runtime.so: ELF 64-bit LSB shared object, UCB RISC-V, version 1 (GNU/Linux), dynamically linked, BuildID[sha1]=e9ak845b3d7f2c126dab53632aea8e012d89477e, not stripped
 
-    
+
 Optimize and tune models for target devices
 -------------------------------------------
 
diff --git a/docs/how_to/deploy/vitis_ai.rst b/docs/how_to/deploy/vitis_ai.rst
index 7e97ddce8a89..8185822593c7 100755
--- a/docs/how_to/deploy/vitis_ai.rst
+++ b/docs/how_to/deploy/vitis_ai.rst
@@ -28,9 +28,9 @@ full potential of AI acceleration on Xilinx FPGA and ACAP.
 
 The current Vitis AI flow inside TVM enables acceleration of Neural
 Network model inference on edge and cloud with the `Zynq Ultrascale+
-MPSoc <https://www.xilinx.com/products/silicon-devices/soc/zynq-ultrascale-mpsoc.html>`__, 
-`Alveo <https://www.xilinx.com/products/boards-and-kits/alveo.html>`__ 
-and `Versal <https://www.xilinx.com/products/silicon-devices/acap/versal.html>`__ platforms. 
+MPSoc <https://www.xilinx.com/products/silicon-devices/soc/zynq-ultrascale-mpsoc.html>`__,
+`Alveo <https://www.xilinx.com/products/boards-and-kits/alveo.html>`__
+and `Versal <https://www.xilinx.com/products/silicon-devices/acap/versal.html>`__ platforms.
 The identifiers for the supported edge and cloud Deep Learning Processor Units (DPU's) are:
 
 +-----------------------------------------------------------------------------------------+-----------------------+----------------------------+
@@ -43,7 +43,7 @@ The identifiers for the supported edge and cloud Deep Learning Processor Units (
 | `Kria KV260 <https://www.xilinx.com/products/som/kria/kv260-vision-starter-kit.html>`__ | DPUCZDX8G             | DPUCZDX8G-kv260            |
 +-----------------------------------------------------------------------------------------+-----------------------+----------------------------+
 | `VCK190 <https://www.xilinx.com/products/boards-and-kits/vck190.html>`__                | DPUCVDX8G             | DPUCVDX8G                  |
-+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ 
++-----------------------------------------------------------------------------------------+-----------------------+----------------------------+
 | `VCK5000 <https://www.xilinx.com/products/boards-and-kits/vck5000.html>`__              | DPUCVDX8H             | DPUCVDX8H                  |
 +-----------------------------------------------------------------------------------------+-----------------------+----------------------------+
 | `U200 <https://www.xilinx.com/products/boards-and-kits/alveo/u200.html>`__              | DPUCADF8H             | DPUCADF8H                  |
@@ -52,7 +52,7 @@ The identifiers for the supported edge and cloud Deep Learning Processor Units (
 +-----------------------------------------------------------------------------------------+-----------------------+----------------------------+
 | `U50 <https://www.xilinx.com/products/boards-and-kits/alveo/u50.html>`__                | DPUCAHX8H / DPUCAHX8L | DPUCAHX8H-u50 / DPUCAHX8L  |
 +-----------------------------------------------------------------------------------------+-----------------------+----------------------------+
-| `U280 <https://www.xilinx.com/products/boards-and-kits/alveo/u280.html>`__              | DPUCAHX8H / DPUCAHX8L | DPUCAHX8H-u280 / DPUCAHX8L | 
+| `U280 <https://www.xilinx.com/products/boards-and-kits/alveo/u280.html>`__              | DPUCAHX8H / DPUCAHX8L | DPUCAHX8H-u280 / DPUCAHX8L |
 +-----------------------------------------------------------------------------------------+-----------------------+----------------------------+
 
 For more information about the DPU identifiers see following table:
@@ -66,23 +66,23 @@ For more information about the DPU identifiers see following table:
 |                   |             | | ZD: Zynq DDR                 | | R: RNN               | | M: Mixed Precision   | | C: Cost optimized    |
 +-------------------+-------------+--------------------------------+------------------------+------------------------+------------------------+
 
-On this page you will find information on how to `setup <#setup-instructions>`__ TVM with Vitis AI 
-on different platforms (Zynq, Alveo, Versal) and on how to get started with `Compiling a Model <#compiling-a-model>`__ 
+On this page you will find information on how to `setup <#setup-instructions>`__ TVM with Vitis AI
+on different platforms (Zynq, Alveo, Versal) and on how to get started with `Compiling a Model <#compiling-a-model>`__
 and executing on different platforms: `Inference <#inference>`__.
 
 System Requirements
 -------------------
 
-The `Vitis AI System Requirements page <https://github.com/Xilinx/Vitis-AI/blob/master/docs/learn/system_requirements.md>`__ 
-lists the system requirements for running docker containers as well as doing executing on Alveo cards. 
-For edge devices (e.g. Zynq), deploying models requires a host machine for compiling models using the TVM with Vitis AI flow, 
+The `Vitis AI System Requirements page <https://github.com/Xilinx/Vitis-AI/blob/master/docs/learn/system_requirements.md>`__
+lists the system requirements for running docker containers as well as doing executing on Alveo cards.
+For edge devices (e.g. Zynq), deploying models requires a host machine for compiling models using the TVM with Vitis AI flow,
 and an edge device for running the compiled models. The host system requirements are the same as specified in the link above.
 
 Setup instructions
 ------------------
 
-This section provide the instructions for setting up the TVM with Vitis AI flow for both cloud and edge. 
-TVM with Vitis AI support is provided through a docker container. The provided scripts and Dockerfile 
+This section provide the instructions for setting up the TVM with Vitis AI flow for both cloud and edge.
+TVM with Vitis AI support is provided through a docker container. The provided scripts and Dockerfile
 compiles TVM and Vitis AI into a single image.
 
 1. Clone TVM repo
@@ -91,7 +91,7 @@ compiles TVM and Vitis AI into a single image.
 
       git clone --recursive https://github.com/apache/tvm.git
       cd tvm
-      
+
 2. Build and start the TVM - Vitis AI docker container.
 
    .. code:: bash
@@ -122,7 +122,7 @@ compiles TVM and Vitis AI into a single image.
       pip3 install -e . --user
 
 Inside this docker container you can now compile models for both cloud and edge targets.
-To run on cloud Alveo or Versal VCK5000 cards inside the docker container, please follow the  
+To run on cloud Alveo or Versal VCK5000 cards inside the docker container, please follow the
 `Alveo <#alveo-setup>`__ respectively  `Versal VCK5000 <#versal-vck5000-setup>`__ setup instructions.
 To setup your Zynq or Versal VCK190 evaluation board for inference, please follow
 the `Zynq <#zynq-setup>`__ respectively `Versal VCK190 <#versal-vck190-setup>`__ instructions.
@@ -135,12 +135,12 @@ Check out following page for setup information: `Alveo Setup <https://github.com
 After setup, you can select the right DPU inside the docker container in the following way:
 
 .. code:: bash
-      
+
       cd /workspace
       git clone --branch v1.4 --single-branch --recursive https://github.com/Xilinx/Vitis-AI.git
       cd Vitis-AI/setup/alveo
       source setup.sh [DPU-IDENTIFIER]
-      
+
 The DPU identifier for this can be found in the second column of the DPU Targets table at the top of this page.
 
 Versal VCK5000 Setup
@@ -151,7 +151,7 @@ Check out following page for setup information: `VCK5000 Setup <https://github.c
 After setup, you can select the right DPU inside the docker container in the following way:
 
 .. code:: bash
-      
+
       cd /workspace
       git clone --branch v1.4 --single-branch --recursive https://github.com/Xilinx/Vitis-AI.git
       cd Vitis-AI/setup/vck5000
@@ -160,8 +160,8 @@ After setup, you can select the right DPU inside the docker container in the fol
 Zynq Setup
 ~~~~~~~~~~
 
-For the Zynq target (DPUCZDX8G) the compilation stage will run inside the docker on a host machine. 
-This doesn't require any specific setup except for building the TVM - Vitis AI docker. For executing the model, 
+For the Zynq target (DPUCZDX8G) the compilation stage will run inside the docker on a host machine.
+This doesn't require any specific setup except for building the TVM - Vitis AI docker. For executing the model,
 the Zynq board will first have to be set up and more information on that can be found here.
 
 1. Download the Petalinux image for your target:
@@ -175,17 +175,17 @@ the Zynq board will first have to be set up and more information on that can be
 6. Create 4GB of swap space on the board
 
 .. code:: bash
-    
+
       fallocate -l 4G /swapfile
       chmod 600 /swapfile
       mkswap /swapfile
       swapon /swapfile
       echo "/swapfile swap swap defaults 0 0" > /etc/fstab
-      
+
 7. Install hdf5 dependency (will take between 30 min and 1 hour to finish)
-      
+
 .. code:: bash
-    
+
       cd /tmp && \
         wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.7/src/hdf5-1.10.7.tar.gz && \
         tar -zxvf hdf5-1.10.7.tar.gz && \
@@ -194,25 +194,25 @@ the Zynq board will first have to be set up and more information on that can be
         make -j$(nproc) && \
         make install && \
         cd /tmp && rm -rf hdf5-1.10.7*
-        
+
 8. Install Python dependencies
 
 .. code:: bash
-    
+
       pip3 install Cython==0.29.23 h5py==2.10.0 pillow
 
-9. Install PyXIR 
+9. Install PyXIR
 
 .. code:: bash
-    
+
       git clone --recursive --branch rel-v0.3.1 --single-branch https://github.com/Xilinx/pyxir.git
       cd pyxir
       sudo python3 setup.py install --use_vart_edge_dpu
-      
+
 10. Build and install TVM with Vitis AI
 
 .. code:: bash
-    
+
       git clone --recursive https://github.com/apache/tvm
       cd tvm
       mkdir build
@@ -230,29 +230,29 @@ the Zynq board will first have to be set up and more information on that can be
 .. code:: bash
 
       python3 -c 'import pyxir; import tvm'
-      
+
 .. note::
 
     You might see a warning about the 'cpu-tf' runtime not being found. This warning is
     expected on the board and can be ignored.
-    
-      
+
+
 Versal VCK190 Setup
 ~~~~~~~~~~~~~~~~~~~
 
 For the Versal VCK190 setup, please follow the instructions for `Zynq Setup <#zynq-setup>`__,
 but now use the `VCK190 image <https://www.xilinx.com/member/forms/download/design-license-xef.html?filename=xilinx-vck190-dpu-v2020.2-v1.4.0.img.gz>`__
 in step 1. The other steps are the same.
-    
+
 
 Compiling a Model
 -----------------
 
-The TVM with Vitis AI flow contains two stages: Compilation and Inference. 
-During the compilation a user can choose a model to compile for the cloud or 
-edge target devices that are currently supported. Once a model is compiled, 
-the generated files can be used to run the model on a the specified target 
-device during the `Inference <#inference>`__ stage. Currently, the TVM with 
+The TVM with Vitis AI flow contains two stages: Compilation and Inference.
+During the compilation a user can choose a model to compile for the cloud or
+edge target devices that are currently supported. Once a model is compiled,
+the generated files can be used to run the model on a the specified target
+device during the `Inference <#inference>`__ stage. Currently, the TVM with
 Vitis AI flow supported a selected number of Xilinx data center and edge devices.
 
 In this section we walk through the typical flow for compiling models with Vitis AI
@@ -272,16 +272,16 @@ Make sure to import PyXIR and the DPU target (``import pyxir.contrib.target.DPUC
    from tvm.contrib.target import vitis_ai
    from tvm.contrib import utils, graph_executor
    from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai
-   
+
 **Declare the Target**
 
 .. code:: python
 
    tvm_target = 'llvm'
    dpu_target = 'DPUCADF8H' # options: 'DPUCADF8H', 'DPUCAHX8H-u50', 'DPUCAHX8H-u280', 'DPUCAHX8L', 'DPUCVDX8H', 'DPUCZDX8G-zcu104', 'DPUCZDX8G-zcu102', 'DPUCZDX8G-kv260'
-   
-The TVM with Vitis AI flow currently supports the DPU targets listed in 
-the table at the top of this page. Once the appropriate targets are defined, 
+
+The TVM with Vitis AI flow currently supports the DPU targets listed in
+the table at the top of this page. Once the appropriate targets are defined,
 we invoke the TVM compiler to build the graph for the specified target.
 
 **Import the Model**
@@ -291,9 +291,9 @@ Example code to import an MXNet model:
 .. code:: python
 
    mod, params = relay.frontend.from_mxnet(block, input_shape)
-   
 
-**Partition the Model**   
+
+**Partition the Model**
 
 After importing the model, we utilize the Relay API to annotate the Relay expression for the provided DPU target and partition the graph.
 
@@ -302,7 +302,7 @@ After importing the model, we utilize the Relay API to annotate the Relay expres
     mod = partition_for_vitis_ai(mod, params, dpu=dpu_target)
 
 
-**Build the Model**   
+**Build the Model**
 
 The partitioned model is passed to the TVM compiler to generate the runtime libraries for the TVM Runtime.
 
@@ -330,7 +330,7 @@ inference will be accelerated for all next inputs. Note that the edge
 flow deviates slightly from the explained flow in that inference won't
 be accelerated after the first N inputs but the model will have been
 quantized and compiled and can be moved to the edge device for
-deployment. Please check out the `Running on Zynq <#running-on-zynq>`__ 
+deployment. Please check out the `Running on Zynq <#running-on-zynq>`__
 section below for more information.
 
 .. code:: python
@@ -344,30 +344,30 @@ section below for more information.
       module.set_input(input_name, inputs[i])
       module.run()
 
-By default, the number of images used for quantization is set to 128. 
-You could change the number of images used for On-The-Fly Quantization 
-with the PX_QUANT_SIZE environment variable. For example, execute the 
-following line in the terminal before calling the compilation script 
-to reduce the quantization calibration dataset to eight images. 
+By default, the number of images used for quantization is set to 128.
+You could change the number of images used for On-The-Fly Quantization
+with the PX_QUANT_SIZE environment variable. For example, execute the
+following line in the terminal before calling the compilation script
+to reduce the quantization calibration dataset to eight images.
 This can be used for quick testing.
 
 .. code:: bash
 
     export PX_QUANT_SIZE=8
-    
-Lastly, we store the compiled output from the TVM compiler on disk for 
-running the model on the target device. This happens as follows for 
+
+Lastly, we store the compiled output from the TVM compiler on disk for
+running the model on the target device. This happens as follows for
 cloud DPU's (Alveo, VCK5000):
 
 .. code:: python
 
    lib_path = "deploy_lib.so"
    lib.export_library(lib_path)
-   
-   
-For edge targets (Zynq, VCK190) we have to rebuild for aarch64. To do this 
-we first have to normally export the module to also serialize the Vitis AI 
-runtime module (vitis_ai.rtmod). We will load this runtime module again 
+
+
+For edge targets (Zynq, VCK190) we have to rebuild for aarch64. To do this
+we first have to normally export the module to also serialize the Vitis AI
+runtime module (vitis_ai.rtmod). We will load this runtime module again
 afterwards to rebuild and export for aarch64.
 
 .. code:: python
@@ -397,20 +397,20 @@ For instructions on how to run a compiled model please refer to the next section
 Inference
 ---------
 
-The TVM with Vitis AI flow contains two stages: Compilation and Inference. 
-During the compilation a user can choose to compile a model for any of the 
-target devices that are currently supported. Once a model is compiled, the 
-generated files can be used to run the model on a target device during the 
+The TVM with Vitis AI flow contains two stages: Compilation and Inference.
+During the compilation a user can choose to compile a model for any of the
+target devices that are currently supported. Once a model is compiled, the
+generated files can be used to run the model on a target device during the
 Inference stage.
 
 Check out the `Running on Alveo and VCK5000 <#running-on-alveo-and-vck5000>`__
-and `Running on Zynq and VCK190 <#running-on-zynq-and-vck190>`__ sections for 
+and `Running on Zynq and VCK190 <#running-on-zynq-and-vck190>`__ sections for
 doing inference on cloud accelerator cards respectively edge boards.
 
 Running on Alveo and VCK5000
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-After having followed the steps in the `Compiling a Model <#compiling-a-model>`__ 
+After having followed the steps in the `Compiling a Model <#compiling-a-model>`__
 section, you can continue running on new inputs inside the docker for accelerated
 inference:
 
@@ -418,8 +418,8 @@ inference:
 
     module.set_input(input_name, inputs[i])
     module.run()
-    
-Alternatively, you can load the exported runtime module (the deploy_lib.so 
+
+Alternatively, you can load the exported runtime module (the deploy_lib.so
 exported in  `Compiling a Model <#compiling-a-model>`__):
 
 .. code:: python
@@ -429,7 +429,7 @@ exported in  `Compiling a Model <#compiling-a-model>`__):
    from tvm.contrib import graph_executor
 
    dev = tvm.cpu()
-   
+
    # input_name = ...
    # input_data = ...
 
@@ -439,21 +439,21 @@ exported in  `Compiling a Model <#compiling-a-model>`__):
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input(input_name, input_data)
    module.run()
-    
+
 Running on Zynq and VCK190
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Before proceeding, please follow the  `Zynq <#zynq-setup>`__ or 
+Before proceeding, please follow the  `Zynq <#zynq-setup>`__ or
 `Versal VCK190 <#versal-vck190-setup>`__ setup instructions.
 
-Prior to running a model on the board, you need to compile the model for 
-your target evaluation board and transfer the compiled model on to the board. 
-Please refer to the `Compiling a Model <#compiling-a-model>`__ section for 
+Prior to running a model on the board, you need to compile the model for
+your target evaluation board and transfer the compiled model on to the board.
+Please refer to the `Compiling a Model <#compiling-a-model>`__ section for
 information on how to compile a model.
 
-Afterwards, you will have to transfer the compiled model (deploy_lib_edge.so) 
+Afterwards, you will have to transfer the compiled model (deploy_lib_edge.so)
 to the evaluation board. Then, on the board you can use the typical
-"load_module" and "module.run" APIs to execute. For this, please make sure to 
+"load_module" and "module.run" APIs to execute. For this, please make sure to
 run the script as root (execute ``su`` in terminal to log into root).
 
 .. note::
@@ -468,7 +468,7 @@ run the script as root (execute ``su`` in terminal to log into root).
    from tvm.contrib import graph_executor
 
    dev = tvm.cpu()
-   
+
    # input_name = ...
    # input_data = ...
 
@@ -478,5 +478,3 @@ run the script as root (execute ``su`` in terminal to log into root).
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input(input_name, input_data)
    module.run()
-   
-
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index b92a921d61b6..458a1570096c 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -90,7 +90,7 @@ Homebrew to ensure the dependencies are correctly installed and configured:
     brew install llvm
     brew install python@3.8
 
-If you are on macOS with an M1 Processor you may need to use conda to manage dependencies while building. Specifically you may need, `Miniforge <https://github.com/conda-forge/miniforge>`_ to ensure that the dependencies obtained using pip are compatible with M1. 
+If you are on macOS with an M1 Processor you may need to use conda to manage dependencies while building. Specifically you may need, `Miniforge <https://github.com/conda-forge/miniforge>`_ to ensure that the dependencies obtained using pip are compatible with M1.
 
 .. code:: bash
 
diff --git a/docs/reference/api/python/auto_scheduler.rst b/docs/reference/api/python/auto_scheduler.rst
index c5b8dccf1be2..8fa182307352 100644
--- a/docs/reference/api/python/auto_scheduler.rst
+++ b/docs/reference/api/python/auto_scheduler.rst
@@ -21,4 +21,3 @@ tvm.auto_scheduler
    :members:
    :imported-members:
    :autosummary:
-
diff --git a/docs/reference/api/python/topi.rst b/docs/reference/api/python/topi.rst
index c77b9eae89d4..0528844d682a 100644
--- a/docs/reference/api/python/topi.rst
+++ b/docs/reference/api/python/topi.rst
@@ -45,5 +45,3 @@ tvm.topi.sparse
    :members:
    :imported-members:
    :autosummary:
-
-
diff --git a/docs/reference/langref/relay_pattern.rst b/docs/reference/langref/relay_pattern.rst
index 4682e5aa5b33..16211b2cb125 100644
--- a/docs/reference/langref/relay_pattern.rst
+++ b/docs/reference/langref/relay_pattern.rst
@@ -89,7 +89,7 @@ Or a convolution with a specific kernel size:
         x = relay.var('x')
         y = relay.var('y')
         assert is_conv2d.match(relay.op.nn.conv2d(x, y, kernel_size=[3, 3]))
-      
+
 
 
 Matching an Optional Op
diff --git a/docs/topic/vta/.gitignore b/docs/topic/vta/.gitignore
index a07068979a60..7445cd0171c7 100644
--- a/docs/topic/vta/.gitignore
+++ b/docs/topic/vta/.gitignore
@@ -1 +1 @@
-tutorials
\ No newline at end of file
+tutorials
diff --git a/docs/topic/vta/dev/config.rst b/docs/topic/vta/dev/config.rst
index 2f98d777608e..b3ec49e769af 100644
--- a/docs/topic/vta/dev/config.rst
+++ b/docs/topic/vta/dev/config.rst
@@ -71,4 +71,3 @@ We provide additional detail below regarding each parameter:
  - ``HW_VER``: Hardware version which increments every time the VTA hardware design changes. This parameter is used to uniquely identity hardware bitstreams.
  - ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension of inner tensor computation.
  - ``LOG_BLOCK``: Equivalent to B and C in multiplication of shape (A, B) x (B, C), or typically, the input/output channel dimensions of the inner tensor computation.
-
diff --git a/docs/topic/vta/dev/hardware.rst b/docs/topic/vta/dev/hardware.rst
index baddb56b23f6..8251278994da 100644
--- a/docs/topic/vta/dev/hardware.rst
+++ b/docs/topic/vta/dev/hardware.rst
@@ -296,5 +296,3 @@ Load and Store Modules
 The load and store modules perform 2D DMA loads with a strided access pattern from DRAM to SRAM.
 In addition, the load module can insert 2D padding on the fly, which is useful when blocking 2D convolution.
 This means that VTA can tile 2D convolution inputs without paying the overhead of re-laying data out in DRAM to insert spatial padding around input and weight tiles.
-
-
diff --git a/docs/topic/vta/dev/index.rst b/docs/topic/vta/dev/index.rst
index 2b715740ed29..753af7a21721 100644
--- a/docs/topic/vta/dev/index.rst
+++ b/docs/topic/vta/dev/index.rst
@@ -28,4 +28,4 @@ This developer guide details the complete VTA-TVM hardware-software stack.
    :maxdepth: 2
 
    config
-   hardware
\ No newline at end of file
+   hardware
diff --git a/docs/topic/vta/install.rst b/docs/topic/vta/install.rst
index e4b309ea9b61..ba76df410c1f 100644
--- a/docs/topic/vta/install.rst
+++ b/docs/topic/vta/install.rst
@@ -486,4 +486,3 @@ If you would like to run the full hardware compilation for the ``de10nano`` boar
 This process might be a bit lengthy, and might take up to half an hour to complete depending on the performance of your PC. The Quartus Prime software would automatically detect the number of cores available on your PC and try to utilize all of them to perform such process.
 
 Once the compilation completes, the generated bistream can be found under ``<tvm root>/3rdparty/vta-hw/build/hardware/intel/quartus/<configuration>/export/vta.rbf``. You can also open the Quartus project file (.qpf) available at ``<tvm root>/3rdparty/vta-hw/build/hardware/intel/quartus/<configuration>/de10_nano_top.qpf`` to look around the generated reports.
-
diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py
index cc29f27ba22b..a430411fd9ee 100644
--- a/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py
+++ b/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py
@@ -23,7 +23,7 @@
 best performance. This is a tutorial on how to tune a whole neural
 network for NVIDIA GPU with the auto-scheduler.
 
-To auto-tune a neural network, we partition the network into small subgraphs and 
+To auto-tune a neural network, we partition the network into small subgraphs and
 tune them independently. Each subgraph is treated as one search task.
 A task scheduler slices the time and dynamically allocates time resources to
 these tasks. The task scheduler predicts the impact of each task on the end-to-end
diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py b/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py
index 5a321104c8e4..f8caba075de3 100644
--- a/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py
+++ b/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py
@@ -24,7 +24,7 @@
 best performance. This is a tutorial on how to tune a whole neural
 network for x86 CPU with the auto-scheduler.
 
-To auto-tune a neural network, we partition the network into small subgraphs and 
+To auto-tune a neural network, we partition the network into small subgraphs and
 tune them independently. Each subgraph is treated as one search task.
 A task scheduler slices the time and dynamically allocates time resources to
 these tasks. The task scheduler predicts the impact of each task on the end-to-end
diff --git a/gallery/how_to/work_with_microtvm/micro_aot.py b/gallery/how_to/work_with_microtvm/micro_aot.py
index 9bfe4c39a967..79a72924cc63 100644
--- a/gallery/how_to/work_with_microtvm/micro_aot.py
+++ b/gallery/how_to/work_with_microtvm/micro_aot.py
@@ -24,8 +24,8 @@
 `Alan MacDonald <https://github.com/alanmacd>`_
 
 This tutorial is showcasing microTVM host-driven AoT compilation with
-a TFLite model. AoTExecutor reduces the overhead of parsing graph at runtime 
-compared to GraphExecutor. Also, we can have better memory management using ahead 
+a TFLite model. AoTExecutor reduces the overhead of parsing graph at runtime
+compared to GraphExecutor. Also, we can have better memory management using ahead
 of time compilation. This tutorial can be executed on a x86 CPU using C runtime (CRT)
 or on Zephyr platform on a microcontroller/board supported by Zephyr.
 """
diff --git a/gallery/how_to/work_with_pytorch/using_as_torch.py b/gallery/how_to/work_with_pytorch/using_as_torch.py
index e17a29e277ea..3528e754fdce 100644
--- a/gallery/how_to/work_with_pytorch/using_as_torch.py
+++ b/gallery/how_to/work_with_pytorch/using_as_torch.py
@@ -17,7 +17,7 @@
 """
 Wrap Your TVMScript as PyTorch Module
 ======================
-**Author**: 
+**Author**:
 `Yaoda Zhou <https://github.com/juda>`_
 
 This article is a tutorial on wrapping the TVMScript code as the PyTorch module.
diff --git a/gallery/how_to/work_with_pytorch/using_optimized_torch.py b/gallery/how_to/work_with_pytorch/using_optimized_torch.py
index aa68d9e68ec6..dc6caf5d597c 100644
--- a/gallery/how_to/work_with_pytorch/using_optimized_torch.py
+++ b/gallery/how_to/work_with_pytorch/using_optimized_torch.py
@@ -17,7 +17,7 @@
 """
 Compile PyTorch Models
 ======================
-**Author**: 
+**Author**:
 `Yaoda Zhou <https://github.com/juda>`_
 
 This article is a tutorial to optimize PyTorch models by using decorator `optimize_torch`.
diff --git a/gallery/tutorial/tvmc_python.py b/gallery/tutorial/tvmc_python.py
index 9658036a2cc6..0cd4f8ed9b9a 100644
--- a/gallery/tutorial/tvmc_python.py
+++ b/gallery/tutorial/tvmc_python.py
@@ -20,7 +20,7 @@
 **Author**:
 `Jocelyn Shiue <https://github.com/CircleSpin>`_
 
-Hi! Here we explain the scripting tool designed for the complete TVM beginner. 🙂                                                                                                      
+Hi! Here we explain the scripting tool designed for the complete TVM beginner. 🙂
 
 Before we get started let's get an example model if you don't already have one.
 Follow the steps to download a resnet model via the terminal:
diff --git a/jvm/conf/log4j.properties b/jvm/conf/log4j.properties
index ef523cb7bc4f..5d039e7a61b6 100644
--- a/jvm/conf/log4j.properties
+++ b/jvm/conf/log4j.properties
@@ -5,9 +5,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/jvm/core/src/main/java/org/apache/tvm/Base.java b/jvm/core/src/main/java/org/apache/tvm/Base.java
index e6fef4153284..f5e677a2e0b3 100644
--- a/jvm/core/src/main/java/org/apache/tvm/Base.java
+++ b/jvm/core/src/main/java/org/apache/tvm/Base.java
@@ -189,4 +189,3 @@ public TVMError(String err) {
   private Base() {
   }
 }
-
diff --git a/jvm/core/src/main/java/org/apache/tvm/rpc/ConnectProxyServerProcessor.java b/jvm/core/src/main/java/org/apache/tvm/rpc/ConnectProxyServerProcessor.java
index 2a1b3e81c28d..6a9fa010f024 100644
--- a/jvm/core/src/main/java/org/apache/tvm/rpc/ConnectProxyServerProcessor.java
+++ b/jvm/core/src/main/java/org/apache/tvm/rpc/ConnectProxyServerProcessor.java
@@ -45,8 +45,8 @@ public ConnectProxyServerProcessor(String host, int port, String key) {
     this.port = port;
     this.key = "server:" + key;
   }
-  
-  /** 
+
+  /**
    * Set a callback when a connection is received e.g., to record the time for a
    * watchdog.
    * @param callback Runnable object.
diff --git a/jvm/core/src/test/java/org/apache/tvm/TestUtils.java b/jvm/core/src/test/java/org/apache/tvm/TestUtils.java
index 3431bf6c5c5c..9b5c42862925 100644
--- a/jvm/core/src/test/java/org/apache/tvm/TestUtils.java
+++ b/jvm/core/src/test/java/org/apache/tvm/TestUtils.java
@@ -8,9 +8,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
diff --git a/licenses/LICENSE.builtin_fp16.txt b/licenses/LICENSE.builtin_fp16.txt
index 508c7f7ba7cd..5a79a1b9d5cb 100644
--- a/licenses/LICENSE.builtin_fp16.txt
+++ b/licenses/LICENSE.builtin_fp16.txt
@@ -308,4 +308,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
\ No newline at end of file
+THE SOFTWARE.
diff --git a/licenses/LICENSE.cma.txt b/licenses/LICENSE.cma.txt
index 4205858e98ca..00028209d171 100644
--- a/licenses/LICENSE.cma.txt
+++ b/licenses/LICENSE.cma.txt
@@ -19,4 +19,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
\ No newline at end of file
+THE SOFTWARE.
diff --git a/licenses/LICENSE.concurrentqueue.txt b/licenses/LICENSE.concurrentqueue.txt
index 4cd754581b49..b36f9eadc9f9 100644
--- a/licenses/LICENSE.concurrentqueue.txt
+++ b/licenses/LICENSE.concurrentqueue.txt
@@ -19,4 +19,4 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PRO
 OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/licenses/LICENSE.libbacktrace.txt b/licenses/LICENSE.libbacktrace.txt
index 097d2774e5df..e9e256244d69 100644
--- a/licenses/LICENSE.libbacktrace.txt
+++ b/licenses/LICENSE.libbacktrace.txt
@@ -5,13 +5,13 @@
 # met:
 
 #     (1) Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer. 
+#     notice, this list of conditions and the following disclaimer.
 
 #     (2) Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in
 #     the documentation and/or other materials provided with the
-#     distribution.  
-    
+#     distribution.
+
 #     (3) The name of the author may not be used to
 #     endorse or promote products derived from this software without
 #     specific prior written permission.
diff --git a/licenses/LICENSE.picojson.txt b/licenses/LICENSE.picojson.txt
index 5373d53cc8c4..72f355391110 100644
--- a/licenses/LICENSE.picojson.txt
+++ b/licenses/LICENSE.picojson.txt
@@ -22,4 +22,4 @@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/mypy.ini b/mypy.ini
index 02564a85469e..4a429b7dadad 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -31,4 +31,3 @@ ignore_errors = True
 
 [mypy-python.tvm.tir.schedule.*]
 ignore_errors = False
-
diff --git a/python/tvm/meta_schedule/testing/torchbench/run.py b/python/tvm/meta_schedule/testing/torchbench/run.py
index fe939b2c9ba9..20c633196900 100644
--- a/python/tvm/meta_schedule/testing/torchbench/run.py
+++ b/python/tvm/meta_schedule/testing/torchbench/run.py
@@ -197,7 +197,7 @@ def parse_args():
         type=str,
         required=True,
         help="""
-        The name of model to run. It should a directory name under 
+        The name of model to run. It should a directory name under
         https://github.com/pytorch/benchmark/tree/main/torchbenchmark/models.
         """,
     )
@@ -228,7 +228,7 @@ def parse_args():
         type=int,
         default=None,
         help="""
-        The max number of trials to run per task extracted in MetaSchedule. 
+        The max number of trials to run per task extracted in MetaSchedule.
         By default it's the same as --num-trials.
         """,
     )
diff --git a/python/tvm/micro/contrib/stm32/emitter.py b/python/tvm/micro/contrib/stm32/emitter.py
index 814f98f1b788..af0eb53ad325 100644
--- a/python/tvm/micro/contrib/stm32/emitter.py
+++ b/python/tvm/micro/contrib/stm32/emitter.py
@@ -569,12 +569,12 @@ def _emit_params_data(self, name, out_h, out_c):
                 f"""\
         #ifndef __{name_upper}_DATA_H_
         #define __{name_upper}_DATA_H_
-        
+
         #include \"ai_runtime_api.h\"
 
         AI_API_ENTRY
         const ai_ptr ai_{name}_data_weights_get (void);
-        
+
         #endif /* __{name_upper}_DATA_H_ */
         """
             )
@@ -658,7 +658,7 @@ def _emit_open(self, name, out_h, out_c):
                 f"""\
         #ifndef __AI_{name_upper}_H__
         #define __AI_{name_upper}_H__
-        
+
         #include \"ai_runtime_api.h\"
 
         #define _{name_upper}_INPUTS_COUNT_ ({input_size})
@@ -674,7 +674,7 @@ def _emit_open(self, name, out_h, out_c):
             textwrap.dedent(
                 f"""\
         #include <stdio.h>
-        
+
         #include \"dlpack/dlpack.h\"
         #include \"tvm/runtime/c_runtime_api.h\"
         #include \"{name}.h\"
diff --git a/rust/.rustfmt.toml b/rust/.rustfmt.toml
index 5a1f1d27514f..3c51bb384c68 100644
--- a/rust/.rustfmt.toml
+++ b/rust/.rustfmt.toml
@@ -29,4 +29,3 @@ merge_derives = true
 use_try_shorthand = false
 use_field_init_shorthand = false
 force_explicit_abi = true
-
diff --git a/src/relay/collage/README.md b/src/relay/collage/README.md
index dc56496092cc..945a775e383d 100644
--- a/src/relay/collage/README.md
+++ b/src/relay/collage/README.md
@@ -20,7 +20,7 @@ The `CollagePartition` pass for finding optimal partitionings of Relay models.
 See the [RFC](https://github.com/mbs-octoml/mbs-tvm-rfcs/blob/mbs-rfcs-collage/rfcs/xxxx-collage.md).
 
 Based on:
-> *Collage: Automated Integration of Deep Learning Backends*  
+> *Collage: Automated Integration of Deep Learning Backends*
 > Byungsoo Jeon, Sunghyun Park, Peiyuan Liao, Sheng Xu, Tianqi Chen, Zhihao Jia
 
 CAUTION: This is a prototype, do not use in prod.
diff --git a/src/runtime/crt/host/Makefile b/src/runtime/crt/host/Makefile
index f5f9ef8a2af2..d9e87c7d6a41 100644
--- a/src/runtime/crt/host/Makefile
+++ b/src/runtime/crt/host/Makefile
@@ -29,7 +29,7 @@ CXX ?= ${PREFIX}g++
 RANLIB ?= ${PREFIX}ranlib
 
 ifeq (${VERBOSE}, 1)
-QUIET ?= 
+QUIET ?=
 else
 QUIET ?= @
 endif
diff --git a/src/runtime/hexagon/README.md b/src/runtime/hexagon/README.md
index fed1d33e4245..7e19105a9ce8 100644
--- a/src/runtime/hexagon/README.md
+++ b/src/runtime/hexagon/README.md
@@ -68,4 +68,3 @@ USE_HEXAGON_SDK=/path/to/sdk
 ```
 
 As mentioned before, only build the `runtime` component (e.g. `make runtime`).
-
diff --git a/tests/crt/contrib/stm32/Makefile b/tests/crt/contrib/stm32/Makefile
index 293e0e7c7793..b2515b854d4f 100644
--- a/tests/crt/contrib/stm32/Makefile
+++ b/tests/crt/contrib/stm32/Makefile
@@ -50,7 +50,7 @@ C_SOURCES += $(STM32_RUNTIME_PATH)/ai_runtime_api.c
 #
 # Application sources
 #
-C_SOURCES += $(SRC_PATH)/main.c 
+C_SOURCES += $(SRC_PATH)/main.c
 
 vpath %.c $(sort $(dir $(C_SOURCES)))
 
diff --git a/tests/lint/docker-format.sh b/tests/lint/docker-format.sh
index 4ce804c83e62..6ee7cf98a661 100755
--- a/tests/lint/docker-format.sh
+++ b/tests/lint/docker-format.sh
@@ -31,4 +31,4 @@ if git grep "apt-get install" -- ':(exclude)docker/utils/apt-install-and-clear.s
   exit 1
 fi
 
-exit 0
\ No newline at end of file
+exit 0
diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh
index 94fae289b6b9..6b5415987985 100755
--- a/tests/lint/pylint.sh
+++ b/tests/lint/pylint.sh
@@ -53,4 +53,3 @@ python3 -m pylint tests/python/frontend/oneflow/*.py --rcfile="$(dirname "$0")"/
 python3 -m pylint tests/python/frontend/tensorflow/test_forward.py --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/frontend/pytorch/test_forward.py --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/frontend/tflite/test_forward.py --rcfile="$(dirname "$0")"/pylintrc
-
diff --git a/tests/lint/trailing_newlines.py b/tests/lint/trailing_newlines.py
new file mode 100755
index 000000000000..2ace6a138749
--- /dev/null
+++ b/tests/lint/trailing_newlines.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import fileinput
+import os
+
+
+def has_one_trailing_newline(filename: str) -> bool:
+    """
+    Returns True if 'filename' has a single trailing newline
+    """
+    with open(filename, "rb") as f:
+        start_bytes = len(f.read(2))
+        if start_bytes == 0:
+            # empty file
+            return True
+        elif start_bytes == 1:
+            # 1 byte file
+            return False
+        else:
+            # skip to the end
+            f.seek(-2, os.SEEK_END)
+            end_bytes = f.read(2)
+
+            # should be a non-newline followed by a newline
+            return end_bytes[0] != ord("\n") and end_bytes[1] == ord("\n")
+
+
+if __name__ == "__main__":
+    exit_code = 1
+    for line in fileinput.input():
+        filename = line.rstrip()
+        if not has_one_trailing_newline(filename):
+            exit_code = 0
+            print(filename)
+    exit(exit_code)
diff --git a/tests/lint/whitespace.sh b/tests/lint/whitespace.sh
new file mode 100755
index 000000000000..78375ed58bdb
--- /dev/null
+++ b/tests/lint/whitespace.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+status=0
+
+if git --no-pager grep -Il '' -- . | ./tests/lint/trailing_newlines.py; then
+    echo "The above files are missing a trailing newline or have too many trailing newlines"
+    status=1
+fi
+
+if git --no-pager grep -In '[[:blank:]]$' -- .; then
+    echo "The above files have trailing spaces"
+    status=1
+fi
+
+if [ $status == "1" ]; then
+    echo "Found whitespace lint failures, 'pre-commit run --all-files' can auto-correct them"
+    exit 1
+else
+    echo "Found no whitespace lint failures"
+    exit 0
+fi
diff --git a/tests/python/ci/sample_prs/pr10786.json b/tests/python/ci/sample_prs/pr10786.json
index 79f20ca6094b..688025d45cfe 100644
--- a/tests/python/ci/sample_prs/pr10786.json
+++ b/tests/python/ci/sample_prs/pr10786.json
@@ -126,4 +126,4 @@
         }
       ]
     }
-  }
\ No newline at end of file
+  }
diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py
index 315c2367c82a..b2bbcd377b84 100644
--- a/tests/python/contrib/test_ethosu/infra.py
+++ b/tests/python/contrib/test_ethosu/infra.py
@@ -148,7 +148,7 @@ def create_test_runner(
     __attribute__((section(".bss.noinit.tvm"), aligned(16)))
     static uint8_t {pool.pool_name}[{_get_workspace_size_define_macro(pool.pool_name)}];
     #endif
-    
+
             """
             )
 
diff --git a/tests/python/contrib/test_ethosu/test_create_tiles.py b/tests/python/contrib/test_ethosu/test_create_tiles.py
index ffb828d9108a..77b69df91116 100644
--- a/tests/python/contrib/test_ethosu/test_create_tiles.py
+++ b/tests/python/contrib/test_ethosu/test_create_tiles.py
@@ -53,8 +53,8 @@ def main(placeholder1: T.Buffer[(100,), "int8"], placeholder2: T.Buffer[(100,),
             for i0 in T.serial(0, 1):
                 for i1 in T.serial(0, 6):
                     for i2 in T.serial(0, 1):
-                        for i3 in T.serial(0, 1):   
-                            for i4 in T.serial(0, 16):   
+                        for i3 in T.serial(0, 1):
+                            for i4 in T.serial(0, 16):
                                 placeholder1[((i1*16) + i4)] = placeholder2[((T.floormod((i1 + 4), 6)*16) + i4)]
 
         __tvm_meta__ = None
@@ -84,8 +84,8 @@ def main(placeholder1: T.Buffer[(100,), "int8"], placeholder2: T.Buffer[(100,),
             for i0 in T.serial(0, 1):
                 for i1 in T.serial(0, 1):
                     for i2 in T.serial(0, 1):
-                        for i3 in T.serial(0, 6):   
-                            for i4 in T.serial(0, 16):   
+                        for i3 in T.serial(0, 6):
+                            for i4 in T.serial(0, 16):
                                 placeholder1[((i3*16) + i4)] = placeholder2[((T.floormod((i3 + 4), 6)*16) + i4)]
 
         __tvm_meta__ = None
@@ -115,8 +115,8 @@ def main(placeholder1: T.Buffer[(100,), "int8"], placeholder2: T.Buffer[(100,),
             for i0 in T.serial(0, 1):
                 for i1 in T.serial(0, 6):
                     for i2 in T.serial(0, 1):
-                        for i3 in T.serial(0, 1):   
-                            for i4 in T.serial(0, 16):   
+                        for i3 in T.serial(0, 1):
+                            for i4 in T.serial(0, 16):
                                 placeholder1[((i1*16) + i4)] = placeholder2[((T.floormod((i1 + 4), 6)*8) + i4)]
 
         __tvm_meta__ = None
@@ -146,7 +146,7 @@ def main(placeholder1: T.Buffer[(100,), "int8"], placeholder2: T.Buffer[(100,),
             for i0 in T.serial(0, 1):
                 for i1 in T.serial(0, 5):
                     for i2 in T.serial(0, 6):
-                        for i3 in T.serial(0, 4):   
+                        for i3 in T.serial(0, 4):
                             placeholder1[(((i1*24) + (i2*4)) + i3)] = placeholder2[(((((T.floordiv((i1 - 1), 2)*48) + (T.floormod((i1 + 1), 2)*24)) + (i2*4)) + i3) + 96)]
 
         __tvm_meta__ = None
diff --git a/tests/python/contrib/test_ethosu/test_layout_optimizer.py b/tests/python/contrib/test_ethosu/test_layout_optimizer.py
index eec963af7f35..4e134c9f4df0 100644
--- a/tests/python/contrib/test_ethosu/test_layout_optimizer.py
+++ b/tests/python/contrib/test_ethosu/test_layout_optimizer.py
@@ -381,7 +381,7 @@ def get_graph():
 def test_multiple_outputs():
     """Test the layout optimization pass works as expected when there
     are multiple outputs in the graph.
-    
+
           pool_1
        /    |   \
   pool_2 pool_3 pool_4
diff --git a/tests/python/contrib/test_ethosu/test_lut_optimizer.py b/tests/python/contrib/test_ethosu/test_lut_optimizer.py
index 87e625741b6c..12b6ed70d8ed 100644
--- a/tests/python/contrib/test_ethosu/test_lut_optimizer.py
+++ b/tests/python/contrib/test_ethosu/test_lut_optimizer.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Test the pass that removes unnecssary identity operation if the identity 
+"""Test the pass that removes unnecssary identity operation if the identity
 uses LUT and the preceding operator is LUT capable and doesn't already have a LUT.
 """
 import pytest
diff --git a/tests/python/contrib/test_hexagon/README.md b/tests/python/contrib/test_hexagon/README.md
index a2b108f7a4ed..665ff9aa73ce 100644
--- a/tests/python/contrib/test_hexagon/README.md
+++ b/tests/python/contrib/test_hexagon/README.md
@@ -90,7 +90,7 @@ make -j2
 
 # Build Hexagon API
 cd ..
-./tests/scripts/task_build_hexagon_api.sh 
+./tests/scripts/task_build_hexagon_api.sh
 ```
 
 Now that you have built required tools, you can jump to [run test examples](#run-tests).
diff --git a/tests/python/contrib/test_hexagon/conv2d/README.md b/tests/python/contrib/test_hexagon/conv2d/README.md
index 674e1af6029f..d29d8b9c8604 100644
--- a/tests/python/contrib/test_hexagon/conv2d/README.md
+++ b/tests/python/contrib/test_hexagon/conv2d/README.md
@@ -34,4 +34,4 @@ High Level Notes:
 
 [Conv2d](test_conv2d_blocked.md)
 
-[Conv2d -> Conv2d](test_conv2d_conv2d.md)
\ No newline at end of file
+[Conv2d -> Conv2d](test_conv2d_conv2d.md)
diff --git a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.md b/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.md
index 6250f99efc9d..417ce0b12310 100644
--- a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.md
+++ b/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.md
@@ -44,7 +44,7 @@ pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2d
 ## To Do
 
 * n/a
-  
+
 ## Annotated TIR
 
 ```
@@ -68,7 +68,7 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
           for (hi: int32, 0, 8) {
             for (wi: int32, 0, 8) {
               for (ci: int32, 0, 32) {
-                input.cache[(((((wo*4096) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = 
+                input.cache[(((((wo*4096) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] =
                   (float32*)input_pointer[((((((ho.outer*32768) + (hi*4096)) + (wo*512)) + (wi*64)) + (co*32)) + ci)]
               }
             }
@@ -81,7 +81,7 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
         for (ci8: int32, 0, 8) {
           for (ki: int32, 0, 32) {
             for (ci4: int32, 0, 4) {
-              filter.cache[((((co*1024) + (ci8*128)) + (ki*4)) + ci4)] = 
+              filter.cache[((((co*1024) + (ci8*128)) + (ki*4)) + ci4)] =
                 (float32*)filter_pointer[(((((ko.outer*2048) + (co*1024)) + (ci8*128)) + (ki*4)) + ci4)]
             }
           }
@@ -106,9 +106,9 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
             for (wi.c: int32, 0, 8) {
               for (ki.c: int32, 0, 32) {
                 for (rc.inner: int32, 0, 32) {
-                  output.cache[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] = 
+                  output.cache[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] =
                   (
-                    (float32*)output.cache[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] + 
+                    (float32*)output.cache[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] +
                     (
                       (float32*)input.cache[(((((wo.c*4096) + (rc.outer*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner)] *
                       (float32*)filter.cache[((((rc.outer*1024) + (floordiv(rc.inner, 4)*128)) + (ki.c*4)) + floormod(rc.inner, 4))]
@@ -126,7 +126,7 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
         for (hi: int32, 0, 8) {
           for (wi: int32, 0, 8) {
             for (ki: int32, 0, 32) {
-              output_pointer[((((((ho.outer*65536) + (wo*8192)) + (ko.outer*2048)) + (hi*256)) + (wi*32)) + ki)] = 
+              output_pointer[((((((ho.outer*65536) + (wo*8192)) + (ko.outer*2048)) + (hi*256)) + (wi*32)) + ki)] =
                 (float32*)output.cache[((((wo*2048) + (hi*256)) + (wi*32)) + ki)]
             }
           }
@@ -215,7 +215,7 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
 
   // output cache grows by factor of h_split * k_split = 4
   allocate(output.cache: Pointer(global float32), float32, [65536]), storage_scope = global;
-  
+
   // ko.outer = outer loop split on ko using k_split factor
   for (ko.outer: int32, 0, 2) {
     // ho.outer = outer loop split on ho using h_split factor
@@ -229,7 +229,7 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
             for (hi: int32, 0, 8) {
               for (wi: int32, 0, 8) {
                 for (ci: int32, 0, 32) {
-                  input.cache[((((((ho.inner*32768) + (wo*4096)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = 
+                  input.cache[((((((ho.inner*32768) + (wo*4096)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] =
                     (float32*)input_pointer[(((((((ho.outer*65536) + (ho.inner*32768)) + (hi*4096)) + (wo*512)) + (wi*64)) + (co*32)) + ci)]
                 }
               }
@@ -244,7 +244,7 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
           for (ci8: int32, 0, 8) {
             for (ki: int32, 0, 32) {
               for (ci4: int32, 0, 4) {
-                filter.cache[(((((ko.inner*2048) + (co*1024)) + (ci8*128)) + (ki*4)) + ci4)] = 
+                filter.cache[(((((ko.inner*2048) + (co*1024)) + (ci8*128)) + (ki*4)) + ci4)] =
                   (float32*)filter_pointer[((((((ko.outer*4096) + (ko.inner*2048)) + (co*1024)) + (ci8*128)) + (ki*4)) + ci4)]
               }
             }
@@ -272,9 +272,9 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
                 for (wi.c: int32, 0, 8) {
                   for (ki.c: int32, 0, 32) {
                     for (rc.inner: int32, 0, 32) {
-                      output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] = 
+                      output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] =
                       (
-                        (float32*)output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] + 
+                        (float32*)output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] +
                         (
                           (float32*)input.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (rc.outer*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner)] *
                           (float32*)filter.cache[(((((ko.c.inner*2048) + (rc.outer*1024)) + (floordiv(rc.inner, 4)*128)) + (ki.c*4)) + floormod(rc.inner, 4))]
@@ -296,7 +296,7 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
             for (hi: int32, 0, 8) {
               for (wi: int32, 0, 8) {
                 for (ki: int32, 0, 32) {
-                  output_pointer[((((((((ho.outer*131072) + (ho.inner*65536)) + (wo*8192)) + (ko.outer*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] = 
+                  output_pointer[((((((((ho.outer*131072) + (ho.inner*65536)) + (wo*8192)) + (ko.outer*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] =
                     (float32*)output.cache[((((((ho.inner*32768) + (wo*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)]
                 }
               }
@@ -414,7 +414,7 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
               for (hi: int32, 0, 8) {
                 for (wi: int32, 0, 8) {
                   for (ci: int32, 0, 32) {
-                    input.cache[((((((ho.inner*32768) + (wo*4096)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = 
+                    input.cache[((((((ho.inner*32768) + (wo*4096)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] =
                       (float32*)input_pointer[(((((((ho.outer*65536) + (ho.inner*32768)) + (hi*4096)) + (wo*512)) + (wi*64)) + (co*32)) + ci)]
                   }
                 }
@@ -431,7 +431,7 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
               for (ci8: int32, 0, 8) {
                 for (ki: int32, 0, 32) {
                   for (ci4: int32, 0, 4) {
-                    filter.cache[(((((((ko.inner*18432) + (co*9216)) + (rh*3072)) + (rw*1024)) + (ci8*128)) + (ki*4)) + ci4)] = 
+                    filter.cache[(((((((ko.inner*18432) + (co*9216)) + (rh*3072)) + (rw*1024)) + (ci8*128)) + (ki*4)) + ci4)] =
                       (float32*)filter_pointer[((((((((ko.outer*36864) + (ko.inner*18432)) + (co*9216)) + (rh*3072)) + (rw*1024)) + (ci8*128)) + (ki*4)) + ci4)]
                   }
                 }
@@ -457,9 +457,9 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
                     for (rw: int32, 0, 3) {
                       for (ki.c: int32, 0, 32) {
                         for (rc.inner: int32, 0, 32) {
-                          output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] = 
+                          output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] =
                           (
-                            (float32*)output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] + 
+                            (float32*)output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] +
                             (
                               (float32*)input.cache[((((((((floordiv((hi.c + rh), 8)*32768) + (ho.c.inner*32768)) + (floordiv((wi.c + rw), 8)*4096)) + (wo.c*4096)) + (rc.outer*2048)) + (floormod((hi.c + rh), 8)*256)) + (floormod((wi.c + rw), 8)*32)) + rc.inner)] *
                               (float32*)filter.cache[(((((((ko.c.inner*18432) + (rc.outer*9216)) + (rh*3072)) + (rw*1024)) + (floordiv(rc.inner, 4)*128)) + (ki.c*4)) + floormod(rc.inner, 4))]
@@ -481,14 +481,14 @@ primfn(input_handle: handle, filter_handle: handle, output_handle: handle) -> ()
             for (hi: int32, 0, 8) {
               for (wi: int32, 0, 8) {
                 for (ki: int32, 0, 32) {
-                  output_pointer[((((((((ho.outer*131072) + (ho.inner*65536)) + (wo*8192)) + (ko.outer*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] = 
+                  output_pointer[((((((((ho.outer*131072) + (ho.inner*65536)) + (wo*8192)) + (ko.outer*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] =
                     (float32*)output.cache[((((((ho.inner*32768) + (wo*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)]
                 }
               }
             }
           }
-        } // end ho.inner 
+        } // end ho.inner
       } // end ko.inner
     } // end ho.outer
   } // end ko.outer
-}```
\ No newline at end of file
+}```
diff --git a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.md b/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.md
index f7e3061c2230..61c1241c6792 100644
--- a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.md
+++ b/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.md
@@ -179,7 +179,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
           for (hi: int32, 0, 8) {
             for (wi: int32, 0, 8) {
               for (ci: int32, 0, 32) {
-                packed_input.global[(((((wo*8192) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = 
+                packed_input.global[(((((wo*8192) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] =
                   (float32*)placeholder_8[((((((ho.outer*65536) + (hi*8192)) + (wo*1024)) + (wi*128)) + (co*32)) + ci)]
               }
             }
@@ -195,7 +195,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
           for (cio: int32, 0, 8) {
             for (ki: int32, 0, 32) {
               for (cii: int32, 0, 4) {
-                packed_filter.global[((((co*1024) + (cio*128)) + (ki*4)) + cii)] = 
+                packed_filter.global[((((co*1024) + (cio*128)) + (ki*4)) + cii)] =
                   (float32*)placeholder_7[(((((ko.outer_1*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)]
               }
             }
@@ -220,9 +220,9 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
               for (wi: int32, 0, 8) {
                 for (ki: int32, 0, 32) {
                   for (rc.inner: int32, 0, 32) {
-                    temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi*256)) + (wi*32)) + ki)] = 
+                    temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi*256)) + (wi*32)) + ki)] =
                     (
-                      (float32*)temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi*256)) + (wi*32)) + ki)] + 
+                      (float32*)temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi*256)) + (wi*32)) + ki)] +
                       (
                         (float32*)packed_input.global[(((((wo*8192) + (rc.outer*2048)) + (hi*256)) + (wi*32)) + rc.inner)] *
                         (float32*)packed_filter.global[((((rc.outer*1024) + (floordiv(rc.inner, 4)*128)) + (ki*4)) + floormod(rc.inner, 4))]
@@ -242,7 +242,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
         for (cio: int32, 0, 8) {
           for (ki: int32, 0, 32) {
             for (cii: int32, 0, 4) {
-              packed_filter.global[((((co*1024) + (cio*128)) + (ki*4)) + cii)] = 
+              packed_filter.global[((((co*1024) + (cio*128)) + (ki*4)) + cii)] =
                 (float32*)placeholder_6[(((((ko.outer*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)]
             }
           }
@@ -268,9 +268,9 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
             for (wi.c: int32, 0, 8) {
               for (ki.c: int32, 0, 32) {
                 for (rc.inner_1: int32, 0, 32) {
-                  packed_input.global[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] = 
+                  packed_input.global[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] =
                   (
-                    (float32*)packed_input.global[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] + 
+                    (float32*)packed_input.global[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] +
                     (
                       (float32*)temp_output[(((((wo.c*8192) + (rc.outer_1*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner_1)] *
                       (float32*)packed_filter.global[((((rc.outer_1*1024) + (floordiv(rc.inner_1, 4)*128)) + (ki.c*4)) + floormod(rc.inner_1, 4))]
@@ -288,7 +288,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
         for (hi_1: int32, 0, 8) {
           for (wi_1: int32, 0, 8) {
             for (ki_1: int32, 0, 32) {
-              output_2[((((((ho.outer*65536) + (wo_1*8192)) + (ko.outer*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] = 
+              output_2[((((((ho.outer*65536) + (wo_1*8192)) + (ko.outer*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] =
                 (float32*)packed_input.global[((((wo_1*2048) + (hi_1*256)) + (wi_1*32)) + ki_1)]
             }
           }
@@ -477,7 +477,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
             for (hi: int32, 0, 8) {
               for (wi: int32, 0, 8) {
                 for (ci: int32, 0, 32) {
-                  packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = 
+                  packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] =
                     (float32*)placeholder_8[(((((((ho.outer*131072) + (ho.inner*65536)) + (hi*8192)) + (wo*1024)) + (wi*128)) + (co*32)) + ci)]
                 }
               }
@@ -495,7 +495,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
             for (cio: int32, 0, 8) {
               for (ki: int32, 0, 32) {
                 for (cii: int32, 0, 4) {
-                  packed_filter.global[(((((ko.inner*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] = 
+                  packed_filter.global[(((((ko.inner*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] =
                     (float32*)placeholder_7[((((((ko.outer_1*8192) + (ko.inner*4096)) + (co*1024)) + (cio*128)) + (ki*4)) + cii)]
                 }
               }
@@ -523,9 +523,9 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
                   for (wi: int32, 0, 8) {
                     for (ki: int32, 0, 32) {
                       for (rc.inner: int32, 0, 32) {
-                        temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] = 
+                        temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] =
                         (
-                          (float32*)temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] + 
+                          (float32*)temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] +
                           (
                             (float32*)packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (rc.outer*2048)) + (hi*256)) + (wi*32)) + rc.inner)] *
                             (float32*)packed_filter.global[(((((ko.inner*4096) + (rc.outer*1024)) + (floordiv(rc.inner, 4)*128)) + (ki*4)) + floormod(rc.inner, 4))]
@@ -548,7 +548,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
           for (cio: int32, 0, 8) {
             for (ki: int32, 0, 32) {
               for (cii: int32, 0, 4) {
-                packed_filter.global[(((((ko.inner*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] = 
+                packed_filter.global[(((((ko.inner*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] =
                   (float32*)placeholder_6[((((((ko.outer*8192) + (ko.inner*4096)) + (co*1024)) + (cio*128)) + (ki*4)) + cii)]
               }
             }
@@ -577,9 +577,9 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
                 for (wi.c: int32, 0, 8) {
                   for (ki.c: int32, 0, 32) {
                     for (rc.inner_1: int32, 0, 32) {
-                      packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] = 
+                      packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] =
                       (
-                        (float32*)packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] + 
+                        (float32*)packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] +
                         (
                           (float32*)temp_output[((((((ho.c.inner*65536) + (wo.c*8192)) + (rc.outer_1*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner_1)] *
                           (float32*)packed_filter.global[(((((ko.c.inner*4096) + (rc.outer_1*1024)) + (floordiv(rc.inner_1, 4)*128)) + (ki.c*4)) + floormod(rc.inner_1, 4))]
@@ -601,7 +601,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
             for (hi_1: int32, 0, 8) {
               for (wi_1: int32, 0, 8) {
                 for (ki_1: int32, 0, 32) {
-                  output_2[((((((((ho.outer*131072) + (ho.inner_1*65536)) + (wo_1*8192)) + (ko.outer*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] = 
+                  output_2[((((((((ho.outer*131072) + (ho.inner_1*65536)) + (wo_1*8192)) + (ko.outer*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] =
                     (float32*)packed_input.global[((((((ho.inner_1*32768) + (wo_1*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)]
                 }
               }
@@ -827,7 +827,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
                   for (hi: int32, 0, 8) {
                     for (wi: int32, 0, 8) {
                       for (ci: int32, 0, 32) {
-                        packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = 
+                        packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] =
                           (float32*)placeholder_8[((((((((ho.outer_1*131072) + (ho.outer*131072)) + (ho.inner*65536)) + (hi*8192)) + (wo*1024)) + (wi*128)) + (co*32)) + ci)]
                       }
                     }
@@ -845,7 +845,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
                   for (cio: int32, 0, 8) {
                     for (ki: int32, 0, 32) {
                       for (cii: int32, 0, 4) {
-                        packed_filter.global[(((((((ko.inner*36864) + (co*9216)) + (rh*3072)) + (rw*1024)) + (cio*128)) + (ki*4)) + cii)] = 
+                        packed_filter.global[(((((((ko.inner*36864) + (co*9216)) + (rh*3072)) + (rw*1024)) + (cio*128)) + (ki*4)) + cii)] =
                           (float32*)placeholder_7[((((((((ko.outer_1*73728) + (ko.inner*36864)) + (co*9216)) + (rh*3072)) + (rw*1024)) + (cio*128)) + (ki*4)) + cii)]
                       }
                     }
@@ -881,9 +881,9 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
                             for (rw: int32, 0, 3) {
                               for (ki: int32, 0, 32) {
                                 for (rc.inner: int32, 0, 32) {
-                                  temp_output[((((((((ho.outer_1*131072) + (ho.inner*65536)) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] = 
+                                  temp_output[((((((((ho.outer_1*131072) + (ho.inner*65536)) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] =
                                   (
-                                    (float32*)temp_output[((((((((ho.outer_1*131072) + (ho.inner*65536)) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] + 
+                                    (float32*)temp_output[((((((((ho.outer_1*131072) + (ho.inner*65536)) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] +
                                     (
                                       (float32*)packed_input.global[((((((((floordiv((hi + rh), 8)*65536) + (ho.inner*65536)) + (floordiv((wi + rw), 8)*8192)) + (wo*8192)) + (rc.outer*2048)) + (floormod((hi + rh), 8)*256)) + (floormod((wi + rw), 8)*32)) + rc.inner)] *
                                       (float32*)packed_filter.global[(((((((ko.inner*36864) + (rc.outer*9216)) + (rh*3072)) + (rw*1024)) + (floordiv(rc.inner, 4)*128)) + (ki*4)) + floormod(rc.inner, 4))]
@@ -913,7 +913,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
               for (cio: int32, 0, 8) {
                 for (ki: int32, 0, 32) {
                   for (cii: int32, 0, 4) {
-                    packed_filter.global[(((((((ko.inner*36864) + (co*9216)) + (rh*3072)) + (rw*1024)) + (cio*128)) + (ki*4)) + cii)] = 
+                    packed_filter.global[(((((((ko.inner*36864) + (co*9216)) + (rh*3072)) + (rw*1024)) + (cio*128)) + (ki*4)) + cii)] =
                       (float32*)placeholder_6[((((((((ko.outer*73728) + (ko.inner*36864)) + (co*9216)) + (rh*3072)) + (rw*1024)) + (cio*128)) + (ki*4)) + cii)]
                   }
                 }
@@ -946,9 +946,9 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
                     for (rw_1: int32, 0, 3) {
                       for (ki.c: int32, 0, 32) {
                         for (rc.inner_1: int32, 0, 32) {
-                          packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] = 
+                          packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] =
                           (
-                            (float32*)packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] + 
+                            (float32*)packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] +
                             (
                               (float32*)temp_output[((((((((floordiv((hi.c + rh_1), 8)*65536) + (ho.c.inner*65536)) + (floordiv((wi.c + rw_1), 8)*8192)) + (wo.c*8192)) + (rc.outer_1*2048)) + (floormod((hi.c + rh_1), 8)*256)) + (floormod((wi.c + rw_1), 8)*32)) + rc.inner_1)] *
                               (float32*)packed_filter.global[(((((((ko.c.inner*36864) + (rc.outer_1*9216)) + (rh_1*3072)) + (rw_1*1024)) + (floordiv(rc.inner_1, 4)*128)) + (ki.c*4)) + floormod(rc.inner_1, 4))]
@@ -972,7 +972,7 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
             for (hi_1: int32, 0, 8) {
               for (wi_1: int32, 0, 8) {
                 for (ki_1: int32, 0, 32) {
-                  output_2[((((((((ho.outer*131072) + (ho.inner_1*65536)) + (wo_1*8192)) + (ko.outer*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] = 
+                  output_2[((((((((ho.outer*131072) + (ho.inner_1*65536)) + (wo_1*8192)) + (ko.outer*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] =
                     (float32*)packed_input.global[((((((ho.inner_1*32768) + (wo_1*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)]
                 }
               }
@@ -983,4 +983,4 @@ primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, outp
     }
   }
 }
-```
\ No newline at end of file
+```
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx.py b/tests/python/contrib/test_hexagon/test_parallel_hvx.py
index 6ebe03d4e6b1..046f949a761f 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx.py
@@ -16,7 +16,7 @@
 # under the License.
 
 """
-Test parallelizing HVX workloads and compare them to single thread examples. 
+Test parallelizing HVX workloads and compare them to single thread examples.
 """
 import numpy as np
 from numpy.random import default_rng
diff --git a/tests/python/relay/backend/test_pass_lower_te.py b/tests/python/relay/backend/test_pass_lower_te.py
index fb79c1f2e7a6..d439f22b1246 100644
--- a/tests/python/relay/backend/test_pass_lower_te.py
+++ b/tests/python/relay/backend/test_pass_lower_te.py
@@ -54,11 +54,11 @@ def test_lower_primitive():
         """
         #[version = "0.0.5"]
         def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
-          %0 = fn(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Primitive=1) -> Tensor[(5, 7), float32] { 
+          %0 = fn(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Primitive=1) -> Tensor[(5, 7), float32] {
             add(%x, %y)
           };
-          %0(%a, %a)  
-        }      
+          %0(%a, %a)
+        }
         """,
         "from_string",
         None,
@@ -99,11 +99,11 @@ def relay_ext_test_pass_lower_te(func):
         """
         #[version = "0.0.5"]
         def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
-          %0 = fn(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Primitive=1, Compiler="test_pass_lower_te", global_symbol="test_add") -> Tensor[(5, 7), float32] { 
+          %0 = fn(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Primitive=1, Compiler="test_pass_lower_te", global_symbol="test_add") -> Tensor[(5, 7), float32] {
             add(%x, %y)
           };
-          %0(%a, %a)  
-        }      
+          %0(%a, %a)
+        }
         """,
         "from_string",
         None,
@@ -146,9 +146,9 @@ def test_lower_extern():
         def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
           @my_add(%a, %a)
         }
-        def @my_add(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Extern=1) -> Tensor[(5, 7), float32] { 
+        def @my_add(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Extern=1) -> Tensor[(5, 7), float32] {
           add(%x, %y)
-        }      
+        }
         """,
         "from_string",
         None,
@@ -189,9 +189,9 @@ def test_lower_extern_with_dynamic_shape():
         def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(?, ?), float32] {
           @my_dyn(%a, %a)
         }
-        def @my_dyn(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Extern=1) -> Tensor[(?, ?), float32] { 
+        def @my_dyn(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Extern=1) -> Tensor[(?, ?), float32] {
           add(%x, %y)
-        }      
+        }
         """,
         "from_string",
         None,
diff --git a/tests/python/relay/collage/demo_collage_partitioner.py b/tests/python/relay/collage/demo_collage_partitioner.py
index c5a18c3832fa..47f2612d7f16 100644
--- a/tests/python/relay/collage/demo_collage_partitioner.py
+++ b/tests/python/relay/collage/demo_collage_partitioner.py
@@ -116,7 +116,7 @@ def run(label, name, device, lib_path, code_path, input_shapes, input_dtypes):
         for input_name in input_shapes.keys()
     }}
     logging.info(f"Benchmarking for {{name}} generated by {{label}}...")
-    profile = vm_estimate_seconds(device, vm, args) 
+    profile = vm_estimate_seconds(device, vm, args)
     logging.info(f"Benchmarked for {{name}} generated by {{label}}: {{profile}}")
     logging.info(f"RESULT: {{label}} | {{name}} | {{profile.median * 1e3}}ms")
 
diff --git a/tests/python/relay/collage/menangerie.py b/tests/python/relay/collage/menangerie.py
index 2cd8e1bcebe3..d5275fbd34c5 100644
--- a/tests/python/relay/collage/menangerie.py
+++ b/tests/python/relay/collage/menangerie.py
@@ -2852,7 +2852,7 @@ def @main(%data: Tensor[(1, 3, 224, 224), float32]) -> Tensor[(1, 1000), float32
           %223 = reshape(%222, newshape=[0, -1]);
           %224 = nn.dense(%223, meta[relay.Constant][257], units=1000);
           add(%224, meta[relay.Constant][258])
-        }   
+        }
         """,
         "from_string",
         None,
@@ -3100,7 +3100,7 @@ def @main(%data: Tensor[(1, 3, 224, 224), float16]) -> Tensor[(1, 1000), float16
           %223 = reshape(%222, newshape=[0, -1]);
           %224 = nn.dense(%223, meta[relay.Constant][257], units=1000);
           add(%224, meta[relay.Constant][258])
-        }   
+        }
         """,
         "from_string",
         None,
diff --git a/tests/python/relay/collage/test_sub_graph.py b/tests/python/relay/collage/test_sub_graph.py
index de2d974bf934..21f12c43dccd 100644
--- a/tests/python/relay/collage/test_sub_graph.py
+++ b/tests/python/relay/collage/test_sub_graph.py
@@ -140,7 +140,7 @@ def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]
                 %0 = nn.conv2d(%x, %y);
                 %1 = add(%0, %z);
                 nn.relu(%1)
-              })(%a, %b, %c);           
+              })(%a, %b, %c);
               subtract(%2, %d)
             }
         """
@@ -160,7 +160,7 @@ def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]
               %2 = nn.relu(%1);                             // node 7
               %3 = nn.leaky_relu(%0, alpha=0f);             // node 9
               add(%2, %3)                                   // node 10
-            }   
+            }
         """
         )
 
@@ -194,7 +194,7 @@ def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]
               %2 = nn.relu(%1);                             // node 7
               %3 = nn.leaky_relu(%0, alpha=0f);             // node 9
               add(%2, %3)
-            }   
+            }
         """
         )
 
@@ -229,7 +229,7 @@ def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]
               %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5
               %1 = nn.relu(%0);                             // node 6
               add(%1, %0)
-            }            
+            }
         """
         )
 
@@ -244,9 +244,9 @@ def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]
                 (%0, %1)
               })(%a, %b);
               %3 = %2.1;
-              %4 = %2.0; 
+              %4 = %2.0;
               add(%3, %4)
-            }            
+            }
         """
         )
 
@@ -262,10 +262,10 @@ def input():
             """
             #[version = "0.0.5"]
             def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) {
-              %0 = add(%a, %b); // node 3 
+              %0 = add(%a, %b); // node 3
               %1 = add(%0, %b);
               add(%1, %b)       // node 5
-            }            
+            }
         """
         )
 
@@ -278,8 +278,8 @@ def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) {
                 %0 = add(%x, %y);
                 %1 = add(%0, %y);
                 add(%1, %y)
-              })(%a, %b) 
-            }            
+              })(%a, %b)
+            }
         """
         )
 
@@ -303,8 +303,8 @@ def @main(%a: Tensor[(5, 7), float32]) {
               %5 = nn.relu(%4);  // node 8
               %6 = nn.relu(%4);  // node 9
               %7 = add(%5, %6);  // node 10
-              nn.relu(%7)        // node 11  
-            }            
+              nn.relu(%7)        // node 11
+            }
         """
         )
 
@@ -315,7 +315,7 @@ def expected():
             def @main(%a: Tensor[(5, 7), float32]) {
               (fn(%aa: Tensor[(5, 7), float32], Compiler="foo") {
                 %0 = nn.relu(%aa);
-                %4 = (fn(%y, Composite="a") { 
+                %4 = (fn(%y, Composite="a") {
                   %1 = nn.relu(%y);
                   %2 = nn.relu(%1);
                   %3 = nn.relu(%1);
@@ -327,7 +327,7 @@ def @main(%a: Tensor[(5, 7), float32]) {
                   add(%5, %6)
                 })(%4);
                 nn.relu(%7)
-              })(%a)  
+              })(%a)
             }
         """
         )
@@ -349,8 +349,8 @@ def @main(%a: Tensor[(5, 7), float32]) {
               %5 = nn.relu(%4);  // node 8
               %6 = nn.relu(%4);  // node 9
               %7 = add(%5, %6);  // node 10
-              add(%2, %7)        // node 11  
-            }            
+              add(%2, %7)        // node 11
+            }
         """
         )
 
@@ -361,7 +361,7 @@ def expected():
             def @main(%a: Tensor[(5, 7), float32]) {
               %0 = nn.relu(%a);
               %9 = (fn(%x: Tensor[(5, 7), float32], Compiler="foo") {
-                %5 = (fn(%y, Composite="a") { 
+                %5 = (fn(%y, Composite="a") {
                   %1 = nn.relu(%y);
                   %2 = nn.relu(%1);
                   %3 = nn.relu(%1);
@@ -375,7 +375,7 @@ def @main(%a: Tensor[(5, 7), float32]) {
                 })(%5.1);
                 (%5.0, %8)
               })(%0);
-              add(%9.0, %9.1)  
+              add(%9.0, %9.1)
             }
         """
         )
diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py
index 3f0b74468b21..5ea6d7e5de6a 100644
--- a/tests/python/relay/test_ir_parser.py
+++ b/tests/python/relay/test_ir_parser.py
@@ -1011,7 +1011,7 @@ def test_init_module_and_metatable():
         + """
             def @f(%y : Tensor[(2, 3), float32]) -> Tensor[(2, 3), float32] {
               negative(%y)
-            }                                       
+            }
         """,
     )
     mod = tvm.parser.parse(
diff --git a/tests/python/relay/test_pass_collage_partition.py b/tests/python/relay/test_pass_collage_partition.py
index dfd4fb8fad52..fa7e0a472a49 100644
--- a/tests/python/relay/test_pass_collage_partition.py
+++ b/tests/python/relay/test_pass_collage_partition.py
@@ -473,7 +473,7 @@ def @collage_example_target_hook_concatenate(%FunctionVar_03: (Tensor[(10, 10),
         };
         %4(%FunctionVar_03)
       }
-        
+
       def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(20, 10), float32] {
         %5 = @collage_example_target_hook(%x);
         %6 = %5.0;
@@ -537,7 +537,7 @@ def @collage_example_target_hook_nn_relu_nn_relu_nn_relu_add_nn_relu(%FunctionVa
         };
         %8(%7)
       }
-        
+
       def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
         %9 = abs(%x);
         %10 = @collage_example_target_hook_nn_relu_nn_relu_nn_relu_add_nn_relu(%x, %9);
@@ -584,19 +584,19 @@ def @collage_example_target_hook_add_add(%FunctionVar_0: Tensor[(10, 10), float3
         };
         %2(%FunctionVar_0, %1)
       }
-        
+
       def @collage_example_target_hook_nn_relu(%FunctionVar_03: Tensor[(10, 10), float32], Primitive=1, Compiler="example_target_hook", global_symbol="collage_example_target_hook_nn_relu") -> Tensor[(10, 10), float32] {
         %3 = fn (%FunctionVar_04: Tensor[(10, 10), float32], Composite="relu") -> Tensor[(10, 10), float32] {
           nn.relu(%FunctionVar_04)
         };
         %3(%FunctionVar_03)
       }
-        
+
       def @main(%x: Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] {
         %4 = @collage_example_target_hook_nn_relu(%x);
         %5 = abs(%4);
         @collage_example_target_hook_add_add(%5, %4)
-      } 
+      }
     """
     expected_mod = tvm.parser.fromtext(expected_txt)
 
@@ -640,25 +640,25 @@ def @collage_example_target_hook_nn_relu_nn_relu_add_add(
         Compiler="example_target_hook",
         global_symbol="collage_example_target_hook_nn_relu_nn_relu_add_add") -> Tensor[(10, 10), float32] {
         %0 = fn (%FunctionVar_03: Tensor[(10, 10), float32] , Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_03) 
+          nn.relu(%FunctionVar_03)
         };
         %1 = %0(%FunctionVar_0) ;
         %2 = fn (%FunctionVar_02: Tensor[(10, 10), float32] , Composite="relu") -> Tensor[(10, 10), float32] {
-          nn.relu(%FunctionVar_02) 
+          nn.relu(%FunctionVar_02)
         };
         %3 = %2(%1);
         %4 = fn (%FunctionVar_04: Tensor[(10, 10), float32] , %FunctionVar_11: Tensor[(10, 10), float32] , Composite="add") -> Tensor[(10, 10), float32] {
-          add(%FunctionVar_04, %FunctionVar_11) 
+          add(%FunctionVar_04, %FunctionVar_11)
         };
         %5 = %4(%1, %3);
         %6 = fn (%FunctionVar_01: Tensor[(10, 10), float32] , %FunctionVar_1: Tensor[(10, 10), float32] , Composite="add") -> Tensor[(10, 10), float32] {
-          add(%FunctionVar_01, %FunctionVar_1) 
+          add(%FunctionVar_01, %FunctionVar_1)
         };
-        %6(%3, %5) 
+        %6(%3, %5)
       }
 
       def @main(%x: Tensor[(10, 10), float32] ) -> Tensor[(10, 10), float32] {
-        @collage_example_target_hook_nn_relu_nn_relu_add_add(%x) 
+        @collage_example_target_hook_nn_relu_nn_relu_add_add(%x)
       }
     """
     expected_mod = tvm.parser.fromtext(expected_txt)
diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py
index 8844de7567a6..abd9be99e3d9 100644
--- a/tests/python/relay/test_pass_dead_code_elimination.py
+++ b/tests/python/relay/test_pass_dead_code_elimination.py
@@ -321,7 +321,7 @@ def @f(%r) -> int {
         let %v = ref_read(%r);
         let %u = ref_write(%r, %v + 1);
         %v
-    }    
+    }
     def @main() -> int {
         let %r = ref(0);
         let %y = @f(%r);
diff --git a/tests/python/relay/test_pass_plan_devices.py b/tests/python/relay/test_pass_plan_devices.py
index 1b3d2199a065..1c48589a51aa 100644
--- a/tests/python/relay/test_pass_plan_devices.py
+++ b/tests/python/relay/test_pass_plan_devices.py
@@ -1804,13 +1804,13 @@ def @main(%data1: Tensor[(1, 32, 40, 40), float32],
           %3 = %0(%data2);
           %5 = fn (%a {virtual_device=meta[VirtualDevice][0]},  // global
                    %b {virtual_device=meta[VirtualDevice][0]},  // global
-                   virtual_device=meta[VirtualDevice][1],       // texture 
+                   virtual_device=meta[VirtualDevice][1],       // texture
                    Primitive=1) {
             add(%a, %b)
           };
           %6 = %5(%1, %3);
-          %10 = fn (%a, 
-                    virtual_device=meta[VirtualDevice][0],      // global 
+          %10 = fn (%a,
+                    virtual_device=meta[VirtualDevice][0],      // global
                     Primitive=1) {
             layout_transform(%a, src_layout="NCHW4c", dst_layout="NCHW")
           };
diff --git a/tests/python/relay/test_target_hooks.py b/tests/python/relay/test_target_hooks.py
index 046b2c7e541d..3a76fd2fbbf3 100644
--- a/tests/python/relay/test_target_hooks.py
+++ b/tests/python/relay/test_target_hooks.py
@@ -79,14 +79,14 @@ def test_tir_external_generation_outline_with_target_instance(check_result):
             def @main(%x: Tensor[(8), float32], %y: Tensor[(8), float32]) -> Tensor[(8), float32] {
               @replace_add_with_subtract(%x, %y) * 2.0f
             }
-            
+
             def @replace_add_with_subtract(%x: Tensor[(8), float32], %y: Tensor[(8), float32],
                                            Inline=1,
                                            Primitive=1,
                                            Compiler="example_target_hook",
                                            global_symbol="replace_add_with_subtract") -> Tensor[(8), float32] {
               %x + %y  // will be rewritten to TIR implementing %x - %y - 42.0f by custom pass
-            }  
+            }
         """
     )
 
diff --git a/tests/python/relay/transform/test_compiler_function_utils.py b/tests/python/relay/transform/test_compiler_function_utils.py
index b1056f60b82b..1bb07e268439 100644
--- a/tests/python/relay/transform/test_compiler_function_utils.py
+++ b/tests/python/relay/transform/test_compiler_function_utils.py
@@ -125,7 +125,7 @@ def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float1
           %3 = %2(%x3, meta[relay.Constant][2]);
           (%1, %3)
         }
-        
+
         def @tvmgen_default_cutlass_main_0(%y_0_i0: Tensor[(1600, 768), float16], %y_0_i1: Tensor[(2304, 768), float16], %y_0_i2: Tensor[(2304), float16],
                   Inline=1, Compiler="cutlass", global_symbol="tvmgen_default_cutlass_main_0", Primitive=1) -> Tensor[(1600, 2304), float16] {
           %4 = fn (%FunctionVar_0_0: Tensor[(1600, 768), float16], %FunctionVar_0_1: Tensor[(2304, 768), float16], %FunctionVar_0_2: Tensor[(2304), float16],
@@ -159,7 +159,7 @@ def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float1
           %3 = %2(%x3, meta[relay.Constant][2]);
           (%1, %3)
         }
-        
+
         def @tvmgen_default_cutlass_main_0(%y_0_i0: Tensor[(1600, 768), float16], %y_0_i1: Tensor[(2304, 768), float16], %y_0_i2: Tensor[(2304), float16],
                   Extern=1) -> Tensor[(1600, 2304), float16] {
           %4 = fn (%FunctionVar_0_0: Tensor[(1600, 768), float16], %FunctionVar_0_1: Tensor[(2304, 768), float16], %FunctionVar_0_2: Tensor[(2304), float16],
diff --git a/tests/python/unittest/test_auto_scheduler_feature.py b/tests/python/unittest/test_auto_scheduler_feature.py
index 6f9493fcbf99..4140e7732d7e 100644
--- a/tests/python/unittest/test_auto_scheduler_feature.py
+++ b/tests/python/unittest/test_auto_scheduler_feature.py
@@ -67,7 +67,7 @@ def test_cpu_matmul():
 
     """
     lowered IR:
-    
+
     Placeholder: A, B
     parallel i.0 (0,32)
       parallel j.0 (0,64)
diff --git a/tests/python/unittest/test_tir_transform_compact_buffer_region.py b/tests/python/unittest/test_tir_transform_compact_buffer_region.py
index 049de0bed4f9..34b3190b9aa2 100644
--- a/tests/python/unittest/test_tir_transform_compact_buffer_region.py
+++ b/tests/python/unittest/test_tir_transform_compact_buffer_region.py
@@ -878,7 +878,7 @@ def func(A: T.Buffer[(960, 770), "float32"], B: T.Buffer[(770, 2304), "float32"]
                                 for k_1, i_3, j_3, k_2, i_4, j_4 in T.grid(1, 8, 1, 4, 2, 2):
                                     with T.block("update_update"):
                                         C[(((bx // 18 + 0) * 8 + tx_p // 32) * 8 + i_3) * 2 + i_4, ((bx % 18 * 2 + vx % 2) * 32 + tx_p % 32 + j_3) * 2 + j_4] = C[(((bx // 18 + 0) * 8 + tx_p // 32) * 8 + i_3) * 2 + i_4, ((bx % 18 * 2 + vx % 2) * 32 + tx_p % 32 + j_3) * 2 + j_4] + A_shared[(((bx // 18 + 0) * 8 + tx_p // 32) * 8 + i_3) * 2 + i_4, (k_0 + k_1) * 4 + k_2] * B_shared[(k_0 + k_1) * 4 + k_2, ((bx % 18 * 2 + vx % 2) * 32 + tx_p % 32 + j_3) * 2 + j_4]
-    
+
     @T.prim_func
     def compacted_func(A: T.Buffer[(960, 770), "float32"], B: T.Buffer[(770, 2304), "float32"], C: T.Buffer[(960, 2304), "float32"]) -> None:
         for bx in T.thread_binding(144, thread="blockIdx.x"):
diff --git a/tests/scripts/release/.gitignore b/tests/scripts/release/.gitignore
index 3f183296deea..35846d92d265 100644
--- a/tests/scripts/release/.gitignore
+++ b/tests/scripts/release/.gitignore
@@ -2,4 +2,3 @@
 !README.md
 *.csv
 *.pkl
-
diff --git a/tests/scripts/release/README.md b/tests/scripts/release/README.md
index 82e2e5040ba6..9992e631401e 100644
--- a/tests/scripts/release/README.md
+++ b/tests/scripts/release/README.md
@@ -40,4 +40,4 @@ git clone https://github.com/apache/tvm-rfcs.git
 python list_rfcs.py --since-commit <hash> --rfcs-repo ./tvm-rfcs > rfc.md
 ```
 
-Finally, combine `rfc.md` and `out.md` along with some prose to create the final release notes.
\ No newline at end of file
+Finally, combine `rfc.md` and `out.md` along with some prose to create the final release notes.
diff --git a/tests/scripts/task_config_build_cortexm.sh b/tests/scripts/task_config_build_cortexm.sh
index 7292f1c31e0d..f15ed81711f6 100755
--- a/tests/scripts/task_config_build_cortexm.sh
+++ b/tests/scripts/task_config_build_cortexm.sh
@@ -35,4 +35,3 @@ echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
 echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
-
diff --git a/tests/scripts/task_config_build_i386.sh b/tests/scripts/task_config_build_i386.sh
index 5e94f864ec90..369706dfd34a 100755
--- a/tests/scripts/task_config_build_i386.sh
+++ b/tests/scripts/task_config_build_i386.sh
@@ -37,4 +37,3 @@ echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(USE_UMA OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
-
diff --git a/tests/scripts/task_config_build_jvm.sh b/tests/scripts/task_config_build_jvm.sh
index f14c90bebd4b..cf23c848127d 100755
--- a/tests/scripts/task_config_build_jvm.sh
+++ b/tests/scripts/task_config_build_jvm.sh
@@ -31,4 +31,4 @@ echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
-echo set\(USE_LLVM ON\) >> config.cmake
\ No newline at end of file
+echo set\(USE_LLVM ON\) >> config.cmake
diff --git a/tests/scripts/task_cpp_unittest.sh b/tests/scripts/task_cpp_unittest.sh
index 27899d06d703..22cc937e8784 100755
--- a/tests/scripts/task_cpp_unittest.sh
+++ b/tests/scripts/task_cpp_unittest.sh
@@ -63,4 +63,3 @@ if grep -Fq "USE_MICRO ON" ${BUILD_DIR}/TVMBuildOptions.txt; then
   make test_dynamic test_static
   popd
 fi
-
diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh
index 84f46523370e..f71cb0f60243 100755
--- a/tests/scripts/task_lint.sh
+++ b/tests/scripts/task_lint.sh
@@ -60,6 +60,9 @@ function shard1 {
 }
 
 function shard2 {
+  echo "check whitespace..."
+  tests/lint/whitespace.sh
+
   echo "Linting the Python code with pylint..."
   tests/lint/pylint.sh
 
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index 5eac7b45ba61..2caeeeee8766 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -61,7 +61,7 @@ run_pytest cython ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module-1 apps/dso
 run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-integration tests/python/integration
 
 # Ignoring Arm(R) Ethos(TM)-U NPU tests in the collective to run to run them in parallel in the next step.
-run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib --ignore=tests/python/contrib/test_ethosu --ignore=tests/python/contrib/test_cmsisnn 
+run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib --ignore=tests/python/contrib/test_ethosu --ignore=tests/python/contrib/test_cmsisnn
 # forked is needed because the global registry gets contaminated
 TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \
     run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay
diff --git a/vta/README.md b/vta/README.md
index 0c0edb8475df..debf2482b479 100644
--- a/vta/README.md
+++ b/vta/README.md
@@ -30,4 +30,4 @@ The key features of VTA include:
   - Customized and extensible TVM compiler back-end.
   - Flexible RPC support to ease deployment, and program FPGAs with the convenience of Python.
 
-Learn more about VTA [here](https://tvm.apache.org/docs/vta/index.html).
\ No newline at end of file
+Learn more about VTA [here](https://tvm.apache.org/docs/vta/index.html).
diff --git a/vta/tutorials/autotvm/README.txt b/vta/tutorials/autotvm/README.txt
index c511381dd57d..a282a740ec84 100644
--- a/vta/tutorials/autotvm/README.txt
+++ b/vta/tutorials/autotvm/README.txt
@@ -1,3 +1,2 @@
 Auto tuning
 -------------
-
diff --git a/web/Makefile b/web/Makefile
index d6adc94170fc..1153990a1880 100644
--- a/web/Makefile
+++ b/web/Makefile
@@ -27,7 +27,7 @@ all: dist/wasm/tvmjs_runtime.wasm dist/wasm/tvmjs_runtime.wasi.js
 EMCC = emcc
 
 EMCC_CFLAGS = $(INCLUDE_FLAGS) -O3 -std=c++17 -Wno-ignored-attributes --no-entry \
-	-s ALLOW_MEMORY_GROWTH=1 -s STANDALONE_WASM=1 -s ERROR_ON_UNDEFINED_SYMBOLS=0 
+	-s ALLOW_MEMORY_GROWTH=1 -s STANDALONE_WASM=1 -s ERROR_ON_UNDEFINED_SYMBOLS=0
 
 EMCC_LDFLAGS = --pre-js emcc/preload.js
 
diff --git a/web/package.json b/web/package.json
index 825056a8e37b..a45737ceb8d8 100644
--- a/web/package.json
+++ b/web/package.json
@@ -29,4 +29,4 @@
     "typescript": "^3.8.3",
     "ws": "^7.2.5"
   }
-}
\ No newline at end of file
+}
diff --git a/web/src/compact.ts b/web/src/compact.ts
index ac6af35abeff..35d2becaea60 100644
--- a/web/src/compact.ts
+++ b/web/src/compact.ts
@@ -44,4 +44,4 @@ export function createWebSocket(url: string): WebSocket {
     return new (WebSocket as any)(url);
   }
 
-}
\ No newline at end of file
+}
diff --git a/web/src/environment.ts b/web/src/environment.ts
index df0fe68c81e0..24126c096150 100644
--- a/web/src/environment.ts
+++ b/web/src/environment.ts
@@ -143,4 +143,4 @@ export class Environment implements LibraryProvider {
     };
     return Object.assign(defaultEnv, initEnv, newEnv);
   }
-}
\ No newline at end of file
+}
diff --git a/web/src/index.ts b/web/src/index.ts
index ed84ce7fbea1..ac82e5967f48 100644
--- a/web/src/index.ts
+++ b/web/src/index.ts
@@ -26,4 +26,4 @@ export { Disposable, LibraryProvider } from "./types";
 export { RPCServer } from "./rpc_server";
 export { wasmPath } from "./support";
 export { detectGPUDevice } from "./webgpu";
-export { assert } from "./support";
\ No newline at end of file
+export { assert } from "./support";
diff --git a/web/src/support.ts b/web/src/support.ts
index 7a2667a2299f..8fce98f35f12 100644
--- a/web/src/support.ts
+++ b/web/src/support.ts
@@ -61,4 +61,4 @@ export function assert(condition: boolean, msg?: string): asserts condition {
  */
 export function wasmPath(): string {
   return __dirname + "/wasm";
-}
\ No newline at end of file
+}
diff --git a/web/tests/node/test_ndarray.js b/web/tests/node/test_ndarray.js
index 9e50557e2a13..b7a5abdcb155 100644
--- a/web/tests/node/test_ndarray.js
+++ b/web/tests/node/test_ndarray.js
@@ -54,4 +54,3 @@ test("array copy", () => {
   testArrayCopy("uint8", Uint8Array);
   testArrayCopy("float64", Float64Array);
 });
-

From 0ab2285ef30dde7450ea944918aa347a189c6137 Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Wed, 19 Oct 2022 11:02:53 -0400
Subject: [PATCH 388/704] [test][hexagon] Fix pytest fixture resolution bug
 (#13124)

Fix a bug in `test_benchmark_maxpool2d.py` where pytest was unable to find a fixture defined in a different module.
---
 tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
index b2de2c7e95f0..a22b85ee42a2 100644
--- a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
+++ b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
@@ -56,6 +56,10 @@
 from .infrastructure import allocate_hexagon_array, get_hexagon_target
 from . import benchmark_util as bu
 
+# Pytest seems to require that fixture names exist in the current module.
+# E.g., it doesn't allow: @pytest.mark.usefixtures("bu.benchmark_group")
+benchmark_group = bu.benchmark_group
+
 _SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_bencharks_flag_and_reason()
 
 
From e1bebe3d056dd93e362172f7ef2baa14ce206621 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 19 Oct 2022 10:30:12 -0500
Subject: [PATCH 389/704] [ci] Skip sccache for local hexagon builds (#13127)

sccache chokes on the hexagon builds for some users, so this disables it
by default locally but keeps it on in CI.
---
 docker/with_the_same_user                  |  4 ++++
 tests/scripts/task_config_build_hexagon.sh | 11 ++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/docker/with_the_same_user b/docker/with_the_same_user
index 397b885ee166..bd332cd91374 100644
--- a/docker/with_the_same_user
+++ b/docker/with_the_same_user
@@ -69,6 +69,10 @@ else
     CUDA_ENV=""
 fi
 
+if [[ "$CI_IMAGE_NAME" == *"hexagon"* ]] && [[ ${CI:-false} != "true" ]]; then
+  PATH=$(echo "$PATH" | sed 's/\/opt\/sccache://g')
+fi
+
 sudo -u "#${CI_BUILD_UID}" --preserve-env \
 ${CUDA_ENV} \
 PATH=${PATH} \
diff --git a/tests/scripts/task_config_build_hexagon.sh b/tests/scripts/task_config_build_hexagon.sh
index 101260b764e0..0736ed6b53b8 100755
--- a/tests/scripts/task_config_build_hexagon.sh
+++ b/tests/scripts/task_config_build_hexagon.sh
@@ -28,7 +28,16 @@ echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_LLVM "${CLANG_LLVM_HOME}/bin/llvm-config"\) >> config.cmake
-echo set\(CMAKE_CXX_COMPILER "/opt/sccache/clang++"\) >> config.cmake
+
+if [[ ${CI:-false} == "true" ]]; then
+    # sccache needs to be used in CI to speed up builds
+    echo set\(CMAKE_CXX_COMPILER "/opt/sccache/clang++"\) >> config.cmake
+else
+    echo 'Skipping sccache setup for local build'
+    echo set\(CMAKE_CXX_COMPILER \"/usr/bin/c++\"\) >> config.cmake
+    echo set\(CMAKE_C_COMPILER \"/usr/bin/cc\"\) >> config.cmake
+fi
+
 echo set\(USE_HEXAGON "ON"\) >> config.cmake
 echo set\(USE_HEXAGON_SDK "${HEXAGON_SDK_ROOT}"\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake

From 213fd768d7c1211b9c315d2cf31edf51df8e0c67 Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Wed, 19 Oct 2022 11:36:44 -0400
Subject: [PATCH 390/704] [Hexagon] [runtime] Create objects to manage thread
 hardware resources (#13111)

This is the first step towards threads dedicated to different hardware resources.

Added two objects to manage compute resources - HexagonHmx and HexagonHvx. These can be created on the main thread for now. The next step will be to acquire those resources on specific threads.

* Add HexagonHvx and HexagonHmx

* Disable pipe overflow test for now
---
 apps/hexagon_launcher/launcher_hexagon.cc     | 31 ------
 src/runtime/hexagon/hexagon_hmx.cc            | 99 +++++++++++++++++++
 src/runtime/hexagon/hexagon_hmx.h             | 64 ++++++++++++
 src/runtime/hexagon/hexagon_hvx.cc            | 57 +++++++++++
 src/runtime/hexagon/hexagon_hvx.h             | 55 +++++++++++
 src/runtime/hexagon/hexagon_thread_manager.cc | 11 +++
 src/runtime/hexagon/hexagon_thread_manager.h  | 10 ++
 .../hexagon/hexagon_thread_manager_tests.cc   | 10 +-
 8 files changed, 302 insertions(+), 35 deletions(-)
 create mode 100644 src/runtime/hexagon/hexagon_hmx.cc
 create mode 100644 src/runtime/hexagon/hexagon_hmx.h
 create mode 100644 src/runtime/hexagon/hexagon_hvx.cc
 create mode 100644 src/runtime/hexagon/hexagon_hvx.h

diff --git a/apps/hexagon_launcher/launcher_hexagon.cc b/apps/hexagon_launcher/launcher_hexagon.cc
index d4fbf4bf5d73..03524661c4e6 100644
--- a/apps/hexagon_launcher/launcher_hexagon.cc
+++ b/apps/hexagon_launcher/launcher_hexagon.cc
@@ -23,7 +23,6 @@ extern "C" {
 #include <HAP_farf.h>
 #include <HAP_perf.h>
 #include <qurt_error.h>
-#include <qurt_hvx.h>
 }
 
 #include <tvm/runtime/object.h>
@@ -211,23 +210,6 @@ AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcyc
     return AEE_EBADSTATE;
   }
 
-  // Reserve HVX.
-  int res = qurt_hvx_reserve(QURT_HVX_RESERVE_ALL_AVAILABLE);
-  switch (res) {
-    case QURT_HVX_RESERVE_NOT_SUPPORTED:
-    case QURT_HVX_RESERVE_NOT_SUCCESSFUL:
-      LOG(ERROR) << "error reserving HVX: " << res;
-      return AEE_EFAILED;
-    default:
-      break;
-  }
-  // Lock HVX.
-  int lck = qurt_hvx_lock(QURT_HVX_MODE_128B);
-  if (lck != 0) {
-    LOG(ERROR) << "error locking HVX: " << lck;
-    return AEE_EFAILED;
-  }
-
   uint64_t us_begin = HAP_perf_get_time_us();
   uint64_t pc_begin = HAP_perf_get_pcycles();
 
@@ -238,18 +220,5 @@ AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcyc
   *pcycles = pc_end - pc_begin;
   *usecs = us_end - us_begin;
 
-  // Unlock HVX.
-  int unl = qurt_hvx_unlock();
-  if (unl != 0) {
-    LOG(ERROR) << "error unlocking HVX: " << unl;
-    return AEE_EFAILED;
-  }
-  // Release HVX.
-  int rel = qurt_hvx_cancel_reserve();
-  if (rel != 0) {
-    LOG(ERROR) << "error canceling HVX reservation: " << rel;
-    return AEE_EFAILED;
-  }
-
   return AEE_SUCCESS;
 }
diff --git a/src/runtime/hexagon/hexagon_hmx.cc b/src/runtime/hexagon/hexagon_hmx.cc
new file mode 100644
index 000000000000..66bde535ff6e
--- /dev/null
+++ b/src/runtime/hexagon/hexagon_hmx.cc
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+extern "C" {
+#include <AEEStdDef.h>
+#include <AEEStdErr.h>
+#include <HAP_compute_res.h>
+#include <HAP_farf.h>
+#include <HAP_power.h>
+#include <qurt_error.h>
+}
+
+#include "hexagon_common.h"
+#include "hexagon_hmx.h"
+
+// Minimum timeout per SDK docs, excluding 0
+#define COMPUTE_RES_ACQ_TIMEOUT 200
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+HexagonHmx::HexagonHmx() {
+  PowerOn();
+  Acquire();
+}
+
+HexagonHmx::~HexagonHmx() {
+  Release();
+  PowerOff();
+}
+
+void HexagonHmx::PowerOn() {
+  HAP_power_request_t pwr_req;
+  int nErr;
+
+  hap_pwr_ctx_ = HAP_utils_create_context();
+  pwr_req.type = HAP_power_set_HMX;
+  pwr_req.hmx.power_up = true;
+  if ((nErr = HAP_power_set(hap_pwr_ctx_, &pwr_req))) {
+    LOG(FATAL) << "InternalError: HAP_power_set failed\n";
+  }
+}
+
+void HexagonHmx::PowerOff() {
+  HAP_power_request_t pwr_req;
+  int nErr;
+
+  pwr_req.type = HAP_power_set_HMX;
+  pwr_req.hmx.power_up = false;
+  if ((nErr = HAP_power_set(hap_pwr_ctx_, &pwr_req))) {
+    LOG(FATAL) << "InternalError: HAP_power_set failed\n";
+  }
+  HAP_utils_destroy_context(hap_pwr_ctx_);
+}
+
+void HexagonHmx::Acquire() {
+  compute_res_attr_t compute_res_attr;
+  int nErr;
+
+  if ((nErr = HAP_compute_res_attr_init(&compute_res_attr))) {
+    LOG(FATAL) << "InternalError: HAP_compute_res_attr_init failed\n";
+  }
+  if ((nErr = HAP_compute_res_attr_set_hmx_param(&compute_res_attr, 1))) {
+    LOG(FATAL) << "InternalError: HAP_compute_res_attr_set_hmx_param failed\n";
+  }
+  context_id_ = HAP_compute_res_acquire(&compute_res_attr, COMPUTE_RES_ACQ_TIMEOUT);
+
+  if (!context_id_) {
+    LOG(FATAL) << "InternalError: HAP_compute_res_acquire failed\n";
+  }
+  if ((nErr = HAP_compute_res_hmx_lock(context_id_))) {
+    LOG(FATAL) << "InternalError: Unable to lock HMX!";
+  }
+}
+
+void HexagonHmx::Release() {
+  HAP_compute_res_hmx_unlock((unsigned int)context_id_);
+  HAP_compute_res_release((unsigned int)context_id_);
+}
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/hexagon/hexagon_hmx.h b/src/runtime/hexagon/hexagon_hmx.h
new file mode 100644
index 000000000000..1d5f56df7a4b
--- /dev/null
+++ b/src/runtime/hexagon/hexagon_hmx.h
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_HMX_H_
+#define TVM_RUNTIME_HEXAGON_HEXAGON_HMX_H_
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+class HexagonHmx {
+ public:
+  //! \brief Constructor.
+  HexagonHmx();
+
+  //! \brief Destructor.
+  ~HexagonHmx();
+
+  //! \brief Prevent copy construction of HexagonHmx.
+  HexagonHmx(const HexagonHmx&) = delete;
+
+  //! \brief Prevent copy assignment with HexagonHmx.
+  HexagonHmx& operator=(const HexagonHmx&) = delete;
+
+  //! \brief Prevent move construction.
+  HexagonHmx(HexagonHmx&&) = delete;
+
+  //! \brief Prevent move assignment.
+  HexagonHmx& operator=(HexagonHmx&&) = delete;
+
+ private:
+  //! \brief Power context
+  void* hap_pwr_ctx_;
+
+  //! \brief Acquisition context ID
+  unsigned int context_id_;
+
+  void PowerOn();
+  void PowerOff();
+  void Acquire();
+  void Release();
+};
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_HEXAGON_HEXAGON_HMX_H_
diff --git a/src/runtime/hexagon/hexagon_hvx.cc b/src/runtime/hexagon/hexagon_hvx.cc
new file mode 100644
index 000000000000..0c3160a7d89b
--- /dev/null
+++ b/src/runtime/hexagon/hexagon_hvx.cc
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+extern "C" {
+#include <AEEStdDef.h>
+#include <AEEStdErr.h>
+#include <HAP_farf.h>
+#include <qurt_error.h>
+#include <qurt_hvx.h>
+}
+
+#include "hexagon_common.h"
+#include "hexagon_hvx.h"
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+HexagonHvx::HexagonHvx() {
+  // Reserve HVX.
+  int res = qurt_hvx_reserve(QURT_HVX_RESERVE_ALL_AVAILABLE);
+  CHECK((res != QURT_HVX_RESERVE_NOT_SUPPORTED) && (res != QURT_HVX_RESERVE_NOT_SUCCESSFUL))
+      << "error reserving HVX: " << res;
+
+  // Lock HVX.
+  int lck = qurt_hvx_lock(QURT_HVX_MODE_128B);
+  CHECK(lck == 0) << "error locking HVX: " << lck;
+}
+
+HexagonHvx::~HexagonHvx() {
+  // Unlock HVX.
+  int unl = qurt_hvx_unlock();
+  CHECK(unl == 0) << "error unlocking HVX: " << unl;
+
+  // Release HVX.
+  int rel = qurt_hvx_cancel_reserve();
+  CHECK(rel == 0) << "error releasing HVX: " << rel;
+}
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/hexagon/hexagon_hvx.h b/src/runtime/hexagon/hexagon_hvx.h
new file mode 100644
index 000000000000..042977981c99
--- /dev/null
+++ b/src/runtime/hexagon/hexagon_hvx.h
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_HVX_H_
+#define TVM_RUNTIME_HEXAGON_HEXAGON_HVX_H_
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+class HexagonHvx {
+ public:
+  //! \brief Constructor.
+  // TODO(HWE): Pass in a parameter for which HVX instance to bind
+  HexagonHvx();
+
+  //! \brief Destructor.
+  ~HexagonHvx();
+
+  //! \brief Prevent copy construction of HexagonHvx.
+  HexagonHvx(const HexagonHvx&) = delete;
+
+  //! \brief Prevent copy assignment with HexagonHvx.
+  HexagonHvx& operator=(const HexagonHvx&) = delete;
+
+  //! \brief Prevent move construction.
+  HexagonHvx(HexagonHvx&&) = delete;
+
+  //! \brief Prevent move assignment.
+  HexagonHvx& operator=(HexagonHvx&&) = delete;
+
+ private:
+};
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_HEXAGON_HEXAGON_HVX_H_
diff --git a/src/runtime/hexagon/hexagon_thread_manager.cc b/src/runtime/hexagon/hexagon_thread_manager.cc
index 5d67b142e575..e3de7710a73e 100644
--- a/src/runtime/hexagon/hexagon_thread_manager.cc
+++ b/src/runtime/hexagon/hexagon_thread_manager.cc
@@ -41,6 +41,11 @@ HexagonThreadManager::HexagonThreadManager(unsigned num_threads, unsigned thread
   DLOG(INFO) << "Spawning threads";
   SpawnThreads(thread_stack_size_bytes, thread_pipe_size_words);
 
+  DLOG(INFO) << "Acquiring hardware resources";
+  // TODO(HWE): Move these bindings to specific threads
+  hmx_ = std::make_unique<HexagonHmx>();
+  hvx_ = std::make_unique<HexagonHvx>();
+
   // Initially, block all threads until we get the Start() call
   qurt_sem_init_val(&start_semaphore_, 0);
   for (unsigned i = 0; i < nthreads_; i++) {
@@ -97,6 +102,12 @@ HexagonThreadManager::~HexagonThreadManager() {
   hexbuffs_.FreeHexagonBuffer(pipe_buffer_);
 
   DLOG(INFO) << "Buffers freed";
+
+  // Release hardware
+  hmx_.reset();
+  hvx_.reset();
+
+  DLOG(INFO) << "Hardware resources released";
 }
 
 void HexagonThreadManager::SpawnThreads(unsigned thread_stack_size_bytes,
diff --git a/src/runtime/hexagon/hexagon_thread_manager.h b/src/runtime/hexagon/hexagon_thread_manager.h
index 3422fef3879e..81c90bd1ae20 100644
--- a/src/runtime/hexagon/hexagon_thread_manager.h
+++ b/src/runtime/hexagon/hexagon_thread_manager.h
@@ -32,6 +32,8 @@
 #include "hexagon_buffer.h"
 #include "hexagon_buffer_manager.h"
 #include "hexagon_common.h"
+#include "hexagon_hmx.h"
+#include "hexagon_hvx.h"
 #include "qurt.h"
 
 namespace tvm {
@@ -185,6 +187,14 @@ class HexagonThreadManager {
     void* args;
     Command(voidfunc f, void* args) : f(f), args(args) {}
   };
+
+  //! \brief HMX hardware resource.
+  // TODO(HWE): Move binding of HMX to a specific thread
+  std::unique_ptr<HexagonHmx> hmx_;
+
+  //! \brief HVX hardware resource.
+  // TODO(HWE): Move binding of individual HVX instances to a specific thread
+  std::unique_ptr<HexagonHvx> hvx_;
 };
 
 }  // namespace hexagon
diff --git a/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc b/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
index aa86e4638df3..e8532eb54514 100644
--- a/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
@@ -20,6 +20,7 @@
 #include <gtest/gtest.h>
 #include <tvm/runtime/logging.h>
 
+#include "../src/runtime/hexagon/hexagon_device_api.h"
 #include "../src/runtime/hexagon/hexagon_thread_manager.h"
 
 using namespace tvm::runtime;
@@ -28,15 +29,15 @@ using namespace tvm::runtime::hexagon;
 class HexagonThreadManagerTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    htm = new HexagonThreadManager(threads, stack_size, pipe_size);
+    htm = HexagonDeviceAPI::Global()->ThreadManager();
     streams = htm->GetStreamHandles();
   }
-  void TearDown() override { delete htm; }
+  void TearDown() override {}
   HexagonThreadManager* htm{nullptr};
   std::vector<TVMStreamHandle> streams;
   int answer{0};
   const unsigned threads{6};
-  const unsigned pipe_size{100};
+  const unsigned pipe_size{1000};
   const unsigned stack_size{0x4000};  // 16KB
 };
 
@@ -161,7 +162,8 @@ TEST_F(HexagonThreadManagerTest, pipe_fill) {
   CHECK_EQ(answer, 42);
 }
 
-TEST_F(HexagonThreadManagerTest, pipe_overflow) {
+// TODO(HWE): Create a temporary thread manager with a smaller pipe for this test
+TEST_F(HexagonThreadManagerTest, DISABLED_pipe_overflow) {
   // fill the pipe
   for (int i = 0; i < pipe_size; ++i) {
     htm->Dispatch(streams[0], get_the_answer, &answer);

From 842f842add5992ec6d41663c79fc9f47ebaa1986 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Wed, 19 Oct 2022 11:33:05 -0700
Subject: [PATCH 391/704] [MetaSchedule][Minor] Restore Relay Integration Unit
 Test (#13128)

Fix test.
---
 .../test_meta_schedule_relay_integration.py   | 54 +++++++++++++++++--
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py
index e9908cbfde14..9a1c9e8dc7f5 100644
--- a/tests/python/unittest/test_meta_schedule_relay_integration.py
+++ b/tests/python/unittest/test_meta_schedule_relay_integration.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Integration test for MetaSchedule"""
+from typing import List
 import tempfile
 import numpy as np
 import pytest
@@ -26,6 +27,7 @@
 from tvm._ffi import register_func
 from tvm.contrib import graph_executor
 from tvm.ir.transform import PassContext
+from tvm.meta_schedule.database import Workload, TuningRecord
 from tvm.meta_schedule.testing.relay_workload import get_network
 from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base
 from tvm.meta_schedule.tune_context import _normalize_mod
@@ -347,7 +349,7 @@ def test_extract_task_arm_conv2d_nchwc():
     assert list(out_type.shape) == [1, 8, 130, 130, 4]
 
 
-def test_meta_schedule_te2primfunc_argument_order():
+def test_meta_schedule_te2primfunc_argument_order_and_lowering():
     # pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument
     # fmt: off
     @tvm.script.ir_module
@@ -416,8 +418,52 @@ def main(placeholder: T.Buffer[(1, 1, 16, 16, 3), "float32"], placeholder_1: T.B
     # fmt: on
     # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument
 
-    def _create_database():
-        database = ms.database.create("memory")
+    def _create_verification_database():
+        @ms.derived_object
+        class VerificationDatabase(ms.database.PyDatabase):
+            def __init__(self):
+                super().__init__()
+                self.tuning_records_: List[TuningRecord] = []
+                self.workloads_: List[Workload] = []
+
+            def has_workload(self, mod: IRModule) -> bool:
+                for workload in self.workloads_:
+                    if tvm.ir.structural_equal(mod, workload.mod):
+                        return True
+                # Note: The database has already put in all correct workloads
+                # This is where we can check if the workload is correct
+                raise ValueError(
+                    "The workload searched for is not in given database!"
+                    + " Incorrect TIR was generated from TE subgraph."
+                )
+
+            def commit_workload(self, mod: IRModule) -> ms.database.Workload:
+                # No need to deduplicate workload because they are specified
+                workload = ms.database.Workload(mod)
+                self.workloads_.append(workload)
+                return workload
+
+            def commit_tuning_record(self, record: TuningRecord) -> None:
+                self.tuning_records_.append(record)
+
+            def get_all_tuning_records(self) -> List[TuningRecord]:
+                return self.tuning_records_
+
+            def get_top_k(self, workload: ms.database.Workload, top_k: int) -> List[TuningRecord]:
+                return sorted(
+                    list(
+                        filter(
+                            lambda x: tvm.ir.structural_equal(workload.mod, x.workload.mod),
+                            self.tuning_records_,
+                        )
+                    ),
+                    key=lambda x: sum(x.run_secs) / len(x.run_secs) if x.run_secs else 1e9,
+                )[:top_k]
+
+            def __len__(self) -> int:
+                return len(self.tuning_records_)
+
+        database = VerificationDatabase()
 
         def _commit(mod):
             workload = database.commit_workload(mod)
@@ -464,7 +510,7 @@ def _create_relay_mod():
         dev,
     )
 
-    with target, _create_database(), PassContext(
+    with target, _create_verification_database(), PassContext(  # pylint: disable=not-context-manager
         opt_level=3,
         config={
             "relay.backend.use_meta_schedule": True,

From 86fd99313a3d5966808e2456791ff877f6a6dbe8 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 20 Oct 2022 03:33:50 +0900
Subject: [PATCH 392/704] [MetaSchedule] Fix AddReadReuse for multiple consumer
 block case (#13093)

* [MetaSchedule] Fix AddReadReuse for multiple consumer block case

* Populate consumer_blocks with all child blocks

* change internal consumer_blocks to be a set

* add unordered_set header
---
 .../schedule_rule/multi_level_tiling.cc       |  2 +-
 .../schedule/primitive/cache_read_write.cc    | 18 +++++++++++----
 .../test_meta_schedule_schedule_rule_mlt.py   | 23 +++++++++++++++++++
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
index d9c46015eac3..9141c92de12c 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -264,7 +264,7 @@ std::vector<State> MultiLevelTilingNode::AddReadReuse(State state) const {
         continue;
       }
       // Do cache_read
-      BlockRV cache_read_block = sch->CacheRead(block_rv, i, config.scope);
+      BlockRV cache_read_block = sch->CacheRead(block_rv, i, config.scope, {block_rv});
       // Insert cache_read block to the proper place
       sch->ComputeAt(cache_read_block, loop_rv, true);
       // Fuse the iterators of the cache_read
diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index 58d622268c78..adadb46852cc 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -17,6 +17,8 @@
  * under the License.
  */
 
+#include <unordered_set>
+
 #include "../utils.h"
 
 namespace tvm {
@@ -75,8 +77,8 @@ struct CacheStageInfo {
   Stmt cache_stage;
   /*! \brief The map used for ScheduleStateNode::Replace. */
   Map<Block, Block> block_reuse;
-  /*! \brief A list of blocks that will consume the new cache. */
-  Array<StmtSRef> consumer_blocks;
+  /*! \brief A set of blocks that will consume the new cache. */
+  std::unordered_set<StmtSRef, ObjectHash, ObjectEqual> consumer_blocks;
 };
 
 /*! \brief Return the buffer region realted with the buffer */
@@ -1132,8 +1134,14 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff
   info.write_buffer = WithScope(read_buffer, storage_scope);
   // Create the corresponding buffer allocation
   info.alloc = info.write_buffer;
-  // Indicate which buffers should consume the cache.
-  info.consumer_blocks = consumer_blocks;
+
+  // info.consumer_blocks indicates which buffers should consume the cache.
+  for (auto consumer : consumer_blocks) {
+    info.consumer_blocks.insert(consumer);
+    for (auto child : tir::GetChildBlocks(self, consumer)) {
+      info.consumer_blocks.insert(child);
+    }
+  }
 
   // Step 3. Update cache stage info.
   BufferRegion cache_region{nullptr};
@@ -1290,7 +1298,7 @@ Array<StmtSRef> CacheInplace(ScheduleState self, const StmtSRef& block_sref, int
   // Create the corresponding buffer allocation
   info.alloc = info.write_buffer;
   // Indicate which buffers should consume the cache.
-  info.consumer_blocks.push_back(block_sref);
+  info.consumer_blocks.insert(block_sref);
 
   // Cache read step 1. Detect insert position
   CacheInplaceLocDetector::Detect(self, block_sref, scope_sref, &info);
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
index 28e6f295e78f..24e34302202b 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
@@ -623,6 +623,29 @@ def cpu_conv2d_nhwc(
     )
 
 
+def test_cache_read_specify_consumer():
+    A, B, C = te_workload.matmul(512, 512, 512)
+    mod = te.create_prim_func([A, B, C + A])
+
+    space = generate_design_space(
+        kind="cuda",
+        mod=mod,
+        target=Target("nvidia/geforce-rtx-3080"),
+        types=ms.schedule_rule.MultiLevelTiling,
+    )
+
+    residual_block = """
+        for i0, i1 in T.grid(512, 512):
+            with T.block("T_add"):
+                ax0, ax1 = T.axis.remap("SS", [i0, i1])
+                T.reads(C[ax0, ax1], A[ax0, ax1])
+                T.writes(T_add[ax0, ax1])
+                T_add[ax0, ax1] = C[ax0, ax1] + A[ax0, ax1]
+    """
+
+    assert residual_block in space[0].mod.script()
+
+
 if __name__ == "__main__":
     test_cpu_matmul()
     test_cpu_matmul_relu()

From f3873d77177c5481403b10eab23c82d9eb36d186 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Wed, 19 Oct 2022 23:47:18 +0300
Subject: [PATCH 393/704] [skip ci] Add Janet and Thomas to triagers to help
 with Issue Triage RFC (#13141)

these two volunteered to help triage issues per [Issue Triage RFC](https://github.com/apache/tvm-rfcs/blob/main/rfcs/0093_Issue_Triage.md).
---
 .asf.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.asf.yaml b/.asf.yaml
index 9e337392aee8..f4aba210d2cc 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -47,7 +47,8 @@ github:
   collaborators:
     - hpanda-naut
     - denise-k
-    - driazati
+    - janetsc
+    - naut-thomas
     - tvm-bot  # For automated feedback in PR review.
 
   # See https://cwiki.apache.org/confluence/display/INFRA/Git+-+.asf.yaml+features#Git.asf.yamlfeatures-Branchprotection

From f3ffc32482884452fdb5ea29b09a16291b60841a Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Wed, 19 Oct 2022 17:46:09 -0400
Subject: [PATCH 394/704] [Hexagon] [runtime] Remove released buffer check for
 post-ReleaseResources calls to FreeDataSpace (#13139)

* Remove released buffer list

* PR feedback, make message debug only
---
 src/runtime/hexagon/hexagon_buffer_manager.h | 24 +++++---------------
 src/runtime/hexagon/hexagon_device_api.cc    | 11 +++------
 src/runtime/hexagon/hexagon_device_api.h     | 10 --------
 3 files changed, 9 insertions(+), 36 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_buffer_manager.h b/src/runtime/hexagon/hexagon_buffer_manager.h
index eecf96a6db07..e064114346ee 100644
--- a/src/runtime/hexagon/hexagon_buffer_manager.h
+++ b/src/runtime/hexagon/hexagon_buffer_manager.h
@@ -35,6 +35,12 @@ namespace hexagon {
 
 class HexagonBufferManager {
  public:
+  ~HexagonBufferManager() {
+    if (!hexagon_buffer_map_.empty()) {
+      DLOG(INFO) << "HexagonBufferManager is not empty upon destruction";
+    }
+  }
+
   /*!
    * \brief Free a HexagonBuffer.
    * \param ptr Address of the HexagonBuffer as returned by `AllocateHexagonBuffer`.
@@ -80,24 +86,6 @@ class HexagonBufferManager {
     return nullptr;
   }
 
-  //! \brief Returns whether the HexagonBufferManager has any allocations.
-  bool empty() {
-    std::lock_guard<std::mutex> lock(map_mutex_);
-    return hexagon_buffer_map_.empty();
-  }
-
-  //! \brief Returns a vector of currently allocated pointers, owned by the manager.
-  // Note - this should only be used by the device API to keep track of what
-  // was in the manager when HexagonDeviceAPI::ReleaseResources is called.
-  std::vector<void*> current_allocations() {
-    std::vector<void*> allocated;
-    std::lock_guard<std::mutex> lock(map_mutex_);
-    for (const auto& [data_ptr, buffer] : hexagon_buffer_map_) {
-      allocated.push_back(data_ptr);
-    }
-    return allocated;
-  }
-
  private:
   //! \brief Contains the HexagonBuffer objects managed by this class.
   std::unordered_map<void*, std::unique_ptr<HexagonBuffer>> hexagon_buffer_map_;
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 5f7867590743..f8824d515443 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -132,14 +132,9 @@ void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
   if (runtime_hexbuffs) {
     runtime_hexbuffs->FreeHexagonBuffer(ptr);
   } else {
-    // Either AcquireResources was never called, or ReleaseResources was called.  Check the
-    // list of buffers that were still allocated at the time of release.  If this buffer is
-    // in that list, this is a no-op as it is being freed as part of another object teardown.
-    // If the pointer isn't in that list, we raise an exception as this is an unexpected Free.
-    auto it = std::find(released_runtime_buffers.begin(), released_runtime_buffers.end(), ptr);
-    CHECK(it != released_runtime_buffers.end()) << "Attempted to free Hexagon data with "
-                                                << "HexagonDeviceAPI::FreeDataSpace that was not "
-                                                << "allocated during the session.";
+    // Either AcquireResources was never called, or ReleaseResources was called.  Since this can
+    // occur in the normal course of shutdown, log a message and continue.
+    DLOG(INFO) << "FreeDataSpace called outside a session for " << ptr;
   }
 }
 
diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index 7944878dd5dc..e94ae4e87671 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -60,7 +60,6 @@ class HexagonDeviceAPI final : public DeviceAPI {
 
     CHECK_EQ(runtime_hexbuffs, nullptr);
     runtime_hexbuffs = std::make_unique<HexagonBufferManager>();
-    released_runtime_buffers.clear();
 
     CHECK_EQ(runtime_threads, nullptr);
     runtime_threads = std::make_unique<HexagonThreadManager>(threads, stack_size, pipe_size);
@@ -78,10 +77,6 @@ class HexagonDeviceAPI final : public DeviceAPI {
     runtime_threads.reset();
 
     CHECK(runtime_hexbuffs) << "runtime_hexbuffs was not created in AcquireResources";
-    if (!runtime_hexbuffs->empty()) {
-      DLOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources";
-      released_runtime_buffers = runtime_hexbuffs->current_allocations();
-    }
     runtime_hexbuffs.reset();
 
     CHECK(runtime_vtcm) << "runtime_vtcm was not created in AcquireResources";
@@ -196,11 +191,6 @@ class HexagonDeviceAPI final : public DeviceAPI {
   // to the lifetime of a user application session.
   std::unique_ptr<HexagonBufferManager> runtime_hexbuffs;
 
-  //! \brief Keeps a list of released runtime HexagonBuffer allocations
-  // ReleaseResources can be called when there are still buffers in runtime_hexbuffs.  This list
-  // stores the buffers that were released.
-  std::vector<void*> released_runtime_buffers;
-
   //! \brief Thread manager
   std::unique_ptr<HexagonThreadManager> runtime_threads;
   const unsigned threads{6};

From 687ef5b23d1b8309e91290cff415670db939f3f2 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Wed, 19 Oct 2022 14:51:54 -0700
Subject: [PATCH 395/704] [MetaSchedule] Remove Exception Catch for Multithread
 Postproc Application (#13084)

* Remove logging in multi-thread.

* Remove try-catch.
---
 src/meta_schedule/utils.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index b14717f4b29e..824cfcd6aa5c 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -326,14 +326,8 @@ struct ThreadedTraceApply {
 
     for (int i = 0; i < n_; ++i) {
       Item& item = items_[i];
-      try {
-        if (!item.postproc->Apply(sch)) {
-          ++item.fail_counter;
-          return NullOpt;
-        }
-      } catch (const std::exception& e) {
-        // Used in multi-thread, only output to screen but failure summary sent to logging
-        LOG(WARNING) << "ThreadedTraceApply::Apply failed with error " << e.what();
+      if (!item.postproc->Apply(sch)) {
+        item.fail_counter++;
         return NullOpt;
       }
     }

From 458ca81c01bd85d04ea6428e7cb4bfa021a1d37a Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Wed, 19 Oct 2022 14:55:23 -0700
Subject: [PATCH 396/704] [TIR] Allow IndexMap applied to arguments with
 different dtypes (#13085)

* [TIR] Allow IndexMap applied to arguments with different dtypes

* address comments

* Add SubstituteWithDataTypeLegalization
---
 include/tvm/tir/stmt_functor.h                | 26 ++++++
 src/tir/ir/index_map.cc                       | 25 +++++-
 src/tir/ir/stmt_functor.cc                    | 89 +++++++++++++++++++
 tests/python/unittest/test_index_map.py       | 10 +++
 .../test_tir_schedule_transform_layout.py     | 35 ++++++++
 5 files changed, 181 insertions(+), 4 deletions(-)

diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index 60973577ac92..fdb0a0aa9d1a 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -409,6 +409,32 @@ inline T Substitute(T input, const std::unordered_map<const VarNode*, PrimExpr>&
   return Substitute(std::move(input), vmap);
 }
 
+/*!
+ * \brief Substitute the var specified by vmap and legalize data types after substitution.
+ * \param stmt The source statement to be substituted
+ * \param vmap returns a new value if re-mapping is needed, otherwise returns nullptr.
+ *
+ * Unlike `Substitute`, this allows the substitution to change the data type of the expression.
+ *
+ * \sa Substitute
+ * \return The result.
+ */
+TVM_DLL Stmt SubstituteWithDataTypeLegalization(Stmt stmt,
+                                                std::function<Optional<PrimExpr>(const Var&)> vmap);
+
+/*!
+ * \brief Substitute the var specified by vmap and legalize data types after substitution.
+ * \param expr The source statement to be substituted
+ * \param vmap returns a new value if re-mapping is needed, otherwise returns nullptr.
+ *
+ * Unlike `Substitute`, this allows the substitution to change the data type of the expression.
+ *
+ * \sa Substitute
+ * \return The result.
+ */
+TVM_DLL PrimExpr SubstituteWithDataTypeLegalization(
+    PrimExpr expr, std::function<Optional<PrimExpr>(const Var&)> vmap);
+
 /*!
  * \brief Recursively visit the IR in pre DFS order node, apply fvisit.
  * If fvisit returns false, it won't visit the children of the node.
diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc
index e1cc9dbdd093..03a2f29bd129 100644
--- a/src/tir/ir/index_map.cc
+++ b/src/tir/ir/index_map.cc
@@ -162,9 +162,11 @@ Array<PrimExpr> IndexMapNode::MapIndices(const Array<PrimExpr>& indices,
     analyzer = &local_analyzer;
   }
 
-  Array<PrimExpr> output = final_indices.Map(
-      [&](PrimExpr index) { return analyzer->Simplify(Substitute(std::move(index), vmap)); });
-
+  Array<PrimExpr> output = final_indices.Map([&](PrimExpr index) {
+    PrimExpr result = SubstituteWithDataTypeLegalization(
+        std::move(index), [&](const Var& var) { return vmap.Get(var); });
+    return analyzer->Simplify(result);
+  });
   return output;
 }
 
@@ -218,6 +220,21 @@ Array<Range> IndexMapNode::MapRanges(const Array<Range>& ranges, arith::Analyzer
                                             analyzer->Simplify(int_set.max() - int_set.min() + 1)));
     }
   }
+  auto output_dtype = [&]() {
+    int max_bits = 0;
+    for (const auto& range : ranges) {
+      max_bits = std::max(max_bits, range->extent.dtype().bits());
+    }
+    return DataType::Int(max_bits);
+  }();
+  output.MutateByApply([&](const Range& range) {
+    if (range->min.dtype() != output_dtype || range->extent.dtype() != output_dtype) {
+      return Range::FromMinExtent(cast(output_dtype, range->min),
+                                  cast(output_dtype, range->extent));
+    } else {
+      return range;
+    }
+  });
   return output;
 }
 
@@ -227,7 +244,7 @@ Array<PrimExpr> IndexMapNode::MapShape(const Array<PrimExpr>& shape,
 
   Array<Range> ranges;
   for (auto& dim : shape) {
-    ranges.push_back(Range(0, dim));
+    ranges.push_back(Range(make_zero(dim.dtype()), dim));
   }
   Array<Range> mapped = MapRanges(std::move(ranges), analyzer);
 
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index c2e2489cba92..6d0ee134c805 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -809,6 +809,95 @@ void PreOrderVisit(const ObjectRef& stmt_or_expr,
   }
 }
 
+class IRSubstituteWithDataTypeLegalization : public DataTypeLegalizer {
+ public:
+  explicit IRSubstituteWithDataTypeLegalization(std::function<Optional<PrimExpr>(const Var&)> vmap)
+      : vmap_(vmap) {}
+
+  PrimExpr VisitExpr_(const VarNode* op) final {
+    Var var = GetRef<Var>(op);
+    auto ret = vmap_(var);
+    if (ret.defined()) {
+      return ret.value();
+    }
+    return std::move(var);
+  }
+
+  PrimExpr VisitExpr_(const BufferLoadNode* op) final {
+    auto node = Downcast<BufferLoad>(StmtExprMutator::VisitExpr_(op));
+    return VisitBufferAccess(std::move(node));
+  }
+
+  Stmt VisitStmt_(const BufferStoreNode* op) final {
+    auto node = Downcast<BufferStore>(StmtExprMutator::VisitStmt_(op));
+    return VisitBufferAccess(std::move(node));
+  }
+
+  template <typename Node>
+  Node VisitBufferAccess(Node node) {
+    Buffer new_buf = GetRemappedBuffer(node->buffer);
+
+    if (!new_buf.same_as(node->buffer)) {
+      auto writer = node.CopyOnWrite();
+      writer->buffer = new_buf;
+    }
+
+    return node;
+  }
+
+  Buffer GetRemappedBuffer(Buffer buf) {
+    auto key = buf.get();
+    auto it = buf_remap_.find(key);
+    if (it != buf_remap_.end()) {
+      return it->second;
+    }
+
+    auto new_buffer_var = vmap_(buf->data);
+    if (new_buffer_var.defined() && !new_buffer_var.value().same_as(buf->data)) {
+      auto writer = buf.CopyOnWrite();
+      writer->data = Downcast<Var>(new_buffer_var);
+    }
+
+    buf_remap_[key] = buf;
+    return buf;
+  }
+
+  Stmt VisitStmt_(const AttrStmtNode* op) final {
+    Stmt ret = StmtExprMutator::VisitStmt_(op);
+    op = ret.as<AttrStmtNode>();
+    // remap var node in attr
+    if (const auto* var_node = op->node.as<VarNode>()) {
+      if (auto mapped_var = vmap_(GetRef<Var>(var_node))) {
+        return AttrStmt(mapped_var, op->attr_key, op->value, op->body);
+      }
+    }
+    return ret;
+  }
+
+ private:
+  // Caller provided function that defines the variables to be remapped.
+  std::function<Optional<PrimExpr>(const Var&)> vmap_;
+
+  /* \brief Generated map to track buffers being remapped.
+   *
+   * If a `Var BufferNode::data` is remapped, then all buffers
+   * containing that data pointer should also be remapped.  This map
+   * is used to track buffer modifications, and ensure all instances
+   * of a buffer are replaced by the same modified buffer object.
+   */
+  std::unordered_map<const BufferNode*, Buffer> buf_remap_;
+};
+
+Stmt SubstituteWithDataTypeLegalization(Stmt stmt,
+                                        std::function<Optional<PrimExpr>(const Var&)> vmap) {
+  return IRSubstituteWithDataTypeLegalization(vmap)(std::move(stmt));
+}
+
+PrimExpr SubstituteWithDataTypeLegalization(PrimExpr expr,
+                                            std::function<Optional<PrimExpr>(const Var&)> vmap) {
+  return IRSubstituteWithDataTypeLegalization(vmap)(std::move(expr));
+}
+
 TVM_REGISTER_GLOBAL("tir.IRTransform").set_body_typed(IRTransform);
 
 TVM_REGISTER_GLOBAL("tir.PostOrderVisit").set_body_typed([](ObjectRef node, PackedFunc f) {
diff --git a/tests/python/unittest/test_index_map.py b/tests/python/unittest/test_index_map.py
index 6882c2b42634..ac128690c415 100644
--- a/tests/python/unittest/test_index_map.py
+++ b/tests/python/unittest/test_index_map.py
@@ -21,6 +21,7 @@
 import tvm.testing
 from tvm.ir import assert_structural_equal
 from tvm.tir import IndexMap, IntImm, floordiv, floormod
+from tvm.runtime import const
 
 
 def assert_equal_index_map(map1: IndexMap, map2: IndexMap) -> None:
@@ -41,6 +42,9 @@ def test_index_mapping():
     assert_structural_equal(index_map.map_indices([3]), [0, 3])
     assert_structural_equal(index_map.map_indices([4]), [1, 0])
     assert_structural_equal(index_map.map_indices([42]), [10, 2])
+    assert_structural_equal(
+        index_map.map_indices([const(42, "int64")]), [const(10, "int64"), const(2, "int64")]
+    )
 
 
 def test_shape_mapping():
@@ -50,6 +54,12 @@ def test_shape_mapping():
     assert_structural_equal(index_map.map_shape([16]), [4, 4])
 
     assert_structural_equal(index_map.map_shape([14]), [4, 4])
+    assert_structural_equal(
+        index_map.map_shape([const(16, "int64")]), [const(4, "int64"), const(4, "int64")]
+    )
+    assert_structural_equal(
+        index_map.map_shape([const(14, "int64")]), [const(4, "int64"), const(4, "int64")]
+    )
 
 
 def test_inverse():
diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py
index 174e9eb25cc0..0bf75becb2c0 100644
--- a/tests/python/unittest/test_tir_schedule_transform_layout.py
+++ b/tests/python/unittest/test_tir_schedule_transform_layout.py
@@ -376,6 +376,41 @@ def test_transform_block_layout_fail_mixed_iter_type(use_block_name):
         )
 
 
+def test_transform_block_layout_int64_extent(use_block_name):
+    @T.prim_func
+    def elementwise_int64_extent(
+        A: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+        B: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+    ) -> None:
+        for i, j in T.grid(T.int64(128), T.int64(128)):
+            with T.block("B"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                B[vi, vj] = A[vi, vj] * 2.0
+
+    @T.prim_func
+    def elementwise_int64_extent_transformed(
+        A: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+        B: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+    ) -> None:
+        for i in range(T.int64(16384)):
+            with T.block("B"):
+                vi = T.axis.remap("S", [i])
+                B[vi // T.int64(128), vi % T.int64(128)] = (
+                    A[vi // T.int64(128), vi % T.int64(128)] * 2.0
+                )
+
+    sch = tir.Schedule(elementwise_int64_extent, debug_mask="all")
+    block = "B" if use_block_name else sch.get_block("B")
+    sch.transform_block_layout(block, lambda i, j: (i * 128 + j,))
+    print(
+        tvm.ir.base.get_first_structural_mismatch(
+            elementwise_int64_extent_transformed, sch.mod["main"]
+        )
+    )
+    tvm.ir.assert_structural_equal(elementwise_int64_extent_transformed, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=elementwise_int64_extent)
+
+
 class BasePaddingCompare(tvm.testing.CompareBeforeAfter):
     pad_value = tvm.testing.parameter(None)
 

From 59c0ef80b5186e1840523f6a8b60071f78436212 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Thu, 20 Oct 2022 10:02:47 +0300
Subject: [PATCH 397/704] [Android] Fix cpp_rpc build for android with NDK>=23
 (#13118)

With NDK>= 23 when we specify explicitly linker then we got the
following error:

```
ld.gold: --no-rosegment: unknown option
```

From the github repository of NDK, I found that the only one right way
to configure linker is using `-DANDROID_LD` variable:
https://github.com/android/ndk/issues/1426#issuecomment-760432467

Removed setting linker manually and by default `LLD` will be used.

Checked that it works on the following versions of NDK:
- 20.0.5594570
- 21.4.7075529
- 22.1.7171670
- 23.0.7599858
- 23.1.7779620
- 24.0.8215888
- 25.1.8937393
---
 apps/cpp_rpc/CMakeLists.txt | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/apps/cpp_rpc/CMakeLists.txt b/apps/cpp_rpc/CMakeLists.txt
index 9f1180ee0fd3..97c859045d76 100644
--- a/apps/cpp_rpc/CMakeLists.txt
+++ b/apps/cpp_rpc/CMakeLists.txt
@@ -32,12 +32,6 @@ if (OS)
    endif()
 endif()
 
-if(USE_OPENCL)
-  if (ANDROID_ABI)
-    set_property(TARGET tvm_rpc PROPERTY LINK_FLAGS -fuse-ld=gold)
-  endif()
-endif()
-
 target_include_directories(
   tvm_rpc
   PUBLIC "../../include"

From 3ae326cb76ead9de7b5559cb5bcac4be49b0059e Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Thu, 20 Oct 2022 00:13:51 -0700
Subject: [PATCH 398/704] [MetaSchedule] Support Tuning w/ No Cost Model & Fix
 Integration Test (#13151)

Add none for cost model & fix integration test.
---
 .../meta_schedule/cost_model/cost_model.py    |  8 ++-
 tests/python/integration/test_tuning.py       | 63 ++++++++++++-------
 2 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/python/tvm/meta_schedule/cost_model/cost_model.py b/python/tvm/meta_schedule/cost_model/cost_model.py
index 54a4d7a34391..f139fcc4e4b3 100644
--- a/python/tvm/meta_schedule/cost_model/cost_model.py
+++ b/python/tvm/meta_schedule/cost_model/cost_model.py
@@ -106,7 +106,7 @@ def predict(self, context: TuneContext, candidates: List[MeasureCandidate]) -> n
 
     @staticmethod
     def create(
-        kind: Literal["xgb", "mlp", "random"],
+        kind: Literal["xgb", "mlp", "random", "none"],
         *args,
         **kwargs,
     ) -> "CostModel":
@@ -114,8 +114,8 @@ def create(
 
         Parameters
         ----------
-        kind : Literal["xgb", "mlp", "random"]
-            The kind of the cost model. Can be "xgb", "mlp", or "random".
+        kind : Literal["xgb", "mlp", "random", "none"]
+            The kind of the cost model. Can be "xgb", "mlp", "random" or "none".
 
         Returns
         -------
@@ -134,6 +134,8 @@ def create(
             )
 
             return MLPModel(*args, **kwargs)  # type: ignore
+        if kind == "none":
+            return None  # no cost model required
         raise ValueError(f"Unknown CostModel: {kind}")
 
 
diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py
index af5143908108..589845b13f7f 100644
--- a/tests/python/integration/test_tuning.py
+++ b/tests/python/integration/test_tuning.py
@@ -26,6 +26,7 @@
 from tvm import relay
 from tvm.contrib import graph_executor
 from tvm.meta_schedule.testing.relay_workload import get_network
+from tvm.meta_schedule.testing.tune_utils import generate_input_data
 from tvm.target.target import Target
 
 logging.basicConfig(
@@ -37,23 +38,21 @@
 
 @pytest.mark.skip("Integration test")
 @pytest.mark.parametrize(
-    "model_name, input_shape, target, layout",
+    "model_name, input_shape, data_type, target, layout",
     [
-        ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC"),
-        ("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3090-ti", "NHWC"),
+        ("resnet_18", [1, 3, 224, 224], "float32", "llvm --num-cores=12", "NHWC"),
+        ("resnet_18", [1, 3, 224, 224], "float32", "nvidia/geforce-rtx-3070", "NHWC"),
     ],
 )
 def test_meta_schedule_tune_relay(
     model_name: str,
     input_shape: List[int],
+    data_type: str,
     target: str,
     layout: Optional[str],
 ):
     dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
-    if model_name.startswith("bert"):
-        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
-    else:
-        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)
+    data = generate_input_data(input_shape, data_type)
 
     mod, params, (input_name, _, _) = get_network(
         name=model_name,
@@ -78,22 +77,44 @@ def test_meta_schedule_tune_relay(
                 params=params,
             )
         print(profiler.table())
-        # Compile without meta-schedule for correctness check
-        with tvm.transform.PassContext(opt_level=0):
-            rt_mod2 = relay.build(mod, target=target, params=params)
 
-        def get_output(data, lib):
-            module = graph_executor.GraphModule(lib["default"](dev))
-            module.set_input(input_name, data)
-            module.run()
-            return module.get_output(0).numpy()
+    def get_output(data, lib, dev):
+        module = graph_executor.GraphModule(lib["default"](dev))
+        module.set_input(input_name, tvm.nd.array(data, device=dev))
+        module.run()
+        return module.get_output(0).numpy()
 
-        # Check correctness
-        actual_output = get_output(data, rt_mod1)
-        expected_output = get_output(data, rt_mod2)
-        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
+    # Check correctness
+    actual_output = get_output(data, rt_mod1, dev)
+    print(
+        f"{model_name} finished tuning and running on {Target(target).kind.name}. "
+        "Running baseline...",
+        flush=True,
+    )
+
+    # Compile without meta-schedule for correctness check
+    baseline_target = "llvm -num-cores=1"
+    with tvm.transform.PassContext(opt_level=0):
+        rt_mod2 = relay.build(mod, target=baseline_target, params=params)
+
+    expected_output = get_output(data, rt_mod2, tvm.cpu())
+    print(
+        f"Basline finished running on {Target(baseline_target).kind.name}. "
+        "Verifying correctness...",
+        flush=True,
+    )
+
+    assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
+    print(
+        f"Correctness verified for {model_name} on {Target(target).kind.name}.",
+        flush=True,
+    )
 
 
 if __name__ == """__main__""":
-    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC")
-    test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3090-ti", None)
+    test_meta_schedule_tune_relay(
+        "resnet_18", [1, 3, 224, 224], "float32", "llvm --num-cores=12", "NHWC"
+    )
+    test_meta_schedule_tune_relay(
+        "resnet_18", [1, 3, 224, 224], "float32", "nvidia/geforce-rtx-3070", None
+    )

From 308c20a4ca4814932e2b1f443317eac6564b304f Mon Sep 17 00:00:00 2001
From: Christopher Sidebottom <chris.sidebottom@arm.com>
Date: Thu, 20 Oct 2022 10:24:44 +0100
Subject: [PATCH 399/704] [Target] Add Target Parser for Arm(R) Cortex(R)
 A-Profile CPUs (#12454)

This implements an initial Target Parser which aims to consolidate architecture feature detection from a few different places:
* https://github.com/apache/tvm/blob/d2db9cb0d839e32778f461b77e59f6418282a511/python/tvm/topi/arm_cpu/arm_utils.py#L24-L70
* https://github.com/apache/tvm/blob/02fbaf0ed9120a8f95155e63de42459f230584aa/python/tvm/relay/qnn/op/legalizations.py#L350-L359
* https://github.com/apache/tvm/blob/b542724873140bb051492530d97a78b9b7b7983d/python/tvm/relay/op/strategy/arm_cpu.py#L232

A further patch will remove all of the above and replace usages with the `.features` map.
---
 src/target/parsers/aprofile.cc            | 160 ++++++++++++
 src/target/parsers/aprofile.h             |  43 ++++
 tests/cpp/target/parsers/aprofile_test.cc | 299 ++++++++++++++++++++++
 3 files changed, 502 insertions(+)
 create mode 100644 src/target/parsers/aprofile.cc
 create mode 100644 src/target/parsers/aprofile.h
 create mode 100644 tests/cpp/target/parsers/aprofile_test.cc

diff --git a/src/target/parsers/aprofile.cc b/src/target/parsers/aprofile.cc
new file mode 100644
index 000000000000..2fd5fe71e617
--- /dev/null
+++ b/src/target/parsers/aprofile.cc
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/target/parsers/aprofile.cc
+ * \brief Target Parser for Arm(R) Cortex(R) A-Profile CPUs
+ */
+
+#include "aprofile.h"
+
+#include <string>
+
+#include "../../support/utils.h"
+
+namespace tvm {
+namespace target {
+namespace parsers {
+namespace aprofile {
+
+double GetArchVersion(Array<String> mattr) {
+  for (const String& attr : mattr) {
+    std::string attr_string = attr;
+    size_t attr_len = attr_string.size();
+    if (attr_len >= 4 && attr_string.substr(0, 2) == "+v" && attr_string.back() == 'a') {
+      std::string version_string = attr_string.substr(2, attr_string.size() - 2);
+      return atof(version_string.data());
+    }
+  }
+  return 0.0;
+}
+
+double GetArchVersion(Optional<Array<String>> attr) {
+  if (!attr) {
+    return false;
+  }
+  return GetArchVersion(attr.value());
+}
+
+static inline bool HasFlag(String attr, std::string flag) {
+  std::string attr_str = attr;
+  return attr_str.find(flag) != std::string::npos;
+}
+
+static inline bool HasFlag(Optional<String> attr, std::string flag) {
+  if (!attr) {
+    return false;
+  }
+  return HasFlag(attr.value(), flag);
+}
+
+static inline bool HasFlag(Optional<Array<String>> attr, std::string flag) {
+  if (!attr) {
+    return false;
+  }
+  Array<String> attr_array = attr.value();
+
+  auto matching_attr = std::find_if(attr_array.begin(), attr_array.end(),
+                                    [flag](String attr_str) { return HasFlag(attr_str, flag); });
+  return matching_attr != attr_array.end();
+}
+
+static bool HasFlag(Optional<String> mcpu, Optional<Array<String>> mattr, std::string flag) {
+  return HasFlag(mcpu, flag) || HasFlag(mattr, flag);
+}
+
+bool IsAArch32(Optional<String> mtriple, Optional<String> mcpu) {
+  if (mtriple) {
+    bool is_mprofile = mcpu && support::StartsWith(mcpu.value(), "cortex-m");
+    return support::StartsWith(mtriple.value(), "arm") && !is_mprofile;
+  }
+  return false;
+}
+
+bool IsAArch64(Optional<String> mtriple) {
+  if (mtriple) {
+    return support::StartsWith(mtriple.value(), "aarch64");
+  }
+  return false;
+}
+
+bool IsArch(TargetJSON attrs) {
+  Optional<String> mtriple = Downcast<Optional<String>>(attrs.Get("mtriple"));
+  Optional<String> mcpu = Downcast<Optional<String>>(attrs.Get("mcpu"));
+
+  return IsAArch32(mtriple, mcpu) || IsAArch64(mtriple);
+}
+
+static TargetFeatures GetFeatures(TargetJSON target) {
+  Optional<String> mcpu = Downcast<Optional<String>>(target.Get("mcpu"));
+  Optional<String> mtriple = Downcast<Optional<String>>(target.Get("mtriple"));
+  Optional<Array<String>> mattr = Downcast<Optional<Array<String>>>(target.Get("mattr"));
+
+  double arch_version = GetArchVersion(mattr);
+
+  bool is_aarch64 = IsAArch64(mtriple);
+
+  bool simd_flag = HasFlag(mcpu, mattr, "+neon") || HasFlag(mcpu, mattr, "+simd");
+  bool has_asimd = is_aarch64 || simd_flag;
+
+  bool i8mm_flag = HasFlag(mcpu, mattr, "+i8mm");
+  bool i8mm_disable = HasFlag(mcpu, mattr, "+noi8mm");
+  bool i8mm_default = arch_version >= 8.6;
+  bool i8mm_support = arch_version >= 8.2 && arch_version <= 8.5;
+  bool has_i8mm = (i8mm_default && !i8mm_disable) || (i8mm_support && i8mm_flag);
+
+  bool dotprod_flag = HasFlag(mcpu, mattr, "+dotprod");
+  bool dotprod_disable = HasFlag(mcpu, mattr, "+nodotprod");
+  bool dotprod_default = arch_version >= 8.4;
+  bool dotprod_support = arch_version >= 8.2 && arch_version <= 8.3;
+  bool has_dotprod = (dotprod_default && !dotprod_disable) || (dotprod_support && dotprod_flag);
+
+  return {
+      {"is_aarch64", Bool(is_aarch64)},
+      {"has_asimd", Bool(has_asimd)},
+      {"has_dotprod", Bool(has_dotprod)},
+      {"has_matmul_i8", Bool(has_i8mm)},
+  };
+}
+
+static Array<String> MergeKeys(Optional<Array<String>> existing_keys) {
+  const String kExtraKey = "arm_cpu";
+
+  if (!existing_keys) {
+    return {kExtraKey};
+  }
+
+  Array<String> keys = existing_keys.value();
+  if (std::find(keys.begin(), keys.end(), kExtraKey) == keys.end()) {
+    keys.push_back(kExtraKey);
+  }
+  return keys;
+}
+
+TargetJSON ParseTarget(TargetJSON target) {
+  target.Set("features", GetFeatures(target));
+  target.Set("keys", MergeKeys(Downcast<Optional<Array<String>>>(target.Get("keys"))));
+
+  return target;
+}
+
+}  // namespace aprofile
+}  // namespace parsers
+}  // namespace target
+}  // namespace tvm
diff --git a/src/target/parsers/aprofile.h b/src/target/parsers/aprofile.h
new file mode 100644
index 000000000000..7ded9ac5456a
--- /dev/null
+++ b/src/target/parsers/aprofile.h
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/target/parsers/aprofile.h
+ * \brief Target Parser for Arm(R) Cortex(R) A-Profile CPUs
+ */
+
+#ifndef TVM_TARGET_PARSERS_APROFILE_H_
+#define TVM_TARGET_PARSERS_APROFILE_H_
+
+#include <tvm/target/target.h>
+
+namespace tvm {
+namespace target {
+namespace parsers {
+namespace aprofile {
+
+bool IsArch(TargetJSON target);
+TargetJSON ParseTarget(TargetJSON target);
+
+}  // namespace aprofile
+}  // namespace parsers
+}  // namespace target
+}  // namespace tvm
+
+#endif  // TVM_TARGET_PARSERS_APROFILE_H_
diff --git a/tests/cpp/target/parsers/aprofile_test.cc b/tests/cpp/target/parsers/aprofile_test.cc
new file mode 100644
index 000000000000..0382e7a84bd7
--- /dev/null
+++ b/tests/cpp/target/parsers/aprofile_test.cc
@@ -0,0 +1,299 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "../src/target/parsers/aprofile.h"
+
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <string>
+
+namespace tvm {
+namespace target {
+namespace parsers {
+namespace aprofile {
+
+static float defaultI8MM = 8.6;
+static float optionalI8MM[] = {8.2, 8.3, 8.4, 8.5};
+static float defaultDotProd = 8.4;
+static float optionalDotProd[] = {8.2, 8.3};
+
+class AProfileOptionalI8MM : public testing::TestWithParam<float> {};
+class AProfileOptionalDotProd : public testing::TestWithParam<float> {};
+
+static TargetFeatures ParseTargetWithAttrs(String mcpu, String mtriple, Array<String> mattr) {
+  return ParseTarget({
+      {"mcpu", mcpu},
+      {"mtriple", mtriple},
+      {"mattr", mattr},
+  });
+}
+
+TEST(AProfileParser, ParseTargetKeys) {
+  TargetJSON target = ParseTarget({});
+  Array<String> keys = Downcast<Array<String>>(target.at("keys"));
+  ASSERT_EQ(keys.size(), 1);
+  ASSERT_EQ(keys[0], "arm_cpu");
+}
+
+TEST(AProfileParser, ParseTargetWithExistingKeys) {
+  TargetJSON target = ParseTarget({
+      {"keys", Array<String>{"cpu"}},
+  });
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  Array<String> keys = Downcast<Array<String>>(target.at("keys"));
+  ASSERT_EQ(keys.size(), 2);
+  ASSERT_EQ(keys[0], "cpu");
+  ASSERT_EQ(keys[1], "arm_cpu");
+}
+
+TEST(AProfileParser, ParseTargetWithDuplicateKey) {
+  TargetJSON target = ParseTarget({
+      {"keys", Array<String>{"cpu", "arm_cpu"}},
+  });
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  Array<String> keys = Downcast<Array<String>>(target.at("keys"));
+  ASSERT_EQ(keys.size(), 2);
+  ASSERT_EQ(keys[0], "cpu");
+  ASSERT_EQ(keys[1], "arm_cpu");
+}
+
+TEST(AProfileParser, ParseTargetDefaults) {
+  TargetJSON target = ParseTarget({});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+
+  ASSERT_EQ(Downcast<Bool>(features.at("is_aarch64")), false);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_asimd")), false);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_dotprod")), false);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_matmul_i8")), false);
+}
+
+TEST(AProfileParser, IsAArch64Triple) {
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {""});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("is_aarch64")), true);
+}
+
+TEST(AProfileParser, IsAArch32Triple) {
+  TargetJSON target = ParseTargetWithAttrs("", "armv7a-arm-none-eabi", {""});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("is_aarch64")), false);
+
+  target = ParseTargetWithAttrs("", "armv8a-arm-none-eabi", {""});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("is_aarch64")), false);
+
+  target = ParseTargetWithAttrs("", "arm-unknown-linux-gnu", {""});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("is_aarch64")), false);
+}
+
+TEST(AProfileParser, IsAArch32BlankCPU) {
+  TargetJSON target = ParseTarget({
+      {"mtriple", String("arm-unknown-linux-gnu")},
+  });
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+}
+
+TEST(AProfileParser, IsAArch32TripleWithAProfile) {
+  TargetJSON target = ParseTargetWithAttrs("cortex-a53", "armv7a-arm-none-eabi", {""});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("is_aarch64")), false);
+
+  target = ParseTargetWithAttrs("cortex-a53", "armv8a-arm-none-eabi", {""});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("is_aarch64")), false);
+
+  target = ParseTargetWithAttrs("cortex-a53", "arm-unknown-linux-gnu", {""});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("is_aarch64")), false);
+}
+
+TEST(AProfileParser, IsAArch32TripleWithMProfile) {
+  TargetJSON target = ParseTargetWithAttrs("cortex-m33", "armv7a-arm-none-eabi", {""});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), false);
+
+  target = ParseTargetWithAttrs("cortex-m33", "armv8a-arm-none-eabi", {""});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), false);
+
+  target = ParseTargetWithAttrs("cortex-m33", "arm-unknown-linux-gnu", {""});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), false);
+}
+
+TEST(AProfileParser, AArch64HasASIMD) {
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {""});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_asimd")), true);
+}
+
+TEST(AProfileParser, AArch32NoASIMD) {
+  TargetJSON target = ParseTargetWithAttrs("", "armv8a-arm-none-eabi", {});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_asimd")), false);
+}
+
+TEST(AProfileParser, AArch32HasASIMDWithOption) {
+  TargetJSON target = ParseTargetWithAttrs("", "armv8a-arm-none-eabi", {"+simd"});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_asimd")), true);
+
+  target = ParseTargetWithAttrs("cortex-a+simd", "armv8a-arm-none-eabi", {""});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_asimd")), true);
+}
+
+TEST(AProfileParser, AArch32HasASIMDWithAlternativeOption) {
+  TargetJSON target = ParseTargetWithAttrs("", "armv8a-arm-none-eabi", {"+neon"});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_asimd")), true);
+
+  target = ParseTargetWithAttrs("cortex-a+neon", "armv8a-arm-none-eabi", {""});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_asimd")), true);
+}
+
+TEST(AProfileParser, NoI8MMSupport) {
+  std::string attr = "+v8.0a";
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {attr, "+i8mm"});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_matmul_i8")), false);
+}
+
+TEST(AProfileParser, DefaultI8MMSupport) {
+  std::string arch_attr = "+v" + std::to_string(defaultI8MM) + "a";
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {arch_attr});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_matmul_i8")), true);
+}
+
+TEST(AProfileParser, DefaultI8MMSupportDisable) {
+  std::string arch_attr = "+v" + std::to_string(defaultI8MM) + "a";
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {arch_attr, "+noi8mm"});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_matmul_i8")), false);
+
+  target = ParseTargetWithAttrs("cortex-a+noi8mm", "aarch64-arm-none-eabi", {arch_attr});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_matmul_i8")), false);
+}
+
+TEST_P(AProfileOptionalI8MM, OptionalI8MMSupport) {
+  std::string arch_attr = "+v" + std::to_string(GetParam()) + "a";
+
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {arch_attr});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_matmul_i8")), false);
+
+  target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {arch_attr, "+i8mm"});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_matmul_i8")), true);
+
+  target = ParseTargetWithAttrs("cortex-a+i8mm", "aarch64-arm-none-eabi", {arch_attr});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_matmul_i8")), true);
+}
+
+TEST(AProfileParser, NoDotProdSupport) {
+  std::string attr = "+v8.0a";
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {attr, "+dotprod"});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_dotprod")), false);
+}
+
+TEST(AProfileParser, DefaultDotProdSupport) {
+  std::string arch_attr = "+v" + std::to_string(defaultDotProd) + "a";
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {arch_attr});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_dotprod")), true);
+}
+
+TEST(AProfileParser, DefaultDotProdSupportDisable) {
+  std::string arch_attr = "+v" + std::to_string(defaultDotProd) + "a";
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {arch_attr, "+nodotprod"});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_dotprod")), false);
+
+  target = ParseTargetWithAttrs("cortex-a+nodotprod", "aarch64-arm-none-eabi", {arch_attr});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_dotprod")), false);
+}
+
+TEST_P(AProfileOptionalDotProd, OptionalDotProdSupport) {
+  std::string arch_attr = "+v" + std::to_string(GetParam()) + "a";
+
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {arch_attr});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_dotprod")), false);
+
+  target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {arch_attr, "+dotprod"});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_dotprod")), true);
+
+  target = ParseTargetWithAttrs("cortex-a+dotprod", "aarch64-arm-none-eabi", {arch_attr});
+  features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_dotprod")), true);
+}
+
+TEST(AProfileParser, ArchVersionInvalidLetter) {
+  std::string arch_attr = "+v" + std::to_string(defaultDotProd) + "b";
+  TargetJSON target = ParseTargetWithAttrs("", "aarch64-arm-none-eabi", {arch_attr});
+  TargetFeatures features = Downcast<TargetFeatures>(target.at("features"));
+  ASSERT_EQ(IsArch(target), true);
+  ASSERT_EQ(Downcast<Bool>(features.at("has_dotprod")), false);
+}
+
+INSTANTIATE_TEST_CASE_P(AProfileParser, AProfileOptionalI8MM, ::testing::ValuesIn(optionalI8MM));
+INSTANTIATE_TEST_CASE_P(AProfileParser, AProfileOptionalDotProd,
+                        ::testing::ValuesIn(optionalDotProd));
+
+}  // namespace aprofile
+}  // namespace parsers
+}  // namespace target
+}  // namespace tvm

From 026c2db544d345a051f9b0ed9726fb4b84ee8bb8 Mon Sep 17 00:00:00 2001
From: Oleksandr Viazlo <oleksandr.viazlo@axelera.ai>
Date: Thu, 20 Oct 2022 12:28:43 +0200
Subject: [PATCH 400/704] Remove assert to fix onnx frontent unit tests for
 PyTorch 1.12 (#13155)

remove assert to fix onnx frontent unit tests for PyTorch 1.12
---
 tests/python/frontend/onnx/test_forward.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 9fc00930af0e..3a714af3a7a1 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -5485,7 +5485,6 @@ def _convert_to_onnx(model, inputs):
             operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN,
         )
         onnx_model = onnx.load(file_name)
-        assert 's: "embedding_bag"' in str(onnx_model)
         return onnx_model
 
     def verify_embedding_bag(num_embedding, embedding_dim, data_shape, num_bags=None):

From 209e77c18bb9ad89c707572d4296015fe1054b50 Mon Sep 17 00:00:00 2001
From: Yineng Zhang <me@zhyncs.com>
Date: Thu, 20 Oct 2022 19:14:48 +0800
Subject: [PATCH 401/704] [tvmc] add instruments for PassContext (#13136)

---
 python/tvm/driver/tvmc/compiler.py        | 18 ++++++++++++++----
 tests/python/driver/tvmc/test_compiler.py | 23 +++++++++++++++++++++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py
index 2955df55432d..c24d36c432df 100644
--- a/python/tvm/driver/tvmc/compiler.py
+++ b/python/tvm/driver/tvmc/compiler.py
@@ -19,13 +19,14 @@
 """
 import logging
 import os.path
-from typing import Any, Optional, Dict, List, Union, Callable
+from typing import Any, Optional, Dict, List, Union, Callable, Sequence
 from pathlib import Path
 
 import tvm
 from tvm import autotvm, auto_scheduler
 from tvm import relay
 from tvm.driver.tvmc.registry import generate_registry_args, reconstruct_registry_entity
+from tvm.ir.instrument import PassInstrument
 from tvm.ir.memory_pools import WorkspaceMemoryPools
 from tvm.target import Target
 from tvm.relay.backend import Executor, Runtime
@@ -223,6 +224,7 @@ def compile_model(
     use_vm: bool = False,
     mod_name: Optional[str] = "default",
     workspace_pools: Optional[WorkspaceMemoryPools] = None,
+    instruments: Optional[Sequence[PassInstrument]] = None,
 ):
     """Compile a model from a supported framework into a TVM module.
 
@@ -277,6 +279,8 @@ def compile_model(
     workspace_pools: WorkspaceMemoryPools, optional
         Specification of WorkspacePoolInfo objects to be used as workspace memory in the
         compilation.
+    instruments: Optional[Sequence[PassInstrument]]
+        The list of pass instrument implementations.
 
     Returns
     -------
@@ -316,7 +320,10 @@ def compile_model(
             with auto_scheduler.ApplyHistoryBest(tuning_records):
                 config["relay.backend.use_auto_scheduler"] = True
                 with tvm.transform.PassContext(
-                    opt_level=opt_level, config=config, disabled_pass=disabled_pass
+                    opt_level=opt_level,
+                    config=config,
+                    disabled_pass=disabled_pass,
+                    instruments=instruments,
                 ):
                     logger.debug("building relay graph with autoscheduler")
                     graph_module = build(
@@ -332,7 +339,10 @@ def compile_model(
         else:
             with autotvm.apply_history_best(tuning_records):
                 with tvm.transform.PassContext(
-                    opt_level=opt_level, config=config, disabled_pass=disabled_pass
+                    opt_level=opt_level,
+                    config=config,
+                    disabled_pass=disabled_pass,
+                    instruments=instruments,
                 ):
                     logger.debug("building relay graph with tuning records")
                     graph_module = build(
@@ -347,7 +357,7 @@ def compile_model(
                     )
     else:
         with tvm.transform.PassContext(
-            opt_level=opt_level, config=config, disabled_pass=disabled_pass
+            opt_level=opt_level, config=config, disabled_pass=disabled_pass, instruments=instruments
         ):
             logger.debug("building relay graph (no tuning records provided)")
             graph_module = build(
diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py
index 5535fc02249f..7cb50dd0e366 100644
--- a/tests/python/driver/tvmc/test_compiler.py
+++ b/tests/python/driver/tvmc/test_compiler.py
@@ -516,6 +516,7 @@ def test_compile_check_configs_composite_target(mock_pkg, mock_pc, mock_fe, mock
         config={"relay.ext.mock.options": {"testopt": "value"}},
         opt_level=3,
         disabled_pass=None,
+        instruments=None,
     )
     mock_pc.assert_has_calls(
         [
@@ -697,5 +698,27 @@ def test_compile_check_workspace_pools(mock_pkg, mock_fe, mock_relay):
     assert mock_relay.call_args_list[0][1]["workspace_memory_pools"] == memory_pools
 
 
+def test_compile_check_pass_instrument(keras_resnet50):
+    pytest.importorskip("tensorflow")
+
+    @tvm.instrument.pass_instrument
+    class PassesCounter:
+        def __init__(self):
+            self.run_before_count = 0
+            self.run_after_count = 0
+
+        def run_before_pass(self, mod, info):
+            self.run_before_count = self.run_before_count + 1
+
+        def run_after_pass(self, mod, info):
+            self.run_after_count = self.run_after_count + 1
+
+    passes_counter = PassesCounter()
+    tvmc_model = tvmc.load(keras_resnet50)
+    tvmc.compile(tvmc_model, target="llvm", instruments=[passes_counter])
+    assert passes_counter.run_after_count > 0
+    assert passes_counter.run_after_count == passes_counter.run_before_count
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 4fe8e96c77261515e3ef1132256ff4a8d48e60d6 Mon Sep 17 00:00:00 2001
From: Anirudh Sundar Subramaniam <quic_sanirudh@quicinc.com>
Date: Thu, 20 Oct 2022 18:08:15 +0530
Subject: [PATCH 402/704] [Hexagon] Set c++17 standard for launcher (#13140)

The current launcher build fails because of missing c++17 flag, so this
patch fixes that error
---
 apps/hexagon_launcher/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/apps/hexagon_launcher/CMakeLists.txt b/apps/hexagon_launcher/CMakeLists.txt
index 122818b89e4d..35cdb316c15d 100644
--- a/apps/hexagon_launcher/CMakeLists.txt
+++ b/apps/hexagon_launcher/CMakeLists.txt
@@ -44,6 +44,7 @@ ExternalProject_Add(android_launcher_binaries
   "-DCMAKE_TOOLCHAIN_FILE=${USE_ANDROID_TOOLCHAIN}"
   "-DANDROID_PLATFORM=${ANDROID_PLATFORM}"
   "-DANDROID_ABI=${ANDROID_ABI}"
+  "-DCMAKE_CXX_STANDARD=17"
   "-DUSE_HEXAGON_SDK=${USE_HEXAGON_SDK}"
   "-DUSE_HEXAGON_ARCH=${USE_HEXAGON_ARCH}"
   INSTALL_COMMAND ""
@@ -64,6 +65,7 @@ ExternalProject_Add(hexagon_launcher_binaries
   CMAKE_ARGS
   "-DCMAKE_C_COMPILER=${USE_HEXAGON_TOOLCHAIN}/bin/hexagon-clang"
   "-DCMAKE_CXX_COMPILER=${USE_HEXAGON_TOOLCHAIN}/bin/hexagon-clang++"
+  "-DCMAKE_CXX_STANDARD=17"
   "-DUSE_HEXAGON_ARCH=${USE_HEXAGON_ARCH}"
   "-DUSE_HEXAGON_SDK=${USE_HEXAGON_SDK}"
   "-DUSE_CUSTOM_LOGGING=ON"

From 57b998892295deb47e07dbe063cc24fc15a95c22 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 21 Oct 2022 00:48:45 +0900
Subject: [PATCH 403/704] [Hexagon] Add scripts for e2e MetaSchedule tuning
 demonstration (#13135)

I've worked on a series of PRs to enable e2e MS tuning on Hexagon (mostly for supporting link-params = True in MS). Now that all the pieces have been upstreamed, I'm adding demo tuning scripts under test_hexagon/metaschedule_e2e.

They are not run on CI, and running it locally requires PyTorch to generate fp16 and int8 resnet50.

The scripts use a small number of tuning trials and replay-trace search strategy instead of the evolutionary search, to finish tuning quickly. Those interested in MS tuning can tweak these settings for better performance at the cost of more tuning times.

* change dtype in tvmscript roundtrip test to avoid int printing error

* allow printing non int8 array

* Revert "change dtype in tvmscript roundtrip test to avoid int printing error"

* add loose assert check on fp16 result
---
 python/tvm/tir/tensor_intrin/hexagon.py       |  50 +++++
 src/printer/tvmscript_printer.cc              |   9 +-
 .../test_hexagon/metaschedule_e2e/__init__.py |  18 ++
 .../metaschedule_e2e/export_models.py         |  74 +++++++
 .../metaschedule_e2e/test_resnet50_fp16.py    | 128 ++++++++++++
 .../metaschedule_e2e/test_resnet50_int8.py    | 186 ++++++++++++++++++
 6 files changed, 464 insertions(+), 1 deletion(-)
 create mode 100644 tests/python/contrib/test_hexagon/metaschedule_e2e/__init__.py
 create mode 100644 tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py
 create mode 100644 tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
 create mode 100644 tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py

diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py
index 0227312d6373..3cad94006dd8 100644
--- a/python/tvm/tir/tensor_intrin/hexagon.py
+++ b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -64,8 +64,58 @@ def dot_product_32x4_u8u8i32_vrmpy(
         )
 
 
+@T.prim_func
+def dot_product_32x4_u8i8i32_desc(
+    A: T.Buffer((4,), "uint8", offset_factor=1),
+    B: T.Buffer((32, 4), "int8", offset_factor=1),
+    C: T.Buffer((32,), "int32", offset_factor=1),
+) -> None:
+    with T.block("root"):
+        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+        T.writes(C[0:32])
+        for i in T.serial(0, 32):
+            with T.init():
+                C[i] = T.int32(0)
+            for k in T.serial(0, 4):
+                with T.block("update"):
+                    vi, vk = T.axis.remap("SR", [i, k])
+                    C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
+
+
+@T.prim_func
+def dot_product_32x4_u8i8i32_vrmpy(
+    A: T.Buffer((4,), "uint8", offset_factor=1),
+    B: T.Buffer((32, 4), "int8", offset_factor=1),
+    C: T.Buffer((32,), "int32", offset_factor=1),
+) -> None:
+    with T.block("root"):
+        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+        T.writes(C[0:32])
+
+        A_u8x4 = A.vload([0], "uint8x4")
+        A_i32 = T.reinterpret(A_u8x4, dtype="int32")
+
+        B_i8x128 = B.vload([0, 0], dtype="int8x128")
+        B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+
+        C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
+            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpybusv.acc.128B"),
+            T.uint32(3),
+            C[T.ramp(T.int32(0), 1, 32)],
+            T.broadcast(A_i32, 32),
+            B_i32x32,
+            dtype="int32x32",
+        )
+
+
 VRMPY_u8u8i32_INTRIN = "dot_32x4_u8u8i32_vrmpy"
 
 TensorIntrin.register(
     VRMPY_u8u8i32_INTRIN, dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy
 )
+
+VRMPY_u8i8i32_INTRIN = "dot_32x4_u8i8i32_vrmpy"
+
+TensorIntrin.register(
+    VRMPY_u8i8i32_INTRIN, dot_product_32x4_u8i8i32_desc, dot_product_32x4_u8i8i32_vrmpy
+)
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index 936ac7580f28..39eb245f3ad9 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -457,6 +457,13 @@ class TVMScriptPrinter : public StmtFunctor<Doc(const Stmt&)>,
  */
 template <typename T>
 void NDArrayToTIR(::tvm::runtime::NDArray arr, std::ostream& os) {
+  if ((arr.DataType().code() == runtime::DataType::kInt ||
+       arr.DataType().code() == runtime::DataType::kUInt) &&
+      arr.DataType().bits() == 8) {
+    // Printing int8 NDArrays causes "UnicodeDecodeError: 'utf-8' codec can't decode byte"
+    // error during MetaSchedule tuning on int8 models.
+    return;
+  }
   int ndim = arr->ndim;
   int tot_dim = 1;
   for (int i = 0; i < ndim; i++) {
@@ -1166,7 +1173,7 @@ Doc TVMScriptPrinter::VisitStmt_(const AllocateConstNode* alloc) {
     }
   } else if (alloc->dtype.is_uint()) {
     if (alloc->dtype.bits() == 8) {
-      // NDArrayToTIR<uint8_t>(data, ss);
+      NDArrayToTIR<uint8_t>(data, ss);
     } else if (alloc->dtype.bits() == 16) {
       NDArrayToTIR<uint16_t>(data, ss);
     } else if (alloc->dtype.bits() == 32) {
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/__init__.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/__init__.py
new file mode 100644
index 000000000000..9a0514058201
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Demonstration of end-to-end MetaSchedule tuning."""
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py
new file mode 100644
index 000000000000..660fbf757284
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py
@@ -0,0 +1,74 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import torch
+from torchvision.models import resnet
+from torchvision.models.quantization import resnet as qresnet
+
+import tvm
+from tvm import relay
+
+
+def export_resnet50_fp16():
+    model = resnet.resnet50(pretrained=True).eval()
+
+    pt_inp = torch.randn(1, 3, 224, 224)
+
+    script_module = torch.jit.trace(model, pt_inp).eval()
+
+    input_name = "image"
+    input_shapes = [(input_name, pt_inp.shape)]
+    mod, params = relay.frontend.from_pytorch(script_module, input_shapes)
+    mod = relay.transform.ToMixedPrecision("float16")(mod)
+
+    with open("resnet50_fp16.json", "w") as fo:
+        fo.write(tvm.ir.save_json(mod))
+
+    with open("resnet50_fp16.params", "wb") as fo:
+        fo.write(relay.save_param_dict(params))
+
+
+def export_resnet50_int8():
+    def quantize_model(model, inp):
+        model.fuse_model()
+        model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
+        torch.quantization.prepare(model, inplace=True)
+        model(inp)
+        torch.quantization.convert(model, inplace=True)
+
+    model = qresnet.resnet50(pretrained=True).eval()
+
+    pt_inp = torch.randn(1, 3, 224, 224)
+    quantize_model(model, pt_inp)
+
+    script_module = torch.jit.trace(model, pt_inp).eval()
+
+    input_name = "image"
+    input_shapes = [(input_name, pt_inp.shape)]
+    mod, params = relay.frontend.from_pytorch(
+        script_module, input_shapes, keep_quantized_weight=True
+    )
+
+    with open("resnet50_int8.json", "w") as fo:
+        fo.write(tvm.ir.save_json(mod))
+
+    with open("resnet50_int8.params", "wb") as fo:
+        fo.write(relay.save_param_dict(params))
+
+
+if __name__ == "__main__":
+    export_resnet50_fp16()
+    export_resnet50_int8()
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
new file mode 100644
index 000000000000..4fe21c564330
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
@@ -0,0 +1,128 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import os
+import pytest
+import tempfile
+
+import numpy as np
+
+import tvm.testing
+from tvm import relay
+from tvm import meta_schedule as ms
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+from tvm.relay.backend import Executor
+from ..infrastructure import get_hexagon_target
+
+
+target = get_hexagon_target("v69")
+target_llvm = tvm.target.Target("llvm")
+model_json = "resnet50_fp16.json"
+model_params = "resnet50_fp16.params"
+
+
+def convert_conv2d_layout(mod, desired_layouts):
+    with tvm.transform.PassContext(opt_level=3):
+        seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
+        return seq(mod)
+
+
+@pytest.mark.skip("End-to-end tuning is skipped on CI.")
+@tvm.testing.requires_hexagon
+def test_resnet50(hexagon_launcher):
+    if not os.path.exists(model_json):
+        pytest.skip(msg="Run python export_models.py first.")
+
+    with open(model_json, "r") as fi:
+        mod = tvm.ir.load_json(fi.read())
+
+    with open(model_params, "rb") as fi:
+        params = relay.load_param_dict(fi.read())
+
+    mod = convert_conv2d_layout(mod, {"nn.conv2d": ["NHWC", "HWIO"]})
+
+    inp = np.random.randn(1, 3, 224, 224).astype("float32")
+    input_name = "image"
+
+    executor = Executor("graph", {"link-params": True})
+    # This line is necessary for link-params to take effect during
+    # task extraction and relay.build(...).
+    mod = mod.with_attr("executor", executor)
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            target=target,
+            params=params,
+            work_dir=work_dir,
+            # for faster tuning
+            max_trials_global=20000,
+            max_trials_per_task=8,
+            num_trials_per_iter=8,
+            strategy="replay-trace",
+            # max_trials_global=20000,
+            # num_trials_per_iter=32,
+            # max_trials_per_task=128,
+            # strategy="evolutionary",
+            builder=get_hexagon_local_builder(),
+            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
+            # Without this, the same workloads with different constant weights
+            # are treated as distinct tuning tasks.
+            module_equality="ignore-ndarray",
+        )
+
+        hexagon_lowered = ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            target=target,
+            params=params,
+        )
+
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            params=params,
+        )
+
+        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+        llvm_graph_mod.set_input(input_name, inp.copy())
+        llvm_graph_mod.run()
+        ref_result = llvm_graph_mod.get_output(0).numpy()
+
+    with hexagon_launcher.start_session() as session:
+        graph_mod = session.get_executor_from_factory(hexagon_lowered)
+        graph_mod.set_input(input_name, inp.copy())
+
+        graph_mod.run()
+        hexagon_output = graph_mod.get_output(0).numpy()
+
+        # Example output: max and mean abs difference with the reference: 0.1406 0.0126
+        print(
+            "max and mean abs difference with the reference:",
+            np.max(np.abs(ref_result - hexagon_output)),
+            np.mean(np.abs(ref_result - hexagon_output)),
+        )
+        tvm.testing.assert_allclose(ref_result, hexagon_output, atol=2e-1)
+
+        time_ms = graph_mod.benchmark(session.device, number=1, repeat=20).mean * 1e3
+
+        print("time elapsed: ", time_ms)
+
+        debug_ex = session.get_graph_debug_executor(
+            hexagon_lowered.get_graph_json(), hexagon_lowered.lib
+        )
+        print(debug_ex.profile(input_name=inp.copy()))
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
new file mode 100644
index 000000000000..4c8d91dd27ef
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -0,0 +1,186 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import os
+import numpy as np
+import pytest
+import tempfile
+
+import tvm
+import tvm.testing
+from tvm import relay
+from tvm.meta_schedule import postproc, schedule_rule
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8i8i32_INTRIN, VRMPY_u8u8i32_INTRIN
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+from tvm import meta_schedule as ms
+from ..infrastructure import get_hexagon_target
+
+
+executor = relay.backend.Executor("graph", {"link-params": True})
+target = get_hexagon_target("v68")
+target_llvm = tvm.target.Target("llvm")
+model_json = "resnet50_int8.json"
+model_params = "resnet50_int8.params"
+
+
+def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
+    sch_rules = [
+        schedule_rule.AutoInline(
+            into_producer=False,
+            into_consumer=True,
+            inline_const_tensor=True,
+            disallow_if_then_else=True,
+            require_injective=True,
+            require_ordered=True,
+            disallow_op=["tir.exp"],
+        ),
+        # VRMPY_u8i8i32_INTRIN is used for conv2d. See topi/hexagon/conv2d_alter_op.py
+        # for why we use different intrins for conv2d and dense.
+        schedule_rule.MultiLevelTilingWithIntrin(
+            VRMPY_u8i8i32_INTRIN,
+            structure="SRSRS",
+            tile_binds=None,
+            max_innermost_factor=64,
+            vector_load_lens=None,
+            reuse_read=None,
+            reuse_write=schedule_rule.ReuseType(
+                req="may",
+                levels=[1, 2],
+                scope="global",
+            ),
+        ),
+        # VRMPY_u8u8i32_INTRIN is used for dense
+        schedule_rule.MultiLevelTilingWithIntrin(
+            VRMPY_u8u8i32_INTRIN,
+            structure="SRSRS",
+            tile_binds=None,
+            max_innermost_factor=64,
+            vector_load_lens=None,
+            reuse_read=None,
+            reuse_write=schedule_rule.ReuseType(
+                req="may",
+                levels=[1, 2],
+                scope="global",
+            ),
+        ),
+        schedule_rule.ParallelizeVectorizeUnroll(
+            max_jobs_per_core=16,
+            max_vectorize_extent=128,
+            unroll_max_steps=[0, 16, 64, 512],
+            unroll_explicit=True,
+        ),
+    ]
+
+    postprocs = [
+        postproc.RewriteParallelVectorizeUnroll(),
+        postproc.RewriteReductionBlock(),
+        postproc.RewriteTensorize(vectorize_init_loop=True),
+    ]
+
+    # This line is necessary for link-params to take effect during
+    # task extraction and relay.build(...).
+    mod = mod.with_attr("executor", executor)
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            target=target,
+            params=params,
+            work_dir=work_dir,
+            # for faster tuning
+            max_trials_global=20000,
+            max_trials_per_task=8,
+            num_trials_per_iter=8,
+            strategy="replay-trace",
+            # max_trials_global=20000,
+            # num_trials_per_iter=32,
+            # max_trials_per_task=128,
+            # strategy="evolutionary",
+            builder=get_hexagon_local_builder(),
+            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
+            space=ms.space_generator.PostOrderApply(
+                sch_rules=sch_rules,
+                postprocs=postprocs,
+                mutator_probs={},
+            ),
+            # Without this, the same workloads with different constant weights
+            # are treated as distinct tuning tasks.
+            module_equality="ignore-ndarray",
+        )
+
+        return ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            target=target,
+            params=params,
+        )
+
+
+@pytest.mark.skip("End-to-end tuning is skipped on CI.")
+@tvm.testing.requires_hexagon
+def test_resnet50(hexagon_launcher):
+    if not os.path.exists(model_json):
+        pytest.skip(msg="Run python export_models.py first.")
+
+    with open(model_json, "r") as fi:
+        mod = tvm.ir.load_json(fi.read())
+
+    with open(model_params, "rb") as fi:
+        params = relay.load_param_dict(fi.read())
+    inp = np.random.randn(1, 3, 224, 224).astype("float32")
+    input_name = "image"
+
+    do_tune = True
+
+    if do_tune:
+        hexagon_lowered = tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher)
+    else:
+        with tvm.transform.PassContext(opt_level=3):
+            hexagon_lowered = relay.build(
+                mod,
+                tvm.target.Target(target, host=target),
+                params=params,
+                executor=executor,
+            )
+
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            params=params,
+        )
+
+    with hexagon_launcher.start_session() as session:
+        graph_mod = session.get_executor_from_factory(hexagon_lowered)
+        graph_mod.set_input(input_name, inp.copy())
+        graph_mod.run()
+        hexagon_output = graph_mod.get_output(0).numpy()
+
+        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+        llvm_graph_mod.set_input(input_name, inp.copy())
+        llvm_graph_mod.run()
+        ref_result = llvm_graph_mod.get_output(0).numpy()
+
+        np.testing.assert_allclose(ref_result, hexagon_output, atol=1e-4, rtol=1e-5)
+
+        time_ms = graph_mod.benchmark(session.device, number=1, repeat=20).mean * 1e3
+
+        print("time elapsed: ", time_ms)
+
+        debug_ex = session.get_graph_debug_executor(
+            hexagon_lowered.get_graph_json(), hexagon_lowered.lib
+        )
+        print(debug_ex.profile(input_name=inp.copy()))

From 1dbd1fa4b80b9ee859a25bcdf2ce6f5f9aa50592 Mon Sep 17 00:00:00 2001
From: Chris Sullivan <csullivan@octoml.ai>
Date: Thu, 20 Oct 2022 09:48:03 -0700
Subject: [PATCH 404/704] [Hexagon] Support fetching and building Hexagon
 runtime with external runtime sources (#13138)

* Support fetching and building Hexagon runtime with external runtime sources.

* Ensure compiler flags are set to cover potential uses of Hexagon hardware features used by external libs.

* Add libinfo entries for USE_HEXAGON_EXTERNAL_LIBS.

* Better document options for external libs.

* Allow external hexagon libs to specify their own compile flags
by defining HEXAGON_EXTERNAL_LIBS_COMPILE_FLAGS in a top level
HexagonExternalCompileFlags.cmake config file which is
conditionally included if it exists.

* Check for USE_HEXAGON_EXTERNAL_LIBS triviality.
---
 CMakeLists.txt                  |  1 +
 apps/hexagon_api/CMakeLists.txt |  2 ++
 cmake/modules/Hexagon.cmake     | 37 +++++++++++++++++++++++++++++++++
 cmake/modules/LibInfo.cmake     |  1 +
 src/support/libinfo.cc          |  1 +
 5 files changed, 42 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3a21d22f78f3..71a6555d203a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,7 @@ tvm_option(USE_HEXAGON "Build with Hexagon support" OFF)
 tvm_option(USE_HEXAGON_SDK "Path to the Hexagon SDK root (required for Hexagon support)" /path/to/sdk)
 tvm_option(USE_HEXAGON_RPC "Enable Hexagon RPC using minRPC implementation over Android." OFF)
 tvm_option(USE_HEXAGON_GTEST "Path to Hexagon specific gtest version for runtime cpp tests." /path/to/hexagon/gtest)
+tvm_option(USE_HEXAGON_EXTERNAL_LIBS "Path to git repo containing external Hexagon runtime sources or libraries" OFF)
 tvm_option(USE_RPC "Build with RPC" ON)
 tvm_option(USE_THREADS "Build with thread support" ON)
 tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF)
diff --git a/apps/hexagon_api/CMakeLists.txt b/apps/hexagon_api/CMakeLists.txt
index 9a05cf3675b6..3b5300ac5582 100644
--- a/apps/hexagon_api/CMakeLists.txt
+++ b/apps/hexagon_api/CMakeLists.txt
@@ -124,6 +124,8 @@ ExternalProject_Add(hexagon_tvm_runtime_rpc
     "-DCMAKE_CXX_COMPILER=${USE_HEXAGON_TOOLCHAIN}/bin/hexagon-clang++"
     "-DUSE_HEXAGON_SDK=${USE_HEXAGON_SDK}"
     "-DUSE_HEXAGON_ARCH=${USE_HEXAGON_ARCH}"
+    "-DUSE_HEXAGON_EXTERNAL_LIBS=${USE_HEXAGON_EXTERNAL_LIBS}"
+    "-DHEXAGON_EXTERNAL_LIBS_SHA=${HEXAGON_EXTERNAL_LIBS_SHA}"
     "-DCMAKE_CXX_STANDARD=17"
     "-DUSE_LIBBACKTRACE=OFF"
     "-DUSE_RPC=OFF"
diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index aad770120201..399623ef1c3e 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -182,6 +182,43 @@ if(BUILD_FOR_HEXAGON)
     "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc"
     PROPERTIES COMPILE_FLAGS "-mhvx"
   )
+
+  # Include hexagon external library runtime sources
+  if(DEFINED USE_HEXAGON_EXTERNAL_LIBS AND NOT ${USE_HEXAGON_EXTERNAL_LIBS} STREQUAL "")
+    # Check if the libs are provided as an absolute path
+    if (EXISTS ${USE_HEXAGON_EXTERNAL_LIBS})
+    # Check if the libs are provided as a git url
+    elseif(USE_HEXAGON_EXTERNAL_LIBS MATCHES "\.git$")
+      if (NOT DEFINED HEXAGON_EXTERNAL_LIBS_SHA)
+        message(FATAL_ERROR "HEXAGON_EXTERNA_LIBS_SHA must be set when "
+          "USE_HEXAGON_EXTERNAL_LIBS is set to a git repository")
+      endif()
+      include(FetchContent)
+      FetchContent_Declare(hexagon_external
+        GIT_REPOSITORY "${USE_HEXAGON_EXTERNAL_LIBS}"
+        GIT_TAG "${HEXAGON_EXTERNAL_LIBS_SHA}")
+      FetchContent_MakeAvailable(hexagon_external)
+      set(USE_HEXAGON_EXTERNAL_LIBS "${hexagon_external_SOURCE_DIR}")
+    else()
+      message(FATAL_ERROR "Invalid use of USE_HEXAGON_EXTERNAL_LIBS="
+        "${USE_HEXAGON_EXTERNAL_LIBS}; USE_HEXAGON_EXTERNAL_LIBS only "
+        "supports absolute paths and git repository urls")
+    endif()
+
+    file_glob_append(HEXAGON_EXTERNAL_RUNTIME_SRCS
+      "${USE_HEXAGON_EXTERNAL_LIBS}/src/runtime/hexagon/*.cc"
+    )
+    list(APPEND RUNTIME_HEXAGON_SRCS "${HEXAGON_EXTERNAL_RUNTIME_SRCS}")
+    if (EXISTS "${USE_HEXAGON_EXTERNAL_LIBS}/HexagonExternalCompileFlags.cmake")
+      # External libraries will define HEXAGON_EXTERNAL_LIBS_COMPILE_FLAGS,
+      # changing this variable name will break downstream external libraries.
+      include("${USE_HEXAGON_EXTERNAL_LIBS}/HexagonExternalCompileFlags.cmake")
+      set_source_files_properties(
+        "${HEXAGON_EXTERNAL_RUNTIME_SRCS}"
+        PROPERTIES COMPILE_FLAGS "${HEXAGON_EXTERNAL_LIBS_COMPILE_FLAGS}"
+        )
+    endif()
+  endif()
 endif()
 
 if(USE_HEXAGON_RPC)
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index 5e60dec3eede..7c24088c0ad2 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -75,6 +75,7 @@ function(add_lib_info src_file)
     TVM_INFO_USE_HEXAGON_RPC="${USE_HEXAGON_RPC}"
     TVM_INFO_USE_HEXAGON_SDK="${USE_HEXAGON_SDK}"
     TVM_INFO_USE_HEXAGON_GTEST="${USE_HEXAGON_GTEST}"
+    TVM_INFO_USE_HEXAGON_EXTERNAL_LIBS="${USE_HEXAGON_EXTERNAL_LIBS}"
     TVM_INFO_USE_IOS_RPC="${USE_IOS_RPC}"
     TVM_INFO_USE_KHRONOS_SPIRV="${USE_KHRONOS_SPIRV}"
     TVM_INFO_USE_LIBBACKTRACE="${USE_LIBBACKTRACE}"
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index 40138b1b4d89..c0fc9881b4f5 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -280,6 +280,7 @@ TVM_DLL Map<String, String> GetLibInfo() {
       {"USE_HEXAGON_RPC", TVM_INFO_USE_HEXAGON_RPC},
       {"USE_HEXAGON_SDK", TVM_INFO_USE_HEXAGON_SDK},
       {"USE_HEXAGON_GTEST", TVM_INFO_USE_HEXAGON_GTEST},
+      {"USE_HEXAGON_EXTERNAL_LIBS", TVM_INFO_USE_HEXAGON_EXTERNAL_LIBS},
       {"USE_IOS_RPC", TVM_INFO_USE_IOS_RPC},
       {"USE_KHRONOS_SPIRV", TVM_INFO_USE_KHRONOS_SPIRV},
       {"USE_LIBBACKTRACE", TVM_INFO_USE_LIBBACKTRACE},

From 9f4c7121b92bebd7771e9e3f3e72763744b961b0 Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Thu, 20 Oct 2022 12:49:41 -0400
Subject: [PATCH 405/704] [Hexagon] [runtime] Clarify compute resources
 (#13149)

Clarify naming for the tensor processing unit that will be power managed and acquired/released for the runtime
---
 .../{hexagon_hmx.cc => hexagon_htp.cc}        | 16 ++++++-------
 .../hexagon/{hexagon_hmx.h => hexagon_htp.h}  | 24 +++++++++----------
 src/runtime/hexagon/hexagon_thread_manager.cc |  4 ++--
 src/runtime/hexagon/hexagon_thread_manager.h  |  8 +++----
 4 files changed, 26 insertions(+), 26 deletions(-)
 rename src/runtime/hexagon/{hexagon_hmx.cc => hexagon_htp.cc} (90%)
 rename src/runtime/hexagon/{hexagon_hmx.h => hexagon_htp.h} (73%)

diff --git a/src/runtime/hexagon/hexagon_hmx.cc b/src/runtime/hexagon/hexagon_htp.cc
similarity index 90%
rename from src/runtime/hexagon/hexagon_hmx.cc
rename to src/runtime/hexagon/hexagon_htp.cc
index 66bde535ff6e..32084382ed7f 100644
--- a/src/runtime/hexagon/hexagon_hmx.cc
+++ b/src/runtime/hexagon/hexagon_htp.cc
@@ -26,7 +26,7 @@ extern "C" {
 }
 
 #include "hexagon_common.h"
-#include "hexagon_hmx.h"
+#include "hexagon_htp.h"
 
 // Minimum timeout per SDK docs, excluding 0
 #define COMPUTE_RES_ACQ_TIMEOUT 200
@@ -35,17 +35,17 @@ namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-HexagonHmx::HexagonHmx() {
+HexagonHtp::HexagonHtp() {
   PowerOn();
   Acquire();
 }
 
-HexagonHmx::~HexagonHmx() {
+HexagonHtp::~HexagonHtp() {
   Release();
   PowerOff();
 }
 
-void HexagonHmx::PowerOn() {
+void HexagonHtp::PowerOn() {
   HAP_power_request_t pwr_req;
   int nErr;
 
@@ -57,7 +57,7 @@ void HexagonHmx::PowerOn() {
   }
 }
 
-void HexagonHmx::PowerOff() {
+void HexagonHtp::PowerOff() {
   HAP_power_request_t pwr_req;
   int nErr;
 
@@ -69,7 +69,7 @@ void HexagonHmx::PowerOff() {
   HAP_utils_destroy_context(hap_pwr_ctx_);
 }
 
-void HexagonHmx::Acquire() {
+void HexagonHtp::Acquire() {
   compute_res_attr_t compute_res_attr;
   int nErr;
 
@@ -85,11 +85,11 @@ void HexagonHmx::Acquire() {
     LOG(FATAL) << "InternalError: HAP_compute_res_acquire failed\n";
   }
   if ((nErr = HAP_compute_res_hmx_lock(context_id_))) {
-    LOG(FATAL) << "InternalError: Unable to lock HMX!";
+    LOG(FATAL) << "InternalError: Unable to lock HTP!";
   }
 }
 
-void HexagonHmx::Release() {
+void HexagonHtp::Release() {
   HAP_compute_res_hmx_unlock((unsigned int)context_id_);
   HAP_compute_res_release((unsigned int)context_id_);
 }
diff --git a/src/runtime/hexagon/hexagon_hmx.h b/src/runtime/hexagon/hexagon_htp.h
similarity index 73%
rename from src/runtime/hexagon/hexagon_hmx.h
rename to src/runtime/hexagon/hexagon_htp.h
index 1d5f56df7a4b..b52e07e27b46 100644
--- a/src/runtime/hexagon/hexagon_hmx.h
+++ b/src/runtime/hexagon/hexagon_htp.h
@@ -17,32 +17,32 @@
  * under the License.
  */
 
-#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_HMX_H_
-#define TVM_RUNTIME_HEXAGON_HEXAGON_HMX_H_
+#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_HTP_H_
+#define TVM_RUNTIME_HEXAGON_HEXAGON_HTP_H_
 
 namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-class HexagonHmx {
+class HexagonHtp {
  public:
   //! \brief Constructor.
-  HexagonHmx();
+  HexagonHtp();
 
   //! \brief Destructor.
-  ~HexagonHmx();
+  ~HexagonHtp();
 
-  //! \brief Prevent copy construction of HexagonHmx.
-  HexagonHmx(const HexagonHmx&) = delete;
+  //! \brief Prevent copy construction of HexagonHtp.
+  HexagonHtp(const HexagonHtp&) = delete;
 
-  //! \brief Prevent copy assignment with HexagonHmx.
-  HexagonHmx& operator=(const HexagonHmx&) = delete;
+  //! \brief Prevent copy assignment with HexagonHtp.
+  HexagonHtp& operator=(const HexagonHtp&) = delete;
 
   //! \brief Prevent move construction.
-  HexagonHmx(HexagonHmx&&) = delete;
+  HexagonHtp(HexagonHtp&&) = delete;
 
   //! \brief Prevent move assignment.
-  HexagonHmx& operator=(HexagonHmx&&) = delete;
+  HexagonHtp& operator=(HexagonHtp&&) = delete;
 
  private:
   //! \brief Power context
@@ -61,4 +61,4 @@ class HexagonHmx {
 }  // namespace runtime
 }  // namespace tvm
 
-#endif  // TVM_RUNTIME_HEXAGON_HEXAGON_HMX_H_
+#endif  // TVM_RUNTIME_HEXAGON_HEXAGON_HTP_H_
diff --git a/src/runtime/hexagon/hexagon_thread_manager.cc b/src/runtime/hexagon/hexagon_thread_manager.cc
index e3de7710a73e..546a41cff041 100644
--- a/src/runtime/hexagon/hexagon_thread_manager.cc
+++ b/src/runtime/hexagon/hexagon_thread_manager.cc
@@ -43,7 +43,7 @@ HexagonThreadManager::HexagonThreadManager(unsigned num_threads, unsigned thread
 
   DLOG(INFO) << "Acquiring hardware resources";
   // TODO(HWE): Move these bindings to specific threads
-  hmx_ = std::make_unique<HexagonHmx>();
+  htp_ = std::make_unique<HexagonHtp>();
   hvx_ = std::make_unique<HexagonHvx>();
 
   // Initially, block all threads until we get the Start() call
@@ -104,7 +104,7 @@ HexagonThreadManager::~HexagonThreadManager() {
   DLOG(INFO) << "Buffers freed";
 
   // Release hardware
-  hmx_.reset();
+  htp_.reset();
   hvx_.reset();
 
   DLOG(INFO) << "Hardware resources released";
diff --git a/src/runtime/hexagon/hexagon_thread_manager.h b/src/runtime/hexagon/hexagon_thread_manager.h
index 81c90bd1ae20..30a931554494 100644
--- a/src/runtime/hexagon/hexagon_thread_manager.h
+++ b/src/runtime/hexagon/hexagon_thread_manager.h
@@ -32,7 +32,7 @@
 #include "hexagon_buffer.h"
 #include "hexagon_buffer_manager.h"
 #include "hexagon_common.h"
-#include "hexagon_hmx.h"
+#include "hexagon_htp.h"
 #include "hexagon_hvx.h"
 #include "qurt.h"
 
@@ -188,9 +188,9 @@ class HexagonThreadManager {
     Command(voidfunc f, void* args) : f(f), args(args) {}
   };
 
-  //! \brief HMX hardware resource.
-  // TODO(HWE): Move binding of HMX to a specific thread
-  std::unique_ptr<HexagonHmx> hmx_;
+  //! \brief HTP hardware resource.
+  // TODO(HWE): Move binding of HTP to a specific thread
+  std::unique_ptr<HexagonHtp> htp_;
 
   //! \brief HVX hardware resource.
   // TODO(HWE): Move binding of individual HVX instances to a specific thread

From 0e21840553225103f3ad3e93f3e72d20b36a8c2b Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Thu, 20 Oct 2022 19:16:51 -0500
Subject: [PATCH 406/704] [Hexagon] Fix chunk address table generation in
 chunkify_hwio_16b (#13002)

* [Hexagon] Fix chunk address table generation in chunkify_hwio_16b

The filter data is contiguous, the iteration is over 16-bit elements,
not pointers.

* Re-disable the conv2d_fp16 test, since it still fails on hardware
---
 src/runtime/hexagon/ops/conv_utils.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/hexagon/ops/conv_utils.cc b/src/runtime/hexagon/ops/conv_utils.cc
index e1ec1e17277d..b10f7cc315b2 100644
--- a/src/runtime/hexagon/ops/conv_utils.cc
+++ b/src/runtime/hexagon/ops/conv_utils.cc
@@ -141,7 +141,7 @@ void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int de
 void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height,
                        int width, int idepth, int odepth) {
   auto inp_data = static_cast<uint16_t*>(inp);
-  auto out_data = static_cast<uintptr_t*>(out);
+  auto out_data = static_cast<uint16_t*>(out);
   const int stride_i = odepth;
   const int stride_x = stride_i * idepth;
   const int stride_y = stride_x * width;
@@ -158,7 +158,7 @@ void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, i
           int max_i = std::min(32, idepth - ci);
           int max_o = std::min(32, odepth - co);
 
-          auto chunk = reinterpret_cast<uint16_t*>(out_data);
+          auto chunk = out_data;
           for (int y = 0; y < max_y; ++y) {
             for (int x = max_x - 1; x >= 0; --x) {
               for (int i = 0; i < max_i; ++i) {

From f28b0ca9c4af169d2ff4574fcfce0bd916541ec1 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 21 Oct 2022 09:45:26 +0900
Subject: [PATCH 407/704] [CI] Update PyTorch to v1.12 in GPU image (#13160)

---
 Jenkinsfile               | 2 +-
 ci/jenkins/Jenkinsfile.j2 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index d48e02cf13bf..2fd707a570c9 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -50,7 +50,7 @@
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
 ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
-ci_gpu = 'tlcpack/ci-gpu:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
 ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
 ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
 ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index f480f08b2b48..bca70349381b 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -52,7 +52,7 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
 ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
-ci_gpu = 'tlcpack/ci-gpu:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
 ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
 ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
 ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'

From 9aedb8bddabb5d65c421dae7d7affd8a30362146 Mon Sep 17 00:00:00 2001
From: Adam Straw <astraw@octoml.ai>
Date: Thu, 20 Oct 2022 17:46:08 -0700
Subject: [PATCH 408/704] [Hexagon] refactor HexagonBufferManager class
 (#13145)

* [Hexagon] refactor HexagonBufferManager class

* add back print statement to device api
---
 src/runtime/hexagon/hexagon_buffer_manager.h | 16 ++++------------
 src/runtime/hexagon/hexagon_device_api.cc    |  4 ++--
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_buffer_manager.h b/src/runtime/hexagon/hexagon_buffer_manager.h
index e064114346ee..3c43c25b5863 100644
--- a/src/runtime/hexagon/hexagon_buffer_manager.h
+++ b/src/runtime/hexagon/hexagon_buffer_manager.h
@@ -46,14 +46,12 @@ class HexagonBufferManager {
    * \param ptr Address of the HexagonBuffer as returned by `AllocateHexagonBuffer`.
    */
   void FreeHexagonBuffer(void* ptr) {
+    std::lock_guard<std::mutex> lock(map_mutex_);
     auto it = hexagon_buffer_map_.find(ptr);
     CHECK(it != hexagon_buffer_map_.end())
-        << "Attempt made to free unknown or already freed dataspace allocation";
+        << "Attempt made to free unknown or already freed allocation";
     CHECK(it->second != nullptr);
-    {
-      std::lock_guard<std::mutex> lock(map_mutex_);
-      hexagon_buffer_map_.erase(it);
-    }
+    hexagon_buffer_map_.erase(it);
   }
   /*!
    * \brief Allocate a HexagonBuffer.
@@ -70,14 +68,8 @@ class HexagonBufferManager {
     return ptr;
   }
 
-  //! \brief Returns whether the HexagonBuffer is in the map.
-  size_t count(void* ptr) {
-    std::lock_guard<std::mutex> lock(map_mutex_);
-    return hexagon_buffer_map_.count(ptr);
-  }
-
   //! \brief Returns an iterator to the HexagonBuffer within the map.
-  HexagonBuffer* find(void* ptr) {
+  HexagonBuffer* FindHexagonBuffer(void* ptr) {
     std::lock_guard<std::mutex> lock(map_mutex_);
     auto it = hexagon_buffer_map_.find(ptr);
     if (it != hexagon_buffer_map_.end()) {
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index f8824d515443..50275a7b6101 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -154,7 +154,7 @@ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) {
   CHECK(runtime_hexbuffs) << "Attempted to free Hexagon workspace with "
                           << "HexagonDeviceAPI::FreeWorkspace outside of a session.  "
                           << "Please call HexagonDeviceAPI::AcquireResources";
-  CHECK(runtime_hexbuffs->count(data) != 0)
+  CHECK(runtime_hexbuffs->FindHexagonBuffer(data) != nullptr)
       << "Attempt made to free unknown or already freed workspace allocation";
   dmlc::ThreadLocalStore<HexagonWorkspacePool>::Get()->FreeWorkspace(dev, data);
 }
@@ -182,7 +182,7 @@ void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHan
                           << "Please call HexagonDeviceAPI::AcquireResources";
 
   auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* {
-    return runtime_hexbuffs->find(ptr);
+    return runtime_hexbuffs->FindHexagonBuffer(ptr);
   };
 
   HexagonBuffer* hex_from_buf = lookup_hexagon_buffer(from->data);

From 1311cac88bc47e834fe0295b4d0701ea7bbc86a6 Mon Sep 17 00:00:00 2001
From: zhaoyang-star <zhaoyangstar@foxmail.com>
Date: Fri, 21 Oct 2022 12:06:17 +0800
Subject: [PATCH 409/704] Fix typo in test_pipeline_executor.py (#13134)

---
 tests/python/relay/test_pipeline_executor.py | 25 +++++++++++---------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py
index 06614977d4ca..0f9d3633c5d7 100644
--- a/tests/python/relay/test_pipeline_executor.py
+++ b/tests/python/relay/test_pipeline_executor.py
@@ -478,38 +478,41 @@ def test_pipeline():
             assert customized_parameters_mod == mod1
             # The global parameters group named "param_0" will be connected to "mod1" as parameters.
             pipe_config["param_group"]["param_0"].connect(pipe_config[mod1]["param"])
-            # The pipeline input named "data_0" will be connected to a input named "data_0"
+            # The pipeline input named "data_a" will be connected to a input named "data_0"
             # of mod1.
             pipe_config["input"]["data_a"].connect(pipe_config[mod1]["input"]["data_0"])
 
-            # The pipeline Input named "data_1" will be connected to a input named "data_1"
+            # The pipeline Input named "data_b" will be connected to a input named "data_1"
             # of mod2.
             pipe_config["input"]["data_b"].connect(pipe_config[mod2]["input"]["data_1"])
 
-            # The mod1 output[0] will be connected to a input named "data_0" of mod2.
+            # The mod1 output[0] will be connected to a input named "data_n_0" of mod2.
             pipe_config[mod1]["output"][0].connect(pipe_config[mod2]["input"]["data_n_0"])
 
-            # The mod1 output[1] will be connected to a input named "data_0" of mod3.
+            # The mod1 output[1] will be connected to a input named "data_n_2" of mod3.
             pipe_config[mod1]["output"][1].connect(pipe_config[mod3]["input"]["data_n_2"])
 
-            # The mod2 output[2] will be connected to a input named "data_1" of mod3.
+            # The mod2 output[2] will be connected to a input named "data_n_1" of mod3.
             pipe_config[mod2]["output"][0].connect(pipe_config[mod3]["input"]["data_n_1"])
 
             # The mod3 output[0] will be connected to pipeline output[0].
             pipe_config[mod3]["output"][0].connect(pipe_config["output"]["0"])
             # Print configuration (print(pipe_config)), the result looks like following.
             #
+            # Params
+            #   |param_0: mod0:param
+            #
             # Inputs
-            #   |data_a: mod1:data_0
-            #   |data_b: mod2:data_1
+            #   |data_a: mod0:data_0
+            #   |data_b: mod1:data_1
             #
             # output
-            #   |output(1) : mod3.output(0)
+            #   |output(0) : mod2.output(0)
             #
             # connections
-            #   |mod1.output(0)-> mod2.data_n_0
-            #   |mod1.output(1)-> mod3.data_n_2
-            #   |mod2.output(0)-> mod3.data_n_1
+            #   |mod0.output(0)-> mod1.data_n_0
+            #   |mod0.output(1)-> mod2.data_n_2
+            #   |mod1.output(0)-> mod2.data_n_1
 
             # Set other parameters.
             pipe_config[mod1].target = target[0]

From 100c050d604d6358d1578ffce31a8668c51e7e9e Mon Sep 17 00:00:00 2001
From: Jiawei Liu <jaway.liu@gmail.com>
Date: Fri, 21 Oct 2022 01:32:20 -0500
Subject: [PATCH 410/704] [Relay] fix: trilu check op for i64/i32 (#13123)

* fix: trilu check op for i64/i32

* format
---
 python/tvm/topi/transform.py         |  7 +++++++
 tests/python/relay/test_op_level3.py | 20 ++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py
index 0347473f83b7..23334da9c25c 100644
--- a/python/tvm/topi/transform.py
+++ b/python/tvm/topi/transform.py
@@ -1053,6 +1053,13 @@ def trilu(data, k, upper):
     def _apply_trilu(*indices):
         row_index = indices[-2]
         col_index = indices[-1]
+        # promote row & col indices
+        if row_index.dtype != col_index.dtype:
+            target_type = (col_index + row_index).dtype
+            if row_index.dtype != target_type:
+                row_index = tvm.tir.Cast(target_type, row_index)
+            else:
+                col_index = tvm.tir.Cast(target_type, col_index)
         other_indices = indices[:-2]
         check_position = check_op(row_index, col_index - k)
         value = data(*other_indices, row_index, col_index)
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 9becfc12671d..c3b3215e84e4 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -2264,6 +2264,26 @@ def verify_trilu(data_shape, upper=True, k=0):
     verify_trilu((8, 6, 6), False, -2)
 
 
+def test_trilu_shape_i64():
+    data_x = np.ones((2, 1), dtype="int32")
+
+    x = relay.var("x", shape=[2, 1], dtype="float32")
+    v0 = relay.broadcast_to(x, shape=relay.const([2, 1], dtype="int64"))
+    v2 = relay.add(relay.const([[1.0]]), v0)
+    v3 = relay.trilu(v0, k=0)
+
+    f = relay.Function([x], relay.Tuple([v2, v3]))
+    tvm_res = relay.create_executor("graph", device=tvm.cpu(), target="llvm").evaluate(f)(data_x)
+
+    np_res = (
+        np.array([[2.0], [2.0]], dtype=np.float32),
+        np.array([[1.0], [0.0]], dtype=np.float32),
+    )
+
+    tvm.testing.assert_allclose(tvm_res[0].numpy(), np_res[0])
+    tvm.testing.assert_allclose(tvm_res[1].numpy(), np_res[1])
+
+
 def test_trilu_reduce():
     data_i0 = np.ones((2, 2), dtype="int32")
     k = 0

From 46aa04f3f1b64558c55244469816ed225c13f6ba Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 21 Oct 2022 14:12:25 -0500
Subject: [PATCH 411/704] [ci] Wait longer during retries (#13102)

This bumps the wait time between failing network calls from `(10, 30)`
to `(30, 200)` seconds. A failing CI run is pretty costly so it makes
sense to wait a little longer and see if Docker/AWS/whoever starts
responding.

Motivated by
https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4495/pipeline
which saw 5 `docker pull` `500` errors in a row

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 ci/scripts/retry.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/scripts/retry.sh b/ci/scripts/retry.sh
index 08958fedce89..933eb1aba6ef 100644
--- a/ci/scripts/retry.sh
+++ b/ci/scripts/retry.sh
@@ -32,7 +32,7 @@ retry() {
           exit 1
       fi
 
-      WAIT=$(python3 -c 'import random; print(random.randint(10, 30))')
+      WAIT=$(python3 -c 'import random; print(random.randint(30, 200))')
       echo "failed to update $n / $max_retries, waiting $WAIT to try again"
       sleep "$WAIT"
   done

From eb8f225226fa4d991c83ca8f8f273c00383db800 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 21 Oct 2022 14:12:48 -0500
Subject: [PATCH 412/704] [ci] Ignore JUnit upload failures (#13142)

These are never really signal-carrying so there's no need for it to show
up as red on CI jobs. This also adds retries to the `git checkout` step
within Jenkins to deal with GitHub instability.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                  | 407 ++++++++++++++++++++++++++---------
 ci/jenkins/Prepare.groovy.j2 |   5 +-
 ci/jenkins/macros.j2         |  20 +-
 3 files changed, 325 insertions(+), 107 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 2fd707a570c9..ce69adb81ef9 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-10-04T13:17:33.929159
+// Generated at 2022-10-19T13:44:32.119961
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -116,8 +116,9 @@ def per_exec_ws(folder) {
 
 // initialize source codes
 def init_git() {
-  checkout scm
-
+  retry(5) {
+    checkout scm
+  }
 
   // Add more info about job node
   sh (
@@ -1134,7 +1135,8 @@ def shard_run_unittest_GPU_1_of_3() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1143,7 +1145,10 @@ def shard_run_unittest_GPU_1_of_3() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1198,7 +1203,8 @@ def shard_run_unittest_GPU_2_of_3() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1207,7 +1213,10 @@ def shard_run_unittest_GPU_2_of_3() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1258,7 +1267,8 @@ def shard_run_unittest_GPU_3_of_3() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1267,7 +1277,10 @@ def shard_run_unittest_GPU_3_of_3() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1317,7 +1330,8 @@ def shard_run_integration_CPU_1_of_4() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1326,7 +1340,10 @@ def shard_run_integration_CPU_1_of_4() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1375,7 +1392,8 @@ def shard_run_integration_CPU_2_of_4() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1384,7 +1402,10 @@ def shard_run_integration_CPU_2_of_4() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1433,7 +1454,8 @@ def shard_run_integration_CPU_3_of_4() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1442,7 +1464,10 @@ def shard_run_integration_CPU_3_of_4() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1491,7 +1516,8 @@ def shard_run_integration_CPU_4_of_4() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1500,7 +1526,10 @@ def shard_run_integration_CPU_4_of_4() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1550,7 +1579,8 @@ def shard_run_python_i386_1_of_3() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1559,7 +1589,10 @@ def shard_run_python_i386_1_of_3() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1608,7 +1641,8 @@ def shard_run_python_i386_2_of_3() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1617,7 +1651,10 @@ def shard_run_python_i386_2_of_3() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1665,7 +1702,8 @@ def shard_run_python_i386_3_of_3() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1674,7 +1712,10 @@ def shard_run_python_i386_3_of_3() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1723,7 +1764,8 @@ def shard_run_test_Hexagon_1_of_8() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1732,7 +1774,10 @@ def shard_run_test_Hexagon_1_of_8() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1779,7 +1824,8 @@ def shard_run_test_Hexagon_2_of_8() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1788,7 +1834,10 @@ def shard_run_test_Hexagon_2_of_8() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1835,7 +1884,8 @@ def shard_run_test_Hexagon_3_of_8() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1844,7 +1894,10 @@ def shard_run_test_Hexagon_3_of_8() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1891,7 +1944,8 @@ def shard_run_test_Hexagon_4_of_8() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1900,7 +1954,10 @@ def shard_run_test_Hexagon_4_of_8() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -1947,7 +2004,8 @@ def shard_run_test_Hexagon_5_of_8() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -1956,7 +2014,10 @@ def shard_run_test_Hexagon_5_of_8() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2003,7 +2064,8 @@ def shard_run_test_Hexagon_6_of_8() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2012,7 +2074,10 @@ def shard_run_test_Hexagon_6_of_8() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2059,7 +2124,8 @@ def shard_run_test_Hexagon_7_of_8() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2068,7 +2134,10 @@ def shard_run_test_Hexagon_7_of_8() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2115,7 +2184,8 @@ def shard_run_test_Hexagon_8_of_8() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2124,7 +2194,10 @@ def shard_run_test_Hexagon_8_of_8() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2173,7 +2246,8 @@ def shard_run_integration_aarch64_1_of_4() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2182,7 +2256,10 @@ def shard_run_integration_aarch64_1_of_4() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2230,7 +2307,8 @@ def shard_run_integration_aarch64_2_of_4() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2239,7 +2317,10 @@ def shard_run_integration_aarch64_2_of_4() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2287,7 +2368,8 @@ def shard_run_integration_aarch64_3_of_4() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2296,7 +2378,10 @@ def shard_run_integration_aarch64_3_of_4() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2344,7 +2429,8 @@ def shard_run_integration_aarch64_4_of_4() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2353,7 +2439,10 @@ def shard_run_integration_aarch64_4_of_4() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2401,7 +2490,8 @@ def shard_run_topi_GPU_1_of_3() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2410,7 +2500,10 @@ def shard_run_topi_GPU_1_of_3() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2457,7 +2550,8 @@ def shard_run_topi_GPU_2_of_3() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2466,7 +2560,10 @@ def shard_run_topi_GPU_2_of_3() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2513,7 +2610,8 @@ def shard_run_topi_GPU_3_of_3() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2522,7 +2620,10 @@ def shard_run_topi_GPU_3_of_3() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2570,7 +2671,8 @@ def shard_run_frontend_GPU_1_of_6() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2579,7 +2681,10 @@ def shard_run_frontend_GPU_1_of_6() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2626,7 +2731,8 @@ def shard_run_frontend_GPU_2_of_6() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2635,7 +2741,10 @@ def shard_run_frontend_GPU_2_of_6() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2682,7 +2791,8 @@ def shard_run_frontend_GPU_3_of_6() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2691,7 +2801,10 @@ def shard_run_frontend_GPU_3_of_6() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2738,7 +2851,8 @@ def shard_run_frontend_GPU_4_of_6() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2747,7 +2861,10 @@ def shard_run_frontend_GPU_4_of_6() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2794,7 +2911,8 @@ def shard_run_frontend_GPU_5_of_6() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2803,7 +2921,10 @@ def shard_run_frontend_GPU_5_of_6() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2850,7 +2971,8 @@ def shard_run_frontend_GPU_6_of_6() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2859,7 +2981,10 @@ def shard_run_frontend_GPU_6_of_6() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2912,7 +3037,8 @@ def shard_run_topi_aarch64_1_of_2() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2921,7 +3047,10 @@ def shard_run_topi_aarch64_1_of_2() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -2972,7 +3101,8 @@ def shard_run_topi_aarch64_2_of_2() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -2981,7 +3111,10 @@ def shard_run_topi_aarch64_2_of_2() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3029,7 +3162,8 @@ def shard_run_frontend_aarch64_1_of_2() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3038,7 +3172,10 @@ def shard_run_frontend_aarch64_1_of_2() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3085,7 +3222,8 @@ def shard_run_frontend_aarch64_2_of_2() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3094,7 +3232,10 @@ def shard_run_frontend_aarch64_2_of_2() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3147,7 +3288,8 @@ def shard_run_test_Cortex_M_1_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3156,7 +3298,10 @@ def shard_run_test_Cortex_M_1_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3203,7 +3348,8 @@ def shard_run_test_Cortex_M_2_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3212,7 +3358,10 @@ def shard_run_test_Cortex_M_2_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3259,7 +3408,8 @@ def shard_run_test_Cortex_M_3_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3268,7 +3418,10 @@ def shard_run_test_Cortex_M_3_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3315,7 +3468,8 @@ def shard_run_test_Cortex_M_4_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3324,7 +3478,10 @@ def shard_run_test_Cortex_M_4_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3371,7 +3528,8 @@ def shard_run_test_Cortex_M_5_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3380,7 +3538,10 @@ def shard_run_test_Cortex_M_5_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3427,7 +3588,8 @@ def shard_run_test_Cortex_M_6_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3436,7 +3598,10 @@ def shard_run_test_Cortex_M_6_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3483,7 +3648,8 @@ def shard_run_test_Cortex_M_7_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3492,7 +3658,10 @@ def shard_run_test_Cortex_M_7_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3539,7 +3708,8 @@ def shard_run_test_Cortex_M_8_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3548,7 +3718,10 @@ def shard_run_test_Cortex_M_8_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3595,7 +3768,8 @@ def shard_run_test_Cortex_M_9_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3604,7 +3778,10 @@ def shard_run_test_Cortex_M_9_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3651,7 +3828,8 @@ def shard_run_test_Cortex_M_10_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3660,7 +3838,10 @@ def shard_run_test_Cortex_M_10_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3707,7 +3888,8 @@ def shard_run_test_Cortex_M_11_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3716,7 +3898,10 @@ def shard_run_test_Cortex_M_11_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3763,7 +3948,8 @@ def shard_run_test_Cortex_M_12_of_12() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3772,7 +3958,10 @@ def shard_run_test_Cortex_M_12_of_12() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3821,7 +4010,8 @@ def shard_run_test_RISC_V_1_of_1() {
             })
           }
         } finally {
-          sh(
+          try {
+            sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3830,7 +4020,10 @@ def shard_run_test_RISC_V_1_of_1() {
             label: 'Upload JUnits to S3',
           )
 
-          junit 'build/pytest-results/*.xml'
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -3867,7 +4060,8 @@ def run_unittest_minimal() {
               python_unittest(ci_minimal)
             })
           } finally {
-            sh(
+            try {
+              sh(
             script: """
               set -eux
               . ci/scripts/retry.sh
@@ -3876,7 +4070,10 @@ def run_unittest_minimal() {
             label: 'Upload JUnits to S3',
           )
 
-            junit 'build/pytest-results/*.xml'
+              junit 'build/pytest-results/*.xml'
+            } catch (Exception e) {
+              echo 'Exception during JUnit upload: ' + e.toString()
+            }
           }
         }
       }
@@ -4078,6 +4275,7 @@ stage('Test') {
                 )
               })
             } finally {
+            try {
               sh(
                 script: """
                   set -eux
@@ -4088,6 +4286,9 @@ stage('Test') {
               )
 
               junit 'build/pytest-results/*.xml'
+            } catch (Exception e) {
+              echo 'Exception during JUnit upload: ' + e.toString()
+            }
             }
           }
         }
@@ -4130,6 +4331,7 @@ stage('Test') {
                 )
               })
             } finally {
+            try {
               sh(
                 script: """
                   set -eux
@@ -4140,6 +4342,9 @@ stage('Test') {
               )
 
               junit 'build/pytest-results/*.xml'
+            } catch (Exception e) {
+              echo 'Exception during JUnit upload: ' + e.toString()
+            }
             }
           }
         }
diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2
index cb677f437a3c..2c6f93090127 100644
--- a/ci/jenkins/Prepare.groovy.j2
+++ b/ci/jenkins/Prepare.groovy.j2
@@ -4,8 +4,9 @@ def per_exec_ws(folder) {
 
 // initialize source codes
 def init_git() {
-  checkout scm
-
+  retry(5) {
+    checkout scm
+  }
 
   // Add more info about job node
   sh (
diff --git a/ci/jenkins/macros.j2 b/ci/jenkins/macros.j2
index 618b1d9d6b09..78c5acd1c7ff 100644
--- a/ci/jenkins/macros.j2
+++ b/ci/jenkins/macros.j2
@@ -53,8 +53,12 @@ def {{ method_name }}() {
             })
           }
         } finally {
-          {{ junit_to_s3(test_dir_name) }}
-          junit 'build/pytest-results/*.xml'
+          try {
+            {{ junit_to_s3(test_dir_name) }}
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
         }
       }
     }
@@ -120,8 +124,12 @@ def {{ method_name }}() {
               {{ caller() | indent(width=8) | trim }}
             })
           } finally {
-            {{ junit_to_s3(test_dir_name) | indent(width=0) }}
-            junit 'build/pytest-results/*.xml'
+            try {
+              {{ junit_to_s3(test_dir_name) | indent(width=0) }}
+              junit 'build/pytest-results/*.xml'
+            } catch (Exception e) {
+              echo 'Exception during JUnit upload: ' + e.toString()
+            }
           }
         }
       }
@@ -147,8 +155,12 @@ def {{ method_name }}() {
                 {{ caller() | indent(width=12) | trim }}
               })
             } finally {
+            try {
               {{ junit_to_s3(test_dir_name) | indent(width=4) }}
               junit 'build/pytest-results/*.xml'
+            } catch (Exception e) {
+              echo 'Exception during JUnit upload: ' + e.toString()
+            }
             }
           }
         }

From 81e6e17ff0a710e5c1b1cf25d8d501f63f8aa922 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 21 Oct 2022 14:13:08 -0500
Subject: [PATCH 413/704] [skip ci][ci] Ignore certain files for triggering
 docker builds (#13167)

These files have no effect on Docker image builds so if they are changed
we shouldn't be triggering a Docker image build. Right now if anything
in `docker/` is touched a build is run, this adds a short exclusion list
of some files (each of which is used for local testing or is exercised
already in CI).

Marked with `[skip ci]` since this runs during the Jenkins prepare step
anyways.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 ci/scripts/git_change_docker.sh | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/ci/scripts/git_change_docker.sh b/ci/scripts/git_change_docker.sh
index 7fc855544af0..f2c5fc726e09 100755
--- a/ci/scripts/git_change_docker.sh
+++ b/ci/scripts/git_change_docker.sh
@@ -25,7 +25,21 @@ else
     changed_files=$(git diff --no-commit-id --name-only -r origin/main)
 fi
 
+FILES_THAT_SHOULDNT_TRIGGER_REBUILDS=(
+    "docker/bash.sh"
+    "docker/with_the_same_user"
+    "README.md"
+    "lint.sh"
+    "clear-stale-images.sh"
+)
+
 for file in $changed_files; do
+    # Certain files under docker/ don't matter for rebuilds, so ignore them
+    if printf '%s\0' "${FILES_THAT_SHOULDNT_TRIGGER_REBUILDS[@]}" | grep -F -x -z -- "$file"; then
+        echo "Skipping $file"
+        continue
+    fi
+    # if grep -q "docker/"
     echo "Checking $file"
     if grep -q "docker/" <<< "$file"; then
         exit 1

From e7a72af0c1572e22f3f139fe975227fd944e313c Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Sat, 22 Oct 2022 04:09:25 +0800
Subject: [PATCH 414/704] [microTVM] Improve code reuse in Corstone300 conv2d
 tests (#13051)

* Add support for out_layout to tensordot schedules and tests

* Move shared conv2d test logic into new file

* Rework depthwise and grouped convolutions to use common logic

* Linting and bugfixes

* Fix tests

* Fix depthwise and grouped tests

* More linting fixes

* Address code review comments

* Fix unit tests

* Address code review comments from Andrew

* Fix imports
---
 python/tvm/relay/op/strategy/arm_cpu.py       |   6 +-
 python/tvm/topi/arm_cpu/conv2d.py             |   6 +-
 python/tvm/topi/arm_cpu/depthwise_conv2d.py   |   8 +-
 .../arm_cpu/mprofile/dsp/tensordot_conv2ds.py |  39 ++-
 python/tvm/topi/utils.py                      |  27 --
 .../relay/strategy/arm_cpu/test_conv2d.py     | 133 ++++++++++
 .../strategy/arm_cpu/test_conv2d_nchw.py      | 110 --------
 .../strategy/arm_cpu/test_conv2d_nhwc.py      | 180 --------------
 .../strategy/arm_cpu/test_depthwise_conv2d.py | 235 ++++++------------
 .../arm_cpu/test_generalized_conv2d.py        | 152 +++++++++++
 .../strategy/arm_cpu/test_group_conv2d.py     | 142 +++--------
 11 files changed, 446 insertions(+), 592 deletions(-)
 create mode 100644 tests/python/relay/strategy/arm_cpu/test_conv2d.py
 delete mode 100644 tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py
 delete mode 100644 tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py
 create mode 100644 tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index e56e7ba12e94..f04438e675a3 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -196,7 +196,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 and _is_simd_aligned(kernel.dtype, kernel.shape[2:])
             ):
                 strategy.add_implementation(
-                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_ohwi_dsp),
+                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_ohwi_dsp, need_out_layout=True),
                     wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_ohwi_dsp),
                     name="conv2d_nhwc_ohwi_dsp.arm_cpu",
                 )
@@ -249,7 +249,9 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                     and _is_simd_aligned(kernel.dtype, kernel.shape[3:])
                 ):
                     strategy.add_implementation(
-                        wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw_oihw_dsp),
+                        wrap_compute_conv2d(
+                            topi.arm_cpu.depthwise_conv2d_nchw_oihw_dsp, need_out_layout=True
+                        ),
                         wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_oihw_dsp),
                         name="depthwise_conv2d_nchw_oihw_dsp.arm_cpu",
                     )
diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py
index bb29de8fa27b..fc46f4b34f9d 100644
--- a/python/tvm/topi/arm_cpu/conv2d.py
+++ b/python/tvm/topi/arm_cpu/conv2d.py
@@ -525,9 +525,11 @@ def schedule_conv2d_nhwc_dsp(cfg, outs):
 
 
 @autotvm.register_topi_compute("conv2d_nhwc_ohwi_dsp.arm_cpu")
-def conv2d_nhwc_ohwi_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
+def conv2d_nhwc_ohwi_dsp(cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype):
     """Compute conv2d_nhwc_ohwi with v7e-m DSP instructions and the tensordot kernel."""
-    return conv2d_nhwc_ohwi_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype)
+    return conv2d_nhwc_ohwi_dsp_compute(
+        cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
+    )
 
 
 @autotvm.register_topi_schedule("conv2d_nhwc_ohwi_dsp.arm_cpu")
diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
index 58cd11e8cc09..960c311d51ba 100644
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -161,7 +161,7 @@ def _callback(op):
 # This schedule has incorrect result on some hardware platforms (like NV Jetson TX2)
 # Let us comment it out but not remove.
 # see discussion:
-# https://discuss.tvm.apache.org/t/autotuner-incorrect-result-after-tuning-mobilenetv2-on-arm-cpu/6088
+# https://discuss.tvm.apache.org/t/autotuner-incorrect-result-after-tuning-mobilenetv2-on-arm-cpu
 @autotvm.register_topi_compute("depthwise_conv2d_nchw_spatial_pack.arm_cpu")
 def depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype):
     """TOPI compute callback for depthwise_conv2d nchw
@@ -724,10 +724,12 @@ def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs):
 
 
 @autotvm.register_topi_compute("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
-def depthwise_conv2d_nchw_oihw_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
+def depthwise_conv2d_nchw_oihw_dsp(
+    cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
+):
     """Compute depthwise_conv2d_nchw_oihw with v7e-m DSP instructions and the tensordot kernel."""
     return depthwise_conv2d_nchw_oihw_dsp_compute(
-        cfg, data, kernel, strides, padding, dilation, out_dtype
+        cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
     )
 
 
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py b/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py
index ccd0c8e3ef32..79564f98edfc 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py
@@ -22,8 +22,9 @@
 
 import random
 import string
-from typing import Union, Tuple
+from typing import Callable, Tuple, Union
 
+import tvm
 from tvm import te
 from tvm.tir import indexdiv, indexmod
 from tvm.topi.utils import traverse_inline
@@ -88,13 +89,35 @@ def _compute_output_dim(
     return (data_dim + pad_before + pad_after - kernel_dim) // stride + 1
 
 
+def _wrap_te_compute(
+    shape: Tuple,
+    fcompute: Callable[[int, int, int, int], tvm.ir.PrimExpr],
+    desired_out_layout: str,
+    current_out_layout: str = "NHWC",
+    **kwargs,
+) -> te.tensor.Tensor:
+    """Wrapper over te.compute that allows the output layout to be easily changed."""
+    assert current_out_layout.isalpha() and desired_out_layout.isalpha()
+    assert sorted(current_out_layout) == sorted(desired_out_layout)
+    forward_order = (current_out_layout.index(c) for c in desired_out_layout)
+    reverse_order = (desired_out_layout.index(c) for c in current_out_layout)
+
+    return te.compute(
+        tuple(shape[i] for i in forward_order),
+        lambda *args: fcompute(*(args[i] for i in reverse_order)),
+        **kwargs,
+    )
+
+
 def _get_suffix() -> str:
     """Returns a random eight-character string to append to C function names. Prevents accidental
     re-definition of functions if the same operator appears twice in a Relay graph."""
     return "".join(random.choices(string.ascii_uppercase, k=8))
 
 
-def conv2d_nhwc_ohwi_dsp_compute(_cfg, data, kernel, strides, padding, dilation, out_dtype):
+def conv2d_nhwc_ohwi_dsp_compute(
+    _cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
+):
     """Standard conv2d schedule that can be tensorized using tensordot."""
 
     stride_h, stride_w = _unpack_2d_argument(strides)
@@ -113,13 +136,14 @@ def conv2d_nhwc_ohwi_dsp_compute(_cfg, data, kernel, strides, padding, dilation,
     kc_i = te.reduce_axis((0, in_channels), name="rc")
 
     padded_data = _pad_if_needed(data, "NHWC", (pad_up, pad_left, pad_down, pad_right))
-    return te.compute(
+    return _wrap_te_compute(
         (batch_size, output_h, output_w, output_channels),
         lambda n, y, x, c: te.sum(
             padded_data[n, y * stride_h + kh_i, x * stride_w + kw_i, kc_i].astype(out_dtype)
             * kernel[c, kh_i, kw_i, kc_i].astype(out_dtype),
             axis=(kh_i, kw_i, kc_i),
         ),
+        out_layout,
         name="conv2d",
         tag="conv2d_nhwc_ohwi_dsp",
     )
@@ -165,7 +189,7 @@ def _make_conv2d_tensorization(padded_data, kernel):
 
 
 def depthwise_conv2d_nchw_oihw_dsp_compute(
-    _cfg, data, kernel, strides, padding, dilation, out_dtype
+    _cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
 ):
     """Depthwise conv2d schedule that can be tensorized using tensordot."""
 
@@ -185,9 +209,9 @@ def depthwise_conv2d_nchw_oihw_dsp_compute(
     kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
 
     padded_data = _pad_if_needed(data, "NCHW", (pad_up, pad_left, pad_down, pad_right))
-    return te.compute(
-        (batch_size, output_channels, output_h, output_w),
-        lambda n, c, y, x: te.sum(
+    return _wrap_te_compute(
+        (batch_size, output_h, output_w, output_channels),
+        lambda n, y, x, c: te.sum(
             padded_data[
                 n,
                 indexdiv(c, c_mul),
@@ -197,6 +221,7 @@ def depthwise_conv2d_nchw_oihw_dsp_compute(
             * kernel[indexdiv(c, c_mul), indexmod(c, c_mul), kh_i, kw_i].astype(out_dtype),
             axis=(kh_i, kw_i),
         ),
+        out_layout,
         name="depthwise_conv2d",
         tag="depthwise_conv2d_nchw_oihw_dsp",
     )
diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py
index 91e29665cda3..1fd842f2d4cc 100644
--- a/python/tvm/topi/utils.py
+++ b/python/tvm/topi/utils.py
@@ -431,33 +431,6 @@ def get_shape(src_shape, src_layout, dst_layout):
     return get_const_tuple(tuple([src_shape[i.value] for i in dst_indices]))
 
 
-def change_constant_shape(src, src_layout, dst_layout):
-    """Makes a copy of a Relay constant, reshaping it to a new data layout.
-
-    Parameter
-    ---------
-    src : relay.Constant
-        The Constant to be reformatted.
-
-    src_layout : str
-        The current layout of the Relay constant. Must be alphabetic (e.g. NHWC
-        or OIHW, but not NCHW2c).
-
-    dst_layout : str
-        The desired layout of new the Relay constant. Must be alphabetic (e.g. NHWC
-        or OIHW, but not NCHW2c).
-
-    Returns
-    -------
-    dst_shape : relay.Constant
-        A copy of the Constant with the new layout.
-    """
-    assert src_layout.isalpha() and dst_layout.isalpha()
-    axis_order = [src_layout.index(c) for c in dst_layout]
-    reshaped = np.transpose(src.data.numpy(), axis_order)
-    return tvm.relay.Constant(tvm.nd.array(reshaped))
-
-
 def within_index(b, e, s, i):
     """Return a boolean value that indicates if i is within the given index.
 
diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_conv2d.py
new file mode 100644
index 000000000000..6cf4bbb8e6ed
--- /dev/null
+++ b/tests/python/relay/strategy/arm_cpu/test_conv2d.py
@@ -0,0 +1,133 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for arm_cpu schedules for regular conv2d."""
+
+from test_generalized_conv2d import GeneralizedConv2dTests
+from tvm.testing import fixture, main, parameter, parameters
+
+
+class Conv2dTests(GeneralizedConv2dTests):
+    """Helper for constructing regular Conv2ds. Always sets groups to 1. We set the reference
+    kernel layout here as we must pick something, but the x86 implementation supports several."""
+
+    @fixture
+    def groups(self):
+        """Using a fixture instead of a parameter stops Pytest from adding the (redundant) number of
+        groups to the name of each test."""
+        return 1
+
+    def setup_method(self):
+        self.ref_kernel_layout = "HWIO"
+
+
+class TestConv2d_NHWC_DSP(Conv2dTests):
+    """This test is for conv2d_nhwc_dsp.arm_cpu schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation = parameters(
+        # TODO(mehrdadh): Fails due to https://github.com/apache/tvm/issues/11216
+        # ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1),
+        # ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1),
+        # ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
+        # from Keyword Spotting model from MLPerfTiny models
+        # TODO(mehrdad): Fails due to https://github.com/apache/tvm/issues/11216
+        # ((1, 49, 10, 1), (10, 4), 64, (2, 2), (4, 1, 5, 1), 1),
+        # from Visual Wake Word model from MLPerfTiny models
+        # TODO(mehrdadh): fails due to https://github.com/apache/tvm/issues/11216
+        # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1), 1),
+        # from Image Classification model from MLPerfTiny models
+        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0, 1),
+        ((4, 16, 16, 8), (5, 5), 8, 2, (0, 4, 4, 0), 1),
+        ((4, 16, 16, 8), (5, 5), 16, 2, (0, 4, 4, 0), 1),
+        ((4, 16, 16, 8), (5, 5), 8, 2, 0, 1),
+        ((4, 16, 16, 8), (5, 5), 16, 2, 0, 1),
+        ((1, 16, 16, 8), (3, 3), 16, 2, (0, 0, 1, 1), 1),
+        ((1, 16, 16, 8), (3, 3), 16, 2, (1, 1, 2, 2), 1),
+        ((1, 16, 16, 8), (5, 5), 16, 2, (3, 3, 2, 2), 1),
+        ((1, 16, 16, 8), (3, 3), 16, 2, (0, 1, 2, 3), 1),
+    )
+    in_dtype = parameter("int8", "int16")
+
+    data_layout = parameter("NHWC")
+    kernel_layout = parameter("HWOI")
+    out_layout = parameter("NHWC")
+    schedule_name = parameter("conv2d_nhwc_dsp.arm_cpu")
+
+
+class TestConv2d_NHWC_Spatial_Pack(Conv2dTests):
+    """This test is for conv2d_nhwc_spatial_pack.arm_cpu schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation = parameters(
+        ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1),
+        ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1),
+        ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
+    )
+    in_dtype = parameter("int8", "int16")
+
+    data_layout = parameter("NHWC")
+    kernel_layout = parameter("HWIO")
+    out_layout = parameter("NHWC")
+    schedule_name = parameter("conv2d_nhwc_spatial_pack.arm_cpu")
+
+
+class TestConv2d_Tensordot(Conv2dTests):
+    """This test is for the regular conv2d schedule tensorized using tensordot."""
+
+    data_shape, kernel_size, num_filter, strides, padding = parameters(
+        # Disabled because these kernels are not an integral number of words
+        # ((1, 32, 32, 1), (3, 3), 12, 1, 0),
+        # ((1, 32, 10, 3), (3, 3), 16, 1, 0),
+        # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1)),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0)),
+        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0),
+        ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1)),
+        ((4, 16, 16, 16), (5, 5), 8, 2, 0),
+    )
+    dilation = parameter(1)
+    in_dtype = parameter("int8", "int16", "int32")
+
+    data_layout = parameter("NHWC")
+    kernel_layout = parameter("OHWI")
+    out_layout = parameter("NHWC", "NCHW")
+    schedule_name = parameter("conv2d_nhwc_ohwi_dsp.arm_cpu")
+
+
+class TestConv2d_NCHW_Spatial_Pack(Conv2dTests):
+    """This test is for conv2d_nchw_spatial_pack.arm_cpu schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation, in_dtype = parameters(
+        ((1, 32, 32, 16), (3, 3), 12, 1, 0, 1, "int8"),
+        ((1, 32, 32, 16), (3, 3), 12, 1, 0, 1, "int16"),
+        ((1, 16, 16, 32), (3, 3), 12, 1, 0, 1, "int16"),
+    )
+    data_layout = parameter("NCHW")
+    kernel_layout = parameter("OIHW")
+    out_layout = parameter("NCHW")
+    schedule_name = parameter("conv2d_nchw_spatial_pack.arm_cpu")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py b/tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py
deleted file mode 100644
index 6f1ea0b34a2e..000000000000
--- a/tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import sys
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
-from tvm.micro.testing.aot_test_utils import (
-    AOT_CORSTONE300_RUNNER,
-)
-
-
-class BasicConv2dTests:
-    @tvm.testing.requires_corstone300
-    def test_conv2d(
-        self,
-        data_shape,
-        kernel_size,
-        kernel_layout,
-        num_filter,
-        strides,
-        padding,
-        dilation,
-        dtype,
-        schedule_name,
-    ):
-        """Test a subgraph with a single conv2d_nchw operator."""
-        ishape = data_shape
-        wshape = (num_filter, data_shape[1], *kernel_size)
-        weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype)
-
-        input0 = relay.var("input", relay.TensorType(ishape, dtype))
-        weight0 = relay.const(weight_data)
-        out0 = relay.op.nn.conv2d(
-            input0,
-            weight0,
-            kernel_size=kernel_size,
-            strides=strides,
-            padding=padding,
-            dilation=(dilation, dilation),
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32",
-            out_layout="NCHW",
-        )
-        ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))
-
-        input1 = relay.var("input", relay.TensorType(ishape, dtype))
-        weight1 = relay.const(weight_data)
-
-        out1 = relay.op.nn.conv2d(
-            input1,
-            weight1,
-            kernel_size=kernel_size,
-            strides=strides,
-            padding=padding,
-            dilation=(dilation, dilation),
-            data_layout="NCHW",
-            kernel_layout=kernel_layout,
-            out_dtype="int32",
-            out_layout="NCHW",
-        )
-        mod = tvm.IRModule.from_expr(relay.Function([input1], out1))
-
-        inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)}
-        output_list = generate_ref_data(ref_mod, inputs)
-
-        compile_and_run(
-            AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
-            runner=AOT_CORSTONE300_RUNNER,
-            interface_api="c",
-            use_unpacked_api=True,
-            target_opts={
-                "-keys": "arm_cpu",
-                "-mcpu": "cortex-m7",
-            },
-            schedule_name=schedule_name,
-        )
-
-
-class TestConv2d_OIHW_small_kernel(BasicConv2dTests):
-    """This test is for conv2d_nchw_spatial_pack.arm_cpu schedule."""
-
-    data_shape, kernel_size, num_filter, strides, padding, dilation, dtype = tvm.testing.parameters(
-        ((1, 16, 32, 32), (3, 3), 12, 1, 0, 1, "int8"),
-        ((1, 16, 32, 32), (3, 3), 12, 1, 0, 1, "int16"),
-        ((1, 32, 16, 16), (3, 3), 12, 1, 0, 1, "int16"),
-    )
-    kernel_layout = tvm.testing.parameter("OIHW")
-    schedule_name = tvm.testing.parameter("conv2d_nchw_spatial_pack.arm_cpu")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py b/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py
deleted file mode 100644
index f5de3b51b67d..000000000000
--- a/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import sys
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
-from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
-from tvm.topi.utils import change_constant_shape
-
-
-class BasicConv2dTests:
-    @tvm.testing.requires_corstone300
-    def test_conv2d(
-        self,
-        data_shape,
-        kernel_size,
-        kernel_layout,
-        num_filter,
-        strides,
-        padding,
-        dilation,
-        dtype,
-        schedule_name,
-    ):
-        """Test a subgraph with a single conv2d operator."""
-        ishape = data_shape
-        wshape = (*kernel_size, data_shape[-1], num_filter)
-
-        weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype)
-
-        input0 = relay.var("input", relay.TensorType(ishape, dtype))
-        weight0 = relay.const(weight_data)
-        out0 = relay.op.nn.conv2d(
-            input0,
-            weight0,
-            kernel_size=kernel_size,
-            strides=strides,
-            padding=padding,
-            dilation=(dilation, dilation),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            out_dtype="int32",
-            out_layout="NHWC",
-        )
-        ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))
-
-        input1 = relay.var("input", relay.TensorType(ishape, dtype))
-        weight1 = change_constant_shape(weight0, "HWIO", kernel_layout)
-
-        out1 = relay.op.nn.conv2d(
-            input1,
-            weight1,
-            kernel_size=kernel_size,
-            strides=strides,
-            padding=padding,
-            dilation=(dilation, dilation),
-            data_layout="NHWC",
-            kernel_layout=kernel_layout,
-            out_dtype="int32",
-            out_layout="NHWC",
-        )
-        mod = tvm.IRModule.from_expr(relay.Function([input1], out1))
-
-        inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)}
-        output_list = generate_ref_data(ref_mod, inputs)
-
-        compile_and_run(
-            AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
-            runner=AOT_CORSTONE300_RUNNER,
-            interface_api="c",
-            use_unpacked_api=True,
-            target_opts={
-                "-keys": "arm_cpu",
-                "-mcpu": "cortex-m7",
-            },
-            schedule_name=schedule_name,
-        )
-
-
-class TestConv2d_DSP_HWOI(BasicConv2dTests):
-    """This test is for conv2d_nhwc_dsp.arm_cpu schedule."""
-
-    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
-        # TODO(mehrdadh): Fails due to https://github.com/apache/tvm/issues/11216
-        # ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1),
-        # ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1),
-        # ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
-        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1),
-        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
-        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
-        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
-        ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
-        # from Keyword Spotting model from MLPerfTiny models
-        # TODO(mehrdad): Fails due to https://github.com/apache/tvm/issues/11216
-        # ((1, 49, 10, 1), (10, 4), 64, (2, 2), (4, 1, 5, 1), 1),
-        # from Visual Wake Word model from MLPerfTiny models
-        # TODO(mehrdadh): fails due to https://github.com/apache/tvm/issues/11216
-        # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1), 1),
-        # from Image Classification model from MLPerfTiny models
-        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0, 1),
-        ((4, 16, 16, 8), (5, 5), 8, 2, (0, 4, 4, 0), 1),
-        ((4, 16, 16, 8), (5, 5), 16, 2, (0, 4, 4, 0), 1),
-        ((4, 16, 16, 8), (5, 5), 8, 2, 0, 1),
-        ((4, 16, 16, 8), (5, 5), 16, 2, 0, 1),
-        ((1, 16, 16, 8), (3, 3), 16, 2, (0, 0, 1, 1), 1),
-        ((1, 16, 16, 8), (3, 3), 16, 2, (1, 1, 2, 2), 1),
-        ((1, 16, 16, 8), (5, 5), 16, 2, (3, 3, 2, 2), 1),
-        ((1, 16, 16, 8), (3, 3), 16, 2, (0, 1, 2, 3), 1),
-    )
-    dtype = tvm.testing.parameter("int8", "int16")
-    kernel_layout = tvm.testing.parameter("HWOI")
-    schedule_name = tvm.testing.parameter("conv2d_nhwc_dsp.arm_cpu")
-
-
-class TestConv2d_HWIO(BasicConv2dTests):
-    """This test is for conv2d_nhwc_spatial_pack.arm_cpu schedule."""
-
-    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
-        ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1),
-        ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1),
-        ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
-        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1),
-        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
-        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
-        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
-        ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
-    )
-    dtype = tvm.testing.parameter("int8", "int16")
-    kernel_layout = tvm.testing.parameter("HWIO")
-    schedule_name = tvm.testing.parameter("conv2d_nhwc_spatial_pack.arm_cpu")
-
-
-class TestConv2d_Tensordot(BasicConv2dTests):
-    data_shape, kernel_size, num_filter, strides, padding = tvm.testing.parameters(
-        # Disabled because these kernels are not an integral number of words
-        # ((1, 32, 32, 1), (3, 3), 12, 1, 0),
-        # ((1, 32, 10, 3), (3, 3), 16, 1, 0),
-        # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1)),
-        ((4, 16, 16, 8), (5, 5), 8, 2, (0, 3, 3, 0)),
-        ((4, 16, 16, 8), (5, 5), 16, 2, (0, 3, 3, 0)),
-        ((4, 16, 16, 8), (5, 5), 8, 2, 0),
-        ((4, 16, 16, 8), (5, 5), 16, 2, 0),
-        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0),
-        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0),
-        ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1)),
-        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0)),
-        ((1, 32, 32, 16), (3, 3), 16, 1, 0),
-        ((1, 32, 32, 16), (3, 3), 16, 1, 0),
-        ((1, 49, 10, 1), (10, 4), 64, (2, 2), (4, 1, 5, 1)),
-        ((1, 16, 16, 8), (3, 3), 16, 2, (0, 0, 1, 1)),
-        ((1, 16, 16, 8), (3, 3), 16, 2, (1, 1, 2, 2)),
-        ((1, 16, 16, 8), (5, 5), 16, 2, (3, 3, 2, 2)),
-        ((1, 32, 32, 16), (3, 3), 16, 1, 0),
-        ((1, 16, 16, 32), (1, 1), 64, 1, 0),
-    )
-    dilation = tvm.testing.parameter(1)
-    dtype = tvm.testing.parameter("int8", "int16", "int32")
-    kernel_layout = tvm.testing.parameter("OHWI")
-    schedule_name = tvm.testing.parameter("conv2d_nhwc_ohwi_dsp.arm_cpu")
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
index 36059c798cbb..f45d27bdaee9 100644
--- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
@@ -14,124 +14,50 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
-from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
-
-
-class BasicDepthwiseConv2dTests:
-    @tvm.testing.requires_corstone300
-    def test_conv2d(
-        self,
-        data_shape,
-        data_layout,
-        kernel_size,
-        kernel_layout,
-        num_filter,
-        strides,
-        padding,
-        dilation,
-        dtype,
-        schedule_name,
-    ):
-        """Test a subgraph with a single conv2d operator."""
-        ishape = data_shape
-        groups = num_filter
-
-        assert groups > 1, f"groups should be more than 1 to create a depthwise conv2d."
-
-        if data_layout == "NCHW" and kernel_layout == "OIHW":
-            assert (
-                num_filter == data_shape[1]
-            ), f"Output channels({num_filter}) should be equal to input channels({data_shape[1]})."
-            wshape = (num_filter, data_shape[1] // groups, *kernel_size)
-        elif data_layout == "NHWC" and kernel_layout == "HWOI":
-            assert (
-                num_filter == data_shape[3]
-            ), f"Output channels({num_filter}) should be equal to input channels({data_shape[3]})."
-            wshape = (*kernel_size, num_filter, data_shape[3] // groups)
-        else:
-            raise ValueError(
-                f"Incorrect data layout({data_layout}) and kernel layout({kernel_layout})."
-            )
-
-        weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype)
-
-        input0 = relay.var("input", relay.TensorType(ishape, dtype))
-        weight0 = relay.const(weight_data)
-        out0 = relay.op.nn.conv2d(
-            input0,
-            weight0,
-            kernel_size=kernel_size,
-            strides=strides,
-            padding=padding,
-            groups=groups,
-            dilation=(dilation, dilation),
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-            out_dtype="int32",
-            out_layout=data_layout,
-        )
-        ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))
-
-        input1 = relay.var("input", relay.TensorType(ishape, dtype))
-        weight1 = relay.const(weight_data)
-        out1 = relay.op.nn.conv2d(
-            input1,
-            weight1,
-            kernel_size=kernel_size,
-            strides=strides,
-            padding=padding,
-            groups=groups,
-            dilation=(dilation, dilation),
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-            out_dtype="int32",
-            out_layout=data_layout,
-        )
-        mod = tvm.IRModule.from_expr(relay.Function([input1], out1))
-
-        inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)}
-        output_list = generate_ref_data(ref_mod, inputs)
-
-        compile_and_run(
-            AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
-            runner=AOT_CORSTONE300_RUNNER,
-            interface_api="c",
-            use_unpacked_api=True,
-            target_opts={
-                "-keys": "arm_cpu",
-                "-mcpu": "cortex-m7",
-            },
-            schedule_name=schedule_name,
-        )
-
-
-class TestDepthwiseConv2d_NCHW_OIHW(BasicDepthwiseConv2dTests):
+"""Tests for arm_cpu schedules for depthwise_conv2d."""
+
+from test_generalized_conv2d import GeneralizedConv2dTests
+from tvm.testing import fixture, main, parameter, parameters
+
+
+class DepthwiseConv2dTests(GeneralizedConv2dTests):
+    """Helper for constructing depthwise Conv2ds. Sets the reference kernel layout to what x86 code
+    supports."""
+
+    @fixture
+    def groups(self, data_shape):
+        """By definition, a depthwise_conv2d has a number of groups equal to the number of input
+        channels, so we don't need to specify the number of groups each time."""
+        return data_shape[3]
+
+    def setup_method(self):
+        self.ref_kernel_layout = "HWOI"
+
+
+class TestDepthwiseConv2d_NCHW_OIHW(DepthwiseConv2dTests):
     """This test is for depthwise_conv2d_nchw.arm_cpu schedule."""
 
-    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
-        ((1, 16, 32, 32), (3, 3), 16, 1, 0, 1),
-        ((1, 32, 10, 3), (3, 3), 32, 1, 0, 1),
-        ((1, 32, 32, 16), (3, 3), 32, 1, (0, 2, 2, 0), 1),
-        ((1, 32, 32, 16), (3, 3), 32, 1, 0, 1),
-        ((1, 32, 32, 16), (3, 3), 32, 1, 0, 1),
-        ((1, 32, 32, 16), (3, 3), 32, 1, (0, 2, 2, 0), 2),
-        ((1, 16, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
+    data_shape, kernel_size, num_filter, strides, padding, dilation = parameters(
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 10, 3, 32), (3, 3), 32, 1, 0, 1),
+        ((1, 32, 16, 32), (3, 3), 32, 1, (0, 2, 2, 0), 1),
+        ((1, 32, 16, 32), (3, 3), 32, 1, 0, 1),
+        ((1, 32, 16, 32), (3, 3), 32, 1, 0, 1),
+        ((1, 32, 16, 32), (3, 3), 32, 1, (0, 2, 2, 0), 2),
+        ((1, 32, 16, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
     )
-    data_layout = tvm.testing.parameter("NCHW")
-    dtype = tvm.testing.parameter("int8", "int16")
-    kernel_layout = tvm.testing.parameter("OIHW")
-    schedule_name = tvm.testing.parameter("depthwise_conv2d_nchw.arm_cpu")
+
+    in_dtype = parameter("int8", "int16")
+    data_layout = parameter("NCHW")
+    kernel_layout = parameter("OIHW")
+    out_layout = parameter("NCHW")
+    schedule_name = parameter("depthwise_conv2d_nchw.arm_cpu")
 
 
-class TestDepthwiseConv2d_NHWC_HWOI(BasicDepthwiseConv2dTests):
+class TestDepthwiseConv2d_NHWC_HWOI(DepthwiseConv2dTests):
     """This test is for depthwise_conv2d_nhwc.generic schedule."""
 
-    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
+    data_shape, kernel_size, num_filter, strides, padding, dilation = parameters(
         ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
         ((1, 32, 10, 16), (3, 3), 16, 1, 0, 1),
         ((1, 49, 10, 64), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
@@ -141,20 +67,20 @@ class TestDepthwiseConv2d_NHWC_HWOI(BasicDepthwiseConv2dTests):
         ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
         ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
     )
-    data_layout = tvm.testing.parameter("NHWC")
-    dtype = tvm.testing.parameter("int8", "int16")
-    kernel_layout = tvm.testing.parameter("HWOI")
-    schedule_name = tvm.testing.parameter("depthwise_conv2d_nhwc.generic")
 
+    in_dtype = parameter("int8", "int16")
+    data_layout = parameter("NHWC")
+    kernel_layout = parameter("HWOI")
+    out_layout = parameter("NHWC")
+    schedule_name = parameter("depthwise_conv2d_nhwc.generic")
 
-class TestDepthwiseConv2d_NHWC_HWOI_DSP(BasicDepthwiseConv2dTests):
-    """This test is for depthwise_conv2d_nhwc_dsp.arm_cpu schedule."""
 
-    # Tests that work with both int8 and int16 data types. Tuple elements are:
-    # data_shape, kernel_size, num_filter, strides, padding
-    dtype_parameterized_tests = [
-        # Depthwise_conv2d parameters from MobileNetV1 0.25x. The LLVM implementation doesn't
-        # support "SAME" and "VALID" padding, so padding must be explicitly specified.
+class TestDepthwiseConv2d_NHWC_HWOI_DSP(DepthwiseConv2dTests):
+    """This test is for depthwise_conv2d_nhwc_dsp.arm_cpu schedule. The tests that are parameterized
+    by dtype work for both int8 and int16, while the others only work on the specified dtype."""
+
+    in_dtype_parameterized_tests = [
+        # Depthwise_conv2d parameters from MobileNetV1 0.25x
         ((1, 48, 48, 8), (3, 3), 8, (1, 1), 1),
         ((1, 48, 48, 16), (3, 3), 16, (2, 2), (1, 1, 0, 0)),
         ((1, 24, 24, 32), (3, 3), 32, (1, 1), 1),
@@ -164,53 +90,56 @@ class TestDepthwiseConv2d_NHWC_HWOI_DSP(BasicDepthwiseConv2dTests):
         ((1, 6, 6, 128), (3, 3), 128, (1, 1), 1),
         ((1, 6, 6, 128), (3, 3), 128, (2, 2), (1, 1, 0, 0)),
         ((1, 3, 3, 256), (3, 3), 256, (1, 1), 1),
-        # Asymmetric height and width
+        # Asymmetric and larger kernels
         ((1, 25, 5, 64), (3, 3), 64, (1, 1), 1),
-        # Larger kernel
         ((1, 24, 24, 8), (5, 5), 8, (1, 1), 1),
-        # Asymmetric kernel
         ((1, 24, 24, 8), (3, 5), 8, (1, 1), 1),
     ]
 
-    data_shape, kernel_size, num_filter, strides, padding, dtype = tvm.testing.parameters(
+    data_shape, kernel_size, num_filter, strides, padding, in_dtype = parameters(
         # Make a copy of each parameterized test for int8 and one for int16
-        *map(lambda t: t + ("int8",), dtype_parameterized_tests),
-        *map(lambda t: t + ("int16",), dtype_parameterized_tests),
+        *map(lambda t: t + ("int8",), in_dtype_parameterized_tests),
+        *map(lambda t: t + ("int16",), in_dtype_parameterized_tests),
         # Test the int16 implementation with channel numbers not divisible by four
         ((1, 48, 48, 6), (3, 3), 6, (1, 1), 1, "int16"),
     )
-    dilation = tvm.testing.parameter(1)
-    data_layout = tvm.testing.parameter("NHWC")
-    kernel_layout = tvm.testing.parameter("HWOI")
-    schedule_name = tvm.testing.parameter("depthwise_conv2d_nhwc_dsp.arm_cpu")
+    dilation = parameter(1)
+    data_layout = parameter("NHWC")
+    kernel_layout = parameter("HWOI")
+    out_layout = parameter("NHWC")
+    schedule_name = parameter("depthwise_conv2d_nhwc_dsp.arm_cpu")
+
 
+class TestDepthwiseConv2d_Tensordot(DepthwiseConv2dTests):
+    """This test is for the depthwise_conv2d schedule tensorized using tensordot."""
 
-class TestDepthwiseConv2d_Tensordot(BasicDepthwiseConv2dTests):
-    data_shape, kernel_size, num_filter, strides, padding, dtype = tvm.testing.parameters(
+    data_shape, kernel_size, num_filter, strides, padding, in_dtype = parameters(
         # Currently, our schedule requires kernel_w be divisible by the number of simd lanes given
         # its dtype. This means 3x3 and 5x5 kernels do not work on int16 or int8 for now. If you had
         # to, you could hack around this by padding the data and kernel.
-        ((1, 8, 48, 48), (3, 3), 8, (1, 1), 1, "int32"),
-        ((1, 16, 48, 48), (3, 3), 16, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 32, 24, 24), (3, 3), 32, (1, 1), 1, "int32"),
-        ((1, 32, 24, 24), (3, 3), 32, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 64, 12, 12), (3, 3), 64, (1, 1), 1, "int32"),
-        ((1, 64, 12, 12), (3, 3), 64, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 128, 6, 6), (3, 3), 128, (1, 1), 1, "int32"),
-        ((1, 128, 6, 6), (3, 3), 128, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 256, 3, 3), (3, 3), 256, (1, 1), 1, "int32"),
-        ((1, 64, 25, 5), (3, 3), 64, (1, 1), 1, "int32"),
-        ((1, 8, 24, 24), (5, 5), 8, (1, 1), 1, "int32"),
-        ((1, 8, 24, 24), (3, 5), 8, (1, 1), 1, "int32"),
+        ((1, 48, 48, 8), (3, 3), 8, (1, 1), 1, "int32"),
+        ((1, 48, 48, 16), (3, 3), 16, (2, 2), (1, 1, 0, 0), "int32"),
+        ((1, 24, 24, 32), (3, 3), 32, (1, 1), 1, "int32"),
+        ((1, 24, 24, 32), (3, 3), 32, (2, 2), (1, 1, 0, 0), "int32"),
+        ((1, 12, 12, 64), (3, 3), 64, (1, 1), 1, "int32"),
+        ((1, 12, 12, 64), (3, 3), 64, (2, 2), (1, 1, 0, 0), "int32"),
+        ((1, 6, 6, 128), (3, 3), 128, (1, 1), 1, "int32"),
+        ((1, 6, 6, 128), (3, 3), 128, (2, 2), (1, 1, 0, 0), "int32"),
+        ((1, 3, 3, 256), (3, 3), 256, (1, 1), 1, "int32"),
+        ((1, 25, 5, 64), (3, 3), 64, (1, 1), 1, "int32"),
+        ((1, 24, 24, 8), (5, 5), 8, (1, 1), 1, "int32"),
+        ((1, 24, 24, 8), (3, 5), 8, (1, 1), 1, "int32"),
         # These "evenly divisible" kernels work on smaller dtypes.
-        ((1, 8, 48, 48), (3, 2), 8, 1, 0, "int16"),
-        ((1, 8, 48, 48), (4, 4), 8, 1, 0, "int8"),
+        ((1, 48, 48, 8), (3, 2), 8, 1, 0, "int16"),
+        ((1, 48, 48, 8), (4, 4), 8, 1, 0, "int8"),
     )
-    dilation = tvm.testing.parameter(1)
-    data_layout = tvm.testing.parameter("NCHW")
-    kernel_layout = tvm.testing.parameter("OIHW")
-    schedule_name = tvm.testing.parameter("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
+    dilation = parameter(1)
+
+    data_layout = parameter("NCHW")
+    kernel_layout = parameter("OIHW")
+    out_layout = parameter("NHWC", "NCHW")
+    schedule_name = parameter("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
 
 
 if __name__ == "__main__":
-    tvm.testing.main()
+    main()
diff --git a/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py
new file mode 100644
index 000000000000..499d677e8f95
--- /dev/null
+++ b/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py
@@ -0,0 +1,152 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Helper class for testing variations of 2D convolution. Should be used by subclassing
+`GeneralizedConv2dTests`, and then setting the arguments using tvm.testing.parameter(s)."""
+
+import numpy as np
+
+import tvm
+import tvm.testing
+from tvm import relay
+from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
+from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
+
+
+def _change_ndarray_layout(arr, src_layout, dst_layout):
+    """Makes a copy of an ndarray, reshaping it to a new data layout.
+
+    Parameter
+    ---------
+    arr : numpy.ndarray
+        The ndarray to be reformatted.
+
+    src_layout : str
+        The current layout of the Relay constant. Must be alphabetic (e.g. NHWC
+        or OIHW, but not NCHW2c).
+
+    dst_layout : str
+        The desired layout of new the Relay constant. Must be alphabetic (e.g. NHWC
+        or OIHW, but not NCHW2c).
+
+    Returns
+    -------
+    dst_shape : numpy.ndarray
+        A copy of the ndarray with the new layout.
+    """
+    assert src_layout.isalpha() and dst_layout.isalpha()
+    axis_order = [src_layout.index(c) for c in dst_layout]
+    return np.transpose(arr, axis_order)
+
+
+class GeneralizedConv2dTests:
+    """Superclass which can be used to test regular, depthwise, or grouped conv2D. Cannot be used
+    for 5D data formats (NCHWc and such) as written, but could be extended. Might also be worth
+    abstracting some of this logic into an even more general class that could be used for other
+    operators.
+
+    Note that data_shape should always be a tuple of length four indicating the data shape in NHWC
+    format (it will later be reshaped according to the given data_layout), and kernel_size should be
+    a length two tuple giving the height and width of the kernel.
+
+    This test (and other base Conv2dTests classes) are not run by Pytest, as their names do not
+    start with `Test`."""
+
+    @tvm.testing.requires_corstone300
+    def test_conv2d(
+        self,
+        data_shape,
+        kernel_size,
+        num_filter,
+        in_dtype,
+        strides,
+        padding,
+        groups,
+        dilation,
+        data_layout,
+        kernel_layout,
+        out_layout,
+        schedule_name,
+    ):
+        """Test a subgraph with a single conv2d operator."""
+
+        ref_input_data = np.random.randint(low=-128, high=127, size=data_shape, dtype=in_dtype)
+        ref_input_var = relay.var("input", relay.TensorType(data_shape, in_dtype))  # NHWC layout
+        kernel_shape = (*kernel_size, data_shape[-1] // groups, num_filter)  # HWIO layout
+        ref_kernel_data = np.random.randint(low=-10, high=10, size=kernel_shape, dtype=in_dtype)
+
+        """Our x86 depthwise implementation only supports HWOI with NHWC, so we need to change our
+        kernel layout to work around this. We can't just change the whole thing to HWIO or
+        something else, as then group conv2d would not work. Eventually, we should switch to using
+        TensorFlow to create the reference output so we can ensure our implementation is right.
+        See https://github.com/apache/tvm/issues/13137 for details."""
+
+        ref_relay_op = relay.op.nn.conv2d(
+            ref_input_var,
+            relay.const(_change_ndarray_layout(ref_kernel_data, "HWIO", self.ref_kernel_layout)),
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            groups=groups,
+            dilation=(dilation, dilation),
+            data_layout="NHWC",
+            kernel_layout=self.ref_kernel_layout,
+            out_dtype="int32",
+            out_layout="NHWC",
+        )
+        ref_module = tvm.IRModule.from_expr(relay.Function([ref_input_var], ref_relay_op))
+        ref_outputs = generate_ref_data(ref_module, {"input": ref_input_data})
+
+        # Reshape output dictionary to match out_layout
+        assert len(ref_outputs) == 1
+        output_tensor_name, output_tensor = next(iter(ref_outputs.items()))
+        ref_outputs[output_tensor_name] = _change_ndarray_layout(output_tensor, "NHWC", out_layout)
+
+        test_input_data = _change_ndarray_layout(ref_input_data, "NHWC", data_layout)
+        test_input_var = relay.var("input", relay.TensorType(test_input_data.shape, in_dtype))
+        test_kernel_data = _change_ndarray_layout(ref_kernel_data, "HWIO", kernel_layout)
+
+        test_relay_op = relay.op.nn.conv2d(
+            test_input_var,
+            relay.const(test_kernel_data),
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            groups=groups,
+            dilation=(dilation, dilation),
+            data_layout=data_layout,
+            kernel_layout=kernel_layout,
+            out_dtype="int32",
+            out_layout=out_layout,
+        )
+        test_function = relay.Function([test_input_var], test_relay_op)
+        test_model = AOTTestModel(
+            module=tvm.IRModule.from_expr(test_function),
+            inputs={"input": test_input_data},
+            outputs=ref_outputs,
+        )
+
+        compile_and_run(
+            test_model,
+            runner=AOT_CORSTONE300_RUNNER,
+            interface_api="c",
+            use_unpacked_api=True,
+            target_opts={
+                "-keys": "arm_cpu",
+                "-mcpu": "cortex-m7",
+            },
+            schedule_name=schedule_name,
+        )
diff --git a/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py
index 47fe6d9f74c2..fb11ceda5097 100644
--- a/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py
@@ -14,121 +14,45 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import relay
-from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
-from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
+"""Tests for arm_cpu schedules for grouped conv2d."""
 
+from test_generalized_conv2d import GeneralizedConv2dTests
+from tvm.testing import main, parameter, parameters
 
-class BasicGroupConv2dTests:
-    @tvm.testing.requires_corstone300
-    def test_conv2d(
-        self,
-        data_shape,
-        data_layout,
-        kernel_size,
-        kernel_layout,
-        num_filter,
-        strides,
-        padding,
-        dilation,
-        groups,
-        dtype,
-        schedule_name,
-    ):
-        """Test a subgraph with a single conv2d operator."""
-        ishape = data_shape
 
-        assert groups > 1, f"groups should be more than 1 to create a group conv2d."
+class GroupConv2dTests(GeneralizedConv2dTests):
+    """Helper for constructing group Conv2ds. Sets the reference kernel layout to what x86 code
+    supports."""
 
-        if data_layout == "NCHW" and kernel_layout == "OIHW":
-            assert data_shape[1] % groups == 0
-            wshape = (num_filter, data_shape[1] // groups, *kernel_size)
-        elif data_layout == "NHWC" and kernel_layout == "HWIO":
-            assert data_shape[3] % groups == 0
-            wshape = (*kernel_size, data_shape[3] // groups, num_filter)
-        else:
-            raise ValueError(
-                f"Incorrect data layout({data_layout}) and kernel layout({kernel_layout})."
-            )
+    def setup_method(self):
+        self.ref_kernel_layout = "HWIO"
 
-        weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype)
 
-        input0 = relay.var("input", relay.TensorType(ishape, dtype))
-        weight0 = relay.const(weight_data)
-        out0 = relay.op.nn.conv2d(
-            input0,
-            weight0,
-            kernel_size=kernel_size,
-            strides=strides,
-            padding=padding,
-            groups=groups,
-            dilation=(dilation, dilation),
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-            out_dtype="int32",
-            out_layout=data_layout,
-        )
-        ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))
-
-        input1 = relay.var("input", relay.TensorType(ishape, dtype))
-        weight1 = relay.const(weight_data)
-        out1 = relay.op.nn.conv2d(
-            input1,
-            weight1,
-            kernel_size=kernel_size,
-            strides=strides,
-            padding=padding,
-            groups=groups,
-            dilation=(dilation, dilation),
-            data_layout=data_layout,
-            kernel_layout=kernel_layout,
-            out_dtype="int32",
-            out_layout=data_layout,
-        )
-        mod = tvm.IRModule.from_expr(relay.Function([input1], out1))
-
-        inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)}
-        output_list = generate_ref_data(ref_mod, inputs)
-
-        compile_and_run(
-            AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
-            runner=AOT_CORSTONE300_RUNNER,
-            interface_api="c",
-            use_unpacked_api=True,
-            target_opts={
-                "-keys": "arm_cpu",
-                "-mcpu": "cortex-m7",
-            },
-            schedule_name=schedule_name,
-        )
-
-
-class TestGroupConv2d_NCHW_OIHW(BasicGroupConv2dTests):
+class TestGroupConv2d_NCHW_OIHW(GroupConv2dTests):
     """This test is for group_conv2d_nchw.arm_cpu schedule."""
 
-    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
-        ((1, 16, 32, 32), (3, 3), 12, 1, 0, 1),
-        ((1, 16, 32, 10), (3, 3), 16, 1, 0, 1),
-        ((1, 16, 32, 32), (3, 3), 16, 1, (0, 2, 2, 0), 1),
-        ((1, 16, 32, 32), (3, 3), 16, 1, 0, 1),
-        ((1, 16, 32, 32), (3, 3), 16, 1, 0, 1),
-        ((1, 16, 32, 32), (3, 3), 16, 1, (0, 2, 2, 0), 2),
-        ((1, 16, 32, 32), (3, 3), 32, 1, (1, 1, 2, 2), 2),
+    data_shape, kernel_size, num_filter, strides, padding, dilation = parameters(
+        ((1, 32, 32, 16), (3, 3), 12, 1, 0, 1),
+        ((1, 32, 10, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
+        ((1, 32, 32, 16), (3, 3), 32, 1, (1, 1, 2, 2), 2),
     )
-    groups = tvm.testing.parameter(2, 4)
-    data_layout = tvm.testing.parameter("NCHW")
-    dtype = tvm.testing.parameter("int8", "int16")
-    kernel_layout = tvm.testing.parameter("OIHW")
-    schedule_name = tvm.testing.parameter("group_conv2d_nchw.arm_cpu")
+    groups = parameter(2, 4)
+    in_dtype = parameter("int8", "int16")
+
+    data_layout = parameter("NCHW")
+    kernel_layout = parameter("OIHW")
+    out_layout = parameter("NCHW")
+    schedule_name = parameter("group_conv2d_nchw.arm_cpu")
 
 
-class TestGroupConv2d_NHWC_HWIO(BasicGroupConv2dTests):
+class TestGroupConv2d_NHWC_HWIO(GroupConv2dTests):
     """This test is for group_conv2d_nhwc.generic schedule."""
 
-    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
+    data_shape, kernel_size, num_filter, strides, padding, dilation = parameters(
         ((1, 32, 32, 16), (3, 3), 12, 1, 0, 1),
         ((1, 32, 10, 16), (3, 3), 16, 1, 0, 1),
         ((1, 49, 10, 16), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
@@ -138,12 +62,14 @@ class TestGroupConv2d_NHWC_HWIO(BasicGroupConv2dTests):
         ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
         ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
     )
-    groups = tvm.testing.parameter(2, 4)
-    data_layout = tvm.testing.parameter("NHWC")
-    dtype = tvm.testing.parameter("int8", "int16")
-    kernel_layout = tvm.testing.parameter("HWIO")
-    schedule_name = tvm.testing.parameter("group_conv2d_nhwc.generic")
+    groups = parameter(2, 4)
+    in_dtype = parameter("int8", "int16")
+
+    data_layout = parameter("NHWC")
+    kernel_layout = parameter("HWIO")
+    out_layout = parameter("NHWC")
+    schedule_name = parameter("group_conv2d_nhwc.generic")
 
 
 if __name__ == "__main__":
-    tvm.testing.main()
+    main()

From eeb8b7067ea77a4848cb376ba2b6b8fe479a6c93 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Fri, 21 Oct 2022 13:12:20 -0700
Subject: [PATCH 415/704] [Docker][CI] Remove mounting local download path to
 docker (#13108)

remove mounting local tvm download directory
---
 docker/bash.sh | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/docker/bash.sh b/docker/bash.sh
index 2af65b17f5ca..5ee772867976 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -330,16 +330,6 @@ DOCKER_ENV+=( --env CI_BUILD_HOME="${REPO_MOUNT_POINT}"
               --env CI_IMAGE_NAME="${DOCKER_IMAGE_NAME}"
             )
 
-
-# Pass tvm test data folder through to the docker container, to avoid
-# repeated downloads.  Check if we have permissions to write to the
-# directory first, since the CI may not.
-TEST_DATA_PATH="${TVM_DATA_ROOT_PATH:-${HOME}/.tvm_test_data}"
-if [[ -d "${TEST_DATA_PATH}" && -w "${TEST_DATA_PATH}" ]]; then
-    DOCKER_MOUNT+=( --volume "${TEST_DATA_PATH}":"${REPO_MOUNT_POINT}"/.tvm_test_data )
-fi
-
-
 # Remove the container once it finishes running (--rm) and share the
 # PID namespace (--pid=host).  The process inside does not have pid 1
 # and SIGKILL is propagated to the process inside, allowing jenkins to

From ad117809a165bc65a8dc1330b0bcd75e65bc4e8d Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Fri, 21 Oct 2022 13:30:48 -0700
Subject: [PATCH 416/704] [Hexagon] Skip
 HexagonThreadManagerTest.thread_order_signal_wait unit test (#13171)

skip test
---
 tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc b/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
index e8532eb54514..a1a13ed39340 100644
--- a/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
@@ -259,6 +259,7 @@ TEST_F(HexagonThreadManagerTest, thread_order) {
 }
 
 TEST_F(HexagonThreadManagerTest, thread_order_signal_wait) {
+  GTEST_SKIP() << "Skipping due to: https://github.com/apache/tvm/issues/13169";
   std::vector<int> arr;
 
   htm->Wait(streams[1], 1);

From e024b0d2077096e3ef8473c66c6b83b343b2a29c Mon Sep 17 00:00:00 2001
From: Karl Koscher <kkoscher@octoml.ai>
Date: Fri, 21 Oct 2022 14:23:27 -0700
Subject: [PATCH 417/704] [Hexagon] Add power manager (#13162)

This PR adds a new class, HexagonPowerManager, which interfaces with the HAP_power API to request maximum performance.

* Add Hexagon Power Manager

* Remove HexagonPowerManager::LogPowerConfig

* Delete HexagonPowerManager in ReleaseResources

* Hexagon power manager must be destroyed after the thread manager
---
 src/runtime/hexagon/hexagon_common.h         |   1 +
 src/runtime/hexagon/hexagon_device_api.h     |  10 ++
 src/runtime/hexagon/hexagon_htp.cc           |  34 +-----
 src/runtime/hexagon/hexagon_htp.h            |   5 -
 src/runtime/hexagon/hexagon_power_manager.cc | 108 +++++++++++++++++++
 src/runtime/hexagon/hexagon_power_manager.h  |  63 +++++++++++
 6 files changed, 184 insertions(+), 37 deletions(-)
 create mode 100644 src/runtime/hexagon/hexagon_power_manager.cc
 create mode 100644 src/runtime/hexagon/hexagon_power_manager.h

diff --git a/src/runtime/hexagon/hexagon_common.h b/src/runtime/hexagon/hexagon_common.h
index 9f304836fcf1..025cc253eee9 100644
--- a/src/runtime/hexagon/hexagon_common.h
+++ b/src/runtime/hexagon/hexagon_common.h
@@ -41,6 +41,7 @@
     int result = api_call;                                                        \
     if (result != 0) {                                                            \
       HEXAGON_PRINT(ERROR, "ERROR: " #api_call " failed with error %d.", result); \
+      abort();                                                                    \
     }                                                                             \
   } while (0)
 
diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index e94ae4e87671..5fe4a62e6908 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -31,6 +31,7 @@
 
 #include "hexagon_buffer.h"
 #include "hexagon_buffer_manager.h"
+#include "hexagon_power_manager.h"
 #include "hexagon_thread_manager.h"
 #include "hexagon_user_dma.h"
 #include "hexagon_vtcm_pool.h"
@@ -55,6 +56,9 @@ class HexagonDeviceAPI final : public DeviceAPI {
 
   //! \brief Ensures resource managers are in a good state for the runtime
   void AcquireResources() {
+    CHECK_EQ(runtime_power_manager, nullptr);
+    runtime_power_manager = std::make_unique<HexagonPowerManager>();
+
     CHECK_EQ(runtime_vtcm, nullptr);
     runtime_vtcm = std::make_unique<HexagonVtcmPool>();
 
@@ -81,6 +85,9 @@ class HexagonDeviceAPI final : public DeviceAPI {
 
     CHECK(runtime_vtcm) << "runtime_vtcm was not created in AcquireResources";
     runtime_vtcm.reset();
+
+    CHECK(runtime_power_manager) << "runtime_power_manager was not created in AcquireResources";
+    runtime_power_manager.reset();
   }
 
   /*! \brief Currently unimplemented interface to specify the active
@@ -202,6 +209,9 @@ class HexagonDeviceAPI final : public DeviceAPI {
 
   //! \brief VTCM memory manager
   std::unique_ptr<HexagonVtcmPool> runtime_vtcm;
+
+  //! \brief Hexagon power manager
+  std::unique_ptr<HexagonPowerManager> runtime_power_manager;
 };
 }  // namespace hexagon
 }  // namespace runtime
diff --git a/src/runtime/hexagon/hexagon_htp.cc b/src/runtime/hexagon/hexagon_htp.cc
index 32084382ed7f..01344ccf4a79 100644
--- a/src/runtime/hexagon/hexagon_htp.cc
+++ b/src/runtime/hexagon/hexagon_htp.cc
@@ -35,39 +35,9 @@ namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-HexagonHtp::HexagonHtp() {
-  PowerOn();
-  Acquire();
-}
-
-HexagonHtp::~HexagonHtp() {
-  Release();
-  PowerOff();
-}
-
-void HexagonHtp::PowerOn() {
-  HAP_power_request_t pwr_req;
-  int nErr;
-
-  hap_pwr_ctx_ = HAP_utils_create_context();
-  pwr_req.type = HAP_power_set_HMX;
-  pwr_req.hmx.power_up = true;
-  if ((nErr = HAP_power_set(hap_pwr_ctx_, &pwr_req))) {
-    LOG(FATAL) << "InternalError: HAP_power_set failed\n";
-  }
-}
-
-void HexagonHtp::PowerOff() {
-  HAP_power_request_t pwr_req;
-  int nErr;
+HexagonHtp::HexagonHtp() { Acquire(); }
 
-  pwr_req.type = HAP_power_set_HMX;
-  pwr_req.hmx.power_up = false;
-  if ((nErr = HAP_power_set(hap_pwr_ctx_, &pwr_req))) {
-    LOG(FATAL) << "InternalError: HAP_power_set failed\n";
-  }
-  HAP_utils_destroy_context(hap_pwr_ctx_);
-}
+HexagonHtp::~HexagonHtp() { Release(); }
 
 void HexagonHtp::Acquire() {
   compute_res_attr_t compute_res_attr;
diff --git a/src/runtime/hexagon/hexagon_htp.h b/src/runtime/hexagon/hexagon_htp.h
index b52e07e27b46..b3f0c0b5f71f 100644
--- a/src/runtime/hexagon/hexagon_htp.h
+++ b/src/runtime/hexagon/hexagon_htp.h
@@ -45,14 +45,9 @@ class HexagonHtp {
   HexagonHtp& operator=(HexagonHtp&&) = delete;
 
  private:
-  //! \brief Power context
-  void* hap_pwr_ctx_;
-
   //! \brief Acquisition context ID
   unsigned int context_id_;
 
-  void PowerOn();
-  void PowerOff();
   void Acquire();
   void Release();
 };
diff --git a/src/runtime/hexagon/hexagon_power_manager.cc b/src/runtime/hexagon/hexagon_power_manager.cc
new file mode 100644
index 000000000000..3d8f621bfcce
--- /dev/null
+++ b/src/runtime/hexagon/hexagon_power_manager.cc
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "hexagon_power_manager.h"
+
+#include <AEEStdDef.h>
+#include <AEEStdErr.h>
+
+#include "HAP_power.h"
+#include "hexagon_common.h"
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+HexagonPowerManager::HexagonPowerManager() {
+  hap_pwr_ctx_ = HAP_utils_create_context();
+  PowerOnHVX();
+  PowerOnHTP();
+  SetAppType();
+  SetDCVS();
+}
+
+HexagonPowerManager::~HexagonPowerManager() {
+  PowerOffHTP();
+  PowerOffHVX();
+  HAP_utils_destroy_context(hap_pwr_ctx_);
+}
+
+void HexagonPowerManager::PowerOnHVX() {
+  HAP_power_request_t pwr_req;
+
+  pwr_req.type = HAP_power_set_HVX;
+  pwr_req.hvx.power_up = true;
+  HEXAGON_SAFE_CALL(HAP_power_set(hap_pwr_ctx_, &pwr_req));
+}
+
+void HexagonPowerManager::PowerOffHVX() {
+  HAP_power_request_t pwr_req;
+
+  pwr_req.type = HAP_power_set_HVX;
+  pwr_req.hvx.power_up = false;
+  HEXAGON_SAFE_CALL(HAP_power_set(hap_pwr_ctx_, &pwr_req));
+}
+
+void HexagonPowerManager::PowerOnHTP() {
+  HAP_power_request_t pwr_req;
+
+  pwr_req.type = HAP_power_set_HMX;
+  pwr_req.hmx.power_up = true;
+  HEXAGON_SAFE_CALL(HAP_power_set(hap_pwr_ctx_, &pwr_req));
+}
+
+void HexagonPowerManager::PowerOffHTP() {
+  HAP_power_request_t pwr_req;
+
+  pwr_req.type = HAP_power_set_HMX;
+  pwr_req.hmx.power_up = false;
+  HEXAGON_SAFE_CALL(HAP_power_set(hap_pwr_ctx_, &pwr_req));
+}
+
+void HexagonPowerManager::SetAppType() {
+  HAP_power_request_t pwr_req;
+
+  pwr_req.type = HAP_power_set_apptype;
+  pwr_req.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
+  HEXAGON_SAFE_CALL(HAP_power_set(hap_pwr_ctx_, &pwr_req));
+}
+
+void HexagonPowerManager::SetDCVS() {
+  HAP_power_request_t pwr_req;
+
+  memset(&pwr_req, 0, sizeof(HAP_power_request_t));
+  pwr_req.type = HAP_power_set_DCVS_v3;
+  pwr_req.dcvs_v3.set_dcvs_enable = TRUE;
+  pwr_req.dcvs_v3.dcvs_enable = FALSE;
+  pwr_req.dcvs_v3.set_core_params = TRUE;
+  pwr_req.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+  pwr_req.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+  pwr_req.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+  pwr_req.dcvs_v3.set_bus_params = TRUE;
+  pwr_req.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+  pwr_req.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+  pwr_req.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+  pwr_req.dcvs_v3.set_sleep_disable = TRUE;
+  pwr_req.dcvs_v3.sleep_disable = TRUE;
+  HEXAGON_SAFE_CALL(HAP_power_set(hap_pwr_ctx_, &pwr_req));
+}
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/hexagon/hexagon_power_manager.h b/src/runtime/hexagon/hexagon_power_manager.h
new file mode 100644
index 000000000000..6f88d92259c4
--- /dev/null
+++ b/src/runtime/hexagon/hexagon_power_manager.h
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_POWER_MANAGER_H_
+#define TVM_RUNTIME_HEXAGON_HEXAGON_POWER_MANAGER_H_
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+
+class HexagonPowerManager {
+ public:
+  //! \brief Constructor.
+  HexagonPowerManager();
+
+  //! \brief Destructor.
+  ~HexagonPowerManager();
+
+  //! \brief Prevent copy construction of HexagonPowerManager.
+  HexagonPowerManager(const HexagonPowerManager&) = delete;
+
+  //! \brief Prevent copy assignment with HexagonPowerManager.
+  HexagonPowerManager& operator=(const HexagonPowerManager&) = delete;
+
+  //! \brief Prevent move construction.
+  HexagonPowerManager(HexagonPowerManager&&) = delete;
+
+  //! \brief Prevent move assignment.
+  HexagonPowerManager& operator=(HexagonPowerManager&&) = delete;
+
+ private:
+  //! \brief Power context
+  void* hap_pwr_ctx_;
+
+  void PowerOnHVX();
+  void PowerOffHVX();
+  void PowerOnHTP();
+  void PowerOffHTP();
+  void SetAppType();
+  void SetDCVS();
+};
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_HEXAGON_HEXAGON_POWER_MANAGER_H_

From 39489447454d1803ed2f624a7e5824addd278c51 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 21 Oct 2022 16:37:57 -0500
Subject: [PATCH 418/704] [ci] Disable flaky Java RPC tests (#13170)

See #13168

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 tests/scripts/task_java_unittest.sh | 35 ++++++++++++++++-------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/tests/scripts/task_java_unittest.sh b/tests/scripts/task_java_unittest.sh
index 33467e661487..ef13e7963851 100755
--- a/tests/scripts/task_java_unittest.sh
+++ b/tests/scripts/task_java_unittest.sh
@@ -29,18 +29,23 @@ CURR_DIR=$(cd `dirname $0`; pwd)
 SCRIPT_DIR=$CURR_DIR/../../jvm/core/src/test/scripts
 TEMP_DIR=$(mktemp -d)
 
-python3 $SCRIPT_DIR/test_add_cpu.py $TEMP_DIR
-python3 $SCRIPT_DIR/test_add_gpu.py $TEMP_DIR
-python3 $SCRIPT_DIR/test_graph_executor.py $TEMP_DIR
-
-# start rpc proxy server
-PORT=$(( ( RANDOM % 1000 )  + 9000 ))
-python3 $SCRIPT_DIR/test_rpc_proxy_server.py $PORT 30 &
-
-make jvmpkg
-make jvmpkg JVM_TEST_ARGS="-DskipTests=false \
-  -Dtest.tempdir=$TEMP_DIR \
-  -Dtest.rpc.proxy.host=localhost \
-  -Dtest.rpc.proxy.port=$PORT"
-
-rm -rf $TEMP_DIR
+cleanup()
+{
+  rm -rf "$TEMP_DIR"
+}
+trap cleanup 0
+
+python3 "$SCRIPT_DIR"/test_add_cpu.py "$TEMP_DIR"
+python3 "$SCRIPT_DIR"/test_add_gpu.py "$TEMP_DIR"
+python3 "$SCRIPT_DIR"/test_graph_executor.py "$TEMP_DIR"
+
+# Skip the Java RPC Unittests, see https://github.com/apache/tvm/issues/13168
+# # start rpc proxy server
+# PORT=$(( ( RANDOM % 1000 )  + 9000 ))
+# python3 $SCRIPT_DIR/test_rpc_proxy_server.py $PORT 30 &
+
+# make jvmpkg
+# make jvmpkg JVM_TEST_ARGS="-DskipTests=false \
+#   -Dtest.tempdir=$TEMP_DIR \
+#   -Dtest.rpc.proxy.host=localhost \
+#   -Dtest.rpc.proxy.port=$PORT"

From e42e4bba28499f28064ab8dd2ee6776402aeb5da Mon Sep 17 00:00:00 2001
From: Karl Koscher <kkoscher@octoml.ai>
Date: Fri, 21 Oct 2022 16:23:21 -0700
Subject: [PATCH 419/704] [Hexagon] Adjust Hexagon pytest port range (#13172)

Adjust Hexagon pytest port range

ADB, the Android Debug Bridge, tries to determine if an emulator is present by
looking for port 5555 being open. Unfortunately, once adb thinks there's an
emulator running, it will continously try to connect to it, adding an error
to its log on every failed attempt. This log file quickly grows to hundreds
of gigabytes before developers notice they are completely out of disk space.
This problem is further compounded by the Hexagon Launcher forwarding 10
contiguous ports from the randomly-chosen RPC base port, causing this failure
to happen more often than you'd otherwise suspect.

See https://stackoverflow.com/questions/3152681/android-emulator-5554-offline/10356594#10356594
---
 python/tvm/contrib/hexagon/pytest_plugin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index b99bfe7fa753..7ee16f50eab4 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -75,7 +75,7 @@ def android_serial_number() -> Optional[str]:
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
-LISTEN_PORT_MIN = 2000  # Well above the privileged ports (1024 or lower)
+LISTEN_PORT_MIN = 6000  # Avoid hitting well-known Android debug ports
 LISTEN_PORT_MAX = 9000  # Below the search range end (port_end=9199) of RPC server
 PREVIOUS_PORT = None
 

From f4fdc136b991ce20a9c751847d82fd9fda07ce51 Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Fri, 21 Oct 2022 19:56:25 -0400
Subject: [PATCH 420/704] [Hexagon] [runtime] Allow creation of thread manager
 without hardware resources (#13174)

This allows the creation of a thread manager with optional hardware resources.  This allows us to create a new object for unit tests without hardware resources.

This will fix the test instability we've seen recently, as all tests were using the global thread manager and things could be in an unknown state.
---
 src/runtime/hexagon/hexagon_device_api.h      |  4 ++-
 src/runtime/hexagon/hexagon_thread_manager.cc | 29 +++++++++++++------
 src/runtime/hexagon/hexagon_thread_manager.h  | 17 ++++++++++-
 .../hexagon/hexagon_thread_manager_tests.cc   | 19 ++++++++----
 4 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index 5fe4a62e6908..30ac61a92b07 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -66,7 +66,8 @@ class HexagonDeviceAPI final : public DeviceAPI {
     runtime_hexbuffs = std::make_unique<HexagonBufferManager>();
 
     CHECK_EQ(runtime_threads, nullptr);
-    runtime_threads = std::make_unique<HexagonThreadManager>(threads, stack_size, pipe_size);
+    runtime_threads =
+        std::make_unique<HexagonThreadManager>(threads, stack_size, pipe_size, hw_resources);
 
     CHECK_EQ(runtime_dma, nullptr);
     runtime_dma = std::make_unique<HexagonUserDMA>();
@@ -203,6 +204,7 @@ class HexagonDeviceAPI final : public DeviceAPI {
   const unsigned threads{6};
   const unsigned pipe_size{1000};
   const unsigned stack_size{0x4000};  // 16KB
+  const std::vector<HardwareResourceType> hw_resources{DMA_0, HTP_0, HVX_0, HVX_1, HVX_2, HVX_3};
 
   //! \brief User DMA manager
   std::unique_ptr<HexagonUserDMA> runtime_dma;
diff --git a/src/runtime/hexagon/hexagon_thread_manager.cc b/src/runtime/hexagon/hexagon_thread_manager.cc
index 546a41cff041..cf64cdc8b2d0 100644
--- a/src/runtime/hexagon/hexagon_thread_manager.cc
+++ b/src/runtime/hexagon/hexagon_thread_manager.cc
@@ -24,10 +24,10 @@ namespace runtime {
 namespace hexagon {
 
 HexagonThreadManager::HexagonThreadManager(unsigned num_threads, unsigned thread_stack_size_bytes,
-                                           unsigned thread_pipe_size_words) {
-  // Note: could technically manage more software threads than allowable hardware threads, but there
-  // is no system constant defined
-  //  in the qurt libs for that maximum.
+                                           unsigned thread_pipe_size_words,
+                                           const std::vector<HardwareResourceType> hw_resources) {
+  // Note: could technically manage more software threads than allowable hardware threads, but
+  // there is no system constant defined in the qurt libs for that maximum.
   CHECK(num_threads);
   CHECK_LE(num_threads, QURT_MAX_HTHREAD_LIMIT);
   nthreads_ = num_threads;
@@ -38,14 +38,25 @@ HexagonThreadManager::HexagonThreadManager(unsigned num_threads, unsigned thread
   CHECK_GE(thread_pipe_size_words, MIN_PIPE_SIZE_WORDS);
   CHECK_LE(thread_pipe_size_words, MAX_PIPE_SIZE_WORDS);
 
+  // Support either no resources or a specific set of hardware resources for now.
+  if (!hw_resources.empty()) {
+    CHECK((hw_resources.size() == nthreads_) && (nthreads_ == 6) && (hw_resources[0] == DMA_0) &&
+          (hw_resources[1] == HTP_0) && (hw_resources[2] == HVX_0) && (hw_resources[3] == HVX_1) &&
+          (hw_resources[4] == HVX_2) && (hw_resources[5] == HVX_3))
+        << "Unsupported hardware resource set";
+  }
+  hw_resources_ = hw_resources;
+
+  if (!hw_resources_.empty()) {
+    DLOG(INFO) << "Initialize hardware resource managers";
+    // Acquisition/locks will be performed on specific threads
+    htp_ = std::make_unique<HexagonHtp>();
+    hvx_ = std::make_unique<HexagonHvx>();
+  }
+
   DLOG(INFO) << "Spawning threads";
   SpawnThreads(thread_stack_size_bytes, thread_pipe_size_words);
 
-  DLOG(INFO) << "Acquiring hardware resources";
-  // TODO(HWE): Move these bindings to specific threads
-  htp_ = std::make_unique<HexagonHtp>();
-  hvx_ = std::make_unique<HexagonHvx>();
-
   // Initially, block all threads until we get the Start() call
   qurt_sem_init_val(&start_semaphore_, 0);
   for (unsigned i = 0; i < nthreads_; i++) {
diff --git a/src/runtime/hexagon/hexagon_thread_manager.h b/src/runtime/hexagon/hexagon_thread_manager.h
index 30a931554494..a263cf42dc58 100644
--- a/src/runtime/hexagon/hexagon_thread_manager.h
+++ b/src/runtime/hexagon/hexagon_thread_manager.h
@@ -40,6 +40,17 @@ namespace tvm {
 namespace runtime {
 namespace hexagon {
 
+typedef enum {
+  NONE = -1,
+  DMA_0 = 0,
+  HTP_0,
+  HVX_0,
+  HVX_1,
+  HVX_2,
+  HVX_3,
+  MAX,
+} HardwareResourceType;
+
 class HexagonThreadManager {
   //! \brief Void function.
   using voidfunc = void (*)(void*);
@@ -64,7 +75,8 @@ class HexagonThreadManager {
    * \param thread_stack_size_bytes Stack size in bytes per thread.
    * \param thread_pipe_size_words Pipe (or command buffer) size in words (or commands).
    */
-  HexagonThreadManager(unsigned, unsigned thread_stack_size_bytes, unsigned thread_pipe_size_words);
+  HexagonThreadManager(unsigned, unsigned thread_stack_size_bytes, unsigned thread_pipe_size_words,
+                       const std::vector<HardwareResourceType> = {});
 
   //! \brief Destructor
   ~HexagonThreadManager();
@@ -188,6 +200,9 @@ class HexagonThreadManager {
     Command(voidfunc f, void* args) : f(f), args(args) {}
   };
 
+  //! \brief List of hardware resources
+  std::vector<HardwareResourceType> hw_resources_;
+
   //! \brief HTP hardware resource.
   // TODO(HWE): Move binding of HTP to a specific thread
   std::unique_ptr<HexagonHtp> htp_;
diff --git a/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc b/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
index a1a13ed39340..d7bf0afed906 100644
--- a/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
@@ -29,15 +29,16 @@ using namespace tvm::runtime::hexagon;
 class HexagonThreadManagerTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    htm = HexagonDeviceAPI::Global()->ThreadManager();
+    // Create with no hardware resources so we don't conflict with session HexagonThreadManager
+    htm = new HexagonThreadManager(threads, stack_size, pipe_size);
     streams = htm->GetStreamHandles();
   }
-  void TearDown() override {}
+  void TearDown() override { delete htm; }
   HexagonThreadManager* htm{nullptr};
   std::vector<TVMStreamHandle> streams;
   int answer{0};
   const unsigned threads{6};
-  const unsigned pipe_size{1000};
+  const unsigned pipe_size{100};
   const unsigned stack_size{0x4000};  // 16KB
 };
 
@@ -54,6 +55,15 @@ TEST_F(HexagonThreadManagerTest, ctor_errors) {
   ASSERT_THROW(HexagonThreadManager(6, stack_size, 9), InternalError);
   // pipe too big
   ASSERT_THROW(HexagonThreadManager(6, stack_size, 0x10000000), InternalError);
+  // hw resources count doesn't match thread count
+  ASSERT_THROW(HexagonThreadManager(6, stack_size, pipe_size, {DMA_0}), InternalError);
+  // hw resources doesn't match specific supported configuration
+  ASSERT_THROW(
+      HexagonThreadManager(6, stack_size, pipe_size, {DMA_0, HTP_0, HVX_0, HVX_1, HVX_2, DMA_0}),
+      InternalError);
+  // hw resources doesn't match specific supported configuration
+  ASSERT_THROW(HexagonThreadManager(5, stack_size, pipe_size, {DMA_0, HTP_0, HVX_0, HVX_1, HVX_2}),
+               InternalError);
 }
 
 TEST_F(HexagonThreadManagerTest, init) {
@@ -163,7 +173,7 @@ TEST_F(HexagonThreadManagerTest, pipe_fill) {
 }
 
 // TODO(HWE): Create a temporary thread manager with a smaller pipe for this test
-TEST_F(HexagonThreadManagerTest, DISABLED_pipe_overflow) {
+TEST_F(HexagonThreadManagerTest, pipe_overflow) {
   // fill the pipe
   for (int i = 0; i < pipe_size; ++i) {
     htm->Dispatch(streams[0], get_the_answer, &answer);
@@ -259,7 +269,6 @@ TEST_F(HexagonThreadManagerTest, thread_order) {
 }
 
 TEST_F(HexagonThreadManagerTest, thread_order_signal_wait) {
-  GTEST_SKIP() << "Skipping due to: https://github.com/apache/tvm/issues/13169";
   std::vector<int> arr;
 
   htm->Wait(streams[1], 1);

From e5c7c9371e772a6772f5186380fdfcf7133ad397 Mon Sep 17 00:00:00 2001
From: "Ehsan M. Kermani" <6980212+ehsanmok@users.noreply.github.com>
Date: Sat, 22 Oct 2022 11:30:57 -0700
Subject: [PATCH 421/704] [TIR] Add the missing nearbyint intrinsic (#13175)

During testing of this PR #13163, it turns out that the `nearbyint` intrinsic is missing despite being supported.

Co-authored-by: Ehsan M. Kermani <ehsanmok@users.noreply.github.com>
---
 src/target/intrin_rule.cc                     | 3 +++
 src/target/llvm/intrin_rule_nvptx.cc          | 3 +++
 src/target/llvm/intrin_rule_rocm.cc           | 3 +++
 src/target/source/intrin_rule_aocl.cc         | 6 ++++++
 src/target/source/intrin_rule_cuda.cc         | 3 +++
 src/target/source/intrin_rule_metal.cc        | 3 +++
 src/target/source/intrin_rule_opencl.cc       | 3 +++
 src/target/source/intrin_rule_vhls.cc         | 3 +++
 src/target/spirv/intrin_rule_spirv.cc         | 6 ++++++
 tests/python/frontend/pytorch/test_forward.py | 5 +++--
 tests/python/relay/test_op_level5.py          | 5 +++--
 tests/python/topi/python/test_topi_image.py   | 5 +++--
 12 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/src/target/intrin_rule.cc b/src/target/intrin_rule.cc
index e697d9b60273..5042ae60cb0b 100644
--- a/src/target/intrin_rule.cc
+++ b/src/target/intrin_rule.cc
@@ -112,6 +112,9 @@ TVM_REGISTER_OP("tir.ceil")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("default.FLowerIntrinsic", DispatchPureExtern<FloatSuffix>);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("default.FLowerIntrinsic", DispatchPureExtern<FloatSuffix>);
+
 TVM_REGISTER_OP("tir.pow").set_attr<FLowerIntrinsic>("default.FLowerIntrinsic",
                                                      DispatchPureExtern<FloatSuffix>);
 
diff --git a/src/target/llvm/intrin_rule_nvptx.cc b/src/target/llvm/intrin_rule_nvptx.cc
index 0ee01a63c042..e7be40fb9041 100644
--- a/src/target/llvm/intrin_rule_nvptx.cc
+++ b/src/target/llvm/intrin_rule_nvptx.cc
@@ -68,6 +68,9 @@ TVM_REGISTER_OP("tir.ceil")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("nvptx.FLowerIntrinsic", DispatchPureExternLibDevice);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("nvptx.FLowerIntrinsic", DispatchPureExternLibDevice);
+
 TVM_REGISTER_OP("tir.trunc")
     .set_attr<FLowerIntrinsic>("nvptx.FLowerIntrinsic", DispatchPureExternLibDevice);
 
diff --git a/src/target/llvm/intrin_rule_rocm.cc b/src/target/llvm/intrin_rule_rocm.cc
index 072686868a81..86ac3d351614 100644
--- a/src/target/llvm/intrin_rule_rocm.cc
+++ b/src/target/llvm/intrin_rule_rocm.cc
@@ -119,6 +119,9 @@ TVM_REGISTER_OP("tir.ceil")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("rocm.FLowerIntrinsic", DispatchPureExternOCML);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("rocm.FLowerIntrinsic", DispatchPureExternOCML);
+
 TVM_REGISTER_OP("tir.trunc")
     .set_attr<FLowerIntrinsic>("rocm.FLowerIntrinsic", DispatchPureExternOCML);
 
diff --git a/src/target/source/intrin_rule_aocl.cc b/src/target/source/intrin_rule_aocl.cc
index 09fc087ca252..599e62f3f31c 100644
--- a/src/target/source/intrin_rule_aocl.cc
+++ b/src/target/source/intrin_rule_aocl.cc
@@ -45,6 +45,9 @@ TVM_REGISTER_OP("tir.fabs")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
+
 TVM_REGISTER_OP("tir.exp").set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic",
                                                      DispatchPureExtern<Direct>);
 
@@ -78,6 +81,9 @@ TVM_REGISTER_OP("tir.fabs")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
+
 TVM_REGISTER_OP("tir.exp").set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic",
                                                      DispatchPureExtern<Direct>);
 
diff --git a/src/target/source/intrin_rule_cuda.cc b/src/target/source/intrin_rule_cuda.cc
index a450b44b596b..95fbf7f1a513 100644
--- a/src/target/source/intrin_rule_cuda.cc
+++ b/src/target/source/intrin_rule_cuda.cc
@@ -148,6 +148,9 @@ TVM_REGISTER_OP("tir.fabs")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
+
 TVM_REGISTER_OP("tir.exp").set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic",
                                                      DispatchPureExtern<CUDAFastMath>);
 
diff --git a/src/target/source/intrin_rule_metal.cc b/src/target/source/intrin_rule_metal.cc
index 74dbd5479b6e..7d7a5fb29a7c 100644
--- a/src/target/source/intrin_rule_metal.cc
+++ b/src/target/source/intrin_rule_metal.cc
@@ -46,6 +46,9 @@ TVM_REGISTER_OP("tir.fabs")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("metal.FLowerIntrinsic", DispatchPureExtern<Direct>);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("metal.FLowerIntrinsic", DispatchPureExtern<Direct>);
+
 TVM_REGISTER_OP("tir.exp").set_attr<FLowerIntrinsic>("metal.FLowerIntrinsic",
                                                      DispatchPureExtern<Direct>);
 
diff --git a/src/target/source/intrin_rule_opencl.cc b/src/target/source/intrin_rule_opencl.cc
index 64a50c3c84b1..94ab9d8b9d9c 100644
--- a/src/target/source/intrin_rule_opencl.cc
+++ b/src/target/source/intrin_rule_opencl.cc
@@ -46,6 +46,9 @@ TVM_REGISTER_OP("tir.fabs")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("opencl.FLowerIntrinsic", DispatchPureExtern<Direct>);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("opencl.FLowerIntrinsic", DispatchPureExtern<Direct>);
+
 TVM_REGISTER_OP("tir.exp").set_attr<FLowerIntrinsic>("opencl.FLowerIntrinsic",
                                                      DispatchPureExtern<Direct>);
 
diff --git a/src/target/source/intrin_rule_vhls.cc b/src/target/source/intrin_rule_vhls.cc
index 57be8ae17a57..7bfd7cd13659 100644
--- a/src/target/source/intrin_rule_vhls.cc
+++ b/src/target/source/intrin_rule_vhls.cc
@@ -45,6 +45,9 @@ TVM_REGISTER_OP("tir.fabs")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
+
 TVM_REGISTER_OP("tir.exp").set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic",
                                                      DispatchPureExtern<Direct>);
 
diff --git a/src/target/spirv/intrin_rule_spirv.cc b/src/target/spirv/intrin_rule_spirv.cc
index eca7c4ce1700..0c65f1718a5d 100644
--- a/src/target/spirv/intrin_rule_spirv.cc
+++ b/src/target/spirv/intrin_rule_spirv.cc
@@ -68,6 +68,9 @@ TVM_REGISTER_OP("tir.ceil")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("vulkan.FLowerIntrinsic", DispatchGLSLPureIntrin<GLSLstd450Round>);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("vulkan.FLowerIntrinsic", DispatchGLSLPureIntrin<GLSLstd450Round>);
+
 TVM_REGISTER_OP("tir.trunc")
     .set_attr<FLowerIntrinsic>("vulkan.FLowerIntrinsic", DispatchGLSLPureIntrin<GLSLstd450Trunc>);
 
@@ -108,6 +111,9 @@ TVM_REGISTER_OP("tir.ceil")
 TVM_REGISTER_OP("tir.round")
     .set_attr<FLowerIntrinsic>("webgpu.FLowerIntrinsic", DispatchGLSLPureIntrin<GLSLstd450Round>);
 
+TVM_REGISTER_OP("tir.nearbyint")
+    .set_attr<FLowerIntrinsic>("webgpu.FLowerIntrinsic", DispatchGLSLPureIntrin<GLSLstd450Round>);
+
 TVM_REGISTER_OP("tir.trunc")
     .set_attr<FLowerIntrinsic>("webgpu.FLowerIntrinsic", DispatchGLSLPureIntrin<GLSLstd450Trunc>);
 
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 3c8bd5efd80d..8045635127bb 100755
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -4847,8 +4847,9 @@ def forward(self, x, y):
 
     data_2D = torch.rand([4, 4, 8, 8]).float()
     grid_2D = torch.rand([4, 16, 16, 2]).float()
-    data_3D = torch.rand([4, 4, 8, 8, 8]).float()
-    grid_3D = torch.rand([4, 16, 16, 16, 3]).float()
+    # choosing smaller sizes to be testable on weaker GPUs
+    data_3D = torch.rand([4, 4, 4, 4, 4]).float()
+    grid_3D = torch.rand([4, 8, 8, 8, 3]).float()
 
     for _method in methods:
         for _padding in padding_modes:
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 8b5f849c3db7..10d0ea0d6d26 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -1419,8 +1419,9 @@ def verify_grid_sample(
 
     data_2D_shape = (4, 4, 8, 8)
     grid_2D_shape = (4, 2, 16, 16)
-    data_3D_shape = (4, 4, 8, 8, 8)
-    grid_3D_shape = (4, 3, 16, 16, 16)
+    # choosing smaller sizes to be testable on weaker GPUs
+    data_3D_shape = (4, 4, 4, 4, 4)
+    grid_3D_shape = (4, 3, 8, 8, 8)
 
     for _method in methods:
         for _padding in padding_modes:
diff --git a/tests/python/topi/python/test_topi_image.py b/tests/python/topi/python/test_topi_image.py
index be53fc603a77..56f7a2026d33 100644
--- a/tests/python/topi/python/test_topi_image.py
+++ b/tests/python/topi/python/test_topi_image.py
@@ -323,8 +323,9 @@ def check_target(target, dev):
     data_2D_shape = (4, 4, 8, 8)
     grid_2D_shape = (4, 2, 16, 16)
     layout_2D = "NCHW"
-    data_3D_shape = (4, 4, 8, 8, 8)
-    grid_3D_shape = (4, 3, 16, 16, 16)
+    # choosing smaller sizes to be testable on weaker GPUs
+    data_3D_shape = (4, 4, 4, 4, 4)
+    grid_3D_shape = (4, 3, 8, 8, 8)
     layout_3D = "NCDHW"
 
     for _method in methods:

From 24e89befd6c2f45fe47512de5d1445d8ea809a98 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Sun, 23 Oct 2022 11:58:31 -0700
Subject: [PATCH 422/704] [TVMScript] IRModule parser (#13176)

This PR introduces parser for IRModule as part of #12442

Co-authored-by: yongwww <yongcale@gmail.com>
---
 python/tvm/script/_parser/__init__.py         |  3 +-
 python/tvm/script/_parser/_core.py            |  5 +-
 python/tvm/script/_parser/ir/__init__.py      | 22 ++++++
 python/tvm/script/_parser/ir/entry.py         | 74 +++++++++++++++++++
 python/tvm/script/_parser/ir/parser.py        | 66 +++++++++++++++++
 .../unittest/test_tvmscript_parser_ir.py      | 35 +++++++++
 6 files changed, 203 insertions(+), 2 deletions(-)
 create mode 100644 python/tvm/script/_parser/ir/__init__.py
 create mode 100644 python/tvm/script/_parser/ir/entry.py
 create mode 100644 python/tvm/script/_parser/ir/parser.py
 create mode 100644 tests/python/unittest/test_tvmscript_parser_ir.py

diff --git a/python/tvm/script/_parser/__init__.py b/python/tvm/script/_parser/__init__.py
index d885b405257b..fd4e45818c20 100644
--- a/python/tvm/script/_parser/__init__.py
+++ b/python/tvm/script/_parser/__init__.py
@@ -15,4 +15,5 @@
 # specific language governing permissions and limitations
 # under the Licens.
 """The parser"""
-from . import _core
+from . import _core, ir
+from .ir import ir_module
diff --git a/python/tvm/script/_parser/_core.py b/python/tvm/script/_parser/_core.py
index a2dcc5b531dc..4f5411dc368f 100644
--- a/python/tvm/script/_parser/_core.py
+++ b/python/tvm/script/_parser/_core.py
@@ -16,4 +16,7 @@
 # under the Licens.
 """The core parser infra"""
 # pylint: disable=unused-import
-from .core import doc, utils
+from .core import dispatch, doc, utils
+from .core.dispatch import OpMethod, register_op
+from .core.entry import parse
+from .core.parser import Parser
diff --git a/python/tvm/script/_parser/ir/__init__.py b/python/tvm/script/_parser/ir/__init__.py
new file mode 100644
index 000000000000..b15468d37a7e
--- /dev/null
+++ b/python/tvm/script/_parser/ir/__init__.py
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The ir module parser"""
+
+from . import parser as _parser
+from .entry import ir_module, is_defined_in_class
+
+__all__ = ["ir_module", "is_defined_in_class"]
diff --git a/python/tvm/script/_parser/ir/entry.py b/python/tvm/script/_parser/ir/entry.py
new file mode 100644
index 000000000000..e8bc8b702db0
--- /dev/null
+++ b/python/tvm/script/_parser/ir/entry.py
@@ -0,0 +1,74 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The entry point of TVM parser for ir module."""
+
+import inspect
+from typing import List, Type
+from types import FrameType
+
+from tvm.ir import IRModule
+
+from .._core import parse, utils
+
+
+def is_defined_in_class(frames: List[FrameType]) -> bool:
+    """Check whether a object is defined in a class scope.
+
+    Parameters
+    ----------
+    frames : List[FrameType]
+        The frame stack of the object, obtained by `inspect.stack()`.
+
+    Returns
+    -------
+    res : bool
+        The result if the object is defined in a class scope.
+    """
+    if len(frames) > 2:
+        maybe_class_frame = frames[2]
+        statement_list = maybe_class_frame[4]
+        if statement_list is None:
+            return False
+        first_statement = statement_list[0]
+        line = first_statement.strip()
+        if line.startswith("class "):
+            return True
+        if line.startswith("@") and "ir_module" in line:
+            return True
+    return False
+
+
+def ir_module(mod: Type) -> IRModule:
+    """The parsing method for ir module, by using `@ir_module` as decorator.
+
+    Parameters
+    ----------
+    mod : Type
+        The class to be parsed as ir module.
+
+    Returns
+    -------
+    irmodule : IRModule
+        The parsed ir module.
+    """
+    if not inspect.isclass(mod):
+        raise TypeError(f"Expect a class, but got: {mod}")
+
+    return parse(mod, utils.inspect_class_capture(mod))
+
+
+setattr(ir_module, "dispatch_token", "ir")
diff --git a/python/tvm/script/_parser/ir/parser.py b/python/tvm/script/_parser/ir/parser.py
new file mode 100644
index 000000000000..9532e7e32c00
--- /dev/null
+++ b/python/tvm/script/_parser/ir/parser.py
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The base parser for ir module"""
+
+from ...ir_builder import ir as I
+from .._core import Parser, dispatch, doc
+
+
+@dispatch.register(token="ir", type_name="ClassDef")
+def _visit_class_def(self: Parser, node: doc.ClassDef) -> None:
+    """The class definition visiting method for ir module.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.ClassDef
+        The doc AST class definition node.
+    """
+    with self.var_table.with_frame():
+        with I.ir_module():
+            with self.with_dispatch_token("ir"):
+                self.visit_body(node.body)
+
+
+@dispatch.register(token="ir", type_name="Assign")
+def _visit_assign(_self: Parser, _node: doc.Assign) -> None:
+    """The assign visiting method for ir module.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.ClassDef
+        The doc AST assign node.
+    """
+
+
+@dispatch.register(token="ir", type_name="Expr")
+def _visit_expr(_self: Parser, _node: doc.Expr) -> None:
+    """The expression visiting method for ir module.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.ClassDef
+        The doc AST expression node.
+    """
diff --git a/tests/python/unittest/test_tvmscript_parser_ir.py b/tests/python/unittest/test_tvmscript_parser_ir.py
new file mode 100644
index 000000000000..b235d85bb457
--- /dev/null
+++ b/tests/python/unittest/test_tvmscript_parser_ir.py
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Unittests for tvm.script.parser.ir"""
+
+import pytest
+import inspect
+import tvm.testing
+from tvm.script._parser import ir_module
+from tvm.ir import IRModule
+
+
+def test_ir_base():
+    @ir_module
+    class BlankIRModule:
+        pass
+
+    assert isinstance(BlankIRModule, IRModule) and len(BlankIRModule.functions.items()) == 0
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 3131cdc56a75a3730024e0e04967e19f9e99fe58 Mon Sep 17 00:00:00 2001
From: Christopher Sidebottom <chris.sidebottom@arm.com>
Date: Mon, 24 Oct 2022 12:56:08 +0100
Subject: [PATCH 423/704] [Target] Replace utility functions with
 target.features (#12455)

Following on from #12454 this patch removes the utility functions in favour of the centralised `target.features` property.
---
 python/tvm/relay/op/strategy/arm_cpu.py       | 11 ++--
 python/tvm/relay/qnn/op/legalizations.py      | 21 +------
 python/tvm/topi/arm_cpu/arm_utils.py          | 58 ++-----------------
 python/tvm/topi/arm_cpu/conv2d_gemm.py        | 23 ++++----
 python/tvm/topi/arm_cpu/conv2d_int8.py        | 12 ++--
 python/tvm/topi/arm_cpu/depthwise_conv2d.py   |  5 +-
 src/target/parsers/cpu.cc                     |  5 ++
 .../test_arm_compute_lib/test_network.py      | 25 ++++++--
 tests/python/relay/test_op_level2.py          |  4 +-
 tests/python/target/test_arm_target.py        | 37 +++++++-----
 .../topi/python/test_topi_conv2d_int8.py      |  5 +-
 11 files changed, 88 insertions(+), 118 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index f04438e675a3..b7650480d0e4 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -207,21 +207,21 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                     name="conv2d_nhwc_dsp.arm_cpu",
                 )
             elif kernel_layout == "HWIO":
-                is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm()
-                has_dot_prod = topi.arm_cpu.arm_utils.is_dotprod_available()
+                has_asimd = target.features.has_asimd
+                has_dot_prod = target.features.has_dotprod
                 if has_dot_prod and data.dtype in ["int8", "uint8"]:
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_native),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native),
                         name="conv2d_NHWC_quantized_native.arm_cpu",
                     )
-                if is_aarch64 and data.dtype in ["int8", "uint8"]:
+                if has_asimd and data.dtype in ["int8", "uint8"]:
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved),
                         name="conv2d_NHWC_quantized_interleaved.arm_cpu",
                     )
-                if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]):
+                if (not has_asimd) or (data.dtype not in ["int8", "uint8"]):
                     # TODO(@giuseros)
                     # This strategy errors out for quantized data types when tuning.
                     # Let's use this only for non-aarch64 or non-quantized cases
@@ -283,8 +283,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 )
         elif layout == "NHWC":
             assert kernel_layout == "HWOI"
-            is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm()
-            if is_aarch64 or "+neon" in target.mattr:
+            if target.features.has_asimd:
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.compute_depthwise_conv2d_nhwc),
                     wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc),
diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py
index 9bc6efdad00f..ad016bc20089 100644
--- a/python/tvm/relay/qnn/op/legalizations.py
+++ b/python/tvm/relay/qnn/op/legalizations.py
@@ -405,18 +405,6 @@ def is_fast_int8_on_intel():
     return target_has_sse42(target.mcpu)
 
 
-def is_fast_int8_on_arm():
-    """Checks whether the hardware has support for fast Int8 arithmetic operations."""
-    target = tvm.target.Target.current(allow_none=False)
-    return "+v8.2a" in target.mattr and "+dotprod" in target.mattr
-
-
-def is_aarch64_arm():
-    """Checks whether we are compiling for an AArch64 target."""
-    target = tvm.target.Target.current(allow_none=False)
-    return "aarch64" in target.attrs.get("mtriple", "")
-
-
 ########################
 # ARM CPU legalizations.
 ########################
@@ -425,7 +413,6 @@ def is_aarch64_arm():
 @qnn_conv2d_legalize.register("arm_cpu")
 def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types):
     target = tvm.target.Target.current(allow_none=False)
-    has_asimd = is_aarch64_arm() or "+neon" in target.mattr
     is_depthwise = relay.op.strategy.is_depthwise_conv2d(
         types[0].shape,
         attrs["data_layout"],
@@ -434,9 +421,8 @@ def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types):
         attrs["groups"],
     )
     use_int8_on_arm = (not is_depthwise) and attrs["data_layout"] == "NHWC"
-    has_dotprod = is_fast_int8_on_arm()
-    other_options = use_int8_on_arm or has_dotprod
-    if has_asimd and not other_options:
+    other_options = use_int8_on_arm or target.features.has_dotprod
+    if target.features.has_asimd and not other_options:
         return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d)
     # ARM prefers the dtypes to be same.
     return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d)
@@ -445,8 +431,7 @@ def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types):
 @qnn_dense_legalize.register("arm_cpu")
 def _qnn_dense_legalize_arm_cpu(attrs, inputs, types):
     target = tvm.target.Target.current(allow_none=False)
-    has_asimd = is_aarch64_arm() or "+neon" in target.mattr
-    if has_asimd and not is_fast_int8_on_arm():
+    if target.features.has_asimd and not target.features.has_dotprod:
         return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.dense)
     # ARM prefers the dtypes to be same.
     return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense)
diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py
index 4ab72178b30b..1b2efc61ea56 100644
--- a/python/tvm/topi/arm_cpu/arm_utils.py
+++ b/python/tvm/topi/arm_cpu/arm_utils.py
@@ -17,57 +17,7 @@
 # pylint: disable=invalid-name,unused-variable,unused-argument,no-member
 """Arm target utility functions"""
 
-import re
-import tvm
-
-
-def get_arch_version(target_mattr):
-    """Parse the LLVM target -mattr, and return
-    the architecture version in a decimal representation
-    (e.g., if -mattr=v8.4a, return 8.4)
-    """
-
-    arch_version = 8.0
-    m = re.compile(r"\+v(.*)\.(.*)a")
-    for attr in target_mattr:
-        match_obj = m.match(attr)
-        if match_obj:
-            major = int(match_obj.group(1))
-            minor = int(match_obj.group(2))
-            decimal = 10
-            if minor >= 10:
-                decimal = 100
-            arch_version = major + float(minor) / decimal
-
-    return arch_version
-
-
-def is_dotprod_available():
-    """Checks whether the hardware has support for udot/sdot instructions."""
-    target = tvm.target.Target.current(allow_none=False)
-    arch_version = get_arch_version(target.mattr)
-    return arch_version >= 8.4 or ((arch_version in (8.2, 8.3)) and "+dotprod" in target.mattr)
-
-
-def is_mmla_available():
-    """Checks whether the hardware has support for ummla/smmla instructions."""
-    target = tvm.target.Target.current(allow_none=False)
-    arch_version = get_arch_version(target.mattr)
-    return arch_version >= 8.6 or (
-        (arch_version in (8.2, 8.3, 8.4, 8.5)) and "+i8mm" in target.mattr
-    )
-
-
-def is_aarch64_arm():
-    """Checks whether we are compiling for an AArch64 target."""
-    target = tvm.target.Target.current(allow_none=False)
-    return "aarch64" in target.attrs.get("mtriple", "")
-
-
-def is_neon_available():
-    """Check if neon instructions are available"""
-    target = tvm.target.Target.current(allow_none=False)
-    return "+neon" in target.mattr
+from tvm.target import Target
 
 
 def get_tiling_B_interleaved_t(interleave_A):
@@ -94,13 +44,15 @@ def get_tiling_B_interleaved_t(interleave_A):
     tile_rows_B: the output tile rows of B'
     tile_cols_B: the output tile columns of B'
     """
-    if is_mmla_available():
+    target = Target.current(allow_none=False)
+
+    if target.features.has_matmul_i8:
         # If smmla/ummla is available,  A must be interleaved.
         # Each load from B' will contain 8 elements
         # and we are loading 12 rows of B' (i.e., 12 columns of B)
         tile_rows_B = 12
         tile_cols_B = 8
-    elif is_dotprod_available():
+    elif target.features.has_dotprod:
         # The number of tile rows of B' vary depending on the
         # strategy:
         # * If we are interleaving A, then we select 12 columns from B'(i.e.,
diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py
index 8e416be8daa2..04748a4d81fb 100644
--- a/python/tvm/topi/arm_cpu/conv2d_gemm.py
+++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py
@@ -18,6 +18,7 @@
 # pylint: disable=unused-argument, redefined-builtin
 """GEMM Convolution schedule on ARM"""
 import tvm
+from tvm.target import Target
 from tvm import te
 from tvm.topi import nn
 from tvm.autotvm.task.space import AnnotateEntity, ReorderEntity, OtherOptionEntity
@@ -29,10 +30,9 @@
     gemm_acc_nx16_int8_int8_int32,
     gemm_acc_2x2_int8_int8_int32,
 )
-from .arm_utils import is_aarch64_arm, is_dotprod_available, is_mmla_available
 
 
-def configure_knobs(cfg, M, K):
+def configure_knobs(cfg, M, K, target):
     """Configure auto-tuning knobs for the interleaved strategy"""
 
     x, y = cfg.axis(M // 4), cfg.axis(K // 16)
@@ -48,7 +48,7 @@ def configure_knobs(cfg, M, K):
         cfg["reorder_gemm"] = ReorderEntity([0, 1])
         cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"])
 
-    if not is_dotprod_available():
+    if not target.features.has_dotprod:
         cfg.define_knob("gemm_quantized_unroll", [True, False])
         if cfg.is_fallback:
             cfg["gemm_quantized_unroll"] = OtherOptionEntity(False)
@@ -133,12 +133,13 @@ def compute_conv2d_gemm_without_weight_transform(
     # - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h
     # In order to have more information
     #
-    if is_mmla_available():
+    target = Target.current(allow_none=False)
+    if target.features.has_matmul_i8:
         # If smmla/ummla is enabled, we are loading 8 rows from A. Each row
         # will contain 8 elements
         tile_rows_A = 8
         tile_cols_A = 8
-    elif is_dotprod_available() and interleave_A:
+    elif target.features.has_dotprod and interleave_A:
         # If dot product has been enabled, and we are interleaving A
         # tile size should be 8x4
         tile_rows_A = 8
@@ -173,7 +174,7 @@ def compute_conv2d_gemm_without_weight_transform(
 
     if interleave_A:
         # Configuration space
-        configure_knobs(cfg, M_padded, K_padded)
+        configure_knobs(cfg, M_padded, K_padded, target)
 
         # Pack the input data
         A_interleaved = te.compute(
@@ -181,7 +182,8 @@ def compute_conv2d_gemm_without_weight_transform(
             lambda b, x, y, z, w: A[b, z + tile_rows_A * x, w + tile_cols_A * y],
             name="A_interleaved",
         )
-        if is_mmla_available():
+        target = Target.current(allow_none=False)
+        if target.features.has_matmul_i8:
             # Execute GEMM. In the case of mmla, we need to enforce the tiling
             # from the compute. This is because mmla is doing a tiled computation
             # as well. So we have a big 8x12 tile, with small 2x2 sub-tiles
@@ -323,7 +325,8 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out):
     k = C_interleaved.op.reduce_axis[0]
     _, M, N = C.shape
     if in_type in ["int8", "uint8"]:
-        if is_mmla_available():
+        target = Target.current(allow_none=False)
+        if target.features.has_matmul_i8:
             gemm_acc = gemm_acc_2x2_int8_int8_int32(in_type)
             xi_inner, yi_inner = C_interleaved.op.axis[-2:]
             k_outer, k_inner = s[C_interleaved].split(k, 8)
@@ -333,7 +336,7 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out):
             s[C_interleaved].tensorize(xi_inner, gemm_acc)
             s[C_interleaved].unroll(xi)
             s[C_interleaved].unroll(yi)
-        elif is_dotprod_available():
+        elif target.features.has_dotprod:
             gemm_acc = gemm_acc_4x4_int8_int8_int32(in_type)
             xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile(
                 xi, yi, x_factor=8, y_factor=4
@@ -354,7 +357,7 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out):
             s[C_interleaved].tensorize(xi_inner_inner, gemm_acc)
             s[C_interleaved].unroll(xi_inner_outer)
 
-        elif is_aarch64_arm():
+        elif target.features.has_asimd:
             s[C_interleaved].reorder(yi, xi)
             K = A_interleaved_input.shape[2]
             assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported"
diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py
index 224d21b34d9a..df231c0bc083 100644
--- a/python/tvm/topi/arm_cpu/conv2d_int8.py
+++ b/python/tvm/topi/arm_cpu/conv2d_int8.py
@@ -30,7 +30,7 @@
     schedule_conv2d_gemm_interleaved,
     schedule_conv2d_gemm_native,
 )
-from .arm_utils import get_tiling_B_interleaved_t, is_dotprod_available, is_neon_available
+from .arm_utils import get_tiling_B_interleaved_t
 
 
 def _get_default_config(cfg, data, kernel, strides, padding, dilation, out_dtype):
@@ -124,7 +124,10 @@ def is_int8_hw_support(data_dtype, kernel_dtype):
     is_llvm_support = llvm_version >= 8
 
     # 3) Check target
-    is_target_support = is_neon_available() or is_dotprod_available()
+    current_target = target.Target.current(allow_none=False)
+    is_target_support = bool(
+        current_target.features.has_asimd or current_target.features.has_dotprod
+    )
 
     return is_dtype_support and is_llvm_support and is_target_support
 
@@ -154,9 +157,10 @@ def _callback(op):
             _, _, kh, kw, _, _, n_elems = get_const_tuple(kernel_vec.shape)
             assert n_elems == 4
             dtype = "uint" if data.dtype == "uint8" else "int"
-            if is_dotprod_available():
+            current_target = target.Target.current(allow_none=False)
+            if current_target.features.has_dotprod:
                 intrin = dot_int8_int8_int32_neon_82(int32_lanes=4, dtype=dtype)
-            elif is_neon_available():
+            elif current_target.features.has_asimd:
                 assert dtype == "int", "uint8 not supported if dot product is not available"
                 intrin = dot_int8_int8_int32_neon()
             else:
diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
index 960c311d51ba..9284b9474513 100644
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -18,6 +18,7 @@
 """Depthwise convolution schedule for ARM CPU"""
 
 import tvm
+from tvm.target import Target
 from tvm import te
 from tvm import autotvm
 from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
@@ -26,7 +27,6 @@
 from ..utils import traverse_inline, get_const_tuple, get_const_int
 from ..nn.utils import get_pad_tuple
 from .tensor_intrin import smlal_int16_int32
-from .arm_utils import is_aarch64_arm
 from .mprofile.dsp.depthwise_conv2d import (
     depthwise_conv2d_nhwc_dsp_compute,
     depthwise_conv2d_nhwc_dsp_schedule,
@@ -333,12 +333,13 @@ def schedule_conv(conv):
         co, ci = cfg["tile_c"].apply(s, conv, c)
 
         split_val = cfg["tile_c"].size[-1]
+        target = Target.current(allow_none=False)
         use_tensorization = (
             (in_type == "int16")
             and (split_val == 8)
             and (IC % split_val == 0)
             and (channel_multiplier == 1)
-            and is_aarch64_arm()
+            and target.features.has_asimd
         )
 
         data_pad_value = -1
diff --git a/src/target/parsers/cpu.cc b/src/target/parsers/cpu.cc
index fbf55f468313..3cfabb7639df 100644
--- a/src/target/parsers/cpu.cc
+++ b/src/target/parsers/cpu.cc
@@ -20,6 +20,7 @@
 
 #include <string>
 
+#include "aprofile.h"
 #include "mprofile.h"
 
 namespace tvm {
@@ -32,6 +33,10 @@ TargetJSON ParseTarget(TargetJSON target) {
     return mprofile::ParseTarget(target);
   }
 
+  if (aprofile::IsArch(target)) {
+    return aprofile::ParseTarget(target);
+  }
+
   return target;
 }
 
diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py
index b5b9ed6b6ef9..3cf81e971f77 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_network.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_network.py
@@ -20,11 +20,9 @@
 
 import numpy as np
 import pytest
-from tvm import testing
 from tvm import relay
 
-from test_arm_compute_lib.infrastructure import skip_runtime_test, build_and_run, verify
-from test_arm_compute_lib.infrastructure import Device
+from test_arm_compute_lib.infrastructure import Device, skip_runtime_test, build_and_run, verify
 
 
 def _build_and_run_network(mod, params, inputs, device, tvm_ops, acl_partitions, atol, rtol):
@@ -108,7 +106,12 @@ def get_model():
         return mod, params, inputs
 
     _build_and_run_network(
-        *get_model(), device=device, tvm_ops=4, acl_partitions=21, atol=0.002, rtol=0.01
+        *get_model(),
+        device=device,
+        tvm_ops=4,
+        acl_partitions=21,
+        atol=0.002,
+        rtol=0.01,
     )
 
 
@@ -180,7 +183,12 @@ def get_model():
         return mod, params, inputs
 
     _build_and_run_network(
-        *get_model(), device=device, tvm_ops=3, acl_partitions=30, atol=9, rtol=0
+        *get_model(),
+        device=device,
+        tvm_ops=3,
+        acl_partitions=30,
+        atol=10,
+        rtol=0,
     )
 
 
@@ -207,7 +215,12 @@ def get_model():
         return mod, params, inputs
 
     _build_and_run_network(
-        *get_model(), device=device, tvm_ops=9, acl_partitions=31, atol=8, rtol=0
+        *get_model(),
+        device=device,
+        tvm_ops=9,
+        acl_partitions=31,
+        atol=8,
+        rtol=0,
     )
 
 
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 7efec2db03b9..ca1adf940029 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -2209,7 +2209,9 @@ def get_conv2d_nchw(
 
 @tvm.testing.requires_arm_dot
 def test_conv2d_int8_alter_dtype_arm():
-    _test_conv2d_int8_alter_dtype("uint8", "llvm --device arm_cpu -mattr=+v8.2a,+dotprod", "sdot")
+    _test_conv2d_int8_alter_dtype(
+        "uint8", "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod", "sdot"
+    )
 
 
 @tvm.testing.requires_cascadelake
diff --git a/tests/python/target/test_arm_target.py b/tests/python/target/test_arm_target.py
index 9106c169c869..dc8452710a8a 100644
--- a/tests/python/target/test_arm_target.py
+++ b/tests/python/target/test_arm_target.py
@@ -20,24 +20,31 @@
 from tvm.topi.arm_cpu.conv2d_int8 import is_int8_hw_support
 from tvm.target import codegen
 
-arm_target, input_dtype, kernel_dtype, is_supported = tvm.testing.parameters(
+llvm_version, arm_target, input_dtype, kernel_dtype, is_supported = tvm.testing.parameters(
     # Testing mcpu type
-    ("c -mcpu=cortex-m4 -keys=arm_cpu", "int8", "int8", False),
-    ("c -mcpu=cortex-m7 -keys=arm_cpu", "int8", "int8", False),
-    ("c -mcpu=cortex-m33 -keys=arm_cpu", "int8", "int8", False),
-    ("c -mcpu=cortex-m55 -keys=arm_cpu", "int8", "int8", False),
-    ("c -mcpu=cortex-m3 -keys=arm_cpu", "int8", "int8", False),
-    ("llvm -keys=arm_cpu -mattr=+neon", "int8", "int8", True),
-    # This fails because of a bug in topi.arm_cpu.arm_utils.get_arch_version
-    # ("llvm -keys=arm_cpu -mattr=v8.4a,+dotprod", "int8", "int8", True),
+    (8, "c -mcpu=cortex-m4", "int8", "int8", False),
+    (8, "c -mcpu=cortex-m7", "int8", "int8", False),
+    (8, "c -mcpu=cortex-m33", "int8", "int8", False),
+    (8, "c -mcpu=cortex-m55", "int8", "int8", False),
+    (8, "c -mcpu=cortex-m3", "int8", "int8", False),
+    (7, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", False),
+    (8, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", True),
+    (9, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", True),
+    (8, "llvm -mtriple=arm-linux-gnueabi", "int8", "int8", False),
+    (7, "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.4a,+dotprod", "int8", "int8", False),
+    (8, "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.4a,+dotprod", "int8", "int8", True),
+    (9, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", True),
+    (8, "llvm -mtriple=aarch64-linux-gnu", "int8", "int8", True),
     # Testing dtype
-    ("llvm -keys=arm_cpu -mattr=+neon", "int16", "int8", False),
-    ("llvm -keys=arm_cpu -mattr=+neon", "int8", "int16", False),
-    ("llvm -keys=arm_cpu -mattr=+neon", "int16", "int16", False),
+    (8, "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", "int16", "int8", False),
+    (8, "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", "int8", "int16", False),
+    (8, "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", "int16", "int16", False),
 )
 
 
-def test_arm_conv2d_int8_support(arm_target, input_dtype, kernel_dtype, is_supported):
+def test_arm_conv2d_int8_support(
+    monkeypatch, llvm_version, arm_target, input_dtype, kernel_dtype, is_supported
+):
     """Test ARM conv2d int8 support for different targets.
 
     Parameters
@@ -52,5 +59,5 @@ def test_arm_conv2d_int8_support(arm_target, input_dtype, kernel_dtype, is_suppo
         Expected result.
     """
     with tvm.target.Target(arm_target):
-        expected_result = is_supported and (codegen.llvm_version_major() >= 8)
-        assert is_int8_hw_support(input_dtype, kernel_dtype) == expected_result
+        monkeypatch.setattr(codegen, "llvm_version_major", lambda: llvm_version)
+        assert is_int8_hw_support(input_dtype, kernel_dtype) == is_supported
diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py
index 6070cafa9c2c..c84f39ab5a66 100644
--- a/tests/python/topi/python/test_topi_conv2d_int8.py
+++ b/tests/python/topi/python/test_topi_conv2d_int8.py
@@ -26,7 +26,6 @@
 from tvm.contrib.pickle_memoize import memoize
 from tvm.topi.nn.utils import get_pad_tuple
 from tvm.topi.utils import get_const_tuple
-from tvm.topi.arm_cpu.conv2d_gemm import is_aarch64_arm
 from tvm.topi.nn.conv2d import _get_workload
 from tvm.topi.generic.conv2d import fallback_schedule_cpu_common_int8
 
@@ -94,8 +93,8 @@ def compile_conv2d_NHWC_gemm_int8_arm(
             print("Skip because %s is not enabled" % target)
             return
         print("Compiling on arm AArch64 target: %s" % target)
-        with tvm.target.Target(target):
-            assert is_aarch64_arm(), "AArch64 target not recognized"
+        with tvm.target.Target(target) as tvm_target:
+            assert tvm_target.features.is_aarch64, "AArch64 target not recognized"
 
             C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
             if add_bias:

From 7827fffe5c98fef213706bc296b97d96bb9f5019 Mon Sep 17 00:00:00 2001
From: "Ehsan M. Kermani" <6980212+ehsanmok@users.noreply.github.com>
Date: Mon, 24 Oct 2022 09:55:00 -0700
Subject: [PATCH 424/704] [Relay][Frontend][ONNX] Add GridSample operator
 (#13163)

* [Relay][Frontend][ONNX] Add GridSample operator

* Skip the flaky gridsample nearest test on cuda

* Remove the flaky skip case

Co-authored-by: Ehsan M. Kermani <ehsanmok@users.noreply.github.com>
---
 python/tvm/relay/frontend/onnx.py          | 18 ++++++++++++++++++
 python/tvm/relay/op/image/image.py         |  2 +-
 python/tvm/topi/image/grid_sample.py       | 18 +++++++++---------
 src/relay/op/image/grid_sample.cc          |  2 +-
 tests/python/frontend/onnx/test_forward.py | 12 ++++--------
 5 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index ff7d5655e0d3..743e39296c2e 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -4815,6 +4815,23 @@ def _impl_v14(cls, inputs, attr, params):
         return _op.trilu(data, k, upper)
 
 
+class GridSample(OnnxOpConverter):
+    """Operator converter for GridSample"""
+
+    @classmethod
+    def _impl_v16(cls, inputs, attr, params):
+        grid = inputs[1]
+        # onnx grid is of shape (N, H, W, 2) which should be transposed to (N, 2, H, W) for relay
+        grid = _op.transform.transpose(grid, axes=(0, 3, 1, 2))
+        method: str = attr.get("mode", b"bilinear").decode("utf-8")
+        padding_mode: str = attr.get("padding_mode", b"zeros").decode("utf-8")
+        # onnx default is 0 which should be changed to False in relay
+        align_corners = attr.get("align_corners", 0) != 0
+        return _op.image.grid_sample(
+            inputs[0], grid, method, padding_mode=padding_mode, align_corners=align_corners
+        )
+
+
 class RandomNormal(OnnxOpConverter):
     """Operator converter for random_normal"""
 
@@ -5494,6 +5511,7 @@ def _get_convert_map(opset):
         "Unique": Unique.get_converter(opset),
         "Einsum": Einsum.get_converter(opset),
         "Trilu": Trilu.get_converter(opset),
+        "GridSample": GridSample.get_converter(opset),
         # defs/control_flow
         "Loop": Loop.get_converter(opset),
         "If": If.get_converter(opset),
diff --git a/python/tvm/relay/op/image/image.py b/python/tvm/relay/op/image/image.py
index b5886300cbed..5a17532dd018 100644
--- a/python/tvm/relay/op/image/image.py
+++ b/python/tvm/relay/op/image/image.py
@@ -477,7 +477,7 @@ def grid_sample(
 
     The left-top corner (-1, -1) and right-bottom corner (1, 1) in grid will be map to
     (0, 0) and (h - 1, w - 1) of data if align_corners is "True", or
-    (-0.5, -0.5) and (h + 0.5, w + 0.5) of data if align_corners is "False".
+    (-0.5, -0.5) and (h - 0.5, w - 0.5) of data if align_corners is "False".
 
     The shape of the output will be
     4-D (data.shape[0], data.shape[1], grid.shape[2], grid.shape[3]), or
diff --git a/python/tvm/topi/image/grid_sample.py b/python/tvm/topi/image/grid_sample.py
index 705df8db7b54..711087022993 100644
--- a/python/tvm/topi/image/grid_sample.py
+++ b/python/tvm/topi/image/grid_sample.py
@@ -81,7 +81,7 @@ def _grid_sample_2d(
 
     The left-top corner (-1, -1) and right-bottom corner (1, 1) in grid will be map to
     (0, 0) and (h - 1, w - 1) of data if align_corners is "True", or
-    (-0.5, -0.5) and (h + 0.5, w + 0.5) of data if align_corners is "False".
+    (-0.5, -0.5) and (h - 0.5, w - 0.5) of data if align_corners is "False".
 
     The shape of the output will be (data.shape[0], data.shape[1], grid.shape[2], grid.shape[3]).
 
@@ -200,13 +200,13 @@ def _bilinear_sample(n, c, h, w):
 
     def _nearest_sample(n, c, h, w):
         y, x = _compute_source_index(n, h, w)
-        y_new = te.round(y).astype("int32")
-        x_new = te.round(x).astype("int32")
+        y_new = te.nearbyint(y).astype("int32")
+        x_new = te.nearbyint(x).astype("int32")
 
         return _get_pixel_value(n, c, y_new, x_new)
 
     def _bicubic_sample(n, c, h, w):
-        A = -0.75  # 0.75 is used in pytorch, it maybe different in other frameworks
+        A = -0.75  # -0.75 is used in pytorch, it maybe different in other frameworks
 
         def cubic_weight_1(fraction):
             return ((A + 2) * fraction - (A + 3)) * fraction * fraction + 1
@@ -310,7 +310,7 @@ def _grid_sample_3d(
 
     The left-top corner (-1, -1, -1) and right-bottom corner (1, 1, 1) in grid will be map to
     (0, 0, 0) and (d - 1, h - 1, w - 1) of data if align_corners is "True", or
-    (-0.5, -0.5, -0.5) and (d + 0.5, h + 0.5, w + 0.5) of data if align_corners is "False".
+    (-0.5, -0.5, -0.5) and (d - 0.5, h - 0.5, w - 0.5) of data if align_corners is "False".
 
     The shape of the output will be
     (data.shape[0], data.shape[1], grid.shape[2], grid.shape[3], grid.shape[4]).
@@ -437,9 +437,9 @@ def _trilinear_sample(n, c, d, h, w):
 
     def _nearest_sample(n, c, d, h, w):
         z, y, x = _compute_source_index(n, d, h, w)
-        z_new = te.round(z).astype("int32")
-        y_new = te.round(y).astype("int32")
-        x_new = te.round(x).astype("int32")
+        z_new = te.nearbyint(z).astype("int32")
+        y_new = te.nearbyint(y).astype("int32")
+        x_new = te.nearbyint(x).astype("int32")
         return _get_pixel_value(n, c, z_new, y_new, x_new)
 
     if method == "bilinear":
@@ -474,7 +474,7 @@ def grid_sample(
 
     The left-top corner (-1, -1) and right-bottom corner (1, 1) in grid will be map to
     (0, 0) and (h - 1, w - 1) of data if align_corners is "True", or
-    (-0.5, -0.5) and (h + 0.5, w + 0.5) of data if align_corners is "False".
+    (-0.5, -0.5) and (h - 0.5, w - 0.5) of data if align_corners is "False".
 
     The shape of the output will be
     4-D (data.shape[0], data.shape[1], grid.shape[2], grid.shape[3]), or
diff --git a/src/relay/op/image/grid_sample.cc b/src/relay/op/image/grid_sample.cc
index 689a71ebc53b..46b9714aeeb8 100644
--- a/src/relay/op/image/grid_sample.cc
+++ b/src/relay/op/image/grid_sample.cc
@@ -177,7 +177,7 @@ inner pixel value if padding_mode is "reflection".
 
 The left-top corner (-1, -1) and right-bottom corner (1, 1) in grid will be map to
 (0, 0) and (h - 1, w - 1) of data if align_corners is "True", or
-(-0.5, -0.5) and (h + 0.5, w + 0.5) of data if align_corners is "False".
+(-0.5, -0.5) and (h - 0.5, w - 0.5) of data if align_corners is "False".
 
 The shape of the output will be
 4-D (data.shape[0], data.shape[1], grid.shape[2], grid.shape[3]), or
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 3a714af3a7a1..684a33fbceea 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -5274,14 +5274,6 @@ def verify_eyelike(indata, dynamic=False):
     "test_dropout_default_mask",
     "test_dropout_default_mask_ratio",
     "test_dropout_default_ratio",
-    "test_gridsample",
-    "test_gridsample_aligncorners_true",
-    "test_gridsample_bicubic",
-    "test_gridsample_bilinear",
-    "test_gridsample_border_padding",
-    "test_gridsample_nearest",
-    "test_gridsample_reflection_padding",
-    "test_gridsample_zeros_padding",
     "test_gru_batchwise",
     "test_hammingwindow",
     "test_hammingwindow_expanded",
@@ -5419,6 +5411,10 @@ def test_onnx_nodes(target, dev, onnx_test):
         # in accuracy depending on implementation
         atol = 1e-4
 
+    if "bicubic" in test_dir:
+        # satisfies onnx precision for bicubic interpolation
+        atol = 1e-4
+
     onnx_model = onnx.load(test_dir + "/model.onnx")
     inputs = []
     outputs = []

From 5400b942d2ef5ec7145f4f3e253b24c8abb4265e Mon Sep 17 00:00:00 2001
From: Alexey Gladyshev <wotpricol@mail.ru>
Date: Mon, 24 Oct 2022 20:00:27 +0300
Subject: [PATCH 425/704] [ONNX] Add converter for FastGelu from Microsoft
 onnxruntime contrib opset (#13119)

* add converter for FastGelu from Microsoft onnxruntime contrib opset

* integrate FastGelu into test system for ONNX converters

* code review fixes

* returned constant calculation
---
 python/tvm/relay/frontend/onnx.py          | 38 ++++++++++++-
 tests/python/frontend/onnx/test_forward.py | 62 ++++++++++++++--------
 2 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 743e39296c2e..ca290cf9a81c 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -928,10 +928,45 @@ def _impl_v1(cls, inputs, attr, params):
         return _op.multiply(term1, term2)
 
 
+class FastGelu(OnnxOpConverter):
+    """Operator converter for FastGelu from Microsoft onnxruntime contrib opset.
+
+    fast_gelu(x) = 0.5x(1 + tanh(sqrt(2/pi)(x + 0.044715x^3)))
+                 = 0.5x(1 + tanh((sqrt(2/pi)x + 0.044715(sqrt(2/pi)x^3)))
+                 = 0.5x(1 + tanh(c1 * x + c2 * x^3)))
+    , where
+        c1 = sqrt(2/pi)
+        c2 = 0.044715 * sqrt(2/pi)
+    """
+
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        x = inputs[0]
+        if inputs[1]:
+            bias = inputs[1]
+            bias_shape = infer_shape(bias)
+            assert len(bias_shape) == 1, "bias term must be a 1D tensor"
+            x += bias
+
+        # Declare consts
+        const_dtype = infer_type(x).checked_type.dtype
+        half = _expr.const(0.5, dtype=const_dtype)
+        one = _expr.const(1.0, dtype=const_dtype)
+        const1 = _expr.const(math.sqrt(2 / math.pi), dtype=const_dtype)
+        const2 = _expr.const(0.044715 * math.sqrt(2 / math.pi), dtype=const_dtype)
+
+        # Compute FastGelu
+        term1 = _op.multiply(half, x)
+        term2 = _op.multiply(const1, x)
+        term3 = _op.multiply(const2, _op.power(x, _expr.const(3, const_dtype)))
+        tanh = _op.tanh(_op.add(term2, term3))
+        return _op.multiply(term1, _op.add(one, tanh))
+
+
 class BiasGelu(OnnxOpConverter):
     """Operator converter for BiasGelu from Microsoft onnxruntime contrib opset.
 
-    bias_gelu(x, b) = 0.5(x, b)(1 + erf((x + b)/sqrt(2)))
+    bias_gelu(x, b) = 0.5(x + b)(1 + erf((x + b)/sqrt(2)))
     """
 
     @classmethod
@@ -5384,6 +5419,7 @@ def _get_convert_map(opset):
         "Selu": Selu.get_converter(opset),
         "Elu": Elu.get_converter(opset),
         "Gelu": Gelu.get_converter(opset),
+        "FastGelu": FastGelu.get_converter(opset),
         "BiasGelu": BiasGelu.get_converter(opset),
         "LayerNormalization": LayerNormalization.get_converter(opset),
         # TODO: We need a better way to handle different domains, in case
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 684a33fbceea..5eac9a8dd4d6 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -5614,13 +5614,18 @@ def verify_reverse_sequence(x, sequence_lens, batch_axis, time_axis):
     verify_reverse_sequence(x, sequence_lens, 1, 0)
 
 
+@pytest.mark.parametrize("op_name", ["Gelu", "FastGelu"], scope="session")
+@pytest.mark.parametrize("data_type", ["float16", "float32"], scope="session")
 @tvm.testing.parametrize_targets
-def test_gelu(target, dev):
+def test_gelu(target, dev, data_type, op_name):
     """test_gelu"""
+    dtype = np.dtype(data_type)
+    tensor_type = mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    absolute_tolerance = 1e-3 if data_type == "float16" else 1e-5
 
     def verify_gelu(x):
         node = onnx.helper.make_node(
-            "Gelu",
+            op_name,
             inputs=["x"],
             outputs=["y"],
             domain="com.microsoft",
@@ -5628,27 +5633,34 @@ def verify_gelu(x):
 
         graph = helper.make_graph(
             [node],
-            "gelu_test",
-            inputs=[helper.make_tensor_value_info("x", TensorProto.FLOAT, list(x.shape))],
-            outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(x.shape))],
+            f"{op_name}_test",
+            inputs=[helper.make_tensor_value_info("x", tensor_type, list(x.shape))],
+            outputs=[helper.make_tensor_value_info("y", tensor_type, list(x.shape))],
         )
 
-        model = helper.make_model(graph, producer_name="gelu_test")
-        verify_with_ort_with_inputs(model, [x], [x.shape], target=target, dev=dev)
+        model = helper.make_model(graph, producer_name=f"{op_name}_test")
+        verify_with_ort_with_inputs(
+            model, [x], [x.shape], atol=absolute_tolerance, dtype=data_type, target=target, dev=dev
+        )
 
-    x = np.array([-1.0, 0, 1.0, 100.0, -100.0, 1000.0, -1000.0], dtype=np.float32)
+    x = np.array([-1.0, 0, 1.0, 100.0, -100.0, 1000.0, -1000.0], dtype=dtype)
     verify_gelu(x)
-    x = np.array([[1, 2], [3, 4]], dtype=np.float32)
+    x = np.array([[1, 2], [3, 4]], dtype=dtype)
     verify_gelu(x)
 
 
+@pytest.mark.parametrize("op_name", ["BiasGelu", "FastGelu"], scope="session")
+@pytest.mark.parametrize("data_type", ["float16", "float32"], scope="session")
 @tvm.testing.parametrize_targets
-def test_biasgelu(target, dev):
+def test_biasgelu(target, dev, data_type, op_name):
     """test_biasgelu"""
+    dtype = np.dtype(data_type)
+    tensor_type = mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    absolute_tolerance = 1e-3 if data_type == "float16" else 1e-5
 
     def verify_biasgelu(x, bias):
         node = onnx.helper.make_node(
-            "BiasGelu",
+            op_name,
             inputs=["x", "bias"],
             outputs=["y"],
             domain="com.microsoft",
@@ -5656,23 +5668,31 @@ def verify_biasgelu(x, bias):
 
         graph = helper.make_graph(
             [node],
-            "biasgelu_test",
+            f"{op_name}_test",
             inputs=[
-                helper.make_tensor_value_info("x", TensorProto.FLOAT, list(x.shape)),
-                helper.make_tensor_value_info("bias", TensorProto.FLOAT, list(bias.shape)),
+                helper.make_tensor_value_info("x", tensor_type, list(x.shape)),
+                helper.make_tensor_value_info("bias", tensor_type, list(bias.shape)),
             ],
-            outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(x.shape))],
+            outputs=[helper.make_tensor_value_info("y", tensor_type, list(x.shape))],
         )
 
-        model = helper.make_model(graph, producer_name="biasgelu_test")
-        verify_with_ort_with_inputs(model, [x, bias], [x.shape], target=target, dev=dev)
+        model = helper.make_model(graph, producer_name=f"{op_name}_test")
+        verify_with_ort_with_inputs(
+            model,
+            [x, bias],
+            [x.shape],
+            atol=absolute_tolerance,
+            dtype=data_type,
+            target=target,
+            dev=dev,
+        )
 
-    x = np.array([-1.0, 0, 1.0, 100.0, -100.0, 1000.0, -1000.0], dtype=np.float32)
-    bias = np.repeat(2.0, 7).astype("float32")
+    x = np.array([-1.0, 0, 1.0, 100.0, -100.0, 1000.0, -1000.0], dtype=dtype)
+    bias = np.repeat(2.0, 7).astype(dtype)
     verify_biasgelu(x, bias)
 
-    x = np.array([[1, 2], [3, 4]], dtype=np.float32)
-    bias = np.array([0.3, 4.0], dtype=np.float32)
+    x = np.array([[1, 2], [3, 4]], dtype=dtype)
+    bias = np.array([0.3, 4.0], dtype=dtype)
     verify_biasgelu(x, bias)
 
 
From 3e02ac5d2e46a6d17f56e9b3a452107b0d218060 Mon Sep 17 00:00:00 2001
From: Andrey Malyshev <elvin.nnov@gmail.com>
Date: Mon, 24 Oct 2022 22:23:55 +0300
Subject: [PATCH 426/704] [Adreno] Fix mem_scope annotations for prim funcs
 having several heads (#13153)

* [Adreno] Fix mem_scope annotations for prim funcs having several heads

2) fix of injective schedules for non blocking case

* Address PR comments

* fix lint

* Modify comment
---
 python/tvm/topi/adreno/utils.py               |  31 ++-
 .../transforms/annotate_texture_storage.cc    |  10 +-
 .../test_conv2d_nchw_texture.py               | 193 ++++++++++++++++++
 3 files changed, 217 insertions(+), 17 deletions(-)

diff --git a/python/tvm/topi/adreno/utils.py b/python/tvm/topi/adreno/utils.py
index de0505af03d4..1a1cc747faac 100644
--- a/python/tvm/topi/adreno/utils.py
+++ b/python/tvm/topi/adreno/utils.py
@@ -525,28 +525,27 @@ def bind_data_copy(stage, axis_to_vectorize=None):
         stage.bind(block, te.thread_axis("blockIdx.z"))
         stage.bind(thread, te.thread_axis("threadIdx.z"))
     else:
-        axes = stage.op.axis
-        fused = stage.fuse(*axes[:-1])
-        if shape[-1] <= 32:
+        if shape[-1] == 4:
+            axes = stage.op.axis
+            fused = stage.fuse(*axes[:-1])
             ftc = numpy.prod(shape[:-1])
             div = get_div(ftc, 64)
             block, thread = stage.split(fused, factor=div)
             stage.bind(block, te.thread_axis("blockIdx.x"))
             stage.bind(thread, te.thread_axis("threadIdx.x"))
-            if shape[-1] == 4:
-                stage.vectorize(axes[-1])
-        # 1024 is the maximum work group size for Adreno devices.
-        # See: CL_DEVICE_MAX_WORK_GROUP_SIZE
-        elif shape[-1] > 1024:
-            ftc = numpy.prod(shape[:-1])
-            div = get_div(ftc, 1024)
-            by, ty = stage.split(axes[-1], factor=div)
-            stage.bind(fused, te.thread_axis("blockIdx.x"))
-            stage.bind(by, te.thread_axis("blockIdx.y"))
-            stage.bind(ty, te.thread_axis("threadIdx.y"))
+            stage.vectorize(axes[-1])
         else:
-            stage.bind(fused, te.thread_axis("blockIdx.x"))
-            stage.bind(*axes[-1:], te.thread_axis("threadIdx.x"))
+            ftc = numpy.prod(shape)
+            vthread = get_div(ftc, 8)
+            fused = stage.fuse(*stage.op.axis)
+            ftc = ftc / vthread
+            # 1024 is a maximum work group size on the most Adreno GPU
+            num_thread = get_div(ftc, 1024 // vthread)
+            a, b = stage.split(fused, factor=num_thread)
+            a, c = stage.split(a, factor=vthread)
+            stage.bind(c, te.thread_axis("vthread"))
+            stage.bind(a, te.thread_axis("blockIdx.x"))
+            stage.bind(b, te.thread_axis("threadIdx.x"))
 
 
 def get_texture_storage(shape):
diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc
index 6904c6b5d7cc..277c5e1da424 100644
--- a/src/relay/transforms/annotate_texture_storage.cc
+++ b/src/relay/transforms/annotate_texture_storage.cc
@@ -206,7 +206,9 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
       }
     }
 
-    primitive_supports_texture_ = SupportsTextureStorage(call);
+    if (!primitive_supports_texture_) {
+      primitive_supports_texture_ = SupportsTextureStorage(call);
+    }
 
     for (auto& arg : call->args) {
       Visit(arg);
@@ -362,6 +364,12 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
 
   bool SupportsTextureStorage(const CallNode* call) const {
     bool supports_texture_storage = false;
+    // we need to verify only entry functions since one of entry op defines main schedule
+    for (const auto& arg : call->args) {
+      if (!arg.as<VarNode>()) {
+        return false;
+      }
+    }
     if (auto attrs = call->attrs.as<Conv2DAttrs>()) {
       if (attrs->data_layout == "NCHW4c" && attrs->kernel_layout == "OIHW4o") {
         supports_texture_storage = true;
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
index c73e411a700e..5198cbdf6bc6 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
@@ -1074,3 +1074,196 @@ def test_conv2d_winograd_non_rect(target, dtype):
     )
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+# function repeat, params scope are different in reused functions
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_injective_nwo_inputs1(target, dtype):
+    """
+    Use case for verification of stability of annotation primary functions
+    having several ops accepting data outside of Primary function
+    The visiting of ops during traversing of graph inside primary function
+    can depend on order of relay graph creation. Thus the annotation mechanism
+    should be reliable for graph traversal order
+    The current decision if Prim Function support textures or not depend on
+    *any* op accepting input of the function and if op support textures
+                                     Input
+                               /                   \
+                layout_transform (NCHW->NCHW4c)    |
+                         |                        /
+                      conv2d (1)                 /
+                         |                      /
+                      conv2d (2)       mean    /
+                  /         \                 /   <- Primary function several head ops
+             (1)add    (2)layout_transform    |
+                 |        (NCHW4c->NCHW)      |
+                 |           |      \        /
+                 |           |       (3) add
+                 |           |         |
+    layout_transform          \       /
+     (NCHW4c->NCHW)             \    /
+                 \                mul
+                  \            /
+                        add
+
+    This test verifies a case when the latest op which is visited is (3) and does not
+    support textures, but there is (1) supporting textures, thus the whole func will
+    support textures
+    """
+    input_shape = (1, 4, 40, 40)
+    filter_shape1 = (4, 4, 3, 3)
+    filter_shape2 = (4, 4, 3, 3)
+    filter_shape3 = (4, 4, 3, 3)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    mean = relay.mean(A, axis=1, keepdims=True)
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[1, 1, 1, 1],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=4,
+        kernel_size=(3, 3),
+    )
+
+    conv2 = relay.nn.conv2d(
+        conv1,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[1, 1, 1, 1],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=4,
+        kernel_size=(3, 3),
+    )
+
+    ad3 = relay.op.add(conv1, conv2)
+    ad1 = relay.op.add(mean, conv1)
+    ad2 = relay.op.multiply(ad1, conv2)
+    ad4 = relay.op.add(ad3, ad2)
+
+    mod = relay.Function([A, W1, W2], ad4)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("weight", filter_data2)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "weight2": tvm.nd.array(filter_data2),
+    }
+
+    static_memory_scope = [
+        "global",
+        "global.texture",
+        "global.texture-nhwc",
+        "global.texture",
+        "global.texture-nhwc",
+        "global.texture",
+        "global",
+        "global",
+    ]
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+# function repeat, params scope are different in reused functions
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_injective_nwo_inputs2(target, dtype):
+    """
+    Use case for verification of stability of annotation primary functions
+    having several ops accepting data outside of Primary function
+    The visiting of ops during traversing of graph inside primary function
+    can depend on order of relay graph creation. Thus the annotation mechanism
+    should be reliable for graph traversal order
+    The current decision if Prim Function support textures or not depend on
+    *any* op accepting input of the function and if op support textures
+                                     Input
+                               /                   \
+                layout_transform (NCHW->NCHW4c)    |
+                         |                        /
+                      conv2d (1)                 /
+                         |                      /
+                      conv2d (2)       mean    /
+                  /         \                 /   <- Primary function several head ops
+             (1)add    (2)layout_transform    |
+                 |        (NCHW4c->NCHW)      |
+                 |           |      \        /
+                 |           |       (3) add
+                 |           |         |
+    layout_transform          \       /
+     (NCHW4c->NCHW)             \    /
+                 \                mul
+                  \            /
+                        add
+
+    This test verifies a case when the latest op which is (1), it supports textures
+    an whole prim function is considered as a func working with textures
+    """
+    input_shape = (1, 4, 40, 40)
+    filter_shape1 = (4, 4, 3, 3)
+    filter_shape2 = (4, 4, 3, 3)
+    filter_shape3 = (4, 4, 3, 3)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    mean = relay.mean(A, axis=1, keepdims=True)
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[1, 1, 1, 1],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=4,
+        kernel_size=(3, 3),
+    )
+
+    conv2 = relay.nn.conv2d(
+        conv1,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[1, 1, 1, 1],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=4,
+        kernel_size=(3, 3),
+    )
+
+    ad3 = relay.op.add(conv1, conv2)
+    ad1 = relay.op.add(mean, conv1)
+    ad2 = relay.op.multiply(ad1, conv2)
+    ad4 = relay.op.add(ad2, ad3)
+
+    mod = relay.Function([A, W1, W2], ad4)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("weight", filter_data2)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "weight2": tvm.nd.array(filter_data2),
+    }
+
+    static_memory_scope = [
+        "global",
+        "global.texture",
+        "global.texture-nhwc",
+        "global.texture",
+        "global",
+        "global.texture-nhwc",
+        "global.texture",
+        "global",
+    ]
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)

From 03d989f4411bf092e2eaf0e20c401900674f3ba1 Mon Sep 17 00:00:00 2001
From: Andrey Malyshev <elvin.nnov@gmail.com>
Date: Mon, 24 Oct 2022 23:32:22 +0300
Subject: [PATCH 427/704] [Adreno] Adapt reduction schedule for adreno (#13100)

* [Adreno] Adapt reduction schedule for adreno

Origin cuda schedule uses rfactor that is 10x-50x slower on
Adreno than without barries

* Address PR comments

* Remove copy-paste, start reuse cuda impl

* Address pylint hits

* Extend comment for cuda schedule_reduce_impl
---
 python/tvm/relay/op/strategy/adreno.py        |  7 ++
 python/tvm/topi/adreno/__init__.py            |  1 +
 python/tvm/topi/adreno/reduction.py           | 69 +++++++++++++++++++
 python/tvm/topi/cuda/reduction.py             | 20 ++++--
 .../opencl_texture/test_reduction_texture.py  | 51 ++++++++++++++
 5 files changed, 144 insertions(+), 4 deletions(-)
 create mode 100644 python/tvm/topi/adreno/reduction.py
 create mode 100644 tests/python/relay/opencl_texture/test_reduction_texture.py

diff --git a/python/tvm/relay/op/strategy/adreno.py b/python/tvm/relay/op/strategy/adreno.py
index 9429fd71e1d9..011622d5374f 100644
--- a/python/tvm/relay/op/strategy/adreno.py
+++ b/python/tvm/relay/op/strategy/adreno.py
@@ -208,6 +208,13 @@ def schedule_injective_adreno(attrs, outs, target):
         return topi.adreno.schedule_injective(outs)
 
 
+@schedule_reduce.register(["adreno"])
+def schedule_reduce_adreno(attrs, outs, target):
+    """schedule reduction ops for adreno GPU"""
+    with target:
+        return topi.adreno.schedule_reduce(outs)
+
+
 @concatenate_strategy.register(["adreno"])
 def concatenate_strategy_adreno(attrs, inputs, out_type, target):
     strategy = _op.OpStrategy()
diff --git a/python/tvm/topi/adreno/__init__.py b/python/tvm/topi/adreno/__init__.py
index 227ca6aa9a48..55bfbee2a8d7 100644
--- a/python/tvm/topi/adreno/__init__.py
+++ b/python/tvm/topi/adreno/__init__.py
@@ -26,3 +26,4 @@
 from .conv2d_nchw_winograd import *
 from .conv2d_nhwc_winograd import *
 from .injective import schedule_injective
+from .reduction import *
diff --git a/python/tvm/topi/adreno/reduction.py b/python/tvm/topi/adreno/reduction.py
new file mode 100644
index 000000000000..b95832c60f2a
--- /dev/null
+++ b/python/tvm/topi/adreno/reduction.py
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,too-many-locals,len-as-condition
+"""Schedule for reduce operators"""
+import numpy
+from tvm import te
+from ..utils import get_const_tuple
+from .injective import schedule_injective_from_existing
+from .utils import get_div
+from ..cuda.reduction import schedule_reduce_impl
+
+
+def _schedule_reduce_adreno(op, sch, is_idx_reduce=False):
+    if is_idx_reduce:
+        real_output = op.output(0)
+        temp_idx_input = op.input_tensors[0].op.output(0)
+        temp_val_input = op.input_tensors[0].op.output(1)
+    else:
+        real_output = op.output(0)
+    shape = get_const_tuple(real_output.shape)
+    latest4 = shape[-1] == 4
+    div4 = numpy.prod(shape) % 4 == 0
+
+    # Fuse and split the axis
+    if latest4:
+        fused_outer = sch[real_output].fuse(
+            *[sch[real_output].op.axis[i] for i in range(len(sch[real_output].op.axis) - 1)]
+        )
+    else:
+        fused_outer = sch[real_output].fuse(
+            *[sch[real_output].op.axis[i] for i in range(len(sch[real_output].op.axis))]
+        )
+
+    ftc = numpy.prod(shape)
+    a = fused_outer
+    if latest4:
+        sch[real_output].vectorize(sch[real_output].op.axis[-1])
+    elif div4 and not is_idx_reduce:
+        a, b = sch[real_output].split(fused_outer, factor=4)
+        sch[real_output].vectorize(b)
+        ftc = ftc / 4
+
+    num_thread = get_div(ftc, 128)
+
+    bx, outer_in = sch[real_output].split(a, factor=num_thread)
+
+    sch[real_output].bind(bx, te.thread_axis("blockIdx.x"))
+    sch[real_output].bind(outer_in, te.thread_axis("threadIdx.y"))
+    if is_idx_reduce:
+        sch[temp_idx_input].compute_at(sch[real_output], outer_in)
+        sch[temp_val_input].compute_at(sch[real_output], outer_in)
+
+
+def schedule_reduce(outs):
+    return schedule_reduce_impl(outs, _schedule_reduce_adreno, schedule_injective_from_existing)
diff --git a/python/tvm/topi/cuda/reduction.py b/python/tvm/topi/cuda/reduction.py
index fb6a3bfd5174..318d72b1e5d0 100644
--- a/python/tvm/topi/cuda/reduction.py
+++ b/python/tvm/topi/cuda/reduction.py
@@ -116,14 +116,22 @@ def is_scheduled(stage):
     return True
 
 
-def schedule_reduce(outs):
+def schedule_reduce_impl(outs, schedule_reduce_stage, schedule_injective_stage):
     """Schedule for inject->reduce->bcast ops.
+    Traverse over the stages in the schedule and schedule separate stages depending
+    on the position of the stage. Injecteve post-ops of reduction will be scheduled using
+    injection schedule, injective pre-ops of reduction will be inlined, reduction stage
+    will be scheduled using reduction schedule
 
     Parameters
     ----------
     outs: Array of Tensor
           The computation graph description of reduce in the format
           of an array of tensors.
+    schedule_reduce_stage: Function responsible for scheduling the reduction
+          stage
+    schedule_injective_stage: Function responsible for scheduling the
+          standalone injection stage
 
     Returns
     -------
@@ -153,7 +161,7 @@ def traverse_after_reduce(operator):
         """Internal traverse function"""
         if tag.is_broadcast(operator.tag):
             if operator not in scheduled_ops:
-                schedule_injective_from_existing(sch, operator.output(0))
+                schedule_injective_stage(sch, operator.output(0))
             for tensor in operator.input_tensors:
                 if tensor.op not in scheduled_ops:
                     if enable_auto_inline:
@@ -162,13 +170,13 @@ def traverse_after_reduce(operator):
                         traverse_after_reduce(tensor.op)
         elif operator.tag == "comm_reduce":
             if operator not in scheduled_ops:
-                _schedule_reduce(operator, sch, is_idx_reduce=False)
+                schedule_reduce_stage(operator, sch, is_idx_reduce=False)
             for tensor in operator.input_tensors:
                 if tensor.op not in scheduled_ops:
                     traverse_before_reduce(tensor.op)
         elif operator.tag == "comm_reduce_idx":
             if operator not in scheduled_ops:
-                _schedule_reduce(operator, sch, is_idx_reduce=True)
+                schedule_reduce_stage(operator, sch, is_idx_reduce=True)
             input_tensors = operator.input_tensors[0].op.input_tensors
             for tensor in input_tensors:
                 if tensor.op not in scheduled_ops:
@@ -183,3 +191,7 @@ def traverse_after_reduce(operator):
     for out in outs:
         traverse_after_reduce(out.op)
     return sch
+
+
+def schedule_reduce(outs):
+    return schedule_reduce_impl(outs, _schedule_reduce, schedule_injective_from_existing)
diff --git a/tests/python/relay/opencl_texture/test_reduction_texture.py b/tests/python/relay/opencl_texture/test_reduction_texture.py
new file mode 100644
index 000000000000..b14aefd2f9ab
--- /dev/null
+++ b/tests/python/relay/opencl_texture/test_reduction_texture.py
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay import testing
+from tvm.contrib import utils
+from utils.adreno_utils import gpu_preprocess, build_run_compare
+
+
+dtype = tvm.testing.parameter("float32")
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_mean(target, dtype):
+    # NCHW
+    input_shape = (1, 3, 720, 1280)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    mean = relay.mean(A, axis=1, keepdims=True)
+    mod = relay.Function([A], mean)
+
+    build_run_compare(mod, {}, {"data": input_shape}, dtype, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_argmax(target, dtype):
+    # NCHW
+    input_shape = (1, 3, 720, 1280)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    argmax = relay.op.argmax(A, axis=[1])
+    mod = relay.Function([A], argmax)
+
+    build_run_compare(mod, {}, {"data": input_shape}, dtype, target)

From c302b3dce35507d1560a59597e7885ba0f63b21f Mon Sep 17 00:00:00 2001
From: Matthew Barrett <55580676+mbaret@users.noreply.github.com>
Date: Mon, 24 Oct 2022 23:32:56 +0300
Subject: [PATCH 428/704] [AOT] Add CreateFunctionMetadata analysis pass
 (#13095)

AOT requires FunctionInfo to be defined for all the functions
in the module. This stores information on how much memory the
functions use. This commit adds a separate analysis pass to
create all the FunctionInfos + some tests for the new pass.
---
 include/tvm/tir/analysis.h                    |   7 +
 python/tvm/ir/memory_pools.py                 |  52 +++
 python/tvm/relay/backend/aot.py               |  26 ++
 .../backend/aot/create_function_metadata.cc   | 125 ++++++++
 .../backend/aot/create_function_metadata.h    |  49 +++
 src/tir/usmp/utils.cc                         |   6 +-
 .../aot/test_aot_create_function_metadata.py  | 302 ++++++++++++++++++
 7 files changed, 564 insertions(+), 3 deletions(-)
 create mode 100644 src/relay/backend/aot/create_function_metadata.cc
 create mode 100644 src/relay/backend/aot/create_function_metadata.h
 create mode 100644 tests/python/relay/aot/test_aot_create_function_metadata.py

diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h
index 402fa5515431..7af591c23883 100644
--- a/include/tvm/tir/analysis.h
+++ b/include/tvm/tir/analysis.h
@@ -201,6 +201,13 @@ TVM_DLL Array<Array<BufferRegion>> GetBlockReadWriteRegion(const Block& block,
  */
 TVM_DLL size_t CalculateExprComplexity(const PrimExpr& expr);
 
+/*!
+ * \brief Calculate the constants size in bytes needed by the TIR allocates inside the TIR PrimFunc
+ * \param func The TIR PrimFunc for which the constants size to be calculated
+ * \param constant_byte_alignment The byte alignment required for each constant allocated
+ */
+TVM_DLL size_t CalculateConstantBytes(const PrimFunc& func, const Integer& constant_byte_alignment);
+
 /*!
  * \brief Calculate the workspace size in bytes needed by the TIR allocates inside the TIR PrimFunc
  * \param func The TIR PrimFunc for which the workspace size to be calculated
diff --git a/python/tvm/ir/memory_pools.py b/python/tvm/ir/memory_pools.py
index 553bb49e3c92..cd9a75a5c711 100644
--- a/python/tvm/ir/memory_pools.py
+++ b/python/tvm/ir/memory_pools.py
@@ -20,6 +20,7 @@
 
 from tvm._ffi import register_object
 from tvm.runtime import Object
+from tvm.runtime import NDArray
 from . import _ffi_api
 
 
@@ -101,6 +102,34 @@ def __init__(
         )
 
 
+@register_object("ir.ConstantInfo")
+class ConstantInfo(Object):
+    """ConstantInfo object hold information on a constant pool.
+
+    Parameters
+    ----------
+    name_hint : str
+        Name of the constant.
+    byte_offset : int
+        The byte_offset of the constant.
+    data : NDArray
+        The data of the constant.
+    """
+
+    def __init__(
+        self,
+        name_hint: str,
+        byte_offset: int,
+        data: NDArray,
+    ):
+        self.__init_handle_by_constructor__(
+            _ffi_api.ConstantInfo,  # type: ignore # pylint: disable=no-member
+            name_hint,
+            byte_offset,
+            data,
+        )
+
+
 @register_object("ir.WorkspacePoolInfo")
 class WorkspacePoolInfo(PoolInfo):
     """WorkspacePoolInfo object holds information related to RW memory pools
@@ -214,3 +243,26 @@ def __init__(
         self.__init_handle_by_constructor__(
             _ffi_api.ConstantMemoryPools, pools  # type: ignore # pylint: disable=no-member
         )
+
+
+@register_object("ir.ConstantMemoryPools")
+class AllocatedPoolInfo(Object):
+    """Allocate memory in a given pool.
+
+    Parameters
+    ----------
+    pool : PoolInfo
+        The pool in which to allocate memory.
+    allocated_size : int
+        The size of memory to allocate.
+    """
+
+    def __init__(
+        self,
+        pool: PoolInfo,
+        allocated_size: int,
+        pool_var_idx: int = 0,
+    ):
+        self.__init_handle_by_constructor__(
+            _ffi_api.AllocatedPoolInfo, pool, allocated_size, pool_var_idx  # type: ignore # pylint: disable=no-member
+        )
diff --git a/python/tvm/relay/backend/aot.py b/python/tvm/relay/backend/aot.py
index 8e7406c72f32..b861d9298543 100644
--- a/python/tvm/relay/backend/aot.py
+++ b/python/tvm/relay/backend/aot.py
@@ -16,6 +16,9 @@
 # under the License.
 # pylint: disable=invalid-name
 """AOT passes"""
+from typing import Dict
+
+from tvm import IRModule
 from tvm.ir.transform import Pass
 from .utils import CallType
 
@@ -41,3 +44,26 @@ def AOTLowerMain(mod_name: str, config: object, call_type: CallType) -> Pass:
 
     """
     return _aot.AOTLowerMain(mod_name, config, call_type.value)
+
+
+def CreateFunctionMetadata(
+    mod: IRModule, workspace_byte_alignment: int, constant_byte_alignment: int
+) -> Dict[str, object]:
+    """Create the function metadata (FunctionInfos) from an AOT module.
+
+    Parameters
+    ----------
+    mod : IRModule
+        The IRModule.
+    workspace_byte_alignment : int
+        The alignment of the workspace buffer in bytes.
+    constant_byte_alignment : int
+        The alignment of the constant buffer in bytes.
+
+    Returns
+    -------
+    Dict[str, FunctionInfo]
+        A map between function names and FunctionInfos.
+
+    """
+    return _aot.CreateFunctionMetadata(mod, workspace_byte_alignment, constant_byte_alignment)
diff --git a/src/relay/backend/aot/create_function_metadata.cc b/src/relay/backend/aot/create_function_metadata.cc
new file mode 100644
index 000000000000..54fd270c1b25
--- /dev/null
+++ b/src/relay/backend/aot/create_function_metadata.cc
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/aot/create_function_metadata.cc
+ * \brief Create FunctionInfo metadata from a lowered TIR module.
+ */
+#include "./create_function_metadata.h"
+
+#include <tvm/ir/expr.h>
+#include <tvm/ir/module.h>
+#include <tvm/runtime/container/array.h>
+#include <tvm/runtime/container/map.h>
+#include <tvm/runtime/container/string.h>
+#include <tvm/runtime/data_type.h>
+#include <tvm/runtime/module.h>
+#include <tvm/target/target_kind.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/function.h>
+#include <tvm/tir/op.h>
+#include <tvm/tir/usmp/utils.h>
+
+#include "../utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+namespace aot {
+
+/*!
+ * \brief Calculate FunctionInfo for all the PrimFuncs in a module.
+ */
+Map<String, backend::FunctionInfo> CalculateFunctionInfos(const IRModule& mod,
+                                                          Integer workspace_byte_alignment,
+                                                          Integer constant_byte_alignment) {
+  Map<String, backend::FunctionInfo> function_metadata;
+  for (const auto& kv : mod->functions) {
+    GlobalVar global_var = kv.first;
+    BaseFunc base_func = kv.second;
+    if (base_func->IsInstance<tir::PrimFuncNode>()) {
+      tir::PrimFunc pfunc = Downcast<tir::PrimFunc>(base_func);
+      Optional<Target> tgt_opt = pfunc->GetAttr<Target>(tvm::attr::kTarget);
+      ICHECK(tgt_opt) << "Target must be defined for all primfuncs.";
+      Target tgt = tgt_opt.value();
+      // Determine the size of input/output buffers
+      auto params = pfunc->params;
+      int64_t total_io_bytes = 0;
+      for (const auto& param : params) {
+        // Inputs/outputs will be handles, workspaces are pointers
+        if (param->dtype.is_handle()) {
+          auto buffer = pfunc->buffer_map[param];
+          total_io_bytes += GetMemorySizeBytes(buffer->shape, buffer->dtype);
+        }
+      }
+      const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment);
+      const auto& cs = CalculateConstantBytes(pfunc, constant_byte_alignment);
+      backend::FunctionInfo finfo{
+          {{tgt, ws}}, {{tgt, total_io_bytes}}, {{tgt, cs}}, {{tgt, pfunc}}, {}};
+      function_metadata.Set(global_var->name_hint, finfo);
+    }
+  }
+  return function_metadata;
+}
+
+Map<String, backend::FunctionInfo> CreateFunctionMetadata(const IRModule& mod,
+                                                          Integer workspace_byte_alignment,
+                                                          Integer constant_byte_alignment) {
+  // First calculate the FunctionInfos from the buffers that are explicitly allocated
+  auto function_metadata =
+      CalculateFunctionInfos(mod, workspace_byte_alignment, constant_byte_alignment);
+  // Now adjust the FunctionInfo for the main func to also include PoolInfo allocations
+  // made by the USMP.
+  Optional<Array<tir::usmp::AllocatedPoolInfo>> allocated_pool_infos =
+      mod->GetAttr<Array<tir::usmp::AllocatedPoolInfo>>(tvm::attr::kPoolArgs);
+  backend::FunctionInfo main_func_info =
+      function_metadata.Get(runtime::symbol::tvm_module_main).value();
+  if (allocated_pool_infos) {
+    for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : allocated_pool_infos.value()) {
+      for (const auto& tgt : allocated_pool_info->pool_info->targets) {
+        VLOG(1) << "USMP requires target " << tgt->ToDebugString() << " to have pool size "
+                << allocated_pool_info->allocated_size->value;
+        size_t size = allocated_pool_info->allocated_size->value;
+        if (allocated_pool_info->pool_info->IsInstance<ConstantPoolInfoNode>()) {
+          size += main_func_info->constant_sizes.count(tgt)
+                      ? main_func_info->constant_sizes[tgt]->value
+                      : 0;
+          main_func_info->constant_sizes.Set(tgt, size);
+        } else if (allocated_pool_info->pool_info->IsInstance<WorkspacePoolInfoNode>()) {
+          size += main_func_info->workspace_sizes.count(tgt)
+                      ? main_func_info->workspace_sizes[tgt]->value
+                      : 0;
+          main_func_info->workspace_sizes.Set(tgt, size);
+        } else {
+          LOG(FATAL) << "Unknown pool type: " << allocated_pool_info->pool_info->GetTypeKey();
+        }
+      }
+    }
+  }
+  function_metadata.Set(runtime::symbol::tvm_module_main, main_func_info);
+  return function_metadata;
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.aot.CreateFunctionMetadata")
+    .set_body_typed(CreateFunctionMetadata);
+
+}  // namespace aot
+}  // namespace backend
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/aot/create_function_metadata.h b/src/relay/backend/aot/create_function_metadata.h
new file mode 100644
index 000000000000..8c7bf8753496
--- /dev/null
+++ b/src/relay/backend/aot/create_function_metadata.h
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_RELAY_BACKEND_AOT_CREATE_FUNCTION_METADATA_H_
+#define TVM_RELAY_BACKEND_AOT_CREATE_FUNCTION_METADATA_H_
+
+#include <tvm/ir/module.h>
+#include <tvm/runtime/container/map.h>
+#include <tvm/runtime/container/string.h>
+
+#include "../utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+namespace aot {
+
+/*! \brief Create FunctionInfo metadata for all the PrimFuncs in a module lowered
+ *  for AOT execution.
+ * \param mod The module.
+ * \param workspace_byte_alignment The alignment of the workspace pool.
+ * \param constant_byte_alignment The alignment of the constant pool.
+ * \return A map between function names and FunctionInfos.
+ */
+Map<String, FunctionInfo> CreateFunctionMetadata(const IRModule& mod,
+                                                 Integer workspace_byte_alignment,
+                                                 Integer constant_byte_alignment);
+
+}  // namespace aot
+}  // namespace backend
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_BACKEND_AOT_CREATE_FUNCTION_METADATA_H_
diff --git a/src/tir/usmp/utils.cc b/src/tir/usmp/utils.cc
index 3350ecc5d47f..88a6496859a5 100644
--- a/src/tir/usmp/utils.cc
+++ b/src/tir/usmp/utils.cc
@@ -132,9 +132,9 @@ AllocatedPoolInfo::AllocatedPoolInfo(PoolInfo pool_info, Integer allocated_size,
 }
 
 TVM_REGISTER_NODE_TYPE(AllocatedPoolInfoNode);
-TVM_REGISTER_GLOBAL("tir.usmp.AllocatedPoolInfo")
-    .set_body_typed([](PoolInfo pool_info, Integer allocated_size) {
-      return AllocatedPoolInfo(pool_info, allocated_size);
+TVM_REGISTER_GLOBAL("ir.AllocatedPoolInfo")
+    .set_body_typed([](PoolInfo pool_info, Integer allocated_size, Integer pool_var_idx) {
+      return AllocatedPoolInfo(pool_info, allocated_size, pool_var_idx);
     });
 
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
diff --git a/tests/python/relay/aot/test_aot_create_function_metadata.py b/tests/python/relay/aot/test_aot_create_function_metadata.py
new file mode 100644
index 000000000000..ff2a522572c5
--- /dev/null
+++ b/tests/python/relay/aot/test_aot_create_function_metadata.py
@@ -0,0 +1,302 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=line-too-long,missing-class-docstring,missing-module-docstring,missing-function-docstring,no-self-argument,unused-argument,invalid-name
+import numpy as np
+
+import tvm
+import tvm.testing
+from tvm.script import tir as T
+from tvm.runtime.ndarray import array
+from tvm.relay.backend.aot import CreateFunctionMetadata
+from tvm.ir.memory_pools import AllocatedPoolInfo, ConstantPoolInfo, WorkspacePoolInfo, ConstantInfo
+
+
+def _check_function_metadata(function_metadata, expected_infos):
+    for symbol, expected_info in expected_infos.items():
+        func_info = function_metadata[symbol]
+        # Check workspace_sizes
+        key, value = func_info.workspace_sizes.items()[0]
+        assert str(key) == expected_info["target"]
+        assert value == expected_info["workspace_sizes"]
+        # Check io_sizes
+        key, value = func_info.io_sizes.items()[0]
+        assert str(key) == expected_info["target"]
+        assert value == expected_info["io_sizes"]
+        # Check constant_sizes
+        key, value = func_info.constant_sizes.items()[0]
+        assert str(key) == expected_info["target"]
+        assert value == expected_info["constant_sizes"]
+        # Check tir_primfuncs
+        key, value = func_info.tir_primfuncs.items()[0]
+        assert str(key) == expected_info["target"]
+        tvm.ir.assert_structural_equal(value, expected_info["tir_primfuncs"])
+
+
+def test_create_function_metadata_workspace_allocate_only():
+    # fmt: off
+    @tvm.script.ir_module
+    class Module:
+        @T.prim_func
+        def __tvm_main__(a: T.handle, output: T.handle) -> None:
+            # function attr dict
+            T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]})})
+            a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+            output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+            # body
+            sid_3 = T.allocate([140], "int8", "global.workspace")
+            sid_2 = T.allocate([140], "int8", "global.workspace")
+            sid_1 = T.allocate([140], "int8", "global.workspace")
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", a_buffer.data, sid_1, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_1, sid_2, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_2, sid_3, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_1", sid_2, sid_3, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    expected_infos = {
+        "__tvm_main__": {
+            "target": "llvm -keys=cpu ",
+            "workspace_sizes": 432,
+            "io_sizes": 280,
+            "constant_sizes": 0,
+            "tir_primfuncs": Module["__tvm_main__"],
+        }
+    }
+
+    function_metadata = CreateFunctionMetadata(Module, 16, 1)
+
+    _check_function_metadata(function_metadata, expected_infos)
+
+
+def test_create_function_metadata_constant_allocate_only():
+    # fmt: off
+    @tvm.script.ir_module
+    class Module:
+        @T.prim_func
+        def __tvm_main__(a: T.handle, output: T.handle) -> None:
+            # function attr dict
+            T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "num_inputs": 1, "num_outputs": 1})
+            a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+            output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+            # body
+            constant_0 = T.allocate_const([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "float32", [5, 7])
+            T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, constant_0, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    expected_infos = {
+        "__tvm_main__": {
+            "target": "llvm -keys=cpu ",
+            "workspace_sizes": 0,
+            "io_sizes": 280,
+            "constant_sizes": 140,
+            "tir_primfuncs": Module["__tvm_main__"],
+        }
+    }
+
+    function_metadata = CreateFunctionMetadata(Module, 16, 1)
+
+    _check_function_metadata(function_metadata, expected_infos)
+
+
+def test_create_function_metadata_constant_pool_only():
+    # fmt: off
+    @tvm.script.ir_module
+    class Module:
+        @T.prim_func
+        def __tvm_main__(a: T.handle, output: T.handle) -> None:
+            # function attr dict
+            T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "num_inputs": 1, "num_outputs": 1})
+            a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+            output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+            # body
+            T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, a_buffer.data, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    expected_infos = {
+        "__tvm_main__": {
+            "target": "llvm -keys=cpu ",
+            "workspace_sizes": 0,
+            "io_sizes": 280,
+            "constant_sizes": 256,
+            "tir_primfuncs": Module["__tvm_main__"],
+        }
+    }
+
+    target = Module["__tvm_main__"].attrs["target"]
+    mod = Module.with_attr(
+        "pool_args",
+        [
+            AllocatedPoolInfo(
+                ConstantPoolInfo(
+                    "flash",
+                    [target],
+                    [ConstantInfo("a", 0, array(np.array([0])))],
+                ),
+                256,
+            ),
+        ],
+    )
+
+    function_metadata = CreateFunctionMetadata(mod, 16, 1)
+
+    _check_function_metadata(function_metadata, expected_infos)
+
+
+def test_create_function_metadata_workspace_pool_only():
+    # fmt: off
+    @tvm.script.ir_module
+    class Module:
+        @T.prim_func
+        def __tvm_main__(a: T.handle, output: T.handle) -> None:
+            # function attr dict
+            T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "num_inputs": 1, "num_outputs": 1})
+            a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+            output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+            # body
+            T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, a_buffer.data, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    expected_infos = {
+        "__tvm_main__": {
+            "target": "llvm -keys=cpu ",
+            "workspace_sizes": 256,
+            "io_sizes": 280,
+            "constant_sizes": 0,
+            "tir_primfuncs": Module["__tvm_main__"],
+        }
+    }
+
+    target = Module["__tvm_main__"].attrs["target"]
+    mod = Module.with_attr(
+        "pool_args",
+        [
+            AllocatedPoolInfo(
+                WorkspacePoolInfo("sram", [target]),
+                256,
+            ),
+        ],
+    )
+
+    function_metadata = CreateFunctionMetadata(mod, 16, 1)
+
+    _check_function_metadata(function_metadata, expected_infos)
+
+
+def test_create_function_metadata_all_single_func():
+    # fmt: off
+    @tvm.script.ir_module
+    class Module:
+        @T.prim_func
+        def __tvm_main__(a: T.handle, output: T.handle) -> None:
+            # function attr dict
+            T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]})})
+            a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+            output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+            # body
+            sid_3 = T.allocate([140], "int8", "global.workspace")
+            sid_2 = T.allocate([140], "int8", "global.workspace")
+            sid_1 = T.allocate([140], "int8", "global.workspace")
+            constant_0 = T.allocate_const([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "float32", [5, 7])
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", a_buffer.data, sid_1, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_1, constant_0, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_2, sid_3, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_1", sid_2, sid_3, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    expected_infos = {
+        "__tvm_main__": {
+            "target": "llvm -keys=cpu ",
+            "workspace_sizes": 688,
+            "io_sizes": 280,
+            "constant_sizes": 652,
+            "tir_primfuncs": Module["__tvm_main__"],
+        }
+    }
+
+    target = Module["__tvm_main__"].attrs["target"]
+    mod = Module.with_attr(
+        "pool_args",
+        [
+            AllocatedPoolInfo(
+                ConstantPoolInfo(
+                    "flash",
+                    [target],
+                    [ConstantInfo("a", 0, array(np.array([0])))],
+                ),
+                512,
+            ),
+            AllocatedPoolInfo(
+                WorkspacePoolInfo("sram", [target]),
+                256,
+            ),
+        ],
+    )
+
+    function_metadata = CreateFunctionMetadata(mod, 16, 1)
+
+    _check_function_metadata(function_metadata, expected_infos)
+
+
+def test_create_function_metadata_workspace_multi_funcs():
+    # fmt: off
+    @tvm.script.ir_module
+    class Module:
+        @T.prim_func
+        def __tvm_main__(a: T.handle, output: T.handle) -> None:
+            # function attr dict
+            T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "num_inputs": 1, "num_outputs": 1})
+            a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+            output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+            # body
+            T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, a_buffer.data, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+
+        @T.prim_func
+        def test_fused_add(a: T.handle, b: T.handle, output: T.handle) -> None:
+            # function attr dict
+            T.func_attr({"global_symbol": "test_mod_test_fused_add", "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]})})
+            a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+            b_buffer = T.match_buffer(b, [5, 7], dtype="float32", align=16)
+            output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+            # body
+            sid_0 = T.allocate([140], "int8", "global.workspace")
+            constant_0 = T.allocate_const([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "float32", [5, 7])
+            T.evaluate(T.tvm_call_cpacked("magic", a_buffer.data, b_buffer.data, sid_0, constant_0, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    expected_infos = {
+        "__tvm_main__": {
+            "target": "llvm -keys=cpu ",
+            "workspace_sizes": 0,
+            "io_sizes": 280,
+            "constant_sizes": 0,
+            "tir_primfuncs": Module["__tvm_main__"],
+        },
+        "test_fused_add": {
+            "target": "llvm -keys=cpu ",
+            "workspace_sizes": 144,
+            "io_sizes": 420,
+            "constant_sizes": 140,
+            "tir_primfuncs": Module["test_fused_add"],
+        },
+    }
+
+    function_metadata = CreateFunctionMetadata(Module, 16, 1)
+
+    _check_function_metadata(function_metadata, expected_infos)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From bb8aad160d4bb59a943a8cf46ae850cdcff779b8 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Mon, 24 Oct 2022 17:17:25 -0700
Subject: [PATCH 429/704] [ci] Temporarily disable MacOS RPC tests (#13186)

See #13185

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 .github/workflows/main.yml | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index b03a1795ef9e..6cbb7aa8daf5 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -69,14 +69,15 @@ jobs:
         shell: bash -l {0}
         run: >-
           python -m pytest -v tests/python/all-platform-minimal-test
-      - name: Test iOS RPC
-        shell: bash -l {0}
-        run: >-
-          python -m pip install tornado psutil cloudpickle &&
-          export PYTHONPATH=tests/python/contrib:${PYTHONPATH} &&
-          export BUNDLE_ID=org.apache.tvmrpc &&
-          export BUNDLE_PATH=build-ios-simulator/apps/ios_rpc/ios_rpc/src/ios_rpc-build/Release-iphonesimulator/tvmrpc.app &&
-          python -m pytest -v tests/python/contrib/test_rpc_server_device.py
+      # See https://github.com/apache/tvm/issues/13185
+      # - name: Test iOS RPC
+      #   shell: bash -l {0}
+      #   run: >-
+      #     python -m pip install tornado psutil cloudpickle &&
+      #     export PYTHONPATH=tests/python/contrib:${PYTHONPATH} &&
+      #     export BUNDLE_ID=org.apache.tvmrpc &&
+      #     export BUNDLE_PATH=build-ios-simulator/apps/ios_rpc/ios_rpc/src/ios_rpc-build/Release-iphonesimulator/tvmrpc.app &&
+      #     python -m pytest -v tests/python/contrib/test_rpc_server_device.py
 
   Windows:
     if: ${{ github.repository == 'apache/tvm' }}

From 7950271ceb16be076208bd2d144eb475f956c982 Mon Sep 17 00:00:00 2001
From: Yaxing Cai <caiyaxing666@gmail.com>
Date: Tue, 25 Oct 2022 06:04:44 -0700
Subject: [PATCH 430/704] [TVMScript] TIR parser (#13190)

---
 python/tvm/script/_parser/__init__.py         |   3 +-
 python/tvm/script/_parser/core/parser.py      |  15 +
 python/tvm/script/_parser/tir/__init__.py     |  25 +
 python/tvm/script/_parser/tir/entry.py        | 108 ++++
 python/tvm/script/_parser/tir/operation.py    |  85 ++++
 python/tvm/script/_parser/tir/parser.py       | 468 ++++++++++++++++++
 .../unittest/test_tvmscript_parser_tir.py     |  63 +++
 7 files changed, 766 insertions(+), 1 deletion(-)
 create mode 100644 python/tvm/script/_parser/tir/__init__.py
 create mode 100644 python/tvm/script/_parser/tir/entry.py
 create mode 100644 python/tvm/script/_parser/tir/operation.py
 create mode 100644 python/tvm/script/_parser/tir/parser.py
 create mode 100644 tests/python/unittest/test_tvmscript_parser_tir.py

diff --git a/python/tvm/script/_parser/__init__.py b/python/tvm/script/_parser/__init__.py
index fd4e45818c20..38c8b88cc7ca 100644
--- a/python/tvm/script/_parser/__init__.py
+++ b/python/tvm/script/_parser/__init__.py
@@ -15,5 +15,6 @@
 # specific language governing permissions and limitations
 # under the Licens.
 """The parser"""
-from . import _core, ir
+from . import _core, ir, tir
 from .ir import ir_module
+from .tir import prim_func
diff --git a/python/tvm/script/_parser/core/parser.py b/python/tvm/script/_parser/core/parser.py
index daf95cb3cd1b..c6d43f11cbf5 100644
--- a/python/tvm/script/_parser/core/parser.py
+++ b/python/tvm/script/_parser/core/parser.py
@@ -571,6 +571,21 @@ def visit_Assign(self, node: doc.Assign) -> Any:  # pylint: disable=invalid-name
         """
         return _dispatch(self, "Assign")(self, node)
 
+    def visit_AnnAssign(self, node: doc.AnnAssign) -> Any:  # pylint: disable=invalid-name
+        """The general annotated assign visiting method.
+
+        Parameters
+        ----------
+        node : doc.Assign
+            The doc AST annotated assign node.
+
+        Returns
+        -------
+        res : Any
+            The visiting result.
+        """
+        return _dispatch(self, "AnnAssign")(self, node)
+
     def visit_Expr(self, node: doc.Expr) -> Any:  # pylint: disable=invalid-name
         """The general expression visiting method.
 
diff --git a/python/tvm/script/_parser/tir/__init__.py b/python/tvm/script/_parser/tir/__init__.py
new file mode 100644
index 000000000000..7754baf087f5
--- /dev/null
+++ b/python/tvm/script/_parser/tir/__init__.py
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The tir parser"""
+
+from ...ir_builder.tir import *  # pylint: disable=redefined-builtin
+from ...ir_builder.tir import ir as _tir
+from . import operation as _operation
+from . import parser as _parser
+from .entry import Buffer, Ptr, prim_func
+
+__all__ = _tir.__all__ + ["Buffer", "Ptr", "prim_func"]
diff --git a/python/tvm/script/_parser/tir/entry.py b/python/tvm/script/_parser/tir/entry.py
new file mode 100644
index 000000000000..632b87aa24dc
--- /dev/null
+++ b/python/tvm/script/_parser/tir/entry.py
@@ -0,0 +1,108 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The entry point of TVM parser for tir."""
+
+import inspect
+from typing import Callable, Union
+
+from tvm.tir import Buffer, PrimFunc
+
+from ...ir_builder.tir import buffer_decl, ptr
+from .._core import parse, utils
+from ..ir import is_defined_in_class
+
+
+def prim_func(func: Callable) -> Union[PrimFunc, Callable]:
+    """The parsing method for tir prim func, by using `@prim_func` as decorator.
+
+    Parameters
+    ----------
+    func : Callable
+        The function to be parsed as prim func.
+
+    Returns
+    -------
+    res : Union[PrimFunc, Callable]
+        The parsed tir prim func.
+    """
+    if not inspect.isfunction(func):
+        raise TypeError(f"Expect a function, but got: {func}")
+    if is_defined_in_class(inspect.stack()):
+        return func
+    return parse(func, utils.inspect_function_capture(func))
+
+
+setattr(prim_func, "dispatch_token", "tir")
+
+
+class BufferProxy:
+    """Buffer proxy class for constructing tir buffer.
+    Overload __call__ and __getitem__ to support syntax as T.Buffer() and T.Buffer[].
+    """
+
+    def __call__(
+        self,
+        shape,
+        dtype="float32",
+        data=None,
+        strides=None,
+        elem_offset=None,
+        scope="global",
+        align=0,
+        offset_factor=0,
+        buffer_type="",
+        axis_separators=None,
+    ) -> Buffer:
+        return buffer_decl(
+            shape,
+            dtype=dtype,
+            data=data,
+            strides=strides,
+            elem_offset=elem_offset,
+            scope=scope,
+            align=align,
+            offset_factor=offset_factor,
+            buffer_type=buffer_type,
+            axis_separators=axis_separators,
+        )
+
+    def __getitem__(self, keys) -> Buffer:
+        if not isinstance(keys, tuple):
+            return self(keys)
+        if len(keys) >= 2 and not isinstance(keys[1], str):
+            return self(keys)
+        return self(*keys)  # pylint: disable=no-member # type: ignore
+
+
+class PtrProxy:
+    """Ptr proxy class for constructing tir pointer.
+    Overload __call__ and __getitem__ to support syntax as T.Ptr() and T.Ptr[].
+    """
+
+    def __call__(self, dtype, storage_scope="global"):
+        if callable(dtype):
+            dtype = dtype().dtype
+        return ptr(dtype, storage_scope)  # pylint: disable=no-member # type: ignore
+
+    def __getitem__(self, keys):
+        if not isinstance(keys, tuple):
+            return self(keys)
+        return self(*keys)
+
+
+Buffer = BufferProxy()  # pylint: disable=invalid-name
+Ptr = PtrProxy()  # pylint: disable=invalid-name
diff --git a/python/tvm/script/_parser/tir/operation.py b/python/tvm/script/_parser/tir/operation.py
new file mode 100644
index 000000000000..ed8f07a06369
--- /dev/null
+++ b/python/tvm/script/_parser/tir/operation.py
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The tir expression operation registration"""
+
+from typing import Type
+
+from tvm import tir
+from tvm.tir import IntImm
+
+from .._core import OpMethod, doc, register_op
+
+
+def _register_expr_op(ty: Type):  # pylint: disable=invalid-name
+    ty._dispatch_type = ty  # pylint: disable=protected-access
+
+    def _and(a, b):
+        if isinstance(a, bool):
+            a = IntImm("bool", a)
+        if isinstance(b, bool):
+            b = IntImm("bool", b)
+        return tir.And(a, b)
+
+    def _or(a, b):
+        if isinstance(a, bool):
+            a = IntImm("bool", a)
+        if isinstance(b, bool):
+            b = IntImm("bool", b)
+        return tir.Or(a, b)
+
+    def r(op: Type, i: int, m: OpMethod):  # pylint: disable=invalid-name
+        register_op(ty, op, i)(m)
+
+    for i in [0, 1]:
+        # Case 1. binop
+        r(doc.Add, i, lambda a, b: a + b)
+        r(doc.Sub, i, lambda a, b: a - b)
+        r(doc.Mult, i, lambda a, b: a * b)
+        r(doc.Div, i, lambda a, b: a / b)
+        r(doc.FloorDiv, i, lambda a, b: a // b)
+        r(doc.Mod, i, lambda a, b: a % b)
+        r(doc.LShift, i, lambda a, b: a << b)
+        r(doc.RShift, i, lambda a, b: a >> b)
+        r(doc.BitOr, i, lambda a, b: a | b)
+        r(doc.BitXor, i, lambda a, b: a ^ b)
+        r(doc.BitAnd, i, lambda a, b: a & b)
+        # doc.MatMult <-- not implemented
+        # doc.Pow <-- not implemented
+        # Case 2. cmpop
+        r(doc.Eq, i, tir.EQ)
+        r(doc.NotEq, i, tir.NE)
+        r(doc.Lt, i, tir.LT)
+        r(doc.LtE, i, tir.LE)
+        r(doc.Gt, i, tir.GT)
+        r(doc.GtE, i, tir.GE)
+        # doc.Is <-- not implemented
+        # doc.IsNot <-- not implemented
+        # doc.In <-- not implemented
+        # doc.NotIn <-- not implemented
+        # Case 3. boolop
+        r(doc.And, i, _and)
+        r(doc.Or, i, _or)
+    for i in [0]:
+        #  Case 4. unaryop
+        r(doc.Invert, i, lambda a: ~a)
+        r(doc.Not, i, tir.Not)
+        r(doc.UAdd, i, lambda a: +a)
+        r(doc.USub, i, lambda a: -a)
+
+
+_register_expr_op(tir.PrimExpr)
+_register_expr_op(tir.IterVar)
diff --git a/python/tvm/script/_parser/tir/parser.py b/python/tvm/script/_parser/tir/parser.py
new file mode 100644
index 000000000000..909238563fab
--- /dev/null
+++ b/python/tvm/script/_parser/tir/parser.py
@@ -0,0 +1,468 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The base parser for tir"""
+
+import contextlib
+from functools import partial
+from typing import Any
+
+from tvm.ir import PrimType
+from tvm.tir import Buffer, IterVar, PrimExpr, Var
+
+from ...ir_builder import tir as T
+from ...ir_builder.base import IRBuilder
+from ...ir_builder.base import IRBuilderFrame as Frame
+from .._core import Parser, dispatch, doc
+
+
+def bind_with_value(self: Parser, node: doc.expr, var_name: str, value: Any) -> Any:
+    """Value binding methods when parsing with statement.
+    e.g. binding i, j, k with T.grid(128, 128, 128), when parsing
+        with T.grid(128, 128, 18) as i, j, k.
+
+    Parameters
+    ----------
+    self : Parser
+        The current parser.
+
+    node : doc.expr
+        The doc AST expression node for error reporting.
+
+    var_name : str
+        The variable name.
+
+    value : Any
+        The value to be bound with.
+
+    Returns
+    -------
+    res : Any
+        The bound value.
+    """
+    if isinstance(value, (list, tuple)):
+        for i, v in enumerate(value):
+            bind_with_value(self, node, f"{var_name}_{i}", v)
+        return value
+    elif isinstance(value, (Buffer, Var)):
+        IRBuilder.name(var_name, value)
+        return value
+    else:
+        self.report_error(node, f"Do not know how to bind type: {type(value)} in with statement")
+        raise NotImplementedError
+
+
+def bind_for_value(self: Parser, node: doc.expr, var_name: str, value: Any) -> Any:
+    """Value binding methods when parsing for statement.
+    e.g. binding i, j, k with T.grid(128, 128, 128), when parsing
+        for i, j, k in T.grid(128, 128, 128).
+
+    Parameters
+    ----------
+    self : Parser
+        The current parser.
+
+    node : doc.expr
+        The doc AST expression node for error reporting.
+
+    var_name : str
+        The variable name.
+
+    value : Any
+        The value to be bound with.
+
+    Returns
+    -------
+    res : Any
+        The bound value.
+    """
+    if isinstance(value, (list, tuple)):
+        for i, v in enumerate(value):
+            bind_for_value(self, node, f"{var_name}_{i}", v)
+        return value
+    elif isinstance(value, Var):
+        IRBuilder.name(var_name, value)
+        return value
+    else:
+        self.report_error(node, f"Do not know how to bind type: {type(value)} in for statement")
+        raise NotImplementedError
+
+
+def bind_assign_value(self: Parser, node: doc.expr, var_name: str, value: Any) -> Any:
+    """Value binding methods when parsing assign statement.
+    e.g. binding vi, vj, vk with T.axis.remap("SSR", [i, j, k]), when parsing
+        vi, vj, vk = T.axis.remap("SSR", [i, j, k]).
+
+    Parameters
+    ----------
+    self : Parser
+        The current parser.
+
+    node : doc.expr
+        The doc AST expression node for error reporting.
+
+    var_name : str
+        The variable name.
+
+    value : Any
+        The value to be bound with.
+
+    Returns
+    -------
+    res : Any
+        The bound value.
+    """
+    if isinstance(value, T.inline):
+        return value.value
+    elif isinstance(value, (list, tuple)):
+        for i, v in enumerate(value):
+            bind_assign_value(self, node, f"{var_name}_{i}", v)
+        return value
+    elif isinstance(value, Frame):
+        value.add_callback(partial(value.__exit__, None, None, None))
+        res = value.__enter__()
+        IRBuilder.name(var_name, res)
+        return res
+    elif isinstance(value, (Buffer, IterVar)) or (
+        isinstance(value, Var) and not self.var_table.exist(value)
+    ):
+        IRBuilder.name(var_name, value)
+        return value
+    elif isinstance(value, PrimExpr):
+        var = T.var(value.dtype)
+        IRBuilder.name(var_name, var)
+        frame = T.let(var, value)
+        frame.add_callback(partial(frame.__exit__, None, None, None))
+        frame.__enter__()
+        return var
+    return value
+
+
+@dispatch.register(token="tir", type_name="For")
+def visit_for(self: Parser, node: doc.For) -> None:
+    """The for visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.For
+        The doc AST for node.
+    """
+    for_frame = self.eval_expr(node.iter)
+    if not isinstance(for_frame, T.frame.ForFrame):
+        self.report_error(
+            node.iter,
+            "Expect the for loop to be one of the following: "
+            "range, T.serial, T.grid, T.parallel, T.vectorized, T.unroll, T.thread_binding",
+        )
+    with self.var_table.with_frame():
+        with for_frame as iters:
+            self.eval_assign(target=node.target, source=iters, bind_value=bind_for_value)
+            self.visit_body(node.body)
+
+
+@dispatch.register(token="tir", type_name="While")
+def visit_while(self: Parser, node: doc.While) -> None:
+    """The while visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.While
+        The doc AST while node.
+    """
+    with self.var_table.with_frame():
+        cond = self.eval_expr(node.test)
+        with T.While(cond):
+            self.visit_body(node.body)
+
+
+@dispatch.register(token="tir", type_name="Assign")
+def visit_assign(self: Parser, node: doc.Assign) -> None:
+    """The assign visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.Assign
+        The doc AST assign node.
+    """
+    if len(node.targets) != 1:
+        self.report_error(node, "Consequential assignments like 'a = b = c' are not supported.")
+    lhs = node.targets[0]
+    rhs = self.eval_expr(node.value)
+    if isinstance(lhs, doc.Subscript):
+        if isinstance(lhs.slice, doc.Tuple):
+            indices = []
+            for index in lhs.slice.elts:
+                indices.append(self.eval_expr(index))
+        else:
+            indices = [self.eval_expr(lhs.slice)]
+        T.buffer_store(self.eval_expr(lhs.value), rhs, indices)
+    else:
+        self.eval_assign(target=lhs, source=rhs, bind_value=bind_assign_value)
+
+
+@dispatch.register(token="tir", type_name="AugAssign")
+def visit_aug_assign(self: Parser, node: doc.AugAssign) -> None:
+    """The augmented assign visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.AugAssign
+        The doc AST augmented assign node.
+    """
+    lhs_pos = (
+        node.target.lineno,
+        node.target.col_offset,
+        node.target.end_lineno,
+        node.target.end_col_offset,
+    )
+    rhs_pos = (
+        node.value.lineno,
+        node.value.col_offset,
+        node.value.end_lineno,
+        node.value.end_col_offset,
+    )
+    node.target.ctx = doc.Load(*lhs_pos)
+    with self.var_table.with_frame():
+        lhs_name = "__tvm_tmp_value_aug_assign_lhs"
+        rhs_name = "__tvm_tmp_value_aug_assign_rhs"
+        lhs_expr = self.eval_expr(node.target)
+        rhs_expr = self.eval_expr(node.value)
+        self.var_table.add(lhs_name, lhs_expr)
+        self.var_table.add(rhs_name, rhs_expr)
+        op = doc.BinOp(
+            doc.Name(lhs_name, doc.Load(*lhs_pos), *lhs_pos),
+            node.op,
+            doc.Name(rhs_name, doc.Load(*rhs_pos), *rhs_pos),
+            *lhs_pos,
+        )
+        rhs = self.eval_expr(op)
+    lhs = node.target
+    lhs.ctx = doc.Store(*lhs_pos)
+    if isinstance(lhs, doc.Subscript):
+        if isinstance(lhs.slice, doc.Tuple):
+            indices = []
+            for index in lhs.slice.elts:
+                indices.append(self.eval_expr(index))
+        else:
+            indices = [self.eval_expr(lhs.slice)]
+        T.buffer_store(self.eval_expr(lhs.value), rhs, indices)
+    else:
+        self.eval_assign(target=lhs, source=rhs, bind_value=bind_assign_value)
+
+
+@dispatch.register(token="tir", type_name="AnnAssign")
+def visit_ann_assign(self: Parser, node: doc.AnnAssign) -> None:
+    """The annotated assign visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.AnnAssign
+        The doc AST annotated assign node.
+    """
+    lhs = node.target
+    rhs = self.eval_expr(node.value)
+    ann_var = self.visit_tvm_annotation(node.annotation)
+    if not isinstance(ann_var, Var):
+        self.report_error(node.annotation, "Annotation should be Var")
+    self.eval_assign(target=lhs, source=ann_var, bind_value=bind_assign_value)
+    frame = T.let(ann_var, rhs)
+    frame.add_callback(partial(frame.__exit__, None, None, None))
+    frame.__enter__()
+
+
+@dispatch.register(token="tir", type_name="With")
+def visit_with(self: Parser, node: doc.With) -> None:
+    """The with visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.With
+        The doc AST with node.
+    """
+    with contextlib.ExitStack() as stack:
+        stack.enter_context(self.var_table.with_frame())
+        for item in node.items:
+            frame = self.eval_expr(item.context_expr)
+            if not isinstance(frame, Frame):
+                self.report_error(
+                    item.context_expr, "Invalid context expression in the with-statement."
+                )
+            rhs = stack.enter_context(frame)
+            if item.optional_vars is not None:
+                self.eval_assign(target=item.optional_vars, source=rhs, bind_value=bind_with_value)
+        self.visit_body(node.body)
+
+
+@dispatch.register(token="tir", type_name="FunctionDef")
+def visit_function_def(self: Parser, node: doc.FunctionDef) -> None:
+    """The function definition visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.FunctionDef
+        The doc AST function definition node.
+    """
+    with self.var_table.with_frame():
+        self.var_table.add("range", T.serial)
+        with T.prim_func():
+            T.func_name(node.name)
+            if node.returns is not None:
+                ret_type = self.eval_expr(node.returns)
+                if callable(ret_type):
+                    ret_type = PrimType(ret_type().dtype)
+                T.func_ret(ret_type)
+            with self.with_dispatch_token("tir"):
+                self.visit(node.args)
+                self.visit_body(node.body)
+
+
+@dispatch.register(token="tir", type_name="arguments")
+def visit_arguments(self: Parser, node: doc.arguments) -> None:
+    """The arguments visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.arguments
+        The doc AST arguments node.
+    """
+    # TODO: handle different types of arguments:
+    # - vararg: arg | None
+    # - kwonlyargs: list[arg]
+    # - kw_defaults: list[expr | None]
+    # - kwarg: arg | None
+    # - defaults: list[expr]
+    # - posonlyargs: list[arg]
+    arg: doc.arg
+    for arg in node.args:
+        if arg.annotation is None:
+            self.report_error(arg, "Type annotation is required for function parameters.")
+        param = T.arg(arg.arg, self.visit_tvm_annotation(arg.annotation))
+        self.var_table.add(arg.arg, param)
+
+
+@dispatch.register(token="tir", type_name="tvm_annotation")
+def visit_tvm_annotation(self: Parser, node: doc.expr):
+    """The TVM annotation visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.expr
+        The doc AST expr node.
+    """
+    annotation = self.eval_expr(node)
+    if callable(annotation):
+        annotation = annotation()
+    return annotation
+
+
+@dispatch.register(token="tir", type_name="Expr")
+def visit_expr_stmt(self: Parser, node: doc.Expr) -> None:
+    """The expr statement visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.Expr
+        The doc AST Expr node.
+    """
+    res = self.eval_expr(node.value)
+    if isinstance(res, Frame):
+        res.add_callback(partial(res.__exit__, None, None, None))
+        res.__enter__()
+
+
+@dispatch.register(token="tir", type_name="If")
+def visit_if(self: Parser, node: doc.If) -> None:
+    """The if visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.If
+        The doc AST if node.
+    """
+    with self.var_table.with_frame():
+        with T.If(self.eval_expr(node.test)):
+            with T.Then():
+                self.visit_body(node.body)
+            if node.orelse:
+                with T.Else():
+                    self.visit_body(node.orelse)
+
+
+@dispatch.register(token="tir", type_name="Assert")
+def visit_assert(self: Parser, node: doc.Assert) -> None:
+    """The assert visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.Assert
+        The doc AST assert node.
+    """
+    cond = self.eval_expr(node.test)
+    msg = self.eval_expr(node.msg)
+    frame = T.Assert(cond, msg)
+    frame.add_callback(partial(frame.__exit__, None, None, None))
+    frame.__enter__()
+
+
+@dispatch.register(token="tir", type_name="Return")
+def visit_return(self: Parser, node: doc.Return) -> None:
+    """The return visiting method for tir.
+
+    Parameters
+    ----------
+    self : Parser
+        The visiting parser.
+
+    node : doc.Return
+        The doc AST return node.
+    """
+    self.report_error(node, "Return is not allowed.")
diff --git a/tests/python/unittest/test_tvmscript_parser_tir.py b/tests/python/unittest/test_tvmscript_parser_tir.py
new file mode 100644
index 000000000000..cfa1dc62b31b
--- /dev/null
+++ b/tests/python/unittest/test_tvmscript_parser_tir.py
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Unittests for tvm.script.parser.tir"""
+
+import pytest
+import inspect
+import tvm.testing
+from tvm.script._parser import tir as T
+from tvm import ir, tir
+
+
+def test_tir_buffer_proxy():
+    buffer_0 = T.Buffer((128, 128), "float32")
+    assert (
+        isinstance(buffer_0, tir.Buffer)
+        and list(buffer_0.shape) == [128, 128]
+        and buffer_0.dtype == "float32"
+    )
+
+    buffer_1 = T.Buffer[(64, 64, 64), "int32"]
+    assert (
+        isinstance(buffer_1, tir.Buffer)
+        and list(buffer_1.shape) == [64, 64, 64]
+        and buffer_1.dtype == "int32"
+    )
+
+
+def test_tir_ptr_proxy():
+    ptr_0 = T.Ptr("int32", "global")
+    assert (
+        isinstance(ptr_0, tir.Var)
+        and ptr_0.dtype == "handle"
+        and isinstance(ptr_0.type_annotation, ir.PointerType)
+        and ptr_0.type_annotation.element_type == ir.PrimType("int32")
+        and ptr_0.type_annotation.storage_scope == "global"
+    )
+
+    ptr_1 = T.Ptr["float32", "shared"]
+    assert (
+        isinstance(ptr_1, tir.Var)
+        and ptr_1.dtype == "handle"
+        and isinstance(ptr_1.type_annotation, ir.PointerType)
+        and ptr_1.type_annotation.element_type == ir.PrimType("float32")
+        and ptr_1.type_annotation.storage_scope == "shared"
+    )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 0d4caa54ff76537000c7ac981f44721186312bbe Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 25 Oct 2022 07:10:47 -0700
Subject: [PATCH 431/704] [docs] Add 'Edit on GitHub' button (#13183)

This changes the 'View Source' button on the top right of each doc page
to a link that goes to a GitHub web editor for that docs page and
automatically routes to the right source of truth, be it `.rst` or
`.py`.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 .../install/ubuntu_install_python_package.sh  |  2 +-
 docs/conf.py                                  | 28 +++++++++++++++++--
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
index 9fee9d01425c..757ad0228c5d 100755
--- a/docker/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -34,7 +34,7 @@ pip3 install --upgrade \
     Pillow==9.1.0 \
     psutil \
     pytest \
-    git+https://github.com/tlc-pack/tlcpack-sphinx-addon.git@545450acaf0ee4e2932d8c5d9ab6e321d0bc86c8 \
+    git+https://github.com/tlc-pack/tlcpack-sphinx-addon.git@768ec1dce349fe4708f6ad68be1ebb3f3dabafa1 \
     pytest-profiling \
     pytest-xdist \
     pytest-rerunfailures==10.2 \
diff --git a/docs/conf.py b/docs/conf.py
index 0767ccf82e70..592d149c4ce4 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -30,12 +30,10 @@
 # All configuration values have a default; values that are commented out
 # serve to show the default.
 import gc
-import importlib.util
 import inspect
 import os
 from pathlib import Path
-import shlex
-import subprocess
+import re
 import sys
 
 import sphinx_gallery
@@ -420,6 +418,24 @@ def force_gc(gallery_conf, fname):
     ],
 }
 
+
+def fixup_tutorials(original_url: str) -> str:
+    if "docs/tutorial" in original_url:
+        # tutorials true source is in Python or .txt files, but Sphinx only sees
+        # the generated .rst files so this maps them back to the source
+        if original_url.endswith("index.rst"):
+            # for index pages, go to the README files
+            return re.sub(
+                r"docs/tutorial/(.*)index\.rst", "gallery/tutorial/\\1README.txt", original_url
+            )
+        else:
+            # otherwise for tutorials, redirect to python files
+            return re.sub(r"docs/tutorial/(.*)\.rst", "gallery/tutorial/\\1.py", original_url)
+    else:
+        # do nothing for normal non-tutorial .rst files
+        return original_url
+
+
 html_context = {
     "footer_copyright": footer_copyright,
     "footer_note": footer_note,
@@ -428,6 +444,12 @@ def force_gc(gallery_conf, fname):
     "header_logo": header_logo,
     "header_logo_link": header_logo_link,
     "version_prefixes": ["main", "v0.8.0/", "v0.9.0/", "v0.10.0/"],
+    "display_github": True,
+    "github_user": "apache",
+    "github_repo": "tvm",
+    "github_version": "main/docs/",
+    "theme_vcs_pageview_mode": "edit",
+    "edit_link_hook_fn": fixup_tutorials,
 }
 
 # add additional overrides

From e41d0ed6ebe624b3eb8fea736856b481746403c5 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Tue, 25 Oct 2022 10:00:40 -0700
Subject: [PATCH 432/704] [Relay] Rewrite division by constant to multiply
 (#13182)

* [Relay] Rewrite division by constant to multiply

Convert division by a scalar constant into multiplication by the inverse
of the constant. Multiplication is faster than division and also allow
for more optimization opportunities. Only applies to float32 and
float64.

* formatting

* handle division by zero

* handle float16
---
 python/tvm/relay/transform/transform.py       |  7 +-
 src/relay/transforms/div_to_mul.cc            | 86 +++++++++++++++++++
 .../fake_quantization_to_integer.cc           |  2 +-
 tests/python/unittest/test_div_to_mul.py      | 31 +++++++
 4 files changed, 124 insertions(+), 2 deletions(-)
 create mode 100644 src/relay/transforms/div_to_mul.cc
 create mode 100644 tests/python/unittest/test_div_to_mul.py

diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index 196f7ef81293..c1f184671780 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -27,8 +27,8 @@
 from tvm import relay, te
 from tvm.runtime import ndarray as _nd
 
-from . import _ffi_api
 from ..backend.utils import mangle_module_name
+from . import _ffi_api
 
 
 def build_config(opt_level=2, required_pass=None, disabled_pass=None, trace=None):
@@ -1484,3 +1484,8 @@ def CollagePartition(config, cost_estimator=None):
         cost_estimator = relay.collage.CostEstimator()
 
     return _ffi_api.CollagePartition(config, cost_estimator)
+
+
+def DivToMul():
+    """Transform division by a constant to multiplication by the inverse of the constant"""
+    return _ffi_api.DivToMul()
diff --git a/src/relay/transforms/div_to_mul.cc b/src/relay/transforms/div_to_mul.cc
new file mode 100644
index 000000000000..42983c520682
--- /dev/null
+++ b/src/relay/transforms/div_to_mul.cc
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/runtime/builtin_fp16.h>
+
+#include "pattern_utils.h"
+
+namespace tvm {
+namespace relay {
+
+class DivToMulRewrite : public MixedModeMutator {
+  Expr Rewrite_(const CallNode* pre, const Expr& post) final {
+    if (const CallNode* call_node = post.as<CallNode>()) {
+      if (call_node->op == Op::Get("divide")) {
+        auto rhs = call_node->args[1].as<ConstantNode>();
+        if (rhs != nullptr) {
+          auto inv =
+              runtime::NDArray::Empty(rhs->data.Shape(), rhs->data.DataType(), rhs->data->device);
+          std::string dtype = DLDataType2String(rhs->data.DataType());
+          if (dtype == "float32") {
+            float rhs_val = static_cast<float*>(rhs->data->data)[0];
+            // Check for division by zero
+            if (rhs_val == 0.) {
+              return post;
+            }
+            static_cast<float*>(inv->data)[0] = 1. / rhs_val;
+          } else if (dtype == "float64") {
+            double rhs_val = static_cast<double*>(rhs->data->data)[0];
+            // Check for division by zero
+            if (rhs_val == 0.) {
+              return post;
+            }
+            static_cast<double*>(inv->data)[0] = 1. / rhs_val;
+          } else if (dtype == "float16") {
+            // Do f16 math in f32
+            float rhs_val = __gnu_h2f_ieee(static_cast<uint16_t*>(rhs->data->data)[0]);
+            // Check for division by zero
+            if (rhs_val == 0.) {
+              return post;
+            }
+            static_cast<uint16_t*>(inv->data)[0] = __gnu_f2h_ieee(1. / rhs_val);
+          } else {
+            // Cannot do 1/int because it will truncate
+            return post;
+          }
+          return Multiply(call_node->args[0], Constant(inv));
+        }
+      }
+    }
+    return post;
+  }
+};
+
+namespace transform {
+
+Pass DivToMul() {
+  runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
+      [=](Function f, IRModule m, PassContext pc) {
+        return Downcast<Function>(DivToMulRewrite().Mutate(f));
+      };
+  return CreateFunctionPass(pass_func, 0, "DivToMul", {"InferType", "FoldConstant"});
+}
+
+TVM_REGISTER_GLOBAL("relay._transform.DivToMul").set_body_typed(DivToMul);
+
+}  // namespace transform
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/transforms/fake_quantization_to_integer.cc b/src/relay/transforms/fake_quantization_to_integer.cc
index 0b9daed896c3..eb176df5c978 100644
--- a/src/relay/transforms/fake_quantization_to_integer.cc
+++ b/src/relay/transforms/fake_quantization_to_integer.cc
@@ -542,7 +542,7 @@ Pass FakeQuantizationToInteger(bool hard_fail, bool use_qat) {
       [=](Function f, IRModule m, PassContext pc) {
         return Downcast<Function>(FakeQuantizationToInteger(f, m, hard_fail, use_qat));
       };
-  return CreateFunctionPass(pass_func, 0, "FakeQuantizationToInteger", {"InferType"});
+  return CreateFunctionPass(pass_func, 0, "FakeQuantizationToInteger", {"InferType", "DivToMul"});
 }
 
 TVM_REGISTER_GLOBAL("relay._transform.FakeQuantizationToInteger")
diff --git a/tests/python/unittest/test_div_to_mul.py b/tests/python/unittest/test_div_to_mul.py
new file mode 100644
index 000000000000..60c67ae2499c
--- /dev/null
+++ b/tests/python/unittest/test_div_to_mul.py
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+from tvm import relay
+import pytest
+import numpy as np
+
+
+@pytest.mark.parametrize("dtype, rtol", [("float16", 1e-3), ("float32", 1e-7), ("float64", 1e-12)])
+def test_div_to_mul(dtype, rtol):
+    x = relay.var("x", relay.TensorType((), dtype))
+    y = relay.Constant(tvm.nd.array(np.array([1.5]).astype(dtype)))
+    z = x / y
+    mod = tvm.IRModule.from_expr(z)
+    transformed = relay.transform.DivToMul()(mod)
+    assert transformed["main"].body.op.name == "multiply"
+    np.testing.assert_allclose(transformed["main"].body.args[1].data.numpy()[0], 1 / 1.5, rtol=rtol)

From 23c2909f2917fe9fbccad8e440199e5e4a106ba9 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <73191103+jverma-quic@users.noreply.github.com>
Date: Tue, 25 Oct 2022 14:04:13 -0500
Subject: [PATCH 433/704] [Hexagon] Add support for instrumentation based
 profiling for Hexagon (#12971)

* [Hexagon] Add support for instrumentation based profiling for Hexagon

This's done by instrumenting the code with profiling builtin calls using a TIR pass.
During codegen, these builtin calls are replaced with the calls to a hexagon specific
handler which records the runtime information into a buffer. This buffer is written
into a JSON file ('lwp.json') which is processed to construct function and loop-level
profiling information as a csv file.

At a high-level, this PR makes the following changes:

1) Add a TIR pass (src/tir/transforms/profile_instrumentation.cc) to instrument the
functions and loops with profilging builtins.
2) Hexagon codegen changes to replace profilng builtin calls with the call to Hexagon
specific handler. This handler record the runtime data into a buffer. For all other
targets, these builtin calls are ignored.
3) Add API to RPC Launcher to get the profiling data as a JSON file
4) A python script to process the profiling data and construct a CSV file
5) Add TVM script based unit tests to test and demonstrate various profiling config
flags: tests/python/unittest/test_tir_transform_profiling_instr.py
6) Adds two tests in tests/python/contrib/test_hexagon/test_launcher.py to demonstrate
necessary changes to enable profiling and to collect and process runtime data.

For additional details, please refer to src/runtime/hexagon/profiler/README.md

* Fix typos

* Update python/tvm/contrib/hexagon/build.py

Add type hint

Co-authored-by: Tristan Konolige <tristan.konolige@gmail.com>

* Address review comments

Simplify the interface to the lightweight profiling.

* Ignore profile builtins if llvm version < 15.0

* Add src/runtime/hexagon/profiler/lwp_handler.S to allowed list

* Address reformatting issues

* Fix pylint errors

* Address remaining linter failures

* clang-format issue

* Fix builtin names

* Resolve test failure for the simulator run

* Allow for the tests to provide .so name

Co-authored-by: Tristan Konolige <tristan.konolige@gmail.com>
---
 apps/hexagon_launcher/README.md               |  40 ++
 .../cmake/hexagon/CMakeLists.txt              |   5 +-
 apps/hexagon_launcher/launcher_android.cc     |   9 +-
 apps/hexagon_launcher/launcher_core.h         |   3 +
 apps/hexagon_launcher/launcher_hexagon.cc     |  10 +-
 apps/hexagon_launcher/launcher_main.cc        |  14 +-
 apps/hexagon_launcher/launcher_rpc.idl        |   2 +-
 cmake/modules/Hexagon.cmake                   |   6 +
 cmake/modules/HexagonSDK.cmake                |   6 +
 include/tvm/tir/builtin.h                     |  10 +
 include/tvm/tir/transform.h                   |   6 +
 python/tvm/contrib/hexagon/build.py           |  71 ++++
 .../tvm/contrib/hexagon/hexagon_profiler.py   | 119 ++++++
 .../hexagon/profiling/process_lwp_data.py     | 388 ++++++++++++++++++
 python/tvm/contrib/hexagon/session.py         |   5 +
 python/tvm/tir/transform/transform.py         |  11 +
 src/driver/driver_api.cc                      |  11 +
 src/runtime/dso_library.cc                    |  14 +
 src/runtime/hexagon/profiler/README.md        |  99 +++++
 src/runtime/hexagon/profiler/lwp_handler.S    | 115 ++++++
 src/runtime/hexagon/profiler/prof_utils.cc    |  78 ++++
 src/runtime/hexagon/profiler/prof_utils.h     |  30 ++
 src/runtime/hexagon/rpc/hexagon/rpc_server.cc |  13 +
 .../hexagon/rpc/simulator/rpc_server.cc       |  13 +
 src/target/llvm/codegen_hexagon.cc            |  23 ++
 src/target/llvm/codegen_llvm.cc               |   4 +
 src/tir/op/builtin.cc                         |   6 +
 src/tir/transforms/profile_instrumentation.cc | 293 +++++++++++++
 tests/lint/check_file_type.py                 |   1 +
 .../contrib/test_hexagon/test_launcher.py     | 158 +++++++
 .../test_tir_transform_profiling_instr.py     | 340 +++++++++++++++
 31 files changed, 1892 insertions(+), 11 deletions(-)
 create mode 100755 python/tvm/contrib/hexagon/hexagon_profiler.py
 create mode 100644 python/tvm/contrib/hexagon/profiling/process_lwp_data.py
 create mode 100644 src/runtime/hexagon/profiler/README.md
 create mode 100644 src/runtime/hexagon/profiler/lwp_handler.S
 create mode 100644 src/runtime/hexagon/profiler/prof_utils.cc
 create mode 100644 src/runtime/hexagon/profiler/prof_utils.h
 create mode 100644 src/tir/transforms/profile_instrumentation.cc
 create mode 100644 tests/python/unittest/test_tir_transform_profiling_instr.py

diff --git a/apps/hexagon_launcher/README.md b/apps/hexagon_launcher/README.md
index cc433f245759..943a6266a3b4 100644
--- a/apps/hexagon_launcher/README.md
+++ b/apps/hexagon_launcher/README.md
@@ -189,6 +189,46 @@ lowered = tvm.relay.build(
 lowered.export_library("model-aot.so", tvm.contrib.hexagon.link_shared)
 ```
 
+
+## Profiling using hexagon launcher
+
+### Enabling lightweight profiling (LWP) instrumentation
+
+This profiling option can be used to get function and loop level processor cycles.
+This needs to be enabled explicitly while compiling a model. For example:
+
+```
+with tvm.transform.PassContext(config={'tir.instrument_lwp':True} ):
+    lib = relay.build(...)
+```
+
+Here, `instrument_lwp` is used to enable the tir pass which instruments the code with the builtin calls.
+
+During codegen, profiling builtin calls can be replaced with a target specific handler to record runtime
+information into a buffer. This buffer is written into a JSON file which is processed to construct
+function and loop level profiling information.
+
+To generate LWP JSON file, add `--gen_lwp_json` flag to launcher_android:
+
+```
+./launcher_android --in_config input.json --out_config output.json --gen_lwp_json
+```
+
+Please note that `--gen_lwp_json` flag by itself doesn't enable profiling and is only used to dump
+the profiling data into a json file called lwp.json. This file will be created at the same location
+on the device where launcher_android is executed from. To generate the data, profiling instrumentation
+must be enabled while compiling a model as mentioned above.
+
+Use this command to pull `lwp.json` from the device:
+
+```
+adb -s <DEVICE-ID> pull /path/to/lwp.json
+```
+
+**Note:** Please refer to src/runtime/hexagon/profiler/README.md for information on how
+to enable profiling using Hexagon RPC launcher and also to learn about additional profiling related
+config options.
+
 # Disclaimer
 
 The launcher does not perform any correctness verification. In order to verify
diff --git a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
index 5fae6b0a4099..e8bd67dde7a2 100644
--- a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
+++ b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
@@ -16,7 +16,7 @@
 # under the License.
 
 cmake_minimum_required(VERSION 3.2)
-project(HexagonLauncherRPCSkel C CXX)
+project(HexagonLauncherRPCSkel C CXX ASM)
 
 include("${CMAKE_CURRENT_SOURCE_DIR}/../HexagonLauncher.cmake")
 # From the include above get
@@ -68,11 +68,14 @@ set(SKEL_SRCS
   "${LAUNCHER_SRC}/launcher_core.cc"
   "${LAUNCHER_SRC}/launcher_hexagon.cc"
 )
+set(PROFILER_DIR "${TVM_SOURCE_DIR}/src/runtime/hexagon/profiler")
 
 add_library(launcher_rpc_skel SHARED
   "${LAUNCHER_RPC_H}"
   "${LAUNCHER_RPC_SKEL_C}"
   "${SKEL_SRCS}"
+  "${PROFILER_DIR}/prof_utils.cc"
+  "${PROFILER_DIR}/lwp_handler.S"
 )
 
 ExternalProject_Add(static_hexagon_tvm_runtime
diff --git a/apps/hexagon_launcher/launcher_android.cc b/apps/hexagon_launcher/launcher_android.cc
index 008e4fdfe1c4..34db0bdacb60 100644
--- a/apps/hexagon_launcher/launcher_android.cc
+++ b/apps/hexagon_launcher/launcher_android.cc
@@ -55,7 +55,8 @@ AEEResult set_remote_stack_size(int size) {
 }
 
 struct RPCChannel : public ExecutionSession {
-  explicit RPCChannel(const std::string& uri) {
+  explicit RPCChannel(const std::string& uri, bool gen_lwp_json = false)
+      : ExecutionSession(gen_lwp_json) {
     enable_unsigned_pd(true);
     set_remote_stack_size(128 * 1024);
 
@@ -127,7 +128,7 @@ struct RPCChannel : public ExecutionSession {
   }
 
   bool run(uint64_t* pcycles, uint64_t* usecs) override {
-    AEEResult rc = launcher_rpc_run(handle, pcycles, usecs);
+    AEEResult rc = launcher_rpc_run(handle, pcycles, usecs, gen_lwp_json);
     if (rc != AEE_SUCCESS) {
       std::cout << "error running model: " << std::hex << rc << '\n';
     }
@@ -158,8 +159,8 @@ struct RPCChannel : public ExecutionSession {
   std::vector<void*> allocations;
 };
 
-ExecutionSession* create_execution_session() {
-  auto* session = new RPCChannel(launcher_rpc_URI CDSP_DOMAIN);
+ExecutionSession* create_execution_session(bool gen_lwp_json) {
+  auto* session = new RPCChannel(launcher_rpc_URI CDSP_DOMAIN, gen_lwp_json);
   if (session->handle == -1) {
     delete session;
     session = nullptr;
diff --git a/apps/hexagon_launcher/launcher_core.h b/apps/hexagon_launcher/launcher_core.h
index a32bf937af58..da0dfcbbd5a6 100644
--- a/apps/hexagon_launcher/launcher_core.h
+++ b/apps/hexagon_launcher/launcher_core.h
@@ -94,6 +94,8 @@ struct Model {
 };
 
 struct ExecutionSession {
+  explicit ExecutionSession(bool lwp_json = false) : gen_lwp_json(lwp_json) {}
+
   template <typename T>
   T* alloc(size_t bytes, size_t align = 1) {
     return reinterpret_cast<T*>(alloc_mem(bytes, align));
@@ -111,6 +113,7 @@ struct ExecutionSession {
   virtual bool get_num_outputs(int* num_outputs) = 0;
   virtual bool get_output(int output_idx, tensor_meta* output_meta, int meta_size,
                           void* output_data, int data_size) = 0;
+  bool gen_lwp_json = false;
 };
 
 bool read_model_config(const std::string& file_name, ModelConfig* model_config);
diff --git a/apps/hexagon_launcher/launcher_hexagon.cc b/apps/hexagon_launcher/launcher_hexagon.cc
index 03524661c4e6..2692caf90e66 100644
--- a/apps/hexagon_launcher/launcher_hexagon.cc
+++ b/apps/hexagon_launcher/launcher_hexagon.cc
@@ -35,6 +35,7 @@ extern "C" {
 #include "launcher_rpc.h"
 
 static std::unique_ptr<Model> TheModel;
+bool WriteLWPOutput(const std::string&);
 
 static AEEResult error_too_small(const std::string& func_name, const std::string& value_name,
                                  int given, int needed) {
@@ -203,7 +204,7 @@ AEEResult __QAIC_HEADER(launcher_rpc_get_output)(remote_handle64 handle, int out
 }
 
 AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcycles,
-                                          uint64_t* usecs) {
+                                          uint64_t* usecs, int gen_lwp_json) {
   if (!TheModel) {
     // No model created.
     LOG(ERROR) << __func__ << ": no model created";
@@ -220,5 +221,12 @@ AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcyc
   *pcycles = pc_end - pc_begin;
   *usecs = us_end - us_begin;
 
+  if (gen_lwp_json) {
+    if (!WriteLWPOutput("lwp.json")) {
+      LOG(ERROR) << "ERROR: failed to generate lwp json file";
+      return AEE_EFAILED;
+    }
+  }
+
   return AEE_SUCCESS;
 }
diff --git a/apps/hexagon_launcher/launcher_main.cc b/apps/hexagon_launcher/launcher_main.cc
index 163d582db440..1ef3b5d2ff3c 100644
--- a/apps/hexagon_launcher/launcher_main.cc
+++ b/apps/hexagon_launcher/launcher_main.cc
@@ -27,12 +27,14 @@
 #include "launcher_core.h"
 #include "launcher_util.h"
 
-ExecutionSession* create_execution_session();
+ExecutionSession* create_execution_session(bool gen_lwp_json);
 
-int parse_command_line(int argc, char* argv[], std::string* in_path, std::string* out_path) {
+int parse_command_line(int argc, char* argv[], std::string* in_path, std::string* out_path,
+                       bool* gen_lwp_json) {
   static option long_options[] = {
       {"in_config", required_argument, nullptr, 0},
       {"out_config", required_argument, nullptr, 0},
+      {"gen_lwp_json", optional_argument, nullptr, 0},
   };
 
   bool show_usage = false;
@@ -49,6 +51,9 @@ int parse_command_line(int argc, char* argv[], std::string* in_path, std::string
       case 1:
         *out_path = std::string(optarg);
         break;
+      case 2:
+        *gen_lwp_json = true;
+        break;
     }
   }
   if (in_path->empty() || out_path->empty() || show_usage) {
@@ -61,7 +66,8 @@ int parse_command_line(int argc, char* argv[], std::string* in_path, std::string
 
 int main(int argc, char* argv[]) {
   std::string in_path, out_path;
-  if (parse_command_line(argc, argv, &in_path, &out_path) != 0) {
+  bool gen_lwp_json;
+  if (parse_command_line(argc, argv, &in_path, &out_path, &gen_lwp_json) != 0) {
     return 1;
   }
 
@@ -70,7 +76,7 @@ int main(int argc, char* argv[]) {
     return 1;
   }
 
-  ExecutionSession* session_ptr = create_execution_session();
+  ExecutionSession* session_ptr = create_execution_session(gen_lwp_json);
   if (session_ptr == nullptr) {
     return 1;
   }
diff --git a/apps/hexagon_launcher/launcher_rpc.idl b/apps/hexagon_launcher/launcher_rpc.idl
index 6677108a76f0..27e5d1d15d68 100644
--- a/apps/hexagon_launcher/launcher_rpc.idl
+++ b/apps/hexagon_launcher/launcher_rpc.idl
@@ -29,5 +29,5 @@ interface launcher_rpc : remote_handle64 {
   AEEResult set_input(in long input_idx, in buffer input_meta, in buffer input_value);
   AEEResult get_num_outputs(rout long num_outputs);
   AEEResult get_output(in long output_idx, rout buffer output_meta, rout buffer output_value);
-  AEEResult run(rout uint64_t pcycles, rout uint64_t usecs);
+  AEEResult run(rout uint64_t pcycles, rout uint64_t usecs, in long gen_lwp_json);
 };
diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index 399623ef1c3e..735d21e492b5 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -281,10 +281,14 @@ if(USE_HEXAGON_RPC)
       # TODO(masahi): Remove rpc_local_session.cc after verifying that things work without it
       "${TVMRT_SOURCE_DIR}/rpc/rpc_local_session.cc"
     )
+    set(HEXAGON_PROFILER_DIR "${TVMRT_SOURCE_DIR}/hexagon/profiler")
     # Add the hardware-specific RPC code into the skel library.
+    set_property(SOURCE ${HEXAGON_PROFILER_DIR}/lwp_handler.S PROPERTY LANGUAGE C)
     add_library(hexagon_rpc_skel SHARED
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/hexagon/rpc_server.cc"
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/hexagon_rpc_skel.c"
+      "${HEXAGON_PROFILER_DIR}/prof_utils.cc"
+      "${HEXAGON_PROFILER_DIR}/lwp_handler.S"
     )
     target_include_directories(hexagon_rpc_skel
       SYSTEM PRIVATE "${TVMRT_SOURCE_DIR}/hexagon/rpc"
@@ -293,6 +297,8 @@ if(USE_HEXAGON_RPC)
     # executed via run_main_on_sim.
     add_library(hexagon_rpc_sim SHARED
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/simulator/rpc_server.cc"
+      "${HEXAGON_PROFILER_DIR}/prof_utils.cc"
+      "${HEXAGON_PROFILER_DIR}/lwp_handler.S"
     )
     target_link_libraries(hexagon_rpc_sim
       -Wl,--whole-archive tvm_runtime -Wl,--no-whole-archive
diff --git a/cmake/modules/HexagonSDK.cmake b/cmake/modules/HexagonSDK.cmake
index 173f0f3b2d67..ddb158cad95e 100644
--- a/cmake/modules/HexagonSDK.cmake
+++ b/cmake/modules/HexagonSDK.cmake
@@ -157,9 +157,15 @@ function(_get_hexagon_sdk_property_impl
     if(_property STREQUAL "SDK_INCLUDE")
       set(_dirs "${_hexagon_sdk_root}/incs" "${_hexagon_sdk_root}/incs/stddef")
     elseif(_property STREQUAL "QURT_INCLUDE")
+      # Set the Hexagon arch directory for runtime linker.
+      set(_rtld_dir "hexagon_toolv84_${_hexagon_arch}")
+      if(_hexagon_arch STREQUAL "v69")
+        set(_rtld_dir "hexagon_toolv84_v68") # Use hexagon_toolv84_v68 for v69
+      endif()
       set(_dirs
         "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/include/posix"
         "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/include/qurt"
+        "${_hexagon_sdk_root}/ipc/fastrpc/rtld/ship/${_rtld_dir}"
       )
     elseif(_property STREQUAL "QURT_LIB")
       set(_dirs "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/lib/pic")
diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index a1a97595bfd8..9f6b7f9ce5d1 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -747,6 +747,16 @@ TVM_DLL const Op& assume();
  */
 TVM_DLL const Op& undef();
 
+/*!
+ * \brief Profiling intrinsic
+ */
+TVM_DLL const Op& start_profile_intrinsic();
+
+/*!
+ * \brief Profiling intrinsic
+ */
+TVM_DLL const Op& end_profile_intrinsic();
+
 /*! \brief The kind of structure field info used in intrinsic */
 enum TVMStructFieldKind : int {
   // array head address
diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
index e31919fbd223..48372565469b 100644
--- a/include/tvm/tir/transform.h
+++ b/include/tvm/tir/transform.h
@@ -690,6 +690,12 @@ TVM_DLL Pass RemoveWeightLayoutRewriteBlock(bool skip_ndarray_rewrite = false);
  */
 TVM_DLL Pass ManifestSharedMemoryLocalStage();
 
+/*!
+ * \brief Insert intrinsic calls to instrument function and loop level profiling.
+ * \return The pass.
+ */
+TVM_DLL Pass InstrumentProfileIntrinsics();
+
 }  // namespace transform
 }  // namespace tir
 }  // namespace tvm
diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py
index 8105e6e716c0..c0e6439d0357 100644
--- a/python/tvm/contrib/hexagon/build.py
+++ b/python/tvm/contrib/hexagon/build.py
@@ -32,6 +32,7 @@
 from typing import Union
 
 import tvm
+from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
 from ..._ffi import libinfo
 from .session import Session
 
@@ -336,6 +337,29 @@ def get_graph_debug_executor(
         """
         return session.get_graph_debug_executor(graph_json, module, dump_root=dump_root)
 
+    @abc.abstractmethod
+    def get_profile_output(
+        self,
+        hex_profiler: HexagonProfiler,
+        session: Session,
+    ) -> str:
+        """Extract profile output.
+
+        Parameters
+        ----------
+        hex_profiler : HexagonProfiler
+            HexagonProfiler object that contains the profiling related information.
+        session : Session
+            Remote session. The session must be established (via __enter__)
+            prior to calling this function.
+
+        Returns
+        -------
+        profile_data : str
+            Path of the profiling data file
+        """
+        ...
+
 
 class HexagonLauncherAndroid(HexagonLauncherRPC):
     """Hexagon Launcher for Android."""
@@ -392,6 +416,7 @@ def _copy_to_remote(
         self, local_path: Union[str, pathlib.Path], remote_path: Union[str, pathlib.Path]
     ):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
+
         _check_call_verbose(self._adb_device_sub_cmd + ["push", str(local_path), str(remote_path)])
 
     def _create_remote_directory(self, remote_path: Union[str, pathlib.Path]) -> pathlib.Path:
@@ -629,6 +654,32 @@ def stop_server(self):
         if not self._hexagon_debug:
             self.cleanup_directory()
 
+    def get_profile_output(
+        self,
+        hex_profiler: HexagonProfiler,
+        session: Session,
+    ):
+        """Abstract method implementation. See description in HexagonLauncherRPC."""
+        profile_data = ""
+        if hex_profiler.is_lwp_enabled():
+            temp_dir = hex_profiler.get_temp_dir()
+            remote_path = hex_profiler.get_remote_path()
+            if not temp_dir:
+                raise RuntimeError("tempdir not passed")
+            fname = "lwp.json"
+            out_path = os.path.join(remote_path, fname)
+            profile_data = temp_dir.relpath(fname)
+            ret = session.get_profile_output(hex_profiler.get_mode(), fname)
+            if ret:
+                subprocess.check_call(self._adb_device_sub_cmd + ["pull", out_path, profile_data])
+            else:
+                raise RuntimeError("Error generating profile output")
+        elif hex_profiler.profiling_mode == "etm":
+            hex_profiler.pull_files_for_etm_processing(self._workspace)
+        else:
+            raise RuntimeError("Profiling not enabled")
+        return profile_data
+
 
 class HexagonLauncherSimulator(HexagonLauncherRPC):
     """Hexagon Launcher for Hexagon simulator."""
@@ -735,6 +786,26 @@ def stop_server(self):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
         self._server_process.terminate()
 
+    def get_profile_output(
+        self,
+        hex_profiler: HexagonProfiler,
+        session: Session,
+    ):
+        """Abstract method implementation. See description in HexagonLauncherRPC."""
+        profile_data = ""
+        if hex_profiler.is_lwp_enabled():
+            fname = "lwp.json"
+            profile_data = f"{self._workspace}/{fname}"
+            ret = session.get_profile_output(hex_profiler.get_mode(), fname)
+            if not ret:
+                raise RuntimeError("Error generating profile output")
+        elif hex_profiler.profiling_mode == "etm":
+            raise RuntimeError("ETM Profiling not supported on the simulator")
+        else:
+            raise RuntimeError("Profiling not enabled")
+
+        return profile_data
+
 
 # https://stackoverflow.com/a/52872579/2689797
 def _is_port_in_use(port: int) -> bool:
diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py
new file mode 100755
index 000000000000..9a5df3d9b99a
--- /dev/null
+++ b/python/tvm/contrib/hexagon/hexagon_profiler.py
@@ -0,0 +1,119 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Define HexagonProfiler class to enable profiling for Hexagon"""
+
+import os
+import subprocess
+from tvm.ir.transform import PassContext
+from tvm.contrib.hexagon.profiling.process_lwp_data import process_lwp_output
+from tvm.relay.backend.executor_factory import ExecutorFactoryModule
+from tvm.contrib import utils
+
+
+class HexagonProfiler:
+    """Hexagon Profiler"""
+
+    def __init__(
+        self, dso_binary: str, module: ExecutorFactoryModule, hexagon_server_process, enable_debug
+    ):
+        """Configure HexagonProfiler"""
+        # Save test .so to process profiling data
+        self._temp_dir = utils.tempdir(keep_for_debug=enable_debug)
+        self._dso_binary_path = self._temp_dir.relpath(dso_binary)
+        module.get_lib().save(self._dso_binary_path)
+
+        self._android_serial_number = os.environ.get("ANDROID_SERIAL_NUMBER")
+        self._remote_path = ""
+        self._logcat_path = ""
+
+        self._profiling_mode = None
+        config = PassContext.current().config
+        if self._android_serial_number is None:
+            raise RuntimeError("ANDROID_SERIAL_NUMBER must be set for profiling")
+
+        if ("tir.instrument_lwp", True) in config.items():
+            # Set profiling mode
+            self._profiling_mode = "lwp"
+
+            if self._android_serial_number != "simulator":
+                # Clear the logcat buffer and create a child process to redirect logcat output
+                # into a file.
+                launcher = hexagon_server_process["launcher"]
+                subprocess.check_call(launcher._adb_device_sub_cmd + ["logcat", "-c"])
+                self._logcat_path = self._temp_dir.relpath("logcat.log")
+                self._fo = open(self._logcat_path, "w")
+                self._proc = subprocess.Popen(
+                    launcher._adb_device_sub_cmd + ["logcat"], stdout=self._fo
+                )
+
+                # Get the remote workspace on the device from where the lwp data needs to be copied.
+                self._remote_path = launcher._workspace
+
+        if self._profiling_mode is None:
+            raise RuntimeError("Profiling mode was not set or was not a valid one.")
+
+    def get_mode(self):
+        return self._profiling_mode
+
+    def is_lwp_enabled(self):
+        return self._profiling_mode == "lwp"
+
+    def get_temp_dir(self):
+        return self._temp_dir
+
+    def get_remote_path(self):
+        return self._remote_path
+
+    def get_profile_output(self, hexagon_launcher, hexagon_session):
+        """Get runtime profiling data"""
+        prof_out = hexagon_launcher.get_profile_output(self, hexagon_session)
+
+        print("lwp json can be found at -- ", prof_out)
+
+        # Process lightweight profiling output into an easily readable csv file
+        # The post-processing requires following parameters:
+        # 1) Path of the binary file
+        # 2) android_serial_number
+        # 3) Path of the lwp json file (lwp.json) which gets created in the current directory
+        # 4) Path to the run log depending on the environment:
+        #    a) For on-device runs:
+        #       Use logcat output as the run log
+        #    b) For simulator runs:
+        #       Use "stdout.txt" as the run log. There is no need to specify the full path to
+        #       "stdout.txt" as it will be inferred based on 'prof_out' location.
+        # 5) lwp processed output file -  "lwp.csv"
+        #
+        lwp_csv = self._temp_dir.relpath("lwp.csv")
+        if self._android_serial_number == "simulator":
+            process_lwp_output(
+                self._dso_binary_path, self._android_serial_number, prof_out, "stdout.txt", lwp_csv
+            )
+        else:
+            # For on-device run
+            self._proc.kill()  # End the child process for logcat
+            self._fo.close()
+            if os.path.exists(self._logcat_path):
+                process_lwp_output(
+                    self._dso_binary_path,
+                    self._android_serial_number,
+                    prof_out,
+                    self._logcat_path,
+                    lwp_csv,
+                )
+            else:
+                raise RuntimeError("Error processing lwp output - missing logcat file")
diff --git a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
new file mode 100644
index 000000000000..eb92228b7cf3
--- /dev/null
+++ b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
@@ -0,0 +1,388 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import json
+import csv
+import subprocess
+import argparse
+import os
+from re import search, compile
+from collections import OrderedDict
+
+ENABLE_DEBUG = False
+"""
+Process lightweight profiling output and generate a CSV file with processor
+cycles for the instrumented functions and loops.
+
+Please note that some assumptions have been made while processing
+the lightweight profiling output. They are as follows:
+
+1) We don't expect profiled functions to call another profiled function.
+  This constraint can be relaxed if needed but it simplifies the processing
+  significantly without introducing any limitations for our use case.
+2) For now, it's also assumed that every unique section (loop) ID has same start
+  and end offset which will not be true while a loop gets unrolled as it will
+  create multiple profiling section with the same ID. The current
+  implementation doesn't handle this case.
+
+"""
+
+
+def get_func_info(model_so):
+    """Get all the .text sections along with their start and end offset values"""
+    hexagon_nm_path = os.environ["HEXAGON_TOOLCHAIN"] + "/bin/hexagon-nm"
+    out = subprocess.Popen(
+        [hexagon_nm_path, "--print-size", model_so],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+    )
+    stdo, stde = out.communicate()
+    stdo = stdo.decode("utf-8")
+
+    func_info = []
+    for l in stdo.split("\n"):
+        info = {}
+        if search(" (T|t) ", l):  # If .text section
+            parts = l.split(" ")
+            assert len(parts) == 4
+            info["start"] = int(parts[0], base=16)
+            info["end"] = int(parts[0], base=16) + int(parts[1], base=16)
+            info["name"] = parts[3]
+            func_info.append(info)
+
+    # Sort the entries in the increasing order of the start offset value.
+    func_info = sorted(func_info, key=lambda d: d["start"])
+
+    if ENABLE_DEBUG:
+        print("func_info :\n ")
+        for f in func_info:
+            print(f)
+    return func_info
+
+
+def find_func(func_info, offset):
+    """For a given offset, find the function it belongs to."""
+    fidx = 0
+    lidx = len(func_info) - 1
+    while fidx <= lidx:
+        midx = (fidx + lidx) // 2
+        ms = func_info[midx]["start"]
+        me = func_info[midx]["end"]
+        if fidx == lidx:
+            assert (
+                offset >= ms and offset <= me
+            ), f"Couldn't find a function for this offset: {offset}"
+            return fidx
+        else:
+            if offset > me:
+                fidx = midx + 1
+            elif offset < ms:
+                lidx = midx - 1
+            else:
+                return midx
+    assert False, "Possible mismatch between model .so and LWP data"
+
+
+def accumulate_cycles(overall_cycles, func_cycles, func_name):
+    """Accumulate function cycles"""
+    acc_cycles = overall_cycles[func_name]
+    for id in func_cycles:
+        assert id in acc_cycles, f"id [{id}] missing in the existing function record"
+        assert (
+            acc_cycles[id]["start"] == func_cycles[id]["start"]
+        ), "Offset value doesn't match with the existing function record."
+        acc_cycles[id]["cycles"] += func_cycles[id]["cycles"]
+        acc_cycles[id]["count"] += func_cycles[id]["count"]
+    overall_cycles.update({func_name: acc_cycles})
+    return overall_cycles
+
+
+def adjust_per_loop_counts(overall_cycles, data):
+    """
+    Use execution count and the number of entries recorded for each function/loop
+    to compute the overall cycles spent on them.
+    """
+    for func in overall_cycles:
+        func_cycles = overall_cycles[func]
+        for id in func_cycles:
+            exec_count = data["loop_counts"][id]
+            rec_count = func_cycles[id]["count"]
+            assert exec_count != 0, "Execution count should have been non-zero."
+            assert rec_count != 0, "Entry count should have been non-zero."
+            exec_cycles = ((int(func_cycles[id]["cycles"])) * exec_count) // rec_count
+            func_cycles[id]["cycles"] = exec_cycles
+            func_cycles[id]["count"] = exec_count
+        overall_cycles.update({func: OrderedDict(sorted(func_cycles.items()))})
+    return overall_cycles
+
+
+def create_csv_report(overall_cycles, fname):
+    """Create csv report"""
+    header = [
+        "function name",
+        "loop/function id",
+        "loop depth",
+        "start offset",
+        "end offset",
+        "pcycles",
+        "parent count",
+    ]
+    with open(fname, "w") as f:
+        writer = csv.writer(f)
+        writer.writerow(header)
+        for func in overall_cycles:
+            func_cycles = overall_cycles[func]
+            data = []
+            root = -1
+            outer_most = -1
+            for key, value in func_cycles.items():
+                if value["parent"] == -1:
+                    assert root == -1, "Can't have multiple root nodes."
+                    root = key
+
+                data.append(func)
+                data.append(key)
+                if value["parent"] == -1:
+                    data.append("-")  # Total cycles over all invocations of this function.
+                elif value["parent"] == root:
+                    data.append(0)
+                    outer_most = key
+                else:
+                    if outer_most > -1:
+                        data.append(key - outer_most)
+                    else:
+                        data.append(key - value["parent"])
+                data.append(hex(value["start"]))
+                data.append(hex(value["end"]))
+                data.append(value["cycles"])
+                data.append(value["count"])
+                writer.writerow(data)
+                data.clear()
+
+
+def process_data(data, func_info, so_ld_addr):
+    """Process data"""
+    # Keep an ordered list of loop IDs as they are being visited. This is used
+    # to match entry and exit pairs. Once the function/loop is processed, it's
+    # removed from the list.
+    ordered_visited_list = []
+    # Store information regarding visited nodes as they are being processed. Once
+    # the function/loop is processed, it's removed from the set.
+    visited_set = {}
+    # Dictionary to store cycles for the entire model which is grouped into functions.
+    overall_cycles = {}
+    func_cycles = {}
+
+    func_idx = -1
+    func_name = ""
+    prev_func_name = ""
+    func_start = 0
+    func_end = 0
+    save_data = False
+    # Iterate over all the entries in the LWP data file and process them
+    # to construct a report.
+    for entry in data["entries"]:
+        id = entry["id"]
+        offset = entry["ret"] - so_ld_addr
+
+        # Recorded return address should fall within the function begin and end
+        # offsets. If not, find the function it belongs to.
+        if offset < func_start or offset > func_end:
+            prev_func_name = func_name
+            if ENABLE_DEBUG:
+                print("offset : ", offset)
+                print("id : ", id)
+
+            func_idx = find_func(func_info, offset)
+            func_name = func_info[func_idx]["name"]
+            func_start = func_info[func_idx]["start"]
+            func_end = func_info[func_idx]["end"]
+            if ENABLE_DEBUG:
+                print("func_name : ", func_name)
+
+            if save_data:
+                # overall_cycles = save_func_cycles(prev_func_name, overall_cycles, func_cycles, ordered_visited_list)
+                # Done processing the previous function, copy its info into 'overall_cycles'.
+                if prev_func_name not in overall_cycles:
+                    overall_cycles[prev_func_name] = func_cycles.copy()
+                else:
+                    # Accumulate cycles into existing function entry.
+                    overall_cycles = accumulate_cycles(overall_cycles, func_cycles, prev_func_name)
+                # We don't allow for fused operators (functions) calling another operator.
+                if ENABLE_DEBUG:
+                    print("ordered_visited_list : ", ordered_visited_list)
+
+                assert len(ordered_visited_list) == 0, (
+                    f"\nDone processing function [{prev_func_name}] but ordered_visited_list not empty.\n"
+                    f"\t Possible reasons -- \n"
+                    f"\t\t1) Mismatch between model .so and json file.\n"
+                    f"\t\t2) LWP buffer may have overflowed resulting into missing entries!"
+                )
+                func_cycles.clear()
+
+            save_data = True
+
+        if id not in visited_set:  # Found 'entry' record
+            visited_info = {"func_idx": func_idx, "ret": offset, "cyc": entry["cyc"]}
+            visited_set[id] = visited_info
+            ordered_visited_list.append(id)
+        else:  # Found 'exit' record
+            # This should be the last entry in the ordered_visited_list. If not, error out.
+            assert ordered_visited_list[-1] == id, (
+                "Problem with LWP output - Interleaved handler calls found."
+                f"Loop [{ordered_visited_list[-1]}] hasn't exited yet."
+            )
+            ordered_visited_list.pop()
+            entry_node = visited_set.pop(id)
+            assert (
+                entry_node["func_idx"] == func_idx
+            ), f'Error - Found under a different function name : {entry_node["func_idx"]}'
+            cycles = entry["cyc"] - entry_node["cyc"]
+            parent = -1
+            if ordered_visited_list:
+                parent = int(ordered_visited_list[-1])
+            if id in func_cycles:
+                fcycles = func_cycles[id]
+                fcycles["cycles"] += cycles
+                fcycles["count"] += 1
+                func_cycles[id] = fcycles
+            else:
+                func_cycles[id] = {
+                    "cycles": cycles,
+                    "start": entry_node["ret"],
+                    "end": offset,
+                    "parent": parent,
+                    "count": 1,
+                }
+
+    # Done processing the previous function, copy its info into 'overall_cycles'.
+    if func_name not in overall_cycles:
+        overall_cycles[func_name] = func_cycles.copy()
+    else:
+        # Accumulate cycles into existing function entry.
+        overall_cycles = accumulate_cycles(overall_cycles, func_cycles, func_name)
+    # We don't allow for fused operators (functions) calling another operator.
+    if ENABLE_DEBUG:
+        print("ordered_visited_list : ", ordered_visited_list)
+
+    assert len(ordered_visited_list) == 0, (
+        f"\nDone processing function [{prev_func_name}] but ordered_visited_list not empty.\n"
+        f"\t Possible reasons -- \n"
+        f"\t\t1) Mismatch between model .so and json file.\n"
+        f"\t\t2) LWP buffer may have overflowed resulting into missing entries!" % prev_func_name
+    )
+
+    overall_cycles = adjust_per_loop_counts(overall_cycles, data)
+    return overall_cycles
+
+
+def get_load_addr(binary_path: str, serial_number: str, lwp_json: str, run_log: str):
+    """Get load address of the binary file"""
+    if serial_number == "simulator":
+        basedir = os.path.dirname(lwp_json)
+        if run_log is None:
+            run_log = os.path.join(basedir, "stdout.txt")
+        else:
+            # If the directory name is specified for the run_log of the
+            # simulator (stdout.txt) then it must be same as lwp_json.
+            run_log_dir = os.path.dirname(run_log)
+            assert (
+                run_log_dir == "" or run_log_dir == basedir
+            ), f"stdout.txt and {os.path.basename(lwp_json)} must be in the same directory"
+            run_log = os.path.join(basedir, os.path.basename(run_log))
+        # To extract load address for the simulator run
+        pattern = compile(r"Model.*: (\w+):")
+    else:
+        # To extract load address for on-device run
+        binary_name = os.path.basename(binary_path)
+        pattern = compile(r"{}, len \w+, laddr (\w+)".format(binary_name))
+
+    with open(run_log, "r") as f:
+        lines = f.read()
+        a = pattern.search(lines)
+        load_addr = int(a.group(1), 16)
+    if ENABLE_DEBUG:
+        print("load_addr : ", load_addr)
+    return load_addr
+
+
+def process_lwp_output(
+    binary_path: str,
+    serial_number: str,
+    lwp_json: str,
+    run_log: str,
+    lwp_out: str,
+    enable_debug: bool = False,
+):
+    """Process lightweight profiling data"""
+    # Enable debug messages
+    global ENABLE_DEBUG
+    ENABLE_DEBUG = enable_debug
+
+    # Get load address for the binary
+    load_addr = get_load_addr(binary_path, serial_number, lwp_json, run_log)
+    # Opening JSON file
+    with open(lwp_json, "r") as f:
+        # Returns JSON object as a dictionary
+        data = json.load(f)
+
+    # Get function names, and their start and end offsets from the model .so
+    func_info = get_func_info(binary_path)
+
+    # Get the load address for model .so.
+    so_ld_addr = load_addr
+
+    # Process profiling data to construct a CSV report.
+    overall_cycles = process_data(data, func_info, so_ld_addr)
+    create_csv_report(overall_cycles, lwp_out)
+    print("lwp processed output written to -- ", lwp_out)
+    print("[NOTE: Use '--hexagon-debug' to keep the temp directory]")
+
+
+def get_args():
+    """Add commandline arguments to run the script manually if needed"""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lwp-json", help="LWP json file", required=True)
+    parser.add_argument("--serial-num", help="device-id/simulator", required=True)
+    parser.add_argument("--test-so", help="Test shared library", required=True)
+    parser.add_argument(
+        "--run-log",
+        help="Logcat file for on-device run and stdout.txt for simulator run",
+        required=True,
+    )
+    parser.add_argument("--lwp-out", help="LWP output file name", required=True)
+    parser.add_argument(
+        "--debug",
+        help="Enable debug output from the script",
+        dest="debug",
+        action="store_true",
+        required=False,
+    )
+    parser.set_defaults(debug=False)
+    args = parser.parse_args()
+
+    global ENABLE_DEBUG
+    ENABLE_DEBUG = args.debug
+
+    return args
+
+
+if __name__ == "__main__":
+    args = get_args()
+    process_lwp_output(
+        args.test_so, args.serial_num, args.lwp_json, args.run_log, args.lwp_out, args.debug
+    )
diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index b69382fe1290..d6ea51b53e17 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -393,3 +393,8 @@ def _aot_executor_from_factory(
             remote_file_path = self.upload(binary_path, binary_name)
 
         return self.get_aot_executor(remote_file_path)
+
+    def get_profile_output(self, mode: str, path: str):
+        assert isinstance(mode, str), f"Invalid mode type, {type(mode)} != str"
+        assert isinstance(path, str), f"Invalid path type, {type(path)} != str"
+        return self._rpc.get_function("tvm.hexagon.get_profile_output")(mode, path)
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
index 7b3a81acc525..82533a2f9f5a 100644
--- a/python/tvm/tir/transform/transform.py
+++ b/python/tvm/tir/transform/transform.py
@@ -995,3 +995,14 @@ def ManifestSharedMemoryLocalStage():
         The result pass
     """
     return _ffi_api.ManifestSharedMemoryLocalStage()  # type: ignore
+
+
+def InstrumentProfileIntrinsics():
+    """Insert intrinsic calls to instrument function and loop level profiling.
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.InstrumentProfileIntrinsics()  # type: ignore
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index 5f8c8742695d..2b9a354f5c7e 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -52,6 +52,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array<Array<ObjectRef>>);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool);
 
 using runtime::PackedFunc;
 using runtime::TVMArgs;
@@ -157,6 +158,8 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
       pass_ctx->GetConfig<Array<Array<ObjectRef>>>("tir.add_lower_pass", Array<Array<ObjectRef>>())
           .value();
 
+  bool instrument_lwp = pass_ctx->GetConfig<Bool>("tir.instrument_lwp", Bool(false)).value();
+
   Array<transform::Pass> user_lower_phase0 = Array<transform::Pass>();
   Array<transform::Pass> user_lower_phase1 = Array<transform::Pass>();
   Array<transform::Pass> user_lower_phase2 = Array<transform::Pass>();
@@ -253,6 +256,14 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
   pass_list.push_back(
       tir::transform::CommonSubexprElimTIR(!disable_cse_tir, enable_equiv_terms_in_cse_tir));
 
+  // This pass instruments the loops with the profile builtin calls to capture the runtime
+  // performance data (only enabled for Hexagon at the moment). To ensure that no other
+  // optimizations are performed on the instrumented code, this pass must be added at the end
+  // of the list.
+  if (instrument_lwp) {
+    pass_list.push_back(tir::transform::InstrumentProfileIntrinsics());
+  }
+
   return pass_list;
 }
 
diff --git a/src/runtime/dso_library.cc b/src/runtime/dso_library.cc
index 81eb30ee12d2..a0c6c48b5e44 100644
--- a/src/runtime/dso_library.cc
+++ b/src/runtime/dso_library.cc
@@ -34,6 +34,12 @@
 #include <dlfcn.h>
 #endif
 
+#if defined(__hexagon__)
+extern "C" {
+#include <HAP_farf.h>
+}
+#endif
+
 namespace tvm {
 namespace runtime {
 
@@ -118,6 +124,14 @@ void DSOLibrary::Load(const std::string& name) {
   lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
   ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name << " "
                                  << dlerror();
+#if defined(__hexagon__)
+  int p;
+  int rc = dlinfo(lib_handle_, RTLD_DI_LOAD_ADDR, &p);
+  if (rc)
+    FARF(ERROR, "error getting model .so start address : %u", rc);
+  else
+    FARF(ALWAYS, "Model .so Start Address : %x", p);
+#endif
 }
 
 void* DSOLibrary::GetSymbol_(const char* name) { return dlsym(lib_handle_, name); }
diff --git a/src/runtime/hexagon/profiler/README.md b/src/runtime/hexagon/profiler/README.md
new file mode 100644
index 000000000000..492e45c98498
--- /dev/null
+++ b/src/runtime/hexagon/profiler/README.md
@@ -0,0 +1,99 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Hexagon lightweight instrumentation based profiling (LWP)
+
+For Hexagon, LWP can be used to get function and loop level processor cycle count.
+This is done by instrumenting the code with profiling builtin calls using a TIR pass.
+During codegen, these builtin calls are replaced with the calls to a hexagon specific
+handler which records the runtime information into a buffer.
+This buffer is written into a JSON file ('lwp.json') which is processed to construct
+function and loop level profiling information as a csv file.
+
+**Note:** During codegen, the profiling builtin calls are ignored for other targets.
+
+The TIR pass offers several config flags to control the level of instrumentation
+as mentioned below:
+
+1) `lwp_disable_func_prof`: To disable function level profiling. By default, it is
+set to 'False', i.e., the function level profiling is enabled.
+
+2) `instr_siblings`: When enabled, only loops with siblings are instrumented and rest are
+ignored. The inner-most loops are always excluded from instrumentation unless overwritten
+using `lwp_min_height`. This is done to minimize the adverse effect of instrumentation on
+actual performance. By default, it is set to 'True'.
+
+3) `lwp_max_depth`: To instrument loops up to a certain depth. This flag is effective
+only when `instr_siblings` is disabled. By default, it is set to 0.
+
+4) `lwp_min_height`: To exclude inner loops up to a certain height from instrumentation.
+By default, it is set to 1.
+
+For additional usage information on various config flags, please refer to the tests in
+`tests/python/unittest/test_tir_transform_profiling_instr.py`
+
+
+## How to use lightweight profiling with RPC Launcher:
+
+`tests/python/contrib/test_hexagon/test_launcher.py` contains two tests, `test_lwp` and
+`test_lwp_multiple_conv2d`, to demonstrate lightweight profiling usage.
+
+The steps involved are as follows:
+
+1) While building a model, set `tir.instrument_lwp` to `True`.
+   By default, the builtin calls will only be inserted for the loops with siblings. But it
+   can be altered using LWP config options as described above.
+2) Create `HexagonProfiler` object
+
+```
+with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
+    lowered = tvm.relay.build(
+        relay_mod,
+        tvm.target.Target(target_hexagon, host=target_hexagon),
+        ...
+    )
+
+    # Create HexagonProfiler object. It sets the profiling mode based on the PassContext config.
+    # '--hexagon-debug' to pytest can be used to retain any temp or test directories to
+    # inspect the profiling data.
+    profiler = HexagonProfiler(lowered, hexagon_server_process, hexagon_debug)
+```
+
+4) Run the model and get the profiling data as a CSV file. It is done by post-processing
+   'lwp.json' file generated during runtime.
+
+```
+    graph_mod.run(**inputs)
+
+    # Get lightweight profiling output as a CSV file
+    profiler.get_profile_output(hexagon_launcher, hexagon_session, hexagon_server_process)
+```
+**Note:**
+
+- For on-device runs, 'lwp.json' is copied into a temp directory along with the test .so and the processed
+  CSV file
+- For the simulator runs, the file is generated in the simulator test output directory. Test  .so
+  will still be in a separate temp directory. lwp CSV file will also be in the same directory.
+
+**Helpful Hints:**
+
+- To prevent the test directories on the Hexagon device as well as temporary test directory on x86
+from being deleted for profiling related runs, pass `--hexagon-debug` to pytest.
+
+```
+python -m pytest --hexagon-debug tests/python/contrib/test_hexagon/test_launcher.py::test_lwp
+```
diff --git a/src/runtime/hexagon/profiler/lwp_handler.S b/src/runtime/hexagon/profiler/lwp_handler.S
new file mode 100644
index 000000000000..611c0713111a
--- /dev/null
+++ b/src/runtime/hexagon/profiler/lwp_handler.S
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+Lightweight profiling handler to record processor cycles in a buffer
+(pointed by __lwp_buffer_ptr) for a given invocation of the handler. To keep the
+buffer size within a resonable limit, we only recond data for the first 100
+invocation of the handler for a given loop or function ID (passed in R0 register).
+The buffer size wouldn't be a concern if the loops with only siblings are getting
+profiled. However, since the instrumentation provides several different profiling
+options, this approach ensures that they all function as expexted. We use second
+buffer (pointed by __lwp_counter) to keep count of the calls made to lwp_handler
+function for each function/loop.
+
+Brief explanation of all the global variables used:
+1) __lwp_counter : Pointer to the buffer that keeps count of the number of times handler
+ is called for a given ID. To reduce the complexity of the handler, __lwp_counter is
+ indexed using the ID itself.
+2) __lwp_buffer_ptr : Pointer to the buffer that records loop/function ID, processor cycles
+ and return addresss of the handler. Return address is used to reconstruct the call graph
+ (loop-nest) to make it easier to analyze the profiling data.
+3) __lwp_buffer_size : Size of the buffer
+4) __lwp_buffer_count : Offset into main lwp buffer where data for the current handler
+invocation needs to be written.
+
+NOTE: The handler function saves and restores R0-R5 registers which are caller saved registers
+on Hexagon and should be handled at the callsite. However, to reduce the codegen impact
+of the handler calls on the caller functions, we decided to move this part into the
+handler itself.
+
+*/
+  .text
+  .globl  lwp_handler
+  .falign
+  .type  lwp_handler,@function
+lwp_handler:
+  { allocframe(#24)                            // Allocate 24 bytes on the stack to save R0-R5 registers
+    memd(r29+#-16) = r5:4                      // Save R5,R4
+  }
+  {
+    memd(r29+#8) = r3:2                        // Save R3,R2
+    memd(r29+#0) = r1:0                        // Save R1, R0
+    r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL) // Get GOT address
+  }
+  {
+    r5 = memw(r2+##__lwp_counter@GOT)         // Get address of the pointer to __lwp_counter
+    r3 = memw(r2+##__lwp_buffer_count@GOT)    // Get the address of __lwp_buffer_count
+  }
+  {
+    r5 = memw(r5+#0)                          // Get the address of __lwp_counter (address of the main lwp buffer)
+    r3 = memw(r3+#0)                          // Get the __lwp_buffer_count value (offset into the main buffer)
+  }
+  {
+    r4 = memw(r5+r0<<#2)                      // Get the handler invocation count for the ID (passed in R0)
+    r1 = memw(r2+##__lwp_buffer_size@GOT)     // Get the address of __lwp_buffer_size
+  }
+  {
+    r4 = add(r4,#1)                           // Increment count
+    memw(r5+r0<<#2) = r4.new                  // Update count in __lwp_counter for a given ID
+    r1 = memw(r1+#0)                          // Get the buffer size
+  }
+  {
+    p0 = cmp.gtu(r4,#100)                     // Exit if count for a given ID is greater than 100
+    if (p0.new) jump:nt .LBB0_3
+    r5 = memw(r2+##__lwp_buffer_ptr@GOT)      // Get address of the pointer to __lwp_buffer_ptr
+  }
+  {
+    r5 = memw(r5+#0)                          // Get address of __lwp_buffer_ptr
+    r2 = memw(r2+##__lwp_buffer_count@GOT)    // Get address of __lwp_buffer_count
+  }
+  {
+    r4 = add(r3,#4)                           // Increment the offset by 4 since 4 int32 values are stored for each invocation
+    if (!cmp.gtu(r1,r4.new)) jump:t .LBB0_3   // Exit if the main lwp buffer has run out of space
+  }
+  {
+    r5 = addasl(r5,r3,#2)                     // Get the address where the data needs to be recorded
+    memw(r2+#0) = r4                          // Save next offset into __lwp_buffer_count
+  }
+  {
+    memw(r5+#0) = r31                         // Save return address of this function
+    r1:0 = C15:14                             // Control registers that keep processor cycle count (64-bits)
+    memw(r5+#4) = r0                          // Save loop/function ID
+  }
+  {
+    memw(r5+#12) = r1                         // Save upper 32 bits
+    memw(r5+#8) = r0                          // Save lower 32 bits
+  }
+  .falign
+.LBB0_3:
+  {
+    r5:4 = memd(r29+#16)                     // Restore the registers from the stack
+    r3:2 = memd(r29+#8)
+  }
+  {
+    r1:0 = memd(r29+#0)
+    dealloc_return                          // Deallocate the stack and return
+  }
+.Lfunc_end0:
+  .size  lwp_handler, .Lfunc_end0-lwp_handler
diff --git a/src/runtime/hexagon/profiler/prof_utils.cc b/src/runtime/hexagon/profiler/prof_utils.cc
new file mode 100644
index 000000000000..45cbe607031d
--- /dev/null
+++ b/src/runtime/hexagon/profiler/prof_utils.cc
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+// The max loop/function id used among all lwp_handler calls. Since
+// the id is used to index into the lwp_counter buffer, the size of the
+// buffer must be equal or greater than the max possible id.
+#define LWP_COUNTER_SIZE 5000
+
+// LWP_BUFFER_SIZE needs to be at most 100 * LWP_COUNTER_SIZE since 100 is
+// the max number of entries recorded for each instrumented location.
+#define LWP_BUFFER_SIZE (LWP_COUNTER_SIZE * 100)
+
+uint32_t lwp_counter[LWP_COUNTER_SIZE] = {0};
+uint32_t lwp_buffer[LWP_BUFFER_SIZE];
+uint32_t* __lwp_counter = lwp_counter;
+uint32_t* __lwp_buffer_ptr = lwp_buffer;
+uint32_t __lwp_buffer_size = LWP_BUFFER_SIZE;
+uint32_t __lwp_enable_flag = 1;
+uint32_t __lwp_buffer_count = 0;
+
+bool WriteLWPOutput(const std::string& out_json) {
+  std::ostringstream s;
+  s << "{\n";
+  s << "\t\"entries\":[\n";
+  for (size_t i = 0; i < __lwp_buffer_count; i += 4) {
+    s << "\t{\n";
+    s << "\t\t\"ret\":" << std::dec << lwp_buffer[i] << ",\n";
+    s << "\t\t\"id\":" << std::dec << lwp_buffer[i + 1] << ",\n";
+    uint64_t pcycles = (static_cast<uint64_t>(lwp_buffer[i + 3]) << 32) + lwp_buffer[i + 2];
+    s << "\t\t\"cyc\":" << std::dec << pcycles << "\n";
+    s << "\t}";
+    if (i < __lwp_buffer_count - 4) {
+      s << ",\n";
+    }
+  }
+  s << "\t],\n\n";
+  s << "\t\"loop_counts\":[\n";
+  for (size_t i = 0; i < LWP_COUNTER_SIZE; i++) {
+    s << "\t\t" << lwp_counter[i] / 2;
+    if (i < LWP_COUNTER_SIZE - 1)
+      s << ",\n";
+    else
+      s << "\n";
+  }
+  s << "\t]\n}\n";
+  std::ofstream ofc(out_json);
+  if (!ofc.is_open()) {
+    return false;
+  }
+
+  ofc << s.str() << "\n";
+
+  if (!ofc) {
+    return false;
+  }
+  ofc.close();
+  return true;
+}
diff --git a/src/runtime/hexagon/profiler/prof_utils.h b/src/runtime/hexagon/profiler/prof_utils.h
new file mode 100644
index 000000000000..e086f7a26b36
--- /dev/null
+++ b/src/runtime/hexagon/profiler/prof_utils.h
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file prof_utils.h
+ */
+#ifndef TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_
+#define TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_
+
+#include <string>
+
+bool WriteLWPOutput(const std::string&);
+
+#endif  // TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_
diff --git a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
index b4799d5d7127..41c63d0affeb 100644
--- a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
@@ -39,6 +39,7 @@ extern "C" {
 #include "../../../minrpc/minrpc_server.h"
 #include "../../hexagon/hexagon_common.h"
 #include "../../hexagon/hexagon_device_api.h"
+#include "../../profiler/prof_utils.h"
 #include "hexagon_rpc.h"
 
 namespace tvm {
@@ -329,3 +330,15 @@ TVM_REGISTER_GLOBAL("tvm.hexagon.load_module")
       tvm::ObjectPtr<tvm::runtime::Library> n = tvm::runtime::CreateDSOLibraryObject(soname);
       *rv = CreateModuleFromLibrary(n);
     });
+
+TVM_REGISTER_GLOBAL("tvm.hexagon.get_profile_output")
+    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
+      std::string profiling_mode = args[0];
+      std::string out_file = args[1];
+      if (profiling_mode.compare("lwp") == 0) {
+        *rv = WriteLWPOutput(out_file);
+      } else {
+        HEXAGON_PRINT(ERROR, "ERROR: Unsupported profiling mode: %s", profiling_mode.c_str());
+        *rv = false;
+      }
+    });
diff --git a/src/runtime/hexagon/rpc/simulator/rpc_server.cc b/src/runtime/hexagon/rpc/simulator/rpc_server.cc
index 119e999276f8..41bb2da6f8b1 100644
--- a/src/runtime/hexagon/rpc/simulator/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/simulator/rpc_server.cc
@@ -29,6 +29,7 @@
 #include "../../../library_module.h"
 #include "../../../minrpc/minrpc_server.h"
 #include "../../hexagon_common.h"
+#include "../../profiler/prof_utils.h"
 #include "hexagon_sim_proto.h"
 #include "tvm/runtime/packed_func.h"
 #include "tvm/runtime/registry.h"
@@ -336,3 +337,15 @@ TVM_REGISTER_GLOBAL("tvm.hexagon.load_module")
       tvm::ObjectPtr<tvm::runtime::Library> n = tvm::runtime::CreateDSOLibraryObject(soname);
       *rv = CreateModuleFromLibrary(n);
     });
+
+TVM_REGISTER_GLOBAL("tvm.hexagon.get_profile_output")
+    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
+      std::string profiling_mode = args[0];
+      std::string out_file = args[1];
+      if (profiling_mode.compare("lwp") == 0) {
+        *rv = WriteLWPOutput(out_file);
+      } else {
+        HEXAGON_PRINT(ERROR, "ERROR: Unsupported profiling mode: %s", profiling_mode.c_str());
+        *rv = false;
+      }
+    });
diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc
index ff59dfcceb8d..9552a45a60f9 100644
--- a/src/target/llvm/codegen_hexagon.cc
+++ b/src/target/llvm/codegen_hexagon.cc
@@ -75,6 +75,7 @@ class CodeGenHexagon final : public CodeGenCPU {
 
   using CodeGenCPU::VisitStmt_;
   llvm::Value* VisitExpr_(const BufferLoadNode* op) override;
+  llvm::Value* CreateIntrinsic(const CallNode* op) override;
 
   llvm::Value* CreateCallExtern(Type ret_type, String global_symbol, const Array<PrimExpr>& args,
                                 bool skip_first_arg) override;
@@ -193,6 +194,28 @@ llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) {
   return CodeGenCPU::VisitExpr_(op);
 }
 
+llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) {
+#if TVM_LLVM_VERSION >= 150
+  if (op->op.same_as(builtin::start_profile_intrinsic()) ||
+      op->op.same_as(builtin::end_profile_intrinsic())) {
+    llvm::Value* id = MakeValue(op->args[0]);
+    auto instrprof_id = llvm::Intrinsic::hexagon_instrprof_custom;
+    llvm::Function* func = llvm::Intrinsic::getDeclaration(module_.get(), instrprof_id);
+    llvm::GlobalVariable* name_var = module_->getGlobalVariable("handler_name");
+    if (!name_var) {
+      llvm::StringRef init_str = "lwp_handler";
+      llvm::Constant* init = llvm::ConstantDataArray::getString(module_->getContext(), init_str);
+
+      name_var = new llvm::GlobalVariable(*module_, init->getType(), true,
+                                          llvm::GlobalValue::InternalLinkage, init, "handler_name");
+    }
+    llvm::Type* t_int8_p_ = t_int8_->getPointerTo();
+    return builder_->CreateCall(func, {llvm::ConstantExpr::getBitCast(name_var, t_int8_p_), id});
+  }
+#endif
+  return CodeGenCPU::CreateIntrinsic(op);
+}
+
 void CodeGenHexagon::CreatePrintf(const std::string& format,
                                   llvm::ArrayRef<llvm::Value*> format_args) {
   // This function generates LLVM instructions to call HAP_debug_v2,
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 6a50dd4534c2..ea8a5ff5106a 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -1318,6 +1318,10 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     // TODO(masahi): Support atomic for CPU backend
     LOG(FATAL) << "CPU backend does not support atomic add yet.";
     return nullptr;
+  } else if (op->op.same_as(builtin::start_profile_intrinsic()) ||
+             op->op.same_as(builtin::end_profile_intrinsic())) {
+    LOG(INFO) << "Ignoring profile_intrinsic ... " << op->op;
+    return nullptr;
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->op;
     return nullptr;
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 1e2d790c76e1..b605b9de1e02 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -302,6 +302,12 @@ TIR_DEFINE_BUILTIN_FUNC(undef)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kReadState))
     .set_num_inputs(0);
 
+TIR_DEFINE_BUILTIN_FUNC(start_profile_intrinsic)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
+
+TIR_DEFINE_BUILTIN_FUNC(end_profile_intrinsic)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
+
 }  // namespace builtin
 }  // namespace tir
 }  // namespace tvm
diff --git a/src/tir/transforms/profile_instrumentation.cc b/src/tir/transforms/profile_instrumentation.cc
new file mode 100644
index 000000000000..5f52fc6630bc
--- /dev/null
+++ b/src/tir/transforms/profile_instrumentation.cc
@@ -0,0 +1,293 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file profile_instrumentation.cc
+ */
+// Insert profile intrinsic at loop and function level. During codegen,
+// these instruction can be replaced with a call to a target specific handler
+// and can be used to capture profiling information such as processor cycles.
+
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+#include <tvm/tir/stmt_functor.h>
+#include <tvm/tir/transform.h>
+
+namespace tvm {
+namespace tir {
+namespace lwp {
+
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_disable_func_prof", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_max_depth", Integer);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_min_height", Integer);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.instr_siblings", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.reset_start_id", Bool);
+
+static int32_t start_id = 0;
+
+struct LoopInfo {
+  LoopInfo() = default;
+  LoopInfo(unsigned i, unsigned d, unsigned h = 0) : id(i), depth(d), height(h) {
+    has_siblings = false;
+    has_parallel = false;
+  }
+  unsigned id;
+  int32_t depth;
+  int32_t height;
+  bool has_siblings;
+  // Set to 'true' if ForKind::kParallel is set for the current loop or one of its ancestor
+  bool has_parallel;
+};
+
+using LoopInfoMap = std::unordered_map<const ForNode*, LoopInfo>;
+// Traverse loops depth first and assign them a unique number.
+class LoopAnalyzer : public StmtExprVisitor {
+ public:
+  LoopInfoMap Analyze(const Stmt& stmt) {
+    this->VisitStmt(stmt);
+    return loops;
+  }
+  void VisitStmt_(const ForNode* op) final {
+    LoopInfo loop_info(start_id, 0);
+    start_id++;
+    loop_info.height = TraverseLoop(op->body, 0);
+    loops[op] = loop_info;
+  }
+
+  unsigned TraverseLoop(const Stmt& stmt, unsigned parent_depth, bool has_parallel = false) {
+    if (stmt->IsInstance<SeqStmtNode>()) {
+      std::vector<const ForNode*> siblings;
+      unsigned height = 0;
+      bool has_loop = false;
+      const SeqStmtNode* n = stmt.as<SeqStmtNode>();
+      for (Stmt s : n->seq) {
+        if (s->IsInstance<ForNode>()) {
+          has_loop = true;
+          const ForNode* f = s.as<ForNode>();
+          LoopInfo loop_info(start_id, parent_depth + 1);
+          start_id++;
+          bool parent_parallel = false;
+          if (has_parallel) {
+            loop_info.has_parallel = true;
+            parent_parallel = true;
+          } else if (f->kind == ForKind::kParallel) {
+            // has_parallel for the current loop is being set to 'false' since the
+            // intrinsic is added outside of the loop. The instrumentation isn't
+            // allowed for the subsequent nested loops.
+            loop_info.has_parallel = false;
+            parent_parallel = true;
+          }
+          siblings.push_back(f);
+          height = std::max(height, TraverseLoop(f->body, parent_depth + 1, parent_parallel));
+          loop_info.height = height;
+          loops[f] = loop_info;
+        }
+      }
+      if (siblings.size() > 1) {
+        for (auto* l : siblings) {
+          loops[l].has_siblings = true;
+        }
+      }
+      height = has_loop ? height + 1 : height;
+      return height;  // Parent's height : max of all children's height
+    } else if (stmt->IsInstance<IfThenElseNode>()) {
+      const IfThenElseNode* n = stmt.as<IfThenElseNode>();
+      unsigned height = TraverseLoop(n->then_case, parent_depth, has_parallel);
+      if (n->else_case.defined()) {
+        height = std::max(height, TraverseLoop(n->else_case, parent_depth, has_parallel));
+      }
+      return height;
+    } else if (stmt->IsInstance<ForNode>()) {
+      const ForNode* f = stmt.as<ForNode>();
+      LoopInfo loop_info(start_id, parent_depth + 1);
+      start_id++;
+      bool parent_parallel = false;
+      if (has_parallel) {
+        loop_info.has_parallel = true;
+        parent_parallel = true;
+      } else if (f->kind == ForKind::kParallel) {
+        // has_parallel for the current loop is being set to 'false' since the
+        // intrinsic is added outside of the loop. The instrumentation isn't
+        // allowed for the subsequent nested loops.
+        loop_info.has_parallel = false;
+        parent_parallel = true;
+      }
+      unsigned height = TraverseLoop(f->body, parent_depth + 1, parent_parallel);
+      loop_info.height = height;
+      loops[f] = loop_info;
+      return height + 1;
+    } else if (stmt->IsInstance<LetStmtNode>()) {
+      const LetStmtNode* n = stmt.as<LetStmtNode>();
+      return TraverseLoop(n->body, parent_depth, has_parallel);
+    } else if (stmt->IsInstance<AttrStmtNode>()) {
+      const AttrStmtNode* n = stmt.as<AttrStmtNode>();
+      return TraverseLoop(n->body, parent_depth, has_parallel);
+    } else if (stmt->IsInstance<AllocateNode>()) {
+      const AllocateNode* n = stmt.as<AllocateNode>();
+      return TraverseLoop(n->body, parent_depth, has_parallel);
+    } else {
+      return 0;  // inner-most loop
+    }
+  }
+
+ private:
+  LoopInfoMap loops;
+};
+
+class InstrumentIntrin : public StmtMutator {
+ public:
+  InstrumentIntrin(int32_t max_depth, int32_t min_height, bool instr_siblings)
+      : max_instr_depth_(max_depth),
+        min_instr_height_(min_height),
+        instr_siblings_(instr_siblings) {}
+
+  void GetLoopInfo(PrimFuncNode* op) {
+    LoopAnalyzer analzer;
+    loops_ = std::move(analzer.Analyze(op->body));
+  }
+
+  Stmt VisitStmt_(const SeqStmtNode* op) final {
+    Stmt stmt = StmtMutator::VisitStmt_(op);
+    return SeqStmt::Flatten(stmt);
+  }
+
+  Stmt VisitStmt_(const ForNode* op) final {
+    Stmt stmt = StmtMutator::VisitStmt_(op);
+    if (loops_.count(op) < 1) return stmt;
+
+    LoopInfo loop_info = loops_[op];
+
+    if (loop_info.has_parallel) {
+      return stmt;
+    }
+
+    // Exclude inner-most loops from instrumentation. The inner-most loop has
+    // height '0' and it increases as we move outward in the loop nest.
+    if (loop_info.height < min_instr_height_) {
+      return stmt;
+    }
+
+    // Only instrument loops with a sibling
+    if (instr_siblings_ && !loop_info.has_siblings) {
+      return stmt;
+    }
+
+    // If instr_siblings_ is set, ignore max depth for instrumentation
+    if (!instr_siblings_ && loop_info.depth > max_instr_depth_) {
+      return stmt;
+    }
+    PrimExpr id = static_cast<int32_t>(loop_info.id);
+    PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {id});
+    PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {id});
+    const Stmt start_profile = Evaluate(start_call);
+    const Stmt end_profile = Evaluate(end_call);
+    Stmt new_stmt = SeqStmt({start_profile, stmt, end_profile});
+    return new_stmt;
+  }
+
+ private:
+  LoopInfoMap loops_;
+  int32_t max_instr_depth_;
+  int32_t min_instr_height_;
+  bool instr_siblings_;
+};
+
+class CheckParallelLoops : public StmtExprVisitor {
+ public:
+  bool HasParallelLoops(const Stmt& stmt) {
+    this->VisitStmt(stmt);
+    return has_parallel;
+  }
+
+ private:
+  void VisitStmt_(const ForNode* op) final {
+    if (op->kind == ForKind::kParallel) {
+      has_parallel = true;
+    } else {
+      StmtExprVisitor::VisitStmt_(op);
+    }
+  }
+
+  bool has_parallel = false;
+};
+
+PrimFunc AddProfileBuiltins(PrimFunc func, int32_t max_instr_depth, int32_t min_instr_height,
+                            bool instr_siblings, bool disable_func_instrumentation) {
+  auto* func_ptr = func.CopyOnWrite();
+
+  PrimExpr e = start_id++;
+  if (!disable_func_instrumentation) {
+    PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {e});
+    PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {e});
+    const Stmt start_profile = Evaluate(start_call);
+    const Stmt end_profile = Evaluate(end_call);
+    func_ptr->body = SeqStmt({start_profile, std::move(func_ptr->body), end_profile});
+  }
+  InstrumentIntrin p(max_instr_depth, min_instr_height, instr_siblings);
+  p.GetLoopInfo(func_ptr);
+  func_ptr->body = p(std::move(func_ptr->body));
+  return std::move(func);
+}
+
+}  // namespace lwp
+
+namespace transform {
+Pass InstrumentProfileIntrinsics() {
+  auto pass_func = [](IRModule m, PassContext ctx) {
+    auto* mptr = m.CopyOnWrite();
+
+    // All loops with depth <= max_instr_depth are instrumented. By default,
+    // only outer-most loops are instrumented which has a depth of 0.
+    // In addition, loops with siblings are also instrumented provided
+    // their loop depth is >= min_instr_height. This is done to avoid
+    // instrumenting inner-most loops.
+    auto max_instr_depth = ctx->GetConfig<Integer>("tir.lwp_max_depth", Integer(0)).value();
+    auto min_instr_height = ctx->GetConfig<Integer>("tir.lwp_min_height", Integer(1)).value();
+    bool instr_siblings = ctx->GetConfig<Bool>("tir.instr_siblings", Bool(true)).value();
+    bool disable_func_instrumentation =
+        ctx->GetConfig<Bool>("tir.lwp_disable_func_prof", Bool(false)).value();
+    bool reset_start_id = ctx->GetConfig<Bool>("tir.reset_start_id", Bool(false)).value();
+    if (reset_start_id) lwp::start_id = 0;
+    std::vector<std::pair<GlobalVar, PrimFunc>> updates;
+    for (const auto& kv : mptr->functions) {
+      if (auto* n = kv.second.as<PrimFuncNode>()) {
+        PrimFunc func = GetRef<PrimFunc>(n);
+        auto updated_func =
+            lwp::AddProfileBuiltins(func, max_instr_depth.IntValue(), min_instr_height.IntValue(),
+                                    instr_siblings, disable_func_instrumentation);
+        updates.push_back({kv.first, updated_func});
+      }
+    }
+    for (const auto& pair : updates) {
+      mptr->AddUnchecked(pair.first, pair.second);
+    }
+    return m;
+  };
+
+  return tvm::transform::CreateModulePass(pass_func, 0, "tir.InstrumentProfileIntrinsics", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.InstrumentProfileIntrinsics")
+    .set_body_typed(InstrumentProfileIntrinsics);
+
+}  // namespace transform
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index 162e4a1cc7a1..527c79754796 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -157,6 +157,7 @@
     "apps/microtvm/reference-vm/base-box/Vagrantfile.packer-template",
     # Hexagon
     "src/runtime/hexagon/rpc/android_bash.sh.template",
+    "src/runtime/hexagon/profiler/lwp_handler.S",
 }
 
 
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 565999c32957..76d5cba60a1f 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -24,6 +24,8 @@
 from tvm import relay, te
 from tvm.contrib.hexagon.session import Session
 from tvm.relay.backend import Executor, Runtime
+from tvm.contrib.hexagon.build import HexagonLauncherRPC
+from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
 
 from .infrastructure import get_hexagon_target
 
@@ -568,5 +570,161 @@ def test_dense_relay_vrmpy(hexagon_session, data_dtype, weight_dtype):
     np.testing.assert_equal(out, ref)
 
 
+@tvm.testing.requires_hexagon
+def test_lwp(
+    hexagon_server_process,
+    hexagon_launcher: HexagonLauncherRPC,
+    hexagon_session: Session,
+    hexagon_debug,
+):
+    dtype = "float32"
+    data = relay.var("data", relay.TensorType((1, 64, 64, 3), dtype))
+    weight = relay.var("weight", relay.TensorType((5, 5, 3, 8), dtype))
+    y = relay.nn.conv2d(
+        data,
+        weight,
+        padding=(2, 2),
+        kernel_size=(5, 5),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype="float32",
+    )
+
+    f = relay.Function([data, weight], y)
+    relay_mod = tvm.IRModule.from_expr(f)
+    relay_mod = relay.transform.InferType()(relay_mod)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    runtime = Runtime("cpp")
+    executor = Executor("graph")
+
+    weight_in = np.random.rand(5, 5, 3, 8).astype(dtype=dtype)
+    data_in = np.random.rand(1, 64, 64, 3).astype(dtype=dtype)
+    params = {"weight": weight_in}
+    inputs = {"data": data_in}
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
+        lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            runtime=runtime,
+            executor=executor,
+        )
+        # Create HexagonProfiler object
+        dso_binary = "test_binary.so"
+        profiler = HexagonProfiler(dso_binary, lowered, hexagon_server_process, hexagon_debug)
+
+    graph_mod = hexagon_session.get_executor_from_factory(lowered)
+    graph_mod.set_input(**params)
+    graph_mod.run(**inputs)
+    hexagon_output = graph_mod.get_output(0).numpy()
+
+    # Get lightweight profiling output as a CSV file
+    profiler.get_profile_output(hexagon_launcher, hexagon_session)
+
+    target_llvm = tvm.target.Target("llvm")
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            runtime=runtime,
+            executor=executor,
+        )
+    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+    llvm_graph_mod.set_input(weight=weight_in)
+    llvm_graph_mod.run(data=data_in)
+    expected_output = llvm_graph_mod.get_output(0).numpy()
+
+    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
+
+
+@tvm.testing.requires_hexagon
+def test_lwp_multiple_conv2d(
+    hexagon_server_process,
+    hexagon_launcher: HexagonLauncherRPC,
+    hexagon_session: Session,
+    hexagon_debug,
+):
+    dtype = "float32"
+    input_shape = (1, 8, 8, 3)
+    w1_shape = (5, 5, 3, 1)
+    w2_shape = (5, 5, 1, 3)
+    data = relay.var("data", relay.TensorType(input_shape, dtype))
+    weight1 = relay.var("weight1", relay.TensorType(w1_shape, dtype))
+    weight2 = relay.var("weight2", relay.TensorType(w2_shape, dtype))
+    y1 = relay.nn.conv2d(
+        data,
+        weight1,
+        padding=(2, 2),
+        kernel_size=(5, 5),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype="float32",
+    )
+    y2 = relay.nn.conv2d(
+        y1,
+        weight2,
+        padding=(2, 2),
+        kernel_size=(5, 5),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype="float32",
+    )
+    f = relay.Function([data, weight1, weight2], y2)
+    relay_mod = tvm.IRModule.from_expr(f)
+    relay_mod = relay.transform.InferType()(relay_mod)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    runtime = Runtime("cpp")
+    executor = Executor("graph")
+
+    weight1_data = np.random.rand(w1_shape[0], w1_shape[1], w1_shape[2], w1_shape[3]).astype(
+        dtype=dtype
+    )
+    weight2_data = np.random.rand(w2_shape[0], w2_shape[1], w2_shape[2], w2_shape[3]).astype(
+        dtype=dtype
+    )
+    input_data = np.random.rand(
+        input_shape[0], input_shape[1], input_shape[2], input_shape[3]
+    ).astype(dtype=dtype)
+
+    params = {"weight1": weight1_data, "weight2": weight2_data}
+    inputs = {"data": input_data}
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
+        lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            runtime=runtime,
+            executor=executor,
+        )
+        # Create HexagonProfiler object
+        dso_binary = "test_binary.so"
+        profiler = HexagonProfiler(dso_binary, lowered, hexagon_server_process, hexagon_debug)
+
+    graph_mod = hexagon_session.get_executor_from_factory(lowered)
+    graph_mod.set_input(**params)
+    graph_mod.run(**inputs)
+    hexagon_output = graph_mod.get_output(0).numpy()
+
+    # Get lightweight profiling output as a CSV file
+    profiler.get_profile_output(hexagon_launcher, hexagon_session)
+
+    target_llvm = tvm.target.Target("llvm")
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            runtime=runtime,
+            executor=executor,
+        )
+    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+    llvm_graph_mod.set_input(**params)
+    llvm_graph_mod.run(**inputs)
+    expected_output = llvm_graph_mod.get_output(0).numpy()
+
+    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_transform_profiling_instr.py b/tests/python/unittest/test_tir_transform_profiling_instr.py
new file mode 100644
index 000000000000..d14e2a4c8925
--- /dev/null
+++ b/tests/python/unittest/test_tir_transform_profiling_instr.py
@@ -0,0 +1,340 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+import tvm.testing
+from tvm import te
+from tvm.ir.module import IRModule
+from tvm.script import tir as T
+import numpy
+
+default_lwp_test_config = {
+    "tir.instrument_lwp": True,
+    "tir.lwp_disable_func_prof": True,
+    "tir.reset_start_id": True,
+}
+
+
+@T.prim_func
+def input1(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    for i, j in T.grid(8, 8):
+        for k, l in T.grid(8, 16):
+            with T.block("B"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+        for k, l in T.grid(8, 16):
+            with T.block("C"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+
+
+@T.prim_func
+def input2(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        for j in T.serial(0, 8):
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+        for j in T.serial(0, 8):
+            for k, l in T.grid(8, 16):
+                with T.block("C"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+
+
+@T.prim_func
+def input3(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        for j in T.parallel(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+        for j in T.serial(0, 8):
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+
+
+@T.prim_func
+def test1_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    for i, j in T.grid(8, 8):
+        T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
+        for k, l in T.grid(8, 16):
+            with T.block("B"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
+        for k, l in T.grid(8, 16):
+            with T.block("C"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
+
+
+@T.prim_func
+def test2_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
+    for i in T.serial(0, 8):
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))
+
+
+@T.prim_func
+def test3_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
+    for i in T.serial(0, 8):
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+            T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))
+
+
+@T.prim_func
+def test4_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+            T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(7, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.start_profile_intrinsic(8, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("C"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            T.evaluate(T.end_profile_intrinsic(8, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(10, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+            T.evaluate(T.end_profile_intrinsic(10, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(7, dtype="handle"))
+
+
+@T.prim_func
+def test5_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
+    for i in T.serial(0, 8):
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))
+
+
+@T.prim_func
+def test6_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
+        for j in T.parallel(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(7, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.start_profile_intrinsic(8, dtype="handle"))
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            T.evaluate(T.end_profile_intrinsic(8, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(10, dtype="handle"))
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+            T.evaluate(T.end_profile_intrinsic(10, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(7, dtype="handle"))
+
+
+# By default, only loops with siblings are instrumented.
+def test1():
+    with tvm.transform.PassContext(config=default_lwp_test_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test1_expected_output)
+
+
+# By default, only loops with siblings are instrumented. Here, 'lwp_max_depth'
+# doesn't have any effect unless 'instr_siblings' is set to False (ex: test3).
+def test2():
+    test2_config = default_lwp_test_config.copy()
+    test2_config.update({"tir.lwp_max_depth": 3})
+    with tvm.transform.PassContext(config=test2_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test1_expected_output)
+
+
+# test3: Use 'lwp_max_depth' to instrument loops upto a certain depth. This flag
+# is effective only when 'instr_siblings' is disabled. Also, note that inner-most
+# loops are always excluded from instrumentation unless overwritten using
+# 'lwp_min_height' (ex: test5)
+def test3():
+    test3_config = default_lwp_test_config.copy()
+    test3_config.update({"tir.lwp_max_depth": 3, "tir.instr_siblings": False})
+    with tvm.transform.PassContext(config=test3_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test3_expected_output)
+
+
+# test4: Use 'lwp_min_height' to exclude inner loops upto a certain height from
+# instrumentation.
+def test4():
+    with tvm.transform.PassContext(config=default_lwp_test_config):
+        mod = tvm.IRModule.from_expr(input2)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test4_expected_output)
+
+
+# test5: Use both 'lwp_min_height' and 'lwp_max_depth'.
+# instrumentation.
+def test5():
+    test5_config = default_lwp_test_config.copy()
+    test5_config.update(
+        {"tir.lwp_max_depth": 3, "tir.instr_siblings": False, "tir.lwp_min_height": 2}
+    )
+    with tvm.transform.PassContext(config=test5_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test5_expected_output)
+
+
+# test6: Tests instrumentation for the parallel loops
+def test6():
+    with tvm.transform.PassContext(config=default_lwp_test_config):
+        mod = tvm.IRModule.from_expr(input3)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test6_expected_output)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 7983ef2faa8ae98d0d43dfc40b1b9ca95be14302 Mon Sep 17 00:00:00 2001
From: "Ehsan M. Kermani" <6980212+ehsanmok@users.noreply.github.com>
Date: Tue, 25 Oct 2022 13:42:14 -0700
Subject: [PATCH 434/704] [Relay][ONNX] Resolve proto ingestion issue for
 non-tensor type and add CastLike op (#13184)

This PR

* adds the missing CastLike op and
* fixes proto injection issue for onnx node tests that include other types besides TensorType with Optional and Sequence types for the upcoming PRs
* fixes Cast version issue.
---
 python/tvm/relay/frontend/onnx.py          | 12 +++-
 tests/python/frontend/onnx/test_forward.py | 70 +++++++++++++---------
 2 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index ca290cf9a81c..aedb56f79336 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -1843,7 +1843,7 @@ def _impl_v1(cls, inputs, attr, params):
         return AttrCvt(op_name="cast", transforms={"to": "dtype"})(inputs, attr)
 
     @classmethod
-    def _impl_v5(cls, inputs, attr, params):
+    def _impl_v6(cls, inputs, attr, params):
         try:
             from onnx import TensorProto
         except ImportError as e:
@@ -1860,9 +1860,18 @@ def _impl_v5(cls, inputs, attr, params):
                 attr["to"] = str(TENSOR_TYPE_TO_NP_TYPE[attr["to"]])
             except ImportError as e:
                 raise ImportError("Unable to import onnx.mapping which is required {}".format(e))
+
         return AttrCvt(op_name="cast", transforms={"to": "dtype"})(inputs, attr)
 
 
+class CastLike(OnnxOpConverter):
+    """Operator converter for CastLike."""
+
+    @classmethod
+    def _impl_v15(cls, inputs, attr, params):
+        return AttrCvt(op_name="cast_like")(inputs, attr)
+
+
 class Unsqueeze(OnnxOpConverter):
     """Operator converter for Unsqueeze."""
 
@@ -5511,6 +5520,7 @@ def _get_convert_map(opset):
         "TopK": TopK.get_converter(opset),
         # defs/tensor
         "Cast": Cast.get_converter(opset),
+        "CastLike": CastLike.get_converter(opset),
         "Reshape": Reshape.get_converter(opset),
         "Expand": Expand.get_converter(opset),
         "Concat": Concat.get_converter(opset),
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 5eac9a8dd4d6..08cc48c3c393 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -5238,24 +5238,17 @@ def verify_eyelike(indata, dynamic=False):
     "test_blackmanwindow_expanded",
     "test_blackmanwindow_symmetric",
     "test_blackmanwindow_symmetric_expanded",
-    "test_cast_DOUBLE_to_FLOAT16",
+    # the follow cast and castlike cases have lowering issues
     "test_cast_FLOAT_to_STRING",
     "test_cast_STRING_to_FLOAT",
-    "test_castlike_BFLOAT16_to_FLOAT",
-    "test_castlike_BFLOAT16_to_FLOAT_expanded",
-    "test_castlike_DOUBLE_to_FLOAT",
-    "test_castlike_DOUBLE_to_FLOAT16",
-    "test_castlike_DOUBLE_to_FLOAT16_expanded",
-    "test_castlike_FLOAT16_to_DOUBLE",
-    "test_castlike_FLOAT16_to_FLOAT",
-    "test_castlike_FLOAT_to_BFLOAT16",
-    "test_castlike_FLOAT_to_BFLOAT16_expanded",
-    "test_castlike_FLOAT_to_DOUBLE",
-    "test_castlike_FLOAT_to_FLOAT16",
     "test_castlike_FLOAT_to_STRING",
     "test_castlike_FLOAT_to_STRING_expanded",
     "test_castlike_STRING_to_FLOAT",
     "test_castlike_STRING_to_FLOAT_expanded",
+    # the following cast and castlike cases segfault
+    "test_cast_DOUBLE_to_FLOAT16",
+    "test_castlike_DOUBLE_to_FLOAT16",
+    "test_castlike_DOUBLE_to_FLOAT16_expanded",
     "test_convtranspose_autopad_same",
     "test_convtranspose_dilations",
     "test_cumsum_1d",
@@ -5287,7 +5280,6 @@ def verify_eyelike(indata, dynamic=False):
     "test_identity_sequence",
     "test_if_opt",
     "test_if_seq",
-    "test_loop11",
     "test_loop13_seq",
     "test_loop16_seq_none",
     "test_lstm_batchwise",
@@ -5376,6 +5368,27 @@ def verify_eyelike(indata, dynamic=False):
 }
 
 
+def _load_proto(proto_filename, target_list, model_type_proto):
+    with open(proto_filename, "rb") as fin:
+        protobuf_content = fin.read()
+        if model_type_proto.HasField("sequence_type"):
+            sequence = onnx.SequenceProto()
+            sequence.ParseFromString(protobuf_content)
+            target_list.append(numpy_helper.to_list(sequence))
+        elif model_type_proto.HasField("tensor_type"):
+            tensor = onnx.TensorProto()
+            tensor.ParseFromString(protobuf_content)
+            target_list.append(numpy_helper.to_array(tensor))
+        elif model_type_proto.HasField("optional_type"):
+            optional = onnx.OptionalProto()
+            optional.ParseFromString(protobuf_content)
+            target_list.append(numpy_helper.to_optional(optional))
+        else:
+            raise ValueError(
+                "Loading proto of that specific type (Map/Sparse Tensor) is currently not supported"
+            )
+
+
 @pytest.mark.parametrize("onnx_test", onnx_test_folders)
 @tvm.testing.parametrize_targets
 def test_onnx_nodes(target, dev, onnx_test):
@@ -5415,22 +5428,21 @@ def test_onnx_nodes(target, dev, onnx_test):
         # satisfies onnx precision for bicubic interpolation
         atol = 1e-4
 
-    onnx_model = onnx.load(test_dir + "/model.onnx")
-    inputs = []
-    outputs = []
-    for dataset in glob.glob(test_dir + "/*/"):
-        tensors = sorted(glob.glob(dataset + "/*.pb"))
-        for tensor in tensors:
-            new_tensor = onnx.TensorProto()
-            with open(tensor, "rb") as f:
-                new_tensor.ParseFromString(f.read())
-            if "input" in tensor.split("/")[-1]:
-                inputs.append(numpy_helper.to_array(new_tensor))
-            elif "output" in tensor.split("/")[-1]:
-                outputs.append(numpy_helper.to_array(new_tensor))
-            else:
-                raise ImportError(str(tensor) + " not labeled as an import or an output")
-    tvm_val = get_tvm_output_with_vm(onnx_model, inputs, target, dev)
+    model = onnx.load(os.path.join(test_dir, "model.onnx"))
+    for test_data_dir in glob.glob(os.path.join(test_dir, "test_data_set*")):
+        inputs = []
+        n_inputs = len(glob.glob(os.path.join(test_data_dir, "input_*.pb")))
+        for i in range(n_inputs):
+            input_file = os.path.join(test_data_dir, f"input_{i}.pb")
+            _load_proto(input_file, inputs, model.graph.input[i].type)
+
+        outputs = []
+        n_outputs = len(glob.glob(os.path.join(test_data_dir, "output_*.pb")))
+        for i in range(n_outputs):
+            output_file = os.path.join(test_data_dir, f"output_{i}.pb")
+            _load_proto(output_file, outputs, model.graph.output[i].type)
+
+    tvm_val = get_tvm_output_with_vm(model, inputs, target, dev)
     if len(outputs) == 1:
         tvm.testing.assert_allclose(outputs[0], tvm_val, rtol=rtol, atol=atol)
     else:

From 30c38e0dcec49f447d9d0ffeb13e2d78c20f0448 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Tue, 25 Oct 2022 17:32:05 -0700
Subject: [PATCH 435/704] [Fix,Auto_scheduler] Default to extent of 1 if extent
 cannot be determined (#13196)

This fixes a bug in auto-scheduler featurization.
---
 src/auto_scheduler/utils.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/auto_scheduler/utils.h b/src/auto_scheduler/utils.h
index 9fc5a1dd8f22..f8c00d924dd1 100755
--- a/src/auto_scheduler/utils.h
+++ b/src/auto_scheduler/utils.h
@@ -192,7 +192,9 @@ inline bool StrEndsWith(const String& a, const String& b) {
 /*! \brief Get an int value from an Expr */
 inline int64_t GetIntImm(const PrimExpr& expr) {
   auto pint = expr.as<IntImmNode>();
-  ICHECK(pint != nullptr) << "Expect an IntImm but get " << expr;
+  if (pint == nullptr) {
+    return 1;
+  }
   return pint->value;
 }
 

From f44ef53a6bf1041ac6f3b6eceb9382f3a4b467c3 Mon Sep 17 00:00:00 2001
From: "Ehsan M. Kermani" <6980212+ehsanmok@users.noreply.github.com>
Date: Tue, 25 Oct 2022 19:42:48 -0700
Subject: [PATCH 436/704] [Relay][ONNX] Add Optional, OptionalHasElement and
 OptionalGetElement ops based on Sequence op (#13189)

[Relay][ONNX] Add Optional, OptionalHasElement and OptionalGetElement based on Sequence op

Co-authored-by: Ehsan M. Kermani <ehsanmok@users.noreply.github.com>
---
 python/tvm/relay/frontend/onnx.py          | 30 +++++++++++++++++++++-
 python/tvm/runtime/vm.py                   |  2 ++
 tests/python/frontend/onnx/test_forward.py | 16 +++++++-----
 3 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index aedb56f79336..e244b4d9a1ad 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -1682,6 +1682,32 @@ def _impl_v1(cls, inputs, attr, params):
         return inputs[len(inputs) - 1]
 
 
+class Optional_(OnnxOpConverter):
+    """Operator converter for Optional based on sequence construction op."""
+
+    @classmethod
+    def _impl_v15(cls, inputs, attr, params):
+        return SequenceConstruct._impl_v11(inputs, attr, params)
+
+
+class OptionalHasElement(OnnxOpConverter):
+    """Operator converter for OptionalHasElement."""
+
+    @classmethod
+    def _impl_v15(cls, inputs, attr, params):
+        shape = infer_shape(inputs[0])
+        return _op.const(True) if shape else _op.const(False)
+
+
+class OptionalGetElement(OnnxOpConverter):
+    """Operator converter for OptionalGetElement based on sequence construction op."""
+
+    @classmethod
+    def _impl_v15(cls, inputs, attr, params):
+        opt_as_seq = Optional_._impl_v15(inputs, attr, params)
+        return _expr.TupleGetItem(opt_as_seq, 0)
+
+
 class Affine(OnnxOpConverter):
     """Operator converter for Affine transformation."""
 
@@ -5383,6 +5409,9 @@ def _get_convert_map(opset):
     return {
         # defs/experimental
         "Identity": Renamer("copy"),
+        "Optional": Optional_.get_converter(opset),
+        "OptionalHasElement": OptionalHasElement.get_converter(opset),
+        "OptionalGetElement": OptionalGetElement.get_converter(opset),
         "Affine": Affine.get_converter(opset),
         "BitShift": BitShift.get_converter(opset),
         "ThresholdedRelu": ThresholdedRelu.get_converter(opset),
@@ -5402,7 +5431,6 @@ def _get_convert_map(opset):
         "Upsample": Upsample.get_converter(opset),
         "SpatialBN": BatchNorm.get_converter(opset),
         # defs/generator
-        # 'Constant' # Implemented
         # 'RandomUniform'
         # 'RandomNormal'
         # 'RandomUniformLike'
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 20778c40fd51..45ec312b17bb 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -42,6 +42,8 @@ def _gettype(arg):
 
     if isinstance(arg, Object):
         cargs.append(arg)
+    elif arg is None:
+        cargs.append(tvm.nd.array([], device=tvm.cpu(0)))
     elif isinstance(arg, np.ndarray):
         nd_arr = tvm.nd.array(arg, device=tvm.cpu(0))
         cargs.append(nd_arr)
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 08cc48c3c393..6e2c7734c3e1 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -53,11 +53,19 @@ def get_input_data_shape_dict(graph_def, input_data):
         shape_dict = {}
         for i, _ in enumerate(input_data):
             input_names[i] = graph_def.graph.input[i].name
-            if input_data[i] is None or input_data[i].shape == ():
+            input_ = input_data[i]
+
+            if input_ is None or not hasattr(input_, "shape") or input_.shape == ():
                 # Skip adding input shape data when the input data is None;
                 # This is to enable optional arguments for onnx operators.
                 continue
-            shape_dict[input_names[i]] = input_data[i].shape
+
+            elif isinstance(input_, list):
+                shape_dict[input_names[i]] = (len(input_),)
+
+            else:
+                shape_dict[input_names[i]] = input_.shape
+
     else:
         input_names = graph_def.graph.input[0].name
         shape_dict = {input_names: input_data.shape}
@@ -5289,10 +5297,6 @@ def verify_eyelike(indata, dynamic=False):
     "test_melweightmatrix",
     # This test fails llvm with a lowering error:
     "test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded",
-    "test_optional_has_element",
-    "test_optional_get_element",
-    "test_optional_get_element_sequence",
-    "test_optional_has_element_empty",
     "test_qlinearmatmul_3D",
     "test_range_float_type_positive_delta_expanded",
     "test_range_int32_type_negative_delta_expanded",

From 62789a6423bba035d8e70c392fe44866dd0eff6c Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Wed, 26 Oct 2022 00:12:24 -0700
Subject: [PATCH 437/704] [Hexagon] Add fix for vtcm allocation searches
 (#13197)

* [Hexagon] Add fix for VTCM allocation search and new test case to cover the issue.

* Add another test for a specific case.

* [Hexagon] Fix tests.

* Remove unused var

* Change to comments to make more clear.
---
 src/runtime/hexagon/hexagon_vtcm_pool.cc      |  3 +-
 .../hexagon/hexagon_vtcm_pool_tests.cc        | 80 +++++++++++++++++++
 2 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.cc b/src/runtime/hexagon/hexagon_vtcm_pool.cc
index 6024550ba732..17089852a954 100644
--- a/src/runtime/hexagon/hexagon_vtcm_pool.cc
+++ b/src/runtime/hexagon/hexagon_vtcm_pool.cc
@@ -85,7 +85,8 @@ void* HexagonVtcmPool::Allocate(size_t nbytes) {
 
   auto entry_to_allocate = free_.begin();
   for (auto it = free_.begin(); it != free_.end(); it++) {
-    if ((it->second < entry_to_allocate->second) && (it->second >= nbytes)) {
+    if ((entry_to_allocate->second < nbytes || it->second < entry_to_allocate->second) &&
+        it->second >= nbytes) {
       entry_to_allocate = it;
       if (entry_to_allocate->second == nbytes) {
         break;
diff --git a/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc b/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
index 13c459be0c34..81bd31cc84d5 100644
--- a/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
@@ -34,6 +34,7 @@ class HexagonVtcmPoolTest : public ::testing::Test {
  public:
   HexagonVtcmPool* vtcm_pool;
   size_t max_bytes;
+  size_t four_k_block = 4096;
   size_t two_k_block = 2048;
   size_t one_k_block = 1024;
   size_t min_bytes = 128;
@@ -164,6 +165,85 @@ TEST_F(HexagonVtcmPoolTest, free_alloc_combinations) {
   vtcm_pool->Free(ptr4, max_bytes);
 }
 
+TEST_F(HexagonVtcmPoolTest, find_allocation) {
+  void* ptr1;
+  void* ptr2;
+  void* ptr3;
+
+  ptr1 = vtcm_pool->Allocate(two_k_block);
+  ptr2 = vtcm_pool->Allocate(two_k_block);
+
+  // Free the first allocation
+  vtcm_pool->Free(ptr1, two_k_block);
+
+  // Allocate a new larger block to initiate search and ensure
+  // it succeeds despite there not being a match in the first free block.
+  ptr3 = vtcm_pool->Allocate(four_k_block);
+
+  // Clean up the ptrs
+  vtcm_pool->Free(ptr2, two_k_block);
+  vtcm_pool->Free(ptr3, four_k_block);
+
+  // Make sure at the end we have the full amount available again
+  ptr1 = vtcm_pool->Allocate(max_bytes);
+  vtcm_pool->Free(ptr1, max_bytes);
+}
+
+TEST_F(HexagonVtcmPoolTest, find_smallest_allocation_combinations) {
+  void* ptr1;
+  void* ptr2;
+  void* ptr3;
+  void* ptr4;
+  void* new_ptr;
+
+  ptr1 = vtcm_pool->Allocate(two_k_block);
+  ptr2 = vtcm_pool->Allocate(two_k_block);
+  ptr3 = vtcm_pool->Allocate(four_k_block);
+  ptr4 = vtcm_pool->Allocate(four_k_block);
+
+  // Fragment memory allocations.
+  vtcm_pool->Free(ptr2, two_k_block);
+  vtcm_pool->Free(ptr3, four_k_block);
+
+  // Reallocate memory allocations and ensure that the smallest free allocations are used.
+  new_ptr = vtcm_pool->Allocate(two_k_block);
+  CHECK(new_ptr == ptr2);
+
+  new_ptr = vtcm_pool->Allocate(two_k_block);
+  CHECK(new_ptr == ptr3);
+
+  vtcm_pool->Free(ptr1, two_k_block);
+  vtcm_pool->Free(ptr2, two_k_block);
+  vtcm_pool->Free(ptr3, two_k_block);
+  vtcm_pool->Free(ptr4, four_k_block);
+
+  // Rerun the same test for non 2k aligned allocations.
+  ptr1 = vtcm_pool->Allocate(min_bytes);
+  ptr2 = vtcm_pool->Allocate(min_bytes);
+  ptr3 = vtcm_pool->Allocate(one_k_block);
+  ptr4 = vtcm_pool->Allocate(one_k_block);
+
+  // Fragment memory allocations.
+  vtcm_pool->Free(ptr2, min_bytes);
+  vtcm_pool->Free(ptr3, one_k_block);
+
+  // Reallocate memory allocations and ensure that the smallest free allocations are used.
+  new_ptr = vtcm_pool->Allocate(min_bytes);
+  CHECK(new_ptr == ptr2);
+
+  new_ptr = vtcm_pool->Allocate(one_k_block);
+  CHECK(new_ptr == ptr3);
+
+  vtcm_pool->Free(ptr1, min_bytes);
+  vtcm_pool->Free(ptr2, min_bytes);
+  vtcm_pool->Free(ptr3, one_k_block);
+  vtcm_pool->Free(ptr4, one_k_block);
+
+  // Make sure at the end we have the full amount available again
+  ptr4 = vtcm_pool->Allocate(max_bytes);
+  vtcm_pool->Free(ptr4, max_bytes);
+}
+
 // Test alignment edge cases allocating through HexagonBuffer
 TEST_F(HexagonVtcmPoolTest, vtcm_alignment) {
   std::unique_ptr<HexagonBufferManager> test_hexbuffs = std::make_unique<HexagonBufferManager>();

From 2a2dd9ac50313dca09bdf1234f780846f31501da Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 26 Oct 2022 01:31:44 -0700
Subject: [PATCH 438/704] [ci] Update Docker images (#13200)

This updates the images to include #13183
---
 Jenkinsfile               | 20 ++++++++++----------
 ci/jenkins/Jenkinsfile.j2 | 20 ++++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index ce69adb81ef9..135f64dc1d94 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -49,16 +49,16 @@
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
-ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
-ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
-ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
-ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
-ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
-ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
-ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
-ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
-ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+ci_lint = 'tlcpack/ci-lint:20221025-182121-e41d0ed6e'
+ci_gpu = 'tlcpack/ci-gpu:20221025-182121-e41d0ed6e'
+ci_cpu = 'tlcpack/ci-cpu:20221025-182121-e41d0ed6e'
+ci_minimal = 'tlcpack/ci-minimal:20221025-182121-e41d0ed6e'
+ci_wasm = 'tlcpack/ci-wasm:20221025-182121-e41d0ed6e'
+ci_i386 = 'tlcpack/ci-i386:20221025-182121-e41d0ed6e'
+ci_cortexm = 'tlcpack/ci-cortexm:20221025-182121-e41d0ed6e'
+ci_arm = 'tlcpack/ci-arm:20221025-182121-e41d0ed6e'
+ci_hexagon = 'tlcpack/ci-hexagon:20221025-182121-e41d0ed6e'
+ci_riscv = 'tlcpack/ci-riscv:20221025-182121-e41d0ed6e'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index bca70349381b..34c1d66e43ba 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -51,16 +51,16 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 {% import 'ci/jenkins/macros.j2' as m with context -%}
 
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
-ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
-ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
-ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
-ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
-ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
-ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
-ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
-ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
-ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+ci_lint = 'tlcpack/ci-lint:20221025-182121-e41d0ed6e'
+ci_gpu = 'tlcpack/ci-gpu:20221025-182121-e41d0ed6e'
+ci_cpu = 'tlcpack/ci-cpu:20221025-182121-e41d0ed6e'
+ci_minimal = 'tlcpack/ci-minimal:20221025-182121-e41d0ed6e'
+ci_wasm = 'tlcpack/ci-wasm:20221025-182121-e41d0ed6e'
+ci_i386 = 'tlcpack/ci-i386:20221025-182121-e41d0ed6e'
+ci_cortexm = 'tlcpack/ci-cortexm:20221025-182121-e41d0ed6e'
+ci_arm = 'tlcpack/ci-arm:20221025-182121-e41d0ed6e'
+ci_hexagon = 'tlcpack/ci-hexagon:20221025-182121-e41d0ed6e'
+ci_riscv = 'tlcpack/ci-riscv:20221025-182121-e41d0ed6e'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images

From 7cdb7a71ab7572e61f74212ac4b92ac76e934814 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 26 Oct 2022 08:45:57 -0700
Subject: [PATCH 439/704] [Hexagon]: Add upload function to hexagon session
 (#13161)

* [Hexagon]: Add upload to hexagon session

* lint

* fix typo

* fix serial device skips

* fix test on device

* add serial device to rpc key

* update error name

* fix comment

* move create_session to launcher

* add is_simulator

* lint

* address Eric comment on Session object

* rebase with main
---
 python/tvm/contrib/hexagon/build.py           | 195 +++++-------------
 python/tvm/contrib/hexagon/meta_schedule.py   |   2 +-
 python/tvm/contrib/hexagon/pytest_plugin.py   |  15 +-
 python/tvm/contrib/hexagon/session.py         |  50 +++--
 python/tvm/contrib/hexagon/tools.py           |   2 +
 src/runtime/hexagon/rpc/hexagon/rpc_server.cc |  14 ++
 .../hexagon/rpc/simulator/rpc_server.cc       |  14 ++
 .../python/contrib/test_hexagon/README_RPC.md |   2 +-
 .../contrib/test_hexagon/benchmark_util.py    |   6 +-
 .../metaschedule_e2e/test_resnet50_fp16.py    |   3 +-
 .../metaschedule_e2e/test_resnet50_int8.py    |   2 +-
 .../test_benchmark_elemwise_add.py            | 107 +++++-----
 .../test_hexagon/test_benchmark_maxpool2d.py  |   2 +-
 .../test_hexagon/test_meta_schedule.py        |  18 +-
 .../contrib/test_hexagon/test_models.py       |   2 +-
 .../test_software_pipeline_async.py           |   3 +-
 .../test_hexagon/topi/test_cast_slice.py      |   5 +-
 17 files changed, 196 insertions(+), 246 deletions(-)

diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py
index c0e6439d0357..bc3b065dd941 100644
--- a/python/tvm/contrib/hexagon/build.py
+++ b/python/tvm/contrib/hexagon/build.py
@@ -31,14 +31,14 @@
 import tempfile
 from typing import Union
 
-import tvm
 from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
 from ..._ffi import libinfo
 from .session import Session
-
+from .tools import HEXAGON_SIMULATOR_NAME
 
 HEXAGON_RPC_LIB_DIR = os.environ.get("HEXAGON_RPC_LIB_DIR")
 ANDROID_BASH_FILE_NAME = "android_bash.sh"
+HEXAGON_REMOTE_DEVICE_KEY = "hexagon-dev"
 
 
 def _check_call_verbose(cmd, **kwargs) -> None:
@@ -103,14 +103,9 @@ class HexagonLauncherRPC(metaclass=abc.ABCMeta):
     The basic flow of interaction with the launcher is
         launcher = HexagonLauncher(...)
         launcher.start_server()
-        with launcher.start_session() as session:
+        with launcher.create_session() as session:
             # Do something with the session
         launcher.stop_server()
-    """
-
-    HEXAGON_REMOTE_DEVICE_KEY = "hexagon-dev"
-
-    """Configure HexagonLauncherRPC.
 
     Parameters
     ----------
@@ -129,7 +124,9 @@ class HexagonLauncherRPC(metaclass=abc.ABCMeta):
         used.
     """
 
-    def __init__(self, rpc_info: dict, workspace: Union[str, pathlib.Path] = None):
+    def __init__(
+        self, rpc_info: dict, workspace: Union[str, pathlib.Path] = None, serial_number: str = None
+    ):
         self._rpc_info = {
             "rpc_tracker_host": "0.0.0.0",
             "rpc_tracker_port": 9190,
@@ -138,7 +135,7 @@ def __init__(self, rpc_info: dict, workspace: Union[str, pathlib.Path] = None):
         }
         self._rpc_info.update(rpc_info)
         self._workspace = self._create_workspace(workspace)
-        self._device_key = self.HEXAGON_REMOTE_DEVICE_KEY
+        self._serial_number = serial_number
 
     @abc.abstractmethod
     def start_server(self):
@@ -205,138 +202,6 @@ def _create_workspace(self, workspace: Union[str, pathlib.Path]) -> pathlib.Path
             workspace = os.path.join(base_dir, _get_test_directory_name())
         return self._create_remote_directory(workspace)
 
-    def upload(self, local_path: Union[str, pathlib.Path], remote_filename: str) -> pathlib.Path:
-        """Upload a local file to the remote workspace.
-
-        Parameters
-        ----------
-        local_path : str or pathlib.Path
-            Path to the local file to be copied.
-        remote_filename : str
-            Name of the file in the remote workspace.
-
-        Returns
-        -------
-        pathlib.Path :
-            Uploaded file remote path.
-        """
-        assert self._workspace
-        remote_file_path = self._workspace / remote_filename
-        self._copy_to_remote(local_path, str(remote_file_path))
-        return remote_file_path
-
-    def start_session(self, session_name: str = "hexagon-rpc") -> Session:
-        """Connect to the RPC server.
-
-        Parameters
-        ----------
-        session_name : str
-            RPC session name.
-
-        Returns
-        -------
-        Session :
-            The session object.
-        """
-        hexagon_remote_kw = {
-            "host": self._rpc_info["rpc_tracker_host"],
-            "port": self._rpc_info["rpc_tracker_port"],
-            "priority": 0,
-            "timeout": 0,
-            "key": self._device_key,
-        }
-        return Session(self, hexagon_remote_kw, session_name=session_name)
-
-    def load_module(self, module: Union[str, pathlib.Path, tvm.runtime.Module], session: Session):
-        """Load TVM module.
-
-        Parameters
-        ----------
-        module : Union[str, pathlib.Path, tvm.runtime.Module]
-
-            The module to load.  If `module` is a
-            `tvm.runtime.Module`, it will be uploaded to the remote
-            session and loaded.
-
-            If the object passed is a string or pathlib.Path, it must
-            be a full path in the remote system.
-
-        session : Session
-
-            Remote session. The session must be established (via __enter__)
-            prior to calling this function.
-
-        Returns
-        -------
-        TVMModule :
-            TVM module object.
-
-        """
-        return session.load_module(module)
-
-    def get_graph_executor(
-        self,
-        graph_json: str,
-        module: Union[str, pathlib.Path, tvm.runtime.Module],
-        session: Session,
-    ):
-        """Create a local GraphModule which consumes a remote libmod.
-
-        Parameters
-        ----------
-        graph_json : str
-            The string with the graph JSON.
-        module : Union[str, pathlib.Path, tvm.runtime.Module]
-
-            The module to load.  If `module` is a
-            `tvm.runtime.Module`, it will be uploaded to the remote
-            session and loaded.
-
-            If the object passed is a string or pathlib.Path, it must
-            be a full path in the remote system.
-        session : Session
-            Remote session. The session must be established (via __enter__)
-            prior to calling this function.
-
-        Returns
-        -------
-        GraphModule :
-            Runtime graph module that can be used to execute the graph.
-        """
-        return session.get_graph_executor(graph_json, module)
-
-    def get_graph_debug_executor(
-        self,
-        graph_json: str,
-        module: Union[str, pathlib.Path, tvm.runtime.Module],
-        session: Session,
-        dump_root: Union[str, pathlib.Path] = None,
-    ):
-        """Create a local GraphModuleDebug which consumes a remote libmod.
-
-        Parameters
-        ----------
-        graph_json : str
-            The string with the graph JSON.
-        module : Union[str, pathlib.Path, tvm.runtime.Module]
-
-            The module to load.  If `module` is a
-            `tvm.runtime.Module`, it will be uploaded to the remote
-            session and loaded.
-
-            If the object passed is a string or pathlib.Path, it must
-            be a full path in the remote system.
-        session : Session
-            Remote session. The session must be established (via __enter__)
-            prior to calling this function.
-
-        Returns
-        -------
-        GraphModuleDebug :
-            Runtime debug graph module that can be used to debug the graph.
-        """
-        return session.get_graph_debug_executor(graph_json, module, dump_root=dump_root)
-
     @abc.abstractmethod
     def get_profile_output(
         self,
@@ -360,6 +225,31 @@ def get_profile_output(
         """
         ...
 
+    def create_session(self, session_name: str = "hexagon-rpc") -> Session:
+        """Create an RPC session.
+
+        Parameters
+        ----------
+        session_name : str
+            RPC session name.
+
+        Returns
+        -------
+        Session :
+            The session object.
+        """
+        hexagon_session_kw = {
+            "remote_workspace": self._workspace,
+            "rpc_tracker": (self._rpc_info["rpc_tracker_host"], self._rpc_info["rpc_tracker_port"]),
+            "rpc_server_key": self._rpc_info["device_key"],
+            "serial_number": self._serial_number,
+            "session_name": session_name,
+        }
+        return Session(**hexagon_session_kw)
+
+    def is_simulator(self):
+        return self._serial_number == HEXAGON_SIMULATOR_NAME
+
 
 class HexagonLauncherAndroid(HexagonLauncherRPC):
     """Hexagon Launcher for Android."""
@@ -402,6 +292,8 @@ def __init__(
         if not rpc_info.get("workspace_base"):
             rpc_info["workspace_base"] = self.ANDROID_HEXAGON_TEST_BASE_DIR
         self._serial_number = serial_number
+        assert self._serial_number != "", "Android serial number is not set."
+
         adb_socket = rpc_info["adb_server_socket"] if rpc_info["adb_server_socket"] else "tcp:5037"
         self._adb_device_sub_cmd = ["adb", "-L", adb_socket, "-s", self._serial_number]
         self.forwarded_ports_ = []
@@ -409,8 +301,9 @@ def __init__(
         self._clear_logcat = clear_logcat
         self._sysmon_profile = sysmon_profile
         self._sysmon_process = None
+        rpc_info["device_key"] = HEXAGON_REMOTE_DEVICE_KEY + "." + self._serial_number
 
-        super(HexagonLauncherAndroid, self).__init__(rpc_info, workspace)
+        super(HexagonLauncherAndroid, self).__init__(rpc_info, workspace, self._serial_number)
 
     def _copy_to_remote(
         self, local_path: Union[str, pathlib.Path], remote_path: Union[str, pathlib.Path]
@@ -442,7 +335,9 @@ def _copy_binaries(self):
                                 "<RPC_TRACKER_PORT>", str(self._rpc_info["rpc_tracker_port"])
                             )
                         if "<HEXAGON_REMOTE_DEVICE_KEY>" in line:
-                            line = line.replace("<HEXAGON_REMOTE_DEVICE_KEY>", self._device_key)
+                            line = line.replace(
+                                "<HEXAGON_REMOTE_DEVICE_KEY>", self._rpc_info["device_key"]
+                            )
                         if "<RPC_SERVER_PORT>" in line:
                             line = line.replace(
                                 "<RPC_SERVER_PORT>", str(self._rpc_info["rpc_server_port"])
@@ -691,12 +586,13 @@ def __init__(self, rpc_info: dict, workspace: Union[str, pathlib.Path] = None):
 
         Parameters are same as for HexagonLauncherRPC.
         """
-        super(HexagonLauncherSimulator, self).__init__(rpc_info, workspace)
 
         self._toolchain = os.environ.get("HEXAGON_TOOLCHAIN")
         if not self._toolchain:
             raise RuntimeError("Please set HEXAGON_TOOLCHAIN env variable")
-        self._serial_number = "simulator"
+        self._serial_number = HEXAGON_SIMULATOR_NAME
+
+        super(HexagonLauncherSimulator, self).__init__(rpc_info, workspace, self._serial_number)
 
     def _copy_to_remote(
         self, local_path: Union[str, pathlib.Path], remote_path: Union[str, pathlib.Path]
@@ -740,18 +636,19 @@ def start_server(self):
             self._copy_to_remote(lib_dir / item, self._workspace / item)
         # Copy libc++ from the toolchain to the workspace
         self._copy_libcxx(self._workspace)
-        self._device_key = self.HEXAGON_REMOTE_DEVICE_KEY + "." + str(os.getpid())
+        self._rpc_info["device_key"] = HEXAGON_REMOTE_DEVICE_KEY + "." + str(os.getpid())
 
         rpc_tracker_host = self._rpc_info["rpc_tracker_host"]
         rpc_tracker_port = self._rpc_info["rpc_tracker_port"]
         rpc_server_port = self._rpc_info["rpc_server_port"]
+        device_key = self._rpc_info["device_key"]
         server_exe = os.path.join(".", "tvm_rpc_x86")
 
         args = [
             "server",
             f"--tracker={rpc_tracker_host}:{rpc_tracker_port}",
             f"--port={rpc_server_port}",
-            f"--key={self._device_key}",
+            f"--key={device_key}",
             "--timeout=0",
         ]
 
@@ -823,7 +720,7 @@ def HexagonLauncher(
     sysmon_profile: bool = False,
 ):
     """Creates a HexagonLauncher"""
-    if serial_number == "simulator":
+    if serial_number == HEXAGON_SIMULATOR_NAME:
         return HexagonLauncherSimulator(rpc_info, workspace)
     return HexagonLauncherAndroid(
         serial_number, rpc_info, workspace, hexagon_debug, clear_logcat, sysmon_profile
diff --git a/python/tvm/contrib/hexagon/meta_schedule.py b/python/tvm/contrib/hexagon/meta_schedule.py
index 8a4de74b6131..aaf3f8c7f8d5 100644
--- a/python/tvm/contrib/hexagon/meta_schedule.py
+++ b/python/tvm/contrib/hexagon/meta_schedule.py
@@ -100,7 +100,7 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
 
 
 def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path, args_info):
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
         device = session.device
         _, remote_path = os.path.split(artifact_path)
         uploaded = session.upload(artifact_path, remote_path)
diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index 7ee16f50eab4..d462a65ef930 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -29,6 +29,7 @@
 import tvm.rpc.tracker
 from tvm.contrib.hexagon.build import HexagonLauncher, HexagonLauncherRPC
 from tvm.contrib.hexagon.session import Session
+from tvm.contrib.hexagon.tools import HEXAGON_SIMULATOR_NAME
 
 HEXAGON_TOOLCHAIN = "HEXAGON_TOOLCHAIN"
 TVM_TRACKER_HOST = "TVM_TRACKER_HOST"
@@ -173,7 +174,7 @@ def hexagon_server_process(
 
     if android_serial_num is None:
         pytest.skip("ANDROID_SERIAL_NUMBER is not set.")
-    if android_serial_num == ["simulator"]:
+    if android_serial_num == [HEXAGON_SIMULATOR_NAME]:
         yield None
     else:
         # Requesting these fixtures sets up a local tracker, if one
@@ -220,7 +221,7 @@ def pytest_configure(config):
 
 
 def pytest_configure_node(node):
-    # the master for each node fills slaveinput dictionary
+    # the master for each node fills node input dictionary
     # which pytest-xdist will transfer to the subprocess
     if node.config.iplist is not None:
         node.workerinput["device_adr"] = node.config.iplist.pop()
@@ -240,7 +241,7 @@ def hexagon_launcher(
     """Initials and returns hexagon launcher which reuses RPC info and Android serial number."""
     android_serial_num = android_serial_number()
 
-    if android_serial_num != ["simulator"]:
+    if android_serial_num != [HEXAGON_SIMULATOR_NAME]:
         rpc_info = hexagon_server_process["launcher"]._rpc_info
     else:
         rpc_info = {
@@ -250,7 +251,7 @@ def hexagon_launcher(
             "adb_server_socket": adb_server_socket,
         }
     try:
-        if android_serial_num == ["simulator"]:
+        if android_serial_num == [HEXAGON_SIMULATOR_NAME]:
             launcher = HexagonLauncher(serial_number=android_serial_num[0], rpc_info=rpc_info)
             launcher.start_server()
         else:
@@ -263,7 +264,7 @@ def hexagon_launcher(
             )
         yield launcher
     finally:
-        if android_serial_num == ["simulator"]:
+        if android_serial_num == [HEXAGON_SIMULATOR_NAME]:
             launcher.stop_server()
         elif not hexagon_debug:
             launcher.cleanup_directory()
@@ -274,7 +275,7 @@ def hexagon_session(hexagon_launcher: HexagonLauncherRPC) -> Session:
     if hexagon_launcher is None:
         yield None
     else:
-        with hexagon_launcher.start_session() as session:
+        with hexagon_launcher.create_session() as session:
             yield session
 
 
@@ -289,7 +290,7 @@ def terminate_rpc_servers():
     # yield happens every time.
     serial = os.environ.get(ANDROID_SERIAL_NUMBER)
     yield []
-    if serial == ["simulator"]:
+    if serial == [HEXAGON_SIMULATOR_NAME]:
         os.system("ps ax | grep tvm_rpc_x86 | awk '{print $1}' | xargs kill")
 
 
diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index d6ea51b53e17..466103f6e2c9 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -30,7 +30,7 @@
     AOTExecutorFactoryModule,
     GraphExecutorFactoryModule,
 )
-from .tools import export_module
+from .tools import export_module, HEXAGON_SIMULATOR_NAME
 
 
 class Session:
@@ -38,11 +38,17 @@ class Session:
 
     Parameters
     ----------
-    launcher : HexagonLauncherRPC
-        The launcher from which this session was started.
+    remote_workspace : Union[str, pathlib.Path]
+        Remote workspace path
 
-    remote_kw : dict
-        Remote configs for RPC tracker.
+    rpc_tracker : tuple(str, int)
+        RPC tracker host and port number.
+
+    rpc_server_key : str
+        RPC server key on remote device.
+
+    serial_number : str
+        Device serial number. `simulator` used for hexagon simulator.
 
     session_name : str
         Hexagon RPC session name.
@@ -50,21 +56,28 @@ class Session:
     remote_stack_size_bytes : int
         The stack size of the remote device, to be passed to
         tvm.contrib.hexagon.create_hexagon_session.
+
+    rpc_receive_buffer_size_bytes : int
+        RPC receive buffer size in bytes.
     """
 
     def __init__(
         self,
-        launcher: "HexagonLauncherRPC",
-        remote_kw: dict,
+        remote_workspace: Union[str, pathlib.Path],
+        rpc_tracker: tuple,
+        rpc_server_key: str,
+        serial_number: str,
         session_name: str = "hexagon-rpc",
         remote_stack_size_bytes: int = 256 * 1024,  # Min size for main thread in QuRT/sim
         rpc_receive_buffer_size_bytes: int = 256 * 1024 * 1024,  # Size for passing hexagon tests
     ):
-        self._launcher = launcher
+        self._workspace = str(remote_workspace)
+        self._rpc_tracker = rpc_tracker
+        self._rpc_server_key = rpc_server_key
+        self._serial_number = serial_number
         self._session_name: str = session_name
         self._remote_stack_size_bytes: int = remote_stack_size_bytes
         self._rpc_receive_buffer_size_bytes: int = rpc_receive_buffer_size_bytes
-        self._remote_kw: dict = remote_kw
         self._rpc = None
         self._requires_cpu_device = False
         self._device = None
@@ -74,12 +87,12 @@ def __enter__(self):
             # Already initialized
             return self
 
-        tracker = _rpc.connect_tracker(self._remote_kw["host"], self._remote_kw["port"])
+        tracker = _rpc.connect_tracker(self._rpc_tracker[0], self._rpc_tracker[1])
         try:
             self._rpc = tracker.request(
-                self._remote_kw["key"],
-                priority=self._remote_kw["priority"],
-                session_timeout=self._remote_kw["timeout"],
+                self._rpc_server_key,
+                priority=0,
+                session_timeout=0,
                 session_constructor_args=[
                     "tvm.contrib.hexagon.create_hexagon_session",
                     self._session_name,
@@ -124,6 +137,9 @@ def device(self):
 
         return self._device
 
+    def is_simulator(self):
+        return self._serial_number == HEXAGON_SIMULATOR_NAME
+
     def get_function(self, name):
         return self._rpc.get_function(name)
 
@@ -142,7 +158,12 @@ def upload(self, local_path: Union[str, pathlib.Path], remote_filename: str) ->
         pathlib.Path :
             Uploaded file remote path.
         """
-        return self._launcher.upload(local_path, remote_filename)
+        upload_func = self._rpc.get_function("tvm.rpc.server.upload")
+        remote_path = f"{self._workspace}/{remote_filename}"
+        with open(local_path, mode="rb") as src_f:
+            data = bytearray(src_f.read())
+        upload_func(remote_path, data)
+        return remote_path
 
     def load_module(self, module: Union[str, pathlib.Path, tvm.runtime.Module]):
         """Load TVM module.
@@ -206,7 +227,6 @@ def get_graph_executor(
             Runtime graph module that can be used to execute the graph.
 
         """
-
         graph_mod = self.load_module(module_name)
         self._set_device_type(graph_mod)
         return tvm.contrib.graph_executor.create(graph_json, graph_mod, self.device)
diff --git a/python/tvm/contrib/hexagon/tools.py b/python/tvm/contrib/hexagon/tools.py
index 3f4adb90f645..8c37261744d5 100644
--- a/python/tvm/contrib/hexagon/tools.py
+++ b/python/tvm/contrib/hexagon/tools.py
@@ -53,6 +53,8 @@
     pathlib.Path(HEXAGON_SDK_ROOT) / "incs" / "stddef",
 ]
 
+HEXAGON_SIMULATOR_NAME = "simulator"
+
 
 def register_linker(f):
     """Register a function that will return the path to the Hexagon linker."""
diff --git a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
index 41c63d0affeb..f39944615bfd 100644
--- a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
@@ -32,6 +32,7 @@ extern "C" {
 #include <tvm/runtime/registry.h>
 
 #include <algorithm>
+#include <fstream>
 #include <memory>
 #include <string>
 
@@ -342,3 +343,16 @@ TVM_REGISTER_GLOBAL("tvm.hexagon.get_profile_output")
         *rv = false;
       }
     });
+
+void SaveBinaryToFile(const std::string& file_name, const std::string& data) {
+  std::ofstream fs(file_name, std::ios::out | std::ios::binary);
+  ICHECK(!fs.fail()) << "Cannot open " << file_name;
+  fs.write(&data[0], data.length());
+}
+
+TVM_REGISTER_GLOBAL("tvm.rpc.server.upload")
+    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
+      std::string file_name = args[0];
+      std::string data = args[1];
+      SaveBinaryToFile(file_name, data);
+    });
diff --git a/src/runtime/hexagon/rpc/simulator/rpc_server.cc b/src/runtime/hexagon/rpc/simulator/rpc_server.cc
index 41bb2da6f8b1..f4370bd3c88c 100644
--- a/src/runtime/hexagon/rpc/simulator/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/simulator/rpc_server.cc
@@ -23,6 +23,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
+#include <fstream>
 #include <sstream>
 #include <string>
 
@@ -349,3 +350,16 @@ TVM_REGISTER_GLOBAL("tvm.hexagon.get_profile_output")
         *rv = false;
       }
     });
+
+void SaveBinaryToFile(const std::string& file_name, const std::string& data) {
+  std::ofstream fs(file_name, std::ios::out | std::ios::binary);
+  ICHECK(!fs.fail()) << "Cannot open " << file_name;
+  fs.write(&data[0], data.length());
+}
+
+TVM_REGISTER_GLOBAL("tvm.rpc.server.upload")
+    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
+      std::string file_name = args[0];
+      std::string data = args[1];
+      SaveBinaryToFile(file_name, data);
+    });
diff --git a/tests/python/contrib/test_hexagon/README_RPC.md b/tests/python/contrib/test_hexagon/README_RPC.md
index 348be2d9e457..228922d3f184 100644
--- a/tests/python/contrib/test_hexagon/README_RPC.md
+++ b/tests/python/contrib/test_hexagon/README_RPC.md
@@ -60,7 +60,7 @@ subprocess.Popen(
 ./tvm_rpc_android server --port=<RPC_SERVER_PORT> --tracker=<RPC_TRACKER_HOST>:<RPC_TRACKER_PORT> --key=<HEXAGON_REMOTE_DEVICE_KEY>&
 ```
 
-When we do `launcher.start_session()` , a remote RPC session between x86 and android is established via this line:
+When we do `launcher.create_session()` , a remote RPC session between x86 and android is established via this line:
 
 [https://github.com/apache/tvm/blob/0c0245ae2230fa07d3e4b8be490fc9c88965730c/python/tvm/contrib/hexagon/session.py#L57-L67](https://github.com/apache/tvm/blob/0c0245ae2230fa07d3e4b8be490fc9c88965730c/python/tvm/contrib/hexagon/session.py#L57-L67)
 
diff --git a/tests/python/contrib/test_hexagon/benchmark_util.py b/tests/python/contrib/test_hexagon/benchmark_util.py
index 0ded60dc498b..0ccbe514326c 100644
--- a/tests/python/contrib/test_hexagon/benchmark_util.py
+++ b/tests/python/contrib/test_hexagon/benchmark_util.py
@@ -23,8 +23,10 @@
 
 import pytest
 
+from tvm.contrib.hexagon.tools import HEXAGON_SIMULATOR_NAME
 
-def skip_bencharks_flag_and_reason():
+
+def skip_benchmarks_flag_and_reason():
     """
     Returns one of these tuples:
         (False, '') or
@@ -37,7 +39,7 @@ def skip_bencharks_flag_and_reason():
     """
     asn = os.environ.get("ANDROID_SERIAL_NUMBER")
 
-    if asn == "simulator":
+    if asn == HEXAGON_SIMULATOR_NAME:
         return (True, "Skipping benchmarks when  ANDROID_SERIAL_NUMBER='simluator'")
 
     return (False, "")
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
index 4fe21c564330..84a33b9c80d3 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
@@ -25,6 +25,7 @@
 from tvm import meta_schedule as ms
 from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
 from tvm.relay.backend import Executor
+
 from ..infrastructure import get_hexagon_target
 
 
@@ -103,7 +104,7 @@ def test_resnet50(hexagon_launcher):
         llvm_graph_mod.run()
         ref_result = llvm_graph_mod.get_output(0).numpy()
 
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
         graph_mod = session.get_executor_from_factory(hexagon_lowered)
         graph_mod.set_input(input_name, inp.copy())
 
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index 4c8d91dd27ef..a541c25f3cbc 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -163,7 +163,7 @@ def test_resnet50(hexagon_launcher):
             params=params,
         )
 
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
         graph_mod = session.get_executor_from_factory(hexagon_lowered)
         graph_mod.set_input(input_name, inp.copy())
         graph_mod.run()
diff --git a/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py b/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
index 3dcb9a880e00..ee58514569ae 100644
--- a/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
+++ b/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
@@ -26,13 +26,13 @@
 
 import tvm.script
 import tvm.testing
-from tvm.contrib.hexagon.build import HexagonLauncherRPC
+from tvm.contrib.hexagon.session import Session
 from tvm.script import tir as T
 
 from . import benchmark_util as bu
 from .infrastructure import get_hexagon_target
 
-_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_bencharks_flag_and_reason()
+_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_benchmarks_flag_and_reason()
 
 # This is a fixed detail of the v68 architecture.
 HVX_VECTOR_BYTES = 128
@@ -160,7 +160,7 @@ def main(a: T.handle, b: T.handle, c: T.handle):
 
 
 def _benchmark_hexagon_elementwise_add_kernel(
-    hexagon_launcher: HexagonLauncherRPC, shape: list, dtype: str, mem_scope: str
+    hexagon_session: Session, shape: list, dtype: str, mem_scope: str
 ):
     """
     Generate and benchmark a single elementwise-add kernel for Hexagon.
@@ -230,7 +230,7 @@ def _benchmark_hexagon_elementwise_add_kernel(
             # Upload the .so to the Android device's file system (or wherever is appropriate
             # when using the Hexagon simulator)...
             target_dso_binary_filename = "test_binary.so"
-            target_dso_binary_pathname = hexagon_launcher.upload(
+            target_dso_binary_pathname = hexagon_session.upload(
                 host_dso_binary_path, target_dso_binary_filename
             )
 
@@ -241,58 +241,57 @@ def _benchmark_hexagon_elementwise_add_kernel(
                 host_numpy_output_data_expected,
             ) = _get_elemwise_add_reference_value_tensors(shape, dtype)
 
-            with hexagon_launcher.start_session() as sess:
-                # On the target device / simulator, make our Hexagon-native shared object
-                # available for use...
-                loaded_hexagon_module: tvm.runtime.module.Module = hexagon_launcher.load_module(
-                    target_dso_binary_pathname, sess
-                )
+            # On the target device / simulator, make our Hexagon-native shared object
+            # available for use...
+            loaded_hexagon_module: tvm.runtime.module.Module = hexagon_session.load_module(
+                target_dso_binary_pathname
+            )
 
-                # Create the target-side tensors to hold the primfunc's inputs and outputs...
-                input1_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)
-                input2_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)
-                output_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)
+            # Create the target-side tensors to hold the primfunc's inputs and outputs...
+            input1_data = tvm.nd.empty(shape, dtype, hexagon_session.device, mem_scope)
+            input2_data = tvm.nd.empty(shape, dtype, hexagon_session.device, mem_scope)
+            output_data = tvm.nd.empty(shape, dtype, hexagon_session.device, mem_scope)
 
-                # Populate the primfunc's input tensors...
-                input1_data.copyfrom(host_numpy_input1_data)
-                input2_data.copyfrom(host_numpy_input2_data)
+            # Populate the primfunc's input tensors...
+            input1_data.copyfrom(host_numpy_input1_data)
+            input2_data.copyfrom(host_numpy_input2_data)
 
-                # Actually benchmark the primfunc...
-                timer = loaded_hexagon_module.time_evaluator(
-                    "main", sess.device, number=10, repeat=1
+            # Actually benchmark the primfunc...
+            timer = loaded_hexagon_module.time_evaluator(
+                "main", hexagon_session.device, number=10, repeat=1
+            )
+            timing_result = timer(input1_data, input2_data, output_data)
+
+            print(f"TIMING RESULT: {timing_result}")
+            log_file.write(f"TIMING RESULT: {timing_result}\n")
+
+            # Verify that the computation actually happened, and produced the correct result.
+            result = output_data.numpy()
+
+            if dtype == "float16":
+                # These are the closest tolerance we currently expect / require for these
+                # kernels.  They may be changed in the future.
+                rel_tolerance = 0.005
+                abs_tolerance = 2.0
+            elif dtype == "int8":
+                rel_tolerance = 0
+                abs_tolerance = 0
+            else:
+                raise Exception(f"Unexpected dtype: {dtype}")
+
+            # TODO: We're assuming that *any* assertion thrown by 'assert_allclose' is because
+            # the numerical differences were too large.  But ideally this code would
+            # differentiate between (a) numerical difference errors, which should simply be
+            # recorded as a failed benchmark run, vs. (b) more serious errors that should
+            # kill the overall script.
+            try:
+                tvm.testing.assert_allclose(
+                    result, host_numpy_output_data_expected, rel_tolerance, abs_tolerance
                 )
-                timing_result = timer(input1_data, input2_data, output_data)
-
-                print(f"TIMING RESULT: {timing_result}")
-                log_file.write(f"TIMING RESULT: {timing_result}\n")
-
-                # Verify that the computation actually happened, and produced the correct result.
-                result = output_data.numpy()
-
-                if dtype == "float16":
-                    # These are the closest tolerance we currently expect / require for these
-                    # kernels.  They may be changed in the future.
-                    rel_tolerance = 0.005
-                    abs_tolerance = 2.0
-                elif dtype == "int8":
-                    rel_tolerance = 0
-                    abs_tolerance = 0
-                else:
-                    raise Exception(f"Unexpected dtype: {dtype}")
-
-                # TODO: We're assuming that *any* assertion thrown by 'assert_allclose' is because
-                # the numerical differences were too large.  But ideally this code would
-                # differentiate between (a) numerical difference errors, which should simply be
-                # recorded as a failed benchmark run, vs. (b) more serious errors that should
-                # kill the overall script.
-                try:
-                    tvm.testing.assert_allclose(
-                        result, host_numpy_output_data_expected, rel_tolerance, abs_tolerance
-                    )
-                except AssertionError as err:
-                    raise bu.NumericalAccuracyException(str(err))
-
-                _BT.record_success(timing_result, **keys_dict)
+            except AssertionError as err:
+                raise bu.NumericalAccuracyException(str(err))
+
+            _BT.record_success(timing_result, **keys_dict)
 
         except bu.NumericalAccuracyException as err:
             print()
@@ -377,7 +376,7 @@ def _get_elemwise_add_reference_value_tensors(shape: list, dtype: str):
 
 @pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON)
 @tvm.testing.requires_hexagon
-def test_elemwise_add(hexagon_launcher: HexagonLauncherRPC):
+def test_elemwise_add(hexagon_session: Session):
     """Main elementwise add test function"""
     for dtype in [
         "int8",
@@ -411,7 +410,7 @@ def test_elemwise_add(hexagon_launcher: HexagonLauncherRPC):
                 ]
 
                 print()
-                _benchmark_hexagon_elementwise_add_kernel(hexagon_launcher, shape, dtype, mem_scope)
+                _benchmark_hexagon_elementwise_add_kernel(hexagon_session, shape, dtype, mem_scope)
 
     print("-" * 80)
     print(f"OUTPUT DIRECTORY: {_HOST_OUTPUT_DIR}")
diff --git a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
index a22b85ee42a2..7e8a6d79f492 100644
--- a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
+++ b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
@@ -60,7 +60,7 @@
 # E.g., it doesn't allow: @pytest.mark.usefixtures("bu.benchmark_group")
 benchmark_group = bu.benchmark_group
 
-_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_bencharks_flag_and_reason()
+_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_benchmarks_flag_and_reason()
 
 
 def _ceil_div(numerator, denominator):
diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py
index 6e12f4b205d1..a7f4cbc39cb1 100644
--- a/tests/python/contrib/test_hexagon/test_meta_schedule.py
+++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py
@@ -62,7 +62,7 @@ def main(  # type: ignore  # pylint: disable=no-self-argument
 
 @tvm.testing.requires_hexagon
 def test_builder_runner(hexagon_launcher):
-    if hexagon_launcher._serial_number == "simulator":
+    if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
     mod = MatmulModule
@@ -175,7 +175,7 @@ def verify_dense(sch, target, M, N, K, hexagon_session):
 
 @tvm.testing.requires_hexagon
 def test_vrmpy_dense(hexagon_launcher):
-    if hexagon_launcher._serial_number == "simulator":
+    if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
     do_tune = True
@@ -213,7 +213,7 @@ def schedule_dense_for_tune(sch):
             )
             sch = ms.tir_integration.compile_tir(database, workload, target)
 
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
         verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
 
 
@@ -273,7 +273,7 @@ def main(  # type: ignore
 
 @tvm.testing.requires_hexagon
 def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
-    if hexagon_launcher._serial_number == "simulator":
+    if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
     M, N, K = 128, 768, 768
@@ -330,13 +330,13 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
     else:
         sch = tvm.tir.Schedule(Module_vrmpy_auto_tensorize, debug_mask="all")
 
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
         verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
 
 
 @tvm.testing.requires_hexagon
 def test_conv2d_relay_auto_schedule(hexagon_launcher):
-    if hexagon_launcher._serial_number == "simulator":
+    if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
     I, O, H, W = 64, 64, 56, 56
@@ -397,7 +397,7 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher):
             target=target,
         )
 
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
         rt_mod = session.get_executor_from_factory(lib)
 
         rt_mod.set_input("data", data_np)
@@ -416,7 +416,7 @@ def test_dense_relay_auto_schedule(hexagon_launcher):
     This is for testing RewriteLayout postproc. Without this postproc,
     dense on Hexagon is extremely slow.
     """
-    if hexagon_launcher._serial_number == "simulator":
+    if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
     target_hexagon = tvm.target.hexagon("v69")
@@ -456,7 +456,7 @@ def test_dense_relay_auto_schedule(hexagon_launcher):
             target=target,
         )
 
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
         rt_mod = session.get_executor_from_factory(lib)
 
         rt_mod.set_input("data", data_np)
diff --git a/tests/python/contrib/test_hexagon/test_models.py b/tests/python/contrib/test_hexagon/test_models.py
index f4495f849fab..db578d1057a6 100644
--- a/tests/python/contrib/test_hexagon/test_models.py
+++ b/tests/python/contrib/test_hexagon/test_models.py
@@ -90,7 +90,7 @@ def test_mobilenet(hexagon_session: Session):
 @tvm.testing.requires_hexagon
 def test_mobilenet_aot(hexagon_session: Session, aot_host_target, aot_target, enable_usmp):
     """Test mobilenet with aot executor"""
-    if hexagon_session._launcher._serial_number == "simulator":
+    if hexagon_session.is_simulator():
         pytest.skip(msg="Skip on simulator due to long runtime.")
 
     dtype = "float32"
diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
index a883a9a251e3..f80a579f58fe 100644
--- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
+++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
@@ -21,7 +21,6 @@
 
 import tvm
 from tvm import tir
-from tvm.contrib.hexagon.session import Session
 from tvm.script import tir as T
 
 from .infrastructure import get_hexagon_target
@@ -181,7 +180,7 @@ def test_async_software_pipeline(hexagon_launcher, comp_type, data, reference, s
         # tvm.lower(schedule.mod["main"]).show()
         func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68"))
 
-    with hexagon_launcher.start_session() as hexagon_session:
+    with hexagon_launcher.create_session() as hexagon_session:
         dev = hexagon_session.device
         mod = hexagon_session.load_module(func)
         out = tvm.nd.array(out_np, device=dev)
diff --git a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py b/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
index 326370eb72d7..7f59e3ffa7fd 100644
--- a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
@@ -22,6 +22,7 @@
 import tvm.testing
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
+
 from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
@@ -74,7 +75,7 @@ def test_cast_fp16_fp32_slice(
         """
         Top level testing function for cast fp16 to fp32
         """
-        if hexagon_session._launcher._serial_number != "simulator":
+        if hexagon_session.is_simulator():
             pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957")
 
         cast_input = te.placeholder(input_shape, name="A", dtype=dtype)
@@ -160,7 +161,7 @@ def test_cast_fp32_fp16_slice(
         """
         Top level testing function for cast fp32 to fp16
         """
-        if hexagon_session._launcher._serial_number != "simulator":
+        if hexagon_session.is_simulator():
             pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957")
 
         cast_input = te.placeholder(input_shape, name="A", dtype=dtype)

From 9dd0a70a0753bbd300227ade0224431ce78baeff Mon Sep 17 00:00:00 2001
From: Anirudh Sundar Subramaniam <quic_sanirudh@quicinc.com>
Date: Wed, 26 Oct 2022 21:33:05 +0530
Subject: [PATCH 440/704] [Hexagon] Fix correctness error in conv2d hvx
 (#13165)

The output vector was not initialized to zero and that was causing
incorrect output on target intermittently, but was always working on
simulator. This patch fixes the error and enables the test case again
---
 src/runtime/hexagon/ops/conv2d_fp16_hvx.cc                      | 2 +-
 .../python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
index cf4dc43c6515..a478fbab352d 100644
--- a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
+++ b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
@@ -268,7 +268,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act,  // NOLINT(*)
              << ", h: " << h << ", wo: " << wo << " out_element_ptr: " << out_element_ptr;
 
     HVX_Vector* out_vector = reinterpret_cast<HVX_Vector*>(out_element_ptr);
-    HVX_Vector existing_out_vec = *out_vector;
+    HVX_Vector existing_out_vec = Q6_V_vzero();
 
     for (int fh = 0; fh < filt_height; ++fh) {
       for (int fw = 0; fw < filt_width; ++fw) {
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
index 3f88a6e432b7..5066a532df9b 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
@@ -197,7 +197,7 @@ class TestConv2dIntrin:
     inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"])
 
     @tvm.testing.requires_hexagon
-    def DISABLED_test_conv2d(self, act_shape, wgt_shape, inp_stride, inp_offset, hexagon_session):
+    def test_conv2d(self, act_shape, wgt_shape, inp_stride, inp_offset, hexagon_session):
         """Test conv2d intrinsic implementation"""
         assert act_shape[3] == wgt_shape[2]
 

From 3836eb9ca921888a2ec1290fc64884ed8e323a7c Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Wed, 26 Oct 2022 10:59:56 -0700
Subject: [PATCH 441/704] [Hexagon] [runtime] Per-thread hardware resource
 management (#13181)

Add support to lock and unlock resources on the individual threads. Adds APIs to get a thread handle for a given resource type, and get a resource type for a given thread handle. This will aid in asynchronous support that is sending work to dedicated threads for the hardware resources.

There are two steps to fully acquire a resource: reserve and lock. HTP/HVX manager objects are created on the global thread, which will reserve the resources. The lock/unlock will occur on the thread that owns a particular instance. The manager objects will be deconstructed when the thread manager is deconstructed. This will release the resources.
---
 src/runtime/hexagon/hexagon_htp.cc            | 12 ++-
 src/runtime/hexagon/hexagon_htp.h             |  3 +
 src/runtime/hexagon/hexagon_hvx.cc            | 27 +++---
 src/runtime/hexagon/hexagon_hvx.h             | 13 +++
 src/runtime/hexagon/hexagon_thread_manager.cc | 84 +++++++++++++++----
 src/runtime/hexagon/hexagon_thread_manager.h  | 32 ++++++-
 .../hexagon/hexagon_thread_manager_tests.cc   | 35 ++++++--
 7 files changed, 168 insertions(+), 38 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_htp.cc b/src/runtime/hexagon/hexagon_htp.cc
index 01344ccf4a79..f6c1d2f01ffb 100644
--- a/src/runtime/hexagon/hexagon_htp.cc
+++ b/src/runtime/hexagon/hexagon_htp.cc
@@ -54,15 +54,19 @@ void HexagonHtp::Acquire() {
   if (!context_id_) {
     LOG(FATAL) << "InternalError: HAP_compute_res_acquire failed\n";
   }
+}
+
+void HexagonHtp::Release() { HAP_compute_res_release((unsigned int)context_id_); }
+
+void HexagonHtp::Lock() {
+  int nErr;
+
   if ((nErr = HAP_compute_res_hmx_lock(context_id_))) {
     LOG(FATAL) << "InternalError: Unable to lock HTP!";
   }
 }
 
-void HexagonHtp::Release() {
-  HAP_compute_res_hmx_unlock((unsigned int)context_id_);
-  HAP_compute_res_release((unsigned int)context_id_);
-}
+void HexagonHtp::Unlock() { HAP_compute_res_hmx_unlock((unsigned int)context_id_); }
 
 }  // namespace hexagon
 }  // namespace runtime
diff --git a/src/runtime/hexagon/hexagon_htp.h b/src/runtime/hexagon/hexagon_htp.h
index b3f0c0b5f71f..928936133dd8 100644
--- a/src/runtime/hexagon/hexagon_htp.h
+++ b/src/runtime/hexagon/hexagon_htp.h
@@ -44,6 +44,9 @@ class HexagonHtp {
   //! \brief Prevent move assignment.
   HexagonHtp& operator=(HexagonHtp&&) = delete;
 
+  void Lock();
+  void Unlock();
+
  private:
   //! \brief Acquisition context ID
   unsigned int context_id_;
diff --git a/src/runtime/hexagon/hexagon_hvx.cc b/src/runtime/hexagon/hexagon_hvx.cc
index 0c3160a7d89b..4fc97bf95475 100644
--- a/src/runtime/hexagon/hexagon_hvx.cc
+++ b/src/runtime/hexagon/hexagon_hvx.cc
@@ -31,25 +31,28 @@ namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-HexagonHvx::HexagonHvx() {
-  // Reserve HVX.
-  int res = qurt_hvx_reserve(QURT_HVX_RESERVE_ALL_AVAILABLE);
-  CHECK((res != QURT_HVX_RESERVE_NOT_SUPPORTED) && (res != QURT_HVX_RESERVE_NOT_SUCCESSFUL))
-      << "error reserving HVX: " << res;
+HexagonHvx::HexagonHvx() { Acquire(); }
 
-  // Lock HVX.
+HexagonHvx::~HexagonHvx() { Release(); }
+
+void HexagonHvx::Acquire() {
+  reserved_count_ = qurt_hvx_reserve(QURT_HVX_RESERVE_ALL);
+  CHECK(reserved_count_ == QURT_HVX_RESERVE_ALL) << "error reserving HVX: " << reserved_count_;
+}
+
+void HexagonHvx::Release() {
+  int rel = qurt_hvx_cancel_reserve();
+  CHECK(rel == 0) << "error releasing HVX: " << rel;
+}
+
+void HexagonHvx::Lock() {
   int lck = qurt_hvx_lock(QURT_HVX_MODE_128B);
   CHECK(lck == 0) << "error locking HVX: " << lck;
 }
 
-HexagonHvx::~HexagonHvx() {
-  // Unlock HVX.
+void HexagonHvx::Unlock() {
   int unl = qurt_hvx_unlock();
   CHECK(unl == 0) << "error unlocking HVX: " << unl;
-
-  // Release HVX.
-  int rel = qurt_hvx_cancel_reserve();
-  CHECK(rel == 0) << "error releasing HVX: " << rel;
 }
 
 }  // namespace hexagon
diff --git a/src/runtime/hexagon/hexagon_hvx.h b/src/runtime/hexagon/hexagon_hvx.h
index 042977981c99..06394d7c5c7d 100644
--- a/src/runtime/hexagon/hexagon_hvx.h
+++ b/src/runtime/hexagon/hexagon_hvx.h
@@ -45,7 +45,20 @@ class HexagonHvx {
   //! \brief Prevent move assignment.
   HexagonHvx& operator=(HexagonHvx&&) = delete;
 
+  //! \brief Lock one HVX to the calling thread.
+  void Lock();
+
+  //! \brief Unlock the HVX for the calling thread.
+  void Unlock();
+
+  //! \brief Number of HVX units reserved.
+  int ReservedCount() { return reserved_count_; }
+
  private:
+  int reserved_count_;
+
+  void Acquire();
+  void Release();
 };
 
 }  // namespace hexagon
diff --git a/src/runtime/hexagon/hexagon_thread_manager.cc b/src/runtime/hexagon/hexagon_thread_manager.cc
index cf64cdc8b2d0..2fbc231e5781 100644
--- a/src/runtime/hexagon/hexagon_thread_manager.cc
+++ b/src/runtime/hexagon/hexagon_thread_manager.cc
@@ -38,18 +38,15 @@ HexagonThreadManager::HexagonThreadManager(unsigned num_threads, unsigned thread
   CHECK_GE(thread_pipe_size_words, MIN_PIPE_SIZE_WORDS);
   CHECK_LE(thread_pipe_size_words, MAX_PIPE_SIZE_WORDS);
 
-  // Support either no resources or a specific set of hardware resources for now.
-  if (!hw_resources.empty()) {
-    CHECK((hw_resources.size() == nthreads_) && (nthreads_ == 6) && (hw_resources[0] == DMA_0) &&
-          (hw_resources[1] == HTP_0) && (hw_resources[2] == HVX_0) && (hw_resources[3] == HVX_1) &&
-          (hw_resources[4] == HVX_2) && (hw_resources[5] == HVX_3))
-        << "Unsupported hardware resource set";
-  }
   hw_resources_ = hw_resources;
+  CheckResources();
 
-  if (!hw_resources_.empty()) {
+  if (create_resource_managers_) {
     DLOG(INFO) << "Initialize hardware resource managers";
-    // Acquisition/locks will be performed on specific threads
+    // This creates the manager objects, which reserves (acquires) the resources.
+    // Calls to lock/unlock will be performed on threads dedicated to instances.
+    // This must be done before spawning threads so we can pass pointers to the
+    // objects in the thread context.
     htp_ = std::make_unique<HexagonHtp>();
     hvx_ = std::make_unique<HexagonHvx>();
   }
@@ -74,9 +71,9 @@ HexagonThreadManager::~HexagonThreadManager() {
 
   // dispatch a command to each thread to exit with status 0
   for (unsigned i = 0; i < nthreads_; i++) {
-    bool success = Dispatch(reinterpret_cast<TVMStreamHandle>(i), thread_exit, nullptr);
+    bool success = Dispatch(reinterpret_cast<TVMStreamHandle>(i), thread_exit, contexts_[i]);
     while (!success) {
-      success = Dispatch(reinterpret_cast<TVMStreamHandle>(i), thread_exit, nullptr);
+      success = Dispatch(reinterpret_cast<TVMStreamHandle>(i), thread_exit, contexts_[i]);
     }
   }
 
@@ -121,6 +118,24 @@ HexagonThreadManager::~HexagonThreadManager() {
   DLOG(INFO) << "Hardware resources released";
 }
 
+void HexagonThreadManager::CheckResources() {
+  create_resource_managers_ = false;
+  CHECK(hw_resources_.empty() || hw_resources_.size() == nthreads_)
+      << "Thread count must match resource count";
+  if (!hw_resources_.empty()) {
+    // Ensure that no more than one of each hardware resource is specified
+    for (int i = 0; i < hw_resources_.size(); i++) {
+      if (hw_resources_[i] != NONE) {
+        create_resource_managers_ = true;
+        for (int j = i + 1; j < hw_resources_.size(); j++) {
+          CHECK(hw_resources_[i] != hw_resources_[j])
+              << "No more than one of each resource type may be specified " << hw_resources_[i];
+        }
+      }
+    }
+  }
+}
+
 void HexagonThreadManager::SpawnThreads(unsigned thread_stack_size_bytes,
                                         unsigned thread_pipe_size_words) {
   // allocate all stack space for threads
@@ -168,7 +183,8 @@ void HexagonThreadManager::SpawnThreads(unsigned thread_stack_size_bytes,
     next_stack_start += thread_stack_size_bytes;
 
     // create the thread
-    contexts_[i] = new ThreadContext(&pipes_[i], i);
+    contexts_[i] = new ThreadContext(&pipes_[i], i, hw_resources_.empty() ? NONE : hw_resources_[i],
+                                     hvx_.get(), htp_.get());
     int rc = qurt_thread_create(&threads_[i], &thread_attr, thread_main, contexts_[i]);
     CHECK_EQ(rc, QURT_EOK);
   }
@@ -185,6 +201,21 @@ const std::vector<TVMStreamHandle> HexagonThreadManager::GetStreamHandles() {
   return out;
 }
 
+TVMStreamHandle HexagonThreadManager::GetStreamHandleByResourceType(HardwareResourceType type) {
+  for (unsigned i = 0; i < hw_resources_.size(); i++) {
+    if (hw_resources_[i] == type) {
+      return reinterpret_cast<TVMStreamHandle>(i);
+    }
+  }
+  CHECK(false) << "Thread for resource type " << type << " not found";
+}
+
+HardwareResourceType HexagonThreadManager::GetResourceTypeForStreamHandle(TVMStreamHandle thread) {
+  CHECK(hw_resources_.size() > reinterpret_cast<int>(thread))
+      << "No thread for handle id exists " << thread;
+  return hw_resources_[reinterpret_cast<int>(thread)];
+}
+
 bool HexagonThreadManager::Dispatch(TVMStreamHandle stream, voidfunc f, void* args) {
   unsigned thread = reinterpret_cast<unsigned>(stream);
   DLOG(INFO) << "Dispatching to stream " << thread;
@@ -284,18 +315,41 @@ void HexagonThreadManager::thread_wait_free(void* semaphore) {
   free(semaphore);
 }
 
-void HexagonThreadManager::thread_exit(void* status) {
-  DLOG(INFO) << "thread exiting";
-  qurt_thread_exit((uint64_t)status);
+void HexagonThreadManager::thread_exit(void* context) {
+  ThreadContext* tc = static_cast<ThreadContext*>(context);
+  unsigned index = tc->index;
+  HardwareResourceType resource_type = tc->resource_type;
+
+  if ((resource_type == HVX_0) || (resource_type == HVX_1) || (resource_type == HVX_2) ||
+      (resource_type == HVX_3)) {
+    tc->hvx->Unlock();
+    DLOG(INFO) << "Thread " << index << " unlocked an HVX instance";
+  } else if (resource_type == HTP_0) {
+    tc->htp->Unlock();
+    DLOG(INFO) << "Thread " << index << " unlocked the HTP";
+  }
+
+  DLOG(INFO) << "Thread " << index << " exiting";
+  qurt_thread_exit((uint64_t)tc->status);
 }
 
 void HexagonThreadManager::thread_main(void* context) {
   ThreadContext* tc = static_cast<ThreadContext*>(context);
   unsigned index = tc->index;
   qurt_pipe_t* mypipe = tc->pipe;
+  HardwareResourceType resource_type = tc->resource_type;
 
   DLOG(INFO) << "Thread " << index << " spawned";
 
+  if ((resource_type == HVX_0) || (resource_type == HVX_1) || (resource_type == HVX_2) ||
+      (resource_type == HVX_3)) {
+    tc->hvx->Lock();
+    DLOG(INFO) << "Thread " << index << " locked an HVX instance";
+  } else if (resource_type == HTP_0) {
+    tc->htp->Lock();
+    DLOG(INFO) << "Thread " << index << " locked the HTP";
+  }
+
   while (true) {  // loop, executing commands from pipe
     DLOG(INFO) << "Thread " << index << " receiving command";
     qurt_pipe_data_t msg = qurt_pipe_receive(mypipe);  // blocks if empty
diff --git a/src/runtime/hexagon/hexagon_thread_manager.h b/src/runtime/hexagon/hexagon_thread_manager.h
index a263cf42dc58..c911d1326a39 100644
--- a/src/runtime/hexagon/hexagon_thread_manager.h
+++ b/src/runtime/hexagon/hexagon_thread_manager.h
@@ -87,6 +87,18 @@ class HexagonThreadManager {
    */
   const std::vector<TVMStreamHandle> GetStreamHandles();
 
+  /*!
+   * \brief Get the spawned threads as stream handles for a resource type.
+   * \returns stream handle.
+   */
+  TVMStreamHandle GetStreamHandleByResourceType(HardwareResourceType type);
+
+  /*!
+   * \brief Get the resource type for a stream handle
+   * \returns stream handle.
+   */
+  HardwareResourceType GetResourceTypeForStreamHandle(TVMStreamHandle thread);
+
   /*!
    * \brief Non-blocking dispatch of a void function and args on a given thread.
    * \param thread Stream handle of the thread on which to dispatch the void function.
@@ -137,9 +149,22 @@ class HexagonThreadManager {
   struct ThreadContext {
     qurt_pipe_t* pipe;
     unsigned index;
-    ThreadContext(qurt_pipe_t* pipe, unsigned index) : pipe(pipe), index(index) {}
+    HardwareResourceType resource_type;
+    HexagonHvx* hvx;
+    HexagonHtp* htp;
+    uint64_t status;
+    ThreadContext(qurt_pipe_t* pipe, unsigned index, HardwareResourceType resource_type,
+                  HexagonHvx* hvx, HexagonHtp* htp)
+        : pipe(pipe), index(index), resource_type(resource_type), hvx(hvx), htp(htp), status(0) {
+      CHECK(resource_type == NONE || (hvx && htp))
+          << "Missing resource manager pointer, type: " << resource_type << " hvx: " << hvx
+          << " htp: " << htp;
+    }
   };
 
+  //! \brief Helper function to ensure the set of requested resources is valid.
+  void CheckResources();
+
   //! \brief Helper function for the constructor to spawn threads.
   void SpawnThreads(unsigned thread_stack_size_bytes, unsigned thread_pipe_size_words);
 
@@ -157,7 +182,7 @@ class HexagonThreadManager {
   static void thread_wait_free(void* semaphore);
 
   //! \brief Void function executed by a thread to exit at time of destruction.
-  static void thread_exit(void* status);
+  static void thread_exit(void* context);
 
   //! \brief Void function executed by each thread as `main`.
   static void thread_main(void* context);
@@ -203,6 +228,9 @@ class HexagonThreadManager {
   //! \brief List of hardware resources
   std::vector<HardwareResourceType> hw_resources_;
 
+  //! \brief Whether or not resource managers should be created
+  bool create_resource_managers_{false};
+
   //! \brief HTP hardware resource.
   // TODO(HWE): Move binding of HTP to a specific thread
   std::unique_ptr<HexagonHtp> htp_;
diff --git a/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc b/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
index d7bf0afed906..af29a428bc69 100644
--- a/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_thread_manager_tests.cc
@@ -42,7 +42,7 @@ class HexagonThreadManagerTest : public ::testing::Test {
   const unsigned stack_size{0x4000};  // 16KB
 };
 
-TEST_F(HexagonThreadManagerTest, ctor_errors) {
+TEST_F(HexagonThreadManagerTest, ctor_edge_cases) {
   // zero threads
   ASSERT_THROW(HexagonThreadManager(0, stack_size, pipe_size), InternalError);
   // too many threads
@@ -57,13 +57,16 @@ TEST_F(HexagonThreadManagerTest, ctor_errors) {
   ASSERT_THROW(HexagonThreadManager(6, stack_size, 0x10000000), InternalError);
   // hw resources count doesn't match thread count
   ASSERT_THROW(HexagonThreadManager(6, stack_size, pipe_size, {DMA_0}), InternalError);
-  // hw resources doesn't match specific supported configuration
+  // no more than one of each hw resource may be specified
+  ASSERT_THROW(HexagonThreadManager(4, stack_size, pipe_size, {DMA_0, HTP_0, HVX_0, HVX_0}),
+               InternalError);
+  // no more than one of each hw resource may be specified
   ASSERT_THROW(
       HexagonThreadManager(6, stack_size, pipe_size, {DMA_0, HTP_0, HVX_0, HVX_1, HVX_2, DMA_0}),
       InternalError);
-  // hw resources doesn't match specific supported configuration
-  ASSERT_THROW(HexagonThreadManager(5, stack_size, pipe_size, {DMA_0, HTP_0, HVX_0, HVX_1, HVX_2}),
-               InternalError);
+  // multiple entries for no resource is allowed.
+  HexagonThreadManager* htm_none = new HexagonThreadManager(2, stack_size, pipe_size, {NONE, NONE});
+  delete htm_none;
 }
 
 TEST_F(HexagonThreadManagerTest, init) {
@@ -334,3 +337,25 @@ TEST_F(HexagonThreadManagerTest, dispatch_writes) {
     CHECK_EQ(array[i], truth[i]);
   }
 }
+
+// Validate threads created for hw resources on global manager
+TEST_F(HexagonThreadManagerTest, threads_for_resource_types) {
+  HexagonThreadManager* thread_manager = HexagonDeviceAPI::Global()->ThreadManager();
+  TVMStreamHandle thread;
+
+  thread = thread_manager->GetStreamHandleByResourceType(DMA_0);
+  CHECK(thread_manager->GetResourceTypeForStreamHandle(thread) == DMA_0);
+  thread = thread_manager->GetStreamHandleByResourceType(HTP_0);
+  CHECK(thread_manager->GetResourceTypeForStreamHandle(thread) == HTP_0);
+  thread = thread_manager->GetStreamHandleByResourceType(HVX_0);
+  CHECK(thread_manager->GetResourceTypeForStreamHandle(thread) == HVX_0);
+  thread = thread_manager->GetStreamHandleByResourceType(HVX_1);
+  CHECK(thread_manager->GetResourceTypeForStreamHandle(thread) == HVX_1);
+  thread = thread_manager->GetStreamHandleByResourceType(HVX_2);
+  CHECK(thread_manager->GetResourceTypeForStreamHandle(thread) == HVX_2);
+  thread = thread_manager->GetStreamHandleByResourceType(HVX_3);
+  CHECK(thread_manager->GetResourceTypeForStreamHandle(thread) == HVX_3);
+  EXPECT_THROW(thread_manager->GetStreamHandleByResourceType(NONE), InternalError);
+  thread = reinterpret_cast<TVMStreamHandle>(6);
+  EXPECT_THROW(thread_manager->GetResourceTypeForStreamHandle(thread), InternalError);
+}

From 9b9b7458e92ea3e673c68750e217d68c26458351 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 26 Oct 2022 12:39:13 -0700
Subject: [PATCH 442/704] [docs] Fix empty code blocks in tutorials (#13188)

Fixes #12343

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 gallery/how_to/compile_models/from_tflite.py  |  8 +-
 .../optimize_operators/opt_conv_tensorcore.py | 12 +--
 .../tune_with_autotvm/tune_conv2d_cuda.py     | 12 +--
 .../tune_with_autotvm/tune_relay_arm.py       | 12 +--
 .../tune_with_autotvm/tune_relay_cuda.py      | 12 +--
 .../tune_relay_mobile_gpu.py                  | 12 +--
 .../how_to/work_with_microtvm/micro_tflite.py | 12 +--
 gallery/tutorial/autotvm_matmul_x86.py        | 12 +--
 gallery/tutorial/cross_compilation_and_rpc.py | 12 +--
 gallery/tutorial/install.py                   | 12 +--
 tests/lint/check_request_hook.py              | 74 ++++++++++++++++---
 11 files changed, 124 insertions(+), 66 deletions(-)

diff --git a/gallery/how_to/compile_models/from_tflite.py b/gallery/how_to/compile_models/from_tflite.py
index 712269381f84..d1b78f11d5b0 100644
--- a/gallery/how_to/compile_models/from_tflite.py
+++ b/gallery/how_to/compile_models/from_tflite.py
@@ -53,14 +53,16 @@
 Below you can find an example on how to compile TFLite model using TVM.
 """
 
+######################################################################
+# Utils for downloading and extracting zip files
+# ----------------------------------------------
+
 # sphinx_gallery_start_ignore
 from tvm import testing
 
 testing.utils.install_request_hook(depth=3)
 # sphinx_gallery_end_ignore
-######################################################################
-# Utils for downloading and extracting zip files
-# ----------------------------------------------
+
 import os
 
 
diff --git a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py
index 4cc2b40b7b8c..8db20b9b9bf8 100644
--- a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py
+++ b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py
@@ -27,12 +27,6 @@
 
 """
 
-# sphinx_gallery_start_ignore
-from tvm import testing
-
-testing.utils.install_request_hook(depth=3)
-# sphinx_gallery_end_ignore
-
 ################################################################
 # TensorCore Introduction
 # -----------------------
@@ -57,6 +51,12 @@
 # We use stride size 1 and padding size 1 for the convolution. In the example, we use
 # NHWCnc memory layout.The following code defines the convolution algorithm in TVM.
 
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
 import tvm
 from tvm import te
 import numpy as np
diff --git a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py
index 95d6dcb0a19c..4560cf881ed8 100644
--- a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py
+++ b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py
@@ -28,12 +28,6 @@
 __name__ == "__main__":` block.
 """
 
-# sphinx_gallery_start_ignore
-from tvm import testing
-
-testing.utils.install_request_hook(depth=3)
-# sphinx_gallery_end_ignore
-
 ######################################################################
 # Install dependencies
 # --------------------
@@ -54,6 +48,12 @@
 #
 # Now return to python code. Import packages.
 
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
 import logging
 import sys
 import numpy as np
diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_arm.py b/gallery/how_to/tune_with_autotvm/tune_relay_arm.py
index ab278021d2ca..a8f66d9b08a1 100644
--- a/gallery/how_to/tune_with_autotvm/tune_relay_arm.py
+++ b/gallery/how_to/tune_with_autotvm/tune_relay_arm.py
@@ -41,12 +41,6 @@
 __name__ == "__main__":` block.
 """
 
-# sphinx_gallery_start_ignore
-from tvm import testing
-
-testing.utils.install_request_hook(depth=3)
-# sphinx_gallery_end_ignore
-
 ######################################################################
 # Install dependencies
 # --------------------
@@ -68,6 +62,12 @@
 #
 # Now return to python code. Import packages.
 
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
 import os
 
 import numpy as np
diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py b/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py
index 459b2798c295..4cf397e2567e 100644
--- a/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py
+++ b/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py
@@ -39,12 +39,6 @@
 __name__ == "__main__":` block.
 """
 
-# sphinx_gallery_start_ignore
-from tvm import testing
-
-testing.utils.install_request_hook(depth=3)
-# sphinx_gallery_end_ignore
-
 ######################################################################
 # Install dependencies
 # --------------------
@@ -65,6 +59,12 @@
 #
 # Now return to python code. Import packages.
 
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
 import os
 
 import numpy as np
diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py b/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py
index 5a4f0c56d2e7..d73e46448b7d 100644
--- a/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py
+++ b/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py
@@ -39,12 +39,6 @@
 __name__ == "__main__":` block.
 """
 
-# sphinx_gallery_start_ignore
-from tvm import testing
-
-testing.utils.install_request_hook(depth=3)
-# sphinx_gallery_end_ignore
-
 ######################################################################
 # Install dependencies
 # --------------------
@@ -66,6 +60,12 @@
 #
 # Now return to python code. Import packages.
 
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
 import os
 
 import numpy as np
diff --git a/gallery/how_to/work_with_microtvm/micro_tflite.py b/gallery/how_to/work_with_microtvm/micro_tflite.py
index 7bbc5fc228cc..b04a2fdca18f 100644
--- a/gallery/how_to/work_with_microtvm/micro_tflite.py
+++ b/gallery/how_to/work_with_microtvm/micro_tflite.py
@@ -25,12 +25,6 @@
 model with Relay.
 """
 
-# sphinx_gallery_start_ignore
-from tvm import testing
-
-testing.utils.install_request_hook(depth=3)
-# sphinx_gallery_end_ignore
-
 ######################################################################
 # .. note::
 #     If you want to run this tutorial on the microTVM Reference VM, download the Jupyter
@@ -128,6 +122,12 @@
 # Load the pretrained TFLite model from a file in your current
 # directory into a buffer
 
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
 import os
 import json
 import tarfile
diff --git a/gallery/tutorial/autotvm_matmul_x86.py b/gallery/tutorial/autotvm_matmul_x86.py
index ebdbacb22153..f074c454bde4 100644
--- a/gallery/tutorial/autotvm_matmul_x86.py
+++ b/gallery/tutorial/autotvm_matmul_x86.py
@@ -45,12 +45,6 @@
   :code:`if __name__ == "__main__":` block.
 """
 
-# sphinx_gallery_start_ignore
-from tvm import testing
-
-testing.utils.install_request_hook(depth=3)
-# sphinx_gallery_end_ignore
-
 ################################################################################
 # Install dependencies
 # --------------------
@@ -70,6 +64,12 @@
 #
 # Now return to python code. Begin by importing the required packages.
 
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
 import logging
 import sys
 
diff --git a/gallery/tutorial/cross_compilation_and_rpc.py b/gallery/tutorial/cross_compilation_and_rpc.py
index 3f74899f7b1d..feab28fa11c1 100644
--- a/gallery/tutorial/cross_compilation_and_rpc.py
+++ b/gallery/tutorial/cross_compilation_and_rpc.py
@@ -31,12 +31,6 @@
 and the Firefly-RK3399 for an OpenCL example.
 """
 
-# sphinx_gallery_start_ignore
-from tvm import testing
-
-testing.utils.install_request_hook(depth=3)
-# sphinx_gallery_end_ignore
-
 ######################################################################
 # Build TVM Runtime on Device
 # ---------------------------
@@ -99,6 +93,12 @@
 #
 # Here we will declare a simple kernel on the local machine:
 
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
 import numpy as np
 
 import tvm
diff --git a/gallery/tutorial/install.py b/gallery/tutorial/install.py
index a499b037940c..b864dbfa85f4 100644
--- a/gallery/tutorial/install.py
+++ b/gallery/tutorial/install.py
@@ -28,12 +28,6 @@
 * Installing from third-party binary package.
 """
 
-# sphinx_gallery_start_ignore
-from tvm import testing
-
-testing.utils.install_request_hook(depth=3)
-# sphinx_gallery_end_ignore
-
 ################################################################################
 # Installing From Source
 # ----------------------
@@ -54,3 +48,9 @@
 # Check out  `TLCPack <https://tlcpack.ai>`_ to learn more. Note that the
 # third party binary packages could contain additional licensing terms for
 # the hardware drivers that are bundled with it.
+
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
diff --git a/tests/lint/check_request_hook.py b/tests/lint/check_request_hook.py
index 6e5c523d1187..35b1a85c3a43 100644
--- a/tests/lint/check_request_hook.py
+++ b/tests/lint/check_request_hook.py
@@ -19,6 +19,7 @@
 import fnmatch
 import re
 from pathlib import Path
+from typing import List, Optional
 
 
 REPO_ROOT = Path(__file__).resolve().parent.parent.parent
@@ -30,6 +31,33 @@
 # sphinx_gallery_end_ignore
 """.rstrip()
 IGNORE_PATTERNS = ["*/micro_tvmc.py", "*/micro_train.py"]
+APACHE_HEADER_LINES = 16
+
+
+def find_code_block_line(lines: List[str]) -> Optional[int]:
+    """
+    This returns the index in 'lines' of the first line of code in the tutorial
+    or none if there are no code blocks.
+    """
+    in_multiline_string = False
+    in_sphinx_directive = False
+
+    i = 0
+    lines = lines[APACHE_HEADER_LINES:]
+    while i < len(lines):
+        line = lines[i].strip()
+        if '"""' in line:
+            in_multiline_string = not in_multiline_string
+        elif "# sphinx_gallery_" in line:
+            in_sphinx_directive = not in_sphinx_directive
+        elif line.startswith("#") or in_sphinx_directive or in_multiline_string or line == "":
+            pass
+        else:
+            return i
+        i += 1
+
+    return None
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -41,6 +69,7 @@
     args = parser.parse_args()
 
     gallery_files = (REPO_ROOT / "gallery").glob("**/*.py")
+    # gallery_files = [x for x in gallery_files if "cross_compi" in str(x)]
 
     errors = []
     for file in gallery_files:
@@ -56,23 +85,43 @@
             content = f.read()
 
         if EXPECTED not in content:
-            errors.append(file)
+            errors.append((file, None))
+            continue
+
+        index = content.index(EXPECTED)
+        line = content.count("\n", 0, index) + EXPECTED.count("\n") + 2
+        expected = find_code_block_line(content.split("\n"))
+
+        if expected is not None and line < expected:
+            errors.append((file, (line, expected)))
 
     if args.fix:
-        for error in errors:
+        for error, line_info in errors:
             with open(error) as f:
                 content = f.read()
 
+            # Note: There must be a little bit of care taken here since inserting
+            # the block between a comment and multiline string will lead to an
+            # empty code block in the HTML output
             if "from __future__" in content:
                 # Place after the last __future__ import
                 new_content = re.sub(
                     r"((?:from __future__.*?\n)+)", r"\1\n" + EXPECTED, content, flags=re.MULTILINE
                 )
             else:
-                # Place after the module doc comment
-                new_content = re.sub(
-                    r"(\"\"\"(?:.*\n)+\"\"\")", r"\1\n" + EXPECTED, content, flags=re.MULTILINE
-                )
+                # Place in the first codeblock
+                lines = content.split("\n")
+                position = find_code_block_line(lines)
+                if position is None:
+                    new_content = "\n".join(lines) + EXPECTED + "\n"
+                else:
+                    print(position)
+                    new_content = (
+                        "\n".join(lines[:position])
+                        + EXPECTED
+                        + "\n\n"
+                        + "\n".join(lines[position:])
+                    )
 
             with open(error, "w") as f:
                 f.write(new_content)
@@ -80,12 +129,19 @@
         # Don't fix, just check and print an error message
         if len(errors) > 0:
             print(
-                f"These {len(errors)} files did not contain the expected text to "
-                "override urllib.request.Request.\n"
+                f"These {len(errors)} file(s) did not contain the expected text to "
+                "override urllib.request.Request, it was at the wrong position, or "
+                "the whitespace is incorrect.\n"
                 "You can run 'python3 tests/lint/check_request_hook.py --fix' to "
                 "automatically fix these errors:\n"
-                f"{EXPECTED}\n\nFiles:\n" + "\n".join([str(error_path) for error_path in errors])
+                f"{EXPECTED}\n\nFiles:"
             )
+            for file, line_info in errors:
+                if line_info is None:
+                    print(f"{file} (missing hook)")
+                else:
+                    actual, expected = line_info
+                    print(f"{file} (misplaced hook at {actual}, expected at {expected})")
             exit(1)
         else:
             print("All files successfully override urllib.request.Request")

From 697533e136b091b2833e4f6c16298496257e987f Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 27 Oct 2022 04:44:21 +0900
Subject: [PATCH 443/704] [TIR] Add utility for anchor block extraction
 (#13194)

I'm working on enabling "anchor block" tuning for MS. This is a utility to extract anchor blocks.

I define the "anchor block" to be the block (1) with an init statement and (2) having the biggest flops count. The latter condition is only used when there are multiple blocks with an init statement.

For example, if the input module is conv2d + fused spatial blocks, conv2d is the anchor block. A module created from winograd convolution has multiple blocks with an init statement (input transform, batched GEMM, and output transform). We use the second condition, the flops count, to determine that the batched GEMM block is the anchor block.
---
 include/tvm/tir/analysis.h                    |  25 +++
 python/tvm/tir/analysis/analysis.py           |  27 ++++
 src/tir/analysis/stmt_finding.cc              | 150 ++++++++++++++++++
 src/tir/schedule/analysis.h                   |   9 --
 src/tir/schedule/analysis/analysis.cc         |  41 -----
 .../test_tir_analysis_stmt_finding.py         |  55 +++++++
 6 files changed, 257 insertions(+), 50 deletions(-)
 create mode 100644 src/tir/analysis/stmt_finding.cc
 create mode 100644 tests/python/unittest/test_tir_analysis_stmt_finding.py

diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h
index 7af591c23883..e9796eca6505 100644
--- a/include/tvm/tir/analysis.h
+++ b/include/tvm/tir/analysis.h
@@ -236,6 +236,31 @@ TVM_DLL Map<Buffer, Optional<Stmt>> DetectBufferAccessLCA(const PrimFunc& func);
  */
 TVM_DLL bool VerifyWellFormed(const PrimFunc& func, bool assert_mode = true);
 
+/*!
+ * \brief Find the entry function of the given IRModule, i.e, functions marked by
+ * `tir::attr::kIsEntryFunc`, whose name is `main` or being the only PrimeFunc.
+ * \param mod The IRModule to find the entry function.
+ * \param result_g_var The result GlobalVar of the entry function.
+ * \return The entry function.
+ */
+const PrimFuncNode* FindEntryFunc(const IRModule& mod, GlobalVar* result_g_var);
+
+/*!
+ * \brief Find the "anchor block" of the given module.
+ * We define the anchor block to be the block with (1) an init statement and (2) having
+ * the biggest flops count. The latter condition is only used when there are multiple blocks
+ * with an init statement.
+ * For example, if the input module is conv2d + fused spatial blocks, conv2d is the anchor block.
+ * The input module may not contain more than one such block. For example, a module having
+ * two conv2d is not allowed as an input.
+ * However, a module created from winograd convolution has multiple blocks with an init statement
+ * (input transform, batched GEMM, and output transform). We use the second condition, the flops
+ * count, to determine that the batched GEMM block is the anchor block.
+ * \param mod The input TIR module.
+ * \return The anchor block if found, nullptr otherwise.
+ */
+const tir::BlockNode* FindAnchorBlock(const IRModule& mod);
+
 // Pass variants of verification analysis
 // directly throws RuntimeError when verification fails.
 namespace transform {
diff --git a/python/tvm/tir/analysis/analysis.py b/python/tvm/tir/analysis/analysis.py
index 545404171309..efb869efd6dc 100644
--- a/python/tvm/tir/analysis/analysis.py
+++ b/python/tvm/tir/analysis/analysis.py
@@ -331,3 +331,30 @@ def OOBChecker():
         The result pass
     """
     return _ffi_api.OOBChecker()  # type: ignore
+
+
+def find_anchor_block(mod: IRModule) -> Block:
+    """Find the "anchor block" of the given module.
+
+    We define the anchor block to be the block with (1) an init statement and (2) having
+    the biggest flops count. The latter condition is only used when there are multiple blocks
+    with an init statement.
+
+    For example, if the input module is conv2d + fused spatial blocks, conv2d is the anchor block.
+    The input module may not contain more than one such block. For example, a module having
+    two conv2d is not allowed as an input.
+
+    However, a module created from winograd convolution has multiple blocks with an init statement
+    (input transform, batched GEMM, and output transform). We use the second condition, the flops
+    count, to determine that the batched GEMM block is the anchor block.
+
+    Parameters
+    ----------
+    mod: tvm.ir.IRModule
+        The input TIR module.
+    Returns
+    -------
+    anchor_block: Block
+        The anchor block if found, None otherwise.
+    """
+    return _ffi_api.find_anchor_block(mod)  # type: ignore # pylint: disable=no-member
diff --git a/src/tir/analysis/stmt_finding.cc b/src/tir/analysis/stmt_finding.cc
new file mode 100644
index 000000000000..107786a0eb38
--- /dev/null
+++ b/src/tir/analysis/stmt_finding.cc
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/stmt_functor.h>
+
+namespace tvm {
+namespace tir {
+
+const PrimFuncNode* FindEntryFunc(const IRModule& mod, GlobalVar* result_g_var) {
+  GlobalVar result = NullValue<GlobalVar>();
+  // Priority 1: PrimFunc marked as `tir::attr::kIsEntryFunc`
+  int num_prim_func = 0;
+  const tir::PrimFuncNode* main_func = nullptr;
+  const tir::PrimFuncNode* last_func = nullptr;
+  for (const auto& kv : mod->functions) {
+    GlobalVar gv = kv.first;
+    BaseFunc base_func = kv.second;
+    if (const auto* func = base_func.as<tir::PrimFuncNode>()) {
+      last_func = func;
+      if (func->HasNonzeroAttr(tir::attr::kIsEntryFunc)) {
+        if (result_g_var != nullptr) {
+          *result_g_var = gv;
+        }
+        return func;
+      }
+      if (gv->name_hint == "main") {
+        main_func = func;
+        result = gv;
+      }
+      ++num_prim_func;
+    }
+  }
+  // Priority 2: PrimFunc whose name is `main`
+  if (main_func != nullptr) {
+    if (result_g_var != nullptr) {
+      *result_g_var = result;
+    }
+    return main_func;
+  }
+  // Priority 3: The only PrimFunc in the IRModule
+  if (num_prim_func == 1) {
+    if (result_g_var != nullptr) {
+      *result_g_var = result;
+    }
+    return last_func;
+  }
+  return nullptr;
+}
+
+Stmt GetEnclosingLoop(const BlockNode* block, Stmt func_body) {
+  struct GetRootSeqStmt : public StmtVisitor {
+    void VisitStmt_(const SeqStmtNode* seq) override { result = seq; }
+    const SeqStmtNode* result;
+  };
+
+  struct BlockFinder : public StmtVisitor {
+    explicit BlockFinder(const BlockNode* tgt) : target(tgt) {}
+
+    void VisitStmt_(const BlockNode* block) override {
+      if (block == target) {
+        found = true;
+      }
+    }
+
+    const BlockNode* target;
+    bool found = false;
+  };
+
+  GetRootSeqStmt seq_finder;
+  seq_finder(func_body);
+
+  ICHECK(seq_finder.result);
+
+  for (auto stmt : seq_finder.result->seq) {
+    if (stmt->IsInstance<ForNode>()) {
+      BlockFinder finder(block);
+      finder(stmt);
+      if (finder.found) {
+        return stmt;
+      }
+    }
+  }
+
+  LOG(FATAL) << "Enclosing loop not found for a block " << GetRef<Block>(block);
+  return Stmt();
+}
+
+const BlockNode* FindAnchorBlock(const IRModule& mod) {
+  struct ReductionBlockCollector : public StmtVisitor {
+    void VisitStmt_(const BlockNode* block) override {
+      if (block->init) {
+        blocks.push_back(block);
+      }
+      StmtVisitor::VisitStmt(block->body);
+    }
+    std::vector<const BlockNode*> blocks;
+  };
+
+  auto prim_func = FindEntryFunc(mod, nullptr);
+
+  ReductionBlockCollector collector;
+  collector(prim_func->body);
+
+  const auto& candidates = collector.blocks;
+
+  if (candidates.empty()) {
+    return nullptr;
+  } else if (candidates.size() == 1) {
+    return candidates[0];
+  }
+
+  double best_flops = -1;
+  int best_idx = 0;
+  for (size_t i = 0; i < candidates.size(); ++i) {
+    auto loop = GetEnclosingLoop(candidates[i], prim_func->body);
+    auto flops = EstimateTIRFlops(loop);
+    if (flops > best_flops) {
+      best_flops = flops;
+      best_idx = i;
+    }
+  }
+  return candidates[best_idx];
+}
+
+TVM_REGISTER_GLOBAL("tir.analysis.find_anchor_block").set_body_typed([](const IRModule& mod) {
+  auto ret = FindAnchorBlock(mod);
+  if (ret) {
+    return Optional<Block>(GetRef<Block>(ret));
+  }
+  return Optional<Block>(NullOpt);
+});
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h
index 7df991826728..bc505a0104be 100644
--- a/src/tir/schedule/analysis.h
+++ b/src/tir/schedule/analysis.h
@@ -72,15 +72,6 @@ const PrimFuncNode* GetRootPrimFunc(const IRModule& mod, const StmtNode* root_bl
  */
 StmtSRef GetSRefTreeRoot(const StmtSRef& sref);
 
-/*!
- * \brief Find the entry function of the given IRModule, i.e, functions marked by
- * `tir::attr::kIsEntryFunc`, whose name is `main` or being the only PrimeFunc.
- * \param mod The IRModule to find the entry function.
- * \param result_g_var The result GlobalVar of the entry function.
- * \return The entry function.
- */
-const PrimFuncNode* FindEntryFunc(const IRModule& mod, GlobalVar* result_g_var);
-
 /******** Scope ********/
 /*!
  * \brief Checks if scope the specified sref is in is a stage-pipeline and return it
diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index 384d006562f0..d8b4f31f4c1b 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -46,47 +46,6 @@ const PrimFuncNode* GetRootPrimFunc(const IRModule& mod, const StmtNode* root_bl
   throw;
 }
 
-const PrimFuncNode* FindEntryFunc(const IRModule& mod, GlobalVar* result_g_var) {
-  GlobalVar result = NullValue<GlobalVar>();
-  // Priority 1: PrimFunc marked as `tir::attr::kIsEntryFunc`
-  int num_prim_func = 0;
-  const tir::PrimFuncNode* main_func = nullptr;
-  const tir::PrimFuncNode* last_func = nullptr;
-  for (const auto& kv : mod->functions) {
-    GlobalVar gv = kv.first;
-    BaseFunc base_func = kv.second;
-    if (const auto* func = base_func.as<tir::PrimFuncNode>()) {
-      last_func = func;
-      if (func->HasNonzeroAttr(tir::attr::kIsEntryFunc)) {
-        if (result_g_var != nullptr) {
-          *result_g_var = gv;
-        }
-        return func;
-      }
-      if (gv->name_hint == "main") {
-        main_func = func;
-        result = gv;
-      }
-      ++num_prim_func;
-    }
-  }
-  // Priority 2: PrimFunc whose name is `main`
-  if (main_func != nullptr) {
-    if (result_g_var != nullptr) {
-      *result_g_var = result;
-    }
-    return main_func;
-  }
-  // Priority 3: The only PrimFunc in the IRModule
-  if (num_prim_func == 1) {
-    if (result_g_var != nullptr) {
-      *result_g_var = result;
-    }
-    return last_func;
-  }
-  return nullptr;
-}
-
 /******** Scope ********/
 
 StmtSRef GetScopeRoot(const ScheduleState& self, const StmtSRef& sref,
diff --git a/tests/python/unittest/test_tir_analysis_stmt_finding.py b/tests/python/unittest/test_tir_analysis_stmt_finding.py
new file mode 100644
index 000000000000..791699e4e4ed
--- /dev/null
+++ b/tests/python/unittest/test_tir_analysis_stmt_finding.py
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+import tvm
+from tvm.tir.analysis import find_anchor_block
+from tvm import te, topi
+from tvm.meta_schedule.testing.te_workload import matmul, conv2d_winograd_nhwc
+
+
+def test_matmul_add():
+    n = m = k = 128
+    A, B, C = matmul(n, m, k)
+    mod = tvm.IRModule()
+    mod["main"] = te.create_prim_func([A, B, C + A])
+
+    block = find_anchor_block(mod)
+
+    assert block.name_hint == "C"
+
+
+def test_winograd():
+    mod = tvm.IRModule()
+    mod["main"] = te.create_prim_func(conv2d_winograd_nhwc(1, 56, 56, 64, 64, 3))
+
+    block = find_anchor_block(mod)
+
+    assert block.name_hint == "bgemm"
+
+
+def test_no_anchor_block():
+    inp = te.placeholder((10,), name="input")
+    out = topi.nn.relu(inp + 1.0)
+    mod = tvm.IRModule()
+    mod["main"] = te.create_prim_func([inp, out])
+
+    assert find_anchor_block(mod) is None
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])

From 04afd8311386b6293bda04843738e972df8e93f1 Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Wed, 26 Oct 2022 13:57:04 -0700
Subject: [PATCH 444/704] =?UTF-8?q?[Hexagon]=20Add=20a=20test=20to=20show?=
 =?UTF-8?q?=20how=20to=20use=20multi=20input=20async=20dma=20pipelin?=
 =?UTF-8?q?=E2=80=A6=20(#13110)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Hexagon] Add a test to show how to use multi input async dma pipelining.

* updates to variable naming and removal of 4.19MB test that fails on 888 devices.

* [Hexagon] Add test cases for adding async dma pipleining to metaschedule generated conv2d.

* Add tests for actual conv2d sourced from resnet50 metascheduling.

* [Hexagon] Change logs.

* Add test to show effect of increased buffer sizes for pipelining.

* skip tests in CI.

* lint
---
 .../test_hexagon/test_async_dma_pipeline.py   | 446 +++++++++++++++++-
 1 file changed, 427 insertions(+), 19 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
index d05e0a6e9216..45e8eb0f68c6 100644
--- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -24,8 +24,6 @@
 from tvm.script import tir as T
 from numpy.random import default_rng
 
-from tvm.tir.function import TensorIntrin
-
 VRMPY_SIZE_B = 128
 VRMPY_SIZE_INT32 = 32
 
@@ -72,9 +70,23 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
     return tvm.tir.Schedule(operator)
 
 
-def evaluate(hexagon_session, sch, a, b, size_a, expected_output, use_async_copy=0):
+def evaluate(
+    hexagon_session,
+    sch,
+    a,
+    b,
+    c,
+    expected_output=None,
+    use_async_copy=0,
+    merge_async_commit_queue_scope=False,
+):
     target_hexagon = tvm.target.hexagon("v68", link_params=True)
-    with tvm.transform.PassContext(config={"tir.use_async_copy": use_async_copy}):
+    with tvm.transform.PassContext(
+        config={
+            "tir.use_async_copy": use_async_copy,
+            "tir.merge_async_commit_queue_scope": merge_async_commit_queue_scope,
+        }
+    ):
         func_tir = tvm.build(
             sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
         )
@@ -82,9 +94,7 @@ def evaluate(hexagon_session, sch, a, b, size_a, expected_output, use_async_copy
 
     a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
     b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device)
-    c_hexagon = tvm.runtime.ndarray.array(
-        np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32"), device=hexagon_session.device
-    )
+    c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
 
     if tvm.testing.utils.IS_IN_CI:
         # Run with reduced number and repeat for CI
@@ -93,7 +103,8 @@ def evaluate(hexagon_session, sch, a, b, size_a, expected_output, use_async_copy
         timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=10, repeat=10)
 
     time = timer(a_hexagon, b_hexagon, c_hexagon)
-    tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output)
+    if expected_output is not None:
+        tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output)
     return round(time.mean * 1000, 4)
 
 
@@ -252,9 +263,32 @@ def get_fake_conv_vtcm_schedule(size_a, size_w, blocks=2):
     sch.compute_at(cache_read_block_a, no)
     sch.fuse(*sch.get_loops(cache_read_block_a)[1:])
 
-    cache_read_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
-    sch.reverse_compute_at(cache_read_block_c, no)
-    sch.fuse(*sch.get_loops(cache_read_block_c)[1:])
+    cache_write_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
+    sch.reverse_compute_at(cache_write_block_c, no)
+    sch.fuse(*sch.get_loops(cache_write_block_c)[1:])
+
+    return sch
+
+
+def get_multi_input_fake_conv_vtcm_schedule(size_a, size_w, blocks=2):
+    sch = conv_approximation(size_a, size_w)
+
+    compute_block = sch.get_block("C")
+
+    n = sch.get_loops(compute_block)[0]
+    no, _ = sch.split(n, [blocks, None])
+
+    cache_read_block_a = sch.cache_read(compute_block, 0, "global.vtcm")
+    sch.compute_at(cache_read_block_a, no)
+    sch.fuse(*sch.get_loops(cache_read_block_a)[1:])
+
+    cache_read_block_b = sch.cache_read(compute_block, 1, "global.vtcm")
+    sch.compute_at(cache_read_block_b, no)
+    sch.fuse(*sch.get_loops(cache_read_block_b)[1:])
+
+    cache_write_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
+    sch.reverse_compute_at(cache_write_block_c, no)
+    sch.fuse(*sch.get_loops(cache_write_block_c)[1:])
 
     return sch
 
@@ -271,13 +305,12 @@ class TestAsyncDMAPipeline:
     size_a = tvm.testing.parameter(
         1024,
         64 * 64,
-        128 * 128,
+        128 * 64,
     )
 
     size_w = tvm.testing.parameter(
         1 * 1,
         3 * 3,
-        7 * 7,
         9 * 9,
     )
 
@@ -296,11 +329,24 @@ def test_loading_vtcm_for_vrmpy(
             pytest.skip("Skipping test since it takes too long in CI.")
 
         sch = conv_approximation(size_a, size_w)
-        base_runtime = evaluate(hexagon_session, sch, input_a, input_w, size_a, expected_output)
+        base_runtime = evaluate(
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+        )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
         base_vtcm_runtime = evaluate(
-            hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
         )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
@@ -309,7 +355,13 @@ def test_loading_vtcm_for_vrmpy(
         sch.annotate(n, "software_pipeline_order", [0, 1, 2])
         sch.annotate(n, "software_pipeline_async_stages", [0])
         async_input_runtime = evaluate(
-            hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
         )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
@@ -318,7 +370,44 @@ def test_loading_vtcm_for_vrmpy(
         sch.annotate(n, "software_pipeline_order", [0, 1, 2])
         sch.annotate(n, "software_pipeline_async_stages", [0, 2])
         async_input_output_runtime = evaluate(
-            hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
+        )
+
+        sch = get_fake_conv_vtcm_schedule(size_a, size_w)
+        n = sch.get_loops(sch.get_block("C"))[0]
+        sch.annotate(n, "software_pipeline_stage", [0, 3, 6])
+        sch.annotate(n, "software_pipeline_order", [0, 1, 2])
+        sch.annotate(n, "software_pipeline_async_stages", [0, 6])
+        async_input_output_runtime_larger_buffers = evaluate(
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
+        )
+
+        sch = get_multi_input_fake_conv_vtcm_schedule(size_a, size_w)
+        n = sch.get_loops(sch.get_block("C"))[0]
+        sch.annotate(n, "software_pipeline_stage", [0, 0, 1, 2])
+        sch.annotate(n, "software_pipeline_order", [0, 1, 2, 3])
+        sch.annotate(n, "software_pipeline_async_stages", [0, 2])
+        async_multi_input_output_runtime = evaluate(
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
+            merge_async_commit_queue_scope=False,
         )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
@@ -327,12 +416,23 @@ def test_loading_vtcm_for_vrmpy(
         sch.annotate(n, "software_pipeline_order", [0, 1, 2])
         sch.annotate(n, "software_pipeline_async_stages", [2])
         async_output_runtime = evaluate(
-            hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
         )
 
         sch = get_single_dma_schedule(size_a, size_w)
         single_dma_runtime = evaluate(
-            hexagon_session, sch, input_a, input_w, size_a, expected_output
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
         )
 
         # Total transfer size is equal to the size of A + W + C which is equal to 2 * size_a * 128 + size_w * 128
@@ -349,5 +449,313 @@ def test_loading_vtcm_for_vrmpy(
                 "async_dma_input": async_input_runtime,
                 "async_dma_output": async_output_runtime,
                 "async_dma_input_output": async_input_output_runtime,
+                "async_dma_multi_input_output": async_multi_input_output_runtime,
+                "async_input_output_runtime_larger_buffers": async_input_output_runtime_larger_buffers,
             },
         )
+
+
+# from tvm.script import tir as T
+@tvm.script.ir_module
+class ModulePipelined:
+    @T.prim_func
+    def main(
+        p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"],
+        p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"],
+        T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"tir.noalias": True, "global_symbol": "main"})
+        # body
+        # with T.block("root")
+        conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32", scope="global.vtcm")
+        p0_global_vtcm = T.alloc_buffer([1, 1, 230, 230, 4], dtype="uint8", scope="global.vtcm")
+        p1_global_vtcm = T.alloc_buffer([2, 1, 7, 7, 1, 32, 4], dtype="int8", scope="global.vtcm")
+        for ax0, ax1, ax2, ax3, ax4, ax5, ax6 in T.grid(2, 1, 7, 7, 1, 32, 4):
+            with T.block("p1_global.vtcm"):
+                v0, v1, v2, v3, v4, v5, v6 = T.axis.remap(
+                    "SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6]
+                )
+                T.reads(p1[v0, v1, v2, v3, v4, v5, v6])
+                T.writes(p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6])
+                p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6] = p1[v0, v1, v2, v3, v4, v5, v6]
+        for po in T.serial(4):
+            for i in T.serial(55876):
+                with T.block("p0_global.vtcm"):
+                    v0 = T.axis.spatial(1, 0)
+                    v1 = T.axis.spatial(1, 0)
+                    v2 = T.axis.spatial(230, po * 56 + i // 916)
+                    v3 = T.axis.spatial(230, i % 916 // 4)
+                    v4 = T.axis.spatial(4, i % 4)
+                    T.reads(p0[v0, v1, v2, v3, v4])
+                    T.writes(p0_global_vtcm[v0, v1, v2, v3, v4])
+                    p0_global_vtcm[v0, v1, v2, v3, v4] = p0[v0, v1, v2, v3, v4]
+            for i in T.parallel(28):
+                for ii, iii, iiii in T.grid(2, 14, 8):
+                    with T.block("conv2d_NCHWc_int8_o_init"):
+                        n = T.axis.spatial(1, 0)
+                        oc_chunk = T.axis.spatial(2, ii)
+                        oh = T.axis.spatial(112, (po * 28 + i) // 14 * 14 + iii)
+                        ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + iiii)
+                        oc_block_o = T.axis.spatial(1, 0)
+                        T.reads()
+                        T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
+                        for i4_1 in T.vectorized(32):
+                            with T.block("conv2d_NCHWc_int8_init"):
+                                oc_block_i_init = T.axis.spatial(32, i4_1)
+                                T.reads()
+                                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init])
+                                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+                for i1_1, i5_1, i6_1, i2_2, i3_2 in T.grid(2, 7, 7, 14, 8):
+                    with T.block("conv2d_NCHWc_int8_o_update"):
+                        n = T.axis.spatial(1, 0)
+                        oc_chunk = T.axis.spatial(2, i1_1)
+                        oh = T.axis.spatial(112, (po * 28 + i) // 14 * 14 + i2_2)
+                        ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + i3_2)
+                        oc_block_o = T.axis.spatial(1, 0)
+                        kh = T.axis.reduce(7, i5_1)
+                        kw = T.axis.reduce(7, i6_1)
+                        ic_outer = T.axis.reduce(1, 0)
+                        ic_f_inner = T.axis.reduce(1, 0)
+                        ic_s_inner_o = T.axis.reduce(1, 0)
+                        T.reads(
+                            conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                            p0_global_vtcm[
+                                n,
+                                ic_outer,
+                                oh * 2 + kh,
+                                ow * 2 + kw,
+                                ic_f_inner * 4 : ic_f_inner * 4 + 4,
+                            ],
+                            p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                        )
+                        T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
+                        A = T.match_buffer(
+                            p0_global_vtcm[
+                                n,
+                                ic_outer,
+                                oh * 2 + kh,
+                                ow * 2 + kw,
+                                ic_f_inner * 4 : ic_f_inner * 4 + 4,
+                            ],
+                            [4],
+                            dtype="uint8",
+                            offset_factor=1,
+                            scope="global.vtcm",
+                        )
+                        B = T.match_buffer(
+                            p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                            [32, 4],
+                            dtype="int8",
+                            offset_factor=1,
+                            scope="global.vtcm",
+                        )
+                        C = T.match_buffer(
+                            conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                            [32],
+                            dtype="int32",
+                            offset_factor=1,
+                            scope="global.vtcm",
+                        )
+                        A_u8x4: T.uint8x4 = A[0:4]
+                        A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                        B_i8x128 = B[0, 0:128]
+                        B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+                        C[0:32] = T.call_llvm_pure_intrin(
+                            4217,
+                            T.uint32(3),
+                            C[0:32],
+                            T.broadcast(A_i32, 32),
+                            B_i32x32,
+                            dtype="int32x32",
+                        )
+            for i in T.serial(200704):
+                with T.block("conv2d_NCHWc_int8.vtcm"):
+                    ax0_1 = T.axis.spatial(1, 0)
+                    ax1_1 = T.axis.spatial(2, i % 7168 // 3584)
+                    ax2_1 = T.axis.spatial(112, (po * 28 + i // 7168) // 14 * 14 + i % 3584 // 256)
+                    ax3_1 = T.axis.spatial(112, (po * 28 + i // 7168) % 14 * 8 + i % 256 // 32)
+                    ax4 = T.axis.spatial(32, i % 32)
+                    T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
+                    T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
+                    T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[
+                        ax0_1, ax1_1, ax2_1, ax3_1, ax4
+                    ]
+
+
+# from tvm.script import tir as T
+@tvm.script.ir_module
+class ModuleBase:
+    @T.prim_func
+    def main(
+        p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"],
+        p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"],
+        T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"tir.noalias": True, "global_symbol": "main"})
+        # buffer definition
+        # body
+        # with T.block("root")
+        conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32")
+        for i0_0_i1_0_i2_0_i3_0_fused in T.parallel(
+            112, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}
+        ):
+            for i4_0_0 in T.serial(1):
+                for i1_1_init, i2_1_init, i3_1_init, i1_2_init, i2_2_init, i3_2_init in T.grid(
+                    2, 1, 1, 1, 14, 8
+                ):
+                    with T.block("conv2d_NCHWc_int8_o_init"):
+                        n = T.axis.spatial(1, 0)
+                        oc_chunk = T.axis.spatial(2, i1_1_init + i1_2_init)
+                        oh = T.axis.spatial(
+                            112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1_init * 14 + i2_2_init
+                        )
+                        ow = T.axis.spatial(
+                            112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1_init * 8 + i3_2_init
+                        )
+                        oc_block_o = T.axis.spatial(1, 0)
+                        T.reads()
+                        T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
+                        for i4_1 in T.vectorized(32):
+                            with T.block("conv2d_NCHWc_int8_init"):
+                                oc_block_i_init = T.axis.spatial(32, i4_1)
+                                T.reads()
+                                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init])
+                                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+                for i5_0, i6_0, i7_0, i8_0, i9_0_0 in T.grid(1, 1, 1, 1, 1):
+                    for (
+                        i0_1,
+                        i1_1,
+                        i2_1,
+                        i3_1,
+                        i4_0_1,
+                        i5_1,
+                        i6_1,
+                        i7_1,
+                        i8_1,
+                        i9_0_1,
+                        i0_2,
+                        i1_2,
+                        i2_2,
+                        i3_2,
+                        i4_0_2,
+                    ) in T.grid(1, 2, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 14, 8, 1):
+                        with T.block("conv2d_NCHWc_int8_o_update"):
+                            n = T.axis.spatial(1, 0)
+                            oc_chunk = T.axis.spatial(2, i1_1 + i1_2)
+                            oh = T.axis.spatial(
+                                112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1 * 14 + i2_2
+                            )
+                            ow = T.axis.spatial(
+                                112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1 * 8 + i3_2
+                            )
+                            oc_block_o = T.axis.spatial(1, 0)
+                            kh = T.axis.reduce(7, i5_0 * 7 + i5_1)
+                            kw = T.axis.reduce(7, i6_0 * 7 + i6_1)
+                            ic_outer = T.axis.reduce(1, 0)
+                            ic_f_inner = T.axis.reduce(1, 0)
+                            ic_s_inner_o = T.axis.reduce(1, 0)
+                            T.reads(
+                                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                                p0[
+                                    n,
+                                    ic_outer,
+                                    oh * 2 + kh,
+                                    ow * 2 + kw,
+                                    ic_f_inner * 4 : ic_f_inner * 4 + 4,
+                                ],
+                                p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                            )
+                            T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
+                            A = T.match_buffer(
+                                p0[
+                                    n,
+                                    ic_outer,
+                                    oh * 2 + kh,
+                                    ow * 2 + kw,
+                                    ic_f_inner * 4 : ic_f_inner * 4 + 4,
+                                ],
+                                [4],
+                                dtype="uint8",
+                                offset_factor=1,
+                            )
+                            B = T.match_buffer(
+                                p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                                [32, 4],
+                                dtype="int8",
+                                offset_factor=1,
+                            )
+                            C = T.match_buffer(
+                                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                                [32],
+                                dtype="int32",
+                                offset_factor=1,
+                            )
+                            A_u8x4: T.uint8x4 = A[0:4]
+                            A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                            B_i8x128 = B[0, 0:128]
+                            B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+                            C[0:32] = T.call_llvm_pure_intrin(
+                                4217,
+                                T.uint32(3),
+                                C[0:32],
+                                T.broadcast(A_i32, 32),
+                                B_i32x32,
+                                dtype="int32x32",
+                            )
+                    for ax0, ax1, ax2, ax3 in T.grid(1, 2, 14, 8):
+                        for ax4_fused in T.vectorized(32):
+                            with T.block("T_cast_2"):
+                                ax0_1, ax1_1 = T.axis.remap("SS", [ax0, ax1])
+                                ax2_1 = T.axis.spatial(
+                                    112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + ax2
+                                )
+                                ax3_1 = T.axis.spatial(
+                                    112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + ax3
+                                )
+                                ax4 = T.axis.spatial(32, ax4_fused)
+                                T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
+                                T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
+                                T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[
+                                    ax0_1, ax1_1, ax2_1, ax3_1, ax4
+                                ]
+
+
+@tvm.testing.requires_hexagon
+def test_meta(hexagon_session):
+    if tvm.testing.utils.IS_IN_CI:
+        pytest.skip("Skipping test since it takes too long in CI.")
+
+    a = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8")
+    w = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8")
+    c = np.zeros((1, 2, 112, 112, 32), dtype="int32")
+
+    sch = tvm.tir.Schedule(ModuleBase)
+    base_runtime = evaluate(hexagon_session, sch, a, w, c)
+
+    sch = tvm.tir.Schedule(ModulePipelined)
+    compute_block = sch.get_block("conv2d_NCHWc_int8_o_update")
+    o = sch.get_loops(compute_block)[0]
+
+    unscheduled_vtcm_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1)
+
+    sch = tvm.tir.Schedule(ModulePipelined)
+    compute_block = sch.get_block("conv2d_NCHWc_int8_o_update")
+    o = sch.get_loops(compute_block)[0]
+
+    sch.annotate(o, "software_pipeline_stage", [0, 1, 2])
+    sch.annotate(o, "software_pipeline_order", [0, 1, 2])
+    sch.annotate(o, "software_pipeline_async_stages", [0, 2])
+
+    pipeline_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1)
+
+    transfer_mb = round((a.size + w.size + c.size) / 1e6, 2)
+    print_results(
+        f"Test with A.size: {a.size}, W.size: {w.size}, and total memory transfer of {transfer_mb} MB...",
+        {
+            "without_vtcm": base_runtime,
+            "unscheduled_vtcm_runtime": unscheduled_vtcm_runtime,
+            "pipeline_runtime": pipeline_runtime,
+        },
+    )

From 6d2aa09cd906c02703aaac571be8c6fd613c38ff Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 26 Oct 2022 20:13:16 -0700
Subject: [PATCH 445/704] [Hexagon] Fix port range bug and add random.seed
 (#13207)

* Fix port range and add random.seed

* fix seed
---
 python/tvm/contrib/hexagon/pytest_plugin.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index d462a65ef930..0771468051d7 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -37,6 +37,7 @@
 ANDROID_REMOTE_DIR = "ANDROID_REMOTE_DIR"
 ANDROID_SERIAL_NUMBER = "ANDROID_SERIAL_NUMBER"
 ADB_SERVER_SOCKET = "ADB_SERVER_SOCKET"
+RNG_SEEDED = False
 
 
 @tvm.testing.fixture
@@ -84,10 +85,18 @@ def android_serial_number() -> Optional[str]:
 def get_free_port() -> int:
     """Return the next port that is available to listen on"""
     global PREVIOUS_PORT
+    global RNG_SEEDED
+
+    if not RNG_SEEDED:
+        random.seed(0)
+        RNG_SEEDED = True
+
     if PREVIOUS_PORT is None:
         port = random.randint(LISTEN_PORT_MIN, LISTEN_PORT_MAX)
     else:
         port = PREVIOUS_PORT + 1
+        if port > LISTEN_PORT_MAX:
+            port = LISTEN_PORT_MIN
 
     while tvm.contrib.hexagon.build._is_port_in_use(port):
         port = port + 1 if port < LISTEN_PORT_MAX else LISTEN_PORT_MIN

From 645a5eafa36ad358a9160f993e849702c82387d7 Mon Sep 17 00:00:00 2001
From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com>
Date: Thu, 27 Oct 2022 12:41:42 +0300
Subject: [PATCH 446/704] [Relay][Hexagon] Add per-channel FixedPointMultiply
 operation (#13080)

* [Relay][Hexagon] Add per-channel FixedPointMultiply operation

Main goal of this commit is to improve performance for Hexagon target and
preserve performance/accuracy for x86, GPU and etc. targets.

"qnn.requantize" operation is lowered into the sequence of multiply, add, shift
during QNN canonicalization pass if scale quantization parameter is the vector
of scalars. This commit adds new Relay per-channel/per-axis FixedPointMultiply
operation and is used in "qnn.requantize" operation lowering.

per-channel/per-axis FixedPointMultiply is implemented through
tir.q_multiply_shift_per_axis intrinsic. For Hexagon target it overrides default
implementation and generates HVX vmpye/vmpyo instruction (see
_q_multiply_shift_per_axis_hexagon). For all other targets it uses default
implementation (64 bits arithmetic).

Performance/accuracy measurement:

CPU(x86) target: accuracy and performance are the same. For other targets should
be the same (otherwise it is bug).

Hexagon target: speedup of qnn.requantize 7x-9x times (Snapdragon 888, 3.08 ms -> 0.39 ms)

* Address code review comments
---
 include/tvm/relay/attrs/transform.h           |  17 +++
 python/tvm/relay/op/_tensor.py                |  13 ++
 python/tvm/tir/__init__.py                    |   2 +-
 python/tvm/tir/op.py                          |  54 +++++++-
 python/tvm/topi/hexagon/tensor_intrin.py      |  84 ++++++++++--
 python/tvm/topi/math.py                       |  58 ++++++++
 src/relay/op/make_op.h                        |   4 +
 src/relay/op/tensor/transform.cc              | 129 ++++++++++++++++++
 src/relay/qnn/op/requantize.cc                |  12 +-
 src/relay/qnn/utils.cc                        |  31 +++++
 src/relay/qnn/utils.h                         |  17 +++
 src/relay/transforms/pattern_utils.h          |   7 +
 src/runtime/crt/host/Makefile                 |   2 +-
 src/target/intrin_rule.cc                     |  84 ++++++++----
 src/tir/op/builtin.cc                         |   5 +
 .../test_hexagon/test_fixed_point_multiply.py |  55 ++++++--
 16 files changed, 516 insertions(+), 58 deletions(-)

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 2741d68eec14..274a421e5719 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -405,6 +405,23 @@ struct FixedPointMultiplyAttrs : public tvm::AttrsNode<FixedPointMultiplyAttrs>
   }
 };
 
+/*! \brief Attributes for per channel/per axes FixedPointMultiply operator */
+struct FixedPointMultiplyPerAxisAttrs : public tvm::AttrsNode<FixedPointMultiplyPerAxisAttrs> {
+  bool is_lshift_required;
+  bool is_rshift_required;
+  Array<Integer> axes;
+
+  TVM_DECLARE_ATTRS(FixedPointMultiplyPerAxisAttrs, "relay.attrs.FixedPointMultiplyPerAxisAttrs") {
+    TVM_ATTR_FIELD(is_lshift_required)
+        .describe("Whether left shift is required in fixed point multiplication.")
+        .set_default(false);
+    TVM_ATTR_FIELD(is_rshift_required)
+        .describe("Whether right shift is required in fixed point multiplication.")
+        .set_default(false);
+    TVM_ATTR_FIELD(axes).describe("List of axes on which input data was quantized.");
+  }
+};
+
 /*! \brief Attributes for LayoutTransform operator */
 struct LayoutTransformAttrs : public tvm::AttrsNode<LayoutTransformAttrs> {
   std::string src_layout;
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index a04199f6a5b1..cf318a025c36 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -170,6 +170,19 @@ def fixed_point_multiply_compute(attrs, inputs, output_type):
 
 register_injective_schedule("fixed_point_multiply")
 
+# per-channel/per-axis fixed point multiply
+@register_compute("fixed_point_multiply_per_axis")
+def fixed_point_multiply_per_axis_compute(attrs, inputs, output_type):
+    assert len(inputs) == 4
+    return [
+        topi.fixed_point_multiply_per_axis(
+            *inputs, attrs.is_lshift_required, attrs.is_rshift_required, attrs.axes
+        )
+    ]
+
+
+register_broadcast_schedule("fixed_point_multiply_per_axis")
+
 # full
 @script
 def _full_shape_func(shape):
diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 2767f2d5f779..d02f7fab7a5c 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -72,7 +72,7 @@
 from .op import likely, isnan, isnullptr, isfinite, isinf, copysign
 from .op import div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod, ceildiv
 from .op import comm_reducer, min, max, sum
-from .op import q_multiply_shift, shift_left, shift_right
+from .op import q_multiply_shift, q_multiply_shift_per_axis, shift_left, shift_right
 from .op import TVMBackendAllocWorkspace, TVMBackendFreeWorkspace
 from .generic import add, subtract, multiply
 
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index 1fd3050c0a7f..588b40ae4033 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -21,10 +21,10 @@
 import tvm._ffi
 from tvm.ir.base import Span
 from tvm.runtime import convert, const
-from tvm.ir import Array, Op
+from tvm.ir import Array, Op, PrimExpr
 
 from .buffer import Buffer
-from .expr import Call, PrimExprWithOp, StringImm, Var, CommReducer
+from .expr import Call, PrimExprWithOp, StringImm, Var, CommReducer, IntImm
 from . import _ffi_api
 
 
@@ -263,8 +263,6 @@ def call_llvm_intrin(dtype, name, *args, span=None):
     # pylint: disable=import-outside-toplevel
     from tvm.target import codegen
 
-    from .expr import IntImm
-
     if isinstance(name, str):
         llvm_id = codegen.llvm_lookup_intrinsic_id(name)
     elif isinstance(name, IntImm):
@@ -307,8 +305,6 @@ def call_llvm_pure_intrin(dtype, name, *args, span=None):
     # pylint: disable=import-outside-toplevel
     from tvm.target import codegen
 
-    from .expr import IntImm
-
     if isinstance(name, str):
         llvm_id = codegen.llvm_lookup_intrinsic_id(name)
     elif isinstance(name, IntImm):
@@ -2238,6 +2234,52 @@ def q_multiply_shift(x, y, q, s):
     return call_intrin("int32", "tir.q_multiply_shift", x, y, q, s)
 
 
+def q_multiply_shift_per_axis(
+    x: PrimExpr,
+    y: PrimExpr,
+    ls: PrimExpr,
+    rs: PrimExpr,
+    q: IntImm,
+    is_lshift_required: IntImm,
+    is_rshift_required: IntImm,
+):
+    """Execute a multiplication between two Q-numbers x and y
+
+    Parameters
+    ----------
+    x : PrimExpr
+        First Q-number.
+    y : PrimExpr
+        Second Q-number.
+    ls : PrimExpr
+         Integer left shift.
+    rs : PrimExpr
+         Integer right shift.
+    q : IntImm
+        Number of fractional bits in x and y. Needs to be > 0.
+    is_lshift_required : IntImm
+                         Whether we need to do left shift or not.
+    is_rshift_required : IntImm
+                         Whether we need to do right shift or not.
+
+    Returns
+    -------
+    z : PrimExpr
+        The result.
+    """
+    return call_intrin(
+        "int32",
+        "tir.q_multiply_shift_per_axis",
+        x,
+        y,
+        ls,
+        rs,
+        q,
+        is_lshift_required,
+        is_rshift_required,
+    )
+
+
 def shift_left(x, y, span=None):
     """Return the result of x left shifted by y bits.
 
diff --git a/python/tvm/topi/hexagon/tensor_intrin.py b/python/tvm/topi/hexagon/tensor_intrin.py
index adea4690d4a7..3e9fd47b0fc6 100644
--- a/python/tvm/topi/hexagon/tensor_intrin.py
+++ b/python/tvm/topi/hexagon/tensor_intrin.py
@@ -25,12 +25,6 @@
 def _q_multiply_shift_hexagon(op):
     """
     Implementation of q_multiply_shift through hexagon intrinsics vmpyewuh and vmpyowh when q == 31.
-
-    Please note that this is introducing a small round-up error for some corner cases with negative
-    shift argument. This is because we are rounding twice instead than only once. I.e.:
-
-        * original q_multiply_shift: round(x*y*2^-s)
-        * hexagon q_multiply_shift: round(round(x*y)*2^-s)
     """
     x = op.args[0]
     y = op.args[1]
@@ -47,9 +41,9 @@ def _q_multiply_shift_hexagon(op):
         op.dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y
     )
     mul_o_1 = tvm.tir.call_llvm_intrin(
-        op.dtype, "llvm.hexagon.V6.vmpyowh.rnd.sacc.128B", tvm.tir.const(3, "uint32"), mul_e_1, x, y
+        op.dtype, "llvm.hexagon.V6.vmpyowh.sacc.128B", tvm.tir.const(3, "uint32"), mul_e_1, x, y
     )
-    fixup = mul_o_1 & (-shift)
+    fixup = 1 << (-shift - 1)
     round_mul = mul_o_1 + fixup
     out_negative_shift = tvm.tir.call_llvm_intrin(
         op.dtype, "llvm.hexagon.V6.vaslwv.128B", tvm.tir.const(2, "uint32"), round_mul, shift
@@ -73,6 +67,80 @@ def _q_multiply_shift_hexagon(op):
 )
 
 
+def _q_multiply_shift_per_axis_hexagon(op):
+    """
+    Implementation of q_multiply_shift_per_axis through hexagon intrinsics vmpyewuh and vmpyowh when
+    q == 31.
+    """
+    x = op.args[0]
+    y = op.args[1]
+    left_shift = op.args[2]
+    right_shift = op.args[3]
+    fractional_bits = op.args[4]
+    is_lshift_required = op.args[5]
+    is_rshift_required = op.args[6]
+
+    # Don't use this intrinsic if we don't have a int32x32 vector
+    # or if we are not multiplying q31 numbers
+    if x.dtype != "int32x32" or fractional_bits.value != 31:
+        return op
+
+    # Don't use this intrinsic when we need do both: left and right shifts.
+    # For now it is not clear how to implement this case through vector HVX instructions without
+    # accuracy drop.
+    if is_rshift_required.value and is_lshift_required.value:
+        return op
+
+    # Case 1: do the left shift
+    shifted_x = x << left_shift
+    mul_e_1 = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), shifted_x, y
+    )
+    left_shift_out = tvm.tir.call_llvm_intrin(
+        op.dtype,
+        "llvm.hexagon.V6.vmpyowh.rnd.sacc.128B",
+        tvm.tir.const(3, "uint32"),
+        mul_e_1,
+        shifted_x,
+        y,
+    )
+
+    # Case 2: do the right shift
+    mul_e_2 = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y
+    )
+    mul_o_2 = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vmpyowh.sacc.128B", tvm.tir.const(3, "uint32"), mul_e_2, x, y
+    )
+    fixup = 1 << (right_shift - 1)
+    round_mul = mul_o_2 + fixup
+    right_shift_out = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vasrwv.128B", tvm.tir.const(2, "uint32"), round_mul, right_shift
+    )
+
+    # Case 3: do neither right nor left shift
+    mul_e_3 = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y
+    )
+    no_shift_out = tvm.tir.call_llvm_intrin(
+        op.dtype, "llvm.hexagon.V6.vmpyowh.rnd.sacc.128B", tvm.tir.const(3, "uint32"), mul_e_3, x, y
+    )
+
+    return tvm.tir.Select(
+        tvm.tir.Not(tvm.tir.Or(is_lshift_required, is_rshift_required)),
+        no_shift_out,
+        tvm.tir.Select(is_lshift_required, left_shift_out, right_shift_out),
+    )
+
+
+register_intrin_lowering(
+    "tir.q_multiply_shift_per_axis",
+    target="hexagon",
+    f=_q_multiply_shift_per_axis_hexagon,
+    level=99,
+)
+
+
 def dot_vrmpy(x_ty, y_ty):
     """Generates vrmpy instruciton for tensorization."""
     int32_lanes = 32
diff --git a/python/tvm/topi/math.py b/python/tvm/topi/math.py
index 9823024ea0bf..dd191c49be28 100644
--- a/python/tvm/topi/math.py
+++ b/python/tvm/topi/math.py
@@ -20,6 +20,7 @@
 from tvm import te
 from . import tag
 from . import cpp
+from .utils import get_const_tuple
 
 
 @tvm.te.tag_scope(tag=tag.ELEMWISE)
@@ -672,6 +673,63 @@ def _compute(*indices):
     return te.compute(x.shape, _compute)
 
 
+@tvm.te.tag_scope(tag=tag.BROADCAST)
+def fixed_point_multiply_per_axis(
+    x: te.Tensor,
+    y: te.Tensor,
+    lshift: te.Tensor,
+    rshift: te.Tensor,
+    is_lshift_required: int,
+    is_rshift_required: int,
+    axes,
+):
+    """Fixed point multiplication between data and a fixed point constant expressed as
+    multiplier * 2^(-shift), where multiplier is a Q-number with 31 fractional bits
+
+    Parameters
+    ----------
+    x : tvm.te.Tensor
+        Input argument.
+    y : tvm.te.Tensor
+        Multiplier of a fixed floating point number described as multiplier*2^(-shift).
+    lshift : tvm.te.Tensor
+        Left shifts of a fixed floating point number described as multiplier*2^(-shift).
+    rshift : tvm.te.Tensor
+        Right shifts of a fixed floating point number described as multiplier*2^(-shift).
+    is_lshift_required : int
+        Whether we need to do left shift or not.
+    is_rshift_required : int
+        Whether we need to do right shift or not.
+
+    Returns
+    -------
+    z : tvm.te.Tensor
+        The result.
+    """
+
+    def _compute(*indices):
+        elements = []
+        for element in get_const_tuple(axes):
+            elements += [indices[element]]
+        param_indices = tuple(elements)
+
+        value = x(*indices)
+        m = y(*param_indices)
+        l_shift = lshift(*param_indices)
+        r_shift = rshift(*param_indices)
+        return tvm.tir.q_multiply_shift_per_axis(
+            value,
+            m,
+            l_shift,
+            r_shift,
+            tvm.tir.const(31, "int32"),
+            tvm.tir.const(is_lshift_required, "bool"),
+            tvm.tir.const(is_rshift_required, "bool"),
+        )
+
+    return te.compute(x.shape, _compute)
+
+
 def cast(x, dtype, span=None):
     """Cast input to specified data type.
 
diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h
index 85938a739182..50d8531c7dd0 100644
--- a/src/relay/op/make_op.h
+++ b/src/relay/op/make_op.h
@@ -54,6 +54,10 @@ Expr MakeBatchMatmul(Expr lhs, Expr rhs, DataType out_dtype, bool transpose_a, b
 
 Expr MakeExpandDims(Expr data, int axis, int num_newaxis);
 
+Expr MakeFixedPointMultiplyPerAxis(Expr x, Expr m, Expr lshift, Expr rshift,
+                                   bool is_lshift_required, bool is_rshift_required,
+                                   Array<Integer> axis);
+
 Expr MakeFull(Expr fill_value, Array<Integer> shape, DataType dtype);
 
 Expr MakeLayoutTransform(Expr data, String src_layout, String dst_layout);
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 985222307ad9..5f063a290740 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -4302,5 +4302,134 @@ RELAY_REGISTER_OP("trilu")
     .set_support_level(3)
     .set_attr<TOpPattern>("TOpPattern", kElemWise);
 
+// FixedPointMultiplyPerAxis
+
+TVM_REGISTER_NODE_TYPE(FixedPointMultiplyPerAxisAttrs);
+
+bool FixedPointMultiplyPerAxisRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                                  const TypeReporter& reporter) {
+  ICHECK_EQ(types.size(), 5) << "FixedPointMultiplyPerAxis: expect 5 types but " << types.size()
+                             << " provided";
+  ICHECK_EQ(num_inputs, 4) << "FixedPointMultiplyPerAxis: expect 4 inputs but " << num_inputs
+                           << " provided";
+
+  for (int i = 0; i < num_inputs; i++) {
+    auto data = types[i].as<TensorTypeNode>();
+    if (data == nullptr) {
+      ICHECK(types[i].as<IncompleteTypeNode>())
+          << "FixedPointMultiplyPerAxis: expect input type to be TensorType but get " << types[i];
+      return false;
+    }
+  }
+
+  return IdentityRel({types[0], types[4]}, 1, attrs, reporter);
+}
+
+InferCorrectLayoutOutput FixedPointMultiplyPerAxisInferCorrectLayout(
+    const Attrs& attrs, const Array<Layout>& new_in_layouts, const Array<Layout>& old_in_layouts,
+    const Array<tvm::relay::Type>& old_in_types) {
+  const auto* attrs_ptr = attrs.as<FixedPointMultiplyPerAxisAttrs>();
+  ICHECK(attrs_ptr);
+  ObjectPtr<FixedPointMultiplyPerAxisAttrs> param =
+      make_object<FixedPointMultiplyPerAxisAttrs>(*attrs_ptr);
+
+  Array<Array<IndexExpr>> old_in_shapes;
+  for (auto old_in_t : old_in_types) {
+    ICHECK(old_in_t.as<TensorTypeNode>());
+    old_in_shapes.push_back(old_in_t.as<TensorTypeNode>()->shape);
+  }
+
+  Array<Layout> input_layouts, output_layouts;
+
+  if (new_in_layouts.defined()) {
+    const Layout& new_layout = new_in_layouts[0];
+    const Layout& old_layout = old_in_layouts[0];
+
+    std::unordered_set<std::string> old_dims;
+    for (auto axis : param->axes) {
+      ICHECK_GE(axis->value, 0) << "Axis out of bounds in FixedPointMultiplyPerAxis operator.";
+      ICHECK_LT(axis->value, old_in_shapes[0].size())
+          << "Axis out of bounds in FixedPointMultiplyPerAxis operator.";
+      old_dims.emplace(old_layout[axis->value].name());
+    }
+
+    Array<tvm::Integer> new_axes;
+    std::string new_layout_string = "";
+    for (size_t axis_index = 0; axis_index < new_layout->axes.size(); ++axis_index) {
+      const auto& layout_axis = LayoutAxis::Get(new_layout->axes[axis_index]);
+      const std::string& layout_dim = layout_axis.name();
+      if (layout_axis.IsPrimal()) {
+        if (old_dims.count(layout_dim)) {
+          new_axes.push_back(tvm::Integer(axis_index));
+          new_layout_string += layout_dim;
+        }
+      } else {
+        auto primal_dim = layout_axis.ToPrimal().name();
+        if (old_dims.count(primal_dim)) {
+          new_axes.push_back(tvm::Integer(axis_index));
+          new_layout_string += std::to_string(new_layout.FactorOf(layout_axis)) + layout_dim;
+        }
+      }
+    }
+
+    Layout channel_layout = Layout(new_layout_string);
+
+    input_layouts = {new_layout, channel_layout, channel_layout, channel_layout};
+    output_layouts = {new_layout};
+    param->axes = std::move(new_axes);
+  } else if (old_in_layouts.defined()) {
+    ICHECK_EQ(old_in_layouts.size(), 4);
+    ICHECK_EQ(param->axes.size(), 1);  // Not tested other cases
+    const Layout& old_layout = old_in_layouts[0];
+    if (old_layout.defined()) {
+      std::string layout_string = old_layout[param->axes[0]->value].name();
+      Layout channel_layout = Layout(layout_string);
+
+      input_layouts = {old_layout, channel_layout, channel_layout, channel_layout};
+      output_layouts = {old_layout};
+    } else {
+      // Set the layouts to undef.
+      Layout undef = Layout::Undef();
+      input_layouts = Array<Layout>(4, undef);
+      output_layouts = {undef};
+    }
+  } else {
+    // Set the layouts to undef.
+    Layout undef = Layout::Undef();
+    input_layouts = Array<Layout>(4, undef);
+    output_layouts = {undef};
+  }
+
+  return InferCorrectLayoutOutput(input_layouts, output_layouts, Attrs(param));
+}
+
+Expr MakeFixedPointMultiplyPerAxis(Expr x, Expr m, Expr lshift, Expr rshift,
+                                   bool is_lshift_required, bool is_rshift_required,
+                                   Array<Integer> axes) {
+  auto attrs = make_object<FixedPointMultiplyPerAxisAttrs>();
+  attrs->is_lshift_required = is_lshift_required;
+  attrs->is_rshift_required = is_rshift_required;
+  attrs->axes = std::move(axes);
+  static const Op& op = Op::Get("fixed_point_multiply_per_axis");
+  return Call(op, {x, m, lshift, rshift}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.fixed_point_multiply_per_axis")
+    .set_body_typed(MakeFixedPointMultiplyPerAxis);
+
+RELAY_REGISTER_OP("fixed_point_multiply_per_axis")
+    .describe(R"code(per channel fixed point multiplication)code" TVM_ADD_FILELINE)
+    .set_num_inputs(4)
+    .add_argument("data", "Tensor", "The input tensor.")
+    .add_argument("fp_multiplier", "Tensor", "The multipliers tensor.")
+    .add_argument("left_shift", "Tensor", "The left shifts tensor.")
+    .add_argument("right_shift", "Tensor", "The right shifts tensor.")
+    .add_type_rel("FixedPointMultiplyPerAxis", FixedPointMultiplyPerAxisRel)
+    .set_attr<TOpPattern>("TOpPattern", kBroadcast)
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout",
+                                   FixedPointMultiplyPerAxisInferCorrectLayout)
+    .set_attrs_type<FixedPointMultiplyPerAxisAttrs>()
+    .set_support_level(10);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc
index ae321b459788..1614652719c6 100644
--- a/src/relay/qnn/op/requantize.cc
+++ b/src/relay/qnn/op/requantize.cc
@@ -214,6 +214,7 @@ Expr RequantizeLowerInt(const Expr& input_tensor, const Expr& input_scale,
   // if the input scale is per-tensor or per-channel. If it is per-tensor, there is single scale for
   // the whole tensor. For per-channel (aka per-axis), there is a vector of scales for the input
   // tensor. Depending on the quantization type, the fixed point multiplication routing is called.
+  const bool is_upward_rounding = (param->rounding == "UPWARD");
   auto scaled_int32_t = tensor;
   float output_scale_float = GetScalarFromConstant<float>(output_scale);
   if (IsConstScalar(input_scale)) {
@@ -225,8 +226,6 @@ Expr RequantizeLowerInt(const Expr& input_tensor, const Expr& input_scale,
     if (!IsEqualScalar(input_scale, output_scale)) {
       auto [fixed_point_multiplier, shift] = GetFixedPointMultiplierShift(double_multiplier);
 
-      const bool is_upward_rounding = (param->rounding == "UPWARD");
-
       // When using upward rounding (i.e., x.5 rounded to x+1), leverage
       // the FixedPointMultiply operator
       scaled_int32_t =
@@ -246,8 +245,13 @@ Expr RequantizeLowerInt(const Expr& input_tensor, const Expr& input_scale,
     }
     int axis = param->axis;
     axis = (axis == -1) ? input_shape.size() - 1 : axis;
-    scaled_int32_t = FixedPointMultiplyPerChannel(scaled_int32_t, double_multipliers, input_shape,
-                                                  axis, param->rounding);
+
+    // When using "upward" rounding, leverage the FixedPointMultiplyPerAxis operator,
+    // for "tonearest" rounding - lower to multiply, add, shift operators sequence.
+    scaled_int32_t = is_upward_rounding
+                         ? FixedPointMultiplyPerChannel(scaled_int32_t, double_multipliers, axis)
+                         : FixedPointMultiplyPerChannelToNearest(scaled_int32_t, double_multipliers,
+                                                                 input_shape, axis);
   }
 
   // 3) Add the output zero point.
diff --git a/src/relay/qnn/utils.cc b/src/relay/qnn/utils.cc
index ed7a415cf6af..ab72bd957080 100644
--- a/src/relay/qnn/utils.cc
+++ b/src/relay/qnn/utils.cc
@@ -108,6 +108,32 @@ Expr FixedPointMultiplyToNearest(Expr tensor, double multiplier,
   return Cast(tensor, DataType::Int(32));
 }
 
+Expr FixedPointMultiplyPerChannel(Expr tensor, const std::vector<double>& multipliers, int axis) {
+  DataType dtype = DataType::Int(32);
+  int64_t n_channels = static_cast<int64_t>(multipliers.size());
+
+  std::vector<int32_t> fixed_pt_multipliers, lshifts, rshifts;
+  bool is_lshift_required = false, is_rshift_required = false;
+  for (auto multiplier : multipliers) {
+    auto [fixed_pt_multiplier, shift] = GetFixedPointMultiplierShift(multiplier);
+    int lshift = shift > 0 ? shift : 0;
+    int rshift = shift > 0 ? 0 : -shift;
+    fixed_pt_multipliers.push_back(fixed_pt_multiplier);
+    lshifts.push_back(lshift);
+    rshifts.push_back(rshift);
+    is_lshift_required = is_lshift_required | (lshift != 0);
+    is_rshift_required = is_rshift_required | (rshift != 0);
+  }
+
+  auto left_shift_expr = MakeConstantTensor(dtype, {n_channels}, lshifts);
+  auto right_shift_expr = MakeConstantTensor(dtype, {n_channels}, rshifts);
+  auto fixed_pt_multiplier_expr = MakeConstantTensor(dtype, {n_channels}, fixed_pt_multipliers);
+
+  return FixedPointMultiplyPerAxis(tensor, fixed_pt_multiplier_expr, left_shift_expr,
+                                   right_shift_expr, is_lshift_required, is_rshift_required,
+                                   {axis});
+}
+
 Expr FixedPointMultiplyPerChannel(Expr tensor, std::vector<double> multipliers,
                                   const Array<IndexExpr>& input_shape, int channel_axis,
                                   const std::string& rounding) {
@@ -197,6 +223,11 @@ Expr FixedPointMultiplyPerChannel(Expr tensor, std::vector<double> multipliers,
   return Cast(tensor, DataType::Int(32));
 }
 
+Expr FixedPointMultiplyPerChannelToNearest(Expr tensor, std::vector<double> multipliers,
+                                           const Array<IndexExpr>& input_shape, int channel_axis) {
+  return FixedPointMultiplyPerChannel(tensor, multipliers, input_shape, channel_axis, "TONEAREST");
+}
+
 std::string SelectRequntizeParameter(const std::string& arg_value, const std::string& cfg_value,
                                      const bool is_cfg_default, const std::string& name) {
   if (arg_value == "None") {
diff --git a/src/relay/qnn/utils.h b/src/relay/qnn/utils.h
index d084e4871e95..87195eb34d94 100644
--- a/src/relay/qnn/utils.h
+++ b/src/relay/qnn/utils.h
@@ -212,6 +212,23 @@ Expr FixedPointMultiplyToNearest(Expr tensor, double multiplier,
 Expr FixedPointMultiplyPerChannel(Expr tensor, std::vector<double> multiplier,
                                   const Array<IndexExpr>& input_shape, int channel_axis,
                                   const std::string& rounding);
+
+/*
+ * Wrapper for 'FixedPointMultiplyPerChannel' with rounding parameter == "TONEAREST".
+ */
+Expr FixedPointMultiplyPerChannelToNearest(Expr tensor, std::vector<double> multiplier,
+                                           const Array<IndexExpr>& input_shape, int channel_axis);
+
+/*
+ * \brief Creates FixedPointMultiply operation where the input tensor is
+ per-axis/per-channel quantized..
+ * \param tensor The quantized input tensor.
+ * \param multipliers List of scalar multipliers.
+ * \param channel_axis The channel_axis along which the input tensor is quantized.
+ * \return The Relay op.
+ */
+Expr FixedPointMultiplyPerChannel(Expr tensor, const std::vector<double>& multipliers, int axis);
+
 /*
  * \brief Checks whether an expr type is scalar of a given data type.
  * \param expr_type The type of expr to be checked.
diff --git a/src/relay/transforms/pattern_utils.h b/src/relay/transforms/pattern_utils.h
index ffe1cc2ca2ab..d03939e09ea8 100644
--- a/src/relay/transforms/pattern_utils.h
+++ b/src/relay/transforms/pattern_utils.h
@@ -661,6 +661,13 @@ inline Expr FixedPointMultiply(Expr x, int32_t multiplier, int32_t shift) {
   return Call(op, {x}, Attrs(attrs), {});
 }
 
+inline Expr FixedPointMultiplyPerAxis(Expr x, Expr m, Expr lshift, Expr rshift,
+                                      bool is_lshift_required, bool is_rshift_required,
+                                      Array<Integer> axes) {
+  return MakeFixedPointMultiplyPerAxis(x, m, lshift, rshift, is_lshift_required, is_rshift_required,
+                                       axes);
+}
+
 inline Expr Add(Expr lhs, Expr rhs) {
   static const Op& op = Op::Get("add");
   return Call(op, {lhs, rhs}, Attrs(), {});
diff --git a/src/runtime/crt/host/Makefile b/src/runtime/crt/host/Makefile
index d9e87c7d6a41..ea2966045bb2 100644
--- a/src/runtime/crt/host/Makefile
+++ b/src/runtime/crt/host/Makefile
@@ -21,7 +21,7 @@ CXXFLAGS ?= -Werror -Wall -std=c++11 -DTVM_HOST_USE_GRAPH_EXECUTOR_MODULE
 LDFLAGS ?= -Werror -Wall
 
 # Codegen produces spurious lines like: int32_t arg2_code = ((int32_t*)arg_type_ids)[(2)];
-MODEL_CFLAGS ?= -Wno-error=unused-variable -Wno-error=missing-braces
+MODEL_CFLAGS ?= -Wno-error=unused-variable -Wno-error=missing-braces -Wno-error=unused-const-variable
 
 AR ?= ${PREFIX}ar
 CC ?= ${PREFIX}gcc
diff --git a/src/target/intrin_rule.cc b/src/target/intrin_rule.cc
index 5042ae60cb0b..8c7ff1abad51 100644
--- a/src/target/intrin_rule.cc
+++ b/src/target/intrin_rule.cc
@@ -154,6 +154,46 @@ TVM_REGISTER_OP("tir.isinf")
       return isinf(call->args[0]);
     });
 
+/*!
+ * \brief Makes fixed point multiplication.
+ * \param x Input tensor.
+ * \param y Integer multiplier.
+ * \param left_shift Integer left shift.
+ * \param right_shift Integer right shift.
+ * \param is_left_shift_required Flag whether we need to do left shift or not.
+ * \return Calculated expression.
+ */
+static PrimExpr QMultiplyShift(PrimExpr x, PrimExpr y, PrimExpr q, PrimExpr left_shift,
+                               PrimExpr right_shift, PrimExpr is_left_shift_required) {
+  // Only int32 types are supported (any number of lanes is allowed)
+  ICHECK(y.dtype().code() == DLDataTypeCode::kDLInt && y.dtype().bits() == 32);
+  ICHECK(left_shift.dtype().code() == DLDataTypeCode::kDLInt && left_shift.dtype().bits() == 32);
+  ICHECK(right_shift.dtype().code() == DLDataTypeCode::kDLInt && right_shift.dtype().bits() == 32);
+
+  DataType hp_dtype = DataType::Int(64, x.dtype().lanes());
+  DataType lp_dtype = DataType::Int(32, x.dtype().lanes());
+
+  // 1) Cast and Multiply the integer multiplier
+  PrimExpr one = make_const(hp_dtype, 1);
+  x = cast(hp_dtype, x);
+  y = cast(hp_dtype, y);
+  x = tir::Select(is_left_shift_required, x << left_shift, x);
+
+  // 2) Perform the multiplication in higher precision.
+  x = x * y;
+
+  // 3) Find the rounding scalar
+  PrimExpr total_right_shift = right_shift + q;
+  PrimExpr pos_rounding_value = (one << (total_right_shift - 1));
+  x = x + pos_rounding_value;
+
+  // 4) Simply right shift the result to get the final output.
+  x = x >> total_right_shift;
+
+  // 5) The fixed point multiplication keeps the value in int32 range. Casting back to int32.
+  return cast(lp_dtype, x);
+}
+
 TVM_REGISTER_OP("tir.q_multiply_shift")
     .set_attr<FLegalize>("default.FLegalize", [](const PrimExpr& e) -> PrimExpr {
       using tir::make_const;
@@ -197,40 +237,34 @@ TVM_REGISTER_OP("tir.q_multiply_shift")
         }
       } else {
         // Only int32 types are supported (any number of lanes is allowed)
-        ICHECK(y.dtype().code() == DLDataTypeCode::kDLInt && y.dtype().bits() == 32);
         ICHECK(s.dtype().code() == DLDataTypeCode::kDLInt && s.dtype().bits() == 32);
 
-        DataType hp_dtype = DataType::Int(64, x.dtype().lanes());
-        DataType lp_dtype = DataType::Int(32, x.dtype().lanes());
-
-        // 1) Calculating the integer multiplier and integer shift
+        // Calculating integer shifts
         PrimExpr zero = make_const(s.dtype(), 0);
         PrimExpr left_shift = tir::Select(s > zero, s, zero);
         PrimExpr right_shift = tir::Select(s > zero, zero, -s);
+        PrimExpr is_left_shift_required = (left_shift != zero);
 
-        // 2) Cast and Multiply the integer multiplier
-        PrimExpr one = make_const(hp_dtype, 1);
-        x = cast(hp_dtype, x);
-        y = cast(hp_dtype, y);
-        x = tir::Select(left_shift != zero, x << left_shift, x);
-
-        // 3) Perform the multiplication in higher precision.
-        x = x * y;
-
-        // 4) Find the rounding scalar
-        PrimExpr total_right_shift = right_shift + q;
-        PrimExpr pos_rounding_value = (one << (total_right_shift - 1));
-        x = x + pos_rounding_value;
-
-        // 5) Simply right shift the result to get the final output.
-        x = x >> total_right_shift;
-
-        // 6) The fixed point multiplication keeps the value in int32 range. Casting back to
-        // int32.
-        return cast(lp_dtype, x);
+        return QMultiplyShift(x, y, q, left_shift, right_shift, is_left_shift_required);
       }
     });
 
+TVM_REGISTER_OP("tir.q_multiply_shift_per_axis")
+    .set_attr<FLegalize>("default.FLegalize", [](const PrimExpr& e) -> PrimExpr {
+      const tir::CallNode* call = e.as<tir::CallNode>();
+      ICHECK(call != nullptr);
+
+      PrimExpr x = call->args[0];
+      PrimExpr y = call->args[1];
+      PrimExpr left_shift = call->args[2];
+      PrimExpr right_shift = call->args[3];
+      PrimExpr q = call->args[4];
+      PrimExpr is_lshift_required = call->args[5];
+      // Note, 7th argument is "is_rshift_required" flag, but we don't need that here.
+      // PrimExpr is_rshift_required = call->args[6];
+
+      return QMultiplyShift(x, y, q, left_shift, right_shift, is_lshift_required);
+    });
 }  // namespace legalize
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index b605b9de1e02..9feba142eb6a 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -98,6 +98,11 @@ TIR_DEFINE_BUILTIN_FUNC(q_multiply_shift)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure))
     .set_attr<TVectorizable>("TVectorizable", true);
 
+TIR_DEFINE_BUILTIN_FUNC(q_multiply_shift_per_axis)
+    .set_num_inputs(7)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure))
+    .set_attr<TVectorizable>("TVectorizable", true);
+
 TIR_DEFINE_BUILTIN_FUNC(isnullptr).set_num_inputs(1).set_attr<TCallEffectKind>(
     "TCallEffectKind", Integer(CallEffectKind::kPure));
 
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
index ee03599ff1f4..e7e4aa212e35 100644
--- a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
@@ -78,11 +78,25 @@ def run_module(graph_mod, inputs):
     return output
 
 
+in_scale_const, out_scale_const = tvm.testing.parameters(
+    (1.3, 30.0),
+    (1.37, 1.0),
+    (0.6, 1.0),
+    ((1.7, 0.6), 1.0),
+    ((0.007, 1.9), 1.0),
+)
+
+multiplier, shift = tvm.testing.parameters(
+    (1288490240, -2),  # 0.15
+    (1395864320, 1),  # 1.3
+    (1288490188, 0),  # 0.6
+)
+
+
 @tvm.testing.requires_hexagon
-def test_fixed_point_multiply_positive_shift(hexagon_session: Session):
+def test_fixed_point_multiply(hexagon_session: Session, multiplier: int, shift: int):
     ishape = (6, 32)
     a = relay.var("a", relay.TensorType(ishape, "int32"))
-    multiplier, shift = (1395864320, 1)  # 1.3
     fpm = relay.fixed_point_multiply(a, multiplier, shift)
     relay_mod = tvm.IRModule.from_expr(fpm)
 
@@ -108,22 +122,37 @@ def test_fixed_point_multiply_positive_shift(hexagon_session: Session):
 
 
 @tvm.testing.requires_hexagon
-def test_fixed_point_multiply_negative_shift(hexagon_session: Session):
-    ishape = (6, 32)
-    a = relay.var("a", relay.TensorType(ishape, "int32"))
-    multiplier, shift = (1288490240, -2)  # 0.15
-    fpm = relay.fixed_point_multiply(a, multiplier, shift)
-    relay_mod = tvm.IRModule.from_expr(fpm)
+def test_per_channel_fixed_point_multiply(
+    hexagon_session: Session, in_scale_const, out_scale_const
+):
+    ishape = [1, 128, 56, 56]
+    axis = 1
+    a = relay.var("a", shape=ishape, dtype="int32")
+
+    # Make list of input scales from in_scale_const parameter.
+    if isinstance(in_scale_const, tuple):
+        in_scale = list(in_scale_const) * (ishape[axis] // len(in_scale_const))
+    else:
+        in_scale = [in_scale_const] * ishape[axis]
+    assert len(in_scale) == ishape[axis]
+
+    # qnn.requantize is lowered to fixed_point_multiply if zp == 0 and in_dtype == out_dtype.
+    iscale = relay.const(in_scale)
+    izero = relay.const(0)
+    oscale = relay.const(out_scale_const)
+    ozero = relay.const(0)
+    op = relay.qnn.op.requantize(a, iscale, izero, oscale, ozero, axis=axis, out_dtype="int32")
+    mod = tvm.IRModule.from_expr(op)
 
     with tvm.transform.PassContext(opt_level=3):
         # Compile for Hexagon...
-        hexagon_lowered = build_module(relay_mod, tvm.target.hexagon("v68"))
+        hexagon_lowered = build_module(mod, tvm.target.hexagon("v68"))
 
         # Compile for LLVM...
-        llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm"))
+        llvm_lowered = build_module(mod, tvm.target.Target("llvm"))
 
-    data_in = np.arange(-96, 96).reshape(ishape)
-    inputs = {"a": data_in}
+    a_np = np.random.randint(-1000, 1000, size=np.prod(ishape)).reshape(ishape)
+    inputs = {"a": a_np}
 
     # Run hexagon...
     graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
@@ -133,7 +162,7 @@ def test_fixed_point_multiply_negative_shift(hexagon_session: Session):
     llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
     expected_output = run_module(llvm_graph_mod, inputs)
 
-    tvm.testing.assert_allclose(hexagon_output, expected_output, atol=1)
+    tvm.testing.assert_allclose(hexagon_output, expected_output)
 
 
 if __name__ == "__main__":

From 5c9066d816408bd2858c9758b0865ca08112c78f Mon Sep 17 00:00:00 2001
From: Youlei Yang <youlei.yang@intel.com>
Date: Thu, 27 Oct 2022 17:42:33 +0800
Subject: [PATCH 447/704] [AMP] refine AMP and the corresponding tests for
 bfloat16 (#12787)

* refine AMP for bfloat16

* refine AMP tests to cover bfloat16

* refine accuracy checking for dnnl bf16
---
 src/relay/transforms/to_mixed_precision.cc    |  12 +-
 tests/python/contrib/test_dnnl.py             |   5 +-
 tests/python/relay/test_to_mixed_precision.py | 178 ++++++++++--------
 3 files changed, 114 insertions(+), 81 deletions(-)

diff --git a/src/relay/transforms/to_mixed_precision.cc b/src/relay/transforms/to_mixed_precision.cc
index e1d3a264c222..18161b3c2508 100644
--- a/src/relay/transforms/to_mixed_precision.cc
+++ b/src/relay/transforms/to_mixed_precision.cc
@@ -161,7 +161,9 @@ class MixedPrecisionPass : public MixedModeMutator {
      */
     DataType cur_type = (attrs->out_dtype);
     ObjectPtr<T> new_attrs = make_object<T>(*attrs);
-    if (cur_type.is_float() || cur_type.is_void()) new_attrs->out_dtype = accumulation_dtype;
+    if (cur_type.is_float() || cur_type.is_bfloat16() || cur_type.is_void()) {
+      new_attrs->out_dtype = accumulation_dtype;
+    }
     return Attrs(new_attrs);
   }
 
@@ -175,7 +177,9 @@ class MixedPrecisionPass : public MixedModeMutator {
     */
     DataType cur_type = (attrs->dtype);
     ObjectPtr<T> new_attrs = make_object<T>(*attrs);
-    if (cur_type.is_float() || cur_type.is_void()) new_attrs->dtype = accumulation_dtype;
+    if (cur_type.is_float() || cur_type.is_bfloat16() || cur_type.is_void()) {
+      new_attrs->dtype = accumulation_dtype;
+    }
     return Attrs(new_attrs);
   }
 
@@ -217,7 +221,7 @@ class MixedPrecisionPass : public MixedModeMutator {
     /* Cast tensor to the wanted datatype, returning a cached version if it's already been done. */
 
     // If this is not a floating point type, do not cast. E.g. it might be an integer
-    if (!expr_dtype.is_float()) {
+    if (!(expr_dtype.is_float() || expr_dtype.is_bfloat16())) {
       return expr;
     }
 
@@ -299,7 +303,7 @@ class MixedPrecisionPass : public MixedModeMutator {
         original_dtype_.push_back((root_->checked_type_).as<TensorTypeNode>()->dtype);
       }
     }
-    if (!mixed_precision_type_.is_float() && !mixed_precision_type_.is_bfloat16()) {
+    if (!(mixed_precision_type_.is_float() || mixed_precision_type_.is_bfloat16())) {
       LOG(FATAL) << "Only support IEEE floating point mixed precision types and bfloat16, but got "
                  << mixed_precision_type_;
     }
diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py
index c4adc9785c19..f23b3c70aa96 100755
--- a/tests/python/contrib/test_dnnl.py
+++ b/tests/python/contrib/test_dnnl.py
@@ -150,9 +150,8 @@ def assert_result_dict_holds(result_dict):
         res1 = vmobj_to_list(result_dict[k1])
         res2 = vmobj_to_list(result_dict[k2])
         for r1, r2 in zip(res1, res2):
-            if "bf16" in k1 or "bf16" in k2:
-                np.testing.assert_array_almost_equal(r1, r2, decimal=1)
-            else:
+            # ignore the accuracy checking if only one bf16 result presents
+            if ("bf16" in k1) == ("bf16" in k2):
                 tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3)
 
 
diff --git a/tests/python/relay/test_to_mixed_precision.py b/tests/python/relay/test_to_mixed_precision.py
index 026b458bde12..51d040c311f4 100644
--- a/tests/python/relay/test_to_mixed_precision.py
+++ b/tests/python/relay/test_to_mixed_precision.py
@@ -24,6 +24,12 @@
 from tvm.relay.testing import lstm
 from tvm.relay.transform import InferType, ToMixedPrecision, mixed_precision
 
+target_precision = tvm.testing.parameter(
+    pytest.param("float16"),
+    pytest.param("bfloat16"),
+    ids=["float16", "bfloat16"],
+)
+
 
 def run_module(mod: tvm.runtime.Module, mod_params: Dict[str, Any]) -> List:
     dev = tvm.device("llvm", 0)
@@ -48,28 +54,29 @@ def verify_mixed_precision_output_close(
     result_fp32 = run_module(mod, mod_params)
 
     if not keep_orig_output_dtype:
-        fp16_mod = ToMixedPrecision(mixed_precision_dtype)(mod)
-        result_fp16 = run_module(fp16_mod, mod_params)
+        amp_mod = ToMixedPrecision(mixed_precision_dtype)(mod)
+        result_amp = run_module(amp_mod, mod_params)
     else:
         with tvm.transform.PassContext(
             config={"relay.ToMixedPrecision.keep_orig_output_dtype": True}
         ):
-            fp16_mod = ToMixedPrecision(mixed_precision_dtype)(mod)
-            result_fp16 = run_module(fp16_mod, mod_params)
+            amp_mod = ToMixedPrecision(mixed_precision_dtype)(mod)
+            result_amp = run_module(amp_mod, mod_params)
 
     # Ensure the results are close
-    for fp32, fp16 in zip(result_fp32, result_fp16):
-        np.testing.assert_allclose(fp32, fp16, rtol=rtol, atol=atol)
+    if mixed_precision_dtype != "bfloat16":
+        for fp32, amp in zip(result_fp32, result_amp):
+            np.testing.assert_allclose(fp32, amp, rtol=rtol, atol=atol)
 
     if keep_orig_output_dtype:
         assert (
-            np.array(result_fp16).dtype == np.array(result_fp32).dtype
+            np.array(result_amp).dtype == np.array(result_fp32).dtype
         ), "output type and original type mismatch"
 
-    return fp16_mod
+    return amp_mod
 
 
-def test_lstm():
+def test_lstm(target_precision):
     """A small stress test on a single unrolled lstm unit.
 
     Has internal functions and let statements the pass must work on.
@@ -87,7 +94,9 @@ def test_lstm():
             -10, 10, (1, units)
         ).astype("float32")
 
-    verify_mixed_precision_output_close(mod, mod_params, rtol=0.01, atol=0.01)
+    verify_mixed_precision_output_close(
+        mod, mod_params, mixed_precision_dtype=target_precision, rtol=0.01, atol=0.01
+    )
 
 
 def test_lstm_float64():
@@ -114,7 +123,7 @@ def test_lstm_float64():
     )
 
 
-def test_convert_single_conv():
+def test_convert_single_conv(target_precision):
     """Conv is a green listed operation meaning it will always use fp16 workload.
 
     By default it accumulates to fp32 and outputs fp16.
@@ -131,26 +140,31 @@ def test_convert_single_conv():
         "data": np.random.uniform(-1, 1, size=data_shape).astype("float32"),
         "weight": np.random.uniform(-1, 1, size=weight_shape).astype("float32"),
     }
-    fp16_mod = verify_mixed_precision_output_close(
-        mod, mod_params, atol=0.01, rtol=1e-3, keep_orig_output_dtype=True
+    amp_mod = verify_mixed_precision_output_close(
+        mod,
+        mod_params,
+        mixed_precision_dtype=target_precision,
+        atol=0.01,
+        rtol=1e-3,
+        keep_orig_output_dtype=True,
     )
 
     expected_mod = tvm.IRModule.from_expr(
         relay.cast(
             relay.nn.conv2d(
-                relay.cast(data, "float16"),
-                relay.cast(weight, "float16"),
+                relay.cast(data, target_precision),
+                relay.cast(weight, target_precision),
                 strides=(1, 1),
                 padding=(1, 1),
-                out_dtype="float16",
+                out_dtype=target_precision,
             ),
             "float32",
         )
     )
     expected_mod = tvm.relay.transform.InferType()(expected_mod)
 
-    assert not tvm.ir.structural_equal(fp16_mod, mod)
-    assert tvm.ir.structural_equal(fp16_mod, expected_mod)
+    assert not tvm.ir.structural_equal(amp_mod, mod)
+    assert tvm.ir.structural_equal(amp_mod, expected_mod)
 
 
 def test_convert_single_conv_fp64():
@@ -167,7 +181,7 @@ def test_convert_single_conv_fp64():
         "data": np.random.uniform(-1, 1, size=data_shape).astype("float32"),
         "weight": np.random.uniform(-1, 1, size=weight_shape).astype("float32"),
     }
-    fp16_mod = verify_mixed_precision_output_close(
+    amp_mod = verify_mixed_precision_output_close(
         mod, mod_params, mixed_precision_dtype="float64", atol=0.01, rtol=1e-3
     )
 
@@ -184,11 +198,11 @@ def test_convert_single_conv_fp64():
     )
     expected_mod = tvm.relay.transform.InferType()(expected_mod)
 
-    assert not tvm.ir.structural_equal(fp16_mod, mod)
-    assert tvm.ir.structural_equal(fp16_mod, expected_mod)
+    assert not tvm.ir.structural_equal(amp_mod, mod)
+    assert tvm.ir.structural_equal(amp_mod, expected_mod)
 
 
-def test_convert_conv_bn():
+def test_convert_conv_bn(target_precision):
     """Conv is green and batch norm is gray. As Conv should output fp16 batch_norm should be green."""
     data_shape = (1, 3, 32, 32)
     weight_shape = (5, 3, 3, 3)
@@ -213,49 +227,51 @@ def test_convert_conv_bn():
         "moving_mean": np.random.uniform(-1, 1, size=bn_shape).astype("float32"),
         "moving_var": np.random.uniform(-1, 1, size=bn_shape).astype("float32"),
     }
-    fp16_mod = verify_mixed_precision_output_close(mod, mod_params, atol=0.025, rtol=0.01)
+    amp_mod = verify_mixed_precision_output_close(
+        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.025, rtol=0.01
+    )
 
     # Creating expected module
-    data = relay.cast(relay.var("data", shape=data_shape), "float16")
-    weight = relay.cast(relay.var("weight", shape=weight_shape), "float16")
-    conv = relay.nn.conv2d(data, weight, strides=(1, 1), padding=(1, 1), out_dtype="float16")
+    data = relay.cast(relay.var("data", shape=data_shape), target_precision)
+    weight = relay.cast(relay.var("weight", shape=weight_shape), target_precision)
+    conv = relay.nn.conv2d(data, weight, strides=(1, 1), padding=(1, 1), out_dtype=target_precision)
 
     bn_shape = [5]
-    gamma = relay.cast(relay.var("gamma", shape=bn_shape), "float16")
-    beta = relay.cast(relay.var("beta", shape=bn_shape), "float16")
-    moving_mean = relay.cast(relay.var("moving_mean", shape=bn_shape), "float16")
-    moving_var = relay.cast(relay.var("moving_var", shape=bn_shape), "float16")
+    gamma = relay.cast(relay.var("gamma", shape=bn_shape), target_precision)
+    beta = relay.cast(relay.var("beta", shape=bn_shape), target_precision)
+    moving_mean = relay.cast(relay.var("moving_mean", shape=bn_shape), target_precision)
+    moving_var = relay.cast(relay.var("moving_var", shape=bn_shape), target_precision)
     bn = relay.nn.batch_norm(conv, gamma, beta, moving_mean, moving_var)
 
     expected_mod = tvm.IRModule.from_expr(bn[0])
     expected_mod = tvm.relay.transform.InferType()(expected_mod)
-    assert not tvm.ir.structural_equal(fp16_mod, mod)
-    assert tvm.ir.structural_equal(fp16_mod, expected_mod)
+    assert not tvm.ir.structural_equal(amp_mod, mod)
+    assert tvm.ir.structural_equal(amp_mod, expected_mod)
 
 
-def test_do_not_convert_softmax():
+def test_do_not_convert_softmax(target_precision):
     """Softmax is a red listed operation and therefore should never be fp16."""
     shape = [1, 2, 3]
     a = relay.var("a", shape=shape)
     b = relay.nn.softmax(a)
     mod = tvm.IRModule.from_expr(b)
     mod = tvm.relay.transform.InferType()(mod)
-    out_mod = ToMixedPrecision("float16")(mod)
+    out_mod = ToMixedPrecision(target_precision)(mod)
     orig_mod = tvm.relay.transform.InferType()(mod)
     assert tvm.ir.structural_equal(orig_mod, out_mod)
 
 
-def test_do_not_convert_arange():
+def test_do_not_convert_arange(target_precision):
     """Arange is a red listed operation and therefore should never be fp16."""
     dtype = "float32"
     arange = relay.arange(relay.const(1, dtype), relay.const(128, dtype))
     mod = tvm.IRModule.from_expr(arange)
-    out_mod = ToMixedPrecision("float16")(mod)
+    out_mod = ToMixedPrecision(target_precision)(mod)
     orig_mod = tvm.relay.transform.InferType()(mod)
     assert tvm.ir.structural_equal(orig_mod, out_mod)
 
 
-def test_do_not_convert_summation():
+def test_do_not_convert_summation(target_precision):
     """Ops that could involve a large summation are not allowed in fp16."""
     shape = [1, 3, 16, 16]
     a = relay.var("a", shape=shape)
@@ -267,12 +283,12 @@ def test_do_not_convert_summation():
     ]
     for op in ops:
         mod = tvm.IRModule.from_expr(op(a))
-        out_mod = ToMixedPrecision("float16")(mod)
+        out_mod = ToMixedPrecision(target_precision)(mod)
         orig_mod = tvm.relay.transform.InferType()(mod)
         assert tvm.ir.structural_equal(orig_mod, out_mod)
 
 
-def test_green_gray_propagates_simple():
+def test_green_gray_propagates_simple(target_precision):
     """Conv is a green listed operation, while addition is gray.
 
     As Conv outputs fp16 the add should be done in fp16.
@@ -290,23 +306,25 @@ def test_green_gray_propagates_simple():
         "data": np.random.uniform(-1, 1, size=data_shape).astype("float32"),
         "weight": np.random.uniform(-1, 1, size=weight_shape).astype("float32"),
     }
-    fp16_mod = verify_mixed_precision_output_close(mod, mod_params, atol=0.01, rtol=0.01)
+    amp_mod = verify_mixed_precision_output_close(
+        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
+    )
 
     conv_expr = relay.nn.conv2d(
-        relay.cast(data, "float16"),
-        relay.cast(weight, "float16"),
+        relay.cast(data, target_precision),
+        relay.cast(weight, target_precision),
         strides=(1, 1),
         padding=(1, 1),
-        out_dtype="float16",
+        out_dtype=target_precision,
     )
     expected_mod = tvm.IRModule.from_expr(conv_expr + conv_expr)
     expected_mod = tvm.relay.transform.InferType()(expected_mod)
 
-    assert not tvm.ir.structural_equal(fp16_mod, mod)
-    assert tvm.ir.structural_equal(fp16_mod, expected_mod)
+    assert not tvm.ir.structural_equal(amp_mod, mod)
+    assert tvm.ir.structural_equal(amp_mod, expected_mod)
 
 
-def test_green_red_not_use_extraneous_cast():
+def test_green_red_not_use_extraneous_cast(target_precision):
     """Conv. is a green listed operation, while softmax is red.
 
     Conv. also by default accumulates to fp32 but outputs fp16.
@@ -346,16 +364,18 @@ def test_green_red_not_use_extraneous_cast():
         "data": np.random.uniform(-1, 1, size=data_shape).astype("float32"),
         "weight": np.random.uniform(-1, 1, size=weight_shape).astype("float32"),
     }
-    fp16_mod = verify_mixed_precision_output_close(mod, mod_params, atol=0.01, rtol=1e-3)
+    amp_mod = verify_mixed_precision_output_close(
+        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=1e-3
+    )
 
     # Construct expected structure
     conv = relay.cast(
         relay.nn.conv2d(
-            relay.cast(data, "float16"),
-            relay.cast(weight, "float16"),
+            relay.cast(data, target_precision),
+            relay.cast(weight, target_precision),
             strides=(1, 1),
             padding=(1, 1),
-            out_dtype="float16",
+            out_dtype=target_precision,
         ),
         "float32",
     )
@@ -363,10 +383,10 @@ def test_green_red_not_use_extraneous_cast():
     expected_mod = tvm.IRModule.from_expr(result)
     expected_mod = InferType()(expected_mod)
 
-    assert tvm.ir.structural_equal(expected_mod, fp16_mod)
+    assert tvm.ir.structural_equal(expected_mod, amp_mod)
 
 
-def test_red_gray_propagates_simple():
+def test_red_gray_propagates_simple(target_precision):
     """Everything after a softmax should be in FP32 (exception green colored ops)"""
     shape = [1, 2, 3]
     a = relay.var("a", shape=shape)
@@ -378,12 +398,14 @@ def test_red_gray_propagates_simple():
     mod_params = {
         "a": np.random.uniform(-1, 1, size=shape).astype("float32"),
     }
-    output_mod = verify_mixed_precision_output_close(mod, mod_params, atol=0.0, rtol=0.0)
+    output_mod = verify_mixed_precision_output_close(
+        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.0, rtol=0.0
+    )
 
     assert tvm.ir.structural_equal(mod, output_mod)
 
 
-def test_let_statement_simple():
+def test_let_statement_simple(target_precision):
     """A 'simple' let statement example.
 
     Noticeable is the mutation of the bound variable types.
@@ -405,23 +427,25 @@ def test_let_statement_simple():
         "data": np.random.uniform(-1, 1, size=[1, 20]).astype("float32"),
         "weight": np.random.uniform(-1, 1, size=[20, 20]).astype("float32"),
     }
-    output_mod = verify_mixed_precision_output_close(mod, mod_params, atol=0.05, rtol=0.15)
+    output_mod = verify_mixed_precision_output_close(
+        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.05, rtol=0.15
+    )
 
     # Construct expected structure
-    var1 = relay.var("var1", shape=[1, 20], dtype="float16")
-    var2 = relay.var("var2", shape=[1, 20], dtype="float16")
-    data = relay.cast(relay.var("data", shape=[1, 20]), "float16")
-    weight = relay.cast(relay.var("weight", shape=[20, 20]), "float16")
+    var1 = relay.var("var1", shape=[1, 20], dtype=target_precision)
+    var2 = relay.var("var2", shape=[1, 20], dtype=target_precision)
+    data = relay.cast(relay.var("data", shape=[1, 20]), target_precision)
+    weight = relay.cast(relay.var("weight", shape=[20, 20]), target_precision)
     r1 = var1 + var1
     r2 = var2 + var2
     let2 = relay.Let(
         var2,
-        relay.nn.dense(r1, weight, units=20, out_dtype="float16"),
+        relay.nn.dense(r1, weight, units=20, out_dtype=target_precision),
         r2,
     )
     let1 = relay.Let(
         var1,
-        relay.nn.dense(data, weight, units=20, out_dtype="float16"),
+        relay.nn.dense(data, weight, units=20, out_dtype=target_precision),
         let2,
     )
     expected_mod = tvm.IRModule.from_expr(let1)
@@ -430,7 +454,7 @@ def test_let_statement_simple():
     assert tvm.ir.structural_equal(expected_mod, output_mod)
 
 
-def test_where_simple():
+def test_where_simple(target_precision):
     data = relay.var("data", shape=[1, 20])
     weight = relay.var("weight", shape=[20, 20])
     a = relay.nn.dense(data, weight, units=20)
@@ -441,12 +465,14 @@ def test_where_simple():
         "weight": np.random.uniform(-1, 1, size=[20, 20]).astype("float32"),
     }
 
-    output_mod = verify_mixed_precision_output_close(mod, mod_params, atol=0.01, rtol=0.01)
+    output_mod = verify_mixed_precision_output_close(
+        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
+    )
 
     # Create expected module
-    data = relay.cast(relay.var("data", shape=[1, 20]), "float16")
-    weight = relay.cast(relay.var("weight", shape=[20, 20]), "float16")
-    a = relay.nn.dense(data, weight, units=20, out_dtype="float16")
+    data = relay.cast(relay.var("data", shape=[1, 20]), target_precision)
+    weight = relay.cast(relay.var("weight", shape=[20, 20]), target_precision)
+    a = relay.nn.dense(data, weight, units=20, out_dtype=target_precision)
     b = relay.where(data, a, a)
     expected_mod = tvm.IRModule.from_expr(b)
     expected_mod = InferType()(expected_mod)
@@ -454,7 +480,7 @@ def test_where_simple():
     assert tvm.ir.structural_equal(expected_mod, output_mod)
 
 
-def test_batch_matmul_simple():
+def test_batch_matmul_simple(target_precision):
     """Batch matmul is a special case where we try to accumulate to fp16.
 
     This is due to the fact heterogenous accumulation dtypes does not work
@@ -468,17 +494,19 @@ def test_batch_matmul_simple():
         "data": np.random.uniform(-1, 1, size=[1, 1, 20]).astype("float32"),
         "weight": np.random.uniform(-1, 1, size=[1, 20, 20]).astype("float32"),
     }
-    output_mod = verify_mixed_precision_output_close(mod, mod_params, atol=0.01, rtol=0.01)
+    output_mod = verify_mixed_precision_output_close(
+        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
+    )
     # Create expected module
-    data = relay.cast(relay.var("data", shape=[1, 1, 20]), "float16")
-    weight = relay.cast(relay.var("weight", shape=[1, 20, 20]), "float16")
-    a = relay.nn.batch_matmul(data, weight, out_dtype="float16")
+    data = relay.cast(relay.var("data", shape=[1, 1, 20]), target_precision)
+    weight = relay.cast(relay.var("weight", shape=[1, 20, 20]), target_precision)
+    a = relay.nn.batch_matmul(data, weight, out_dtype=target_precision)
     expected_mod = tvm.IRModule.from_expr(a)
     expected_mod = InferType()(expected_mod)
     assert tvm.ir.structural_equal(expected_mod, output_mod)
 
 
-def test_convert_follow_node_with_integer_arguments():
+def test_convert_follow_node_with_integer_arguments(target_precision):
     """Tests the conversion of a follow op with integer arguments + constant float args.
 
     The follow op should convert the floating point argument into fp16 as constants/vars
@@ -497,10 +525,12 @@ def test_convert_follow_node_with_integer_arguments():
         "data": np.random.uniform(-1, 1, size=[1, 10]).astype("float32"),
         "indices": np.array([[0]]).astype("int32"),
     }
-    output_mod = verify_mixed_precision_output_close(mod, mod_params, atol=0.01, rtol=0.01)
+    output_mod = verify_mixed_precision_output_close(
+        mod, mod_params, mixed_precision_dtype=target_precision, atol=0.01, rtol=0.01
+    )
 
     # Create expected module
-    data = relay.cast(relay.var("data", shape=[1, 10]), "float16")
+    data = relay.cast(relay.var("data", shape=[1, 10]), target_precision)
     take = relay.take(data, indices, axis=0)
     expected_mod = tvm.IRModule.from_expr(take)
     expected_mod = InferType()(expected_mod)

From 5acf3f90c63b6760cd23796b442f8ac20e645af0 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Thu, 27 Oct 2022 11:51:51 -0700
Subject: [PATCH 448/704] [ci] Protect release branches (#13208)

This adds a fnmatch entry for the release branches to ensure that
changes come from PRs and that force pushes / rebased aren't allowed so
it's always possible to trace the release branch back to a mainline TVM
commit.
---
 .asf.yaml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.asf.yaml b/.asf.yaml
index f4aba210d2cc..1e4371d594d2 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -61,3 +61,10 @@ github:
 
       required_pull_request_reviews:
         required_approving_review_count: 1
+
+    # protect release branches from unsigned updates and force pushes
+    'v[0-9]*':
+      required_pull_request_reviews:
+        required_approving_review_count: 1
+      required_linear_history: true
+      required_signatures: true

From 3157cf188c8da93a48a37ad5c466f41b39169bc2 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 27 Oct 2022 17:30:28 -0500
Subject: [PATCH 449/704] [IR] Remove shadowing in
 IRSubstituteWithDataTypeLegalization (#13219)

Previously, the `IRSubstituteWithDataTypeLegalization` class
implemented some virtual functions of `DataTypeLegalizer`, but not
all.  As a result, some compilers gave warnings that the base class
methods were being shadowed.  This commit adds the `using` declaration
to avoid shadowing.
---
 src/tir/ir/stmt_functor.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index 6d0ee134c805..59630d34c38e 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -814,6 +814,9 @@ class IRSubstituteWithDataTypeLegalization : public DataTypeLegalizer {
   explicit IRSubstituteWithDataTypeLegalization(std::function<Optional<PrimExpr>(const Var&)> vmap)
       : vmap_(vmap) {}
 
+  using DataTypeLegalizer::VisitExpr_;
+  using DataTypeLegalizer::VisitStmt_;
+
   PrimExpr VisitExpr_(const VarNode* op) final {
     Var var = GetRef<Var>(op);
     auto ret = vmap_(var);

From 0554a46789a6226b14ab309bb10401b6c23b8413 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 27 Oct 2022 17:51:10 -0500
Subject: [PATCH 450/704] [TIR][Arith] Support negative coeff in ModularSet
 (#13081)

Prior to this commit, any use of negative coefficients in `ModularSet`
would result in an error.  This included cases where a constraint is
being entered, such as `floormod(i, -2)==0` appearing as the condition
of an if/else block.  These negative indices can also arise as
intermediate simplification steps produced by `CanonicalSimplifier`,
such as `floormod(-i,2)` being canonicalized to `floormod(i,-2)`.

This commit adds support for negative coefficients in `ModularSet`,
using the same sign convention as is used by `CanonicalSimplifier` for
negative denominators, and adds unit tests to verify that sign
convention.
---
 src/arith/modular_set.cc                      | 13 ++++++++-
 .../unittest/test_arith_canonical_simplify.py |  4 +++
 .../unittest/test_tir_transform_simplify.py   | 29 +++++++++++++++++++
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/src/arith/modular_set.cc b/src/arith/modular_set.cc
index f455695438ec..ac6bf94b1198 100644
--- a/src/arith/modular_set.cc
+++ b/src/arith/modular_set.cc
@@ -67,7 +67,18 @@ struct ModularSetAnalyzer::Entry {
   Entry() = default;
 
   Entry(int64_t coeff, int64_t base) {
-    ICHECK_GE(coeff, 0);
+    if (coeff < 0) {
+      // `analyzer->canonical_simplify()` can generate expressions with
+      // negative coefficients (e.g. simplifying `floormod(-i, 2)`
+      // into `floormod(i, -2) * -1`).  When this happens, the
+      // ModularSet may enter a constraint based on this expression.
+      //
+      // Handling a negative coeff uses the same sign convention as
+      // canonical_simplify, requiring that
+      // `floormod(var, coeff) == -floormod(var, -coeff)`.
+      coeff *= -1;
+      base *= -1;
+    }
     this->coeff = coeff;
     if (coeff != 0) {
       base = base % coeff;
diff --git a/tests/python/unittest/test_arith_canonical_simplify.py b/tests/python/unittest/test_arith_canonical_simplify.py
index 81a163d0d431..9f187685991e 100644
--- a/tests/python/unittest/test_arith_canonical_simplify.py
+++ b/tests/python/unittest/test_arith_canonical_simplify.py
@@ -97,6 +97,8 @@ def test_split_index_simplify():
     # cannot simplify mixed case, unless we canonicalize into one mode.
     ck.verify(tdiv(x, 6) * 2 + tmod(fld(x, 3), 2), tdiv(x, 6) * 2 + tmod(fld(x, 3), 2))
 
+    ck.verify(tmod(-x, 2), tmod(x, -2) * -1)
+
 
 def test_div_simplify():
     ck = CanonicalChecker()
@@ -129,6 +131,8 @@ def test_floormod_simplify():
     ck.verify(flm(flm((x * 4) + y - 466036, 24528) - 24512, 16), flm((x * 4) + y + 12, 16))
     ck.verify(flm(flm((x * 4), 16), 8), flm(x, 2) * 4)
 
+    ck.verify(flm(-x, 2), flm(x, -2) * -1)
+
 
 def test_canonical_mixed():
     ck = CanonicalChecker()
diff --git a/tests/python/unittest/test_tir_transform_simplify.py b/tests/python/unittest/test_tir_transform_simplify.py
index 2eb9c3546ee5..46b6858ec773 100644
--- a/tests/python/unittest/test_tir_transform_simplify.py
+++ b/tests/python/unittest/test_tir_transform_simplify.py
@@ -816,5 +816,34 @@ def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
         A[0] = (i != 30) or (j == 0)
 
 
+class TestConditionalFloorMod(BaseBeforeAfter):
+    """A regression test for negative floormod denominator
+
+    Previously, simplifying this function could throw an error.  First, the
+    `canonical_simplify` would rewrite `floormod(0-i,2)` to the equivalent
+    `floormod(i,-2)`.  Then, the rewrite_simplifier would enter a
+    constrained context in which `floormod(i,-2)==1`.  Passing this
+    expression to `ModularSet::EnterConstraint`, which previously did not
+    support a negative value for the second argument, threw an error.
+
+    The analogous failure mode never occurred for `truncmod`, because
+    `truncmod(0-i,2)` would be canonicalized to `truncmod(i, -2) * -1`, and
+    the pattern matching in `ModularSet` didn't recognize the constant
+    factor.
+
+    This failure mode was resolved by supporting negative arguments in
+    `ModularSet`, using the same sign convention as is used by
+    `canonical_simplify`.
+    """
+
+    def before(A: T.Buffer[1, "bool"], i: T.int32):
+        if T.floormod(0 - i, 2) == 0:
+            A[0] = T.floormod(i, 2) == 0
+
+    def expected(A: T.Buffer[1, "bool"], i: T.int32):
+        if T.floormod(i, -2) == 0:
+            A[0] = True
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 0c10302fd02a0c88964f885591bc84011134aa1a Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Thu, 27 Oct 2022 18:16:50 -0500
Subject: [PATCH 451/704] [Arith] Add internal NarrowPredicateExpression
 utility (#13041)

Implements `tvm::arith::NarrowPredicateExpression`, a utility that
removes free parameters from a boolean expression, such that the
resulting expression being true implies that the original expression
is true.  For example, the predicate `(0 <= i+f) && (i+f < 16)`, where
`f` is a free parameter on the range `0 <= f < 2)`, can be narrowed to
the expression `(0 <= i+0) && (i+2 < 16)`.

In effect, `NarrowPredicateExpression` functions as a context-sentive
`tvm::tir::Substitute`, where the value substituted is selected such
that the resulting expression errs on the side of being false.  This
is an internal utility used as part of the simplifications for layout
transformations ([tracking issue
link](https://github.com/apache/tvm/issues/12261)).
---
 src/arith/narrow_predicate_expression.cc      | 219 ++++++++++++++++++
 src/arith/narrow_predicate_expression.h       |  57 +++++
 .../test_arith_narrow_predicate_expression.py |  87 +++++++
 3 files changed, 363 insertions(+)
 create mode 100644 src/arith/narrow_predicate_expression.cc
 create mode 100644 src/arith/narrow_predicate_expression.h
 create mode 100644 tests/python/unittest/test_arith_narrow_predicate_expression.py

diff --git a/src/arith/narrow_predicate_expression.cc b/src/arith/narrow_predicate_expression.cc
new file mode 100644
index 000000000000..1c8931d2dec4
--- /dev/null
+++ b/src/arith/narrow_predicate_expression.cc
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file narrow_predicate_expression.cc
+ * \brief Utility to deduce bound of expression
+ */
+#include <tvm/arith/int_solver.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/op.h>
+#include <tvm/tir/stmt_functor.h>
+
+namespace tvm {
+namespace arith {
+
+using namespace tir;
+
+/* \brief Given a true expression that includes free parameter,
+ * generate a true expression without the free parameters.
+ *
+ * This function provides two guarantees:
+ *
+ * 1. If the resulting expression evaluates to True, then the original
+ * expression also evaluates to True.
+ *
+ * 2. The resulting expression does not contain any of the free
+ * parameters.
+ *
+ */
+// Utility for generating a known true expression from an expression
+// with free parameters, and the range of those parameters.
+class ExpressionNarrower : public tir::ExprMutator {
+ public:
+  static PrimExpr Apply(PrimExpr expr, Map<Var, Range> free_parameters) {
+    ICHECK(expr.dtype().is_bool()) << "Expected boolean expression, but received " << expr;
+    ExpressionNarrower mutator(free_parameters);
+    return mutator(expr);
+  }
+
+ private:
+  explicit ExpressionNarrower(Map<Var, Range> free_parameters)
+      : free_parameters_(free_parameters) {}
+
+  using Parent = tir::ExprMutator;
+  using Parent::VisitExpr_;
+
+  enum class Context {
+    Maximize,
+    Minimize,
+  };
+
+  template <typename T>
+  PrimExpr VisitInequality(T t, Context a_ctx, Context b_ctx) {
+    PrimExpr a = [&]() {
+      WithContext context(this, a_ctx);
+      return this->VisitExpr(t->a);
+    }();
+
+    PrimExpr b = [&]() {
+      WithContext context(this, b_ctx);
+      return this->VisitExpr(t->b);
+    }();
+
+    if (contains_unknown_expr_ && t.dtype().is_bool()) {
+      contains_unknown_expr_ = false;
+      return Bool(CurrentContext() == Context::Minimize);
+    } else if (a.same_as(t->a) && b.same_as(t->b)) {
+      return std::move(t);
+    } else {
+      return T(a, b);
+    }
+  }
+
+  PrimExpr VisitExpr_(const FloorModNode* op) override {
+    // FloorMod is non-monotonic, so inserting min/max won't remove
+    // the free parameters.
+    contains_unknown_expr_ = true;
+    return Parent::VisitExpr_(op);
+  }
+
+  PrimExpr VisitExpr_(const FloorDivNode* op) override {
+    auto res_a = this->VisitExpr(op->a);
+    auto res_b = this->VisitExpr(op->b);
+    if (is_zero(res_b)) {
+      contains_unknown_expr_ = true;
+      return IntImm(op->dtype, 0);
+    } else {
+      return floordiv(res_a, res_b);
+    }
+  }
+
+  PrimExpr VisitExpr_(const GTNode* op) override {
+    auto current = CurrentContext();
+    return VisitInequality(GetRef<GT>(op), OppositeContext(current), current);
+  }
+
+  PrimExpr VisitExpr_(const GENode* op) override {
+    auto current = CurrentContext();
+    return VisitInequality(GetRef<GE>(op), OppositeContext(current), current);
+  }
+
+  PrimExpr VisitExpr_(const LTNode* op) override {
+    auto current = CurrentContext();
+    return VisitInequality(GetRef<LT>(op), current, OppositeContext(current));
+  }
+
+  PrimExpr VisitExpr_(const LENode* op) override {
+    auto current = CurrentContext();
+    return VisitInequality(GetRef<LE>(op), current, OppositeContext(current));
+  }
+
+  PrimExpr VisitExpr_(const EQNode* op) override {
+    auto res_a = this->VisitExpr(op->a <= op->b);
+    auto res_b = this->VisitExpr(op->b <= op->a);
+    return res_a && res_b;
+  }
+
+  PrimExpr VisitExpr_(const NENode* op) override {
+    auto res_a = this->VisitExpr(op->a < op->b);
+    auto res_b = this->VisitExpr(op->b < op->a);
+    return res_a || res_b;
+  }
+
+  PrimExpr VisitExpr_(const SubNode* op) override {
+    auto current = CurrentContext();
+    return VisitInequality(GetRef<Sub>(op), current, OppositeContext(current));
+  }
+
+  PrimExpr VisitExpr_(const NotNode* op) override {
+    auto current = CurrentContext();
+    WithContext context(this, OppositeContext(current));
+    return !VisitExpr(op->a);
+  }
+
+  PrimExpr VisitExpr_(const BufferLoadNode* op) override {
+    contains_unknown_expr_ = true;
+    return GetRef<PrimExpr>(op);
+  }
+
+  PrimExpr VisitExpr_(const VarNode* op) override {
+    auto it = free_parameters_.find(GetRef<Var>(op));
+    if (it == free_parameters_.end()) {
+      return Parent::VisitExpr_(op);
+    }
+
+    Range range = (*it).second;
+
+    switch (CurrentContext()) {
+      case Context::Minimize:
+        return range->min;
+
+      case Context::Maximize:
+        return range->min + range->extent - 1;
+    }
+
+    return Parent::VisitExpr_(op);
+  }
+
+  Context CurrentContext() const {
+    if (context_stack_.size()) {
+      return context_stack_.back();
+    } else {
+      return Context::Maximize;
+    }
+  }
+
+  Context OppositeContext(Context context) const {
+    switch (context) {
+      case Context::Minimize:
+        return Context::Maximize;
+
+      case Context::Maximize:
+        return Context::Minimize;
+
+      default:
+        LOG(FATAL) << "Unhandled Context, all legal values should be handled";
+        return Context::Maximize;
+    }
+  }
+
+  struct WithContext {
+    WithContext(ExpressionNarrower* self, Context context) : self(self) {
+      self->context_stack_.push_back(context);
+    }
+    ~WithContext() { self->context_stack_.pop_back(); }
+    ExpressionNarrower* self;
+  };
+
+  std::vector<Context> context_stack_;
+  Map<Var, Range> free_parameters_;
+  bool contains_unknown_expr_{false};
+};
+
+PrimExpr NarrowPredicateExpression(PrimExpr expr, Map<Var, Range> free_parameters) {
+  return ExpressionNarrower::Apply(std::move(expr), std::move(free_parameters));
+}
+
+TVM_REGISTER_GLOBAL("arith.NarrowPredicateExpression").set_body_typed(NarrowPredicateExpression);
+
+}  // namespace arith
+}  // namespace tvm
diff --git a/src/arith/narrow_predicate_expression.h b/src/arith/narrow_predicate_expression.h
new file mode 100644
index 000000000000..1e452e3ad493
--- /dev/null
+++ b/src/arith/narrow_predicate_expression.h
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file narrow_predicate_expression.h
+ * \brief Utility for extracting and interacting with buffer touch points
+ */
+
+#include <tvm/ir/expr.h>
+#include <tvm/tir/var.h>
+
+#ifndef TVM_ARITH_NARROW_PREDICATE_EXPRESSION_H_
+#define TVM_ARITH_NARROW_PREDICATE_EXPRESSION_H_
+
+namespace tvm {
+namespace arith {
+
+/* \brief Narrow a true expression to remove free parameters
+ *
+ * This function provides two guarantees:
+ *
+ * 1. If the resulting expression evaluates to True, then the original
+ * expression also evaluates to True.
+ *
+ * 2. The resulting expression does not contain any of the free
+ * parameters.
+ *
+ * 3. The resulting expression does not contain any BufferLoad
+ *
+ * \param expr The expression to be examined.
+ *
+ * \param ranges The variables to be removed from the expression
+ *
+ * \returns An expression that, if true, implies that the original
+ * expression is also true.
+ */
+PrimExpr NarrowPredicateExpression(PrimExpr expr, Map<tir::Var, Range> free_parameters);
+
+}  // namespace arith
+}  // namespace tvm
+#endif  // TVM_ARITH_NARROW_PREDICATE_EXPRESSION_H_
diff --git a/tests/python/unittest/test_arith_narrow_predicate_expression.py b/tests/python/unittest/test_arith_narrow_predicate_expression.py
new file mode 100644
index 000000000000..d38fe70f6b5c
--- /dev/null
+++ b/tests/python/unittest/test_arith_narrow_predicate_expression.py
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+import tvm.testing
+
+from tvm import tir
+from tvm.runtime import convert
+
+
+i = tir.Var("i", "int32")
+j = tir.Var("j", "int32")
+n = tir.Var("n", "int32")
+m = tir.Var("m", "int32")
+b = tir.Var("b", "bool")
+buf = tir.decl_buffer(16, "int32", "buf")
+
+tir_false = tir.IntImm("bool", False)
+tir_true = tir.IntImm("bool", True)
+
+before, expected = tvm.testing.parameters(
+    # General arithmatic
+    [tir_true, tir_true],
+    [tir_false, tir_false],
+    [b, b],
+    [i > 5, i > 5],
+    [i > n, i > 7],
+    [i < n, i < 0],
+    [i <= n, i <= 0],
+    [i >= n, i >= 7],
+    [n > i, convert(0) > i],
+    [n < i, convert(7) < i],
+    [n <= i, convert(7) <= i],
+    [n >= i, convert(0) >= i],
+    [i == n, tir.all(i <= 0, convert(7) <= i)],
+    [n == i, tir.all(convert(7) <= i, i <= 0)],
+    [i != n, tir.any(i < 0, convert(7) < i)],
+    [n != i, tir.any(convert(7) < i, i < 0)],
+    [i // 4 > n, i // 4 > 7],
+    [n < i // 4, convert(7) < i // 4],
+    [(i + n) // 4 > 0, tir.Add(i, 0) // 4 > 0],
+    [(i + n) // 4 == 0, tir.all(tir.Add(i, 7) // 4 <= 0, convert(0) <= tir.Add(i, 0) // 4)],
+    [i + n < 10, i + 7 < 10],
+    [i - n < 10, tir.Sub(i, 0) < 10],
+    [tir.Not(i < n), tir.Not(i < 7)],
+    # Use of FloorMod should make the narrowing strategy bail out, as
+    # it is non-monotonic.
+    [i % 8 == n, tir_false],
+    # Ensure that dividing by a free parameter doesn't generate a
+    # divide-by-zero to be triggered later.
+    [i // n == 0, tir_false],
+    ### Buffer handling
+    [buf.vload(0) > 0, tir_false],
+    [buf.vload(0) > i, tir_false],
+    [buf.vload(i) > 0, tir_false],
+    [tir.And(buf.vload(i) > 0, i <= 0), tir.And(tir_false, i <= 0)],
+    [tir.Or(buf.vload(i) > 0, i <= n), tir.Or(tir_false, i <= 0)],
+    [tir.Or(tir.Not(buf.vload(i) > 0), i <= n), tir.Or(tir_false, i <= 0)],
+)
+
+
+def test_narrow_expression(before, expected):
+    ranges = {n: tvm.ir.Range(0, 8)}
+    after = tvm.arith._ffi_api.NarrowPredicateExpression(before, ranges)
+
+    if expected is None:
+        assert after is None
+    else:
+        tvm.ir.assert_structural_equal(after, expected)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From de7f76243601e127782e530f84ab7d51115a4c14 Mon Sep 17 00:00:00 2001
From: jrr <55054584+janakiramreddy3@users.noreply.github.com>
Date: Fri, 28 Oct 2022 05:23:13 +0530
Subject: [PATCH 452/704] typo in 'build' NNPACK (#13215)

---
 docs/install/nnpack.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/install/nnpack.rst b/docs/install/nnpack.rst
index 2afd95a5ef3f..c5516235a303 100644
--- a/docs/install/nnpack.rst
+++ b/docs/install/nnpack.rst
@@ -50,7 +50,7 @@ If the trained model meets some conditions of using NNPACK,
 you can build TVM with NNPACK support.
 Follow these simple steps:
 
-uild NNPACK shared library with the following commands. TVM will link NNPACK dynamically.
+build NNPACK shared library with the following commands. TVM will link NNPACK dynamically.
 
 Note: The following NNPACK installation instructions have been tested on Ubuntu 16.04.
 

From 1c697d7460565f7f744eb244b43de425957b4f89 Mon Sep 17 00:00:00 2001
From: Wubin <wubin.wu@imgtec.com>
Date: Fri, 28 Oct 2022 12:06:36 +0800
Subject: [PATCH 453/704] [Frontend][Paddle] Add depthwise_conv2d_transpose op
 mapping (#13214)

Hi, I noticed missing a depthwise_conv2d_transpose, but there is a convert function, convert_conv2d_transpose, seems to support the convertion for this op.
---
 python/tvm/relay/frontend/paddlepaddle.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/relay/frontend/paddlepaddle.py b/python/tvm/relay/frontend/paddlepaddle.py
index 9b909895e084..068f7d2eac95 100644
--- a/python/tvm/relay/frontend/paddlepaddle.py
+++ b/python/tvm/relay/frontend/paddlepaddle.py
@@ -2042,6 +2042,7 @@ def convert_unsqueeze(g, op, block):
     "cosh": convert_unary_op,
     "cumsum": convert_cumsum,
     "depthwise_conv2d": convert_conv2d,
+    "depthwise_conv2d_transpose": convert_conv2d_transpose,
     "dot": convert_dot,
     "dropout": convert_dropout,
     "elementwise_add": convert_elementwise_op,

From cf2a8ea42658e1cc5d9d36970ba5c5fc3e57edf0 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Thu, 27 Oct 2022 23:25:22 -0700
Subject: [PATCH 454/704] [Hexagon] Update tests to use AOT Executor  (#13221)

* Update Tests

* lint

* lint
---
 python/tvm/contrib/hexagon/pytest_plugin.py   | 13 ++--
 .../test_hexagon/test_fixed_point_multiply.py | 34 +++++----
 .../contrib/test_hexagon/test_models.py       | 73 +++----------------
 .../python/contrib/test_hexagon/test_usmp.py  | 10 +--
 .../test_wo_qnn_canonicalization.py           | 16 ++--
 5 files changed, 47 insertions(+), 99 deletions(-)

diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index 0771468051d7..8e209611133a 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -39,6 +39,12 @@
 ADB_SERVER_SOCKET = "ADB_SERVER_SOCKET"
 RNG_SEEDED = False
 
+HEXAGON_AOT_LLVM_TARGET = (
+    "llvm -keys=hexagon "
+    "-mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp "
+    "-mcpu=hexagonv68 -mtriple=hexagon"
+)
+
 
 @tvm.testing.fixture
 def shape_nhwc(batch, in_channel, in_size):
@@ -303,12 +309,7 @@ def terminate_rpc_servers():
         os.system("ps ax | grep tvm_rpc_x86 | awk '{print $1}' | xargs kill")
 
 
-aot_host_target = tvm.testing.parameter(
-    "c",
-    "llvm -keys=hexagon "
-    "-mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp "
-    "-mcpu=hexagonv68 -mtriple=hexagon",
-)
+aot_host_target = tvm.testing.parameter("c", HEXAGON_AOT_LLVM_TARGET)
 
 
 @tvm.testing.fixture
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
index e7e4aa212e35..43feb827af42 100644
--- a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
@@ -21,6 +21,7 @@
 from tvm import relay
 from tvm.relay.backend import Executor
 from tvm.contrib.hexagon.session import Session
+from tvm.contrib.hexagon.pytest_plugin import HEXAGON_AOT_LLVM_TARGET
 
 from .infrastructure import get_hexagon_target
 
@@ -28,7 +29,8 @@
 @tvm.testing.requires_hexagon
 def test_vmpy_intrinsic_presence():
     """
-    check intrinsic lowering for fixed_point_multiply operation
+    check intrinsic lowering for fixed_point_multiply operation.
+    GraphExecutor is used here since get_source("asm") is not supported with aot.
     """
     ishape = (1, 128)
     a = relay.var("a", relay.TensorType(ishape, "int32"))
@@ -61,7 +63,7 @@ def test_vmpy_intrinsic_presence():
 
 def build_module(relay_mod, target):
     params = {}
-    executor = Executor("graph", {"link-params": True})
+    executor = Executor("aot", {"link-params": True})
     lowered = tvm.relay.build(
         relay_mod,
         tvm.target.Target(target, host=target),
@@ -71,10 +73,10 @@ def build_module(relay_mod, target):
     return lowered
 
 
-def run_module(graph_mod, inputs):
-    graph_mod.set_input(**inputs)
-    graph_mod.run()
-    output = graph_mod.get_output(0).numpy()
+def run_module(mod, inputs):
+    mod.set_input(**inputs)
+    mod.run()
+    output = mod.get_output(0).numpy()
     return output
 
 
@@ -102,7 +104,7 @@ def test_fixed_point_multiply(hexagon_session: Session, multiplier: int, shift:
 
     with tvm.transform.PassContext(opt_level=3):
         # Compile for Hexagon...
-        hexagon_lowered = build_module(relay_mod, tvm.target.hexagon("v68"))
+        hexagon_lowered = build_module(relay_mod, HEXAGON_AOT_LLVM_TARGET)
 
         # Compile for LLVM...
         llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm"))
@@ -111,12 +113,12 @@ def test_fixed_point_multiply(hexagon_session: Session, multiplier: int, shift:
     inputs = {"a": data_in}
 
     # Run hexagon...
-    graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    hexagon_output = run_module(graph_mod, inputs)
+    hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
+    hexagon_output = run_module(hexagon_mod, inputs)
 
     # Run llvm...
-    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-    expected_output = run_module(llvm_graph_mod, inputs)
+    llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
+    expected_output = run_module(llvm_mod, inputs)
 
     tvm.testing.assert_allclose(hexagon_output, expected_output)
 
@@ -146,7 +148,7 @@ def test_per_channel_fixed_point_multiply(
 
     with tvm.transform.PassContext(opt_level=3):
         # Compile for Hexagon...
-        hexagon_lowered = build_module(mod, tvm.target.hexagon("v68"))
+        hexagon_lowered = build_module(mod, HEXAGON_AOT_LLVM_TARGET)
 
         # Compile for LLVM...
         llvm_lowered = build_module(mod, tvm.target.Target("llvm"))
@@ -155,12 +157,12 @@ def test_per_channel_fixed_point_multiply(
     inputs = {"a": a_np}
 
     # Run hexagon...
-    graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    hexagon_output = run_module(graph_mod, inputs)
+    hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
+    hexagon_output = run_module(hexagon_mod, inputs)
 
     # Run llvm...
-    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-    expected_output = run_module(llvm_graph_mod, inputs)
+    llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
+    expected_output = run_module(llvm_mod, inputs)
 
     tvm.testing.assert_allclose(hexagon_output, expected_output)
 
diff --git a/tests/python/contrib/test_hexagon/test_models.py b/tests/python/contrib/test_hexagon/test_models.py
index db578d1057a6..f91b660766ed 100644
--- a/tests/python/contrib/test_hexagon/test_models.py
+++ b/tests/python/contrib/test_hexagon/test_models.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-"""Test mobilenet model with both graph and aot executor"""
+"""Test mobilenet model with aot executor"""
 
 import numpy as np
 import pytest
@@ -25,8 +25,6 @@
 from tvm.contrib.hexagon.session import Session
 from tvm.relay.backend import Executor, Runtime
 
-from .infrastructure import get_hexagon_target
-
 
 def get_mobilenet():
     """Download and import mobilenet model with ONNX"""
@@ -39,60 +37,10 @@ def get_mobilenet():
     return onnx.load(model_path)
 
 
-@tvm.testing.requires_hexagon
-def test_mobilenet(hexagon_session: Session):
-    """Test mobilenet with graph executor"""
-    dtype = "float32"
-    onnx_model = get_mobilenet()
-
-    target_llvm = tvm.target.Target("llvm")
-    runtime = Runtime("cpp")
-    executor = Executor("graph", {"link-params": True})
-
-    data_in = np.random.rand(1, 3, 224, 224).astype(dtype=dtype)
-
-    input_name = "data"
-    shape_dict = {input_name: data_in.shape}
-    relay_mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, freeze_params=True)
-    inputs = {input_name: data_in}
-
-    with tvm.transform.PassContext(opt_level=3):
-        hexagon_lowered = tvm.relay.build(
-            relay_mod,
-            get_hexagon_target("v68"),
-            runtime=runtime,
-            executor=executor,
-            params=params,
-        )
-
-        llvm_lowered = tvm.relay.build(
-            relay_mod,
-            tvm.target.Target(target_llvm, host=target_llvm),
-            runtime=runtime,
-            executor=executor,
-            params=params,
-        )
-
-    graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    graph_mod.set_input(**inputs)
-    graph_mod.run()
-    hexagon_output = graph_mod.get_output(0).numpy()
-
-    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-    llvm_graph_mod.set_input(**inputs)
-    llvm_graph_mod.run()
-    expected_output = llvm_graph_mod.get_output(0).numpy()
-
-    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
-
-
 @pytest.mark.parametrize("enable_usmp", [False, True])
 @tvm.testing.requires_hexagon
 def test_mobilenet_aot(hexagon_session: Session, aot_host_target, aot_target, enable_usmp):
     """Test mobilenet with aot executor"""
-    if hexagon_session.is_simulator():
-        pytest.skip(msg="Skip on simulator due to long runtime.")
-
     dtype = "float32"
     onnx_model = get_mobilenet()
 
@@ -114,25 +62,24 @@ def test_mobilenet_aot(hexagon_session: Session, aot_host_target, aot_target, en
             params=params,
         )
 
-    aot_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    aot_mod.set_input(**inputs)
-    aot_mod.run()
-    hexagon_output = aot_mod.get_output(0).numpy()
+    hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
+    hexagon_mod.set_input(**inputs)
+    hexagon_mod.run()
+    hexagon_output = hexagon_mod.get_output(0).numpy()
 
     with tvm.transform.PassContext(opt_level=3):
         llvm_lowered = tvm.relay.build(
             relay_mod,
             tvm.target.Target(target_llvm, host=target_llvm),
             runtime=Runtime("cpp"),
-            executor=Executor("graph", {"link-params": True}),
+            executor=Executor("aot", {"interface-api": "packed"}),
             params=params,
         )
 
-    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-    llvm_graph_mod.set_input(**inputs)
-    llvm_graph_mod.run()
-    expected_output = llvm_graph_mod.get_output(0).numpy()
-
+    llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
+    llvm_mod.set_input(**inputs)
+    llvm_mod.run()
+    expected_output = llvm_mod.get_output(0).numpy()
     tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
 
 
diff --git a/tests/python/contrib/test_hexagon/test_usmp.py b/tests/python/contrib/test_hexagon/test_usmp.py
index d56531b2541d..adfebcd122b3 100644
--- a/tests/python/contrib/test_hexagon/test_usmp.py
+++ b/tests/python/contrib/test_hexagon/test_usmp.py
@@ -94,13 +94,13 @@ def test_conv2d(hexagon_session: Session, aot_host_target, aot_target, usmp_enab
             relay_mod,
             tvm.target.Target(target_llvm, host=target_llvm),
             runtime=Runtime("cpp"),
-            executor=Executor("graph"),
+            executor=Executor("aot"),
         )
 
-    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-    llvm_graph_mod.set_input(**params)
-    llvm_graph_mod.run(**inputs)
-    expected_output = llvm_graph_mod.get_output(0).numpy()
+    llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
+    llvm_mod.set_input(**params)
+    llvm_mod.run(**inputs)
+    expected_output = llvm_mod.get_output(0).numpy()
 
     tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
 
diff --git a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
index 24da1faac697..2fc607c0c521 100644
--- a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
+++ b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
@@ -21,7 +21,7 @@
 import tvm.testing
 from tvm import relay
 from tvm.contrib.hexagon.session import Session
-from tvm.contrib import graph_executor
+from tvm.contrib.hexagon.pytest_plugin import HEXAGON_AOT_LLVM_TARGET
 from tvm.relay.backend import Executor
 
 
@@ -88,13 +88,12 @@ def test_qnn_conv2d_rq(hexagon_session: Session):
     )
     relay_mod = tvm.IRModule.from_expr(op5)
 
-    target_hexagon = tvm.target.hexagon("v68")
     target_llvm = tvm.target.Target("llvm")
-    executor = Executor("graph", {"link-params": True})
+    executor = Executor("aot")
     with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
         hexagon_lowered = tvm.relay.build(
             relay_mod,
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET),
             executor=executor,
         )
 
@@ -112,7 +111,7 @@ def test_qnn_conv2d_rq(hexagon_session: Session):
     hexagon_output = execute(hx_m, data_np, weight_np)
 
     dev = tvm.cpu(0)
-    llvm_m = graph_executor.GraphModule(llvm_lowered["default"](dev))
+    llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev))
     llvm_out = execute(llvm_m, data_np, weight_np)
 
     np.testing.assert_equal(hexagon_output.numpy(), llvm_out.numpy())
@@ -150,13 +149,12 @@ def test_qnn_dense_bias_rq(hexagon_session: Session):
     )
     relay_mod = tvm.IRModule.from_expr(op5)
 
-    target_hexagon = tvm.target.hexagon("v68")
     target_llvm = tvm.target.Target("llvm")
-    executor = Executor("graph", {"link-params": True})
+    executor = Executor("aot")
     with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
         hexagon_lowered = tvm.relay.build(
             relay_mod,
-            tvm.target.Target(target_hexagon, host=target_hexagon),
+            tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET),
             executor=executor,
         )
 
@@ -175,7 +173,7 @@ def test_qnn_dense_bias_rq(hexagon_session: Session):
     hexagon_output = execute(hx_m, data_np, weight_np, bias_np)
 
     dev = tvm.cpu(0)
-    llvm_m = graph_executor.GraphModule(llvm_lowered["default"](dev))
+    llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev))
     llvm_out = execute(llvm_m, data_np, weight_np, bias_np)
 
     np.testing.assert_equal(hexagon_output.numpy(), llvm_out.numpy())

From 187a2570edcbbfb2669af7f257638943604fe2ba Mon Sep 17 00:00:00 2001
From: multiverstack <39256082+multiverstack-intellif@users.noreply.github.com>
Date: Fri, 28 Oct 2022 16:18:24 +0800
Subject: [PATCH 455/704] [TIR][Schedule] Add cache_index to precompute index
 of buffer load (#13192)

Co-authored-by: Min Chen <chen.min@intellif.com>
---
 include/tvm/arith/int_set.h                   |   9 +
 include/tvm/tir/schedule/schedule.h           |   8 +
 python/tvm/tir/schedule/schedule.py           |  86 ++++
 src/tir/schedule/concrete_schedule.cc         |  13 +
 src/tir/schedule/concrete_schedule.h          |   1 +
 src/tir/schedule/primitive.h                  |   9 +
 src/tir/schedule/primitive/cache_index.cc     | 484 ++++++++++++++++++
 src/tir/schedule/schedule.cc                  |   2 +
 src/tir/schedule/traced_schedule.cc           |  14 +
 src/tir/schedule/traced_schedule.h            |   1 +
 .../unittest/test_tir_schedule_cache_index.py |  78 +++
 11 files changed, 705 insertions(+)
 create mode 100644 src/tir/schedule/primitive/cache_index.cc
 create mode 100644 tests/python/unittest/test_tir_schedule_cache_index.py

diff --git a/include/tvm/arith/int_set.h b/include/tvm/arith/int_set.h
index 5ef7108d9797..60d7c53d28e8 100644
--- a/include/tvm/arith/int_set.h
+++ b/include/tvm/arith/int_set.h
@@ -169,6 +169,15 @@ Map<Var, IntSet> ConvertDomMap(const std::unordered_map<const VarNode*, IntSet>&
  * \return An integer set that can cover all the possible values of e.
  */
 IntSet EvalSet(PrimExpr e, const Map<IterVar, IntSet>& dom_map);
+/*!
+ * \brief Find an symbolic integer set that contains all possible values of
+ *  e given the domain of each variables.
+ *
+ * \param e The expression to be evaluated.
+ * \param dom_map The domain of each variable.
+ * \return An integer set that can cover all the possible values of e.
+ */
+IntSet EvalSet(PrimExpr e, const Map<Var, IntSet>& dom_map);
 /*!
  * \brief Same as EvalSet, but takes unordered_map
  *
diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h
index 9ec2841ebd5e..3394e37070ff 100644
--- a/include/tvm/tir/schedule/schedule.h
+++ b/include/tvm/tir/schedule/schedule.h
@@ -413,6 +413,14 @@ class ScheduleNode : public runtime::Object {
    */
   virtual Array<BlockRV> CacheInplace(const BlockRV& block_rv, int read_buffer_index,
                                       const String& storage_scope) = 0;
+  /*!
+   * \brief Create a block to cache precomputed index for later use.
+   * if there is no index computation, keep unchanged.
+   * \param block_rv The target block
+   * \param buffer_index The index of the target buffer in block's read region
+   * \return The cache stage blocks.
+   */
+  virtual Array<BlockRV> CacheIndex(const BlockRV& block_rv, int buffer_index) = 0;
   /*!
    * \brief Create a block that read/write a buffer region into a read/write cache with reindexing.
    * The layout of the cache will be the same as by the iterators of the block that reads/writes the
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index 4814271f4023..6c620045e90d 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -1278,6 +1278,92 @@ def cache_inplace(data_io: T.Buffer[64, "int32"]) -> None:
             self, block, read_buffer_index, storage_scope
         )
 
+    @type_checked
+    def cache_index(
+        self, block: Union[BlockRV, str], buffer_index: Union[int, str, Buffer]
+    ) -> List[BlockRV]:
+        """Create a block to cache precomputed index for later use.
+        if there is no index computation, keep unchanged.
+
+        Parameters
+        ----------
+        block : Union[BlockRV, str]
+            The target block operates on the target buffer.
+
+        buffer_index: int
+            The index of the target buffer in block's read region
+
+
+        Returns
+        -------
+        cached_blocks : List[BlockRV]
+            The blocks of the stage writing the cache buffers
+
+        Examples
+        --------
+        Before cache_inplace, in TensorIR, the IR is:
+
+        .. code-block:: python
+
+            @T.prim_func
+            def resize(a: T.handle, b: T.handle) -> None:
+                A = T.match_buffer(a, (1, 3, 40, 40))
+                B = T.match_buffer(b, (1, 3, 80, 80))
+                for i0, i1, i2, i3 in T.grid(1, 3, 80, 80):
+                    with T.block("A"):
+                        n, c, vi, vj = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                        B[n, c, vi, vj] = A[n, c, vi//4 + vj//4, vj//2]
+
+        Create the schedule and cache_index:
+
+        .. code-block:: python
+
+            sch = tir.Schedule(resize)
+            block_a = sch.get_block("A")
+            sch.cache_index(block_a, 0)
+            print(sch.mod["main"].script())
+
+        After applying cache_index, the IR becomes:
+
+        .. code-block:: python
+
+            @T.prim_func
+            def resize_cache_index(
+                A: T.Buffer[(1, 3, 40, 40), "float32"], B: T.Buffer[(1, 3, 80, 80), "float32"]
+            ) -> None:
+                index_var_0 = T.alloc_buffer([80, 80], dtype="int32", strides=[1])
+                index_var_1 = T.alloc_buffer([80], dtype="int32", strides=[1])
+                for ax0, ax1 in T.grid(80, 80):
+                    with T.block("index_0"):
+                        v0 = T.axis.spatial(80, ax0)
+                        v1 = T.axis.spatial(80, ax1)
+                        T.reads()
+                        T.writes(index_var_0[v0, v1])
+                        index_var_0[v0, v1] = v0 // 4 + v1 // 4
+                for ax0 in T.serial(80):
+                    with T.block("index_1"):
+                        v0 = T.axis.spatial(80, ax0)
+                        T.reads()
+                        T.writes(index_var_1[v0])
+                        index_var_1[v0] = v0 // 2
+                for i0, i1, i2, i3 in T.grid(1, 3, 80, 80):
+                    with T.block("A"):
+                        n, c, vi, vj = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                        T.reads(A[n, c, vi // 4 + vj // 4, vj // 2])
+                        T.writes(B[n, c, vi, vj])
+                        B[n, c, vi, vj] = A[n, c, index_var_0[vi, vj], index_var_1[vj]]
+
+        """
+        block = self._normalize_block_arg(block)
+
+        if not isinstance(buffer_index, int):
+            _, buffer_index, _ = self._normalize_buffer_arg(
+                block, buffer_index, required_buffer_type="read"
+            )
+        return _ffi_api.ScheduleCacheIndex(  # type: ignore # pylint: disable=no-member
+            self, block, buffer_index
+        )
+
     @type_checked
     def reindex(
         self,
diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc
index 3960087cf745..7144ba8ae1f5 100644
--- a/src/tir/schedule/concrete_schedule.cc
+++ b/src/tir/schedule/concrete_schedule.cc
@@ -574,6 +574,19 @@ Array<BlockRV> ConcreteScheduleNode::CacheInplace(const BlockRV& block_rv, int w
   return return_blocks;
 }
 
+Array<BlockRV> ConcreteScheduleNode::CacheIndex(const BlockRV& block_rv, int buffer_index) {
+  Array<StmtSRef> result;
+  TVM_TIR_SCHEDULE_BEGIN();
+  result = tir::CacheIndex(state_, this->GetSRef(block_rv), buffer_index);
+  TVM_TIR_SCHEDULE_END("cache-index", this->error_render_level_);
+  this->state_->DebugVerify();
+  Array<BlockRV> return_blocks;
+  for (const StmtSRef& blockrv : result) {
+    return_blocks.push_back(CreateRV<BlockRV>(blockrv));
+  }
+  return return_blocks;
+}
+
 BlockRV ConcreteScheduleNode::ReIndex(const BlockRV& block_rv, int buffer_index,
                                       BufferIndexType buffer_index_type) {
   StmtSRef result{nullptr};
diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h
index bfdc082d4ce6..384b1ce2425f 100644
--- a/src/tir/schedule/concrete_schedule.h
+++ b/src/tir/schedule/concrete_schedule.h
@@ -118,6 +118,7 @@ class ConcreteScheduleNode : public ScheduleNode {
                      const String& storage_scope) override;
   Array<BlockRV> CacheInplace(const BlockRV& block_rv, int read_buffer_index,
                               const String& storage_scope) override;
+  Array<BlockRV> CacheIndex(const BlockRV& block_rv, int write_buffer_index) override;
   BlockRV ReIndex(const BlockRV& block_rv, int buffer_index,
                   BufferIndexType buffer_index_type) override;
   /******** Schedule: Compute location ********/
diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h
index 88331fb5b9d3..8e5ab91b8e7c 100644
--- a/src/tir/schedule/primitive.h
+++ b/src/tir/schedule/primitive.h
@@ -279,6 +279,15 @@ TVM_DLL StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int
  */
 TVM_DLL Array<StmtSRef> CacheInplace(ScheduleState self, const StmtSRef& block_sref,
                                      int read_buffer_index, const String& storage_scope);
+/*!
+ * \brief Create a block to cache precomputed index for later use.
+ * if there is no index computation, keep unchanged.
+ * \param block_sref The target block
+ * \param buffer_index The index of the target buffer in block's read region,
+ * \return The cache stage block.
+ */
+TVM_DLL Array<StmtSRef> CacheIndex(ScheduleState self, const StmtSRef& block_sref,
+                                   int buffer_index);
 /*!
  *!
  * \brief Create a block that read/write a buffer region into a read/write cache with reindexing.
diff --git a/src/tir/schedule/primitive/cache_index.cc b/src/tir/schedule/primitive/cache_index.cc
new file mode 100644
index 000000000000..ba58f81038cb
--- /dev/null
+++ b/src/tir/schedule/primitive/cache_index.cc
@@ -0,0 +1,484 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/arith/int_set.h>
+
+#include "../utils.h"
+
+namespace tvm {
+namespace tir {
+
+/******** Helper Functions/Classes ********/
+
+/*! \brief The auxiliary info used for the insertion point and content of the cache stage. */
+struct IndexInfo {
+  /*! \brief The target buffer to cache the index. */
+  Buffer target_buffer;
+  /*! \brief The cache buffer to store the precomputed index */
+  std::vector<Buffer> cache_buffer;
+  /*! \brief The expr to be precomputed */
+  std::vector<PrimExpr> index_exprs;
+  /*! \brief The range of the loop vars relating to index computation */
+  Map<Var, Range> range_map;
+  /*! \brief The binding table of the block var and the loop var */
+  Map<Var, PrimExpr> var_binding;
+  /*! \brief The block var of the target block */
+  std::vector<Array<Var>> origin_block_vars;
+  /*! \brief The index to insert the cache stage. */
+  size_t loc_pos;
+  /*! \brief The cache stage to be inserted. */
+  Stmt cache_stage;
+  /*! \brief The map used for ScheduleStateNode::Replace. */
+  Map<Block, Block> block_reuse;
+};
+
+/*!
+ * \brief Determine the data type base on the integer range.
+ * \param range The range of the integer.
+ * \returns A data type that covers the input range.
+ */
+DataType DetermineDatatype(const arith::IntSet& range) {
+  arith::Analyzer ana;
+  if (ana.CanProve(range.min() >= INT32_MIN && range.max() <= INT32_MAX)) {
+    return DataType::Int(32);
+  } else {
+    ICHECK(ana.CanProve(range.min() >= make_const(DataType::Int(64), INT64_MIN) &&
+                        range.max() <= make_const(DataType::Int(64), INT64_MAX)));
+    return DataType::Int(64);
+  }
+}
+
+/*! \brief Collect the index info to be cached */
+class IndexInfoCollector : public StmtExprVisitor {
+ public:
+  /*!
+   * \brief Collect the index info for cache_index and write into the IndexInfo
+   * \param self The state of the schedule \param block_sref The sref of the target
+   * block of the target buffer being applied cache_index \param scope_sref The sref
+   * of the scope block of the target block \param info The index info.
+   */
+  static void Collect(const ScheduleState& self, const StmtSRef& block_sref,
+                      const StmtSRef& scope_sref, IndexInfo* info) {
+    IndexInfoCollector collector(self, block_sref, scope_sref, info->target_buffer);
+    collector(GetRef<Stmt>(scope_sref->stmt));
+    // info->loc_sref = collector.loc_sref_;
+    info->loc_pos = collector.loc_pos_;
+    info->index_exprs = collector.exprs_;
+    info->range_map = collector.range_map_;
+  }
+
+ private:
+  /*!
+   * \brief Constructor
+   * \param self The state of the schedule
+   * \param block_sref The sref of the target block of the buffer being applied cache_index
+   * \param scope_sref The sref of the scope block of the target block
+   * \param buffer The target buffer to cache the indexs
+   */
+  IndexInfoCollector(const ScheduleState self, const StmtSRef& block_sref,
+                     const StmtSRef& scope_sref, const Buffer& buffer)
+      : self_(self), block_sref_(block_sref), scope_sref_(scope_sref), buffer_(buffer) {}
+
+  void VisitStmt_(const SeqStmtNode* seq_stmt) final {
+    for (size_t i = 0; i < seq_stmt->size(); ++i) {
+      if (loc_pos_ != -1) {
+        break;
+      }
+      VisitStmt(seq_stmt->seq[i]);
+      // `pos` can be assigned only once when we visited `block_sref`
+      if (visited_block_ && loc_pos_ == -1 && update_seq_pos_) {
+        // The offset of insert position from the block
+        loc_pos_ = i;
+        return;
+      }
+    }
+  }
+
+  void VisitStmt_(const BlockNode* block) final {
+    // Only visit the target's parent block
+    StmtVisitor::VisitStmt_(block);
+    if (block == scope_sref_->stmt) {
+      // The block vistied is the current parent scope
+      // Handling cases when no SeqStmt in the scope
+      if (visited_block_ && loc_pos_ == -1) {
+        loc_pos_ = 0;
+      }
+    } else if (block_sref_->stmt == block) {
+      visited_block_ = true;
+    }
+    // Update seq pos only at top scope
+    if (visited_block_ && self_->stmt2ref.at(block)->parent == scope_sref_.get()) {
+      update_seq_pos_ = true;
+    }
+  }
+
+  void VisitStmt_(const ForNode* loop) final {
+    range_map_.Set(loop->loop_var, Range::FromMinExtent(loop->min, loop->extent));
+    StmtVisitor::VisitStmt_(loop);
+    // Update seq pos only at top scope
+    if (visited_block_ && self_->stmt2ref.at(loop)->parent == scope_sref_.get()) {
+      update_seq_pos_ = true;
+    }
+  }
+
+  void VisitExpr_(const BufferLoadNode* load) final {
+    if (load->buffer.same_as(buffer_)) {
+      for (const PrimExpr& it : load->indices) {
+        if (!it->IsInstance<VarNode>()) {
+          exprs_.push_back(it);
+        }
+      }
+    }
+    ExprVisitor::VisitExpr_(load);
+  }
+
+  /*! \brief The schedule class */
+  const ScheduleState self_;
+  /*! \brief The target block that read the target buffer */
+  const StmtSRef& block_sref_;
+  /*! \brief The parent scope of the target block */
+  const StmtSRef& scope_sref_;
+  /*! \brief The target buffer to cache the index */
+  const Buffer& buffer_;
+  /*! \brief The calculation expr to be precomputed */
+  std::vector<PrimExpr> exprs_;
+  /*! \brief The flag whether we have visited the target block */
+  bool visited_block_{false};
+  /*! \brief The index to insert the cache_index stage */
+  int loc_pos_{-1};
+  /*! \brief The flag indicating the right scope to update seq pos */
+  bool update_seq_pos_{false};
+  /*! \brief Record the ranges of iter vars */
+  Map<Var, Range> range_map_;
+};
+
+/*!
+ * \brief Create a loop nest that writes precomputed index into index buffer.
+ * \param cache_region The cached copy region.
+ * \param info The cache stage information, which will be updated in the function.
+ * \param storage_scope The storage scope of the cached buffer (only used in naming here)
+ * \returns A block indicating the body of the loop nesting.
+ */
+Array<Block> MakeIndexCacheStage(IndexInfo* info) {
+  Array<Block> blocks;
+  Array<Stmt> bodies;
+  bodies.reserve(info->index_exprs.size());
+  info->cache_buffer.reserve(info->index_exprs.size());
+  const String& storage_scope = info->target_buffer.scope();
+
+  // For each index calculation, create a block to pre-compute.
+  for (size_t expr_index = 0; expr_index < info->index_exprs.size(); expr_index++) {
+    const PrimExpr& index_expr = info->index_exprs[expr_index];
+
+    // Collect the block vars in original index computation
+    info->origin_block_vars.push_back({});
+    PostOrderVisit(index_expr, [&info, &expr_index](const ObjectRef& node) {
+      if (node->IsInstance<VarNode>()) {
+        Var iter_var = Downcast<Var>(node);
+        const Array<Var>& origin_block_var = info->origin_block_vars[expr_index];
+        auto find_result = std::find_if(origin_block_var.begin(), origin_block_var.end(),
+                                        [&](Var it) { return it.get() == iter_var.get(); });
+        if (find_result == origin_block_var.end()) {
+          info->origin_block_vars[expr_index].push_back(iter_var);
+        }
+      }
+    });
+
+    // Collect the loop vars corresponding to collected block vars,
+    // which will be used to create new loop vars
+    std::vector<Var> iter_vars;
+    for (const Var& it : info->origin_block_vars[expr_index]) {
+      PostOrderVisit(info->var_binding.at(it), [&info, &iter_vars](const ObjectRef& node) {
+        if (node->IsInstance<VarNode>()) {
+          Var iter_var = Downcast<Var>(node);
+          if (std::find_if(iter_vars.begin(), iter_vars.end(),
+                           [&](Var it) { return it.get() == iter_var.get(); }) == iter_vars.end()) {
+            iter_vars.push_back(iter_var);
+          }
+        }
+      });
+    }
+
+    // Inference the shape and create cache buffer
+    arith::IntSet val_range =
+        arith::EvalSet(Substitute(index_expr, info->var_binding), arith::AsIntSet(info->range_map));
+    DataType data_type = DetermineDatatype(val_range);
+    Var index_buffer_var("index_var_" + std::to_string(expr_index),
+                         PointerType(PrimType(data_type), storage_scope));
+    Array<PrimExpr> buffer_shape;
+    for (const Var& it : info->origin_block_vars[expr_index]) {
+      buffer_shape.push_back(
+          arith::EvalSet(info->var_binding.at(it), arith::AsIntSet(info->range_map)).max() + 1);
+    }
+    info->cache_buffer.push_back(Buffer(index_buffer_var, data_type, buffer_shape, {1}, {0},
+                                        index_buffer_var->name_hint, 0, 0, kDefault));
+
+    // Create loop vars and block vars' binding_value
+    std::vector<Var> loop_vars;
+    Map<Var, PrimExpr> replace_table;
+    for (const Var& it : iter_vars) {
+      DataType data_type = DetermineDatatype(arith::IntSet::FromRange(info->range_map.at(it)));
+      Var loop_var("ax" + std::to_string(replace_table.size()), data_type);
+      loop_vars.push_back(loop_var);
+      replace_table.Set(it, loop_var);
+    }
+    // Create iter_values from the original block.
+    std::vector<PrimExpr> iter_values;
+    for (const Var& it : info->origin_block_vars[expr_index]) {
+      iter_values.push_back(Substitute(info->var_binding.at(it), replace_table));
+    }
+    // block variables
+    Array<IterVar> block_vars;
+    // block access region for write buffers
+    Region access_region;
+    // indices used in block body
+    Array<PrimExpr> access_indices;
+    Map<Var, PrimExpr> block_var_map;
+    // Create block vars, block's accessed region and accessing indices
+    for (size_t i = 0; i < info->origin_block_vars[expr_index].size(); i++) {
+      const Var& block_var = info->origin_block_vars[expr_index][i];
+      Var var("v" + std::to_string(access_indices.size()), block_var.dtype());
+      Range range = Range::FromMinExtent(make_zero(block_var.dtype()),
+                                         info->range_map.at(iter_vars[i])->extent);
+      block_vars.push_back(IterVar(/*dom=*/range,
+                                   /*var=*/var,
+                                   /*IterVarType=*/kDataPar));
+
+      access_indices.push_back(var);
+      access_region.push_back(Range::FromMinExtent(var, make_const(var.dtype(), 1)));
+      block_var_map.Set(block_var, var);
+    }
+
+    // Create the index computing block
+    PrimExpr new_expr = Substitute(index_expr, block_var_map);
+    Block block(
+        /*iter_vars=*/std::move(block_vars),
+        /*reads=*/{},
+        /*writes=*/{BufferRegion(info->cache_buffer[expr_index], access_region)},
+        /*name_hint=*/"index_" + std::to_string(expr_index),
+        /*body=*/
+        BufferStore(info->cache_buffer[expr_index], new_expr, access_indices),
+        /*init=*/NullOpt,
+        /*alloc_buffers=*/{},
+        /*match_buffers=*/{},
+        /*annotations=*/{});
+    blocks.push_back(block);
+    // Create the block realize node
+    Stmt body = BlockRealize(/*values=*/iter_values,
+                             /*predicate=*/const_true(),
+                             /*block=*/block);
+    // Create surrounding loops
+    for (size_t i = loop_vars.size(); i >= 1; --i) {
+      body = For(/*loop_var=*/loop_vars[i - 1],
+                 /*min=*/0,
+                 /*extent=*/info->range_map.at(iter_vars[i - 1])->extent,
+                 /*kind=*/ForKind::kSerial,
+                 /*body=*/body);
+    }
+    bodies.push_back(body);
+  }
+
+  info->cache_stage = SeqStmt(bodies);
+  return blocks;
+}
+
+/*!
+ * \brief Insert the cache stages into the specific position
+ * \param stmt A sequence of statements or a single statement that the new stage is inserted in
+ * \param pos The position where the cache stage is inserted
+ * \param stage The stage to be inserted
+ * \return A SeqStmt, the result after insertion
+ */
+Stmt InsertIndexStage(const Stmt& stmt, int pos, const Stmt& stage) {
+  if (const auto* seq_stmt = stmt.as<SeqStmtNode>()) {
+    ObjectPtr<SeqStmtNode> result = make_object<SeqStmtNode>(*seq_stmt);
+    result->seq.insert(result->seq.begin() + pos, stage);
+    return SeqStmt(result);
+  }
+  if (pos == 0) {
+    return SeqStmt::Flatten<Array<Stmt>>({stage, stmt});
+  }
+  ICHECK_EQ(pos, 1);
+  return SeqStmt::Flatten<Array<Stmt>>({stmt, stage});
+}
+
+/*! \brief Mutator for CacheIndex. */
+class CacheIndexRewriter : public StmtExprMutator {
+ public:
+  /*!
+   * \brief Rewrite the AST and add stages of writting precomputed index
+   * \param scope_sref The parent scope of this mutation
+   * \param info The index information
+   * \return The new AST rooting at the original parent scope
+   */
+  static Stmt Rewrite(const StmtSRef& scope_sref, IndexInfo* info) {
+    CacheIndexRewriter rewriter(scope_sref, info);
+    return rewriter(GetRef<Stmt>(scope_sref->stmt));
+  }
+
+ private:
+  explicit CacheIndexRewriter(const StmtSRef& scope_sref, IndexInfo* info)
+      : scope_sref_(scope_sref), info_(info) {
+    cache_indices_.reserve(info_->origin_block_vars.size());
+    for (const Array<Var>& group_it : info_->origin_block_vars) {
+      cache_indices_.push_back({});
+      for (const Var& it : group_it) {
+        cache_indices_.back().push_back(it);
+      }
+    }
+  }
+
+  Stmt VisitStmt_(const BlockNode* block) final {
+    Block old_stmt = GetRef<Block>(block);
+    // Mutate the body
+    Block stmt = Downcast<Block>(StmtMutator::VisitStmt_(block));
+
+    // Check if it is the block corresponding to the parent scope
+    if (block == scope_sref_->stmt) {
+      // If so, put buffer allocation and insert cache stages on the parent scope
+      ObjectPtr<BlockNode> n = make_object<BlockNode>(*stmt.as<BlockNode>());
+      n->body = InsertIndexStage(n->body, info_->loc_pos, info_->cache_stage);
+      for (const Buffer& it : info_->cache_buffer) {
+        n->alloc_buffers.push_back(it);
+      }
+      stmt = Block(n);
+    }
+    info_->block_reuse.Set(old_stmt, stmt);
+    return std::move(stmt);
+  }
+
+  PrimExpr VisitExpr_(const BufferLoadNode* load) final {
+    if (load->buffer.same_as(info_->target_buffer)) {
+      // Rewrite the target buffer load
+      Array<PrimExpr> new_indices;
+      for (const PrimExpr& index : load->indices) {
+        auto it = std::find_if(info_->index_exprs.begin(), info_->index_exprs.end(),
+                               [&](PrimExpr& e) { return e.get() == index.get(); });
+        if (it == info_->index_exprs.end()) {
+          new_indices.push_back(index);
+        } else {
+          // Replace load index with cached index
+          auto offset = std::distance(info_->index_exprs.begin(), it);
+          new_indices.push_back(BufferLoad(info_->cache_buffer[offset], cache_indices_[offset]));
+        }
+      }
+      return BufferLoad(load->buffer, new_indices);
+    }
+    return ExprMutator::VisitExpr_(load);
+  }
+
+  PrimExpr VisitExpr_(const LoadNode* op) final {
+    LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
+    return PrimExpr();
+  }
+
+ private:
+  /*! \brief The parent scope of the insertion */
+  const StmtSRef& scope_sref_;
+  /*! \brief The info for inserting cache stage */
+  IndexInfo* info_;
+  /*! \brief The indices for the cache buffer */
+  std::vector<Array<PrimExpr>> cache_indices_;
+};
+
+Array<StmtSRef> CacheIndex(ScheduleState self, const StmtSRef& block_sref, int buffer_index) {
+  /*!
+   * Check:
+   *   - The index is in the array of block reading region
+   *
+   * Mutate:
+   *   - Allocate new cache buffers under the current scope.
+   *   - Precompute the index and store it in cache buffers.
+   */
+
+  // Step 0. Checking index, getting the target buffer and the parent scope
+  IndexInfo info;
+  const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
+  info.target_buffer =
+      GetNthAccessBuffer(self, GetRef<Block>(block), buffer_index, BufferIndexType::kRead);
+  StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
+
+  // Step 1. Collect the indexing info of target buffer.
+  IndexInfoCollector::Collect(self, block_sref, scope_sref, &info);
+
+  // Step 2. Create cache stages and rewrite the stmt.
+  BlockRealize realize = GetBlockRealize(self, block_sref);
+  info.var_binding = GetBindings(realize);
+  Array<Block> cache_stages = MakeIndexCacheStage(&info);
+  Stmt new_scope = CacheIndexRewriter::Rewrite(/*scope_sref=*/scope_sref, /*info=*/&info);
+
+  bool old_stage_pipeline = self->block_info[block_sref].scope->stage_pipeline;
+
+  // Step 3. Replacing and updating flags.
+  self->Replace(scope_sref, new_scope, info.block_reuse);
+  Array<StmtSRef> result_block_srefs;
+  for (const Block& it : cache_stages) {
+    StmtSRef result_block_sref = self->stmt2ref.at(it.get());
+    result_block_srefs.push_back(result_block_sref);
+    BlockInfo& block_info = self->block_info[result_block_sref];
+
+    bool affine_binding = false;
+    if (result_block_sref->parent == nullptr) {
+      affine_binding = true;
+    } else {
+      arith::Analyzer analyzer;
+      StmtSRef parent_sref = GetRef<StmtSRef>(result_block_sref->parent);
+      affine_binding = IsAffineBinding(/*realize=*/GetBlockRealize(self, result_block_sref),
+                                       /*loop_var_ranges=*/LoopDomainOfSRefTreePath(parent_sref),
+                                       /*analyzer=*/&analyzer);
+    }
+
+    block_info.affine_binding = affine_binding;
+    block_info.region_cover = true;
+    block_info.scope->stage_pipeline = old_stage_pipeline;
+  }
+
+  return result_block_srefs;
+}
+
+/******** InstructionKind Registration ********/
+
+struct CacheIndexTraits : public UnpackedInstTraits<CacheIndexTraits> {
+  static constexpr const char* kName = "CacheIndex";
+  static constexpr bool kIsPure = false;
+
+ private:
+  static constexpr size_t kNumInputs = 1;
+  static constexpr size_t kNumAttrs = 1;
+  static constexpr size_t kNumDecisions = 0;
+
+  static Array<BlockRV> UnpackedApplyToSchedule(Schedule sch, BlockRV block, Integer buffer_index) {
+    return sch->CacheIndex(block, buffer_index->value);
+  }
+
+  static String UnpackedAsPython(Array<String> outputs, String block, Integer buffer_index) {
+    PythonAPICall py("cache_index");
+    py.Input("block", block);
+    py.Input("buffer_index", buffer_index->value);
+    py.OutputList(outputs);
+    return py.Str();
+  }
+
+  template <typename>
+  friend struct ::tvm::tir::UnpackedInstTraits;
+};
+
+TVM_REGISTER_INST_KIND_TRAITS(CacheIndexTraits);
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc
index 280d0af92a8c..6425ae0766ae 100644
--- a/src/tir/schedule/schedule.cc
+++ b/src/tir/schedule/schedule.cc
@@ -181,6 +181,8 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleCacheWrite")
     .set_body_method<Schedule>(&ScheduleNode::CacheWrite);
 TVM_REGISTER_GLOBAL("tir.schedule.ScheduleCacheInplace")
     .set_body_method<Schedule>(&ScheduleNode::CacheInplace);
+TVM_REGISTER_GLOBAL("tir.schedule.ScheduleCacheIndex")
+    .set_body_method<Schedule>(&ScheduleNode::CacheIndex);
 TVM_REGISTER_GLOBAL("tir.schedule.ScheduleReIndex")
     .set_body_typed([](Schedule self, const BlockRV& block_rv, int buffer_index,
                        int buffer_index_type) {
diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc
index b67b008feda4..f2ad27fb6962 100644
--- a/src/tir/schedule/traced_schedule.cc
+++ b/src/tir/schedule/traced_schedule.cc
@@ -323,6 +323,20 @@ Array<BlockRV> TracedScheduleNode::CacheInplace(const BlockRV& block_rv, int rea
   return result;
 }
 
+Array<BlockRV> TracedScheduleNode::CacheIndex(const BlockRV& block_rv, int buffer_index) {
+  Array<BlockRV> result = ConcreteScheduleNode::CacheIndex(block_rv, buffer_index);
+  Array<ObjectRef> outputs;
+  for (const BlockRV& r : result) {
+    outputs.push_back(r);
+  }
+  static const InstructionKind& kind = InstructionKind::Get("CacheIndex");
+  trace_->Append(/*inst=*/Instruction(/*kind=*/kind,
+                                      /*inputs=*/{block_rv},
+                                      /*attrs=*/{Integer(buffer_index)},
+                                      /*outputs=*/outputs));
+  return result;
+}
+
 BlockRV TracedScheduleNode::ReIndex(const BlockRV& block_rv, int buffer_index,
                                     BufferIndexType buffer_index_type) {
   BlockRV result = ConcreteScheduleNode::ReIndex(block_rv, buffer_index, buffer_index_type);
diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h
index 016de60726b9..06128c1a6ebc 100644
--- a/src/tir/schedule/traced_schedule.h
+++ b/src/tir/schedule/traced_schedule.h
@@ -80,6 +80,7 @@ class TracedScheduleNode : public ConcreteScheduleNode {
                               const String& storage_scope) final;
   BlockRV ReIndex(const BlockRV& block_rv, int buffer_index,
                   BufferIndexType buffer_index_type) final;
+  Array<BlockRV> CacheIndex(const BlockRV& block_rv, int buffer_index) final;
   /******** Schedule: Compute location ********/
   void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops,
                  int index = -1) final;
diff --git a/tests/python/unittest/test_tir_schedule_cache_index.py b/tests/python/unittest/test_tir_schedule_cache_index.py
new file mode 100644
index 000000000000..0c2882d1b617
--- /dev/null
+++ b/tests/python/unittest/test_tir_schedule_cache_index.py
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-function-docstring,missing-module-docstring
+import sys
+
+import pytest
+import tvm
+import tvm.testing
+from tvm import tir
+from tvm.script import tir as T
+from tvm.tir.schedule.testing import verify_trace_roundtrip
+
+# pylint: disable=no-member,invalid-name,unused-variable
+
+########## Function before schedule ##########
+
+
+@T.prim_func
+def resize(a: T.handle, b: T.handle) -> None:
+    A = T.match_buffer(a, (1, 3, 40, 40))
+    B = T.match_buffer(b, (1, 3, 80, 80))
+    for i0, i1, i2, i3 in T.grid(1, 3, 80, 80):
+        with T.block("A"):
+            n, c, vi, vj = T.axis.remap("SSSS", [i0, i1, i2, i3])
+            B[n, c, vi, vj] = A[n, c, vi // 4 + vj // 4, vj // 2]
+
+
+@T.prim_func
+def resize_cache_index(
+    A: T.Buffer[(1, 3, 40, 40), "float32"], B: T.Buffer[(1, 3, 80, 80), "float32"]
+) -> None:
+    index_var_0 = T.alloc_buffer([80, 80], dtype="int32", strides=[1])
+    index_var_1 = T.alloc_buffer([80], dtype="int32", strides=[1])
+    for ax0, ax1 in T.grid(80, 80):
+        with T.block("index_0"):
+            v0 = T.axis.spatial(80, ax0)
+            v1 = T.axis.spatial(80, ax1)
+            T.reads()
+            T.writes(index_var_0[v0, v1])
+            index_var_0[v0, v1] = v0 // 4 + v1 // 4
+    for ax0 in T.serial(80):
+        with T.block("index_1"):
+            v0 = T.axis.spatial(80, ax0)
+            T.reads()
+            T.writes(index_var_1[v0])
+            index_var_1[v0] = v0 // 2
+    for i0, i1, i2, i3 in T.grid(1, 3, 80, 80):
+        with T.block("A"):
+            n, c, vi, vj = T.axis.remap("SSSS", [i0, i1, i2, i3])
+            T.reads(A[n, c, vi // 4 + vj // 4, vj // 2])
+            T.writes(B[n, c, vi, vj])
+            B[n, c, vi, vj] = A[n, c, index_var_0[vi, vj], index_var_1[vj]]
+
+
+def test_inplace_cache_read():
+    sch = tvm.tir.Schedule(resize, debug_mask="all")
+    block = sch.get_block("A")
+    sch.cache_index(block, 0)
+    tvm.ir.assert_structural_equal(resize_cache_index, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=resize)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 3cce9738bd4a6d94657d2979b47a20238956e5cd Mon Sep 17 00:00:00 2001
From: Ruihang Lai <ruihangl@cs.cmu.edu>
Date: Fri, 28 Oct 2022 04:36:08 -0400
Subject: [PATCH 456/704] [BugFix][TIR] Affine-binding check should not
 simplify trivial iterators (#13203)

* Fix affine bindings

* Regression test and test update
---
 src/tir/schedule/analysis/analysis.cc         |  3 +-
 .../test_tir_schedule_state_cached_flags.py   | 30 +++++++++++
 ..._transform_lower_cross_thread_reduction.py | 54 ++++++++++---------
 3 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index d8b4f31f4c1b..a2c0bc759401 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -540,7 +540,8 @@ bool IsAffineBinding(const BlockRealize& realize, const Map<Var, Range>& loop_va
       /*input_iters=*/loop_var_ranges,
       /*predicate=*/realize->predicate,
       /*check_level=*/arith::IterMapLevel::Surjective,
-      /*analyzer=*/analyzer);
+      /*analyzer=*/analyzer,
+      /*simplify_trivial_iterators=*/false);
   if (res->indices.empty()) {
     return false;
   }
diff --git a/tests/python/unittest/test_tir_schedule_state_cached_flags.py b/tests/python/unittest/test_tir_schedule_state_cached_flags.py
index 987821714078..70935814ba40 100644
--- a/tests/python/unittest/test_tir_schedule_state_cached_flags.py
+++ b/tests/python/unittest/test_tir_schedule_state_cached_flags.py
@@ -438,6 +438,25 @@ def matmul_relu_padding(A: T.Buffer[(127, 127), "float16"], B: T.Buffer[(127, 12
             compute[i0_1, i1_1] = T.max(C[i0_1, i1_1], T.float32(0))
 
 
+@T.prim_func
+def splitted_square_sum_with_predicate(
+    A: T.Buffer[(1, 7, 7, 512), "float32"], B: T.Buffer[(1, 1, 1, 512), "float32"]
+) -> None:
+    for i0_i1_i2_i3_0_fused, ax0, ax1, ax2, ax3 in T.grid(2, 1, 1, 1, 256):
+        for ax4_ax5_fused_0, ax4_ax5_fused_1 in T.grid(1, 256):
+            with T.block("B"):
+                T.where(ax4_ax5_fused_0 * 256 + ax4_ax5_fused_1 < 49)
+                ax0_1, ax1_1, ax2_1 = T.axis.remap("SSS", [ax0, ax1, ax2])
+                ax3_1 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 256 + ax3)
+                rv0 = T.axis.reduce(7, (ax4_ax5_fused_0 * 256 + ax4_ax5_fused_1) // 7)
+                rv1 = T.axis.reduce(7, (ax4_ax5_fused_0 * 256 + ax4_ax5_fused_1) % 7)
+                T.reads(A[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1])
+                T.writes(B[ax0_1, ax1_1, ax2_1, ax3_1])
+                with T.init():
+                    B[ax0_1, ax1_1, ax2_1, ax3_1] = T.float32(0)
+                B[ax0_1, ax1_1, ax2_1, ax3_1] += A[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1]
+
+
 # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg
 # fmt: on
 
@@ -865,5 +884,16 @@ def test_matmul_relu_padding():
     # pylint: enable=protected-access
 
 
+def test_splitted_square_sum_with_predicate():
+    s = tir.ScheduleState(splitted_square_sum_with_predicate, debug_mask="all")
+    # pylint: disable=protected-access
+    assert s._get_cached_flags(_get_block(s, "B")) == CachedFlags(
+        affine_binding=True,
+        region_cover=True,
+        stage_pipeline=True,
+    )
+    # pylint: enable=protected-access
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
index 8c139b710e23..9ae4f4cf862e 100644
--- a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
+++ b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
@@ -341,8 +341,8 @@ def single_reduction_loop_with_block_predicate(
         for ax0, ax1_0 in T.grid(1, 1):
             for ax1_1 in T.thread_binding(512, thread="threadIdx.x"):
                 with T.block("T_softmax_maxelem"):
-                    i0_1 = T.axis.spatial(256, i0)
-                    k = T.axis.reduce(256, ax1_1)
+                    i0_1 = T.axis.spatial(256, i0 + ax0)
+                    k = T.axis.reduce(256, ax1_0 * 512 + ax1_1)
                     T.where(ax1_0 * 512 + ax1_1 < 256)
                     T.reads(A[i0_1, k])
                     T.writes(T_softmax_maxelem_shared[i0_1])
@@ -354,8 +354,8 @@ def single_reduction_loop_with_block_predicate(
         for ax0, ax1_0 in T.grid(1, 1):
             for ax1_1 in T.thread_binding(512, thread="threadIdx.x"):
                 with T.block("T_softmax_expsum"):
-                    i0_2 = T.axis.spatial(256, i0)
-                    k = T.axis.reduce(256, ax1_1)
+                    i0_2 = T.axis.spatial(256, i0 + ax0)
+                    k = T.axis.reduce(256, ax1_0 * 512 + ax1_1)
                     T.where(ax1_0 * 512 + ax1_1 < 256)
                     T.reads(A[i0_2, k], T_softmax_maxelem_shared[i0_2])
                     T.writes(T_softmax_expsum_shared[i0_2])
@@ -368,7 +368,7 @@ def single_reduction_loop_with_block_predicate(
             for i1_1 in T.thread_binding(512, thread="threadIdx.x"):
                 with T.block("T_softmax_norm"):
                     i0_3 = T.axis.spatial(256, i0)
-                    i1 = T.axis.spatial(256, i1_1)
+                    i1 = T.axis.spatial(256, i1_0 * 512 + i1_1)
                     T.where(i1_0 * 512 + i1_1 < 256)
                     T.reads(
                         A[i0_3, i1], T_softmax_maxelem_shared[i0_3], T_softmax_expsum_shared[i0_3]
@@ -392,19 +392,20 @@ def lowered_single_reduction_loop_with_block_predicate(
     cross_thread_1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local")
     in_thread_1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local")
     for i0 in T.serial(256):
-        for ax0, ax1_0 in T.grid(1, 1):
+        for ax0 in T.serial(1):
             for ax1_1 in T.thread_binding(512, thread="threadIdx.x"):
                 with T.block("T_softmax_maxelem_in_thread_init"):
                     T.reads()
                     T.writes(in_thread_0[0])
                     in_thread_0[0] = T.float32(-3.4028234663852886e38)
-                with T.block("T_softmax_maxelem_in_thread"):
-                    i0_1 = T.axis.spatial(256, i0)
-                    k = T.axis.reduce(256, ax1_1)
-                    T.where(ax1_0 * 512 + ax1_1 < 256)
-                    T.reads(A[i0_1, k])
-                    T.writes(in_thread_0[0])
-                    in_thread_0[0] = T.max(in_thread_0[0], A[i0_1, k])
+                for ax1_0 in T.serial(1):
+                    with T.block("T_softmax_maxelem_in_thread"):
+                        T.where(ax1_0 * 512 + ax1_1 < 256)
+                        i0_1 = T.axis.spatial(256, i0 + ax0)
+                        k = T.axis.reduce(256, ax1_0 * 512 + ax1_1)
+                        T.reads(A[i0_1, k])
+                        T.writes(in_thread_0[0])
+                        in_thread_0[0] = T.max(in_thread_0[0], A[i0_1, k])
                 with T.block("T_softmax_maxelem_cross_thread"):
                     T.reads(in_thread_0[0])
                     T.writes(cross_thread_0[0])
@@ -426,25 +427,26 @@ def lowered_single_reduction_loop_with_block_predicate(
                         )
                     )
                 with T.block("T_softmax_maxelem_write_back"):
-                    i0_2 = T.axis.spatial(256, i0)
+                    i0_2 = T.axis.spatial(256, i0 + ax0)
                     T.reads(cross_thread_0[0])
                     T.writes(T_softmax_maxelem_shared[i0_2])
                     T_softmax_maxelem_shared[i0_2] = cross_thread_0[0]
-        for ax0, ax1_0 in T.grid(1, 1):
+        for ax0 in T.serial(1):
             for ax1_1 in T.thread_binding(512, thread="threadIdx.x"):
                 with T.block("T_softmax_expsum_in_thread_init"):
                     T.reads()
                     T.writes(in_thread_1[0])
                     in_thread_1[0] = T.float32(0)
-                with T.block("T_softmax_expsum_in_thread"):
-                    i0_3 = T.axis.spatial(256, i0)
-                    k = T.axis.reduce(256, ax1_1)
-                    T.where(ax1_0 * 512 + ax1_1 < 256)
-                    T.reads(A[i0_3, k], T_softmax_maxelem_shared[i0_3])
-                    T.writes(in_thread_1[0])
-                    in_thread_1[0] = in_thread_1[0] + T.exp(
-                        A[i0_3, k] - T_softmax_maxelem_shared[i0_3], dtype="float32"
-                    )
+                for ax1_0 in T.serial(1):
+                    with T.block("T_softmax_expsum_in_thread"):
+                        T.where(ax1_0 * 512 + ax1_1 < 256)
+                        i0_3 = T.axis.spatial(256, i0 + ax0)
+                        k = T.axis.reduce(256, ax1_0 * 512 + ax1_1)
+                        T.reads(A[i0_3, k], T_softmax_maxelem_shared[i0_3])
+                        T.writes(in_thread_1[0])
+                        in_thread_1[0] = in_thread_1[0] + T.exp(
+                            A[i0_3, k] - T_softmax_maxelem_shared[i0_3], dtype="float32"
+                        )
                 with T.block("T_softmax_expsum_cross_thread"):
                     T.reads(in_thread_1[0])
                     T.writes(cross_thread_1[0])
@@ -464,7 +466,7 @@ def lowered_single_reduction_loop_with_block_predicate(
                         )
                     )
                 with T.block("T_softmax_expsum_write_back"):
-                    i0_4 = T.axis.spatial(256, i0)
+                    i0_4 = T.axis.spatial(256, i0 + ax0)
                     T.reads(cross_thread_1[0])
                     T.writes(T_softmax_expsum_shared[i0_4])
                     T_softmax_expsum_shared[i0_4] = cross_thread_1[0]
@@ -472,7 +474,7 @@ def lowered_single_reduction_loop_with_block_predicate(
             for i1_1 in T.thread_binding(512, thread="threadIdx.x"):
                 with T.block("T_softmax_norm"):
                     i0_5 = T.axis.spatial(256, i0)
-                    i1 = T.axis.spatial(256, i1_1)
+                    i1 = T.axis.spatial(256, i1_0 * 512 + i1_1)
                     T.where(i1_0 * 512 + i1_1 < 256)
                     T.reads(
                         A[i0_5, i1], T_softmax_maxelem_shared[i0_5], T_softmax_expsum_shared[i0_5]

From e1ac5658ad3daa774c3c3e1ba16a1dad1e1a6d8c Mon Sep 17 00:00:00 2001
From: joshherr-quic <95375797+joshherr-quic@users.noreply.github.com>
Date: Fri, 28 Oct 2022 11:15:07 -0500
Subject: [PATCH 457/704] [Hexagon]Pull and build specific LLVM sha for hexagon
 docker CI (#13199)

* Pull and build specific LLVM sha for hexagon docker CI

* Address comments and add default triple

* Build shouldn't need lld for codegen
---
 docker/install/ubuntu_install_hexagon.sh | 31 +++++++++++++++++++-----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/docker/install/ubuntu_install_hexagon.sh b/docker/install/ubuntu_install_hexagon.sh
index 18b8a0f66587..722cfaa40cb3 100755
--- a/docker/install/ubuntu_install_hexagon.sh
+++ b/docker/install/ubuntu_install_hexagon.sh
@@ -21,9 +21,28 @@ set -o pipefail
 
 # Install LLVM/clang
 CLANG_LLVM_HOME=/opt/clang-llvm
-CLANG_LLVM_VERSION=14.0.0
-CLANG_LLVM_FILENAME=clang_llvm.tar.xz
-wget -q https://github.com/llvm/llvm-project/releases/download/llvmorg-${CLANG_LLVM_VERSION}/clang+llvm-${CLANG_LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04.tar.xz -O ${CLANG_LLVM_FILENAME}
-mkdir ${CLANG_LLVM_HOME}
-tar -xvf ${CLANG_LLVM_FILENAME} -C ${CLANG_LLVM_HOME} --strip-components=1
-rm ${CLANG_LLVM_FILENAME}
+LLVM_SHA=361a27c155ec8b222e3318488a208c0eb39624c8
+
+mkdir llvm-hexagon
+pushd llvm-hexagon
+git init
+git remote add origin https://github.com/llvm/llvm-project.git
+git fetch origin ${LLVM_SHA}
+git reset --hard FETCH_HEAD
+mkdir build
+pushd build
+cmake \
+  -G Ninja \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_INSTALL_PREFIX=${CLANG_LLVM_HOME} \
+  -DLLVM_ENABLE_ASSERTIONS=ON \
+  -DLLVM_TARGETS_TO_BUILD:STRING="Hexagon;X86" \
+  -DLLVM_ENABLE_PROJECTS:STRING="clang;llvm" \
+  -DTARGET_TRIPLE=x86_64-unknown-linux-gnu \
+  -DLLVM_DEFAULT_TARGET_TRIPLE=x86_64-unknown-linux-gnu \
+  ../llvm
+ninja install
+
+popd
+popd
+rm -rf llvm-hexagon

From f42826eec49998452cff30a2b6510e7d3c31e3ec Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Sat, 29 Oct 2022 02:44:47 +0900
Subject: [PATCH 458/704] [MetaSchedule] Enable anchor-block tuning (#13206)

* Introduce new module equality to extract only anchor block tasks

* enabling application of anchor trace to different subgraph

* fixed anchor block extraction

* fixed UB in task extraction

* Reworked anchor trace application and inlining logic

* fixed anchor block extraction for winograd

* fix inline logic for winograd

* refactor, clean up, renaming

* fix reverse compute inline unapplicable case

* fixed get_block applicablity condition

* adding test

* introduce HasBlock utility

* Decoupled trace creation and application in Trace::ApplyJSONToschedule

* add test

* adding more test

* black

* Revert "Decoupled trace creation and application in Trace::ApplyJSONToschedule"

This reverts commit 02df571bff58064927659f6e81e1d35279826825.

* add tests

* add doc

* use anchor tuning in hexagon int8 tuning test

* cpplint

* suppress mypy on ffi

* add workaround for false positive maybe-uninitialized warning

* add a minimal anchor tuning test

* relax tol for i386, remove gpu test since it requires sm86

* add doc for "anchor-block" module equality

* address comments

* add test for cache_write + AllocateConst bug
---
 include/tvm/meta_schedule/database.h          |    8 +
 include/tvm/meta_schedule/schedule_rule.h     |    5 +-
 python/tvm/meta_schedule/__init__.py          |    1 +
 .../meta_schedule/database/json_database.py   |    4 +
 .../meta_schedule/database/memory_database.py |    4 +
 .../database/schedule_fn_database.py          |    4 +
 python/tvm/meta_schedule/relay_integration.py |    8 +
 python/tvm/meta_schedule/trace_apply.py       |   39 +
 python/tvm/meta_schedule/tune.py              |    4 +
 python/tvm/script/tir/__init__.py             |    5 +-
 python/tvm/script/tir/ty.py                   |    2 +-
 python/tvm/tir/schedule/analysis.py           |   18 +
 src/meta_schedule/module_equality.cc          |   24 +
 src/meta_schedule/module_equality.h           |    4 +
 src/meta_schedule/schedule_rule/auto_bind.cc  |    5 +-
 .../schedule_rule/schedule_rule.cc            |   18 +-
 .../space_generator/space_generator.cc        |    7 +-
 src/meta_schedule/trace_apply.cc              |  235 ++
 src/meta_schedule/trace_apply.h               |   48 +
 src/meta_schedule/utils.h                     |   36 +
 src/relay/backend/task_extraction.cc          |   75 +-
 src/relay/backend/te_compiler_cache.cc        |   20 +-
 src/target/target_kind.cc                     |    3 +
 src/tir/schedule/analysis/analysis.cc         |    2 +
 .../schedule/primitive/cache_read_write.cc    |    7 +-
 src/tir/schedule/utils.h                      |   46 +
 .../metaschedule_e2e/test_resnet50_int8.py    |    8 +-
 .../test_meta_schedule_relay_integration.py   |  145 +
 .../test_meta_schedule_trace_apply.py         | 2745 +++++++++++++++++
 .../test_meta_schedule_vnni_integration.py    |    2 +
 30 files changed, 3486 insertions(+), 46 deletions(-)
 create mode 100644 python/tvm/meta_schedule/trace_apply.py
 create mode 100644 src/meta_schedule/trace_apply.cc
 create mode 100644 src/meta_schedule/trace_apply.h
 create mode 100644 tests/python/unittest/test_meta_schedule_trace_apply.py

diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h
index 9eead8d5ec31..a1dd4a412eec 100644
--- a/include/tvm/meta_schedule/database.h
+++ b/include/tvm/meta_schedule/database.h
@@ -183,6 +183,10 @@ class DatabaseNode : public runtime::Object {
    *    - "structural": Use StructuralEqual/Hash
    *    - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
    *                        equality testing and hashing.
+   *    - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
+   *                      given module. The "ignore-ndarray" varint is used for the extracted blocks
+   *                      or in case no anchor block is found.
+   *                      For the definition of the anchor block, see tvm/tir/analysis.h.
    */
   explicit DatabaseNode(String mod_eq_name = "structural");
 
@@ -274,6 +278,10 @@ class PyDatabaseNode : public DatabaseNode {
    *    - "structural": Use StructuralEqual/Hash
    *    - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
    *                        equality testing and hashing.
+   *    - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
+   *                      given module. The "ignore-ndarray" varint is used for the extracted blocks
+   *                      or in case no anchor block is found.
+   *                      For the definition of the anchor block, see tvm/tir/analysis.h.
    */
   explicit PyDatabaseNode(String mod_eq_name = "structural");
 
diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h
index 3bc30e09c74a..1b018512146f 100644
--- a/include/tvm/meta_schedule/schedule_rule.h
+++ b/include/tvm/meta_schedule/schedule_rule.h
@@ -245,9 +245,12 @@ class ScheduleRule : public runtime::ObjectRef {
    * \brief Auto bind loops around the block to BlockIdx and ThreadIdx
    * \param max_threadblocks The maximum number of threadblock on GPU
    * \param thread_extents Candidates of thread axis extent.
+   * \param max_threads_per_block The maximum number of threads per block, if it is known
+   * when this schedule rule is created.
    * \return The schedule rule created
    */
-  TVM_DLL static ScheduleRule AutoBind(int max_threadblocks, Array<Integer> thread_extents);
+  TVM_DLL static ScheduleRule AutoBind(int max_threadblocks, Array<Integer> thread_extents,
+                                       int max_threads_per_block = -1);
   /*!
    * \brief Create a schedule rule with customized methods on the python-side.
    * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`.
diff --git a/python/tvm/meta_schedule/__init__.py b/python/tvm/meta_schedule/__init__.py
index 04acdc9d4a75..0dd679e047e0 100644
--- a/python/tvm/meta_schedule/__init__.py
+++ b/python/tvm/meta_schedule/__init__.py
@@ -30,6 +30,7 @@
     search_strategy,
     space_generator,
     tir_integration,
+    trace_apply,
 )
 from .builder import Builder
 from .cost_model import CostModel
diff --git a/python/tvm/meta_schedule/database/json_database.py b/python/tvm/meta_schedule/database/json_database.py
index f81d8913c18a..102a13b90d98 100644
--- a/python/tvm/meta_schedule/database/json_database.py
+++ b/python/tvm/meta_schedule/database/json_database.py
@@ -40,6 +40,10 @@ class JSONDatabase(Database):
           - "structural": Use StructuralEqual/Hash
           - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
                               equality testing and hashing.
+          - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
+                            given module. The "ignore-ndarray" varint is used for the extracted
+                            blocks or in case no anchor block is found.
+                            For the definition of the anchor block, see tir/analysis/analysis.py.
     """
 
     path_workload: str
diff --git a/python/tvm/meta_schedule/database/memory_database.py b/python/tvm/meta_schedule/database/memory_database.py
index 96b9bb5a0112..34a6a141970a 100644
--- a/python/tvm/meta_schedule/database/memory_database.py
+++ b/python/tvm/meta_schedule/database/memory_database.py
@@ -33,6 +33,10 @@ class MemoryDatabase(Database):
           - "structural": Use StructuralEqual/Hash
           - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
                               equality testing and hashing.
+          - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
+                            given module. The "ignore-ndarray" varint is used for the extracted
+                            blocks or in case no anchor block is found.
+                            For the definition of the anchor block, see tir/analysis/analysis.py.
     """
 
     def __init__(
diff --git a/python/tvm/meta_schedule/database/schedule_fn_database.py b/python/tvm/meta_schedule/database/schedule_fn_database.py
index 7a0b433996c5..c7d175cb79d3 100644
--- a/python/tvm/meta_schedule/database/schedule_fn_database.py
+++ b/python/tvm/meta_schedule/database/schedule_fn_database.py
@@ -39,6 +39,10 @@ class ScheduleFnDatabase(Database):
           - "structural": Use StructuralEqual/Hash
           - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
                               equality testing and hashing.
+          - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
+                            given module. The "ignore-ndarray" varint is used for the extracted
+                            blocks or in case no anchor block is found.
+                            For the definition of the anchor block, see tir/analysis/analysis.py.
     """
 
     def __init__(
diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py
index 089f6e412e20..5e77181d32bf 100644
--- a/python/tvm/meta_schedule/relay_integration.py
+++ b/python/tvm/meta_schedule/relay_integration.py
@@ -143,6 +143,10 @@ def extract_tasks(
           - "structural": Use StructuralEqual/Hash
           - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
                               equality testing and hashing.
+          - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
+                            given module. The "ignore-ndarray" varint is used for the extracted
+                            blocks or in case no anchor block is found.
+                            For the definition of the anchor block, see tir/analysis/analysis.py.
 
     Returns
     -------
@@ -288,6 +292,10 @@ def tune_relay(
           - "structural": Use StructuralEqual/Hash
           - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
                               equality testing and hashing.
+          - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
+                            given module. The "ignore-ndarray" varint is used for the extracted
+                            blocks or in case no anchor block is found.
+                            For the definition of the anchor block, see tir/analysis/analysis.py.
 
     Returns
     -------
diff --git a/python/tvm/meta_schedule/trace_apply.py b/python/tvm/meta_schedule/trace_apply.py
new file mode 100644
index 000000000000..c621cf973af2
--- /dev/null
+++ b/python/tvm/meta_schedule/trace_apply.py
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Specialized applications of trace"""
+from ..tir.schedule import Schedule, Trace
+from ..target import Target
+from . import _ffi_api
+
+
+def schedule_using_anchor_trace(sch: Schedule, anchor_trace: Trace, target: Target) -> None:
+    """Apply the trace from a TIR module whose anchor block is the same but fused elemewise op
+    blocks differ. This function can be used for transferring a trace tuned on a conv2d -> add
+    subgraph to other subgraphs having the same conv2d workload, for example. We call such trace
+    an "anchor trace". Those blocks that are not scheduled by the given anchor trace will be either
+    inlined or parallelized.
+
+    Parameters
+    ----------
+    sch : Schedule
+        The target schedule
+    anchor_trace: Trace
+        The trace generated for other TIR module having the same anchor block
+    target : tvm.target.Target
+        The compilation target
+    """
+    _ffi_api.ScheduleUsingAnchorTrace(sch, anchor_trace, target)  # type: ignore
diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py
index 07021eac3998..a69c8f126272 100644
--- a/python/tvm/meta_schedule/tune.py
+++ b/python/tvm/meta_schedule/tune.py
@@ -76,6 +76,10 @@ def tune_tasks(
           - "structural": Use StructuralEqual/Hash
           - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
                               equality testing and hashing.
+          - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
+                            given module. The "ignore-ndarray" varint is used for the extracted
+                            blocks or in case no anchor block is found.
+                            For the definition of the anchor block, see tir/analysis/analysis.py.
 
     Returns
     -------
diff --git a/python/tvm/script/tir/__init__.py b/python/tvm/script/tir/__init__.py
index d7db182f9d20..662dd10ec068 100644
--- a/python/tvm/script/tir/__init__.py
+++ b/python/tvm/script/tir/__init__.py
@@ -25,8 +25,9 @@
 # add all floating point and integer datatypes to the module
 for _dtype in ["float", "uint", "int"]:
     for _size in ["8", "16", "32", "64"]:
-        for _lanes in ["", "x4", "x8", "x16", "x32"]:
+        for _lanes in ["", "x4", "x8", "x16", "x32", "x64"]:
             from . import ty
 
             _name = _dtype + _size + _lanes
-            globals()[_name] = getattr(ty, _name)
+            if hasattr(ty, _name):
+                globals()[_name] = getattr(ty, _name)
diff --git a/python/tvm/script/tir/ty.py b/python/tvm/script/tir/ty.py
index b8323dd4a167..b17b571e88e7 100644
--- a/python/tvm/script/tir/ty.py
+++ b/python/tvm/script/tir/ty.py
@@ -202,7 +202,7 @@ def __getitem__(self, args):
 # add all floating point and integer datatypes to the module
 for _dtype in ["float", "uint", "int"]:
     for _size in ["8", "16", "32", "64"]:
-        for _lanes in ["", "x4", "x8", "x16", "x32"]:
+        for _lanes in ["", "x4", "x8", "x16", "x32", "x64"]:
             _name = _dtype + _size + _lanes
             globals()[_name] = ConcreteType(_name)
 
diff --git a/python/tvm/tir/schedule/analysis.py b/python/tvm/tir/schedule/analysis.py
index 90c585ac8ce1..e1c0019d9bf0 100644
--- a/python/tvm/tir/schedule/analysis.py
+++ b/python/tvm/tir/schedule/analysis.py
@@ -122,3 +122,21 @@ def get_auto_tensorize_mapping_info(
     intrinsics.
     """
     return _ffi_api.GetAutoTensorizeMappingInfo(sch, block, desc_func)  # type: ignore
+
+
+def has_block(sch: Schedule, block_name: str) -> bool:
+    """Query if the given block name exists in the module associated with the provided schedule.
+
+    Parameters
+    ----------
+    sch : Schedule
+        The schedule
+    block_name : str
+        The name of the block to query
+
+    Returns
+    -------
+    yes/no: bool
+        True if the given block exists in the schedule.
+    """
+    return _ffi_api.HasBlock(sch, block_name)  # type: ignore
diff --git a/src/meta_schedule/module_equality.cc b/src/meta_schedule/module_equality.cc
index caa7da170bd6..f9ffe82aa271 100644
--- a/src/meta_schedule/module_equality.cc
+++ b/src/meta_schedule/module_equality.cc
@@ -21,6 +21,7 @@
 #include <tvm/ir/module.h>
 #include <tvm/node/structural_equal.h>
 #include <tvm/node/structural_hash.h>
+#include <tvm/tir/analysis.h>
 
 #include <memory>
 
@@ -73,11 +74,34 @@ class ModuleEqualityIgnoreNDArray : public ModuleEquality {
   }
 };
 
+// The NDArray-ignoring variant of structural equal / hash is used for the module equality
+// on the extracted anchor blocks.
+class ModuleEqualityAnchorBlock : public ModuleEquality {
+  size_t Hash(IRModule mod) const {
+    auto anchor_block = tir::FindAnchorBlock(mod);
+    if (anchor_block) {
+      return SHashHandlerIgnoreNDArray().Hash(GetRef<tir::Block>(anchor_block), false);
+    }
+    return ModuleEqualityIgnoreNDArray().Hash(mod);
+  }
+  bool Equal(IRModule lhs, IRModule rhs) const {
+    auto anchor_block_lhs = tir::FindAnchorBlock(lhs);
+    auto anchor_block_rhs = tir::FindAnchorBlock(rhs);
+    if (anchor_block_lhs && anchor_block_rhs) {
+      return SEqualHandlerIgnoreNDArray().Equal(GetRef<tir::Block>(anchor_block_lhs),
+                                                GetRef<tir::Block>(anchor_block_rhs), false);
+    }
+    return ModuleEqualityIgnoreNDArray().Equal(lhs, rhs);
+  }
+};
+
 std::unique_ptr<ModuleEquality> ModuleEquality::Create(const std::string& mod_eq_name) {
   if (mod_eq_name == "structural") {
     return std::make_unique<ModuleEqualityStructural>();
   } else if (mod_eq_name == "ignore-ndarray") {
     return std::make_unique<ModuleEqualityIgnoreNDArray>();
+  } else if (mod_eq_name == "anchor-block") {
+    return std::make_unique<ModuleEqualityAnchorBlock>();
   }
   LOG(FATAL) << "Unknown module equality " << mod_eq_name;
   return nullptr;
diff --git a/src/meta_schedule/module_equality.h b/src/meta_schedule/module_equality.h
index 8c99b563551b..ba5877471e2c 100644
--- a/src/meta_schedule/module_equality.h
+++ b/src/meta_schedule/module_equality.h
@@ -42,6 +42,10 @@ class ModuleEquality {
    *    - "structural": Use StructuralEqual/Hash
    *    - "ignore-ndarray": Same as "structural", but ignore ndarray raw data during
    *                        equality testing and hashing.
+   *    - "anchor-block": Apply equality testing and hashing on the anchor block extracted from a
+   *                      given module. The "ignore-ndarray" varint is used for the extracted blocks
+   *                      or in case no anchor block is found.
+   *                      For the definition of the anchor block, see tvm/tir/analysis.h.
    * \return An owning pointer to the created instance
    */
   static std::unique_ptr<ModuleEquality> Create(const std::string& mod_eq_name);
diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc
index 7af1418d8f3e..4d16a6d4d65d 100644
--- a/src/meta_schedule/schedule_rule/auto_bind.cc
+++ b/src/meta_schedule/schedule_rule/auto_bind.cc
@@ -208,10 +208,11 @@ Array<tir::Schedule> AutoBindNode::Apply(const tir::Schedule& sch, const tir::Bl
   return {sch};
 }
 
-ScheduleRule ScheduleRule::AutoBind(int max_threadblocks, Array<Integer> thread_extents) {
+ScheduleRule ScheduleRule::AutoBind(int max_threadblocks, Array<Integer> thread_extents,
+                                    int max_threads_per_block) {
   ObjectPtr<AutoBindNode> n = make_object<AutoBindNode>();
   n->max_threadblocks_ = max_threadblocks;
-  n->max_threads_per_block_ = -1;
+  n->max_threads_per_block_ = max_threads_per_block;
   n->thread_extents_ = std::move(thread_extents);
   return ScheduleRule(n);
 }
diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc
index 8333833bfafa..bd492d03eac6 100644
--- a/src/meta_schedule/schedule_rule/schedule_rule.cc
+++ b/src/meta_schedule/schedule_rule/schedule_rule.cc
@@ -53,14 +53,7 @@ ScheduleRule ScheduleRule::PyScheduleRule(
 
 Array<ScheduleRule> ScheduleRule::DefaultLLVM() {
   return {
-      ScheduleRule::AutoInline(
-          /*into_producer=*/false,
-          /*into_consumer=*/true,
-          /*inline_const_tensor=*/true,
-          /*disallow_if_then_else=*/true,
-          /*require_injective=*/true,
-          /*require_ordered=*/true,
-          /*disallow_op=*/Array<String>{"tir.exp"}),
+      GetDefaultAutoInline("llvm"),
       ScheduleRule::AddRFactor(
           /*max_jobs_per_core=*/16,
           /*max_innermost_factor=*/Integer(64)),
@@ -98,14 +91,7 @@ Array<ScheduleRule> ScheduleRule::DefaultCUDA() {
           Map<String, ObjectRef>{{"req", String("must")},
                                  {"levels", Array<Integer>{3}},  //
                                  {"scope", String("local")}}),
-      ScheduleRule::AutoInline(
-          /*into_producer=*/true,
-          /*into_consumer=*/true,
-          /*inline_const_tensor=*/true,
-          /*disallow_if_then_else=*/false,
-          /*require_injective=*/false,
-          /*require_ordered=*/false,
-          /*disallow_op=*/Array<String>{}),
+      GetDefaultAutoInline("cuda"),
       ScheduleRule::CrossThreadReduction(
           /*thread_extents=*/Array<Integer>{4, 8, 16, 32, 64, 128, 256, 512}),
       ScheduleRule::ParallelizeVectorizeUnroll(
diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc
index 53107bafb2c0..bcc0673e5924 100644
--- a/src/meta_schedule/space_generator/space_generator.cc
+++ b/src/meta_schedule/space_generator/space_generator.cc
@@ -45,12 +45,11 @@ String GetRuleKindFromTarget(const Target& target) {
     }
     return "cuda";
   }
-  if (target->kind->name == "rocm") {
-    return "cuda";
-  }
-  if (target->kind->name == "vulkan") {
+
+  if (IsGPUTarget(target->kind->name)) {
     return "cuda";
   }
+
   LOG(FATAL) << "Unsupported target: " << target;
   throw;
 }
diff --git a/src/meta_schedule/trace_apply.cc b/src/meta_schedule/trace_apply.cc
new file mode 100644
index 000000000000..70b6451d3546
--- /dev/null
+++ b/src/meta_schedule/trace_apply.cc
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "trace_apply.h"
+
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/stmt_functor.h>
+
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+using namespace tir;
+
+// Returns true if b1 is an ancestor of b2
+bool IsAncestor(BlockRV b1, BlockRV b2, Schedule sch) {
+  if (sch->Get(b1)->name_hint == sch->Get(b2)->name_hint) {
+    return true;
+  }
+  for (auto prod : sch->GetProducers(b2)) {
+    if (IsAncestor(b1, prod, sch)) return true;
+  }
+  return false;
+}
+
+// Inline or reverse inline spatial blocks after the anchor block
+void InlinePostBlocks(Schedule sch, Trace anchor_trace, Target target) {
+  static auto kind_get_block = InstructionKind::Get("GetBlock");
+  // We let blocks whose names are referenced in the anchor trace be scheduled by the anchor trace.
+  // We record such block names to avoid inlining them here.
+  std::unordered_set<std::string> get_block_names;
+  for (const auto& inst : anchor_trace->insts) {
+    if (inst->kind.same_as(kind_get_block)) {
+      auto block_name = Downcast<String>(inst->attrs[0]);
+      ICHECK(block_name.defined());
+      get_block_names.insert(block_name);
+    }
+  }
+
+  auto anchor_block = FindAnchorBlock(sch->mod());
+
+  auto inline_rule = GetDefaultAutoInline(target->kind->name);
+
+  for (auto name : GetBlockNames(sch->mod())) {
+    auto block = sch->GetBlock(name);
+    if (anchor_block) {
+      auto anchor_block_rv = sch->GetBlock(anchor_block->name_hint);
+      if (IsAncestor(block, anchor_block_rv, sch)) continue;
+    }
+    // Spatial blocks which are not referenced in the anchor trace will be inlined here.
+    if (IsSpatial(sch->GetSRef(block)) && !get_block_names.count(name)) {
+      inline_rule->Apply(sch, block);
+    }
+  }
+}
+
+// Apply instructions from the anchor trace to the target schedule, and returns blocks
+// that remain unscheduled.
+std::vector<BlockRV> ApplyAnchorTrace(Schedule sch, Trace anchor_trace) {
+  static auto kind_get_child_blocks = InstructionKind::Get("GetChildBlocks");
+  static auto kind_get_block = InstructionKind::Get("GetBlock");
+  static auto kind_compute_inline = InstructionKind::Get("ComputeInline");
+  static auto kind_reverse_compute_inline = InstructionKind::Get("ReverseComputeInline");
+
+  const auto block_names_orig = GetBlockNames(sch->mod());
+  const auto sch_orig = sch->Copy();
+
+  std::unordered_map<const Object*, const Object*> rv_map;
+  // Blocks and loops that appear in the anchor trace but are not part of the target schedule.
+  std::unordered_set<BlockRV, ObjectHash, ObjectEqual> foreign_blocks;
+  std::unordered_set<LoopRV, ObjectHash, ObjectEqual> foreign_loops;
+
+  // Instructions in the anchor trace can be applied only if all inputs are part of the target
+  // schedule.
+  auto is_inst_applicable = [&foreign_blocks, &foreign_loops](Instruction inst) {
+    for (auto input : inst->inputs) {
+      if (!input.defined()) continue;
+      if ((input->IsInstance<BlockRVNode>() && foreign_blocks.count(Downcast<BlockRV>(input))) ||
+          (input->IsInstance<LoopRVNode>() && foreign_loops.count(Downcast<LoopRV>(input)))) {
+        return false;
+      }
+    }
+    return true;
+  };
+
+  for (const auto& inst : anchor_trace->insts) {
+    if (!is_inst_applicable(inst)) {
+      // If we find an instruction that is not applicable, its outputs are recorded as "foreign"
+      // to the target schedule.
+      for (auto output : inst->outputs) {
+        if (output->IsInstance<BlockRVNode>()) {
+          foreign_blocks.insert(Downcast<BlockRV>(output));
+        } else if (output->IsInstance<LoopRVNode>()) {
+          foreign_loops.insert(Downcast<LoopRV>(output));
+        }
+      }
+      continue;
+    }
+
+    Array<ObjectRef> inputs = TranslateInputRVs(inst->inputs, rv_map);
+
+    if (inst->kind.same_as(kind_get_block) && !HasBlock(sch, Downcast<String>(inst->attrs[0]))) {
+      // The anchor trace does get_block on a block that is not part of the target schedule.
+      auto block = Downcast<BlockRV>(inst->outputs[0]);
+      foreign_blocks.insert(block);
+      continue;
+    } else if (inst->kind.same_as(kind_reverse_compute_inline)) {
+      // The anchor trace does reverse_compute_inline on a block, but the block with the same name
+      // in the target schedule cannot be reverse compute inline-ed.
+      // In such cases, it should be possible to apply compute_inline instead.
+      auto block = Downcast<BlockRV>(inputs[0]);
+      auto block_sref = sch->GetSRef(block);
+      if (!CanReverseComputeInline(sch->state(), block_sref)) {
+        ICHECK(CanComputeInline(sch->state(), block_sref));
+        sch->ComputeInline(block);
+        continue;
+      }
+    } else if (inst->kind.same_as(kind_compute_inline)) {
+      // Similar to the reverse_compute_inline case above.
+      auto block = Downcast<BlockRV>(inputs[0]);
+      auto block_sref = sch->GetSRef(block);
+      if (!CanComputeInline(sch->state(), block_sref)) {
+        ICHECK(CanReverseComputeInline(sch->state(), block_sref));
+        sch->ReverseComputeInline(block);
+        continue;
+      }
+    }
+
+    Optional<ObjectRef> decision = anchor_trace->GetDecision(inst);
+    Array<ObjectRef> outputs = inst->kind->f_apply_to_schedule(sch, inputs, inst->attrs, decision);
+
+    if (inst->kind.same_as(kind_get_child_blocks)) {
+      // We want to allow a trace generated for a single conv2d block to be applied to
+      // conv2d -> elemwise blocks, where two conv2d are the same workload.
+      // GetChildBlocks returns a different number of blocks for the two cases above, which
+      // violates the assumption made by TranslateAddOutputRVs: old_outputs.size() ==
+      // new_outputs.size(). We workaround this problem by assuming that the prefix of the "new"
+      // outputs matches with the "old" outputs, and truncating the new outputs accordingly.
+      ICHECK(inst->outputs.size() <= outputs.size());
+      TranslateAddOutputRVs(
+          inst->outputs, Array<ObjectRef>(outputs.begin(), outputs.begin() + inst->outputs.size()),
+          &rv_map);
+    } else {
+      TranslateAddOutputRVs(inst->outputs, outputs, &rv_map);
+    }
+  }
+
+  auto is_scheduled = [=](const std::string& block_name) {
+    auto loops = sch->GetLoops(sch->GetBlock(block_name));
+    auto loops_orig = sch_orig->GetLoops(sch_orig->GetBlock(block_name));
+    if (loops.size() != loops_orig.size()) {
+      return true;
+    }
+    for (size_t i = 0; i < loops.size(); ++i) {
+      auto loop = sch->Get(loops[i]);
+      auto loop_orig = sch_orig->Get(loops_orig[i]);
+      if (loop->kind != loop_orig->kind) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  const auto block_names_now = GetBlockNames(sch->mod());
+  std::vector<BlockRV> unscheduled_blocks;
+
+  for (auto name : block_names_orig) {
+    if (block_names_now.count(name) && name != "root" && !is_scheduled(name)) {
+      unscheduled_blocks.push_back(sch->GetBlock(name));
+    }
+  }
+
+  return unscheduled_blocks;
+}
+
+void ScheduleUsingAnchorTrace(Schedule sch, const Trace& anchor_trace, const tvm::Target& target) {
+  InlinePostBlocks(sch, anchor_trace, target);
+
+  auto unscheduled_blocks = ApplyAnchorTrace(sch, anchor_trace);
+  ICHECK(unscheduled_blocks.size() <= 1)
+      << "All blocks should have been scheduled or only one (fused) spatial block can remain "
+         "unscheduled at this point.";
+
+  if (unscheduled_blocks.empty()) {
+    // All blocks have already been scheduled.
+    return;
+  }
+
+  auto last_block = unscheduled_blocks[0];
+  auto last_block_producers = sch->GetProducers(last_block);
+
+  if (last_block_producers.size() == 1 && IsSpatial(sch->GetSRef(last_block_producers[0]))) {
+    // Inline into the cache write stage
+    sch->ReverseComputeInline(last_block);
+  } else if (target->kind->name == "llvm" || target->kind->name == "hexagon") {
+    sch->Parallel(sch->Fuse(sch->GetLoops(last_block)));
+  } else if (IsGPUTarget(target->kind->name)) {
+    auto max_threads_per_block = target->GetAttr<Integer>("max_threads_per_block");
+    ICHECK(max_threads_per_block.defined())
+        << "ValueError: missing attribute `max_threads_per_block` in the target";
+
+    auto auto_bind_rule =
+        ScheduleRule::AutoBind(/*max_threadblocks=*/256,
+                               /*thread_extents*/ Array<Integer>{32, 64, 128, 256, 512, 1024},
+                               max_threads_per_block.value()->value);
+    auto_bind_rule->Apply(sch, last_block);
+  }
+}
+
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleUsingAnchorTrace")
+    .set_body_typed(ScheduleUsingAnchorTrace);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/trace_apply.h b/src/meta_schedule/trace_apply.h
new file mode 100644
index 000000000000..9a9068ab914f
--- /dev/null
+++ b/src/meta_schedule/trace_apply.h
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_META_SCHEDULE_TRACE_APPLY_H_
+#define TVM_META_SCHEDULE_TRACE_APPLY_H_
+
+#include <tvm/meta_schedule/schedule_rule.h>
+#include <tvm/target/target.h>
+#include <tvm/tir/schedule/schedule.h>
+#include <tvm/tir/schedule/trace.h>
+
+#include <string>
+
+namespace tvm {
+namespace meta_schedule {
+
+/*!
+ * \brief Apply the trace from a TIR module whose anchor block is the same but fused elemewise
+ * op blocks differ. This function can be used for transferring a trace tuned on a conv2d -> add
+ * subgraph to other subgraphs having the same conv2d workload, for example. We call such trace
+ * an "anchor trace". Those blocks that are not scheduled by the given anchor trace will be either
+ * inlined or parallelized.
+ * \param sch The schedule to apply the anchor trace.
+ * \param anchor_trace The trace tuned on other subgraph with the same anchor-block workload.
+ * \param target The target information needed for inlining and parallelization.
+ */
+void ScheduleUsingAnchorTrace(tir::Schedule sch, const tir::Trace& anchor_trace,
+                              const tvm::Target& target);
+
+}  // namespace meta_schedule
+}  // namespace tvm
+
+#endif  // TVM_META_SCHEDULE_TRACE_APPLY_H_
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index 824cfcd6aa5c..7240fa418839 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -44,6 +44,7 @@
 
 #include <algorithm>
 #include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -502,6 +503,41 @@ inline void CloneRules(const SpaceGeneratorNode* src, SpaceGeneratorNode* dst) {
   }
 }
 
+/*! \brief Returns true if the given target is one of the supported gpu targets. */
+inline bool IsGPUTarget(const std::string& target_name) {
+  static const std::unordered_set<std::string> gpu_targets{"cuda", "rocm", "vulkan", "metal"};
+  return gpu_targets.count(target_name);
+}
+
+/*!
+ * \brief Create an AutoInline schedule rule for the given target.
+ * \param target_name The name of the target ("llvm", "cuda", etc.)
+ * \return The AutoInline schedule rule for the given target.
+ */
+inline ScheduleRule GetDefaultAutoInline(const std::string& target_name) {
+  if (target_name == "llvm" || target_name == "hexagon") {
+    return ScheduleRule::AutoInline(
+        /*into_producer=*/false,
+        /*into_consumer=*/true,
+        /*inline_const_tensor=*/true,
+        /*disallow_if_then_else=*/true,
+        /*require_injective=*/true,
+        /*require_ordered=*/true,
+        /*disallow_op=*/Array<String>{"tir.exp"});
+  } else if (IsGPUTarget(target_name)) {
+    return ScheduleRule::AutoInline(
+        /*into_producer=*/true,
+        /*into_consumer=*/true,
+        /*inline_const_tensor=*/true,
+        /*disallow_if_then_else=*/false,
+        /*require_injective=*/false,
+        /*require_ordered=*/false,
+        /*disallow_op=*/Array<String>{});
+  }
+  LOG(FATAL) << "Unsupported target " << target_name;
+  return ScheduleRule(nullptr);
+}
+
 }  // namespace meta_schedule
 }  // namespace tvm
 
diff --git a/src/relay/backend/task_extraction.cc b/src/relay/backend/task_extraction.cc
index 430b551a3b9e..7e66dafe16f5 100644
--- a/src/relay/backend/task_extraction.cc
+++ b/src/relay/backend/task_extraction.cc
@@ -22,6 +22,8 @@
 #include <tvm/relay/function.h>
 #include <tvm/target/target.h>
 
+#include <numeric>
+
 #include "../../meta_schedule/module_equality.h"
 #include "../../te/operation/create_primfunc.h"
 #include "./te_compiler_cache.h"
@@ -31,6 +33,25 @@ namespace tvm {
 namespace relay {
 namespace backend {
 
+class OpCounter : public ExprVisitor {
+ public:
+  static size_t GetOpCount(relay::Function func) {
+    OpCounter counter;
+    counter(func->body);
+    return counter.count;
+  }
+
+ private:
+  void VisitExpr_(const CallNode* call) final {
+    if (call->op->IsInstance<OpNode>()) {
+      ++count;
+    }
+    ExprVisitor::VisitExpr_(call);
+  }
+
+  size_t count{0};
+};
+
 Array<meta_schedule::ExtractedTask> ExtractTask(IRModule mod, Target target,
                                                 Map<String, runtime::NDArray> params,
                                                 String mod_eq_name) {
@@ -52,33 +73,59 @@ Array<meta_schedule::ExtractedTask> ExtractTask(IRModule mod, Target target,
   std::unordered_map<IRModule, ExtractedTask, ModuleHash, ModuleEqual> cache(
       /*bucket_count*/ 0, ModuleHash(*mod_eq), ModuleEqual(*mod_eq));
 
-  PostOrderVisit(mod->Lookup("main"), [&target, &tasks, &cache, &tir_converter](const Expr& exp) {
+  std::vector<std::tuple<std::string, Function, IRModule>> lower_results;
+
+  PostOrderVisit(mod->Lookup("main"), [&lower_results, &target, &tir_converter](const Expr& exp) {
     if (exp->IsInstance<FunctionNode>()) {
       Function relay_func = Downcast<Function>(exp);
       if (!relay_func->HasNonzeroAttr(attr::kPrimitive)) {
         return;
       }
-
       auto [inputs_outputs, constants, fused_name] =
           tec::LowerTECompute(relay_func, target, /*return_inputs=*/true);
 
       if (Optional<tir::PrimFunc> f = tir_converter(inputs_outputs, constants)) {
         IRModule tir_mod = PrimFuncToIRModule(f.value());
-
-        auto it = cache.find(tir_mod);
-        if (it != cache.end()) {
-          it->second->weight += 1;
-          return;
-        }
-
-        // Note that the cache is key-ed on the tir mod, rather than the relay mod
-        IRModule relay_mod({{GlobalVar(fused_name), relay_func}});
-        ExtractedTask task(fused_name, relay_mod, target, {tir_mod}, 1);
-        tasks.push_back(task);
-        cache.emplace(tir_mod, task);
+        lower_results.push_back(std::make_tuple(fused_name, relay_func, tir_mod));
       }
     }
   });
+
+  std::vector<int> indices(lower_results.size());
+  std::iota(indices.begin(), indices.end(), 0);
+
+  if (mod_eq_name == "anchor-block") {
+    std::vector<size_t> op_counts(lower_results.size());
+    for (size_t i = 0; i < op_counts.size(); ++i) {
+      op_counts[i] = OpCounter::GetOpCount(std::get<1>(lower_results[i]));
+    }
+
+    // When anchor-block based equality is used, tuning tasks "nn_conv2d_add_nn_relu" and
+    // "nn_conv2d_add_add_nn_relu", for example, can be identified as equal. Thus, one of
+    // them will be filtered by the cache below.
+    //
+    // To make sure that we tune "nn_conv2d_add_nn_relu" and not "nn_conv2d_add_add_nn_relu",
+    // we sort the TE lowering results based on the number of relay ops. This way,
+    // "nn_conv2d_add_nn_relu" will be added to the cache first, and "nn_conv2d_add_add_nn_relu"
+    // will be filtered.
+    std::sort(indices.begin(), indices.end(),
+              [&op_counts](int i1, int i2) { return op_counts[i1] < op_counts[i2]; });
+  }
+
+  for (auto i : indices) {
+    const auto& [fused_name, relay_func, tir_mod] = lower_results[i];
+    auto it = cache.find(tir_mod);
+    if (it != cache.end()) {
+      it->second->weight += 1;
+      continue;
+    }
+    // Note that the cache is key-ed on the tir mod, rather than the relay mod
+    IRModule relay_mod({{GlobalVar(fused_name), relay_func}});
+    ExtractedTask task(fused_name, relay_mod, target, {tir_mod}, 1);
+    tasks.push_back(task);
+    cache.emplace(tir_mod, task);
+  }
+
   // Tasks are extracted via post order visit, return the reversed list.
   std::reverse(tasks.begin(), tasks.end());
   NameSupply name_supply = NameSupply("");
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index e7326ed5dd4d..c97efb565d9d 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -44,6 +44,7 @@
 
 #include <functional>
 #include <limits>
+#include <memory>
 #include <mutex>
 #include <unordered_map>
 #include <utility>
@@ -52,6 +53,8 @@
 #include "../../printer/text_printer.h"
 #include "../../te/operation/create_primfunc.h"
 #include "../op/memory/memory.h"
+#include "../src/meta_schedule/module_equality.h"
+#include "../src/meta_schedule/trace_apply.h"
 #include "../transforms/meta_schedule_layout_rewrite.h"
 #include "utils.h"
 
@@ -461,7 +464,9 @@ class AllocateConstReplaceConstant : public StmtExprMutator {
 // Construct a schedule for a given Relay primitive function and target.
 class ScheduleBuilder : public ExprVisitor {
  public:
-  explicit ScheduleBuilder(Target target) : target_(target) {
+  explicit ScheduleBuilder(Target target)
+      : target_(target),
+        mod_eq_structural_(meta_schedule::ModuleEquality::Create("ignore-ndarray")) {
     // Whether to use auto_scheduler schedule.
     use_auto_scheduler_ = backend::IsAutoSchedulerEnabled();
     if (backend::IsMetaScheduleEnabled()) {
@@ -614,9 +619,19 @@ class ScheduleBuilder : public ExprVisitor {
                 MetaScheduleLayoutRewriter::LayoutQueuePush(index_map);
               }
             }
+
             Schedule sch = Schedule::Traced(query_mod, /*seed=*/-1, /*debug_mask=*/0,
                                             tir::ScheduleErrorRenderLevel::kDetail);
-            record->trace->ApplyToSchedule(sch, /*remove_postproc=*/false);
+
+            if (!mod_eq_structural_->Equal(query_mod, opt_record.value()->workload->mod)) {
+              // When the database lookup succeeds while structural equality check fails,
+              // it implies that the anchor block based equality has been used during tuning.
+              // The trace in the record cannot directly be applied to this query module.
+              meta_schedule::ScheduleUsingAnchorTrace(sch, record->trace, target_);
+            } else {
+              record->trace->ApplyToSchedule(sch, /*remove_postproc=*/false);
+            }
+
             IRModule mod = sch->mod();
             ICHECK_EQ(mod->functions.size(), 1);
             mod = tir::transform::RemoveWeightLayoutRewriteBlock(/*skip_ndarray_rewrite*/ false)(
@@ -698,6 +713,7 @@ class ScheduleBuilder : public ExprVisitor {
   int anchor_op_pattern_{0};
   bool use_auto_scheduler_;
   Optional<meta_schedule::Database> database_;
+  std::unique_ptr<meta_schedule::ModuleEquality> mod_eq_structural_;
 };
 
 /*!
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index a95f55357f2d..ef350004ad52 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -354,8 +354,11 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL)
 // `max_function_args` was introduced. It specifies the maximum number of kernel argumetns. More
 // information about this limitation can be found here:
 // https://developer.apple.com/documentation/metal/buffers/about_argument_buffers?language=objc
+// See also https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
 TVM_REGISTER_TARGET_KIND("metal", kDLMetal)
     .add_attr_option<Integer>("max_num_threads", Integer(256))
+    .add_attr_option<Integer>("max_threads_per_block", Integer(256))
+    .add_attr_option<Integer>("max_shared_memory_per_block", Integer(32768))
     .add_attr_option<Integer>("thread_warp_size", Integer(16))
     .add_attr_option<Integer>("max_function_args", Integer(31))
     .set_default_keys({"metal", "gpu"});
diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index a2c0bc759401..56e42d4052fb 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -2060,5 +2060,7 @@ TVM_REGISTER_GLOBAL("tir.schedule.GetAutoTensorizeMappingInfo")
       return GetAutoTensorizeMappingInfo(sch->state(), sch->GetSRef(block), desc_func);
     });
 
+TVM_REGISTER_GLOBAL("tir.schedule.HasBlock").set_body_typed(HasBlock);
+
 }  // namespace tir
 }  // namespace tvm
diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index adadb46852cc..2c86c2df2d25 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -297,7 +297,12 @@ bool CalculateAffineFlag(const ScheduleState& self, const StmtSRef& block_sref)
  * \param stage The stage to be inserted
  * \return A SeqStmt, the result after insertion
  */
-SeqStmt InsertCacheStage(const Stmt& stmt, int pos, const Stmt& stage) {
+Stmt InsertCacheStage(const Stmt& stmt, int pos, const Stmt& stage) {
+  if (const auto* alloc = stmt.as<AllocateConstNode>()) {
+    auto seq_stmt = InsertCacheStage(alloc->body, pos, stage);
+    return AllocateConst(alloc->buffer_var, alloc->dtype, alloc->extents, alloc->data, seq_stmt,
+                         alloc->annotations, alloc->span);
+  }
   if (const auto* seq_stmt = stmt.as<SeqStmtNode>()) {
     ObjectPtr<SeqStmtNode> result = make_object<SeqStmtNode>(*seq_stmt);
     result->seq.insert(result->seq.begin() + pos, stage);
diff --git a/src/tir/schedule/utils.h b/src/tir/schedule/utils.h
index c289309acc2d..bcc8b7facbc9 100644
--- a/src/tir/schedule/utils.h
+++ b/src/tir/schedule/utils.h
@@ -31,7 +31,9 @@
 #include <tvm/tir/schedule/trace.h>
 #include <tvm/tir/stmt_functor.h>
 
+#include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 
 #include "../../arith/pattern_match.h"
@@ -442,6 +444,50 @@ inline String BufferIndexType2Str(BufferIndexType buffer_index_type) {
   }
 }
 
+/******** Utilities for retrieving information about blocks ********/
+
+/*! \brief Returns the names of the blocks in the provided module. */
+inline std::unordered_set<std::string> GetBlockNames(const IRModule& mod) {
+  struct BlockNameCollector : public tir::StmtVisitor {
+    void VisitStmt_(const tir::BlockNode* block) override {
+      block_names.insert(block->name_hint);
+      StmtVisitor::VisitStmt(block->body);
+    }
+    std::unordered_set<std::string> block_names;
+  };
+
+  auto prim_func = tir::FindEntryFunc(mod, nullptr);
+  BlockNameCollector collector;
+  collector(prim_func->body);
+  return collector.block_names;
+}
+
+/*! \brief Query if the given block name exists in the module associated with the schedule */
+inline bool HasBlock(const Schedule& sch, const std::string& block_name) {
+  auto block_names = GetBlockNames(sch->mod());
+  return block_names.count(block_name);
+}
+
+/******** Utilites for trace application ********/
+
+/*!
+ * \brief Translate the input objects using the provided substitution map.
+ * \param inputs The input objects.
+ * \param rv_map The substitution map for variables.
+ * \return The transformed objects.
+ */
+Array<ObjectRef> TranslateInputRVs(const Array<ObjectRef>& inputs,
+                                   const std::unordered_map<const Object*, const Object*>& rv_map);
+
+/*!
+ * \brief Update the variable substitution map according to the new outputs.
+ * \param old_outputs The previous outputs of a schedule instruction.
+ * \param new_outputs The new outputs of the same schedule instruction.
+ * \param rv_map The substitution map for variables.
+ */
+void TranslateAddOutputRVs(const Array<ObjectRef>& old_outputs, const Array<ObjectRef>& new_outputs,
+                           std::unordered_map<const Object*, const Object*>* rv_map);
+
 }  // namespace tir
 }  // namespace tvm
 
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index a541c25f3cbc..addbb052a2da 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -116,9 +116,11 @@ def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
                 postprocs=postprocs,
                 mutator_probs={},
             ),
-            # Without this, the same workloads with different constant weights
-            # are treated as distinct tuning tasks.
-            module_equality="ignore-ndarray",
+            # This enables anchor-block tuning, where different subgraphs
+            # with the same anchor block workload will be identified as equal.
+            # It reduces the number of conv2d tuning tasks in the int8 resnet50 model
+            # from 36 to 23, with negligible performance difference.
+            module_equality="anchor-block",
         )
 
         return ms.relay_integration.compile_relay(
diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py
index 9a1c9e8dc7f5..c689a15c56b2 100644
--- a/tests/python/unittest/test_meta_schedule_relay_integration.py
+++ b/tests/python/unittest/test_meta_schedule_relay_integration.py
@@ -107,6 +107,41 @@ def test_meta_schedule_integration_extract_from_resnet():
         assert t.task_name in expected_task_names, t.task_name
 
 
+@requires_torch
+def test_task_extraction_anchor_block():
+    mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
+    extracted_tasks = ms.relay_integration.extract_tasks(
+        mod, target="llvm", params=params, module_equality="anchor-block"
+    )
+
+    # Note that there is no task from residual blocks
+    expected_task_names = [
+        "fused_" + s
+        for s in [
+            "nn_max_pool2d",
+            "nn_adaptive_avg_pool2d",
+            "nn_dense_add",
+            "nn_conv2d_add",
+            "nn_conv2d_add_1",
+            "nn_conv2d_add_2",
+            "nn_conv2d_add_nn_relu",
+            "nn_conv2d_add_nn_relu_1",
+            "nn_conv2d_add_nn_relu_2",
+            "nn_conv2d_add_nn_relu_3",
+            "nn_conv2d_add_nn_relu_4",
+            "nn_conv2d_add_nn_relu_5",
+            "nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu",
+            "nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1",
+            "layout_transform",
+            "layout_transform_reshape_squeeze",
+        ]
+    ]
+
+    assert len(extracted_tasks) == len(expected_task_names)
+    for t in extracted_tasks:
+        assert t.task_name in expected_task_names, t.task_name
+
+
 @requires_torch
 def test_meta_schedule_integration_extract_from_bert_base():
     pytest.importorskip(
@@ -673,5 +708,115 @@ def test_module_equality_ignore_ndarray():
     np.testing.assert_allclose(ref, out, rtol=1e-4, atol=1e-4)
 
 
+def _test_anchor_tuning(target):
+    data_shape = (128, 128)
+    weight_shape1 = (128, 128)
+    weight_shape2 = (128, 128)
+
+    data = relay.var("data", shape=data_shape, dtype="float32")
+    weight1 = relay.var("weight1", shape=weight_shape1, dtype="float32")
+    weight2 = relay.var("weight2", shape=weight_shape2, dtype="float32")
+    dense1 = relay.nn.dense(data, weight1)
+    dense2 = relay.nn.dense(dense1 + relay.const(1.0, dtype="float32"), weight2)
+    mod = tvm.IRModule.from_expr(dense2 - data + relay.const(1.0, dtype="float32"))
+
+    weight1_np = np.random.randn(*weight_shape1).astype("float32")
+    weight2_np = np.random.randn(*weight_shape2).astype("float32")
+
+    data_np = np.random.randn(*data_shape).astype("float32")
+    params = {"weight1": weight1_np, "weight2": weight2_np}
+
+    module_equality = "anchor-block"
+
+    extracted_tasks = ms.relay_integration.extract_tasks(
+        mod, target, params, module_equality=module_equality
+    )
+
+    assert len(extracted_tasks) == 1
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            target=target,
+            params=params,
+            work_dir=work_dir,
+            max_trials_global=4,
+            strategy="replay-trace",
+            module_equality=module_equality,
+        )
+        lib = ms.relay_integration.compile_relay(database, mod, target, params)
+
+    dev = tvm.device(target, 0)
+    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+
+    runtime.set_input("data", data_np)
+    runtime.run()
+    out = runtime.get_output(0).numpy()
+
+    ref = (
+        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
+        .evaluate()(*[data_np, weight1_np, weight2_np])
+        .numpy()
+    )
+
+    np.testing.assert_allclose(ref, out, atol=1e-3)
+
+
+def test_anchor_tuning_cpu():
+    _test_anchor_tuning("llvm --num-cores=4")
+
+
+def test_anchor_tuning_cpu_link_params():
+    data_shape = (128, 128)
+    weight_shape1 = (128, 128)
+    weight_shape2 = (128, 128)
+
+    data = relay.var("data", shape=data_shape, dtype="float32")
+    weight1 = relay.var("weight1", shape=weight_shape1, dtype="float32")
+    weight2 = relay.var("weight2", shape=weight_shape2, dtype="float32")
+    dense1 = relay.nn.dense(data, weight1)
+    dense2 = relay.nn.dense(dense1, weight2)
+    mod = tvm.IRModule.from_expr(dense2 + relay.const(1.0, dtype="float32"))
+
+    weight1_np = np.random.randn(*weight_shape1).astype("float32")
+    weight2_np = np.random.randn(*weight_shape2).astype("float32")
+
+    data_np = np.random.randn(*data_shape).astype("float32")
+    params = {"weight1": weight1_np, "weight2": weight2_np}
+
+    module_equality = "anchor-block"
+    target = "llvm --num-cores=4"
+
+    executor = relay.backend.Executor("graph", {"link-params": True})
+    mod = mod.with_attr("executor", executor)
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            target=target,
+            params=params,
+            work_dir=work_dir,
+            max_trials_global=4,
+            strategy="replay-trace",
+            module_equality=module_equality,
+        )
+        lib = ms.relay_integration.compile_relay(database, mod, target, params)
+
+    dev = tvm.device(target, 0)
+    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+
+    runtime.set_input("data", data_np)
+    runtime.run()
+    out = runtime.get_output(0).numpy()
+
+    ref = (
+        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
+        .evaluate()(*[data_np, weight1_np, weight2_np])
+        .numpy()
+    )
+
+    np.testing.assert_allclose(ref, out, atol=1e-3)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py
new file mode 100644
index 000000000000..6ff21c72c9ea
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_trace_apply.py
@@ -0,0 +1,2745 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+import tvm
+import tvm.testing
+import tvm.meta_schedule as ms
+from tvm.script import tir as T
+from tvm.tir import Schedule, floormod, floordiv
+from tvm.tir.tensor_intrin.cuda import *
+from tvm.target import Target
+from tvm.target.codegen import llvm_lookup_intrinsic_id
+
+
+# fmt: off
+@tvm.script.ir_module
+class Dense:
+    @T.prim_func
+    def main(
+        p0: T.Buffer[(128, 128), "float32"],
+        p1: T.Buffer[(128, 128), "float32"],
+        T_matmul_NT: T.Buffer[(128, 128), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"layout_free_buffers": [1], "tir.noalias": True, "global_symbol": "main"})
+        # body
+        # with T.block("root")
+        for i0, i1, i2 in T.grid(128, 128, 128):
+            with T.block("T_matmul_NT"):
+                i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+                T.reads(p0[i, k], p1[j, k])
+                T.writes(T_matmul_NT[i, j])
+                T.block_attr({"layout_free_placeholders": []})
+                with T.init():
+                    T_matmul_NT[i, j] = T.float32(0)
+                T_matmul_NT[i, j] = T_matmul_NT[i, j] + p0[i, k] * p1[j, k]
+
+
+@tvm.script.ir_module
+class DenseAdd:
+    @T.prim_func
+    def main(
+        p0: T.Buffer[(128, 128), "float32"],
+        p1: T.Buffer[(128, 128), "float32"],
+        T_add: T.Buffer[(128, 128), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
+        # body
+        # with T.block("root")
+        T_matmul_NT = T.alloc_buffer([128, 128], dtype="float32")
+        compile_engine_const = T.alloc_buffer([], dtype="float32")
+        for i0, i1, i2 in T.grid(128, 128, 128):
+            with T.block("T_matmul_NT"):
+                i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+                T.reads(p0[i, k], p1[j, k])
+                T.writes(T_matmul_NT[i, j])
+                T.block_attr({"layout_free_placeholders": []})
+                with T.init():
+                    T_matmul_NT[i, j] = T.float32(0)
+                T_matmul_NT[i, j] = T_matmul_NT[i, j] + p0[i, k] * p1[j, k]
+        with T.block("compile_engine_const"):
+            vi = T.axis.spatial(1, 0)
+            T.reads()
+            T.writes(compile_engine_const[()])
+            compile_engine_const[()] = T.float32(1)
+        for i0, i1 in T.grid(128, 128):
+            with T.block("T_add"):
+                ax0, ax1 = T.axis.remap("SS", [i0, i1])
+                T.reads(T_matmul_NT[ax0, ax1], compile_engine_const[()])
+                T.writes(T_add[ax0, ax1])
+                T_add[ax0, ax1] = T_matmul_NT[ax0, ax1] + compile_engine_const[()]
+
+
+@tvm.script.ir_module
+class DenseAdd_scheduled_cpu:
+    @T.prim_func
+    def main(
+        p0: T.Buffer[(128, 128), "float32"],
+        p1: T.Buffer[(128, 128), "float32"],
+        T_add: T.Buffer[(128, 128), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
+        # body
+        # with T.block("root")
+        T_matmul_NT_global = T.alloc_buffer([128, 128], dtype="float32")
+        p1_global = T.alloc_buffer([2, 128, 64], dtype="float32")
+        for ax0, ax1 in T.grid(128, 128):
+            with T.block("p1_global"):
+                v0, v1 = T.axis.remap("SS", [ax0, ax1])
+                T.reads(p1[v0, v1])
+                T.writes(p1_global[v0 // 64, v1, v0 % 64])
+                T.block_attr({"meta_schedule.layout_rewrite_preproc": 1})
+                p1_global[v0 // 64, v1, v0 % 64] = p1[v0, v1]
+        for i0_0_i1_0_fused_fused in T.parallel(4):
+            for i0_1, i1_1 in T.grid(8, 1):
+                for i0_2_init, i1_2_init, i0_3_init in T.grid(4, 1, 2):
+                    for i1_3_fused_init in T.vectorized(64):
+                        with T.block("T_matmul_NT_init"):
+                            i = T.axis.spatial(
+                                128,
+                                i0_0_i1_0_fused_fused // 2 * 64
+                                + i0_1 * 8
+                                + i0_2_init * 2
+                                + i0_3_init,
+                            )
+                            j = T.axis.spatial(
+                                128,
+                                i0_0_i1_0_fused_fused % 2 * 64
+                                + i1_1 * 64
+                                + i1_2_init * 64
+                                + i1_3_fused_init,
+                            )
+                            T.reads()
+                            T.writes(T_matmul_NT_global[i, j])
+                            T.block_attr(
+                                {
+                                    "layout_free_placeholders": [],
+                                    "meta_schedule.tiling_structure": "SSRSRS",
+                                }
+                            )
+                            T_matmul_NT_global[i, j] = T.float32(0)
+                for i2_0, i0_2, i1_2, i2_1, i0_3 in T.grid(128, 4, 1, 1, 2):
+                    for i1_3_fused in T.vectorized(64):
+                        with T.block("T_matmul_NT_update"):
+                            i = T.axis.spatial(
+                                128, i0_0_i1_0_fused_fused // 2 * 64 + i0_1 * 8 + i0_2 * 2 + i0_3
+                            )
+                            j = T.axis.spatial(
+                                128,
+                                i0_0_i1_0_fused_fused % 2 * 64 + i1_1 * 64 + i1_2 * 64 + i1_3_fused,
+                            )
+                            k = T.axis.reduce(128, i2_0 + i2_1)
+                            T.reads(
+                                T_matmul_NT_global[i, j], p0[i, k], p1_global[j // 64, k, j % 64]
+                            )
+                            T.writes(T_matmul_NT_global[i, j])
+                            T.block_attr(
+                                {
+                                    "layout_free_placeholders": [],
+                                    "meta_schedule.tiling_structure": "SSRSRS",
+                                }
+                            )
+                            T_matmul_NT_global[i, j] = (
+                                T_matmul_NT_global[i, j] + p0[i, k] * p1_global[j // 64, k, j % 64]
+                            )
+            for ax0 in T.serial(64):
+                for ax1_fused in T.vectorized(64):
+                    with T.block("T_matmul_NT_global"):
+                        v0 = T.axis.spatial(128, i0_0_i1_0_fused_fused // 2 * 64 + ax0)
+                        v1 = T.axis.spatial(128, i0_0_i1_0_fused_fused % 2 * 64 + ax1_fused)
+                        T.reads(T_matmul_NT_global[v0, v1])
+                        T.writes(T_add[v0, v1])
+                        T_add[v0, v1] = T_matmul_NT_global[v0, v1] + T.float32(1)
+
+
+@tvm.script.ir_module
+class DenseAdd_cpu_no_write_cache:
+    @T.prim_func
+    def main(p0: T.Buffer[(128, 128), "float32"], p1: T.Buffer[(128, 128), "float32"], T_add: T.Buffer[(128, 128), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
+        # body
+        # with T.block("root")
+        T_matmul_NT = T.alloc_buffer([128, 128], dtype="float32")
+        p1_global = T.alloc_buffer([8, 4, 16, 32], dtype="float32")
+        for ax0, ax1 in T.grid(128, 128):
+            with T.block("p1_global"):
+                v0, v1 = T.axis.remap("SS", [ax0, ax1])
+                T.reads(p1[v0, v1])
+                T.writes(p1_global[v1 // 16, v0 // 32, v1 % 16, v0 % 32])
+                T.block_attr({"meta_schedule.layout_rewrite_preproc":1})
+                p1_global[v1 // 16, v0 // 32, v1 % 16, v0 % 32] = p1[v0, v1]
+        for i0_0_i1_0_i0_1_i1_1_fused in T.parallel(16, annotations={"pragma_auto_unroll_max_step":16, "pragma_unroll_explicit":1}):
+            for i0_2_init, i1_2_init, i0_3_init in T.grid(4, 4, 2):
+                for i1_3_fused_init in T.vectorized(32):
+                    with T.block("T_matmul_NT_init"):
+                        i = T.axis.spatial(128, i0_0_i1_0_i0_1_i1_1_fused // 4 * 32 + i0_0_i1_0_i0_1_i1_1_fused % 4 * 8 + i0_2_init * 2 + i0_3_init)
+                        j = T.axis.spatial(128, i1_2_init * 32 + i1_3_fused_init)
+                        T.reads()
+                        T.writes(T_matmul_NT[i, j])
+                        T.block_attr({"layout_free_placeholders":[], "meta_schedule.tiling_structure":"SSRSRS"})
+                        T_matmul_NT[i, j] = T.float32(0)
+            for i2_0, i0_2, i1_2, i2_1, i0_3 in T.grid(8, 4, 4, 16, 2):
+                for i1_3_fused in T.vectorized(32):
+                    with T.block("T_matmul_NT_update"):
+                        i = T.axis.spatial(128, i0_0_i1_0_i0_1_i1_1_fused // 4 * 32 + i0_0_i1_0_i0_1_i1_1_fused % 4 * 8 + i0_2 * 2 + i0_3)
+                        j = T.axis.spatial(128, i1_2 * 32 + i1_3_fused)
+                        k = T.axis.reduce(128, i2_0 * 16 + i2_1)
+                        T.reads(T_matmul_NT[i, j], p0[i, k], p1_global[k // 16, j // 32, k % 16, j % 32])
+                        T.writes(T_matmul_NT[i, j])
+                        T.block_attr({"layout_free_placeholders":[], "meta_schedule.tiling_structure":"SSRSRS"})
+                        T_matmul_NT[i, j] = T_matmul_NT[i, j] + p0[i, k] * p1_global[k // 16, j // 32, k % 16, j % 32]
+        for i0_i1_fused in T.parallel(16384):
+            with T.block("T_add"):
+                ax0 = T.axis.spatial(128, i0_i1_fused // 128)
+                ax1 = T.axis.spatial(128, i0_i1_fused % 128)
+                T.reads(T_matmul_NT[ax0, ax1])
+                T.writes(T_add[ax0, ax1])
+                T_add[ax0, ax1] = T_matmul_NT[ax0, ax1] + T.float32(1)
+
+
+@tvm.script.ir_module
+class DenseAdd_scheduled_gpu:
+    @T.prim_func
+    def main(
+        p0: T.Buffer[(128, 128), "float32"],
+        p1: T.Buffer[(128, 128), "float32"],
+        T_add: T.Buffer[(128, 128), "float32"],
+    ) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
+        # body
+        # with T.block("root")
+        T_matmul_NT_local = T.alloc_buffer([128, 128], dtype="float32", scope="local")
+        p0_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+        p1_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared")
+        for i0_0_i1_0_fused in T.thread_binding(
+            32,
+            thread="blockIdx.x",
+            annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1},
+        ):
+            for i0_1_i1_1_fused in T.thread_binding(1, thread="vthread.x"):
+                for i0_2_i1_2_fused in T.thread_binding(128, thread="threadIdx.x"):
+                    for i0_3_init, i1_3_init, i0_4_init, i1_4_init in T.grid(1, 4, 1, 1):
+                        with T.block("T_matmul_NT_init"):
+                            i = T.axis.spatial(
+                                128,
+                                i0_0_i1_0_fused // 4 * 16
+                                + i0_2_i1_2_fused // 8
+                                + i0_3_init
+                                + i0_4_init,
+                            )
+                            j = T.axis.spatial(
+                                128,
+                                i1_4_init
+                                + i0_0_i1_0_fused % 4 * 32
+                                + i0_2_i1_2_fused % 8 * 4
+                                + i1_3_init,
+                            )
+                            T.reads()
+                            T.writes(T_matmul_NT_local[i, j])
+                            T.block_attr(
+                                {
+                                    "layout_free_placeholders": [],
+                                    "meta_schedule.thread_extent_high_inclusive": 256,
+                                    "meta_schedule.thread_extent_low_inclusive": 16,
+                                    "meta_schedule.tiling_structure": "SSSRRSRS",
+                                }
+                            )
+                            T_matmul_NT_local[i, j] = T.float32(0)
+                    for i2_0 in T.serial(32):
+                        for ax0_ax1_fused_0 in T.serial(1):
+                            for ax0_ax1_fused_1 in T.thread_binding(128, thread="threadIdx.x"):
+                                for ax0_ax1_fused_2 in T.vectorized(2):
+                                    with T.block("p0_shared"):
+                                        T.where(
+                                            (ax0_ax1_fused_0 * 128 + ax0_ax1_fused_1) * 2
+                                            + ax0_ax1_fused_2
+                                            < 64
+                                        )
+                                        v0 = T.axis.spatial(
+                                            128,
+                                            i0_0_i1_0_fused // 4 * 16
+                                            + (
+                                                ax0_ax1_fused_0 * 256
+                                                + ax0_ax1_fused_1 * 2
+                                                + ax0_ax1_fused_2
+                                            )
+                                            // 4,
+                                        )
+                                        v1 = T.axis.spatial(
+                                            128,
+                                            i2_0 * 4
+                                            + (
+                                                ax0_ax1_fused_0 * 256
+                                                + ax0_ax1_fused_1 * 2
+                                                + ax0_ax1_fused_2
+                                            )
+                                            % 4,
+                                        )
+                                        T.reads(p0[v0, v1])
+                                        T.writes(p0_shared[v0, v1])
+                                        p0_shared[v0, v1] = p0[v0, v1]
+                        for ax0_ax1_fused_0 in T.serial(1):
+                            for ax0_ax1_fused_1 in T.thread_binding(128, thread="threadIdx.x"):
+                                for ax0_ax1_fused_2 in T.vectorized(4):
+                                    with T.block("p1_shared"):
+                                        T.where(
+                                            (ax0_ax1_fused_0 * 128 + ax0_ax1_fused_1) * 4
+                                            + ax0_ax1_fused_2
+                                            < 128
+                                        )
+                                        v0 = T.axis.spatial(
+                                            128,
+                                            i0_0_i1_0_fused % 4 * 32
+                                            + (
+                                                ax0_ax1_fused_0 * 512
+                                                + ax0_ax1_fused_1 * 4
+                                                + ax0_ax1_fused_2
+                                            )
+                                            // 4,
+                                        )
+                                        v1 = T.axis.spatial(
+                                            128,
+                                            i2_0 * 4
+                                            + (
+                                                ax0_ax1_fused_0 * 512
+                                                + ax0_ax1_fused_1 * 4
+                                                + ax0_ax1_fused_2
+                                            )
+                                            % 4,
+                                        )
+                                        T.reads(p1[v0, v1])
+                                        T.writes(p1_shared[v0, v1])
+                                        p1_shared[v0, v1] = p1[v0, v1]
+                        for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(1, 1, 4, 4, 1, 1):
+                            with T.block("T_matmul_NT_update"):
+                                i = T.axis.spatial(
+                                    128,
+                                    i0_0_i1_0_fused // 4 * 16 + i0_2_i1_2_fused // 8 + i0_3 + i0_4,
+                                )
+                                j = T.axis.spatial(
+                                    128,
+                                    i1_4
+                                    + i0_0_i1_0_fused % 4 * 32
+                                    + i0_2_i1_2_fused % 8 * 4
+                                    + i1_3,
+                                )
+                                k = T.axis.reduce(128, i2_0 * 4 + i2_1 * 4 + i2_2)
+                                T.reads(T_matmul_NT_local[i, j], p0_shared[i, k], p1_shared[j, k])
+                                T.writes(T_matmul_NT_local[i, j])
+                                T.block_attr(
+                                    {
+                                        "layout_free_placeholders": [],
+                                        "meta_schedule.thread_extent_high_inclusive": 256,
+                                        "meta_schedule.thread_extent_low_inclusive": 16,
+                                        "meta_schedule.tiling_structure": "SSSRRSRS",
+                                    }
+                                )
+                                T_matmul_NT_local[i, j] = (
+                                    T_matmul_NT_local[i, j] + p0_shared[i, k] * p1_shared[j, k]
+                                )
+                    for ax0, ax1 in T.grid(1, 4):
+                        with T.block("T_matmul_NT_local"):
+                            v0 = T.axis.spatial(
+                                128, i0_0_i1_0_fused // 4 * 16 + i0_2_i1_2_fused // 8 + ax0
+                            )
+                            v1 = T.axis.spatial(
+                                128, i0_0_i1_0_fused % 4 * 32 + i0_2_i1_2_fused % 8 * 4 + ax1
+                            )
+                            T.reads(T_matmul_NT_local[v0, v1])
+                            T.writes(T_add[v0, v1])
+                            T_add[v0, v1] = T_matmul_NT_local[v0, v1] + T.float32(1)
+
+
+@tvm.script.ir_module
+class Conv2dInt8:
+    @T.prim_func
+    def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), "int8"], p2: T.Buffer[(1, 1, 1, 256), "int32"], p3: T.Buffer[(1, 1, 1, 256), "int32"], p4: T.Buffer[(1, 1, 1, 256), "int64"], p5: T.Buffer[(1, 1, 1, 256), "int64"], p6: T.Buffer[(1, 1, 1, 256), "int64"], p7: T.Buffer[(), "int32"], p8: T.Buffer[1, "int32"], compute: T.Buffer[(16, 56, 56, 256), "int32"]) -> None:
+        # function attr dict
+        T.func_attr({"tir.noalias": True, "global_symbol": "main"})
+        # body
+        # with T.block("root")
+        pad_temp = T.alloc_buffer([16, 56, 56, 64], dtype="int8")
+        conv2d_nhwc = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_subtract = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_add = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_cast = T.alloc_buffer([16, 56, 56, 256], dtype="int64")
+        T_multiply = T.alloc_buffer([16, 56, 56, 256], dtype="int64")
+        T_add_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int64")
+        T_right_shift = T.alloc_buffer([16, 56, 56, 256], dtype="int64")
+        T_cast_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_add_2 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        compute_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_cast_2 = T.alloc_buffer([16, 56, 56, 256], dtype="uint8")
+        T_cast_3 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_subtract_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 64):
+            with T.block("pad_temp"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(p0[i0_1, i1_1, i2_1, i3_1])
+                T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1])
+                pad_temp[i0_1, i1_1, i2_1, i3_1] = p0[i0_1, i1_1, i2_1, i3_1]
+        for i0, i1, i2, i3, i4, i5, i6 in T.grid(16, 56, 56, 256, 1, 1, 64):
+            with T.block("conv2d_nhwc"):
+                nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6])
+                T.reads(pad_temp[nn, yy + ry, xx + rx, rc], p1[ff, ry, rx, rc])
+                T.writes(conv2d_nhwc[nn, yy, xx, ff])
+                with T.init():
+                    conv2d_nhwc[nn, yy, xx, ff] = 0
+                conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + T.cast(pad_temp[nn, yy + ry, xx + rx, rc], "int32") * T.cast(p1[ff, ry, rx, rc], "int32")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_subtract"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], p2[0, 0, 0, ax3])
+                T.writes(T_subtract[ax0, ax1, ax2, ax3])
+                T_subtract[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] - p2[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_add"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_subtract[ax0, ax1, ax2, ax3], p3[0, 0, 0, ax3])
+                T.writes(T_add[ax0, ax1, ax2, ax3])
+                T_add[ax0, ax1, ax2, ax3] = T_subtract[ax0, ax1, ax2, ax3] + p3[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_cast"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add[ax0, ax1, ax2, ax3])
+                T.writes(T_cast[ax0, ax1, ax2, ax3])
+                T_cast[ax0, ax1, ax2, ax3] = T.cast(T_add[ax0, ax1, ax2, ax3], "int64")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_multiply"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_cast[ax0, ax1, ax2, ax3], p4[0, 0, 0, ax3])
+                T.writes(T_multiply[ax0, ax1, ax2, ax3])
+                T_multiply[ax0, ax1, ax2, ax3] = T_cast[ax0, ax1, ax2, ax3] * p4[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_add_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_multiply[ax0, ax1, ax2, ax3], p5[0, 0, 0, ax3])
+                T.writes(T_add_1[ax0, ax1, ax2, ax3])
+                T_add_1[ax0, ax1, ax2, ax3] = T_multiply[ax0, ax1, ax2, ax3] + p5[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_right_shift"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add_1[ax0, ax1, ax2, ax3], p6[0, 0, 0, ax3])
+                T.writes(T_right_shift[ax0, ax1, ax2, ax3])
+                T_right_shift[ax0, ax1, ax2, ax3] = T.shift_right(T_add_1[ax0, ax1, ax2, ax3], p6[0, 0, 0, ax3], dtype="int64")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_cast_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_right_shift[ax0, ax1, ax2, ax3])
+                T.writes(T_cast_1[ax0, ax1, ax2, ax3])
+                T_cast_1[ax0, ax1, ax2, ax3] = T.cast(T_right_shift[ax0, ax1, ax2, ax3], "int32")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_add_2"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(p7[()], T_cast_1[ax0, ax1, ax2, ax3])
+                T.writes(T_add_2[ax0, ax1, ax2, ax3])
+                T_add_2[ax0, ax1, ax2, ax3] = p7[()] + T_cast_1[ax0, ax1, ax2, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("compute"):
+                i0_2, i1_2, i2_2, i3_2 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add_2[i0_2, i1_2, i2_2, i3_2])
+                T.writes(compute_1[i0_2, i1_2, i2_2, i3_2])
+                compute_1[i0_2, i1_2, i2_2, i3_2] = T.max(T.min(T_add_2[i0_2, i1_2, i2_2, i3_2], 255), 0)
+        for i0_3, i1_3, i2_3, i3_3 in T.grid(16, 56, 56, 256):
+            with T.block("T_cast_2"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_3, i1_3, i2_3, i3_3])
+                T.reads(compute_1[ax0, ax1, ax2, ax3])
+                T.writes(T_cast_2[ax0, ax1, ax2, ax3])
+                T_cast_2[ax0, ax1, ax2, ax3] = T.cast(compute_1[ax0, ax1, ax2, ax3], "uint8")
+        for i0_4, i1_4, i2_4, i3_4 in T.grid(16, 56, 56, 256):
+            with T.block("T_cast_3"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_4, i1_4, i2_4, i3_4])
+                T.reads(T_cast_2[ax0, ax1, ax2, ax3])
+                T.writes(T_cast_3[ax0, ax1, ax2, ax3])
+                T_cast_3[ax0, ax1, ax2, ax3] = T.cast(T_cast_2[ax0, ax1, ax2, ax3], "int32")
+        for i0_5, i1_5, i2_5, i3_5 in T.grid(16, 56, 56, 256):
+            with T.block("T_subtract_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_5, i1_5, i2_5, i3_5])
+                T.reads(T_cast_3[ax0, ax1, ax2, ax3], p8[0])
+                T.writes(T_subtract_1[ax0, ax1, ax2, ax3])
+                T_subtract_1[ax0, ax1, ax2, ax3] = T_cast_3[ax0, ax1, ax2, ax3] - p8[0]
+        for i0_6, i1_6, i2_6, i3_6 in T.grid(16, 56, 56, 256):
+            with T.block("compute_1"):
+                i0_7, i1_7, i2_7, i3_7 = T.axis.remap("SSSS", [i0_6, i1_6, i2_6, i3_6])
+                T.reads(T_subtract_1[i0_7, i1_7, i2_7, i3_7])
+                T.writes(compute[i0_7, i1_7, i2_7, i3_7])
+                compute[i0_7, i1_7, i2_7, i3_7] = T.q_multiply_shift(T_subtract_1[i0_7, i1_7, i2_7, i3_7], 1963325822, 31, 1, dtype="int32")
+
+
+@tvm.script.ir_module
+class Conv2dInt8_target:
+    @T.prim_func
+    def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), "int8"], p2: T.Buffer[(1, 1, 1, 256), "int32"], p3: T.Buffer[(1, 1, 1, 256), "int32"], p4: T.Buffer[(1, 1, 1, 256), "int64"], p5: T.Buffer[(1, 1, 1, 256), "int64"], p6: T.Buffer[(1, 1, 1, 256), "int64"], p7: T.Buffer[(), "int32"], p8: T.Buffer[1, "int32"], p9: T.Buffer[(16, 56, 56, 256), "int32"], compute: T.Buffer[(16, 56, 56, 256), "uint8"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        pad_temp = T.alloc_buffer([16, 56, 56, 64], dtype="int8")
+        conv2d_nhwc = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_subtract = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_add = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_cast = T.alloc_buffer([16, 56, 56, 256], dtype="int64")
+        T_multiply = T.alloc_buffer([16, 56, 56, 256], dtype="int64")
+        T_add_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int64")
+        T_right_shift = T.alloc_buffer([16, 56, 56, 256], dtype="int64")
+        T_cast_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_add_2 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        compute_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_cast_2 = T.alloc_buffer([16, 56, 56, 256], dtype="uint8")
+        T_cast_3 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_subtract_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        compute_2 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_add_3 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        compute_3 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_cast_4 = T.alloc_buffer([16, 56, 56, 256], dtype="uint8")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 64):
+            with T.block("pad_temp"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(p0[i0_1, i1_1, i2_1, i3_1])
+                T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1])
+                pad_temp[i0_1, i1_1, i2_1, i3_1] = p0[i0_1, i1_1, i2_1, i3_1]
+        for i0, i1, i2, i3, i4, i5, i6 in T.grid(16, 56, 56, 256, 1, 1, 64):
+            with T.block("conv2d_nhwc"):
+                nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6])
+                T.reads(pad_temp[nn, yy + ry, xx + rx, rc], p1[ff, ry, rx, rc])
+                T.writes(conv2d_nhwc[nn, yy, xx, ff])
+                with T.init():
+                    conv2d_nhwc[nn, yy, xx, ff] = 0
+                conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + T.cast(pad_temp[nn, yy + ry, xx + rx, rc], "int32") * T.cast(p1[ff, ry, rx, rc], "int32")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_subtract"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], p2[0, 0, 0, ax3])
+                T.writes(T_subtract[ax0, ax1, ax2, ax3])
+                T_subtract[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] - p2[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_add"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_subtract[ax0, ax1, ax2, ax3], p3[0, 0, 0, ax3])
+                T.writes(T_add[ax0, ax1, ax2, ax3])
+                T_add[ax0, ax1, ax2, ax3] = T_subtract[ax0, ax1, ax2, ax3] + p3[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_cast"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add[ax0, ax1, ax2, ax3])
+                T.writes(T_cast[ax0, ax1, ax2, ax3])
+                T_cast[ax0, ax1, ax2, ax3] = T.cast(T_add[ax0, ax1, ax2, ax3], "int64")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_multiply"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_cast[ax0, ax1, ax2, ax3], p4[0, 0, 0, ax3])
+                T.writes(T_multiply[ax0, ax1, ax2, ax3])
+                T_multiply[ax0, ax1, ax2, ax3] = T_cast[ax0, ax1, ax2, ax3] * p4[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_add_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_multiply[ax0, ax1, ax2, ax3], p5[0, 0, 0, ax3])
+                T.writes(T_add_1[ax0, ax1, ax2, ax3])
+                T_add_1[ax0, ax1, ax2, ax3] = T_multiply[ax0, ax1, ax2, ax3] + p5[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_right_shift"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add_1[ax0, ax1, ax2, ax3], p6[0, 0, 0, ax3])
+                T.writes(T_right_shift[ax0, ax1, ax2, ax3])
+                T_right_shift[ax0, ax1, ax2, ax3] = T.shift_right(T_add_1[ax0, ax1, ax2, ax3], p6[0, 0, 0, ax3], dtype="int64")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_cast_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_right_shift[ax0, ax1, ax2, ax3])
+                T.writes(T_cast_1[ax0, ax1, ax2, ax3])
+                T_cast_1[ax0, ax1, ax2, ax3] = T.cast(T_right_shift[ax0, ax1, ax2, ax3], "int32")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_add_2"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(p7[()], T_cast_1[ax0, ax1, ax2, ax3])
+                T.writes(T_add_2[ax0, ax1, ax2, ax3])
+                T_add_2[ax0, ax1, ax2, ax3] = p7[()] + T_cast_1[ax0, ax1, ax2, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("compute"):
+                i0_2, i1_2, i2_2, i3_2 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add_2[i0_2, i1_2, i2_2, i3_2])
+                T.writes(compute_1[i0_2, i1_2, i2_2, i3_2])
+                compute_1[i0_2, i1_2, i2_2, i3_2] = T.max(T.min(T_add_2[i0_2, i1_2, i2_2, i3_2], 255), 0)
+        for i0_3, i1_3, i2_3, i3_3 in T.grid(16, 56, 56, 256):
+            with T.block("T_cast_2"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_3, i1_3, i2_3, i3_3])
+                T.reads(compute_1[ax0, ax1, ax2, ax3])
+                T.writes(T_cast_2[ax0, ax1, ax2, ax3])
+                T_cast_2[ax0, ax1, ax2, ax3] = T.cast(compute_1[ax0, ax1, ax2, ax3], "uint8")
+        for i0_4, i1_4, i2_4, i3_4 in T.grid(16, 56, 56, 256):
+            with T.block("T_cast_3"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_4, i1_4, i2_4, i3_4])
+                T.reads(T_cast_2[ax0, ax1, ax2, ax3])
+                T.writes(T_cast_3[ax0, ax1, ax2, ax3])
+                T_cast_3[ax0, ax1, ax2, ax3] = T.cast(T_cast_2[ax0, ax1, ax2, ax3], "int32")
+        for i0_5, i1_5, i2_5, i3_5 in T.grid(16, 56, 56, 256):
+            with T.block("T_subtract_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_5, i1_5, i2_5, i3_5])
+                T.reads(T_cast_3[ax0, ax1, ax2, ax3], p8[0])
+                T.writes(T_subtract_1[ax0, ax1, ax2, ax3])
+                T_subtract_1[ax0, ax1, ax2, ax3] = T_cast_3[ax0, ax1, ax2, ax3] - p8[0]
+        for i0_6, i1_6, i2_6, i3_6 in T.grid(16, 56, 56, 256):
+            with T.block("compute_1"):
+                i0_7, i1_7, i2_7, i3_7 = T.axis.remap("SSSS", [i0_6, i1_6, i2_6, i3_6])
+                T.reads(T_subtract_1[i0_7, i1_7, i2_7, i3_7])
+                T.writes(compute_2[i0_7, i1_7, i2_7, i3_7])
+                compute_2[i0_7, i1_7, i2_7, i3_7] = T.q_multiply_shift(T_subtract_1[i0_7, i1_7, i2_7, i3_7], 1098990753, 31, 1, dtype="int32")
+        for i0_8, i1_8, i2_8, i3_8 in T.grid(16, 56, 56, 256):
+            with T.block("T_add_3"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_8, i1_8, i2_8, i3_8])
+                T.reads(compute_2[ax0, ax1, ax2, ax3], p9[ax0, ax1, ax2, ax3])
+                T.writes(T_add_3[ax0, ax1, ax2, ax3])
+                T_add_3[ax0, ax1, ax2, ax3] = compute_2[ax0, ax1, ax2, ax3] + p9[ax0, ax1, ax2, ax3]
+        for i0_9, i1_9, i2_9, i3_9 in T.grid(16, 56, 56, 256):
+            with T.block("compute_2"):
+                i0_10, i1_10, i2_10, i3_10 = T.axis.remap("SSSS", [i0_9, i1_9, i2_9, i3_9])
+                T.reads(T_add_3[i0_10, i1_10, i2_10, i3_10])
+                T.writes(compute_3[i0_10, i1_10, i2_10, i3_10])
+                compute_3[i0_10, i1_10, i2_10, i3_10] = T.max(T.min(T_add_3[i0_10, i1_10, i2_10, i3_10], 255), 0)
+        for i0_11, i1_11, i2_11, i3_11 in T.grid(16, 56, 56, 256):
+            with T.block("T_cast_4"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_11, i1_11, i2_11, i3_11])
+                T.reads(compute_3[ax0, ax1, ax2, ax3])
+                T.writes(T_cast_4[ax0, ax1, ax2, ax3])
+                T_cast_4[ax0, ax1, ax2, ax3] = T.cast(compute_3[ax0, ax1, ax2, ax3], "uint8")
+        for i0_12, i1_12, i2_12, i3_12 in T.grid(16, 56, 56, 256):
+            with T.block("compute_3"):
+                i0_13, i1_13, i2_13, i3_13 = T.axis.remap("SSSS", [i0_12, i1_12, i2_12, i3_12])
+                T.reads(T_cast_4[i0_13, i1_13, i2_13, i3_13])
+                T.writes(compute[i0_13, i1_13, i2_13, i3_13])
+                compute[i0_13, i1_13, i2_13, i3_13] = T.max(T.min(T_cast_4[i0_13, i1_13, i2_13, i3_13], T.uint8(255)), T.uint8(0))
+
+
+@tvm.script.ir_module
+class Conv2dInt8_tensorcore_scheduled:
+    @T.prim_func
+    def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), "int8"], p2: T.Buffer[(1, 1, 1, 256), "int32"], p3: T.Buffer[(1, 1, 1, 256), "int32"], p4: T.Buffer[(1, 1, 1, 256), "int64"], p5: T.Buffer[(1, 1, 1, 256), "int64"], p6: T.Buffer[(1, 1, 1, 256), "int64"], p7: T.Buffer[(), "int32"], p8: T.Buffer[1, "int32"], p9: T.Buffer[(16, 56, 56, 256), "int32"], compute: T.Buffer[(16, 56, 56, 256), "uint8"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        a0 = T.var("int32")
+        a1 = T.var("int32")
+        b0 = T.var("int32")
+        b1 = T.var("int32")
+        c0 = T.var("int32")
+        c1 = T.var("int32")
+        d0 = T.var("int32")
+        d0_1 = T.var("int32")
+        d0_2 = T.var("int32")
+        d0_3 = T.var("int32")
+        d1 = T.var("int32")
+        d1_1 = T.var("int32")
+        d1_2 = T.var("int32")
+        d1_3 = T.var("int32")
+        s0 = T.var("int32")
+        s0_1 = T.var("int32")
+        s0_2 = T.var("int32")
+        s1 = T.var("int32")
+        s1_1 = T.var("int32")
+        s1_2 = T.var("int32")
+        # body
+        # with T.block("root")
+        conv2d_nhwc_reindex_shared = T.alloc_buffer([50176, 256], dtype="int32", scope="shared")
+        conv2d_nhwc_reindex_shared_wmma_accumulator = T.alloc_buffer([50176, 256], dtype="int32", scope="wmma.accumulator")
+        pad_temp_reindex_shared = T.alloc_buffer([50176, 64], dtype="int8", scope="shared")
+        p1_reindex_shared = T.alloc_buffer([1, 1, 256, 64], dtype="int8", scope="shared")
+        pad_temp_reindex_shared_wmma_matrix_a = T.alloc_buffer([50176, 64], dtype="int8", scope="wmma.matrix_a")
+        p1_reindex_shared_wmma_matrix_b = T.alloc_buffer([1, 1, 256, 64], dtype="int8", scope="wmma.matrix_b")
+        for ax2_0_0_ax3_0_0_fused in T.thread_binding(3136, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}):
+            for ax2_0_1_ax3_0_1_fused in T.thread_binding(1, thread="vthread.x"):
+                for ax2_0_2_ax3_0_2_fused in T.thread_binding(16, thread="threadIdx.x"):
+                    for ax0_0, ax1_0 in T.grid(1, 1):
+                        for ax2_0_3_init, ax3_0_3_init, ax2_0_4_init, ax3_0_4_init in T.grid(1, 1, 1, 1):
+                            with T.block("conv2d_nhwc_o_init"):
+                                v2_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2 + ax2_0_3_init + ax2_0_4_init)
+                                v3_o = T.axis.spatial(16, ax3_0_4_init + ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2 + ax3_0_3_init)
+                                T.reads()
+                                T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "warp_execution":1})
+                                C = T.match_buffer(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int32", strides=[d1, d0], scope="wmma.accumulator", offset_factor=16)
+                                T.evaluate(T.tvm_fill_fragment(C.data, 16, 16, 16, C.elem_offset // d1 // 16 * (d1 // 16) + C.elem_offset % d1 // 16, T.float32(0), dtype="handle"))
+                        for ax4_0_0 in T.serial(2):
+                            for ax0_ax1_fused_0 in T.serial(16):
+                                for ax0_ax1_fused_1 in T.thread_binding(16, thread="threadIdx.x"):
+                                    for ax0_ax1_fused_2 in T.vectorized(16):
+                                        with T.block("pad_temp_reindex_shared"):
+                                            v0 = T.axis.spatial(50176, ax2_0_0_ax3_0_0_fused // 8 * 128 + (ax0_ax1_fused_0 * 256 + ax0_ax1_fused_1 * 16 + ax0_ax1_fused_2) // 32)
+                                            v1 = T.axis.spatial(64, ax4_0_0 * 32 + (ax0_ax1_fused_0 * 256 + ax0_ax1_fused_1 * 16 + ax0_ax1_fused_2) % 32)
+                                            T.reads(p0[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1])
+                                            T.writes(pad_temp_reindex_shared[v0, v1])
+                                            T.block_attr({"buffer_dim_align":[[0, 0, 32, 16]]})
+                                            pad_temp_reindex_shared[v0, v1] = p0[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1]
+                            for ax0_ax1_ax2_ax3_fused_0 in T.serial(8):
+                                for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(16, thread="threadIdx.x"):
+                                    for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(8):
+                                        with T.block("p1_reindex_shared"):
+                                            v0 = T.axis.spatial(1, 0)
+                                            v1 = T.axis.spatial(1, 0)
+                                            v2 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused % 8 * 32 + (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 8 + ax0_ax1_ax2_ax3_fused_2) // 32)
+                                            v3 = T.axis.spatial(64, ax4_0_0 * 32 + (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 8 + ax0_ax1_ax2_ax3_fused_2) % 32)
+                                            T.reads(p1[v2, v0, v1, v3])
+                                            T.writes(p1_reindex_shared[v0, v1, v2, v3])
+                                            T.block_attr({"buffer_dim_align":[[0, 2, 32, 16]]})
+                                            p1_reindex_shared[v0, v1, v2, v3] = p1[v2, v0, v1, v3]
+                            for ax0_1, ax1_1, ax4_0_1 in T.grid(1, 1, 1):
+                                for ax0_0_1, ax1_0_1 in T.grid(1, 2):
+                                    with T.block("pad_temp_reindex_shared_wmma.matrix_a_o"):
+                                        v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2)
+                                        v1_o = T.axis.spatial(4, ax4_0_0 * 2 + ax1_0_1)
+                                        T.reads(pad_temp_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                        T.writes(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                        A = T.match_buffer(pad_temp_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int8", strides=[s1, s0], scope="shared", offset_factor=16)
+                                        C_1 = T.match_buffer(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int8", strides=[d1_1, d0_1], scope="wmma.matrix_a", offset_factor=16)
+                                        T.evaluate(T.tvm_load_matrix_sync(C_1.data, 16, 16, 16, C_1.elem_offset // d1_1 // 16 * (d1_1 // 16) + C_1.elem_offset % d1_1 // 16, T.tvm_access_ptr(T.type_annotation(dtype="int8"), A.data, A.elem_offset, s1 * 16, 1, dtype="handle"), s1, "row_major", dtype="handle"))
+                                for ax0, ax1, ax2_0, ax3_0 in T.grid(1, 1, 1, 2):
+                                    with T.block("p1_reindex_shared_wmma.matrix_b_o"):
+                                        v0 = T.axis.spatial(1, 0)
+                                        v1 = T.axis.spatial(1, 0)
+                                        v2_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2)
+                                        v3_o = T.axis.spatial(4, ax4_0_0 * 2 + ax3_0)
+                                        T.reads(p1_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                        T.writes(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                        A_1 = T.match_buffer(p1_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int8", strides=[s1_1, s0_1], scope="shared", offset_factor=16)
+                                        C_2 = T.match_buffer(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int8", strides=[d1_2, d0_2], scope="wmma.matrix_b", offset_factor=16)
+                                        T.evaluate(T.tvm_load_matrix_sync(C_2.data, 16, 16, 16, C_2.elem_offset // d1_2 // 16 * (d1_2 // 16) + C_2.elem_offset % d1_2 // 16, T.tvm_access_ptr(T.type_annotation(dtype="int8"), A_1.data, A_1.elem_offset, s1_1 * 16, 1, dtype="handle"), s1_1, "col_major", dtype="handle"))
+                                for ax2_0_3, ax3_0_3, ax0_2, ax1_2, ax4_0_2, ax2_0_4, ax3_0_4 in T.grid(1, 1, 1, 1, 2, 1, 1):
+                                    with T.block("conv2d_nhwc_o_update"):
+                                        v0 = T.axis.reduce(1, 0)
+                                        v1 = T.axis.reduce(1, 0)
+                                        v2_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2 + ax2_0_3 + ax2_0_4)
+                                        v3_o = T.axis.spatial(16, ax3_0_4 + ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2 + ax3_0_3)
+                                        v4_o = T.axis.reduce(4, ax4_0_0 * 2 + ax4_0_1 * 2 + ax4_0_2)
+                                        T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 : v2_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 : v3_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16])
+                                        T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                        T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "warp_execution":1})
+                                        A_2 = T.match_buffer(pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 : v2_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], [16, 16], dtype="int8", strides=[a1, a0], scope="wmma.matrix_a", offset_factor=16)
+                                        B = T.match_buffer(p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 : v3_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], [16, 16], dtype="int8", strides=[b1, b0], scope="wmma.matrix_b", offset_factor=16)
+                                        C_3 = T.match_buffer(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int32", strides=[c1, c0], scope="wmma.accumulator", offset_factor=16)
+                                        T.evaluate(T.tvm_mma_sync(C_3.data, C_3.elem_offset // c1 // 16 * (c1 // 16) + C_3.elem_offset % c1 // 16, A_2.data, A_2.elem_offset // a1 // 16 * (a1 // 16) + A_2.elem_offset % a1 // 16, B.data, B.elem_offset // b1 // 16 * (b1 // 16) + B.elem_offset % b1 // 16, C_3.data, C_3.elem_offset // c1 // 16 * (c1 // 16) + C_3.elem_offset % c1 // 16, dtype="handle"))
+                    for ax0_0, ax1_0 in T.grid(1, 1):
+                        with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"):
+                            v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2)
+                            v1_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2)
+                            T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                            A_3 = T.match_buffer(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int32", strides=[d1_3, d0_3], scope="wmma.accumulator", offset_factor=16)
+                            C_4 = T.match_buffer(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int32", strides=[s1_2, s0_2], scope="shared", offset_factor=16)
+                            T.evaluate(T.tvm_store_matrix_sync(A_3.data, 16, 16, 16, A_3.elem_offset // d1_3 // 16 * (d1_3 // 16) + A_3.elem_offset % d1_3 // 16, T.tvm_access_ptr(T.type_annotation(dtype="int32"), C_4.data, C_4.elem_offset, s1_2 * 16, 2, dtype="handle"), s1_2, "row_major", dtype="handle"))
+                for ax0, ax1_0 in T.grid(128, 2):
+                    for ax1_1 in T.thread_binding(16, thread="threadIdx.x"):
+                        with T.block("conv2d_nhwc_reindex_shared"):
+                            v0 = T.axis.spatial(50176, ax2_0_0_ax3_0_0_fused // 8 * 128 + ax0)
+                            v1 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused % 8 * 32 + ax1_0 * 16 + ax1_1)
+                            T.reads(p7[()], conv2d_nhwc_reindex_shared[v0, v1], p2[0, 0, 0, v1], p3[0, 0, 0, v1], p4[0, 0, 0, v1], p5[0, 0, 0, v1], p6[0, 0, 0, v1], p8[0], p9[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1])
+                            T.writes(compute[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1])
+                            compute[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1] = T.max(T.min(T.cast(T.max(T.min(T.q_multiply_shift(T.cast(T.cast(T.max(T.min(p7[()] + T.cast(T.shift_right(T.cast(conv2d_nhwc_reindex_shared[v0, v1] - p2[0, 0, 0, v1] + p3[0, 0, 0, v1], "int64") * p4[0, 0, 0, v1] + p5[0, 0, 0, v1], p6[0, 0, 0, v1], dtype="int64"), "int32"), 255), 0), "uint8"), "int32") - p8[0], 1098990753, 31, 1, dtype="int32") + p9[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1], 255), 0), "uint8"), T.uint8(255)), T.uint8(0))
+
+
+@tvm.script.ir_module
+class Conv2dInt8_NCHWc:
+    @T.prim_func
+    def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, 4, 16, 4), "int8"], p2: T.Buffer[(1, 128, 1, 1, 16), "int32"], p3: T.Buffer[(1, 128, 1, 1, 16), "float32"], p4: T.Buffer[1, "float32"], p5: T.Buffer[(1, 128, 7, 7, 16), "int32"], compute: T.Buffer[(1, 128, 7, 7, 16), "uint8"]) -> None:
+        # function attr dict
+        T.func_attr({"tir.noalias": True, "global_symbol": "main"})
+        # body
+        # with T.block("root")
+        compile_engine_const = T.alloc_buffer([], dtype="float32")
+        conv2d_NCHWc_int8 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        T_add = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        T_cast = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_multiply = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        compile_engine_const_1 = T.alloc_buffer([], dtype="float32")
+        T_add_1 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_floor = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_cast_1 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        compute_1 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        T_cast_2 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="uint8")
+        T_cast_3 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_subtract = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_multiply_1 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        compile_engine_const_2 = T.alloc_buffer([], dtype="float32")
+        T_add_2 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_floor_1 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_cast_4 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        T_add_3 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        compute_2 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        T_cast_5 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="uint8")
+        with T.block("compile_engine_const"):
+            vi = T.axis.spatial(1, 0)
+            T.reads()
+            T.writes(compile_engine_const[()])
+            compile_engine_const[()] = T.float32(0.94537687301635742)
+        for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 128, 7, 7, 16, 1, 1, 32, 4, 4):
+            with T.block("conv2d_NCHWc_int8"):
+                n, oc_chunk, oh, ow, oc_block, kh, kw, ic_outer, ic_f_inner, ic_s_inner = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9])
+                T.reads(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner])
+                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block])
+                T.block_attr({"schedule_rule":"meta_schedule.conv2d_NCHWc_int8", "workload":["conv2d_NCHWc_int8.x86", ["TENSOR", [1, 32, 7, 7, 16], "uint8"], ["TENSOR", [128, 32, 1, 1, 4, 16, 4], "int8"], [1, 1], [0, 0, 0, 0], [1, 1], "NCHW16c", "NCHW16c", "int32"]})
+                with T.init():
+                    conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0
+                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] + T.cast(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32") * T.cast(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], "int32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_add"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(conv2d_NCHWc_int8[ax0, ax1, ax2, ax3, ax4], p2[ax0, ax1, 0, 0, ax4])
+                T.writes(T_add[ax0, ax1, ax2, ax3, ax4])
+                T_add[ax0, ax1, ax2, ax3, ax4] = conv2d_NCHWc_int8[ax0, ax1, ax2, ax3, ax4] + p2[ax0, ax1, 0, 0, ax4]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_add[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast[ax0, ax1, ax2, ax3, ax4])
+                T_cast[ax0, ax1, ax2, ax3, ax4] = T.cast(T_add[ax0, ax1, ax2, ax3, ax4], "float32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_multiply"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_cast[ax0, ax1, ax2, ax3, ax4], p3[ax0, ax1, 0, 0, ax4])
+                T.writes(T_multiply[ax0, ax1, ax2, ax3, ax4])
+                T_multiply[ax0, ax1, ax2, ax3, ax4] = T_cast[ax0, ax1, ax2, ax3, ax4] * p3[ax0, ax1, 0, 0, ax4]
+        with T.block("compile_engine_const_1"):
+            vi = T.axis.spatial(1, 0)
+            T.reads()
+            T.writes(compile_engine_const_1[()])
+            compile_engine_const_1[()] = T.float32(54.5)
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_add_1"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_multiply[ax0, ax1, ax2, ax3, ax4], compile_engine_const_1[()])
+                T.writes(T_add_1[ax0, ax1, ax2, ax3, ax4])
+                T_add_1[ax0, ax1, ax2, ax3, ax4] = T_multiply[ax0, ax1, ax2, ax3, ax4] + compile_engine_const_1[()]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_floor"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_add_1[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_floor[ax0, ax1, ax2, ax3, ax4])
+                T_floor[ax0, ax1, ax2, ax3, ax4] = T.floor(T_add_1[ax0, ax1, ax2, ax3, ax4], dtype="float32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_1"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_floor[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_1[ax0, ax1, ax2, ax3, ax4])
+                T_cast_1[ax0, ax1, ax2, ax3, ax4] = T.cast(T_floor[ax0, ax1, ax2, ax3, ax4], "int32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("compute"):
+                i0_1, i1_1, i2_1, i3_1, i4_1 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_cast_1[i0_1, i1_1, i2_1, i3_1, i4_1])
+                T.writes(compute_1[i0_1, i1_1, i2_1, i3_1, i4_1])
+                compute_1[i0_1, i1_1, i2_1, i3_1, i4_1] = T.max(T.min(T_cast_1[i0_1, i1_1, i2_1, i3_1, i4_1], 255), 0)
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_2"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(compute_1[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_2[ax0, ax1, ax2, ax3, ax4])
+                T_cast_2[ax0, ax1, ax2, ax3, ax4] = T.cast(compute_1[ax0, ax1, ax2, ax3, ax4], "uint8")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_3"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_cast_2[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_3[ax0, ax1, ax2, ax3, ax4])
+                T_cast_3[ax0, ax1, ax2, ax3, ax4] = T.cast(T_cast_2[ax0, ax1, ax2, ax3, ax4], "float32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_subtract"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_cast_3[ax0, ax1, ax2, ax3, ax4], p4[0])
+                T.writes(T_subtract[ax0, ax1, ax2, ax3, ax4])
+                T_subtract[ax0, ax1, ax2, ax3, ax4] = T_cast_3[ax0, ax1, ax2, ax3, ax4] - p4[0]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_multiply_1"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(compile_engine_const[()], T_subtract[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_multiply_1[ax0, ax1, ax2, ax3, ax4])
+                T_multiply_1[ax0, ax1, ax2, ax3, ax4] = compile_engine_const[()] * T_subtract[ax0, ax1, ax2, ax3, ax4]
+        with T.block("compile_engine_const_2"):
+            vi = T.axis.spatial(1, 0)
+            T.reads()
+            T.writes(compile_engine_const_2[()])
+            compile_engine_const_2[()] = T.float32(0.5)
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_add_2"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_multiply_1[ax0, ax1, ax2, ax3, ax4], compile_engine_const_2[()])
+                T.writes(T_add_2[ax0, ax1, ax2, ax3, ax4])
+                T_add_2[ax0, ax1, ax2, ax3, ax4] = T_multiply_1[ax0, ax1, ax2, ax3, ax4] + compile_engine_const_2[()]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_floor_1"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_add_2[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_floor_1[ax0, ax1, ax2, ax3, ax4])
+                T_floor_1[ax0, ax1, ax2, ax3, ax4] = T.floor(T_add_2[ax0, ax1, ax2, ax3, ax4], dtype="float32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_4"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_floor_1[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_4[ax0, ax1, ax2, ax3, ax4])
+                T_cast_4[ax0, ax1, ax2, ax3, ax4] = T.cast(T_floor_1[ax0, ax1, ax2, ax3, ax4], "int32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_add_3"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_cast_4[ax0, ax1, ax2, ax3, ax4], p5[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_add_3[ax0, ax1, ax2, ax3, ax4])
+                T_add_3[ax0, ax1, ax2, ax3, ax4] = T_cast_4[ax0, ax1, ax2, ax3, ax4] + p5[ax0, ax1, ax2, ax3, ax4]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("compute_1"):
+                i0_2, i1_2, i2_2, i3_2, i4_2 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_add_3[i0_2, i1_2, i2_2, i3_2, i4_2])
+                T.writes(compute_2[i0_2, i1_2, i2_2, i3_2, i4_2])
+                compute_2[i0_2, i1_2, i2_2, i3_2, i4_2] = T.max(T.min(T_add_3[i0_2, i1_2, i2_2, i3_2, i4_2], 255), 0)
+        for i0_3, i1_3, i2_3, i3_3, i4_3 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_5"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0_3, i1_3, i2_3, i3_3, i4_3])
+                T.reads(compute_2[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_5[ax0, ax1, ax2, ax3, ax4])
+                T_cast_5[ax0, ax1, ax2, ax3, ax4] = T.cast(compute_2[ax0, ax1, ax2, ax3, ax4], "uint8")
+        for i0_4, i1_4, i2_4, i3_4, i4_4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("compute_2"):
+                i0_5, i1_5, i2_5, i3_5, i4_5 = T.axis.remap("SSSSS", [i0_4, i1_4, i2_4, i3_4, i4_4])
+                T.reads(T_cast_5[i0_5, i1_5, i2_5, i3_5, i4_5])
+                T.writes(compute[i0_5, i1_5, i2_5, i3_5, i4_5])
+                compute[i0_5, i1_5, i2_5, i3_5, i4_5] = T.max(T.min(T_cast_5[i0_5, i1_5, i2_5, i3_5, i4_5], T.uint8(255)), T.uint8(0))
+
+
+@tvm.script.ir_module
+class Conv2dInt8_NCHWc_target:
+    @T.prim_func
+    def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, 4, 16, 4), "int8"], p2: T.Buffer[(1, 128, 1, 1, 16), "int32"], p3: T.Buffer[(1, 128, 1, 1, 16), "float32"], p4: T.Buffer[1, "float32"], p5: T.Buffer[(1, 128, 7, 7, 16), "uint8"], T_cast: T.Buffer[(1, 128, 7, 7, 16), "int32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        compile_engine_const = T.alloc_buffer([], dtype="float32")
+        conv2d_NCHWc_int8 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        T_add = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        T_cast_1 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_multiply = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        compile_engine_const_1 = T.alloc_buffer([], dtype="float32")
+        T_add_1 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_floor = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_cast_2 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        compute = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        T_cast_3 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="uint8")
+        T_cast_4 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_subtract = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_multiply_1 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        compile_engine_const_2 = T.alloc_buffer([], dtype="float32")
+        T_add_2 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_floor_1 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_cast_5 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        compile_engine_const_3 = T.alloc_buffer([], dtype="float32")
+        T_cast_6 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_multiply_2 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        compile_engine_const_4 = T.alloc_buffer([], dtype="float32")
+        T_add_3 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_floor_2 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="float32")
+        T_cast_7 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        T_add_4 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        compute_1 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+        T_cast_8 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="uint8")
+        compute_2 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="uint8")
+        with T.block("compile_engine_const"):
+            vi = T.axis.spatial(1, 0)
+            T.reads()
+            T.writes(compile_engine_const[()])
+            compile_engine_const[()] = T.float32(0.95489668846130371)
+        for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 128, 7, 7, 16, 1, 1, 32, 4, 4):
+            with T.block("conv2d_NCHWc_int8"):
+                n, oc_chunk, oh, ow, oc_block, kh, kw, ic_outer, ic_f_inner, ic_s_inner = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9])
+                T.reads(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner])
+                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block])
+                T.block_attr({"schedule_rule":"meta_schedule.conv2d_NCHWc_int8", "workload":["conv2d_NCHWc_int8.x86", ["TENSOR", [1, 32, 7, 7, 16], "uint8"], ["TENSOR", [128, 32, 1, 1, 4, 16, 4], "int8"], [1, 1], [0, 0, 0, 0], [1, 1], "NCHW16c", "NCHW16c", "int32"]})
+                with T.init():
+                    conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0
+                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] + T.cast(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32") * T.cast(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], "int32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_add"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(conv2d_NCHWc_int8[ax0, ax1, ax2, ax3, ax4], p2[ax0, ax1, 0, 0, ax4])
+                T.writes(T_add[ax0, ax1, ax2, ax3, ax4])
+                T_add[ax0, ax1, ax2, ax3, ax4] = conv2d_NCHWc_int8[ax0, ax1, ax2, ax3, ax4] + p2[ax0, ax1, 0, 0, ax4]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_add[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_1[ax0, ax1, ax2, ax3, ax4])
+                T_cast_1[ax0, ax1, ax2, ax3, ax4] = T.cast(T_add[ax0, ax1, ax2, ax3, ax4], "float32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_multiply"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_cast_1[ax0, ax1, ax2, ax3, ax4], p3[ax0, ax1, 0, 0, ax4])
+                T.writes(T_multiply[ax0, ax1, ax2, ax3, ax4])
+                T_multiply[ax0, ax1, ax2, ax3, ax4] = T_cast_1[ax0, ax1, ax2, ax3, ax4] * p3[ax0, ax1, 0, 0, ax4]
+        with T.block("compile_engine_const_1"):
+            vi = T.axis.spatial(1, 0)
+            T.reads()
+            T.writes(compile_engine_const_1[()])
+            compile_engine_const_1[()] = T.float32(65.5)
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_add_1"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_multiply[ax0, ax1, ax2, ax3, ax4], compile_engine_const_1[()])
+                T.writes(T_add_1[ax0, ax1, ax2, ax3, ax4])
+                T_add_1[ax0, ax1, ax2, ax3, ax4] = T_multiply[ax0, ax1, ax2, ax3, ax4] + compile_engine_const_1[()]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_floor"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_add_1[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_floor[ax0, ax1, ax2, ax3, ax4])
+                T_floor[ax0, ax1, ax2, ax3, ax4] = T.floor(T_add_1[ax0, ax1, ax2, ax3, ax4], dtype="float32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_1"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_floor[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_2[ax0, ax1, ax2, ax3, ax4])
+                T_cast_2[ax0, ax1, ax2, ax3, ax4] = T.cast(T_floor[ax0, ax1, ax2, ax3, ax4], "int32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("compute"):
+                i0_1, i1_1, i2_1, i3_1, i4_1 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_cast_2[i0_1, i1_1, i2_1, i3_1, i4_1])
+                T.writes(compute[i0_1, i1_1, i2_1, i3_1, i4_1])
+                compute[i0_1, i1_1, i2_1, i3_1, i4_1] = T.max(T.min(T_cast_2[i0_1, i1_1, i2_1, i3_1, i4_1], 255), 0)
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_2"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(compute[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_3[ax0, ax1, ax2, ax3, ax4])
+                T_cast_3[ax0, ax1, ax2, ax3, ax4] = T.cast(compute[ax0, ax1, ax2, ax3, ax4], "uint8")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_3"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_cast_3[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_4[ax0, ax1, ax2, ax3, ax4])
+                T_cast_4[ax0, ax1, ax2, ax3, ax4] = T.cast(T_cast_3[ax0, ax1, ax2, ax3, ax4], "float32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_subtract"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_cast_4[ax0, ax1, ax2, ax3, ax4], p4[0])
+                T.writes(T_subtract[ax0, ax1, ax2, ax3, ax4])
+                T_subtract[ax0, ax1, ax2, ax3, ax4] = T_cast_4[ax0, ax1, ax2, ax3, ax4] - p4[0]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_multiply_1"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(compile_engine_const[()], T_subtract[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_multiply_1[ax0, ax1, ax2, ax3, ax4])
+                T_multiply_1[ax0, ax1, ax2, ax3, ax4] = compile_engine_const[()] * T_subtract[ax0, ax1, ax2, ax3, ax4]
+        with T.block("compile_engine_const_2"):
+            vi = T.axis.spatial(1, 0)
+            T.reads()
+            T.writes(compile_engine_const_2[()])
+            compile_engine_const_2[()] = T.float32(0.5)
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_add_2"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_multiply_1[ax0, ax1, ax2, ax3, ax4], compile_engine_const_2[()])
+                T.writes(T_add_2[ax0, ax1, ax2, ax3, ax4])
+                T_add_2[ax0, ax1, ax2, ax3, ax4] = T_multiply_1[ax0, ax1, ax2, ax3, ax4] + compile_engine_const_2[()]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_floor_1"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_add_2[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_floor_1[ax0, ax1, ax2, ax3, ax4])
+                T_floor_1[ax0, ax1, ax2, ax3, ax4] = T.floor(T_add_2[ax0, ax1, ax2, ax3, ax4], dtype="float32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_4"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_floor_1[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_5[ax0, ax1, ax2, ax3, ax4])
+                T_cast_5[ax0, ax1, ax2, ax3, ax4] = T.cast(T_floor_1[ax0, ax1, ax2, ax3, ax4], "int32")
+        with T.block("compile_engine_const_3"):
+            vi = T.axis.spatial(1, 0)
+            T.reads()
+            T.writes(compile_engine_const_3[()])
+            compile_engine_const_3[()] = T.float32(0.71245479583740234)
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_5"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(p5[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_6[ax0, ax1, ax2, ax3, ax4])
+                T_cast_6[ax0, ax1, ax2, ax3, ax4] = T.cast(p5[ax0, ax1, ax2, ax3, ax4], "float32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_multiply_2"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(compile_engine_const_3[()], T_cast_6[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_multiply_2[ax0, ax1, ax2, ax3, ax4])
+                T_multiply_2[ax0, ax1, ax2, ax3, ax4] = compile_engine_const_3[()] * T_cast_6[ax0, ax1, ax2, ax3, ax4]
+        with T.block("compile_engine_const_4"):
+            vi = T.axis.spatial(1, 0)
+            T.reads()
+            T.writes(compile_engine_const_4[()])
+            compile_engine_const_4[()] = T.float32(0.5)
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_add_3"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_multiply_2[ax0, ax1, ax2, ax3, ax4], compile_engine_const_4[()])
+                T.writes(T_add_3[ax0, ax1, ax2, ax3, ax4])
+                T_add_3[ax0, ax1, ax2, ax3, ax4] = T_multiply_2[ax0, ax1, ax2, ax3, ax4] + compile_engine_const_4[()]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_floor_2"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_add_3[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_floor_2[ax0, ax1, ax2, ax3, ax4])
+                T_floor_2[ax0, ax1, ax2, ax3, ax4] = T.floor(T_add_3[ax0, ax1, ax2, ax3, ax4], dtype="float32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_6"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_floor_2[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_7[ax0, ax1, ax2, ax3, ax4])
+                T_cast_7[ax0, ax1, ax2, ax3, ax4] = T.cast(T_floor_2[ax0, ax1, ax2, ax3, ax4], "int32")
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_add_4"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_cast_5[ax0, ax1, ax2, ax3, ax4], T_cast_7[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_add_4[ax0, ax1, ax2, ax3, ax4])
+                T_add_4[ax0, ax1, ax2, ax3, ax4] = T_cast_5[ax0, ax1, ax2, ax3, ax4] + T_cast_7[ax0, ax1, ax2, ax3, ax4]
+        for i0, i1, i2, i3, i4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("compute_1"):
+                i0_2, i1_2, i2_2, i3_2, i4_2 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
+                T.reads(T_add_4[i0_2, i1_2, i2_2, i3_2, i4_2])
+                T.writes(compute_1[i0_2, i1_2, i2_2, i3_2, i4_2])
+                compute_1[i0_2, i1_2, i2_2, i3_2, i4_2] = T.max(T.min(T_add_4[i0_2, i1_2, i2_2, i3_2, i4_2], 255), 0)
+        for i0_3, i1_3, i2_3, i3_3, i4_3 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_7"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0_3, i1_3, i2_3, i3_3, i4_3])
+                T.reads(compute_1[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast_8[ax0, ax1, ax2, ax3, ax4])
+                T_cast_8[ax0, ax1, ax2, ax3, ax4] = T.cast(compute_1[ax0, ax1, ax2, ax3, ax4], "uint8")
+        for i0_4, i1_4, i2_4, i3_4, i4_4 in T.grid(1, 128, 7, 7, 16):
+            with T.block("compute_2"):
+                i0_5, i1_5, i2_5, i3_5, i4_5 = T.axis.remap("SSSSS", [i0_4, i1_4, i2_4, i3_4, i4_4])
+                T.reads(T_cast_8[i0_5, i1_5, i2_5, i3_5, i4_5])
+                T.writes(compute_2[i0_5, i1_5, i2_5, i3_5, i4_5])
+                compute_2[i0_5, i1_5, i2_5, i3_5, i4_5] = T.max(T.min(T_cast_8[i0_5, i1_5, i2_5, i3_5, i4_5], T.uint8(255)), T.uint8(0))
+        for i0_6, i1_6, i2_6, i3_6, i4_6 in T.grid(1, 128, 7, 7, 16):
+            with T.block("T_cast_8"):
+                ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0_6, i1_6, i2_6, i3_6, i4_6])
+                T.reads(compute_2[ax0, ax1, ax2, ax3, ax4])
+                T.writes(T_cast[ax0, ax1, ax2, ax3, ax4])
+                T_cast[ax0, ax1, ax2, ax3, ax4] = T.cast(compute_2[ax0, ax1, ax2, ax3, ax4], "int32")
+
+
+def get_conv2d_vnni_mod(intrin_id):
+    @tvm.script.ir_module
+    class Conv2dInt8_NCHWc_scheduled:
+        @T.prim_func
+        def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, 4, 16, 4), "int8"], p2: T.Buffer[(1, 128, 1, 1, 16), "int32"], p3: T.Buffer[(1, 128, 1, 1, 16), "float32"], p4: T.Buffer[1, "float32"], p5: T.Buffer[(1, 128, 7, 7, 16), "uint8"], T_cast: T.Buffer[(1, 128, 7, 7, 16), "int32"]) -> None:
+            # function attr dict
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            # body
+            # with T.block("root")
+            conv2d_NCHWc_int8 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32")
+            for i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused in T.parallel(128, annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}):
+                for i2_1, i3_1, i4_0_1 in T.grid(7, 1, 1):
+                    for i5_0, i6_0 in T.grid(1, 1):
+                        for i1_2_init, i2_2_init, i3_2_init, i1_3_init, i2_3_init, i3_3_init in T.grid(1, 1, 1, 1, 1, 7):
+                            with T.block("conv2d_NCHWc_int8_o_init"):
+                                n = T.axis.spatial(1, 0)
+                                oc_chunk = T.axis.spatial(128, i1_2_init + i1_3_init + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32)
+                                oh = T.axis.spatial(7, i2_1 + i2_2_init + i2_3_init)
+                                ow = T.axis.spatial(7, i3_1 * 7 + i3_2_init * 7 + i3_3_init)
+                                oc_block_o = T.axis.spatial(1, 0)
+                                T.reads()
+                                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16])
+                                for i4_1 in T.vectorized(16):
+                                    with T.block("conv2d_NCHWc_int8_init"):
+                                        oc_block_i_init = T.axis.spatial(16, i4_1)
+                                        T.reads()
+                                        T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init])
+                                        conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+                        for i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 7, 1):
+                            with T.block("conv2d_NCHWc_int8_o_update"):
+                                n = T.axis.spatial(1, 0)
+                                oc_chunk = T.axis.spatial(128, i1_2 + i1_3 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32)
+                                oh = T.axis.spatial(7, i2_1 + i2_2 + i2_3)
+                                ow = T.axis.spatial(7, i3_1 * 7 + i3_2 * 7 + i3_3)
+                                oc_block_o = T.axis.spatial(1, 0)
+                                kh = T.axis.reduce(1, 0)
+                                kw = T.axis.reduce(1, 0)
+                                ic_outer = T.axis.reduce(32, i7_0 * 8 + i7_1)
+                                ic_f_inner = T.axis.reduce(4, i8_1 + i8_0)
+                                ic_s_inner_o = T.axis.reduce(1, 0)
+                                T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4])
+                                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16])
+                                A = T.match_buffer(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1)
+                                B = T.match_buffer(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4], [16, 4], dtype="int8", offset_factor=1)
+                                C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], [16], dtype="int32", offset_factor=1)
+                                A_u8x4: T.uint8x4 = A[0:4]
+                                A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                                B_i8x64: T.int8x64 = B[0, 0:64]
+                                B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16")
+                                C[0:16] = C[0:16] + T.call_llvm_pure_intrin(intrin_id, T.uint32(0), T.broadcast(0, 16), T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16")
+                    for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 7):
+                        for ax4_fused in T.vectorized(16):
+                            with T.block("T_cast_8"):
+                                ax0_1 = T.axis.spatial(1, ax0)
+                                ax1_1 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32 + ax1)
+                                ax2_1 = T.axis.spatial(7, i2_1 + ax2)
+                                ax3_1, ax4 = T.axis.remap("SS", [ax3, ax4_fused])
+                                T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4], p2[ax0_1, ax1_1, 0, 0, ax4], p3[ax0_1, ax1_1, 0, 0, ax4], p4[0], p5[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
+                                T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
+                                T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = T.cast(T.max(T.min(T.cast(T.max(T.min(T.cast(T.floor(T.float32(0.95489668846130371) * (T.cast(T.cast(T.max(T.min(T.cast(T.floor(T.cast(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4] + p2[ax0_1, ax1_1, 0, 0, ax4], "float32") * p3[ax0_1, ax1_1, 0, 0, ax4] + T.float32(65.5), dtype="float32"), "int32"), 255), 0), "uint8"), "float32") - p4[0]) + T.float32(0.5), dtype="float32"), "int32") + T.cast(T.floor(T.float32(0.71245479583740234) * T.cast(p5[ax0_1, ax1_1, ax2_1, ax3_1, ax4], "float32") + T.float32(0.5), dtype="float32"), "int32"), 255), 0), "uint8"), T.uint8(255)), T.uint8(0)), "int32")
+
+    return Conv2dInt8_NCHWc_scheduled
+
+
+@tvm.script.ir_module
+class Conv2dWinogradAddRelu:
+    @T.prim_func
+    def main(p0: T.Buffer[(1, 56, 56, 64), "float32"], p1: T.Buffer[(6, 6, 64, 64), "float32"], p2: T.Buffer[(1, 1, 1, 64), "float32"], T_relu: T.Buffer[(1, 56, 56, 64), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"layout_free_buffers": [1], "tir.noalias": True, "global_symbol": "main"})
+        # body
+        # with T.block("root")
+        data_pad = T.alloc_buffer([1, 58, 58, 64], dtype="float32")
+        input_tile = T.alloc_buffer([6, 6, 196, 64], dtype="float32")
+        B = T.alloc_buffer([6, 6], dtype="float32")
+        data_pack = T.alloc_buffer([6, 6, 196, 64], dtype="float32")
+        bgemm = T.alloc_buffer([6, 6, 196, 64], dtype="float32")
+        A = T.alloc_buffer([6, 4], dtype="float32")
+        inverse = T.alloc_buffer([4, 4, 196, 64], dtype="float32")
+        conv2d_winograd = T.alloc_buffer([1, 56, 56, 64], dtype="float32")
+        T_add = T.alloc_buffer([1, 56, 56, 64], dtype="float32")
+        for i0, i1, i2, i3 in T.grid(1, 58, 58, 64):
+            with T.block("data_pad"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(p0[i0_1, i1_1 - 1, i2_1 - 1, i3_1])
+                T.writes(data_pad[i0_1, i1_1, i2_1, i3_1])
+                T.block_attr({"schedule_rule":"None"})
+                data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 57 and 1 <= i2_1 and i2_1 < 57, p0[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32")
+        for i0, i1, i2, i3 in T.grid(6, 6, 196, 64):
+            with T.block("input_tile"):
+                eps, nu, p, ci = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(data_pad[p // 196, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu, ci])
+                T.writes(input_tile[eps, nu, p, ci])
+                T.block_attr({"schedule_rule":"None"})
+                input_tile[eps, nu, p, ci] = data_pad[p // 196, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu, ci]
+        for i0, i1 in T.grid(6, 6):
+            with T.block("B"):
+                i, j = T.axis.remap("SS", [i0, i1])
+                T.reads()
+                T.writes(B[i, j])
+                T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"})
+                B[i, j] = T.Select(i % 6 == 5 and j % 6 == 5, T.float32(1), T.Select(i % 6 == 5 and j % 6 == 4, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 3, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 2, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 1, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 0, T.float32(0), T.Select(i % 6 == 4 and j % 6 == 5, T.float32(1.5), T.Select(i % 6 == 4 and j % 6 == 4, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 3, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 2, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 1, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 0, T.float32(1), T.Select(i % 6 == 3 and j % 6 == 5, T.float32(-2), T.Select(i % 6 == 3 and j % 6 == 4, T.float32(-0.5), T.Select(i % 6 == 3 and j % 6 == 3, T.float32(2), T.Select(i % 6 == 3 and j % 6 == 2, T.float32(2.5), T.Select(i % 6 == 3 and j % 6 == 1, T.float32(0.5), T.Select(i % 6 == 3 and j % 6 == 0, T.float32(1.5), T.Select(i % 6 == 2 and j % 6 == 5, T.float32(-1.5), T.Select(i % 6 == 2 and j % 6 == 4, T.float32(-1), T.Select(i % 6 == 2 and j % 6 == 3, T.float32(-1), T.Select(i % 6 == 2 and j % 6 == 2, T.float32(0.5), T.Select(i % 6 == 2 and j % 6 == 1, T.float32(-2.5), T.Select(i % 6 == 2 and j % 6 == 0, T.float32(-2), T.Select(i % 6 == 1 and j % 6 == 5, T.float32(1), T.Select(i % 6 == 1 and j % 6 == 4, T.float32(0.5), T.Select(i % 6 == 1 and j % 6 == 3, T.float32(-2), T.Select(i % 6 == 1 and j % 6 == 2, T.float32(-1), T.Select(i % 6 == 1 and j % 6 == 1, T.float32(1), T.Select(i % 6 == 1 and j % 6 == 0, T.float32(-1.5), T.Select(i % 6 == 0 and j % 6 == 5, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 4, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 3, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 2, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 1, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
+        for i0, i1, i2, i3, i4, i5 in T.grid(6, 6, 196, 64, 6, 6):
+            with T.block("data_pack"):
+                eps, nu, p, ci, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+                T.reads(input_tile[r_a, r_b, p, ci], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1])
+                T.writes(data_pack[eps, nu, p, ci])
+                T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"})
+                with T.init():
+                    data_pack[eps, nu, p, ci] = T.float32(0)
+                data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu]
+        for i0, i1, i2, i3, i4 in T.grid(6, 6, 196, 64, 64):
+            with T.block("bgemm"):
+                eps, nu, p, co, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4])
+                T.reads(data_pack[eps, nu, p, ci], p1[eps, nu, co, ci])
+                T.writes(bgemm[eps, nu, p, co])
+                T.block_attr({"layout_free_placeholders":[]})
+                with T.init():
+                    bgemm[eps, nu, p, co] = T.float32(0)
+                bgemm[eps, nu, p, co] = bgemm[eps, nu, p, co] + data_pack[eps, nu, p, ci] * p1[eps, nu, co, ci]
+        for i0, i1 in T.grid(6, 4):
+            with T.block("A"):
+                i, j = T.axis.remap("SS", [i0, i1])
+                T.reads()
+                T.writes(A[i, j])
+                T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"})
+                A[i, j] = T.Select(i % 6 == 5 and j % 4 == 3, T.float32(1), T.Select(i % 6 == 5 and j % 4 == 2, T.float32(0), T.Select(i % 6 == 5 and j % 4 == 1, T.float32(0), T.Select(i % 6 == 5 and j % 4 == 0, T.float32(0), T.Select(i % 6 == 4 and j % 4 == 3, T.float32(-8), T.Select(i % 6 == 4 and j % 4 == 2, T.float32(4), T.Select(i % 6 == 4 and j % 4 == 1, T.float32(-2), T.Select(i % 6 == 4 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 3 and j % 4 == 3, T.float32(0.125), T.Select(i % 6 == 3 and j % 4 == 2, T.float32(0.25), T.Select(i % 6 == 3 and j % 4 == 1, T.float32(0.5), T.Select(i % 6 == 3 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 3, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 6 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 6 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 6 == 1 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
+        for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 196, 64, 6, 6):
+            with T.block("inverse"):
+                vh, vw, p, co, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+                T.reads(bgemm[r_a, r_b, p, co], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1])
+                T.writes(inverse[vh, vw, p, co])
+                T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse.cuda"})
+                with T.init():
+                    inverse[vh, vw, p, co] = T.float32(0)
+                inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw]
+        for i0, i1, i2, i3 in T.grid(1, 56, 56, 64):
+            with T.block("conv2d_winograd"):
+                n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co])
+                T.writes(conv2d_winograd[n, h, w, co])
+                conv2d_winograd[n, h, w, co] = inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co]
+        for i0, i1, i2, i3 in T.grid(1, 56, 56, 64):
+            with T.block("T_add"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], p2[ax0, 0, 0, ax3])
+                T.writes(T_add[ax0, ax1, ax2, ax3])
+                T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + p2[ax0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(1, 56, 56, 64):
+            with T.block("T_relu"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add[ax0, ax1, ax2, ax3])
+                T.writes(T_relu[ax0, ax1, ax2, ax3])
+                T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0))
+
+
+@tvm.script.ir_module
+class Conv2dWinogradAddResidualRelu:
+    @T.prim_func
+    def main(p0: T.Buffer[(1, 56, 56, 64), "float32"], p1: T.Buffer[(6, 6, 64, 64), "float32"], p2: T.Buffer[(1, 1, 1, 64), "float32"], p3: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 56, 56, 64), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
+        # body
+        # with T.block("root")
+        data_pad = T.alloc_buffer([1, 58, 58, 64], dtype="float32")
+        input_tile = T.alloc_buffer([6, 6, 196, 64], dtype="float32")
+        B = T.alloc_buffer([6, 6], dtype="float32")
+        data_pack = T.alloc_buffer([6, 6, 196, 64], dtype="float32")
+        bgemm = T.alloc_buffer([6, 6, 196, 64], dtype="float32")
+        A = T.alloc_buffer([6, 4], dtype="float32")
+        inverse = T.alloc_buffer([4, 4, 196, 64], dtype="float32")
+        conv2d_winograd = T.alloc_buffer([1, 56, 56, 64], dtype="float32")
+        T_add = T.alloc_buffer([1, 56, 56, 64], dtype="float32")
+        T_add_1 = T.alloc_buffer([1, 56, 56, 64], dtype="float32")
+        for i0, i1, i2, i3 in T.grid(1, 58, 58, 64):
+            with T.block("data_pad"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(p0[i0_1, i1_1 - 1, i2_1 - 1, i3_1])
+                T.writes(data_pad[i0_1, i1_1, i2_1, i3_1])
+                T.block_attr({"schedule_rule":"None"})
+                data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 57 and 1 <= i2_1 and i2_1 < 57, p0[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32")
+        for i0, i1, i2, i3 in T.grid(6, 6, 196, 64):
+            with T.block("input_tile"):
+                eps, nu, p, ci = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(data_pad[p // 196, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu, ci])
+                T.writes(input_tile[eps, nu, p, ci])
+                T.block_attr({"schedule_rule":"None"})
+                input_tile[eps, nu, p, ci] = data_pad[p // 196, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu, ci]
+        for i0, i1 in T.grid(6, 6):
+            with T.block("B"):
+                i, j = T.axis.remap("SS", [i0, i1])
+                T.reads()
+                T.writes(B[i, j])
+                T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"})
+                B[i, j] = T.Select(i % 6 == 5 and j % 6 == 5, T.float32(1), T.Select(i % 6 == 5 and j % 6 == 4, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 3, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 2, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 1, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 0, T.float32(0), T.Select(i % 6 == 4 and j % 6 == 5, T.float32(1.5), T.Select(i % 6 == 4 and j % 6 == 4, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 3, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 2, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 1, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 0, T.float32(1), T.Select(i % 6 == 3 and j % 6 == 5, T.float32(-2), T.Select(i % 6 == 3 and j % 6 == 4, T.float32(-0.5), T.Select(i % 6 == 3 and j % 6 == 3, T.float32(2), T.Select(i % 6 == 3 and j % 6 == 2, T.float32(2.5), T.Select(i % 6 == 3 and j % 6 == 1, T.float32(0.5), T.Select(i % 6 == 3 and j % 6 == 0, T.float32(1.5), T.Select(i % 6 == 2 and j % 6 == 5, T.float32(-1.5), T.Select(i % 6 == 2 and j % 6 == 4, T.float32(-1), T.Select(i % 6 == 2 and j % 6 == 3, T.float32(-1), T.Select(i % 6 == 2 and j % 6 == 2, T.float32(0.5), T.Select(i % 6 == 2 and j % 6 == 1, T.float32(-2.5), T.Select(i % 6 == 2 and j % 6 == 0, T.float32(-2), T.Select(i % 6 == 1 and j % 6 == 5, T.float32(1), T.Select(i % 6 == 1 and j % 6 == 4, T.float32(0.5), T.Select(i % 6 == 1 and j % 6 == 3, T.float32(-2), T.Select(i % 6 == 1 and j % 6 == 2, T.float32(-1), T.Select(i % 6 == 1 and j % 6 == 1, T.float32(1), T.Select(i % 6 == 1 and j % 6 == 0, T.float32(-1.5), T.Select(i % 6 == 0 and j % 6 == 5, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 4, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 3, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 2, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 1, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
+        for i0, i1, i2, i3, i4, i5 in T.grid(6, 6, 196, 64, 6, 6):
+            with T.block("data_pack"):
+                eps, nu, p, ci, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+                T.reads(input_tile[r_a, r_b, p, ci], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1])
+                T.writes(data_pack[eps, nu, p, ci])
+                T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"})
+                with T.init():
+                    data_pack[eps, nu, p, ci] = T.float32(0)
+                data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu]
+        for i0, i1, i2, i3, i4 in T.grid(6, 6, 196, 64, 64):
+            with T.block("bgemm"):
+                eps, nu, p, co, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4])
+                T.reads(data_pack[eps, nu, p, ci], p1[eps, nu, co, ci])
+                T.writes(bgemm[eps, nu, p, co])
+                T.block_attr({"layout_free_placeholders":[]})
+                with T.init():
+                    bgemm[eps, nu, p, co] = T.float32(0)
+                bgemm[eps, nu, p, co] = bgemm[eps, nu, p, co] + data_pack[eps, nu, p, ci] * p1[eps, nu, co, ci]
+        for i0, i1 in T.grid(6, 4):
+            with T.block("A"):
+                i, j = T.axis.remap("SS", [i0, i1])
+                T.reads()
+                T.writes(A[i, j])
+                T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"})
+                A[i, j] = T.Select(i % 6 == 5 and j % 4 == 3, T.float32(1), T.Select(i % 6 == 5 and j % 4 == 2, T.float32(0), T.Select(i % 6 == 5 and j % 4 == 1, T.float32(0), T.Select(i % 6 == 5 and j % 4 == 0, T.float32(0), T.Select(i % 6 == 4 and j % 4 == 3, T.float32(-8), T.Select(i % 6 == 4 and j % 4 == 2, T.float32(4), T.Select(i % 6 == 4 and j % 4 == 1, T.float32(-2), T.Select(i % 6 == 4 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 3 and j % 4 == 3, T.float32(0.125), T.Select(i % 6 == 3 and j % 4 == 2, T.float32(0.25), T.Select(i % 6 == 3 and j % 4 == 1, T.float32(0.5), T.Select(i % 6 == 3 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 3, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 6 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 6 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 6 == 1 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
+        for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 196, 64, 6, 6):
+            with T.block("inverse"):
+                vh, vw, p, co, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+                T.reads(bgemm[r_a, r_b, p, co], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1])
+                T.writes(inverse[vh, vw, p, co])
+                T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse.cuda"})
+                with T.init():
+                    inverse[vh, vw, p, co] = T.float32(0)
+                inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw]
+        for i0, i1, i2, i3 in T.grid(1, 56, 56, 64):
+            with T.block("conv2d_winograd"):
+                n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co])
+                T.writes(conv2d_winograd[n, h, w, co])
+                conv2d_winograd[n, h, w, co] = inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co]
+        for i0, i1, i2, i3 in T.grid(1, 56, 56, 64):
+            with T.block("T_add"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], p2[ax0, 0, 0, ax3])
+                T.writes(T_add[ax0, ax1, ax2, ax3])
+                T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + p2[ax0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(1, 56, 56, 64):
+            with T.block("T_add_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add[ax0, ax1, ax2, ax3], p3[ax0, ax1, ax2, ax3])
+                T.writes(T_add_1[ax0, ax1, ax2, ax3])
+                T_add_1[ax0, ax1, ax2, ax3] = T_add[ax0, ax1, ax2, ax3] + p3[ax0, ax1, ax2, ax3]
+        for i0, i1, i2, i3 in T.grid(1, 56, 56, 64):
+            with T.block("T_relu"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add_1[ax0, ax1, ax2, ax3])
+                T.writes(T_relu[ax0, ax1, ax2, ax3])
+                T_relu[ax0, ax1, ax2, ax3] = T.max(T_add_1[ax0, ax1, ax2, ax3], T.float32(0))
+
+
+@tvm.script.ir_module
+class Conv2dWinogradAddResidualRelu_scheduled:
+    @T.prim_func
+    def main(p0: T.Buffer[(1, 56, 56, 64), "float32"], p1: T.Buffer[(6, 6, 64, 64), "float32"], p2: T.Buffer[(1, 1, 1, 64), "float32"], p3: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 56, 56, 64), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
+        # body
+        # with T.block("root")
+        input_tile_local = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="local")
+        data_pack = T.alloc_buffer([6, 6, 196, 64], dtype="float32")
+        bgemm = T.alloc_buffer([6, 6, 196, 64], dtype="float32")
+        inverse = T.alloc_buffer([4, 4, 196, 64], dtype="float32")
+        bgemm_local = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="local")
+        data_pack_shared = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="shared")
+        p1_shared = T.alloc_buffer([6, 6, 64, 64], dtype="float32", scope="shared")
+        for i2_0_i3_0_i2_1_i3_1_fused_0 in T.thread_binding(98, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":1024, "pragma_unroll_explicit":1}):
+            for i2_0_i3_0_i2_1_i3_1_fused_1 in T.thread_binding(128, thread="threadIdx.x"):
+                for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1):
+                    with T.block("input_tile"):
+                        eps, nu = T.axis.remap("SS", [ax0, ax1])
+                        p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) // 896 * 14 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) % 112 // 8 + ax2)
+                        ci = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) % 896 // 112 * 8 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) % 8 + ax3)
+                        T.reads(p0[p // 196, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1, ci])
+                        T.writes(input_tile_local[eps, nu, p, ci])
+                        T.block_attr({"schedule_rule":"None"})
+                        input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 196 // 14 * 4 + eps and p % 196 // 14 * 4 + eps < 57 and 1 <= p % 14 * 4 + nu and p % 14 * 4 + nu < 57, p0[p // 196, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1, ci], T.float32(0), dtype="float32")
+                for i0 in T.unroll(6):
+                    for i1 in T.unroll(6):
+                        with T.block("data_pack_init"):
+                            eps, nu = T.axis.remap("SS", [i0, i1])
+                            p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) // 896 * 14 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) % 112 // 8)
+                            ci = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) % 896 // 112 * 8 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) % 8)
+                            T.reads()
+                            T.writes(data_pack[eps, nu, p, ci])
+                            T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"})
+                            data_pack[eps, nu, p, ci] = T.float32(0)
+                        for i4 in T.unroll(6):
+                            for i5 in T.unroll(6):
+                                with T.block("data_pack_update"):
+                                    eps, nu = T.axis.remap("SS", [i0, i1])
+                                    p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) // 896 * 14 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) % 112 // 8)
+                                    ci = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) % 896 // 112 * 8 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 128 + i2_0_i3_0_i2_1_i3_1_fused_1) % 8)
+                                    r_a, r_b = T.axis.remap("RR", [i4, i5])
+                                    T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci])
+                                    T.writes(data_pack[eps, nu, p, ci])
+                                    T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"})
+                                    data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_b % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_b % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_b % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_b % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_b % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_b % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_b % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_b % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_b % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_b % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_b % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_b % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_b % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_b % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
+        for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(168, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":1024, "pragma_unroll_explicit":1}):
+            for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"):
+                for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(48, thread="threadIdx.x"):
+                    for i0_3_init, i1_3_init, i2_3_init, i3_3_init, i0_4_init, i1_4_init, i2_4_init, i3_4_init in T.grid(1, 1, 14, 1, 1, 1, 1, 1):
+                        with T.block("bgemm_init"):
+                            eps = T.axis.spatial(6, i0_4_init + i0_1_i1_1_i2_1_i3_1_fused // 2 * 3 + i0_2_i1_2_i2_2_i3_2_fused // 16 + i0_3_init)
+                            nu = T.axis.spatial(6, i1_4_init + i0_0_i1_0_i2_0_i3_0_fused // 28 + i1_3_init)
+                            p = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 28 // 4 * 28 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 14 + i2_3_init + i2_4_init)
+                            co = T.axis.spatial(64, i3_4_init + i0_0_i1_0_i2_0_i3_0_fused % 4 * 16 + i0_2_i1_2_i2_2_i3_2_fused % 16 + i3_3_init)
+                            T.reads()
+                            T.writes(bgemm_local[eps, nu, p, co])
+                            T.block_attr({"layout_free_placeholders":[], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"})
+                            bgemm_local[eps, nu, p, co] = T.float32(0)
+                    for i4_0 in T.serial(2):
+                        for ax0_ax1_ax2_ax3_fused_0 in T.serial(28):
+                            for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(48, thread="threadIdx.x"):
+                                for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4):
+                                    with T.block("data_pack_shared"):
+                                        v0 = T.axis.spatial(6, (ax0_ax1_ax2_ax3_fused_0 * 192 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) // 896)
+                                        v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 28)
+                                        v2 = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 28 // 4 * 28 + (ax0_ax1_ax2_ax3_fused_0 * 192 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 896 // 32)
+                                        v3 = T.axis.spatial(64, i4_0 * 32 + (ax0_ax1_ax2_ax3_fused_0 * 192 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 32)
+                                        T.reads(data_pack[v0, v1, v2, v3])
+                                        T.writes(data_pack_shared[v0, v1, v2, v3])
+                                        data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3]
+                        for ax0_ax1_ax2_ax3_fused_0 in T.serial(16):
+                            for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(48, thread="threadIdx.x"):
+                                for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4):
+                                    with T.block("p1_shared"):
+                                        v0 = T.axis.spatial(6, (ax0_ax1_ax2_ax3_fused_0 * 192 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) // 512)
+                                        v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 28)
+                                        v2 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 4 * 16 + (ax0_ax1_ax2_ax3_fused_0 * 192 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 512 // 32)
+                                        v3 = T.axis.spatial(64, i4_0 * 32 + (ax0_ax1_ax2_ax3_fused_0 * 192 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 32)
+                                        T.reads(p1[v0, v1, v2, v3])
+                                        T.writes(p1_shared[v0, v1, v2, v3])
+                                        p1_shared[v0, v1, v2, v3] = p1[v0, v1, v2, v3]
+                        for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(2, 1, 1, 14, 1, 16, 1, 1, 1, 1):
+                            with T.block("bgemm_update"):
+                                eps = T.axis.spatial(6, i0_4 + i0_1_i1_1_i2_1_i3_1_fused // 2 * 3 + i0_2_i1_2_i2_2_i3_2_fused // 16 + i0_3)
+                                nu = T.axis.spatial(6, i1_4 + i0_0_i1_0_i2_0_i3_0_fused // 28 + i1_3)
+                                p = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 28 // 4 * 28 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 14 + i2_3 + i2_4)
+                                co = T.axis.spatial(64, i3_4 + i0_0_i1_0_i2_0_i3_0_fused % 4 * 16 + i0_2_i1_2_i2_2_i3_2_fused % 16 + i3_3)
+                                ci = T.axis.reduce(64, i4_0 * 32 + i4_1 * 16 + i4_2)
+                                T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], p1_shared[eps, nu, co, ci])
+                                T.writes(bgemm_local[eps, nu, p, co])
+                                T.block_attr({"layout_free_placeholders":[], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"})
+                                bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * p1_shared[eps, nu, co, ci]
+                    for ax0, ax1, ax2, ax3 in T.grid(1, 1, 14, 1):
+                        with T.block("bgemm_local"):
+                            v0 = T.axis.spatial(6, i0_1_i1_1_i2_1_i3_1_fused // 2 * 3 + i0_2_i1_2_i2_2_i3_2_fused // 16 + ax0)
+                            v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 28 + ax1)
+                            v2 = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 28 // 4 * 28 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 14 + ax2)
+                            v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 4 * 16 + i0_2_i1_2_i2_2_i3_2_fused % 16 + ax3)
+                            T.reads(bgemm_local[v0, v1, v2, v3])
+                            T.writes(bgemm[v0, v1, v2, v3])
+                            bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3]
+        for i2_0_i3_0_i2_1_i3_1_fused_0 in T.thread_binding(25, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":1024, "pragma_unroll_explicit":1}):
+            for i2_0_i3_0_i2_1_i3_1_fused_1 in T.thread_binding(512, thread="threadIdx.x"):
+                for i0 in T.unroll(4):
+                    for i1 in T.unroll(4):
+                        with T.block("inverse_init"):
+                            T.where(i2_0_i3_0_i2_1_i3_1_fused_0 * 512 + i2_0_i3_0_i2_1_i3_1_fused_1 < 12544)
+                            vh, vw = T.axis.remap("SS", [i0, i1])
+                            p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 512 + i2_0_i3_0_i2_1_i3_1_fused_1) // 448 * 7 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 512 + i2_0_i3_0_i2_1_i3_1_fused_1) % 224 // 32)
+                            co = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 512 + i2_0_i3_0_i2_1_i3_1_fused_1) % 448 // 224 * 32 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 512 + i2_0_i3_0_i2_1_i3_1_fused_1) % 32)
+                            T.reads()
+                            T.writes(inverse[vh, vw, p, co])
+                            T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse.cuda"})
+                            inverse[vh, vw, p, co] = T.float32(0)
+                        for i4 in T.unroll(6):
+                            for i5 in T.unroll(6):
+                                with T.block("inverse_update"):
+                                    T.where(i2_0_i3_0_i2_1_i3_1_fused_0 * 512 + i2_0_i3_0_i2_1_i3_1_fused_1 < 12544)
+                                    vh, vw = T.axis.remap("SS", [i0, i1])
+                                    p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 512 + i2_0_i3_0_i2_1_i3_1_fused_1) // 448 * 7 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 512 + i2_0_i3_0_i2_1_i3_1_fused_1) % 224 // 32)
+                                    co = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 512 + i2_0_i3_0_i2_1_i3_1_fused_1) % 448 // 224 * 32 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 512 + i2_0_i3_0_i2_1_i3_1_fused_1) % 32)
+                                    r_a, r_b = T.axis.remap("RR", [i4, i5])
+                                    T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co])
+                                    T.writes(inverse[vh, vw, p, co])
+                                    T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse.cuda"})
+                                    inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 6 == 5 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 5 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 0, T.float32(0), T.Select(r_a % 6 == 4 and vh % 4 == 3, T.float32(-8), T.Select(r_a % 6 == 4 and vh % 4 == 2, T.float32(4), T.Select(r_a % 6 == 4 and vh % 4 == 1, T.float32(-2), T.Select(r_a % 6 == 4 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 3 and vh % 4 == 3, T.float32(0.125), T.Select(r_a % 6 == 3 and vh % 4 == 2, T.float32(0.25), T.Select(r_a % 6 == 3 and vh % 4 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 1, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 3, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 1, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 0 and vh % 4 == 3, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 5 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 0, T.float32(0), T.Select(r_b % 6 == 4 and vw % 4 == 3, T.float32(-8), T.Select(r_b % 6 == 4 and vw % 4 == 2, T.float32(4), T.Select(r_b % 6 == 4 and vw % 4 == 1, T.float32(-2), T.Select(r_b % 6 == 4 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 3 and vw % 4 == 3, T.float32(0.125), T.Select(r_b % 6 == 3 and vw % 4 == 2, T.float32(0.25), T.Select(r_b % 6 == 3 and vw % 4 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 1, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 3, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 1, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 0 and vw % 4 == 3, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
+        for i0_i1_i2_i3_fused_0 in T.thread_binding(1568, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":1024, "pragma_unroll_explicit":1}):
+            for i0_i1_i2_i3_fused_1 in T.thread_binding(128, thread="threadIdx.x"):
+                with T.block("conv2d_winograd"):
+                    n = T.axis.spatial(1, 0)
+                    h = T.axis.spatial(56, (i0_i1_i2_i3_fused_0 * 128 + i0_i1_i2_i3_fused_1) // 3584)
+                    w = T.axis.spatial(56, (i0_i1_i2_i3_fused_0 * 128 + i0_i1_i2_i3_fused_1) % 3584 // 64)
+                    co = T.axis.spatial(64, (i0_i1_i2_i3_fused_0 * 128 + i0_i1_i2_i3_fused_1) % 64)
+                    T.reads(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co], p2[n, 0, 0, co], p3[n, h, w, co])
+                    T.writes(T_relu[n, h, w, co])
+                    T_relu[n, h, w, co] = T.max(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co] + p2[n, 0, 0, co] + p3[n, h, w, co], T.float32(0))
+
+
+# fmt: on
+def verify(anchor_mod, anchor_trace_fun, target_mod, target, ref):
+    anchor_sch = Schedule(anchor_mod)
+    anchor_trace_fun(anchor_sch)
+    anchor_trace = anchor_sch.trace
+
+    sch = Schedule(target_mod)
+
+    ms.trace_apply.schedule_using_anchor_trace(sch, anchor_trace, Target(target))
+
+    tvm.ir.assert_structural_equal(ref, sch.mod)
+
+
+def test_dense_add_cpu():
+    def apply_anchor_trace(sch: Schedule) -> None:
+        b0 = sch.get_block(name="T_matmul_NT", func_name="main")
+        b1 = sch.get_block(name="root", func_name="main")
+        sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")
+        l2, l3, l4 = sch.get_loops(block=b0)
+        v5, v6, v7, v8 = sch.sample_perfect_tile(
+            loop=l2, n=4, max_innermost_factor=64, decision=[2, 8, 4, 2]
+        )
+        l9, l10, l11, l12 = sch.split(loop=l2, factors=[v5, v6, v7, v8], preserve_unit_iters=True)
+        v13, v14, v15, v16 = sch.sample_perfect_tile(
+            loop=l3, n=4, max_innermost_factor=64, decision=[2, 1, 1, 64]
+        )
+        l17, l18, l19, l20 = sch.split(
+            loop=l3, factors=[v13, v14, v15, v16], preserve_unit_iters=True
+        )
+        v21, v22 = sch.sample_perfect_tile(loop=l4, n=2, max_innermost_factor=64, decision=[128, 1])
+        l23, l24 = sch.split(loop=l4, factors=[v21, v22], preserve_unit_iters=True)
+        sch.reorder(l9, l17, l10, l18, l23, l11, l19, l24, l12, l20)
+        b25 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="global")
+        sch.reverse_compute_at(block=b25, loop=l17, preserve_unit_loops=True, index=-1)
+        sch.annotate(block_or_loop=b1, ann_key="meta_schedule.parallel", ann_val=160)
+        sch.annotate(block_or_loop=b1, ann_key="meta_schedule.vectorize", ann_val=64)
+        v26 = sch.sample_categorical(
+            candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25], decision=0
+        )
+        sch.annotate(block_or_loop=b1, ann_key="meta_schedule.unroll_explicit", ann_val=v26)
+        sch.enter_postproc()
+        b27 = sch.get_block(name="root", func_name="main")
+        sch.unannotate(block_or_loop=b27, ann_key="meta_schedule.parallel")
+        sch.unannotate(block_or_loop=b27, ann_key="meta_schedule.vectorize")
+        sch.unannotate(block_or_loop=b27, ann_key="meta_schedule.unroll_explicit")
+        b28, b29 = sch.get_child_blocks(b27)
+        l30, l31, l32, l33, l34, l35, l36, l37, l38, l39 = sch.get_loops(block=b28)
+        l40 = sch.fuse(l30, l31, preserve_unit_iters=True)
+        sch.parallel(loop=l40)
+        l41 = sch.fuse(l39, preserve_unit_iters=True)
+        sch.vectorize(loop=l41)
+        l42, l43, l44 = sch.get_loops(block=b29)
+        l45 = sch.fuse(l42, preserve_unit_iters=True)
+        sch.parallel(loop=l45)
+        l46 = sch.fuse(l44, preserve_unit_iters=True)
+        sch.vectorize(loop=l46)
+        b47 = sch.get_block(name="T_matmul_NT", func_name="main")
+        l48, l49, l50, l51, l52, l53, l54, l55, l56 = sch.get_loops(block=b47)
+        b57 = sch.decompose_reduction(block=b47, loop=l51)
+        b58 = sch.get_block(name="T_matmul_NT_update", func_name="main")
+        b59 = sch.cache_read(block=b58, read_buffer_index=2, storage_scope="global")
+        sch.transform_layout(
+            block=b58,
+            buffer=("read", 2),
+            index_map=tvm.tir.IndexMap.from_func(
+                lambda i0, i1: (
+                    floordiv(i0, 64),
+                    i1,
+                    floormod(i0, 64),
+                ),
+                inverse_index_map=lambda i0, i1, i2: (
+                    ((i0 * 64) + i2),
+                    i1,
+                ),
+            ),
+            pad_value=None,
+        )
+        sch.annotate(block_or_loop=b59, ann_key="meta_schedule.layout_rewrite_preproc", ann_val=1)
+
+    verify(Dense, apply_anchor_trace, DenseAdd, "llvm", DenseAdd_scheduled_cpu)
+
+
+def test_dense_add_cpu_no_write_cache():
+    def apply_trace(sch):
+        b0 = sch.get_block(name="T_matmul_NT", func_name="main")
+        b1 = sch.get_block(name="root", func_name="main")
+        sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")
+        l2, l3, l4 = sch.get_loops(block=b0)
+        v5, v6, v7, v8 = sch.sample_perfect_tile(
+            loop=l2, n=4, max_innermost_factor=64, decision=[4, 4, 4, 2]
+        )
+        l9, l10, l11, l12 = sch.split(loop=l2, factors=[v5, v6, v7, v8], preserve_unit_iters=True)
+        v13, v14, v15, v16 = sch.sample_perfect_tile(
+            loop=l3, n=4, max_innermost_factor=64, decision=[1, 1, 4, 32]
+        )
+        l17, l18, l19, l20 = sch.split(
+            loop=l3, factors=[v13, v14, v15, v16], preserve_unit_iters=True
+        )
+        v21, v22 = sch.sample_perfect_tile(loop=l4, n=2, max_innermost_factor=64, decision=[8, 16])
+        l23, l24 = sch.split(loop=l4, factors=[v21, v22], preserve_unit_iters=True)
+        sch.reorder(l9, l17, l10, l18, l23, l11, l19, l24, l12, l20)
+        sch.annotate(block_or_loop=b1, ann_key="meta_schedule.parallel", ann_val=160)
+        sch.annotate(block_or_loop=b1, ann_key="meta_schedule.vectorize", ann_val=64)
+        v25 = sch.sample_categorical(
+            candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25], decision=1
+        )
+        sch.annotate(block_or_loop=b1, ann_key="meta_schedule.unroll_explicit", ann_val=v25)
+        sch.enter_postproc()
+        b26 = sch.get_block(name="root", func_name="main")
+        sch.unannotate(block_or_loop=b26, ann_key="meta_schedule.parallel")
+        sch.unannotate(block_or_loop=b26, ann_key="meta_schedule.vectorize")
+        sch.unannotate(block_or_loop=b26, ann_key="meta_schedule.unroll_explicit")
+        (b27,) = sch.get_child_blocks(b26)
+        l28, l29, l30, l31, l32, l33, l34, l35, l36, l37 = sch.get_loops(block=b27)
+        l38 = sch.fuse(l28, l29, l30, l31, preserve_unit_iters=True)
+        sch.parallel(loop=l38)
+        l39 = sch.fuse(l37, preserve_unit_iters=True)
+        sch.vectorize(loop=l39)
+        sch.annotate(block_or_loop=l38, ann_key="pragma_auto_unroll_max_step", ann_val=16)
+        sch.annotate(block_or_loop=l38, ann_key="pragma_unroll_explicit", ann_val=1)
+        b40 = sch.get_block(name="T_matmul_NT", func_name="main")
+        l41, l42, l43, l44, l45, l46, l47 = sch.get_loops(block=b40)
+        b48 = sch.decompose_reduction(block=b40, loop=l42)
+        b49 = sch.get_block(name="T_matmul_NT_update", func_name="main")
+        b50 = sch.cache_read(block=b49, read_buffer_index=2, storage_scope="global")
+        sch.transform_layout(
+            block=b49,
+            buffer=("read", 2),
+            index_map=tvm.tir.IndexMap.from_func(
+                lambda i0, i1: (
+                    floordiv(i1, 16),
+                    floordiv(i0, 32),
+                    floormod(i1, 16),
+                    floormod(i0, 32),
+                ),
+                inverse_index_map=lambda i0, i1, i2, i3: (
+                    ((i1 * 32) + i3),
+                    ((i0 * 16) + i2),
+                ),
+            ),
+            pad_value=None,
+        )
+        sch.annotate(block_or_loop=b50, ann_key="meta_schedule.layout_rewrite_preproc", ann_val=1)
+
+    verify(Dense, apply_trace, DenseAdd, "llvm", DenseAdd_cpu_no_write_cache)
+
+
+def test_dense_add_gpu():
+    def apply_anchor_trace(sch: Schedule) -> None:
+        b0 = sch.get_block(name="T_matmul_NT", func_name="main")
+        b1 = sch.get_block(name="root", func_name="main")
+        sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
+        l2, l3, l4 = sch.get_loops(block=b0)
+        v5, v6, v7, v8, v9 = sch.sample_perfect_tile(
+            loop=l2, n=5, max_innermost_factor=64, decision=[8, 1, 16, 1, 1]
+        )
+        l10, l11, l12, l13, l14 = sch.split(
+            loop=l2, factors=[v5, v6, v7, v8, v9], preserve_unit_iters=True
+        )
+        v15, v16, v17, v18, v19 = sch.sample_perfect_tile(
+            loop=l3, n=5, max_innermost_factor=64, decision=[4, 1, 8, 4, 1]
+        )
+        l20, l21, l22, l23, l24 = sch.split(
+            loop=l3, factors=[v15, v16, v17, v18, v19], preserve_unit_iters=True
+        )
+        v25, v26, v27 = sch.sample_perfect_tile(
+            loop=l4, n=3, max_innermost_factor=64, decision=[32, 1, 4]
+        )
+        l28, l29, l30 = sch.split(loop=l4, factors=[v25, v26, v27], preserve_unit_iters=True)
+        sch.reorder(l10, l20, l11, l21, l12, l22, l28, l29, l13, l23, l30, l14, l24)
+        l31 = sch.fuse(l10, l20, preserve_unit_iters=True)
+        sch.bind(loop=l31, thread_axis="blockIdx.x")
+        l32 = sch.fuse(l11, l21, preserve_unit_iters=True)
+        sch.bind(loop=l32, thread_axis="vthread.x")
+        l33 = sch.fuse(l12, l22, preserve_unit_iters=True)
+        sch.bind(loop=l33, thread_axis="threadIdx.x")
+        sch.annotate(
+            block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=16
+        )
+        sch.annotate(
+            block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=256
+        )
+        b34 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")
+        sch.reverse_compute_at(block=b34, loop=l33, preserve_unit_loops=True, index=-1)
+        b35 = sch.cache_read(
+            block=b0, read_buffer_index=0, storage_scope="shared", consumer_blocks=[b0]
+        )
+        sch.compute_at(block=b35, loop=l28, preserve_unit_loops=True, index=-1)
+        l36, l37, l38, l39, l40, l41 = sch.get_loops(block=b35)
+        l42 = sch.fuse(l40, l41, preserve_unit_iters=True)
+        v43 = sch.sample_categorical(
+            candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1
+        )
+        sch.annotate(block_or_loop=b35, ann_key="meta_schedule.cooperative_fetch", ann_val=v43)
+        b44 = sch.cache_read(
+            block=b0, read_buffer_index=1, storage_scope="shared", consumer_blocks=[b0]
+        )
+        sch.compute_at(block=b44, loop=l28, preserve_unit_loops=True, index=-1)
+        l45, l46, l47, l48, l49, l50 = sch.get_loops(block=b44)
+        l51 = sch.fuse(l49, l50, preserve_unit_iters=True)
+        v52 = sch.sample_categorical(
+            candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3
+        )
+        sch.annotate(block_or_loop=b44, ann_key="meta_schedule.cooperative_fetch", ann_val=v52)
+        v53 = sch.sample_categorical(
+            candidates=[0, 16, 64, 512, 1024],
+            probs=[
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+            ],
+            decision=2,
+        )
+        sch.annotate(block_or_loop=b1, ann_key="meta_schedule.unroll_explicit", ann_val=v53)
+        sch.enter_postproc()
+        sch.unannotate(block_or_loop=b35, ann_key="meta_schedule.cooperative_fetch")
+        l54, l55, l56, l57, l58 = sch.get_loops(block=b35)
+        l59, l60, l61 = sch.split(loop=l58, factors=[None, 128, 2], preserve_unit_iters=True)
+        sch.vectorize(loop=l61)
+        sch.bind(loop=l60, thread_axis="threadIdx.x")
+        sch.unannotate(block_or_loop=b44, ann_key="meta_schedule.cooperative_fetch")
+        l62, l63, l64, l65, l66 = sch.get_loops(block=b44)
+        l67, l68, l69 = sch.split(loop=l66, factors=[None, 128, 4], preserve_unit_iters=True)
+        sch.vectorize(loop=l69)
+        sch.bind(loop=l68, thread_axis="threadIdx.x")
+        b70 = sch.get_block(name="root", func_name="main")
+        sch.unannotate(block_or_loop=b70, ann_key="meta_schedule.unroll_explicit")
+        b71, b72, b73, b74 = sch.get_child_blocks(b70)
+        l75, l76, l77, l78, l79, l80, l81 = sch.get_loops(block=b71)
+        sch.annotate(block_or_loop=l75, ann_key="pragma_auto_unroll_max_step", ann_val=64)
+        sch.annotate(block_or_loop=l75, ann_key="pragma_unroll_explicit", ann_val=1)
+        l82, l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b72)
+        sch.annotate(block_or_loop=l82, ann_key="pragma_auto_unroll_max_step", ann_val=64)
+        sch.annotate(block_or_loop=l82, ann_key="pragma_unroll_explicit", ann_val=1)
+        l89, l90, l91, l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b73)
+        sch.annotate(block_or_loop=l89, ann_key="pragma_auto_unroll_max_step", ann_val=64)
+        sch.annotate(block_or_loop=l89, ann_key="pragma_unroll_explicit", ann_val=1)
+        l99, l100, l101, l102, l103 = sch.get_loops(block=b74)
+        sch.annotate(block_or_loop=l99, ann_key="pragma_auto_unroll_max_step", ann_val=64)
+        sch.annotate(block_or_loop=l99, ann_key="pragma_unroll_explicit", ann_val=1)
+        b104 = sch.get_block(name="T_matmul_NT", func_name="main")
+        l105, l106, l107, l108, l109, l110, l111, l112, l113, l114 = sch.get_loops(block=b104)
+        b115 = sch.decompose_reduction(block=b104, loop=l108)
+
+    verify(Dense, apply_anchor_trace, DenseAdd, "cuda", DenseAdd_scheduled_gpu)
+
+
+def test_conv2d_int8_tensorcore():
+    def apply_trace(sch):
+        b0 = sch.get_block(name="pad_temp", func_name="main")
+        b1 = sch.get_block(name="conv2d_nhwc", func_name="main")
+        b2 = sch.get_block(name="T_subtract", func_name="main")
+        b3 = sch.get_block(name="T_add", func_name="main")
+        b4 = sch.get_block(name="T_cast", func_name="main")
+        b5 = sch.get_block(name="T_multiply", func_name="main")
+        b6 = sch.get_block(name="T_add_1", func_name="main")
+        b7 = sch.get_block(name="T_right_shift", func_name="main")
+        b8 = sch.get_block(name="T_cast_1", func_name="main")
+        b9 = sch.get_block(name="T_add_2", func_name="main")
+        b10 = sch.get_block(name="compute", func_name="main")
+        b11 = sch.get_block(name="T_cast_2", func_name="main")
+        b12 = sch.get_block(name="T_cast_3", func_name="main")
+        b13 = sch.get_block(name="T_subtract_1", func_name="main")
+        b14 = sch.get_block(name="compute_1", func_name="main")
+        b15 = sch.get_block(name="root", func_name="main")
+        sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
+        b16 = sch.reindex(block=b1, buffer=("write", 0))
+        b17 = sch.reindex(block=b1, buffer=("read", 0))
+        b18 = sch.reindex(block=b1, buffer=("read", 1))
+        sch.transform_layout(
+            block=b1,
+            buffer=("read", 0),
+            index_map=lambda nn, yy, xx, rc: (
+                (((nn * 3136) + (yy * 56)) + xx),
+                rc,
+            ),
+            pad_value=None,
+        )
+        sch.transform_layout(
+            block=b1,
+            buffer=("read", 1),
+            index_map=lambda ff, ry, rx, rc: (
+                ry,
+                rx,
+                ff,
+                rc,
+            ),
+            pad_value=None,
+        )
+        sch.transform_layout(
+            block=b1,
+            buffer=("write", 0),
+            index_map=lambda nn, yy, xx, ff: (
+                (((nn * 3136) + (yy * 56)) + xx),
+                ff,
+            ),
+            pad_value=None,
+        )
+        sch.transform_block_layout(
+            block=b16,
+            index_map=lambda nn, yy, xx, ff: (
+                (((nn * 3136) + (yy * 56)) + xx),
+                ff,
+            ),
+        )
+        sch.transform_block_layout(
+            block=b17,
+            index_map=lambda nn, yy, xx, rc: (
+                (((nn * 3136) + (yy * 56)) + xx),
+                rc,
+            ),
+        )
+        sch.transform_block_layout(
+            block=b18,
+            index_map=lambda ff, ry, rx, rc: (
+                ry,
+                rx,
+                ff,
+                rc,
+            ),
+        )
+        sch.transform_block_layout(
+            block=b1,
+            index_map=lambda nn, yy, xx, ff, ry, rx, rc: (
+                ry,
+                rx,
+                (((nn * 3136) + (yy * 56)) + xx),
+                ff,
+                rc,
+            ),
+        )
+        l19, l20, l21, l22, l23 = sch.get_loops(block=b1)
+        l24, l25 = sch.split(loop=l23, factors=[None, 16], preserve_unit_iters=True)
+        l26, l27 = sch.split(loop=l22, factors=[None, 16], preserve_unit_iters=True)
+        l28, l29 = sch.split(loop=l21, factors=[None, 16], preserve_unit_iters=True)
+        l30, l31, l32, l33, l34, l35, l36, l37 = sch.get_loops(block=b1)
+        sch.reorder(l34, l36, l29, l27, l25)
+        b38 = sch.blockize(loop=l29)
+        sch.annotate(
+            block_or_loop=b38,
+            ann_key="meta_schedule.auto_tensorize",
+            ann_val="wmma_sync_16x16x16_s8s8s32_trans",
+        )
+        sch.annotate(
+            block_or_loop=b38,
+            ann_key="meta_schedule.auto_tensorize_init",
+            ann_val="wmma_fill_16x16x16_s32",
+        )
+        sch.annotate(block_or_loop=b38, ann_key="warp_execution", ann_val=1)
+        l39, l40, l41, l42, l43 = sch.get_loops(block=b38)
+        v44, v45, v46 = sch.sample_perfect_tile(
+            loop=l39, n=3, max_innermost_factor=4, decision=[1, 1, 1]
+        )
+        l47, l48, l49 = sch.split(loop=l39, factors=[v44, v45, v46], preserve_unit_iters=True)
+        v50, v51, v52 = sch.sample_perfect_tile(
+            loop=l40, n=3, max_innermost_factor=4, decision=[1, 1, 1]
+        )
+        l53, l54, l55 = sch.split(loop=l40, factors=[v50, v51, v52], preserve_unit_iters=True)
+        v56, v57, v58, v59, v60 = sch.sample_perfect_tile(
+            loop=l41, n=5, max_innermost_factor=4, decision=[392, 1, 8, 1, 1]
+        )
+        l61, l62, l63, l64, l65 = sch.split(
+            loop=l41, factors=[v56, v57, v58, v59, v60], preserve_unit_iters=True
+        )
+        v66, v67, v68, v69, v70 = sch.sample_perfect_tile(
+            loop=l42, n=5, max_innermost_factor=4, decision=[8, 1, 2, 1, 1]
+        )
+        l71, l72, l73, l74, l75 = sch.split(
+            loop=l42, factors=[v66, v67, v68, v69, v70], preserve_unit_iters=True
+        )
+        v76, v77, v78 = sch.sample_perfect_tile(
+            loop=l43, n=3, max_innermost_factor=4, decision=[2, 1, 2]
+        )
+        l79, l80, l81 = sch.split(loop=l43, factors=[v76, v77, v78], preserve_unit_iters=True)
+        sch.reorder(
+            l61,
+            l71,
+            l62,
+            l72,
+            l63,
+            l73,
+            l47,
+            l53,
+            l79,
+            l48,
+            l54,
+            l80,
+            l64,
+            l74,
+            l49,
+            l55,
+            l81,
+            l65,
+            l75,
+        )
+        l82 = sch.fuse(l61, l71, preserve_unit_iters=True)
+        sch.bind(loop=l82, thread_axis="blockIdx.x")
+        l83 = sch.fuse(l62, l72, preserve_unit_iters=True)
+        sch.bind(loop=l83, thread_axis="vthread.x")
+        l84 = sch.fuse(l63, l73, preserve_unit_iters=True)
+        sch.bind(loop=l84, thread_axis="threadIdx.x")
+        sch.annotate(
+            block_or_loop=b38, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32
+        )
+        sch.annotate(
+            block_or_loop=b38, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024
+        )
+        b85 = sch.cache_write(block=b38, write_buffer_index=0, storage_scope="shared")
+        sch.reverse_compute_at(block=b85, loop=l83, preserve_unit_loops=True, index=-1)
+        b86 = sch.cache_write(block=b38, write_buffer_index=0, storage_scope="wmma.accumulator")
+        sch.reverse_compute_at(block=b86, loop=l84, preserve_unit_loops=True, index=-1)
+        v87 = sch.sample_categorical(
+            candidates=[1, 2, 3, 4, 8, 16],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=0,
+        )
+        sch.annotate(block_or_loop=b85, ann_key="meta_schedule.cooperative_fetch", ann_val=v87)
+        sch.reverse_compute_inline(block=b16)
+        l88, l89, l90, l91, l92 = sch.get_loops(block=b86)
+        l93, l94 = sch.split(loop=l92, factors=[None, 16], preserve_unit_iters=True)
+        l95, l96 = sch.split(loop=l91, factors=[None, 16], preserve_unit_iters=True)
+        l97, l98, l99, l100, l101, l102, l103 = sch.get_loops(block=b86)
+        sch.reorder(l102, l96, l94)
+        b104 = sch.blockize(loop=l96)
+        sch.annotate(
+            block_or_loop=b104,
+            ann_key="meta_schedule.auto_tensorize",
+            ann_val="wmma_store_16x16x16_s32_shared",
+        )
+        b105 = sch.cache_read(
+            block=b38, read_buffer_index=0, storage_scope="shared", consumer_blocks=[b38]
+        )
+        sch.compute_at(block=b105, loop=l79, preserve_unit_loops=True, index=-1)
+        l106, l107, l108, l109, l110, l111, l112, l113 = sch.get_loops(block=b105)
+        l114 = sch.fuse(l112, l113, preserve_unit_iters=True)
+        v115 = sch.sample_categorical(
+            candidates=[1, 2, 3, 4, 8, 16],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=5,
+        )
+        sch.annotate(block_or_loop=b105, ann_key="meta_schedule.cooperative_fetch", ann_val=v115)
+        b116 = sch.cache_read(
+            block=b38, read_buffer_index=1, storage_scope="shared", consumer_blocks=[b38]
+        )
+        sch.compute_at(block=b116, loop=l79, preserve_unit_loops=True, index=-1)
+        l117, l118, l119, l120, l121, l122, l123, l124, l125, l126 = sch.get_loops(block=b116)
+        l127 = sch.fuse(l123, l124, l125, l126, preserve_unit_iters=True)
+        v128 = sch.sample_categorical(
+            candidates=[1, 2, 3, 4, 8, 16],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=4,
+        )
+        sch.annotate(block_or_loop=b116, ann_key="meta_schedule.cooperative_fetch", ann_val=v128)
+        b129 = sch.cache_read(block=b38, read_buffer_index=0, storage_scope="wmma.matrix_a")
+        sch.compute_at(block=b129, loop=l80, preserve_unit_loops=True, index=-1)
+        l130, l131, l132, l133, l134, l135, l136, l137, l138, l139, l140 = sch.get_loops(block=b129)
+        l141, l142 = sch.split(loop=l140, factors=[None, 16], preserve_unit_iters=True)
+        l143, l144 = sch.split(loop=l139, factors=[None, 16], preserve_unit_iters=True)
+        (
+            l145,
+            l146,
+            l147,
+            l148,
+            l149,
+            l150,
+            l151,
+            l152,
+            l153,
+            l154,
+            l155,
+            l156,
+            l157,
+        ) = sch.get_loops(block=b129)
+        sch.reorder(l156, l144, l142)
+        b158 = sch.blockize(loop=l144)
+        sch.annotate(
+            block_or_loop=b158,
+            ann_key="meta_schedule.auto_tensorize",
+            ann_val="wmma_load_16x16x16_s8_a",
+        )
+        b159 = sch.cache_read(block=b38, read_buffer_index=1, storage_scope="wmma.matrix_b")
+        sch.compute_at(block=b159, loop=l80, preserve_unit_loops=True, index=-1)
+        (
+            l160,
+            l161,
+            l162,
+            l163,
+            l164,
+            l165,
+            l166,
+            l167,
+            l168,
+            l169,
+            l170,
+            l171,
+            l172,
+        ) = sch.get_loops(block=b159)
+        l173, l174 = sch.split(loop=l172, factors=[None, 16], preserve_unit_iters=True)
+        l175, l176 = sch.split(loop=l171, factors=[None, 16], preserve_unit_iters=True)
+        (
+            l177,
+            l178,
+            l179,
+            l180,
+            l181,
+            l182,
+            l183,
+            l184,
+            l185,
+            l186,
+            l187,
+            l188,
+            l189,
+            l190,
+            l191,
+        ) = sch.get_loops(block=b159)
+        sch.reorder(l190, l176, l174)
+        b192 = sch.blockize(loop=l176)
+        sch.annotate(
+            block_or_loop=b192,
+            ann_key="meta_schedule.auto_tensorize",
+            ann_val="wmma_load_16x16x16_s8_b_trans",
+        )
+        sch.compute_inline(block=b17)
+        sch.compute_inline(block=b18)
+        sch.storage_align(block=b105, buffer_index=0, axis=-2, factor=32, offset=16)
+        sch.storage_align(block=b116, buffer_index=0, axis=-2, factor=32, offset=16)
+        sch.reverse_compute_inline(block=b14)
+        sch.reverse_compute_inline(block=b13)
+        sch.reverse_compute_inline(block=b12)
+        sch.reverse_compute_inline(block=b11)
+        sch.reverse_compute_inline(block=b10)
+        sch.reverse_compute_inline(block=b9)
+        sch.reverse_compute_inline(block=b8)
+        sch.reverse_compute_inline(block=b7)
+        sch.reverse_compute_inline(block=b6)
+        sch.reverse_compute_inline(block=b5)
+        sch.reverse_compute_inline(block=b4)
+        sch.reverse_compute_inline(block=b3)
+        sch.reverse_compute_inline(block=b2)
+        sch.compute_inline(block=b0)
+        v193 = sch.sample_categorical(
+            candidates=[0, 16, 64, 512, 1024],
+            probs=[
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+            ],
+            decision=3,
+        )
+        sch.annotate(block_or_loop=b15, ann_key="meta_schedule.unroll_explicit", ann_val=v193)
+        sch.enter_postproc()
+        sch.unannotate(block_or_loop=b85, ann_key="meta_schedule.cooperative_fetch")
+        l194, l195, l196, l197 = sch.get_loops(block=b85)
+        l198, l199 = sch.split(loop=l197, factors=[None, 16], preserve_unit_iters=True)
+        sch.bind(loop=l199, thread_axis="threadIdx.x")
+        sch.unannotate(block_or_loop=b105, ann_key="meta_schedule.cooperative_fetch")
+        l200, l201, l202, l203, l204, l205, l206 = sch.get_loops(block=b105)
+        l207, l208, l209 = sch.split(loop=l206, factors=[None, 16, 16], preserve_unit_iters=True)
+        sch.vectorize(loop=l209)
+        sch.bind(loop=l208, thread_axis="threadIdx.x")
+        sch.unannotate(block_or_loop=b116, ann_key="meta_schedule.cooperative_fetch")
+        l210, l211, l212, l213, l214, l215, l216 = sch.get_loops(block=b116)
+        l217, l218, l219 = sch.split(loop=l216, factors=[None, 16, 8], preserve_unit_iters=True)
+        sch.vectorize(loop=l219)
+        sch.bind(loop=l218, thread_axis="threadIdx.x")
+        b220 = sch.get_block(name="root", func_name="main")
+        sch.unannotate(block_or_loop=b220, ann_key="meta_schedule.unroll_explicit")
+        b221, b222, b223, b224, b225, b226, b227 = sch.get_child_blocks(b220)
+        l228, l229, l230, l231, l232, l233, l234, l235, l236 = sch.get_loops(block=b221)
+        sch.annotate(block_or_loop=l228, ann_key="pragma_auto_unroll_max_step", ann_val=512)
+        sch.annotate(block_or_loop=l228, ann_key="pragma_unroll_explicit", ann_val=1)
+        l237, l238, l239, l240, l241, l242, l243, l244, l245 = sch.get_loops(block=b222)
+        sch.annotate(block_or_loop=l237, ann_key="pragma_auto_unroll_max_step", ann_val=512)
+        sch.annotate(block_or_loop=l237, ann_key="pragma_unroll_explicit", ann_val=1)
+        l246, l247, l248, l249, l250, l251, l252, l253, l254, l255, l256 = sch.get_loops(block=b223)
+        sch.annotate(block_or_loop=l246, ann_key="pragma_auto_unroll_max_step", ann_val=512)
+        sch.annotate(block_or_loop=l246, ann_key="pragma_unroll_explicit", ann_val=1)
+        (
+            l257,
+            l258,
+            l259,
+            l260,
+            l261,
+            l262,
+            l263,
+            l264,
+            l265,
+            l266,
+            l267,
+            l268,
+            l269,
+        ) = sch.get_loops(block=b224)
+        sch.annotate(block_or_loop=l257, ann_key="pragma_auto_unroll_max_step", ann_val=512)
+        sch.annotate(block_or_loop=l257, ann_key="pragma_unroll_explicit", ann_val=1)
+        (
+            l270,
+            l271,
+            l272,
+            l273,
+            l274,
+            l275,
+            l276,
+            l277,
+            l278,
+            l279,
+            l280,
+            l281,
+            l282,
+            l283,
+            l284,
+            l285,
+        ) = sch.get_loops(block=b225)
+        sch.annotate(block_or_loop=l270, ann_key="pragma_auto_unroll_max_step", ann_val=512)
+        sch.annotate(block_or_loop=l270, ann_key="pragma_unroll_explicit", ann_val=1)
+        l286, l287, l288, l289, l290 = sch.get_loops(block=b226)
+        sch.annotate(block_or_loop=l286, ann_key="pragma_auto_unroll_max_step", ann_val=512)
+        sch.annotate(block_or_loop=l286, ann_key="pragma_unroll_explicit", ann_val=1)
+        l291, l292, l293, l294, l295 = sch.get_loops(block=b227)
+        sch.annotate(block_or_loop=l291, ann_key="pragma_auto_unroll_max_step", ann_val=512)
+        sch.annotate(block_or_loop=l291, ann_key="pragma_unroll_explicit", ann_val=1)
+        b296 = sch.get_block(name="conv2d_nhwc_o", func_name="main")
+        (
+            l297,
+            l298,
+            l299,
+            l300,
+            l301,
+            l302,
+            l303,
+            l304,
+            l305,
+            l306,
+            l307,
+            l308,
+            l309,
+            l310,
+            l311,
+            l312,
+        ) = sch.get_loops(block=b296)
+        b313 = sch.decompose_reduction(block=b296, loop=l302)
+        sch.unannotate(block_or_loop=b313, ann_key="meta_schedule.auto_tensorize")
+        sch.annotate(
+            block_or_loop=b313,
+            ann_key="meta_schedule.auto_tensorize",
+            ann_val="wmma_fill_16x16x16_s32",
+        )
+        sch.unannotate(block_or_loop=b296, ann_key="meta_schedule.auto_tensorize_init")
+        sch.unannotate(block_or_loop=b313, ann_key="meta_schedule.auto_tensorize_init")
+        b314 = sch.get_block(name="conv2d_nhwc_o_init", func_name="main")
+        sch.unannotate(block_or_loop=b314, ann_key="meta_schedule.auto_tensorize")
+        sch.tensorize(block_or_loop=b314, tensor_intrin="wmma_fill_16x16x16_s32")
+        b315 = sch.get_block(name="pad_temp_reindex_shared_wmma.matrix_a_o", func_name="main")
+        sch.unannotate(block_or_loop=b315, ann_key="meta_schedule.auto_tensorize")
+        sch.tensorize(block_or_loop=b315, tensor_intrin="wmma_load_16x16x16_s8_a")
+        b316 = sch.get_block(name="p1_reindex_shared_wmma.matrix_b_o", func_name="main")
+        sch.unannotate(block_or_loop=b316, ann_key="meta_schedule.auto_tensorize")
+        sch.tensorize(block_or_loop=b316, tensor_intrin="wmma_load_16x16x16_s8_b_trans")
+        b317 = sch.get_block(name="conv2d_nhwc_o_update", func_name="main")
+        sch.unannotate(block_or_loop=b317, ann_key="meta_schedule.auto_tensorize")
+        sch.tensorize(block_or_loop=b317, tensor_intrin="wmma_sync_16x16x16_s8s8s32_trans")
+        b318 = sch.get_block(name="conv2d_nhwc_reindex_shared_wmma.accumulator_o", func_name="main")
+        sch.unannotate(block_or_loop=b318, ann_key="meta_schedule.auto_tensorize")
+        sch.tensorize(block_or_loop=b318, tensor_intrin="wmma_store_16x16x16_s32_shared")
+
+    verify(Conv2dInt8, apply_trace, Conv2dInt8_target, "cuda", Conv2dInt8_tensorcore_scheduled)
+
+
+def test_conv2d_int8_vnni():
+    def apply_trace(sch):
+        b0 = sch.get_block(name="compile_engine_const", func_name="main")
+        b1 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main")
+        b2 = sch.get_block(name="T_add", func_name="main")
+        b3 = sch.get_block(name="T_cast", func_name="main")
+        b4 = sch.get_block(name="T_multiply", func_name="main")
+        b5 = sch.get_block(name="compile_engine_const_1", func_name="main")
+        b6 = sch.get_block(name="T_add_1", func_name="main")
+        b7 = sch.get_block(name="T_floor", func_name="main")
+        b8 = sch.get_block(name="T_cast_1", func_name="main")
+        b9 = sch.get_block(name="compute", func_name="main")
+        b10 = sch.get_block(name="T_cast_2", func_name="main")
+        b11 = sch.get_block(name="T_cast_3", func_name="main")
+        b12 = sch.get_block(name="T_subtract", func_name="main")
+        b13 = sch.get_block(name="T_multiply_1", func_name="main")
+        b14 = sch.get_block(name="compile_engine_const_2", func_name="main")
+        b15 = sch.get_block(name="T_add_2", func_name="main")
+        b16 = sch.get_block(name="T_floor_1", func_name="main")
+        b17 = sch.get_block(name="T_cast_4", func_name="main")
+        b18 = sch.get_block(name="T_add_3", func_name="main")
+        b19 = sch.get_block(name="compute_1", func_name="main")
+        b20 = sch.get_block(name="T_cast_5", func_name="main")
+        b21 = sch.get_block(name="root", func_name="main")
+        sch.compute_inline(block=b20)
+        sch.compute_inline(block=b19)
+        sch.compute_inline(block=b18)
+        sch.compute_inline(block=b17)
+        sch.compute_inline(block=b16)
+        sch.compute_inline(block=b15)
+        sch.compute_inline(block=b14)
+        sch.compute_inline(block=b13)
+        sch.compute_inline(block=b12)
+        sch.compute_inline(block=b11)
+        sch.compute_inline(block=b10)
+        sch.compute_inline(block=b9)
+        sch.compute_inline(block=b8)
+        sch.compute_inline(block=b7)
+        sch.compute_inline(block=b6)
+        sch.compute_inline(block=b5)
+        sch.compute_inline(block=b4)
+        sch.compute_inline(block=b3)
+        sch.compute_inline(block=b2)
+        sch.compute_inline(block=b0)
+        sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")
+        l22, l23, l24, l25, l26, l27, l28, l29, l30, l31 = sch.get_loops(block=b1)
+        l32, l33 = sch.split(loop=l31, factors=[None, 4], preserve_unit_iters=True)
+        l34, l35 = sch.split(loop=l26, factors=[None, 16], preserve_unit_iters=True)
+        l36, l37, l38, l39, l40, l41, l42, l43, l44, l45, l46, l47 = sch.get_loops(block=b1)
+        sch.reorder(l42, l43, l44, l45, l46, l35, l33)
+        b48 = sch.blockize(loop=l35)
+        sch.annotate(
+            block_or_loop=b48, ann_key="meta_schedule.auto_tensorize", ann_val="dot_16x4_vnni"
+        )
+        l49, l50, l51, l52, l53, l54, l55, l56, l57, l58 = sch.get_loops(block=b48)
+        v59, v60, v61, v62 = sch.sample_perfect_tile(
+            loop=l49, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1]
+        )
+        l63, l64, l65, l66 = sch.split(
+            loop=l49, factors=[v59, v60, v61, v62], preserve_unit_iters=True
+        )
+        v67, v68, v69, v70 = sch.sample_perfect_tile(
+            loop=l50, n=4, max_innermost_factor=64, decision=[4, 32, 1, 1]
+        )
+        l71, l72, l73, l74 = sch.split(
+            loop=l50, factors=[v67, v68, v69, v70], preserve_unit_iters=True
+        )
+        v75, v76, v77, v78 = sch.sample_perfect_tile(
+            loop=l51, n=4, max_innermost_factor=64, decision=[1, 7, 1, 1]
+        )
+        l79, l80, l81, l82 = sch.split(
+            loop=l51, factors=[v75, v76, v77, v78], preserve_unit_iters=True
+        )
+        v83, v84, v85, v86 = sch.sample_perfect_tile(
+            loop=l52, n=4, max_innermost_factor=64, decision=[1, 1, 1, 7]
+        )
+        l87, l88, l89, l90 = sch.split(
+            loop=l52, factors=[v83, v84, v85, v86], preserve_unit_iters=True
+        )
+        v91, v92, v93, v94 = sch.sample_perfect_tile(
+            loop=l53, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1]
+        )
+        l95, l96, l97, l98 = sch.split(
+            loop=l53, factors=[v91, v92, v93, v94], preserve_unit_iters=True
+        )
+        v99, v100 = sch.sample_perfect_tile(loop=l54, n=2, max_innermost_factor=64, decision=[1, 1])
+        l101, l102 = sch.split(loop=l54, factors=[v99, v100], preserve_unit_iters=True)
+        v103, v104 = sch.sample_perfect_tile(
+            loop=l55, n=2, max_innermost_factor=64, decision=[1, 1]
+        )
+        l105, l106 = sch.split(loop=l55, factors=[v103, v104], preserve_unit_iters=True)
+        v107, v108 = sch.sample_perfect_tile(
+            loop=l56, n=2, max_innermost_factor=64, decision=[4, 8]
+        )
+        l109, l110 = sch.split(loop=l56, factors=[v107, v108], preserve_unit_iters=True)
+        v111, v112 = sch.sample_perfect_tile(
+            loop=l57, n=2, max_innermost_factor=64, decision=[4, 1]
+        )
+        l113, l114 = sch.split(loop=l57, factors=[v111, v112], preserve_unit_iters=True)
+        v115, v116 = sch.sample_perfect_tile(
+            loop=l58, n=2, max_innermost_factor=64, decision=[1, 1]
+        )
+        l117, l118 = sch.split(loop=l58, factors=[v115, v116], preserve_unit_iters=True)
+        sch.reorder(
+            l63,
+            l71,
+            l79,
+            l87,
+            l95,
+            l64,
+            l72,
+            l80,
+            l88,
+            l96,
+            l101,
+            l105,
+            l109,
+            l113,
+            l117,
+            l65,
+            l73,
+            l81,
+            l89,
+            l97,
+            l102,
+            l106,
+            l110,
+            l114,
+            l118,
+            l66,
+            l74,
+            l82,
+            l90,
+            l98,
+        )
+        (b119,) = sch.get_consumers(block=b48)
+        sch.reverse_compute_at(block=b119, loop=l96, preserve_unit_loops=True, index=-1)
+        sch.annotate(block_or_loop=b21, ann_key="meta_schedule.parallel", ann_val=96)
+        sch.annotate(block_or_loop=b21, ann_key="meta_schedule.vectorize", ann_val=64)
+        v120 = sch.sample_categorical(
+            candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25], decision=2
+        )
+        sch.annotate(block_or_loop=b21, ann_key="meta_schedule.unroll_explicit", ann_val=v120)
+        sch.enter_postproc()
+        b121 = sch.get_block(name="root", func_name="main")
+        sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.parallel")
+        sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.vectorize")
+        sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit")
+        b122, b123 = sch.get_child_blocks(b121)
+        (
+            l124,
+            l125,
+            l126,
+            l127,
+            l128,
+            l129,
+            l130,
+            l131,
+            l132,
+            l133,
+            l134,
+            l135,
+            l136,
+            l137,
+            l138,
+            l139,
+            l140,
+            l141,
+            l142,
+            l143,
+            l144,
+            l145,
+            l146,
+            l147,
+            l148,
+            l149,
+            l150,
+            l151,
+            l152,
+            l153,
+        ) = sch.get_loops(block=b122)
+        l154 = sch.fuse(l124, l125, l126, l127, l128, l129, l130, preserve_unit_iters=True)
+        sch.parallel(loop=l154)
+        sch.annotate(block_or_loop=l154, ann_key="pragma_auto_unroll_max_step", ann_val=64)
+        sch.annotate(block_or_loop=l154, ann_key="pragma_unroll_explicit", ann_val=1)
+        l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b123)
+        l164 = sch.fuse(l163, preserve_unit_iters=True)
+        sch.vectorize(loop=l164)
+        sch.annotate(block_or_loop=l155, ann_key="pragma_auto_unroll_max_step", ann_val=64)
+        sch.annotate(block_or_loop=l155, ann_key="pragma_unroll_explicit", ann_val=1)
+        b165 = sch.get_block(name="conv2d_NCHWc_int8_o", func_name="main")
+        (
+            l166,
+            l167,
+            l168,
+            l169,
+            l170,
+            l171,
+            l172,
+            l173,
+            l174,
+            l175,
+            l176,
+            l177,
+            l178,
+            l179,
+            l180,
+            l181,
+            l182,
+            l183,
+            l184,
+            l185,
+            l186,
+            l187,
+            l188,
+            l189,
+        ) = sch.get_loops(block=b165)
+        b190 = sch.decompose_reduction(block=b165, loop=l172)
+        sch.unannotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize")
+        sch.annotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize", ann_val="")
+        b191 = sch.get_block(name="conv2d_NCHWc_int8_o_init", func_name="main")
+        sch.unannotate(block_or_loop=b191, ann_key="meta_schedule.auto_tensorize")
+        (b192,) = sch.get_child_blocks(b191)
+        (l193,) = sch.get_loops(block=b192)
+        sch.vectorize(loop=l193)
+        b194 = sch.get_block(name="conv2d_NCHWc_int8_o_update", func_name="main")
+        sch.unannotate(block_or_loop=b194, ann_key="meta_schedule.auto_tensorize")
+        sch.tensorize(block_or_loop=b194, tensor_intrin="dot_16x4_vnni")
+
+    vnni_id = llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512")
+    verify(
+        Conv2dInt8_NCHWc,
+        apply_trace,
+        Conv2dInt8_NCHWc_target,
+        "llvm -mcpu=cascadelake",
+        get_conv2d_vnni_mod(vnni_id),
+    )
+
+
+def test_winograd_gpu():
+    def apply_trace(sch):
+        b0 = sch.get_block(name="B", func_name="main")
+        b1 = sch.get_block(name="data_pack", func_name="main")
+        b2 = sch.get_block(name="bgemm", func_name="main")
+        b3 = sch.get_block(name="A", func_name="main")
+        b4 = sch.get_block(name="inverse", func_name="main")
+        b5 = sch.get_block(name="conv2d_winograd", func_name="main")
+        b6 = sch.get_block(name="T_add", func_name="main")
+        b7 = sch.get_block(name="T_relu", func_name="main")
+        b8 = sch.get_block(name="root", func_name="main")
+        sch.compute_inline(block=b0)
+        (b9,) = sch.get_producers(block=b1)
+        (b10,) = sch.get_producers(block=b9)
+        l11, l12, l13, l14, l15, l16 = sch.get_loops(block=b1)
+        v17, v18 = sch.sample_perfect_tile(
+            loop=l13, n=2, max_innermost_factor=64, decision=[14, 14]
+        )
+        l19, l20 = sch.split(loop=l13, factors=[v17, v18], preserve_unit_iters=True)
+        v21, v22 = sch.sample_perfect_tile(loop=l14, n=2, max_innermost_factor=64, decision=[8, 8])
+        l23, l24 = sch.split(loop=l14, factors=[v21, v22], preserve_unit_iters=True)
+        sch.unroll(loop=l11)
+        sch.unroll(loop=l12)
+        sch.unroll(loop=l15)
+        sch.unroll(loop=l16)
+        sch.reorder(l19, l23, l20, l24, l11, l12, l15, l16)
+        sch.compute_at(block=b9, loop=l24, preserve_unit_loops=True, index=-1)
+        sch.set_scope(block=b9, buffer_index=0, storage_scope="local")
+        sch.compute_inline(block=b10)
+        l25, l26, l27, l28, l29, l30, l31, l32 = sch.get_loops(block=b1)
+        l33 = sch.fuse(l25, l26, l27, l28, preserve_unit_iters=True)
+        v34 = sch.sample_categorical(
+            candidates=[32, 64, 128, 256, 512, 1024],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=2,
+        )
+        l35, l36 = sch.split(loop=l33, factors=[None, v34], preserve_unit_iters=True)
+        sch.bind(loop=l35, thread_axis="blockIdx.x")
+        sch.bind(loop=l36, thread_axis="threadIdx.x")
+        sch.compute_inline(block=b3)
+        l37, l38, l39, l40, l41, l42 = sch.get_loops(block=b4)
+        v43, v44 = sch.sample_perfect_tile(loop=l39, n=2, max_innermost_factor=64, decision=[28, 7])
+        l45, l46 = sch.split(loop=l39, factors=[v43, v44], preserve_unit_iters=True)
+        v47, v48 = sch.sample_perfect_tile(loop=l40, n=2, max_innermost_factor=64, decision=[2, 32])
+        l49, l50 = sch.split(loop=l40, factors=[v47, v48], preserve_unit_iters=True)
+        sch.unroll(loop=l37)
+        sch.unroll(loop=l38)
+        sch.unroll(loop=l41)
+        sch.unroll(loop=l42)
+        sch.reorder(l45, l49, l46, l50, l37, l38, l41, l42)
+        l51, l52, l53, l54, l55, l56, l57, l58 = sch.get_loops(block=b4)
+        l59 = sch.fuse(l51, l52, l53, l54, preserve_unit_iters=True)
+        v60 = sch.sample_categorical(
+            candidates=[32, 64, 128, 256, 512, 1024],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=4,
+        )
+        l61, l62 = sch.split(loop=l59, factors=[None, v60], preserve_unit_iters=True)
+        sch.bind(loop=l61, thread_axis="blockIdx.x")
+        sch.bind(loop=l62, thread_axis="threadIdx.x")
+        sch.annotate(block_or_loop=b2, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
+        l63, l64, l65, l66, l67 = sch.get_loops(block=b2)
+        v68, v69, v70, v71, v72 = sch.sample_perfect_tile(
+            loop=l63, n=5, max_innermost_factor=64, decision=[1, 2, 3, 1, 1]
+        )
+        l73, l74, l75, l76, l77 = sch.split(
+            loop=l63, factors=[v68, v69, v70, v71, v72], preserve_unit_iters=True
+        )
+        v78, v79, v80, v81, v82 = sch.sample_perfect_tile(
+            loop=l64, n=5, max_innermost_factor=64, decision=[6, 1, 1, 1, 1]
+        )
+        l83, l84, l85, l86, l87 = sch.split(
+            loop=l64, factors=[v78, v79, v80, v81, v82], preserve_unit_iters=True
+        )
+        v88, v89, v90, v91, v92 = sch.sample_perfect_tile(
+            loop=l65, n=5, max_innermost_factor=64, decision=[7, 2, 1, 14, 1]
+        )
+        l93, l94, l95, l96, l97 = sch.split(
+            loop=l65, factors=[v88, v89, v90, v91, v92], preserve_unit_iters=True
+        )
+        v98, v99, v100, v101, v102 = sch.sample_perfect_tile(
+            loop=l66, n=5, max_innermost_factor=64, decision=[4, 1, 16, 1, 1]
+        )
+        l103, l104, l105, l106, l107 = sch.split(
+            loop=l66, factors=[v98, v99, v100, v101, v102], preserve_unit_iters=True
+        )
+        v108, v109, v110 = sch.sample_perfect_tile(
+            loop=l67, n=3, max_innermost_factor=64, decision=[2, 2, 16]
+        )
+        l111, l112, l113 = sch.split(loop=l67, factors=[v108, v109, v110], preserve_unit_iters=True)
+        sch.reorder(
+            l73,
+            l83,
+            l93,
+            l103,
+            l74,
+            l84,
+            l94,
+            l104,
+            l75,
+            l85,
+            l95,
+            l105,
+            l111,
+            l112,
+            l76,
+            l86,
+            l96,
+            l106,
+            l113,
+            l77,
+            l87,
+            l97,
+            l107,
+        )
+        l114 = sch.fuse(l73, l83, l93, l103, preserve_unit_iters=True)
+        sch.bind(loop=l114, thread_axis="blockIdx.x")
+        l115 = sch.fuse(l74, l84, l94, l104, preserve_unit_iters=True)
+        sch.bind(loop=l115, thread_axis="vthread.x")
+        l116 = sch.fuse(l75, l85, l95, l105, preserve_unit_iters=True)
+        sch.bind(loop=l116, thread_axis="threadIdx.x")
+        sch.annotate(
+            block_or_loop=b2, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32
+        )
+        sch.annotate(
+            block_or_loop=b2, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024
+        )
+        b117 = sch.cache_write(block=b2, write_buffer_index=0, storage_scope="local")
+        sch.reverse_compute_at(block=b117, loop=l116, preserve_unit_loops=True, index=-1)
+        b118 = sch.cache_read(
+            block=b2, read_buffer_index=0, storage_scope="shared", consumer_blocks=[b2]
+        )
+        sch.compute_at(block=b118, loop=l111, preserve_unit_loops=True, index=-1)
+        l119, l120, l121, l122, l123, l124, l125, l126 = sch.get_loops(block=b118)
+        l127 = sch.fuse(l123, l124, l125, l126, preserve_unit_iters=True)
+        v128 = sch.sample_categorical(
+            candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3
+        )
+        sch.annotate(block_or_loop=b118, ann_key="meta_schedule.cooperative_fetch", ann_val=v128)
+        b129 = sch.cache_read(
+            block=b2, read_buffer_index=1, storage_scope="shared", consumer_blocks=[b2]
+        )
+        sch.compute_at(block=b129, loop=l111, preserve_unit_loops=True, index=-1)
+        l130, l131, l132, l133, l134, l135, l136, l137 = sch.get_loops(block=b129)
+        l138 = sch.fuse(l134, l135, l136, l137, preserve_unit_iters=True)
+        v139 = sch.sample_categorical(
+            candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3
+        )
+        sch.annotate(block_or_loop=b129, ann_key="meta_schedule.cooperative_fetch", ann_val=v139)
+        sch.reverse_compute_inline(block=b7)
+        sch.reverse_compute_inline(block=b6)
+        v140 = sch.sample_categorical(
+            candidates=[0, 16, 64, 512, 1024],
+            probs=[
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+            ],
+            decision=4,
+        )
+        sch.annotate(block_or_loop=b8, ann_key="meta_schedule.unroll_explicit", ann_val=v140)
+        l141, l142, l143, l144 = sch.get_loops(block=b5)
+        l145 = sch.fuse(l141, l142, l143, l144, preserve_unit_iters=True)
+        v146 = sch.sample_categorical(
+            candidates=[32, 64, 128, 256, 512, 1024],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=2,
+        )
+        l147, l148 = sch.split(loop=l145, factors=[None, v146], preserve_unit_iters=True)
+        sch.bind(loop=l147, thread_axis="blockIdx.x")
+        sch.bind(loop=l148, thread_axis="threadIdx.x")
+        sch.enter_postproc()
+        sch.unannotate(block_or_loop=b118, ann_key="meta_schedule.cooperative_fetch")
+        l149, l150, l151, l152, l153 = sch.get_loops(block=b118)
+        l154, l155, l156 = sch.split(loop=l153, factors=[None, 48, 4], preserve_unit_iters=True)
+        sch.vectorize(loop=l156)
+        sch.bind(loop=l155, thread_axis="threadIdx.x")
+        sch.unannotate(block_or_loop=b129, ann_key="meta_schedule.cooperative_fetch")
+        l157, l158, l159, l160, l161 = sch.get_loops(block=b129)
+        l162, l163, l164 = sch.split(loop=l161, factors=[None, 48, 4], preserve_unit_iters=True)
+        sch.vectorize(loop=l164)
+        sch.bind(loop=l163, thread_axis="threadIdx.x")
+        b165 = sch.get_block(name="root", func_name="main")
+        sch.unannotate(block_or_loop=b165, ann_key="meta_schedule.unroll_explicit")
+        b166, b167, b168, b169, b170, b171, b172, b173 = sch.get_child_blocks(b165)
+        l174, l175, l176, l177, l178, l179 = sch.get_loops(block=b166)
+        sch.annotate(block_or_loop=l174, ann_key="pragma_auto_unroll_max_step", ann_val=1024)
+        sch.annotate(block_or_loop=l174, ann_key="pragma_unroll_explicit", ann_val=1)
+        l180, l181, l182, l183, l184, l185 = sch.get_loops(block=b167)
+        sch.annotate(block_or_loop=l180, ann_key="pragma_auto_unroll_max_step", ann_val=1024)
+        sch.annotate(block_or_loop=l180, ann_key="pragma_unroll_explicit", ann_val=1)
+        l186, l187, l188, l189, l190, l191, l192 = sch.get_loops(block=b168)
+        sch.annotate(block_or_loop=l186, ann_key="pragma_auto_unroll_max_step", ann_val=1024)
+        sch.annotate(block_or_loop=l186, ann_key="pragma_unroll_explicit", ann_val=1)
+        l193, l194, l195, l196, l197, l198, l199 = sch.get_loops(block=b169)
+        sch.annotate(block_or_loop=l193, ann_key="pragma_auto_unroll_max_step", ann_val=1024)
+        sch.annotate(block_or_loop=l193, ann_key="pragma_unroll_explicit", ann_val=1)
+        (
+            l200,
+            l201,
+            l202,
+            l203,
+            l204,
+            l205,
+            l206,
+            l207,
+            l208,
+            l209,
+            l210,
+            l211,
+            l212,
+            l213,
+        ) = sch.get_loops(block=b170)
+        sch.annotate(block_or_loop=l200, ann_key="pragma_auto_unroll_max_step", ann_val=1024)
+        sch.annotate(block_or_loop=l200, ann_key="pragma_unroll_explicit", ann_val=1)
+        l214, l215, l216, l217, l218, l219, l220 = sch.get_loops(block=b171)
+        sch.annotate(block_or_loop=l214, ann_key="pragma_auto_unroll_max_step", ann_val=1024)
+        sch.annotate(block_or_loop=l214, ann_key="pragma_unroll_explicit", ann_val=1)
+        l221, l222, l223, l224, l225, l226 = sch.get_loops(block=b172)
+        sch.annotate(block_or_loop=l221, ann_key="pragma_auto_unroll_max_step", ann_val=1024)
+        sch.annotate(block_or_loop=l221, ann_key="pragma_unroll_explicit", ann_val=1)
+        l227, l228 = sch.get_loops(block=b173)
+        sch.annotate(block_or_loop=l227, ann_key="pragma_auto_unroll_max_step", ann_val=1024)
+        sch.annotate(block_or_loop=l227, ann_key="pragma_unroll_explicit", ann_val=1)
+        b229 = sch.get_block(name="data_pack", func_name="main")
+        l230, l231, l232, l233, l234, l235 = sch.get_loops(block=b229)
+        b236 = sch.decompose_reduction(block=b229, loop=l234)
+        b237 = sch.get_block(name="bgemm", func_name="main")
+        (
+            l238,
+            l239,
+            l240,
+            l241,
+            l242,
+            l243,
+            l244,
+            l245,
+            l246,
+            l247,
+            l248,
+            l249,
+            l250,
+            l251,
+        ) = sch.get_loops(block=b237)
+        b252 = sch.decompose_reduction(block=b237, loop=l241)
+        b253 = sch.get_block(name="inverse", func_name="main")
+        l254, l255, l256, l257, l258, l259 = sch.get_loops(block=b253)
+        b260 = sch.decompose_reduction(block=b253, loop=l258)
+
+    verify(
+        Conv2dWinogradAddRelu,
+        apply_trace,
+        Conv2dWinogradAddResidualRelu,
+        "cuda",
+        Conv2dWinogradAddResidualRelu_scheduled,
+    )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/unittest/test_meta_schedule_vnni_integration.py b/tests/python/unittest/test_meta_schedule_vnni_integration.py
index d0bfc913eca6..1f91dc593143 100644
--- a/tests/python/unittest/test_meta_schedule_vnni_integration.py
+++ b/tests/python/unittest/test_meta_schedule_vnni_integration.py
@@ -26,6 +26,7 @@
 from tvm import relay
 from tvm._ffi import register_func
 from tvm.tir.schedule import BlockRV, Schedule
+from tvm.tir.schedule.analysis import has_block
 from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
 
 logging.basicConfig(
@@ -44,6 +45,7 @@ def schedule_fn(sch, dense_block: Optional[BlockRV] = None) -> bool:
         if sch.mod.attrs is not None and "dense" not in sch.mod.attrs["task_name"]:
             return False
         if dense_block is None:
+            assert has_block(sch, "compute")
             dense_block = sch.get_block("compute")
             assert "dense_vnni" in sch.get(dense_block).annotations["schedule_rule"]
 

From 602c194f25d52c8159631e6b1220218624433c40 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 28 Oct 2022 13:36:30 -0500
Subject: [PATCH 459/704] [TIR][Arith] Implement
 kApplyConstraintsToBooleanBranches extension (#13129)

When simplifying a branch of a boolean AND or a boolean OR, the
other branch may be assumed not to dominate the result of the
operator.  For example, when simplifying `(A and B)`, `A` may be
simplified on the assumption that `B` is true.  Similarly, when
simplifying `(A or B)`, `A` may be simplified on the assumption
that `B` is false.

Prior to this commit, these constraints were not used for
simplifications.  This commit introduced an optional extension,
`kApplyConstraintsToBooleanBranches`, which exposes these
constraints for simplification.  This isn't enabled by default,
as some cases require a second visit to each branch of a boolean
operator.  (e.g. Simplifying the LHS of an operator, using a
constraint provided by the RHS, where the sub-analyzer that uses
the constraint expects it to already be simplified.)
---
 include/tvm/arith/analyzer.h                  |  13 ++
 src/arith/rewrite_simplify.cc                 | 137 ++++++++++++++++-
 src/tir/transforms/simplify.cc                |  11 ++
 .../unittest/test_tir_transform_simplify.py   | 144 ++++++++++++++++++
 4 files changed, 297 insertions(+), 8 deletions(-)

diff --git a/include/tvm/arith/analyzer.h b/include/tvm/arith/analyzer.h
index b80d75a17058..e2d60684da7b 100644
--- a/include/tvm/arith/analyzer.h
+++ b/include/tvm/arith/analyzer.h
@@ -305,6 +305,19 @@ class RewriteSimplifier {
      *   (a && b) || c => (a || c) && (b || c)
      */
     kConvertBooleanToAndOfOrs = (1 << 1),
+
+    /* When simplifying a boolean AND or a boolean OR, simplify each
+     * branch under the assumption that the other branch does not
+     * already dominate the result.  That is, simplify each branch of
+     * (A && B) under the assumption that the other branch is true,
+     * and simplify each branch of (A || B) under the assumption that
+     * the other branch is false.
+     *
+     * Example:
+     *   (n < 10) && (n < 5) => (n < 10)
+     *   (n < 10) || (n < 5) => (n < 5)
+     */
+    kApplyConstraintsToBooleanBranches = (1 << 2),
   };
 
   /*! \brief Enable an optional extension or extensions
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index 5e565d7e36c6..6cc2aa9e4591 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -72,6 +72,40 @@ using namespace tir;
 // handled by CanonicalSimplifier.
 //
 
+/* Utility for rewriting only boolean portions of an expression
+ *
+ * Performs a subset of simplifications done by RewriteSimplifier,
+ * sufficient to negate a simplified expression.  Intended for
+ * application on an expression that has previously been simplified.
+ *
+ * \param expr The boolean expression to be normalized
+ *
+ * \returns The normalized boolean expression
+ */
+PrimExpr NormalizeBooleanOperators(PrimExpr expr) {
+  PVar<PrimExpr> x, y;
+
+  while (true) {
+    if ((!!x).Match(expr)) {
+      expr = x.Eval();
+    } else if ((!(x || y)).Match(expr)) {
+      return NormalizeBooleanOperators(!x.Eval()) && NormalizeBooleanOperators(!y.Eval());
+    } else if ((!(x && y)).Match(expr)) {
+      return NormalizeBooleanOperators(!x.Eval()) || NormalizeBooleanOperators(!y.Eval());
+    } else if ((x >= y).Match(expr) || (!(x < y)).Match(expr) || (!(y > x)).Match(expr)) {
+      return y.Eval() <= x.Eval();
+    } else if ((x > y).Match(expr) || (!(x <= y)).Match(expr) || (!(y >= x)).Match(expr)) {
+      return y.Eval() < x.Eval();
+    } else if ((!(x == y)).Match(expr)) {
+      return x.Eval() != y.Eval();
+    } else if ((!(x != y)).Match(expr)) {
+      return x.Eval() == y.Eval();
+    } else {
+      return expr;
+    }
+  }
+}
+
 CompareResult RewriteSimplifier::Impl::TryCompare(const PrimExpr& x, const PrimExpr& y) {
   CompareResult output = CompareResult::kUnknown;
 
@@ -261,17 +295,17 @@ std::function<void()> RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c
   for (const PrimExpr& subconstraint : ExtractConstraints(new_constraint)) {
     if (SideEffect(subconstraint) <= CallEffectKind::kPure) {
       literal_constraints_.push_back(subconstraint);
-      // We could apply this during TryMatchLiteralConstraint, but
-      // that would require performing a rewrite of each expression
-      // being checked.  This way, we only apply a rewrite for each
-      // constraint being applied.
       PrimExpr negation;
       if (subconstraint.dtype().is_bool()) {
-        negation = Not(subconstraint);
+        // We could apply NormalizeBooleanOperators during
+        // TryMatchLiteralConstraint, but that would require
+        // performing a rewrite of each expression being checked.
+        // This way, we only apply a rewrite for each constraint being
+        // applied.
+        negation = NormalizeBooleanOperators(Not(subconstraint));
       } else {
         negation = subconstraint == make_zero(subconstraint.dtype());
       }
-      negation = operator()(negation);
       literal_constraints_.push_back(Not(negation));
     }
   }
@@ -1557,7 +1591,50 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NotNode* op) {
 }
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
-  PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
+  PrimExpr ret = [&]() -> PrimExpr {
+    // If this extension isn't enabled, just delegate out.
+    if (!(enabled_extensions_ & kApplyConstraintsToBooleanBranches)) {
+      return IRMutatorWithAnalyzer::VisitExpr_(op);
+    }
+
+    PrimExpr a = op->a;
+    PrimExpr b = op->b;
+
+    // Alternate which branch is used as the constraint, and which is
+    // being simplified.  Because some sub-analyzers expect their
+    // constraints to already be simplified, each branch may require
+    // more than one update.  The loop condition allows each branch to
+    // be visited up to twice, but only performs the second visit if
+    // necessary.
+    size_t iterations_since_update = 0;
+    for (size_t i = 0; i < 4; i++) {
+      PrimExpr& to_update = (i % 2 == 0) ? a : b;
+      const PrimExpr& constraint = (i % 2 == 0) ? b : a;
+
+      With<ConstraintContext> context(analyzer_, constraint);
+      PrimExpr updated = VisitExpr(to_update);
+
+      if (!to_update.same_as(updated)) {
+        to_update = updated;
+        iterations_since_update = 0;
+      } else {
+        iterations_since_update++;
+        if (iterations_since_update >= 2) {
+          break;
+        }
+      }
+    }
+
+    // Only construct a new object if a change has been made.
+    // Otherwise, follow ExprMutator's convention of returning the
+    // original object.
+    if (a.same_as(op->a) && b.same_as(op->b)) {
+      return GetRef<PrimExpr>(op);
+    } else {
+      return And(a, b);
+    }
+  }();
+
   op = ret.as<AndNode>();
 
   if (auto const_res = TryConstFold<And>(op->a, op->b)) return const_res.value();
@@ -1601,7 +1678,51 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
 }
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) {
-  PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
+  PrimExpr orig = GetRef<PrimExpr>(op);
+
+  PrimExpr ret = [&]() -> PrimExpr {
+    // If this extension isn't enabled, just delegate out.
+    if (!(enabled_extensions_ & kApplyConstraintsToBooleanBranches)) {
+      return IRMutatorWithAnalyzer::VisitExpr_(op);
+    }
+
+    PrimExpr a = op->a;
+    PrimExpr b = op->b;
+
+    // Alternate which branch is used as the constraint, and which
+    // is being simplified.  Because some sub-analyzers expect their
+    // constraints to already be simplified, each branch may require
+    // more than update.  The loop condition allows each branch to be
+    // visited up to twice, but only if performs the second visit if
+    // necessary.
+    size_t iterations_since_update = 0;
+    for (size_t i = 0; i < 4; i++) {
+      PrimExpr& to_update = (i % 2 == 0) ? a : b;
+      const PrimExpr& constraint = (i % 2 == 0) ? b : a;
+
+      With<ConstraintContext> context(analyzer_, NormalizeBooleanOperators(Not(constraint)));
+      PrimExpr updated = VisitExpr(to_update);
+
+      if (!to_update.same_as(updated)) {
+        to_update = updated;
+        iterations_since_update = 0;
+      } else {
+        iterations_since_update++;
+        if (iterations_since_update >= 2) {
+          break;
+        }
+      }
+    }
+
+    // Only construct a new object if a change has been made.
+    // Otherwise, follow ExprMutator's convention of returning the
+    // original object.
+    if (a.same_as(op->a) && b.same_as(op->b)) {
+      return GetRef<PrimExpr>(op);
+    } else {
+      return Or(a, b);
+    }
+  }();
 
   op = ret.as<OrNode>();
   if (auto const_res = TryConstFold<Or>(op->a, op->b)) return const_res.value();
diff --git a/src/tir/transforms/simplify.cc b/src/tir/transforms/simplify.cc
index 894dfb8ca09f..b6e3581aa614 100644
--- a/src/tir/transforms/simplify.cc
+++ b/src/tir/transforms/simplify.cc
@@ -39,6 +39,7 @@ using namespace tir;
 struct SimplifyConfigNode : public tvm::AttrsNode<SimplifyConfigNode> {
   bool transitively_prove_inequalities;
   bool convert_boolean_to_and_of_ors;
+  bool apply_constraints_to_boolean_branches;
 
   TVM_DECLARE_ATTRS(SimplifyConfigNode, "tir.transform.SimplifyConfig") {
     TVM_ATTR_FIELD(transitively_prove_inequalities)
@@ -49,6 +50,12 @@ struct SimplifyConfigNode : public tvm::AttrsNode<SimplifyConfigNode> {
     TVM_ATTR_FIELD(convert_boolean_to_and_of_ors)
         .describe("If true, simplify conditionals into an AND of ORs")
         .set_default(false);
+
+    TVM_ATTR_FIELD(apply_constraints_to_boolean_branches)
+        .describe(
+            "If true, simplify each branch of AND/OR "
+            "under a constraints provided by the other branch")
+        .set_default(false);
   }
 
   RewriteSimplifier::Extension GetEnabledExtensions() const {
@@ -60,6 +67,10 @@ struct SimplifyConfigNode : public tvm::AttrsNode<SimplifyConfigNode> {
     if (convert_boolean_to_and_of_ors) {
       flags = RewriteSimplifier::Extension(flags | RewriteSimplifier::kConvertBooleanToAndOfOrs);
     }
+    if (apply_constraints_to_boolean_branches) {
+      flags = RewriteSimplifier::Extension(flags |
+                                           RewriteSimplifier::kApplyConstraintsToBooleanBranches);
+    }
     return flags;
   }
 };
diff --git a/tests/python/unittest/test_tir_transform_simplify.py b/tests/python/unittest/test_tir_transform_simplify.py
index 46b6858ec773..91ef60f9d3f1 100644
--- a/tests/python/unittest/test_tir_transform_simplify.py
+++ b/tests/python/unittest/test_tir_transform_simplify.py
@@ -139,6 +139,7 @@ def sls(n, d):
 class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
     transitively_prove_inequalities = False
     convert_boolean_to_and_of_ors = False
+    apply_constraints_to_boolean_branches = False
 
     def transform(self):
         def inner(mod):
@@ -146,6 +147,7 @@ def inner(mod):
                 "tir.Simplify": {
                     "transitively_prove_inequalities": self.transitively_prove_inequalities,
                     "convert_boolean_to_and_of_ors": self.convert_boolean_to_and_of_ors,
+                    "apply_constraints_to_boolean_branches": self.apply_constraints_to_boolean_branches,
                 }
             }
             with tvm.transform.PassContext(config=config):
@@ -845,5 +847,147 @@ def expected(A: T.Buffer[1, "bool"], i: T.int32):
             A[0] = True
 
 
+class TestSimplifyRHSOfBooleanAndUsingLHS(BaseBeforeAfter):
+    """Boolean expressions can introduce contexts.
+
+    In `A and B`, the result of `B` only matters when `A` is
+    true, and can be simplified under that context.  This test
+    simplifies `n < 10` under the assumption that `n < 5`.
+    """
+
+    apply_constraints_to_boolean_branches = True
+
+    def before(A: T.Buffer[1, "bool"], n: T.int32):
+        A[0] = n < 5 and n < 10
+
+    def expected(A: T.Buffer[1, "bool"], n: T.int32):
+        A[0] = n < 5
+
+
+class TestSimplifyLHSOfBooleanAndUsingRHS(BaseBeforeAfter):
+    """Boolean expressions can introduce contexts for their arguments.
+
+    Like TestSimplifyRHSOfBooleanAndUsingLHS, but using the RHS to
+    simplify the LHS.
+    """
+
+    apply_constraints_to_boolean_branches = True
+
+    def before(A: T.Buffer[1, "bool"], n: T.int32):
+        A[0] = n < 10 and n < 5
+
+    def expected(A: T.Buffer[1, "bool"], n: T.int32):
+        A[0] = n < 5
+
+
+class TestSimplifyRHSOfBooleanOrUsingLHS(BaseBeforeAfter):
+    """Boolean expressions can introduce contexts.
+
+    In `A or B`, the result of `B` only matters when `A` is false, so
+    `B` can be simplified under the assumption that `A` is false.
+    This test simplifies `n < 5` under the assumption that `!(n < 10)`
+    """
+
+    apply_constraints_to_boolean_branches = True
+
+    def before(A: T.Buffer[1, "bool"], n: T.int32):
+        A[0] = n < 10 or n < 5
+
+    def expected(A: T.Buffer[1, "bool"], n: T.int32):
+        A[0] = n < 10
+
+
+class TestSimplifyLHSOfBooleanOrUsingRHS(BaseBeforeAfter):
+    """Boolean expressions can introduce contexts for their arguments.
+
+    Like TestSimplifyRHSOfBooleanOrUsingLHS, but using the RHS to
+    simplify the LHS.
+    """
+
+    apply_constraints_to_boolean_branches = True
+
+    def before(A: T.Buffer[1, "bool"], n: T.int32):
+        A[0] = n < 5 or n < 10
+
+    def expected(A: T.Buffer[1, "bool"], n: T.int32):
+        A[0] = n < 10
+
+
+class TestSimplifyRHSOfBooleanAndUsingLHSWithoutConst(BaseBeforeAfter):
+    """Boolean expressions can introduce contexts.
+
+    Like TestSimplifyRHSOfBooleanAndUsingLHS, but with variables in
+    the conditions, preventing ConstIntBoundAnalyzer from handling it.
+    This proof requires the extension to transitively prove
+    inequalities.
+    """
+
+    apply_constraints_to_boolean_branches = True
+    transitively_prove_inequalities = True
+
+    def before(A: T.Buffer[1, "bool"], n: T.int32, m: T.int32):
+        A[0] = n < m + 5 and n < m + 10
+
+    def expected(A: T.Buffer[1, "bool"], n: T.int32, m: T.int32):
+        A[0] = n < m + 5
+
+
+class TestSimplifyLHSOfBooleanAndUsingRHSWithoutConst(BaseBeforeAfter):
+    """Boolean expressions can introduce contexts for their arguments.
+
+    Like TestSimplifyLHSOfBooleanAndUsingRHS, but with variables in
+    the conditions, preventing ConstIntBoundAnalyzer from handling it.
+    This proof requires the extension to transitively prove
+    inequalities.
+    """
+
+    apply_constraints_to_boolean_branches = True
+    transitively_prove_inequalities = True
+
+    def before(A: T.Buffer[1, "bool"], n: T.int32, m: T.int32):
+        A[0] = n < m + 10 and n < m + 5
+
+    def expected(A: T.Buffer[1, "bool"], n: T.int32, m: T.int32):
+        A[0] = n < m + 5
+
+
+class TestSimplifyRHSOfBooleanOrUsingLHSWithoutConst(BaseBeforeAfter):
+    """Boolean expressions can introduce contexts.
+
+    Like TestSimplifyRHSOfBooleanOrUsingLHS, but with variables in the
+    conditions, preventing ConstIntBoundAnalyzer from handling it.
+    This proof requires the extension to transitively prove
+    inequalities.
+    """
+
+    apply_constraints_to_boolean_branches = True
+    transitively_prove_inequalities = True
+
+    def before(A: T.Buffer[1, "bool"], n: T.int32, m: T.int32):
+        A[0] = n < m + 10 or n < m + 5
+
+    def expected(A: T.Buffer[1, "bool"], n: T.int32, m: T.int32):
+        A[0] = n < m + 10
+
+
+class TestSimplifyLHSOfBooleanOrUsingRHSWithoutConst(BaseBeforeAfter):
+    """Boolean expressions can introduce contexts for their arguments.
+
+    Like TestSimplifyLHSOfBooleanOrUsingRHS, but with variables in the
+    conditions, preventing ConstIntBoundAnalyzer from handling it.
+    This proof requires the extension to transitively prove
+    inequalities.
+    """
+
+    apply_constraints_to_boolean_branches = True
+    transitively_prove_inequalities = True
+
+    def before(A: T.Buffer[1, "bool"], n: T.int32, m: T.int32):
+        A[0] = n < m + 5 or n < m + 10
+
+    def expected(A: T.Buffer[1, "bool"], n: T.int32, m: T.int32):
+        A[0] = n < m + 10
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From c0f33dfb97d9689640c85d2324505299bf43437d Mon Sep 17 00:00:00 2001
From: Karl Koscher <kkoscher@octoml.ai>
Date: Fri, 28 Oct 2022 11:59:46 -0700
Subject: [PATCH 460/704] [Logging][Hexagon] Improve logging on Hexagon
 (#13072)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Logging][Hexagon] Improve logging on Hexagon

Currently Hexagon logging is done with a custom LogMessageImpl in
hexagon_common.cc. This ends up calling HexagonLog and HEXAGON_PRINT
which uses the HAP FARF API. Unfortunately, the TVM log level is lost
along the way, with logs being produced at FARF’s ALWAYS level. This
becomes especially noisy with RPC debug logging, which generates enough
noise to cause some log data to be dropped. It also introduces a lot of
useless noise, as the FARF API produces its own line number information,
which only points to where hexagon_common.cc calls HEXAGON_PRINT. Using
the HAP_debug_v2 API lets us pass the log level and file line
information directly, and enables runtime selection of logging levels.

This commit explicity passes the log level to LogMessage/LogMessageImpl
and updates Hexagon's custom LogMessageImpl to use the HAP_debug_v2 API.

* Adjust Hexagon rpc_server logging to use the DEBUG level

* Update hexagon_api launcher script to omit DEBUG-level logs

* Update WASM LogMessageImpl to accept explicit level

* Update Android LogMessageImpl to accept and forward explicit log level

* Move LogMessage::level_strings_ out of some ifdefs

* Update iOS LogMessageImpl to accept explicit log level

* Attempt to fix Windows build

* Add comments about runtime hexagon log level encodings

* Remove unneeded string processing in LogMessage

* Remove TODO

* Update HexagonLauncherAndroid to accept runtime log filtering configuration
---
 .../app/src/main/jni/tvm_runtime.h            |  6 +--
 .../app/src/main/jni/tvm_runtime.h            |  6 +--
 apps/ios_rpc/tvmrpc/TVMRuntime.mm             |  2 +-
 include/tvm/runtime/logging.h                 | 30 ++++++++---
 python/tvm/contrib/hexagon/build.py           | 50 ++++++++++++++++++-
 src/meta_schedule/utils.h                     | 15 ++++--
 src/runtime/hexagon/hexagon_common.cc         | 13 ++---
 .../hexagon/rpc/android_bash.sh.template      |  3 +-
 src/runtime/hexagon/rpc/hexagon/rpc_server.cc | 22 ++++----
 src/runtime/logging.cc                        |  7 +++
 web/emcc/wasm_runtime.cc                      | 10 +++-
 11 files changed, 123 insertions(+), 41 deletions(-)

diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h
index b20227b34db4..658534780130 100644
--- a/apps/android_camera/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h
@@ -81,12 +81,12 @@ namespace detail {
 // Override logging mechanism
 [[noreturn]] void LogFatalImpl(const std::string& file, int lineno, const std::string& message) {
   std::string m = file + ":" + std::to_string(lineno) + ": " + message;
-  __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str());
+  __android_log_write(ANDROID_LOG_FATAL, "TVM_RUNTIME", m.c_str());
   throw InternalError(file, lineno, message);
 }
-void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
+void LogMessageImpl(const std::string& file, int lineno, int level, const std::string& message) {
   std::string m = file + ":" + std::to_string(lineno) + ": " + message;
-  __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str());
+  __android_log_write(ANDROID_LOG_DEBUG + level, "TVM_RUNTIME", m.c_str());
 }
 
 }  // namespace detail
diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
index 95b793a985d1..543c9c85334e 100644
--- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -95,12 +95,12 @@ namespace detail {
 // Override logging mechanism
 [[noreturn]] void LogFatalImpl(const std::string& file, int lineno, const std::string& message) {
   std::string m = file + ":" + std::to_string(lineno) + ": " + message;
-  __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str());
+  __android_log_write(ANDROID_LOG_FATAL, "TVM_RUNTIME", m.c_str());
   throw InternalError(file, lineno, message);
 }
-void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
+void LogMessageImpl(const std::string& file, int lineno, int level, const std::string& message) {
   std::string m = file + ":" + std::to_string(lineno) + ": " + message;
-  __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str());
+  __android_log_write(ANDROID_LOG_DEBUG + level, "TVM_RUNTIME", m.c_str());
 }
 
 }  // namespace detail
diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.mm b/apps/ios_rpc/tvmrpc/TVMRuntime.mm
index 3225b850befe..baf5c8867a92 100644
--- a/apps/ios_rpc/tvmrpc/TVMRuntime.mm
+++ b/apps/ios_rpc/tvmrpc/TVMRuntime.mm
@@ -45,7 +45,7 @@
   throw tvm::runtime::InternalError(file, lineno, message);
 }
 
-void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
+void LogMessageImpl(const std::string& file, int lineno, int level, const std::string& message) {
   NSLog(@"%s:%d: %s", file.c_str(), lineno, message.c_str());
 }
 
diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h
index 7dbc6d810dc0..2128fc485ba4 100644
--- a/include/tvm/runtime/logging.h
+++ b/include/tvm/runtime/logging.h
@@ -294,7 +294,8 @@ namespace detail {
  *
  * \sa TVM_LOG_CUSTOMIZE
  */
-TVM_DLL void LogMessageImpl(const std::string& file, int lineno, const std::string& message);
+TVM_DLL void LogMessageImpl(const std::string& file, int lineno, int level,
+                            const std::string& message);
 
 /*!
  * \brief Class to accumulate an error message and throw it. Do not use
@@ -325,13 +326,15 @@ class LogFatal {
  */
 class LogMessage {
  public:
-  LogMessage(const std::string& file, int lineno) : file_(file), lineno_(lineno) {}
-  ~LogMessage() { LogMessageImpl(file_, lineno_, stream_.str()); }
+  LogMessage(const std::string& file, int lineno, int level)
+      : file_(file), lineno_(lineno), level_(level) {}
+  ~LogMessage() { LogMessageImpl(file_, lineno_, level_, stream_.str()); }
   std::ostringstream& stream() { return stream_; }
 
  private:
   std::string file_;
   int lineno_;
+  int level_;
   std::ostringstream stream_;
 };
 
@@ -378,17 +381,19 @@ class LogFatal {
  */
 class LogMessage {
  public:
-  LogMessage(const std::string& file, int lineno) {
+  LogMessage(const std::string& file, int lineno, int level) {
     std::time_t t = std::time(nullptr);
     stream_ << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "] " << file << ":" << lineno
-            << ": ";
+            << level_strings_[level];
   }
   TVM_NO_INLINE ~LogMessage() { std::cerr << stream_.str() << std::endl; }
   std::ostringstream& stream() { return stream_; }
 
  private:
   std::ostringstream stream_;
+  TVM_DLL static const char* level_strings_[];
 };
+
 #endif
 
 // Below is from dmlc-core
@@ -568,11 +573,20 @@ TVM_CHECK_FUNC(_NE, !=)
 
 }  // namespace detail
 
+#define TVM_LOG_LEVEL_DEBUG 0
+#define TVM_LOG_LEVEL_INFO 1
+#define TVM_LOG_LEVEL_WARNING 2
+#define TVM_LOG_LEVEL_ERROR 3
+#define TVM_LOG_LEVEL_FATAL 4
 #define LOG(level) LOG_##level
+#define LOG_DEBUG \
+  ::tvm::runtime::detail::LogMessage(__FILE__, __LINE__, TVM_LOG_LEVEL_DEBUG).stream()
 #define LOG_FATAL ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()
-#define LOG_INFO ::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream()
-#define LOG_ERROR (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "Error: ")
-#define LOG_WARNING (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "Warning: ")
+#define LOG_INFO ::tvm::runtime::detail::LogMessage(__FILE__, __LINE__, TVM_LOG_LEVEL_INFO).stream()
+#define LOG_ERROR \
+  ::tvm::runtime::detail::LogMessage(__FILE__, __LINE__, TVM_LOG_LEVEL_ERROR).stream()
+#define LOG_WARNING \
+  ::tvm::runtime::detail::LogMessage(__FILE__, __LINE__, TVM_LOG_LEVEL_WARNING).stream()
 
 #define TVM_CHECK_BINARY_OP(name, op, x, y)                                \
   if (auto __tvm__log__err = ::tvm::runtime::detail::LogCheck##name(x, y)) \
diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py
index bc3b065dd941..e67009829771 100644
--- a/python/tvm/contrib/hexagon/build.py
+++ b/python/tvm/contrib/hexagon/build.py
@@ -19,6 +19,7 @@
 
 import abc
 import datetime
+import logging
 import multiprocessing as mp
 import os
 import pathlib
@@ -269,6 +270,7 @@ def __init__(
         hexagon_debug: bool = False,
         clear_logcat: bool = False,
         sysmon_profile: bool = False,
+        farf_config: str = "0x1e",
     ):
         """Configure a new HexagonLauncherAndroid
 
@@ -288,6 +290,10 @@ def __init__(
             Should the server clear logcat before running.
         sysmon_profile: bool, optional
             Should the server run sysmon profiler in the background.
+        farf_config: str, optional
+            Configuration string for runtime log level filtering.
+            Use farf_config_from_python_log_level to generate a bitmask
+            string from a Python logging level (e.g., logging.INFO)
         """
         if not rpc_info.get("workspace_base"):
             rpc_info["workspace_base"] = self.ANDROID_HEXAGON_TEST_BASE_DIR
@@ -301,6 +307,7 @@ def __init__(
         self._clear_logcat = clear_logcat
         self._sysmon_profile = sysmon_profile
         self._sysmon_process = None
+        self._farf_config = farf_config
         rpc_info["device_key"] = HEXAGON_REMOTE_DEVICE_KEY + "." + self._serial_number
 
         super(HexagonLauncherAndroid, self).__init__(rpc_info, workspace, self._serial_number)
@@ -342,6 +349,8 @@ def _copy_binaries(self):
                             line = line.replace(
                                 "<RPC_SERVER_PORT>", str(self._rpc_info["rpc_server_port"])
                             )
+                        if "<FARF_CONFIG>" in line:
+                            line = line.replace("<FARF_CONFIG>", str(self._farf_config))
                         dest_f.write(line)
 
                 # Make shell script executable
@@ -710,6 +719,44 @@ def _is_port_in_use(port: int) -> bool:
         return s.connect_ex(("localhost", port)) == 0
 
 
+def farf_config_from_python_log_level(level) -> str:
+    """Generates a FARF configuration string enabling logging at the specified level
+
+    Parameters
+    ----------
+    level : str or int
+        Minimum level to log at. Must be a known Python logging level or string
+        (e.g., logging.INFO or "INFO")
+    """
+
+    # Runtime log levels can be selectively enabled by computing a bitmask
+    # corresponding to the levels you want to enable. These get forwarded to
+    # logcat by the DSP RPC daemon. The bits for each level are:
+
+    # 0x01 - Hexagon LOW / TVM DEBUG / Python DEBUG
+    # 0x02 - Hexagon MEDIUM / TVM INFO / Python INFO
+    # 0x04 - Hexagon HIGH / TVM WARN / Python WARNING
+    # 0x08 - Hexagon ERROR / TVM ERROR / Python ERROR
+    # 0x10 - Hexagon FATAL / TVM FATAL / Python CRITICAL
+
+    # Runtime logging can also be filtered on filenames by appending a
+    # comma-separated list of filenames. For more information, see
+    # the Hexagon SDK documentation.
+
+    if level in (logging.DEBUG, "DEBUG"):
+        return "0x1F"
+    if level in (logging.INFO, "INFO"):
+        return "0x1E"
+    if level in (logging.WARNING, "WARNING"):
+        return "0x1C"
+    if level in (logging.ERROR, "ERROR"):
+        return "0x18"
+    if level in (logging.CRITICAL, "CRITICAL"):
+        return "0x10"
+
+    raise ValueError("Argument must be a known Python logging level or string")
+
+
 # pylint: disable=invalid-name
 def HexagonLauncher(
     serial_number: str,
@@ -718,10 +765,11 @@ def HexagonLauncher(
     hexagon_debug: bool = False,
     clear_logcat: bool = False,
     sysmon_profile: bool = False,
+    farf_config: str = farf_config_from_python_log_level(logging.INFO),
 ):
     """Creates a HexagonLauncher"""
     if serial_number == HEXAGON_SIMULATOR_NAME:
         return HexagonLauncherSimulator(rpc_info, workspace)
     return HexagonLauncherAndroid(
-        serial_number, rpc_info, workspace, hexagon_debug, clear_logcat, sysmon_profile
+        serial_number, rpc_info, workspace, hexagon_debug, clear_logcat, sysmon_profile, farf_config
     )
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index 7240fa418839..37e0d1db5e98 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -93,13 +93,17 @@ class PyLogMessage {
       logger_(static_cast<int>(logging_level_), std::string(filename_), lineno_, stream_.str());
     } else {
       if (logging_level_ == Level::INFO) {
-        runtime::detail::LogMessage(filename_, lineno_).stream() << stream_.str();
+        runtime::detail::LogMessage(filename_, lineno_, TVM_LOG_LEVEL_INFO).stream()
+            << stream_.str();
       } else if (logging_level_ == Level::WARNING) {
-        runtime::detail::LogMessage(filename_, lineno_).stream() << "Warning: " << stream_.str();
+        runtime::detail::LogMessage(filename_, lineno_, TVM_LOG_LEVEL_WARNING).stream()
+            << stream_.str();
       } else if (logging_level_ == Level::ERROR) {
-        runtime::detail::LogMessage(filename_, lineno_).stream() << "Error: " << stream_.str();
+        runtime::detail::LogMessage(filename_, lineno_, TVM_LOG_LEVEL_ERROR).stream()
+            << stream_.str();
       } else if (logging_level_ == Level::DEBUG) {
-        runtime::detail::LogMessage(filename_, lineno_).stream() << "Debug: " << stream_.str();
+        runtime::detail::LogMessage(filename_, lineno_, TVM_LOG_LEVEL_DEBUG).stream()
+            << stream_.str();
       } else {
         runtime::detail::LogFatal(filename_, lineno_).stream() << stream_.str();
       }
@@ -151,7 +155,8 @@ inline void clear_logging(const char* file, int lineno, PackedFunc logging_func)
     logging_func(static_cast<int>(PyLogMessage::Level::CLEAR), file, lineno, "");
   } else {
     // this would clear all logging output in the console
-    runtime::detail::LogMessage(file, lineno).stream() << "\033c\033[3J\033[2J\033[0m\033[H";
+    runtime::detail::LogMessage(file, lineno, TVM_LOG_LEVEL_INFO).stream()
+        << "\033c\033[3J\033[2J\033[0m\033[H";
   }
 }
 
diff --git a/src/runtime/hexagon/hexagon_common.cc b/src/runtime/hexagon/hexagon_common.cc
index ec65dffebe51..79acbf8b99e2 100644
--- a/src/runtime/hexagon/hexagon_common.cc
+++ b/src/runtime/hexagon/hexagon_common.cc
@@ -32,6 +32,7 @@
 #include <vector>
 
 #include "../library_module.h"
+#include "HAP_debug.h"
 #include "HAP_perf.h"
 #include "hexagon_buffer.h"
 
@@ -69,22 +70,22 @@ std::vector<std::string> SplitString(const std::string& str, char delim) {
   }
   return lines;
 }
-void HexagonLog(const std::string& file, int lineno, const std::string& message) {
-  HEXAGON_PRINT(ALWAYS, "INFO: %s:%d:", file.c_str(), lineno);
+void HexagonLog(const std::string& file, int lineno, int level, const std::string& message) {
   std::vector<std::string> err_lines = SplitString(message, '\n');
   for (auto& line : err_lines) {
-    HEXAGON_PRINT(ALWAYS, "INFO: %s", line.c_str());
+    // TVM log levels roughly map to HAP log levels
+    HAP_debug_runtime(level, file.c_str(), lineno, line.c_str());
   }
 }
 }  // namespace
 
 namespace detail {
 [[noreturn]] void LogFatalImpl(const std::string& file, int lineno, const std::string& message) {
-  HexagonLog(file, lineno, message);
+  HexagonLog(file, lineno, TVM_LOG_LEVEL_FATAL, message);
   throw InternalError(file, lineno, message);
 }
-void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
-  HexagonLog(file, lineno, message);
+void LogMessageImpl(const std::string& file, int lineno, int level, const std::string& message) {
+  HexagonLog(file, lineno, level, message);
 }
 }  // namespace detail
 
diff --git a/src/runtime/hexagon/rpc/android_bash.sh.template b/src/runtime/hexagon/rpc/android_bash.sh.template
index d9f7613b0fc0..c45b03818fd3 100644
--- a/src/runtime/hexagon/rpc/android_bash.sh.template
+++ b/src/runtime/hexagon/rpc/android_bash.sh.template
@@ -20,7 +20,8 @@ export LD_LIBRARY_PATH=.
 
 # Enable FARF-based logging for Hexagon code invoked by 'tvm_rpc_android_server'.
 export ADSP_LIBRARY_PATH=`pwd`
-echo 0x1f > tvm_rpc_android.farf
+
+echo <FARF_CONFIG> > tvm_rpc_android.farf
 
 ./tvm_rpc_android server --port=<RPC_SERVER_PORT> --tracker=<RPC_TRACKER_HOST>:<RPC_TRACKER_PORT> --key=<HEXAGON_REMOTE_DEVICE_KEY> >${PWD}/tvm_rpc_android.log 2>&1 &
 
diff --git a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
index f39944615bfd..8daeeff66284 100644
--- a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
@@ -64,7 +64,7 @@ class HexagonIOHandler {
   void MessageStart(size_t message_size_bytes) {}
 
   ssize_t PosixWrite(const uint8_t* buf, size_t write_len_bytes) {
-    LOG(INFO) << "HexagonIOHandler PosixWrite called, write_len_bytes(" << write_len_bytes << ")";
+    LOG(DEBUG) << "HexagonIOHandler PosixWrite called, write_len_bytes(" << write_len_bytes << ")";
     int32_t written_size = write_buffer_.sputn(reinterpret_cast<const char*>(buf), write_len_bytes);
     if (written_size != write_len_bytes) {
       LOG(ERROR) << "written_size(" << written_size << ") != write_len_bytes(" << write_len_bytes
@@ -74,11 +74,11 @@ class HexagonIOHandler {
     return (ssize_t)written_size;
   }
 
-  void MessageDone() { LOG(INFO) << "Message Done."; }
+  void MessageDone() { LOG(DEBUG) << "Message Done."; }
 
   ssize_t PosixRead(uint8_t* buf, size_t read_len_bytes) {
-    LOG(INFO) << "HexagonIOHandler PosixRead called, read_len_bytes(" << read_len_bytes
-              << "), read_buffer_index_(" << read_buffer_index_ << ")";
+    LOG(DEBUG) << "HexagonIOHandler PosixRead called, read_len_bytes(" << read_len_bytes
+               << "), read_buffer_index_(" << read_buffer_index_ << ")";
 
     uint32_t bytes_to_read = 0;
     if (read_buffer_index_ < read_len_bytes) {
@@ -101,9 +101,9 @@ class HexagonIOHandler {
    * \return The status
    */
   AEEResult SetReadBuffer(const uint8_t* data, size_t data_size_bytes) {
-    LOG(INFO) << "HexagonIOHandler SetReadBuffer: data_size_bytes(" << data_size_bytes
-              << "), read_buffer_index_(" << read_buffer_index_ << "), read_buffer_size_bytes_("
-              << read_buffer_size_bytes_ << ")";
+    LOG(DEBUG) << "HexagonIOHandler SetReadBuffer: data_size_bytes(" << data_size_bytes
+               << "), read_buffer_index_(" << read_buffer_index_ << "), read_buffer_size_bytes_("
+               << read_buffer_size_bytes_ << ")";
     if (data_size_bytes > read_buffer_size_bytes_) {
       LOG(ERROR) << "ERROR: data_size_bytes(" << data_size_bytes << ") > read_buffer_size_bytes_("
                  << read_buffer_size_bytes_ << ")";
@@ -123,8 +123,8 @@ class HexagonIOHandler {
    * \return The size of data that is read in bytes.
    */
   int64_t ReadFromWriteBuffer(uint8_t* buf, size_t read_size_bytes) {
-    LOG(INFO) << "HexagonIOHandler ReadFromWriteBuffer called, read_size_bytes: "
-              << read_size_bytes;
+    LOG(DEBUG) << "HexagonIOHandler ReadFromWriteBuffer called, read_size_bytes: "
+               << read_size_bytes;
     int64_t size = (int64_t)write_buffer_.sgetn(reinterpret_cast<char*>(buf), read_size_bytes);
     write_buffer_available_length_ -= size;
 
@@ -135,7 +135,7 @@ class HexagonIOHandler {
     return size;
   }
 
-  void Close() { LOG(INFO) << "HexagonIOHandler Close called"; }
+  void Close() { LOG(DEBUG) << "HexagonIOHandler Close called"; }
 
   void Exit(int code) { exit(code); }
 
@@ -264,7 +264,7 @@ int __QAIC_HEADER(hexagon_rpc_open)(const char* uri, remote_handle64* handle) {
 }
 
 int __QAIC_HEADER(hexagon_rpc_close)(remote_handle64 handle) {
-  LOG(INFO) << __func__;
+  LOG(DEBUG) << __func__;
   if (handle) {
     free(reinterpret_cast<void*>(static_cast<uintptr_t>(handle)));
   }
diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc
index d62051f7cee0..d0ce77c931db 100644
--- a/src/runtime/logging.cc
+++ b/src/runtime/logging.cc
@@ -212,6 +212,13 @@ namespace tvm {
 namespace runtime {
 namespace detail {
 
+const char* ::tvm::runtime::detail::LogMessage::level_strings_[] = {
+    ": Debug: ",    // TVM_LOG_LEVEL_DEBUG
+    ": ",           // TVM_LOG_LEVEL_INFO
+    ": Warning: ",  // TVM_LOG_LEVEL_WARNING
+    ": Error: ",    // TVM_LOG_LEVEL_ERROR
+};
+
 namespace {
 constexpr const char* kSrcPrefix = "/src/";
 // Note: Better would be std::char_traits<const char>::length(kSrcPrefix) but it is not
diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc
index addc3a3e0c11..2b0ee49d7edd 100644
--- a/web/emcc/wasm_runtime.cc
+++ b/web/emcc/wasm_runtime.cc
@@ -73,8 +73,14 @@ namespace detail {
   abort();
 }
 
-void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
-  std::cout << "[INFO] " << file << ":" << lineno << ": " << message << std::endl;
+void LogMessageImpl(const std::string& file, int lineno, int level, const std::string& message) {
+  static const char* level_strings_[] = {
+      "[DEBUG] ",
+      "[INFO] ",
+      "[WARNING] ",
+      "[ERROR] ",
+  };
+  std::cout << level_strings_[level] << file << ":" << lineno << ": " << message << std::endl;
 }
 
 }  // namespace detail

From 3149ee5a73133dd320c8ec259459a4c32b1955d8 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 28 Oct 2022 12:08:37 -0700
Subject: [PATCH 461/704] [skip ci] Remove naut-thomas from .asf.yaml (#13231)

We have to remove and re-add this user to re-trigger the asf invitation

Follow on to #13141
---
 .asf.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.asf.yaml b/.asf.yaml
index 1e4371d594d2..c82d4bfab3dd 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -48,7 +48,6 @@ github:
     - hpanda-naut
     - denise-k
     - janetsc
-    - naut-thomas
     - tvm-bot  # For automated feedback in PR review.
 
   # See https://cwiki.apache.org/confluence/display/INFRA/Git+-+.asf.yaml+features#Git.asf.yamlfeatures-Branchprotection

From 60e865a6fed6eeb5f9840bac005e31870d98e550 Mon Sep 17 00:00:00 2001
From: Alexey Gladyshev <wotpricol@mail.ru>
Date: Fri, 28 Oct 2022 22:23:58 +0300
Subject: [PATCH 462/704] [CI] Enable iOS RPC tests (#13229)

These tests were previously disabled (see https://github.com/apache/tvm/issues/13185). The error was related to updating `MacOS` to version `12.6` and `XCode` update.
This PR fixes the error and re-enables tests for `iOS RPC`.
---
 .github/workflows/main.yml  | 17 ++++++++---------
 python/tvm/contrib/xcode.py |  4 ++--
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6cbb7aa8daf5..b03a1795ef9e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -69,15 +69,14 @@ jobs:
         shell: bash -l {0}
         run: >-
           python -m pytest -v tests/python/all-platform-minimal-test
-      # See https://github.com/apache/tvm/issues/13185
-      # - name: Test iOS RPC
-      #   shell: bash -l {0}
-      #   run: >-
-      #     python -m pip install tornado psutil cloudpickle &&
-      #     export PYTHONPATH=tests/python/contrib:${PYTHONPATH} &&
-      #     export BUNDLE_ID=org.apache.tvmrpc &&
-      #     export BUNDLE_PATH=build-ios-simulator/apps/ios_rpc/ios_rpc/src/ios_rpc-build/Release-iphonesimulator/tvmrpc.app &&
-      #     python -m pytest -v tests/python/contrib/test_rpc_server_device.py
+      - name: Test iOS RPC
+        shell: bash -l {0}
+        run: >-
+          python -m pip install tornado psutil cloudpickle &&
+          export PYTHONPATH=tests/python/contrib:${PYTHONPATH} &&
+          export BUNDLE_ID=org.apache.tvmrpc &&
+          export BUNDLE_PATH=build-ios-simulator/apps/ios_rpc/ios_rpc/src/ios_rpc-build/Release-iphonesimulator/tvmrpc.app &&
+          python -m pytest -v tests/python/contrib/test_rpc_server_device.py
 
   Windows:
     if: ${{ github.repository == 'apache/tvm' }}
diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py
index 6d5e10f611db..236341e1a450 100644
--- a/python/tvm/contrib/xcode.py
+++ b/python/tvm/contrib/xcode.py
@@ -46,9 +46,9 @@ def xcrun(cmd):
 
 
 def __get_min_os_version(sdk):
-    if sdk in ("macosx", "iphonesimulator"):
+    if sdk == "macosx":
         return None
-    if sdk == "iphoneos":
+    if sdk in ("iphoneos", "iphonesimulator"):
         return "13.0"
     raise RuntimeError("Unsupported sdk: %s" % sdk)
 

From 6cd1bb5e89e155d4809680ab28a3e57f33083eb6 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <73191103+jverma-quic@users.noreply.github.com>
Date: Fri, 28 Oct 2022 14:48:28 -0500
Subject: [PATCH 463/704] [Hexagon] Update search pattern to find .so address
 for on device runs (#13230)

* [Hexagon] Update search pattern to find .so address for on device runs

* [Hexagon] Handle OperatorModule in hexagon_profiler
---
 python/tvm/contrib/hexagon/hexagon_profiler.py      | 13 +++++++++++--
 .../contrib/hexagon/profiling/process_lwp_data.py   |  7 +++----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py
index 9a5df3d9b99a..a0eaf2f719d1 100755
--- a/python/tvm/contrib/hexagon/hexagon_profiler.py
+++ b/python/tvm/contrib/hexagon/hexagon_profiler.py
@@ -19,9 +19,11 @@
 
 import os
 import subprocess
+import typing
 from tvm.ir.transform import PassContext
 from tvm.contrib.hexagon.profiling.process_lwp_data import process_lwp_output
 from tvm.relay.backend.executor_factory import ExecutorFactoryModule
+from tvm.driver.build_module import OperatorModule
 from tvm.contrib import utils
 
 
@@ -29,13 +31,20 @@ class HexagonProfiler:
     """Hexagon Profiler"""
 
     def __init__(
-        self, dso_binary: str, module: ExecutorFactoryModule, hexagon_server_process, enable_debug
+        self,
+        dso_binary: str,
+        module: typing.Union[ExecutorFactoryModule, OperatorModule],
+        hexagon_server_process,
+        enable_debug,
     ):
         """Configure HexagonProfiler"""
         # Save test .so to process profiling data
         self._temp_dir = utils.tempdir(keep_for_debug=enable_debug)
         self._dso_binary_path = self._temp_dir.relpath(dso_binary)
-        module.get_lib().save(self._dso_binary_path)
+        if isinstance(module, OperatorModule):
+            module.save(self._dso_binary_path)
+        else:
+            module.get_lib().save(self._dso_binary_path)
 
         self._android_serial_number = os.environ.get("ANDROID_SERIAL_NUMBER")
         self._remote_path = ""
diff --git a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
index eb92228b7cf3..7fccfbd09636 100644
--- a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
+++ b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
@@ -290,7 +290,7 @@ def process_data(data, func_info, so_ld_addr):
     return overall_cycles
 
 
-def get_load_addr(binary_path: str, serial_number: str, lwp_json: str, run_log: str):
+def get_load_addr(serial_number: str, lwp_json: str, run_log: str):
     """Get load address of the binary file"""
     if serial_number == "simulator":
         basedir = os.path.dirname(lwp_json)
@@ -308,8 +308,7 @@ def get_load_addr(binary_path: str, serial_number: str, lwp_json: str, run_log:
         pattern = compile(r"Model.*: (\w+):")
     else:
         # To extract load address for on-device run
-        binary_name = os.path.basename(binary_path)
-        pattern = compile(r"{}, len \w+, laddr (\w+)".format(binary_name))
+        pattern = compile(r"Model.*: (\w+)")
 
     with open(run_log, "r") as f:
         lines = f.read()
@@ -334,7 +333,7 @@ def process_lwp_output(
     ENABLE_DEBUG = enable_debug
 
     # Get load address for the binary
-    load_addr = get_load_addr(binary_path, serial_number, lwp_json, run_log)
+    load_addr = get_load_addr(serial_number, lwp_json, run_log)
     # Opening JSON file
     with open(lwp_json, "r") as f:
         # Returns JSON object as a dictionary

From b9e5c02d0b8cf148e7e07690397c098a02228cee Mon Sep 17 00:00:00 2001
From: Gayatri P K <quic_gpk@quicinc.com>
Date: Sat, 29 Oct 2022 02:06:47 +0530
Subject: [PATCH 464/704] [TOPI][Hexagon] Implement quantized depthwise conv2d
 (#12499)

* [TOPI][Hexagon] Implement quantized depthwise conv2d

* Fix lint errors

* Fix lint error

* Fix lint errors
---
 python/tvm/topi/hexagon/qnn/__init__.py       |   1 +
 .../hexagon/qnn/qdepthwise_conv2d_slice.py    | 217 +++++++++++
 python/tvm/topi/hexagon/slice_ops/dwconv2d.py |   5 +-
 python/tvm/topi/hexagon/utils.py              |  19 +
 .../topi/test_depthwise_conv2d_slice.py       | 337 ++++++++++++++++++
 .../test_hexagon/topi/test_dwconv2d_slice.py  | 314 ----------------
 6 files changed, 577 insertions(+), 316 deletions(-)
 create mode 100644 python/tvm/topi/hexagon/qnn/qdepthwise_conv2d_slice.py
 create mode 100644 tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d_slice.py
 delete mode 100644 tests/python/contrib/test_hexagon/topi/test_dwconv2d_slice.py

diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
index bafc6846b6fb..f7a018d2257a 100644
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -26,3 +26,4 @@
 
 from .quantize import quantize_compute, tir_quantize_schedule
 from .nn import *
+from .qdepthwise_conv2d_slice import qdepthwise_conv2d_compute, qdepthwise_conv2d_schedule
diff --git a/python/tvm/topi/hexagon/qnn/qdepthwise_conv2d_slice.py b/python/tvm/topi/hexagon/qnn/qdepthwise_conv2d_slice.py
new file mode 100644
index 000000000000..9a275c1cc370
--- /dev/null
+++ b/python/tvm/topi/hexagon/qnn/qdepthwise_conv2d_slice.py
@@ -0,0 +1,217 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals
+"""
+Please note the following assumptions made by the implementation:
+1) The input must be padded in advance to account for 'padding'. In addition,
+   both input and output must be padded as per the physical buffer layout.
+2) 'padding' is ignored. It must be handled outside of the sliced op.
+3) The weights are expected to be as per physical layout
+
+The initial compute for quantized depthwise conv2d is as follows
+where cm = channel_multiplier; assumed to be 1,
+zp_a = Activation_zero_point,
+zp_w = Weight_zero_point,
+Qa = Quantized Activation,
+Qw = Quantized Weights.
+
+     a) Qc(n, oh, ow, oc) = (Sigma(r, s) (Qw(r, s, oc%cm, oc/cm) - zp_w)
+                                      * (Qa(n, oh + r, ow + s, oc/cm) - zp_a))
+                                      * scale_value
+        where scale_value = (activation_scale * weight_scale) / output_scale
+
+        This can be written as
+
+     b) Qc(n, oh, ow, oc) = (t1 - t2 - t3 + t4) * scale_value
+
+        where t1 = Sigma(r, s) Qw(r, s, oc%cm, oc/cm) * Qa(n, oh + r, ow + s, oc/cm)
+              t2 = Sigma(r, s) zp_w * Qa(n, oh + r, ow + s, oc/cm)
+              t3 = Sigma(r, s) zp_a * Qw(r, s, oc%cm, oc/cm)
+              t4 = Sigma(r, s) zp_a * zp_w
+
+     c) Qc(n, oh, ow, oc) = saturate(((t1 - t2 - t3 + t4) * fixed_scale_value)) >> rsh)
+
+        where fixed_scale_value, rsh are fixed point values for scale_value.
+
+
+Compute and schedule for quantized depthwise conv2d slice op"""
+
+import typing
+import tvm
+from tvm import te
+from ..utils import get_layout_transform_fn, get_fixed_point_value, saturate
+
+
+def qdepthwise_conv2d_compute(
+    activations: te.Tensor,
+    weights: te.Tensor,
+    out_shape: typing.Tuple,
+    stride: typing.Tuple,
+    dilation: typing.Tuple,
+    dtype: str,
+    # quantization params:
+    activation_zero_point,
+    activation_scale,
+    weight_zero_point,
+    weight_scale,
+    output_zero_point,
+    output_scale,
+):
+    """Compute for quantized depthwise conv2d"""
+    filt_shape = weights.shape
+    ob, oh, ow, oc = out_shape
+
+    if dtype == "uint8":
+        temp_dtype = "int32"
+        big_dtype = "int64"
+    elif dtype == "int8":
+        temp_dtype = "int32"
+        big_dtype = "int64"
+    else:
+        raise RuntimeError(f"Unsupported output dtype, {odtype}'")
+
+    reduce_height = tvm.te.reduce_axis((0, filt_shape[0]), name="reduce_height")
+    reduce_width = tvm.te.reduce_axis((0, filt_shape[1]), name="reduce_width")
+    stride_height, stride_width = stride
+    dilation_height, dilation_width = dilation
+
+    scale_value = (activation_scale * weight_scale) / output_scale
+    fixed_scale_value, rsh = get_fixed_point_value(scale_value, "int16")
+
+    t1 = tvm.te.compute(
+        out_shape,
+        lambda n, h, w, c: tvm.te.sum(
+            (
+                (
+                    activations[
+                        n,
+                        h * stride_height + reduce_height * dilation_height,
+                        w * stride_width + reduce_width * dilation_width,
+                        c,
+                    ].astype(temp_dtype)
+                )
+                * (weights[reduce_height, reduce_width, 0, c].astype(temp_dtype))
+            ).astype(temp_dtype),
+            axis=[reduce_height, reduce_width],
+        ),
+        name="t1",
+    )
+
+    t2 = tvm.te.compute(
+        out_shape,
+        lambda n, h, w, c: tvm.te.sum(
+            (
+                (
+                    activations[
+                        n,
+                        h * stride_height + reduce_height * dilation_height,
+                        w * stride_width + reduce_width * dilation_width,
+                        c,
+                    ].astype(temp_dtype)
+                )
+                * weight_zero_point
+            ).astype(temp_dtype),
+            axis=[reduce_height, reduce_width],
+        ),
+        name="t2",
+    )
+
+    t3 = tvm.te.compute(
+        (oc,),
+        lambda c: tvm.te.sum(
+            (
+                ((weights[reduce_height, reduce_width, 0, c].astype(temp_dtype)))
+                * activation_zero_point
+            ).astype(temp_dtype),
+            axis=[reduce_height, reduce_width],
+        ),
+        name="t3",
+    )
+
+    t4 = activation_zero_point * weight_zero_point * reduce_height * reduce_width
+
+    output = tvm.te.compute(
+        out_shape,
+        lambda n, h, w, c: saturate(
+            (
+                (
+                    (
+                        ((t1[n, h, w, c]).astype(big_dtype) - t2[n, h, w, c] - t3[c] + t4)
+                        * fixed_scale_value
+                    )
+                    >> rsh
+                )
+                + (output_zero_point).astype(big_dtype)
+            ),
+            dtype,
+        ).astype(dtype),
+        name="output",
+    )
+
+    return output
+
+
+def qdepthwise_conv2d_schedule(
+    outs: te.Tensor,
+    ins: typing.List[te.Tensor],
+    transform_activation_layout: str,
+    transform_weights: str,
+):
+    """
+    Schedule for quantized depthwise conv2d for input layout nhwc-8h8w32c
+    assert len(ins) == 2, "This schedule expects only 2 inputs - Activations and Weights
+    """
+    source_expr = ins + [outs]
+    prim_func = tvm.te.create_prim_func(source_expr)
+    sch = tvm.tir.Schedule(prim_func)
+
+    compute = sch.get_block("output")
+    compute1 = sch.get_block("t1")
+
+    transform_layout_fn = get_layout_transform_fn(transform_activation_layout)
+    transform_layout_weights = get_layout_transform_fn(transform_weights)
+
+    # Apply layout_transform for activation
+    sch.transform_layout(compute1, ins[0].name, transform_layout_fn)
+
+    # Apply layout_transform for weights
+    sch.transform_layout(compute1, ins[1].name, transform_layout_weights)
+
+    # Apply layout_transform for output
+    sch.transform_layout(compute, outs.name, transform_layout_fn)
+
+    # This returns the original 6d loop
+    batch, height, width, channel, reduce_height, reduce_width = sch.get_loops(compute1)
+    h_outer, h_inner = sch.split(height, [None, 8])
+    w_outer, w_inner = sch.split(width, [None, 8])
+    c_outer, c_inner = sch.split(channel, [None, 32])
+    sch.reorder(
+        batch,
+        h_outer,
+        w_outer,
+        c_outer,
+        h_inner,
+        reduce_height,
+        reduce_width,
+        w_inner,
+        c_inner,
+    )
+
+    sch.decompose_reduction(compute1, reduce_height)
+    # wi_ci = sch.fuse(w_inner,c_inner)
+    # sch.vectorize(wi_ci)
+    return sch
diff --git a/python/tvm/topi/hexagon/slice_ops/dwconv2d.py b/python/tvm/topi/hexagon/slice_ops/dwconv2d.py
index 698495daf1b7..d22dc02a5c1b 100644
--- a/python/tvm/topi/hexagon/slice_ops/dwconv2d.py
+++ b/python/tvm/topi/hexagon/slice_ops/dwconv2d.py
@@ -85,7 +85,7 @@ def dwconv2d_schedule(
     outs: te.Tensor,
     ins: typing.List[te.Tensor],
     transform_activation_layout: str,
-    transform_weights: typing.Callable,
+    transform_weights: str,
 ) -> tvm.tir.Schedule:
     """STIR schedule definition for the compute defined above by dwconv2d_compute.
         - Auto-generated prim_func before applying schedule primitives for reference
@@ -128,11 +128,12 @@ def main(InputTensor: T.Buffer[(1, 16, 8, 32), "float16"], Weights: T.Buffer[(3,
     sch = tvm.tir.Schedule(prim_func)
     compute = sch.get_block("Output")
     transform_layout_fn = get_layout_transform_fn(transform_activation_layout)
+    transform_layout_weights = get_layout_transform_fn(transform_weights)
     # Apply layout_transform for activation
     sch.transform_layout(compute, ins[0].name, transform_layout_fn)
 
     # Apply layout_transform for weights
-    sch.transform_layout(compute, ins[1].name, transform_weights)
+    sch.transform_layout(compute, ins[1].name, transform_layout_weights)
 
     # Apply layout_transform for output
     sch.transform_layout(compute, outs.name, transform_layout_fn)
diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py
index dab9aa3f74ab..890ebeb9fd11 100644
--- a/python/tvm/topi/hexagon/utils.py
+++ b/python/tvm/topi/hexagon/utils.py
@@ -127,6 +127,10 @@ def iohw_16i32o2i_1d(height, width, in_channel, out_channel):
     ]
 
 
+def ohwi32o_1d(height, width, in_channel, out_channel):
+    return [out_channel // 32, height, width, in_channel, out_channel % 32]
+
+
 def get_layout_transform_fn(layout):
     """Return index map function as per the layout string"""
     if layout == "nhwc-8h2w32c2w-2d":
@@ -167,6 +171,8 @@ def get_layout_transform_fn(layout):
         return nhwc_8h8w32c_2d
     if layout == "n11c-2048c-2d":
         return n11c_2048c_2d
+    if layout == "ohwi32o-1d":
+        return ohwi32o_1d
     raise RuntimeError(f"Unexpected layout '{layout}'")
 
 
@@ -235,6 +241,19 @@ def get_fixed_point_value(flp: float, dtype: str = "int16") -> Tuple[int, int]:
     best scaling factor for 'int16' type that can be used to convert the floating-point value to
     fixed-point with the least amount of precision loss.
 
+
+    Here is a more rigorous explanation of the above, for non-negative scale values, which are of
+    interest. M < 2, so M * 2^(E-Bias+x) < 2 ^ (E-Bias+x+1)   [Note: LHS is a fraction, RHS int]
+    => round(M * 2^(E-Bias+x)) <= 2 ^ (E-Bias+x+1)  [Note the "<=", not "<"]
+    We want x s.t. round(M * 2^(E-Bias+x)) <= 2^15 - 1
+    We know round(M * 2^(E-Bias+x)) <= 2^(E-Bias+x+1)
+    It will be sufficient to choose x s.t. 2^(E-Bias+x+1) <= 2^15 - 1
+    That is, max x. s.t. 2^(E-Bias+x+1) < 2^15
+    E-Bias+x+1 < 15
+    E-Bias+x+1 <= 14
+    Max x will make E-Bias+x+1 = 14
+    x = 13 - E + Bias
+
     Additonal notes on various floating-point values:
     ------------------------------------------------
     1) Denormalized values: causes assertion failure. The problem with the denormalized values
diff --git a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d_slice.py
new file mode 100644
index 000000000000..840a462917ae
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d_slice.py
@@ -0,0 +1,337 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-variable, unused-argument, disable=line-too-long, redefined-outer-name
+
+"""Test depthwise_conv2d slice op for hexagon."""
+
+import numpy as np
+import tvm
+import tvm.testing
+import tvm.topi.hexagon.qnn as qn
+from tvm.topi.testing import depthwise_conv2d_python_nhwc
+from tvm.topi.hexagon.slice_ops.dwconv2d import dwconv2d_compute, dwconv2d_schedule
+from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
+
+
+@tvm.testing.fixture
+def input_np(in_shape, dtype, low, high):
+    if dtype in ("uint8"):
+        return np.random.uniform(low=low, high=high, size=in_shape).astype("float32")
+    if dtype in ("int8"):
+        return np.random.uniform(low=-low, high=high, size=in_shape).astype("float32")
+    return np.random.uniform(size=in_shape).astype(dtype)
+
+
+@tvm.testing.fixture
+def input_np_padded(input_np, in_shape, padded_in_shape):
+    pad_height = padded_in_shape[1] - in_shape[1]
+    pad_width = padded_in_shape[2] - in_shape[2]
+    pad_channel = padded_in_shape[3] - in_shape[3]
+    input_padded = np.pad(
+        input_np, ((0, 0), (0, pad_height), (0, pad_width), (0, pad_channel)), "constant"
+    )
+    return input_padded
+
+
+@tvm.testing.fixture
+def in_out_layout(dtype):
+    if dtype == "float16":
+        return "nhwc-8h2w32c2w-2d"
+    elif dtype in ("uint8", "int8"):
+        return "nhwc-8h8w32c-2d"
+    else:
+        raise RuntimeError(f"Unsupported quantized data type '{dtype}'")
+
+
+@tvm.testing.fixture
+def expected_output_np(input_np, dilated_weights_np, stride, dtype):
+    dilated_weights_np_t = dilated_weights_np.transpose(0, 1, 3, 2)
+    ref_type = dtype
+    if dtype in ("uint8", "int8"):
+        # for quantized versions, return float32 output
+        ref_type = "float32"
+    ref_np = depthwise_conv2d_python_nhwc(
+        input_np.astype("float32"), dilated_weights_np_t.astype("float32"), stride, padding=0
+    ).astype(ref_type)
+    return ref_np
+
+
+@tvm.testing.fixture
+def transformed_expected_output_np(expected_output_np, in_out_layout, dtype):
+    if dtype == "float16":
+        return transform_numpy(expected_output_np, "nhwc", in_out_layout)
+    elif dtype in ("uint8", "int8"):
+        quant_arr, scale, zero_point = quantize_np(expected_output_np, dtype)
+        return [transform_numpy(quant_arr, "nhwc", in_out_layout), scale, zero_point]
+    else:
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+
+@tvm.testing.fixture
+def transformed_input_np_padded(input_np_padded, in_out_layout, dtype):
+    if dtype == "float16":
+        return transform_numpy(input_np_padded, "nhwc", in_out_layout)
+    if dtype in ("uint8", "int8"):
+        quant_arr, scale, zero_point = quantize_np(input_np_padded, dtype)
+        return [transform_numpy(quant_arr, "nhwc", in_out_layout), scale, zero_point]
+    raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+
+@tvm.testing.fixture
+def weights_np(filt_shape, dtype):
+    if dtype == "float16":
+        return np.random.uniform(size=filt_shape).astype(dtype)
+    elif dtype in ("uint8", "int8"):
+        weight_arr = np.random.uniform(low=-5, high=5, size=filt_shape).astype("float32")
+        return weight_arr
+    else:
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+
+@tvm.testing.fixture
+def dilated_filt_shape(filt_shape, dilation):
+    """Compute the dilated filter shape when dilation > 1"""
+    filt_height, filt_width, in_channel, out_channel = filt_shape
+    dilation_height, dilation_width = dilation
+    if dilation_height == 1 and dilation_width == 1:
+        return filt_shape
+    dilated_height = dilation_height * (filt_height - 1) + 1
+    dilated_width = dilation_width * (filt_width - 1) + 1
+    return dilated_height, dilated_width, in_channel, out_channel
+
+
+@tvm.testing.fixture
+def dilated_weights_np(weights_np, dilation, dilated_filt_shape, dtype):
+    """Get dilated weights from original weights for testing"""
+    if dtype in ["int8", "uint8"]:
+        dtype = "float32"
+    filt_height, filt_width, in_channels, out_channels = weights_np.shape
+    dilated_weights = np.zeros(dilated_filt_shape)
+    dilation_height, dilation_width = dilation
+    if dilation_height == 1 and dilation_width == 1:
+        return weights_np
+    dilated_height, dilated_width = dilated_filt_shape[0], dilated_filt_shape[1]
+    for in_channel in range(in_channels):
+        for out_channel in range(out_channels):
+            for dilation_i, height_i in zip(
+                range(0, dilated_height, dilation_height), range(filt_height)
+            ):
+                for dilation_j, width_j in zip(
+                    range(0, dilated_width, dilation_width), range(filt_width)
+                ):
+                    dilated_weights[dilation_i, dilation_j, in_channel, out_channel] = weights_np[
+                        height_i, width_j, in_channel, out_channel
+                    ]
+    return dilated_weights
+
+
+@tvm.testing.fixture
+def transformed_weights_np(weights_np, dtype):
+    height, width, in_channel, out_channel = weights_np.shape
+    t = weights_np.reshape([height, width, in_channel, out_channel // 32, 32]).transpose(
+        3, 0, 1, 2, 4
+    )
+    if dtype == "float16":
+        return t
+    if dtype in ("uint8", "int8"):
+        quant_arr, scale, zero_point = quantize_np(t, dtype)
+        return [quant_arr, scale, zero_point]
+    raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+
+def generate_test_config(test_params):
+    """Utility function to generate test config with meaningful ids"""
+    test_config = {}
+
+    dims = lambda vals: "x".join(map(str, vals))
+
+    for param in test_params:
+        in_shape, filt_shape, stride, dilation = param[:4]
+        test_name = f"nhwc{dims(in_shape)}-hwio{dims(filt_shape)}-stride{dims(stride)}-dilation{dims(dilation)}"
+        test_config[test_name] = param
+
+    return test_config
+
+
+class Testdwconv2dSlice:
+    """Test class that defines the dwconv2d slice test"""
+
+    test_params = [
+        [(1, 10, 10, 32), (3, 3, 1, 32), (1, 1), (1, 1), 0.0, 10.0],
+        [(1, 10, 10, 64), (3, 3, 1, 64), (1, 1), (1, 1), 0.0, 10.0],
+        [(1, 12, 12, 32), (5, 5, 1, 32), (1, 1), (1, 1), 0.0, 20.0],
+        [(1, 16, 16, 32), (5, 5, 1, 32), (1, 1), (2, 2), 0.0, 1.0],
+        [(1, 18, 10, 32), (3, 3, 1, 32), (1, 1), (1, 1), 0.0, 10.0],
+        [(1, 18, 18, 32), (3, 3, 1, 32), (2, 2), (1, 1), 0.0, 10.0],
+        [(1, 18, 10, 96), (3, 3, 1, 96), (1, 1), (1, 1), 0.0, 10.0],
+        [(1, 21, 21, 32), (7, 7, 1, 32), (2, 2), (1, 1), 0.0, 10.0],
+        [(1, 28, 28, 32), (7, 7, 1, 32), (2, 2), (2, 2), 0.0, 10.0],
+        [(1, 28, 28, 96), (7, 7, 1, 96), (2, 2), (2, 2), 0.0, 10.0],
+        [(1, 10, 16, 32), (3, 1, 1, 32), (1, 1), (1, 1), 0.0, 10.0],
+    ]
+
+    test_config = generate_test_config(test_params)
+
+    in_shape, filt_shape, stride, dilation, low, high = tvm.testing.parameters(
+        *test_config.values(), ids=test_config.keys()
+    )
+    dtype = tvm.testing.parameter("float16", "uint8")
+    working_scope = tvm.testing.parameter("global.vtcm")
+    weights_layout = tvm.testing.parameter("ohwi32o-1d")
+
+    @tvm.testing.fixture
+    def padded_in_shape(self, in_shape, dtype):
+        """Padding the input shape according to layout"""
+        # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and
+        # for int8/uint8, it's nhwc-8h8w32c-2d.
+        # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple
+        # of 8. However, the width should be a multiple of 4 for the first case and 8 for
+        # the second case.
+        in_batch, in_height, in_width, in_channel = in_shape
+        in_height = ((in_height + 7) // 8) * 8
+
+        if dtype == "float16":
+            in_width = ((in_width + 3) // 4) * 4
+        elif dtype in ("uint8", "int8"):
+            in_width = ((in_width + 7) // 8) * 8
+
+        in_channel = ((in_channel + 31) // 32) * 32
+
+        return in_batch, in_height, in_width, in_channel
+
+    @tvm.testing.fixture
+    def out_shape(self, in_shape, dilated_filt_shape, stride):
+        in_batch, in_height, in_width, _ = in_shape
+        filt_height, filt_width, _, num_filt = dilated_filt_shape
+        out_height = (in_height - filt_height) // stride[0] + 1
+        out_width = (in_width - filt_width) // stride[1] + 1
+        out_channel = num_filt
+        return in_batch, out_height, out_width, out_channel
+
+    @tvm.testing.requires_hexagon
+    def test_dwconv2d(
+        self,
+        dtype,
+        in_out_layout,
+        weights_layout,
+        padded_in_shape,
+        weights_np,
+        filt_shape,
+        stride,
+        dilation,
+        out_shape,
+        input_np,
+        input_np_padded,
+        transformed_weights_np,
+        expected_output_np,
+        target,
+        working_scope,
+        transformed_input_np_padded,
+        transformed_expected_output_np,
+        hexagon_session,
+    ):
+        """Main test function that tests the dwconv2d slice op"""
+        input_tensor = tvm.te.placeholder(padded_in_shape, name="InputTensor", dtype=dtype)
+        weights = tvm.te.placeholder(filt_shape, name="Weights", dtype=dtype)
+
+        target_hexagon = tvm.target.hexagon("v69")
+        target = tvm.target.Target(target_hexagon, host=target_hexagon)
+        # Construct compute and schedule based on dtype
+        if dtype in ("uint8", "int8"):
+            in_data_np, activation_scale, activation_zero_point = transformed_input_np_padded
+            (
+                weights_data_np,
+                weight_scale,
+                weight_zero_point,
+            ) = transformed_weights_np
+            out_data_np, output_scale, output_zero_point = transformed_expected_output_np
+
+            output_tensor = qn.qdepthwise_conv2d_compute(
+                input_tensor,
+                weights,
+                out_shape,
+                stride,
+                dilation,
+                dtype,
+                activation_zero_point,
+                activation_scale,
+                weight_zero_point,
+                weight_scale,
+                output_zero_point,
+                output_scale,
+            )
+
+            tir_schedule = qn.qdepthwise_conv2d_schedule(
+                output_tensor, [input_tensor, weights], in_out_layout, weights_layout
+            )
+
+        elif dtype == "float16":
+            in_data_np = transformed_input_np_padded
+            out_data_np = transformed_expected_output_np
+            weights_data_np = transformed_weights_np
+            output_tensor = dwconv2d_compute(
+                input_tensor, weights, out_shape, stride, dilation, dtype
+            )
+
+            tir_schedule = dwconv2d_schedule(
+                output_tensor, [input_tensor, weights], in_out_layout, weights_layout
+            )
+        else:
+            raise RuntimeError(f"Unsupport dtype '{dtype}'")
+
+        func_name = "depthwise_conv2d_slice"
+        with tvm.transform.PassContext(opt_level=3):
+            runtime_module = tvm.build(
+                tir_schedule.mod,
+                [input_tensor, output_tensor],
+                target=target,
+                name=func_name,
+            )
+
+        input_arr = allocate_hexagon_array(
+            hexagon_session.device,
+            data=in_data_np,
+            axis_separators=[4],
+            mem_scope=working_scope,
+        )
+
+        weights_arr = allocate_hexagon_array(
+            hexagon_session.device, data=weights_data_np, mem_scope=working_scope
+        )
+
+        output_arr = allocate_hexagon_array(
+            hexagon_session.device,
+            out_data_np.shape,
+            dtype=dtype,
+            axis_separators=[4],
+            mem_scope=working_scope,
+        )
+
+        mod = hexagon_session.load_module(runtime_module)
+        mod(input_arr, weights_arr, output_arr)
+        n, h, w, c = out_shape
+
+        if dtype in ("uint8", "int8"):
+            output_np = output_arr.numpy().reshape([n, h // 8, w // 8, c // 32, 8, 8, 32])
+            np.testing.assert_allclose(output_np, out_data_np, atol=3, rtol=0.02)
+        elif dtype == "float16":
+            output_np = output_arr.numpy()
+            np.testing.assert_allclose(output_np, out_data_np, atol=0.01, rtol=0.01)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/test_dwconv2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_dwconv2d_slice.py
deleted file mode 100644
index 3e43718afd8d..000000000000
--- a/tests/python/contrib/test_hexagon/topi/test_dwconv2d_slice.py
+++ /dev/null
@@ -1,314 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=line-too-long, redefined-outer-name
-
-"""Test dwconv2d slice op for hexagon. Input layout is always nhwc"""
-
-import numpy as np
-
-import tvm
-import tvm.testing
-
-from tvm.topi.testing import depthwise_conv2d_python_nhwc
-from tvm.topi.hexagon.slice_ops.dwconv2d import dwconv2d_compute, dwconv2d_schedule
-
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
-
-
-@tvm.testing.fixture
-def input_np(in_shape, dtype):
-    return np.random.uniform(size=in_shape).astype(dtype)
-
-
-@tvm.testing.fixture
-def weights_np(filt_shape, dtype):
-    return (np.random.uniform(size=filt_shape)).astype(dtype)
-
-
-@tvm.testing.fixture
-def dilated_filt_shape(filt_shape, dilation):
-    """Compute the dilated filter shape when dilation > 1"""
-    filt_height, filt_width, in_channel, out_channel = filt_shape
-    dilation_height, dilation_width = dilation
-    if dilation_height == 1 and dilation_width == 1:
-        return filt_shape
-    dilated_height, dilated_width = (
-        dilation_height * (filt_height - 1) + 1,
-        dilation_width * (filt_width - 1) + 1,
-    )
-    return dilated_height, dilated_width, in_channel, out_channel
-
-
-@tvm.testing.fixture
-def dilated_weights_np(weights_np, dilation, dilated_filt_shape):
-    """Get dilated weights from original weights for testing"""
-    filt_height, filt_width, in_channels, out_channels = weights_np.shape
-    dilation_height, dilation_width = dilation
-    if dilation_height == 1 and dilation_width == 1:
-        return weights_np
-    dilated_height, dilated_width = dilated_filt_shape[0], dilated_filt_shape[1]
-    dilated_weights = np.zeros(dilated_filt_shape, dtype="float16")
-    for in_channel in range(in_channels):
-        for out_channel in range(out_channels):
-            for dilation_i, height_i in zip(
-                range(0, dilated_height, dilation_height), range(filt_height)
-            ):
-                for dilation_j, width_j in zip(
-                    range(0, dilated_width, dilation_width), range(filt_width)
-                ):
-                    dilated_weights[dilation_i, dilation_j, in_channel, out_channel] = weights_np[
-                        height_i, width_j, in_channel, out_channel
-                    ]
-
-    return dilated_weights
-
-
-@tvm.testing.fixture
-def input_np_padded(input_np, in_shape, padded_in_shape):
-    pad_height = padded_in_shape[1] - in_shape[1]
-    pad_width = padded_in_shape[2] - in_shape[2]
-    pad_channel = padded_in_shape[3] - in_shape[3]
-    input_padded = np.pad(
-        input_np, ((0, 0), (0, pad_height), (0, pad_width), (0, pad_channel)), "constant"
-    )
-    return input_padded
-
-
-@tvm.testing.fixture
-def weights_np_transformed(weights_np):
-    height, width, in_channel, out_channel = weights_np.shape
-    return weights_np.reshape([height, width, in_channel, out_channel // 32, 32]).transpose(
-        3, 0, 1, 2, 4
-    )
-
-
-def generate_test_config(test_params):
-    """Utility function to generate test config with meaningful ids"""
-    test_config = {}
-
-    dims = lambda vals: "x".join(map(str, vals))
-
-    for param in test_params:
-        in_shape, filt_shape, stride, dilation = param
-        test_name = f"nhwc{dims(in_shape)}-hwio{dims(filt_shape)}-stride{dims(stride)}-dilation{dims(dilation)}"
-        test_config[test_name] = param
-
-    return test_config
-
-
-class Testdwconv2dSlice:
-    """Test class that defines the dwconv2d slice test"""
-
-    test_params = [
-        [
-            (1, 10, 6, 32),
-            (3, 3, 1, 32),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 18, 10, 32),
-            (3, 3, 1, 32),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 10, 6, 64),
-            (3, 3, 1, 64),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 12, 8, 32),
-            (3, 3, 1, 32),
-            (1, 1),
-            (2, 2),
-        ],
-        [
-            (1, 12, 8, 32),
-            (5, 5, 1, 32),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 16, 12, 32),
-            (5, 5, 1, 32),
-            (1, 1),
-            (2, 2),
-        ],
-        [
-            (1, 13, 9, 32),
-            (6, 6, 1, 32),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 18, 10, 32),
-            (3, 3, 1, 32),
-            (2, 2),
-            (1, 1),
-        ],
-        [
-            (1, 18, 10, 96),
-            (3, 3, 1, 96),
-            (2, 2),
-            (1, 1),
-        ],
-        [
-            (1, 20, 12, 32),
-            (5, 5, 1, 32),
-            (2, 2),
-            (1, 1),
-        ],
-        [
-            (1, 22, 14, 32),
-            (7, 7, 1, 32),
-            (2, 2),
-            (1, 1),
-        ],
-        [
-            (1, 28, 20, 32),
-            (7, 7, 1, 32),
-            (2, 2),
-            (2, 2),
-        ],
-        [
-            (1, 28, 20, 96),
-            (7, 7, 1, 96),
-            (2, 2),
-            (2, 2),
-        ],
-        [
-            (1, 10, 4, 32),
-            (3, 1, 1, 32),
-            (1, 1),
-            (1, 1),
-        ],
-        [
-            (1, 18, 8, 32),
-            (3, 1, 1, 32),
-            (2, 2),
-            (1, 1),
-        ],
-        [
-            (1, 20, 8, 32),
-            (3, 1, 1, 32),
-            (2, 2),
-            (2, 2),
-        ],
-    ]
-    test_config = generate_test_config(test_params)
-
-    in_shape, filt_shape, stride, dilation = tvm.testing.parameters(
-        *test_config.values(), ids=test_config.keys()
-    )
-    dtype = tvm.testing.parameter("float16")
-    working_scope = tvm.testing.parameter("global.vtcm")
-    in_out_layout = tvm.testing.parameter("nhwc-8h2w32c2w-2d")
-
-    @tvm.testing.fixture
-    def padded_in_shape(self, in_shape):
-        in_batch, in_height, in_width, in_channel = in_shape
-        in_height = ((in_height + 7) // 8) * 8
-        in_width = ((in_width + 3) // 4) * 4
-        in_channel = ((in_channel + 31) // 32) * 32
-        return in_batch, in_height, in_width, in_channel
-
-    @tvm.testing.fixture
-    def out_shape(self, in_shape, dilated_filt_shape, stride):
-        in_batch, in_height, in_width, _ = in_shape
-        filt_height, filt_width, _, num_filt = dilated_filt_shape
-        out_height = (in_height - filt_height) // stride[0] + 1
-        out_width = (in_width - filt_width) // stride[1] + 1
-        out_channel = num_filt
-        return in_batch, out_height, out_width, out_channel
-
-    @tvm.testing.fixture
-    def expected_output_np(self, input_np, dilated_weights_np, stride):
-        dilated_weights_np_t = dilated_weights_np.transpose(0, 1, 3, 2)
-        ref_np = depthwise_conv2d_python_nhwc(
-            input_np.astype("float32"), dilated_weights_np_t.astype("float32"), stride, padding=0
-        ).astype("float16")
-        return ref_np
-
-    @tvm.testing.requires_hexagon
-    def test_dwconv2d(
-        self,
-        padded_in_shape,
-        filt_shape,
-        stride,
-        dilation,
-        dtype,
-        out_shape,
-        in_out_layout,
-        input_np_padded,
-        weights_np_transformed,
-        expected_output_np,
-        working_scope,
-        hexagon_session,
-    ):
-        """Main test function that tests the dwconv2d slice op"""
-        input_tensor = tvm.te.placeholder(padded_in_shape, name="InputTensor", dtype=dtype)
-        weights = tvm.te.placeholder(filt_shape, name="Weights", dtype=dtype)
-
-        output_tensor = dwconv2d_compute(input_tensor, weights, out_shape, stride, dilation, dtype)
-
-        def transform_weights(height, width, in_channel, out_channel):
-            return [out_channel // 32, height, width, in_channel, out_channel % 32]
-
-        tir_schedule = dwconv2d_schedule(
-            output_tensor, [input_tensor, weights], in_out_layout, transform_weights
-        )
-
-        func_name = f"fdwconv2d_{dtype}"
-        with tvm.transform.PassContext(opt_level=3, config={"tir.disable_assert": True}):
-            runtime_module = tvm.build(
-                tir_schedule.mod,
-                target=get_hexagon_target("v69"),
-                name=func_name,
-            )
-
-        input_np_transformed = transform_numpy(input_np_padded, "nhwc", in_out_layout)
-        output_np_transformed = transform_numpy(expected_output_np, "nhwc", in_out_layout)
-
-        input_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            data=input_np_transformed,
-            axis_separators=[4],
-            mem_scope=working_scope,
-        )
-
-        weights_arr = allocate_hexagon_array(
-            hexagon_session.device, data=weights_np_transformed, mem_scope=working_scope
-        )
-
-        output_arr = allocate_hexagon_array(
-            hexagon_session.device,
-            tensor_shape=output_np_transformed.shape,
-            dtype=output_np_transformed.dtype,
-            axis_separators=[4],
-            mem_scope=working_scope,
-        )
-
-        mod = hexagon_session.load_module(runtime_module)
-        mod(input_arr, weights_arr, output_arr)
-        output_np = output_arr.numpy()
-        np.testing.assert_allclose(output_np, output_np_transformed, atol=0.01, rtol=0.01)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()

From 20082614be85f14a7d6f4e653f2adcd89d0a3717 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 28 Oct 2022 13:51:21 -0700
Subject: [PATCH 465/704] [skip ci] Revert "[skip ci] Remove naut-thomas from
 .asf.yaml (#13231)" (#13232)

This reverts commit 3149ee5a73133dd320c8ec259459a4c32b1955d8.

This is needed to re-send the invitation to the repo
---
 .asf.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.asf.yaml b/.asf.yaml
index c82d4bfab3dd..1e4371d594d2 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -48,6 +48,7 @@ github:
     - hpanda-naut
     - denise-k
     - janetsc
+    - naut-thomas
     - tvm-bot  # For automated feedback in PR review.
 
   # See https://cwiki.apache.org/confluence/display/INFRA/Git+-+.asf.yaml+features#Git.asf.yamlfeatures-Branchprotection

From da76587505267a67f585c17f4b7afcc5d4f8bb92 Mon Sep 17 00:00:00 2001
From: Jianjian Guan <jacquesguan@me.com>
Date: Sat, 29 Oct 2022 13:59:04 +0800
Subject: [PATCH 466/704] [runtime] Fix typo in runtime/registry.h (#13224)

Fix typo in runtime/registry.h.
---
 include/tvm/runtime/registry.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h
index 7d781461c569..5a467c877930 100644
--- a/include/tvm/runtime/registry.h
+++ b/include/tvm/runtime/registry.h
@@ -292,8 +292,8 @@ class Registry {
   /*!
    * \brief Register a function with given name
    * \param name The name of the function.
-   * \param override Whether allow oveeride existing function.
-   * \return Reference to theregistry.
+   * \param override Whether allow override existing function.
+   * \return Reference to the registry.
    */
   TVM_DLL static Registry& Register(const std::string& name, bool override = false);  // NOLINT(*)
   /*!

From 9dd0225a70c4d5dabd8112a1b0262005a81916be Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Sat, 29 Oct 2022 00:59:37 -0500
Subject: [PATCH 467/704] [TIR] Use Optional<Stmt> for
 IfThenElseNode::else_case (#13218)

This parameter is nullable for cases where the else block isn't
present.  Previously, it was represented as a `Stmt` holding
`nullptr`, because
`IfThenElse` (https://github.com/apache/tvm/pull/3533) predates the
`Optional` utility (https://github.com/apache/tvm/pull/5314).  This
commit updates to use `Optional<Stmt>` instead, and updates all usages
of `else_case`.
---
 include/tvm/tir/stmt.h                           |  4 ++--
 src/arith/ir_mutator_with_analyzer.cc            | 12 +++++-------
 src/arith/ir_visitor_with_analyzer.cc            |  4 ++--
 src/contrib/hybrid/codegen_hybrid.cc             |  4 ++--
 src/printer/tir_text_printer.cc                  |  4 ++--
 src/printer/tvmscript_printer.cc                 |  4 ++--
 src/target/llvm/codegen_llvm.cc                  |  4 ++--
 src/target/source/codegen_c.cc                   |  4 ++--
 src/target/spirv/codegen_spirv.cc                |  4 ++--
 src/target/stackvm/codegen_stackvm.cc            |  4 ++--
 src/tir/analysis/block_access_region_detector.cc |  4 ++--
 src/tir/analysis/estimate_flops.cc               |  4 ++--
 src/tir/ir/stmt.cc                               |  4 ++--
 src/tir/ir/stmt_functor.cc                       | 10 +++++-----
 src/tir/transforms/common_subexpr_elim_tools.cc  |  6 +++---
 src/tir/transforms/compact_buffer_region.cc      |  4 ++--
 src/tir/transforms/coproc_sync.cc                |  4 ++--
 src/tir/transforms/inject_virtual_thread.cc      |  6 +++---
 src/tir/transforms/ir_utils.cc                   |  2 +-
 src/tir/transforms/lift_attr_scope.cc            |  4 ++--
 src/tir/transforms/profile_instrumentation.cc    |  4 ++--
 src/tir/transforms/remove_no_op.cc               |  4 ++--
 src/tir/transforms/simplify.cc                   |  4 ++--
 src/tir/transforms/storage_access.cc             |  4 ++--
 src/tir/transforms/vectorize_loop.cc             |  6 +++---
 25 files changed, 58 insertions(+), 60 deletions(-)

diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index e16d773f02b3..e0e191b282e5 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -824,7 +824,7 @@ class IfThenElseNode : public StmtNode {
   /*! \brief The branch to be executed when condition is true. */
   Stmt then_case;
   /*! \brief The branch to be executed when condition is false, can be null. */
-  Stmt else_case;
+  Optional<Stmt> else_case;
 
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("condition", &condition);
@@ -854,7 +854,7 @@ class IfThenElseNode : public StmtNode {
  */
 class IfThenElse : public Stmt {
  public:
-  TVM_DLL IfThenElse(PrimExpr condition, Stmt then_case, Stmt else_case = Stmt(),
+  TVM_DLL IfThenElse(PrimExpr condition, Stmt then_case, Optional<Stmt> else_case = NullOpt,
                      Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(IfThenElse, Stmt, IfThenElseNode);
diff --git a/src/arith/ir_mutator_with_analyzer.cc b/src/arith/ir_mutator_with_analyzer.cc
index 9cae3b7a6ac8..199f06191e4e 100644
--- a/src/arith/ir_mutator_with_analyzer.cc
+++ b/src/arith/ir_mutator_with_analyzer.cc
@@ -71,21 +71,19 @@ Stmt IRMutatorWithAnalyzer::VisitStmt_(const IfThenElseNode* op) {
     }
   }
 
-  Stmt then_case, else_case;
+  Stmt then_case;
+  Optional<Stmt> else_case;
   {
     With<ConstraintContext> ctx(analyzer_, real_condition);
     then_case = this->VisitStmt(op->then_case);
   }
-  if (op->else_case.defined()) {
+  if (op->else_case) {
     With<ConstraintContext> ctx(analyzer_, analyzer_->rewrite_simplify(Not(real_condition)));
-    else_case = this->VisitStmt(op->else_case);
+    else_case = this->VisitStmt(op->else_case.value());
   }
   if (is_one(real_condition)) return then_case;
   if (is_zero(real_condition)) {
-    if (else_case.defined()) {
-      return else_case;
-    }
-    return Evaluate(0);
+    return else_case.value_or(Evaluate(0));
   }
 
   if (condition.same_as(op->condition) && then_case.same_as(op->then_case) &&
diff --git a/src/arith/ir_visitor_with_analyzer.cc b/src/arith/ir_visitor_with_analyzer.cc
index 75ae22ef9915..e7cf3ea7eadd 100644
--- a/src/arith/ir_visitor_with_analyzer.cc
+++ b/src/arith/ir_visitor_with_analyzer.cc
@@ -58,9 +58,9 @@ void IRVisitorWithAnalyzer::VisitStmt_(const IfThenElseNode* op) {
     With<ConstraintContext> constraint(&analyzer_, real_condition);
     this->VisitStmt(op->then_case);
   }
-  if (op->else_case.defined()) {
+  if (op->else_case) {
     With<ConstraintContext> constraint(&analyzer_, analyzer_.rewrite_simplify(Not(real_condition)));
-    this->VisitStmt(op->else_case);
+    this->VisitStmt(op->else_case.value());
   }
 }
 
diff --git a/src/contrib/hybrid/codegen_hybrid.cc b/src/contrib/hybrid/codegen_hybrid.cc
index 79c9e567b459..687da61fa019 100644
--- a/src/contrib/hybrid/codegen_hybrid.cc
+++ b/src/contrib/hybrid/codegen_hybrid.cc
@@ -381,11 +381,11 @@ void CodeGenHybrid::VisitStmt_(const IfThenElseNode* op) {
   PrintStmt(op->then_case);
   indent_ -= tab_;
 
-  if (!is_noop(op->else_case)) {
+  if (op->else_case && !is_noop(op->else_case.value())) {
     PrintIndent();
     stream << "else:\n";
     indent_ += tab_;
-    PrintStmt(op->else_case);
+    PrintStmt(op->else_case.value());
     indent_ -= tab_;
   }
 }
diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc
index cdfc8fd318fd..e50559ac10ff 100644
--- a/src/printer/tir_text_printer.cc
+++ b/src/printer/tir_text_printer.cc
@@ -572,8 +572,8 @@ Doc TIRTextPrinter::VisitStmt_(const DeclBufferNode* op) {
 Doc TIRTextPrinter::VisitStmt_(const IfThenElseNode* op) {
   Doc doc;
   doc << "if " << Print(op->condition) << PrintBody(op->then_case);
-  if (!is_one(op->condition) && op->else_case.defined()) {
-    doc << " else" << PrintBody(op->else_case);
+  if (!is_one(op->condition) && op->else_case) {
+    doc << " else" << PrintBody(op->else_case.value());
   }
   return doc;
 }
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index 39eb245f3ad9..d8d5d89be0a4 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -1244,9 +1244,9 @@ Doc TVMScriptPrinter::VisitStmt_(const IfThenElseNode* op) {
   Doc doc;
   doc << "if " << Print(op->condition) << ":";
   doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->then_case));
-  if (!is_one(op->condition) && op->else_case.defined()) {
+  if (!is_one(op->condition) && op->else_case) {
     doc << Doc::NewLine();
-    doc << "else:" << Doc::Indent(4, Doc::NewLine() << PrintBody(op->else_case));
+    doc << "else:" << Doc::Indent(4, Doc::NewLine() << PrintBody(op->else_case.value()));
   }
   return doc;
 }
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index ea8a5ff5106a..87479ec74237 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -1759,14 +1759,14 @@ void CodeGenLLVM::VisitStmt_(const IfThenElseNode* op) {
   llvm::LLVMContext* ctx = llvm_target_->GetContext();
   auto* then_block = llvm::BasicBlock::Create(*ctx, "if_then", function_);
   auto* end_block = llvm::BasicBlock::Create(*ctx, "if_end", function_);
-  if (op->else_case.defined()) {
+  if (op->else_case) {
     auto* else_block = llvm::BasicBlock::Create(*ctx, "if_else", function_);
     builder_->CreateCondBr(cond, then_block, else_block);
     builder_->SetInsertPoint(then_block);
     this->VisitStmt(op->then_case);
     builder_->CreateBr(end_block);
     builder_->SetInsertPoint(else_block);
-    this->VisitStmt(op->else_case);
+    this->VisitStmt(op->else_case.value());
     builder_->CreateBr(end_block);
   } else {
     builder_->CreateCondBr(cond, then_block, end_block, md_very_likely_branch_);
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
index b69f76914495..66c92181c126 100644
--- a/src/target/source/codegen_c.cc
+++ b/src/target/source/codegen_c.cc
@@ -936,11 +936,11 @@ void CodeGenC::VisitStmt_(const IfThenElseNode* op) {
   PrintStmt(op->then_case);
   this->EndScope(then_scope);
 
-  if (op->else_case.defined()) {
+  if (op->else_case) {
     PrintIndent();
     stream << "} else {\n";
     int else_scope = BeginScope();
-    PrintStmt(op->else_case);
+    PrintStmt(op->else_case.value());
     this->EndScope(else_scope);
   }
   PrintIndent();
diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc
index 4f875e955576..c291a478dd3f 100644
--- a/src/target/spirv/codegen_spirv.cc
+++ b/src/target/spirv/codegen_spirv.cc
@@ -628,7 +628,7 @@ void CodeGenSPIRV::VisitStmt_(const IfThenElseNode* op) {
   spirv::Value cond = MakeValue(op->condition);
   spirv::Label then_label = builder_->NewLabel();
   spirv::Label merge_label = builder_->NewLabel();
-  if (op->else_case.defined()) {
+  if (op->else_case) {
     spirv::Label else_label = builder_->NewLabel();
     builder_->MakeInst(spv::OpSelectionMerge, merge_label, spv::SelectionControlMaskNone);
     builder_->MakeInst(spv::OpBranchConditional, cond, then_label, else_label);
@@ -638,7 +638,7 @@ void CodeGenSPIRV::VisitStmt_(const IfThenElseNode* op) {
     builder_->MakeInst(spv::OpBranch, merge_label);
     // else block
     builder_->StartLabel(else_label);
-    this->VisitStmt(op->else_case);
+    this->VisitStmt(op->else_case.value());
     builder_->MakeInst(spv::OpBranch, merge_label);
   } else {
     builder_->MakeInst(spv::OpSelectionMerge, merge_label, spv::SelectionControlMaskNone);
diff --git a/src/target/stackvm/codegen_stackvm.cc b/src/target/stackvm/codegen_stackvm.cc
index 80a5c4bfde6a..eac9ad849419 100644
--- a/src/target/stackvm/codegen_stackvm.cc
+++ b/src/target/stackvm/codegen_stackvm.cc
@@ -475,13 +475,13 @@ void CodeGenStackVM::VisitStmt_(const IfThenElseNode* op) {
   int64_t else_jump = this->PushOp(StackVM::RJUMP_IF_FALSE, 0);
   this->PushOp(StackVM::POP);
   this->Push(op->then_case);
-  if (op->else_case.defined()) {
+  if (op->else_case) {
     int64_t label_then_jump = this->GetPC();
     int64_t then_jump = this->PushOp(StackVM::RJUMP, 0);
     int64_t else_begin = this->GetPC();
     this->SetOperand(else_jump, else_begin - label_ejump);
     this->PushOp(StackVM::POP);
-    this->Push(op->else_case);
+    this->Push(op->else_case.value());
     int64_t if_end = this->GetPC();
     this->SetOperand(then_jump, if_end - label_then_jump);
   } else {
diff --git a/src/tir/analysis/block_access_region_detector.cc b/src/tir/analysis/block_access_region_detector.cc
index c65a422ed3d0..e9bff1b6fdee 100644
--- a/src/tir/analysis/block_access_region_detector.cc
+++ b/src/tir/analysis/block_access_region_detector.cc
@@ -173,10 +173,10 @@ void BlockReadWriteDetector::VisitStmt_(const IfThenElseNode* op) {
     With<ConditionalBoundsContext> ctx(op->condition, &dom_map_, &hint_map_, true);
     StmtExprVisitor::VisitStmt(op->then_case);
   }
-  if (op->else_case.defined()) {
+  if (op->else_case) {
     // Visit else branch
     With<ConditionalBoundsContext> ctx(op->condition, &dom_map_, &hint_map_, false);
-    StmtExprVisitor::VisitStmt(op->else_case);
+    StmtExprVisitor::VisitStmt(op->else_case.value());
   }
 }
 
diff --git a/src/tir/analysis/estimate_flops.cc b/src/tir/analysis/estimate_flops.cc
index d8faf9bd1362..d158a001b2d8 100644
--- a/src/tir/analysis/estimate_flops.cc
+++ b/src/tir/analysis/estimate_flops.cc
@@ -148,8 +148,8 @@ class FlopEstimator : private ExprFunctor<TResult(const PrimExpr& n)>,
 
   TResult VisitStmt_(const IfThenElseNode* branch) override {
     TResult cond = VisitExpr(branch->condition);
-    if (branch->else_case.defined()) {
-      cond += VisitStmt(branch->then_case).MaxWith(VisitStmt(branch->else_case));
+    if (branch->else_case) {
+      cond += VisitStmt(branch->then_case).MaxWith(VisitStmt(branch->else_case.value()));
     } else {
       cond += VisitStmt(branch->then_case);
     }
diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc
index 8f2a7b4ffe5b..a8d8936c905a 100644
--- a/src/tir/ir/stmt.cc
+++ b/src/tir/ir/stmt.cc
@@ -641,7 +641,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     });
 
 // IfThenElse
-IfThenElse::IfThenElse(PrimExpr condition, Stmt then_case, Stmt else_case, Span span) {
+IfThenElse::IfThenElse(PrimExpr condition, Stmt then_case, Optional<Stmt> else_case, Span span) {
   ICHECK(condition.defined());
   ICHECK(then_case.defined());
   // else_case may be null.
@@ -670,7 +670,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
         p->Print(op->then_case);
         p->indent -= 2;
 
-        if (!op->else_case.defined()) {
+        if (!op->else_case) {
           break;
         }
 
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index 59630d34c38e..e445432e5b6f 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -86,8 +86,8 @@ void StmtVisitor::VisitStmt_(const BufferRealizeNode* op) {
 void StmtVisitor::VisitStmt_(const IfThenElseNode* op) {
   this->VisitExpr(op->condition);
   this->VisitStmt(op->then_case);
-  if (op->else_case.defined()) {
-    this->VisitStmt(op->else_case);
+  if (op->else_case) {
+    this->VisitStmt(op->else_case.value());
   }
 }
 
@@ -352,9 +352,9 @@ Stmt StmtMutator::VisitStmt_(const DeclBufferNode* op) {
 Stmt StmtMutator::VisitStmt_(const IfThenElseNode* op) {
   PrimExpr condition = this->VisitExpr(op->condition);
   Stmt then_case = this->VisitStmt(op->then_case);
-  Stmt else_case;
-  if (op->else_case.defined()) {
-    else_case = this->VisitStmt(op->else_case);
+  Optional<Stmt> else_case = NullOpt;
+  if (op->else_case) {
+    else_case = this->VisitStmt(op->else_case.value());
   }
   if (condition.same_as(op->condition) && then_case.same_as(op->then_case) &&
       else_case.same_as(op->else_case)) {
diff --git a/src/tir/transforms/common_subexpr_elim_tools.cc b/src/tir/transforms/common_subexpr_elim_tools.cc
index 39d7a750a99c..130004c51cd8 100644
--- a/src/tir/transforms/common_subexpr_elim_tools.cc
+++ b/src/tir/transforms/common_subexpr_elim_tools.cc
@@ -434,9 +434,9 @@ void ComputationsDoneBy::VisitStmt_(const IfThenElseNode* op) {
   table_of_computations_.clear();
 
   ComputationTable computations_done_by_else;
-  if (op->else_case.defined()) {
-    // And finally calls the VisitStmt() method on the `then_case` child
-    VisitStmt(op->else_case);
+  if (op->else_case) {
+    // And finally calls the VisitStmt() method on the `else_case` child
+    VisitStmt(op->else_case.value());
     computations_done_by_else = table_of_computations_;
     table_of_computations_.clear();
   }
diff --git a/src/tir/transforms/compact_buffer_region.cc b/src/tir/transforms/compact_buffer_region.cc
index 249b8cca77b0..b517150ce9f4 100644
--- a/src/tir/transforms/compact_buffer_region.cc
+++ b/src/tir/transforms/compact_buffer_region.cc
@@ -184,10 +184,10 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
       With<ConditionalBoundsContext> ctx(op->condition, &dom_map_, &hint_map_, true);
       StmtExprVisitor::VisitStmt(op->then_case);
     }
-    if (op->else_case.defined()) {
+    if (op->else_case) {
       // Visit else branch
       With<ConditionalBoundsContext> ctx(op->condition, &dom_map_, &hint_map_, false);
-      StmtExprVisitor::VisitStmt(op->else_case);
+      StmtExprVisitor::VisitStmt(op->else_case.value());
     }
   }
 
diff --git a/src/tir/transforms/coproc_sync.cc b/src/tir/transforms/coproc_sync.cc
index 1b1cabeadb71..69913f4bd604 100644
--- a/src/tir/transforms/coproc_sync.cc
+++ b/src/tir/transforms/coproc_sync.cc
@@ -417,8 +417,8 @@ class CoProcInstDepDetector : public StmtVisitor {
       first_state_.clear();
       last_state_.clear();
     }
-    if (op->else_case.defined()) {
-      this->VisitStmt(op->else_case);
+    if (op->else_case) {
+      this->VisitStmt(op->else_case.value());
       if (last_state_.node != nullptr) {
         curr_state.node = op;
         MatchFixEnterPop(first_state_);
diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc
index f49b6b2ace8e..a1ebdcef9855 100644
--- a/src/tir/transforms/inject_virtual_thread.cc
+++ b/src/tir/transforms/inject_virtual_thread.cc
@@ -360,11 +360,11 @@ class VTInjector : public arith::IRMutatorWithAnalyzer {
     visit_touched_var_ = false;
     ICHECK_EQ(max_loop_depth_, 0);
     Stmt then_case = this->VisitStmt(op->then_case);
-    Stmt else_case;
-    if (op->else_case.defined()) {
+    Optional<Stmt> else_case = NullOpt;
+    if (op->else_case) {
       int temp = max_loop_depth_;
       max_loop_depth_ = 0;
-      else_case = this->VisitStmt(op->else_case);
+      else_case = this->VisitStmt(op->else_case.value());
       max_loop_depth_ = std::max(temp, max_loop_depth_);
     }
     if (condition.same_as(op->condition) && then_case.same_as(op->then_case) &&
diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc
index b7e3e01f7506..6893aecc4d00 100644
--- a/src/tir/transforms/ir_utils.cc
+++ b/src/tir/transforms/ir_utils.cc
@@ -56,7 +56,7 @@ Stmt MergeNest(const std::vector<Stmt>& nest, Stmt body) {
     } else if (const auto* ite = s.as<IfThenElseNode>()) {
       auto n = make_object<IfThenElseNode>(*ite);
       ICHECK(is_no_op(n->then_case));
-      ICHECK(!n->else_case.defined());
+      ICHECK(!n->else_case);
       n->then_case = body;
       body = Stmt(n);
     } else if (const auto* seq = s.as<SeqStmtNode>()) {
diff --git a/src/tir/transforms/lift_attr_scope.cc b/src/tir/transforms/lift_attr_scope.cc
index 40d152b3b3b6..272e16d40d97 100644
--- a/src/tir/transforms/lift_attr_scope.cc
+++ b/src/tir/transforms/lift_attr_scope.cc
@@ -122,7 +122,7 @@ class AttrScopeLifter : public StmtMutator {
   }
 
   Stmt VisitStmt_(const IfThenElseNode* op) final {
-    if (!op->else_case.defined()) {
+    if (!op->else_case) {
       return StmtMutator::VisitStmt_(op);
     }
     Stmt then_case = this->VisitStmt(op->then_case);
@@ -130,7 +130,7 @@ class AttrScopeLifter : public StmtMutator {
     PrimExpr first_value;
     std::swap(first_node, attr_node_);
     std::swap(first_value, attr_value_);
-    Stmt else_case = this->VisitStmt(op->else_case);
+    Stmt else_case = this->VisitStmt(op->else_case.value());
     if (attr_node_.defined() && attr_value_.defined() && first_node.defined() &&
         first_value.defined() && attr_node_.same_as(first_node) &&
         ValueSame(attr_value_, first_value)) {
diff --git a/src/tir/transforms/profile_instrumentation.cc b/src/tir/transforms/profile_instrumentation.cc
index 5f52fc6630bc..68d5b0a204d5 100644
--- a/src/tir/transforms/profile_instrumentation.cc
+++ b/src/tir/transforms/profile_instrumentation.cc
@@ -110,8 +110,8 @@ class LoopAnalyzer : public StmtExprVisitor {
     } else if (stmt->IsInstance<IfThenElseNode>()) {
       const IfThenElseNode* n = stmt.as<IfThenElseNode>();
       unsigned height = TraverseLoop(n->then_case, parent_depth, has_parallel);
-      if (n->else_case.defined()) {
-        height = std::max(height, TraverseLoop(n->else_case, parent_depth, has_parallel));
+      if (n->else_case) {
+        height = std::max(height, TraverseLoop(n->else_case.value(), parent_depth, has_parallel));
       }
       return height;
     } else if (stmt->IsInstance<ForNode>()) {
diff --git a/src/tir/transforms/remove_no_op.cc b/src/tir/transforms/remove_no_op.cc
index 8728817aad57..41250408a7f2 100644
--- a/src/tir/transforms/remove_no_op.cc
+++ b/src/tir/transforms/remove_no_op.cc
@@ -69,8 +69,8 @@ class NoOpRemover : public StmtMutator {
   Stmt VisitStmt_(const IfThenElseNode* op) final {
     Stmt stmt = StmtMutator::VisitStmt_(op);
     op = stmt.as<IfThenElseNode>();
-    if (op->else_case.defined()) {
-      if (is_no_op(op->else_case)) {
+    if (op->else_case) {
+      if (is_no_op(op->else_case.value())) {
         if (is_no_op(op->then_case)) {
           return MakeEvaluate(op->condition);
         } else {
diff --git a/src/tir/transforms/simplify.cc b/src/tir/transforms/simplify.cc
index b6e3581aa614..1dbf9e688027 100644
--- a/src/tir/transforms/simplify.cc
+++ b/src/tir/transforms/simplify.cc
@@ -139,8 +139,8 @@ class StmtSimplifier : public IRMutatorWithAnalyzer {
     if (const int64_t* as_int = as_const_int(cond)) {
       if (*as_int) {
         return this->VisitStmt(op->then_case);
-      } else if (op->else_case.defined()) {
-        return this->VisitStmt(op->else_case);
+      } else if (op->else_case) {
+        return this->VisitStmt(op->else_case.value());
       } else {
         return Evaluate(0);
       }
diff --git a/src/tir/transforms/storage_access.cc b/src/tir/transforms/storage_access.cc
index 4f19f708880c..8729ab1ed296 100644
--- a/src/tir/transforms/storage_access.cc
+++ b/src/tir/transforms/storage_access.cc
@@ -187,9 +187,9 @@ void StorageAccessVisitor::VisitStmt_(const IfThenElseNode* op) {
   s.stmt = op;
   s.access = Summarize(std::move(scope_.back()), nullptr);
   scope_.pop_back();
-  if (op->else_case.defined()) {
+  if (op->else_case) {
     scope_.push_back(std::vector<StmtEntry>());
-    this->VisitStmt(op->else_case);
+    this->VisitStmt(op->else_case.value());
     auto v = Summarize(std::move(scope_.back()), nullptr);
     scope_.pop_back();
     s.access.insert(s.access.end(), v.begin(), v.end());
diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc
index 3cc17847e69b..8efed83ccdf1 100644
--- a/src/tir/transforms/vectorize_loop.cc
+++ b/src/tir/transforms/vectorize_loop.cc
@@ -490,9 +490,9 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       return Scalarize(GetRef<Stmt>(op));
     }
     Stmt then_case = this->VisitStmt(op->then_case);
-    Stmt else_case;
-    if (op->else_case.defined()) {
-      else_case = this->VisitStmt(op->else_case);
+    Optional<Stmt> else_case = NullOpt;
+    if (op->else_case) {
+      else_case = this->VisitStmt(op->else_case.value());
     }
     if (condition.same_as(op->condition) && then_case.same_as(op->then_case) &&
         else_case.same_as(op->else_case)) {

From f07f22fac98908a1e024438617e56c61f2a26158 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Fri, 28 Oct 2022 23:00:30 -0700
Subject: [PATCH 468/704] [MetaSchedule][Minor] Fix Memory Database Module
 Equality (#13198)

Previously the memory database assumed the workload tokens are from `CommitWorkload` function, this PR allows the use of module equality function in `GetTopK` function to compare workloads.
---
 src/meta_schedule/database/json_database.cc   | 3 ++-
 src/meta_schedule/database/memory_database.cc | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc
index aaa67600fc96..bd5183f0cf60 100644
--- a/src/meta_schedule/database/json_database.cc
+++ b/src/meta_schedule/database/json_database.cc
@@ -128,7 +128,8 @@ class JSONDatabaseNode : public DatabaseNode {
     results.reserve(top_k);
     int counter = 0;
     for (const TuningRecord& record : this->tuning_records_) {
-      if (WorkloadEqual(GetModuleEquality())(record->workload, workload)) {
+      if (record->workload.same_as(workload) ||
+          WorkloadEqual(GetModuleEquality())(record->workload, workload)) {
         results.push_back(record);
         if (++counter == top_k) {
           break;
diff --git a/src/meta_schedule/database/memory_database.cc b/src/meta_schedule/database/memory_database.cc
index 47f6a473d1e4..24fba6dfa105 100644
--- a/src/meta_schedule/database/memory_database.cc
+++ b/src/meta_schedule/database/memory_database.cc
@@ -71,7 +71,8 @@ class MemoryDatabaseNode : public DatabaseNode {
       if (run_secs.empty()) {
         continue;
       }
-      if (record->workload.same_as(workload)) {
+      if (record->workload.same_as(workload) ||
+          WorkloadEqual(GetModuleEquality())(record->workload, workload)) {
         double sum = 0.0;
         for (const FloatImm& i : run_secs) {
           sum += i->value;

From e971956d760aa92cbae1fa4030f46c1046eaa87a Mon Sep 17 00:00:00 2001
From: Youlei Yang <youlei.yang@intel.com>
Date: Sat, 29 Oct 2022 14:05:34 +0800
Subject: [PATCH 469/704] [Relay][transform][SimplifyExpr] simplify adjacent
 muls and adds with constants (#13213)

* simplify adjacent muls and adds with constants

* apply FoldConstant inside SimplifyExpr
---
 src/relay/transforms/simplify_expr.cc         |  85 +++++++++-----
 tests/python/relay/test_pass_simplify_expr.py | 109 +++++++++++-------
 2 files changed, 128 insertions(+), 66 deletions(-)

diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index cf594a09a266..0dfb45577280 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -672,47 +672,79 @@ class EliminateIdentityRewrite : public DFPatternRewrite {
   DFPattern const_;
 };
 
-/*! \brief Make two consecutive add able to be constant_folded.
- * This pattern matching supports commutative property for addition.
+/*! \brief Switch adjacent add-mul with constants to mul-add.
+ * As mul-add pattern is more friendly to FoldScaleAxis.
  */
-class SimplifyConsecutiveAdd : public DFPatternRewrite {
+class SwitchAddMultiply : public DFPatternRewrite {
  public:
-  SimplifyConsecutiveAdd() {
+  SwitchAddMultiply() {
     x_ = IsWildcard();
-    const1_ = IsConstant();
-    const2_ = IsConstant();
-    DFPattern add_op = IsOp("add");
-    pattern_ = add_op({add_op({x_, const1_}), const2_});
+    c1_ = IsConstant();
+    c2_ = IsConstant();
+    pattern_ = (x_ + c1_) * c2_;
   }
 
   Expr Callback(const Expr& pre, const Expr& post,
                 const Map<DFPattern, Array<Expr>>& node_map) const override {
-    const CallNode* call = pre.as<CallNode>();
     auto x = node_map[x_][0];
-    auto c1 = node_map[const1_][0];
-    auto c2 = node_map[const2_][0];
+    auto c1 = node_map[c1_][0];
+    auto c2 = node_map[c2_][0];
 
-    auto pre_call = call;
-    // Find the next add call.
-    if (pre_call->args[1].as<ConstantNode>()) {
-      pre_call = pre_call->args[0].as<CallNode>();
-    } else {
-      pre_call = pre_call->args[1].as<CallNode>();
+    if (x.as<ConstantNode>()) {
+      return post;
     }
-    // Do nothing if both inputs are not constants as they will be constant folded already.
-    if (pre_call->args[0].as<ConstantNode>() && pre_call->args[1].as<ConstantNode>()) {
+
+    Expr const_expr = Call(Op::Get("multiply"), {c1, c2});
+    IRModule const_mod = IRModule::FromExpr(const_expr);
+    const_mod = transform::FoldConstant()(const_mod);
+    GlobalVar const_main = const_mod->GetGlobalVar("main");
+    Expr const_val = Downcast<Function>(const_mod->functions[const_main])->body;
+
+    return Call(Op::Get("add"), {Call(Op::Get("multiply"), {x, c2}), const_val});
+  }
+
+ private:
+  DFPattern x_;
+  DFPattern c1_;
+  DFPattern c2_;
+};
+
+/*! \brief Simplify two adjacent multiply or add with constants for further constant folding.
+ * The pattern matching supports commutative property.
+ */
+class SimplifyAdjacentMultiplyOrAdd : public DFPatternRewrite {
+ public:
+  SimplifyAdjacentMultiplyOrAdd() {
+    x_ = IsWildcard();
+    c1_ = IsConstant();
+    c2_ = IsConstant();
+    pattern_ = (x_ * c1_ * c2_) || (x_ + c1_ + c2_);
+  }
+
+  Expr Callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
+    const CallNode* call = pre.as<CallNode>();
+    auto x = node_map[x_][0];
+    auto c1 = node_map[c1_][0];
+    auto c2 = node_map[c2_][0];
+
+    if (x.as<ConstantNode>()) {
       return post;
-    } else {
-      auto add_res = Call(call->op, {c1, c2});
-      return Call(call->op, {x, add_res});
     }
-    return post;
+
+    Expr const_expr = Call(call->op, {c1, c2});
+    IRModule const_mod = IRModule::FromExpr(const_expr);
+    const_mod = transform::FoldConstant()(const_mod);
+    GlobalVar const_main = const_mod->GetGlobalVar("main");
+    Expr const_val = Downcast<Function>(const_mod->functions[const_main])->body;
+
+    return Call(call->op, {x, const_val});
   }
 
  private:
   DFPattern x_;
-  DFPattern const1_;
-  DFPattern const2_;
+  DFPattern c1_;
+  DFPattern c2_;
 };
 
 /*! \brief Simplifying x/sqrt to x*sqrt */
@@ -800,7 +832,8 @@ Expr SimplifyExpr(const Expr& expr, const IRModule& mod) {
   composer.AddRewrite<SimplifySameCast>();
   composer.AddRewrite<SimplifyConsecutiveCast>();
   composer.AddRewrite<FullElementwise>();
-  composer.AddRewrite<SimplifyConsecutiveAdd>();
+  composer.AddRewrite<SwitchAddMultiply>();
+  composer.AddRewrite<SimplifyAdjacentMultiplyOrAdd>();
   composer.AddRewrite<SimplifyDQArgMax>();
   composer.AddRewrite<SimplifyDQArgMin>();
   composer.AddRewrite<SimplifyDQArgSort>();
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index e84d238aaa75..8d5ea28ade61 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -560,46 +560,75 @@ def test_concretize_multiple():
     assert tvm.ir.structural_equal(actual, expected)
 
 
-def test_simplify_consecutive_add():
-    shape = (32, 1, 1)
-    c_data = np.empty(shape).astype("float32")
-    c1 = relay.const(c_data)
-    c2 = relay.const(c_data)
-
-    def before_const_right():
-        x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
-        w = relay.var("w", shape=(32, 16, 3, 3), dtype="float32")
-        y = relay.nn.conv2d(x, w, padding=(1, 1))
-        y = relay.add(y, c1)
-        y = relay.add(y, c2)
-        y = relay.nn.relu(y)
-        return relay.Function([x, w], y)
-
-    def before_const_left():
-        x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
-        w = relay.var("w", shape=(32, 16, 3, 3), dtype="float32")
-        y = relay.nn.conv2d(x, w, padding=(1, 1))
-        y = relay.add(c1, y)
-        y = relay.add(c2, y)
-        y = relay.nn.relu(y)
-        return relay.Function([x, w], y)
-
-    def expected():
-        x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
-        w = relay.var("w", shape=(32, 16, 3, 3), dtype="float32")
-        y = relay.nn.conv2d(x, w, padding=(1, 1))
-        c3 = relay.add(c1, c2)
-        y = relay.add(y, c3)
-        y = relay.nn.relu(y)
-        return relay.Function([x, w], y)
-
-    zr = before_const_right()
-    zl = before_const_left()
-    zzr = run_opt_pass(zr, transform.SimplifyExpr())
-    zzl = run_opt_pass(zl, transform.SimplifyExpr())
-    after = run_opt_pass(expected(), transform.InferType())
-    assert tvm.ir.structural_equal(zzr, after)
-    assert tvm.ir.structural_equal(zzl, after)
+def test_simplify_mul_add():
+    def check_simple_fold(origin_exprs, expect_expr):
+        for origin_expr in origin_exprs:
+            simple_expr = run_opt_pass(origin_expr, transform.SimplifyExpr())
+            assert tvm.ir.structural_equal(simple_expr, expect_expr)
+
+    n = 32
+    c1_val = np.random.uniform(size=n).astype("float32")
+    c2_val = np.random.uniform(size=n).astype("float32")
+    c3_val = np.random.uniform(size=n).astype("float32")
+
+    x = relay.var("x", shape=(n,), dtype="float32")
+    c1 = relay.const(c1_val)
+    c2 = relay.const(c2_val)
+    c3 = relay.const(c3_val)
+
+    # add-add -> add
+    origin_exprs = [
+        x + c1 + c2,
+        c1 + x + c2,
+    ]
+    expect_expr = x + relay.const(c1_val + c2_val)
+    check_simple_fold(origin_exprs, expect_expr)
+
+    # mul-mul -> mul
+    origin_exprs = [
+        x * c1 * c2,
+        c1 * x * c2,
+    ]
+    expect_expr = x * relay.const(c1_val * c2_val)
+    check_simple_fold(origin_exprs, expect_expr)
+
+    # add-mul -> mul-add
+    origin_exprs = [
+        (x + c1) * c2,
+        (c1 + x) * c2,
+        c2 * (x + c1),
+        c2 * (c1 + x),
+    ]
+    expect_expr = x * c2 + relay.const(c1_val * c2_val)
+    check_simple_fold(origin_exprs, expect_expr)
+
+    # add-mul-add -> mul-add
+    origin_exprs = [
+        (x + c1) * c2 + c3,
+        (c1 + x) * c2 + c3,
+        c2 * (x + c1) + c3,
+        c2 * (c1 + x) + c3,
+        c3 + (x + c1) * c2,
+        c3 + (c1 + x) * c2,
+        c3 + c2 * (x + c1),
+        c3 + c2 * (c1 + x),
+    ]
+    expect_expr = x * c2 + relay.const(c1_val * c2_val + c3_val)
+    check_simple_fold(origin_exprs, expect_expr)
+
+    # mul-add-mul -> mul-add
+    origin_exprs = [
+        (x * c1 + c2) * c3,
+        (c1 * x + c2) * c3,
+        (c2 + x * c1) * c3,
+        (c2 + c1 * x) * c3,
+        c3 * (x * c1 + c2),
+        c3 * (c1 * x + c2),
+        c3 * (c2 + x * c1),
+        c3 * (c2 + c1 * x),
+    ]
+    expect_expr = x * relay.const(c1_val * c3_val) + relay.const(c2_val * c3_val)
+    check_simple_fold(origin_exprs, expect_expr)
 
 
 def test_simplify_rsqrt():

From 25a0d47d2b55f3404ea711a3ff28bf22f7cc0e17 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Sat, 29 Oct 2022 12:55:55 -0500
Subject: [PATCH 470/704] [Arith][TIR] Check for constant offsets of known
 literal constraints (#13023)

Previously, the checks for a literal constraint would find exact
matches for an inequality, but any alterations to the conditional
would break this exact matching.  This commit introduces checks for
constant offsets relative to a known value.  These checks are not
always expressible using the existing `ConstIntSetAnalyzer`, which
represents allowed values using a single contiguous
region.  (e.g. `i!=5` is not representable, because it requires a
region for `i<5` and another for `i>5`.)

This implementation reuses the internal representation for
inequalities introduced in https://github.com/apache/tvm/pull/12863,
along with much of its implementation.  However, the indirect
comparisons (e.g. using `a < b` and `b < c` to prove that `a < c`)
introduced in that PR still require an explicit flag to be used.
---
 include/tvm/arith/analyzer.h                  |  11 +-
 src/arith/rewrite_simplify.cc                 |   7 +-
 src/arith/transitive_comparison_analyzer.cc   | 168 +++++++++++++-----
 .../unittest/test_tir_transform_simplify.py   |  14 ++
 4 files changed, 155 insertions(+), 45 deletions(-)

diff --git a/include/tvm/arith/analyzer.h b/include/tvm/arith/analyzer.h
index e2d60684da7b..885c23f49186 100644
--- a/include/tvm/arith/analyzer.h
+++ b/include/tvm/arith/analyzer.h
@@ -409,10 +409,19 @@ class TransitiveComparisonAnalyzer {
    *
    * \param rhs The right-hand side of the comparison
    *
+   * \param propagate_inequalities If true, attempt to find a sequence
+   * of transitive inequalities that allow the lhs and rhs to be
+   * compared.  If false, only use the known comparison that have been
+   * directly provided.  Using `propagate_inequalities = false` is
+   * roughly equivalent to comparing against all known inequality
+   * expressions using `ExprDeepEqual`, but also allows for constant
+   * offsets on either side of the inequality.
+   *
    * \return The most specific result that can be proven about the
    * comparison.  If nothing can be proven, returns kUnknown.
    */
-  TVM_DLL CompareResult TryCompare(const PrimExpr& lhs, const PrimExpr& rhs);
+  TVM_DLL CompareResult TryCompare(const PrimExpr& lhs, const PrimExpr& rhs,
+                                   bool propagate_inequalities = true);
 
   /*! \brief Bind a variable as being equal to a known expression
    *
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index 6cc2aa9e4591..a42303e459d8 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -118,9 +118,7 @@ CompareResult RewriteSimplifier::Impl::TryCompare(const PrimExpr& x, const PrimE
 
   if (is_finished()) return output;
 
-  if (enabled_extensions_ & kTransitivelyProveInequalities) {
-    output = CompareResult(output & TryCompareUsingKnownInequalities(x, y));
-  }
+  output = CompareResult(output & TryCompareUsingKnownInequalities(x, y));
 
   return output;
 }
@@ -132,7 +130,8 @@ CompareResult RewriteSimplifier::Impl::TryCompareUsingConstIntBounds(const PrimE
 
 CompareResult RewriteSimplifier::Impl::TryCompareUsingKnownInequalities(const PrimExpr& x,
                                                                         const PrimExpr& y) {
-  return analyzer_->transitive_comparisons.TryCompare(x, y);
+  bool propagate_inequalities = enabled_extensions_ & kTransitivelyProveInequalities;
+  return analyzer_->transitive_comparisons.TryCompare(x, y, propagate_inequalities);
 }
 
 // try to prove x equals val
diff --git a/src/arith/transitive_comparison_analyzer.cc b/src/arith/transitive_comparison_analyzer.cc
index 9a835f7fdec8..b71096a479b5 100644
--- a/src/arith/transitive_comparison_analyzer.cc
+++ b/src/arith/transitive_comparison_analyzer.cc
@@ -43,10 +43,19 @@ class TransitiveComparisonAnalyzer::Impl {
    *
    * \param rhs The right-hand side of the comparison
    *
+   * \param propagate_inequalities If true, attempt to find a sequence
+   * of transitive inequalities that allow the lhs and rhs to be
+   * compared.  If false, only use the known comparison that have been
+   * directly provided.  Using `propagate_inequalities = false` is
+   * roughly equivalent to comparing against all known values with
+   * `ExprDeepEqual`, but also allowing for constant offsets on either
+   * side of the inequality.
+   *
    * \return The most specific result that can be proven about the
    * comparison.  If nothing can be proven, returns kUnknown.
    */
-  CompareResult TryCompare(const PrimExpr& lhs, const PrimExpr& rhs) const;
+  CompareResult TryCompare(const PrimExpr& lhs, const PrimExpr& rhs,
+                           bool propagate_inequalities = true) const;
 
   /*! \brief Bind a variable as being equal to a known expression
    *
@@ -192,7 +201,37 @@ class TransitiveComparisonAnalyzer::Impl {
    */
   void AddKnown(const PrimExpr& expr, std::vector<Comparison>* vec);
 
-  /*! \brief Attempt to compare the expressions, starting at the lhs.
+  /*! Collect known comparisons between LHS and RHS, without propagation
+   *
+   * Allows the internal representation to handle any constant
+   * offsets, without searching for a sequence of inequalities.
+   *
+   * \param lhs_key The left-hand side of the comparison
+   *
+   * \param rhs_key The right-hand side of the comparison
+   *
+   * \returns A subset of `knowns_` and `scoped_knowns_`, filtered to
+   * only include comparisons between `lhs_key` and `rhs_key`,
+   * normalized such that `lhs_key` is on the left-hand side.
+   */
+  std::vector<Comparison> CollectDirectComparisons(Key lhs_key, Key rhs_key) const;
+
+  /*! Collect known comparisons between LHS and RHS, with propagation
+   *
+   * \param lhs_key The left-hand side of the comparison
+   *
+   * \param rhs_key The right-hand side of the comparison
+   *
+   * \returns All comparisons between `lhs_key` and `rhs_key`,
+   * including the explicitly-provided comparisons in `knowns_` and
+   * `scoped_knowns_`, and comparisons provable through a series of
+   * comparisons through other values.  All comparisons returned are
+   * between `lhs_key` and `rhs_key`, and are normalized such that
+   * `lhs_key` is on the left-hand side.
+   */
+  std::vector<Comparison> CollectIndirectComparisons(Key lhs_key, Key rhs_key) const;
+
+  /*! \brief Internal function used by CollectIndirectComparisons
    *
    * Perform a depth-first search through the space of known
    * expressions, starting at the LHS of a comparison.  In this
@@ -208,14 +247,29 @@ class TransitiveComparisonAnalyzer::Impl {
    * expression D, then combine the comparisons that compose the path
    * into the expression A<=D-4.
    *
-   * \param lhs The left-hand side of the comparison
+   * \param lhs_key The left-hand side of the comparison
    *
-   * \param rhs The right-hand side of the comparison
+   * \param rhs_key The right-hand side of the comparison
+   *
+   * \returns A vector of comparisons between the two expressions.
+   */
+  std::vector<Comparison> DFSFromLHS(Key lhs_key, Key rhs_key) const;
+
+  /*! \brief Combine a set of comparisons that share a LHS and RHS
+   *
+   * \param lhs_to_rhs The comparisons to merge.  These should all
+   * have the same LHS and RHS.  This parameter will typically be the
+   * result from `CollectDirectComparisons` or
+   * `CollectIndirectComparisons`.
    *
-   * \return The result of the comparison
+   * \param offset The constant offset in the comparison being proven.
+   * This is extracted from any additive/subtractive constants in the
+   * `PrimExpr` arguments to `TryCompare`.
+   *
+   * \returns The possible comparisons between LHS and RHS provided
+   * inequalities.
    */
-  CompareResult DFSFromLHS(Key lhs_key, Key rhs_key, int64_t offset, const PrimExpr& lhs,
-                           const PrimExpr& rhs) const;
+  CompareResult MergeComparisons(const std::vector<Comparison>& lhs_to_rhs, int64_t offset) const;
 
   /*! \brief Previous Range bindings
    *
@@ -475,8 +529,9 @@ bool TransitiveComparisonAnalyzer::Impl::Comparison::Implies(
 TransitiveComparisonAnalyzer::TransitiveComparisonAnalyzer() : impl_(std::make_unique<Impl>()) {}
 TransitiveComparisonAnalyzer::~TransitiveComparisonAnalyzer() {}
 
-CompareResult TransitiveComparisonAnalyzer::TryCompare(const PrimExpr& lhs, const PrimExpr& rhs) {
-  return impl_->TryCompare(lhs, rhs);
+CompareResult TransitiveComparisonAnalyzer::TryCompare(const PrimExpr& lhs, const PrimExpr& rhs,
+                                                       bool propagate_inequalities) {
+  return impl_->TryCompare(lhs, rhs, propagate_inequalities);
 }
 
 void TransitiveComparisonAnalyzer::Bind(const Var& var, const PrimExpr& expr, bool allow_override) {
@@ -547,7 +602,8 @@ std::function<void()> TransitiveComparisonAnalyzer::Impl::EnterConstraint(const
 }
 
 CompareResult TransitiveComparisonAnalyzer::Impl::TryCompare(const PrimExpr& lhs_expr,
-                                                             const PrimExpr& rhs_expr) const {
+                                                             const PrimExpr& rhs_expr,
+                                                             bool propagate_inequalities) const {
   // Currently only supports integer checks
   if (!lhs_expr.dtype().is_int() || !rhs_expr.dtype().is_int()) {
     return CompareResult::kUnknown;
@@ -575,29 +631,59 @@ CompareResult TransitiveComparisonAnalyzer::Impl::TryCompare(const PrimExpr& lhs
     return CompareResult::kUnknown;
   }
 
-  auto from_lhs = DFSFromLHS(lhs_key.value(), rhs_key.value(), offset, lhs, rhs);
-  auto from_rhs = Reverse(DFSFromLHS(rhs_key.value(), lhs_key.value(), -offset, rhs, lhs));
-  auto output = from_lhs & from_rhs;
+  auto lhs_to_rhs = [&]() {
+    if (propagate_inequalities) {
+      return CollectIndirectComparisons(lhs_key.value(), rhs_key.value());
+    } else {
+      return CollectDirectComparisons(lhs_key.value(), rhs_key.value());
+    }
+  }();
+  return MergeComparisons(lhs_to_rhs, offset);
+}
+
+std::vector<TransitiveComparisonAnalyzer::Impl::Comparison>
+TransitiveComparisonAnalyzer::Impl::CollectDirectComparisons(Key lhs_key, Key rhs_key) const {
+  std::vector<Comparison> output;
+
+  auto append_known = [&](Comparison cmp) {
+    if (auto normalized = cmp.WithLHS(lhs_key)) {
+      if (normalized.value().rhs_ == rhs_key) {
+        output.push_back(normalized.value());
+      }
+    }
+  };
+
+  for (const auto& known : knowns_) {
+    append_known(known);
+  }
+  for (const auto& known : scoped_knowns_) {
+    append_known(known);
+  }
 
   return output;
 }
 
-CompareResult TransitiveComparisonAnalyzer::Impl::DFSFromLHS(Key lhs_key_input, Key rhs_key_input,
-                                                             int64_t offset_input,
-                                                             const PrimExpr& lhs_input,
-                                                             const PrimExpr& rhs_input) const {
-  Key lhs_key = lhs_key_input;
-  Key rhs_key = rhs_key_input;
-  int64_t offset = offset_input;
+std::vector<TransitiveComparisonAnalyzer::Impl::Comparison>
+TransitiveComparisonAnalyzer::Impl::CollectIndirectComparisons(Key lhs_key, Key rhs_key) const {
+  auto output = DFSFromLHS(lhs_key, rhs_key);
+  for (Comparison cmp : DFSFromLHS(rhs_key, lhs_key)) {
+    auto opt_normalized = cmp.WithLHS(lhs_key);
+    ICHECK(opt_normalized.has_value());
+    output.push_back(opt_normalized.value());
+  }
+  return output;
+}
 
+std::vector<TransitiveComparisonAnalyzer::Impl::Comparison>
+TransitiveComparisonAnalyzer::Impl::DFSFromLHS(Key lhs_key, Key rhs_key) const {
   // Everything in `to_visit` has lhs as its lhs.
   std::unordered_set<Key> seen;
   std::unordered_set<Key> to_visit;
-  std::unordered_map<Key, std::vector<Comparison>> compared_to_x;
+  std::unordered_map<Key, std::vector<Comparison>> compared_to_lhs;
 
   // Utility function to add a new known statement
   auto declare_known = [&](Comparison cmp) {
-    std::vector<Comparison>& knowns = compared_to_x[cmp.rhs_];
+    std::vector<Comparison>& knowns = compared_to_lhs[cmp.rhs_];
 
     // The comparison adds no new information, no modification
     // required.
@@ -646,8 +732,8 @@ CompareResult TransitiveComparisonAnalyzer::Impl::DFSFromLHS(Key lhs_key_input,
     Key middle_key = *to_visit.begin();
     to_visit.erase(to_visit.begin());
 
-    std::vector<Comparison>& prev_knowns_using_middle = compared_to_x.at(middle_key);
-    ICHECK(compared_to_x.count(middle_key));
+    std::vector<Comparison>& prev_knowns_using_middle = compared_to_lhs.at(middle_key);
+    ICHECK(compared_to_lhs.count(middle_key));
 
     std::vector<Comparison> new_knowns_using_lhs;
 
@@ -721,27 +807,29 @@ CompareResult TransitiveComparisonAnalyzer::Impl::DFSFromLHS(Key lhs_key_input,
     }
   }
 
-  // It's possible that we don't have any transitive comparisons that
-  // can prove something about LHS and RHS.
-  auto it = compared_to_x.find(rhs_key);
-  if (it == compared_to_x.end()) {
-    return CompareResult::kUnknown;
+  if (auto it = compared_to_lhs.find(rhs_key); it != compared_to_lhs.end()) {
+    return it->second;
+  } else {
+    // There are known comparisons involving the LHS and the RHS, but
+    // no path that connects the two expressions.
+    return {};
   }
+}
 
-  const std::vector<Comparison>& known_between_lhs_and_rhs = it->second;
-
+CompareResult TransitiveComparisonAnalyzer::Impl::MergeComparisons(
+    const std::vector<Comparison>& lhs_to_rhs, int64_t offset) const {
   // Just because we found a comparison involving LHS and RHS doesn't
   // mean that it's useful.  e.g. Knowing that `x < y` doesn't let us
   // prove whether `x + 5 < y`.
   CompareResult result = CompareResult::kUnknown;
-  for (const auto& known : known_between_lhs_and_rhs) {
-    switch (known.result_) {
+  for (const auto& cmp : lhs_to_rhs) {
+    switch (cmp.result_) {
       case CompareResult::kInconsistent:
         result = CompareResult::kInconsistent;
         break;
 
       case CompareResult::kEQ:
-        if (offset == known.offset_) {
+        if (offset == cmp.offset_) {
           result = result & CompareResult::kEQ;
         } else {
           result = result & CompareResult::kNE;
@@ -749,23 +837,23 @@ CompareResult TransitiveComparisonAnalyzer::Impl::DFSFromLHS(Key lhs_key_input,
         break;
 
       case CompareResult::kLE:
-        if (known.offset_ < offset) {
+        if (cmp.offset_ < offset) {
           result = result & CompareResult::kLT;
-        } else if (known.offset_ <= offset) {
+        } else if (cmp.offset_ <= offset) {
           result = result & CompareResult::kLE;
         }
         break;
 
       case CompareResult::kGE:
-        if (known.offset_ > offset) {
+        if (cmp.offset_ > offset) {
           result = result & CompareResult::kGT;
-        } else if (known.offset_ >= offset) {
+        } else if (cmp.offset_ >= offset) {
           result = result & CompareResult::kGE;
         }
         break;
 
       case CompareResult::kNE:
-        if (offset == known.offset_) {
+        if (offset == cmp.offset_) {
           result = result & CompareResult::kNE;
         }
         break;
@@ -779,7 +867,7 @@ CompareResult TransitiveComparisonAnalyzer::Impl::DFSFromLHS(Key lhs_key_input,
         return CompareResult::kInconsistent;
 
       default:
-        LOG(FATAL) << "Invalid CompareResult: " << static_cast<int>(known.result_);
+        LOG(FATAL) << "Invalid CompareResult: " << static_cast<int>(cmp.result_);
         return CompareResult::kInconsistent;
     }
   }
diff --git a/tests/python/unittest/test_tir_transform_simplify.py b/tests/python/unittest/test_tir_transform_simplify.py
index 91ef60f9d3f1..4c5499edcfb0 100644
--- a/tests/python/unittest/test_tir_transform_simplify.py
+++ b/tests/python/unittest/test_tir_transform_simplify.py
@@ -989,5 +989,19 @@ def expected(A: T.Buffer[1, "bool"], n: T.int32, m: T.int32):
         A[0] = n < m + 10
 
 
+class TestProvableConditionWithOffset(BaseBeforeAfter):
+    """Use scoped-constraint to prove inequalities"""
+
+    transitively_prove_inequalities = False
+
+    def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32):
+        if i < j:
+            A[0] = i < j + 1
+
+    def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32):
+        if i < j:
+            A[0] = True
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 76cd298bd908583b53bc97a383a809101a569cb4 Mon Sep 17 00:00:00 2001
From: Chris Sullivan <csullivan@octoml.ai>
Date: Mon, 31 Oct 2022 08:51:51 -0700
Subject: [PATCH 471/704] [Hexagon] Add E2E test demonstrating how to apply
 blocked layout schedule to conv2d via metaschedule (#13180)

Demonstrates the use of a custom search space generation from a manual written schedule. In this there are no degrees of freedom so tuning will not improve the schedule currently, but the test demonstrates the schedule-time layout transformation of the conv2d primfunc to the 8x8x32 packed layout for convolution ops.

* Add test that demonstrates applying a custom TIR schedule to E2E model.

* Add example scheduling that demonstrates converting input and output activation
to Hexagon's blocked layout.

* PR feedback: Remove latency measurement and profiling report.

* Update comment to reflect PR discussion summary, remove TODO.

* Use ScheduleFn space generator to disable any autotuning.
---
 .../metaschedule_e2e/test_resnet50_int8.py    | 174 ++++++++++++++++++
 1 file changed, 174 insertions(+)

diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index addbb052a2da..e7400aee61f6 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -18,14 +18,17 @@
 import numpy as np
 import pytest
 import tempfile
+from typing import Optional
 
 import tvm
 import tvm.testing
 from tvm import relay
+from tvm._ffi import register_func
 from tvm.meta_schedule import postproc, schedule_rule
 from tvm.tir.tensor_intrin.hexagon import VRMPY_u8i8i32_INTRIN, VRMPY_u8u8i32_INTRIN
 from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
 from tvm import meta_schedule as ms
+from tvm.tir.schedule import BlockRV, Schedule
 from ..infrastructure import get_hexagon_target
 
 
@@ -186,3 +189,174 @@ def test_resnet50(hexagon_launcher):
             hexagon_lowered.get_graph_json(), hexagon_lowered.lib
         )
         print(debug_ex.profile(input_name=inp.copy()))
+
+
+def _schedule_packed_8x8x32_conv2d(do_tune: bool):
+    """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
+    using 8x8x32 packed layout.
+    """
+
+    def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
+        if conv2d_block == None:
+            try:
+                conv2d_block = sch.get_block("conv2d_NCHWc_int8")
+            except:
+                return False
+
+        assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
+
+        # Apply scheduling
+
+        post_blocks = sch.get_consumers(conv2d_block)
+        if len(post_blocks) > 0:
+            # Fuse all intermediate post ops into the last op.
+            # This is equivalent to the traverse_inline function used in TE schedules.
+            while True:
+                next_post_blocks = []
+                for post_block in post_blocks:
+                    next_consumers = sch.get_consumers(post_block)
+                    if len(next_consumers) > 0:
+                        sch.compute_inline(post_block)
+                    next_post_blocks += next_consumers
+                if len(next_post_blocks) == 0:
+                    assert len(post_blocks) == 1
+                    outer_block = post_blocks[0]
+                    break
+                post_blocks = next_post_blocks
+        else:
+            outer_block = conv2d_block
+
+        # Move the conv2d mma into the injective post mma compute block
+        if outer_block != conv2d_block:
+            loops = sch.get_loops(outer_block)
+            # TODO(csullivan): Currently does all post conv2d mma steps
+            # directly after accumulation for one spatial pixel. May
+            # be desirable to do this with coarser spatial granularity
+            sch.compute_at(conv2d_block, loops[4])
+
+        def index_map_nchw32c_nchw8h8w32c(n, c, h, w, c32):
+            return [n, c, h // 8, w // 8, h % 8, w % 8, c32]
+
+        # Add cache for input and output activation layout transform,
+        # note that weight is already in correct layout
+        input_cache = sch.cache_read(conv2d_block, 0, "global")
+        output_cache = sch.cache_write(outer_block, 0, "global")
+        # Transform the layout of the input
+        sch.transform_layout(
+            conv2d_block, ("read", 0), index_map=index_map_nchw32c_nchw8h8w32c, pad_value=0
+        )
+        # Transform the layout of the int32 accumulator
+        sch.transform_layout(
+            conv2d_block, ("write", 0), index_map=index_map_nchw32c_nchw8h8w32c, pad_value=0
+        )
+        # Transform the layout of the output
+        sch.transform_layout(
+            outer_block, ("write", 0), index_map=index_map_nchw32c_nchw8h8w32c, pad_value=0
+        )
+        return True
+
+    return schedule_fn
+
+
+def tune_packed_8x8x32_template(mod, params, hexagon_launcher):
+    def schedule_rule_conv2d_packed_8x8x32(sch: Schedule, conv2d_block: BlockRV):
+        _schedule_packed_8x8x32_conv2d(do_tune=True)(sch, conv2d_block)
+        return [sch]
+
+    register_func("meta_schedule.conv2d_NCHWc_int8", schedule_rule_conv2d_packed_8x8x32)
+
+    def schedule_conv2d_for_tune(sch: Schedule):
+        _schedule_packed_8x8x32_conv2d(do_tune=True)(sch)
+
+    # This line is necessary for link-params to take effect during
+    # task extraction and relay.build(...).
+    mod = mod.with_attr("executor", executor)
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            target=target,
+            params=params,
+            work_dir=work_dir,
+            max_trials_global=20000,
+            max_trials_per_task=1,
+            num_trials_per_iter=1,
+            strategy="replay-trace",
+            builder=get_hexagon_local_builder(),
+            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
+            # Apply MS auto scheduling rules for all blocks, but utilize
+            # the custom block scheduling strategy registered above for
+            # blocks annotated as `schedule_rule:meta_schedule.conv2d_NCHWc_int8`
+            # space=ms.space_generator.PostOrderApply(
+            #     f_block_filter=None,
+            #     sch_rules="from-target",
+            #     postprocs=[],
+            #     mutator_probs="from-target",
+            # ),
+            # Constrain search space to only be the single
+            # schedule provided for all blocks. No auto
+            # scheduling will be possible.
+            space=ms.space_generator.ScheduleFn(
+                schedule_conv2d_for_tune,
+                sch_rules=[],
+                postprocs=[],
+                mutator_probs={},
+            ),
+            # Without this, the same workloads with different constant weights
+            # are treated as distinct tuning tasks.
+            module_equality="ignore-ndarray",
+        )
+        return ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            target=target,
+            params=params,
+        )
+
+
+@pytest.mark.skip("End-to-end tuning is skipped on CI.")
+@tvm.testing.requires_hexagon
+def test_packed_8x8x32_resnet50(hexagon_launcher):
+    if not os.path.exists(model_json):
+        pytest.skip(msg="Run python export_models.py first.")
+
+    with open(model_json, "r") as fi:
+        mod = tvm.ir.load_json(fi.read())
+
+    with open(model_params, "rb") as fi:
+        params = relay.load_param_dict(fi.read())
+    inp = np.random.randn(1, 3, 224, 224).astype("float32")
+    input_name = "image"
+
+    do_tune = True
+
+    if do_tune:
+        hexagon_lowered = tune_packed_8x8x32_template(mod, params, hexagon_launcher)
+    else:
+        with tvm.transform.PassContext(opt_level=3):
+            hexagon_lowered = relay.build(
+                mod,
+                tvm.target.Target(target, host=target),
+                params=params,
+                executor=executor,
+            )
+
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            params=params,
+        )
+
+    with hexagon_launcher.start_session() as session:
+        graph_mod = session.get_executor_from_factory(hexagon_lowered)
+        graph_mod.set_input(input_name, inp.copy())
+        graph_mod.run()
+        hexagon_output = graph_mod.get_output(0).numpy()
+
+        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+        llvm_graph_mod.set_input(input_name, inp.copy())
+        llvm_graph_mod.run()
+        ref_result = llvm_graph_mod.get_output(0).numpy()
+
+        np.testing.assert_allclose(ref_result, hexagon_output, atol=1e-4, rtol=1e-5)

From c9b10a80fd9b99baa696f152d3eb7fba2233b9c7 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Mon, 31 Oct 2022 12:12:23 -0700
Subject: [PATCH 472/704] [ci][java] Use mavenCentral for dependencies (#13239)

This swaps from jcenter to mavenCentral to resolve networking issues. Fixes #13240
---
 apps/android_deploy/app/download-models.gradle | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/android_deploy/app/download-models.gradle b/apps/android_deploy/app/download-models.gradle
index 38fd35a78653..dc1abf81aa35 100644
--- a/apps/android_deploy/app/download-models.gradle
+++ b/apps/android_deploy/app/download-models.gradle
@@ -31,7 +31,7 @@ def models = ['extraction.zip']
 def MODEL_URL = 'https://github.com/PariksheetPinjari909/TVM_models/blob/master/extraction_model'
 buildscript {
     repositories {
-        jcenter()
+        mavenCentral()
     }
     dependencies {
         classpath 'de.undercouch:gradle-download-task:5.0.4'

From e48dded3b4705d061f2925919790e3a430327d30 Mon Sep 17 00:00:00 2001
From: "i.Pear" <434317543@qq.com>
Date: Tue, 1 Nov 2022 04:48:31 +0800
Subject: [PATCH 473/704] [BugFix][LLVM] Add UseInitArray flag in
 target_options_ (#13115)

Pointers to constructor functions will be placed in section ".ctors" or ".init_array" in a binary file. However, the ".ctors" section is deprecated and Glibc is not going to call the constructor functions in it.
If we use GNU linker ld to link objects (by default), it will convert ".ctors" section to ".init_array" automatically, but not every linker has the compatibility. If we use lld (the LLVM linker) for better runtime performance (such as Link-Time-Optimization) and better compilation efficiency, the constructor functions will not be called, which will cause critical problems.
Adding "UseInitArray" option to target_options_ will generate ".init_array" instead of ".ctors" at compile time, which will solve this problem. This is also Clang's default behavior.
---
 src/target/llvm/llvm_instance.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/target/llvm/llvm_instance.cc b/src/target/llvm/llvm_instance.cc
index 19ff480452e8..44454fc6b92d 100644
--- a/src/target/llvm/llvm_instance.cc
+++ b/src/target/llvm/llvm_instance.cc
@@ -242,6 +242,8 @@ LLVMTargetInfo::LLVMTargetInfo(LLVMInstance& instance, const Target& target) {
     opt_level_ = defaults::opt_level;
   }
 
+  target_options_.UseInitArray = true;
+
   // Fast math options
 
   auto GetBoolFlag = [&target](llvm::StringRef flag) -> bool {

From c3bb62e5c405a2b4b1854f3dc9bd7e9bf3110773 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Mon, 31 Oct 2022 15:26:59 -0700
Subject: [PATCH 474/704] [skip ci][docs] Disable scipy intersphinx linking
 (#13245)

This disables scipy intersphinx linking for the same reason that
matplotlib and numpy are disabled. Should fix failures like
https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4613/pipeline,
though we should investigate if we can switch these back on for release
doc builds
---
 docs/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/conf.py b/docs/conf.py
index 592d149c4ce4..e44c5baf6d6a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -201,7 +201,7 @@ def git_describe_version(original_version):
 intersphinx_mapping = {
     "python": ("https://docs.python.org/{.major}".format(sys.version_info), None),
     # "numpy": ("https://numpy.org/doc/stable", None),
-    "scipy": ("https://docs.scipy.org/doc/scipy", None),
+    # "scipy": ("https://docs.scipy.org/doc/scipy", None),
     # "matplotlib": ("https://matplotlib.org/", None),
 }
 

From 67fa959f1854229281ebb5174801591ec9f34c37 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 31 Oct 2022 21:00:57 -0500
Subject: [PATCH 475/704] [CI] Make additional_flags parameter optional in
 tests/scripts/ci.py (#13238)

This parameter was introduced in
https://github.com/apache/tvm/pull/12833, and was passed for all
subcommands defined using `generate_command`.  However, this broke the
`ci.py lint` subcommand.  This PR makes the `additional_flags`
parameter to the `docker` function be optional, to avoid this
breakage.
---
 tests/scripts/ci.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py
index 02ef7b888b80..cfb91b37ce56 100755
--- a/tests/scripts/ci.py
+++ b/tests/scripts/ci.py
@@ -153,7 +153,7 @@ def docker(
     scripts: List[str],
     env: Dict[str, str],
     interactive: bool,
-    additional_flags: Dict[str, str],
+    additional_flags: Optional[Dict[str, str]] = None,
 ):
     """
     Invoke a set of bash scripts through docker/bash.sh
@@ -204,9 +204,10 @@ def docker(
         command.append("--env")
         command.append(f"{key}={value}")
 
-    for key, value in additional_flags.items():
-        command.append(key)
-        command.append(value)
+    if additional_flags is not None:
+        for key, value in additional_flags.items():
+            command.append(key)
+            command.append(value)
 
     SCRIPT_DIR.mkdir(exist_ok=True)
 
@@ -357,7 +358,7 @@ def generate_command(
     help: str,
     precheck: Optional[Callable[[], None]] = None,
     post_build: Optional[List[str]] = None,
-    additional_flags: Dict[str, str] = {},
+    additional_flags: Optional[Dict[str, str]] = None,
 ):
     """
     Helper to generate CLIs that:

From 0683ece044451a0cf933fb6487d604b9a69a8f6e Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Mon, 31 Oct 2022 20:06:28 -0700
Subject: [PATCH 476/704] [MetaSchedule] Fix thread bindings of
 MultiLevelTilingTensorCore (#13243)

---
 .../schedule_rule/multi_level_tiling_tensor_core.cc          | 5 +++++
 src/meta_schedule/schedule_rule/schedule_rule.cc             | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index e8a03c722656..37c35248329a 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -556,6 +556,11 @@ ScheduleRule ScheduleRule::MultiLevelTilingTensorCore(
     Optional<Integer> max_innermost_factor, Optional<Array<Integer>> vector_load_lens,
     Optional<Map<String, ObjectRef>> reuse_read, Optional<Map<String, ObjectRef>> reuse_write,
     bool use_software_pipeline) {
+  if (tile_binds.defined()) {
+    for (const String& tile_bind : tile_binds.value()) {
+      CHECK_NE(tile_bind, "threadIdx.x") << "Cannot bind to threadIdx.x when using tensor core.";
+    }
+  }
   auto node = MultiLevelTilingInitCommon<MultiLevelTilingTensorCoreNode>(
       structure, tile_binds, max_innermost_factor, vector_load_lens, reuse_read, reuse_write);
 
diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc
index bd492d03eac6..8e4642b50ddb 100644
--- a/src/meta_schedule/schedule_rule/schedule_rule.cc
+++ b/src/meta_schedule/schedule_rule/schedule_rule.cc
@@ -139,7 +139,7 @@ Array<ScheduleRule> ScheduleRule::DefaultCUDATensorCore() {
   Array<ScheduleRule> results{ScheduleRule::MultiLevelTilingTensorCore(
       /*intrin_groups=*/intrin_groups,
       /*structure=*/"SSSRRSRS",
-      /*tile_binds=*/Array<String>{"blockIdx.x", "vthread.x", "threadIdx.x"},
+      /*tile_binds=*/Array<String>{"blockIdx.y", "blockIdx.x", "threadIdx.y"},
       /*max_innermost_factor=*/Integer(4),
       /*vector_load_lens=*/Array<Integer>{1, 2, 3, 4, 8, 16},
       /*reuse_read=*/

From c69f8ce9c97d82acbd509c12b82a536d5e7cef09 Mon Sep 17 00:00:00 2001
From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com>
Date: Tue, 1 Nov 2022 06:07:07 +0300
Subject: [PATCH 477/704] [Relay] Add ClipAndConsecutiveCast and CastClip to
 SimplifyExpr (#13236)

This commit adds SimplifyClipAndConsecutiveCast and SimplifyCastClip to
SimplifyExpr Relay pass. These simplify sequence clip->cast->cast and
cast->clip based on Clip min/max attributes and Cast target data type.

1) SimplifyClipAndConsecutiveCast example:
   %0 == [type=int32]
   %1 = clip(%0, a_min=0f, a_max=255f) [type=int32]
   %2 = cast(%1, dtype="uint8") [type=uint8]
   %3 = cast(%2, dtype="int32") [type=int32]

   --> Here Clip dtype == Cast2 dtype and max_value("uint8") == 255
   min_value("uint8") == 0

   Optimized sequence (both casts can be removed):
   %1 = clip(%0, a_min=0f, a_max=255f) [type=int32]

2) SimplifyCastClip example:
   %1 = cast(%0, dtype="uint8") [type=uint8]
   %2 = clip(%1, a_min=0f, a_max=255f) [type=int8]

   Optimized sequence (remove Clip):
   %1 = cast(%0, dtype="uint8") [type=uint8]
---
 src/relay/transforms/simplify_expr.cc         | 102 ++++++++++++++++++
 tests/python/relay/test_pass_simplify_expr.py |  31 ++++++
 2 files changed, 133 insertions(+)

diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index 0dfb45577280..6cae728b304f 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -159,6 +159,106 @@ class SimplifyConsecutiveCast : public DFPatternRewrite {
   DFPattern cast1_;
 };
 
+bool CheckDataTypeMaxMinValue(DataType dtype, double min_value, double max_value) {
+  if (dtype.is_int() || dtype.is_uint()) {
+    double ubound = static_cast<double>(Downcast<IntImm>(tvm::max_value(dtype))->value);
+    double lbound = static_cast<double>(Downcast<IntImm>(tvm::min_value(dtype))->value);
+    return ubound == max_value && lbound == min_value;
+  } else if (dtype.is_float()) {
+    double ubound = Downcast<FloatImm>(tvm::max_value(dtype))->value;
+    double lbound = Downcast<FloatImm>(tvm::min_value(dtype))->value;
+    return ubound == max_value && lbound == min_value;
+  }
+
+  return false;
+}
+
+/*!
+ * \brief SimplifyClipAndConsecutiveCast matches the pattern clip->cast->cast and remove redundant
+ *   casts.
+ * Analysis of "redundancy" is done based on clip min/max values and min/max values of casted data
+ * type.
+ */
+class SimplifyClipAndConsecutiveCast : public DFPatternRewrite {
+ public:
+  SimplifyClipAndConsecutiveCast() {
+    clip_ = IsOp("clip")({IsWildcard()});
+    cast1_ = IsOp("cast")({clip_});
+    pattern_ = IsOp("cast")({cast1_});
+  }
+
+  Expr Callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
+    auto clip = Downcast<Call>(node_map[clip_][0]);
+    const CallNode* clip_node = clip.as<CallNode>();
+    const ClipAttrs* clip_attrs = clip_node->attrs.as<ClipAttrs>();
+    DataType clip_dtype = Downcast<TensorType>(clip->checked_type())->dtype;
+
+    auto cast1 = Downcast<Call>(node_map[cast1_][0]);
+    DataType cast1_dtype = Downcast<TensorType>(cast1->checked_type())->dtype;
+
+    auto cast2 = Downcast<Call>(post);
+    DataType cast2_dtype = Downcast<TensorType>(cast2->checked_type())->dtype;
+
+    if (clip_dtype == cast2_dtype &&
+        CheckDataTypeMaxMinValue(cast1_dtype, clip_attrs->a_min, clip_attrs->a_max)) {
+      // Case 1:
+      // Data type of Clip == target data type of second Cast and min/max value of Clip == min/max
+      // value of first Clip target data type. In this case both Clip ops can be removed.
+      // Example:
+      //   %0 == [type=int32]
+      //   %1 = clip(%0, a_min=0f, a_max=255f) [type=int32]
+      //   %2 = cast(%1, dtype="uint8") [type=uint8]
+      //   %3 = cast(%2, dtype="int32") [type=int32]
+      //
+      // Optimized to (both casts can be removed):
+      //   %1 = clip(%0, a_min=0f, a_max=255f) [type=int32]
+      return node_map[clip_][0];
+    }
+    return post;
+  }
+
+ protected:
+  DFPattern clip_, cast1_;
+};
+
+/*!
+ * \brief SimplifyCastClip matches the pattern cast->clip and remove redundant Cast based on Clip
+ *    min/max values and min/max values of Cast target data type.
+ *
+ * Example:
+ *   %1 = cast(%0, dtype="uint8") [type=uint8]
+ *   %2 = clip(%1, a_min=0f, a_max=255f) [type=int8]
+ *
+ * Optimized to (remove Clip):
+ *   %1 = cast(%0, dtype="uint8") [type=uint8]
+ */
+class SimplifyCastClip : public DFPatternRewrite {
+ public:
+  SimplifyCastClip() {
+    cast_ = IsOp("cast")({IsWildcard()});
+    pattern_ = IsOp("clip")({cast_});
+  }
+
+  Expr Callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
+    auto cast = Downcast<Call>(node_map[cast_][0]);
+    DataType cast_dtype = Downcast<TensorType>(cast->checked_type())->dtype;
+
+    auto clip = Downcast<Call>(post);
+    const CallNode* clip_node = clip.as<CallNode>();
+    const ClipAttrs* clip_attrs = clip_node->attrs.as<ClipAttrs>();
+
+    if (CheckDataTypeMaxMinValue(cast_dtype, clip_attrs->a_min, clip_attrs->a_max)) {
+      return node_map[cast_][0];
+    }
+    return post;
+  }
+
+ protected:
+  DFPattern clip_, cast_;
+};
+
 /*!
  * \brief SimplifyTranspose matches the pattern of consecutive transpose op,
  *   and merges or cancels them.
@@ -837,6 +937,8 @@ Expr SimplifyExpr(const Expr& expr, const IRModule& mod) {
   composer.AddRewrite<SimplifyDQArgMax>();
   composer.AddRewrite<SimplifyDQArgMin>();
   composer.AddRewrite<SimplifyDQArgSort>();
+  composer.AddRewrite<SimplifyClipAndConsecutiveCast>();
+  composer.AddRewrite<SimplifyCastClip>();
   return RewritePatterns(composer.MakeCallbacks(), expr, mod);
 }
 
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index 8d5ea28ade61..6df07966eb0a 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -698,5 +698,36 @@ def expected():
     assert tvm.ir.structural_equal(opt, after)
 
 
+def test_simplify_clip_cast():
+    x = relay.var("x", shape=(4, 8), dtype="int32")
+
+    def before():
+        clip = relay.clip(x, a_min=0.0, a_max=255.0)
+        cast = relay.cast(clip, "uint8")
+        return relay.cast(cast, "int32")
+
+    def expected():
+        return relay.clip(x, a_min=0.0, a_max=255.0)
+
+    opt = run_opt_pass(before(), transform.SimplifyExpr())
+    ref = run_infer_type(expected())
+    assert tvm.ir.structural_equal(opt, ref)
+
+
+def test_simplify_cast_clip():
+    x = relay.var("x", shape=(4, 8), dtype="int32")
+
+    def before():
+        cast = relay.cast(x, "uint8")
+        return relay.clip(cast, a_min=0.0, a_max=255.0)
+
+    def expected():
+        return relay.cast(x, "uint8")
+
+    opt = run_opt_pass(before(), transform.SimplifyExpr())
+    ref = run_infer_type(expected())
+    assert tvm.ir.structural_equal(opt, ref)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From 3259580bd536360f19440fe3053c1914cb5ccada Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Mon, 31 Oct 2022 20:45:47 -0700
Subject: [PATCH 478/704] [Hexagon] Make pytest use a random port if not
 running in CI (#13244)

[Hexagon] Make pytest use a random port if not running in CI.
---
 python/tvm/contrib/hexagon/pytest_plugin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index 8e209611133a..bc167f25045d 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -93,7 +93,7 @@ def get_free_port() -> int:
     global PREVIOUS_PORT
     global RNG_SEEDED
 
-    if not RNG_SEEDED:
+    if tvm.testing.utils.IS_IN_CI and not RNG_SEEDED:
         random.seed(0)
         RNG_SEEDED = True
 

From 9cdc97fe237dd4fc9741b9d83e50b25bdead60cb Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Tue, 1 Nov 2022 01:21:16 -0400
Subject: [PATCH 479/704] [DLPack][runtime] Update DLPack to v0.7 (#13177)

- Update the `3rdparty/dlpack` git submodule from v0.5 to v0.7, so that
the `DLDeviceType` enumeration has an explicitly-stated underlying
storage type.  This addresses a compiler warning generated by clang
15.0.3.

- Remove `kDLHexagon` and `kDLWebGPU` from `TVMDeviceExtType`, because
those enumerators are now provided by `DLDeviceType`.

- Renumber the members of `TVMDeviceExtType` to reduce the chance of
unnoticed collision with members of `DLDeviceType`.
---
 3rdparty/dlpack                               |  2 +-
 include/tvm/runtime/c_runtime_api.h           | 73 +++++++++++++--
 include/tvm/runtime/device_api.h              | 11 +++
 .../src/main/java/org/apache/tvm/Device.java  | 79 +++++++++-------
 .../src/main/java/org/apache/tvm/NDArray.java | 31 +++----
 .../java/org/apache/tvm/rpc/RPCSession.java   | 25 +++--
 python/tvm/_ffi/runtime_ctypes.py             | 93 ++++++++++++-------
 python/tvm/rpc/client.py                      | 19 ++--
 python/tvm/runtime/ndarray.py                 | 26 +++---
 src/runtime/aot_executor/aot_executor.cc      |  4 +-
 src/runtime/hexagon/hexagon_common.h          |  4 +-
 src/runtime/hexagon/hexagon_device_api.cc     |  6 +-
 src/runtime/hexagon/hexagon_device_api.h      |  3 +-
 13 files changed, 239 insertions(+), 137 deletions(-)

diff --git a/3rdparty/dlpack b/3rdparty/dlpack
index ddeb264880a1..e2bdd3bee8cb 160000
--- a/3rdparty/dlpack
+++ b/3rdparty/dlpack
@@ -1 +1 @@
-Subproject commit ddeb264880a1fa7e7be238ab3901a810324fbe5f
+Subproject commit e2bdd3bee8cb6501558042633fa59144cc8b7f5f
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 03c662ca1922..cd1146697759 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -80,17 +80,74 @@ extern "C" {
 /*! \brief type of array index. */
 typedef int64_t tvm_index_t;
 
-/*! \brief Extension device types in TVM */
+/*! \brief Extension device types in TVM
+ *
+ * Additional enumerators to supplement those provided by
+ * DLPack's `DLDeviceType` enumeration.
+ *
+ * MAINTAINERS NOTE #1: We need to ensure that the two devices
+ * are identified by the same integer.
+ * Currently this requires manual verification.
+ * Discussed here: https://github.com/dmlc/dlpack/issues/111
+ * As of DLPack v0.7, the highest-valued enumerator in
+ * `DLDeviceType` is kDLHexagon = 16.
+ *
+ * MAINTAINERS NOTE #2: As of DLPack v0.7, the definition for
+ * `DLDeviceType` specifies an underlying storage type of
+ * `int32_t`.  That guarantees a variable of type
+ * `DLDeviceType` is capable of holding any integers provided
+ * by *either* of these enumerations.
+ *
+ * However, the `int32_t` specification only applies when the
+ * header file is compiled as C++, and this header file is also
+ * meant to work as C code.  So the unspecified storage type
+ * could be a latent bug when compiled as C.
+ */
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
 typedef enum {
-  kDLAOCL = 5,
-  kDLSDAccel = 6,
-  kOpenGL = 11,
-  kDLMicroDev = 13,
-  kDLHexagon = 14,
-  kDLWebGPU = 15
-  // AddExtraTVMType which is not in DLPack here
+#endif
+  // To help avoid accidental conflicts between `DLDeviceType`
+  // and this enumeration, start numbering the new enumerators
+  // a little higher than (currently) seems necessary.
+  kDLAOCL = 32,
+  kDLSDAccel,
+  kOpenGL,
+  kDLMicroDev,
+  TVMDeviceExtType_End,  // sentinel value
 } TVMDeviceExtType;
 
+#ifdef __cplusplus
+// Some other parts of TVM hardcode the integer identifier for
+// some DLPack / TVM devices, rather then using the symbolic
+// enumerator.   E.g., `2` rather than `kDLCUDA`.
+// These asserts should alert us when that mapping breaks.
+#define TVM_HARCODED_INTEGER_CHANGED_MSG                                                          \
+  "Change in compile-time integer.  Make sure hardcoded uses of this integer throughout TVM are " \
+  "updated."
+static_assert(kDLCPU == 1, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLCUDA == 2, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLCUDAHost == 3, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLOpenCL == 4, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLVulkan == 7, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLMetal == 8, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLVPI == 9, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLROCM == 10, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLROCMHost == 11, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLExtDev == 12, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLCUDAManaged == 13, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLOneAPI == 14, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLWebGPU == 15, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLHexagon == 16, TVM_HARCODED_INTEGER_CHANGED_MSG);
+
+static_assert(kDLAOCL == 32, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLSDAccel == 33, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kOpenGL == 34, TVM_HARCODED_INTEGER_CHANGED_MSG);
+static_assert(kDLMicroDev == 35, TVM_HARCODED_INTEGER_CHANGED_MSG);
+#undef TVM_HARCODED_INTEGER_CHANGED_MSG
+#endif
+
 /*!
  * \brief The type code in used and only used in TVM FFI for argument passing.
  *
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index 1bb10fa17ae6..9613563f95b4 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -234,6 +234,7 @@ class TVM_DLL DeviceAPI {
 
 /*! \brief The device type bigger than this is RPC device */
 constexpr int kRPCSessMask = 128;
+static_assert(kRPCSessMask >= TVMDeviceExtType_End);
 
 /*!
  * \brief The name of Device API factory.
@@ -248,6 +249,8 @@ inline const char* DeviceName(int type) {
       return "cuda";
     case kDLCUDAHost:
       return "cuda_host";
+    case kDLCUDAManaged:
+      return "cuda_managed";
     case kDLOpenCL:
       return "opencl";
     case kDLSDAccel:
@@ -262,12 +265,20 @@ inline const char* DeviceName(int type) {
       return "vpi";
     case kDLROCM:
       return "rocm";
+    case kDLROCMHost:
+      return "rocm_host";
     case kDLExtDev:
       return "ext_dev";
+    case kDLOneAPI:
+      return "oneapi";
     case kDLWebGPU:
       return "webgpu";
     case kDLHexagon:
       return "hexagon";
+    case kOpenGL:
+      return "opengl";
+    case kDLMicroDev:
+      return "microdev";
     default:
       LOG(FATAL) << "unknown type =" << type;
       return "Unknown";
diff --git a/jvm/core/src/main/java/org/apache/tvm/Device.java b/jvm/core/src/main/java/org/apache/tvm/Device.java
index 6b10a26ed3b3..a5f341a69055 100644
--- a/jvm/core/src/main/java/org/apache/tvm/Device.java
+++ b/jvm/core/src/main/java/org/apache/tvm/Device.java
@@ -17,32 +17,40 @@
 
 package org.apache.tvm;
 
-import org.apache.tvm.rpc.RPC;
-
 import java.util.HashMap;
 import java.util.Map;
+import org.apache.tvm.rpc.RPC;
 
 public class Device {
+  /**
+   * Provides the same information as the C++ enums DLDeviceType and
+   * TVMDeviceExtType.
+   */
+  static final int kDLCPU = 1, kDLCUDA = 2, kDLCUDAHost = 3, kDLOpenCL = 4, kDLVulkan = 7,
+                   kDLMetal = 8, kDLVPI = 9, kDLROCM = 10, kDLROCMHost = 11, kDLExtDev = 12,
+                   kDLCUDAManaged = 13, kDLOneAPI = 14, kDLWebGPU = 15, kDLHexagon = 16,
+                   kDLAOCL = 32, kDLSDAccel = 33, kOpenGL = 34, kDLMicroDev = 35;
+
   private static final Map<Integer, String> MASK2STR = new HashMap<Integer, String>();
   private static final Map<String, Integer> STR2MASK = new HashMap<String, Integer>();
 
   static {
-    MASK2STR.put(1, "cpu");
-    MASK2STR.put(2, "cuda");
-    MASK2STR.put(4, "opencl");
-    MASK2STR.put(7, "vulkan");
-    MASK2STR.put(8, "metal");
-    MASK2STR.put(9, "vpi");
-    MASK2STR.put(14, "hexagon");
-
-    STR2MASK.put("cpu", 1);
-    STR2MASK.put("cuda", 2);
-    STR2MASK.put("cl", 4);
-    STR2MASK.put("opencl", 4);
-    STR2MASK.put("vulkan", 7);
-    STR2MASK.put("metal", 8);
-    STR2MASK.put("vpi", 9);
-    STR2MASK.put("hexagon", 14);
+    MASK2STR.put(kDLCPU, "cpu");
+    MASK2STR.put(kDLCUDA, "cuda");
+    MASK2STR.put(kDLOpenCL, "opencl");
+    MASK2STR.put(kDLVulkan, "vulkan");
+    MASK2STR.put(kDLMetal, "metal");
+    MASK2STR.put(kDLVPI, "vpi");
+    MASK2STR.put(kDLHexagon, "hexagon");
+
+    STR2MASK.put("cpu", kDLCPU);
+    STR2MASK.put("cuda", kDLCUDA);
+    STR2MASK.put("cl", kDLOpenCL);
+    STR2MASK.put("opencl", kDLOpenCL);
+    STR2MASK.put("vulkan", kDLVulkan);
+    STR2MASK.put("metal", kDLMetal);
+    STR2MASK.put("vpi", kDLVPI);
+    STR2MASK.put("hexagon", kDLHexagon);
   }
 
   /**
@@ -51,7 +59,7 @@ public class Device {
    * @return The created device
    */
   public static Device cpu(int devId) {
-    return new Device(1, devId);
+    return new Device(kDLCPU, devId);
   }
 
   public static Device cpu() {
@@ -64,7 +72,7 @@ public static Device cpu() {
    * @return The created device
    */
   public static Device cuda(int devId) {
-    return new Device(2, devId);
+    return new Device(kDLCUDA, devId);
   }
 
   public static Device cuda() {
@@ -77,7 +85,7 @@ public static Device cuda() {
    * @return The created device
    */
   public static Device opencl(int devId) {
-    return new Device(4, devId);
+    return new Device(kDLOpenCL, devId);
   }
 
   public static Device opencl() {
@@ -90,7 +98,7 @@ public static Device opencl() {
    * @return The created device
    */
   public static Device vulkan(int devId) {
-    return new Device(7, devId);
+    return new Device(kDLVulkan, devId);
   }
 
   public static Device vulkan() {
@@ -103,7 +111,7 @@ public static Device vulkan() {
    * @return The created device
    */
   public static Device metal(int devId) {
-    return new Device(8, devId);
+    return new Device(kDLMetal, devId);
   }
 
   public static Device metal() {
@@ -116,7 +124,7 @@ public static Device metal() {
    * @return The created device
    */
   public static Device vpi(int devId) {
-    return new Device(9, devId);
+    return new Device(kDLVPI, devId);
   }
 
   public static Device vpi() {
@@ -129,7 +137,7 @@ public static Device vpi() {
    * @return The created device
    */
   public static Device hexagon(int devId) {
-    return new Device(14, devId);
+    return new Device(kDLHexagon, devId);
   }
 
   public static Device hexagon() {
@@ -153,8 +161,8 @@ public Device(String deviceType, int deviceId) {
    * @return true if exists.
    */
   public boolean exist() {
-    TVMValue ret = APIInternal.get("_GetDeviceAttr")
-        .pushArg(deviceType).pushArg(deviceId).pushArg(0).invoke();
+    TVMValue ret =
+        APIInternal.get("_GetDeviceAttr").pushArg(deviceType).pushArg(deviceId).pushArg(0).invoke();
     return ((TVMValueLong) ret).value != 0;
   }
 
@@ -163,8 +171,8 @@ public boolean exist() {
    * @return the maximum thread number.
    */
   public long maxThreadsPerBlock() {
-    TVMValue ret = APIInternal.get("_GetDeviceAttr")
-        .pushArg(deviceType).pushArg(deviceId).pushArg(1).invoke();
+    TVMValue ret =
+        APIInternal.get("_GetDeviceAttr").pushArg(deviceType).pushArg(deviceId).pushArg(1).invoke();
     return ((TVMValueLong) ret).value;
   }
 
@@ -173,8 +181,8 @@ public long maxThreadsPerBlock() {
    * @return the thread number.
    */
   public long warpSize() {
-    TVMValue ret = APIInternal.get("_GetDeviceAttr")
-        .pushArg(deviceType).pushArg(deviceId).pushArg(2).invoke();
+    TVMValue ret =
+        APIInternal.get("_GetDeviceAttr").pushArg(deviceType).pushArg(deviceId).pushArg(2).invoke();
     return ((TVMValueLong) ret).value;
   }
 
@@ -185,11 +193,13 @@ public void sync() {
     Base.checkCall(Base._LIB.tvmSynchronize(deviceType, deviceId));
   }
 
-  @Override public int hashCode() {
+  @Override
+  public int hashCode() {
     return (deviceType << 16) | deviceId;
   }
 
-  @Override public boolean equals(Object other) {
+  @Override
+  public boolean equals(Object other) {
     if (other != null && other instanceof Device) {
       Device obj = (Device) other;
       return deviceId == obj.deviceId && deviceType == obj.deviceType;
@@ -197,7 +207,8 @@ public void sync() {
     return false;
   }
 
-  @Override public String toString() {
+  @Override
+  public String toString() {
     if (deviceType >= RPC.RPC_SESS_MASK) {
       int tblId = deviceType / RPC.RPC_SESS_MASK - 1;
       int devType = deviceType % RPC.RPC_SESS_MASK;
diff --git a/jvm/core/src/main/java/org/apache/tvm/NDArray.java b/jvm/core/src/main/java/org/apache/tvm/NDArray.java
index a301d23dfbfa..68020db03999 100644
--- a/jvm/core/src/main/java/org/apache/tvm/NDArray.java
+++ b/jvm/core/src/main/java/org/apache/tvm/NDArray.java
@@ -35,7 +35,8 @@ public class NDArray extends NDArrayBase {
     this.device = dev;
   }
 
-  @Override protected void finalize() throws Throwable {
+  @Override
+  protected void finalize() throws Throwable {
     super.finalize();
   }
 
@@ -169,8 +170,8 @@ public void copyFrom(char[] sourceArray) {
   private void checkCopySize(int sourceLength) {
     long arrSize = size();
     if (arrSize != sourceLength) {
-      throw new IllegalArgumentException(String.format("Array shape size not match: %d v.s. %d",
-        sourceLength, size()));
+      throw new IllegalArgumentException(
+          String.format("Array shape size not match: %d v.s. %d", sourceLength, size()));
     }
   }
 
@@ -219,7 +220,7 @@ public long size() {
   public double[] asDoubleArray() {
     if (dtype.typeCode != TVMType.FLOAT || dtype.bits != 64) {
       throw new IllegalArgumentException(
-        "Cannot set convert to double[] for " + dtype.toString() + " array");
+          "Cannot set convert to double[] for " + dtype.toString() + " array");
     }
     byte[][] units = groupInternalBytes();
     double[] array = new double[units.length];
@@ -237,7 +238,7 @@ public double[] asDoubleArray() {
   public float[] asFloatArray() {
     if (dtype.typeCode != TVMType.FLOAT || dtype.bits != 32) {
       throw new IllegalArgumentException(
-        "Cannot set convert to float[] for " + dtype.toString() + " array");
+          "Cannot set convert to float[] for " + dtype.toString() + " array");
     }
     byte[][] units = groupInternalBytes();
     float[] array = new float[units.length];
@@ -255,7 +256,7 @@ public float[] asFloatArray() {
   public long[] asLongArray() {
     if (dtype.typeCode != TVMType.INT || dtype.bits != 64) {
       throw new IllegalArgumentException(
-        "Cannot set convert to long[] for " + dtype.toString() + " array");
+          "Cannot set convert to long[] for " + dtype.toString() + " array");
     }
     byte[][] units = groupInternalBytes();
     long[] array = new long[units.length];
@@ -273,7 +274,7 @@ public long[] asLongArray() {
   public int[] asIntArray() {
     if (dtype.typeCode != TVMType.INT || dtype.bits != 32) {
       throw new IllegalArgumentException(
-        "Cannot set convert to int[] for " + dtype.toString() + " array");
+          "Cannot set convert to int[] for " + dtype.toString() + " array");
     }
     byte[][] units = groupInternalBytes();
     int[] array = new int[units.length];
@@ -291,7 +292,7 @@ public int[] asIntArray() {
   public short[] asShortArray() {
     if (dtype.typeCode != TVMType.INT || dtype.bits != 16) {
       throw new IllegalArgumentException(
-        "Cannot set convert to short[] for " + dtype.toString() + " array");
+          "Cannot set convert to short[] for " + dtype.toString() + " array");
     }
     byte[][] units = groupInternalBytes();
     short[] array = new short[units.length];
@@ -309,7 +310,7 @@ public short[] asShortArray() {
   public char[] asCharArray() {
     if (dtype.typeCode != TVMType.UINT || dtype.bits != 16) {
       throw new IllegalArgumentException(
-        "Cannot set convert to char[] for " + dtype.toString() + " array");
+          "Cannot set convert to char[] for " + dtype.toString() + " array");
     }
     byte[][] units = groupInternalBytes();
     char[] array = new char[units.length];
@@ -327,7 +328,7 @@ public char[] asCharArray() {
   public byte[] asByteArray() {
     if (dtype.typeCode != TVMType.INT || dtype.bits != 8) {
       throw new IllegalArgumentException(
-        "Cannot set convert to byte[] for " + dtype.toString() + " array");
+          "Cannot set convert to byte[] for " + dtype.toString() + " array");
     }
     return internal();
   }
@@ -351,8 +352,7 @@ private byte[][] groupInternalBytes() {
     int unitSize = dtype.numOfBytes;
     if (raw.length <= 0 || raw.length % unitSize != 0) {
       throw new IllegalArgumentException(String.format(
-        "%s size %d cannot divide byte array size %d",
-        dtype.toString(), unitSize, raw.length));
+          "%s size %d cannot divide byte array size %d", dtype.toString(), unitSize, raw.length));
     }
 
     int numOfUnits = raw.length / unitSize;
@@ -381,8 +381,7 @@ public Device device() {
   public static NDArray empty(long[] shape, TVMType dtype, Device dev) {
     Base.RefLong refHandle = new Base.RefLong();
     Base.checkCall(Base._LIB.tvmArrayAlloc(
-        shape, dtype.typeCode, dtype.bits, dtype.lanes,
-        dev.deviceType, dev.deviceId, refHandle));
+        shape, dtype.typeCode, dtype.bits, dtype.lanes, dev.deviceType, dev.deviceId, refHandle));
     return new NDArray(refHandle.value, false, dtype, dev);
   }
 
@@ -393,7 +392,7 @@ public static NDArray empty(long[] shape, TVMType dtype, Device dev) {
    * @return The array tvm supported.
    */
   public static NDArray empty(long[] shape, TVMType dtype) {
-    return empty(shape, dtype, new Device(1, 0));
+    return empty(shape, dtype, Device.cpu(0));
   }
 
   /**
@@ -402,7 +401,7 @@ public static NDArray empty(long[] shape, TVMType dtype) {
    * @return The array tvm supported.
    */
   public static NDArray empty(long[] shape) {
-    return empty(shape, new TVMType("float32", 1), new Device(1, 0));
+    return empty(shape, new TVMType("float32", 1), Device.cpu(0));
   }
 
   /**
diff --git a/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java b/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java
index 7acafa6cfbe8..07278f07b8c2 100644
--- a/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java
+++ b/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java
@@ -17,16 +17,15 @@
 
 package org.apache.tvm.rpc;
 
-import org.apache.tvm.Device;
-import org.apache.tvm.Function;
-import org.apache.tvm.Module;
-
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;
 import java.util.Map;
+import org.apache.tvm.Device;
+import org.apache.tvm.Function;
+import org.apache.tvm.Module;
 
 /**
  * RPC Client session module.
@@ -98,7 +97,7 @@ public Device device(int devType) {
    * @return Remote CPU device.
    */
   public Device cpu(int devId) {
-    return device(1, devId);
+    return Device.cpu(devId);
   }
 
   /**
@@ -115,7 +114,7 @@ public Device cpu() {
    * @return Remote CUDA GPU device.
    */
   public Device cuda(int devId) {
-    return device(2, devId);
+    return Device.cuda(devId);
   }
 
   /**
@@ -132,7 +131,7 @@ public Device cuda() {
    * @return Remote OpenCL device.
    */
   public Device cl(int devId) {
-    return device(4, devId);
+    return Device.opencl(devId);
   }
 
   /**
@@ -149,7 +148,7 @@ public Device cl() {
    * @return Remote OpenCL device.
    */
   public Device vulkan(int devId) {
-    return device(7, devId);
+    return Device.vulkan(devId);
   }
 
   /**
@@ -160,14 +159,13 @@ public Device vulkan() {
     return vulkan(0);
   }
 
-
   /**
    * Construct remote Metal device.
    * @param devId device id.
    * @return Remote metal device.
    */
   public Device metal(int devId) {
-    return device(8, devId);
+    return Device.metal(devId);
   }
 
   /**
@@ -240,7 +238,6 @@ public Module loadModule(String path) {
     return RPC.getApi("LoadRemoteModule").pushArg(session).pushArg(path).invoke().asModule();
   }
 
-
   private static byte[] getBytesFromFile(File file) throws IOException {
     // Get the size of the file
     long length = file.length();
@@ -250,7 +247,7 @@ private static byte[] getBytesFromFile(File file) throws IOException {
     }
 
     // cannot create an array using a long type.
-    byte[] bytes = new byte[(int)length];
+    byte[] bytes = new byte[(int) length];
 
     // Read in the bytes
     int offset = 0;
@@ -258,8 +255,8 @@ private static byte[] getBytesFromFile(File file) throws IOException {
 
     InputStream is = new FileInputStream(file);
     try {
-      while (offset < bytes.length
-          && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) {
+      while (
+          offset < bytes.length && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) {
         offset += numRead;
       }
     } finally {
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index d871fff50b31..fa12bf9ce37a 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -195,44 +195,73 @@ class Device(ctypes.Structure):
     OpenCL.  Some properties may return None depending on whether an
     API exposes that particular property.
 
+    NOTE!  The integer values in MASK2STR and STR2MASK *must* correspond
+    to the values provided by the DLDeviceType and TVMDeviceExtType enums.
     """
 
+    kDLCPU = 1
+    kDLCUDA = 2
+    kDLCUDAHost = 3
+    kDLOpenCL = 4
+    kDLVulkan = 7
+    kDLMetal = 8
+    kDLVPI = 9
+    kDLROCM = 10
+    kDLROCMHost = 11
+    kDLExtDev = 12
+    kDLCUDAManaged = 13
+    kDLOneAPI = 14
+    kDLWebGPU = 15
+    kDLHexagon = 16
+    kDLAOCL = 32
+    kDLSDAccel = 33
+    kOpenGL = 34
+    kDLMicroDev = 35
+
     _fields_ = [("device_type", ctypes.c_int), ("device_id", ctypes.c_int)]
     MASK2STR = {
-        1: "cpu",
-        2: "cuda",
-        4: "opencl",
-        5: "aocl",
-        7: "vulkan",
-        8: "metal",
-        9: "vpi",
-        10: "rocm",
-        12: "ext_dev",
-        14: "hexagon",
-        15: "webgpu",
+        kDLCPU: "cpu",
+        kDLCUDA: "cuda",
+        kDLCUDAHost: "cuda_host",
+        kDLCUDAManaged: "cuda_managed",
+        kDLOpenCL: "opencl",
+        kDLVulkan: "vulkan",
+        kDLMetal: "metal",
+        kDLVPI: "vpi",
+        kDLROCM: "rocm",
+        kDLROCMHost: "rocm_host",
+        kDLExtDev: "ext_dev",
+        kDLOneAPI: "oneapi",
+        kDLWebGPU: "webgpu",
+        kDLHexagon: "hexagon",
+        kDLAOCL: "aocl",
+        kDLSDAccel: "sdaccel",
+        kOpenGL: "opengl",
+        kDLMicroDev: "microdev",
     }
+
     STR2MASK = {
-        "llvm": 1,
-        "stackvm": 1,
-        "cpu": 1,
-        "c": 1,
-        "test": 1,
-        "hybrid": 1,
-        "composite": 1,
-        "cuda": 2,
-        "nvptx": 2,
-        "cl": 4,
-        "opencl": 4,
-        "sdaccel": 4,
-        "aocl": 5,
-        "aocl_sw_emu": 5,
-        "vulkan": 7,
-        "metal": 8,
-        "vpi": 9,
-        "rocm": 10,
-        "ext_dev": 12,
-        "hexagon": 14,
-        "webgpu": 15,
+        "llvm": kDLCPU,
+        "stackvm": kDLCPU,
+        "cpu": kDLCPU,
+        "c": kDLCPU,
+        "test": kDLCPU,
+        "hybrid": kDLCPU,
+        "composite": kDLCPU,
+        "cuda": kDLCUDA,
+        "nvptx": kDLCUDA,
+        "cl": kDLOpenCL,
+        "opencl": kDLOpenCL,
+        "sdaccel": kDLOpenCL,
+        "aocl": kDLAOCL,
+        "aocl_sw_emu": kDLAOCL,
+        "vulkan": kDLVulkan,
+        "metal": kDLMetal,
+        "vpi": kDLVPI,
+        "rocm": kDLROCM,
+        "ext_dev": kDLExtDev,
+        "hexagon": kDLHexagon,
+        "webgpu": kDLWebGPU,
     }
 
     def __init__(self, device_type, device_id):
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index eddc324b3390..e7071bae8d2e 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -25,6 +25,7 @@
 from tvm._ffi.base import TVMError
 from tvm.contrib import utils
 from tvm.runtime import ndarray as nd
+from tvm._ffi.runtime_ctypes import Device
 
 from . import _ffi_api, base, server
 
@@ -197,39 +198,39 @@ def download_linked_module(self, path):
 
     def cpu(self, dev_id=0):
         """Construct CPU device."""
-        return self.device(1, dev_id)
+        return self.device(Device.kDLCPU, dev_id)
 
     def cuda(self, dev_id=0):
         """Construct CUDA GPU device."""
-        return self.device(2, dev_id)
+        return self.device(Device.kDLCUDA, dev_id)
 
     def cl(self, dev_id=0):
         """Construct OpenCL device."""
-        return self.device(4, dev_id)
+        return self.device(Device.kDLOpenCL, dev_id)
 
     def vulkan(self, dev_id=0):
         """Construct Vulkan device."""
-        return self.device(7, dev_id)
+        return self.device(Device.kDLVulkan, dev_id)
 
     def metal(self, dev_id=0):
         """Construct Metal device."""
-        return self.device(8, dev_id)
+        return self.device(Device.kDLMetal, dev_id)
 
     def rocm(self, dev_id=0):
         """Construct ROCm device."""
-        return self.device(10, dev_id)
+        return self.device(Device.kDLROCM, dev_id)
 
     def ext_dev(self, dev_id=0):
         """Construct extension device."""
-        return self.device(12, dev_id)
+        return self.device(Device.kDLExtDev, dev_id)
 
     def hexagon(self, dev_id=0):
         """Construct Hexagon device."""
-        return self.device(14, dev_id)
+        return self.device(Device.kDLHexagon, dev_id)
 
     def webgpu(self, dev_id=0):
         """Construct WebGPU device."""
-        return self.device(15, dev_id)
+        return self.device(Device.kDLWebGPU, dev_id)
 
 
 class LocalSession(RPCSession):
diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py
index 16790ca2c783..b7a325948895 100644
--- a/python/tvm/runtime/ndarray.py
+++ b/python/tvm/runtime/ndarray.py
@@ -328,11 +328,11 @@ def numpyasarray(np_data):
     arr.dtype = DataType(np.dtype(data.dtype).name)
     arr.ndim = data.ndim
     # CPU device
-    arr.device = device(1, 0)
+    arr.device = device(Device.kDLCPU, 0)
     return arr, shape
 
 
-def empty(shape, dtype="float32", device=device(1, 0), mem_scope=None):
+def empty(shape, dtype="float32", device=device(Device.kDLCPU, 0), mem_scope=None):
     """Create an empty array given shape and device
 
     Parameters
@@ -399,7 +399,7 @@ def cpu(dev_id=0):
     dev : Device
         The created device
     """
-    return Device(1, dev_id)
+    return Device(Device.kDLCPU, dev_id)
 
 
 def cuda(dev_id=0):
@@ -415,7 +415,7 @@ def cuda(dev_id=0):
     dev : Device
         The created device
     """
-    return Device(2, dev_id)
+    return Device(Device.kDLCUDA, dev_id)
 
 
 def gpu(dev_id=0):
@@ -437,7 +437,7 @@ def gpu(dev_id=0):
     warnings.warn(
         "Please use tvm.cuda() instead of tvm.gpu(). tvm.gpu() is going to be deprecated in 0.9.0",
     )
-    return Device(2, dev_id)
+    return Device(Device.kDLCUDA, dev_id)
 
 
 def rocm(dev_id=0):
@@ -453,7 +453,7 @@ def rocm(dev_id=0):
     dev : Device
         The created device
     """
-    return Device(10, dev_id)
+    return Device(Device.kDLROCM, dev_id)
 
 
 def opencl(dev_id=0):
@@ -469,7 +469,7 @@ def opencl(dev_id=0):
     dev : Device
         The created device
     """
-    return Device(4, dev_id)
+    return Device(Device.kDLOpenCL, dev_id)
 
 
 def metal(dev_id=0):
@@ -485,7 +485,7 @@ def metal(dev_id=0):
     dev : Device
         The created device
     """
-    return Device(8, dev_id)
+    return Device(Device.kDLMetal, dev_id)
 
 
 def vpi(dev_id=0):
@@ -501,7 +501,7 @@ def vpi(dev_id=0):
     dev : Device
         The created device
     """
-    return Device(9, dev_id)
+    return Device(Device.kDLVPI, dev_id)
 
 
 def vulkan(dev_id=0):
@@ -517,7 +517,7 @@ def vulkan(dev_id=0):
     dev : Device
         The created device
     """
-    return Device(7, dev_id)
+    return Device(Device.kDLVulkan, dev_id)
 
 
 def ext_dev(dev_id=0):
@@ -538,7 +538,7 @@ def ext_dev(dev_id=0):
     This API is reserved for quick testing of new
     device by plugin device API as ext_dev.
     """
-    return Device(12, dev_id)
+    return Device(Device.kDLExtDev, dev_id)
 
 
 def hexagon(dev_id=0):
@@ -554,7 +554,7 @@ def hexagon(dev_id=0):
     dev : Device
         The created device
     """
-    return Device(14, dev_id)
+    return Device(Device.kDLHexagon, dev_id)
 
 
 def webgpu(dev_id=0):
@@ -570,7 +570,7 @@ def webgpu(dev_id=0):
     dev : Device
         The created device
     """
-    return Device(15, dev_id)
+    return Device(Device.kDLWebGPU, dev_id)
 
 
 cl = opencl
diff --git a/src/runtime/aot_executor/aot_executor.cc b/src/runtime/aot_executor/aot_executor.cc
index 7f7daabf3fc2..292fe4fd64ce 100644
--- a/src/runtime/aot_executor/aot_executor.cc
+++ b/src/runtime/aot_executor/aot_executor.cc
@@ -49,8 +49,8 @@ AotExecutor::AotExecutor(tvm::runtime::Module module, const std::vector<Device>&
   ICHECK_EQ(devices_[0].device_id, expected_device.device_id)
       << "At this time, AOTExecutor supports only execution on kDLCPU 0";
   // TODO(tvm-team): Temporary hack since Hexagon is defined different than kDLCPU.
-  bool is_valid_device = (TVMDeviceExtType(devices_[0].device_type) == kDLHexagon) ||
-                         (DLDeviceType(devices_[0].device_type) == kDLCPU);
+  bool is_valid_device =
+      (devices_[0].device_type == kDLHexagon) || (devices_[0].device_type == kDLCPU);
   CHECK(is_valid_device)
       << "At this time, AOTExecutor supports only execution on kDLCPU 0 or kDLHexagon 0";
 
diff --git a/src/runtime/hexagon/hexagon_common.h b/src/runtime/hexagon/hexagon_common.h
index 025cc253eee9..5834093a9e43 100644
--- a/src/runtime/hexagon/hexagon_common.h
+++ b/src/runtime/hexagon/hexagon_common.h
@@ -45,9 +45,7 @@
     }                                                                             \
   } while (0)
 
-inline bool IsHexagonDevice(DLDevice dev) {
-  return TVMDeviceExtType(dev.device_type) == kDLHexagon;
-}
+inline bool IsHexagonDevice(DLDevice dev) { return dev.device_type == kDLHexagon; }
 
 constexpr int kHexagonAllocAlignment = 2048;
 
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 50275a7b6101..7221be03cc53 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -81,7 +81,7 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap
 
   // NOTE: This check should be superfluous, but it's probably a good idea to leave it in
   // until the AoT executor's multi-device dispatch code is mature. --cconvey 2022-08-26
-  CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon)
+  CHECK(dev.device_type == kDLHexagon)
       << "dev.device_type: " << dev.device_type << " DeviceName(" << dev.device_type
       << "): " << DeviceName(dev.device_type) << "";
 
@@ -162,14 +162,14 @@ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) {
 void* HexagonDeviceAPI::AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape,
                                            DLDataType dtype, Optional<String> mem_scope) {
   // must be Hexagon device (not CPU)
-  CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type;
+  CHECK(dev.device_type == kDLHexagon) << "dev.device_type: " << dev.device_type;
   CHECK((ndim == 1 || ndim == 2) && "Hexagon Device API supports only 1d and 2d allocations");
   return AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
 }
 
 void HexagonDeviceAPI::FreeVtcmWorkspace(Device dev, void* ptr) {
   // must be Hexagon device (not CPU)
-  CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type;
+  CHECK(dev.device_type == kDLHexagon) << "dev.device_type: " << dev.device_type;
   FreeDataSpace(dev, ptr);
 }
 
diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index 30ac61a92b07..e3adaf65548d 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -189,8 +189,7 @@ class HexagonDeviceAPI final : public DeviceAPI {
    */
   bool IsValidDevice(DLDevice dev) {
     // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU
-    return (TVMDeviceExtType(dev.device_type) == kDLHexagon) ||
-           (DLDeviceType(dev.device_type) == kDLCPU);
+    return (dev.device_type == kDLHexagon) || (dev.device_type == kDLCPU);
   }
 
   //! \brief Manages runtime HexagonBuffer allocations

From 2c1fecd0978d013a552ddb8d5992942acb40e5bc Mon Sep 17 00:00:00 2001
From: LiangW <114222082+liangW-intellif@users.noreply.github.com>
Date: Tue, 1 Nov 2022 16:04:31 +0800
Subject: [PATCH 480/704] [TIR][Primitive] Support rolling_buffer schedule
 primitive in TensorIR (#13033)

* [TIR][Primitive] Support rolling_buffer schedule primitive in TensorIR

* Address review comments

* Add dependency checks
---
 include/tvm/tir/schedule/schedule.h           |  17 +
 python/tvm/tir/schedule/schedule.py           | 108 ++++
 src/tir/schedule/concrete_schedule.cc         |  12 +
 src/tir/schedule/concrete_schedule.h          |   2 +
 src/tir/schedule/primitive.h                  |  16 +
 src/tir/schedule/primitive/rolling_buffer.cc  | 474 +++++++++++++++
 src/tir/schedule/schedule.cc                  |   3 +
 src/tir/schedule/traced_schedule.cc           |  12 +
 src/tir/schedule/traced_schedule.h            |   2 +
 .../test_tir_schedule_rolling_buffer.py       | 573 ++++++++++++++++++
 10 files changed, 1219 insertions(+)
 create mode 100644 src/tir/schedule/primitive/rolling_buffer.cc
 create mode 100644 tests/python/unittest/test_tir_schedule_rolling_buffer.py

diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h
index 3394e37070ff..5dbc1b5af395 100644
--- a/include/tvm/tir/schedule/schedule.h
+++ b/include/tvm/tir/schedule/schedule.h
@@ -689,6 +689,23 @@ class ScheduleNode : public runtime::Object {
    */
   virtual void PadEinsum(const BlockRV& block_rv, const Array<Integer>& padding) = 0;
 
+  /******** Schedule: Buffer transformation ********/
+  /*!
+   * \brief Compute the target buffer via rolling buffering.
+   * \details This primitive selects the outermost rollable axis with a positive bound overlap that
+   * appears in the block's ancestor loops as `rolling axis`, fold and circularize the buffer along
+   * the rolling dimension, append block predicate to avoid recomputing overlapping elements.
+   * It requires:
+   * 1) The buffer to be an intermediate buffer defined via `alloc_buffer`.
+   * 2) The LCA of the producer and consumer of the buffer is a for loop, typically,
+   *    the producer and consumer of the buffer are cascaded through compute_at.
+   * 3) The access region of the buffer has at least one dimension that contains
+   *    a positive bound overlap.
+   * \param block_rv The producer block of the buffer.
+   * \param write_buffer_index The index of the buffer in block's write region.
+   */
+  virtual void RollingBuffer(const BlockRV& block_rv, int write_buffer_index) = 0;
+
   /******** Schedule: Misc ********/
   /*! \brief A no-op that marks the start of postprocessing phase of scheduling */
   virtual void EnterPostproc() = 0;
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index 6c620045e90d..c5b7937c6066 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -3128,6 +3128,114 @@ def after_pad_einsum(
             self, block, padding
         )
 
+    ######## Schedule: Buffer transformation ########
+
+    @type_checked
+    def rolling_buffer(
+        self,
+        block: Union[BlockRV, str],
+        write_buffer_index: int,
+    ) -> None:
+        """Compute the target buffer via rolling buffering, select the outermost rollable
+        axis with a positive bound overlap that appears in the block's ancestor loops
+        as `rolling axis`, fold and circularize the buffer along the rolling dimension,
+        append block predicate to avoid recomputing overlapping elements. It requires:
+
+        1) The block is not an output block and has only RAW dependencies.
+
+        2) The buffer to be an intermediate buffer defined via `alloc_buffer`.
+
+        3) The LCA of the producer and consumer of the buffer is a for loop, typically,
+        the producer and consumer of the buffer are cascaded through compute_at.
+
+        4) The access region of the buffer has at least one dimension that contains
+        a positive bound overlap.
+
+        Parameters
+        ----------
+        block : Union[BlockRV, str]
+            The producer block of the buffer.
+        write_buffer_index : int
+            The index of the buffer in block's write region.
+
+        Examples
+        --------
+
+        Before rolling_buffer, in TensorIR, the IR is:
+
+        .. code-block:: python
+
+            @T.prim_func
+            def before_rolling_buffer(
+                A: T.Buffer[(12, 12), "int8"], C: T.Buffer[(8, 8), "int8"]
+            ) -> None:
+                # body
+                # with T.block("root")
+                B = T.alloc_buffer([10, 10], dtype="int8")
+                for i0, i1 in T.grid(2, 2):
+                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 3, 3):
+                        with T.block("B"):
+                            ax0_1 = T.axis.spatial(10, i0 * 4 + ax0)
+                            ax1_1 = T.axis.spatial(10, i1 * 4 + ax1)
+                            rv0, rv1 = T.axis.remap("RR", [ax2, ax3])
+                            B[ax0_1, ax1_1] = T.max(
+                                B[ax0_1, ax1_1], A[ax0_1 + rv0, ax1_1 + rv1]
+                            )
+                    for ax0, ax1, ax2, ax3 in T.grid(4, 4, 3, 3):
+                        with T.block("C"):
+                            ax0_1 = T.axis.spatial(8, i0 * 4 + ax0)
+                            ax1_1 = T.axis.spatial(8, i1 * 4 + ax1)
+                            rv0, rv1 = T.axis.remap("RR", [ax2, ax3])
+                            C[ax0_1, ax1_1] = T.max(
+                                C[ax0_1, ax1_1], B[ax0_1 + rv0, ax1_1 + rv1]
+                            )
+
+        Create the schedule and do rolling_buffer:
+
+        .. code-block:: python
+
+            sch = tir.Schedule(before_rolling_buffer)
+            sch.rolling_buffer(sch.get_block("B"), write_buffer_index=0)
+            print(sch.mod["main"].script())
+
+        After applying rolling_buffer, the IR becomes:
+
+        .. code-block:: python
+
+            @T.prim_func
+            def after_rolling_buffer(
+                A: T.Buffer[(12, 12), "int8"],
+                C: T.Buffer[(8, 8), "int8"]
+            ) -> None:
+                # body
+                # with T.block("root")
+                B = T.alloc_buffer([6, 10], dtype="int8")
+                for i0, i1 in T.grid(2, 2):
+                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 3, 3):
+                        with T.block("B"):
+                            T.where((i0 < 1 or 2 <= ax0) and (i1 < 1 or 2 <= ax1))
+                            ax0_1 = T.axis.spatial(10, i0 * 4 + ax0)
+                            ax1_1 = T.axis.spatial(10, i1 * 4 + ax1)
+                            rv0, rv1 = T.axis.remap("RR", [ax2, ax3])
+                            B[ax0_1 % 6, ax1_1] = T.max(
+                                B[ax0_1 % 6, ax1_1], A[ax0_1 + rv0, ax1_1 + rv1]
+                            )
+                    for ax0, ax1, ax2, ax3 in T.grid(4, 4, 3, 3):
+                        with T.block("C"):
+                            ax0_1 = T.axis.spatial(8, i0 * 4 + ax0)
+                            ax1_1 = T.axis.spatial(8, i1 * 4 + ax1)
+                            rv0, rv1 = T.axis.remap("RR", [ax2, ax3])
+                            C[ax0_1, ax1_1] = T.max(
+                                C[ax0_1, ax1_1], B[ax0_1 % 6 + rv0, ax1_1 + rv1]
+                            )
+
+        Note
+        ----
+        The region_cover property of the consumer block of the target buffer will become false.
+        """
+        block = self._normalize_block_arg(block)
+        return _ffi_api.ScheduleRollingBuffer(self, block, write_buffer_index)  # type: ignore # pylint: disable=no-member
+
     ########## Schedule: Misc ##########
 
     @type_checked
diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc
index 7144ba8ae1f5..a0d29a00f886 100644
--- a/src/tir/schedule/concrete_schedule.cc
+++ b/src/tir/schedule/concrete_schedule.cc
@@ -814,6 +814,8 @@ void ConcreteScheduleNode::SetAxisSeparator(const BlockRV& block_rv, int buffer_
   this->state_->DebugVerify();
 }
 
+/******** Schedule: Padding ********/
+
 BlockRV ConcreteScheduleNode::DecomposePadding(const BlockRV& block_rv, const LoopRV& loop_rv) {
   StmtSRef result{nullptr};
   TVM_TIR_SCHEDULE_BEGIN();
@@ -829,6 +831,16 @@ void ConcreteScheduleNode::PadEinsum(const BlockRV& block_rv, const Array<Intege
   TVM_TIR_SCHEDULE_END("pad-einsum", this->error_render_level_);
   this->state_->DebugVerify();
 }
+
+/******** Schedule: Buffer Transformation ********/
+
+void ConcreteScheduleNode::RollingBuffer(const BlockRV& block_rv, int write_buffer_index) {
+  TVM_TIR_SCHEDULE_BEGIN();
+  tir::RollingBuffer(state_, this->GetSRef(block_rv), write_buffer_index);
+  TVM_TIR_SCHEDULE_END("rolling-buffer", this->error_render_level_);
+  this->state_->DebugVerify();
+}
+
 /******** Schedule: Misc ********/
 
 }  // namespace tir
diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h
index 384b1ce2425f..66fca107715b 100644
--- a/src/tir/schedule/concrete_schedule.h
+++ b/src/tir/schedule/concrete_schedule.h
@@ -154,6 +154,8 @@ class ConcreteScheduleNode : public ScheduleNode {
                         const Array<IntImm>& axis_separators) override;
   /******** Schedule: Padding decomposition ********/
   BlockRV DecomposePadding(const BlockRV& block_rv, const LoopRV& loop_rv) override;
+  /******** Schedule: Buffer transformation ********/
+  void RollingBuffer(const BlockRV& block_rv, int write_buffer_index) override;
   /******** Schedule: Misc ********/
   void EnterPostproc() override {}
 
diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h
index 8e5ab91b8e7c..af1988eaaf36 100644
--- a/src/tir/schedule/primitive.h
+++ b/src/tir/schedule/primitive.h
@@ -533,6 +533,22 @@ TVM_DLL StmtSRef DecomposePadding(ScheduleState self, const StmtSRef& block_sref
 TVM_DLL void PadEinsum(ScheduleState self, const StmtSRef& block_sref,
                        const Array<Integer>& padding);
 
+/******** Schedule: Buffer transformation ********/
+/*!
+ * \brief Compute the target buffer via rolling buffering.
+ * \details This primitive selects the outermost rollable axis with a positive bound overlap that
+ * appears in the block's ancestor loops as `rolling axis`, fold and circularize the buffer along
+ * the rolling dimension, append block predicate to avoid recomputing overlapping elements.
+ * It requires:
+ * 1) The buffer to be an intermediate buffer defined via `alloc_buffer`.
+ * 2) The LCA of the producer and consumer of the buffer is a for loop, typically,
+ *    the producer and consumer of the buffer are cascaded through compute_at.
+ * 3) The access region of the buffer has at least one dimension that contains
+ *    a positive bound overlap.
+ * \param block_rv The producer block of the buffer.
+ * \param write_buffer_index The index of the buffer in block's write region.
+ */
+TVM_DLL void RollingBuffer(ScheduleState self, const StmtSRef& block_sref, int write_buffer_index);
 /******** Schedule: Misc ********/
 
 }  // namespace tir
diff --git a/src/tir/schedule/primitive/rolling_buffer.cc b/src/tir/schedule/primitive/rolling_buffer.cc
new file mode 100644
index 000000000000..c01d6c568fcd
--- /dev/null
+++ b/src/tir/schedule/primitive/rolling_buffer.cc
@@ -0,0 +1,474 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <functional>
+
+#include "../ir_comparator.h"
+#include "../utils.h"
+
+namespace tvm {
+namespace tir {
+
+namespace {
+
+struct RollingBufferInfo {
+  Buffer old_buffer;
+  Buffer new_buffer;
+  int rolling_axis;
+  PrimExpr rolling_extent;
+  std::vector<int> axis_overlaps;
+  std::vector<Optional<Var>> axis_iter_vars;
+  /*! \brief The map used for ScheduleStateNode::Replace. */
+  Map<Block, Block> block_reuse;
+};
+
+BufferRegion GetRelaxedBufferRegion(const BlockRealize& realize, const BufferRegion& buffer_region,
+                                    const Map<Var, arith::IntSet>& dom_map) {
+  Array<arith::IntSet> relaxed_intsets =
+      arith::EvalSet(Substitute(buffer_region->region, GetBindings(realize)), dom_map);
+  Region relaxed_region;
+  relaxed_region.reserve(relaxed_intsets.size());
+  for (size_t i = 0; i < relaxed_intsets.size(); ++i) {
+    relaxed_region.push_back(
+        relaxed_intsets[i].CoverRange(Range::FromMinExtent(0, buffer_region->buffer->shape[i])));
+  }
+  return BufferRegion(buffer_region->buffer, relaxed_region);
+}
+
+class RollingBufferDependencyError : public ScheduleError {
+ public:
+  explicit RollingBufferDependencyError(IRModule mod, Block block)
+      : mod_(mod), block_(std::move(block)) {}
+
+  String FastErrorString() const final {
+    return "ScheduleError: The target block is required to have only RAW dependencies";
+  }
+
+  String DetailRenderTemplate() const final {
+    return "The target block {0} is required to have only RAW dependencies";
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
+
+  /*!
+   * \brief Check if the block has only RAW dependencies.
+   * \param self The schedule state
+   * \param block_sref The sref of the block to be checked
+   * \param scope_root_sref The sref of the scope root
+   * \throw ScheduleError if the block has WAW or WAR dependency.
+   */
+  static void Check(const ScheduleState& self, const StmtSRef& block_sref,
+                    const StmtSRef& scope_root_sref) {
+    BlockScope scope = self->GetBlockScope(scope_root_sref);
+    for (const Dependency& producers : scope->GetDepsByDst(block_sref)) {
+      if (!(producers->kind == DepKind::kRAW)) {
+        const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
+        throw RollingBufferDependencyError(self->mod, GetRef<Block>(block));
+      }
+    }
+    for (const Dependency& consumers : scope->GetDepsBySrc(block_sref)) {
+      if (!(consumers->kind == DepKind::kRAW)) {
+        const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
+        throw RollingBufferDependencyError(self->mod, GetRef<Block>(block));
+      }
+    }
+  }
+
+ private:
+  IRModule mod_;
+  Block block_;
+};
+
+class RollingBufferMatchError : public ScheduleError {
+ public:
+  RollingBufferMatchError(IRModule mod, Block block, BufferRegion buffer_region)
+      : mod_(mod), block_(block), buffer_region_(buffer_region) {}
+  String FastErrorString() const final {
+    return "ScheduleError: rolling_buffer expect the buffer region to have at least one dimention"
+           "matching the rolling pattern such as: hh.outer * stride + hh.inner";
+  }
+  String DetailRenderTemplate() const final {
+    std::ostringstream os;
+    os << "The target buffer " << buffer_region_->buffer->name << " with region "
+       << buffer_region_->region
+       << " should have at least one dimension range that matches a rolling pattern "
+          "such as hh.outer * stride + hh.inner. ";
+    return os.str();
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
+
+ private:
+  IRModule mod_;
+  Block block_;
+  BufferRegion buffer_region_;
+};
+
+class RollingBufferInsertionError : public ScheduleError {
+ public:
+  RollingBufferInsertionError(IRModule mod, Buffer buffer, Block block)
+      : mod_(mod), buffer_(std::move(buffer)), block_(block) {}
+  String FastErrorString() const final {
+    return "ScheduleError: rolling_buffer injection is invalid, the lca of the access "
+           "location of the target buffer is not a for loop. ";
+  }
+
+  String DetailRenderTemplate() const final {
+    std::ostringstream os;
+    os << "rolling_buffer injection is invalid. The block {0} should be tiled so that "
+       << "the lca of the access location of the target buffer " << buffer_->name
+       << " is a for loop. ";
+    return os.str();
+  }
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {block_}; }
+
+ private:
+  IRModule mod_;
+  Buffer buffer_;
+  Block block_;
+};
+
+class RollingBufferInfoCollector {
+ public:
+  static RollingBufferInfo CheckAndGetRollingBufferInfo(const IRModule& mod,
+                                                        const StmtSRef& block_sref,
+                                                        const BufferRegion& buffer_region) {
+    RollingBufferInfoCollector collector;
+    if (!collector.MatchRollingBuffer(block_sref, buffer_region)) {
+      const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref);
+      throw RollingBufferMatchError(mod, GetRef<Block>(block), buffer_region);
+    }
+    return collector.info_;
+  }
+
+ private:
+  bool MatchRollingBuffer(const StmtSRef& block_sref, const BufferRegion& buffer_region) {
+    const Buffer& buffer = buffer_region->buffer;
+    const Region& region = buffer_region->region;
+
+    std::vector<Optional<Var>> bound_iter_vars;
+    std::vector<int> bound_overlaps;
+
+    arith::PVar<Var> p_var;
+    arith::PVar<IntImm> p_stride, p_divisor;
+    for (auto bound : region) {
+      auto stride = 0;
+      auto divisor = 1;
+
+      Optional<Var> iter_var;
+      if (floordiv((p_var * p_stride), p_divisor).Match(bound->min)) {
+        // Handle the case of fractional strides
+        // They take this form: floordiv(hh.outer, 2)
+        // Strip the floordiv and keep track of the divisor
+        iter_var = p_var.Eval();
+        divisor = p_divisor.Eval()->value;
+        stride = std::ceil(static_cast<float>(p_stride.Eval()->value) / divisor);
+      } else if ((p_var * p_stride).Match(bound->min)) {
+        // The bound is the iter var multiplied by the stride
+        iter_var = p_var.Eval();
+        stride = p_stride.Eval()->value;
+      } else if (p_var.Match(bound->min)) {
+        // If the bound is just a Var, that implies the stride is 1
+        iter_var = p_var.Eval();
+        stride = 1;
+      } else if (is_const_int(bound->min)) {
+        // If the bound is an int, we can't roll over it
+        iter_var = NullOpt;
+      } else {
+        // If all of the above matches fail, we're in unknown behaviour
+        return false;
+      }
+      auto bound_overlap = 0;
+      if (iter_var.defined()) {
+        auto extent = Downcast<IntImm>(bound->extent)->value;
+        bound_overlap = extent - stride;
+        // Since Pass CompactBufferAllocation will be responsible for compacting the buffer
+        // allocation region, there is no need to roll over the axis where the overlap is not
+        // positive, so reset iter_var to NullOpt.
+        if (bound_overlap <= 0) {
+          iter_var = NullOpt;
+        }
+      }
+      bound_iter_vars.push_back(iter_var);
+      bound_overlaps.push_back(bound_overlap);
+    }
+
+    Array<StmtSRef> loop_srefs = GetLoops(block_sref);
+    // Pick the outermost iter_var that's mentioned in the bounds
+    // to be the rolling axis
+    Optional<Var> roll_iter_var;
+    int roll_axis;
+    for (const tir::StmtSRef& loop_sref : loop_srefs) {
+      auto loop_var = loop_sref->StmtAs<ForNode>()->loop_var;
+
+      auto it{std::find_if(bound_iter_vars.begin(), bound_iter_vars.end(), [&](Optional<Var> var) {
+        return var && (var.get() == loop_var.get());
+      })};
+      if (it != bound_iter_vars.end()) {
+        auto i = std::distance(bound_iter_vars.begin(), it);
+        roll_iter_var = loop_var;
+        roll_axis = i;
+        break;
+      }
+    }
+
+    if (!roll_iter_var.defined()) {
+      return false;
+    }
+    Array<PrimExpr> new_shape = buffer->shape;
+    new_shape.Set(roll_axis, region[roll_axis]->extent);
+    Buffer new_buffer = buffer;
+    new_buffer.CopyOnWrite()->shape = new_shape;
+
+    info_.old_buffer = buffer;
+    info_.new_buffer = new_buffer;
+    info_.rolling_axis = roll_axis;
+    info_.rolling_extent = region[roll_axis]->extent;
+    info_.axis_overlaps = bound_overlaps;
+    info_.axis_iter_vars = bound_iter_vars;
+
+    return true;
+  }
+
+  RollingBufferInfo info_;
+};
+
+class RollingBufferRewriter : public StmtExprMutator {
+ public:
+  static Stmt Rewrite(const StmtSRef& scope_sref, RollingBufferInfo* info) {
+    RollingBufferRewriter rewriter(scope_sref, info);
+    return rewriter(GetRef<Stmt>(scope_sref->stmt));
+  }
+
+ private:
+  explicit RollingBufferRewriter(const StmtSRef& scope_sref, RollingBufferInfo* info)
+      : scope_sref_(scope_sref), info_(info) {}
+
+  void RewriteAccessRegion(Array<BufferRegion>* old_access_regions,
+                           const Array<BufferRegion>& infered_access_regions) {
+    auto fmutate = [this, &infered_access_regions](const BufferRegion& buffer_region) {
+      if (buffer_region->buffer.same_as(info_->old_buffer)) {
+        ICHECK(infered_access_regions.size() == 1);
+        return infered_access_regions[0];
+      }
+      return buffer_region;
+    };
+    (*old_access_regions).MutateByApply(fmutate);
+  }
+
+  void RewriteBufferAccess(Buffer* buffer, Array<PrimExpr>* indices) const {
+    Array<PrimExpr> new_indices;
+    new_indices.reserve(indices->size());
+    // First modify the access indices to use modulo arithmetic
+    // for the rolling axis
+    for (size_t i = 0; i < indices->size(); ++i) {
+      if (static_cast<int>(i) == info_->rolling_axis) {
+        new_indices.push_back(FloorMod((*indices)[i], info_->rolling_extent));
+      } else {
+        new_indices.push_back((*indices)[i]);
+      }
+    }
+    // Replace the accessed buffer with the new buffer.
+    *buffer = info_->new_buffer;
+    *indices = std::move(new_indices);
+  }
+
+  Stmt VisitStmt_(const BlockNode* block) final {
+    Block old_stmt = GetRef<Block>(block);
+    Block stmt = Downcast<Block>(StmtExprMutator::VisitStmt_(block));
+    BlockNode* n = stmt.CopyOnWrite();
+    if (block == scope_sref_->stmt) {
+      Array<Buffer> new_alloc_buffers;
+      for (const Buffer& buffer : stmt->alloc_buffers) {
+        if (buffer != info_->old_buffer) {
+          new_alloc_buffers.push_back(buffer);
+        } else {
+          new_alloc_buffers.push_back(info_->new_buffer);
+        }
+      }
+      n->alloc_buffers = std::move(new_alloc_buffers);
+    } else {
+      Array<IterVar> new_iter_vars;
+      for (size_t i = 0; i < stmt->iter_vars.size(); ++i) {
+        auto old_iter_var = stmt->iter_vars[i];
+        if (static_cast<int>(i) == info_->rolling_axis) {
+          // All inner loops of the rolling axis has a loop carried dependency
+          // (i.e. each iteration calculation of the rolling axis depends on
+          // the calculation results of all the historical iterations of inner loops),
+          // so annotate the iteration type of the rolling axis as 'opaque',
+          // avoid the iterative range of its inner loop from being compressed
+          // during lowering phase.
+          IterVar new_iter_var =
+              IterVar(old_iter_var->dom, old_iter_var->var, IterVarType::kOpaque);
+          new_iter_vars.push_back(new_iter_var);
+        } else {
+          new_iter_vars.push_back(old_iter_var);
+        }
+      }
+      Map<Var, Buffer> buffer_data_to_buffer = {{info_->new_buffer->data, info_->new_buffer}};
+      auto infered_access_regions = GetBlockReadWriteRegion(stmt, buffer_data_to_buffer);
+
+      n->iter_vars = std::move(new_iter_vars);
+      RewriteAccessRegion(&n->reads, infered_access_regions[0]);
+      RewriteAccessRegion(&n->writes, infered_access_regions[1]);
+    }
+    info_->block_reuse.Set(old_stmt, stmt);
+    return std::move(stmt);
+  }
+
+  Stmt VisitStmt_(const BlockRealizeNode* realize) final {
+    BlockRealize stmt = Downcast<BlockRealize>(StmtExprMutator::VisitStmt_(realize));
+    // Append block predicate to avoid recomputing elements.
+    if (rewrite_block_predicate_) {
+      rewrite_block_predicate_ = false;
+      PrimExpr condition = stmt->predicate;
+      for (size_t i = 0; i < info_->axis_iter_vars.size(); ++i) {
+        auto iter_var = info_->axis_iter_vars[i];
+        if (iter_var && info_->axis_overlaps[i] > 0) {
+          Var var = iter_var.value();
+          const Map<Var, arith::IntSet> dmap = {std::make_pair(var, arith::IntSet::Interval(0, 0))};
+          auto iter_value = realize->iter_values[i];
+          arith::Analyzer analyzer;
+          auto term_2 = analyzer.int_set(iter_value, dmap).min();
+          condition = analyzer.Simplify(
+              And(condition, Or(LT(var, 1), GE(term_2, info_->axis_overlaps[i]))));
+        }
+      }
+      BlockRealizeNode* n = stmt.CopyOnWrite();
+      n->predicate = condition;
+    }
+    return std::move(stmt);
+  }
+
+  Stmt VisitStmt_(const BufferStoreNode* op) final {
+    BufferStore stmt = Downcast<BufferStore>(StmtExprMutator::VisitStmt_(op));
+    if (stmt->buffer.same_as(info_->old_buffer)) {
+      BufferStoreNode* n = stmt.CopyOnWrite();
+      RewriteBufferAccess(&n->buffer, &n->indices);
+      // Need to add predicate to the current block to avoid recomputing elements.
+      rewrite_block_predicate_ = true;
+    }
+    return std::move(stmt);
+  }
+
+  PrimExpr VisitExpr_(const BufferLoadNode* op) final {
+    BufferLoad stmt = Downcast<BufferLoad>(StmtExprMutator::VisitExpr_(op));
+    if (stmt->buffer.same_as(info_->old_buffer)) {
+      BufferLoadNode* n = stmt.CopyOnWrite();
+      RewriteBufferAccess(&n->buffer, &n->indices);
+    }
+    return std::move(stmt);
+  }
+
+ private:
+  const StmtSRef& scope_sref_;
+  RollingBufferInfo* info_;
+  bool rewrite_block_predicate_ = false;
+};
+
+}  // namespace
+
+void RollingBuffer(ScheduleState self, const StmtSRef& block_sref, int write_buffer_index) {
+  /*!
+   *  Check
+   *    - The block is not an output block.
+   *    - The block has only RAW dependencies.
+   *    - The block is tiled and there is access overlap between adjacent tiles.
+   *  Mutate
+   *    - Select the outermost rollable axis appeared in the block's loop nest
+   *      as the 'rolling axis', trim the target buffer from the rolling axis.
+   *    - Use modulo arithmetic to modify the target buffer's read and load
+   *      indices to circularize the buffer along the rolling dimension.
+   *    - Append block predicate to avoid recomputing overlapping elements.
+   */
+  Map<Var, arith::IntSet> dom_map;
+  const BlockRealize& realize = GetBlockRealize(self, block_sref);
+  const Block& block = realize->block;
+
+  // Step 1. Checking index, getting the target buffer region and the parent scope.
+  const BufferRegion& buffer_region =
+      GetNthAccessBufferRegion(self, block, write_buffer_index, BufferIndexType::kWrite);
+  StmtSRef scope_root_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
+  // Step 2. Check if the target block is not an output block and has only RAW dependencies.
+  CheckNotOutputBlock(self, block_sref, scope_root_sref);
+  RollingBufferDependencyError::Check(self, block_sref, scope_root_sref);
+
+  // Step 3. Find the lca of the access location of the target buffer and relax the buffer
+  Array<StmtSRef> loop_srefs = GetLoops(block_sref);
+  Array<StmtSRef> consumers_sref = GetConsumers(self, block_sref);
+  consumers_sref.push_back(block_sref);
+  StmtSRef lca = GetSRefLowestCommonAncestor(consumers_sref);
+  if (!lca->StmtAs<ForNode>()) {
+    throw RollingBufferInsertionError(self->mod, buffer_region->buffer, block);
+  }
+
+  for (auto it = loop_srefs.rbegin(); it != loop_srefs.rend(); ++it) {
+    auto stmt = *it;
+    // Stop at the lca of all the rolling_buffer access points;
+    if (stmt == lca) {
+      break;
+    }
+    For cur_loop = GetRef<For>(stmt->StmtAs<ForNode>());
+    Range range = Range::FromMinExtent(cur_loop->min, cur_loop->extent);
+    dom_map.Set(cur_loop->loop_var, arith::IntSet::FromRange(range));
+  }
+  BufferRegion relaxed_region = GetRelaxedBufferRegion(realize, buffer_region, dom_map);
+
+  // Step 4. Find a valid rolling axis and collect bound overlaps on the target buffer.
+  RollingBufferInfo info = RollingBufferInfoCollector::CheckAndGetRollingBufferInfo(
+      self->mod, block_sref, relaxed_region);
+  // Step 5. Mutate IR to apply rolling access pattern.
+  Stmt new_scope_root = RollingBufferRewriter::Rewrite(scope_root_sref, &info);
+
+  // Step 6. Update schedule states
+  self->Replace(scope_root_sref, new_scope_root, info.block_reuse);
+  // Step 7. Regenerate block info from the root block, because `region_cover` for the target block
+  // and `stage_pipeline` for the root block are no longer satisfied after rolling buffer injection.
+  self->UpdateScopeBlockInfo(tir::GetBlockRealize(self, self->stmt2ref.at(new_scope_root.get())));
+}
+
+struct RollingBufferTraits : public UnpackedInstTraits<RollingBufferTraits> {
+  static constexpr const char* kName = "RollingBuffer";
+  static constexpr bool kIsPure = false;
+
+ private:
+  static constexpr size_t kNumInputs = 1;
+  static constexpr size_t kNumAttrs = 1;
+  static constexpr size_t kNumDecisions = 0;
+
+  static void UnpackedApplyToSchedule(Schedule sch, BlockRV block, Integer write_buffer_index) {
+    return sch->RollingBuffer(block, write_buffer_index.IntValue());
+  }
+
+  static String UnpackedAsPython(Array<String> outputs, String block, Integer write_buffer_index) {
+    PythonAPICall py("rolling_buffer");
+    py.Input("block", block);
+    py.Input("write_buffer_index", write_buffer_index);
+    return py.Str();
+  }
+
+  template <typename>
+  friend struct ::tvm::tir::UnpackedInstTraits;
+};
+
+TVM_REGISTER_INST_KIND_TRAITS(RollingBufferTraits);
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc
index 6425ae0766ae..3fe81c9f433b 100644
--- a/src/tir/schedule/schedule.cc
+++ b/src/tir/schedule/schedule.cc
@@ -272,6 +272,9 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleDecomposePadding")
     .set_body_method<Schedule>(&ScheduleNode::DecomposePadding);
 TVM_REGISTER_GLOBAL("tir.schedule.SchedulePadEinsum")
     .set_body_method<Schedule>(&ScheduleNode::PadEinsum);
+/******** (FFI) Buffer transformation ********/
+TVM_REGISTER_GLOBAL("tir.schedule.ScheduleRollingBuffer")
+    .set_body_method<Schedule>(&ScheduleNode::RollingBuffer);
 /******** (FFI) Misc ********/
 TVM_REGISTER_GLOBAL("tir.schedule.ScheduleEnterPostproc")
     .set_body_method<Schedule>(&ScheduleNode::EnterPostproc);
diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc
index f2ad27fb6962..010730f66c60 100644
--- a/src/tir/schedule/traced_schedule.cc
+++ b/src/tir/schedule/traced_schedule.cc
@@ -575,6 +575,18 @@ void TracedScheduleNode::PadEinsum(const BlockRV& block_rv, const Array<Integer>
       /*outputs=*/{}));
 }
 
+/******** Schedule: Buffer transformation ********/
+
+void TracedScheduleNode::RollingBuffer(const BlockRV& block_rv, int write_buffer_index) {
+  ConcreteScheduleNode::RollingBuffer(block_rv, write_buffer_index);
+  static const InstructionKind& kind = InstructionKind::Get("RollingBuffer");
+  trace_->Append(/*inst=*/Instruction(
+      /*kind=*/kind,
+      /*inputs=*/{block_rv},
+      /*attrs=*/{Integer(write_buffer_index)},
+      /*outputs=*/{}));
+}
+
 /******** Schedule: Misc ********/
 
 void TracedScheduleNode::EnterPostproc() {
diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h
index 06128c1a6ebc..cea2096d20a6 100644
--- a/src/tir/schedule/traced_schedule.h
+++ b/src/tir/schedule/traced_schedule.h
@@ -114,6 +114,8 @@ class TracedScheduleNode : public ConcreteScheduleNode {
   /******** Schedule: Padding ********/
   BlockRV DecomposePadding(const BlockRV& block_rv, const LoopRV& loop_rv) final;
   void PadEinsum(const BlockRV& block_rv, const Array<Integer>& padding) final;
+  /******** Schedule: Buffer transformation ********/
+  void RollingBuffer(const BlockRV& block_rv, int write_buffer_index) final;
   /******** Schedule: Misc ********/
   void EnterPostproc() final;
 };
diff --git a/tests/python/unittest/test_tir_schedule_rolling_buffer.py b/tests/python/unittest/test_tir_schedule_rolling_buffer.py
new file mode 100644
index 000000000000..c55c41e451cc
--- /dev/null
+++ b/tests/python/unittest/test_tir_schedule_rolling_buffer.py
@@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-function-docstring,missing-module-docstring
+import numpy as np
+import tvm
+import tvm.testing
+from tvm import tir
+from tvm.script import tir as T
+from tvm.tir.schedule.testing import verify_trace_roundtrip
+import pytest
+
+
+def check_rolling_buffer(
+    sch: tir.Schedule, origin: tir.PrimFunc, expected: tir.PrimFunc, check_run=False
+):
+    scheduled = sch.mod["main"]
+    tvm.ir.assert_structural_equal(scheduled, expected)
+    verify_trace_roundtrip(sch, origin)
+    if check_run:
+        in_buffer = origin.buffer_map[origin.params[0]]
+        out_buffer = origin.buffer_map[origin.params[1]]
+        in_shape = [int(_) for _ in in_buffer.shape]
+        out_shape = [int(_) for _ in out_buffer.shape]
+        x = tvm.nd.array(np.random.uniform(0, 64, in_shape).astype(in_buffer.dtype))
+        y0 = tvm.nd.array(np.zeros(out_shape).astype(out_buffer.dtype))
+        y1 = tvm.nd.array(np.zeros(out_shape).astype(out_buffer.dtype))
+        f_origin = tvm.build(origin)
+        f_scheduled = tvm.build(scheduled)
+        f_origin(x, y0)
+        f_scheduled(x, y1)
+        tvm.testing.assert_allclose(y0.numpy(), y1.numpy())
+
+
+def _tile_nd(s, tile, block_name):
+    outer_indices = []
+    inner_indices = []
+    block = s.get_block(block_name)
+    loops = s.get_loops(block)
+    for i, size in enumerate(tile):
+        outer, inner = s.split(loops[i], [None, size])
+        outer_indices.append(outer)
+        inner_indices.append(inner)
+
+    s.reorder(*outer_indices, *inner_indices)
+    return outer_indices, inner_indices
+
+
+def test_1d_rolling_buffer():
+    @T.prim_func
+    def before(A: T.Buffer[(4, 12), "int32"], C: T.Buffer[(4, 8), "int32"]):
+        B = T.alloc_buffer((4, 10), "int32")
+        for c in T.serial(4):
+            for i in T.serial(0, 10):
+                for k in T.serial(3):
+                    with T.block("B"):
+                        cc, vi, vk = T.axis.remap("SSR", [c, i, k])
+                        with T.init():
+                            B[cc, vi] = 0
+                        B[cc, vi] = B[cc, vi] + A[cc, vi + vk]
+            for i in T.serial(0, 8):
+                for k in T.serial(3):
+                    with T.block("C"):
+                        cc, vi, vk = T.axis.remap("SSR", [c, i, k])
+                        with T.init():
+                            C[cc, vi] = 0
+                        C[cc, vi] = C[cc, vi] + B[cc, vi + vk]
+
+    @T.prim_func
+    def expected(A: T.Buffer[(4, 12), "int32"], C: T.Buffer[(4, 8), "int32"]):
+        B = T.alloc_buffer([4, 6], dtype="int32")
+        for c, i_0 in T.grid(4, 2):
+            for ax0, ax1 in T.grid(6, 3):
+                with T.block("B"):
+                    T.where(i_0 < 1 or 2 <= ax0)
+                    cc = T.axis.spatial(4, c)
+                    vi = T.axis.opaque(10, i_0 * 4 + ax0)
+                    vk = T.axis.reduce(3, ax1)
+                    T.reads(A[cc, vi + vk])
+                    T.writes(B[cc, vi % 6])
+                    with T.init():
+                        B[cc, vi % 6] = 0
+                    B[cc, vi % 6] = B[cc, vi % 6] + A[cc, vi + vk]
+            for i_1, k in T.grid(4, 3):
+                with T.block("C"):
+                    cc = T.axis.spatial(4, c)
+                    vi = T.axis.opaque(8, i_0 * 4 + i_1)
+                    vk = T.axis.reduce(3, k)
+                    T.reads(B[cc, (vi + vk) % 6])
+                    T.writes(C[cc, vi])
+                    with T.init():
+                        C[cc, vi] = 0
+                    C[cc, vi] = C[cc, vi] + B[cc, (vi + vk) % 6]
+
+    sch = tir.Schedule(before, debug_mask="all")
+    _, i, _ = sch.get_loops(sch.get_block("C"))
+    io, _ = sch.split(i, [2, 4])
+    sch.compute_at(sch.get_block("B"), io)
+    sch.rolling_buffer(sch.get_block("B"), 0)
+    check_rolling_buffer(sch, before, expected, check_run=True)
+
+
+@T.prim_func
+def cascade_2_max_pool2d(A: T.Buffer[(1, 12, 12, 16), "int8"], C: T.Buffer[(1, 8, 8, 16), "int8"]):
+    B = T.alloc_buffer([1, 10, 10, 16], dtype="int8")
+    for i0, i1, i2, i3, i4, i5 in T.grid(1, 10, 10, 16, 3, 3):
+        with T.block("B"):
+            ax0, ax1, ax2, ax3, rv0, rv1 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+            with T.init():
+                B[ax0, ax1, ax2, ax3] = T.int8(-128)
+            B[ax0, ax1, ax2, ax3] = T.max(B[ax0, ax1, ax2, ax3], A[ax0, ax1 + rv0, ax2 + rv1, ax3])
+    for i0, i1, i2, i3, i4, i5 in T.grid(1, 8, 8, 16, 3, 3):
+        with T.block("C"):
+            ax0, ax1, ax2, ax3, rv0, rv1 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+            with T.init():
+                C[ax0, ax1, ax2, ax3] = T.int8(-128)
+            C[ax0, ax1, ax2, ax3] = T.max(C[ax0, ax1, ax2, ax3], B[ax0, ax1 + rv0, ax2 + rv1, ax3])
+
+
+@T.prim_func
+def cascade_3_max_pool2d_with_stride(
+    A: T.Buffer[(1, 24, 24, 16), "int8"], C: T.Buffer[(1, 8, 8, 16), "int8"]
+):
+    B_0 = T.alloc_buffer([1, 22, 22, 16], dtype="int8")
+    B_1 = T.alloc_buffer([1, 10, 10, 16], dtype="int8")
+    for i0, i1, i2, i3, i4, i5 in T.grid(1, 22, 22, 16, 3, 3):
+        with T.block("B_0"):
+            ax0, ax1, ax2, ax3, rv0, rv1 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+            with T.init():
+                B_0[ax0, ax1, ax2, ax3] = T.int8(-128)
+            B_0[ax0, ax1, ax2, ax3] = T.max(
+                B_0[ax0, ax1, ax2, ax3], A[ax0, ax1 + rv0, ax2 + rv1, ax3]
+            )
+    for i0, i1, i2, i3, i4, i5 in T.grid(1, 10, 10, 16, 3, 3):
+        with T.block("B_1"):
+            ax0, ax1, ax2, ax3, rv0, rv1 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+            with T.init():
+                B_1[ax0, ax1, ax2, ax3] = T.int8(-128)
+            B_1[ax0, ax1, ax2, ax3] = T.max(
+                B_1[ax0, ax1, ax2, ax3], B_0[ax0, ax1 * 2 + rv0, ax2 * 2 + rv1, ax3]
+            )
+    for i0, i1, i2, i3, i4, i5 in T.grid(1, 8, 8, 16, 3, 3):
+        with T.block("C"):
+            ax0, ax1, ax2, ax3, rv0, rv1 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+            with T.init():
+                C[ax0, ax1, ax2, ax3] = T.int8(-128)
+            C[ax0, ax1, ax2, ax3] = T.max(
+                C[ax0, ax1, ax2, ax3], B_1[ax0, ax1 + rv0, ax2 + rv1, ax3]
+            )
+
+
+def test_cascade_max_pool2d_w_tiled():
+    @T.prim_func
+    def expected(A: T.Buffer[(1, 12, 12, 16), "int8"], C: T.Buffer[(1, 8, 8, 16), "int8"]):
+        B = T.alloc_buffer([1, 10, 6, 16], dtype="int8")
+        for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 1, 2, 1):
+            for ax0, ax1, ax2, ax3, ax4 in T.grid(10, 6, 16, 3, 3):
+                with T.block("B"):
+                    T.where(i2_0 < 1 or 2 <= ax1)
+                    ax0_1 = T.axis.spatial(1, 0)
+                    ax1_1 = T.axis.spatial(10, ax0)
+                    ax2_1 = T.axis.opaque(10, i2_0 * 4 + ax1)
+                    ax3_1, rv0, rv1 = T.axis.remap("SRR", [ax2, ax3, ax4])
+                    T.reads(A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1])
+                    T.writes(B[ax0_1, ax1_1, ax2_1 % 6, ax3_1])
+                    with T.init():
+                        B[ax0_1, ax1_1, ax2_1 % 6, ax3_1] = T.int8(-128)
+                    B[ax0_1, ax1_1, ax2_1 % 6, ax3_1] = T.max(
+                        B[ax0_1, ax1_1, ax2_1 % 6, ax3_1], A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1]
+                    )
+            for i0_1, i1_1, i2_1, i3_1, i4, i5 in T.grid(1, 8, 4, 16, 3, 3):
+                with T.block("C"):
+                    ax0 = T.axis.spatial(1, i0_0 + i0_1)
+                    ax1 = T.axis.spatial(8, i1_0 * 8 + i1_1)
+                    ax2 = T.axis.opaque(8, i2_0 * 4 + i2_1)
+                    ax3 = T.axis.spatial(16, i3_0 * 16 + i3_1)
+                    rv0, rv1 = T.axis.remap("RR", [i4, i5])
+                    T.reads(B[ax0, ax1 + rv0, (ax2 + rv1) % 6, ax3])
+                    T.writes(C[ax0, ax1, ax2, ax3])
+                    with T.init():
+                        C[ax0, ax1, ax2, ax3] = T.int8(-128)
+                    C[ax0, ax1, ax2, ax3] = T.max(
+                        C[ax0, ax1, ax2, ax3], B[ax0, ax1 + rv0, (ax2 + rv1) % 6, ax3]
+                    )
+
+    sch = tir.Schedule(cascade_2_max_pool2d, debug_mask="all")
+    oi, _ = _tile_nd(sch, [1, 8, 4, 16], "C")
+    sch.compute_at(sch.get_block("B"), oi[-1])
+    sch.rolling_buffer(sch.get_block("B"), 0)
+    check_rolling_buffer(sch, cascade_2_max_pool2d, expected, check_run=True)
+
+
+def test_cascade_max_pool2d_h_tiled():
+    @T.prim_func
+    def expected(A: T.Buffer[(1, 12, 12, 16), "int8"], C: T.Buffer[(1, 8, 8, 16), "int8"]):
+        B = T.alloc_buffer([1, 6, 10, 16], dtype="int8")
+        for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 2, 1, 1):
+            for ax0, ax1, ax2, ax3, ax4 in T.grid(6, 10, 16, 3, 3):
+                with T.block("B"):
+                    T.where(i1_0 < 1 or 2 <= ax0)
+                    ax0_1 = T.axis.spatial(1, 0)
+                    ax1_1 = T.axis.opaque(10, i1_0 * 4 + ax0)
+                    ax2_1 = T.axis.spatial(10, ax1)
+                    ax3_1, rv0, rv1 = T.axis.remap("SRR", [ax2, ax3, ax4])
+                    T.reads(A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1])
+                    T.writes(B[ax0_1, ax1_1 % 6, ax2_1, ax3_1])
+                    with T.init():
+                        B[ax0_1, ax1_1 % 6, ax2_1, ax3_1] = T.int8(-128)
+                    B[ax0_1, ax1_1 % 6, ax2_1, ax3_1] = T.max(
+                        B[ax0_1, ax1_1 % 6, ax2_1, ax3_1], A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1]
+                    )
+            for i0_1, i1_1, i2_1, i3_1, i4, i5 in T.grid(1, 4, 8, 16, 3, 3):
+                with T.block("C"):
+                    ax0 = T.axis.spatial(1, i0_0 + i0_1)
+                    ax1 = T.axis.opaque(8, i1_0 * 4 + i1_1)
+                    ax2 = T.axis.spatial(8, i2_0 * 8 + i2_1)
+                    ax3 = T.axis.spatial(16, i3_0 * 16 + i3_1)
+                    rv0, rv1 = T.axis.remap("RR", [i4, i5])
+                    T.reads(B[ax0, (ax1 + rv0) % 6, ax2 + rv1, ax3])
+                    T.writes(C[ax0, ax1, ax2, ax3])
+                    with T.init():
+                        C[ax0, ax1, ax2, ax3] = T.int8(-128)
+                    C[ax0, ax1, ax2, ax3] = T.max(
+                        C[ax0, ax1, ax2, ax3], B[ax0, (ax1 + rv0) % 6, ax2 + rv1, ax3]
+                    )
+
+    sch = tir.Schedule(cascade_2_max_pool2d, debug_mask="all")
+    io, _ = _tile_nd(sch, [1, 4, 8, 16], "C")
+    sch.compute_at(sch.get_block("B"), io[-1])
+    sch.rolling_buffer(sch.get_block("B"), 0)
+    check_rolling_buffer(sch, cascade_2_max_pool2d, expected, check_run=True)
+
+
+def test_cascade_max_pool2d_h_w_c_tiled():
+    @T.prim_func
+    def expected(A: T.Buffer[(1, 12, 12, 16), "int8"], C: T.Buffer[(1, 8, 8, 16), "int8"]):
+        B = T.alloc_buffer([1, 6, 10, 16], dtype="int8")
+        for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 2, 2, 2):
+            for ax0, ax1, ax2, ax3, ax4 in T.grid(6, 6, 8, 3, 3):
+                with T.block("B"):
+                    T.where((i1_0 < 1 or 2 <= ax0) and (i2_0 < 1 or 2 <= ax1))
+                    ax0_1 = T.axis.spatial(1, 0)
+                    ax1_1 = T.axis.opaque(10, i1_0 * 4 + ax0)
+                    ax2_1 = T.axis.spatial(10, i2_0 * 4 + ax1)
+                    ax3_1 = T.axis.spatial(16, i3_0 * 8 + ax2)
+                    rv0, rv1 = T.axis.remap("RR", [ax3, ax4])
+                    T.reads(A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1])
+                    T.writes(B[ax0_1, ax1_1 % 6, ax2_1, ax3_1])
+                    with T.init():
+                        B[ax0_1, ax1_1 % 6, ax2_1, ax3_1] = T.int8(-128)
+                    B[ax0_1, ax1_1 % 6, ax2_1, ax3_1] = T.max(
+                        B[ax0_1, ax1_1 % 6, ax2_1, ax3_1], A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1]
+                    )
+            for i0_1, i1_1, i2_1, i3_1, i4, i5 in T.grid(1, 4, 4, 8, 3, 3):
+                with T.block("C"):
+                    ax0 = T.axis.spatial(1, i0_0 + i0_1)
+                    ax1 = T.axis.opaque(8, i1_0 * 4 + i1_1)
+                    ax2 = T.axis.spatial(8, i2_0 * 4 + i2_1)
+                    ax3 = T.axis.spatial(16, i3_0 * 8 + i3_1)
+                    rv0, rv1 = T.axis.remap("RR", [i4, i5])
+                    T.reads(B[ax0, (ax1 + rv0) % 6, ax2 + rv1, ax3])
+                    T.writes(C[ax0, ax1, ax2, ax3])
+                    with T.init():
+                        C[ax0, ax1, ax2, ax3] = T.int8(-128)
+                    C[ax0, ax1, ax2, ax3] = T.max(
+                        C[ax0, ax1, ax2, ax3], B[ax0, (ax1 + rv0) % 6, ax2 + rv1, ax3]
+                    )
+
+    sch = tir.Schedule(cascade_2_max_pool2d, debug_mask="all")
+    io, _ = _tile_nd(sch, [1, 4, 4, 8], "C")
+    sch.compute_at(sch.get_block("B"), io[-1])
+    sch.rolling_buffer(sch.get_block("B"), 0)
+    check_rolling_buffer(sch, cascade_2_max_pool2d, expected, check_run=True)
+
+
+def test_cascade_max_pool2d_non_perfect_tiled():
+    @T.prim_func
+    def expected(A: T.Buffer[(1, 12, 12, 16), "int8"], C: T.Buffer[(1, 8, 8, 16), "int8"]) -> None:
+        B = T.alloc_buffer([1, 8, 10, 16], dtype="int8")
+        for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 2, 2, 1):
+            for ax0, ax1, ax2, ax3, ax4 in T.grid(8, 8, 16, 3, 3):
+                with T.block("B"):
+                    T.where(
+                        i1_0 * 6 + ax0 < 10
+                        and i2_0 * 6 + ax1 < 10
+                        and (i1_0 < 1 or 2 <= ax0)
+                        and (i2_0 < 1 or 2 <= ax1)
+                    )
+                    ax0_1 = T.axis.spatial(1, 0)
+                    ax1_1 = T.axis.opaque(10, i1_0 * 6 + ax0)
+                    ax2_1 = T.axis.spatial(10, i2_0 * 6 + ax1)
+                    ax3_1, rv0, rv1 = T.axis.remap("SRR", [ax2, ax3, ax4])
+                    T.reads(A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1])
+                    T.writes(B[ax0_1, ax1_1 % 8, ax2_1, ax3_1])
+                    with T.init():
+                        B[ax0_1, ax1_1 % 8, ax2_1, ax3_1] = T.int8(-128)
+                    B[ax0_1, ax1_1 % 8, ax2_1, ax3_1] = T.max(
+                        B[ax0_1, ax1_1 % 8, ax2_1, ax3_1], A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1]
+                    )
+            for i0_1, i1_1, i2_1, i3_1, i4, i5 in T.grid(1, 6, 6, 16, 3, 3):
+                with T.block("C"):
+                    T.where(i1_0 * 6 + i1_1 < 8 and i2_0 * 6 + i2_1 < 8)
+                    ax0 = T.axis.spatial(1, i0_0 + i0_1)
+                    ax1 = T.axis.opaque(8, i1_0 * 6 + i1_1)
+                    ax2 = T.axis.spatial(8, i2_0 * 6 + i2_1)
+                    ax3 = T.axis.spatial(16, i3_0 * 16 + i3_1)
+                    rv0, rv1 = T.axis.remap("RR", [i4, i5])
+                    T.reads(B[ax0, (ax1 + rv0) % 8, ax2 + rv1, ax3])
+                    T.writes(C[ax0, ax1, ax2, ax3])
+                    with T.init():
+                        C[ax0, ax1, ax2, ax3] = T.int8(-128)
+                    C[ax0, ax1, ax2, ax3] = T.max(
+                        C[ax0, ax1, ax2, ax3], B[ax0, (ax1 + rv0) % 8, ax2 + rv1, ax3]
+                    )
+
+    sch = tir.Schedule(cascade_2_max_pool2d, debug_mask="all")
+    io, _ = _tile_nd(sch, [1, 6, 6, 16], "C")
+    sch.compute_at(sch.get_block("B"), io[-1])
+    sch.rolling_buffer(sch.get_block("B"), 0)
+    check_rolling_buffer(sch, cascade_2_max_pool2d, expected, check_run=True)
+
+
+def test_cascade_3_max_pool2d_with_stride():
+    @T.prim_func
+    def expected(A: T.Buffer[(1, 24, 24, 16), "int8"], C: T.Buffer[(1, 8, 8, 16), "int8"]) -> None:
+        B_0 = T.alloc_buffer([1, 13, 22, 16], dtype="int8")
+        B_1 = T.alloc_buffer([1, 6, 10, 16], dtype="int8")
+        for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 2, 2, 1):
+            for ax0, ax1, ax2, ax3, ax4 in T.grid(13, 13, 16, 3, 3):
+                with T.block("B_0"):
+                    T.where((i1_0 < 1 or 5 <= ax0) and (i2_0 < 1 or 5 <= ax1))
+                    ax0_1 = T.axis.spatial(1, 0)
+                    ax1_1 = T.axis.opaque(22, i1_0 * 8 + ax0)
+                    ax2_1 = T.axis.spatial(22, i2_0 * 8 + ax1)
+                    ax3_1, rv0, rv1 = T.axis.remap("SRR", [ax2, ax3, ax4])
+                    T.reads(A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1])
+                    T.writes(B_0[ax0_1, ax1_1 % 13, ax2_1, ax3_1])
+                    with T.init():
+                        B_0[ax0_1, ax1_1 % 13, ax2_1, ax3_1] = T.int8(-128)
+                    B_0[ax0_1, ax1_1 % 13, ax2_1, ax3_1] = T.max(
+                        B_0[ax0_1, ax1_1 % 13, ax2_1, ax3_1],
+                        A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1],
+                    )
+            for ax0, ax1, ax2, ax3, ax4 in T.grid(6, 6, 16, 3, 3):
+                with T.block("B_1"):
+                    T.where((i1_0 < 1 or 2 <= ax0) and (i2_0 < 1 or 2 <= ax1))
+                    ax0_2 = T.axis.spatial(1, 0)
+                    ax1_2 = T.axis.opaque(10, i1_0 * 4 + ax0)
+                    ax2_2 = T.axis.spatial(10, i2_0 * 4 + ax1)
+                    ax3_2, rv0, rv1 = T.axis.remap("SRR", [ax2, ax3, ax4])
+                    T.reads(B_0[ax0_2, (ax1_2 * 2 + rv0) % 13, ax2_2 * 2 + rv1, ax3_2])
+                    T.writes(B_1[ax0_2, ax1_2 % 6, ax2_2, ax3_2])
+                    with T.init():
+                        B_1[ax0_2, ax1_2 % 6, ax2_2, ax3_2] = T.int8(-128)
+                    B_1[ax0_2, ax1_2 % 6, ax2_2, ax3_2] = T.max(
+                        B_1[ax0_2, ax1_2 % 6, ax2_2, ax3_2],
+                        B_0[ax0_2, (ax1_2 * 2 + rv0) % 13, ax2_2 * 2 + rv1, ax3_2],
+                    )
+            for i0_1, i1_1, i2_1, i3_1, i4, i5 in T.grid(1, 4, 4, 16, 3, 3):
+                with T.block("C"):
+                    ax0_3 = T.axis.spatial(1, i0_0 + i0_1)
+                    ax1_3 = T.axis.opaque(8, i1_0 * 4 + i1_1)
+                    ax2_3 = T.axis.spatial(8, i2_0 * 4 + i2_1)
+                    ax3_3 = T.axis.spatial(16, i3_0 * 16 + i3_1)
+                    rv0, rv1 = T.axis.remap("RR", [i4, i5])
+                    T.reads(B_1[ax0_3, (ax1_3 + rv0) % 6, ax2_3 + rv1, ax3_3])
+                    T.writes(C[ax0_3, ax1_3, ax2_3, ax3_3])
+                    with T.init():
+                        C[ax0_3, ax1_3, ax2_3, ax3_3] = T.int8(-128)
+                    C[ax0_3, ax1_3, ax2_3, ax3_3] = T.max(
+                        C[ax0_3, ax1_3, ax2_3, ax3_3],
+                        B_1[ax0_3, (ax1_3 + rv0) % 6, ax2_3 + rv1, ax3_3],
+                    )
+
+    sch = tir.Schedule(cascade_3_max_pool2d_with_stride, debug_mask="all")
+    io, _ = _tile_nd(sch, [1, 4, 4, 16], "C")
+    sch.compute_at(sch.get_block("B_1"), io[-1])
+    sch.compute_at(sch.get_block("B_0"), io[-1])
+    sch.rolling_buffer(sch.get_block("B_0"), 0)
+    sch.rolling_buffer(sch.get_block("B_1"), 0)
+    check_rolling_buffer(sch, cascade_3_max_pool2d_with_stride, expected, check_run=True)
+
+
+def test_upscale():
+    @T.prim_func
+    def before(A: T.Buffer[(1, 16, 16, 16), "int8"], C: T.Buffer[(1, 24, 24, 16), "int8"]) -> None:
+        B = T.alloc_buffer([1, 14, 14, 16], dtype="int8")
+        for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 5, 5, 1):
+            for ax0, ax1, ax2, ax3, ax4 in T.grid(5, 5, 16, 3, 3):
+                with T.block("B"):
+                    T.where(i1_0 * 5 // 2 + ax0 < 14 and i2_0 * 5 // 2 + ax1 < 14)
+                    ax0_1 = T.axis.spatial(1, 0)
+                    ax1_1 = T.axis.spatial(14, i1_0 * 5 // 2 + ax0)
+                    ax2_1 = T.axis.spatial(14, i2_0 * 5 // 2 + ax1)
+                    ax3_1 = T.axis.spatial(16, ax2)
+                    rv0, rv1 = T.axis.remap("RR", [ax3, ax4])
+                    T.reads(A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1])
+                    T.writes(B[ax0_1, ax1_1, ax2_1, ax3_1])
+                    with T.init():
+                        B[ax0_1, ax1_1, ax2_1, ax3_1] = T.int8(-128)
+                    B[ax0_1, ax1_1, ax2_1, ax3_1] = T.max(
+                        B[ax0_1, ax1_1, ax2_1, ax3_1], A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1]
+                    )
+            for i0_1, i1_1, i2_1, i3_1, i4, i5 in T.grid(1, 5, 5, 16, 3, 3):
+                with T.block("C"):
+                    T.where(i1_0 * 5 + i1_1 < 24 and i2_0 * 5 + i2_1 < 24)
+                    ax0 = T.axis.spatial(1, i0_0 + i0_1)
+                    ax1 = T.axis.spatial(24, i1_0 * 5 + i1_1)
+                    ax2 = T.axis.spatial(24, i2_0 * 5 + i2_1)
+                    ax3 = T.axis.spatial(16, i3_0 * 16 + i3_1)
+                    rv0, rv1 = T.axis.remap("RR", [i4, i5])
+                    T.reads(B[ax0, ax1 // 2 + rv0, ax2 // 2 + rv1, ax3])
+                    T.writes(C[ax0, ax1, ax2, ax3])
+                    with T.init():
+                        C[ax0, ax1, ax2, ax3] = T.int8(-128)
+                    C[ax0, ax1, ax2, ax3] = T.max(
+                        C[ax0, ax1, ax2, ax3], B[ax0, ax1 // 2 + rv0, ax2 // 2 + rv1, ax3]
+                    )
+
+    @T.prim_func
+    def expected(
+        A: T.Buffer[(1, 16, 16, 16), "int8"], C: T.Buffer[(1, 24, 24, 16), "int8"]
+    ) -> None:
+        B = T.alloc_buffer([1, 5, 14, 16], dtype="int8")
+        for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 5, 5, 1):
+            for ax0, ax1, ax2, ax3, ax4 in T.grid(5, 5, 16, 3, 3):
+                with T.block("B"):
+                    T.where(
+                        i1_0 * 5 // 2 + ax0 < 14
+                        and i2_0 * 5 // 2 + ax1 < 14
+                        and (i1_0 < 1 or 2 <= ax0)
+                        and (i2_0 < 1 or 2 <= ax1)
+                    )
+                    ax0_1 = T.axis.spatial(1, 0)
+                    ax1_1 = T.axis.opaque(14, i1_0 * 5 // 2 + ax0)
+                    ax2_1 = T.axis.spatial(14, i2_0 * 5 // 2 + ax1)
+                    ax3_1 = T.axis.spatial(16, ax2)
+                    rv0, rv1 = T.axis.remap("RR", [ax3, ax4])
+                    T.reads(A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1])
+                    T.writes(B[ax0_1, ax1_1 % 5, ax2_1, ax3_1])
+                    with T.init():
+                        B[ax0_1, ax1_1 % 5, ax2_1, ax3_1] = T.int8(-128)
+                    B[ax0_1, ax1_1 % 5, ax2_1, ax3_1] = T.max(
+                        B[ax0_1, ax1_1 % 5, ax2_1, ax3_1], A[ax0_1, ax1_1 + rv0, ax2_1 + rv1, ax3_1]
+                    )
+            for i0_1, i1_1, i2_1, i3_1, i4, i5 in T.grid(1, 5, 5, 16, 3, 3):
+                with T.block("C"):
+                    T.where(i1_0 * 5 + i1_1 < 24 and i2_0 * 5 + i2_1 < 24)
+                    ax0 = T.axis.spatial(1, i0_0 + i0_1)
+                    ax1 = T.axis.opaque(24, i1_0 * 5 + i1_1)
+                    ax2 = T.axis.spatial(24, i2_0 * 5 + i2_1)
+                    ax3 = T.axis.spatial(16, i3_0 * 16 + i3_1)
+                    rv0, rv1 = T.axis.remap("RR", [i4, i5])
+                    T.reads(B[ax0, (ax1 // 2 + rv0) % 5, ax2 // 2 + rv1, ax3])
+                    T.writes(C[ax0, ax1, ax2, ax3])
+                    with T.init():
+                        C[ax0, ax1, ax2, ax3] = T.int8(-128)
+                    C[ax0, ax1, ax2, ax3] = T.max(
+                        C[ax0, ax1, ax2, ax3], B[ax0, (ax1 // 2 + rv0) % 5, ax2 // 2 + rv1, ax3]
+                    )
+
+    sch = tir.Schedule(before, debug_mask="all")
+    sch.rolling_buffer(sch.get_block("B"), 0)
+    check_rolling_buffer(sch, before, expected, check_run=True)
+
+
+def test_fail_rolling_buffer_multi_writers():
+    @T.prim_func
+    def func_multi_writers(
+        A: T.Buffer[(1, 12, 12, 16), "int8"], C: T.Buffer[(1, 12, 12, 16), "int8"]
+    ):
+        B = T.alloc_buffer([1, 12, 12, 16], dtype="int8")
+        for i0, i1, i2, i3 in T.grid(1, 3, 3, 1):
+            for ax0, ax1, ax2 in T.grid(6, 6, 16):
+                with T.block("B_writer_0"):
+                    ax0_1 = T.axis.spatial(1, i0)
+                    ax1_1 = T.axis.spatial(12, i1 * 4 + ax0)
+                    ax2_1 = T.axis.spatial(12, i2 * 4 + ax1)
+                    ax3_1 = T.axis.spatial(16, ax2)
+                    with T.init():
+                        B[ax0_1, ax1_1, ax2_1, ax3_1] = T.int8(-128)
+                    B[ax0_1, ax1_1, ax2_1, ax3_1] = A[ax0_1, ax1_1, ax2_1, ax3_1] + T.int8(1)
+            for ax0, ax1, ax2 in T.grid(6, 6, 16):
+                with T.block("B_writer_1"):
+                    ax0_2 = T.axis.spatial(1, i0)
+                    ax1_2 = T.axis.spatial(12, i1 * 4 + ax0)
+                    ax2_2 = T.axis.spatial(12, i2 * 4 + ax1)
+                    ax3_2 = T.axis.spatial(16, ax2)
+                    with T.init():
+                        B[ax0_2, ax1_2, ax2_2, ax3_2] = T.int8(-128)
+                    B[ax0_2, ax1_2, ax2_2, ax3_2] = B[ax0_2, ax1_2, ax2_2, ax3_2] + A[
+                        ax0_2, ax1_2, ax2_2, ax3_2
+                    ] * T.int8(2)
+            for ax0, ax1, ax2, ax3, ax4, ax5 in T.grid(1, 4, 4, 16, 3, 3):
+                with T.block("C"):
+                    ax0_3 = T.axis.spatial(1, i0 + ax0)
+                    ax1_3 = T.axis.spatial(12, i1 * 4 + ax1)
+                    ax2_3 = T.axis.spatial(12, i2 * 4 + ax2)
+                    ax3_3 = T.axis.spatial(16, i3 * 16 + ax3)
+                    rv0, rv1 = T.axis.remap("RR", [ax4, ax5])
+                    with T.init():
+                        C[ax0_3, ax1_3, ax2_3, ax3_3] = T.int8(-128)
+                    C[ax0_3, ax1_3, ax2_3, ax3_3] = T.max(
+                        C[ax0_3, ax1_3, ax2_3, ax3_3], B[ax0_3, ax1_3 + rv0, ax2_3 + rv1, ax3_3]
+                    )
+
+    sch = tir.Schedule(func_multi_writers, debug_mask="all")
+    with pytest.raises(tvm.tir.ScheduleError):
+        sch.rolling_buffer(sch.get_block("B_writer_0"), 0)
+
+
+def test_fail_rolling_buffer_not_match():
+    @T.prim_func
+    def func_non_overlap(
+        A: T.Buffer[(1, 12, 12, 16), "int8"], C: T.Buffer[(1, 12, 12, 16), "int8"]
+    ):
+        B = T.alloc_buffer([1, 12, 12, 16], dtype="int8")
+        for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 3, 3, 1):
+            for ax0, ax1, ax2 in T.grid(4, 4, 16):
+                with T.block("B"):
+                    ax0_1 = T.axis.spatial(1, 0)
+                    ax1_1 = T.axis.spatial(12, i1_0 * 4 + ax0)
+                    ax2_1 = T.axis.spatial(12, i2_0 * 4 + ax1)
+                    ax3 = T.axis.spatial(16, ax2)
+                    T.reads(A[ax0_1, ax1_1, ax2_1, ax3])
+                    T.writes(B[ax0_1, ax1_1, ax2_1, ax3])
+                    with T.init():
+                        B[ax0_1, ax1_1, ax2_1, ax3] = T.int8(-128)
+                    B[ax0_1, ax1_1, ax2_1, ax3] = A[ax0_1, ax1_1, ax2_1, ax3]
+            for i0_1, i1_1, i2_1, i3_1, i4, i5 in T.grid(1, 4, 4, 16, 1, 1):
+                with T.block("C"):
+                    ax0 = T.axis.spatial(1, i0_0 + i0_1)
+                    ax1 = T.axis.spatial(12, i1_0 * 4 + i1_1)
+                    ax2 = T.axis.spatial(12, i2_0 * 4 + i2_1)
+                    ax3 = T.axis.spatial(16, i3_0 * 16 + i3_1)
+                    rv0, rv1 = T.axis.remap("RR", [i4, i5])
+                    T.reads(B[ax0, ax1 + rv0, ax2 + rv1, ax3])
+                    T.writes(C[ax0, ax1, ax2, ax3])
+                    with T.init():
+                        C[ax0, ax1, ax2, ax3] = T.int8(-128)
+                    C[ax0, ax1, ax2, ax3] = T.max(
+                        C[ax0, ax1, ax2, ax3], B[ax0, ax1 + rv0, ax2 + rv1, ax3]
+                    )
+
+    sch = tir.Schedule(func_non_overlap, debug_mask="all")
+    with pytest.raises(tvm.tir.ScheduleError):
+        sch.rolling_buffer(sch.get_block("B"), 0)
+
+
+def test_fail_rolling_buffer_injection_invalid():
+    sch = tir.Schedule(cascade_2_max_pool2d, debug_mask="all")
+    # Block B is not compute_at to Block C, so rolling_buffer injection is invalid.
+    _, _ = _tile_nd(sch, [1, 4, 8, 16], "C")
+    _, _ = _tile_nd(sch, [1, 4, 8, 16], "B")
+    with pytest.raises(tvm.tir.ScheduleError):
+        sch.rolling_buffer(sch.get_block("B"), 0)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From e9e8c4b263aa33bfd6bd54151a4bc7eb89da8deb Mon Sep 17 00:00:00 2001
From: alter-xp <xp56@linux.alibaba.com>
Date: Wed, 2 Nov 2022 00:54:19 +0800
Subject: [PATCH 481/704] fix GPU other build (#13235)

fixes #12777

Co-authored-by: thead_iot_autotest <thead_iot_autotest@alibaba-inc.com>
Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>
---
 Jenkinsfile                | 24 +++++-------------------
 ci/jenkins/Build.groovy.j2 |  8 ++++----
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 135f64dc1d94..496887479c87 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-10-19T13:44:32.119961
+// Generated at 2022-11-01T15:54:54.217190
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -677,7 +677,7 @@ def fsim_test(image) {
 
 def cmake_build(image, path, make_flag) {
   sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod --build-dir ${path}",
     label: 'Run cmake build',
   )
 }
@@ -761,23 +761,9 @@ stage('Build') {
 
 
           // compiler test
-          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
-          make("${ci_gpu} --no-gpu", 'build2', '-j2')
-          sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu2/build/libtvm.so
-              md5sum build/libvta_fsim.so
-              retry 3 aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/gpu2/build/libvta_fsim.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/gpu2/build/libtvm_runtime.so
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/gpu2/build/config.cmake
-            """,
-            label: 'Upload artifacts to S3',
-          )
+          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_clean.sh build",
+          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
+          make("${ci_gpu} --no-gpu", 'build', '-j2')
           }
         }
       }
diff --git a/ci/jenkins/Build.groovy.j2 b/ci/jenkins/Build.groovy.j2
index 49cffacdc16e..315057c5d5cc 100644
--- a/ci/jenkins/Build.groovy.j2
+++ b/ci/jenkins/Build.groovy.j2
@@ -21,7 +21,7 @@ def fsim_test(image) {
 
 def cmake_build(image, path, make_flag) {
   sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod --build-dir ${path}",
     label: 'Run cmake build',
   )
 }
@@ -93,9 +93,9 @@ stage('Build') {
     {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
 
     // compiler test
-    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
-    make("${ci_gpu} --no-gpu", 'build2', '-j2')
-    {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }}
+    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_clean.sh build",
+    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
+    make("${ci_gpu} --no-gpu", 'build', '-j2')
   {% endcall %}
 
   {% call m.build_step(

From 5d15428994fee408ed14807fad4362ef2d783d25 Mon Sep 17 00:00:00 2001
From: wrongtest <wrongtest0@gmail.com>
Date: Wed, 2 Nov 2022 01:47:30 +0800
Subject: [PATCH 482/704] [Relay] Add set_attrs_type registry to broadcast_to
 op (#13096)

Add set_attr_type registry to broadcast_to op
---
 src/relay/op/tensor/transform.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 5f063a290740..c41eb0f8ad99 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -2557,6 +2557,7 @@ RELAY_REGISTER_OP("broadcast_to")
     .add_argument("data", "Tensor", "The input tensor.")
     .set_support_level(4)
     .add_type_rel("BroadCastTo", BroadCastToRel)
+    .set_attrs_type<InitOpAttrs>()
     .set_attr<FTVMCompute>("FTVMCompute", BroadCastToCompute)
     .set_attr<TOpPattern>("TOpPattern", kBroadcast);
 

From 6551b715888710f338836fcf4c13c3ea1a34dbec Mon Sep 17 00:00:00 2001
From: Thierry Moreau <tmoreau@octoml.ai>
Date: Tue, 1 Nov 2022 17:10:56 -0400
Subject: [PATCH 483/704] [COMMUNITY] Jyotsna Verma -> Reviewer (#13251)

adding Jyotsna to reviewers list
---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index d5fea2181a11..448c13b60c1a 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -180,6 +180,7 @@ We do encourage everyone to work anything they are interested in.
 - [Jorn Tuyls](https://github.com/jtuyls): @jtuyls
 - [Gavin Uberti](https://github.com/guberti): @guberti
 - [Luis Vega](https://github.com/vegaluisjose): @vegaluisjose
+- [Jyotsna Verma](https://github.com/jverma-quic): @jverma-quic
 - [Thomas Viehmann](https://github.com/t-vi): @t-vi
 - [An Wang](https://github.com/anwang2009): @anwang2009
 - [Yao Wang](https://github.com/kevinthesun): @kevinthesun

From 87f52af48a23e2e94ac13863b1fa378b0d4cc846 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 1 Nov 2022 15:32:33 -0700
Subject: [PATCH 484/704] [skip ci] Revert "fix GPU other build (#13235)"
 (#13261)

This reverts commit e9e8c4b263aa33bfd6bd54151a4bc7eb89da8deb.
---
 Jenkinsfile                | 24 +++++++++++++++++++-----
 ci/jenkins/Build.groovy.j2 |  8 ++++----
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 496887479c87..135f64dc1d94 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-11-01T15:54:54.217190
+// Generated at 2022-10-19T13:44:32.119961
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -677,7 +677,7 @@ def fsim_test(image) {
 
 def cmake_build(image, path, make_flag) {
   sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod --build-dir ${path}",
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
     label: 'Run cmake build',
   )
 }
@@ -761,9 +761,23 @@ stage('Build') {
 
 
           // compiler test
-          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_clean.sh build",
-          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
-          make("${ci_gpu} --no-gpu", 'build', '-j2')
+          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
+          make("${ci_gpu} --no-gpu", 'build2', '-j2')
+          sh(
+            script: """
+              set -eux
+              . ci/scripts/retry.sh
+              md5sum build/libtvm.so
+              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu2/build/libtvm.so
+              md5sum build/libvta_fsim.so
+              retry 3 aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/gpu2/build/libvta_fsim.so
+              md5sum build/libtvm_runtime.so
+              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/gpu2/build/libtvm_runtime.so
+              md5sum build/config.cmake
+              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/gpu2/build/config.cmake
+            """,
+            label: 'Upload artifacts to S3',
+          )
           }
         }
       }
diff --git a/ci/jenkins/Build.groovy.j2 b/ci/jenkins/Build.groovy.j2
index 315057c5d5cc..49cffacdc16e 100644
--- a/ci/jenkins/Build.groovy.j2
+++ b/ci/jenkins/Build.groovy.j2
@@ -21,7 +21,7 @@ def fsim_test(image) {
 
 def cmake_build(image, path, make_flag) {
   sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod --build-dir ${path}",
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
     label: 'Run cmake build',
   )
 }
@@ -93,9 +93,9 @@ stage('Build') {
     {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
 
     // compiler test
-    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_clean.sh build",
-    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
-    make("${ci_gpu} --no-gpu", 'build', '-j2')
+    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
+    make("${ci_gpu} --no-gpu", 'build2', '-j2')
+    {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }}
   {% endcall %}
 
   {% call m.build_step(

From 7536068e80edab03177b6f3e42f6657b9f37c3d5 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 2 Nov 2022 10:34:48 +0900
Subject: [PATCH 485/704] [MetaSchedule] Swap the order of RewriteTensorize and
 VerifyGPUCode to reduce tuning time (#13259)

* [MetaSchedule] Swap the order of RewriteTensorize and VerifyGPUCode to
reduce tuning time

* add comment
---
 src/meta_schedule/postproc/postproc.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc
index acd783b1860d..0738c871120f 100644
--- a/src/meta_schedule/postproc/postproc.cc
+++ b/src/meta_schedule/postproc/postproc.cc
@@ -77,8 +77,10 @@ Array<Postproc> Postproc::DefaultCUDATensorCore() {
       Postproc::RewriteUnboundBlock(/*max_threadblocks=*/256),
       Postproc::RewriteParallelVectorizeUnroll(),
       Postproc::RewriteReductionBlock(),
-      Postproc::RewriteTensorize(/*vectorize_init_loop=*/false),
       Postproc::VerifyGPUCode(),
+      // RewriteTensorize is relatively expensive and it doesn't affect the validity of a sample, so
+      // run it only on samples that have passed VerifyGPUCode.
+      Postproc::RewriteTensorize(/*vectorize_init_loop=*/false),
   };
 }
 

From 84fadc45d01e5b76472b716a130015a454940147 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 2 Nov 2022 12:04:37 +0000
Subject: [PATCH 486/704] [CI] Skip failing Caffe tests due to broken URL
 (#13228)

See issue #13227.

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>
---
 tests/python/frontend/caffe/test_forward.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/python/frontend/caffe/test_forward.py b/tests/python/frontend/caffe/test_forward.py
index 21e4c9d03181..d0ba1dfac40b 100644
--- a/tests/python/frontend/caffe/test_forward.py
+++ b/tests/python/frontend/caffe/test_forward.py
@@ -23,6 +23,7 @@
 import os
 import logging
 import numpy as np
+import pytest
 
 from google.protobuf import text_format
 import caffe
@@ -1092,6 +1093,7 @@ def _test_alexnet(data):
     _test_network(data_process, proto_file, blob_file)
 
 
+@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/13227")
 def test_forward_Alexnet():
     """Alexnet"""
     data = np.random.randint(0, 256, size=(1, 3, 227, 227)).astype(np.float32)
@@ -1153,6 +1155,7 @@ def _test_inceptionv1(data):
     _test_network(data_process, proto_file, blob_file)
 
 
+@pytest.mark.skip(reason="See issue https://github.com/apache/tvm/issues/13227")
 def test_forward_Inceptionv1():
     """Inceptionv4"""
     data = np.random.randint(0, 256, size=(1, 3, 224, 224)).astype(np.float32)

From 4ecf3036951f153a578fbf4685542a47b0068733 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 2 Nov 2022 14:15:00 +0000
Subject: [PATCH 487/704] [TVMC] Apply constant folding when converting layout
 (#13216)

This commit ensures that constant folding is applied when a desired
layout is selected during compilation. It ensures that
`layout_transform` operations are removed where possible so that
pattern matching for BYOC backends can work effectively.

A test has been added to check this regression.
---
 python/tvm/driver/tvmc/transform.py        |  1 +
 tests/python/driver/tvmc/test_transform.py | 57 ++++++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 tests/python/driver/tvmc/test_transform.py

diff --git a/python/tvm/driver/tvmc/transform.py b/python/tvm/driver/tvmc/transform.py
index 3f7776577876..51c9e52f21d6 100644
--- a/python/tvm/driver/tvmc/transform.py
+++ b/python/tvm/driver/tvmc/transform.py
@@ -50,6 +50,7 @@ def convert_graph_layout(mod, desired_layout):
         [
             relay.transform.RemoveUnusedFunctions(),
             relay.transform.ConvertLayout(desired_layouts),
+            relay.transform.FoldConstant(),
         ]
     )
 
diff --git a/tests/python/driver/tvmc/test_transform.py b/tests/python/driver/tvmc/test_transform.py
new file mode 100644
index 000000000000..98a0210a1bb6
--- /dev/null
+++ b/tests/python/driver/tvmc/test_transform.py
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.driver.tvmc.transform import convert_graph_layout
+
+
+def test_layout_transform():
+    """
+    Test layout is correctly transformed and constant folding is applied.
+    """
+    dtype = "int8"
+    iinfo = np.iinfo(dtype)
+    data_min = iinfo.min
+    data_max = iinfo.max
+
+    x = relay.var("x", shape=(1, 4, 2, 2), dtype=dtype)
+    weight = relay.const(
+        np.random.randint(data_min, data_max, size=(2, 4, 2, 2), dtype=dtype), dtype=dtype
+    )
+    x = relay.nn.conv2d(x, weight)
+    func = relay.Function(relay.analysis.free_vars(x), x)
+    mod = tvm.IRModule.from_expr(func)
+
+    desired_layout = "NHWC"
+    mod = convert_graph_layout(mod, desired_layout)
+
+    main_expr = mod["main"].body
+    conv = main_expr.args[0]
+    assert conv.op.name == "nn.conv2d"
+    assert conv.attrs["data_layout"] == "NHWC"
+    assert conv.attrs["kernel_layout"] == "HWIO"
+
+    # Ensure transform has been folded into the constant
+    weights = conv.args[1]
+    assert isinstance(weights, relay.expr.Constant)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From c3c145423322cc2493157500e832025b95959d27 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 2 Nov 2022 15:14:32 +0000
Subject: [PATCH 488/704] Apply group write permissions to Python virtual
 environment (#13252)

This commit applies additional write permission to the "tvm-venv"
group virtual environment. Currently after entering a container from
a newly built image it dosn't seem possible to install/update Python
packages. E.g. updating pip will give errors such as:
```
$ pip install --upgrade pip
ERROR: Could not install packages due to an OSError: [Errno 13]
Permission denied: '/venv/apache-tvm-py3.7/bin/pip' Check the
permissions.
```

Enabling write access for this group fixes this as long as the
current user is a member of the "tvm-venv" group.
---
 docker/install/ubuntu_install_python.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh
index fb31c41dccea..eb6dcec45c5d 100755
--- a/docker/install/ubuntu_install_python.sh
+++ b/docker/install/ubuntu_install_python.sh
@@ -89,6 +89,7 @@ pip3 install \
 addgroup tvm-venv
 chgrp -R tvm-venv "${TVM_VENV}"
 setfacl -R -d -m group:tvm-venv:rwx "${TVM_VENV}"
+setfacl -R -m group:tvm-venv:rwx "${TVM_VENV}"
 
 # Prevent further use of pip3 via the system.
 # There may be multiple (i.e. from python3-pip apt package and pip3 install -U).

From da4bb4a65bdb20c53e2025981c19a58e72dd2028 Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Wed, 2 Nov 2022 11:16:52 -0400
Subject: [PATCH 489/704] Fix a typo in rpc/client.py (#12842)

---
 python/tvm/rpc/client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index e7071bae8d2e..9dd9023f7b6d 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -33,7 +33,7 @@
 class RPCSession(object):
     """RPC Client session module
 
-    Do not directly create the obhect, call connect
+    Do not directly create the object, call connect
     """
 
     # pylint: disable=invalid-name

From d261fa883811209144cb3525982086cba8a2c831 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 2 Nov 2022 15:21:49 -0500
Subject: [PATCH 490/704] [Hexagon] Add pylint on tests (#13233)

* [Hexagon] Tests pylint

* fix error

* Fix buffer name
---
 tests/lint/pylint.sh                          |  17 +-
 .../contrib/test_hexagon/pytest_util.py       |  36 +-
 .../test_hexagon/test_async_dma_pipeline.py   | 514 ++++++++++--------
 .../test_hexagon/test_benchmark_maxpool2d.py  | 190 +++----
 .../test_hexagon/test_cache_read_write.py     |   4 +
 .../test_fixed_point_conversion.py            |  13 +-
 .../test_hexagon/test_fixed_point_multiply.py | 167 +++---
 .../contrib/test_hexagon/test_memory_alloc.py |  22 +-
 .../test_hexagon/test_meta_schedule.py        | 147 +++--
 .../contrib/test_hexagon/test_parallel_hvx.py |  81 +--
 .../test_parallel_hvx_load_vtcm.py            | 252 +++++----
 .../test_hexagon/test_parallel_scalar.py      |  63 ++-
 .../test_hexagon/test_run_unit_tests.py       |   4 +
 .../contrib/test_hexagon/test_sigmoid.py      |  40 +-
 .../test_software_pipeline_async.py           | 269 ++++-----
 .../test_hexagon/test_vtcm_bandwidth.py       |  53 +-
 .../test_wo_qnn_canonicalization.py           |   5 +-
 17 files changed, 1036 insertions(+), 841 deletions(-)

diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh
index 6b5415987985..e41dc2bb80b8 100755
--- a/tests/lint/pylint.sh
+++ b/tests/lint/pylint.sh
@@ -27,20 +27,9 @@ python3 -m pylint tests/python/ci --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc
 
 # tests/python/contrib/test_hexagon tests
-python3 -m pylint tests/python/contrib/test_hexagon/benchmark_util.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/conftest.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/infrastructure.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/test_2d_physical_buffers.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/test_autotvm.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/test_cache_read_write.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/test_launcher.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/test_models.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/test_run_unit_tests.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/test_thread_pool.py --rcfile="$(dirname "$0")"/pylintrc
-python3 -m pylint tests/python/contrib/test_hexagon/test_usmp.py --rcfile="$(dirname "$0")"/pylintrc
+python3 -m pylint tests/python/contrib/test_hexagon/*.py --rcfile="$(dirname "$0")"/pylintrc
+python3 -m pylint tests/python/contrib/test_hexagon/conv2d/*.py --rcfile="$(dirname "$0")"/pylintrc
+
 
 # tests/python/frontend tests
 python3 -m pylint tests/python/frontend/caffe/test_forward.py --rcfile="$(dirname "$0")"/pylintrc
diff --git a/tests/python/contrib/test_hexagon/pytest_util.py b/tests/python/contrib/test_hexagon/pytest_util.py
index 77842ce91493..c078edf7a934 100644
--- a/tests/python/contrib/test_hexagon/pytest_util.py
+++ b/tests/python/contrib/test_hexagon/pytest_util.py
@@ -15,11 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import pytest
-import numpy as np
-from typing import *
+""" Hexagon pytest utility functions """
+
+from typing import List, Optional, Union
 import collections
-import tvm.testing
+import numpy as np
 
 
 def get_test_id(*test_params, test_param_descs: List[Optional[str]] = None) -> str:
@@ -47,35 +47,35 @@ def get_test_id(*test_params, test_param_descs: List[Optional[str]] = None) -> s
         assert len(test_param_descs) == len(test_params)
 
     def get_single_param_chunk(param_val, param_desc: Optional[str]):
-        if type(param_val) == list:
+        if isinstance(param_val, list):
             # Like str(list), but avoid the whitespace padding.
             val_str = "[" + ",".join(str(x) for x in param_val) + "]"
             need_prefix_separator = False
 
-        elif type(param_val) == bool:
+        elif isinstance(param_val, bool):
             if param_val:
                 val_str = "T"
             else:
                 val_str = "F"
             need_prefix_separator = True
 
-        elif type(param_val) == TensorContentConstant:
+        elif isinstance(param_val, TensorContentConstant):
             val_str = f"const[{param_val.elem_value}]"
             need_prefix_separator = True
 
-        elif type(param_val) == TensorContentDtypeMin:
+        elif isinstance(param_val, TensorContentDtypeMin):
             val_str = "min"
             need_prefix_separator = True
 
-        elif type(param_val) == TensorContentDtypeMax:
+        elif isinstance(param_val, TensorContentDtypeMax):
             val_str = "max"
             need_prefix_separator = True
 
-        elif type(param_val) == TensorContentRandom:
+        elif isinstance(param_val, TensorContentRandom):
             val_str = "random"
             need_prefix_separator = True
 
-        elif type(param_val) == TensorContentSequentialCOrder:
+        elif isinstance(param_val, TensorContentSequentialCOrder):
             val_str = f"seqC[start:{param_val.start_value},inc:{param_val.increment}]"
             need_prefix_separator = True
 
@@ -148,26 +148,26 @@ def create_populated_numpy_ndarray(
     """
     itp = input_tensor_populator  # just for brevity
 
-    if type(itp) == TensorContentConstant:
+    if isinstance(itp, TensorContentConstant):
         return np.full(tuple(input_shape), itp.elem_value, dtype=dtype)
 
-    elif type(itp) == TensorContentDtypeMin:
+    elif isinstance(itp, TensorContentDtypeMin):
         info = get_numpy_dtype_info(dtype)
         return np.full(tuple(input_shape), info.min, dtype=dtype)
 
-    elif type(itp) == TensorContentDtypeMax:
+    elif isinstance(itp, TensorContentDtypeMax):
         info = get_numpy_dtype_info(dtype)
         return np.full(tuple(input_shape), info.max, dtype=dtype)
 
-    elif type(itp) == TensorContentRandom:
+    elif isinstance(itp, TensorContentRandom):
         return np.random.random(input_shape).astype(dtype)
 
-    elif type(itp) == TensorContentSequentialCOrder:
+    elif isinstance(itp, TensorContentSequentialCOrder):
         a = np.empty(tuple(input_shape), dtype)
 
-        with np.nditer(a, op_flags=["writeonly"], order="C") as it:
+        with np.nditer(a, op_flags=["writeonly"], order="C") as iterator:
             next_elem_val = itp.start_value
-            for elem in it:
+            for elem in iterator:
                 elem[...] = next_elem_val
                 next_elem_val += itp.increment
         return a
diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
index 45e8eb0f68c6..a7a05c2aa3a7 100644
--- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -29,30 +29,35 @@
 
 
 def conv_approximation(size_a, size_w):
+    """Conv approximation."""
     a_shape = (size_a, VRMPY_SIZE_B)
     w_shape = (size_w, VRMPY_SIZE_B)
     out_shape = (size_a, VRMPY_SIZE_INT32)
 
     @T.prim_func
-    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+    def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, a_shape, dtype="uint8")
-        W = T.match_buffer(b, w_shape, dtype="uint8")
-        C = T.match_buffer(c, out_shape, dtype="int32")
-        for n, i in T.grid(size_a, size_w):
-            with T.block("C"):
-                vn, vi = T.axis.remap("SR", [n, i])
-                T.reads(A[vn, 0:VRMPY_SIZE_B], W[vi, 0:VRMPY_SIZE_B], C[vn, 0:VRMPY_SIZE_INT32])
-                T.writes(C[vn, 0:VRMPY_SIZE_INT32])
+        a_buffer = T.match_buffer(a_input, a_shape, dtype="uint8")
+        w_buffer = T.match_buffer(b_input, w_shape, dtype="uint8")
+        c_buffer = T.match_buffer(c_output, out_shape, dtype="int32")
+        for n, index_0 in T.grid(size_a, size_w):
+            with T.block("c_buffer"):
+                vn_index, vi_index = T.axis.remap("SR", [n, index_0])
+                T.reads(
+                    a_buffer[vn_index, 0:VRMPY_SIZE_B],
+                    w_buffer[vi_index, 0:VRMPY_SIZE_B],
+                    c_buffer[vn_index, 0:VRMPY_SIZE_INT32],
+                )
+                T.writes(c_buffer[vn_index, 0:VRMPY_SIZE_INT32])
                 with T.init():
                     for x in T.serial(VRMPY_SIZE_INT32):
-                        C[vn, x] = 0
-                C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin(
+                        c_buffer[vn_index, x] = 0
+                c_buffer[vn_index, T.ramp(0, 1, 32)] = T.call_llvm_intrin(
                     T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.acc.128B"),
                     T.uint32(3),
-                    C[vn, T.ramp(0, 1, 32)],
-                    T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
-                    T.reinterpret(W[vi, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    c_buffer[vn_index, T.ramp(0, 1, 32)],
+                    T.reinterpret(a_buffer[vn_index, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(w_buffer[vi_index, T.ramp(0, 1, 128)], dtype="int32x32"),
                     dtype="int32x32",
                 )
         # Currently async DMA lowering does not add any wait to the end of schedules so
@@ -73,13 +78,14 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
 def evaluate(
     hexagon_session,
     sch,
-    a,
-    b,
-    c,
+    a_data,
+    b_data,
+    c_data,
     expected_output=None,
     use_async_copy=0,
     merge_async_commit_queue_scope=False,
 ):
+    """Evaluate function."""
     target_hexagon = tvm.target.hexagon("v68", link_params=True)
     with tvm.transform.PassContext(
         config={
@@ -92,9 +98,9 @@ def evaluate(
         )
     module = hexagon_session.load_module(func_tir)
 
-    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device)
-    b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device)
-    c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device)
+    a_hexagon = tvm.runtime.ndarray.array(a_data, device=hexagon_session.device)
+    b_hexagon = tvm.runtime.ndarray.array(b_data, device=hexagon_session.device)
+    c_hexagon = tvm.runtime.ndarray.array(c_data, device=hexagon_session.device)
 
     if tvm.testing.utils.IS_IN_CI:
         # Run with reduced number and repeat for CI
@@ -108,32 +114,8 @@ def evaluate(
     return round(time.mean * 1000, 4)
 
 
-@tvm.testing.fixture
-def input_a(size_a):
-    return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8")
-
-
-@tvm.testing.fixture
-def input_w(size_w):
-    return default_rng().integers(0, 8, (size_w, VRMPY_SIZE_B), dtype="uint8")
-
-
-@tvm.testing.fixture
-def expected_output(size_a, size_w, input_a, input_w):
-    if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1):
-        pytest.skip("Skipping test since it takes too long in CI.")
-    expected_output = np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32")
-    for n in range(size_a):
-        for x in range(size_w):
-            for i in range(VRMPY_SIZE_INT32):
-                for r in range(4):
-                    expected_output[n, i] += np.uint32(input_a[n, i * 4 + r]) * np.uint32(
-                        input_w[x, i * 4 + r]
-                    )
-    return expected_output
-
-
 def get_single_dma_schedule(size_a, size_w):
+    """Generate single DMA schedule."""
     a_shape = (size_a, VRMPY_SIZE_B)
     w_shape = (size_w, VRMPY_SIZE_B)
     out_shape = (size_a, VRMPY_SIZE_INT32)
@@ -142,32 +124,32 @@ def get_single_dma_schedule(size_a, size_w):
     w_bytes = size_w * VRMPY_SIZE_B
 
     @T.prim_func
-    def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
+    def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, a_shape, dtype="uint8", mem_scope="global")
-        W = T.match_buffer(b, w_shape, dtype="uint8", mem_scope="global")
-        C = T.match_buffer(c, out_shape, dtype="int32", mem_scope="global")
-        A_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", mem_scope="global.vtcm")
-        W_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", mem_scope="global.vtcm")
-        C_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", mem_scope="global.vtcm")
+        a_buffer = T.match_buffer(a_input, a_shape, dtype="uint8", mem_scope="global")
+        w_buffer = T.match_buffer(b_input, w_shape, dtype="uint8", mem_scope="global")
+        c_buffer = T.match_buffer(c_output, out_shape, dtype="int32", mem_scope="global")
+        a_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", mem_scope="global.vtcm")
+        w_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", mem_scope="global.vtcm")
+        c_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", mem_scope="global.vtcm")
         T.evaluate(
             T.tvm_call_packed(
                 "device_api.hexagon.mem_copy_DLTensor",
                 T.tvm_stack_make_array(
-                    A_global_vtcm.data,
+                    a_global_vtcm.data,
                     T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
                     0,
                     2,
-                    A_global_vtcm.dtype,
+                    a_global_vtcm.dtype,
                     0,
                     dtype="handle",
                 ),
                 T.tvm_stack_make_array(
-                    A.data,
+                    a_buffer.data,
                     T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
                     0,
                     2,
-                    A.dtype,
+                    a_buffer.dtype,
                     0,
                     dtype="handle",
                 ),
@@ -179,20 +161,20 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
             T.tvm_call_packed(
                 "device_api.hexagon.mem_copy_DLTensor",
                 T.tvm_stack_make_array(
-                    W_global_vtcm.data,
+                    w_global_vtcm.data,
                     T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"),
                     0,
                     2,
-                    W_global_vtcm.dtype,
+                    w_global_vtcm.dtype,
                     0,
                     dtype="handle",
                 ),
                 T.tvm_stack_make_array(
-                    W.data,
+                    w_buffer.data,
                     T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"),
                     0,
                     2,
-                    W.dtype,
+                    w_buffer.dtype,
                     0,
                     dtype="handle",
                 ),
@@ -200,43 +182,43 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
                 dtype="int32",
             )
         )
-        for n, i in T.grid(size_a, size_w):
-            with T.block("C"):
-                vn, vi = T.axis.remap("SR", [n, i])
+        for n, index_0 in T.grid(size_a, size_w):
+            with T.block("c_buffer"):
+                vn_index, vi_index = T.axis.remap("SR", [n, index_0])
                 T.reads(
-                    A_global_vtcm[vn, 0:VRMPY_SIZE_B],
-                    W_global_vtcm[vi, 0:VRMPY_SIZE_B],
-                    C_global_vtcm[vn, 0:VRMPY_SIZE_INT32],
+                    a_global_vtcm[vn_index, 0:VRMPY_SIZE_B],
+                    w_global_vtcm[vi_index, 0:VRMPY_SIZE_B],
+                    c_global_vtcm[vn_index, 0:VRMPY_SIZE_INT32],
                 )
-                T.writes(C_global_vtcm[vn, 0:VRMPY_SIZE_INT32])
+                T.writes(c_global_vtcm[vn_index, 0:VRMPY_SIZE_INT32])
                 with T.init():
                     for x in T.serial(VRMPY_SIZE_INT32):
-                        C_global_vtcm[vn, x] = 0
-                C_global_vtcm[vn, T.ramp(0, 1, 32)] += T.call_llvm_intrin(
+                        c_global_vtcm[vn_index, x] = 0
+                c_global_vtcm[vn_index, T.ramp(0, 1, 32)] += T.call_llvm_intrin(
                     T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
                     T.uint32(2),
-                    T.reinterpret(A_global_vtcm[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
-                    T.reinterpret(W_global_vtcm[vi, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(a_global_vtcm[vn_index, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(w_global_vtcm[vi_index, T.ramp(0, 1, 128)], dtype="int32x32"),
                     dtype="int32x32",
                 )
         T.evaluate(
             T.tvm_call_packed(
                 "device_api.hexagon.mem_copy_DLTensor",
                 T.tvm_stack_make_array(
-                    C.data,
+                    c_buffer.data,
                     T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
                     0,
                     2,
-                    C.dtype,
+                    c_buffer.dtype,
                     0,
                     dtype="handle",
                 ),
                 T.tvm_stack_make_array(
-                    C_global_vtcm.data,
+                    c_global_vtcm.data,
                     T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
                     0,
                     2,
-                    C_global_vtcm.dtype,
+                    c_global_vtcm.dtype,
                     0,
                     dtype="handle",
                 ),
@@ -251,43 +233,45 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
 
 
 def get_fake_conv_vtcm_schedule(size_a, size_w, blocks=2):
+    """Generate fake conv schedule with VTCM."""
     sch = conv_approximation(size_a, size_w)
 
-    compute_block = sch.get_block("C")
+    compute_block = sch.get_block("c_buffer")
     sch.cache_read(compute_block, 1, "global.vtcm")
 
     n = sch.get_loops(compute_block)[0]
-    no, _ = sch.split(n, [blocks, None])
+    n_outer, _ = sch.split(n, [blocks, None])
 
     cache_read_block_a = sch.cache_read(compute_block, 0, "global.vtcm")
-    sch.compute_at(cache_read_block_a, no)
+    sch.compute_at(cache_read_block_a, n_outer)
     sch.fuse(*sch.get_loops(cache_read_block_a)[1:])
 
     cache_write_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
-    sch.reverse_compute_at(cache_write_block_c, no)
+    sch.reverse_compute_at(cache_write_block_c, n_outer)
     sch.fuse(*sch.get_loops(cache_write_block_c)[1:])
 
     return sch
 
 
 def get_multi_input_fake_conv_vtcm_schedule(size_a, size_w, blocks=2):
+    """Generate multi input fake Conv using VTCM."""
     sch = conv_approximation(size_a, size_w)
 
-    compute_block = sch.get_block("C")
+    compute_block = sch.get_block("c_buffer")
 
     n = sch.get_loops(compute_block)[0]
-    no, _ = sch.split(n, [blocks, None])
+    n_outer, _ = sch.split(n, [blocks, None])
 
     cache_read_block_a = sch.cache_read(compute_block, 0, "global.vtcm")
-    sch.compute_at(cache_read_block_a, no)
+    sch.compute_at(cache_read_block_a, n_outer)
     sch.fuse(*sch.get_loops(cache_read_block_a)[1:])
 
     cache_read_block_b = sch.cache_read(compute_block, 1, "global.vtcm")
-    sch.compute_at(cache_read_block_b, no)
+    sch.compute_at(cache_read_block_b, n_outer)
     sch.fuse(*sch.get_loops(cache_read_block_b)[1:])
 
     cache_write_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
-    sch.reverse_compute_at(cache_write_block_c, no)
+    sch.reverse_compute_at(cache_write_block_c, n_outer)
     sch.fuse(*sch.get_loops(cache_write_block_c)[1:])
 
     return sch
@@ -301,6 +285,8 @@ def print_results(test_key, runtimes):
 
 
 class TestAsyncDMAPipeline:
+    """Async DMA pipeline test class."""
+
     # Removed most of these to speedup CI.
     size_a = tvm.testing.parameter(
         1024,
@@ -314,6 +300,29 @@ class TestAsyncDMAPipeline:
         9 * 9,
     )
 
+    @tvm.testing.fixture
+    def input_a(self, size_a):
+        return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8")
+
+    @tvm.testing.fixture
+    def input_w(self, size_w):
+        return default_rng().integers(0, 8, (size_w, VRMPY_SIZE_B), dtype="uint8")
+
+    @tvm.testing.fixture
+    def expected_output(self, size_a, size_w, input_a, input_w):
+        """Generate expected output."""
+        if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1):
+            pytest.skip("Skipping test since it takes too long in CI.")
+        expected_result = np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32")
+        for n in range(size_a):
+            for x in range(size_w):
+                for index_0 in range(VRMPY_SIZE_INT32):
+                    for r_index in range(4):
+                        expected_result[n, index_0] += np.uint32(
+                            input_a[n, index_0 * 4 + r_index]
+                        ) * np.uint32(input_w[x, index_0 * 4 + r_index])
+        return expected_result
+
     @tvm.testing.requires_hexagon
     def test_loading_vtcm_for_vrmpy(
         self,
@@ -324,6 +333,7 @@ def test_loading_vtcm_for_vrmpy(
         input_w,
         expected_output,
     ):
+        """VTCM for VRMPY test."""
 
         if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1):
             pytest.skip("Skipping test since it takes too long in CI.")
@@ -350,7 +360,7 @@ def test_loading_vtcm_for_vrmpy(
         )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
-        n = sch.get_loops(sch.get_block("C"))[0]
+        n = sch.get_loops(sch.get_block("c_buffer"))[0]
         sch.annotate(n, "software_pipeline_stage", [0, 1, 2])
         sch.annotate(n, "software_pipeline_order", [0, 1, 2])
         sch.annotate(n, "software_pipeline_async_stages", [0])
@@ -365,11 +375,11 @@ def test_loading_vtcm_for_vrmpy(
         )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
-        n = sch.get_loops(sch.get_block("C"))[0]
+        n = sch.get_loops(sch.get_block("c_buffer"))[0]
         sch.annotate(n, "software_pipeline_stage", [0, 1, 2])
         sch.annotate(n, "software_pipeline_order", [0, 1, 2])
         sch.annotate(n, "software_pipeline_async_stages", [0, 2])
-        async_input_output_runtime = evaluate(
+        async_input_output = evaluate(
             hexagon_session,
             sch,
             input_a,
@@ -380,11 +390,11 @@ def test_loading_vtcm_for_vrmpy(
         )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
-        n = sch.get_loops(sch.get_block("C"))[0]
+        n = sch.get_loops(sch.get_block("c_buffer"))[0]
         sch.annotate(n, "software_pipeline_stage", [0, 3, 6])
         sch.annotate(n, "software_pipeline_order", [0, 1, 2])
         sch.annotate(n, "software_pipeline_async_stages", [0, 6])
-        async_input_output_runtime_larger_buffers = evaluate(
+        async_larger_buffers = evaluate(
             hexagon_session,
             sch,
             input_a,
@@ -395,11 +405,11 @@ def test_loading_vtcm_for_vrmpy(
         )
 
         sch = get_multi_input_fake_conv_vtcm_schedule(size_a, size_w)
-        n = sch.get_loops(sch.get_block("C"))[0]
+        n = sch.get_loops(sch.get_block("c_buffer"))[0]
         sch.annotate(n, "software_pipeline_stage", [0, 0, 1, 2])
         sch.annotate(n, "software_pipeline_order", [0, 1, 2, 3])
         sch.annotate(n, "software_pipeline_async_stages", [0, 2])
-        async_multi_input_output_runtime = evaluate(
+        async_multi_input_output = evaluate(
             hexagon_session,
             sch,
             input_a,
@@ -411,7 +421,7 @@ def test_loading_vtcm_for_vrmpy(
         )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
-        n = sch.get_loops(sch.get_block("C"))[0]
+        n = sch.get_loops(sch.get_block("c_buffer"))[0]
         sch.annotate(n, "software_pipeline_stage", [0, 1, 2])
         sch.annotate(n, "software_pipeline_order", [0, 1, 2])
         sch.annotate(n, "software_pipeline_async_stages", [2])
@@ -435,22 +445,30 @@ def test_loading_vtcm_for_vrmpy(
             expected_output,
         )
 
-        # Total transfer size is equal to the size of A + W + C which is equal to 2 * size_a * 128 + size_w * 128
+        # Total transfer size is equal to the size of
+        # a_buffer + w_buffer + c_buffer which is equal to 2 * size_a * 128 + size_w * 128
         transfer_mb = round((2 * size_a * VRMPY_SIZE_B + size_w * VRMPY_SIZE_B) / 1e6, 2)
 
-        # Total number of operations can be calculated given the total number of vrmpy calls (size_a * size_w) * operations per vrmpy accumulate (128 multiplies + 3 adds for reduction per lane + 1 add for accumulate per lane)
+        # Total number of operations can be calculated given
+        # the total number of vrmpy calls (size_a * size_w) * operations
+        # per vrmpy accumulate (128 multiplies + 3 adds for reduction
+        # per lane + 1 add for accumulate per lane)
         complexity = round(size_a * size_w * (VRMPY_SIZE_B * 4) / 1e9, 3)
         print_results(
-            f"Test with A.size: {size_a * VRMPY_SIZE_B}, W.size: {size_w * VRMPY_SIZE_B}, computational complexity of {complexity} GOPs, and total memory transfer of {transfer_mb} MB...",
+            (
+                f"Test with a_buffer.size: {size_a * VRMPY_SIZE_B}, w_buffer.size:"
+                f" {size_w * VRMPY_SIZE_B}, computational complexity of {complexity} GOPs"
+                f", and total memory transfer of {transfer_mb} MB..."
+            ),
             {
                 "without_vtcm": base_runtime,
                 "synchronous_dma": single_dma_runtime,
                 "base_vtcm": base_vtcm_runtime,
                 "async_dma_input": async_input_runtime,
                 "async_dma_output": async_output_runtime,
-                "async_dma_input_output": async_input_output_runtime,
-                "async_dma_multi_input_output": async_multi_input_output_runtime,
-                "async_input_output_runtime_larger_buffers": async_input_output_runtime_larger_buffers,
+                "async_dma_input_output": async_input_output,
+                "async_dma_multi_input_output": async_multi_input_output,
+                "async_input_output_runtime_larger_buffers": async_larger_buffers,
             },
         )
 
@@ -458,84 +476,102 @@ def test_loading_vtcm_for_vrmpy(
 # from tvm.script import tir as T
 @tvm.script.ir_module
 class ModulePipelined:
+    """Pipelined module class."""
+
+    # pylint: disable=no-self-argument
     @T.prim_func
     def main(
-        p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"],
-        p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"],
-        T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"],
+        p0_buffer: T.Buffer[(1, 1, 230, 230, 4), "uint8"],
+        p1_buffer: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"],
+        t_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"],
     ) -> None:
+        # pylint: disable=missing-function-docstring
         # function attr dict
         T.func_attr({"tir.noalias": True, "global_symbol": "main"})
         # body
         # with T.block("root")
-        conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32", scope="global.vtcm")
+        conv2d_nchwc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32", scope="global.vtcm")
         p0_global_vtcm = T.alloc_buffer([1, 1, 230, 230, 4], dtype="uint8", scope="global.vtcm")
         p1_global_vtcm = T.alloc_buffer([2, 1, 7, 7, 1, 32, 4], dtype="int8", scope="global.vtcm")
         for ax0, ax1, ax2, ax3, ax4, ax5, ax6 in T.grid(2, 1, 7, 7, 1, 32, 4):
             with T.block("p1_global.vtcm"):
-                v0, v1, v2, v3, v4, v5, v6 = T.axis.remap(
+                v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind = T.axis.remap(
                     "SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6]
                 )
-                T.reads(p1[v0, v1, v2, v3, v4, v5, v6])
-                T.writes(p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6])
-                p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6] = p1[v0, v1, v2, v3, v4, v5, v6]
-        for po in T.serial(4):
-            for i in T.serial(55876):
+                T.reads(p1_buffer[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind])
+                T.writes(p1_global_vtcm[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind])
+                p1_global_vtcm[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind] = p1_buffer[
+                    v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind
+                ]
+        for p_outer in T.serial(4):
+            for index_0 in T.serial(55876):
                 with T.block("p0_global.vtcm"):
-                    v0 = T.axis.spatial(1, 0)
-                    v1 = T.axis.spatial(1, 0)
-                    v2 = T.axis.spatial(230, po * 56 + i // 916)
-                    v3 = T.axis.spatial(230, i % 916 // 4)
-                    v4 = T.axis.spatial(4, i % 4)
-                    T.reads(p0[v0, v1, v2, v3, v4])
-                    T.writes(p0_global_vtcm[v0, v1, v2, v3, v4])
-                    p0_global_vtcm[v0, v1, v2, v3, v4] = p0[v0, v1, v2, v3, v4]
-            for i in T.parallel(28):
-                for ii, iii, iiii in T.grid(2, 14, 8):
+                    v0_ind = T.axis.spatial(1, 0)
+                    v1_ind = T.axis.spatial(1, 0)
+                    v2_ind = T.axis.spatial(230, p_outer * 56 + index_0 // 916)
+                    v3_ind = T.axis.spatial(230, index_0 % 916 // 4)
+                    v4_ind = T.axis.spatial(4, index_0 % 4)
+                    T.reads(p0_buffer[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind])
+                    T.writes(p0_global_vtcm[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind])
+                    p0_global_vtcm[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind] = p0_buffer[
+                        v0_ind, v1_ind, v2_ind, v3_ind, v4_ind
+                    ]
+            for index_0 in T.parallel(28):
+                for index_1, index_2, index_3 in T.grid(2, 14, 8):
                     with T.block("conv2d_NCHWc_int8_o_init"):
                         n = T.axis.spatial(1, 0)
-                        oc_chunk = T.axis.spatial(2, ii)
-                        oh = T.axis.spatial(112, (po * 28 + i) // 14 * 14 + iii)
-                        ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + iiii)
-                        oc_block_o = T.axis.spatial(1, 0)
+                        oc_chunk = T.axis.spatial(2, index_1)
+                        o_height = T.axis.spatial(
+                            112, (p_outer * 28 + index_0) // 14 * 14 + index_2
+                        )
+                        o_width = T.axis.spatial(112, (p_outer * 28 + index_0) % 14 * 8 + index_3)
+                        oc_block_o = T.axis.spatial(1, 0)  # pylint: disable=unused-variable
                         T.reads()
-                        T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
+                        T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32])
                         for i4_1 in T.vectorized(32):
                             with T.block("conv2d_NCHWc_int8_init"):
                                 oc_block_i_init = T.axis.spatial(32, i4_1)
                                 T.reads()
-                                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init])
-                                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+                                T.writes(
+                                    conv2d_nchwc_int8[
+                                        n, oc_chunk, o_height, o_width, oc_block_i_init
+                                    ]
+                                )
+                                conv2d_nchwc_int8[
+                                    n, oc_chunk, o_height, o_width, oc_block_i_init
+                                ] = 0
                 for i1_1, i5_1, i6_1, i2_2, i3_2 in T.grid(2, 7, 7, 14, 8):
                     with T.block("conv2d_NCHWc_int8_o_update"):
                         n = T.axis.spatial(1, 0)
                         oc_chunk = T.axis.spatial(2, i1_1)
-                        oh = T.axis.spatial(112, (po * 28 + i) // 14 * 14 + i2_2)
-                        ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + i3_2)
-                        oc_block_o = T.axis.spatial(1, 0)
-                        kh = T.axis.reduce(7, i5_1)
-                        kw = T.axis.reduce(7, i6_1)
+                        o_height = T.axis.spatial(112, (p_outer * 28 + index_0) // 14 * 14 + i2_2)
+                        o_width = T.axis.spatial(112, (p_outer * 28 + index_0) % 14 * 8 + i3_2)
+                        oc_block_o = T.axis.spatial(1, 0)  # pylint: disable=unused-variable
+                        k_height = T.axis.reduce(7, i5_1)
+                        k_width = T.axis.reduce(7, i6_1)
                         ic_outer = T.axis.reduce(1, 0)
                         ic_f_inner = T.axis.reduce(1, 0)
-                        ic_s_inner_o = T.axis.reduce(1, 0)
+                        ic_s_inner_o = T.axis.reduce(1, 0)  # pylint: disable=unused-variable
                         T.reads(
-                            conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                            conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32],
                             p0_global_vtcm[
                                 n,
                                 ic_outer,
-                                oh * 2 + kh,
-                                ow * 2 + kw,
+                                o_height * 2 + k_height,
+                                o_width * 2 + k_width,
                                 ic_f_inner * 4 : ic_f_inner * 4 + 4,
                             ],
-                            p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                            p1_global_vtcm[
+                                oc_chunk, ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4
+                            ],
                         )
-                        T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
-                        A = T.match_buffer(
+                        T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32])
+                        a_buffer = T.match_buffer(
                             p0_global_vtcm[
                                 n,
                                 ic_outer,
-                                oh * 2 + kh,
-                                ow * 2 + kw,
+                                o_height * 2 + k_height,
+                                o_width * 2 + k_width,
                                 ic_f_inner * 4 : ic_f_inner * 4 + 4,
                             ],
                             [4],
@@ -543,42 +579,48 @@ def main(
                             offset_factor=1,
                             scope="global.vtcm",
                         )
-                        B = T.match_buffer(
-                            p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                        b_buffer = T.match_buffer(
+                            p1_global_vtcm[
+                                oc_chunk, ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4
+                            ],
                             [32, 4],
                             dtype="int8",
                             offset_factor=1,
                             scope="global.vtcm",
                         )
-                        C = T.match_buffer(
-                            conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                        c_buffer = T.match_buffer(
+                            conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32],
                             [32],
                             dtype="int32",
                             offset_factor=1,
                             scope="global.vtcm",
                         )
-                        A_u8x4: T.uint8x4 = A[0:4]
-                        A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
-                        B_i8x128 = B[0, 0:128]
-                        B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
-                        C[0:32] = T.call_llvm_pure_intrin(
+                        a_u8x4: T.uint8x4 = a_buffer[0:4]
+                        a_i32: T.int32 = T.reinterpret(a_u8x4, dtype="int32")
+                        b_i8x128 = b_buffer[0, 0:128]
+                        b_i32x32: T.int32x32 = T.reinterpret(b_i8x128, dtype="int32x32")
+                        c_buffer[0:32] = T.call_llvm_pure_intrin(
                             4217,
                             T.uint32(3),
-                            C[0:32],
-                            T.broadcast(A_i32, 32),
-                            B_i32x32,
+                            c_buffer[0:32],
+                            T.broadcast(a_i32, 32),
+                            b_i32x32,
                             dtype="int32x32",
                         )
-            for i in T.serial(200704):
-                with T.block("conv2d_NCHWc_int8.vtcm"):
+            for index_0 in T.serial(200704):
+                with T.block("conv2d_nchwc_int8.vtcm"):
                     ax0_1 = T.axis.spatial(1, 0)
-                    ax1_1 = T.axis.spatial(2, i % 7168 // 3584)
-                    ax2_1 = T.axis.spatial(112, (po * 28 + i // 7168) // 14 * 14 + i % 3584 // 256)
-                    ax3_1 = T.axis.spatial(112, (po * 28 + i // 7168) % 14 * 8 + i % 256 // 32)
-                    ax4 = T.axis.spatial(32, i % 32)
-                    T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
-                    T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
-                    T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[
+                    ax1_1 = T.axis.spatial(2, index_0 % 7168 // 3584)
+                    ax2_1 = T.axis.spatial(
+                        112, (p_outer * 28 + index_0 // 7168) // 14 * 14 + index_0 % 3584 // 256
+                    )
+                    ax3_1 = T.axis.spatial(
+                        112, (p_outer * 28 + index_0 // 7168) % 14 * 8 + index_0 % 256 // 32
+                    )
+                    ax4 = T.axis.spatial(32, index_0 % 32)
+                    T.reads(conv2d_nchwc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
+                    T.writes(t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
+                    t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_nchwc_int8[
                         ax0_1, ax1_1, ax2_1, ax3_1, ax4
                     ]
 
@@ -586,122 +628,138 @@ def main(
 # from tvm.script import tir as T
 @tvm.script.ir_module
 class ModuleBase:
+    """Base module test class."""
+
+    # pylint: disable=no-self-argument
     @T.prim_func
     def main(
-        p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"],
-        p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"],
-        T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"],
+        p0_buffer: T.Buffer[(1, 1, 230, 230, 4), "uint8"],
+        p1_buffer: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"],
+        t_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"],
     ) -> None:
+        # pylint: disable=missing-function-docstring
         # function attr dict
         T.func_attr({"tir.noalias": True, "global_symbol": "main"})
         # buffer definition
         # body
         # with T.block("root")
-        conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32")
+        conv2d_nchwc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32")
         for i0_0_i1_0_i2_0_i3_0_fused in T.parallel(
             112, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}
         ):
-            for i4_0_0 in T.serial(1):
+            for i4_0_0 in T.serial(1):  # pylint: disable=unused-variable
                 for i1_1_init, i2_1_init, i3_1_init, i1_2_init, i2_2_init, i3_2_init in T.grid(
                     2, 1, 1, 1, 14, 8
                 ):
                     with T.block("conv2d_NCHWc_int8_o_init"):
                         n = T.axis.spatial(1, 0)
                         oc_chunk = T.axis.spatial(2, i1_1_init + i1_2_init)
-                        oh = T.axis.spatial(
+                        o_height = T.axis.spatial(
                             112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1_init * 14 + i2_2_init
                         )
-                        ow = T.axis.spatial(
+                        o_width = T.axis.spatial(
                             112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1_init * 8 + i3_2_init
                         )
-                        oc_block_o = T.axis.spatial(1, 0)
+                        oc_block_o = T.axis.spatial(1, 0)  # pylint: disable=unused-variable
                         T.reads()
-                        T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
+                        T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32])
                         for i4_1 in T.vectorized(32):
                             with T.block("conv2d_NCHWc_int8_init"):
                                 oc_block_i_init = T.axis.spatial(32, i4_1)
                                 T.reads()
-                                T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init])
-                                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0
-                for i5_0, i6_0, i7_0, i8_0, i9_0_0 in T.grid(1, 1, 1, 1, 1):
+                                T.writes(
+                                    conv2d_nchwc_int8[
+                                        n, oc_chunk, o_height, o_width, oc_block_i_init
+                                    ]
+                                )
+                                conv2d_nchwc_int8[
+                                    n, oc_chunk, o_height, o_width, oc_block_i_init
+                                ] = 0
+                for i5_0, i6_0, i7_0, i8_0, i9_0_0 in T.grid(  # pylint: disable=unused-variable
+                    1, 1, 1, 1, 1
+                ):  # pylint: disable=unused-variable
                     for (
-                        i0_1,
+                        i0_1,  # pylint: disable=unused-variable
                         i1_1,
                         i2_1,
                         i3_1,
-                        i4_0_1,
+                        i4_0_1,  # pylint: disable=unused-variable
                         i5_1,
                         i6_1,
-                        i7_1,
-                        i8_1,
-                        i9_0_1,
-                        i0_2,
+                        i7_1,  # pylint: disable=unused-variable
+                        i8_1,  # pylint: disable=unused-variable
+                        i9_0_1,  # pylint: disable=unused-variable
+                        i0_2,  # pylint: disable=unused-variable
                         i1_2,
                         i2_2,
                         i3_2,
-                        i4_0_2,
+                        i4_0_2,  # pylint: disable=unused-variable
                     ) in T.grid(1, 2, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 14, 8, 1):
                         with T.block("conv2d_NCHWc_int8_o_update"):
                             n = T.axis.spatial(1, 0)
                             oc_chunk = T.axis.spatial(2, i1_1 + i1_2)
-                            oh = T.axis.spatial(
+                            o_height = T.axis.spatial(
                                 112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1 * 14 + i2_2
                             )
-                            ow = T.axis.spatial(
+                            o_width = T.axis.spatial(
                                 112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1 * 8 + i3_2
                             )
-                            oc_block_o = T.axis.spatial(1, 0)
-                            kh = T.axis.reduce(7, i5_0 * 7 + i5_1)
-                            kw = T.axis.reduce(7, i6_0 * 7 + i6_1)
+                            oc_block_o = T.axis.spatial(1, 0)  # pylint: disable=unused-variable
+                            k_height = T.axis.reduce(7, i5_0 * 7 + i5_1)
+                            k_width = T.axis.reduce(7, i6_0 * 7 + i6_1)
                             ic_outer = T.axis.reduce(1, 0)
                             ic_f_inner = T.axis.reduce(1, 0)
-                            ic_s_inner_o = T.axis.reduce(1, 0)
+                            ic_s_inner_o = T.axis.reduce(1, 0)  # pylint: disable=unused-variable
                             T.reads(
-                                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
-                                p0[
+                                conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32],
+                                p0_buffer[
                                     n,
                                     ic_outer,
-                                    oh * 2 + kh,
-                                    ow * 2 + kw,
+                                    o_height * 2 + k_height,
+                                    o_width * 2 + k_width,
                                     ic_f_inner * 4 : ic_f_inner * 4 + 4,
                                 ],
-                                p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                                p1_buffer[
+                                    oc_chunk, ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4
+                                ],
                             )
-                            T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
-                            A = T.match_buffer(
-                                p0[
+                            T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32])
+                            a_buffer = T.match_buffer(
+                                p0_buffer[
                                     n,
                                     ic_outer,
-                                    oh * 2 + kh,
-                                    ow * 2 + kw,
+                                    o_height * 2 + k_height,
+                                    o_width * 2 + k_width,
                                     ic_f_inner * 4 : ic_f_inner * 4 + 4,
                                 ],
                                 [4],
                                 dtype="uint8",
                                 offset_factor=1,
                             )
-                            B = T.match_buffer(
-                                p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                            b_buffer = T.match_buffer(
+                                p1_buffer[
+                                    oc_chunk, ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4
+                                ],
                                 [32, 4],
                                 dtype="int8",
                                 offset_factor=1,
                             )
-                            C = T.match_buffer(
-                                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                            c_buffer = T.match_buffer(
+                                conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32],
                                 [32],
                                 dtype="int32",
                                 offset_factor=1,
                             )
-                            A_u8x4: T.uint8x4 = A[0:4]
-                            A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
-                            B_i8x128 = B[0, 0:128]
-                            B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
-                            C[0:32] = T.call_llvm_pure_intrin(
+                            a_u8x4: T.uint8x4 = a_buffer[0:4]
+                            a_i32: T.int32 = T.reinterpret(a_u8x4, dtype="int32")
+                            b_i8x128 = b_buffer[0, 0:128]
+                            b_i32x32: T.int32x32 = T.reinterpret(b_i8x128, dtype="int32x32")
+                            c_buffer[0:32] = T.call_llvm_pure_intrin(
                                 4217,
                                 T.uint32(3),
-                                C[0:32],
-                                T.broadcast(A_i32, 32),
-                                B_i32x32,
+                                c_buffer[0:32],
+                                T.broadcast(a_i32, 32),
+                                b_i32x32,
                                 dtype="int32x32",
                             )
                     for ax0, ax1, ax2, ax3 in T.grid(1, 2, 14, 8):
@@ -715,47 +773,57 @@ def main(
                                     112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + ax3
                                 )
                                 ax4 = T.axis.spatial(32, ax4_fused)
-                                T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
-                                T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
-                                T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[
+                                T.reads(conv2d_nchwc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
+                                T.writes(t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
+                                t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_nchwc_int8[
                                     ax0_1, ax1_1, ax2_1, ax3_1, ax4
                                 ]
 
 
 @tvm.testing.requires_hexagon
 def test_meta(hexagon_session):
+    """Test meta."""
     if tvm.testing.utils.IS_IN_CI:
         pytest.skip("Skipping test since it takes too long in CI.")
 
-    a = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8")
-    w = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8")
-    c = np.zeros((1, 2, 112, 112, 32), dtype="int32")
+    a_data = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8")
+    w_data = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8")
+    c_data = np.zeros((1, 2, 112, 112, 32), dtype="int32")
 
     sch = tvm.tir.Schedule(ModuleBase)
-    base_runtime = evaluate(hexagon_session, sch, a, w, c)
+    base_runtime = evaluate(hexagon_session, sch, a_data, w_data, c_data)
 
     sch = tvm.tir.Schedule(ModulePipelined)
     compute_block = sch.get_block("conv2d_NCHWc_int8_o_update")
-    o = sch.get_loops(compute_block)[0]
+    outer = sch.get_loops(compute_block)[0]
 
-    unscheduled_vtcm_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1)
+    unscheduled_vtcm_runtime = evaluate(
+        hexagon_session, sch, a_data, w_data, c_data, use_async_copy=1
+    )
 
     sch = tvm.tir.Schedule(ModulePipelined)
     compute_block = sch.get_block("conv2d_NCHWc_int8_o_update")
-    o = sch.get_loops(compute_block)[0]
+    outer = sch.get_loops(compute_block)[0]
 
-    sch.annotate(o, "software_pipeline_stage", [0, 1, 2])
-    sch.annotate(o, "software_pipeline_order", [0, 1, 2])
-    sch.annotate(o, "software_pipeline_async_stages", [0, 2])
+    sch.annotate(outer, "software_pipeline_stage", [0, 1, 2])
+    sch.annotate(outer, "software_pipeline_order", [0, 1, 2])
+    sch.annotate(outer, "software_pipeline_async_stages", [0, 2])
 
-    pipeline_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1)
+    pipeline_runtime = evaluate(hexagon_session, sch, a_data, w_data, c_data, use_async_copy=1)
 
-    transfer_mb = round((a.size + w.size + c.size) / 1e6, 2)
+    transfer_mb = round((a_data.size + w_data.size + c_data.size) / 1e6, 2)
     print_results(
-        f"Test with A.size: {a.size}, W.size: {w.size}, and total memory transfer of {transfer_mb} MB...",
+        (
+            f"Test with a_buffer.size: {a_data.size}, w_buffer.size: {w_data.size}"
+            f", and total memory transfer of {transfer_mb} MB..."
+        ),
         {
             "without_vtcm": base_runtime,
             "unscheduled_vtcm_runtime": unscheduled_vtcm_runtime,
             "pipeline_runtime": pipeline_runtime,
         },
     )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
index 7e8a6d79f492..24d1a3f788cf 100644
--- a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
+++ b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
@@ -28,7 +28,7 @@
     - Testing parameters (input shapes, dtypes, etc.) currently
       support only one value for each parameter.
 
-    - H, W, C must be integer multiples of 8, 8, and 32,
+    - height, width, channel must be integer multiples of 8, 8, and 32,
       respectively.  I.e., partial blocks aren't currently
       supported by this script.
 
@@ -42,23 +42,25 @@
       primfuncs and demonstrate more coding strategies.
 """
 
-import pytest
-import numpy as np
+from typing import List
 import copy
 import os
 
+import pytest
+import numpy as np
+
 import tvm.testing
 from tvm import te, topi, tir
 from tvm.topi import testing
 from tvm.contrib.hexagon.session import Session
-from typing import List
+
 
 from .infrastructure import allocate_hexagon_array, get_hexagon_target
 from . import benchmark_util as bu
 
 # Pytest seems to require that fixture names exist in the current module.
 # E.g., it doesn't allow: @pytest.mark.usefixtures("bu.benchmark_group")
-benchmark_group = bu.benchmark_group
+BENCHMARK_GROUP = bu.benchmark_group
 
 _SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_benchmarks_flag_and_reason()
 
@@ -67,25 +69,25 @@ def _ceil_div(numerator, denominator):
     return (numerator + (denominator - 1)) // denominator
 
 
-def _int8_nhwc_8h8w32c_map(n, h, w, c):
+def _int8_nhwc_8h8w32c_map(n_batch, height, width, channel):
     return [
-        n,
-        h // 8,
-        w // 8,
-        c // 32,
+        n_batch,
+        height // 8,
+        width // 8,
+        channel // 32,
         te.AXIS_SEPARATOR,
-        h % 8,
-        w % 8,
-        c % 32,
+        height % 8,
+        width % 8,
+        channel % 32,
     ]
 
 
-def _int8_nhwc_8h8w32c_shape(n, h, w, c) -> List[int]:
+def _int8_nhwc_8h8w32c_shape(n_batch, height, width, channel) -> List[int]:
     return [
-        n,
-        _ceil_div(h, 8),
-        _ceil_div(w, 8),
-        _ceil_div(c, 32),
+        n_batch,
+        _ceil_div(height, 8),
+        _ceil_div(width, 8),
+        _ceil_div(channel, 32),
         8,
         8,
         32,
@@ -100,10 +102,10 @@ def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray:
     stage1 = copy.copy(arr_in)
 
     (
-        n,
-        h,
-        w,
-        c,
+        n_batch,
+        height,
+        width,
+        channel,
     ) = stage1.shape
 
     (
@@ -112,9 +114,9 @@ def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray:
         c_minor,
     ) = [8, 8, 32]
 
-    h_major = _ceil_div(h, h_minor)
-    w_major = _ceil_div(w, w_minor)
-    c_major = _ceil_div(c, c_minor)
+    h_major = _ceil_div(height, h_minor)
+    w_major = _ceil_div(width, w_minor)
+    c_major = _ceil_div(channel, c_minor)
 
     # This handles cases where the dimensions of arr_in are not cleanly divided
     # by the minor block size, i.e. [8, 8, 32].
@@ -122,10 +124,12 @@ def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray:
     # Any additional array elements that this creates will ahve value 0.
     # We shouldn't actually care what value is used for those elements, because they
     # shouldn't be treated as meaningful by any of our algorithms.
-    if (h % h_minor) or (w % w_minor) or (c % c_minor):
-        stage1.resize((n, h_major * h_minor, w_major * w_minor, c_major * c_minor), refcheck=False)
+    if (height % h_minor) or (width % w_minor) or (channel % c_minor):
+        stage1.resize(
+            (n_batch, h_major * h_minor, w_major * w_minor, c_major * c_minor), refcheck=False
+        )
 
-    stage2 = stage1.reshape(n, h_major, h_minor, w_major, w_minor, c_major, c_minor)
+    stage2 = stage1.reshape(n_batch, h_major, h_minor, w_major, w_minor, c_major, c_minor)
     stage3 = stage2.transpose(0, 1, 3, 5, 2, 4, 6)
     return stage3
 
@@ -137,8 +141,10 @@ def _create_test_input(shape, dtype: str) -> np.ndarray:
     return np.random.randint(low=min_value, high=max_value, size=tuple(shape), dtype=np.int8)
 
 
-@pytest.mark.usefixtures("benchmark_group")
+@pytest.mark.usefixtures("BENCHMARK_GROUP")
 class TestMaxPool2D:
+    """maxpool2D base test class"""
+
     csv_column_order = [
         # Identifies which TE-compute / TIRScript is used as the basis for the
         # benchmarked primfunc. Only needs to be meaningful to humans.
@@ -150,12 +156,12 @@ class TestMaxPool2D:
         # Values directly based on test parameters...
         "input_shape_4d",
         "block_shape",
-        "DTYPE",
-        "KERNEL",
-        "STRIDE",
-        "DILATION",
-        "PADDING",
-        "IO_TENSOR_MEM_SCOPE",
+        "dtype",
+        "kernel",
+        "stride",
+        "dilation",
+        "padding",
+        "io_tensor_mem_scope",
         # Reserved columns defined by the BenchmarksTable class.
         "row_status",
         "timings_min_usecs",
@@ -170,48 +176,50 @@ class TestMaxPool2D:
         "comments",
     ]
 
-    DTYPE = tvm.testing.parameter("int8")
+    dtype = tvm.testing.parameter("int8")
 
-    # FIXME(cconvey): The script currently fails when H, W, or C is not an
+    # FIXME(cconvey): The script currently fails when height, width, or channel is not an
     # integer multiple of 8, 8, or 32, respectively.
-    N = tvm.testing.parameter(1)
-    H = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]])
-    W = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]])
-    C = tvm.testing.parameter(*[x * 32 for x in [1, 2]])
+    n_batch = tvm.testing.parameter(1)
+    height = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]])
+    width = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]])
+    channel = tvm.testing.parameter(*[x * 32 for x in [1, 2]])
 
-    KERNEL = tvm.testing.parameter((1, 1), (3, 3))
-    STRIDE = tvm.testing.parameter((1, 1))
-    DILATION = tvm.testing.parameter((1, 1))
-    PADDING = tvm.testing.parameter((0, 0, 0, 0))
-    IO_TENSOR_MEM_SCOPE = tvm.testing.parameter("global.vtcm")
+    kernel = tvm.testing.parameter((1, 1), (3, 3))
+    stride = tvm.testing.parameter((1, 1))
+    dilation = tvm.testing.parameter((1, 1))
+    padding = tvm.testing.parameter((0, 0, 0, 0))
+    io_tensor_mem_scope = tvm.testing.parameter("global.vtcm")
 
     @pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON)
     @tvm.testing.requires_hexagon
     def test_maxpool2d_nhwc(
         self,
-        N,
-        H,
-        W,
-        C,
-        DTYPE,
-        KERNEL,
-        STRIDE,
-        DILATION,
-        PADDING,
-        IO_TENSOR_MEM_SCOPE,
+        n_batch,
+        height,
+        width,
+        channel,
+        dtype,
+        kernel,
+        stride,
+        dilation,
+        padding,
+        io_tensor_mem_scope,
         hexagon_session: Session,
     ):
+        """Test maxpool2d NHWC"""
+
         keys_dict = {
             "basic_kernel": "max_pool2d",
             "sched_type": 1,
-            "input_shape_4d": [N, H, W, C],
+            "input_shape_4d": [n_batch, height, width, channel],
             "block_shape": [8, 8, 32],
-            "DTYPE": DTYPE,
-            "KERNEL": KERNEL,
-            "STRIDE": STRIDE,
-            "DILATION": DILATION,
-            "PADDING": PADDING,
-            "IO_TENSOR_MEM_SCOPE": IO_TENSOR_MEM_SCOPE,
+            "dtype": dtype,
+            "kernel": kernel,
+            "stride": stride,
+            "dilation": dilation,
+            "padding": padding,
+            "io_tensor_mem_scope": io_tensor_mem_scope,
         }
 
         desc = bu.get_benchmark_decription(keys_dict)
@@ -229,13 +237,13 @@ def test_maxpool2d_nhwc(
             log_file.write(f"CONFIGURATION: {desc}\n")
 
             try:
-                input_tensor_shape_4d = [N, H, W, C]
-                input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(N, H, W, C)
+                input_tensor_shape_4d = [n_batch, height, width, channel]
+                input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(n_batch, height, width, channel)
 
-                data = te.placeholder(tuple(input_tensor_shape_4d), dtype=DTYPE)
+                data = te.placeholder(tuple(input_tensor_shape_4d), dtype=dtype)
 
                 output = topi.nn.pool2d(
-                    data, KERNEL, STRIDE, DILATION, PADDING, "max", layout="NHWC"
+                    data, kernel, stride, dilation, padding, "max", layout="NHWC"
                 )
                 primfunc = te.create_prim_func([data, output])
 
@@ -262,20 +270,21 @@ def test_maxpool2d_nhwc(
                 # Note that we'll eventually need it in two different layouts:
                 # (1) NHWC as an argument to testing.poolnd_python.
                 # (2) NHWC_8h8w32c for as an argument to our Hexagon primfunc.
-                # a_numpy_4d = np.random.randint(low=-128, high=127, size=input_tensor_shape_4d, dtype=np.int8)
-                a_numpy_4d = _create_test_input(input_tensor_shape_4d, DTYPE)
+                # a_numpy_4d = np.random.randint(low=-128, high=127,
+                #   size=input_tensor_shape_4d, dtype=np.int8)
+                a_numpy_4d = _create_test_input(input_tensor_shape_4d, dtype)
 
                 ref_output_4d = testing.poolnd_python(
                     a_numpy_4d.astype("int32"),
-                    KERNEL,
-                    STRIDE,
-                    DILATION,
-                    PADDING[0:2],
-                    PADDING[2:],
+                    kernel,
+                    stride,
+                    dilation,
+                    padding[0:2],
+                    padding[2:],
                     pool_type="max",
                     dtype="int32",
                     layout="NHWC",
-                ).astype(DTYPE)
+                ).astype(dtype)
 
                 output_tensor_shape_4d = ref_output_4d.shape
 
@@ -285,28 +294,25 @@ def test_maxpool2d_nhwc(
                     hexagon_session.device,
                     tensor_shape=input_tensor_shape_7d,
                     axis_separators=[4],
-                    dtype=DTYPE,
-                    mem_scope=IO_TENSOR_MEM_SCOPE,
+                    dtype=dtype,
+                    mem_scope=io_tensor_mem_scope,
                 )
 
                 c_hexagon_4d = allocate_hexagon_array(
                     hexagon_session.device,
                     tensor_shape=output_tensor_shape_4d,
                     axis_separators=[],
-                    dtype=DTYPE,
-                    mem_scope=IO_TENSOR_MEM_SCOPE,
+                    dtype=dtype,
+                    mem_scope=io_tensor_mem_scope,
                 )
 
                 a_hexagon_7d.copyfrom(a_numpy_7d)
 
-                if DTYPE == "int8":
+                if dtype == "int8":
                     rel_tolerance = 0
                     abs_tolerance = 0
                 else:
-                    assert False, f"TODO: decide acceptable tolerances for DTYPE {DTYPE}"
-
-                # hexagon_mod(a_hexagon_7d, c_hexagon_4d)
-                # tvm.testing.assert_allclose(ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance)
+                    assert False, f"TODO: decide acceptable tolerances for dtype {dtype}"
 
                 timer = hexagon_mod.time_evaluator(
                     "main", hexagon_session.device, number=10, repeat=1
@@ -317,29 +323,29 @@ def test_maxpool2d_nhwc(
                     tvm.testing.assert_allclose(
                         ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance
                     )
-                except AssertionError as e:
-                    raise bu.NumericalAccuracyException(str(e))
+                except AssertionError as exception:
+                    raise bu.NumericalAccuracyException(str(exception))
 
-            except bu.NumericalAccuracyException as e:
+            except bu.NumericalAccuracyException as exception:
                 print()
                 print(f"FAIL: Numerical accuracy error. See log file.")
 
                 log_file.write("\n")
-                log_file.write(f"FAIL: {e}\n")
+                log_file.write(f"FAIL: {exception}\n")
 
                 self.benchmark_table.record_fail(
                     **keys_dict, comments=f"Numerical accuracy error. See log file."
                 )
 
-            except bu.UnsupportedException as e:
+            except bu.UnsupportedException as exception:
                 print()
-                print(f"SKIP: {e}")
+                print(f"SKIP: {exception}")
 
                 log_file.write("\n")
-                log_file.write(f"SKIP: {e}\n")
+                log_file.write(f"SKIP: {exception}\n")
 
                 self.benchmark_table.record_skip(
-                    **keys_dict, comments=f"Unsupported configuration: {e}"
+                    **keys_dict, comments=f"Unsupported configuration: {exception}"
                 )
 
             self.benchmark_table.record_success(timing_result, **keys_dict)
diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py
index af5e7a398870..3ac297fd80d8 100644
--- a/tests/python/contrib/test_hexagon/test_cache_read_write.py
+++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py
@@ -220,3 +220,7 @@ def _visit(stmt):
         "AllocateNode found in lowered IRModule, "
         "VTCM allocations should have been lowered to tir.nd_mem_alloc_with_scope"
     )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py b/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
index 5ec46cf4ae70..40edbda550b7 100644
--- a/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
@@ -15,11 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import math
-import struct
-import numpy as np
-import tvm.topi.hexagon.utils as utils
-
 """
 Test float to fixed-point conversion. We do it by constructing a numpy array with the
 wide range of floating-point values. These values are converted into the
@@ -29,9 +24,17 @@
 raised if they happened to be outside of the expected tolerance.
 """
 
+import math
+import struct
+import numpy as np
+import tvm.topi.hexagon.utils as utils
+
 
 class TestFixedPointConversion:
+    """Fixed point conversation test class"""
+
     def test_fixed_point_conversion(self):
+        """Test fixed point conversion"""
         # Construct array with wide range of values
         fp1 = np.random.uniform(0.00001, 0.0002, size=(10))
         fp2 = np.random.uniform(0.001, 0.02, size=(10))
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
index 43feb827af42..5eac35f2d683 100644
--- a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Test Fixed Point Multiply on Hexagon."""
+
 import re
 import numpy as np
 
@@ -80,91 +82,92 @@ def run_module(mod, inputs):
     return output
 
 
-in_scale_const, out_scale_const = tvm.testing.parameters(
-    (1.3, 30.0),
-    (1.37, 1.0),
-    (0.6, 1.0),
-    ((1.7, 0.6), 1.0),
-    ((0.007, 1.9), 1.0),
-)
-
-multiplier, shift = tvm.testing.parameters(
-    (1288490240, -2),  # 0.15
-    (1395864320, 1),  # 1.3
-    (1288490188, 0),  # 0.6
-)
-
-
-@tvm.testing.requires_hexagon
-def test_fixed_point_multiply(hexagon_session: Session, multiplier: int, shift: int):
-    ishape = (6, 32)
-    a = relay.var("a", relay.TensorType(ishape, "int32"))
-    fpm = relay.fixed_point_multiply(a, multiplier, shift)
-    relay_mod = tvm.IRModule.from_expr(fpm)
-
-    with tvm.transform.PassContext(opt_level=3):
-        # Compile for Hexagon...
-        hexagon_lowered = build_module(relay_mod, HEXAGON_AOT_LLVM_TARGET)
-
-        # Compile for LLVM...
-        llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm"))
-
-    data_in = np.arange(-96, 96).reshape(ishape)
-    inputs = {"a": data_in}
-
-    # Run hexagon...
-    hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    hexagon_output = run_module(hexagon_mod, inputs)
+class TestFixedPointMultiply:
+    """Fixed point Multiply test class"""
 
-    # Run llvm...
-    llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
-    expected_output = run_module(llvm_mod, inputs)
-
-    tvm.testing.assert_allclose(hexagon_output, expected_output)
-
-
-@tvm.testing.requires_hexagon
-def test_per_channel_fixed_point_multiply(
-    hexagon_session: Session, in_scale_const, out_scale_const
-):
-    ishape = [1, 128, 56, 56]
-    axis = 1
-    a = relay.var("a", shape=ishape, dtype="int32")
-
-    # Make list of input scales from in_scale_const parameter.
-    if isinstance(in_scale_const, tuple):
-        in_scale = list(in_scale_const) * (ishape[axis] // len(in_scale_const))
-    else:
-        in_scale = [in_scale_const] * ishape[axis]
-    assert len(in_scale) == ishape[axis]
-
-    # qnn.requantize is lowered to fixed_point_multiply if zp == 0 and in_dtype == out_dtype.
-    iscale = relay.const(in_scale)
-    izero = relay.const(0)
-    oscale = relay.const(out_scale_const)
-    ozero = relay.const(0)
-    op = relay.qnn.op.requantize(a, iscale, izero, oscale, ozero, axis=axis, out_dtype="int32")
-    mod = tvm.IRModule.from_expr(op)
-
-    with tvm.transform.PassContext(opt_level=3):
-        # Compile for Hexagon...
-        hexagon_lowered = build_module(mod, HEXAGON_AOT_LLVM_TARGET)
-
-        # Compile for LLVM...
-        llvm_lowered = build_module(mod, tvm.target.Target("llvm"))
-
-    a_np = np.random.randint(-1000, 1000, size=np.prod(ishape)).reshape(ishape)
-    inputs = {"a": a_np}
-
-    # Run hexagon...
-    hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    hexagon_output = run_module(hexagon_mod, inputs)
+    in_scale_const, out_scale_const = tvm.testing.parameters(
+        (1.3, 30.0),
+        (1.37, 1.0),
+        (0.6, 1.0),
+        ((1.7, 0.6), 1.0),
+        ((0.007, 1.9), 1.0),
+    )
 
-    # Run llvm...
-    llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
-    expected_output = run_module(llvm_mod, inputs)
+    multiplier, shift = tvm.testing.parameters(
+        (1288490240, -2),  # 0.15
+        (1395864320, 1),  # 1.3
+        (1288490188, 0),  # 0.6
+    )
 
-    tvm.testing.assert_allclose(hexagon_output, expected_output)
+    @tvm.testing.requires_hexagon
+    def test_fixed_point_multiply(self, hexagon_session: Session, multiplier: int, shift: int):
+        """Fixed point multiply test."""
+        ishape = (6, 32)
+        a = relay.var("a", relay.TensorType(ishape, "int32"))
+        fpm = relay.fixed_point_multiply(a, multiplier, shift)
+        relay_mod = tvm.IRModule.from_expr(fpm)
+
+        with tvm.transform.PassContext(opt_level=3):
+            # Compile for Hexagon...
+            hexagon_lowered = build_module(relay_mod, HEXAGON_AOT_LLVM_TARGET)
+
+            # Compile for LLVM...
+            llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm"))
+
+        data_in = np.arange(-96, 96).reshape(ishape)
+        inputs = {"a": data_in}
+
+        # Run hexagon...
+        hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
+        hexagon_output = run_module(hexagon_mod, inputs)
+
+        # Run llvm...
+        llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
+        expected_output = run_module(llvm_mod, inputs)
+
+        tvm.testing.assert_allclose(hexagon_output, expected_output)
+
+    @tvm.testing.requires_hexagon
+    def test_per_channel(self, hexagon_session: Session, in_scale_const, out_scale_const):
+        """Per channel multiply test."""
+        ishape = [1, 128, 56, 56]
+        axis = 1
+        a = relay.var("a", shape=ishape, dtype="int32")
+
+        # Make list of input scales from in_scale_const parameter.
+        if isinstance(in_scale_const, tuple):
+            in_scale = list(in_scale_const) * (ishape[axis] // len(in_scale_const))
+        else:
+            in_scale = [in_scale_const] * ishape[axis]
+        assert len(in_scale) == ishape[axis]
+
+        # qnn.requantize is lowered to fixed_point_multiply if zp == 0 and in_dtype == out_dtype.
+        iscale = relay.const(in_scale)
+        izero = relay.const(0)
+        oscale = relay.const(out_scale_const)
+        ozero = relay.const(0)
+        op = relay.qnn.op.requantize(a, iscale, izero, oscale, ozero, axis=axis, out_dtype="int32")
+        mod = tvm.IRModule.from_expr(op)
+
+        with tvm.transform.PassContext(opt_level=3):
+            # Compile for Hexagon...
+            hexagon_lowered = build_module(mod, HEXAGON_AOT_LLVM_TARGET)
+
+            # Compile for LLVM...
+            llvm_lowered = build_module(mod, tvm.target.Target("llvm"))
+
+        a_np = np.random.randint(-1000, 1000, size=np.prod(ishape)).reshape(ishape)
+        inputs = {"a": a_np}
+
+        # Run hexagon...
+        hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
+        hexagon_output = run_module(hexagon_mod, inputs)
+
+        # Run llvm...
+        llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0)))
+        expected_output = run_module(llvm_mod, inputs)
+
+        tvm.testing.assert_allclose(hexagon_output, expected_output)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_hexagon/test_memory_alloc.py b/tests/python/contrib/test_hexagon/test_memory_alloc.py
index a6d011eddd5a..f44e3cd0dc36 100644
--- a/tests/python/contrib/test_hexagon/test_memory_alloc.py
+++ b/tests/python/contrib/test_hexagon/test_memory_alloc.py
@@ -14,8 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-import os.path
+"""Test memory allocation."""
 
 import numpy as np
 
@@ -25,23 +24,25 @@
 from .infrastructure import allocate_hexagon_array, get_hexagon_target
 
 
-@tvm.testing.fixture
-def generated_func(shape, scope, dtype, axis_separators):
+def generated_func(shape: tuple, dtype: str, axis_separators: list):
+    """Generate element wise function."""
     dim0, dim1 = shape
 
     @T.prim_func
     def elwise(a: T.handle, b: T.handle):
-        A = T.match_buffer(a, shape, dtype=dtype, axis_separators=axis_separators)
-        B = T.match_buffer(b, shape, dtype=dtype, axis_separators=axis_separators)
+        a_buffer = T.match_buffer(a, shape, dtype=dtype, axis_separators=axis_separators)
+        b_buffer = T.match_buffer(b, shape, dtype=dtype, axis_separators=axis_separators)
 
         for i, j in T.grid(dim0, dim1):
             with T.block("compute"):
-                B[i, j] = A[i, j] * T.cast(2, dtype=dtype)
+                b_buffer[i, j] = a_buffer[i, j] * T.cast(2, dtype=dtype)
 
     return elwise
 
 
 class TestMemoryAlloc:
+    """Memory allocation test."""
+
     dtype = tvm.testing.parameter("int8")
     shape = tvm.testing.parameter((128, 128))
 
@@ -53,11 +54,10 @@ class TestMemoryAlloc:
         ("global.ddr", [1]),
     )
 
-    def test_global_axis_separator(
-        self, hexagon_session, generated_func, shape, dtype, scope, axis_separators
-    ):
+    def test_global_axis_separator(self, hexagon_session, shape, dtype, scope, axis_separators):
+        """Test with global axis separator."""
         mod1 = tvm.build(
-            generated_func,
+            generated_func(shape, dtype, axis_separators),
             target=get_hexagon_target("v69"),
         )
         mod2 = hexagon_session.load_module(mod1)
diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py
index a7f4cbc39cb1..a83a3b279a7f 100644
--- a/tests/python/contrib/test_hexagon/test_meta_schedule.py
+++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py
@@ -44,24 +44,30 @@
 
 @tvm.script.ir_module
 class MatmulModule:
+    """Matmultest class"""
+
+    # pylint: disable=no-self-argument
     @T.prim_func
-    def main(  # type: ignore  # pylint: disable=no-self-argument
-        a: T.handle, b: T.handle, c: T.handle
-    ) -> None:
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # type: ignore
+        # pylint: disable=missing-function-docstring
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, (16, 16), "float32")
-        B = T.match_buffer(b, (16, 16), "float32")
-        C = T.match_buffer(c, (16, 16), "float32")
+        a_buffer = T.match_buffer(a, (16, 16), "float32")
+        b_buffer = T.match_buffer(b, (16, 16), "float32")
+        c_buffer = T.match_buffer(c, (16, 16), "float32")
         for i, j, k in T.grid(16, 16, 16):
             with T.block("matmul"):
-                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                vi_axis, vj_axis, vk_axis = T.axis.remap("SSR", [i, j, k])
                 with T.init():
-                    C[vi, vj] = 0.0  # type: ignore
-                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+                    c_buffer[vi_axis, vj_axis] = 0.0  # type: ignore
+                c_buffer[vi_axis, vj_axis] = (
+                    c_buffer[vi_axis, vj_axis]
+                    + a_buffer[vi_axis, vk_axis] * b_buffer[vk_axis, vj_axis]
+                )
 
 
 @tvm.testing.requires_hexagon
 def test_builder_runner(hexagon_launcher):
+    """Test builder and runner."""
     if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
@@ -96,33 +102,35 @@ def test_builder_runner(hexagon_launcher):
         assert result >= 0.0
 
 
-def dense(m, n, k):
+def dense_compute(m, n, k):
+    """dense compute"""
     X = te.placeholder((m, k), name="X", dtype="uint8")
-    packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8")
+    packed_width = te.placeholder((n // 32, k // 4, 32, 4), name="packed_width", dtype="uint8")
 
-    ak = te.reduce_axis((0, k), name="k")
+    axis_k = te.reduce_axis((0, k), name="k")
     out = te.compute(
         (m, n),
         lambda i, j: te.sum(
-            X[i, ak].astype("int32")
-            * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
-                "int32"
-            ),
-            axis=ak,
+            X[i, axis_k].astype("int32")
+            * packed_width[
+                tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(axis_k, 4), j % 32, axis_k % 4
+            ].astype("int32"),
+            axis=axis_k,
         ),
         name="compute",
     )
-    return [X, packedW, out]
+    return [X, packed_width, out]
 
 
-def schedule_dense(sch, block, M, do_tune):
+def schedule_dense(sch, block, m_size, do_tune):
+    """dense schedule"""
     a_y, a_x, _ = sch.get_loops(block)[-3:]
 
     if do_tune:
         y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
         a_yo, a_yi = sch.split(a_y, factors=y_factors)
     else:
-        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])
+        a_yo, a_yi = sch.split(a_y, factors=[None, min(m_size, 32)])
 
     a_xo, a_xi = sch.split(a_x, factors=[None, 32])
     sch.reorder(a_yo, a_xo, a_yi, a_xi)
@@ -143,51 +151,55 @@ def schedule_dense(sch, block, M, do_tune):
     sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
 
 
-def verify_dense(sch, target, M, N, K, hexagon_session):
+def verify_dense(sch, target, m_size, n_size, k_size, hexagon_session):
+    """Verify dense operator."""
     f = tvm.build(sch.mod["main"], target=target, name="dense")
     mod = hexagon_session.load_module(f)
     dev = hexagon_session.device
 
-    a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
-    b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
+    a_np = np.random.uniform(1, 10, size=(m_size, k_size)).astype("uint8")
+    b_np = np.random.uniform(1, 10, size=(n_size, k_size)).astype("uint8")
     c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))
 
-    packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8")
+    pack_width = np.random.uniform(1, 10, size=(n_size // 32, (k_size // 4), 32, 4)).astype("uint8")
 
-    for r_idx in range(N // 32):
-        for ko in range(K // 4):
+    for r_idx in range(n_size // 32):
+        for k_output in range(k_size // 4):
             for s_idx in range(32):
                 for t_idx in range(4):
-                    packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx]
+                    pack_width[r_idx][k_output][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][
+                        k_output * 4 + t_idx
+                    ]
 
     a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(packW, dev)
-    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+    b = tvm.nd.array(pack_width, dev)
+    c = tvm.nd.array(np.zeros((m_size, n_size), dtype="int32"), dev)
 
     mod(a, b, c)
     np.testing.assert_equal(c.numpy(), c_np)
 
     evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
-    gflops = (N * M * K) * 2 / 1e9
+    gflops = (n_size * m_size * k_size) * 2 / 1e9
     time_ms = evaluator(a, b, c).mean * 1e3
     print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
 
 
 @tvm.testing.requires_hexagon
 def test_vrmpy_dense(hexagon_launcher):
+    """Test vector reduce muliply dense."""
     if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
     do_tune = True
 
-    M, N, K = 128, 768, 768
-    workload = te.create_prim_func(dense(M, N, K))
+    m_size, n_size, k_size = 128, 768, 768
+    workload = te.create_prim_func(dense_compute(m_size, n_size, k_size))
 
     if not do_tune:
         ir_module = tvm.IRModule({"main": workload})
         sch = tvm.tir.Schedule(ir_module)
         block = sch.get_block("compute")
-        schedule_dense(sch, block, M, do_tune)
+        schedule_dense(sch, block, m_size, do_tune)
     else:
         with tempfile.TemporaryDirectory() as work_dir:
 
@@ -214,19 +226,23 @@ def schedule_dense_for_tune(sch):
             sch = ms.tir_integration.compile_tir(database, workload, target)
 
     with hexagon_launcher.create_session() as session:
-        verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
+        verify_dense(sch, get_hexagon_target("v68"), m_size, n_size, k_size, session)
 
 
 # This is an example of a schedule found by vrmpy auto tensorization.
 # It gets 440 GFLOPS on SD888.
 @tvm.script.ir_module
-class Module_vrmpy_auto_tensorize:
+class ModuleVRMPYAutoTensorize:
+    """Vector Reduce Multimply auto tensorize test class."""
+
+    # pylint: disable=no-self-argument
     @T.prim_func
     def main(  # type: ignore
         X: T.Buffer[(128, 768), "uint8"],  # type: ignore
-        packedW: T.Buffer[(24, 192, 32, 4), "uint8"],  # type: ignore
+        packed_width: T.Buffer[(24, 192, 32, 4), "uint8"],  # type: ignore
         compute: T.Buffer[(128, 768), "int32"],  # type: ignore
     ) -> None:
+        # pylint: disable=missing-function-docstring
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         for i0_0_i1_0_0_fused in T.parallel(
             512, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}
@@ -251,33 +267,42 @@ def main(  # type: ignore
                     T.reads(
                         compute[i, j_o * 32 : j_o * 32 + 32],  # type: ignore
                         X[i, k_o * 4 : k_o * 4 + 4],  # type: ignore
-                        packedW[j_o, k_o, 0:32, 0:4],  # type: ignore
+                        packed_width[j_o, k_o, 0:32, 0:4],  # type: ignore
                     )
                     T.writes(compute[i, j_o * 32 : j_o * 32 + 32])  # type: ignore
-                    A = T.match_buffer(
-                        X[i, k_o * 4 : k_o * 4 + 4], [4], dtype="uint8", offset_factor=1  # type: ignore
+                    a_buffer = T.match_buffer(
+                        X[i, k_o * 4 : k_o * 4 + 4],
+                        [4],
+                        dtype="uint8",
+                        offset_factor=1,  # type: ignore
                     )
-                    B = T.match_buffer(
-                        packedW[j_o, k_o, 0:32, 0:4], [32, 4], dtype="uint8", offset_factor=1
+                    b_buffer = T.match_buffer(
+                        packed_width[j_o, k_o, 0:32, 0:4], [32, 4], dtype="uint8", offset_factor=1
                     )
-                    C = T.match_buffer(
-                        compute[i, j_o * 32 : j_o * 32 + 32], [32], dtype="int32", offset_factor=1  # type: ignore
+                    c_buffer = T.match_buffer(
+                        compute[i, j_o * 32 : j_o * 32 + 32],
+                        [32],
+                        dtype="int32",
+                        offset_factor=1,  # type: ignore
                     )
-                    A_u8x4: T.uint8x4 = A[0:4]  # type: ignore
-                    A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")  # type: ignore
-                    B_i32x32: T.int32x32 = T.reinterpret(B[0, 0:128], dtype="int32x32")  # type: ignore
-                    C[0:32] = T.call_llvm_pure_intrin(  # type: ignore
-                        4390, T.uint32(3), C[0:32], B_i32x32, A_i32, dtype="int32x32"
+                    a_u8x4: T.uint8x4 = a_buffer[0:4]  # type: ignore
+                    a_i32: T.int32 = T.reinterpret(a_u8x4, dtype="int32")  # type: ignore
+                    b_i32x32: T.int32x32 = T.reinterpret(
+                        b_buffer[0, 0:128], dtype="int32x32"
+                    )  # type: ignore
+                    c_buffer[0:32] = T.call_llvm_pure_intrin(  # type: ignore
+                        4390, T.uint32(3), c_buffer[0:32], b_i32x32, a_i32, dtype="int32x32"
                     )
 
 
 @tvm.testing.requires_hexagon
 def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
+    """Test VRMPY dense operator."""
     if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
-    M, N, K = 128, 768, 768
-    workload = te.create_prim_func(dense(M, N, K))
+    m_size, n_size, k_size = 128, 768, 768
+    workload = te.create_prim_func(dense_compute(m_size, n_size, k_size))
 
     sch_rules = [
         schedule_rule.MultiLevelTilingWithIntrin(
@@ -308,7 +333,8 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
     ]
 
     # Make this to False to compile and run the best tuned schedule
-    if True:
+    run_tuning = True
+    if run_tuning:
         with tempfile.TemporaryDirectory() as work_dir:
             target = get_hexagon_target("v68")
             database = ms.tir_integration.tune_tir(
@@ -328,25 +354,26 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
             )
             sch = ms.tir_integration.compile_tir(database, workload, target)
     else:
-        sch = tvm.tir.Schedule(Module_vrmpy_auto_tensorize, debug_mask="all")
+        sch = tvm.tir.Schedule(ModuleVRMPYAutoTensorize, debug_mask="all")
 
     with hexagon_launcher.create_session() as session:
-        verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
+        verify_dense(sch, get_hexagon_target("v68"), m_size, n_size, k_size, session)
 
 
 @tvm.testing.requires_hexagon
 def test_conv2d_relay_auto_schedule(hexagon_launcher):
+    """Test conv2d using auto schedule."""
     if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
-    I, O, H, W = 64, 64, 56, 56
-    kH = kW = 3
+    i_size, o_size, h_size, w_size = 64, 64, 56, 56
+    k_height_size = k_width_size = 3
 
     strides = (1, 1)
     padding = (1, 1)
 
-    d_shape = (1, H, W, I)
-    w_shape = (kH, kW, I, O)
+    d_shape = (1, h_size, w_size, i_size)
+    w_shape = (k_height_size, k_width_size, i_size, o_size)
     bias_shape = (1, 1, 1, w_shape[3])
     out_channel = w_shape[3]
 
@@ -356,7 +383,7 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher):
     conv2d = relay.nn.conv2d(
         data=data,
         weight=weight,
-        kernel_size=(kH, kW),
+        kernel_size=(k_height_size, k_width_size),
         channels=out_channel,
         padding=padding,
         strides=strides,
@@ -467,3 +494,7 @@ def test_dense_relay_auto_schedule(hexagon_launcher):
         # Fairly loose check since fp16 results between x86 and Hexagon have
         # non-trivial difference.
         assert np.mean(np.abs(ref - out)) < 0.1
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx.py b/tests/python/contrib/test_hexagon/test_parallel_hvx.py
index 046f949a761f..15273afdd41e 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx.py
@@ -26,7 +26,11 @@
 
 from .infrastructure import get_hexagon_target
 
-TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n    -Single Thread: {} ms \n    -Parallel: {} ms\n    -Speedup: {}x\n"
+TEST_OUTPUT_TEMPLATE = (
+    "Test {} with {} operations... \n"
+    "    -Single Thread: {} ms \n"
+    "    -Parallel: {} ms\n    -Speedup: {}x\n"
+)
 
 
 def get_vrmpy_shape_dtypes(operations):
@@ -61,28 +65,30 @@ def vrmpy_expected_producer(shape, a, b):
     expected = np.zeros(shape, dtype="int32")
     for n in range(shape[0]):
         for i in range(32):
-            for r in range(4):
-                expected[n, i] = expected[n, i] + np.uint32(a[n, i * 4 + r]) * np.uint32(
-                    b[n, i * 4 + r]
+            for r_ind in range(4):
+                expected[n, i] = expected[n, i] + np.uint32(a[n, i * 4 + r_ind]) * np.uint32(
+                    b[n, i * 4 + r_ind]
                 )
     return expected
 
 
 def get_vmpy_operator(operations):
+    """Generate vector multiply operator"""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations, 128], dtype="uint8")
-        B = T.match_buffer(b, [operations, 128], dtype="uint8")
-        C = T.match_buffer(c, [operations, 128], dtype="int16")
+        a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8")
+        b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8")
+        c_buffer = T.match_buffer(c, [operations, 128], dtype="int16")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn, T.ramp(0, 1, 128)] = T.call_llvm_intrin(
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind, T.ramp(0, 1, 128)] = T.call_llvm_intrin(
                     T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vmpybusv.128B"),
                     T.uint32(2),
-                    T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
-                    T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"),
                     dtype="int16x128",
                 )
 
@@ -90,20 +96,22 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
 
 
 def get_vadd_operator(operations):
+    """Generate vadd operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations, 128], dtype="uint8")
-        B = T.match_buffer(b, [operations, 128], dtype="uint8")
-        C = T.match_buffer(c, [operations, 128], dtype="int16")
+        a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8")
+        b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8")
+        c_buffer = T.match_buffer(c, [operations, 128], dtype="int16")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn, T.ramp(0, 1, 128)] = T.call_llvm_intrin(
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind, T.ramp(0, 1, 128)] = T.call_llvm_intrin(
                     T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vaddubh.128B"),
                     T.uint32(2),
-                    T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
-                    T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"),
                     dtype="int16x128",
                 )
 
@@ -111,20 +119,22 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
 
 
 def get_vrmpy_operator(operations):
+    """Generate vrmpy operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations, 128], dtype="uint8")
-        B = T.match_buffer(b, [operations, 128], dtype="uint8")
-        C = T.match_buffer(c, [operations, 32], dtype="int32")
+        a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8")
+        b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8")
+        c_buffer = T.match_buffer(c, [operations, 32], dtype="int32")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin(
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind, T.ramp(0, 1, 32)] = T.call_llvm_intrin(
                     T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
                     T.uint32(2),
-                    T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
-                    T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"),
                     dtype="int32x32",
                 )
 
@@ -132,6 +142,7 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
 
 
 def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch):
+    """Evaluate schedule."""
     a_shape, a_dtype, b_shape, b_dtype, c_shape, c_dtype = shape_dtypes
 
     func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v68"))
@@ -160,6 +171,7 @@ def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch):
 
 
 class TestMatMulVec:
+    """MatMul test class."""
 
     (
         operation_name,
@@ -182,9 +194,11 @@ class TestMatMulVec:
         128,
         # 256,
         # 512,
-        # 1024,  # Single thread runs faster since L2 cache can handle the entire request quickly
+        # Single thread runs faster since L2 cache can handle the entire request quickly
+        # 1024,
         # 2048,
-        # 4096,  # Significant performance degredation once the inputs and outputs cannot all fit in L2
+        # Significant performance degredation once the inputs and outputs cannot all fit in L2
+        # 4096,
         # 8192,
         # 16384,
     )
@@ -200,6 +214,7 @@ def test(
         expected_output_producer,
         split_factor,
     ):
+        """Test function handler."""
 
         sch = tvm.tir.Schedule(operator_producer(operation_count))
         single_thread_runtime = evaluate(
@@ -207,10 +222,10 @@ def test(
         )
 
         sch = tvm.tir.Schedule(operator_producer(operation_count))
-        block = sch.get_block("C")
+        block = sch.get_block("c_buffer")
         b = sch.get_loops(block)
-        bo, _ = sch.split(b[0], factors=[split_factor, None])
-        sch.parallel(bo)
+        b_output, _ = sch.split(b[0], factors=[split_factor, None])
+        sch.parallel(b_output)
 
         parallel_runtime = evaluate(
             hexagon_session, shape_dtypes_producer(operation_count), expected_output_producer, sch
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
index 6e43298a4eb5..fb398f43977a 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -25,14 +25,24 @@
 
 from .infrastructure import get_hexagon_target
 
-TEST_OUTPUT_TEMPLATE = "Test with {} MB of data to load... \n    -No VTCM: {} Gops \n    -Basic VTCM: {} Gops \n    -Vectorized: {} Gops\n    -Vectorized and Parallelized: {} Gops\n    -Preallocated and Vectorized: {} Gops\n    -Preallocated, Vectorized, and Parallelized: {} Gops\n    -Single DMA: {} Gops\n    -Preloaded: {} Gops\n"
+TEST_OUTPUT_TEMPLATE = (
+    "Test with {} MB of data to load... \n"
+    "    -No VTCM: {} Gops \n    -Basic VTCM: {} Gops \n"
+    "    -Vectorized: {} Gops\n    -Vectorized and"
+    " Parallelized: {} Gops\n    -Preallocated and Vectorized: {} Gops\n"
+    "    -Preallocated, Vectorized, and Parallelized: {} Gops\n"
+    "    -Single DMA: {} Gops\n    -Preloaded: {} Gops\n"
+)
 
 
 def apply_parallel_unroll_vectorize(sch, blocks, outer_split, unroll_split, vector_split):
+    """Apply parallel unroll vectorized."""
     for block in blocks:
-        vb, vi = sch.get_loops(block)
-        v = sch.fuse(vb, vi)
-        vbo, vbi, vio, vii = sch.split(v, factors=[outer_split, None, unroll_split, vector_split])
+        vb_index, vi_index = sch.get_loops(block)
+        v = sch.fuse(vb_index, vi_index)
+        vbo, vbi, vio, vii = sch.split(  # pylint: disable=unused-variable
+            v, factors=[outer_split, None, unroll_split, vector_split]
+        )  # pylint: disable=unused-variable
         sch.vectorize(vii)
         sch.unroll(vio)
         sch.parallel(vbo)
@@ -41,8 +51,8 @@ def apply_parallel_unroll_vectorize(sch, blocks, outer_split, unroll_split, vect
 
 def apply_unroll_vectorize(sch, blocks, unroll_split, vector_split):
     for block in blocks:
-        vb, vi = sch.get_loops(block)
-        v = sch.fuse(vb, vi)
+        vb_index, vi_index = sch.get_loops(block)
+        v = sch.fuse(vb_index, vi_index)
         _, vio, vii = sch.split(v, factors=[None, unroll_split, vector_split])
         sch.vectorize(vii)
         sch.unroll(vio)
@@ -50,15 +60,15 @@ def apply_unroll_vectorize(sch, blocks, unroll_split, vector_split):
 
 
 def apply_vrmpy_parallelization(sch):
-    block = sch.get_block("C")
+    block = sch.get_block("c_buffer")
     b = sch.get_loops(block)
-    bo, _ = sch.split(b[0], factors=[4, None])
-    sch.parallel(bo)
+    b_outer, _ = sch.split(b[0], factors=[4, None])
+    sch.parallel(b_outer)
     return sch
 
 
 def apply_vtcm_cache_read_write(sch):
-    block = sch.get_block("C")
+    block = sch.get_block("c_buffer")
     sch.cache_read(block, 0, "global.vtcm")
     sch.cache_read(block, 1, "global.vtcm")
     sch.cache_write(block, 0, "global.vtcm")
@@ -66,20 +76,22 @@ def apply_vtcm_cache_read_write(sch):
 
 
 def vrmpy(operations):
+    """Generate VRMPY operator"""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128)
-        B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128)
-        C = T.match_buffer(c, [operations, 32], dtype="int32", align=128)
+        a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8", align=128)
+        b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8", align=128)
+        c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128)
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin(
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind, T.ramp(0, 1, 32)] = T.call_llvm_intrin(
                     T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
                     T.uint32(2),
-                    T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
-                    T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"),
                     dtype="int32x32",
                 )
 
@@ -87,34 +99,40 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
 
 
 def preloaded_vrmpy(operations):
+    """Generate preloaded VRMPY operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(
+        a_buffer = T.match_buffer(
             a,
             [T.cast(operations, "int32") * 128],
             dtype="uint8",
             align=128,
             mem_scope="global.vtcm",
         )
-        B = T.match_buffer(
+        b_buffer = T.match_buffer(
             b,
             [T.cast(operations, "int32") * 128],
             dtype="uint8",
             align=128,
             mem_scope="global.vtcm",
         )
-        C = T.match_buffer(
+        c_buffer = T.match_buffer(
             c, [T.cast(operations, "int32") * 32], dtype="int32", align=128, mem_scope="global.vtcm"
         )
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin(
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[T.ramp(T.cast(vn_ind, "int32") * 32, 1, 32)] = T.call_llvm_intrin(
                     T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
                     T.uint32(2),
-                    T.reinterpret(A[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"),
-                    T.reinterpret(B[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"),
+                    T.reinterpret(
+                        a_buffer[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], dtype="int32x32"
+                    ),
+                    T.reinterpret(
+                        b_buffer[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], dtype="int32x32"
+                    ),
                     dtype="int32x32",
                 )
 
@@ -122,6 +140,7 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
 
 
 def preallocated_vrmpy(operations):
+    """Generate preallocated VRMPY operator."""
     size = operations * 128
     out_size = operations * 32
 
@@ -130,49 +149,56 @@ def operator(
         a: T.handle, b: T.handle, c: T.handle, a_v: T.handle, b_v: T.handle, c_v: T.handle
     ) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global")
-        B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global")
-        C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global")
-        A_global_vtcm = T.match_buffer(
+        a_buffer = T.match_buffer(
+            a, [operations, 128], dtype="uint8", align=128, mem_scope="global"
+        )
+        b_buffer = T.match_buffer(
+            b, [operations, 128], dtype="uint8", align=128, mem_scope="global"
+        )
+        c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global")
+        a_global_vtcm = T.match_buffer(
             a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
         )
-        B_global_vtcm = T.match_buffer(
+        b_global_vtcm = T.match_buffer(
             b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
         )
-        C_global_vtcm = T.match_buffer(
+        c_global_vtcm = T.match_buffer(
             c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm"
         )
         for n, i in T.grid(operations, 128):
-            with T.block("A_global.vtcm"):
-                vn, vi = T.axis.remap("SS", [n, i])
-                A_global_vtcm[vn * 128 + vi] = A[vn, vi]
+            with T.block("a_buffer_global.vtcm"):
+                vn_ind, vi_index = T.axis.remap("SS", [n, i])
+                a_global_vtcm[vn_ind * 128 + vi_index] = a_buffer[vn_ind, vi_index]
         for n, i in T.grid(operations, 128):
-            with T.block("B_global.vtcm"):
-                vn, vi = T.axis.remap("SS", [n, i])
-                B_global_vtcm[vn * 128 + vi] = B[vn, vi]
+            with T.block("b_buffer_global.vtcm"):
+                vn_ind, vi_index = T.axis.remap("SS", [n, i])
+                b_global_vtcm[vn_ind * 128 + vi_index] = b_buffer[vn_ind, vi_index]
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin(
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 32, 1, 32)] = T.call_llvm_intrin(
                     T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
                     T.uint32(2),
                     T.reinterpret(
-                        A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"
+                        a_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)],
+                        dtype="int32x32",
                     ),
                     T.reinterpret(
-                        B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"
+                        b_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)],
+                        dtype="int32x32",
                     ),
                     dtype="int32x32",
                 )
         for n, i in T.grid(operations, 32):
-            with T.block("C_global.vtcm"):
-                vn, vi = T.axis.remap("SS", [n, i])
-                C[vn, vi] = C_global_vtcm[vn * 32 + vi]
+            with T.block("c_buffer_global.vtcm"):
+                vn_ind, vi_index = T.axis.remap("SS", [n, i])
+                c_buffer[vn_ind, vi_index] = c_global_vtcm[vn_ind * 32 + vi_index]
 
     return operator
 
 
 def preallocated_single_dma_vrmpy(operations):
+    """Generate preallocated single DMA VRMPY operator."""
     size = operations * 128
     out_size = operations * 32
 
@@ -186,36 +212,40 @@ def operator(
         c_v: T.handle,
     ) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global")
-        B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global")
-        C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global")
-        A_global_vtcm = T.match_buffer(
+        a_buffer = T.match_buffer(
+            a, [operations, 128], dtype="uint8", align=128, mem_scope="global"
+        )
+        b_buffer = T.match_buffer(
+            b, [operations, 128], dtype="uint8", align=128, mem_scope="global"
+        )
+        c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global")
+        a_global_vtcm = T.match_buffer(
             a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
         )
-        B_global_vtcm = T.match_buffer(
+        b_global_vtcm = T.match_buffer(
             b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
         )
-        C_global_vtcm = T.match_buffer(
+        c_global_vtcm = T.match_buffer(
             c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm"
         )
         T.evaluate(
             T.tvm_call_packed(
                 "device_api.hexagon.mem_copy_DLTensor",
                 T.tvm_stack_make_array(
-                    A_global_vtcm.data,
+                    a_global_vtcm.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
                     0,
                     1,
-                    A_global_vtcm.dtype,
+                    a_global_vtcm.dtype,
                     0,
                     dtype="handle",
                 ),
                 T.tvm_stack_make_array(
-                    A.data,
+                    a_buffer.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
                     0,
                     1,
-                    A.dtype,
+                    a_buffer.dtype,
                     0,
                     dtype="handle",
                 ),
@@ -227,20 +257,20 @@ def operator(
             T.tvm_call_packed(
                 "device_api.hexagon.mem_copy_DLTensor",
                 T.tvm_stack_make_array(
-                    B_global_vtcm.data,
+                    b_global_vtcm.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
                     0,
                     1,
-                    B_global_vtcm.dtype,
+                    b_global_vtcm.dtype,
                     0,
                     dtype="handle",
                 ),
                 T.tvm_stack_make_array(
-                    B.data,
+                    b_buffer.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
                     0,
                     1,
-                    B.dtype,
+                    b_buffer.dtype,
                     0,
                     dtype="handle",
                 ),
@@ -249,16 +279,18 @@ def operator(
             )
         )
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin(
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 32, 1, 32)] = T.call_llvm_intrin(
                     T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
                     T.uint32(2),
                     T.reinterpret(
-                        A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"
+                        a_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)],
+                        dtype="int32x32",
                     ),
                     T.reinterpret(
-                        B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"
+                        b_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)],
+                        dtype="int32x32",
                     ),
                     dtype="int32x32",
                 )
@@ -266,20 +298,20 @@ def operator(
             T.tvm_call_packed(
                 "device_api.hexagon.mem_copy_DLTensor",
                 T.tvm_stack_make_array(
-                    C.data,
+                    c_buffer.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
                     0,
                     1,
-                    C.dtype,
+                    c_buffer.dtype,
                     0,
                     dtype="handle",
                 ),
                 T.tvm_stack_make_array(
-                    C_global_vtcm.data,
+                    c_global_vtcm.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
                     0,
                     1,
-                    C_global_vtcm.dtype,
+                    c_global_vtcm.dtype,
                     0,
                     dtype="handle",
                 ),
@@ -296,11 +328,12 @@ def evaluate_result(operations, tag, time, result, expected_output):
     gops = round(operations * 128 * 3 / time.mean / 1e9, 3)
     mean_ms = round(time.mean * 1000, 6)
 
-    print("\ntest_{}MB_{} took {} ms @ GOPS: {}".format(transfer_mb, tag, mean_ms, gops))
+    print(f"\ntest_{transfer_mb}MB_{tag} took {mean_ms} ms @ GOPS: {gops}")
     tvm.testing.assert_allclose(result, expected_output)
 
 
 def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global"):
+    """Setup and run operator."""
     func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69"))
     module = hexagon_session.load_module(func_tir)
 
@@ -321,6 +354,7 @@ def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global")
 
 
 def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations):
+    """Setup and run for preallocated."""
     func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69"))
     module = hexagon_session.load_module(func_tir)
 
@@ -353,34 +387,9 @@ def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations):
     return gops, c_hexagon.asnumpy()
 
 
-@tvm.testing.fixture
-def input_a(operations):
-    return default_rng().integers(0, 16, (operations, 128), dtype="uint8")
-
-
-@tvm.testing.fixture
-def input_b(operations):
-    return default_rng().integers(0, 16, (operations, 128), dtype="uint8")
-
-
-@tvm.testing.fixture
-def input_c(operations):
-    return np.zeros((operations, 32), dtype="int32")
-
-
-@tvm.testing.fixture
-def expected_output(operations, input_a, input_b, input_c):
-    expected_output = np.zeros(input_c.shape, dtype="int32")
-    for n in range(operations):
-        for i in range(32):
-            for r in range(4):
-                expected_output[n, i] = expected_output[n, i] + np.uint32(
-                    input_a[n, i * 4 + r]
-                ) * np.uint32(input_b[n, i * 4 + r])
-    return expected_output
-
-
 class TestMatMulVec:
+    """MatMul test class."""
+
     # Removed most of these to speedup CI.
     operations = tvm.testing.parameter(
         1024,
@@ -398,6 +407,29 @@ class TestMatMulVec:
     c_vector_split = tvm.testing.parameter(16)
     c_vector_split_unallocated = tvm.testing.parameter(8)
 
+    @tvm.testing.fixture
+    def input_a(self, operations):
+        return default_rng().integers(0, 16, (operations, 128), dtype="uint8")
+
+    @tvm.testing.fixture
+    def input_b(self, operations):
+        return default_rng().integers(0, 16, (operations, 128), dtype="uint8")
+
+    @tvm.testing.fixture
+    def input_c(self, operations):
+        return np.zeros((operations, 32), dtype="int32")
+
+    @tvm.testing.fixture
+    def expected_output(self, operations, input_a, input_b, input_c):
+        expected_output = np.zeros(input_c.shape, dtype="int32")
+        for n in range(operations):
+            for i in range(32):
+                for r_ind in range(4):  # pylint: disable=unused-variable
+                    expected_output[n, i] = expected_output[n, i] + np.uint32(
+                        input_a[n, i * 4 + r_ind]
+                    ) * np.uint32(input_b[n, i * 4 + r_ind])
+        return expected_output
+
     @tvm.testing.requires_hexagon
     def test_loading_vtcm_for_vrmpy(
         self,
@@ -413,7 +445,7 @@ def test_loading_vtcm_for_vrmpy(
         c_vector_split,
         c_vector_split_unallocated,
     ):
-
+        """Load VTCM for VRMPY operator test."""
         # Run parallel vrmpy without loading to VTCM.
         sch = tvm.tir.Schedule(vrmpy(operations))
         sch = apply_vrmpy_parallelization(sch)
@@ -437,12 +469,12 @@ def test_loading_vtcm_for_vrmpy(
         sch = apply_vrmpy_parallelization(sch)
         sch = apply_unroll_vectorize(
             sch,
-            [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+            [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
             unroll_split,
             vector_split,
         )
         sch = apply_unroll_vectorize(
-            sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split_unallocated
+            sch, [sch.get_block("c_buffer_global.vtcm")], unroll_split, c_vector_split_unallocated
         )
         vectorized_runtime, result = setup_and_run(
             hexagon_session, sch, input_a, input_b, input_c, operations
@@ -455,14 +487,14 @@ def test_loading_vtcm_for_vrmpy(
         sch = apply_vrmpy_parallelization(sch)
         sch = apply_parallel_unroll_vectorize(
             sch,
-            [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+            [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
             outer_split,
             unroll_split,
             vector_split,
         )
         sch = apply_parallel_unroll_vectorize(
             sch,
-            [sch.get_block("C_global.vtcm")],
+            [sch.get_block("c_buffer_global.vtcm")],
             outer_split,
             unroll_split,
             c_vector_split_unallocated,
@@ -477,12 +509,12 @@ def test_loading_vtcm_for_vrmpy(
         sch = apply_vrmpy_parallelization(sch)
         sch = apply_unroll_vectorize(
             sch,
-            [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+            [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
             unroll_split,
             vector_split,
         )
         sch = apply_unroll_vectorize(
-            sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split
+            sch, [sch.get_block("c_buffer_global.vtcm")], unroll_split, c_vector_split
         )
         preallocated_vectorized_runtime, result = setup_and_run_preallocated(
             hexagon_session, sch, input_a, input_b, input_c, operations
@@ -495,15 +527,15 @@ def test_loading_vtcm_for_vrmpy(
         sch = apply_vrmpy_parallelization(sch)
         sch = apply_parallel_unroll_vectorize(
             sch,
-            [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+            [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
             outer_split,
             unroll_split,
             vector_split,
         )
         sch = apply_parallel_unroll_vectorize(
-            sch, [sch.get_block("C_global.vtcm")], outer_split, unroll_split, c_vector_split
+            sch, [sch.get_block("c_buffer_global.vtcm")], outer_split, unroll_split, c_vector_split
         )
-        preallocated_vectorized_parallelized_runtime, result = setup_and_run_preallocated(
+        prealloc_vector_parallelized, result = setup_and_run_preallocated(
             hexagon_session, sch, input_a, input_b, input_c, operations
         )
         result = result.reshape((operations, 32))
@@ -539,8 +571,12 @@ def test_loading_vtcm_for_vrmpy(
                 vectorized_runtime,
                 vectorized_parallelized_runtime,
                 preallocated_vectorized_runtime,
-                preallocated_vectorized_parallelized_runtime,
+                prealloc_vector_parallelized,
                 single_dma_runtime,
                 preloaded_runtime,
             )
         )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_parallel_scalar.py b/tests/python/contrib/test_hexagon/test_parallel_scalar.py
index fd3eef1b195b..b96265d9df99 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_scalar.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_scalar.py
@@ -25,55 +25,66 @@
 
 from .infrastructure import get_hexagon_target
 
-TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n    -Single Thread: {} ms \n    -Parallel: {} ms\n    -Speedup: {}x\n"
+TEST_OUTPUT_TEMPLATE = (
+    "Test {} with {} operations... \n"
+    "    -Single Thread: {} ms \n"
+    "    -Parallel: {} ms\n    -Speedup: {}x\n"
+)
 
 
 def get_add_operator(operations):
+    """Generate add operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations], dtype="float64")
-        B = T.match_buffer(b, [operations], dtype="float64")
-        C = T.match_buffer(c, [operations], dtype="float64")
+        a_buffer = T.match_buffer(a, [operations], dtype="float64")
+        b_buffer = T.match_buffer(b, [operations], dtype="float64")
+        c_buffer = T.match_buffer(c, [operations], dtype="float64")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn] = A[vn] + B[vn]
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind] = a_buffer[vn_ind] + b_buffer[vn_ind]
 
     return operator
 
 
 def get_multiply_operator(operations):
+    """Generate multiply operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations], dtype="float64")
-        B = T.match_buffer(b, [operations], dtype="float64")
-        C = T.match_buffer(c, [operations], dtype="float64")
+        a_buffer = T.match_buffer(a, [operations], dtype="float64")
+        b_buffer = T.match_buffer(b, [operations], dtype="float64")
+        c_buffer = T.match_buffer(c, [operations], dtype="float64")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn] = A[vn] * B[vn]
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind] = a_buffer[vn_ind] * b_buffer[vn_ind]
 
     return operator
 
 
 def get_sub_operator(operations):
+    """Generate subtract operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations], dtype="float64")
-        B = T.match_buffer(b, [operations], dtype="float64")
-        C = T.match_buffer(c, [operations], dtype="float64")
+        a_buffer = T.match_buffer(a, [operations], dtype="float64")
+        b_buffer = T.match_buffer(b, [operations], dtype="float64")
+        c_buffer = T.match_buffer(c, [operations], dtype="float64")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn] = A[vn] - B[vn]
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind] = a_buffer[vn_ind] - b_buffer[vn_ind]
 
     return operator
 
 
 def evaluate(hexagon_session, operations, expected, sch):
+    """Evalute schedule."""
     shape = operations
     dtype = "float64"
 
@@ -104,6 +115,7 @@ def evaluate(hexagon_session, operations, expected, sch):
 
 
 class TestMatMulVec:
+    """MatMul test class."""
 
     (operation_name, operator_producer, expected_output_producer,) = tvm.testing.parameters(
         ("add", get_add_operator, (lambda a, b: a + b)),
@@ -116,9 +128,11 @@ class TestMatMulVec:
         128,
         # 256,
         # 512,
-        # 1024,  # Single thread runs faster since L2 cache can handle the entire request quickly
+        # Single thread runs faster since L2 cache can handle the entire request quickly
+        # 1024,
         # 2048,
-        # 4096,  # Significant performance degredation once the inputs and outputs cannot all fit in L2
+        # Significant performance degredation once the inputs and outputs cannot all fit in L2
+        # 4096,
         # 8192,
         # 16384,
     )
@@ -135,15 +149,16 @@ def test_add(
         operations,
         split_factor,
     ):
+        """Test Add operator."""
 
         sch = tvm.tir.Schedule(operator_producer(operations))
         single_thread_runtime = evaluate(hexagon_session, operations, expected_output_producer, sch)
 
         sch = tvm.tir.Schedule(operator_producer(operations))
-        block = sch.get_block("C")
+        block = sch.get_block("c_buffer")
         b = sch.get_loops(block)
-        bo, _ = sch.split(b[0], factors=[split_factor, None])
-        sch.parallel(bo)
+        b_output, _ = sch.split(b[0], factors=[split_factor, None])
+        sch.parallel(b_output)
         parallel_runtime = evaluate(hexagon_session, operations, expected_output_producer, sch)
 
         speedup = round(single_thread_runtime / parallel_runtime, 2)
diff --git a/tests/python/contrib/test_hexagon/test_run_unit_tests.py b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
index 24c9f33a8ecb..cd4e5c9b0d66 100644
--- a/tests/python/contrib/test_hexagon/test_run_unit_tests.py
+++ b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
@@ -48,3 +48,7 @@ def test_run_unit_tests(hexagon_session: Session, gtest_args):
         raise RuntimeError(
             f"Hexagon gtest retruned non-zero error code = {gtest_error_code}:\n{gtest_output}"
         )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_sigmoid.py b/tests/python/contrib/test_hexagon/test_sigmoid.py
index 1ff5bf3db340..e115b188a3f0 100644
--- a/tests/python/contrib/test_hexagon/test_sigmoid.py
+++ b/tests/python/contrib/test_hexagon/test_sigmoid.py
@@ -14,26 +14,25 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Sigmoid operator tests."""
 
 import numpy as np
-import pytest
 
 import tvm
 import tvm.testing
 from tvm import te
 from tvm import tir
 from tvm import topi
-from tvm.contrib.hexagon.build import HexagonLauncher
 
 from .infrastructure import allocate_hexagon_array, get_hexagon_target
 
 
-def sigmoid_compute(Input):
-    return topi.sigmoid(Input)
+def sigmoid_compute(sigmoid_input):
+    return topi.sigmoid(sigmoid_input)
 
 
-def sigmoid_stir_schedule(Input, Output):
-    sigmoid_func = te.create_prim_func([Input, Output])
+def sigmoid_stir_schedule(sigmoid_input, sigmoid_output):
+    sigmoid_func = te.create_prim_func([sigmoid_input, sigmoid_output])
     sch = tir.Schedule(sigmoid_func, debug_mask="all")
     block = sch.get_block("compute")
 
@@ -42,17 +41,6 @@ def sigmoid_stir_schedule(Input, Output):
     return sch
 
 
-@tvm.testing.fixture
-def input_np(in_shape, dtype, min_val, max_val):
-    return np.random.uniform(low=min_val, high=max_val, size=in_shape).astype(dtype)
-
-
-@tvm.testing.fixture
-def ref_output_np(input_np):
-    output_np = 1 / (1 + np.exp(-input_np))
-    return output_np
-
-
 class BaseSigmoid:
     (in_shape, dtype, min_val, max_val,) = tvm.testing.parameters(
         ((64,), "float16", -8.0, 8.0),
@@ -64,6 +52,17 @@ class BaseSigmoid:
 
 
 class TestSigmoid(BaseSigmoid):
+    """Sigmoid test class."""
+
+    @tvm.testing.fixture
+    def input_np(self, in_shape, dtype, min_val, max_val):
+        return np.random.uniform(low=min_val, high=max_val, size=in_shape).astype(dtype)
+
+    @tvm.testing.fixture
+    def ref_output_np(self, input_np):
+        output_np = 1 / (1 + np.exp(-input_np))
+        return output_np
+
     @tvm.testing.requires_hexagon
     def test_sigmoid(
         self,
@@ -73,11 +72,12 @@ def test_sigmoid(
         ref_output_np,
         hexagon_session,
     ):
-        InputTensor = te.placeholder(in_shape, name="InputTensor", dtype=dtype)
+        """Sigmoid test."""
+        input_tensor = te.placeholder(in_shape, name="input_tensor", dtype=dtype)
 
-        OutputTensor = sigmoid_compute(InputTensor)
+        output_tensor = sigmoid_compute(input_tensor)
 
-        tir_s = sigmoid_stir_schedule(InputTensor, OutputTensor)
+        tir_s = sigmoid_stir_schedule(input_tensor, output_tensor)
 
         input_data = allocate_hexagon_array(
             hexagon_session.device,
diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
index f80a579f58fe..ba7513a4f39c 100644
--- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
+++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
@@ -14,9 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Async software pipeline tests."""
 
-import sys
-import pytest
 import numpy as np
 
 import tvm
@@ -25,174 +24,178 @@
 
 from .infrastructure import get_hexagon_target
 
-outer = tvm.testing.parameter(8, 16)
-inner = tvm.testing.parameter(64, 128)
-dtype = tvm.testing.parameter("uint8", "float16")
-scope = tvm.testing.parameter("global", "global.vtcm")
-# TODO(Joseph) Turn on "multi_input_diffQ" compute type once we have upstreamed
-# changes in the InjectSoftwarePipeline pass to alleviate this restriction:
-# 'A dependency on multiple async stages is not supported'
-comp_type = tvm.testing.parameter("single_input", "multi_input_sameQ")
-# TODO(Straw) Add back "cache_write" schedule type once we have upstreamed
-# buffer dependency analysis in InjectSoftwarePipeline pass
-# to insert approprite TIR "wait" attributes for this schedule
-sched_type = tvm.testing.parameter("cache_read", "cache_read_write")
-
-
-@tvm.testing.fixture
-def data(comp_type, outer, inner, dtype):
-    out_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-    a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-    if comp_type == "single_input":
-        return out_np, a_np
-    else:
-        b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-        return out_np, a_np, b_np
 
-
-@tvm.testing.fixture
 def compute(comp_type, outer, inner, dtype):
+    """Generate compute function."""
     if comp_type == "single_input":
 
         @T.prim_func
         def a_plus_1_primfunc(
-            A: T.Buffer[(outer, inner), dtype], OUT: T.Buffer[(outer, inner), dtype]
+            a_buffer: T.Buffer[(outer, inner), dtype], out: T.Buffer[(outer, inner), dtype]
         ):
             for i in T.serial(outer):
                 for j in T.serial(inner):
                     with T.block("compute"):
                         with T.block():
-                            OUT[i, j] = A[i, j] + T.cast(1, dtype)
+                            out[i, j] = a_buffer[i, j] + T.cast(1, dtype)
 
         return a_plus_1_primfunc
     else:
 
         @T.prim_func
         def a_plus_b_plus_1_primfunc(
-            A: T.Buffer[(outer, inner), dtype],
-            B: T.Buffer[(outer, inner), dtype],
-            OUT: T.Buffer[(outer, inner), dtype],
+            a_buffer: T.Buffer[(outer, inner), dtype],
+            b_buffer: T.Buffer[(outer, inner), dtype],
+            out: T.Buffer[(outer, inner), dtype],
         ):
             for i in T.serial(outer):
                 for j in T.serial(inner):
                     with T.block("compute"):
                         with T.block():
-                            OUT[i, j] = A[i, j] + B[i, j] + T.cast(1, dtype)
+                            out[i, j] = a_buffer[i, j] + b_buffer[i, j] + T.cast(1, dtype)
 
         return a_plus_b_plus_1_primfunc
 
 
-@tvm.testing.fixture
-def reference(comp_type):
-    if comp_type == "single_input":
-
-        def a_plus_1_ref(a):
-            return a + 1
-
-        return a_plus_1_ref
-    else:
-
-        def a_plus_b_plus_1_ref(a, b):
-            return a + b + 1
-
-        return a_plus_b_plus_1_ref
-
-
-@tvm.testing.fixture
-def schedule(comp_type, compute, sched_type, scope):
-    sch = tir.Schedule(compute)
-
-    compute_block = sch.get_block("compute")
-    i, _ = sch.get_loops(compute_block)
-
-    if "read" in sched_type:
-        cache_read_a = sch.cache_read(compute_block, 0, scope)
-        sch.compute_at(cache_read_a, i)
+class TestAsyncSoftwarePipeline:
+    """Async software pipeline test class."""
+
+    outer = tvm.testing.parameter(8, 16)
+    inner = tvm.testing.parameter(64, 128)
+    dtype = tvm.testing.parameter("uint8", "float16")
+    scope = tvm.testing.parameter("global", "global.vtcm")
+    # TODO(Joseph) Turn on "multi_input_diffQ" compute type once we have upstreamed
+    # changes in the InjectSoftwarePipeline pass to alleviate this restriction:
+    # 'a_buffer dependency on multiple async stages is not supported'
+    comp_type = tvm.testing.parameter("single_input", "multi_input_sameQ")
+    # TODO(Straw) Add back "cache_write" schedule type once we have upstreamed
+    # buffer dependency analysis in InjectSoftwarePipeline pass
+    # to insert approprite TIR "wait" attributes for this schedule
+    sched_type = tvm.testing.parameter("cache_read", "cache_read_write")
+
+    @tvm.testing.fixture
+    def data(self, comp_type, outer, inner, dtype):
+        out_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
+        a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
+        if comp_type == "single_input":
+            return out_np, a_np
+        else:
+            b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
+            return out_np, a_np, b_np
+
+    @tvm.testing.fixture
+    def verify(self, dtype):
+        def check(out, ref):
+            if "int" in dtype:
+                np.testing.assert_equal(out.numpy(), ref)
+            else:
+                np.testing.assert_allclose(out.numpy(), ref, rtol=1e-3, atol=1e-3)
+
+        return check
+
+    @tvm.testing.fixture
+    def reference(self, comp_type):
+        """Returns reference data."""
+        if comp_type == "single_input":
 
-        if "multi_input" in comp_type:
-            cache_read_b = sch.cache_read(compute_block, 1, scope)
-            sch.compute_at(cache_read_b, i)
+            def a_plus_1_ref(a):
+                return a + 1
 
-    if "write" in sched_type:
-        cache_write_out = sch.cache_write(compute_block, 0, scope)
-        sch.reverse_compute_at(cache_write_out, i)
+            return a_plus_1_ref
+        else:
 
-    if "read" in sched_type and "write" in sched_type:
-        if comp_type == "single_input":
-            sch.annotate(i, "software_pipeline_stage", [0, 1, 2])
-            sch.annotate(i, "software_pipeline_order", [0, 1, 2])
-            sch.annotate(i, "software_pipeline_async_stages", [0, 2])
-        elif comp_type == "multi_input_sameQ":
-            sch.annotate(i, "software_pipeline_stage", [0, 0, 1, 2])
-            sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3])
-            sch.annotate(i, "software_pipeline_async_stages", [0, 2])
-        elif comp_type == "multi_input_diffQ":
-            sch.annotate(i, "software_pipeline_stage", [0, 1, 2, 3])
-            sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3])
-            sch.annotate(i, "software_pipeline_async_stages", [0, 1, 2])
-
-    elif "read" in sched_type:
-        if comp_type == "single_input":
+            def a_plus_b_plus_1_ref(a, b):
+                return a + b + 1
+
+            return a_plus_b_plus_1_ref
+
+    @tvm.testing.fixture
+    def schedule(self, comp_type, sched_type, outer, inner, dtype, scope):
+        """Generate schedule."""
+        sch = tir.Schedule(compute(comp_type, outer, inner, dtype))
+
+        compute_block = sch.get_block("compute")
+        i, _ = sch.get_loops(compute_block)
+
+        if "read" in sched_type:
+            cache_read_a = sch.cache_read(compute_block, 0, scope)
+            sch.compute_at(cache_read_a, i)
+
+            if "multi_input" in comp_type:
+                cache_read_b = sch.cache_read(compute_block, 1, scope)
+                sch.compute_at(cache_read_b, i)
+
+        if "write" in sched_type:
+            cache_write_out = sch.cache_write(compute_block, 0, scope)
+            sch.reverse_compute_at(cache_write_out, i)
+
+        if "read" in sched_type and "write" in sched_type:
+            if comp_type == "single_input":
+                sch.annotate(i, "software_pipeline_stage", [0, 1, 2])
+                sch.annotate(i, "software_pipeline_order", [0, 1, 2])
+                sch.annotate(i, "software_pipeline_async_stages", [0, 2])
+            elif comp_type == "multi_input_sameQ":
+                sch.annotate(i, "software_pipeline_stage", [0, 0, 1, 2])
+                sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3])
+                sch.annotate(i, "software_pipeline_async_stages", [0, 2])
+            elif comp_type == "multi_input_diffQ":
+                sch.annotate(i, "software_pipeline_stage", [0, 1, 2, 3])
+                sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3])
+                sch.annotate(i, "software_pipeline_async_stages", [0, 1, 2])
+
+        elif "read" in sched_type:
+            if comp_type == "single_input":
+                sch.annotate(i, "software_pipeline_stage", [0, 1])
+                sch.annotate(i, "software_pipeline_order", [0, 1])
+                sch.annotate(i, "software_pipeline_async_stages", [0])
+            elif comp_type == "multi_input_sameQ":
+                sch.annotate(i, "software_pipeline_stage", [0, 0, 1])
+                sch.annotate(i, "software_pipeline_order", [0, 1, 2])
+                sch.annotate(i, "software_pipeline_async_stages", [0])
+            elif comp_type == "multi_input_diffQ":
+                sch.annotate(i, "software_pipeline_stage", [0, 1, 2])
+                sch.annotate(i, "software_pipeline_order", [0, 1, 2])
+                sch.annotate(i, "software_pipeline_async_stages", [0, 1])
+
+        elif "write" in sched_type:
             sch.annotate(i, "software_pipeline_stage", [0, 1])
             sch.annotate(i, "software_pipeline_order", [0, 1])
-            sch.annotate(i, "software_pipeline_async_stages", [0])
-        elif comp_type == "multi_input_sameQ":
-            sch.annotate(i, "software_pipeline_stage", [0, 0, 1])
-            sch.annotate(i, "software_pipeline_order", [0, 1, 2])
-            sch.annotate(i, "software_pipeline_async_stages", [0])
-        elif comp_type == "multi_input_diffQ":
-            sch.annotate(i, "software_pipeline_stage", [0, 1, 2])
-            sch.annotate(i, "software_pipeline_order", [0, 1, 2])
-            sch.annotate(i, "software_pipeline_async_stages", [0, 1])
-
-    elif "write" in sched_type:
-        sch.annotate(i, "software_pipeline_stage", [0, 1])
-        sch.annotate(i, "software_pipeline_order", [0, 1])
-        sch.annotate(i, "software_pipeline_async_stages", [1])
-
-    return sch
-
-
-@tvm.testing.fixture
-def verify(dtype):
-    def check(out, ref):
-        if "int" in dtype:
-            np.testing.assert_equal(out.numpy(), ref)
-        else:
-            np.testing.assert_allclose(out.numpy(), ref, rtol=1e-3, atol=1e-3)
-
-    return check
+            sch.annotate(i, "software_pipeline_async_stages", [1])
 
+        return sch
 
-@tvm.testing.requires_hexagon
-def test_async_software_pipeline(hexagon_launcher, comp_type, data, reference, schedule, verify):
-    out_np = data[0]
-    a_np = data[1]
-    if comp_type == "single_input":
-        ref = reference(a_np)
-    else:
-        b_np = data[2]
-        ref = reference(a_np, b_np)
-
-    with tvm.transform.PassContext(
-        config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": False}
+    @tvm.testing.requires_hexagon
+    def test_async_software_pipeline(
+        self, hexagon_launcher, comp_type, data, reference, schedule, verify
     ):
-        # tvm.lower(schedule.mod["main"]).show()
-        func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68"))
-
-    with hexagon_launcher.create_session() as hexagon_session:
-        dev = hexagon_session.device
-        mod = hexagon_session.load_module(func)
-        out = tvm.nd.array(out_np, device=dev)
-        a = tvm.nd.array(a_np, device=dev)
+        """Async software pipeline test."""
+        out_np = data[0]
+        a_np = data[1]
         if comp_type == "single_input":
-            mod(a, out)
+            ref = reference(a_np)
         else:
-            b = tvm.nd.array(b_np, device=dev)
-            mod(a, b, out)
+            b_np = data[2]
+            ref = reference(a_np, b_np)
+
+        with tvm.transform.PassContext(
+            config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": False}
+        ):
+            # tvm.lower(schedule.mod["main"]).show()
+            func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68"))
+
+        with hexagon_launcher.create_session() as hexagon_session:
+            dev = hexagon_session.device
+            mod = hexagon_session.load_module(func)
+            out = tvm.nd.array(out_np, device=dev)
+            a = tvm.nd.array(a_np, device=dev)
+            if comp_type == "single_input":
+                mod(a, out)
+            else:
+                b = tvm.nd.array(b_np, device=dev)
+                mod(a, b, out)
 
-        verify(out, ref)
+            verify(out, ref)
 
 
 if __name__ == "__main__":
-    sys.exit(pytest.main(sys.argv))
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
index 307d3a96bf15..980ac0cf4c2a 100644
--- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
+++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
@@ -27,47 +27,56 @@
 
 MB = 1024**2
 KB = 1024
-TEST_OUTPUT_TEMPLATE = "Test bandwidth with buffer size {}MB... \n    -Base: {} GBps \n    -Vectorized: {} GBps\n    -Vectorized and Parallelized: {} GBps\n    -Single DMA Copy: {} GBps\n"
+TEST_OUTPUT_TEMPLATE = (
+    "Test bandwidth with buffer size {}MB... \n"
+    "    -Base: {} GBps \n    -Vectorized: {} GBps\n"
+    "    -Vectorized and Parallelized: {} GBps\n"
+    "    -Single DMA Copy: {} GBps\n"
+)
 
 
 def memcopy_operator(size):
+    """Generate memory copy operator."""
+
     @T.prim_func
     def operator(a: T.handle, a_v: T.handle) -> None:
-        A = T.match_buffer(a, size, dtype="int8", align=128, scope="global")
-        A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
+        a_buffer = T.match_buffer(a, size, dtype="int8", align=128, scope="global")
+        a_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
         for ax0 in T.serial(size):
             with T.block("A_global.vtcm"):
-                v0 = T.axis.spatial(size, ax0)
-                T.reads(A[v0])
-                T.writes(A_global_vtcm[v0])
-                A_global_vtcm[v0] = A[v0]
+                v0_ind = T.axis.spatial(size, ax0)
+                T.reads(a_buffer[v0_ind])
+                T.writes(a_global_vtcm[v0_ind])
+                a_global_vtcm[v0_ind] = a_buffer[v0_ind]
 
     return operator
 
 
 def single_dma_operator(size):
+    """Generate single dma operator."""
+
     @T.prim_func
     def operator(a: T.handle, a_v: T.handle) -> None:
-        A = T.match_buffer(a, size, dtype="int8", align=128, scope="global")
-        A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
+        a_buffer = T.match_buffer(a, size, dtype="int8", align=128, scope="global")
+        a_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
         T.evaluate(
             T.tvm_call_packed(
                 "device_api.hexagon.mem_copy_DLTensor",
                 T.tvm_stack_make_array(
-                    A_global_vtcm.data,
+                    a_global_vtcm.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
                     0,
                     1,
-                    A_global_vtcm.dtype,
+                    a_global_vtcm.dtype,
                     0,
                     dtype="handle",
                 ),
                 T.tvm_stack_make_array(
-                    A.data,
+                    a_buffer.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
                     0,
                     1,
-                    A.dtype,
+                    a_buffer.dtype,
                     0,
                     dtype="handle",
                 ),
@@ -80,6 +89,7 @@ def operator(a: T.handle, a_v: T.handle) -> None:
 
 
 def evaluate(hexagon_session, sch, size):
+    """Evaluate schedule."""
     a_shape = size
 
     func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69"))
@@ -110,6 +120,7 @@ def evaluate(hexagon_session, sch, size):
 
 
 class TestMatMulVec:
+    """MatMul test class."""
 
     # Removed most of these to speedup CI.
     size = tvm.testing.parameter(
@@ -133,7 +144,7 @@ class TestMatMulVec:
 
     @tvm.testing.requires_hexagon
     def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split):
-
+        """Test bandwidth."""
         # Run the base memcopy operator.
         sch = tvm.tir.Schedule(memcopy_operator(size))
         base_gpbs = evaluate(hexagon_session, sch, size)
@@ -141,8 +152,8 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto
         # Run with some basic unroll and vectorize scheduling.
         sch = tvm.tir.Schedule(memcopy_operator(size))
         vtcm_block_a = sch.get_block("A_global.vtcm")
-        vb = sch.get_loops(vtcm_block_a)
-        vbi_a, vio_a, vii_a = sch.split(vb[0], factors=[None, unroll_split, vector_split])
+        v_block = sch.get_loops(vtcm_block_a)
+        _, vio_a, vii_a = sch.split(v_block[0], factors=[None, unroll_split, vector_split])
         sch.unroll(vio_a)
         sch.vectorize(vii_a)
         vectorize_gbps = evaluate(hexagon_session, sch, size)
@@ -150,9 +161,9 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto
         # Run with some basic unroll and vectorize scheduling and parallelization.
         sch = tvm.tir.Schedule(memcopy_operator(size))
         vtcm_block_a = sch.get_block("A_global.vtcm")
-        vb = sch.get_loops(vtcm_block_a)
-        vbo_a, vbi_a, vio_a, vii_a = sch.split(
-            vb[0], factors=[outer_split, None, unroll_split, vector_split]
+        v_block = sch.get_loops(vtcm_block_a)
+        vbo_a, _, vio_a, vii_a = sch.split(
+            v_block[0], factors=[outer_split, None, unroll_split, vector_split]
         )
         sch.unroll(vio_a)
         sch.vectorize(vii_a)
@@ -169,3 +180,7 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto
                 mbs, base_gpbs, vectorize_gbps, parallel_gbps, single_dma_gbps
             )
         )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
index 2fc607c0c521..e4edf2919a00 100644
--- a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
+++ b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
@@ -14,8 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""No QNN canonicalization tests."""
 
-import pytest
 import numpy as np
 
 import tvm.testing
@@ -27,6 +27,7 @@
 
 @tvm.testing.requires_hexagon
 def test_no_qnn_pass():
+    """No QNN pass test."""
     x = relay.var("x", shape=(4, 8), dtype="float32")
     op0 = relay.qnn.op.quantize(x, relay.const(2.0), relay.const(10), out_dtype="uint8")
     op1 = relay.qnn.op.dequantize(op0, relay.const(0.5), relay.const(5))
@@ -61,6 +62,7 @@ def execute(executor, data_np, weight_np, bias_np=None):
 
 @tvm.testing.requires_hexagon
 def test_qnn_conv2d_rq(hexagon_session: Session):
+    """QNN conv2d test."""
     data_shape = [1, 8, 32, 32]
     weight_shape = [16, 8, 3, 3]
     data = relay.var("data", shape=data_shape, dtype="float32")
@@ -119,6 +121,7 @@ def test_qnn_conv2d_rq(hexagon_session: Session):
 
 @tvm.testing.requires_hexagon
 def test_qnn_dense_bias_rq(hexagon_session: Session):
+    """QNN dense with bias test."""
     data_shape = [8, 8]
     weight_shape = [16, 8]
     bias_shape = [16]

From 404d95f05483b736d4066402fd02e594b1546f4b Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Wed, 2 Nov 2022 18:32:31 -0400
Subject: [PATCH 491/704] [build][relay][te][tir] remove unused vars / args
 (#13266)

- Fix clang 15.0.3 '-Wunused-but-set-variable' and '-Wunused-lambda-capture' warnings by removing / commenting-out code.
---
 src/relay/backend/graph_plan_memory.cc             | 2 +-
 src/relay/backend/te_compiler_cache.cc             | 4 ++--
 src/relay/transforms/remove_standalone_reshapes.cc | 4 +---
 src/te/operation/compute_op.cc                     | 4 +---
 src/tir/ir/data_type_rewriter.cc                   | 2 +-
 src/tir/schedule/primitive/cache_index.cc          | 2 +-
 6 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index dab951b7e91f..f927bf633732 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -631,7 +631,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
   // allocator
   support::Arena arena_;
   // scale used for rough match
-  size_t match_range_{16};
+  // size_t match_range_{16};
   // free list of storage entry
   std::multimap<size_t, StorageToken*> free_;
   // all the storage resources available
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index c97efb565d9d..47a19cbef8fa 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -294,10 +294,10 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
     pattern_matcher_.Register(call_node);
 
     Array<te::Tensor> inputs;
-    int count_tuple = 0;
+    // int count_tuple = 0;
     for (Expr arg : call_node->args) {
       if (arg->checked_type().as<TupleTypeNode>()) {
-        ++count_tuple;
+        // ++count_tuple;
       }
       for (te::Tensor tensor : VisitExpr(arg)) {
         inputs.push_back(tensor);
diff --git a/src/relay/transforms/remove_standalone_reshapes.cc b/src/relay/transforms/remove_standalone_reshapes.cc
index 28924e8bdfed..063060b3ebf9 100644
--- a/src/relay/transforms/remove_standalone_reshapes.cc
+++ b/src/relay/transforms/remove_standalone_reshapes.cc
@@ -36,7 +36,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("relay.remove_standalone_reshapes.enable", Bool)
  */
 class RemoveStandaloneReshapesMutator : public MixedModeMutator {
  public:
-  explicit RemoveStandaloneReshapesMutator(IRModule& mod) : ir_module_(mod) {}
+  explicit RemoveStandaloneReshapesMutator(IRModule& mod) {}  // NOLINT(runtime/references)
 
   using MixedModeMutator::VisitExpr_;
 
@@ -85,8 +85,6 @@ class RemoveStandaloneReshapesMutator : public MixedModeMutator {
  private:
   /*! \brief Map of LetNode's var to previous call_lowered. */
   Map<Var, Call> let_var_to_call_lowered_;
-  /*! \brief Module that contains global reshape functions. */
-  IRModule& ir_module_;
 };
 
 namespace transform {
diff --git a/src/te/operation/compute_op.cc b/src/te/operation/compute_op.cc
index 7f8facad5568..3ca40c9a6b5b 100644
--- a/src/te/operation/compute_op.cc
+++ b/src/te/operation/compute_op.cc
@@ -387,7 +387,7 @@ enum class ComputeType { kNormal, kCrossThreadReduction, kTensorize };
 
 ComputeType DetectComputeType(const ComputeOpNode* self, const Stage& stage) {
   // Verify correctness of leaf nest.
-  int normal_red = 0, thread_red = 0, tensorize = 0;
+  int thread_red = 0, tensorize = 0;
 
   for (IterVar iv : stage->leaf_iter_vars) {
     IterVarAttr attr;
@@ -401,8 +401,6 @@ ComputeType DetectComputeType(const ComputeOpNode* self, const Stage& stage) {
     if (iv->iter_type == kCommReduce) {
       if (attr.defined() && attr->bind_thread.defined()) {
         ++thread_red;
-      } else {
-        ++normal_red;
       }
     } else {
       ICHECK_EQ(thread_red, 0) << "Cross thread reduce cannot swap with normal data axis";
diff --git a/src/tir/ir/data_type_rewriter.cc b/src/tir/ir/data_type_rewriter.cc
index afa28d92589f..102989acf6e0 100644
--- a/src/tir/ir/data_type_rewriter.cc
+++ b/src/tir/ir/data_type_rewriter.cc
@@ -62,7 +62,7 @@ Stmt DataTypeLegalizer::VisitStmt_(const BlockRealizeNode* op) {
 
 Stmt DataTypeLegalizer::VisitStmt_(const BlockNode* op) {
   Block new_block = Downcast<Block>(StmtExprMutator::VisitStmt_(op));
-  Array<IterVar> new_iter_vars = MutateArray(new_block->iter_vars, [this](const IterVar& iter) {
+  Array<IterVar> new_iter_vars = MutateArray(new_block->iter_vars, [/*this*/](const IterVar& iter) {
     auto dtype = iter->var.dtype();
     if (iter->dom->min->dtype != dtype || iter->dom->extent->dtype != dtype) {
       IterVar new_iter = iter;
diff --git a/src/tir/schedule/primitive/cache_index.cc b/src/tir/schedule/primitive/cache_index.cc
index ba58f81038cb..1db86a5444ff 100644
--- a/src/tir/schedule/primitive/cache_index.cc
+++ b/src/tir/schedule/primitive/cache_index.cc
@@ -203,7 +203,7 @@ Array<Block> MakeIndexCacheStage(IndexInfo* info) {
     // which will be used to create new loop vars
     std::vector<Var> iter_vars;
     for (const Var& it : info->origin_block_vars[expr_index]) {
-      PostOrderVisit(info->var_binding.at(it), [&info, &iter_vars](const ObjectRef& node) {
+      PostOrderVisit(info->var_binding.at(it), [/*&info,*/ &iter_vars](const ObjectRef& node) {
         if (node->IsInstance<VarNode>()) {
           Var iter_var = Downcast<Var>(node);
           if (std::find_if(iter_vars.begin(), iter_vars.end(),

From ff6aaeb12ae71393fef37da8f9c72a0f2017e6d5 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Wed, 2 Nov 2022 15:50:56 -0700
Subject: [PATCH 492/704] [Frontend][Tensorflow2] Import graph_def to default
 graph before calling function_def_to_graph_def (#13260)

[TF2] Import graph_def to default graph before calling function_def_to_graph_def
---
 python/tvm/relay/frontend/tensorflow2.py | 30 ++++++++++++++----------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/python/tvm/relay/frontend/tensorflow2.py b/python/tvm/relay/frontend/tensorflow2.py
index 465f530624b9..2a2a64b2952f 100644
--- a/python/tvm/relay/frontend/tensorflow2.py
+++ b/python/tvm/relay/frontend/tensorflow2.py
@@ -25,6 +25,7 @@
 """
 
 import numpy as np
+import tensorflow as tf
 from tensorflow.python.framework import function_def_to_graph, tensor_util, dtypes
 
 import tvm
@@ -839,16 +840,21 @@ def @main(%x1: Tensor[(2, 2), float32], %add/y1: Tensor[(2, 2), float32]) {
 
     """
 
-    # Subgraph graph_defs are cached here to avoid a TF error when parsing after prelude init
-    graph_def_library = {}
-    for func in graph_def.library.function:
-        inshape = func.attr["_input_shapes"].list.shape
-        graph_def_library[func.signature.name], _ = function_def_to_graph.function_def_to_graph_def(
-            func, inshape
+    with tf.Graph().as_default():
+        tf.import_graph_def(graph_def, name="")
+        # Subgraph graph_defs are cached here to avoid a TF error when parsing after prelude init
+        graph_def_library = {}
+        for func in graph_def.library.function:
+            inshape = func.attr["_input_shapes"].list.shape
+            (
+                graph_def_library[func.signature.name],
+                _,
+            ) = function_def_to_graph.function_def_to_graph_def(func, inshape)
+        module = RelayModule()
+        g = GraphProto(module)
+        func, params = g.from_tensorflow(
+            graph_def, layout, shape, outputs, gdef_lib=graph_def_library
         )
-    module = RelayModule()
-    g = GraphProto(module)
-    func, params = g.from_tensorflow(graph_def, layout, shape, outputs, gdef_lib=graph_def_library)
-    module.mod["main"] = func
-    module.params.update(params)
-    return module.mod, module.params
+        module.mod["main"] = func
+        module.params.update(params)
+        return module.mod, module.params

From d998187e1684c23af758374e011a979690f64e81 Mon Sep 17 00:00:00 2001
From: Wubin <wubin.wu@imgtec.com>
Date: Thu, 3 Nov 2022 10:27:47 +0800
Subject: [PATCH 493/704] =?UTF-8?q?[Frontend][PaddlePaddle]=20Fix=20Unboun?=
 =?UTF-8?q?dLocalError:=20local=20variable=20'shape=E2=80=A6=20(#13247)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are a local variable referenced before assignment in convert_interpolate function. I think varible 'size' is real want to be referenced.
---
 python/tvm/relay/frontend/paddlepaddle.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/paddlepaddle.py b/python/tvm/relay/frontend/paddlepaddle.py
index 068f7d2eac95..ffbcf12de543 100644
--- a/python/tvm/relay/frontend/paddlepaddle.py
+++ b/python/tvm/relay/frontend/paddlepaddle.py
@@ -778,7 +778,7 @@ def get_interpolate_mode(op):
         for name in input_size_tensor:
             size = g.get_node(name)
             if len(infer_shape(size)) == 0:
-                shape = _op.reshape(shape, [-1])
+                size = _op.reshape(size, [-1])
             out_size.append(size)
         out_size = _op.concatenate(out_size, axis=0)
         out_size, infered = try_infer_value(out_size, parameters=g.get_params())

From e9ba9865a7f89ed36535c43f890e510682db9bcc Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Thu, 3 Nov 2022 02:47:46 -0700
Subject: [PATCH 494/704] [skip ci] Revert "[ci] Protect release branches
 (#13208)" (#13274)

This reverts commit 5acf3f90c63b6760cd23796b442f8ac20e645af0.

Reverting since this is causing some spam from the ASF Infra bot related
to https://issues.apache.org/jira/browse/INFRA-23834. As in that issue
the protections have been applied manually by ASF Infra so this revert
shouldn't have any real effect
---
 .asf.yaml | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/.asf.yaml b/.asf.yaml
index 1e4371d594d2..f4aba210d2cc 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -61,10 +61,3 @@ github:
 
       required_pull_request_reviews:
         required_approving_review_count: 1
-
-    # protect release branches from unsigned updates and force pushes
-    'v[0-9]*':
-      required_pull_request_reviews:
-        required_approving_review_count: 1
-      required_linear_history: true
-      required_signatures: true

From f15afd225140e2a501b8b6aa2def0fd94d31bc54 Mon Sep 17 00:00:00 2001
From: Benson Muite <bkmgit@users.noreply.github.com>
Date: Thu, 3 Nov 2022 14:55:35 +0300
Subject: [PATCH 495/704] [Docs] Minimal dependencies for Fedora/CentOS
 (#13248)

Minimal dependencies for Fedora/CentOS

This commit indicates how to install minimal set of
dependencies for building Apache TVM on Fedora and
CentOS. It supplements existing information for
Ubuntu and MacOS.
---
 docs/install/from_source.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 458a1570096c..7a6b93705759 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -81,6 +81,15 @@ linux operating systems, execute (in a terminal):
 
 Note that the version of CMake on apt may not be sufficiently up to date; it may be necessary to install it directly from `Kitware's third-party APT repository <https://apt.kitware.com/>`_.
 
+
+On Fedora/CentOS and related operating systems use:
+
+.. code:: bash
+
+    sudo dnf update
+    sudo dnf groupinstall -y "Development Tools"
+    sudo dnf install -y python-devel ncurses-compat-libs zlib-devel cmake libedit-devel libxml2-devel
+
 Use Homebrew to install the required dependencies for macOS running either the Intel or M1 processors. You must follow the post-installation steps specified by
 Homebrew to ensure the dependencies are correctly installed and configured:
 

From 9df3a33ff454024b116467537096271b23947fda Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Thu, 3 Nov 2022 12:32:04 -0400
Subject: [PATCH 496/704] [build][doc] Fix clang doxygen warnings (#13270)

Fix occurrences of clang's `-Wdocumentation-unknown-command` warning.
---
 src/relay/backend/annotate_used_memory.cc |  4 ++++
 src/relay/collage/mock_cost_estimator.h   | 10 ++++++++++
 2 files changed, 14 insertions(+)

diff --git a/src/relay/backend/annotate_used_memory.cc b/src/relay/backend/annotate_used_memory.cc
index 4dcdb2e541c5..001d7635e786 100644
--- a/src/relay/backend/annotate_used_memory.cc
+++ b/src/relay/backend/annotate_used_memory.cc
@@ -52,6 +52,7 @@ namespace backend {
  * A simple example:
  *
  * Before:
+ * \verbatim
  * def @main(%input: Tensor[(1, 2, 2, 4), int8]) -> Tensor[(1, 2, 2, 4), int8] {
  *   let %x_0 = fn (%x: Tensor[(1, 2, 2, 4), int8], Primitive=1) -> Tensor[(1, 2, 2, 4), int8] {
  *     nn.max_pool2d(%x, pool_size=[1, 1], padding=[0, 0, 0, 0])
@@ -59,8 +60,10 @@ namespace backend {
  *   let %x_1 = %x_0(%input);
  *   %x_1
  * }
+ * \endverbatim
  *
  * After:
+ * \verbatim
  * def @main(%input: Tensor[(1, 2, 2, 4), int8], io_used_memory=32) -> Tensor[(1, 2, 2, 4), int8] {
  *   let %x_0: fn (%x: Tensor[(1, 2, 2, 4), int8], Primitive=1, used_memory=[32]) -> Tensor[(1, 2,
  * 2, 4), int8] {
@@ -69,6 +72,7 @@ namespace backend {
  *   let %x_1: Tensor[(1, 2, 2, 4), int8] = %x_0(%input);
  *   %x_1
  * }
+ * \endverbatim
  *
  * Note that in the simple example above io_used_memory and used_memory are the same since there
  * is only one primitive function.
diff --git a/src/relay/collage/mock_cost_estimator.h b/src/relay/collage/mock_cost_estimator.h
index f47cb71fb60c..3aa97923a201 100644
--- a/src/relay/collage/mock_cost_estimator.h
+++ b/src/relay/collage/mock_cost_estimator.h
@@ -34,6 +34,13 @@ namespace tvm {
 namespace relay {
 namespace collage {
 
+// Clang (15.0.3, at least) validly complains about `@main`, but it invalidly
+// complains even about `\c @main`.
+#if __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdocumentation-unknown-command"
+#endif
+
 /*!
  * \brief A mock cost estimator which can determine the cost of a candidate based on both
  * the candidate's target and the number of operator calls inside it.
@@ -69,6 +76,9 @@ class MockCostEstimatorNode : public CostEstimatorNode {
 
   friend class MockCostEstimator;
 };
+#if __clang__
+#pragma clang diagnostic pop
+#endif
 
 class MockCostEstimator : public CostEstimator {
  public:

From 0d553127e51d7533b93e97b1c0e7876c612b6dc9 Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Thu, 3 Nov 2022 12:32:27 -0400
Subject: [PATCH 497/704] [build][tir] fix clang redundant-move warning
 (#13268)

Fix code to address a valid `-Wredundant-move` clang warning.
---
 src/tir/transforms/profile_instrumentation.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tir/transforms/profile_instrumentation.cc b/src/tir/transforms/profile_instrumentation.cc
index 68d5b0a204d5..3a2ef796c688 100644
--- a/src/tir/transforms/profile_instrumentation.cc
+++ b/src/tir/transforms/profile_instrumentation.cc
@@ -243,7 +243,7 @@ PrimFunc AddProfileBuiltins(PrimFunc func, int32_t max_instr_depth, int32_t min_
   InstrumentIntrin p(max_instr_depth, min_instr_height, instr_siblings);
   p.GetLoopInfo(func_ptr);
   func_ptr->body = p(std::move(func_ptr->body));
-  return std::move(func);
+  return func;
 }
 
 }  // namespace lwp

From 75921fb55953613f5789a73fbccd985312163bbe Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 3 Nov 2022 17:22:17 +0000
Subject: [PATCH 498/704] [ETHOSN] Inline non-compute-intensive partitions
 (#13092)

* [ETHOSN] Inline non-compute-intensive partitions

Adds a pass that analyzes functions partitioned for the NPU and inlines
those that are deemed "non-compute-intensive" back to the main function
so that they can be considered for other backends. The current heurisic
for deciding a non-compute-intensive function is to collectively check
all of the operations in the function have no multiply accumulate
operations. This heuristic is not optimial; optimization is left for
future exploration.

This pass is inspired by the "IsComputeIntensiveGraph" pass in the
TensorRT integration.

Change-Id: I20c197702f5252f102cfc1e4b4635ab836aa7835

* Address comments

* 'inline_non_compute_intensive_partitions' -> 'is_inline_non_compute
_intensive_partitions_enabled'.
* remove no MAC operations.
* fix network test.

Change-Id: Ie1015b27f37e47544bed6f0aff819ee4649de579

* Fix failing unit tests due to optimization

Change-Id: I0ee0af071dc77c91e0ef0f6753506cb40d1d1859

* Add future exploration suggestions

Change-Id: Ie918d7f1059f032282f1f5eeffda38f4febcd59c
---
 python/tvm/relay/op/contrib/ethosn.py         |  55 ++++--
 .../backend/contrib/ethosn/codegen_ethosn.h   |  17 ++
 .../contrib/ethosn/inline_partitions.cc       | 126 +++++++++++++
 .../contrib/test_ethosn/infrastructure.py     |  28 ++-
 .../contrib/test_ethosn/test_addition.py      |   4 +-
 .../contrib/test_ethosn/test_concatenate.py   |   2 +-
 .../test_ethosn/test_depth_to_space.py        |   2 +-
 .../test_ethosn/test_inline_partitions.py     | 167 ++++++++++++++++++
 .../contrib/test_ethosn/test_leaky_relu.py    |   2 +-
 .../contrib/test_ethosn/test_multiply.py      |   4 +-
 .../contrib/test_ethosn/test_networks.py      |   5 +-
 tests/python/contrib/test_ethosn/test_relu.py |   2 +-
 .../contrib/test_ethosn/test_requantize.py    |   4 +-
 .../contrib/test_ethosn/test_reshape.py       |   6 +-
 .../python/contrib/test_ethosn/test_split.py  |   4 +-
 tests/python/contrib/test_ethosn/test_tanh.py |   2 +-
 .../contrib/test_ethosn/test_topologies.py    |  61 +++++--
 17 files changed, 439 insertions(+), 52 deletions(-)
 create mode 100644 src/relay/backend/contrib/ethosn/inline_partitions.cc
 create mode 100644 tests/python/contrib/test_ethosn/test_inline_partitions.py

diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index 80cc1ca3b202..9afab68ccd8f 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -64,14 +64,42 @@ def ConvertEquivalents() -> tvm.ir.IRModule:  # pylint: disable=invalid-name
     """Converts operations into a numerically equivalent form
     that can be understood by the NPU codegen.
 
-    Return
-    ------
+    Returns
+    -------
     Pass
         The module pass.
     """
     return _ethosn.ConvertEquivalents()
 
 
+def InlineNonComputeIntensivePartitions() -> tvm.ir.IRModule:  # pylint: disable=invalid-name
+    """This pass checks whether functions partitioned for the NPU are considered
+    non-compute intensive. If they are not, they will be unpartitioned and passed onto
+    other backends to consider.
+
+    A partitioned function is currently considered non-compute intensive if it contains
+    no multiply accumulate operations.
+
+    Returns
+    -------
+    Pass
+        The module pass.
+    """
+    return _ethosn.InlineNonComputeIntensivePartitions()
+
+
+def is_inline_non_compute_intensive_partitions_enabled() -> bool:
+    """
+    Determine whether to inline none-compute-intensive partitions.
+
+    Returns
+    -------
+    True if inlining should happen, False if not.
+    """
+    compiler_attrs = tvm.get_global_func("relay.ext.ethos-n.get_compiler_attrs")()
+    return compiler_attrs.inline_non_compute_intensive_partitions
+
+
 def partition_for_ethosn(mod, params=None, **opts):
     """Partition the graph greedily offloading supported
     operators to Arm Ethos-N NPU.
@@ -112,17 +140,18 @@ def partition_for_ethosn(mod, params=None, **opts):
     if params:
         mod["main"] = bind_params_by_name(mod["main"], params)
 
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            transform.MergeComposite(pattern_table()),
-            transform.AnnotateTarget("ethos-n"),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-            ConvertEquivalents(),
-        ]
-    )
-    return seq(mod)
+    passes = [
+        transform.InferType(),
+        transform.MergeComposite(pattern_table()),
+        transform.AnnotateTarget("ethos-n"),
+        transform.MergeCompilerRegions(),
+        transform.PartitionGraph(),
+        ConvertEquivalents(),
+    ]
+    if is_inline_non_compute_intensive_partitions_enabled():
+        passes.append(InlineNonComputeIntensivePartitions())
+
+    return tvm.transform.Sequential(passes)(mod)
 
 
 @register_pattern_table("ethos-n")
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index ab853599aa2d..c640db47b6dd 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -251,6 +251,7 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode<EthosnCompilerConfigNode
   bool enable_intermediate_compression;
   bool disable_winograd;
   String debug_dir;
+  bool inline_non_compute_intensive_partitions;
 
   TVM_DECLARE_ATTRS(EthosnCompilerConfigNode, "ext.attrs.EthosnCompilerConfigNode") {
     TVM_ATTR_FIELD(variant).describe("See Ethos-N documentation.").set_default("n78");
@@ -278,6 +279,12 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode<EthosnCompilerConfigNode
     TVM_ATTR_FIELD(enable_intermediate_compression).set_default(true);
     TVM_ATTR_FIELD(disable_winograd).set_default(false);
     TVM_ATTR_FIELD(debug_dir).set_default(".");
+    TVM_ATTR_FIELD(inline_non_compute_intensive_partitions)
+        .describe(
+            "A heuristic to improve performance. Inlines functions partitioned for Arm(R) "
+            "Ethos(TM)-N that are deemed 'non-compute-intensive'. The inlined functions will "
+            "continue through TVM's standard compilation flow.")
+        .set_default(true);
   }
 };
 
@@ -289,6 +296,16 @@ class EthosnCompilerConfig : public Attrs {
 TVM_REGISTER_NODE_TYPE(EthosnCompilerConfigNode);
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.ethos-n.options", EthosnCompilerConfig);
 
+auto GetCompilerAttrs() {
+  auto ctx = transform::PassContext::Current();
+  auto cfg = ctx->GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options");
+  if (!cfg.defined()) {
+    cfg = AttrsWithDefaultValues<EthosnCompilerConfig>();
+  }
+  return cfg;
+}
+TVM_REGISTER_GLOBAL("relay.ext.ethos-n.get_compiler_attrs").set_body_typed(GetCompilerAttrs);
+
 /*! \brief The compiler for Ethos-N functions */
 class EthosnCompiler {
  public:
diff --git a/src/relay/backend/contrib/ethosn/inline_partitions.cc b/src/relay/backend/contrib/ethosn/inline_partitions.cc
new file mode 100644
index 000000000000..f8cc3fc00d10
--- /dev/null
+++ b/src/relay/backend/contrib/ethosn/inline_partitions.cc
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/contrib/ethosn/inline_partitions.cc
+ * \brief A pass to inline NPU partitions that are not considered compute
+ * intensive.
+ */
+
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+
+#include "../../../transforms/compiler_function_utils.h"
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+namespace ethosn {
+
+class IsComputeIntensivePartition : MixedModeVisitor {
+ public:
+  /*!
+   * \brief Check if the partitioned function is compute
+   * intensive. If it has not multiply-accumulate operations
+   * it is not considered compute intensive.
+   *
+   * \param expr The partitioned function to check.
+   */
+  bool CheckSubgraph(const Expr& expr) {
+    is_compute_intensive = false;
+    VisitExpr(expr);
+    return is_compute_intensive;
+  }
+
+  /*!
+   * \brief Visit the call nodes of a partitioned function
+   * and check if operators or composite functions make the
+   * partitioned function compute intensive.
+   *
+   * \param op The call node to check.
+   */
+  void VisitExpr_(const CallNode* op) override {
+    Call call = GetRef<Call>(op);
+    std::string op_name = "";
+    if (const auto* op = call->op.as<OpNode>()) {
+      op_name = op->name;
+    } else if (const auto* func = call->op.as<FunctionNode>()) {
+      op_name = func->GetAttr<String>(attr::kComposite, "").value();
+    }
+
+    if (op_name != "") {
+      if (compute_intensive_operators.find(op_name) != compute_intensive_operators.end()) {
+        is_compute_intensive = true;
+      }
+    }
+  }
+
+ private:
+  /*! \brief Whether or not the partitioned function is consdiered compute intensive. */
+  bool is_compute_intensive;
+  /*! \brief A set of operators considered compute intensive. */
+  const std::unordered_set<std::string> compute_intensive_operators{
+      "ethos-n.qnn_conv2d",     "ethos-n.qnn_conv2d_transpose",
+      "ethos-n.qnn_avg_pool2d", "ethos-n.qnn_sigmoid",
+      "ethos-n.qnn_fc",         "ethos-n.qnn_mean",
+      "ethos-n.qnn_resize",     "nn.max_pool2d",
+  };
+};
+
+/*!
+ * \brief This pass checks whether functions partitioned for the NPU are considered
+ * non-compute intensive. If they are not, they will be unpartitioned and passed onto
+ * other backends to consider.
+ *
+ * A partitioned function is currently considered non-compute intensive if it contains
+ * no multiply accumulate operations. Note that this is not an optimal heuristic.
+ *
+ * Some suggestions for future exploration:
+ * - Making a better choice about large non-compute-intensive subgraphs
+ *   as currently these are inlined.
+ * - Allowing the user to input ops that are considered compute-intensive.
+ * - Inline "small" compute intensive operations.
+ */
+tvm::transform::Pass InlineNonComputeIntensivePartitions() {
+  runtime::TypedPackedFunc<IRModule(IRModule, tvm::transform::PassContext)> pass_func =
+      [=](IRModule mod, tvm::transform::PassContext ctx) {
+        auto analyzer = IsComputeIntensivePartition();
+        Array<GlobalVar> gvs_to_inline;
+        for (auto gv : mod->GetGlobalVars()) {
+          Function func = Downcast<Function>(mod->Lookup(gv));
+          auto compiler_name = func->GetAttr<String>(attr::kCompiler);
+          if (compiler_name.defined() && compiler_name == "ethos-n") {
+            if (!analyzer.CheckSubgraph(func->body)) {
+              gvs_to_inline.push_back(gv);
+            }
+          }
+        }
+        return relay::transform::InlineCompilerFunctionsBoundTo(gvs_to_inline)(mod);
+      };
+  return tvm::transform::CreateModulePass(
+      pass_func, 0, "relay.backend.contrib.ethos-n.InlineNonComputeIntensivePartitions", {});
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.InlineNonComputeIntensivePartitions")
+    .set_body_typed(InlineNonComputeIntensivePartitions);
+
+}  // namespace ethosn
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
index 6b019686968e..85ebd98efcff 100644
--- a/tests/python/contrib/test_ethosn/infrastructure.py
+++ b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -143,7 +143,7 @@ def visit_call(self, call):
     return c.count
 
 
-def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
+def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1, optimize_partitions=True):
     """Build a network with or without Ethos-N offloading.
 
     Parameters
@@ -158,10 +158,18 @@ def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
         The number of ops expected to remain on the host.
     npu_partitions : int, optional
         The number of Ethos-N partitions expected.
+    optimize_partitions : bool, optional
+        Disable the pass that optimizes NPU partitions post partitioning.
     """
     relay.backend.te_compiler.get().clear()
     with tvm.transform.PassContext(
-        opt_level=3, config={"relay.ext.ethos-n.options": {"variant": get_ethosn_variant()}}
+        opt_level=3,
+        config={
+            "relay.ext.ethos-n.options": {
+                "variant": get_ethosn_variant(),
+                "inline_non_compute_intensive_partitions": optimize_partitions,
+            }
+        },
     ):
         with tvm.target.Target("llvm"):
             if npu:
@@ -228,8 +236,20 @@ def run(lib, inputs, outputs, npu=True):
     return out
 
 
-def build_and_run(mod, inputs, outputs, params, npu=True, expected_host_ops=0, npu_partitions=1):
-    lib = build(mod, params, npu, expected_host_ops, npu_partitions)
+def build_and_run(
+    mod,
+    inputs,
+    outputs,
+    params,
+    npu=True,
+    expected_host_ops=0,
+    npu_partitions=1,
+    optimize_partitions=True,
+):
+    """
+    Convenient wrapper for building and running a module on the NPU.
+    """
+    lib = build(mod, params, npu, expected_host_ops, npu_partitions, optimize_partitions)
     return run(lib, inputs, outputs, npu)
 
 
diff --git a/tests/python/contrib/test_ethosn/test_addition.py b/tests/python/contrib/test_ethosn/test_addition.py
index 11d8b8d1cd56..53afd01b8449 100644
--- a/tests/python/contrib/test_ethosn/test_addition.py
+++ b/tests/python/contrib/test_ethosn/test_addition.py
@@ -111,7 +111,7 @@ def test_addition(dtype, shape):
     model = _get_model(shape, shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype)
     for npu in [False, True]:
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
@@ -227,7 +227,7 @@ def test_addition_to_reinterpret_quantize(lhs_shape, lhs_is_constant, rhs_shape,
     outputs = []
     for npu in [False, True]:
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
     tei.verify(outputs, dtype, 1)
 
 
diff --git a/tests/python/contrib/test_ethosn/test_concatenate.py b/tests/python/contrib/test_ethosn/test_concatenate.py
index 0389b3c5b103..f8521b595060 100644
--- a/tests/python/contrib/test_ethosn/test_concatenate.py
+++ b/tests/python/contrib/test_ethosn/test_concatenate.py
@@ -76,7 +76,7 @@ def test_concatenate(dtype, shapes, axis):
     for npu in [False, True]:
         model = _get_model(shapes, dtype, axis)
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
         tei.verify(outputs, dtype, 0)
 
diff --git a/tests/python/contrib/test_ethosn/test_depth_to_space.py b/tests/python/contrib/test_ethosn/test_depth_to_space.py
index 732932d8f324..814693b664ca 100644
--- a/tests/python/contrib/test_ethosn/test_depth_to_space.py
+++ b/tests/python/contrib/test_ethosn/test_depth_to_space.py
@@ -53,7 +53,7 @@ def test_depth_to_space(dtype, shape):
     for npu in [False, True]:
         model = _get_model(shape, 2, dtype, "NHWC")
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_inline_partitions.py b/tests/python/contrib/test_ethosn/test_inline_partitions.py
new file mode 100644
index 000000000000..79c35fc5bcb2
--- /dev/null
+++ b/tests/python/contrib/test_ethosn/test_inline_partitions.py
@@ -0,0 +1,167 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Tests for the 'InlineNonComputeIntensivePartitions' pass.
+"""
+
+import tvm
+from tvm import relay
+from tvm.testing import requires_ethosn
+from tvm.relay.op.contrib.ethosn import InlineNonComputeIntensivePartitions
+
+from . import infrastructure as tei
+
+
+def _assert_structural_equal(a, b):
+    """Check structural equality of two Relay expressions."""
+    reason = (
+        "Actual and expected relay functions are not equal. "
+        "InlineNonComputeIntensiveSubgraphs is not correctly "
+        "transforming the input graph."
+    )
+    assert tvm.ir.structural_equal(a, b, map_free_vars=True), reason
+
+
+@requires_ethosn
+def test_single_reshape():
+    """Check that a single reshape is inlined correctly."""
+
+    def get_reshape():
+        x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+        return relay.reshape(x, newshape=(2, 2, 4))
+
+    def before():
+        reshape = get_reshape()
+        return tei.make_ethosn_partition(reshape)
+
+    def expected():
+        reshape = get_reshape()
+        mod = tvm.IRModule.from_expr(reshape)
+        return relay.transform.InferType()(mod)
+
+    mod = before()
+    mod = InlineNonComputeIntensivePartitions()(mod)
+    expected_mod = expected()
+    _assert_structural_equal(mod, expected_mod)
+
+
+@requires_ethosn
+def test_multiple_non_compute_intensive_ops():
+    """
+    Check that a partitioned function is correctly inlined
+    when it contains multiple non-compute intensive operations.
+    """
+
+    def get_graph():
+        x = relay.var("x", shape=(2, 2, 4), dtype="int8")
+        x = relay.reshape(x, newshape=(1, 2, 2, 4))
+        x = relay.clip(x, 0.0, 1.0)
+        x = relay.reshape(x, newshape=(2, 2, 4))
+        return relay.clip(x, 0.0, 1.0)
+
+    def before():
+        func = get_graph()
+        return tei.make_ethosn_partition(func)
+
+    def expected():
+        func = get_graph()
+        mod = tvm.IRModule.from_expr(func)
+        return relay.transform.InferType()(mod)
+
+    mod = before()
+    mod = InlineNonComputeIntensivePartitions()(mod)
+    expected_mod = expected()
+    _assert_structural_equal(mod, expected_mod)
+
+
+@requires_ethosn
+def test_compute_intensive_ops():
+    """
+    Check that a partitioned function that is considered
+    compute intensive is not inlined.
+    """
+
+    def before():
+        x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+        x = relay.nn.max_pool2d(x, layout="NHWC")
+        x = relay.reshape(x, newshape=(2, 2, 4))
+        return tei.make_ethosn_partition(x)
+
+    mod = before()
+    transformed_mod = InlineNonComputeIntensivePartitions()(mod)
+    for global_var in mod.get_global_vars():
+        _assert_structural_equal(mod[global_var], transformed_mod[global_var])
+
+
+@requires_ethosn
+def test_multiple_partitioned_functions():
+    """
+    Tests the pass on a number of partitioned functions.
+    """
+
+    def before():
+        composite_func_name = "ethos-n_0"
+        inp = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+
+        # partitioned func 1 (non compute intensive)
+        x = relay.reshape(inp, newshape=(1, 2, 2, 4))
+        partitioned_func_1 = tei.make_ethosn_partition(x)[composite_func_name]
+        gv_1 = relay.GlobalVar("ethos-n_0")
+
+        # partitioned func 2 (compute intensive)
+        x = relay.nn.max_pool2d(inp, layout="NHWC")
+        partitioned_func_2 = tei.make_ethosn_partition(x)[composite_func_name]
+        gv_2 = relay.GlobalVar("ethos-n_1")
+
+        # partitioned func 3 (non compute intensive)
+        x = relay.clip(inp, 0.0, 1.0)
+        partitioned_func_3 = tei.make_ethosn_partition(x)[composite_func_name]
+        gv_3 = relay.GlobalVar("ethos-n_2")
+
+        mod = tvm.IRModule({})
+        mod[gv_1] = partitioned_func_1
+        mod[gv_2] = partitioned_func_2
+        mod[gv_3] = partitioned_func_3
+        main_expr = relay.Call(gv_1, [inp])
+        main_expr = relay.Call(gv_2, [main_expr])
+        main_expr = relay.Call(gv_3, [main_expr])
+        mod["main"] = relay.Function([inp], main_expr)
+        return relay.transform.InferType()(mod)
+
+    def expected():
+        composite_func_name = "ethos-n_0"
+        inp = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+
+        # partitioned func 2 (compute intensive)
+        x = relay.nn.max_pool2d(inp, layout="NHWC")
+        partitioned_func_2 = tei.make_ethosn_partition(x)[composite_func_name]
+        gv_2 = relay.GlobalVar("ethos-n_1")
+
+        mod = tvm.IRModule({})
+        mod[gv_2] = partitioned_func_2
+        main_expr = relay.reshape(inp, newshape=(1, 2, 2, 4))
+        main_expr = relay.Call(gv_2, [main_expr])
+        main_expr = relay.clip(main_expr, 0.0, 1.0)
+        mod["main"] = relay.Function([inp], main_expr)
+        return relay.transform.InferType()(mod)
+
+    mod = before()
+    mod = InlineNonComputeIntensivePartitions()(mod)
+    expected_mod = expected()
+    for global_var in mod.get_global_vars():
+        _assert_structural_equal(mod[global_var.name_hint], expected_mod[global_var.name_hint])
diff --git a/tests/python/contrib/test_ethosn/test_leaky_relu.py b/tests/python/contrib/test_ethosn/test_leaky_relu.py
index 3c3bbc709679..7c1969ec44ba 100644
--- a/tests/python/contrib/test_ethosn/test_leaky_relu.py
+++ b/tests/python/contrib/test_ethosn/test_leaky_relu.py
@@ -65,7 +65,7 @@ def test_leaky_relu(dtype, shape, alpha):
     for npu in [False, True]:
         model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype, alpha)
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_multiply.py b/tests/python/contrib/test_ethosn/test_multiply.py
index 41c06092447a..a7b97e39cb13 100644
--- a/tests/python/contrib/test_ethosn/test_multiply.py
+++ b/tests/python/contrib/test_ethosn/test_multiply.py
@@ -151,7 +151,9 @@ def test_multiply_to_reinterpret_quantize(shape, constant_shape, reverse_inputs)
     outputs = []
     for npu in [False, True]:
         mod = tei.make_module(model, params)
-        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+        outputs.append(
+            tei.build_and_run(mod, inputs, 1, params, npu=npu, optimize_partitions=False)
+        )
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index 5bd133ba20bb..68402cd5e8a9 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -146,7 +146,6 @@ def test_resnet_50_int8():
     # on hardware that isn't available in CI.
     _compile_hash = {
         "f16dc9caa8e696bc5da8a5c6a644eb72",
-        "6e5fcbab831607b9da1039aff4e56871",
         "41acecca37b2735bd580f6ec38d8c2e0",
     }
     _test_image_network(
@@ -156,8 +155,8 @@ def test_resnet_50_int8():
         input_dict={"input": (1, 224, 224, 3)},
         compile_hash=_compile_hash,
         output_count=1,
-        host_ops=9,
-        npu_partitions=3,
+        host_ops=10,
+        npu_partitions=2,
     )
 
 
diff --git a/tests/python/contrib/test_ethosn/test_relu.py b/tests/python/contrib/test_ethosn/test_relu.py
index db1894931dd9..8ecea0d23ce4 100644
--- a/tests/python/contrib/test_ethosn/test_relu.py
+++ b/tests/python/contrib/test_ethosn/test_relu.py
@@ -60,7 +60,7 @@ def test_relu(dtype, shape, a_min, a_max):
     for npu in [False, True]:
         model = _get_model(inputs["a"].shape, dtype, a_min, a_max)
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_requantize.py b/tests/python/contrib/test_ethosn/test_requantize.py
index 3187c22f3391..618b00c6e4ee 100644
--- a/tests/python/contrib/test_ethosn/test_requantize.py
+++ b/tests/python/contrib/test_ethosn/test_requantize.py
@@ -64,7 +64,7 @@ def test_requantize(in_dtype, out_dtype, shape):
             out_dtype=out_dtype,
         )
         mod = tei.make_module(model, [])
-        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu)
+        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False)
         outputs.append(x)
 
     tei.verify(outputs, out_dtype, 1)
@@ -128,7 +128,7 @@ def get_model():
     for npu in [False, True]:
         model = get_model()
         mod = tei.make_module(model, {})
-        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu)
+        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False)
         outputs.append(x)
 
     tei.verify(outputs, out_dtype, 1)
diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py
index 2d6eae9b2522..d60ad50b97bc 100644
--- a/tests/python/contrib/test_ethosn/test_reshape.py
+++ b/tests/python/contrib/test_ethosn/test_reshape.py
@@ -71,7 +71,9 @@ def test_reshape(dtype, input_shape, output_shape):
     for npu in [False, True]:
         model, params = _get_model(input_shape, output_shape, dtype)
         mod = tei.make_module(model, params)
-        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+        outputs.append(
+            tei.build_and_run(mod, inputs, 1, params, npu=npu, optimize_partitions=False)
+        )
 
     tei.verify(outputs, dtype, 1)
 
@@ -91,4 +93,4 @@ def test_reshape_failure(input_shape, output_shape):
 
     model, params = _get_model(input_shape, output_shape, "int8")
     mod = tei.make_module(model, params)
-    tei.build(mod, params, expected_host_ops=1, npu_partitions=0)
+    tei.build(mod, params, expected_host_ops=1, npu_partitions=0, optimize_partitions=False)
diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py
index 57335feadbba..56e51e2de159 100644
--- a/tests/python/contrib/test_ethosn/test_split.py
+++ b/tests/python/contrib/test_ethosn/test_split.py
@@ -56,7 +56,9 @@ def test_split(dtype, shape, splits, axis):
         model = _get_model(shape, dtype, splits, axis)
         mod = tei.make_module(model, {})
         output_count = splits if isinstance(splits, int) else len(splits) + 1
-        outputs.append(tei.build_and_run(mod, inputs, output_count, {}, npu=npu))
+        outputs.append(
+            tei.build_and_run(mod, inputs, output_count, {}, npu=npu, optimize_partitions=False)
+        )
 
         tei.verify(outputs, dtype, 0)
 
diff --git a/tests/python/contrib/test_ethosn/test_tanh.py b/tests/python/contrib/test_ethosn/test_tanh.py
index 68170601c5f8..c2fc5188e5f1 100644
--- a/tests/python/contrib/test_ethosn/test_tanh.py
+++ b/tests/python/contrib/test_ethosn/test_tanh.py
@@ -59,7 +59,7 @@ def test_tanh(dtype, shape):
     for npu in [False, True]:
         model = _get_model(shape, zp_min + 120, 0.0250629, zp_min + 128, 0.0078125, dtype)
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py
index 6425eb0faba3..4a4fc1e4d126 100644
--- a/tests/python/contrib/test_ethosn/test_topologies.py
+++ b/tests/python/contrib/test_ethosn/test_topologies.py
@@ -81,23 +81,18 @@ def get_model(input_shape, dtype, var_names):
         expected_host_ops = 0
         npu_partitions = 1
 
-        # Mock inference is only supported when the whole graph is offloaded to the NPU
-        if ethosn_available() == Available.SW_ONLY:
-            tei.build(
-                mod, {}, npu=npu, expected_host_ops=expected_host_ops, npu_partitions=npu_partitions
-            )
-        else:
-            outputs.append(
-                tei.build_and_run(
-                    mod,
-                    inputs,
-                    1,
-                    {},
-                    npu=npu,
-                    expected_host_ops=expected_host_ops,
-                    npu_partitions=npu_partitions,
-                )
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                {},
+                npu=npu,
+                expected_host_ops=expected_host_ops,
+                npu_partitions=npu_partitions,
+                optimize_partitions=False,
             )
+        )
 
     if outputs:
         tei.verify(outputs, dtype, 2)
@@ -183,7 +178,7 @@ def get_model(input_shape, dtype, var_names):
     for npu in [False, True]:
         model = get_model(inputs["a"].shape, dtype, iter(inputs))
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 8, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 8, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
@@ -291,6 +286,7 @@ def get_model(shape, dtype, splits, axis):
                 npu=npu,
                 expected_host_ops=expected_host_ops,
                 npu_partitions=npu_partitions,
+                optimize_partitions=False,
             )
         else:
             outputs.append(
@@ -302,6 +298,7 @@ def get_model(shape, dtype, splits, axis):
                     npu=npu,
                     expected_host_ops=expected_host_ops,
                     npu_partitions=npu_partitions,
+                    optimize_partitions=False,
                 )
             )
 
@@ -332,7 +329,7 @@ def get_model(dtype):
     for npu in [False, True]:
         model = get_model(dtype)
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 4, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 4, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 0)
 
@@ -381,7 +378,33 @@ def get_model(shapes, dtype, axis):
             mod = tei.make_module(model, {})
         else:
             mod = tei.make_ethosn_partition(model)
-        lib = tei.build(mod, {}, npu=False)
+        lib = tei.build(mod, {}, npu=False, optimize_partitions=False)
         outputs.append(tei.run(lib, inputs, 1, npu=npu))
 
     tei.verify(outputs, dtype, 0)
+
+
+@requires_ethosn
+def test_inline_non_compute_intensive_operations():
+    """Tests the case when a subgraph is unpartitioned."""
+    np.random.seed(0)
+    dtype = "int8"
+    shape = (1, 2, 2, 4)
+
+    inp = relay.var("x", shape=shape, dtype=dtype)
+    reshape = relay.reshape(inp, newshape=(1, 1, 4, 4))
+
+    inputs = {
+        "x": tvm.nd.array(
+            np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype)
+        ),
+    }
+    outputs = []
+
+    for npu in [False, True]:
+        mod = tei.make_module(reshape, {})
+        outputs.append(
+            tei.build_and_run(mod, inputs, 1, {}, npu=npu, expected_host_ops=1, npu_partitions=0)
+        )
+
+    tei.verify(outputs, dtype, 0)

From 47da418fbfba7c80a4556ee7c5a28595a169d3af Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 3 Nov 2022 17:22:50 +0000
Subject: [PATCH 499/704] [ETHOSN] Throw error message when inference fails
 (#13022)

* [ETHOSN] Throw error message when inference fails

Previously the runtime would silently skip interence failures and return
random values as the result. This can make spotting inference failures
challenging. The runtime now throws a fatal error when inference did not
complete successfully along with an error message that gives some
details about the error that occurred.

Change-Id: Iadb6da04ad1c906e3ec49959eb3da0978295aebf

* Address comments

* clarify test file brief
* add test case for running status
* add driver stack reference to WaitStatus class

Change-Id: I792742892b761534904816135ae2ffcb3f028b2c
---
 CMakeLists.txt                                |  3 +
 src/runtime/contrib/ethosn/ethosn_device.cc   | 70 +++++++++++-------
 src/runtime/contrib/ethosn/ethosn_runtime.h   | 33 +++++++++
 .../runtime/contrib/ethosn/inference_test.cc  | 74 +++++++++++++++++++
 4 files changed, 154 insertions(+), 26 deletions(-)
 create mode 100644 tests/cpp/runtime/contrib/ethosn/inference_test.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 71a6555d203a..d0e45c3d3a41 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -683,6 +683,9 @@ if(GTEST_FOUND)
   if(DEFINED LLVM_LIBS)
     target_link_libraries(cpptest PRIVATE ${LLVM_LIBS})
   endif()
+  if(DEFINED ETHOSN_RUNTIME_LIBRARY)
+    target_link_libraries(cpptest PRIVATE ${ETHOSN_RUNTIME_LIBRARY})
+  endif()
   set_target_properties(cpptest PROPERTIES EXCLUDE_FROM_ALL 1)
   set_target_properties(cpptest PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1)
   if(USE_RELAY_DEBUG)
diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc
index 612f4b4cec39..0d79f69815fa 100644
--- a/src/runtime/contrib/ethosn/ethosn_device.cc
+++ b/src/runtime/contrib/ethosn/ethosn_device.cc
@@ -32,6 +32,7 @@
 
 #include <algorithm>
 #include <memory>
+#include <string>
 
 #include "ethosn_driver_library/Buffer.hpp"
 #include "ethosn_runtime.h"
@@ -48,7 +49,7 @@ namespace ethosn {
 
 namespace dl = ::ethosn::driver_library;
 
-bool WaitForInference(dl::Inference* inference, int timeout) {
+InferenceWaitStatus WaitForInference(dl::Inference* inference, int timeout) {
   // Wait for inference to complete
   int fd = inference->GetFileDescriptor();
   struct pollfd fds;
@@ -58,20 +59,32 @@ bool WaitForInference(dl::Inference* inference, int timeout) {
 
   const int ms_per_seconds = 1000;
   int poll_result = poll(&fds, 1, timeout * ms_per_seconds);
-  if (poll_result > 0) {
-    dl::InferenceResult result;
-    if (read(fd, &result, sizeof(result)) != sizeof(result)) {
-      return false;
-    }
-    if (result != dl::InferenceResult::Completed) {
-      return false;
-    }
+  int poll_error_code = errno;
+
+  if (poll_result < 0) {
+    return InferenceWaitStatus(InferenceWaitErrorCode::kError,
+                               "Error while waiting for the inference to complete (" +
+                                   std::string(strerror(poll_error_code)) + ")");
   } else if (poll_result == 0) {
-    return false;
-  } else {
-    return false;
+    return InferenceWaitStatus(InferenceWaitErrorCode::kTimeout,
+                               "Timed out while waiting for the inference to complete.");
   }
-  return true;
+
+  // poll_result > 0
+  dl::InferenceResult npu_result;
+  if (read(fd, &npu_result, sizeof(npu_result)) != static_cast<ssize_t>(sizeof(npu_result))) {
+    return InferenceWaitStatus(
+        InferenceWaitErrorCode::kError,
+        "Failed to read inference result status (" + std::string(strerror(poll_error_code)) + ")");
+  }
+
+  if (npu_result != dl::InferenceResult::Completed) {
+    return InferenceWaitStatus(
+        InferenceWaitErrorCode::kError,
+        "Inference failed with status " + std::to_string(static_cast<uint32_t>(npu_result)));
+  }
+
+  return InferenceWaitStatus(InferenceWaitErrorCode::kSuccess);
 }
 
 void CreateBuffers(std::vector<std::shared_ptr<dl::Buffer>>* fm,
@@ -123,21 +136,26 @@ bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
   }
 
   // Execute the inference.
-  std::unique_ptr<dl::Inference> result(
+  std::unique_ptr<dl::Inference> inference(
       npu->ScheduleInference(ifm_raw, n_inputs, ofm_raw, n_outputs));
-  bool inferenceCompleted = WaitForInference(result.get(), 60);
-  if (inferenceCompleted) {
-    for (size_t i = 0; i < n_outputs; i++) {
-      DLTensor* tensor = outputs[i];
-      dl::Buffer* source_buffer = ofm_raw[i];
-      uint8_t* dest_buffer = static_cast<uint8_t*>(tensor->data);
-      size_t size = source_buffer->GetSize();
-      uint8_t* source_buffer_data = source_buffer->Map();
-      std::copy(source_buffer_data, source_buffer_data + size, dest_buffer);
-      source_buffer->Unmap();
-    }
+  InferenceWaitStatus result = WaitForInference(inference.get(), 60);
+
+  if (result.GetErrorCode() != InferenceWaitErrorCode::kSuccess) {
+    LOG(FATAL) << "An error has occured waiting for the inference of a sub-graph on the NPU: "
+               << result.GetErrorDescription();
+  }
+
+  for (size_t i = 0; i < n_outputs; i++) {
+    DLTensor* tensor = outputs[i];
+    dl::Buffer* source_buffer = ofm_raw[i];
+    uint8_t* dest_buffer = static_cast<uint8_t*>(tensor->data);
+    size_t size = source_buffer->GetSize();
+    uint8_t* source_buffer_data = source_buffer->Map();
+    std::copy(source_buffer_data, source_buffer_data + size, dest_buffer);
+    source_buffer->Unmap();
   }
-  return inferenceCompleted;
+
+  return true;
 }
 
 }  // namespace ethosn
diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.h b/src/runtime/contrib/ethosn/ethosn_runtime.h
index 7c8c32e784be..b8942fef12d9 100644
--- a/src/runtime/contrib/ethosn/ethosn_runtime.h
+++ b/src/runtime/contrib/ethosn/ethosn_runtime.h
@@ -107,6 +107,39 @@ class EthosnModule : public ModuleNode {
   std::map<std::string, OrderedCompiledNetwork> network_map_;
 };
 
+/*!
+ * \brief Error codes for evaluating the result of inference on the NPU.
+ */
+enum class InferenceWaitErrorCode { kSuccess = 0, kTimeout = 1, kError = 2 };
+
+/*!
+ * \brief A helper class holding the status of inference on the NPU and
+ * associated error message(s) if any occurred.
+ *
+ * Similar to the implementation of 'WaitStatus' in the driver stack:
+ * https://github.com/ARM-software/ethos-n-driver-stack/blob/22.08/armnn-ethos-n-backend/workloads/EthosNPreCompiledWorkload.cpp#L48
+ */
+class InferenceWaitStatus {
+ public:
+  InferenceWaitStatus() : error_code_(InferenceWaitErrorCode::kSuccess), error_description_("") {}
+
+  explicit InferenceWaitStatus(InferenceWaitErrorCode errorCode, std::string errorDescription = "")
+      : error_code_(errorCode), error_description_(errorDescription) {}
+
+  InferenceWaitStatus(const InferenceWaitStatus&) = default;
+  InferenceWaitStatus(InferenceWaitStatus&&) = default;
+  InferenceWaitStatus& operator=(const InferenceWaitStatus&) = default;
+  InferenceWaitStatus& operator=(InferenceWaitStatus&&) = default;
+
+  explicit operator bool() const { return error_code_ == InferenceWaitErrorCode::kSuccess; }
+  InferenceWaitErrorCode GetErrorCode() const { return error_code_; }
+  std::string GetErrorDescription() const { return error_description_; }
+
+ private:
+  InferenceWaitErrorCode error_code_;
+  std::string error_description_;
+};
+
 }  // namespace ethosn
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/cpp/runtime/contrib/ethosn/inference_test.cc b/tests/cpp/runtime/contrib/ethosn/inference_test.cc
new file mode 100644
index 000000000000..95b27070e19a
--- /dev/null
+++ b/tests/cpp/runtime/contrib/ethosn/inference_test.cc
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tests/cpp/runtime/contrib/ethosn/inference_test.cc
+ * \brief Tests to check Arm(R) Ethos(TM)-N runtime components used during inference.
+ */
+
+#ifdef ETHOSN_HW
+
+#include <gtest/gtest.h>
+
+#include "../../../../../src/runtime/contrib/ethosn/ethosn_device.cc"
+
+namespace tvm {
+namespace runtime {
+namespace ethosn {
+
+TEST(WaitForInference, InferenceScheduled) {
+  const int inference_result = 0 /* Scheduled */;
+  const int timeout = 0;
+
+  dl::Inference inference = dl::Inference(inference_result);
+  InferenceWaitStatus result = WaitForInference(&inference, timeout);
+
+  ASSERT_EQ(result.GetErrorCode(), InferenceWaitErrorCode::kTimeout);
+  ICHECK_EQ(result.GetErrorDescription(), "Timed out while waiting for the inference to complete.");
+}
+
+TEST(WaitForInference, InferenceRunning) {
+  const int inference_result = 1 /* Running */;
+  const int timeout = 0;
+
+  dl::Inference inference = dl::Inference(inference_result);
+  InferenceWaitStatus result = WaitForInference(&inference, timeout);
+
+  ASSERT_EQ(result.GetErrorCode(), InferenceWaitErrorCode::kTimeout);
+  std::cout << result.GetErrorDescription() << std::endl;
+  ICHECK_EQ(result.GetErrorDescription(), "Timed out while waiting for the inference to complete.");
+}
+
+TEST(WaitForInference, InferenceError) {
+  const int inference_result = 3 /* Error */;
+  const int timeout = 0;
+
+  dl::Inference inference = dl::Inference(inference_result);
+  InferenceWaitStatus result = WaitForInference(&inference, timeout);
+
+  ASSERT_EQ(result.GetErrorCode(), InferenceWaitErrorCode::kError);
+  ICHECK_EQ(result.GetErrorDescription(),
+            "Failed to read inference result status (No such file or directory)");
+}
+
+}  // namespace ethosn
+}  // namespace runtime
+}  // namespace tvm
+
+#endif

From 1d1db352367e3dd435a58f41283ac75cdf9d8858 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Thu, 3 Nov 2022 14:15:17 -0500
Subject: [PATCH 500/704] [MetaSchedule] Fix Task Hanging in EvolutionarySearch
 (#13246)

This PR introduces a new argument for EvolutionarySearch that limits the failures (defined as rounds of no new generated candidate) in the `SampleInitPopulation` stage. In this way we can avoid the task to be hanging forever in special cases, e.g., some postproc always fails. This should fix #12330.
---
 include/tvm/meta_schedule/search_strategy.h   |  2 +
 .../search_strategy/evolutionary_search.py    |  4 ++
 .../search_strategy/evolutionary_search.cc    | 18 +++++-
 .../test_meta_schedule_search_strategy.py     | 56 +++++++++++++++++++
 4 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/include/tvm/meta_schedule/search_strategy.h b/include/tvm/meta_schedule/search_strategy.h
index c2399eef0824..3f44a2438d22 100644
--- a/include/tvm/meta_schedule/search_strategy.h
+++ b/include/tvm/meta_schedule/search_strategy.h
@@ -200,6 +200,7 @@ class SearchStrategy : public runtime::ObjectRef {
    * \param population_size The initial sample population.
    * \param init_measured_ratio The ratio of measures samples in initial population.
    * \param init_min_unmeasured The minimal size of unmeasured population in the initial sampling.
+   * \param max_fail_count The max number of failure during initial sampling.
    * \param genetic_num_iters The iterations to run the genetic algorithm.
    * \param genetic_mutate_prob The probability of mutation.
    * \param genetic_max_fail_count The maximum number to try evolving the given trace.
@@ -208,6 +209,7 @@ class SearchStrategy : public runtime::ObjectRef {
   TVM_DLL static SearchStrategy EvolutionarySearch(int population_size,         //
                                                    double init_measured_ratio,  //
                                                    int init_min_unmeasured,     //
+                                                   int max_fail_count,          //
                                                    int genetic_num_iters,       //
                                                    double genetic_mutate_prob,  //
                                                    int genetic_max_fail_count,  //
diff --git a/python/tvm/meta_schedule/search_strategy/evolutionary_search.py b/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
index 2851ebe7b1d1..65e7ddc468b5 100644
--- a/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
+++ b/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
@@ -35,6 +35,8 @@ class EvolutionarySearch(SearchStrategy):
         The ratio of measured samples in the initial population.
     init_min_unmeasured : int
         The minimal size of unmeasured population in the initial sampling.
+    max_fail_count : int
+        The maximum number of failure during initial sampling.
     genetic_num_iters : int
         The number of iterations for genetic algorithm.
     genetic_mutate_prob : float
@@ -59,6 +61,7 @@ def __init__(
         population_size: int = 2048,
         init_measured_ratio: float = 0.2,
         init_min_unmeasured: int = 50,
+        max_fail_count: int = 5,
         genetic_num_iters: int = 4,
         genetic_mutate_prob: float = 0.85,
         genetic_max_fail_count: int = 10,
@@ -70,6 +73,7 @@ def __init__(
             population_size,
             init_measured_ratio,
             init_min_unmeasured,
+            max_fail_count,
             genetic_num_iters,
             genetic_mutate_prob,
             genetic_max_fail_count,
diff --git a/src/meta_schedule/search_strategy/evolutionary_search.cc b/src/meta_schedule/search_strategy/evolutionary_search.cc
index 2cc45e01bbaf..cc9995123951 100644
--- a/src/meta_schedule/search_strategy/evolutionary_search.cc
+++ b/src/meta_schedule/search_strategy/evolutionary_search.cc
@@ -365,6 +365,8 @@ class EvolutionarySearchNode : public SearchStrategyNode {
   double init_measured_ratio;
   /*! \brief The minimal size of unmeasured population in the initial sampling.*/
   int init_min_unmeasured;
+  /*! \brief The maximum number of failure during initial sampling. */
+  int max_fail_count;
   /*** Configuration: evolution ***/
   /*! \brief The number of iterations performed by generic algorithm. */
   int genetic_num_iters;
@@ -387,6 +389,7 @@ class EvolutionarySearchNode : public SearchStrategyNode {
     /*** Configuration: the initial population ***/
     v->Visit("init_measured_ratio", &init_measured_ratio);
     v->Visit("init_min_unmeasured", &init_min_unmeasured);
+    v->Visit("max_fail_count", &max_fail_count);
     /*** Configuration: evolution ***/
     v->Visit("genetic_num_iters", &genetic_num_iters);
     v->Visit("genetic_mutate_prob", &genetic_mutate_prob);
@@ -456,6 +459,7 @@ class EvolutionarySearchNode : public SearchStrategyNode {
     n->num_empty_iters_before_early_stop = this->num_empty_iters_before_early_stop;
     n->init_measured_ratio = this->init_measured_ratio;
     n->init_min_unmeasured = this->init_min_unmeasured;
+    n->max_fail_count = this->max_fail_count;
     n->genetic_num_iters = this->genetic_num_iters;
     n->genetic_mutate_prob = this->genetic_mutate_prob;
     n->genetic_max_fail_count = this->genetic_max_fail_count;
@@ -501,7 +505,9 @@ std::vector<Schedule> EvolutionarySearchNode::State::SampleInitPopulation(int nu
   auto _ = Profiler::TimedScope("EvoSearch/SampleInitPopulation");
   ThreadedTraceApply pp(self->postprocs_);
   std::vector<Schedule> out_schs;
-  while (static_cast<int>(out_schs.size()) < self->init_min_unmeasured) {
+  int fail_count = 0;
+  while (static_cast<int>(out_schs.size()) < self->init_min_unmeasured &&
+         fail_count < self->max_fail_count) {
     std::vector<Schedule> results(num, Schedule{nullptr});
     auto f_proc_unmeasured = [this, &results, &pp](int thread_id, int trace_id) -> void {
       PerThreadData& data = this->per_thread_data_.at(thread_id);
@@ -516,11 +522,14 @@ std::vector<Schedule> EvolutionarySearchNode::State::SampleInitPopulation(int nu
       }
     };
     support::parallel_for_dynamic(0, num, self->ctx_->num_threads, f_proc_unmeasured);
+    bool found_new = false;
     for (int i = 0; i < num; i++) {
       if (results[i].defined()) {
+        found_new = true;
         out_schs.push_back(results[i]);
       }
     }
+    fail_count += !found_new;
     TVM_PY_LOG(INFO, self->ctx_->logger) << "Sample-Init-Population summary:\n"
                                          << pp.SummarizeFailures();
   }
@@ -706,6 +715,11 @@ Optional<Array<MeasureCandidate>> EvolutionarySearchNode::State::GenerateMeasure
   TVM_PY_LOG(INFO, self->ctx_->logger)
       << "Picked top " << measured.size() << " candidate(s) from database";
   std::vector<Schedule> unmeasured = SampleInitPopulation(pop - measured.size());
+  if (static_cast<int>(unmeasured.size()) < self->init_min_unmeasured) {
+    TVM_PY_LOG(WARNING, self->ctx_->logger)
+        << "Cannot sample enough initial population, evolutionary search failed.";
+    return NullOpt;
+  }
   TVM_PY_LOG(INFO, self->ctx_->logger) << "Sampled " << unmeasured.size() << " candidate(s)";
   inits.insert(inits.end(), measured.begin(), measured.end());
   inits.insert(inits.end(), unmeasured.begin(), unmeasured.end());
@@ -737,6 +751,7 @@ size_t EvolutionarySearchNode::State::ModuleHash(const IRModule& mod) const {
 SearchStrategy SearchStrategy::EvolutionarySearch(int population_size,         //
                                                   double init_measured_ratio,  //
                                                   int init_min_unmeasured,     //
+                                                  int max_fail_count,          //
                                                   int genetic_num_iters,       //
                                                   double genetic_mutate_prob,  //
                                                   int genetic_max_fail_count,  //
@@ -749,6 +764,7 @@ SearchStrategy SearchStrategy::EvolutionarySearch(int population_size,         /
   n->num_empty_iters_before_early_stop = 5;
   n->init_measured_ratio = init_measured_ratio;
   n->init_min_unmeasured = init_min_unmeasured;
+  n->max_fail_count = max_fail_count;
   n->genetic_num_iters = genetic_num_iters;
   n->genetic_max_fail_count = genetic_max_fail_count;
   n->genetic_mutate_prob = genetic_mutate_prob;
diff --git a/tests/python/unittest/test_meta_schedule_search_strategy.py b/tests/python/unittest/test_meta_schedule_search_strategy.py
index e34554420600..29c20ced0488 100644
--- a/tests/python/unittest/test_meta_schedule_search_strategy.py
+++ b/tests/python/unittest/test_meta_schedule_search_strategy.py
@@ -22,6 +22,7 @@
 import tvm
 import tvm.testing
 from tvm import meta_schedule as ms
+from tvm.meta_schedule.utils import derived_object
 from tvm.meta_schedule.testing.dummy_object import DummyMutator
 from tvm.script import tir as T
 from tvm.tir.schedule import Schedule, Trace
@@ -251,8 +252,63 @@ def _schedule_matmul_empty(sch: Schedule):
     assert num_trials_each_iter == [1, 0, 0, 0, 0]
 
 
+def test_meta_schedule_evolutionary_search_fail_init_population():  # pylint: disable = invalid-name
+    @derived_object
+    class AlwaysFailPostproc(ms.postproc.PyPostproc):
+        """A postproc that always fails."""
+
+        def _initialize_with_tune_context(self, context: ms.TuneContext) -> None:
+            pass
+
+        def apply(self, sch: Schedule) -> bool:
+            return False
+
+        def clone(self) -> "AlwaysFailPostproc":
+            return AlwaysFailPostproc()
+
+        def __str__(self) -> str:
+            return "AlwaysFailPostproc"
+
+    num_trials_per_iter = 10
+    max_trials_per_task = 2000
+
+    context = ms.TuneContext(
+        mod=Matmul,
+        space_generator=ms.space_generator.ScheduleFn(
+            sch_fn=_schedule_matmul,
+            sch_rules=[],
+            postprocs=[AlwaysFailPostproc()],
+            mutator_probs={
+                DummyMutator(): 1.0,
+            },
+        ),
+        search_strategy=ms.search_strategy.EvolutionarySearch(
+            population_size=5,
+            init_measured_ratio=0.1,
+            init_min_unmeasured=50,
+            genetic_num_iters=3,
+            genetic_mutate_prob=0.5,
+            genetic_max_fail_count=10,
+            eps_greedy=0.9,
+        ),
+        target=tvm.target.Target("llvm"),
+        num_threads=1,  # because we are using a mutator from the python side
+    )
+    strategy = context.search_strategy
+    strategy.pre_tuning(
+        max_trials=max_trials_per_task,
+        num_trials_per_iter=num_trials_per_iter,
+        design_spaces=context.space_generator.generate_design_space(context.mod),
+        database=ms.database.MemoryDatabase(),
+        cost_model=ms.cost_model.RandomModel(),
+    )
+    candidates = strategy.generate_measure_candidates()
+    assert candidates is None
+
+
 if __name__ == "__main__":
     test_meta_schedule_replay_func(ms.search_strategy.ReplayFunc)
     test_meta_schedule_replay_func(ms.search_strategy.ReplayTrace)
     test_meta_schedule_evolutionary_search()
     test_meta_schedule_evolutionary_search_early_stop()
+    test_meta_schedule_evolutionary_search_fail_init_population()

From 215f0e2fc2b13857d73c3d81e2638388d63207c5 Mon Sep 17 00:00:00 2001
From: Sunghyun Park <49998730+sunggg@users.noreply.github.com>
Date: Thu, 3 Nov 2022 12:15:45 -0700
Subject: [PATCH 501/704] [Bugfix][TIR] Fix version conflict with `typing` for
 Python 3.9  (#13269)

Current type checker for TIR schedule had issue with typing for Python 3.9.
This simple patch fixes this problem.
---
 python/tvm/tir/schedule/_type_checker.py | 49 +++++++++++++++++++-----
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/python/tvm/tir/schedule/_type_checker.py b/python/tvm/tir/schedule/_type_checker.py
index 0c66f7ef6cdf..4130e76e0892 100644
--- a/python/tvm/tir/schedule/_type_checker.py
+++ b/python/tvm/tir/schedule/_type_checker.py
@@ -21,55 +21,81 @@
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, Union
 import typing
+import sys
 
 
 def _is_none_type(type_: Any) -> bool:
     return type_ is None or type_ is type(None)
 
 
+def get_python_version():
+    return sys.version_info[:3]
+
+
 if hasattr(typing, "_GenericAlias"):
     # For python versions 3.7 onward, check the __origin__ attribute.
 
     class _Subtype:
         @staticmethod
         def _origin(type_: Any) -> Any:
-            if isinstance(type_, typing._GenericAlias):  # type: ignore # pylint: disable=protected-access
-                return type_.__origin__
+            if get_python_version() >= (3, 9, 0):
+                if isinstance(type_, typing._SpecialGenericAlias):  # type: ignore # pylint: disable=protected-access
+                    return type_.__origin__
+            else:
+                if isinstance(type_, typing._GenericAlias):  # type: ignore # pylint: disable=protected-access
+                    return type_.__origin__
             return None
 
         @staticmethod
         def list_(type_: Any) -> Any:
             if _Subtype._origin(type_) is list:
-                (subtype,) = type_.__args__
+                if hasattr(typing, "get_args"):
+                    (subtype,) = typing.get_args(type_)  # type: ignore
+                else:
+                    (subtype,) = type_.__args__
                 return [subtype]
             return None
 
         @staticmethod
         def dict_(type_: Any) -> Any:
             if _Subtype._origin(type_) is dict:
-                (ktype, vtype) = type_.__args__
+                if hasattr(typing, "get_args"):
+                    (ktype, vtype) = typing.get_args(type_)  # type: ignore
+                else:
+                    (ktype, vtype) = type_.__args__
                 return [ktype, vtype]
             return None
 
         @staticmethod
         def tuple_(type_: Any) -> Optional[List[type]]:
             if _Subtype._origin(type_) is tuple:
-                subtypes = type_.__args__
+                if hasattr(typing, "get_args"):
+                    subtypes = typing.get_args(type_)  # type: ignore
+                else:
+                    subtypes = type_.__args__
                 return subtypes
             return None
 
         @staticmethod
-        def optional(type_: Any) -> Optional[List[type]]:
+        def optional(  # pylint: disable=missing-function-docstring
+            type_: Any,
+        ) -> Optional[List[type]]:
             if _Subtype._origin(type_) is Union:
-                subtypes = type_.__args__
+                if hasattr(typing, "get_args"):
+                    subtypes = typing.get_args(type_)  # type: ignore
+                else:
+                    subtypes = type_.__args__
                 if len(subtypes) == 2 and _is_none_type(subtypes[1]):
                     return [subtypes[0]]
             return None
 
         @staticmethod
-        def union(type_: Any) -> Optional[List[type]]:
+        def union(type_: Any) -> Optional[List[type]]:  # pylint: disable=missing-function-docstring
             if _Subtype._origin(type_) is Union:
-                subtypes = type_.__args__
+                if hasattr(typing, "get_args"):
+                    subtypes = typing.get_args(type_)  # type: ignore
+                else:
+                    subtypes = type_.__args__
                 if len(subtypes) != 2 or not _is_none_type(subtypes[1]):
                     return list(subtypes)
             return None
@@ -77,7 +103,10 @@ def union(type_: Any) -> Optional[List[type]]:
         @staticmethod
         def callable(type_: Any) -> Optional[List[type]]:
             if _Subtype._origin(type_) is collections.abc.Callable:
-                subtypes = type_.__args__
+                if hasattr(typing, "get_args"):
+                    subtypes = typing.get_args(type_)  # type: ignore
+                else:
+                    subtypes = type_.__args__
                 return subtypes
             return None
 

From b98b9f92daa8dc3694707ed536ef0262767df18b Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Thu, 3 Nov 2022 15:16:34 -0400
Subject: [PATCH 502/704] [MetaSchedule] Improve the script for TorchBench
 model tuning & benchmarking (#13255)

This PR adds features to the `python/tvm/meta_schedule/testing/torchbench/run.py`.

- Integrate with the TVM PyTorch integration to handle boolean tensor and unaligned memory.
- Deduplicate collected tuning tasks to prevent thousands of tasks created by hundreds of subgraphs with similar structure.
- Add option to cast model to float32, which are more stable numerically than float16 and prevents inaccurate result from many models.
- Add option to choose search strategy in MetaSchedule.
- Inspect output error if the actual output doesn't match the expectation. Also save the actual output and expected output for further analysis if needed.
- Save subgraphs and their example input for debug purpose.
- Print MetaSchedule profiling information at the end of execution.
- Detach PyTorch tensor before exporting to dlpack.
- Fix the sys path to avoid conflict with the `benchmarks` package installed by TorchBench dependency.
- Trim all command line args passed in, in order to prevent breaking some TorchBench model that depends on args.
- Empty cuda cache before starting the actual benchmark.
---
 .../meta_schedule/testing/torchbench/run.py   | 272 ++++++++++++++----
 .../meta_schedule/testing/torchbench/utils.py |   8 +-
 2 files changed, 218 insertions(+), 62 deletions(-)

diff --git a/python/tvm/meta_schedule/testing/torchbench/run.py b/python/tvm/meta_schedule/testing/torchbench/run.py
index 20c633196900..5df77cf25c3f 100644
--- a/python/tvm/meta_schedule/testing/torchbench/run.py
+++ b/python/tvm/meta_schedule/testing/torchbench/run.py
@@ -89,19 +89,25 @@
 ```
 """
 # pylint: disable=logging-format-interpolation
+
 import argparse
-import functools
+import contextlib
 import logging
+import os
+import sys
 import warnings
+from collections import defaultdict
 from enum import Enum
-from typing import Callable, List, Tuple
+from typing import Callable, List, Tuple, Dict
 
 import numpy as np  # type: ignore
 import torch  # type: ignore
+from scipy.stats import ttest_ind  # type: ignore
+
 import tvm
 import tvm.relay
-from scipy.stats import ttest_ind  # type: ignore
 from tvm import meta_schedule as ms
+from tvm._ffi import get_global_func
 from tvm.contrib.graph_executor import GraphModule
 from tvm.meta_schedule.testing.torchbench.utils import (
     load_torchdynamo_benchmark_runner,
@@ -201,6 +207,13 @@ def parse_args():
         https://github.com/pytorch/benchmark/tree/main/torchbenchmark/models.
         """,
     )
+    args.add_argument(
+        "--float32",
+        action="store_true",
+        help="""
+        Cast model and inputs to fp32
+        """,
+    )
 
     # Tuning-related config
     args.add_argument(
@@ -217,6 +230,12 @@ def parse_args():
         The working directory to save intermediate results and store databases for compilation.
         """,
     )
+    args.add_argument(
+        "--strategy",
+        type=str,
+        default="evolutionary",
+        help="The search strategy used by MetaSchdule.",
+    )
     args.add_argument(
         "--num-trials",
         type=int,
@@ -293,6 +312,10 @@ def parse_args():
     )
 
     parsed = args.parse_args()
+
+    # Trim all args, otherwise it confuses the arg parser of timm_efficientdet
+    sys.argv = sys.argv[:1]
+
     return parsed
 
 
@@ -311,6 +334,7 @@ def parse_args():
 runner = load_torchdynamo_benchmark_runner(  # pylint: disable=invalid-name
     IS_CUDA,
     cosine_similarity=ARGS.result_metric == ResultComparisonMetric.COSINE,
+    float32=ARGS.float32,
 )
 
 
@@ -343,26 +367,49 @@ def get_meta_schedule_runner() -> ms.runner.PyRunner:
         return ms.runner.LocalRunner()
 
 
-def get_graph_executor_forward(mod: GraphModule, device: tvm.runtime.Device) -> Callable:
+def get_graph_executor_forward(
+    graph_executor_factory: tvm.runtime.Module, device: tvm.runtime.Device
+) -> Callable:
     """
     Get the forward function for graph executor, in order to integrate with TorchDynamo.
     """
 
-    def forward(*args):
-        if IS_CUDA:
-            torch.cuda.synchronize()
-        args = tuple(arg.contiguous() for arg in args)
-        for idx, arg in enumerate(args, 0):
-            mod.set_input(
-                f"inp_{idx}",
-                tvm.nd.from_dlpack(arg),
-            )
-        mod.run()
-        device.sync()
-        result = [torch.from_dlpack(mod.get_output(i)) for i in range(mod.get_num_outputs())]
-        return result
+    # It has to lazily import this package, loading the C++ PyTorch integration
+    # after the transformers package is imported when loading model. Otherwise
+    # there will be segfault caused by the protobuf library.
+    import tvm.contrib.torch  # pylint: disable=import-outside-toplevel, unused-import, redefined-outer-name
 
-    return forward
+    save_runtime_mod = get_global_func("tvmtorch.save_runtime_mod", allow_missing=True)
+    if save_runtime_mod is None:
+        warnings.warn(
+            "C++ PyTorch TVM integration is missing. Fallback to Python forward function."
+            "Build TVM with 'USE_PT_TVMDSOOP' to enable the C++ custom operator"
+        )
+        mod = GraphModule(graph_executor_factory["default"](device))
+
+        def forward(*args):
+            if IS_CUDA:
+                torch.cuda.synchronize()
+            args = tuple(arg.detach().contiguous() for arg in args)
+            for idx, arg in enumerate(args, 0):
+                mod.set_input(
+                    f"inp_{idx}",
+                    tvm.nd.from_dlpack(arg),
+                )
+            mod.run()
+            device.sync()
+            result = [torch.from_dlpack(mod.get_output(i)) for i in range(mod.get_num_outputs())]
+            return result
+
+        return forward
+    else:
+        save_runtime_mod(graph_executor_factory.module)
+        module = torch.classes.tvm_torch.GraphExecutorFactoryWrapper()
+
+        def forward(*args):  # type: ignore  # isort: skip, pylint: disable=function-redefined
+            return module.forward(args)
+
+        return forward
 
 
 def get_vm_forward(virtual_machine: VirtualMachine, device: tvm.runtime.Device) -> Callable:
@@ -373,7 +420,7 @@ def get_vm_forward(virtual_machine: VirtualMachine, device: tvm.runtime.Device)
     def forward(*args):
         if IS_CUDA:
             torch.cuda.synchronize()
-        args = tuple(tvm.nd.from_dlpack(arg.contiguous()) for arg in args)
+        args = tuple(tvm.nd.from_dlpack(arg.detach().contiguous()) for arg in args)
         result = virtual_machine.invoke("main", *args)
         device.sync()
 
@@ -384,13 +431,36 @@ def forward(*args):
     return forward
 
 
-def create_tvm_task_collection_backend(tasks: List[ms.ExtractedTask]) -> Callable:
+def create_tvm_task_collection_backend() -> Tuple[Callable, List[ms.ExtractedTask]]:
     """
     This torchdynamo backend only collects the extracted tasks from MetaSchedule.
     It doesn't tune the model.
     """
 
+    subgraph_idx = 0
+    subgraphs_dir = os.path.join(ARGS.work_dir, "subgraphs")
+    os.makedirs(subgraphs_dir, exist_ok=True)
+
+    collected_tasks = []
+    task_index: Dict[int, List[ms.ExtractedTask]] = defaultdict(list)
+
+    def collect_task(task):
+        task_hash = tvm.ir.structural_hash(task.dispatched[0])
+
+        for duplicate_task in task_index[task_hash]:
+            if tvm.ir.structural_equal(duplicate_task.dispatched[0], task.dispatched[0]):
+                duplicate_task.weight += task.weight
+                return
+
+        task_index[task_hash].append(task)
+        collected_tasks.append(task)
+
     def backend(graph_module, example_inputs):
+        nonlocal subgraph_idx
+
+        torch.save(graph_module, os.path.join(subgraphs_dir, f"graph_module_{subgraph_idx}"))
+        torch.save(example_inputs, os.path.join(subgraphs_dir, f"example_inputs_{subgraph_idx}"))
+
         jit_mod = torch.jit.trace(graph_module, example_inputs)
         shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
         ir_mod, params = tvm.relay.frontend.from_pytorch(jit_mod, shape_list)
@@ -400,12 +470,21 @@ def backend(graph_module, example_inputs):
             target=ARGS.target,
             params=params,
         )
-        logger.info("Extracted %d tasks", len(extracted_tasks))
-        tasks.extend(extracted_tasks)
+        old_tasks_count = len(collected_tasks)
+        for task in extracted_tasks:
+            collect_task(task)
+        logger.info(
+            "Extracted %d tasks from graph %d, with %d new tasks",
+            len(extracted_tasks),
+            subgraph_idx,
+            len(collected_tasks) - old_tasks_count,
+        )
+
+        subgraph_idx += 1
 
         return graph_module.forward
 
-    return backend
+    return backend, collected_tasks
 
 
 def create_tvm_compilation_backend(database: ms.database.Database) -> Callable:
@@ -429,8 +508,7 @@ def backend(graph_module, example_inputs):
         device = tvm.cuda(0) if IS_CUDA else tvm.cpu(0)
 
         if ARGS.backend == "graph":
-            mod = GraphModule(lib["default"](device))
-            return get_graph_executor_forward(mod, device)
+            return get_graph_executor_forward(lib, device)
         elif ARGS.backend == "vm":
             vm = VirtualMachine(lib, device)  # pylint: disable=invalid-name
             return get_vm_forward(vm, device)
@@ -463,6 +541,67 @@ def is_output_correct(output: torch.Tensor, expected: torch.Tensor) -> bool:
         raise RuntimeError(f"Unknown comparison metric {comparison_metric}")
 
 
+def inspect_output_error(output, expected):
+    """
+    Inpsect the error between the actual output and expected output.
+    """
+    if not isinstance(output, torch.Tensor):
+        logger.info(
+            f"Unsupported type for error inspection: {type(output).__name__}."
+            f"Please manually check output.pt"
+        )
+        return
+    output = output.cpu().float()
+    expected = expected.cpu().float()
+
+    abs_error = (output - expected).abs()
+    rel_error = (abs_error / expected).abs()
+
+    def format_error_table(error, bins) -> str:
+        bin_tensor = torch.as_tensor([float(b) for b in bins], dtype=error.dtype)
+        error_hist = torch.histogram(error, bin_tensor).hist.int()
+        return "\n".join(f"< {b}\t{e}" for e, b in zip(error_hist, bins[1:]))
+
+    abs_error_bins = [
+        "-1e10",
+        "0",
+        "1e-8",
+        "1e-6",
+        "1e-5",
+        "1e-4",
+        "1e-3",
+        "1e-2",
+        "1e-1",
+        "1",
+        "1e10",
+    ]
+    rel_error_bins = [
+        "-1e10",
+        "0",
+        "1e-4",
+        "1e-3",
+        "1e-2",
+        "1e-1",
+        "1",
+        "1e1",
+        "1e2",
+        "1e3",
+        "1e100",
+    ]
+
+    large_rel_error_idx = rel_error > 1
+    abs_error_with_large_rel_error = abs_error[large_rel_error_idx]
+
+    logger.error(f"Expected (PyTorch eager): {expected}")
+    logger.error(f"Actual (Optimized): {output}")
+    logger.error(f"Absolute Error\n{format_error_table(abs_error, abs_error_bins)}")
+    logger.error(f"Relative Error\n{format_error_table(rel_error, rel_error_bins)}")
+    logger.error(
+        f"Max absolute error for position with large relative error (> 1):"
+        f"{abs_error_with_large_rel_error.max()}"
+    )
+
+
 def performance_experiment(
     model_iter_fn: Callable,
     model: torch.nn.Module,
@@ -473,6 +612,8 @@ def performance_experiment(
     Simplified from https://github.com/pytorch/torchdynamo/blob/c537639f9712621dc04ca09908796dbbe86c354b/benchmarks/common.py#L494 pylint: disable=line-too-long
     """
     timings = np.zeros((ARGS.benchmark_repeat, 2), np.float64)
+    if IS_CUDA:
+        torch.cuda.empty_cache()
 
     is_correct = True
 
@@ -500,10 +641,11 @@ def performance_experiment(
         f"optimized:{format_time(median[1])} "
         f"speedup:{speedup:.3f}x p:{pvalue:.3f}"
     )
+    torch.save(actual_output, os.path.join(ARGS.work_dir, "output.pt"))
+    torch.save(expected_output, os.path.join(ARGS.work_dir, "expected.pt"))
     if not is_correct:
         logger.error("Result is incorrect.")
-        logger.error(f"Expected (PyTorch eager): {expected_output}")
-        logger.error(f"Actual (Optimized): {actual_output}")
+        inspect_output_error(actual_output, expected_output)
 
     return ""
 
@@ -523,7 +665,10 @@ def main():
     """
     describe()
 
-    database = ms.database.JSONDatabase(work_dir=ARGS.work_dir)
+    meta_schedule_work_dir = os.path.join(ARGS.work_dir, "meta_schedule")
+    os.makedirs(meta_schedule_work_dir, exist_ok=True)
+
+    database = ms.database.JSONDatabase(work_dir=meta_schedule_work_dir)
     if not ARGS.mode.should_tune:
         if len(database) == 0:
             raise RuntimeError(
@@ -539,45 +684,54 @@ def main():
         ARGS.cpu_flush = False
 
     try:
+        logger.info(f"Loading model with batch size: {ARGS.batch_size}")
         _, name, model, example_inputs, batch_size = runner.load_model(
             get_torch_device_type(ARGS.target),
             ARGS.model,
             batch_size=ARGS.batch_size,
         )
-        logger.info(
-            f"batch size: {batch_size} input shape: {[input.shape for input in example_inputs]}"
-        )
+        model, example_inputs = runner.maybe_cast(model, example_inputs)
+        logger.info(f"Got model with batch size: {batch_size}")
     except NotImplementedError:
-        logging.exception(f"{ARGS.model} failed to load")
-        return
+        logger.exception(f"{ARGS.model} failed to load")
+        raise
+
+    with contextlib.ExitStack() as stack:
+        profiler = stack.enter_context(ms.Profiler())
+        stack.enter_context(torch.no_grad())
+
+        if ARGS.mode.should_tune:
+            task_collect_backend, extracted_tasks = create_tvm_task_collection_backend()
+            task_collect_ctx = torchdynamo.optimize(task_collect_backend)
+            task_collect_ctx(runner.model_iter_fn)(model, example_inputs)
+
+            tasks, task_weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
+                extracted_tasks=extracted_tasks,
+                work_dir=ARGS.work_dir,
+                strategy=ARGS.strategy,
+            )
+            database = ms.tune.tune_tasks(
+                tasks=tasks,
+                task_weights=task_weights,
+                work_dir=ARGS.work_dir,
+                max_trials_global=ARGS.num_trials,
+                max_trials_per_task=ARGS.max_trials_per_task,
+                runner=get_meta_schedule_runner(),  # type: ignore
+                database=database,
+                cost_model=ms.cost_model.XGBModel(  # type: ignore
+                    extractor=ms.feature_extractor.PerStoreFeature(),
+                    adaptive_training=ARGS.adaptive_training,
+                ),
+            )
 
-    if ARGS.mode.should_tune:
-        extracted_tasks: List[ms.ExtractedTask] = []
-        task_collect_ctx = torchdynamo.optimize(create_tvm_task_collection_backend(extracted_tasks))
-        task_collect_ctx(runner.model_iter_fn)(model, example_inputs)
-        tasks, task_weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
-            extracted_tasks=extracted_tasks,
-            work_dir=ARGS.work_dir,
-        )
-        database = ms.tune.tune_tasks(
-            tasks=tasks,
-            task_weights=task_weights,
-            work_dir=ARGS.work_dir,
-            max_trials_global=ARGS.num_trials,
-            max_trials_per_task=ARGS.num_trials_per_task,
-            runner=get_meta_schedule_runner(),  # type: ignore
-            database=database,
-            cost_model=ms.cost_model.XGBModel(  # type: ignore
-                extractor=ms.feature_extractor.PerStoreFeature(),
-                adaptive_training=ARGS.adaptive_training,
-            ),
-        )
+        if ARGS.mode.should_eval:
+            torchdynamo.reset()
+            model_compile_ctx = torchdynamo.optimize(create_tvm_compilation_backend(database))
+            model_compile_ctx(runner.model_iter_fn)(model, example_inputs)
+            with torch.no_grad():
+                performance_experiment(runner.model_iter_fn, model, example_inputs)
 
-    if ARGS.mode.should_eval:
-        torchdynamo.reset()
-        model_compile_ctx = torchdynamo.optimize(create_tvm_compilation_backend(database))
-        experiment = functools.partial(performance_experiment, runner.model_iter_fn)
-        runner.run_one_model(name, model, example_inputs, model_compile_ctx, experiment)
+    print(profiler.table())
 
 
 if __name__ == "__main__":
diff --git a/python/tvm/meta_schedule/testing/torchbench/utils.py b/python/tvm/meta_schedule/testing/torchbench/utils.py
index f5a745ea008a..8bd022a9cb18 100644
--- a/python/tvm/meta_schedule/testing/torchbench/utils.py
+++ b/python/tvm/meta_schedule/testing/torchbench/utils.py
@@ -51,7 +51,9 @@ def find_torchdynamo() -> str:
 
 
 DYNAMO_DIR = find_torchdynamo()
-sys.path.append(DYNAMO_DIR)
+sys.path.insert(
+    0, DYNAMO_DIR
+)  # opacus_cifar10 depends on opacus, which installs a package called 'benchmarks'
 sys.path.append(f"{DYNAMO_DIR}/benchmarks")
 
 # pylint: disable=wrong-import-position, unused-import
@@ -62,7 +64,7 @@ def find_torchdynamo() -> str:
 
 
 def load_torchdynamo_benchmark_runner(
-    is_cuda: bool, cosine_similarity: bool = False
+    is_cuda: bool, cosine_similarity: bool = False, float32: bool = False
 ) -> TorchBenchmarkRunner:
     """
     Load the benchmark runner from TorchDynamo.
@@ -86,7 +88,7 @@ class RunnerArgs:
 
         cosine: bool = False  # Whether to use consine similarity to check if output is correct.
 
-    args = RunnerArgs(cosine=cosine_similarity)
+    args = RunnerArgs(cosine=cosine_similarity, float32=float32)
 
     runner = TorchBenchmarkRunner()
     runner.args = args

From 90ed632280898dafeac40913d006993bc71a8409 Mon Sep 17 00:00:00 2001
From: WANG Zihan <wzh1999_frog@126.com>
Date: Fri, 4 Nov 2022 04:42:07 +0800
Subject: [PATCH 503/704] [Relay] Add tensor rank check for `nn.instance_norm`
 (#13280)

Add tensor rank check for `nn.instance_norm`.
---
 src/relay/op/nn/nn.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 8644957b1c8e..9e2fe63b006a 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -923,6 +923,7 @@ bool InstanceNormRel(const Array<Type>& types, int num_inputs, const Attrs& attr
   ICHECK_EQ(types.size(), 4);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) return false;
+  ICHECK_GT(data->shape.size(), 2);
   const InstanceNormAttrs* param = attrs.as<InstanceNormAttrs>();
   int axis = param->axis >= 0 ? param->axis : param->axis + data->shape.size();
   ICHECK(axis >= 0 && axis < (int)data->shape.size());

From b1a099b2130fa91b0424c6ac5bec414d96dc288f Mon Sep 17 00:00:00 2001
From: leiwen83 <leiwen83@users.noreply.github.com>
Date: Fri, 4 Nov 2022 07:01:20 +0800
Subject: [PATCH 504/704] [Relay] Enhancement for fold_scale_axis and
 simplify_expr (#13275)

add(%1, %1) convert to multiply(%1, 2f); enhance fold_scale_axis to fold multiply(%1, 2f) into conv

Signed-off-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
---
 src/relay/analysis/util.cc                    |  3 +-
 src/relay/ir/dataflow_matcher.cc              | 10 ----
 src/relay/transforms/pattern_utils.h          | 29 +++++++++++
 src/relay/transforms/simplify_expr.cc         | 32 ++++++++++++
 .../python/relay/test_pass_fold_scale_axis.py | 51 +++++++++++++++++++
 tests/python/relay/test_pass_simplify_expr.py | 15 ++++++
 6 files changed, 129 insertions(+), 11 deletions(-)

diff --git a/src/relay/analysis/util.cc b/src/relay/analysis/util.cc
index a4120d20288f..96db7d762cae 100644
--- a/src/relay/analysis/util.cc
+++ b/src/relay/analysis/util.cc
@@ -394,6 +394,7 @@ bool IsAllPositiveConstant(const Expr& expr) {
   static const auto& reshape_op = Op::Get("reshape");
   static const auto& transpose_op = Op::Get("transpose");
   static const auto& squeeze_op = Op::Get("squeeze");
+  static const auto& repeat_op = Op::Get("repeat");
 
   // peel through a few common transform ops.
   if (const auto* constant = expr.as<ConstantNode>()) {
@@ -419,7 +420,7 @@ bool IsAllPositiveConstant(const Expr& expr) {
   } else if (const auto* op = expr.as<CallNode>()) {
     // tail recursion.
     if (op->op == expand_dims_op || op->op == reshape_op || op->op == transpose_op ||
-        op->op == squeeze_op) {
+        op->op == squeeze_op || op->op == repeat_op) {
       return IsAllPositiveConstant(op->args[0]);
     } else {
       return false;
diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index 7518380de3b1..7334308e4a16 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -427,16 +427,6 @@ bool DFPatternMatcher::VisitDFPattern_(const LetPatternNode* op, const Expr& exp
   return false;
 }
 
-Expr InferType(const Expr& expr) {
-  auto mod = IRModule::FromExpr(expr);
-  mod = transform::InferType()(mod);
-  if (expr.as<FunctionNode>()) {
-    return mod->Lookup("main");
-  } else {
-    return mod->Lookup("main").as<FunctionNode>()->body;
-  }
-}
-
 Expr InferTypeWithModule(const Expr& expr, const IRModule& m) {
   IRModule mod(m->functions, m->type_definitions, m->Imports());
   GlobalVarSupply global_var_supply = GlobalVarSupply(mod);
diff --git a/src/relay/transforms/pattern_utils.h b/src/relay/transforms/pattern_utils.h
index d03939e09ea8..aa4ef03c95a4 100644
--- a/src/relay/transforms/pattern_utils.h
+++ b/src/relay/transforms/pattern_utils.h
@@ -102,6 +102,24 @@ namespace relay {
     LOG(FATAL) << "unknown data type " << type;                                       \
   }
 
+/*!
+ * \brief Try to do the type inference over expr:
+ *
+ * Do the infer_type over each node in expr
+ *
+ * \param expr The IR expression
+ * \return infered expr if succeed.
+ */
+inline Expr InferType(const Expr& expr) {
+  auto mod = IRModule::FromExpr(expr);
+  mod = transform::InferType()(mod);
+  if (expr.as<FunctionNode>()) {
+    return mod->Lookup("main");
+  } else {
+    return mod->Lookup("main").as<FunctionNode>()->body;
+  }
+}
+
 /*!
  * \brief Try to match lhs and rhs via broadcasting rule, such that:
  *
@@ -121,6 +139,17 @@ inline bool MatchBroadcastToLeftAxes(const TensorTypeNode* tlhs, const TensorTyp
   size_t base = tlhs->shape.size() - trhs->shape.size();
   size_t j = 0;
 
+  // handle case trhs is simple constant
+  if (trhs->shape.size() == 0 && rhs_value != nullptr && lhs_axes.size() > 0) {
+    *rhs_value = MakeExpandDims(*rhs_value, 0, lhs_axes.size());
+    for (size_t i = 0; i < lhs_axes.size(); i++) {
+      int repeat_value =
+          tlhs->shape[static_cast<size_t>(lhs_axes[j]->value)].as<IntImmNode>()->value;
+      *rhs_value = MakeRepeat(*rhs_value, repeat_value, i);
+    }
+    return true;
+  }
+
   ObjectPtr<SqueezeAttrs> squeeze_attrs;
   if (rhs_value != nullptr) {
     squeeze_attrs = make_object<SqueezeAttrs>();
diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index 6cae728b304f..923a18f7bc93 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -847,6 +847,37 @@ class SimplifyAdjacentMultiplyOrAdd : public DFPatternRewrite {
   DFPattern c2_;
 };
 
+/*! \brief Simplifying x+x to x*2 */
+class SimplifyAdd : public DFPatternRewrite {
+ public:
+  SimplifyAdd() {
+    x_ = IsWildcard();
+    y_ = IsWildcard();
+    pattern_ = IsOp("add")({x_, y_});
+  }
+
+  Expr Callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
+    Type pre_type = pre->checked_type_;
+    auto dtype = pre_type.as<TensorTypeNode>()->dtype;
+    auto x = node_map[x_][0];
+    auto y = node_map[y_][0];
+    auto data_type = Downcast<TensorType>(x->checked_type());
+
+    if (x == y) {
+      Expr value;
+      value = MakeConstantScalar(dtype, 2);
+      return InferType(Call(Op::Get("multiply"), {x, value}));
+    }
+    return post;
+  }
+
+ private:
+  /*! \brief Pattern input */
+  DFPattern x_;
+  DFPattern y_;
+};
+
 /*! \brief Simplifying x/sqrt to x*sqrt */
 class SimplifyRSqrt : public DFPatternRewrite {
  public:
@@ -925,6 +956,7 @@ Expr SimplifyExpr(const Expr& expr, const IRModule& mod) {
   composer.AddRewrite<ConcretizeCollapseSumLikeRewrite>();
   composer.AddRewrite<ConcretizeBroadcastToLikeRewrite>();
   composer.AddRewrite<ConcretizeCastLikeRewrite>();
+  composer.AddRewrite<SimplifyAdd>();
   composer.AddRewrite<SimplifyRSqrt>();
   composer.AddRewrite<EliminateIdentityRewrite>();
   composer.AddRewrite<SimplifyReshape>();
diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py
index 12fc722d8604..8ffa3ef832e0 100644
--- a/tests/python/relay/test_pass_fold_scale_axis.py
+++ b/tests/python/relay/test_pass_fold_scale_axis.py
@@ -20,6 +20,12 @@
 from tvm import te
 from tvm import relay
 from tvm.relay import transform
+from tvm.relay.testing import create_workload
+from tvm.relay.build_module import bind_params_by_name
+
+
+def initializer(_, param):
+    param = np.zeros(param.shape)
 
 
 def _get_positive_scale(size):
@@ -636,6 +642,50 @@ def check(shape, in_channels, channels, blocking):
     check((2, 2, 10, 10, 2), 4, 8, (2, 2))
 
 
+def test_fold_bwd_simple_constant():
+    def before(data, weight, out_bias, channels):
+        y = relay.nn.conv2d(
+            data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1)
+        )
+
+        y = relay.add(y, out_bias)
+        c2 = relay.const(2.0)
+        y = relay.nn.relu(y)
+        y = relay.multiply(y, c2)
+        mod, params = create_workload(y, initializer)
+        mod["main"] = bind_params_by_name(mod["main"], params)
+        return mod
+
+    def expected(data, weight, out_bias, channels):
+        y0 = relay.nn.conv2d(
+            data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1)
+        )
+        y0 = relay.add(y0, out_bias)
+        y0 = relay.nn.relu(y0)
+        mod, params = create_workload(y0, initializer)
+        mod["main"] = bind_params_by_name(mod["main"], params)
+        return mod
+
+    def check(shape, channels):
+        x = relay.var("data", relay.TensorType(shape, "float32"))
+        weight = relay.var("weight")
+        out_bias = relay.var("in_bias", shape=(channels, 1, 1))
+
+        y0 = before(x, weight, out_bias, channels)
+        remove_last_multiply = tvm.transform.Sequential(
+            [
+                relay.transform.InferType(),
+                relay.transform.FoldScaleAxis(),
+            ]
+        )
+        with tvm.transform.PassContext(opt_level=3):
+            y0 = remove_last_multiply(y0)
+        _expect = expected(x, weight, out_bias, channels)
+        tvm.ir.assert_structural_equal(y0, _expect)
+
+    check((1, 3, 200, 200), 16)
+
+
 def test_fold_bwd_dual_consumer():
     def before(x, conv_weight, out_bias, out_scale, in_channels, channels, blocking):
         args = [x, conv_weight, out_bias]
@@ -1211,6 +1261,7 @@ def check(shape, in_channels, channels, blocking):
     test_fold_fwd_relu_fail()
     test_fold_fwd_negative_scale()
     test_fold_fwd_dense()
+    test_fold_bwd_simple_constant()
     test_fold_bwd_simple()
     test_fold_bwd_dual_path()
     test_fold_bwd_dual_consumer()
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index 6df07966eb0a..fa9773b8e3d9 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -729,5 +729,20 @@ def expected():
     assert tvm.ir.structural_equal(opt, ref)
 
 
+def test_simplify_add():
+    x = relay.var("x", shape=(1, 3, 100, 100), dtype="float32")
+
+    def before():
+        return relay.add(x, x)
+
+    def expected():
+        s = relay.const(2.0)
+        return relay.multiply(x, s)
+
+    opt = run_opt_pass(before(), transform.SimplifyExpr())
+    ref = run_infer_type(expected())
+    assert tvm.ir.structural_equal(opt, ref)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From de8a79d9ba500b15b8cda7a89ca60bb65b17c944 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 4 Nov 2022 06:58:59 -0400
Subject: [PATCH 505/704] [skip-ci][COMMUNITY] New committer Ashutosh Parkhi
 (#13286)

[COMMUNITY] New committer Ashutosh Parkhi
---
 CONTRIBUTORS.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 448c13b60c1a..fb1353e49d5d 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -62,6 +62,7 @@ We do encourage everyone to work anything they are interested in.
 - [Trevor Morris](https://github.com/trevor-m): @trevor-m - byoc, compiler
 - [Leandro Nunes](https://github.com/leandron) (PMC): @leandron - tvmc
 - [Lily Orth-Smith](https://github.com/electriclilies): @electriclilies - relay
+- [Ashutosh Parkhi](https://github.com/ashutosh-arm): @ashutosh-arm - cmsis-nn
 - [Krzysztof Parzyszek](https://github.com/kparzysz-quic) (PMC): @kparzysz-quic - hexagon, llvm
 - [Andrew Reusch](https://github.com/areusch): (PMC) @areusch - runtime, microTVM
 - [David Riazati](https://github.com/driazati): @driazati - ci, community
@@ -151,8 +152,8 @@ We do encourage everyone to work anything they are interested in.
 - [Lily Orth-Smith](https://github.com/electriclilies): @electriclilies
 - [Wei Pan](https://github.com/wpan11nv): @wpan11nv
 - [Michalis Papadimitriou](https://github.com/mikepapadim): @mikepapadim
-- [Ashutosh Parkhi](https://github.com/ashutosh-arm): @ashutosh-arm
 - [Krzysztof Parzyszek](https://github.com/kparzysz-quic): @kparzysz-quic
+- [Ashutosh Parkhi](https://github.com/ashutosh-arm): @ashutosh-arm
 - [Alexander Peskov](https://github.com/apeskov): @apeskov
 - [Pariksheet Pinjari](https://github.com/PariksheetPinjari909): @PariksheetPinjari909
 - [Josh Pollock](https://github.com/joshpoll): @joshpoll

From ccb7d07159fdcc52f8a04e28afe3d7caac3cf829 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 4 Nov 2022 10:28:20 -0500
Subject: [PATCH 506/704] [TIR][Arith] Use TryCompare to narrow inequalities if
 possible (#13024)

Prior to this commit, the result of TryCompare would only be used if
it could definitively prove a conditional to be either true or false.
For example, if it is known that `0 <= i`, a conditional of `i <= 0`
would be left as-is.

This commit introduces rewrite rules to preferentially simplify
into more restrictive conditions.  Using the same example, if it is
known that `0 <= i`, a conditional of `i <= 0` would be simplified
into `i == 0`.  Similarly, if it is known that `0 <= i`, a
conditional of `i != 0` would be simplified into `0 < i`.

Because this change does not introduce significant overhead, as the
results of `RewriteSimplifier::Impl::TryCompare` are already
available, this change is enabled for all use cases and does not
require a call to `RewriteSimplifier::SetEnabledExtensions`.
---
 src/arith/rewrite_simplify.cc                 | 144 +++++++++++++++---
 src/arith/rewrite_simplify.h                  |  21 +++
 .../unittest/test_arith_rewrite_simplify.py   |   2 +-
 tests/python/unittest/test_index_map.py       |   8 +-
 ..._tir_transform_inject_software_pipeline.py |   4 +-
 .../unittest/test_tir_transform_simplify.py   |  46 ++++++
 6 files changed, 193 insertions(+), 32 deletions(-)

diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index a42303e459d8..d0fb943334de 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -29,6 +29,7 @@
 #include <tvm/tir/op.h>
 
 #include <algorithm>
+#include <utility>
 
 #include "../target/datatype/registry.h"
 #include "conjunctive_normal_form.h"
@@ -1384,11 +1385,16 @@ Optional<PrimExpr> RewriteSimplifier::Impl::TryMatchLiteralConstraint(const Prim
 }
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const EQNode* op) {
-  PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
-  op = ret.as<EQNode>();
+  EQ ret = Downcast<EQ>(IRMutatorWithAnalyzer::VisitExpr_(op));
+  op = ret.get();
+
   if (auto const_res = TryConstFold<EQ>(op->a, op->b)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
 
+  return ApplyRewriteRules(ret);
+}
+
+PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(EQ ret) {
   // Pattern var to match any expression
   PVar<PrimExpr> x, y;
   // Pattern var match IntImm
@@ -1396,32 +1402,106 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const EQNode* op) {
   PVar<int> lanes;
 
   // vector rule
-  if (op->dtype.lanes() != 1) {
+  if (ret->dtype.lanes() != 1) {
     TVM_TRY_REWRITE(broadcast(x, lanes) == broadcast(y, lanes), broadcast(x == y, lanes));
   }
 
-  if (IsIndexType(op->a.dtype())) {
-    CompareResult result = TryCompare(op->a, op->b);
+  if (IsIndexType(ret->a.dtype())) {
+    CompareResult result = TryCompare(ret->a, ret->b);
     if (result == CompareResult::kEQ) {
-      return make_const(op->dtype, true);
+      return make_const(ret->dtype, true);
     } else if (result == CompareResult::kNE || result == CompareResult::kGT ||
                result == CompareResult::kLT) {
-      return make_const(op->dtype, false);
+      return make_const(ret->dtype, false);
     }
+    TVM_TRY_REWRITE(c1 == x, x == c1);
+
     TVM_TRY_REWRITE(x - c1 == 0, x == c1);
     TVM_TRY_REWRITE(c1 - x == 0, x == c1);
     TVM_TRY_REWRITE(x + c1 == 0, x == 0 - c1);
     TVM_TRY_REWRITE(x * y == 0, x == 0 || y == 0);
   }
-  return ret;
+  return std::move(ret);
 }
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NENode* op) {
-  return this->VisitExpr(Not(op->a == op->b));
+  PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
+  op = ret.as<NENode>();
+
+  if (auto const_res = TryConstFold<NE>(op->a, op->b)) return const_res.value();
+  if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
+
+  if (IsIndexType(op->a.dtype())) {
+    CompareResult result = TryCompare(op->a, op->b);
+    if (result == CompareResult::kNE || result == CompareResult::kGT ||
+        result == CompareResult::kLT) {
+      return make_const(op->dtype, true);
+    } else if (result == CompareResult::kEQ) {
+      return make_const(op->dtype, false);
+    } else if (result == CompareResult::kGE) {
+      // Known: a >= b
+      //
+      // a != b
+      // (a < b) or (b < a)
+      // False or (b < a)
+      // b < a
+      return ApplyRewriteRules(LT(op->b, op->a));
+    } else if (result == CompareResult::kLE) {
+      // Known: a <= b
+      //
+      // a != b
+      // (a < b) or (b < a)
+      // (a < b) or False
+      // a < b
+      return ApplyRewriteRules(LT(op->a, op->b));
+    }
+  }
+
+  return ApplyRewriteRules(Not(ApplyRewriteRules(EQ(op->a, op->b))));
 }
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LENode* op) {
-  return this->VisitExpr(Not(op->b < op->a));
+  PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
+  op = ret.as<LENode>();
+  ICHECK(op);
+
+  if (auto const_res = TryConstFold<LE>(op->a, op->b)) return const_res.value();
+  if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
+
+  // Check for applicable rewrites before attempting to prove/disprove
+  // the inequality.  This preserves earlier behavior, where (A<=B*x)
+  // simplifies to (ceildiv(A,B)<=x) when (A%B!=0).  Performing the
+  // TryCompare first would simplify to the equivalent
+  // (floordiv(A,B)<x) in these cases instead.
+  ret = ApplyRewriteRules(Not(ApplyRewriteRules(LT(op->b, op->a))));
+
+  if (auto op = ret.as<LENode>(); op && IsIndexType(op->a.dtype())) {
+    CompareResult result = TryCompare(op->a, op->b);
+    if (result == CompareResult::kLE || result == CompareResult::kLT ||
+        result == CompareResult::kEQ) {
+      return make_const(op->dtype, true);
+    } else if (result == CompareResult::kGT) {
+      return make_const(op->dtype, false);
+    } else if (result == CompareResult::kNE) {
+      // Known: a != b
+      //
+      // a <= b
+      // (a < b) or (a == b)
+      // (a < b) or False
+      // a < b
+      return ApplyRewriteRules(LT(op->a, op->b));
+    } else if (result == CompareResult::kGE) {
+      // Known: a >= b
+      //
+      // a <= b
+      // (a < b) or (a == b)
+      // False or (a == b)
+      // a == b
+      return ApplyRewriteRules(EQ(op->a, op->b));
+    }
+  }
+
+  return ret;
 }
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const GTNode* op) {
@@ -1429,15 +1509,20 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const GTNode* op) {
 }
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const GENode* op) {
-  return this->VisitExpr(Not(op->a < op->b));
+  return this->VisitExpr(op->b <= op->a);
 }
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LTNode* op) {
-  PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
-  op = ret.as<LTNode>();
+  LT node = Downcast<LT>(IRMutatorWithAnalyzer::VisitExpr_(op));
+  op = node.get();
+
   if (auto const_res = TryConstFold<LT>(op->a, op->b)) return const_res.value();
-  if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
+  if (auto match = TryMatchLiteralConstraint(node)) return match.value();
+
+  return ApplyRewriteRules(node);
+}
 
+PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(LT ret) {
   // Pattern var to match any expression
   PVar<PrimExpr> x, y, z, s1, s2;
   // Pattern var match IntImm
@@ -1445,19 +1530,19 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LTNode* op) {
   PVar<int> lanes;
 
   // vector rule
-  if (op->dtype.lanes() != 1) {
+  if (ret->dtype.lanes() != 1) {
     TVM_TRY_REWRITE(broadcast(x, lanes) < broadcast(y, lanes), broadcast(x < y, lanes));
     TVM_TRY_REWRITE(ramp(x, s1, lanes) < ramp(y, s1, lanes), broadcast(x < y, lanes));
   }
 
-  if (IsIndexType(op->a.dtype())) {
-    CompareResult result = TryCompare(op->a, op->b);
+  if (IsIndexType(ret->a.dtype())) {
+    CompareResult result = TryCompare(ret->a, ret->b);
     if (result == CompareResult::kLT) {
-      return make_const(op->dtype, true);
+      return make_const(ret->dtype, true);
     }
     if (result == CompareResult::kEQ || result == CompareResult::kGT ||
         result == CompareResult::kGE) {
-      return make_const(op->dtype, false);
+      return make_const(ret->dtype, false);
     }
 
     // clang-format off
@@ -1561,19 +1646,22 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LTNode* op) {
     TVM_TRY_REWRITE(x - c1 < 0, x < c1);
     // clang-format on
   }
-  return ret;
+  return std::move(ret);
 }
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NotNode* op) {
-  PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
-  op = ret.as<NotNode>();
-  if (auto const_res = TryConstFold<Not>(op->a)) return const_res.value();
+  Not ret = Downcast<Not>(IRMutatorWithAnalyzer::VisitExpr_(op));
+  if (auto const_res = TryConstFold<Not>(ret->a)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
 
+  return ApplyRewriteRules(ret);
+}
+
+PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(Not ret) {
   // Pattern var to match any expression
   PVar<PrimExpr> x, y;
   PVar<int> lanes;
-  if (op->dtype.lanes() != 1) {
+  if (ret->dtype.lanes() != 1) {
     TVM_TRY_REWRITE(!broadcast(x, lanes), broadcast(!x, lanes));
   }
 
@@ -1586,7 +1674,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NotNode* op) {
   TVM_TRY_REWRITE(!(x != y), x == y);
   TVM_TRY_RECURSIVE_REWRITE(!(x || y), (!x) && (!y));
   TVM_TRY_RECURSIVE_REWRITE(!(x && y), (!x) || (!y));
-  return ret;
+  return std::move(ret);
 }
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
@@ -1762,6 +1850,12 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) {
 
   TVM_TRY_REWRITE(x != c1 || x == c2, x != c1 || c1 == c2);
   TVM_TRY_REWRITE(x == c2 || x != c1, x != c1 || c1 == c2);
+
+  TVM_TRY_RECURSIVE_REWRITE(x < y || x == y, x <= y);
+  TVM_TRY_RECURSIVE_REWRITE(x < y || y == x, x <= y);
+  TVM_TRY_RECURSIVE_REWRITE(x == y || x < y, x <= y);
+  TVM_TRY_RECURSIVE_REWRITE(y == x || x < y, x <= y);
+
   return ret;
 }
 
diff --git a/src/arith/rewrite_simplify.h b/src/arith/rewrite_simplify.h
index 02c54902153a..b8e7fcdd9433 100644
--- a/src/arith/rewrite_simplify.h
+++ b/src/arith/rewrite_simplify.h
@@ -137,6 +137,27 @@ class RewriteSimplifier::Impl : public IRMutatorWithAnalyzer {
    */
   Optional<PrimExpr> TryMatchLiteralConstraint(const PrimExpr& expr) const;
 
+  /*! \brief Rewrite rules for Less Than comparisons
+   *
+   * These are separate from the VisitExpr_(const LTNode*) method, as
+   * they may required from rewrites of LT or LE.
+   */
+  PrimExpr ApplyRewriteRules(LT node);
+
+  /*! \brief Rewrite rules for Equal comparisons
+   *
+   * These are separate from the VisitExpr_(const EQNode*) method, as
+   * they may required from rewrites of LE or NE.
+   */
+  PrimExpr ApplyRewriteRules(EQ node);
+
+  /*! \brief Rewrite rules for Equal comparisons
+   *
+   * These are separate from the VisitExpr_(const EQNode*) method, as
+   * they may required from rewrites of LT, LE, or NE.
+   */
+  PrimExpr ApplyRewriteRules(Not node);
+
  private:
   CompareResult TryCompareUsingKnownInequalities(const PrimExpr& x, const PrimExpr& y);
   CompareResult TryCompareUsingConstIntBounds(const PrimExpr& x, const PrimExpr y);
diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py
index 77751b160177..4477e1d9c713 100644
--- a/tests/python/unittest/test_arith_rewrite_simplify.py
+++ b/tests/python/unittest/test_arith_rewrite_simplify.py
@@ -863,7 +863,7 @@ def test_cmp_simplify():
     ck.verify(fld(x, 2) <= -1, tvm.tir.LE(x, -1))
 
     ck.verify(fld(x, 4) * 4 < x, tvm.tir.LT(0, flm(x, 4)))
-    ck.verify(fld(x, 4) * 4 >= x, tvm.tir.LE(flm(x, 4), 0))
+    ck.verify(fld(x, 4) * 4 >= x, tvm.tir.EQ(flm(x, 4), 0))
 
     ck.verify(fld(x, 4) * 4 < x + y, tvm.tir.LT(0, flm(x, 4) + y))
     ck.verify(fld(x, 4) * 4 < x - y, tvm.tir.LT(y, flm(x, 4)))
diff --git a/tests/python/unittest/test_index_map.py b/tests/python/unittest/test_index_map.py
index ac128690c415..5eb31cd378c4 100644
--- a/tests/python/unittest/test_index_map.py
+++ b/tests/python/unittest/test_index_map.py
@@ -91,7 +91,7 @@ def test_nonbijective_inverse_gives_error():
             inverse=lambda i, j: [4 * i + j],
             pre_shape=[15],
             post_shape=[4, 4],
-            padding=lambda i, j: tvm.tir.And(i == 3, j >= 3),
+            padding=lambda i, j: tvm.tir.And(i == 3, tvm.runtime.convert(3) == j),
         ),
         "left_padding": dict(
             forward=lambda i: [(i + 1) // 4, (i + 1) % 4],
@@ -107,7 +107,7 @@ def test_nonbijective_inverse_gives_error():
             post_shape=[4, 4],
             padding=lambda i, j: tvm.tir.Or(
                 tvm.tir.And(i == 0, j < 1),
-                tvm.tir.And(i == 3, j >= 3),
+                tvm.tir.And(i == 3, tvm.runtime.convert(3) == j),
             ),
         ),
         "dynamic_size": dict(
@@ -136,7 +136,7 @@ def test_nonbijective_inverse_gives_error():
             padding=lambda i_outer, j_outer, i_inner, j_inner: tvm.tir.Or(
                 tvm.tir.Or(
                     tvm.tir.And(i_outer == 0, i_inner < 1),
-                    tvm.tir.And(i_outer == 3, i_inner >= 3),
+                    tvm.tir.And(i_outer == 3, tvm.runtime.convert(3) == i_inner),
                 ),
                 tvm.tir.Or(
                     tvm.tir.And(j_outer == 0, j_inner < 5),
@@ -177,7 +177,7 @@ def test_nonbijective_inverse_gives_error():
             inverse=lambda i, j: [i * 4 + j],
             pre_shape=[3],
             post_shape=[1, 4],
-            padding=lambda i, j: 3 <= j,
+            padding=lambda i, j: tvm.runtime.convert(3) == j,
         ),
     }
 )
diff --git a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
index 9334a4d9e827..2a4cabc541c6 100644
--- a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
+++ b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
@@ -263,7 +263,7 @@ def transformed_three_stage_compute(
                         T.writes(B[0:2, tx, 0])
                         B[i, tx, 0] = A[tx, i] * T.float32(2)
                     with T.block():
-                        T.where(1 <= i)
+                        T.where(i == 1)
                         T.reads(B[0:2, tx, 0])
                         T.writes(C[0:2, tx, 0])
                         C[(i + 1) % 2, tx, 0] = B[(i + 1) % 2, tx, 0] + T.float32(2)
@@ -1349,7 +1349,7 @@ def ref(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]) -> N
                                 with T.attr(0, "async_scope", 1):
                                     B[i % 2, tx, 0] = A[tx, i] * T.float32(2)
                         with T.block():
-                            T.where(1 <= i and i - 1 < 16)
+                            T.where(i == 1 and i - 1 < 16)
                             T.reads(B[(i + 1) % 2, tx, 0])
                             T.writes(C[(i + 1) % 2, tx, 0])
                             with T.attr(0, "async_commit_queue_scope", 1):
diff --git a/tests/python/unittest/test_tir_transform_simplify.py b/tests/python/unittest/test_tir_transform_simplify.py
index 4c5499edcfb0..8d9c76c6b20d 100644
--- a/tests/python/unittest/test_tir_transform_simplify.py
+++ b/tests/python/unittest/test_tir_transform_simplify.py
@@ -1003,5 +1003,51 @@ def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32):
             A[0] = True
 
 
+class TestMostRestrictiveConditional(BaseBeforeAfter):
+    """Preferentially prove part of a compound conditional.
+
+    Even if we cannot prove a conditional as true or false on its own,
+    proving that a conditional must satisfy a stronger condition may
+    allow for later rewrites.  For example, if it is known that `a <= b`,
+    then `a >= b` cannot be proven, but can be reduced to `a == b`.
+    """
+
+    i, j, k = [tvm.tir.Var(name, "int32") for name in "ijk"]
+    tir_int = tvm.tir.IntImm("int32", 0)
+
+    test_case = tvm.testing.parameter(
+        (i <= tir_int, tir_int <= i, i == tir_int),
+        (i <= tir_int, i != tir_int, i < tir_int),
+        (i != tir_int, i <= tir_int, i < tir_int),
+        (i != tir_int, tir_int <= i, tir_int < i),
+        (i <= j, j <= i, j == i),
+        (i <= j, i != j, i < j),
+        (i != j, i <= j, i < j),
+        (i != j, j <= i, j < i),
+    )
+
+    @tvm.testing.fixture
+    def before(self, test_case):
+        priors, expr_before, _ = test_case
+
+        @T.prim_func
+        def func(A: T.Buffer[1, "bool"]):
+            if priors:
+                A[0] = expr_before
+
+        return func
+
+    @tvm.testing.fixture
+    def expected(self, test_case):
+        priors, _, expr_after = test_case
+
+        @T.prim_func
+        def func(A: T.Buffer[1, "bool"]):
+            if priors:
+                A[0] = expr_after
+
+        return func
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From e86088492314be322ff9151f411085455fca1d22 Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Fri, 4 Nov 2022 12:50:30 -0400
Subject: [PATCH 507/704] [build][hexagon] remove unused variable (#13291)

Remove unused member variable in the `SimulatorRPCChannel` class.
Fixes a clang warning.
---
 src/runtime/hexagon/rpc/simulator/session.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/runtime/hexagon/rpc/simulator/session.cc b/src/runtime/hexagon/rpc/simulator/session.cc
index 54a9ff8c5884..a7f7896b11b3 100644
--- a/src/runtime/hexagon/rpc/simulator/session.cc
+++ b/src/runtime/hexagon/rpc/simulator/session.cc
@@ -680,8 +680,6 @@ Message SimulatorRPCChannel::SendMsg(Message msg) {
         << "Expecting HEX_CORE_BREAKPOINT, received: " << core_.str();
   };
 
-  Message_ msg_ = {msg};
-
   WriteToProcess(message_buffer_v_, &msg, sizeof msg);
   run();
 

From 6da298b3a07ce33eca390c6e5afd029d8c388f3d Mon Sep 17 00:00:00 2001
From: LiangW <114222082+liangW-intellif@users.noreply.github.com>
Date: Sat, 5 Nov 2022 03:31:15 +0800
Subject: [PATCH 508/704] [BugFix][Pattern] Fixed a crash when AltPattern and
 FunctionPattern are used nested (#13278)

The PatternGroup doesn not check if the FunctionPattern is matched
while processing the FunctionPattern, but when FunctionPattern
is nested with AltPattern, the FunctionPattern may not be matched,
resulting in a crash when looking up matched nodes.
This commit adds a check at handling FunctionPattern to fix this crash.
---
 src/relay/ir/dataflow_matcher.cc            | 14 +++++----
 tests/python/relay/test_dataflow_pattern.py | 33 +++++++++++++++++++++
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index 7334308e4a16..cf186c474ee0 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -612,12 +612,14 @@ void PatternGrouper::CreateGroup(const Expr& expr) {
     }
     // Don't treat Function params or body as input variables for partition
     if (node->ref().as<FunctionPatternNode>()) {
-      auto matches = node_map[node->ref()];
-      for (auto match : matches) {
-        auto sub_graph = CreateIndexedGraph(match.as<FunctionNode>()->body);
-        for (PostDfsIndex sub_index = 0; sub_index < sub_graph->size(); ++sub_index) {
-          auto sub_node = sub_graph->index_to_node(sub_index);
-          fuzzy_matches.insert(sub_node->ref());
+      if (node_map.count(node->ref())) {
+        auto matches = node_map[node->ref()];
+        for (auto match : matches) {
+          auto sub_graph = CreateIndexedGraph(match.as<FunctionNode>()->body);
+          for (PostDfsIndex sub_index = 0; sub_index < sub_graph->size(); ++sub_index) {
+            auto sub_node = sub_graph->index_to_node(sub_index);
+            fuzzy_matches.insert(sub_node->ref());
+          }
         }
       }
     }
diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py
index 24a405b0f6fd..a11673bf6930 100644
--- a/tests/python/relay/test_dataflow_pattern.py
+++ b/tests/python/relay/test_dataflow_pattern.py
@@ -1609,6 +1609,39 @@ def test_partition_function():
     assert tvm.ir.structural_equal(pattern.partition(expr), expr2)
 
 
+def test_partition_optional_function():
+    x = relay.var("x")
+    w = relay.var("w")
+    b = relay.var("b")
+
+    x1 = relay.var("x1")
+    w1 = relay.var("w1")
+
+    wc_x = wildcard()
+    wc_w = wildcard()
+    wc_x1 = wildcard()
+    wc_w1 = wildcard()
+
+    func_pattern0 = FunctionPattern(
+        [wc_x1, wc_w1], is_op("sigmoid")(is_op("nn.conv2d")(wc_x1, wc_w1))
+    )
+    func_pattern1 = FunctionPattern(
+        [wc_x1, wc_w1], is_op("nn.relu")(is_op("nn.conv2d")(wc_x1, wc_w1))
+    )
+    pattern = func_pattern0(wc_x, wc_w) | func_pattern1(wc_x, wc_w)
+
+    func = relay.Function([x1, w1], relay.nn.relu(relay.nn.conv2d(x1, w1)))
+    expr = func(x, w) + b
+
+    x2 = relay.var("x2")
+    w2 = relay.var("w2")
+    func2 = relay.Function([x2, w2], func(x2, w2)).with_attr(
+        "PartitionedFromPattern", "nn.conv2d_nn.relu_FunctionCall_"
+    )
+    expr2 = func2(x, w) + b
+    assert tvm.ir.structural_equal(pattern.partition(expr), expr2)
+
+
 def test_rewrite_function_with_fuzzy_body():
     """Allow Rewriting a function with a fuzzy body via dominator analysis"""
     x = relay.var("x")

From dec74cb93d3460d686f1935f933dc24404c5e995 Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Fri, 4 Nov 2022 17:22:43 -0400
Subject: [PATCH 509/704] [build][tir] suppress -Woverloaded-virtual warning
 (#13267)

- Address a (valid) warning from  clang-15.0.3 regarding the
  `tvm::tir::DataTypeRewriter` class.

- Make some class methods `protected` rather than `public`
  to better reflect authors' intent.
---
 include/tvm/tir/stmt_functor.h        | 3 +--
 src/tir/transforms/narrow_datatype.cc | 9 +++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index fdb0a0aa9d1a..8057108803db 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -502,7 +502,7 @@ bool ContainsNode(const Stmt& stmt) {
  * base class of such passes to ensure the consistency of data types.
  */
 class DataTypeLegalizer : public StmtExprMutator {
- public:
+ protected:
   Stmt VisitStmt_(const ForNode* op) override;
 
   Stmt VisitStmt_(const AttrStmtNode* op) override;
@@ -530,7 +530,6 @@ class DataTypeLegalizer : public StmtExprMutator {
   using StmtExprMutator::VisitExpr_;
   using StmtExprMutator::VisitStmt_;
 
- protected:
   // a map from IterVar before rewrite to that after rewrite,
   // ensures one old IterVar maps to exactly one new IterVar
   std::unordered_map<const IterVarNode*, IterVar> ivmap_;
diff --git a/src/tir/transforms/narrow_datatype.cc b/src/tir/transforms/narrow_datatype.cc
index 7f9c76f5257d..2d287deec44c 100644
--- a/src/tir/transforms/narrow_datatype.cc
+++ b/src/tir/transforms/narrow_datatype.cc
@@ -206,6 +206,15 @@ class DataTypeRewriter : public DataTypeLegalizer {
     return VisitStmt(s);
   }
 
+ protected:
+  // This class adds some overrides of `VisitStmt_` and `VisitExpr_` that
+  // are *not* present in the parent class.
+  // These `using` statements ensure that all of the *other* overrides
+  // provided by the parent class are fully visible to users of this class.
+  // (Discussed further in https://github.com/apache/tvm/pull/13267)
+  using Parent::VisitExpr_;
+  using Parent::VisitStmt_;
+
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
     return Stmt();

From be44e9c811c071f01f66002729b8a9cb356a3adf Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Fri, 4 Nov 2022 14:46:59 -0700
Subject: [PATCH 510/704] [Tensorize] Add logs to comparator to make debugging
 tensorize failures easier (#13285)

* [TIR][Tensorize] Add error logs to IR comparator to display what caused tensorization to fail

* lint issues
---
 src/tir/schedule/ir_comparator.cc | 247 +++++++++++++++++++++++++++---
 1 file changed, 227 insertions(+), 20 deletions(-)

diff --git a/src/tir/schedule/ir_comparator.cc b/src/tir/schedule/ir_comparator.cc
index ea0ac0bc733d..9d89c641630b 100644
--- a/src/tir/schedule/ir_comparator.cc
+++ b/src/tir/schedule/ir_comparator.cc
@@ -85,16 +85,63 @@ bool TensorizeComparator::VisitExpr(const PrimExpr& n, const PrimExpr& other) {
 
 bool TensorizeComparator::VisitStmt_(const ForNode* op, const Stmt& other) {
   const auto* rhs = other.as<ForNode>();
-  if (!DefEqual(op->loop_var, rhs->loop_var)) return false;
-  if (!VisitExpr(op->min, rhs->min)) return false;
-  if (!VisitExpr(op->extent, rhs->extent)) return false;
-  if (op->thread_binding.defined() != rhs->thread_binding.defined()) return false;
+  if (!DefEqual(op->loop_var, rhs->loop_var)) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "ForNode loop vars do not match: op->loop_var=" << op->loop_var
+         << " vs rhs->loop_var=" << rhs->loop_var;
+      EmitError(os.str());
+    }
+    return false;
+  }
+  if (!VisitExpr(op->min, rhs->min)) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "ForNode min values do not match: op->min=" << op->min << " vs rhs->min=" << rhs->min;
+      EmitError(os.str());
+    }
+    return false;
+  }
+  if (!VisitExpr(op->extent, rhs->extent)) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "ForNode extent values do not match: op->extent=" << op->extent
+         << " vs rhs->extent=" << rhs->extent;
+      EmitError(os.str());
+    }
+    return false;
+  }
+  if (op->thread_binding.defined() != rhs->thread_binding.defined()) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "ForNode thread_bindings do not match: op->thread_binding.defined()="
+         << op->thread_binding.defined()
+         << " vs rhs->thread_binding.defined()=" << rhs->thread_binding.defined();
+      EmitError(os.str());
+    }
+    return false;
+  }
   if (op->thread_binding.defined() &&
       !VisitExpr(op->thread_binding.value(), rhs->thread_binding.value())) {
     return false;
   }
-  if (op->kind != rhs->kind) return false;
-  if (!CompareAnnotationMap(op->annotations, rhs->annotations)) return false;
+  if (op->kind != rhs->kind) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "ForNode kinds do not match: op->kind=" << op->kind << " vs rhs->kind=" << rhs->kind;
+      EmitError(os.str());
+    }
+    return false;
+  }
+  if (!CompareAnnotationMap(op->annotations, rhs->annotations)) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "ForNode annotation maps do not match: op->annotations=" << op->annotations
+         << " vs rhs->annotations=" << rhs->annotations;
+      EmitError(os.str());
+    }
+    return false;
+  }
   return VisitStmt(op->body, rhs->body);
 }
 
@@ -112,6 +159,12 @@ bool TensorizeComparator::VisitStmt_(const BlockRealizeNode* op, const Stmt& oth
   const auto* rhs = other.as<BlockRealizeNode>();
   if (!is_scope_block) {
     if (!CompareArray(op->iter_values, rhs->iter_values, &TensorizeComparator::VisitExpr)) {
+      if (assert_mode_) {
+        std::ostringstream os;
+        os << "BlockRealizeNode iter_values do not match: op->iter_values=" << op->iter_values
+           << " vs rhs->iter_values=" << rhs->iter_values;
+        EmitError(os.str());
+      }
       return false;
     }
   }
@@ -125,16 +178,40 @@ bool TensorizeComparator::VisitStmt_(const BlockNode* op, const Stmt& other) {
   // When checking iter vars, DefEqual is used to remap variables.
   if (!is_scope_block) {
     if (!CompareArray(op->iter_vars, rhs->iter_vars, &TensorizeComparator::CompareIterVar)) {
+      if (assert_mode_) {
+        std::ostringstream os;
+        os << "BlockNode iter_vars do not match: op->alloc_buffers=" << op->iter_vars
+           << " vs rhs->alloc_buffers=" << rhs->iter_vars;
+        EmitError(os.str());
+      }
       return false;
     }
     if (!CompareArray(op->alloc_buffers, rhs->alloc_buffers, &TensorizeComparator::CompareBuffer)) {
+      if (assert_mode_) {
+        std::ostringstream os;
+        os << "BlockNode alloc_buffers do not match: op->alloc_buffers=" << op->alloc_buffers
+           << " vs rhs->alloc_buffers=" << rhs->alloc_buffers;
+        EmitError(os.str());
+      }
       return false;
     }
   }
   if (!CompareArray(op->writes, rhs->writes, &TensorizeComparator::CompareBufferRegion)) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "BlockNode write buffers do not match: op->writes=" << op->writes
+         << " vs rhs->writes=" << rhs->writes;
+      EmitError(os.str());
+    }
     return false;
   }
   if (!CompareArray(op->reads, rhs->reads, &TensorizeComparator::CompareBufferRegion)) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "BlockNode read buffers regions do not match: op->reads=" << op->reads
+         << " vs rhs->reads=" << rhs->reads;
+      EmitError(os.str());
+    }
     return false;
   }
   is_scope_block = false;
@@ -168,12 +245,30 @@ TVM_DECLARE_TENSORIZE_COMPARATOR_BINOP(FloorModNode);
 
 bool TensorizeComparator::VisitExpr_(const IntImmNode* op, const PrimExpr& other) {
   const auto* rhs = other.as<IntImmNode>();
-  return op->value == rhs->value;
+  if (op->value != rhs->value) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "IntImmNode values do not match: op->value=" << op->value
+         << " vs rhs->value=" << rhs->value;
+      EmitError(os.str());
+    }
+    return false;
+  }
+  return true;
 }
 
 bool TensorizeComparator::VisitExpr_(const FloatImmNode* op, const PrimExpr& other) {
   const auto* rhs = other.as<FloatImmNode>();
-  return op->value == rhs->value;
+  if (op->value != rhs->value) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "FloatImmNode values do not match: op->value=" << op->value
+         << " vs rhs->value=" << rhs->value;
+      EmitError(os.str());
+    }
+    return false;
+  }
+  return true;
 }
 
 bool TensorizeComparator::VisitExpr_(const CastNode* op, const PrimExpr& other) {
@@ -185,7 +280,15 @@ bool TensorizeComparator::VisitExpr_(const VarNode* op, const PrimExpr& other) {
   const auto* rhs = other.as<VarNode>();
   auto lhs = GetRef<Var>(op);
   if (lhs.same_as(other)) return true;
-  if (op->dtype.code() != rhs->dtype.code()) return false;
+  if (op->dtype.code() != rhs->dtype.code()) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "VarNode data type codes do not match: op->dtype.code()=" << op->dtype.code()
+         << " vs rhs->dtype.code()=" << rhs->dtype.code();
+      EmitError(os.str());
+    }
+    return false;
+  }
   auto it = equal_map_.find(lhs);
   return it != equal_map_.end() && it->second.same_as(other);
 }
@@ -216,14 +319,30 @@ bool TensorizeComparator::DefEqual(const Var& lhs, const Var& rhs) {
 
 bool TensorizeComparator::CompareAnnotation(const std::pair<String, ObjectRef>& lhs,
                                             const std::pair<String, ObjectRef>& rhs) {
-  if (lhs.first != rhs.first) return false;
+  if (lhs.first != rhs.first) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "CompareAnnotation key mismatch: lhs.first=" << lhs.first
+         << " vs rhs.first=" << rhs.first;
+      EmitError(os.str());
+    }
+    return false;
+  }
   return VisitExpr(Downcast<PrimExpr>(lhs.second), Downcast<PrimExpr>(rhs.second));
 }
 
 bool TensorizeComparator::CompareAnnotationMap(const Map<String, ObjectRef>& lhs,
                                                const Map<String, ObjectRef>& rhs) {
   if (lhs.same_as(rhs)) return true;
-  if (lhs.size() != rhs.size()) return false;
+  if (lhs.size() != rhs.size()) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "CompareAnnotationMap size mismatch: lhs.size()=" << lhs.size()
+         << " vs rhs.size()=" << rhs.size();
+      EmitError(os.str());
+    }
+    return false;
+  }
 
   auto sort_map =
       [](const Map<String, ObjectRef>& map) -> std::vector<std::pair<String, ObjectRef>> {
@@ -236,7 +355,14 @@ bool TensorizeComparator::CompareAnnotationMap(const Map<String, ObjectRef>& lhs
   std::vector<std::pair<String, ObjectRef>> rhs_array = sort_map(rhs);
 
   for (size_t i = 0; i < lhs.size(); ++i) {
-    if (!CompareAnnotation(lhs_array[i], rhs_array[i])) return false;
+    if (!CompareAnnotation(lhs_array[i], rhs_array[i])) {
+      if (assert_mode_) {
+        std::ostringstream os;
+        os << "CompareAnnotationMap annotations mismatch within AnnotationMap.";
+        EmitError(os.str());
+      }
+      return false;
+    }
   }
   return true;
 }
@@ -253,6 +379,14 @@ bool TensorizeComparator::CompareBuffer(const Buffer& lhs, const Buffer& rhs) {
         DefEqual(lhs->data, rhs->data) && lhs->dtype == rhs->dtype && lhs.scope() == rhs.scope();
     if (equal) {
       rhs_buffer_map_[rhs] = lhs;
+    } else {
+      if (assert_mode_) {
+        std::ostringstream os;
+        os << "CompareBuffer buffer mismatch. data: " << lhs->data << " vs " << rhs->data
+           << ", dtypes: " << lhs->dtype << " vs " << rhs->dtype << ", scope(): " << lhs.scope()
+           << " vs " << rhs.scope();
+        EmitError(os.str());
+      }
     }
   }
   return equal;
@@ -262,14 +396,24 @@ bool TensorizeComparator::CompareBufferRegion(const BufferRegion& lhs, const Buf
   if (!CompareBuffer(lhs->buffer, rhs->buffer)) {
     if (assert_mode_) {
       std::ostringstream os;
-      os << "Buffer mismatch: " << lhs->buffer << " vs " << rhs->buffer;
+      os << "CompareBufferRegion returning false due to buffer mismatch: lhs->buffer="
+         << lhs->buffer << " vs rhs->buffer=" << rhs->buffer;
       EmitError(os.str());
     }
     return false;
   }
   int offset = static_cast<int>(lhs->region.size()) - static_cast<int>(rhs->region.size());
   // Number of indices in RHS (desc of the tensor intrinsic) must be smaller than it in LHS
-  if (offset < 0) return false;
+  if (offset < 0) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "CompareBufferRegion returning false because buffer region sizes do not match: "
+            "lhs->region.size()="
+         << lhs->region.size() << " vs rhs->region.size()=" << rhs->region.size();
+      EmitError(os.str());
+    }
+    return false;
+  }
 
   auto it = buffer_indices_.find(lhs->buffer);
   if (it == buffer_indices_.end()) {
@@ -279,7 +423,16 @@ bool TensorizeComparator::CompareBufferRegion(const BufferRegion& lhs, const Buf
     indices_base.reserve(lhs->region.size());
     for (int i = 0; i < offset; i++) {
       // High-dim region must be element-wise
-      if (!is_one(lhs->region[i]->extent)) return false;
+      if (!is_one(lhs->region[i]->extent)) {
+        if (assert_mode_) {
+          std::ostringstream os;
+          os << "CompareBufferRegion returning false because buffer extent high-dim region must be "
+                "element-wise. lhs->region[i]->extent="
+             << lhs->region[i]->extent;
+          EmitError(os.str());
+        }
+        return false;
+      }
       indices_base.emplace_back(lhs->region[i]->min);
     }
     for (size_t i = 0; i < rhs->region.size(); i++) {
@@ -287,6 +440,12 @@ bool TensorizeComparator::CompareBufferRegion(const BufferRegion& lhs, const Buf
       indices_base.emplace_back(lhs->region[i + offset]->min);
       // check extent match
       if (!analyzer_.CanProveEqual(lhs->region[i + offset]->extent, rhs->region[i]->extent)) {
+        if (assert_mode_) {
+          std::ostringstream os;
+          os << "CompareBufferRegion buffer extent mismatch: lhs->region[i + offset]="
+             << lhs->region[i + offset] << " vs rhs->region[i]=" << rhs->region[i];
+          EmitError(os.str());
+        }
         return false;
       }
     }
@@ -296,16 +455,46 @@ bool TensorizeComparator::CompareBufferRegion(const BufferRegion& lhs, const Buf
     const std::vector<PrimExpr>& indices_base = it->second;
     for (int i = 0; i < offset; i++) {
       // High-dim region must be element-wise
-      if (!is_one(lhs->region[i]->extent)) return false;
-      if (!analyzer_.CanProveEqual(indices_base[i], lhs->region[i]->min)) return false;
+      if (!is_one(lhs->region[i]->extent)) {
+        if (assert_mode_) {
+          std::ostringstream os;
+          os << "CompareBufferRegion returning false because buffer extent high-dim region must be "
+                "element-wise. lhs->region[i]->extent="
+             << lhs->region[i]->extent;
+          EmitError(os.str());
+        }
+        return false;
+      }
+      if (!analyzer_.CanProveEqual(indices_base[i], lhs->region[i]->min)) {
+        if (assert_mode_) {
+          std::ostringstream os;
+          os << "Buffer base index consistency check failed due to unequal index base: "
+                "indices_base[i]="
+             << indices_base[i] << " vs lhs->region[i]->min=" << lhs->region[i]->min;
+          EmitError(os.str());
+        }
+        return false;
+      }
     }
     for (size_t i = 0; i < rhs->region.size(); i++) {
       // check extent match
       if (!analyzer_.CanProveEqual(lhs->region[i + offset]->extent, rhs->region[i]->extent)) {
+        if (assert_mode_) {
+          std::ostringstream os;
+          os << "CompareBufferRegion buffer region extent mismatch. lhs->region[i + offset]="
+             << lhs->region[i + offset] << " vs rhs->region[i]=" << rhs->region[i];
+          EmitError(os.str());
+        }
         return false;
       }
       PrimExpr normalized_lhs_min = (lhs->region[i + offset]->min - indices_base[i + offset]);
       if (!analyzer_.CanProveEqual(normalized_lhs_min, rhs->region[i]->min)) {
+        if (assert_mode_) {
+          std::ostringstream os;
+          os << "CompareBufferRegion buffer region min mismatch. lhs->region[i + offset]="
+             << lhs->region[i + offset] << " vs rhs->region[i]=" << rhs->region[i];
+          EmitError(os.str());
+        }
         return false;
       }
     }
@@ -318,7 +507,16 @@ template <typename T>
 bool TensorizeComparator::CompareBufferAccess(const T* lhs, const T* rhs) {
   if (!CompareBuffer(lhs->buffer, rhs->buffer)) return false;
   int offset = static_cast<int>(lhs->indices.size()) - static_cast<int>(rhs->indices.size());
-  if (offset < 0) return false;
+  if (offset < 0) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "CompareBufferAccess returning false because buffer indices sizes do not match: "
+            "lhs->indices.size()="
+         << lhs->indices.size() << " vs rhs->indices.size()=" << rhs->indices.size();
+      EmitError(os.str());
+    }
+    return false;
+  }
   auto it = buffer_indices_.find(lhs->buffer);
   ICHECK(it != buffer_indices_.end());
   const std::vector<PrimExpr>& indices_base = (*it).second;
@@ -328,7 +526,8 @@ bool TensorizeComparator::CompareBufferAccess(const T* lhs, const T* rhs) {
     if (!analyzer_.CanProveEqual(normalized_lhs_index, rhs->indices[i])) {
       if (assert_mode_) {
         std::ostringstream os;
-        os << "Buffer indices mismatch: " << lhs->indices[i + offset] << " vs " << rhs->indices[i];
+        os << "CompareBufferAccess buffer indices mismatch. lhs->indices[i + offset]="
+           << lhs->indices[i + offset] << " vs rhs->indices[i]=" << rhs->indices[i];
         EmitError(os.str());
       }
       return false;
@@ -340,7 +539,15 @@ bool TensorizeComparator::CompareBufferAccess(const T* lhs, const T* rhs) {
 template <typename T, typename Self, typename F>
 bool TensorizeComparator::CompareArray(const Array<T>& lhs, const Array<T>& rhs, F Self::*cmp) {
   if (lhs.same_as(rhs)) return true;
-  if (lhs.size() != rhs.size()) return false;
+  if (lhs.size() != rhs.size()) {
+    if (assert_mode_) {
+      std::ostringstream os;
+      os << "CompareArray array size mismatch. lhs.size()=" << lhs.size()
+         << " vs rhs.size()=" << rhs.size();
+      EmitError(os.str());
+    }
+    return false;
+  }
   for (size_t i = 0; i < lhs.size(); ++i) {
     if (!(static_cast<Self*>(this)->*cmp)(lhs[i], rhs[i])) return false;
   }

From 62fadacd12d5dd8633012bf26890cc5c30327d7e Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Fri, 4 Nov 2022 19:39:36 -0500
Subject: [PATCH 511/704] [Hexagon] Lint tests part 2 (#13271)

* Hexagon test lint part 2

* fix import

* fix global variable

* fix import issue

* fix import

* fix exception error

* address comments
---
 tests/lint/pylint.sh                          |   3 +-
 .../metaschedule_e2e/export_models.py         |  21 +-
 .../metaschedule_e2e/test_resnet50_fp16.py    |  32 +-
 .../metaschedule_e2e/test_resnet50_int8.py    |  89 +++---
 .../contrib/test_hexagon/topi/__init__.py     |   2 +-
 .../test_hexagon/topi/slice_op/__init__.py    |  18 ++
 .../topi/{ => slice_op}/test_argmax_slice.py  |   2 +-
 .../{ => slice_op}/test_avg_pool2d_slice.py   |   5 +-
 .../topi/{ => slice_op}/test_cast_slice.py    |   2 +-
 .../test_clip_slice.py}                       |   4 +-
 .../topi/{ => slice_op}/test_conv2d_slice.py  |   2 +-
 .../test_depthwise_conv2d_slice.py            |   4 +-
 .../{ => slice_op}/test_dequantize_slice.py   |   2 +-
 .../{ => slice_op}/test_max_pool2d_slice.py   |   6 +-
 .../topi/{ => slice_op}/test_relu_slice.py    |   3 +-
 .../topi/{ => slice_op}/test_softmax_slice.py |   2 +-
 .../topi/{ => slice_op}/test_tanh_slice.py    |   3 +-
 .../topi/test_add_subtract_multiply.py        | 272 ++++++++---------
 .../test_hexagon/topi/test_batch_matmul.py    |  54 ++--
 .../topi/test_conv2d_fp16_intrin.py           | 197 +++++++------
 .../test_hexagon/topi/test_conv2d_nchw.py     | 133 ++++-----
 .../test_hexagon/topi/test_conv2d_nhwc.py     |  49 ++--
 .../topi/test_conv2d_transpose.py             |  83 ++----
 .../contrib/test_hexagon/topi/test_dense.py   | 178 ++++++------
 .../test_hexagon/topi/test_depth_to_space.py  |  49 ++--
 .../topi/test_depthwise_conv2d.py             | 273 +++++++++---------
 .../contrib/test_hexagon/topi/test_pad.py     |  11 +-
 .../contrib/test_hexagon/topi/test_pooling.py |  39 ++-
 .../test_hexagon/topi/test_quantize.py        |  60 ++--
 .../contrib/test_hexagon/topi/test_reduce.py  | 209 +++++++-------
 .../contrib/test_hexagon/topi/test_reshape.py |  76 ++---
 .../test_hexagon/topi/test_resize2d.py        | 144 ++++-----
 .../contrib/test_hexagon/topi/test_softmax.py | 121 ++++----
 33 files changed, 1127 insertions(+), 1021 deletions(-)
 create mode 100644 tests/python/contrib/test_hexagon/topi/slice_op/__init__.py
 rename tests/python/contrib/test_hexagon/topi/{ => slice_op}/test_argmax_slice.py (97%)
 rename tests/python/contrib/test_hexagon/topi/{ => slice_op}/test_avg_pool2d_slice.py (99%)
 rename tests/python/contrib/test_hexagon/topi/{ => slice_op}/test_cast_slice.py (98%)
 rename tests/python/contrib/test_hexagon/topi/{test_clip.py => slice_op/test_clip_slice.py} (97%)
 mode change 100755 => 100644
 rename tests/python/contrib/test_hexagon/topi/{ => slice_op}/test_conv2d_slice.py (99%)
 mode change 100755 => 100644
 rename tests/python/contrib/test_hexagon/topi/{ => slice_op}/test_depthwise_conv2d_slice.py (99%)
 rename tests/python/contrib/test_hexagon/topi/{ => slice_op}/test_dequantize_slice.py (99%)
 rename tests/python/contrib/test_hexagon/topi/{ => slice_op}/test_max_pool2d_slice.py (98%)
 rename tests/python/contrib/test_hexagon/topi/{ => slice_op}/test_relu_slice.py (97%)
 rename tests/python/contrib/test_hexagon/topi/{ => slice_op}/test_softmax_slice.py (98%)
 rename tests/python/contrib/test_hexagon/topi/{ => slice_op}/test_tanh_slice.py (97%)
 mode change 100755 => 100644 tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
 mode change 100755 => 100644 tests/python/contrib/test_hexagon/topi/test_quantize.py
 mode change 100755 => 100644 tests/python/contrib/test_hexagon/topi/test_resize2d.py

diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh
index e41dc2bb80b8..2b0b8365649d 100755
--- a/tests/lint/pylint.sh
+++ b/tests/lint/pylint.sh
@@ -29,7 +29,8 @@ python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc
 # tests/python/contrib/test_hexagon tests
 python3 -m pylint tests/python/contrib/test_hexagon/*.py --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/contrib/test_hexagon/conv2d/*.py --rcfile="$(dirname "$0")"/pylintrc
-
+python3 -m pylint tests/python/contrib/test_hexagon/topi/*.py --rcfile="$(dirname "$0")"/pylintrc
+python3 -m pylint tests/python/contrib/test_hexagon/metaschedule_e2e/*.py --rcfile="$(dirname "$0")"/pylintrc
 
 # tests/python/frontend tests
 python3 -m pylint tests/python/frontend/caffe/test_forward.py --rcfile="$(dirname "$0")"/pylintrc
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py
index 660fbf757284..3e331cbf8ccb 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/export_models.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Hexagon MetaSchedule test helper functions."""
+
 import torch
 from torchvision.models import resnet
 from torchvision.models.quantization import resnet as qresnet
@@ -23,6 +25,7 @@
 
 
 def export_resnet50_fp16():
+    """Export Resnet50 FP16."""
     model = resnet.resnet50(pretrained=True).eval()
 
     pt_inp = torch.randn(1, 3, 224, 224)
@@ -34,14 +37,16 @@ def export_resnet50_fp16():
     mod, params = relay.frontend.from_pytorch(script_module, input_shapes)
     mod = relay.transform.ToMixedPrecision("float16")(mod)
 
-    with open("resnet50_fp16.json", "w") as fo:
-        fo.write(tvm.ir.save_json(mod))
+    with open("resnet50_fp16.json", "w") as file:
+        file.write(tvm.ir.save_json(mod))
 
-    with open("resnet50_fp16.params", "wb") as fo:
-        fo.write(relay.save_param_dict(params))
+    with open("resnet50_fp16.params", "wb") as file:
+        file.write(relay.save_param_dict(params))
 
 
 def export_resnet50_int8():
+    """Export Resnet50 INT8."""
+
     def quantize_model(model, inp):
         model.fuse_model()
         model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
@@ -62,11 +67,11 @@ def quantize_model(model, inp):
         script_module, input_shapes, keep_quantized_weight=True
     )
 
-    with open("resnet50_int8.json", "w") as fo:
-        fo.write(tvm.ir.save_json(mod))
+    with open("resnet50_int8.json", "w") as file:
+        file.write(tvm.ir.save_json(mod))
 
-    with open("resnet50_int8.params", "wb") as fo:
-        fo.write(relay.save_param_dict(params))
+    with open("resnet50_int8.params", "wb") as file:
+        file.write(relay.save_param_dict(params))
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
index 84a33b9c80d3..117e9d4b6f19 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_fp16.py
@@ -14,10 +14,12 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Test Resnet50 float16 with MetaSchedule"""
+
 import os
-import pytest
 import tempfile
 
+import pytest
 import numpy as np
 
 import tvm.testing
@@ -29,12 +31,6 @@
 from ..infrastructure import get_hexagon_target
 
 
-target = get_hexagon_target("v69")
-target_llvm = tvm.target.Target("llvm")
-model_json = "resnet50_fp16.json"
-model_params = "resnet50_fp16.params"
-
-
 def convert_conv2d_layout(mod, desired_layouts):
     with tvm.transform.PassContext(opt_level=3):
         seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
@@ -44,14 +40,20 @@ def convert_conv2d_layout(mod, desired_layouts):
 @pytest.mark.skip("End-to-end tuning is skipped on CI.")
 @tvm.testing.requires_hexagon
 def test_resnet50(hexagon_launcher):
+    """Test Resnet50."""
+    model_json = "resnet50_fp16.json"
+    target_llvm = tvm.target.Target("llvm")
+    target_hexagon = get_hexagon_target("v69")
+    model_params = "resnet50_fp16.params"
+
     if not os.path.exists(model_json):
         pytest.skip(msg="Run python export_models.py first.")
 
-    with open(model_json, "r") as fi:
-        mod = tvm.ir.load_json(fi.read())
+    with open(model_json, "r") as file:
+        mod = tvm.ir.load_json(file.read())
 
-    with open(model_params, "rb") as fi:
-        params = relay.load_param_dict(fi.read())
+    with open(model_params, "rb") as file:
+        params = relay.load_param_dict(file.read())
 
     mod = convert_conv2d_layout(mod, {"nn.conv2d": ["NHWC", "HWIO"]})
 
@@ -66,7 +68,7 @@ def test_resnet50(hexagon_launcher):
     with tempfile.TemporaryDirectory() as work_dir:
         database = ms.relay_integration.tune_relay(
             mod=mod,
-            target=target,
+            target=target_hexagon,
             params=params,
             work_dir=work_dir,
             # for faster tuning
@@ -88,7 +90,7 @@ def test_resnet50(hexagon_launcher):
         hexagon_lowered = ms.relay_integration.compile_relay(
             database=database,
             mod=mod,
-            target=target,
+            target=target_hexagon,
             params=params,
         )
 
@@ -127,3 +129,7 @@ def test_resnet50(hexagon_launcher):
             hexagon_lowered.get_graph_json(), hexagon_lowered.lib
         )
         print(debug_ex.profile(input_name=inp.copy()))
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index e7400aee61f6..6970b0ac06b5 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -14,12 +14,15 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Test Resnet50 int8 with MetaSchedule"""
+
 import os
-import numpy as np
-import pytest
 import tempfile
 from typing import Optional
 
+import numpy as np
+import pytest
+
 import tvm
 import tvm.testing
 from tvm import relay
@@ -31,15 +34,15 @@
 from tvm.tir.schedule import BlockRV, Schedule
 from ..infrastructure import get_hexagon_target
 
-
-executor = relay.backend.Executor("graph", {"link-params": True})
-target = get_hexagon_target("v68")
-target_llvm = tvm.target.Target("llvm")
-model_json = "resnet50_int8.json"
-model_params = "resnet50_int8.params"
+MODEL_JSON = "resnet50_int8.json"
+EXECUTOR = relay.backend.Executor("graph", {"link-params": True})
+TARGET_LLVM = tvm.target.Target("llvm")
+TARGET_HEXAGON = get_hexagon_target("v68")
+MODEL_PARAMS = "resnet50_int8.params"
 
 
 def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
+    """Tune VRMPY with auto tensorization."""
     sch_rules = [
         schedule_rule.AutoInline(
             into_producer=False,
@@ -95,12 +98,12 @@ def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
 
     # This line is necessary for link-params to take effect during
     # task extraction and relay.build(...).
-    mod = mod.with_attr("executor", executor)
+    mod = mod.with_attr("executor", EXECUTOR)
 
     with tempfile.TemporaryDirectory() as work_dir:
         database = ms.relay_integration.tune_relay(
             mod=mod,
-            target=target,
+            target=TARGET_HEXAGON,
             params=params,
             work_dir=work_dir,
             # for faster tuning
@@ -129,7 +132,7 @@ def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
         return ms.relay_integration.compile_relay(
             database=database,
             mod=mod,
-            target=target,
+            target=TARGET_HEXAGON,
             params=params,
         )
 
@@ -137,14 +140,15 @@ def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
 @pytest.mark.skip("End-to-end tuning is skipped on CI.")
 @tvm.testing.requires_hexagon
 def test_resnet50(hexagon_launcher):
-    if not os.path.exists(model_json):
+    """Test Resnet50."""
+    if not os.path.exists(MODEL_JSON):
         pytest.skip(msg="Run python export_models.py first.")
 
-    with open(model_json, "r") as fi:
-        mod = tvm.ir.load_json(fi.read())
+    with open(MODEL_JSON, "r") as file:
+        mod = tvm.ir.load_json(file.read())
 
-    with open(model_params, "rb") as fi:
-        params = relay.load_param_dict(fi.read())
+    with open(MODEL_PARAMS, "rb") as file:
+        params = relay.load_param_dict(file.read())
     inp = np.random.randn(1, 3, 224, 224).astype("float32")
     input_name = "image"
 
@@ -156,15 +160,15 @@ def test_resnet50(hexagon_launcher):
         with tvm.transform.PassContext(opt_level=3):
             hexagon_lowered = relay.build(
                 mod,
-                tvm.target.Target(target, host=target),
+                tvm.target.Target(TARGET_HEXAGON, host=TARGET_HEXAGON),
                 params=params,
-                executor=executor,
+                executor=EXECUTOR,
             )
 
     with tvm.transform.PassContext(opt_level=3):
         llvm_lowered = tvm.relay.build(
             mod,
-            tvm.target.Target(target_llvm, host=target_llvm),
+            tvm.target.Target(TARGET_LLVM, host=TARGET_LLVM),
             params=params,
         )
 
@@ -191,16 +195,16 @@ def test_resnet50(hexagon_launcher):
         print(debug_ex.profile(input_name=inp.copy()))
 
 
-def _schedule_packed_8x8x32_conv2d(do_tune: bool):
+def _schedule_packed_8x8x32_conv2d():
     """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
     using 8x8x32 packed layout.
     """
 
     def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
-        if conv2d_block == None:
+        if conv2d_block is None:
             try:
                 conv2d_block = sch.get_block("conv2d_NCHWc_int8")
-            except:
+            except ValueError:
                 return False
 
         assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
@@ -234,13 +238,13 @@ def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
             # be desirable to do this with coarser spatial granularity
             sch.compute_at(conv2d_block, loops[4])
 
-        def index_map_nchw32c_nchw8h8w32c(n, c, h, w, c32):
-            return [n, c, h // 8, w // 8, h % 8, w % 8, c32]
+        def index_map_nchw32c_nchw8h8w32c(n_batch, channel, height, width, channel_32):
+            return [n_batch, channel, height // 8, width // 8, height % 8, width % 8, channel_32]
 
         # Add cache for input and output activation layout transform,
         # note that weight is already in correct layout
-        input_cache = sch.cache_read(conv2d_block, 0, "global")
-        output_cache = sch.cache_write(outer_block, 0, "global")
+        input_cache = sch.cache_read(conv2d_block, 0, "global")  # pylint: disable=unused-variable
+        output_cache = sch.cache_write(outer_block, 0, "global")  # pylint: disable=unused-variable
         # Transform the layout of the input
         sch.transform_layout(
             conv2d_block, ("read", 0), index_map=index_map_nchw32c_nchw8h8w32c, pad_value=0
@@ -259,23 +263,25 @@ def index_map_nchw32c_nchw8h8w32c(n, c, h, w, c32):
 
 
 def tune_packed_8x8x32_template(mod, params, hexagon_launcher):
+    """Generate packed 8*8*32 template."""
+
     def schedule_rule_conv2d_packed_8x8x32(sch: Schedule, conv2d_block: BlockRV):
-        _schedule_packed_8x8x32_conv2d(do_tune=True)(sch, conv2d_block)
+        _schedule_packed_8x8x32_conv2d()(sch, conv2d_block)
         return [sch]
 
     register_func("meta_schedule.conv2d_NCHWc_int8", schedule_rule_conv2d_packed_8x8x32)
 
     def schedule_conv2d_for_tune(sch: Schedule):
-        _schedule_packed_8x8x32_conv2d(do_tune=True)(sch)
+        _schedule_packed_8x8x32_conv2d()(sch)
 
     # This line is necessary for link-params to take effect during
     # task extraction and relay.build(...).
-    mod = mod.with_attr("executor", executor)
+    mod = mod.with_attr("executor", EXECUTOR)
 
     with tempfile.TemporaryDirectory() as work_dir:
         database = ms.relay_integration.tune_relay(
             mod=mod,
-            target=target,
+            target=TARGET_HEXAGON,
             params=params,
             work_dir=work_dir,
             max_trials_global=20000,
@@ -309,7 +315,7 @@ def schedule_conv2d_for_tune(sch: Schedule):
         return ms.relay_integration.compile_relay(
             database=database,
             mod=mod,
-            target=target,
+            target=TARGET_HEXAGON,
             params=params,
         )
 
@@ -317,14 +323,15 @@ def schedule_conv2d_for_tune(sch: Schedule):
 @pytest.mark.skip("End-to-end tuning is skipped on CI.")
 @tvm.testing.requires_hexagon
 def test_packed_8x8x32_resnet50(hexagon_launcher):
-    if not os.path.exists(model_json):
+    """Test packed 8*8*32 Resnet50"""
+    if not os.path.exists(MODEL_JSON):
         pytest.skip(msg="Run python export_models.py first.")
 
-    with open(model_json, "r") as fi:
-        mod = tvm.ir.load_json(fi.read())
+    with open(MODEL_JSON, "r") as file:
+        mod = tvm.ir.load_json(file.read())
 
-    with open(model_params, "rb") as fi:
-        params = relay.load_param_dict(fi.read())
+    with open(MODEL_PARAMS, "rb") as file:
+        params = relay.load_param_dict(file.read())
     inp = np.random.randn(1, 3, 224, 224).astype("float32")
     input_name = "image"
 
@@ -336,15 +343,15 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
         with tvm.transform.PassContext(opt_level=3):
             hexagon_lowered = relay.build(
                 mod,
-                tvm.target.Target(target, host=target),
+                tvm.target.Target(TARGET_HEXAGON, host=TARGET_HEXAGON),
                 params=params,
-                executor=executor,
+                executor=EXECUTOR,
             )
 
     with tvm.transform.PassContext(opt_level=3):
         llvm_lowered = tvm.relay.build(
             mod,
-            tvm.target.Target(target_llvm, host=target_llvm),
+            tvm.target.Target(TARGET_LLVM, host=TARGET_LLVM),
             params=params,
         )
 
@@ -360,3 +367,7 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
         ref_result = llvm_graph_mod.get_output(0).numpy()
 
         np.testing.assert_allclose(ref_result, hexagon_output, atol=1e-4, rtol=1e-5)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/topi/__init__.py b/tests/python/contrib/test_hexagon/topi/__init__.py
index fb6657b09e51..dce5413e66e2 100644
--- a/tests/python/contrib/test_hexagon/topi/__init__.py
+++ b/tests/python/contrib/test_hexagon/topi/__init__.py
@@ -15,4 +15,4 @@
 # specific language governing permissions and limitations
 # under the License.
 
-""" Testing infrastructure for Hexagon/TOPI """
+""" Hexagon TOPI tests """
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/__init__.py b/tests/python/contrib/test_hexagon/topi/slice_op/__init__.py
new file mode 100644
index 000000000000..baf28ad93323
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Hexagon TOPI Slice OP tests """
diff --git a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_argmax_slice.py
similarity index 97%
rename from tests/python/contrib/test_hexagon/topi/test_argmax_slice.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_argmax_slice.py
index 5ed86a1fcc92..5f4a594fcfb1 100644
--- a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_argmax_slice.py
@@ -22,7 +22,7 @@
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.contrib.hexagon
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
 class TestArgMaxSlice:
diff --git a/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py
similarity index 99%
rename from tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py
index 6f6a7d762747..13876da87295 100644
--- a/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import pytest
 import numpy as np
 from typing import *
 
@@ -24,13 +23,13 @@
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.topi.hexagon.qnn as qn
-from ..infrastructure import (
+from ...infrastructure import (
     allocate_hexagon_array,
     transform_numpy,
     quantize_np,
     get_hexagon_target,
 )
-from ..pytest_util import (
+from ...pytest_util import (
     get_multitest_ids,
     create_populated_numpy_ndarray,
     TensorContentRandom,
diff --git a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_cast_slice.py
similarity index 98%
rename from tests/python/contrib/test_hexagon/topi/test_cast_slice.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_cast_slice.py
index 7f59e3ffa7fd..3118c7be8efb 100644
--- a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_cast_slice.py
@@ -23,7 +23,7 @@
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
 
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
 class TestCastF16F32Slice2d:
diff --git a/tests/python/contrib/test_hexagon/topi/test_clip.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_clip_slice.py
old mode 100755
new mode 100644
similarity index 97%
rename from tests/python/contrib/test_hexagon/topi/test_clip.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_clip_slice.py
index 3f8f5077c758..e0a2e20a0b6b
--- a/tests/python/contrib/test_hexagon/topi/test_clip.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_clip_slice.py
@@ -20,10 +20,10 @@
 import numpy as np
 
 from tvm import te
-
 import tvm.testing
 import tvm.topi.hexagon.slice_ops as sl
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+
+from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 input_layout = tvm.testing.parameter(
     "nhwc-8h2w32c2w-2d",
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py
old mode 100755
new mode 100644
similarity index 99%
rename from tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py
index 242265169fb8..c314e9655c9a
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py
@@ -25,7 +25,7 @@
 from tvm.topi.hexagon.slice_ops.conv2d import conv2d_compute, conv2d_schedule
 from tvm.topi.testing import conv2d_nhwc_python
 
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 input_layout = tvm.testing.parameter(
     "nhwc-8h2w32c2w-2d",
diff --git a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_depthwise_conv2d_slice.py
similarity index 99%
rename from tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d_slice.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_depthwise_conv2d_slice.py
index 840a462917ae..74e4d05446ed 100644
--- a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_depthwise_conv2d_slice.py
@@ -19,12 +19,14 @@
 """Test depthwise_conv2d slice op for hexagon."""
 
 import numpy as np
+
 import tvm
 import tvm.testing
 import tvm.topi.hexagon.qnn as qn
 from tvm.topi.testing import depthwise_conv2d_python_nhwc
 from tvm.topi.hexagon.slice_ops.dwconv2d import dwconv2d_compute, dwconv2d_schedule
-from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
+
+from ...infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
 
 
 @tvm.testing.fixture
diff --git a/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_dequantize_slice.py
similarity index 99%
rename from tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_dequantize_slice.py
index 6ed217180aba..9b1c5bc5f614 100644
--- a/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_dequantize_slice.py
@@ -23,7 +23,7 @@
 import tvm.testing
 from tvm import te
 from tvm.topi.hexagon import qnn
-from ..infrastructure import (
+from ...infrastructure import (
     allocate_hexagon_array,
     transform_numpy,
     quantize_np,
diff --git a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_max_pool2d_slice.py
similarity index 98%
rename from tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_max_pool2d_slice.py
index f2ee76863cb6..fcb4411609b2 100644
--- a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_max_pool2d_slice.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import pytest
 import numpy as np
 from typing import *
 
@@ -23,8 +22,9 @@
 import tvm.testing
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.hexagon.slice_ops as sl
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
-from ..pytest_util import (
+
+from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...pytest_util import (
     get_multitest_ids,
     create_populated_numpy_ndarray,
     TensorContentRandom,
diff --git a/tests/python/contrib/test_hexagon/topi/test_relu_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_relu_slice.py
similarity index 97%
rename from tests/python/contrib/test_hexagon/topi/test_relu_slice.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_relu_slice.py
index fd04cca061da..93a8d77827bf 100644
--- a/tests/python/contrib/test_hexagon/topi/test_relu_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_relu_slice.py
@@ -16,14 +16,13 @@
 # under the License.
 
 import numpy as np
-import pytest
 
 import tvm
 import tvm.testing
 from tvm.topi.hexagon.slice_ops.relu import relu_compute, relu_stir_schedule
 from tvm import te
 
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
 @tvm.testing.fixture
diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_softmax_slice.py
similarity index 98%
rename from tests/python/contrib/test_hexagon/topi/test_softmax_slice.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_softmax_slice.py
index 1329fda7aa4a..a3db1b6dcdbe 100644
--- a/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_softmax_slice.py
@@ -21,7 +21,7 @@
 from tvm.topi.testing import softmax_python
 import tvm.topi.hexagon.slice_ops as sl
 
-from ..infrastructure import allocate_hexagon_array, get_hexagon_target
+from ...infrastructure import allocate_hexagon_array
 
 
 def transform_numpy(arr_np, layout):
diff --git a/tests/python/contrib/test_hexagon/topi/test_tanh_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_tanh_slice.py
similarity index 97%
rename from tests/python/contrib/test_hexagon/topi/test_tanh_slice.py
rename to tests/python/contrib/test_hexagon/topi/slice_op/test_tanh_slice.py
index 02c587b9809c..f8c14ef934a1 100644
--- a/tests/python/contrib/test_hexagon/topi/test_tanh_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_tanh_slice.py
@@ -16,14 +16,13 @@
 # under the License.
 """ Test for Hexagon slice tanh op """
 import numpy as np
-import pytest
 
 import tvm
 import tvm.testing
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.contrib.hexagon
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 # pylint: disable=invalid-name
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
old mode 100755
new mode 100644
index 711d725e842f..d689888d6e85
--- a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
+++ b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
@@ -14,9 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-
-import pytest
+"""Test code for Add, Subtract and Multiply."""
 import numpy as np
 
 import tvm
@@ -30,78 +28,14 @@
     get_hexagon_target,
 )
 
+ZERO_POINT_A_VAL = None
+SCALE_A_VAL = None
 
-@tvm.testing.fixture
-def expected_output_np(input_np_A, input_np_B, op_name):
-    if op_name == "add":
-        out_ref = np.add(input_np_A, input_np_B)
-    elif op_name == "subtract":
-        out_ref = np.subtract(input_np_A, input_np_B)
-    elif op_name == "multiply":
-        out_ref = np.multiply(input_np_A, input_np_B)
-    return out_ref
-
-
-@tvm.testing.fixture
-def input_np_A(input_shape_A, dtype):
-    if dtype == "uint8" or dtype == "int8":
-        dtype = "float32"
-    return np.random.random(input_shape_A).astype(dtype)
-
-
-@tvm.testing.fixture
-def input_np_B(input_shape_B, dtype):
-    if dtype == "uint8" or dtype == "int8":
-        dtype = "float32"
-    return np.random.random(input_shape_B).astype(dtype)
-
-
-@tvm.testing.fixture
-def quantize_input_np_A(input_np_A, dtype):
-    if dtype == "uint8" or dtype == "int8":
-        global zero_point_A_val, scale_A_val
-        input_np_A_quantized, scale_A_val, zero_point_A_val = quantize_np(input_np_A, dtype)
-        return input_np_A_quantized
-
-
-@tvm.testing.fixture
-def quantize_input_np_B(input_np_B, dtype):
-    if dtype == "uint8" or dtype == "int8":
-        global zero_point_B_val, scale_B_val
-        input_np_B_quantized, scale_B_val, zero_point_B_val = quantize_np(input_np_B, dtype)
-        return input_np_B_quantized
-
-
-@tvm.testing.fixture
-def transformed_input_np_A(input_np_A, quantize_input_np_A, input_A_layout, dtype):
-    if dtype == "float16":
-        return transform_numpy(input_np_A, "nhwc", input_A_layout)
-    if dtype == "uint8" or dtype == "int8":
-        return transform_numpy(quantize_input_np_A, "nhwc", input_A_layout)
-
-    raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-
-@tvm.testing.fixture
-def transformed_input_np_B(input_np_B, quantize_input_np_B, input_B_layout, dtype):
-    if dtype == "float16":
-        return transform_numpy(input_np_B, "nhwc", input_B_layout)
-    if dtype == "uint8" or dtype == "int8":
-        return transform_numpy(quantize_input_np_B, "nhwc", input_B_layout)
+ZERO_POINT_B_VAL = None
+SCALE_B_VAL = None
 
-    raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-
-@tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, output_layout, dtype):
-    if dtype == "float16":
-        return transform_numpy(expected_output_np, "nhwc", output_layout)
-    if dtype == "uint8" or dtype == "int8":
-        global zero_point_M_val, scale_M_val
-        out_ref_quantized, scale_M_val, zero_point_M_val = quantize_np(expected_output_np, dtype)
-        return transform_numpy(out_ref_quantized, "nhwc", output_layout)
-
-    raise RuntimeError(f"Unsupported data type '{dtype}'")
+ZERO_POINT_M_VAL = None
+SCALE_M_VAL = None
 
 
 def hexagon_wrapper_allocation(
@@ -114,7 +48,7 @@ def hexagon_wrapper_allocation(
     dtype=None,
 ):
     """Input layout can either be nhwc-8h2w32c2w-2d or nhwc"""
-    if layout == "nhwc-8h2w32c2w-2d" or layout == "nhwc-8h8w32c-2d":
+    if layout in ["nhwc-8h2w32c2w-2d", "nhwc-8h8w32c-2d"]:
         data_nd = allocate_hexagon_array(
             device,
             tensor_shape=tensor_shape,
@@ -132,11 +66,13 @@ def hexagon_wrapper_allocation(
 
 
 class TestAddSubtractMultiplyBroadcast2d:
+    """Test Add, Subtract and Multiply class."""
+
     (
-        input_shape_A,
-        input_shape_B,
-        input_A_layout,
-        input_B_layout,
+        input_shape_a,
+        input_shape_b,
+        input_a_layout,
+        input_b_layout,
         output_layout,
         dtype,
     ) = tvm.testing.parameters(
@@ -269,60 +205,134 @@ class TestAddSubtractMultiplyBroadcast2d:
 
     op_name = tvm.testing.parameter("add", "subtract", "multiply")
 
+    @tvm.testing.fixture
+    def expected_output_np(self, input_np_a, input_np_b, op_name):
+        """Generate expected output."""
+        if op_name == "add":
+            out_ref = np.add(input_np_a, input_np_b)
+        elif op_name == "subtract":
+            out_ref = np.subtract(input_np_a, input_np_b)
+        elif op_name == "multiply":
+            out_ref = np.multiply(input_np_a, input_np_b)
+        return out_ref
+
+    @tvm.testing.fixture
+    def transformed_expected_output_np(self, expected_output_np, output_layout, dtype):
+        """Generate expected output."""
+        if dtype == "float16":
+            return transform_numpy(expected_output_np, "nhwc", output_layout)
+        if dtype in ["uint8", "int8"]:
+            global ZERO_POINT_M_VAL, SCALE_M_VAL
+            out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np(
+                expected_output_np, dtype
+            )
+            return transform_numpy(out_ref_quantized, "nhwc", output_layout)
+
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+    @tvm.testing.fixture
+    def input_np_a(self, input_shape_a, dtype):
+        """Generate numpy input for variable a."""
+        if dtype in ["uint8", "int8"]:
+            dtype = "float32"
+        return np.random.random(input_shape_a).astype(dtype)
+
+    @tvm.testing.fixture
+    def input_np_b(self, input_shape_b, dtype):
+        """Generate numpy input for variable b."""
+        if dtype in ["uint8", "int8"]:
+            dtype = "float32"
+        return np.random.random(input_shape_b).astype(dtype)
+
+    @tvm.testing.fixture
+    def quantize_input_np_a(self, input_np_a, dtype):
+        if dtype in ["uint8", "int8"]:
+            global ZERO_POINT_A_VAL, SCALE_A_VAL
+            input_np_a_quantized, SCALE_A_VAL, ZERO_POINT_A_VAL = quantize_np(input_np_a, dtype)
+            return input_np_a_quantized
+        return None
+
+    @tvm.testing.fixture
+    def quantize_input_np_b(self, input_np_b, dtype):
+        if dtype in ["uint8", "int8"]:
+            global ZERO_POINT_B_VAL, SCALE_B_VAL
+            input_np_b_quantized, SCALE_B_VAL, ZERO_POINT_B_VAL = quantize_np(input_np_b, dtype)
+            return input_np_b_quantized
+        return None
+
+    @tvm.testing.fixture
+    def transformed_input_np_a(self, input_np_a, quantize_input_np_a, input_a_layout, dtype):
+        if dtype == "float16":
+            return transform_numpy(input_np_a, "nhwc", input_a_layout)
+        if dtype in ["uint8", "int8"]:
+            return transform_numpy(quantize_input_np_a, "nhwc", input_a_layout)
+
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+    @tvm.testing.fixture
+    def transformed_input_np_b(self, input_np_b, quantize_input_np_b, input_b_layout, dtype):
+        if dtype == "float16":
+            return transform_numpy(input_np_b, "nhwc", input_b_layout)
+        if dtype in ["uint8", "int8"]:
+            return transform_numpy(quantize_input_np_b, "nhwc", input_b_layout)
+
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
     @tvm.testing.requires_hexagon
     def test_transform(
         self,
         dtype,
-        input_shape_A,
-        input_shape_B,
-        input_np_A,
-        input_np_B,
-        quantize_input_np_A,
-        quantize_input_np_B,
-        transformed_input_np_A,
-        transformed_input_np_B,
+        input_shape_a,
+        input_shape_b,
+        input_np_a,
+        input_np_b,
+        quantize_input_np_a,
+        quantize_input_np_b,
+        transformed_input_np_a,
+        transformed_input_np_b,
         expected_output_np,
         transformed_expected_output_np,
         hexagon_session,
         output_layout,
-        input_A_layout,
-        input_B_layout,
+        input_a_layout,
+        input_b_layout,
         op_name,
     ):
+        """Test transform."""
         output_shape = expected_output_np.shape
-        A = te.placeholder(input_shape_A, name="A", dtype=dtype)
-        B = te.placeholder(input_shape_B, name="B", dtype=dtype)
+        a_tensor = te.placeholder(input_shape_a, name="a_tensor", dtype=dtype)
+        b_tensor = te.placeholder(input_shape_b, name="b_tensor", dtype=dtype)
         if dtype == "float16":
             if op_name == "add":
-                M = sl.add_broadcast_compute(A, B)
+                m_tensor = sl.add_broadcast_compute(a_tensor, b_tensor)
             elif op_name == "subtract":
-                M = sl.subtract_broadcast_compute(A, B)
+                m_tensor = sl.subtract_broadcast_compute(a_tensor, b_tensor)
             elif op_name == "multiply":
-                M = sl.multiply_broadcast_compute(A, B)
+                m_tensor = sl.multiply_broadcast_compute(a_tensor, b_tensor)
             tir_schedule = sl.tir_broadcast_schedule(
-                M, A, B, output_layout, input_A_layout, input_B_layout, op_name
+                m_tensor, a_tensor, b_tensor, output_layout, input_a_layout, input_b_layout, op_name
             )
-        elif dtype == "uint8" or dtype == "int8":
+        elif dtype in ["uint8", "int8"]:
             args = [
-                A,
-                B,
+                a_tensor,
+                b_tensor,
                 output_shape,
-                zero_point_A_val,
-                scale_A_val,
-                zero_point_B_val,
-                scale_B_val,
-                zero_point_M_val,
-                scale_M_val,
+                ZERO_POINT_A_VAL,
+                SCALE_A_VAL,
+                ZERO_POINT_B_VAL,
+                SCALE_B_VAL,
+                ZERO_POINT_M_VAL,
+                SCALE_M_VAL,
                 dtype,
             ]
             if op_name == "add":
-                M = qn.qadd_broadcast_compute(*args)
+                m_tensor = qn.qadd_broadcast_compute(*args)
             elif op_name == "subtract":
-                M = qn.qsubtract_broadcast_compute(*args)
+                m_tensor = qn.qsubtract_broadcast_compute(*args)
             elif op_name == "multiply":
-                M = qn.qmultiply_broadcast_compute(*args)
+                m_tensor = qn.qmultiply_broadcast_compute(*args)
             tir_schedule = qn.tir_schedule_quant(
-                M, A, B, output_layout, input_A_layout, input_B_layout
+                m_tensor, a_tensor, b_tensor, output_layout, input_a_layout, input_b_layout
             )
 
         sch = tir_schedule.mod
@@ -339,35 +349,35 @@ def test_transform(
         with tvm.transform.PassContext(opt_level=3):
             func = tvm.build(
                 sch,
-                [A, B, M],
+                [a_tensor, b_tensor, m_tensor],
                 get_hexagon_target("v69"),
                 name="slice_op_with_transform",
             )
 
         if dtype == "float16":
-            in_data_np_A = input_np_A
-            in_data_np_B = input_np_B
-        elif dtype == "int8" or dtype == "uint8":
-            in_data_np_A = quantize_input_np_A
-            in_data_np_B = quantize_input_np_B
+            in_data_np_a = input_np_a
+            in_data_np_b = input_np_b
+        elif dtype in ["int8", "uint8"]:
+            in_data_np_a = quantize_input_np_a
+            in_data_np_b = quantize_input_np_b
         else:
             raise RuntimeError(f"Unsupport dtype '{dtype}'")
 
-        A_data_nd = hexagon_wrapper_allocation(
+        a_data_nd = hexagon_wrapper_allocation(
             hexagon_session.device,
-            layout=input_A_layout,
-            data_original=in_data_np_A,
-            transformed_data=transformed_input_np_A,
+            layout=input_a_layout,
+            data_original=in_data_np_a,
+            transformed_data=transformed_input_np_a,
             axis_separators=input_axis_separator,
         )
-        B_data_nd = hexagon_wrapper_allocation(
+        b_data_nd = hexagon_wrapper_allocation(
             hexagon_session.device,
-            layout=input_B_layout,
-            data_original=in_data_np_B,
-            transformed_data=transformed_input_np_B,
+            layout=input_b_layout,
+            data_original=in_data_np_b,
+            transformed_data=transformed_input_np_b,
             axis_separators=input_axis_separator,
         )
-        M_data_nd = hexagon_wrapper_allocation(
+        m_data_nd = hexagon_wrapper_allocation(
             hexagon_session.device,
             layout=output_layout,
             tensor_shape=transformed_expected_output_np.shape,
@@ -376,21 +386,25 @@ def test_transform(
         )
 
         mod = hexagon_session.load_module(func)
-        mod(A_data_nd, B_data_nd, M_data_nd)
+        mod(a_data_nd, b_data_nd, m_data_nd)
 
-        b, h, w, c = output_shape
+        batch, height, width, channel = output_shape
         # convert nd to np and reshape to fixed chunk size layout
         if output_layout == "nhwc-8h2w32c2w-2d":
-            M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2])
+            m_data_np = m_data_nd.numpy().reshape(
+                [batch, height // 8, width // 4, channel // 32, 8, 2, 32, 2]
+            )
         elif output_layout == "nhwc-8h8w32c-2d":
-            M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32])
+            m_data_np = m_data_nd.numpy().reshape(
+                [batch, height // 8, width // 8, channel // 32, 8, 8, 32]
+            )
 
         if dtype == "float16":
             np.testing.assert_allclose(
-                transformed_expected_output_np, M_data_np, rtol=1e-3, atol=1e-3
+                transformed_expected_output_np, m_data_np, rtol=1e-3, atol=1e-3
             )
-        elif dtype == "int8" or dtype == "uint8":
-            np.testing.assert_allclose(transformed_expected_output_np, M_data_np, rtol=1, atol=1)
+        elif dtype in ["int8", "uint8"]:
+            np.testing.assert_allclose(transformed_expected_output_np, m_data_np, rtol=1, atol=1)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
index f3273ea8b65b..22fd96254ca7 100644
--- a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
+++ b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
@@ -25,18 +25,14 @@
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
-from tvm.contrib.hexagon.session import Session
 
 from ..infrastructure import get_hexagon_target
 
-dtype = tvm.testing.parameter(
-    "float32",
-    "float16",
-)
-
 
 class TestMatMulFloat:
-    x_batch, y_batch, M, N, K = tvm.testing.parameters(
+    """Test MatMul Float class."""
+
+    x_batch, y_batch, m_size, n_size, k_size = tvm.testing.parameters(
         (1, 1, 16, 16, 32),
         (5, 5, 16, 16, 32),
         (5, 5, 16, 20, 32),
@@ -46,18 +42,26 @@ class TestMatMulFloat:
         (5, 1, 16, 16, 32),
     )
 
+    dtype = tvm.testing.parameter(
+        "float32",
+        "float16",
+    )
+
     # TODO(mehrdadh): add dynamic testing
     @tvm.testing.requires_hexagon
-    def test_batch_matmul(self, hexagon_session: Session, x_batch, y_batch, M, N, K, dtype):
+    def test_batch_matmul(
+        self, hexagon_session: Session, x_batch, y_batch, m_size, n_size, k_size, dtype
+    ):
+        """Test batch MatMul."""
         if dtype == "float16":
             pytest.xfail("float16 is not supported.")
 
-        x = te.placeholder((x_batch, M, K), name="x")
-        y = te.placeholder((y_batch, N, K), name="y")
+        x = te.placeholder((x_batch, m_size, k_size), name="x")
+        y = te.placeholder((y_batch, n_size, k_size), name="y")
 
         def get_ref_data():
-            a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype)
-            b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype)
+            a_np = np.random.uniform(size=(x_batch, m_size, k_size)).astype(dtype)
+            b_np = np.random.uniform(size=(y_batch, n_size, k_size)).astype(dtype)
             c_np = tvm.topi.testing.batch_matmul(a_np, b_np)
             return (a_np, b_np, c_np)
 
@@ -89,7 +93,9 @@ def get_ref_data():
 
 
 class TestMatMulInt8:
-    x_batch, y_batch, M, N, K = tvm.testing.parameters(
+    """Test MatMul INT8 class."""
+
+    x_batch, y_batch, m_size, n_size, k_size = tvm.testing.parameters(
         (1, 1, 2, 3, 1),
         (1, 1, 16, 24, 32),
         (5, 5, 24, 16, 32),
@@ -98,17 +104,29 @@ class TestMatMulInt8:
         (5, 1, 16, 16, 32),
     )
 
+    dtype = tvm.testing.parameter(
+        "float32",
+        "float16",
+    )
+
     @tvm.testing.requires_hexagon
-    def test_batch_matmul_int8(self, hexagon_session: Session, x_batch, y_batch, M, N, K):
+    def test_batch_matmul_int8(
+        self, hexagon_session: Session, x_batch, y_batch, m_size, n_size, k_size
+    ):
+        """Test batch matmul INT8."""
         dtype = "int8"
         out_dtype = "int8"
         assert x_batch == y_batch or x_batch == 1 or y_batch == 1
-        x = te.placeholder((x_batch, M, K), name="x", dtype=dtype)
-        y = te.placeholder((y_batch, N, K), name="y", dtype=dtype)
+        x = te.placeholder((x_batch, m_size, k_size), name="x", dtype=dtype)
+        y = te.placeholder((y_batch, n_size, k_size), name="y", dtype=dtype)
 
         def get_ref_data():
-            a_np = np.random.randint(low=-128, high=127, size=(x_batch, M, K)).astype(dtype)
-            b_np = np.random.randint(low=-128, high=127, size=(y_batch, N, K)).astype(dtype)
+            a_np = np.random.randint(low=-128, high=127, size=(x_batch, m_size, k_size)).astype(
+                dtype
+            )
+            b_np = np.random.randint(low=-128, high=127, size=(y_batch, n_size, k_size)).astype(
+                dtype
+            )
             c_np = tvm.topi.testing.batch_matmul(a_np, b_np, out_dtype=out_dtype)
             return (a_np, b_np, c_np)
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
index 5066a532df9b..41fe310d8484 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py
@@ -76,105 +76,6 @@ def build_conv2d(target):
     return module
 
 
-shape_parameters = [
-    (
-        (1, 8, 4, 3),
-        (3, 3, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 10, 14, 3),
-        (3, 3, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 14, 6, 3),
-        (3, 3, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 14, 6, 3),
-        (3, 3, 3, 64),
-        (1, 1),
-    ),
-    (
-        (1, 14, 6, 3),
-        (5, 5, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 8, 8, 3),
-        (2, 2, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 14, 6, 64),
-        (3, 3, 64, 3),
-        (1, 1),
-    ),
-    (
-        (1, 4, 4, 40),
-        (3, 3, 40, 3),
-        (1, 1),
-    ),
-    (
-        (1, 4, 4, 3),
-        (3, 3, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 5, 5, 3),
-        (3, 3, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 6, 6, 3),
-        (3, 3, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 7, 7, 3),
-        (3, 3, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 8, 8, 3),
-        (3, 3, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 8, 8, 3),
-        (5, 5, 3, 3),
-        (1, 1),
-    ),
-    (
-        (1, 8, 8, 64),
-        (2, 2, 64, 64),
-        (1, 1),
-    ),
-    (
-        (1, 8, 4, 3),
-        (3, 3, 3, 3),
-        (2, 2),
-    ),
-    (
-        (1, 14, 6, 3),
-        (3, 3, 3, 64),
-        (2, 2),
-    ),
-    (
-        (1, 14, 6, 3),
-        (5, 5, 3, 3),
-        (2, 2),
-    ),
-    (
-        (1, 8, 8, 3),
-        (2, 2, 3, 3),
-        (2, 2),
-    ),
-]
-
-
 def gen_config(params):
     """Utility function to generate useful ids for shape_parameters"""
 
@@ -192,6 +93,104 @@ def gen_config(params):
 class TestConv2dIntrin:
     """Test Conv2d Intrin class"""
 
+    shape_parameters = [
+        (
+            (1, 8, 4, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 10, 14, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 14, 6, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 14, 6, 3),
+            (3, 3, 3, 64),
+            (1, 1),
+        ),
+        (
+            (1, 14, 6, 3),
+            (5, 5, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 8, 8, 3),
+            (2, 2, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 14, 6, 64),
+            (3, 3, 64, 3),
+            (1, 1),
+        ),
+        (
+            (1, 4, 4, 40),
+            (3, 3, 40, 3),
+            (1, 1),
+        ),
+        (
+            (1, 4, 4, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 5, 5, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 6, 6, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 7, 7, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 8, 8, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 8, 8, 3),
+            (5, 5, 3, 3),
+            (1, 1),
+        ),
+        (
+            (1, 8, 8, 64),
+            (2, 2, 64, 64),
+            (1, 1),
+        ),
+        (
+            (1, 8, 4, 3),
+            (3, 3, 3, 3),
+            (2, 2),
+        ),
+        (
+            (1, 14, 6, 3),
+            (3, 3, 3, 64),
+            (2, 2),
+        ),
+        (
+            (1, 14, 6, 3),
+            (5, 5, 3, 3),
+            (2, 2),
+        ),
+        (
+            (1, 8, 8, 3),
+            (2, 2, 3, 3),
+            (2, 2),
+        ),
+    ]
+
     config = gen_config(shape_parameters)
     act_shape, wgt_shape, inp_stride = tvm.testing.parameters(*config.values(), ids=config.keys())
     inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"])
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
index 0b94d6e781a7..9c89427e1b01 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
@@ -28,66 +28,66 @@
 
 from ..infrastructure import get_hexagon_target
 
-dtype = tvm.testing.parameter("float32")
-random_seed = tvm.testing.parameter(0)
-
-
-@tvm.testing.fixture
-def input_shape(batch, in_channel, in_size):
-    return (batch, in_channel, in_size, in_size)
-
-
-@tvm.testing.fixture
-def weight_shape(num_filter, in_channel, kernel):
-    return (num_filter, in_channel, kernel, kernel)
-
-
-@tvm.testing.fixture
-def bias_shape(num_filter):
-    return (num_filter, 1, 1)
 
+class BaseConv2DTests:
+    """Conv2D test class."""
 
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(
-    random_seed,
-    input_shape,
-    weight_shape,
-    bias_shape,
-    dtype,
-    stride,
-    padding,
-    dilation,
-    add_bias,
-    apply_relu,
-):
-    np.random.seed(random_seed)
+    add_bias = tvm.testing.parameter(False)
+    apply_relu = tvm.testing.parameter(False)
+    dilation = tvm.testing.parameter(1)
+    batch = tvm.testing.parameter(1)
+    dtype = tvm.testing.parameter("float32")
 
-    # scipy.signal.convolve2d does not support float16 data types, and
-    # the python fallback is too slow for general use.  Computing
-    # ref_data in float32 will have fewer rounding errors than the TVM
-    # float16 compute, but those vary based on schedule anyways.
-    conv_dtype = "float32" if dtype == "float16" else dtype
+    random_seed = tvm.testing.parameter(0)
 
-    a_np = np.random.uniform(size=input_shape).astype(dtype)
-    w_np = np.random.uniform(size=weight_shape).astype(dtype)
-    b_np = np.random.uniform(size=bias_shape).astype(dtype)
-    dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-    c_np = tvm.topi.testing.conv2d_nchw_python(
-        a_np.astype(conv_dtype), dw_np.astype(conv_dtype), stride, padding
-    ).astype(dtype)
+    @tvm.testing.fixture
+    def input_shape(self, batch, in_channel, in_size):
+        return (batch, in_channel, in_size, in_size)
 
-    if add_bias:
-        c_np = c_np + b_np
-    if apply_relu:
-        c_np = np.maximum(c_np, 0)
-    return a_np, w_np, b_np, c_np
+    @tvm.testing.fixture
+    def weight_shape(self, num_filter, in_channel, kernel):
+        return (num_filter, in_channel, kernel, kernel)
 
+    @tvm.testing.fixture
+    def bias_shape(self, num_filter):
+        return (num_filter, 1, 1)
 
-class BaseConv2DTests:
-    add_bias = tvm.testing.parameter(False)
-    apply_relu = tvm.testing.parameter(False)
-    dilation = tvm.testing.parameter(1)
-    batch = tvm.testing.parameter(1)
+    @tvm.testing.fixture(cache_return_value=True)
+    def ref_data(
+        self,
+        random_seed,
+        input_shape,
+        weight_shape,
+        bias_shape,
+        dtype,
+        stride,
+        padding,
+        dilation,
+        add_bias,
+        apply_relu,
+    ):
+        """Generate reference data."""
+        np.random.seed(random_seed)
+
+        # scipy.signal.convolve2d does not support float16 data types, and
+        # the python fallback is too slow for general use.  Computing
+        # ref_data in float32 will have fewer rounding errors than the TVM
+        # float16 compute, but those vary based on schedule anyways.
+        conv_dtype = "float32" if dtype == "float16" else dtype
+
+        a_np = np.random.uniform(size=input_shape).astype(dtype)
+        w_np = np.random.uniform(size=weight_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
+        c_np = tvm.topi.testing.conv2d_nchw_python(
+            a_np.astype(conv_dtype), dw_np.astype(conv_dtype), stride, padding
+        ).astype(dtype)
+
+        if add_bias:
+            c_np = c_np + b_np
+        if apply_relu:
+            c_np = np.maximum(c_np, 0)
+        return a_np, w_np, b_np, c_np
 
     @tvm.testing.requires_hexagon
     def test_conv2d_nchw(
@@ -106,14 +106,15 @@ def test_conv2d_nchw(
         add_bias,
         apply_relu,
     ):
+        """Test Conv2d NCHW."""
 
         pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
         padding_sum = pad_top + pad_left + pad_bottom + pad_right
 
         a_np, w_np, b_np, c_np = ref_data
 
-        A = te.placeholder(a_np.shape, name="A", dtype=dtype)
-        W = te.placeholder(w_np.shape, name="W", dtype=dtype)
+        a_tensor = te.placeholder(a_np.shape, name="a_tensor", dtype=dtype)
+        w_tensor = te.placeholder(w_np.shape, name="w_tensor", dtype=dtype)
         bias = te.placeholder(b_np.shape, name="bias", dtype=dtype)
 
         if "int" in dtype:
@@ -121,7 +122,7 @@ def test_conv2d_nchw(
         elif dtype == "float32":
             tol = {"rtol": 1e-4, "atol": 2e-4}
         elif dtype == "float16":
-            # A summation in float16 with a single accumulator very
+            # a_tensor summation in float16 with a single accumulator very
             # quickly runs into large rounding errors.  At some point,
             # this tolerance should be schedule-dependent for to avoid
             # false negatives.
@@ -132,12 +133,14 @@ def test_conv2d_nchw(
         with tvm.target.Target(get_hexagon_target("v68")):
             fcompute = topi.nn.conv2d_nchw
             fschedule = topi.hexagon.schedule_conv2d_nchw
-            C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
+            c_tensor = fcompute(
+                a_tensor, w_tensor, (stride, stride), padding, (dilation, dilation), dtype
+            )
             if add_bias:
-                C = topi.add(C, bias)
+                c_tensor = topi.add(c_tensor, bias)
             if apply_relu:
-                C = topi.nn.relu(C)
-            s = fschedule([C])
+                c_tensor = topi.nn.relu(c_tensor)
+            s = fschedule([c_tensor])
 
         func_name = "conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
             dtype,
@@ -152,19 +155,19 @@ def test_conv2d_nchw(
         )
         func = tvm.build(
             s,
-            [A, W, bias, C],
+            [a_tensor, w_tensor, bias, c_tensor],
             get_hexagon_target("v68"),
             name=func_name,
         )
         mod = hexagon_session.load_module(func)
 
         dev = hexagon_session.device
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
+        a_data = tvm.nd.array(a_np, dev)
+        weight = tvm.nd.array(w_np, dev)
         b = tvm.nd.array(b_np, dev)
 
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
-        mod[func_name](a, w, b, c)
+        c = tvm.nd.array(np.zeros(get_const_tuple(c_tensor.shape), dtype=c_tensor.dtype), dev)
+        mod[func_name](a_data, weight, b, c)
         tvm.testing.assert_allclose(c.numpy(), c_np, **tol)
 
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
index 2068f1e6e6fc..9edc04db4398 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
@@ -27,23 +27,27 @@
 
 from ..infrastructure import get_hexagon_target
 
-dtype = tvm.testing.parameter("float32")
 
+class BaseConv2DTests:
+    """Test Conv2D base class."""
 
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation):
-    in_height = in_width = in_size
-    a_shape = (batch, in_height, in_width, in_channel)
-    w_shape = (kernel, kernel, in_channel, num_filter)
+    dtype = tvm.testing.parameter("float32")
 
-    a_np = np.random.uniform(size=a_shape).astype(dtype)
-    w_np = np.random.uniform(size=w_shape).astype(dtype)
-    dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
-    b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
-    return a_np, w_np, b_np
+    @tvm.testing.fixture(cache_return_value=True)
+    def ref_data(
+        self, dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation
+    ):
+        """Generate reference data."""
+        in_height = in_width = in_size
+        a_shape = (batch, in_height, in_width, in_channel)
+        w_shape = (kernel, kernel, in_channel, num_filter)
 
+        a_np = np.random.uniform(size=a_shape).astype(dtype)
+        w_np = np.random.uniform(size=w_shape).astype(dtype)
+        dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
+        b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
+        return a_np, w_np, b_np
 
-class BaseConv2DTests:
     @tvm.testing.requires_hexagon
     def test_conv2d_nhwc(
         self,
@@ -59,16 +63,17 @@ def test_conv2d_nhwc(
         padding,
         dilation,
     ):
+        """Test Conv2D NHWC."""
         a_np, w_np, b_np = ref_data
 
-        A = te.placeholder(a_np.shape, name="A", dtype=dtype)
-        W = te.placeholder(w_np.shape, name="W", dtype=dtype)
+        a_tensor = te.placeholder(a_np.shape, name="a_tensor", dtype=dtype)
+        w_tensor = te.placeholder(w_np.shape, name="w_tensor", dtype=dtype)
 
         with tvm.target.Target(get_hexagon_target("v68")):
             fcompute = topi.nn.conv2d_nhwc
             fschedule = topi.hexagon.schedule_conv2d_nhwc
-            B = fcompute(A, W, stride, padding, dilation, dtype)
-            s = fschedule([B])
+            b_tensor = fcompute(a_tensor, w_tensor, stride, padding, dilation, dtype)
+            s = fschedule([b_tensor])
 
         func_name = "conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
             dtype,
@@ -81,15 +86,17 @@ def test_conv2d_nhwc(
             padding,
             dilation,
         )
-        func = tvm.build(s, [A, W, B], get_hexagon_target("v68"), name=func_name)
+        func = tvm.build(
+            s, [a_tensor, w_tensor, b_tensor], get_hexagon_target("v68"), name=func_name
+        )
         mod = hexagon_session.load_module(func)
 
         dev = hexagon_session.device
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
+        a_data = tvm.nd.array(a_np, dev)
+        weight = tvm.nd.array(w_np, dev)
+        b = tvm.nd.array(np.zeros(get_const_tuple(b_tensor.shape), dtype=b_tensor.dtype), dev)
 
-        mod[func_name](a, w, b)
+        mod[func_name](a_data, weight, b)
         tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
 
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
index 40c8efa1cec2..d19223a42d74 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
@@ -29,45 +29,12 @@
 
 # TODO Should add kernal to tvm.testing.fixture
 
-random_seed = tvm.testing.parameter(0)
 
+class BaseConv2DTransposeTests:
+    """Conv2D transpose base class."""
 
-@tvm.testing.fixture
-def shift_shape(batch):
-    return batch
-
-
-@tvm.testing.fixture
-def shift_shape(in_channel):
-    return in_channel
-
-
-@tvm.testing.fixture
-def shift_shape(in_size):
-    return in_size
-
-
-@tvm.testing.fixture
-def shift_shape(num_filter):
-    return num_filter
-
-
-@tvm.testing.fixture
-def shift_shape(stride):
-    return stride
-
-
-@tvm.testing.fixture
-def shift_shape(padding):
-    return padding
-
-
-@tvm.testing.fixture
-def shift_shape(output_padding):
-    return output_padding
-
+    random_seed = tvm.testing.parameter(0)
 
-class BaseConv2DTransposeTests:
     @tvm.testing.requires_hexagon
     def test_conv2d(
         self,
@@ -81,17 +48,20 @@ def test_conv2d(
         output_padding,
         random_seed,
     ):
+        """Test conv2D."""
         in_height, in_width = in_size
         kernel_height, kernel_width = (1, 1)
         stride_height, stride_width = stride
         pad_top, pad_left, pad_bottom, pad_right = padding
 
-        A = te.placeholder((batch, in_channel, in_height, in_width), name="A")
-        W = te.placeholder((in_channel, num_filter, kernel_height, kernel_width), name="W")
+        a_tensor = te.placeholder((batch, in_channel, in_height, in_width), name="a_tensor")
+        w_tensor = te.placeholder(
+            (in_channel, num_filter, kernel_height, kernel_width), name="w_tensor"
+        )
 
-        a_shape = get_const_tuple(A.shape)
-        w_shape = get_const_tuple(W.shape)
-        dtype = A.dtype
+        a_shape = get_const_tuple(a_tensor.shape)
+        w_shape = get_const_tuple(w_tensor.shape)
+        dtype = a_tensor.dtype
 
         def get_ref_data():
 
@@ -107,42 +77,43 @@ def get_ref_data():
         a_np, w_np, b_np, c_np = get_ref_data()
 
         fcompute_args = (
-            A,
-            W,
+            a_tensor,
+            w_tensor,
             [stride_height, stride_width],
             [pad_top, pad_left, pad_bottom, pad_right],
-            A.dtype,
+            a_tensor.dtype,
             output_padding,
         )
 
         with tvm.target.Target(get_hexagon_target("v68")):
             fcompute = topi.nn.conv2d_transpose_nchw
             fschedule = topi.hexagon.schedule_conv2d_transpose_nchw
-            B = fcompute(*fcompute_args)
-            C = topi.nn.relu(B)
-            s1 = fschedule([B])
-            s2 = fschedule([C])
+            b_tensor = fcompute(*fcompute_args)
+            c_tensor = topi.nn.relu(b_tensor)
+            schedule_1 = fschedule([b_tensor])
+            schedule_2 = fschedule([c_tensor])
 
             dev = hexagon_session.device
 
-            a = tvm.nd.array(a_np, dev)
-            w = tvm.nd.array(w_np, dev)
-            b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-            c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
+            a_data = tvm.nd.array(a_np, dev)
+            weight = tvm.nd.array(w_np, dev)
+            b = tvm.nd.array(np.zeros(get_const_tuple(b_tensor.shape), dtype=b_tensor.dtype), dev)
+            c = tvm.nd.array(np.zeros(get_const_tuple(c_tensor.shape), dtype=c_tensor.dtype), dev)
 
-            func1 = tvm.build(s1, [A, W, B], get_hexagon_target("v68"))
-            func2 = tvm.build(s2, [A, W, C], get_hexagon_target("v68"))
+            func1 = tvm.build(schedule_1, [a_tensor, w_tensor, b_tensor], get_hexagon_target("v68"))
+            func2 = tvm.build(schedule_2, [a_tensor, w_tensor, c_tensor], get_hexagon_target("v68"))
 
             mod1 = hexagon_session.load_module(func1)
             mod2 = hexagon_session.load_module(func2)
 
-            mod1(a, w, b)
-            mod2(a, w, c)
+            mod1(a_data, weight, b)
+            mod2(a_data, weight, c)
             tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
             tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
 
 
 class TestConv2DTranspose(BaseConv2DTransposeTests):
+    """Test Conv2D transpose class."""
 
     (batch, in_channel, in_size, num_filter, stride) = tvm.testing.parameters(
         (1, 3, (224, 224), 1, (1, 1)),
diff --git a/tests/python/contrib/test_hexagon/topi/test_dense.py b/tests/python/contrib/test_hexagon/topi/test_dense.py
index c76006ac08c2..fff4fd989f6d 100644
--- a/tests/python/contrib/test_hexagon/topi/test_dense.py
+++ b/tests/python/contrib/test_hexagon/topi/test_dense.py
@@ -28,89 +28,101 @@
 
 from ..infrastructure import get_hexagon_target
 
-random_seed = tvm.testing.parameter(0)
-
-use_bias = tvm.testing.parameter(True, False)
-
-# batch_size more than 8 would break
-batch_size = tvm.testing.parameter(1, 2, 8)
-
-in_dim, out_dim = tvm.testing.parameters((1024, 1000))
-
-in_dtype, out_dtype = tvm.testing.parameters(
-    ("float32", "float32"),
-    ("float16", "float32"),
-    ("int8", "int32"),
-)
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def dense_ref_data(random_seed, batch_size, in_dim, out_dim, use_bias, in_dtype, out_dtype):
-    np.random.seed(random_seed)
-
-    if "float" in in_dtype:
-        a_np = np.random.uniform(size=(batch_size, in_dim)).astype(in_dtype)
-        b_np = np.random.uniform(size=(out_dim, in_dim)).astype(in_dtype)
-        c_np = np.random.uniform(size=(out_dim,)).astype(out_dtype)
-    elif in_dtype == "int8":
-        a_np = np.random.randint(low=-128, high=127, size=(batch_size, in_dim)).astype(in_dtype)
-        b_np = np.random.randint(low=-128, high=127, size=(out_dim, in_dim)).astype(in_dtype)
-        c_np = np.random.randint(low=-128, high=127, size=(out_dim,)).astype(out_dtype)
-    else:
-        raise ValueError("No method to generate test data for data type '{}'".format(in_dtype))
-
-    matmul = np.dot(a_np.astype(out_dtype), b_np.T.astype(out_dtype))
-
-    if use_bias:
-        matmul += c_np
-
-    d_np = np.maximum(matmul, 0)
-    return (a_np, b_np, c_np, d_np)
-
-
-@tvm.testing.requires_hexagon
-def test_dense(
-    hexagon_session: Session,
-    batch_size,
-    in_dim,
-    out_dim,
-    use_bias,
-    in_dtype,
-    out_dtype,
-    dense_ref_data,
-):
-    if in_dtype == "float16":
-        pytest.xfail("float16 is not supported.")
-
-    if "int" in in_dtype:
-        tol = {"atol": 0, "rtol": 0}
-    elif in_dtype == "float32":
-        tol = {"rtol": 1e-5, "atol": 1e-5}
-
-    A = te.placeholder((batch_size, in_dim), name="A", dtype=in_dtype)
-    B = te.placeholder((out_dim, in_dim), name="B", dtype=in_dtype)
-    C = te.placeholder((out_dim,), name="C", dtype=out_dtype)
-
-    a_np, b_np, c_np, d_np = dense_ref_data
-
-    fcompute = topi.nn.dense
-    fschedule = topi.hexagon.schedule_dense
-
-    with tvm.target.Target(get_hexagon_target("v68")):
-        D = fcompute(A, B, C if use_bias else None, out_dtype)
-        D = topi.nn.relu(D)
-        s = fschedule([D])
-
-    func = tvm.build(s, [A, B, C, D], get_hexagon_target("v68"), name="dense")
-    mod = hexagon_session.load_module(func)
-
-    dev = hexagon_session.device
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    c = tvm.nd.array(c_np, dev)
-    d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev)
-    mod["dense"](a, b, c, d)
-    tvm.testing.assert_allclose(d.numpy(), d_np, **tol)
+
+class TestDense:
+    """Dense test class."""
+
+    random_seed = tvm.testing.parameter(0)
+
+    use_bias = tvm.testing.parameter(True, False)
+
+    # batch_size more than 8 would break
+    batch_size = tvm.testing.parameter(1, 2, 8)
+
+    in_dim, out_dim = tvm.testing.parameters((1024, 1000))
+
+    in_dtype, out_dtype = tvm.testing.parameters(
+        ("float32", "float32"),
+        ("float16", "float32"),
+        ("int8", "int32"),
+    )
+
+    @tvm.testing.fixture(cache_return_value=True)
+    def dense_ref_data(
+        self, random_seed, batch_size, in_dim, out_dim, use_bias, in_dtype, out_dtype
+    ):
+        """Generate reference data."""
+        np.random.seed(random_seed)
+
+        if "float" in in_dtype:
+            a_np = np.random.uniform(size=(batch_size, in_dim)).astype(in_dtype)
+            b_np = np.random.uniform(size=(out_dim, in_dim)).astype(in_dtype)
+            c_np = np.random.uniform(size=(out_dim,)).astype(out_dtype)
+        elif in_dtype == "int8":
+            a_np = np.random.randint(low=-128, high=127, size=(batch_size, in_dim)).astype(in_dtype)
+            b_np = np.random.randint(low=-128, high=127, size=(out_dim, in_dim)).astype(in_dtype)
+            c_np = np.random.randint(low=-128, high=127, size=(out_dim,)).astype(out_dtype)
+        else:
+            raise ValueError("No method to generate test data for data type '{}'".format(in_dtype))
+
+        matmul = np.dot(a_np.astype(out_dtype), b_np.T.astype(out_dtype))
+
+        if use_bias:
+            matmul += c_np
+
+        d_np = np.maximum(matmul, 0)
+        return (a_np, b_np, c_np, d_np)
+
+    @tvm.testing.requires_hexagon
+    def test_dense(
+        self,
+        hexagon_session: Session,
+        batch_size,
+        in_dim,
+        out_dim,
+        use_bias,
+        in_dtype,
+        out_dtype,
+        dense_ref_data,
+    ):
+        """Test dense."""
+        if in_dtype == "float16":
+            pytest.xfail("float16 is not supported.")
+
+        if "int" in in_dtype:
+            tol = {"atol": 0, "rtol": 0}
+        elif in_dtype == "float32":
+            tol = {"rtol": 1e-5, "atol": 1e-5}
+
+        a_tensor = te.placeholder((batch_size, in_dim), name="a_tensor", dtype=in_dtype)
+        b_tensor = te.placeholder((out_dim, in_dim), name="b_tensor", dtype=in_dtype)
+        c_tensor = te.placeholder((out_dim,), name="c_tensor", dtype=out_dtype)
+
+        a_np, b_np, c_np, d_np = dense_ref_data
+
+        fcompute = topi.nn.dense
+        fschedule = topi.hexagon.schedule_dense
+
+        with tvm.target.Target(get_hexagon_target("v68")):
+            d_tensor = fcompute(a_tensor, b_tensor, c_tensor if use_bias else None, out_dtype)
+            d_tensor = topi.nn.relu(d_tensor)
+            schedule = fschedule([d_tensor])
+
+        func = tvm.build(
+            schedule,
+            [a_tensor, b_tensor, c_tensor, d_tensor],
+            get_hexagon_target("v68"),
+            name="dense",
+        )
+        mod = hexagon_session.load_module(func)
+
+        dev = hexagon_session.device
+        a_data = tvm.nd.array(a_np, dev)
+        b_data = tvm.nd.array(b_np, dev)
+        c_data = tvm.nd.array(c_np, dev)
+        d_data = tvm.nd.array(np.zeros(get_const_tuple(d_tensor.shape), dtype=out_dtype), dev)
+        mod["dense"](a_data, b_data, c_data, d_data)
+        tvm.testing.assert_allclose(d_data.numpy(), d_np, **tol)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
index 3de9ec13497a..0cb41b595255 100644
--- a/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
+++ b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
@@ -19,7 +19,6 @@
 """Test depth_to_space slice op for hexagon"""
 
 import numpy as np
-import pytest
 
 import tvm
 from tvm import te
@@ -30,28 +29,27 @@
 from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
-d2s_fp16_tests = (
-    ((1, 8, 8, 256), 2, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-    ((1, 8, 8, 1024), 4, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-    ((1, 16, 16, 256), 2, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-    ((1, 16, 16, 1024), 4, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-    ((1, 8, 8, 256), 2, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-    ((1, 8, 8, 1024), 4, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-    ((1, 16, 16, 256), 2, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-    ((1, 16, 16, 1024), 4, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
-)
-
-d2s_uint8_tests = (
-    ((1, 8, 8, 256), 2, "CDR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
-    ((1, 8, 8, 1024), 4, "CDR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
-    ((1, 8, 8, 256), 2, "DCR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
-    ((1, 8, 8, 1024), 4, "DCR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
-)
-
-
 class TestD2SSlice:
     """Test class that defines the Depth to Space slice test"""
 
+    d2s_fp16_tests = (
+        ((1, 8, 8, 256), 2, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+        ((1, 8, 8, 1024), 4, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+        ((1, 16, 16, 256), 2, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+        ((1, 16, 16, 1024), 4, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+        ((1, 8, 8, 256), 2, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+        ((1, 8, 8, 1024), 4, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+        ((1, 16, 16, 256), 2, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+        ((1, 16, 16, 1024), 4, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"),
+    )
+
+    d2s_uint8_tests = (
+        ((1, 8, 8, 256), 2, "CDR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
+        ((1, 8, 8, 1024), 4, "CDR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
+        ((1, 8, 8, 256), 2, "DCR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
+        ((1, 8, 8, 1024), 4, "DCR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"),
+    )
+
     (input_shape, block_size, mode, dtype, input_layout, output_layout,) = tvm.testing.parameters(
         *d2s_fp16_tests,
         *d2s_uint8_tests,
@@ -93,11 +91,11 @@ def test_d2s_slice(
         transformed_ref_output_np,
     ):
         """Top level testing function for depth to space"""
-        Input = te.placeholder(input_shape, name="Input", dtype=dtype)
+        input_tensor = te.placeholder(input_shape, name="input_tensor", dtype=dtype)
 
-        Output = d2s_compute(Input, block_size, "NHWC", mode)
+        output = d2s_compute(input_tensor, block_size, "NHWC", mode)
 
-        tir_s = d2s_schedule(Input, Output, input_layout, output_layout)
+        tir_s = d2s_schedule(input_tensor, output, input_layout, output_layout)
 
         input_data = allocate_hexagon_array(
             hexagon_session.device,
@@ -114,7 +112,10 @@ def test_d2s_slice(
         )
         with tvm.transform.PassContext(opt_level=3):
             runtime_module = tvm.build(
-                tir_s.mod, [Input, Output], target=get_hexagon_target("v69"), name="depth_to_space"
+                tir_s.mod,
+                [input_tensor, output],
+                target=get_hexagon_target("v69"),
+                name="depth_to_space",
             )
         mod = hexagon_session.load_module(runtime_module)
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
index 063541cc21a0..f95d41093043 100644
--- a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
+++ b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
@@ -14,8 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-import sys
+"""Depthwise Conv2D Tests."""
 
 import numpy as np
 
@@ -30,121 +29,6 @@
 
 from ..infrastructure import get_hexagon_target
 
-random_seed = tvm.testing.parameter(0)
-
-in_dtype, out_dtype = tvm.testing.parameters(
-    ("float32", "float32"),
-)
-
-
-@tvm.testing.fixture
-def input_shape(layout, batch, in_channel, in_size, filter_shape):
-    if layout == "NCHW":
-        return (batch, in_channel, in_size, in_size)
-    elif layout == "NHWC":
-        return (batch, in_size, in_size, in_channel)
-    elif layout == "NCHWc":
-        oc_block = filter_shape[-1]
-        ic_block = next(bn for bn in range(oc_block, 0, -1) if in_channel % bn == 0)
-        return (batch, in_channel // ic_block, in_size, in_size, ic_block)
-
-
-@tvm.testing.fixture
-def filter_shape(layout, in_channel, channel_multiplier, kernel):
-    filter_channel = in_channel
-    if layout == "NCHW":
-        return (filter_channel, channel_multiplier, kernel, kernel)
-    elif layout == "NHWC":
-        return (kernel, kernel, filter_channel, channel_multiplier)
-    elif layout == "NCHWc":
-        out_channel = in_channel * channel_multiplier
-        # For testing the functionality, we choose an arbitrary block
-        # size that can divide out_channel, regardless of the
-        # performance.
-        oc_block = next(bn for bn in range(16, 0, -1) if out_channel % bn == 0)
-        return (out_channel // oc_block, 1, kernel, kernel, 1, oc_block)
-
-
-@tvm.testing.fixture
-def scale_shape(layout, in_channel, channel_multiplier, filter_shape):
-    out_channel = in_channel * channel_multiplier
-
-    if layout in ("NCHW", "NHWC"):
-        return (out_channel,)
-
-    if layout == "NCHWc":
-        oc_block = filter_shape[-1]
-        return (out_channel // oc_block, oc_block)
-
-    raise ValueError("Unknown layout {}".format(layout))
-
-
-@tvm.testing.fixture
-def shift_shape(scale_shape):
-    return scale_shape
-
-
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(
-    random_seed,
-    in_dtype,
-    out_dtype,
-    layout,
-    input_shape,
-    filter_shape,
-    dilation,
-    stride,
-    padding,
-    scale_shape,
-    shift_shape,
-    use_scale_shift,
-    apply_relu,
-):
-    np.random.seed(random_seed)
-
-    print(input_shape)
-
-    # scipy.signal.convolve2d does not support float16 data types, and
-    # the python fallback is too slow for general use.  Computing
-    # ref_data in float32 will have fewer rounding errors than the TVM
-    # float16 compute, but those vary based on schedule anyways.
-    conv_dtype = "float32" if in_dtype == "float16" else in_dtype
-
-    input_np = np.random.uniform(size=input_shape).astype(in_dtype)
-    filter_np = np.random.uniform(size=filter_shape).astype(in_dtype)
-    scale_np = np.random.uniform(size=scale_shape).astype(out_dtype)
-    shift_np = np.random.uniform(size=shift_shape).astype(out_dtype)
-    if layout == "NCHW":
-        np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nchw
-        dilation = (1, 1, dilation, dilation)
-        reshape = (1, -1, 1, 1)
-    elif layout == "NHWC":
-        np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nhwc
-        dilation = (dilation, dilation, 1, 1)
-        reshape = (1, 1, 1, -1)
-    elif layout == "NCHWc":
-        np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nchwc
-        dilation = (1, 1, dilation, dilation, 1, 1)
-        reshape = (1, scale_shape[0], 1, 1, scale_shape[1])
-
-    dilated_filter_np = tvm.topi.testing.dilate_python(filter_np, dilation)
-    output_np = np_depthwise_conv2d(
-        input_np.astype(conv_dtype), dilated_filter_np.astype(conv_dtype), stride, padding
-    ).astype(out_dtype)
-
-    if use_scale_shift:
-        output_np = output_np * scale_np.reshape(reshape) + shift_np.reshape(reshape)
-    if apply_relu:
-        output_np = np.maximum(output_np, 0)
-
-    return (
-        input_np,
-        filter_np,
-        scale_np,
-        shift_np,
-        output_np,
-    )
-
 
 class BaseDepthwiseConv2D:
     """Provides the test_conv2d test function, to be used by other test classes.
@@ -154,6 +38,124 @@ class BaseDepthwiseConv2D:
     (e.g. implemented only for llvm).
     """
 
+    random_seed = tvm.testing.parameter(0)
+
+    in_dtype, out_dtype = tvm.testing.parameters(
+        ("float32", "float32"),
+    )
+
+    @tvm.testing.fixture
+    def input_shape(self, layout, batch, in_channel, in_size, filter_shape):
+        """Returns input shape."""
+        if layout == "NCHW":
+            return (batch, in_channel, in_size, in_size)
+        elif layout == "NHWC":
+            return (batch, in_size, in_size, in_channel)
+        elif layout == "NCHWc":
+            oc_block = filter_shape[-1]
+            ic_block = next(bn for bn in range(oc_block, 0, -1) if in_channel % bn == 0)
+            return (batch, in_channel // ic_block, in_size, in_size, ic_block)
+        else:
+            raise RuntimeError(f"Not supported layout {layout}")
+
+    @tvm.testing.fixture
+    def filter_shape(self, layout, in_channel, channel_multiplier, kernel):
+        """Returns filter shape."""
+        filter_channel = in_channel
+        if layout == "NCHW":
+            return (filter_channel, channel_multiplier, kernel, kernel)
+        elif layout == "NHWC":
+            return (kernel, kernel, filter_channel, channel_multiplier)
+        elif layout == "NCHWc":
+            out_channel = in_channel * channel_multiplier
+            # For testing the functionality, we choose an arbitrary block
+            # size that can divide out_channel, regardless of the
+            # performance.
+            oc_block = next(bn for bn in range(16, 0, -1) if out_channel % bn == 0)
+            return (out_channel // oc_block, 1, kernel, kernel, 1, oc_block)
+        else:
+            raise RuntimeError(f"Not supported layout {layout}")
+
+    @tvm.testing.fixture
+    def scale_shape(self, layout, in_channel, channel_multiplier, filter_shape):
+        """Returns scale shape."""
+        out_channel = in_channel * channel_multiplier
+
+        if layout in ("NCHW", "NHWC"):
+            return (out_channel,)
+
+        if layout == "NCHWc":
+            oc_block = filter_shape[-1]
+            return (out_channel // oc_block, oc_block)
+
+        raise ValueError("Unknown layout {}".format(layout))
+
+    @tvm.testing.fixture
+    def shift_shape(self, scale_shape):
+        """Returns shift shape."""
+        return scale_shape
+
+    @tvm.testing.fixture(cache_return_value=True)
+    def ref_data(
+        self,
+        random_seed,
+        in_dtype,
+        out_dtype,
+        layout,
+        input_shape,
+        filter_shape,
+        dilation,
+        stride,
+        padding,
+        scale_shape,
+        shift_shape,
+        use_scale_shift,
+        apply_relu,
+    ):
+        """Generate reference data."""
+        np.random.seed(random_seed)
+
+        # scipy.signal.convolve2d does not support float16 data types, and
+        # the python fallback is too slow for general use.  Computing
+        # ref_data in float32 will have fewer rounding errors than the TVM
+        # float16 compute, but those vary based on schedule anyways.
+        conv_dtype = "float32" if in_dtype == "float16" else in_dtype
+
+        input_np = np.random.uniform(size=input_shape).astype(in_dtype)
+        filter_np = np.random.uniform(size=filter_shape).astype(in_dtype)
+        scale_np = np.random.uniform(size=scale_shape).astype(out_dtype)
+        shift_np = np.random.uniform(size=shift_shape).astype(out_dtype)
+        if layout == "NCHW":
+            np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nchw
+            dilation = (1, 1, dilation, dilation)
+            reshape = (1, -1, 1, 1)
+        elif layout == "NHWC":
+            np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nhwc
+            dilation = (dilation, dilation, 1, 1)
+            reshape = (1, 1, 1, -1)
+        elif layout == "NCHWc":
+            np_depthwise_conv2d = tvm.topi.testing.depthwise_conv2d_python_nchwc
+            dilation = (1, 1, dilation, dilation, 1, 1)
+            reshape = (1, scale_shape[0], 1, 1, scale_shape[1])
+
+        dilated_filter_np = tvm.topi.testing.dilate_python(filter_np, dilation)
+        output_np = np_depthwise_conv2d(
+            input_np.astype(conv_dtype), dilated_filter_np.astype(conv_dtype), stride, padding
+        ).astype(out_dtype)
+
+        if use_scale_shift:
+            output_np = output_np * scale_np.reshape(reshape) + shift_np.reshape(reshape)
+        if apply_relu:
+            output_np = np.maximum(output_np, 0)
+
+        return (
+            input_np,
+            filter_np,
+            scale_np,
+            shift_np,
+            output_np,
+        )
+
     @tvm.testing.requires_hexagon
     def test_conv2d(
         self,
@@ -167,15 +169,13 @@ def test_conv2d(
         shift_shape,
         use_scale_shift,
         apply_relu,
-        batch,
-        in_channel,
-        channel_multiplier,
         kernel,
         stride,
         padding,
         dilation,
         ref_data,
     ):
+        """Test conv2D."""
         # Transform the padding argument from 'str' to 'tuple' to
         # match the "workload" tuple in TopHub.  Which padding_args to
         # use for each layout chosen to reproduce previous behavior.
@@ -187,26 +187,26 @@ def test_conv2d(
             padding_args = padding
 
         # placeholder
-        Input = te.placeholder(input_shape, name="Input", dtype=in_dtype)
-        Filter = te.placeholder(filter_shape, name="Filter", dtype=in_dtype)
-        Scale = te.placeholder(scale_shape, name="Scale", dtype=out_dtype)
-        Shift = te.placeholder(shift_shape, name="Shift", dtype=out_dtype)
+        input_tensor = te.placeholder(input_shape, name="input_tensor", dtype=in_dtype)
+        filter_tensor = te.placeholder(filter_shape, name="filter_tensor", dtype=in_dtype)
+        scale = te.placeholder(scale_shape, name="scale", dtype=out_dtype)
+        shift = te.placeholder(shift_shape, name="shift", dtype=out_dtype)
 
         if layout == "NCHW":
             topi_scale_shift = topi.nn.scale_shift_nchw
-            fcompute_args = (Input, Filter, stride, padding_args, dilation, out_dtype)
+            fcompute_args = (input_tensor, filter_tensor, stride, padding_args, dilation, out_dtype)
 
         elif layout == "NHWC":
             topi_scale_shift = topi.nn.scale_shift_nhwc
-            fcompute_args = (Input, Filter, stride, padding_args, dilation, out_dtype)
+            fcompute_args = (input_tensor, filter_tensor, stride, padding_args, dilation, out_dtype)
 
         elif layout == "NCHWc":
             topi_scale_shift = topi.nn.scale_shift_nchwc
             in_layout = "NCHW{}c".format(input_shape[-1])
             out_layout = "NCHW{}c".format(filter_shape[-1])
             fcompute_args = (
-                Input,
-                Filter,
+                input_tensor,
+                filter_tensor,
                 stride,
                 padding,
                 dilation,
@@ -223,18 +223,18 @@ def test_conv2d(
             elif layout == "NHWC":
                 fcompute = topi.nn.depthwise_conv2d_nhwc
                 fschedule = topi.hexagon.schedule_depthwise_conv2d_nhwc
-            C = fcompute(*fcompute_args)
+            c_tensor = fcompute(*fcompute_args)
             if use_scale_shift:
-                C = topi_scale_shift(C, Scale, Shift)
+                c_tensor = topi_scale_shift(c_tensor, scale, shift)
             if apply_relu:
-                C = topi.nn.relu(C)
+                c_tensor = topi.nn.relu(c_tensor)
 
-            s = fschedule([C])
+            schedule = fschedule([c_tensor])
 
             # Build and run
             f = tvm.build(
-                s,
-                [Input, Filter, Scale, Shift, C],
+                schedule,
+                [input_tensor, filter_tensor, scale, shift, c_tensor],
                 get_hexagon_target("v68"),
             )
             mod = hexagon_session.load_module(f)
@@ -247,7 +247,7 @@ def test_conv2d(
             scale_tvm = tvm.nd.array(scale_np, dev)
             shift_tvm = tvm.nd.array(shift_np, dev)
             output_tvm = tvm.nd.array(
-                np.zeros(shape=get_const_tuple(C.shape), dtype=C.dtype),
+                np.zeros(shape=get_const_tuple(c_tensor.shape), dtype=c_tensor.dtype),
                 dev,
             )
 
@@ -257,7 +257,7 @@ def test_conv2d(
             tvm.testing.assert_allclose(output_np, output_tvm.numpy(), **tol)
 
 
-class TestDepthwiseConv2D_MobilenetWorkloads(BaseDepthwiseConv2D):
+class TestDepthwiseConv2DMobilenetWorkloads(BaseDepthwiseConv2D):
     """Extra tests to verify functionality for workloads used by mobilenet."""
 
     layout = tvm.testing.parameter("NCHW", "NHWC")
@@ -280,6 +280,7 @@ class TestDepthwiseConv2D_MobilenetWorkloads(BaseDepthwiseConv2D):
 
 
 class TestDepthwiseConv2D(BaseDepthwiseConv2D):
+    """Test depthwise conv2D class."""
 
     layout = tvm.testing.parameter("NCHW", "NHWC")
     use_scale_shift = tvm.testing.parameter(True, False, ids=["with_scale_shift", "no_scale_shift"])
diff --git a/tests/python/contrib/test_hexagon/topi/test_pad.py b/tests/python/contrib/test_hexagon/topi/test_pad.py
index 06b939bf6409..18a392e5b1ac 100644
--- a/tests/python/contrib/test_hexagon/topi/test_pad.py
+++ b/tests/python/contrib/test_hexagon/topi/test_pad.py
@@ -27,25 +27,26 @@
 
 @tvm.testing.requires_hexagon
 def test_nn_pad(hexagon_session: Session):
+    """Test nn pad."""
     dtype = "uint8"
     in_shape = (1, 56, 56, 32)
 
     data_in = np.ones(in_shape).astype(dtype)
 
-    A = te.placeholder(shape=in_shape, name="A", dtype=dtype)
+    a_tensor = te.placeholder(shape=in_shape, name="a_tensor", dtype=dtype)
 
-    C = topi.nn.pad(A, [0, 1, 1, 0], [0, 1, 1, 0], pad_value=0)
+    c_tensor = topi.nn.pad(a_tensor, [0, 1, 1, 0], [0, 1, 1, 0], pad_value=0)
 
     with tvm.target.Target(get_hexagon_target("v68")):
         fschedule = topi.hexagon.schedule_pad
-        s = fschedule(C)
+        s = fschedule(c_tensor)
 
-    func = tvm.build(s, [A, C], get_hexagon_target("v68"), name="pad")
+    func = tvm.build(s, [a_tensor, c_tensor], get_hexagon_target("v68"), name="pad")
     mod = hexagon_session.load_module(func)
 
     dev = hexagon_session.device
     a = tvm.nd.array(data_in, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
+    b = tvm.nd.array(np.zeros(get_const_tuple(c_tensor.shape), dtype=c_tensor.dtype), dev)
     mod["pad"](a, b)
 
     # Reference numpy pad output
diff --git a/tests/python/contrib/test_hexagon/topi/test_pooling.py b/tests/python/contrib/test_hexagon/topi/test_pooling.py
index ecc998875296..5ae857c2dca5 100644
--- a/tests/python/contrib/test_hexagon/topi/test_pooling.py
+++ b/tests/python/contrib/test_hexagon/topi/test_pooling.py
@@ -29,6 +29,8 @@
 
 
 class TestAdaptivePool:
+    """Adaptive pool test class."""
+
     dshape, out_size, pool_type, layout = tvm.testing.parameters(
         ((1, 3, 112, 112), (1, 1), "max", "NCHW"),
         ((1, 3, 112, 112), (1, 1), "avg", "NCHW"),
@@ -58,6 +60,7 @@ class TestAdaptivePool:
 
     @tvm.testing.requires_hexagon
     def test_adaptive_pool(self, hexagon_session: Session, dshape, out_size, pool_type, layout):
+        """Test adaptive pool."""
         dtype = "float32"
         np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype)
         np_out = tvm.topi.testing.adaptive_pool(np_data, out_size, pool_type, layout)
@@ -103,11 +106,12 @@ def verify_poolnd(
     count_include_pad=True,
     layout="NCW",
 ):
-    A = te.placeholder(input_shape, name="A")
+    """Pool test verification."""
+    a_tensor = te.placeholder(input_shape, name="a_tensor")
 
     if n == 1:
-        B = topi.nn.pool1d(
-            A,
+        b_tensor = topi.nn.pool1d(
+            a_tensor,
             kernel=kernel,
             stride=stride,
             dilation=dilation,
@@ -118,8 +122,8 @@ def verify_poolnd(
             count_include_pad=count_include_pad,
         )
     elif n == 2:
-        B = topi.nn.pool2d(
-            A,
+        b_tensor = topi.nn.pool2d(
+            a_tensor,
             kernel=kernel,
             stride=stride,
             dilation=dilation,
@@ -130,8 +134,8 @@ def verify_poolnd(
             count_include_pad=count_include_pad,
         )
     elif n == 3:
-        B = topi.nn.pool3d(
-            A,
+        b_tensor = topi.nn.pool3d(
+            a_tensor,
             kernel=kernel,
             stride=stride,
             dilation=dilation,
@@ -144,9 +148,9 @@ def verify_poolnd(
     else:
         raise ValueError(f"PoolND only supports n=1, 2, 3 got n={n}")
 
-    B = topi.nn.relu(B)
-    dtype = A.dtype
-    output_shape = [int(i) for i in B.shape]
+    b_tensor = topi.nn.relu(b_tensor)
+    dtype = a_tensor.dtype
+    output_shape = [int(i) for i in b_tensor.shape]
 
     input_np = np.random.uniform(low=0.001, size=input_shape).astype(dtype)
 
@@ -169,20 +173,22 @@ def verify_poolnd(
 
     with tvm.target.Target(get_hexagon_target("v68")):
         fschedule = topi.hexagon.schedule_pool
-        s = fschedule(B, layout)
+        s = fschedule(b_tensor, layout)
 
-    func = tvm.build(s, [A, B], get_hexagon_target("v68"), name="pool")
+    func = tvm.build(s, [a_tensor, b_tensor], get_hexagon_target("v68"), name="pool")
     mod = hexagon_session.load_module(func)
 
     dev = hexagon_session.device
     a = tvm.nd.array(input_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
+    b = tvm.nd.array(np.zeros(get_const_tuple(b_tensor.shape), dtype=dtype), dev)
     mod["pool"](a, b)
 
     tvm.testing.assert_allclose(b.numpy(), ref_np, rtol=1e-5)
 
 
 class TestPool1D:
+    """Pool1D test class."""
+
     (
         input_shape,
         kernel,
@@ -244,6 +250,7 @@ def test_pool1d(
         count_include_pad,
         layout,
     ):
+        """Test Pool1D."""
         verify_poolnd(
             hexagon_session,
             1,
@@ -260,6 +267,8 @@ def test_pool1d(
 
 
 class TestPool2D:
+    """Pool2D test class."""
+
     (
         input_shape,
         kernel,
@@ -321,6 +330,7 @@ def test_pool2d(
         count_include_pad,
         layout,
     ):
+        """Test Pool2D."""
         verify_poolnd(
             hexagon_session,
             2,
@@ -337,6 +347,8 @@ def test_pool2d(
 
 
 class TestPool3D:
+    """Pool3D test class."""
+
     (
         input_shape,
         kernel,
@@ -719,6 +731,7 @@ def test_pool3d(
         count_include_pad,
         layout,
     ):
+        """Test Pool3D."""
         verify_poolnd(
             hexagon_session,
             3,
diff --git a/tests/python/contrib/test_hexagon/topi/test_quantize.py b/tests/python/contrib/test_hexagon/topi/test_quantize.py
old mode 100755
new mode 100644
index 0b6e1dfa0e73..a188f7cb2fe1
--- a/tests/python/contrib/test_hexagon/topi/test_quantize.py
+++ b/tests/python/contrib/test_hexagon/topi/test_quantize.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""TIR quantize schedule tests."""
 import numpy as np
 
 import tvm
@@ -26,30 +27,31 @@
     get_hexagon_target,
 )
 
+QUANTIZE_SCALE = None
+QUANTIZE_ZERO_POINT = None
 
-@tvm.testing.fixture
-def expected_output_np(input_np, output_dtype):
-    global scale, zero_point
-    quant_np, scale, zero_point = quantize_np(input_np, output_dtype)
-    return quant_np
-
-
-@tvm.testing.fixture
-def input_np(input_shape, input_dtype):
-    return np.random.random(input_shape).astype(input_dtype)
 
+class TestQuantize:
+    """Test quantize class."""
 
-@tvm.testing.fixture
-def transformed_input_np(input_np, input_crouton_layout):
-    return transform_numpy(input_np, "nhwc", input_crouton_layout)
+    @tvm.testing.fixture
+    def expected_output_np(self, input_np, output_dtype):
+        global QUANTIZE_SCALE, QUANTIZE_ZERO_POINT
+        quant_np, QUANTIZE_SCALE, QUANTIZE_ZERO_POINT = quantize_np(input_np, output_dtype)
+        return quant_np
 
+    @tvm.testing.fixture
+    def input_np(self, input_shape, input_dtype):
+        return np.random.random(input_shape).astype(input_dtype)
 
-@tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, output_layout):
-    return transform_numpy(expected_output_np, "nhwc", output_layout)
+    @tvm.testing.fixture
+    def transformed_input_np(self, input_np, input_crouton_layout):
+        return transform_numpy(input_np, "nhwc", input_crouton_layout)
 
+    @tvm.testing.fixture
+    def transformed_expected_output_np(self, expected_output_np, output_layout):
+        return transform_numpy(expected_output_np, "nhwc", output_layout)
 
-class TestQuantize:
     input_crouton_layout, output_layout, input_dtype = tvm.testing.parameters(
         ("nhwc-4h2w32c2w-2d", "nhwc-8h8w32c-2d", "float32"),
     )
@@ -65,7 +67,6 @@ def test_quantize(
         self,
         input_dtype,
         output_dtype,
-        input_np,
         transformed_input_np,
         input_shape,
         expected_output_np,
@@ -74,11 +75,14 @@ def test_quantize(
         output_layout,
         hexagon_session,
     ):
-        A = te.placeholder(input_shape, name="A", dtype=input_dtype)
+        """Test quantize."""
+        a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=input_dtype)
 
-        M = s1.quantize_compute(A, scale, zero_point, output_dtype)
+        m_tensor = s1.quantize_compute(a_tensor, QUANTIZE_SCALE, QUANTIZE_ZERO_POINT, output_dtype)
 
-        tir_schedule = s1.tir_quantize_schedule(M, A, input_crouton_layout, output_layout)
+        tir_schedule = s1.tir_quantize_schedule(
+            m_tensor, a_tensor, input_crouton_layout, output_layout
+        )
 
         sch = tir_schedule.mod
 
@@ -88,12 +92,12 @@ def test_quantize(
         with tvm.transform.PassContext(opt_level=3):
             func = tvm.build(
                 sch,
-                [A, M],
+                [a_tensor, m_tensor],
                 get_hexagon_target("v69"),
                 name="quantize",
             )
 
-        A_data_nd = allocate_hexagon_array(
+        a_data_nd = allocate_hexagon_array(
             hexagon_session.device,
             data=transformed_input_np,
             dtype=input_dtype,
@@ -101,7 +105,7 @@ def test_quantize(
             mem_scope="global.vtcm",
         )
 
-        M_data_nd = allocate_hexagon_array(
+        m_data_nd = allocate_hexagon_array(
             hexagon_session.device,
             tensor_shape=transformed_expected_output_np.shape,
             dtype=output_dtype,
@@ -110,14 +114,14 @@ def test_quantize(
         )
 
         mod = hexagon_session.load_module(func)
-        mod(A_data_nd, M_data_nd)
+        mod(a_data_nd, m_data_nd)
 
-        b, h, w, c = expected_output_np.shape
+        b, h, weight, c = expected_output_np.shape
 
         # convert nd to np and reshape to fixed chunk size layout
-        M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32])
+        m_data_np = m_data_nd.numpy().reshape([b, h // 8, weight // 8, c // 32, 8, 8, 32])
 
-        np.testing.assert_allclose(transformed_expected_output_np, M_data_np, atol=1)
+        np.testing.assert_allclose(transformed_expected_output_np, m_data_np, atol=1)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_hexagon/topi/test_reduce.py b/tests/python/contrib/test_hexagon/topi/test_reduce.py
index 8fc0b6d901ab..eb798db1dd2b 100644
--- a/tests/python/contrib/test_hexagon/topi/test_reduce.py
+++ b/tests/python/contrib/test_hexagon/topi/test_reduce.py
@@ -24,24 +24,6 @@
 
 from ..infrastructure import get_hexagon_target
 
-in_shape, axis, keepdims, reduce_type, dtype = tvm.testing.parameters(
-    ((32,), 0, False, "argmax", "float32"),
-    ((32, 24, 32, 24), (1, 2, 3), True, "sum", "float32"),
-    ((2, 3), None, True, "all", "bool"),
-    ((32, 24 * 32 * 24), (1,), False, "max", "float32"),
-    ((32, 128, 24), None, True, "sum", "float32"),
-    ((32, 128, 24), None, True, "all", "bool"),
-    ((32, 24, 32, 24), (0, 2), False, "min", "float32"),
-    ((32, 128), 1, True, "argmax", "float32"),
-    ((32, 24, 32, 24), 2, False, "argmin", "float32"),
-    ((31, 21, 15), None, True, "argmax", "float32"),
-    ((31, 21, 15), None, False, "sum", "float32"),
-    ((2, 3), None, True, "any", "bool"),
-    ((32, 128, 24), None, True, "any", "bool"),
-    ((1, 4, 7), 1, True, "any", "bool"),
-    ((32, 24, 32, 24), 2, False, "any", "bool"),
-)
-
 
 def _my_npy_argmax(arr, axis, keepdims):
     if not keepdims:
@@ -68,93 +50,114 @@ def _my_npy_argmin(arr, axis, keepdims):
         return arr.argmin(axis=axis).reshape(out_shape)
 
 
-@tvm.testing.fixture(cache_return_value=True)
-def ref_data(in_shape, axis, keepdims, reduce_type, dtype):
-    # Test
-    if dtype == "bool":
-        in_npy_map = in_npy = np.random.choice([True, False], size=in_shape)
-    else:
-        in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype)
-        in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)
-
-    if reduce_type == "sum":
-        out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
-    elif reduce_type == "all" and dtype == "bool":
-        out_npy = in_npy_map.all(axis=axis, keepdims=keepdims)
-    elif reduce_type == "any" and dtype == "bool":
-        out_npy = in_npy_map.any(axis=axis, keepdims=keepdims)
-    elif reduce_type == "max":
-        out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
-    elif reduce_type == "min":
-        out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
-    elif reduce_type == "argmax":
-        out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
-    elif reduce_type == "argmin":
-        out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
-    else:
-        raise NotImplementedError
-
-    return in_npy, in_npy_map, out_npy
-
-
-@tvm.testing.requires_hexagon
-def test_reduce_map(
-    hexagon_session: Session, ref_data, in_shape, axis, keepdims, reduce_type, dtype
-):
-    in_npy, in_npy_map, out_npy = ref_data
-
-    # Build the logic and compile the function
-    A = te.placeholder(shape=in_shape, name="A", dtype=dtype)
-    A1 = topi.sqrt(topi.exp(A))
-    out_dtype = dtype
-    if reduce_type == "sum":
-        B = topi.sum(A1, axis=axis, keepdims=keepdims)
-    elif reduce_type == "all":
-        B = topi.all(A, axis=axis, keepdims=keepdims)
-    elif reduce_type == "any":
-        B = topi.any(A, axis=axis, keepdims=keepdims)
-    elif reduce_type == "max":
-        B = topi.max(A1, axis=axis, keepdims=keepdims)
-    elif reduce_type == "min":
-        B = topi.min(A1, axis=axis, keepdims=keepdims)
-    elif reduce_type == "argmax":
-        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
-        out_dtype = "int32"
-    elif reduce_type == "argmin":
-        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
-        out_dtype = "int32"
-    else:
-        raise NotImplementedError
-
-    with tvm.target.Target(get_hexagon_target("v68")):
-        fschedule = topi.hexagon.schedule_reduce
-        s = fschedule(B)
-
-    func = tvm.build(s, [A, B], get_hexagon_target("v68"), name=reduce_type)
-    mod = hexagon_session.load_module(func)
-
-    dev = hexagon_session.device
-    data_tvm = tvm.nd.array(in_npy, device=dev)
-    out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=out_dtype)
-
-    mod[reduce_type](data_tvm, out_tvm)
-
-    if reduce_type == "argmax" or reduce_type == "argmin":
-        out_tvm_indices = out_tvm.numpy()
-        if keepdims:
-            out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
-        if axis is None:
-            out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
+class TestReduce:
+    """Test reduce class."""
+
+    in_shape, axis, keepdims, reduce_type, dtype = tvm.testing.parameters(
+        ((32,), 0, False, "argmax", "float32"),
+        ((32, 24, 32, 24), (1, 2, 3), True, "sum", "float32"),
+        ((2, 3), None, True, "all", "bool"),
+        ((32, 24 * 32 * 24), (1,), False, "max", "float32"),
+        ((32, 128, 24), None, True, "sum", "float32"),
+        ((32, 128, 24), None, True, "all", "bool"),
+        ((32, 24, 32, 24), (0, 2), False, "min", "float32"),
+        ((32, 128), 1, True, "argmax", "float32"),
+        ((32, 24, 32, 24), 2, False, "argmin", "float32"),
+        ((31, 21, 15), None, True, "argmax", "float32"),
+        ((31, 21, 15), None, False, "sum", "float32"),
+        ((2, 3), None, True, "any", "bool"),
+        ((32, 128, 24), None, True, "any", "bool"),
+        ((1, 4, 7), 1, True, "any", "bool"),
+        ((32, 24, 32, 24), 2, False, "any", "bool"),
+    )
+
+    @tvm.testing.fixture(cache_return_value=True)
+    def ref_data(self, in_shape, axis, keepdims, reduce_type, dtype):
+        """Generate test reference data."""
+        if dtype == "bool":
+            in_npy_map = in_npy = np.random.choice([True, False], size=in_shape)
         else:
-            other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis + 1) :]))
-            sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
-            out_tvm_val = in_npy_map[sel_indices]
-        if reduce_type == "argmax":
-            tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1e-3, 1e-3)
+            in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype)
+            in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)
+
+        if reduce_type == "sum":
+            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
+        elif reduce_type == "all" and dtype == "bool":
+            out_npy = in_npy_map.all(axis=axis, keepdims=keepdims)
+        elif reduce_type == "any" and dtype == "bool":
+            out_npy = in_npy_map.any(axis=axis, keepdims=keepdims)
+        elif reduce_type == "max":
+            out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
+        elif reduce_type == "min":
+            out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
+        elif reduce_type == "argmax":
+            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
         elif reduce_type == "argmin":
-            tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1e-3, 1e-3)
-    else:
-        tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3)
+            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
+        else:
+            raise NotImplementedError
+
+        return in_npy, in_npy_map, out_npy
+
+    @tvm.testing.requires_hexagon
+    def test_reduce_map(
+        self, hexagon_session: Session, ref_data, in_shape, axis, keepdims, reduce_type, dtype
+    ):
+        """Test reduce map."""
+        in_npy, in_npy_map, out_npy = ref_data
+
+        # Build the logic and compile the function
+        a_tensor = te.placeholder(shape=in_shape, name="a_tensor", dtype=dtype)
+        a1_tensor = topi.sqrt(topi.exp(a_tensor))
+        out_dtype = dtype
+        if reduce_type == "sum":
+            b_tensor = topi.sum(a1_tensor, axis=axis, keepdims=keepdims)
+        elif reduce_type == "all":
+            b_tensor = topi.all(a_tensor, axis=axis, keepdims=keepdims)
+        elif reduce_type == "any":
+            b_tensor = topi.any(a_tensor, axis=axis, keepdims=keepdims)
+        elif reduce_type == "max":
+            b_tensor = topi.max(a1_tensor, axis=axis, keepdims=keepdims)
+        elif reduce_type == "min":
+            b_tensor = topi.min(a1_tensor, axis=axis, keepdims=keepdims)
+        elif reduce_type == "argmax":
+            b_tensor = topi.argmax(a1_tensor, axis=axis, keepdims=keepdims)
+            out_dtype = "int32"
+        elif reduce_type == "argmin":
+            b_tensor = topi.argmin(a1_tensor, axis=axis, keepdims=keepdims)
+            out_dtype = "int32"
+        else:
+            raise NotImplementedError
+
+        with tvm.target.Target(get_hexagon_target("v68")):
+            fschedule = topi.hexagon.schedule_reduce
+            s = fschedule(b_tensor)
+
+        func = tvm.build(s, [a_tensor, b_tensor], get_hexagon_target("v68"), name=reduce_type)
+        mod = hexagon_session.load_module(func)
+
+        dev = hexagon_session.device
+        data_tvm = tvm.nd.array(in_npy, device=dev)
+        out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=out_dtype)
+
+        mod[reduce_type](data_tvm, out_tvm)
+
+        if reduce_type in ["argmax", "argmin"]:
+            out_tvm_indices = out_tvm.numpy()
+            if keepdims:
+                out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
+            if axis is None:
+                out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
+            else:
+                other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis + 1) :]))
+                sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
+                out_tvm_val = in_npy_map[sel_indices]
+            if reduce_type == "argmax":
+                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1e-3, 1e-3)
+            elif reduce_type == "argmin":
+                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1e-3, 1e-3)
+        else:
+            tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_hexagon/topi/test_reshape.py b/tests/python/contrib/test_hexagon/topi/test_reshape.py
index 38b8a9cf9a82..33bb31902eaa 100644
--- a/tests/python/contrib/test_hexagon/topi/test_reshape.py
+++ b/tests/python/contrib/test_hexagon/topi/test_reshape.py
@@ -14,9 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
+"""Test reshape class."""
 import numpy as np
-import pytest
 
 import tvm
 import tvm.testing
@@ -25,6 +24,18 @@
 
 from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
+BATCH_FLATTEN_FP16_TESTS = (
+    ([1, 1, 1, 2048], [1, 2048], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
+    ([1, 2, 4, 2048], [1, 2 * 4 * 2048], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
+    ([1, 8, 8, 1024], [1, 8 * 8 * 1024], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
+    ([2, 4, 8, 1024], [2, 4 * 8 * 1024], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
+)
+
+BATCH_FLATTEN_UINT8_TESTS = (
+    ([1, 1, 1, 2048], [1, 2048], "nhwc-2048c-2d", "nc-2048-2d", "uint8"),
+    ([1, 2, 4, 2048], [1, 2 * 4 * 2048], "nhwc-2048c-2d", "nc-2048-2d", "uint8"),
+)
+
 
 def reshape_helper(
     func,
@@ -37,17 +48,18 @@ def reshape_helper(
     output_layout,
     hexagon_session,
 ):
+    """Reshape helper function."""
 
-    A = te.placeholder(input_shape, name="A", dtype=data_type)
+    a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=data_type)
     if func == "reshape":
-        D = fcompute(A, output_shape)
+        d_tesnsor = fcompute(a_tensor, output_shape)
     elif func == "batch_flatten":
-        D = fcompute(A)
+        d_tesnsor = fcompute(a_tensor)
     else:
         raise RuntimeError(f"Unexpected func'{func}'")
     tir_s = fschedule(
-        D,
-        A,
+        d_tesnsor,
+        a_tensor,
         output_layout,
         input_layout,
     )
@@ -87,28 +99,18 @@ def reshape_helper(
     np.testing.assert_allclose(output.numpy(), ref_np_transformed, atol=1e-07, rtol=0)
 
 
-batch_flatten_fp16_tests = (
-    ([1, 1, 1, 2048], [1, 2048], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
-    ([1, 2, 4, 2048], [1, 2 * 4 * 2048], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
-    ([1, 8, 8, 1024], [1, 8 * 8 * 1024], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
-    ([2, 4, 8, 1024], [2, 4 * 8 * 1024], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
-)
-
-
-batch_flatten_uint8_tests = (
-    ([1, 1, 1, 2048], [1, 2048], "nhwc-2048c-2d", "nc-2048-2d", "uint8"),
-    ([1, 2, 4, 2048], [1, 2 * 4 * 2048], "nhwc-2048c-2d", "nc-2048-2d", "uint8"),
-)
-
-
 class BaseTestBatchFlatten:
+    """Test batch flatten class."""
+
     (input_shape, output_shape, input_layout, output_layout, data_type,) = tvm.testing.parameters(
-        *batch_flatten_fp16_tests,
-        *batch_flatten_uint8_tests,
+        *BATCH_FLATTEN_FP16_TESTS,
+        *BATCH_FLATTEN_UINT8_TESTS,
     )
 
 
 class TestBatchFlatten(BaseTestBatchFlatten):
+    """Test batch flatten class."""
+
     @tvm.testing.requires_hexagon
     def test_batch_flatten(
         self,
@@ -119,6 +121,7 @@ def test_batch_flatten(
         output_layout,
         hexagon_session,
     ):
+        """Test batch flatten."""
         reshape_helper(
             "batch_flatten",
             sl.batch_flatten_compute,
@@ -132,28 +135,30 @@ def test_batch_flatten(
         )
 
 
-reshape_fp16_tests = (
-    ([1, 8, 4, 64], [1, 8, 8, 32], "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"),
-    ([1, 16, 8, 128], [1, 16, 16, 64], "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"),
-)
-
+class BaseTestReshape(BaseTestBatchFlatten):
+    """Test reshape base class."""
 
-reshape_uint8_tests = (
-    ([1, 8, 8, 128], [1, 8, 16, 64], "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"),
-    ([1, 16, 64, 128], [1, 16, 128, 64], "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"),
-)
+    reshape_fp16_tests = (
+        ([1, 8, 4, 64], [1, 8, 8, 32], "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"),
+        ([1, 16, 8, 128], [1, 16, 16, 64], "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"),
+    )
 
+    reshape_uint8_tests = (
+        ([1, 8, 8, 128], [1, 8, 16, 64], "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"),
+        ([1, 16, 64, 128], [1, 16, 128, 64], "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"),
+    )
 
-class BaseTestReshape(BaseTestBatchFlatten):
     (input_shape, output_shape, input_layout, output_layout, data_type,) = tvm.testing.parameters(
-        *batch_flatten_fp16_tests,
-        *batch_flatten_uint8_tests,
+        *BATCH_FLATTEN_FP16_TESTS,
+        *BATCH_FLATTEN_UINT8_TESTS,
         *reshape_fp16_tests,
         *reshape_uint8_tests,
     )
 
 
 class TestReshape(BaseTestReshape):
+    """Test reshape class."""
+
     @tvm.testing.requires_hexagon
     def test_reshape(
         self,
@@ -164,6 +169,7 @@ def test_reshape(
         output_layout,
         hexagon_session,
     ):
+        """Test reshape."""
         reshape_helper(
             "reshape",
             sl.reshape_compute,
diff --git a/tests/python/contrib/test_hexagon/topi/test_resize2d.py b/tests/python/contrib/test_hexagon/topi/test_resize2d.py
old mode 100755
new mode 100644
index 80cfba5c6c9e..44d9c95a2f06
--- a/tests/python/contrib/test_hexagon/topi/test_resize2d.py
+++ b/tests/python/contrib/test_hexagon/topi/test_resize2d.py
@@ -14,7 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import pytest
+"""Resize 2D tesst.
+"""
 import numpy as np
 
 import tvm
@@ -24,61 +25,9 @@
 from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
 
 
-@tvm.testing.fixture
-def expected_output_np(
-    input_np,
-    in_height,
-    in_width,
-    out_height,
-    out_width,
-    layout,
-    method,
-    coord_trans,
-    dtype,
-):
-    scale_h = out_height / in_height
-    scale_w = out_width / in_width
-
-    return resize2d_python(input_np, (scale_h, scale_w), layout, method, coord_trans)
-
-
-@tvm.testing.fixture
-def input_np(input_shape, dtype):
-    if dtype == "float16":
-        return np.random.random(input_shape).astype(dtype)
-    if dtype == "uint8":
-        return np.random.randint(0, 255, input_shape).astype(dtype)
-    if dtype == "int8":
-        return np.random.randint(-128, 127, input_shape).astype(dtype)
-
-
-@tvm.testing.fixture
-def transformed_input_np(input_np, layout, input_crouton_layout, dtype):
-    if dtype == "float16" or dtype == "uint8" or dtype == "int8":
-        return transform_numpy(input_np, layout.lower(), input_crouton_layout)
-
-    raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-
-@tvm.testing.fixture
-def transformed_expected_output_np(expected_output_np, layout, output_layout, dtype):
-    if dtype == "float16" or dtype == "uint8" or dtype == "int8":
-        return transform_numpy(expected_output_np, layout.lower(), output_layout)
-
-    raise RuntimeError(f"Unsupported data type '{dtype}'")
-
-
-@tvm.testing.fixture
-def input_shape(batch, channel, in_height, in_width):
-    return (batch, in_height, in_width, channel)
-
-
-@tvm.testing.fixture
-def output_shape(batch, channel, out_height, out_width):
-    return (batch, out_height, out_width, channel)
-
-
 class TestResize2d:
+    """Test resize 2D class."""
+
     (batch, channel, in_height, in_width, out_height, out_width,) = tvm.testing.parameters(
         (
             1,
@@ -106,6 +55,56 @@ class TestResize2d:
     coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel")
     method = tvm.testing.parameter("nearest_neighbor", "linear")
 
+    @tvm.testing.fixture
+    def expected_output_np(
+        self,
+        input_np,
+        in_height,
+        in_width,
+        out_height,
+        out_width,
+        layout,
+        method,
+        coord_trans,
+    ):
+        """Generate expected output."""
+        scale_h = out_height / in_height
+        scale_w = out_width / in_width
+
+        return resize2d_python(input_np, (scale_h, scale_w), layout, method, coord_trans)
+
+    @tvm.testing.fixture
+    def input_np(self, input_shape, dtype):
+        if dtype == "float16":
+            return np.random.random(input_shape).astype(dtype)
+        if dtype == "uint8":
+            return np.random.randint(0, 255, input_shape).astype(dtype)
+        if dtype == "int8":
+            return np.random.randint(-128, 127, input_shape).astype(dtype)
+        raise RuntimeError(f"dtype {dtype} is not valid.")
+
+    @tvm.testing.fixture
+    def transformed_input_np(self, input_np, layout, input_crouton_layout, dtype):
+        if dtype in ["float16", "uint8", "int8"]:
+            return transform_numpy(input_np, layout.lower(), input_crouton_layout)
+
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+    @tvm.testing.fixture
+    def transformed_expected_output_np(self, expected_output_np, layout, output_layout, dtype):
+        if dtype in ["float16", "uint8", "int8"]:
+            return transform_numpy(expected_output_np, layout.lower(), output_layout)
+
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+    @tvm.testing.fixture
+    def input_shape(self, batch, channel, in_height, in_width):
+        return (batch, in_height, in_width, channel)
+
+    @tvm.testing.fixture
+    def output_shape(self, batch, channel, out_height, out_width):
+        return (batch, out_height, out_width, channel)
+
     @tvm.testing.requires_hexagon
     def test_resize2d(
         self,
@@ -123,10 +122,11 @@ def test_resize2d(
         method,
         hexagon_session,
     ):
-        A = te.placeholder(input_shape, name="A", dtype=dtype)
+        """Test resize 2D."""
+        a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype)
 
-        M = s1.resize2d_compute(
-            A,
+        m_tensor = s1.resize2d_compute(
+            a_tensor,
             [0.0] * 4,
             (output_shape[1], output_shape[2]),
             layout=layout,
@@ -135,7 +135,9 @@ def test_resize2d(
             out_dtype=dtype,
         )
 
-        tir_schedule = s1.tir_resize2d_schedule(M, A, input_crouton_layout, output_layout)
+        tir_schedule = s1.tir_resize2d_schedule(
+            m_tensor, a_tensor, input_crouton_layout, output_layout
+        )
 
         sch = tir_schedule.mod
 
@@ -151,12 +153,12 @@ def test_resize2d(
         with tvm.transform.PassContext(opt_level=3):
             func = tvm.build(
                 sch,
-                [A, M],
+                [a_tensor, m_tensor],
                 get_hexagon_target("v69"),
                 name="resize2d",
             )
 
-        A_data_nd = allocate_hexagon_array(
+        a_data_nd = allocate_hexagon_array(
             hexagon_session.device,
             data=transformed_input_np,
             dtype=dtype,
@@ -164,7 +166,7 @@ def test_resize2d(
             mem_scope="global.vtcm",
         )
 
-        M_data_nd = allocate_hexagon_array(
+        m_data_nd = allocate_hexagon_array(
             hexagon_session.device,
             transformed_expected_output_np.shape,
             dtype=dtype,
@@ -173,21 +175,25 @@ def test_resize2d(
         )
 
         mod = hexagon_session.load_module(func)
-        mod(A_data_nd, M_data_nd)
+        mod(a_data_nd, m_data_nd)
 
-        b, h, w, c = output_shape
+        batch_size, height, width, channel = output_shape
         # convert nd to np and reshape to fixed chunk size layout
         if output_layout == "nhwc-8h2w32c2w-2d":
-            M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2])
+            m_data_np = m_data_nd.numpy().reshape(
+                [batch_size, height // 8, width // 4, channel // 32, 8, 2, 32, 2]
+            )
         elif output_layout == "nhwc-8h8w32c-2d":
-            M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32])
+            m_data_np = m_data_nd.numpy().reshape(
+                [batch_size, height // 8, width // 8, channel // 32, 8, 8, 32]
+            )
 
         if dtype == "float16":
             np.testing.assert_allclose(
-                transformed_expected_output_np, M_data_np, rtol=1e-3, atol=1e-3
+                transformed_expected_output_np, m_data_np, rtol=1e-3, atol=1e-3
             )
-        elif dtype == "int8" or dtype == "uint8":
-            np.testing.assert_allclose(transformed_expected_output_np, M_data_np, rtol=1, atol=1)
+        elif dtype in ["int8", "uint8"]:
+            np.testing.assert_allclose(transformed_expected_output_np, m_data_np, rtol=1, atol=1)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax.py b/tests/python/contrib/test_hexagon/topi/test_softmax.py
index 91f348494d6d..e1b4d97bc171 100644
--- a/tests/python/contrib/test_hexagon/topi/test_softmax.py
+++ b/tests/python/contrib/test_hexagon/topi/test_softmax.py
@@ -28,13 +28,8 @@
 
 from ..infrastructure import get_hexagon_target
 
-dtype = tvm.testing.parameter(
-    "float16",
-    "float32",
-)
-
 # TODO(mehrdadh): add log_softmax to config
-configs = {
+OPERATOR_CONFIGS = {
     "softmax": {
         "topi": topi.nn.softmax,
         "ref": tvm.topi.testing.softmax_python,
@@ -42,57 +37,69 @@
     },
 }
 
-# TODO(mehrdadh): larger size like (1, 16, 256, 256) would fail due to TVM_HEXAGON_RPC_BUFF_SIZE_BYTES
-shapes = [(32, 10), (3, 4), (1, 16, 32, 32)]
-softmax_operation, shape = tvm.testing.parameters(
-    *[
-        (name, shape)
-        for name, config in configs.items()
-        for shape in shapes
-        if len(shape) in config["dimensions"]
-    ]
-)
-
-
-@tvm.testing.requires_hexagon
-def test_softmax(hexagon_session: Session, shape, dtype, softmax_operation):
-    if dtype == "float16":
-        pytest.xfail("float16 is not supported.")
-    A = te.placeholder(shape, dtype=dtype, name="A")
-
-    topi_op = configs[softmax_operation]["topi"]
-    B = topi_op(A, axis=1)
-
-    def get_ref_data(shape):
-        ref_func = tvm.topi.testing.softmax_python
-        a_np = np.random.uniform(size=shape).astype(dtype)
-
-        if len(shape) == 2:
-            b_np = ref_func(a_np)
-        elif len(shape) == 4:
-            _, c, h, w = a_np.shape
-            a_np_2d = a_np.transpose(0, 2, 3, 1).reshape(h * w, c)
-            b_np_2d = tvm.topi.testing.softmax_python(a_np_2d)
-            b_np = b_np_2d.reshape(1, h, w, c).transpose(0, 3, 1, 2)
-
-        return a_np, b_np
-
-    # get the test data
-    a_np, b_np = get_ref_data(shape)
-
-    with tvm.target.Target(get_hexagon_target("v68")):
-        fschedule = topi.hexagon.schedule_softmax
-        s = fschedule(B)
-
-    func = tvm.build(s, [A, B], get_hexagon_target("v68"), name="softmax")
-    mod = hexagon_session.load_module(func)
-
-    dev = hexagon_session.device
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-    mod["softmax"](a, b)
-
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
+
+class TestSoftmax:
+    """Softmax test class."""
+
+    dtype = tvm.testing.parameter(
+        "float16",
+        "float32",
+    )
+
+    # TODO(mehrdadh): larger size like (1, 16, 256, 256)
+    # would fail due to TVM_HEXAGON_RPC_BUFF_SIZE_BYTES
+    shape = tvm.testing.parameter((32, 10), (3, 4), (1, 16, 32, 32))
+
+    @tvm.testing.fixture
+    def softmax_operation(self, shape) -> tuple:
+        """Returns the operation name and shape."""
+        for name, config in OPERATOR_CONFIGS.items():
+            if len(shape) in config["dimensions"]:
+                return name
+            else:
+                raise ValueError(f"Shape {shape} is not supported.")
+
+    @tvm.testing.requires_hexagon
+    def test_softmax(self, hexagon_session: Session, dtype, shape, softmax_operation):
+        """Test softmax."""
+        if dtype == "float16":
+            pytest.xfail("float16 is not supported.")
+
+        a_tensor = te.placeholder(shape, dtype=dtype, name="a_tensor")
+
+        topi_op = OPERATOR_CONFIGS[softmax_operation]["topi"]
+        b_tensor = topi_op(a_tensor, axis=1)
+
+        def get_ref_data(shape):
+            ref_func = tvm.topi.testing.softmax_python
+            a_np = np.random.uniform(size=shape).astype(dtype)
+
+            if len(shape) == 2:
+                b_np = ref_func(a_np)
+            elif len(shape) == 4:
+                _, c, height, width = a_np.shape
+                a_np_2d = a_np.transpose(0, 2, 3, 1).reshape(height * width, c)
+                b_np_2d = tvm.topi.testing.softmax_python(a_np_2d)
+                b_np = b_np_2d.reshape(1, height, width, c).transpose(0, 3, 1, 2)
+
+            return a_np, b_np
+
+        # get the test data
+        a_np, b_np = get_ref_data(shape)
+
+        with tvm.target.Target(get_hexagon_target("v68")):
+            fschedule = topi.hexagon.schedule_softmax
+            s = fschedule(b_tensor)
+
+        func = tvm.build(s, [a_tensor, b_tensor], get_hexagon_target("v68"), name="softmax")
+        mod = hexagon_session.load_module(func)
+
+        dev = hexagon_session.device
+        a = tvm.nd.array(a_np, dev)
+        b = tvm.nd.array(np.zeros(get_const_tuple(b_tensor.shape), dtype=b_tensor.dtype), dev)
+        mod["softmax"](a, b)
+
+        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
 
 
 if __name__ == "__main__":

From 56878fab7c277eedc7fc1ead7c0cfa911305ff49 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Sat, 5 Nov 2022 12:26:04 +0900
Subject: [PATCH 512/704] [TE] Make `elem_offset` of the buffers created by
 `te.extern` a variable to avoid crash (#13297)

* make elem_offset of the buffers created by te.extern a variable

Co-authored-by: Eric Lunderberg <elunderberg@octoml.ai>

* add test

* fix te extern create_prim_func test

Co-authored-by: Eric Lunderberg <elunderberg@octoml.ai>
---
 python/tvm/te/operation.py                    | 10 +++++--
 tests/python/relay/test_op_level1.py          | 26 +++++++++++++++++++
 .../unittest/test_te_create_primfunc.py       | 15 ++++++-----
 3 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py
index 8da78a599c28..5279c46aebc2 100644
--- a/python/tvm/te/operation.py
+++ b/python/tvm/te/operation.py
@@ -326,7 +326,11 @@ def extern(
         if not isinstance(t, _tensor.Tensor):
             raise ValueError("expect inputs to be tensor")
         if in_buffers is None:
-            input_placeholders.append(tvm.tir.decl_buffer(t.shape, t.dtype, t.op.name))
+            input_placeholders.append(
+                tvm.tir.decl_buffer(
+                    t.shape, t.dtype, t.op.name, elem_offset=tvm.tir.Var("elem_offset", "int32")
+                )
+            )
         types.add(t.dtype)
 
     if dtype is None:
@@ -339,7 +343,9 @@ def extern(
 
     if out_buffers is None:
         for shp, dt in zip(shape, dtype):
-            output_placeholders.append(tvm.tir.decl_buffer(shp, dt, name))
+            output_placeholders.append(
+                tvm.tir.decl_buffer(shp, dt, name, elem_offset=tvm.tir.Var("elem_offset", "int32"))
+            )
     body = fcompute(input_placeholders, output_placeholders)
     if isinstance(body, tvm.tir.PrimExpr):
         body = tvm.tir.Evaluate(body)
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 3436bdd9f28d..4234c18c110f 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -820,5 +820,31 @@ def test_dense_rocm_sdot4():
     np.testing.assert_equal(out, ref)
 
 
+def test_extern_concat_injective_fuse():
+    # This is a subgraph from MobileBERT, which crashes compilation if buffers created in te.extern(...)
+    # do not have their elem_offset explicitly set as a variable.
+
+    # fmt: off
+    mod = tvm.parser.fromtext(
+        """
+       #[version = "0.0.5"]
+       def @main(%p0844: Tensor[(1, 384), int64], %p1652: Tensor[(2016, 128), float16]) {
+        %1331 = cast(%p0844, dtype="int32");
+        %1332 = take(%p1652, %1331, axis=0);
+        %1333 = strided_slice(%1332, begin=[0, 1, 0], end=[1, 384, 128], strides=[1, 1, 1], axes=None);
+        %1334 = strided_slice(%1332, begin=[0, 0, 0], end=[1, -1, 128], strides=[1, 1, 1], axes=None);
+        %1335 = nn.pad(%1333, 0, pad_width=[[0, 0], [0, 1], [0, 0]]);
+        %1336 = nn.pad(%1334, 0, pad_width=[[0, 0], [1, 0], [0, 0]]);
+        %1337 = (%1335, %1332, %1336);
+        %1338 = concatenate(%1337, axis=2);
+        reshape(%1338, newshape=[-1, 384])
+      }
+    """
+    )
+    # fmt: on
+
+    relay.build(mod, params={}, target="llvm")
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py
index d10fd2d23d47..4c216cdbc53a 100644
--- a/tests/python/unittest/test_te_create_primfunc.py
+++ b/tests/python/unittest/test_te_create_primfunc.py
@@ -216,9 +216,12 @@ def te_extern():
 @T.prim_func
 def tir_extern(a: T.handle, b: T.handle, c: T.handle) -> None:
     T.func_attr({"global_symbol": "main", "tir.noalias": True})
-    A = T.match_buffer(a, (128, 128))
-    B = T.match_buffer(b, (128, 128))
-    C = T.match_buffer(c, (128, 128))
+    off1 = te.var("elem_offset")
+    off2 = te.var("elem_offset_1")
+    off3 = te.var("elem_offset_2")
+    A = T.match_buffer(a, (128, 128), elem_offset=off1)
+    B = T.match_buffer(b, (128, 128), elem_offset=off2)
+    C = T.match_buffer(c, (128, 128), elem_offset=off3)
     # body
     with T.block("C"):
         T.reads([A[0:128, 0:128], B[0:128, 0:128]])
@@ -232,7 +235,7 @@ def tir_extern(a: T.handle, b: T.handle, c: T.handle) -> None:
                     0,
                     2,
                     0.0,
-                    0,
+                    off1,
                     dtype="handle",
                 ),
                 T.tvm_stack_make_array(
@@ -241,7 +244,7 @@ def tir_extern(a: T.handle, b: T.handle, c: T.handle) -> None:
                     0,
                     2,
                     0.0,
-                    0,
+                    off2,
                     dtype="handle",
                 ),
                 T.tvm_stack_make_array(
@@ -250,7 +253,7 @@ def tir_extern(a: T.handle, b: T.handle, c: T.handle) -> None:
                     0,
                     2,
                     0.0,
-                    0,
+                    off3,
                     dtype="handle",
                 ),
                 0,

From 1e793641d8f6ea15b7207dd3820b3c0657ba27ab Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 4 Nov 2022 22:26:14 -0500
Subject: [PATCH 513/704] [TIR] Added unit test for dynamic parameter in layout
 transform (#13298)

---
 .../test_tir_schedule_transform_layout.py     | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py
index 0bf75becb2c0..282f1dcf49e9 100644
--- a/tests/python/unittest/test_tir_schedule_transform_layout.py
+++ b/tests/python/unittest/test_tir_schedule_transform_layout.py
@@ -836,5 +836,53 @@ def before(A: T.Buffer[14, "int32"]):
     expected = tvm.tir.schedule.schedule.ScheduleError
 
 
+class TestTransformLayoutWithVar(tvm.testing.CompareBeforeAfter):
+    """Layout transform with dynamic parameter in transform"""
+
+    @pytest.fixture
+    def transform(self):
+        def transform(mod):
+            sch = tir.Schedule(mod)
+
+            n = sch.mod["main"].params[1]
+
+            sch.transform_layout(
+                "block",
+                "B",
+                lambda i: [i // n, i % n],
+                pad_value=0,
+            )
+            return sch.mod
+
+        return transform
+
+    def before(A: T.Buffer[16, "int32"], n: T.int32):
+        B = T.alloc_buffer(16, "int32")
+        for i in T.serial(16):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                B[vi] = A[vi]
+
+    def expected(A: T.Buffer[16, "int32"], n: T.int32):
+        B = T.alloc_buffer([(-16 % n + 16) // n, n], dtype="int32")
+        for i, j in T.grid((-16 % n + 16) // n, n):
+            with T.block("block"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                B[vi, vj] = T.if_then_else(
+                    # Checks if the transform introduced padding
+                    -16 % n != 0
+                    and (
+                        # If so, is vi in the last group (which may
+                        # include padding).
+                        (vj + vi * n) // n == 16 // n
+                        # And is vj within the padding
+                        and 16 % n <= (vj + vi * n) % n
+                    ),
+                    0,
+                    A[vj + vi * n],
+                    dtype="int32",
+                )
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 732e34fe3d9facf0d710970aed8c7a86c038b070 Mon Sep 17 00:00:00 2001
From: wrongtest <wrongtest0@gmail.com>
Date: Sat, 5 Nov 2022 15:39:04 +0800
Subject: [PATCH 514/704] [TIR] Preserve loop annotation after loop
 partitioning (#13292)

Preserve loop annotations when the loop is get partitioned. Also we bind the loop region info to the analyzer for some cases some partition condition could not get solved due to unknown (but trivial) loop region.
---
 src/tir/transforms/loop_partition.cc          |   6 +-
 .../test_tir_transform_loop_partition.py      | 128 +++++++++++++++---
 2 files changed, 114 insertions(+), 20 deletions(-)

diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc
index e1445d29dacf..1d995ef26ed8 100644
--- a/src/tir/transforms/loop_partition.cc
+++ b/src/tir/transforms/loop_partition.cc
@@ -393,6 +393,7 @@ class LoopPartitioner : public StmtMutator {
   }
 
   Stmt VisitStmt_(const ForNode* op) final {
+    analyzer_.Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent), true);
     auto fs = GetRef<Stmt>(op);
     if (selector.candidates.count(fs)) {
       Stmt s = TryPartition(fs, op->loop_var, op->min, op->min + op->extent - 1, op->body, false);
@@ -697,12 +698,13 @@ inline Stmt LoopPartitioner::MakeFor(const Object* node, PrimExpr extent, Stmt b
   const ForNode* for_node = static_cast<const ForNode*>(node);
   ICHECK(for_node);
   if (analyzer_.CanProve(extent == make_const(DataType::Int(32), 1)) &&
-      !no_unroll_loop_with_extent_one_) {
+      !no_unroll_loop_with_extent_one_ && for_node->annotations.empty()) {
     // If the loop extent is 1, do not create the loop anymore
     return Substitute(body, {{Var{for_node->loop_var}, make_const(DataType::Int(32), 0)}});
   } else {
     ICHECK(for_node->kind != ForKind::kThreadBinding);
-    return For(for_node->loop_var, IntImm(for_node->min.dtype(), 0), extent, for_node->kind, body);
+    return For(for_node->loop_var, IntImm(for_node->min.dtype(), 0), extent, for_node->kind, body,
+               for_node->thread_binding, for_node->annotations);
   }
 }
 
diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py
index 23a0064ee6ff..56128155295e 100644
--- a/tests/python/unittest/test_tir_transform_loop_partition.py
+++ b/tests/python/unittest/test_tir_transform_loop_partition.py
@@ -568,6 +568,17 @@ def test_explicit_partition_hint():
     assert tvm.ir.structural_equal(mod["main"], partitioned_concat)
 
 
+def partition_from_scheduled_tir(prim_func, pass_cfg):
+    with tvm.transform.PassContext(config=pass_cfg):
+        mod = IRModule.from_expr(prim_func)
+        mod = tvm.tir.transform.LowerOpaqueBlock()(mod)
+        mod = tvm.tir.transform.FlattenBuffer()(mod)
+        mod = tvm.tir.transform.LoopPartition()(mod)
+        mod = tvm.tir.transform.Simplify()(mod)
+        mod = tvm.tir.transform.RemoveNoOp()(mod)
+        return mod
+
+
 @T.prim_func
 def partitioned_concat_3(
     placeholder: T.Buffer[(50176,), "int8"],
@@ -609,13 +620,9 @@ def concat_func_3(
 
 
 def test_condition_mutually_exclusive():
-    mod = IRModule.from_expr(concat_func_3)
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.LowerOpaqueBlock()(mod)
-        mod = tvm.tir.transform.FlattenBuffer()(mod)
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        mod = tvm.tir.transform.Simplify()(mod)
-        mod = tvm.tir.transform.RemoveNoOp()(mod)
+    mod = partition_from_scheduled_tir(
+        concat_func_3, {"tir.LoopPartition": {"partition_const_loop": True}}
+    )
     assert tvm.ir.structural_equal(mod["main"], partitioned_concat_3)
 
 
@@ -650,23 +657,108 @@ def partitioned_main(A: T.Buffer[150528, "int8"], B: T.Buffer[25088, "int8"]) ->
             if ax2 < 5 and ax3 < 3:
                 B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax2 + 219]
 
-    mod = tvm.ir.module.IRModule.from_expr(main)
-    with tvm.transform.PassContext(
-        config={
+    mod = partition_from_scheduled_tir(
+        main,
+        {
             "tir.LoopPartition": {
                 "partition_const_loop": True,
                 "unroll_loop_with_partition_hint_no_interval": True,
             }
-        }
-    ):
-        mod = tvm.tir.transform.LowerOpaqueBlock()(mod)
-        mod = tvm.tir.transform.FlattenBuffer()(mod)
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        mod = tvm.tir.transform.UnrollLoop()(mod)
-        mod = tvm.tir.transform.RemoveNoOp()(mod)
-        mod = tvm.tir.transform.Simplify()(mod)
+        },
+    )
+    mod = tvm.tir.transform.UnrollLoop()(mod)
+    mod = tvm.tir.transform.RemoveNoOp()(mod)
+    mod = tvm.tir.transform.Simplify()(mod)
     assert tvm.ir.structural_equal(mod["main"], partitioned_main)
 
 
+def test_loop_partition_keep_loop_annotations():
+    @T.prim_func
+    def before(A: T.Buffer[160, "int32"], B: T.Buffer[160, "int32"]) -> None:
+        for i in T.serial(
+            160,
+            annotations={"pragma_loop_partition_hint": True, "key": "value"},
+        ):
+            if i < 10:
+                B[i] = A[i] + 1
+            elif 10 <= i and i < 150:
+                B[i] = A[i] + 2
+            else:
+                B[i] = A[i] + 3
+
+    @T.prim_func
+    def after(A: T.Buffer[160, "int32"], B: T.Buffer[160, "int32"]) -> None:
+        T.preflattened_buffer(A, [160], dtype="int32", data=A.data)
+        T.preflattened_buffer(B, [160], dtype="int32", data=B.data)
+        for i in T.serial(10, annotations={"key": "value"}):
+            B[i] = A[i] + 1
+        for i in T.serial(140, annotations={"key": "value"}):
+            B[i + 10] = A[i + 10] + 2
+        for i in T.serial(10, annotations={"key": "value"}):
+            B[i + 150] = A[i + 150] + 3
+
+    mod = partition_from_scheduled_tir(
+        before,
+        {
+            "tir.LoopPartition": {
+                "partition_const_loop": True,
+            }
+        },
+    )
+    assert tvm.ir.structural_equal(mod["main"], after)
+
+
+def test_loop_partition_with_unit_loop_in_condition():
+    @T.prim_func
+    def before(
+        placeholder: T.Buffer[(50176,), "int8"],
+        placeholder_1: T.Buffer[(25088,), "int8"],
+        placeholder_2: T.Buffer[(25088,), "int8"],
+        T_concat: T.Buffer[(100352,), "int8"],
+    ) -> None:
+        for k in range(1, annotations={"preserve_unit_loop": True}):
+            for i1 in T.serial(128, annotations={"pragma_loop_partition_hint": 1}):
+                for i2, i3 in T.grid(28, 28):
+                    if 96 <= k * 128 + i1:
+                        T_concat[k * i1 * 784 + i2 * 28 + i3] = placeholder_2[
+                            i1 * 784 + i2 * 28 + i3 - 75264
+                        ]
+                    if 64 <= k * 128 + i1 and k * 128 + i1 < 96:
+                        T_concat[i1 * 784 + i2 * 28 + i3] = placeholder_1[
+                            i1 * 784 + i2 * 28 + i3 - 50176
+                        ]
+                    if k * 128 + i1 < 64:
+                        T_concat[i1 * 784 + i2 * 28 + i3] = placeholder[i1 * 784 + i2 * 28 + i3]
+
+    @T.prim_func
+    def after(
+        placeholder: T.Buffer[50176, "int8"],
+        placeholder_1: T.Buffer[25088, "int8"],
+        placeholder_2: T.Buffer[25088, "int8"],
+        T_concat: T.Buffer[100352, "int8"],
+    ) -> None:
+        T.preflattened_buffer(placeholder, [50176], dtype="int8", data=placeholder.data)
+        T.preflattened_buffer(placeholder_1, [25088], dtype="int8", data=placeholder_1.data)
+        T.preflattened_buffer(placeholder_2, [25088], dtype="int8", data=placeholder_2.data)
+        T.preflattened_buffer(T_concat, [100352], dtype="int8", data=T_concat.data)
+        for _ in T.serial(1, annotations={"preserve_unit_loop": True}):
+            for i1, i2, i3 in T.grid(64, 28, 28):
+                T_concat[i1 * 784 + i2 * 28 + i3] = placeholder[i1 * 784 + i2 * 28 + i3]
+            for i1, i2, i3 in T.grid(32, 28, 28):
+                T_concat[i1 * 784 + i2 * 28 + i3 + 50176] = placeholder_1[i1 * 784 + i2 * 28 + i3]
+            for i1, i2, i3 in T.grid(32, 28, 28):
+                T_concat[i2 * 28 + i3] = placeholder_2[i1 * 784 + i2 * 28 + i3]
+
+    mod = partition_from_scheduled_tir(
+        before,
+        {
+            "tir.LoopPartition": {
+                "partition_const_loop": True,
+            }
+        },
+    )
+    assert tvm.ir.structural_equal(mod["main"], after)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From b51c491c0072f323658b8d5803cd3321ac37a7d3 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Sat, 5 Nov 2022 11:16:12 -0700
Subject: [PATCH 515/704] [FIX] Handle matmul where one inner dimension is
 unknown (#13287)

Unify the two inner dimensions in the type checker so if one is unknown
it will be filled in.
---
 include/tvm/relay/attrs/nn.h         |  6 +++--
 python/tvm/topi/nn/dense.py          |  4 ++-
 src/relay/op/nn/nn.h                 | 37 ++++++++++++++++++++++------
 tests/python/relay/test_op_level1.py |  8 ++++++
 4 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 5b84942a57cf..5ffc4711cac6 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -1052,7 +1052,8 @@ struct MatmulAttrs : public tvm::AttrsNode<MatmulAttrs> {
   DataType out_dtype;
   bool transpose_a;
   bool transpose_b;
-  tvm::String auto_scheduler_rewritten_layout;   // The layout after auto-scheduler's layout rewrite
+  // layout of B after auto-scheduler's layout rewrite
+  tvm::String auto_scheduler_rewritten_layout;
   Array<PrimExpr> meta_schedule_original_shape;  // The original shape of the weights
 
   TVM_DECLARE_ATTRS(MatmulAttrs, "relay.attrs.MatmulAttrs") {
@@ -1076,7 +1077,8 @@ struct MatmulAttrs : public tvm::AttrsNode<MatmulAttrs> {
 /*! \brief Attributes for dense operator */
 struct DenseAttrs : public tvm::AttrsNode<DenseAttrs> {
   IndexExpr units;
-  tvm::String auto_scheduler_rewritten_layout;   // The layout after auto-scheduler's layout rewrite
+  // layout of B after auto-scheduler's layout rewrite
+  tvm::String auto_scheduler_rewritten_layout;
   Array<PrimExpr> meta_schedule_original_shape;  // The original shape of the weights
   DataType out_dtype;
 
diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py
index 61f9c4e17c50..d7d475fb0c02 100644
--- a/python/tvm/topi/nn/dense.py
+++ b/python/tvm/topi/nn/dense.py
@@ -94,7 +94,9 @@ def matmul(
         red_dim, out_dim = tensor_b.shape
 
     # cmp should be done by values
-    assert int(in_dim) == int(red_dim)
+    assert int(in_dim) == int(
+        red_dim
+    ), "Inner dimensions of dense do not match. {in_dim} vs {red_dim}."
 
     k = te.reduce_axis((0, in_dim), name="k")
     if (transpose_a, transpose_b) == (True, True):
diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h
index 872970000392..30f7f9e3304d 100644
--- a/src/relay/op/nn/nn.h
+++ b/src/relay/op/nn/nn.h
@@ -31,6 +31,7 @@
 
 #include <algorithm>
 #include <utility>
+#include <vector>
 
 #include "../op_common.h"
 
@@ -70,6 +71,7 @@ bool MatmulRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     reduce = dshape[dshape.size() - 2];
     oshape.Set((oshape.size() - 2), dshape[oshape.size() - 1]);
   }
+  auto tensor_b_dtype = (tensor_b == nullptr ? tensor_a->dtype : tensor_b->dtype);
   if (param->units.defined()) {
     // validate the tensor_b shape is proper if defined
     // Assign tensor_b type
@@ -78,7 +80,6 @@ bool MatmulRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     // It is possible for tensor_b to be nullptr in which case we will use
     // data dtype as the tensor_b dtype. However if tensor_b dtype is explicitly
     // present we will use that.
-    auto tensor_b_dtype = (tensor_b == nullptr ? tensor_a->dtype : tensor_b->dtype);
     if (param->auto_scheduler_rewritten_layout.size() != 0) {
       // If the layout is rewritten by auto-scheduler or meta-schedule,
       // we just forcefully apply the layout provided by auto-scheduler and
@@ -102,12 +103,34 @@ bool MatmulRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
       oshape.Set(oshape.size() - 1, tensor_b_elements / dshape[dshape.size() - 1]);
       // Otherwise just pull it out of the tensor_b shape directly.
     } else {
-      ICHECK(static_cast<int>(tensor_b->shape.size()) == 2);
-      if (!tensor_a->shape.back().as<tir::AnyNode>()) {
-        ICHECK((transpose_b && reporter->AssertEQ(reduce, tensor_b->shape[1])) ||
-               (!transpose_b && reporter->AssertEQ(reduce, tensor_b->shape[0])))
-            << "MatmulRel: input dimension doesn't match,"
-            << " tensor_a shape=" << tensor_a->shape << ", tensor_b shape=" << tensor_b->shape;
+      if (param->auto_scheduler_rewritten_layout.size() == 0 &&
+          param->meta_schedule_original_shape.size() == 0) {
+        // ensure inner dimension matches between data and weight. If one inner
+        // dimension is dynamic then it is inferred to match the other inner
+        // dimension.
+        std::vector<PrimExpr> A_shape(tensor_a->shape.begin(), tensor_a->shape.end());
+        std::vector<PrimExpr> B_shape(tensor_b->shape.begin(), tensor_b->shape.end());
+        auto sa = A_shape.size();
+        auto sb = B_shape.size();
+        if (transpose_a && transpose_b) {
+          auto tmp = A_shape[sa - 2];
+          A_shape[sa - 2] = B_shape[sb - 1];
+          B_shape[sb - 1] = tmp;
+        } else if (transpose_a) {
+          auto tmp = A_shape[sa - 2];
+          A_shape[sa - 2] = B_shape[sb - 2];
+          B_shape[sb - 2] = tmp;
+        } else if (transpose_b) {
+          auto tmp = A_shape[sa - 1];
+          A_shape[sa - 1] = B_shape[sb - 1];
+          B_shape[sb - 1] = tmp;
+        } else {
+          auto tmp = A_shape[sa - 1];
+          A_shape[sa - 1] = B_shape[sb - 2];
+          B_shape[sb - 2] = tmp;
+        }
+        reporter->Assign(types[0], TensorType(A_shape, tensor_a->dtype));
+        reporter->Assign(types[1], TensorType(B_shape, tensor_b_dtype));
       }
       oshape.Set(oshape.size() - 1, transpose_b ? wshape[0] : wshape[1]);
     }
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 4234c18c110f..7884fa35a48b 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -683,6 +683,14 @@ def test_dense(executor_kind):
         yy = run_infer_type(y)
         assert yy.checked_type == relay.TensorType((n, c, h, ww), dtype)
 
+        # test dynamic shape in inner
+        m, k = 4, 2
+        x = relay.var("x", relay.TensorType((m, k), dtype))
+        k, nw = relay.Any(), 6
+        w = relay.var("w", relay.TensorType((k, n), dtype))
+        y = relay.nn.dense(x, w)
+        yy = run_infer_type(y)
+
         n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2
         x = relay.var("x", relay.TensorType((n, c, h, w), dtype))
         w = relay.var("w", relay.IncompleteType())

From f2a740331f21106787a29566185d8924e5dcb25a Mon Sep 17 00:00:00 2001
From: "Xiangxi Guo (Ryan)" <ryan.guo99@gmail.com>
Date: Sun, 6 Nov 2022 08:23:48 -0600
Subject: [PATCH 516/704] [DOCS][TVMC] Use correct argument to reuse tuning
 records (#13302)

Update tvmc tutorial code to use correct argument for reusing tuning
records. Specifically, current code uses tuning_records, which is meant
for saving the generated tuning results, not reusing prior results. We
should use prior_records instead.
---
 gallery/tutorial/tvmc_python.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gallery/tutorial/tvmc_python.py b/gallery/tutorial/tvmc_python.py
index 0cd4f8ed9b9a..417f8ad88747 100644
--- a/gallery/tutorial/tvmc_python.py
+++ b/gallery/tutorial/tvmc_python.py
@@ -248,7 +248,7 @@
 #      ...
 #
 #      # Later run tuning and reuse tuning results
-#      tvmc.tune(model, target="llvm",tuning_records=log_file)
+#      tvmc.tune(model, target="llvm", prior_records=log_file)
 #
 # Method 2:
 #    .. code-block:: python
@@ -259,7 +259,7 @@
 #      ...
 #
 #      # Later run tuning and reuse tuning results
-#      tvmc.tune(model, target="llvm",tuning_records=tuning_records)
+#      tvmc.tune(model, target="llvm", prior_records=tuning_records)
 #
 
 ################################################################################

From 60e2c98fdb14721d0010d261d5c6b94d94e053d2 Mon Sep 17 00:00:00 2001
From: Anirudh Sundar Subramaniam <quic_sanirudh@quicinc.com>
Date: Mon, 7 Nov 2022 10:27:49 +0530
Subject: [PATCH 517/704] [Hexagon] Fix Hexagon external libs check (#13257)

When building tvm runtime with hexagon we face the below error if
USE_HEXAGON_EXTERNAL_LIBS is not defined. This happens because
USE_HEXAGON_EXTERNAL_LIBS=OFF is defined as the default in
CMakeLists.txt. The modified condition can check for all cases including
undefined variable, empty string and OFF

CMake Error at cmake/modules/Hexagon.cmake:203 (message):
  Invalid use of USE_HEXAGON_EXTERNAL_LIBS=OFF; USE_HEXAGON_EXTERNAL_LIBS
  only supports absolute paths and git repository urls
Call Stack (most recent call first):
  CMakeLists.txt:477 (include)
---
 cmake/modules/Hexagon.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index 735d21e492b5..31cece8a19e0 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -184,7 +184,7 @@ if(BUILD_FOR_HEXAGON)
   )
 
   # Include hexagon external library runtime sources
-  if(DEFINED USE_HEXAGON_EXTERNAL_LIBS AND NOT ${USE_HEXAGON_EXTERNAL_LIBS} STREQUAL "")
+  if(USE_HEXAGON_EXTERNAL_LIBS)
     # Check if the libs are provided as an absolute path
     if (EXISTS ${USE_HEXAGON_EXTERNAL_LIBS})
     # Check if the libs are provided as a git url

From dd257e478ef88e3e2a2c1df1b36efbfe4347ee5f Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@octoml.ai>
Date: Mon, 7 Nov 2022 09:22:06 -0800
Subject: [PATCH 518/704] [Relay][Op] Add support for large index fp16 mean and
 var (#13289)

Add support for large index fp16 mean and var.
---
 src/relay/op/tensor/reduce.cc        | 37 ++++++++++++++++++++++------
 tests/python/relay/test_op_level4.py | 23 +++++++++++------
 2 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc
index 2b1afc6e55f2..d82705e3fc55 100644
--- a/src/relay/op/tensor/reduce.cc
+++ b/src/relay/op/tensor/reduce.cc
@@ -606,18 +606,29 @@ Example::
 
 Array<te::Tensor> MeanCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                               const Type& out_type) {
-  IndexExpr count = tir::make_const(inputs[0]->dtype, 1);
+  auto data = inputs[0];
+  IndexExpr count = tir::make_const(DataType::Int(64), 1);
   const ReduceAttrs* param = attrs.as<ReduceAttrs>();
   ICHECK(param != nullptr);
   auto axes = param->axis;
   for (int64_t i : GetReduceAxes(inputs[0]->shape.size(), param->axis, param->exclude)) {
     count *= inputs[0]->shape[i];
   }
-  // Although count is created as inputs[0]->dtype,
-  // its type may be changed (promoted) during multiplication
-  count = cast(inputs[0]->dtype, count);
-  auto res = ReduceCompute(attrs, inputs, out_type, topi::sum);
-  return {topi::divide(res[0], count)};
+  // Check the datatype of input data. If it's fp16, we'll have trouble representing all
+  // indices and summation needed so we instead just cast to fp32.
+  bool recast_fp16 = false;
+  if (data->dtype.is_float16()) {
+    recast_fp16 = true;
+    data = topi::cast(data, DataType::Float(32));
+  }
+  count = cast(data->dtype, count);
+  auto res = ReduceCompute(attrs, {data}, out_type, topi::sum);
+  auto output = topi::divide(res[0], count);
+  // Set the output back to the appropriate fp16 type if needed.
+  if (recast_fp16) {
+    output = topi::cast(output, DataType::Float(16));
+  }
+  return {output};
 }
 
 RELAY_REGISTER_REDUCE_OP("mean")
@@ -667,7 +678,7 @@ bool VarianceRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
 
 Array<te::Tensor> VarianceCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                   const Type& out_type) {
-  IndexExpr count = tir::make_const(inputs[0]->dtype, 1);
+  IndexExpr count = tir::make_const(DataType::Int(64), 1);
   const VarianceAttrs* param = attrs.as<VarianceAttrs>();
   ICHECK(param != nullptr);
   auto axes = param->axis;
@@ -687,8 +698,20 @@ Array<te::Tensor> VarianceCompute(const Attrs& attrs, const Array<te::Tensor>& i
     axes = GetExcludeAxes(sq_diff->shape.size(), param->axis);
     ICHECK_NE(axes.size(), 0);
   }
+  // If the input is fp16, we might have trouble representing the full sum of
+  // indices or values. We recast to fp32 to avoid this issue.
+  bool recast_fp16 = false;
+  if (data->dtype.is_float16()) {
+    recast_fp16 = true;
+    sq_diff = topi::cast(sq_diff, DataType::Float(32));
+  }
   auto var = topi::divide(topi::sum(sq_diff, axes, param->keepdims, false), count);
 
+  // Recast back to fp16 if needed.
+  if (recast_fp16) {
+    var = topi::cast(var, DataType::Float(16));
+  }
+
   return {var};
 }
 
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index a8eb7f406c37..c4207b158c94 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -397,24 +397,28 @@ def get_test_case(shape, gt_func, test_argmin=False):
                 assert op_res.numpy().item() == ans
 
 
-def verify_mean_var_std(executor_kind, funcs, shape, axis, keepdims):
+def verify_mean_var_std(executor_kind, funcs, shape, axis, keepdims, dtype="float32"):
     test_func = funcs[0]
     ref_func = funcs[1]
-    dtype = "float32"
 
     x = relay.var("x", relay.TensorType(shape, dtype))
     z = test_func(x, axis, keepdims)
     func = relay.Function([x], z.astuple())
-    x_data = np.random.uniform(size=shape).astype(dtype)
-    ref_mean = np.mean(x_data, axis=axis, dtype=dtype, keepdims=keepdims)
-    ref_res = ref_func(x_data, axis=axis, dtype=dtype, keepdims=keepdims)
+    x_data = np.random.uniform(size=shape).astype("float32")
+    ref_mean = np.mean(x_data, axis=axis, dtype="float32", keepdims=keepdims).astype(dtype)
+    ref_res = ref_func(x_data, axis=axis, dtype="float32", keepdims=keepdims).astype(dtype)
 
     for target, dev in tvm.testing.enabled_targets():
         op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
-            x_data
+            x_data.astype(dtype)
         )
-        tvm.testing.assert_allclose(op_res[0].numpy(), ref_mean, rtol=1e-5)
-        tvm.testing.assert_allclose(op_res[1].numpy(), ref_res, rtol=1e-5)
+        # FP16 is always a little less accurate.
+        if dtype == "float16":
+            rtol, atol = (1e-2, 1e-2)
+        else:
+            rtol, atol = (1e-5, 1e-5)
+        tvm.testing.assert_allclose(op_res[0].numpy(), ref_mean, rtol=rtol, atol=atol)
+        tvm.testing.assert_allclose(op_res[1].numpy(), ref_res, rtol=rtol, atol=atol)
 
 
 @tvm.testing.uses_gpu
@@ -430,6 +434,9 @@ def test_mean_var_std(executor_kind):
         verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 2), False)
         verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 1), True)
         verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 2), True)
+        # Test FP16 reduction with large indices.
+        verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 2), True, "float16")
+        verify_mean_var_std(executor_kind, func, (128, 24, 128), None, False, "float16")
 
 
 @tvm.testing.uses_gpu

From 6b238c4b6e53ab41a2a359f2190b58633460f42e Mon Sep 17 00:00:00 2001
From: Hao Cheng <35162184+Wanger-SJTU@users.noreply.github.com>
Date: Tue, 8 Nov 2022 04:15:36 +0800
Subject: [PATCH 519/704] [Bugfix][Runtime] Fix sched_setaffinity in Android
 (#13158)

* fix sched_setaffinity error on Android

* fix sched_setaffinity error on Android

* fix sched_setaffinity error on Android

* clang format

* add ndk api verion macro

* clang format
---
 src/runtime/threading_backend.cc | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
index ef1aa69f6455..9f7f2cd8d98a 100644
--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -25,6 +25,9 @@
 #include <tvm/runtime/threading_backend.h>
 
 #if defined(__linux__) || defined(__ANDROID__)
+#if __ANDROID_API__ >= 21
+#include <pthread.h>
+#endif
 #include <fstream>
 #include <sstream>
 #else
@@ -167,7 +170,19 @@ class ThreadGroup::Impl {
       CPU_SET(id, &cpuset);
     }
 #if defined(__ANDROID__)
-    sched_setaffinity(thread, sizeof(cpu_set_t), &cpuset);
+#if __ANDROID_API__ >= 21
+    pid_t tid = pthread_gettid_np(thread);
+#else
+    typedef struct {
+      void* next;
+      void* pred;
+      pid_t tid;
+    } pthread_internal;
+    pid_t tid = reinterpret_cast<pthread_internal*>(thread)->tid;
+#endif
+    if (sched_setaffinity(tid, sizeof(cpu_set_t), &cpuset) != 0) {
+      LOG(WARNING) << "sched_setaffinity failed";
+    }
 #else
     pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
 #endif

From e398d16de8e222b766070f48217cbf746996d987 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 8 Nov 2022 06:34:11 +0900
Subject: [PATCH 520/704] [Torch] Fix advanced indexing with boolean mask
 (#13306)

* [Torch] Fix advanced indexing with boolean mask

* add comment
---
 python/tvm/relay/frontend/pytorch.py          | 15 +++++++++++++--
 tests/python/frontend/pytorch/test_forward.py |  8 ++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 1b86b120dfcc..30f14b490b1b 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2323,8 +2323,19 @@ def one_hot(self, inputs, input_types):
 
     def index(self, inputs, input_types):
         data = inputs[0]
-        indices = inputs[1]
-        return _op.adv_index([data] + indices)
+        indices_list = []
+
+        for indices in inputs[1]:
+            if self.infer_type(indices).dtype == "bool":
+                # adv_index does not support a mask as the index tensor (it will treat 0/1 as
+                # an index rather than a flag).
+                # So we use argwhere to turn the mask into indices, which will also take care
+                # of the dynamism in the indexing by mask.
+                indices_list.append(_op.squeeze(_op.transform.argwhere(indices), axis=[1]))
+            else:
+                indices_list.append(indices)
+
+        return _op.adv_index([data] + indices_list)
 
     def meshgrid(self, inputs, input_types):
         data = inputs[0]
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 8045635127bb..36bb5bede475 100755
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -4034,6 +4034,14 @@ def forward(self, x):
     input_data = torch.rand(input_shape).float()
     verify_model(Index1().eval(), input_data=input_data)
 
+    def test_fn_bool_mask():
+        return lambda data, mask: data[0, mask]
+
+    data = torch.tensor([[1, 2, 3], [4, 5, 6]])
+    mask = torch.tensor([True, True, False])
+
+    verify_trace_model(test_fn_bool_mask(), [data, mask], ["llvm", "cuda"])
+
 
 def test_logsumexp():
     """test_logsumexp"""

From ce777fde18bb4c1ef23a856a998c50606c7947f8 Mon Sep 17 00:00:00 2001
From: M <mengceng.he@intel.com>
Date: Tue, 8 Nov 2022 06:30:45 +0800
Subject: [PATCH 521/704] [TOPI] Enhance VNNI dot product (#12588)

change
---
 python/tvm/topi/x86/tensor_intrin.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/tvm/topi/x86/tensor_intrin.py b/python/tvm/topi/x86/tensor_intrin.py
index 727319c95c5c..9e91e32b20e5 100644
--- a/python/tvm/topi/x86/tensor_intrin.py
+++ b/python/tvm/topi/x86/tensor_intrin.py
@@ -305,15 +305,16 @@ def _instr(index):
 
             if llvm_id != 0:  # VNNI is available for current LLVM version
                 vec_bi32 = tvm.tir.call_intrin("int32x16", "tir.reinterpret", vec_b)
-                vec_zero = tvm.tir.const(0, "int32x16")
+                vec_c = outs[0].vload([0], "int32x16")
                 quad_reduction = tvm.tir.call_llvm_pure_intrin(
                     "int32x16",
                     "llvm.x86.avx512.vpdpbusd.512",
                     tvm.tir.const(0, "uint32"),
-                    vec_zero,
+                    vec_c,
                     vec_ai32,
                     vec_bi32,
                 )
+                ib.emit(outs[0].vstore(0, quad_reduction))
             else:  # Fall back to the normal AVX512
                 vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai32)
                 vec_one = tvm.tir.const(1, "int16x32")
@@ -331,11 +332,10 @@ def _instr(index):
                     pair_reduction,
                     vec_one,
                 )
-
-            if index == 0:
-                ib.emit(outs[0].vstore(0, quad_reduction))
-            else:
-                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], "int32x16")))
+                if index == 0:
+                    ib.emit(outs[0].vstore(0, quad_reduction))
+                else:
+                    ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], "int32x16")))
             return ib.get()
 
         # body, reset, update

From b16a64d6edb9fd1a014fc51995dff7d0e2f4c84e Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Mon, 7 Nov 2022 14:55:26 -0800
Subject: [PATCH 522/704] [MetaSchedule] Refactor ScheduleRule Attributes
 (#13195)

---
 .../tvm/meta_schedule/schedule/cpu/.gitignore |   0
 .../meta_schedule/schedule/cuda/thread_bind.h |  50 ++-
 .../meta_schedule/schedule/generic/winograd.h |  37 ++
 .../tvm/meta_schedule/schedule/x86/.gitignore |   0
 include/tvm/meta_schedule/schedule_rule.h     |   8 +
 python/tvm/meta_schedule/__init__.py          |   6 +-
 python/tvm/meta_schedule/relay_integration.py |  16 +-
 python/tvm/meta_schedule/schedule/__init__.py |  18 +
 .../meta_schedule/schedule/cpu/__init__.py    |  17 +
 .../meta_schedule/schedule/cuda/__init__.py   |  17 +
 .../schedule/generic/__init__.py              |  17 +
 .../meta_schedule/schedule/x86/__init__.py    |  17 +
 .../meta_schedule/schedule_rule/__init__.py   |   5 +-
 .../schedule_rule/apply_custom_rule.py        |  33 ++
 .../search_strategy/evolutionary_search.py    |   2 +-
 .../testing/conv2d_winograd_cpu.py            | 172 ---------
 .../testing/conv2d_winograd_cuda.py           | 173 ---------
 .../meta_schedule/testing/relay_workload.py   |   1 -
 .../meta_schedule/testing/space_generation.py |   2 +-
 .../tvm/meta_schedule/testing/te_workload.py  | 150 ++++----
 python/tvm/relay/backend/te_compiler.py       |   4 +-
 python/tvm/relay/op/nn/_nn.py                 |   4 +-
 python/tvm/relay/op/strategy/adreno.py        |  10 +-
 python/tvm/relay/op/strategy/arm_cpu.py       |  12 +-
 python/tvm/relay/op/strategy/bifrost.py       |  10 +-
 python/tvm/relay/op/strategy/cuda.py          |  72 +++-
 python/tvm/relay/op/strategy/generic.py       |  10 +-
 python/tvm/relay/op/strategy/mali.py          |  10 +-
 python/tvm/relay/op/strategy/x86.py           |  10 +-
 python/tvm/topi/cuda/conv2d_alter_op.py       |  56 +--
 python/tvm/topi/cuda/conv2d_nhwc_winograd.py  |   4 +-
 python/tvm/topi/cuda/conv2d_winograd.py       |  35 +-
 python/tvm/topi/nn/conv2d.py                  | 352 ++++++++++++++---
 python/tvm/topi/utils.py                      |  14 +-
 python/tvm/topi/x86/batch_matmul.py           |   8 +-
 python/tvm/topi/x86/dense.py                  |  14 +-
 .../postproc/rewrite_unbound_block.cc         |   3 +-
 src/meta_schedule/schedule/cpu/winograd.cc    | 101 +++++
 .../schedule/cuda/thread_bind.cc              | 181 +++++++++
 src/meta_schedule/schedule/cuda/winograd.cc   | 163 ++++++++
 .../schedule/generic/winograd.cc              |  46 +++
 src/meta_schedule/schedule/x86/.gitignore     |   0
 .../schedule_rule/apply_custom_rule.cc        |  92 +++++
 src/meta_schedule/schedule_rule/auto_bind.cc  | 138 +------
 .../schedule_rule/schedule_rule.cc            |  56 ++-
 src/meta_schedule/schedule_rule/winograd.cc   | 249 ------------
 .../space_generator/post_order_apply.cc       |  49 +--
 src/meta_schedule/utils.h                     |  35 +-
 src/target/tag.cc                             |   9 +-
 src/te/operation/create_primfunc.cc           |  17 +-
 .../metaschedule_e2e/test_resnet50_int8.py    |  14 +-
 ..._meta_schedule_custom_rule_winograd_cpu.py | 206 ----------
 ...meta_schedule_custom_rule_winograd_cuda.py | 328 ----------------
 .../test_meta_schedule_post_order_apply.py    |  43 ---
 .../test_meta_schedule_relay_integration.py   |   7 +-
 .../test_meta_schedule_space_cpu_winograd.py  | 168 +++++++++
 .../unittest/test_meta_schedule_space_cuda.py | 169 ---------
 .../test_meta_schedule_space_cuda_winograd.py | 355 ++++++++++++++++++
 .../test_meta_schedule_vnni_integration.py    |  14 +-
 .../unittest/test_te_create_primfunc.py       |   2 -
 .../test_tir_analysis_stmt_finding.py         |   7 +-
 61 files changed, 1966 insertions(+), 1852 deletions(-)
 create mode 100644 include/tvm/meta_schedule/schedule/cpu/.gitignore
 rename src/meta_schedule/schedule_rule/auto_bind.h => include/tvm/meta_schedule/schedule/cuda/thread_bind.h (57%)
 create mode 100644 include/tvm/meta_schedule/schedule/generic/winograd.h
 create mode 100644 include/tvm/meta_schedule/schedule/x86/.gitignore
 create mode 100644 python/tvm/meta_schedule/schedule/__init__.py
 create mode 100644 python/tvm/meta_schedule/schedule/cpu/__init__.py
 create mode 100644 python/tvm/meta_schedule/schedule/cuda/__init__.py
 create mode 100644 python/tvm/meta_schedule/schedule/generic/__init__.py
 create mode 100644 python/tvm/meta_schedule/schedule/x86/__init__.py
 create mode 100644 python/tvm/meta_schedule/schedule_rule/apply_custom_rule.py
 delete mode 100644 python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py
 delete mode 100644 python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py
 create mode 100644 src/meta_schedule/schedule/cpu/winograd.cc
 create mode 100644 src/meta_schedule/schedule/cuda/thread_bind.cc
 create mode 100644 src/meta_schedule/schedule/cuda/winograd.cc
 create mode 100644 src/meta_schedule/schedule/generic/winograd.cc
 create mode 100644 src/meta_schedule/schedule/x86/.gitignore
 create mode 100644 src/meta_schedule/schedule_rule/apply_custom_rule.cc
 delete mode 100644 src/meta_schedule/schedule_rule/winograd.cc
 delete mode 100644 tests/python/unittest/test_meta_schedule_custom_rule_winograd_cpu.py
 delete mode 100644 tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py
 create mode 100644 tests/python/unittest/test_meta_schedule_space_cpu_winograd.py
 create mode 100644 tests/python/unittest/test_meta_schedule_space_cuda_winograd.py

diff --git a/include/tvm/meta_schedule/schedule/cpu/.gitignore b/include/tvm/meta_schedule/schedule/cpu/.gitignore
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/src/meta_schedule/schedule_rule/auto_bind.h b/include/tvm/meta_schedule/schedule/cuda/thread_bind.h
similarity index 57%
rename from src/meta_schedule/schedule_rule/auto_bind.h
rename to include/tvm/meta_schedule/schedule/cuda/thread_bind.h
index b397d2015c19..ae6d492bfe12 100644
--- a/src/meta_schedule/schedule_rule/auto_bind.h
+++ b/include/tvm/meta_schedule/schedule/cuda/thread_bind.h
@@ -16,37 +16,53 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#ifndef TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_BIND_H_
-#define TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_BIND_H_
+#ifndef TVM_META_SCHEDULE_SCHEDULE_CUDA_THREAD_BIND_H_
+#define TVM_META_SCHEDULE_SCHEDULE_CUDA_THREAD_BIND_H_
 
-#include "../utils.h"
+#include <tvm/tir/schedule/schedule.h>
+
+#include <algorithm>
+#include <limits>
+#include <utility>
 
 namespace tvm {
 namespace meta_schedule {
 
 /*!
- * \brief Bind the given block if it is not bound to blockIdx or threadIdx.
+ * \brief Given candidates of thread_extents, make a sampler that use `sch->SampleCategorical`
+ * to return a random thread extent.
+ * \param sch The schedule
+ * \param thread_extents The candidate thread extents.
+ * \return A sampler that returns a random thread extent.
+ */
+std::function<tir::ExprRV(int64_t)> MakeFactorSampler(tir::Schedule sch,
+                                                      Array<Integer> thread_extents);
+
+/*!
+ * \brief Bind blockIdx.x and threadIdx.x to the given loop
  * \param sch The schedule.
- * \param block The block to be bound.
+ * \param loop The loop to be bound.
  * \param max_threadblocks The maximum number of threadblocks allowed.
- * \param max_threads The maximum number of threads allowed.
+ * \param max_threads_per_block The maximum number of threads allowed.
  * \param get_factor A function that returns the tiling factor.
  */
-void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block,
-                        int64_t max_threadblocks, int64_t max_threads_per_block,
-                        std::function<tir::ExprRV(int64_t max_extent)> get_factor);
+Array<tir::LoopRV> BindSpatialLoop(tir::Schedule sch, tir::LoopRV loop,  //
+                                   int64_t max_threadblocks, int64_t max_threads_per_block,
+                                   std::function<tir::ExprRV(int64_t)> get_factor = nullptr);
 
 /*!
- * \brief Given candidates of thread_extents, make a sampler that use `sch->SampleCategorical`
- * to return a random thread extent.
- * \param sch The schedule
- * \param thread_extents The candidate thread extents.
- * \return A sampler that returns a random thread extent.
+ * \brief Bind the given block if it is not bound to blockIdx or threadIdx.
+ * \param sch The schedule.
+ * \param block The block to be bound.
+ * \param max_threadblocks The maximum number of threadblocks allowed.
+ * \param max_threads_per_block The maximum number of threads allowed.
+ * \param get_factor A function that returns the tiling factor.
  */
-std::function<tir::ExprRV(int64_t max_extent)> MakeFactorSampler(tir::Schedule sch,
-                                                                 Array<Integer> thread_extents);
+void BindBlockThreadIdx(tir::Schedule sch, tir::BlockRV block,  //
+                        int64_t max_threadblocks, int64_t max_threads_per_block,
+                        std::function<tir::ExprRV(int64_t max_extent)> get_factor = nullptr);
 
 }  // namespace meta_schedule
 }  // namespace tvm
 
-#endif  // TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_BIND_H_
+#endif  // TVM_META_SCHEDULE_SCHEDULE_CUDA_THREAD_BIND_H_
diff --git a/include/tvm/meta_schedule/schedule/generic/winograd.h b/include/tvm/meta_schedule/schedule/generic/winograd.h
new file mode 100644
index 000000000000..dc9b32fd10de
--- /dev/null
+++ b/include/tvm/meta_schedule/schedule/generic/winograd.h
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_META_SCHEDULE_SCHEDULE_GENERIC_WINOGRAD_H_
+#define TVM_META_SCHEDULE_SCHEDULE_GENERIC_WINOGRAD_H_
+
+#include <tvm/tir/schedule/schedule.h>
+
+namespace tvm {
+namespace meta_schedule {
+
+/*!
+ * \brief Get the producer block of a given block.
+ * If there is a constant winograd transform matrix, inline it.
+ * \return The only producer block.
+ */
+tir::BlockRV GetWinogradProducerAndInlineConst(tir::Schedule sch, tir::BlockRV block);
+
+}  // namespace meta_schedule
+}  // namespace tvm
+
+#endif  // TVM_META_SCHEDULE_SCHEDULE_GENERIC_WINOGRAD_H_
diff --git a/include/tvm/meta_schedule/schedule/x86/.gitignore b/include/tvm/meta_schedule/schedule/x86/.gitignore
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h
index 1b018512146f..da8f1faa8e1d 100644
--- a/include/tvm/meta_schedule/schedule_rule.h
+++ b/include/tvm/meta_schedule/schedule_rule.h
@@ -99,6 +99,14 @@ class ScheduleRule : public runtime::ObjectRef {
    * \return The cloned schedule rule.
    */
   using FClone = runtime::TypedPackedFunc<ScheduleRule()>;
+  /*!
+   * \brief Create a rule that applies customized rules registered using block attribute
+   * `schedule_rule`. The rule will be dispatched according to target keys.
+   * \return The created schedule rule.
+   */
+  TVM_DLL static ScheduleRule ApplyCustomRule();
+  /*! \brief Check if the rule is `ApplyCustomRule` */
+  TVM_DLL static bool IsApplyCustomRule(const ScheduleRule& rule);
   /*!
    * \brief Create an auto-inline rule that inlines spatial blocks if it satisfies some conditions
    * \param into_producer If allows to inline a block into its producer
diff --git a/python/tvm/meta_schedule/__init__.py b/python/tvm/meta_schedule/__init__.py
index 0dd679e047e0..30a4fc6d9467 100644
--- a/python/tvm/meta_schedule/__init__.py
+++ b/python/tvm/meta_schedule/__init__.py
@@ -26,6 +26,7 @@
     postproc,
     relay_integration,
     runner,
+    schedule,
     schedule_rule,
     search_strategy,
     space_generator,
@@ -41,10 +42,7 @@
 from .mutator import Mutator
 from .postproc import Postproc
 from .profiler import Profiler
-from .relay_integration import (
-    is_meta_schedule_dispatch_enabled,
-    is_meta_schedule_enabled,
-)
+from .relay_integration import is_meta_schedule_enabled
 from .runner import Runner
 from .schedule_rule import ScheduleRule
 from .search_strategy import MeasureCandidate, SearchStrategy
diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py
index 5e77181d32bf..df76684d2d42 100644
--- a/python/tvm/meta_schedule/relay_integration.py
+++ b/python/tvm/meta_schedule/relay_integration.py
@@ -377,7 +377,7 @@ def compile_relay(
     mod, target, params, pass_config, executor = _normalize_params(
         mod, target, params, pass_config, executor
     )
-    pass_config.setdefault("relay.backend.use_meta_schedule_dispatch", target.kind.name != "cuda")
+    pass_config.setdefault("relay.backend.use_meta_schedule_dispatch", True)
     with Profiler.timeit("PostTuningCompilation"):
         with target, _autotvm_silencer(), database:
             with transform.PassContext(
@@ -404,17 +404,3 @@ def is_meta_schedule_enabled() -> bool:
         "relay.backend.use_meta_schedule",
         False,
     )
-
-
-def is_meta_schedule_dispatch_enabled() -> bool:
-    """Return whether the meta-schedule dispatch is enabled.
-
-    Returns
-    -------
-    enabled: bool
-        Whether the meta schedule is enabled
-    """
-    return transform.PassContext.current().config.get(
-        "relay.backend.use_meta_schedule_dispatch",
-        False,
-    )
diff --git a/python/tvm/meta_schedule/schedule/__init__.py b/python/tvm/meta_schedule/schedule/__init__.py
new file mode 100644
index 000000000000..0f5efce9ff65
--- /dev/null
+++ b/python/tvm/meta_schedule/schedule/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Per-block schedule rules in MetaSchedule"""
+from . import cpu, cuda, generic, x86
diff --git a/python/tvm/meta_schedule/schedule/cpu/__init__.py b/python/tvm/meta_schedule/schedule/cpu/__init__.py
new file mode 100644
index 000000000000..ddc0155ee4f4
--- /dev/null
+++ b/python/tvm/meta_schedule/schedule/cpu/__init__.py
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Per-block schedule rules in MetaSchedule for target key 'cpu'"""
diff --git a/python/tvm/meta_schedule/schedule/cuda/__init__.py b/python/tvm/meta_schedule/schedule/cuda/__init__.py
new file mode 100644
index 000000000000..937a6e16a91b
--- /dev/null
+++ b/python/tvm/meta_schedule/schedule/cuda/__init__.py
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Per-block schedule rules in MetaSchedule for target key 'cuda'"""
diff --git a/python/tvm/meta_schedule/schedule/generic/__init__.py b/python/tvm/meta_schedule/schedule/generic/__init__.py
new file mode 100644
index 000000000000..38ba5beb6772
--- /dev/null
+++ b/python/tvm/meta_schedule/schedule/generic/__init__.py
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Per-block schedule rules in MetaSchedule for generic cases"""
diff --git a/python/tvm/meta_schedule/schedule/x86/__init__.py b/python/tvm/meta_schedule/schedule/x86/__init__.py
new file mode 100644
index 000000000000..d41979638078
--- /dev/null
+++ b/python/tvm/meta_schedule/schedule/x86/__init__.py
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Per-block schedule rules in MetaSchedule for target key 'x86'"""
diff --git a/python/tvm/meta_schedule/schedule_rule/__init__.py b/python/tvm/meta_schedule/schedule_rule/__init__.py
index a015d0eb1ab2..5971ad53c48c 100644
--- a/python/tvm/meta_schedule/schedule_rule/__init__.py
+++ b/python/tvm/meta_schedule/schedule_rule/__init__.py
@@ -20,15 +20,16 @@
 blocks in a schedule. See also PostOrderApply.
 """
 from .add_rfactor import AddRFactor
+from .apply_custom_rule import ApplyCustomRule
 from .auto_bind import AutoBind
 from .auto_inline import AutoInline
 from .cross_thread_reduction import CrossThreadReduction
 from .multi_level_tiling import (
     MultiLevelTiling,
-    MultiLevelTilingWithIntrin,
-    ReuseType,
     MultiLevelTilingTensorCore,
     MultiLevelTilingWideVector,
+    MultiLevelTilingWithIntrin,
+    ReuseType,
 )
 from .parallel_vectorize_unroll import ParallelizeVectorizeUnroll
 from .random_compute_location import RandomComputeLocation
diff --git a/python/tvm/meta_schedule/schedule_rule/apply_custom_rule.py b/python/tvm/meta_schedule/schedule_rule/apply_custom_rule.py
new file mode 100644
index 000000000000..29e25f992930
--- /dev/null
+++ b/python/tvm/meta_schedule/schedule_rule/apply_custom_rule.py
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Create a rule that applies customized rules registered using block attribute `schedule_rule`.
+The rule will be dispatched according to target keys."""
+from tvm._ffi import register_object
+
+from .. import _ffi_api
+from .schedule_rule import ScheduleRule
+
+
+@register_object("meta_schedule.ApplyCustomRule")
+class ApplyCustomRule(ScheduleRule):
+    """A rule that applies customized rules registered using block attribute `schedule_rule`.
+    The rule will be dispatched according to target keys."""
+
+    def __init__(self) -> None:
+        self.__init_handle_by_constructor__(
+            _ffi_api.ScheduleRuleApplyCustomRule,  # type: ignore # pylint: disable=no-member
+        )
diff --git a/python/tvm/meta_schedule/search_strategy/evolutionary_search.py b/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
index 65e7ddc468b5..44f32527fad9 100644
--- a/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
+++ b/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
@@ -58,7 +58,7 @@ class EvolutionarySearch(SearchStrategy):
     def __init__(
         self,
         *,
-        population_size: int = 2048,
+        population_size: int = 512,
         init_measured_ratio: float = 0.2,
         init_min_unmeasured: int = 50,
         max_fail_count: int = 5,
diff --git a/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py b/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py
deleted file mode 100644
index d6242020726b..000000000000
--- a/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-
-from tvm.script import tir as T
-
-# pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument,no-self-use,unused-argument,chained-comparison,misplaced-comparison-constant
-
-
-@T.prim_func
-def conv2d_winograd_cpu(
-    X: T.Buffer[(1, 14, 14, 128), "float32"],  # type: ignore
-    W: T.Buffer[(6, 6, 128, 128), "float32"],  # type: ignore
-    conv2d_winograd: T.Buffer[(1, 12, 12, 128), "float32"],  # type: ignore
-) -> None:
-    # body
-    data_pad = T.alloc_buffer([1, 16, 16, 128])
-    input_tile = T.alloc_buffer([6, 6, 9, 128])
-    B = T.alloc_buffer([6, 6])
-    data_pack = T.alloc_buffer([6, 6, 9, 128])
-    bgemm = T.alloc_buffer([6, 6, 9, 128])
-    A = T.alloc_buffer([6, 4])
-    inverse = T.alloc_buffer([4, 4, 9, 128])
-    for i0, i1, i2, i3 in T.grid(1, 16, 16, 128):
-        with T.block("data_pad"):
-            i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
-            T.block_attr({"schedule_rule": "None"})
-            T.reads([X[i0_1, i1_1, i2_1, i3_1]])
-            T.writes([data_pad[i0_1, i1_1, i2_1, i3_1]])
-            data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(
-                0 <= i1_1 and i1_1 < 14 and 0 <= i2_1 and i2_1 < 14,  # type: ignore
-                X[i0_1, i1_1, i2_1, i3_1],
-                T.float32(0),
-                dtype="float32",
-            )
-    for i0_2, i1_2, i2_2, i3_2 in T.grid(6, 6, 9, 128):
-        with T.block("input_tile"):
-            eps, nu, p, ci = T.axis.remap("SSSS", [i0_2, i1_2, i2_2, i3_2])
-            T.block_attr({"schedule_rule": "None"})
-            T.reads(
-                data_pad[
-                    T.floordiv(p, 9),  # type: ignore
-                    ((T.floordiv(T.floormod(p, 9), 3) * 4) + eps),  # type: ignore
-                    ((T.floormod(p, 3) * 4) + nu),  # type: ignore
-                    ci,
-                ]
-            )
-            T.writes([input_tile[eps, nu, p, ci]])
-            input_tile[eps, nu, p, ci] = data_pad[
-                T.floordiv(p, 9),  # type: ignore
-                ((T.floordiv(T.floormod(p, 9), 3) * 4) + eps),  # type: ignore
-                ((T.floormod(p, 3) * 4) + nu),  # type: ignore
-                ci,
-            ]
-    for i0_3, i1_3 in T.grid(6, 6):
-        with T.block("B"):
-            i, j = T.axis.remap("SS", [i0_3, i1_3])
-            T.block_attr({"schedule_rule": "meta_schedule.compute_inline"})
-            T.writes([B[i, j]])
-            # fmt: off
-            B[i, j] = T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 5)), T.float32(1), T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 4)), T.float32(0), T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 3)), T.float32(0), T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 2)), T.float32(0), T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 1)), T.float32(0), T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 0)), T.float32(0), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 5)), T.float32(1.5), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 4)), T.float32(1), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 3)), T.float32(1), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 2)), T.float32(1), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 1)), T.float32(1), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 0)), T.float32(1), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 5)), T.float32(-2), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 4)), T.float32(-0.5), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 3)), T.float32(2), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 2)), T.float32(2.5), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 1)), T.float32(0.5), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 0)), T.float32(1.5), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 5)), T.float32(-1.5), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 4)), T.float32(-1), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 3)), T.float32(-1), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 2)), T.float32(0.5), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 1)), T.float32(-2.5), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 0)), T.float32(-2), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 5)), T.float32(1), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 4)), T.float32(0.5), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 3)), T.float32(-2), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 2)), T.float32(-1), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 1)), T.float32(1), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 0)), T.float32(-1.5), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 5)), T.float32(0), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 4)), T.float32(0), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 3)), T.float32(0), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 2)), T.float32(0), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 1)), T.float32(0), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 0)), T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))  # type: ignore
-            # fmt: on
-    for i0_4, i1_4, i2_3, i3_3, i4, i5 in T.grid(6, 6, 9, 128, 6, 6):
-        with T.block("data_pack"):
-            eps_1, nu_1, p_1, ci_1, r_a, r_b = T.axis.remap(
-                "SSSSRR", [i0_4, i1_4, i2_3, i3_3, i4, i5]
-            )
-            T.block_attr({"schedule_rule": "meta_schedule.winograd_data_pack.llvm"})
-            T.reads(
-                [
-                    data_pack[eps_1, nu_1, p_1, ci_1],
-                    input_tile[r_a, r_b, p_1, ci_1],
-                    B[
-                        T.min(r_a, r_b) : (  # type: ignore
-                            T.min(r_a, r_b) + ((T.max(r_a, r_b) + 1) - T.min(r_a, r_b))  # type: ignore
-                        ),
-                        T.min(eps_1, nu_1) : (  # type: ignore
-                            T.min(eps_1, nu_1) + ((T.max(eps_1, nu_1) + 1) - T.min(eps_1, nu_1))  # type: ignore
-                        ),
-                    ],
-                ]
-            )
-            T.writes([data_pack[eps_1, nu_1, p_1, ci_1]])
-            with T.init():
-                data_pack[eps_1, nu_1, p_1, ci_1] = T.float32(0)
-            data_pack[eps_1, nu_1, p_1, ci_1] = data_pack[eps_1, nu_1, p_1, ci_1] + (
-                (input_tile[r_a, r_b, p_1, ci_1] * B[r_a, eps_1]) * B[r_b, nu_1]
-            )
-    for i0_5, i1_5, i2_4, i3_4, i4_1 in T.grid(6, 6, 9, 128, 128):
-        with T.block("bgemm"):
-            eps_2, nu_2, p_2, co, ci_2 = T.axis.remap("SSSSR", [i0_5, i1_5, i2_4, i3_4, i4_1])
-            T.block_attr({"meta_schedule.write_cache_level": [2]})
-            T.reads(
-                [
-                    bgemm[eps_2, nu_2, p_2, co],
-                    data_pack[eps_2, nu_2, p_2, ci_2],
-                    W[eps_2, nu_2, co, ci_2],
-                ]
-            )
-            T.writes([bgemm[eps_2, nu_2, p_2, co]])
-            with T.init():
-                bgemm[eps_2, nu_2, p_2, co] = T.float32(0)
-            bgemm[eps_2, nu_2, p_2, co] = (
-                bgemm[eps_2, nu_2, p_2, co]
-                + data_pack[eps_2, nu_2, p_2, ci_2] * W[eps_2, nu_2, co, ci_2]
-            )
-    for i0_6, i1_6 in T.grid(6, 4):
-        with T.block("A"):
-            i_1, j_1 = T.axis.remap("SS", [i0_6, i1_6])
-            T.block_attr({"schedule_rule": "meta_schedule.compute_inline"})
-            T.writes([A[i_1, j_1]])
-            # fmt: off
-            A[i_1, j_1] = T.Select(((T.floormod(i_1, 6) == 5) and (T.floormod(j_1, 4) == 3)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 5) and (T.floormod(j_1, 4) == 2)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 5) and (T.floormod(j_1, 4) == 1)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 5) and (T.floormod(j_1, 4) == 0)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 4) and (T.floormod(j_1, 4) == 3)), T.float32(-8), T.Select(((T.floormod(i_1, 6) == 4) and (T.floormod(j_1, 4) == 2)), T.float32(4), T.Select(((T.floormod(i_1, 6) == 4) and (T.floormod(j_1, 4) == 1)), T.float32(-2), T.Select(((T.floormod(i_1, 6) == 4) and (T.floormod(j_1, 4) == 0)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 3) and (T.floormod(j_1, 4) == 3)), T.float32(0.125), T.Select(((T.floormod(i_1, 6) == 3) and (T.floormod(j_1, 4) == 2)), T.float32(0.25), T.Select(((T.floormod(i_1, 6) == 3) and (T.floormod(j_1, 4) == 1)), T.float32(0.5), T.Select(((T.floormod(i_1, 6) == 3) and (T.floormod(j_1, 4) == 0)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 2) and (T.floormod(j_1, 4) == 3)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 2) and (T.floormod(j_1, 4) == 2)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 2) and (T.floormod(j_1, 4) == 1)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 2) and (T.floormod(j_1, 4) == 0)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 1) and (T.floormod(j_1, 4) == 3)), T.float32(-1), T.Select(((T.floormod(i_1, 6) == 1) and (T.floormod(j_1, 4) == 2)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 1) and (T.floormod(j_1, 4) == 1)), T.float32(-1), T.Select(((T.floormod(i_1, 6) == 1) and (T.floormod(j_1, 4) == 0)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 0) and (T.floormod(j_1, 4) == 3)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 0) and (T.floormod(j_1, 4) == 2)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 0) and (T.floormod(j_1, 4) == 1)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 0) and (T.floormod(j_1, 4) == 0)), T.float32(1), T.float32(0)))))))))))))))))))))))))  # type: ignore
-            # fmt: on
-    for i0_7, i1_7, i2_5, i3_5, i4_2, i5_1 in T.grid(4, 4, 9, 128, 6, 6):
-        with T.block("inverse"):
-            vh, vw, p_3, co_1, r_a_1, r_b_1 = T.axis.remap(
-                "SSSSRR", [i0_7, i1_7, i2_5, i3_5, i4_2, i5_1]
-            )
-            T.block_attr({"schedule_rule": "meta_schedule.winograd_inverse.llvm"})
-            T.reads(
-                [
-                    inverse[vh, vw, p_3, co_1],
-                    bgemm[r_a_1, r_b_1, p_3, co_1],
-                    A[
-                        T.min(r_a_1, r_b_1) : (  # type: ignore
-                            T.min(r_a_1, r_b_1) + ((T.max(r_a_1, r_b_1) + 1) - T.min(r_a_1, r_b_1))  # type: ignore
-                        ),
-                        T.min(vh, vw) : (T.min(vh, vw) + ((T.max(vh, vw) + 1) - T.min(vh, vw))),  # type: ignore
-                    ],
-                ]
-            )
-            T.writes([inverse[vh, vw, p_3, co_1]])
-            with T.init():
-                inverse[vh, vw, p_3, co_1] = T.float32(0)
-            inverse[vh, vw, p_3, co_1] = inverse[vh, vw, p_3, co_1] + (
-                (bgemm[r_a_1, r_b_1, p_3, co_1] * A[r_a_1, vh]) * A[r_b_1, vw]
-            )
-    for i0_8, i1_8, i2_6, i3_6 in T.grid(1, 12, 12, 128):
-        with T.block("conv2d_winograd"):
-            n, h, w, co_2 = T.axis.remap("SSSS", [i0_8, i1_8, i2_6, i3_6])
-            T.reads(
-                [
-                    inverse[
-                        T.floormod(h, 4),  # type: ignore
-                        T.floormod(w, 4),  # type: ignore
-                        (((n * 9) + (T.floordiv(h, 4) * 3)) + T.floordiv(w, 4)),  # type: ignore
-                        co_2,
-                    ]
-                ]
-            )
-            T.writes([conv2d_winograd[n, h, w, co_2]])
-            conv2d_winograd[n, h, w, co_2] = inverse[
-                T.floormod(h, 4),  # type: ignore
-                T.floormod(w, 4),  # type: ignore
-                (((n * 9) + (T.floordiv(h, 4) * 3)) + T.floordiv(w, 4)),  # type: ignore
-                co_2,
-            ]
diff --git a/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py b/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py
deleted file mode 100644
index e737f9b04e62..000000000000
--- a/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-
-from tvm.script import tir as T
-
-# pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument,no-self-use,unused-argument,chained-comparison,misplaced-comparison-constant
-
-
-@T.prim_func
-def conv2d_winograd_cuda(  # type: ignore
-    placeholder: T.Buffer[(1, 14, 14, 128), "float32"],  # type: ignore
-    placeholder_1: T.Buffer[(6, 6, 128, 128), "float32"],  # type: ignore
-    conv2d_winograd: T.Buffer[(1, 12, 12, 128), "float32"],  # type: ignore
-) -> None:
-    # type: ignore
-    data_pad = T.alloc_buffer([1, 16, 16, 128])
-    input_tile = T.alloc_buffer([6, 6, 9, 128])
-    B = T.alloc_buffer([6, 6])
-    data_pack = T.alloc_buffer([6, 6, 9, 128])
-    bgemm = T.alloc_buffer([6, 6, 9, 128])
-    A = T.alloc_buffer([6, 4])
-    inverse = T.alloc_buffer([4, 4, 9, 128])
-    for i0, i1, i2, i3 in T.grid(1, 16, 16, 128):
-        with T.block("data_pad"):
-            i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
-            T.block_attr({"schedule_rule": "None"})
-            T.reads([placeholder[i0_1, i1_1, i2_1, i3_1]])
-            T.writes([data_pad[i0_1, i1_1, i2_1, i3_1]])
-            data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(
-                0 <= i1_1 and i1_1 < 14 and 0 <= i2_1 and i2_1 < 14,  # type: ignore
-                placeholder[i0_1, i1_1, i2_1, i3_1],
-                T.float32(0),
-                dtype="float32",
-            )
-    for i0_2, i1_2, i2_2, i3_2 in T.grid(6, 6, 9, 128):
-        with T.block("input_tile"):
-            eps, nu, p, ci = T.axis.remap("SSSS", [i0_2, i1_2, i2_2, i3_2])
-            T.block_attr({"schedule_rule": "None"})
-            T.reads(
-                [
-                    data_pad[
-                        T.floordiv(p, 9),  # type: ignore
-                        ((T.floordiv(T.floormod(p, 9), 3) * 4) + eps),  # type: ignore
-                        ((T.floormod(p, 3) * 4) + nu),  # type: ignore
-                        ci,
-                    ]
-                ]
-            )
-            T.writes([input_tile[eps, nu, p, ci]])
-            input_tile[eps, nu, p, ci] = data_pad[
-                T.floordiv(p, 9),  # type: ignore
-                ((T.floordiv(T.floormod(p, 9), 3) * 4) + eps),  # type: ignore
-                ((T.floormod(p, 3) * 4) + nu),  # type: ignore
-                ci,
-            ]
-    for i0_3, i1_3 in T.grid(6, 6):
-        with T.block("B"):
-            i, j = T.axis.remap("SS", [i0_3, i1_3])
-            T.block_attr({"schedule_rule": "meta_schedule.compute_inline"})
-            T.writes([B[i, j]])
-            # fmt: off
-            B[i, j] = T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 5)), T.float32(1), T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 4)), T.float32(0), T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 3)), T.float32(0), T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 2)), T.float32(0), T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 1)), T.float32(0), T.Select(((T.floormod(i, 6) == 5) and (T.floormod(j, 6) == 0)), T.float32(0), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 5)), T.float32(1.5), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 4)), T.float32(1), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 3)), T.float32(1), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 2)), T.float32(1), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 1)), T.float32(1), T.Select(((T.floormod(i, 6) == 4) and (T.floormod(j, 6) == 0)), T.float32(1), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 5)), T.float32(-2), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 4)), T.float32(-0.5), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 3)), T.float32(2), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 2)), T.float32(2.5), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 1)), T.float32(0.5), T.Select(((T.floormod(i, 6) == 3) and (T.floormod(j, 6) == 0)), T.float32(1.5), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 5)), T.float32(-1.5), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 4)), T.float32(-1), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 3)), T.float32(-1), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 2)), T.float32(0.5), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 1)), T.float32(-2.5), T.Select(((T.floormod(i, 6) == 2) and (T.floormod(j, 6) == 0)), T.float32(-2), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 5)), T.float32(1), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 4)), T.float32(0.5), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 3)), T.float32(-2), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 2)), T.float32(-1), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 1)), T.float32(1), T.Select(((T.floormod(i, 6) == 1) and (T.floormod(j, 6) == 0)), T.float32(-1.5), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 5)), T.float32(0), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 4)), T.float32(0), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 3)), T.float32(0), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 2)), T.float32(0), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 1)), T.float32(0), T.Select(((T.floormod(i, 6) == 0) and (T.floormod(j, 6) == 0)), T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))  # type: ignore
-            # fmt: on
-    for i0_4, i1_4, i2_3, i3_3, i4, i5 in T.grid(6, 6, 9, 128, 6, 6):
-        with T.block("data_pack"):
-            eps_1, nu_1, p_1, ci_1, r_a, r_b = T.axis.remap(
-                "SSSSRR", [i0_4, i1_4, i2_3, i3_3, i4, i5]
-            )
-            T.block_attr({"schedule_rule": "meta_schedule.winograd_data_pack.cuda"})
-            T.reads(
-                [
-                    data_pack[eps_1, nu_1, p_1, ci_1],
-                    input_tile[r_a, r_b, p_1, ci_1],
-                    B[
-                        T.min(r_a, r_b) : (  # type: ignore
-                            T.min(r_a, r_b) + ((T.max(r_a, r_b) + 1) - T.min(r_a, r_b))  # type: ignore
-                        ),
-                        T.min(eps_1, nu_1) : (  # type: ignore
-                            T.min(eps_1, nu_1) + ((T.max(eps_1, nu_1) + 1) - T.min(eps_1, nu_1))  # type: ignore
-                        ),
-                    ],
-                ]
-            )
-            T.writes([data_pack[eps_1, nu_1, p_1, ci_1]])
-            with T.init():
-                data_pack[eps_1, nu_1, p_1, ci_1] = T.float32(0)
-            data_pack[eps_1, nu_1, p_1, ci_1] = data_pack[eps_1, nu_1, p_1, ci_1] + (
-                (input_tile[r_a, r_b, p_1, ci_1] * B[r_a, eps_1]) * B[r_b, nu_1]
-            )
-    for i0_5, i1_5, i2_4, i3_4, i4_1 in T.grid(6, 6, 9, 128, 128):
-        with T.block("bgemm"):
-            eps_2, nu_2, p_2, co, ci_2 = T.axis.remap("SSSSR", [i0_5, i1_5, i2_4, i3_4, i4_1])
-            T.block_attr({"meta_schedule.write_cache_level": [3]})
-            T.reads(
-                [
-                    bgemm[eps_2, nu_2, p_2, co],
-                    data_pack[eps_2, nu_2, p_2, ci_2],
-                    placeholder_1[eps_2, nu_2, co, ci_2],
-                ]
-            )
-            T.writes([bgemm[eps_2, nu_2, p_2, co]])
-            with T.init():
-                bgemm[eps_2, nu_2, p_2, co] = T.float32(0)
-            bgemm[eps_2, nu_2, p_2, co] = bgemm[eps_2, nu_2, p_2, co] + (
-                data_pack[eps_2, nu_2, p_2, ci_2] * placeholder_1[eps_2, nu_2, co, ci_2]
-            )
-    for i0_6, i1_6 in T.grid(6, 4):
-        with T.block("A"):
-            i_1, j_1 = T.axis.remap("SS", [i0_6, i1_6])
-            T.block_attr({"schedule_rule": "meta_schedule.compute_inline"})
-            T.writes([A[i_1, j_1]])
-            # fmt: off
-            A[i_1, j_1] = T.Select(((T.floormod(i_1, 6) == 5) and (T.floormod(j_1, 4) == 3)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 5) and (T.floormod(j_1, 4) == 2)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 5) and (T.floormod(j_1, 4) == 1)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 5) and (T.floormod(j_1, 4) == 0)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 4) and (T.floormod(j_1, 4) == 3)), T.float32(-8), T.Select(((T.floormod(i_1, 6) == 4) and (T.floormod(j_1, 4) == 2)), T.float32(4), T.Select(((T.floormod(i_1, 6) == 4) and (T.floormod(j_1, 4) == 1)), T.float32(-2), T.Select(((T.floormod(i_1, 6) == 4) and (T.floormod(j_1, 4) == 0)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 3) and (T.floormod(j_1, 4) == 3)), T.float32(0.125), T.Select(((T.floormod(i_1, 6) == 3) and (T.floormod(j_1, 4) == 2)), T.float32(0.25), T.Select(((T.floormod(i_1, 6) == 3) and (T.floormod(j_1, 4) == 1)), T.float32(0.5), T.Select(((T.floormod(i_1, 6) == 3) and (T.floormod(j_1, 4) == 0)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 2) and (T.floormod(j_1, 4) == 3)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 2) and (T.floormod(j_1, 4) == 2)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 2) and (T.floormod(j_1, 4) == 1)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 2) and (T.floormod(j_1, 4) == 0)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 1) and (T.floormod(j_1, 4) == 3)), T.float32(-1), T.Select(((T.floormod(i_1, 6) == 1) and (T.floormod(j_1, 4) == 2)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 1) and (T.floormod(j_1, 4) == 1)), T.float32(-1), T.Select(((T.floormod(i_1, 6) == 1) and (T.floormod(j_1, 4) == 0)), T.float32(1), T.Select(((T.floormod(i_1, 6) == 0) and (T.floormod(j_1, 4) == 3)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 0) and (T.floormod(j_1, 4) == 2)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 0) and (T.floormod(j_1, 4) == 1)), T.float32(0), T.Select(((T.floormod(i_1, 6) == 0) and (T.floormod(j_1, 4) == 0)), T.float32(1), T.float32(0)))))))))))))))))))))))))  # type: ignore
-            # fmt: on
-    for i0_7, i1_7, i2_5, i3_5, i4_2, i5_1 in T.grid(4, 4, 9, 128, 6, 6):
-        with T.block("inverse"):
-            vh, vw, p_3, co_1, r_a_1, r_b_1 = T.axis.remap(
-                "SSSSRR", [i0_7, i1_7, i2_5, i3_5, i4_2, i5_1]
-            )
-            T.block_attr({"schedule_rule": "meta_schedule.winograd_inverse.cuda"})
-            T.reads(
-                [
-                    inverse[vh, vw, p_3, co_1],
-                    bgemm[r_a_1, r_b_1, p_3, co_1],
-                    A[
-                        T.min(r_a_1, r_b_1) : (  # type: ignore
-                            T.min(r_a_1, r_b_1) + ((T.max(r_a_1, r_b_1) + 1) - T.min(r_a_1, r_b_1))  # type: ignore
-                        ),
-                        T.min(vh, vw) : (T.min(vh, vw) + ((T.max(vh, vw) + 1) - T.min(vh, vw))),  # type: ignore
-                    ],
-                ]
-            )
-            T.writes([inverse[vh, vw, p_3, co_1]])
-            with T.init():
-                inverse[vh, vw, p_3, co_1] = T.float32(0)
-            inverse[vh, vw, p_3, co_1] = inverse[vh, vw, p_3, co_1] + (
-                (bgemm[r_a_1, r_b_1, p_3, co_1] * A[r_a_1, vh]) * A[r_b_1, vw]
-            )
-    for i0_8, i1_8, i2_6, i3_6 in T.grid(1, 12, 12, 128):
-        with T.block("conv2d_winograd"):
-            n, h, w, co_2 = T.axis.remap("SSSS", [i0_8, i1_8, i2_6, i3_6])
-            T.reads(
-                [
-                    inverse[
-                        T.floormod(h, 4),  # type: ignore
-                        T.floormod(w, 4),  # type: ignore
-                        (((n * 9) + (T.floordiv(h, 4) * 3)) + T.floordiv(w, 4)),  # type: ignore
-                        co_2,
-                    ]
-                ]
-            )
-            T.writes([conv2d_winograd[n, h, w, co_2]])
-            conv2d_winograd[n, h, w, co_2] = inverse[
-                T.floormod(h, 4),  # type: ignore
-                T.floormod(w, 4),  # type: ignore
-                (((n * 9) + (T.floordiv(h, 4) * 3)) + T.floordiv(w, 4)),  # type: ignore
-                co_2,
-            ]
diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py
index 6d1cd7f1604c..20abcfce3dc1 100644
--- a/python/tvm/meta_schedule/testing/relay_workload.py
+++ b/python/tvm/meta_schedule/testing/relay_workload.py
@@ -232,7 +232,6 @@ def get_network(
     inputs : Tuple[str, List[int], str]
         The name, shape and dtype of the input tensor.
     """
-
     mod: IRModule
     params: Dict[str, NDArray]
     inputs: Tuple[str, List[int], str]
diff --git a/python/tvm/meta_schedule/testing/space_generation.py b/python/tvm/meta_schedule/testing/space_generation.py
index 5ac20f8fdf2f..0b7072b65afe 100644
--- a/python/tvm/meta_schedule/testing/space_generation.py
+++ b/python/tvm/meta_schedule/testing/space_generation.py
@@ -127,7 +127,7 @@ def check_sketches(
 def print_sketches(sketches: List[Schedule]):
     for i, sch in enumerate(sketches):
         print(f"###### {i}")
-        print(sch.mod.script())
+        sch.mod.show()
         for inst in sch.trace.insts:
             if inst in sch.trace.decisions:
                 print(f'("{inst.kind.name}", {sch.trace.decisions[inst]}),')
diff --git a/python/tvm/meta_schedule/testing/te_workload.py b/python/tvm/meta_schedule/testing/te_workload.py
index 6fac1c2960ac..cdc430087542 100644
--- a/python/tvm/meta_schedule/testing/te_workload.py
+++ b/python/tvm/meta_schedule/testing/te_workload.py
@@ -19,6 +19,7 @@
 from typing import Tuple
 
 from tvm import te, tir, topi
+from tvm.target import Target
 
 
 def batch_matmul_nkkm(  # pylint: disable=invalid-name,missing-docstring
@@ -519,93 +520,68 @@ def conv2d_winograd_nhwc(  # pylint: disable=invalid-name,missing-docstring
     stride: int = 1,
     padding: int = 0,
     dilation: int = 1,
+    tile_size: int = 4,
 ) -> Tuple[te.Tensor, te.Tensor, te.Tensor]:
-    tile_size = 4  # _infer_tile_size(data, kernel)
-    inputs = te.placeholder((N, H, W, CI), name="inputs")
-    N, H, W, CI = topi.utils.get_const_tuple(inputs.shape)
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
-
-    KH = KW = kernel_size
-    HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW))
-    HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride
-    assert HSTR == 1 and WSTR == 1 and KH == KW
-
-    data_pad = topi.nn.pad(inputs, (0, HPAD, WPAD, 0), (0, HPAD, WPAD, 0), name="data_pad")
-
-    r = KW
-    m = tile_size
-    alpha = m + r - 1
-    A, B, _G = topi.nn.winograd_util.winograd_transform_matrices(m, r, "float32")
-
-    H = (H + 2 * HPAD - KH) // HSTR + 1
-    W = (W + 2 * WPAD - KW) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW
-    _rkh = te.reduce_axis((0, KH), name="r_kh")
-    _rkw = te.reduce_axis((0, KW), name="r_kw")
-    kshape = (alpha, alpha, CI, CO)
-    kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight")
-
-    idxdiv = te.indexdiv
-    idxmod = te.indexmod
-    # pack input tile
-    input_tile = te.compute(
-        (alpha, alpha, P, CI),
-        lambda eps, nu, p, ci: data_pad[idxdiv(p, (nH * nW))][idxmod(idxdiv(p, nW), nH) * m + eps][
-            idxmod(p, nW) * m + nu
-        ][ci],
-        name="input_tile",
-    )
-
-    # transform data
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    data_pack = te.compute(
-        (alpha, alpha, P, CI),
-        lambda eps, nu, p, ci: te.sum(
-            input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
-        ),
-        name="data_pack",
-        attrs={"auto_scheduler_simplify_const_tensor_indices": ["eps", "nu", "r_a", "r_b"]},
+    from tvm.topi.nn.conv2d import (  # pylint: disable=import-outside-toplevel
+        _conv2d_winograd_nhwc_impl,
     )
 
-    # do batch gemm
-    ci = te.reduce_axis((0, CI), name="ci")
-    bgemm = te.compute(
-        (alpha, alpha, P, CO),
-        lambda eps, nu, p, co: te.sum(
-            data_pack[eps][nu][p][ci] * kernel_pack[eps][nu][ci][co], axis=[ci]
-        ),
-        name="bgemm",
+    target = Target.current(allow_none=True)
+    if target is not None and target.kind.name == "cuda":
+        write_cache_level = 3
+    else:
+        write_cache_level = 2
+    data = te.placeholder((N, H, W, CI), "float32", name="data")
+    weight = te.placeholder((kernel_size, kernel_size, CO, CI), "float32", name="weight")
+    out = _conv2d_winograd_nhwc_impl(
+        data,
+        weight,
+        stride,
+        padding,
+        dilation,
+        "float32",
+        pre_computed=True,
+        auto_scheduler_rewritten_layout="",
+        meta_schedule_original_shape=None,
+        tile_size=tile_size,
+        write_cache_level=write_cache_level,
+    )
+    return (data, weight, out)
+
+
+def conv2d_winograd_nchw(  # pylint: disable=invalid-name,missing-docstring
+    N: int,
+    H: int,
+    W: int,
+    CI: int,
+    CO: int,
+    kernel_size: int,
+    stride: int = 1,
+    padding: int = 1,
+    dilation: int = 1,
+) -> Tuple[te.Tensor, te.Tensor, te.Tensor]:
+    from tvm.topi.cuda.conv2d_winograd import (  # pylint: disable=import-outside-toplevel
+        _infer_tile_size,
     )
-
-    # inverse transform
-    r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_b")
-    inverse = te.compute(
-        (m, m, P, CO),
-        lambda vh, vw, p, co: te.sum(
-            bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]
-        ),
-        name="inverse",
-        attrs={"auto_scheduler_simplify_const_tensor_indices": ["vh", "vw", "r_a", "r_b"]},
+    from tvm.topi.nn.conv2d import (  # pylint: disable=import-outside-toplevel
+        _conv2d_winograd_nchw_impl,
     )
 
-    # output
-    output = te.compute(
-        (N, H, W, CO),
-        lambda n, h, w, co: inverse[
-            idxmod(h, m), idxmod(w, m), n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), co
-        ],
-        name="conv2d_winograd",
+    data = te.placeholder((N, CI, H, W), "float32", name="data")
+    weight = te.placeholder((kernel_size, kernel_size, CI, CO), "float32", name="weight")
+    out = _conv2d_winograd_nchw_impl(
+        data,
+        weight,
+        stride,
+        padding,
+        dilation,
+        "float32",
+        pre_computed=True,
+        auto_scheduler_rewritten_layout="",
+        meta_schedule_original_shape=None,
+        tile_size=_infer_tile_size(data, weight),
     )
-
-    return (inputs, kernel_pack, output)
+    return (data, weight, out)
 
 
 def matmul(
@@ -833,7 +809,7 @@ def create_te_workload(name: str, idx: int) -> tir.PrimFunc:
     "T2D": (
         conv2d_transpose_nhwc,
         [
-            # all conv2d tranpose layers in DCGAN
+            # all conv2d transpose layers in DCGAN
             (1, 4, 4, 512, 256, 4, 2, 1),
             (1, 8, 8, 256, 128, 4, 2, 1),
             (1, 16, 16, 128, 64, 4, 2, 1),
@@ -886,4 +862,16 @@ def create_te_workload(name: str, idx: int) -> tir.PrimFunc:
             (1, 128, 12, 128),
         ],
     ),
+    "C2D_WIN_NHWC": (
+        conv2d_winograd_nhwc,
+        [
+            (1, 14, 14, 128, 128, 6),
+        ],
+    ),
+    "C2D_WIN_NCHW": (
+        conv2d_winograd_nchw,
+        [
+            (1, 56, 56, 64, 64, 6),
+        ],
+    ),
 }
diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py
index 173f31ef08f9..5594e36cb855 100644
--- a/python/tvm/relay/backend/te_compiler.py
+++ b/python/tvm/relay/backend/te_compiler.py
@@ -24,7 +24,7 @@
 import tvm
 from tvm import autotvm, te
 from tvm.auto_scheduler import is_auto_scheduler_enabled
-from tvm.meta_schedule import is_meta_schedule_dispatch_enabled
+from tvm.meta_schedule import is_meta_schedule_enabled
 from tvm.runtime import Object
 from tvm.support import libinfo
 from tvm.target import Target
@@ -181,7 +181,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True)
 
     # Disable autotvm if auto_scheduler is enabled.
     # (i.e., always return the implementation with the highest priority for auto-scheduler).
-    if is_auto_scheduler_enabled() or is_meta_schedule_dispatch_enabled():
+    if is_auto_scheduler_enabled() or is_meta_schedule_enabled():
         use_autotvm = False
 
     # If not use autotvm, always return the implementation with the highest priority
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 90a94c422992..53aec11e5816 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -459,7 +459,7 @@ def convert_conv3d(attrs, inputs, tinfos, desired_layouts):
 # conv3d_winograd related operators
 reg.register_strategy(
     "nn.contrib_conv3d_winograd_without_weight_transform",
-    strategy.conv3d_winograd_without_weight_transfrom_strategy,
+    strategy.conv3d_winograd_without_weight_transform_strategy,
 )
 
 
@@ -733,7 +733,7 @@ def mirror_pad_func(attrs, inputs, _):
 # conv2d_winograd related operators
 reg.register_strategy(
     "nn.contrib_conv2d_winograd_without_weight_transform",
-    strategy.conv2d_winograd_without_weight_transfrom_strategy,
+    strategy.conv2d_winograd_without_weight_transform_strategy,
 )
 
 
diff --git a/python/tvm/relay/op/strategy/adreno.py b/python/tvm/relay/op/strategy/adreno.py
index 011622d5374f..21252215fc28 100644
--- a/python/tvm/relay/op/strategy/adreno.py
+++ b/python/tvm/relay/op/strategy/adreno.py
@@ -162,14 +162,14 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target):
     return strategy
 
 
-@conv2d_winograd_without_weight_transfrom_strategy.register("adreno")
-def conv2d_winograd_without_weight_transfrom_strategy_adreno(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transfrom adreno strategy"""
+@conv2d_winograd_without_weight_transform_strategy.register("adreno")
+def conv2d_winograd_without_weight_transform_strategy_adreno(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transform adreno strategy"""
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
     layout = attrs.data_layout
     assert dilation == (1, 1), "Do not support dilate now"
-    assert groups == 1, "Do not supoort arbitrary group number"
+    assert groups == 1, "Do not support arbitrary group number"
     strategy = _op.OpStrategy()
     if layout in ("NCHW", "NCHW4c"):
         strategy.add_implementation(
@@ -187,7 +187,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_adreno(attrs, inputs, out_
         )
     else:
         raise RuntimeError(
-            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
+            "Unsupported conv2d_winograd_without_weight_transform layout {}".format(layout)
         )
     return strategy
 
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index b7650480d0e4..5c25696a1ee1 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -395,9 +395,9 @@ def _compute_conv2d_nnpack(attrs, inputs, out_type):
     return _compute_conv2d_nnpack
 
 
-@conv2d_winograd_without_weight_transfrom_strategy.register("arm_cpu")
-def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transfrom arm cpu strategy"""
+@conv2d_winograd_without_weight_transform_strategy.register("arm_cpu")
+def conv2d_winograd_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transform arm cpu strategy"""
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
     layout = attrs.data_layout
@@ -405,7 +405,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out
     kernel = inputs[1]
     assert dilation == (1, 1), "Do not support dilate now"
     assert strides == (1, 1), "Do not support strides now"
-    assert groups == 1, "Do not supoort arbitrary group number"
+    assert groups == 1, "Do not support arbitrary group number"
     strategy = _op.OpStrategy()
     if layout == "NCHW":
         if len(kernel.shape) == 5:
@@ -436,7 +436,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out
             raise RuntimeError("Unsupported kernel shape: {}".format(kernel.shape))
     else:
         raise RuntimeError(
-            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
+            "Unsupported conv2d_winograd_without_weight_transform layout {}".format(layout)
         )
     return strategy
 
@@ -463,7 +463,7 @@ def _compute_conv2d_gemm(attrs, inputs, out_type):
 
 @conv2d_gemm_without_weight_transform_strategy.register("arm_cpu")
 def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transfrom arm cpu strategy"""
+    """conv2d_winograd_without_weight_transform arm cpu strategy"""
     layout = attrs.data_layout
     data = inputs[0]
     strategy = _op.OpStrategy()
diff --git a/python/tvm/relay/op/strategy/bifrost.py b/python/tvm/relay/op/strategy/bifrost.py
index ec3edab2c8b1..46ebb6048c2d 100644
--- a/python/tvm/relay/op/strategy/bifrost.py
+++ b/python/tvm/relay/op/strategy/bifrost.py
@@ -100,16 +100,16 @@ def conv2d_strategy_bifrost(attrs, inputs, out_type, target):
     return strategy
 
 
-@conv2d_winograd_without_weight_transfrom_strategy.register("bifrost")
-def conv2d_winograd_without_weight_transfrom_strategy_bifrost(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transfrom mali(bifrost) strategy"""
+@conv2d_winograd_without_weight_transform_strategy.register("bifrost")
+def conv2d_winograd_without_weight_transform_strategy_bifrost(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transform mali(bifrost) strategy"""
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
     layout = attrs.data_layout
     strides = attrs.get_int_tuple("strides")
     assert dilation == (1, 1), "Do not support dilate now"
     assert strides == (1, 1), "Do not support strides now"
-    assert groups == 1, "Do not supoort arbitrary group number"
+    assert groups == 1, "Do not support arbitrary group number"
     strategy = _op.OpStrategy()
     if layout == "NCHW":
         strategy.add_implementation(
@@ -119,7 +119,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_bifrost(attrs, inputs, out
         )
     else:
         raise RuntimeError(
-            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
+            "Unsupported conv2d_winograd_without_weight_transform layout {}".format(layout)
         )
     return strategy
 
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 9bedfe8cb038..312ec0fe2f97 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -145,7 +145,6 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
     kernel_layout = attrs.kernel_layout
     if dilation_h < 1 or dilation_w < 1:
         raise ValueError("dilation should be positive value")
-
     if groups == 1:
         if layout == "NCHW":
             assert kernel_layout == "OIHW"
@@ -166,9 +165,34 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
                     wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                     name="conv2d_nchw.cuda",
                 )
-            _, _, kh, kw = get_const_tuple(kernel.shape)
-            if (
-                (2 < kh < 8 and 2 < kw < 8 and kh == kw)
+            N, _, H, W = get_const_tuple(data.shape)
+            CO, CI, KH, KW = get_const_tuple(kernel.shape)
+            (_, _, judge_winograd_auto_scheduler) = judge_winograd(
+                N,
+                H,
+                W,
+                KH,
+                KW,
+                CI,
+                CO,
+                padding,
+                stride_h,
+                stride_w,
+                dilation_h,
+                dilation_w,
+                data.dtype,
+                kernel.dtype,
+                pre_flag=False,
+            )
+            if is_meta_schedule_enabled() and judge_winograd_auto_scheduler:
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nchw),
+                    naive_schedule,  # this implementation should never be picked by autotvm
+                    name="conv2d_nchw_winograd.cuda",
+                    plevel=15,
+                )
+            elif (
+                (2 < KH < 8 and 2 < KW < 8 and KH == KW)
                 and (stride_h == 1 and stride_w == 1)
                 and (dilation_h == 1 and dilation_w == 1)
             ):
@@ -490,9 +514,9 @@ def judge_winograd(
     return judge_winograd_tensorcore, judge_winograd_autotvm, judge_winograd_auto_scheduler
 
 
-@conv2d_winograd_without_weight_transfrom_strategy.register(["cuda", "gpu"])
-def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transfrom cuda strategy"""
+@conv2d_winograd_without_weight_transform_strategy.register(["cuda", "gpu"])
+def conv2d_winograd_without_weight_transform_strategy_cuda(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transform cuda strategy"""
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
     layout = attrs.data_layout
@@ -500,14 +524,24 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty
     stride_h, stride_w = attrs.get_int_tuple("strides")
     padding = attrs.get_int_tuple("padding")
     assert dilation == (1, 1), "Do not support dilate now"
-    assert groups == 1, "Do not supoort arbitrary group number"
+    assert groups == 1, "Do not support arbitrary group number"
     strategy = _op.OpStrategy()
     if layout == "NCHW":
-        strategy.add_implementation(
-            wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform),
-            wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
-            name="conv2d_nchw_winograd_without_weight_transform.cuda",
-        )
+        if is_meta_schedule_enabled():
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.nn.conv2d_winograd_nchw_without_weight_transform),
+                naive_schedule,  # this implementation should never be picked by autotvm
+                name="conv2d_nchw_winograd_without_weight_transform",
+                plevel=15,
+            )
+        else:
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform),
+                wrap_topi_schedule(
+                    topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform
+                ),
+                name="conv2d_nchw_winograd_without_weight_transform.cuda",
+            )
     elif layout == "NHWC":
         N, H, W, _ = get_const_tuple(data.shape)
         alpha, _, CI, CO = get_const_tuple(kernel.shape)
@@ -568,7 +602,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty
             )
     else:
         raise RuntimeError(
-            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
+            "Unsupported conv2d_winograd_without_weight_transform layout {}".format(layout)
         )
     return strategy
 
@@ -744,14 +778,14 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target):
     return strategy
 
 
-@conv3d_winograd_without_weight_transfrom_strategy.register(["cuda", "gpu"])
-def conv3d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target):
-    """conv3d_winograd_without_weight_transfrom cuda strategy"""
+@conv3d_winograd_without_weight_transform_strategy.register(["cuda", "gpu"])
+def conv3d_winograd_without_weight_transform_strategy_cuda(attrs, inputs, out_type, target):
+    """conv3d_winograd_without_weight_transform cuda strategy"""
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
     layout = attrs.data_layout
     assert dilation == (1, 1, 1), "Do not support dilate now"
-    assert groups == 1, "Do not supoort arbitrary group number"
+    assert groups == 1, "Do not support arbitrary group number"
     strategy = _op.OpStrategy()
     if layout == "NCDHW":
         strategy.add_implementation(
@@ -761,7 +795,7 @@ def conv3d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty
         )
     else:
         raise RuntimeError(
-            "Unsupported conv3d_winograd_without_weight_transfrom layout {}".format(layout)
+            "Unsupported conv3d_winograd_without_weight_transform layout {}".format(layout)
         )
     return strategy
 
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 6ab281abeb37..1cf55f7145cd 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -386,15 +386,15 @@ def depthwise_conv2d_NCHWc_strategy(attrs, inputs, out_type, target):
 
 # conv2d_winograd_without_weight_transform
 @override_native_generic_func("conv2d_winograd_without_weight_transform_strategy")
-def conv2d_winograd_without_weight_transfrom_strategy(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transfrom generic strategy"""
+def conv2d_winograd_without_weight_transform_strategy(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transform generic strategy"""
     raise ValueError("No generic implemenation for conv2d_winograd_without_weight_transform")
 
 
 # conv2d_gemm_without_weight_transform
 @override_native_generic_func("conv2d_gemm_without_weight_transform_strategy")
 def conv2d_gemm_without_weight_transform_strategy(attrs, inputs, out_type, target):
-    """conv2d_gemm_without_weight_transfrom generic strategy"""
+    """conv2d_gemm_without_weight_transform generic strategy"""
     raise ValueError("No generic implemenation for conv2d_gemm_without_weight_transform")
 
 
@@ -619,8 +619,8 @@ def conv3d_strategy(attrs, inputs, out_type, target):
 
 # conv3d_winograd_without_weight_transform
 @override_native_generic_func("conv3d_winograd_without_weight_transform_strategy")
-def conv3d_winograd_without_weight_transfrom_strategy(attrs, inputs, out_type, target):
-    """conv3d_winograd_without_weight_transfrom generic strategy"""
+def conv3d_winograd_without_weight_transform_strategy(attrs, inputs, out_type, target):
+    """conv3d_winograd_without_weight_transform generic strategy"""
     raise ValueError("No generic implemenation for conv3d_winograd_without_weight_transform")
 
 
diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py
index dca684835ba4..c39487b16d55 100644
--- a/python/tvm/relay/op/strategy/mali.py
+++ b/python/tvm/relay/op/strategy/mali.py
@@ -169,9 +169,9 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target):
     return strategy
 
 
-@conv2d_winograd_without_weight_transfrom_strategy.register("mali")
-def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transfrom mali strategy"""
+@conv2d_winograd_without_weight_transform_strategy.register("mali")
+def conv2d_winograd_without_weight_transform_strategy_mali(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transform mali strategy"""
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
     layout = attrs.data_layout
@@ -179,7 +179,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty
     kernel = inputs[1]
     assert dilation == (1, 1), "Do not support dilate now"
     assert strides == (1, 1), "Do not support strides now"
-    assert groups == 1, "Do not supoort arbitrary group number"
+    assert groups == 1, "Do not support arbitrary group number"
     strategy = _op.OpStrategy()
     if layout == "NCHW":
         assert len(kernel.shape) == 5, "Kernel must be packed into 5-dim"
@@ -208,7 +208,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty
             )
     else:
         raise RuntimeError(
-            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
+            "Unsupported conv2d_winograd_without_weight_transform layout {}".format(layout)
         )
     return strategy
 
diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 6575e0f5c5a2..10d7fbb3a926 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -764,16 +764,16 @@ def scatter_nd_strategy_cpu(attrs, inputs, out_type, target):
     return strategy
 
 
-@conv2d_winograd_without_weight_transfrom_strategy.register("cpu")
-def conv2d_winograd_without_weight_transfrom_strategy_cpu(attrs, inputs, out_type, target):
-    """conv2d_winograd_without_weight_transfrom cpu strategy"""
+@conv2d_winograd_without_weight_transform_strategy.register("cpu")
+def conv2d_winograd_without_weight_transform_strategy_cpu(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transform cpu strategy"""
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
     layout = attrs.data_layout
     strides = attrs.get_int_tuple("strides")
     assert dilation == (1, 1), "Do not support dilate now"
     assert strides == (1, 1), "Do not support strides now"
-    assert groups == 1, "Do not supoort arbitrary group number"
+    assert groups == 1, "Do not support arbitrary group number"
     strategy = _op.OpStrategy()
     need_auto_scheduler_layout = is_auto_scheduler_enabled()
     need_meta_schedule_layout = is_meta_schedule_enabled()
@@ -802,7 +802,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_cpu(attrs, inputs, out_typ
             raise RuntimeError("Both AutoScheduler and MetaSchedule are not enabled")
     else:
         raise RuntimeError(
-            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
+            "Unsupported conv2d_winograd_without_weight_transform layout {}".format(layout)
         )
     return strategy
 
diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index 35d50eb3673c..93512ca07d9e 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -18,15 +18,15 @@
 """Conv2D alter op and legalize functions for cuda backend"""
 
 import logging
+
 import tvm
-from tvm import te, relay, autotvm
+from tvm import autotvm, relay, te
 
 from .. import nn
+from ..nn import conv2d_legalize
 from ..utils import get_const_tuple, is_target
 from .conv2d_winograd import _infer_tile_size
 from .tensorcore_alter_op import pad_to_tensorcore
-from ..nn import conv2d_legalize
-
 
 logger = logging.getLogger("topi")
 
@@ -61,24 +61,38 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 logger.warning("Does not support weight pre-transform for dilated convolution.")
                 return None
 
-            assert data_layout == "NHWC" and kernel_layout == "HWIO"
-            N, H, W, CI = get_const_tuple(data.shape)
-            KH, KW, _, CO = get_const_tuple(kernel.shape)
-
-            # Pre-compute weight transformation in winograd
-            tile_size = _infer_tile_size(tinfos[0], tinfos[1], layout="NHWC")
-
-            # HWIO -> OIHW
-            kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
-            # alpha, alpha, CO, CI
-            weight = relay.nn.contrib_conv2d_winograd_weight_transform(
-                kernel_transform, tile_size=tile_size
-            )
-            new_attrs["tile_size"] = tile_size
-            new_attrs["channels"] = CO
-            return relay.nn.contrib_conv2d_winograd_without_weight_transform(
-                inputs[0], weight, **new_attrs
-            )
+            if data_layout == "NHWC" and kernel_layout == "HWIO":
+                N, H, W, CI = get_const_tuple(data.shape)
+                KH, KW, _, CO = get_const_tuple(kernel.shape)
+                # Pre-compute weight transformation in winograd
+                tile_size = _infer_tile_size(tinfos[0], tinfos[1], layout="NHWC")
+                # HWIO -> OIHW
+                kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
+                # alpha, alpha, CO, CI
+                weight = relay.nn.contrib_conv2d_winograd_weight_transform(
+                    kernel_transform, tile_size=tile_size
+                )
+                new_attrs["tile_size"] = tile_size
+                new_attrs["channels"] = CO
+                return relay.nn.contrib_conv2d_winograd_without_weight_transform(
+                    inputs[0], weight, **new_attrs
+                )
+            elif data_layout == "NCHW" and kernel_layout == "OIHW":
+                N, CI, H, W = get_const_tuple(data.shape)
+                CO, _, KH, KW = get_const_tuple(kernel.shape)
+                # Pre-compute weight transformation in winograd
+                tile_size = _infer_tile_size(tinfos[0], tinfos[1], layout="NCHW")
+                # alpha, alpha, CO, CI
+                weight = relay.nn.contrib_conv2d_winograd_weight_transform(
+                    inputs[1], tile_size=tile_size
+                )
+                # alpha, alpha, CI, CO
+                weight = relay.transpose(weight, axes=[0, 1, 3, 2])
+                new_attrs["tile_size"] = tile_size
+                new_attrs["channels"] = CO
+                return relay.nn.contrib_conv2d_winograd_without_weight_transform(
+                    inputs[0], weight, **new_attrs
+                )
 
         return None
 
diff --git a/python/tvm/topi/cuda/conv2d_nhwc_winograd.py b/python/tvm/topi/cuda/conv2d_nhwc_winograd.py
index 8accbbe53273..77b332400d0b 100644
--- a/python/tvm/topi/cuda/conv2d_nhwc_winograd.py
+++ b/python/tvm/topi/cuda/conv2d_nhwc_winograd.py
@@ -408,7 +408,6 @@ def nhwc_winograd_cuda(
             input_tile[p][ci][r_a][r_b] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
         ),
         name="data_pack",
-        attrs={"schedule_rule": "meta_schedule.winograd_data_pack.cuda"},
     )
 
     # Convert data type of input feature maps and weights for tensorcore
@@ -433,14 +432,13 @@ def nhwc_winograd_cuda(
 
     # Inverse transform
     r_a = te.reduce_axis((0, alpha), "r_a")
-    r_b = te.reduce_axis((0, alpha), "r_a")
+    r_b = te.reduce_axis((0, alpha), "r_b")
     inverse = te.compute(
         (P, CO, m, m),
         lambda p, co, vh, vw: te.sum(
             bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]
         ),
         name="inverse",
-        attrs={"schedule_rule": "meta_schedule.winograd_inverse.cuda"},
     )
 
     # Output
diff --git a/python/tvm/topi/cuda/conv2d_winograd.py b/python/tvm/topi/cuda/conv2d_winograd.py
index 239d05844b40..eca51c921016 100644
--- a/python/tvm/topi/cuda/conv2d_winograd.py
+++ b/python/tvm/topi/cuda/conv2d_winograd.py
@@ -23,7 +23,12 @@
 from tvm import autotvm, te
 
 from .. import nn
-from ..nn.conv2d import _conv2d_winograd_nhwc_impl, conv2d_winograd_nhwc
+from ..nn.conv2d import (
+    _conv2d_winograd_nchw_impl,
+    _conv2d_winograd_nhwc_impl,
+    conv2d_winograd_nchw,
+    conv2d_winograd_nhwc,
+)
 from ..nn.winograd_util import winograd_transform_matrices
 from ..utils import get_const_int, get_const_tuple, traverse_inline
 
@@ -104,7 +109,6 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_
                 kernel[co][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]
             ),
             name="kernel_pack",
-            attrs={"schedule_rule": "meta_schedule.winograd_kernel_pack.nchw.cuda"},
         )
     else:
         kernel_pack = kernel
@@ -129,7 +133,6 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_
             input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
         ),
         name="data_pack",
-        attrs={"schedule_rule": "meta_schedule.winograd_data_pack.nchw.cuda"},
     )
 
     # do batch gemm
@@ -151,7 +154,6 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_
             bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]
         ),
         name="inverse",
-        attrs={"schedule_rule": "meta_schedule.winograd_inverse.nchw.cuda"},
     )
 
     # output
@@ -162,10 +164,6 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_
         ],
         name="output",
         tag="conv2d_nchw_winograd",
-        attrs={
-            "schedule_rule": "meta_schedule.winograd_output.nchw.cuda",
-            "winograd_tile_size": alpha - 3 + 1,
-        },
     )
 
     if isinstance(N, int):
@@ -391,3 +389,24 @@ def conv2d_winograd_nhwc_cuda(
     return _conv2d_winograd_nhwc_impl(
         data, weight, strides, padding, dilation, out_dtype, tile_size, pre_computed
     )
+
+
+@conv2d_winograd_nchw.register(["cuda", "gpu"])
+def conv2d_winograd_nchw_cuda(
+    data,
+    weight,
+    strides,
+    padding,
+    dilation,
+    out_dtype,
+    pre_computed=False,
+    auto_scheduler_rewritten_layout="",
+    meta_schedule_original_shape=None,
+):
+    """Conv2D Winograd in NCHW layout.
+    This is a clean version to be used by the auto-scheduler for both CPU and GPU.
+    """
+    tile_size = _infer_tile_size(data, weight, layout="NCHW")
+    return _conv2d_winograd_nchw_impl(
+        data, weight, strides, padding, dilation, out_dtype, tile_size, pre_computed
+    )
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index 5070c84c7e51..db1bcaa27694 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -548,7 +548,7 @@ def conv2d_NCHWc_int8(
             ),
             name="conv2d_NCHWc_int8",
             tag="conv2d_NCHWc_int8",
-            attrs={"schedule_rule": "meta_schedule.conv2d_NCHWc_int8"},
+            attrs={"schedule_rule": "conv2d_NCHWc_int8"},
         )
     # for int8 group conv support
     ic_chunk = in_channel // ic_bn
@@ -571,7 +571,7 @@ def conv2d_NCHWc_int8(
         ),
         name="conv2d_NCHWc_int8",
         tag="conv2d_NCHWc_int8",
-        attrs={"schedule_rule": "meta_schedule.conv2d_NCHWc_int8"},
+        attrs={"schedule_rule": "conv2d_NCHWc_int8"},
     )
 
 
@@ -989,6 +989,119 @@ def unpack_NCHWc_to_nchw(packed_out, out_dtype):
     return unpacked_out
 
 
+@tvm.target.generic_func
+def conv2d_winograd_nhwc(
+    data,
+    weight,
+    strides,
+    padding,
+    dilation,
+    out_dtype,
+    pre_computed=False,
+    auto_scheduler_rewritten_layout="",
+    meta_schedule_original_shape=None,
+):
+    """Conv2D Winograd in NHWC layout.
+    This is a clean version to be used by the auto-scheduler for both CPU and GPU.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        4-D with shape [batch, in_height, in_width, in_channel]
+    weight : tvm.te.Tensor
+        4-D with shape [filter_height, filter_width, in_channel, num_filter]
+    strides : int or a list/tuple of two ints
+        stride size, or [stride_height, stride_width]
+    padding : int or a list/tuple of two ints
+        padding size, or [pad_height, pad_width]
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+    out_dtype : str, optional
+        Specifies the output data type.
+    pre_computed: bool
+        Whether the kernel is precomputed
+    auto_scheduler_rewritten_layout: str = ""
+        The layout after auto-scheduler's layout rewrite pass.
+    meta_schedule_original_shape: Optional[List[PrimExpr]] = None
+        The original shape of the input tensor.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        4-D with shape [batch, out_height, out_width, out_channel]
+    """
+    tile_size = 4
+    return _conv2d_winograd_nhwc_impl(
+        data,
+        weight,
+        strides,
+        padding,
+        dilation,
+        out_dtype,
+        tile_size,
+        pre_computed=pre_computed,
+        write_cache_level=2,
+        auto_scheduler_rewritten_layout=auto_scheduler_rewritten_layout,
+        meta_schedule_original_shape=meta_schedule_original_shape,
+    )
+
+
+@tvm.target.generic_func
+def conv2d_winograd_nchw(
+    data,
+    weight,
+    strides,
+    padding,
+    dilation,
+    out_dtype,
+    pre_computed=False,
+    auto_scheduler_rewritten_layout="",
+    meta_schedule_original_shape=None,
+):
+    """Conv2D Winograd in NCHW layout.
+    This is a clean version to be used by the auto-scheduler for both CPU and GPU.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+    weight : tvm.te.Tensor
+        4-D with shape [filter_height, filter_width, in_channel, num_filter]
+    strides : int or a list/tuple of two ints
+        stride size, or [stride_height, stride_width]
+    padding : int or a list/tuple of two ints
+        padding size, or [pad_height, pad_width]
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+    out_dtype : str, optional
+        Specifies the output data type.
+    pre_computed: bool
+        Whether the kernel is precomputed
+    auto_scheduler_rewritten_layout: str = ""
+        The layout after auto-scheduler's layout rewrite pass.
+    meta_schedule_original_shape: Optional[List[PrimExpr]] = None
+        The original shape of the input tensor.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        4-D with shape [batch, out_height, out_width, out_channel]
+    """
+    tile_size = 4
+    return _conv2d_winograd_nchw_impl(
+        data,
+        weight,
+        strides,
+        padding,
+        dilation,
+        out_dtype,
+        tile_size,
+        pre_computed,
+        auto_scheduler_rewritten_layout,
+        meta_schedule_original_shape,
+    )
+
+
 def _conv2d_winograd_nhwc_impl(
     data,
     weight,
@@ -998,6 +1111,7 @@ def _conv2d_winograd_nhwc_impl(
     out_dtype,
     tile_size,
     pre_computed=False,
+    write_cache_level=None,
     auto_scheduler_rewritten_layout="",
     meta_schedule_original_shape=None,
 ):
@@ -1022,6 +1136,8 @@ def _conv2d_winograd_nhwc_impl(
         The size of the tile to use for the Winograd filter
     pre_computed: bool = False
         Whether the kernel is precomputed
+    write_cache_level: Optional[int] = None
+        The cache level to write to in multi-level tiling rule in MetaSchedule.
     auto_scheduler_rewritten_layout: str = ""
         The layout after auto-scheduler's layout rewrite pass.
     meta_schedule_original_shape: Optional[List[PrimExpr]] = None
@@ -1085,45 +1201,48 @@ def _conv2d_winograd_nhwc_impl(
         kernel_pack = te.compute(
             (alpha, alpha, CO, CI),
             lambda eps, nu, co, ci: te.sum(
-                weight[r_kh][r_kw][ci][co] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]
+                weight[r_kh, r_kw, ci, co] * G[eps, r_kh] * G[nu, r_kw],
+                axis=[r_kh, r_kw],
             ),
             name="kernel_pack",
         )
-        attrs = {}
+        bgemm_attrs = {}
     else:
         kernel_pack = weight
-        attrs = {"layout_free_placeholders": [kernel_pack]}
+        bgemm_attrs = {"layout_free_placeholders": [kernel_pack]}
+    if write_cache_level is not None:
+        if not isinstance(write_cache_level, int):
+            bgemm_attrs["meta_schedule.write_cache_level"] = write_cache_level
+        else:
+            bgemm_attrs["meta_schedule.write_cache_level"] = [write_cache_level]
 
     # pack data tile
     input_tile = te.compute(
         (alpha, alpha, P, CI),
-        lambda eps, nu, p, ci: data_pad[p // (nH * nW)][((p // nW) % nH) * m + eps][
-            (p % nW) * m + nu
-        ][ci],
+        lambda eps, nu, p, ci: data_pad[
+            p // (nH * nW),
+            ((p // nW) % nH) * m + eps,
+            (p % nW) * m + nu,
+            ci,
+        ],
         name="input_tile",
         attrs={"schedule_rule": "None"},
     )
 
     # transform data
-    target = tvm.target.Target.current(allow_none=True)
-    if target is not None:
-        target_kind = "meta_schedule.winograd_data_pack." + target.kind.name
-    else:
-        target_kind = "None"
-
     r_a = te.reduce_axis((0, alpha), "r_a")
     r_b = te.reduce_axis((0, alpha), "r_b")
     data_pack = te.compute(
         (alpha, alpha, P, CI),
         lambda eps, nu, p, ci: te.sum(
-            input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
+            input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu],
+            axis=[r_a, r_b],
         ),
         name="data_pack",
         attrs={
             "auto_scheduler_simplify_const_tensor_indices": ["eps", "nu", "r_a", "r_b"],
-            "schedule_rule": target_kind,
+            "schedule_rule": "conv2d_nhwc_winograd_data_pack",
         },
-        # the attrs are necessary hints for the auto-scheduler
     )
 
     # do batch gemm
@@ -1131,59 +1250,211 @@ def _conv2d_winograd_nhwc_impl(
     bgemm = te.compute(
         (alpha, alpha, P, CO),
         lambda eps, nu, p, co: te.sum(
-            data_pack[eps][nu][p][ci] * kernel_pack[eps][nu][co][ci], axis=[ci]
+            data_pack[eps, nu, p, ci] * kernel_pack[eps, nu, co, ci],
+            axis=[ci],
         ),
         name="bgemm",
-        attrs=attrs,
+        attrs=bgemm_attrs,
     )
 
     if auto_scheduler_rewritten_layout:
         bgemm = auto_scheduler.rewrite_compute_body(bgemm, auto_scheduler_rewritten_layout)
 
     # inverse transform
-    if target is not None:
-        target_kind = "meta_schedule.winograd_inverse." + target.kind.name
-    else:
-        target_kind = "None"
 
     r_a = te.reduce_axis((0, alpha), "r_a")
     r_b = te.reduce_axis((0, alpha), "r_b")
     inverse = te.compute(
         (m, m, P, CO),
         lambda vh, vw, p, co: te.sum(
-            bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]
+            bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw],
+            axis=[r_a, r_b],
         ),
         name="inverse",
         attrs={
             "auto_scheduler_simplify_const_tensor_indices": ["vh", "vw", "r_a", "r_b"],
-            "schedule_rule": target_kind,
+            "schedule_rule": "conv2d_nhwc_winograd_inverse",
         },
-        # the attrs are necessary hints for the auto-scheduler
     )
 
     # output
     output = te.compute(
         (N, H, W, CO),
-        lambda n, h, w, co: inverse[h % m, w % m, n * nH * nW + (h // m) * nW + (w // m), co],
+        lambda n, h, w, co: inverse[
+            h % m,
+            w % m,
+            n * nH * nW + (h // m) * nW + (w // m),
+            co,
+        ],
         name="conv2d_winograd",
     )
 
     return output
 
 
-@tvm.target.generic_func
-def conv2d_winograd_nhwc(
+def _conv2d_winograd_nchw_impl(
     data,
     weight,
     strides,
     padding,
     dilation,
     out_dtype,
+    tile_size,
     pre_computed=False,
+    write_cache_level=None,
     auto_scheduler_rewritten_layout="",
     meta_schedule_original_shape=None,
 ):
-    """Conv2D Winograd in NHWC layout.
+    """
+    write_cache_level: Optional[int] = None
+        The cache level to write to in multi-level tiling rule in MetaSchedule.
+    """
+    del auto_scheduler_rewritten_layout
+
+    N, CI, H, W = get_const_tuple(data.shape)
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+    if meta_schedule_original_shape:
+        auto_scheduler.rewrite_tensor_shape(weight, meta_schedule_original_shape)
+
+    assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
+    HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
+
+    if not pre_computed:  # kernel tensor is raw tensor, do strict check
+        CO, CI, KH, KW = get_const_tuple(weight.shape)
+        alpha = KW + tile_size - 1
+        assert HSTR == 1 and WSTR == 1 and KH == KW
+    else:
+        alpha, _, CI, CO = get_const_tuple(weight.shape)
+        KH = KW = alpha + 1 - tile_size
+        assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1
+
+    pad_t, pad_l, pad_b, pad_r = get_pad_tuple(padding, (KH, KW))
+    assert HSTR == 1 and WSTR == 1 and KH == 3 and KW == 3
+
+    pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
+    data_pad = pad(
+        data,
+        (0, 0, pt, pl),
+        (0, 0, pb, pr),
+        name="data_pad",
+    )
+
+    r = KW
+    m = tile_size
+    A, B, G = winograd_transform_matrices(m, r, out_dtype)
+
+    H = (H + pt + pb - KH) // HSTR + 1
+    W = (W + pl + pr - KW) // WSTR + 1
+    nH, nW = (H + m - 1) // m, (W + m - 1) // m
+
+    P = N * nH * nW if isinstance(N, int) else nH * nW
+
+    # transform kernel
+    if not pre_computed:
+        r_kh = te.reduce_axis((0, KH), name="r_kh")
+        r_kw = te.reduce_axis((0, KW), name="r_kw")
+        kernel_pack = te.compute(
+            (alpha, alpha, CI, CO),
+            lambda eps, nu, ci, co: te.sum(
+                weight[co, ci, r_kh, r_kw] * G[eps, r_kh] * G[nu, r_kw],
+                axis=[r_kh, r_kw],
+            ),
+            name="kernel_pack",
+        )
+        bgemm_attrs = {}
+    else:
+        kernel_pack = weight
+        bgemm_attrs = {"layout_free_placeholders": [kernel_pack]}
+    if write_cache_level is not None:
+        if not isinstance(write_cache_level, int):
+            bgemm_attrs["meta_schedule.write_cache_level"] = write_cache_level
+        else:
+            bgemm_attrs["meta_schedule.write_cache_level"] = [write_cache_level]
+
+    # pack data tile
+    input_tile = te.compute(
+        (CI, P, alpha, alpha),
+        lambda ci, p, eps, nu: data_pad[
+            p // (nH * nW),
+            ci,
+            ((p // nW) % nH) * m + eps,
+            (p % nW) * m + nu,
+        ],
+        name="input_tile",
+        attrs={"schedule_rule": "None"},
+    )
+
+    # transform data
+    r_a = te.reduce_axis((0, alpha), "r_a")
+    r_b = te.reduce_axis((0, alpha), "r_b")
+    data_pack = te.compute(
+        (alpha, alpha, CI, P),
+        lambda eps, nu, ci, p: te.sum(
+            input_tile[ci, p, r_a, r_b] * B[r_a, eps] * B[r_b, nu],
+            axis=[r_a, r_b],
+        ),
+        name="data_pack",
+        attrs={
+            "schedule_rule": "conv2d_nchw_winograd_data_pack",
+        },
+    )
+
+    # do batch gemm
+    ci = te.reduce_axis((0, CI), name="ci")
+    bgemm = te.compute(
+        (alpha, alpha, CO, P),
+        lambda eps, nu, co, p: te.sum(
+            data_pack[eps, nu, ci, p] * kernel_pack[eps, nu, ci, co],
+            axis=[ci],
+        ),
+        name="bgemm",
+        attrs=bgemm_attrs,
+    )
+
+    # inverse transform
+    r_a = te.reduce_axis((0, alpha), "r_a")
+    r_b = te.reduce_axis((0, alpha), "r_b")
+    inverse = te.compute(
+        (CO, P, m, m),
+        lambda co, p, vh, vw: te.sum(
+            bgemm[r_a, r_b, co, p] * A[r_a, vh] * A[r_b, vw],
+            axis=[r_a, r_b],
+        ),
+        name="inverse",
+        attrs={
+            "schedule_rule": "conv2d_nchw_winograd_inverse",
+        },
+    )
+
+    # output
+    output = te.compute(
+        (N, CO, H, W),
+        lambda n, co, h, w: inverse[
+            co,
+            n * nH * nW + (h // m) * nW + (w // m),
+            h % m,
+            w % m,
+        ],
+        name="conv2d_winograd",
+    )
+
+    return output
+
+
+def conv2d_winograd_nhwc_without_weight_transform(
+    data,
+    weight,
+    strides,
+    padding,
+    dilation,
+    out_dtype,
+    auto_scheduler_rewritten_layout="",
+    meta_schedule_original_shape=None,
+):
+    """Conv2D Winograd without layout transform in NHWC layout.
     This is a clean version to be used by the auto-scheduler for both CPU and GPU.
 
     Parameters
@@ -1200,8 +1471,6 @@ def conv2d_winograd_nhwc(
         dilation size, or [dilation_height, dilation_width]
     out_dtype : str, optional
         Specifies the output data type.
-    pre_computed: bool
-        Whether the kernel is precomputed
     auto_scheduler_rewritten_layout: str = ""
         The layout after auto-scheduler's layout rewrite pass.
     meta_schedule_original_shape: Optional[List[PrimExpr]] = None
@@ -1212,23 +1481,21 @@ def conv2d_winograd_nhwc(
     output : tvm.te.Tensor
         4-D with shape [batch, out_height, out_width, out_channel]
     """
-    tile_size = 4
 
-    return _conv2d_winograd_nhwc_impl(
+    return conv2d_winograd_nhwc(
         data,
         weight,
         strides,
         padding,
         dilation,
         out_dtype,
-        tile_size,
-        pre_computed,
-        auto_scheduler_rewritten_layout,
-        meta_schedule_original_shape,
+        pre_computed=True,
+        auto_scheduler_rewritten_layout=auto_scheduler_rewritten_layout,
+        meta_schedule_original_shape=meta_schedule_original_shape,
     )
 
 
-def conv2d_winograd_nhwc_without_weight_transform(
+def conv2d_winograd_nchw_without_weight_transform(
     data,
     weight,
     strides,
@@ -1238,8 +1505,8 @@ def conv2d_winograd_nhwc_without_weight_transform(
     auto_scheduler_rewritten_layout="",
     meta_schedule_original_shape=None,
 ):
-    """Conv2D Winograd without layout transform in NHWC layout.
-    This is a clean version to be used by the auto-scheduler for both CPU and GPU.
+    """Conv2D Winograd without layout transform in NCHW layout.
+    This is a clean version to be used by meta-schedule for both CPU and GPU.
 
     Parameters
     ----------
@@ -1265,8 +1532,7 @@ def conv2d_winograd_nhwc_without_weight_transform(
     output : tvm.te.Tensor
         4-D with shape [batch, out_height, out_width, out_channel]
     """
-
-    return conv2d_winograd_nhwc(
+    return conv2d_winograd_nchw(
         data,
         weight,
         strides,
diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py
index 1fd842f2d4cc..8251dac4137b 100644
--- a/python/tvm/topi/utils.py
+++ b/python/tvm/topi/utils.py
@@ -24,6 +24,7 @@
 import tvm
 from tvm import te
 from tvm.tir import bijective_layout, layout
+
 from . import cpp, tag
 
 
@@ -325,7 +326,7 @@ def unravel_index(idx, shape):
     return indices
 
 
-def const_matrix(matrix, name="const_matrix"):
+def const_matrix(matrix, name="const_matrix", attrs=None):
     """convert a const numpy 2-dimensional matrix to tvm tensor
 
     Parameters
@@ -355,14 +356,17 @@ def select_array(i, j):
                 )
         return now
 
+    if attrs is None:
+        attrs = {
+            "const_matrix": True,
+            "schedule_rule": "None",
+        }
+
     return te.compute(
         matrix.shape,
         select_array,
         name=name,
-        attrs={
-            "const_matrix": True,
-            "schedule_rule": "meta_schedule.compute_inline",
-        },
+        attrs=attrs,
     )
 
 
diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py
index 3d64239044e2..025f41660c9c 100644
--- a/python/tvm/topi/x86/batch_matmul.py
+++ b/python/tvm/topi/x86/batch_matmul.py
@@ -17,13 +17,13 @@
 # pylint: disable=invalid-name,too-many-locals,unused-variable
 """x86 batch_matmul operators"""
 import tvm
-from tvm import te
-from tvm import autotvm
+from tvm import autotvm, te
 from tvm.autotvm.task.space import SplitEntity
 from tvm.contrib import cblas, mkl
+
 from .. import generic, nn
 from ..transform import layout_transform
-from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor
+from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline
 from .dense import dense_vnni_schedule
 from .injective import schedule_injective_from_existing
 
@@ -47,7 +47,7 @@ def batch_matmul_vnni_compute(cfg, x, y, *_):
             axis=ak,
         ),
         tag="batch_matmul_vnni",
-        attrs={"schedule_rule": "meta_schedule.batch_matmul_vnni"},
+        attrs={"schedule_rule": "batch_matmul_vnni"},
     )
 
     _, a_y, _ = z.op.axis
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index 88a2499c2c1e..8ddb8d7a5c9a 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -18,18 +18,16 @@
 # pylint: disable=no-value-for-parameter
 """x86 dense operators"""
 from __future__ import absolute_import as _abs
+
 import tvm
-from tvm import te
-from tvm import autotvm
+from tvm import autotvm, te
 from tvm.autotvm.task.space import SplitEntity
-from tvm.contrib import cblas
-from tvm.contrib import mkl
-from tvm.contrib import dnnl
+from tvm.contrib import cblas, dnnl, mkl
 
-from .utils import get_simd_32bit_lanes
 from .. import generic, tag
-from ..utils import traverse_inline, get_const_tuple
+from ..utils import get_const_tuple, traverse_inline
 from .tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake
+from .utils import get_simd_32bit_lanes
 
 
 def _schedule_dense_pack_template(cfg, s, C, O):
@@ -296,7 +294,7 @@ def dense_vnni_compute(cfg, X, packed_w, bias=None):
             axis=ak,
         ),
         tag="dense_vnni",
-        attrs={"schedule_rule": "meta_schedule.dense_vnni"},
+        attrs={"schedule_rule": "dense_vnni"},
     )
 
     if bias is not None:
diff --git a/src/meta_schedule/postproc/rewrite_unbound_block.cc b/src/meta_schedule/postproc/rewrite_unbound_block.cc
index 1ba68538ea04..27ce34a8cb27 100644
--- a/src/meta_schedule/postproc/rewrite_unbound_block.cc
+++ b/src/meta_schedule/postproc/rewrite_unbound_block.cc
@@ -16,7 +16,8 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#include "../schedule_rule/auto_bind.h"
+#include <tvm/meta_schedule/schedule/cuda/thread_bind.h>
+
 #include "../utils.h"
 
 namespace tvm {
diff --git a/src/meta_schedule/schedule/cpu/winograd.cc b/src/meta_schedule/schedule/cpu/winograd.cc
new file mode 100644
index 000000000000..16e53b56923a
--- /dev/null
+++ b/src/meta_schedule/schedule/cpu/winograd.cc
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/meta_schedule/schedule/generic/winograd.h>
+
+#include "../../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+using namespace tvm::tir;
+
+static Array<tir::LoopRV> ScheduleDataPack(tir::Schedule sch, tir::BlockRV block,
+                                           std::vector<int> tiled, std::vector<int> unrolled) {
+  using namespace tvm::tir;
+  ICHECK_EQ(tiled.size(), 2);
+  ICHECK_EQ(unrolled.size(), 4);
+  Array<ExprRV> factors{nullptr};
+  Array<LoopRV> loops = sch->GetLoops(block);
+  ICHECK_EQ(loops.size(), 6);
+
+  factors = sch->SamplePerfectTile(loops[tiled[0]], /*n=*/2, /*max_innermost_factor=*/64);
+  Array<LoopRV> t0 = sch->Split(loops[tiled[0]], {factors.begin(), factors.end()});
+  ICHECK_EQ(t0.size(), 2);
+
+  factors = sch->SamplePerfectTile(loops[tiled[1]], /*n=*/2, /*max_innermost_factor=*/64);
+  Array<LoopRV> t1 = sch->Split(loops[tiled[1]], {factors.begin(), factors.end()});
+  ICHECK_EQ(t1.size(), 2);
+
+  sch->Unroll(loops[unrolled[0]]);
+  sch->Unroll(loops[unrolled[1]]);
+  sch->Unroll(loops[unrolled[2]]);
+  sch->Unroll(loops[unrolled[3]]);
+  sch->Reorder({
+      t0[0],
+      t1[0],
+      t0[1],
+      t1[1],
+      loops[unrolled[0]],
+      loops[unrolled[1]],
+      loops[unrolled[2]],
+      loops[unrolled[3]],
+  });
+  return {t0[0], t1[0], t0[1], t1[1]};
+}
+
+TVM_REGISTER_GLOBAL("meta_schedule.cpu.conv2d_nhwc_winograd_data_pack")
+    .set_body_typed([](Schedule sch, BlockRV data_pack) -> Array<Schedule> {
+      BlockRV input_tile = GetWinogradProducerAndInlineConst(sch, data_pack);
+      BlockRV data_pad = GetWinogradProducerAndInlineConst(sch, input_tile);
+      ScheduleDataPack(sch, data_pack, {2, 3}, {0, 1, 4, 5});
+      sch->ComputeAt(input_tile, /*loop_rv=*/sch->SampleComputeLocation(input_tile),
+                     /*preserve_unit_loops=*/true);
+      sch->ComputeAt(data_pad, /*loop_rv=*/sch->SampleComputeLocation(data_pad),
+                     /*preserve_unit_loops=*/true);
+      return {sch};
+    });
+
+TVM_REGISTER_GLOBAL("meta_schedule.cpu.conv2d_nhwc_winograd_inverse")
+    .set_body_typed([](Schedule sch, BlockRV block) -> Array<Schedule> {
+      GetWinogradProducerAndInlineConst(sch, block);
+      ScheduleDataPack(sch, block, {2, 3}, {0, 1, 4, 5});
+      return {sch};
+    });
+
+TVM_REGISTER_GLOBAL("meta_schedule.cpu.conv2d_nchw_winograd_data_pack")
+    .set_body_typed([](Schedule sch, BlockRV data_pack) -> Array<Schedule> {
+      BlockRV input_tile = GetWinogradProducerAndInlineConst(sch, data_pack);
+      BlockRV data_pad = GetWinogradProducerAndInlineConst(sch, input_tile);
+      ScheduleDataPack(sch, data_pack, {2, 3}, {0, 1, 4, 5});
+      sch->ComputeAt(input_tile, /*loop_rv=*/sch->SampleComputeLocation(input_tile),
+                     /*preserve_unit_loops=*/true);
+      sch->ComputeAt(data_pad, /*loop_rv=*/sch->SampleComputeLocation(data_pad),
+                     /*preserve_unit_loops=*/true);
+      return {sch};
+    });
+
+TVM_REGISTER_GLOBAL("meta_schedule.cpu.conv2d_nchw_winograd_inverse")
+    .set_body_typed([](Schedule sch, BlockRV block) -> Array<Schedule> {
+      GetWinogradProducerAndInlineConst(sch, block);
+      ScheduleDataPack(sch, block, {0, 1}, {2, 3, 4, 5});
+      return {sch};
+    });
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/schedule/cuda/thread_bind.cc b/src/meta_schedule/schedule/cuda/thread_bind.cc
new file mode 100644
index 000000000000..e5dd5068783d
--- /dev/null
+++ b/src/meta_schedule/schedule/cuda/thread_bind.cc
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/meta_schedule/schedule/cuda/thread_bind.h>
+#include <tvm/tir/op.h>
+#include <tvm/tir/schedule/schedule.h>
+
+#include <algorithm>
+#include <limits>
+#include <utility>
+
+#include "../../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+using namespace tvm::tir;
+
+std::function<ExprRV(int64_t)> MakeFactorSampler(Schedule sch, Array<Integer> thread_extents) {
+  return [sch = std::move(sch),
+          thread_extents = std::move(thread_extents)](int64_t max_extent) -> ExprRV {
+    Array<Integer> extents;
+    extents.reserve(thread_extents.size());
+    for (const Integer extent : thread_extents) {
+      if (extent->value <= max_extent) {
+        extents.push_back(extent);
+      }
+    }
+    int n = extents.size();
+    if (n == 0) {
+      return Integer(max_extent);
+    }
+    if (n == 1) {
+      return Integer(extents[0]);
+    }
+    Array<FloatImm> probs(n, FloatImm(DataType::Float(64), 1.0 / n));
+    return sch->SampleCategorical(extents, probs);
+  };
+}
+
+Array<LoopRV> BindSpatialLoop(Schedule sch, LoopRV loop, int64_t max_threadblocks,
+                              int64_t max_threads_per_block,
+                              std::function<ExprRV(int64_t)> get_factor) {
+  int64_t extent = -1;
+  if (const int64_t* e = as_const_int(sch->Get(loop)->extent)) {
+    extent = *e;
+  } else {
+    extent = std::numeric_limits<int64_t>::max();
+  }
+  if (extent <= max_threadblocks * max_threads_per_block) {
+    if (!get_factor) {
+      get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024});
+    }
+    ExprRV factor = get_factor(std::min(extent, max_threads_per_block));
+    Array<LoopRV> splits = sch->Split(loop, {NullOpt, factor});
+    ICHECK_EQ(splits.size(), 2);
+    sch->Bind(splits[0], "blockIdx.x");
+    sch->Bind(splits[1], "threadIdx.x");
+    return {splits[0], splits[1]};
+  } else {
+    Array<LoopRV> splits = sch->Split(loop, {NullOpt,
+                                             Integer(max_threadblocks),  //
+                                             Integer(max_threads_per_block)});
+    ICHECK_EQ(splits.size(), 3);
+    sch->Reorder({splits[1], splits[2], splits[0]});
+    sch->Bind(splits[1], "blockIdx.x");
+    sch->Bind(splits[2], "threadIdx.x");
+    return {splits[1], splits[2]};
+  }
+}
+
+void BindBlockThreadIdx(tir::Schedule sch, tir::BlockRV block_rv,  //
+                        int64_t max_threadblocks, int64_t max_threads_per_block,
+                        std::function<tir::ExprRV(int64_t)> get_factor) {
+  using namespace tvm::tir;
+  StmtSRef block_sref = sch->GetSRef(block_rv);
+  if (block_sref->parent == nullptr) {
+    return;
+  }
+  if (tir::HasBeenMultiLevelTiled(block_sref)) {
+    return;
+  }
+  Array<StmtSRef> loops = tir::GetLoops(block_sref);
+  int n = loops.size();
+  int i_block_idx = -1;
+  int i_thread_idx = -1;
+  int i_multi_child = -1;
+  int i_spatial_loop = -1;
+  for (int i = 0; i < n; ++i) {
+    const StmtSRef& loop_sref = loops[i];
+    const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
+    runtime::ThreadScope thread_scope = GetThreadScope(loop);
+    if (IsBlockIdx(thread_scope)) {
+      if (i_block_idx == -1) {
+        i_block_idx = i;
+      }
+    }
+    if (IsThreadIdx(thread_scope)) {
+      if (i_thread_idx == -1) {
+        i_thread_idx = i;
+      }
+    }
+    if (loop->kind != ForKind::kSerial) {
+      if (i_multi_child == -1) {
+        i_multi_child = i;
+      }
+    }
+    if (!IsSingleStmt(loop->body)) {
+      if (i_multi_child == -1) {
+        i_multi_child = i + 1;
+      }
+    }
+    if (GetLoopIterType(loop_sref) == IterVarType::kDataPar) {
+      if (i_spatial_loop == i - 1) {
+        ++i_spatial_loop;
+      }
+    }
+  }
+  if (i_multi_child == -1) {
+    i_multi_child = n;
+  }
+  if (i_block_idx != -1 && i_thread_idx != -1) {
+    return;
+  }
+  if (i_block_idx != -1 && i_thread_idx == -1) {
+    ICHECK(false) << "Unsupported case, where blockIdx is bound but threadIdx is not";
+    throw;
+  }
+  LoopRV loop_rv{nullptr};
+  {
+    Array<LoopRV> loop_rvs = sch->GetLoops(block_rv);
+    if (i_spatial_loop == -1) {
+      LoopRV spatial_loop_rv{nullptr};
+      if (loop_rvs.empty()) {
+        spatial_loop_rv = sch->AddUnitLoop(block_rv);
+      } else {
+        spatial_loop_rv = sch->AddUnitLoop(loop_rvs[0]);
+      }
+      loop_rvs.insert(loop_rvs.begin(), spatial_loop_rv);
+      i_spatial_loop = 0;
+      if (i_block_idx != -1) {
+        i_block_idx += 1;
+      }
+      if (i_thread_idx != -1) {
+        i_thread_idx += 1;
+      }
+      if (i_multi_child != -1) {
+        i_multi_child += 1;
+      }
+    }
+    if (i_block_idx == -1 && i_thread_idx != -1) {
+      int num_fuse = std::min(std::min(i_multi_child, i_thread_idx), i_spatial_loop + 1);
+      Array<LoopRV> loop_rvs = sch->GetLoops(block_rv);
+      loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse});
+      sch->Bind(loop_rv, "blockIdx.x");
+      return;
+    } else {  // i_block_idx == -1 && i_thread_idx == -1
+      int num_fuse = std::min(i_multi_child, i_spatial_loop + 1);
+      loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse});
+    }
+  }
+  BindSpatialLoop(sch, loop_rv, max_threadblocks, max_threads_per_block, get_factor);
+}
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/schedule/cuda/winograd.cc b/src/meta_schedule/schedule/cuda/winograd.cc
new file mode 100644
index 000000000000..5334c4df2ac9
--- /dev/null
+++ b/src/meta_schedule/schedule/cuda/winograd.cc
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/meta_schedule/schedule/cuda/thread_bind.h>
+#include <tvm/meta_schedule/schedule/generic/winograd.h>
+
+#include <vector>
+
+#include "../../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+using namespace tvm::tir;
+
+static Array<tir::LoopRV> ScheduleDataPack(tir::Schedule sch, tir::BlockRV block,
+                                           std::vector<int> tiled, std::vector<int> unrolled) {
+  // This method is used for NHWC layout only. Will likely be refactored into a more schedule
+  using namespace tvm::tir;
+  ICHECK_EQ(tiled.size(), 2);
+  ICHECK_EQ(unrolled.size(), 4);
+  Array<ExprRV> factors{nullptr};
+  Array<LoopRV> loops = sch->GetLoops(block);
+  ICHECK_EQ(loops.size(), 6);
+
+  factors = sch->SamplePerfectTile(loops[tiled[0]], /*n=*/2, /*max_innermost_factor=*/64);
+  Array<LoopRV> t0 = sch->Split(loops[tiled[0]], {factors.begin(), factors.end()});
+  ICHECK_EQ(t0.size(), 2);
+
+  factors = sch->SamplePerfectTile(loops[tiled[1]], /*n=*/2, /*max_innermost_factor=*/64);
+  Array<LoopRV> t1 = sch->Split(loops[tiled[1]], {factors.begin(), factors.end()});
+  ICHECK_EQ(t1.size(), 2);
+
+  sch->Unroll(loops[unrolled[0]]);
+  sch->Unroll(loops[unrolled[1]]);
+  sch->Unroll(loops[unrolled[2]]);
+  sch->Unroll(loops[unrolled[3]]);
+  sch->Reorder({
+      t0[0],
+      t1[0],
+      t0[1],
+      t1[1],
+      loops[unrolled[0]],
+      loops[unrolled[1]],
+      loops[unrolled[2]],
+      loops[unrolled[3]],
+  });
+  return {t0[0], t1[0], t0[1], t1[1]};
+}
+
+TVM_REGISTER_GLOBAL("meta_schedule.cuda.conv2d_nhwc_winograd_data_pack")
+    .set_body_typed([](Schedule sch, BlockRV data_pack) -> Array<Schedule> {
+      BlockRV input_tile = GetWinogradProducerAndInlineConst(sch, data_pack);
+      BlockRV data_pad = GetWinogradProducerAndInlineConst(sch, input_tile);
+      Array<LoopRV> loops = ScheduleDataPack(sch, data_pack, {2, 3}, {0, 1, 4, 5});
+      {
+        BlockRV data_pack_local = sch->CacheWrite(data_pack, 0, "local");
+        sch->ReverseComputeAt(data_pack_local, loops.back(), /*preserve_unit_loops=*/true);
+      }
+      {
+        sch->ComputeAt(input_tile, /*loop_rv=*/loops.back(), /*preserve_unit_loops=*/true);
+        sch->SetScope(input_tile, /*buffer_index=*/0, /*storage_scope=*/"local");
+        sch->ComputeInline(data_pad);
+      }
+      {
+        int64_t max_threadblocks = 256;
+        int64_t max_threads_per_block = 1024;
+        Array<LoopRV> loops = sch->GetLoops(data_pack);
+        ICHECK_EQ(loops.size(), 8);
+        BindSpatialLoop(sch, sch->Fuse({loops[0], loops[1], loops[2], loops[3]}), max_threadblocks,
+                        max_threads_per_block);
+      }
+      return {sch};
+    });
+
+TVM_REGISTER_GLOBAL("meta_schedule.cuda.conv2d_nhwc_winograd_inverse")
+    .set_body_typed([](Schedule sch, BlockRV inverse) -> Array<Schedule> {
+      GetWinogradProducerAndInlineConst(sch, inverse);
+      ScheduleDataPack(sch, inverse, /*tiled=*/{2, 3}, /*unrolled=*/{0, 1, 4, 5});
+      int64_t max_threadblocks = 256;
+      int64_t max_threads_per_block = 1024;
+      Array<LoopRV> loops = sch->GetLoops(inverse);
+      ICHECK_EQ(loops.size(), 8);
+      BindSpatialLoop(sch, sch->Fuse({loops[0], loops[1], loops[2], loops[3]}), max_threadblocks,
+                      max_threads_per_block);
+      return {sch};
+    });
+
+TVM_REGISTER_GLOBAL("meta_schedule.cuda.conv2d_nchw_winograd_data_pack")
+    .set_body_typed([](Schedule sch, BlockRV data_pack) -> Array<Schedule> {
+      int64_t max_threadblocks = 256;
+      int64_t max_threads_per_block = 1024;
+      BlockRV input_tile = GetWinogradProducerAndInlineConst(sch, data_pack);
+      BlockRV data_pad = GetWinogradProducerAndInlineConst(sch, input_tile);
+      LoopRV outer{nullptr};
+      {
+        Array<LoopRV> loops = sch->GetLoops(data_pack);
+        ICHECK_EQ(loops.size(), 6);
+        sch->Reorder({loops[2], loops[3], loops[0], loops[1], loops[4], loops[5]});
+        sch->Unroll(loops[0]);
+        sch->Unroll(loops[1]);
+        sch->Unroll(loops[4]);
+        sch->Unroll(loops[5]);
+        outer = BindSpatialLoop(sch, sch->Fuse({loops[2], loops[3]}), max_threadblocks,
+                                max_threads_per_block)[1];
+      }
+      {
+        BlockRV data_pack_local = sch->CacheWrite(data_pack, 0, "local");
+        sch->ReverseComputeAt(data_pack_local, outer, /*preserve_unit_loops=*/true);
+      }
+      {
+        sch->ComputeAt(input_tile, /*loop_rv=*/outer, /*preserve_unit_loops=*/true);
+        sch->SetScope(input_tile, /*buffer_index=*/0, /*storage_scope=*/"local");
+        sch->ComputeInline(data_pad);
+      }
+      return {sch};
+    });
+
+TVM_REGISTER_GLOBAL("meta_schedule.cuda.conv2d_nchw_winograd_inverse")
+    .set_body_typed([](Schedule sch, BlockRV inverse) -> Array<Schedule> {
+      GetWinogradProducerAndInlineConst(sch, inverse);
+      // loops on top of the inverse block: [CO, P, tile_size, tile_size, alpha, alpha]
+      int64_t tile_size = Downcast<IntImm>(sch->Get(inverse)->writes[0]->buffer->shape[2])->value;
+      LoopRV outer{nullptr};
+      {
+        BlockRV output = sch->GetConsumers(inverse)[0];
+        Array<LoopRV> nchw = sch->GetLoops(output);
+        ICHECK_EQ(nchw.size(), 4);
+        Array<LoopRV> hs = sch->Split(nchw[2], {NullOpt, Integer(tile_size)});
+        Array<LoopRV> ws = sch->Split(nchw[3], {NullOpt, Integer(tile_size)});
+        sch->Reorder({hs[0], ws[0], hs[1], ws[1]});
+        outer = ws[0];
+      }
+      {
+        sch->ComputeAt(inverse, /*loop_rv=*/outer, /*preserve_unit_loops=*/true);
+        sch->SetScope(inverse, /*buffer_index=*/0, /*storage_scope=*/"local");
+        Array<LoopRV> loops = sch->GetLoops(inverse);
+        ICHECK_EQ(loops.size(), 10);
+        sch->Unroll(loops[6]);
+        sch->Unroll(loops[7]);
+        sch->Unroll(loops[8]);
+        sch->Unroll(loops[9]);
+      }
+      return {sch};
+    });
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/schedule/generic/winograd.cc b/src/meta_schedule/schedule/generic/winograd.cc
new file mode 100644
index 000000000000..edb14667bcec
--- /dev/null
+++ b/src/meta_schedule/schedule/generic/winograd.cc
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/meta_schedule/schedule/generic/winograd.h>
+
+namespace tvm {
+namespace meta_schedule {
+
+using namespace tvm::tir;
+
+/*!
+ * \brief Get the producer block of a given block.
+ * If there is a constant winograd transform matrix, inline it.
+ * \return The only producer block.
+ */
+BlockRV GetWinogradProducerAndInlineConst(Schedule sch, BlockRV block) {
+  Array<BlockRV> producers = sch->GetProducers(block);
+  Array<BlockRV> results;
+  for (const BlockRV& producer : producers) {
+    if (sch->Get(producer)->reads.empty()) {
+      sch->ComputeInline(producer);
+    } else {
+      results.push_back(producer);
+    }
+  }
+  ICHECK_EQ(results.size(), 1);
+  return results[0];
+}
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/schedule/x86/.gitignore b/src/meta_schedule/schedule/x86/.gitignore
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/src/meta_schedule/schedule_rule/apply_custom_rule.cc b/src/meta_schedule/schedule_rule/apply_custom_rule.cc
new file mode 100644
index 000000000000..4b0fa675acc7
--- /dev/null
+++ b/src/meta_schedule/schedule_rule/apply_custom_rule.cc
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+class ApplyCustomRuleNode : public ScheduleRuleNode {
+ public:
+  // Inherited from ScheduleRuleNode
+  void InitializeWithTuneContext(const TuneContext& context) final {
+    CHECK(context->target.defined()) << "ValueError: Target is not defined in the tune context.";
+    this->target_ = context->target;
+  }
+
+  static std::string GetCustomRuleName(const std::string& name, const std::string& key) {
+    return "meta_schedule." + key + "." + name;
+  }
+
+  // Inherited from ScheduleRuleNode
+  Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) final {
+    CHECK(this->target_.defined())
+        << "ValueError: ApplyCustomRule is not initialized with TuneContext that has a Target.";
+    Array<String> keys = this->target_.value()->keys;
+    if (Optional<String> ann = tir::GetAnn<String>(sch->GetSRef(block_rv), "schedule_rule")) {
+      if (ann.value() != "None") {
+        for (const String& key : keys) {
+          if (const runtime::PackedFunc* custom_schedule_fn =
+                  runtime::Registry::Get(GetCustomRuleName(ann.value(), key))) {
+            Array<tir::Schedule> result = ((*custom_schedule_fn)(sch, block_rv));
+            return result;
+          }
+        }
+        std::ostringstream os;
+        os << "Unknown schedule rule \"" << ann.value() << "\" for target keys \"" << keys
+           << "\". Checked PackedFuncs:";
+        for (const String& key : keys) {
+          os << "\n  " << GetCustomRuleName(ann.value(), key);
+        }
+        LOG(WARNING) << os.str();
+      }
+    }
+    return {sch};
+  }
+
+  // Inherited from ScheduleRuleNode
+  ScheduleRule Clone() const final {
+    ObjectPtr<ApplyCustomRuleNode> n = make_object<ApplyCustomRuleNode>(*this);
+    n->target_ = target_;
+    return ScheduleRule(n);
+  }
+
+ public:
+  Optional<Target> target_ = NullOpt;
+
+  void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("target_", &target_); }
+
+  static constexpr const char* _type_key = "meta_schedule.ApplyCustomRule";
+  TVM_DECLARE_FINAL_OBJECT_INFO(ApplyCustomRuleNode, ScheduleRuleNode);
+};
+
+ScheduleRule ScheduleRule::ApplyCustomRule() {
+  ObjectPtr<ApplyCustomRuleNode> n = make_object<ApplyCustomRuleNode>();
+  return ScheduleRule(n);
+}
+
+bool ScheduleRule::IsApplyCustomRule(const ScheduleRule& rule) {
+  return rule->IsInstance<ApplyCustomRuleNode>();
+}
+
+TVM_REGISTER_NODE_TYPE(ApplyCustomRuleNode);
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleApplyCustomRule")
+    .set_body_typed(ScheduleRule::ApplyCustomRule);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc
index 4d16a6d4d65d..fa47d1edb860 100644
--- a/src/meta_schedule/schedule_rule/auto_bind.cc
+++ b/src/meta_schedule/schedule_rule/auto_bind.cc
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#include "./auto_bind.h"
+#include <tvm/meta_schedule/schedule/cuda/thread_bind.h>
 
 #include <algorithm>
 #include <limits>
@@ -26,142 +26,6 @@
 namespace tvm {
 namespace meta_schedule {
 
-void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block_rv,
-                        int64_t max_threadblocks, int64_t max_threads_per_block,
-                        std::function<tir::ExprRV(int64_t)> get_factor) {
-  using namespace tvm::tir;
-  StmtSRef block_sref = sch->GetSRef(block_rv);
-  if (block_sref->parent == nullptr) {
-    return;
-  }
-  if (tir::HasBeenMultiLevelTiled(block_sref)) {
-    return;
-  }
-  Array<StmtSRef> loops = tir::GetLoops(block_sref);
-  int n = loops.size();
-  int i_block_idx = -1;
-  int i_thread_idx = -1;
-  int i_multi_child = -1;
-  int i_spatial_loop = -1;
-  for (int i = 0; i < n; ++i) {
-    const StmtSRef& loop_sref = loops[i];
-    const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
-    runtime::ThreadScope thread_scope = GetThreadScope(loop);
-    if (IsBlockIdx(thread_scope)) {
-      if (i_block_idx == -1) {
-        i_block_idx = i;
-      }
-    }
-    if (IsThreadIdx(thread_scope)) {
-      if (i_thread_idx == -1) {
-        i_thread_idx = i;
-      }
-    }
-    if (loop->kind != ForKind::kSerial) {
-      if (i_multi_child == -1) {
-        i_multi_child = i;
-      }
-    }
-    if (!IsSingleStmt(loop->body)) {
-      if (i_multi_child == -1) {
-        i_multi_child = i + 1;
-      }
-    }
-    if (GetLoopIterType(loop_sref) == IterVarType::kDataPar) {
-      if (i_spatial_loop == i - 1) {
-        ++i_spatial_loop;
-      }
-    }
-  }
-  if (i_multi_child == -1) {
-    i_multi_child = n;
-  }
-  if (i_block_idx != -1 && i_thread_idx != -1) {
-    return;
-  }
-  if (i_block_idx != -1 && i_thread_idx == -1) {
-    ICHECK(false) << "Unsupported case, where blockIdx is bound but threadIdx is not";
-    throw;
-  }
-  LoopRV loop_rv{nullptr};
-  {
-    Array<LoopRV> loop_rvs = sch->GetLoops(block_rv);
-    if (i_spatial_loop == -1) {
-      LoopRV spatial_loop_rv{nullptr};
-      if (loop_rvs.empty()) {
-        spatial_loop_rv = sch->AddUnitLoop(block_rv);
-      } else {
-        spatial_loop_rv = sch->AddUnitLoop(loop_rvs[0]);
-      }
-      loop_rvs.insert(loop_rvs.begin(), spatial_loop_rv);
-      i_spatial_loop = 0;
-      if (i_block_idx != -1) {
-        i_block_idx += 1;
-      }
-      if (i_thread_idx != -1) {
-        i_thread_idx += 1;
-      }
-      if (i_multi_child != -1) {
-        i_multi_child += 1;
-      }
-    }
-    if (i_block_idx == -1 && i_thread_idx != -1) {
-      int num_fuse = std::min(std::min(i_multi_child, i_thread_idx), i_spatial_loop + 1);
-      Array<LoopRV> loop_rvs = sch->GetLoops(block_rv);
-      loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse});
-      sch->Bind(loop_rv, "blockIdx.x");
-      return;
-    } else {  // i_block_idx == -1 && i_thread_idx == -1
-      int num_fuse = std::min(i_multi_child, i_spatial_loop + 1);
-      loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse});
-    }
-  }
-  int64_t extent = -1;
-  if (const int64_t* e = GetLoopIntExtent(sch->Get(loop_rv).get())) {
-    extent = *e;
-  } else {
-    extent = std::numeric_limits<int64_t>::max();
-  }
-  if (extent <= max_threadblocks * max_threads_per_block) {
-    ExprRV factor = get_factor(std::min(extent, max_threads_per_block));
-    Array<LoopRV> splits = sch->Split(loop_rv, {NullOpt, factor});
-    ICHECK_EQ(splits.size(), 2);
-    sch->Bind(splits[0], "blockIdx.x");
-    sch->Bind(splits[1], "threadIdx.x");
-  } else {
-    Array<LoopRV> splits = sch->Split(loop_rv, {NullOpt,
-                                                Integer(max_threadblocks),  //
-                                                Integer(max_threads_per_block)});
-    ICHECK_EQ(splits.size(), 3);
-    sch->Reorder({splits[1], splits[2], splits[0]});
-    sch->Bind(splits[1], "blockIdx.x");
-    sch->Bind(splits[2], "threadIdx.x");
-  }
-}
-
-std::function<tir::ExprRV(int64_t)> MakeFactorSampler(tir::Schedule sch,
-                                                      Array<Integer> thread_extents) {
-  return [sch = std::move(sch),
-          thread_extents = std::move(thread_extents)](int64_t max_extent) -> tir::ExprRV {
-    Array<Integer> extents;
-    extents.reserve(thread_extents.size());
-    for (const Integer extent : thread_extents) {
-      if (extent->value <= max_extent) {
-        extents.push_back(extent);
-      }
-    }
-    int n = extents.size();
-    if (n == 0) {
-      return Integer(max_extent);
-    }
-    if (n == 1) {
-      return Integer(extents[0]);
-    }
-    Array<FloatImm> probs(n, FloatImm(DataType::Float(64), 1.0 / n));
-    return sch->SampleCategorical(extents, probs);
-  };
-}
-
 class AutoBindNode : public ScheduleRuleNode {
  public:
   // Inherited from ScheduleRuleNode
diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc
index 8e4642b50ddb..141b93be5e34 100644
--- a/src/meta_schedule/schedule_rule/schedule_rule.cc
+++ b/src/meta_schedule/schedule_rule/schedule_rule.cc
@@ -53,7 +53,15 @@ ScheduleRule ScheduleRule::PyScheduleRule(
 
 Array<ScheduleRule> ScheduleRule::DefaultLLVM() {
   return {
-      GetDefaultAutoInline("llvm"),
+      ScheduleRule::ApplyCustomRule(),
+      ScheduleRule::AutoInline(
+          /*into_producer=*/false,
+          /*into_consumer=*/true,
+          /*inline_const_tensor=*/true,
+          /*disallow_if_then_else=*/true,
+          /*require_injective=*/true,
+          /*require_ordered=*/true,
+          /*disallow_op=*/Array<String>{"tir.exp"}),
       ScheduleRule::AddRFactor(
           /*max_jobs_per_core=*/16,
           /*max_innermost_factor=*/Integer(64)),
@@ -78,6 +86,7 @@ Array<ScheduleRule> ScheduleRule::DefaultLLVM() {
 
 Array<ScheduleRule> ScheduleRule::DefaultCUDA() {
   return {
+      ScheduleRule::ApplyCustomRule(),
       ScheduleRule::MultiLevelTiling(
           /*structure=*/"SSSRRSRS",
           /*tile_binds=*/Array<String>{"blockIdx.x", "vthread.x", "threadIdx.x"},
@@ -91,7 +100,14 @@ Array<ScheduleRule> ScheduleRule::DefaultCUDA() {
           Map<String, ObjectRef>{{"req", String("must")},
                                  {"levels", Array<Integer>{3}},  //
                                  {"scope", String("local")}}),
-      GetDefaultAutoInline("cuda"),
+      ScheduleRule::AutoInline(
+          /*into_producer=*/true,
+          /*into_consumer=*/true,
+          /*inline_const_tensor=*/true,
+          /*disallow_if_then_else=*/false,
+          /*require_injective=*/false,
+          /*require_ordered=*/false,
+          /*disallow_op=*/Array<String>{}),
       ScheduleRule::CrossThreadReduction(
           /*thread_extents=*/Array<Integer>{4, 8, 16, 32, 64, 128, 256, 512}),
       ScheduleRule::ParallelizeVectorizeUnroll(
@@ -136,28 +152,32 @@ Array<ScheduleRule> ScheduleRule::DefaultCUDATensorCore() {
           {"store", "wmma_store_16x16x16_s32_shared"},
       },
   };
-  Array<ScheduleRule> results{ScheduleRule::MultiLevelTilingTensorCore(
-      /*intrin_groups=*/intrin_groups,
-      /*structure=*/"SSSRRSRS",
-      /*tile_binds=*/Array<String>{"blockIdx.y", "blockIdx.x", "threadIdx.y"},
-      /*max_innermost_factor=*/Integer(4),
-      /*vector_load_lens=*/Array<Integer>{1, 2, 3, 4, 8, 16},
-      /*reuse_read=*/
-      Map<String, ObjectRef>{{"req", String("must")},
-                             {"levels", Array<Integer>{4}},  //
-                             {"scope", String("shared")}},
-      /*reuse_write=*/
-      Map<String, ObjectRef>{{"req", String("must")},
-                             {"levels", Array<Integer>{2}},  //
-                             {"scope", String("shared")}},
-      /*use_software_pipeline=*/false)};
+  Array<ScheduleRule> results{
+      ScheduleRule::ApplyCustomRule(),
+      ScheduleRule::MultiLevelTilingTensorCore(
+          /*intrin_groups=*/intrin_groups,
+          /*structure=*/"SSSRRSRS",
+          /*tile_binds=*/Array<String>{"blockIdx.y", "blockIdx.x", "threadIdx.y"},
+          /*max_innermost_factor=*/Integer(4),
+          /*vector_load_lens=*/Array<Integer>{1, 2, 3, 4, 8, 16},
+          /*reuse_read=*/
+          Map<String, ObjectRef>{{"req", String("must")},
+                                 {"levels", Array<Integer>{4}},  //
+                                 {"scope", String("shared")}},
+          /*reuse_write=*/
+          Map<String, ObjectRef>{{"req", String("must")},
+                                 {"levels", Array<Integer>{2}},  //
+                                 {"scope", String("shared")}},
+          /*use_software_pipeline=*/false)  //
+  };
   Array<ScheduleRule> append = ScheduleRule::DefaultCUDA();
-  results.insert(results.end(), append.begin(), append.end());
+  results.insert(results.end(), append.begin() + 1, append.end());
   return results;
 }
 
 Array<ScheduleRule> ScheduleRule::DefaultHexagon() {
   return {
+      ScheduleRule::ApplyCustomRule(),
       ScheduleRule::AutoInline(
           /*into_producer=*/false,
           /*into_consumer=*/true,
diff --git a/src/meta_schedule/schedule_rule/winograd.cc b/src/meta_schedule/schedule_rule/winograd.cc
deleted file mode 100644
index 22e2300d63b6..000000000000
--- a/src/meta_schedule/schedule_rule/winograd.cc
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-#include "../utils.h"
-#include "./auto_bind.h"
-
-namespace tvm {
-namespace meta_schedule {
-
-using namespace tvm::tir;
-
-TVM_REGISTER_GLOBAL("meta_schedule.compute_inline")
-    .set_body_typed([](Schedule sch, BlockRV block) -> Array<Schedule> {
-      sch->ComputeInline(block);
-      return {sch};
-    });
-
-inline BlockRV GetOnlyProducer(Schedule sch, BlockRV block) {
-  Array<BlockRV> producers = sch->GetProducers(block);
-  ICHECK_EQ(producers.size(), 1);
-  return producers[0];
-}
-
-inline BlockRV GetOnlyConsumer(Schedule sch, BlockRV block) {
-  Array<BlockRV> consumers = sch->GetConsumers(block);
-  ICHECK_EQ(consumers.size(), 1);
-  return consumers[0];
-}
-
-inline LoopRV ScheduleDataPack(Schedule sch, BlockRV block) {
-  Array<ExprRV> factors{nullptr};
-  Array<LoopRV> loops = sch->GetLoops(block);
-  ICHECK_EQ(loops.size(), 6);
-
-  factors = sch->SamplePerfectTile(loops[2], /*n=*/2, /*max_innermost_factor=*/64);
-  Array<LoopRV> t0 = sch->Split(loops[2], {factors.begin(), factors.end()});
-  ICHECK_EQ(t0.size(), 2);
-
-  factors = sch->SamplePerfectTile(loops[3], /*n=*/2, /*max_innermost_factor=*/64);
-  Array<LoopRV> t1 = sch->Split(loops[3], {factors.begin(), factors.end()});
-  ICHECK_EQ(t1.size(), 2);
-
-  if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[0]))) {
-    if (*i <= 16) {
-      sch->Unroll(loops[0]);
-    }
-  }
-  if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[1]))) {
-    if (*i <= 16) {
-      sch->Unroll(loops[1]);
-    }
-  }
-  sch->Unroll(loops[4]);
-  sch->Unroll(loops[5]);
-  sch->Reorder({
-      t0[0],
-      t1[0],
-      t0[1],
-      t1[1],
-      loops[0],
-      loops[1],
-      loops[4],
-      loops[5],
-  });
-  return t1[1];
-}
-
-inline LoopRV ScheduleDataPackNCHW(Schedule sch, BlockRV block) {
-  Array<LoopRV> loops = sch->GetLoops(block);
-  ICHECK_EQ(loops.size(), 6);
-
-  if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[0]))) {
-    if (*i <= 16) {
-      sch->Unroll(loops[0]);
-    }
-  }
-  if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[1]))) {
-    if (*i <= 16) {
-      sch->Unroll(loops[1]);
-    }
-  }
-  sch->Unroll(loops[4]);
-  sch->Unroll(loops[5]);
-
-  Array<ExprRV> factors = sch->SamplePerfectTile(loops[3], /*n=*/2, /*max_innermost_factor=*/64);
-  Array<LoopRV> split =
-      sch->Split(loops[3], /*factors=*/{factors[0], factors[1]}, /*preserve_unit_loops=*/true);
-
-  LoopRV fused = sch->Fuse({loops[2], split[0]});
-  sch->Reorder({fused, split[1], loops[0], loops[1]});
-  return split[1];
-}
-
-TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse.llvm")
-    .set_body_typed([](Schedule sch, BlockRV block) -> Array<Schedule> {
-      ScheduleDataPack(sch, block);
-      return {sch};
-    });
-
-TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.llvm")
-    .set_body_typed([](Schedule sch, BlockRV data_pack) -> Array<Schedule> {
-      BlockRV input_tile = GetOnlyProducer(sch, data_pack);
-      BlockRV data_pad = GetOnlyProducer(sch, input_tile);
-      ScheduleDataPack(sch, data_pack);
-      sch->ComputeAt(input_tile, /*loop_rv=*/sch->SampleComputeLocation(input_tile),
-                     /*preserve_unit_loops=*/true);
-      sch->ComputeAt(data_pad, /*loop_rv=*/sch->SampleComputeLocation(data_pad),
-                     /*preserve_unit_loops=*/true);
-      return {sch};
-    });
-
-TVM_REGISTER_GLOBAL("meta_schedule.winograd_output.nchw.cuda")
-    .set_body_typed([](Schedule sch, BlockRV output) -> Array<Schedule> {
-      // get loops
-      Array<LoopRV> loops = sch->GetLoops(output);
-      ICHECK_EQ(loops.size(), 4);
-
-      BlockRV OL{nullptr};
-
-      // tile
-      Optional<PrimExpr> tile_size =
-          tir::GetAnn<PrimExpr>(sch->GetSRef(output), "winograd_tile_size");
-      ICHECK(tile_size.defined()) << "Winograd tile size is not defined in block annotation!";
-      Array<LoopRV> split0 = sch->Split(loops[2], {NullOpt, tile_size.value()});
-      Array<LoopRV> split1 = sch->Split(loops[3], {NullOpt, tile_size.value()});
-      sch->Reorder({split0[0], split1[0], split0[1], split1[1]});
-
-      // compute_at
-      BlockRV inverse = GetOnlyProducer(sch, output);
-      sch->ComputeAt(inverse, /*loop_rv=*/split1[0],
-                     /*preserve_unit_loops=*/true);
-
-      // fuse
-      LoopRV fused = sch->Fuse({loops[0], loops[1], split0[0], split1[0]});
-
-      int64_t max_threadblocks = 256;
-      int64_t max_threads_per_block = 1024;
-      auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024});
-      BindBlockThreadIdx(sch, output, max_threadblocks, max_threads_per_block, get_factor);
-      return {sch};
-    });
-
-TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse.cuda")
-    .set_body_typed([](Schedule sch, BlockRV block) -> Array<Schedule> {
-      ScheduleDataPack(sch, block);
-      int64_t max_threadblocks = 256;
-      int64_t max_threads_per_block = 1024;
-      auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024});
-      BindBlockThreadIdx(sch, block, max_threadblocks, max_threads_per_block, get_factor);
-      return {sch};
-    });
-
-TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse.nchw.cuda")
-    .set_body_typed([](Schedule sch, BlockRV inverse) -> Array<Schedule> {
-      sch->SetScope(inverse, /*buffer_index=*/0, /*storage_scope=*/"local");
-      Array<LoopRV> loops = sch->GetLoops(inverse);
-      ICHECK_EQ(loops.size(), 6);
-      if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[2]))) {
-        if (*i <= 16) {
-          sch->Unroll(loops[2]);
-        }
-      }
-      if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[3]))) {
-        if (*i <= 16) {
-          sch->Unroll(loops[3]);
-        }
-      }
-      sch->Unroll(loops[4]);
-      sch->Unroll(loops[5]);
-      return {sch};
-    });
-
-TVM_REGISTER_GLOBAL("meta_schedule.winograd_kernel_pack.nchw.cuda")
-    .set_body_typed([](Schedule sch, BlockRV kernel_pack) -> Array<Schedule> {
-      Array<LoopRV> loops = sch->GetLoops(kernel_pack);
-      ICHECK_EQ(loops.size(), 6);
-      if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[0]))) {
-        if (*i <= 16) {
-          sch->Unroll(loops[0]);
-        }
-      }
-      if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[1]))) {
-        if (*i <= 16) {
-          sch->Unroll(loops[1]);
-        }
-      }
-      sch->Unroll(loops[4]);
-      sch->Unroll(loops[5]);
-
-      LoopRV fused = sch->Fuse({loops[2], loops[3]});
-
-      int64_t max_threadblocks = 256;
-      int64_t max_threads_per_block = 1024;
-      auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024});
-      BindBlockThreadIdx(sch, kernel_pack, max_threadblocks, max_threads_per_block, get_factor);
-      return {sch};
-    });
-
-TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.cuda")
-    .set_body_typed([](Schedule sch, BlockRV data_pack) -> Array<Schedule> {
-      BlockRV input_tile = GetOnlyProducer(sch, data_pack);
-      BlockRV data_pad = GetOnlyProducer(sch, input_tile);
-      LoopRV loop = ScheduleDataPack(sch, data_pack);
-      sch->ComputeAt(input_tile, /*loop_rv=*/loop, /*preserve_unit_loops=*/true);
-      sch->SetScope(input_tile, /*buffer_index=*/0, /*storage_scope=*/"local");
-      sch->ComputeInline(data_pad);
-      int64_t max_threadblocks = 256;
-      int64_t max_threads_per_block = 1024;
-      auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024});
-      BindBlockThreadIdx(sch, data_pack, max_threadblocks, max_threads_per_block, get_factor);
-      return {sch};
-    });
-
-TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.nchw.cuda")
-    .set_body_typed([](Schedule sch, BlockRV data_pack) -> Array<Schedule> {
-      BlockRV input_tile = GetOnlyProducer(sch, data_pack);
-      BlockRV data_pad = GetOnlyProducer(sch, input_tile);
-
-      BlockRV data_l = sch->CacheWrite(data_pack, /*buffer_index=*/0, /*storage_scope=*/"local");
-      BlockRV d = sch->CacheRead(data_pack, /*buffer_index=*/0, /*storage_scope=*/"local");
-      LoopRV loop = ScheduleDataPackNCHW(sch, data_pack);
-      sch->ReverseComputeAt(data_l, loop, /*preserve_unit_loops=*/true);
-      sch->ComputeAt(d, /*loop_rv=*/loop, /*preserve_unit_loops=*/true);
-      sch->ComputeInline(data_pad);
-
-      int64_t max_threadblocks = 256;
-      int64_t max_threads_per_block = 1024;
-      auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024});
-      BindBlockThreadIdx(sch, data_pack, max_threadblocks, max_threads_per_block, get_factor);
-      return {sch};
-    });
-
-}  // namespace meta_schedule
-}  // namespace tvm
diff --git a/src/meta_schedule/space_generator/post_order_apply.cc b/src/meta_schedule/space_generator/post_order_apply.cc
index 8eb2760dc791..491af6e28f77 100644
--- a/src/meta_schedule/space_generator/post_order_apply.cc
+++ b/src/meta_schedule/space_generator/post_order_apply.cc
@@ -118,20 +118,11 @@ class PostOrderApplyNode : public SpaceGeneratorNode {
 
     std::vector<ScheduleAndUnvisitedBlocks> stack;
     Array<tir::Schedule> result{sch};
-    // Enumerate the schedule rules first because you can
-    // always concat multiple schedule rules as one
     Array<tir::BlockRV> all_blocks = BlockCollector::Collect(sch, f_block_filter_);
-    Array<Optional<ScheduleRule>> rules{NullOpt};
-    rules.insert(rules.end(), sch_rules.value().begin(), sch_rules.value().end());
-    for (Optional<ScheduleRule> sch_rule : rules) {
-      if (sch_rule.defined()) {
-        for (const tir::Schedule& sch : result) {
-          stack.emplace_back(sch, all_blocks);
-        }
-      } else {
-        for (const tir::Schedule& sch : result) {
-          stack.emplace_back(sch, Array<tir::BlockRV>{all_blocks.rbegin(), all_blocks.rend()});
-        }
+
+    for (ScheduleRule sch_rule : sch_rules.value()) {
+      for (const tir::Schedule& sch : result) {
+        stack.emplace_back(sch, all_blocks);
       }
       result.clear();
       while (!stack.empty()) {
@@ -150,33 +141,13 @@ class PostOrderApplyNode : public SpaceGeneratorNode {
           stack.emplace_back(sch, blocks);
           continue;
         }
-
-        Optional<String> ann = tir::GetAnn<String>(sch->GetSRef(block_rv), "schedule_rule");
-        const runtime::PackedFunc* custom_schedule_fn =
-            ann.defined() ? runtime::Registry::Get(ann.value()) : nullptr;
-        const bool has_schedule_rule = custom_schedule_fn != nullptr;
-
-        if (ann.defined() && ann.value() != "None" && !has_schedule_rule) {
-          LOG(WARNING) << "Custom schedule rule not found, ignoring schedule_rule annotation: "
-                       << ann.value();
-        }
-
-        if ((has_schedule_rule && sch_rule.defined()) ||
-            (!has_schedule_rule && !sch_rule.defined()) ||
-            (ann.defined() && ann.value() == "None")) {
-          stack.emplace_back(sch, blocks);
-          continue;
-        }
-
-        Array<tir::Schedule> applied{nullptr};
-        if (sch_rule.defined()) {
-          applied = sch_rule.value()->Apply(sch, /*block=*/block_rv);
-        } else {
-          ICHECK(custom_schedule_fn)
-              << "ValueError: Custom schedule rule not found: " << ann.value();
-          applied = (*custom_schedule_fn)(sch, block_rv);
+        if (!ScheduleRule::IsApplyCustomRule(sch_rule)) {
+          if (tir::GetAnn<String>(sch->GetSRef(block_rv), "schedule_rule").defined()) {
+            stack.emplace_back(sch, blocks);
+            continue;
+          }
         }
-
+        Array<tir::Schedule> applied = sch_rule->Apply(sch, /*block=*/block_rv);
         for (const tir::Schedule& sch : applied) {
           stack.emplace_back(sch, blocks);
         }
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index 37e0d1db5e98..80264516c4ce 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -520,27 +520,24 @@ inline bool IsGPUTarget(const std::string& target_name) {
  * \return The AutoInline schedule rule for the given target.
  */
 inline ScheduleRule GetDefaultAutoInline(const std::string& target_name) {
-  if (target_name == "llvm" || target_name == "hexagon") {
-    return ScheduleRule::AutoInline(
-        /*into_producer=*/false,
-        /*into_consumer=*/true,
-        /*inline_const_tensor=*/true,
-        /*disallow_if_then_else=*/true,
-        /*require_injective=*/true,
-        /*require_ordered=*/true,
-        /*disallow_op=*/Array<String>{"tir.exp"});
+  Array<ScheduleRule> rules{nullptr};
+  if (target_name == "llvm") {
+    rules = ScheduleRule::DefaultLLVM();
+  } else if (target_name == "hexagon") {
+    rules = ScheduleRule::DefaultHexagon();
   } else if (IsGPUTarget(target_name)) {
-    return ScheduleRule::AutoInline(
-        /*into_producer=*/true,
-        /*into_consumer=*/true,
-        /*inline_const_tensor=*/true,
-        /*disallow_if_then_else=*/false,
-        /*require_injective=*/false,
-        /*require_ordered=*/false,
-        /*disallow_op=*/Array<String>{});
+    rules = ScheduleRule::DefaultCUDA();
+  } else {
+    LOG(FATAL) << "ValueError: Unsupported target: " << target_name;
+  }
+  for (const ScheduleRule& rule : rules) {
+    if (rule->GetTypeKey() == "meta_schedule.AutoInline") {
+      return rule;
+    }
   }
-  LOG(FATAL) << "Unsupported target " << target_name;
-  return ScheduleRule(nullptr);
+  LOG(FATAL) << "ValueError: AutoInline rule is not found in the default rules for target: "
+             << target_name;
+  throw;
 }
 
 }  // namespace meta_schedule
diff --git a/src/target/tag.cc b/src/target/tag.cc
index 0747769b1e04..c9f24145814b 100644
--- a/src/target/tag.cc
+++ b/src/target/tag.cc
@@ -97,6 +97,7 @@ TVM_REGISTER_TARGET_TAG("nvidia/jetson-agx-xavier")
 #define TVM_REGISTER_CUDA_TAG(Name, Arch, SharedMem, RegPerBlock) \
   TVM_REGISTER_TARGET_TAG(Name).set_config({                      \
       {"kind", String("cuda")},                                   \
+      {"keys", Array<String>{"cuda", "gpu"}},                     \
       {"arch", String(Arch)},                                     \
       {"max_shared_memory_per_block", Integer(SharedMem)},        \
       {"max_threads_per_block", Integer(1024)},                   \
@@ -358,9 +359,11 @@ TVM_REGISTER_CUDA_TAG("nvidia/tegra-x1", "sm_53", 49152, 32768);
 
 #undef TVM_REGISTER_CUDA_TAG
 
-#define TVM_REGISTER_TAG_AWS_C5(Name, Cores, Arch) \
-  TVM_REGISTER_TARGET_TAG(Name).set_config(        \
-      {{"kind", String("llvm")}, {"mcpu", String(Arch)}, {"num-cores", Integer(Cores)}});
+#define TVM_REGISTER_TAG_AWS_C5(Name, Cores, Arch)                                 \
+  TVM_REGISTER_TARGET_TAG(Name).set_config({{"kind", String("llvm")},              \
+                                            {"keys", Array<String>{"x86", "cpu"}}, \
+                                            {"mcpu", String(Arch)},                \
+                                            {"num-cores", Integer(Cores)}});
 
 TVM_REGISTER_TAG_AWS_C5("aws/cpu/c5.large", 1, "skylake-avx512");
 TVM_REGISTER_TAG_AWS_C5("aws/cpu/c5.xlarge", 2, "skylake-avx512");
diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index c222de81f2ad..80da5a727926 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -30,6 +30,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
+#include <vector>
 
 #include "../../tir/ir/functor_common.h"
 #include "../../tir/transforms/ir_utils.h"
@@ -107,17 +108,21 @@ class LayoutFreePlaceholdersNormalizer : public StmtMutator {
 
   Stmt VisitStmt_(const BlockNode* _block) final {
     Block block = Downcast<Block>(StmtMutator::VisitStmt_(_block));
-    if (Optional<ObjectRef> ann = block->annotations.Get(topi_attr)) {
-      Array<Buffer> new_buffers;
+    BlockNode* n = block.CopyOnWrite();
+    if (Optional<ObjectRef> ann = n->annotations.Get(topi_attr)) {
       for (Buffer buffer : Downcast<Array<Buffer>>(ann)) {
         auto it = buffer2index_.find(buffer);
         if (it != buffer2index_.end()) {
           layout_free_buffer_indices_.insert(it->second);
-        } else {
-          new_buffers.push_back(buffer);
         }
       }
-      block.CopyOnWrite()->annotations.Set(topi_attr, new_buffers);
+      n->annotations.erase(topi_attr);
+    }
+    for (const String& attr : this->blocklist) {
+      auto it = n->annotations.find(attr);
+      if (it != n->annotations.end()) {
+        n->annotations.erase(attr);
+      }
     }
     return std::move(block);
   }
@@ -125,6 +130,8 @@ class LayoutFreePlaceholdersNormalizer : public StmtMutator {
   std::unordered_map<tir::Buffer, int, ObjectPtrHash, ObjectPtrEqual> buffer2index_;
   std::set<int> layout_free_buffer_indices_;
   String topi_attr = "layout_free_placeholders";
+  std::vector<String> blocklist = {"const_matrix", "auto_scheduler_simplify_const_tensor_indices",
+                                   "workload"};
 };
 
 BlockRealize GenerateBlockFromTensors(const te::ComputeOp& compute_op,
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index 6970b0ac06b5..b703c79c5d3a 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -22,16 +22,19 @@
 
 import numpy as np
 import pytest
-
 import tvm
 import tvm.testing
+from tvm import meta_schedule as ms
 from tvm import relay
 from tvm._ffi import register_func
+from tvm.contrib.hexagon.meta_schedule import (
+    get_hexagon_local_builder,
+    get_hexagon_rpc_runner,
+)
 from tvm.meta_schedule import postproc, schedule_rule
-from tvm.tir.tensor_intrin.hexagon import VRMPY_u8i8i32_INTRIN, VRMPY_u8u8i32_INTRIN
-from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
-from tvm import meta_schedule as ms
 from tvm.tir.schedule import BlockRV, Schedule
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8i8i32_INTRIN, VRMPY_u8u8i32_INTRIN
+
 from ..infrastructure import get_hexagon_target
 
 MODEL_JSON = "resnet50_int8.json"
@@ -44,6 +47,7 @@
 def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
     """Tune VRMPY with auto tensorization."""
     sch_rules = [
+        schedule_rule.ApplyCustomRule(),
         schedule_rule.AutoInline(
             into_producer=False,
             into_consumer=True,
@@ -269,7 +273,7 @@ def schedule_rule_conv2d_packed_8x8x32(sch: Schedule, conv2d_block: BlockRV):
         _schedule_packed_8x8x32_conv2d()(sch, conv2d_block)
         return [sch]
 
-    register_func("meta_schedule.conv2d_NCHWc_int8", schedule_rule_conv2d_packed_8x8x32)
+    register_func("meta_schedule.conv2d_NCHWc_int8.hexagon", schedule_rule_conv2d_packed_8x8x32)
 
     def schedule_conv2d_for_tune(sch: Schedule):
         _schedule_packed_8x8x32_conv2d()(sch)
diff --git a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cpu.py b/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cpu.py
deleted file mode 100644
index ac18bab81006..000000000000
--- a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cpu.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-
-import tvm
-from tvm import meta_schedule as ms
-from tvm.ir import IRModule
-from tvm.meta_schedule.testing.conv2d_winograd_cpu import conv2d_winograd_cpu
-from tvm.target import Target
-from tvm.tir.schedule import Schedule, Trace
-
-
-def _get_mod():
-    # pylint: disable=invalid-name
-    def inline(sch: Schedule):
-        b1 = sch.get_block(name="A")
-        b2 = sch.get_block(name="B")
-        sch.compute_inline(block=b1)
-        sch.compute_inline(block=b2)
-
-    def input_tile_data_pad(sch: Schedule):
-        b78 = sch.get_block(name="input_tile")
-        l80 = sch.sample_compute_location(block=b78, decision=4)
-        sch.compute_at(block=b78, loop=l80, preserve_unit_loops=True)
-
-        b81 = sch.get_block(name="data_pad")
-        l83 = sch.sample_compute_location(block=b81, decision=-2)
-        sch.compute_at(block=b81, loop=l83, preserve_unit_loops=True)
-
-    def data_pack(sch: Schedule):
-        b18 = sch.get_block(name="data_pack")
-        l19, l20, l21, l22, l23, l24 = sch.get_loops(block=b18)
-        sch.unroll(loop=l19)
-        sch.unroll(loop=l20)
-        v25, v26 = sch.sample_perfect_tile(
-            n=2,
-            loop=l21,
-            max_innermost_factor=64,
-            decision=[9, 1],
-        )
-        l27, l28 = sch.split(loop=l21, factors=[v25, v26])
-        v29, v30 = sch.sample_perfect_tile(
-            n=2,
-            loop=l22,
-            max_innermost_factor=64,
-            decision=[32, 4],
-        )
-        l31, l32 = sch.split(loop=l22, factors=[v29, v30])
-        sch.unroll(loop=l23)
-        sch.unroll(loop=l24)
-        sch.reorder(l27, l31, l28, l32, l19, l20, l23, l24)
-
-    def bgemm(sch: Schedule):
-        bgemm = sch.get_block(name="bgemm")
-        write_cache = sch.cache_write(
-            block=bgemm,
-            write_buffer_index=0,
-            storage_scope="global",
-        )
-        sch.annotate(
-            block_or_loop=bgemm,
-            ann_key="meta_schedule.tiling_structure",
-            ann_val="SSRSRS",
-        )
-        # b33, b34 = b34, b33
-        l35, l36, l37, l38, l39 = sch.get_loops(block=bgemm)
-        v40, v41, v42, v43 = sch.sample_perfect_tile(
-            n=4,
-            loop=l35,
-            max_innermost_factor=64,
-            decision=[1, 2, 3, 1],
-        )
-        l44, l45, l46, l47 = sch.split(loop=l35, factors=[v40, v41, v42, v43])
-        v48, v49, v50, v51 = sch.sample_perfect_tile(
-            n=4,
-            loop=l36,
-            max_innermost_factor=64,
-            decision=[1, 1, 1, 6],
-        )
-        l52, l53, l54, l55 = sch.split(loop=l36, factors=[v48, v49, v50, v51])
-        v56, v57, v58, v59 = sch.sample_perfect_tile(
-            n=4,
-            loop=l37,
-            max_innermost_factor=64,
-            decision=[1, 1, 1, 9],
-        )
-        l60, l61, l62, l63 = sch.split(loop=l37, factors=[v56, v57, v58, v59])
-        v64, v65, v66, v67 = sch.sample_perfect_tile(
-            n=4,
-            loop=l38,
-            max_innermost_factor=64,
-            decision=[2, 1, 16, 4],
-        )
-        l68, l69, l70, l71 = sch.split(loop=l38, factors=[v64, v65, v66, v67])
-        v72, v73 = sch.sample_perfect_tile(
-            n=2,
-            loop=l39,
-            max_innermost_factor=64,
-            decision=[16, 8],
-        )
-        l74, l75 = sch.split(loop=l39, factors=[v72, v73])
-        sch.reorder(
-            # fmt: off
-                l44, l52, l60, l68,
-                l45, l53, l61, l69,
-                l74,
-                l46, l54, l62, l70,
-                l75,
-                l47, l55, l63, l71,
-            # fmt: on
-        )
-        sch.reverse_compute_at(block=write_cache, loop=l69, preserve_unit_loops=True)
-
-    def inverse(sch: Schedule):
-        b3 = sch.get_block(name="inverse")
-        l4, l5, l6, l7, l8, l9 = sch.get_loops(block=b3)
-        sch.unroll(loop=l4)
-        sch.unroll(loop=l5)
-        v10, v11 = sch.sample_perfect_tile(
-            n=2,
-            loop=l6,
-            max_innermost_factor=64,
-            decision=[1, 9],
-        )
-        l12, l13 = sch.split(loop=l6, factors=[v10, v11])
-        v14, v15 = sch.sample_perfect_tile(
-            n=2,
-            loop=l7,
-            max_innermost_factor=64,
-            decision=[2, 64],
-        )
-        l16, l17 = sch.split(loop=l7, factors=[v14, v15])
-        sch.unroll(loop=l8)
-        sch.unroll(loop=l9)
-        sch.reorder(l12, l16, l13, l17, l4, l5, l8, l9)
-
-    # pylint: enable=invalid-name
-
-    sch = Schedule(mod=conv2d_winograd_cpu)
-    inline(sch)
-    data_pack(sch)
-    input_tile_data_pad(sch)
-    bgemm(sch)
-    inverse(sch)
-    return sch.mod
-
-
-def test_conv2d_winograd_cpu():
-    mod = conv2d_winograd_cpu
-    mod = IRModule({"main": mod})
-    target = Target("llvm --num-cores=16")
-    context = ms.TuneContext(
-        mod=mod,
-        target=target,
-        task_name="Custom Search Space Task",
-        space_generator=ms.space_generator.PostOrderApply(),
-    )
-    post_order_apply = context.space_generator
-    (sch,) = post_order_apply.generate_design_space(mod)
-    decisions = dict(
-        zip(
-            [i for i in sch.trace.insts[:-4] if i.kind.name.startswith("Sample")],
-            [
-                # data_pack
-                [9, 1],
-                [32, 4],
-                # input_tile
-                4,
-                # data_pad
-                -2,
-                # inverse
-                [1, 9],
-                [2, 64],
-                # bgemm
-                [1, 2, 3, 1],
-                [1, 1, 1, 6],
-                [1, 1, 1, 9],
-                [2, 1, 16, 4],
-                [16, 8],
-            ],
-        )
-    )
-    trace = Trace(sch.trace.insts[:-4], decisions=decisions)
-    sch = Schedule(mod=mod)
-    trace.apply_to_schedule(sch, remove_postproc=False)
-    answer = sch.mod
-    expected = _get_mod()
-    tvm.ir.assert_structural_equal(answer, expected)
-
-
-if __name__ == "__main__":
-    test_conv2d_winograd_cpu()
diff --git a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py b/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py
deleted file mode 100644
index 89a04a9464ce..000000000000
--- a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py
+++ /dev/null
@@ -1,328 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-
-import tvm
-from tvm import meta_schedule as ms
-from tvm.ir import IRModule
-from tvm.meta_schedule.testing.conv2d_winograd_cuda import conv2d_winograd_cuda
-from tvm.target import Target
-from tvm.tir.schedule import Schedule, Trace
-
-
-def _get_mod():
-    # pylint: disable=invalid-name
-    def inline(sch: Schedule):
-        b125 = sch.get_block(name="A")
-        sch.compute_inline(block=b125)
-        b126 = sch.get_block(name="B")
-        sch.compute_inline(block=b126)
-
-    def input_tile_data_pad(sch: Schedule):
-        b115 = sch.get_block(name="input_tile")
-        (b116,) = sch.get_consumers(block=b115)
-        _, _, _, l120, _, _, _, _ = sch.get_loops(block=b116)
-        sch.compute_at(block=b115, loop=l120, preserve_unit_loops=True)
-        sch.set_scope(block=b115, buffer_index=0, storage_scope="local")
-
-        b127 = sch.get_block(name="data_pad")
-        sch.compute_inline(block=b127)
-
-        b3 = sch.get_block(name="data_pack")
-        l25, l26, l27, l28, _, _, _, _ = sch.get_loops(block=b3)
-        l33 = sch.fuse(l25, l26, l27, l28)
-        v34 = sch.sample_categorical(
-            candidates=[32, 64, 128, 256, 512, 1024],
-            probs=[
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-            ],
-            decision=2,
-        )
-        l35, l36 = sch.split(loop=l33, factors=[None, v34])
-        sch.bind(loop=l35, thread_axis="blockIdx.x")
-        sch.bind(loop=l36, thread_axis="threadIdx.x")
-
-    def data_pack(sch: Schedule):
-        b16 = sch.get_block(name="data_pack")
-        l17, l18, l19, l20, l21, l22 = sch.get_loops(block=b16)
-        sch.unroll(loop=l17)
-        sch.unroll(loop=l18)
-        v23, v24 = sch.sample_perfect_tile(
-            n=2,
-            loop=l19,
-            max_innermost_factor=64,
-            decision=[3, 3],
-        )
-        l25, l26 = sch.split(loop=l19, factors=[v23, v24])
-        v27, v28 = sch.sample_perfect_tile(
-            n=2,
-            loop=l20,
-            max_innermost_factor=64,
-            decision=[64, 2],
-        )
-        l29, l30 = sch.split(loop=l20, factors=[v27, v28])
-        sch.unroll(loop=l21)
-        sch.unroll(loop=l22)
-        sch.reorder(l25, l29, l26, l30, l17, l18, l21, l22)
-
-    def bgemm(sch: Schedule):
-        b31 = sch.get_block(name="bgemm")
-        sch.annotate(
-            block_or_loop=b31,
-            ann_key="meta_schedule.tiling_structure",
-            ann_val="SSSRRSRS",
-        )
-        sch.annotate(
-            block_or_loop=b31,
-            ann_key="meta_schedule.thread_extent_low_inclusive",
-            ann_val=32,
-        )
-        sch.annotate(
-            block_or_loop=b31,
-            ann_key="meta_schedule.thread_extent_high_inclusive",
-            ann_val=1024,
-        )
-        b32 = sch.cache_write(block=b31, write_buffer_index=0, storage_scope="local")
-        b31, b32 = b32, b31
-        l33, l34, l35, l36, l37 = sch.get_loops(block=b32)
-        v38, v39, v40, v41, v42 = sch.sample_perfect_tile(
-            n=5,
-            loop=l33,
-            max_innermost_factor=64,
-            decision=[1, 1, 1, 1, 6],
-        )
-        l43, l44, l45, l46, l47 = sch.split(loop=l33, factors=[v38, v39, v40, v41, v42])
-        v48, v49, v50, v51, v52 = sch.sample_perfect_tile(
-            n=5,
-            loop=l34,
-            max_innermost_factor=64,
-            decision=[1, 1, 1, 3, 2],
-        )
-        l53, l54, l55, l56, l57 = sch.split(loop=l34, factors=[v48, v49, v50, v51, v52])
-        v58, v59, v60, v61, v62 = sch.sample_perfect_tile(
-            n=5,
-            loop=l35,
-            max_innermost_factor=64,
-            decision=[3, 1, 1, 1, 3],
-        )
-        l63, l64, l65, l66, l67 = sch.split(loop=l35, factors=[v58, v59, v60, v61, v62])
-        v68, v69, v70, v71, v72 = sch.sample_perfect_tile(
-            n=5,
-            loop=l36,
-            max_innermost_factor=64,
-            decision=[4, 2, 1, 4, 4],
-        )
-        l73, l74, l75, l76, l77 = sch.split(loop=l36, factors=[v68, v69, v70, v71, v72])
-        v78, v79, v80 = sch.sample_perfect_tile(
-            n=3,
-            loop=l37,
-            max_innermost_factor=64,
-            decision=[32, 1, 4],
-        )
-        l81, l82, l83 = sch.split(loop=l37, factors=[v78, v79, v80])
-        sch.reorder(
-            # fmt: off
-            l43, l53, l63, l73,
-            l44, l54, l64, l74,
-            l45, l55, l65, l75,
-            l81,
-            l82,
-            l46, l56, l66, l76,
-            l83,
-            l47, l57, l67, l77,
-            # fmt: on
-        )
-        l84 = sch.fuse(l43, l53, l63, l73)
-        sch.bind(loop=l84, thread_axis="blockIdx.x")
-        l85 = sch.fuse(l44, l54, l64, l74)
-        sch.bind(loop=l85, thread_axis="vthread.x")
-        l86 = sch.fuse(l45, l55, l65, l75)
-        sch.bind(loop=l86, thread_axis="threadIdx.x")
-
-        b87 = sch.cache_read(block=b32, read_buffer_index=1, storage_scope="shared")
-        sch.compute_at(block=b87, loop=l81, preserve_unit_loops=True)
-        _, _, _, _, l92, l93, l94, l95 = sch.get_loops(block=b87)
-        sch.fuse(l92, l93, l94, l95)
-        v97 = sch.sample_categorical(
-            candidates=[1, 2, 3, 4],
-            probs=[0.25, 0.25, 0.25, 0.25],
-            decision=1,
-        )
-        sch.annotate(
-            block_or_loop=b87,
-            ann_key="meta_schedule.cooperative_fetch",
-            ann_val=v97,
-        )
-
-        b101 = sch.cache_read(block=b32, read_buffer_index=2, storage_scope="shared")
-        sch.compute_at(block=b101, loop=l81, preserve_unit_loops=True)
-        _, _, _, _, l106, l107, l108, l109 = sch.get_loops(block=b101)
-        sch.fuse(l106, l107, l108, l109)
-        v110 = sch.sample_categorical(
-            candidates=[1, 2, 3, 4],
-            probs=[0.25, 0.25, 0.25, 0.25],
-            decision=1,
-        )
-        sch.annotate(
-            block_or_loop=b101,
-            ann_key="meta_schedule.cooperative_fetch",
-            ann_val=v110,
-        )
-
-        sch.reverse_compute_at(block=b31, loop=l86, preserve_unit_loops=True)
-
-    def inverse(sch: Schedule):
-        b1 = sch.get_block(name="inverse")
-        l2, l3, l4, l5, l6, l7 = sch.get_loops(block=b1)
-        sch.unroll(loop=l2)
-        sch.unroll(loop=l3)
-        v8, v9 = sch.sample_perfect_tile(
-            n=2,
-            loop=l4,
-            max_innermost_factor=64,
-            decision=[3, 3],
-        )
-        l10, l11 = sch.split(loop=l4, factors=[v8, v9])
-        v12, v13 = sch.sample_perfect_tile(
-            n=2,
-            loop=l5,
-            max_innermost_factor=64,
-            decision=[2, 64],
-        )
-        l14, l15 = sch.split(loop=l5, factors=[v12, v13])
-        sch.unroll(loop=l6)
-        sch.unroll(loop=l7)
-        sch.reorder(l10, l14, l11, l15, l2, l3, l6, l7)
-        l59 = sch.fuse(l10, l14, l11, l15)
-        v60 = sch.sample_categorical(
-            candidates=[32, 64, 128, 256, 512, 1024],
-            probs=[
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-            ],
-            decision=2,
-        )
-        l61, l62 = sch.split(loop=l59, factors=[None, v60])
-        sch.bind(loop=l61, thread_axis="blockIdx.x")
-        sch.bind(loop=l62, thread_axis="threadIdx.x")
-
-    def conv2d(sch: Schedule):
-        b7 = sch.get_block(name="conv2d_winograd")
-        l141, l142, l143, l144 = sch.get_loops(block=b7)
-        l145 = sch.fuse(l141, l142, l143, l144)
-        v146 = sch.sample_categorical(
-            candidates=[32, 64, 128, 256, 512, 1024],
-            probs=[
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-                0.16666666666666666,
-            ],
-            decision=2,
-        )
-        l147, l148 = sch.split(loop=l145, factors=[None, v146])
-        sch.bind(loop=l147, thread_axis="blockIdx.x")
-        sch.bind(loop=l148, thread_axis="threadIdx.x")
-
-    def root_anno(sch: Schedule):
-        b8 = sch.get_block(name="root", func_name="main")
-        v140 = sch.sample_categorical(
-            candidates=[0, 16, 64, 512, 1024],
-            probs=[
-                0.20000000000000001,
-                0.20000000000000001,
-                0.20000000000000001,
-                0.20000000000000001,
-                0.20000000000000001,
-            ],
-            decision=2,
-        )
-        sch.annotate(block_or_loop=b8, ann_key="meta_schedule.unroll_explicit", ann_val=v140)
-
-    # pylint: enable=invalid-name
-
-    sch = Schedule(mod=conv2d_winograd_cuda)
-    inline(sch)
-    data_pack(sch)
-    input_tile_data_pad(sch)
-    bgemm(sch)
-    inverse(sch)
-    conv2d(sch)
-    root_anno(sch)
-
-    return sch.mod
-
-
-def test_conv2d_winograd_cuda():
-    mod = conv2d_winograd_cuda
-    mod = IRModule({"main": mod})
-    context = ms.TuneContext(
-        mod=mod,
-        target=Target("nvidia/geforce-rtx-3090", host="llvm"),
-        task_name="Custom Search Space Task",
-        space_generator=ms.space_generator.PostOrderApply(),
-    )
-    post_order_apply = context.space_generator
-    (sch,) = post_order_apply.generate_design_space(mod)
-    decisions = dict(
-        zip(
-            [i for i in sch.trace.insts if i.kind.name.startswith("Sample")],
-            [
-                # data_pack
-                [3, 3],
-                [64, 2],
-                2,
-                # inverse
-                [3, 3],
-                [2, 64],
-                2,
-                # bgemm
-                [1, 1, 1, 1, 6],
-                [1, 1, 1, 3, 2],
-                [3, 1, 1, 1, 3],
-                [4, 2, 1, 4, 4],
-                [32, 1, 4],
-                1,
-                1,
-                # root anno
-                2,
-                # conv2d
-                2,
-            ],
-        )
-    )
-    trace = Trace(sch.trace.insts, decisions=decisions)
-    sch = Schedule(mod=mod)
-    trace.apply_to_schedule(sch, remove_postproc=False)
-    answer = sch.mod
-    expected = _get_mod()
-    tvm.ir.assert_structural_equal(answer, expected)
-
-
-if __name__ == "__main__":
-    test_conv2d_winograd_cuda()
diff --git a/tests/python/unittest/test_meta_schedule_post_order_apply.py b/tests/python/unittest/test_meta_schedule_post_order_apply.py
index 9026feb9e08e..c1d2dc3d0788 100644
--- a/tests/python/unittest/test_meta_schedule_post_order_apply.py
+++ b/tests/python/unittest/test_meta_schedule_post_order_apply.py
@@ -122,23 +122,6 @@ def main(a: T.handle, d: T.handle) -> None:
                 D[vi, vj] = (B[vi, vj] + T.float32(3)) * T.float32(5)
 
 
-@tvm.script.ir_module
-class MatmulCustomized:
-    @T.prim_func
-    def main(a: T.handle, b: T.handle, c: T.handle) -> None:
-        T.func_attr({"global_symbol": "main"})
-        A = T.match_buffer(a, (1024, 1024), "float32")
-        B = T.match_buffer(b, (1024, 1024), "float32")
-        C = T.match_buffer(c, (1024, 1024), "float32")
-        with T.block("root"):
-            for i, j, k in T.grid(1024, 1024, 1024):
-                with T.block("matmul"):
-                    T.block_attr({"schedule_rule": "tvm.meta_schedule.test.custom_search_space"})
-                    vi, vj, vk = T.axis.remap("SSR", [i, j, k])
-                    with T.init():
-                        C[vi, vj] = 0.0
-                    C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
-
 # fmt: on
 # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument
 
@@ -382,32 +365,6 @@ def correct_trace(a, b, c, d):
         )
 
 
-def test_meta_schedule_custom_search_space():
-    mod = MatmulCustomized
-    context = TuneContext(
-        mod=mod,
-        target=Target("llvm"),
-        task_name="Custom Search Space Task",
-        space_generator=PostOrderApply(
-            sch_rules=[],
-            postprocs=[],
-            mutator_probs={},
-        ),
-    )
-    post_order_apply = context.space_generator
-    post_order_apply.generate_design_space(mod)
-    called = False
-
-    def custom_search_space_func(sch: Schedule, _: BlockRV) -> List[Schedule]:
-        nonlocal called
-        called = True
-        return [sch]
-
-    register_func("tvm.meta_schedule.test.custom_search_space", custom_search_space_func)
-    post_order_apply.generate_design_space(mod)
-    assert called
-
-
 def test_target_blocks_search_space():
     # Test that specific blocks of trinity matmul can be targeted.
     def filter_fn(block, target_names) -> bool:
diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py
index c689a15c56b2..bf302cd0e5bf 100644
--- a/tests/python/unittest/test_meta_schedule_relay_integration.py
+++ b/tests/python/unittest/test_meta_schedule_relay_integration.py
@@ -15,8 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 """Integration test for MetaSchedule"""
-from typing import List
 import tempfile
+from typing import List
+
 import numpy as np
 import pytest
 import tvm
@@ -27,7 +28,7 @@
 from tvm._ffi import register_func
 from tvm.contrib import graph_executor
 from tvm.ir.transform import PassContext
-from tvm.meta_schedule.database import Workload, TuningRecord
+from tvm.meta_schedule.database import TuningRecord, Workload
 from tvm.meta_schedule.testing.relay_workload import get_network
 from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base
 from tvm.meta_schedule.tune_context import _normalize_mod
@@ -333,7 +334,6 @@ def _test(mod, params, target):
 
             assert "schedule_rule" in annotations
             assert "vnni" in annotations["schedule_rule"]
-        ...
 
     mod, params, _ = load_quantized_bert_base(batch_size=1, seq_len=128)
     _test(mod, params, target="llvm -mcpu=cascadelake")
@@ -445,7 +445,6 @@ def main(placeholder: T.Buffer[(1, 1, 16, 16, 3), "float32"], placeholder_1: T.B
                     n, oc_chunk, oh, ow, oc_block, ic, kh, kw = T.axis.remap("SSSSSRRR", [i0, i1, i2, i3, i4, i5, i6, i7])
                     T.reads(data_pad[n, ic // 3, oh + kh, ow + kw, ic % 3], placeholder_1[oc_chunk, ic // 3, kh, kw, ic % 3, oc_block]) # type: ignore
                     T.writes(conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block])
-                    T.block_attr({"workload":["conv2d_NCHWc.x86", ["TENSOR", [1, 1, 16, 16, 3], "float32"], ["TENSOR", [2, 1, 5, 5, 3, 4], "float32"], [1, 1], [2, 2, 2, 2], [1, 1], "NCHW3c", "NCHW4c", "float32"]})
                     with T.init():
                         conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = T.float32(0)
                     conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] + data_pad[n, ic // 3, oh + kh, ow + kw, ic % 3] * placeholder_1[oc_chunk, ic // 3, kh, kw, ic % 3, oc_block] # type: ignore
diff --git a/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py b/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py
new file mode 100644
index 000000000000..78b75d592ed4
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py
@@ -0,0 +1,168 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for MetaSchedule search space on CPU"""
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+    print_sketches,
+)
+from tvm.meta_schedule.testing.te_workload import create_te_workload
+from tvm.script import tir as T
+from tvm.target import Target
+
+
+def _target():
+    return Target("aws/cpu/c5.9xlarge")
+
+
+def _design_space(mod):
+    return generate_design_space(
+        kind="llvm",
+        mod=mod,
+        target=_target(),
+        types=ms.ScheduleRule,
+    )
+
+
+def test_cpu_nhwc():
+    # fmt: off
+    @T.prim_func
+    def cpu_nhwc_0(X: T.Buffer[(1, 14, 14, 128), "float32"], W: T.Buffer[(6, 6, 128, 128), "float32"], conv2d_winograd: T.Buffer[(1, 12, 12, 128), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
+        # body
+        with T.block("root"):
+            T.reads()
+            T.writes()
+            T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":64, "meta_schedule.vectorize":64})
+            data_pad = T.alloc_buffer([1, 16, 16, 128], dtype="float32")
+            input_tile = T.alloc_buffer([6, 6, 9, 128], dtype="float32")
+            data_pack = T.alloc_buffer([6, 6, 9, 128], dtype="float32")
+            bgemm = T.alloc_buffer([6, 6, 9, 128], dtype="float32")
+            inverse = T.alloc_buffer([4, 4, 9, 128], dtype="float32")
+            bgemm_global = T.alloc_buffer([6, 6, 9, 128], dtype="float32")
+            for i2_0 in T.serial(9):
+                for ax0, ax1, ax2, ax3 in T.grid(1, 6, 6, 128):
+                    with T.block("data_pad"):
+                        i0 = T.axis.spatial(1, ax0)
+                        i1 = T.axis.spatial(16, i2_0 // 3 * 4 + ax1)
+                        i2 = T.axis.spatial(16, i2_0 % 3 * 4 + ax2)
+                        i3 = T.axis.spatial(128, ax3)
+                        T.reads(X[i0, i1, i2, i3])
+                        T.writes(data_pad[i0, i1, i2, i3])
+                        T.block_attr({"schedule_rule":"None"})
+                        data_pad[i0, i1, i2, i3] = T.if_then_else(0 <= i1 and i1 < 14 and 0 <= i2 and i2 < 14, X[i0, i1, i2, i3], T.float32(0), dtype="float32")
+                for i3_0 in T.serial(2):
+                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 64):
+                        with T.block("input_tile"):
+                            eps, nu = T.axis.remap("SS", [ax0, ax1])
+                            p = T.axis.spatial(9, i2_0 + ax2)
+                            ci = T.axis.spatial(128, i3_0 * 64 + ax3)
+                            T.reads(data_pad[p // 9, p % 9 // 3 * 4 + eps, p % 3 * 4 + nu, ci])
+                            T.writes(input_tile[eps, nu, p, ci])
+                            T.block_attr({"schedule_rule":"None"})
+                            input_tile[eps, nu, p, ci] = data_pad[p // 9, p % 9 // 3 * 4 + eps, p % 3 * 4 + nu, ci]
+                    for i2_1, i3_1 in T.grid(1, 64):
+                        for i0 in T.unroll(6):
+                            for i1 in T.unroll(6):
+                                for i4 in T.unroll(6):
+                                    for i5 in T.unroll(6):
+                                        with T.block("data_pack"):
+                                            eps, nu = T.axis.remap("SS", [i0, i1])
+                                            p = T.axis.spatial(9, i2_0 + i2_1)
+                                            ci = T.axis.spatial(128, i3_0 * 64 + i3_1)
+                                            r_a, r_b = T.axis.remap("RR", [i4, i5])
+                                            T.reads(input_tile[r_a, r_b, p, ci])
+                                            T.writes(data_pack[eps, nu, p, ci])
+                                            T.block_attr({"schedule_rule":"conv2d_nhwc_winograd_data_pack"})
+                                            with T.init():
+                                                data_pack[eps, nu, p, ci] = T.float32(0)
+                                            data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_b % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_b % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_b % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_b % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_b % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_b % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_b % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_b % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_b % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_b % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_b % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_b % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_b % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_b % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
+            for i0_0, i1_0, i2_0, i3_0, i0_1, i1_1, i2_1, i3_1 in T.grid(3, 2, 3, 1, 1, 1, 1, 1):
+                for i4_0, i0_2, i1_2, i2_2, i3_2, i4_1, i0_3, i1_3, i2_3, i3_3 in T.grid(32, 1, 1, 1, 2, 4, 2, 3, 3, 64):
+                    with T.block("bgemm"):
+                        eps = T.axis.spatial(6, i0_0 * 2 + i0_1 * 2 + i0_2 * 2 + i0_3)
+                        nu = T.axis.spatial(6, i1_0 * 3 + i1_1 * 3 + i1_2 * 3 + i1_3)
+                        p = T.axis.spatial(9, i2_0 * 3 + i2_1 * 3 + i2_2 * 3 + i2_3)
+                        co = T.axis.spatial(128, i3_0 * 128 + i3_1 * 128 + i3_2 * 64 + i3_3)
+                        ci = T.axis.reduce(128, i4_0 * 4 + i4_1)
+                        T.reads(data_pack[eps, nu, p, ci], W[eps, nu, co, ci])
+                        T.writes(bgemm_global[eps, nu, p, co])
+                        T.block_attr({"meta_schedule.tiling_structure":"SSRSRS", "meta_schedule.write_cache_level":[2]})
+                        with T.init():
+                            bgemm_global[eps, nu, p, co] = T.float32(0)
+                        bgemm_global[eps, nu, p, co] = bgemm_global[eps, nu, p, co] + data_pack[eps, nu, p, ci] * W[eps, nu, co, ci]
+                for ax0, ax1, ax2, ax3 in T.grid(2, 3, 3, 128):
+                    with T.block("bgemm_global"):
+                        v0 = T.axis.spatial(6, i0_0 * 2 + ax0)
+                        v1 = T.axis.spatial(6, i1_0 * 3 + ax1)
+                        v2 = T.axis.spatial(9, i2_0 * 3 + ax2)
+                        v3 = T.axis.spatial(128, ax3)
+                        T.reads(bgemm_global[v0, v1, v2, v3])
+                        T.writes(bgemm[v0, v1, v2, v3])
+                        bgemm[v0, v1, v2, v3] = bgemm_global[v0, v1, v2, v3]
+            for i2_0, i3_0, i2_1, i3_1 in T.grid(3, 8, 3, 16):
+                for i0 in T.unroll(4):
+                    for i1 in T.unroll(4):
+                        for i4 in T.unroll(6):
+                            for i5 in T.unroll(6):
+                                with T.block("inverse"):
+                                    vh, vw = T.axis.remap("SS", [i0, i1])
+                                    p = T.axis.spatial(9, i2_0 * 3 + i2_1)
+                                    co = T.axis.spatial(128, i3_0 * 16 + i3_1)
+                                    r_a, r_b = T.axis.remap("RR", [i4, i5])
+                                    T.reads(bgemm[r_a, r_b, p, co])
+                                    T.writes(inverse[vh, vw, p, co])
+                                    T.block_attr({"schedule_rule":"conv2d_nhwc_winograd_inverse"})
+                                    with T.init():
+                                        inverse[vh, vw, p, co] = T.float32(0)
+                                    inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 6 == 5 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 5 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 0, T.float32(0), T.Select(r_a % 6 == 4 and vh % 4 == 3, T.float32(-8), T.Select(r_a % 6 == 4 and vh % 4 == 2, T.float32(4), T.Select(r_a % 6 == 4 and vh % 4 == 1, T.float32(-2), T.Select(r_a % 6 == 4 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 3 and vh % 4 == 3, T.float32(0.125), T.Select(r_a % 6 == 3 and vh % 4 == 2, T.float32(0.25), T.Select(r_a % 6 == 3 and vh % 4 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 1, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 3, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 1, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 0 and vh % 4 == 3, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 5 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 0, T.float32(0), T.Select(r_b % 6 == 4 and vw % 4 == 3, T.float32(-8), T.Select(r_b % 6 == 4 and vw % 4 == 2, T.float32(4), T.Select(r_b % 6 == 4 and vw % 4 == 1, T.float32(-2), T.Select(r_b % 6 == 4 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 3 and vw % 4 == 3, T.float32(0.125), T.Select(r_b % 6 == 3 and vw % 4 == 2, T.float32(0.25), T.Select(r_b % 6 == 3 and vw % 4 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 1, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 3, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 1, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 0 and vw % 4 == 3, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
+            for i0, i1, i2, i3 in T.grid(1, 12, 12, 128):
+                with T.block("conv2d_winograd"):
+                    n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                    T.reads(inverse[h % 4, w % 4, n * 9 + h // 4 * 3 + w // 4, co])
+                    T.writes(conv2d_winograd[n, h, w, co])
+                    conv2d_winograd[n, h, w, co] = inverse[h % 4, w % 4, n * 9 + h // 4 * 3 + w // 4, co]
+    # fmt: on
+    decision_0 = [
+        ("SamplePerfectTile", [3, 3]),
+        ("SamplePerfectTile", [8, 16]),
+        ("SamplePerfectTile", [9, 1]),
+        ("SamplePerfectTile", [2, 64]),
+        ("SampleComputeLocation", 1),
+        ("SampleComputeLocation", 0),
+        ("SamplePerfectTile", [3, 1, 1, 2]),
+        ("SamplePerfectTile", [2, 1, 1, 3]),
+        ("SamplePerfectTile", [3, 1, 1, 3]),
+        ("SamplePerfectTile", [1, 1, 2, 64]),
+        ("SamplePerfectTile", [32, 4]),
+        ("SampleCategorical", 2),
+    ]
+    with _target():
+        mod = create_te_workload("C2D_WIN_NHWC", 0)
+    actual = _design_space(mod)
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[cpu_nhwc_0],
+        expected_decisions=[decision_0],
+    )
+
+
+if __name__ == "__main__":
+    test_cpu_nhwc()
diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py
index f0f6e91ea655..324d8a9ec4f8 100644
--- a/tests/python/unittest/test_meta_schedule_space_cuda.py
+++ b/tests/python/unittest/test_meta_schedule_space_cuda.py
@@ -15,9 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Tests for MetaSchedule search space on CUDA"""
-from tvm import autotvm
 from tvm import meta_schedule as ms
-from tvm import te, topi
 from tvm.meta_schedule.testing.space_generation import (
     check_sketches,
     generate_design_space,
@@ -41,27 +39,6 @@ def _design_space(mod):
     )
 
 
-def _conv2d_winograd_nchw():
-    data = te.placeholder((1, 64, 224, 224), name="data", dtype="float32")
-    kernel = te.placeholder((6, 6, 64, 64), name="kernel", dtype="float32")
-    return te.create_prim_func(
-        [
-            data,
-            kernel,
-            topi.cuda.conv2d_winograd.winograd_cuda(
-                cfg=autotvm.ConfigSpace(),
-                data=data,
-                kernel=kernel,
-                strides=(1, 1),
-                padding=(1, 1),
-                dilation=(1, 1),
-                out_dtype="float32",
-                pre_computed=True,
-            ),
-        ]
-    )
-
-
 def test_cuda_c1d():
     # fmt: off
     @T.prim_func
@@ -1272,151 +1249,6 @@ def tbg_0(query: T.Buffer[(1, 128, 12, 64), "float32"], value: T.Buffer[(1, 128,
     )
 
 
-def test_cuda_winograd_nchw_conv2d():
-    # fmt: off
-    @T.prim_func
-    def winograd_nchw_conv2d(data: T.Buffer[(1, 64, 224, 224), "float32"], kernel: T.Buffer[(6, 6, 64, 64), "float32"], output: T.Buffer[(1, 64, 224, 224), "float32"]) -> None:
-        # function attr dict
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        # body
-        with T.block("root"):
-            T.reads()
-            T.writes()
-            T.block_attr({"meta_schedule.unroll_explicit":1024})
-            data_pack = T.alloc_buffer([6, 6, 64, 3136], dtype="float32")
-            bgemm = T.alloc_buffer([6, 6, 64, 3136], dtype="float32")
-            inverse_local = T.alloc_buffer([64, 3136, 4, 4], dtype="float32", scope="local")
-            data_pack_local = T.alloc_buffer([6, 6, 64, 3136], dtype="float32", scope="local")
-            d_local = T.alloc_buffer([64, 3136, 6, 6], dtype="float32", scope="local")
-            bgemm_local = T.alloc_buffer([6, 6, 64, 3136], dtype="float32", scope="local")
-            kernel_shared = T.alloc_buffer([6, 6, 64, 64], dtype="float32", scope="shared")
-            data_pack_shared = T.alloc_buffer([6, 6, 64, 3136], dtype="float32", scope="shared")
-            for i2_i3_0_fused_i3_1_fused_0 in T.thread_binding(3136, thread="blockIdx.x"):
-                for i2_i3_0_fused_i3_1_fused_1 in T.thread_binding(64, thread="threadIdx.x"):
-                    for ax0, ax1, ax2, ax3 in T.grid(1, 1, 6, 6):
-                        with T.block("d_local"):
-                            v0 = T.axis.spatial(64, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) // 3136 + ax0)
-                            v1 = T.axis.spatial(3136, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 3136 // 7 * 7 + (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 7 + ax1)
-                            v2, v3 = T.axis.remap("SS", [ax2, ax3])
-                            T.reads(data[v1 // 3136, v0, v1 % 3136 // 56 * 4 + v2 - 1, v1 % 56 * 4 + v3 - 1])
-                            T.writes(d_local[v0, v1, v2, v3])
-                            d_local[v0, v1, v2, v3] = T.if_then_else(1 <= v1 % 3136 // 56 * 4 + v2 and v1 % 3136 // 56 * 4 + v2 < 225 and 1 <= v1 % 56 * 4 + v3 and v1 % 56 * 4 + v3 < 225, data[v1 // 3136, v0, v1 % 3136 // 56 * 4 + v2 - 1, v1 % 56 * 4 + v3 - 1], T.float32(0), dtype="float32")
-                    for i0 in T.unroll(6):
-                        for i1 in T.unroll(6):
-                            for i4 in T.unroll(6):
-                                for i5 in T.unroll(6):
-                                    with T.block("data_pack"):
-                                        eps, nu = T.axis.remap("SS", [i0, i1])
-                                        ci = T.axis.spatial(64, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) // 3136)
-                                        p = T.axis.spatial(3136, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 3136 // 7 * 7 + (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 7)
-                                        r_a, r_a_1 = T.axis.remap("RR", [i4, i5])
-                                        T.reads(d_local[ci, p, r_a, r_a_1])
-                                        T.writes(data_pack_local[eps, nu, ci, p])
-                                        T.block_attr({"schedule_rule":"meta_schedule.winograd_data_pack.nchw.cuda"})
-                                        with T.init():
-                                            data_pack_local[eps, nu, ci, p] = T.float32(0)
-                                        data_pack_local[eps, nu, ci, p] = data_pack_local[eps, nu, ci, p] + d_local[ci, p, r_a, r_a_1] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_a_1 % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_a_1 % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_a_1 % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_a_1 % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_a_1 % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_a_1 % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_a_1 % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_a_1 % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_a_1 % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_a_1 % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_a_1 % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_a_1 % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_a_1 % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 1, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
-                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1):
-                        with T.block("data_pack_local"):
-                            v0, v1 = T.axis.remap("SS", [ax0, ax1])
-                            v2 = T.axis.spatial(64, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) // 3136 + ax2)
-                            v3 = T.axis.spatial(3136, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 3136 // 7 * 7 + (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 7 + ax3)
-                            T.reads(data_pack_local[v0, v1, v2, v3])
-                            T.writes(data_pack[v0, v1, v2, v3])
-                            data_pack[v0, v1, v2, v3] = data_pack_local[v0, v1, v2, v3]
-            for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(96, thread="blockIdx.x"):
-                for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(12, thread="vthread.x"):
-                    for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(224, thread="threadIdx.x"):
-                        for i4_0 in T.serial(32):
-                            for ax0_ax1_ax2_ax3_fused in T.serial(192):
-                                with T.block("kernel_shared"):
-                                    v0 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused // 32)
-                                    v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 16)
-                                    v2 = T.axis.spatial(64, i4_0 * 2 + ax0_ax1_ax2_ax3_fused % 32 // 16)
-                                    v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 16 // 4 * 16 + ax0_ax1_ax2_ax3_fused % 16)
-                                    T.reads(kernel[v0, v1, v2, v3])
-                                    T.writes(kernel_shared[v0, v1, v2, v3])
-                                    T.block_attr({"meta_schedule.cooperative_fetch":1})
-                                    kernel_shared[v0, v1, v2, v3] = kernel[v0, v1, v2, v3]
-                            for ax0_ax1_ax2_ax3_fused in T.serial(9408):
-                                with T.block("data_pack_shared"):
-                                    v0 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused // 1568)
-                                    v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 16)
-                                    v2 = T.axis.spatial(64, i4_0 * 2 + ax0_ax1_ax2_ax3_fused % 1568 // 784)
-                                    v3 = T.axis.spatial(3136, i0_0_i1_0_i2_0_i3_0_fused % 4 * 784 + ax0_ax1_ax2_ax3_fused % 784)
-                                    T.reads(data_pack[v0, v1, v2, v3])
-                                    T.writes(data_pack_shared[v0, v1, v2, v3])
-                                    T.block_attr({"meta_schedule.cooperative_fetch":3})
-                                    data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3]
-                            for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(2, 1, 1, 1, 14, 1, 1, 1, 2, 1):
-                                with T.block("bgemm"):
-                                    eps = T.axis.spatial(6, i0_4 + i0_1_i1_1_i2_1_i3_1_fused // 4 * 2 + i0_2_i1_2_i2_2_i3_2_fused // 112 + i0_3)
-                                    nu = T.axis.spatial(6, i1_4 + i0_0_i1_0_i2_0_i3_0_fused // 16 + i1_3)
-                                    co = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 16 // 4 * 16 + i0_1_i1_1_i2_1_i3_1_fused % 4 * 4 + i0_2_i1_2_i2_2_i3_2_fused % 112 // 56 * 2 + i2_3 * 2 + i2_4)
-                                    p = T.axis.spatial(3136, i3_4 + i0_0_i1_0_i2_0_i3_0_fused % 4 * 784 + i0_2_i1_2_i2_2_i3_2_fused % 56 * 14 + i3_3)
-                                    ci = T.axis.reduce(64, i4_0 * 2 + i4_1 + i4_2)
-                                    T.reads(kernel_shared[eps, nu, ci, co], data_pack_shared[eps, nu, ci, p])
-                                    T.writes(bgemm_local[eps, nu, co, p])
-                                    T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"})
-                                    with T.init():
-                                        bgemm_local[eps, nu, co, p] = T.float32(0)
-                                    bgemm_local[eps, nu, co, p] = bgemm_local[eps, nu, co, p] + kernel_shared[eps, nu, ci, co] * data_pack_shared[eps, nu, ci, p]
-                        for ax0, ax1, ax2, ax3 in T.grid(1, 1, 2, 14):
-                            with T.block("bgemm_local"):
-                                v0 = T.axis.spatial(6, i0_1_i1_1_i2_1_i3_1_fused // 4 * 2 + i0_2_i1_2_i2_2_i3_2_fused // 112 + ax0)
-                                v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 16 + ax1)
-                                v2 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 16 // 4 * 16 + i0_1_i1_1_i2_1_i3_1_fused % 4 * 4 + i0_2_i1_2_i2_2_i3_2_fused % 112 // 56 * 2 + ax2)
-                                v3 = T.axis.spatial(3136, i0_0_i1_0_i2_0_i3_0_fused % 4 * 784 + i0_2_i1_2_i2_2_i3_2_fused % 56 * 14 + ax3)
-                                T.reads(bgemm_local[v0, v1, v2, v3])
-                                T.writes(bgemm[v0, v1, v2, v3])
-                                bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3]
-            for i0_i1_i2_0_i3_0_fused_fused_0 in T.thread_binding(6272, thread="blockIdx.x"):
-                for i0_i1_i2_0_i3_0_fused_fused_1 in T.thread_binding(32, thread="threadIdx.x"):
-                    for ax0, ax1, ax2, ax3, ax4, ax5 in T.grid(1, 1, 4, 4, 6, 6):
-                        with T.block("inverse"):
-                            co = T.axis.spatial(64, ax0 + (i0_i1_i2_0_i3_0_fused_fused_0 * 32 + i0_i1_i2_0_i3_0_fused_fused_1) // 3136)
-                            p = T.axis.spatial(3136, ax1 + (i0_i1_i2_0_i3_0_fused_fused_0 * 32 + i0_i1_i2_0_i3_0_fused_fused_1) % 3136 // 56 * 56 + (i0_i1_i2_0_i3_0_fused_fused_0 * 32 + i0_i1_i2_0_i3_0_fused_fused_1) % 56)
-                            vh, vw, r_a_2, r_a_3 = T.axis.remap("SSRR", [ax2, ax3, ax4, ax5])
-                            T.reads(bgemm[r_a_2, r_a_3, co, p])
-                            T.writes(inverse_local[co, p, vh, vw])
-                            T.block_attr({"schedule_rule":"meta_schedule.winograd_inverse.nchw.cuda"})
-                            with T.init():
-                                inverse_local[co, p, vh, vw] = T.float32(0)
-                            inverse_local[co, p, vh, vw] = inverse_local[co, p, vh, vw] + bgemm[r_a_2, r_a_3, co, p] * T.Select(r_a_2 % 6 == 5 and vh % 4 == 3, T.float32(1), T.Select(r_a_2 % 6 == 5 and vh % 4 == 2, T.float32(0), T.Select(r_a_2 % 6 == 5 and vh % 4 == 1, T.float32(0), T.Select(r_a_2 % 6 == 5 and vh % 4 == 0, T.float32(0), T.Select(r_a_2 % 6 == 4 and vh % 4 == 3, T.float32(-8), T.Select(r_a_2 % 6 == 4 and vh % 4 == 2, T.float32(4), T.Select(r_a_2 % 6 == 4 and vh % 4 == 1, T.float32(-2), T.Select(r_a_2 % 6 == 4 and vh % 4 == 0, T.float32(1), T.Select(r_a_2 % 6 == 3 and vh % 4 == 3, T.float32(0.125), T.Select(r_a_2 % 6 == 3 and vh % 4 == 2, T.float32(0.25), T.Select(r_a_2 % 6 == 3 and vh % 4 == 1, T.float32(0.5), T.Select(r_a_2 % 6 == 3 and vh % 4 == 0, T.float32(1), T.Select(r_a_2 % 6 == 2 and vh % 4 == 3, T.float32(1), T.Select(r_a_2 % 6 == 2 and vh % 4 == 2, T.float32(1), T.Select(r_a_2 % 6 == 2 and vh % 4 == 1, T.float32(1), T.Select(r_a_2 % 6 == 2 and vh % 4 == 0, T.float32(1), T.Select(r_a_2 % 6 == 1 and vh % 4 == 3, T.float32(-1), T.Select(r_a_2 % 6 == 1 and vh % 4 == 2, T.float32(1), T.Select(r_a_2 % 6 == 1 and vh % 4 == 1, T.float32(-1), T.Select(r_a_2 % 6 == 1 and vh % 4 == 0, T.float32(1), T.Select(r_a_2 % 6 == 0 and vh % 4 == 3, T.float32(0), T.Select(r_a_2 % 6 == 0 and vh % 4 == 2, T.float32(0), T.Select(r_a_2 % 6 == 0 and vh % 4 == 1, T.float32(0), T.Select(r_a_2 % 6 == 0 and vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(r_a_3 % 6 == 5 and vw % 4 == 3, T.float32(1), T.Select(r_a_3 % 6 == 5 and vw % 4 == 2, T.float32(0), T.Select(r_a_3 % 6 == 5 and vw % 4 == 1, T.float32(0), T.Select(r_a_3 % 6 == 5 and vw % 4 == 0, T.float32(0), T.Select(r_a_3 % 6 == 4 and vw % 4 == 3, T.float32(-8), T.Select(r_a_3 % 6 == 4 and vw % 4 == 2, T.float32(4), T.Select(r_a_3 % 6 == 4 and vw % 4 == 1, T.float32(-2), T.Select(r_a_3 % 6 == 4 and vw % 4 == 0, T.float32(1), T.Select(r_a_3 % 6 == 3 and vw % 4 == 3, T.float32(0.125), T.Select(r_a_3 % 6 == 3 and vw % 4 == 2, T.float32(0.25), T.Select(r_a_3 % 6 == 3 and vw % 4 == 1, T.float32(0.5), T.Select(r_a_3 % 6 == 3 and vw % 4 == 0, T.float32(1), T.Select(r_a_3 % 6 == 2 and vw % 4 == 3, T.float32(1), T.Select(r_a_3 % 6 == 2 and vw % 4 == 2, T.float32(1), T.Select(r_a_3 % 6 == 2 and vw % 4 == 1, T.float32(1), T.Select(r_a_3 % 6 == 2 and vw % 4 == 0, T.float32(1), T.Select(r_a_3 % 6 == 1 and vw % 4 == 3, T.float32(-1), T.Select(r_a_3 % 6 == 1 and vw % 4 == 2, T.float32(1), T.Select(r_a_3 % 6 == 1 and vw % 4 == 1, T.float32(-1), T.Select(r_a_3 % 6 == 1 and vw % 4 == 0, T.float32(1), T.Select(r_a_3 % 6 == 0 and vw % 4 == 3, T.float32(0), T.Select(r_a_3 % 6 == 0 and vw % 4 == 2, T.float32(0), T.Select(r_a_3 % 6 == 0 and vw % 4 == 1, T.float32(0), T.Select(r_a_3 % 6 == 0 and vw % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
-                    for i2_1, i3_1 in T.grid(4, 4):
-                        with T.block("output"):
-                            n = T.axis.spatial(1, 0)
-                            co = T.axis.spatial(64, (i0_i1_i2_0_i3_0_fused_fused_0 * 32 + i0_i1_i2_0_i3_0_fused_fused_1) // 3136)
-                            h = T.axis.spatial(224, (i0_i1_i2_0_i3_0_fused_fused_0 * 32 + i0_i1_i2_0_i3_0_fused_fused_1) % 3136 // 56 * 4 + i2_1)
-                            w = T.axis.spatial(224, (i0_i1_i2_0_i3_0_fused_fused_0 * 32 + i0_i1_i2_0_i3_0_fused_fused_1) % 56 * 4 + i3_1)
-                            T.reads(inverse_local[co, n * 3136 + h // 4 * 56 + w // 4, h % 4, w % 4])
-                            T.writes(output[n, co, h, w])
-                            T.block_attr({"schedule_rule":"meta_schedule.winograd_output.nchw.cuda", "winograd_tile_size":4})
-                            output[n, co, h, w] = inverse_local[co, n * 3136 + h // 4 * 56 + w // 4, h % 4, w % 4]
-    # fmt: on
-    decision_0 = [
-        ("SamplePerfectTile", [448, 7]),
-        ("SampleCategorical", 1),
-        ("SampleCategorical", 0),
-        ("SamplePerfectTile", [1, 3, 2, 1, 1]),
-        ("SamplePerfectTile", [6, 1, 1, 1, 1]),
-        ("SamplePerfectTile", [4, 4, 2, 1, 2]),
-        ("SamplePerfectTile", [4, 1, 56, 14, 1]),
-        ("SamplePerfectTile", [32, 2, 1]),
-        ("SampleCategorical", 0),
-        ("SampleCategorical", 2),
-        ("SampleCategorical", 4),
-    ]
-    mod = _conv2d_winograd_nchw()
-    actual = _design_space(mod)
-    check_sketches(
-        mod,
-        sketches=actual,
-        expected_mods=[winograd_nchw_conv2d],
-        expected_decisions=[decision_0],
-    )
-
-
 if __name__ == "__main__":
     test_cuda_c1d()
     test_cuda_c2d()
@@ -1431,4 +1263,3 @@ def winograd_nchw_conv2d(data: T.Buffer[(1, 64, 224, 224), "float32"], kernel: T
     test_cuda_sfm()
     test_cuda_cbr()
     test_cuda_tbg()
-    test_cuda_winograd_nchw_conv2d()
diff --git a/tests/python/unittest/test_meta_schedule_space_cuda_winograd.py b/tests/python/unittest/test_meta_schedule_space_cuda_winograd.py
new file mode 100644
index 000000000000..16f9e64252ad
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_space_cuda_winograd.py
@@ -0,0 +1,355 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for MetaSchedule search space on CUDA"""
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.testing.space_generation import (
+    check_sketches,
+    generate_design_space,
+    print_sketches,
+)
+from tvm.meta_schedule.testing.te_workload import create_te_workload
+from tvm.script import tir as T
+from tvm.target import Target
+
+
+def _target():
+    return Target("nvidia/geforce-rtx-3070")
+
+
+def _design_space(mod):
+    return generate_design_space(
+        kind="cuda",
+        mod=mod,
+        target=_target(),
+        types=ms.ScheduleRule,
+    )
+
+
+def test_cuda_nhwc():
+    # fmt: off
+    @T.prim_func
+    def cuda_nhwc_0(data: T.Buffer[(1, 14, 14, 128), "float32"], weight: T.Buffer[(6, 6, 128, 128), "float32"], conv2d_winograd: T.Buffer[(1, 12, 12, 128), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
+        # body
+        with T.block("root"):
+            T.reads()
+            T.writes()
+            T.block_attr({"meta_schedule.unroll_explicit":16})
+            input_tile_local = T.alloc_buffer([6, 6, 9, 128], dtype="float32", scope="local")
+            data_pack = T.alloc_buffer([6, 6, 9, 128], dtype="float32")
+            bgemm = T.alloc_buffer([6, 6, 9, 128], dtype="float32")
+            inverse = T.alloc_buffer([4, 4, 9, 128], dtype="float32")
+            data_pack_local = T.alloc_buffer([6, 6, 9, 128], dtype="float32", scope="local")
+            bgemm_local = T.alloc_buffer([6, 6, 9, 128], dtype="float32", scope="local")
+            data_pack_shared = T.alloc_buffer([6, 6, 9, 128], dtype="float32", scope="shared")
+            weight_shared = T.alloc_buffer([6, 6, 128, 128], dtype="float32", scope="shared")
+            for i2_0_i3_0_i2_1_i3_1_fused_0 in T.thread_binding(2, thread="blockIdx.x"):
+                for i2_0_i3_0_i2_1_i3_1_fused_1 in T.thread_binding(1024, thread="threadIdx.x"):
+                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1):
+                        with T.block("input_tile"):
+                            T.where(i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1 < 1152)
+                            eps, nu = T.axis.remap("SS", [ax0, ax1])
+                            p = T.axis.spatial(9, (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) // 384 * 3 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) % 24 // 8 + ax2)
+                            ci = T.axis.spatial(128, (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) % 384 // 24 * 8 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) % 8 + ax3)
+                            T.reads(data[p // 9, p % 9 // 3 * 4 + eps, p % 3 * 4 + nu, ci])
+                            T.writes(input_tile_local[eps, nu, p, ci])
+                            T.block_attr({"schedule_rule":"None"})
+                            input_tile_local[eps, nu, p, ci] = T.if_then_else(0 <= p % 9 // 3 * 4 + eps and p % 9 // 3 * 4 + eps < 14 and 0 <= p % 3 * 4 + nu and p % 3 * 4 + nu < 14, data[p // 9, p % 9 // 3 * 4 + eps, p % 3 * 4 + nu, ci], T.float32(0), dtype="float32")
+                    for i0 in T.unroll(6):
+                        for i1 in T.unroll(6):
+                            for i4 in T.unroll(6):
+                                for i5 in T.unroll(6):
+                                    with T.block("data_pack"):
+                                        T.where(i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1 < 1152)
+                                        eps, nu = T.axis.remap("SS", [i0, i1])
+                                        p = T.axis.spatial(9, (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) // 384 * 3 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) % 24 // 8)
+                                        ci = T.axis.spatial(128, (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) % 384 // 24 * 8 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) % 8)
+                                        r_a, r_b = T.axis.remap("RR", [i4, i5])
+                                        T.reads(input_tile_local[r_a, r_b, p, ci])
+                                        T.writes(data_pack_local[eps, nu, p, ci])
+                                        T.block_attr({"schedule_rule":"conv2d_nhwc_winograd_data_pack"})
+                                        with T.init():
+                                            data_pack_local[eps, nu, p, ci] = T.float32(0)
+                                        data_pack_local[eps, nu, p, ci] = data_pack_local[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_b % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_b % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_b % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_b % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_b % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_b % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_b % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_b % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_b % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_b % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_b % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_b % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_b % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_b % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
+                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1):
+                        with T.block("data_pack_local"):
+                            T.where(i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1 < 1152)
+                            v0, v1 = T.axis.remap("SS", [ax0, ax1])
+                            v2 = T.axis.spatial(9, (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) // 384 * 3 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) % 24 // 8 + ax2)
+                            v3 = T.axis.spatial(128, (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) % 384 // 24 * 8 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 1024 + i2_0_i3_0_i2_1_i3_1_fused_1) % 8 + ax3)
+                            T.reads(data_pack_local[v0, v1, v2, v3])
+                            T.writes(data_pack[v0, v1, v2, v3])
+                            data_pack[v0, v1, v2, v3] = data_pack_local[v0, v1, v2, v3]
+            for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(96, thread="blockIdx.x"):
+                for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"):
+                    for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(27, thread="threadIdx.x"):
+                        for i4_0 in T.serial(8):
+                            for ax0_ax1_ax2_ax3_fused in T.serial(1728):
+                                with T.block("data_pack_shared"):
+                                    v0 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 32 * 2 + ax0_ax1_ax2_ax3_fused // 864)
+                                    v1 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused % 864 // 144)
+                                    v2 = T.axis.spatial(9, ax0_ax1_ax2_ax3_fused % 144 // 16)
+                                    v3 = T.axis.spatial(128, i4_0 * 16 + ax0_ax1_ax2_ax3_fused % 16)
+                                    T.reads(data_pack[v0, v1, v2, v3])
+                                    T.writes(data_pack_shared[v0, v1, v2, v3])
+                                    T.block_attr({"meta_schedule.cooperative_fetch":1})
+                                    data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3]
+                            for ax0_ax1_ax2_ax3_fused in T.serial(768):
+                                with T.block("weight_shared"):
+                                    v0 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 32 * 2 + ax0_ax1_ax2_ax3_fused // 384)
+                                    v1 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused % 384 // 64)
+                                    v2 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 32 * 4 + ax0_ax1_ax2_ax3_fused % 64 // 16)
+                                    v3 = T.axis.spatial(128, i4_0 * 16 + ax0_ax1_ax2_ax3_fused % 16)
+                                    T.reads(weight[v0, v1, v2, v3])
+                                    T.writes(weight_shared[v0, v1, v2, v3])
+                                    T.block_attr({"meta_schedule.cooperative_fetch":3})
+                                    weight_shared[v0, v1, v2, v3] = weight[v0, v1, v2, v3]
+                            for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 2, 1, 1, 2, 16, 1, 1, 1, 1):
+                                with T.block("bgemm"):
+                                    eps = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 32 * 2 + i0_3 + i0_4)
+                                    nu = T.axis.spatial(6, i1_3 + i1_4 + i0_1_i1_1_i2_1_i3_1_fused // 2 * 3 + i0_2_i1_2_i2_2_i3_2_fused // 9)
+                                    p = T.axis.spatial(9, i0_2_i1_2_i2_2_i3_2_fused % 9 + i2_3 + i2_4)
+                                    co = T.axis.spatial(128, i3_4 + i0_0_i1_0_i2_0_i3_0_fused % 32 * 4 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 2 + i3_3)
+                                    ci = T.axis.reduce(128, i4_0 * 16 + i4_1 * 16 + i4_2)
+                                    T.reads(data_pack_shared[eps, nu, p, ci], weight_shared[eps, nu, co, ci])
+                                    T.writes(bgemm_local[eps, nu, p, co])
+                                    T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "meta_schedule.write_cache_level":[3]})
+                                    with T.init():
+                                        bgemm_local[eps, nu, p, co] = T.float32(0)
+                                    bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * weight_shared[eps, nu, co, ci]
+                        for ax0, ax1, ax2, ax3 in T.grid(2, 1, 1, 2):
+                            with T.block("bgemm_local"):
+                                v0 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 32 * 2 + ax0)
+                                v1 = T.axis.spatial(6, i0_1_i1_1_i2_1_i3_1_fused // 2 * 3 + i0_2_i1_2_i2_2_i3_2_fused // 9 + ax1)
+                                v2 = T.axis.spatial(9, i0_2_i1_2_i2_2_i3_2_fused % 9 + ax2)
+                                v3 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 32 * 4 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 2 + ax3)
+                                T.reads(bgemm_local[v0, v1, v2, v3])
+                                T.writes(bgemm[v0, v1, v2, v3])
+                                bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3]
+            for i2_0_i3_0_i2_1_i3_1_fused_0 in T.thread_binding(18, thread="blockIdx.x"):
+                for i2_0_i3_0_i2_1_i3_1_fused_1 in T.thread_binding(64, thread="threadIdx.x"):
+                    for i0 in T.unroll(4):
+                        for i1 in T.unroll(4):
+                            for i4 in T.unroll(6):
+                                for i5 in T.unroll(6):
+                                    with T.block("inverse"):
+                                        vh, vw = T.axis.remap("SS", [i0, i1])
+                                        p = T.axis.spatial(9, (i2_0_i3_0_i2_1_i3_1_fused_0 * 64 + i2_0_i3_0_i2_1_i3_1_fused_1) // 384 * 3 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 64 + i2_0_i3_0_i2_1_i3_1_fused_1) % 24 // 8)
+                                        co = T.axis.spatial(128, (i2_0_i3_0_i2_1_i3_1_fused_0 * 64 + i2_0_i3_0_i2_1_i3_1_fused_1) % 384 // 24 * 8 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 64 + i2_0_i3_0_i2_1_i3_1_fused_1) % 8)
+                                        r_a, r_b = T.axis.remap("RR", [i4, i5])
+                                        T.reads(bgemm[r_a, r_b, p, co])
+                                        T.writes(inverse[vh, vw, p, co])
+                                        T.block_attr({"schedule_rule":"conv2d_nhwc_winograd_inverse"})
+                                        with T.init():
+                                            inverse[vh, vw, p, co] = T.float32(0)
+                                        inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 6 == 5 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 5 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 0, T.float32(0), T.Select(r_a % 6 == 4 and vh % 4 == 3, T.float32(-8), T.Select(r_a % 6 == 4 and vh % 4 == 2, T.float32(4), T.Select(r_a % 6 == 4 and vh % 4 == 1, T.float32(-2), T.Select(r_a % 6 == 4 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 3 and vh % 4 == 3, T.float32(0.125), T.Select(r_a % 6 == 3 and vh % 4 == 2, T.float32(0.25), T.Select(r_a % 6 == 3 and vh % 4 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 1, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 3, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 1, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 0 and vh % 4 == 3, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 5 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 0, T.float32(0), T.Select(r_b % 6 == 4 and vw % 4 == 3, T.float32(-8), T.Select(r_b % 6 == 4 and vw % 4 == 2, T.float32(4), T.Select(r_b % 6 == 4 and vw % 4 == 1, T.float32(-2), T.Select(r_b % 6 == 4 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 3 and vw % 4 == 3, T.float32(0.125), T.Select(r_b % 6 == 3 and vw % 4 == 2, T.float32(0.25), T.Select(r_b % 6 == 3 and vw % 4 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 1, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 3, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 1, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 0 and vw % 4 == 3, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
+            for i0_i1_i2_i3_fused_0 in T.thread_binding(144, thread="blockIdx.x"):
+                for i0_i1_i2_i3_fused_1 in T.thread_binding(128, thread="threadIdx.x"):
+                    with T.block("conv2d_winograd"):
+                        n = T.axis.spatial(1, 0)
+                        h = T.axis.spatial(12, (i0_i1_i2_i3_fused_0 * 128 + i0_i1_i2_i3_fused_1) // 1536)
+                        w = T.axis.spatial(12, (i0_i1_i2_i3_fused_0 * 128 + i0_i1_i2_i3_fused_1) % 1536 // 128)
+                        co = T.axis.spatial(128, (i0_i1_i2_i3_fused_0 * 128 + i0_i1_i2_i3_fused_1) % 128)
+                        T.reads(inverse[h % 4, w % 4, n * 9 + h // 4 * 3 + w // 4, co])
+                        T.writes(conv2d_winograd[n, h, w, co])
+                        conv2d_winograd[n, h, w, co] = inverse[h % 4, w % 4, n * 9 + h // 4 * 3 + w // 4, co]
+    # fmt: on
+    decision_0 = [
+        ("SamplePerfectTile", [3, 3]),
+        ("SamplePerfectTile", [16, 8]),
+        ("SampleCategorical", 1),
+        ("SamplePerfectTile", [3, 3]),
+        ("SamplePerfectTile", [16, 8]),
+        ("SampleCategorical", 5),
+        ("SamplePerfectTile", [3, 1, 1, 2, 1]),
+        ("SamplePerfectTile", [1, 2, 3, 1, 1]),
+        ("SamplePerfectTile", [1, 1, 9, 1, 1]),
+        ("SamplePerfectTile", [32, 2, 1, 2, 1]),
+        ("SamplePerfectTile", [8, 1, 16]),
+        ("SampleCategorical", 0),
+        ("SampleCategorical", 2),
+        ("SampleCategorical", 1),
+        ("SampleCategorical", 2),
+    ]
+    with _target():
+        mod = create_te_workload("C2D_WIN_NHWC", 0)
+    actual = _design_space(mod)
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[cuda_nhwc_0],
+        expected_decisions=[decision_0],
+    )
+
+
+def test_cuda_nchw():
+    # fmt: off
+    @T.prim_func
+    def cuda_nchw_0(data: T.Buffer[(1, 64, 56, 56), "float32"], weight: T.Buffer[(6, 6, 64, 64), "float32"], conv2d_winograd: T.Buffer[(1, 64, 56, 56), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
+        # body
+        with T.block("root"):
+            T.reads()
+            T.writes()
+            T.block_attr({"meta_schedule.unroll_explicit":16})
+            input_tile_local = T.alloc_buffer([64, 196, 6, 6], dtype="float32", scope="local")
+            data_pack = T.alloc_buffer([6, 6, 64, 196], dtype="float32")
+            bgemm = T.alloc_buffer([6, 6, 64, 196], dtype="float32")
+            inverse_local = T.alloc_buffer([64, 196, 4, 4], dtype="float32", scope="local")
+            data_pack_local = T.alloc_buffer([6, 6, 64, 196], dtype="float32", scope="local")
+            bgemm_local = T.alloc_buffer([6, 6, 64, 196], dtype="float32", scope="local")
+            data_pack_shared = T.alloc_buffer([6, 6, 64, 196], dtype="float32", scope="shared")
+            weight_shared = T.alloc_buffer([6, 6, 64, 64], dtype="float32", scope="shared")
+            for i2_i3_fused_0 in T.thread_binding(25, thread="blockIdx.x"):
+                for i2_i3_fused_1 in T.thread_binding(512, thread="threadIdx.x"):
+                    for ax0, ax1, ax2, ax3 in T.grid(1, 1, 6, 6):
+                        with T.block("input_tile"):
+                            T.where(i2_i3_fused_0 * 512 + i2_i3_fused_1 < 12544)
+                            ci = T.axis.spatial(64, (i2_i3_fused_0 * 512 + i2_i3_fused_1) // 196 + ax0)
+                            p = T.axis.spatial(196, (i2_i3_fused_0 * 120 + i2_i3_fused_1) % 196 + ax1)
+                            eps, nu = T.axis.remap("SS", [ax2, ax3])
+                            T.reads(data[p // 196, ci, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1])
+                            T.writes(input_tile_local[ci, p, eps, nu])
+                            T.block_attr({"schedule_rule":"None"})
+                            input_tile_local[ci, p, eps, nu] = T.if_then_else(1 <= p % 196 // 14 * 4 + eps and p % 196 // 14 * 4 + eps < 57 and 1 <= p % 14 * 4 + nu and p % 14 * 4 + nu < 57, data[p // 196, ci, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1], T.float32(0), dtype="float32")
+                    for i0 in T.unroll(6):
+                        for i1 in T.unroll(6):
+                            for i4 in T.unroll(6):
+                                for i5 in T.unroll(6):
+                                    with T.block("data_pack"):
+                                        T.where(i2_i3_fused_0 * 512 + i2_i3_fused_1 < 12544)
+                                        eps, nu = T.axis.remap("SS", [i0, i1])
+                                        ci = T.axis.spatial(64, (i2_i3_fused_0 * 512 + i2_i3_fused_1) // 196)
+                                        p = T.axis.spatial(196, (i2_i3_fused_0 * 512 + i2_i3_fused_1) % 196)
+                                        r_a, r_b = T.axis.remap("RR", [i4, i5])
+                                        T.reads(input_tile_local[ci, p, r_a, r_b])
+                                        T.writes(data_pack_local[eps, nu, ci, p])
+                                        T.block_attr({"schedule_rule":"conv2d_nchw_winograd_data_pack"})
+                                        with T.init():
+                                            data_pack_local[eps, nu, ci, p] = T.float32(0)
+                                        data_pack_local[eps, nu, ci, p] = data_pack_local[eps, nu, ci, p] + input_tile_local[ci, p, r_a, r_b] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_b % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_b % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_b % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_b % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_b % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_b % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_b % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_b % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_b % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_b % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_b % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_b % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_b % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_b % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
+                    for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1):
+                        with T.block("data_pack_local"):
+                            T.where(i2_i3_fused_0 * 512 + i2_i3_fused_1 < 12544)
+                            v0, v1 = T.axis.remap("SS", [ax0, ax1])
+                            v2 = T.axis.spatial(64, (i2_i3_fused_0 * 512 + i2_i3_fused_1) // 196 + ax2)
+                            v3 = T.axis.spatial(196, (i2_i3_fused_0 * 120 + i2_i3_fused_1) % 196 + ax3)
+                            T.reads(data_pack_local[v0, v1, v2, v3])
+                            T.writes(data_pack[v0, v1, v2, v3])
+                            data_pack[v0, v1, v2, v3] = data_pack_local[v0, v1, v2, v3]
+            for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(14, thread="blockIdx.x"):
+                for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(224, thread="vthread.x"):
+                    for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(2, thread="threadIdx.x"):
+                        for i4_0 in T.serial(2):
+                            for ax0_ax1_ax2_ax3_fused in T.serial(32256):
+                                with T.block("data_pack_shared"):
+                                    v0 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused // 5376)
+                                    v1 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused % 5376 // 896)
+                                    v2 = T.axis.spatial(64, i4_0 * 32 + ax0_ax1_ax2_ax3_fused % 896 // 28)
+                                    v3 = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 7 * 28 + ax0_ax1_ax2_ax3_fused % 28)
+                                    T.reads(data_pack[v0, v1, v2, v3])
+                                    T.writes(data_pack_shared[v0, v1, v2, v3])
+                                    T.block_attr({"meta_schedule.cooperative_fetch":4})
+                                    data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3]
+                            for ax0_ax1_ax2_ax3_fused in T.serial(36864):
+                                with T.block("weight_shared"):
+                                    v0 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused // 6144)
+                                    v1 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused % 6144 // 1024)
+                                    v2 = T.axis.spatial(64, i4_0 * 32 + ax0_ax1_ax2_ax3_fused % 1024 // 32)
+                                    v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused // 7 * 32 + ax0_ax1_ax2_ax3_fused % 32)
+                                    T.reads(weight[v0, v1, v2, v3])
+                                    T.writes(weight_shared[v0, v1, v2, v3])
+                                    T.block_attr({"meta_schedule.cooperative_fetch":3})
+                                    weight_shared[v0, v1, v2, v3] = weight[v0, v1, v2, v3]
+                            for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(16, 2, 3, 1, 4, 2, 3, 1, 1, 1):
+                                with T.block("bgemm"):
+                                    eps = T.axis.spatial(6, i0_3 * 3 + i0_4)
+                                    nu = T.axis.spatial(6, i1_4 + i0_1_i1_1_i2_1_i3_1_fused // 112 * 3 + i1_3)
+                                    co = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused // 7 * 32 + i0_1_i1_1_i2_1_i3_1_fused % 112 // 7 * 2 + i0_2_i1_2_i2_2_i3_2_fused + i2_3 + i2_4)
+                                    p = T.axis.spatial(196, i3_4 + i0_0_i1_0_i2_0_i3_0_fused % 7 * 28 + i0_1_i1_1_i2_1_i3_1_fused % 7 * 4 + i3_3)
+                                    ci = T.axis.reduce(64, i4_0 * 32 + i4_1 * 2 + i4_2)
+                                    T.reads(data_pack_shared[eps, nu, ci, p], weight_shared[eps, nu, ci, co])
+                                    T.writes(bgemm_local[eps, nu, co, p])
+                                    T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"})
+                                    with T.init():
+                                        bgemm_local[eps, nu, co, p] = T.float32(0)
+                                    bgemm_local[eps, nu, co, p] = bgemm_local[eps, nu, co, p] + data_pack_shared[eps, nu, ci, p] * weight_shared[eps, nu, ci, co]
+                        for ax0, ax1, ax2, ax3 in T.grid(6, 3, 1, 4):
+                            with T.block("bgemm_local"):
+                                v0 = T.axis.spatial(6, ax0)
+                                v1 = T.axis.spatial(6, i0_1_i1_1_i2_1_i3_1_fused // 112 * 3 + ax1)
+                                v2 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused // 7 * 32 + i0_1_i1_1_i2_1_i3_1_fused % 112 // 7 * 2 + i0_2_i1_2_i2_2_i3_2_fused + ax2)
+                                v3 = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 7 * 28 + i0_1_i1_1_i2_1_i3_1_fused % 7 * 4 + ax3)
+                                T.reads(bgemm_local[v0, v1, v2, v3])
+                                T.writes(bgemm[v0, v1, v2, v3])
+                                bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3]
+            for i0_i1_i2_0_i3_0_fused_0 in T.thread_binding(196, thread="blockIdx.x"):
+                for i0_i1_i2_0_i3_0_fused_1 in T.thread_binding(64, thread="threadIdx.x"):
+                    for ax0, ax1 in T.grid(1, 1):
+                        for ax2 in T.unroll(4):
+                            for ax3 in T.unroll(4):
+                                for ax4 in T.unroll(6):
+                                    for ax5 in T.unroll(6):
+                                        with T.block("inverse"):
+                                            co = T.axis.spatial(64, (i0_i1_i2_0_i3_0_fused_0 * 64 + i0_i1_i2_0_i3_0_fused_1) // 196 + ax0)
+                                            p = T.axis.spatial(196, (i0_i1_i2_0_i3_0_fused_0 * 64 + i0_i1_i2_0_i3_0_fused_1) % 196 // 14 * 14 + (i0_i1_i2_0_i3_0_fused_0 * 64 + i0_i1_i2_0_i3_0_fused_1) % 14 + ax1)
+                                            vh, vw, r_a, r_b = T.axis.remap("SSRR", [ax2, ax3, ax4, ax5])
+                                            T.reads(bgemm[r_a, r_b, co, p])
+                                            T.writes(inverse_local[co, p, vh, vw])
+                                            T.block_attr({"schedule_rule":"conv2d_nchw_winograd_inverse"})
+                                            with T.init():
+                                                inverse_local[co, p, vh, vw] = T.float32(0)
+                                            inverse_local[co, p, vh, vw] = inverse_local[co, p, vh, vw] + bgemm[r_a, r_b, co, p] * T.Select(r_a % 6 == 5 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 5 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 0, T.float32(0), T.Select(r_a % 6 == 4 and vh % 4 == 3, T.float32(-8), T.Select(r_a % 6 == 4 and vh % 4 == 2, T.float32(4), T.Select(r_a % 6 == 4 and vh % 4 == 1, T.float32(-2), T.Select(r_a % 6 == 4 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 3 and vh % 4 == 3, T.float32(0.125), T.Select(r_a % 6 == 3 and vh % 4 == 2, T.float32(0.25), T.Select(r_a % 6 == 3 and vh % 4 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 1, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 3, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 1, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 0 and vh % 4 == 3, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 5 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 0, T.float32(0), T.Select(r_b % 6 == 4 and vw % 4 == 3, T.float32(-8), T.Select(r_b % 6 == 4 and vw % 4 == 2, T.float32(4), T.Select(r_b % 6 == 4 and vw % 4 == 1, T.float32(-2), T.Select(r_b % 6 == 4 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 3 and vw % 4 == 3, T.float32(0.125), T.Select(r_b % 6 == 3 and vw % 4 == 2, T.float32(0.25), T.Select(r_b % 6 == 3 and vw % 4 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 1, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 3, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 1, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 0 and vw % 4 == 3, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
+                    for i2_1, i3_1 in T.grid(4, 4):
+                        with T.block("conv2d_winograd"):
+                            n = T.axis.spatial(1, 0)
+                            co = T.axis.spatial(64, (i0_i1_i2_0_i3_0_fused_0 * 64 + i0_i1_i2_0_i3_0_fused_1) // 196)
+                            h = T.axis.spatial(56, (i0_i1_i2_0_i3_0_fused_0 * 64 + i0_i1_i2_0_i3_0_fused_1) % 196 // 14 * 4 + i2_1)
+                            w = T.axis.spatial(56, (i0_i1_i2_0_i3_0_fused_0 * 64 + i0_i1_i2_0_i3_0_fused_1) % 14 * 4 + i3_1)
+                            T.reads(inverse_local[co, n * 196 + h // 4 * 14 + w // 4, h % 4, w % 4])
+                            T.writes(conv2d_winograd[n, co, h, w])
+                            conv2d_winograd[n, co, h, w] = inverse_local[co, n * 196 + h // 4 * 14 + w // 4, h % 4, w % 4]
+    # fmt: on
+    decision_0 = [
+        ("SampleCategorical", 4),
+        ("SamplePerfectTile", [1, 1, 1, 2, 3]),
+        ("SamplePerfectTile", [1, 2, 1, 3, 1]),
+        ("SamplePerfectTile", [2, 16, 2, 1, 1]),
+        ("SamplePerfectTile", [7, 7, 1, 4, 1]),
+        ("SamplePerfectTile", [2, 16, 2]),
+        ("SampleCategorical", 3),
+        ("SampleCategorical", 2),
+        ("SampleCategorical", 1),
+        ("SampleCategorical", 1),
+    ]
+    with _target():
+        mod = create_te_workload("C2D_WIN_NCHW", 0)
+    actual = _design_space(mod)
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[cuda_nchw_0],
+        expected_decisions=[decision_0],
+        debug_mask=0,
+    )
+
+
+if __name__ == "__main__":
+    test_cuda_nhwc()
+    test_cuda_nchw()
diff --git a/tests/python/unittest/test_meta_schedule_vnni_integration.py b/tests/python/unittest/test_meta_schedule_vnni_integration.py
index 1f91dc593143..3bbe916472f5 100644
--- a/tests/python/unittest/test_meta_schedule_vnni_integration.py
+++ b/tests/python/unittest/test_meta_schedule_vnni_integration.py
@@ -20,8 +20,8 @@
 from typing import Optional
 
 import numpy as np  # type: ignore
-import pytest
 import tvm
+import tvm.testing
 from tvm import meta_schedule as ms
 from tvm import relay
 from tvm._ffi import register_func
@@ -176,29 +176,29 @@ def test_vnni_schedule_fn_tune():
 
     C = te.compute(
         ...
-        attrs={"schedule_rule": "meta_schedule.dense_vnni"},
+        attrs={"schedule_rule": "meta_schedule.x86.dense_vnni"},
     )
 
     When the MetaSchedule encounters a TensorIR block with the "schedule_rule" annotation,
     it looks up the packed func registry for a function that is associated with the given schedule
-    rule key ("meta_schedule.dense_vnni" in this example). The signature of such custom schedule
-    functions must be
+    rule key ("meta_schedule.x86.dense_vnni" in this example). The signature of such custom
+    schedule functions must be
 
        (tir.schedule.Schedule, tir.schedule.BlockRV) -> [tir.schedule.Schedule].
 
     The BlockRV argument corresponds to the TE compute annotated with "schedule_rule".
 
-    The relevant code is in meta_schedule/space_generator/post_order_apply.cc.
+    The relevant code is in `src/meta_schedule/space_generator/apply_custom_rule.cc`.
     """
 
     def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV):
         _schedule_dense(m=None, do_tune=True)(sch, dense_block)
         return [sch]
 
-    register_func("meta_schedule.dense_vnni", schedule_rule_dense_vnni)
+    register_func("meta_schedule.x86.dense_vnni", schedule_rule_dense_vnni)
 
     m, n, k = 1024, 1024, 1024
-    target = tvm.target.Target("llvm -mcpu=cascadelake -num-cores 4")
+    target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=cascadelake -num-cores=4")
     dev = tvm.cpu(0)
     relay_mod, params, f_check = _relay_dense(m, n, k)
 
diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py
index 4c216cdbc53a..b59880758e5d 100644
--- a/tests/python/unittest/test_te_create_primfunc.py
+++ b/tests/python/unittest/test_te_create_primfunc.py
@@ -385,14 +385,12 @@ def expected_layout_attr(
     for i0, i1, i2 in T.grid(128, 128, 128):
         with T.block("C"):
             x, y, k = T.axis.remap("SSR", [i0, i1, i2])
-            T.block_attr({"layout_free_placeholders": []})
             with T.init():
                 C[x, y] = T.float32(0)
             C[x, y] = C[x, y] + A[x, k] * B[y, k]
     for i0, i1 in T.grid(128, 128):
         with T.block("D"):
             x, y = T.axis.remap("SS", [i0, i1])
-            T.block_attr({"layout_free_placeholders": [C]})
             D[x, y] = C[x, y] + T.float32(1)
 
 
diff --git a/tests/python/unittest/test_tir_analysis_stmt_finding.py b/tests/python/unittest/test_tir_analysis_stmt_finding.py
index 791699e4e4ed..acb5faa0de12 100644
--- a/tests/python/unittest/test_tir_analysis_stmt_finding.py
+++ b/tests/python/unittest/test_tir_analysis_stmt_finding.py
@@ -15,11 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 import pytest
-
 import tvm
-from tvm.tir.analysis import find_anchor_block
 from tvm import te, topi
-from tvm.meta_schedule.testing.te_workload import matmul, conv2d_winograd_nhwc
+from tvm.meta_schedule.testing.te_workload import conv2d_winograd_nhwc, matmul
+from tvm.tir.analysis import find_anchor_block
 
 
 def test_matmul_add():
@@ -35,7 +34,7 @@ def test_matmul_add():
 
 def test_winograd():
     mod = tvm.IRModule()
-    mod["main"] = te.create_prim_func(conv2d_winograd_nhwc(1, 56, 56, 64, 64, 3))
+    mod["main"] = te.create_prim_func(conv2d_winograd_nhwc(1, 14, 14, 128, 128, 6))
 
     block = find_anchor_block(mod)
 

From 904ae7748f2c1a6270fd2ab1337c8c7eb4ceae1b Mon Sep 17 00:00:00 2001
From: Wubin <wubin.wu@imgtec.com>
Date: Tue, 8 Nov 2022 00:26:53 +0000
Subject: [PATCH 523/704] =?UTF-8?q?[Frontend][PaddlePaddle]=20Add=20test?=
 =?UTF-8?q?=20case=20for=20interpolate=20op=20convert=20func=E2=80=A6=20(#?=
 =?UTF-8?q?13277)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add test case for interpolate op convert function #13247
---
 tests/python/frontend/paddlepaddle/test_forward.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/python/frontend/paddlepaddle/test_forward.py b/tests/python/frontend/paddlepaddle/test_forward.py
index 8b696404e2b0..ba983eb0878e 100644
--- a/tests/python/frontend/paddlepaddle/test_forward.py
+++ b/tests/python/frontend/paddlepaddle/test_forward.py
@@ -919,6 +919,7 @@ def __init__(
             use_scale=False,
             use_list=False,
             use_const=False,
+            use_scaler=False,
         ):
             super(Interpolate, self).__init__()
             self.mode = mode
@@ -928,6 +929,7 @@ def __init__(
             self.use_scale = use_scale
             self.use_list = use_list
             self.use_const = use_const
+            self.use_scaler = use_scaler
 
         @paddle.jit.to_static
         def forward(self, x):
@@ -939,9 +941,13 @@ def forward(self, x):
             elif not self.use_const:
                 size0 = paddle.to_tensor(size[0:1])
                 size = [size0, int(size[1])]
-            else:
+            elif not self.use_scaler:
                 size = size.tolist()
                 scale = scale.tolist()
+            else:
+                size = list(size)
+                h, w = paddle.rand(size).shape  # add decrease_axis
+                size = [h, w]
             if not self.use_scale:
                 return paddle.nn.functional.interpolate(
                     x,
@@ -965,6 +971,7 @@ def forward(self, x):
     verify_model(Interpolate(), input_data)
     verify_model(Interpolate(use_list=True), input_data)
     verify_model(Interpolate(use_scale=True, use_const=True), input_data)
+    verify_model(Interpolate(use_const=True, use_scaler=True), input_data)
     verify_model(Interpolate("bilinear", use_scale=True), input_data)
     verify_model(Interpolate("bilinear", use_scale=True, align_corners=True), input_data)
     verify_model(

From f8691180e044545bbd9582dfbc3d363e454029ad Mon Sep 17 00:00:00 2001
From: "Xiangxi Guo (Ryan)" <ryan.guo99@gmail.com>
Date: Mon, 7 Nov 2022 18:27:31 -0600
Subject: [PATCH 524/704] [BugFix][Driver] Correctly propogate simple-mode flag
 in LowerSchedule (#13311)

Currently one version of `tvm::LowerSchedule` doesn't pass along the input `simple_mode` flag, which causes it to default back to `false`. This commit fixes it by passing along the input flag.
---
 src/driver/driver_api.cc | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index 2b9a354f5c7e..bb4990e3e502 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -54,9 +54,6 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool);
 
-using runtime::PackedFunc;
-using runtime::TVMArgs;
-using runtime::TVMRetValue;
 using tvm::Array;
 using tvm::transform::Pass;
 
@@ -365,7 +362,7 @@ IRModule LowerSchedule(te::Schedule sch, const Array<te::Tensor>& args, const st
   for (ObjectRef x : args) {
     ref_args.push_back(x);
   }
-  return LowerSchedule(std::move(sch), ref_args, name, binds, global_var_supply);
+  return LowerSchedule(std::move(sch), ref_args, name, binds, global_var_supply, simple_mode);
 }
 
 IRModule LowerSchedule(te::Schedule sch, const Array<ObjectRef>& args, const std::string& name,

From e43841d2efec6eedaace0c9a53cd63c607b93c36 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Mon, 7 Nov 2022 21:02:07 -0600
Subject: [PATCH 525/704] [microTVM] Fix RPC session close on runtime side
 (#13310)

Currently, the RPC session on C/C++ side does not know if the session
was closed on Python side which causes extra read/write on transport
while the session is already closed. This commit reuses the Hexagon
approach in microTVM to shutdown the RPC session.
---
 python/tvm/micro/project_api/client.py | 6 +++---
 python/tvm/micro/session.py            | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/tvm/micro/project_api/client.py b/python/tvm/micro/project_api/client.py
index f1eb115cfbbe..c9f889e9b6dd 100644
--- a/python/tvm/micro/project_api/client.py
+++ b/python/tvm/micro/project_api/client.py
@@ -85,17 +85,17 @@ def __init__(
 
     @property
     def is_shutdown(self):
-        return self.read_file is None
+        return self.read_file.closed
 
     def shutdown(self):
-        if self.is_shutdown:
+        if self.is_shutdown:  # pylint: disable=using-constant-test
             return
 
         self.read_file.close()
         self.write_file.close()
 
     def _request_reply(self, method, params):
-        if self.is_shutdown:
+        if self.is_shutdown:  # pylint: disable=using-constant-test
             raise ConnectionShutdownError("connection already closed")
 
         request = {
diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py
index 8a51f1082dda..7d01baa75289 100644
--- a/python/tvm/micro/session.py
+++ b/python/tvm/micro/session.py
@@ -157,6 +157,8 @@ def __exit__(self, exc_type, exc_value, exc_traceback):
         if not self._exit_called:
             self._exit_called = True
             self.transport.__exit__(exc_type, exc_value, exc_traceback)
+            shutdown_func = self._rpc._sess.get_function("CloseRPCConnection")
+            shutdown_func()
 
     def _cleanup(self):
         self.__exit__(None, None, None)

From b807613c794321bf4fa765df026b1c3b454811b3 Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Mon, 7 Nov 2022 21:57:32 -0600
Subject: [PATCH 526/704] [Hexagon] [runtime] Move lock/unlock to HexagonHtp
 temporarily (#13318)

Move lock/unlock to HexagonHtp temporarily
---
 src/runtime/hexagon/hexagon_htp.cc            | 12 ++++++++++--
 src/runtime/hexagon/hexagon_thread_manager.cc | 10 ++++++----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_htp.cc b/src/runtime/hexagon/hexagon_htp.cc
index f6c1d2f01ffb..ac1b267902c7 100644
--- a/src/runtime/hexagon/hexagon_htp.cc
+++ b/src/runtime/hexagon/hexagon_htp.cc
@@ -35,9 +35,17 @@ namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-HexagonHtp::HexagonHtp() { Acquire(); }
+HexagonHtp::HexagonHtp() {
+  Acquire();
+  // TODO(HWE): Perform HTP lock/unlock in thread instead of HexagonHtp
+  Lock();
+}
 
-HexagonHtp::~HexagonHtp() { Release(); }
+HexagonHtp::~HexagonHtp() {
+  // TODO(HWE): Perform HTP lock/unlock in thread instead of HexagonHtp
+  Unlock();
+  Release();
+}
 
 void HexagonHtp::Acquire() {
   compute_res_attr_t compute_res_attr;
diff --git a/src/runtime/hexagon/hexagon_thread_manager.cc b/src/runtime/hexagon/hexagon_thread_manager.cc
index 2fbc231e5781..3658611cf00d 100644
--- a/src/runtime/hexagon/hexagon_thread_manager.cc
+++ b/src/runtime/hexagon/hexagon_thread_manager.cc
@@ -325,8 +325,9 @@ void HexagonThreadManager::thread_exit(void* context) {
     tc->hvx->Unlock();
     DLOG(INFO) << "Thread " << index << " unlocked an HVX instance";
   } else if (resource_type == HTP_0) {
-    tc->htp->Unlock();
-    DLOG(INFO) << "Thread " << index << " unlocked the HTP";
+    // TODO(HWE): Perform HTP lock/unlock in thread instead of HexagonHtp
+    // tc->htp->Unlock();
+    // DLOG(INFO) << "Thread " << index << " unlocked the HTP";
   }
 
   DLOG(INFO) << "Thread " << index << " exiting";
@@ -346,8 +347,9 @@ void HexagonThreadManager::thread_main(void* context) {
     tc->hvx->Lock();
     DLOG(INFO) << "Thread " << index << " locked an HVX instance";
   } else if (resource_type == HTP_0) {
-    tc->htp->Lock();
-    DLOG(INFO) << "Thread " << index << " locked the HTP";
+    // TODO(HWE): Perform HTP lock/unlock in thread instead of HexagonHtp
+    // tc->htp->Lock();
+    // DLOG(INFO) << "Thread " << index << " locked the HTP";
   }
 
   while (true) {  // loop, executing commands from pipe

From c898dc6b55c5ef26ef2699dd21965e328b92a8cd Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Tue, 8 Nov 2022 01:33:49 -0500
Subject: [PATCH 527/704] [TIR] Add thread sync if access index doesn't depend
 on thread index (#13314)

This PR updates the `src/tir/transforms/thread_storage_sync.cc`, to make it insert storage sync if the access index doesn't depend on the innermost thread index, i.e., being constant wit respect to the innermost thread id.

This fixes an accuracy problem on model https://github.com/pytorch/benchmark/tree/main/torchbenchmark/models/timm_efficientdet
---
 src/tir/transforms/thread_storage_sync.cc     | 31 ++++++++++++----
 .../test_tir_transform_thread_sync.py         | 36 ++++++++++++++++---
 2 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/src/tir/transforms/thread_storage_sync.cc b/src/tir/transforms/thread_storage_sync.cc
index 954f4f7cc47d..466a52d632a3 100644
--- a/src/tir/transforms/thread_storage_sync.cc
+++ b/src/tir/transforms/thread_storage_sync.cc
@@ -197,20 +197,39 @@ class ThreadSyncPlanner : public StorageAccessVisitor {
     // Same index value means no conflicts
     // TODO(tqchen) more standard set based testing.
     bool has_same_index = true;
+    // Even if access has the same index, those indices need to
+    // depend on the innermost thread id to avoid race condition
+    bool depends_on_thread_index = true;
+    const VarNode* thread_index_var = nullptr;
+    if (!curr.threads.empty()) {
+      thread_index_var = curr.threads.back()->var.get();
+    }
+
     for (size_t i = 0; i < prev.touched.size(); i++) {
       const auto& prev_intset = prev.touched[i];
       const auto& curr_intset = curr.touched[i];
 
-      bool provably_same_index =
-          prev_intset.IsSinglePoint() && curr_intset.IsSinglePoint() &&
-          ExprDeepEqual()(prev_intset.PointValue(), curr_intset.PointValue());
-
-      if (!provably_same_index) {
+      if (prev_intset.IsSinglePoint() && curr_intset.IsSinglePoint()) {
+        PrimExpr prev_index = prev_intset.PointValue();
+        PrimExpr curr_index = curr_intset.PointValue();
+        has_same_index = ExprDeepEqual()(prev_index, curr_index);
+        if (thread_index_var != nullptr) {
+          auto f_uses_thread_index = [=](const tvm::tir::VarNode* parameter) {
+            return parameter == thread_index_var;
+          };
+          depends_on_thread_index = depends_on_thread_index &&
+                                    UsesVar(curr_index, f_uses_thread_index) &&
+                                    UsesVar(prev_index, f_uses_thread_index);
+        }
+      } else {
         has_same_index = false;
+      }
+
+      if (!(has_same_index && depends_on_thread_index)) {
         break;
       }
     }
-    if (has_same_index) {
+    if (has_same_index && depends_on_thread_index) {
       return false;
     }
 
diff --git a/tests/python/unittest/test_tir_transform_thread_sync.py b/tests/python/unittest/test_tir_transform_thread_sync.py
index ffdf4b5916c4..18607ca1a005 100644
--- a/tests/python/unittest/test_tir_transform_thread_sync.py
+++ b/tests/python/unittest/test_tir_transform_thread_sync.py
@@ -15,12 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
-from tvm import te
 import tvm.testing
+from tvm import te
+from tvm.script import tir as T
 
 
-def run_passes(inputs, stmt):
-    func = tvm.te.schedule.SchedulePostProcToPrimFunc(inputs, stmt, None)
+def run_passes(func: tvm.tir.PrimFunc):
     mod = tvm.IRModule.from_expr(func)
     mod = tvm.tir.transform.StorageFlatten(64)(mod)
 
@@ -53,7 +53,8 @@ def test_thread_storage_sync():
     assert isinstance(bounds, tvm.container.Map)
     stmt = tvm.te.schedule.ScheduleOps(s, bounds)
 
-    mod = run_passes([A, A2], stmt)
+    func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, A2], stmt, None)
+    mod = run_passes(func)
     f = mod["test_kernel0"]
     body_list = tvm.tir.stmt_list(f.body.body.body)
     assert body_list[1].value.op.same_as(tvm.ir.Op.get("tir.tvm_storage_sync"))
@@ -89,10 +90,35 @@ def ir(A, B):
     A = tvm.tir.decl_buffer((8,), "float32")
     B = tvm.tir.decl_buffer((8,), "float32")
     stmt = ir(A, B)
-    mod = run_passes([A, B], stmt)
+    func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None)
+    mod = run_passes(func)
+    assert "@tir.tvm_storage_sync" in str(mod)
+
+
+@tvm.testing.requires_cuda
+def test_sync_read_thread_id_independent_location():
+    @T.prim_func
+    def func(p0: T.Buffer[2, "float32"], p1: T.Buffer[2, "float32"]) -> None:
+        threadIdx_x = T.env_thread("threadIdx.x")
+        blockIdx_x = T.env_thread("blockIdx.x")
+        T.preflattened_buffer(p0, [1, 2, 1, 1], dtype="float32", data=p0.data)
+        T.launch_thread(blockIdx_x, 8)
+        result_local = T.alloc_buffer([1], dtype="float32", scope="local")
+        temp_shared = T.alloc_buffer([1], dtype="float32", scope="shared")
+        T.launch_thread(threadIdx_x, 4)
+        result_local[0] = T.float32(0)
+        if threadIdx_x < 1:
+            temp_shared[0] = p0[0]
+        result_local[0] = result_local[0] + temp_shared[0] * p1[0]
+        if threadIdx_x < 1:
+            temp_shared[0] = p0[1]
+        result_local[0] = result_local[0] + temp_shared[0] * p1[1]
+
+    mod = run_passes(func)
     assert "@tir.tvm_storage_sync" in str(mod)
 
 
 if __name__ == "__main__":
     test_thread_storage_sync()
     test_sync_else_branch()
+    test_sync_read_thread_id_independent_location()

From 79093a171833d3edeeae976e81e2d57e91a0e9df Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Tue, 8 Nov 2022 10:19:52 +0000
Subject: [PATCH 528/704] [ETHOSN] Consolidate target string usage (#13159)

* [ETHOSN] Consolidate target string usage

Removes support for a deprecated target string. The deprecation warning
has been around for a couple of releases now so it should be safe to
remove. The target to use moving forward is: `ethos-n -variant=n78 ...`

Refactored direct use of a driver stack target string in the testing
infrastructure to use the same string we expect users to provide. This
simplified some of the code in codegen and hopefully avoids confusion
in the future.
---
 python/tvm/driver/tvmc/composite_target.py    | 12 ---
 python/tvm/relay/op/contrib/ethosn.py         | 17 +---
 src/relay/backend/contrib/ethosn/codegen.cc   | 85 ++++++++-----------
 .../backend/contrib/ethosn/codegen_ethosn.h   |  9 +-
 .../contrib/test_ethosn/infrastructure.py     | 39 +++++----
 .../contrib/test_ethosn/test_addition.py      | 22 ++++-
 .../contrib/test_ethosn/test_codegen.py       | 52 ++++++++++++
 .../contrib/test_ethosn/test_concatenate.py   | 11 ++-
 .../test_ethosn/test_depth_to_space.py        | 11 ++-
 .../contrib/test_ethosn/test_leaky_relu.py    | 11 ++-
 .../contrib/test_ethosn/test_multiply.py      |  9 +-
 .../test_ethosn/test_partition_params.py      | 82 ------------------
 tests/python/contrib/test_ethosn/test_relu.py | 11 ++-
 .../contrib/test_ethosn/test_requantize.py    | 18 +++-
 .../contrib/test_ethosn/test_reshape.py       | 17 +++-
 .../python/contrib/test_ethosn/test_split.py  |  9 +-
 tests/python/contrib/test_ethosn/test_tanh.py | 11 ++-
 .../contrib/test_ethosn/test_topologies.py    | 35 ++++++--
 tests/scripts/task_python_ethosn_tests.sh     |  7 +-
 19 files changed, 264 insertions(+), 204 deletions(-)
 create mode 100644 tests/python/contrib/test_ethosn/test_codegen.py
 delete mode 100644 tests/python/contrib/test_ethosn/test_partition_params.py

diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py
index 88bea9980014..a7087ea9239f 100644
--- a/python/tvm/driver/tvmc/composite_target.py
+++ b/python/tvm/driver/tvmc/composite_target.py
@@ -18,7 +18,6 @@
 Provides support to composite target on TVMC.
 """
 import logging
-import warnings
 
 # Make sure Vitis AI codegen is registered
 import tvm.contrib.target.vitis_ai  # pylint: disable=unused-import
@@ -72,11 +71,6 @@
         "config_key": "relay.ext.vitis_ai.options",
         "pass_pipeline": partition_for_vitis_ai,
     },
-    # Deprecated in favour of "ethos-n".
-    "ethos-n78": {
-        "config_key": "relay.ext.ethos-n.options",
-        "pass_pipeline": partition_for_ethosn,
-    },
 }
 
 
@@ -105,12 +99,6 @@ def get_codegen_by_target(name):
         requested target codegen information
     """
     try:
-        if name == "ethos-n78":
-            warnings.warn(
-                "Please use 'ethos-n' instead of the deprecated 'ethos-n78' target, "
-                "which will be removed in a later release of TVM.",
-                DeprecationWarning,
-            )
         return REGISTERED_CODEGEN[name]
     except KeyError:
         raise TVMCException("Composite target %s is not defined in TVMC." % name)
diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index 9afab68ccd8f..bbe95dac9bba 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -17,7 +17,6 @@
 # pylint: disable=invalid-name, unused-argument
 """Arm(R) Ethos(TM)-N NPU supported operators."""
 from enum import Enum
-import warnings
 from distutils.version import LooseVersion
 
 import tvm.ir
@@ -97,6 +96,8 @@ def is_inline_non_compute_intensive_partitions_enabled() -> bool:
     True if inlining should happen, False if not.
     """
     compiler_attrs = tvm.get_global_func("relay.ext.ethos-n.get_compiler_attrs")()
+    if not compiler_attrs:
+        return False
     return compiler_attrs.inline_non_compute_intensive_partitions
 
 
@@ -115,20 +116,6 @@ def partition_for_ethosn(mod, params=None, **opts):
     -------
     ret : annotated and partitioned module.
     """
-    opts = opts or {}
-    if "variant" not in opts:
-        raise ValueError("Please specify a variant in the target string, e.g. -variant=n78.")
-
-    # -variant=ethos-n78 deprecated in favour of -variant=n78
-    if opts["variant"].lower() == "ethos-n78":
-        warnings.warn(
-            "Please use '-variant=n78' instead of the deprecated "
-            "'-variant=ethos-n78', which will be removed in TVM v0.9.",
-            DeprecationWarning,
-        )
-    elif opts["variant"] != "n78":
-        raise ValueError("When targeting Ethos(TM)-N78, -variant=n78 should be set.")
-
     api_version = ethosn_api_version()
     supported_api_versions = ["3.1.0"]
     if all(api_version != LooseVersion(exp_ver) for exp_ver in supported_api_versions):
diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index d2281f782615..edf7caca820d 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -226,36 +226,27 @@ sl::TensorsAndId MakeOps(const sl::TensorAndId<sl::Operand>& op) {
   return ops;
 }
 
-String MakeVariant(Optional<EthosnCompilerConfig> configuration) {
-  String variant = configuration.value()->variant;
-  // Transform variant string to lowercase for comparison
-  std::string variant_string = variant.c_str();
-
-  // Checking deprecated variant format. Support for specifying
-  // the variant in this way only remains for backwards compatibility
-  // and will be removed in a later release of TVM.
-  std::string deprecated_variant_string = variant_string;
-  std::transform(deprecated_variant_string.begin(), deprecated_variant_string.end(),
-                 deprecated_variant_string.begin(), ::tolower);
-  if (variant_string == "n78" || deprecated_variant_string == "ethos-n78") {
-    String tops = configuration.value()->tops;
-    String ple_ratio = configuration.value()->ple_ratio;
-    variant = "Ethos-N78_" + tops + "TOPS_" + ple_ratio + "PLE_RATIO";
-  }
-  return variant;
+sl::EthosNVariant MakeVariant(EthosnCompilerConfig configuration) {
+  String variant = configuration->variant;
+  String tops = configuration->tops;
+  String ple_ratio = configuration->ple_ratio;
+
+  std::string capitalized_variant = variant;
+  std::transform(capitalized_variant.begin(), capitalized_variant.end(),
+                 capitalized_variant.begin(), ::toupper);
+  std::string sl_variant_string =
+      "Ethos-" + capitalized_variant + "_" + tops + "TOPS_" + ple_ratio + "PLE_RATIO";
+  return sl::EthosNVariantFromString(sl_variant_string.c_str());
 }
 
 NetworkWithIDs ConstructNetworkVisitor::Construct(const Function& func) {
   // Initialise everything
-  auto ctx = transform::PassContext::Current();
-  auto cfg = ctx->GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options");
-  if (!cfg.defined()) {
-    cfg = AttrsWithDefaultValues<EthosnCompilerConfig>();
-  }
+  EthosnCompilerConfig cfg = GetCompilerAttrs();
+  sl::EthosNVariant variant = MakeVariant(cfg);
+
   NetworkWithIDs network_with_ids;
   network_ = sl::CreateNetwork(
-      sl::GetFwAndHwCapabilities(sl::EthosNVariantFromString(MakeVariant(cfg).c_str()),
-                                 static_cast<uint32_t>(std::stoul(cfg.value()->sram_size))));
+      sl::GetFwAndHwCapabilities(variant, static_cast<uint32_t>(std::stoul(cfg->sram_size))));
   network_with_ids.network = network_;
   operand_table_.clear();
 
@@ -744,28 +735,24 @@ runtime::ethosn::OrderedCompiledNetwork EthosnCompiler::CompileEthosnFunc(const
 }
 
 sl::CompilationOptions EthosnCompiler::CreateOptions() {
-  auto ctx = transform::PassContext::Current();
-  auto cfg = ctx->GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options");
-  if (!cfg.defined()) {
-    cfg = AttrsWithDefaultValues<EthosnCompilerConfig>();
-  }
+  EthosnCompilerConfig cfg = GetCompilerAttrs();
 
   sl::CompilationOptions options;
-  options.m_Strategy0 = cfg.value()->strategy0;
-  options.m_Strategy1 = cfg.value()->strategy1;
-  options.m_Strategy3 = cfg.value()->strategy3;
-  options.m_Strategy4 = cfg.value()->strategy4;
-  options.m_Strategy6 = cfg.value()->strategy6;
-  options.m_Strategy7 = cfg.value()->strategy7;
-  options.m_DebugInfo.m_DumpRam = cfg.value()->dump_ram;
-  options.m_DebugInfo.m_InitialSramDump = cfg.value()->initial_sram_dump;
-  options.m_BlockConfig16x16 = cfg.value()->block_config_16x16;
-  options.m_BlockConfig32x8 = cfg.value()->block_config_32x8;
-  options.m_BlockConfig8x32 = cfg.value()->block_config_8x32;
-  options.m_BlockConfig8x8 = cfg.value()->block_config_8x8;
-  options.m_EnableIntermediateCompression = cfg.value()->enable_intermediate_compression;
-  options.m_DisableWinograd = cfg.value()->disable_winograd;
-  options.m_DebugInfo.m_DebugDir = cfg.value()->debug_dir;
+  options.m_Strategy0 = cfg->strategy0;
+  options.m_Strategy1 = cfg->strategy1;
+  options.m_Strategy3 = cfg->strategy3;
+  options.m_Strategy4 = cfg->strategy4;
+  options.m_Strategy6 = cfg->strategy6;
+  options.m_Strategy7 = cfg->strategy7;
+  options.m_DebugInfo.m_DumpRam = cfg->dump_ram;
+  options.m_DebugInfo.m_InitialSramDump = cfg->initial_sram_dump;
+  options.m_BlockConfig16x16 = cfg->block_config_16x16;
+  options.m_BlockConfig32x8 = cfg->block_config_32x8;
+  options.m_BlockConfig8x32 = cfg->block_config_8x32;
+  options.m_BlockConfig8x8 = cfg->block_config_8x8;
+  options.m_EnableIntermediateCompression = cfg->enable_intermediate_compression;
+  options.m_DisableWinograd = cfg->disable_winograd;
+  options.m_DebugInfo.m_DebugDir = cfg->debug_dir;
   return options;
 }
 
@@ -806,12 +793,10 @@ std::unique_ptr<sl::SupportQueries> EthosnCompiler::m_Queries;
 
 EthosnError EthosnCompiler::SupportedSetup() {
   if (m_Queries == nullptr) {
-    auto ctx = transform::PassContext::Current();
-    auto cfg = ctx->GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options").defined()
-                   ? ctx->GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options")
-                   : AttrsWithDefaultValues<EthosnCompilerConfig>();
-    m_Queries = std::make_unique<sl::SupportQueries>(sl::GetFwAndHwCapabilities(
-        sl::EthosNVariantFromString(MakeVariant(cfg).c_str()), std::stoul(cfg.value()->sram_size)));
+    EthosnCompilerConfig cfg = GetCompilerAttrs();
+    sl::EthosNVariant variant = MakeVariant(cfg);
+    m_Queries = std::make_unique<sl::SupportQueries>(
+        sl::GetFwAndHwCapabilities(variant, std::stoul(cfg->sram_size)));
     if (m_Queries == nullptr) {
       return EthosnError("Could not initialise Arm(R) Ethos(TM)-N compiler isSupported");
     }
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index c640db47b6dd..7c52da713c5c 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -296,13 +296,14 @@ class EthosnCompilerConfig : public Attrs {
 TVM_REGISTER_NODE_TYPE(EthosnCompilerConfigNode);
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.ethos-n.options", EthosnCompilerConfig);
 
-auto GetCompilerAttrs() {
+EthosnCompilerConfig GetCompilerAttrs() {
   auto ctx = transform::PassContext::Current();
-  auto cfg = ctx->GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options");
+  Optional<EthosnCompilerConfig> cfg =
+      ctx->GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options");
   if (!cfg.defined()) {
-    cfg = AttrsWithDefaultValues<EthosnCompilerConfig>();
+    return AttrsWithDefaultValues<EthosnCompilerConfig>();
   }
-  return cfg;
+  return cfg.value();
 }
 TVM_REGISTER_GLOBAL("relay.ext.ethos-n.get_compiler_attrs").set_body_typed(GetCompilerAttrs);
 
diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
index 85ebd98efcff..8a469403872f 100644
--- a/tests/python/contrib/test_ethosn/infrastructure.py
+++ b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -31,6 +31,7 @@
 from tvm import relay
 from tvm.contrib import utils, graph_executor, download
 from tvm.relay.op.contrib import partition_for_ethosn
+from tvm.driver.tvmc.target import parse_target
 
 from . import _infrastructure
 
@@ -143,7 +144,9 @@ def visit_call(self, call):
     return c.count
 
 
-def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1, optimize_partitions=True):
+def build(
+    mod, params, npu=True, expected_host_ops=0, npu_partitions=1, additional_config_args=None
+):
     """Build a network with or without Ethos-N offloading.
 
     Parameters
@@ -158,22 +161,18 @@ def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1, optimize
         The number of ops expected to remain on the host.
     npu_partitions : int, optional
         The number of Ethos-N partitions expected.
-    optimize_partitions : bool, optional
-        Disable the pass that optimizes NPU partitions post partitioning.
+    additional_config_args : dict, optional
+        Additional compiler config options for the NPU.
     """
     relay.backend.te_compiler.get().clear()
-    with tvm.transform.PassContext(
-        opt_level=3,
-        config={
-            "relay.ext.ethos-n.options": {
-                "variant": get_ethosn_variant(),
-                "inline_non_compute_intensive_partitions": optimize_partitions,
-            }
-        },
-    ):
+    if not additional_config_args:
+        additional_config_args = {}
+    npu_config = {**get_ethosn_device_options(), **additional_config_args}
+    print(npu_config)
+    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.ethos-n.options": npu_config}):
         with tvm.target.Target("llvm"):
             if npu:
-                mod = partition_for_ethosn(mod, params, variant="n78")
+                mod = partition_for_ethosn(mod, params)
                 host_op_count = get_host_op_count(mod)
                 assert (
                     host_op_count == expected_host_ops
@@ -244,12 +243,12 @@ def build_and_run(
     npu=True,
     expected_host_ops=0,
     npu_partitions=1,
-    optimize_partitions=True,
+    additional_config_args=None,
 ):
     """
     Convenient wrapper for building and running a module on the NPU.
     """
-    lib = build(mod, params, npu, expected_host_ops, npu_partitions, optimize_partitions)
+    lib = build(mod, params, npu, expected_host_ops, npu_partitions, additional_config_args)
     return run(lib, inputs, outputs, npu)
 
 
@@ -285,7 +284,7 @@ def test_error(mod, params, err_msg):
 
     caught = None
     with tvm.transform.PassContext(
-        opt_level=3, config={"relay.ext.ethos-n.options": {"variant": get_ethosn_variant()}}
+        opt_level=3, config={"relay.ext.ethos-n.options": get_ethosn_device_options()}
     ):
         with tvm.target.Target("llvm"):
             try:
@@ -403,5 +402,9 @@ def get_same_padding(
     return (pad_top, pad_left, pad_bottom, pad_right)
 
 
-def get_ethosn_variant():
-    return os.getenv("ETHOSN_VARIANT_CONFIG", default="Ethos-N78_1TOPS_2PLE_RATIO")
+def get_ethosn_device_options():
+    """Determine the NPU configuration used for testing."""
+    default_target_string = "ethos-n -variant=n78 -tops=1 -ple_ratio=2"
+    target_string = os.getenv("ETHOSN_TEST_TARGET_CONFIG", default_target_string)
+    target = parse_target(target_string)
+    return target[0]["opts"]
diff --git a/tests/python/contrib/test_ethosn/test_addition.py b/tests/python/contrib/test_ethosn/test_addition.py
index 53afd01b8449..9841e798aff4 100644
--- a/tests/python/contrib/test_ethosn/test_addition.py
+++ b/tests/python/contrib/test_ethosn/test_addition.py
@@ -111,7 +111,16 @@ def test_addition(dtype, shape):
     model = _get_model(shape, shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype)
     for npu in [False, True]:
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                {},
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
+        )
 
     tei.verify(outputs, dtype, 1)
 
@@ -227,7 +236,16 @@ def test_addition_to_reinterpret_quantize(lhs_shape, lhs_is_constant, rhs_shape,
     outputs = []
     for npu in [False, True]:
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                {},
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
+        )
     tei.verify(outputs, dtype, 1)
 
 
diff --git a/tests/python/contrib/test_ethosn/test_codegen.py b/tests/python/contrib/test_ethosn/test_codegen.py
new file mode 100644
index 000000000000..c50dfb7963d3
--- /dev/null
+++ b/tests/python/contrib/test_ethosn/test_codegen.py
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""NPU codegen tests"""
+
+import pytest
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.testing import requires_ethosn
+
+from . import infrastructure as tei
+
+
+@requires_ethosn
+def test_compile_with_unsupported_variant():
+    """Test compilation with unsupported variant."""
+    dtype = "int8"
+    input_shape = (1, 2, 2, 2)
+
+    x = relay.var("x", shape=input_shape, dtype=dtype)
+    y = relay.reshape(x, newshape=(1, 1, 1, 8))
+    mod = tei.make_ethosn_partition(y)
+
+    additional_config_args = {
+        "variant": "foo",
+        "inline_non_compute_intensive_partitions": False,
+    }
+
+    inputs = {
+        "x": np.random.randint(
+            low=np.iinfo(dtype).min, high=np.iinfo(dtype).max, size=input_shape, dtype=dtype
+        )
+    }
+
+    with pytest.raises(tvm.TVMError, match=r"Unknown NPU type"):
+        tei.build_and_run(mod, inputs, 1, {}, True, additional_config_args=additional_config_args)
diff --git a/tests/python/contrib/test_ethosn/test_concatenate.py b/tests/python/contrib/test_ethosn/test_concatenate.py
index f8521b595060..83e84046d0a6 100644
--- a/tests/python/contrib/test_ethosn/test_concatenate.py
+++ b/tests/python/contrib/test_ethosn/test_concatenate.py
@@ -76,7 +76,16 @@ def test_concatenate(dtype, shapes, axis):
     for npu in [False, True]:
         model = _get_model(shapes, dtype, axis)
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                {},
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
+        )
 
         tei.verify(outputs, dtype, 0)
 
diff --git a/tests/python/contrib/test_ethosn/test_depth_to_space.py b/tests/python/contrib/test_ethosn/test_depth_to_space.py
index 814693b664ca..7bbd532241fd 100644
--- a/tests/python/contrib/test_ethosn/test_depth_to_space.py
+++ b/tests/python/contrib/test_ethosn/test_depth_to_space.py
@@ -53,7 +53,16 @@ def test_depth_to_space(dtype, shape):
     for npu in [False, True]:
         model = _get_model(shape, 2, dtype, "NHWC")
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                {},
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
+        )
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_leaky_relu.py b/tests/python/contrib/test_ethosn/test_leaky_relu.py
index 7c1969ec44ba..ccf67151bf1e 100644
--- a/tests/python/contrib/test_ethosn/test_leaky_relu.py
+++ b/tests/python/contrib/test_ethosn/test_leaky_relu.py
@@ -65,7 +65,16 @@ def test_leaky_relu(dtype, shape, alpha):
     for npu in [False, True]:
         model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype, alpha)
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                {},
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
+        )
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_multiply.py b/tests/python/contrib/test_ethosn/test_multiply.py
index a7b97e39cb13..d7ebcfab40a8 100644
--- a/tests/python/contrib/test_ethosn/test_multiply.py
+++ b/tests/python/contrib/test_ethosn/test_multiply.py
@@ -152,7 +152,14 @@ def test_multiply_to_reinterpret_quantize(shape, constant_shape, reverse_inputs)
     for npu in [False, True]:
         mod = tei.make_module(model, params)
         outputs.append(
-            tei.build_and_run(mod, inputs, 1, params, npu=npu, optimize_partitions=False)
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                params,
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
         )
 
     tei.verify(outputs, dtype, 1)
diff --git a/tests/python/contrib/test_ethosn/test_partition_params.py b/tests/python/contrib/test_ethosn/test_partition_params.py
deleted file mode 100644
index e8ac687c04b0..000000000000
--- a/tests/python/contrib/test_ethosn/test_partition_params.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Arm(R) Ethos(TM)-N partition parameter tests"""
-
-import pytest
-import numpy as np
-
-import tvm
-from tvm import relay
-from tvm.relay.op.contrib.ethosn import partition_for_ethosn
-from tvm.testing import requires_ethosn
-
-
-@requires_ethosn
-def test_ethosn78_partition_no_error():
-    """Test Arm(R) Ethos(TM)-N78 partition"""
-
-    a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8")
-    weights = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
-    res = relay.nn.conv2d(
-        a, weights, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8"
-    )
-    b = relay.var("b", shape=[8], dtype="uint8")
-    res = relay.nn.bias_add(res, b, axis=1)
-
-    mod = tvm.IRModule.from_expr(res)
-    opts = {"variant": "n78"}
-    partition_for_ethosn(mod, **opts)
-
-
-@requires_ethosn
-def test_ethosn78_partition_undefined_variant():
-    """Test Arm(R) Ethos(TM)-N78 partition with undefined variant"""
-
-    with pytest.raises(
-        ValueError, match=r".*Please specify a variant in the target string, e.g. -variant=n78.*"
-    ):
-        a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8")
-        weights = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
-        res = relay.nn.conv2d(
-            a, weights, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8"
-        )
-        b = relay.var("b", shape=[8], dtype="uint8")
-        res = relay.nn.bias_add(res, b, axis=1)
-
-        mod = tvm.IRModule.from_expr(res)
-        partition_for_ethosn(mod)
-
-
-@requires_ethosn
-def test_ethosn78_partition_invalid_variant():
-    """Test Arm(R) Ethos(TM)-N78 partition with invalid variant"""
-
-    with pytest.raises(
-        ValueError, match=r".*When targeting Ethos\(TM\)-N78, -variant=n78 should be set.*"
-    ):
-        a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8")
-        wwights = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
-        res = relay.nn.conv2d(
-            a, wwights, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8"
-        )
-        b = relay.var("b", shape=[8], dtype="uint8")
-        res = relay.nn.bias_add(res, b, axis=1)
-
-        mod = tvm.IRModule.from_expr(res)
-        opts = {"variant": "Ethos-N"}
-        partition_for_ethosn(mod, **opts)
diff --git a/tests/python/contrib/test_ethosn/test_relu.py b/tests/python/contrib/test_ethosn/test_relu.py
index 8ecea0d23ce4..b1ab6ede2c42 100644
--- a/tests/python/contrib/test_ethosn/test_relu.py
+++ b/tests/python/contrib/test_ethosn/test_relu.py
@@ -60,7 +60,16 @@ def test_relu(dtype, shape, a_min, a_max):
     for npu in [False, True]:
         model = _get_model(inputs["a"].shape, dtype, a_min, a_max)
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                {},
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
+        )
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_requantize.py b/tests/python/contrib/test_ethosn/test_requantize.py
index 618b00c6e4ee..315beddbe45e 100644
--- a/tests/python/contrib/test_ethosn/test_requantize.py
+++ b/tests/python/contrib/test_ethosn/test_requantize.py
@@ -64,7 +64,14 @@ def test_requantize(in_dtype, out_dtype, shape):
             out_dtype=out_dtype,
         )
         mod = tei.make_module(model, [])
-        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False)
+        x = tei.build_and_run(
+            mod,
+            inputs,
+            1,
+            {},
+            npu=npu,
+            additional_config_args={"inline_non_compute_intensive_partitions": False},
+        )
         outputs.append(x)
 
     tei.verify(outputs, out_dtype, 1)
@@ -128,7 +135,14 @@ def get_model():
     for npu in [False, True]:
         model = get_model()
         mod = tei.make_module(model, {})
-        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False)
+        x = tei.build_and_run(
+            mod,
+            inputs,
+            1,
+            {},
+            npu=npu,
+            additional_config_args={"inline_non_compute_intensive_partitions": False},
+        )
         outputs.append(x)
 
     tei.verify(outputs, out_dtype, 1)
diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py
index d60ad50b97bc..2c6b4fda5af5 100644
--- a/tests/python/contrib/test_ethosn/test_reshape.py
+++ b/tests/python/contrib/test_ethosn/test_reshape.py
@@ -72,7 +72,14 @@ def test_reshape(dtype, input_shape, output_shape):
         model, params = _get_model(input_shape, output_shape, dtype)
         mod = tei.make_module(model, params)
         outputs.append(
-            tei.build_and_run(mod, inputs, 1, params, npu=npu, optimize_partitions=False)
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                params,
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
         )
 
     tei.verify(outputs, dtype, 1)
@@ -93,4 +100,10 @@ def test_reshape_failure(input_shape, output_shape):
 
     model, params = _get_model(input_shape, output_shape, "int8")
     mod = tei.make_module(model, params)
-    tei.build(mod, params, expected_host_ops=1, npu_partitions=0, optimize_partitions=False)
+    tei.build(
+        mod,
+        params,
+        expected_host_ops=1,
+        npu_partitions=0,
+        additional_config_args={"inline_non_compute_intensive_partitions": False},
+    )
diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py
index 56e51e2de159..0c13df97eef3 100644
--- a/tests/python/contrib/test_ethosn/test_split.py
+++ b/tests/python/contrib/test_ethosn/test_split.py
@@ -57,7 +57,14 @@ def test_split(dtype, shape, splits, axis):
         mod = tei.make_module(model, {})
         output_count = splits if isinstance(splits, int) else len(splits) + 1
         outputs.append(
-            tei.build_and_run(mod, inputs, output_count, {}, npu=npu, optimize_partitions=False)
+            tei.build_and_run(
+                mod,
+                inputs,
+                output_count,
+                {},
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
         )
 
         tei.verify(outputs, dtype, 0)
diff --git a/tests/python/contrib/test_ethosn/test_tanh.py b/tests/python/contrib/test_ethosn/test_tanh.py
index c2fc5188e5f1..25f46e51eda9 100644
--- a/tests/python/contrib/test_ethosn/test_tanh.py
+++ b/tests/python/contrib/test_ethosn/test_tanh.py
@@ -59,7 +59,16 @@ def test_tanh(dtype, shape):
     for npu in [False, True]:
         model = _get_model(shape, zp_min + 120, 0.0250629, zp_min + 128, 0.0078125, dtype)
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                {},
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
+        )
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py
index 4a4fc1e4d126..78aa19a846eb 100644
--- a/tests/python/contrib/test_ethosn/test_topologies.py
+++ b/tests/python/contrib/test_ethosn/test_topologies.py
@@ -90,7 +90,7 @@ def get_model(input_shape, dtype, var_names):
                 npu=npu,
                 expected_host_ops=expected_host_ops,
                 npu_partitions=npu_partitions,
-                optimize_partitions=False,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
             )
         )
 
@@ -178,7 +178,16 @@ def get_model(input_shape, dtype, var_names):
     for npu in [False, True]:
         model = get_model(inputs["a"].shape, dtype, iter(inputs))
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 8, {}, npu=npu, optimize_partitions=False))
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                8,
+                {},
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
+        )
 
     tei.verify(outputs, dtype, 1)
 
@@ -286,7 +295,7 @@ def get_model(shape, dtype, splits, axis):
                 npu=npu,
                 expected_host_ops=expected_host_ops,
                 npu_partitions=npu_partitions,
-                optimize_partitions=False,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
             )
         else:
             outputs.append(
@@ -298,7 +307,7 @@ def get_model(shape, dtype, splits, axis):
                     npu=npu,
                     expected_host_ops=expected_host_ops,
                     npu_partitions=npu_partitions,
-                    optimize_partitions=False,
+                    additional_config_args={"inline_non_compute_intensive_partitions": False},
                 )
             )
 
@@ -329,7 +338,16 @@ def get_model(dtype):
     for npu in [False, True]:
         model = get_model(dtype)
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 4, {}, npu=npu, optimize_partitions=False))
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                4,
+                {},
+                npu=npu,
+                additional_config_args={"inline_non_compute_intensive_partitions": False},
+            )
+        )
 
     tei.verify(outputs, dtype, 0)
 
@@ -378,7 +396,12 @@ def get_model(shapes, dtype, axis):
             mod = tei.make_module(model, {})
         else:
             mod = tei.make_ethosn_partition(model)
-        lib = tei.build(mod, {}, npu=False, optimize_partitions=False)
+        lib = tei.build(
+            mod,
+            {},
+            npu=False,
+            additional_config_args={"inline_non_compute_intensive_partitions": False},
+        )
         outputs.append(tei.run(lib, inputs, 1, npu=npu))
 
     tei.verify(outputs, dtype, 0)
diff --git a/tests/scripts/task_python_ethosn_tests.sh b/tests/scripts/task_python_ethosn_tests.sh
index d49b8518a4ad..812c02798da8 100755
--- a/tests/scripts/task_python_ethosn_tests.sh
+++ b/tests/scripts/task_python_ethosn_tests.sh
@@ -27,8 +27,7 @@ source tests/scripts/setup-pytest-env.sh
 find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
-# Note: Default behaviour is to assume the test target is Ethos-N78
-# but setting ETHOSN_VARIANT_CONFIG appropriately
-# (e.g. ETHOSN_VARIANT_CONFIG=Ethos-N78_1TOPS_2PLE_RATIO)
-# switches the target to various Ethos-N78 configurations.
+# Note: Setting ETHOSN_TEST_TARGET_CONFIG appropriately
+# (e.g. ETHOSN_TEST_TARGET_CONFIG="ethos-n -variant=n78 -tops=1 -ple_ratio=2")
+# switches the target to various NPU configurations.
 run_pytest ctypes python-ethosn tests/python/contrib/test_ethosn

From be30238947305ccbf63655fb11162e726c319804 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Tue, 8 Nov 2022 14:45:33 +0300
Subject: [PATCH 529/704] [Adreno][Textures] Fix static memory planner 
 (#13253)

* [Adreno][Textures] Fix static memory planner

Fix memory reusage in static memory planner.

* Move token allocators to separate file

* Add test on TokenAllocator2d

* Apply comments and fix CI
---
 src/relay/backend/graph_plan_memory.cc        | 243 +-----------
 src/relay/backend/token_allocator.cc          | 201 ++++++++++
 src/relay/backend/token_allocator.h           | 161 ++++++++
 .../relay/backend/graph_plan_token_alloc.cc   | 351 ++++++++++++++++++
 4 files changed, 714 insertions(+), 242 deletions(-)
 create mode 100644 src/relay/backend/token_allocator.cc
 create mode 100644 src/relay/backend/token_allocator.h
 create mode 100644 tests/cpp/relay/backend/graph_plan_token_alloc.cc

diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index f927bf633732..d85ffd78291c 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -37,6 +37,7 @@
 #include "../op/call/call.h"
 #include "../op/memory/memory.h"
 #include "../transforms/device_aware_visitors.h"
+#include "./token_allocator.h"
 #include "./utils.h"
 
 namespace tvm {
@@ -50,33 +51,6 @@ using backend::StaticMemoryPlan;
 using backend::StorageInfo;
 using IntegerArray = Array<Integer>;
 
-/*! A representation of a block of memory required at runtime on some device. */
-struct StorageToken {
-  /*! \brief Reference counter */
-  int ref_counter{0};
-  /*! \brief number of bytes */
-  size_t max_bytes{0};
-  /*! \brief The corresponding tensor type. */
-  TensorType ttype{nullptr};
-  /*! \brief VirtualDevice on which the memory will reside. */
-  VirtualDevice virtual_device = VirtualDevice::FullyUnconstrained();
-  /*! \brief The storage id */
-  int64_t storage_id{-1};
-
-  bool is_valid() const { return !virtual_device->IsFullyUnconstrained(); }
-
-  bool is_compatible(const StorageToken& that) const {
-    return virtual_device == that.virtual_device;
-  }
-
-  std::string ToString() const {
-    std::ostringstream os;
-    os << "{storage_id: " << storage_id << ", max_bytes: " << max_bytes
-       << ", ttype: " << PrettyPrint(ttype) << ", virtual_device: " << virtual_device << "}";
-    return os.str();
-  }
-};
-
 class StorageAllocaBaseVisitor : public transform::DeviceAwareExprVisitor {
  public:
   StorageAllocaBaseVisitor() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
@@ -380,221 +354,6 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     }
   }
 
-  /**
-   * @brief Memory manager for flattened 1d memory (buffers)
-   */
-  class TokenAllocator1D {
-   public:
-    /*!
-     * \brief ceil(size/word_size) to get number of words.
-     * \param size The original size.
-     * \param word_size The element size.
-     */
-    static size_t DivRoundUp(size_t size, size_t word_size) {
-      return (size + word_size - 1) / word_size;
-    }
-
-    /*!
-     * \brief Get the memory requirement.
-     * \param prototype The prototype token.
-     * \return The required memory size.
-     *
-     * TODO(mbs): Gf GetMemorySizeBytes in aot_executor_codegen.cc,
-     * CalculateRelayExprSizeBytes in utils.cc
-     */
-    size_t GetMemorySize(StorageToken* prototype) {
-      TensorType ttype = prototype->ttype;
-      ICHECK(ttype.defined());
-      size_t size = 1;
-      for (IndexExpr dim : ttype->shape) {
-        const int64_t* pval = tir::as_const_int(dim);
-        ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
-        ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
-        size *= static_cast<size_t>(pval[0]);
-      }
-      size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
-      return size;
-    }
-    /*!
-     * \brief Request a storage token for a given prototype.
-     * \param prototype. The prototype storage token.
-     * \return The result token.
-     */
-    StorageToken* Request(StorageToken* prototype) {
-      // calculate the size;
-      size_t size = GetMemorySize(prototype);
-      // search memory block in [size / match_range_, size * match_range_)
-      if (match_range_ == 0) {
-        return nullptr;
-      }
-      auto begin = free_.lower_bound(size / match_range_);
-      auto mid = free_.lower_bound(size);
-      auto end = free_.upper_bound(size * match_range_);
-      // search for memory blocks larger than requested
-      for (auto it = mid; it != end; ++it) {
-        StorageToken* tok = it->second;
-        if (!tok->is_compatible(*prototype)) continue;
-        ICHECK_EQ(tok->ref_counter, 0);
-        // Use exect matching strategy
-        tok->max_bytes = std::max(size, tok->max_bytes);
-        tok->ref_counter = prototype->ref_counter;
-        // find a exact match, erase from map and return
-        free_.erase(it);
-        return tok;
-      }
-      // then search for memory blocks smaller than requested space
-      for (auto it = mid; it != begin;) {
-        --it;
-        StorageToken* tok = it->second;
-        if (!tok->is_compatible(*prototype)) continue;
-        ICHECK_EQ(tok->ref_counter, 0);
-        // Use exect matching strategy
-        tok->max_bytes = std::max(size, tok->max_bytes);
-        tok->ref_counter = prototype->ref_counter;
-        // erase from map and return
-        free_.erase(it);
-        return tok;
-      }
-      return nullptr;
-    }
-    /*!
-     * \brief Alloacte a storage token by consuming prototype
-     * \param prototype The prototype token.
-     * \param size The size of memory being requested.
-     */
-    StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
-      size_t size = GetMemorySize(prototype);
-      prototype->max_bytes = size;
-      prototype->storage_id = storage_id;
-      data_.push_back(prototype);
-      return prototype;
-    }
-    /*!
-     * \brief Check if we can release token.
-     * \param tok The token to be released.
-     */
-    void CheckForRelease(StorageToken* tok) {
-      ICHECK_GE(tok->storage_id, 0);
-      ICHECK_GE(tok->ref_counter, 0);
-      if (tok->ref_counter == 0) {
-        free_.insert({tok->max_bytes, tok});
-      }
-    }
-
-   private:
-    // scale used for rough match
-    const size_t match_range_{16};
-    // free list of storage entry
-    std::multimap<size_t, StorageToken*> free_;
-    // all the storage resources available
-    std::vector<StorageToken*> data_;
-  };
-
-  /**
-   * @brief Memory manager for 2d memory (textures)
-   */
-  class TokenAllocator2D {
-   public:
-    /*!
-     * \brief Request a storage token for a given prototype.
-     * \param prototype. The prototype storage token.
-     * \return The result token.
-     */
-    StorageToken* Request(StorageToken* prototype) {
-      auto shape = GetSize2D(prototype);
-      int64_t requested_size = shape.height * shape.width;
-      int64_t min_added_size = std::numeric_limits<int64_t>::max();
-      int64_t min_wasted_size = std::numeric_limits<int64_t>::max();
-      int64_t best_storage_id = -1;
-      MemBlock best_mem, new_mem;
-      for (int64_t free_id : free_list_) {
-        MemBlock& cached = blocks_[free_id];
-        // Can only reuse texture 2d blocks of the same type
-        if (cached.token_->ttype->dtype != prototype->ttype->dtype) {
-          continue;
-        }
-        int64_t cached_size = cached.x_ * cached.y_;
-        new_mem.x_ = std::max(cached.x_, shape.width);
-        new_mem.y_ = std::max(cached.y_, shape.height);
-        int64_t expanded_size = new_mem.x_ * new_mem.y_;
-        int64_t added_size = expanded_size - cached_size;
-        int64_t wasted_size = expanded_size - requested_size;
-        // Prioritize minimization of added size first, then minimize
-        // wasted size among blocks which would not require expansion
-        if ((min_added_size > 0 && added_size < min_added_size) ||
-            (min_added_size == 0 && wasted_size < min_wasted_size)) {
-          min_added_size = added_size;
-          min_wasted_size = wasted_size;
-          best_storage_id = free_id;
-          best_mem = new_mem;
-        }
-      }
-
-      if (min_added_size <= requested_size) {
-        best_mem.token_ = blocks_[best_storage_id].token_;
-        // Reset the reference counter of the now live token
-        best_mem.token_->ref_counter = prototype->ref_counter;
-        blocks_[best_storage_id] = best_mem;
-        free_list_.erase(best_storage_id);
-        return best_mem.token_;
-      }
-      return nullptr;
-    }
-    /*!
-     * \brief Alloacte a storage token by consuming prototype
-     * \param prototype The prototype token.
-     * \param size The size of memory being requested.
-     */
-    StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
-      auto shape = GetSize2D(prototype);
-      MemBlock block;
-      block.x_ = shape.width;
-      block.y_ = shape.height;
-      prototype->storage_id = storage_id;
-      block.token_ = prototype;
-      blocks_[prototype->storage_id] = block;
-      return prototype;
-    }
-    /*!
-     * \brief Check if we can release token.
-     * \param tok The token to be released.
-     */
-    void CheckForRelease(StorageToken* tok) {
-      ICHECK_GE(tok->storage_id, 0);
-      ICHECK_GE(tok->ref_counter, 0);
-      if (tok->ref_counter == 0) {
-        free_list_.insert(tok->storage_id);
-      }
-    }
-    /*!
-     * \brief Get the texture 2d size requirement
-     * \param prototype The prototype token.
-     * \return The required texture 2d memory size in (width, height, channel).
-     */
-    Texture2DShape GetSize2D(StorageToken* prototype) {
-      TensorType ttype = prototype->ttype;
-      ICHECK(ttype.defined());
-      size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(),
-                                                           prototype->virtual_device->memory_scope);
-      struct Shape {
-        const Array<PrimExpr>& shape;
-        int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); }
-      };
-      return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape}, ttype->shape.size(),
-                                                        axis);
-    }
-
-   private:
-    struct MemBlock {
-      StorageToken* token_;
-      int64_t x_;
-      int64_t y_;
-    };
-
-    std::unordered_map<int64_t, MemBlock> blocks_;
-    std::unordered_set<int64_t> free_list_;
-  };
-
   class TokenAllocator {
    public:
     StorageToken* Alloc(StorageToken* proto) {
diff --git a/src/relay/backend/token_allocator.cc b/src/relay/backend/token_allocator.cc
new file mode 100644
index 000000000000..bdecba9afad7
--- /dev/null
+++ b/src/relay/backend/token_allocator.cc
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/token_allocator.cc
+ * \brief Token allocation classes for backend
+ */
+
+#include "token_allocator.h"
+
+#include <tvm/tir/op.h>
+
+#include <algorithm>
+#include <limits>
+
+namespace tvm {
+namespace relay {
+
+size_t TokenAllocator1D::GetMemorySize(StorageToken* prototype) {
+  TensorType ttype = prototype->ttype;
+  ICHECK(ttype.defined());
+  size_t size = 1;
+  for (IndexExpr dim : ttype->shape) {
+    const int64_t* pval = tir::as_const_int(dim);
+    ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
+    ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
+    size *= static_cast<size_t>(pval[0]);
+  }
+  size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
+  return size;
+}
+
+StorageToken* TokenAllocator1D::Request(StorageToken* prototype) {
+  // calculate the size;
+  size_t size = GetMemorySize(prototype);
+  // search memory block in [size / match_range_, size * match_range_)
+  if (match_range_ == 0) {
+    return nullptr;
+  }
+  auto begin = free_.lower_bound(size / match_range_);
+  auto mid = free_.lower_bound(size);
+  auto end = free_.upper_bound(size * match_range_);
+  // search for memory blocks larger than requested
+  for (auto it = mid; it != end; ++it) {
+    StorageToken* tok = it->second;
+    if (!tok->is_compatible(*prototype)) continue;
+    ICHECK_EQ(tok->ref_counter, 0);
+    // Use exect matching strategy
+    tok->max_bytes = std::max(size, tok->max_bytes);
+    tok->ref_counter = prototype->ref_counter;
+    // find a exact match, erase from map and return
+    free_.erase(it);
+    return tok;
+  }
+  // then search for memory blocks smaller than requested space
+  for (auto it = mid; it != begin;) {
+    --it;
+    StorageToken* tok = it->second;
+    if (!tok->is_compatible(*prototype)) continue;
+    ICHECK_EQ(tok->ref_counter, 0);
+    // Use exect matching strategy
+    tok->max_bytes = std::max(size, tok->max_bytes);
+    tok->ref_counter = prototype->ref_counter;
+    // erase from map and return
+    free_.erase(it);
+    return tok;
+  }
+  return nullptr;
+}
+
+StorageToken* TokenAllocator1D::Alloc(StorageToken* prototype, int64_t storage_id) {
+  size_t size = GetMemorySize(prototype);
+  prototype->max_bytes = size;
+  prototype->storage_id = storage_id;
+  data_.push_back(prototype);
+  return prototype;
+}
+
+void TokenAllocator1D::CheckForRelease(StorageToken* tok) {
+  ICHECK_GE(tok->storage_id, 0);
+  ICHECK_GE(tok->ref_counter, 0);
+  if (tok->ref_counter == 0) {
+    free_.insert({tok->max_bytes, tok});
+  }
+}
+
+StorageToken* TokenAllocator2D::Request(StorageToken* prototype) {
+  auto shape = GetSize2D(prototype);
+  const int64_t max_ratio = 5;
+  int64_t min_added_size_x = std::numeric_limits<int64_t>::max();
+  int64_t min_added_size_y = std::numeric_limits<int64_t>::max();
+  int64_t min_wasted_size_x = std::numeric_limits<int64_t>::max();
+  int64_t min_wasted_size_y = std::numeric_limits<int64_t>::max();
+  int64_t best_storage_id = -1;
+  MemBlock new_mem;
+  for (int64_t free_id : free_list_) {
+    MemBlock& cached = blocks_[free_id];
+    // Can only reuse texture 2d blocks of the same type
+    if (cached.token_->ttype->dtype != prototype->ttype->dtype) {
+      continue;
+    }
+    // Can only reuse texture 2d blocks of the same scope
+    // Because reusing textures with different memory scope may lead to
+    // accuracy issues, because the data will be packed in a different way for
+    // different memory scopes.
+    if (cached.token_->virtual_device->memory_scope != prototype->virtual_device->memory_scope) {
+      continue;
+    }
+    // avoid reusing too small and too big textures
+    if (shape.width / cached.x_ > max_ratio || cached.x_ / shape.width > max_ratio ||
+        shape.height / cached.y_ > max_ratio || cached.y_ / shape.height > max_ratio) {
+      continue;
+    }
+    int64_t new_width = std::max(cached.x_, shape.width);
+    int64_t new_height = std::max(cached.y_, shape.height);
+    int64_t added_size_x = new_width - cached.x_;
+    int64_t added_size_y = new_height - cached.y_;
+    int64_t wasted_size_x = new_width - shape.width;
+    int64_t wasted_size_y = new_height - shape.height;
+    // Prioritize minimization of added size first, then minimize
+    // wasted size among blocks which would not require expansion
+    if ((min_added_size_x > 0 && added_size_x < min_added_size_x) ||
+        (min_added_size_y > 0 && added_size_y < min_added_size_y) ||
+        (min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) ||
+        (min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) {
+      min_added_size_x = added_size_x;
+      min_added_size_y = added_size_y;
+      min_wasted_size_x = wasted_size_x;
+      min_wasted_size_y = wasted_size_y;
+      best_storage_id = free_id;
+      new_mem.x_ = new_width;
+      new_mem.y_ = new_height;
+    }
+  }
+
+  if (min_added_size_x == 0 && min_added_size_y == 0) {
+    // use existing block
+    free_list_.erase(best_storage_id);
+    blocks_[best_storage_id].token_->ref_counter += prototype->ref_counter;
+    return blocks_[best_storage_id].token_;
+  } else if (min_added_size_x <= shape.width || min_added_size_y <= shape.height) {
+    // Reset the reference counter of the now live token
+    free_list_.erase(best_storage_id);
+    new_mem.token_ = prototype;
+    new_mem.token_->ref_counter += 1;
+    new_mem.token_->storage_id = best_storage_id;
+    blocks_[best_storage_id] = new_mem;
+    return new_mem.token_;
+  }
+  return nullptr;
+}
+
+StorageToken* TokenAllocator2D::Alloc(StorageToken* prototype, int64_t storage_id) {
+  auto shape = GetSize2D(prototype);
+  MemBlock block;
+  block.x_ = shape.width;
+  block.y_ = shape.height;
+  prototype->storage_id = storage_id;
+  block.token_ = prototype;
+  blocks_[prototype->storage_id] = block;
+  return prototype;
+}
+
+void TokenAllocator2D::CheckForRelease(StorageToken* tok) {
+  ICHECK_GE(tok->storage_id, 0);
+  ICHECK_GE(tok->ref_counter, 0);
+  if (tok->ref_counter == 0) {
+    free_list_.insert(tok->storage_id);
+  }
+}
+
+runtime::Texture2DShape<int64_t> TokenAllocator2D::GetSize2D(StorageToken* prototype) {
+  TensorType ttype = prototype->ttype;
+  ICHECK(ttype.defined());
+  size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(),
+                                                       prototype->virtual_device->memory_scope);
+  struct Shape {
+    const Array<PrimExpr>& shape;
+    int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); }
+  };
+  return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape}, ttype->shape.size(), axis);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/token_allocator.h b/src/relay/backend/token_allocator.h
new file mode 100644
index 000000000000..3aebd71b6c2b
--- /dev/null
+++ b/src/relay/backend/token_allocator.h
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/token_allocator.h
+ * \brief Token allocation classes for backend
+ */
+#ifndef TVM_RELAY_BACKEND_TOKEN_ALLOCATOR_H_
+#define TVM_RELAY_BACKEND_TOKEN_ALLOCATOR_H_
+
+#include <tvm/relay/type.h>
+#include <tvm/target/virtual_device.h>
+
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "../../runtime/texture.h"
+
+namespace tvm {
+namespace relay {
+
+/*! A representation of a block of memory required at runtime on some device. */
+struct StorageToken {
+  /*! \brief Reference counter */
+  int ref_counter{0};
+  /*! \brief number of bytes */
+  size_t max_bytes{0};
+  /*! \brief The corresponding tensor type. */
+  TensorType ttype{nullptr};
+  /*! \brief VirtualDevice on which the memory will reside. */
+  VirtualDevice virtual_device = VirtualDevice::FullyUnconstrained();
+  /*! \brief The storage id */
+  int64_t storage_id{-1};
+
+  bool is_valid() const { return !virtual_device->IsFullyUnconstrained(); }
+
+  bool is_compatible(const StorageToken& that) const {
+    return virtual_device == that.virtual_device;
+  }
+
+  std::string ToString() const {
+    std::ostringstream os;
+    os << "{storage_id: " << storage_id << ", max_bytes: " << max_bytes
+       << ", ttype: " << PrettyPrint(ttype) << ", virtual_device: " << virtual_device << "}";
+    return os.str();
+  }
+};
+
+/**
+ * @brief Memory manager for flattened 1d memory (buffers)
+ */
+class TokenAllocator1D {
+ public:
+  /*!
+   * \brief ceil(size/word_size) to get number of words.
+   * \param size The original size.
+   * \param word_size The element size.
+   */
+  static size_t DivRoundUp(size_t size, size_t word_size) {
+    return (size + word_size - 1) / word_size;
+  }
+
+  /*!
+   * \brief Get the memory requirement.
+   * \param prototype The prototype token.
+   * \return The required memory size.
+   *
+   * TODO(mbs): Gf GetMemorySizeBytes in aot_executor_codegen.cc,
+   * CalculateRelayExprSizeBytes in utils.cc
+   */
+  size_t GetMemorySize(StorageToken* prototype);
+  /*!
+   * \brief Request a storage token for a given prototype.
+   * \param prototype. The prototype storage token.
+   * \return The result token.
+   */
+  StorageToken* Request(StorageToken* prototype);
+  /*!
+   * \brief Alloacte a storage token by consuming prototype
+   * \param prototype The prototype token.
+   * \param size The size of memory being requested.
+   */
+  StorageToken* Alloc(StorageToken* prototype, int64_t storage_id);
+  /*!
+   * \brief Check if we can release token.
+   * \param tok The token to be released.
+   */
+  void CheckForRelease(StorageToken* tok);
+
+ private:
+  // scale used for rough match
+  const size_t match_range_{16};
+  // free list of storage entry
+  std::multimap<size_t, StorageToken*> free_;
+  // all the storage resources available
+  std::vector<StorageToken*> data_;
+};
+
+/**
+ * @brief Memory manager for 2d memory (textures)
+ */
+class TokenAllocator2D {
+ public:
+  /*!
+   * \brief Request a storage token for a given prototype.
+   * \param prototype. The prototype storage token.
+   * \return The result token.
+   */
+  StorageToken* Request(StorageToken* prototype);
+  /*!
+   * \brief Alloacte a storage token by consuming prototype
+   * \param prototype The prototype token.
+   * \param size The size of memory being requested.
+   */
+  StorageToken* Alloc(StorageToken* prototype, int64_t storage_id);
+  /*!
+   * \brief Check if we can release token.
+   * \param tok The token to be released.
+   */
+  void CheckForRelease(StorageToken* tok);
+  /*!
+   * \brief Get the texture 2d size requirement
+   * \param prototype The prototype token.
+   * \return The required texture 2d memory size in (width, height, channel).
+   */
+  runtime::Texture2DShape<int64_t> GetSize2D(StorageToken* prototype);
+
+ protected:
+  struct MemBlock {
+    StorageToken* token_;
+    int64_t x_;
+    int64_t y_;
+  };
+
+  std::unordered_map<int64_t, MemBlock> blocks_;
+  std::unordered_set<int64_t> free_list_;
+};
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_BACKEND_TOKEN_ALLOCATOR_H_
diff --git a/tests/cpp/relay/backend/graph_plan_token_alloc.cc b/tests/cpp/relay/backend/graph_plan_token_alloc.cc
new file mode 100644
index 000000000000..4641da2cb8b5
--- /dev/null
+++ b/tests/cpp/relay/backend/graph_plan_token_alloc.cc
@@ -0,0 +1,351 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../src/relay/backend/token_allocator.h"
+
+namespace tvm {
+namespace relay {
+
+// TokenAllocator2d is necessary because in class TokenAllocator2D we don't
+// have an access to its protected members. In this class we add new methods
+// which allow us to get and check internal state of class TokenAllocator2D
+class TokenAllocator2DWrapper : public TokenAllocator2D {
+ public:
+  inline size_t FreeListSize() const { return free_list_.size(); }
+  inline size_t BlockMapSize() const { return blocks_.size(); }
+};
+
+TEST(Token2DAlloc, OneToken) {
+  TokenAllocator2DWrapper alloc;
+  int storage_ids = 0;
+  EXPECT_EQ(alloc.BlockMapSize(), 0);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
+  VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc"));
+  StorageToken tok1 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt1,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  auto size2d = alloc.GetSize2D(&tok1);
+  EXPECT_EQ(size2d.channel, 4);
+  EXPECT_EQ(size2d.height, 22);
+  EXPECT_EQ(size2d.width, 400);
+  EXPECT_EQ(alloc.Request(&tok1), nullptr);
+
+  alloc.Alloc(&tok1, storage_ids++);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  tok1.ref_counter -= 1;
+  alloc.CheckForRelease(&tok1);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+}
+
+TEST(Token2DAlloc, EqualSizeTokenReuse) {
+  TokenAllocator2DWrapper alloc;
+  int storage_ids = 0;
+  EXPECT_EQ(alloc.BlockMapSize(), 0);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
+  VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc"));
+  StorageToken tok1 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt1,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  auto size2d = alloc.GetSize2D(&tok1);
+  EXPECT_EQ(size2d.channel, 4);
+  EXPECT_EQ(size2d.height, 22);
+  EXPECT_EQ(size2d.width, 400);
+  EXPECT_EQ(alloc.Request(&tok1), nullptr);
+
+  alloc.Alloc(&tok1, storage_ids++);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  tok1.ref_counter -= 1;
+  alloc.CheckForRelease(&tok1);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+
+  StorageToken tok2 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt1,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  auto req = alloc.Request(&tok2);
+  EXPECT_NE(req, nullptr);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+  EXPECT_EQ(req->storage_id, storage_ids - 1);
+  EXPECT_EQ(req->ref_counter, 1);
+  auto sizeReq = alloc.GetSize2D(req);
+  EXPECT_EQ(sizeReq.channel, 4);
+  EXPECT_EQ(sizeReq.height, 22);
+  EXPECT_EQ(sizeReq.width, 400);
+}
+
+TEST(Token2DAlloc, EqualSizeDiffTypes) {
+  TokenAllocator2DWrapper alloc;
+  int storage_ids = 0;
+  EXPECT_EQ(alloc.BlockMapSize(), 0);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
+  VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc"));
+  StorageToken tok1 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt1,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  auto size2d = alloc.GetSize2D(&tok1);
+  EXPECT_EQ(size2d.channel, 4);
+  EXPECT_EQ(size2d.height, 22);
+  EXPECT_EQ(size2d.width, 400);
+  EXPECT_EQ(alloc.Request(&tok1), nullptr);
+
+  alloc.Alloc(&tok1, storage_ids++);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  tok1.ref_counter -= 1;
+  alloc.CheckForRelease(&tok1);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+
+  TensorType tt2({1, 22, 20, 20, 4}, DataType(kDLFloat, 16, 1));
+  StorageToken tok2 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt2,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  EXPECT_EQ(alloc.Request(&tok2), nullptr);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+
+  alloc.Alloc(&tok2, storage_ids++);
+  EXPECT_EQ(alloc.BlockMapSize(), 2);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+
+  tok2.ref_counter -= 1;
+  alloc.CheckForRelease(&tok2);
+  EXPECT_EQ(alloc.BlockMapSize(), 2);
+  EXPECT_EQ(alloc.FreeListSize(), 2);
+}
+
+TEST(Token2DAlloc, DifferentSizesTokenReuse) {
+  TokenAllocator2DWrapper alloc;
+  int storage_ids = 0;
+  EXPECT_EQ(alloc.BlockMapSize(), 0);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
+  VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc"));
+  StorageToken tok1 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt1,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  auto size2d = alloc.GetSize2D(&tok1);
+  EXPECT_EQ(size2d.channel, 4);
+  EXPECT_EQ(size2d.height, 22);
+  EXPECT_EQ(size2d.width, 400);
+  EXPECT_EQ(alloc.Request(&tok1), nullptr);
+
+  alloc.Alloc(&tok1, storage_ids++);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  tok1.ref_counter -= 1;
+  alloc.CheckForRelease(&tok1);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+
+  TensorType tt2({1, 40, 30, 30, 4}, DataType(kDLFloat, 32, 1));
+  StorageToken tok2 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt2,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  auto req = alloc.Request(&tok2);
+  EXPECT_NE(req, nullptr);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+  EXPECT_EQ(req->storage_id, storage_ids - 1);
+  EXPECT_EQ(req->ref_counter, 2);
+  auto sizeReq = alloc.GetSize2D(req);
+  EXPECT_EQ(sizeReq.channel, 4);
+  EXPECT_EQ(sizeReq.height, 40);
+  EXPECT_EQ(sizeReq.width, 900);
+
+  tok2.ref_counter -= 1;
+  req->ref_counter -= 1;
+  alloc.CheckForRelease(&tok1);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+
+  TensorType tt3({1, 25, 30, 30, 4}, DataType(kDLFloat, 32, 1));
+  StorageToken tok3 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt3,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  auto req2 = alloc.Request(&tok3);
+  EXPECT_NE(req2, nullptr);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+  EXPECT_EQ(req2->storage_id, storage_ids - 1);
+  EXPECT_EQ(req2->ref_counter, 1);
+  auto sizeReq2 = alloc.GetSize2D(req2);
+  EXPECT_EQ(sizeReq2.channel, 4);
+  EXPECT_EQ(sizeReq2.height, 40);
+  EXPECT_EQ(sizeReq2.width, 900);
+}
+
+TEST(Token2DAlloc, DifferentSizesTokenReuse2) {
+  TokenAllocator2DWrapper alloc;
+  int storage_ids = 0;
+  EXPECT_EQ(alloc.BlockMapSize(), 0);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
+  VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc"));
+  StorageToken tok1 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt1,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  auto size2d = alloc.GetSize2D(&tok1);
+  EXPECT_EQ(size2d.channel, 4);
+  EXPECT_EQ(size2d.height, 22);
+  EXPECT_EQ(size2d.width, 400);
+  EXPECT_EQ(alloc.Request(&tok1), nullptr);
+
+  alloc.Alloc(&tok1, storage_ids++);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  tok1.ref_counter -= 1;
+  alloc.CheckForRelease(&tok1);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+
+  TensorType tt2({1, 5, 30, 20, 4}, DataType(kDLFloat, 32, 1));
+  StorageToken tok2 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt2,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  auto req = alloc.Request(&tok2);
+  EXPECT_NE(req, nullptr);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+  EXPECT_EQ(req->storage_id, storage_ids - 1);
+  EXPECT_EQ(req->ref_counter, 2);
+  auto sizeReq = alloc.GetSize2D(req);
+  EXPECT_EQ(sizeReq.channel, 4);
+  EXPECT_EQ(sizeReq.height, 5);
+  EXPECT_EQ(sizeReq.width, 600);
+}
+
+TEST(Token2DAlloc, SameSizesButDiffMemoryScopes) {
+  TokenAllocator2DWrapper alloc;
+  int storage_ids = 0;
+  EXPECT_EQ(alloc.BlockMapSize(), 0);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  TensorType tt1({28, 676, 1, 1, 4}, DataType(kDLFloat, 32, 1));
+  VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-weight"));
+  StorageToken tok1 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt1,  // tensor type
+      vd1,  // virtual device
+      -1    // storage_id
+  };
+  auto size2d = alloc.GetSize2D(&tok1);
+  EXPECT_EQ(size2d.channel, 4);
+  EXPECT_EQ(size2d.height, 28);
+  EXPECT_EQ(size2d.width, 676);
+  EXPECT_EQ(alloc.Request(&tok1), nullptr);
+
+  alloc.Alloc(&tok1, storage_ids++);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 0);
+
+  tok1.ref_counter -= 1;
+  alloc.CheckForRelease(&tok1);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+
+  TensorType tt2({1, 28, 26, 26, 4}, DataType(kDLFloat, 32, 1));
+  VirtualDevice vd2(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc"));
+  StorageToken tok2 = {
+      1,    // ref_counter
+      0,    // max bytes
+      tt2,  // tensor type
+      vd2,  // virtual device
+      -1    // storage_id
+  };
+  auto tok2Size = alloc.GetSize2D(&tok2);
+  EXPECT_EQ(tok2Size.channel, 4);
+  EXPECT_EQ(tok2Size.height, 28);
+  EXPECT_EQ(tok2Size.width, 676);
+
+  EXPECT_EQ(alloc.Request(&tok2), nullptr);
+  EXPECT_EQ(alloc.BlockMapSize(), 1);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+
+  alloc.Alloc(&tok2, storage_ids++);
+  EXPECT_EQ(alloc.BlockMapSize(), 2);
+  EXPECT_EQ(alloc.FreeListSize(), 1);
+
+  tok2.ref_counter -= 1;
+  alloc.CheckForRelease(&tok2);
+  EXPECT_EQ(alloc.BlockMapSize(), 2);
+  EXPECT_EQ(alloc.FreeListSize(), 2);
+}
+}  // namespace relay
+}  // namespace tvm

From bf77e793ab2f31320ca09b841ccd96004a457331 Mon Sep 17 00:00:00 2001
From: leiwen83 <leiwen83@users.noreply.github.com>
Date: Wed, 9 Nov 2022 03:39:47 +0800
Subject: [PATCH 530/704] Fixup libtorch backend build (#13320)

Add clang-format disable for header to prevent reorder.
Torch header file need to be put at the end since torch's dlpack
is a little different with tvm's.

Signed-off-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
---
 .../backend/contrib/libtorch/libtorch_codegen.cc      | 11 +++++++----
 tests/lint/cpplint.sh                                 |  3 ++-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/relay/backend/contrib/libtorch/libtorch_codegen.cc b/src/relay/backend/contrib/libtorch/libtorch_codegen.cc
index f70466f00eed..29fee504349c 100644
--- a/src/relay/backend/contrib/libtorch/libtorch_codegen.cc
+++ b/src/relay/backend/contrib/libtorch/libtorch_codegen.cc
@@ -22,11 +22,8 @@
  * \brief Implementation of libtorch codegen.
  */
 
-#include <ATen/DLConvertor.h>
+// clang-format off
 #include <dlpack/dlpack.h>
-#include <torch/csrc/jit/api/compilation_unit.h>
-#include <torch/csrc/jit/serialization/import.h>
-#include <torch/torch.h>
 #include <tvm/relay/attrs/nn.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/op.h>
@@ -43,6 +40,12 @@
 
 #include "../../utils.h"
 
+#include <ATen/DLConvertor.h>
+#include <torch/csrc/jit/api/compilation_unit.h>
+#include <torch/csrc/jit/serialization/import.h>
+#include <torch/torch.h>
+// clang-format on
+
 namespace tvm {
 namespace relay {
 namespace contrib {
diff --git a/tests/lint/cpplint.sh b/tests/lint/cpplint.sh
index 6c01f0eb0a6b..38c30b2ed6c6 100755
--- a/tests/lint/cpplint.sh
+++ b/tests/lint/cpplint.sh
@@ -26,4 +26,5 @@ python3 3rdparty/dmlc-core/scripts/lint.py --quiet tvm cpp \
 	tests/cpp tests/crt \
 	--exclude_path  "src/runtime/hexagon/rpc/hexagon_rpc.h" \
 			"src/runtime/hexagon/rpc/hexagon_rpc_skel.c" \
-			"src/runtime/hexagon/rpc/hexagon_rpc_stub.c"
+			"src/runtime/hexagon/rpc/hexagon_rpc_stub.c" \
+			"src/relay/backend/contrib/libtorch/libtorch_codegen.cc"

From 15752e4009bdd5528d2085984ab67a60267fa237 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Tue, 8 Nov 2022 13:40:07 -0600
Subject: [PATCH 531/704] [TVMScript] Hide trailing return type if None
 (#13308)

Because the majority of TIR PrimFuncs operate on buffers, write
their outputs to an output parameter, and do not return a value,
the `-> None` in the function signature becomes visual noise.
This commit removes printing of the return type in cases where
the PrimFunc has no return value.
---
 src/printer/tvmscript_printer.cc                  |  9 ++++++++-
 tests/python/unittest/test_tvmscript_roundtrip.py | 15 +++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index d8d5d89be0a4..64a576ef52f5 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -1634,7 +1634,14 @@ Doc TVMScriptPrinter::PrintPrimFunc(const PrimFunc& primFunc) {
     }
     params.push_back(Print(param) << ": " << Print(GetType(param)));
   }
-  doc << PrintSep(params, Doc::Text(", ")) << ") -> " << Print(primFunc->ret_type) << ":";
+  doc << PrintSep(params, Doc::Text(", ")) << ")";
+  if (primFunc->ret_type.defined()) {
+    auto as_tuple = primFunc->ret_type.as<TupleTypeNode>();
+    if (!as_tuple || as_tuple->fields.size()) {
+      doc << " -> " << Print(primFunc->ret_type);
+    }
+  }
+  doc << ":";
 
   Doc body = Doc::NewLine();
   // print buffer_bind
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index 3b72a5ae8a92..dd6706762dc3 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3434,6 +3434,14 @@ def func() -> None:
     return func
 
 
+def return_none():
+    @T.prim_func
+    def func():
+        T.evaluate(0)
+
+    return func
+
+
 def bool_primitive():
     @T.prim_func
     def func() -> None:
@@ -3500,6 +3508,7 @@ def func() -> None:
     bool_variable_annotation,
     bool_primitive,
     bool_cast,
+    return_none,
 )
 
 
@@ -3509,5 +3518,11 @@ def test_roundtrip(ir_generator):
     tvm.ir.assert_structural_equal(original, after_roundtrip, True)
 
 
+def test_return_none_no_trailing_type():
+    func = return_none()
+    script = func.script()
+    assert "-> None" not in script
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 750ba9f742da772e730b9d77075834dd53e37682 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Tue, 8 Nov 2022 23:00:32 +0300
Subject: [PATCH 532/704] [OpenCL][unit tests] Fix opencl cpp unit tests
 (#13254)

* [OpenCL][unit tests] Fix opencl cpp unit tests

After some changes in Hexagon, the run of cpp opencl tests leads to the
following error:
```
pluggy.manager.PluginValidationError: unknown hook 'pytest_configure_node' in plugin <module 'tvm.contrib.hexagon.pytest_plugin'
```
Added `pytest_plugin` for OpenCL CPP tests for avoiding this error and
processing gtest arguments.

* Fix fail than gtest_args option was already added

* Move `gtest_args` deginition to the main testing plugin
---
 python/tvm/contrib/hexagon/pytest_plugin.py  |  8 ------
 python/tvm/testing/plugin.py                 | 10 +++++++
 tests/python/contrib/test_opencl/conftest.py | 29 --------------------
 3 files changed, 10 insertions(+), 37 deletions(-)
 delete mode 100644 tests/python/contrib/test_opencl/conftest.py

diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
index bc167f25045d..585a6cc3c5bb 100644
--- a/python/tvm/contrib/hexagon/pytest_plugin.py
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -345,8 +345,6 @@ def clear_logcat(request) -> bool:
 def pytest_addoption(parser):
     """Add pytest options."""
 
-    parser.addoption("--gtest_args", action="store", default="")
-
     parser.addoption(
         "--skip-rpc",
         action="store_true",
@@ -372,9 +370,3 @@ def pytest_addoption(parser):
         default=False,
         help="If set true, it will clear logcat before execution.",
     )
-
-
-def pytest_generate_tests(metafunc):
-    option_value = metafunc.config.option.gtest_args
-    if "gtest_args" in metafunc.fixturenames and option_value is not None:
-        metafunc.parametrize("gtest_args", [option_value])
diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py
index 2d845b70ff11..c72bf0426e84 100644
--- a/python/tvm/testing/plugin.py
+++ b/python/tvm/testing/plugin.py
@@ -70,12 +70,22 @@ def pytest_configure(config):
     print("pytest marker:", config.option.markexpr)
 
 
+def pytest_addoption(parser):
+    """Add pytest options."""
+    parser.addoption("--gtest_args", action="store", default="")
+
+
 def pytest_generate_tests(metafunc):
     """Called once per unit test, modifies/parametrizes it as needed."""
     _parametrize_correlated_parameters(metafunc)
     _auto_parametrize_target(metafunc)
     _add_target_specific_marks(metafunc)
 
+    # Process gtest arguments
+    option_value = metafunc.config.option.gtest_args
+    if "gtest_args" in metafunc.fixturenames and option_value is not None:
+        metafunc.parametrize("gtest_args", [option_value])
+
 
 def pytest_collection_modifyitems(config, items):
     """Called after all tests are chosen, currently used for bookkeeping."""
diff --git a/tests/python/contrib/test_opencl/conftest.py b/tests/python/contrib/test_opencl/conftest.py
deleted file mode 100644
index 0a8b9e1c631f..000000000000
--- a/tests/python/contrib/test_opencl/conftest.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" OpenCL testing fixtures used to deduce testing argument
-    values from testing parameters """
-
-
-import pytest
-
-import tvm
-import tvm.testing
-
-pytest_plugins = [
-    "tvm.contrib.hexagon.pytest_plugin",
-]

From 16bb1a6c2ee7a8f68f15ab8710588c75880e0dd5 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Tue, 8 Nov 2022 15:30:14 -0600
Subject: [PATCH 533/704] [microTVM][CRT] Add memory size as project option
 (#13313)

* Add memory size as project option

* cleanup

* address comments

* address comments
---
 cmake/modules/StandaloneCrt.cmake             |  2 +-
 .../crt/host/{Makefile => Makefile.template}  |  3 +-
 src/runtime/crt/host/main.cc                  |  2 +-
 src/runtime/crt/host/microtvm_api_server.py   | 44 +++++++++++++++++--
 tests/lint/check_file_type.py                 |  2 +
 5 files changed, 47 insertions(+), 6 deletions(-)
 rename src/runtime/crt/host/{Makefile => Makefile.template} (96%)

diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake
index 5703058d3c3d..8c25cf48df27 100644
--- a/cmake/modules/StandaloneCrt.cmake
+++ b/cmake/modules/StandaloneCrt.cmake
@@ -62,7 +62,7 @@ else()
          "src/runtime/crt/graph_executor_module *.c -> src/runtime/crt/graph_executor_module"
          "src/runtime/crt/host *.cc -> template/host"
          "src/runtime/crt/host *.py -> template/host"
-         "src/runtime/crt/host Makefile -> template/host"
+         "src/runtime/crt/host Makefile.template -> template/host"
          "src/runtime/crt/memory *.c -> src/runtime/crt/memory"
          "src/runtime/crt/microtvm_rpc_common *.cc -> src/runtime/crt/microtvm_rpc_common"
          "src/runtime/crt/microtvm_rpc_server *.cc -> src/runtime/crt/microtvm_rpc_server"
diff --git a/src/runtime/crt/host/Makefile b/src/runtime/crt/host/Makefile.template
similarity index 96%
rename from src/runtime/crt/host/Makefile
rename to src/runtime/crt/host/Makefile.template
index ea2966045bb2..a8e725ade297 100644
--- a/src/runtime/crt/host/Makefile
+++ b/src/runtime/crt/host/Makefile.template
@@ -16,8 +16,9 @@
 # under the License.
 
 INCLUDES ?= -isystem crt/include -Icrt_config
+MEMORY_SIZE_BYTES := <MEMORY_SIZE_BYTES>
 CFLAGS ?= -Werror -Wall
-CXXFLAGS ?= -Werror -Wall -std=c++11 -DTVM_HOST_USE_GRAPH_EXECUTOR_MODULE
+CXXFLAGS ?= -Werror -Wall -std=c++11 -DTVM_HOST_USE_GRAPH_EXECUTOR_MODULE -DMEMORY_SIZE_BYTES=$(MEMORY_SIZE_BYTES)
 LDFLAGS ?= -Werror -Wall
 
 # Codegen produces spurious lines like: int32_t arg2_code = ((int32_t*)arg_type_ids)[(2)];
diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc
index d8fa95fe236b..e9f6813f9b3c 100644
--- a/src/runtime/crt/host/main.cc
+++ b/src/runtime/crt/host/main.cc
@@ -112,7 +112,7 @@ tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
 }
 }
 
-uint8_t memory[2048 * 1024];
+uint8_t memory[MEMORY_SIZE_BYTES];
 
 static char** g_argv = NULL;
 
diff --git a/src/runtime/crt/host/microtvm_api_server.py b/src/runtime/crt/host/microtvm_api_server.py
index 925dc4ea7597..d8b35660e414 100644
--- a/src/runtime/crt/host/microtvm_api_server.py
+++ b/src/runtime/crt/host/microtvm_api_server.py
@@ -24,6 +24,8 @@
 import subprocess
 import tarfile
 import time
+import re
+
 from tvm.micro.project_api import server
 
 
@@ -35,6 +37,10 @@
 
 IS_TEMPLATE = not os.path.exists(os.path.join(PROJECT_DIR, MODEL_LIBRARY_FORMAT_RELPATH))
 
+MEMORY_SIZE_BYTES = 2 * 1024 * 1024
+
+MAKEFILE_FILENAME = "Makefile"
+
 
 class Handler(server.ProjectAPIHandler):
 
@@ -57,7 +63,14 @@ def server_info_query(self, tvm_version):
                     optional=["build"],
                     type="bool",
                     help="Run make with verbose output",
-                )
+                ),
+                server.ProjectOption(
+                    "memory_size_bytes",
+                    optional=["generate_project"],
+                    type="int",
+                    default=MEMORY_SIZE_BYTES,
+                    help="Sets the value of MEMORY_SIZE_BYTES.",
+                ),
             ],
         )
 
@@ -67,6 +80,27 @@ def server_info_query(self, tvm_version):
     # The build target given to make
     BUILD_TARGET = "build/main"
 
+    def _populate_makefile(
+        self,
+        makefile_template_path: pathlib.Path,
+        makefile_path: pathlib.Path,
+        memory_size: int,
+    ):
+        """Generate Makefile from template."""
+        flags = {
+            "MEMORY_SIZE_BYTES": str(memory_size),
+        }
+
+        regex = re.compile(r"([A-Z_]+) := (<[A-Z_]+>)")
+        with open(makefile_path, "w") as makefile_f:
+            with open(makefile_template_path, "r") as makefile_template_f:
+                for line in makefile_template_f:
+                    m = regex.match(line)
+                    if m:
+                        var, token = m.groups()
+                        line = line.replace(token, flags[var])
+                    makefile_f.write(line)
+
     def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options):
         # Make project directory.
         project_dir.mkdir(parents=True)
@@ -97,8 +131,12 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
             else:
                 shutil.copy2(src_path, dst_path)
 
-        # Populate Makefile.
-        shutil.copy2(pathlib.Path(__file__).parent / "Makefile", project_dir / "Makefile")
+        # Populate Makefile
+        self._populate_makefile(
+            pathlib.Path(__file__).parent / f"{MAKEFILE_FILENAME}.template",
+            project_dir / MAKEFILE_FILENAME,
+            options.get("memory_size_bytes", MEMORY_SIZE_BYTES),
+        )
 
         # Populate crt-config.h
         crt_config_dir = project_dir / "crt_config"
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index 527c79754796..2b8b61c41361 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -151,6 +151,8 @@
     "apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay",
     # microTVM Arduino runtime
     "apps/microtvm/arduino/template_project/Makefile.template",
+    # microTVM CRT
+    "src/runtime/crt/host/Makefile.template",
     # microTVM Virtual Machines
     "apps/microtvm/poetry.lock",
     "apps/microtvm/reference-vm/Vagrantfile",

From 36b1c5c4d952085de48728582abe20673bec7a0f Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Tue, 8 Nov 2022 21:12:26 -0800
Subject: [PATCH 534/704] [TIR] Remove redundant add in vnni/arm intrin
 (#13319)

* [TIR] Remove redundant add in vnni intrin

* Update arm intrin

Co-authored-by: Ubuntu <ubuntu@ubuntu.com>
---
 python/tvm/tir/tensor_intrin/arm_cpu.py                     | 6 ++++--
 python/tvm/tir/tensor_intrin/x86.py                         | 5 +++--
 .../test_meta_schedule_postproc_rewrite_tensorize.py        | 5 +++--
 tests/python/unittest/test_meta_schedule_trace_apply.py     | 3 ++-
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/python/tvm/tir/tensor_intrin/arm_cpu.py b/python/tvm/tir/tensor_intrin/arm_cpu.py
index fde3e015fd7c..9357f0ceb28a 100644
--- a/python/tvm/tir/tensor_intrin/arm_cpu.py
+++ b/python/tvm/tir/tensor_intrin/arm_cpu.py
@@ -119,10 +119,12 @@ def dot_product_4x4_i8i8i32_sdot(
 
         vec_b = B.vload([0, 0], dtype="int8x16")
 
-        C[T.ramp(T.int32(0), 1, 4)] += T.call_llvm_pure_intrin(
+        vec_c = C.vload([0], dtype="int32x4")
+
+        C[T.ramp(T.int32(0), 1, 4)] = T.call_llvm_pure_intrin(
             T.llvm_lookup_intrinsic_id("llvm.aarch64.neon.sdot.v4i32.v16i8"),
             T.uint32(3),
-            T.int32x4(0),
+            vec_c,
             vec_a,
             vec_b,
             dtype="int32x4",
diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py
index f1c603228715..d93167f9e614 100644
--- a/python/tvm/tir/tensor_intrin/x86.py
+++ b/python/tvm/tir/tensor_intrin/x86.py
@@ -55,11 +55,12 @@ def dot_product_16x4_u8i8i32_vnni(
 
         B_i8x64 = B.vload([0, 0], dtype="int8x64")
         B_i32x16 = T.reinterpret(B_i8x64, dtype="int32x16")
+        C_i32x16 = C.vload([0], dtype="int32x16")
 
-        C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin(  # Note: this is an update +=
+        C[T.ramp(T.int32(0), 1, 16)] = T.call_llvm_pure_intrin(
             T.llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512"),
             T.uint32(0),
-            T.int32x16(0),
+            C_i32x16,
             T.broadcast(A_i32, 16),
             B_i32x16,
             dtype="int32x16",
diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py
index 8f9d287621e2..1d51b932f359 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py
@@ -233,10 +233,11 @@ def main(
                     A_i32 = T.reinterpret(A_u8x4, dtype="int32")
                     B_i8x64 = B.vload([0, 0], dtype="int8x64")
                     B_i32x16 = T.reinterpret(B_i8x64, dtype="int32x16")
-                    C[T.ramp(0, 1, 16)] = C[T.ramp(0, 1, 16)] + T.call_llvm_pure_intrin(
+                    C_i32x16 = C.vload([0], dtype="int32x16")
+                    C[T.ramp(0, 1, 16)] = T.call_llvm_pure_intrin(
                         T.llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512"),
                         T.uint32(0),
-                        T.broadcast(0, 16),
+                        C_i32x16,
                         T.broadcast(A_i32, 16),
                         B_i32x16,
                         dtype="int32x16",
diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py
index 6ff21c72c9ea..9d871cc981b1 100644
--- a/tests/python/unittest/test_meta_schedule_trace_apply.py
+++ b/tests/python/unittest/test_meta_schedule_trace_apply.py
@@ -1182,7 +1182,8 @@ def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1,
                                 A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
                                 B_i8x64: T.int8x64 = B[0, 0:64]
                                 B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16")
-                                C[0:16] = C[0:16] + T.call_llvm_pure_intrin(intrin_id, T.uint32(0), T.broadcast(0, 16), T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16")
+                                C_i32x16: T.int32x16 = C[0:16]
+                                C[0:16] = T.call_llvm_pure_intrin(intrin_id, T.uint32(0), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16")
                     for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 7):
                         for ax4_fused in T.vectorized(16):
                             with T.block("T_cast_8"):

From 244bceb45776fdaf7bf47863ddf067c3928f1e19 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Tue, 8 Nov 2022 21:12:47 -0800
Subject: [PATCH 535/704] [TIR] Allow folding cast with broadcast and ramp
 (#13317)

---
 src/tir/op/op.cc                 | 11 ++++++++++-
 tests/cpp/arith_simplify_test.cc | 22 ++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc
index 3f7f05fe8e64..9896fe40d833 100644
--- a/src/tir/op/op.cc
+++ b/src/tir/op/op.cc
@@ -296,9 +296,9 @@ PrimExpr cast(const DataType& t, PrimExpr value, Span span) {
     ICHECK(!value.dtype().is_handle()) << "Can't cast a handle to other types.";
     return tir::Cast(t, value, span);
   } else {
+    DataType vtype = t.element_of();
     if (value.dtype().lanes() == 1) {
       // manually unroll cast
-      DataType vtype = t.element_of();
       if (value.dtype() != vtype) {
         if (const IntImmNode* op = value.as<IntImmNode>()) {
           value = make_const(vtype, op->value, op->span);
@@ -311,6 +311,15 @@ PrimExpr cast(const DataType& t, PrimExpr value, Span span) {
       return tir::Broadcast(value, t.lanes(), span);
     } else {
       ICHECK(value.dtype().lanes() == t.lanes());
+      if (const auto* broadcast = value.as<tir::BroadcastNode>()) {
+        return tir::Broadcast(cast(vtype, broadcast->value, span), t.lanes(), span);
+      } else if (const auto* ramp = value.as<tir::RampNode>()) {
+        if (t.is_int() || t.is_uint()) {
+          // only cast to index data type can be folded to ramp
+          return tir::Ramp(cast(vtype, ramp->base, span), cast(vtype, ramp->stride, span),
+                           ramp->lanes, span);
+        }
+      }
       return tir::Cast(t, value, span);
     }
   }
diff --git a/tests/cpp/arith_simplify_test.cc b/tests/cpp/arith_simplify_test.cc
index a3e9bdfa56bd..073b4269eb6f 100644
--- a/tests/cpp/arith_simplify_test.cc
+++ b/tests/cpp/arith_simplify_test.cc
@@ -53,3 +53,25 @@ TEST(Simplify, Mod) {
   auto es = ana.canonical_simplify(mod - x);
   ICHECK(tvm::tir::is_zero(es));
 }
+
+TEST(ConstantFold, Broadcast) {
+  tvm::StructuralEqual checker;
+  auto i32x4 = tvm::tir::Broadcast(tvm::IntImm(tvm::DataType::Int(32), 10), 4);
+  auto i64x4 = tvm::cast(i32x4->dtype.with_bits(64), i32x4);
+  auto i64x4_expected = tvm::tir::Broadcast(tvm::IntImm(tvm::DataType::Int(64), 10), 4);
+  ASSERT_TRUE(checker(i64x4, i64x4_expected));
+}
+
+TEST(ConstantFold, Ramp) {
+  tvm::StructuralEqual checker;
+  auto i32x4 = tvm::tir::Ramp(tvm::IntImm(tvm::DataType::Int(32), 10),
+                              tvm::IntImm(tvm::DataType::Int(32), 1), 4);
+  auto i64x4 = tvm::cast(i32x4->dtype.with_bits(64), i32x4);
+  auto i64x4_expected = tvm::tir::Ramp(tvm::IntImm(tvm::DataType::Int(64), 10),
+                                       tvm::IntImm(tvm::DataType::Int(64), 1), 4);
+  ASSERT_TRUE(checker(i64x4, i64x4_expected));
+
+  auto f32x4 = tvm::cast(tvm::DataType::Float(32, 4), i32x4);
+  auto f32x4_expected = tvm::tir::Cast(tvm::DataType::Float(32, 4), i32x4);
+  ASSERT_TRUE(checker(f32x4, f32x4_expected));
+}

From 65dbee7f0c3a601779718acb3785451f1089ee79 Mon Sep 17 00:00:00 2001
From: Andrey Malyshev <elvin.nnov@gmail.com>
Date: Wed, 9 Nov 2022 08:28:00 +0200
Subject: [PATCH 536/704] [Doc] Add desc of keep_orig_output_dtype for
 ToMixedPrecision (#13321)

---
 python/tvm/relay/transform/transform.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index c1f184671780..1f5b91da4432 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -1349,6 +1349,13 @@ def ToMixedPrecision(mixed_precision_type="float16", missing_op_mode=1):
         1: Allow missing ops but emit warnings.
         2: Allow missing ops and silently ignore them.
 
+    relay.ToMixedPrecision.keep_orig_output_dtype: boolean
+      Defines if outputs should be retained in original data type or convert to
+      mixed_precision_type. By default this parameter is False and transformation
+      modifies the data types of outputs to mixed_precision_type.
+      This parameter is not part of explicit arguments of the transformation, but should
+      be passed through tvm.transform.PassContext.
+
     Returns
     -------
     ret : tvm.transform.Pass

From 0e395c389ccd173cf6c1f254b47a81e715762626 Mon Sep 17 00:00:00 2001
From: Matthew Barrett <55580676+mbaret@users.noreply.github.com>
Date: Wed, 9 Nov 2022 09:17:53 +0000
Subject: [PATCH 537/704] [AOT] Add CreateExecutorMetadata analysis pass
 (#13250)

AOT requires the ExecutorCodegenMetadata object to be
populated containing various pieces of information about
the compiled module. This commit adds a separate analysis
pass to create the metadata + some tests for the new pass.

In order to collect the device information correctly,
AOTLowerMain is extended to attach the device info as a
function attribute.
---
 python/tvm/relay/backend/aot.py               |  34 ++++
 src/relay/backend/aot/aot_lower_main.cc       |   5 +
 .../backend/aot/create_executor_metadata.cc   |  86 +++++++++
 .../backend/aot/create_executor_metadata.h    |  50 +++++
 src/relay/backend/utils.h                     |   2 +
 .../aot/test_aot_create_executor_metadata.py  | 176 ++++++++++++++++++
 .../relay/aot/test_pass_aot_lower_main.py     |  24 +--
 7 files changed, 365 insertions(+), 12 deletions(-)
 create mode 100644 src/relay/backend/aot/create_executor_metadata.cc
 create mode 100644 src/relay/backend/aot/create_executor_metadata.h
 create mode 100644 tests/python/relay/aot/test_aot_create_executor_metadata.py

diff --git a/python/tvm/relay/backend/aot.py b/python/tvm/relay/backend/aot.py
index b861d9298543..778c9b4164dd 100644
--- a/python/tvm/relay/backend/aot.py
+++ b/python/tvm/relay/backend/aot.py
@@ -19,6 +19,7 @@
 from typing import Dict
 
 from tvm import IRModule
+from tvm.relay.backend import Executor
 from tvm.ir.transform import Pass
 from .utils import CallType
 
@@ -67,3 +68,36 @@ def CreateFunctionMetadata(
 
     """
     return _aot.CreateFunctionMetadata(mod, workspace_byte_alignment, constant_byte_alignment)
+
+
+def CreateExecutorMetadata(
+    mod: IRModule,
+    mod_name: str,
+    executor: Executor,
+    workspace_byte_alignment: int,
+    constant_byte_alignment: int,
+) -> object:
+    """Create the executor metadata from an AOT module.
+
+    Parameters
+    ----------
+    mod : IRModule
+        The IRModule.
+    mod_name : str
+        The name of the module.
+    executor : Executor
+        The executor configuration.
+    workspace_byte_alignment : int
+        The alignment of the workspace buffer in bytes.
+    constant_byte_alignment : int
+        The alignment of the constant buffer in bytes.
+
+    Returns
+    -------
+    ExecutorCodegenMetadata
+        The executor metadata.
+
+    """
+    return _aot.CreateExecutorMetadata(
+        mod, mod_name, executor, workspace_byte_alignment, constant_byte_alignment
+    )
diff --git a/src/relay/backend/aot/aot_lower_main.cc b/src/relay/backend/aot/aot_lower_main.cc
index 51dd4b219313..82393c535c43 100644
--- a/src/relay/backend/aot/aot_lower_main.cc
+++ b/src/relay/backend/aot/aot_lower_main.cc
@@ -493,6 +493,11 @@ class AOTMainLowerer : public MixedModeVisitor {
         Array<tir::Var>(main_signature_.begin() + input_vars_.size(),
                         main_signature_.begin() + input_vars_.size() + return_sid_.size());
     dict_attrs.Set("output_vars", output_vars);
+    Array<String> device_names;
+    for (const auto& it : devices_) {
+      device_names.push_back(it.first);
+    }
+    dict_attrs.Set("devices", device_names);
 
     tir::Stmt device_activations = GenerateAllDeviceHook("Activate");
     tir::Stmt device_deactivations = GenerateAllDeviceHook("Deactivate");
diff --git a/src/relay/backend/aot/create_executor_metadata.cc b/src/relay/backend/aot/create_executor_metadata.cc
new file mode 100644
index 000000000000..8ad3566880fa
--- /dev/null
+++ b/src/relay/backend/aot/create_executor_metadata.cc
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/aot/create_executor_metadata.cc
+ * \brief Create the ExecutorCodegenMetadata from a compiled IRModule.
+ */
+
+#include "./create_executor_metadata.h"
+
+#include "../utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+namespace aot {
+
+ExecutorCodegenMetadata CreateExecutorMetadata(const IRModule& mod, String mod_name,
+                                               Executor executor, Integer workspace_byte_alignment,
+                                               Integer constant_byte_alignment) {
+  // Get relevant executor config information
+  std::string interface_api = executor->GetAttr<String>("interface-api").value_or("packed");
+  bool unpacked_api = executor->GetAttr<Bool>("unpacked-api").value_or(Bool(false));
+  // Get the input vars
+  auto tir_main_func = Downcast<tir::PrimFunc>(mod->Lookup(runtime::symbol::tvm_module_main));
+  Array<tir::Var> inputs = tir_main_func->GetAttr<Array<tir::Var>>("input_vars").value();
+  Array<TensorType> input_tensor_types;
+  for (const auto& input : inputs) {
+    auto buffer = tir_main_func->buffer_map.Get(input).value();
+    input_tensor_types.push_back(TensorType(buffer->shape, buffer->dtype));
+  }
+  // Extract USMP metadata to pass onto metadata sources
+  Map<tir::Var, tir::usmp::AllocatedPoolInfo> pool_var_info;
+  std::vector<tir::Var> pool_vars;
+  Optional<Array<tir::usmp::AllocatedPoolInfo>> allocated_pool_infos =
+      tir_main_func->GetAttr<Array<tir::usmp::AllocatedPoolInfo>>(tvm::attr::kPoolArgs);
+  if (allocated_pool_infos) {
+    for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : allocated_pool_infos.value()) {
+      int pool_var_index = allocated_pool_info->pool_var_idx.value()->value;
+      pool_vars.push_back(tir_main_func->params[pool_var_index]);
+      pool_var_info.Set(tir_main_func->params[pool_var_index], allocated_pool_info);
+    }
+  }
+  Map<String, tir::usmp::PoolAllocation> io_pool_allocations =
+      mod->GetAttr<Map<String, tir::usmp::PoolAllocation>>(tvm::attr::kIOTensorPoolAllocations)
+          .value_or({});
+
+  Array<tir::Var> outputs = tir_main_func->GetAttr<Array<tir::Var>>("output_vars").value();
+  Array<TensorType> output_tensor_types;
+  std::vector<String> output_var_names;
+  for (const auto& output : outputs) {
+    auto buffer = tir_main_func->buffer_map.Get(output).value();
+    output_tensor_types.push_back(TensorType(buffer->shape, buffer->dtype));
+    output_var_names.push_back(output->name_hint);
+  }
+  auto devices = tir_main_func->GetAttr<Array<String>>("devices").value_or({});
+
+  return ExecutorCodegenMetadata(inputs, input_tensor_types, output_var_names, output_tensor_types,
+                                 pool_vars, devices, runtime::kTvmExecutorAot, mod_name,
+                                 interface_api, unpacked_api, workspace_byte_alignment,
+                                 constant_byte_alignment, pool_var_info, io_pool_allocations);
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.aot.CreateExecutorMetadata")
+    .set_body_typed(CreateExecutorMetadata);
+
+}  // namespace aot
+}  // namespace backend
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/aot/create_executor_metadata.h b/src/relay/backend/aot/create_executor_metadata.h
new file mode 100644
index 000000000000..5657aa02809c
--- /dev/null
+++ b/src/relay/backend/aot/create_executor_metadata.h
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_RELAY_BACKEND_AOT_CREATE_EXECUTOR_METADATA_H_
+#define TVM_RELAY_BACKEND_AOT_CREATE_EXECUTOR_METADATA_H_
+
+#include <tvm/ir/module.h>
+#include <tvm/relay/executor.h>
+#include <tvm/runtime/container/string.h>
+
+#include "../utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+namespace aot {
+
+/*! \brief Create ExecutorCodegenMetadata needed for AOT execution.
+ * \param mod The module.
+ * \param mod_name The module name.
+ * \param executor The executor configuration.
+ * \param workspace_byte_alignment The alignment of the workspace pool.
+ * \param constant_byte_alignment The alignment of the constant pool.
+ * \return The ExecutorCodegenMetadata.
+ */
+ExecutorCodegenMetadata CreateExecutorMetadata(const IRModule& mod, String mod_name,
+                                               Executor executor, Integer workspace_byte_alignment,
+                                               Integer constant_byte_alignment);
+
+}  // namespace aot
+}  // namespace backend
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_BACKEND_AOT_CREATE_EXECUTOR_METADATA_H_
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h
index 00c75921f2f2..d5cf4baf7243 100644
--- a/src/relay/backend/utils.h
+++ b/src/relay/backend/utils.h
@@ -168,11 +168,13 @@ class ExecutorCodegenMetadataNode : public Object {
     v->Visit("pools", &pools);
     v->Visit("devices", &devices);
     v->Visit("executor", &executor);
+    v->Visit("interface_api", &interface_api);
     v->Visit("unpacked_api", &unpacked_api);
     v->Visit("workspace_alignment", &workspace_alignment);
     v->Visit("constant_alignment", &constant_alignment);
     v->Visit("pool_inputs", &pool_inputs);
     v->Visit("io_pool_allocations", &io_pool_allocations);
+    v->Visit("mod_name", &mod_name);
   }
 
   static constexpr const char* _type_key = "MetadataObj";
diff --git a/tests/python/relay/aot/test_aot_create_executor_metadata.py b/tests/python/relay/aot/test_aot_create_executor_metadata.py
new file mode 100644
index 000000000000..0ef4449541f8
--- /dev/null
+++ b/tests/python/relay/aot/test_aot_create_executor_metadata.py
@@ -0,0 +1,176 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=line-too-long,missing-class-docstring,missing-module-docstring,missing-function-docstring,no-self-argument,unused-argument,invalid-name
+import numpy as np
+
+import tvm
+import tvm.testing
+from tvm.script import tir as T
+from tvm.runtime.ndarray import array
+from tvm.relay.backend import Executor
+from tvm.relay.backend.aot import CreateExecutorMetadata
+from tvm.relay import TensorType
+from tvm.tir.usmp.utils import PoolAllocation
+from tvm.ir.memory_pools import AllocatedPoolInfo, ConstantPoolInfo, WorkspacePoolInfo, ConstantInfo
+
+
+def _check_executor_metadata(executor_metadata, expected_metadata):
+    assert list(executor_metadata.inputs) == expected_metadata["inputs"]
+    assert list(executor_metadata.input_tensor_types) == expected_metadata["input_tensor_types"]
+    assert list(executor_metadata.outputs) == expected_metadata["outputs"]
+    assert list(executor_metadata.output_tensor_types) == expected_metadata["output_tensor_types"]
+    assert list(executor_metadata.pools) == expected_metadata["pools"]
+    assert executor_metadata.devices == expected_metadata["devices"]
+    assert executor_metadata.executor == expected_metadata["executor"]
+    assert executor_metadata.mod_name == expected_metadata["mod_name"]
+    assert executor_metadata.interface_api == expected_metadata["interface_api"]
+    assert executor_metadata.unpacked_api == expected_metadata["unpacked_api"]
+    assert executor_metadata.workspace_alignment == expected_metadata["workspace_alignment"]
+    assert executor_metadata.constant_alignment == expected_metadata["constant_alignment"]
+    assert set(executor_metadata.pool_inputs.keys()) == set(expected_metadata["pool_inputs"].keys())
+    assert set(executor_metadata.io_pool_allocations.keys()) == set(
+        expected_metadata["io_pool_allocations"].keys()
+    )
+
+
+def test_create_executor_metadata_single_func():
+    # fmt: off
+    @tvm.script.ir_module
+    class Module:
+        @T.prim_func
+        def __tvm_main__(
+            a: T.handle, output: T.handle, workspace: T.Ptr[T.uint8], constants: T.Ptr[T.uint8]
+        ) -> None:
+            # function attr dict
+            T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind": "llvm", "tag": "", "keys": ["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": ["test_device"]})
+            a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+            output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+            # body
+            sid_3 = T.allocate([140], "int8", "global.workspace")
+            sid_2 = T.allocate([140], "int8", "global.workspace")
+            sid_1 = T.allocate([140], "int8", "global.workspace")
+            constant_0 = T.allocate_const([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "float32", [5, 7])
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", a_buffer.data, sid_1, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_1, constant_0, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_2, sid_3, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_1", sid_2, sid_3, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    target = Module["__tvm_main__"].attrs["target"]
+    executor = Executor("aot", {"interface-api": "c"})
+    workspace_pool_info = AllocatedPoolInfo(
+        WorkspacePoolInfo("sram", [target]),
+        256,
+        3,
+    )
+    constant_pool_info = AllocatedPoolInfo(
+        ConstantPoolInfo(
+            "flash",
+            [target],
+            [ConstantInfo("a", 0, array(np.array([0])))],
+        ),
+        512,
+        2,
+    )
+    io_pool_allocations = {
+        "a": PoolAllocation(WorkspacePoolInfo("sram", [target]), 0),
+        "output": PoolAllocation(WorkspacePoolInfo("sram", [target]), 0),
+    }
+    mod = Module.with_attr("io_tensor_pool_allocations", io_pool_allocations)
+    mod["__tvm_main__"] = mod["__tvm_main__"].with_attr(
+        "pool_args",
+        [
+            constant_pool_info,
+            workspace_pool_info,
+        ],
+    )
+    f = mod["__tvm_main__"]
+    expected_metadata = {
+        "inputs": [f.params[0]],
+        "input_tensor_types": [TensorType((5, 7), "float32")],
+        "outputs": ["output"],
+        "output_tensor_types": [TensorType((5, 7), "float32")],
+        "pools": f.params[2:],
+        "devices": f.attrs["devices"],
+        "executor": "aot",
+        "mod_name": "test_mod",
+        "interface_api": "c",
+        "unpacked_api": False,
+        "workspace_alignment": 16,
+        "constant_alignment": 1,
+        "pool_inputs": {
+            f.params[2]: workspace_pool_info,
+            f.params[3]: constant_pool_info,
+        },
+        "io_pool_allocations": io_pool_allocations,
+    }
+
+    executor_metadata = CreateExecutorMetadata(mod, "test_mod", executor, 16, 1)
+
+    _check_executor_metadata(executor_metadata, expected_metadata)
+
+
+def test_create_executor_metadata_no_usmp():
+    # fmt: off
+    @tvm.script.ir_module
+    class Module:
+        @T.prim_func
+        def __tvm_main__(
+            a: T.handle, output: T.handle
+        ) -> None:
+            # function attr dict
+            T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind": "llvm", "tag": "", "keys": ["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": ["test_device"]})
+            a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
+            output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
+            # body
+            sid_3 = T.allocate([140], "int8", "global.workspace")
+            sid_2 = T.allocate([140], "int8", "global.workspace")
+            sid_1 = T.allocate([140], "int8", "global.workspace")
+            constant_0 = T.allocate_const([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "float32", [5, 7])
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", a_buffer.data, sid_1, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_1, constant_0, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_2, sid_3, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+            T.evaluate(T.tvm_call_cpacked("test_fused_add_1", sid_2, sid_3, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
+    # fmt: on
+
+    executor = Executor("aot", {"interface-api": "c"})
+    mod = Module
+    f = mod["__tvm_main__"]
+    expected_metadata = {
+        "inputs": [f.params[0]],
+        "input_tensor_types": [TensorType((5, 7), "float32")],
+        "outputs": ["output"],
+        "output_tensor_types": [TensorType((5, 7), "float32")],
+        "pools": f.params[2:],
+        "devices": f.attrs["devices"],
+        "executor": "aot",
+        "mod_name": "test_mod",
+        "interface_api": "c",
+        "unpacked_api": False,
+        "workspace_alignment": 16,
+        "constant_alignment": 1,
+        "pool_inputs": {},
+        "io_pool_allocations": {},
+    }
+
+    executor_metadata = CreateExecutorMetadata(mod, "test_mod", executor, 16, 1)
+
+    _check_executor_metadata(executor_metadata, expected_metadata)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relay/aot/test_pass_aot_lower_main.py b/tests/python/relay/aot/test_pass_aot_lower_main.py
index c583b287727a..0a9d95247af0 100644
--- a/tests/python/relay/aot/test_pass_aot_lower_main.py
+++ b/tests/python/relay/aot/test_pass_aot_lower_main.py
@@ -68,7 +68,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
     @T.prim_func
     def func(a: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": []})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
         # body
@@ -95,7 +95,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
     @T.prim_func
     def func(a: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": []})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
         # body
@@ -122,7 +122,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
     @T.prim_func
     def func(a: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": []})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
         # body
@@ -150,7 +150,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
     @T.prim_func
     def func(a: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "global_symbol": "test_mod___tvm_main__", "input_vars": [a], "output_vars": [output]})
+        T.func_attr({"runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "global_symbol": "test_mod___tvm_main__", "input_vars": [a], "output_vars": [output], "devices": []})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
         # body
@@ -177,7 +177,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
     @T.prim_func
     def func(a: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": []})
         tmp_read = T.buffer_var("uint8", "")
         # buffer definition
         tmp_read_1 = T.buffer_decl([T.uint64(140)], dtype="uint8", data=tmp_read)
@@ -212,7 +212,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
     @T.prim_func
     def func(a: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": []})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
         # body
@@ -241,7 +241,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> (Tensor[(5, 7), float32], Tensor[(5, 7
     @T.prim_func
     def func(a: T.handle, output0: T.handle, output1: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output0, output1]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output0, output1], "devices": []})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         output0_buffer = T.match_buffer(output0, [5, 7], dtype="float32", align=16)
         output1_buffer = T.match_buffer(output1, [5, 7], dtype="float32", align=16)
@@ -272,7 +272,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
     @T.prim_func
     def func(a: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": []})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
         # body
@@ -302,7 +302,7 @@ def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) -> Tensor[(5
     @T.prim_func
     def func(a: T.handle, b: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a, b], "output_vars": [output]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a, b], "output_vars": [output], "devices": []})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         b_buffer = T.match_buffer(b, [5, 7], dtype="float32", align=16)
         output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
@@ -331,7 +331,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
     @T.prim_func
     def func(a: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": []})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
         # body
@@ -366,7 +366,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
     @T.prim_func
     def func(a: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": []})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
         # body
@@ -401,7 +401,7 @@ def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
     @T.prim_func
     def func(a: T.handle, output: T.handle, device_context_example_target_hook: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]})
+        T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": ["example_target_hook"]})
         a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)
         output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16)
         # body

From fbe174bd6c3054ec480c9551610030bdf2d8b64d Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 9 Nov 2022 06:34:40 -0600
Subject: [PATCH 538/704] [microTVM][CRT][DOCS] Add a PyTorch tutorial for
 microTVM with CRT (#13324)

This commit adds a tutorial to compile and run a PyTorch model using
microTVM, the AOT host-driven executor, and C runtime (CRT).
---
 docs/conf.py                                  |   1 +
 .../how_to/work_with_microtvm/micro_aot.py    |   6 +-
 .../work_with_microtvm/micro_pytorch.py       | 206 ++++++++++++++++++
 src/runtime/crt/host/Makefile.template        |   2 +-
 src/runtime/crt/host/microtvm_api_server.py   |   2 +
 tests/scripts/task_python_microtvm.sh         |   1 +
 6 files changed, 212 insertions(+), 6 deletions(-)
 create mode 100644 gallery/how_to/work_with_microtvm/micro_pytorch.py

diff --git a/docs/conf.py b/docs/conf.py
index e44c5baf6d6a..b4982f14c049 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -318,6 +318,7 @@ def git_describe_version(original_version):
         "micro_ethosu.py",
         "micro_tvmc.py",
         "micro_aot.py",
+        "micro_pytorch.py",
     ],
 }
 
diff --git a/gallery/how_to/work_with_microtvm/micro_aot.py b/gallery/how_to/work_with_microtvm/micro_aot.py
index 79a72924cc63..f02a1ebbbd0b 100644
--- a/gallery/how_to/work_with_microtvm/micro_aot.py
+++ b/gallery/how_to/work_with_microtvm/micro_aot.py
@@ -94,7 +94,7 @@
 # Use the C runtime (crt) and enable static linking by setting system-lib to True
 RUNTIME = Runtime("crt", {"system-lib": True})
 
-# Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc <https://github.com/apache/tvm/blob/main/src/runtime/crt/host/main.cc>`_.
+# Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc`.
 # To use physical hardware, replace "host" with something matching your hardware.
 TARGET = tvm.target.target.micro("host")
 
@@ -174,7 +174,3 @@
     aot_executor.run()
     result = aot_executor.get_output(0).numpy()
     print(f"Label is `{labels[np.argmax(result)]}` with index `{np.argmax(result)}`")
-#
-# Output:
-# Label is `left` with index `6`
-#
diff --git a/gallery/how_to/work_with_microtvm/micro_pytorch.py b/gallery/how_to/work_with_microtvm/micro_pytorch.py
new file mode 100644
index 000000000000..cd4af05fb561
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_pytorch.py
@@ -0,0 +1,206 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+.. _tutorial-micro-Pytorch:
+
+microTVM PyTorch Tutorial
+===========================
+**Authors**:
+`Mehrdad Hessar <https://github.com/mehrdadh>`_
+
+This tutorial is showcasing microTVM host-driven AoT compilation with
+a PyTorch model. This tutorial can be executed on a x86 CPU using C runtime (CRT).
+
+**Note:** This tutorial only runs on x86 CPU using CRT and does not run on Zephyr
+since the model would not fit on our current supported Zephyr boards.
+"""
+
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
+import pathlib
+
+import torch
+import torchvision
+from torchvision import transforms
+import numpy as np
+from PIL import Image
+
+import tvm
+from tvm import relay
+from tvm.contrib.download import download_testdata
+from tvm.relay.backend import Executor
+
+##################################
+# Load a pre-trained PyTorch model
+# --------------------------------
+#
+# To begin with, load pre-trained MobileNetV2 from torchvision. Then,
+# download a cat image and preprocess it to use as the model input.
+#
+
+model = torchvision.models.quantization.mobilenet_v2(weights="DEFAULT", quantize=True)
+model = model.eval()
+
+input_shape = [1, 3, 224, 224]
+input_data = torch.randn(input_shape)
+scripted_model = torch.jit.trace(model, input_data).eval()
+
+img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
+img_path = download_testdata(img_url, "cat.png", module="data")
+img = Image.open(img_path).resize((224, 224))
+
+# Preprocess the image and convert to tensor
+my_preprocess = transforms.Compose(
+    [
+        transforms.Resize(256),
+        transforms.CenterCrop(224),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ]
+)
+img = my_preprocess(img)
+img = np.expand_dims(img, 0)
+
+input_name = "input0"
+shape_list = [(input_name, input_shape)]
+relay_mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
+
+#####################################
+# Define Target, Runtime and Executor
+# -----------------------------------
+#
+# In this tutorial we use AOT host-driven executor. To compile the model
+# for an emulated embedded environment on an x86 machine we use C runtime (CRT)
+# and we use `host` micro target. Using this setup, TVM compiles the model
+# for C runtime which can run on a x86 CPU machine with the same flow that
+# would run on a physical microcontroller.
+#
+
+
+# Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc`
+# To use physical hardware, replace "host" with another physical micro target, e.g. `nrf52840`
+# or `mps2_an521`. See more more target examples in micro_train.py and micro_tflite.py tutorials.
+target = tvm.target.target.micro("host")
+
+# Use the C runtime (crt) and enable static linking by setting system-lib to True
+runtime = tvm.relay.backend.Runtime("crt", {"system-lib": True})
+
+# Use the AOT executor rather than graph or vm executors. Don't use unpacked API or C calling style.
+executor = Executor("aot")
+
+####################
+# Compile the model
+# ------------------
+#
+# Now, we compile the model for the target:
+#
+
+with tvm.transform.PassContext(
+    opt_level=3,
+    config={"tir.disable_vectorize": True},
+):
+    module = tvm.relay.build(
+        relay_mod, target=target, runtime=runtime, executor=executor, params=params
+    )
+
+###########################
+# Create a microTVM project
+# -------------------------
+#
+# Now that we have the compiled model as an IRModule, we need to create a firmware project
+# to use the compiled model with microTVM. To do this, we use Project API.
+#
+
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("crt"))
+project_options = {"verbose": False, "memory_size_bytes": 6 * 1024 * 1024}
+
+temp_dir = tvm.contrib.utils.tempdir() / "project"
+project = tvm.micro.generate_project(
+    str(template_project_path),
+    module,
+    temp_dir,
+    project_options,
+)
+
+####################################
+# Build, flash and execute the model
+# ----------------------------------
+# Next, we build the microTVM project and flash it. Flash step is specific to
+# physical microcontroller and it is skipped if it is simulating a microcontroller
+# via the host `main.cc`` or if a Zephyr emulated board is selected as the target.
+#
+
+project.build()
+project.flash()
+
+input_data = {input_name: tvm.nd.array(img.astype("float32"))}
+with tvm.micro.Session(project.transport()) as session:
+    aot_executor = tvm.runtime.executor.aot_executor.AotModule(session.create_aot_executor())
+    aot_executor.set_input(**input_data)
+    aot_executor.run()
+    result = aot_executor.get_output(0).numpy()
+
+#####################
+# Look up synset name
+# -------------------
+# Look up prediction top 1 index in 1000 class synset.
+#
+
+synset_url = (
+    "https://raw.githubusercontent.com/Cadene/"
+    "pretrained-models.pytorch/master/data/"
+    "imagenet_synsets.txt"
+)
+synset_name = "imagenet_synsets.txt"
+synset_path = download_testdata(synset_url, synset_name, module="data")
+with open(synset_path) as f:
+    synsets = f.readlines()
+
+synsets = [x.strip() for x in synsets]
+splits = [line.split(" ") for line in synsets]
+key_to_classname = {spl[0]: " ".join(spl[1:]) for spl in splits}
+
+class_url = (
+    "https://raw.githubusercontent.com/Cadene/"
+    "pretrained-models.pytorch/master/data/"
+    "imagenet_classes.txt"
+)
+class_path = download_testdata(class_url, "imagenet_classes.txt", module="data")
+with open(class_path) as f:
+    class_id_to_key = f.readlines()
+
+class_id_to_key = [x.strip() for x in class_id_to_key]
+
+# Get top-1 result for TVM
+top1_tvm = np.argmax(result)
+tvm_class_key = class_id_to_key[top1_tvm]
+
+# Convert input to PyTorch variable and get PyTorch result for comparison
+with torch.no_grad():
+    torch_img = torch.from_numpy(img)
+    output = model(torch_img)
+
+    # Get top-1 result for PyTorch
+    top1_torch = np.argmax(output.numpy())
+    torch_class_key = class_id_to_key[top1_torch]
+
+print("Relay top-1 id: {}, class name: {}".format(top1_tvm, key_to_classname[tvm_class_key]))
+print("Torch top-1 id: {}, class name: {}".format(top1_torch, key_to_classname[torch_class_key]))
diff --git a/src/runtime/crt/host/Makefile.template b/src/runtime/crt/host/Makefile.template
index a8e725ade297..2caf7ba0bc23 100644
--- a/src/runtime/crt/host/Makefile.template
+++ b/src/runtime/crt/host/Makefile.template
@@ -22,7 +22,7 @@ CXXFLAGS ?= -Werror -Wall -std=c++11 -DTVM_HOST_USE_GRAPH_EXECUTOR_MODULE -DMEMO
 LDFLAGS ?= -Werror -Wall
 
 # Codegen produces spurious lines like: int32_t arg2_code = ((int32_t*)arg_type_ids)[(2)];
-MODEL_CFLAGS ?= -Wno-error=unused-variable -Wno-error=missing-braces -Wno-error=unused-const-variable
+MODEL_CFLAGS ?= -Wno-error=unused-variable -Wno-error=missing-braces -Wno-error=unused-const-variable -Wno-unused-variable
 
 AR ?= ${PREFIX}ar
 CC ?= ${PREFIX}gcc
diff --git a/src/runtime/crt/host/microtvm_api_server.py b/src/runtime/crt/host/microtvm_api_server.py
index d8b35660e414..b84abdf45985 100644
--- a/src/runtime/crt/host/microtvm_api_server.py
+++ b/src/runtime/crt/host/microtvm_api_server.py
@@ -37,6 +37,7 @@
 
 IS_TEMPLATE = not os.path.exists(os.path.join(PROJECT_DIR, MODEL_LIBRARY_FORMAT_RELPATH))
 
+# Used this size to pass most CRT tests in TVM.
 MEMORY_SIZE_BYTES = 2 * 1024 * 1024
 
 MAKEFILE_FILENAME = "Makefile"
@@ -62,6 +63,7 @@ def server_info_query(self, tvm_version):
                     "verbose",
                     optional=["build"],
                     type="bool",
+                    default=False,
                     help="Run make with verbose output",
                 ),
                 server.ProjectOption(
diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh
index e8907c99e303..6153cdf82392 100755
--- a/tests/scripts/task_python_microtvm.sh
+++ b/tests/scripts/task_python_microtvm.sh
@@ -48,6 +48,7 @@ run_pytest ctypes python-microtvm-project_api tests/micro/project_api
 python3 gallery/how_to/work_with_microtvm/micro_tflite.py
 python3 gallery/how_to/work_with_microtvm/micro_autotune.py
 python3 gallery/how_to/work_with_microtvm/micro_aot.py
+python3 gallery/how_to/work_with_microtvm/micro_pytorch.py
 ./gallery/how_to/work_with_microtvm/micro_tvmc.sh
 
 # Tutorials running with Zephyr

From 999eee8c1a5e32a2103ef78bbc713eb4be6dc0cf Mon Sep 17 00:00:00 2001
From: Alan MacDonald <alanmacd@users.noreply.github.com>
Date: Wed, 9 Nov 2022 13:55:55 -0800
Subject: [PATCH 539/704] [ci] Update Jenkins readme to match new directory
 structure (#13333)

Update Jenkins readme to match new directory structure
---
 ci/jenkins/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ci/jenkins/README.md b/ci/jenkins/README.md
index 6d42770b8096..ff136bdca6f1 100644
--- a/ci/jenkins/README.md
+++ b/ci/jenkins/README.md
@@ -25,7 +25,7 @@ Jenkins runs all of the linux-based TVM CI-enabled regression tests. This includ
 
 ## GitHub Actions
 
-GitHub Actions is used to run Windows jobs, MacOS jobs, and various on-GitHub automations. These are defined in [`.github/workflows`](../.github/workflows/). These automations include bots to:
+GitHub Actions is used to run Windows jobs, MacOS jobs, and various on-GitHub automations. These are defined in [`.github/workflows`](../../.github/workflows/). These automations include bots to:
 * [cc people based on subscribed teams/topics](https://github.com/apache/tvm/issues/10317)
 * [allow non-committers to merge approved / CI passing PRs](https://discuss.tvm.apache.org/t/rfc-allow-merging-via-pr-comments/12220)
 * [add cc-ed people as reviewers on GitHub](https://discuss.tvm.apache.org/t/rfc-remove-codeowners/12095)
@@ -39,19 +39,19 @@ https://github.com/apache/tvm/actions has the logs for each of these workflows.
 TVM uses Jenkins for running Linux continuous integration (CI) tests on
 [branches](https://ci.tlcpack.ai/job/tvm/) and
 [pull requests](https://ci.tlcpack.ai/job/tvm/view/change-requests/) through a
-build configuration specified in a [`Jenkinsfile`](../Jenkinsfile).
+build configuration specified in a [`Jenkinsfile`](../../Jenkinsfile).
 Other jobs run in GitHub Actions for Windows and MacOS jobs.
 
 ## `Jenkinsfile`
 
-The template files in this directory are used to generate the [`Jenkinsfile`](../Jenkinsfile) used by Jenkins to run CI jobs for each commit to PRs and branches.
+The template files in this directory are used to generate the [`Jenkinsfile`](../../Jenkinsfile) used by Jenkins to run CI jobs for each commit to PRs and branches.
 
 To regenerate the `Jenkinsfile`, run
 
 ```bash
 python3 -mvenv _venv
-_venv/bin/pip3 install -r jenkins/requirements.txt
-_venv/bin/python3 jenkins/generate.py
+_venv/bin/pip3 install -r ci/jenkins/requirements.txt
+_venv/bin/python3 ci/jenkins/generate.py
 ```
 
 # Infrastructure

From 8453c9c35708554ee889135b2015d79db87cf0e4 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 10 Nov 2022 07:12:59 +0900
Subject: [PATCH 540/704] [MetaSchedule] Fix the order of applying `AutoInline`
 in `ScheduleUsingAnchorTrace` (#13329)

* index on concat-fusion-fix: 3ffe5b12b fix te extern create_prim_func test

* Apply AutoInline to the last block after all other blocks are processed

* Do not require CanReverseComputeInline to be true when
CanComputeInline is false

* add comment

* add test

* cpplint
---
 src/meta_schedule/trace_apply.cc              |  36 +-
 .../test_meta_schedule_trace_apply.py         | 630 ++++++++++++++++++
 2 files changed, 660 insertions(+), 6 deletions(-)

diff --git a/src/meta_schedule/trace_apply.cc b/src/meta_schedule/trace_apply.cc
index 70b6451d3546..9213d414e1b5 100644
--- a/src/meta_schedule/trace_apply.cc
+++ b/src/meta_schedule/trace_apply.cc
@@ -21,11 +21,14 @@
 #include <tvm/tir/analysis.h>
 #include <tvm/tir/stmt_functor.h>
 
+#include <optional>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
+#include "../tir/schedule/analysis.h"
 #include "utils.h"
 
 namespace tvm {
@@ -60,7 +63,8 @@ void InlinePostBlocks(Schedule sch, Trace anchor_trace, Target target) {
 
   auto anchor_block = FindAnchorBlock(sch->mod());
 
-  auto inline_rule = GetDefaultAutoInline(target->kind->name);
+  std::vector<std::string> inline_todos;
+  std::optional<int> last_block_idx{std::nullopt};
 
   for (auto name : GetBlockNames(sch->mod())) {
     auto block = sch->GetBlock(name);
@@ -69,10 +73,26 @@ void InlinePostBlocks(Schedule sch, Trace anchor_trace, Target target) {
       if (IsAncestor(block, anchor_block_rv, sch)) continue;
     }
     // Spatial blocks which are not referenced in the anchor trace will be inlined here.
-    if (IsSpatial(sch->GetSRef(block)) && !get_block_names.count(name)) {
-      inline_rule->Apply(sch, block);
+    auto block_sref = sch->GetSRef(block);
+    if (IsSpatial(block_sref) && !get_block_names.count(name)) {
+      if (IsOutputBlock(sch->state(), block_sref, GetScopeRoot(sch->state(), block_sref, false))) {
+        last_block_idx = inline_todos.size();
+      }
+      inline_todos.push_back(name);
     }
   }
+
+  if (last_block_idx) {
+    // The last block can only be reverse compute inlined. We make sure to inline all
+    // producer blocks of the last block beforehand so that reverse compute inline can succeed.
+    std::swap(inline_todos[*last_block_idx], inline_todos.back());
+  }
+
+  auto inline_rule = GetDefaultAutoInline(target->kind->name);
+
+  for (auto name : inline_todos) {
+    inline_rule->Apply(sch, sch->GetBlock(name));
+  }
 }
 
 // Apply instructions from the anchor trace to the target schedule, and returns blocks
@@ -140,9 +160,13 @@ std::vector<BlockRV> ApplyAnchorTrace(Schedule sch, Trace anchor_trace) {
       // Similar to the reverse_compute_inline case above.
       auto block = Downcast<BlockRV>(inputs[0]);
       auto block_sref = sch->GetSRef(block);
-      if (!CanComputeInline(sch->state(), block_sref)) {
-        ICHECK(CanReverseComputeInline(sch->state(), block_sref));
-        sch->ReverseComputeInline(block);
+      auto state = sch->state();
+      if (!CanComputeInline(state, block_sref)) {
+        ICHECK(IsOutputBlock(state, block_sref, GetScopeRoot(state, block_sref, false)))
+            << "If a spatial block cannot be inlined, it should be the output block";
+        if (CanReverseComputeInline(sch->state(), block_sref)) {
+          sch->ReverseComputeInline(block);
+        }
         continue;
       }
     }
diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py
index 9d871cc981b1..7e361d2c095c 100644
--- a/tests/python/unittest/test_meta_schedule_trace_apply.py
+++ b/tests/python/unittest/test_meta_schedule_trace_apply.py
@@ -1529,6 +1529,290 @@ def main(p0: T.Buffer[(1, 56, 56, 64), "float32"], p1: T.Buffer[(6, 6, 64, 64),
                     T_relu[n, h, w, co] = T.max(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co] + p2[n, 0, 0, co] + p3[n, h, w, co], T.float32(0))
 
 
+@tvm.script.ir_module
+class Conv2dInt8_with_predicate:
+    @T.prim_func
+    def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), "int8"], p2: T.Buffer[(1, 1, 1, 256), "int32"], p3: T.Buffer[(1, 1, 1, 256), "int32"], p4: T.Buffer[256, "int32"], p5: T.Buffer[256, "int32"], p6: T.Buffer[256, "int32"], p7: T.Buffer[(), "int32"], p8: T.Buffer[1, "int32"], compute: T.Buffer[(16, 56, 56, 256), "int32"]) -> None:
+        # function attr dict
+        T.func_attr({"tir.noalias": True, "global_symbol": "main"})
+        # body
+        # with T.block("root")
+        pad_temp = T.alloc_buffer([16, 56, 56, 64], dtype="int8")
+        conv2d_nhwc = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_subtract = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_add = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        compute_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_add_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        compute_2 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_subtract_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 64):
+            with T.block("pad_temp"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(p0[i0_1, i1_1, i2_1, i3_1])
+                T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1])
+                pad_temp[i0_1, i1_1, i2_1, i3_1] = p0[i0_1, i1_1, i2_1, i3_1]
+        for i0, i1, i2, i3, i4, i5, i6 in T.grid(16, 56, 56, 256, 1, 1, 64):
+            with T.block("conv2d_nhwc"):
+                nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6])
+                T.reads(pad_temp[nn, yy + ry, xx + rx, rc], p1[ff, ry, rx, rc])
+                T.writes(conv2d_nhwc[nn, yy, xx, ff])
+                with T.init():
+                    conv2d_nhwc[nn, yy, xx, ff] = 0
+                conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + T.cast(pad_temp[nn, yy + ry, xx + rx, rc], "int32") * T.cast(p1[ff, ry, rx, rc], "int32")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_subtract"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], p2[0, 0, 0, ax3])
+                T.writes(T_subtract[ax0, ax1, ax2, ax3])
+                T_subtract[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] - p2[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_add"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_subtract[ax0, ax1, ax2, ax3], p3[0, 0, 0, ax3])
+                T.writes(T_add[ax0, ax1, ax2, ax3])
+                T_add[ax0, ax1, ax2, ax3] = T_subtract[ax0, ax1, ax2, ax3] + p3[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("compute"):
+                i0_2, i1_2, i2_2, i3_2 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add[i0_2, i1_2, i2_2, i3_2], p4[i3_2], p5[i3_2], p6[i3_2])
+                T.writes(compute_1[i0_2, i1_2, i2_2, i3_2])
+                compute_1[i0_2, i1_2, i2_2, i3_2] = T.q_multiply_shift_per_axis(T_add[i0_2, i1_2, i2_2, i3_2], p4[i3_2], p5[i3_2], p6[i3_2], 31, False, True, dtype="int32")
+        for i0_3, i1_3, i2_3, i3_3 in T.grid(16, 56, 56, 256):
+            with T.block("T_add_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_3, i1_3, i2_3, i3_3])
+                T.reads(p7[()], compute_1[ax0, ax1, ax2, ax3])
+                T.writes(T_add_1[ax0, ax1, ax2, ax3])
+                T_add_1[ax0, ax1, ax2, ax3] = p7[()] + compute_1[ax0, ax1, ax2, ax3]
+        for i0_4, i1_4, i2_4, i3_4 in T.grid(16, 56, 56, 256):
+            with T.block("compute_1"):
+                i0_5, i1_5, i2_5, i3_5 = T.axis.remap("SSSS", [i0_4, i1_4, i2_4, i3_4])
+                T.reads(T_add_1[i0_5, i1_5, i2_5, i3_5])
+                T.writes(compute_2[i0_5, i1_5, i2_5, i3_5])
+                compute_2[i0_5, i1_5, i2_5, i3_5] = T.max(T.min(T_add_1[i0_5, i1_5, i2_5, i3_5], 255), 0)
+        for i0_6, i1_6, i2_6, i3_6 in T.grid(16, 56, 56, 256):
+            with T.block("T_subtract_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_6, i1_6, i2_6, i3_6])
+                T.reads(compute_2[ax0, ax1, ax2, ax3], p8[0])
+                T.writes(T_subtract_1[ax0, ax1, ax2, ax3])
+                T_subtract_1[ax0, ax1, ax2, ax3] = compute_2[ax0, ax1, ax2, ax3] - p8[0]
+        for i0_7, i1_7, i2_7, i3_7 in T.grid(16, 56, 56, 256):
+            with T.block("compute_2"):
+                i0_8, i1_8, i2_8, i3_8 = T.axis.remap("SSSS", [i0_7, i1_7, i2_7, i3_7])
+                T.reads(T_subtract_1[i0_8, i1_8, i2_8, i3_8])
+                T.writes(compute[i0_8, i1_8, i2_8, i3_8])
+                compute[i0_8, i1_8, i2_8, i3_8] = T.q_multiply_shift(T_subtract_1[i0_8, i1_8, i2_8, i3_8], 1963325822, 31, 1, dtype="int32")
+
+
+@tvm.script.ir_module
+class Conv2dInt8_with_predicate_target:
+    @T.prim_func
+    def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), "int8"], p2: T.Buffer[(1, 1, 1, 256), "int32"], p3: T.Buffer[(1, 1, 1, 256), "int32"], p4: T.Buffer[256, "int32"], p5: T.Buffer[256, "int32"], p6: T.Buffer[256, "int32"], p7: T.Buffer[(), "int32"], p8: T.Buffer[1, "int32"], p9: T.Buffer[(16, 56, 56, 256), "int32"], compute: T.Buffer[(16, 56, 56, 256), "int32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        pad_temp = T.alloc_buffer([16, 56, 56, 64], dtype="int8")
+        conv2d_nhwc = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_subtract = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_add = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        compute_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_add_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        compute_2 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_subtract_1 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        compute_3 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        compute_4 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        T_add_2 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 64):
+            with T.block("pad_temp"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(p0[i0_1, i1_1, i2_1, i3_1])
+                T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1])
+                pad_temp[i0_1, i1_1, i2_1, i3_1] = p0[i0_1, i1_1, i2_1, i3_1]
+        for i0, i1, i2, i3, i4, i5, i6 in T.grid(16, 56, 56, 256, 1, 1, 64):
+            with T.block("conv2d_nhwc"):
+                nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6])
+                T.reads(pad_temp[nn, yy + ry, xx + rx, rc], p1[ff, ry, rx, rc])
+                T.writes(conv2d_nhwc[nn, yy, xx, ff])
+                with T.init():
+                    conv2d_nhwc[nn, yy, xx, ff] = 0
+                conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + T.cast(pad_temp[nn, yy + ry, xx + rx, rc], "int32") * T.cast(p1[ff, ry, rx, rc], "int32")
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_subtract"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], p2[0, 0, 0, ax3])
+                T.writes(T_subtract[ax0, ax1, ax2, ax3])
+                T_subtract[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] - p2[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("T_add"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_subtract[ax0, ax1, ax2, ax3], p3[0, 0, 0, ax3])
+                T.writes(T_add[ax0, ax1, ax2, ax3])
+                T_add[ax0, ax1, ax2, ax3] = T_subtract[ax0, ax1, ax2, ax3] + p3[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 56, 56, 256):
+            with T.block("compute"):
+                i0_2, i1_2, i2_2, i3_2 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add[i0_2, i1_2, i2_2, i3_2], p4[i3_2], p5[i3_2], p6[i3_2])
+                T.writes(compute_1[i0_2, i1_2, i2_2, i3_2])
+                compute_1[i0_2, i1_2, i2_2, i3_2] = T.q_multiply_shift_per_axis(T_add[i0_2, i1_2, i2_2, i3_2], p4[i3_2], p5[i3_2], p6[i3_2], 31, False, True, dtype="int32")
+        for i0_3, i1_3, i2_3, i3_3 in T.grid(16, 56, 56, 256):
+            with T.block("T_add_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_3, i1_3, i2_3, i3_3])
+                T.reads(p7[()], compute_1[ax0, ax1, ax2, ax3])
+                T.writes(T_add_1[ax0, ax1, ax2, ax3])
+                T_add_1[ax0, ax1, ax2, ax3] = p7[()] + compute_1[ax0, ax1, ax2, ax3]
+        for i0_4, i1_4, i2_4, i3_4 in T.grid(16, 56, 56, 256):
+            with T.block("compute_1"):
+                i0_5, i1_5, i2_5, i3_5 = T.axis.remap("SSSS", [i0_4, i1_4, i2_4, i3_4])
+                T.reads(T_add_1[i0_5, i1_5, i2_5, i3_5])
+                T.writes(compute_2[i0_5, i1_5, i2_5, i3_5])
+                compute_2[i0_5, i1_5, i2_5, i3_5] = T.max(T.min(T_add_1[i0_5, i1_5, i2_5, i3_5], 255), 0)
+        for i0_6, i1_6, i2_6, i3_6 in T.grid(16, 56, 56, 256):
+            with T.block("T_subtract_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_6, i1_6, i2_6, i3_6])
+                T.reads(compute_2[ax0, ax1, ax2, ax3], p8[0])
+                T.writes(T_subtract_1[ax0, ax1, ax2, ax3])
+                T_subtract_1[ax0, ax1, ax2, ax3] = compute_2[ax0, ax1, ax2, ax3] - p8[0]
+        for i0_7, i1_7, i2_7, i3_7 in T.grid(16, 56, 56, 256):
+            with T.block("compute_2"):
+                i0_8, i1_8, i2_8, i3_8 = T.axis.remap("SSSS", [i0_7, i1_7, i2_7, i3_7])
+                T.reads(T_subtract_1[i0_8, i1_8, i2_8, i3_8])
+                T.writes(compute_3[i0_8, i1_8, i2_8, i3_8])
+                compute_3[i0_8, i1_8, i2_8, i3_8] = T.q_multiply_shift(T_subtract_1[i0_8, i1_8, i2_8, i3_8], 1457846997, 31, 0, dtype="int32")
+        for i0_9, i1_9, i2_9, i3_9 in T.grid(16, 56, 56, 256):
+            with T.block("compute_3"):
+                i0_10, i1_10, i2_10, i3_10 = T.axis.remap("SSSS", [i0_9, i1_9, i2_9, i3_9])
+                T.reads(p9[i0_10, i1_10, i2_10, i3_10])
+                T.writes(compute_4[i0_10, i1_10, i2_10, i3_10])
+                compute_4[i0_10, i1_10, i2_10, i3_10] = T.q_multiply_shift(p9[i0_10, i1_10, i2_10, i3_10], 2101000910, 31, 0, dtype="int32")
+        for i0_11, i1_11, i2_11, i3_11 in T.grid(16, 56, 56, 256):
+            with T.block("T_add_2"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_11, i1_11, i2_11, i3_11])
+                T.reads(compute_3[ax0, ax1, ax2, ax3], compute_4[ax0, ax1, ax2, ax3])
+                T.writes(T_add_2[ax0, ax1, ax2, ax3])
+                T_add_2[ax0, ax1, ax2, ax3] = compute_3[ax0, ax1, ax2, ax3] + compute_4[ax0, ax1, ax2, ax3]
+        for i0_12, i1_12, i2_12, i3_12 in T.grid(16, 56, 56, 256):
+            with T.block("compute_4"):
+                i0_13, i1_13, i2_13, i3_13 = T.axis.remap("SSSS", [i0_12, i1_12, i2_12, i3_12])
+                T.reads(T_add_2[i0_13, i1_13, i2_13, i3_13])
+                T.writes(compute[i0_13, i1_13, i2_13, i3_13])
+                compute[i0_13, i1_13, i2_13, i3_13] = T.max(T.min(T_add_2[i0_13, i1_13, i2_13, i3_13], 255), 0)
+
+
+@tvm.script.ir_module
+class Conv2dInt8_with_predicate_scheduled:
+    @T.prim_func
+    def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), "int8"], p2: T.Buffer[(1, 1, 1, 256), "int32"], p3: T.Buffer[(1, 1, 1, 256), "int32"], p4: T.Buffer[256, "int32"], p5: T.Buffer[256, "int32"], p6: T.Buffer[256, "int32"], p7: T.Buffer[(), "int32"], p8: T.Buffer[1, "int32"], p9: T.Buffer[(16, 56, 56, 256), "int32"], compute: T.Buffer[(16, 56, 56, 256), "int32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        with T.block("root"):
+            T.reads()
+            T.writes()
+            T.block_attr({"meta_schedule.unroll_explicit":1024})
+            conv2d_nhwc_reindex_shared = T.alloc_buffer([50176, 256], dtype="int32", scope="shared")
+            conv2d_nhwc_reindex_shared_wmma_accumulator = T.alloc_buffer([50176, 256], dtype="int32", scope="wmma.accumulator")
+            pad_temp_reindex_shared = T.alloc_buffer([50176, 64], dtype="int8", scope="shared")
+            p1_reindex_shared = T.alloc_buffer([1, 1, 256, 64], dtype="int8", scope="shared")
+            pad_temp_reindex_shared_wmma_matrix_a = T.alloc_buffer([50176, 64], dtype="int8", scope="wmma.matrix_a")
+            p1_reindex_shared_wmma_matrix_b = T.alloc_buffer([1, 1, 256, 64], dtype="int8", scope="wmma.matrix_b")
+            for ax2_0_0_ax3_0_0_fused in T.thread_binding(32, thread="blockIdx.y"):
+                for ax2_0_1_ax3_0_1_fused in T.thread_binding(196, thread="blockIdx.x"):
+                    for ax2_0_2_ax3_0_2_fused in T.thread_binding(4, thread="threadIdx.y"):
+                        for ax0_0, ax1_0, ax4_0_0 in T.grid(1, 1, 2):
+                            for ax0_ax1_fused in T.serial(1024):
+                                with T.block("pad_temp_reindex_shared"):
+                                    v0 = T.axis.spatial(50176, ax2_0_0_ax3_0_0_fused // 4 * 6272 + ax2_0_1_ax3_0_1_fused * 32 + ax0_ax1_fused // 32)
+                                    v1 = T.axis.spatial(64, ax4_0_0 * 32 + ax0_ax1_fused % 32)
+                                    T.reads(p0[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1])
+                                    T.writes(pad_temp_reindex_shared[v0, v1])
+                                    T.block_attr({"buffer_dim_align":[[0, 0, 32, 16]], "meta_schedule.cooperative_fetch":4})
+                                    pad_temp_reindex_shared[v0, v1] = p0[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1]
+                            for ax0_ax1_ax2_ax3_fused in T.serial(2048):
+                                with T.block("p1_reindex_shared"):
+                                    v0 = T.axis.spatial(1, 0)
+                                    v1 = T.axis.spatial(1, 0)
+                                    v2 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused % 4 * 64 + ax0_ax1_ax2_ax3_fused // 32)
+                                    v3 = T.axis.spatial(64, ax4_0_0 * 32 + ax0_ax1_ax2_ax3_fused % 32)
+                                    T.reads(p1[v2, v0, v1, v3])
+                                    T.writes(p1_reindex_shared[v0, v1, v2, v3])
+                                    T.block_attr({"buffer_dim_align":[[0, 2, 32, 16]], "meta_schedule.cooperative_fetch":3})
+                                    p1_reindex_shared[v0, v1, v2, v3] = p1[v2, v0, v1, v3]
+                            for ax0_1, ax1_1, ax4_0_1 in T.grid(1, 1, 2):
+                                for ax0_0_1, ax1_0_1 in T.grid(1, 1):
+                                    with T.block("pad_temp_reindex_shared_wmma.matrix_a_o"):
+                                        v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2)
+                                        v1_o = T.axis.spatial(4, ax4_0_0 * 2 + ax4_0_1)
+                                        T.reads(pad_temp_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                        T.writes(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                        T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_s8_a"})
+                                        for ax0_1_1, ax1_1_1 in T.grid(16, 16):
+                                            with T.block("pad_temp_reindex_shared_wmma.matrix_a"):
+                                                v0_i, v1_i = T.axis.remap("SS", [ax0_1_1, ax1_1_1])
+                                                T.reads(pad_temp_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                                T.writes(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                                pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = pad_temp_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                                for ax0, ax1, ax2_0, ax3_0 in T.grid(1, 1, 2, 1):
+                                    with T.block("p1_reindex_shared_wmma.matrix_b_o"):
+                                        v0 = T.axis.spatial(1, 0)
+                                        v1 = T.axis.spatial(1, 0)
+                                        v2_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 4 * 4 + ax2_0_2_ax3_0_2_fused % 2 * 2 + ax2_0)
+                                        v3_o = T.axis.spatial(4, ax4_0_0 * 2 + ax4_0_1)
+                                        T.reads(p1_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                        T.writes(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                        T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_s8_b_trans"})
+                                        for ax2_1, ax3_1 in T.grid(16, 16):
+                                            with T.block("p1_reindex_shared_wmma.matrix_b"):
+                                                v2_i, v3_i = T.axis.remap("SS", [ax2_1, ax3_1])
+                                                T.reads(p1_reindex_shared[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i])
+                                                T.writes(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i])
+                                                p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i] = p1_reindex_shared[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i]
+                                for ax2_0_3, ax3_0_3, ax0_2, ax1_2, ax4_0_2, ax2_0_4, ax3_0_4 in T.grid(1, 1, 1, 1, 1, 1, 2):
+                                    with T.block("conv2d_nhwc_o"):
+                                        v0 = T.axis.reduce(1, 0)
+                                        v1 = T.axis.reduce(1, 0)
+                                        v2_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2 + ax2_0_3 + ax2_0_4)
+                                        v3_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 4 * 4 + ax2_0_2_ax3_0_2_fused % 2 * 2 + ax3_0_3 * 2 + ax3_0_4)
+                                        v4_o = T.axis.reduce(4, ax4_0_0 * 2 + ax4_0_1 + ax4_0_2)
+                                        T.reads(pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 : v2_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 : v3_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16])
+                                        T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                        T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_s8s8s32_trans", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_s32", "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "warp_execution":1})
+                                        with T.init():
+                                            for ax2_1, ax3_1 in T.grid(16, 16):
+                                                with T.block("conv2d_nhwc_init"):
+                                                    v2_i_init, v3_i_init = T.axis.remap("SS", [ax2_1, ax3_1])
+                                                    T.reads()
+                                                    T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i_init, v3_o * 16 + v3_i_init])
+                                                    conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i_init, v3_o * 16 + v3_i_init] = 0
+                                        for ax2_1, ax3_1, ax4_1 in T.grid(16, 16, 16):
+                                            with T.block("conv2d_nhwc"):
+                                                v2_i, v3_i, v4_i = T.axis.remap("SSR", [ax2_1, ax3_1, ax4_1])
+                                                T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i], pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 + v2_i, v4_o * 16 + v4_i], p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 + v3_i, v4_o * 16 + v4_i])
+                                                T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i])
+                                                T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                                conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] + T.cast(pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 + v2_i, v4_o * 16 + v4_i], "int32") * T.cast(p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 + v3_i, v4_o * 16 + v4_i], "int32")
+                        for ax0_0, ax1_0 in T.grid(1, 2):
+                            with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"):
+                                v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2)
+                                v1_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 4 * 4 + ax2_0_2_ax3_0_2_fused % 2 * 2 + ax1_0)
+                                T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_s32_shared"})
+                                for ax0_1, ax1_1 in T.grid(16, 16):
+                                    with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator"):
+                                        v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                        T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                        T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                        conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                    for ax0, ax1_0, ax1_1, ax1_2, ax1_3 in T.grid(32, 1, 4, 32, 2):
+                        with T.block("conv2d_nhwc_reindex_shared"):
+                            T.where(((ax1_0 * 4 + ax1_1) * 32 + ax1_2) * 2 + ax1_3 < 64)
+                            v0 = T.axis.spatial(50176, ax2_0_0_ax3_0_0_fused // 4 * 6272 + ax2_0_1_ax3_0_1_fused * 32 + ax0)
+                            v1 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused % 4 * 64 + (ax1_0 * 256 + ax1_1 * 64 + ax1_2 * 2 + ax1_3))
+                            T.reads(p7[()], conv2d_nhwc_reindex_shared[v0, v1], p2[0, 0, 0, v1], p3[0, 0, 0, v1], p4[v1], p5[v1], p6[v1], p8[0], p9[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1])
+                            T.writes(compute[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1])
+                            compute[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1] = T.max(T.min(T.q_multiply_shift(T.max(T.min(p7[()] + T.q_multiply_shift_per_axis(conv2d_nhwc_reindex_shared[v0, v1] - p2[0, 0, 0, v1] + p3[0, 0, 0, v1], p4[v1], p5[v1], p6[v1], 31, False, True, dtype="int32"), 255), 0) - p8[0], 1457846997, 31, 0, dtype="int32") + T.q_multiply_shift(p9[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1], 2101000910, 31, 0, dtype="int32"), 255), 0)
+
+
 # fmt: on
 def verify(anchor_mod, anchor_trace_fun, target_mod, target, ref):
     anchor_sch = Schedule(anchor_mod)
@@ -2742,5 +3026,351 @@ def apply_trace(sch):
     )
 
 
+def test_inline_order():
+    # In this test, the order of applying AutoInline is tested.
+    # We need to make sure that the last block in Conv2dInt8_with_predicate_target,
+    # "compute_4", is AutoInline-ed after all other blocks have been processed.
+    #
+    # Otherwise, if the order is "T_add_2" -> "compute_4" -> "compute_3", "compute_4" is neither
+    # inlined (because this is the last block) nor reverse-inlined
+    # (because it has multiple producers). This results in the "compute_4" block being
+    # reverse-inlined at the very end of ScheduleUsingAnchorTrace, where its producer block
+    # "conv2d_nhwc_reindex_shared" has the predicate
+    # T.where(((ax1_0 * 4 + ax1_1) * 32 + ax1_2) * 2 + ax1_3 < 64) due to anchor-block scheduling
+    # (see Conv2dInt8_with_predicate_scheduled). Currently, if we try to reverse-inline a block to
+    # its producer that has a predicate, the predicate disappears after reverse inlining.
+
+    def apply_trace(sch: Schedule) -> None:
+        b0 = sch.get_block(name="pad_temp", func_name="main")
+        b1 = sch.get_block(name="conv2d_nhwc", func_name="main")
+        b2 = sch.get_block(name="T_subtract", func_name="main")
+        b3 = sch.get_block(name="T_add", func_name="main")
+        b4 = sch.get_block(name="compute", func_name="main")
+        b5 = sch.get_block(name="T_add_1", func_name="main")
+        b6 = sch.get_block(name="compute_1", func_name="main")
+        b7 = sch.get_block(name="T_subtract_1", func_name="main")
+        b8 = sch.get_block(name="compute_2", func_name="main")
+        b9 = sch.get_block(name="root", func_name="main")
+        sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
+        b10 = sch.reindex(block=b1, buffer=("write", 0))
+        b11 = sch.reindex(block=b1, buffer=("read", 0))
+        b12 = sch.reindex(block=b1, buffer=("read", 1))
+        sch.transform_layout(
+            block=b1,
+            buffer=("read", 0),
+            index_map=lambda nn, yy, xx, rc: (
+                (((nn * 3136) + (yy * 56)) + xx),
+                rc,
+            ),
+            pad_value=None,
+        )
+        sch.transform_layout(
+            block=b1,
+            buffer=("read", 1),
+            index_map=lambda ff, ry, rx, rc: (
+                ry,
+                rx,
+                ff,
+                rc,
+            ),
+            pad_value=None,
+        )
+        sch.transform_layout(
+            block=b1,
+            buffer=("write", 0),
+            index_map=lambda nn, yy, xx, ff: (
+                (((nn * 3136) + (yy * 56)) + xx),
+                ff,
+            ),
+            pad_value=None,
+        )
+        sch.transform_block_layout(
+            block=b10,
+            index_map=lambda nn, yy, xx, ff: (
+                (((nn * 3136) + (yy * 56)) + xx),
+                ff,
+            ),
+        )
+        sch.transform_block_layout(
+            block=b11,
+            index_map=lambda nn, yy, xx, rc: (
+                (((nn * 3136) + (yy * 56)) + xx),
+                rc,
+            ),
+        )
+        sch.transform_block_layout(
+            block=b12,
+            index_map=lambda ff, ry, rx, rc: (
+                ry,
+                rx,
+                ff,
+                rc,
+            ),
+        )
+        sch.transform_block_layout(
+            block=b1,
+            index_map=lambda nn, yy, xx, ff, ry, rx, rc: (
+                ry,
+                rx,
+                (((nn * 3136) + (yy * 56)) + xx),
+                ff,
+                rc,
+            ),
+        )
+        l13, l14, l15, l16, l17 = sch.get_loops(block=b1)
+        l18, l19 = sch.split(loop=l17, factors=[None, 16], preserve_unit_iters=True)
+        l20, l21 = sch.split(loop=l16, factors=[None, 16], preserve_unit_iters=True)
+        l22, l23 = sch.split(loop=l15, factors=[None, 16], preserve_unit_iters=True)
+        l24, l25, l26, l27, l28, l29, l30, l31 = sch.get_loops(block=b1)
+        sch.reorder(l28, l30, l23, l21, l19)
+        b32 = sch.blockize(loop=l23)
+        sch.annotate(
+            block_or_loop=b32,
+            ann_key="meta_schedule.auto_tensorize",
+            ann_val="wmma_sync_16x16x16_s8s8s32_trans",
+        )
+        sch.annotate(
+            block_or_loop=b32,
+            ann_key="meta_schedule.auto_tensorize_init",
+            ann_val="wmma_fill_16x16x16_s32",
+        )
+        sch.annotate(block_or_loop=b32, ann_key="warp_execution", ann_val=1)
+        l33, l34, l35, l36, l37 = sch.get_loops(block=b32)
+        v38, v39, v40 = sch.sample_perfect_tile(
+            loop=l33, n=3, max_innermost_factor=4, decision=[1, 1, 1]
+        )
+        l41, l42, l43 = sch.split(loop=l33, factors=[v38, v39, v40], preserve_unit_iters=True)
+        v44, v45, v46 = sch.sample_perfect_tile(
+            loop=l34, n=3, max_innermost_factor=4, decision=[1, 1, 1]
+        )
+        l47, l48, l49 = sch.split(loop=l34, factors=[v44, v45, v46], preserve_unit_iters=True)
+        v50, v51, v52, v53, v54 = sch.sample_perfect_tile(
+            loop=l35, n=5, max_innermost_factor=4, decision=[8, 196, 2, 1, 1]
+        )
+        l55, l56, l57, l58, l59 = sch.split(
+            loop=l35, factors=[v50, v51, v52, v53, v54], preserve_unit_iters=True
+        )
+        v60, v61, v62, v63, v64 = sch.sample_perfect_tile(
+            loop=l36, n=5, max_innermost_factor=4, decision=[4, 1, 2, 1, 2]
+        )
+        l65, l66, l67, l68, l69 = sch.split(
+            loop=l36, factors=[v60, v61, v62, v63, v64], preserve_unit_iters=True
+        )
+        v70, v71, v72 = sch.sample_perfect_tile(
+            loop=l37, n=3, max_innermost_factor=4, decision=[2, 2, 1]
+        )
+        l73, l74, l75 = sch.split(loop=l37, factors=[v70, v71, v72], preserve_unit_iters=True)
+        sch.reorder(
+            l55,
+            l65,
+            l56,
+            l66,
+            l57,
+            l67,
+            l41,
+            l47,
+            l73,
+            l42,
+            l48,
+            l74,
+            l58,
+            l68,
+            l43,
+            l49,
+            l75,
+            l59,
+            l69,
+        )
+        l76 = sch.fuse(l55, l65, preserve_unit_iters=True)
+        sch.bind(loop=l76, thread_axis="blockIdx.y")
+        l77 = sch.fuse(l56, l66, preserve_unit_iters=True)
+        sch.bind(loop=l77, thread_axis="blockIdx.x")
+        l78 = sch.fuse(l57, l67, preserve_unit_iters=True)
+        sch.bind(loop=l78, thread_axis="threadIdx.y")
+        sch.annotate(
+            block_or_loop=b32, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32
+        )
+        sch.annotate(
+            block_or_loop=b32, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024
+        )
+        b79 = sch.cache_write(block=b32, write_buffer_index=0, storage_scope="shared")
+        sch.reverse_compute_at(block=b79, loop=l77, preserve_unit_loops=True, index=-1)
+        b80 = sch.cache_write(block=b32, write_buffer_index=0, storage_scope="wmma.accumulator")
+        sch.reverse_compute_at(block=b80, loop=l78, preserve_unit_loops=True, index=-1)
+        v81 = sch.sample_categorical(
+            candidates=[1, 2, 3, 4, 8, 16],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=1,
+        )
+        sch.annotate(block_or_loop=b79, ann_key="meta_schedule.cooperative_fetch", ann_val=v81)
+        sch.reverse_compute_inline(block=b10)
+        l82, l83, l84, l85, l86 = sch.get_loops(block=b80)
+        l87, l88 = sch.split(loop=l86, factors=[None, 16], preserve_unit_iters=True)
+        l89, l90 = sch.split(loop=l85, factors=[None, 16], preserve_unit_iters=True)
+        l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b80)
+        sch.reorder(l96, l90, l88)
+        b98 = sch.blockize(loop=l90)
+        sch.annotate(
+            block_or_loop=b98,
+            ann_key="meta_schedule.auto_tensorize",
+            ann_val="wmma_store_16x16x16_s32_shared",
+        )
+        b99 = sch.cache_read(
+            block=b32, read_buffer_index=0, storage_scope="shared", consumer_blocks=[b32]
+        )
+        sch.compute_at(block=b99, loop=l73, preserve_unit_loops=True, index=-1)
+        l100, l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b99)
+        l108 = sch.fuse(l106, l107, preserve_unit_iters=True)
+        v109 = sch.sample_categorical(
+            candidates=[1, 2, 3, 4, 8, 16],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=3,
+        )
+        sch.annotate(block_or_loop=b99, ann_key="meta_schedule.cooperative_fetch", ann_val=v109)
+        b110 = sch.cache_read(
+            block=b32, read_buffer_index=1, storage_scope="shared", consumer_blocks=[b32]
+        )
+        sch.compute_at(block=b110, loop=l73, preserve_unit_loops=True, index=-1)
+        l111, l112, l113, l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b110)
+        l121 = sch.fuse(l117, l118, l119, l120, preserve_unit_iters=True)
+        v122 = sch.sample_categorical(
+            candidates=[1, 2, 3, 4, 8, 16],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=2,
+        )
+        sch.annotate(block_or_loop=b110, ann_key="meta_schedule.cooperative_fetch", ann_val=v122)
+        b123 = sch.cache_read(block=b32, read_buffer_index=0, storage_scope="wmma.matrix_a")
+        sch.compute_at(block=b123, loop=l74, preserve_unit_loops=True, index=-1)
+        l124, l125, l126, l127, l128, l129, l130, l131, l132, l133, l134 = sch.get_loops(block=b123)
+        l135, l136 = sch.split(loop=l134, factors=[None, 16], preserve_unit_iters=True)
+        l137, l138 = sch.split(loop=l133, factors=[None, 16], preserve_unit_iters=True)
+        (
+            l139,
+            l140,
+            l141,
+            l142,
+            l143,
+            l144,
+            l145,
+            l146,
+            l147,
+            l148,
+            l149,
+            l150,
+            l151,
+        ) = sch.get_loops(block=b123)
+        sch.reorder(l150, l138, l136)
+        b152 = sch.blockize(loop=l138)
+        sch.annotate(
+            block_or_loop=b152,
+            ann_key="meta_schedule.auto_tensorize",
+            ann_val="wmma_load_16x16x16_s8_a",
+        )
+        b153 = sch.cache_read(block=b32, read_buffer_index=1, storage_scope="wmma.matrix_b")
+        sch.compute_at(block=b153, loop=l74, preserve_unit_loops=True, index=-1)
+        (
+            l154,
+            l155,
+            l156,
+            l157,
+            l158,
+            l159,
+            l160,
+            l161,
+            l162,
+            l163,
+            l164,
+            l165,
+            l166,
+        ) = sch.get_loops(block=b153)
+        l167, l168 = sch.split(loop=l166, factors=[None, 16], preserve_unit_iters=True)
+        l169, l170 = sch.split(loop=l165, factors=[None, 16], preserve_unit_iters=True)
+        (
+            l171,
+            l172,
+            l173,
+            l174,
+            l175,
+            l176,
+            l177,
+            l178,
+            l179,
+            l180,
+            l181,
+            l182,
+            l183,
+            l184,
+            l185,
+        ) = sch.get_loops(block=b153)
+        sch.reorder(l184, l170, l168)
+        b186 = sch.blockize(loop=l170)
+        sch.annotate(
+            block_or_loop=b186,
+            ann_key="meta_schedule.auto_tensorize",
+            ann_val="wmma_load_16x16x16_s8_b_trans",
+        )
+        sch.compute_inline(block=b11)
+        sch.compute_inline(block=b12)
+        sch.storage_align(block=b99, buffer_index=0, axis=-2, factor=32, offset=16)
+        sch.storage_align(block=b110, buffer_index=0, axis=-2, factor=32, offset=16)
+        sch.reverse_compute_inline(block=b8)
+        sch.reverse_compute_inline(block=b7)
+        sch.reverse_compute_inline(block=b6)
+        sch.reverse_compute_inline(block=b5)
+        sch.reverse_compute_inline(block=b4)
+        sch.reverse_compute_inline(block=b3)
+        sch.reverse_compute_inline(block=b2)
+        sch.compute_inline(block=b0)
+
+        v187 = sch.sample_categorical(
+            candidates=[0, 16, 64, 512, 1024],
+            probs=[
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+            ],
+            decision=4,
+        )
+        sch.annotate(block_or_loop=b9, ann_key="meta_schedule.unroll_explicit", ann_val=v187)
+        sch.enter_postproc()
+        sch.unannotate(block_or_loop=b79, ann_key="meta_schedule.cooperative_fetch")
+        l188, l189, l190, l191 = sch.get_loops(block=b79)
+
+        l192, l193, l194, l195 = sch.split(
+            loop=l191, factors=[None, 4, 32, 2], preserve_unit_iters=True
+        )
+
+    verify(
+        Conv2dInt8_with_predicate,
+        apply_trace,
+        Conv2dInt8_with_predicate_target,
+        "cuda",
+        Conv2dInt8_with_predicate_scheduled,
+    )
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 5dc418633839d112c5b7519111d5745d365e941e Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Wed, 9 Nov 2022 14:42:48 -0800
Subject: [PATCH 541/704] [MetaSchedule] Add JSON Database Validation Scripts
 (#12948)

* Add validation scripts.

* Fix testing script.

* Fix lint.

* Fix lint.

* Fix inputs.

* Fix lint.

* Fix lint.

* Add timer func.

* Fix ci.

* Address comments.

* Add total time statistics.

* Fix lint.
---
 python/tvm/meta_schedule/profiler.py          |   2 +-
 .../testing/custom_builder_runner.py          |   4 +-
 .../tvm/meta_schedule/testing/tune_utils.py   |  55 +++-
 .../testing/validate_database.py              | 282 ++++++++++++++++++
 4 files changed, 336 insertions(+), 7 deletions(-)
 create mode 100644 python/tvm/meta_schedule/testing/validate_database.py

diff --git a/python/tvm/meta_schedule/profiler.py b/python/tvm/meta_schedule/profiler.py
index 7446578a38d7..1776666f4ed5 100644
--- a/python/tvm/meta_schedule/profiler.py
+++ b/python/tvm/meta_schedule/profiler.py
@@ -34,7 +34,7 @@ def __init__(self) -> None:
         )
 
     def get(self) -> Dict[str, float]:
-        """Get the profiling results in minutes"""
+        """Get the profiling results in seconds"""
         return _ffi_api.ProfilerGet(self)  # type: ignore # pylint: disable=no-member
 
     def table(self) -> str:
diff --git a/python/tvm/meta_schedule/testing/custom_builder_runner.py b/python/tvm/meta_schedule/testing/custom_builder_runner.py
index 1cfd4ab833be..7129546dd8b7 100644
--- a/python/tvm/meta_schedule/testing/custom_builder_runner.py
+++ b/python/tvm/meta_schedule/testing/custom_builder_runner.py
@@ -17,7 +17,7 @@
 """Customized builder and runner methods"""
 # pylint: disable=import-outside-toplevel
 
-from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union, Callable
 
 if TYPE_CHECKING:
     import numpy as np  # type: ignore
@@ -143,7 +143,7 @@ def run_module_via_rpc(
     rpc_config: "RPCConfig",
     lib: Union["Module", "Executable"],
     dev_type: str,
-    args: Dict[str, "np.ndarray"],
+    args: Union[Dict[int, "np.ndarray"], Dict[str, "np.ndarray"]],
     continuation: Callable,
     backend: Optional[str] = "graph",
 ):
diff --git a/python/tvm/meta_schedule/testing/tune_utils.py b/python/tvm/meta_schedule/testing/tune_utils.py
index fe0984d51c50..17064c64ab52 100644
--- a/python/tvm/meta_schedule/testing/tune_utils.py
+++ b/python/tvm/meta_schedule/testing/tune_utils.py
@@ -86,7 +86,7 @@ def create_timer(backend: str) -> Callable:
 
     def f_timer(
         rt_mod: Union[tvm.runtime.Module, tvm.runtime.vm.Executable],
-        dev: tvm.device,
+        dev: tvm.runtime.Device,
         input_data: Dict[str, NDArray],
     ) -> None:
         """Run and benchmark the given runtime module, print out the result.
@@ -95,7 +95,7 @@ def f_timer(
         ----------
         rt_mod : Union[tvm.runtime.Module, tvm.runtime.vm.Executable]
             The runtime module or vm executable.
-        dev : tvm.device
+        dev : tvm.runtime.Device
             The device type to run workload.
         input_data : Dict[str, np.ndarray]
             The input data as a dictionary.
@@ -152,7 +152,7 @@ def create_time_per_layer(graph: str) -> Callable:
 
     def f_time_per_layer(
         rt_mod: tvm.runtime.Module,
-        dev: tvm.device,
+        dev: tvm.runtime.Device,
         input_data: Dict[str, NDArray],
     ) -> None:
         """Run and benchmark the per-layer performance of given runtime module,
@@ -162,7 +162,7 @@ def f_time_per_layer(
         ----------
         rt_mod : tvm.runtime.Module
             The runtime module.
-        dev : tvm.device
+        dev : tvm.runtime.Device
             The device type to run workload.
         input_data : Dict[str, np.ndarray]
             The input data as a dictionary.
@@ -192,3 +192,50 @@ def f_time_per_layer(
             )
 
     return f_time_per_layer
+
+
+def create_calculator(backend: str) -> Callable:
+    """Create a function to fetch the computing result of running the given runtime module.
+
+    Parameters
+    ----------
+    backend : str
+        The backend to use, only tir is supported for now.
+
+    Returns
+    -------
+    func : Callable
+        The function to fetch the computing result.
+    """
+
+    def f_calculator(
+        rt_mod: tvm.runtime.Module,
+        dev: tvm.runtime.Device,  # pylint: disable=unused-argument
+        input_data: Dict[str, NDArray],
+    ) -> List[NDArray]:
+        """Fetch the result of running the given runtime module.
+
+        Parameters
+        ----------
+        rt_mod : Union[tvm.runtime.Module, tvm.runtime.vm.Executable]
+            The runtime module or vm executable.
+        dev : tvm.device
+            The device type to run workload.
+        input_data : Dict[str, np.ndarray]
+            The input data as a dictionary.
+        """
+        try:
+            if backend == "tir":
+                data = [v for _, v in sorted(input_data.items(), key=lambda x: x[0])]
+                rt_mod(*data)
+                return data
+            else:
+                raise ValueError(f"Backend {backend} not supported in f_calculator!")
+
+        except Exception as exc:  # pylint: disable=broad-except
+            print(
+                f"Run module f_calculator via RPC failed, exception: {exc}",
+            )
+            return None
+
+    return f_calculator
diff --git a/python/tvm/meta_schedule/testing/validate_database.py b/python/tvm/meta_schedule/testing/validate_database.py
new file mode 100644
index 000000000000..5e48bfb6b04e
--- /dev/null
+++ b/python/tvm/meta_schedule/testing/validate_database.py
@@ -0,0 +1,282 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""JSON Database validation script"""
+from typing import Union, Callable, List
+from distutils.util import strtobool
+import argparse
+import logging
+import warnings
+import numpy as np  # type: ignore
+
+import tvm
+from tvm.target import Target
+from tvm.ir import IRModule
+from tvm.tir import Schedule
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
+from tvm.meta_schedule.testing.tune_utils import create_calculator, generate_input_data
+from tvm._ffi import get_global_func, register_func
+from tvm.support import describe
+
+DELIMITOR = "\n" + "-" * 30 + "\n"
+
+
+def _parse_args():
+    args = argparse.ArgumentParser()
+    args.add_argument(
+        "--work-dir",
+        type=str,
+        required=True,
+        help="The path to the work directory containing database files.",
+    )
+    args.add_argument(
+        "--target",
+        type=Target,
+        required=True,
+    )
+    args.add_argument(
+        "--baseline-target",
+        type=Target,
+        default="llvm -num-cores=1",
+        required=False,
+        help="The baseline target to compile the original module.",
+    )
+    args.add_argument(
+        "--rpc-host",
+        type=str,
+        required=True,
+    )
+    args.add_argument(
+        "--rpc-port",
+        type=int,
+        required=True,
+    )
+    args.add_argument(
+        "--rpc-key",
+        type=str,
+        required=True,
+    )
+    args.add_argument(
+        "--number",
+        type=int,
+        default=3,
+    )
+    args.add_argument(
+        "--repeat",
+        type=int,
+        default=1,
+    )
+    args.add_argument(
+        "--min-repeat-ms",
+        type=int,
+        default=100,
+    )
+    args.add_argument(
+        "--cpu-flush",
+        type=lambda x: bool(strtobool(x)),
+        help="example: True / False",
+        required=True,
+    )
+    parsed = args.parse_args()
+    parsed.target = tvm.target.Target(parsed.target)
+    parsed.rpc_config = ms.runner.RPCConfig(
+        tracker_host=parsed.rpc_host,
+        tracker_port=parsed.rpc_port,
+        tracker_key=parsed.rpc_key,
+        session_timeout_sec=600,
+    )
+    if parsed.cpu_flush and parsed.target.kind.name != "llvm":
+        warnings.warn("cpu_flush is only supported on llvm target")
+    return parsed
+
+
+# logging
+logging.basicConfig(
+    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+)
+logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
+
+# arg parser
+ARGS = _parse_args()
+
+
+@register_func("tvm.meta_schedule.testing.default_input_generator")
+def default_input_generator(mod: IRModule) -> List[tvm.nd.NDArray]:
+    args_info = ms.arg_info.TensorInfo.from_prim_func(mod["main"])
+    inputs = [
+        tvm.nd.array(generate_input_data(input_shape=arg_info.shape, input_dtype=arg_info.dtype))
+        for arg_info in args_info
+    ]
+    return inputs
+
+
+@register_func("tvm.meta_schedule.testing.default_check_metric")
+def default_check_metric(a: List[tvm.nd.NDArray], b: List[tvm.nd.NDArray]) -> bool:
+    assert len(a) == len(b), "Different number of outputs from two modules"
+    for i, _ in enumerate(a):
+        if not np.allclose(a[i].numpy(), b[i].numpy(), rtol=1e-3, atol=2e-3):
+            return False
+    return True
+
+
+def validate_correctness(
+    original_mod: IRModule,  # compiled for "baseline_target"
+    scheduled_mod: IRModule,  # compiled for "target"
+    *,
+    baseline_target: Target,
+    target: Target,
+    dev_type: str,
+    rpc_config: ms.runner.RPCConfig,
+    f_input_generator: Union[
+        str, Callable[[IRModule], List[tvm.nd.NDArray]]
+    ] = default_input_generator,
+    f_check_metric: Union[
+        str, Callable[[tvm.nd.NDArray, tvm.nd.NDArray], bool]
+    ] = default_check_metric,
+) -> bool:
+    """Function to validate the correctness of a scheduled module.
+
+    Parameters
+    ----------
+    original_mod : IRModule
+        The original module to be compiled.
+    scheduled_mod : IRModule
+        The scheduled module to be compiled.
+    baseline_target : Target
+        The baseline target to compile the original module.
+    target : Target
+        The target to compile the scheduled module.
+    dev_type : str
+        The device type to run the module via rpc.
+    rpc_config : RPCConfig
+        The RPCConfig to run the scheduled module.
+    f_input_generator : Union[str, Callable]
+        The function to generate the input data.
+    f_check_metric : Union[str, Callable]
+        The function to check the metric.
+
+    Returns
+    -------
+    result : bool
+        The result of the validation.
+    """
+
+    def to_numpy(a: List[tvm.nd.NDArray]) -> List[np.ndarray]:
+        """Convert a list of TVM NDArray to a list of numpy array"""
+        assert a is not None, "Empty result cannot be converted to numpy"
+        return [x.numpy() for x in a]
+
+    def to_tvm_ndarray(a: List[np.ndarray]) -> List[tvm.nd.NDArray]:
+        """Convert a list of numpy array to a list of TVM NDArray"""
+        assert a is not None, "Empty result cannot be converted to TVM NDArray"
+        return [tvm.nd.array(x) for x in a]
+
+    def build_and_run(mod: IRModule, target: Target, dev_type: str) -> np.ndarray:
+        """Build and run the module on the target device."""
+        rt_mod = tvm.build(mod, target=target)
+        return run_module_via_rpc(
+            rpc_config=rpc_config,
+            lib=rt_mod,
+            dev_type=dev_type,
+            args={i: v for i, v in enumerate(inputs)},  # pylint: disable=unnecessary-comprehension
+            continuation=create_calculator(backend="tir"),
+            backend="tir",
+        )
+
+    # fetch functions & prepare inputs
+    if isinstance(f_input_generator, str):
+        f_input_generator = get_global_func(f_input_generator)
+    if isinstance(f_check_metric, str):
+        f_check_metric = get_global_func(f_check_metric)
+    inputs = to_numpy(f_input_generator(original_mod))  # type: ignore
+    # build & run original result
+    original_res = to_numpy(build_and_run(original_mod, target=baseline_target, dev_type="cpu"))
+    scheduled_res = to_numpy(build_and_run(scheduled_mod, target=target, dev_type=dev_type))
+    # check metric
+    if f_check_metric(to_tvm_ndarray(original_res), to_tvm_ndarray(scheduled_res)):  # type: ignore
+        return True
+    else:
+        print(
+            ("\n\n").join(
+                [
+                    "Validation failed!",
+                    "Original Result:" + DELIMITOR + str(original_res),
+                    "Scheduled Result:" + DELIMITOR + str(scheduled_res),
+                    "Input:" + DELIMITOR + str(inputs),
+                    "Original IRModule:" + DELIMITOR + original_mod.script(),
+                    "Scheduled IRModule:" + DELIMITOR + scheduled_mod.script(),
+                ]
+            )
+        )
+        return False
+
+
+def main():
+    """Main function"""
+    describe()
+    database = ms.database.create(work_dir=ARGS.work_dir)
+    target = ARGS.target
+    if target.kind.name == "llvm":
+        dev_type = "cpu"
+    elif target.kind.name == "cuda":
+        dev_type = "cuda"
+    else:
+        raise RuntimeError(f"Unsupported target kind: {target.kind.name}")
+    records = database.get_all_tuning_records()
+    with ms.Profiler() as profiler:
+        for i, record in enumerate(records):
+            scope_name = f"validate #{i}"
+            with profiler.timeit(scope_name):
+                original_mod = record.workload.mod
+                sch = Schedule(original_mod)
+                record.trace.apply_to_schedule(sch=sch, remove_postproc=False)
+                scheduled_mod = sch.mod
+                is_success = False
+                try:
+                    is_success = validate_correctness(
+                        original_mod=original_mod,
+                        scheduled_mod=scheduled_mod,
+                        target=target,
+                        baseline_target=ARGS.baseline_target,
+                        dev_type=dev_type,
+                        rpc_config=ARGS.rpc_config,
+                    )
+                except Exception as e:  # pylint: disable=broad-except, invalid-name
+                    print(
+                        ("\n\n").join(
+                            [
+                                "Validation failed!",
+                                "Original IRModule:" + DELIMITOR + original_mod.script(),
+                                "Scheduled IRModule:" + DELIMITOR + scheduled_mod.script(),
+                                "Exception" + DELIMITOR + str(e),
+                            ]
+                        )
+                    )
+            if is_success:
+                print(
+                    f"Progress {i+1: 6d} / {len(records): 6d} checked,"
+                    f" used {float(profiler.get()[scope_name]): 3.3f} sec."
+                )
+            else:
+                return
+
+    print("Validation passed!")
+    print(f"Total time spent: {float(profiler.get()['Total']): 3.3f} sec.")
+
+
+if __name__ == "__main__":
+    main()

From b4b90d76d9517d52c94c1e21acfb85ca3985ce16 Mon Sep 17 00:00:00 2001
From: Valery Chernov <black.chervi@gmail.com>
Date: Thu, 10 Nov 2022 03:24:25 +0300
Subject: [PATCH 542/704] [QNN, ONNX] Extension of QLinearMatMul in ONNX
 front-end for all ranks of input tensors (#13322)

* QLinearMatMul was extended for all ranks of a and b

* CI test for QLinearMatMul was implemented (onnx front-end)

* fix after black check

* numpy type fix

* fix weight scale and zero point, output type

* fix after pylint

* resolve different input types in tests

* skip resolved TODO

* update covering of QLinearMatMul by tests

* pylint fixes

* skip test of QLinearMatMul on CUDA

Co-authored-by: Valery Chernov <valery.chernov@deelvin.com>
---
 python/tvm/relay/frontend/onnx.py          | 212 ++++++++++++++++++---
 tests/python/frontend/onnx/test_forward.py | 121 ++++++++++++
 2 files changed, 304 insertions(+), 29 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index e244b4d9a1ad..a14bb47956ee 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -242,6 +242,22 @@ def get_scalar_or_1d_tensor(x, params, dtype="float32"):
     return _op.cast(x, dtype)
 
 
+def flatten_to_nd(x, x_shape, nd=3):
+    """Flatten input tensor to nd rank"""
+    ndims = infer_shape(x_shape)[0]
+    if ndims == nd:
+        return x
+    newshape = _op.concatenate(
+        [
+            _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype),
+            _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]),
+        ],
+        0,
+    )
+    out = _op.reshape(x, fold_constant(newshape))
+    return out
+
+
 def matmul_out_dtype(inputs, out_dtype):
     """Common function to handle MatMul and MatMulInteger16"""
     a_shape = shape_of(inputs[0])
@@ -249,21 +265,6 @@ def matmul_out_dtype(inputs, out_dtype):
     b_shape = shape_of(inputs[1])
     b_rank = infer_shape(b_shape)[0]
     if a_rank > 2 or b_rank > 2:
-
-        def flatten_to_nd(x, x_shape, nd=3):
-            ndims = infer_shape(x_shape)[0]
-            if ndims == nd:
-                return x
-            newshape = _op.concatenate(
-                [
-                    _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype),
-                    _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]),
-                ],
-                0,
-            )
-            out = _op.reshape(x, fold_constant(newshape))
-            return out
-
         # Determine the output batch dimension.
         new_a_shape = a_shape
         new_b_shape = b_shape
@@ -365,6 +366,167 @@ def flatten_to_nd(x, x_shape, nd=3):
     return _op.nn.dense(inputs[0], input_1_t, out_dtype=out_dtype)
 
 
+def qmatmul(
+    a,
+    b,
+    a_zp_scalar,
+    b_zp_scalar,
+    a_scale_scalar,
+    b_scale_scalar,
+    transform_num_hidden_units,
+    matmul_result_dtype,
+):
+    """
+    Helper function to handle QLinearMatMul
+    It is very close to 'matmul_out_dtype' but separated due to
+    differences in signatures of dense, matmul, batch_matmul of nn and qnn.
+    They requre scaling and zero point arguments
+    """
+    a_shape = shape_of(a)
+    a_rank = infer_shape(a_shape)[0]
+    b_shape = shape_of(b)
+    b_rank = infer_shape(b_shape)[0]
+    if a_rank > 2 or b_rank > 2:
+        # Determine the output batch dimension.
+        new_a_shape = a_shape
+        new_b_shape = b_shape
+        if a_rank > b_rank:
+            rank_diff = a_rank - b_rank
+            new_b_shape = _op.concatenate(
+                [
+                    _expr.const([1] * rank_diff, dtype=infer_type(b_shape).checked_type.dtype),
+                    b_shape,
+                ],
+                0,
+            )
+        elif a_rank < b_rank:
+            rank_diff = b_rank - a_rank
+            new_a_shape = _op.concatenate(
+                [
+                    _expr.const([1] * rank_diff, dtype=infer_type(a_shape).checked_type.dtype),
+                    a_shape,
+                ],
+                0,
+            )
+        else:
+            pass
+
+        out_batch = _op.concatenate(
+            [
+                _op.maximum(
+                    _op.strided_slice(new_b_shape, [i], [i + 1]),
+                    _op.strided_slice(new_a_shape, [i], [i + 1]),
+                )
+                for i in range(max(a_rank, b_rank) - 2)
+            ],
+            0,
+        )
+
+        b_type = infer_type(b)
+        # Convert to dense if the second matrix is 2d and non-dynamic
+        if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type):
+            a = flatten_to_nd(a, a_shape, 2)
+            b = _op.transpose(b)
+            output = _qnn.op.dense(
+                a,
+                b,
+                a_zp_scalar,
+                b_zp_scalar,
+                a_scale_scalar,
+                b_scale_scalar,
+                transform_num_hidden_units,
+                matmul_result_dtype,
+            )
+        else:
+            # broadcast a and b
+            a_broadcasted_shape = fold_constant(
+                _op.concatenate(
+                    [
+                        out_batch,
+                        _op.strided_slice(a_shape, [a_rank - 2], [a_rank]),
+                    ],
+                    0,
+                )
+            )
+            b_broadcasted_shape = fold_constant(
+                _op.concatenate(
+                    [
+                        out_batch,
+                        _op.strided_slice(b_shape, [b_rank - 2], [b_rank]),
+                    ],
+                    0,
+                )
+            )
+            if not tvm.ir.structural_equal(a_shape, a_broadcasted_shape):
+                a = _op.transform.broadcast_to(a, a_broadcasted_shape)
+            if not tvm.ir.structural_equal(b_shape, b_broadcasted_shape):
+                b = _op.transform.broadcast_to(b, b_broadcasted_shape)
+            # Convert a and b into 3 dimensional tensors.
+            a = flatten_to_nd(a, shape_of(a), 3)
+            b = flatten_to_nd(b, shape_of(b), 3)
+            # Transpose matrix dimensions of b.
+            bt = _op.transpose(b, [0, 2, 1])
+            # Perform a NT batch matmul.
+            output = _qnn.op.batch_matmul(
+                a,
+                bt,
+                a_zp_scalar,
+                b_zp_scalar,
+                a_scale_scalar,
+                b_scale_scalar,
+                matmul_result_dtype,
+            )
+        # Reshape output to original dimensions.
+        final_shape = _op.concatenate(
+            [
+                out_batch,
+                _op.strided_slice(a_shape, [a_rank - 2], [a_rank - 1]),
+                _op.strided_slice(b_shape, [b_rank - 1], [b_rank]),
+            ],
+            0,
+        )
+        return _op.reshape(output, fold_constant(final_shape))
+
+    if a_rank == 1:
+        # TODO(vvchernov): There should be qnn.matmul but it is not implemented
+        # return _op.squeeze(_qnn.op.matmul(_op.expand_dims(a, axis=0),
+        #                                   b,
+        #                                   a_zp_scalar,
+        #                                   b_zp_scalar,
+        #                                   a_scale_scalar,
+        #                                   b_scale_scalar,
+        #                                   transform_num_hidden_units,
+        #                                   matmul_result_dtype,
+        #                                  ),
+        #                    axis=[0]
+        #                   )
+        return _op.squeeze(
+            _qnn.op.dense(
+                _op.expand_dims(a, axis=0),
+                _op.transpose(b),
+                a_zp_scalar,
+                b_zp_scalar,
+                a_scale_scalar,
+                b_scale_scalar,
+                transform_num_hidden_units,
+                matmul_result_dtype,
+            ),
+            axis=[0],
+        )
+
+    # Otherwise a simple dense op will get the job done.
+    return _qnn.op.dense(
+        a,
+        _op.transpose(b),
+        a_zp_scalar,
+        b_zp_scalar,
+        a_scale_scalar,
+        b_scale_scalar,
+        transform_num_hidden_units,
+        matmul_result_dtype,
+    )
+
+
 def layer_norm(x, eps, gamma, beta):
     """A common function to handle layer norm.
 
@@ -4437,7 +4599,6 @@ class QLinearMatMul(OnnxOpConverter):
     Operator converter for QLinearMatMul from Microsoft onnxruntime contrib opset.
 
     Limitations:
-    - Only supports 2D input tensors.
     - Not guaranteed to meet the integer-overflow behavior stipulated in the
       ONNX documentation for this operator.
 
@@ -4487,9 +4648,6 @@ def try_resolve_to_const(x, dtype_override=None):
         y_scale_type = infer_type(y_scale).checked_type
         y_zp_type = infer_type(y_zp).checked_type  # 'T3' in ONNX doc for this op
 
-        a_shape = infer_shape(a)
-        b_shape = infer_shape(b)
-
         # Verify type assumptions, based on the ONNX doc for this op...
         assert a_type.dtype in ["int8", "uint8"]
         assert a_scale_type.dtype == "float32"
@@ -4502,14 +4660,6 @@ def try_resolve_to_const(x, dtype_override=None):
         assert y_scale_type.dtype == "float32"
         assert y_zp_type.dtype in expected_out_dtypes
 
-        # TODO: relax this limitation in a future version of this importer.
-        a_rank = len(a_shape)
-        b_rank = len(b_shape)
-        assert (a_rank == 2) and (b_rank == 2), (
-            "QLinearMatMul importer currently requires both 'a' and 'b' tensors to be 2D, but"
-            " rank(a)={}, rank(b)={}".format(a_rank, b_rank)
-        )
-
         # _qnn.op.dense requires the zero-point values to have dtype int32.
         a_scale_scalar = try_resolve_to_const(a_scale)
         a_zp_scalar = try_resolve_to_const(a_zp, "int32")
@@ -4541,10 +4691,14 @@ def try_resolve_to_const(x, dtype_override=None):
         # expressed in a Relay graph. And then update this importer and various TVM
         # backends accordingly.
         matmul_result_dtype = "int32"
+        # TODO(vvchernov): possibly it is better to use unsigned type for result
+        # if input types are unsigned:
+        # if a_type.dtype == "uint8" and b_type.dtype == "uint8":
+        #     matmul_result_dtype = "uint32"
 
-        matmul_result = _qnn.op.dense(
+        matmul_result = qmatmul(
             a,
-            _op.transpose(b),
+            b,
             a_zp_scalar,
             b_zp_scalar,
             a_scale_scalar,
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 6e2c7734c3e1..017a1621d7d8 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -5367,6 +5367,7 @@ def verify_eyelike(indata, dynamic=False):
         "test_range_int32_type_positive_delta_expanded",
         "test_mod_mixed_sign_float16",
         "test_qlinearconv",
+        "test_qlinearmatmul",
         "test_resize_upsample_sizes_nearest",
     ]
 }
@@ -6151,6 +6152,126 @@ def repeat(num, dims):
     )
 
 
+# TODO(vvchernov): fix problem with quantization on cuda
+@tvm.testing.known_failing_targets("cuda")
+@tvm.testing.parametrize_targets
+def test_qlinearmatmul(target, dev):
+    """test_qlinearmatmul"""
+
+    def verify_qlinearmatmul(
+        x_shape,
+        w_shape,
+        y_shape,
+        x_dtype="uint8",
+        w_dtype="uint8",
+    ):
+        def get_randint_numpy_scalar(dtype="uint8"):
+            if dtype == "uint8":
+                return np.random.randint(0, 255)
+            else:  # "int8"
+                return np.random.randint(-128, 127)
+
+        if x_dtype == "uint8":
+            x_array = np.random.randint(low=0, high=255, size=x_shape).astype("uint8")
+        else:  # "int8"
+            x_array = np.random.randint(low=-128, high=127, size=x_shape).astype("int8")
+        if w_dtype == "uint8":
+            w_array = np.random.uniform(low=0, high=255, size=w_shape).astype("uint8")
+        else:  # "int8"
+            w_array = np.random.uniform(low=-128, high=127, size=w_shape).astype("int8")
+
+        x_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(x_dtype)]
+        w_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(w_dtype)]
+
+        y_dtype = "int8"
+        if x_dtype == "uint8" and w_dtype == "uint8":
+            y_dtype = "uint8"
+        y_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(y_dtype)]
+
+        initializer = [
+            helper.make_tensor("x_scale", TensorProto.FLOAT, (), [np.random.rand()]),
+            # TODO: 0 value for int8?
+            helper.make_tensor(
+                "x_zero_point", x_proto_type, (), [get_randint_numpy_scalar(x_dtype)]
+            ),
+            helper.make_tensor("w_scale", TensorProto.FLOAT, (), [np.random.rand()]),
+            # TODO: 0 value for int8?
+            helper.make_tensor(
+                "w_zero_point", w_proto_type, (), [get_randint_numpy_scalar(w_dtype)]
+            ),
+            helper.make_tensor("y_scale", TensorProto.FLOAT, (), [np.random.rand()]),
+            helper.make_tensor(
+                "y_zero_point", y_proto_type, (), [get_randint_numpy_scalar(y_dtype)]
+            ),
+        ]
+
+        input_nodes = [
+            helper.make_tensor_value_info("x", x_proto_type, list(x_shape)),
+            helper.make_tensor_value_info("w", w_proto_type, list(w_shape)),
+        ]
+        input_names = [
+            "x",
+            "x_scale",
+            "x_zero_point",
+            "w",
+            "w_scale",
+            "w_zero_point",
+            "y_scale",
+            "y_zero_point",
+        ]
+        input_values = [x_array, w_array]
+
+        node = helper.make_node(
+            "QLinearMatMul",
+            inputs=input_names,
+            outputs=["y"],
+        )
+
+        y_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype("int8")]
+        if x_dtype == "uint8" and w_dtype == "uint8":
+            y_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype("uint8")]
+
+        graph = helper.make_graph(
+            [node],
+            "qmatmul_test",
+            inputs=input_nodes,
+            outputs=[helper.make_tensor_value_info("y", y_proto_type, list(y_shape))],
+            initializer=initializer,
+        )
+        model = helper.make_model(graph, producer_name="qlinearmatmul_test")
+        # opt_level=1 will cause error
+        verify_with_ort_with_inputs(model, input_values, opt_level=2, target=target, dev=dev)
+
+    # Default matmul both ranks = 2 (x_dtype = "uint8", w_dtype = "uint8")
+    verify_qlinearmatmul((2, 3), (3, 2), (2, 2))
+
+    # Default matmul both ranks = 2 (x_dtype = "int8", w_dtype = "int8")
+    verify_qlinearmatmul((2, 3), (3, 2), (2, 2), "int8", "int8")
+
+    # TODO(vvchernov): problems on ONNX Runtime side and type check (onnx.py:L4763) on TVM side
+    # Default matmul both ranks = 2 (x_dtype = "uint8", w_dtype = "int8")
+    # verify_qlinearmatmul((2, 3), (3, 2), (2, 2), "uint8", "int8")
+
+    # TODO(vvchernov): problems on ONNX Runtime side and type check (onnx.py:L4763) on TVM side
+    # Default matmul both ranks = 2 (x_dtype = "int8", w_dtype = "uint8")
+    # verify_qlinearmatmul((2, 3), (3, 2), (2, 2), "int8", "uint8")
+
+    # Reduced matmul: x_ranks = 1, w_rank = 2 (x_dtype = "uint8", w_dtype = "uint8")
+    verify_qlinearmatmul((3,), (3, 2), (2,))
+
+    # Special case matmul: x_ranks = 3, w_rank = 2 (x_dtype = "uint8", w_dtype = "uint8")
+    verify_qlinearmatmul((2, 3, 4), (4, 3), (2, 3, 3))
+
+    # GPT2-style matmul both ranks = 4 (x_dtype = "uint8", w_dtype = "uint8")
+    verify_qlinearmatmul((2, 4, 3, 3), (2, 4, 3, 3), (2, 4, 3, 3))
+
+    # Asymetric matmul: x_ranks = 4, w_rank = 3 (x_dtype = "uint8", w_dtype = "uint8")
+    verify_qlinearmatmul((2, 4, 3, 3), (4, 3, 3), (2, 4, 3, 3))
+
+    # Asymetric matmul: x_ranks = 2, w_rank = 3 (x_dtype = "uint8", w_dtype = "uint8")
+    # verify_qlinearmatmul((3, 3), (4, 3, 3), (4, 3, 3))
+
+
 @tvm.testing.parametrize_targets
 def test_qlinearconcat(target, dev):
     """test_qlinearconcat"""

From 6d9d2132c1781c4cdc15e2e287cafc33f658bc5e Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 10 Nov 2022 12:38:11 +0900
Subject: [PATCH 543/704] [TIR] Check producer predicate in
 `ReverseComputeInline` (#13338)

* [TIR] Disallow reverse inline into a producer with non-trivial predicate

* add test

* Allow cases where the producer predicate can be implied by the new
predicate of the inlined block

* remove unused variable

* update comment in test to reflect the change in ReverseComputeInline
---
 src/tir/schedule/primitive/compute_inline.cc  |  46 ++++-
 .../test_meta_schedule_trace_apply.py         |   4 +-
 .../test_tir_schedule_compute_inline.py       | 178 ++++++++++++++++++
 3 files changed, 221 insertions(+), 7 deletions(-)

diff --git a/src/tir/schedule/primitive/compute_inline.cc b/src/tir/schedule/primitive/compute_inline.cc
index 2ea641a2cbd4..d54be8a05fdc 100644
--- a/src/tir/schedule/primitive/compute_inline.cc
+++ b/src/tir/schedule/primitive/compute_inline.cc
@@ -214,6 +214,32 @@ class OpaqueAccessError : public ScheduleError {
   Block scope_root_;
 };
 
+class ProducerHasNonTrivialPredicateError : public ScheduleError {
+ public:
+  explicit ProducerHasNonTrivialPredicateError(IRModule mod, BlockRealize producer,
+                                               PrimExpr new_predicate)
+      : mod_(mod), producer_(producer), new_predicate_(new_predicate) {}
+
+  String FastErrorString() const final {
+    return "ScheduleError: The producer block has a non-trivial predicate.";
+  }
+
+  String DetailRenderTemplate() const final {
+    return "ScheduleError: The producer block {0} has a non-trivial predicate " +
+           PrettyPrint(producer_->predicate) +
+           " that cannot be implied "
+           "by the synthesized predicate " +
+           PrettyPrint(new_predicate_) + " of the new inlined block.";
+  }
+
+  IRModule mod() const final { return mod_; }
+  Array<ObjectRef> LocationsOfInterest() const final { return {producer_}; }
+
+  IRModule mod_;
+  BlockRealize producer_;
+  PrimExpr new_predicate_;
+};
+
 /*!
  * \brief The base class of the inliner, which handles:
  * 1) Substitute a subtree with the specific block being inlined
@@ -533,10 +559,11 @@ class ReverseComputeInliner : public BaseInliner {
  public:
   explicit ReverseComputeInliner(const Buffer& inlined_buffer, const BlockNode* producer_block,
                                  const BlockRealize& consumer_block_realize,
-                                 const StmtSRef& scope_root_sref)
+                                 const StmtSRef& scope_root_sref, const IRModule& mod)
       : BaseInliner(inlined_buffer, consumer_block_realize->block, scope_root_sref),
         producer_block_(producer_block),
-        consumer_block_(consumer_block_realize->block.get()) {
+        consumer_block_(consumer_block_realize->block.get()),
+        mod_(mod) {
     // Initialize the predicates to ensure consumer block iters are in-bound
     consumer_iter_in_bound_ = Bool(true);
     for (const IterVar& iter : consumer_block_realize->block->iter_vars) {
@@ -632,8 +659,15 @@ class ReverseComputeInliner : public BaseInliner {
   Stmt VisitStmt_(const BlockRealizeNode* op) final {
     BlockRealize new_block_realize = Downcast<BlockRealize>(StmtMutator::VisitStmt_(op));
     if (op->block.get() == producer_block_) {
-      new_block_realize.CopyOnWrite()->predicate =
-          BuildInlinedConsumerPredicate(new_block_realize.get());
+      auto new_predicate = BuildInlinedConsumerPredicate(new_block_realize.get());
+
+      With<arith::ConstraintContext> ctx(&analyzer_, new_predicate);
+      if (!analyzer_.CanProve(op->predicate)) {
+        // We do not allow cases where the new predicate for the inlined block cannot
+        // imply the original predicate in the producer block.
+        throw ProducerHasNonTrivialPredicateError(mod_, GetRef<BlockRealize>(op), new_predicate);
+      }
+      new_block_realize.CopyOnWrite()->predicate = new_predicate;
     }
     return std::move(new_block_realize);
   }
@@ -749,6 +783,8 @@ class ReverseComputeInliner : public BaseInliner {
   PrimExpr consumer_iter_in_bound_{nullptr};
   /*! \brief The arithmetic analyzer */
   arith::Analyzer analyzer_;
+  /*! \brief The target module, only used for error reporting. */
+  const IRModule& mod_;
 };
 
 void ComputeInlineImpl(ScheduleState self, const StmtSRef& producer_block_sref,
@@ -814,7 +850,7 @@ void ReverseComputeInlineImpl(ScheduleState self, const StmtSRef& consumer_block
       NonSingleProducerError::Check(self, consumer_block_sref, scope_root_sref);
   // Step 4. Analyze the block body
   ReverseComputeInliner inliner(inlined_buffer, producer_block_sref->StmtAs<BlockNode>(),
-                                consumer_block_realize, scope_root_sref);
+                                consumer_block_realize, scope_root_sref, self->mod);
   if (!inliner.BodyPatternAllowInline(consumer_block_realize)) {
     throw BodyAnalysisError(true, self->mod, consumer_block);
   }
diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py
index 7e361d2c095c..c8e6bf6a0c73 100644
--- a/tests/python/unittest/test_meta_schedule_trace_apply.py
+++ b/tests/python/unittest/test_meta_schedule_trace_apply.py
@@ -3037,8 +3037,8 @@ def test_inline_order():
     # reverse-inlined at the very end of ScheduleUsingAnchorTrace, where its producer block
     # "conv2d_nhwc_reindex_shared" has the predicate
     # T.where(((ax1_0 * 4 + ax1_1) * 32 + ax1_2) * 2 + ax1_3 < 64) due to anchor-block scheduling
-    # (see Conv2dInt8_with_predicate_scheduled). Currently, if we try to reverse-inline a block to
-    # its producer that has a predicate, the predicate disappears after reverse inlining.
+    # (see Conv2dInt8_with_predicate_scheduled). ReverseComputeInline cannot be applied in
+    # such cases.
 
     def apply_trace(sch: Schedule) -> None:
         b0 = sch.get_block(name="pad_temp", func_name="main")
diff --git a/tests/python/unittest/test_tir_schedule_compute_inline.py b/tests/python/unittest/test_tir_schedule_compute_inline.py
index 20eafabc7a22..f9c5e22e97ce 100644
--- a/tests/python/unittest/test_tir_schedule_compute_inline.py
+++ b/tests/python/unittest/test_tir_schedule_compute_inline.py
@@ -626,6 +626,158 @@ def elementwise_producer_not_cover_consumer(
             D[vi, vj] = T.if_then_else(vi >= 128, B[vi - 128, vj], T.float32(0), dtype="float32")
 
 
+@T.prim_func
+def elementwise_predicate_producer(a: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (128, 128))
+    B = T.alloc_buffer((127, 128))
+    C = T.match_buffer(c, (127, 128))
+    for i, j in T.grid(128, 128):
+        with T.block("B"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            T.where(i < 127)
+            B[vi, vj] = A[vi, vj] * 2.0
+    for i, j in T.grid(127, 128):
+        with T.block("C"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            C[vi, vj] = B[vi, vj] + 1.0
+
+
+@T.prim_func
+def elementwise_predicate_producer_inlined(a: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (128, 128))
+    C = T.match_buffer(c, (127, 128))
+    for i, j in T.grid(128, 128):
+        with T.block("B"):
+            T.where(i < 127)
+            vi, vj = T.axis.remap("SS", [i, j])
+            T.reads(A[vi, vj])
+            T.writes(C[vi, vj])
+            C[vi, vj] = A[vi, vj] * T.float32(2) + T.float32(1)
+
+
+# fmt: off
+@tvm.script.ir_module
+class Conv2dInt8_TensorCore_with_predicate:
+    @T.prim_func
+    def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), "int8"], p2: T.Buffer[(1, 1, 1, 256), "int32"], p3: T.Buffer[(1, 1, 1, 256), "int32"], p4: T.Buffer[256, "int32"], p5: T.Buffer[256, "int32"], p6: T.Buffer[256, "int32"], p7: T.Buffer[(), "int32"], p8: T.Buffer[1, "int32"], p9: T.Buffer[(16, 56, 56, 256), "int32"], compute: T.Buffer[(16, 56, 56, 256), "int32"]):
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        with T.block("root"):
+            T.reads()
+            T.writes()
+            T.block_attr({"meta_schedule.unroll_explicit":1024})
+            compute_3 = T.alloc_buffer([16, 56, 56, 256], dtype="int32")
+            conv2d_nhwc_reindex_shared = T.alloc_buffer([50176, 256], dtype="int32", scope="shared")
+            conv2d_nhwc_reindex_shared_wmma_accumulator = T.alloc_buffer([50176, 256], dtype="int32", scope="wmma.accumulator")
+            pad_temp_reindex_shared = T.alloc_buffer([50176, 64], dtype="int8", scope="shared")
+            p1_reindex_shared = T.alloc_buffer([1, 1, 256, 64], dtype="int8", scope="shared")
+            pad_temp_reindex_shared_wmma_matrix_a = T.alloc_buffer([50176, 64], dtype="int8", scope="wmma.matrix_a")
+            p1_reindex_shared_wmma_matrix_b = T.alloc_buffer([1, 1, 256, 64], dtype="int8", scope="wmma.matrix_b")
+            for ax2_0_0_ax3_0_0_fused in T.thread_binding(32, thread="blockIdx.y"):
+                for ax2_0_1_ax3_0_1_fused in T.thread_binding(196, thread="blockIdx.x"):
+                    for ax2_0_2_ax3_0_2_fused in T.thread_binding(4, thread="threadIdx.y"):
+                        for ax0_0, ax1_0, ax4_0_0 in T.grid(1, 1, 2):
+                            for ax0_ax1_fused in T.serial(1024):
+                                with T.block("pad_temp_reindex_shared"):
+                                    v0 = T.axis.spatial(50176, ax2_0_0_ax3_0_0_fused // 4 * 6272 + ax2_0_1_ax3_0_1_fused * 32 + ax0_ax1_fused // 32)
+                                    v1 = T.axis.spatial(64, ax4_0_0 * 32 + ax0_ax1_fused % 32)
+                                    T.reads(p0[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1])
+                                    T.writes(pad_temp_reindex_shared[v0, v1])
+                                    T.block_attr({"buffer_dim_align":[[0, 0, 32, 16]], "meta_schedule.cooperative_fetch":4})
+                                    pad_temp_reindex_shared[v0, v1] = p0[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1]
+                            for ax0_ax1_ax2_ax3_fused in T.serial(2048):
+                                with T.block("p1_reindex_shared"):
+                                    v0 = T.axis.spatial(1, 0)
+                                    v1 = T.axis.spatial(1, 0)
+                                    v2 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused % 4 * 64 + ax0_ax1_ax2_ax3_fused // 32)
+                                    v3 = T.axis.spatial(64, ax4_0_0 * 32 + ax0_ax1_ax2_ax3_fused % 32)
+                                    T.reads(p1[v2, v0, v1, v3])
+                                    T.writes(p1_reindex_shared[v0, v1, v2, v3])
+                                    T.block_attr({"buffer_dim_align":[[0, 2, 32, 16]], "meta_schedule.cooperative_fetch":3})
+                                    p1_reindex_shared[v0, v1, v2, v3] = p1[v2, v0, v1, v3]
+                            for ax0_1, ax1_1, ax4_0_1 in T.grid(1, 1, 2):
+                                for ax0_0_1, ax1_0_1 in T.grid(1, 1):
+                                    with T.block("pad_temp_reindex_shared_wmma.matrix_a_o"):
+                                        v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2)
+                                        v1_o = T.axis.spatial(4, ax4_0_0 * 2 + ax4_0_1)
+                                        T.reads(pad_temp_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                        T.writes(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                        T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_s8_a"})
+                                        for ax0_1_1, ax1_1_1 in T.grid(16, 16):
+                                            with T.block("pad_temp_reindex_shared_wmma.matrix_a"):
+                                                v0_i, v1_i = T.axis.remap("SS", [ax0_1_1, ax1_1_1])
+                                                T.reads(pad_temp_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                                T.writes(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                                pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = pad_temp_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                                for ax0, ax1, ax2_0, ax3_0 in T.grid(1, 1, 2, 1):
+                                    with T.block("p1_reindex_shared_wmma.matrix_b_o"):
+                                        v0 = T.axis.spatial(1, 0)
+                                        v1 = T.axis.spatial(1, 0)
+                                        v2_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 4 * 4 + ax2_0_2_ax3_0_2_fused % 2 * 2 + ax2_0)
+                                        v3_o = T.axis.spatial(4, ax4_0_0 * 2 + ax4_0_1)
+                                        T.reads(p1_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                        T.writes(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                        T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_s8_b_trans"})
+                                        for ax2_1, ax3_1 in T.grid(16, 16):
+                                            with T.block("p1_reindex_shared_wmma.matrix_b"):
+                                                v2_i, v3_i = T.axis.remap("SS", [ax2_1, ax3_1])
+                                                T.reads(p1_reindex_shared[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i])
+                                                T.writes(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i])
+                                                p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i] = p1_reindex_shared[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i]
+                                for ax2_0_3, ax3_0_3, ax0_2, ax1_2, ax4_0_2, ax2_0_4, ax3_0_4 in T.grid(1, 1, 1, 1, 1, 1, 2):
+                                    with T.block("conv2d_nhwc_o"):
+                                        v0 = T.axis.reduce(1, 0)
+                                        v1 = T.axis.reduce(1, 0)
+                                        v2_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2 + ax2_0_3 + ax2_0_4)
+                                        v3_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 4 * 4 + ax2_0_2_ax3_0_2_fused % 2 * 2 + ax3_0_3 * 2 + ax3_0_4)
+                                        v4_o = T.axis.reduce(4, ax4_0_0 * 2 + ax4_0_1 + ax4_0_2)
+                                        T.reads(pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 : v2_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 : v3_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16])
+                                        T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16])
+                                        T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_s8s8s32_trans", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_s32", "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "warp_execution":1})
+                                        with T.init():
+                                            for ax2_1, ax3_1 in T.grid(16, 16):
+                                                with T.block("conv2d_nhwc_init"):
+                                                    v2_i_init, v3_i_init = T.axis.remap("SS", [ax2_1, ax3_1])
+                                                    T.reads()
+                                                    T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i_init, v3_o * 16 + v3_i_init])
+                                                    conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i_init, v3_o * 16 + v3_i_init] = 0
+                                        for ax2_1, ax3_1, ax4_1 in T.grid(16, 16, 16):
+                                            with T.block("conv2d_nhwc"):
+                                                v2_i, v3_i, v4_i = T.axis.remap("SSR", [ax2_1, ax3_1, ax4_1])
+                                                T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i], pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 + v2_i, v4_o * 16 + v4_i], p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 + v3_i, v4_o * 16 + v4_i])
+                                                T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i])
+                                                T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"})
+                                                conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] + T.cast(pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 + v2_i, v4_o * 16 + v4_i], "int32") * T.cast(p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 + v3_i, v4_o * 16 + v4_i], "int32")
+                        for ax0_0, ax1_0 in T.grid(1, 2):
+                            with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"):
+                                v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2)
+                                v1_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 4 * 4 + ax2_0_2_ax3_0_2_fused % 2 * 2 + ax1_0)
+                                T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16])
+                                T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_s32_shared"})
+                                for ax0_1, ax1_1 in T.grid(16, 16):
+                                    with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator"):
+                                        v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1])
+                                        T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                        T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
+                                        conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
+                    for ax0, ax1_0, ax1_1, ax1_2, ax1_3 in T.grid(32, 1, 4, 32, 2):
+                        with T.block("conv2d_nhwc_reindex_shared"):
+                            T.where(((ax1_0 * 4 + ax1_1) * 32 + ax1_2) * 2 + ax1_3 < 64)
+                            v0 = T.axis.spatial(50176, ax2_0_0_ax3_0_0_fused // 4 * 6272 + ax2_0_1_ax3_0_1_fused * 32 + ax0)
+                            v1 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused % 4 * 64 + (ax1_0 * 256 + ax1_1 * 64 + ax1_2 * 2 + ax1_3))
+                            T.reads(p7[()], conv2d_nhwc_reindex_shared[v0, v1], p2[0, 0, 0, v1], p3[0, 0, 0, v1], p4[v1], p5[v1], p6[v1], p8[0])
+                            T.writes(compute_3[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1])
+                            compute_3[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1] = T.q_multiply_shift(T.max(T.min(p7[()] + T.q_multiply_shift_per_axis(conv2d_nhwc_reindex_shared[v0, v1] - p2[0, 0, 0, v1] + p3[0, 0, 0, v1], p4[v1], p5[v1], p6[v1], 31, False, True, dtype="int32"), 255), 0) - p8[0], 1457846997, 31, 0, dtype="int32")
+            for i0_12, i1_12, i2_12, i3_12 in T.grid(16, 56, 56, 256):
+                with T.block("compute_4"):
+                    i0_13, i1_13, i2_13, i3_13 = T.axis.remap("SSSS", [i0_12, i1_12, i2_12, i3_12])
+                    T.reads(compute_3[i0_13, i1_13, i2_13, i3_13], p9[i0_13, i1_13, i2_13, i3_13])
+                    T.writes(compute[i0_13, i1_13, i2_13, i3_13])
+                    compute[i0_13, i1_13, i2_13, i3_13] = T.max(T.min(compute_3[i0_13, i1_13, i2_13, i3_13] + T.q_multiply_shift(p9[i0_13, i1_13, i2_13, i3_13], 2101000910, 31, 0, dtype="int32"), 255), 0)
+# fmt: on
+
 # pylint: enable=no-member,invalid-name,unused-variable
 
 use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True})
@@ -883,5 +1035,31 @@ def test_reverse_compute_inline_error_producer_not_cover_consumer(use_block_name
         sch.reverse_compute_inline(compute)
 
 
+def test_reverse_compute_inline_producer_predicate_allowed():
+    """Test a case where reverse compute inline is allowed even though the producer has a
+    non-trivial predicate.
+    """
+
+    sch = tir.Schedule(elementwise_predicate_producer, debug_mask="all")
+    sch.reverse_compute_inline(sch.get_block("C"))
+    tvm.ir.assert_structural_equal(elementwise_predicate_producer_inlined, sch.mod["main"])
+
+
+def test_reverse_compute_inline_producer_predicate_disallowed():
+    """Test reverse compute inline failure when the producer has a non-trivial predicate that cannot be
+    implied by the synthesized predicate of the new inlined block.
+    """
+
+    sch = tir.Schedule(Conv2dInt8_TensorCore_with_predicate, debug_mask="all")
+
+    with pytest.raises(tvm.tir.ScheduleError) as e:
+        sch.reverse_compute_inline(sch.get_block("compute_4"))
+
+    assert (
+        "that cannot be implied by the synthesized predicate True of the new inlined block"
+        in str(e)
+    )
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From a16a8904833e9c72aa7571ca336e781d89c128aa Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 10 Nov 2022 12:38:20 +0900
Subject: [PATCH 544/704] [TOPI] Fix conv2d transpose for small channel
 (#13341)

* [TOPI] Fix conv2d transpose for small channel

* black
---
 python/tvm/topi/cuda/conv2d_transpose.py                     | 2 +-
 tests/python/topi/python/test_topi_group_conv2d_transpose.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/cuda/conv2d_transpose.py b/python/tvm/topi/cuda/conv2d_transpose.py
index 07f717ab4d0c..006b67a5515e 100644
--- a/python/tvm/topi/cuda/conv2d_transpose.py
+++ b/python/tvm/topi/cuda/conv2d_transpose.py
@@ -161,7 +161,7 @@ def _fallback_schedule(N, F, Y, X):
             cfg["tile_n"] = SplitEntity([1, 1, 1, 1])
         # split F (output channel dimension)
         if F > 1:
-            cfg["tile_f"] = SplitEntity([-1, 1, 64, 1])
+            cfg["tile_f"] = SplitEntity([-1, 1, 4, 1])
         # split Y (height dimension)
         y_split_factor = 1
         for candidate in range(5, 17):
diff --git a/tests/python/topi/python/test_topi_group_conv2d_transpose.py b/tests/python/topi/python/test_topi_group_conv2d_transpose.py
index f55b906990fb..e9f7ce5ef4dd 100644
--- a/tests/python/topi/python/test_topi_group_conv2d_transpose.py
+++ b/tests/python/topi/python/test_topi_group_conv2d_transpose.py
@@ -158,6 +158,7 @@ def test_group_conv2d_transpose_nchw():
     verify_group_conv2d_transpose_nchw(
         1, 3, (224, 224), 32, (3, 3), (2, 2), (1, 1, 1, 1), (0, 0), 1
     )
+    verify_group_conv2d_transpose_nchw(1, 48, (64, 64), 12, (4, 4), (2, 2), (1, 1, 1, 1), (0, 0), 1)
 
 
 if __name__ == "__main__":

From 1228104726b3cfb63c0c13da7a584ca6d7b5e584 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Wed, 9 Nov 2022 22:13:36 -0800
Subject: [PATCH 545/704] [Minor][Testing] Consolidate IRs into corresponding
 functions (#13339)

We moved most of the IR definition into the testing methods correspondingly.

Co-authored-by: Yaxing Cai <caiyaxing666@gmail.com>
---
 python/tvm/testing/__init__.py                |   2 -
 python/tvm/testing/tir.py                     |  45 +-
 .../unittest/test_tvmscript_error_report.py   | 710 ++++++++----------
 .../unittest/test_tvmscript_syntax_sugar.py   |  13 +-
 4 files changed, 330 insertions(+), 440 deletions(-)

diff --git a/python/tvm/testing/__init__.py b/python/tvm/testing/__init__.py
index 9a18f1689100..d84846725ec4 100644
--- a/python/tvm/testing/__init__.py
+++ b/python/tvm/testing/__init__.py
@@ -28,7 +28,5 @@
 from .popen_pool import call_py_ffi, call_cpp_py_ffi, fast_summation, slow_summation
 from .popen_pool import timeout_job
 
-from .tir import check_error
-
 from . import auto_scheduler
 from . import autotvm
diff --git a/python/tvm/testing/tir.py b/python/tvm/testing/tir.py
index 8dd482673829..57c1a85c5b9f 100644
--- a/python/tvm/testing/tir.py
+++ b/python/tvm/testing/tir.py
@@ -16,49 +16,6 @@
 # under the License.
 # pylint: disable=invalid-name, import-outside-toplevel, unused-variable
 """Common utility functions in TVM tir"""
-import inspect
-import re
-import tvm
-from tvm.ir.diagnostics import override_renderer
-
-
-CHECK_ERROR_RE = re.compile(r"^.*# check_error: (.+)$")
-
-
-def check_error(func, rel_lineno):
-    """check if TIR script throws error"""
-    # Override the default renderer to accumulate errors
-    errors = []
-
-    def render(e):
-        for d in e.diagnostics:
-            errors.append(d)
-
-    override_renderer(render)
-    # The diagnostic context throws an exception when it gets an error
-    try:
-        source_code = inspect.getsource(func)
-        source_code = "@T.prim_func\n" + source_code
-        from tvm.script import from_source
-
-        # to avoid cyclic import
-        from_source(source_code)
-    except tvm.error.DiagnosticError as e:
-        pass
-    assert len(errors) == 1, errors
-    for d in errors:
-        assert (
-            d.span.line - 1 == rel_lineno
-        ), f"Expected error to be on line {rel_lineno}, but it was on {d.span.line - 1}"
-
-    error_line = source_code.split("\n")[rel_lineno]
-    m = CHECK_ERROR_RE.match(error_line)
-    if m:
-        expected_error_text = m.group(1)
-        errors = [e.message for e in errors]
-        assert (
-            expected_error_text in errors
-        ), f'check_error expects "{expected_error_text} in str(errors): {errors}'
 
 
 def mma_schedule(
@@ -80,6 +37,8 @@ def mma_schedule(
     shared_scope="shared",
 ):
     """Create a tensorized schedule for GEMM with MMA intrinsics."""
+    import tvm  # pylint: disable=import-outside-toplevel
+
     ir_module = tvm.IRModule({"main": workload})
     sch = tvm.tir.Schedule(ir_module)
 
diff --git a/tests/python/unittest/test_tvmscript_error_report.py b/tests/python/unittest/test_tvmscript_error_report.py
index acc68af065dd..36de35fa928b 100644
--- a/tests/python/unittest/test_tvmscript_error_report.py
+++ b/tests/python/unittest/test_tvmscript_error_report.py
@@ -14,310 +14,304 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import inspect
+import re
 
 import pytest
-import sys
 import tvm
+import tvm.testing
 from tvm import tir
-from tvm.testing import check_error
+from tvm.ir.diagnostics import override_renderer
+from tvm.script import from_source
 from tvm.script import tir as T
 
 
-def buffer_bind_missing_args(a: T.handle) -> None:
-    A = T.match_buffer((16, 16), "float32")  # error
+def check_error(func, rel_lineno):
+    check_error_re = re.compile(r"^.*# check_error: (.+)$")
+    """check if TIR script throws error"""
+    # Override the default renderer to accumulate errors
+    errors = []
+
+    def render(e):
+        for d in e.diagnostics:
+            errors.append(d)
+
+    override_renderer(render)
+    # The diagnostic context throws an exception when it gets an error
+    try:
+        source_code = inspect.getsource(func)
+        indent = len(re.match(r"^\s*", source_code).group(0))
+        source_code = "@T.prim_func\n" + "\n".join(
+            line[indent:] for line in source_code.splitlines()
+        )
+        from_source(source_code)
+    except tvm.error.DiagnosticError as e:
+        pass
+    assert len(errors) == 1, errors
+    if rel_lineno is None:
+        return
+    error = errors[0]
+    assert (
+        error.span.line - 1 == rel_lineno
+    ), f"Expected error to be on line {rel_lineno}, but it was on {error.span.line - 1}"
+
+    error_line = source_code.split("\n")[rel_lineno]
+    m = check_error_re.match(error_line)
+    if m:
+        expected_error_text = m.group(1)
+        error = error.message
+        assert (
+            expected_error_text == error
+        ), f'check_error expects "{expected_error_text} in str(errors): {error}'
 
 
 def test_buffer_bind():
-    check_error(buffer_bind_missing_args, 2)
-
+    def buffer_bind_missing_args(a: T.handle) -> None:
+        A = T.match_buffer((16, 16), "float32")  # error
 
-def undefined_buffer(a: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
-
-    T.attr(A, "realize_scope", "")
-    T.realize(C[0:16, 0:16], "")  # error
-    for i in T.serial(16):
-        for j in T.serial(0, 16):
-            A[i, j] = 0.0
+    check_error(buffer_bind_missing_args, 2)
 
 
 def test_undefined_buffer():
-    check_error(undefined_buffer, 5)
+    def undefined_buffer(a: T.handle) -> None:
+        A = T.match_buffer(a, (16, 16), "float32")
 
+        T.attr(A, "realize_scope", "")
+        T.realize(C[0:16, 0:16], "")  # error
+        for i in T.serial(16):
+            for j in T.serial(0, 16):
+                A[i, j] = 0.0
 
-def unsupported_stmt(a: T.int32) -> None:
-    if a > 0:
-        print("I love tvm")  # error
+    check_error(undefined_buffer, 5)
 
 
 def test_unsupported_stmt():
-    check_error(unsupported_stmt, 3)
-
+    def unsupported_stmt(a: T.int32) -> None:
+        if a > 0:
+            print("I love tvm")  # error
 
-def unsupported_function_call(a: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
-
-    T.attr(A, "realize_scope", "")
-    T.realize(A[0:16, 0:16], "")
-    for i in T.const_range(16):  # error
-        for j in T.serial(0, 16):
-            A[i, j] = 0.0
+    check_error(unsupported_stmt, 3)
 
 
 def test_unsupported_function_call():
-    check_error(unsupported_function_call, 6)
+    def unsupported_function_call(a: T.handle) -> None:
+        A = T.match_buffer(a, (16, 16), "float32")
 
+        T.attr(A, "realize_scope", "")
+        T.realize(A[0:16, 0:16], "")
+        for i in T.const_range(16):  # error
+            for j in T.serial(0, 16):
+                A[i, j] = 0.0
 
-def missing_type_annotation(a) -> None:  # error
-    T.evaluate(0.0)
+    check_error(unsupported_function_call, 6)
 
 
 def test_missing_type_annotation():
-    check_error(missing_type_annotation, 1)
-
-
-def invalid_expr_stmt() -> None:
-    T.max(1, 2)  # error
-
-
-def test_invalid_expr_stmt():
-    check_error(invalid_expr_stmt, 2)
-
-
-def invalid_for_function(a: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
+    def missing_type_annotation(a) -> None:  # error
+        T.evaluate(0.0)
 
-    for i in T.evaluate(0.0):  # error
-        for j in T.serial(0, 16):
-            A[i, j] = 0.0
+    check_error(missing_type_annotation, 1)
 
 
 def test_invalid_for_function():
-    check_error(invalid_for_function, 4)
+    def invalid_for_function(a: T.handle) -> None:
+        A = T.match_buffer(a, (16, 16), "float32")
 
+        for i in T.evaluate(0.0):  # error
+            for j in T.serial(0, 16):
+                A[i, j] = 0.0
 
-def invalid_block_function(a: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
-
-    with T.evaluate(0.0):  # error
-        T.evaluate(1.0)
+    check_error(invalid_for_function, 4)
 
 
 def test_invalid_block_function():
-    check_error(invalid_block_function, 4)
+    def invalid_block_function(a: T.handle) -> None:
+        A = T.match_buffer(a, (16, 16), "float32")
 
+        with T.evaluate(0.0):  # error
+            T.evaluate(1.0)
 
-def return_not_allowed(a: T.handle) -> None:
-    return T.evaluate(0)  # error
+    check_error(invalid_block_function, 4)
 
 
 def test_return_not_allowed():
-    check_error(return_not_allowed, 2)
+    def return_not_allowed(a: T.handle) -> None:
+        return T.evaluate(0)  # error
 
-
-def tir_assert(a: T.handle) -> None:
-    T.Assert(0, "")  # error
-
-
-def test_tir_assert():
-    check_error(tir_assert, 2)
-
-
-def no_body(a: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
-    T.realize(A, "")  # error
+    check_error(return_not_allowed, 2)
 
 
 def test_no_body():
-    check_error(no_body, 3)
+    def no_body(a: T.handle) -> None:
+        A = T.match_buffer(a, (16, 16), "float32")
+        T.realize(A, "")  # error
 
-
-def allocate_with_buffers() -> None:
-    with T.allocate([1], "float32", "") as [A, B]:  # error
-        T.evaluate(1.0)
+    check_error(no_body, 3)
 
 
 def test_allocate_with_buffers():
-    check_error(allocate_with_buffers, 2)
-
+    def allocate_with_buffers() -> None:
+        with T.allocate([1], "float32", "") as [A, B]:  # error
+            T.evaluate(1.0)
 
-def inconsistent_binding_value() -> None:
-    for i, j in T.grid(16, 16):
-        vi, vj = T.axis.remap("SS", [i])  # error
-        T.evaluate(1.0)
+    check_error(allocate_with_buffers, 2)
 
 
-def inconsistent_binding_type() -> None:
-    for i, j in T.grid(16, 16):
-        vi, vj = T.axis.remap("S", [i, j])  # error
-        T.evaluate(1.0)
+def test_inconsistent_binding():
+    def inconsistent_binding_value() -> None:
+        for i, j in T.grid(16, 16):
+            vi, vj = T.axis.remap("SS", [i])  # error
+            T.evaluate(1.0)
 
+    def inconsistent_binding_type() -> None:
+        for i, j in T.grid(16, 16):
+            vi, vj = T.axis.remap("S", [i, j])  # error
+            T.evaluate(1.0)
 
-def test_inconsistent_binding():
     check_error(inconsistent_binding_value, 3)
     check_error(inconsistent_binding_type, 3)
 
 
-def error_remap_type() -> None:
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi, vj = T.axis.remap("TT", [i, j])  # error
-            T.evaluate(1.0)
-
-
-def error_remap_value() -> None:
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i + j, j])  # error
-            T.evaluate(1.0)
+def test_error_remap_args():
+    def error_remap_type() -> None:
+        for i, j in T.grid(16, 16):
+            with T.block():
+                vi, vj = T.axis.remap("TT", [i, j])  # error
+                T.evaluate(1.0)
 
+    def error_remap_value() -> None:
+        for i, j in T.grid(16, 16):
+            with T.block():
+                vi, vj = T.axis.remap("SS", [i + j, j])  # error
+                T.evaluate(1.0)
 
-def test_error_remap_args():
     check_error(error_remap_type, 4)
     check_error(error_remap_value, 4)
 
 
-def invalid_block_axes(a: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi = T.axis.S(i, A)  # error
-            T.evaluate(1.0)
-
-
 def test_invalid_block_axes():
-    check_error(invalid_block_axes, 5)
-
+    def invalid_block_axes(a: T.handle) -> None:
+        A = T.match_buffer(a, (16, 16), "float32")
+        for i, j in T.grid(16, 16):
+            with T.block():
+                vi = T.axis.S(i, A)  # error
+                T.evaluate(1.0)
 
-def duplicate_block_axes() -> None:
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi = T.axis.S(16, i)
-            vi = T.axis.S(16, j)  # error
-            T.evaluate(1.0)
+    check_error(invalid_block_axes, 5)
 
 
-def duplicate_block_axes_remap() -> None:
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi, vi = T.axis.remap("SS", [i, j])  # error
-            T.evaluate(1.0)
+def test_duplicate_block_axes():
+    def duplicate_block_axes() -> None:
+        for i, j in T.grid(16, 16):
+            with T.block():
+                vi = T.axis.S(16, i)
+                vi = T.axis.S(16, j)  # error
+                T.evaluate(1.0)
 
+    def duplicate_block_axes_remap() -> None:
+        for i, j in T.grid(16, 16):
+            with T.block():
+                vi, vi = T.axis.remap("SS", [i, j])  # error
+                T.evaluate(1.0)
 
-def test_duplicate_block_axes():
     check_error(duplicate_block_axes, 5)
     check_error(duplicate_block_axes_remap, 4)
 
 
-def miss_block_bind_value() -> None:
-    for i, j in T.grid(128, 128):
-        with T.block():
-            vi = T.axis.S(i)  # error
-            T.evaluate(1.0)
-
-
 def test_miss_block_bind():
-    check_error(miss_block_bind_value, 4)
-
+    def miss_block_bind_value() -> None:
+        for i, j in T.grid(128, 128):
+            with T.block():
+                vi = T.axis.S(i)  # error
+                T.evaluate(1.0)
 
-def invalid_loop_var() -> None:
-    for i, j in range(0, 16):  # error
-        T.evaluate(1.0)
+    check_error(miss_block_bind_value, 4)
 
 
 def test_invalid_loop_var():
-    check_error(invalid_loop_var, 2)
-
+    def invalid_loop_var() -> None:
+        for i, j in range(0, 16):  # error
+            T.evaluate(1.0)
 
-def inconsistent_grid() -> None:
-    for i in T.grid(16, 16):  # error
-        T.evaluate(1.0)
+    check_error(invalid_loop_var, 2)
 
 
 def test_inconsistent_grid():
-    check_error(inconsistent_grid, 2)
-
-
-def invalid_match_buffer_region() -> None:
-    for i, j in T.grid(128, 128):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            A = T.match_buffer(vi)  # error
+    def inconsistent_grid() -> None:
+        for i in T.grid(16, 16):  # error
             T.evaluate(1.0)
 
+    check_error(inconsistent_grid, 2)
 
-def test_invalid_match_buffer_region():
-    check_error(invalid_match_buffer_region, 5)
 
+def test_invalid_match_buffer_region():
+    def invalid_match_buffer_region() -> None:
+        for i, j in T.grid(128, 128):
+            with T.block():
+                vi, vj = T.axis.remap("SS", [i, j])
+                A = T.match_buffer(vi)  # error
+                T.evaluate(1.0)
 
-def duplicate_buffer() -> None:
-    A = T.alloc_buffer((128, 128), "float32")
-    for i, j in T.grid(128, 128):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            A = T.alloc_buffer((128, 128), "float32")  # error
-            T.evaluate(1.0)
+    check_error(invalid_match_buffer_region, 5)
 
 
 def test_duplicate_buffer():
-    check_error(duplicate_buffer, 6)
+    def duplicate_buffer() -> None:
+        A = T.alloc_buffer((128, 128), "float32")
+        A = T.alloc_buffer((128, 128), "float32")  # error
 
-
-def duplicate_reads() -> None:
-    A = T.alloc_buffer((128, 128), "float32")
-    for i, j in T.grid(128, 128):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            T.reads(A[0:8, 0:8])
-            T.reads(A[0:16, 0:16])  # error
-            T.evaluate(1.0)
-
-
-def duplicate_writes() -> None:
-    A = T.alloc_buffer((128, 128), "float32")
-    for i, j in T.grid(128, 128):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            T.writes(A[0:8, 0:8])
-            T.writes(A[0:16, 0:16])  # error
-            T.evaluate(1.0)
+    check_error(duplicate_buffer, 3)
 
 
-def duplicate_predicate() -> None:
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            T.where(1)
-            T.where(0)  # error
-
-
-def duplicate_annotations() -> None:
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            T.block_attr({})
-            T.block_attr({})  # error
-
-
-def duplicate_init() -> None:
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            with T.init():
+def test_duplicate_block_signature():
+    def duplicate_reads() -> None:
+        A = T.alloc_buffer((128, 128), "float32")
+        for i, j in T.grid(128, 128):
+            with T.block():
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.reads(A[0:8, 0:8])
+                T.reads(A[0:16, 0:16])  # error
                 T.evaluate(1.0)
-            with T.init():  # error
+
+    def duplicate_writes() -> None:
+        A = T.alloc_buffer((128, 128), "float32")
+        for i, j in T.grid(128, 128):
+            with T.block():
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.writes(A[0:8, 0:8])
+                T.writes(A[0:16, 0:16])  # error
                 T.evaluate(1.0)
 
+    def duplicate_predicate() -> None:
+        for i, j in T.grid(16, 16):
+            with T.block():
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.where(1)
+                T.where(0)  # error
 
-def duplicate_axes() -> None:
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            vi = T.axis.S(i, 16)  # error
-            T.evaluate(1.0)
+    def duplicate_annotations() -> None:
+        for i, j in T.grid(16, 16):
+            with T.block():
+                vi, vj = T.axis.remap("SS", [i, j])
+                T.block_attr({})
+                T.block_attr({})  # error
 
+    def duplicate_init() -> None:
+        for i, j in T.grid(16, 16):
+            with T.block():
+                vi, vj = T.axis.remap("SS", [i, j])
+                with T.init():
+                    T.evaluate(1.0)
+                with T.init():  # error
+                    T.evaluate(1.0)
+
+    def duplicate_axes() -> None:
+        for i, j in T.grid(16, 16):
+            with T.block():
+                vi, vj = T.axis.remap("SS", [i, j])
+                vi = T.axis.S(i, 16)  # error
+                T.evaluate(1.0)
 
-def test_duplicate_block_signature():
     check_error(duplicate_reads, 7)
     check_error(duplicate_writes, 7)
     check_error(duplicate_predicate, 6)
@@ -326,143 +320,105 @@ def test_duplicate_block_signature():
     check_error(duplicate_axes, 5)
 
 
-def opaque_access_during_complete(a: T.handle) -> None:  # error
-    A = T.match_buffer(a, (16, 16), "float32")
-    for i, j in T.grid(16, 16):
-        with T.block():
-            T.evaluate(T.call_extern("dummy_extern_function", A.data, dtype="int32"))
-
-
 def test_opaque_access_during_complete():
-    check_error(opaque_access_during_complete, 1)
-
+    def opaque_access_during_complete(a: T.handle) -> None:  # error
+        A = T.match_buffer(a, (16, 16), "float32")
+        for i, j in T.grid(16, 16):
+            with T.block():
+                T.evaluate(T.call_extern("dummy_extern_function", A.data, dtype="int32"))
 
-def convert_slice_to_bufferload() -> None:
-    A = T.alloc_buffer((128, 128), "float32")
-    for i, j in T.grid(128, 128):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            A[vi, vj] = A[vi : vi + 2, vj] + 1  # error
+    check_error(opaque_access_during_complete, None)
 
 
 def test_convert_slice_to_bufferload():
-    check_error(convert_slice_to_bufferload, 6)
-
-
-def error_index_type() -> None:
-    A = T.alloc_buffer((128, 128), "float32")
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            A[vi, vj] = A[vi, 0.0] + 1  # error
-
-
-def error_bufferslice_index_type() -> None:
-    A = T.alloc_buffer((1,), "float32")
-    B = T.alloc_buffer((16, 16), "float32")
-    C = T.alloc_buffer((16, 16), "float32")
-    for i, j in T.grid(16, 16):
-        with T.block():
-            vi, vj = T.axis.remap("SS", [i, j])
-            C[vi, vj] = B[vi, A[0]]  # error
-
-
-def test_error_index_type():
-    check_error(error_index_type, 6)
-    check_error(error_bufferslice_index_type, 8)
-
-
-def special_stmt_except() -> None:
-    A = T.alloc_buffer("(128, 128)", "float32")  # error
-    T.evaluate(1.0)
+    def convert_slice_to_bufferload() -> None:
+        A = T.alloc_buffer((128, 128), "float32")
+        for i, j in T.grid(128, 128):
+            with T.block():
+                vi, vj = T.axis.remap("SS", [i, j])
+                A[vi, vj] = A[vi : vi + 2, vj] + 1  # error
 
-
-def scope_handler_except() -> None:
-    for i in T.serial("1", "1"):  # error
-        T.evaluate(1)
+    check_error(convert_slice_to_bufferload, 6)
 
 
-def intrin_except_unassign(a: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
-    T.evaluate(A)  # error
+def test_tvm_exception_catch():
+    def special_stmt_except() -> None:
+        A = T.alloc_buffer("(128, 128)", "float32")  # error
+        T.evaluate(1.0)
 
+    def scope_handler_except() -> None:
+        for i in T.serial("1", "1"):  # error
+            T.evaluate(1)
 
-def intrin_except_assign(a: T.handle) -> None:
-    A = T.match_buffer(a, (16, 16), "float32")
-    A[0, 0] = A[A]  # error
+    def intrin_except_unassign(a: T.handle) -> None:
+        A = T.match_buffer(a, (16, 16), "float32")
+        T.evaluate(A)  # error
 
+    def intrin_except_assign(a: T.handle) -> None:
+        A = T.match_buffer(a, (16, 16), "float32")
+        A[0, 0] = A[A]  # error
 
-def test_tvm_exception_catch():
-    # test catching c++ side exception
     check_error(special_stmt_except, 2)
     check_error(scope_handler_except, 2)
     check_error(intrin_except_unassign, 3)
     check_error(intrin_except_assign, 3)
 
 
-def buffer_shape_mismatch(a: T.handle) -> None:
-    A = T.match_buffer(a, (8, 8))
-    for i, j in T.grid(8, 2):
-        with T.block():
-            T.reads([])
-            T.writes([A[i, j * 4 : j * 4 + 4]])
-            sub_A = T.match_buffer(
-                A[i, j * 4 : j * 4 + 4], (5)
-            )  # error: shape mismatched between 4 and 5
-            for jj in range(0, 4):
-                sub_A[i, j * 4 + jj] = 1
-
-
 def test_match_buffer_shape_mismatch():
-    check_error(buffer_shape_mismatch, 7)
-
+    def buffer_shape_mismatch(a: T.handle) -> None:
+        A = T.match_buffer(a, (8, 8))
+        for i, j in T.grid(8, 2):
+            with T.block():
+                T.reads([])
+                T.writes([A[i, j * 4 : j * 4 + 4]])
+                sub_A = T.match_buffer(
+                    A[i, j * 4 : j * 4 + 4], (5)
+                )  # error: shape mismatched between 4 and 5
+                for jj in range(0, 4):
+                    sub_A[i, j * 4 + jj] = 1
 
-def high_dim_store() -> None:
-    with T.block("root"):
-        B = T.allocate([256], "float32", "global")
-        for i, j in T.grid(16, 16):
-            B[i, j] = 1.0  # error: Store is only allowed with one index
+    check_error(buffer_shape_mismatch, 7)
 
 
 def test_high_dim_store():
-    check_error(high_dim_store, 5)
+    def high_dim_store() -> None:
+        with T.block("root"):
+            B = T.allocate([256], "float32", "global")
+            for i, j in T.grid(16, 16):
+                B[i, j] = 1.0  # error: Store is only allowed with one index
 
-
-def block_has_option_vars() -> None:
-    with T.block("root") as x:  # error: block does not support option_vars
-        T.evaluate(0.0)
+    check_error(high_dim_store, 5)
 
 
 def test_block_has_option_vars():
-    check_error(block_has_option_vars, 2)
-
-
-def implicit_root_has_read():
-    T.reads([])  # error: implicit root does not support reads
-    T.evaluate(0.0)
-
-
-def implicit_root_has_write():
-    T.writes([])  # error: implicit root does not support writes
-    T.evaluate(0.0)
+    def block_has_option_vars() -> None:
+        with T.block("root") as x:  # error: block does not support option_vars
+            T.evaluate(0.0)
 
+    check_error(block_has_option_vars, 2)
 
-def implicit_root_has_attrs():
-    T.block_attr({})  # error: implicit root does not support block_attr
-    T.evaluate(0.0)
 
+def test_implicit_root_has_attrs():
+    def implicit_root_has_read():
+        T.reads([])  # error: implicit root does not support reads
+        T.evaluate(0.0)
 
-def implicit_root_has_predicate():
-    T.where(True)  # error: implicit root does not support predicate
-    T.evaluate(0.0)
+    def implicit_root_has_write():
+        T.writes([])  # error: implicit root does not support writes
+        T.evaluate(0.0)
 
+    def implicit_root_has_attrs():
+        T.block_attr({})  # error: implicit root does not support block_attr
+        T.evaluate(0.0)
 
-def implicit_root_has_axes():
-    v = T.axis.S(0, 0)  # error: implicit root does not support axis define
-    T.evaluate(0.0)
+    def implicit_root_has_predicate():
+        T.where(True)  # error: implicit root does not support predicate
+        T.evaluate(0.0)
 
+    def implicit_root_has_axes():
+        v = T.axis.S(0, 0)  # error: implicit root does not support axis define
+        T.evaluate(0.0)
 
-def test_implicit_root_has_attrs():
     check_error(implicit_root_has_read, 2)
     check_error(implicit_root_has_write, 2)
     check_error(implicit_root_has_attrs, 2)
@@ -554,127 +510,115 @@ def test_report_error_root_block():
     assert expected_sub_error_message in str(execinfo.value)
 
 
-def load_var_multiple() -> None:
-    d = T.var("float32")
-    d[2] = d[2, 1]  # error cannot provide two indices to load
-
-
 def test_load_var():
-    check_error(load_var_multiple, 3)
-
+    def load_var_multiple() -> None:
+        d = T.var("float32")
+        d[2] = d[2, 1]  # error cannot provide two indices to load
 
-def store_var_multiple() -> None:
-    d = T.var("float32")
-    d[2, 1] = d[1]  # error cannot provide two indices to store
+    check_error(load_var_multiple, 3)
 
 
 def test_store_var():
-    check_error(store_var_multiple, 3)
-
+    def store_var_multiple() -> None:
+        d = T.var("float32")
+        d[2, 1] = d[1]  # error cannot provide two indices to store
 
-def load_handle(h: T.handle) -> None:
-    h_ = T.match_buffer(h, [1])
-    h_[0] = h[0]  # error cannot load from handle
+    check_error(store_var_multiple, 3)
 
 
 def test_load_handle():
-    check_error(load_var_multiple, 3)
+    def load_handle(h: T.handle) -> None:
+        h_ = T.match_buffer(h, [1])
+        h_[0] = h[0]  # error cannot load from handle
 
-
-def store_handle(h: T.handle) -> None:
-    h_ = T.match_buffer(h, [1])
-    h[0] = h_[0]  # error cannot store to handle
+    check_error(load_handle, 3)
 
 
 def test_store_handle():
-    check_error(store_var_multiple, 3)
-
+    def store_handle(h: T.handle) -> None:
+        h_ = T.match_buffer(h, [1])
+        h[0] = h_[0]  # error cannot store to handle
 
-def binop_bad_ast_type(h: T.handle):
-    h_ = T.match_buffer(h, [1])
-    h_[0] = h + [2]  # error rhs should be a primexpr
+    check_error(store_handle, 3)
 
 
 def test_binop_bad_ast_type():
-    check_error(binop_bad_ast_type, 3)
-
+    def binop_bad_ast_type(h: T.handle):
+        h_ = T.match_buffer(h, [1])
+        h_[0] = h + [2]  # error rhs should be a primexpr
 
-def binop_bad_type(h: T.handle):
-    h_ = T.match_buffer(h, [1])
-    h_[0] = h + 2  # error lhs and rhs should be the same type
+    check_error(binop_bad_ast_type, 3)
 
 
 def test_binop_bad_type():
-    check_error(binop_bad_type, 3)
-
-
-def floor_dtype(h: T.handle):
-    h_ = T.match_buffer(h, [1])
-    h_[0] = T.floor(2)  # error floor requires a dtype
-
+    def binop_bad_type(h: T.handle):
+        h_ = T.match_buffer(h, [1])
+        h_[0] = h + 2  # error lhs and rhs should be the same type
 
-def test_floor_dtype():
-    check_error(floor_dtype, 3)
-
-
-def non_integer_typed_block_iter():
-    with T.block():
-        i = T.axis.S(0.1, 0.1)  # error IterVar requires an integer dtype
+    check_error(binop_bad_type, 3)
 
 
 def test_non_integer_typed_block_iter():
-    check_error(non_integer_typed_block_iter, 3)
-
+    def non_integer_typed_block_iter():
+        with T.block():
+            i = T.axis.S(0.1, 0.1)  # error IterVar requires an integer dtype
 
-def preflattened_buffer_map_align_nonint(foo: T.handle):
-    foo_1 = T.match_buffer(foo, [1])
-    T.preflattened_buffer(
-        foo_1, [1], align="bar"
-    )  # check_error: align: want int or IntImm, got 'bar'
+    check_error(non_integer_typed_block_iter, 3)
 
 
 def test_preflattened_buffer_map_align():
-    check_error(preflattened_buffer_map_align_nonint, 3)
-
+    def preflattened_buffer_map_align_nonint(foo: T.handle):
+        foo_1 = T.match_buffer(foo, [1])
+        T.preflattened_buffer(
+            foo_1, [1], align="bar"
+        )  # check_error: align: want int or IntImm, got 'bar'
 
-def preflattened_buffer_map_offset_factor_nonint(foo: T.handle):
-    foo_1 = T.match_buffer(foo, [1])
-    T.preflattened_buffer(
-        foo_1, [1], offset_factor="bar"
-    )  # check_error: offset_factor: want int or IntImm, got 'bar'
+    check_error(preflattened_buffer_map_align_nonint, 3)
 
 
 def test_preflattened_buffer_map_offset_factor():
-    check_error(preflattened_buffer_map_offset_factor_nonint, 3)
-
-
-def strided_buffer_region(A: T.handle):
-    # do not allow stride in buffer region
-    A = T.match_buffer((128, 128), "int32")
-    with T.block():
-        T.reads([])
-        T.writes([A[0:128:2, 0:128:3]])  # error
-        T.evaluate(T.call_extern("strided_compute", dtype=""))
+    def preflattened_buffer_map_offset_factor_nonint(foo: T.handle):
+        foo_1 = T.match_buffer(foo, [1])
+        T.preflattened_buffer(
+            foo_1, [1], offset_factor="bar"
+        )  # check_error: offset_factor: want int or IntImm, got 'bar'
 
+    check_error(preflattened_buffer_map_offset_factor_nonint, 3)
 
-def access_reversed_slice(A: T.handle):
-    # do not allow reversed slice step
-    A = T.match_buffer((128,), "int32")
-    A[0:128:-1] = T.broadcast(1, 128)  # error
 
+def test_illegal_buffer_slice():
+    def strided_buffer_region(A: T.handle):
+        # do not allow stride in buffer region
+        A = T.match_buffer((128, 128), "int32")
+        with T.block():
+            T.reads([])
+            T.writes([A[0:128:2, 0:128:3]])  # error
+            T.evaluate(T.call_extern("strided_compute", dtype=""))
 
-def access_non_const_slice_length(A: T.handle):
-    # do not allow non-constant slice length
-    A = T.match_buffer((128,), "int32")
-    for i in range(4):
-        T.evaluate(A[0:i:1])  # error
+    def access_reversed_slice(A: T.handle):
+        # do not allow reversed slice step
+        A = T.match_buffer((128,), "int32")
+        A[0:128:-1] = T.broadcast(1, 128)  # error
 
+    def access_non_const_slice_length(A: T.handle):
+        # do not allow non-constant slice length
+        A = T.match_buffer((128,), "int32")
+        for i in range(4):
+            T.evaluate(A[0:i:1])  # error
 
-def test_illegal_buffer_slice():
     check_error(strided_buffer_region, 3)
     check_error(access_reversed_slice, 3)
     check_error(access_non_const_slice_length, 3)
 
 
+def test_syntax_sugar_fail():
+    def loop_syntax_sugar_fail(a: T.handle) -> None:
+        A = T.match_buffer(a, (128,))
+        for i in T.thread_binding(128, 128):
+            A[i] = A[i] * 2.0
+
+    check_error(loop_syntax_sugar_fail, 3)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py
index 849b0fc03d92..32572d392c51 100644
--- a/tests/python/unittest/test_tvmscript_syntax_sugar.py
+++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py
@@ -20,9 +20,8 @@
 import pytest
 import tvm.testing
 from tvm.ir import assert_structural_equal
+from tvm.script import from_source
 from tvm.script import tir as T
-from tvm.script.parser import from_source
-from tvm.testing import check_error
 
 
 @T.prim_func
@@ -89,20 +88,10 @@ def loop_syntax_sugar(a: T.handle) -> None:
                             A[i, j, k, x] = A[i, j, k, x] * 2.0
 
 
-def loop_syntax_sugar_fail(a: T.handle) -> None:
-    A = T.match_buffer(a, (128,))
-    for i in T.thread_binding(128, 128):
-        A[i] = A[i] * 2.0
-
-
 def test_loop_syntax_sugar():
     assert_structural_equal(loop_no_syntax_sugar, loop_syntax_sugar)
 
 
-def test_syntax_sugar_fail():
-    check_error(loop_syntax_sugar_fail, 3)
-
-
 # match buffer - use kwargs
 @T.prim_func
 def elementwise_handle(

From a0dcab2b24ffbef7f3032683b6eaf0e916ff5f3d Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Thu, 10 Nov 2022 11:53:41 +0530
Subject: [PATCH 546/704] [CPP_RPC][ANDROID] Fix cpp_rpc build failure (#13305)

* cpp_rpc build failure for Android devices with NDK version < 23

* * Make environment variable ANDROID_NDK_MAJOR optional.

Co-authored-by: Siva Rama Krishna Reddy B <sivb@blr-ubuntu-ripper.qualcomm.com>
---
 apps/cpp_rpc/CMakeLists.txt    | 10 ++++++++++
 docker/Dockerfile.ci_adreno    |  1 +
 docker/Dockerfile.ci_cpu       |  1 +
 docker/Dockerfile.ci_hexagon   |  1 +
 docker/Dockerfile.demo_android |  1 +
 5 files changed, 14 insertions(+)

diff --git a/apps/cpp_rpc/CMakeLists.txt b/apps/cpp_rpc/CMakeLists.txt
index 97c859045d76..4b2b6ca61d7d 100644
--- a/apps/cpp_rpc/CMakeLists.txt
+++ b/apps/cpp_rpc/CMakeLists.txt
@@ -32,6 +32,16 @@ if (OS)
    endif()
 endif()
 
+if(USE_OPENCL)
+   if (ANDROID_ABI)
+     if(DEFINED ENV{ANDROID_NDK_MAJOR})
+       if($ENV{ANDROID_NDK_MAJOR} VERSION_LESS "23")
+         set_property(TARGET tvm_rpc PROPERTY LINK_FLAGS -fuse-ld=gold)
+       endif()
+     endif()
+   endif()
+endif()
+
 target_include_directories(
   tvm_rpc
   PUBLIC "../../include"
diff --git a/docker/Dockerfile.ci_adreno b/docker/Dockerfile.ci_adreno
index a08b2dfe8c64..2f609a69c45f 100644
--- a/docker/Dockerfile.ci_adreno
+++ b/docker/Dockerfile.ci_adreno
@@ -25,4 +25,5 @@ COPY install/ubuntu_install_androidsdk.sh /install/ubuntu_install_androidsdk.sh
 RUN bash /install/ubuntu_install_androidsdk.sh
 ENV ANDROID_HOME=/opt/android-sdk-linux
 ENV ANDROID_NDK_HOME=/opt/android-sdk-linux/ndk/21.3.6528147
+ENV ANDROID_NDK_MAJOR=21
 ENV PATH /opt/android-sdk-linux/platform-tools:$PATH
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 155f9ef7d914..e71f98ec7af8 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -127,6 +127,7 @@ COPY install/ubuntu_install_androidsdk.sh /install/ubuntu_install_androidsdk.sh
 RUN bash /install/ubuntu_install_androidsdk.sh
 ENV ANDROID_HOME=/opt/android-sdk-linux/
 ENV ANDROID_NDK_HOME=/opt/android-sdk-linux/ndk/21.3.6528147/
+ENV ANDROID_NDK_MAJOR=21
 
 # PaddlePaddle deps
 COPY install/ubuntu_install_paddle.sh /install/ubuntu_install_paddle.sh
diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon
index f1fc7be52484..52849c1c4aaa 100644
--- a/docker/Dockerfile.ci_hexagon
+++ b/docker/Dockerfile.ci_hexagon
@@ -60,6 +60,7 @@ COPY install/ubuntu_install_androidsdk.sh /install/ubuntu_install_androidsdk.sh
 RUN bash /install/ubuntu_install_androidsdk.sh
 ENV ANDROID_HOME=/opt/android-sdk-linux
 ENV ANDROID_NDK_HOME=/opt/android-sdk-linux/ndk/21.3.6528147
+ENV ANDROID_NDK_MAJOR=21
 ENV PATH /opt/android-sdk-linux/platform-tools:$PATH
 
 # Hexagon
diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android
index e66fb3aa3cfa..2f90e5ad664e 100644
--- a/docker/Dockerfile.demo_android
+++ b/docker/Dockerfile.demo_android
@@ -72,3 +72,4 @@ RUN cd /usr && \
 ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/vta/python:${PYTHONPATH}
 ENV ANDROID_HOME=/opt/android-sdk-linux/
 ENV ANDROID_NDK_HOME=/opt/android-sdk-linux/ndk/21.3.6528147/
+ENV ANDROID_NDK_MAJOR=21

From 3a30df670145371d21a45315e0edc771c11c3d63 Mon Sep 17 00:00:00 2001
From: Chris Sullivan <csullivan@octoml.ai>
Date: Thu, 10 Nov 2022 00:39:13 -0800
Subject: [PATCH 547/704] [Hexagon] Make allocate_hexagon_array a hexagon
 contrib API (#13336)

Make 'allocate_hexagon_array' a hexagon contrib API
---
 python/tvm/contrib/hexagon/tools.py           | 39 +++++++++++++++++++
 .../contrib/test_hexagon/infrastructure.py    | 38 ------------------
 .../test_hexagon/test_2d_physical_buffers.py  |  3 +-
 .../test_hexagon/test_benchmark_maxpool2d.py  |  4 +-
 .../contrib/test_hexagon/test_memory_alloc.py |  3 +-
 .../contrib/test_hexagon/test_sigmoid.py      |  3 +-
 .../topi/slice_op/test_argmax_slice.py        |  4 +-
 .../topi/slice_op/test_avg_pool2d_slice.py    |  2 +-
 .../topi/slice_op/test_cast_slice.py          |  3 +-
 .../topi/slice_op/test_clip_slice.py          |  3 +-
 .../topi/slice_op/test_conv2d_slice.py        |  3 +-
 .../slice_op/test_depthwise_conv2d_slice.py   |  3 +-
 .../topi/slice_op/test_dequantize_slice.py    |  2 +-
 .../topi/slice_op/test_max_pool2d_slice.py    |  3 +-
 .../topi/slice_op/test_relu_slice.py          |  3 +-
 .../topi/slice_op/test_softmax_slice.py       |  3 +-
 .../topi/slice_op/test_tanh_slice.py          |  4 +-
 .../topi/test_add_subtract_multiply.py        |  2 +-
 .../test_hexagon/topi/test_depth_to_space.py  |  3 +-
 .../test_hexagon/topi/test_quantize.py        |  2 +-
 .../contrib/test_hexagon/topi/test_reshape.py |  3 +-
 .../test_hexagon/topi/test_resize2d.py        |  4 +-
 22 files changed, 77 insertions(+), 60 deletions(-)
 mode change 100644 => 100755 tests/python/contrib/test_hexagon/test_2d_physical_buffers.py

diff --git a/python/tvm/contrib/hexagon/tools.py b/python/tvm/contrib/hexagon/tools.py
index 8c37261744d5..1c6468a0f5c7 100644
--- a/python/tvm/contrib/hexagon/tools.py
+++ b/python/tvm/contrib/hexagon/tools.py
@@ -20,6 +20,7 @@
 import os
 import pathlib
 from typing import Union
+import numpy
 
 import tvm
 import tvm.contrib.cc as cc
@@ -203,3 +204,41 @@ def export_module(module, out_dir, binary_name="test_binary.so"):
     binary_path = pathlib.Path(out_dir) / binary_name
     module.save(str(binary_path))
     return binary_path
+
+
+def allocate_hexagon_array(
+    dev, tensor_shape=None, dtype=None, data=None, axis_separators=None, mem_scope=None
+):
+    """
+    Allocate a hexagon array which could be a 2D array
+    on physical memory defined by axis_separators
+    """
+    if tensor_shape is None:
+        assert data is not None, "Must provide either tensor shape or numpy data array"
+        tensor_shape = data.shape
+    elif data is not None:
+        assert (
+            tensor_shape == data.shape
+        ), "Mismatch between provided tensor shape and numpy data array shape"
+
+    if dtype is None:
+        assert data is not None, "Must provide either dtype or numpy data array"
+        dtype = data.dtype.name
+    elif data is not None:
+        assert dtype == data.dtype, "Mismatch between provided dtype and numpy data array dtype"
+
+    if axis_separators is None:
+        axis_separators = []
+
+    boundaries = [0, *axis_separators, len(tensor_shape)]
+    physical_shape = [
+        numpy.prod(tensor_shape[dim_i:dim_f])
+        for dim_i, dim_f in zip(boundaries[:-1], boundaries[1:])
+    ]
+
+    arr = tvm.nd.empty(physical_shape, dtype=dtype, device=dev, mem_scope=mem_scope)
+
+    if data is not None:
+        arr.copyfrom(data.reshape(physical_shape))
+
+    return arr._create_view(tensor_shape)
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
index 6f7e1904da2f..c04631156f1e 100644
--- a/tests/python/contrib/test_hexagon/infrastructure.py
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -23,44 +23,6 @@
 from tvm import te
 
 
-def allocate_hexagon_array(
-    dev, tensor_shape=None, dtype=None, data=None, axis_separators=None, mem_scope=None
-):
-    """
-    Allocate a hexagon array which could be a 2D array
-    on physical memory defined by axis_separators
-    """
-    if tensor_shape is None:
-        assert data is not None, "Must provide either tensor shape or numpy data array"
-        tensor_shape = data.shape
-    elif data is not None:
-        assert (
-            tensor_shape == data.shape
-        ), "Mismatch between provided tensor shape and numpy data array shape"
-
-    if dtype is None:
-        assert data is not None, "Must provide either dtype or numpy data array"
-        dtype = data.dtype.name
-    elif data is not None:
-        assert dtype == data.dtype, "Mismatch between provided dtype and numpy data array dtype"
-
-    if axis_separators is None:
-        axis_separators = []
-
-    boundaries = [0, *axis_separators, len(tensor_shape)]
-    physical_shape = [
-        numpy.prod(tensor_shape[dim_i:dim_f])
-        for dim_i, dim_f in zip(boundaries[:-1], boundaries[1:])
-    ]
-
-    arr = tvm.nd.empty(physical_shape, dtype=dtype, device=dev, mem_scope=mem_scope)
-
-    if data is not None:
-        arr.copyfrom(data.reshape(physical_shape))
-
-    return arr._create_view(tensor_shape)
-
-
 def ceildiv(o, d):
     assert o >= 0
     assert d >= 0
diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
old mode 100644
new mode 100755
index 7804ae2e4898..fb41e99a9bcb
--- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
+++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
@@ -31,8 +31,9 @@
 from tvm import te
 from tvm.contrib.hexagon.pytest_plugin import requires_hexagon_toolchain
 from tvm.tir.stmt_functor import post_order_visit
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from .infrastructure import allocate_hexagon_array, get_hexagon_target
+from .infrastructure import get_hexagon_target
 
 # Disabling invalid name as pylint assumes global variables as constants and
 # expects them to be all upper-case. Since these are used as
diff --git a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
index 24d1a3f788cf..42c77a9c9d2d 100644
--- a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
+++ b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
@@ -53,9 +53,9 @@
 from tvm import te, topi, tir
 from tvm.topi import testing
 from tvm.contrib.hexagon.session import Session
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-
-from .infrastructure import allocate_hexagon_array, get_hexagon_target
+from .infrastructure import get_hexagon_target
 from . import benchmark_util as bu
 
 # Pytest seems to require that fixture names exist in the current module.
diff --git a/tests/python/contrib/test_hexagon/test_memory_alloc.py b/tests/python/contrib/test_hexagon/test_memory_alloc.py
index f44e3cd0dc36..a0e3255a5428 100644
--- a/tests/python/contrib/test_hexagon/test_memory_alloc.py
+++ b/tests/python/contrib/test_hexagon/test_memory_alloc.py
@@ -20,8 +20,9 @@
 
 import tvm
 from tvm.script import tir as T
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from .infrastructure import allocate_hexagon_array, get_hexagon_target
+from .infrastructure import get_hexagon_target
 
 
 def generated_func(shape: tuple, dtype: str, axis_separators: list):
diff --git a/tests/python/contrib/test_hexagon/test_sigmoid.py b/tests/python/contrib/test_hexagon/test_sigmoid.py
index e115b188a3f0..cc633795c217 100644
--- a/tests/python/contrib/test_hexagon/test_sigmoid.py
+++ b/tests/python/contrib/test_hexagon/test_sigmoid.py
@@ -23,8 +23,9 @@
 from tvm import te
 from tvm import tir
 from tvm import topi
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from .infrastructure import allocate_hexagon_array, get_hexagon_target
+from .infrastructure import get_hexagon_target
 
 
 def sigmoid_compute(sigmoid_input):
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_argmax_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_argmax_slice.py
index 5f4a594fcfb1..92a951106765 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_argmax_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_argmax_slice.py
@@ -22,7 +22,9 @@
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.contrib.hexagon
-from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from tvm.contrib.hexagon import allocate_hexagon_array
+
+from ...infrastructure import transform_numpy, get_hexagon_target
 
 
 class TestArgMaxSlice:
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py
index 13876da87295..0eedfdbf8da1 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py
@@ -23,8 +23,8 @@
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.topi.hexagon.qnn as qn
+from tvm.contrib.hexagon import allocate_hexagon_array
 from ...infrastructure import (
-    allocate_hexagon_array,
     transform_numpy,
     quantize_np,
     get_hexagon_target,
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_cast_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_cast_slice.py
index 3118c7be8efb..77776bc8da0b 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_cast_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_cast_slice.py
@@ -22,8 +22,9 @@
 import tvm.testing
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...infrastructure import transform_numpy, get_hexagon_target
 
 
 class TestCastF16F32Slice2d:
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_clip_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_clip_slice.py
index e0a2e20a0b6b..d3f9804cd6c3 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_clip_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_clip_slice.py
@@ -22,8 +22,9 @@
 from tvm import te
 import tvm.testing
 import tvm.topi.hexagon.slice_ops as sl
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...infrastructure import transform_numpy, get_hexagon_target
 
 input_layout = tvm.testing.parameter(
     "nhwc-8h2w32c2w-2d",
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py
index c314e9655c9a..e06636cde365 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py
@@ -24,8 +24,9 @@
 import tvm.testing
 from tvm.topi.hexagon.slice_ops.conv2d import conv2d_compute, conv2d_schedule
 from tvm.topi.testing import conv2d_nhwc_python
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...infrastructure import transform_numpy, get_hexagon_target
 
 input_layout = tvm.testing.parameter(
     "nhwc-8h2w32c2w-2d",
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_depthwise_conv2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_depthwise_conv2d_slice.py
index 74e4d05446ed..e5a22e8879b5 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_depthwise_conv2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_depthwise_conv2d_slice.py
@@ -25,8 +25,9 @@
 import tvm.topi.hexagon.qnn as qn
 from tvm.topi.testing import depthwise_conv2d_python_nhwc
 from tvm.topi.hexagon.slice_ops.dwconv2d import dwconv2d_compute, dwconv2d_schedule
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from ...infrastructure import allocate_hexagon_array, transform_numpy, quantize_np
+from ...infrastructure import transform_numpy, quantize_np
 
 
 @tvm.testing.fixture
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_dequantize_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_dequantize_slice.py
index 9b1c5bc5f614..8b9f49458df2 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_dequantize_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_dequantize_slice.py
@@ -23,8 +23,8 @@
 import tvm.testing
 from tvm import te
 from tvm.topi.hexagon import qnn
+from tvm.contrib.hexagon import allocate_hexagon_array
 from ...infrastructure import (
-    allocate_hexagon_array,
     transform_numpy,
     quantize_np,
     get_hexagon_target,
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_max_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_max_pool2d_slice.py
index fcb4411609b2..4cd92f4dd27d 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_max_pool2d_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_max_pool2d_slice.py
@@ -22,8 +22,9 @@
 import tvm.testing
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.hexagon.slice_ops as sl
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...infrastructure import transform_numpy, get_hexagon_target
 from ...pytest_util import (
     get_multitest_ids,
     create_populated_numpy_ndarray,
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_relu_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_relu_slice.py
index 93a8d77827bf..1430551df719 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_relu_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_relu_slice.py
@@ -21,8 +21,9 @@
 import tvm.testing
 from tvm.topi.hexagon.slice_ops.relu import relu_compute, relu_stir_schedule
 from tvm import te
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ...infrastructure import transform_numpy, get_hexagon_target
 
 
 @tvm.testing.fixture
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_softmax_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_softmax_slice.py
index a3db1b6dcdbe..2707ed3a5af1 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_softmax_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_softmax_slice.py
@@ -20,8 +20,7 @@
 from tvm import te
 from tvm.topi.testing import softmax_python
 import tvm.topi.hexagon.slice_ops as sl
-
-from ...infrastructure import allocate_hexagon_array
+from tvm.contrib.hexagon import allocate_hexagon_array
 
 
 def transform_numpy(arr_np, layout):
diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_tanh_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_tanh_slice.py
index f8c14ef934a1..6297ef2c1e6e 100644
--- a/tests/python/contrib/test_hexagon/topi/slice_op/test_tanh_slice.py
+++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_tanh_slice.py
@@ -22,7 +22,9 @@
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.contrib.hexagon
-from ...infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from tvm.contrib.hexagon import allocate_hexagon_array
+
+from ...infrastructure import transform_numpy, get_hexagon_target
 
 # pylint: disable=invalid-name
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
index d689888d6e85..e0bb6b5864d3 100644
--- a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
+++ b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py
@@ -21,8 +21,8 @@
 from tvm import te
 import tvm.topi.hexagon.slice_ops as sl
 import tvm.topi.hexagon.qnn as qn
+from tvm.contrib.hexagon import allocate_hexagon_array
 from ..infrastructure import (
-    allocate_hexagon_array,
     transform_numpy,
     quantize_np,
     get_hexagon_target,
diff --git a/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
index 0cb41b595255..7d4afb953a50 100644
--- a/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
+++ b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py
@@ -25,8 +25,9 @@
 import tvm.testing
 from tvm.topi.hexagon.slice_ops.depth_to_space import d2s_compute, d2s_schedule
 from tvm.topi.testing import depth_to_space_python
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ..infrastructure import transform_numpy, get_hexagon_target
 
 
 class TestD2SSlice:
diff --git a/tests/python/contrib/test_hexagon/topi/test_quantize.py b/tests/python/contrib/test_hexagon/topi/test_quantize.py
index a188f7cb2fe1..ac4f4d4e3047 100644
--- a/tests/python/contrib/test_hexagon/topi/test_quantize.py
+++ b/tests/python/contrib/test_hexagon/topi/test_quantize.py
@@ -20,8 +20,8 @@
 import tvm
 from tvm import te
 import tvm.topi.hexagon.qnn as s1
+from tvm.contrib.hexagon import allocate_hexagon_array
 from ..infrastructure import (
-    allocate_hexagon_array,
     transform_numpy,
     quantize_np,
     get_hexagon_target,
diff --git a/tests/python/contrib/test_hexagon/topi/test_reshape.py b/tests/python/contrib/test_hexagon/topi/test_reshape.py
index 33bb31902eaa..51ac12506023 100644
--- a/tests/python/contrib/test_hexagon/topi/test_reshape.py
+++ b/tests/python/contrib/test_hexagon/topi/test_reshape.py
@@ -21,8 +21,9 @@
 import tvm.testing
 import tvm.topi.hexagon.slice_ops as sl
 from tvm import te
+from tvm.contrib.hexagon import allocate_hexagon_array
 
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from ..infrastructure import transform_numpy, get_hexagon_target
 
 BATCH_FLATTEN_FP16_TESTS = (
     ([1, 1, 1, 2048], [1, 2048], "nhwc-1024c-2d", "nc-1024-2d", "float16"),
diff --git a/tests/python/contrib/test_hexagon/topi/test_resize2d.py b/tests/python/contrib/test_hexagon/topi/test_resize2d.py
index 44d9c95a2f06..c0c6e7ca0fb4 100644
--- a/tests/python/contrib/test_hexagon/topi/test_resize2d.py
+++ b/tests/python/contrib/test_hexagon/topi/test_resize2d.py
@@ -22,7 +22,9 @@
 from tvm import te
 from tvm.topi.testing import resize2d_python
 import tvm.topi.hexagon as s1
-from ..infrastructure import allocate_hexagon_array, transform_numpy, get_hexagon_target
+from tvm.contrib.hexagon import allocate_hexagon_array
+
+from ..infrastructure import transform_numpy, get_hexagon_target
 
 
 class TestResize2d:

From 54bd5e1f5fa52c498b4a4ff13d795daf52a81bfd Mon Sep 17 00:00:00 2001
From: Sergei Smirnov <89378719+sergey-grovety@users.noreply.github.com>
Date: Thu, 10 Nov 2022 12:01:05 +0300
Subject: [PATCH 548/704] [microNPU] Fixed MergeConstants pass on striped
 networks (#13281)

This PR fixes the bug in MergeConstants pass on striped networks on Ethos-U NPU.

The issue was caused by _DivideConstants_ pass which is introducing new mod parameters and changing their order. So ethosu_write parameter in some cases is moved from the end of the list to the middle.
E.g. from:
`[ethos-u_0_i0, p1, p2, p3, p4, p5, p6, ethosu_write]`
To:
`[ethos-u_0_i0, p1, p2, ethosu_write, placeholder, placeholder, placeholder, placeholder, placeholder, placeholder, placeholder, placeholder]`

Updated version of the  _GetArgsToMergeWithoutArgsNotInConstDict_ and _MakeNewConstDict_ methods in passes.cc can now correctly modify const_dict according to the new parameter list.
---
 .../backend/contrib/ethosu/tir/compiler.py    |   5 +-
 src/tir/contrib/ethosu/passes.cc              |  27 ++-
 .../test_ethosu/test_encode_constants.py      |  32 +--
 .../test_ethosu/test_merge_constants.py       | 189 ++++++++++++++++++
 4 files changed, 224 insertions(+), 29 deletions(-)

diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
index aaac59ad4a52..4133aff6ef51 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
@@ -91,10 +91,7 @@ def lower_ethosu(sch, args, const_dict, name="main"):
         mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod)
         mod = ethosu_passes.HoistAllocates()(mod)
         mod = tvm.tir.transform.RemoveNoOp()(mod)
-        #  MergeConstant pass currently does not support striped schedules.
-        #  It requires further investigation.
-        if not util.is_striping_enabled():
-            mod, const_dict = ethosu_passes.MergeConstants(const_dict)(mod)
+        mod, const_dict = ethosu_passes.MergeConstants(const_dict)(mod)
         mod = ethosu_passes.CopyComputeReordering()(mod)
 
         # When striping is enabled and if storage_rewrite is not run
diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc
index 2f6fa8f3ea33..d51ffbf833a4 100644
--- a/src/tir/contrib/ethosu/passes.cc
+++ b/src/tir/contrib/ethosu/passes.cc
@@ -514,7 +514,7 @@ class MergeConstantsMutator : public StmtExprMutator {
 
     // Make the new const dict
     Array<Array<IntImm>> args_to_merge{GetArgsToMerge(main_func->buffer_map, main_func->params)};
-    Array<Array<IntImm>> buffers_to_merge{
+    Map<IntImm, Array<IntImm>> buffers_to_merge{
         GetArgsToMergeWithoutArgsNotInConstDict(args_to_merge, const_dict)};
     Map<IntImm, runtime::NDArray> new_const_dict{MakeNewConstDict(buffers_to_merge, const_dict)};
 
@@ -832,9 +832,11 @@ class MergeConstantsMutator : public StmtExprMutator {
     return vector;
   }
 
-  Array<Array<IntImm>> GetArgsToMergeWithoutArgsNotInConstDict(
+  Map<IntImm, Array<IntImm>> GetArgsToMergeWithoutArgsNotInConstDict(
       const Array<Array<IntImm>>& args_to_merge, const Map<IntImm, runtime::NDArray>& const_dict) {
-    Array<Array<IntImm>> new_args_to_merge{};
+    Map<IntImm, Array<IntImm>> new_args_to_merge{};
+    bool first_arg_found = false;
+    int64_t new_arg_key = 0;  // the updated key of the merged const_dict
     for (Array<IntImm> args : args_to_merge) {
       IntImm key{args[0]};
       auto it = std::find_if(const_dict.begin(), const_dict.end(),
@@ -842,21 +844,29 @@ class MergeConstantsMutator : public StmtExprMutator {
                                return pair.first->value == key->value;
                              });
       if (it != const_dict.end()) {
-        new_args_to_merge.push_back(args);
+        if (first_arg_found == false) {
+          first_arg_found = true;
+          new_arg_key = key->value;
+        }
+        new_args_to_merge.Set(IntImm(DataType::Int(64), new_arg_key), args);
+      }
+      if (first_arg_found) {
+        new_arg_key++;
       }
     }
     return new_args_to_merge;
   }
 
-  Map<IntImm, runtime::NDArray> MakeNewConstDict(const Array<Array<IntImm>>& args_to_merge,
+  Map<IntImm, runtime::NDArray> MakeNewConstDict(const Map<IntImm, Array<IntImm>>& args_to_merge,
                                                  Map<IntImm, runtime::NDArray> const_dict) {
     Map<IntImm, runtime::NDArray> new_const_dict{};
     if (args_to_merge.size() == 0) {
       return new_const_dict;
     }
 
-    int64_t key = args_to_merge[0][0]->value;
-    for (Array<IntImm> args : args_to_merge) {
+    for (auto const& elem : args_to_merge) {
+      IntImm key = elem.first;
+      Array<IntImm> args = elem.second;
       int64_t size = 0;
       for (IntImm arg : args) {
         auto it = std::find_if(const_dict.begin(), const_dict.end(),
@@ -876,8 +886,7 @@ class MergeConstantsMutator : public StmtExprMutator {
         arg_constant.CopyToBytes(static_cast<uint8_t*>(constant->data) + offset, nbytes);
         offset += nbytes;
       }
-      new_const_dict.Set(IntImm(DataType::Int(64), key), constant);
-      key += 1;
+      new_const_dict.Set(key, constant);
     }
     return new_const_dict;
   }
diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py
index 6ffbf22312ff..c751d44b6156 100644
--- a/tests/python/contrib/test_ethosu/test_encode_constants.py
+++ b/tests/python/contrib/test_ethosu/test_encode_constants.py
@@ -340,15 +340,15 @@ def _get_func():
 @tvm.script.ir_module
 class MixedReadU55:
     @T.prim_func
-    def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(112,), "uint8"]) -> None:
+    def main(ifm: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer1 = T.buffer_decl([112], "uint8")
         buffer3 = T.buffer_decl([112], "uint8")
         buffer5 = T.buffer_decl([112], "uint8")
+        buffer7 = T.buffer_decl([112], "uint8")
         buffer9 = T.buffer_decl([592], "uint8")
         buffer10 = T.buffer_decl([160], "uint8")
-        buffer11 = T.buffer_decl([2048], "int8")
         # body
         p1_data = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True})
         p1 = T.buffer_decl([112], "uint8", data=p1_data)
@@ -357,21 +357,21 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(112,)
         p2_data = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True})
         p2 = T.buffer_decl([112], "uint8", data=p2_data)
         T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 112, p1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 592, T.int8(-1), T.int8(-1), 12, buffer10[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 592, T.int8(-1), T.int8(-1), 12, buffer10[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 112, p2[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer11[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p1[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p1[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 112, p1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer11[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, T.int8(-1), T.int8(-1), 12, p2[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 112, p2[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer11[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p1[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer11[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, T.int8(-1), T.int8(-1), 12, p2[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, T.int8(-1), T.int8(-1), 12, p2[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 112, p2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p1[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, T.int8(-1), T.int8(-1), 12, p2[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
 
 
 @tvm.script.ir_module
 class MixedReadU65:
     @T.prim_func
-    def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(128,), "uint8"]) -> None:
+    def main(ifm: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
 
@@ -381,7 +381,7 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(128,)
         buffer3 = T.buffer_decl([128], dtype="uint8")
         buffer4 = T.buffer_decl([608], dtype="uint8")
         buffer5 = T.buffer_decl([160], dtype="uint8")
-        buffer6 = T.buffer_decl([2048], dtype="int8")
+        buffer6 = T.buffer_decl([128], dtype="uint8")
         p1_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
         p1 = T.buffer_decl([128], "uint8", data=p1_data)
         p2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
@@ -389,14 +389,14 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(128,)
         p3_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
         p3 = T.buffer_decl([128], "uint8", data=p3_data)
         T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer4[0], 304, buffer4[304], 304, 12, buffer5[0], 80, buffer5[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer4[0], 304, buffer4[304], 304, 12, buffer5[0], 80, buffer5[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p3[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer6[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 128, p1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer6[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 128, p3[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer6[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer6[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 128, p3[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
 # fmt: on
 
diff --git a/tests/python/contrib/test_ethosu/test_merge_constants.py b/tests/python/contrib/test_ethosu/test_merge_constants.py
index 337b5c70d125..a5adcfceac83 100644
--- a/tests/python/contrib/test_ethosu/test_merge_constants.py
+++ b/tests/python/contrib/test_ethosu/test_merge_constants.py
@@ -441,6 +441,195 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint
     check_const_dictionaries(const_dict, new_const_dict)
 
 
+def test_arbitrary_argument_order():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:
+        @T.prim_func
+        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint8"], buffer2: T.Buffer[(96,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"], buffer3: T.Buffer[(368,), "uint8"], buffer4: T.Buffer[(96,), "uint8"]) -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            # buffer definition
+            T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
+            T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+            # body
+            p1_data = T.allocate([368], "uint8", "global")
+            p1 = T.buffer_decl([368], "uint8", data=p1_data)
+            p2_data = T.allocate([96], "uint8", "global")
+            p2 = T.buffer_decl([96], "uint8", data=p2_data)
+            p3_data = T.allocate([368], "uint8", "global")
+            p3 = T.buffer_decl([368], "uint8", data=p3_data)
+            p4_data = T.allocate([96], "uint8", "global")
+            p4 = T.buffer_decl([96], "uint8", data=p4_data)
+            T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p2[0], 48, p2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 368, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[2048], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 192, p3[192], 176, 12, p4[0], 48, p4[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        __tvm_meta__ = None
+
+
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"], buffer2: T.Buffer[(464,), "uint8"]) -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            # body
+            p1_data = T.allocate([464], "uint8", "global")
+            p1 = T.buffer_decl([464], "uint8", data=p1_data)
+            p2_data = T.allocate([464], "uint8", "global")
+            p2 = T.buffer_decl([464], "uint8", data=p2_data)
+            T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 464, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 464, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[2048], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 192, p2[192], 176, 12, p2[368], 48, p2[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    __tvm_meta__ = None
+    # fmt: on
+
+    const_dict = {
+        1: np.array([1], dtype=np.uint8),
+        2: np.array([2], dtype=np.uint8),
+        4: np.array([4], dtype=np.uint8),
+        5: np.array([5], dtype=np.uint8),
+    }
+    new_const_dict = {
+        1: np.concatenate((const_dict[1], const_dict[2])),
+        3: np.concatenate((const_dict[4], const_dict[5])),
+    }
+    test_mod, const_dict = MergeConstants(const_dict)(InputModule)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, False)
+    check_const_dictionaries(const_dict, new_const_dict)
+
+
+def test_arbitrary_argument_order_const_split():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:
+        @T.prim_func
+        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"], buffer2: T.Buffer[(96,), "uint8"], buffer3: T.Buffer[(368,), "uint8"], buffer4: T.Buffer[(96,), "uint8"]) -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            # buffer definition
+            T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
+            T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+            # body
+            p1_data = T.allocate([368], "uint8", "global")
+            p1 = T.buffer_decl([368], "uint8", data=p1_data)
+            p2_data = T.allocate([96], "uint8", "global")
+            p2 = T.buffer_decl([96], "uint8", data=p2_data)
+            p3_data = T.allocate([368], "uint8", "global")
+            p3 = T.buffer_decl([368], "uint8", data=p3_data)
+            p4_data = T.allocate([96], "uint8", "global")
+            p4 = T.buffer_decl([96], "uint8", data=p4_data)
+            T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p2[0], 48, p2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 368, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[2048], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 192, p3[192], 176, 12, p4[0], 48, p4[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        __tvm_meta__ = None
+
+
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"], buffer2: T.Buffer[(464,), "uint8"]) -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            # body
+            p1_data = T.allocate([464], "uint8", "global")
+            p1 = T.buffer_decl([464], "uint8", data=p1_data)
+            p2_data = T.allocate([464], "uint8", "global")
+            p2 = T.buffer_decl([464], "uint8", data=p2_data)
+            T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 464, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 464, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[2048], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 192, p2[192], 176, 12, p2[368], 48, p2[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    __tvm_meta__ = None
+    # fmt: on
+
+    const_dict = {
+        1: np.array([1], dtype=np.uint8),
+        3: np.array([3], dtype=np.uint8),
+        4: np.array([4], dtype=np.uint8),
+        5: np.array([5], dtype=np.uint8),
+    }
+    new_const_dict = {
+        1: np.concatenate((const_dict[1], const_dict[3])),
+        3: np.concatenate((const_dict[4], const_dict[5])),
+    }
+    test_mod, const_dict = MergeConstants(const_dict)(InputModule)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+    check_const_dictionaries(const_dict, new_const_dict)
+
+
+def test_arbitrary_argument_order_const_split_mixed():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:
+        @T.prim_func
+        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint8"], buffer2: T.Buffer[(368,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"], buffer3: T.Buffer[(96,), "uint8"], buffer4: T.Buffer[(96,), "uint8"]) -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            # buffer definition
+            T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
+            T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+            # body
+            p1_data = T.allocate([368], "uint8", "global")
+            p1 = T.buffer_decl([368], "uint8", data=p1_data)
+            p2_data = T.allocate([368], "uint8", "global")
+            p2 = T.buffer_decl([368], "uint8", data=p2_data)
+            p3_data = T.allocate([96], "uint8", "global")
+            p3 = T.buffer_decl([96], "uint8", data=p3_data)
+            p4_data = T.allocate([96], "uint8", "global")
+            p4 = T.buffer_decl([96], "uint8", data=p4_data)
+            T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 96, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p3[0], 48, p3[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 368, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[2048], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 192, p2[192], 176, 12, p4[0], 48, p4[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        __tvm_meta__ = None
+
+
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint8"], buffer2: T.Buffer[(464,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"]) -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            # body
+            p1_data = T.allocate([464], "uint8", "global")
+            p1 = T.buffer_decl([464], "uint8", data=p1_data)
+            p2_data = T.allocate([464], "uint8", "global")
+            p2 = T.buffer_decl([464], "uint8", data=p2_data)
+            T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 464, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 464, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[2048], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 192, p2[192], 176, 12, p2[368], 48, p2[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    __tvm_meta__ = None
+    # fmt: on
+
+    const_dict = {
+        1: np.array([1], dtype=np.uint8),
+        2: np.array([2], dtype=np.uint8),
+        4: np.array([4], dtype=np.uint8),
+        5: np.array([5], dtype=np.uint8),
+    }
+    new_const_dict = {
+        1: np.concatenate((const_dict[1], const_dict[4])),
+        2: np.concatenate((const_dict[2], const_dict[5])),
+    }
+    test_mod, const_dict = MergeConstants(const_dict)(InputModule)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+    check_const_dictionaries(const_dict, new_const_dict)
+
+
 def test_cycle_count():
     # fmt: off
     @tvm.script.ir_module

From 23ade0c14b29c2c2710f6580035878f130eea52b Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 10 Nov 2022 13:49:20 +0000
Subject: [PATCH 549/704] [TVMC] Global pass context for compile and tune
 (#13309)

* [TVMC] Global pass context for compile and tune

Comes as a followup from conversations in #13216. By making the pass
context a global value for both `compile` and `tune` commands, we can
ensure the pass context is exactly as the user expected and also
test components such as `convert_graph_layout` under a pass context
suitable for testing (e.g. add instruments). With this change, it
becomes the users responsibility to ensure the PassContext they
select is suitable for the passes that will be run. By default,
`opt_level` remains as 3 so current workflows that do not alter the pass
context from the command line / TVMC Python API should not be affected.

Change-Id: I7a601daf6fbe664f77bce1b45efeb7ca29f621b3

* fix vitis-ai test and typo

Change-Id: I04f5bd031ae4717825f42e373bcb0e1e2c1c9d90
---
 python/tvm/driver/tvmc/autotuner.py        | 197 +++++++++++----------
 python/tvm/driver/tvmc/compiler.py         | 118 ++++++------
 python/tvm/driver/tvmc/transform.py        |  11 +-
 tests/python/driver/tvmc/conftest.py       |  16 ++
 tests/python/driver/tvmc/test_autotuner.py |  16 ++
 tests/python/driver/tvmc/test_compiler.py  |   8 +-
 tests/python/driver/tvmc/test_frontends.py |  17 +-
 tests/python/driver/tvmc/test_transform.py |  65 ++++---
 8 files changed, 247 insertions(+), 201 deletions(-)

diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py
index f9ba427ffaa6..98293e596b5d 100644
--- a/python/tvm/driver/tvmc/autotuner.py
+++ b/python/tvm/driver/tvmc/autotuner.py
@@ -389,110 +389,115 @@ def tune_model(
     # model is fixed. For now, creating a clone avoids the issue.
     mod = deepcopy(tvmc_model.mod)
     params = tvmc_model.params
-    if tuning_records is None:
-        tuning_records = tvmc_model.default_tuning_records_path()
-
-    for codegen_from_cli in extra_targets:
-        codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"])
-        partition_function = codegen["pass_pipeline"]
-        mod = partition_function(mod, params, **codegen_from_cli["opts"])
-
-    # min_repeat_ms should be:
-    # a. the value provided by the user, if any, or
-    # b. 0ms in case target is "cpu"; otherwise 1000ms
-    if min_repeat_ms is None:
-        min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000
-        logger.info("Default --min-repeat-ms for this target is %s", min_repeat_ms)
-
-    if rpc_key:
-        if hostname is None or port is None:
-            raise TVMCException(
-                "You must provide a hostname and port to connect to a remote RPC device."
-            )
-        if isinstance(port, str):
-            port = int(port)
-
-        logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key, hostname, port)
-
-        runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner
-        runner = runner_ctor(
-            key=rpc_key,
-            host=hostname,
-            port=port,
-            number=number,
-            repeat=repeat,
-            n_parallel=parallel,
-            timeout=timeout,
-            min_repeat_ms=min_repeat_ms,
-        )
-    else:
-        logger.info("Starting localhost tuning.")
-        runner_ctor = (
-            auto_scheduler.LocalRPCMeasureContext if enable_autoscheduler else autotvm.LocalRunner
-        )
-        local_server = runner_ctor(
-            number=number,
-            repeat=repeat,
-            timeout=timeout,
-            min_repeat_ms=min_repeat_ms,
-        )
 
-        # For autoscheduling on some devices, we need to maintain a LocalRPCMeasureContext object.
-        if enable_autoscheduler:
-            runner = local_server.runner
+    with tvm.transform.PassContext(opt_level=3):
+        if tuning_records is None:
+            tuning_records = tvmc_model.default_tuning_records_path()
+
+        for codegen_from_cli in extra_targets:
+            codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"])
+            partition_function = codegen["pass_pipeline"]
+            mod = partition_function(mod, params, **codegen_from_cli["opts"])
+
+        # min_repeat_ms should be:
+        # a. the value provided by the user, if any, or
+        # b. 0ms in case target is "cpu"; otherwise 1000ms
+        if min_repeat_ms is None:
+            min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000
+            logger.info("Default --min-repeat-ms for this target is %s", min_repeat_ms)
+
+        if rpc_key:
+            if hostname is None or port is None:
+                raise TVMCException(
+                    "You must provide a hostname and port to connect to a remote RPC device."
+                )
+            if isinstance(port, str):
+                port = int(port)
+
+            logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key, hostname, port)
+
+            runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner
+            runner = runner_ctor(
+                key=rpc_key,
+                host=hostname,
+                port=port,
+                number=number,
+                repeat=repeat,
+                n_parallel=parallel,
+                timeout=timeout,
+                min_repeat_ms=min_repeat_ms,
+            )
         else:
-            runner = local_server
+            logger.info("Starting localhost tuning.")
+            runner_ctor = (
+                auto_scheduler.LocalRPCMeasureContext
+                if enable_autoscheduler
+                else autotvm.LocalRunner
+            )
+            local_server = runner_ctor(
+                number=number,
+                repeat=repeat,
+                timeout=timeout,
+                min_repeat_ms=min_repeat_ms,
+            )
 
-    if enable_autoscheduler:
+            # For autoscheduling on some devices, we need to maintain a
+            # LocalRPCMeasureContext object.
+            if enable_autoscheduler:
+                runner = local_server.runner
+            else:
+                runner = local_server
 
-        tasks, weights = autoscheduler_get_tuning_tasks(
-            mod=mod,
-            params=params,
-            target=target,
-            alter_layout=desired_layout,
-            hardware_params=hardware_params,
-            include_simple_tasks=include_simple_tasks,
-        )
+        if enable_autoscheduler:
 
-        # Create the autoscheduler tuning options
-        tuning_options = auto_scheduler.TuningOptions(
-            num_measure_trials=trials,
-            measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)],
-            runner=runner,
-            early_stopping=early_stopping,
-        )
+            tasks, weights = autoscheduler_get_tuning_tasks(
+                mod=mod,
+                params=params,
+                target=target,
+                alter_layout=desired_layout,
+                hardware_params=hardware_params,
+                include_simple_tasks=include_simple_tasks,
+            )
+
+            # Create the autoscheduler tuning options
+            tuning_options = auto_scheduler.TuningOptions(
+                num_measure_trials=trials,
+                measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)],
+                runner=runner,
+                early_stopping=early_stopping,
+            )
 
-        logger.info("Autoscheduling with configuration: %s", tuning_options)
+            logger.info("Autoscheduling with configuration: %s", tuning_options)
 
-        # Schedule the tasks (i.e., produce a schedule for each task)
-        schedule_tasks(tasks, weights, tuning_options, prior_records, log_estimated_latency)
-    else:
-        tasks = autotvm_get_tuning_tasks(
-            mod=mod,
-            params=params,
-            target=target,
-            alter_layout=desired_layout,
-        )
+            # Schedule the tasks (i.e., produce a schedule for each task)
+            schedule_tasks(tasks, weights, tuning_options, prior_records, log_estimated_latency)
+        else:
+            tasks = autotvm_get_tuning_tasks(
+                mod=mod,
+                params=params,
+                target=target,
+                alter_layout=desired_layout,
+            )
 
-        # In autotvm, trials is specified per task. We can convert the per-model input
-        # provided to per-task trials by dividing by the number of tasks.
-        trials = int(trials / max(len(tasks), 1))
-        logger.info("Autotuning with %d trials per task.", trials)
-
-        tuning_options = {
-            "tuner": tuner,
-            "trials": trials,
-            "early_stopping": early_stopping,
-            "measure_option": autotvm.measure_option(
-                builder=autotvm.LocalBuilder(build_func="default"), runner=runner
-            ),
-            "tuning_records": prior_records,
-        }
-        logger.info("Autotuning with configuration: %s", tuning_options)
-
-        tune_tasks(tasks, tuning_records, **tuning_options)
-
-    return tuning_records
+            # In autotvm, trials is specified per task. We can convert the per-model input
+            # provided to per-task trials by dividing by the number of tasks.
+            trials = int(trials / max(len(tasks), 1))
+            logger.info("Autotuning with %d trials per task.", trials)
+
+            tuning_options = {
+                "tuner": tuner,
+                "trials": trials,
+                "early_stopping": early_stopping,
+                "measure_option": autotvm.measure_option(
+                    builder=autotvm.LocalBuilder(build_func="default"), runner=runner
+                ),
+                "tuning_records": prior_records,
+            }
+            logger.info("Autotuning with configuration: %s", tuning_options)
+
+            tune_tasks(tasks, tuning_records, **tuning_options)
+
+        return tuning_records
 
 
 def autotvm_get_tuning_tasks(
diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py
index c24d36c432df..eec80820cdb1 100644
--- a/python/tvm/driver/tvmc/compiler.py
+++ b/python/tvm/driver/tvmc/compiler.py
@@ -292,39 +292,42 @@ def compile_model(
 
     config = parse_configs(pass_context_configs)
 
-    if desired_layout:
-        mod = convert_graph_layout(mod, desired_layout)
-
     tvm_target, extra_targets = target_from_cli(target, additional_target_options)
     tvm_target, target_host = Target.canon_target_and_host(tvm_target, target_host)
 
+    partition_functions = []
+    partition_opts = []
     for codegen_from_cli in extra_targets:
         codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"])
-        partition_function = codegen["pass_pipeline"]
-
+        partition_functions.append(codegen["pass_pipeline"])
+        partition_opts.append(codegen_from_cli["opts"])
         if codegen["config_key"] is not None:
             config[codegen["config_key"]] = codegen_from_cli["opts"]
-        with tvm.transform.PassContext(config=config):
-            mod = partition_function(mod, params, mod_name=mod_name, **codegen_from_cli["opts"])
-
-    if tuning_records and os.path.exists(tuning_records):
-        logger.debug("tuning records file provided: %s", tuning_records)
-
-        use_autoscheduler = True
-        try:
-            auto_scheduler.load_records(tuning_records)
-        except tvm._ffi.base.TVMError:
-            use_autoscheduler = False
-
-        if use_autoscheduler:
-            with auto_scheduler.ApplyHistoryBest(tuning_records):
-                config["relay.backend.use_auto_scheduler"] = True
-                with tvm.transform.PassContext(
-                    opt_level=opt_level,
-                    config=config,
-                    disabled_pass=disabled_pass,
-                    instruments=instruments,
-                ):
+
+    with tvm.transform.PassContext(
+        opt_level=opt_level,
+        config=config,
+        disabled_pass=disabled_pass,
+        instruments=instruments,
+    ):
+        if desired_layout:
+            mod = convert_graph_layout(mod, desired_layout)
+
+        for partition_function, opts in zip(partition_functions, partition_opts):
+            mod = partition_function(mod, params, mod_name=mod_name, **opts)
+
+        if tuning_records and os.path.exists(tuning_records):
+            logger.debug("tuning records file provided: %s", tuning_records)
+
+            use_autoscheduler = True
+            try:
+                auto_scheduler.load_records(tuning_records)
+            except tvm._ffi.base.TVMError:
+                use_autoscheduler = False
+
+            if use_autoscheduler:
+                with auto_scheduler.ApplyHistoryBest(tuning_records):
+                    config["relay.backend.use_auto_scheduler"] = True
                     logger.debug("building relay graph with autoscheduler")
                     graph_module = build(
                         mod,
@@ -336,14 +339,8 @@ def compile_model(
                         mod_name=mod_name,
                         workspace_pools=workspace_pools,
                     )
-        else:
-            with autotvm.apply_history_best(tuning_records):
-                with tvm.transform.PassContext(
-                    opt_level=opt_level,
-                    config=config,
-                    disabled_pass=disabled_pass,
-                    instruments=instruments,
-                ):
+            else:
+                with autotvm.apply_history_best(tuning_records):
                     logger.debug("building relay graph with tuning records")
                     graph_module = build(
                         mod,
@@ -355,10 +352,7 @@ def compile_model(
                         mod_name=mod_name,
                         workspace_pools=workspace_pools,
                     )
-    else:
-        with tvm.transform.PassContext(
-            opt_level=opt_level, config=config, disabled_pass=disabled_pass, instruments=instruments
-        ):
+        else:
             logger.debug("building relay graph (no tuning records provided)")
             graph_module = build(
                 mod,
@@ -371,32 +365,32 @@ def compile_model(
                 workspace_pools=workspace_pools,
             )
 
-    # Generate output dump files with sources
-    if dump_code is None:
-        dump_code = []
-    if not isinstance(dump_code, list):
-        dump_code = [dump_code]
-    dumps = {}
-    for source_type in dump_code:
-        if use_vm:
-            lib = graph_module.lib
-        else:
-            lib = graph_module.get_lib()
-        # TODO lib.get_source call have inconsistent behavior for unsupported
-        #      formats (@leandron).
-        source = str(mod) if source_type == "relay" else lib.get_source(source_type)
-        dumps[source_type] = source
-
-    # Create a new tvmc model package object from the graph definition.
-    package_path = tvmc_model.export_package(
-        graph_module, package_path, cross, cross_options, output_format
-    )
+        # Generate output dump files with sources
+        if dump_code is None:
+            dump_code = []
+        if not isinstance(dump_code, list):
+            dump_code = [dump_code]
+        dumps = {}
+        for source_type in dump_code:
+            if use_vm:
+                lib = graph_module.lib
+            else:
+                lib = graph_module.get_lib()
+            # TODO lib.get_source call have inconsistent behavior for unsupported
+            #      formats (@leandron).
+            source = str(mod) if source_type == "relay" else lib.get_source(source_type)
+            dumps[source_type] = source
+
+        # Create a new tvmc model package object from the graph definition.
+        package_path = tvmc_model.export_package(
+            graph_module, package_path, cross, cross_options, output_format
+        )
 
-    # Write dumps to file.
-    if dumps:
-        save_dumps(package_path, dumps)
+        # Write dumps to file.
+        if dumps:
+            save_dumps(package_path, dumps)
 
-    return TVMCPackage(package_path)
+        return TVMCPackage(package_path)
 
 
 def build(
diff --git a/python/tvm/driver/tvmc/transform.py b/python/tvm/driver/tvmc/transform.py
index 51c9e52f21d6..8527c48b6b04 100644
--- a/python/tvm/driver/tvmc/transform.py
+++ b/python/tvm/driver/tvmc/transform.py
@@ -54,10 +54,7 @@ def convert_graph_layout(mod, desired_layout):
         ]
     )
 
-    with transform.PassContext(opt_level=3):
-        try:
-            return seq(mod)
-        except Exception as err:
-            raise TVMCException(
-                "Error converting layout to {0}: {1}".format(desired_layout, str(err))
-            )
+    try:
+        return seq(mod)
+    except Exception as err:
+        raise TVMCException("Error converting layout to {0}: {1}".format(desired_layout, str(err)))
diff --git a/tests/python/driver/tvmc/conftest.py b/tests/python/driver/tvmc/conftest.py
index 8009448bff77..e0dbeebf9871 100644
--- a/tests/python/driver/tvmc/conftest.py
+++ b/tests/python/driver/tvmc/conftest.py
@@ -23,6 +23,8 @@
 
 from PIL import Image
 
+import tvm
+from tvm import relay
 from tvm.driver import tvmc
 
 from tvm.contrib.download import download_testdata
@@ -284,3 +286,17 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(3, 3, 5, 5),
     with open(file_path, "w") as relay_text:
         relay_text.write(RELAY_MODEL)
     return file_path
+
+
+@pytest.fixture(scope="session")
+def relay_conv2d():
+    """
+    Simple conv2d Relay implementation.
+    """
+    dtype = "float32"
+
+    x = relay.var("x", shape=(1, 4, 2, 2), dtype=dtype)
+    weight = relay.const(np.random.uniform(size=(2, 4, 2, 2)), dtype=dtype)
+    x = relay.nn.conv2d(x, weight)
+    func = relay.Function(relay.analysis.free_vars(x), x)
+    return tvm.IRModule.from_expr(func)
diff --git a/tests/python/driver/tvmc/test_autotuner.py b/tests/python/driver/tvmc/test_autotuner.py
index 7c05ff804fa4..eb6550e40cdc 100644
--- a/tests/python/driver/tvmc/test_autotuner.py
+++ b/tests/python/driver/tvmc/test_autotuner.py
@@ -23,6 +23,7 @@
 from os import path
 from pathlib import Path
 
+import tvm
 from tvm import autotvm
 from tvm.driver import tvmc
 
@@ -191,3 +192,18 @@ def test_tune_rpc_tracker_parsing(mock_load_model, mock_tune_model, mock_auto_sc
     assert "10.0.0.1" == kwargs["hostname"]
     assert "port" in kwargs
     assert 9999 == kwargs["port"]
+
+
+@mock.patch("tvm.transform.PassContext", return_value=tvm.transform.PassContext())
+def test_autotune_pass_context(mock_pc, onnx_mnist, tmpdir_factory):
+    """
+    Check that the pass context while tuning is as expected.
+    """
+    pytest.importorskip("onnx")
+
+    tmpdir_name = tmpdir_factory.mktemp("data")
+    _tuner_test_helper(onnx_mnist, "gridsearch", tmpdir_name)
+
+    # AutoTVM overrides the pass context later in the pipeline to disable AlterOpLayout
+    assert mock_pc.call_count == 2
+    assert mock_pc.call_args_list[0][1]["opt_level"] == 3
diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py
index 7cb50dd0e366..3a3f297729fd 100644
--- a/tests/python/driver/tvmc/test_compiler.py
+++ b/tests/python/driver/tvmc/test_compiler.py
@@ -508,10 +508,7 @@ def test_compile_check_configs_composite_target(mock_pkg, mock_pc, mock_fe, mock
     tvmc_model = tvmc.load("no_file_needed")
     tvmc.compile(tvmc_model, target="mockcodegen -testopt=value, llvm")
 
-    assert mock_pc.call_count == 2
-    codegen_partition_context = mock.call(
-        config={"relay.ext.mock.options": {"testopt": "value"}},
-    )
+    assert mock_pc.call_count == 1
     codegen_compile_context = mock.call(
         config={"relay.ext.mock.options": {"testopt": "value"}},
         opt_level=3,
@@ -520,9 +517,6 @@ def test_compile_check_configs_composite_target(mock_pkg, mock_pc, mock_fe, mock
     )
     mock_pc.assert_has_calls(
         [
-            codegen_partition_context,
-            codegen_partition_context.__enter__(),
-            codegen_partition_context.__exit__(None, None, None),
             codegen_compile_context,
             codegen_compile_context.__enter__(),
             codegen_compile_context.__exit__(None, None, None),
diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py
index c1a3be67c208..718babd15c29 100644
--- a/tests/python/driver/tvmc/test_frontends.py
+++ b/tests/python/driver/tvmc/test_frontends.py
@@ -297,7 +297,8 @@ def test_compile_tflite_module_nhwc_to_nchw(tflite_mobilenet_v1_1_quant):
     before = tvmc_model.mod
 
     expected_layout = "NCHW"
-    after = tvmc.transform.convert_graph_layout(before, expected_layout)
+    with tvm.transform.PassContext(opt_level=3):
+        after = tvmc.transform.convert_graph_layout(before, expected_layout)
 
     layout_transform_calls = []
 
@@ -322,7 +323,8 @@ def test_compile_onnx_module_nchw_to_nhwc(onnx_resnet50):
     before = tvmc_model.mod
 
     expected_layout = "NHWC"
-    after = tvmc.transform.convert_graph_layout(before, expected_layout)
+    with tvm.transform.PassContext(opt_level=3):
+        after = tvmc.transform.convert_graph_layout(before, expected_layout)
 
     layout_transform_calls = []
 
@@ -347,7 +349,8 @@ def test_compile_paddle_module_nchw_to_nhwc(paddle_resnet50):
     before = tvmc_model.mod
 
     expected_layout = "NHWC"
-    after = tvmc.transform.convert_graph_layout(before, expected_layout)
+    with tvm.transform.PassContext(opt_level=3):
+        after = tvmc.transform.convert_graph_layout(before, expected_layout)
 
     layout_transform_calls = []
 
@@ -372,7 +375,9 @@ def test_compile_tflite_module__same_layout__nhwc_to_nhwc(tflite_mobilenet_v1_1_
     before = tvmc_model.mod
 
     expected_layout = "NHWC"
-    after = tvmc.transform.convert_graph_layout(before, expected_layout)
+
+    with tvm.transform.PassContext(opt_level=3):
+        after = tvmc.transform.convert_graph_layout(before, expected_layout)
 
     layout_transform_calls = []
 
@@ -397,7 +402,9 @@ def test_compile_onnx_module__same_layout__nchw_to_nchw(onnx_resnet50):
     before = tvmc_model.mod
 
     expected_layout = "NCHW"
-    after = tvmc.transform.convert_graph_layout(before, expected_layout)
+
+    with tvm.transform.PassContext(opt_level=3):
+        after = tvmc.transform.convert_graph_layout(before, expected_layout)
 
     layout_transform_calls = []
 
diff --git a/tests/python/driver/tvmc/test_transform.py b/tests/python/driver/tvmc/test_transform.py
index 98a0210a1bb6..98bd3b5f98a3 100644
--- a/tests/python/driver/tvmc/test_transform.py
+++ b/tests/python/driver/tvmc/test_transform.py
@@ -14,43 +14,60 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import pytest
-import numpy as np
+
+from unittest.mock import MagicMock
 
 import tvm
 from tvm import relay
+from tvm.ir.instrument import pass_instrument
 from tvm.driver.tvmc.transform import convert_graph_layout
 
 
-def test_layout_transform():
+def test_layout_transform_fold_constant(relay_conv2d):
     """
     Test layout is correctly transformed and constant folding is applied.
     """
-    dtype = "int8"
-    iinfo = np.iinfo(dtype)
-    data_min = iinfo.min
-    data_max = iinfo.max
-
-    x = relay.var("x", shape=(1, 4, 2, 2), dtype=dtype)
-    weight = relay.const(
-        np.random.randint(data_min, data_max, size=(2, 4, 2, 2), dtype=dtype), dtype=dtype
-    )
-    x = relay.nn.conv2d(x, weight)
-    func = relay.Function(relay.analysis.free_vars(x), x)
-    mod = tvm.IRModule.from_expr(func)
+    desired_layout = "NHWC"
+
+    @pass_instrument
+    class CollectPassNames:
+        def __init__(self):
+            self.names = []
+
+        def run_after_pass(self, _, info):
+            self.names.append(info.name)
+
+    pass_names = CollectPassNames()
+    with tvm.transform.PassContext(opt_level=3, instruments=[pass_names]):
+        convert_graph_layout(relay_conv2d, desired_layout)
 
+    names = pass_names.names
+    assert "ConvertLayout" in names
+    assert "FoldConstant" in names
+    assert names.index("ConvertLayout") < names.index("FoldConstant")
+
+
+def test_layout_transform_convert_layout_pass_args(relay_conv2d, monkeypatch):
+    """
+    Check the convert layout desired layouts arugment is what is expected when
+    a desired layout is provided.
+    """
     desired_layout = "NHWC"
-    mod = convert_graph_layout(mod, desired_layout)
 
-    main_expr = mod["main"].body
-    conv = main_expr.args[0]
-    assert conv.op.name == "nn.conv2d"
-    assert conv.attrs["data_layout"] == "NHWC"
-    assert conv.attrs["kernel_layout"] == "HWIO"
+    mock_convert_layout = MagicMock()
+    mock_convert_layout.return_value = relay.transform.ConvertLayout({})
+    monkeypatch.setattr(relay.transform, "ConvertLayout", mock_convert_layout)
+
+    with tvm.transform.PassContext(opt_level=3):
+        convert_graph_layout(relay_conv2d, desired_layout)
 
-    # Ensure transform has been folded into the constant
-    weights = conv.args[1]
-    assert isinstance(weights, relay.expr.Constant)
+    mock_convert_layout.assert_called_once_with(
+        {
+            "nn.conv2d": ["NHWC", "default"],
+            "nn.conv2d_transpose": ["NHWC", "default"],
+            "qnn.conv2d": ["NHWC", "default"],
+        }
+    )
 
 
 if __name__ == "__main__":

From 7cd203dc3e8dbcbc9a0400c163e3d696afb9ebb1 Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Thu, 10 Nov 2022 08:52:26 -0800
Subject: [PATCH 550/704] =?UTF-8?q?[TIR]=20Update=20ReductionIterNotIndexO?=
 =?UTF-8?q?utputBuffer=20to=20check=20BlockRealizeN=E2=80=A6=20(#13301)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeNodes match_buffer statements when validating writes

* Add test to verify that tensorized blocks are properly validated

* update to take into account all match buffer regions.

* lint
---
 src/tir/schedule/analysis/reducer.cc          |  20 ++-
 ..._transform_lower_cross_thread_reduction.py | 164 ++++++++++++++++++
 2 files changed, 182 insertions(+), 2 deletions(-)

diff --git a/src/tir/schedule/analysis/reducer.cc b/src/tir/schedule/analysis/reducer.cc
index 50813ef3cae8..5f1a0e355608 100644
--- a/src/tir/schedule/analysis/reducer.cc
+++ b/src/tir/schedule/analysis/reducer.cc
@@ -563,18 +563,34 @@ bool ReductionIterNotIndexOutputBuffer(const Block& block) {
       return reduction_block_iters.count(var);
     });
   };
+
+  std::unordered_map<const BufferNode*, const BufferNode*> match_buffer_sources;
+  for (const MatchBufferRegion& region : block->match_buffers) {
+    match_buffer_sources[region->buffer.get()] = region->source->buffer.get();
+  }
   bool affected = false;
   PreOrderVisit(block->body, [&](const ObjectRef& obj) {
     if (affected) {
       return false;
     }
+    const auto* block_node = obj.as<BlockNode>();
+    if (block_node) {
+      for (const MatchBufferRegion& region : block_node->match_buffers) {
+        match_buffer_sources[region->buffer.get()] = region->source->buffer.get();
+      }
+    }
     const auto* store = obj.as<BufferStoreNode>();
     if (!store) {
       return true;
     }
-    ICHECK(buffer_written.count(store->buffer.get()))
+
+    bool write_is_covered_by_match_buffer =
+        match_buffer_sources.count(store->buffer.get()) &&
+        buffer_written.count(match_buffer_sources.find(store->buffer.get())->second);
+    ICHECK(buffer_written.count(store->buffer.get()) || write_is_covered_by_match_buffer)
         << "ValueError: The buffer \"" << store->buffer
-        << "\" is written in the block but is not in the block's signature";
+        << "\" is written in the block but is not in the block's signature nor is it covered by "
+           "a match_buffer";
     for (const PrimExpr& index : store->indices) {
       if (f_uses_reduction_block_var(index)) {
         affected = true;
diff --git a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
index 9ae4f4cf862e..42c7fbc0d4a9 100644
--- a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
+++ b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
@@ -487,6 +487,149 @@ def lowered_single_reduction_loop_with_block_predicate(
                     )
 
 
+@T.prim_func
+def single_reduction_loop_with_tensorize(
+    input_A: T.Buffer[(1, 64, 7, 7, 32), "uint8"],
+    input_B: T.Buffer[(16, 64, 1, 1, 8, 32, 4), "int8"],
+    output: T.Buffer[(1, 16, 7, 7, 32), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for i1, i2, i3, i4, i5 in T.grid(16, 4, 98, 2, 32):
+        with T.block("compute_o"):
+            n = T.axis.spatial(1, 0)
+            oc_chunk = T.axis.spatial(16, i1)
+            oh = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) // 3584)
+            ow = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 3584 // 512)
+            kh = T.axis.reduce(1, 0)
+            kw = T.axis.reduce(1, 0)
+            ic_outer = T.axis.reduce(64, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 512 // 8)
+            ic_f_inner = T.axis.reduce(8, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 8)
+            T.reads(
+                input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+            )
+            T.writes(output[n, oc_chunk, oh, ow, 0:32])
+            with T.init():
+                for x in T.serial(32):
+                    with T.block("compute_init"):
+                        oc_block_i_init = T.axis.spatial(32, x)
+                        T.reads()
+                        T.writes(output[n, oc_chunk, oh, ow, oc_block_i_init])
+                        output[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+            with T.block("compute_o"):
+                T.reads(
+                    output[n, oc_chunk, oh, ow, 0:32],
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                )
+                T.writes(output[n, oc_chunk, oh, ow, 0:32])
+                A = T.match_buffer(
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    [4],
+                    dtype="uint8",
+                    offset_factor=1,
+                )
+                B = T.match_buffer(
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                    [32, 4],
+                    dtype="int8",
+                    offset_factor=1,
+                )
+                C = T.match_buffer(
+                    output[n, oc_chunk, oh, ow, 0:32], [32], dtype="int32", offset_factor=1
+                )
+                A_u8x4: T.uint8x4 = A[0:4]
+                A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                B_i8x128 = B[0, 0:128]
+                B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+                C[0:32] = T.call_llvm_pure_intrin(
+                    4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32"
+                )
+
+
+@T.prim_func
+def nested_reduction_loop_with_inner_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):
+                    T.reads(
+                        out[yi, xr],
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                    )
+                    T.writes(out[yi, xr])
+                    A = T.match_buffer(
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    B = T.match_buffer(
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    C = T.match_buffer(out[yi, xr], [1], dtype="int32", offset_factor=1)
+                    A_i8x4: T.int8x4 = A[0:4]
+                    A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
+                    B_i8x4: T.int8x4 = B[0:4]
+                    B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
+                    C[0] = A_i32 + B_i32 + C[0]
+
+
+@T.prim_func
+def nested_reduction_loop_with_outer_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            A = T.match_buffer(in0[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            B = T.match_buffer(in1[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            C = T.match_buffer(out[yi, 0:4], [4], dtype="int32", offset_factor=1)
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):
+                    T.reads(
+                        out[yi, xr],
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                    )
+                    T.writes(out[yi, xr])
+                    A_i8x4: T.int8x4 = A[yi * 4 + xr : yi * 4 + xr + 4]
+                    A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
+                    B_i8x4: T.int8x4 = B[yi * 4 + xr : yi * 4 + xr + 4]
+                    B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
+                    C[xr] = A_i32 + B_i32 + C[xr]
+
+
 @T.prim_func
 def reducer_max(a: T.handle, b: T.handle) -> None:
     A = T.match_buffer(a, [128, 128], dtype="float32")
@@ -1179,6 +1322,27 @@ def test_single_reduction_loop_with_block_predicate():
     )
 
 
+def test_single_reduction_loop_with_tensorize():
+    _check(
+        single_reduction_loop_with_tensorize,
+        single_reduction_loop_with_tensorize,
+    )
+
+
+def test_nested_reduction_loop_with_inner_match_buffers():
+    _check(
+        nested_reduction_loop_with_inner_match_buffers,
+        nested_reduction_loop_with_inner_match_buffers,
+    )
+
+
+def test_nested_reduction_loop_with_outer_match_buffers():
+    _check(
+        nested_reduction_loop_with_outer_match_buffers,
+        nested_reduction_loop_with_outer_match_buffers,
+    )
+
+
 def test_reducer_max():
     _check(reducer_max, lowered_reducer_max)
 

From c66bb00a48626d4c0b10c38c1069245548df0e99 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Thu, 10 Nov 2022 11:35:40 -0600
Subject: [PATCH 551/704] [Docker]Refactor timezone script and NRF installation
 (#13342)

This PR refactors timezone setup to a separate script that docker/install/ubuntu_install_core.sh
Also, it adds a script to install NRF and reused in both cortexm docker and RVM installation path.
---
 apps/microtvm/reference-vm/base-box-tool.py   |  1 +
 .../reference-vm/base-box/base_box_setup.sh   | 18 +--------
 docker/Dockerfile.ci_arm                      |  3 ++
 docker/Dockerfile.ci_cortexm                  |  7 ++++
 docker/Dockerfile.ci_cpu                      |  3 ++
 docker/Dockerfile.ci_gpu                      |  3 ++
 docker/Dockerfile.ci_hexagon                  |  3 ++
 docker/Dockerfile.ci_i386                     |  3 ++
 docker/Dockerfile.ci_minimal                  |  3 ++
 docker/Dockerfile.ci_riscv                    |  3 ++
 docker/Dockerfile.ci_wasm                     |  3 ++
 docker/Dockerfile.demo_android                |  3 ++
 docker/Dockerfile.demo_rocm                   |  3 ++
 docker/Dockerfile.demo_vitis_ai               |  3 ++
 docker/Dockerfile.docs                        |  3 ++
 docker/install/ubuntu_install_core.sh         |  5 ---
 docker/install/ubuntu_install_nrfjprog.sh     | 39 +++++++++++++++++++
 docker/install/ubuntu_setup_tz.sh             | 25 ++++++++++++
 18 files changed, 110 insertions(+), 21 deletions(-)
 create mode 100755 docker/install/ubuntu_install_nrfjprog.sh
 create mode 100755 docker/install/ubuntu_setup_tz.sh

diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py
index 325b9bc0c4c9..0f2a1242e993 100755
--- a/apps/microtvm/reference-vm/base-box-tool.py
+++ b/apps/microtvm/reference-vm/base-box-tool.py
@@ -61,6 +61,7 @@
     "docker/install/ubuntu_init_zephyr_project.sh",
     "docker/install/ubuntu_install_zephyr_sdk.sh",
     "docker/install/ubuntu_install_cmsis.sh",
+    "docker/install/ubuntu_install_nrfjprog.sh",
 ]
 
 PACKER_FILE_NAME = "packer.json"
diff --git a/apps/microtvm/reference-vm/base-box/base_box_setup.sh b/apps/microtvm/reference-vm/base-box/base_box_setup.sh
index a1959e5d3bf7..33487150f935 100755
--- a/apps/microtvm/reference-vm/base-box/base_box_setup.sh
+++ b/apps/microtvm/reference-vm/base-box/base_box_setup.sh
@@ -36,22 +36,8 @@ sed -i "/^# If not running interactively,/ i export ZEPHYR_BASE=$HOME/zephyr/zep
 sed -i "/^# If not running interactively,/ i\\ " ~/.bashrc
 
 # nrfjprog
-NRF_COMMANDLINE_TOOLS_FILE=nRFCommandLineToolsLinuxamd64.tar.gz
-NRF_COMMANDLINE_TOOLS_URL=https://www.nordicsemi.com/-/media/Software-and-other-downloads/Desktop-software/nRF-command-line-tools/sw/Versions-10-x-x/10-12-1/nRFCommandLineTools10121Linuxamd64.tar.gz
-NRF_COMMANDLINE_TOOLS_INSTALLER=nRF-Command-Line-Tools_10_12_1_Linux-amd64.deb
-JLINK_LINUX_INSTALLER=JLink_Linux_V688a_x86_64.deb
-
-cd ~
-mkdir -p nrfjprog
-wget --no-verbose -O $NRF_COMMANDLINE_TOOLS_FILE $NRF_COMMANDLINE_TOOLS_URL
-cd nrfjprog
-tar -xzvf "../${NRF_COMMANDLINE_TOOLS_FILE}"
-sudo apt install -y "./${JLINK_LINUX_INSTALLER}"
-sudo apt install -y "./${NRF_COMMANDLINE_TOOLS_INSTALLER}"
-source ~/.profile
-nrfjprog --help
-cd ..
-rm -rf nrfjprog "${NRF_COMMANDLINE_TOOLS_FILE}"
+sudo ~/ubuntu_install_nrfjprog.sh
+rm -f ~/ubuntu_install_nrfjprog.sh
 
 # Zephyr
 pip3 install --user -U west
diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm
index 2297e8f1e6e7..9bb34589b5f9 100644
--- a/docker/Dockerfile.ci_arm
+++ b/docker/Dockerfile.ci_arm
@@ -26,6 +26,9 @@ RUN apt-get update --fix-missing
 
 RUN apt-install-and-clear -y ca-certificates gnupg2
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm
index 8e8d2c0a4f9e..29a19454a9ee 100644
--- a/docker/Dockerfile.ci_cortexm
+++ b/docker/Dockerfile.ci_cortexm
@@ -23,6 +23,9 @@ COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 
 RUN apt-get update --fix-missing
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
@@ -81,6 +84,10 @@ RUN bash /install/ubuntu_install_zephyr.sh
 ENV ZEPHYR_BASE=/opt/zephyrproject/zephyr
 ENV PATH /opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH
 
+# NRF
+COPY install/ubuntu_install_nrfjprog.sh /install/ubuntu_install_nrfjprog.sh
+RUN bash /install/ubuntu_install_nrfjprog.sh
+
 # FreeRTOS deps
 COPY install/ubuntu_install_freertos.sh /install/ubuntu_install_freertos.sh
 RUN bash /install/ubuntu_install_freertos.sh
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index e71f98ec7af8..9436f1758e14 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -22,6 +22,9 @@ COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 
 RUN apt-get update --fix-missing
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index 4b729a5f516e..a9ddc22c97a2 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -29,6 +29,9 @@ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/
 RUN rm -f /etc/apt/sources.list.d/nvidia-ml.list && apt-get clean
 RUN apt-get update --fix-missing
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon
index 52849c1c4aaa..2be11f034515 100644
--- a/docker/Dockerfile.ci_hexagon
+++ b/docker/Dockerfile.ci_hexagon
@@ -25,6 +25,9 @@ RUN apt-get update --fix-missing
 
 RUN apt-install-and-clear -y ca-certificates gnupg2 libxml2-dev
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.ci_i386 b/docker/Dockerfile.ci_i386
index b37e849819be..9a2e08eaab76 100644
--- a/docker/Dockerfile.ci_i386
+++ b/docker/Dockerfile.ci_i386
@@ -26,6 +26,9 @@ RUN apt-get update --fix-missing
 
 RUN apt-install-and-clear -y ca-certificates
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.ci_minimal b/docker/Dockerfile.ci_minimal
index 2a3da14f0fe1..8ebcc6c9b9f2 100644
--- a/docker/Dockerfile.ci_minimal
+++ b/docker/Dockerfile.ci_minimal
@@ -22,6 +22,9 @@ COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 
 RUN apt-get update --fix-missing
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv
index 0d03db15e39b..3dd1943d27e1 100644
--- a/docker/Dockerfile.ci_riscv
+++ b/docker/Dockerfile.ci_riscv
@@ -23,6 +23,9 @@ COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 
 RUN apt-get update --fix-missing
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.ci_wasm b/docker/Dockerfile.ci_wasm
index 46f64b44dab5..3e794c312c66 100644
--- a/docker/Dockerfile.ci_wasm
+++ b/docker/Dockerfile.ci_wasm
@@ -20,6 +20,9 @@ COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 
 RUN apt-get update --fix-missing
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android
index 2f90e5ad664e..8a461269e75d 100644
--- a/docker/Dockerfile.demo_android
+++ b/docker/Dockerfile.demo_android
@@ -22,6 +22,9 @@ COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 
 RUN apt-get update --fix-missing
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.demo_rocm b/docker/Dockerfile.demo_rocm
index 1dd7d1bf119a..3f3ffef78e68 100644
--- a/docker/Dockerfile.demo_rocm
+++ b/docker/Dockerfile.demo_rocm
@@ -20,6 +20,9 @@ FROM ubuntu:18.04
 
 COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.demo_vitis_ai b/docker/Dockerfile.demo_vitis_ai
index 445f74ced982..fa024767fd1e 100644
--- a/docker/Dockerfile.demo_vitis_ai
+++ b/docker/Dockerfile.demo_vitis_ai
@@ -22,6 +22,9 @@ COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 
 RUN apt-get update --fix-missing
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/Dockerfile.docs b/docker/Dockerfile.docs
index 9fe90a7302c0..f7904cabf750 100644
--- a/docker/Dockerfile.docs
+++ b/docker/Dockerfile.docs
@@ -23,6 +23,9 @@ COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 
 RUN apt-get update --fix-missing
 
+COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
+RUN bash /install/ubuntu_setup_tz.sh
+
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
diff --git a/docker/install/ubuntu_install_core.sh b/docker/install/ubuntu_install_core.sh
index 7f26c6def25d..c4c6f6b0bcc7 100755
--- a/docker/install/ubuntu_install_core.sh
+++ b/docker/install/ubuntu_install_core.sh
@@ -22,11 +22,6 @@ set -u
 set -x
 set -o pipefail
 
-export DEBIAN_FRONTEND=noninteractive
-export TZ=Etc/UTC
-ln -snf /usr/share/zoneinfo/$TZ /etc/localtime
-echo $TZ > /etc/timezone
-
 # install libraries for building c++ core on ubuntu
 apt-get update && apt-install-and-clear -y --no-install-recommends \
     apt-transport-https \
diff --git a/docker/install/ubuntu_install_nrfjprog.sh b/docker/install/ubuntu_install_nrfjprog.sh
new file mode 100755
index 000000000000..372c39a06a58
--- /dev/null
+++ b/docker/install/ubuntu_install_nrfjprog.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+set -x
+
+NRF_COMMANDLINE_TOOLS_FILE=nRFCommandLineToolsLinuxamd64.tar.gz
+NRF_COMMANDLINE_TOOLS_URL=https://www.nordicsemi.com/-/media/Software-and-other-downloads/Desktop-software/nRF-command-line-tools/sw/Versions-10-x-x/10-12-1/nRFCommandLineTools10121Linuxamd64.tar.gz
+NRF_COMMANDLINE_TOOLS_INSTALLER=nRF-Command-Line-Tools_10_12_1_Linux-amd64.deb
+JLINK_LINUX_INSTALLER=JLink_Linux_V688a_x86_64.deb
+
+cd ~
+mkdir -p nrfjprog
+wget --no-verbose -O $NRF_COMMANDLINE_TOOLS_FILE $NRF_COMMANDLINE_TOOLS_URL
+
+cd nrfjprog
+tar -xzvf "../${NRF_COMMANDLINE_TOOLS_FILE}"
+apt-install-and-clear -y "./${JLINK_LINUX_INSTALLER}"
+apt-install-and-clear -y "./${NRF_COMMANDLINE_TOOLS_INSTALLER}"
+
+cd ..
+rm -rf nrfjprog "${NRF_COMMANDLINE_TOOLS_FILE}"
diff --git a/docker/install/ubuntu_setup_tz.sh b/docker/install/ubuntu_setup_tz.sh
new file mode 100755
index 000000000000..adb8f5e06c17
--- /dev/null
+++ b/docker/install/ubuntu_setup_tz.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+export TZ=Etc/UTC
+ln -snf /usr/share/zoneinfo/$TZ /etc/localtime
+echo $TZ > /etc/timezone

From 3a639a41a830ece914df10a98f807123e46cd750 Mon Sep 17 00:00:00 2001
From: WANG Zihan <wzh1999_frog@126.com>
Date: Fri, 11 Nov 2022 04:43:38 +0800
Subject: [PATCH 552/704] [TIR][Arith] Fix divisor checking in `TryConstFold`
 (#13348)

Fix denominator checking in `TryConstFold`.
---
 src/arith/const_fold.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/arith/const_fold.h b/src/arith/const_fold.h
index a7466cf38c85..606bc28ddd22 100644
--- a/src/arith/const_fold.h
+++ b/src/arith/const_fold.h
@@ -236,7 +236,8 @@ inline Optional<PrimExpr> TryConstFold<tir::Div>(PrimExpr a, PrimExpr b) {
       if (pb->value == 1) return a;
       ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
-    if (fa && fb && fb->value != 0) {
+    if (fa && fb) {
+      ICHECK_NE(fb->value, 0) << "Divide by zero";
       if (rtype.bits() == 32) {
         return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) /
                                                        static_cast<float>(fb->value)));

From b582cd12ae22595c64a0704b1f4f5ed67a9d02ca Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Thu, 10 Nov 2022 13:22:07 -0800
Subject: [PATCH 553/704] [MetaSchedule][Minor] Fix Typo in ApplyCustomRule
 Schedule Rule (#13353)

* Fix typo.

* Add regression test.
---
 .../schedule_rule/apply_custom_rule.cc        |  2 +-
 ...chedule_schedule_rule_apply_custom_rule.py | 66 +++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_apply_custom_rule.py

diff --git a/src/meta_schedule/schedule_rule/apply_custom_rule.cc b/src/meta_schedule/schedule_rule/apply_custom_rule.cc
index 4b0fa675acc7..b133cd30c1f4 100644
--- a/src/meta_schedule/schedule_rule/apply_custom_rule.cc
+++ b/src/meta_schedule/schedule_rule/apply_custom_rule.cc
@@ -85,7 +85,7 @@ bool ScheduleRule::IsApplyCustomRule(const ScheduleRule& rule) {
 }
 
 TVM_REGISTER_NODE_TYPE(ApplyCustomRuleNode);
-TVM_REGISTER_GLOBAL("meta_schedule.ScheduleApplyCustomRule")
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleApplyCustomRule")
     .set_body_typed(ScheduleRule::ApplyCustomRule);
 
 }  // namespace meta_schedule
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_apply_custom_rule.py b/tests/python/unittest/test_meta_schedule_schedule_rule_apply_custom_rule.py
new file mode 100644
index 000000000000..2bfa3070d1b4
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_apply_custom_rule.py
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
+from typing import List
+import tempfile
+import pytest
+
+import tvm
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.schedule_rule import ApplyCustomRule
+from tvm.script import tir as T
+
+
+@tvm.script.ir_module
+class Matmul:
+    @T.prim_func
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:
+        T.func_attr({"global_symbol": "main"})
+        A = T.match_buffer(a, (1024, 1024), "float32")
+        B = T.match_buffer(b, (1024, 1024), "float32")
+        C = T.match_buffer(c, (1024, 1024), "float32")
+        for i, j, k in T.grid(1024, 1024, 1024):
+            with T.block("matmul"):
+                T.block_attr({"schedule_rule": "test_apply_custom_rule"})
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    C[vi, vj] = 0.0
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@tvm.register_func("meta_schedule.cpu.test_apply_custom_rule")
+def sch_fn(sch: tvm.tir.Schedule, block: tvm.tir.Block) -> List[tvm.tir.Schedule]:
+    raise ValueError("Intended for meta_schedule.cpu.test_apply_custom_rule")
+
+
+def test_custom_rule():
+    with pytest.raises(ValueError) as e_info:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            sch_rules = [ApplyCustomRule()]
+            space_gen = ms.space_generator.PostOrderApply(sch_rules=sch_rules)
+            ms.tune_tir(
+                mod=Matmul,
+                target="llvm -num-cores=1",
+                work_dir=tmpdir,
+                max_trials_global=10,
+                space=space_gen,
+            )
+    assert "ValueError: Intended for meta_schedule.cpu.test_apply_custom_rule" in str(e_info.value)
+
+
+if __name__ == "__main__":
+    test_custom_rule()

From 93fdf83e8f40b806ee5a8bd6625e0f4e431b459d Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 11 Nov 2022 09:08:51 +0900
Subject: [PATCH 554/704] [MetaSchedule] Improve inlining and `VerifyGPUCode`
 for quantized model workload (#13334)

* [MetaSchedule] Add a new schedule rule to inline all scalar constants

* add doc

* reorg

* identify constant block by its structure, not by name
---
 include/tvm/meta_schedule/schedule_rule.h     |  10 ++
 .../meta_schedule/schedule_rule/__init__.py   |   2 +-
 .../schedule_rule/auto_inline.py              |  17 +++
 src/meta_schedule/postproc/verify_gpu_code.cc |   2 +
 .../schedule_rule/auto_inline.cc              |  37 ++++++
 .../schedule_rule/schedule_rule.cc            |   3 +
 src/tir/analysis/verify_gpu_code.cc           |  13 ++
 .../metaschedule_e2e/test_resnet50_int8.py    |   5 +-
 ...meta_schedule_schedule_rule_auto_inline.py | 115 ++++++++++++++++++
 9 files changed, 201 insertions(+), 3 deletions(-)

diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h
index da8f1faa8e1d..70dec47e60bd 100644
--- a/include/tvm/meta_schedule/schedule_rule.h
+++ b/include/tvm/meta_schedule/schedule_rule.h
@@ -125,6 +125,16 @@ class ScheduleRule : public runtime::ObjectRef {
                                          bool require_injective,      //
                                          bool require_ordered,        //
                                          Optional<Array<String>> disallow_op);
+
+  /*!
+   * \brief Inline blocks that produce a constant scalar. Such blocks get in the way of
+   * ReverseComputeInline during AutoInline, since they are also counted as a producer block
+   * unless they are inlined first. So it is recommended to run InlineConstantScalars before
+   * AutoInline.
+   * \return The schedule rule created
+   */
+  TVM_DLL static ScheduleRule InlineConstantScalars();
+
   /*!
    * \brief Create a mega rule: multi-level tiling with data reuse
    * \param structure The tiling structure. Recommended:
diff --git a/python/tvm/meta_schedule/schedule_rule/__init__.py b/python/tvm/meta_schedule/schedule_rule/__init__.py
index 5971ad53c48c..d330fc713991 100644
--- a/python/tvm/meta_schedule/schedule_rule/__init__.py
+++ b/python/tvm/meta_schedule/schedule_rule/__init__.py
@@ -22,7 +22,7 @@
 from .add_rfactor import AddRFactor
 from .apply_custom_rule import ApplyCustomRule
 from .auto_bind import AutoBind
-from .auto_inline import AutoInline
+from .auto_inline import AutoInline, InlineConstantScalars
 from .cross_thread_reduction import CrossThreadReduction
 from .multi_level_tiling import (
     MultiLevelTiling,
diff --git a/python/tvm/meta_schedule/schedule_rule/auto_inline.py b/python/tvm/meta_schedule/schedule_rule/auto_inline.py
index 22206f3fcc24..c84dbaf89b97 100644
--- a/python/tvm/meta_schedule/schedule_rule/auto_inline.py
+++ b/python/tvm/meta_schedule/schedule_rule/auto_inline.py
@@ -65,3 +65,20 @@ def __init__(
             require_ordered,
             disallow_op,
         )
+
+
+@register_object("meta_schedule.InlineConstantScalars")
+class InlineConstantScalars(ScheduleRule):
+    """Inline blocks that produce a constant scalar.
+
+    Such blocks get in the way of ReverseComputeInline during AutoInline, since they are also
+    counted as a producer block unless they are inlined first. So it is recommended to run
+    InlineConstantScalars before AutoInline.
+    """
+
+    def __init__(
+        self,
+    ) -> None:
+        self.__init_handle_by_constructor__(
+            _ffi_api.ScheduleRuleInlineConstantScalars,  # type: ignore # pylint: disable=no-member
+        )
diff --git a/src/meta_schedule/postproc/verify_gpu_code.cc b/src/meta_schedule/postproc/verify_gpu_code.cc
index 0828ee538427..ae6f3474bbd6 100644
--- a/src/meta_schedule/postproc/verify_gpu_code.cc
+++ b/src/meta_schedule/postproc/verify_gpu_code.cc
@@ -175,10 +175,12 @@ class VerifyGPUCodeNode : public PostprocNode {
           pass_list.push_back(tir::transform::InjectDoubleBuffer());
           pass_list.push_back(tir::transform::StorageRewrite());
           pass_list.push_back(tir::transform::MergeDynamicSharedMemoryAllocations());
+          pass_list.push_back(tir::transform::LowerIntrin());
           // Convert Function to IRModule
           transform::PassContext pass_ctx = transform::PassContext::Current();
           tir::PrimFunc f = WithAttr(GetRef<tir::PrimFunc>(prim_func), "global_symbol",
                                      runtime::String(g_var->name_hint));
+          f = WithAttr(f, tvm::attr::kTarget, Target("cuda"));  // Required for LowerIntrin
           bool noalias = pass_ctx->GetConfig<Bool>("tir.noalias", Bool(true)).value();
           if (noalias) {
             f = WithAttr(std::move(f), "tir.noalias", Bool(true));
diff --git a/src/meta_schedule/schedule_rule/auto_inline.cc b/src/meta_schedule/schedule_rule/auto_inline.cc
index dcdc83f95cb1..d2d48b9008ce 100644
--- a/src/meta_schedule/schedule_rule/auto_inline.cc
+++ b/src/meta_schedule/schedule_rule/auto_inline.cc
@@ -189,5 +189,42 @@ TVM_REGISTER_NODE_TYPE(AutoInlineNode);
 TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleAutoInline")
     .set_body_typed(ScheduleRule::AutoInline);
 
+/*! \brief Inline blocks that produce a constant scalar. */
+class InlineConstantScalarsNode : public ScheduleRuleNode {
+ public:
+  void InitializeWithTuneContext(const TuneContext& context) final {}
+
+  Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) final {
+    // Look for a block of the form
+    // block compile_engine_const(iter_var(vi, range(min=0, ext=1))) {
+    //   reads([])
+    //   writes([compile_engine_const[]])
+    //   compile_engine_const[] = 59
+    // }
+    auto block = sch->Get(block_rv);
+    if (block->reads.size() == 0 && block->writes.size() == 1 &&
+        block->writes[0]->buffer->shape.size() == 0) {
+      sch->ComputeInline(block_rv);
+    }
+    return {sch};
+  }
+
+  ScheduleRule Clone() const final {
+    ObjectPtr<InlineConstantScalarsNode> n = make_object<InlineConstantScalarsNode>(*this);
+    return ScheduleRule(n);
+  }
+
+  static constexpr const char* _type_key = "meta_schedule.InlineConstantScalars";
+  TVM_DECLARE_FINAL_OBJECT_INFO(InlineConstantScalarsNode, ScheduleRuleNode);
+};
+
+ScheduleRule ScheduleRule::InlineConstantScalars() {
+  ObjectPtr<InlineConstantScalarsNode> n = make_object<InlineConstantScalarsNode>();
+  return ScheduleRule(n);
+}
+
+TVM_REGISTER_NODE_TYPE(InlineConstantScalarsNode);
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleInlineConstantScalars")
+    .set_body_typed(ScheduleRule::InlineConstantScalars);
 }  // namespace meta_schedule
 }  // namespace tvm
diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc
index 141b93be5e34..b1e8c3695d3e 100644
--- a/src/meta_schedule/schedule_rule/schedule_rule.cc
+++ b/src/meta_schedule/schedule_rule/schedule_rule.cc
@@ -54,6 +54,7 @@ ScheduleRule ScheduleRule::PyScheduleRule(
 Array<ScheduleRule> ScheduleRule::DefaultLLVM() {
   return {
       ScheduleRule::ApplyCustomRule(),
+      ScheduleRule::InlineConstantScalars(),
       ScheduleRule::AutoInline(
           /*into_producer=*/false,
           /*into_consumer=*/true,
@@ -100,6 +101,7 @@ Array<ScheduleRule> ScheduleRule::DefaultCUDA() {
           Map<String, ObjectRef>{{"req", String("must")},
                                  {"levels", Array<Integer>{3}},  //
                                  {"scope", String("local")}}),
+      ScheduleRule::InlineConstantScalars(),
       ScheduleRule::AutoInline(
           /*into_producer=*/true,
           /*into_consumer=*/true,
@@ -178,6 +180,7 @@ Array<ScheduleRule> ScheduleRule::DefaultCUDATensorCore() {
 Array<ScheduleRule> ScheduleRule::DefaultHexagon() {
   return {
       ScheduleRule::ApplyCustomRule(),
+      ScheduleRule::InlineConstantScalars(),
       ScheduleRule::AutoInline(
           /*into_producer=*/false,
           /*into_consumer=*/true,
diff --git a/src/tir/analysis/verify_gpu_code.cc b/src/tir/analysis/verify_gpu_code.cc
index f0672f39217a..3377515a9589 100644
--- a/src/tir/analysis/verify_gpu_code.cc
+++ b/src/tir/analysis/verify_gpu_code.cc
@@ -209,6 +209,19 @@ class GPUCodeVerifier : public StmtExprVisitor {
     }
   }
 
+  void VisitExpr_(const CastNode* op) {
+    if (op->dtype.lanes() > 1) {
+      if (static_cast<size_t>(op->dtype.lanes() * op->dtype.bytes()) > max_vector_bytes_) {
+        std::stringstream s;
+        s << "Number of lanes (" << op->dtype.lanes() << ") times number of bytes ("
+          << op->dtype.bytes() << ") for dtype " << op->dtype
+          << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
+        errors_.push_back(s.str());
+      }
+    }
+    ExprVisitor::VisitExpr_(op);
+  }
+
   void VisitExpr_(const BufferLoadNode* op) {
     if (op->dtype.lanes() > 1) {
       if (static_cast<size_t>(op->dtype.lanes() * op->dtype.bytes()) > max_vector_bytes_) {
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index b703c79c5d3a..9edf5877fd5e 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -33,6 +33,7 @@
 )
 from tvm.meta_schedule import postproc, schedule_rule
 from tvm.tir.schedule import BlockRV, Schedule
+from tvm.tir.schedule.analysis import has_block
 from tvm.tir.tensor_intrin.hexagon import VRMPY_u8i8i32_INTRIN, VRMPY_u8u8i32_INTRIN
 
 from ..infrastructure import get_hexagon_target
@@ -206,9 +207,9 @@ def _schedule_packed_8x8x32_conv2d():
 
     def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
         if conv2d_block is None:
-            try:
+            if has_block(sch, "conv2d_NCHWc_int8"):
                 conv2d_block = sch.get_block("conv2d_NCHWc_int8")
-            except ValueError:
+            else:
                 return False
 
         assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
index c17209e2cb77..1baa13793f38 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py
@@ -15,7 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
+import pytest
+
 import tvm
+from tvm.tir import Schedule
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing.space_generation import generate_design_space
 from tvm.script import tir as T
@@ -334,6 +337,101 @@ def main(T_full: T.Buffer[(1, 12, 4096), "int64"]) -> None:
                 T.writes(T_full[ax0, ax1, ax2])
                 T_full[ax0, ax1, ax2] = T.int64(0)
 
+
+@tvm.script.ir_module
+class Conv2dInt8:
+    @T.prim_func
+    def main(p0: T.Buffer[(16, 14, 14, 256), "int8"], p1: T.Buffer[(1024, 1, 1, 256), "int8"], p2: T.Buffer[(1, 1, 1, 1024), "int32"], p3: T.Buffer[(1, 1, 1, 1024), "int32"], p4: T.Buffer[1024, "int32"], p5: T.Buffer[1024, "int32"], p6: T.Buffer[1024, "int32"], p7: T.Buffer[1, "int32"], p8: T.Buffer[(16, 14, 14, 1024), "int32"], compute: T.Buffer[(16, 14, 14, 1024), "int32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        # body
+        # with T.block("root")
+        compile_engine_const = T.alloc_buffer([], dtype="int32")
+        pad_temp = T.alloc_buffer([16, 14, 14, 256], dtype="int8")
+        conv2d_nhwc = T.alloc_buffer([16, 14, 14, 1024], dtype="int32")
+        T_subtract = T.alloc_buffer([16, 14, 14, 1024], dtype="int32")
+        T_add = T.alloc_buffer([16, 14, 14, 1024], dtype="int32")
+        compute_1 = T.alloc_buffer([16, 14, 14, 1024], dtype="int32")
+        T_add_1 = T.alloc_buffer([16, 14, 14, 1024], dtype="int32")
+        compute_2 = T.alloc_buffer([16, 14, 14, 1024], dtype="int32")
+        T_subtract_1 = T.alloc_buffer([16, 14, 14, 1024], dtype="int32")
+        compute_3 = T.alloc_buffer([16, 14, 14, 1024], dtype="int32")
+        T_add_2 = T.alloc_buffer([16, 14, 14, 1024], dtype="int32")
+        with T.block("compile_engine_const"):
+            vi = T.axis.spatial(1, 0)
+            T.reads()
+            T.writes(compile_engine_const[()])
+            compile_engine_const[()] = 59
+        for i0, i1, i2, i3 in T.grid(16, 14, 14, 256):
+            with T.block("pad_temp"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(p0[i0_1, i1_1, i2_1, i3_1])
+                T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1])
+                pad_temp[i0_1, i1_1, i2_1, i3_1] = p0[i0_1, i1_1, i2_1, i3_1]
+        for i0, i1, i2, i3, i4, i5, i6 in T.grid(16, 14, 14, 1024, 1, 1, 256):
+            with T.block("conv2d_nhwc"):
+                nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6])
+                T.reads(pad_temp[nn, yy + ry, xx + rx, rc], p1[ff, ry, rx, rc])
+                T.writes(conv2d_nhwc[nn, yy, xx, ff])
+                with T.init():
+                    conv2d_nhwc[nn, yy, xx, ff] = 0
+                conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + T.cast(pad_temp[nn, yy + ry, xx + rx, rc], "int32") * T.cast(p1[ff, ry, rx, rc], "int32")
+        for i0, i1, i2, i3 in T.grid(16, 14, 14, 1024):
+            with T.block("T_subtract"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], p2[0, 0, 0, ax3])
+                T.writes(T_subtract[ax0, ax1, ax2, ax3])
+                T_subtract[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] - p2[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 14, 14, 1024):
+            with T.block("T_add"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_subtract[ax0, ax1, ax2, ax3], p3[0, 0, 0, ax3])
+                T.writes(T_add[ax0, ax1, ax2, ax3])
+                T_add[ax0, ax1, ax2, ax3] = T_subtract[ax0, ax1, ax2, ax3] + p3[0, 0, 0, ax3]
+        for i0, i1, i2, i3 in T.grid(16, 14, 14, 1024):
+            with T.block("compute"):
+                i0_2, i1_2, i2_2, i3_2 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add[i0_2, i1_2, i2_2, i3_2], p4[i3_2], p5[i3_2], p6[i3_2])
+                T.writes(compute_1[i0_2, i1_2, i2_2, i3_2])
+                compute_1[i0_2, i1_2, i2_2, i3_2] = T.q_multiply_shift_per_axis(T_add[i0_2, i1_2, i2_2, i3_2], p4[i3_2], p5[i3_2], p6[i3_2], 31, False, True, dtype="int32")
+        for i0_3, i1_3, i2_3, i3_3 in T.grid(16, 14, 14, 1024):
+            with T.block("T_add_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_3, i1_3, i2_3, i3_3])
+                T.reads(compile_engine_const[()], compute_1[ax0, ax1, ax2, ax3])
+                T.writes(T_add_1[ax0, ax1, ax2, ax3])
+                T_add_1[ax0, ax1, ax2, ax3] = compile_engine_const[()] + compute_1[ax0, ax1, ax2, ax3]
+        for i0_4, i1_4, i2_4, i3_4 in T.grid(16, 14, 14, 1024):
+            with T.block("compute_1"):
+                i0_5, i1_5, i2_5, i3_5 = T.axis.remap("SSSS", [i0_4, i1_4, i2_4, i3_4])
+                T.reads(T_add_1[i0_5, i1_5, i2_5, i3_5])
+                T.writes(compute_2[i0_5, i1_5, i2_5, i3_5])
+                compute_2[i0_5, i1_5, i2_5, i3_5] = T.max(T.min(T_add_1[i0_5, i1_5, i2_5, i3_5], 255), 0)
+        for i0_6, i1_6, i2_6, i3_6 in T.grid(16, 14, 14, 1024):
+            with T.block("T_subtract_1"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_6, i1_6, i2_6, i3_6])
+                T.reads(compute_2[ax0, ax1, ax2, ax3], p7[0])
+                T.writes(T_subtract_1[ax0, ax1, ax2, ax3])
+                T_subtract_1[ax0, ax1, ax2, ax3] = compute_2[ax0, ax1, ax2, ax3] - p7[0]
+        for i0_7, i1_7, i2_7, i3_7 in T.grid(16, 14, 14, 1024):
+            with T.block("compute_2"):
+                i0_8, i1_8, i2_8, i3_8 = T.axis.remap("SSSS", [i0_7, i1_7, i2_7, i3_7])
+                T.reads(T_subtract_1[i0_8, i1_8, i2_8, i3_8])
+                T.writes(compute_3[i0_8, i1_8, i2_8, i3_8])
+                compute_3[i0_8, i1_8, i2_8, i3_8] = T.q_multiply_shift(T_subtract_1[i0_8, i1_8, i2_8, i3_8], 1408572815, 31, 1, dtype="int32")
+        for i0_9, i1_9, i2_9, i3_9 in T.grid(16, 14, 14, 1024):
+            with T.block("T_add_2"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0_9, i1_9, i2_9, i3_9])
+                T.reads(compute_3[ax0, ax1, ax2, ax3], p8[ax0, ax1, ax2, ax3])
+                T.writes(T_add_2[ax0, ax1, ax2, ax3])
+                T_add_2[ax0, ax1, ax2, ax3] = compute_3[ax0, ax1, ax2, ax3] + p8[ax0, ax1, ax2, ax3]
+        for i0_10, i1_10, i2_10, i3_10 in T.grid(16, 14, 14, 1024):
+            with T.block("compute_3"):
+                i0_11, i1_11, i2_11, i3_11 = T.axis.remap("SSSS", [i0_10, i1_10, i2_10, i3_10])
+                T.reads(T_add_2[i0_11, i1_11, i2_11, i3_11])
+                T.writes(compute[i0_11, i1_11, i2_11, i3_11])
+                compute[i0_11, i1_11, i2_11, i3_11] = T.max(T.min(T_add_2[i0_11, i1_11, i2_11, i3_11], 255), 0)
+
+
 # pylint: enable=no-member,invalid-name,unused-variable,no-self-argument,line-too-long,chained-comparison,not-callable,too-many-nested-blocks
 # fmt: on
 
@@ -398,9 +496,26 @@ def test_inline_constant_tensor():
     tvm.ir.assert_structural_equal(lhs=space.mod, rhs=ConstConsumer)
 
 
+def test_conv2d_int8_inline_constant_scalars():
+    sch = Schedule(Conv2dInt8)
+
+    conv2d = sch.get_block("conv2d_nhwc")
+    sch.cache_write(conv2d, 0, "shared")
+
+    with pytest.raises(tvm.tir.ScheduleError) as e:
+        sch.reverse_compute_inline(sch.get_block("T_add_1"))
+
+    err_msg = "The block is only allowed to read a single buffer region, but it reads 2 region(s)"
+    assert err_msg in str(e)
+
+    ms.schedule_rule.InlineConstantScalars().apply(sch, sch.get_block("compile_engine_const"))
+    sch.reverse_compute_inline(sch.get_block("T_add_1"))
+
+
 if __name__ == "__main__":
     test_inline_consumer_chain()
     test_inline_into_cache()
     test_inline_into_multiple_consumers()
     test_inline_pure_spatial()
     test_inline_constant_tensor()
+    test_conv2d_int8_inline_constant_scalars()

From f950b118aa96cd2c14b02104defd78107403c9f1 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Thu, 10 Nov 2022 16:53:48 -0800
Subject: [PATCH 555/704] [MetaSchedule][Minor] Allow Zero Run Time In
 Benchmarking Result (#13354)

This PR introduces a check to prevent records with run time of zero into the training data of cost model. This is because when working on microTVM there're cases where the run time of certain successful runs is very tiny, such that it got recorded as zero. In such cases, the runtime of 0 would break XGBoost model because it introduces infinite running speed in GFLOPs. A regression test was also added.
---
 .../measure_callback/update_cost_model.cc     |  3 +-
 src/meta_schedule/utils.h                     | 13 +++++++++
 .../test_meta_schedule_measure_callback.py    | 28 ++++++++++++++++++-
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/src/meta_schedule/measure_callback/update_cost_model.cc b/src/meta_schedule/measure_callback/update_cost_model.cc
index 0563699ba6b9..8a8a43658409 100644
--- a/src/meta_schedule/measure_callback/update_cost_model.cc
+++ b/src/meta_schedule/measure_callback/update_cost_model.cc
@@ -42,7 +42,8 @@ class UpdateCostModelNode : public MeasureCallbackNode {
     pruned_candidate.reserve(n);
     pruned_runner_result.reserve(n);
     for (int i = 0; i < n; i++) {
-      if (!builder_results[i]->error_msg.defined()) {
+      if (!builder_results[i]->error_msg.defined() &&
+          Sum(runner_results[i]->run_secs.value()) > 0) {
         pruned_candidate.push_back(measure_candidates[i]);
         pruned_runner_result.push_back(runner_results[i]);
       }
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index 80264516c4ce..969aa630df39 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -540,6 +540,19 @@ inline ScheduleRule GetDefaultAutoInline(const std::string& target_name) {
   throw;
 }
 
+/*!
+ * \brief Summarize the run time of the given FloatImm array.
+ * \param arr The array of FloatImm.
+ * \return The summary of the values in the given array.
+ */
+inline double Sum(const Array<FloatImm>& arr) {
+  double sum = 0;
+  for (const FloatImm& f : arr) {
+    sum += f->value;
+  }
+  return sum;
+}
+
 }  // namespace meta_schedule
 }  // namespace tvm
 
diff --git a/tests/python/unittest/test_meta_schedule_measure_callback.py b/tests/python/unittest/test_meta_schedule_measure_callback.py
index 20596e8e8c4d..c3fbbbe97231 100644
--- a/tests/python/unittest/test_meta_schedule_measure_callback.py
+++ b/tests/python/unittest/test_meta_schedule_measure_callback.py
@@ -16,12 +16,12 @@
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 import re
+import tempfile
 from typing import List
 
 import pytest
 import tvm
 from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.dummy_object import DummyBuilder, DummyRunner
 from tvm.script import tir as T
 from tvm.tir.schedule import Schedule
 
@@ -123,7 +123,33 @@ def apply(
     assert pattern.match(str(measure_callback))
 
 
+def test_meta_schedule_measure_callback_update_cost_model_with_zero():
+    @ms.derived_object
+    class AllZeroRunnerFuture(ms.runner.PyRunnerFuture):
+        def done(self) -> bool:
+            return True
+
+        def result(self) -> ms.runner.RunnerResult:
+            return ms.runner.RunnerResult([0.0, 0.0], None)
+
+    @ms.derived_object
+    class AllZeroRunner(ms.runner.PyRunner):
+        def run(self, runner_inputs: List[ms.runner.RunnerInput]) -> List[ms.runner.RunnerResult]:
+            return [AllZeroRunnerFuture() for _ in runner_inputs]
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        ms.tune_tir(
+            mod=Matmul,
+            target="llvm -num-cores=1",
+            work_dir=work_dir,
+            max_trials_global=10,
+            runner=AllZeroRunner(),
+            measure_callbacks=[ms.measure_callback.UpdateCostModel()],
+        )
+
+
 if __name__ == "__main__":
     test_meta_schedule_measure_callback()
     test_meta_schedule_measure_callback_fail()
     test_meta_schedule_measure_callback_as_string()
+    test_meta_schedule_measure_callback_update_cost_model_with_zero()

From 6d68aff03023c90fcd5cf5d716eb99d43bcafb02 Mon Sep 17 00:00:00 2001
From: Sunghyun Park <49998730+sunggg@users.noreply.github.com>
Date: Thu, 10 Nov 2022 16:54:50 -0800
Subject: [PATCH 556/704] [Bugfix][TIR] Patch for PR#13269 to support Python
 3.10 (#13350)

It seems like there is some inconsistency across the python versions and make PR https://github.com/apache/tvm/pull/13269 fails at Python 3.10.
This patch fixes this issue.

Co-authored-by: Junru Shao <junrushao1994@gmail.com>
---
 python/tvm/tir/schedule/_type_checker.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/python/tvm/tir/schedule/_type_checker.py b/python/tvm/tir/schedule/_type_checker.py
index 4130e76e0892..becf8c095057 100644
--- a/python/tvm/tir/schedule/_type_checker.py
+++ b/python/tvm/tir/schedule/_type_checker.py
@@ -21,29 +21,24 @@
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, Union
 import typing
-import sys
 
 
 def _is_none_type(type_: Any) -> bool:
     return type_ is None or type_ is type(None)
 
 
-def get_python_version():
-    return sys.version_info[:3]
-
-
 if hasattr(typing, "_GenericAlias"):
     # For python versions 3.7 onward, check the __origin__ attribute.
 
     class _Subtype:
         @staticmethod
         def _origin(type_: Any) -> Any:
-            if get_python_version() >= (3, 9, 0):
+            if hasattr(typing, "_SpecialGenericAlias"):
                 if isinstance(type_, typing._SpecialGenericAlias):  # type: ignore # pylint: disable=protected-access
                     return type_.__origin__
-            else:
-                if isinstance(type_, typing._GenericAlias):  # type: ignore # pylint: disable=protected-access
-                    return type_.__origin__
+
+            if isinstance(type_, typing._GenericAlias):  # type: ignore # pylint: disable=protected-access
+                return type_.__origin__
             return None
 
         @staticmethod

From a15663654b4edcefa6431a1c3fb033069d17bc71 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 11 Nov 2022 10:18:44 +0900
Subject: [PATCH 557/704] [MetaSchedule] Unannotate `schedule_rule` if
 corresponding schedule func is not found (#13346)

---
 src/meta_schedule/schedule_rule/apply_custom_rule.cc | 1 +
 tests/python/integration/test_auto_tensorize.py      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/meta_schedule/schedule_rule/apply_custom_rule.cc b/src/meta_schedule/schedule_rule/apply_custom_rule.cc
index b133cd30c1f4..011e96e895d2 100644
--- a/src/meta_schedule/schedule_rule/apply_custom_rule.cc
+++ b/src/meta_schedule/schedule_rule/apply_custom_rule.cc
@@ -54,6 +54,7 @@ class ApplyCustomRuleNode : public ScheduleRuleNode {
           os << "\n  " << GetCustomRuleName(ann.value(), key);
         }
         LOG(WARNING) << os.str();
+        sch->Unannotate(block_rv, "schedule_rule");
       }
     }
     return {sch};
diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py
index 8c06e147c01f..572da53b34fd 100644
--- a/tests/python/integration/test_auto_tensorize.py
+++ b/tests/python/integration/test_auto_tensorize.py
@@ -31,6 +31,7 @@
 from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
 
 SCH_RULES_FOR_VNNI = [
+    ms.schedule_rule.ApplyCustomRule(),
     ms.schedule_rule.AutoInline(
         into_producer=False,
         into_consumer=True,

From f3eb2399897b830cd5a4014ba53ac0cbb3a8826e Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Thu, 10 Nov 2022 20:19:35 -0500
Subject: [PATCH 558/704] [MetaSchedule] Skip empty fx graph in TorchBench
 tuning script (#13356)

---
 .../meta_schedule/testing/torchbench/run.py   | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/python/tvm/meta_schedule/testing/torchbench/run.py b/python/tvm/meta_schedule/testing/torchbench/run.py
index 5df77cf25c3f..65e1a1a59f3e 100644
--- a/python/tvm/meta_schedule/testing/torchbench/run.py
+++ b/python/tvm/meta_schedule/testing/torchbench/run.py
@@ -98,7 +98,7 @@
 import warnings
 from collections import defaultdict
 from enum import Enum
-from typing import Callable, List, Tuple, Dict
+from typing import Callable, Dict, List, Tuple
 
 import numpy as np  # type: ignore
 import torch  # type: ignore
@@ -431,6 +431,20 @@ def forward(*args):
     return forward
 
 
+def should_skip_subgraph(graph_module: torch.fx.GraphModule) -> bool:
+    """
+    Returns whether it should skip optimizing the input graph module.
+    The graph could be empyt or only containing nodes calling function
+    for side effect.
+    """
+    graph = graph_module.graph
+
+    inputs = [n for n in graph.nodes if n.op == "placeholder"]
+    outputs = [n for n in graph.nodes if n.op == "output"]
+
+    return len(inputs) == 0 and all(output.args == ((),) for output in outputs)
+
+
 def create_tvm_task_collection_backend() -> Tuple[Callable, List[ms.ExtractedTask]]:
     """
     This torchdynamo backend only collects the extracted tasks from MetaSchedule.
@@ -461,6 +475,9 @@ def backend(graph_module, example_inputs):
         torch.save(graph_module, os.path.join(subgraphs_dir, f"graph_module_{subgraph_idx}"))
         torch.save(example_inputs, os.path.join(subgraphs_dir, f"example_inputs_{subgraph_idx}"))
 
+        if should_skip_subgraph(graph_module):
+            return graph_module.forward
+
         jit_mod = torch.jit.trace(graph_module, example_inputs)
         shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
         ir_mod, params = tvm.relay.frontend.from_pytorch(jit_mod, shape_list)
@@ -494,6 +511,9 @@ def create_tvm_compilation_backend(database: ms.database.Database) -> Callable:
     """
 
     def backend(graph_module, example_inputs):
+        if should_skip_subgraph(graph_module):
+            return graph_module.forward
+
         jit_mod = torch.jit.trace(graph_module, example_inputs)
         shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
         ir_mod, params = tvm.relay.frontend.from_pytorch(jit_mod, shape_list)

From 5364e5a39a5e33728b7f5a26ddb40543a544ea02 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 11 Nov 2022 12:54:48 +0900
Subject: [PATCH 559/704] [MetaSchedule] Fuse loops around shared to global
 store block in `MultiLevelTilingTensorCore` (#13357)

* Fuse shared to global store loops in MultiLevelTilingTensorCore

* update test
---
 .../postproc/rewrite_cooperative_fetch.cc     | 30 ++++++++++++++
 .../schedule_rule/multi_level_tiling.cc       |  4 +-
 .../multi_level_tiling_tensor_core.cc         |  5 +++
 ...test_meta_schedule_schedule_rule_mlt_tc.py | 39 ++++++++++---------
 4 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc
index 427653b06c2a..353b90c36423 100644
--- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc
+++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc
@@ -82,6 +82,29 @@ bool ParseWarpExecutionAnn(const Schedule& sch, const Instruction& inst) {
   return ann_key == attr::warp_execution;
 }
 
+size_t GetMaxUsedDtypeBytes(Block block) {
+  size_t max_bytes = 1;
+  static auto q_multiply_shift_per_axis = Op::Get("tir.q_multiply_shift_per_axis");
+  static auto q_multiply_shift = Op::Get("tir.q_multiply_shift");
+
+  tir::PostOrderVisit(block->body, [&](const ObjectRef& obj) {
+    if (const auto* store = obj.as<tir::BufferStoreNode>()) {
+      max_bytes = std::max(max_bytes, static_cast<size_t>(store->value->dtype.bytes()));
+    } else if (const auto* load = obj.as<tir::BufferLoadNode>()) {
+      max_bytes = std::max(max_bytes, static_cast<size_t>(load->dtype.bytes()));
+    } else if (const auto* call = obj.as<tir::CallNode>()) {
+      if (call->op.same_as(q_multiply_shift_per_axis) || call->op.same_as(q_multiply_shift)) {
+        // q_multiply_shift uses 64 bit multiply
+        max_bytes = std::max<size_t>(max_bytes, 8);
+      }
+    } else if (const auto* cast = obj.as<tir::CastNode>()) {
+      max_bytes = std::max<size_t>(max_bytes, cast->dtype.bytes());
+    }
+  });
+
+  return max_bytes;
+}
+
 }  // namespace tir
 
 namespace meta_schedule {
@@ -154,6 +177,13 @@ bool RewriteCooperativeFetchNode::Apply(const tir::Schedule& sch) {
       if (fused_extent % vector_lane != 0) {
         vector_lane = 1;
       }
+      // If the block involves 64 bit values, disable vectorization for now since
+      // vectorization of 64 bit values does not work well on CUDA.
+      // TODO(masahi, vinx13): Decouple epilogue fusion computation and shared to global store, so
+      // that we can always vectorize the latter.
+      if (tir::GetMaxUsedDtypeBytes(sch->Get(block)) > 4) {
+        vector_lane = 1;
+      }
       if (thread_extent_y != -1) {
         if (vector_lane > 1) {
           Array<tir::LoopRV> split = sch->Split(fused, {NullOpt,                   //
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
index 9141c92de12c..fe24357fcad5 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -269,8 +269,8 @@ std::vector<State> MultiLevelTilingNode::AddReadReuse(State state) const {
       sch->ComputeAt(cache_read_block, loop_rv, true);
       // Fuse the iterators of the cache_read
       Array<LoopRV> buffer_loops = sch->GetLoops(cache_read_block);
-      LoopRV fused = sch->Fuse(Array<LoopRV>{buffer_loops.end() - buffer_ndim,  //
-                                             buffer_loops.end()});
+      sch->Fuse(Array<LoopRV>{buffer_loops.end() - buffer_ndim,  //
+                              buffer_loops.end()});
       AnnotateCooperativeFetching(&sch, cache_read_block);
       new_state->read_reuse.emplace(i, cache_read_block);
     }
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 37c35248329a..d5cca52d41f9 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -258,6 +258,11 @@ std::vector<State> MultiLevelTilingTensorCoreNode::AddWriteReuseTensorCore(
   sch->ReverseComputeAt(cache_write, loop, true);
 
   if (state->write_reuse.count(0)) {
+    // Fuse the iterators of the cache_write
+    Array<LoopRV> buffer_loops = sch->GetLoops(state->write_reuse[0]);
+    ICHECK_GT(buffer_loops.size(), 2);
+    sch->Fuse(Array<LoopRV>{buffer_loops.end() - 2,  // The src shmem is always 2D
+                            buffer_loops.end()});
     AnnotateCooperativeFetching(&sch, state->write_reuse[0]);
   }
   sch->ReverseComputeInline(state->tensor_core_reindex_store);
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
index 0e4bd6bf302a..acc626b904a1 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py
@@ -162,14 +162,15 @@ def matmul_relu_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "f
                                     T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
-                for ax0, ax1 in T.grid(32, 32):
+                for ax0_ax1_fused in T.serial(1024):
                     with T.block("C_reindex_shared"):
-                        v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0)
-                        v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax1)
+                        v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0_ax1_fused // 32)
+                        v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0_ax1_fused % 32)
                         T.reads(C_reindex_shared[v0, v1])
                         T.writes(compute[v0, v1])
                         T.block_attr({"meta_schedule.cooperative_fetch":4})
                         compute[v0, v1] = T.max(C_reindex_shared[v0, v1], T.float32(0))
+
     # fmt: on
     decision_0 = [
         ("SamplePerfectTile", [4, 1, 1, 1, 2]),
@@ -303,10 +304,10 @@ def matmul_relu_fallback_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128,
                                     T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
-                for ax0, ax1 in T.grid(32, 128):
+                for ax0_ax1_fused in T.serial(4096):
                     with T.block("C_reindex_shared"):
-                        v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0)
-                        v1 = T.axis.spatial(128, ax1)
+                        v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0_ax1_fused // 128)
+                        v1 = T.axis.spatial(128, ax0_ax1_fused % 128)
                         T.reads(C_reindex_shared[v0, v1])
                         T.writes(compute[v0, v1])
                         T.block_attr({"meta_schedule.cooperative_fetch":4})
@@ -451,10 +452,10 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3,
                                     T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
-                for ax0, ax1 in T.grid(16, 16):
+                for ax0_ax1_fused in T.serial(256):
                     with T.block("conv2d_nhwc_reindex_shared"):
-                        v0 = T.axis.spatial(256, ax0_0_1_ax1_0_1_fused * 16 + ax0)
-                        v1 = T.axis.spatial(32, ax0_0_0_ax1_0_0_fused * 16 + ax1)
+                        v0 = T.axis.spatial(256, ax0_0_1_ax1_0_1_fused * 16 + ax0_ax1_fused // 16)
+                        v1 = T.axis.spatial(32, ax0_0_0_ax1_0_0_fused * 16 +  ax0_ax1_fused % 16)
                         T.reads(conv2d_nhwc_reindex_shared[v0, v1])
                         T.writes(conv2d_nhwc[v0 // 256, v0 // 16, v0 % 16, v1])
                         T.block_attr({"meta_schedule.cooperative_fetch":3})
@@ -617,10 +618,10 @@ def matmul_relu_pipeline_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128,
                                     T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
-                for ax0, ax1 in T.grid(32, 32):
+                for ax0_ax1_fused in T.grid(1024):
                     with T.block("C_reindex_shared"):
-                        v0 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused // 4 * 32 + ax0)
-                        v1 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused % 4 * 32 + ax1)
+                        v0 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused // 4 * 32 + ax0_ax1_fused // 32)
+                        v1 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused % 4 * 32 + ax0_ax1_fused % 32)
                         T.reads(C_reindex_shared[v0, v1])
                         T.writes(C[v0, v1])
                         T.block_attr({"meta_schedule.cooperative_fetch":3})
@@ -919,11 +920,11 @@ def padded_matmul_relu_0(A: T.Buffer[(127, 127), "float16"], B: T.Buffer[(127, 1
                                     T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
-                for ax0, ax1 in T.grid(32, 32):
+                for ax0_ax1_fused in T.serial(1024):
                     with T.block("C_reindex_shared"):
-                        T.where(ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0 < 127 and ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax1 < 127)
-                        v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0)
-                        v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax1)
+                        T.where(ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0_ax1_fused // 32 < 127 and ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0_ax1_fused % 32 < 127)
+                        v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0_ax1_fused // 32)
+                        v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0_ax1_fused % 32)
                         T.reads(C_reindex_shared[v0, v1])
                         T.writes(compute[v0, v1])
                         T.block_attr({"meta_schedule.cooperative_fetch":4})
@@ -1063,10 +1064,10 @@ def conv2d_1x1_0(inputs: T.Buffer[(1, 16, 16, 64), "float16"], weight: T.Buffer[
                                     T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i])
                                     conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]
-                for ax0, ax1 in T.grid(16, 32):
+                for ax0_ax1_fused in T.serial(512):
                     with T.block("conv2d_nhwc_reindex_shared"):
-                        v0 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused // 2 * 32 + ax2_0_1_ax3_0_1_fused * 16 + ax0)
-                        v1 = T.axis.spatial(64, ax2_0_0_ax3_0_0_fused % 2 * 32 + ax1)
+                        v0 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused // 2 * 32 + ax2_0_1_ax3_0_1_fused * 16 + ax0_ax1_fused // 32)
+                        v1 = T.axis.spatial(64, ax2_0_0_ax3_0_0_fused % 2 * 32 + ax0_ax1_fused % 32)
                         T.reads(conv2d_nhwc_reindex_shared[v0, v1])
                         T.writes(conv2d_nhwc[v0 // 256, v0 // 16, v0 % 16, v1])
                         T.block_attr({"meta_schedule.cooperative_fetch":2})

From 45327127d5d30819752d685d6ef06247a573552e Mon Sep 17 00:00:00 2001
From: wrongtest <wrongtest0@gmail.com>
Date: Fri, 11 Nov 2022 12:55:50 +0800
Subject: [PATCH 560/704] [TIR][Schedule] Make consistent implementation for
 GetProducers() & GetConsumers() (#13344)

Currently there are two versions of `GetConsumers()` and `GetProducers()` implementation. Make them consistent to avoid possible bug when there are WAR dependencies.
---
 src/tir/schedule/analysis/analysis.cc        | 34 +++++++++++++-------
 src/tir/schedule/primitive/get_block_loop.cc | 27 ++--------------
 2 files changed, 25 insertions(+), 36 deletions(-)

diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc
index 56e42d4052fb..744801596ebd 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -1014,23 +1014,33 @@ std::pair<Array<StmtSRef>, std::vector<int>> CollectComputeLocation(const Schedu
 /******** Producer-consumer relation ********/
 
 Array<StmtSRef> GetProducers(const StmtSRef& block_sref, const BlockScope& scope) {
-  Array<Dependency> deps = scope->GetDepsByDst(block_sref);
-  Array<StmtSRef> result;
-  result.reserve(deps.size());
-  for (const Dependency& dep : deps) {
-    result.push_back(dep->src);
+  Array<Dependency> edges = scope->GetDepsByDst(block_sref);
+  Array<StmtSRef> results;
+  std::unordered_set<StmtSRef, ObjectPtrHash, ObjectPtrEqual> result_set;
+  results.reserve(edges.size());
+  for (const Dependency& edge : edges) {
+    if ((edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) &&
+        !result_set.count(edge->src)) {
+      results.push_back(edge->src);
+      result_set.emplace(edge->src);
+    }
   }
-  return result;
+  return results;
 }
 
 Array<StmtSRef> GetConsumers(const StmtSRef& block_sref, const BlockScope& scope) {
-  Array<Dependency> deps = scope->GetDepsBySrc(block_sref);
-  Array<StmtSRef> result;
-  result.reserve(deps.size());
-  for (const Dependency& dep : deps) {
-    result.push_back(dep->dst);
+  Array<Dependency> edges = scope->GetDepsBySrc(block_sref);
+  Array<StmtSRef> results;
+  std::unordered_set<StmtSRef, ObjectPtrHash, ObjectPtrEqual> result_set;
+  results.reserve(edges.size());
+  for (const Dependency& edge : edges) {
+    if ((edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) &&
+        !result_set.count(edge->dst)) {
+      results.push_back(edge->dst);
+      result_set.emplace(edge->dst);
+    }
   }
-  return result;
+  return results;
 }
 
 ProducerConsumerSplit ProducerConsumerSplit::Find(
diff --git a/src/tir/schedule/primitive/get_block_loop.cc b/src/tir/schedule/primitive/get_block_loop.cc
index ecbadce470b9..72f43a8d4929 100644
--- a/src/tir/schedule/primitive/get_block_loop.cc
+++ b/src/tir/schedule/primitive/get_block_loop.cc
@@ -16,6 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include "../analysis.h"
 #include "../utils.h"
 
 namespace tvm {
@@ -79,34 +80,12 @@ Array<StmtSRef> GetChildBlocks(const ScheduleState& self, const StmtSRef& parent
 
 Array<StmtSRef> GetProducers(const ScheduleState& self, const StmtSRef& block_sref) {
   StmtSRef scope_root = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
-  Array<Dependency> edges = self->GetBlockScope(scope_root)->GetDepsByDst(block_sref);
-  Array<StmtSRef> results;
-  std::unordered_set<StmtSRef, ObjectPtrHash, ObjectPtrEqual> result_set;
-  results.reserve(edges.size());
-  for (const Dependency& edge : edges) {
-    if ((edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) &&
-        !result_set.count(edge->src)) {
-      results.push_back(edge->src);
-      result_set.emplace(edge->src);
-    }
-  }
-  return results;
+  return tir::GetProducers(block_sref, self->GetBlockScope(scope_root));
 }
 
 Array<StmtSRef> GetConsumers(const ScheduleState& self, const StmtSRef& block_sref) {
   StmtSRef scope_root = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
-  Array<Dependency> edges = self->GetBlockScope(scope_root)->GetDepsBySrc(block_sref);
-  Array<StmtSRef> results;
-  std::unordered_set<StmtSRef, ObjectPtrHash, ObjectPtrEqual> result_set;
-  results.reserve(edges.size());
-  for (const Dependency& edge : edges) {
-    if ((edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) &&
-        !result_set.count(edge->dst)) {
-      results.push_back(edge->dst);
-      result_set.emplace(edge->dst);
-    }
-  }
-  return results;
+  return tir::GetConsumers(block_sref, self->GetBlockScope(scope_root));
 }
 
 /******** InstructionKind Registration ********/

From f9ed60aaef5399b0647aeeef68f86bda87653346 Mon Sep 17 00:00:00 2001
From: "Xiangxi Guo (Ryan)" <ryan.guo99@gmail.com>
Date: Thu, 10 Nov 2022 23:22:12 -0600
Subject: [PATCH 561/704] [Relay] Refactor constant folding over expr into a
 utility function (#13343)

---
 .../backend/contrib/constant_transforms.cc    | 10 +---
 .../backend/contrib/constant_transforms.h     |  9 ---
 .../contrib/ethosn/convert_equivalent.cc      |  4 +-
 src/relay/quantize/realize.cc                 | 18 ++----
 src/relay/quantize/realize.h                  |  2 -
 src/relay/transforms/fold_constant.cc         | 20 +++----
 src/relay/transforms/fold_constant.h          | 55 +++++++++++++++++++
 src/relay/transforms/simplify_expr.cc         | 11 +---
 8 files changed, 78 insertions(+), 51 deletions(-)
 create mode 100644 src/relay/transforms/fold_constant.h

diff --git a/src/relay/backend/contrib/constant_transforms.cc b/src/relay/backend/contrib/constant_transforms.cc
index 6041d37451aa..45669b5ef271 100644
--- a/src/relay/backend/contrib/constant_transforms.cc
+++ b/src/relay/backend/contrib/constant_transforms.cc
@@ -21,6 +21,7 @@
 
 #include <string>
 
+#include "../../transforms/fold_constant.h"
 #include "../../transforms/pattern_utils.h"
 #include "../../transforms/simplify_expr.h"
 
@@ -33,13 +34,6 @@ namespace tvm {
 namespace relay {
 namespace contrib {
 
-Expr FoldConstantExpr(const Expr& expr, bool fold_qnn) {
-  auto mod = IRModule::FromExpr(expr);
-  mod = transform::FoldConstant(fold_qnn)(mod);
-  auto entry_func = Downcast<Function>(mod->Lookup("main"));
-  return expr.as<FunctionNode>() == nullptr ? entry_func->body : entry_func;
-}
-
 Constant TransposeWeights(const Constant& data, const std::string& source_layout,
                           const std::string& target_layout) {
   Array<Integer> transpose_matrix;
@@ -48,7 +42,7 @@ Constant TransposeWeights(const Constant& data, const std::string& source_layout
     transpose_matrix.push_back(pos);
   }
   Expr transpose = MakeTranspose(data, transpose_matrix);
-  transpose = InferType(FoldConstantExpr(transpose));
+  transpose = InferType(transform::FoldConstantExpr(transpose));
   Constant transposed_data = Downcast<Constant>(transpose);
   return transposed_data;
 }
diff --git a/src/relay/backend/contrib/constant_transforms.h b/src/relay/backend/contrib/constant_transforms.h
index 39a9dc1d53d4..f642564115b6 100644
--- a/src/relay/backend/contrib/constant_transforms.h
+++ b/src/relay/backend/contrib/constant_transforms.h
@@ -33,15 +33,6 @@ namespace tvm {
 namespace relay {
 namespace contrib {
 
-/*!
- * \brief Apply constant folding on an expression.
- *
- * \param expr The expression to fold.
- * \param fold_qnn Whether to fold constants for QNN operations.
- * \returns The new folded expression.
- */
-Expr FoldConstantExpr(const Expr& expr, bool fold_qnn = true);
-
 /*!
  *\brief Transpose weights from `source_layout` to `target_layout`
  *
diff --git a/src/relay/backend/contrib/ethosn/convert_equivalent.cc b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
index 14d94192c84e..ef8c4a5ef567 100644
--- a/src/relay/backend/contrib/ethosn/convert_equivalent.cc
+++ b/src/relay/backend/contrib/ethosn/convert_equivalent.cc
@@ -30,9 +30,9 @@
 #include <unordered_map>
 
 #include "../../../qnn/utils.h"
+#include "../../../transforms/fold_constant.h"
 #include "../../../transforms/pattern_utils.h"
 #include "../../../transforms/simplify_expr.h"
-#include "../constant_transforms.h"
 #include "ethosn_api.h"
 
 namespace tvm {
@@ -176,7 +176,7 @@ Optional<Expr> ConvertQnnAddToDepthwise(const Expr& expr) {
   Expr reshape_bias = MakeReshape(requantize_bias, {channels});
 
   try {
-    reshape_bias = FoldConstantExpr(reshape_bias);
+    reshape_bias = transform::FoldConstantExpr(reshape_bias);
   } catch (tvm::Error& e) {
     // Conversion produced an invalid op.
     return NullOpt;
diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc
index 720ef25cd33d..3c2f6eb96d6b 100644
--- a/src/relay/quantize/realize.cc
+++ b/src/relay/quantize/realize.cc
@@ -33,6 +33,7 @@
 
 #include "../op/annotation/annotation.h"
 #include "../qnn/utils.h"
+#include "../transforms/fold_constant.h"
 #include "./quantize.h"
 
 namespace tvm {
@@ -154,13 +155,6 @@ Expr QuantizeRealize(const Call& ref_call, const Array<Expr>& new_args, const Ob
   return QRealizeIntExpr(round_data, dom_scale, DataType::Float(32));
 }
 
-Expr FoldConstantOpt(const Expr& expr) {
-  auto mod = IRModule::FromExpr(expr);
-  mod = transform::FoldConstant()(mod);
-  auto entry_func = Downcast<Function>(mod->Lookup("main"));
-  return expr.as<FunctionNode>() == nullptr ? entry_func->body : entry_func;
-}
-
 RELAY_REGISTER_OP("relay.op.annotation.simulated_quantize")
     .set_attr<FForwardRewrite>("FQRealizeRewrite", QuantizeRealize);
 
@@ -184,7 +178,7 @@ Expr Conv2dRealize(const Call& ref_call, const Array<Expr>& new_args, const Obje
 
     Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args);
     Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale);
-    Expr dom_scale = FoldConstantOpt(mul);
+    Expr dom_scale = FoldConstantExpr(mul);
     return QRealizeIntExpr(ret, dom_scale, out_dtype);
   }
   ICHECK(!new_args[0]->IsInstance<TempExprNode>() || !new_args[1]->IsInstance<TempExprNode>());
@@ -218,7 +212,7 @@ Expr Conv1dRealize(const Call& ref_call, const Array<Expr>& new_args, const Obje
 
   Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args);
   Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale);
-  Expr dom_scale = FoldConstantOpt(mul);
+  Expr dom_scale = FoldConstantExpr(mul);
   return QRealizeIntExpr(ret, dom_scale, out_dtype);
 }
 
@@ -247,7 +241,7 @@ Expr DenseRealize(const Call& ref_call, const Array<Expr>& new_args, const Objec
 
   Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args);
   Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale);
-  Expr dom_scale = FoldConstantOpt(mul);
+  Expr dom_scale = FoldConstantExpr(mul);
   return QRealizeIntExpr(ret, dom_scale, out_dtype);
 }
 
@@ -273,7 +267,7 @@ Expr MulRealize(const Call& ref_call, const Array<Expr>& new_args, const ObjectR
 
     Expr ret = ForwardOp(ref_call, {ldata, rdata});
     Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale);
-    Expr dom_scale = FoldConstantOpt(mul);
+    Expr dom_scale = FoldConstantExpr(mul);
     return QRealizeIntExpr(ret, dom_scale, dtype);
   }
   ICHECK(!new_args[0]->IsInstance<TempExprNode>() || !new_args[1]->IsInstance<TempExprNode>());
@@ -527,7 +521,7 @@ Expr BatchMatmulRealize(const Call& ref_call, const Array<Expr>& new_args, const
 
   Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args);
   Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale);
-  Expr dom_scale = FoldConstantOpt(mul);
+  Expr dom_scale = FoldConstantExpr(mul);
   return QRealizeIntExpr(ret, dom_scale, out_dtype);
 }
 
diff --git a/src/relay/quantize/realize.h b/src/relay/quantize/realize.h
index 16fdf79b246e..6eba69e9c9b1 100644
--- a/src/relay/quantize/realize.h
+++ b/src/relay/quantize/realize.h
@@ -69,8 +69,6 @@ class QRealizeIntExpr : public QRealizeExpr {
   TVM_DEFINE_OBJECT_REF_METHODS(QRealizeIntExpr, QRealizeExpr, QRealizeIntExprNode);
 };
 
-Expr FoldConstantOpt(const Expr& expr);
-
 }  // namespace quantize
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc
index 9dec840be0a7..aee402836f89 100644
--- a/src/relay/transforms/fold_constant.cc
+++ b/src/relay/transforms/fold_constant.cc
@@ -418,14 +418,6 @@ class ConstantFolder : public MixedModeMutator {
 
 TVM_REGISTER_GLOBAL("relay.analysis.check_constant").set_body_typed(IsComplexConstant);
 
-/*!
- * \brief Returns \p expr with any constants expressions evaluated and let-bound constants
- * inlined. Returns \p expr unchanged if no change.
- *
- * CAUTION: The importers rely on this function returning \p expr unchanged to preserve sharing
- * from their p.o.v. Furthermore, this function can be called before conversion to ANF so
- * we must avoid all recursion.
- */
 Expr FoldConstantExpr(const Expr& expr, const IRModule& mod, bool fold_qnn) {
   VLOG_CONTEXT << "FoldConstantExpr";
   VLOG(1) << "folding:" << std::endl << PrettyPrint(expr);
@@ -434,11 +426,19 @@ Expr FoldConstantExpr(const Expr& expr, const IRModule& mod, bool fold_qnn) {
   return result;
 }
 
-TVM_REGISTER_GLOBAL("relay._transform.FoldConstantExpr").set_body_typed(FoldConstantExpr);
+Expr FoldConstantExpr(const Expr& expr, bool fold_qnn) {
+  auto mod = IRModule::FromExpr(expr);
+  return FoldConstantExpr(expr, mod, fold_qnn);
+}
+
+TVM_REGISTER_GLOBAL("relay._transform.FoldConstantExpr")
+    .set_body_typed([](const Expr& expr, const IRModule& mod, bool fold_qnn) {
+      return FoldConstantExpr(expr, mod, fold_qnn);
+    });
 
 Pass FoldConstant(bool fold_qnn) {
   runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
-      [=](Function f, IRModule m, PassContext pc) {
+      [=](Function f, IRModule m, PassContext /* pc */) {
         return Downcast<Function>(FoldConstantExpr(f, m, fold_qnn));
       };
   return CreateFunctionPass(pass_func, 2, "FoldConstant", {});
diff --git a/src/relay/transforms/fold_constant.h b/src/relay/transforms/fold_constant.h
new file mode 100644
index 000000000000..4f475037d195
--- /dev/null
+++ b/src/relay/transforms/fold_constant.h
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file fold_constant.h
+ * \brief Utility functions for folding constants in expressions.
+ */
+#ifndef TVM_RELAY_TRANSFORMS_FOLD_CONSTANT_H_
+#define TVM_RELAY_TRANSFORMS_FOLD_CONSTANT_H_
+
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace relay {
+namespace transform {
+
+/*!
+ * \brief Apply constant folding on an expression.
+ *
+ * \param expr The expression to fold.
+ * \param fold_qnn Whether to fold constants for QNN operations.
+ * \returns The new folded expression.
+ */
+Expr FoldConstantExpr(const Expr& expr, bool fold_qnn = true);
+
+/*!
+ * \brief Returns \p expr with any constants expressions evaluated and let-bound constants
+ * inlined. Returns \p expr unchanged if no change.
+ *
+ * CAUTION: The importers rely on this function returning \p expr unchanged to preserve sharing
+ * from their p.o.v. Furthermore, this function can be called before conversion to ANF so
+ * we must avoid all recursion.
+ */
+Expr FoldConstantExpr(const Expr& expr, const IRModule& mod, bool fold_qnn);
+
+}  // namespace transform
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_TRANSFORMS_FOLD_CONSTANT_H_
diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index 923a18f7bc93..c64957b5b62a 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -37,6 +37,7 @@
 #include <utility>
 
 #include "../op/tensor/transform.h"
+#include "fold_constant.h"
 #include "pattern_utils.h"
 
 namespace tvm {
@@ -795,10 +796,7 @@ class SwitchAddMultiply : public DFPatternRewrite {
     }
 
     Expr const_expr = Call(Op::Get("multiply"), {c1, c2});
-    IRModule const_mod = IRModule::FromExpr(const_expr);
-    const_mod = transform::FoldConstant()(const_mod);
-    GlobalVar const_main = const_mod->GetGlobalVar("main");
-    Expr const_val = Downcast<Function>(const_mod->functions[const_main])->body;
+    Expr const_val = transform::FoldConstantExpr(const_expr);
 
     return Call(Op::Get("add"), {Call(Op::Get("multiply"), {x, c2}), const_val});
   }
@@ -833,10 +831,7 @@ class SimplifyAdjacentMultiplyOrAdd : public DFPatternRewrite {
     }
 
     Expr const_expr = Call(call->op, {c1, c2});
-    IRModule const_mod = IRModule::FromExpr(const_expr);
-    const_mod = transform::FoldConstant()(const_mod);
-    GlobalVar const_main = const_mod->GetGlobalVar("main");
-    Expr const_val = Downcast<Function>(const_mod->functions[const_main])->body;
+    Expr const_val = transform::FoldConstantExpr(const_expr);
 
     return Call(call->op, {x, const_val});
   }

From ce0e9abea0ae19554b6b735691319a5f9e670058 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Thu, 10 Nov 2022 23:33:01 -0800
Subject: [PATCH 562/704] [TIR] Make syntax of AST nodes different than ops
 (#13358)

As part of effort of more formal TIR semantics, we want to more
explicitly differentiate TIR AST nodes (defined in `tir/expr.h`)
and TIR ops (defined in `tir/op.h`).

A naming convention is that:
- Lowercased methods, for example, `tvm.tir.mul`, means an TIR op, which
  will be eagerly constant-folded, i.e. `mul(1, 2)` returns `3`
  immediately rather than creating an AST node.
- Capitalized callable, for example, `Mul`, means creating an AST node
  without constant folding.

This PR makes this behavior more explictly by printing `T.Mul(a, b)`
directly when `a` and `b` are both constants, rather than sugaring it
into `mul(a. b)` or `a * b`, so that the difference between an op and
an AST node is clarified.

Co-authored-by: Yaxing Cai <caiyaxing666@gmail.com>

Co-authored-by: Yaxing Cai <caiyaxing666@gmail.com>
---
 python/tvm/script/tir/intrin.py               | 80 ++++++++++++++-
 src/printer/tvmscript_printer.cc              | 97 +++++++++++--------
 .../test_hexagon/test_async_dma_pipeline.py   | 17 ++--
 .../test_parallel_hvx_load_vtcm.py            | 49 +++-------
 .../unittest/test_aot_legalize_packed_call.py | 12 +--
 .../unittest/test_meta_schedule_space_cuda.py |  2 +-
 ..._tir_transform_inject_software_pipeline.py | 16 +--
 ...est_tir_transform_inject_virtual_thread.py | 17 ++--
 .../test_tir_transform_thread_sync.py         |  2 +-
 .../unittest/test_tvmscript_roundtrip.py      |  8 +-
 10 files changed, 186 insertions(+), 114 deletions(-)

diff --git a/python/tvm/script/tir/intrin.py b/python/tvm/script/tir/intrin.py
index bd9aa1fdadfd..8e24f27325bd 100644
--- a/python/tvm/script/tir/intrin.py
+++ b/python/tvm/script/tir/intrin.py
@@ -17,12 +17,13 @@
 """TVM Script Parser Intrinsic Classes"""
 # pylint: disable=redefined-builtin, relative-beyond-top-level
 import builtins
-from typing import List, Any
+from typing import Any, List
 
 import tvm.tir
 from tvm.tir import FloatImm
-from ..registry import register
+
 from ...target import codegen
+from ..registry import register
 from ..utils import get_param_list, tvm_span_from_synr
 
 
@@ -229,3 +230,78 @@ def comm_reducer(lambda_io, identities, span):
 def llvm_lookup_intrinsic_id(name, span):
     # pylint: disable=unused-argument
     return codegen.llvm_lookup_intrinsic_id(name)
+
+
+@register
+def FloorMod(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.FloorMod(x, y, span)
+
+
+@register
+def FloorDiv(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.FloorDiv(x, y, span)
+
+
+@register
+def Mul(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.Mul(x, y, span)
+
+
+@register
+def Div(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.Div(x, y, span)
+
+
+@register
+def Add(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.Add(x, y, span)
+
+
+@register
+def Sub(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.Sub(x, y, span)
+
+
+@register
+def LT(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.LT(x, y, span)
+
+
+@register
+def LE(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.LE(x, y, span)
+
+
+@register
+def GT(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.GT(x, y, span)
+
+
+@register
+def GE(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.GE(x, y, span)
+
+
+@register
+def EQ(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.EQ(x, y, span)
+
+
+@register
+def NE(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.NE(x, y, span)
+
+
+@register
+def And(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.And(x, y, span)
+
+
+@register
+def Or(x, y, span):  # pylint: disable=invalid-name
+    return tvm.tir.Or(x, y, span)
+
+
+@register
+def Cast(dtype, value, span):  # pylint: disable=invalid-name
+    return tvm.tir.Cast(dtype, value, span)
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index 64a576ef52f5..d7a3a406e352 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -788,7 +788,7 @@ Doc TVMScriptPrinter::VisitExpr_(const StringImmNode* op, ExprPrecedence* out_pr
 Doc TVMScriptPrinter::VisitExpr_(const CastNode* op, ExprPrecedence* out_precedence) {
   *out_precedence = ExprPrecedence::kIdentity;
   Doc doc;
-  doc << tir_prefix_ << ".cast(" << Print(op->value) << ", " << PrintDType(op->dtype) << ")";
+  doc << tir_prefix_ << ".Cast(" << PrintDType(op->dtype) << ", " << Print(op->value) << ")";
   return doc;
 }
 
@@ -798,46 +798,61 @@ Doc TVMScriptPrinter::VisitExpr_(const VarNode* op, ExprPrecedence* out_preceden
   return meta_.InMeta(var) ? meta_.GetMetaNode(var) : AllocVar(GetRef<Var>(op));
 }
 
-#define TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(OpName, OpString, OpPrecedence)            \
-  Doc TVMScriptPrinter::VisitExpr_(const OpName* op, ExprPrecedence* out_precedence) { \
-    Doc doc;                                                                           \
-    ExprPrecedence lhs_precedence = ExprPrecedence::kUnknown;                          \
-    ExprPrecedence rhs_precedence = ExprPrecedence::kUnknown;                          \
-    /* Get children expr out_precedence */                                             \
-    Doc lhs_doc = VisitExpr(op->a, &lhs_precedence);                                   \
-    Doc rhs_doc = VisitExpr(op->b, &rhs_precedence);                                   \
-    ICHECK(lhs_precedence != ExprPrecedence::kUnknown);                                \
-    ICHECK(rhs_precedence != ExprPrecedence::kUnknown);                                \
-    /* Update out_precedence of current node. */                                       \
-    *out_precedence = OpPrecedence;                                                    \
-    if (lhs_precedence > OpPrecedence) {                                               \
-      doc << "(" << lhs_doc << ")";                                                    \
-    } else {                                                                           \
-      doc << lhs_doc;                                                                  \
-    }                                                                                  \
-    doc << OpString;                                                                   \
-    if (rhs_precedence >= OpPrecedence) {                                              \
-      doc << "(" << rhs_doc << ")";                                                    \
-    } else {                                                                           \
-      doc << rhs_doc;                                                                  \
-    }                                                                                  \
-    return doc;                                                                        \
-  }
-
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(MulNode, " * ", ExprPrecedence::kMultiplicationDivision)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(DivNode, " / ", ExprPrecedence::kMultiplicationDivision)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(FloorDivNode, " // ", ExprPrecedence::kMultiplicationDivision)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(FloorModNode, " % ", ExprPrecedence::kMultiplicationDivision)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(AddNode, " + ", ExprPrecedence::kAdditionSubtraction)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(SubNode, " - ", ExprPrecedence::kAdditionSubtraction)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(LTNode, " < ", ExprPrecedence::kRelational)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(LENode, " <= ", ExprPrecedence::kRelational)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(GTNode, " > ", ExprPrecedence::kRelational)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(GENode, " >= ", ExprPrecedence::kRelational)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(EQNode, " == ", ExprPrecedence::kEquality)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(NENode, " != ", ExprPrecedence::kEquality)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(AndNode, " and ", ExprPrecedence::kAnd)
-TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(OrNode, " or ", ExprPrecedence::kOr)
+bool WillPrintConstScalar(const PrimExpr& expr) {
+  if (const auto* imm = expr.as<IntImmNode>()) {
+    DataType dtype = imm->dtype;
+    return dtype == DataType::Int(32) || dtype == DataType::Bool();
+  }
+  return false;
+}
+
+#define TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(OpName, OpString, OpClass, OpPrecedence)              \
+  Doc TVMScriptPrinter::VisitExpr_(const OpName* op, ExprPrecedence* out_precedence) {            \
+    Doc doc;                                                                                      \
+    if (WillPrintConstScalar(op->a) && WillPrintConstScalar(op->b)) {                             \
+      *out_precedence = ExprPrecedence::kIdentity;                                                \
+      doc << tir_prefix_ << "." << OpClass << "(" << Print(op->a) << ", " << Print(op->b) << ")"; \
+      return doc;                                                                                 \
+    }                                                                                             \
+    ExprPrecedence lhs_precedence = ExprPrecedence::kUnknown;                                     \
+    ExprPrecedence rhs_precedence = ExprPrecedence::kUnknown;                                     \
+    /* Get children expr out_precedence */                                                        \
+    Doc lhs_doc = VisitExpr(op->a, &lhs_precedence);                                              \
+    Doc rhs_doc = VisitExpr(op->b, &rhs_precedence);                                              \
+    ICHECK(lhs_precedence != ExprPrecedence::kUnknown);                                           \
+    ICHECK(rhs_precedence != ExprPrecedence::kUnknown);                                           \
+    /* Update out_precedence of current node. */                                                  \
+    *out_precedence = OpPrecedence;                                                               \
+    if (lhs_precedence > OpPrecedence) {                                                          \
+      doc << "(" << lhs_doc << ")";                                                               \
+    } else {                                                                                      \
+      doc << lhs_doc;                                                                             \
+    }                                                                                             \
+    doc << OpString;                                                                              \
+    if (rhs_precedence >= OpPrecedence) {                                                         \
+      doc << "(" << rhs_doc << ")";                                                               \
+    } else {                                                                                      \
+      doc << rhs_doc;                                                                             \
+    }                                                                                             \
+    return doc;                                                                                   \
+  }
+
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(MulNode, " * ", "Mul", ExprPrecedence::kMultiplicationDivision)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(DivNode, " / ", "Div", ExprPrecedence::kMultiplicationDivision)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(FloorDivNode, " // ", "FloorDiv",
+                                    ExprPrecedence::kMultiplicationDivision)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(FloorModNode, " % ", "FloorMod",
+                                    ExprPrecedence::kMultiplicationDivision)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(AddNode, " + ", "Add", ExprPrecedence::kAdditionSubtraction)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(SubNode, " - ", "Sub", ExprPrecedence::kAdditionSubtraction)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(LTNode, " < ", "LT", ExprPrecedence::kRelational)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(LENode, " <= ", "LE", ExprPrecedence::kRelational)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(GTNode, " > ", "GT", ExprPrecedence::kRelational)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(GENode, " >= ", "GE", ExprPrecedence::kRelational)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(EQNode, " == ", "EQ", ExprPrecedence::kEquality)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(NENode, " != ", "NE", ExprPrecedence::kEquality)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(AndNode, " and ", "And", ExprPrecedence::kAnd)
+TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(OrNode, " or ", "Or", ExprPrecedence::kOr)
 
 Doc TVMScriptPrinter::VisitExpr_(const ModNode* op, ExprPrecedence* out_precedence) {
   *out_precedence = ExprPrecedence::kIdentity;
diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
index a7a05c2aa3a7..9f8e639b5330 100644
--- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -18,11 +18,10 @@
 """ Test different strategies for loading data into vtcm before running HVX workloads. """
 
 import numpy as np
-import tvm
 import pytest
-
-from tvm.script import tir as T
+import tvm
 from numpy.random import default_rng
+from tvm.script import tir as T
 
 VRMPY_SIZE_B = 128
 VRMPY_SIZE_INT32 = 32
@@ -126,9 +125,9 @@ def get_single_dma_schedule(size_a, size_w):
     @T.prim_func
     def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        a_buffer = T.match_buffer(a_input, a_shape, dtype="uint8", mem_scope="global")
-        w_buffer = T.match_buffer(b_input, w_shape, dtype="uint8", mem_scope="global")
-        c_buffer = T.match_buffer(c_output, out_shape, dtype="int32", mem_scope="global")
+        a_buffer = T.match_buffer(a_input, a_shape, dtype="uint8", scope="global")
+        w_buffer = T.match_buffer(b_input, w_shape, dtype="uint8", scope="global")
+        c_buffer = T.match_buffer(c_output, out_shape, dtype="int32", scope="global")
         a_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", mem_scope="global.vtcm")
         w_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", mem_scope="global.vtcm")
         c_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", mem_scope="global.vtcm")
@@ -153,7 +152,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None:
                     0,
                     dtype="handle",
                 ),
-                T.cast(a_bytes, dtype="int"),
+                T.Cast("int", a_bytes),
                 dtype="int32",
             )
         )
@@ -178,7 +177,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None:
                     0,
                     dtype="handle",
                 ),
-                T.cast(w_bytes, dtype="int"),
+                T.Cast("int", w_bytes),
                 dtype="int32",
             )
         )
@@ -222,7 +221,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None:
                     0,
                     dtype="handle",
                 ),
-                T.cast(a_bytes, dtype="int"),
+                T.Cast("int", a_bytes),
                 dtype="int32",
             )
         )
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
index fb398f43977a..e6fc0a3c201c 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -18,9 +18,8 @@
 """ Test different strategies for loading data into vtcm before running HVX workloads. """
 
 import numpy as np
-from numpy.random import default_rng
-
 import tvm
+from numpy.random import default_rng
 from tvm.script import tir as T
 
 from .infrastructure import get_hexagon_target
@@ -109,17 +108,17 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
             [T.cast(operations, "int32") * 128],
             dtype="uint8",
             align=128,
-            mem_scope="global.vtcm",
+            scope="global.vtcm",
         )
         b_buffer = T.match_buffer(
             b,
             [T.cast(operations, "int32") * 128],
             dtype="uint8",
             align=128,
-            mem_scope="global.vtcm",
+            scope="global.vtcm",
         )
         c_buffer = T.match_buffer(
-            c, [T.cast(operations, "int32") * 32], dtype="int32", align=128, mem_scope="global.vtcm"
+            c, [T.cast(operations, "int32") * 32], dtype="int32", align=128, scope="global.vtcm"
         )
         for n in T.grid(operations):
             with T.block("c_buffer"):
@@ -149,21 +148,13 @@ def operator(
         a: T.handle, b: T.handle, c: T.handle, a_v: T.handle, b_v: T.handle, c_v: T.handle
     ) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        a_buffer = T.match_buffer(
-            a, [operations, 128], dtype="uint8", align=128, mem_scope="global"
-        )
-        b_buffer = T.match_buffer(
-            b, [operations, 128], dtype="uint8", align=128, mem_scope="global"
-        )
-        c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global")
-        a_global_vtcm = T.match_buffer(
-            a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
-        )
-        b_global_vtcm = T.match_buffer(
-            b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
-        )
+        a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, scope="global")
+        b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, scope="global")
+        c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128, scope="global")
+        a_global_vtcm = T.match_buffer(a_v, [size], dtype="uint8", align=128, scope="global.vtcm")
+        b_global_vtcm = T.match_buffer(b_v, [size], dtype="uint8", align=128, scope="global.vtcm")
         c_global_vtcm = T.match_buffer(
-            c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm"
+            c_v, [out_size], dtype="int32", align=128, scope="global.vtcm"
         )
         for n, i in T.grid(operations, 128):
             with T.block("a_buffer_global.vtcm"):
@@ -212,21 +203,13 @@ def operator(
         c_v: T.handle,
     ) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        a_buffer = T.match_buffer(
-            a, [operations, 128], dtype="uint8", align=128, mem_scope="global"
-        )
-        b_buffer = T.match_buffer(
-            b, [operations, 128], dtype="uint8", align=128, mem_scope="global"
-        )
-        c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global")
-        a_global_vtcm = T.match_buffer(
-            a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
-        )
-        b_global_vtcm = T.match_buffer(
-            b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm"
-        )
+        a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, scope="global")
+        b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, scope="global")
+        c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128, scope="global")
+        a_global_vtcm = T.match_buffer(a_v, [size], dtype="uint8", align=128, scope="global.vtcm")
+        b_global_vtcm = T.match_buffer(b_v, [size], dtype="uint8", align=128, scope="global.vtcm")
         c_global_vtcm = T.match_buffer(
-            c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm"
+            c_v, [out_size], dtype="int32", align=128, scope="global.vtcm"
         )
         T.evaluate(
             T.tvm_call_packed(
diff --git a/tests/python/unittest/test_aot_legalize_packed_call.py b/tests/python/unittest/test_aot_legalize_packed_call.py
index 9c597a55e5cc..cd0114d46428 100644
--- a/tests/python/unittest/test_aot_legalize_packed_call.py
+++ b/tests/python/unittest/test_aot_legalize_packed_call.py
@@ -15,11 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-function-docstring,missing-module-docstring
+import pytest
 import tvm
-from tvm.script import tir as T
-from tvm import tir
 import tvm.testing
-import pytest
+from tvm import tir
+from tvm.script import tir as T
 
 
 @tvm.script.ir_module
@@ -85,7 +85,7 @@ def tir_packed_call() -> None:
                     T.tvm_stack_make_shape(1, dtype="handle"),
                     T.reinterpret(T.uint64(0), dtype="handle"),
                     T.uint32(1),
-                    T.cast(0, dtype="float32"),
+                    T.Cast("float32", 0),
                     0,
                     dtype="handle",
                 ),
@@ -94,7 +94,7 @@ def tir_packed_call() -> None:
                     T.tvm_stack_make_shape(1, dtype="handle"),
                     T.reinterpret(T.uint64(0), dtype="handle"),
                     T.uint32(1),
-                    T.cast(0, dtype="float32"),
+                    T.Cast("float32", 0),
                     0,
                     dtype="handle",
                 ),
@@ -103,7 +103,7 @@ def tir_packed_call() -> None:
                     T.tvm_stack_make_shape(1, dtype="handle"),
                     T.reinterpret(T.uint64(0), dtype="handle"),
                     T.uint32(1),
-                    T.cast(0, dtype="float32"),
+                    T.Cast("float32", 0),
                     0,
                     dtype="handle",
                 ),
diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py
index 324d8a9ec4f8..0a518c840d11 100644
--- a/tests/python/unittest/test_meta_schedule_space_cuda.py
+++ b/tests/python/unittest/test_meta_schedule_space_cuda.py
@@ -856,7 +856,7 @@ def nrm_1(A: T.Buffer[(1, 256, 256), "float32"], D: T.Buffer[1, "float32"]) -> N
                 for i0_1 in T.thread_binding(128, thread="threadIdx.x"):
                     with T.block("D"):
                         b = T.axis.spatial(1, i0_1)
-                        T.where(0 * 128 + i0_1 < 1)
+                        T.where(T.Mul(0, 128) + i0_1 < 1)
                         T.reads(C_shared[b])
                         T.writes(D[b])
                         D[b] = T.sqrt(C_shared[b], dtype="float32")
diff --git a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
index 2a4cabc541c6..c70525b05712 100644
--- a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
+++ b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
@@ -14,16 +14,16 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import pytest
 import sys
-import numpy as np
 
+import numpy as np
+import pytest
 import tvm
 import tvm.testing
 import tvm.tir.tensor_intrin.cuda
-from tvm import tir, te, TVMError
-from tvm.script import tir as T
+from tvm import TVMError, te, tir
 from tvm.meta_schedule.testing import te_workload
+from tvm.script import tir as T
 from tvm.testing.tir import mma_schedule
 from tvm.tir.tensor_intrin.cuda import (
     LDMATRIX_16x16_A_DYN_INTRIN,
@@ -1116,7 +1116,7 @@ def test_simple_compute_async():
     mod = tvm.tir.transform.InjectSoftwarePipeline()(sch.mod)
 
     @T.prim_func
-    def ref(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]) -> None:
+    def ref(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]):
         for tx in T.thread_binding(16, thread="threadIdx.x"):
             with T.block():
                 T.reads(A[tx, 0:16])
@@ -1127,7 +1127,7 @@ def ref(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]) -> N
                     T.writes(B[0, tx, 0])
                     with T.attr(0, "async_commit_queue_scope", 0):
                         with T.attr(0, "async_scope", 1):
-                            B[0 % 2, tx, 0] = A[tx, 0] * T.float32(2)
+                            B[T.FloorMod(0, 2), tx, 0] = A[tx, 0] * T.float32(2)
                 with T.block():
                     T.reads(A[tx, 1:16], B[0:2, tx, 0])
                     T.writes(B[0:2, tx, 0], C[tx, 0:15])
@@ -1147,11 +1147,11 @@ def ref(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]) -> N
                                 with T.attr(0, "async_wait_inflight_count", 1):
                                     C[tx, i - 1 + 1] = B[(i - 1 + 1) % 2, tx, 0] + T.float32(1)
                 with T.block():
-                    T.reads(B[15 % 2, tx, 0])
+                    T.reads(B[T.FloorMod(15, 2), tx, 0])
                     T.writes(C[tx, 15])
                     with T.attr(0, "async_wait_queue_scope", 0):
                         with T.attr(0, "async_wait_inflight_count", 0):
-                            C[tx, 15] = B[15 % 2, tx, 0] + T.float32(1)
+                            C[tx, 15] = B[T.FloorMod(15, 2), tx, 0] + T.float32(1)
 
     tvm.ir.assert_structural_equal(mod["main"], ref, True)
 
diff --git a/tests/python/unittest/test_tir_transform_inject_virtual_thread.py b/tests/python/unittest/test_tir_transform_inject_virtual_thread.py
index 548f3bc8d1d2..b4ea4e712d19 100644
--- a/tests/python/unittest/test_tir_transform_inject_virtual_thread.py
+++ b/tests/python/unittest/test_tir_transform_inject_virtual_thread.py
@@ -16,7 +16,6 @@
 # under the License.
 import tvm
 from tvm import te
-
 from tvm.script import tir as T
 
 vthread_name = tvm.testing.parameter("vthread", "cthread")
@@ -155,10 +154,10 @@ def expected_func():
         B = T.buffer_decl([16], "int32", data=B_data, scope="shared")
         # The indices for B should each be a single Ramp node, and
         # should not be the sum of a Ramp and Broadcast node.
-        B[0 * 4 : 0 * 4 + 4] = T.broadcast(0, 4)
-        B[1 * 4 : 1 * 4 + 4] = T.broadcast(1, 4)
-        B[2 * 4 : 2 * 4 + 4] = T.broadcast(2, 4)
-        B[3 * 4 : 3 * 4 + 4] = T.broadcast(3, 4)
+        B[T.Mul(0, 4) : T.Mul(0, 4) + 4] = T.broadcast(0, 4)
+        B[T.Mul(1, 4) : T.Mul(1, 4) + 4] = T.broadcast(1, 4)
+        B[T.Mul(2, 4) : T.Mul(2, 4) + 4] = T.broadcast(2, 4)
+        B[T.Mul(3, 4) : T.Mul(3, 4) + 4] = T.broadcast(3, 4)
 
     before_mod = tvm.IRModule.from_expr(before_func)
     after_mod = tvm.tir.transform.InjectVirtualThread()(before_mod)
@@ -182,10 +181,10 @@ def before_func():
     def expected_func():
         B_data = T.allocate([4], "int32x4", "shared")
         B = T.buffer_decl([4], "int32x4", data=B_data, scope="shared")
-        B[0 * 4 / 4] = T.broadcast(0, 4)
-        B[1 * 4 / 4] = T.broadcast(1, 4)
-        B[2 * 4 / 4] = T.broadcast(2, 4)
-        B[3 * 4 / 4] = T.broadcast(3, 4)
+        B[T.Mul(0, 4) / 4] = T.broadcast(0, 4)
+        B[T.Mul(1, 4) / 4] = T.broadcast(1, 4)
+        B[T.Mul(2, 4) / 4] = T.broadcast(2, 4)
+        B[T.Mul(3, 4) / 4] = T.broadcast(3, 4)
 
     before_mod = tvm.IRModule.from_expr(before_func)
     intermediate_mod = tvm.tir.transform.InjectVirtualThread()(before_mod)
diff --git a/tests/python/unittest/test_tir_transform_thread_sync.py b/tests/python/unittest/test_tir_transform_thread_sync.py
index 18607ca1a005..c80cd55ea27e 100644
--- a/tests/python/unittest/test_tir_transform_thread_sync.py
+++ b/tests/python/unittest/test_tir_transform_thread_sync.py
@@ -102,9 +102,9 @@ def func(p0: T.Buffer[2, "float32"], p1: T.Buffer[2, "float32"]) -> None:
         threadIdx_x = T.env_thread("threadIdx.x")
         blockIdx_x = T.env_thread("blockIdx.x")
         T.preflattened_buffer(p0, [1, 2, 1, 1], dtype="float32", data=p0.data)
-        T.launch_thread(blockIdx_x, 8)
         result_local = T.alloc_buffer([1], dtype="float32", scope="local")
         temp_shared = T.alloc_buffer([1], dtype="float32", scope="shared")
+        T.launch_thread(blockIdx_x, 8)
         T.launch_thread(threadIdx_x, 4)
         result_local[0] = T.float32(0)
         if threadIdx_x < 1:
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index dd6706762dc3..f22e61e1838d 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -90,9 +90,9 @@ class Module:
         def mmult(A: T.handle, B: T.handle, C: T.handle) -> None:
             # function attr dict
             T.func_attr({"global_symbol": "mmult", "tir.noalias": True})
-            A_1 = T.match_buffer(A, [1024 * 1024], elem_offset=0, align=64, offset_factor=1)
+            A_1 = T.match_buffer(A, [16384], elem_offset=0, align=64, offset_factor=1)
             B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=64, offset_factor=1)
-            C_1 = T.match_buffer(C, [1024 * 1024], elem_offset=0, align=64, offset_factor=1)
+            C_1 = T.match_buffer(C, [16384], elem_offset=0, align=64, offset_factor=1)
             # body
             packedB_data = T.allocate([32768], "float32", "global")
             packedB = T.buffer_decl(
@@ -3008,7 +3008,7 @@ def comm_reducer_single_reduce_group():
     def comm_reducer_single_reduce_group(a: T.handle, b: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         threadIdx_x = T.env_thread("threadIdx.x")
-        A = T.match_buffer(a, [128 * 128], dtype="float32")
+        A = T.match_buffer(a, [16384], dtype="float32")
         for i in T.serial(0, 128):
             T.launch_thread(threadIdx_x, 128)
             reduce_temp0_data = T.allocate([1], "float32", "local")
@@ -3024,7 +3024,7 @@ def comm_reducer_multiple_reduce_groups():
     def comm_reducer_multiple_reduce_groups(a: T.handle, b: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         threadIdx_x = T.env_thread("threadIdx.x")
-        A = T.match_buffer(a, [128 * 128], dtype="float32")
+        A = T.match_buffer(a, [16384], dtype="float32")
         for i in T.serial(0, 128):
             T.launch_thread(threadIdx_x, 128)
             reduce_temp0_data = T.allocate([1], "float32", "local")

From 5ffcfd9327e24129b50ebb3c8c0456961e444999 Mon Sep 17 00:00:00 2001
From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com>
Date: Fri, 11 Nov 2022 10:49:24 +0300
Subject: [PATCH 563/704] [FQ2I] Add cast back to input data type after
 AvgPool2d (#13332)

[FQ2I] Add cast back to output data type after AvgPool2d

This commit fixes the following issue:
For the sequence of qnn.dequantize -> avg_pool2d -> conv2d ->
qnn.quantize FQ2I pass inserts qnn.requantize (or cast) to int32
unconditionally before AvgPool2d. As a result fake quantized qnn.conv2d
gets input as int32 dtype, but it is forbidden for qnn.conv2d (supports
only uint8/int8/int16).

This commit adds the following:
Add cast back to output data type after AvgPool2d. This preserve input
dtype == output dtype for this op.
---
 .../transform/fake_quantization_to_integer.py |  7 ++++++-
 .../test_pass_fake_quantization_to_integer.py | 21 +++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py
index 242740399f96..46bdd94ace1a 100644
--- a/python/tvm/relay/transform/fake_quantization_to_integer.py
+++ b/python/tvm/relay/transform/fake_quantization_to_integer.py
@@ -141,6 +141,7 @@ def avgpool2d(expr, type_map):
     arg = expr.args[0]
     t = type_map[arg]
     out_t = type_map[expr]
+    # Cast (or requantize) to int32.
     if not (
         approx_equal(t.scale, out_t.scale)
         and approx_equal(t.zero_point, out_t.zero_point)
@@ -158,7 +159,11 @@ def avgpool2d(expr, type_map):
     else:
         arg = relay.op.cast(arg, "int32")
     out = relay.op.nn.avg_pool2d(arg, **expr.attrs)
-    return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)]
+    if out_t.dtype != "int32":
+        # Cast back to output dtype to preserve input dtype == output dtype for AvgPool2d.
+        out = relay.op.clip(out, a_min=np.iinfo(out_t.dtype).min, a_max=np.iinfo(out_t.dtype).max)
+        out = relay.op.cast(out, out_t.dtype)
+    return [out, TensorAffineType(out_t.scale, out_t.zero_point, out_t.dtype, out_t.axis)]
 
 
 @register_fake_quantization_to_integer("nn.global_avg_pool2d")
diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py
index 46979dfc3cba..569bd9d7d653 100644
--- a/tests/python/relay/test_pass_fake_quantization_to_integer.py
+++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py
@@ -814,6 +814,27 @@ def run_test_case(partial_func):
     run_test_case(lambda x: relay.op.min(x, axis=1))
 
 
+def test_fq_avg_pool_conv2d():
+    dtype = "uint8"
+    shape_x = [1, 4, 24, 24]
+    shape_w = [8, 4, 1, 1]
+    x = relay.var("x", shape=shape_x, dtype=dtype)
+    w = relay.var("w", shape=shape_w, dtype=dtype)
+    zero = relay.const(0)
+    one = relay.const(1.0)
+
+    # Tested expression.
+    op0 = relay.qnn.op.dequantize(x, relay.const(0.64), relay.const(2))
+    op1 = relay.op.nn.avg_pool2d(op0, [3, 3])
+    op2 = relay.qnn.op.dequantize(w, relay.const(0.5), relay.const(10))
+    op3 = relay.op.nn.conv2d(op1, op2, kernel_size=[1, 1])
+    expr = relay.qnn.op.quantize(op3, one, zero, out_dtype="uint8")
+
+    x_np = np.random.randint(0, 255, size=shape_x, dtype=dtype)
+    w_np = np.random.randint(0, 255, size=shape_w, dtype=dtype)
+    compare_fq_to_int(expr, [x_np, w_np])
+
+
 def test_fq_hard_fail():
     @tvm.ir.register_op_attr("nn.conv2d", "FTVMFakeQuantizationToInteger", level=11)
     def conv2d(expr, type_map):  # pylint: disable=unused-variable

From 88979834842115ef9ea8487d9a631dc3275f7a7d Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Fri, 11 Nov 2022 08:40:17 -0800
Subject: [PATCH 564/704] [IRBuilder][Minor] Add intrinsics like `T.int32x4`
 (#13361)

This PR adds all common TIR intrinsics like `T.int32x4`, `T.floatx4`.

Co-authored-by: Yaxing Cai <caiyaxing666@gmail.com>
---
 include/tvm/script/ir_builder/tir/frame.h     |  16 +-
 include/tvm/script/ir_builder/tir/ir.h        |  46 +-
 python/tvm/script/ir_builder/tir/frame.py     |   4 +-
 python/tvm/script/ir_builder/tir/ir.py        | 473 +++++++-----------
 python/tvm/tir/op.py                          |  57 ++-
 src/script/ir_builder/tir/frame.cc            |  14 +-
 src/script/ir_builder/tir/ir.cc               |  56 ++-
 .../unittest/test_tvmscript_error_report.py   |   2 +-
 .../unittest/test_tvmscript_ir_builder_tir.py |  21 +-
 9 files changed, 348 insertions(+), 341 deletions(-)

diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h
index aa2386e7f1e4..b95d575360e6 100644
--- a/include/tvm/script/ir_builder/tir/frame.h
+++ b/include/tvm/script/ir_builder/tir/frame.h
@@ -453,8 +453,8 @@ class AllocateFrameNode : public TIRFrameNode {
   PrimExpr condition;
   /*! \brief Additional annotation hints. */
   Map<String, ObjectRef> annotations;
-  /*! \brief The buffer. */
-  tvm::tir::Buffer buffer;
+  /*! \brief The buffer var. */
+  tvm::tir::Var buffer_var;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     TIRFrameNode::VisitAttrs(v);
@@ -463,7 +463,7 @@ class AllocateFrameNode : public TIRFrameNode {
     v->Visit("storage_scope", &storage_scope);
     v->Visit("condition", &condition);
     v->Visit("annotations", &annotations);
-    v->Visit("buffer", &buffer);
+    v->Visit("buffer_var", &buffer_var);
   }
 
   static constexpr const char* _type_key = "script.ir_builder.tir.AllocateFrame";
@@ -500,8 +500,8 @@ class AllocateConstFrameNode : public TIRFrameNode {
   Array<PrimExpr> extents;
   /*! \brief The data associated with the constant. */
   tvm::runtime::NDArray data;
-  /*! \brief The buffer */
-  tvm::tir::Buffer buffer;
+  /*! \brief The buffer var */
+  tvm::tir::Var buffer_var;
   /*! \brief Additional annotations about the allocation. */
   Map<String, ObjectRef> annotations;
 
@@ -510,7 +510,7 @@ class AllocateConstFrameNode : public TIRFrameNode {
     v->Visit("dtype", &dtype);
     v->Visit("extents", &extents);
     v->Visit("data", &data);
-    v->Visit("buffer", &buffer);
+    v->Visit("buffer_var", &buffer_var);
     v->Visit("annotations", &annotations);
   }
 
@@ -723,11 +723,15 @@ class ElseFrame : public TIRFrame {
 
 class DeclBufferFrameNode : public TIRFrameNode {
  public:
+  /*! \brief The declared buffer. */
   tvm::tir::Buffer buffer;
+  /*! \brief The buffer allocated or not. */
+  bool allocated;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     TIRFrameNode::VisitAttrs(v);
     v->Visit("buffer", &buffer);
+    v->Visit("allocated", &allocated);
   }
 
   static constexpr const char* _type_key = "script.ir_builder.tir.DeclBufferFrame";
diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
index 7460099f9448..d9e1a1b49063 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -339,9 +339,8 @@ AllocateFrame Allocate(Array<PrimExpr> extents, DataType dtype, String storage_s
  * \param annotations Additional annotation hints.
  * \return The created AllocateConstFrame.
  */
-AllocateConstFrame AllocateConst(
-    NDArray data, DataType dtype, Array<PrimExpr> extents,
-    Map<String, ObjectRef> annotations = NullValue<Map<String, ObjectRef>>());
+AllocateConstFrame AllocateConst(NDArray data, DataType dtype, Array<PrimExpr> extents,
+                                 Optional<Map<String, ObjectRef>> annotations = NullOpt);
 
 /*!
  * \brief Create an attribute.
@@ -449,21 +448,32 @@ PrimExpr Ptr(runtime::DataType dtype, String storage_scope = "global");
     return expr.defined() ? tvm::cast(dtype, expr.value()) : tvm::tir::Var("", dtype); \
   }
 
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int8, DataType::Int(8));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int16, DataType::Int(16));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32, DataType::Int(32));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int64, DataType::Int(64));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt8, DataType::UInt(8));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt16, DataType::UInt(16));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt32, DataType::UInt(32));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt64, DataType::UInt(64));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float8, DataType::Float(8));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float16, DataType::Float(16));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float32, DataType::Float(32));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float64, DataType::Float(64));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32x4, DataType::Int(32, 4));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32x8, DataType::Int(32, 8));
-TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32x16, DataType::Int(32, 16));
+#define TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_SIZES(DType, FDType) \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(DType##8, FDType(8));      \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(DType##16, FDType(16));    \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(DType##32, FDType(32));    \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(DType##64, FDType(64));
+
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_SIZES(Float, DataType::Float);
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_SIZES(UInt, DataType::UInt);
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_SIZES(Int, DataType::Int);
+
+#define TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_LANES(FuncName, FDType, Size) \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x4, FDType(Size, 4));     \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x8, FDType(Size, 8));     \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x16, FDType(Size, 16));   \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x32, FDType(Size, 32));   \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x64, FDType(Size, 64));
+
+#define TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(DType, FDType) \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##8, FDType, 8);      \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##16, FDType, 16);    \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##32, FDType, 32);    \
+  TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##64, FDType, 64);
+
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(Float, DataType::Float);
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(UInt, DataType::UInt);
+TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(Int, DataType::Int);
 TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Boolean, DataType::Bool());
 TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Handle, DataType::Handle());
 TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Void, DataType::Void());
diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py
index b9b50dfa9876..a57c878bd929 100644
--- a/python/tvm/script/ir_builder/tir/frame.py
+++ b/python/tvm/script/ir_builder/tir/frame.py
@@ -69,14 +69,14 @@ class RealizeFrame(TIRFrame):
 class AllocateFrame(TIRFrame):
     def __enter__(self) -> Buffer:
         super().__enter__()
-        return self.buffer
+        return self.buffer_var
 
 
 @_register_object("script.ir_builder.tir.AllocateConstFrame")
 class AllocateConstFrame(TIRFrame):
     def __enter__(self) -> Buffer:
         super().__enter__()
-        return self.buffer
+        return self.buffer_var
 
 
 @_register_object("script.ir_builder.tir.AttrFrame")
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index 4ec1511f2907..bd9e4e1db522 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -14,41 +14,75 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=missing-docstring
 """IRBuilder for TIR"""
 
-import inspect
 import functools
+import inspect
 from numbers import Integral
-from typing import Any, Callable, Dict, List, Optional, Union, Tuple
-import numpy as np  # type: ignore
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+# isort: off
+from typing_extensions import Literal
 
+# isort: on
+
+import numpy as np  # type: ignore
 from tvm.ir import Range, Type
 from tvm.runtime import convert, ndarray
+from tvm.target import Target
+
+# pylint: disable=unused-import
 from tvm.target.codegen import llvm_lookup_intrinsic_id
-from tvm.tir import (
-    Buffer,
+from tvm.tir import Buffer, BufferRegion, PrimExpr
+from tvm.tir import op as _tir_op
+from tvm.tir import type_annotation
+
+# import tir.expr for direct ir construction to pass structural_equal comparison
+from tvm.tir.expr import (
+    EQ,
+    GE,
+    GT,
+    LE,
+    LT,
+    NE,
+    Add,
+    And,
+    Broadcast,
     BufferLoad,
-    BufferRegion,
+    Call,
+    CallEffectKind,
     Cast,
     CommReducer,
+    Div,
+    FloatImm,
+    FloorDiv,
+    FloorMod,
     IntImm,
     IterVar,
     Let,
-    PrimExpr,
+    Load,
+    Max,
+    Min,
+    Mod,
+    Mul,
+    Not,
+    Or,
+    ProducerLoad,
+    Ramp,
+    Reduce,
     Select,
     Shuffle,
+    SizeVar,
     StringImm,
-    type_annotation,
+    Sub,
     Var,
 )
-from tvm.tir import Broadcast as broadcast
-from tvm.tir import Ramp as ramp
-from tvm.tir import op as _tir_op
 from tvm.tir.generic import cast
 
 from . import _ffi_api, frame
 
+# pylint: enable=unused-import
+
 
 def buffer_decl(
     shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral],
@@ -56,7 +90,7 @@ def buffer_decl(
     data: Var = None,
     strides: List[PrimExpr] = None,
     elem_offset: PrimExpr = None,
-    scope: str = "",
+    scope: str = "global",
     align: int = 0,
     offset_factor: int = 0,
     buffer_type: str = "",
@@ -187,7 +221,7 @@ def func_ret(ret_type: Type) -> Type:
 
 def match_buffer(
     param: Union[Var, BufferLoad, BufferRegion],
-    shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral],
+    shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral] = None,
     dtype: str = "float32",
     data: Var = None,
     strides: List[PrimExpr] = None,
@@ -256,6 +290,12 @@ def match_buffer(
     res : Buffer
         The matched buffer.
     """
+    if shape is None:
+        if isinstance(param, BufferRegion):
+            dtype = param.buffer.dtype
+            shape = [region.extent for region in param.region]
+        else:
+            raise ValueError("Shape must be specified when binding input param")
     shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape
     if strides is None:
         strides = []
@@ -447,7 +487,7 @@ def alloc_buffer(
     data: Var = None,
     strides: List[PrimExpr] = None,
     elem_offset: PrimExpr = None,
-    scope: str = "",
+    scope: str = "global",
     align: int = -1,
     offset_factor: int = 0,
     buffer_type: str = "default",
@@ -526,10 +566,14 @@ def _as_range(dom: Union[Range, List[PrimExpr]]) -> Range:
         return dom
     if isinstance(dom, (list, tuple)):
         return Range(dom[0], dom[1])
+    if hasattr(dom, "dtype"):
+        return Range(IntImm(dom.dtype, 0), dom)
     return Range(0, dom)
 
 
 class axis:  # pylint: disable=invalid-name
+    """The axis class"""
+
     @staticmethod
     def spatial(
         dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32"
@@ -686,7 +730,10 @@ def serial(
     """
     if stop is None:
         stop = start
-        start = 0
+        if hasattr(start, "dtype"):
+            start = IntImm(start.dtype, 0)
+        else:
+            start = 0
     return _ffi_api.Serial(start, stop, annotations)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
@@ -713,7 +760,10 @@ def parallel(
     """
     if stop is None:
         stop = start
-        start = 0
+        if hasattr(start, "dtype"):
+            start = IntImm(start.dtype, 0)
+        else:
+            start = 0
     return _ffi_api.Parallel(start, stop, annotations)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
@@ -740,7 +790,10 @@ def vectorized(
     """
     if stop is None:
         stop = start
-        start = 0
+        if hasattr(start, "dtype"):
+            start = IntImm(start.dtype, 0)
+        else:
+            start = 0
     return _ffi_api.Vectorized(start, stop, annotations)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
@@ -767,7 +820,10 @@ def unroll(
     """
     if stop is None:
         stop = start
-        start = 0
+        if hasattr(start, "dtype"):
+            start = IntImm(start.dtype, 0)
+        else:
+            start = 0
     return _ffi_api.Unroll(start, stop, annotations)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
@@ -804,10 +860,16 @@ def thread_binding(
             raise ValueError("Thread cannot be None for thread_binding")
         thread = stop
         stop = start
-        start = 0
+        if hasattr(start, "dtype"):
+            start = IntImm(start.dtype, 0)
+        else:
+            start = 0
     elif stop is None:
         stop = start
-        start = 0
+        if hasattr(start, "dtype"):
+            start = IntImm(start.dtype, 0)
+        else:
+            start = 0
     return _ffi_api.ThreadBinding(  # type: ignore[attr-defined] # pylint: disable=no-member
         start, stop, thread, annotations
     )
@@ -907,7 +969,7 @@ def realize(
 def allocate(
     extents: List[PrimExpr],
     dtype: str,
-    scope: str = "",
+    scope: str = "global",
     condition: PrimExpr = None,
     annotations=None,
 ) -> frame.AllocateFrame:
@@ -959,9 +1021,18 @@ def allocate_const(
     annotations : Optional[Map]
         Additional annotations about the allocation.
     """
+    np_data = np.asarray(data, dtype=dtype)
+    prod_extent = 1
+    for extent in extents:
+        prod_extent *= extent
+    prod_shape = 1
+    for shape in np_data.shape:
+        prod_shape *= shape
+    if prod_extent == prod_shape:
+        np_data = np_data.reshape(extents)
 
     return _ffi_api.AllocateConst(  # type: ignore[attr-defined] # pylint: disable=no-member
-        ndarray.array(np.asarray(data, dtype)), dtype, extents, annotations
+        ndarray.array(np_data), dtype, extents, annotations
     )
 
 
@@ -1054,7 +1125,7 @@ def decl_buffer(
     data=None,
     strides=None,
     elem_offset=None,
-    scope="",
+    scope="global",
     align=0,
     offset_factor=0,
     buffer_type="",
@@ -1221,247 +1292,41 @@ def evaluate(value: PrimExpr) -> None:
     """
     if isinstance(value, str):
         value = StringImm(value)
+    if isinstance(value, bool):
+        value = cast(value, "bool")
     return _ffi_api.Evaluate(value)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
-def int8(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type int8 or cast expression to type int8.
+__all__ = []
+for _dtype in ["Float", "UInt", "Int"]:
+    for _size in ["8", "16", "32", "64"]:
+        for _lanes in ["", "x4", "x8", "x16", "x32", "x64"]:
+            _name = _dtype + _size + _lanes  # pylint: disable=invalid-name
 
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
+            def func_gen(name: str):
+                """Generate a function for each PrimExpr dtype.
 
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type int8 or casted expression with type int8.
-    """
-    return _ffi_api.Int8(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def int16(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type int16 or cast expression to type int16.
+                Parameters
+                ----------
+                name: str
+                    The ffi function name to call.
+                """
 
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
+                def func(
+                    expr: Union[
+                        None,
+                        PrimExpr,
+                        Literal["inf", "-inf", "nan"],
+                    ] = None
+                ) -> PrimExpr:
+                    if isinstance(expr, str):
+                        expr = float(expr)
+                    return getattr(_ffi_api, name)(expr)
 
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type int16 or casted expression with type int16.
-    """
-    return _ffi_api.Int16(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
+                return func
 
-
-def int32(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type int32 or cast expression to type int32.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type int32 or casted expression with type int32.
-    """
-    return _ffi_api.Int32(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def int64(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type int64 or cast expression to type int64.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type int64 or casted expression with type int64.
-    """
-    return _ffi_api.Int64(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def uint8(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type uint8 or cast expression to type uint8.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type uint8 or casted expression with type uint8.
-    """
-    return _ffi_api.UInt8(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def uint16(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type uint16 or cast expression to type uint16.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type uint16 or casted expression with type uint16.
-    """
-    return _ffi_api.UInt16(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def uint32(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type uint32 or cast expression to type uint32.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type uint32 or casted expression with type uint32.
-    """
-    return _ffi_api.UInt32(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def uint64(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type uint64 or cast expression to type uint64.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type uint64 or casted expression with type uint64.
-    """
-    return _ffi_api.UInt64(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def float8(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type float8 or cast expression to type float8.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type float8 or casted expression with type float8.
-    """
-    return _ffi_api.Float8(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def float16(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type float16 or cast expression to type float16.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type float16 or casted expression with type float16.
-    """
-    return _ffi_api.Float16(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def float32(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type float32 or cast expression to type float32.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type float32 or casted expression with type float32.
-    """
-    return _ffi_api.Float32(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def float64(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type float64 or cast expression to type float64.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type float64 or casted expression with type float64.
-    """
-    return _ffi_api.Float64(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def int32x4(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type int32x4 or cast expression to type int32x4.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type int32x4 or casted expression with type int32x4.
-    """
-    return _ffi_api.Int32x4(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def int32x8(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type int32x8 or cast expression to type int32x8.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type int32x8 or casted expression with type int32x8.
-    """
-    return _ffi_api.Int32x8(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
-
-
-def int32x16(expr: Optional[PrimExpr] = None) -> PrimExpr:
-    """Construct a new tir.Var with type int32x16 or cast expression to type int32x16.
-
-    Parameters
-    ----------
-    expr: PrimExpr
-        The expression to be cast.
-
-    Returns
-    -------
-    res : PrimExpr
-        The new tir.Var with type int32x16 or casted expression with type int32x16.
-    """
-    return _ffi_api.Int32x16(expr)  # type: ignore[attr-defined] # pylint: disable=no-member
+            globals()[_name.lower()] = func_gen(_name)
+            __all__.append(_name.lower())
 
 
 def boolean(expr: Optional[PrimExpr] = None) -> PrimExpr:
@@ -1645,6 +1510,27 @@ def comm_reducer(combiner: Callable, identity: List[PrimExpr]) -> CommReducer:
     return CommReducer(args[: num_args // 2], args[num_args // 2 :], res, identity)
 
 
+def target(target_config: Union[Dict, str]) -> Target:
+    """
+    Create a target
+
+    Parameters
+    ----------
+    target_config : Union[Dict, str]
+        The target configuration.
+
+    Returns
+    -------
+    res : Target
+        The target.
+    """
+    if not isinstance(target_config, (str, dict)):
+        raise ValueError(
+            f"T.target expected a config dict or string, but got {type(target_config)}"
+        )
+    return Target(target_config)
+
+
 def _op_wrapper(func):
     @functools.wraps(func)
     def wrapped(*args, **kwargs):
@@ -1667,6 +1553,9 @@ def wrapped(*args, **kwargs):
 
 # pylint: disable=invalid-name
 
+broadcast = Broadcast
+ramp = Ramp
+
 buffer_var = ptr
 abs = _op_wrapper(_tir_op.abs)  # pylint: disable=redefined-builtin
 fabs = abs
@@ -1713,6 +1602,7 @@ def wrapped(*args, **kwargs):
 popcount = _op_wrapper(_tir_op.popcount)
 power = _op_wrapper(_tir_op.power)
 q_multiply_shift = _op_wrapper(_tir_op.q_multiply_shift)
+q_multiply_shift_per_axis = _op_wrapper(_tir_op.q_multiply_shift_per_axis)
 ret = _op_wrapper(_tir_op.ret)
 reinterpret = _dtype_forward(_tir_op.reinterpret)
 round = _op_wrapper(_tir_op.round)  # pylint: disable=redefined-builtin
@@ -1733,6 +1623,7 @@ def wrapped(*args, **kwargs):
 tvm_stack_alloca = _op_wrapper(_tir_op.tvm_stack_alloca)
 tvm_stack_make_shape = _op_wrapper(_tir_op.tvm_stack_make_shape)
 tvm_stack_make_array = _op_wrapper(_tir_op.tvm_stack_make_array)
+tvm_check_return = _op_wrapper(_tir_op.tvm_check_return)
 call_packed = _op_wrapper(_tir_op.call_packed)
 call_cpacked = _op_wrapper(_tir_op.call_cpacked)
 call_packed_lowered = _op_wrapper(_tir_op.call_packed_lowered)
@@ -1742,7 +1633,6 @@ def wrapped(*args, **kwargs):
 call_llvm_intrin = _dtype_forward(_tir_op.call_llvm_intrin)
 call_llvm_pure_intrin = _dtype_forward(_tir_op.call_llvm_pure_intrin)
 call_pure_extern = _dtype_forward(_tir_op.call_pure_extern)
-tvm_access_ptr = _op_wrapper(_tir_op.tvm_access_ptr)
 tvm_tuple = _op_wrapper(_tir_op.tvm_tuple)
 tvm_struct_set = _op_wrapper(_tir_op.tvm_struct_set)
 tvm_struct_get = _tir_op.tvm_struct_get
@@ -1771,6 +1661,8 @@ def wrapped(*args, **kwargs):
 tvm_call_cpacked_lowered = call_cpacked_lowered
 TVMBackendAllocWorkspace = _op_wrapper(_tir_op.TVMBackendAllocWorkspace)
 TVMBackendFreeWorkspace = _op_wrapper(_tir_op.TVMBackendFreeWorkspace)
+start_profile_intrinsic = _op_wrapper(_tir_op.start_profile_intrinsic)
+end_profile_intrinsic = _op_wrapper(_tir_op.end_profile_intrinsic)
 
 
 class inline:
@@ -1796,7 +1688,7 @@ def f():
 # pylint: enable=invalid-name
 
 
-__all__ = [
+__all__ += [
     "buffer_decl",
     "prim_func",
     "arg",
@@ -1835,21 +1727,6 @@ def f():
     "buffer_store",
     "prefetch",
     "evaluate",
-    "int8",
-    "int16",
-    "int32",
-    "int64",
-    "uint8",
-    "uint16",
-    "uint32",
-    "uint64",
-    "float8",
-    "float16",
-    "float32",
-    "float64",
-    "int32x4",
-    "int32x8",
-    "int32x16",
     "boolean",
     "handle",
     "void",
@@ -1859,6 +1736,7 @@ def f():
     "max",
     "iter_var",
     "comm_reducer",
+    "target",
     "buffer_var",
     "abs",
     "fabs",
@@ -1905,6 +1783,7 @@ def f():
     "popcount",
     "power",
     "q_multiply_shift",
+    "q_multiply_shift_per_axis",
     "ret",
     "reinterpret",
     "round",
@@ -1925,6 +1804,7 @@ def f():
     "tvm_stack_alloca",
     "tvm_stack_make_shape",
     "tvm_stack_make_array",
+    "tvm_check_return",
     "call_packed",
     "call_cpacked",
     "call_packed_lowered",
@@ -1934,7 +1814,6 @@ def f():
     "call_llvm_intrin",
     "call_llvm_pure_intrin",
     "call_pure_extern",
-    "tvm_access_ptr",
     "tvm_tuple",
     "tvm_struct_set",
     "tvm_struct_get",
@@ -1963,14 +1842,50 @@ def f():
     "tvm_call_cpacked_lowered",
     "TVMBackendAllocWorkspace",
     "TVMBackendFreeWorkspace",
+    "start_profile_intrinsic",
+    "end_profile_intrinsic",
     "inline",
     "llvm_lookup_intrinsic_id",
-    "Cast",
-    "Let",
-    "Select",
-    "Shuffle",
     "type_annotation",
     "broadcast",
     "ramp",
     "cast",
+    # tvm.tir.expr
+    "Var",
+    "SizeVar",
+    "Reduce",
+    "FloatImm",
+    "IntImm",
+    "StringImm",
+    "Cast",
+    "Add",
+    "Sub",
+    "Mul",
+    "Div",
+    "Mod",
+    "FloorDiv",
+    "FloorMod",
+    "Min",
+    "Max",
+    "EQ",
+    "NE",
+    "LT",
+    "LE",
+    "GT",
+    "GE",
+    "And",
+    "Or",
+    "Not",
+    "Select",
+    "BufferLoad",
+    "ProducerLoad",
+    "Load",
+    "Ramp",
+    "Broadcast",
+    "Shuffle",
+    "Call",
+    "CallEffectKind",
+    "Let",
+    "IterVar",
+    "CommReducer",
 ]
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index 588b40ae4033..e1adc0a6bbd7 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -18,14 +18,15 @@
 """Operators used in TIR expression."""
 import warnings
 from typing import Any, Optional
+
 import tvm._ffi
-from tvm.ir.base import Span
-from tvm.runtime import convert, const
 from tvm.ir import Array, Op, PrimExpr
+from tvm.ir.base import Span
+from tvm.runtime import const, convert
 
-from .buffer import Buffer
-from .expr import Call, PrimExprWithOp, StringImm, Var, CommReducer, IntImm
 from . import _ffi_api
+from .buffer import Buffer
+from .expr import Call, CommReducer, IntImm, PrimExprWithOp, StringImm, Var
 
 
 def _pack_buffer(buf, span=None):
@@ -322,6 +323,24 @@ def call_llvm_pure_intrin(dtype, name, *args, span=None):
     )
 
 
+def tvm_check_return(expected, return_unexpected, nested_call):
+    """Return new on stack dtype[num]
+    Parameters
+    ----------
+    expected : int
+        The expected return code.
+    return_unexpected : int
+        The unexpected return code.
+    nested_call : PrimExpr
+        The call expression to check return.
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin("int32", "tir.tvm_check_return", expected, return_unexpected, nested_call)
+
+
 def tvm_stack_alloca(dtype_str, num):
     """Return new on stack dtype[num]
 
@@ -403,7 +422,7 @@ def assume(cond=None):
     call : PrimExpr
         The call expression.
     """
-    return call_intrin("int32", "tir.assume", cond)
+    return call_intrin("bool", "tir.assume", cond)
 
 
 def undef():
@@ -417,6 +436,34 @@ def undef():
     return call_intrin("int32", "tir.undef")
 
 
+def start_profile_intrinsic(id):
+    """Start profile intrinsic.
+    Parameters
+    ----------
+    id : int
+        The intrinsic id.
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin("handle", "tir.start_profile_intrinsic", id)
+
+
+def end_profile_intrinsic(id):
+    """End profile intrinsic.
+    Parameters
+    ----------
+    id : int
+        The intrinsic id.
+    Returns
+    -------
+    call : PrimExpr
+        The call expression.
+    """
+    return call_intrin("handle", "tir.end_profile_intrinsic", id)
+
+
 def tvm_tuple(*value):
     """Create a tuple structure in value field of AttrStmt
 
diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc
index aa9efa653f71..f48ee52506b4 100644
--- a/src/script/ir_builder/tir/frame.cc
+++ b/src/script/ir_builder/tir/frame.cc
@@ -117,14 +117,14 @@ void LaunchThreadFrameNode::ExitWithScope() {
 
 void AllocateFrameNode::ExitWithScope() {
   TIRFrameNode::ExitWithScope();
-  AddToParent(tvm::tir::Allocate(buffer->data, buffer->dtype, buffer->shape, condition,
-                                 AsStmt(stmts), annotations));
+  AddToParent(
+      tvm::tir::Allocate(buffer_var, dtype, extents, condition, AsStmt(stmts), annotations));
 }
 
 void AllocateConstFrameNode::ExitWithScope() {
   TIRFrameNode::ExitWithScope();
   AddToParent(
-      tvm::tir::AllocateConst(buffer->data, dtype, extents, data, AsStmt(stmts), annotations));
+      tvm::tir::AllocateConst(buffer_var, dtype, extents, data, AsStmt(stmts), annotations));
 }
 void AttrFrameNode::ExitWithScope() {
   TIRFrameNode::ExitWithScope();
@@ -182,7 +182,13 @@ void ElseFrameNode::ExitWithScope() {
 
 void DeclBufferFrameNode::ExitWithScope() {
   TIRFrameNode::ExitWithScope();
-  AddToParent(tvm::tir::DeclBuffer(buffer, AsStmt(stmts)));
+  if (allocated) {
+    AddToParent(tvm::tir::DeclBuffer(buffer, AsStmt(stmts)));
+  } else {
+    AddToParent(tvm::tir::Allocate(buffer->data, buffer->dtype, buffer->shape,
+                                   tvm::IntImm(DataType::Bool(), 1),
+                                   tvm::tir::DeclBuffer(buffer, AsStmt(stmts))));
+  }
 }
 
 TVM_REGISTER_NODE_TYPE(TIRFrameNode);
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index 6be6e2619fea..78107136d492 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -452,20 +452,19 @@ AllocateFrame Allocate(Array<PrimExpr> extents, DataType dtype, String storage_s
   n->storage_scope = storage_scope;
   n->condition = condition.value_or(tvm::Bool(true));
   n->annotations = annotations.value_or(Map<String, ObjectRef>());
-  n->buffer = BufferDecl(extents, dtype, "", NullOpt, NullOpt, NullOpt, storage_scope, 0, 0,
-                         "default", NullOpt);
+  n->buffer_var = Var("", tvm::PointerType(tvm::PrimType(dtype), storage_scope));
   return AllocateFrame(n);
 }
 
 AllocateConstFrame AllocateConst(tvm::runtime::NDArray data, DataType dtype,
-                                 Array<PrimExpr> extents, Map<String, ObjectRef> annotations) {
+                                 Array<PrimExpr> extents,
+                                 Optional<Map<String, ObjectRef>> annotations) {
   ObjectPtr<AllocateConstFrameNode> n = make_object<AllocateConstFrameNode>();
   n->dtype = dtype;
   n->extents = extents;
   n->data = data;
-  n->annotations = annotations;
-  n->buffer =
-      BufferDecl(extents, dtype, "", NullOpt, NullOpt, NullOpt, "", 0, 0, "default", NullOpt);
+  n->annotations = annotations.value_or(Map<String, ObjectRef>());
+  n->buffer_var = Var("", tvm::PointerType(tvm::PrimType(dtype)));
   return AllocateConstFrame(n);
 }
 
@@ -529,6 +528,7 @@ DeclBufferFrame DeclBuffer(Array<PrimExpr> shape, DataType dtype, String buffer_
   ObjectPtr<DeclBufferFrameNode> n = make_object<DeclBufferFrameNode>();
   n->buffer = BufferDecl(shape, dtype, buffer_name, data, strides, elem_offset, storage_scope,
                          align, offset_factor, buffer_type, axis_separators);
+  n->allocated = data.defined();
   return DeclBufferFrame(n);
 }
 
@@ -638,21 +638,35 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate);
 
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Ptr").set_body_typed(Ptr);
 
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int16").set_body_typed(Int16);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32").set_body_typed(Int32);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int64").set_body_typed(Int64);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt8").set_body_typed(UInt8);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt16").set_body_typed(UInt16);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt32").set_body_typed(UInt32);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt64").set_body_typed(UInt64);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float8").set_body_typed(Float8);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float16").set_body_typed(Float16);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float32").set_body_typed(Float32);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float64").set_body_typed(Float64);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x4").set_body_typed(Int32x4);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x8").set_body_typed(Int32x8);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x16").set_body_typed(Int32x16);
+#define TVM_TMP_STR(x) #x
+
+#define TVM_REGISTER_GLOBAL_SIZE(Prefix, DType)                          \
+  TVM_REGISTER_GLOBAL(Prefix TVM_TMP_STR(8)).set_body_typed(DType##8);   \
+  TVM_REGISTER_GLOBAL(Prefix TVM_TMP_STR(16)).set_body_typed(DType##16); \
+  TVM_REGISTER_GLOBAL(Prefix TVM_TMP_STR(32)).set_body_typed(DType##32); \
+  TVM_REGISTER_GLOBAL(Prefix TVM_TMP_STR(64)).set_body_typed(DType##64);
+
+TVM_REGISTER_GLOBAL_SIZE("script.ir_builder.tir.Float", Float);
+TVM_REGISTER_GLOBAL_SIZE("script.ir_builder.tir.UInt", UInt);
+TVM_REGISTER_GLOBAL_SIZE("script.ir_builder.tir.Int", Int);
+
+#define TVM_REGISTER_GLOBAL_LANES(Prefix, Func)                           \
+  TVM_REGISTER_GLOBAL(Prefix TVM_TMP_STR(x4)).set_body_typed(Func##x4);   \
+  TVM_REGISTER_GLOBAL(Prefix TVM_TMP_STR(x8)).set_body_typed(Func##x8);   \
+  TVM_REGISTER_GLOBAL(Prefix TVM_TMP_STR(x16)).set_body_typed(Func##x16); \
+  TVM_REGISTER_GLOBAL(Prefix TVM_TMP_STR(x32)).set_body_typed(Func##x32); \
+  TVM_REGISTER_GLOBAL(Prefix TVM_TMP_STR(x64)).set_body_typed(Func##x64);
+
+#define TVM_REGISTER_GLOBAL_SIZES_LANES(Prefix, DType)          \
+  TVM_REGISTER_GLOBAL_LANES(Prefix TVM_TMP_STR(8), DType##8);   \
+  TVM_REGISTER_GLOBAL_LANES(Prefix TVM_TMP_STR(16), DType##16); \
+  TVM_REGISTER_GLOBAL_LANES(Prefix TVM_TMP_STR(32), DType##32); \
+  TVM_REGISTER_GLOBAL_LANES(Prefix TVM_TMP_STR(64), DType##64);
+
+TVM_REGISTER_GLOBAL_SIZES_LANES("script.ir_builder.tir.Float", Float);
+TVM_REGISTER_GLOBAL_SIZES_LANES("script.ir_builder.tir.UInt", UInt);
+TVM_REGISTER_GLOBAL_SIZES_LANES("script.ir_builder.tir.Int", Int);
+
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Boolean").set_body_typed(Boolean);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Handle").set_body_typed(Handle);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Void").set_body_typed(Void);
diff --git a/tests/python/unittest/test_tvmscript_error_report.py b/tests/python/unittest/test_tvmscript_error_report.py
index 36de35fa928b..2ec52bfbfe41 100644
--- a/tests/python/unittest/test_tvmscript_error_report.py
+++ b/tests/python/unittest/test_tvmscript_error_report.py
@@ -52,7 +52,7 @@ def render(e):
         return
     error = errors[0]
     assert (
-        error.span.line - 1 == rel_lineno
+        error.span.line - 1 == rel_lineno or error.span.line == rel_lineno
     ), f"Expected error to be on line {rel_lineno}, but it was on {error.span.line - 1}"
 
     error_line = source_code.split("\n")[rel_lineno]
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index dbc9b594fb87..a3df5a183bab 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -16,15 +16,15 @@
 # under the License.
 # pylint: disable=invalid-name, missing-docstring
 """Unittests for tvm.script.ir_builder.tir"""
-import pytest
 import numpy as np
+import pytest
 import tvm
 import tvm.testing
 from tvm import tir
+from tvm.ir.base import assert_structural_equal
 from tvm.runtime import ndarray
-from tvm.script.ir_builder import tir as T
 from tvm.script.ir_builder import IRBuilder
-from tvm.ir.base import assert_structural_equal
+from tvm.script.ir_builder import tir as T
 
 
 def test_ir_builder_tir_primfunc_base():
@@ -372,7 +372,12 @@ def test_ir_builder_tir_allocate_const():
     # the expected allocate const
     buffer_var = tir.Var("v", tvm.ir.PointerType(tvm.ir.PrimType("int32")))
     ir_expected = tir.AllocateConst(
-        buffer_var, "int32", [10], ndarray.array(np.asarray(data, "int32")), tir.Evaluate(1)
+        buffer_var,
+        "int32",
+        [10],
+        ndarray.array(np.asarray(data, "int32")),
+        tir.Evaluate(1),
+        annotations={},
     )
 
     # Check if the generated ir is expected
@@ -470,7 +475,13 @@ def test_ir_builder_tir_decl_buffer():
 
     # the expected decl_buffer
     buffer = T.buffer_decl((128, 128), "float32")
-    ir_expected = tir.DeclBuffer(buffer, tir.Evaluate(0))
+    ir_expected = tir.Allocate(
+        buffer.data,
+        "float32",
+        (128, 128),
+        tir.IntImm("bool", True),
+        tir.DeclBuffer(buffer, tir.Evaluate(0)),
+    )
 
     # Check if the generated ir is expected
     assert_structural_equal(ir_actual, ir_expected, map_free_vars=True)

From 38771170ccf199ab37e6a2b9c89362f04544d43b Mon Sep 17 00:00:00 2001
From: multiverstack <39256082+multiverstack-intellif@users.noreply.github.com>
Date: Sat, 12 Nov 2022 11:58:02 +0800
Subject: [PATCH 565/704] [TIR][Schedule] Fix cache_read loc detecting and
 region_cover checking (#13345)

Fix 2 issues of cache related primitives:
*  Fix region_cover checking for cache related primitives
*  Fix CacheLocDetector for nested SeqStmt

Co-authored-by: Min Chen <chen.min@intellif.com>
---
 .../schedule/primitive/cache_read_write.cc    | 30 +++++---
 .../test_tir_schedule_cache_read_write.py     | 76 +++++++++++++++++++
 2 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index 2c86c2df2d25..b3e0e8f1274e 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -437,13 +437,13 @@ class CacheLocDetector : public StmtVisitor {
       if (visited_block_ && visited_related_ && loc_pos_ == -1) {
         // The offset of insert position from the block
         loc_pos_ = i;
-        return;
+        break;
       } else if (visited_related_) {
         // If meet the target consumer, stop searching
-        visited_block_ = visited_block_ || previous_visited_block;
-        return;
+        break;
       }
     }
+    visited_block_ = visited_block_ || previous_visited_block;
   }
 
   void VisitStmt_(const BlockNode* block) final {
@@ -1078,7 +1078,7 @@ class ReIndexRewriter : public StmtExprMutator {
   Region region_;
 };
 
-void CheckRegionCover(const ScheduleState& self, StmtSRef scope_root) {
+void CheckRegionCover(const ScheduleState& self, StmtSRef scope_root, Buffer read_buffer) {
   class NotRegionCoverError : public ScheduleError {
    public:
     explicit NotRegionCoverError(IRModule mod, Block block) : mod_(mod), block_(block) {}
@@ -1095,12 +1095,16 @@ The region cover property require to hold for every of its child blocks
     IRModule mod_;
     Block block_;
   };
-  BlockScope scope = self->GetBlockScope(scope_root);
-  for (const auto& kv : scope->dst2deps) {
-    const StmtSRef& consumer_block_sref = kv.first;
-    if (!self->block_info.at(consumer_block_sref).region_cover) {
-      const BlockNode* block = TVM_SREF_TO_BLOCK(scope_root);
-      throw NotRegionCoverError(self->mod, GetRef<Block>(block));
+
+  for (const auto& child_block_sref : tir::GetChildBlocks(self, scope_root)) {
+    const BlockNode* child_block = TVM_SREF_TO_BLOCK(child_block_sref);
+    for (const BufferRegion& region : child_block->reads) {
+      if (region->buffer.same_as(read_buffer)) {
+        if (!self->block_info.at(child_block_sref).region_cover) {
+          const BlockNode* block = TVM_SREF_TO_BLOCK(scope_root);
+          throw NotRegionCoverError(self->mod, GetRef<Block>(block));
+        }
+      }
     }
   }
 }
@@ -1129,7 +1133,7 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff
       GetNthAccessBuffer(self, GetRef<Block>(block), read_buffer_index, BufferIndexType::kRead);
   StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
   // Check required region cover for cache_read
-  CheckRegionCover(self, scope_sref);
+  CheckRegionCover(self, scope_sref, read_buffer);
   const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref);
 
   // Step 2. Create CacheStageInfo
@@ -1281,7 +1285,7 @@ Array<StmtSRef> CacheInplace(ScheduleState self, const StmtSRef& block_sref, int
   StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false);
 
   // Check 3. Check required region cover for cache_read
-  CheckRegionCover(self, scope_sref);
+  CheckRegionCover(self, scope_sref, buffer);
 
   // Check 4. Check if target block both read & write target buffer.
   const BlockNode* rw_block = TVM_SREF_TO_BLOCK(block_sref);
@@ -1318,6 +1322,8 @@ Array<StmtSRef> CacheInplace(ScheduleState self, const StmtSRef& block_sref, int
   StmtSRef result_block_sref = self->stmt2ref.at(cache_read_stage.get());
   BlockInfo& block_info_read = self->block_info[result_block_sref];
   block_info_read.affine_binding = CalculateAffineFlag(self, result_block_sref);
+  block_info_read.region_cover = true;
+  block_info_read.scope->stage_pipeline = false;
   results_block_sref.push_back(result_block_sref);
 
   // Do cache write
diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py b/tests/python/unittest/test_tir_schedule_cache_read_write.py
index a237a5b75839..3476ca083056 100644
--- a/tests/python/unittest/test_tir_schedule_cache_read_write.py
+++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py
@@ -59,6 +59,33 @@ def elementwise_shape_int64(a: T.handle, c: T.handle) -> None:
             C[vi, vj] = B[vi, vj] + 1.0
 
 
+@T.prim_func
+def func_nested_seq(b: T.handle, c: T.handle) -> None:
+    A = T.alloc_buffer((128, 128))
+    B = T.match_buffer(b, (128, 128))
+    C = T.match_buffer(c, (128, 128))
+
+    for i, j in T.grid(128, 128):
+        with T.block("A"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            A[vi, vj] = 2.0
+    for i, j in T.grid(8, 8):
+        for x, y in T.grid(16, 16):
+            with T.block("B0"):
+                vi = T.axis.S(128, i * 16 + x)
+                vj = T.axis.S(128, j * 16 + y)
+                B[vi, vj] = 1.0
+        for x, y in T.grid(16, 16):
+            with T.block("B1"):
+                vi = T.axis.S(128, i * 16 + x)
+                vj = T.axis.S(128, j * 16 + y)
+                B[vi, vj] = A[vi, vj] + B[vi, vj]
+    for i, j in T.grid(128, 128):
+        with T.block("C"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            C[vi, vj] = A[vi, vj] * 2.0
+
+
 @T.prim_func
 def access_under_scope(b: T.handle, c: T.handle) -> None:
     A = T.alloc_buffer((128, 128))
@@ -250,6 +277,47 @@ def inplace_call(data_io: T.Buffer[(64), "int32"]):
             T.evaluate(T.call_extern("call_impl", data_io.data, dtype=""))
 
 
+@T.prim_func
+def cache_read_nested_seq_target(
+    B: T.Buffer[(128, 128), "float32"], C: T.Buffer[(128, 128), "float32"]
+) -> None:
+    A = T.alloc_buffer([128, 128], dtype="float32")
+    A_global = T.alloc_buffer([128, 128], dtype="float32")
+    for i, j in T.grid(128, 128):
+        with T.block("A"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            T.reads()
+            T.writes(A[vi, vj])
+            A[vi, vj] = T.float32(2)
+    for i, j in T.grid(8, 8):
+        for x, y in T.grid(16, 16):
+            with T.block("B0"):
+                vi = T.axis.spatial(128, i * 16 + x)
+                vj = T.axis.spatial(128, j * 16 + y)
+                T.reads()
+                T.writes(B[vi, vj])
+                B[vi, vj] = T.float32(1)
+        for x, y in T.grid(16, 16):
+            with T.block("B1"):
+                vi = T.axis.spatial(128, i * 16 + x)
+                vj = T.axis.spatial(128, j * 16 + y)
+                T.reads(A[vi, vj], B[vi, vj])
+                T.writes(B[vi, vj])
+                B[vi, vj] = A[vi, vj] + B[vi, vj]
+    for ax0, ax1 in T.grid(128, 128):
+        with T.block("A_global"):
+            v0, v1 = T.axis.remap("SS", [ax0, ax1])
+            T.reads(A[v0, v1])
+            T.writes(A_global[v0, v1])
+            A_global[v0, v1] = A[v0, v1]
+    for i, j in T.grid(128, 128):
+        with T.block("C"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            T.reads(A_global[vi, vj])
+            T.writes(C[vi, vj])
+            C[vi, vj] = A_global[vi, vj] * T.float32(2)
+
+
 ########## Expected function after cache_read ##########
 
 
@@ -989,6 +1057,14 @@ def test_cache_inplace():
     verify_trace_roundtrip(sch=sch, mod=inplace_call, debug_mask=debug_mask)
 
 
+def test_cache_read_nested_seq(use_block_name):
+    sch = tir.Schedule(func_nested_seq, debug_mask="all")
+    block_c = "C" if use_block_name else sch.get_block("C")
+    sch.cache_read(block_c, 0, "global", consumer_blocks=[block_c])
+    tvm.ir.assert_structural_equal(cache_read_nested_seq_target, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=func_nested_seq)
+
+
 ########## Testcases for cache_write ##########
 
 
From b20b7c4ad4ad3774a42f47614245f8eeabe875cb Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Fri, 11 Nov 2022 22:25:23 -0800
Subject: [PATCH 566/704] [TVMScript] Reorganize the folder structure (#12496)

This PR introduces some minor restructuring of the `python/tvm/script`
folder structure to make it more convenient for future upstreaming.

Co-authored-by: Yaxing Cai <caiyaxing666@gmail.com>
---
 include/tvm/ir/expr.h                         |  4 +
 python/tvm/script/__init__.py                 |  7 +-
 python/tvm/script/ir_builder/tir/__init__.py  |  1 +
 python/tvm/script/ir_builder/tir/ir.py        | 11 ++-
 .../script/{_parser => parser}/__init__.py    |  1 +
 .../tvm/script/{_parser => parser}/_core.py   |  0
 .../{_parser => parser}/core/__init__.py      |  0
 .../{_parser => parser}/core/diagnostics.py   | 39 +++++---
 .../{_parser => parser}/core/dispatch.py      |  0
 .../script/{_parser => parser}/core/doc.py    |  0
 .../{_parser => parser}/core/doc_core.py      |  0
 .../script/{_parser => parser}/core/entry.py  | 10 ++
 .../{_parser => parser}/core/evaluator.py     |  0
 .../script/{_parser => parser}/core/parser.py |  0
 .../script/{_parser => parser}/core/utils.py  | 36 ++++++-
 .../script/{_parser => parser}/ir/__init__.py |  4 +-
 .../script/{_parser => parser}/ir/entry.py    | 32 +------
 .../script/{_parser => parser}/ir/parser.py   |  0
 .../{_parser => parser}/tir/__init__.py       |  0
 .../script/{_parser => parser}/tir/entry.py   |  8 +-
 .../{_parser => parser}/tir/operation.py      | 12 +--
 .../script/{_parser => parser}/tir/parser.py  |  2 +-
 python/tvm/script/parser_v1/__init__.py       | 21 ++++
 python/tvm/script/{ => parser_v1}/_ffi_api.py |  0
 .../{ => parser_v1}/context_maintainer.py     |  0
 .../tvm/script/{ => parser_v1}/diagnostics.py |  0
 .../script/{ => parser_v1}/meta_unparser.py   |  0
 python/tvm/script/{ => parser_v1}/parser.py   |  0
 python/tvm/script/{ => parser_v1}/registry.py |  0
 .../script/{ => parser_v1}/tir/__init__.py    |  0
 .../script/{ => parser_v1}/tir/__init__.pyi   |  0
 .../tvm/script/{ => parser_v1}/tir/intrin.py  |  2 +-
 python/tvm/script/{ => parser_v1}/tir/node.py |  0
 .../script/{ => parser_v1}/tir/prim_func.py   |  0
 .../{ => parser_v1}/tir/scope_handler.py      |  0
 .../{ => parser_v1}/tir/special_stmt.py       |  0
 python/tvm/script/{ => parser_v1}/tir/ty.py   |  0
 python/tvm/script/{ => parser_v1}/utils.py    |  0
 python/tvm/tir/__init__.py                    |  2 +
 python/tvm/tir/buffer.py                      | 28 +++---
 python/tvm/tir/expr.py                        |  2 +
 python/tvm/tir/schedule/schedule.py           |  6 +-
 python/tvm/tir/tensor_intrin/cuda.py          | 12 +--
 python/tvm/tir/tensor_intrin/hexagon.py       |  8 +-
 python/tvm/tir/tensor_intrin/rocm.py          |  1 +
 .../test_hexagon/test_async_dma_pipeline.py   |  6 +-
 .../relay/aot/test_pass_aot_lower_main.py     |  6 +-
 .../unittest/test_tir_lower_match_buffer.py   | 42 +++-----
 .../unittest/test_tir_schedule_reindex.py     |  2 +-
 .../test_tir_schedule_transform_layout.py     | 11 ++-
 .../unittest/test_tir_schedule_utilities.py   |  2 +-
 ...est_tir_transform_inject_virtual_thread.py |  1 +
 ..._transform_lower_cross_thread_reduction.py | 96 -------------------
 .../test_tir_transform_remove_assume.py       |  6 +-
 .../unittest/test_tvmscript_error_report.py   |  1 -
 .../unittest/test_tvmscript_ir_builder_tir.py |  4 +-
 .../test_tvmscript_parser_evaluator.py        |  4 +-
 .../unittest/test_tvmscript_parser_ir.py      |  2 +-
 .../unittest/test_tvmscript_parser_source.py  |  4 +-
 .../unittest/test_tvmscript_parser_tir.py     |  2 +-
 tests/python/unittest/test_tvmscript_spans.py |  2 +-
 .../unittest/test_tvmscript_syntax_sugar.py   |  6 +-
 62 files changed, 199 insertions(+), 247 deletions(-)
 rename python/tvm/script/{_parser => parser}/__init__.py (97%)
 rename python/tvm/script/{_parser => parser}/_core.py (100%)
 rename python/tvm/script/{_parser => parser}/core/__init__.py (100%)
 rename python/tvm/script/{_parser => parser}/core/diagnostics.py (88%)
 rename python/tvm/script/{_parser => parser}/core/dispatch.py (100%)
 rename python/tvm/script/{_parser => parser}/core/doc.py (100%)
 rename python/tvm/script/{_parser => parser}/core/doc_core.py (100%)
 rename python/tvm/script/{_parser => parser}/core/entry.py (83%)
 rename python/tvm/script/{_parser => parser}/core/evaluator.py (100%)
 rename python/tvm/script/{_parser => parser}/core/parser.py (100%)
 rename python/tvm/script/{_parser => parser}/core/utils.py (63%)
 rename python/tvm/script/{_parser => parser}/ir/__init__.py (89%)
 rename python/tvm/script/{_parser => parser}/ir/entry.py (61%)
 rename python/tvm/script/{_parser => parser}/ir/parser.py (100%)
 rename python/tvm/script/{_parser => parser}/tir/__init__.py (100%)
 rename python/tvm/script/{_parser => parser}/tir/entry.py (94%)
 rename python/tvm/script/{_parser => parser}/tir/operation.py (90%)
 rename python/tvm/script/{_parser => parser}/tir/parser.py (99%)
 create mode 100644 python/tvm/script/parser_v1/__init__.py
 rename python/tvm/script/{ => parser_v1}/_ffi_api.py (100%)
 rename python/tvm/script/{ => parser_v1}/context_maintainer.py (100%)
 rename python/tvm/script/{ => parser_v1}/diagnostics.py (100%)
 rename python/tvm/script/{ => parser_v1}/meta_unparser.py (100%)
 rename python/tvm/script/{ => parser_v1}/parser.py (100%)
 rename python/tvm/script/{ => parser_v1}/registry.py (100%)
 rename python/tvm/script/{ => parser_v1}/tir/__init__.py (100%)
 rename python/tvm/script/{ => parser_v1}/tir/__init__.pyi (100%)
 rename python/tvm/script/{ => parser_v1}/tir/intrin.py (99%)
 rename python/tvm/script/{ => parser_v1}/tir/node.py (100%)
 rename python/tvm/script/{ => parser_v1}/tir/prim_func.py (100%)
 rename python/tvm/script/{ => parser_v1}/tir/scope_handler.py (100%)
 rename python/tvm/script/{ => parser_v1}/tir/special_stmt.py (100%)
 rename python/tvm/script/{ => parser_v1}/tir/ty.py (100%)
 rename python/tvm/script/{ => parser_v1}/utils.py (100%)

diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h
index b9afb4be2def..94927b4892eb 100644
--- a/include/tvm/ir/expr.h
+++ b/include/tvm/ir/expr.h
@@ -764,6 +764,10 @@ struct PackedFuncValueConverter<PrimExpr> {
       return PrimExpr(ObjectPtr<Object>(nullptr));
     }
     if (val.type_code() == kDLInt) {
+      int64_t value = val.operator int64_t();
+      if (value > std::numeric_limits<int>::max() || value < std::numeric_limits<int>::min()) {
+        return IntImm(runtime::DataType::Int(64), value);
+      }
       return IntImm(runtime::DataType::Int(32), val.operator int());
     }
     if (val.type_code() == kDLFloat) {
diff --git a/python/tvm/script/__init__.py b/python/tvm/script/__init__.py
index 555659d0c55e..21bdfa6f1691 100644
--- a/python/tvm/script/__init__.py
+++ b/python/tvm/script/__init__.py
@@ -14,8 +14,5 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""TVM Script APIs of TVM Python Package, aimed to support TIR"""
-
-from . import tir
-
-from .parser import ir_module, from_source
+"""TVM Script APIs of TVM Python Package"""
+from .parser import ir, ir_module, parse as from_source, tir
diff --git a/python/tvm/script/ir_builder/tir/__init__.py b/python/tvm/script/ir_builder/tir/__init__.py
index 1e43d1af3498..0a71af4db7e6 100644
--- a/python/tvm/script/ir_builder/tir/__init__.py
+++ b/python/tvm/script/ir_builder/tir/__init__.py
@@ -16,3 +16,4 @@
 # under the License.
 """Package tvm.script.ir_builder.tir"""
 from .ir import *  # pylint: disable=wildcard-import,redefined-builtin
+from .ir import boolean as bool  # pylint: disable=redefined-builtin
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index bd9e4e1db522..0678925e2f7c 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -1665,13 +1665,14 @@ def wrapped(*args, **kwargs):
 end_profile_intrinsic = _op_wrapper(_tir_op.end_profile_intrinsic)
 
 
-class inline:
-    """Inline function for meta-programming.
+class meta_var:
+    """A meta variable used in TVMScript metaprogramming. It means that the value of the variable
+    does not appear in the final TIR, but only stays in the parser.
 
     Parameters
     ----------
     value: Any
-        The value to be inlined.
+        The meta variable.
     """
 
     def __init__(self, value: Any) -> None:
@@ -1680,7 +1681,7 @@ def __init__(self, value: Any) -> None:
     def __iter__(self):
         def f():
             for i in self.value:
-                yield inline(i)
+                yield meta_var(i)
 
         return f()
 
@@ -1844,7 +1845,7 @@ def f():
     "TVMBackendFreeWorkspace",
     "start_profile_intrinsic",
     "end_profile_intrinsic",
-    "inline",
+    "meta_var",
     "llvm_lookup_intrinsic_id",
     "type_annotation",
     "broadcast",
diff --git a/python/tvm/script/_parser/__init__.py b/python/tvm/script/parser/__init__.py
similarity index 97%
rename from python/tvm/script/_parser/__init__.py
rename to python/tvm/script/parser/__init__.py
index 38c8b88cc7ca..5161a2601c49 100644
--- a/python/tvm/script/_parser/__init__.py
+++ b/python/tvm/script/parser/__init__.py
@@ -16,5 +16,6 @@
 # under the Licens.
 """The parser"""
 from . import _core, ir, tir
+from ._core import parse
 from .ir import ir_module
 from .tir import prim_func
diff --git a/python/tvm/script/_parser/_core.py b/python/tvm/script/parser/_core.py
similarity index 100%
rename from python/tvm/script/_parser/_core.py
rename to python/tvm/script/parser/_core.py
diff --git a/python/tvm/script/_parser/core/__init__.py b/python/tvm/script/parser/core/__init__.py
similarity index 100%
rename from python/tvm/script/_parser/core/__init__.py
rename to python/tvm/script/parser/core/__init__.py
diff --git a/python/tvm/script/_parser/core/diagnostics.py b/python/tvm/script/parser/core/diagnostics.py
similarity index 88%
rename from python/tvm/script/_parser/core/diagnostics.py
rename to python/tvm/script/parser/core/diagnostics.py
index b077d221424c..d673e0eb139f 100644
--- a/python/tvm/script/_parser/core/diagnostics.py
+++ b/python/tvm/script/parser/core/diagnostics.py
@@ -17,7 +17,6 @@
 """TVM Script Parser Source and diagnostics"""
 
 import inspect
-import re
 import sys
 from typing import Union
 
@@ -144,18 +143,34 @@ def findsource(obj):
     if not lines:
         raise OSError("could not get source code")
     qual_names = obj.__qualname__.replace(".<locals>", "<locals>").split(".")
-    pattern_list = []
-    for name in qual_names:
-        if name.endswith("<locals>"):
-            pattern_list.append(re.compile(r"^(\s*)def\s*" + name[:-8] + r"\b"))
-        else:
-            pattern_list.append(re.compile(r"^(\s*)class\s*" + name + r"\b"))
+    in_comment = 0
+    scope_stack = []
+    indent_info = {}
     for i, line in enumerate(lines):
-        match = pattern_list[0].match(line)
-        if match:
-            pattern_list.pop(0)
-        if not pattern_list:
-            return lines, i
+        n_comment = line.count('"""')
+        if n_comment:
+            # update multi-line comments status
+            in_comment = in_comment ^ (n_comment & 1)
+            continue
+        if in_comment:
+            # skip lines within multi-line comments
+            continue
+        indent = len(line) - len(line.lstrip())
+        tokens = line.split()
+        if len(tokens) > 1:
+            name = None
+            if tokens[0] == "def":
+                name = tokens[1].split(":")[0].split("(")[0] + "<locals>"
+            elif tokens[0] == "class":
+                name = tokens[1].split(":")[0].split("(")[0]
+            if name:
+                while scope_stack and indent_info[scope_stack[-1]] >= indent:
+                    scope_stack.pop()
+                scope_stack.append(name)
+                indent_info[name] = indent
+                if scope_stack == qual_names:
+                    return lines, i
+
     raise OSError("could not find class definition")
 
 
diff --git a/python/tvm/script/_parser/core/dispatch.py b/python/tvm/script/parser/core/dispatch.py
similarity index 100%
rename from python/tvm/script/_parser/core/dispatch.py
rename to python/tvm/script/parser/core/dispatch.py
diff --git a/python/tvm/script/_parser/core/doc.py b/python/tvm/script/parser/core/doc.py
similarity index 100%
rename from python/tvm/script/_parser/core/doc.py
rename to python/tvm/script/parser/core/doc.py
diff --git a/python/tvm/script/_parser/core/doc_core.py b/python/tvm/script/parser/core/doc_core.py
similarity index 100%
rename from python/tvm/script/_parser/core/doc_core.py
rename to python/tvm/script/parser/core/doc_core.py
diff --git a/python/tvm/script/_parser/core/entry.py b/python/tvm/script/parser/core/entry.py
similarity index 83%
rename from python/tvm/script/_parser/core/entry.py
rename to python/tvm/script/parser/core/entry.py
index a0974c8fd419..bf6a118672df 100644
--- a/python/tvm/script/_parser/core/entry.py
+++ b/python/tvm/script/parser/core/entry.py
@@ -40,6 +40,16 @@ def parse(program: Union[doc.AST, Any, str], extra_vars: Dict[str, Any] = None)
     func : Any
         The parsed TVMScript program.
     """
+    if extra_vars is None:
+        from tvm.script.parser import ir  # pylint: disable=import-outside-toplevel
+        from tvm.script.parser import tir  # pylint: disable=import-outside-toplevel
+
+        extra_vars = {
+            "I": ir,
+            "ir": ir,
+            "T": tir,
+            "tir": tir,
+        }
 
     source = Source(program)
     parser = Parser(source)
diff --git a/python/tvm/script/_parser/core/evaluator.py b/python/tvm/script/parser/core/evaluator.py
similarity index 100%
rename from python/tvm/script/_parser/core/evaluator.py
rename to python/tvm/script/parser/core/evaluator.py
diff --git a/python/tvm/script/_parser/core/parser.py b/python/tvm/script/parser/core/parser.py
similarity index 100%
rename from python/tvm/script/_parser/core/parser.py
rename to python/tvm/script/parser/core/parser.py
diff --git a/python/tvm/script/_parser/core/utils.py b/python/tvm/script/parser/core/utils.py
similarity index 63%
rename from python/tvm/script/_parser/core/utils.py
rename to python/tvm/script/parser/core/utils.py
index 65e7166bfcc2..a304afddbe55 100644
--- a/python/tvm/script/_parser/core/utils.py
+++ b/python/tvm/script/parser/core/utils.py
@@ -17,7 +17,10 @@
 """TVM Script Parser utils"""
 
 import inspect
-from typing import Any, Callable, Dict
+from types import FrameType
+from typing import Any, Callable, Dict, List
+
+from .diagnostics import findsource
 
 
 def inspect_function_capture(func: Callable) -> Dict[str, Any]:
@@ -59,3 +62,34 @@ def inspect_class_capture(cls: type) -> Dict[str, Any]:
             func_vars = inspect_function_capture(v)
             result.update(**func_vars)
     return result
+
+
+def is_defined_in_class(frames: List[FrameType], obj: Any) -> bool:
+    """Check whether a object is defined in a class scope.
+
+    Parameters
+    ----------
+    frames : List[FrameType]
+        The frame stack of the object, obtained by `inspect.stack()`.
+
+    Returns
+    -------
+    res : bool
+        The result if the object is defined in a class scope.
+    """
+    if len(frames) > 2:
+        frame_info = frames[2]
+        code_context = frame_info.code_context
+        if code_context is None:
+            return False
+        line = code_context[0].strip()
+        if line.startswith("@") and "ir_module" in line:
+            return True
+        if line.startswith("class"):
+            lineno = frame_info.lineno
+            if lineno >= 2:
+                source, _ = findsource(obj)
+                line = source[lineno - 2].strip()
+                if line.startswith("@") and "ir_module" in line:
+                    return True
+    return False
diff --git a/python/tvm/script/_parser/ir/__init__.py b/python/tvm/script/parser/ir/__init__.py
similarity index 89%
rename from python/tvm/script/_parser/ir/__init__.py
rename to python/tvm/script/parser/ir/__init__.py
index b15468d37a7e..fedd2f0a14a8 100644
--- a/python/tvm/script/_parser/ir/__init__.py
+++ b/python/tvm/script/parser/ir/__init__.py
@@ -17,6 +17,6 @@
 """The ir module parser"""
 
 from . import parser as _parser
-from .entry import ir_module, is_defined_in_class
+from .entry import ir_module
 
-__all__ = ["ir_module", "is_defined_in_class"]
+__all__ = ["ir_module"]
diff --git a/python/tvm/script/_parser/ir/entry.py b/python/tvm/script/parser/ir/entry.py
similarity index 61%
rename from python/tvm/script/_parser/ir/entry.py
rename to python/tvm/script/parser/ir/entry.py
index e8bc8b702db0..94fc3d2e2c7e 100644
--- a/python/tvm/script/_parser/ir/entry.py
+++ b/python/tvm/script/parser/ir/entry.py
@@ -17,41 +17,13 @@
 """The entry point of TVM parser for ir module."""
 
 import inspect
-from typing import List, Type
-from types import FrameType
+from typing import Type
 
 from tvm.ir import IRModule
 
 from .._core import parse, utils
 
 
-def is_defined_in_class(frames: List[FrameType]) -> bool:
-    """Check whether a object is defined in a class scope.
-
-    Parameters
-    ----------
-    frames : List[FrameType]
-        The frame stack of the object, obtained by `inspect.stack()`.
-
-    Returns
-    -------
-    res : bool
-        The result if the object is defined in a class scope.
-    """
-    if len(frames) > 2:
-        maybe_class_frame = frames[2]
-        statement_list = maybe_class_frame[4]
-        if statement_list is None:
-            return False
-        first_statement = statement_list[0]
-        line = first_statement.strip()
-        if line.startswith("class "):
-            return True
-        if line.startswith("@") and "ir_module" in line:
-            return True
-    return False
-
-
 def ir_module(mod: Type) -> IRModule:
     """The parsing method for ir module, by using `@ir_module` as decorator.
 
@@ -62,7 +34,7 @@ def ir_module(mod: Type) -> IRModule:
 
     Returns
     -------
-    irmodule : IRModule
+    ir_module : IRModule
         The parsed ir module.
     """
     if not inspect.isclass(mod):
diff --git a/python/tvm/script/_parser/ir/parser.py b/python/tvm/script/parser/ir/parser.py
similarity index 100%
rename from python/tvm/script/_parser/ir/parser.py
rename to python/tvm/script/parser/ir/parser.py
diff --git a/python/tvm/script/_parser/tir/__init__.py b/python/tvm/script/parser/tir/__init__.py
similarity index 100%
rename from python/tvm/script/_parser/tir/__init__.py
rename to python/tvm/script/parser/tir/__init__.py
diff --git a/python/tvm/script/_parser/tir/entry.py b/python/tvm/script/parser/tir/entry.py
similarity index 94%
rename from python/tvm/script/_parser/tir/entry.py
rename to python/tvm/script/parser/tir/entry.py
index 632b87aa24dc..a5c134a8594c 100644
--- a/python/tvm/script/_parser/tir/entry.py
+++ b/python/tvm/script/parser/tir/entry.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """The entry point of TVM parser for tir."""
-
 import inspect
 from typing import Callable, Union
 
@@ -23,7 +22,6 @@
 
 from ...ir_builder.tir import buffer_decl, ptr
 from .._core import parse, utils
-from ..ir import is_defined_in_class
 
 
 def prim_func(func: Callable) -> Union[PrimFunc, Callable]:
@@ -41,7 +39,7 @@ def prim_func(func: Callable) -> Union[PrimFunc, Callable]:
     """
     if not inspect.isfunction(func):
         raise TypeError(f"Expect a function, but got: {func}")
-    if is_defined_in_class(inspect.stack()):
+    if utils.is_defined_in_class(inspect.stack(), func):
         return func
     return parse(func, utils.inspect_function_capture(func))
 
@@ -57,7 +55,7 @@ class BufferProxy:
     def __call__(
         self,
         shape,
-        dtype="float32",
+        dtype=None,
         data=None,
         strides=None,
         elem_offset=None,
@@ -67,6 +65,8 @@ def __call__(
         buffer_type="",
         axis_separators=None,
     ) -> Buffer:
+        if dtype is None:
+            raise ValueError("Data type must be specified when constructing buffer")
         return buffer_decl(
             shape,
             dtype=dtype,
diff --git a/python/tvm/script/_parser/tir/operation.py b/python/tvm/script/parser/tir/operation.py
similarity index 90%
rename from python/tvm/script/_parser/tir/operation.py
rename to python/tvm/script/parser/tir/operation.py
index ed8f07a06369..f0c04f47cdf6 100644
--- a/python/tvm/script/_parser/tir/operation.py
+++ b/python/tvm/script/parser/tir/operation.py
@@ -46,12 +46,12 @@ def r(op: Type, i: int, m: OpMethod):  # pylint: disable=invalid-name
 
     for i in [0, 1]:
         # Case 1. binop
-        r(doc.Add, i, lambda a, b: a + b)
-        r(doc.Sub, i, lambda a, b: a - b)
-        r(doc.Mult, i, lambda a, b: a * b)
-        r(doc.Div, i, lambda a, b: a / b)
-        r(doc.FloorDiv, i, lambda a, b: a // b)
-        r(doc.Mod, i, lambda a, b: a % b)
+        r(doc.Add, i, tir.Add)
+        r(doc.Sub, i, tir.Sub)
+        r(doc.Mult, i, tir.Mul)
+        r(doc.Div, i, tir.Div)
+        r(doc.FloorDiv, i, tir.FloorDiv)
+        r(doc.Mod, i, tir.FloorMod)
         r(doc.LShift, i, lambda a, b: a << b)
         r(doc.RShift, i, lambda a, b: a >> b)
         r(doc.BitOr, i, lambda a, b: a | b)
diff --git a/python/tvm/script/_parser/tir/parser.py b/python/tvm/script/parser/tir/parser.py
similarity index 99%
rename from python/tvm/script/_parser/tir/parser.py
rename to python/tvm/script/parser/tir/parser.py
index 909238563fab..1370758f5a5b 100644
--- a/python/tvm/script/_parser/tir/parser.py
+++ b/python/tvm/script/parser/tir/parser.py
@@ -125,7 +125,7 @@ def bind_assign_value(self: Parser, node: doc.expr, var_name: str, value: Any) -
     res : Any
         The bound value.
     """
-    if isinstance(value, T.inline):
+    if isinstance(value, T.meta_var):
         return value.value
     elif isinstance(value, (list, tuple)):
         for i, v in enumerate(value):
diff --git a/python/tvm/script/parser_v1/__init__.py b/python/tvm/script/parser_v1/__init__.py
new file mode 100644
index 000000000000..555659d0c55e
--- /dev/null
+++ b/python/tvm/script/parser_v1/__init__.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""TVM Script APIs of TVM Python Package, aimed to support TIR"""
+
+from . import tir
+
+from .parser import ir_module, from_source
diff --git a/python/tvm/script/_ffi_api.py b/python/tvm/script/parser_v1/_ffi_api.py
similarity index 100%
rename from python/tvm/script/_ffi_api.py
rename to python/tvm/script/parser_v1/_ffi_api.py
diff --git a/python/tvm/script/context_maintainer.py b/python/tvm/script/parser_v1/context_maintainer.py
similarity index 100%
rename from python/tvm/script/context_maintainer.py
rename to python/tvm/script/parser_v1/context_maintainer.py
diff --git a/python/tvm/script/diagnostics.py b/python/tvm/script/parser_v1/diagnostics.py
similarity index 100%
rename from python/tvm/script/diagnostics.py
rename to python/tvm/script/parser_v1/diagnostics.py
diff --git a/python/tvm/script/meta_unparser.py b/python/tvm/script/parser_v1/meta_unparser.py
similarity index 100%
rename from python/tvm/script/meta_unparser.py
rename to python/tvm/script/parser_v1/meta_unparser.py
diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser_v1/parser.py
similarity index 100%
rename from python/tvm/script/parser.py
rename to python/tvm/script/parser_v1/parser.py
diff --git a/python/tvm/script/registry.py b/python/tvm/script/parser_v1/registry.py
similarity index 100%
rename from python/tvm/script/registry.py
rename to python/tvm/script/parser_v1/registry.py
diff --git a/python/tvm/script/tir/__init__.py b/python/tvm/script/parser_v1/tir/__init__.py
similarity index 100%
rename from python/tvm/script/tir/__init__.py
rename to python/tvm/script/parser_v1/tir/__init__.py
diff --git a/python/tvm/script/tir/__init__.pyi b/python/tvm/script/parser_v1/tir/__init__.pyi
similarity index 100%
rename from python/tvm/script/tir/__init__.pyi
rename to python/tvm/script/parser_v1/tir/__init__.pyi
diff --git a/python/tvm/script/tir/intrin.py b/python/tvm/script/parser_v1/tir/intrin.py
similarity index 99%
rename from python/tvm/script/tir/intrin.py
rename to python/tvm/script/parser_v1/tir/intrin.py
index 8e24f27325bd..9cde8e3f6d08 100644
--- a/python/tvm/script/tir/intrin.py
+++ b/python/tvm/script/parser_v1/tir/intrin.py
@@ -22,7 +22,7 @@
 import tvm.tir
 from tvm.tir import FloatImm
 
-from ...target import codegen
+from ....target import codegen
 from ..registry import register
 from ..utils import get_param_list, tvm_span_from_synr
 
diff --git a/python/tvm/script/tir/node.py b/python/tvm/script/parser_v1/tir/node.py
similarity index 100%
rename from python/tvm/script/tir/node.py
rename to python/tvm/script/parser_v1/tir/node.py
diff --git a/python/tvm/script/tir/prim_func.py b/python/tvm/script/parser_v1/tir/prim_func.py
similarity index 100%
rename from python/tvm/script/tir/prim_func.py
rename to python/tvm/script/parser_v1/tir/prim_func.py
diff --git a/python/tvm/script/tir/scope_handler.py b/python/tvm/script/parser_v1/tir/scope_handler.py
similarity index 100%
rename from python/tvm/script/tir/scope_handler.py
rename to python/tvm/script/parser_v1/tir/scope_handler.py
diff --git a/python/tvm/script/tir/special_stmt.py b/python/tvm/script/parser_v1/tir/special_stmt.py
similarity index 100%
rename from python/tvm/script/tir/special_stmt.py
rename to python/tvm/script/parser_v1/tir/special_stmt.py
diff --git a/python/tvm/script/tir/ty.py b/python/tvm/script/parser_v1/tir/ty.py
similarity index 100%
rename from python/tvm/script/tir/ty.py
rename to python/tvm/script/parser_v1/tir/ty.py
diff --git a/python/tvm/script/utils.py b/python/tvm/script/parser_v1/utils.py
similarity index 100%
rename from python/tvm/script/utils.py
rename to python/tvm/script/parser_v1/utils.py
diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index d02f7fab7a5c..a2e341d82354 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -48,6 +48,7 @@
 from .op import call_packed_lowered, call_cpacked_lowered
 from .op import call_packed, call_cpacked, call_intrin, call_pure_extern, call_extern
 from .op import call_llvm_intrin, call_llvm_pure_intrin, ret, all, any, min_value, max_value, trace
+from .op import tvm_check_return
 from .op import tvm_stack_alloca, tvm_stack_make_shape, tvm_stack_make_array
 from .op import tvm_tuple, tvm_struct_get, tvm_struct_set
 from .op import address_of, lookup_param, assume, undef
@@ -74,6 +75,7 @@
 from .op import comm_reducer, min, max, sum
 from .op import q_multiply_shift, q_multiply_shift_per_axis, shift_left, shift_right
 from .op import TVMBackendAllocWorkspace, TVMBackendFreeWorkspace
+from .op import start_profile_intrinsic, end_profile_intrinsic
 from .generic import add, subtract, multiply
 
 from .schedule import StmtSRef, BlockScope, ScheduleState, Schedule, ScheduleError
diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py
index 6d9d3ce1d100..726d5d1c988c 100644
--- a/python/tvm/tir/buffer.py
+++ b/python/tvm/tir/buffer.py
@@ -184,31 +184,31 @@ def __getitem__(self, indices):
 
         if not isinstance(indices, (tuple, list)):
             indices = [indices]
-        if any(isinstance(index, slice) and index.step is None for index in indices):
+        has_slice = any(isinstance(i, slice) for i in indices)
+        has_step = any(isinstance(i, slice) and i.step is not None for i in indices)
+        analyzer = Analyzer()
+        if has_slice and not has_step:
             region = []
-            analyzer = Analyzer()
-            for index in indices:
+            for i, index in enumerate(indices):
                 if isinstance(index, slice):
-                    region.append(
-                        Range.from_min_extent(
-                            index.start, analyzer.simplify(index.stop - index.start)
-                        )
-                    )
+                    start = 0 if index.start is None else index.start
+                    stop = self.shape[i] if index.stop is None else index.stop
+                    region.append(Range.from_min_extent(start, analyzer.simplify(stop - start)))
                 else:
                     region.append(Range.from_min_extent(index, 1))
             return BufferRegion(self, region)
         else:
-            analyzer = Analyzer()
             expr_indices = []
             for index in indices:
                 if isinstance(index, slice):
-                    lanes = analyzer.simplify(
-                        (index.stop - index.start + index.step - 1) // index.step
-                    )
+                    start = 0 if index.start is None else index.start
+                    stop = self.shape[i] if index.stop is None else index.stop
+                    step = 1 if index.step is None else index.step
+                    lanes = analyzer.simplify((stop - start + step - 1) // step)
                     if lanes == 1:
-                        expr_indices.append(index.start)
+                        expr_indices.append(start)
                     else:
-                        expr_indices.append(Ramp(index.start, index.step, int(lanes)))
+                        expr_indices.append(Ramp(start, step, int(lanes)))
                 else:
                     expr_indices.append(index)
             return BufferLoad(self, expr_indices)
diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py
index beefcb0d28f8..d52fbb83c368 100644
--- a/python/tvm/tir/expr.py
+++ b/python/tvm/tir/expr.py
@@ -1005,6 +1005,8 @@ class Select(PrimExprWithOp):
     """
 
     def __init__(self, condition, true_value, false_value, span=None):
+        if isinstance(condition, bool):
+            condition = IntImm("bool", condition)
         self.__init_handle_by_constructor__(
             _ffi_api.Select, condition, true_value, false_value, span  # type: ignore
         )
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index c5b7937c6066..170179d0d4e8 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -1832,7 +1832,7 @@ def decompose_reduction(self, block: Union[BlockRV, str], loop: LoopRV) -> Block
 
         .. code-block:: python
 
-            @tvm.script.tir
+            @T.prim_func
             def before_decompose(a: ty.handle, c: ty.handle) -> None:
                 A = tir.match_buffer(a, [128, 128])
                 B = tir.match_buffer(b, [128, 128])
@@ -1851,13 +1851,13 @@ def before_decompose(a: ty.handle, c: ty.handle) -> None:
             C = sch.get_block("C")
             i, j, k = sch.get_loops(C)
             sch.decompose_reduction(C, i)
-            print(tvm.script.asscript(sch.mod["main"]))
+            print(sch.mod["main"].script())
 
         After applying decompose-reduction, the IR becomes:
 
         .. code-block:: python
 
-            @tvm.script.tir
+            @T.prim_func
             def after_decompose(a: ty.handle, c: ty.handle) -> None:
                 A = tir.match_buffer(a, [128, 128])
                 B = tir.match_buffer(b, [128, 128])
diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py
index 86dd2eee5cd7..0cde7f246465 100644
--- a/python/tvm/tir/tensor_intrin/cuda.py
+++ b/python/tvm/tir/tensor_intrin/cuda.py
@@ -138,7 +138,7 @@ def ldmatrix_desc(warp_handle: T.handle, shared_handle: T.handle) -> None:
                     v0, v1 = T.axis.remap("SS", [ax0, ax1])
                     T.reads(shared[v0, v1])
 
-                    thread_id, local_id = index_map(v0, v1)
+                    thread_id, local_id = T.meta_var(index_map(v0, v1))
                     T.writes(warp[thread_id, local_id])
                     warp[thread_id, local_id] = shared[v0, v1]
 
@@ -245,9 +245,9 @@ def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
                     i, j, k = T.axis.remap("SSR", [i, j, k])
                     b_row_ind, b_col_ind = maybe_swap(k, j)
 
-                    thread_id_C, local_id_C = index_map_C(i, j)
-                    thread_id_A, local_id_A = index_map_A(i, k)
-                    thread_id_B, local_id_B = index_map_B(b_row_ind, b_col_ind)
+                    thread_id_C, local_id_C = T.meta_var(index_map_C(i, j))
+                    thread_id_A, local_id_A = T.meta_var(index_map_A(i, k))
+                    thread_id_B, local_id_B = T.meta_var(index_map_B(b_row_ind, b_col_ind))
 
                     T.reads(
                         C[thread_id_C, local_id_C],
@@ -339,7 +339,7 @@ def mma_fill_desc(a: T.handle) -> None:
             for i0, i1 in T.grid(M_DIM, N_DIM):
                 with T.block("C_warp"):
                     i, j = T.axis.remap("SS", [i0, i1])
-                    thread_id, local_id = index_map(i, j)
+                    thread_id, local_id = T.meta_var(index_map(i, j))
                     T.reads()
                     T.writes(C_warp[thread_id, local_id])
                     C_warp[thread_id, local_id] = zero
@@ -376,7 +376,7 @@ def mma_store_desc(a: T.handle, c: T.handle) -> None:
             for i0, i1 in T.grid(M_DIM, N_DIM):
                 with T.block("C_warp"):
                     v0, v1 = T.axis.remap("SS", [i0, i1])
-                    thread_id, local_id = index_map(v0, v1)
+                    thread_id, local_id = T.meta_var(index_map(v0, v1))
                     T.reads(C_warp[thread_id, local_id])
                     T.writes(C[v0, v1])
                     C[v0, v1] = C_warp[thread_id, local_id]
diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py
index 3cad94006dd8..6fa9dd8f00ae 100644
--- a/python/tvm/tir/tensor_intrin/hexagon.py
+++ b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -30,10 +30,10 @@ def dot_product_32x4_u8u8i32_desc(
         T.reads(C[0:32], A[0:4], B[0:32, 0:4])
         T.writes(C[0:32])
         for i in T.serial(0, 32):
-            with T.init():
-                C[i] = T.int32(0)
             for k in T.serial(0, 4):
                 with T.block("update"):
+                    with T.init():
+                        C[i] = T.int32(0)
                     vi, vk = T.axis.remap("SR", [i, k])
                     C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
 
@@ -74,10 +74,10 @@ def dot_product_32x4_u8i8i32_desc(
         T.reads(C[0:32], A[0:4], B[0:32, 0:4])
         T.writes(C[0:32])
         for i in T.serial(0, 32):
-            with T.init():
-                C[i] = T.int32(0)
             for k in T.serial(0, 4):
                 with T.block("update"):
+                    with T.init():
+                        C[i] = T.int32(0)
                     vi, vk = T.axis.remap("SR", [i, k])
                     C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
 
diff --git a/python/tvm/tir/tensor_intrin/rocm.py b/python/tvm/tir/tensor_intrin/rocm.py
index 7a989d0bccaa..3700f3e8da47 100644
--- a/python/tvm/tir/tensor_intrin/rocm.py
+++ b/python/tvm/tir/tensor_intrin/rocm.py
@@ -17,6 +17,7 @@
 # pylint: disable=invalid-name,missing-function-docstring
 """Intrinsics for AMDGPU tensorization."""
 from tvm.script import tir as T
+
 from .. import TensorIntrin
 from .dot_product_common import dp4a_desc
 
diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
index 9f8e639b5330..ef9b142d6f27 100644
--- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -128,9 +128,9 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None:
         a_buffer = T.match_buffer(a_input, a_shape, dtype="uint8", scope="global")
         w_buffer = T.match_buffer(b_input, w_shape, dtype="uint8", scope="global")
         c_buffer = T.match_buffer(c_output, out_shape, dtype="int32", scope="global")
-        a_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", mem_scope="global.vtcm")
-        w_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", mem_scope="global.vtcm")
-        c_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", mem_scope="global.vtcm")
+        a_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", scope="global")
+        w_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", scope="global")
+        c_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", scope="global")
         T.evaluate(
             T.tvm_call_packed(
                 "device_api.hexagon.mem_copy_DLTensor",
diff --git a/tests/python/relay/aot/test_pass_aot_lower_main.py b/tests/python/relay/aot/test_pass_aot_lower_main.py
index 0a9d95247af0..093305203a94 100644
--- a/tests/python/relay/aot/test_pass_aot_lower_main.py
+++ b/tests/python/relay/aot/test_pass_aot_lower_main.py
@@ -17,11 +17,11 @@
 # pylint: disable=line-too-long,missing-class-docstring,missing-module-docstring,missing-function-docstring,no-self-argument,unused-argument,invalid-name
 import numpy as np
 import pytest
-
 import tvm
 import tvm.testing
-from tvm.script import tir as T
+from tvm.ir import assert_structural_equal
 from tvm.relay.backend.aot import AOTLowerMain, CallType
+from tvm.script import tir as T
 
 
 def _make_const(dtype, shape):
@@ -48,7 +48,7 @@ def _assert_lowered_main(mod, main_func, call_type, print_script=False):
     if print_script:
         print(mod["__tvm_main__"].script())
 
-    assert mod["__tvm_main__"].script() == main_func.script()
+    assert_structural_equal(mod["__tvm_main__"], main_func)
 
 
 def test_single_call_cpacked():
diff --git a/tests/python/unittest/test_tir_lower_match_buffer.py b/tests/python/unittest/test_tir_lower_match_buffer.py
index 6120cf2b673c..535e0bb3294f 100644
--- a/tests/python/unittest/test_tir_lower_match_buffer.py
+++ b/tests/python/unittest/test_tir_lower_match_buffer.py
@@ -82,14 +82,13 @@ def opaque_access(a: T.handle, b: T.handle) -> None:
                 offset_factor=1,
             )
             T.evaluate(
-                T.intrin_test(
+                intrin_test(
                     sub_A.data,
                     sub_A.elem_offset,
                     sub_A.strides[0],
                     sub_A.strides[1],
                     sub_A.shape[0],
                     sub_A.shape[1],
-                    dtype="handle",
                 )
             )
     for i, j, k in T.grid(64, 2, 8):
@@ -105,14 +104,13 @@ def opaque_access(a: T.handle, b: T.handle) -> None:
                 offset_factor=1,
             )
             T.evaluate(
-                T.intrin_test(
+                intrin_test(
                     sub_B.data,
                     sub_B.elem_offset,
                     sub_B.strides[0],
                     sub_B.strides[1],
                     sub_B.shape[0],
                     sub_B.shape[1],
-                    dtype="handle",
                 )
             )
 
@@ -126,14 +124,13 @@ def transformed_opaque_access(a: T.handle, b: T.handle) -> None:
             T.reads([])
             T.writes(A[i * 16 : i * 16 + 16, j, k * 16 : k * 16 + 16])
             T.evaluate(
-                T.intrin_test(
+                intrin_test(
                     A.data,
                     i * 131072 + j * 128 + k * 16,
                     8192,
                     128,
                     16,
                     1,
-                    dtype="handle",
                 )
             )
     for i, j, k in T.grid(64, 2, 8):
@@ -141,14 +138,13 @@ def transformed_opaque_access(a: T.handle, b: T.handle) -> None:
             T.reads([])
             T.writes(B[i, j * 32 : j * 32 + 32, k * 8 : k * 8 + 8])
             T.evaluate(
-                T.intrin_test(
+                intrin_test(
                     B.data,
                     i * 4096 + j * 2048 + k * 8,
                     64,
                     1,
                     32,
                     8,
-                    dtype="handle",
                 )
             )
 
@@ -169,14 +165,13 @@ def high_dim_opaque_access(a: T.handle) -> None:
                 offset_factor=1,
             )
             T.evaluate(
-                T.intrin_test(
+                intrin_test(
                     sub_A.data,
                     sub_A.elem_offset,
                     sub_A.strides[0],
                     sub_A.strides[1],
                     sub_A.shape[0],
                     sub_A.shape[1],
-                    dtype="handle",
                 )
             )
 
@@ -189,14 +184,13 @@ def transformed_high_dim_opaque_access(a: T.handle) -> None:
             T.reads([])
             T.writes(A[i, j * 16 : j * 16 + 16, k * 16 : k * 16 + 16])
             T.evaluate(
-                T.intrin_test(
+                intrin_test(
                     A.data,
                     i * 2048 + j * 1024 + k * 16,
                     64,
                     1,
                     16,
                     16,
-                    dtype="handle",
                 )
             )
 
@@ -217,14 +211,13 @@ def high_dim_opaque_access_with_source_strides(a: T.handle) -> None:
                 offset_factor=1,
             )
             T.evaluate(
-                T.intrin_test(
+                intrin_test(
                     sub_A.data,
                     sub_A.elem_offset,
                     sub_A.strides[0],
                     sub_A.strides[1],
                     sub_A.shape[0],
                     sub_A.shape[1],
-                    dtype="handle",
                 )
             )
 
@@ -237,14 +230,13 @@ def transformed_high_dim_opaque_access_with_source_strides(a: T.handle) -> None:
             T.reads([])
             T.writes(A[i, j * 16 : j * 16 + 16, k * 16 : k * 16 + 16])
             T.evaluate(
-                T.intrin_test(
+                intrin_test(
                     A.data,
                     i * 2576 + j * 1280 + k * 16,
                     80,
                     1,
                     16,
                     16,
-                    dtype="handle",
                 )
             )
 
@@ -298,14 +290,13 @@ def recursive_match(a: T.handle, b: T.handle) -> None:
                         offset_factor=1,
                     )
                     T.evaluate(
-                        T.intrin_test(
+                        intrin_test(
                             sub_sub_A.data,
                             sub_sub_A.elem_offset,
                             sub_sub_A.strides[0],
                             sub_sub_A.strides[1],
                             sub_sub_A.shape[0],
                             sub_sub_A.shape[1],
-                            dtype="handle",
                         )
                     )
                     for jjj, kkk in T.grid(4, 4):
@@ -343,14 +334,13 @@ def transformed_recursive_match(a: T.handle, b: T.handle) -> None:
                         ]
                     )
                     T.evaluate(
-                        T.intrin_test(
+                        intrin_test(
                             A.data,
                             i * 4096 + j * 1024 + jj * 256 + k * 16 + kk * 4,
                             64,
                             1,
                             4,
                             4,
-                            dtype="handle",
                         )
                     )
                     for jjj, kkk in T.grid(4, 4):
@@ -375,14 +365,13 @@ def symbolic_match(a: T.handle, b: T.handle, n: T.int32, m: T.int32) -> None:
                 sub_A[ii, jj] = 1
             for j in range(0, 4):
                 T.evaluate(
-                    T.intrin_test(
+                    intrin_test(
                         sub_B.data,
                         sub_B.elem_offset,
                         sub_B.strides[0],
                         sub_B.strides[1],
                         sub_B.shape[0],
                         sub_B.shape[1],
-                        dtype="handle",
                     )
                 )
 
@@ -399,14 +388,13 @@ def transformed_symbolic_match(a: T.handle, b: T.handle, n: T.int32, m: T.int32)
                 A[i * m + ii, jj] = 1
             for j in range(0, 4):
                 T.evaluate(
-                    T.intrin_test(
+                    intrin_test(
                         B.data,
                         i * n * (m * 4),
                         m * 4,
                         1,
                         2,
                         m * 4,
-                        dtype="handle",
                     )
                 )
 
@@ -423,14 +411,13 @@ def rank0_buffer(a: T.handle, b: T.handle) -> None:
             sub_B = T.match_buffer(B[i, j], (), offset_factor=1)
             sub_A[()] = 1
             T.evaluate(
-                T.intrin_test(
+                intrin_test(
                     sub_B.data,
                     sub_B.elem_offset,
                     0,
                     0,
                     0,
                     0,
-                    dtype="handle",
                 )
             )
 
@@ -445,14 +432,13 @@ def transformed_rank0_buffer(a: T.handle, b: T.handle) -> None:
             T.writes([A[i, j], B[i, j]])
             A[i, j] = 1
             T.evaluate(
-                T.intrin_test(
+                intrin_test(
                     B.data,
                     i * 8 + j,
                     0,
                     0,
                     0,
                     0,
-                    dtype="handle",
                 )
             )
 
diff --git a/tests/python/unittest/test_tir_schedule_reindex.py b/tests/python/unittest/test_tir_schedule_reindex.py
index 53bc726ceaf3..b5e66943019f 100644
--- a/tests/python/unittest/test_tir_schedule_reindex.py
+++ b/tests/python/unittest/test_tir_schedule_reindex.py
@@ -233,7 +233,7 @@ def mixed_dtype_reindex_write(
     for ax0, ax1 in T.grid(T.int64(2), 1280):
         with T.block("T_matmul_NT_reindex"):
             v0 = T.axis.spatial(T.int64(2), ax0)
-            (v1,) = T.axis.remap("S", [ax1])
+            v1 = T.axis.remap("S", [ax1])
             T.reads(T_matmul_NT_reindex[v0, v1])
             T.writes(T_matmul_NT[v0, v1])
             T_matmul_NT[v0, v1] = T_matmul_NT_reindex[v0, v1]
diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py
index 282f1dcf49e9..ca5ac12a97c0 100644
--- a/tests/python/unittest/test_tir_schedule_transform_layout.py
+++ b/tests/python/unittest/test_tir_schedule_transform_layout.py
@@ -18,7 +18,6 @@
 import sys
 
 import pytest
-
 import tvm
 import tvm.testing
 from tvm import tir
@@ -707,7 +706,7 @@ def expected(A: T.Buffer[(4, 4), "int32"], B: T.Buffer[14, "int32"]):
         for i, j in T.grid(4, 4):
             with T.block("buffer_A_assumption"):
                 vi, vj = T.axis.remap("SS", [i, j])
-                T.assume(not (vi == 3 and 2 <= vj) or A[vi, vj] == 42)
+                T.evaluate(T.assume(not (vi == 3 and 2 <= vj) or A[vi, vj] == 42))
 
         for i in T.serial(14):
             with T.block("block"):
@@ -790,9 +789,11 @@ def expected(A: T.Buffer[(4, 4), "int32"]):
         for i, j in T.grid(4, 4):
             with T.block("buffer_A_assumption"):
                 vi, vj = T.axis.remap("SS", [i, j])
-                T.assume(
-                    not (vi == 3 and 2 <= vj)
-                    or A[vi, vj] == A[((4 * vi + j) % 14) // 4, ((4 * vi + j) % 14) % 4]
+                T.evaluate(
+                    T.assume(
+                        not (vi == 3 and 2 <= vj)
+                        or A[vi, vj] == A[((4 * vi + j) % 14) // 4, ((4 * vi + j) % 14) % 4]
+                    )
                 )
 
         B = T.alloc_buffer(14, "int32")
diff --git a/tests/python/unittest/test_tir_schedule_utilities.py b/tests/python/unittest/test_tir_schedule_utilities.py
index 33ef0e221563..2f6c2f6a5120 100644
--- a/tests/python/unittest/test_tir_schedule_utilities.py
+++ b/tests/python/unittest/test_tir_schedule_utilities.py
@@ -150,7 +150,7 @@ def tuple_reduction(data: T.Buffer[(4, 32), "float32"], T_add: T.Buffer[(4,), "f
                 data_red_temp_v1[ax0] = v_data_red_temp_v1
         for i0 in range(4):
             with T.block("T_add"):
-                (ax0,) = T.axis.remap("S", [i0])
+                ax0 = T.axis.remap("S", [i0])
                 T.reads(data_red_temp_v0[ax0], data_red_temp_v1[ax0])
                 T.writes(T_add[ax0])
                 T_add[ax0] = data_red_temp_v0[ax0] + data_red_temp_v1[ax0]
diff --git a/tests/python/unittest/test_tir_transform_inject_virtual_thread.py b/tests/python/unittest/test_tir_transform_inject_virtual_thread.py
index b4ea4e712d19..eb5ed08bb5af 100644
--- a/tests/python/unittest/test_tir_transform_inject_virtual_thread.py
+++ b/tests/python/unittest/test_tir_transform_inject_virtual_thread.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+import tvm.testing
 from tvm import te
 from tvm.script import tir as T
 
diff --git a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
index 42c7fbc0d4a9..3ab09f01dd01 100644
--- a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
+++ b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
@@ -548,88 +548,6 @@ def single_reduction_loop_with_tensorize(
                 )
 
 
-@T.prim_func
-def nested_reduction_loop_with_inner_match_buffers(
-    in0: T.Buffer[(4, 16), "int8"],
-    in1: T.Buffer[(4, 16), "int8"],
-    out: T.Buffer[(4, 4), "int32"],
-) -> None:
-    # body
-    # with T.block("root")
-    for y in T.serial(4):
-        with T.block("C"):
-            yi = T.axis.spatial(4, y)
-            T.reads(in0[yi, 0:16], in1[yi, 0:16])
-            T.writes(out[yi, 0:4])
-            for x in T.serial(4):
-                xr = T.axis.reduce(4, x)
-                with T.init():
-                    for i in T.serial(4):
-                        with T.block("C_init"):
-                            ii = T.axis.spatial(4, i)
-                            T.reads()
-                            T.writes(out[yi, ii])
-                            out[yi, ii] = 0
-                with T.block("C"):
-                    T.reads(
-                        out[yi, xr],
-                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
-                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
-                    )
-                    T.writes(out[yi, xr])
-                    A = T.match_buffer(
-                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
-                    )
-                    B = T.match_buffer(
-                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
-                    )
-                    C = T.match_buffer(out[yi, xr], [1], dtype="int32", offset_factor=1)
-                    A_i8x4: T.int8x4 = A[0:4]
-                    A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
-                    B_i8x4: T.int8x4 = B[0:4]
-                    B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
-                    C[0] = A_i32 + B_i32 + C[0]
-
-
-@T.prim_func
-def nested_reduction_loop_with_outer_match_buffers(
-    in0: T.Buffer[(4, 16), "int8"],
-    in1: T.Buffer[(4, 16), "int8"],
-    out: T.Buffer[(4, 4), "int32"],
-) -> None:
-    # body
-    # with T.block("root")
-    for y in T.serial(4):
-        with T.block("C"):
-            yi = T.axis.spatial(4, y)
-            T.reads(in0[yi, 0:16], in1[yi, 0:16])
-            T.writes(out[yi, 0:4])
-            A = T.match_buffer(in0[yi, 0:16], [16], dtype="int8", offset_factor=1)
-            B = T.match_buffer(in1[yi, 0:16], [16], dtype="int8", offset_factor=1)
-            C = T.match_buffer(out[yi, 0:4], [4], dtype="int32", offset_factor=1)
-            for x in T.serial(4):
-                xr = T.axis.reduce(4, x)
-                with T.init():
-                    for i in T.serial(4):
-                        with T.block("C_init"):
-                            ii = T.axis.spatial(4, i)
-                            T.reads()
-                            T.writes(out[yi, ii])
-                            out[yi, ii] = 0
-                with T.block("C"):
-                    T.reads(
-                        out[yi, xr],
-                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
-                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
-                    )
-                    T.writes(out[yi, xr])
-                    A_i8x4: T.int8x4 = A[yi * 4 + xr : yi * 4 + xr + 4]
-                    A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
-                    B_i8x4: T.int8x4 = B[yi * 4 + xr : yi * 4 + xr + 4]
-                    B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
-                    C[xr] = A_i32 + B_i32 + C[xr]
-
-
 @T.prim_func
 def reducer_max(a: T.handle, b: T.handle) -> None:
     A = T.match_buffer(a, [128, 128], dtype="float32")
@@ -1329,20 +1247,6 @@ def test_single_reduction_loop_with_tensorize():
     )
 
 
-def test_nested_reduction_loop_with_inner_match_buffers():
-    _check(
-        nested_reduction_loop_with_inner_match_buffers,
-        nested_reduction_loop_with_inner_match_buffers,
-    )
-
-
-def test_nested_reduction_loop_with_outer_match_buffers():
-    _check(
-        nested_reduction_loop_with_outer_match_buffers,
-        nested_reduction_loop_with_outer_match_buffers,
-    )
-
-
 def test_reducer_max():
     _check(reducer_max, lowered_reducer_max)
 
diff --git a/tests/python/unittest/test_tir_transform_remove_assume.py b/tests/python/unittest/test_tir_transform_remove_assume.py
index 4223e40e3f2a..a2d68a075790 100644
--- a/tests/python/unittest/test_tir_transform_remove_assume.py
+++ b/tests/python/unittest/test_tir_transform_remove_assume.py
@@ -17,8 +17,8 @@
 
 import tvm
 import tvm.testing
-from tvm.script import tir as T
 from tvm import TVMError
+from tvm.script import tir as T
 
 
 class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
@@ -31,7 +31,7 @@ class TestRemoveAssume(BaseBeforeAfter):
     """Remove any instance of T.assume"""
 
     def before(A: T.Buffer[1, "int32"]):
-        T.assume(A[0] == 5)
+        T.evaluate(T.assume(A[0] == 5))
         A[0] = 10
 
     def expected(A: T.Buffer[1, "int32"]):
@@ -43,7 +43,7 @@ class TestRemoveAssumeLoop(BaseBeforeAfter):
 
     def before(A: T.Buffer[16, "int32"]):
         for i in T.serial(16):
-            T.assume(A[i] == 0)
+            T.evaluate(T.assume(A[i] == 0))
 
         for i in T.serial(16):
             A[i] = 10
diff --git a/tests/python/unittest/test_tvmscript_error_report.py b/tests/python/unittest/test_tvmscript_error_report.py
index 2ec52bfbfe41..32293cccdcf1 100644
--- a/tests/python/unittest/test_tvmscript_error_report.py
+++ b/tests/python/unittest/test_tvmscript_error_report.py
@@ -116,7 +116,6 @@ def missing_type_annotation(a) -> None:  # error
 def test_invalid_for_function():
     def invalid_for_function(a: T.handle) -> None:
         A = T.match_buffer(a, (16, 16), "float32")
-
         for i in T.evaluate(0.0):  # error
             for j in T.serial(0, 16):
                 A[i, j] = 0.0
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index a3df5a183bab..29e03f8bb63f 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -489,8 +489,8 @@ def test_ir_builder_tir_decl_buffer():
 
 def test_ir_builder_tir_inline():
     with IRBuilder() as ib:
-        m, n = T.inline(1), T.inline(2)
-        a, b = T.inline([3, 4])
+        m, n = T.meta_var(1), T.meta_var(2)
+        a, b = T.meta_var([3, 4])
         T.evaluate(m.value + n.value + a.value + b.value)
     # the evaluate generated by IRBuilder
     eval_actual = ib.get()
diff --git a/tests/python/unittest/test_tvmscript_parser_evaluator.py b/tests/python/unittest/test_tvmscript_parser_evaluator.py
index 4d6590306050..0f03e47ff933 100644
--- a/tests/python/unittest/test_tvmscript_parser_evaluator.py
+++ b/tests/python/unittest/test_tvmscript_parser_evaluator.py
@@ -17,8 +17,8 @@
 """Unittests for tvm.script.parser.evaluator"""
 import pytest
 import tvm.testing
-from tvm.script._parser.core.diagnostics import Source
-from tvm.script._parser.core.evaluator import ExprEvaluator
+from tvm.script.parser.core.diagnostics import Source
+from tvm.script.parser.core.evaluator import ExprEvaluator
 
 
 def _calc(expr, extra_vars=None):
diff --git a/tests/python/unittest/test_tvmscript_parser_ir.py b/tests/python/unittest/test_tvmscript_parser_ir.py
index b235d85bb457..d3e758fbe1a0 100644
--- a/tests/python/unittest/test_tvmscript_parser_ir.py
+++ b/tests/python/unittest/test_tvmscript_parser_ir.py
@@ -19,7 +19,7 @@
 import pytest
 import inspect
 import tvm.testing
-from tvm.script._parser import ir_module
+from tvm.script.parser import ir_module
 from tvm.ir import IRModule
 
 
diff --git a/tests/python/unittest/test_tvmscript_parser_source.py b/tests/python/unittest/test_tvmscript_parser_source.py
index cb93a2dcf62b..f5dc17fdfe56 100644
--- a/tests/python/unittest/test_tvmscript_parser_source.py
+++ b/tests/python/unittest/test_tvmscript_parser_source.py
@@ -18,8 +18,8 @@
 import pytest
 import inspect
 import tvm.testing
-from tvm.script._parser.core.diagnostics import Source
-from tvm.script._parser.core import doc_core as doc
+from tvm.script.parser.core.diagnostics import Source
+from tvm.script.parser.core import doc_core as doc
 from tvm.script import tir as T
 
 
diff --git a/tests/python/unittest/test_tvmscript_parser_tir.py b/tests/python/unittest/test_tvmscript_parser_tir.py
index cfa1dc62b31b..e3f87928acf8 100644
--- a/tests/python/unittest/test_tvmscript_parser_tir.py
+++ b/tests/python/unittest/test_tvmscript_parser_tir.py
@@ -19,7 +19,7 @@
 import pytest
 import inspect
 import tvm.testing
-from tvm.script._parser import tir as T
+from tvm.script.parser import tir as T
 from tvm import ir, tir
 
 
diff --git a/tests/python/unittest/test_tvmscript_spans.py b/tests/python/unittest/test_tvmscript_spans.py
index f863a4dd983e..2c0522e3e3c9 100644
--- a/tests/python/unittest/test_tvmscript_spans.py
+++ b/tests/python/unittest/test_tvmscript_spans.py
@@ -16,7 +16,7 @@
 # under the License.
 
 
-from tvm.script import tir as T
+from tvm.script.parser_v1 import tir as T
 
 
 @T.prim_func
diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py
index 32572d392c51..16f1cb04945a 100644
--- a/tests/python/unittest/test_tvmscript_syntax_sugar.py
+++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py
@@ -288,8 +288,8 @@ def constant_binds():
 
     @T.prim_func
     def constant_binds_wrapped():
-        x = T.int32(1)
-        y = T.float32(42.0)
+        x = T.meta_var(T.int32(1))
+        y = T.meta_var(T.float32(42.0))
         T.evaluate(T.cast(x, "float32") + y)
 
     assert_structural_equal(constant_binds, constant_binds_wrapped)
@@ -298,7 +298,7 @@ def constant_binds_wrapped():
 def test_func_call():
     def shared_16x16_to_ldmatrix_32x8_layout(i, j):
         thread_id = (i % 8) * 4 + (j % 8) // 2
-        return thread_id, (j // 8) * 4 + (i // 8) * 2 + (j % 2)
+        return T.meta_var((thread_id, (j // 8) * 4 + (i // 8) * 2 + (j % 2)))
 
     @T.prim_func
     def mma_sync_m16n16k16_desc(a: T.handle, b: T.handle, c: T.handle) -> None:

From b8384d105bec858e9adeb1e25662250502e8058e Mon Sep 17 00:00:00 2001
From: Florin Blanaru <florin.blanaru96@gmail.com>
Date: Sun, 13 Nov 2022 01:09:10 +0100
Subject: [PATCH 567/704] [ci] Assert some tests are not skipped in the CI
 (#12915)

In this PR, the skipped tests script will also check if tests in the `required_tests_to_run.json` have not been skipped. If there are skipped tests, they will be added to the returned comment.

I am not entirely sure where it's best to place the `required_tests_to_run` file, so I left it in `tvm/ci/scripts/`. I am happy to take suggestions.

Aims to prevent situations such as #12529
---
 ci/scripts/github_skipped_tests_comment.py |  77 +++++++++++++--
 ci/scripts/required_tests_to_run.json      |  11 +++
 tests/python/ci/test_ci.py                 | 108 ++++++++++++++++++++-
 3 files changed, 183 insertions(+), 13 deletions(-)
 create mode 100644 ci/scripts/required_tests_to_run.json

diff --git a/ci/scripts/github_skipped_tests_comment.py b/ci/scripts/github_skipped_tests_comment.py
index 7a62f16a5b81..46e579105324 100755
--- a/ci/scripts/github_skipped_tests_comment.py
+++ b/ci/scripts/github_skipped_tests_comment.py
@@ -15,6 +15,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import inspect
+import json
 import os
 import logging
 import subprocess
@@ -102,10 +104,30 @@ def to_node_name(dir_name: str):
     return dir_name.replace("_", ": ", 1)
 
 
+def build_diff_comment_with_main(
+    common_commit_sha,
+    skipped_list,
+    commit_sha,
+):
+    if len(skipped_list) == 0:
+        return f"No diff in skipped tests with main found in this branch for commit {commit_sha}.\n"
+
+    text = (
+        f"The list below shows tests that ran in main {common_commit_sha} but were "
+        f"skipped in the CI build of {commit_sha}:\n"
+        f"```\n"
+    )
+    for skip in skipped_list:
+        text += skip + "\n"
+    text += f"```\n"
+    return text
+
+
 def build_comment(
     common_commit_sha,
     common_main_build,
     skipped_list,
+    additional_skipped_list,
     pr_number,
     build_number,
     commit_sha,
@@ -114,18 +136,21 @@ def build_comment(
     if common_main_build["state"] != "success":
         return f"Unable to run tests bot because main failed to pass CI at {common_commit_sha}."
 
-    if len(skipped_list) == 0:
-        return f"No additional skipped tests found in this branch for commit {commit_sha}."
+    text = build_diff_comment_with_main(common_commit_sha, skipped_list, commit_sha)
+
+    if len(additional_skipped_list) != 0:
+        text += "\n"
+        text += (
+            f"Additional tests that were skipped in the CI build and present in the [`required_tests_to_run`]"
+            f"(https://github.com/apache/tvm/blob/main/ci/scripts/required_tests_to_run.json) file:"
+            f"\n```\n"
+        )
+        for skip in additional_skipped_list:
+            text += skip + "\n"
+        text += f"```\n"
 
-    text = (
-        f"The list below shows some tests that ran in main {common_commit_sha} but were "
-        f"skipped in the CI build of {commit_sha}:\n"
-        f"```\n"
-    )
-    for skip in skipped_list:
-        text += skip + "\n"
     text += (
-        f"```\nA detailed report of ran tests is [here](https://{jenkins_prefix}/job/tvm/job/PR-{str(pr_number)}"
+        f"A detailed report of ran tests is [here](https://{jenkins_prefix}/job/tvm/job/PR-{str(pr_number)}"
         f"/{str(build_number)}/testReport/)."
     )
     return text
@@ -148,6 +173,7 @@ def get_skipped_tests_comment(
     main_test_report_dir: str = "main-reports",
     common_commit_sha: Optional[str] = None,
     common_main_build: Optional[Dict[str, Any]] = None,
+    additional_tests_to_check_file: str = "required_tests_to_run.json",
 ) -> str:
     pr_head = pr["commits"]["nodes"][0]["commit"]
     target_url = find_target_url(pr_head)
@@ -195,10 +221,41 @@ def get_skipped_tests_comment(
     if len(skipped_list) == 0:
         logging.info("No skipped tests found.")
 
+    if not is_dry_run:
+        current_file = Path(__file__).resolve()
+        additional_tests_to_check_file = Path(current_file).parent / "required_tests_to_run.json"
+
+    logging.info(
+        f"Checking additional tests in file {additional_tests_to_check_file} are not skipped."
+    )
+    try:
+        with open(additional_tests_to_check_file, "r") as f:
+            additional_tests_to_check = json.load(f)
+    except IOError:
+        logging.info(
+            f"Failed to read additional tests from file: {additional_tests_to_check_file}."
+        )
+        additional_tests_to_check = {}
+
+    # Assert that tests present in "required_tests_to_run.json" are not skipped.
+    additional_skipped_tests = []
+    for subdir, test_set in additional_tests_to_check.items():
+        if subdir not in build_tests.keys():
+            logging.warning(f"Could not find directory {subdir} in the build test set.")
+            continue
+
+        for test in test_set:
+            if test in build_tests[subdir]:
+                additional_skipped_tests.append(f"{to_node_name(subdir)} -> {test}")
+
+    if len(additional_skipped_tests) == 0:
+        logging.info("No skipped tests found in the additional list.")
+
     body = build_comment(
         common_commit_sha,
         common_main_build,
         skipped_list,
+        additional_skipped_tests,
         pr_and_build["pr_number"],
         pr_and_build["build_number"],
         commit_sha,
diff --git a/ci/scripts/required_tests_to_run.json b/ci/scripts/required_tests_to_run.json
new file mode 100644
index 000000000000..8bd265c04fe0
--- /dev/null
+++ b/ci/scripts/required_tests_to_run.json
@@ -0,0 +1,11 @@
+{
+  "unittest_GPU":
+    [
+      "ctypes.tests.python.unittest.test_meta_schedule_integration#test_meta_schedule_integration_extract_from_bert_base",
+      "cython.tests.python.unittest.test_meta_schedule_integration#test_meta_schedule_integration_extract_from_bert_base",
+      "ctypes.tests.python.unittest.test_meta_schedule_integration#test_meta_schedule_dynamic_loop_extent",
+      "cython.tests.python.unittest.test_meta_schedule_integration#test_meta_schedule_dynamic_loop_extent",
+      "ctypes.tests.python.unittest.test_meta_schedule_integration#test_extract_task_arm_conv2d_nchwc",
+      "cython.tests.python.unittest.test_meta_schedule_integration#test_extract_task_arm_conv2d_nchwc"
+    ]
+}
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index 4b8c5d9ad444..7b7b7298d8b3 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -46,7 +46,7 @@ def parameterize_named(**kwargs):
 
 # pylint: disable=line-too-long
 TEST_DATA_SKIPPED_BOT = {
-    "found-diff": {
+    "found-diff-no-additional": {
         "main_xml_file": "unittest/file1.xml",
         "main_xml_content": """<?xml version="1.0" encoding="utf-8"?>
                 <testsuites>
@@ -78,12 +78,61 @@ def parameterize_named(**kwargs):
                     </testsuite>
                 </testsuites>
                 """,
+        "additional_tests_to_check": """{
+                    "unittest": ["dummy_class#dummy_test"],
+                    "unittest_GPU": ["another_dummy_class#another_dummy_test"]
+                }
+                """,
+        "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
+        "s3_prefix": "tvm-jenkins-artifacts-prod",
+        "jenkins_prefix": "ci.tlcpack.ai",
+        "common_main_build": """{"build_number": "4115", "state": "success"}""",
+        "commit_sha": "sha1234",
+        "expected_body": "The list below shows tests that ran in main sha1234 but were skipped in the CI build of sha1234:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).",
+    },
+    "found-diff-skipped-additional": {
+        "main_xml_file": "unittest/file1.xml",
+        "main_xml_content": """<?xml version="1.0" encoding="utf-8"?>
+                <testsuites>
+                    <testsuite errors="0" failures="0" hostname="13e7c5f749d8" name="python-unittest-gpu-0-shard-1-ctypes" skipped="102"
+                               tests="165" time="79.312" timestamp="2022-08-10T22:39:36.673781">
+                        <testcase classname="ctypes.tests.python.unittest.test_auto_scheduler_search_policy"
+                                  name="test_sketch_search_policy_cuda_rpc_runner" time="9.679">
+                        </testcase>
+                    </testsuite>
+                </testsuites>
+                """,
+        "pr_xml_file": "unittest/file2.xml",
+        "pr_xml_content": """<?xml version="1.0" encoding="utf-8"?>
+                <testsuites>
+                    <testsuite errors="0" failures="0" hostname="13e7c5f749d8" name="python-unittest-gpu-0-shard-1-ctypes" skipped="102"
+                               tests="165" time="79.312" timestamp="2022-08-10T22:39:36.673781">
+                        <testcase classname="ctypes.tests.python.unittest.test_auto_scheduler_search_policy"
+                                  name="test_sketch_search_policy_cuda_rpc_runner" time="9.679">
+                            <skipped message="This test is skipped" type="pytest.skip">
+                                Skipped
+                            </skipped>
+                        </testcase>
+                        <testcase classname="ctypes.tests.python.unittest.test_roofline"
+                                  name="test_estimate_peak_bandwidth[cuda]" time="4.679">
+                            <skipped message="This is another skippe test" type="pytest.skip">
+                                Skipped
+                            </skipped>
+                        </testcase>
+                    </testsuite>
+                </testsuites>
+                """,
+        "additional_tests_to_check": """{
+                    "unittest": ["ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner", "dummy_class#dummy_test"],
+                    "unittest_GPU": ["another_dummy_class#another_dummy_test"]
+                }
+                """,
         "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
         "s3_prefix": "tvm-jenkins-artifacts-prod",
         "jenkins_prefix": "ci.tlcpack.ai",
         "common_main_build": """{"build_number": "4115", "state": "success"}""",
         "commit_sha": "sha1234",
-        "expected_body": "The list below shows some tests that ran in main sha1234 but were skipped in the CI build of sha1234:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).",
+        "expected_body": "The list below shows tests that ran in main sha1234 but were skipped in the CI build of sha1234:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\n\nAdditional tests that were skipped in the CI build and present in the [`required_tests_to_run`](https://github.com/apache/tvm/blob/main/ci/scripts/required_tests_to_run.json) file:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).",
     },
     "no-diff": {
         "main_xml_file": "unittest/file1.xml",
@@ -114,12 +163,56 @@ def parameterize_named(**kwargs):
                     </testsuite>
                 </testsuites>
                 """,
+        "additional_tests_to_check": """{
+                }
+                """,
+        "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
+        "s3_prefix": "tvm-jenkins-artifacts-prod",
+        "jenkins_prefix": "ci.tlcpack.ai",
+        "common_main_build": """{"build_number": "4115", "state": "success"}""",
+        "commit_sha": "sha1234",
+        "expected_body": "No diff in skipped tests with main found in this branch for commit sha1234.\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).",
+    },
+    "no-diff-skipped-additional": {
+        "main_xml_file": "unittest/file1.xml",
+        "main_xml_content": """<?xml version="1.0" encoding="utf-8"?>
+                <testsuites>
+                    <testsuite errors="0" failures="0" hostname="13e7c5f749d8" name="python-unittest-gpu-0-shard-1-ctypes" skipped="102"
+                               tests="165" time="79.312" timestamp="2022-08-10T22:39:36.673781">
+                        <testcase classname="ctypes.tests.python.unittest.test_auto_scheduler_search_policy"
+                                  name="test_sketch_search_policy_cuda_rpc_runner" time="9.679">
+                            <skipped message="This test is skipped" type="pytest.skip">
+                                Skipped
+                            </skipped>
+                        </testcase>
+                    </testsuite>
+                </testsuites>
+                """,
+        "pr_xml_file": "unittest/file2.xml",
+        "pr_xml_content": """<?xml version="1.0" encoding="utf-8"?>
+                <testsuites>
+                    <testsuite errors="0" failures="0" hostname="13e7c5f749d8" name="python-unittest-gpu-0-shard-1-ctypes" skipped="102"
+                               tests="165" time="79.312" timestamp="2022-08-10T22:39:36.673781">
+                        <testcase classname="ctypes.tests.python.unittest.test_auto_scheduler_search_policy"
+                                  name="test_sketch_search_policy_cuda_rpc_runner" time="9.679">
+                            <skipped message="This test is skipped" type="pytest.skip">
+                                Skipped
+                            </skipped>
+                        </testcase>
+                    </testsuite>
+                </testsuites>
+                """,
+        "additional_tests_to_check": """{
+                    "unittest": ["dummy_class#dummy_test", "ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner"],
+                    "unittest_GPU": ["another_dummy_class#another_dummy_test"]
+                }
+                """,
         "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
         "s3_prefix": "tvm-jenkins-artifacts-prod",
         "jenkins_prefix": "ci.tlcpack.ai",
         "common_main_build": """{"build_number": "4115", "state": "success"}""",
         "commit_sha": "sha1234",
-        "expected_body": "No additional skipped tests found in this branch for commit sha1234.",
+        "expected_body": "No diff in skipped tests with main found in this branch for commit sha1234.\n\nAdditional tests that were skipped in the CI build and present in the [`required_tests_to_run`](https://github.com/apache/tvm/blob/main/ci/scripts/required_tests_to_run.json) file:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).",
     },
     "unable-to-run": {
         "main_xml_file": "unittest/file1.xml",
@@ -132,6 +225,11 @@ def parameterize_named(**kwargs):
                     <testsuites>
                     </testsuites>
                     """,
+        "additional_tests_to_check": """{
+                    "unittest": ["ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner", "dummy_class#dummy_test"],
+                    "unittest_GPU": ["another_dummy_class#another_dummy_test"]
+                }
+                """,
         "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect",
         "s3_prefix": "tvm-jenkins-artifacts-prod",
         "jenkins_prefix": "ci.tlcpack.ai",
@@ -153,6 +251,7 @@ def test_skipped_tests_comment(
     main_xml_content,
     pr_xml_file,
     pr_xml_content,
+    additional_tests_to_check,
     target_url,
     s3_prefix,
     jenkins_prefix,
@@ -176,6 +275,8 @@ def write_xml_file(root_dir, xml_file, xml_content):
     write_xml_file(pr_test_report_dir, pr_xml_file, pr_xml_content)
     main_test_report_dir = Path(git.cwd) / "main-reports"
     write_xml_file(main_test_report_dir, main_xml_file, main_xml_content)
+    with open(Path(git.cwd) / "required_tests_to_run.json", "w") as f:
+        f.write(additional_tests_to_check)
 
     pr_data = {
         "commits": {
@@ -208,6 +309,7 @@ def write_xml_file(root_dir, xml_file, xml_content):
             pr_test_report_dir=pr_test_report_dir,
             main_test_report_dir=main_test_report_dir,
             common_main_build=json.loads(common_main_build),
+            additional_tests_to_check_file=Path(git.cwd) / "required_tests_to_run.json",
         )
     assert_in(expected_body, comment)
     assert_in(f"with target {target_url}", caplog.text)

From 5a767d053ddca688bb0e2e463106ae4c5d289b46 Mon Sep 17 00:00:00 2001
From: Florin Blanaru <florin.blanaru96@gmail.com>
Date: Mon, 14 Nov 2022 18:32:12 +0100
Subject: [PATCH 568/704] [CI] Separate the ci scripts into Github and Jenkins
 scripts (#13368)

This PR is a duplicate of #12940 and #12941. For some reason, I am unable to reopen #12940.
---
 .github/disabled_workflows/pr_comment_bot.yml |   4 +-
 .github/workflows/cc_bot.yml                  |   2 +-
 .github/workflows/nightly_docker_update.yml   |   2 +-
 .github/workflows/ping_reviewers.yml          |   2 +-
 .github/workflows/tag_teams.yml               |   2 +-
 .github/workflows/tvmbot.yml                  |   2 +-
 .../update_last_successful_branch.yml         |   2 +-
 Jenkinsfile                                   | 176 +++++++++---------
 ci/jenkins/Deploy.groovy.j2                   |   4 +-
 ci/jenkins/DockerBuild.groovy.j2              |   4 +-
 ci/jenkins/Jenkinsfile.j2                     |   2 +
 ci/jenkins/Prepare.groovy.j2                  |  20 +-
 ci/jenkins/macros.j2                          |   4 +-
 ci/scripts/{ => github}/__init__.py           |   2 +-
 .../{ => github}/github_cc_reviewers.py       |   4 +
 ci/scripts/{ => github}/github_commenter.py   |   6 +
 .../{ => github}/github_docs_comment.py       |   0
 ci/scripts/{ => github}/github_pr_comment.py  |   6 +
 .../github_skipped_tests_comment.py           |   2 +-
 ci/scripts/{ => github}/github_tag_teams.py   |   5 +
 ci/scripts/{ => github}/github_tvmbot.py      |   5 +
 ci/scripts/{ => github}/ping_reviewers.py     |  11 +-
 ci/scripts/{ => github}/update_branch.py      |   7 +-
 ci/scripts/{ => jenkins}/check_pr.py          |   0
 ci/scripts/{ => jenkins}/cmd_utils.py         |   3 +-
 .../{ => jenkins}/determine_docker_images.py  |   3 +-
 ci/scripts/{ => jenkins}/git_change_docker.sh |   0
 ci/scripts/{ => jenkins}/git_change_docs.sh   |   0
 ci/scripts/{ => jenkins}/git_skip_ci.py       |   0
 ci/scripts/{ => jenkins}/git_skip_ci_globs.py |   0
 ci/scripts/{ => jenkins}/git_utils.py         |   0
 ci/scripts/{ => jenkins}/http_utils.py        |   0
 .../{ => jenkins}/open_docker_update_pr.py    |   3 +-
 ci/scripts/{ => jenkins}/pytest_ids.py        |   0
 ci/scripts/{ => jenkins}/pytest_wrapper.py    |   3 +-
 ci/scripts/{ => jenkins}/retry.sh             |   0
 .../{ => jenkins}/should_rebuild_docker.py    |   0
 .../{ => jenkins}/should_run_slow_tests.py    |   6 +-
 ci/scripts/required_tests_to_run.json         |  11 --
 docker/bash.sh                                |   3 +-
 tests/python/ci/test_ci.py                    |  38 ++--
 tests/python/ci/test_tvmbot.py                |   4 +-
 tests/python/ci/test_utils.py                 |   2 +
 tests/scripts/release/gather_prs.py           |   3 +-
 tests/scripts/release/make_notes.py           |   2 +
 tests/scripts/setup-pytest-env.sh             |   2 +-
 tests/scripts/task_build.py                   |   4 +-
 tests/scripts/task_python_frontend.sh         |   2 +-
 48 files changed, 197 insertions(+), 166 deletions(-)
 rename ci/scripts/{ => github}/__init__.py (94%)
 rename ci/scripts/{ => github}/github_cc_reviewers.py (95%)
 rename ci/scripts/{ => github}/github_commenter.py (95%)
 rename ci/scripts/{ => github}/github_docs_comment.py (100%)
 rename ci/scripts/{ => github}/github_pr_comment.py (94%)
 rename ci/scripts/{ => github}/github_skipped_tests_comment.py (99%)
 rename ci/scripts/{ => github}/github_tag_teams.py (97%)
 rename ci/scripts/{ => github}/github_tvmbot.py (99%)
 rename ci/scripts/{ => github}/ping_reviewers.py (96%)
 rename ci/scripts/{ => github}/update_branch.py (96%)
 rename ci/scripts/{ => jenkins}/check_pr.py (100%)
 rename ci/scripts/{ => jenkins}/cmd_utils.py (95%)
 rename ci/scripts/{ => jenkins}/determine_docker_images.py (98%)
 rename ci/scripts/{ => jenkins}/git_change_docker.sh (100%)
 rename ci/scripts/{ => jenkins}/git_change_docs.sh (100%)
 rename ci/scripts/{ => jenkins}/git_skip_ci.py (100%)
 rename ci/scripts/{ => jenkins}/git_skip_ci_globs.py (100%)
 rename ci/scripts/{ => jenkins}/git_utils.py (100%)
 rename ci/scripts/{ => jenkins}/http_utils.py (100%)
 rename ci/scripts/{ => jenkins}/open_docker_update_pr.py (99%)
 rename ci/scripts/{ => jenkins}/pytest_ids.py (100%)
 rename ci/scripts/{ => jenkins}/pytest_wrapper.py (98%)
 rename ci/scripts/{ => jenkins}/retry.sh (100%)
 rename ci/scripts/{ => jenkins}/should_rebuild_docker.py (100%)
 rename ci/scripts/{ => jenkins}/should_run_slow_tests.py (95%)
 delete mode 100644 ci/scripts/required_tests_to_run.json

diff --git a/.github/disabled_workflows/pr_comment_bot.yml b/.github/disabled_workflows/pr_comment_bot.yml
index 1ac33c77d2e5..436286768502 100644
--- a/.github/disabled_workflows/pr_comment_bot.yml
+++ b/.github/disabled_workflows/pr_comment_bot.yml
@@ -38,7 +38,7 @@ jobs:
           PR_NUMBER: ${{ github.event.number }}
         run: |
           set -eux
-          python ci/scripts/github_pr_comment.py --pr "$PR_NUMBER"
+          python ci/scripts/github/github_pr_comment.py --pr "$PR_NUMBER"
       - name: Comment bot comment (status)
         if: ${{ github.event.state }}
         env:
@@ -49,7 +49,7 @@ jobs:
           if [[ "$URL" == *"PR-"* ]]; then
             echo "PR status, sending comment"
             PR_NUMBER=$(echo $URL | sed 's/.*PR-//g' | sed 's/\/.*//g')
-            python ci/scripts/github_pr_comment.py --pr "$PR_NUMBER" || /bin/true
+            python ci/scripts/github/github_pr_comment.py --pr "$PR_NUMBER" || /bin/true
           else
             echo "Not a PR status, skipping"
           fi
diff --git a/.github/workflows/cc_bot.yml b/.github/workflows/cc_bot.yml
index 95aa96426229..54640148af73 100644
--- a/.github/workflows/cc_bot.yml
+++ b/.github/workflows/cc_bot.yml
@@ -44,4 +44,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python ci/scripts/github_cc_reviewers.py || echo step failed
+          python ci/scripts/github/github_cc_reviewers.py || echo step failed
diff --git a/.github/workflows/nightly_docker_update.yml b/.github/workflows/nightly_docker_update.yml
index c2441807430f..350987487306 100644
--- a/.github/workflows/nightly_docker_update.yml
+++ b/.github/workflows/nightly_docker_update.yml
@@ -28,4 +28,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python ci/scripts/open_docker_update_pr.py
+          python ci/scripts/jenkins/open_docker_update_pr.py
diff --git a/.github/workflows/ping_reviewers.yml b/.github/workflows/ping_reviewers.yml
index a2e3e996a033..f0395c09474b 100644
--- a/.github/workflows/ping_reviewers.yml
+++ b/.github/workflows/ping_reviewers.yml
@@ -20,4 +20,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python ci/scripts/ping_reviewers.py --wait-time-minutes 10080 || echo failed
+          python ci/scripts/github/ping_reviewers.py --wait-time-minutes 10080 || echo failed
diff --git a/.github/workflows/tag_teams.yml b/.github/workflows/tag_teams.yml
index c0c1b8b8299d..14d29604767e 100644
--- a/.github/workflows/tag_teams.yml
+++ b/.github/workflows/tag_teams.yml
@@ -38,4 +38,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python ci/scripts/github_tag_teams.py || echo failed
+          python ci/scripts/github/github_tag_teams.py || echo failed
diff --git a/.github/workflows/tvmbot.yml b/.github/workflows/tvmbot.yml
index 23e90aed5329..6965ea86f2e1 100644
--- a/.github/workflows/tvmbot.yml
+++ b/.github/workflows/tvmbot.yml
@@ -34,4 +34,4 @@ jobs:
           RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
         run: |
           set -eux
-          python ci/scripts/github_tvmbot.py --pr "$PR_NUMBER" --run-url "$RUN_URL" --trigger-comment-json "$ISSUE_COMMENT"
+          python ci/scripts/github/github_tvmbot.py --pr "$PR_NUMBER" --run-url "$RUN_URL" --trigger-comment-json "$ISSUE_COMMENT"
diff --git a/.github/workflows/update_last_successful_branch.yml b/.github/workflows/update_last_successful_branch.yml
index 6635b9ef4c47..da8852d8d296 100644
--- a/.github/workflows/update_last_successful_branch.yml
+++ b/.github/workflows/update_last_successful_branch.yml
@@ -41,4 +41,4 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -eux
-          python ci/scripts/update_branch.py || echo step failed
+          python ci/scripts/github/update_branch.py || echo step failed
diff --git a/Jenkinsfile b/Jenkinsfile
index 135f64dc1d94..171e200df379 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -106,6 +106,8 @@ rebuild_docker_images = false
 // Filenames for stashing between build and test steps
 s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
 
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
 
 // General note: Jenkins has limits on the size of a method (or top level code)
 // that are pretty strict, so most usage of groovy methods in these templates
@@ -146,7 +148,7 @@ def init_git() {
   sh(
     script: """
       set -eux
-      . ci/scripts/retry.sh
+      . ${jenkins_scripts_root}/retry.sh
       retry 3 timeout 5m git submodule update --init -f --jobs 0
     """,
     label: 'Update git submodules',
@@ -178,7 +180,7 @@ def docker_init(image) {
     sh(
       script: """
       set -eux
-      . ci/scripts/retry.sh
+      . ${jenkins_scripts_root}/retry.sh
       retry 5 docker pull ${image}
       """,
       label: 'Pull docker image',
@@ -194,7 +196,7 @@ def should_skip_slow_tests(pr_number) {
     // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
     result = sh (
       returnStatus: true,
-      script: "./ci/scripts/should_run_slow_tests.py --pr '${pr_number}'",
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
       label: 'Check if CI should run slow tests',
     )
   }
@@ -230,7 +232,7 @@ def checkout_trusted_files() {
     // (especially those that access secrets) should be checked out here so
     // only trusted versions are used in CI
     sh(
-      script: "git checkout ${upstream_revision} ci/scripts/.",
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
       label: 'Check out trusted files',
     )
   }
@@ -243,7 +245,7 @@ def should_skip_ci(pr_number) {
   }
   glob_skip_ci_code = sh (
     returnStatus: true,
-    script: "./ci/scripts/git_skip_ci_globs.py",
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
     label: 'Check if CI should be skipped due to changed files',
   )
   if (glob_skip_ci_code == 0) {
@@ -257,7 +259,7 @@ def should_skip_ci(pr_number) {
     // full CI just in case). Exit code of 0 means skip CI.
     git_skip_ci_code = sh (
       returnStatus: true,
-      script: "./ci/scripts/git_skip_ci.py --pr '${pr_number}'",
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
       label: 'Check if CI should be skipped',
     )
   }
@@ -274,7 +276,7 @@ def check_pr(pr_number) {
     variable: 'GITHUB_TOKEN',
     )]) {
     sh (
-      script: "python3 ci/scripts/check_pr.py --pr ${pr_number}",
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
       label: 'Check PR title and body',
     )
   }
@@ -291,7 +293,7 @@ def prepare() {
 
         if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
           sh(
-            script: "./ci/scripts/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
             label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
           )
           // Pull image names from the results of should_rebuild_docker.py
@@ -374,14 +376,14 @@ def prepare() {
 
         is_docs_only_build = sh (
           returnStatus: true,
-          script: './ci/scripts/git_change_docs.sh',
+          script: './${jenkins_scripts_root}/git_change_docs.sh',
           label: 'Check for docs only changes',
         )
         skip_ci = should_skip_ci(env.CHANGE_ID)
         skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
         rebuild_docker_images = sh (
           returnStatus: true,
-          script: './ci/scripts/git_change_docker.sh',
+          script: './${jenkins_scripts_root}/git_change_docker.sh',
           label: 'Check for any docker changes',
         )
 
@@ -416,7 +418,7 @@ def ecr_push(full_name) {
       sh(
         script: """
           set -x
-          . ci/scripts/retry.sh
+          . ${jenkins_scripts_root}/retry.sh
           docker tag ${full_name} \$AWS_ECR_REPO/${full_name}
           retry 5 docker push \$AWS_ECR_REPO/${full_name}
         """,
@@ -459,7 +461,7 @@ def ecr_pull(full_name) {
       sh(
         script: """
           set -eux
-          . ci/scripts/retry.sh
+          . ${jenkins_scripts_root}/retry.sh
           retry 5 docker pull ${full_name}
         """,
         label: 'Pull image from ECR'
@@ -745,7 +747,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu/build/libtvm.so
               md5sum build/libvta_fsim.so
@@ -766,7 +768,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu2/build/libtvm.so
               md5sum build/libvta_fsim.so
@@ -801,7 +803,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               md5sum build/libvta_tsim.so
               retry 3 aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/cpu/build/libvta_tsim.so
               md5sum build/libtvm.so
@@ -843,7 +845,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cpu-minimal/build/libtvm.so
               md5sum build/libtvm_runtime.so
@@ -902,7 +904,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               md5sum build/libvta_tsim.so
               retry 3 aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/i386/build/libvta_tsim.so
               md5sum build/libtvm.so
@@ -939,7 +941,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/arm/build/libtvm.so
               md5sum build/libvta_fsim.so
@@ -974,7 +976,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cortexm/build/libtvm.so
               md5sum build/libtvm_runtime.so
@@ -1012,7 +1014,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/hexagon/build/libtvm.so
               md5sum build/libtvm_runtime.so
@@ -1046,7 +1048,7 @@ stage('Build') {
           sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/riscv/build/libtvm.so
               md5sum build/libtvm_runtime.so
@@ -1091,7 +1093,7 @@ def shard_run_unittest_GPU_1_of_3() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libvta_fsim.so build/libvta_fsim.so
@@ -1109,7 +1111,7 @@ def shard_run_unittest_GPU_1_of_3() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -1174,7 +1176,7 @@ def shard_run_unittest_GPU_2_of_3() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -1242,7 +1244,7 @@ def shard_run_unittest_GPU_3_of_3() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -1307,7 +1309,7 @@ def shard_run_integration_CPU_1_of_4() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
                           md5sum build/libvta_tsim.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
@@ -1369,7 +1371,7 @@ def shard_run_integration_CPU_2_of_4() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
                           md5sum build/libvta_tsim.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
@@ -1431,7 +1433,7 @@ def shard_run_integration_CPU_3_of_4() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
                           md5sum build/libvta_tsim.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
@@ -1493,7 +1495,7 @@ def shard_run_integration_CPU_4_of_4() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
                           md5sum build/libvta_tsim.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
@@ -1556,7 +1558,7 @@ def shard_run_python_i386_1_of_3() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
@@ -1618,7 +1620,7 @@ def shard_run_python_i386_2_of_3() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
@@ -1680,7 +1682,7 @@ def shard_run_python_i386_3_of_3() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
@@ -1742,7 +1744,7 @@ def shard_run_test_Hexagon_1_of_8() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -1803,7 +1805,7 @@ def shard_run_test_Hexagon_2_of_8() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -1863,7 +1865,7 @@ def shard_run_test_Hexagon_3_of_8() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -1923,7 +1925,7 @@ def shard_run_test_Hexagon_4_of_8() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -1983,7 +1985,7 @@ def shard_run_test_Hexagon_5_of_8() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2043,7 +2045,7 @@ def shard_run_test_Hexagon_6_of_8() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2103,7 +2105,7 @@ def shard_run_test_Hexagon_7_of_8() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2163,7 +2165,7 @@ def shard_run_test_Hexagon_8_of_8() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -2224,7 +2226,7 @@ def shard_run_integration_aarch64_1_of_4() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -2285,7 +2287,7 @@ def shard_run_integration_aarch64_2_of_4() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -2346,7 +2348,7 @@ def shard_run_integration_aarch64_3_of_4() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -2407,7 +2409,7 @@ def shard_run_integration_aarch64_4_of_4() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -2469,7 +2471,7 @@ def shard_run_topi_GPU_1_of_3() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -2529,7 +2531,7 @@ def shard_run_topi_GPU_2_of_3() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -2589,7 +2591,7 @@ def shard_run_topi_GPU_3_of_3() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -2650,7 +2652,7 @@ def shard_run_frontend_GPU_1_of_6() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -2710,7 +2712,7 @@ def shard_run_frontend_GPU_2_of_6() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -2770,7 +2772,7 @@ def shard_run_frontend_GPU_3_of_6() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -2830,7 +2832,7 @@ def shard_run_frontend_GPU_4_of_6() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -2890,7 +2892,7 @@ def shard_run_frontend_GPU_5_of_6() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -2950,7 +2952,7 @@ def shard_run_frontend_GPU_6_of_6() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -3011,7 +3013,7 @@ def shard_run_topi_aarch64_1_of_2() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -3076,7 +3078,7 @@ def shard_run_topi_aarch64_2_of_2() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -3141,7 +3143,7 @@ def shard_run_frontend_aarch64_1_of_2() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -3201,7 +3203,7 @@ def shard_run_frontend_aarch64_2_of_2() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
@@ -3262,7 +3264,7 @@ def shard_run_test_Cortex_M_1_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3327,7 +3329,7 @@ def shard_run_test_Cortex_M_2_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3387,7 +3389,7 @@ def shard_run_test_Cortex_M_3_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3447,7 +3449,7 @@ def shard_run_test_Cortex_M_4_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3507,7 +3509,7 @@ def shard_run_test_Cortex_M_5_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3567,7 +3569,7 @@ def shard_run_test_Cortex_M_6_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3627,7 +3629,7 @@ def shard_run_test_Cortex_M_7_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3687,7 +3689,7 @@ def shard_run_test_Cortex_M_8_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3747,7 +3749,7 @@ def shard_run_test_Cortex_M_9_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3807,7 +3809,7 @@ def shard_run_test_Cortex_M_10_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3867,7 +3869,7 @@ def shard_run_test_Cortex_M_11_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3927,7 +3929,7 @@ def shard_run_test_Cortex_M_12_of_12() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -3988,7 +3990,7 @@ def shard_run_test_RISC_V_1_of_1() {
               sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/riscv/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/riscv/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4045,7 +4047,7 @@ def run_unittest_minimal() {
               sh(
                     script: """
                       set -eux
-                      . ci/scripts/retry.sh
+                      . ${jenkins_scripts_root}/retry.sh
                       retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu-minimal/build/libtvm.so build/libtvm.so
                       md5sum build/libtvm.so
                       retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu-minimal/build/libtvm_runtime.so build/libtvm_runtime.so
@@ -4250,7 +4252,7 @@ stage('Test') {
                 sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
                           md5sum build/libvta_tsim.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
@@ -4311,7 +4313,7 @@ stage('Test') {
                 sh(
                         script: """
                           set -eux
-                          . ci/scripts/retry.sh
+                          . ${jenkins_scripts_root}/retry.sh
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
                           md5sum build/libtvm.so
                           retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -4362,7 +4364,7 @@ stage('Test') {
           sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
               md5sum build/libtvm.so
               retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
@@ -4387,7 +4389,7 @@ stage('Test') {
           sh(
       script: """
         set -eux
-        . ci/scripts/retry.sh
+        . ${jenkins_scripts_root}/retry.sh
         md5sum docs.tgz
         retry 3 aws s3 cp --no-progress docs.tgz s3://${s3_prefix}/docs/docs.tgz
       """,
@@ -4437,7 +4439,7 @@ def update_docker(ecr_image, hub_image) {
   sh(
     script: """
     set -eux
-    . ci/scripts/retry.sh
+    . ${jenkins_scripts_root}/retry.sh
     docker tag \
       ${ecr_image} \
       ${hub_image}
@@ -4501,7 +4503,7 @@ def deploy() {
                     sh(
                       script: """
                         set -eux
-                        . ci/scripts/retry.sh
+                        . ${jenkins_scripts_root}/retry.sh
                         retry 3 aws s3 cp --no-progress s3://${s3_prefix}/docs/docs.tgz docs.tgz
                         md5sum docs.tgz
                       """,
@@ -4582,7 +4584,7 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
-                              . ci/scripts/retry.sh
+                              . ${jenkins_scripts_root}/retry.sh
                               docker pull tlcpackstaging/ci_arm:${tag}
                               docker tag tlcpackstaging/ci_arm:${tag} tlcpack/ci-arm:${tag}
                               retry 5 docker push tlcpack/ci-arm:${tag}
@@ -4596,7 +4598,7 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
-                              . ci/scripts/retry.sh
+                              . ${jenkins_scripts_root}/retry.sh
                               docker pull tlcpackstaging/ci_cortexm:${tag}
                               docker tag tlcpackstaging/ci_cortexm:${tag} tlcpack/ci-cortexm:${tag}
                               retry 5 docker push tlcpack/ci-cortexm:${tag}
@@ -4610,7 +4612,7 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
-                              . ci/scripts/retry.sh
+                              . ${jenkins_scripts_root}/retry.sh
                               docker pull tlcpackstaging/ci_cpu:${tag}
                               docker tag tlcpackstaging/ci_cpu:${tag} tlcpack/ci-cpu:${tag}
                               retry 5 docker push tlcpack/ci-cpu:${tag}
@@ -4624,7 +4626,7 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
-                              . ci/scripts/retry.sh
+                              . ${jenkins_scripts_root}/retry.sh
                               docker pull tlcpackstaging/ci_gpu:${tag}
                               docker tag tlcpackstaging/ci_gpu:${tag} tlcpack/ci-gpu:${tag}
                               retry 5 docker push tlcpack/ci-gpu:${tag}
@@ -4638,7 +4640,7 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
-                              . ci/scripts/retry.sh
+                              . ${jenkins_scripts_root}/retry.sh
                               docker pull tlcpackstaging/ci_hexagon:${tag}
                               docker tag tlcpackstaging/ci_hexagon:${tag} tlcpack/ci-hexagon:${tag}
                               retry 5 docker push tlcpack/ci-hexagon:${tag}
@@ -4652,7 +4654,7 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
-                              . ci/scripts/retry.sh
+                              . ${jenkins_scripts_root}/retry.sh
                               docker pull tlcpackstaging/ci_i386:${tag}
                               docker tag tlcpackstaging/ci_i386:${tag} tlcpack/ci-i386:${tag}
                               retry 5 docker push tlcpack/ci-i386:${tag}
@@ -4666,7 +4668,7 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
-                              . ci/scripts/retry.sh
+                              . ${jenkins_scripts_root}/retry.sh
                               docker pull tlcpackstaging/ci_lint:${tag}
                               docker tag tlcpackstaging/ci_lint:${tag} tlcpack/ci-lint:${tag}
                               retry 5 docker push tlcpack/ci-lint:${tag}
@@ -4680,7 +4682,7 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
-                              . ci/scripts/retry.sh
+                              . ${jenkins_scripts_root}/retry.sh
                               docker pull tlcpackstaging/ci_minimal:${tag}
                               docker tag tlcpackstaging/ci_minimal:${tag} tlcpack/ci-minimal:${tag}
                               retry 5 docker push tlcpack/ci-minimal:${tag}
@@ -4694,7 +4696,7 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
-                              . ci/scripts/retry.sh
+                              . ${jenkins_scripts_root}/retry.sh
                               docker pull tlcpackstaging/ci_riscv:${tag}
                               docker tag tlcpackstaging/ci_riscv:${tag} tlcpack/ci-riscv:${tag}
                               retry 5 docker push tlcpack/ci-riscv:${tag}
@@ -4708,7 +4710,7 @@ def deploy() {
                           sh(
                             script: """
                               set -eux
-                              . ci/scripts/retry.sh
+                              . ${jenkins_scripts_root}/retry.sh
                               docker pull tlcpackstaging/ci_wasm:${tag}
                               docker tag tlcpackstaging/ci_wasm:${tag} tlcpack/ci-wasm:${tag}
                               retry 5 docker push tlcpack/ci-wasm:${tag}
diff --git a/ci/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2
index 798af6736e1e..f11d901258f6 100644
--- a/ci/jenkins/Deploy.groovy.j2
+++ b/ci/jenkins/Deploy.groovy.j2
@@ -30,7 +30,7 @@ def update_docker(ecr_image, hub_image) {
   sh(
     script: """
     set -eux
-    . ci/scripts/retry.sh
+    . ${jenkins_scripts_root}/retry.sh
     docker tag \
       ${ecr_image} \
       ${hub_image}
@@ -148,7 +148,7 @@ def deploy() {
                 sh(
                   script: """
                     set -eux
-                    . ci/scripts/retry.sh
+                    . ${jenkins_scripts_root}/retry.sh
                     docker pull tlcpackstaging/{{ image.name }}:${tag}
                     docker tag tlcpackstaging/{{ image.name }}:${tag} tlcpack/{{ image.name.replace("_", "-") }}:${tag}
                     retry 5 docker push tlcpack/{{ image.name.replace("_", "-") }}:${tag}
diff --git a/ci/jenkins/DockerBuild.groovy.j2 b/ci/jenkins/DockerBuild.groovy.j2
index 5ffbeded80fa..69e0db4f9e4f 100644
--- a/ci/jenkins/DockerBuild.groovy.j2
+++ b/ci/jenkins/DockerBuild.groovy.j2
@@ -21,7 +21,7 @@ def ecr_push(full_name) {
       sh(
         script: """
           set -x
-          . ci/scripts/retry.sh
+          . ${jenkins_scripts_root}/retry.sh
           docker tag ${full_name} \$AWS_ECR_REPO/${full_name}
           retry 5 docker push \$AWS_ECR_REPO/${full_name}
         """,
@@ -64,7 +64,7 @@ def ecr_pull(full_name) {
       sh(
         script: """
           set -eux
-          . ci/scripts/retry.sh
+          . ${jenkins_scripts_root}/retry.sh
           retry 5 docker pull ${full_name}
         """,
         label: 'Pull image from ECR'
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index 34c1d66e43ba..7ceef81e4e7c 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -100,6 +100,8 @@ rebuild_docker_images = false
 {% set hexagon_api = ['build/hexagon_api_output',] %}
 s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
 
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
 {% set aws_default_region = "us-west-2" %}
 {% set aws_ecr_url = "dkr.ecr." + aws_default_region + ".amazonaws.com" %}
 
diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2
index 2c6f93090127..8bc57f466225 100644
--- a/ci/jenkins/Prepare.groovy.j2
+++ b/ci/jenkins/Prepare.groovy.j2
@@ -34,7 +34,7 @@ def init_git() {
   sh(
     script: """
       set -eux
-      . ci/scripts/retry.sh
+      . ${jenkins_scripts_root}/retry.sh
       retry 3 timeout 5m git submodule update --init -f --jobs 0
     """,
     label: 'Update git submodules',
@@ -66,7 +66,7 @@ def docker_init(image) {
     sh(
       script: """
       set -eux
-      . ci/scripts/retry.sh
+      . ${jenkins_scripts_root}/retry.sh
       retry 5 docker pull ${image}
       """,
       label: 'Pull docker image',
@@ -82,7 +82,7 @@ def should_skip_slow_tests(pr_number) {
     // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
     result = sh (
       returnStatus: true,
-      script: "./ci/scripts/should_run_slow_tests.py --pr '${pr_number}'",
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
       label: 'Check if CI should run slow tests',
     )
   }
@@ -118,7 +118,7 @@ def checkout_trusted_files() {
     // (especially those that access secrets) should be checked out here so
     // only trusted versions are used in CI
     sh(
-      script: "git checkout ${upstream_revision} ci/scripts/.",
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
       label: 'Check out trusted files',
     )
   }
@@ -131,7 +131,7 @@ def should_skip_ci(pr_number) {
   }
   glob_skip_ci_code = sh (
     returnStatus: true,
-    script: "./ci/scripts/git_skip_ci_globs.py",
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
     label: 'Check if CI should be skipped due to changed files',
   )
   if (glob_skip_ci_code == 0) {
@@ -145,7 +145,7 @@ def should_skip_ci(pr_number) {
     // full CI just in case). Exit code of 0 means skip CI.
     git_skip_ci_code = sh (
       returnStatus: true,
-      script: "./ci/scripts/git_skip_ci.py --pr '${pr_number}'",
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
       label: 'Check if CI should be skipped',
     )
   }
@@ -162,7 +162,7 @@ def check_pr(pr_number) {
     variable: 'GITHUB_TOKEN',
     )]) {
     sh (
-      script: "python3 ci/scripts/check_pr.py --pr ${pr_number}",
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
       label: 'Check PR title and body',
     )
   }
@@ -179,7 +179,7 @@ def prepare() {
 
         if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
           sh(
-            script: "./ci/scripts/determine_docker_images.py {% for image in images %}{{ image.name }}={% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %} {% endfor %}",
+            script: "./${jenkins_scripts_root}/determine_docker_images.py {% for image in images %}{{ image.name }}={% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %} {% endfor %}",
             label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
           )
           // Pull image names from the results of should_rebuild_docker.py
@@ -205,14 +205,14 @@ def prepare() {
 
         is_docs_only_build = sh (
           returnStatus: true,
-          script: './ci/scripts/git_change_docs.sh',
+          script: './${jenkins_scripts_root}/git_change_docs.sh',
           label: 'Check for docs only changes',
         )
         skip_ci = should_skip_ci(env.CHANGE_ID)
         skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
         rebuild_docker_images = sh (
           returnStatus: true,
-          script: './ci/scripts/git_change_docker.sh',
+          script: './${jenkins_scripts_root}/git_change_docker.sh',
           label: 'Check for any docker changes',
         )
 
diff --git a/ci/jenkins/macros.j2 b/ci/jenkins/macros.j2
index 78c5acd1c7ff..b8ac0de91ce6 100644
--- a/ci/jenkins/macros.j2
+++ b/ci/jenkins/macros.j2
@@ -191,7 +191,7 @@ def {{ method_name }}() {
 sh(
       script: """
         set -eux
-        . ci/scripts/retry.sh
+        . ${jenkins_scripts_root}/retry.sh
         {% for filename in filenames %}
         md5sum {{ filename }}
         retry 3 aws s3 cp --no-progress {{ filename }} s3://${s3_prefix}/{{ tag }}/{{ filename }}
@@ -208,7 +208,7 @@ sh(
 sh(
             script: """
               set -eux
-              . ci/scripts/retry.sh
+              . ${jenkins_scripts_root}/retry.sh
               {% for filename in filenames %}
               retry 3 aws s3 cp --no-progress s3://${s3_prefix}/{{ tag }}/{{ filename }} {{ filename }}
               md5sum {{ filename }}
diff --git a/ci/scripts/__init__.py b/ci/scripts/github/__init__.py
similarity index 94%
rename from ci/scripts/__init__.py
rename to ci/scripts/github/__init__.py
index 064781fa158d..edc5fdff0e09 100644
--- a/ci/scripts/__init__.py
+++ b/ci/scripts/github/__init__.py
@@ -14,6 +14,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Package to enable testing of CI scripts"""
+"""Package to enable testing of Github scripts"""
 
 from . import github_skipped_tests_comment, github_pr_comment, github_tag_teams, github_docs_comment
diff --git a/ci/scripts/github_cc_reviewers.py b/ci/scripts/github/github_cc_reviewers.py
similarity index 95%
rename from ci/scripts/github_cc_reviewers.py
rename to ci/scripts/github/github_cc_reviewers.py
index d8323221a7b0..b98a0290c17c 100755
--- a/ci/scripts/github_cc_reviewers.py
+++ b/ci/scripts/github/github_cc_reviewers.py
@@ -21,9 +21,13 @@
 import json
 import argparse
 import re
+from pathlib import Path
 from urllib import error
 from typing import Dict, Any, List
 
+# Hackery to enable importing of utils from ci/scripts/jenkins
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+sys.path.append(str(REPO_ROOT / "ci" / "scripts" / "jenkins"))
 
 from git_utils import git, GitHubRepo, parse_remote
 
diff --git a/ci/scripts/github_commenter.py b/ci/scripts/github/github_commenter.py
similarity index 95%
rename from ci/scripts/github_commenter.py
rename to ci/scripts/github/github_commenter.py
index dc71fcd1fd32..909ce8516402 100644
--- a/ci/scripts/github_commenter.py
+++ b/ci/scripts/github/github_commenter.py
@@ -18,8 +18,14 @@
 
 import re
 import logging
+import sys
+from pathlib import Path
 from typing import Dict, Tuple, Any, Optional, List, Union
 
+# Hackery to enable importing of utils from ci/scripts/jenkins
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+sys.path.append(str(REPO_ROOT / "ci" / "scripts" / "jenkins"))
+
 from git_utils import GitHubRepo
 
 BOT_COMMENT_START = "<!---bot-comment-->"
diff --git a/ci/scripts/github_docs_comment.py b/ci/scripts/github/github_docs_comment.py
similarity index 100%
rename from ci/scripts/github_docs_comment.py
rename to ci/scripts/github/github_docs_comment.py
diff --git a/ci/scripts/github_pr_comment.py b/ci/scripts/github/github_pr_comment.py
similarity index 94%
rename from ci/scripts/github_pr_comment.py
rename to ci/scripts/github/github_pr_comment.py
index bcf4c5096ab0..3ad7b02674c7 100755
--- a/ci/scripts/github_pr_comment.py
+++ b/ci/scripts/github/github_pr_comment.py
@@ -18,6 +18,12 @@
 import argparse
 import os
 import json
+import sys
+from pathlib import Path
+
+# Hackery to enable importing of utils from ci/scripts/jenkins
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+sys.path.append(str(REPO_ROOT / "ci" / "scripts" / "jenkins"))
 
 from git_utils import git, GitHubRepo, parse_remote, DRY_RUN
 from cmd_utils import init_log
diff --git a/ci/scripts/github_skipped_tests_comment.py b/ci/scripts/github/github_skipped_tests_comment.py
similarity index 99%
rename from ci/scripts/github_skipped_tests_comment.py
rename to ci/scripts/github/github_skipped_tests_comment.py
index 46e579105324..24c27a3cb0e6 100755
--- a/ci/scripts/github_skipped_tests_comment.py
+++ b/ci/scripts/github/github_skipped_tests_comment.py
@@ -142,7 +142,7 @@ def build_comment(
         text += "\n"
         text += (
             f"Additional tests that were skipped in the CI build and present in the [`required_tests_to_run`]"
-            f"(https://github.com/apache/tvm/blob/main/ci/scripts/required_tests_to_run.json) file:"
+            f"(https://github.com/apache/tvm/blob/main/ci/scripts/github/required_tests_to_run.json) file:"
             f"\n```\n"
         )
         for skip in additional_skipped_list:
diff --git a/ci/scripts/github_tag_teams.py b/ci/scripts/github/github_tag_teams.py
similarity index 97%
rename from ci/scripts/github_tag_teams.py
rename to ci/scripts/github/github_tag_teams.py
index fd63070db1ba..e50efa301d88 100755
--- a/ci/scripts/github_tag_teams.py
+++ b/ci/scripts/github/github_tag_teams.py
@@ -21,8 +21,13 @@
 import argparse
 import logging
 import re
+import sys
+from pathlib import Path
 from typing import Dict, Any, List, Tuple, Optional
 
+# Hackery to enable importing of utils from ci/scripts/jenkins
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+sys.path.append(str(REPO_ROOT / "ci" / "scripts" / "jenkins"))
 
 from git_utils import git, GitHubRepo, parse_remote, find_ccs, dry_run_token
 from cmd_utils import tags_from_title, init_log
diff --git a/ci/scripts/github_tvmbot.py b/ci/scripts/github/github_tvmbot.py
similarity index 99%
rename from ci/scripts/github_tvmbot.py
rename to ci/scripts/github/github_tvmbot.py
index ee9607dd0254..908551bdec0d 100755
--- a/ci/scripts/github_tvmbot.py
+++ b/ci/scripts/github/github_tvmbot.py
@@ -19,6 +19,7 @@
 import os
 import json
 import argparse
+import sys
 import warnings
 import logging
 import traceback
@@ -26,6 +27,10 @@
 from typing import Dict, Any, List, Optional, Callable, Union
 from pathlib import Path
 
+# Hackery to enable importing of utils from ci/scripts/jenkins
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+sys.path.append(str(REPO_ROOT / "ci" / "scripts" / "jenkins"))
+
 from git_utils import git, GitHubRepo, parse_remote, post
 from cmd_utils import init_log
 
diff --git a/ci/scripts/ping_reviewers.py b/ci/scripts/github/ping_reviewers.py
similarity index 96%
rename from ci/scripts/ping_reviewers.py
rename to ci/scripts/github/ping_reviewers.py
index af642a52a0eb..0ecdf76dd014 100755
--- a/ci/scripts/ping_reviewers.py
+++ b/ci/scripts/github/ping_reviewers.py
@@ -16,15 +16,20 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
 import argparse
 import re
 import datetime
 import json
+import sys
 import textwrap
-from typing import Dict, Any, List
+from pathlib import Path
+from typing import List
 
-from git_utils import git, GitHubRepo, parse_remote
+# Hackery to enable importing of utils from ci/scripts/jenkins
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+sys.path.append(str(REPO_ROOT / "ci" / "scripts" / "jenkins"))
+
+from git_utils import git, parse_remote
 
 GIT_DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
 
diff --git a/ci/scripts/update_branch.py b/ci/scripts/github/update_branch.py
similarity index 96%
rename from ci/scripts/update_branch.py
rename to ci/scripts/github/update_branch.py
index 8f2558742217..9f689f6bfa65 100755
--- a/ci/scripts/update_branch.py
+++ b/ci/scripts/github/update_branch.py
@@ -19,9 +19,14 @@
 import os
 import json
 import argparse
-import tempfile
+import sys
+from pathlib import Path
 from typing import Any, Dict
 
+# Hackery to enable importing of utils from ci/scripts/jenkins
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+sys.path.append(str(REPO_ROOT / "ci" / "scripts" / "jenkins"))
+
 from git_utils import git, GitHubRepo, parse_remote
 
 
diff --git a/ci/scripts/check_pr.py b/ci/scripts/jenkins/check_pr.py
similarity index 100%
rename from ci/scripts/check_pr.py
rename to ci/scripts/jenkins/check_pr.py
diff --git a/ci/scripts/cmd_utils.py b/ci/scripts/jenkins/cmd_utils.py
similarity index 95%
rename from ci/scripts/cmd_utils.py
rename to ci/scripts/jenkins/cmd_utils.py
index f83ec6f24ecd..52eaf9ac0ad2 100644
--- a/ci/scripts/cmd_utils.py
+++ b/ci/scripts/jenkins/cmd_utils.py
@@ -24,7 +24,8 @@
 from typing import List
 
 
-REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+assert (REPO_ROOT / "Jenkinsfile").exists
 
 
 class RelativePathFilter(logging.Filter):
diff --git a/ci/scripts/determine_docker_images.py b/ci/scripts/jenkins/determine_docker_images.py
similarity index 98%
rename from ci/scripts/determine_docker_images.py
rename to ci/scripts/jenkins/determine_docker_images.py
index dbcde82cff7a..82acf2ea46b4 100755
--- a/ci/scripts/determine_docker_images.py
+++ b/ci/scripts/jenkins/determine_docker_images.py
@@ -26,8 +26,7 @@
 
 
 from http_utils import get
-from cmd_utils import init_log, REPO_ROOT
-
+from cmd_utils import init_log
 
 DOCKER_API_BASE = "https://hub.docker.com/v2/"
 PAGE_SIZE = 25
diff --git a/ci/scripts/git_change_docker.sh b/ci/scripts/jenkins/git_change_docker.sh
similarity index 100%
rename from ci/scripts/git_change_docker.sh
rename to ci/scripts/jenkins/git_change_docker.sh
diff --git a/ci/scripts/git_change_docs.sh b/ci/scripts/jenkins/git_change_docs.sh
similarity index 100%
rename from ci/scripts/git_change_docs.sh
rename to ci/scripts/jenkins/git_change_docs.sh
diff --git a/ci/scripts/git_skip_ci.py b/ci/scripts/jenkins/git_skip_ci.py
similarity index 100%
rename from ci/scripts/git_skip_ci.py
rename to ci/scripts/jenkins/git_skip_ci.py
diff --git a/ci/scripts/git_skip_ci_globs.py b/ci/scripts/jenkins/git_skip_ci_globs.py
similarity index 100%
rename from ci/scripts/git_skip_ci_globs.py
rename to ci/scripts/jenkins/git_skip_ci_globs.py
diff --git a/ci/scripts/git_utils.py b/ci/scripts/jenkins/git_utils.py
similarity index 100%
rename from ci/scripts/git_utils.py
rename to ci/scripts/jenkins/git_utils.py
diff --git a/ci/scripts/http_utils.py b/ci/scripts/jenkins/http_utils.py
similarity index 100%
rename from ci/scripts/http_utils.py
rename to ci/scripts/jenkins/http_utils.py
diff --git a/ci/scripts/open_docker_update_pr.py b/ci/scripts/jenkins/open_docker_update_pr.py
similarity index 99%
rename from ci/scripts/open_docker_update_pr.py
rename to ci/scripts/jenkins/open_docker_update_pr.py
index 516c8c1a7d8c..9dcb241d5fd8 100755
--- a/ci/scripts/open_docker_update_pr.py
+++ b/ci/scripts/jenkins/open_docker_update_pr.py
@@ -17,7 +17,6 @@
 # under the License.
 
 import argparse
-import re
 import logging
 import datetime
 import os
@@ -26,7 +25,7 @@
 from urllib import error
 from typing import List, Dict, Any, Optional, Callable
 from git_utils import git, parse_remote, GitHubRepo
-from cmd_utils import REPO_ROOT, init_log, Sh
+from cmd_utils import REPO_ROOT, init_log
 from should_rebuild_docker import docker_api
 
 JENKINSFILE = REPO_ROOT / "ci" / "jenkins" / "Jenkinsfile.j2"
diff --git a/ci/scripts/pytest_ids.py b/ci/scripts/jenkins/pytest_ids.py
similarity index 100%
rename from ci/scripts/pytest_ids.py
rename to ci/scripts/jenkins/pytest_ids.py
diff --git a/ci/scripts/pytest_wrapper.py b/ci/scripts/jenkins/pytest_wrapper.py
similarity index 98%
rename from ci/scripts/pytest_wrapper.py
rename to ci/scripts/jenkins/pytest_wrapper.py
index 4c4410bedc9c..4cc988f5bf72 100755
--- a/ci/scripts/pytest_wrapper.py
+++ b/ci/scripts/jenkins/pytest_wrapper.py
@@ -18,7 +18,6 @@
 import argparse
 import textwrap
 import junitparser
-import traceback
 from pathlib import Path
 from typing import List, Optional
 import os
@@ -28,7 +27,7 @@
 from cmd_utils import init_log
 
 
-REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
 
 
 def lstrip(s: str, prefix: str) -> str:
diff --git a/ci/scripts/retry.sh b/ci/scripts/jenkins/retry.sh
similarity index 100%
rename from ci/scripts/retry.sh
rename to ci/scripts/jenkins/retry.sh
diff --git a/ci/scripts/should_rebuild_docker.py b/ci/scripts/jenkins/should_rebuild_docker.py
similarity index 100%
rename from ci/scripts/should_rebuild_docker.py
rename to ci/scripts/jenkins/should_rebuild_docker.py
diff --git a/ci/scripts/should_run_slow_tests.py b/ci/scripts/jenkins/should_run_slow_tests.py
similarity index 95%
rename from ci/scripts/should_run_slow_tests.py
rename to ci/scripts/jenkins/should_run_slow_tests.py
index 40ce068520b8..04bf9b0a542f 100755
--- a/ci/scripts/should_run_slow_tests.py
+++ b/ci/scripts/jenkins/should_run_slow_tests.py
@@ -17,13 +17,9 @@
 # under the License.
 
 import os
-import json
 import argparse
-import subprocess
-import re
 import textwrap
-from urllib import request
-from typing import Dict, Tuple, Any, List, Optional
+from typing import Tuple, List, Optional
 
 
 from git_utils import GitHubRepo, parse_remote, git
diff --git a/ci/scripts/required_tests_to_run.json b/ci/scripts/required_tests_to_run.json
deleted file mode 100644
index 8bd265c04fe0..000000000000
--- a/ci/scripts/required_tests_to_run.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "unittest_GPU":
-    [
-      "ctypes.tests.python.unittest.test_meta_schedule_integration#test_meta_schedule_integration_extract_from_bert_base",
-      "cython.tests.python.unittest.test_meta_schedule_integration#test_meta_schedule_integration_extract_from_bert_base",
-      "ctypes.tests.python.unittest.test_meta_schedule_integration#test_meta_schedule_dynamic_loop_extent",
-      "cython.tests.python.unittest.test_meta_schedule_integration#test_meta_schedule_dynamic_loop_extent",
-      "ctypes.tests.python.unittest.test_meta_schedule_integration#test_extract_task_arm_conv2d_nchwc",
-      "cython.tests.python.unittest.test_meta_schedule_integration#test_extract_task_arm_conv2d_nchwc"
-    ]
-}
diff --git a/docker/bash.sh b/docker/bash.sh
index 5ee772867976..5973c7013b85 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -305,7 +305,8 @@ if [ -n "${EXPANDED_SHORTCUT}" ]; then
     if [ "${CI+x}" == "x" ]; then
         DOCKER_IMAGE_NAME="${EXPANDED_SHORTCUT}"
     else
-        python3 ci/scripts/determine_docker_images.py "$DOCKER_IMAGE_NAME=$EXPANDED_SHORTCUT" 2> /dev/null
+        python3 ci/scripts/jenkins/determine_docker_images.py "$DOCKER_IMAGE_NAME=$EXPANDED_SHORTCUT" 2> /dev/null
+        echo "HERE HERE HERE"
         DOCKER_IMAGE_NAME=$(cat ".docker-image-names/$DOCKER_IMAGE_NAME")
         if [[ "$DOCKER_IMAGE_NAME" == *"tlcpackstaging"* ]]; then
             echo "WARNING: resolved docker image to fallback tag in tlcpackstaging" >&2
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index 7b7b7298d8b3..710f152c9b1e 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -26,13 +26,15 @@
 import pytest
 import tvm.testing
 
-from .test_utils import REPO_ROOT, TempGit, run_script
+from .test_utils import REPO_ROOT, GITHUB_SCRIPT_ROOT, JENKINS_SCRIPT_ROOT, TempGit, run_script
 
 # pylint: disable=wrong-import-position,wrong-import-order
 sys.path.insert(0, str(REPO_ROOT / "ci"))
-sys.path.insert(0, str(REPO_ROOT / "ci" / "scripts"))
+sys.path.insert(0, str(JENKINS_SCRIPT_ROOT))
+sys.path.insert(0, str(GITHUB_SCRIPT_ROOT))
 
-import scripts
+import scripts.github
+import scripts.jenkins
 
 # pylint: enable=wrong-import-position,wrong-import-order
 
@@ -132,7 +134,7 @@ def parameterize_named(**kwargs):
         "jenkins_prefix": "ci.tlcpack.ai",
         "common_main_build": """{"build_number": "4115", "state": "success"}""",
         "commit_sha": "sha1234",
-        "expected_body": "The list below shows tests that ran in main sha1234 but were skipped in the CI build of sha1234:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\n\nAdditional tests that were skipped in the CI build and present in the [`required_tests_to_run`](https://github.com/apache/tvm/blob/main/ci/scripts/required_tests_to_run.json) file:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).",
+        "expected_body": "The list below shows tests that ran in main sha1234 but were skipped in the CI build of sha1234:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\n\nAdditional tests that were skipped in the CI build and present in the [`required_tests_to_run`](https://github.com/apache/tvm/blob/main/ci/scripts/github/required_tests_to_run.json) file:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).",
     },
     "no-diff": {
         "main_xml_file": "unittest/file1.xml",
@@ -212,7 +214,7 @@ def parameterize_named(**kwargs):
         "jenkins_prefix": "ci.tlcpack.ai",
         "common_main_build": """{"build_number": "4115", "state": "success"}""",
         "commit_sha": "sha1234",
-        "expected_body": "No diff in skipped tests with main found in this branch for commit sha1234.\n\nAdditional tests that were skipped in the CI build and present in the [`required_tests_to_run`](https://github.com/apache/tvm/blob/main/ci/scripts/required_tests_to_run.json) file:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).",
+        "expected_body": "No diff in skipped tests with main found in this branch for commit sha1234.\n\nAdditional tests that were skipped in the CI build and present in the [`required_tests_to_run`](https://github.com/apache/tvm/blob/main/ci/scripts/github/required_tests_to_run.json) file:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).",
     },
     "unable-to-run": {
         "main_xml_file": "unittest/file1.xml",
@@ -300,7 +302,7 @@ def write_xml_file(root_dir, xml_file, xml_content):
         }
     }
     with caplog.at_level(logging.INFO):
-        comment = scripts.github_skipped_tests_comment.get_skipped_tests_comment(
+        comment = scripts.github.github_skipped_tests_comment.get_skipped_tests_comment(
             pr=pr_data,
             github=None,
             s3_prefix=s3_prefix,
@@ -350,7 +352,7 @@ def test_docs_comment(target_url, base_url, commit_sha, expected_body):
             ]
         }
     }
-    comment = scripts.github_docs_comment.get_doc_url(
+    comment = scripts.github.github_docs_comment.get_doc_url(
         pr=pr_data,
         base_docs_url=base_url,
     )
@@ -414,7 +416,7 @@ def test_cc_reviewers(
     """
     Test that reviewers are added from 'cc @someone' messages in PRs
     """
-    reviewers_script = REPO_ROOT / "ci" / "scripts" / "github_cc_reviewers.py"
+    reviewers_script = GITHUB_SCRIPT_ROOT / "github_cc_reviewers.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
     reviews = [{"user": {"login": r}} for r in existing_review_users]
@@ -497,7 +499,7 @@ def test_update_branch(tmpdir_factory, statuses, expected_rc, expected_output):
     """
     Test that the last-successful branch script updates successfully
     """
-    update_script = REPO_ROOT / "ci" / "scripts" / "update_branch.py"
+    update_script = GITHUB_SCRIPT_ROOT / "update_branch.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
     commit = {
@@ -608,7 +610,7 @@ def test_pr_comment(tmpdir_factory, pr_author, comments, expected):
     """
     Test the PR commenting bot
     """
-    comment_script = REPO_ROOT / "ci" / "scripts" / "github_pr_comment.py"
+    comment_script = GITHUB_SCRIPT_ROOT / "github_pr_comment.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
     target_url = "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect"
@@ -739,7 +741,7 @@ def test_skip_ci(tmpdir_factory, commands, should_skip, pr_title, why):
     """
     Test that CI is skipped when it should be
     """
-    skip_ci_script = REPO_ROOT / "ci" / "scripts" / "git_skip_ci.py"
+    skip_ci_script = JENKINS_SCRIPT_ROOT / "git_skip_ci.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
 
@@ -776,7 +778,7 @@ def test_skip_globs(tmpdir_factory, files, should_skip):
     """
     Test that CI is skipped if only certain files are edited
     """
-    script = REPO_ROOT / "ci" / "scripts" / "git_skip_ci_globs.py"
+    script = JENKINS_SCRIPT_ROOT / "git_skip_ci_globs.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
 
@@ -877,7 +879,7 @@ def test_ping_reviewers(tmpdir_factory, pull_request, check):
     """
     Test that reviewers are messaged after a time period of inactivity
     """
-    reviewers_script = REPO_ROOT / "ci" / "scripts" / "ping_reviewers.py"
+    reviewers_script = GITHUB_SCRIPT_ROOT / "ping_reviewers.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
 
@@ -1116,7 +1118,7 @@ def test_github_tag_teams(tmpdir_factory, source_type, data, check):
     """
     Check that individuals are tagged from team headers
     """
-    tag_script = REPO_ROOT / "ci" / "scripts" / "github_tag_teams.py"
+    tag_script = GITHUB_SCRIPT_ROOT / "github_tag_teams.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
 
@@ -1241,7 +1243,7 @@ def test_open_docker_update_pr(
     tmpdir_factory, tlcpackstaging_body, tlcpack_body, expected, expected_images
 ):
     """Test workflow to open a PR to update Docker images"""
-    tag_script = REPO_ROOT / "ci" / "scripts" / "open_docker_update_pr.py"
+    tag_script = JENKINS_SCRIPT_ROOT / "open_docker_update_pr.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
     git.run("config", "user.name", "ci")
@@ -1300,7 +1302,7 @@ def test_open_docker_update_pr(
 )
 def test_determine_docker_images(tmpdir_factory, images, expected):
     """Test script to decide whether to use tlcpack or tlcpackstaging for images"""
-    script = REPO_ROOT / "ci" / "scripts" / "determine_docker_images.py"
+    script = JENKINS_SCRIPT_ROOT / "determine_docker_images.py"
 
     git_dir = tmpdir_factory.mktemp("tmp_git_dir")
 
@@ -1355,7 +1357,7 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec
     """
     Check that the Docker images are built when necessary
     """
-    tag_script = REPO_ROOT / "ci" / "scripts" / "should_rebuild_docker.py"
+    tag_script = JENKINS_SCRIPT_ROOT / "should_rebuild_docker.py"
 
     git = TempGit(tmpdir_factory.mktemp("tmp_git_dir"))
     git.run("config", "user.name", "ci")
@@ -1434,7 +1436,7 @@ def test_pr_linter(title, body, expected, expected_code):
     """
     Test the PR linter
     """
-    tag_script = REPO_ROOT / "ci" / "scripts" / "check_pr.py"
+    tag_script = JENKINS_SCRIPT_ROOT / "check_pr.py"
     pr_data = {
         "title": title,
         "body": body,
diff --git a/tests/python/ci/test_tvmbot.py b/tests/python/ci/test_tvmbot.py
index ceabd46a9b03..de3ab9bb501b 100644
--- a/tests/python/ci/test_tvmbot.py
+++ b/tests/python/ci/test_tvmbot.py
@@ -23,7 +23,7 @@
 from typing import Dict, Any
 
 import tvm
-from .test_utils import REPO_ROOT, TempGit, run_script
+from .test_utils import GITHUB_SCRIPT_ROOT, TempGit, run_script
 
 
 SUCCESS_EXPECTED_OUTPUT = """
@@ -51,7 +51,7 @@ def test(self, tmpdir_factory):
         """
         Run the tvm-bot script using the data from preprocess_data
         """
-        mergebot_script = REPO_ROOT / "ci" / "scripts" / "github_tvmbot.py"
+        mergebot_script = GITHUB_SCRIPT_ROOT / "github_tvmbot.py"
         test_json_dir = Path(__file__).resolve().parent / "sample_prs"
         with open(test_json_dir / f"pr{self.NUMBER}.json") as f:
             test_data = json.load(f)
diff --git a/tests/python/ci/test_utils.py b/tests/python/ci/test_utils.py
index 4a0f2710e74a..107d16c11bc1 100644
--- a/tests/python/ci/test_utils.py
+++ b/tests/python/ci/test_utils.py
@@ -22,6 +22,8 @@
 from typing import List, Any
 
 REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent
+GITHUB_SCRIPT_ROOT = REPO_ROOT / "ci" / "scripts" / "github"
+JENKINS_SCRIPT_ROOT = REPO_ROOT / "ci" / "scripts" / "jenkins"
 
 
 class TempGit:
diff --git a/tests/scripts/release/gather_prs.py b/tests/scripts/release/gather_prs.py
index 5fbfa2278feb..8f98076eb997 100644
--- a/tests/scripts/release/gather_prs.py
+++ b/tests/scripts/release/gather_prs.py
@@ -25,7 +25,8 @@
 from typing import Callable, Dict, List, Any
 
 REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
-sys.path.append(str(REPO_ROOT / "ci" / "scripts"))
+sys.path.append(str(REPO_ROOT / "ci" / "scripts" / "jenkins"))
+sys.path.append(str(REPO_ROOT / "ci" / "scripts" / "github"))
 
 from git_utils import git, GitHubRepo
 from github_tag_teams import tags_from_title
diff --git a/tests/scripts/release/make_notes.py b/tests/scripts/release/make_notes.py
index 95cb15197275..f39c1c567ceb 100644
--- a/tests/scripts/release/make_notes.py
+++ b/tests/scripts/release/make_notes.py
@@ -27,6 +27,8 @@
 
 REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
 sys.path.append(str(REPO_ROOT / "tests" / "scripts"))
+sys.path.append(str(REPO_ROOT / "tests" / "scripts" / "github"))
+sys.path.append(str(REPO_ROOT / "tests" / "scripts" / "jenkins"))
 
 
 def strip_header(title: str, header: str) -> str:
diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh
index 305f626d666c..fbb1ad4cbd3e 100755
--- a/tests/scripts/setup-pytest-env.sh
+++ b/tests/scripts/setup-pytest-env.sh
@@ -39,7 +39,7 @@ function cleanup() {
     set +x
     if [ "${#pytest_errors[@]}" -gt 0 ]; then
         echo "These pytest invocations failed, the results can be found in the Jenkins 'Tests' tab or by scrolling up through the raw logs here."
-        python3 ci/scripts/pytest_wrapper.py "${pytest_errors[@]}"
+        python3 ci/scripts/jenkins/pytest_wrapper.py "${pytest_errors[@]}"
         exit 1
     fi
     set -x
diff --git a/tests/scripts/task_build.py b/tests/scripts/task_build.py
index 1a8a1d112fc0..157e8195ce78 100755
--- a/tests/scripts/task_build.py
+++ b/tests/scripts/task_build.py
@@ -24,9 +24,9 @@
 
 from pathlib import Path
 
-# Hackery to enable importing of utils from ci/scripts
+# Hackery to enable importing of utils from ci/scripts/jenkins
 REPO_ROOT = Path(__file__).resolve().parent.parent.parent
-sys.path.append(str(REPO_ROOT / "ci" / "scripts"))
+sys.path.append(str(REPO_ROOT / "ci" / "scripts" / "jenkins"))
 from cmd_utils import Sh, init_log, REPO_ROOT
 
 
diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh
index 61d7238a594b..ee6be87b36d0 100755
--- a/tests/scripts/task_python_frontend.sh
+++ b/tests/scripts/task_python_frontend.sh
@@ -42,7 +42,7 @@ run_pytest cython python-frontend-pytorch tests/python/frontend/pytorch
 
 echo "Running relay Tensorflow frontend test..."
 # Note: Tensorflow tests often have memory issues, so invoke each one separately
-TENSORFLOW_TESTS=$(./ci/scripts/pytest_ids.py --folder tests/python/frontend/tensorflow)
+TENSORFLOW_TESTS=$(./ci/scripts/jenkins/pytest_ids.py --folder tests/python/frontend/tensorflow)
 i=0
 for node_id in $TENSORFLOW_TESTS; do
     echo "$node_id"

From b6fae9b35eff4ad1f7cc2e83d8d7da5d701d8e44 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Mon, 14 Nov 2022 16:51:38 -0600
Subject: [PATCH 569/704] [TIR][Bugfix] Fix AXIS_SEPARATORS in
 tir.Schedule.transform_layout (#13326)

Preivously, the block SREF reuse only included a single step of
changes, and would have an incorrect mapping if multiple sequential
changes to the TIR block occurred.  This could happen if a
`BufferStore` was updated, followed by replacement of `Block` iter
vars/values.  This commit tracks the Block replacements across each
usage, to ensure the SREF instances remain valid.
---
 .../primitive/layout_transformation.cc        | 462 +++++++++++-------
 src/tir/schedule/state.cc                     |   7 +-
 .../test_tir_schedule_transform_layout.py     |  48 +-
 3 files changed, 326 insertions(+), 191 deletions(-)

diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc
index e4c91dac582c..c0b4ddfb4ac3 100644
--- a/src/tir/schedule/primitive/layout_transformation.cc
+++ b/src/tir/schedule/primitive/layout_transformation.cc
@@ -73,7 +73,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
   // Loops within the analyzed block that should be replaced
   struct ReplacementPlan {
     Map<For, Stmt> replacements;
-    Map<Block, Block> block_sref_reuse;
+    Map<Block, Block> new_block_to_old;
   };
 
   // The block to be inserted, along with the location at which it
@@ -100,6 +100,25 @@ class TransformLayoutPlanner : private StmtExprVisitor {
   }
 
  private:
+  struct WriteInfo {
+    // The BufferStore object
+    BufferStore store;
+
+    // The block realize that contains the store, if any.
+    Optional<BlockRealize> innermost_block_realize;
+
+    // The nested loops whose values contribute to the indices used in
+    // the store.  Not all loop variables in the loopnest need to
+    // contribute, but the first and last must.
+    std::vector<For> dependent_loopnest;
+
+    // Whether the padding could be represented as a tir::if_then_else
+    // node.  This requires that the surrounding loop iterators
+    // iterate over all pre-transformation buffer axes, that there are
+    // no data dependencies between loop iterations, and that
+    bool contains_row_major_traversal{false};
+  };
+
   explicit TransformLayoutPlanner(Buffer old_buffer) : old_buffer_(old_buffer) {}
 
   void VisitStmt_(const ForNode* op) override {
@@ -197,33 +216,217 @@ class TransformLayoutPlanner : private StmtExprVisitor {
 
   class BufferStoreReplacer : public StmtExprMutator {
    public:
-    BufferStoreReplacer(std::function<Optional<Stmt>(const BufferStoreNode*)> replace_store,
-                        std::function<Optional<Stmt>(const BlockRealizeNode*, const BlockRealize&)>
-                            replace_block_realize)
-        : replace_store_(replace_store), replace_block_realize_(replace_block_realize) {}
+    BufferStoreReplacer(const WriteInfo& info, const Buffer& new_buffer, PrimExpr padding_predicate,
+                        const IndexMap& inverse, const Optional<IndexMap>& pad_value,
+                        Map<Block, Block>* new_block_to_old)
+        : info(info),
+          new_buffer(new_buffer),
+          new_indices(inverse->initial_indices.Map([](const Var& var) -> PrimExpr { return var; })),
+          padding_predicate(padding_predicate),
+          inverse(inverse),
+          pad_value(pad_value),
+          new_block_to_old(*new_block_to_old) {
+      ICHECK_EQ(info.dependent_loopnest.size(), inverse->final_indices.size());
+      for (size_t i = 0; i < info.dependent_loopnest.size(); i++) {
+        Var var = info.dependent_loopnest[i]->loop_var;
+        PrimExpr expr = inverse->final_indices[i];
+        var_remap.Set(var, expr);
+      }
+
+      DefineBlockUpdates();
+    }
+
+    bool is_all_stores_replaced() const { return all_stores_replaced; }
+
+   private:
+    void DefineBlockUpdates() {
+      if (!info.innermost_block_realize) {
+        return;
+      }
+
+      BlockRealize block_realize = info.innermost_block_realize.value();
+      const auto& block = block_realize->block;
+      const Array<PrimExpr>& old_indices = info.store->indices;
+      const auto& old_iter_vars = block->iter_vars;
+
+      this->new_iter_vars = old_iter_vars;
+      this->new_iter_values = block_realize->iter_values;
+
+      if (old_indices.empty()) {
+        return;
+      }
+
+      // Find the block iterators that are used to access the buffer.  Must be in the same
+      // order as they appear in the indices.
+      if (block->iter_vars.size() < old_indices.size()) {
+        return;
+      }
+
+      size_t block_index_start = 0;
+      for (; block_index_start < old_iter_vars.size() - old_indices.size(); block_index_start++) {
+        if (old_indices[0].same_as(old_iter_vars[block_index_start]->var)) {
+          break;
+        }
+      }
+      if (block_index_start > old_iter_vars.size() - old_indices.size()) {
+        return;
+      }
+
+      for (size_t i = 0; i < old_indices.size(); i++) {
+        if (!old_indices[i].same_as(old_iter_vars[block_index_start + i]->var) ||
+            old_iter_vars[block_index_start + i]->iter_type != kDataPar) {
+          return;
+        }
+      }
+
+      // If we got to this point, all indices used to access the
+      // buffer are virtual indices defined in the innermost block.
+      // Therefore, generate new virtual indices for iterating over
+      // the post-transform buffer.
+
+      new_indices = inverse->initial_indices.Map([](Var var) -> PrimExpr {
+        std::stringstream ss;
+        ss << "v_" << var->name_hint;
+        return Var(ss.str(), var.dtype());
+      });
+
+      Map<Var, PrimExpr>
+          loop_var_to_virtual_var;  // For updating padding_predicate in terms of the new indices
+      Array<PrimExpr> new_iter_values;  // For BlockRealize
+      Array<IterVar> new_iter_vars;     // For Block
+
+      for (size_t i = 0; i < block_index_start; i++) {
+        new_iter_vars.push_back(old_iter_vars[i]);
+        new_iter_values.push_back(block_realize->iter_values[i]);
+      }
+
+      ICHECK_EQ(new_indices.size(), new_buffer->shape.size());
+      for (size_t i = 0; i < new_indices.size(); i++) {
+        Var var = inverse->initial_indices[i];
+        Var virtual_var = Downcast<Var>(new_indices[i]);
+        PrimExpr dim = new_buffer->shape[i];
+        new_iter_values.push_back(var);
+        new_iter_vars.push_back(
+            IterVar(Range::FromMinExtent(make_zero(dim.dtype()), dim), virtual_var, kDataPar));
+        loop_var_to_virtual_var.Set(var, virtual_var);
+      }
+
+      for (size_t i = block_index_start + old_indices.size(); i < old_iter_vars.size(); i++) {
+        new_iter_vars.push_back(old_iter_vars[i]);
+        new_iter_values.push_back(block_realize->iter_values[i]);
+      }
+
+      ICHECK_EQ(inverse->final_indices.size(), old_indices.size());
+      for (size_t i = 0; i < old_indices.size(); i++) {
+        Var var = Downcast<Var>(old_indices[i]);
+        PrimExpr expr = Substitute(inverse->final_indices[i], loop_var_to_virtual_var);
+        var_remap.Set(var, expr);
+      }
+
+      padding_predicate = Substitute(padding_predicate, loop_var_to_virtual_var);
+
+      this->new_iter_vars = new_iter_vars;
+      this->new_iter_values = new_iter_values;
+    }
 
     Stmt VisitStmt_(const BufferStoreNode* op) final {
-      if (auto replacement = replace_store_(op)) {
-        auto store = Downcast<BufferStore>(replacement.value());
-        return StmtExprMutator::VisitStmt_(store.get());
+      bool can_replace = [&]() -> bool {
+        if (!op->buffer.same_as(info.store->buffer)) {
+          return false;
+        }
+
+        const Array<PrimExpr>& old_indices = info.store->indices;
+
+        ICHECK_EQ(old_indices.size(), op->indices.size());
+        ExprDeepEqual expr_equal;
+        for (size_t i = 0; i < old_indices.size(); i++) {
+          if (!expr_equal(old_indices[i], op->indices[i])) {
+            return false;
+          }
+        }
+        return true;
+      }();
+
+      BufferStore store = GetRef<BufferStore>(op);
+      if (can_replace) {
+        PrimExpr pad_value_at_index = pad_value.value()->MapIndices(new_indices)[0];
+        store =
+            BufferStore(new_buffer, if_then_else(padding_predicate, pad_value_at_index, op->value),
+                        new_indices);
       } else {
-        return StmtExprMutator::VisitStmt_(op);
+        all_stores_replaced = false;
       }
+      return StmtExprMutator::VisitStmt_(store.get());
     }
 
     Stmt VisitStmt_(const BlockRealizeNode* op) final {
-      auto realize = Downcast<BlockRealize>(StmtExprMutator::VisitStmt_(op));
-      if (auto replacement = replace_block_realize_(op, realize)) {
-        return replacement.value();
+      BlockRealize realize = Downcast<BlockRealize>(StmtExprMutator::VisitStmt_(op));
+
+      if (op == info.innermost_block_realize.get()) {
+        Block block = realize->block;
+        if (!block->iter_vars.same_as(this->new_iter_vars)) {
+          block.CopyOnWrite()->iter_vars = this->new_iter_vars;
+          RecordReplacement(op->block, block);
+        }
+
+        if (!block.same_as(realize->block) ||
+            !realize->iter_values.same_as(this->new_iter_values)) {
+          auto write_ptr = realize.CopyOnWrite();
+          write_ptr->block = block;
+          write_ptr->iter_values = this->new_iter_values;
+        }
+      }
+
+      return std::move(realize);
+    }
+
+    Stmt VisitStmt_(const BlockNode* op) final {
+      Block orig = GetRef<Block>(op);
+      Block mutated = Downcast<Block>(StmtExprMutator::VisitStmt_(op));
+
+      RecordReplacement(orig, mutated);
+      return std::move(mutated);
+    }
+
+    PrimExpr VisitExpr_(const VarNode* op) final {
+      Var var = GetRef<Var>(op);
+      if (auto opt = var_remap.Get(var)) {
+        return opt.value();
       } else {
-        return std::move(realize);
+        return std::move(var);
       }
     }
 
-   private:
-    std::function<Optional<Stmt>(const BufferStoreNode*)> replace_store_;
-    std::function<Optional<Stmt>(const BlockRealizeNode*, const BlockRealize&)>
-        replace_block_realize_;
+    void RecordReplacement(Block before, Block after) {
+      if (before.same_as(after)) {
+        return;
+      }
+
+      ICHECK(!new_block_to_old.count(after));
+
+      while (true) {
+        if (auto opt = new_block_to_old.Get(before)) {
+          before = opt.value();
+        } else {
+          break;
+        }
+      }
+
+      new_block_to_old.Set(after, before);
+    }
+
+    const WriteInfo& info;
+    const Buffer& new_buffer;
+    Array<PrimExpr> new_indices;
+    Array<IterVar> new_iter_vars;
+    Array<PrimExpr> new_iter_values;
+    PrimExpr padding_predicate;
+    const IndexMap& inverse;
+    const Optional<IndexMap>& pad_value;
+    Map<Block, Block>& new_block_to_old;
+    bool all_stores_replaced{true};
+
+    Map<Var, PrimExpr> var_remap;
   };
 
   TransformPlan Finalize(Buffer new_buffer, IndexMap index_map, IndexMap inverse,
@@ -296,159 +499,20 @@ class TransformLayoutPlanner : private StmtExprVisitor {
       return std::nullopt;
     }
 
+    Map<Block, Block> new_block_to_old;
     auto generate_if_then_else_block = [&](const WriteInfo& info) -> Optional<Stmt> {
       if (!info.contains_row_major_traversal || !pad_value.defined() ||
           is_zero(padding_predicate)) {
         return NullOpt;
       }
 
-      Array<PrimExpr> old_indices = info.store->indices;
-      PrimExpr if_then_else_condition = padding_predicate;
-      Array<PrimExpr> new_indices;
-      for (const auto& var : inverse->initial_indices) {
-        new_indices.push_back(var);
-      }
-
-      auto replace_block_realize =
-          [&]() -> std::function<Optional<Stmt>(const BlockRealizeNode*, const BlockRealize&)> {
-        auto no_change = [](const BlockRealizeNode*, const BlockRealize&) -> Optional<Stmt> {
-          return NullOpt;
-        };
-        if (!info.innermost_block_realize) {
-          return no_change;
-        }
-        if (old_indices.empty()) {
-          return no_change;
-        }
-
-        BlockRealize block_realize = info.innermost_block_realize.value();
-        const auto& block = block_realize->block;
-
-        // Find the block iterators that are used to access the buffer.  Must be in the same order
-        // as they appear in the indices.
-        if (block->iter_vars.size() < old_indices.size()) {
-          return no_change;
-        }
-        const auto& iter_vars = block->iter_vars;
-        size_t block_index_start = 0;
-        for (; block_index_start < iter_vars.size() - old_indices.size(); block_index_start++) {
-          if (old_indices[0].same_as(iter_vars[block_index_start]->var)) {
-            break;
-          }
-        }
-        if (block_index_start > iter_vars.size() - old_indices.size()) {
-          return no_change;
-        }
-
-        for (size_t i = 0; i < old_indices.size(); i++) {
-          if (!old_indices[i].same_as(iter_vars[block_index_start + i]->var) ||
-              iter_vars[block_index_start + i]->iter_type != kDataPar) {
-            return no_change;
-          }
-        }
-
-        // If we got to this point, all indices used to access the
-        // buffer are virtual indices defined in the innermost block.
-        // Therefore, generate new virtual indices for iterating over
-        // the post-transform buffer.
-        Array<PrimExpr> new_iter_values;             // For BlockRealize
-        Array<IterVar> new_iter_vars;                // For Block
-        Array<PrimExpr> new_access_indices;          // For BufferStore
-        Map<Var, PrimExpr> loop_var_to_virtual_var;  // For updating if_then_else_condition
-
-        for (size_t i = 0; i < block_index_start; i++) {
-          new_iter_vars.push_back(iter_vars[i]);
-          new_iter_values.push_back(block_realize->iter_values[i]);
-        }
-
-        ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size());
-        for (size_t i = 0; i < inverse->initial_indices.size(); i++) {
-          Var var = inverse->initial_indices[i];
-          PrimExpr dim = new_buffer->shape[i];
-          std::stringstream ss;
-          ss << "v_" << var->name_hint;
-          Var virtual_var(ss.str(), var.dtype());
-          new_iter_values.push_back(var);
-          new_iter_vars.push_back(
-              IterVar(Range::FromMinExtent(make_zero(dim.dtype()), dim), virtual_var, kDataPar));
-          new_access_indices.push_back(virtual_var);
-          loop_var_to_virtual_var.Set(var, virtual_var);
-        }
-
-        for (size_t i = block_index_start + old_indices.size(); i < iter_vars.size(); i++) {
-          new_iter_vars.push_back(iter_vars[i]);
-          new_iter_values.push_back(block_realize->iter_values[i]);
-        }
-
-        Map<Var, PrimExpr> old_virtual_var_to_new_virtual_var;
-        ICHECK_EQ(inverse->final_indices.size(), old_indices.size());
-        for (size_t i = 0; i < old_indices.size(); i++) {
-          Var var = Downcast<Var>(old_indices[i]);
-          PrimExpr expr = Substitute(inverse->final_indices[i], loop_var_to_virtual_var);
-          old_virtual_var_to_new_virtual_var.Set(var, expr);
-        }
-
-        if_then_else_condition = Substitute(if_then_else_condition, loop_var_to_virtual_var);
-        new_indices = new_access_indices;
-
-        return [target_realize = info.innermost_block_realize, new_iter_vars, new_iter_values,
-                old_virtual_var_to_new_virtual_var](const BlockRealizeNode* op,
-                                                    const BlockRealize& visited) -> Optional<Stmt> {
-          if (op == target_realize.get()) {
-            Block block = visited->block;
-            block =
-                Downcast<Block>(Substitute(std::move(block), old_virtual_var_to_new_virtual_var));
-            block.CopyOnWrite()->iter_vars = new_iter_vars;
-
-            BlockRealize realize = visited;
-            {
-              auto write_ptr = realize.CopyOnWrite();
-              write_ptr->block = block;
-              write_ptr->iter_values = new_iter_values;
-            }
-            return realize;
-          } else {
-            return NullOpt;
-          }
-        };
-      }();
-
-      bool all_stores_replaced = true;
-      auto replace_store = [&](const BufferStoreNode* op) -> Optional<Stmt> {
-        if (!op->buffer.same_as(info.store->buffer)) {
-          all_stores_replaced = false;
-          return NullOpt;
-        }
-        ICHECK_EQ(old_indices.size(), op->indices.size());
-        ExprDeepEqual expr_equal;
-        for (size_t i = 0; i < old_indices.size(); i++) {
-          if (!expr_equal(old_indices[i], op->indices[i])) {
-            all_stores_replaced = false;
-            return NullOpt;
-          }
-        }
-
-        PrimExpr pad_value_at_index = pad_value.value()->MapIndices(new_indices)[0];
-        return BufferStore(new_buffer,
-                           if_then_else(if_then_else_condition, pad_value_at_index, op->value),
-                           new_indices);
-      };
-
-      BufferStoreReplacer replacer(replace_store, replace_block_realize);
+      BufferStoreReplacer replacer(info, new_buffer, padding_predicate, inverse, pad_value,
+                                   &new_block_to_old);
       Stmt stmt = replacer(info.dependent_loopnest.back()->body);
-      if (!all_stores_replaced) {
+      if (!replacer.is_all_stores_replaced()) {
         return NullOpt;
       }
 
-      std::unordered_map<const VarNode*, PrimExpr> var_remap;
-      ICHECK_EQ(info.dependent_loopnest.size(), inverse->final_indices.size());
-      for (size_t i = 0; i < info.dependent_loopnest.size(); i++) {
-        Var var = info.dependent_loopnest[i]->loop_var;
-        PrimExpr expr = inverse->final_indices[i];
-        var_remap[var.get()] = expr;
-      }
-      stmt = Substitute(std::move(stmt), var_remap);
-
       ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size());
       for (size_t rev_i = 0; rev_i < inverse->initial_indices.size(); rev_i++) {
         size_t i = (inverse->initial_indices.size() - 1) - rev_i;
@@ -471,7 +535,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
     }
 
     if (loop_replacements.size()) {
-      return ReplacementPlan{std::move(loop_replacements)};
+      return ReplacementPlan{std::move(loop_replacements), std::move(new_block_to_old)};
     } else {
       return std::nullopt;
     }
@@ -603,25 +667,6 @@ class TransformLayoutPlanner : private StmtExprVisitor {
     std::vector<BindVariableDefinition> bound_vars_;
   };
 
-  struct WriteInfo {
-    // The BufferStore object
-    BufferStore store;
-
-    // The block realize that contains the store, if any.
-    Optional<BlockRealize> innermost_block_realize;
-
-    // The nested loops whose values contribute to the indices used in
-    // the store.  Not all loop variables in the loopnest need to
-    // contribute, but the first and last must.
-    std::vector<For> dependent_loopnest;
-
-    // Whether the padding could be represented as a tir::if_then_else
-    // node.  This requires that the surrounding loop iterators
-    // iterate over all pre-transformation buffer axes, that there are
-    // no data dependencies between loop iterations, and that
-    bool contains_row_major_traversal{false};
-  };
-
   /*! \brief Collected information about each BufferStore */
   std::vector<WriteInfo> write_info_;
 
@@ -683,7 +728,20 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
       auto write_ptr = result.CopyOnWrite();
       write_ptr->body = SeqStmt({plan_ptr->prologue, write_ptr->body});
     }
-    return {result, rewriter.block_sref_reuse_};
+
+    Map<Block, Block> block_sref_reuse;
+    for (auto [after, before] : rewriter.new_block_to_old_) {
+      while (auto opt = rewriter.new_block_to_old_.Get(before)) {
+        before = opt.value();
+      }
+      while (auto opt = block_sref_reuse.Get(after)) {
+        after = opt.value();
+      }
+
+      block_sref_reuse.Set(before, after);
+    }
+
+    return {result, block_sref_reuse};
   }
 
  private:
@@ -696,7 +754,11 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
         new_buffer_(new_buffer),
         index_map_(index_map),
         plan_(plan),
-        buffer_data_to_buffer_{{new_buffer->data, new_buffer}} {}
+        buffer_data_to_buffer_{{new_buffer->data, new_buffer}} {
+    if (auto plan_ptr = std::get_if<TransformLayoutPlanner::ReplacementPlan>(&plan_)) {
+      new_block_to_old_ = plan_ptr->new_block_to_old;
+    }
+  }
 
   void RewriteBufferAccess(Buffer* buffer, Array<PrimExpr>* indices) {
     *buffer = new_buffer_;
@@ -765,7 +827,20 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
   }
 
   Stmt VisitStmt_(const BlockNode* op) final {
+    Block orig = [&]() {
+      Block block = GetRef<Block>(op);
+      while (true) {
+        if (auto it = new_block_to_old_.find(block); it != new_block_to_old_.end()) {
+          block = (*it).second;
+        } else {
+          break;
+        }
+      }
+      return block;
+    }();
+
     Block block = Downcast<Block>(Parent::VisitStmt_(op));
+
     auto infered_access_regions = GetBlockReadWriteRegion(block, buffer_data_to_buffer_);
     auto* n = block.CopyOnWrite();
     RewriteAccessRegion(&n->reads, infered_access_regions[0]);
@@ -777,16 +852,35 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
         return buffer;
       }
     });
-    block_sref_reuse_.Set(GetRef<Block>(op), block);
+
+    RecordReplacement(orig, block);
     return std::move(block);
   }
 
+  void RecordReplacement(Block before, Block after) {
+    if (before.same_as(after)) {
+      return;
+    }
+
+    ICHECK(!new_block_to_old_.count(after));
+
+    while (true) {
+      if (auto opt = new_block_to_old_.Get(before)) {
+        before = opt.value();
+      } else {
+        break;
+      }
+    }
+
+    new_block_to_old_.Set(after, before);
+  }
+
   const Buffer& old_buffer_;
   const Buffer& new_buffer_;
   const IndexMap& index_map_;
   const TransformLayoutPlanner::TransformPlan& plan_;
   Map<Var, Buffer> buffer_data_to_buffer_;
-  Map<Block, Block> block_sref_reuse_;
+  Map<Block, Block> new_block_to_old_;
 };
 
 class BufferIsSubregionError : public ScheduleError {
diff --git a/src/tir/schedule/state.cc b/src/tir/schedule/state.cc
index 27056124d9e1..a901eff6f2d1 100644
--- a/src/tir/schedule/state.cc
+++ b/src/tir/schedule/state.cc
@@ -662,10 +662,11 @@ class SRefTreePruner : public StmtVisitor {
         << GetRef<Block>(op);
     StmtSRef& sref = it->second;
     // Detect reuse
-    auto reuse_it = reuse_info_.block_sref_reuse.find(op);
-    if (reuse_it != reuse_info_.block_sref_reuse.end()) {
+    const auto& sref_reuse = reuse_info_.block_sref_reuse;
+    if (auto reuse_it = sref_reuse.find(op); reuse_it != sref_reuse.end()) {
+      const BlockNode* to_reuse = reuse_it->second;
       // sref can be reused
-      reused_srefs_.emplace(reuse_it->second, std::move(sref));
+      reused_srefs_.emplace(to_reuse, std::move(sref));
     } else {
       sref->Reset();
       self_->block_info.erase(sref);
diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py
index ca5ac12a97c0..e90478922324 100644
--- a/tests/python/unittest/test_tir_schedule_transform_layout.py
+++ b/tests/python/unittest/test_tir_schedule_transform_layout.py
@@ -415,13 +415,13 @@ class BasePaddingCompare(tvm.testing.CompareBeforeAfter):
 
     transformed_buffer = tvm.testing.parameter("A")
 
+    index_map = tvm.testing.parameter(lambda i: [i // 4, i % 4])
+
     @pytest.fixture
-    def transform(self, pad_value, transformed_buffer):
+    def transform(self, pad_value, transformed_buffer, index_map):
         def transform(mod):
             sch = tir.Schedule(mod)
-            sch.transform_layout(
-                "block", transformed_buffer, lambda i: [i // 4, i % 4], pad_value=pad_value
-            )
+            sch.transform_layout("block", transformed_buffer, index_map, pad_value=pad_value)
             return sch.mod
 
         return transform
@@ -885,5 +885,45 @@ def expected(A: T.Buffer[16, "int32"], n: T.int32):
                 )
 
 
+class TestTransformWithAxisSeparators(BasePaddingCompare):
+    """Axis separators may be specified in a transform"""
+
+    index_map = tvm.testing.parameter(lambda i: [i // 4, tvm.tir.IndexMap.AXIS_SEPARATOR, i % 4])
+    pad_value = tvm.testing.parameter(0)
+
+    def before(a: T.handle):
+        A = T.match_buffer(a, [14], "int32")
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                A[vi] = 42
+
+    def expected(a: T.handle):
+        A = T.match_buffer(a, [4, 4], "int32", axis_separators=[1])
+        for i, j in T.grid(4, 4):
+            with T.block("block"):
+                vi, vj = T.axis.remap("SS", [i, j])
+                A[vi, vj] = T.if_then_else(vi == 3 and 2 <= vj, 0, 42, dtype="int32")
+
+
+class TestTransformWithAxisSeparatorsOpaqueBlock(BasePaddingCompare):
+    """Axis separators may be specified in a transform of opaque block"""
+
+    index_map = tvm.testing.parameter(lambda i: [i // 4, tvm.tir.IndexMap.AXIS_SEPARATOR, i % 4])
+    pad_value = tvm.testing.parameter(0)
+
+    def before(a: T.handle):
+        A = T.match_buffer(a, [14], "int32")
+        for i in T.serial(14):
+            with T.block("block"):
+                A[i] = 42
+
+    def expected(a: T.handle):
+        A = T.match_buffer(a, [4, 4], "int32", axis_separators=[1])
+        for i, j in T.grid(4, 4):
+            with T.block("block"):
+                A[i, j] = T.if_then_else(i == 3 and 2 <= j, 0, 42, dtype="int32")
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 68f51e692b79e799112dd709da8aa0ce854f7f23 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Mon, 14 Nov 2022 16:21:35 -0700
Subject: [PATCH 570/704] [ci] Fix Jenkins quoting (#13380)

Merging #13368 caused CI to pass but run more than it needed to due to
some failures in determination. This fixes the interpolation to use `"`
which should correctly pass through the variables

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                  | 6 +++---
 ci/jenkins/Prepare.groovy.j2 | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 171e200df379..079fb0688952 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-10-19T13:44:32.119961
+// Generated at 2022-11-14T12:32:18.663464
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -376,14 +376,14 @@ def prepare() {
 
         is_docs_only_build = sh (
           returnStatus: true,
-          script: './${jenkins_scripts_root}/git_change_docs.sh',
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
           label: 'Check for docs only changes',
         )
         skip_ci = should_skip_ci(env.CHANGE_ID)
         skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
         rebuild_docker_images = sh (
           returnStatus: true,
-          script: './${jenkins_scripts_root}/git_change_docker.sh',
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
           label: 'Check for any docker changes',
         )
 
diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2
index 8bc57f466225..4fd5e66e2f14 100644
--- a/ci/jenkins/Prepare.groovy.j2
+++ b/ci/jenkins/Prepare.groovy.j2
@@ -205,14 +205,14 @@ def prepare() {
 
         is_docs_only_build = sh (
           returnStatus: true,
-          script: './${jenkins_scripts_root}/git_change_docs.sh',
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
           label: 'Check for docs only changes',
         )
         skip_ci = should_skip_ci(env.CHANGE_ID)
         skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
         rebuild_docker_images = sh (
           returnStatus: true,
-          script: './${jenkins_scripts_root}/git_change_docker.sh',
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
           label: 'Check for any docker changes',
         )
 

From 41a22432608e5d1176f697848a06e87d3fe5c849 Mon Sep 17 00:00:00 2001
From: Florin Blanaru <florin.blanaru96@gmail.com>
Date: Tue, 15 Nov 2022 00:30:22 +0100
Subject: [PATCH 571/704] [CI] Do not merge before running CI on main (#13372)

This PR does not merge `main` if CI is running already on `main`. It aims to avoid a case where a race happens between two subsequent commits, and one of them merges the other.

Fixes #12392.
---
 Jenkinsfile                  | 42 ++++++++++++++++++++++++------------
 ci/jenkins/Prepare.groovy.j2 | 42 ++++++++++++++++++++++++------------
 2 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 079fb0688952..9fd926430b6a 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -129,21 +129,13 @@ def init_git() {
   )
 
   // Determine merge commit to use for all stages
-  sh (
-    script: 'git fetch origin main',
-    label: 'Fetch upstream',
-  )
-  if (upstream_revision == null) {
-    upstream_revision = sh(
-      script: 'git log -1 FETCH_HEAD --format=\'%H\'',
-      label: 'Determine upstream revision',
-      returnStdout: true,
-    ).trim()
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
   }
-  sh (
-    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
-    label: 'Merge to origin/main'
-  )
 
   sh(
     script: """
@@ -156,6 +148,28 @@ def init_git() {
   checkout_trusted_files()
 }
 
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
 def docker_init(image) {
   // Clear out all Docker images that aren't going to be used
   sh(
diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2
index 4fd5e66e2f14..6a82a887ede6 100644
--- a/ci/jenkins/Prepare.groovy.j2
+++ b/ci/jenkins/Prepare.groovy.j2
@@ -15,21 +15,13 @@ def init_git() {
   )
 
   // Determine merge commit to use for all stages
-  sh (
-    script: 'git fetch origin main',
-    label: 'Fetch upstream',
-  )
-  if (upstream_revision == null) {
-    upstream_revision = sh(
-      script: 'git log -1 FETCH_HEAD --format=\'%H\'',
-      label: 'Determine upstream revision',
-      returnStdout: true,
-    ).trim()
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
   }
-  sh (
-    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
-    label: 'Merge to origin/main'
-  )
 
   sh(
     script: """
@@ -42,6 +34,28 @@ def init_git() {
   checkout_trusted_files()
 }
 
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
 def docker_init(image) {
   // Clear out all Docker images that aren't going to be used
   sh(

From 3aa16f72dd3f1807b11ec61cf372af07d32099c4 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Mon, 14 Nov 2022 17:45:39 -0800
Subject: [PATCH 572/704] [Codegen] Fix CUDA codegen for int64 Ramp (#13382)

---
 src/target/source/codegen_cuda.cc               | 4 +++-
 tests/python/topi/python/test_topi_transform.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index d96e0cbc1679..3ae74cc16da4 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -1005,7 +1005,9 @@ void CodeGenCUDA::VisitStmt_(const EvaluateNode* op) {
 
 void CodeGenCUDA::VisitExpr_(const RampNode* op, std::ostream& os) {
   CHECK_LE(op->lanes, 4) << "ValueError: Ramp of more than 4 lanes is not allowed.";
-  os << "(make_int" << op->lanes << "(";
+  os << "(make_";
+  PrintType(op->dtype, os);
+  os << "(";
   for (int i = 0; i < op->lanes; i++) {
     os << "(" << PrintExpr(op->base) << ")"
        << "+(" << PrintExpr(op->stride) << "*" << i << ")";
diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py
index dd5ad1b11926..0f64b486f375 100644
--- a/tests/python/topi/python/test_topi_transform.py
+++ b/tests/python/topi/python/test_topi_transform.py
@@ -1040,6 +1040,7 @@ def test_gather():
     verify_gather(np.random.randn(4, 7, 5), 1, np.random.randint(low=0, high=7, size=(4, 10, 5)))
     verify_gather(np.random.randn(4, 7, 5), 2, np.random.randint(low=0, high=5, size=(4, 7, 2)))
     verify_gather(np.random.randn(4, 7, 5), 2, np.random.randint(low=0, high=5, size=(4, 7, 10)))
+    verify_gather(np.random.randn(4, 7, 2), 0, np.random.randint(low=0, high=4, size=(4, 7, 2)))
 
 
 @tvm.testing.uses_gpu

From 647be2b42510bffb3ed78267c19e76263adcac36 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Mon, 14 Nov 2022 20:58:12 -0800
Subject: [PATCH 573/704] [MetaSchedule] Add `from-target` Defaults for LLVM
 VNNI Targets (#13383)

---
 include/tvm/meta_schedule/mutator.h           |  2 +
 include/tvm/meta_schedule/postproc.h          |  2 +
 include/tvm/meta_schedule/schedule_rule.h     |  2 +
 src/meta_schedule/mutator/mutator.cc          |  2 +
 src/meta_schedule/postproc/postproc.cc        |  8 ++++
 .../schedule_rule/schedule_rule.cc            | 45 +++++++++++++++++++
 .../space_generator/space_generator.cc        | 11 +++++
 7 files changed, 72 insertions(+)

diff --git a/include/tvm/meta_schedule/mutator.h b/include/tvm/meta_schedule/mutator.h
index 08a8248dfdbc..4095d6ca0397 100644
--- a/include/tvm/meta_schedule/mutator.h
+++ b/include/tvm/meta_schedule/mutator.h
@@ -131,6 +131,8 @@ class Mutator : public runtime::ObjectRef {
                                    FApply f_apply, FClone f_clone, FAsString f_as_string);
   /*! \brief Create default mutators for LLVM */
   TVM_DLL static Map<Mutator, FloatImm, void> DefaultLLVM();
+  /*! \brief Create default mutators for x86 VNNI */
+  TVM_DLL static Map<Mutator, FloatImm, void> DefaultVNNI();
   /*! \brief Create default mutators for CUDA */
   TVM_DLL static Map<Mutator, FloatImm, void> DefaultCUDA();
   /*! \brief Create default mutators for CUDA with TensorCore */
diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h
index a680a647956c..13fe47058740 100644
--- a/include/tvm/meta_schedule/postproc.h
+++ b/include/tvm/meta_schedule/postproc.h
@@ -152,6 +152,8 @@ class Postproc : public runtime::ObjectRef {
   TVM_DLL static Postproc RewriteLayout();
   /*! \brief Create default postprocessors for LLVM */
   TVM_DLL static Array<Postproc, void> DefaultLLVM();
+  /*! \brief Create default postprocessors for x86 VNNI */
+  TVM_DLL static Array<Postproc, void> DefaultVNNI();
   /*! \brief Create default postprocessors for CUDA */
   TVM_DLL static Array<Postproc, void> DefaultCUDA();
   /*! \brief Create default postprocessors for CUDA with TensorCore */
diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h
index 70dec47e60bd..a3d6c7ef68bf 100644
--- a/include/tvm/meta_schedule/schedule_rule.h
+++ b/include/tvm/meta_schedule/schedule_rule.h
@@ -285,6 +285,8 @@ class ScheduleRule : public runtime::ObjectRef {
 
   /*! \brief Create default schedule rules for LLVM */
   TVM_DLL static Array<ScheduleRule, void> DefaultLLVM();
+  /*! \brief Create default schedule rules for x86 VNNI */
+  TVM_DLL static Array<ScheduleRule, void> DefaultVNNI();
   /*! \brief Create default schedule rules for CUDA */
   TVM_DLL static Array<ScheduleRule, void> DefaultCUDA();
   /*! \brief Create default postprocessors for CUDA with TensorCore */
diff --git a/src/meta_schedule/mutator/mutator.cc b/src/meta_schedule/mutator/mutator.cc
index 8e9bfc8bde4b..8f3d14b6c466 100644
--- a/src/meta_schedule/mutator/mutator.cc
+++ b/src/meta_schedule/mutator/mutator.cc
@@ -59,6 +59,8 @@ Map<Mutator, FloatImm> Mutator::DefaultLLVM() {
       {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(DataType::Float(64), 0.02)}};
 }
 
+Map<Mutator, FloatImm> Mutator::DefaultVNNI() { return Mutator::DefaultLLVM(); }
+
 Map<Mutator, FloatImm> Mutator::DefaultCUDA() {
   return Map<Mutator, FloatImm>{
       {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)},
diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc
index 0738c871120f..c614f3230d59 100644
--- a/src/meta_schedule/postproc/postproc.cc
+++ b/src/meta_schedule/postproc/postproc.cc
@@ -59,6 +59,14 @@ Array<Postproc> Postproc::DefaultLLVM() {
   };
 }
 
+Array<Postproc> Postproc::DefaultVNNI() {
+  return Array<Postproc>{
+      Postproc::DisallowDynamicLoop(),   Postproc::RewriteParallelVectorizeUnroll(),
+      Postproc::RewriteReductionBlock(), Postproc::RewriteTensorize(/*vectorize_init_loop=*/true),
+      Postproc::RewriteLayout(),
+  };
+}
+
 Array<Postproc> Postproc::DefaultCUDA() {
   return Array<Postproc>{
       Postproc::DisallowDynamicLoop(),
diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc
index b1e8c3695d3e..e4f97c1fa673 100644
--- a/src/meta_schedule/schedule_rule/schedule_rule.cc
+++ b/src/meta_schedule/schedule_rule/schedule_rule.cc
@@ -85,6 +85,51 @@ Array<ScheduleRule> ScheduleRule::DefaultLLVM() {
   };
 }
 
+Array<ScheduleRule> ScheduleRule::DefaultVNNI() {
+  return {
+      ScheduleRule::ApplyCustomRule(),
+      ScheduleRule::InlineConstantScalars(),
+      ScheduleRule::AutoInline(
+          /*into_producer=*/false,
+          /*into_consumer=*/true,
+          /*inline_const_tensor=*/true,
+          /*disallow_if_then_else=*/true,
+          /*require_injective=*/true,
+          /*require_ordered=*/true,
+          /*disallow_op=*/Array<String>{"tir.exp"}),
+      ScheduleRule::AddRFactor(
+          /*max_jobs_per_core=*/16,
+          /*max_innermost_factor=*/Integer(64)),
+      ScheduleRule::MultiLevelTilingWithIntrin(
+          /*intrin_name=*/"dot_16x4_vnni",
+          /*structure=*/"SSRSRS",
+          /*tile_binds=*/NullOpt,
+          /*max_innermost_factor=*/Integer(64),
+          /*vector_load_lens=*/NullOpt,
+          /*reuse_read=*/NullOpt,
+          /*reuse_write=*/
+          Map<String, ObjectRef>{{"req", String("may")},
+                                 {"levels", Array<Integer>{1, 2}},
+                                 {"scope", String("global")}}),
+      ScheduleRule::MultiLevelTiling(
+          /*structure=*/"SSRSRS",
+          /*tile_binds=*/NullOpt,
+          /*max_innermost_factor=*/Integer(64),
+          /*vector_load_lens=*/NullOpt,
+          /*reuse_read=*/NullOpt,
+          /*reuse_write=*/
+          Map<String, ObjectRef>{{"req", String("may")},
+                                 {"levels", Array<Integer>{1, 2}},
+                                 {"scope", String("global")}}),
+      ScheduleRule::ParallelizeVectorizeUnroll(
+          /*max_jobs_per_core=*/16,
+          /*max_vectorize_extent=*/64,
+          /*unroll_max_steps=*/Array<Integer>{0, 16, 64, 512},
+          /*unroll_explicit=*/true),
+      ScheduleRule::RandomComputeLocation(),
+  };
+}
+
 Array<ScheduleRule> ScheduleRule::DefaultCUDA() {
   return {
       ScheduleRule::ApplyCustomRule(),
diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc
index bcc0673e5924..bd124511b83c 100644
--- a/src/meta_schedule/space_generator/space_generator.cc
+++ b/src/meta_schedule/space_generator/space_generator.cc
@@ -23,6 +23,13 @@ namespace meta_schedule {
 
 String GetRuleKindFromTarget(const Target& target) {
   if (target->kind->name == "llvm") {
+    static const PackedFunc* f_check_vnni =
+        runtime::Registry::Get("tvm.topi.x86.utils.target_has_vnni");
+    ICHECK(*f_check_vnni != nullptr) << "The `target_has_vnni` func is not in tvm registry.";
+    if (target->GetAttr<String>("mcpu") &&
+        (*f_check_vnni)(target->GetAttr<String>("mcpu").value())) {
+      return "vnni";
+    }
     return "llvm";
   }
   if (target->kind->name == "hexagon") {
@@ -79,6 +86,10 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) {
       default_sch_rules = ScheduleRule::DefaultHexagon();
       default_postprocs = Postproc::DefaultHexagon();
       default_mutator_probs = Mutator::DefaultHexagon();
+    } else if (kind == "vnni") {
+      default_sch_rules = ScheduleRule::DefaultVNNI();
+      default_postprocs = Postproc::DefaultVNNI();
+      default_mutator_probs = Mutator::DefaultVNNI();
     } else {
       LOG(FATAL) << "Unsupported kind: " << kind;
       throw;

From 034dc67d032aac3b848e15a87a7fbb5b72a0b909 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Tue, 15 Nov 2022 10:30:50 +0000
Subject: [PATCH 574/704] [TFLite] Enable int64 biases for int16 quantized
 operators (#12042)

This enables int64 biases for quantized fully connected, requantize
and transpose convolution in TFLite networks. It goes on top of existing
int16 support for TFLite frontend.

Add a test case using DS_CNN int16 quantized.
---
 python/tvm/relay/frontend/tflite.py           |   6 +-
 src/relay/qnn/op/convolution_transpose.cc     |  10 +-
 src/relay/qnn/op/dense.cc                     |  10 +-
 src/relay/qnn/op/requantize.cc                |   5 +-
 .../test_ethosn/test_convert_equivalents.py   |   4 +-
 tests/python/frontend/tflite/test_forward.py  |  23 +
 tests/python/relay/test_op_qnn_requantize.py  | 495 ++++++++++--------
 7 files changed, 329 insertions(+), 224 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 1915eb9322ff..3d2f4a2f25e6 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -1966,7 +1966,7 @@ def convert_fully_connected(self, op):
                 input_scale=input_tensor.qnn_params["scale"],
                 kernel_scale=weight_tensor.qnn_params["scale"],
                 units=weight_shape[0],
-                out_dtype="int32",
+                out_dtype="int64" if output_tensor_type_str == "int16" else "int32",
             )
         else:
             out = _op.nn.dense(in_expr, weight_expr, units=weight_shape[0])
@@ -1977,7 +1977,7 @@ def convert_fully_connected(self, op):
             if bias_tensor.tensor_idx != -1:
                 bias_tensor_type = bias_tensor.tensor.Type()
                 # bias tensor type should be INT32 (quantization) or FLOAT32
-                assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32)
+                assert bias_tensor_type in (TensorType.INT32, TensorType.INT64, TensorType.FLOAT32)
                 bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type)
                 if self.has_expr(bias_tensor.tensor_idx):
                     bias_expr = self.get_expr(bias_tensor.tensor_idx)
@@ -3175,7 +3175,7 @@ def convert_transpose_conv(self, op):
             bias_tensor = input_tensors[3]
             bias_tensor_type = bias_tensor.tensor.Type()
             # bias tensor type should be INT32 (quantization) or FLOAT32
-            assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32)
+            assert bias_tensor_type in (TensorType.INT32, TensorType.INT64, TensorType.FLOAT32)
             bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type)
             if self.has_expr(bias_tensor.tensor_idx):
                 bias_expr = self.get_expr(bias_tensor.tensor_idx)
diff --git a/src/relay/qnn/op/convolution_transpose.cc b/src/relay/qnn/op/convolution_transpose.cc
index 6163e1c20429..951c1bdfb051 100644
--- a/src/relay/qnn/op/convolution_transpose.cc
+++ b/src/relay/qnn/op/convolution_transpose.cc
@@ -93,12 +93,14 @@ bool QnnConv2DTransposeRel(const Array<Type>& types, int num_inputs, const Attrs
   if (data == nullptr || weight == nullptr) return false;
   const auto* param = attrs.as<Conv2DTransposeAttrs>();
   ICHECK(param != nullptr) << "Conv2DTransposeAttrs cannot be nullptr.";
-  ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8))
-      << "Expected qnn conv2d type(int8, uint8) for input but was " << data->dtype;
+  ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8) ||
+         data->dtype == DataType::Int(16) || data->dtype == DataType::UInt(16))
+      << "Expected qnn conv2d type(int8, uint8, int16) for input but was " << data->dtype;
   ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8))
       << "Expected qnn conv2d type(int8, uint8) for weight but was " << weight->dtype;
-  ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32))
-      << "Expected qnn conv2d type(int32, int16) for output but was " << param->out_dtype;
+  ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32) ||
+         data->dtype == DataType::Int(64))
+      << "Expected qnn conv2d type(int16, int32, int64) for output but was " << param->out_dtype;
   ICHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";
 
   // Check the types of scale and zero points.
diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc
index adaf509e7daf..09d51e3c9ce7 100644
--- a/src/relay/qnn/op/dense.cc
+++ b/src/relay/qnn/op/dense.cc
@@ -47,12 +47,14 @@ bool QnnDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   if (data == nullptr || weight == nullptr) return false;
   const auto* param = attrs.as<DenseAttrs>();
   ICHECK(param != nullptr) << "DenseAttrs cannot be nullptr.";
-  ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8))
-      << "Expected quantized dense type(int8, uint8) for input but was " << data->dtype;
+  ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8) ||
+         data->dtype == DataType::Int(16) || data->dtype == DataType::UInt(16))
+      << "Expected quantized dense type(int8, uint8, int16, uint16) for input but was "
+      << data->dtype;
   ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8))
       << "Expected quantized dense type(int8, uint8) for weight but was " << weight->dtype;
-  ICHECK(param->out_dtype == DataType::Int(32))
-      << "Expected quantized dense type(int32) for output but was " << param->out_dtype;
+  ICHECK(param->out_dtype == DataType::Int(32) || param->out_dtype == DataType::Int(64))
+      << "Expected quantized dense type(int32, int64) for output but was " << param->out_dtype;
 
   // Check the types of scale and zero points.
   for (size_t i = 2; i < 5; ++i) {
diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc
index 1614652719c6..e199ea27f1e4 100644
--- a/src/relay/qnn/op/requantize.cc
+++ b/src/relay/qnn/op/requantize.cc
@@ -480,8 +480,9 @@ bool RequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   }
   const auto in_dtype = data->dtype;
   ICHECK(in_dtype == DataType::Int(8) || in_dtype == DataType::UInt(8) ||
-         in_dtype == DataType::Int(32) || in_dtype == DataType::Int(64))
-      << "Input type should be one of [int8, uint8, int32, int64] but was " << in_dtype;
+         in_dtype == DataType::Int(16) || in_dtype == DataType::Int(32) ||
+         in_dtype == DataType::Int(64))
+      << "Input type should be one of [int8, uint8, int16, int32, int64] but was " << in_dtype;
 
   const RequantizeAttrs* requantize_attrs = attrs.as<RequantizeAttrs>();
   int axis = requantize_attrs->axis;
diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
index 77777293729c..a3e48f4424ad 100644
--- a/tests/python/contrib/test_ethosn/test_convert_equivalents.py
+++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
@@ -227,7 +227,7 @@ def expected():
 @requires_ethosn
 @pytest.mark.parametrize(
     "dtype,shape,constant_shape",
-    [("int16", (1, 16, 12, 4), None)],
+    [("float32", (1, 16, 12, 4), None)],
 )
 def test_unsupported_multiply_to_reinterpret_quantize(dtype, shape, constant_shape):
     """
@@ -445,7 +445,7 @@ def expected():
 @pytest.mark.parametrize(
     "dtype,shape,constant_shape",
     [
-        ("int16", (1, 16, 12, 4), None),
+        ("float32", (1, 16, 12, 4), None),
     ],
 )
 def test_unsupported_add_to_reinterpret_quantize(dtype, shape, constant_shape):
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 7b2bd60d8a20..877406ae2a64 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -4878,6 +4878,28 @@ def representative_dataset():
     tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)
 
 
+def test_forward_ds_cnn_int16():
+    """Test DS_CNN int16 quantized model"""
+    tflite_model_file = download_testdata(
+        "https://github.com/ARM-software/ML-zoo/blob/48f458af1e9065d9aad2ad94d24b58d6e7c00817/"
+        "models/keyword_spotting/ds_cnn_small/tflite_int16/ds_cnn_quantized.tflite?raw=true",
+        "ds_cnn_quantized_int16.tflite",
+    )
+
+    with open(tflite_model_file, "rb") as f:
+        tflite_model_buf = f.read()
+
+    data = np.random.uniform(size=(1, 490)).astype("int16")
+
+    tflite_output = run_tflite_graph(tflite_model_buf, data)
+    tflite_predictions = np.squeeze(tflite_output)
+    tflite_sorted_labels = tflite_predictions.argsort()[-3:][::-1]
+    tvm_output = run_tvm_graph(tflite_model_buf, data, "serving_default_input:0")
+    tvm_predictions = np.squeeze(tvm_output)
+    tvm_sorted_labels = tvm_predictions.argsort()[-3:][::-1]
+    tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)
+
+
 #######################################################################
 # Unidirectional Sequence LSTM
 # ---------------------
@@ -5250,3 +5272,4 @@ def test_forward_nms_v5():
     test_forward_tflite_float16()
 
     test_forward_tflite_int16()
+    test_forward_ds_cnn_int16()
diff --git a/tests/python/relay/test_op_qnn_requantize.py b/tests/python/relay/test_op_qnn_requantize.py
index 64306476dfe9..1dee1f5b619c 100644
--- a/tests/python/relay/test_op_qnn_requantize.py
+++ b/tests/python/relay/test_op_qnn_requantize.py
@@ -23,6 +23,7 @@
 
 roundings = ["UPWARD", "TONEAREST"]
 compute_dtypes = ["float32", "float64", "int64"]
+out_dtypes = ["int8", "int16"]
 
 
 def verify(mod, goldens, target="llvm"):
@@ -83,17 +84,18 @@ def test_same_scale():
     golden_output = golden_data
     for compute_dtype in compute_dtypes:
         for rounding in roundings:
-            mod = get_mod(
-                data_shape=(200,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=0.5,
-                output_scale=0.5,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
-            assert "right_shift" not in mod.astext()
-            verify(mod, (golden_data, golden_output))
+            for qnn_out_dtype in out_dtypes:
+                mod = get_mod(
+                    data_shape=(200,),
+                    data_dtype="int32",
+                    out_dtype=qnn_out_dtype,
+                    input_scale=0.5,
+                    output_scale=0.5,
+                    rounding=rounding,
+                    compute_dtype=compute_dtype,
+                )
+                assert "right_shift" not in mod.astext()
+                verify(mod, (golden_data, golden_output))
 
 
 def test_scalar_same_scale():
@@ -102,75 +104,77 @@ def test_scalar_same_scale():
     golden_output = golden_data
     for compute_dtype in compute_dtypes:
         for rounding in roundings:
-            mod = get_mod(
-                data_shape=(),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=0.5,
-                output_scale=0.5,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
-            assert "right_shift" not in mod.astext()
-            verify(mod, (golden_data, golden_output))
+            for qnn_out_dtype in out_dtypes:
+                mod = get_mod(
+                    data_shape=(),
+                    data_dtype="int32",
+                    out_dtype=qnn_out_dtype,
+                    input_scale=0.5,
+                    output_scale=0.5,
+                    rounding=rounding,
+                    compute_dtype=compute_dtype,
+                )
+                assert "right_shift" not in mod.astext()
+                verify(mod, (golden_data, golden_output))
 
 
 def test_downscale():
     for compute_dtype in compute_dtypes:
         for rounding in roundings:
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=1,
-                output_scale=16,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
+            for qnn_out_dtype in out_dtypes:
+                mod = get_mod(
+                    data_shape=(32,),
+                    data_dtype="int32",
+                    out_dtype=qnn_out_dtype,
+                    input_scale=1,
+                    output_scale=16,
+                    rounding=rounding,
+                    compute_dtype=compute_dtype,
+                )
 
-            # Try positive values
-            # 8 corresponds to 0.5, resulting in 1
-            golden_data = np.arange(0, 32, 1).astype("int32")
-            golden_output = np.repeat([0, 1, 2], [8, 16, 8])
-            verify(mod, (golden_data, golden_output))
+                # Try positive values
+                # 8 corresponds to 0.5, resulting in 1
+                golden_data = np.arange(0, 32, 1).astype("int32")
+                golden_output = np.repeat([0, 1, 2], [8, 16, 8])
+                verify(mod, (golden_data, golden_output))
 
-            # Try negative values
-            # -8 corresponds to -0.5. For UPWARD, this is 0
-            golden_data = np.arange(0, -32, -1).astype("int32")
-            if rounding == "UPWARD":
-                golden_output = np.repeat([0, -1, -2], [9, 16, 7])
-            else:
-                golden_output = np.repeat([0, -1, -2], [8, 16, 8])
-            verify(mod, (golden_data, golden_output))
+                # Try negative values
+                # -8 corresponds to -0.5. For UPWARD, this is 0
+                golden_data = np.arange(0, -32, -1).astype("int32")
+                if rounding == "UPWARD":
+                    golden_output = np.repeat([0, -1, -2], [9, 16, 7])
+                else:
+                    golden_output = np.repeat([0, -1, -2], [8, 16, 8])
+                verify(mod, (golden_data, golden_output))
 
-            # Try a different scale
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=1,
-                output_scale=4,
-                rounding=rounding,
-            )
+                # Try a different scale
+                mod = get_mod(
+                    data_shape=(32,),
+                    data_dtype="int32",
+                    out_dtype=qnn_out_dtype,
+                    input_scale=1,
+                    output_scale=4,
+                    rounding=rounding,
+                )
 
-            # Try positive values
-            # 2I corresponds to 0.5, resulting in 1
-            golden_data = np.arange(0, 32, 1).astype("int32")
-            golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], [2, 4, 4, 4, 4, 4, 4, 4, 2])
-            verify(mod, (golden_data, golden_output))
+                # Try positive values
+                # 2I corresponds to 0.5, resulting in 1
+                golden_data = np.arange(0, 32, 1).astype("int32")
+                golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], [2, 4, 4, 4, 4, 4, 4, 4, 2])
+                verify(mod, (golden_data, golden_output))
 
-            # Try negative values
-            # -8 corresponds to -0.5. For UPWARD, this is 0
-            golden_data = np.arange(0, -32, -1).astype("int32")
-            if rounding == "UPWARD":
-                golden_output = np.repeat(
-                    [0, -1, -2, -3, -4, -5, -6, -7, -8], [3, 4, 4, 4, 4, 4, 4, 4, 1]
-                )
-            else:
-                golden_output = np.repeat(
-                    [0, -1, -2, -3, -4, -5, -6, -7, -8], [2, 4, 4, 4, 4, 4, 4, 4, 2]
-                )
-            verify(mod, (golden_data, golden_output))
+                # Try negative values
+                # -8 corresponds to -0.5. For UPWARD, this is 0
+                golden_data = np.arange(0, -32, -1).astype("int32")
+                if rounding == "UPWARD":
+                    golden_output = np.repeat(
+                        [0, -1, -2, -3, -4, -5, -6, -7, -8], [3, 4, 4, 4, 4, 4, 4, 4, 1]
+                    )
+                else:
+                    golden_output = np.repeat(
+                        [0, -1, -2, -3, -4, -5, -6, -7, -8], [2, 4, 4, 4, 4, 4, 4, 4, 2]
+                    )
+                verify(mod, (golden_data, golden_output))
 
             # Try uint8 out_dtype
             mod = get_mod(
@@ -208,74 +212,76 @@ def test_downscale():
 def test_upscale():
     for compute_dtype in compute_dtypes:
         for rounding in roundings:
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=2,
-                output_scale=1,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
+            for qnn_out_dtype in out_dtypes:
+                mod = get_mod(
+                    data_shape=(32,),
+                    data_dtype="int32",
+                    out_dtype=qnn_out_dtype,
+                    input_scale=2,
+                    output_scale=1,
+                    rounding=rounding,
+                    compute_dtype=compute_dtype,
+                )
 
-            # Try positive values
-            # 8 corresponds to 0.5, resulting in 1
-            golden_data = np.arange(0, 32, 1).astype("int32")
-            golden_output = np.multiply(2, golden_data)
-            verify(mod, (golden_data, golden_output))
+                # Try positive values
+                # 8 corresponds to 0.5, resulting in 1
+                golden_data = np.arange(0, 32, 1).astype("int32")
+                golden_output = np.multiply(2, golden_data)
+                verify(mod, (golden_data, golden_output))
 
-            # Try negative values
-            # -8 corresponds to -0.5. For UPWARD, this is 0
-            golden_data = np.arange(0, -32, -1).astype("int32")
-            golden_output = np.multiply(2, golden_data)
-            verify(mod, (golden_data, golden_output))
+                # Try negative values
+                # -8 corresponds to -0.5. For UPWARD, this is 0
+                golden_data = np.arange(0, -32, -1).astype("int32")
+                golden_output = np.multiply(2, golden_data)
+                verify(mod, (golden_data, golden_output))
 
 
 def test_non_power_of_two():
     for compute_dtype in compute_dtypes:
         for rounding in roundings:
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=1,
-                output_scale=3,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
+            for qnn_out_dtype in out_dtypes:
+                mod = get_mod(
+                    data_shape=(32,),
+                    data_dtype="int32",
+                    out_dtype=qnn_out_dtype,
+                    input_scale=1,
+                    output_scale=3,
+                    rounding=rounding,
+                    compute_dtype=compute_dtype,
+                )
 
-            # Try positive values
-            golden_data = np.multiply(np.arange(0, 32, 1).astype("int32"), 3)
-            golden_output = np.arange(0, 32, 1)
-            verify(mod, (golden_data, golden_output))
+                # Try positive values
+                golden_data = np.multiply(np.arange(0, 32, 1).astype("int32"), 3)
+                golden_output = np.arange(0, 32, 1)
+                verify(mod, (golden_data, golden_output))
 
-            # Try negative values
-            golden_data = np.multiply(np.arange(0, -32, -1).astype("int32"), 3)
-            golden_output = np.arange(0, -32, -1)
-            verify(mod, (golden_data, golden_output))
+                # Try negative values
+                golden_data = np.multiply(np.arange(0, -32, -1).astype("int32"), 3)
+                golden_output = np.arange(0, -32, -1)
+                verify(mod, (golden_data, golden_output))
 
-            # Try a different scale
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=3,
-                output_scale=1,
-                rounding=rounding,
-            )
+                # Try a different scale
+                mod = get_mod(
+                    data_shape=(32,),
+                    data_dtype="int32",
+                    out_dtype=qnn_out_dtype,
+                    input_scale=3,
+                    output_scale=1,
+                    rounding=rounding,
+                )
 
-            # Try positive values
-            golden_data = np.arange(0, 32, 1).astype("int32")
-            golden_output = np.multiply(golden_data, 3)
-            verify(mod, (golden_data, golden_output))
+                # Try positive values
+                golden_data = np.arange(0, 32, 1).astype("int32")
+                golden_output = np.multiply(golden_data, 3)
+                verify(mod, (golden_data, golden_output))
 
-            # Try negative values
-            golden_data = np.arange(0, -32, -1).astype("int32")
-            golden_output = np.multiply(golden_data, 3)
-            verify(mod, (golden_data, golden_output))
+                # Try negative values
+                golden_data = np.arange(0, -32, -1).astype("int32")
+                golden_output = np.multiply(golden_data, 3)
+                verify(mod, (golden_data, golden_output))
 
 
-def test_saturation():
+def test_saturation_int8():
     for compute_dtype in compute_dtypes:
         for rounding in roundings:
             mod = get_mod(
@@ -322,6 +328,70 @@ def test_saturation():
             verify(mod, (golden_data, golden_output))
 
 
+def test_saturation_int16():
+    for compute_dtype in compute_dtypes:
+        for rounding in roundings:
+            mod = get_mod(
+                data_shape=(16,),
+                data_dtype="int32",
+                out_dtype="int16",
+                input_scale=0.5,
+                output_scale=0.5,
+                rounding=rounding,
+                compute_dtype=compute_dtype,
+            )
+            golden_data = np.arange(0, 16, 1).astype("int32")
+            golden_data = np.add(32760, golden_data)
+            output = np.array(
+                [
+                    32760,
+                    32761,
+                    32762,
+                    32763,
+                    32764,
+                    32765,
+                    32766,
+                    32767,
+                    32767,
+                    32767,
+                    32767,
+                    32767,
+                    32767,
+                    32767,
+                    32767,
+                    32767,
+                ]
+            )
+            golden_output = output
+            verify(mod, (golden_data, golden_output))
+
+            # Try negative numbers
+            golden_data = np.arange(0, -16, -1).astype("int32")
+            golden_data = np.add(-32760, golden_data)
+            output = np.array(
+                [
+                    -32760,
+                    -32761,
+                    -32762,
+                    -32763,
+                    -32764,
+                    -32765,
+                    -32766,
+                    -32767,
+                    -32768,
+                    -32768,
+                    -32768,
+                    -32768,
+                    -32768,
+                    -32768,
+                    -32768,
+                    -32768,
+                ]
+            )
+            golden_output = output
+            verify(mod, (golden_data, golden_output))
+
+
 def test_zero_point():
     # Output zero point
     for compute_dtype in compute_dtypes:
@@ -357,31 +427,32 @@ def test_zero_point():
     # Input zero point
     for compute_dtype in compute_dtypes:
         for rounding in roundings:
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=1,
-                output_scale=16,
-                input_zero_point=16,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
+            for qnn_out_dtype in out_dtypes:
+                mod = get_mod(
+                    data_shape=(32,),
+                    data_dtype="int32",
+                    out_dtype=qnn_out_dtype,
+                    input_scale=1,
+                    output_scale=16,
+                    input_zero_point=16,
+                    rounding=rounding,
+                    compute_dtype=compute_dtype,
+                )
 
-            # Try positive values
-            golden_data = np.arange(32, 64, 1).astype("int32")
-            golden_output = np.repeat([2, 3, 4], [8, 16, 8])
-            golden_output = np.subtract(golden_output, 1)
-            verify(mod, (golden_data, golden_output))
+                # Try positive values
+                golden_data = np.arange(32, 64, 1).astype("int32")
+                golden_output = np.repeat([2, 3, 4], [8, 16, 8])
+                golden_output = np.subtract(golden_output, 1)
+                verify(mod, (golden_data, golden_output))
 
-            # Try negative values
-            golden_data = np.arange(-32, -64, -1).astype("int32")
-            if rounding == "UPWARD":
-                golden_output = np.repeat([-2, -3, -4], [9, 16, 7])
-            else:
-                golden_output = np.repeat([-2, -3, -4], [8, 16, 8])
-            golden_output = np.subtract(golden_output, 1)
-            verify(mod, (golden_data, golden_output))
+                # Try negative values
+                golden_data = np.arange(-32, -64, -1).astype("int32")
+                if rounding == "UPWARD":
+                    golden_output = np.repeat([-2, -3, -4], [9, 16, 7])
+                else:
+                    golden_output = np.repeat([-2, -3, -4], [8, 16, 8])
+                golden_output = np.subtract(golden_output, 1)
+                verify(mod, (golden_data, golden_output))
 
 
 def test_per_channel_same_scale():
@@ -390,17 +461,18 @@ def test_per_channel_same_scale():
     golden_output = golden_data
     for compute_dtype in compute_dtypes:
         for rounding in roundings:
-            mod = get_mod(
-                data_shape=(5, 2),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=[0.5, 0.5],
-                output_scale=0.5,
-                axis=1,
-                rounding=rounding,
-                compute_dtype=compute_dtype,
-            )
-            verify(mod, (golden_data, golden_output))
+            for qnn_out_dtype in out_dtypes:
+                mod = get_mod(
+                    data_shape=(5, 2),
+                    data_dtype="int32",
+                    out_dtype=qnn_out_dtype,
+                    input_scale=[0.5, 0.5],
+                    output_scale=0.5,
+                    axis=1,
+                    rounding=rounding,
+                    compute_dtype=compute_dtype,
+                )
+                verify(mod, (golden_data, golden_output))
 
     # Change axis
     golden_data = np.arange(-10, 10, 1).astype("int32").reshape((2, 2, 5))
@@ -480,88 +552,93 @@ def test_per_channel_different_scale():
 
 
 def test_default_cfg_and_no_args():
-    mod = get_mod(
-        data_shape=(32,),
-        data_dtype="int32",
-        out_dtype="int8",
-        input_scale=1,
-        output_scale=16,
-    )
-    golden_data = np.arange(0, -32, -1).astype("int32")
-    golden_output = np.repeat([0, -1, -2], [9, 16, 7])
-    verify(mod, (golden_data, golden_output))
+    for qnn_out_dtype in out_dtypes:
+        mod = get_mod(
+            data_shape=(32,),
+            data_dtype="int32",
+            out_dtype=qnn_out_dtype,
+            input_scale=1,
+            output_scale=16,
+        )
+        golden_data = np.arange(0, -32, -1).astype("int32")
+        golden_output = np.repeat([0, -1, -2], [9, 16, 7])
+        verify(mod, (golden_data, golden_output))
 
 
 def test_non_default_cfg_and_no_args():
     for rounding_cfg in roundings:
-        with relay.qnn.op.requantize_config(rounding=rounding_cfg):
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=1,
-                output_scale=16,
-            )
+        for qnn_out_dtype in out_dtypes:
+            with relay.qnn.op.requantize_config(rounding=rounding_cfg):
+                mod = get_mod(
+                    data_shape=(32,),
+                    data_dtype="int32",
+                    out_dtype=qnn_out_dtype,
+                    input_scale=1,
+                    output_scale=16,
+                )
 
-            golden_data = np.arange(0, -32, -1).astype("int32")
+                golden_data = np.arange(0, -32, -1).astype("int32")
 
-            if rounding_cfg == "UPWARD":
-                golden_output = np.repeat([0, -1, -2], [9, 16, 7])
-            else:
-                golden_output = np.repeat([0, -1, -2], [8, 16, 8])
-            verify(mod, (golden_data, golden_output))
+                if rounding_cfg == "UPWARD":
+                    golden_output = np.repeat([0, -1, -2], [9, 16, 7])
+                else:
+                    golden_output = np.repeat([0, -1, -2], [8, 16, 8])
+                verify(mod, (golden_data, golden_output))
 
 
 def test_default_cfg_and_args():
     for rounding in roundings:
-        with relay.qnn.op.requantize_config(rounding="UPWARD"):
-            mod = get_mod(
-                data_shape=(32,),
-                data_dtype="int32",
-                out_dtype="int8",
-                input_scale=1,
-                output_scale=16,
-                rounding=rounding,
-            )
-
-            golden_data = np.arange(0, -32, -1).astype("int32")
-
-            if rounding == "UPWARD":
-                golden_output = np.repeat([0, -1, -2], [9, 16, 7])
-            else:
-                golden_output = np.repeat([0, -1, -2], [8, 16, 8])
-            verify(mod, (golden_data, golden_output))
-
-
-def test_non_default_cfg_and_args():
-    for rounding_arg in roundings:
-        for rounding_cfg in roundings:
-            with relay.qnn.op.requantize_config(rounding=rounding_cfg):
+        for qnn_out_dtype in out_dtypes:
+            with relay.qnn.op.requantize_config(rounding="UPWARD"):
                 mod = get_mod(
                     data_shape=(32,),
                     data_dtype="int32",
-                    out_dtype="int8",
+                    out_dtype=qnn_out_dtype,
                     input_scale=1,
                     output_scale=16,
-                    rounding=rounding_arg,
+                    rounding=rounding,
                 )
 
                 golden_data = np.arange(0, -32, -1).astype("int32")
 
-                if rounding_arg == "UPWARD":
+                if rounding == "UPWARD":
                     golden_output = np.repeat([0, -1, -2], [9, 16, 7])
                 else:
                     golden_output = np.repeat([0, -1, -2], [8, 16, 8])
                 verify(mod, (golden_data, golden_output))
 
 
+def test_non_default_cfg_and_args():
+    for rounding_arg in roundings:
+        for rounding_cfg in roundings:
+            for qnn_out_dtype in out_dtypes:
+                with relay.qnn.op.requantize_config(rounding=rounding_cfg):
+                    mod = get_mod(
+                        data_shape=(32,),
+                        data_dtype="int32",
+                        out_dtype=qnn_out_dtype,
+                        input_scale=1,
+                        output_scale=16,
+                        rounding=rounding_arg,
+                    )
+
+                    golden_data = np.arange(0, -32, -1).astype("int32")
+
+                    if rounding_arg == "UPWARD":
+                        golden_output = np.repeat([0, -1, -2], [9, 16, 7])
+                    else:
+                        golden_output = np.repeat([0, -1, -2], [8, 16, 8])
+                    verify(mod, (golden_data, golden_output))
+
+
 if __name__ == "__main__":
     test_same_scale()
     test_scalar_same_scale()
     test_downscale()
     test_upscale()
     test_non_power_of_two()
-    test_saturation()
+    test_saturation_int8()
+    test_saturation_int16()
     test_zero_point()
     test_per_channel_same_scale()
     test_per_channel_different_scale()

From 4fd34b1bad12bc391c874a9cb656725dd8976df1 Mon Sep 17 00:00:00 2001
From: Lry89757 <77330637+LRY89757@users.noreply.github.com>
Date: Wed, 16 Nov 2022 01:39:08 +0800
Subject: [PATCH 575/704] [Fix][Warning] tvm.target.create() deprecated
 (#13391)

Update the example with the newer API.
---
 apps/howto_deploy/prepare_test_libs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/howto_deploy/prepare_test_libs.py b/apps/howto_deploy/prepare_test_libs.py
index 8e9f8b5f7335..f5afc5bf6750 100644
--- a/apps/howto_deploy/prepare_test_libs.py
+++ b/apps/howto_deploy/prepare_test_libs.py
@@ -44,7 +44,7 @@ def prepare_graph_lib(base_path):
     params = {"y": np.ones((2, 2), dtype="float32")}
     mod = tvm.IRModule.from_expr(relay.Function([x, y], x + y))
     # build a module
-    compiled_lib = relay.build(mod, tvm.target.create("llvm"), params=params)
+    compiled_lib = relay.build(mod, tvm.target.Target("llvm"), params=params)
     # export it as a shared library
     # If you are running cross compilation, you can also consider export
     # to tar and invoke host compiler later.

From 41b7a4508383d80ea8e380b44d48f89e42ed125e Mon Sep 17 00:00:00 2001
From: qzylalala <304228244@qq.com>
Date: Wed, 16 Nov 2022 01:39:55 +0800
Subject: [PATCH 576/704] feat: fix pylint issues for conftest (#13386)

* feat: fix pylint issues for conftest

* fix: add a trailing newline
---
 tests/lint/pylint.sh     |  1 +
 tests/python/conftest.py | 37 ++++++++++++++++++-------------------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh
index 2b0b8365649d..d716ddea7ba1 100755
--- a/tests/lint/pylint.sh
+++ b/tests/lint/pylint.sh
@@ -25,6 +25,7 @@ python3 -m pylint tests/python/contrib/test_ethosn --rcfile="$(dirname "$0")"/py
 python3 -m pylint tests/python/relay/aot/*.py --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/ci --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc
+python3 -m pylint tests/python/conftest.py --rcfile="$(dirname "$0")"/pylintrc
 
 # tests/python/contrib/test_hexagon tests
 python3 -m pylint tests/python/contrib/test_hexagon/*.py --rcfile="$(dirname "$0")"/pylintrc
diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 0dbb3dcc79e8..2da5be3d0494 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -14,30 +14,29 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
+"""Configure pytest"""
 import sys
-import tvm
 import pytest
 
-collect_ignore = []
+COLLECT_IGNORE = []
 if sys.platform.startswith("win"):
-    collect_ignore.append("frontend/caffe")
-    collect_ignore.append("frontend/caffe2")
-    collect_ignore.append("frontend/coreml")
-    collect_ignore.append("frontend/darknet")
-    collect_ignore.append("frontend/keras")
-    collect_ignore.append("frontend/mxnet")
-    collect_ignore.append("frontend/pytorch")
-    collect_ignore.append("frontend/tensorflow")
-    collect_ignore.append("frontend/tflite")
-    collect_ignore.append("frontend/onnx")
-    collect_ignore.append("driver/tvmc/test_autoscheduler.py")
-    collect_ignore.append("unittest/test_auto_scheduler_cost_model.py")  # stack overflow
-    # collect_ignore.append("unittest/test_auto_scheduler_measure.py") # exception ignored
-    collect_ignore.append("unittest/test_auto_scheduler_search_policy.py")  # stack overflow
-    # collect_ignore.append("unittest/test_auto_scheduler_measure.py") # exception ignored
+    COLLECT_IGNORE.append("frontend/caffe")
+    COLLECT_IGNORE.append("frontend/caffe2")
+    COLLECT_IGNORE.append("frontend/coreml")
+    COLLECT_IGNORE.append("frontend/darknet")
+    COLLECT_IGNORE.append("frontend/keras")
+    COLLECT_IGNORE.append("frontend/mxnet")
+    COLLECT_IGNORE.append("frontend/pytorch")
+    COLLECT_IGNORE.append("frontend/tensorflow")
+    COLLECT_IGNORE.append("frontend/tflite")
+    COLLECT_IGNORE.append("frontend/onnx")
+    COLLECT_IGNORE.append("driver/tvmc/test_autoscheduler.py")
+    COLLECT_IGNORE.append("unittest/test_auto_scheduler_cost_model.py")  # stack overflow
+    # COLLECT_IGNORE.append("unittest/test_auto_scheduler_measure.py") # exception ignored
+    COLLECT_IGNORE.append("unittest/test_auto_scheduler_search_policy.py")  # stack overflow
+    # COLLECT_IGNORE.append("unittest/test_auto_scheduler_measure.py") # exception ignored
 
-    collect_ignore.append("unittest/test_tir_intrin.py")
+    COLLECT_IGNORE.append("unittest/test_tir_intrin.py")
 
 
 def pytest_addoption(parser):

From 24790d1d56c9aa8baceb054383a626dba604b959 Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Tue, 15 Nov 2022 23:11:56 +0530
Subject: [PATCH 577/704] [RUNTIME][ALIGNMENT] Configurable kAllocAlignment if
 needed (#13307)

Not all plarforms 64bit aligned allocations. Platforms with 32bit alignment fail to support
set_input_zero_copy even though the ndarray is allocated by the tvm runtime itself.

This change enabled configurable option for such targets.

Co-authored-by: Siva Rama Krishna Reddy B <sivb@blr-ubuntu-ripper.qualcomm.com>
---
 CMakeLists.txt                          | 5 +++++
 cmake/config.cmake                      | 3 +++
 include/tvm/runtime/device_api.h        | 8 ++++++++
 tests/scripts/task_build_adreno_bins.sh | 1 +
 4 files changed, 17 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d0e45c3d3a41..b8d8f4c0239c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -463,6 +463,11 @@ if(USE_PIPELINE_EXECUTOR)
   list(APPEND RUNTIME_SRCS ${RUNTIME_PIPELINE_SRCS})
 endif(USE_PIPELINE_EXECUTOR)
 
+if(USE_KALLOC_ALIGNMENT)
+  message(STATUS "Build Alloc alignment set to ${USE_KALLOC_ALIGNMENT}")
+  add_definitions(-DTVM_KALLOC_ALIGNMENT=${USE_KALLOC_ALIGNMENT})
+endif(USE_KALLOC_ALIGNMENT)
+
 # Caches the build.
 # Note that ccache-3.x doesn't support nvcc well, so CUDA kernels may never hit the cache and still
 # need to be re-compiled every time. Using ccache 4.0+ can resolve this issue.
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 0b72047603f0..22a548d29895 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -401,3 +401,6 @@ set(USE_LIBTORCH OFF)
 
 # Whether to use the Universal Modular Accelerator Interface
 set(USE_UMA OFF)
+
+# Set custom Alloc Alignment for device allocated memory ndarray points to
+set(USE_KALLOC_ALIGNMENT 64)
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index 9613563f95b4..e517eb0d7f34 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -51,11 +51,19 @@ enum DeviceAttrKind : int {
   kDriverVersion = 12
 };
 
+#ifdef TVM_KALLOC_ALIGNMENT
+/*! \brief Number of bytes each allocation must align to */
+constexpr int kAllocAlignment = TVM_KALLOC_ALIGNMENT;
+
+/*! \brief Number of bytes each allocation must align to in temporary allocation */
+constexpr int kTempAllocaAlignment = TVM_KALLOC_ALIGNMENT;
+#else
 /*! \brief Number of bytes each allocation must align to */
 constexpr int kAllocAlignment = 64;
 
 /*! \brief Number of bytes each allocation must align to in temporary allocation */
 constexpr int kTempAllocaAlignment = 64;
+#endif  // TVM_KALLOC_ALIGNMENT
 
 /*! \brief Maximum size that can be allocated on stack */
 constexpr int kMaxStackAlloca = 1024;
diff --git a/tests/scripts/task_build_adreno_bins.sh b/tests/scripts/task_build_adreno_bins.sh
index 5d453251606a..6a9bbd9554f1 100755
--- a/tests/scripts/task_build_adreno_bins.sh
+++ b/tests/scripts/task_build_adreno_bins.sh
@@ -34,6 +34,7 @@ echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_CPP_RPC ON\) >> config.cmake
 echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
+echo set\(USE_KALLOC_ALIGNMENT 32\) >> config.cmake
 
 echo set\(ANDROID_ABI arm64-v8a\) >> config.cmake
 echo set\(ANDROID_PLATFORM android-28\) >> config.cmake

From 72c60ad78a4cd9ca055f567ffd762462a2b85428 Mon Sep 17 00:00:00 2001
From: Matthew Barrett <55580676+mbaret@users.noreply.github.com>
Date: Tue, 15 Nov 2022 20:20:46 +0000
Subject: [PATCH 578/704] [AOT][FIX] Handle device contexts properly in
 CreateFunctionMetadata (#13392)

Device contexts created handles with no corresponding buffer which
would confuse CreateFunctionMetadata causing a segfault. The
logic has been changed to check that a buffer exists and a test
updated to include this case.
---
 src/relay/backend/aot/create_function_metadata.cc           | 3 +--
 tests/python/relay/aot/test_aot_create_function_metadata.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/relay/backend/aot/create_function_metadata.cc b/src/relay/backend/aot/create_function_metadata.cc
index 54fd270c1b25..2ef5e495abca 100644
--- a/src/relay/backend/aot/create_function_metadata.cc
+++ b/src/relay/backend/aot/create_function_metadata.cc
@@ -62,8 +62,7 @@ Map<String, backend::FunctionInfo> CalculateFunctionInfos(const IRModule& mod,
       auto params = pfunc->params;
       int64_t total_io_bytes = 0;
       for (const auto& param : params) {
-        // Inputs/outputs will be handles, workspaces are pointers
-        if (param->dtype.is_handle()) {
+        if (pfunc->buffer_map.find(param) != pfunc->buffer_map.end()) {
           auto buffer = pfunc->buffer_map[param];
           total_io_bytes += GetMemorySizeBytes(buffer->shape, buffer->dtype);
         }
diff --git a/tests/python/relay/aot/test_aot_create_function_metadata.py b/tests/python/relay/aot/test_aot_create_function_metadata.py
index ff2a522572c5..80137bd23f0c 100644
--- a/tests/python/relay/aot/test_aot_create_function_metadata.py
+++ b/tests/python/relay/aot/test_aot_create_function_metadata.py
@@ -264,7 +264,7 @@ def __tvm_main__(a: T.handle, output: T.handle) -> None:
             T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, a_buffer.data, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32"))
 
         @T.prim_func
-        def test_fused_add(a: T.handle, b: T.handle, output: T.handle) -> None:
+        def test_fused_add(a: T.handle, b: T.handle, output: T.handle, device_context_unused: T.handle) -> None:
             # function attr dict
             T.func_attr({"global_symbol": "test_mod_test_fused_add", "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]})})
             a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16)

From 8c30bda738eb0b07c0457b6ee651f3f32857903b Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 16 Nov 2022 05:25:48 +0900
Subject: [PATCH 579/704] [MetaSchedule] Support schedules with cache read in
 RewriteLayout (#13384)

Currently when `CacheRead` and `RewriteLayout` are used together, the index map is derived based on the cache read block, which leads to weird result. This is because the current implementation assumes that the "layout-free" buffer is directly consumed by an "anchor" op such as conv2d or dense.

When `CacheRead` is involved, we need to find the index map for the cache-read buffer as it is consumed by an anchor op, and apply the same transformation to the layout-free buffer. My solution supports more general cases where there are multiple cache reads forming a "chain" of blocks, starting from the one that directly consumes the layout-free buffer passed as a parameter. So the layout transformation is back propagated over such chain.
---
 src/meta_schedule/postproc/rewrite_layout.cc  | 159 ++++++----
 src/te/operation/create_primfunc.cc           |   9 +-
 ...t_meta_schedule_postproc_rewrite_layout.py | 276 ++++++++++++++++++
 .../unittest/test_te_create_primfunc.py       |   1 +
 4 files changed, 390 insertions(+), 55 deletions(-)

diff --git a/src/meta_schedule/postproc/rewrite_layout.cc b/src/meta_schedule/postproc/rewrite_layout.cc
index 3aed6680e30d..71ae43387112 100644
--- a/src/meta_schedule/postproc/rewrite_layout.cc
+++ b/src/meta_schedule/postproc/rewrite_layout.cc
@@ -16,6 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include <optional>
 #include <unordered_set>
 
 #include "../utils.h"
@@ -25,23 +26,15 @@ namespace tir {
 
 /*!
  * \brief Collect the block and index where the buffer is read.
- * \note The buffers are expected to be read by only one BufferLoad
+ * \note The buffer is expected to be read by only one BufferLoad
  */
 class BufferReadPosCollector : public StmtExprVisitor {
  public:
-  explicit BufferReadPosCollector(const Array<Buffer>& buffers) {
-    for (const Buffer& buf : buffers) {
-      buffers_.insert(buf.get());
-    }
-  }
+  explicit BufferReadPosCollector(const Buffer& buffer) : buffer_(buffer.get()) {}
 
-  const std::unordered_map<const BufferNode*, std::pair<Block, int>>& GetBufferLocations() const {
-    return buffer_locs_;
-  }
+  const std::pair<Block, int>& GetBufferLocation() const { return buffer_loc_; }
 
-  const std::unordered_map<const BufferNode*, Optional<IndexMap>>& GetBufferIndexMap() const {
-    return buffer_index_maps_;
-  }
+  const Optional<IndexMap> GetBufferIndexMap() const { return buffer_index_map_; }
 
  private:
   void VisitStmt_(const ForNode* op) final {
@@ -61,7 +54,7 @@ class BufferReadPosCollector : public StmtExprVisitor {
     CHECK(cur_realize_.defined()) << "BufferLoad occurred outside of any block";
 
     const Buffer& buffer = op->buffer;
-    if (buffers_.count(buffer.get())) {
+    if (buffer_ == buffer.get()) {
       Map<Var, PrimExpr> subst_map;
       for (size_t i = 0; i < cur_realize_->iter_values.size(); i++) {
         const Var& var = cur_realize_->block->iter_vars[i]->var;
@@ -72,14 +65,14 @@ class BufferReadPosCollector : public StmtExprVisitor {
       for (const PrimExpr& e : op->indices) {
         subst_indices.push_back(Substitute(e, subst_map));
       }
-      buffer_index_maps_[buffer.get()] = SuggestIndexMap(/*buffer=*/buffer,                      //
-                                                         /*indices=*/subst_indices,              //
-                                                         /*loops=*/loop_stack_,                  //
-                                                         /*predicate=*/cur_realize_->predicate,  //
-                                                         /*analyzer=*/&analyzer_);
+      buffer_index_map_ = SuggestIndexMap(/*buffer=*/buffer,                      //
+                                          /*indices=*/subst_indices,              //
+                                          /*loops=*/loop_stack_,                  //
+                                          /*predicate=*/cur_realize_->predicate,  //
+                                          /*analyzer=*/&analyzer_);
       int buffer_index = GetReadBufferIndex(cur_realize_->block, buffer);
       ICHECK(buffer_index != -1);
-      buffer_locs_[buffer.get()] = std::make_pair(cur_realize_->block, buffer_index);
+      buffer_loc_ = std::make_pair(cur_realize_->block, buffer_index);
     }
   }
 
@@ -93,12 +86,12 @@ class BufferReadPosCollector : public StmtExprVisitor {
   }
 
  private:
-  /*! \brief All interested buffer. */
-  std::unordered_set<const BufferNode*> buffers_;
-  /*! \brief The result mapping from buffer to its inner-most block and read index. */
-  std::unordered_map<const BufferNode*, std::pair<Block, int>> buffer_locs_;
-  /*! \brief The result mapping from buffer to its IndexMap. */
-  std::unordered_map<const BufferNode*, Optional<IndexMap>> buffer_index_maps_;
+  /*! \brief The buffer of interest. */
+  const BufferNode* buffer_;
+  /*! \brief The block that consumes the buffer and the corresponding read index. */
+  std::pair<Block, int> buffer_loc_;
+  /*! \brief The proposed IndexMap. */
+  Optional<IndexMap> buffer_index_map_;
 
   /*! \brief Loop stack for calculating IndexMap. */
   Array<For> loop_stack_;
@@ -143,8 +136,56 @@ Array<Buffer> CollectLayoutFreeBuffers(const PrimFuncNode* func) {
   return layout_free_buffers;
 }
 
+std::optional<std::tuple<Block, int, IndexMap>> GetSuggestedIndexMap(
+    Buffer buffer, const PrimFuncNode* prim_func) {
+  BufferReadPosCollector collector(buffer);
+  collector(prim_func->body);
+
+  const auto& index_map = collector.GetBufferIndexMap();
+
+  if (!index_map.defined() || !index_map) {
+    return std::nullopt;
+  }
+
+  const auto& [anchor_block, buffer_index] = collector.GetBufferLocation();
+
+  return std::make_tuple(anchor_block, buffer_index, index_map.value());
+}
+
+/*! \brief Get a chain of cache-read blocks, starting from the one consuming buf. */
+std::vector<std::string> GetCacheReadChain(const Buffer& buf, const PrimFuncNode* prim_func) {
+  class BufferReadChainCollector : public StmtVisitor {
+   public:
+    explicit BufferReadChainCollector(const Buffer& buffer) : cur_buffer_(buffer.get()) {}
+
+    void VisitStmt_(const BlockNode* op) final {
+      // Check if this block is doing cache_read or a similar operation that consumes cur_buffer_.
+      if (!op->init && op->reads.size() == 1 && op->writes.size() == 1 &&
+          op->reads[0]->buffer.get() == cur_buffer_) {
+        cache_read_chain.push_back(op->name_hint);
+        cur_buffer_ = op->writes[0]->buffer.get();
+      }
+      StmtVisitor::VisitStmt_(op);
+    }
+
+    std::vector<std::string> cache_read_chain;
+
+   private:
+    const BufferNode* cur_buffer_;
+  };
+
+  BufferReadChainCollector collector(buf);
+  collector(prim_func->body);
+  return collector.cache_read_chain;
+}
+
 bool RewriteLayout(const Schedule& sch) {
   std::vector<std::pair<StmtSRef, String>> results;
+  auto add_layout_rewrite_block = [&sch](BlockRV consumer_block_rv, int buffer_index) {
+    BlockRV rewrite_block_rv = sch->CacheRead(consumer_block_rv, buffer_index, "global");
+    sch->Annotate(rewrite_block_rv, attr::meta_schedule_layout_rewrite_preproc, const_true());
+  };
+
   for (const auto& [g_var, base_func] : sch->mod()->functions) {
     const String& func_name = g_var->name_hint;
     const auto* prim_func = base_func.as<PrimFuncNode>();
@@ -153,36 +194,46 @@ bool RewriteLayout(const Schedule& sch) {
       continue;
     }
 
-    Array<Buffer> layout_free_buffers = CollectLayoutFreeBuffers(prim_func);
-
-    // Collect Buffer read positions
-    BufferReadPosCollector collector(layout_free_buffers);
-    collector(prim_func->body);
-    const auto& locations = collector.GetBufferLocations();
-    const auto& index_maps = collector.GetBufferIndexMap();
-    // Check all buffers are collected
-    if (locations.size() != layout_free_buffers.size() ||
-        index_maps.size() != layout_free_buffers.size()) {
-      return false;
-    }
-
-    for (const auto& kv : locations) {
-      const Buffer& buffer = GetRef<Buffer>(kv.first);
-      const Block& block = kv.second.first;
-      int buffer_index = kv.second.second;
-
-      // Get IndexMap
-      const Optional<IndexMap> index_map = index_maps.at(buffer.get());
-      if (!index_map.defined()) {
-        continue;
+    for (auto buffer : CollectLayoutFreeBuffers(prim_func)) {
+      const auto cache_read_chain = GetCacheReadChain(buffer, prim_func);
+      if (cache_read_chain.empty()) {
+        // The common case, where the layout-free buffer is directly consumed by an anchor op such
+        // as conv2d or dense.
+        auto tup_opt = GetSuggestedIndexMap(buffer, prim_func);
+        if (tup_opt == std::nullopt) continue;
+
+        auto [anchor_block, buffer_index, index_map] = *tup_opt;
+        auto anchor_block_rv = sch->GetBlock(anchor_block->name_hint, func_name);
+        add_layout_rewrite_block(anchor_block_rv, buffer_index);
+        sch->TransformLayout(anchor_block_rv, buffer_index, BufferIndexType::kRead, index_map,
+                             NullOpt);
+      } else {
+        // When the layout-free buffer is consumed by cache_read, we need to find the index map
+        // for a cache-read buffer that is directly consumed by an anchor op. The last buffer
+        // in cache_read_chain corresponds to that buffer.
+        Block cache_read_block = sch->Get(sch->GetBlock(cache_read_chain.back(), func_name));
+        ICHECK_EQ(cache_read_block->writes.size(), 1);
+        auto tup_opt = GetSuggestedIndexMap(cache_read_block->writes[0]->buffer, prim_func);
+        if (tup_opt == std::nullopt) continue;
+
+        auto [anchor_block, buffer_index, index_map] = *tup_opt;
+        // Transform the layout of the last cache-read buffer.
+        sch->TransformLayout(sch->GetBlock(anchor_block->name_hint, func_name), buffer_index,
+                             BufferIndexType::kRead, index_map, NullOpt);
+
+        // Propagate the layout transformation over cache_read_chain, starting from
+        // the next-to-last cache-read buffer.
+        for (int i = static_cast<int>(cache_read_chain.size()) - 1; i >= 0; --i) {
+          BlockRV cache_read_block_rv = sch->GetBlock(cache_read_chain[i], func_name);
+          if (i == 0) {
+            // Before the first cache_read that consumes the layout-free buffer, insert
+            // a layout-rewrite block. Another cache-read buffer is added, and its layout is
+            // transformed by TransformLayout below.
+            add_layout_rewrite_block(cache_read_block_rv, 0);
+          }
+          sch->TransformLayout(cache_read_block_rv, 0, BufferIndexType::kRead, index_map, NullOpt);
+        }
       }
-
-      // Apply schedule
-      BlockRV block_rv = sch->GetBlock(block->name_hint, func_name);
-      BlockRV cached_block_rv = sch->CacheRead(block_rv, buffer_index, "global");
-      sch->TransformLayout(block_rv, buffer_index, BufferIndexType::kRead, index_map.value(),
-                           NullOpt);
-      sch->Annotate(cached_block_rv, attr::meta_schedule_layout_rewrite_preproc, const_true());
     }
   }
   return true;
diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index 80da5a727926..0581ad60e8f4 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -110,13 +110,20 @@ class LayoutFreePlaceholdersNormalizer : public StmtMutator {
     Block block = Downcast<Block>(StmtMutator::VisitStmt_(_block));
     BlockNode* n = block.CopyOnWrite();
     if (Optional<ObjectRef> ann = n->annotations.Get(topi_attr)) {
+      Array<Buffer> new_buffers;
       for (Buffer buffer : Downcast<Array<Buffer>>(ann)) {
         auto it = buffer2index_.find(buffer);
         if (it != buffer2index_.end()) {
           layout_free_buffer_indices_.insert(it->second);
+        } else {
+          new_buffers.push_back(buffer);
         }
       }
-      n->annotations.erase(topi_attr);
+      if (new_buffers.empty()) {
+        n->annotations.erase(topi_attr);
+      } else {
+        n->annotations.Set(topi_attr, new_buffers);
+      }
     }
     for (const String& attr : this->blocklist) {
       auto it = n->annotations.find(attr);
diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py
index 91a51c8e9033..98c1f7368580 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_layout.py
@@ -204,5 +204,281 @@ def test_layout_rewrite():
     tvm.ir.assert_structural_equal(sch.mod["main"], rewritten_tir_matmul)
 
 
+# fmt: off
+@tvm.script.ir_module
+class Conv2dCacheRead:
+    @T.prim_func
+    def main(p0: T.Buffer[(1, 56, 56, 64), "float32"], p1: T.Buffer[(3, 3, 64, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 56, 56, 64), "float32"]):
+        T.func_attr({"layout_free_buffers": [1], "tir.noalias": True, "global_symbol": "main"})
+        pad_temp = T.alloc_buffer([1, 58, 58, 64], dtype="float32")
+        conv2d_nhwc_global = T.alloc_buffer([1, 56, 56, 64], dtype="float32")
+        pad_temp_global = T.alloc_buffer([1, 58, 58, 64], dtype="float32")
+        p1_global = T.alloc_buffer([3, 3, 64, 64], dtype="float32")
+        for i0_0_i1_0_i2_0_fused in T.parallel(4, annotations={"pragma_auto_unroll_max_step":16, "pragma_unroll_explicit":1}):
+            for ax0, ax1, ax2 in T.grid(1, 30, 30):
+                for ax3_fused in T.vectorized(64):
+                    with T.block("pad_temp"):
+                        i0 = T.axis.spatial(1, ax0)
+                        i1 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused // 2 * 28 + ax1)
+                        i2 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused % 2 * 28 + ax2)
+                        i3 = T.axis.spatial(64, ax3_fused)
+                        T.reads(p0[i0, i1 - 1, i2 - 1, i3])
+                        T.writes(pad_temp[i0, i1, i2, i3])
+                        pad_temp[i0, i1, i2, i3] = T.if_then_else(1 <= i1 and i1 < 57 and 1 <= i2 and i2 < 57, p0[i0, i1 - 1, i2 - 1, i3], T.float32(0), dtype="float32")
+            for i3_0 in T.serial(16):
+                for ax0_ax1_ax2_ax3_fused in T.serial(57600):
+                    with T.block("pad_temp_global"):
+                        v0 = T.axis.spatial(1, 0)
+                        v1 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused // 2 * 28 + ax0_ax1_ax2_ax3_fused // 1920)
+                        v2 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused % 2 * 28 + ax0_ax1_ax2_ax3_fused % 1920 // 64)
+                        v3 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused % 64)
+                        T.reads(pad_temp[v0, v1, v2, v3])
+                        T.writes(pad_temp_global[v0, v1, v2, v3])
+                        pad_temp_global[v0, v1, v2, v3] = pad_temp[v0, v1, v2, v3]
+                for ax0_ax1_ax2_ax3_fused in T.serial(2304):
+                    with T.block("p1_global"):
+                        v0 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused // 768)
+                        v1 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused % 768 // 256)
+                        v2 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused % 256 // 4)
+                        v3 = T.axis.spatial(64, i3_0 * 4 + ax0_ax1_ax2_ax3_fused % 4)
+                        T.reads(p1[v0, v1, v2, v3])
+                        T.writes(p1_global[v0, v1, v2, v3])
+                        p1_global[v0, v1, v2, v3] = p1[v0, v1, v2, v3]
+                for i0_1, i1_1, i2_1, i3_1 in T.grid(1, 7, 2, 1):
+                    for i0_2_init, i1_2_init, i2_2_init, i3_2_init, i0_3_init, i1_3_init, i2_3_init in T.grid(1, 1, 14, 2, 1, 4, 1):
+                        for i3_3_fused_init in T.vectorized(2):
+                            with T.block("conv2d_nhwc_init"):
+                                nn = T.axis.spatial(1, i0_2_init + i0_3_init + i0_1)
+                                yy = T.axis.spatial(56, i0_0_i1_0_i2_0_fused // 2 * 28 + i1_1 * 4 + i1_2_init * 4 + i1_3_init)
+                                xx = T.axis.spatial(56, i2_3_init + i0_0_i1_0_i2_0_fused % 2 * 28 + i2_1 * 14 + i2_2_init)
+                                ff = T.axis.spatial(64, i3_0 * 4 + i3_1 * 4 + i3_2_init * 2 + i3_3_fused_init)
+                                T.reads()
+                                T.writes(conv2d_nhwc_global[nn, yy, xx, ff])
+                                T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
+                                conv2d_nhwc_global[nn, yy, xx, ff] = T.float32(0)
+                    for i4_0, i5_0, i6_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i6_1, i0_3, i1_3, i2_3 in T.grid(1, 1, 2, 1, 1, 14, 2, 3, 3, 32, 1, 4, 1):
+                        for i3_3_fused in T.vectorized(2):
+                            with T.block("conv2d_nhwc_update"):
+                                nn = T.axis.spatial(1, i0_2 + i0_3 + i0_1)
+                                yy = T.axis.spatial(56, i0_0_i1_0_i2_0_fused // 2 * 28 + i1_1 * 4 + i1_2 * 4 + i1_3)
+                                xx = T.axis.spatial(56, i2_3 + i0_0_i1_0_i2_0_fused % 2 * 28 + i2_1 * 14 + i2_2)
+                                ff = T.axis.spatial(64, i3_0 * 4 + i3_1 * 4 + i3_2 * 2 + i3_3_fused)
+                                ry = T.axis.reduce(3, i4_0 * 3 + i4_1)
+                                rx = T.axis.reduce(3, i5_0 * 3 + i5_1)
+                                rc = T.axis.reduce(64, i6_0 * 32 + i6_1)
+                                T.reads(conv2d_nhwc_global[nn, yy, xx, ff], pad_temp_global[nn, yy + ry, xx + rx, rc], p1_global[ry, rx, rc, ff])
+                                T.writes(conv2d_nhwc_global[nn, yy, xx, ff])
+                                T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
+                                conv2d_nhwc_global[nn, yy, xx, ff] = conv2d_nhwc_global[nn, yy, xx, ff] + pad_temp_global[nn, yy + ry, xx + rx, rc] * p1_global[ry, rx, rc, ff]
+                    for ax0, ax1, ax2 in T.grid(1, 4, 14):
+                        for ax3_fused in T.vectorized(4):
+                            with T.block("conv2d_nhwc_global"):
+                                v0 = T.axis.spatial(1, ax0)
+                                v1 = T.axis.spatial(56, i0_0_i1_0_i2_0_fused // 2 * 28 + i1_1 * 4 + ax1)
+                                v2 = T.axis.spatial(56, i0_0_i1_0_i2_0_fused % 2 * 28 + i2_1 * 14 + ax2)
+                                v3 = T.axis.spatial(64, i3_0 * 4 + ax3_fused)
+                                T.reads(conv2d_nhwc_global[v0, v1, v2, v3])
+                                T.writes(conv2d_nhwc[v0, v1, v2, v3])
+                                conv2d_nhwc[v0, v1, v2, v3] = conv2d_nhwc_global[v0, v1, v2, v3]
+
+
+@tvm.script.ir_module
+class Conv2dCacheReadRewritten:
+    @T.prim_func
+    def main(p0: T.Buffer[(1, 56, 56, 64), "float32"], p1: T.Buffer[(3, 3, 64, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 56, 56, 64), "float32"]):
+        T.func_attr({"layout_free_buffers": [1], "tir.noalias": True, "global_symbol": "main"})
+        pad_temp = T.alloc_buffer([1, 58, 58, 64], dtype="float32")
+        conv2d_nhwc_global = T.alloc_buffer([1, 56, 56, 64], dtype="float32")
+        pad_temp_global = T.alloc_buffer([1, 58, 58, 64], dtype="float32")
+        p1_global = T.alloc_buffer([16, 2, 2, 3, 3, 32, 2], dtype="float32")
+        p1_global_1 = T.alloc_buffer([16, 2, 2, 3, 3, 32, 2], dtype="float32")
+        for ax0, ax1, ax2, ax3 in T.grid(3, 3, 64, 64):
+            with T.block("p1_global"):
+                v0, v1, v2, v3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3])
+                T.reads(p1[v0, v1, v2, v3])
+                T.writes(p1_global_1[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2])
+                T.block_attr({"meta_schedule.layout_rewrite_preproc":True})
+                p1_global_1[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2] = p1[v0, v1, v2, v3]
+        for i0_0_i1_0_i2_0_fused in T.parallel(4, annotations={"pragma_auto_unroll_max_step":16, "pragma_unroll_explicit":1}):
+            for ax0, ax1, ax2 in T.grid(1, 30, 30):
+                for ax3_fused in T.vectorized(64):
+                    with T.block("pad_temp"):
+                        i0 = T.axis.spatial(1, ax0)
+                        i1 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused // 2 * 28 + ax1)
+                        i2 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused % 2 * 28 + ax2)
+                        i3 = T.axis.spatial(64, ax3_fused)
+                        T.reads(p0[i0, i1 - 1, i2 - 1, i3])
+                        T.writes(pad_temp[i0, i1, i2, i3])
+                        pad_temp[i0, i1, i2, i3] = T.if_then_else(1 <= i1 and i1 < 57 and 1 <= i2 and i2 < 57, p0[i0, i1 - 1, i2 - 1, i3], T.float32(0), dtype="float32")
+            for i3_0 in T.serial(16):
+                for ax0_ax1_ax2_ax3_fused in T.serial(57600):
+                    with T.block("pad_temp_global"):
+                        v0 = T.axis.spatial(1, 0)
+                        v1 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused // 2 * 28 + ax0_ax1_ax2_ax3_fused // 1920)
+                        v2 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused % 2 * 28 + ax0_ax1_ax2_ax3_fused % 1920 // 64)
+                        v3 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused % 64)
+                        T.reads(pad_temp[v0, v1, v2, v3])
+                        T.writes(pad_temp_global[v0, v1, v2, v3])
+                        pad_temp_global[v0, v1, v2, v3] = pad_temp[v0, v1, v2, v3]
+                for ax0_ax1_ax2_ax3_fused in T.serial(2304):
+                    with T.block("p1_global"):
+                        v0 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused // 768)
+                        v1 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused % 768 // 256)
+                        v2 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused % 256 // 4)
+                        v3 = T.axis.spatial(64, i3_0 * 4 + ax0_ax1_ax2_ax3_fused % 4)
+                        T.reads(p1_global_1[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2])
+                        T.writes(p1_global[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2])
+                        p1_global[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2] = p1_global_1[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2]
+                for i0_1, i1_1, i2_1, i3_1 in T.grid(1, 7, 2, 1):
+                    for i0_2_init, i1_2_init, i2_2_init, i3_2_init, i0_3_init, i1_3_init, i2_3_init in T.grid(1, 1, 14, 2, 1, 4, 1):
+                        for i3_3_fused_init in T.vectorized(2):
+                            with T.block("conv2d_nhwc_init"):
+                                nn = T.axis.spatial(1, i0_2_init + i0_3_init + i0_1)
+                                yy = T.axis.spatial(56, i0_0_i1_0_i2_0_fused // 2 * 28 + i1_1 * 4 + i1_2_init * 4 + i1_3_init)
+                                xx = T.axis.spatial(56, i2_3_init + i0_0_i1_0_i2_0_fused % 2 * 28 + i2_1 * 14 + i2_2_init)
+                                ff = T.axis.spatial(64, i3_0 * 4 + i3_1 * 4 + i3_2_init * 2 + i3_3_fused_init)
+                                T.reads()
+                                T.writes(conv2d_nhwc_global[nn, yy, xx, ff])
+                                T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
+                                conv2d_nhwc_global[nn, yy, xx, ff] = T.float32(0)
+                    for i4_0, i5_0, i6_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i6_1, i0_3, i1_3, i2_3 in T.grid(1, 1, 2, 1, 1, 14, 2, 3, 3, 32, 1, 4, 1):
+                        for i3_3_fused in T.vectorized(2):
+                            with T.block("conv2d_nhwc_update"):
+                                nn = T.axis.spatial(1, i0_2 + i0_3 + i0_1)
+                                yy = T.axis.spatial(56, i0_0_i1_0_i2_0_fused // 2 * 28 + i1_1 * 4 + i1_2 * 4 + i1_3)
+                                xx = T.axis.spatial(56, i2_3 + i0_0_i1_0_i2_0_fused % 2 * 28 + i2_1 * 14 + i2_2)
+                                ff = T.axis.spatial(64, i3_0 * 4 + i3_1 * 4 + i3_2 * 2 + i3_3_fused)
+                                ry = T.axis.reduce(3, i4_0 * 3 + i4_1)
+                                rx = T.axis.reduce(3, i5_0 * 3 + i5_1)
+                                rc = T.axis.reduce(64, i6_0 * 32 + i6_1)
+                                T.reads(conv2d_nhwc_global[nn, yy, xx, ff], pad_temp_global[nn, yy + ry, xx + rx, rc], p1_global[ff // 4, rc // 32, ff % 4 // 2, ry, rx, rc % 32, ff % 2])
+                                T.writes(conv2d_nhwc_global[nn, yy, xx, ff])
+                                T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
+                                conv2d_nhwc_global[nn, yy, xx, ff] = conv2d_nhwc_global[nn, yy, xx, ff] + pad_temp_global[nn, yy + ry, xx + rx, rc] * p1_global[ff // 4, rc // 32, ff % 4 // 2, ry, rx, rc % 32, ff % 2]
+                    for ax0, ax1, ax2 in T.grid(1, 4, 14):
+                        for ax3_fused in T.vectorized(4):
+                            with T.block("conv2d_nhwc_global"):
+                                v0 = T.axis.spatial(1, ax0)
+                                v1 = T.axis.spatial(56, i0_0_i1_0_i2_0_fused // 2 * 28 + i1_1 * 4 + ax1)
+                                v2 = T.axis.spatial(56, i0_0_i1_0_i2_0_fused % 2 * 28 + i2_1 * 14 + ax2)
+                                v3 = T.axis.spatial(64, i3_0 * 4 + ax3_fused)
+                                T.reads(conv2d_nhwc_global[v0, v1, v2, v3])
+                                T.writes(conv2d_nhwc[v0, v1, v2, v3])
+                                conv2d_nhwc[v0, v1, v2, v3] = conv2d_nhwc_global[v0, v1, v2, v3]
+
+
+@tvm.script.ir_module
+class Conv2dCacheReadMultipleRewritten:
+    @T.prim_func
+    def main(p0: T.Buffer[(1, 56, 56, 64), "float32"], p1: T.Buffer[(3, 3, 64, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 56, 56, 64), "float32"]):
+        T.func_attr({"layout_free_buffers": [1], "tir.noalias": True, "global_symbol": "main"})
+        pad_temp = T.alloc_buffer([1, 58, 58, 64], dtype="float32")
+        conv2d_nhwc_global = T.alloc_buffer([1, 56, 56, 64], dtype="float32")
+        pad_temp_global = T.alloc_buffer([1, 58, 58, 64], dtype="float32")
+        p1_global = T.alloc_buffer([16, 2, 2, 3, 3, 32, 2], dtype="float32")
+        p1_global2 = T.alloc_buffer([16, 2, 2, 3, 3, 32, 2], dtype="float32", scope="global2")
+        p1_global_1 = T.alloc_buffer([16, 2, 2, 3, 3, 32, 2], dtype="float32")
+        for ax0, ax1, ax2, ax3 in T.grid(3, 3, 64, 64):
+            with T.block("p1_global"):
+                v0, v1, v2, v3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3])
+                T.reads(p1[v0, v1, v2, v3])
+                T.writes(p1_global_1[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2])
+                T.block_attr({"meta_schedule.layout_rewrite_preproc":True})
+                p1_global_1[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2] = p1[v0, v1, v2, v3]
+        for ax0, ax1, ax2, ax3 in T.grid(3, 3, 64, 64):
+            with T.block("p1_global2"):
+                v0, v1, v2, v3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3])
+                T.reads(p1_global_1[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2])
+                T.writes(p1_global2[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2])
+                p1_global2[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2] = p1_global_1[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2]
+        for i0_0_i1_0_i2_0_fused in T.parallel(4, annotations={"pragma_auto_unroll_max_step":16, "pragma_unroll_explicit":1}):
+            for ax0, ax1, ax2 in T.grid(1, 30, 30):
+                for ax3_fused in T.vectorized(64):
+                    with T.block("pad_temp"):
+                        i0 = T.axis.spatial(1, ax0)
+                        i1 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused // 2 * 28 + ax1)
+                        i2 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused % 2 * 28 + ax2)
+                        i3 = T.axis.spatial(64, ax3_fused)
+                        T.reads(p0[i0, i1 - 1, i2 - 1, i3])
+                        T.writes(pad_temp[i0, i1, i2, i3])
+                        pad_temp[i0, i1, i2, i3] = T.if_then_else(1 <= i1 and i1 < 57 and 1 <= i2 and i2 < 57, p0[i0, i1 - 1, i2 - 1, i3], T.float32(0), dtype="float32")
+            for i3_0 in T.serial(16):
+                for ax0_ax1_ax2_ax3_fused in T.serial(57600):
+                    with T.block("pad_temp_global"):
+                        v0 = T.axis.spatial(1, 0)
+                        v1 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused // 2 * 28 + ax0_ax1_ax2_ax3_fused // 1920)
+                        v2 = T.axis.spatial(58, i0_0_i1_0_i2_0_fused % 2 * 28 + ax0_ax1_ax2_ax3_fused % 1920 // 64)
+                        v3 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused % 64)
+                        T.reads(pad_temp[v0, v1, v2, v3])
+                        T.writes(pad_temp_global[v0, v1, v2, v3])
+                        pad_temp_global[v0, v1, v2, v3] = pad_temp[v0, v1, v2, v3]
+                for ax0_ax1_ax2_ax3_fused in T.serial(2304):
+                    with T.block("p1_global"):
+                        v0 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused // 768)
+                        v1 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused % 768 // 256)
+                        v2 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused % 256 // 4)
+                        v3 = T.axis.spatial(64, i3_0 * 4 + ax0_ax1_ax2_ax3_fused % 4)
+                        T.reads(p1_global2[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2])
+                        T.writes(p1_global[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2])
+                        p1_global[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2] = p1_global2[v3 // 4, v2 // 32, v3 % 4 // 2, v0, v1, v2 % 32, v3 % 2]
+                for i0_1, i1_1, i2_1, i3_1 in T.grid(1, 7, 2, 1):
+                    for i0_2_init, i1_2_init, i2_2_init, i3_2_init, i0_3_init, i1_3_init, i2_3_init in T.grid(1, 1, 14, 2, 1, 4, 1):
+                        for i3_3_fused_init in T.vectorized(2):
+                            with T.block("conv2d_nhwc_init"):
+                                nn = T.axis.spatial(1, i0_2_init + i0_3_init + i0_1)
+                                yy = T.axis.spatial(56, i0_0_i1_0_i2_0_fused // 2 * 28 + i1_1 * 4 + i1_2_init * 4 + i1_3_init)
+                                xx = T.axis.spatial(56, i2_3_init + i0_0_i1_0_i2_0_fused % 2 * 28 + i2_1 * 14 + i2_2_init)
+                                ff = T.axis.spatial(64, i3_0 * 4 + i3_1 * 4 + i3_2_init * 2 + i3_3_fused_init)
+                                T.reads()
+                                T.writes(conv2d_nhwc_global[nn, yy, xx, ff])
+                                T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
+                                conv2d_nhwc_global[nn, yy, xx, ff] = T.float32(0)
+                    for i4_0, i5_0, i6_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i6_1, i0_3, i1_3, i2_3 in T.grid(1, 1, 2, 1, 1, 14, 2, 3, 3, 32, 1, 4, 1):
+                        for i3_3_fused in T.vectorized(2):
+                            with T.block("conv2d_nhwc_update"):
+                                nn = T.axis.spatial(1, i0_2 + i0_3 + i0_1)
+                                yy = T.axis.spatial(56, i0_0_i1_0_i2_0_fused // 2 * 28 + i1_1 * 4 + i1_2 * 4 + i1_3)
+                                xx = T.axis.spatial(56, i2_3 + i0_0_i1_0_i2_0_fused % 2 * 28 + i2_1 * 14 + i2_2)
+                                ff = T.axis.spatial(64, i3_0 * 4 + i3_1 * 4 + i3_2 * 2 + i3_3_fused)
+                                ry = T.axis.reduce(3, i4_0 * 3 + i4_1)
+                                rx = T.axis.reduce(3, i5_0 * 3 + i5_1)
+                                rc = T.axis.reduce(64, i6_0 * 32 + i6_1)
+                                T.reads(conv2d_nhwc_global[nn, yy, xx, ff], pad_temp_global[nn, yy + ry, xx + rx, rc], p1_global[ff // 4, rc // 32, ff % 4 // 2, ry, rx, rc % 32, ff % 2])
+                                T.writes(conv2d_nhwc_global[nn, yy, xx, ff])
+                                T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
+                                conv2d_nhwc_global[nn, yy, xx, ff] = conv2d_nhwc_global[nn, yy, xx, ff] + pad_temp_global[nn, yy + ry, xx + rx, rc] * p1_global[ff // 4, rc // 32, ff % 4 // 2, ry, rx, rc % 32, ff % 2]
+                    for ax0, ax1, ax2 in T.grid(1, 4, 14):
+                        for ax3_fused in T.vectorized(4):
+                            with T.block("conv2d_nhwc_global"):
+                                v0 = T.axis.spatial(1, ax0)
+                                v1 = T.axis.spatial(56, i0_0_i1_0_i2_0_fused // 2 * 28 + i1_1 * 4 + ax1)
+                                v2 = T.axis.spatial(56, i0_0_i1_0_i2_0_fused % 2 * 28 + i2_1 * 14 + ax2)
+                                v3 = T.axis.spatial(64, i3_0 * 4 + ax3_fused)
+                                T.reads(conv2d_nhwc_global[v0, v1, v2, v3])
+                                T.writes(conv2d_nhwc[v0, v1, v2, v3])
+                                conv2d_nhwc[v0, v1, v2, v3] = conv2d_nhwc_global[v0, v1, v2, v3]
+
+# fmt: on
+
+
+def test_layout_rewrite_cache_read():
+    target = Target("llvm")
+    ctx = _create_context(Conv2dCacheRead, target)
+    sch = tvm.tir.Schedule(Conv2dCacheRead, debug_mask="all")
+    sch.enter_postproc()
+    assert ctx.space_generator.postprocs[0].apply(sch)
+    tvm.ir.assert_structural_equal(sch.mod, Conv2dCacheReadRewritten)
+
+
+def test_layout_rewrite_cache_read_multiple():
+    target = Target("llvm")
+    ctx = _create_context(Conv2dCacheRead, target)
+    sch = tvm.tir.Schedule(Conv2dCacheRead, debug_mask="all")
+    sch.cache_read(sch.get_block("p1_global"), 0, "global2")
+    sch.enter_postproc()
+    assert ctx.space_generator.postprocs[0].apply(sch)
+    tvm.ir.assert_structural_equal(sch.mod, Conv2dCacheReadMultipleRewritten)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py
index b59880758e5d..7b8173d0b2d9 100644
--- a/tests/python/unittest/test_te_create_primfunc.py
+++ b/tests/python/unittest/test_te_create_primfunc.py
@@ -390,6 +390,7 @@ def expected_layout_attr(
             C[x, y] = C[x, y] + A[x, k] * B[y, k]
     for i0, i1 in T.grid(128, 128):
         with T.block("D"):
+            T.block_attr({"layout_free_placeholders": [C]})
             x, y = T.axis.remap("SS", [i0, i1])
             D[x, y] = C[x, y] + T.float32(1)
 

From 42f9a766ffdce46220dc46d9126998909829dd66 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Tue, 15 Nov 2022 12:32:18 -0800
Subject: [PATCH 580/704] [TOPI] Add padding for dense/batch matmul for x86
 vnni (#13385)

This added padding to make the shape of dense/batch matmul compatible with VNNI instructions.
---
 python/tvm/topi/x86/dense_alter_op.py | 35 +++++++++++++++++++++++----
 tests/python/relay/test_op_level1.py  |  7 +++---
 tests/python/relay/test_op_level10.py | 16 +++++++++---
 3 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/python/tvm/topi/x86/dense_alter_op.py b/python/tvm/topi/x86/dense_alter_op.py
index 0b195f487b7f..fd2b184a87d2 100644
--- a/python/tvm/topi/x86/dense_alter_op.py
+++ b/python/tvm/topi/x86/dense_alter_op.py
@@ -28,14 +28,13 @@
 from .. import nn
 
 
-def check_vnni_applicable(x, y):
+def check_vnni_applicable(x, y, allow_padding=False):
     mcpu = tvm.target.Target.current().mcpu
     return (
         target_has_vnni(mcpu)
         and "int8" in x.dtype
         and "int8" in y.dtype
-        and y.shape[-2] % 16 == 0
-        and y.shape[-1] % 4 == 0
+        and (allow_padding or (y.shape[-2] % 16 == 0 and y.shape[-1] % 4 == 0))
     )
 
 
@@ -87,7 +86,10 @@ def _alter_dense_layout(attrs, inputs, tinfos, out_type):
 
 def vnni_legalize(inputs, arg_types, op, attrs, need_expand=False):
     """Legalizes s8, s8 -> s32 GEMM op for VNNI."""
-    if check_vnni_applicable(arg_types[0], arg_types[1]) and arg_types[0].dtype == "int8":
+    if (
+        check_vnni_applicable(arg_types[0], arg_types[1], allow_padding=True)
+        and arg_types[0].dtype == "int8"
+    ):
         x, y = inputs
         x = relay.cast(x, "int32")
         x = relay.add(x, relay.const(128, "int32"))
@@ -98,7 +100,30 @@ def vnni_legalize(inputs, arg_types, op, attrs, need_expand=False):
         if need_expand:
             adjust_shift = relay.expand_dims(adjust_shift, axis=1)
 
-        out = op(x, y, **attrs)
+        analyzer = tvm.arith.Analyzer()
+        x_shape = arg_types[0].shape
+        y_shape = arg_types[1].shape
+        inst_n = 16
+        inst_k = 4
+        pad_n = analyzer.simplify((inst_n - y_shape[-2] % inst_n) % inst_n)
+        pad_k = analyzer.simplify((inst_k - y_shape[-1] % inst_k) % inst_k)
+        if pad_k != 0 or pad_n != 0:
+            ndim = len(x_shape)
+            unpadded_dims = [(0, 0)] * (ndim - 2)
+            padding_y = [(0, 0)] * (len(y_shape) - 2) + [(0, pad_n), (0, pad_k)]
+            padded_y = relay.nn.pad(y, pad_width=padding_y, pad_value=0)
+            if pad_k != 0:
+                padding_x = [(0, 0)] * (len(x_shape) - 1) + [(0, pad_k)]
+                padded_x = relay.nn.pad(x, pad_width=padding_x, pad_value=0)
+            else:
+                padded_x = x
+            out = op(padded_x, padded_y, **attrs)
+            if pad_n != 0:
+                begin = [0] * len(x_shape)
+                end = x_shape[:-2] + [x_shape[-2], y_shape[-2]]
+                out = relay.strided_slice(out, begin, end, slice_mode="size")
+        else:
+            out = op(x, y, **attrs)
 
         return relay.subtract(out, adjust_shift)
 
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 7884fa35a48b..1c93ee766a88 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -753,9 +753,10 @@ def test_bitserial_dense():
 
 
 @tvm.testing.requires_cascadelake
-def test_dense_vnni():
-    data_shape = (32, 96)
-    weight_shape = (128, 96)
+@pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)])
+def test_dense_vnni(m, n, k):
+    data_shape = (m, k)
+    weight_shape = (n, k)
 
     for data_dtype in ["uint8", "int8"]:
         data = relay.var("data", shape=data_shape, dtype=data_dtype)
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 5134ab156b3d..619a0b5a9333 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -474,10 +474,18 @@ def test_batch_matmul(executor_kind):
 
 
 @tvm.testing.requires_cascadelake
-def test_batch_matmul_vnni():
-    x_shape = (16, 32, 96)
-    y_shape = (16, 128, 96)
-    z_shape = (16, 32, 128)
+@pytest.mark.parametrize(
+    "b,m,n,k",
+    [
+        (16, 32, 128, 96),
+        (16, 32, 128, 97),
+        (16, 32, 129, 96),
+    ],
+)
+def test_batch_matmul_vnni(b, m, n, k):
+    x_shape = (b, m, k)
+    y_shape = (b, n, k)
+    z_shape = (b, m, n)
 
     for lhs_dtype in ["uint8", "int8"]:
         x = relay.var("x", shape=x_shape, dtype=lhs_dtype)

From bac450a645c3f9e3a69a6f7af207cff462250bcf Mon Sep 17 00:00:00 2001
From: abhikran-quic <63697863+abhikran-quic@users.noreply.github.com>
Date: Wed, 16 Nov 2022 02:04:38 +0530
Subject: [PATCH 581/704] [Hexagon] Use VTCM while scheduling conv2d op
 (#13388)

Use VTCM for E2E execution of conv2d to reduce tuning time.
---
 .../test_hexagon/metaschedule_e2e/test_resnet50_int8.py      | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index 9edf5877fd5e..91eb67bbf457 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -248,8 +248,9 @@ def index_map_nchw32c_nchw8h8w32c(n_batch, channel, height, width, channel_32):
 
         # Add cache for input and output activation layout transform,
         # note that weight is already in correct layout
-        input_cache = sch.cache_read(conv2d_block, 0, "global")  # pylint: disable=unused-variable
-        output_cache = sch.cache_write(outer_block, 0, "global")  # pylint: disable=unused-variable
+        # pylint: disable=unused-variable
+        input_cache = sch.cache_read(conv2d_block, 0, "global.vtcm")
+        output_cache = sch.cache_write(outer_block, 0, "global.vtcm")
         # Transform the layout of the input
         sch.transform_layout(
             conv2d_block, ("read", 0), index_map=index_map_nchw32c_nchw8h8w32c, pad_value=0

From aa0c0db71b196826d7474be036d619c04b77008a Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Tue, 15 Nov 2022 23:35:24 +0300
Subject: [PATCH 582/704] [tvm4j] Fix tvm4j build on MacOS (#13370)

- Add missing bracket in pom file
- Replaced JavaVM on JavaNativeFoundation. From Xcode 12.1 (macOS 11
  (Big Sur)) JavaVM framework is no longer includes to the frameworks
  list. Necessary to use JavaNativeFoundation instead.
---
 jvm/native/osx-x86_64/pom.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/jvm/native/osx-x86_64/pom.xml b/jvm/native/osx-x86_64/pom.xml
index c08667401b4d..3f76bb877d0b 100644
--- a/jvm/native/osx-x86_64/pom.xml
+++ b/jvm/native/osx-x86_64/pom.xml
@@ -120,14 +120,14 @@ under the License.
           <compilerEndOptions>
             <compilerEndOption>-I../../../include</compilerEndOption>
             <compilerEndOption>-I${JAVA_HOME}/include</compilerEndOption>
-            <compilerEndOption>-I${JAVA_HOME}/include/linux</compilerEndOption
+            <compilerEndOption>-I${JAVA_HOME}/include/linux</compilerEndOption>
             <compilerEndOption>${cflags}</compilerEndOption>
           </compilerEndOptions>
           <linkerStartOptions>
             <linkerStartOption>-shared</linkerStartOption>
           </linkerStartOptions>
           <linkerMiddleOptions>
-            <linkerMiddleOption>-framework JavaVM</linkerMiddleOption>
+            <linkerMiddleOption>-framework JavaNativeFoundation</linkerMiddleOption>
             <linkerMiddleOption>-Wl,-exported_symbol,_Java_*</linkerMiddleOption>
             <linkerMiddleOption>-undefined dynamic_lookup</linkerMiddleOption>
             <linkerMiddleOption>-Wl,-x</linkerMiddleOption>

From 557f1a9dedcdcbedeb0cb2b8855ff49fff03f5d3 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 15 Nov 2022 13:36:35 -0700
Subject: [PATCH 583/704] [docs] Add some more release process docs (#13351)

This is mostly to address https://github.com/tlc-pack/tlcpack/issues/149
and make it clear how to avoid that in the future.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 docs/contribute/release_process.rst | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/docs/contribute/release_process.rst b/docs/contribute/release_process.rst
index 4b5c45fc84ba..463536f20080 100644
--- a/docs/contribute/release_process.rst
+++ b/docs/contribute/release_process.rst
@@ -67,7 +67,7 @@ You can skip this section if you have already uploaded your key.
 
 After generating the gpg key, you need to upload your key to a public key server. Please refer to https://www.apache.org/dev/openpgp.html#generate-key for details.
 
-If you want to do the release on another machine, you can transfer your gpg key to that machine via the :code:`gpg --export` and :code:`gpg --import` commands.
+If you want to do the release on another machine, you can transfer your gpg key to that machine via the ``gpg --export`` and ``gpg --import`` commands.
 
 The last step is to update the KEYS file with your code signing key https://www.apache.org/dev/openpgp.html#export-public-key. Check in the changes to the TVM main branch, as well as ASF SVN,
 
@@ -96,17 +96,17 @@ To cut a release candidate, one needs to first cut a branch using selected versi
 	git branch v0.6.0
 	git push --set-upstream origin v0.6.0
 
-(*Make sure the version numbers in the source code are correct.* Run :code:`python3 version.py` to update the version.)
+(*Make sure the version numbers in the source code are correct.* Run ``python3 version.py`` to update the version.)
 
 Go to the GitHub repositories "releases" tab and click "Draft a new release",
 
-- Provide the release tag in the form of “v1.0.0.rc0” where 0 means it’s the first release candidate
+- Provide the release tag in the form of ``v1.0.0.rc0`` where 0 means it's the first release candidate. The tag must match this pattern ``v[0-9]+\.[0-9]+\.[0-9]+\.rc[0-9]`` exactly!
 - Select the commit by clicking Target: branch > Recent commits > $commit_hash
 - Copy and paste release note draft into the description box
 - Select "This is a pre-release"
 - Click "Publish release"
 
-Notice that one can still apply changes to the BRANCH after the cut, while the TAG is fixed. If any change is required for this release, a new TAG has to be created.
+Notice that one can still apply changes to the branch after the cut, while the tag is fixed. If any change is required for this release, a new tag has to be created.
 
 Remove previous release candidate (if applied),
 
@@ -145,12 +145,15 @@ Create GPG signature as well as the hash of the file,
 	shasum -a 512 apache-tvm-src-v0.6.0.rc0.tar.gz > apache-tvm-src-v0.6.0.rc0.tar.gz.sha512
 
 
-Update TVM Version on Main
---------------------------
+Update TVM Version on ``main``
+------------------------------
 
-After cutting a release candidate, make sure to update the version numbers throughout `main`. For example if we are
-releasing `v0.10.0` we want to bump the version numbers throughout the codebase from `v0.10.dev0` to `v0.11.dev0`. An
+After cutting a release candidate, make sure to update the version numbers throughout ``main``. For example if we are
+releasing ``v0.10.0`` we want to bump the version numbers throughout the codebase from ``v0.10.dev0`` to ``v0.11.dev0``. An
 example of how to do this can be found here: `https://github.com/apache/tvm/pull/12190 <https://github.com/apache/tvm/pull/12190>`_.
+Tag the commit on ``main`` immediately after the last one included in the release branch with the dev tag (e.g. ``v0.11.dev0``)
+for the next release. This tag is necessary so that the nightly packages built from ``main`` have the correct version
+number.
 
 Upload the Release Candidate
 ----------------------------
@@ -173,7 +176,7 @@ The release manager also needs to upload the artifacts to ASF SVN,
 Call a Vote on the Release Candidate
 ------------------------------------
 
-The first voting takes place on the Apache TVM developers list (dev@tvm.apache.org). To get more attention, one can create a github issue start with "[VOTE]" instead, it will be mirrored to dev@ automatically. Look at past voting threads to see how this proceeds. The email should follow this format.
+The first voting takes place on the Apache TVM developers list (dev@tvm.apache.org). To get more attention, one can create a GitHub issue start with "[VOTE]" instead, it will be mirrored to dev@ automatically. Look at past voting threads to see how this proceeds. The email should follow this format.
 
 - Provide the link to the draft of the release notes in the email
 - Provide the link to the release candidate artifacts
@@ -181,9 +184,9 @@ The first voting takes place on the Apache TVM developers list (dev@tvm.apache.o
 
 For the dev@ vote, there must be at least 3 binding +1 votes and more +1 votes than -1 votes. Once the vote is done, you should also send out a summary email with the totals, with a subject that looks something like [VOTE][RESULT] ....
 
-In ASF, votes are open "at least" 72hrs (3 days). If you don't get enough number of binding votes within that time, you cannot close the voting deadline. You need to extend it.
+In ASF, votes are open at least 72 hours (3 days). If you don't get enough number of binding votes within that time, you cannot close the voting deadline. You need to extend it.
 
-If the voting fails, the community needs to modified the release accordingly, create a new release candidate and re-run the voting process.
+If the vote fails, the community needs to modify the release accordingly: create a new release candidate and re-run the voting process.
 
 
 Post the Release
@@ -212,7 +215,7 @@ Remember to create a new release TAG (v0.6.0 in this case) on Github and remove
 Update the TVM Website
 ----------------------
 
-The website repository is located at `https://github.com/apache/tvm-site <https://github.com/apache/tvm-site>`_. Modify the download page to include the release artifacts as well as the GPG signature and SHA hash. Since TVM's docs are continually updated, upload a fixed version of the release docs. If CI has deleted the docs from the release by the time you go to update the website, you can restart the CI build for the release branch on Jenkins. See the example code below for a starting point
+The website repository is located at `https://github.com/apache/tvm-site <https://github.com/apache/tvm-site>`_. Modify the download page to include the release artifacts as well as the GPG signature and SHA hash. Since TVM's docs are continually updated, upload a fixed version of the release docs. If CI has deleted the docs from the release by the time you go to update the website, you can restart the CI build for the release branch on Jenkins. See the example code below for a starting point.
 
 .. code-block:: bash
 

From 4f4b4edafdb0837972e4df6f570368f9f7cefd20 Mon Sep 17 00:00:00 2001
From: Tasmia Rahman <89925728+trahman-quic@users.noreply.github.com>
Date: Tue, 15 Nov 2022 16:46:24 -0600
Subject: [PATCH 584/704] [TOPI][Hexagon] Implement quantized
 adaptive_avg_pool1d for hexagon (#13282)

* [TOPI][Hexagon] Implement adaptive_avg_pool1d for hexagon

* Fix lint issues

* Fix some lint issues

* Fix lint issues in test

* Fix import for allocate_hexagon_array
---
 python/tvm/topi/hexagon/qnn/__init__.py       |   1 +
 .../topi/hexagon/qnn/adaptive_avg_pool1d.py   | 120 ++++++++++++
 python/tvm/topi/hexagon/utils.py              |   7 +
 .../contrib/test_hexagon/infrastructure.py    |   9 +
 .../topi/test_adaptive_avg_pool1d.py          | 185 ++++++++++++++++++
 5 files changed, 322 insertions(+)
 create mode 100755 python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py
 create mode 100755 tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py

diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
index f7a018d2257a..d63b69b2e259 100644
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -27,3 +27,4 @@
 from .quantize import quantize_compute, tir_quantize_schedule
 from .nn import *
 from .qdepthwise_conv2d_slice import qdepthwise_conv2d_compute, qdepthwise_conv2d_schedule
+from .adaptive_avg_pool1d import *
diff --git a/python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py b/python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py
new file mode 100755
index 000000000000..80f1cd1ecf78
--- /dev/null
+++ b/python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py
@@ -0,0 +1,120 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Compute and schedule for adaptive_avg_pool1d slice op
+
+Following are few notes and assumptions made by the implementation:
+
+Assumptions:
+1) The input is in NCW layout. Distilbert is the only model that calls
+   nn.adaptive_avg_pool1d and the only layout it uses is 'NCW'.
+2) The op takes output_size as an argument and
+   only handles the specialized case where output_size is 1.
+   The argument output_size is used as the value of output_width.
+3) Both input and output dtype is uint8/int8 and
+   quantization parameter is provided to the op.
+4) Input is assumed to always be multiple of fixed chunk 32c64w.
+
+Notes:
+1) If input width is used as output width, there can be two cases:
+    a. If the quantization parameters of input and output are same,
+       it can return the input as output so the op will be a no-op.
+    b. If the quantization parameters of input and output are different,
+       it will essentially be a requantize op.
+2) If output_size is a value besides 1 or input_width,
+   adaptive_avg_pool1d may use dynamic stride and kernel for each output element.
+   When this case occurs, kernel won't be known at compile time. We want to use
+   the generic implementation nn.adaptive_avg_pool1d() for this case.
+"""
+
+from tvm import te
+from tvm import tir
+from ..utils import get_layout_transform_fn, get_fixed_point_value, saturate
+
+
+def adaptive_avg_pool1d(
+    data: te.Tensor,
+    output_size: list,
+    odtype: str,
+    input_zero_point: int,
+    input_scale: float,
+    output_zero_point: int,
+    output_scale: float,
+):
+    """adaptive_avg_pool1d compute"""
+    _, _, inw = data.shape
+
+    out_width = output_size[0]
+
+    n, c = data.shape[:2]
+    oshape = (n, c) + (out_width,)
+
+    # Kernel is same as input_width since output_width is assumed to be 1
+    if out_width == 1:
+        kw_r = inw
+    else:
+        raise RuntimeError(f"Unsupported output_size, {out_width}'")
+
+    if odtype == "uint8":
+        temp_dtype = "uint32"
+    elif odtype == "int8":
+        temp_dtype = "int32"
+    else:
+        raise RuntimeError(f"Unsupported output dtype, {odtype}'")
+
+    scale_with_area = input_scale / (output_scale * int(kw_r))
+    scale_fixed_point, rsh = get_fixed_point_value(scale_with_area, "int16")
+    corr = (output_zero_point << rsh) - input_zero_point * kw_r * scale_fixed_point
+
+    rw_r = te.reduce_axis((0, kw_r), name="rw_r")
+
+    sum_compute = te.compute(
+        oshape,
+        lambda n, c, w: te.sum(data[n, c, w + rw_r].astype(temp_dtype), axis=[rw_r]),
+        name="sum",
+    )
+
+    avg_compute = te.compute(
+        oshape,
+        lambda n, c, w: saturate(
+            ((sum_compute[n, c, w] * scale_fixed_point) + corr) >> rsh, odtype
+        ).astype(odtype),
+        name="adaptive_avg_1d",
+    )
+    return avg_compute
+
+
+def stir_schedule_ncw_32c64w(outs, ins, input_layout: str):
+    """Schedule for input layout ncw-32c64w and output layout ncw"""
+    func = te.create_prim_func([ins, outs])
+    s = tir.Schedule(func)
+
+    sum_block = s.get_block("sum")
+
+    # Input is multiple of fixed chunk but output is NxCx1
+    # Hence transform_layout is only applied on input
+    input_transformed_layout = get_layout_transform_fn(input_layout)
+    s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout)
+
+    return s
+
+
+def tir_adaptive_avg_pool1d_schedule(outs, ins, output_layout: str, input_layout: str):
+    """STIR based schedule"""
+    if output_layout == "ncw":
+        return stir_schedule_ncw_32c64w(outs, ins, input_layout)
+    raise RuntimeError(f"Unexpected layout '{output_layout}'")
diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py
index 890ebeb9fd11..5aeed9aa4fde 100644
--- a/python/tvm/topi/hexagon/utils.py
+++ b/python/tvm/topi/hexagon/utils.py
@@ -131,6 +131,11 @@ def ohwi32o_1d(height, width, in_channel, out_channel):
     return [out_channel // 32, height, width, in_channel, out_channel % 32]
 
 
+def ncw_32c64w_2d(n, c, w):
+    """Return index map for ncw_32c64w 2d layout"""
+    return [n, c // 32, w // 64, te.AXIS_SEPARATOR, c % 32, w % 64]
+
+
 def get_layout_transform_fn(layout):
     """Return index map function as per the layout string"""
     if layout == "nhwc-8h2w32c2w-2d":
@@ -173,6 +178,8 @@ def get_layout_transform_fn(layout):
         return n11c_2048c_2d
     if layout == "ohwi32o-1d":
         return ohwi32o_1d
+    if layout == "ncw-32c64w-2d":
+        return ncw_32c64w_2d
     raise RuntimeError(f"Unexpected layout '{layout}'")
 
 
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
index c04631156f1e..c03701f83ccc 100644
--- a/tests/python/contrib/test_hexagon/infrastructure.py
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -268,6 +268,15 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str):
 
         raise RuntimeError(f"Unexpected new_layout '{new_layout}'")
 
+    if current_layout == "ncw":
+        if new_layout == "ncw":
+            return arr_np
+        if new_layout in ["ncw-32c64w-2d"]:
+            n, c, w = arr_np.shape
+            return arr_np.reshape([n, c // 32, 32, w // 64, 64]).transpose(0, 1, 3, 2, 4)
+
+        raise RuntimeError(f"Unexpected new_layout '{new_layout}'")
+
     raise RuntimeError(f"Unexpected current_layout '{current_layout}'")
 
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py b/tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py
new file mode 100755
index 000000000000..4d4aef25e33f
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Test code for specialized case of adaptive_avg_pool1d."""
+
+import numpy as np
+
+import tvm
+from tvm import te
+from tvm.topi.testing import adaptive_pool
+import tvm.topi.hexagon.qnn as s1
+from tvm.contrib.hexagon import allocate_hexagon_array
+from ..infrastructure import transform_numpy, quantize_np
+
+
+SCALE_M_VAL = None
+ZERO_POINT_M_VAL = None
+SCALE_VAL = None
+ZERO_POINT_VAL = None
+
+
+class TestAdaptivePool1D:
+    """Test specialized case of adaptive_avg_pool1d."""
+
+    (input_shape,) = tvm.testing.parameters(
+        ([1, 128, 128],),
+        ([1, 64, 64],),
+        ([1, 64, 128],),
+        ([1, 32, 64],),
+        ([1, 128, 768],),
+    )
+
+    # Fixed chunk layout is set as ncw-32c64w-2d for now.
+    # The adaptive_avg_pool1d implementation only handles specialized case
+    # where output_size is 1 as it appears on quantized distilbert model.
+    # Since output size won't be a multiple of fixed-chunk,
+    # output_layout is ncw.
+    # For optimization, it might get changed later.
+    input_layout, output_layout, pool_type, layout, output_size, dtype, = tvm.testing.parameters(
+        (
+            "ncw-32c64w-2d",
+            "ncw",
+            "avg",
+            "NCW",
+            [1],
+            "uint8",
+        )
+    )
+
+    @tvm.testing.fixture
+    def expected_output_np(
+        self,
+        input_np,
+        output_size,
+        pool_type,
+        layout,
+    ):
+        """Generate expected output."""
+        out_width = output_size[0]
+
+        ref_np = adaptive_pool(
+            input_np,
+            out_width,
+            pool_type,
+            layout,
+        )
+        return ref_np
+
+    @tvm.testing.fixture
+    def input_np(self, input_shape, dtype):
+        if dtype in ("uint8", "int8"):
+            dtype = "float32"
+        return np.random.random(input_shape).astype(dtype)
+
+    @tvm.testing.fixture
+    def quantize_input_np(self, input_np, dtype):
+        if dtype in ("uint8", "int8"):
+            global ZERO_POINT_VAL, SCALE_VAL
+            input_np_quantized, SCALE_VAL, ZERO_POINT_VAL = quantize_np(input_np, dtype)
+            return input_np_quantized
+
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+    @tvm.testing.fixture
+    def transformed_input_np(self, quantize_input_np, input_layout, layout, dtype):
+        if dtype in ("uint8", "int8"):
+            return transform_numpy(quantize_input_np, layout.lower(), input_layout)
+
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+    @tvm.testing.fixture
+    def quantize_expected_output_np(self, expected_output_np, dtype):
+        """Generate expected output."""
+        if dtype in ("uint8", "int8"):
+            global ZERO_POINT_M_VAL, SCALE_M_VAL
+            out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np(
+                expected_output_np, dtype
+            )
+
+            # Since output_layout is ncw, no transformation is needed.
+            return out_ref_quantized
+
+        raise RuntimeError(f"Unsupported data type '{dtype}'")
+
+    @tvm.testing.requires_hexagon
+    def test_pool1d(
+        self,
+        dtype,
+        output_size,
+        input_layout,
+        output_layout,
+        input_shape,
+        transformed_input_np,
+        quantize_expected_output_np,
+        hexagon_session,
+    ):
+        """Test adaptive_avg_pool1d."""
+        target_hexagon = tvm.target.hexagon("v69")
+        a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype)
+
+        m_tensor = s1.adaptive_avg_pool1d(
+            a_tensor,
+            output_size,
+            dtype,
+            ZERO_POINT_VAL,
+            SCALE_VAL,
+            ZERO_POINT_M_VAL,
+            SCALE_M_VAL,
+        )
+
+        tir_schedule = s1.tir_adaptive_avg_pool1d_schedule(
+            m_tensor, a_tensor, output_layout, input_layout
+        )
+
+        sch = tir_schedule.mod
+
+        with tvm.transform.PassContext(opt_level=3):
+            func = tvm.build(
+                sch,
+                [a_tensor, m_tensor],
+                tvm.target.Target(target_hexagon, host=target_hexagon),
+                name="adaptive_pool1d",
+            )
+
+        input_axis_separator = [3]
+
+        a_data_nd = allocate_hexagon_array(
+            hexagon_session.device,
+            data=transformed_input_np,
+            dtype=dtype,
+            axis_separators=input_axis_separator,
+            mem_scope="global.vtcm",
+        )
+
+        m_data_nd = allocate_hexagon_array(
+            hexagon_session.device,
+            quantize_expected_output_np.shape,
+            dtype=dtype,
+        )
+
+        mod = hexagon_session.load_module(func)
+        mod(a_data_nd, m_data_nd)
+
+        # Convert nd to np
+        m_data_np = m_data_nd.numpy()
+
+        np.testing.assert_allclose(quantize_expected_output_np, m_data_np, atol=2)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 2bb3382c89419820dab51a5a0fe4f09bb7ce6ecd Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Wed, 16 Nov 2022 02:02:59 +0300
Subject: [PATCH 585/704] [OpenCL] Introduce OpenCL wrapper to TVM (#13362)

* [OpenCL] Introduce OpenCL wrapper to TVM

This wrapper helps dynamically loading OpenCL library. It allows us to
avoid of looking for and copying OpenCL library to host, looking for
OpenCL SDK.

* Update apps and documentation

* Apply comments

* Apply comments and fix Android build

Also, use OpenCL wrapper by default and fix Windows build

* Apply comments

* Update LICENSE file
---
 .gitmodules                                   |   3 +
 3rdparty/OpenCL-Headers                       |   1 +
 LICENSE                                       |   1 +
 .../app/src/main/jni/Android.mk               |   1 +
 .../app/src/main/jni/make/config.mk           |   2 +-
 .../app/src/main/jni/tvm_runtime.h            |   2 +
 apps/android_deploy/README.md                 |  34 +-
 .../app/src/main/jni/Android.mk               |   3 +-
 .../app/src/main/jni/make/config.mk           |   2 +-
 .../app/src/main/jni/tvm_runtime.h            |   3 +
 apps/android_rpc/README.md                    |  32 +-
 apps/android_rpc/app/src/main/jni/Android.mk  |   3 +-
 .../app/src/main/jni/make/config.mk           |   2 +-
 .../app/src/main/jni/tvm_runtime.h            |   1 +
 apps/cpp_rpc/README.md                        |  10 +-
 cmake/config.cmake                            |   3 +-
 cmake/modules/OpenCL.cmake                    |  30 +-
 cmake/utils/FindOpenCL.cmake                  |   2 +-
 .../deploy_models/deploy_model_on_android.py  |   5 +-
 src/runtime/opencl/opencl_module.cc           |   2 +-
 src/runtime/opencl/opencl_wrapper/README.md   |  25 +
 .../opencl/opencl_wrapper/opencl_wrapper.cc   | 574 ++++++++++++++++++
 tests/cpp-runtime/opencl/opencl_timer_test.cc |   4 +-
 23 files changed, 663 insertions(+), 82 deletions(-)
 create mode 160000 3rdparty/OpenCL-Headers
 create mode 100644 src/runtime/opencl/opencl_wrapper/README.md
 create mode 100644 src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc

diff --git a/.gitmodules b/.gitmodules
index e03336443d73..66fd0390cf35 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -16,3 +16,6 @@
 [submodule "3rdparty/cutlass"]
 	path = 3rdparty/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
+[submodule "3rdparty/OpenCL-Headers"]
+	path = 3rdparty/OpenCL-Headers
+	url = https://github.com/KhronosGroup/OpenCL-Headers.git
diff --git a/3rdparty/OpenCL-Headers b/3rdparty/OpenCL-Headers
new file mode 160000
index 000000000000..b590a6bfe034
--- /dev/null
+++ b/3rdparty/OpenCL-Headers
@@ -0,0 +1 @@
+Subproject commit b590a6bfe034ea3a418b7b523e3490956bcb367a
diff --git a/LICENSE b/LICENSE
index 345026985b07..6524d530deca 100644
--- a/LICENSE
+++ b/LICENSE
@@ -211,6 +211,7 @@ Apache Software Foundation License 2.0
 
 3rdparty/dlpack
 3rdparty/dmlc-core
+3rdparty/OpenCL-Headers
 
 
 BSD 2-clause License
diff --git a/apps/android_camera/app/src/main/jni/Android.mk b/apps/android_camera/app/src/main/jni/Android.mk
index 513666a4ecb4..2201f669653c 100644
--- a/apps/android_camera/app/src/main/jni/Android.mk
+++ b/apps/android_camera/app/src/main/jni/Android.mk
@@ -41,6 +41,7 @@ LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
 					$(ROOT_PATH)/src/runtime/rpc \
                     $(ROOT_PATH)/3rdparty/dlpack/include \
                     $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/OpenCL-Headers \
                     $(MY_PATH)
 
 LOCAL_MODULE = tvm4j_runtime_packed
diff --git a/apps/android_camera/app/src/main/jni/make/config.mk b/apps/android_camera/app/src/main/jni/make/config.mk
index 49e332665ad9..1f601b9afb29 100644
--- a/apps/android_camera/app/src/main/jni/make/config.mk
+++ b/apps/android_camera/app/src/main/jni/make/config.mk
@@ -34,7 +34,7 @@ APP_ABI = all
 APP_PLATFORM = android-24
 
 # whether enable OpenCL during compile
-USE_OPENCL = 0
+USE_OPENCL = 1
 
 # whether to enable Vulkan during compile
 USE_VULKAN = 0
diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h
index 658534780130..0aac7f170ab4 100644
--- a/apps/android_camera/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h
@@ -62,6 +62,8 @@
 #ifdef TVM_OPENCL_RUNTIME
 #include "../src/runtime/opencl/opencl_device_api.cc"
 #include "../src/runtime/opencl/opencl_module.cc"
+#include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
+#include "../src/runtime/opencl/texture_pool.cc"
 #include "../src/runtime/source_utils.cc"
 #endif
 
diff --git a/apps/android_deploy/README.md b/apps/android_deploy/README.md
index 32e601840f04..4cfd9eb9daf2 100644
--- a/apps/android_deploy/README.md
+++ b/apps/android_deploy/README.md
@@ -21,7 +21,7 @@ This folder contains Android Demo app that allows us to show how to deploy model
 
 You will need [JDK](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html), [Android SDK](https://developer.android.com/studio/index.html), [Android NDK](https://developer.android.com/ndk) and an Android device to use this. Make sure the `ANDROID_HOME` variable already points to your Android SDK folder or set it using `export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk]`. We use [Gradle](https://gradle.org) to build. Please follow [the installation instruction](https://gradle.org/install) for your operating system.
 
-Alternatively, you may execute Docker image we provide which contains the required packages. Use the command below to build the image and enter interactive session. Note, that building with OpenCL was not tested from Docker.
+Alternatively, you may execute Docker image we provide which contains the required packages. Use the command below to build the image and enter interactive session.
 
 ```bash
 ./docker/build.sh demo_android -it bash
@@ -50,7 +50,7 @@ dependencies {
 }
 ```
 
-Application default has CPU version TVM runtime flavor and follow below instruction to setup.
+Application default has CPU and GPU (OpenCL) versions TVM runtime flavor and follow below instruction to setup.
 In `app/src/main/jni/make` you will find JNI Makefile config `config.mk` and copy it to `app/src/main/jni` and modify it.
 
 ```bash
@@ -64,9 +64,6 @@ Here's a piece of example for `config.mk`.
 APP_ABI = arm64-v8a
 
 APP_PLATFORM = android-17
-
-# whether enable OpenCL during compile
-USE_OPENCL = 0
 ```
 
 Now use Gradle to compile JNI, resolve Java dependencies and build the Android application together with tvm4j. Run following script to generate the apk file.
@@ -82,28 +79,11 @@ Upload `tvmdemo-release.apk` to your Android device and install it.
 
 ### Build with OpenCL
 
-Application does not link with OpenCL library unless you configure it to. Modify JNI Makefile config `app/src/main/jni` with proper target OpenCL configuration.
-
-Here's a piece of example for `config.mk`.
-
-```makefile
-APP_ABI = arm64-v8a
-
-APP_PLATFORM = android-17
-
-# whether enable OpenCL during compile
-USE_OPENCL = 1
-
-# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
-ADD_C_INCLUDES = /opt/adrenosdk-osx/Development/Inc
-
-# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
-ADD_LDLIBS = libOpenCL.so
-```
-
-Note that you should specify the correct GPU development headers for your android device. Run `adb shell dumpsys | grep GLES` to find out what GPU your android device uses. It is very likely the library (libOpenCL.so) is already present on the mobile device. For instance, I found it under `/system/vendor/lib64`. You can do `adb pull /system/vendor/lib64/libOpenCL.so ./` to get the file to your desktop.
-
-After you setup the `config.mk`, follow the instructions in [Build APK](#buildapk) to build the Android package with OpenCL flavor.
+Application is building with OpenCL support by default.
+[OpenCL-wrapper](../../src/runtime/opencl/opencl_wrapper) is used and will dynamically load OpenCL library on the device.
+If the device doesn't have OpenCL library on it, then you'll see in the runtime that OpenCL library cannot be opened.
+If you want to build this application without OpenCL then set `USE_OPENCL = 0`
+in [config.mk](./app/src/main/jni/make/config.mk)
 
 ## Cross Compile and Run on Android Devices
 
diff --git a/apps/android_deploy/app/src/main/jni/Android.mk b/apps/android_deploy/app/src/main/jni/Android.mk
index 1b06a6bdb898..ad9cee9bbdb5 100644
--- a/apps/android_deploy/app/src/main/jni/Android.mk
+++ b/apps/android_deploy/app/src/main/jni/Android.mk
@@ -38,7 +38,8 @@ LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
 
 LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
                     $(ROOT_PATH)/3rdparty/dlpack/include \
-                    $(ROOT_PATH)/3rdparty/dmlc-core/include
+                    $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/OpenCL-Headers
 
 LOCAL_MODULE = tvm4j_runtime_packed
 
diff --git a/apps/android_deploy/app/src/main/jni/make/config.mk b/apps/android_deploy/app/src/main/jni/make/config.mk
index bcd56e37896d..b06f42b2647a 100644
--- a/apps/android_deploy/app/src/main/jni/make/config.mk
+++ b/apps/android_deploy/app/src/main/jni/make/config.mk
@@ -34,7 +34,7 @@ APP_ABI = all
 APP_PLATFORM = android-17
 
 # whether enable OpenCL during compile
-USE_OPENCL = 0
+USE_OPENCL = 1
 
 # the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
 ADD_C_INCLUDES =
diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
index 725b5e1d3b7a..a2f10701d6df 100644
--- a/apps/android_deploy/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
@@ -47,4 +47,7 @@
 #ifdef TVM_OPENCL_RUNTIME
 #include "../src/runtime/opencl/opencl_device_api.cc"
 #include "../src/runtime/opencl/opencl_module.cc"
+#include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
+#include "../src/runtime/opencl/texture_pool.cc"
+#include "../src/runtime/source_utils.cc"
 #endif
diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md
index 2e301af6d996..d0a11b6121dc 100644
--- a/apps/android_rpc/README.md
+++ b/apps/android_rpc/README.md
@@ -74,33 +74,11 @@ $ANDROID_HOME/platform-tools/adb uninstall org.apache.tvm.tvmrpc
 
 ### Build with OpenCL
 
-This application does not link any OpenCL library unless you configure it to. In `app/src/main/jni/make` you will find JNI Makefile config `config.mk`. Copy it to `app/src/main/jni` and modify it.
-
-```bash
-cd apps/android_rpc/app/src/main/jni
-cp make/config.mk .
-```
-
-Here's a piece of example for `config.mk`.
-
-```makefile
-APP_ABI = arm64-v8a
-
-APP_PLATFORM = android-17
-
-# whether enable OpenCL during compile
-USE_OPENCL = 1
-
-# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
-ADD_C_INCLUDES = /opt/adrenosdk-osx/Development/Inc
-
-# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
-ADD_LDLIBS = libOpenCL.so
-```
-
-Note that you should specify the correct GPU development headers for your android device. Run `adb shell dumpsys | grep GLES` to find out what GPU your android device uses. It is very likely the library (libOpenCL.so) is already present on the mobile device. For instance, I found it under `/system/vendor/lib64`. You can do `adb pull /system/vendor/lib64/libOpenCL.so ./` to get the file to your desktop.
-
-After you setup the `config.mk`, follow the instructions in [Build APK](#buildapk) to build the Android package.
+Application is building with OpenCL support by default.
+[OpenCL-wrapper](../../src/runtime/opencl/opencl_wrapper) is used and will dynamically load OpenCL library on the device.
+If the device doesn't have OpenCL library on it, then you'll see in the runtime that OpenCL library cannot be opened.
+If you want to build this application without OpenCL then set `USE_OPENCL = 0`
+in [config.mk](./app/src/main/jni/make/config.mk)
 
 ## Cross Compile and Run on Android Devices
 
diff --git a/apps/android_rpc/app/src/main/jni/Android.mk b/apps/android_rpc/app/src/main/jni/Android.mk
index 1b06a6bdb898..ad9cee9bbdb5 100644
--- a/apps/android_rpc/app/src/main/jni/Android.mk
+++ b/apps/android_rpc/app/src/main/jni/Android.mk
@@ -38,7 +38,8 @@ LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
 
 LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
                     $(ROOT_PATH)/3rdparty/dlpack/include \
-                    $(ROOT_PATH)/3rdparty/dmlc-core/include
+                    $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/OpenCL-Headers
 
 LOCAL_MODULE = tvm4j_runtime_packed
 
diff --git a/apps/android_rpc/app/src/main/jni/make/config.mk b/apps/android_rpc/app/src/main/jni/make/config.mk
index 851430cd42a9..855a0af19021 100644
--- a/apps/android_rpc/app/src/main/jni/make/config.mk
+++ b/apps/android_rpc/app/src/main/jni/make/config.mk
@@ -34,7 +34,7 @@ APP_ABI = all
 APP_PLATFORM = android-24
 
 # whether enable OpenCL during compile
-USE_OPENCL = 0
+USE_OPENCL = 1
 
 # whether to enable Vulkan during compile
 USE_VULKAN = 0
diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
index 543c9c85334e..17a20bbaf9a0 100644
--- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -64,6 +64,7 @@
 #ifdef TVM_OPENCL_RUNTIME
 #include "../src/runtime/opencl/opencl_device_api.cc"
 #include "../src/runtime/opencl/opencl_module.cc"
+#include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
 #include "../src/runtime/opencl/texture_pool.cc"
 #include "../src/runtime/source_utils.cc"
 #endif
diff --git a/apps/cpp_rpc/README.md b/apps/cpp_rpc/README.md
index d073fca81921..58eb68055f4d 100644
--- a/apps/cpp_rpc/README.md
+++ b/apps/cpp_rpc/README.md
@@ -37,7 +37,15 @@ This folder contains a simple recipe to make RPC server in c++.
   # Path to the desired C++ cross compiler
   set(CMAKE_CXX_COMPILER /path/to/cross/compiler/executable)
 ```
-- If linking against a custom device OpenCL library is needed, in the config specify the path to the OpenCL SDK containing the include/CL headers and lib/ or lib64/libOpenCL.so:
+- If you need to build cpp_rpc with OpenCL support, specify variable `USE_OPENCL` in the config:
+  ```
+  set(USE_OPENCL ON)
+  ```
+  In this case [OpenCL-wrapper](../../src/runtime/opencl/opencl_wrapper) or OpenCL installed to your system will be used.
+  When OpenCL-wrapper is used, it will dynamically load OpenCL library on the device.
+  If the device doesn't have OpenCL library on it, then you'll see in the runtime that OpenCL library cannot be opened.
+
+  If linking against a custom device OpenCL library is needed, in the config specify the path to the OpenCL SDK containing the include/CL headers and lib/ or lib64/libOpenCL.so:
 ```
   set(USE_OPENCL /path/to/opencl-sdk)
 ```
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 22a548d29895..679f5c459e87 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -65,7 +65,8 @@ set(USE_AOCL OFF)
 # Whether enable OpenCL runtime
 #
 # Possible values:
-# - ON: enable OpenCL with cmake's auto search
+# - ON: enable OpenCL with OpenCL wrapper to remove dependency during build
+#       time and trigger dynamic search and loading of OpenCL in runtime
 # - OFF: disable OpenCL
 # - /path/to/opencl-sdk: use specific path to opencl-sdk
 set(USE_OPENCL OFF)
diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake
index 430af7e8722c..e738df7c564c 100644
--- a/cmake/modules/OpenCL.cmake
+++ b/cmake/modules/OpenCL.cmake
@@ -15,15 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# OPENCL Module
-find_opencl(${USE_OPENCL})
-
-if(OpenCL_FOUND)
-  # always set the includedir when cuda is available
-  # avoid global retrigger of cmake
-  include_directories(SYSTEM ${OpenCL_INCLUDE_DIRS})
-endif(OpenCL_FOUND)
-
 if(USE_SDACCEL)
   message(STATUS "Build with SDAccel support")
   tvm_file_glob(GLOB RUNTIME_SDACCEL_SRCS src/runtime/opencl/sdaccel/*.cc)
@@ -49,12 +40,23 @@ else()
 endif(USE_AOCL)
 
 if(USE_OPENCL)
-  if (NOT OpenCL_FOUND)
-    find_package(OpenCL REQUIRED)
-  endif()
-  message(STATUS "Build with OpenCL support")
   tvm_file_glob(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
-  list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenCL_LIBRARIES})
+
+  if(${USE_OPENCL} MATCHES ${IS_TRUE_PATTERN})
+    message(WARNING "Build with OpenCL wrapper")
+    file_glob_append(RUNTIME_OPENCL_SRCS
+      "src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
+    )
+    include_directories(SYSTEM "3rdparty/OpenCL-Headers")
+  else()
+    find_opencl(${USE_OPENCL})
+    if(NOT OpenCL_FOUND)
+        message(FATAL_ERROR "Error! Cannot find specified OpenCL library")
+    endif()
+    message(STATUS "Build with OpenCL support")
+    include_directories(SYSTEM ${OpenCL_INCLUDE_DIRS})
+    list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenCL_LIBRARIES})
+  endif()
 
   if(DEFINED USE_OPENCL_GTEST AND EXISTS ${USE_OPENCL_GTEST})
     file_glob_append(RUNTIME_OPENCL_SRCS
diff --git a/cmake/utils/FindOpenCL.cmake b/cmake/utils/FindOpenCL.cmake
index f2931332fc90..8eb35ab3993e 100644
--- a/cmake/utils/FindOpenCL.cmake
+++ b/cmake/utils/FindOpenCL.cmake
@@ -21,7 +21,7 @@
 # Usage:
 #   find_opencl(${USE_OPENCL})
 #
-# - When USE_OPENCL=ON, use auto search
+# - When USE_OPENCL=ON, use OpenCL wrapper for dynamic linking
 # - When USE_OPENCL=/path/to/opencl-sdk-path, use the sdk.
 #   Can be useful when cross compiling and cannot rely on
 #   CMake to provide the correct library as part of the
diff --git a/gallery/how_to/deploy_models/deploy_model_on_android.py b/gallery/how_to/deploy_models/deploy_model_on_android.py
index 10e108239ee7..4bf86e2981a1 100644
--- a/gallery/how_to/deploy_models/deploy_model_on_android.py
+++ b/gallery/how_to/deploy_models/deploy_model_on_android.py
@@ -137,11 +137,10 @@
 #
 #   # the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc
 #   ADD_C_INCLUDES += /work/adrenosdk-linux-5_0/Development/Inc
-#   # downloaded from https://github.com/KhronosGroup/OpenCL-Headers
-#   ADD_C_INCLUDES += /usr/local/OpenCL-Headers/
+#   ADD_C_INCLUDES =
 #
 #   # the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so
-#   ADD_LDLIBS = /workspace/pull-from-android-device/libOpenCL.so
+#   ADD_LDLIBS =
 #
 # .. note::
 #
diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc
index 9ae80d59d565..2fb157aac6af 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -232,7 +232,7 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre
       cl_int err;
       cl_device_id dev = w->devices[device_id];
       programs_[func_name][device_id] =
-          clCreateProgramWithBinary(w->context, 1, &dev, &len, &s, NULL, &err);
+          clCreateProgramWithBinary(w->context, 1, &dev, &len, &s, nullptr, &err);
       OPENCL_CHECK_ERROR(err);
     } else {
       LOG(FATAL) << "Unknown OpenCL format " << fmt_;
diff --git a/src/runtime/opencl/opencl_wrapper/README.md b/src/runtime/opencl/opencl_wrapper/README.md
new file mode 100644
index 000000000000..7597a442c1a9
--- /dev/null
+++ b/src/runtime/opencl/opencl_wrapper/README.md
@@ -0,0 +1,25 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# OpenCL Wrapper
+
+This wrapper helps dynamically loading OpenCL library. It allows us to avoid of
+looking for and copying library from phone to host, looking for OpenCL SDK.
+
+This can be done because OpenCL is a standard and number of functions are
+limited. We can safely wrap all required functions and their number will not
+grow.
diff --git a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
new file mode 100644
index 000000000000..c447ebcb5339
--- /dev/null
+++ b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
@@ -0,0 +1,574 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file opencl_wrapper.cc
+ * \brief This wrapper is actual for OpenCL 1.2, but can be easily upgraded
+ * when TVM will use newer version of OpenCL
+ */
+
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#define DMLC_USE_LOGGING_LIBRARY <tvm/runtime/logging.h>
+#include <tvm/runtime/logging.h>
+
+#include <vector>
+
+namespace {
+#if defined(__APPLE__) || defined(__MACOSX)
+static const std::vector<const char*> default_so_paths = {
+    "libOpenCL.so", "/System/Library/Frameworks/OpenCL.framework/OpenCL"};
+#elif defined(__ANDROID__)
+static const std::vector<const char*> default_so_paths = {
+    "libOpenCL.so",
+    "/system/lib64/libOpenCL.so",
+    "/system/vendor/lib64/libOpenCL.so",
+    "/system/vendor/lib64/egl/libGLES_mali.so",
+    "/system/vendor/lib64/libPVROCL.so",
+    "/data/data/org.pocl.libs/files/lib64/libpocl.so",
+    "/system/lib/libOpenCL.so",
+    "/system/vendor/lib/libOpenCL.so",
+    "/system/vendor/lib/egl/libGLES_mali.so",
+    "/system/vendor/lib/libPVROCL.so",
+    "/data/data/org.pocl.libs/files/lib/libpocl.so"};
+#elif defined(_WIN32)
+static const std::vector<const TCHAR*> default_so_paths = {__TEXT("OpenCL.dll")};
+#elif defined(__linux__)
+static const std::vector<const char*> default_so_paths = {"libOpenCL.so",
+                                                          "/usr/lib/libOpenCL.so",
+                                                          "/usr/local/lib/libOpenCL.so",
+                                                          "/usr/local/lib/libpocl.so",
+                                                          "/usr/lib64/libOpenCL.so",
+                                                          "/usr/lib32/libOpenCL.so"};
+#endif
+
+class LibOpenCLWrapper {
+ public:
+  static LibOpenCLWrapper& getInstance() {
+    static LibOpenCLWrapper instance;
+    return instance;
+  }
+  LibOpenCLWrapper(const LibOpenCLWrapper&) = delete;
+  LibOpenCLWrapper& operator=(const LibOpenCLWrapper&) = delete;
+  void* getOpenCLFunction(const char* funcName) {
+    if (m_libHandler == nullptr) openLibOpenCL();
+#if defined(_WIN32)
+    return GetProcAddress(m_libHandler, funcName);
+#else
+    return dlsym(m_libHandler, funcName);
+#endif
+  }
+
+ private:
+  LibOpenCLWrapper() {}
+  ~LibOpenCLWrapper() {
+#if defined(_WIN32)
+    if (m_libHandler) FreeLibrary(m_libHandler);
+#else
+    if (m_libHandler) dlclose(m_libHandler);
+#endif
+  }
+  void openLibOpenCL() {
+    for (const auto it : default_so_paths) {
+#if defined(_WIN32)
+      m_libHandler = LoadLibrary(it);
+#else
+      m_libHandler = dlopen(it, RTLD_LAZY);
+#endif
+      if (m_libHandler != nullptr) return;
+    }
+    ICHECK(m_libHandler != nullptr) << "Error! Cannot open libOpenCL!";
+  }
+
+ private:
+#if defined(_WIN32)
+  HMODULE m_libHandler = nullptr;
+#else
+  void* m_libHandler = nullptr;
+#endif
+};
+
+// Function pointers declaration
+using f_pfn_notify = void (*)(const char*, const void*, size_t, void*);
+using f_clGetPlatformIDs = cl_int (*)(cl_uint, cl_platform_id*, cl_uint*);
+using f_clGetPlatformInfo = cl_int (*)(cl_platform_id, cl_platform_info, size_t, void*, size_t*);
+using f_clGetDeviceIDs = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id*,
+                                    cl_uint*);
+using f_clGetDeviceInfo = cl_int (*)(cl_device_id, cl_device_info, size_t, void*, size_t*);
+using f_clCreateContext = cl_context (*)(const cl_context_properties*, cl_uint, const cl_device_id*,
+                                         f_pfn_notify, void*, cl_int*);
+using f_clReleaseContext = cl_int (*)(cl_context);
+using f_clReleaseCommandQueue = cl_int (*)(cl_command_queue);
+using f_clGetCommandQueueInfo = cl_int (*)(cl_command_queue, cl_command_queue_info, size_t, void*,
+                                           size_t*);
+using f_clCreateBuffer = cl_mem (*)(cl_context, cl_mem_flags, size_t, void*, cl_int*);
+using f_clCreateImage = cl_mem (*)(cl_context, cl_mem_flags, const cl_image_format*,
+                                   const cl_image_desc*, void*, cl_int*);
+using f_clReleaseMemObject = cl_int (*)(cl_mem);
+using f_clCreateProgramWithSource = cl_program (*)(cl_context, cl_uint, const char**, const size_t*,
+                                                   cl_int*);
+using f_clCreateProgramWithBinary = cl_program (*)(cl_context, cl_uint, const cl_device_id*,
+                                                   const size_t*, const unsigned char**, cl_int*,
+                                                   cl_int*);
+using f_clReleaseProgram = cl_int (*)(cl_program);
+using f_clBuildProgram = cl_int (*)(cl_program, cl_uint, const cl_device_id*, const char*,
+                                    void (*pfn_notify)(cl_program program, void* user_data), void*);
+using f_clGetProgramBuildInfo = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t,
+                                           void*, size_t*);
+using f_clCreateKernel = cl_kernel (*)(cl_program, const char*, cl_int*);
+using f_clReleaseKernel = cl_int (*)(cl_kernel);
+using f_clSetKernelArg = cl_int (*)(cl_kernel, cl_uint, size_t, const void*);
+using f_clWaitForEvents = cl_int (*)(cl_uint, const cl_event*);
+using f_clCreateUserEvent = cl_event (*)(cl_context, cl_int*);
+using f_clGetEventProfilingInfo = cl_int (*)(cl_event, cl_profiling_info, size_t, void*, size_t*);
+using f_clFlush = cl_int (*)(cl_command_queue);
+using f_clFinish = cl_int (*)(cl_command_queue);
+using f_clEnqueueReadBuffer = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*,
+                                         cl_uint, const cl_event*, cl_event*);
+using f_clEnqueueWriteBuffer = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t,
+                                          const void*, cl_uint, const cl_event*, cl_event*);
+using f_clEnqueueCopyBuffer = cl_int (*)(cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t,
+                                         cl_uint, const cl_event*, cl_event*);
+using f_clEnqueueReadImage = cl_int (*)(cl_command_queue, cl_mem, cl_bool, const size_t*,
+                                        const size_t*, size_t, size_t, void*, cl_uint,
+                                        const cl_event*, cl_event*);
+using f_clEnqueueWriteImage = cl_int (*)(cl_command_queue, cl_mem, cl_bool, const size_t*,
+                                         const size_t*, size_t, size_t, const void*, cl_uint,
+                                         const cl_event*, cl_event*);
+using f_clEnqueueCopyImage = cl_int (*)(cl_command_queue, cl_mem, cl_mem, const size_t*,
+                                        const size_t*, const size_t*, cl_uint, const cl_event*,
+                                        cl_event*);
+using f_clEnqueueCopyImageToBuffer = cl_int (*)(cl_command_queue, cl_mem, cl_mem, const size_t*,
+                                                const size_t*, size_t, cl_uint, const cl_event*,
+                                                cl_event*);
+using f_clEnqueueCopyBufferToImage = cl_int (*)(cl_command_queue, cl_mem, cl_mem, size_t,
+                                                const size_t*, const size_t*, cl_uint,
+                                                const cl_event*, cl_event*);
+using f_clEnqueueNDRangeKernel = cl_int (*)(cl_command_queue, cl_kernel, cl_uint, const size_t*,
+                                            const size_t*, const size_t*, cl_uint, const cl_event*,
+                                            cl_event*);
+using f_clCreateCommandQueue = cl_command_queue (*)(cl_context, cl_device_id,
+                                                    cl_command_queue_properties, cl_int*);
+}  // namespace
+
+cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clGetPlatformIDs)lib.getOpenCLFunction("clGetPlatformIDs");
+  if (func) {
+    return func(num_entries, platforms, num_platforms);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name,
+                         size_t param_value_size, void* param_value, size_t* param_value_size_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clGetPlatformInfo)lib.getOpenCLFunction("clGetPlatformInfo");
+  if (func) {
+    return func(platform, param_name, param_value_size, param_value, param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type, cl_uint num_entries,
+                      cl_device_id* devices, cl_uint* num_devices) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clGetDeviceIDs)lib.getOpenCLFunction("clGetDeviceIDs");
+  if (func) {
+    return func(platform, device_type, num_entries, devices, num_devices);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clGetDeviceInfo(cl_device_id device, cl_device_info param_name, size_t param_value_size,
+                       void* param_value, size_t* param_value_size_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clGetDeviceInfo)lib.getOpenCLFunction("clGetDeviceInfo");
+  if (func) {
+    return func(device, param_name, param_value_size, param_value, param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_context clCreateContext(const cl_context_properties* properties, cl_uint num_devices,
+                           const cl_device_id* devices,
+                           void (*pfn_notify)(const char*, const void*, size_t, void*),
+                           void* user_data, cl_int* errcode_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clCreateContext)lib.getOpenCLFunction("clCreateContext");
+  if (func) {
+    return func(properties, num_devices, devices, pfn_notify, user_data, errcode_ret);
+  } else {
+    return nullptr;
+  }
+}
+
+cl_int clReleaseContext(cl_context context) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clReleaseContext)lib.getOpenCLFunction("clReleaseContext");
+
+  if (func) {
+    return func(context);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clReleaseCommandQueue(cl_command_queue command_queue) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clReleaseCommandQueue)lib.getOpenCLFunction("clReleaseCommandQueue");
+  if (func) {
+    return func(command_queue);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clGetCommandQueueInfo(cl_command_queue command_queue, cl_command_queue_info param_name,
+                             size_t param_value_size, void* param_value,
+                             size_t* param_value_size_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clGetCommandQueueInfo)lib.getOpenCLFunction("clGetCommandQueueInfo");
+  if (func) {
+    return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void* host_ptr,
+                      cl_int* errcode_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clCreateBuffer)lib.getOpenCLFunction("clCreateBuffer");
+  if (func) {
+    return func(context, flags, size, host_ptr, errcode_ret);
+  } else {
+    return nullptr;
+  }
+}
+
+cl_mem clCreateImage(cl_context context, cl_mem_flags flags, const cl_image_format* image_format,
+                     const cl_image_desc* image_desc, void* host_ptr, cl_int* errcode_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clCreateImage)lib.getOpenCLFunction("clCreateImage");
+  if (func) {
+    return func(context, flags, image_format, image_desc, host_ptr, errcode_ret);
+  } else {
+    return nullptr;
+  }
+}
+
+cl_int clReleaseMemObject(cl_mem memobj) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clReleaseMemObject)lib.getOpenCLFunction("clReleaseMemObject");
+  if (func) {
+    return func(memobj);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_program clCreateProgramWithSource(cl_context context, cl_uint count, const char** strings,
+                                     const size_t* lengths, cl_int* errcode_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clCreateProgramWithSource)lib.getOpenCLFunction("clCreateProgramWithSource");
+  if (func) {
+    return func(context, count, strings, lengths, errcode_ret);
+  } else {
+    return nullptr;
+  }
+}
+
+cl_program clCreateProgramWithBinary(cl_context context, cl_uint num_devices,
+                                     const cl_device_id* device_list, const size_t* lengths,
+                                     const unsigned char** binaries, cl_int* binary_status,
+                                     cl_int* errcode_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clCreateProgramWithBinary)lib.getOpenCLFunction("clCreateProgramWithBinary");
+  if (func) {
+    return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret);
+  } else {
+    return nullptr;
+  }
+}
+
+cl_int clReleaseProgram(cl_program program) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clReleaseProgram)lib.getOpenCLFunction("clReleaseProgram");
+  if (func) {
+    return func(program);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clBuildProgram(cl_program program, cl_uint num_devices, const cl_device_id* device_list,
+                      const char* options, void (*pfn_notify)(cl_program program, void* user_data),
+                      void* user_data) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clBuildProgram)lib.getOpenCLFunction("clBuildProgram");
+  if (func) {
+    return func(program, num_devices, device_list, options, pfn_notify, user_data);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device,
+                             cl_program_build_info param_name, size_t param_value_size,
+                             void* param_value, size_t* param_value_size_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clGetProgramBuildInfo)lib.getOpenCLFunction("clGetProgramBuildInfo");
+  if (func) {
+    return func(program, device, param_name, param_value_size, param_value, param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_kernel clCreateKernel(cl_program program, const char* kernel_name, cl_int* errcode_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clCreateKernel)lib.getOpenCLFunction("clCreateKernel");
+  if (func) {
+    return func(program, kernel_name, errcode_ret);
+  } else {
+    return nullptr;
+  }
+}
+
+cl_int clReleaseKernel(cl_kernel kernel) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clReleaseKernel)lib.getOpenCLFunction("clReleaseKernel");
+  if (func) {
+    return func(kernel);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void* arg_value) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clSetKernelArg)lib.getOpenCLFunction("clSetKernelArg");
+  if (func) {
+    return func(kernel, arg_index, arg_size, arg_value);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clWaitForEvents(cl_uint num_events, const cl_event* event_list) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clWaitForEvents)lib.getOpenCLFunction("clWaitForEvents");
+  if (func) {
+    return func(num_events, event_list);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_event clCreateUserEvent(cl_context context, cl_int* errcode_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clCreateUserEvent)lib.getOpenCLFunction("clCreateUserEvent");
+  if (func) {
+    return func(context, errcode_ret);
+  } else {
+    return nullptr;
+  }
+}
+
+cl_int clGetEventProfilingInfo(cl_event event, cl_profiling_info param_name,
+                               size_t param_value_size, void* param_value,
+                               size_t* param_value_size_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clGetEventProfilingInfo)lib.getOpenCLFunction("clGetEventProfilingInfo");
+  if (func) {
+    return func(event, param_name, param_value_size, param_value, param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clFlush(cl_command_queue command_queue) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clFlush)lib.getOpenCLFunction("clFlush");
+  if (func) {
+    return func(command_queue);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clFinish(cl_command_queue command_queue) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clFinish)lib.getOpenCLFunction("clFinish");
+  if (func) {
+    return func(command_queue);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
+                           size_t offset, size_t size, void* ptr, cl_uint num_events_in_wait_list,
+                           const cl_event* event_wait_list, cl_event* event) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueReadBuffer)lib.getOpenCLFunction("clEnqueueReadBuffer");
+  if (func) {
+    return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list,
+                event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
+                            size_t offset, size_t size, const void* ptr,
+                            cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
+                            cl_event* event) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueWriteBuffer)lib.getOpenCLFunction("clEnqueueWriteBuffer");
+  if (func) {
+    return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list,
+                event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clEnqueueCopyBuffer(cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
+                           size_t src_offset, size_t dst_offset, size_t size,
+                           cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
+                           cl_event* event) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueCopyBuffer)lib.getOpenCLFunction("clEnqueueCopyBuffer");
+  if (func) {
+    return func(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, size,
+                num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clEnqueueReadImage(cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,
+                          const size_t* origin, const size_t* region, size_t row_pitch,
+                          size_t slice_pitch, void* ptr, cl_uint num_events_in_wait_list,
+                          const cl_event* event_wait_list, cl_event* event) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueReadImage)lib.getOpenCLFunction("clEnqueueReadImage");
+  if (func) {
+    return func(command_queue, image, blocking_read, origin, region, row_pitch, slice_pitch, ptr,
+                num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clEnqueueWriteImage(cl_command_queue command_queue, cl_mem image, cl_bool blocking_write,
+                           const size_t* origin, const size_t* region, size_t input_row_pitch,
+                           size_t input_slice_pitch, const void* ptr,
+                           cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
+                           cl_event* event) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueWriteImage)lib.getOpenCLFunction("clEnqueueWriteImage");
+  if (func) {
+    return func(command_queue, image, blocking_write, origin, region, input_row_pitch,
+                input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clEnqueueCopyImage(cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image,
+                          const size_t* src_origin, const size_t* dst_origin, const size_t* region,
+                          cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
+                          cl_event* event) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueCopyImage)lib.getOpenCLFunction("clEnqueueCopyImage");
+  if (func) {
+    return func(command_queue, src_image, dst_image, src_origin, dst_origin, region,
+                num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clEnqueueCopyImageToBuffer(cl_command_queue command_queue, cl_mem src_image,
+                                  cl_mem dst_buffer, const size_t* src_origin, const size_t* region,
+                                  size_t dst_offset, cl_uint num_events_in_wait_list,
+                                  const cl_event* event_wait_list, cl_event* event) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueCopyImageToBuffer)lib.getOpenCLFunction("clEnqueueCopyImageToBuffer");
+  if (func) {
+    return func(command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
+                num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clEnqueueCopyBufferToImage(cl_command_queue command_queue, cl_mem src_buffer,
+                                  cl_mem dst_image, size_t src_offset, const size_t* dst_origin,
+                                  const size_t* region, cl_uint num_events_in_wait_list,
+                                  const cl_event* event_wait_list, cl_event* event) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueCopyBufferToImage)lib.getOpenCLFunction("clEnqueueCopyBufferToImage");
+  if (func) {
+    return func(command_queue, src_buffer, dst_image, src_offset, dst_origin, region,
+                num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
+                              const size_t* global_work_offset, const size_t* global_work_size,
+                              const size_t* local_work_size, cl_uint num_events_in_wait_list,
+                              const cl_event* event_wait_list, cl_event* event) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueNDRangeKernel)lib.getOpenCLFunction("clEnqueueNDRangeKernel");
+  if (func) {
+    return func(command_queue, kernel, work_dim, global_work_offset, global_work_size,
+                local_work_size, num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id device,
+                                      cl_command_queue_properties properties, cl_int* errcode_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clCreateCommandQueue)lib.getOpenCLFunction("clCreateCommandQueue");
+  if (func) {
+    return func(context, device, properties, errcode_ret);
+  } else {
+    return nullptr;
+  }
+}
diff --git a/tests/cpp-runtime/opencl/opencl_timer_test.cc b/tests/cpp-runtime/opencl/opencl_timer_test.cc
index 6faf2f6a1482..f6546c25aca5 100644
--- a/tests/cpp-runtime/opencl/opencl_timer_test.cc
+++ b/tests/cpp-runtime/opencl/opencl_timer_test.cc
@@ -44,11 +44,11 @@ TEST(OpenCLTimerNode, nested_timers) {
     cl_event ev = clCreateUserEvent(workspace->context, &err);
     OPENCL_CHECK_ERROR(err);
     cl_mem cl_buf = clCreateBuffer(workspace->context, CL_MEM_READ_ONLY, BUFF_SIZE * sizeof(cl_int),
-                                   NULL, &err);
+                                   nullptr, &err);
     OPENCL_CHECK_ERROR(err);
     queue = workspace->GetQueue(thr->device);
     OPENCL_CALL(clEnqueueWriteBuffer(queue, cl_buf, false, 0, BUFF_SIZE * sizeof(cl_int), tmp_buf,
-                                     0, NULL, &ev));
+                                     0, nullptr, &ev));
     OPENCL_CALL(clReleaseMemObject(cl_buf));
     workspace->events[thr->device.device_id].push_back(ev);
     nested_timer->Stop();

From 86a5ceec271f241451b641d10b4c27e0cdeb1e89 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Tue, 15 Nov 2022 19:07:20 -0600
Subject: [PATCH 586/704] [TVMScript] Use tir::Evaluate if expression is in
 statement context (#13396)

* [TVMScript] Use tir::Evaluate if expression is in statement context

For the previous version of the parser, this was special-cased for
some intrinsic operators.  After the new TVMScript was enabled in
https://github.com/apache/tvm/pull/12496, any `PrimExpr` that appears
in the body of a statement is silently ignored.  This commit updates
the parser to instead wrap the bare `PrimExpr` in a `tir::Evaluate`
node.

This change effectively allows [expression
statements](https://docs.python.org/3/reference/simple_stmts.html#expression-statements)
in TVMScript, which are converted to `tir::Evaluate` nodes during
parsing.

* Update to print T.evaluate() for readability, except for CallNode
---
 python/tvm/script/parser/tir/parser.py        |  5 ++++
 src/printer/tvmscript_printer.cc              | 19 +++++++-------
 .../unittest/test_tvmscript_roundtrip.py      | 10 +++++++
 .../unittest/test_tvmscript_syntax_sugar.py   | 26 +++++++++++++++++++
 4 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/python/tvm/script/parser/tir/parser.py b/python/tvm/script/parser/tir/parser.py
index 1370758f5a5b..0e74114ba29c 100644
--- a/python/tvm/script/parser/tir/parser.py
+++ b/python/tvm/script/parser/tir/parser.py
@@ -20,6 +20,7 @@
 from functools import partial
 from typing import Any
 
+import tvm
 from tvm.ir import PrimType
 from tvm.tir import Buffer, IterVar, PrimExpr, Var
 
@@ -411,6 +412,10 @@ def visit_expr_stmt(self: Parser, node: doc.Expr) -> None:
     if isinstance(res, Frame):
         res.add_callback(partial(res.__exit__, None, None, None))
         res.__enter__()
+    elif isinstance(res, PrimExpr):
+        T.evaluate(res)
+    elif isinstance(res, (int, bool)):
+        T.evaluate(tvm.tir.const(res))
 
 
 @dispatch.register(token="tir", type_name="If")
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index d7a3a406e352..f1d68ee43845 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -1275,16 +1275,17 @@ Doc TVMScriptPrinter::VisitStmt_(const SeqStmtNode* op) {
 }
 
 Doc TVMScriptPrinter::VisitStmt_(const EvaluateNode* op) {
-  if (auto* call = op->value.as<CallNode>()) {
-    if (call->op.same_as(builtin::assume())) {
-      Doc doc;
-      doc << tir_prefix_ << ".assume(" << Print(call->args[0]) << ")";
-      return doc;
-    }
-  }
-
+  // When parsing TVMScript, a PrimExpr that occurs as a statement is
+  // automatically wrapped in `tir::Evaluate`.  Therefore, when
+  // printing, it's only necessary to print the value.  For
+  // readability, though, we still print T.evaluate() when the
+  // expression is something other than a call node.
   Doc doc;
-  doc << tir_prefix_ << ".evaluate(" << Print(op->value) << ")";
+  if (op->value.as<CallNode>()) {
+    doc << Print(op->value);
+  } else {
+    doc << tir_prefix_ << ".evaluate(" << Print(op->value) << ")";
+  }
   return doc;
 }
 
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index f22e61e1838d..b8c8379c8a16 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3458,6 +3458,15 @@ def func() -> None:
     return func
 
 
+def implicit_evaluate():
+    @T.prim_func
+    def func(A: T.Buffer[1, "int32"]):
+        T.evaluate(T.assume(A[0] == 5))
+        A[0] = 10
+
+    return func
+
+
 ir_generator = tvm.testing.parameter(
     opt_gemm_normalize,
     opt_gemm_lower,
@@ -3509,6 +3518,7 @@ def func() -> None:
     bool_primitive,
     bool_cast,
     return_none,
+    implicit_evaluate,
 )
 
 
diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py
index 16f1cb04945a..a39354b9552a 100644
--- a/tests/python/unittest/test_tvmscript_syntax_sugar.py
+++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py
@@ -402,5 +402,31 @@ def int64_grid_expanded(
     assert_structural_equal(int64_grid, int64_grid_expanded)
 
 
+def test_implicit_evaluate_assume():
+    @T.prim_func
+    def explicit(A: T.Buffer[1, "int32"]):
+        T.evaluate(T.assume(A[0] == 5))
+        A[0] = 10
+
+    @T.prim_func
+    def implicit(A: T.Buffer[1, "int32"]):
+        T.assume(A[0] == 5)
+        A[0] = 10
+
+    assert_structural_equal(implicit, explicit)
+
+
+def test_implicit_evaluate_call_extern():
+    @T.prim_func
+    def explicit(A: T.Buffer[1, "int32"]):
+        T.evaluate(T.call_extern("extern_func", A.data, dtype="int32"))
+
+    @T.prim_func
+    def implicit(A: T.Buffer[1, "int32"]):
+        T.call_extern("extern_func", A.data, dtype="int32")
+
+    assert_structural_equal(implicit, explicit)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 49c5d2f17304fd4c109fb60fdc35f4627b89511c Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Tue, 15 Nov 2022 19:47:19 -0800
Subject: [PATCH 587/704] [Hexagon] Change single dma alloc buffers to be vtcm
 (#13374)

Remove broken schedule from the test
---
 .../test_hexagon/test_async_dma_pipeline.py   | 129 ------------------
 1 file changed, 129 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
index ef9b142d6f27..a0b5e1e7e42f 100644
--- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -113,124 +113,6 @@ def evaluate(
     return round(time.mean * 1000, 4)
 
 
-def get_single_dma_schedule(size_a, size_w):
-    """Generate single DMA schedule."""
-    a_shape = (size_a, VRMPY_SIZE_B)
-    w_shape = (size_w, VRMPY_SIZE_B)
-    out_shape = (size_a, VRMPY_SIZE_INT32)
-
-    a_bytes = size_a * VRMPY_SIZE_B
-    w_bytes = size_w * VRMPY_SIZE_B
-
-    @T.prim_func
-    def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None:
-        T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        a_buffer = T.match_buffer(a_input, a_shape, dtype="uint8", scope="global")
-        w_buffer = T.match_buffer(b_input, w_shape, dtype="uint8", scope="global")
-        c_buffer = T.match_buffer(c_output, out_shape, dtype="int32", scope="global")
-        a_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", scope="global")
-        w_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", scope="global")
-        c_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", scope="global")
-        T.evaluate(
-            T.tvm_call_packed(
-                "device_api.hexagon.mem_copy_DLTensor",
-                T.tvm_stack_make_array(
-                    a_global_vtcm.data,
-                    T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
-                    0,
-                    2,
-                    a_global_vtcm.dtype,
-                    0,
-                    dtype="handle",
-                ),
-                T.tvm_stack_make_array(
-                    a_buffer.data,
-                    T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
-                    0,
-                    2,
-                    a_buffer.dtype,
-                    0,
-                    dtype="handle",
-                ),
-                T.Cast("int", a_bytes),
-                dtype="int32",
-            )
-        )
-        T.evaluate(
-            T.tvm_call_packed(
-                "device_api.hexagon.mem_copy_DLTensor",
-                T.tvm_stack_make_array(
-                    w_global_vtcm.data,
-                    T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"),
-                    0,
-                    2,
-                    w_global_vtcm.dtype,
-                    0,
-                    dtype="handle",
-                ),
-                T.tvm_stack_make_array(
-                    w_buffer.data,
-                    T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"),
-                    0,
-                    2,
-                    w_buffer.dtype,
-                    0,
-                    dtype="handle",
-                ),
-                T.Cast("int", w_bytes),
-                dtype="int32",
-            )
-        )
-        for n, index_0 in T.grid(size_a, size_w):
-            with T.block("c_buffer"):
-                vn_index, vi_index = T.axis.remap("SR", [n, index_0])
-                T.reads(
-                    a_global_vtcm[vn_index, 0:VRMPY_SIZE_B],
-                    w_global_vtcm[vi_index, 0:VRMPY_SIZE_B],
-                    c_global_vtcm[vn_index, 0:VRMPY_SIZE_INT32],
-                )
-                T.writes(c_global_vtcm[vn_index, 0:VRMPY_SIZE_INT32])
-                with T.init():
-                    for x in T.serial(VRMPY_SIZE_INT32):
-                        c_global_vtcm[vn_index, x] = 0
-                c_global_vtcm[vn_index, T.ramp(0, 1, 32)] += T.call_llvm_intrin(
-                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"),
-                    T.uint32(2),
-                    T.reinterpret(a_global_vtcm[vn_index, T.ramp(0, 1, 128)], dtype="int32x32"),
-                    T.reinterpret(w_global_vtcm[vi_index, T.ramp(0, 1, 128)], dtype="int32x32"),
-                    dtype="int32x32",
-                )
-        T.evaluate(
-            T.tvm_call_packed(
-                "device_api.hexagon.mem_copy_DLTensor",
-                T.tvm_stack_make_array(
-                    c_buffer.data,
-                    T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
-                    0,
-                    2,
-                    c_buffer.dtype,
-                    0,
-                    dtype="handle",
-                ),
-                T.tvm_stack_make_array(
-                    c_global_vtcm.data,
-                    T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"),
-                    0,
-                    2,
-                    c_global_vtcm.dtype,
-                    0,
-                    dtype="handle",
-                ),
-                T.Cast("int", a_bytes),
-                dtype="int32",
-            )
-        )
-
-    sch = tvm.tir.Schedule(operator)
-
-    return sch
-
-
 def get_fake_conv_vtcm_schedule(size_a, size_w, blocks=2):
     """Generate fake conv schedule with VTCM."""
     sch = conv_approximation(size_a, size_w)
@@ -434,16 +316,6 @@ def test_loading_vtcm_for_vrmpy(
             use_async_copy=1,
         )
 
-        sch = get_single_dma_schedule(size_a, size_w)
-        single_dma_runtime = evaluate(
-            hexagon_session,
-            sch,
-            input_a,
-            input_w,
-            np.zeros(expected_output.shape, "int32"),
-            expected_output,
-        )
-
         # Total transfer size is equal to the size of
         # a_buffer + w_buffer + c_buffer which is equal to 2 * size_a * 128 + size_w * 128
         transfer_mb = round((2 * size_a * VRMPY_SIZE_B + size_w * VRMPY_SIZE_B) / 1e6, 2)
@@ -461,7 +333,6 @@ def test_loading_vtcm_for_vrmpy(
             ),
             {
                 "without_vtcm": base_runtime,
-                "synchronous_dma": single_dma_runtime,
                 "base_vtcm": base_vtcm_runtime,
                 "async_dma_input": async_input_runtime,
                 "async_dma_output": async_output_runtime,

From e030b146d9fb9756de553d9c1dcac845768bc7b9 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 16 Nov 2022 12:47:51 +0900
Subject: [PATCH 588/704] [TECompiler] Replace static constant index with
 NameSupply (#13397)

The `static` qualifier on the constant index causes a problem, when `TECompiler` is invoked on the same module multiple times. Since the names of constants become different in each call to `TECompiler`, anything that depends on the consistency of constant names is broken.

In practice, MetaSchedule tuning is affected by this issue, since task extraction and the final `relay.build(...)` both invoke `TECompiler`.  So if we do `get_block("fused_constant_0_global")` during tuning,  trace application after tuning will fail because there would be no longer a block with the name "fused_constant_0_global" .
---
 src/relay/backend/task_extraction.cc          |  7 ++--
 src/relay/backend/te_compiler.cc              | 13 +++++---
 src/relay/backend/te_compiler_cache.cc        | 32 +++++++++++--------
 src/relay/backend/te_compiler_cache.h         | 15 +++++++--
 .../auto_scheduler_layout_rewrite.cc          |  2 +-
 .../meta_schedule_layout_rewrite.cc           |  2 +-
 6 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/src/relay/backend/task_extraction.cc b/src/relay/backend/task_extraction.cc
index 7e66dafe16f5..e7e677938e1a 100644
--- a/src/relay/backend/task_extraction.cc
+++ b/src/relay/backend/task_extraction.cc
@@ -16,6 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include <tvm/ir/name_supply.h>
 #include <tvm/meta_schedule/extracted_task.h>
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
@@ -75,14 +76,16 @@ Array<meta_schedule::ExtractedTask> ExtractTask(IRModule mod, Target target,
 
   std::vector<std::tuple<std::string, Function, IRModule>> lower_results;
 
-  PostOrderVisit(mod->Lookup("main"), [&lower_results, &target, &tir_converter](const Expr& exp) {
+  NameSupply constant_name_supply("");
+
+  PostOrderVisit(mod->Lookup("main"), [&](const Expr& exp) {
     if (exp->IsInstance<FunctionNode>()) {
       Function relay_func = Downcast<Function>(exp);
       if (!relay_func->HasNonzeroAttr(attr::kPrimitive)) {
         return;
       }
       auto [inputs_outputs, constants, fused_name] =
-          tec::LowerTECompute(relay_func, target, /*return_inputs=*/true);
+          tec::LowerTECompute(relay_func, target, constant_name_supply, /*return_inputs=*/true);
 
       if (Optional<tir::PrimFunc> f = tir_converter(inputs_outputs, constants)) {
         IRModule tir_mod = PrimFuncToIRModule(f.value());
diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc
index b4373c6f5f1e..e20e0c94295d 100644
--- a/src/relay/backend/te_compiler.cc
+++ b/src/relay/backend/te_compiler.cc
@@ -135,10 +135,9 @@ TVM_REGISTER_OBJECT_TYPE(TECompilerNode);
 
 class TECompilerImpl : public TECompilerNode {
  public:
-  explicit TECompilerImpl(Optional<IRModule> opt_mod, Optional<String> opt_mod_name) {
-    String mod_name = opt_mod_name.value_or("");
-    NameSupply name_supply = NameSupply(mod_name /* prefix */);
-    global_var_supply_ = GlobalVarSupply(name_supply);
+  explicit TECompilerImpl(Optional<IRModule> opt_mod, Optional<String> opt_mod_name)
+      : global_var_supply_(GlobalVarSupply(NameSupply(opt_mod_name.value_or("")))),
+        constant_name_supply_(NameSupply("")) {
     // Make sure we don't collide with any existing globals in the module.
     if (opt_mod) {
       for (const auto& kv : opt_mod.value()->functions) {
@@ -392,7 +391,8 @@ class TECompilerImpl : public TECompilerNode {
     With<Target> target_scope(key->target);
 
     ICHECK(!value->cached_func.defined());
-    value->cached_func = PrimFuncFor(key->source_func, key->target, global_var_supply);
+    value->cached_func =
+        PrimFuncFor(key->source_func, key->target, global_var_supply, constant_name_supply_);
 
     if (value->cached_func->prim_func.defined()) {
       VLOG(1) << "Lowering PrimFunc";
@@ -523,6 +523,9 @@ class TECompilerImpl : public TECompilerNode {
   std::mutex mutex_;
   /*! \brief internal GlobalVarSupply to get unique GlobalVars  */
   GlobalVarSupply global_var_supply_;
+  /*! \brief A NameSupply object for assigning unique names to constants, across different
+   * invocations of PrimFuncFor. */
+  NameSupply constant_name_supply_;
   /*! \brief internal compiler cache */
   std::unordered_map<CCacheKey, CCacheValue> cache_;
   /*! \brief internal compiler cache for shape funcs */
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index 47a19cbef8fa..d235c17b2a4b 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -20,6 +20,7 @@
 #include "./te_compiler_cache.h"
 
 #include <tvm/driver/driver_api.h>
+#include <tvm/ir/name_supply.h>
 #include <tvm/ir/type_functor.h>
 #include <tvm/meta_schedule/database.h>
 #include <tvm/relay/analysis.h>
@@ -204,8 +205,10 @@ class QnnPatternMatcher {
 // Lowers Relay primitive Function to TE Compute
 class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor>> {
  public:
-  explicit LowerToTECompute(Target target)
-      : target_(target), device_copy_op_(Op::Get("device_copy")) {}
+  LowerToTECompute(Target target, NameSupply constants_name_supply)
+      : target_(target),
+        device_copy_op_(Op::Get("device_copy")),
+        constants_name_supply_(constants_name_supply) {}
 
   Array<te::Tensor> Lower(const Function& relay_func) {
     for (Var param : relay_func->params) {
@@ -280,7 +283,7 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
       std::stringstream ss;
       std::string s = readable_name_stream_.str();
       std::replace(s.begin(), s.end(), '.', '_');
-      ss << s << "_constant_" << const_index++;
+      ss << constants_name_supply_->FreshName(s + "_constant");
       tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype, ss.str());
       constant_tensors_[op] = tensor;
       return {tensor};
@@ -392,15 +395,14 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
 
   tvm::Target target_;
   std::ostringstream readable_name_stream_;
-  // Index of the global constants
-  static int const_index;
   // Cache device copy op for equivalence checking to reduce registry lookup
   // overhead for each invocation of call node when retrieving schedules.
   const Op& device_copy_op_;
+  // A NameSupply object passed from a caller, used to assign unique names to constants
+  // across different invocations of LowerToTECompute.
+  NameSupply constants_name_supply_;
 };
 
-int LowerToTECompute::const_index = 0;
-
 using namespace tvm::tir;
 
 class LayoutFreeConstantCollector : public StmtVisitor {
@@ -478,8 +480,9 @@ class ScheduleBuilder : public ExprVisitor {
     }
   }
 
-  CachedFunc Create(const Function& relay_func, GlobalVarSupply global_var_supply) {
-    LowerToTECompute lower_te_compute(target_);
+  CachedFunc Create(const Function& relay_func, GlobalVarSupply global_var_supply,
+                    NameSupply constant_name_supply) {
+    LowerToTECompute lower_te_compute(target_, constant_name_supply);
     Array<te::Tensor> tensor_outs = lower_te_compute.Lower(relay_func);
     Array<te::Tensor> fn_inputs = lower_te_compute.fn_inputs_;
     VisitExpr(relay_func->body);
@@ -724,8 +727,8 @@ class ScheduleBuilder : public ExprVisitor {
  *  The funcs field in cache is not yet populated.
  */
 CachedFunc PrimFuncFor(const Function& source_func, const Target& target,
-                       GlobalVarSupply global_var_supply) {
-  return ScheduleBuilder(target).Create(source_func, global_var_supply);
+                       GlobalVarSupply global_var_supply, NameSupply constant_name_supply) {
+  return ScheduleBuilder(target).Create(source_func, global_var_supply, constant_name_supply);
 }
 
 // Creates shape function from functor.
@@ -1066,8 +1069,9 @@ CachedFunc ShapeFuncFor(const Function& prim_func, const Target& target,
 }
 
 std::tuple<Array<te::Tensor>, Array<runtime::NDArray>, std::string> LowerTECompute(
-    const Function& source_func, Target target, bool return_inputs) {
-  LowerToTECompute lower_te_compute(target);
+    const Function& source_func, Target target, NameSupply constant_name_supply,
+    bool return_inputs) {
+  LowerToTECompute lower_te_compute(target, constant_name_supply);
   Array<te::Tensor> outputs = lower_te_compute.Lower(source_func);
   // Following ScheduleBuilder, remove placeholder ops from outputs.
   tvm::Array<te::Tensor> tensor_outs;
@@ -1092,7 +1096,7 @@ std::tuple<Array<te::Tensor>, Array<runtime::NDArray>, std::string> LowerTECompu
 
 TVM_REGISTER_GLOBAL("relay.backend.LowerToTE").set_body_typed([](Function prim_func) {
   auto tgt = tvm::Target("ext_dev");
-  LowerToTECompute lower_te_compute(tgt);
+  LowerToTECompute lower_te_compute(tgt, NameSupply(""));
   auto outputs = lower_te_compute.Lower(prim_func);
   return CachedFunc(tgt, GlobalVar(lower_te_compute.candidate_name_), lower_te_compute.fn_inputs_,
                     outputs, te::Schedule(), tir::PrimFunc(), {},
diff --git a/src/relay/backend/te_compiler_cache.h b/src/relay/backend/te_compiler_cache.h
index 95c5bc974181..fcbf10477fdf 100644
--- a/src/relay/backend/te_compiler_cache.h
+++ b/src/relay/backend/te_compiler_cache.h
@@ -215,21 +215,32 @@ Array<IndexExpr> GetShape(const Array<IndexExpr>& shape);
  * \brief Lowers Relay primitive Function to TE Compute
  * \param source_func The primitive function to be lowered.
  * \param target The target we want to create schedule for.
+ * \param constant_name_supply A name supplier for constants.
+ *  across different invocations of this function.
  * \param return_inputs If true, prepend input tensors to the output array of tensors.
  * \return Tuple of the lowered TE compute, constant raw data, and fused function name.
  */
 std::tuple<Array<te::Tensor>, Array<runtime::NDArray>, std::string> LowerTECompute(
-    const Function& source_func, Target target, bool return_inputs = true);
+    const Function& source_func, Target target, NameSupply constant_name_supply,
+    bool return_inputs = true);
 
 /*!
  * \brief Create schedule for target.
  * \param source_func The primitive function to be lowered.
  * \param target The target we want to create schedule for.
+ * \param global_var_supply A name supplier for global variables.
+ * \param constant_name_supply A name supplier for constants.
  * \return Pair of schedule and cache.
  *  The funcs field in cache is not yet populated.
  */
 CachedFunc PrimFuncFor(const Function& source_func, const Target& target,
-                       GlobalVarSupply global_var_supply);
+                       GlobalVarSupply global_var_supply, NameSupply constant_name_supply);
+
+/*! \brief A specialization of PrimFuncFor, meant to be used when the names of constants do not
+ * matter. */
+inline CachedFunc PrimFuncFor(const Function& source_func, const Target& target) {
+  return PrimFuncFor(source_func, target, GlobalVarSupply(NameSupply("")), NameSupply(""));
+}
 
 CachedFunc ShapeFuncFor(const Function& prim_func, const Target& target,
                         GlobalVarSupply global_var_supply);
diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc
index 25111cec8eda..532b25769f87 100644
--- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc
+++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc
@@ -126,7 +126,7 @@ Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) {
       CHECK(f) << "Could not find auto_scheduler.enter_layout_rewrite function.";
       (*f)();
 
-      tec::PrimFuncFor(GetRef<Function>(func), Target::Current(), GlobalVarSupply(NameSupply("")));
+      tec::PrimFuncFor(GetRef<Function>(func), Target::Current());
 
       f = runtime::Registry::Get("auto_scheduler.exit_layout_rewrite");
       CHECK(f) << "Could not find ansor.exit_layout_rewrite function.";
diff --git a/src/relay/transforms/meta_schedule_layout_rewrite.cc b/src/relay/transforms/meta_schedule_layout_rewrite.cc
index 3e7d7f7cb1a1..1ae6a62629dc 100644
--- a/src/relay/transforms/meta_schedule_layout_rewrite.cc
+++ b/src/relay/transforms/meta_schedule_layout_rewrite.cc
@@ -134,7 +134,7 @@ Expr MetaScheduleLayoutRewriter::VisitExpr_(const CallNode* call) {
     if (const auto* func = call->op.as<FunctionNode>()) {
       LayoutIndexQueue* self = LayoutIndexQueue::Global();
       self->queue_.clear();
-      tec::PrimFuncFor(GetRef<Function>(func), Target::Current(), GlobalVarSupply(NameSupply("")));
+      tec::PrimFuncFor(GetRef<Function>(func), Target::Current());
       if (!self->queue_.empty()) {
         std::deque<tir::IndexMap> queue = std::move(self->queue_);
         self->queue_.clear();

From edc1bfd1052a125ef1feaa6db67b814c27be64b5 Mon Sep 17 00:00:00 2001
From: Robert Kimball <bobkimball@gmail.com>
Date: Tue, 15 Nov 2022 19:48:15 -0800
Subject: [PATCH 589/704] Add support for cast double to fp16 (#13395)

__truncdfhf2 can be generated in codegen but the implementation was missing. Added the missing implementation.
---
 include/tvm/runtime/builtin_fp16.h         | 1 +
 src/runtime/builtin_fp16.cc                | 8 ++++++++
 tests/python/topi/python/test_topi_math.py | 4 ++++
 3 files changed, 13 insertions(+)

diff --git a/include/tvm/runtime/builtin_fp16.h b/include/tvm/runtime/builtin_fp16.h
index d4303eba1e4a..5b54583da4ff 100644
--- a/include/tvm/runtime/builtin_fp16.h
+++ b/include/tvm/runtime/builtin_fp16.h
@@ -32,6 +32,7 @@ extern "C" {
 TVM_DLL uint16_t __gnu_f2h_ieee(float);
 TVM_DLL float __gnu_h2f_ieee(uint16_t);
 TVM_DLL uint16_t __truncsfhf2(float v);
+TVM_DLL uint16_t __truncdfhf2(double v);
 TVM_DLL float __extendhfsf2(uint16_t v);
 }
 
diff --git a/src/runtime/builtin_fp16.cc b/src/runtime/builtin_fp16.cc
index d229491a4c7b..64723ffbf5a2 100644
--- a/src/runtime/builtin_fp16.cc
+++ b/src/runtime/builtin_fp16.cc
@@ -37,6 +37,10 @@ TVM_DLL TVM_WEAK float __gnu_h2f_ieee(uint16_t a) {
   return __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(a);
 }
 
+TVM_DLL uint16_t __truncdfhf2(double a) {
+  return __truncXfYf2__<double, uint64_t, 52, uint16_t, uint16_t, 10>(a);
+}
+
 #else
 
 TVM_DLL uint16_t __gnu_f2h_ieee(float a) {
@@ -47,5 +51,9 @@ TVM_DLL float __gnu_h2f_ieee(uint16_t a) {
   return __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(a);
 }
 
+TVM_DLL uint16_t __truncdfhf2(double a) {
+  return __truncXfYf2__<double, uint64_t, 52, uint16_t, uint16_t, 10>(a);
+}
+
 #endif
 }
diff --git a/tests/python/topi/python/test_topi_math.py b/tests/python/topi/python/test_topi_math.py
index 6c3df2671a95..f41923033f14 100644
--- a/tests/python/topi/python/test_topi_math.py
+++ b/tests/python/topi/python/test_topi_math.py
@@ -200,9 +200,13 @@ def test_ewise(target, dev, topi_name, dtype, tolerance, ewise_ref_data):
     ("int32", "float32"),
     ("int32", "float64"),
     ("int32", "bool"),
+    ("float16", "float32"),
+    ("float16", "float64"),
     ("float32", "int32"),
     ("float32", "float64"),
     ("float32", "bool"),
+    ("float64", "float16"),
+    ("float64", "float32"),
     ("bool", "float32"),
     ("bool", "int32"),
 )

From a042a93cd9e13038da2fcdc59d5b245df012eba0 Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Tue, 15 Nov 2022 19:48:42 -0800
Subject: [PATCH 590/704] [TIR] Add test to cover specific case of reducer
 match buffer checking (#13373)

Adding in a test that covers another case that can happen when running a check in reducer.cc [here](https://github.com/apache/tvm/blob/main/src/tir/schedule/analysis/reducer.cc#L590).
---
 ..._transform_lower_cross_thread_reduction.py | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
index 3ab09f01dd01..2bf898e66b08 100644
--- a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
+++ b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py
@@ -548,6 +548,56 @@ def single_reduction_loop_with_tensorize(
                 )
 
 
+@T.prim_func
+def nested_reduction_loop_with_inner_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            for x in T.serial(4):
+                with T.block("C"):
+                    xr = T.axis.reduce(4, x)
+                    with T.init():
+                        for i in T.serial(4):
+                            with T.block("C_init"):
+                                ii = T.axis.spatial(4, i)
+                                T.reads()
+                                T.writes(out[yi, ii])
+                                out[yi, ii] = 0
+                    with T.block("C"):
+                        T.reads(
+                            out[yi, xr],
+                            in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                            in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                        )
+                        T.writes(out[yi, xr])
+                        A = T.match_buffer(
+                            in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                            [4],
+                            dtype="int8",
+                            offset_factor=1,
+                        )
+                        B = T.match_buffer(
+                            in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                            [4],
+                            dtype="int8",
+                            offset_factor=1,
+                        )
+                        C = T.match_buffer(out[yi, xr], [1], dtype="int32", offset_factor=1)
+                        A_i8x4: T.int8x4 = A[0:4]
+                        A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
+                        B_i8x4: T.int8x4 = B[0:4]
+                        B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
+                        C[0] = A_i32 + B_i32 + C[0]
+
+
 @T.prim_func
 def reducer_max(a: T.handle, b: T.handle) -> None:
     A = T.match_buffer(a, [128, 128], dtype="float32")
@@ -1247,6 +1297,13 @@ def test_single_reduction_loop_with_tensorize():
     )
 
 
+def test_nested_reduction_loop_with_inner_match_buffers():
+    _check(
+        nested_reduction_loop_with_inner_match_buffers,
+        nested_reduction_loop_with_inner_match_buffers,
+    )
+
+
 def test_reducer_max():
     _check(reducer_max, lowered_reducer_max)
 

From 6401d0ef62c8f22f187dfb9fde3dfe07f55df206 Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Wed, 16 Nov 2022 13:03:58 +0900
Subject: [PATCH 591/704] [TIR] Fix extern_primfunc buffer order bug  (#13347)

This fixes #13330, which was blocking my work to write TIR schedules for microTVM.

I originally thought I'd have to change the function signature of `DomainTouchedAccessMap`, but I couldn't think of a way to do that cleanly. Instead, I changed `extern_primfunc` to use `primfunc.params` to create the buffer lists in the right order.

#13330 should have been caught by `test_tir_te_extern_primfunc.py`, but one of that test's helper functions had the same bug as `extern_primfunc`. I've thus modified `test_tir_te_extern_primfunc.py` to instantiate the input tensors a different way, allowing it to catch regressions of this issue.
---
 python/tvm/te/operation.py                    | 11 ++--
 .../unittest/test_tir_te_extern_primfunc.py   | 50 ++++---------------
 2 files changed, 15 insertions(+), 46 deletions(-)

diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py
index 5279c46aebc2..846f88d38938 100644
--- a/python/tvm/te/operation.py
+++ b/python/tvm/te/operation.py
@@ -398,11 +398,12 @@ def before_split(a: T.handle, b: T.handle) -> None:
 
         C = te.extern_primfunc([A, B], func)
     """
-    access_map = {
-        k: tuple(v) for k, v in tvm.arith._ffi_api.DomainTouchedAccessMap(primfunc).items()
-    }
-    in_buffers = [buf for buf, access in access_map.items() if len(access[0])]
-    out_buffers = [buf for buf, access in access_map.items() if len(access[1])]
+
+    # dt_access_map and primfunc.buffer_map are unordered, so use order from primfunc.params
+    dt_access_map = tvm.arith._ffi_api.DomainTouchedAccessMap(primfunc)
+    ordered_buffers = [primfunc.buffer_map[param] for param in primfunc.params]
+    in_buffers = [buf for buf in ordered_buffers if len(dt_access_map[buf][0])]
+    out_buffers = [buf for buf in ordered_buffers if len(dt_access_map[buf][1])]
     assert in_buffers, "PrimFunc has no input buffers"
     assert out_buffers, "PrimFunc has no output buffers"
 
diff --git a/tests/python/unittest/test_tir_te_extern_primfunc.py b/tests/python/unittest/test_tir_te_extern_primfunc.py
index 26752145620a..a622f77cc737 100644
--- a/tests/python/unittest/test_tir_te_extern_primfunc.py
+++ b/tests/python/unittest/test_tir_te_extern_primfunc.py
@@ -21,10 +21,8 @@
 
 import tvm
 import tvm.testing
-from tvm import tir, te, TVMError
+from tvm import te
 from tvm.script import tir as T
-from tvm.arith import _ffi_api as _ffi_arith_api
-from tvm.tir.schedule import _ffi_api as _ffi_schedule_api
 
 
 # TODO(csullivan): Additional tests cases needed:
@@ -174,11 +172,11 @@ def verify_func_4(module):
 
 
 class TestPrimFuncs:
-    func, verify = tvm.testing.parameters(
-        [func_1, verify_func_1],
-        [func_2, verify_func_2],
-        [func_3, verify_func_3],
-        [func_4, verify_func_4],
+    func, params, verify = tvm.testing.parameters(
+        [func_1, ("A"), verify_func_1],
+        [func_2, ("C", "D"), verify_func_2],
+        [func_3, ("C", "A", "D", "E"), verify_func_3],
+        [func_4, ("C", "A", "D", "E"), verify_func_4],
     )
 
     def test_primfunc_call(self, func, verify):
@@ -186,11 +184,12 @@ def test_primfunc_call(self, func, verify):
         func = tvm.build(func, target=target)
         verify(func)
 
-    def test_te_extern_call(self, func, verify):
+    def test_te_extern_call(self, func, params, verify):
         ir_mod = tvm.IRModule.from_expr(func.with_attr("global_symbol", "main"))
         prim_func = ir_mod["main"]
 
-        input_tensors = create_input_tensors_for_primfunc(prim_func)
+        buf_name_map = {buf.name: buf for buf in func.buffer_map.values()}
+        input_tensors = [te.placeholder(buf_name_map[name].shape) for name in params]
         output = te.extern_primfunc(input_tensors, prim_func)
         rt_prim_func = te.create_prim_func(tensors_from_extern_op(output, prim_func))
         tvm.ir.assert_structural_equal(tvm.lower(prim_func), tvm.lower(rt_prim_func))
@@ -222,36 +221,5 @@ def tensors_from_extern_op(extern, func):
     return ordered_tensors
 
 
-def create_input_tensors_for_primfunc(primfunc):
-    access_map = {k: tuple(v) for k, v in _ffi_arith_api.DomainTouchedAccessMap(primfunc).items()}
-    in_buffers = [buf for buf, access in access_map.items() if len(access[0])]
-    out_buffers = [buf for buf, access in access_map.items() if len(access[1])]
-    assert in_buffers, "PrimFunc has no input buffers"
-    assert out_buffers, "PrimFunc has no output buffers"
-
-    outputs = []
-    inplace = []
-    inputs = in_buffers
-    for obuf in out_buffers:
-        if obuf in in_buffers:
-            inplace.append(obuf)
-        else:
-            outputs.append(obuf)
-
-    if not outputs:
-        iobuf = inplace.pop()
-        inputs.remove(iobuf)
-        outputs = [iobuf]
-
-    def create_tensors(input_buffers):
-        tensors = []
-        for buf in input_buffers:
-            t = te.placeholder(buf.shape, dtype=buf.dtype, name=buf.name + "_placeholder")
-            tensors.append(t)
-        return tensors
-
-    return create_tensors(inputs)
-
-
 if __name__ == "__main__":
     sys.exit(pytest.main(sys.argv))

From 0d9b1850afa7553719789001aa69c2bd8a8f5651 Mon Sep 17 00:00:00 2001
From: alter-xp <xp56@linux.alibaba.com>
Date: Wed, 16 Nov 2022 14:52:08 +0800
Subject: [PATCH 592/704] [ci] fix GPU other build  (#13366)

Related pr https://github.com/apache/tvm/pull/13235, https://github.com/apache/tvm/issues/12777, https://github.com/apache/tvm/pull/13261

because of ROCM build faild, pr https://github.com/apache/tvm/pull/13235 destroyed CI.
this pr fixed that bug.
---
 src/runtime/rocm/rocm_device_api.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc
index 33d1d6f9d7dc..4e758b7fd977 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -149,7 +149,8 @@ class ROCMDeviceAPI final : public DeviceAPI {
       if (dev_from.device_id == dev_to.device_id) {
         GPUCopy(from, to, size, hipMemcpyDeviceToDevice, hip_stream);
       } else {
-        hipMemcpyPeerAsync(to, dev_to.device_id, from, dev_from.device_id, size, hip_stream);
+        ROCM_CALL(
+            hipMemcpyPeerAsync(to, dev_to.device_id, from, dev_from.device_id, size, hip_stream));
       }
     } else if (dev_from.device_type == kDLROCM && dev_to.device_type == kDLCPU) {
       ROCM_CALL(hipSetDevice(dev_from.device_id));

From 78b53221f8dd8c1d2bbeff9d34803db33ca254dd Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 16 Nov 2022 08:19:34 -0600
Subject: [PATCH 593/704] [TIR] Remove PrimFuncNode::preflattened_buffer_map
 (#10940)

`PrimFuncNode::preflattened_buffer_map` was introduced in
https://github.com/apache/tvm/pull/9727, in order to maintain a record
of the pre-flattened buffer shape until it can be used in
`MakePackedAPI`.  This commit instead maintains the pre-flattened
shapes in `PrimFuncNode::buffer_map`, while the body of the function
uses a flattened buffer alias, as described in
[RFC#70](https://github.com/apache/tvm-rfcs/pull/70)
---
 include/tvm/script/ir_builder/tir/frame.h     |   3 -
 include/tvm/script/ir_builder/tir/ir.h        |  20 --
 include/tvm/tir/function.h                    |  43 +---
 .../backend/contrib/ethosu/tir/passes.py      |  77 ++++---
 python/tvm/script/ir_builder/tir/ir.py        |  69 ------
 .../script/parser_v1/context_maintainer.py    |   3 -
 python/tvm/script/parser_v1/parser.py         |   1 -
 python/tvm/script/parser_v1/tir/__init__.pyi  |  12 --
 .../tvm/script/parser_v1/tir/special_stmt.py  |  73 -------
 python/tvm/tir/function.py                    |   7 -
 src/printer/tir_text_printer.cc               |  10 -
 src/printer/tvmscript_printer.cc              |  20 --
 src/relay/backend/aot/aot_lower_main.cc       |   2 +-
 src/relay/backend/aot_executor_codegen.cc     |   2 +-
 .../backend/contrib/cmsisnn/relay_to_tir.cc   |   2 +-
 .../example_target_hooks/relay_to_tir.cc      |   2 +-
 src/script/ir_builder/tir/frame.cc            |   1 -
 src/script/ir_builder/tir/ir.cc               |  22 --
 src/tir/analysis/device_constraint_utils.cc   |  22 +-
 src/tir/contrib/ethosu/passes.cc              |   6 +-
 src/tir/ir/function.cc                        |  10 +-
 src/tir/transforms/bf16_legalize.cc           |  29 ---
 src/tir/transforms/flatten_buffer.cc          |  16 +-
 src/tir/transforms/legalize_packed_calls.cc   |   4 +-
 src/tir/transforms/make_packed_api.cc         |   4 +-
 .../plan_update_buffer_allocation_location.cc |   6 +-
 src/tir/transforms/storage_flatten.cc         | 201 ++++++++++--------
 src/tir/usmp/transform/assign_pool_info.cc    |   4 +-
 .../convert_pool_allocations_to_offsets.cc    |  10 +-
 src/tir/usmp/transform/create_io_allocates.cc |   5 +-
 .../test_ethosu/test_encode_constants.py      |  33 ++-
 .../test_ethosu/test_hoist_allocates.py       |  31 ++-
 .../test_ethosu/test_merge_constants.py       |  44 ++--
 .../test_ethosu/test_remove_concatenates.py   |   7 +-
 .../test_ethosu/test_replace_conv2d.py        |  48 +++--
 .../contrib/test_ethosu/test_replace_copy.py  |   8 +-
 .../contrib/test_ethosu/test_scheduler.py     |   4 +-
 .../test_hexagon/test_2d_physical_buffers.py  |   2 +-
 .../unittest/test_aot_legalize_packed_call.py |  26 +--
 .../unittest/test_arith_domain_touched.py     |  24 +--
 .../unittest/test_auto_scheduler_feature.py   |  16 +-
 tests/python/unittest/test_lower_build.py     |  36 ++--
 .../test_tir_transform_flatten_buffer.py      |  44 ++--
 .../test_tir_transform_loop_partition.py      |  73 ++++---
 ...tir_transform_renormalize_split_pattern.py |  42 ++--
 .../test_tir_transform_thread_sync.py         |   4 +-
 ...orm_convert_pool_allocations_to_offsets.py |  72 -------
 .../unittest/test_tvmscript_error_report.py   |  20 --
 .../unittest/test_tvmscript_ir_builder_tir.py |   5 -
 .../unittest/test_tvmscript_syntax_sugar.py   |  17 --
 50 files changed, 462 insertions(+), 780 deletions(-)

diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h
index b95d575360e6..ee8032236252 100644
--- a/include/tvm/script/ir_builder/tir/frame.h
+++ b/include/tvm/script/ir_builder/tir/frame.h
@@ -75,8 +75,6 @@ class PrimFuncFrameNode : public TIRFrameNode {
   Optional<Type> ret_type;
   /*! \brief Maps some parameters to specific Buffer data structures. */
   Map<tvm::tir::Var, tvm::tir::Buffer> buffer_map;
-  /*! \brief The buffer map prior to flattening. */
-  Map<tvm::tir::Var, tvm::tir::Buffer> preflattened_buffer_map;
   /*! \brief Additional attributes storing the meta-data */
   Optional<Map<String, ObjectRef>> attrs;
   /*! \brief The variable map bound to thread env. */
@@ -90,7 +88,6 @@ class PrimFuncFrameNode : public TIRFrameNode {
     v->Visit("args", &args);
     v->Visit("ret_type", &ret_type);
     v->Visit("buffer_map", &buffer_map);
-    v->Visit("preflattened_buffer_map", &preflattened_buffer_map);
     v->Visit("attrs", &attrs);
     v->Visit("env_threads", &env_threads);
     v->Visit("root_alloc_buffers", &root_alloc_buffers);
diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h
index d9e1a1b49063..5cba87920580 100644
--- a/include/tvm/script/ir_builder/tir/ir.h
+++ b/include/tvm/script/ir_builder/tir/ir.h
@@ -114,26 +114,6 @@ Buffer MatchBuffer(ObjectRef param, Array<PrimExpr> shape, DataType dtype = Data
                    int align = -1, int offset_factor = 0, String buffer_type = "default",
                    Array<IntImm> axis_separators = {});
 
-/*!
- * \brief The pre-flattened buffer statement.
- * \param postflattened_buffer The original buffer to be flattened.
- * \param shape The type of the buffer prior to flattening.
- * \param dtype The data type in the content of the buffer.
- * \param data The pointer to the head of the data.
- * \param strides The strides of each dimension.
- * \param elem_offset The offset in terms of number of dtype elements (including lanes).
- * \param storage_scope The optional storage scope of buffer data pointer.
- * \param align The alignment requirement of data pointer in bytes.
- * \param offset_factor The factor of elem_offset field.
- * \param buffer_type The buffer type.
- * \param axis_separators The separators between input axes when generating flattened output axes.
- */
-void PreflattenedBuffer(Buffer postflattened_buffer, Array<PrimExpr> shape,
-                        DataType dtype = DataType::Float(32), Optional<Var> data = NullOpt,
-                        Array<PrimExpr> strides = {}, PrimExpr elem_offset = PrimExpr(),
-                        String storage_scope = "global", int align = -1, int offset_factor = 0,
-                        String buffer_type = "default", Array<IntImm> axis_separators = {});
-
 /*!
  * \brief The block declaration statement.
  * \param name The name of the block.
diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h
index d793d84fc677..cf92f97360b1 100644
--- a/include/tvm/tir/function.h
+++ b/include/tvm/tir/function.h
@@ -88,33 +88,22 @@ class PrimFuncNode : public BaseFuncNode {
    *  While we could have express parameter unpacking and constraint using
    *  normal statements, making buffer_map as first class citizen of PrimFunc
    *  will make program analysis much easier.
-   */
-  Map<tir::Var, Buffer> buffer_map;
-
-  /*! \brief The buffer map prior to flattening.
-   *
-   * This contains the buffers as they exists prior to flattening, and
-   * is used for validating an input tensor passed into the packed
-   * API.  Any buffer that is present in `buffer_map` but not present
-   * in `preflattened_buffer_map` is assumed to be the same before
-   * and after flattening (e.g. a 1-d tensor that is backed by 1-d
-   * flat memory).
    *
-   * TODO(Lunderberg): Remove preflattened_buffer_map, and instead
-   * declare each flattened buffer as aliasing the original tensor
-   * shape.  This should include improving the StmtExprMutator to
-   * provide easier interactions with Buffer objects, so that the
-   * bookkeeping of relationships between buffers doesn't need to be
-   * repeated across several transforms.
+   *  Prior to buffer flattening, which is performed either in
+   *  StorageFlatten for TE-based schedules or in FlattenBuffer for
+   *  TIR-based schedules, these buffer objects are used directly in
+   *  the body of the function.  After buffer flattening, these buffer
+   *  objects remain unflattened for use in argument validation, but
+   *  all usage in the body of the function is done through a
+   *  flattened alias of the buffer.
    */
-  Map<tir::Var, Buffer> preflattened_buffer_map;
+  Map<tir::Var, Buffer> buffer_map;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     v->Visit("params", &params);
     v->Visit("body", &body);
     v->Visit("ret_type", &ret_type);
     v->Visit("buffer_map", &buffer_map);
-    v->Visit("preflattened_buffer_map", &preflattened_buffer_map);
     v->Visit("attrs", &attrs);
     v->Visit("span", &span);
     v->Visit("_checked_type_", &checked_type_);
@@ -123,7 +112,6 @@ class PrimFuncNode : public BaseFuncNode {
   bool SEqualReduce(const PrimFuncNode* other, SEqualReducer equal) const {
     // visit params and buffer_map first as they contains defs.
     return equal.DefEqual(params, other->params) && equal(buffer_map, other->buffer_map) &&
-           equal(preflattened_buffer_map, other->preflattened_buffer_map) &&
            equal(ret_type, other->ret_type) && equal(body, other->body) &&
            equal(attrs, other->attrs);
   }
@@ -131,7 +119,6 @@ class PrimFuncNode : public BaseFuncNode {
   void SHashReduce(SHashReducer hash_reduce) const {
     hash_reduce.DefHash(params);
     hash_reduce(buffer_map);
-    hash_reduce(preflattened_buffer_map);
     hash_reduce(ret_type);
     hash_reduce(body);
     hash_reduce(attrs);
@@ -169,21 +156,13 @@ class PrimFunc : public BaseFunc {
    * PrimFunc.  (e.g. a buffer of shape ``[1024]`` originally
    * generated as a tensor of shape ``[32, 32]``)
    *
-   * \param preflattened_buffer_map The buffer map for
-   * parameter buffer unpacking.  This contains buffer
-   * objects as they are expected to be passed in by the
-   * callee.  (e.g. a buffer of shape ``[32, 32]`` originally
-   * generated as a tensor of shape ``[32, 32]``)
-   *
    * \param attrs Additional function attributes.
    *
    * \param span The location of this object in the source code.
    */
-  TVM_DLL PrimFunc(
-      Array<tir::Var> params, Stmt body, Type ret_type = VoidType(),
-      Map<tir::Var, Buffer> buffer_map = Map<tir::Var, Buffer>(),
-      Optional<Map<tir::Var, Buffer>> preflattened_buffer_map = Optional<Map<tir::Var, Buffer>>(),
-      DictAttrs attrs = NullValue<DictAttrs>(), Span span = Span());
+  TVM_DLL PrimFunc(Array<tir::Var> params, Stmt body, Type ret_type = VoidType(),
+                   Map<tir::Var, Buffer> buffer_map = Map<tir::Var, Buffer>(),
+                   DictAttrs attrs = NullValue<DictAttrs>(), Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(PrimFunc, BaseFunc, PrimFuncNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(PrimFuncNode);
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
index cc94c6e816cd..e15d126dd969 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
@@ -299,7 +299,6 @@ def _ftransform(f, mod, ctx):
             new_body,
             f.ret_type,
             new_buffer_map,
-            f.preflattened_buffer_map,
             f.attrs,
             f.span,
         )
@@ -327,7 +326,7 @@ def EncodeConstants(const_dict):
     """
     new_const_dict = {}
 
-    def collect_encoding_definitions(stmt, old_buffer_to_const):
+    def collect_encoding_definitions(stmt, old_buffer_var_to_const):
         # Map from copy destination to copy source.
         copy_map = {}
         # List of buffer copies that occurred
@@ -376,7 +375,7 @@ def _declare_constant_buffer(old_buffer, encoded_constants, split_idx):
         def _encode_weights_or_bias(buffer1, buffer2, stmt, encode_func):
             """Encode the weights or align the bias either for one or two cores,
             depending on the variant."""
-            constant = old_buffer_to_const[buffer1]
+            constant = old_buffer_var_to_const[buffer1.data]
 
             # If we have just one core, encode the whole constant
             if buffer2 is None:
@@ -471,7 +470,12 @@ def _visit(stmt):
         }
 
     def transform_stmt(
-        stmt, buf_remap, var_remap, pointer_to_buffer, new_buffer_to_const, new_buffer_to_split_idx
+        stmt,
+        buf_remap,
+        var_remap,
+        pointer_to_buffer,
+        new_buffer_var_to_const,
+        new_buffer_to_split_idx,
     ):
         def _visit_rewrite(stmt):
             if isinstance(stmt, tvm.tir.Call):
@@ -485,7 +489,7 @@ def _visit_rewrite(stmt):
                     # encoded buffer, the current should be a length.
                     if (
                         isinstance(prev_arg, tvm.tir.BufferLoad)
-                        and prev_arg.buffer in new_buffer_to_const
+                        and prev_arg.buffer.data in new_buffer_var_to_const
                     ):
                         buffer_size = np.prod(list(prev_arg.buffer.shape))
                         arg = buffer_size
@@ -554,28 +558,56 @@ def _visit_rewrite(stmt):
             ["tir.Call", "tir.Allocate", "tir.BufferLoad", "tir.AttrStmt"],
         )
 
+    def _collect_parameter_buffer_aliases(prim_func):
+        buffer_vars = {}
+        for param in prim_func.params:
+            if param in prim_func.buffer_map:
+                buf = prim_func.buffer_map[param]
+                buffer_vars[buf.data] = {buf}
+
+        def visit(node):
+            if isinstance(node, (tvm.tir.BufferStore, tvm.tir.BufferLoad, tvm.tir.DeclBuffer)):
+                buf = node.buffer
+                if buf.data in buffer_vars:
+                    buffer_vars[buf.data].add(buf)
+
+        tvm.tir.stmt_functor.post_order_visit(prim_func.body, visit)
+        return buffer_vars
+
     def _ftransform(f, mod, ctx):
+        param_buffer_var_usage = _collect_parameter_buffer_aliases(f)
+
         # Step 0: Unpack the constant dictionary in terms of the
         # functions buffers.
-        old_buffer_to_const = {}
+        old_buffer_var_to_const = {}
         for i, param in enumerate(f.params):
             if i in const_dict:
-                old_buffer_to_const[f.buffer_map[param]] = const_dict[i]
+                old_buffer_var_to_const[f.buffer_map[param].data] = const_dict[i]
 
         # Step 1: Collect information on the buffers that will be
         # replaced by encodings.
-        buffer_information = collect_encoding_definitions(f.body, old_buffer_to_const)
+        buffer_information = collect_encoding_definitions(f.body, old_buffer_var_to_const)
 
         # Step 2: Generate variable/buffer remaps, based on the
         # collected information.
         buf_remap = {}
-        new_buffer_to_const = {}
+        new_buffer_var_to_const = {}
         new_buffer_to_split_idx = {}
 
+        def define_remap(old_buf, new_buf):
+            try:
+                old_buffers = param_buffer_var_usage[old_buf.data]
+            except KeyError:
+                old_buffers = [old_buf]
+
+            for old_buffer in old_buffers:
+                buf_remap[old_buffer] = new_buf
+
         # Any encoded buffers must be replaced
         for info in buffer_information["constant_buffer_replacements"]:
-            buf_remap[info["old_buffer"]] = info["new_buffer"]
-            new_buffer_to_const[info["new_buffer"]] = info["encoded_constants"]
+            define_remap(info["old_buffer"], info["new_buffer"])
+
+            new_buffer_var_to_const[info["new_buffer"].data] = info["encoded_constants"]
 
             if info["split_idx"]:
                 new_buffer_to_split_idx[info["new_buffer"]] = info["split_idx"]
@@ -596,9 +628,11 @@ def _ftransform(f, mod, ctx):
                     name=copy_dest.name,
                     scope=copy_dest.scope(),
                 )
-                buf_remap[copy_dest] = new_dest
-                if copy_source in new_buffer_to_const:
-                    new_buffer_to_const[new_dest] = new_buffer_to_const[copy_source]
+                define_remap(copy_dest, new_dest)
+                if copy_source.data in new_buffer_var_to_const:
+                    new_buffer_var_to_const[new_dest.data] = new_buffer_var_to_const[
+                        copy_source.data
+                    ]
 
                 if copy_source in new_buffer_to_split_idx:
                     new_buffer_to_split_idx[new_dest] = new_buffer_to_split_idx[copy_source]
@@ -615,7 +649,7 @@ def _ftransform(f, mod, ctx):
             buf_remap,
             var_remap,
             pointer_to_buffer,
-            new_buffer_to_const,
+            new_buffer_var_to_const,
             new_buffer_to_split_idx,
         )
 
@@ -626,10 +660,10 @@ def _ftransform(f, mod, ctx):
             if buffer in buf_remap:
                 buffer = buf_remap[buffer]
 
-            if buffer in new_buffer_to_const:
-                new_const_dict[i] = new_buffer_to_const[buffer].flatten()
-            elif buffer in old_buffer_to_const:
-                new_const_dict[i] = old_buffer_to_const[buffer].flatten()
+            if buffer.data in new_buffer_var_to_const:
+                new_const_dict[i] = new_buffer_var_to_const[buffer.data].flatten()
+            elif buffer.data in old_buffer_var_to_const:
+                new_const_dict[i] = old_buffer_var_to_const[buffer.data].flatten()
 
             new_buffer_map[param] = buffer
 
@@ -638,7 +672,6 @@ def _ftransform(f, mod, ctx):
             new_body,
             f.ret_type,
             new_buffer_map,
-            f.preflattened_buffer_map,
             f.attrs,
             f.span,
         )
@@ -873,7 +906,6 @@ def CreatePrimFuncWithoutConstants(const_dict):
     def _ftransform(f, mod, ctx):
         new_params = list()
         new_buffer_map = dict()
-        new_preflattened_buffer_map = dict()
         for param_idx in const_dict.keys():
             # We are using buffer_var to key the constants as
             # PrimFunc params of constants will be removed.
@@ -882,14 +914,11 @@ def _ftransform(f, mod, ctx):
             if i not in const_dict.keys():
                 new_params.append(param)
                 new_buffer_map[param] = f.buffer_map[param]
-                if param in f.preflattened_buffer_map:
-                    new_preflattened_buffer_map[param] = f.preflattened_buffer_map[param]
         return tvm.tir.PrimFunc(
             new_params,
             f.body,
             f.ret_type,
             new_buffer_map,
-            new_preflattened_buffer_map,
             f.attrs,
             f.span,
         )
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index 0678925e2f7c..842e21378fd1 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -314,74 +314,6 @@ def match_buffer(
     )
 
 
-def preflattened_buffer(
-    postflattened: Buffer,
-    shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral],
-    dtype: str = "float32",
-    data: Var = None,
-    strides: List[PrimExpr] = None,
-    elem_offset: PrimExpr = None,
-    scope: str = "global",
-    align: int = -1,
-    offset_factor: int = 0,
-    buffer_type: str = "default",
-    axis_separators: List[int] = None,
-) -> None:
-    """The pre-flattened buffer statement.
-
-    Parameters
-    ----------
-    postflattened : Buffer
-        The original buffer to be flattened.
-
-    shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral]
-        The type of the buffer prior to flattening.
-
-    dtype : str
-        The data type in the content of the buffer.
-
-    data : Var
-        The pointer to the head of the data.
-
-    strides : List[PrimExpr]
-        The strides of each dimension.
-
-    elem_offset : PrimExpr
-        The offset in terms of number of dtype elements (including lanes).
-
-    scope : str
-        The optional storage scope of buffer data pointer.
-
-    align : int
-        The alignment requirement of data pointer in bytes.
-
-    offset_factor : int
-        The factor of elem_offset field.
-
-    buffer_type : str
-        The buffer type.
-
-    axis_separators : List[int]
-        The separators between input axes when generating flattened output axes.
-    """
-    shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape
-    if strides is None:
-        strides = []
-    _ffi_api.PreflattenedBuffer(  # type: ignore[attr-defined] # pylint: disable=no-member
-        postflattened,
-        shape,
-        dtype,
-        data,
-        strides,
-        elem_offset,
-        scope,
-        align,
-        offset_factor,
-        buffer_type,
-        axis_separators,
-    )
-
-
 def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame:
     """The block declaration statement.
 
@@ -1697,7 +1629,6 @@ def f():
     "func_attr",
     "func_ret",
     "match_buffer",
-    "preflattened_buffer",
     "block",
     "init",
     "where",
diff --git a/python/tvm/script/parser_v1/context_maintainer.py b/python/tvm/script/parser_v1/context_maintainer.py
index f7f16855c752..b84b7d398084 100644
--- a/python/tvm/script/parser_v1/context_maintainer.py
+++ b/python/tvm/script/parser_v1/context_maintainer.py
@@ -129,8 +129,6 @@ class ContextMaintainer:
     """List[Var]: The function parameters"""
     func_buffer_map: Mapping[Var, Buffer] = {}
     """Mapping[Var, Buffer]: The function buffer map"""
-    func_preflattened_buffer_map: Mapping[Var, Buffer] = {}
-    """Mapping[Var, Buffer]: The function buffer map, prior to any flattening."""
     func_dict_attr: Mapping[str, Object] = {}
     """Mapping[str, Object]: The function attrs"""
     func_var_env_dict: Mapping[Var, str] = {}
@@ -160,7 +158,6 @@ def __init__(
         # function context
         self.func_params = []
         self.func_buffer_map = {}
-        self.func_preflattened_buffer_map = {}
         self.func_dict_attr = {}
         self.func_var_env_dict = {}
         # parser and analyzer
diff --git a/python/tvm/script/parser_v1/parser.py b/python/tvm/script/parser_v1/parser.py
index c34aae23453c..ce8c1fe161a3 100644
--- a/python/tvm/script/parser_v1/parser.py
+++ b/python/tvm/script/parser_v1/parser.py
@@ -501,7 +501,6 @@ def check_decorator(decorators: List[ast.Expr]) -> bool:
             body,
             ret_type,
             buffer_map=self.context.func_buffer_map,
-            preflattened_buffer_map=self.context.func_preflattened_buffer_map,
             attrs=tvm.ir.make_node("DictAttrs", **dict_attr) if dict_attr else None,
             span=tvm_span_from_synr(node.span),
         )
diff --git a/python/tvm/script/parser_v1/tir/__init__.pyi b/python/tvm/script/parser_v1/tir/__init__.pyi
index a64eed055ae8..beefaf4c75d7 100644
--- a/python/tvm/script/parser_v1/tir/__init__.pyi
+++ b/python/tvm/script/parser_v1/tir/__init__.pyi
@@ -117,18 +117,6 @@ def store(
 ) -> None: ...
 def comm_reducer(lambda_io: Callable[[Any, Any], Any], identities: List[PrimExpr]) -> PrimExpr: ...
 def llvm_lookup_intrinsic_id(name: str) -> PrimExpr: ...
-def preflattened_buffer(
-    buf: Buffer,
-    shape: Sequence[PrimExpr],
-    dtype: str = "float32",
-    data: Optional[Ptr] = None,
-    strides: Optional[Sequence[int]] = None,
-    elem_offset: Optional[int] = None,
-    scope: str = "global",
-    align: int = -1,
-    offset_factor: int = 0,
-    buffer_type: str = "default",
-) -> Buffer: ...
 
 """
 Intrinsics - tvm builtin
diff --git a/python/tvm/script/parser_v1/tir/special_stmt.py b/python/tvm/script/parser_v1/tir/special_stmt.py
index 7cbf47441053..f558eb6b7f73 100644
--- a/python/tvm/script/parser_v1/tir/special_stmt.py
+++ b/python/tvm/script/parser_v1/tir/special_stmt.py
@@ -904,79 +904,6 @@ def func_attr(dict_attr, span):
         super().__init__(func_attr, def_symbol=False)
 
 
-@register
-class PreflattenedBufferMap(SpecialStmt):
-    """Special Stmt for declaring the PrimFunc::preflattened_buffer_map
-
-    Example
-    -------
-    .. code-block:: python
-         A0 = T.match_buffer(A, (48,), dtype="float32")
-         T.preflattened_buffer_map(A, (1, 4, 4, 3), elem_offset=1, align=4, dtype="float32")
-    """
-
-    def __init__(self):
-        def preflattened_buffer(
-            postflattened,
-            shape,
-            dtype="float32",
-            data=None,
-            strides=None,
-            elem_offset=None,
-            scope="global",
-            align=-1,
-            offset_factor=0,
-            buffer_type="default",
-            span=None,
-        ):
-
-            param = None
-            for key, value in self.context.func_buffer_map.items():
-                if value.same_as(postflattened):
-                    param = key
-                    break
-
-            assert (
-                param is not None
-            ), f"Post-flatten buffer {postflattened.name} does not appear in the buffer map."
-
-            if data is None:
-                data = self.context.func_buffer_map[param].data
-
-            buffer_name: str = f"{postflattened.name}_preflatten"
-            if align != -1:
-                if isinstance(align, IntImm):
-                    align = align.value
-                else:
-                    assert isinstance(align, int), f"align: want int or IntImm, got {align!r}"
-
-            if offset_factor != 0:
-                if isinstance(offset_factor, IntImm):
-                    offset_factor = offset_factor.value
-                else:
-                    assert isinstance(
-                        offset_factor, int
-                    ), f"offset_factor: want int or IntImm, got {offset_factor!r}"
-
-            preflattened = tvm.tir.decl_buffer(
-                shape,
-                dtype,
-                buffer_name,
-                data,
-                strides,
-                elem_offset,
-                scope,
-                align,
-                offset_factor,
-                buffer_type,
-                span=span,
-            )
-
-            self.context.func_preflattened_buffer_map[param] = preflattened
-
-        super().__init__(preflattened_buffer, def_symbol=False)
-
-
 @register
 class TargetAttrValue(SpecialStmt):
     """Special Stmt for target attr value.
diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py
index 4628ae36265f..c5cc922a3e48 100644
--- a/python/tvm/tir/function.py
+++ b/python/tvm/tir/function.py
@@ -49,9 +49,6 @@ class PrimFunc(BaseFunc):
     buffer_map : Map[tvm.tir.Var, tvm.tir.Buffer]
         The buffer binding map.
 
-    preflattened_buffer_map : Optional[Map[tvm.tir.Var, tvm.tir.Buffer]]
-        The buffer binding map, prior to any flattening.
-
     attrs: Optional[tvm.Attrs]
         Attributes of the function, can be None
 
@@ -65,14 +62,12 @@ def __init__(
         body,
         ret_type=None,
         buffer_map=None,
-        preflattened_buffer_map=None,
         attrs=None,
         span=None,
     ):
 
         param_list = []
         buffer_map = {} if buffer_map is None else buffer_map
-        preflattened_buffer_map = {} if preflattened_buffer_map is None else preflattened_buffer_map
         for x in params:
             x = tvm.runtime.convert(x) if not isinstance(x, Object) else x
             if isinstance(x, Buffer):
@@ -90,7 +85,6 @@ def __init__(
             body,
             ret_type,
             buffer_map,
-            preflattened_buffer_map,
             attrs,
             span,
         )  # type: ignore
@@ -116,7 +110,6 @@ def with_body(self, new_body, span=None):
             new_body,
             self.ret_type,
             self.buffer_map,
-            self.preflattened_buffer_map,
             self.attrs,
             span,
         )
diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc
index e50559ac10ff..fc3f49d76fae 100644
--- a/src/printer/tir_text_printer.cc
+++ b/src/printer/tir_text_printer.cc
@@ -152,16 +152,6 @@ Doc TIRTextPrinter::PrintPrimFunc(const PrimFunc& prim_func) {
         2, Doc::NewLine() << "buffer_map = {" << PrintSep(buffer_map_doc, Doc::Text(", ")) << "}");
   }
 
-  if (op->preflattened_buffer_map.size() != 0) {
-    // print preflattened_buffer_map
-    std::vector<Doc> preflattened_buffer_map_doc;
-    for (auto& v : op->preflattened_buffer_map) {
-      preflattened_buffer_map_doc.push_back(Print(v.first) << ": " << Print(v.second));
-    }
-    doc << Doc::Indent(2, Doc::NewLine()
-                              << "preflattened_buffer_map = {"
-                              << PrintSep(preflattened_buffer_map_doc, Doc::Text(", ")) << "}");
-  }
   doc << PrintBody(op->body);
   return doc;
 }
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index f1d68ee43845..0dc6240bc6ca 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -1672,26 +1672,6 @@ Doc TVMScriptPrinter::PrintPrimFunc(const PrimFunc& primFunc) {
     body << Print((*it).first) << ", " << memo_buf_decl_[buf];
     body << ")" << Doc::NewLine();
   }
-  // print preflattened buffer map
-  for (const auto& param : op->params) {
-    auto pf_buf_it = op->preflattened_buffer_map.find(param);
-    if (pf_buf_it != op->preflattened_buffer_map.end()) {
-      const Buffer& preflattened = (*pf_buf_it).second;
-
-      auto buf_it = op->buffer_map.find(param);
-      ICHECK(buf_it != op->buffer_map.end()) << "Found pre-flattened buffer " << preflattened->name
-                                             << " with no corresponding post-flatten buffer.";
-      const Buffer& postflattened = (*buf_it).second;
-
-      // Call Print() without assigning in order to fill memo_buf_decl_.
-      Print(preflattened);
-      buf_not_in_headers_.insert(preflattened.get());
-      ICHECK(memo_buf_decl_.count(preflattened));
-
-      body << tir_prefix_ << ".preflattened_buffer(" << Print(postflattened) << ", "
-           << memo_buf_decl_.at(preflattened) << ")" << Doc::NewLine();
-    }
-  }
   // print body
   body << "# body" << Doc::NewLine();
 
diff --git a/src/relay/backend/aot/aot_lower_main.cc b/src/relay/backend/aot/aot_lower_main.cc
index 82393c535c43..2a4dfb84ddcf 100644
--- a/src/relay/backend/aot/aot_lower_main.cc
+++ b/src/relay/backend/aot/aot_lower_main.cc
@@ -504,7 +504,7 @@ class AOTMainLowerer : public MixedModeVisitor {
     tir::Stmt final_body = tir::SeqStmt({device_activations, body, device_deactivations});
 
     // Make the PrimFunc
-    return tir::PrimFunc(main_signature_, final_body, VoidType(), main_buffer_map_, {},
+    return tir::PrimFunc(main_signature_, final_body, VoidType(), main_buffer_map_,
                          DictAttrs(dict_attrs));
   }
 
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 786b3f81a5ae..3c0ab7c16f23 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -803,7 +803,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     tir::Stmt final_body = tir::SeqStmt({device_activations, body, device_deactivations});
 
     // Make the PrimFunc
-    return tir::PrimFunc(main_signature_, final_body, VoidType(), main_buffer_map_, {},
+    return tir::PrimFunc(main_signature_, final_body, VoidType(), main_buffer_map_,
                          DictAttrs(dict_attrs));
   }
 
diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
index da51e6b762dd..1ea020e884de 100644
--- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
+++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
@@ -108,7 +108,7 @@ class RelayToTIRVisitor : public MixedModeMutator {
     }
 
     tir::PrimFunc replacement_func(func_signature, body, VoidType(), buffer_map,
-                                   Map<tir::Var, tir::Buffer>(), DictAttrs(dict_attrs));
+                                   DictAttrs(dict_attrs));
     ir_module_->Add(global_var, replacement_func);
   }
 
diff --git a/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc b/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc
index eb6cf1cce420..ad2b06695cc1 100644
--- a/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc
+++ b/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc
@@ -152,7 +152,7 @@ class ConvertAddToSubtract : public MixedModeMutator {
       };
 
       tir::PrimFunc replacement_func = tir::PrimFunc({x_var, y_var, out_var}, math_loop, VoidType(),
-                                                     buffer_map, {}, DictAttrs(dict_attrs));
+                                                     buffer_map, DictAttrs(dict_attrs));
 
       // Switch to TIRToRuntime hook for testing
       Bool tir_to_runtime = func->GetAttr<Bool>("tir_to_runtime").value_or(Bool(false));
diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc
index f48ee52506b4..1e63201a40dd 100644
--- a/src/script/ir_builder/tir/frame.cc
+++ b/src/script/ir_builder/tir/frame.cc
@@ -34,7 +34,6 @@ void PrimFuncFrameNode::ExitWithScope() {
       /*body=*/AsStmt(stmts),
       /*ret_type=*/ret_type.value_or(TupleType::Empty()),
       /*buffer_map=*/buffer_map,
-      /*preflattened_buffer_map=*/preflattened_buffer_map,
       /*attrs=*/attrs.defined() ? DictAttrs(attrs.value()) : NullValue<DictAttrs>());
   func = tvm::tir::ScriptComplete(func, root_alloc_buffers);
   IRBuilder builder = IRBuilder::Current();
diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc
index 78107136d492..822e8e468377 100644
--- a/src/script/ir_builder/tir/ir.cc
+++ b/src/script/ir_builder/tir/ir.cc
@@ -58,7 +58,6 @@ PrimFuncFrame PrimFunc() {
   n->args.clear();
   n->ret_type = NullOpt;
   n->buffer_map.clear();
-  n->preflattened_buffer_map.clear();
   n->attrs = NullOpt;
   n->env_threads.clear();
   n->root_alloc_buffers.clear();
@@ -137,26 +136,6 @@ Buffer MatchBuffer(ObjectRef param, Array<PrimExpr> shape, DataType dtype, Optio
   return buffer;
 }
 
-void PreflattenedBuffer(Buffer postflattened_buffer, Array<PrimExpr> shape, DataType dtype,
-                        Optional<Var> data, Array<PrimExpr> strides, PrimExpr elem_offset,
-                        String storage_scope, int align, int offset_factor, String buffer_type_str,
-                        Array<IntImm> axis_separators) {
-  PrimFuncFrame frame = FindPrimFuncFrame("T.preflattened_buffer");
-  for (auto const& p : frame->buffer_map) {
-    if (p.second.same_as(postflattened_buffer)) {
-      String buffer_name(postflattened_buffer->name + "_preflatten");
-      Buffer buffer =
-          BufferDecl(shape, dtype, buffer_name, data.value_or(p.second->data), strides, elem_offset,
-                     storage_scope, align, offset_factor, buffer_type_str, axis_separators);
-      details::Namer::Name(buffer, buffer_name);
-      frame->preflattened_buffer_map.Set(p.first, buffer);
-      return;
-    }
-  }
-  LOG(FATAL) << "ValueError: postflattened buffer " << postflattened_buffer->name
-             << " does not exist.";
-}
-
 BlockFrame Block(String name, bool no_realize) {
   ObjectPtr<BlockFrameNode> n = make_object<BlockFrameNode>();
   n->name = name;
@@ -595,7 +574,6 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.FuncName").set_body_typed(FuncName);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.FuncAttrs").set_body_typed(FuncAttrs);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.FuncRet").set_body_typed(FuncRet);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.MatchBuffer").set_body_typed(MatchBuffer);
-TVM_REGISTER_GLOBAL("script.ir_builder.tir.PreflattenedBuffer").set_body_typed(PreflattenedBuffer);
 
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block);
 TVM_REGISTER_GLOBAL("script.ir_builder.tir.Init").set_body_typed(Init);
diff --git a/src/tir/analysis/device_constraint_utils.cc b/src/tir/analysis/device_constraint_utils.cc
index 32b59ce54b69..d0933e0691dd 100644
--- a/src/tir/analysis/device_constraint_utils.cc
+++ b/src/tir/analysis/device_constraint_utils.cc
@@ -210,8 +210,6 @@ class ApplyDeviceConstraintsMutator : public StmtExprMutator {
 
     // Start with a copy of the current prim_func buffer map.
     Map<Var, Buffer> new_buffer_map(prim_func->buffer_map.begin(), prim_func->buffer_map.end());
-    Map<Var, Buffer> new_preflattened_buffer_map(prim_func->preflattened_buffer_map.begin(),
-                                                 prim_func->preflattened_buffer_map.end());
     bool any_change = false;
 
     // For each constrained parameter...
@@ -225,23 +223,6 @@ class ApplyDeviceConstraintsMutator : public StmtExprMutator {
         any_change = true;
       }
       new_buffer_map.Set(param, new_buffer);
-
-      // Rewrite the pre-flattened buffers to account for constraint.
-      // This only has an impact if the IRModule being analyzed has
-      // already been run through the StorageFlatten or FlattenBuffer
-      // passes.
-      if (auto opt = prim_func->preflattened_buffer_map.Get(param)) {
-        Buffer pf_buffer = opt.value();
-        if (pf_buffer.same_as(buffer)) {
-          new_preflattened_buffer_map.Set(param, new_buffer);
-        } else {
-          const Buffer new_buffer = RewriteBuffer(pf_buffer, virtual_device);
-          if (!new_buffer.same_as(pf_buffer)) {
-            any_change = true;
-          }
-          new_preflattened_buffer_map.Set(param, new_buffer);
-        }
-      }
     }
     // Make sure we have accounted for all prim_func parameters.
     CheckNoRemainingPointerParams(prim_func, &current_primfunc_param_index);
@@ -259,8 +240,7 @@ class ApplyDeviceConstraintsMutator : public StmtExprMutator {
 
     if (any_change) {
       return PrimFunc(prim_func->params, std::move(new_body), prim_func->ret_type,
-                      std::move(new_buffer_map), std::move(new_preflattened_buffer_map),
-                      prim_func->attrs, prim_func->span);
+                      std::move(new_buffer_map), prim_func->attrs, prim_func->span);
     } else {
       return prim_func;
     }
diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc
index d51ffbf833a4..369c4adc8536 100644
--- a/src/tir/contrib/ethosu/passes.cc
+++ b/src/tir/contrib/ethosu/passes.cc
@@ -152,9 +152,8 @@ class HoistAllocatesMutator : public StmtExprMutator {
                    current_alloc->span);
     }
 
-    PrimFunc new_main_func =
-        PrimFunc(main_func->params, new_main_func_body, main_func->ret_type, main_func->buffer_map,
-                 main_func->preflattened_buffer_map, main_func->attrs);
+    PrimFunc new_main_func = PrimFunc(main_func->params, new_main_func_body, main_func->ret_type,
+                                      main_func->buffer_map, main_func->attrs);
     return new_main_func;
   }
 
@@ -523,7 +522,6 @@ class MergeConstantsMutator : public StmtExprMutator {
     prim_func_node->body = std::move(new_body);
     prim_func_node->buffer_map = std::move(new_buffer_map);
     prim_func_node->params = std::move(new_params);
-    prim_func_node->preflattened_buffer_map = {};
     PrimFunc f{GetRef<PrimFunc>(prim_func_node)};
 
     // Add the new const dict as an attribute
diff --git a/src/tir/ir/function.cc b/src/tir/ir/function.cc
index c609ad158e34..d4802e287693 100644
--- a/src/tir/ir/function.cc
+++ b/src/tir/ir/function.cc
@@ -29,9 +29,7 @@ namespace tvm {
 namespace tir {
 // Get the function type of a PrimFunc
 PrimFunc::PrimFunc(Array<tir::Var> params, Stmt body, Type ret_type,
-                   Map<tir::Var, Buffer> buffer_map,
-                   Optional<Map<tir::Var, Buffer>> preflattened_buffer_map, DictAttrs attrs,
-                   Span span) {
+                   Map<tir::Var, Buffer> buffer_map, DictAttrs attrs, Span span) {
   // Assume void-return type for now
   // TODO(tvm-team) consider type deduction from body.
   if (!ret_type.defined()) {
@@ -42,7 +40,6 @@ PrimFunc::PrimFunc(Array<tir::Var> params, Stmt body, Type ret_type,
   n->body = std::move(body);
   n->ret_type = std::move(ret_type);
   n->buffer_map = std::move(buffer_map);
-  n->preflattened_buffer_map = preflattened_buffer_map.value_or(Map<tir::Var, Buffer>());
   n->attrs = std::move(attrs);
   n->checked_type_ = n->func_type_annotation();
   n->span = std::move(span);
@@ -129,9 +126,8 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 TVM_REGISTER_GLOBAL("tir.PrimFunc")
     .set_body_typed([](Array<tir::Var> params, Stmt body, Type ret_type,
-                       Map<tir::Var, Buffer> buffer_map,
-                       Map<tir::Var, Buffer> preflattened_buffer_map, DictAttrs attrs, Span span) {
-      return PrimFunc(params, body, ret_type, buffer_map, preflattened_buffer_map, attrs, span);
+                       Map<tir::Var, Buffer> buffer_map, DictAttrs attrs, Span span) {
+      return PrimFunc(params, body, ret_type, buffer_map, attrs, span);
     });
 
 TVM_REGISTER_GLOBAL("tir.TensorIntrin")
diff --git a/src/tir/transforms/bf16_legalize.cc b/src/tir/transforms/bf16_legalize.cc
index 5dc08f31c23c..040c48c79693 100644
--- a/src/tir/transforms/bf16_legalize.cc
+++ b/src/tir/transforms/bf16_legalize.cc
@@ -308,37 +308,8 @@ class BF16LowerRewriter : public StmtExprMutator {
       }
     }
 
-    // Most passes do not change the preflattened buffer map, nor
-    // should they change it.  This is an exception, because the Var
-    // associated with the `BufferNode::data` in
-    // `PrimFunc::buffer_map` may be replaced, and the corresponding
-    // Var in the `PrimFunc::preflattened_buffer_map` must also be
-    // replaced.
-    Map<Var, Buffer> new_preflattened_buffer_map;
-    for (auto& itr : op->preflattened_buffer_map) {
-      auto param_var = itr.first;
-      auto oldbuf = itr.second;
-      if (oldbuf->dtype.is_bfloat16()) {
-        auto it = new_buffer_map.find(param_var);
-        ICHECK(it != new_buffer_map.end())
-            << "PrimFunc parameter " << param_var->name_hint
-            << " is associated with the pre-flattened buffer " << oldbuf->name
-            << ", but isn't associated with any post-flatten buffer.";
-        const Buffer& flatbuf = (*it).second;
-        DataType dtype = DataType::UInt(16, oldbuf->dtype.lanes());
-        auto newbuf = Buffer(flatbuf->data, dtype, oldbuf->shape, oldbuf->strides,
-                             oldbuf->elem_offset, oldbuf->name, oldbuf->data_alignment,
-                             oldbuf->offset_factor, oldbuf->buffer_type);
-        buffer_remap_[oldbuf] = newbuf;
-        new_preflattened_buffer_map.Set(param_var, newbuf);
-      } else {
-        new_preflattened_buffer_map.Set(param_var, oldbuf);
-      }
-    }
-
     if (buffer_remap_.size() != 0) {
       op->buffer_map = new_buffer_map;
-      op->preflattened_buffer_map = new_preflattened_buffer_map;
     }
   }
 
diff --git a/src/tir/transforms/flatten_buffer.cc b/src/tir/transforms/flatten_buffer.cc
index 5441120491c6..d51a44887f54 100644
--- a/src/tir/transforms/flatten_buffer.cc
+++ b/src/tir/transforms/flatten_buffer.cc
@@ -37,22 +37,18 @@ namespace tir {
 class BufferFlattener : public StmtExprMutator {
  public:
   static PrimFunc Flatten(PrimFunc func) {
-    Map<Var, Buffer> preflattened_buffer_map =
-        Merge(func->buffer_map, func->preflattened_buffer_map);
-    auto pass = BufferFlattener(func->buffer_map);
+    auto pass = BufferFlattener();
     auto writer = func.CopyOnWrite();
     writer->body = pass.VisitStmt(func->body);
-    writer->preflattened_buffer_map = preflattened_buffer_map;
-    writer->buffer_map = pass.updated_extern_buffer_map_;
+    // The buffers in func->buffer_map are deliberately left
+    // unflattened, as they are used for validation of user-provided
+    // arguments.  The flattened buffers used in the updated
+    // function body alias the argument buffers.
     return func;
   }
 
  private:
-  explicit BufferFlattener(const Map<Var, Buffer>& extern_buffer_map) {
-    for (const auto& kv : extern_buffer_map) {
-      updated_extern_buffer_map_.Set(kv.first, GetFlattenedBuffer(kv.second));
-    }
-  }
+  BufferFlattener() {}
 
   Stmt VisitStmt_(const BlockNode* op) final {
     ICHECK_EQ(op->match_buffers.size(), 0)
diff --git a/src/tir/transforms/legalize_packed_calls.cc b/src/tir/transforms/legalize_packed_calls.cc
index 344e6c7ae3cb..fed76876f6bf 100644
--- a/src/tir/transforms/legalize_packed_calls.cc
+++ b/src/tir/transforms/legalize_packed_calls.cc
@@ -74,9 +74,9 @@ class PackedCallLegalizer : public StmtExprMutator {
             tvm::runtime::Map<tvm::tir::Var, tvm::tir::Buffer>::iterator param_buf_it;
             if (prim_func != nullptr) {
               auto param_var = prim_func->params[i - 1];
-              param_buf_it = prim_func->preflattened_buffer_map.find(param_var);
+              param_buf_it = prim_func->buffer_map.find(param_var);
             }
-            if (prim_func != nullptr && param_buf_it != prim_func->preflattened_buffer_map.end()) {
+            if (prim_func != nullptr && param_buf_it != prim_func->buffer_map.end()) {
               Buffer param = (*param_buf_it).second;
               PrimExpr shape = tvm::tir::Call(
                   DataType::Handle(), tvm::tir::builtin::tvm_stack_make_shape(), param->shape);
diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc
index 5b9bac03aba9..c1611a23a05f 100644
--- a/src/tir/transforms/make_packed_api.cc
+++ b/src/tir/transforms/make_packed_api.cc
@@ -209,9 +209,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func) {
       continue;
     }
 
-    if (func_ptr->preflattened_buffer_map.count(param)) {
-      buffer_def.emplace_back(v_arg, func_ptr->preflattened_buffer_map[param]);
-    } else if (func_ptr->buffer_map.count(param)) {
+    if (func_ptr->buffer_map.count(param)) {
       buffer_def.emplace_back(v_arg, func_ptr->buffer_map[param]);
     } else {
       var_def.emplace_back(v_arg, param);
diff --git a/src/tir/transforms/plan_update_buffer_allocation_location.cc b/src/tir/transforms/plan_update_buffer_allocation_location.cc
index db59824bf1ce..90150ebd3cdf 100644
--- a/src/tir/transforms/plan_update_buffer_allocation_location.cc
+++ b/src/tir/transforms/plan_update_buffer_allocation_location.cc
@@ -52,21 +52,21 @@ class BufferAllocationLocator : public StmtExprMutator {
  public:
   explicit BufferAllocationLocator(const PrimFunc& func) {
     Map<Buffer, Optional<Stmt>> buffer_lca = DetectBufferAccessLCA(func);
+    std::unordered_set<const VarNode*> arg_buffer_vars;
     CollectUnmanagedAllocations collector;
     collector(func->body);
     unmanaged_allocations_ = collector.unmanaged_allocations;
 
-    std::unordered_set<const BufferNode*> arg_buffers;
     for (const auto& kv : func->buffer_map) {
       const Buffer& buffer = kv.second;
-      arg_buffers.emplace(buffer.get());
+      arg_buffer_vars.emplace(buffer->data.get());
       buffer_data_to_buffer_.Set(buffer->data, buffer);
     }
     // create buffers to be allocated at each stmts
     for (const auto& kv : buffer_lca) {
       const Buffer& buffer = kv.first;
       const StmtNode* stmt = kv.second.get();
-      if (arg_buffers.count(buffer.get())) {
+      if (arg_buffer_vars.count(buffer->data.get())) {
         continue;
       }
       if (!unmanaged_allocations_.count(buffer->data.get())) {
diff --git a/src/tir/transforms/storage_flatten.cc b/src/tir/transforms/storage_flatten.cc
index ab1b062ad647..eb0409e555a1 100644
--- a/src/tir/transforms/storage_flatten.cc
+++ b/src/tir/transforms/storage_flatten.cc
@@ -402,6 +402,7 @@ class BufferStrideLegalize : public StmtExprMutator {
 
       auto fptr = func.CopyOnWrite();
       fptr->body = pass(std::move(fptr->body));
+      fptr->buffer_map = pass.UpdatedExternBufferMap();
       if (auto map = func->attrs.GetAttr<Map<Buffer, Array<IndexMap>>>("layout_transform_map")) {
         func = WithAttr(std::move(func), "layout_transform_map", pass.UpdateIndexMap(map.value()));
       }
@@ -420,7 +421,6 @@ class BufferStrideLegalize : public StmtExprMutator {
         BufferEntry entry;
         entry.remap_to = with_strides;
         entry.in_scope = true;
-        entry.is_external = true;
         buf_map_[buf] = entry;
       }
       updated_extern_buffer_map_.Set(kv.first, with_strides);
@@ -443,51 +443,54 @@ class BufferStrideLegalize : public StmtExprMutator {
   Map<Var, Buffer> UpdatedExternBufferMap() const { return updated_extern_buffer_map_; }
 
   Buffer WithStrides(Buffer buf) {
-    auto it = buf_map_.find(buf);
+    auto cache_key = buf;
+
+    auto it = buf_map_.find(cache_key);
     if (it != buf_map_.end()) {
       const BufferEntry& entry = it->second;
       ICHECK(entry.in_scope) << "Cannot annotate an out-of-scope buffer";
       return entry.remap_to;
     }
 
+    Array<PrimExpr> shape = buf->shape;
+
     if (buf->strides.size()) {
       ICHECK_EQ(buf->strides.size(), buf->shape.size())
           << "Buffer " << buf << " has inconsistent strides/shape.";
-      return buf;
-    }
-
-    // Keeping this to have matched behavior to previous version.
-    // There are many parts of the codebase that assume that a strided
-    // array cannot be compact.  For example, ArgBinder::BindBuffer
-    // and tir.Specialize.
-    if (dim_align_.count(buf) == 0) {
-      return buf;
-    }
-
-    // Can't define the strides for a buffer without a known shape.
-    Array<PrimExpr> shape = buf->shape;
-    if (shape.size() == 0) {
-      return buf;
-    }
-
-    std::vector<PrimExpr> rstrides;
-    const std::vector<DimAlignInfo>& avec = dim_align_[buf];
-    int first_dim = 0;
-    PrimExpr stride = make_const(shape[first_dim].dtype(), 1);
-    for (size_t i = shape.size(); i != 0; --i) {
-      size_t dim = i - 1;
-      if (dim < avec.size() && avec[dim].align_factor != 0) {
-        PrimExpr factor = make_const(stride.dtype(), avec[dim].align_factor);
-        PrimExpr offset = make_const(stride.dtype(), avec[dim].align_offset);
-        stride = stride + indexmod(factor + offset - indexmod(stride, factor), factor);
-        stride = bound_analyzer_->Simplify(stride);
+    } else if (dim_align_.count(buf) == 0) {
+      // Keeping this to have matched behavior to previous version.
+      // There are many parts of the codebase that assume that a
+      // strided array cannot be compact.  For example,
+      // ArgBinder::BindBuffer and tir.Specialize.  To avoid breaking
+      // these, do not define the strides unless required for a
+      // non-compact array.
+    } else if (shape.size() == 0) {
+      // Can't define the strides for a buffer without a known shape.
+    } else {
+      // With everything checked, can now define the updated strides
+      std::vector<PrimExpr> rstrides;
+      const std::vector<DimAlignInfo>& avec = dim_align_[buf];
+      int first_dim = 0;
+      PrimExpr stride = make_const(shape[first_dim].dtype(), 1);
+      for (size_t i = shape.size(); i != 0; --i) {
+        size_t dim = i - 1;
+        if (dim < avec.size() && avec[dim].align_factor != 0) {
+          PrimExpr factor = make_const(stride.dtype(), avec[dim].align_factor);
+          PrimExpr offset = make_const(stride.dtype(), avec[dim].align_offset);
+          stride = stride + indexmod(factor + offset - indexmod(stride, factor), factor);
+          stride = bound_analyzer_->Simplify(stride);
+        }
+        rstrides.push_back(stride);
+        stride = stride * shape[dim];
       }
-      rstrides.push_back(stride);
-      stride = stride * shape[dim];
+
+      buf.CopyOnWrite()->strides = Array<PrimExpr>(rstrides.rbegin(), rstrides.rend());
     }
 
-    auto ptr = buf.CopyOnWrite();
-    ptr->strides = Array<PrimExpr>(rstrides.rbegin(), rstrides.rend());
+    BufferEntry entry;
+    entry.remap_to = buf;
+    entry.in_scope = true;
+    buf_map_[cache_key] = entry;
 
     return buf;
   }
@@ -513,16 +516,10 @@ class BufferStrideLegalize : public StmtExprMutator {
       Buffer target_with_strides = WithStrides(Downcast<Buffer>(arr[1]));
       Buffer source_with_strides = WithStrides(source);
 
-      {
-        BufferEntry entry;
-        entry.remap_to = source_with_strides;
-        entry.in_scope = true;
-        entry.is_external = false;
-        buf_map_[source] = entry;
-      }
-
       Stmt body = this->VisitStmt(op->body);
 
+      buf_map_[source].in_scope = false;
+
       return AttrStmt(Array<ObjectRef>{source_with_strides, target_with_strides}, op->attr_key,
                       op->value, body, op->span);
     } else {
@@ -560,13 +557,6 @@ class BufferStrideLegalize : public StmtExprMutator {
   Stmt VisitStmt_(const BufferRealizeNode* op) final {
     Buffer key = op->buffer;
     Buffer with_strides = WithStrides(op->buffer);
-    {
-      BufferEntry entry;
-      entry.remap_to = with_strides;
-      entry.in_scope = true;
-      entry.is_external = false;
-      buf_map_[key] = entry;
-    }
 
     Stmt stmt = StmtExprMutator::VisitStmt_(op);
 
@@ -589,22 +579,14 @@ class BufferStrideLegalize : public StmtExprMutator {
 
   template <typename Node>
   Node VisitBufferAccess(Node node) {
-    auto alloc_key = node->buffer->data.get();
-    if (!buf_map_.count(node->buffer) && buffer_var_defines_.count(alloc_key)) {
-      BufferEntry entry;
-      entry.remap_to = WithStrides(node->buffer);
-      entry.in_scope = true;
-      entry.is_external = false;
-      buf_map_[node->buffer] = entry;
-    }
-
     auto it = buf_map_.find(node->buffer);
-    ICHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << node->buffer;
-    const BufferEntry& e = it->second;
-    ICHECK(e.in_scope) << "Cannot access a buffer " << node->buffer->name << ", out of scope";
+    ICHECK(it == buf_map_.end() || it->second.in_scope)
+        << "Cannot access a buffer " << node->buffer->name << ", out of scope";
 
-    auto writer = node.CopyOnWrite();
-    writer->buffer = e.remap_to;
+    auto with_strides = WithStrides(node->buffer);
+    if (!with_strides.same_as(node->buffer)) {
+      node.CopyOnWrite()->buffer = with_strides;
+    }
 
     return node;
   }
@@ -623,7 +605,6 @@ class BufferStrideLegalize : public StmtExprMutator {
   struct BufferEntry {
     Buffer remap_to;
     bool in_scope;
-    bool is_external;
   };
 
   std::unordered_map<Buffer, BufferEntry, ObjectPtrHash, ObjectPtrEqual> buf_map_;
@@ -846,6 +827,7 @@ class BufferBindUnwrapper : public StmtExprMutator {
       BufferEntry e;
       e.buffer = kv.second;
       e.external = true;
+      var_to_buffer_[kv.second->data.get()] = kv.second;
       buf_map_[kv.second.get()] = std::move(e);
     }
   }
@@ -1001,6 +983,7 @@ class BufferBindUnwrapper : public StmtExprMutator {
       BufferEntry e;
       e.bounds = op->bounds;
       e.buffer = op->buffer;
+      var_to_buffer_[op->buffer->data.get()] = op->buffer;
       buf_map_[key] = std::move(e);
     }
 
@@ -1089,6 +1072,7 @@ class BufferBindUnwrapper : public StmtExprMutator {
       source_info.buffer = source;
       source_info.remap = std::make_unique<RemapInfo>(remap);
 
+      var_to_buffer_[source->data.get()] = source;
       buf_map_[source.get()] = std::move(source_info);
     }
 
@@ -1160,18 +1144,70 @@ class BufferBindUnwrapper : public StmtExprMutator {
   };
 
   const BufferEntry& GetBufferEntry(Buffer buffer) {
-    auto alloc_key = buffer->data.get();
-    if (!buf_map_.count(buffer.get()) && buffer_var_defines_.count(alloc_key)) {
+    if (buf_map_.count(buffer.get())) {
+      const BufferEntry& e = buf_map_[buffer.get()];
+      ICHECK(e.in_scope) << "Cannot access a buffer " << buffer->name << ", out of scope";
+      return e;
+    } else if (buffer_var_defines_.count(buffer->data.get())) {
+      // The buffer var was defined, but the buffer hasn't been seen
+      // before.
       BufferEntry entry;
       entry.buffer = buffer;
+      var_to_buffer_[buffer->data.get()] = buffer;
       buf_map_[buffer.get()] = std::move(entry);
-    }
+      return buf_map_[buffer.get()];
+    } else if (var_remap_.count(buffer->data.get())) {
+      // The buffer var is an alias of a bound buffer.  Only
+      // supported if the bound buffer has no offsets.  In this
+      // case, we just need to make a new aliasing buffer that
+      // shares the remapped data variable.
+      Var old_var = buffer->data;
+      Var new_var = Downcast<Var>(var_remap_[old_var.get()]);
 
-    auto it = buf_map_.find(buffer.get());
-    ICHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << buffer;
-    const BufferEntry& e = it->second;
-    ICHECK(e.in_scope) << "Cannot access a buffer " << buffer->name << ", out of scope";
-    return it->second;
+      {
+        ICHECK(var_to_buffer_.count(old_var.get()))
+            << "Cannot find remap information for aliased buffer var " << old_var->name_hint
+            << ", required to verify this alias is legal.";
+        const Buffer& aliased_buffer = var_to_buffer_[old_var.get()];
+        const BufferEntry& entry = buf_map_[aliased_buffer.get()];
+        if (entry.remap) {
+          for (const auto& begin : entry.remap->begins) {
+            ICHECK(is_zero(begin)) << "Aliasing of buffer with offset is not supported";
+          }
+        }
+      }
+
+      {
+        Buffer new_buf = buffer;
+        new_buf.CopyOnWrite()->data = new_var;
+
+        RemapInfo remap_info;
+        remap_info.target = new_buf;
+        remap_info.begins = Array<PrimExpr>(buffer->shape.size(), 0);
+        remap_info.extents = buffer->shape;
+
+        BufferEntry entry;
+        entry.buffer = buffer;
+        entry.remap = std::make_unique<RemapInfo>(remap_info);
+        entry.in_scope = true;
+        var_to_buffer_[buffer->data.get()] = buffer;
+        buf_map_[buffer.get()] = std::move(entry);
+      }
+      return buf_map_[buffer.get()];
+    } else if (var_to_buffer_.count(buffer->data.get())) {
+      // This buffer is an alias of a known buffer, with no remaps.  A
+      // buffer entry should be generated and returned.
+      BufferEntry entry;
+      entry.buffer = buffer;
+      entry.in_scope = true;
+      var_to_buffer_[buffer->data.get()] = buffer;
+      buf_map_[buffer.get()] = std::move(entry);
+
+      return buf_map_[buffer.get()];
+    } else {
+      LOG(FATAL) << "Can't work around the undefined buffer";
+      return *static_cast<BufferEntry*>(nullptr);
+    }
   }
 
   // The buffer assignment map
@@ -1181,6 +1217,9 @@ class BufferBindUnwrapper : public StmtExprMutator {
   std::unordered_set<const VarNode*> illegal_vars_;
   // Buffer map
   std::unordered_map<const BufferNode*, BufferEntry> buf_map_;
+  // Map from Var to the Buffer they occurred in.  In case of aliased
+  // buffers, contains the first buffer.
+  std::unordered_map<const VarNode*, Buffer> var_to_buffer_;
   // Set of vars that have occurred in an AllocateNode, but haven't
   // yet occurred in a BufferLoad/BufferStore.
   std::unordered_set<const VarNode*> buffer_var_defines_;
@@ -1311,13 +1350,12 @@ class StorageFlattener : public StmtExprMutator {
       auto pass = StorageFlattener(func->buffer_map, cache_line_size, create_bound_attributes,
                                    &bound_analyzer);
 
-      Map<Var, Buffer> preflattened_buffer_map =
-          Merge(func->buffer_map, func->preflattened_buffer_map);
-
       auto fptr = func.CopyOnWrite();
       fptr->body = pass(std::move(fptr->body));
-      fptr->preflattened_buffer_map = preflattened_buffer_map;
-      fptr->buffer_map = pass.UpdatedBufferMap();
+      // The buffers in func->buffer_map are deliberately left
+      // unflattened, as they are used for validation of user-provided
+      // arguments.  The flattened buffers used in the updated
+      // function body alias the argument buffers.
       return func;
     };
     return transform::CreatePrimFuncPass(pass_func, 0, "tir.StorageFlattener", {});
@@ -1345,15 +1383,12 @@ class StorageFlattener : public StmtExprMutator {
         }
       }
       e.external = true;
+      buffer_var_defines_.insert(kv.second->data.get());
       buf_map_[kv.second] = e;
-
-      updated_extern_buffer_map_.Set(kv.first, e.flattened_buffer);
     }
     cache_line_size_ = cache_line_size;
   }
 
-  Map<Var, Buffer> UpdatedBufferMap() { return updated_extern_buffer_map_; }
-
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
     return Stmt();
@@ -1512,8 +1547,10 @@ class StorageFlattener : public StmtExprMutator {
         writer->dtype = DataType::Int(8);
       }
 
+      buffer_var_defines_.insert(op->buffer->data.get());
       buf_map_[key] = e;
       Stmt body = this->VisitStmt(op->body);
+      buffer_var_defines_.erase(op->buffer->data.get());
       buf_map_[key].in_scope = false;
 
       Stmt ret =
@@ -1777,8 +1814,6 @@ class StorageFlattener : public StmtExprMutator {
   std::unordered_map<const VarNode*, std::vector<Buffer>> buffer_var_map_;
   // Buffer map
   std::unordered_map<Buffer, BufferEntry, ObjectPtrHash, ObjectPtrEqual> buf_map_;
-  // The extern buffer map, updated to include flattened buffers.
-  Map<Var, Buffer> updated_extern_buffer_map_;
   // Collects shapes.
   std::vector<std::pair<Var, Array<PrimExpr>>> shape_collector_;
   // bounds populator. We really need the analyzer from it.
diff --git a/src/tir/usmp/transform/assign_pool_info.cc b/src/tir/usmp/transform/assign_pool_info.cc
index 0671f1ea2722..2bded7b4877b 100644
--- a/src/tir/usmp/transform/assign_pool_info.cc
+++ b/src/tir/usmp/transform/assign_pool_info.cc
@@ -166,8 +166,8 @@ IRModule PoolInfoAssigner::operator()() {
     if (kv.second->IsInstance<PrimFuncNode>()) {
       func_ = Downcast<PrimFunc>(kv.second);
       Stmt body = this->VisitStmt(func_->body);
-      PrimFunc new_prim_func = PrimFunc(func_->params, body, func_->ret_type, func_->buffer_map,
-                                        func_->preflattened_buffer_map, func_->attrs);
+      PrimFunc new_prim_func =
+          PrimFunc(func_->params, body, func_->ret_type, func_->buffer_map, func_->attrs);
       mod_->Update(gv, new_prim_func);
     }
   }
diff --git a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc
index 56aba654b59e..439e2643380a 100644
--- a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc
+++ b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc
@@ -242,8 +242,8 @@ PrimFunc PoolAllocationToOffsetConverter::CreatePrimFuncWithPoolParams(
     if (emit_tvmscript_printable_) {
       original_attrs = DictAttrs();
     }
-    PrimFunc ret = PrimFunc(si.params, new_body, original_primfunc->ret_type, si.buffer_map,
-                            si.buffer_map, original_attrs);
+    PrimFunc ret =
+        PrimFunc(si.params, new_body, original_primfunc->ret_type, si.buffer_map, original_attrs);
     if (!emit_tvmscript_printable_) {
       ret = WithAttr(ret, tvm::attr::kPoolArgs, si.allocated_pool_params);
     }
@@ -449,12 +449,12 @@ IRModule PoolAllocationToOffsetConverter::operator()() {
   // We dont need attrs of PrimFunc that might include non printable attrs such as target
   // for unit tests where emit_tvmscript_printable_ is to be used.
   if (!emit_tvmscript_printable_) {
-    main_func = PrimFunc(si.params, main_func_body, main_func->ret_type, si.buffer_map, {},
-                         main_func->attrs);
+    main_func =
+        PrimFunc(si.params, main_func_body, main_func->ret_type, si.buffer_map, main_func->attrs);
     main_func = WithAttr(main_func, tvm::attr::kPoolArgs, si.allocated_pool_params);
   } else {
     main_func =
-        PrimFunc(si.params, main_func_body, main_func->ret_type, si.buffer_map, {}, DictAttrs());
+        PrimFunc(si.params, main_func_body, main_func->ret_type, si.buffer_map, DictAttrs());
   }
   module_->Update(gv, main_func);
   if (!emit_tvmscript_printable_) {
diff --git a/src/tir/usmp/transform/create_io_allocates.cc b/src/tir/usmp/transform/create_io_allocates.cc
index 59eee961632d..cf754131776c 100644
--- a/src/tir/usmp/transform/create_io_allocates.cc
+++ b/src/tir/usmp/transform/create_io_allocates.cc
@@ -195,9 +195,8 @@ IRModule IOAllocateCreator::operator()() {
     }
   }
   const GlobalVar& gv = mod_->GetGlobalVar(::tvm::runtime::symbol::tvm_module_main);
-  mod_->Update(gv,
-               PrimFunc(new_main_params, main_body, main_func_->ret_type, main_func_->buffer_map,
-                        main_func_->preflattened_buffer_map, main_func_->attrs, main_func_->span));
+  mod_->Update(gv, PrimFunc(new_main_params, main_body, main_func_->ret_type,
+                            main_func_->buffer_map, main_func_->attrs, main_func_->span));
   return mod_;
 }
 
diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py
index c751d44b6156..61128da71c37 100644
--- a/tests/python/contrib/test_ethosu/test_encode_constants.py
+++ b/tests/python/contrib/test_ethosu/test_encode_constants.py
@@ -34,9 +34,11 @@
 @tvm.script.ir_module
 class WeightStreamOnlyU55:
     @T.prim_func
-    def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
+    def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+        placeholder = T.buffer_decl([8192], "int8", data=input_placeholder.data)
+        ethosu_write = T.buffer_decl([2048], "int8", data=input_ethosu_write.data)
         buffer1 = T.buffer_decl([160], "uint8")
         buffer3 = T.buffer_decl([144], "uint8")
         buffer5 = T.buffer_decl([144], "uint8")
@@ -62,10 +64,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
 @tvm.script.ir_module
 class WeightStreamOnlyU65:
     @T.prim_func
-    def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
+    def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         # buffer definition
+        placeholder = T.buffer_decl([8192], dtype="int8", data=input_placeholder.data)
+        ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data)
         buffer_encoded_1 = T.buffer_decl([192], dtype="uint8")
         buffer_encoded_2_1 = T.buffer_decl([192], dtype="uint8")
         buffer_encoded_4_1 = T.buffer_decl([208], dtype="uint8")
@@ -148,10 +152,12 @@ def _get_func():
 @tvm.script.ir_module
 class RereadWeightsU55:
     @T.prim_func
-    def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
+    def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer1 = T.buffer_decl([384], "uint8")
+        placeholder = T.buffer_decl([8192], "int8", data=input_placeholder.data)
+        ethosu_write = T.buffer_decl([2048], "int8", data=input_ethosu_write.data)
         # body
         p1_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True})
         p1 = T.buffer_decl([384], "uint8", data=p1_data)
@@ -167,10 +173,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
 @tvm.script.ir_module
 class RereadWeightsU65:
     @T.prim_func
-    def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
+    def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         # buffer definition
+        placeholder = T.buffer_decl([8192], dtype="int8", data=input_placeholder.data)
+        ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data)
         placeholder_encoded_1 = T.buffer_decl([464], "uint8")
         # body
         p1_data = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True})
@@ -246,13 +254,15 @@ def _get_func():
 @tvm.script.ir_module
 class DirectReadOnlyU55:
     @T.prim_func
-    def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
+    def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([592], "uint8")
         buffer_1 = T.buffer_decl([160], "uint8")
         buffer_2 = T.buffer_decl([160], "uint8")
         buffer_3 = T.buffer_decl([80], "uint8")
+        placeholder = T.buffer_decl([8192], "int8", data=input_placeholder.data)
+        ethosu_write = T.buffer_decl([2048], "int8", data=input_ethosu_write.data)
         # body
         ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
         ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data)
@@ -264,7 +274,7 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
 @tvm.script.ir_module
 class DirectReadOnlyU65:
     @T.prim_func
-    def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
+    def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         # buffer definition
@@ -272,6 +282,8 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
         placeholder_encoded_1 = T.buffer_decl([160], dtype="uint8")
         placeholder_encoded_2 = T.buffer_decl([208], dtype="uint8")
         placeholder_encoded_3 = T.buffer_decl([96], dtype="uint8")
+        placeholder = T.buffer_decl([8192], dtype="int8", data=input_placeholder.data)
+        ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data)
         # body
         ethosu_write_2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
         ethosu_write_2 = T.buffer_decl([4096], "int8", data=ethosu_write_2_data)
@@ -340,7 +352,7 @@ def _get_func():
 @tvm.script.ir_module
 class MixedReadU55:
     @T.prim_func
-    def main(ifm: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
+    def main(input_ifm: T.Buffer[(1,16,16,32), "int8"], input_ethosu_write: T.Buffer[(1,16,16,8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer1 = T.buffer_decl([112], "uint8")
@@ -349,6 +361,8 @@ def main(ifm: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]
         buffer7 = T.buffer_decl([112], "uint8")
         buffer9 = T.buffer_decl([592], "uint8")
         buffer10 = T.buffer_decl([160], "uint8")
+        ifm = T.buffer_decl([8192], "int8", data=input_ifm.data)
+        ethosu_write = T.buffer_decl([2048], "int8", data=input_ethosu_write.data)
         # body
         p1_data = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True})
         p1 = T.buffer_decl([112], "uint8", data=p1_data)
@@ -371,11 +385,12 @@ def main(ifm: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]
 @tvm.script.ir_module
 class MixedReadU65:
     @T.prim_func
-    def main(ifm: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
+    def main(input_ifm: T.Buffer[(1,16,16,32), "int8"], input_ethosu_write: T.Buffer[(1,16,16,8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-
         # buffer definition
+        ifm = T.buffer_decl([8192], dtype="int8", data=input_ifm.data)
+        ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data)
         buffer1 = T.buffer_decl([128], dtype="uint8")
         buffer2 = T.buffer_decl([128], dtype="uint8")
         buffer3 = T.buffer_decl([128], dtype="uint8")
diff --git a/tests/python/contrib/test_ethosu/test_hoist_allocates.py b/tests/python/contrib/test_ethosu/test_hoist_allocates.py
index 6c6d51fa06b9..1508aa441c3b 100644
--- a/tests/python/contrib/test_ethosu/test_hoist_allocates.py
+++ b/tests/python/contrib/test_ethosu/test_hoist_allocates.py
@@ -106,15 +106,15 @@ def test_double_convolution():
     @tvm.script.ir_module
     class Module:
         @T.prim_func
-        def main(placeholder: T.Buffer[(3402,), "int8"], placeholder_encoded: T.Buffer[(128,), "uint8"], placeholder_encoded_1: T.Buffer[(32,), "uint8"], placeholder_encoded_2: T.Buffer[(128,), "uint8"], placeholder_encoded_3: T.Buffer[(32,), "uint8"], ethosu_write: T.Buffer[(3402,), "int8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1, 27, 42, 3), "int8"], input_placeholder_encoded: T.Buffer[(3, 3, 2, 3), "uint8"], input_placeholder_encoded_1: T.Buffer[(3, 10), "uint8"], input_placeholder_encoded_2: T.Buffer[(3, 3, 2, 3), "uint8"], input_placeholder_encoded_3: T.Buffer[(3, 10), "uint8"], input_ethosu_write: T.Buffer[(1, 27, 42, 3), "int8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-            T.preflattened_buffer(placeholder, [1, 27, 42, 3], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(placeholder_encoded, [3, 3, 2, 3], dtype="int8")
-            T.preflattened_buffer(placeholder_encoded_1, [3, 10], dtype="uint8")
-            T.preflattened_buffer(placeholder_encoded_2, [3, 3, 2, 3], dtype="int8")
-            T.preflattened_buffer(placeholder_encoded_3, [3, 10], dtype="uint8")
-            T.preflattened_buffer(ethosu_write, [1, 27, 42, 3], dtype="int8", data=ethosu_write.data)
+            placeholder = T.buffer_decl([3402], dtype="int8", data=input_placeholder.data)
+            placeholder_encoded = T.buffer_decl([128], dtype="int8", data=input_placeholder_encoded.data)
+            placeholder_encoded_1 = T.buffer_decl([32], dtype="uint8", data=input_placeholder_encoded_1.data)
+            placeholder_encoded_2 = T.buffer_decl([128], dtype="int8", data=input_placeholder_encoded_2.data)
+            placeholder_encoded_3 = T.buffer_decl([32], dtype="uint8", data=input_placeholder_encoded_3.data)
+            ethosu_write = T.buffer_decl([3402], dtype="int8", data=input_ethosu_write.data)
             # body
             placeholder_global_data = T.allocate([128], "uint8", "global")
             placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data)
@@ -150,11 +150,10 @@ def test_identities():
     @tvm.script.ir_module
     class Module:
         @T.prim_func
-        def main(placeholder: T.Buffer[(24,), "int8"], T_concat: T.Buffer[(24,), "int8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1, 2, 3, 4), "int8"], T_concat: T.Buffer[(24,), "int8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-            T.preflattened_buffer(placeholder, [1, 2, 3, 4], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(T_concat, [24], dtype="int8", data=T_concat.data)
+            placeholder = T.buffer_decl([24], dtype="int8", data=input_placeholder.data)
             # body
             ethosu_write_data = T.allocate([12], "int8", "global")
             ethosu_write = T.buffer_decl([12], "int8", data=ethosu_write_data)
@@ -188,11 +187,11 @@ def test_outer_seq_stmt():
     @tvm.script.ir_module
     class Module:
         @T.prim_func
-        def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"], buffer_encoded: T.Buffer[(128,), "uint8"], buffer_encoded_1: T.Buffer[(32,), "uint8"], buffer_encoded_2: T.Buffer[(112,), "uint8"], buffer_encoded_3: T.Buffer[(32,), "uint8"], buffer_encoded_4: T.Buffer[(112,), "uint8"], buffer_encoded_5: T.Buffer[(32,), "uint8"], buffer_encoded_6: T.Buffer[(112,), "uint8"], buffer_encoded_7: T.Buffer[(32,), "uint8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"], buffer_encoded: T.Buffer[(128,), "uint8"], buffer_encoded_1: T.Buffer[(32,), "uint8"], buffer_encoded_2: T.Buffer[(112,), "uint8"], buffer_encoded_3: T.Buffer[(32,), "uint8"], buffer_encoded_4: T.Buffer[(112,), "uint8"], buffer_encoded_5: T.Buffer[(32,), "uint8"], buffer_encoded_6: T.Buffer[(112,), "uint8"], buffer_encoded_7: T.Buffer[(32,), "uint8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-            T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+            placeholder = T.buffer_decl([8192], dtype="int8", data=input_placeholder.data)
+            ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data)
             # body
             with T.allocate([128], "uint8", "global") as placeholder_global_data:
                 placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data)
@@ -238,11 +237,11 @@ def test_allocate_without_seq_stmt():
     @tvm.script.ir_module
     class Module:
         @T.prim_func
-        def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"], buffer_encoded: T.Buffer[(128,), "uint8"], buffer_encoded_1: T.Buffer[(32,), "uint8"], buffer_encoded_2: T.Buffer[(112,), "uint8"], buffer_encoded_3: T.Buffer[(32,), "uint8"], buffer_encoded_4: T.Buffer[(112,), "uint8"], buffer_encoded_5: T.Buffer[(32,), "uint8"], buffer_encoded_6: T.Buffer[(112,), "uint8"], buffer_encoded_7: T.Buffer[(32,), "uint8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"], buffer_encoded: T.Buffer[(128,), "uint8"], buffer_encoded_1: T.Buffer[(32,), "uint8"], buffer_encoded_2: T.Buffer[(112,), "uint8"], buffer_encoded_3: T.Buffer[(32,), "uint8"], buffer_encoded_4: T.Buffer[(112,), "uint8"], buffer_encoded_5: T.Buffer[(32,), "uint8"], buffer_encoded_6: T.Buffer[(112,), "uint8"], buffer_encoded_7: T.Buffer[(32,), "uint8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-            T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+            placeholder = T.buffer_decl([8192], dtype="int8", data=input_placeholder.data)
+            ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data)
             # body
             placeholder_global_data = T.allocate([128], "uint8", "global")
             placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data)
diff --git a/tests/python/contrib/test_ethosu/test_merge_constants.py b/tests/python/contrib/test_ethosu/test_merge_constants.py
index a5adcfceac83..ed1927b849d6 100644
--- a/tests/python/contrib/test_ethosu/test_merge_constants.py
+++ b/tests/python/contrib/test_ethosu/test_merge_constants.py
@@ -399,12 +399,12 @@ def test_read_from_the_same_buffer():
     @tvm.script.ir_module
     class InputModule:
         @T.prim_func
-        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint8"], buffer2: T.Buffer[(96,), "uint8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], buffer1: T.Buffer[(368,), "uint8"], buffer2: T.Buffer[(96,), "uint8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             # buffer definition
-            T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+            placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data)
+            ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data)
             # body
             p1_data = T.allocate([368], "uint8", "global")
             p1 = T.buffer_decl([368], "uint8", data=p1_data)
@@ -419,9 +419,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint
     @tvm.script.ir_module
     class ReferenceModule:
         @T.prim_func
-        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(464,), "uint8"], input_ethosu_write: T.Buffer[(1,16,16,8), "int8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            # buffer definition
+            placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data)
+            ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data)
             # body
             p1_data = T.allocate([464], "uint8", "global")
             p1 = T.buffer_decl([464], "uint8", data=p1_data)
@@ -446,12 +449,12 @@ def test_arbitrary_argument_order():
     @tvm.script.ir_module
     class InputModule:
         @T.prim_func
-        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint8"], buffer2: T.Buffer[(96,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"], buffer3: T.Buffer[(368,), "uint8"], buffer4: T.Buffer[(96,), "uint8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(368,), "uint8"], buffer2: T.Buffer[(96,), "uint8"], input_ethosu_write: T.Buffer[(1,16,16,8), "int8"], buffer3: T.Buffer[(368,), "uint8"], buffer4: T.Buffer[(96,), "uint8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             # buffer definition
-            T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+            placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data)
+            ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data)
             # body
             p1_data = T.allocate([368], "uint8", "global")
             p1 = T.buffer_decl([368], "uint8", data=p1_data)
@@ -473,9 +476,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint
     @tvm.script.ir_module
     class ReferenceModule:
         @T.prim_func
-        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"], buffer2: T.Buffer[(464,), "uint8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(464,), "uint8"], input_ethosu_write: T.Buffer[(1,16,16,8), "int8"], buffer2: T.Buffer[(464,), "uint8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            # buffer definition
+            placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data)
+            ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data)
             # body
             p1_data = T.allocate([464], "uint8", "global")
             p1 = T.buffer_decl([464], "uint8", data=p1_data)
@@ -509,12 +515,12 @@ def test_arbitrary_argument_order_const_split():
     @tvm.script.ir_module
     class InputModule:
         @T.prim_func
-        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"], buffer2: T.Buffer[(96,), "uint8"], buffer3: T.Buffer[(368,), "uint8"], buffer4: T.Buffer[(96,), "uint8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(368,), "uint8"], input_ethosu_write: T.Buffer[(1,16,16,8), "int8"], buffer2: T.Buffer[(96,), "uint8"], buffer3: T.Buffer[(368,), "uint8"], buffer4: T.Buffer[(96,), "uint8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             # buffer definition
-            T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+            placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data)
+            ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data)
             # body
             p1_data = T.allocate([368], "uint8", "global")
             p1 = T.buffer_decl([368], "uint8", data=p1_data)
@@ -536,9 +542,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint
     @tvm.script.ir_module
     class ReferenceModule:
         @T.prim_func
-        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"], buffer2: T.Buffer[(464,), "uint8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(464,), "uint8"], input_ethosu_write: T.Buffer[(1,16,16,8), "int8"], buffer2: T.Buffer[(464,), "uint8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            # buffer definition
+            placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data)
+            ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data)
             # body
             p1_data = T.allocate([464], "uint8", "global")
             p1 = T.buffer_decl([464], "uint8", data=p1_data)
@@ -572,12 +581,12 @@ def test_arbitrary_argument_order_const_split_mixed():
     @tvm.script.ir_module
     class InputModule:
         @T.prim_func
-        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint8"], buffer2: T.Buffer[(368,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"], buffer3: T.Buffer[(96,), "uint8"], buffer4: T.Buffer[(96,), "uint8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(368,), "uint8"], buffer2: T.Buffer[(368,), "uint8"], input_ethosu_write: T.Buffer[(2,16,16,8), "int8"], buffer3: T.Buffer[(96,), "uint8"], buffer4: T.Buffer[(96,), "uint8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             # buffer definition
-            T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+            placeholder = T.buffer_decl(8192, dtype='int8', data=input_placeholder.data)
+            ethosu_write = T.buffer_decl(4096, dtype='int8', data=input_ethosu_write.data)
             # body
             p1_data = T.allocate([368], "uint8", "global")
             p1 = T.buffer_decl([368], "uint8", data=p1_data)
@@ -599,9 +608,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint
     @tvm.script.ir_module
     class ReferenceModule:
         @T.prim_func
-        def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint8"], buffer2: T.Buffer[(464,), "uint8"], ethosu_write: T.Buffer[(4096,), "int8"]) -> None:
+        def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(464,), "uint8"], buffer2: T.Buffer[(464,), "uint8"], input_ethosu_write: T.Buffer[(2,16,16,8), "int8"]) -> None:
             # function attr dict
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            # buffer definition
+            placeholder = T.buffer_decl(8192, dtype='int8', data=input_placeholder.data)
+            ethosu_write = T.buffer_decl(4096, dtype='int8', data=input_ethosu_write.data)
             # body
             p1_data = T.allocate([464], "uint8", "global")
             p1 = T.buffer_decl([464], "uint8", data=p1_data)
diff --git a/tests/python/contrib/test_ethosu/test_remove_concatenates.py b/tests/python/contrib/test_ethosu/test_remove_concatenates.py
index e6414c24d4a3..379a35b1b4a4 100644
--- a/tests/python/contrib/test_ethosu/test_remove_concatenates.py
+++ b/tests/python/contrib/test_ethosu/test_remove_concatenates.py
@@ -30,9 +30,14 @@
 @tvm.script.ir_module
 class ReferenceModule:
     @T.prim_func
-    def main(placeholder: T.Buffer[(1536,), "int8"], placeholder_1: T.Buffer[(1280,), "int8"], T_concat: T.Buffer[(4096,), "int8"]) -> None:
+    def main(input_placeholder: T.Buffer[(1,8,12,16), "int8"], input_placeholder_1: T.Buffer[(1,8,10,16), "int8"], input_T_concat: T.Buffer[(1,8,32,16), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+
+        placeholder = T.buffer_decl(1536, dtype="int8", data=input_placeholder.data)
+        placeholder_1 = T.buffer_decl(1280, dtype="int8", data=input_placeholder_1.data)
+        T_concat = T.buffer_decl(4096, dtype="int8", data=input_T_concat.data)
+
         buffer = T.buffer_decl([2992], "uint8")
         buffer_1 = T.buffer_decl([160], "uint8")
         buffer_2 = T.buffer_decl([2992], "uint8")
diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
index ae46057369e0..46c6976567c8 100644
--- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py
+++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
@@ -366,13 +366,15 @@ def _visit(stmt):
 @tvm.script.ir_module
 class Conv2dDoubleCascade1:
     @T.prim_func
-    def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512,), "int8"]) -> None:
+    def main(input_placeholder_5: T.Buffer[(1, 8, 8, 3), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 8, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([304], "uint8")
         buffer_1 = T.buffer_decl([80], "uint8")
         buffer_2 = T.buffer_decl([320], "uint8")
         buffer_3 = T.buffer_decl([160], "uint8")
+        placeholder_5 = T.buffer_decl([192], 'int8', data=input_placeholder_5.data)
+        ethosu_write_1 = T.buffer_decl([512], 'int8', data=input_ethosu_write_1.data)
         # body
         ethosu_write_2_data = T.allocate([1024], "int8", "global", annotations={"disable_lower_builtin": True})
         ethosu_write_2 = T.buffer_decl([1024], "int8", data=ethosu_write_2_data)
@@ -386,13 +388,15 @@ def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512,
 @tvm.script.ir_module
 class Conv2dDoubleCascade2:
     @T.prim_func
-    def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512,), "int8"]) -> None:
+    def main(input_placeholder_5: T.Buffer[(1, 8, 8, 3), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 8, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([80], "uint8")
         buffer_1 = T.buffer_decl([320], "uint8")
         buffer_2 = T.buffer_decl([1312], "uint8")
         buffer_3 = T.buffer_decl([2608], "uint8")
+        placeholder_5 = T.buffer_decl([192], 'int8', data=input_placeholder_5.data)
+        ethosu_write_1 = T.buffer_decl([512], 'int8', data=input_ethosu_write_1.data)
         # body
         ethosu_write_2_data = T.allocate([1536], "int8", "global", annotations={"disable_lower_builtin": True})
         ethosu_write_2 = T.buffer_decl([1536], "int8", data=ethosu_write_2_data)
@@ -406,13 +410,16 @@ def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512,
 @tvm.script.ir_module
 class Conv2dDoubleCascade3:
     @T.prim_func
-    def main(placeholder_5: T.Buffer[(768,), "int8"], ethosu_write_1: T.Buffer[(640,), "int8"]) -> None:
+    def main(input_placeholder_5: T.Buffer[(1, 16, 16, 3), "int8"], input_ethosu_write_1: T.Buffer[(1, 20, 4, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([1744], "uint8")
         buffer_1 = T.buffer_decl([80], "uint8")
         buffer_2 = T.buffer_decl([320], "uint8")
         buffer_3 = T.buffer_decl([880], "uint8")
+        placeholder_5 = T.buffer_decl([768], 'int8', data=input_placeholder_5.data)
+        ethosu_write_1 = T.buffer_decl([640], 'int8', data=input_ethosu_write_1.data)
+
         # body
         ethosu_write_2_data = T.allocate([2560], "int8", "global", annotations={"disable_lower_builtin": True})
         ethosu_write_2 = T.buffer_decl([2560], "int8", data=ethosu_write_2_data)
@@ -428,13 +435,15 @@ def main(placeholder_5: T.Buffer[(768,), "int8"], ethosu_write_1: T.Buffer[(640,
 @tvm.script.ir_module
 class Conv2dDoubleCascade4:
     @T.prim_func
-    def main(placeholder_5: T.Buffer[(1024,), "int8"], ethosu_write_1: T.Buffer[(2048,), "int8"]) -> None:
+    def main(input_placeholder_5: T.Buffer[(1, 8, 1, 8, 16), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 2, 8, 16), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([1456], "uint8")
         buffer_1 = T.buffer_decl([352], "uint8")
         buffer_2 = T.buffer_decl([272], "uint8")
         buffer_3 = T.buffer_decl([11040], "uint8")
+        placeholder_5 = T.buffer_decl([1024], 'int8', data=input_placeholder_5.data)
+        ethosu_write_1 = T.buffer_decl([2048], 'int8', data=input_ethosu_write_1.data)
         # body
         ethosu_write_2_data = T.allocate([2304], "int8", "global", annotations={"disable_lower_builtin": True})
         ethosu_write_2 = T.buffer_decl((2304,), "int8", data=ethosu_write_2_data)
@@ -448,13 +457,15 @@ def main(placeholder_5: T.Buffer[(1024,), "int8"], ethosu_write_1: T.Buffer[(204
 @tvm.script.ir_module
 class Conv2dDoubleCascade5:
     @T.prim_func
-    def main(placeholder: T.Buffer[(192,), "int8"], ethosu_write: T.Buffer[(8192,), "int8"]) -> None:
+    def main(input_placeholder: T.Buffer[(1, 8, 8, 3), "int8"], input_ethosu_write: T.Buffer[(1, 32, 32, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([160], "uint8")
         buffer_1 = T.buffer_decl([320], "uint8")
         buffer_2 = T.buffer_decl([304], "uint8")
         buffer_3 = T.buffer_decl([80], "uint8")
+        placeholder = T.buffer_decl([192], 'int8', data=input_placeholder.data)
+        ethosu_write = T.buffer_decl([8192], 'int8', data=input_ethosu_write.data)
         # body
         ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
         ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data)
@@ -468,13 +479,15 @@ def main(placeholder: T.Buffer[(192,), "int8"], ethosu_write: T.Buffer[(8192,),
 @tvm.script.ir_module
 class Conv2dDoubleCascade6:
     @T.prim_func
-    def main(placeholder: T.Buffer[(1024,), "int8"], ethosu_write: T.Buffer[(32768,), "int8"]) -> None:
+    def main(input_placeholder: T.Buffer[(1, 8, 1, 8, 16), "int8"], input_ethosu_write: T.Buffer[(1, 32, 2, 32, 16), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([1456], "uint8")
         buffer_1 = T.buffer_decl([352], "uint8")
         buffer_2 = T.buffer_decl([11040], "uint8")
         buffer_3 = T.buffer_decl([272], "uint8")
+        placeholder = T.buffer_decl([1024], 'int8', data=input_placeholder.data)
+        ethosu_write = T.buffer_decl([32768], 'int8', data=input_ethosu_write.data)
         # body
         ethosu_write_1_data = T.allocate([12288], "int8", "global", annotations={"disable_lower_builtin":True})
         ethosu_write_1 = T.buffer_decl([12288], "int8", data=ethosu_write_1_data)
@@ -630,11 +643,13 @@ def _get_func(
 @tvm.script.ir_module
 class Conv2dInlineCopy1:
     @T.prim_func
-    def main(placeholder_3: T.Buffer[(960,), "int8"], ethosu_write_1: T.Buffer[(1024,), "int8"]) -> None:
+    def main(input_placeholder_3: T.Buffer[(1, 10, 12, 8), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 8, 16), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([848], "uint8")
         buffer_1 = T.buffer_decl([160], "uint8")
+        placeholder_3 = T.buffer_decl([960], 'int8', data=input_placeholder_3.data)
+        ethosu_write_1 = T.buffer_decl([1024], 'int8', data=input_ethosu_write_1.data)
         # body
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, placeholder_3[120], 0, 0, 0, T.float32(0.5), 10, "NHWC", 96, 8, 1, "int8", 8, 8, 16, 8, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 16, 1, 3, 3, 1, 1, 1, 1, buffer[0], 848, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
@@ -643,11 +658,13 @@ def main(placeholder_3: T.Buffer[(960,), "int8"], ethosu_write_1: T.Buffer[(1024
 @tvm.script.ir_module
 class Conv2dInlineCopy2:
     @T.prim_func
-    def main(placeholder_3: T.Buffer[(315,), "int8"], ethosu_write_1: T.Buffer[(240,), "int8"]) -> None:
+    def main(input_placeholder_3: T.Buffer[(1, 7, 9, 5), "int8"], input_ethosu_write_1: T.Buffer[(1, 3, 5, 16), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([160], "uint8")
         buffer_1 = T.buffer_decl([656], "uint8")
+        placeholder_3 = T.buffer_decl([315], 'int8', data=input_placeholder_3.data)
+        ethosu_write_1 = T.buffer_decl([240], 'int8', data=input_ethosu_write_1.data)
         # body
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 3, 5, 3, 3, 0, 5, placeholder_3[146], 0, 0, 0, T.float32(0.5), 10, "NHWC", 45, 5, 1, "int8", 3, 5, 16, 3, 0, 5, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 80, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 656, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
@@ -685,11 +702,13 @@ def _get_func(ifm_shape, lower, upper, ofm_channels=16):
 @tvm.script.ir_module
 class Conv2dInlineReshape1:
     @T.prim_func
-    def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768,), "int8"]) -> None:
+    def main(input_placeholder_3: T.Buffer[(4, 6, 8, 1), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 6, 16), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([160], "uint8")
         buffer_1 = T.buffer_decl([848], "uint8")
+        placeholder_3 = T.buffer_decl([192], 'int8', data=input_placeholder_3.data)
+        ethosu_write_1 = T.buffer_decl([768], 'int8', data=input_ethosu_write_1.data)
         # body
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -699,11 +718,13 @@ def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768,
 @tvm.script.ir_module
 class Conv2dInlineReshape2:
     @T.prim_func
-    def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768,), "int8"]) -> None:
+    def main(input_placeholder_3: T.Buffer[(1, 24, 8), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 6, 16), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([160], "uint8")
         buffer_1 = T.buffer_decl([848], "uint8")
+        placeholder_3 = T.buffer_decl([192], 'int8', data=input_placeholder_3.data)
+        ethosu_write_1 = T.buffer_decl([768], 'int8', data=input_ethosu_write_1.data)
         # body
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -713,11 +734,13 @@ def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768,
 @tvm.script.ir_module
 class Conv2dInlineReshape3:
     @T.prim_func
-    def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768,), "int8"]) -> None:
+    def main(input_placeholder_3: T.Buffer[(192, 1), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 6, 16), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([160], "uint8")
         buffer_1 = T.buffer_decl([848], "uint8")
+        placeholder_3 = T.buffer_decl([192], 'int8', data=input_placeholder_3.data)
+        ethosu_write_1 = T.buffer_decl([768], 'int8', data=input_ethosu_write_1.data)
         # body
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -727,11 +750,12 @@ def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768,
 @tvm.script.ir_module
 class Conv2dInlineReshape4:
     @T.prim_func
-    def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768,), "int8"]) -> None:
+    def main(placeholder_3: T.Buffer[(192,), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 6, 16), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([160], "uint8")
         buffer_1 = T.buffer_decl([848], "uint8")
+        ethosu_write_1 = T.buffer_decl([768], 'int8', data=input_ethosu_write_1.data)
         # body
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py
index 8c7ff35272ef..7da3d7e5be82 100644
--- a/tests/python/contrib/test_ethosu/test_replace_copy.py
+++ b/tests/python/contrib/test_ethosu/test_replace_copy.py
@@ -31,10 +31,12 @@
 @tvm.script.ir_module
 class ReferenceModule:
     @T.prim_func
-    def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(2048,), "int8"]) -> None:
+    def main(input_placeholder_3: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write_1: T.Buffer[(1, 16, 16, 8), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer_1 = T.buffer_decl([384], "uint8")
+        placeholder_3 = T.buffer_decl([8192], dtype="int8", data=input_placeholder_3.data)
+        ethosu_write_1 = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write_1.data)
         # body
         placeholder_global_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin": True})
         placeholder_global = T.buffer_decl([384], "uint8", data=placeholder_global_data)
@@ -73,11 +75,13 @@ def _get_func():
 @tvm.script.ir_module
 class WeightStream:
     @T.prim_func
-    def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(4096,), "int8"]) -> None:
+    def main(input_placeholder_5: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write_1: T.Buffer[(1, 16, 16, 16), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         buffer = T.buffer_decl([528], "uint8")
         buffer_2 = T.buffer_decl([336], "uint8")
+        placeholder_5 = T.buffer_decl([8192], dtype="int8", data=input_placeholder_5.data)
+        ethosu_write_1 = T.buffer_decl([4096], dtype="int8", data=input_ethosu_write_1.data)
         # body
         placeholder_d_global_data = T.allocate([528], "uint8", "global", annotations={"disable_lower_builtin": True})
         placeholder_d_global = T.buffer_decl([528], "uint8", data=placeholder_d_global_data)
diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py
index 254abab644a2..fd1e1afa60d9 100644
--- a/tests/python/contrib/test_ethosu/test_scheduler.py
+++ b/tests/python/contrib/test_ethosu/test_scheduler.py
@@ -180,8 +180,10 @@ def test_schedule_cache_reads():
 @tvm.script.ir_module
 class DiamondGraphTir:
     @T.prim_func
-    def main(placeholder: T.Buffer[(301056,), "int8"], ethosu_write: T.Buffer[(75264,), "int8"]) -> None:
+    def main(input_placeholder: T.Buffer[(1, 56, 56, 96), "int8"], input_ethosu_write: T.Buffer[(1, 56, 56, 24), "int8"]) -> None:
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+        placeholder = T.buffer_decl([301056], dtype='int8', data=input_placeholder.data)
+        ethosu_write = T.buffer_decl([75264], dtype='int8', data=input_ethosu_write.data)
         buffer1 = T.buffer_decl([2848], "uint8")
         buffer3 = T.buffer_decl([976], "uint8")
         p1_data = T.allocate([2848], "uint8", "global", annotations={"disable_lower_builtin":True})
diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
index fb41e99a9bcb..4aa12aedf215 100755
--- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
+++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
@@ -304,7 +304,7 @@ def uses_unsupported_physical_dimensions(  # pylint: disable=invalid-name
     def test_param_shapes(self, ir_module, transformed_input_shape, transformed_output_shape):
         func = ir_module["main"]
         primfunc_input_shape, primfunc_output_shape = [
-            list(func.preflattened_buffer_map[param].shape) for param in func.params
+            list(func.buffer_map[param].shape) for param in func.params
         ]
         assert primfunc_input_shape == transformed_input_shape
         assert primfunc_output_shape == transformed_output_shape
diff --git a/tests/python/unittest/test_aot_legalize_packed_call.py b/tests/python/unittest/test_aot_legalize_packed_call.py
index cd0114d46428..106e0f52adac 100644
--- a/tests/python/unittest/test_aot_legalize_packed_call.py
+++ b/tests/python/unittest/test_aot_legalize_packed_call.py
@@ -26,15 +26,12 @@
 class Module:
     @T.prim_func
     def tvm_test_cpacked(
-        A: T.handle, B: T.handle, C: T.handle, device_context: T.handle
+        A: T.Buffer[(1,), "float32"],
+        B: T.Buffer[(1,), "float32"],
+        C: T.Buffer[(1,), "float32"],
+        device_context: T.Buffer[(1,), "float32"],
     ) -> T.handle:
-        A_0 = T.match_buffer(A, (1,), dtype="float32")
-        T.preflattened_buffer(A_0, (1,), dtype="float32")
-        B_0 = T.match_buffer(B, (1,), dtype="float32")
-        T.preflattened_buffer(B_0, (1,), dtype="float32")
-        C_0 = T.match_buffer(C, (1,), dtype="float32")
-        T.preflattened_buffer(C_0, (1,), dtype="float32")
-        T.evaluate(C)
+        T.evaluate(C.data)
 
     @T.prim_func
     def tir_packed_call() -> None:
@@ -59,15 +56,12 @@ def tir_packed_call() -> None:
 class Expected:
     @T.prim_func
     def tvm_test_cpacked(
-        A: T.handle, B: T.handle, C: T.handle, device_context: T.handle
+        A: T.Buffer[(1,), "float32"],
+        B: T.Buffer[(1,), "float32"],
+        C: T.Buffer[(1,), "float32"],
+        device_context: T.Buffer[(1,), "float32"],
     ) -> T.handle:
-        A_0 = T.match_buffer(A, (1,), dtype="float32")
-        T.preflattened_buffer(A_0, (1,), dtype="float32")
-        B_0 = T.match_buffer(B, (1,), dtype="float32")
-        T.preflattened_buffer(B_0, (1,), dtype="float32")
-        C_0 = T.match_buffer(C, (1,), dtype="float32")
-        T.preflattened_buffer(C_0, (1,), dtype="float32")
-        T.evaluate(C)
+        T.evaluate(C.data)
 
     @T.prim_func
     def tir_packed_call() -> None:
diff --git a/tests/python/unittest/test_arith_domain_touched.py b/tests/python/unittest/test_arith_domain_touched.py
index 3641f06ab8a2..9f7eee096362 100644
--- a/tests/python/unittest/test_arith_domain_touched.py
+++ b/tests/python/unittest/test_arith_domain_touched.py
@@ -30,18 +30,6 @@ def scalar_func(a: T.handle, b: T.handle):
         A[i, j] = B[i - 1, j + 1] + A[i - 1, j - 1]
 
 
-@T.prim_func
-def vector_func(a: T.handle, b: T.handle):
-    n = T.var("int32")
-    m = 128
-    A = T.match_buffer(a, (n, m))
-    B = T.match_buffer(b, (n, m))
-
-    for i in T.serial(n):
-        for j in T.vectorized(m):
-            A[i, j] = A[i, j] + B[i, j]
-
-
 def test_domain_touched():
     func = scalar_func
     a, b = [func.buffer_map[var] for var in func.params]
@@ -81,7 +69,17 @@ def test_domain_touched():
 
 
 def test_domain_touched_vector():
-    func = tvm.lower(vector_func)["main"]
+    m = tvm.runtime.convert(128)
+
+    @T.prim_func
+    def func(a: T.handle, b: T.handle):
+        n = T.var("int32")
+        A = T.match_buffer(a, (n * m,))
+        B = T.match_buffer(b, (n * m,))
+
+        for i in T.serial(n):
+            A[i * m : (i + 1) * m : 1] = A[i * m : (i + 1) * m : 1] + B[i * m : (i + 1) * m : 1]
+
     a, b = [func.buffer_map[var] for var in func.params]
 
     assert tvm.arith._ffi_api.DomainTouched(func.body, a, True, False)[0].extent.value == 128
diff --git a/tests/python/unittest/test_auto_scheduler_feature.py b/tests/python/unittest/test_auto_scheduler_feature.py
index 4140e7732d7e..3f435366e176 100644
--- a/tests/python/unittest/test_auto_scheduler_feature.py
+++ b/tests/python/unittest/test_auto_scheduler_feature.py
@@ -203,20 +203,20 @@ def test_gpu_feature():
 
 @T.prim_func
 def tir_matmul(
-    A: T.Buffer[(16384,), "float32"],
-    B: T.Buffer[(16384,), "float32"],
-    C: T.Buffer[(16384,), "float32"],
+    A: T.Buffer[(256, 256), "float32"],
+    B: T.Buffer[(256, 256), "float32"],
+    C: T.Buffer[(256, 256), "float32"],
 ) -> None:
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-    T.preflattened_buffer(A, [128, 128], dtype="float32", data=A.data)
-    T.preflattened_buffer(B, [128, 128], dtype="float32", data=B.data)
-    T.preflattened_buffer(C, [128, 128], dtype="float32", data=C.data)
+    A_flat = T.buffer_decl([16384], dtype="float32", data=A.data)
+    B_flat = T.buffer_decl([16384], dtype="float32", data=B.data)
+    C_flat = T.buffer_decl([16384], dtype="float32", data=C.data)
     # body
     for x, y in T.grid(128, 128):
-        C[x * 128 + y] = T.float32(0)
+        C_flat[x * 128 + y] = T.float32(0)
         for k in T.serial(128):
-            C[x * 128 + y] = C[x * 128 + y] + A[x * 128 + k] * B[y * 128 + k]
+            C_flat[x * 128 + y] = C_flat[x * 128 + y] + A_flat[x * 128 + k] * B_flat[y * 128 + k]
 
 
 def test_primfunc_without_lowering():
diff --git a/tests/python/unittest/test_lower_build.py b/tests/python/unittest/test_lower_build.py
index bd820b617c2d..665697b84be9 100644
--- a/tests/python/unittest/test_lower_build.py
+++ b/tests/python/unittest/test_lower_build.py
@@ -54,40 +54,44 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
 class LoweredModule:
     @T.prim_func
     def main(
-        A: T.Buffer[(16384,), "float32"],
-        B: T.Buffer[(16384,), "float32"],
-        C: T.Buffer[(16384,), "float32"],
+        A: T.Buffer[(128, 128), "float32"],
+        B: T.Buffer[(128, 128), "float32"],
+        C: T.Buffer[(128, 128), "float32"],
     ) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "from_legacy_te_schedule": True, "tir.noalias": True})
-        T.preflattened_buffer(A, [128, 128], data=A.data)
-        T.preflattened_buffer(B, [128, 128], data=B.data)
-        T.preflattened_buffer(C, [128, 128], data=C.data)
+        A_flat = T.buffer_decl([16384], data=A.data)
+        B_flat = T.buffer_decl([16384], data=B.data)
+        C_flat = T.buffer_decl([16384], data=C.data)
         # body
         for x, y in T.grid(128, 128):
-            C[x * 128 + y] = 0.0
+            C_flat[x * 128 + y] = 0.0
             for k in T.serial(0, 128):
-                C[x * 128 + y] = C[x * 128 + y] + A[x * 128 + k] * B[y * 128 + k]
+                C_flat[x * 128 + y] = (
+                    C_flat[x * 128 + y] + A_flat[x * 128 + k] * B_flat[y * 128 + k]
+                )
 
 
 @tvm.script.ir_module
 class LoweredTIRModule:
     @T.prim_func
     def main(
-        A: T.Buffer[(16384,), "float32"],
-        B: T.Buffer[(16384,), "float32"],
-        C: T.Buffer[(16384,), "float32"],
+        A: T.Buffer[(128, 128), "float32"],
+        B: T.Buffer[(128, 128), "float32"],
+        C: T.Buffer[(128, 128), "float32"],
     ) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        T.preflattened_buffer(A, [128, 128], data=A.data)
-        T.preflattened_buffer(B, [128, 128], data=B.data)
-        T.preflattened_buffer(C, [128, 128], data=C.data)
+        A_flat = T.buffer_decl([16384], data=A.data)
+        B_flat = T.buffer_decl([16384], data=B.data)
+        C_flat = T.buffer_decl([16384], data=C.data)
         # body
         for x, y in T.grid(128, 128):
-            C[x * 128 + y] = 0.0
+            C_flat[x * 128 + y] = 0.0
             for k in T.serial(0, 128):
-                C[x * 128 + y] = C[x * 128 + y] + A[x * 128 + k] * B[y * 128 + k]
+                C_flat[x * 128 + y] = (
+                    C_flat[x * 128 + y] + A_flat[x * 128 + k] * B_flat[y * 128 + k]
+                )
 
 
 def test_lower_build_te_schedule():
diff --git a/tests/python/unittest/test_tir_transform_flatten_buffer.py b/tests/python/unittest/test_tir_transform_flatten_buffer.py
index 870208499e7a..513e04dc2090 100644
--- a/tests/python/unittest/test_tir_transform_flatten_buffer.py
+++ b/tests/python/unittest/test_tir_transform_flatten_buffer.py
@@ -40,9 +40,9 @@ def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]):
             for j in T.serial(0, 16):
                 C[i, j] = B_new[0, j] * 2.0
 
-    def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]):
-        T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
-        T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)
+    def expected(input_A: T.Buffer[(16, 16), "float32"], input_C: T.Buffer[(16, 16), "float32"]):
+        A = T.buffer_decl(256, dtype="float32", data=input_A.data)
+        C = T.buffer_decl(256, dtype="float32", data=input_C.data)
         for i in T.serial(0, 16):
             B_new_data = T.allocate([16], "float32", scope="global")
             B_new = T.buffer_decl([16], "float32", scope="global", data=B_new_data)
@@ -71,9 +71,9 @@ def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]):
             for j in T.serial(0, 16):
                 C[i, j] = B_new[0, j] * 2.0
 
-    def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]):
-        T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
-        T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)
+    def expected(input_A: T.Buffer[(16, 16), "float32"], input_C: T.Buffer[(16, 16), "float32"]):
+        A = T.buffer_decl(256, dtype="float32", data=input_A.data)
+        C = T.buffer_decl(256, dtype="float32", data=input_C.data)
         for i in T.serial(0, 16):
             B_new_data = T.allocate([16], "float32", "global")
             B_new = T.buffer_decl(16, "float32", data=B_new_data)
@@ -100,9 +100,9 @@ def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]):
         for j in range(0, 16):
             C[i0 * 4 + i1 * 2 + i2, j] = B[0, j] * 2.0
 
-    def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]):
-        T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
-        T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)
+    def expected(input_A: T.Buffer[(16, 16), "float32"], input_C: T.Buffer[(16, 16), "float32"]):
+        A = T.buffer_decl(256, dtype="float32", data=input_A.data)
+        C = T.buffer_decl(256, dtype="float32", data=input_C.data)
 
         i0 = T.env_thread("blockIdx.x")
         i1 = T.env_thread("threadIdx.x")
@@ -134,10 +134,10 @@ def before(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None:
                 C[i, j] = B[j] * 2.0
 
     def expected(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None:
-        A = T.match_buffer(a, n * m, "float32")
-        C = T.match_buffer(c, n * m, "float32")
-        T.preflattened_buffer(A, (n, m), "float32", data=A.data)
-        T.preflattened_buffer(C, (n, m), "float32", data=C.data)
+        input_A = T.match_buffer(a, (n, m), "float32")
+        input_C = T.match_buffer(c, (n, m), "float32")
+        A = T.buffer_decl(n * m, "float32", data=input_A.data)
+        C = T.buffer_decl(n * m, "float32", data=input_C.data)
 
         for i in range(0, n):
             B_data = T.allocate([m], "float32", scope="global")
@@ -159,9 +159,9 @@ def before(A: T.Buffer[(4, 32), "float32"], D: T.Buffer[(4, 32), "float32"]):
             C[i, j] = A[i, j] + B[i, j]
             D[i, j] = C[i, j] * 2.0
 
-    def expected(A: T.Buffer[128, "float32"], D: T.Buffer[128, "float32"]):
-        T.preflattened_buffer(A, (4, 32), "float32", data=A.data)
-        T.preflattened_buffer(D, (4, 32), "float32", data=D.data)
+    def expected(input_A: T.Buffer[(4, 32), "float32"], input_D: T.Buffer[(4, 32), "float32"]):
+        A = T.buffer_decl(128, "float32", data=input_A.data)
+        D = T.buffer_decl(128, "float32", data=input_D.data)
 
         for i, j in T.grid(4, 32):
             B_data = T.allocate([128], "float32", scope="global")
@@ -185,9 +185,9 @@ def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]):
             for i1, j in T.grid(4, 16):
                 C[i0 * 4 + i1, j] = B_1[i1, j] * 2.0
 
-    def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]):
-        T.preflattened_buffer(A, [16, 16], dtype="float32", data=A.data)
-        T.preflattened_buffer(C, [16, 16], dtype="float32", data=C.data)
+    def expected(input_A: T.Buffer[(16, 16), "float32"], input_C: T.Buffer[(16, 16), "float32"]):
+        A = T.buffer_decl(256, dtype="float32", data=input_A.data)
+        C = T.buffer_decl(256, dtype="float32", data=input_C.data)
         for i0 in T.serial(0, 4):
             B_new_data = T.allocate([68], "float32", scope="global")
             B_new = T.buffer_decl([68], "float32", scope="global", data=B_new_data)
@@ -206,9 +206,9 @@ def before(A: T.Buffer[10, "bool"], B: T.Buffer[10, "bool"]) -> None:
         for i0 in T.serial(10):
             B[i0] = A[i0]
 
-    def expected(A: T.Buffer[10, "int8"], B: T.Buffer[10, "int8"]) -> None:
-        T.preflattened_buffer(A, [10], dtype="bool", data=A.data)
-        T.preflattened_buffer(B, [10], dtype="bool", data=B.data)
+    def expected(input_A: T.Buffer[10, "bool"], input_B: T.Buffer[10, "bool"]) -> None:
+        A = T.buffer_decl(10, dtype="int8", data=input_A.data)
+        B = T.buffer_decl(10, dtype="int8", data=input_B.data)
         # body
         for i0 in T.serial(10):
             B[i0] = T.cast(T.cast(A[i0], "bool"), "int8")
diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py
index 56128155295e..fe48aa7d8fd4 100644
--- a/tests/python/unittest/test_tir_transform_loop_partition.py
+++ b/tests/python/unittest/test_tir_transform_loop_partition.py
@@ -544,9 +544,6 @@ def partitioned_concat(
     A: T.Buffer[(16,), "float32"], B: T.Buffer[(16,), "float32"], C: T.Buffer[(32,), "float32"]
 ) -> None:
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-    T.preflattened_buffer(A, [16], data=A.data)
-    T.preflattened_buffer(B, [16], data=B.data)
-    T.preflattened_buffer(C, [32], data=C.data)
     for i in T.serial(0, 16):
         C[i] = A[i]
     for i in T.serial(0, 16):
@@ -581,42 +578,46 @@ def partition_from_scheduled_tir(prim_func, pass_cfg):
 
 @T.prim_func
 def partitioned_concat_3(
-    placeholder: T.Buffer[(50176,), "int8"],
-    placeholder_1: T.Buffer[(25088,), "int8"],
-    placeholder_2: T.Buffer[(25088,), "int8"],
-    T_concat: T.Buffer[(100352,), "int8"],
+    placeholder: T.Buffer[(1, 64, 28, 28), "int8"],
+    placeholder_1: T.Buffer[(1, 32, 28, 28), "int8"],
+    placeholder_2: T.Buffer[(1, 32, 28, 28), "int8"],
+    T_concat: T.Buffer[(1, 128, 28, 28), "int8"],
 ) -> None:
-    T.preflattened_buffer(placeholder, [1, 64, 28, 28], "int8", data=placeholder.data)
-    T.preflattened_buffer(placeholder_1, [1, 32, 28, 28], "int8", data=placeholder_1.data)
-    T.preflattened_buffer(placeholder_2, [1, 32, 28, 28], "int8", data=placeholder_2.data)
-    T.preflattened_buffer(T_concat, [1, 128, 28, 28], "int8", data=T_concat.data)
+    placeholder_flat = T.buffer_decl([50176], "int8", data=placeholder.data)
+    placeholder_1_flat = T.buffer_decl([25088], "int8", data=placeholder_1.data)
+    placeholder_2_flat = T.buffer_decl([25088], "int8", data=placeholder_2.data)
+    T_concat_flat = T.buffer_decl([100352], "int8", data=T_concat.data)
     for i1, i2, i3 in T.grid(64, 28, 28):
-        T_concat[i1 * 784 + i2 * 28 + i3] = placeholder[i1 * 784 + i2 * 28 + i3]
+        T_concat_flat[i1 * 784 + i2 * 28 + i3] = placeholder_flat[i1 * 784 + i2 * 28 + i3]
     for i1, i2, i3 in T.grid(32, 28, 28):
-        T_concat[i1 * 784 + i2 * 28 + i3 + 50176] = placeholder_1[i1 * 784 + i2 * 28 + i3]
+        T_concat_flat[i1 * 784 + i2 * 28 + i3 + 50176] = placeholder_1_flat[i1 * 784 + i2 * 28 + i3]
     for i1, i2, i3 in T.grid(32, 28, 28):
-        T_concat[i1 * 784 + i2 * 28 + i3 + 75264] = placeholder_2[i1 * 784 + i2 * 28 + i3]
+        T_concat_flat[i1 * 784 + i2 * 28 + i3 + 75264] = placeholder_2_flat[i1 * 784 + i2 * 28 + i3]
 
 
 @T.prim_func
 def concat_func_3(
-    placeholder: T.Buffer[(50176,), "int8"],
-    placeholder_1: T.Buffer[(25088,), "int8"],
-    placeholder_2: T.Buffer[(25088,), "int8"],
-    T_concat: T.Buffer[(100352,), "int8"],
+    placeholder: T.Buffer[(1, 64, 28, 28), "int8"],
+    placeholder_1: T.Buffer[(1, 32, 28, 28), "int8"],
+    placeholder_2: T.Buffer[(1, 32, 28, 28), "int8"],
+    T_concat: T.Buffer[(1, 128, 28, 28), "int8"],
 ) -> None:
-    T.preflattened_buffer(placeholder, (1, 64, 28, 28), "int8", data=placeholder.data)
-    T.preflattened_buffer(placeholder_1, (1, 32, 28, 28), "int8", data=placeholder_1.data)
-    T.preflattened_buffer(placeholder_2, (1, 32, 28, 28), "int8", data=placeholder_2.data)
-    T.preflattened_buffer(T_concat, (1, 128, 28, 28), "int8", data=T_concat.data)
+    placeholder_flat = T.buffer_decl([50176], "int8", data=placeholder.data)
+    placeholder_1_flat = T.buffer_decl([25088], "int8", data=placeholder_1.data)
+    placeholder_2_flat = T.buffer_decl([25088], "int8", data=placeholder_2.data)
+    T_concat_flat = T.buffer_decl([100352], "int8", data=T_concat.data)
     for i1 in T.serial(128, annotations={"pragma_loop_partition_hint": 1}):
         for i2, i3 in T.grid(28, 28):
             if 96 <= i1:
-                T_concat[i1 * 784 + i2 * 28 + i3] = placeholder_2[i1 * 784 + i2 * 28 + i3 - 75264]
+                T_concat_flat[i1 * 784 + i2 * 28 + i3] = placeholder_2_flat[
+                    i1 * 784 + i2 * 28 + i3 - 75264
+                ]
             if 64 <= i1 and i1 < 96:
-                T_concat[i1 * 784 + i2 * 28 + i3] = placeholder_1[i1 * 784 + i2 * 28 + i3 - 50176]
+                T_concat_flat[i1 * 784 + i2 * 28 + i3] = placeholder_1_flat[
+                    i1 * 784 + i2 * 28 + i3 - 50176
+                ]
             if i1 < 64:
-                T_concat[i1 * 784 + i2 * 28 + i3] = placeholder[i1 * 784 + i2 * 28 + i3]
+                T_concat_flat[i1 * 784 + i2 * 28 + i3] = placeholder_flat[i1 * 784 + i2 * 28 + i3]
 
 
 def test_condition_mutually_exclusive():
@@ -628,9 +629,11 @@ def test_condition_mutually_exclusive():
 
 def test_loop_partition_unroll_hint():
     @T.prim_func
-    def main(A: T.Buffer[150528, "int8"], B: T.Buffer[25088, "int8"]) -> None:
-        T.preflattened_buffer(A, [1, 3, 224, 224], "int8", data=A.data)
-        T.preflattened_buffer(B, [1, 224, 7, 16], "int8", data=B.data)
+    def main(
+        A_arg: T.Buffer[(1, 3, 224, 224), "int8"], B_arg: T.Buffer[(1, 224, 7, 16), "int8"]
+    ) -> None:
+        A = T.buffer_decl(150528, "int8", data=A_arg.data)
+        B = T.buffer_decl(25088, "int8", data=B_arg.data)
         for ax0 in T.serial(
             112,
             annotations={"pragma_loop_partition_hint": True},
@@ -640,9 +643,11 @@ def main(A: T.Buffer[150528, "int8"], B: T.Buffer[25088, "int8"]) -> None:
                     B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax0 * 2 + ax2 - 3]
 
     @T.prim_func
-    def partitioned_main(A: T.Buffer[150528, "int8"], B: T.Buffer[25088, "int8"]) -> None:
-        T.preflattened_buffer(A, [1, 3, 224, 224], dtype="int8", data=A.data)
-        T.preflattened_buffer(B, [1, 224, 7, 16], dtype="int8", data=B.data)
+    def partitioned_main(
+        A_arg: T.Buffer[(1, 3, 224, 224), "int8"], B_arg: T.Buffer[(1, 224, 7, 16), "int8"]
+    ) -> None:
+        A = T.buffer_decl(150528, dtype="int8", data=A_arg.data)
+        B = T.buffer_decl(25088, dtype="int8", data=B_arg.data)
         # body
         for ax1, ax2, ax3 in T.grid(224, 7, 16):
             if 3 <= ax2 and ax3 < 3:
@@ -688,8 +693,6 @@ def before(A: T.Buffer[160, "int32"], B: T.Buffer[160, "int32"]) -> None:
 
     @T.prim_func
     def after(A: T.Buffer[160, "int32"], B: T.Buffer[160, "int32"]) -> None:
-        T.preflattened_buffer(A, [160], dtype="int32", data=A.data)
-        T.preflattened_buffer(B, [160], dtype="int32", data=B.data)
         for i in T.serial(10, annotations={"key": "value"}):
             B[i] = A[i] + 1
         for i in T.serial(140, annotations={"key": "value"}):
@@ -737,10 +740,6 @@ def after(
         placeholder_2: T.Buffer[25088, "int8"],
         T_concat: T.Buffer[100352, "int8"],
     ) -> None:
-        T.preflattened_buffer(placeholder, [50176], dtype="int8", data=placeholder.data)
-        T.preflattened_buffer(placeholder_1, [25088], dtype="int8", data=placeholder_1.data)
-        T.preflattened_buffer(placeholder_2, [25088], dtype="int8", data=placeholder_2.data)
-        T.preflattened_buffer(T_concat, [100352], dtype="int8", data=T_concat.data)
         for _ in T.serial(1, annotations={"preserve_unit_loop": True}):
             for i1, i2, i3 in T.grid(64, 28, 28):
                 T_concat[i1 * 784 + i2 * 28 + i3] = placeholder[i1 * 784 + i2 * 28 + i3]
diff --git a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py
index bfa132d4cecf..635badb847bd 100644
--- a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py
+++ b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py
@@ -25,12 +25,12 @@
 @tvm.script.ir_module
 class Before:
     @T.prim_func
-    def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "float32"], conv2d_transpose_nhwc: T.Buffer[(16384,), "float32"]) -> None:
+    def main(inputs: T.Buffer[(1, 4, 4, 512), "float32"], weight: T.Buffer[(4, 4, 512, 256), "float32"], conv2d_transpose_nhwc: T.Buffer[(1, 8, 8, 256), "float32"]) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        T.preflattened_buffer(inputs, [1, 4, 4, 512], dtype="float32", data=inputs.data)
-        T.preflattened_buffer(weight, [4, 4, 512, 256], dtype="float32", data=weight.data)
-        T.preflattened_buffer(conv2d_transpose_nhwc, [1, 8, 8, 256], dtype="float32", data=conv2d_transpose_nhwc.data)
+        inputs_flat = T.buffer_decl([8192], dtype="float32", data=inputs.data)
+        weight_flat = T.buffer_decl([2097152], dtype="float32", data=weight.data)
+        conv2d_transpose_nhwc_flat = T.buffer_decl([16384], dtype="float32", data=conv2d_transpose_nhwc.data)
         # var definition
         threadIdx_x = T.env_thread("threadIdx.x")
         blockIdx_x = T.env_thread("blockIdx.x")
@@ -44,24 +44,24 @@ def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo
             conv2d_transpose_nhwc_local[i1_4_init * 4 + i2_3_init * 2 + i2_4_init] = T.float32(0)
         for i6_0 in T.serial(16):
             for ax0_ax1_ax2_ax3_fused_0 in T.serial(24):
-                PadInput_shared[ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x] = T.if_then_else(128 <= ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x and ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x < 640 and 1 <= blockIdx_x // 32 * 2 + (ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x) % 128 // 32 and blockIdx_x // 32 * 2 + (ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x) % 128 // 32 < 5, inputs[blockIdx_x // 32 * 1024 + ax0_ax1_ax2_ax3_fused_0 * 512 + i6_0 * 32 + threadIdx_x - 2560], T.float32(0), dtype="float32")
+                PadInput_shared[ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x] = T.if_then_else(128 <= ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x and ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x < 640 and 1 <= blockIdx_x // 32 * 2 + (ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x) % 128 // 32 and blockIdx_x // 32 * 2 + (ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x) % 128 // 32 < 5, inputs_flat[blockIdx_x // 32 * 1024 + ax0_ax1_ax2_ax3_fused_0 * 512 + i6_0 * 32 + threadIdx_x - 2560], T.float32(0), dtype="float32")
             for ax0_ax1_ax2_ax3_fused_0 in T.serial(32):
-                weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight[T.ramp((ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4) // 256 * 131072 + i6_0 * 8192 + (ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4) % 256 // 8 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)]
+                weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight_flat[T.ramp((ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4) // 256 * 131072 + i6_0 * 8192 + (ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4) % 256 // 8 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)]
             for i6_1, i2_3, i4_2, i5_2, i6_2, i1_4, i2_4 in T.grid(4, 2, 4, 4, 8, 2, 2):
                 conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] = conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] + T.if_then_else((i1_4 + i4_2) % 2 == 0 and (i2_4 + i5_2) % 2 == 0, PadInput_shared[threadIdx_x // 8 * 128 + (i1_4 + i4_2) // 2 * 128 + (i2_4 + i5_2) // 2 * 32 + i2_3 * 32 + i6_1 * 8 + i6_2], T.float32(0), dtype="float32") * weight_shared[i6_1 * 64 + i6_2 * 8 + threadIdx_x % 8 + 3840 - i5_2 * 256 - i4_2 * 1024]
         for ax1, ax2 in T.grid(2, 4):
-            conv2d_transpose_nhwc[threadIdx_x // 8 * 4096 + ax1 * 2048 + blockIdx_x // 32 * 1024 + ax2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 8] = conv2d_transpose_nhwc_local[ax1 * 4 + ax2]
+            conv2d_transpose_nhwc_flat[threadIdx_x // 8 * 4096 + ax1 * 2048 + blockIdx_x // 32 * 1024 + ax2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 8] = conv2d_transpose_nhwc_local[ax1 * 4 + ax2]
 
 
 @tvm.script.ir_module
 class After:
     @T.prim_func
-    def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "float32"], conv2d_transpose_nhwc: T.Buffer[(16384,), "float32"]) -> None:
+    def main(inputs: T.Buffer[(1, 4, 4, 512), "float32"], weight: T.Buffer[(4, 4, 512, 256), "float32"], conv2d_transpose_nhwc: T.Buffer[(1, 8, 8, 256), "float32"]) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        T.preflattened_buffer(inputs, [1, 4, 4, 512], dtype="float32", data=inputs.data)
-        T.preflattened_buffer(weight, [4, 4, 512, 256], dtype="float32", data=weight.data)
-        T.preflattened_buffer(conv2d_transpose_nhwc, [1, 8, 8, 256], dtype="float32", data=conv2d_transpose_nhwc.data)
+        inputs_flat = T.buffer_decl([8192], dtype="float32", data=inputs.data)
+        weight_flat = T.buffer_decl([2097152], dtype="float32", data=weight.data)
+        conv2d_transpose_nhwc_flat = T.buffer_decl([16384], dtype="float32", data=conv2d_transpose_nhwc.data)
         # var definition
         threadIdx_x = T.env_thread("threadIdx.x")
         blockIdx_x = T.env_thread("blockIdx.x")
@@ -75,27 +75,27 @@ def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo
             conv2d_transpose_nhwc_local[i1_4_init * 4 + i2_3_init * 2 + i2_4_init] = T.float32(0)
         for i6_0 in T.serial(16):
             for ax0_ax1_ax2_ax3_fused_0 in T.serial(24):
-                PadInput_shared[ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_ax2_ax3_fused_0 + threadIdx_x // 32) // 4 and (ax0_ax1_ax2_ax3_fused_0 + threadIdx_x // 32) // 20 < 1 and 1 <= blockIdx_x // 32 * 2 + (ax0_ax1_ax2_ax3_fused_0 + threadIdx_x // 32) % 4 and (blockIdx_x // 32 * 2 + (ax0_ax1_ax2_ax3_fused_0 + threadIdx_x // 32) % 4) // 5 < 1, inputs[blockIdx_x // 32 * 1024 + ax0_ax1_ax2_ax3_fused_0 * 512 + i6_0 * 32 + threadIdx_x - 2560], T.float32(0), dtype="float32")
+                PadInput_shared[ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_ax2_ax3_fused_0 + threadIdx_x // 32) // 4 and (ax0_ax1_ax2_ax3_fused_0 + threadIdx_x // 32) // 20 < 1 and 1 <= blockIdx_x // 32 * 2 + (ax0_ax1_ax2_ax3_fused_0 + threadIdx_x // 32) % 4 and (blockIdx_x // 32 * 2 + (ax0_ax1_ax2_ax3_fused_0 + threadIdx_x // 32) % 4) // 5 < 1, inputs_flat[blockIdx_x // 32 * 1024 + ax0_ax1_ax2_ax3_fused_0 * 512 + i6_0 * 32 + threadIdx_x - 2560], T.float32(0), dtype="float32")
             for ax0_ax1_ax2_ax3_fused_0 in T.serial(32):
-                weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight[T.ramp((ax0_ax1_ax2_ax3_fused_0 + threadIdx_x * 4 // 128) // 2 * 131072 + i6_0 * 8192 + (ax0_ax1_ax2_ax3_fused_0 * 16 + threadIdx_x * 4 // 8) % 32 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)]
+                weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight_flat[T.ramp((ax0_ax1_ax2_ax3_fused_0 + threadIdx_x * 4 // 128) // 2 * 131072 + i6_0 * 8192 + (ax0_ax1_ax2_ax3_fused_0 * 16 + threadIdx_x * 4 // 8) % 32 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)]
             for i6_1, i2_3, i4_2, i5_2, i6_2, i1_4, i2_4 in T.grid(4, 2, 4, 4, 8, 2, 2):
                 conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] = conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] + T.if_then_else((i1_4 + i4_2) % 2 == 0 and (i2_4 + i5_2) % 2 == 0, PadInput_shared[threadIdx_x // 8 * 128 + (i1_4 + i4_2) // 2 * 128 + (i2_4 + i5_2) // 2 * 32 + i2_3 * 32 + i6_1 * 8 + i6_2], T.float32(0), dtype="float32") * weight_shared[i6_1 * 64 + i6_2 * 8 + threadIdx_x % 8 + 3840 - i5_2 * 256 - i4_2 * 1024]
         for ax1, ax2 in T.grid(2, 4):
-            conv2d_transpose_nhwc[threadIdx_x // 8 * 4096 + ax1 * 2048 + blockIdx_x // 32 * 1024 + ax2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 8] = conv2d_transpose_nhwc_local[ax1 * 4 + ax2]
+            conv2d_transpose_nhwc_flat[threadIdx_x // 8 * 4096 + ax1 * 2048 + blockIdx_x // 32 * 1024 + ax2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 8] = conv2d_transpose_nhwc_local[ax1 * 4 + ax2]
 
 
 @tvm.script.ir_module
 class After_simplified:
     @T.prim_func
-    def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "float32"], conv2d_transpose_nhwc: T.Buffer[(16384,), "float32"]) -> None:
+    def main(inputs: T.Buffer[(1, 4, 4, 512), "float32"], weight: T.Buffer[(4, 4, 512, 256), "float32"], conv2d_transpose_nhwc: T.Buffer[(1, 8, 8, 256), "float32"]) -> None:
         # function attr dict
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         # var definition
         threadIdx_x = T.env_thread("threadIdx.x")
         blockIdx_x = T.env_thread("blockIdx.x")
-        T.preflattened_buffer(inputs, [1, 4, 4, 512], dtype="float32", data=inputs.data)
-        T.preflattened_buffer(weight, [4, 4, 512, 256], dtype="float32", data=weight.data)
-        T.preflattened_buffer(conv2d_transpose_nhwc, [1, 8, 8, 256], dtype="float32", data=conv2d_transpose_nhwc.data)
+        inputs_flat = T.buffer_decl([8192], dtype="float32", data=inputs.data)
+        weight_flat = T.buffer_decl([2097152], dtype="float32", data=weight.data)
+        conv2d_transpose_nhwc_flat = T.buffer_decl([16384], dtype="float32", data=conv2d_transpose_nhwc.data)
         # body
         T.launch_thread(blockIdx_x, 64)
         conv2d_transpose_nhwc_local = T.decl_buffer([8], "float32", scope="local")
@@ -106,13 +106,13 @@ def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo
             conv2d_transpose_nhwc_local[i1_4_init * 4 + i2_3_init * 2 + i2_4_init] = T.float32(0)
         for i6_0 in T.serial(16):
             for ax0_ax1_ax2_ax3_fused_0 in T.serial(24):
-                PadInput_shared[ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x] = T.if_then_else(4 <= ax0_ax1_ax2_ax3_fused_0 and ax0_ax1_ax2_ax3_fused_0 < 20 and 1 <= blockIdx_x // 32 * 2 + ax0_ax1_ax2_ax3_fused_0 % 4 and blockIdx_x // 32 * 2 + ax0_ax1_ax2_ax3_fused_0 % 4 < 5, inputs[blockIdx_x // 32 * 1024 + ax0_ax1_ax2_ax3_fused_0 * 512 + i6_0 * 32 + threadIdx_x - 2560], T.float32(0), dtype="float32")
+                PadInput_shared[ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x] = T.if_then_else(4 <= ax0_ax1_ax2_ax3_fused_0 and ax0_ax1_ax2_ax3_fused_0 < 20 and 1 <= blockIdx_x // 32 * 2 + ax0_ax1_ax2_ax3_fused_0 % 4 and blockIdx_x // 32 * 2 + ax0_ax1_ax2_ax3_fused_0 % 4 < 5, inputs_flat[blockIdx_x // 32 * 1024 + ax0_ax1_ax2_ax3_fused_0 * 512 + i6_0 * 32 + threadIdx_x - 2560], T.float32(0), dtype="float32")
             for ax0_ax1_ax2_ax3_fused_0 in T.serial(32):
-                weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight[T.ramp(ax0_ax1_ax2_ax3_fused_0 // 2 * 131072 + i6_0 * 8192 + ax0_ax1_ax2_ax3_fused_0 % 2 * 4096 + threadIdx_x // 2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)]
+                weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight_flat[T.ramp(ax0_ax1_ax2_ax3_fused_0 // 2 * 131072 + i6_0 * 8192 + ax0_ax1_ax2_ax3_fused_0 % 2 * 4096 + threadIdx_x // 2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)]
             for i6_1, i2_3, i4_2, i5_2, i6_2, i1_4, i2_4 in T.grid(4, 2, 4, 4, 8, 2, 2):
                 conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] = conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] + T.if_then_else((i1_4 + i4_2) % 2 == 0 and (i2_4 + i5_2) % 2 == 0, PadInput_shared[threadIdx_x // 8 * 128 + (i1_4 + i4_2) // 2 * 128 + (i2_4 + i5_2) // 2 * 32 + i2_3 * 32 + i6_1 * 8 + i6_2], T.float32(0), dtype="float32") * weight_shared[i6_1 * 64 + i6_2 * 8 + threadIdx_x % 8 + 3840 - i5_2 * 256 - i4_2 * 1024]
         for ax1, ax2 in T.grid(2, 4):
-            conv2d_transpose_nhwc[threadIdx_x // 8 * 4096 + ax1 * 2048 + blockIdx_x // 32 * 1024 + ax2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 8] = conv2d_transpose_nhwc_local[ax1 * 4 + ax2]
+            conv2d_transpose_nhwc_flat[threadIdx_x // 8 * 4096 + ax1 * 2048 + blockIdx_x // 32 * 1024 + ax2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 8] = conv2d_transpose_nhwc_local[ax1 * 4 + ax2]
 
 # pylint: enable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,redundant-keyword-arg
 # fmt: on
diff --git a/tests/python/unittest/test_tir_transform_thread_sync.py b/tests/python/unittest/test_tir_transform_thread_sync.py
index c80cd55ea27e..0c5d77d02b91 100644
--- a/tests/python/unittest/test_tir_transform_thread_sync.py
+++ b/tests/python/unittest/test_tir_transform_thread_sync.py
@@ -98,10 +98,10 @@ def ir(A, B):
 @tvm.testing.requires_cuda
 def test_sync_read_thread_id_independent_location():
     @T.prim_func
-    def func(p0: T.Buffer[2, "float32"], p1: T.Buffer[2, "float32"]) -> None:
+    def func(p0_arg: T.Buffer[(1, 2, 1, 1), "float32"], p1: T.Buffer[2, "float32"]) -> None:
         threadIdx_x = T.env_thread("threadIdx.x")
         blockIdx_x = T.env_thread("blockIdx.x")
-        T.preflattened_buffer(p0, [1, 2, 1, 1], dtype="float32", data=p0.data)
+        p0 = T.buffer_decl([2], dtype="float32", data=p0_arg.data)
         result_local = T.alloc_buffer([1], dtype="float32", scope="local")
         temp_shared = T.alloc_buffer([1], dtype="float32", scope="shared")
         T.launch_thread(blockIdx_x, 8)
diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
index 31cc6e07dec3..d1f86814e7d6 100644
--- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
+++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
@@ -75,11 +75,8 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
         placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
-        T.preflattened_buffer(placeholder_4, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
-        T.preflattened_buffer(placeholder_5, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
-        T.preflattened_buffer(T_subtract_1, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         for ax0_ax1_fused_1 in T.serial(0, 224):
             for ax2_1, ax3_inner_1 in T.grid(224, 3):
@@ -90,13 +87,9 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True})
         placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
-        T.preflattened_buffer(placeholder_65, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1)
-        T.preflattened_buffer(placeholder_66, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
-        T.preflattened_buffer(placeholder_67, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1)
         T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
-        T.preflattened_buffer(T_cast_21, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         # body
         PaddedInput_7_data = T.allocate([157323], "int16", "global")
         PaddedInput_7 = T.buffer_decl(shape=[157323], dtype="int16", data=PaddedInput_7_data)
@@ -118,9 +111,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6:
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True})
         placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
-        T.preflattened_buffer(placeholder_29, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1)
         T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
-        T.preflattened_buffer(T_cast_7, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1)
         # body
         tensor_2_data = T.allocate([200704], "uint8", "global")
         tensor_2 = T.buffer_decl(shape=[200704], dtype="uint8", data=tensor_2_data)
@@ -168,13 +159,9 @@ def __tvm_main__(input: T.handle, fast_memory_0_var: T.Ptr[T.uint8], slow_memory
     @T.prim_func
     def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle, fast_memory_6_var: T.Ptr[T.uint8], slow_memory_7_var: T.Ptr[T.uint8]) -> None:
         placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8")
-        T.preflattened_buffer(placeholder_29, [802816], dtype="uint8")
         T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16")
-        T.preflattened_buffer(T_cast_7, [177], dtype="int16")
         fast_memory_6_buffer_var = T.match_buffer(fast_memory_6_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(fast_memory_6_buffer_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16)
         slow_memory_7_buffer_var = T.match_buffer(slow_memory_7_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(slow_memory_7_buffer_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16)
         # body
         tensor_2_let = T.buffer_decl([200704], dtype="uint8")
         with T.let(tensor_2_let.data, T.address_of(fast_memory_6_buffer_var[0], dtype="handle")):
@@ -189,15 +176,10 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6:
     @T.prim_func
     def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle, fast_memory_2_var: T.Ptr[T.uint8], slow_memory_3_var: T.Ptr[T.uint8]) -> None:
         placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8")
-        T.preflattened_buffer(placeholder_4, [150528], dtype="uint8")
         placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16")
-        T.preflattened_buffer(placeholder_5, [1], dtype="int16")
         T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16")
-        T.preflattened_buffer(T_subtract_1, [452], dtype="int16")
         fast_memory_2_buffer_var = T.match_buffer(fast_memory_2_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(fast_memory_2_buffer_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16)
         slow_memory_3_buffer_var = T.match_buffer(slow_memory_3_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(slow_memory_3_buffer_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16)
         # body
         for ax0_ax1_fused_1, ax2_1, ax3_inner_1 in T.grid(224, 224, 3):
             T_subtract_1[ax0_ax1_fused_1 * 672 + ax2_1 * 3 + ax3_inner_1] = T.cast(placeholder_4[ax0_ax1_fused_1 * 672 + ax2_1 * 3 + ax3_inner_1], "int16") - placeholder_5[0]
@@ -205,17 +187,11 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T
     @T.prim_func
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle, fast_memory_4_var: T.Ptr[T.uint8], slow_memory_5_var: T.Ptr[T.uint8]) -> None:
         placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16")
-        T.preflattened_buffer(placeholder_65, [150528], dtype="int16")
         placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16")
-        T.preflattened_buffer(placeholder_66, [9408], dtype="int16")
         placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32")
-        T.preflattened_buffer(placeholder_67, [64], dtype="int32")
         T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8")
-        T.preflattened_buffer(T_cast_21, [289], dtype="uint8")
         fast_memory_4_buffer_var = T.match_buffer(fast_memory_4_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(fast_memory_4_buffer_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16)
         slow_memory_5_buffer_var = T.match_buffer(slow_memory_5_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(slow_memory_5_buffer_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16)
         # body
         PaddedInput_7_let = T.buffer_decl([157323], "int16")
         with T.let(PaddedInput_7_let.data, T.address_of(slow_memory_5_buffer_var[802816], dtype="handle")):
@@ -280,11 +256,8 @@ def tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast(p
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast", "tir.noalias": True})
         placeholder_2 = T.match_buffer(placeholder, [360000], dtype="uint8")
-        T.preflattened_buffer(placeholder_2, [360000], dtype="uint8")
         placeholder_3 = T.match_buffer(placeholder_1, [64], dtype="int32")
-        T.preflattened_buffer(placeholder_3, [64], dtype="int32")
         T_cast_1 = T.match_buffer(T_cast, [215], dtype="int16")
-        T.preflattened_buffer(T_cast_1, [215], dtype="int16")
         # body
         for ax0_ax1_fused, ax2, ax3_outer, ax3_inner in T.grid(75, 75, 4, 16):
             T_cast_1[ax0_ax1_fused * 4800 + ax2 * 64 + ax3_outer * 16 + ax3_inner] = T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.cast(placeholder_2[ax0_ax1_fused * 4800 + ax2 * 64 + ax3_outer * 16 + ax3_inner], "int32") - 94, 1843157232, 31, 1, dtype="int32") + placeholder_3[ax3_outer * 16 + ax3_inner], 255), 0), "uint8"), "int16")
@@ -294,13 +267,9 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", "tir.noalias": True})
         placeholder_13 = T.match_buffer(placeholder_10, [360000], dtype="int16")
-        T.preflattened_buffer(placeholder_13, [360000], dtype="int16")
         placeholder_14 = T.match_buffer(placeholder_11, [36864], dtype="int16")
-        T.preflattened_buffer(placeholder_14, [36864], dtype="int16")
         placeholder_15 = T.match_buffer(placeholder_12, [64], dtype="int32")
-        T.preflattened_buffer(placeholder_15, [64], dtype="int32")
         T_cast_5 = T.match_buffer(T_cast_4, [215], dtype="int16")
-        T.preflattened_buffer(T_cast_5, [215], dtype="int16")
         # body
         PaddedInput_1_data = T.allocate([379456], "int16", "global")
         PaddedInput_1 = T.buffer_decl(shape=[379456], dtype="int16", data=PaddedInput_1_data)
@@ -321,13 +290,9 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_", "tir.noalias": True})
         placeholder_19 = T.match_buffer(placeholder_16, [360000], dtype="int16")
-        T.preflattened_buffer(placeholder_19, [360000], dtype="int16")
         placeholder_20 = T.match_buffer(placeholder_17, [16384], dtype="int16")
-        T.preflattened_buffer(placeholder_20, [16384], dtype="int16")
         placeholder_21 = T.match_buffer(placeholder_18, [256], dtype="int32")
-        T.preflattened_buffer(placeholder_21, [256], dtype="int32")
         T_add_1 = T.match_buffer(T_add, [407], dtype="int32")
-        T.preflattened_buffer(T_add_1, [407], dtype="int32")
         # body
         PaddedInput_2_data = T.allocate([360000], "int16", "global")
         PaddedInput_2 = T.buffer_decl(shape=[360000], dtype="int16", data=PaddedInput_2_data)
@@ -349,15 +314,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_", "tir.noalias": True})
         placeholder_29 = T.match_buffer(placeholder_22, [360000], dtype="int16")
-        T.preflattened_buffer(placeholder_29, [360000], dtype="int16")
         placeholder_27 = T.match_buffer(placeholder_23, [16384], dtype="int16")
-        T.preflattened_buffer(placeholder_27, [16384], dtype="int16")
         placeholder_26 = T.match_buffer(placeholder_24, [256], dtype="int32")
-        T.preflattened_buffer(placeholder_26, [256], dtype="int32")
         placeholder_28 = T.match_buffer(placeholder_25, [1440000], dtype="int32")
-        T.preflattened_buffer(placeholder_28, [1440000], dtype="int32")
         T_cast_7 = T.match_buffer(T_cast_6, [407], dtype="uint8")
-        T.preflattened_buffer(T_cast_7, [407], dtype="uint8")
         # body
         PaddedInput_3_data = T.allocate([360000], "int16", "global")
         PaddedInput_3 = T.buffer_decl(shape=[360000], dtype="int16", data=PaddedInput_3_data)
@@ -396,13 +356,9 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place
         # function attr dict
         T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", "tir.noalias": True})
         placeholder_7 = T.match_buffer(placeholder_4, [360000], dtype="int16")
-        T.preflattened_buffer(placeholder_7, [360000], dtype="int16")
         placeholder_8 = T.match_buffer(placeholder_5, [4096], dtype="int16")
-        T.preflattened_buffer(placeholder_8, [4096], dtype="int16")
         placeholder_9 = T.match_buffer(placeholder_6, [64], dtype="int32")
-        T.preflattened_buffer(placeholder_9, [64], dtype="int32")
         T_cast_3 = T.match_buffer(T_cast_2, [215], dtype="int16")
-        T.preflattened_buffer(T_cast_3, [215], dtype="int16")
         # body
         PaddedInput_data = T.allocate([360000], "int16", "global")
         PaddedInput = T.buffer_decl([360000], "int16", data=PaddedInput_data)
@@ -426,13 +382,9 @@ class ResnetStructurePlanned:
     @T.prim_func
     def tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast(placeholder: T.handle, placeholder_1: T.handle, T_cast: T.handle, global_workspace_1_var: T.Ptr[T.uint8]) -> None:
         placeholder_2 = T.match_buffer(placeholder, [360000], dtype="uint8")
-        T.preflattened_buffer(placeholder_2, [360000], dtype="uint8")
         placeholder_3 = T.match_buffer(placeholder_1, [64], dtype="int32")
-        T.preflattened_buffer(placeholder_3, [64], dtype="int32")
         T_cast_1 = T.match_buffer(T_cast, [215], dtype="int16")
-        T.preflattened_buffer(T_cast_1, [215], dtype="int16")
         global_workspace_1_buffer_var = T.match_buffer(global_workspace_1_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(global_workspace_1_buffer_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16)
         # body
         for ax0_ax1_fused, ax2, ax3_outer, ax3_inner in T.grid(75, 75, 4, 16):
             T_cast_1[ax0_ax1_fused * 4800 + ax2 * 64 + ax3_outer * 16 + ax3_inner] = T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.cast(placeholder_2[ax0_ax1_fused * 4800 + ax2 * 64 + ax3_outer * 16 + ax3_inner], "int32") - 94, 1843157232, 31, 1, dtype="int32") + placeholder_3[ax3_outer * 16 + ax3_inner], 255), 0), "uint8"), "int16")
@@ -440,17 +392,11 @@ def tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast(p
     @T.prim_func
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_(placeholder_22: T.handle, placeholder_23: T.handle, placeholder_24: T.handle, placeholder_25: T.handle, T_cast_6: T.handle, global_workspace_5_var: T.Ptr[T.uint8]) -> None:
         placeholder_29 = T.match_buffer(placeholder_22, [360000], dtype="int16")
-        T.preflattened_buffer(placeholder_29, [360000], dtype="int16")
         placeholder_27 = T.match_buffer(placeholder_23, [16384], dtype="int16")
-        T.preflattened_buffer(placeholder_27, [16384], dtype="int16")
         placeholder_26 = T.match_buffer(placeholder_24, [256], dtype="int32")
-        T.preflattened_buffer(placeholder_26, [256], dtype="int32")
         placeholder_28 = T.match_buffer(placeholder_25, [1440000], dtype="int32")
-        T.preflattened_buffer(placeholder_28, [1440000], dtype="int32")
         T_cast_7 = T.match_buffer(T_cast_6, [407], dtype="uint8")
-        T.preflattened_buffer(T_cast_7, [407], dtype="uint8")
         global_workspace_5_buffer_var = T.match_buffer(global_workspace_5_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(global_workspace_5_buffer_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16)
         # body
         PaddedInput_3_let = T.buffer_decl([360000], 'int16')
         with T.let(PaddedInput_3_let.data, T.address_of(global_workspace_5_buffer_var[6480000], dtype="handle")):
@@ -470,15 +416,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s
     @T.prim_func
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_(placeholder_16: T.handle, placeholder_17: T.handle, placeholder_18: T.handle, T_add: T.handle, global_workspace_4_var: T.Ptr[T.uint8]) -> None:
         placeholder_19 = T.match_buffer(placeholder_16, [360000], dtype="int16")
-        T.preflattened_buffer(placeholder_19, [360000], dtype="int16")
         placeholder_20 = T.match_buffer(placeholder_17, [16384], dtype="int16")
-        T.preflattened_buffer(placeholder_20, [16384], dtype="int16")
         placeholder_21 = T.match_buffer(placeholder_18, [256], dtype="int32")
-        T.preflattened_buffer(placeholder_21, [256], dtype="int32")
         T_add_1 = T.match_buffer(T_add, [407], dtype="int32")
-        T.preflattened_buffer(T_add_1, [407], dtype="int32")
         global_workspace_4_buffer_var = T.match_buffer(global_workspace_4_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(global_workspace_4_buffer_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16)
         # body
         PaddedInput_2_let = T.buffer_decl([360000], "int16")
         with T.let(PaddedInput_2_let.data, T.address_of(global_workspace_4_buffer_var[7200000], dtype="handle")):
@@ -498,15 +439,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s
     @T.prim_func
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(placeholder_4: T.handle, placeholder_5: T.handle, placeholder_6: T.handle, T_cast_2: T.handle, global_workspace_2_var: T.Ptr[T.uint8]) -> None:
         placeholder_7 = T.match_buffer(placeholder_4, [360000], dtype="int16")
-        T.preflattened_buffer(placeholder_7, [360000], dtype="int16")
         placeholder_8 = T.match_buffer(placeholder_5, [4096], dtype="int16")
-        T.preflattened_buffer(placeholder_8, [4096], dtype="int16")
         placeholder_9 = T.match_buffer(placeholder_6, [64], dtype="int32")
-        T.preflattened_buffer(placeholder_9, [64], dtype="int32")
         T_cast_3 = T.match_buffer(T_cast_2, [215], dtype="int16")
-        T.preflattened_buffer(T_cast_3, [215], dtype="int16")
         global_workspace_2_buffer_var = T.match_buffer(global_workspace_2_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(global_workspace_2_buffer_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16)
         # body
         PaddedInput_let = T.buffer_decl([360000], "int16")
         with T.let(PaddedInput_let.data, T.address_of(global_workspace_2_buffer_var[7200000], dtype="handle")):
@@ -525,15 +461,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place
     @T.prim_func
     def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(placeholder_10: T.handle, placeholder_11: T.handle, placeholder_12: T.handle, T_cast_4: T.handle, global_workspace_3_var: T.Ptr[T.uint8]) -> None:
         placeholder_13 = T.match_buffer(placeholder_10, [360000], dtype="int16")
-        T.preflattened_buffer(placeholder_13, [360000], dtype="int16")
         placeholder_14 = T.match_buffer(placeholder_11, [36864], dtype="int16")
-        T.preflattened_buffer(placeholder_14, [36864], dtype="int16")
         placeholder_15 = T.match_buffer(placeholder_12, [64], dtype="int32")
-        T.preflattened_buffer(placeholder_15, [64], dtype="int32")
         T_cast_5 = T.match_buffer(T_cast_4, [215], dtype="int16")
-        T.preflattened_buffer(T_cast_5, [215], dtype="int16")
         global_workspace_3_buffer_var = T.match_buffer(global_workspace_3_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16)
-        T.preflattened_buffer(global_workspace_3_buffer_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16)
         # body
         PaddedInput_1_let = T.buffer_decl([379456], "int16")
         with T.let(PaddedInput_1_let.data, T.address_of(global_workspace_3_buffer_var[0], dtype="handle")):
@@ -630,9 +561,6 @@ def tensor_intrin_primfunc(global_workspace_1_var: T.Ptr[T.uint8]) -> None:
         global_workspace_1_buffer_var = T.match_buffer(
             global_workspace_1_var, [40], dtype="uint8", strides=[1], elem_offset=0, align=16
         )
-        T.preflattened_buffer(
-            global_workspace_1_buffer_var, [40], dtype="uint8", strides=[1], elem_offset=0, align=16
-        )
         dense_let = T.buffer_decl([10], "int32")
         with T.let(dense_let.data, T.address_of(global_workspace_1_buffer_var[0], dtype="handle")):
             T.evaluate(
diff --git a/tests/python/unittest/test_tvmscript_error_report.py b/tests/python/unittest/test_tvmscript_error_report.py
index 32293cccdcf1..f542080f89f9 100644
--- a/tests/python/unittest/test_tvmscript_error_report.py
+++ b/tests/python/unittest/test_tvmscript_error_report.py
@@ -565,26 +565,6 @@ def non_integer_typed_block_iter():
     check_error(non_integer_typed_block_iter, 3)
 
 
-def test_preflattened_buffer_map_align():
-    def preflattened_buffer_map_align_nonint(foo: T.handle):
-        foo_1 = T.match_buffer(foo, [1])
-        T.preflattened_buffer(
-            foo_1, [1], align="bar"
-        )  # check_error: align: want int or IntImm, got 'bar'
-
-    check_error(preflattened_buffer_map_align_nonint, 3)
-
-
-def test_preflattened_buffer_map_offset_factor():
-    def preflattened_buffer_map_offset_factor_nonint(foo: T.handle):
-        foo_1 = T.match_buffer(foo, [1])
-        T.preflattened_buffer(
-            foo_1, [1], offset_factor="bar"
-        )  # check_error: offset_factor: want int or IntImm, got 'bar'
-
-    check_error(preflattened_buffer_map_offset_factor_nonint, 3)
-
-
 def test_illegal_buffer_slice():
     def strided_buffer_region(A: T.handle):
         # do not allow stride in buffer region
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
index 29e03f8bb63f..7d542c7bc7bd 100644
--- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py
+++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py
@@ -41,7 +41,6 @@ def test_ir_builder_tir_primfunc_base():
         body=tir.Evaluate(0),
         ret_type=None,
         buffer_map=None,
-        preflattened_buffer_map=None,
         attrs=None,
     )
 
@@ -60,7 +59,6 @@ def test_ir_builder_tir_primfunc_complete():
             T.func_attr({"key": "value"})
             T.func_ret(tvm.ir.PrimType("int64"))
             buffer_d = T.match_buffer(d, (64, 64), "int64")
-            T.preflattened_buffer(e, (32, 32), "int8", data=e.data)
             T.evaluate(0)
 
     # the prim_func generated by IRBuilder
@@ -83,9 +81,6 @@ def test_ir_builder_tir_primfunc_complete():
         body=tir.Evaluate(0),
         ret_type=tvm.ir.PrimType("int64"),
         buffer_map={c_handle: c_buffer, d_handle: d_buffer, e_handle: e_buffer},
-        preflattened_buffer_map={
-            e_handle: tir.decl_buffer((32, 32), "int8", name="e_preflatten", data=e_buffer.data)
-        },
         attrs=tvm.ir.make_node("DictAttrs", key="value"),
     )
 
diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py
index a39354b9552a..02b18e7e7c44 100644
--- a/tests/python/unittest/test_tvmscript_syntax_sugar.py
+++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py
@@ -186,23 +186,6 @@ def test_dynamic_shape_gemm():
     assert_structural_equal(gemm_dyn_shape, gemm_dyn_shape_roundtrip)
 
 
-@T.prim_func
-def preflattened_buffer_map(A: T.handle, B: T.handle):
-    A_1 = T.match_buffer(A, [1])
-    T.preflattened_buffer(A_1, [1], align=1, offset_factor=2)
-    B_1 = T.match_buffer(B, [1])
-    T.preflattened_buffer(B_1, [1])
-    B_1[0] = A_1[0]
-
-
-def test_preflattened_buffer_map():
-    A_var = [
-        k for k, _ in preflattened_buffer_map.preflattened_buffer_map.items() if k.name == "A"
-    ][0]
-    assert preflattened_buffer_map.preflattened_buffer_map[A_var].data_alignment == 1
-    assert preflattened_buffer_map.preflattened_buffer_map[A_var].offset_factor == 2
-
-
 @T.prim_func
 def match_buffer_int64(a: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (T.int64(128), T.int64(128)), dtype="float32")

From 44ed06ac9f019f9f06608504c3382d0905b6d5a2 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 16 Nov 2022 15:58:08 +0000
Subject: [PATCH 594/704] [ETHOSN] Relax concatenate offloading requirements
 (#13405)

This appears to be a historical change due to a bug in the support
library. Since the tests are working without this restriction and
previously not offloaded concatenate operations are now being offloaded,
it seems it is no longer necessary.

Change-Id: Ie6610cccd0c12e9f96db5d2b04e4ea553b3e7c64
---
 python/tvm/relay/op/contrib/ethosn.py         | 19 +------------------
 .../contrib/test_ethosn/test_networks.py      |  6 +++---
 2 files changed, 4 insertions(+), 21 deletions(-)

diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index bbe95dac9bba..e28eea9d224f 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -428,24 +428,7 @@ def qnn_concatenate(expr):
     if not _ethosn.concatenate(expr):
         return False
 
-    # Support library has some unenforced restrictions on qnn params
-    args = expr.args
-    min_range = 1e9
-    max_range = -1e9
-    qnn_params = []
-    for i in range(len(args[1].fields)):
-        scale = args[1].fields[i].data.numpy()
-        zero_point = args[2].fields[i].data.numpy()
-        min_range = min(-1 * zero_point * scale, min_range)
-        max_range = max((255 - zero_point) * scale, max_range)
-        qnn_params.append((scale, zero_point))
-
-    scale = (max_range - min_range) / 255
-    zero_point = int(-min_range / scale)
-    if (scale, zero_point) in qnn_params:
-        return True
-
-    return False
+    return True
 
 
 @tvm.ir.register_op_attr("split", "target.ethos-n")
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index 68402cd5e8a9..23ff5207fbcd 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -210,7 +210,7 @@ def test_ssd_mobilenet_v1():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    _compile_hash = {"ec2b78852192058f88b64d45c26620d5", "f68cbeaaba03874ea735ce3f5eab9227"}
+    _compile_hash = {"04855b9b9e0ab3f3768495059e12c5cf"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip",
@@ -218,6 +218,6 @@ def test_ssd_mobilenet_v1():
         input_dict={"normalized_input_image_tensor": (1, 300, 300, 3)},
         compile_hash=_compile_hash,
         output_count=4,
-        host_ops=27,
-        npu_partitions=2,
+        host_ops=26,
+        npu_partitions=1,
     )

From 52739ef8cd4499b555453d7159991316afc44301 Mon Sep 17 00:00:00 2001
From: Benson Muite <bkmgit@users.noreply.github.com>
Date: Wed, 16 Nov 2022 19:32:09 +0300
Subject: [PATCH 595/704] [ci][tvmbot] Fix spelling error, metionable ->
 mentionable (#13276)

In AUTH_CHECKS, "mentionable_users" was "metionable_users".
This pull request fixes this spelling error.
---
 ci/scripts/github/github_tvmbot.py | 4 ++--
 tests/python/ci/test_tvmbot.py     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/scripts/github/github_tvmbot.py b/ci/scripts/github/github_tvmbot.py
index 908551bdec0d..8dc897367e57 100755
--- a/ci/scripts/github/github_tvmbot.py
+++ b/ci/scripts/github/github_tvmbot.py
@@ -632,7 +632,7 @@ def check_mentionable_users(pr, triggering_comment, args):
 
 
 AUTH_CHECKS = {
-    "metionable_users": check_mentionable_users,
+    "mentionable_users": check_mentionable_users,
     "collaborators": check_collaborator,
     "author": check_author,
 }
@@ -676,7 +676,7 @@ class Rerun:
         "run ci",
     ]
 
-    auth = [AUTH_CHECKS["metionable_users"]]
+    auth = [AUTH_CHECKS["mentionable_users"]]
 
     @staticmethod
     def run(pr: PR):
diff --git a/tests/python/ci/test_tvmbot.py b/tests/python/ci/test_tvmbot.py
index de3ab9bb501b..f0ed7786ac16 100644
--- a/tests/python/ci/test_tvmbot.py
+++ b/tests/python/ci/test_tvmbot.py
@@ -253,7 +253,7 @@ class TestRerunPermissions(_TvmBotTest):
 
     COMMENT = "@tvm-bot rerun"
     USER = "someone"
-    EXPECTED = "Failed auth check 'metionable_users', quitting"
+    EXPECTED = "Failed auth check 'mentionable_users', quitting"
 
 
 class TestRerunNonAuthor(_TvmBotTest):
@@ -263,7 +263,7 @@ class TestRerunNonAuthor(_TvmBotTest):
 
     COMMENT = "@tvm-bot rerun"
     USER = "other-abc"
-    EXPECTED = "Passed auth check 'metionable_users', continuing"
+    EXPECTED = "Passed auth check 'mentionable_users', continuing"
 
 
 class TestIgnoreJobs(_TvmBotTest):

From 271ad4302917011a54b257ca2a78c563a7ba652c Mon Sep 17 00:00:00 2001
From: abhikran-quic <63697863+abhikran-quic@users.noreply.github.com>
Date: Wed, 16 Nov 2022 22:18:28 +0530
Subject: [PATCH 596/704] [TOPI] Update names for pooling ops (#13401)

[TOPI] Specify names for pooling ops

- Explicit names are useful while fetching
  the compute during scheduling of pooling ops.
- Specify meta_schedule attributes.
---
 include/tvm/topi/nn/pooling.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/include/tvm/topi/nn/pooling.h b/include/tvm/topi/nn/pooling.h
index c81c7cda7d15..3503584687fe 100644
--- a/include/tvm/topi/nn/pooling.h
+++ b/include/tvm/topi/nn/pooling.h
@@ -353,7 +353,9 @@ inline Tensor adaptive_pool_impl(const Tensor& x, const Array<PrimExpr>& output_
     return std::make_tuple(indices, reduce_axes);
   };
 
+  Map<String, ObjectRef> attrs;
   if (pool_type == kMaxPool) {
+    attrs.Set("schedule_rule", tvm::runtime::String("meta_schedule.adaptive_pool_max"));
     return tvm::te::compute(
         out_shape,
         [&](const Array<Var>& output) {
@@ -362,8 +364,9 @@ inline Tensor adaptive_pool_impl(const Tensor& x, const Array<PrimExpr>& output_
           std::tie(indices, reduce_axes) = get_iter_vars(output, true);
           return tvm::max(x(indices), reduce_axes);  // NOLINT(*)
         },
-        "tensor", "adaptive_pool_max");
+        "adaptive_pool_max", "adaptive_pool_max", attrs);
   } else if (pool_type == kAvgPool) {
+    attrs.Set("schedule_rule", tvm::runtime::String("meta_schedule.adaptive_pool_avg"));
     auto pool_sum = tvm::te::compute(
         out_shape,
         [&](const Array<Var>& output) {
@@ -372,7 +375,7 @@ inline Tensor adaptive_pool_impl(const Tensor& x, const Array<PrimExpr>& output_
           std::tie(indices, reduce_axes) = get_iter_vars(output, true);
           return tvm::sum(x(indices), reduce_axes);
         },
-        "tensor", "adaptive_pool_sum");
+        "adaptive_pool_sum", "adaptive_pool_sum");
 
     return tvm::te::compute(
         out_shape,
@@ -388,7 +391,7 @@ inline Tensor adaptive_pool_impl(const Tensor& x, const Array<PrimExpr>& output_
 
           return div(pool_sum(indices), divide_factor);
         },
-        "tensor", kElementWise);
+        "adaptive_pool_avg", kElementWise, attrs);
   } else {
     LOG(ERROR) << "Unrecognized pool_type: " << pool_type;
     return x;
@@ -566,8 +569,10 @@ inline Tensor pool_impl_nd(const Tensor& x, const Array<PrimExpr>& kernel_size,
     out_shape.Set(ii, out_dim);
   }
 
+  Map<String, ObjectRef> attrs;
   if (pool_type == kMaxPool) {
     auto temp = do_pad ? pad(x, pad_before, pad_after, tvm::min_value(x->dtype), "pad_temp") : x;
+    attrs.Set("schedule_rule", tvm::runtime::String("meta_schedule.pool_max"));
     return tvm::te::compute(
         out_shape,
         [&](const Array<Var>& output) {
@@ -580,8 +585,9 @@ inline Tensor pool_impl_nd(const Tensor& x, const Array<PrimExpr>& kernel_size,
           }
           return tvm::max(temp(indices), daxis);
         },
-        "tensor", "pool_max");
+        "pool_max", "pool_max", attrs);
   } else if (pool_type == kAvgPool) {
+    attrs.Set("schedule_rule", tvm::runtime::String("meta_schedule.pool_avg"));
     // Pad the inputs
     auto temp = do_pad ? pad(x, pad_before, pad_after, 0, "pad_temp") : x;
 
@@ -598,7 +604,7 @@ inline Tensor pool_impl_nd(const Tensor& x, const Array<PrimExpr>& kernel_size,
           }
           return tvm::sum(temp(indices), daxis);
         },
-        "tensor", "pool_sum");
+        "pool_sum", "pool_sum");
 
     // TVM compute for dividing the reduced window sum by kernel size.
     return tvm::te::compute(
@@ -650,7 +656,7 @@ inline Tensor pool_impl_nd(const Tensor& x, const Array<PrimExpr>& kernel_size,
             return div(pool_sum(indices), divide_factor);
           }
         },
-        "tensor", kElementWise);
+        "pool_avg", kElementWise, attrs);
   } else {
     LOG(ERROR) << "Unrecognized pool_type: " << pool_type;
     return x;

From b4d4b82dbb9be2e4d0954f9dfd8e1c46079b66ee Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 17 Nov 2022 01:52:14 +0900
Subject: [PATCH 597/704] [Hexagon] Fix TIR vrmpy tensorization (#13404)

[Hexagon] Fix vrmpy tensorization
---
 python/tvm/tir/tensor_intrin/hexagon.py       |  4 ---
 .../unittest/test_tir_schedule_tensorize.py   | 26 ++++++++++++++++---
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py
index 6fa9dd8f00ae..306c8cd2e14e 100644
--- a/python/tvm/tir/tensor_intrin/hexagon.py
+++ b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -32,8 +32,6 @@ def dot_product_32x4_u8u8i32_desc(
         for i in T.serial(0, 32):
             for k in T.serial(0, 4):
                 with T.block("update"):
-                    with T.init():
-                        C[i] = T.int32(0)
                     vi, vk = T.axis.remap("SR", [i, k])
                     C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
 
@@ -76,8 +74,6 @@ def dot_product_32x4_u8i8i32_desc(
         for i in T.serial(0, 32):
             for k in T.serial(0, 4):
                 with T.block("update"):
-                    with T.init():
-                        C[i] = T.int32(0)
                     vi, vk = T.axis.remap("SR", [i, k])
                     C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
 
diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py
index f30e91b892c5..0129cee53254 100644
--- a/tests/python/unittest/test_tir_schedule_tensorize.py
+++ b/tests/python/unittest/test_tir_schedule_tensorize.py
@@ -30,6 +30,7 @@
 )
 from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN
 from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
 
 # fmt: off
 # pylint: disable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,unexpected-keyword-arg,too-many-nested-blocks
@@ -539,9 +540,9 @@ def test_tensorize_with_annotation():
     verify_trace_roundtrip(sch=s, mod=func)
 
 
-def get_matmul_packed(m, n, k, lhs_type, int32_lanes):
+def get_matmul_packed(m, n, k, lhs_type, int32_lanes, rhs_dtype="int8"):
     X = te.placeholder((m, k), name="X", dtype=lhs_type)
-    packed_W = te.placeholder((n // int32_lanes, k // 4, int32_lanes, 4), name="packedW", dtype="int8")
+    packed_W = te.placeholder((n // int32_lanes, k // 4, int32_lanes, 4), name="packedW", dtype=rhs_dtype)
 
     ak = te.reduce_axis((0, k), name="k")
     matmul = te.compute(
@@ -549,7 +550,7 @@ def get_matmul_packed(m, n, k, lhs_type, int32_lanes):
         lambda i, j: te.sum(
             X[i, ak].astype("int32")
             * packed_W[
-                tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4), j % 16, ak % 4
+                tvm.tir.indexdiv(j, int32_lanes), tvm.tir.indexdiv(ak, 4), j % int32_lanes, ak % 4
             ].astype("int32"),
             axis=ak,
         ),
@@ -598,6 +599,25 @@ def test_tensorize_arm_dot():
         verify_trace_roundtrip(sch=sch, mod=func)
 
 
+def test_tensorize_vrmpy():
+    m, n, k = 128, 128, 128
+
+    func = get_matmul_packed(m, n, k, "uint8", 32, "uint8")
+
+    sch = tir.Schedule(func, debug_mask="all")
+    block = sch.get_block("compute")
+    _, j, k = sch.get_loops(block)
+
+    _, ji = sch.split(j, factors=[None, 32])
+    ko, ki = sch.split(k, factors=[None, 4])
+    sch.reorder(ko, ji, ki)
+
+    sch.decompose_reduction(block, ko)
+    sch.tensorize(ji, VRMPY_u8u8i32_INTRIN)
+
+    verify_trace_roundtrip(sch=sch, mod=func)
+
+
 def test_tensorize_dpa4():
     m, n, k = 128, 128, 128
 

From a80cdc26e291abc52bbd70c950023d9e0340464d Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 16 Nov 2022 13:45:07 -0600
Subject: [PATCH 598/704] [TIR][Analysis][Arith] Implement basic data-flow
 analysis (#13130)

An optional utility to track known buffer values through a TIR PrimFunc, allowing simplifications based on known values.

* Updated documentation following review comments

* Unit tests for rewrites, including negative numerators for div/mod

* Fix linting error

* Added brief description on what a control graph is

* Updates based on review comments

* Updated T.assume(expr) to T.evaluate(T.assume(expr))
---
 include/tvm/tir/op_attr_types.h               |   31 +
 src/arith/conjunctive_normal_form.cc          |   26 +-
 src/arith/constraint_extract.cc               |   39 +-
 src/arith/constraint_extract.h                |   31 +-
 src/arith/ir_visitor_with_analyzer.h          |    6 +-
 src/arith/rewrite_simplify.cc                 |   53 +-
 src/arith/transitive_comparison_analyzer.cc   |    2 +-
 src/arith/unwrap_vector_expr.cc               |   90 +
 src/arith/unwrap_vector_expr.h                |   56 +
 src/tir/analysis/control_flow_graph.cc        | 1647 +++++++++++++++++
 src/tir/analysis/control_flow_graph.h         |  653 +++++++
 src/tir/transforms/simplify.cc                |  105 +-
 .../unittest/test_arith_rewrite_simplify.py   |   61 +-
 .../unittest/test_tir_transform_simplify.py   |  645 ++++++-
 14 files changed, 3403 insertions(+), 42 deletions(-)
 create mode 100644 src/arith/unwrap_vector_expr.cc
 create mode 100644 src/arith/unwrap_vector_expr.h
 create mode 100644 src/tir/analysis/control_flow_graph.cc
 create mode 100644 src/tir/analysis/control_flow_graph.h

diff --git a/include/tvm/tir/op_attr_types.h b/include/tvm/tir/op_attr_types.h
index 6b5d6c48ddd0..fa409b27d12a 100644
--- a/include/tvm/tir/op_attr_types.h
+++ b/include/tvm/tir/op_attr_types.h
@@ -32,6 +32,8 @@
 #include <tvm/runtime/container/string.h>
 #include <tvm/runtime/packed_func.h>
 
+#include <ostream>
+
 namespace tvm {
 namespace tir {
 /*!
@@ -92,6 +94,35 @@ enum class CallEffectKind : int {
   kControlJump = 6,
 };
 
+inline std::ostream& operator<<(std::ostream& os, CallEffectKind side_effect) {
+  switch (side_effect) {
+    case CallEffectKind::kExprAnnotation:
+      return os << "kExprAnnotation";
+
+    case CallEffectKind::kPure:
+      return os << "kPure";
+
+    case CallEffectKind::kReadState:
+      return os << "kReadState";
+
+    case CallEffectKind::kUpdateState:
+      return os << "kUpdateState";
+
+    case CallEffectKind::kSpecialCallArg:
+      return os << "kSpecialCallArg";
+
+    case CallEffectKind::kEmbedInfo:
+      return os << "kEmbedInfo";
+
+    case CallEffectKind::kControlJump:
+      return os << "kControlJump";
+
+    default:
+      LOG(FATAL) << "Unknown CallEffectKind: " << static_cast<int>(side_effect);
+      return os;
+  }
+}
+
 /*! \brief Use integer to record the kind. */
 using TCallEffectKind = Integer;
 
diff --git a/src/arith/conjunctive_normal_form.cc b/src/arith/conjunctive_normal_form.cc
index 19d6a234e6ad..1c5f31a913a1 100644
--- a/src/arith/conjunctive_normal_form.cc
+++ b/src/arith/conjunctive_normal_form.cc
@@ -248,14 +248,14 @@ void AndOfOrs::TrySimplifyOr(Key* a_ptr, Key* b_ptr, Analyzer* analyzer) {
   Key& a = *a_ptr;
   Key& b = *b_ptr;
   PrimExpr joint = GetExpr(a) || GetExpr(b);
-  PrimExpr simplified = analyzer->Simplify(joint);
+  PrimExpr simplified = analyzer->rewrite_simplify(joint);
   if (!ExprDeepEqual()(simplified, joint)) {
     if (auto* simplified_or = simplified.as<OrNode>()) {
       a = GetKey(simplified_or->a);
       b = GetKey(simplified_or->b);
     } else {
-      a = GetKey(simplified);
-      b = key_false_;
+      a = key_false_;
+      b = GetKey(simplified);
     }
   }
 }
@@ -264,14 +264,14 @@ void AndOfOrs::TrySimplifyAnd(Key* a_ptr, Key* b_ptr, Analyzer* analyzer) {
   Key& a = *a_ptr;
   Key& b = *b_ptr;
   PrimExpr joint = GetExpr(a) && GetExpr(b);
-  PrimExpr simplified = analyzer->Simplify(joint);
+  PrimExpr simplified = analyzer->rewrite_simplify(joint);
   if (!ExprDeepEqual()(simplified, joint)) {
     if (auto* simplified_and = simplified.as<AndNode>()) {
       a = GetKey(simplified_and->a);
       b = GetKey(simplified_and->b);
     } else {
-      a = GetKey(simplified);
-      b = key_true_;
+      a = key_true_;
+      b = GetKey(simplified);
     }
   }
 }
@@ -362,6 +362,20 @@ void AndOfOrs::SimplifyAcrossChunks(Analyzer* analyzer) {
           // (A or B) and (A or C) => A or (B and C)
           auto& key_i = i_chunk[i_distinct_index.value()];
           auto& key_j = j_chunk[j_distinct_index.value()];
+
+          // When attempting to simplify (B and C), the analyzer may
+          // assume that A is false.
+          PrimExpr known = [&]() {
+            PrimExpr known = Bool(true);
+            for (const auto& key : i_chunk) {
+              if (&key != &key_i) {
+                known = known && analyzer->Simplify(!GetExpr(key));
+              }
+            }
+            return known;
+          }();
+
+          With<ConstraintContext> context(analyzer, known);
           TrySimplifyAnd(&key_i, &key_j, analyzer);
         }
       }
diff --git a/src/arith/constraint_extract.cc b/src/arith/constraint_extract.cc
index d0bf57497e63..b873adcb5ca4 100644
--- a/src/arith/constraint_extract.cc
+++ b/src/arith/constraint_extract.cc
@@ -31,23 +31,42 @@
 namespace tvm {
 namespace arith {
 
-void CollectConstraints(const PrimExpr& expr, Analyzer* analyzer, std::vector<PrimExpr>* collect) {
-  collect->push_back(expr);
+template <typename F>
+void CollectConstraints(PrimExpr expr, F callback, bool keep_composite_constraints) {
+  if (keep_composite_constraints) {
+    callback(expr);
+  }
 
   PVar<PrimExpr> x, y;
   if ((x && y).Match(expr)) {
-    CollectConstraints(x.Eval(), analyzer, collect);
-    CollectConstraints(y.Eval(), analyzer, collect);
-  } else if ((!(x || y)).Match(expr)) {
-    CollectConstraints(analyzer->rewrite_simplify(tir::Not(x.Eval())), analyzer, collect);
-    CollectConstraints(analyzer->rewrite_simplify(tir::Not(y.Eval())), analyzer, collect);
+    CollectConstraints(x.Eval(), callback, keep_composite_constraints);
+    CollectConstraints(y.Eval(), callback, keep_composite_constraints);
+  } else if (!keep_composite_constraints) {
+    callback(expr);
+  }
+}
+
+std::vector<PrimExpr> ExtractConstraints(const PrimExpr& expr, bool keep_composite_constraints) {
+  std::vector<PrimExpr> out;
+  CollectConstraints(
+      expr, [&](const PrimExpr& part) { out.push_back(part); }, keep_composite_constraints);
+  return out;
+}
+
+template <typename F>
+void CollectComponents(PrimExpr expr, F callback) {
+  PVar<PrimExpr> x, y;
+  if ((x || y).Match(expr)) {
+    CollectComponents(x.Eval(), callback);
+    CollectComponents(y.Eval(), callback);
+  } else {
+    callback(expr);
   }
 }
 
-std::vector<PrimExpr> ExtractConstraints(const PrimExpr& expr) {
+std::vector<PrimExpr> ExtractComponents(const PrimExpr& expr) {
   std::vector<PrimExpr> out;
-  Analyzer analyzer;
-  CollectConstraints(expr, &analyzer, &out);
+  CollectComponents(expr, [&](const PrimExpr& part) { out.push_back(part); });
   return out;
 }
 
diff --git a/src/arith/constraint_extract.h b/src/arith/constraint_extract.h
index ea6e0a74419c..815eafeebd62 100644
--- a/src/arith/constraint_extract.h
+++ b/src/arith/constraint_extract.h
@@ -42,6 +42,35 @@ namespace arith {
  * Example: `i==5 || j==3` => `[i==5 || j==3]`
  * Example: `!(i>5 || j==3)` => `[!(i==5 || j==3), i<=5, j!=3]`
  *
+ * If `keep_composite_constraints` is true (default), a constraint
+ * that can be decomposed will be included in the output.  If false,
+ * they will be excluded.
+ *
+ * Example, removing composite: `!(i>5 || j==3)` => `[i<=5, j!=3]`
+ *
+ * Intended for use in bounds analysis or simplification within a
+ * conditional, or identifying independent conditionals that may be
+ * hoisted.
+ *
+ * \param expr The expression to be analyzers
+ *
+ * \param keep_composite_constraints Whether to include composite
+ * constraints in the output.
+ *
+ * \returns A vector of independent constraints
+ */
+std::vector<PrimExpr> ExtractConstraints(const PrimExpr& expr,
+                                         bool keep_composite_constraints = true);
+
+/* \brief Returns components that are false if the expression is false.
+ *
+ * Utility to break up a boolean expression into independent
+ * components.
+ *
+ * Example: `i==5 || j==3` => `[i==5, j==3]`
+ * Example: `i==5 && j==3` => `[i==5 && j==3]`
+ * Example: `!(i>5 && j==3)` => `[i<=5, j!=3]`
+ *
  * Intended for use in bounds analysis or simplification within a
  * conditional, or identifying independent conditionals that may be
  * hoisted.
@@ -50,7 +79,7 @@ namespace arith {
  *
  * \returns A vector of independent constraints
  */
-std::vector<PrimExpr> ExtractConstraints(const PrimExpr& expr);
+std::vector<PrimExpr> ExtractComponents(const PrimExpr& expr);
 
 }  // namespace arith
 }  // namespace tvm
diff --git a/src/arith/ir_visitor_with_analyzer.h b/src/arith/ir_visitor_with_analyzer.h
index f41a628f3cc6..416b2af196bd 100644
--- a/src/arith/ir_visitor_with_analyzer.h
+++ b/src/arith/ir_visitor_with_analyzer.h
@@ -57,7 +57,11 @@ class IRVisitorWithAnalyzer : public tir::StmtExprVisitor {
   /*! \brief internal analyzer field. */
   arith::Analyzer analyzer_;
 
- private:
+  /*! \brief Extract a constraint from a conditional statement
+   *
+   * Intended for preparing argument for use in
+   * `With<ConstraintContext>`.
+   */
   PrimExpr ExtractRealCondition(PrimExpr condition) const;
 };
 
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index d0fb943334de..e6d876cf5aa8 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -292,7 +292,7 @@ std::function<void()> RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c
   // we will compare the already simplified result with the constraint,
   // so simplify the constraint as well
   PrimExpr new_constraint = operator()(constraint);
-  for (const PrimExpr& subconstraint : ExtractConstraints(new_constraint)) {
+  for (const PrimExpr& subconstraint : ExtractConstraints(new_constraint, false)) {
     if (SideEffect(subconstraint) <= CallEffectKind::kPure) {
       literal_constraints_.push_back(subconstraint);
       PrimExpr negation;
@@ -1734,7 +1734,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
   // Pattern var to match any expression
   PVar<PrimExpr> x, y;
   // Pattern var match IntImm
-  PVar<IntImm> c1, c2;
+  PVar<IntImm> c1, c2, c3;
   PVar<int> lanes;
 
   if (op->dtype.lanes() != 1) {
@@ -1761,6 +1761,55 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
 
   TVM_TRY_REWRITE(x == c1 && x != c2, x == c1 && c1 != c2);
   TVM_TRY_REWRITE(x != c2 && x == c1, x == c1 && c1 != c2);
+
+  TVM_TRY_RECURSIVE_REWRITE(floordiv(x, c2) == c1 && floormod(x, c2) == c3, x == c1 * c2 + c3);
+  TVM_TRY_RECURSIVE_REWRITE(floormod(x, c2) == c3 && floordiv(x, c2) == c1, x == c1 * c2 + c3);
+
+  TVM_TRY_RECURSIVE_REWRITE_IF(0 <= x - y * c1 &&
+                               x - y * c1<c1, y == floordiv(x, c1), c1.Eval()->value> 0);
+  TVM_TRY_RECURSIVE_REWRITE_IF(x - y * c1 < c1 && 0 <= x - y * c1, y == floordiv(x, c1),
+                               c1.Eval()->value > 0);
+
+  TVM_TRY_RECURSIVE_REWRITE(c1 < x - y * c1 && x - y * c1 <= 0, y == floordiv(x, c1));
+  TVM_TRY_RECURSIVE_REWRITE(x - y * c1 < c1 && 0 <= x - y * c1, y == floordiv(x, c1));
+  TVM_TRY_RECURSIVE_REWRITE_IF(0 <= x + y * c2 && x + y * c2 < c1, y == floordiv(x, c1),
+                               c2.Eval()->value == -c1.Eval()->value);
+  TVM_TRY_RECURSIVE_REWRITE_IF(x + y * c2 < c1 && 0 <= x + y * c2, y == floordiv(x, c1),
+                               c2.Eval()->value == -c1.Eval()->value);
+
+  TVM_TRY_RECURSIVE_REWRITE_IF(x < c1 && floormod(x, c2) < c3,
+                               x < c1 - c2 + c3 && floormod(x, c2) < c3,
+                               c1.Eval()->value % c2.Eval()->value == 0);
+  TVM_TRY_RECURSIVE_REWRITE_IF(
+      x < c1 && floormod(x, c2) < c3, x < c1 - floormod(c1, c2) + c3 && floormod(x, c2) < c3,
+      (c1.Eval()->value % c2.Eval()->value + c2.Eval()->value) % c2.Eval()->value >
+          c3.Eval()->value);
+
+  TVM_TRY_RECURSIVE_REWRITE_IF(x <= c1 && floormod(x, c2) < c3,
+                               x < c1 + 1 - c2 + c3 && floormod(x, c2) < c3,
+                               (c1.Eval()->value + 1) % c2.Eval()->value == 0);
+  TVM_TRY_RECURSIVE_REWRITE_IF(
+      x <= c1 && floormod(x, c2) < c3, x < c1 + 1 - floormod(c1, c2) + c3 && floormod(x, c2) < c3,
+      (((c1.Eval()->value + 1) % c2.Eval()->value) + c2.Eval()->value) % c2.Eval()->value >
+          c3.Eval()->value);
+
+  TVM_TRY_RECURSIVE_REWRITE(floordiv(x, c2) == c1 && floormod(x, c2) < c3,
+                            c1 * c2 <= x && x < c1 * c2 + c3);
+  TVM_TRY_RECURSIVE_REWRITE(floormod(x, c2) < c3 && floordiv(x, c2) == c1,
+                            c1 * c2 <= x && x < c1 * c2 + c3);
+  TVM_TRY_RECURSIVE_REWRITE(floordiv(x, c2) == c1 && floormod(x, c2) <= c3,
+                            c1 * c2 <= x && x <= c1 * c2 + c3);
+  TVM_TRY_RECURSIVE_REWRITE(floormod(x, c2) <= c3 && floordiv(x, c2) == c1,
+                            c1 * c2 <= x && x <= c1 * c2 + c3);
+
+  TVM_TRY_RECURSIVE_REWRITE(floordiv(x, c2) == c1 && c3 <= floormod(x, c2),
+                            c1 * c2 + c3 <= x && x < (c1 + 1) * c2);
+  TVM_TRY_RECURSIVE_REWRITE(c3 <= floormod(x, c2) && floordiv(x, c2) == c1,
+                            c1 * c2 + c3 <= x && x < (c1 + 1) * c2);
+  TVM_TRY_RECURSIVE_REWRITE(floordiv(x, c2) == c1 && c3 < floormod(x, c2),
+                            c1 * c2 + c3 < x && x < (c1 + 1) * c2);
+  TVM_TRY_RECURSIVE_REWRITE(c3 < floormod(x, c2) && floordiv(x, c2) == c1,
+                            c1 * c2 + c3 < x && x < (c1 + 1) * c2);
   return ret;
 }
 
diff --git a/src/arith/transitive_comparison_analyzer.cc b/src/arith/transitive_comparison_analyzer.cc
index b71096a479b5..36c2fb77074c 100644
--- a/src/arith/transitive_comparison_analyzer.cc
+++ b/src/arith/transitive_comparison_analyzer.cc
@@ -547,7 +547,7 @@ std::function<void()> TransitiveComparisonAnalyzer::EnterConstraint(const PrimEx
 
 void TransitiveComparisonAnalyzer::Impl::AddKnown(const PrimExpr& expr,
                                                   std::vector<Comparison>* vec) {
-  for (const auto& subexpr : ExtractConstraints(expr)) {
+  for (const auto& subexpr : ExtractConstraints(expr, false)) {
     if (tir::SideEffect(expr) <= tir::CallEffectKind::kPure) {
       if (auto cmp = FromExpr(subexpr)) {
         vec->push_back(cmp.value());
diff --git a/src/arith/unwrap_vector_expr.cc b/src/arith/unwrap_vector_expr.cc
new file mode 100644
index 000000000000..6a3e8c3d434c
--- /dev/null
+++ b/src/arith/unwrap_vector_expr.cc
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file unwrap_vector_expr.cc
+ * \brief Utility for tracking currently active constraints
+ */
+
+#include "unwrap_vector_expr.h"
+
+#include <tvm/arith/analyzer.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/expr_functor.h>
+#include <tvm/tir/op.h>
+
+#include <unordered_map>
+
+namespace tvm {
+namespace arith {
+
+using namespace tir;
+
+class Scalarizer : public ExprMutator {
+ public:
+  explicit Scalarizer(PrimExpr lane) : lane_(lane) {}
+
+  PrimExpr VisitExpr_(const RampNode* op) final { return op->base + lane_ * op->stride; }
+
+  PrimExpr VisitExpr_(const BroadcastNode* op) final { return op->value; }
+
+  PrimExpr VisitExpr_(const VarNode* op) final {
+    Var var = GetRef<Var>(op);
+
+    auto it = let_var_remap_.find(op);
+    if (it != let_var_remap_.end()) {
+      return it->second;
+    } else {
+      return ExprMutator::VisitExpr_(op);
+    }
+  }
+  PrimExpr VisitExpr_(const LetNode* op) final {
+    if (op->value.dtype().lanes() == 1) {
+      return ExprMutator::VisitExpr_(op);
+    }
+
+    auto it = let_var_remap_.find(op->var.get());
+    ICHECK(it == let_var_remap_.end()) << "Duplicate binding of variable " << op->var;
+
+    Var new_var(op->var->name_hint + "_scalar", op->var.dtype().element_of());
+    let_var_remap_[op->var.get()] = new_var;
+
+    PrimExpr value = this->VisitExpr(op->value);
+    PrimExpr body = this->VisitExpr(op->body);
+
+    let_var_remap_.erase(op->var.get());
+    return Let(op->var, value, body);
+  }
+
+ private:
+  // The lane to extract
+  PrimExpr lane_;
+
+  // Let binding
+  std::unordered_map<const VarNode*, Var> let_var_remap_;
+};
+
+PrimExpr UnwrapVectorExpr(const PrimExpr& vector_expr, const PrimExpr& lane) {
+  return Scalarizer(lane)(vector_expr);
+}
+
+}  // namespace arith
+}  // namespace tvm
diff --git a/src/arith/unwrap_vector_expr.h b/src/arith/unwrap_vector_expr.h
new file mode 100644
index 000000000000..9f18964043ff
--- /dev/null
+++ b/src/arith/unwrap_vector_expr.h
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file unwrap_vector_expr.h
+ *
+ * \brief Centralized location for extraction of constraints from a boolean expression.
+ */
+
+#ifndef TVM_ARITH_UNWRAP_VECTOR_EXPR_H_
+#define TVM_ARITH_UNWRAP_VECTOR_EXPR_H_
+
+#include <tvm/tir/expr.h>
+
+#include <vector>
+
+namespace tvm {
+namespace arith {
+
+/* \brief Unwraps a component of a vector expression
+ *
+ * Utility to break up a vector expression into a specific component
+ * of the expression.
+ *
+ * Example: `Ramp(start, stride, n)` => `start + stride*lane`
+ * Example: `Broadcast(value, n)` => `value`
+ * Example: `2*Ramp(start, stride, n) + Broadcast(value,n)` => `2*(start + stride*lane) + value`
+ *
+ * \param vector_expr The vectorized expression to examine
+ *
+ * \param lane Which lane of the vectorized expression to extract.
+ *
+ * \returns A scalar expression
+ */
+PrimExpr UnwrapVectorExpr(const PrimExpr& vector_expr, const PrimExpr& lane);
+
+}  // namespace arith
+}  // namespace tvm
+
+#endif  // TVM_ARITH_UNWRAP_VECTOR_EXPR_H_
diff --git a/src/tir/analysis/control_flow_graph.cc b/src/tir/analysis/control_flow_graph.cc
new file mode 100644
index 000000000000..42c5c8bb82d5
--- /dev/null
+++ b/src/tir/analysis/control_flow_graph.cc
@@ -0,0 +1,1647 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file control_flow_graph.cc
+ * \brief Utility to deduce bound of expression
+ */
+
+#include "control_flow_graph.h"
+
+#include <tvm/runtime/registry.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/op.h>
+#include <tvm/tir/stmt_functor.h>
+
+#include <numeric>
+#include <optional>
+#include <queue>
+#include <set>
+#include <sstream>
+#include <unordered_set>
+
+#include "../../arith/conjunctive_normal_form.h"
+#include "../../arith/constraint_extract.h"
+#include "../../arith/ir_mutator_with_analyzer.h"
+#include "../../arith/ir_visitor_with_analyzer.h"
+#include "../../arith/narrow_predicate_expression.h"
+#include "../../arith/unwrap_vector_expr.h"
+
+namespace tvm {
+namespace tir {
+
+using namespace arith;
+
+namespace {
+bool HasBufferLoad(PrimExpr expr) {
+  struct Visitor : public ExprVisitor {
+    void VisitExpr_(const BufferLoadNode* node) override { found_buffer_load = true; }
+    bool found_buffer_load{false};
+  };
+
+  Visitor visitor;
+  visitor(expr);
+  return visitor.found_buffer_load;
+}
+
+Optional<PrimExpr> SubstituteParamValues(const Array<Var>& param_vars,
+                                         const Array<PrimExpr>& param_values,
+                                         const PrimExpr& expr) {
+  ICHECK_EQ(param_vars.size(), param_values.size())
+      << "Expression was defined as having " << param_vars.size() << " parameters, but received "
+      << param_values.size() << " arguments.";
+
+  Map<tir::Var, PrimExpr> var_map;
+  for (size_t i = 0; i < param_values.size(); i++) {
+    var_map.Set(param_vars[i], param_values[i]);
+  }
+
+  return Substitute(expr, var_map);
+}
+}  // namespace
+
+PrimExpr BufferTouch::BeforeLoopIteration() const {
+  PrimExpr loop_predicate = Bool(true);
+  for (auto it = loop_var_expressions.rbegin(); it != loop_var_expressions.rend(); it++) {
+    const Var& loop_var = it->first;
+    const PrimExpr& loop_expr = it->second;
+    loop_predicate = (loop_var <= loop_expr) || ((loop_var == loop_expr) && loop_predicate);
+  }
+  return loop_predicate;
+}
+
+PrimExpr BufferTouch::AtLoopIteration() const {
+  PrimExpr loop_predicate = Bool(true);
+  for (auto it = loop_var_expressions.rbegin(); it != loop_var_expressions.rend(); it++) {
+    const Var& loop_var = it->first;
+    const PrimExpr& loop_expr = it->second;
+    loop_predicate = (loop_var == loop_expr) && loop_predicate;
+  }
+  return loop_predicate;
+}
+
+PrimExpr BufferTouch::AfterLoopIteration() const {
+  PrimExpr loop_predicate = Bool(true);
+  for (auto it = loop_var_expressions.rbegin(); it != loop_var_expressions.rend(); it++) {
+    const Var& loop_var = it->first;
+    const PrimExpr& loop_expr = it->second;
+    loop_predicate = (loop_var >= loop_expr) || ((loop_var == loop_expr) && loop_predicate);
+  }
+  return loop_predicate;
+}
+
+bool BufferTouch::IsSubsetOf(const BufferTouch& other, Analyzer* analyzer) const {
+  if (this->buffer.same_as(other.buffer)) {
+    With<ConstraintContext> constraint(analyzer, predicate);
+
+    return analyzer->CanProve(other.predicate);
+  } else {
+    return false;
+  }
+}
+
+bool BufferTouch::IsDistinctFrom(const BufferTouch& other, Analyzer* analyzer) const {
+  if (this->buffer.same_as(other.buffer)) {
+    With<ConstraintContext> constraint(analyzer, predicate);
+
+    return analyzer->CanProve(!other.predicate);
+  } else {
+    return true;
+  }
+}
+
+std::ostream& operator<<(std::ostream& os, const BufferTouch& tp) {
+  auto touch_type = [&]() {
+    if (tp.touch_type == BufferTouch::AccessType::Read) {
+      return "read";
+    } else if (tp.touch_type == BufferTouch::AccessType::Write) {
+      return "write";
+    } else if (tp.touch_type == BufferTouch::AccessType::Assume) {
+      return "assume";
+    } else {
+      return "???";
+    }
+  }();
+
+  os << "BufferTouch(" << tp.buffer->name << ", " << touch_type << ", " << tp.predicate
+     << ", value = " << tp.value << ")";
+  return os;
+}
+
+class BufferConstraintApply : public IRMutatorWithAnalyzer {
+ public:
+  using Parent = IRMutatorWithAnalyzer;
+
+  BufferConstraintApply(const Map<Buffer, Array<Var>>& axis_var_lookup,
+                        const std::vector<BufferTouch>& knowns, Analyzer* analyzer)
+      : Parent(analyzer), axis_var_lookup_(axis_var_lookup), knowns_(knowns) {}
+
+  using Parent::VisitExpr_;
+
+  PrimExpr VisitExpr_(const BufferLoadNode* op) override {
+    for (const auto& known : knowns_) {
+      if (!op->buffer.same_as(known.buffer)) {
+        continue;
+      }
+
+      Optional<Var> lane_var = NullOpt;
+      IntImm num_lanes;
+
+      Array<PrimExpr> indices = op->indices.Map([&](const auto& index) {
+        if (index.dtype().lanes() == 1) {
+          return index;
+        } else {
+          ICHECK(!lane_var) << "Multiple indices found with non-scalar values";
+          lane_var = Var("lane", index.dtype().element_of());
+          num_lanes = IntImm(index.dtype().element_of(), index.dtype().lanes());
+          return UnwrapVectorExpr(index, lane_var.value());
+        }
+      });
+
+      auto axis_vars = axis_var_lookup_.at(op->buffer);
+      PrimExpr predicate = SubstituteParamValues(axis_vars, indices, known.predicate).value();
+
+      std::optional<With<ConstraintContext>> context;
+      if (lane_var.defined()) {
+        Var lanes = lane_var.value();
+        PrimExpr known = (IntImm(lanes.dtype(), 0) <= lanes) && (lanes < num_lanes);
+        context.emplace(analyzer_, known);
+      }
+
+      if (analyzer_->CanProve(predicate)) {
+        return SubstituteParamValues(axis_vars, op->indices, known.value).value();
+      }
+    }
+
+    return GetRef<PrimExpr>(op);
+  }
+
+ private:
+  const Map<Buffer, Array<Var>>& axis_var_lookup_;
+  const std::vector<BufferTouch>& knowns_;
+};
+
+/*! \brief Extract the control-flow graph
+ *
+ * Walk through a statement, populating the control-flow graph.
+ */
+class ControlFlowGraphBuilder final : public IRVisitorWithAnalyzer {
+ public:
+  static void Build(ControlFlowGraph* out, const Stmt& stmt) {
+    ControlFlowGraphBuilder extractor(out);
+    extractor.AppendControlBlock();
+    extractor(stmt);
+  }
+
+ private:
+  ControlFlowGraphBuilder(ControlFlowGraph* out) : out_(out) {}
+
+  using Parent = IRVisitorWithAnalyzer;
+  using Parent::VisitExpr_;
+  using Parent::VisitStmt_;
+
+  void VisitStmt(const Stmt& stmt) override {
+    // Update the lookup table to determine which control-flow block
+    // contains the start of the specified statement.  This is used
+    // later to determine which set of known values should be used to
+    // simplify a statement.
+    out_->control_flow_lookup_[stmt.get()] = CurrentControlBlock();
+    Stmt prev_stmt = current_stmt_;
+    current_stmt_ = stmt;
+    Parent::VisitStmt(stmt);
+    current_stmt_ = prev_stmt;
+  }
+
+  void VisitStmt_(const EvaluateNode* op) override {
+    if (auto* call = op->value.as<CallNode>()) {
+      if (call->op.same_as(builtin::assume())) {
+        Assume(call->args[0], true);
+        return;
+      }
+    }
+
+    Parent::VisitStmt_(op);
+  }
+
+  void Assume(PrimExpr assumption, bool from_assume_statement) {
+    for (const auto& expr : ExtractConstraints(assumption, false)) {
+      AssumeConstraintComponent(expr, from_assume_statement);
+    }
+  }
+
+  void AssumeConstraintComponent(PrimExpr assumption, bool from_assume_statement) {
+    PrimExpr additional_predicate = Bool(true);
+
+    std::vector<PrimExpr> buffer_exprs;
+    for (const auto& expr : ExtractComponents(assumption)) {
+      auto side_effect = tir::SideEffect(expr);
+      if (side_effect <= tir::CallEffectKind::kPure) {
+        // Pulling out portions of the assumption that do not depend
+        // on a buffer value allows the following two forms to be
+        // treated identically.
+        //
+        // Option 1: if i < 3: T.assume(buf[i] == value)
+        // Option 2: T.assume(i>=3 or buf[i] == value)
+        additional_predicate = additional_predicate && logical_not(expr);
+      } else if (side_effect == tir::CallEffectKind::kReadState) {
+        buffer_exprs.push_back(expr);
+      } else {
+        LOG(FATAL) << "Assumption must be pure or read-only, but contained expression " << expr
+                   << " with side-effect \'" << side_effect << "\'";
+      }
+    }
+
+    if (buffer_exprs.empty()) {
+      out_->non_buffer_assumptions_.push_back(!CurrentScopePredicate() || assumption);
+      return;
+    }
+
+    CHECK_EQ(buffer_exprs.size(), 1) << "T.assume must contain only a single buffer expression";
+
+    auto* as_equal_node = buffer_exprs[0].as<tir::EQNode>();
+    CHECK(as_equal_node || !from_assume_statement)
+        << "T.assume buffer constraint must be of the form 'buffer[indices] == "
+           "value', but received "
+        << assumption;
+    if (!as_equal_node) {
+      // This assumption is an inequality on a data-dependent
+      // conditional.  Not an error for this to occur, but also not
+      // something that is currently supported.
+      return;
+    }
+
+    tir::BufferLoad load;
+    PrimExpr value;
+    if (auto* as_load = as_equal_node->a.as<tir::BufferLoadNode>()) {
+      load = GetRef<tir::BufferLoad>(as_load);
+      value = as_equal_node->b;
+    } else if (auto* as_load = as_equal_node->b.as<tir::BufferLoadNode>()) {
+      load = GetRef<tir::BufferLoad>(as_load);
+      value = as_equal_node->a;
+    } else if (!from_assume_statement) {
+      return;
+    } else {
+      LOG(FATAL) << "T.assume buffer constraint must be of the form 'buffer[indices] == value'";
+    }
+
+    auto has_side_effect = tir::SideEffect(value) > tir::CallEffectKind::kPure;
+    CHECK(!has_side_effect || !from_assume_statement)
+        << "Buffer value in constraint must be pure expression, but was " << value;
+    if (has_side_effect) {
+      return;
+    }
+
+    {
+      InternalConstraintContext context(this, additional_predicate);
+      VisitAccess(load, BufferTouch::AccessType::Assume, value);
+    }
+    // Appending a control block ensures that all control blocks have
+    // at most one statement that changes the known buffer contents.
+    auto prev_block = CurrentControlBlock();
+    auto new_block = AppendControlBlock();
+    MarkControlFlow(prev_block, new_block);
+  }
+
+  void VisitExpr_(const LetNode* op) override {
+    std::optional<BindLetVar> binding;
+    if (UsesLoopVar(op->value)) {
+      binding.emplace(this, op->var, op->value);
+    }
+    Parent::VisitExpr_(op);
+  }
+
+  void VisitStmt_(const LetStmtNode* op) override {
+    std::optional<BindLetVar> binding;
+    if (UsesLoopVar(op->value)) {
+      binding.emplace(this, op->var, op->value);
+    }
+    Parent::VisitStmt_(op);
+  }
+
+  void VisitExpr_(const BufferLoadNode* op) override {
+    Parent::VisitExpr_(op);
+    BufferLoad load = GetRef<BufferLoad>(op);
+    VisitAccess(load, BufferTouch::AccessType::Read, load);
+  }
+
+  void VisitStmt_(const BufferStoreNode* op) override {
+    Parent::VisitStmt_(op);
+    VisitAccess(GetRef<BufferStore>(op), BufferTouch::AccessType::Write, op->value);
+    // Appending a control block ensures that all control blocks have
+    // at most one statement that changes the buffer contents.
+    auto prev_block = CurrentControlBlock();
+    auto new_block = AppendControlBlock();
+    MarkControlFlow(prev_block, new_block);
+  }
+
+  void VisitStmt_(const ForNode* op) override {
+    out_->iterator_ranges_.Set(op->loop_var, Range::FromMinExtent(op->min, op->extent));
+
+    auto before_loop = CurrentControlBlock();
+    size_t loop_start = -1;
+
+    {
+      BindActiveLoopVar binding(this, op->loop_var, op->min, op->extent);
+      loop_start = AppendControlBlock();
+      Parent::VisitStmt_(op);
+    }
+
+    auto loop_end = CurrentControlBlock();
+    auto after_loop = AppendControlBlock();
+    PrimExpr max_iterator_value = analyzer_.Simplify(op->min + op->extent - 1);
+    {
+      auto [forward, backward] = MarkControlFlow(before_loop, loop_start);
+      backward.post_condition = (op->loop_var == op->min);
+      forward.var_remap = {{op->loop_var, op->min}};
+    }
+    {
+      auto [forward, backward] = MarkControlFlow(loop_end, after_loop);
+      backward.var_remap = {{op->loop_var, max_iterator_value}};
+      forward.post_condition = (op->loop_var == max_iterator_value);
+    }
+    {
+      auto [forward, backward] = MarkControlFlow(loop_end, loop_start);
+      backward.var_remap = {{op->loop_var, op->loop_var - 1}};
+      forward.var_remap = {{op->loop_var, op->loop_var + 1}};
+      backward.post_condition = (op->loop_var > op->min);
+      forward.post_condition = (op->loop_var < max_iterator_value);
+    }
+  }
+
+  void VisitStmt_(const IfThenElseNode* op) override {
+    this->VisitExpr(op->condition);
+
+    PrimExpr real_condition = ExtractRealCondition(op->condition);
+
+    auto before_branching = CurrentControlBlock();
+
+    auto branch_start = AppendControlBlock();
+    MarkControlFlow(before_branching, branch_start);
+
+    {
+      InternalConstraintContext context(this, real_condition);
+      auto then_start = AppendControlBlock();
+      if (context.assume.defined()) {
+        Assume(context.assume.value(), false);
+      }
+      auto [forward, backward] = MarkControlFlow(branch_start, then_start);
+      backward.post_condition = real_condition;
+      forward.post_condition = real_condition;
+      this->VisitStmt(op->then_case);
+    }
+    auto then_end = CurrentControlBlock();
+
+    auto negation = analyzer_.rewrite_simplify(!real_condition);
+    {
+      InternalConstraintContext context(this, negation);
+      auto else_start = AppendControlBlock();
+      if (context.assume.defined()) {
+        Assume(context.assume.value(), false);
+      }
+      auto [forward, backward] = MarkControlFlow(branch_start, else_start);
+      backward.post_condition = negation;
+      forward.post_condition = negation;
+
+      if (op->else_case.defined()) {
+        this->VisitStmt(op->else_case.value());
+      }
+    }
+
+    auto else_end = CurrentControlBlock();
+    auto after_branching = AppendControlBlock();
+
+    if (HasBufferLoad(real_condition)) {
+      // The buffer value may have changed during the body of the
+      // condition, so we can't provide it as a post-condition.
+      MarkControlFlow(then_end, after_branching);
+      MarkControlFlow(else_end, after_branching);
+    } else {
+      {
+        auto [forward, backward] = MarkControlFlow(then_end, after_branching);
+        backward.post_condition = real_condition;
+        forward.post_condition = real_condition;
+      }
+      {
+        auto [forward, backward] = MarkControlFlow(else_end, after_branching);
+        backward.post_condition = negation;
+        forward.post_condition = negation;
+      }
+    }
+  }
+
+  /*! \brief Internal utility, returns true if the expression depends
+   *  on a loop iterator
+   */
+  bool UsesLoopVar(const PrimExpr& expr) {
+    return UsesVar(expr, [&](const VarNode* expr_var) {
+      return loop_dependent_vars_.find(expr_var) != loop_dependent_vars_.end();
+    });
+  }
+
+  /*! \brief Record the interaction with the buffer.
+   *
+   * \param node The TIR node that accesses the buffer.  Should be
+   * either a BufferLoad or BufferStore node.
+   *
+   * \param touch_type The type of buffer access being performed.  A
+   * BufferStore should always use AccessType::Write.  A BufferLoad
+   * may use either AccessType::Read or AccessType::Assume, depending
+   * on whether the BufferLoad occurs within `builtin::assume`.
+   *
+   * \param known_value_expr The value in the buffer following the access.
+   */
+  template <typename BufferAccess>
+  void VisitAccess(const BufferAccess& node, BufferTouch::AccessType touch_type,
+                   PrimExpr known_value_expr) {
+    auto& current_block = out_->control_flow_.back();
+    BufferTouch buffer_touch = current_block.MakeBufferTouch(out_, node->buffer, node->indices,
+                                                             touch_type, known_value_expr);
+    current_block.touch_points.push_back(buffer_touch);
+  }
+
+  /*! \brief Return a predicate for having reached the current
+   *  control-flow block
+   *
+   * For example, while inside an IfThenElse, will return the
+   * IfThenElse's condition.
+   */
+  PrimExpr CurrentScopePredicate() const {
+    PrimExpr predicate = Bool(true);
+    for (const auto& condition : conditions_) {
+      predicate = predicate && condition;
+    }
+    return predicate;
+  }
+
+  /* \brief Add a new control block, returning its index */
+  size_t AppendControlBlock() {
+    size_t index = out_->control_flow_.size();
+    auto& block = out_->control_flow_.emplace_back();
+    block.active_loop_iterators = active_loop_iterators_;
+    block.let_bindings_using_loop = let_bindings_using_loop_;
+    block.scope_predicate = CurrentScopePredicate();
+    return index;
+  }
+
+  /* \brief The index of the current control block */
+  size_t CurrentControlBlock() { return out_->control_flow_.size() - 1; }
+
+  /* \brief Mark a possible control from one block to another
+   *
+   * \param from_block The block from which control leaves
+   *
+   * \param to_block The block to which control enters
+   *
+   * \param var_remap Variable replacements that should be made in
+   * known expression while traversing this edge.  For example,
+   * replacing `i` with `i-1` when entering the next loop iteration,
+   * or replacing `i` with `n-1` when concluding a loop.
+   */
+  std::pair<ControlFlowGraph::ControlFlowEdge&, ControlFlowGraph::ControlFlowEdge&> MarkControlFlow(
+      size_t from_block, size_t to_block) {
+    ICHECK_LE(from_block, out_->control_flow_.size());
+    ICHECK_LE(to_block, out_->control_flow_.size());
+
+    auto& forward = out_->control_flow_[from_block].successors.emplace_back(
+        ControlFlowGraph::ControlFlowEdge{to_block, {}, NullOpt});
+    auto& backward = out_->control_flow_[to_block].predecessors.emplace_back(
+        ControlFlowGraph::ControlFlowEdge{from_block, {}, NullOpt});
+    return {forward, backward};
+  }
+
+  // Internal utility, context manager for entering/leaving a scoped constraint
+  struct InternalConstraintContext {
+    InternalConstraintContext(ControlFlowGraphBuilder* self, PrimExpr constraint)
+        : self(self), analyzer_context(&self->analyzer_, constraint) {
+      old_num_constraints = self->conditions_.size();
+
+      auto side_effect = tir::SideEffect(constraint);
+      if (side_effect <= tir::CallEffectKind::kPure) {
+        self->conditions_.push_back(constraint);
+      } else if (side_effect <= tir::CallEffectKind::kReadState) {
+        assume = constraint;
+      }
+
+      new_num_constraints = self->conditions_.size();
+    }
+    ~InternalConstraintContext() {
+      ICHECK_EQ(self->conditions_.size(), new_num_constraints)
+          << "Internal error: Each condition should only be popped once.";
+      self->conditions_.erase(self->conditions_.begin() + old_num_constraints,
+                              self->conditions_.end());
+    }
+
+    ControlFlowGraphBuilder* self{nullptr};
+    With<ConstraintContext> analyzer_context;
+    size_t old_num_constraints{0};
+    size_t new_num_constraints{0};
+    Optional<PrimExpr> assume{NullOpt};
+
+    // Disable default-generated copy/move assignment and constructors
+    InternalConstraintContext(const InternalConstraintContext&) = delete;
+    InternalConstraintContext& operator=(const InternalConstraintContext&) = delete;
+    InternalConstraintContext(InternalConstraintContext&&) = delete;
+    InternalConstraintContext& operator=(InternalConstraintContext&&) = delete;
+  };
+
+  // Internal utility, context manager for tracking a loop
+  struct BindActiveLoopVar {
+    BindActiveLoopVar(ControlFlowGraphBuilder* self, Var var, PrimExpr loop_min,
+                      PrimExpr loop_extent)
+        : self(self), var(var) {
+      PrimExpr loop_max = loop_min + (loop_extent - 1);
+      auto loop_range = Range::FromMinExtent(loop_min, loop_extent);
+      self->active_loop_iterators_.push_back({var, loop_min, loop_max, loop_range});
+      self->loop_dependent_vars_.insert(var.get());
+    }
+    ~BindActiveLoopVar() { self->active_loop_iterators_.pop_back(); }
+
+    ControlFlowGraphBuilder* self;
+    Var var;
+
+    // Disable default-generated copy/move assignment and constructors
+    BindActiveLoopVar(const BindActiveLoopVar&) = delete;
+    BindActiveLoopVar& operator=(const BindActiveLoopVar&) = delete;
+    BindActiveLoopVar(BindActiveLoopVar&&) = delete;
+    BindActiveLoopVar& operator=(BindActiveLoopVar&&) = delete;
+  };
+
+  // Internal utility, context manager for tracking a variable binding
+  struct BindLetVar {
+    BindLetVar(ControlFlowGraphBuilder* self, Var var, PrimExpr value) : self(self), var(var) {
+      self->let_bindings_using_loop_.Set(var, value);
+      self->loop_dependent_vars_.insert(var.get());
+    }
+    ~BindLetVar() {
+      self->loop_dependent_vars_.erase(var.get());
+      self->let_bindings_using_loop_.erase(var);
+    }
+    ControlFlowGraphBuilder* self;
+    Var var;
+
+    // Disable default-generated copy/move assignment and constructors
+    BindLetVar(const BindLetVar&) = delete;
+    BindLetVar& operator=(const BindLetVar&) = delete;
+    BindLetVar(BindLetVar&&) = delete;
+    BindLetVar& operator=(BindLetVar&&) = delete;
+  };
+
+  struct LoopEntry {
+    Var loop_var;
+    PrimExpr loop_min;
+    PrimExpr loop_max;
+    Range loop_range;
+  };
+
+  // Track in order to know which Vars to write in terms of the buffer
+  // indices and substitute out of the predicate.
+  std::vector<ControlFlowGraph::ControlFlowBlock::LoopEntry> active_loop_iterators_;
+
+  // Track all loop iterators, along with values derived from loop iterators.
+  std::unordered_set<const VarNode*> loop_dependent_vars_;
+
+  // Any let binding that depends, directly or indirectly, on a loop
+  // binding.  When making a predicate in terms of the buffer indices,
+  // these need to be substituted out.
+  // std::unordered_map<const VarNode*, PrimExpr> let_bindings_using_loop_;
+  Map<Var, PrimExpr> let_bindings_using_loop_;
+
+  // Track in order to know what conditions limit the buffer access
+  std::vector<PrimExpr> conditions_;
+
+  // Track in order to know what statement initiated the buffer access
+  Stmt current_stmt_;
+
+  // Output data structure
+  ControlFlowGraph* out_;
+};
+
+std::pair<BufferTouch, Map<Var, Range>> ControlFlowGraph::ControlFlowBlock::MakeBufferTouch(
+    const tir::Buffer& buf, Array<Var> index_variables, Array<PrimExpr> indices,
+    BufferTouch::AccessType touch_type, PrimExpr known_value_expr) const {
+  const auto& current_block = *this;
+
+  Analyzer local_analyzer;
+
+  Optional<Var> lane_var = NullOpt;
+  IntImm num_lanes;
+
+  Array<PrimExpr> index_expressions = indices.Map([&](const auto& index) {
+    if (index.dtype().lanes() == 1) {
+      return index;
+    } else {
+      ICHECK(!lane_var) << "Multiple indices found with non-scalar values";
+      lane_var = Var("lane", index.dtype().element_of());
+      num_lanes = IntImm(index.dtype().element_of(), index.dtype().lanes());
+      return UnwrapVectorExpr(index, lane_var.value());
+    }
+  });
+
+  Array<Var> loop_vars;
+
+  Map<Var, Range> loop_ranges;
+  for (const auto& loop_entry : current_block.active_loop_iterators) {
+    loop_vars.push_back(loop_entry.loop_var);
+    loop_ranges.Set(loop_entry.loop_var, loop_entry.loop_range);
+  }
+
+  // If the indices contain multiple lanes, treat the lane variable
+  // as an additional loop iterator to be solved for and substituted
+  // out.
+  if (lane_var) {
+    loop_vars.push_back(lane_var.value());
+    loop_ranges.Set(lane_var.value(), Range::FromMinExtent(0, num_lanes));
+  }
+
+  IntConstraintsTransform transform = [&]() {
+    ICHECK_EQ(index_variables.size(), index_expressions.size());
+
+    Array<PrimExpr> relations;
+
+    for (size_t i = 0; i < index_expressions.size(); i++) {
+      PrimExpr expr = index_expressions[i];
+      Var var = index_variables[i];
+
+      expr = Substitute(expr, current_block.let_bindings_using_loop);
+      relations.push_back(var == expr);
+    }
+
+    IntConstraints system(loop_vars, loop_ranges, relations);
+    return arith::SolveLinearEquations(system);
+  }();
+
+  Map<Var, PrimExpr> loop_var_to_axis_var = transform->src_to_dst;
+  Map<Var, Range> free_params = transform->dst->ranges;
+  PrimExpr transform_predicate =
+      std::accumulate(transform->dst->relations.begin(), transform->dst->relations.end(),
+                      PrimExpr(Bool(true)), [](PrimExpr a, PrimExpr b) { return a && b; });
+
+  transform_predicate = SimplifyAsAndOfOrs(transform_predicate, &local_analyzer);
+
+  auto find_removable_params = [&]() -> Map<Var, PrimExpr> {
+    Map<Var, PrimExpr> removable_params;
+
+    // The arith::SolveLinearEquations is more general than the
+    // utilities in iter_affine_map.h, but can introduce free
+    // parameters that could later be determined with the known
+    // constraints.  This step removes all such free parameters.
+    for (const auto& expr : ExtractConstraints(transform_predicate)) {
+      if (auto* as_equal = expr.as<EQNode>()) {
+        auto check_expr = [&](const PrimExpr& a, const PrimExpr& b) {
+          auto* var_ptr = a.as<VarNode>();
+          if (!var_ptr) {
+            return;
+          }
+
+          Var var = GetRef<Var>(var_ptr);
+          if (free_params.count(var) == 0) {
+            return;
+          }
+
+          bool uses_free_param =
+              UsesVar(b, [&](const VarNode* v) { return free_params.count(GetRef<Var>(v)) > 0; });
+          if (uses_free_param) {
+            return;
+          }
+          removable_params.Set(var, b);
+        };
+        check_expr(as_equal->a, as_equal->b);
+        check_expr(as_equal->b, as_equal->a);
+      }
+    }
+
+    // In addition, the arith::SolveLinearEquation can introduce
+    // free parameters with an extent of one.  Filtering them out here
+    // avoids needing to track them through later simplifications.
+    for (const auto [var, range] : free_params) {
+      if (is_one(range->extent)) {
+        removable_params.Set(var, range->min);
+      }
+    }
+
+    return removable_params;
+  };
+  for (auto removable_params = find_removable_params(); removable_params.size() > 0;
+       removable_params = find_removable_params()) {
+    auto update = [&](const PrimExpr& expr) {
+      return local_analyzer.Simplify(Substitute(expr, removable_params));
+    };
+
+    Map<Var, PrimExpr> new_map;
+    for (const auto [loop_var, expr] : loop_var_to_axis_var) {
+      static_cast<void>(expr);  // gcc 7.x bug, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
+      new_map.Set(loop_var, update(expr));
+    }
+    loop_var_to_axis_var = new_map;
+
+    transform_predicate = update(transform_predicate);
+
+    for (const auto [var, expr] : removable_params) {
+      static_cast<void>(expr);  // gcc 7.x bug, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
+      free_params.erase(var);
+    }
+  }
+
+  // Normalization function, applied to both the predicate and the
+  // known value.  Converts from an expression in terms of loop
+  // iterators to an expression in terms of buffer indices.
+  auto normalize_expr = [&](PrimExpr expr) -> PrimExpr {
+    expr = Substitute(expr, current_block.let_bindings_using_loop);
+
+    if (lane_var) {
+      expr = UnwrapVectorExpr(expr, lane_var.value());
+    }
+    expr = Substitute(expr, loop_var_to_axis_var);
+
+    return expr;
+  };
+
+  // Collect the current loop variables, along with an expression for
+  // the loop variables in terms of the buffer axis variables.  This
+  // is used during forward/backward propagation to generate predicate
+  // tracking whether a loop iteration has been reached.
+  std::vector<std::pair<Var, PrimExpr>> loop_var_expressions;
+  for (const auto& entry : current_block.active_loop_iterators) {
+    auto expr_it = loop_var_to_axis_var.find(entry.loop_var);
+    ICHECK(expr_it != loop_var_to_axis_var.end());
+    loop_var_expressions.push_back({entry.loop_var, (*expr_it).second});
+  }
+
+  // The full predicate is composed of the values required to reach
+  // the scope of the BufferStore or builtin::assume(), any bounds
+  // implied by solving for the axis variables, and any additional
+  // statements resulting from unpacking the expression contained in
+  // builtin::assume().
+  PrimExpr scope_predicate = normalize_expr(current_block.scope_predicate);
+  transform_predicate = normalize_expr(transform_predicate);
+
+  known_value_expr = local_analyzer.Simplify(normalize_expr(known_value_expr));
+
+  // Deliberately use an analyzer without scope-based information,
+  // to avoid simplifying `scope_predicate` to True.
+  PrimExpr predicate_expr = local_analyzer.Simplify(transform_predicate && scope_predicate);
+
+  BufferTouch buffer_touch = {buf, predicate_expr, known_value_expr, loop_var_expressions,
+                              touch_type};
+
+  return {buffer_touch, free_params};
+}
+
+BufferTouch ControlFlowGraph::ControlFlowBlock::MakeBufferTouch(ControlFlowGraph* graph,
+                                                                const tir::Buffer& buf,
+                                                                const Array<PrimExpr>& indices,
+                                                                BufferTouch::AccessType touch_type,
+                                                                PrimExpr known_value_expr) const {
+  ICHECK(graph);
+  auto [buffer_touch, free_params] = MakeBufferTouch(buf, graph->GetIndexVariables(buf, indices),
+                                                     indices, touch_type, known_value_expr);
+  for (const auto& pair : free_params) {
+    graph->free_predicate_parameters_.Set(pair.first, pair.second);
+  }
+  return buffer_touch;
+}
+
+ControlFlowGraph::ControlFlowGraph(const tir::Stmt& stmt, size_t max_revisits) {
+  ControlFlowGraphBuilder::Build(this, stmt);
+  ForwardPropagateKnownValues(max_revisits);
+  BackwardPropagateUnusedValues(max_revisits);
+}
+
+std::ostream& operator<<(std::ostream& os, const ControlFlowGraph::ControlFlowEdge& edge) {
+  os << edge.index;
+  if (edge.var_remap.size()) {
+    os << " with remap " << edge.var_remap;
+  }
+  if (edge.post_condition) {
+    os << " with postcondition " << edge.post_condition;
+  }
+
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const ControlFlowGraph::ControlFlowBlock& block) {
+  os << "Predecessors: [";
+  for (size_t i = 0; i < block.predecessors.size(); i++) {
+    if (i) {
+      os << ", ";
+    }
+    os << block.predecessors[i];
+  }
+  os << "]\n";
+
+  os << "Active loop iterators: [";
+  for (size_t i = 0; i < block.active_loop_iterators.size(); i++) {
+    if (i) {
+      os << ", ";
+    }
+    os << block.active_loop_iterators[i].loop_var;
+  }
+  os << "]\n";
+
+  os << "Before block knowns: " << block.known_at_block_start << "\n";
+
+  os << "Before block unused: " << block.unused_at_block_start << "\n";
+
+  for (size_t i = 0; i < block.touch_points.size(); i++) {
+    os << "Touch[" << i << "] = " << block.touch_points[i] << "\n";
+  }
+  os << "After block: " << block.known_at_block_end << "\n";
+
+  os << "After block unused: " << block.unused_at_block_end << "\n";
+
+  os << "Successors: [";
+  for (size_t i = 0; i < block.successors.size(); i++) {
+    if (i) {
+      os << ", ";
+    }
+    os << block.successors[i];
+  }
+  os << "]";
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const ControlFlowGraph& pattern) {
+  os << "Touch pattern contains " << pattern.control_flow_.size() << " control blocks."
+     << (pattern.control_flow_.size() ? "\n" : "");
+  for (size_t i = 0; i < pattern.control_flow_.size(); i++) {
+    os << "\t"
+       << "ControlBlock[" << i << "] = " << pattern.control_flow_[i] << "\n";
+  }
+
+  return os;
+}
+
+bool BufferTouch::IsEquivalentTo(const BufferTouch& other, Analyzer* analyzer) const {
+  // Constraints must apply to the same buffer to be equivalent
+  if (!buffer.same_as(other.buffer) || touch_type != other.touch_type) {
+    return false;
+  }
+
+  ExprDeepEqual deep_equal;
+
+  auto implies = [&](const PrimExpr& a, const PrimExpr& b) -> bool {
+    With<ConstraintContext> context(analyzer, a);
+    return analyzer->CanProve(b);
+  };
+
+  // Predicates must be equivalent expressions, or must both be undefined
+  bool equivalent_predicates =
+      deep_equal(predicate, other.predicate) ||
+      (implies(predicate, other.predicate) && implies(other.predicate, predicate));
+  if (!equivalent_predicates) {
+    return false;
+  }
+
+  // The known value must be equal
+  if (!deep_equal(value, other.value) && !analyzer->CanProveEqual(value, other.value)) {
+    return false;
+  }
+
+  return true;
+}
+
+std::ostream& operator<<(std::ostream& os, const BufferState& state) {
+  for (size_t i = 0; i < state.constraints_.size(); i++) {
+    os << "constraints[" << i << "] = " << state.constraints_[i]
+       << (i + 1 == state.constraints_.size() ? "" : "\n");
+  }
+  return os;
+}
+
+PrimExpr BufferState::SubstituteKnownBufferValues(
+    PrimExpr expr, const Map<tir::Buffer, Array<tir::Var>>& axis_var_lookup,
+    Analyzer* analyzer) const {
+  BufferConstraintApply mutator(axis_var_lookup, constraints_, analyzer);
+  return mutator(std::move(expr));
+}
+
+void BufferState::AddCondition(const PrimExpr& condition) {
+  for (auto& constraint : constraints_) {
+    constraint.predicate = constraint.predicate && condition;
+  }
+}
+
+void BufferState::Substitute(const Map<Var, PrimExpr>& var_remap, Analyzer* analyzer) {
+  if (var_remap.size()) {
+    for (auto& prior : constraints_) {
+      PrimExpr updated = tvm::tir::Substitute(prior.predicate, var_remap);
+      if (!updated.same_as(prior.predicate)) {
+        prior.predicate = SimplifyAsAndOfOrs(updated, analyzer);
+      }
+    }
+  }
+}
+
+void BufferState::Simplify(Analyzer* analyzer) {
+  for (auto& constraint : constraints_) {
+    constraint.predicate = SimplifyAsAndOfOrs(constraint.predicate, analyzer);
+  }
+}
+
+void BufferState::Union(const BufferState& b, Analyzer* analyzer) {
+  for (const auto& b_constraint : b.constraints_) {
+    bool used = false;
+    for (auto& a_constraint : constraints_) {
+      if (a_constraint.buffer.same_as(b_constraint.buffer) &&
+          analyzer->CanProveEqual(a_constraint.value, b_constraint.value)) {
+        a_constraint.predicate =
+            SimplifyAsAndOfOrs(a_constraint.predicate || b_constraint.predicate, analyzer);
+        used = true;
+        break;
+      }
+    }
+    if (!used) {
+      constraints_.push_back(b_constraint);
+    }
+  }
+}
+
+void BufferState::Intersection(const BufferState& b, Analyzer* analyzer) {
+  // For a constraint to be in the output, it must be present in both
+  // inputs.
+
+  std::vector<BufferTouch> new_constraints;
+  for (const auto& ai : constraints_) {
+    for (const auto& bi : b.constraints_) {
+      if (ai.buffer.same_as(bi.buffer)) {
+        PrimExpr predicate = SimplifyAsAndOfOrs(ai.predicate && bi.predicate, analyzer);
+        if (!is_zero(predicate)) {
+          With<ConstraintContext> context(analyzer, predicate);
+          PrimExpr known_value_a = ai.value;
+          PrimExpr known_value_b = bi.value;
+
+          bool is_consistent = analyzer->CanProveEqual(known_value_a, known_value_b);
+          if (is_consistent) {
+            new_constraints.push_back({ai.buffer, predicate, known_value_a});
+          }
+        }
+      }
+    }
+  }
+
+  constraints_ = std::move(new_constraints);
+}
+
+class BufferRegionCollector : public ExprVisitor {
+ public:
+  struct Region {
+    PrimExpr region_predicate;
+    std::unordered_map<const BufferLoadNode*, Optional<PrimExpr>> known_values;
+  };
+
+  static std::vector<Region> Collect(const Map<Buffer, Array<Var>>& axis_var_lookup,
+                                     const std::vector<BufferTouch>& knowns,
+                                     const std::vector<Optional<PrimExpr>>& exprs,
+                                     Analyzer* analyzer) {
+    BufferRegionCollector collector(axis_var_lookup, knowns, analyzer);
+    for (const auto& expr : exprs) {
+      if (expr) {
+        collector(expr.value());
+      }
+    }
+
+    return collector.regions_;
+  }
+
+ private:
+  using Parent = ExprVisitor;
+
+  BufferRegionCollector(const Map<Buffer, Array<Var>>& axis_var_lookup,
+                        const std::vector<BufferTouch>& knowns, Analyzer* analyzer)
+      : analyzer_(analyzer), axis_var_lookup_(axis_var_lookup), knowns_(knowns) {
+    regions_.push_back(Region{Bool(true), {}});
+  }
+
+  using Parent::VisitExpr_;
+
+  void VisitExpr_(const BufferLoadNode* op) override {
+    // Helper struct for the known values of this BufferLoad
+    struct Known {
+      PrimExpr predicate;
+      Optional<PrimExpr> value;
+    };
+
+    std::vector<Known> new_regions;
+
+    PrimExpr unknown_region = Bool(true);
+
+    for (const BufferTouch& constraint : knowns_) {
+      if (!op->buffer.same_as(constraint.buffer)) {
+        // This is a different buffer, so continue searching.
+        continue;
+      }
+
+      auto axis_vars = axis_var_lookup_.at(op->buffer);
+      PrimExpr touch_predicate =
+          SubstituteParamValues(axis_vars, op->indices, constraint.predicate).value();
+      touch_predicate = SimplifyAsAndOfOrs(touch_predicate, analyzer_);
+
+      if (!is_zero(touch_predicate)) {
+        Optional<PrimExpr> known_value =
+            SubstituteParamValues(axis_vars, op->indices, constraint.value);
+        new_regions.push_back(Known{touch_predicate, known_value});
+
+        unknown_region = unknown_region && !touch_predicate;
+        unknown_region = SimplifyAsAndOfOrs(unknown_region, analyzer_);
+      }
+    }
+
+    if (new_regions.size()) {
+      Analyzer local_analyzer;
+
+      if (!is_zero(unknown_region)) {
+        new_regions.insert(new_regions.begin(), Known{unknown_region, NullOpt});
+      }
+
+      std::vector<Region> updated_regions;
+      for (const auto& prev_region : regions_) {
+        for (const auto& new_region : new_regions) {
+          PrimExpr intersection =
+              SimplifyAsAndOfOrs(prev_region.region_predicate && new_region.predicate, analyzer_);
+
+          if (!is_zero(intersection)) {
+            Region merged{intersection, prev_region.known_values};
+            merged.known_values[op] = new_region.value;
+            updated_regions.push_back(std::move(merged));
+          }
+        }
+      }
+      regions_ = updated_regions;
+    }
+  }
+
+  Analyzer* analyzer_;
+  std::vector<Region> regions_;
+  const Map<Buffer, Array<Var>>& axis_var_lookup_;
+  const std::vector<BufferTouch>& knowns_;
+};
+
+class BufferRegionValueReplacer : public IRMutatorWithAnalyzer {
+ public:
+  static PrimExpr Apply(
+      const std::unordered_map<const BufferLoadNode*, Optional<PrimExpr>>& known_values,
+      PrimExpr expr, Analyzer* analyzer) {
+    BufferRegionValueReplacer mutator(known_values, analyzer);
+    PrimExpr result = mutator(expr);
+    // Simplification must occur after the substitution, as known
+    // values may provide enable simplifications.  Also, cannot track
+    // whether a BufferLoad was
+    result = analyzer->Simplify(result);
+    return result;
+  }
+
+ private:
+  using Parent = IRMutatorWithAnalyzer;
+
+  BufferRegionValueReplacer(
+      const std::unordered_map<const BufferLoadNode*, Optional<PrimExpr>>& known_values,
+      Analyzer* analyzer)
+      : Parent(analyzer), known_values_(known_values) {}
+
+  using Parent::VisitExpr_;
+
+  PrimExpr VisitExpr_(const BufferLoadNode* op) override {
+    auto it = known_values_.find(op);
+    if (it != known_values_.end() && it->second) {
+      return it->second.value();
+    } else {
+      return GetRef<PrimExpr>(op);
+    }
+  }
+
+  const std::unordered_map<const BufferLoadNode*, Optional<PrimExpr>>& known_values_;
+};
+
+void BufferState::ApplyTouches(const Map<Buffer, Array<Var>>& axis_var_lookup,
+                               const std::vector<BufferTouch>& touch_points, Analyzer* analyzer) {
+  std::vector<BufferTouch> new_knowns;
+  Map<Buffer, PrimExpr> keep_prior_known_at;
+
+  for (auto& touch : touch_points) {
+    if (touch.touch_type == BufferTouch::AccessType::Read) {
+      continue;
+    }
+
+    PrimExpr known_value = touch.value;
+
+    PrimExpr predicate = touch.predicate && touch.AfterLoopIteration();
+    auto regions = BufferRegionCollector::Collect(axis_var_lookup, constraints_,
+                                                  {predicate, touch.value}, analyzer);
+
+    for (const auto& region : regions) {
+      PrimExpr updated_predicate = BufferRegionValueReplacer::Apply(
+          region.known_values, region.region_predicate && predicate, analyzer);
+
+      updated_predicate = SimplifyAsAndOfOrs(updated_predicate, analyzer);
+      PrimExpr updated_value =
+          BufferRegionValueReplacer::Apply(region.known_values, known_value, analyzer);
+
+      if (!is_zero(updated_predicate)) {
+        if (auto it = keep_prior_known_at.find(touch.buffer); it != keep_prior_known_at.end()) {
+          keep_prior_known_at.Set(touch.buffer, (*it).second && !updated_predicate);
+        } else {
+          keep_prior_known_at.Set(touch.buffer, !updated_predicate);
+        }
+
+        if (!HasBufferLoad(updated_value)) {
+          BufferTouch new_constraint{touch.buffer, updated_predicate, updated_value};
+          new_knowns.push_back(new_constraint);
+        }
+      }
+    }
+  }
+
+  if (keep_prior_known_at.size()) {
+    for (auto& constraint : constraints_) {
+      if (auto it = keep_prior_known_at.find(constraint.buffer); it != keep_prior_known_at.end()) {
+        constraint.predicate = SimplifyAsAndOfOrs(constraint.predicate && (*it).second, analyzer);
+      }
+    }
+  }
+
+  if (new_knowns.size()) {
+    std::vector<bool> used(new_knowns.size(), false);
+
+    for (auto& constraint : constraints_) {
+      PrimExpr expand_known_at = Bool(false);
+
+      PrimExpr prev_value = constraint.value;
+
+      for (size_t i = 0; i < new_knowns.size(); i++) {
+        if (new_knowns[i].buffer.same_as(constraint.buffer)) {
+          Optional<PrimExpr> overwritten_with = new_knowns[i].value;
+          if (overwritten_with && analyzer->CanProveEqual(prev_value, overwritten_with.value())) {
+            expand_known_at =
+                SimplifyAsAndOfOrs(expand_known_at || new_knowns[i].predicate, analyzer);
+            used[i] = true;
+          }
+        }
+      }
+
+      if (!is_zero(expand_known_at)) {
+        constraint.predicate =
+            SimplifyAsAndOfOrs(constraint.predicate || expand_known_at, analyzer);
+      }
+    }
+
+    for (size_t i = 0; i < new_knowns.size(); i++) {
+      if (!used[i]) {
+        constraints_.push_back(new_knowns[i]);
+      }
+    }
+  }
+
+  constraints_.erase(
+      std::remove_if(constraints_.begin(), constraints_.end(),
+                     [&](const auto& constraint) { return is_zero(constraint.predicate); }),
+      constraints_.end());
+}
+
+void BufferState::BackpropUnusedIndices(const Map<Buffer, Array<Var>>& axis_var_lookup,
+                                        const std::vector<BufferTouch>& touch_points,
+                                        Analyzer* analyzer) {
+  std::vector<BufferTouch> new_knowns;
+  Map<Buffer, PrimExpr> keep_prior_known_at;
+
+  Map<Buffer, PrimExpr> regions_written;
+  Map<Buffer, PrimExpr> regions_read;
+  for (auto it = touch_points.rbegin(); it != touch_points.rend(); it++) {
+    const auto& touch = *it;
+
+    Map<Buffer, PrimExpr>* to_update{nullptr};
+    if (touch.touch_type == BufferTouch::AccessType::Write) {
+      to_update = &regions_written;
+
+    } else if (touch.touch_type == BufferTouch::AccessType::Read) {
+      to_update = &regions_read;
+    } else {
+      continue;
+    }
+
+    PrimExpr prev = to_update->Get(touch.buffer).value_or(Bool(false));
+    PrimExpr new_predicate = touch.predicate && touch.BeforeLoopIteration();
+    to_update->Set(touch.buffer, prev || new_predicate);
+  }
+
+  auto update_map = [&](auto& map) {
+    Map<Buffer, PrimExpr> new_map;
+    for (auto [buffer, predicate] : map) {
+      new_map.Set(buffer, SimplifyAsAndOfOrs(predicate, analyzer));
+    }
+    map = std::move(new_map);
+  };
+  update_map(regions_written);
+  update_map(regions_read);
+
+  // If buffer is already in used, widen the predicate
+  for (auto& prev_unused : constraints_) {
+    if (auto opt_predicate = regions_written.Get(prev_unused.buffer)) {
+      PrimExpr new_predicate = prev_unused.predicate || opt_predicate.value();
+      prev_unused.predicate = SimplifyAsAndOfOrs(new_predicate, analyzer);
+      regions_written.erase(prev_unused.buffer);
+    }
+  }
+
+  // Otherwise, add new "touch" to represent the unused values
+  for (auto [buffer, predicate] : regions_written) {
+    constraints_.push_back(
+        BufferTouch{buffer, predicate, tir::Call(buffer->dtype, builtin::undef(), {})});
+  }
+
+  // If buffer is read out, narrow the predicate
+  for (auto& prev_unused : constraints_) {
+    if (auto opt_pred = regions_read.Get(prev_unused.buffer)) {
+      PrimExpr predicate = opt_pred.value();
+      prev_unused.predicate = SimplifyAsAndOfOrs(prev_unused.predicate && !predicate, analyzer);
+    }
+  }
+
+  // Clean-up and remove any empty constraints
+  constraints_.erase(
+      std::remove_if(constraints_.begin(), constraints_.end(),
+                     [](const auto& constraint) { return is_zero(constraint.predicate); }),
+      constraints_.end());
+}
+
+void BufferState::RemoveFreeParameters(const Map<Var, Range>& free_predicate_parameters,
+                                       Analyzer* analyzer) {
+  for (auto& known : constraints_) {
+    known.predicate = NarrowPredicateExpression(known.predicate, free_predicate_parameters);
+    known.predicate = SimplifyAsAndOfOrs(known.predicate, analyzer);
+  }
+}
+
+bool BufferState::IsEquivalentTo(const BufferState& other, Analyzer* analyzer) const {
+  if (constraints_.size() != other.constraints_.size()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < constraints_.size(); i++) {
+    if (!constraints_[i].IsEquivalentTo(other.constraints_[i], analyzer)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+Optional<Array<Var>> ControlFlowGraph::GetIndexVariables(const Buffer& buf) const {
+  if (auto it = axis_var_lookup_.find(buf); it != axis_var_lookup_.end()) {
+    return (*it).second;
+  } else {
+    return NullOpt;
+  }
+}
+
+Array<Var> ControlFlowGraph::GetIndexVariables(const Buffer& buf, const Array<PrimExpr>& indices) {
+  if (auto it = axis_var_lookup_.find(buf); it != axis_var_lookup_.end()) {
+    return (*it).second;
+  }
+
+  Array<Var> vars;
+  for (size_t i = 0; i < indices.size(); i++) {
+    std::stringstream ss;
+    ss << buf->name << "_axis_" << i;
+    vars.push_back(Var(ss.str(), indices[i].dtype().element_of()));
+  }
+
+  axis_var_lookup_.Set(buf, vars);
+  return vars;
+}
+
+void ControlFlowGraph::ForwardPropagateKnownValues(size_t max_revisits) {
+  // Values to visit when searching.  Using a std::set to
+  // preferentially visit nodes near the start of the control flow.
+  std::set<size_t> to_visit;
+
+  // Map from a block's index
+  std::unordered_map<size_t, size_t> visit_count_lookup;
+
+  // Initiatize the locations to search from, propagating values
+  // forward from all locations that have a known value.
+  for (size_t i = 0; i < control_flow_.size(); i++) {
+    bool has_known_value = false;
+    for (const auto& touch : control_flow_[i].touch_points) {
+      if (!HasBufferLoad(touch.value)) {
+        has_known_value = true;
+        break;
+      }
+    }
+
+    if (has_known_value) {
+      to_visit.insert(i);
+    }
+  }
+
+  Analyzer analyzer;
+  analyzer.rewrite_simplify.SetEnabledExtensions(arith::RewriteSimplifier::Extension(
+      arith::RewriteSimplifier::kTransitivelyProveInequalities |
+      arith::RewriteSimplifier::kApplyConstraintsToBooleanBranches));
+
+  analyzer.Bind(iterator_ranges_);
+  analyzer.Bind(free_predicate_parameters_);
+
+  while (to_visit.size()) {
+    size_t visiting = *to_visit.begin();
+    to_visit.erase(visiting);
+
+    size_t num_previous_visits = visit_count_lookup[visiting]++;
+
+    ControlFlowBlock& block = control_flow_[visiting];
+
+    // Step 1: Collect known values provided from each predecessor
+    block.known_at_block_start = [&]() -> BufferState {
+      if (num_previous_visits >= max_revisits) {
+        return BufferState();
+      }
+
+      // Validate internal constraint.  This should be true by
+      // construction, as ControlFlowGraphBuilder only builds graphs
+      // that have two or fewer predecessors.
+      ICHECK_LE(block.predecessors.size(), 2)
+          << "InternalError: Each block should have at most two predecessors.  "
+          << "Graph constructed in ControlFlowGraphBuilder did not satisfy this constraint.";
+
+      std::vector<BufferState> states;
+      for (const auto& pred : block.predecessors) {
+        const auto& pred_block = control_flow_[pred.index];
+        BufferState state = pred_block.known_at_block_end;
+        state.Substitute(pred.var_remap, &analyzer);
+        states.push_back(state);
+      }
+
+      if (std::all_of(block.predecessors.begin(), block.predecessors.end(),
+                      [&](const auto& pred) { return visit_count_lookup[pred.index] == 0; })) {
+        // Predecessors, if any, are unvisited.
+        return {};
+      } else if (block.predecessors.size() == 1) {
+        // Block has only a single predecessor
+        return states[0];
+      }
+
+      const auto& pred_a = block.predecessors[0];
+      const auto& pred_b = block.predecessors[1];
+
+      auto& priors_a = states[0];
+      auto& priors_b = states[1];
+
+      // During the first visit of a block, predecessor blocks may be
+      // unvisited, even though we preferentially visit earlier blocks
+      // first.  (e.g. During the first visit of the start of a For
+      // loop, the end of the For loop has not yet been visited.)  If
+      // this is the case, assume the best-case scenario that all
+      // knowns are consistent, and rely on a later visit to
+      // resolve/remove any conflicts.
+      if (visit_count_lookup[pred_a.index] == 0) {
+        return priors_b;
+      } else if (visit_count_lookup[pred_b.index] == 0) {
+        return priors_a;
+      }
+
+      if (pred_a.post_condition && pred_b.post_condition) {
+        // The predicate can identify which predecessor block applies
+        // (e.g. i==0 for the first loop iteration, i>0 for remaining
+        // loop iterations).  Therefore, we can use all buffer
+        // constraints, conditional on having come from the
+        // predecessor that provides it.
+        priors_a.AddCondition(pred_a.post_condition.value());
+        priors_b.AddCondition(pred_b.post_condition.value());
+        priors_a.Union(priors_b, &analyzer);
+        return priors_a;
+      } else {
+        // We don't know which predecessor applies.  Therefore, the
+        // only buffer constraints that can be used are those that
+        // appear in both predecessors.
+        priors_a.Intersection(priors_b, &analyzer);
+        return priors_a;
+      }
+    }();
+
+    // Step 2: Collect knowns provided as a result of executing this block
+    auto post_state = [&]() {
+      if (num_previous_visits >= max_revisits) {
+        return BufferState();
+      }
+      auto post_state = block.known_at_block_start;
+      post_state.ApplyTouches(axis_var_lookup_, block.touch_points, &analyzer);
+      post_state.RemoveFreeParameters(free_predicate_parameters_, &analyzer);
+      return post_state;
+    }();
+
+    // Step 3: If any changes are made to the post knowns since the
+    // previous time we visited this block, mark the successor block
+    // as needing to be visited.
+    if (num_previous_visits == 0 ||
+        !post_state.IsEquivalentTo(block.known_at_block_end, &analyzer)) {
+      block.known_at_block_end = std::move(post_state);
+      for (const auto& successor : block.successors) {
+        to_visit.insert(successor.index);
+      }
+    }
+  }
+}
+
+void ControlFlowGraph::BackwardPropagateUnusedValues(size_t max_revisits) {
+  // Values to visit when searching.  Using a std::set to
+  // preferentially visit nodes near the end of the control flow.
+  std::set<size_t> to_visit;
+
+  // Map from a block's index
+  std::unordered_map<size_t, size_t> visit_count_lookup;
+
+  // Initiatize the locations to search from, propagating values
+  // backward from anywhere that performs a write.
+  for (size_t i = 0; i < control_flow_.size(); i++) {
+    const auto& touch_points = control_flow_[i].touch_points;
+    bool performs_write = std::any_of(
+        touch_points.begin(), touch_points.end(),
+        [](const auto& touch) { return touch.touch_type == BufferTouch::AccessType::Write; });
+    if (performs_write) {
+      to_visit.insert(i);
+    }
+  }
+
+  Analyzer analyzer;
+  analyzer.rewrite_simplify.SetEnabledExtensions(
+      arith::RewriteSimplifier::kTransitivelyProveInequalities);
+
+  analyzer.Bind(iterator_ranges_);
+  analyzer.Bind(free_predicate_parameters_);
+
+  while (to_visit.size()) {
+    size_t visiting = *to_visit.rbegin();
+    to_visit.erase(visiting);
+
+    size_t num_previous_visits = visit_count_lookup[visiting]++;
+
+    ControlFlowBlock& block = control_flow_[visiting];
+
+    // Step 1: Collect known unused indices provided by each successor
+    block.unused_at_block_end = [&]() -> BufferState {
+      if (num_previous_visits >= max_revisits) {
+        return BufferState();
+      }
+      ICHECK_LE(block.successors.size(), 2)
+          << "Each block should have at most two successors, but block " << visiting
+          << " breaks this requirement";
+
+      std::vector<BufferState> states;
+      for (const auto& successor : block.successors) {
+        const auto& successor_block = control_flow_[successor.index];
+        BufferState state = successor_block.unused_at_block_start;
+        state.Substitute(successor.var_remap, &analyzer);
+        states.push_back(state);
+      }
+
+      if (std::all_of(block.successors.begin(), block.successors.end(), [&](const auto& successor) {
+            return visit_count_lookup[successor.index] == 0;
+          })) {
+        // Successors, if any, are unvisited.
+        return {};
+      } else if (block.successors.size() == 1) {
+        // Block has only a single successor
+        return states[0];
+      }
+
+      const auto& successor_a = block.successors[0];
+      const auto& successor_b = block.successors[1];
+
+      auto& post_a = states[0];
+      auto& post_b = states[1];
+
+      // During the first visit of a block, successor blocks may be
+      // unvisited, even though we preferentially visit later blocks
+      // first.  (e.g. During the first visit of the end of a For
+      // loop, the start of the For loop has not yet been visited.)
+      // If this is the case, assume the best-case scenario that all
+      // knowns are consistent, and rely on a later visit to
+      // resolve/remove any conflicts.
+      if (visit_count_lookup[successor_a.index] == 0) {
+        return post_b;
+      } else if (visit_count_lookup[successor_b.index] == 0) {
+        return post_a;
+      }
+
+      if (successor_a.post_condition && successor_b.post_condition) {
+        // The predicate can identify which successor block applies
+        // (e.g. i==n-1 for the last loop iteration, i<n-1 for earlier
+        // loop iterations).  Therefore, we can use all buffer
+        // constraints, conditional on having come from the
+        // successor that provides it.
+        post_a.AddCondition(successor_a.post_condition.value());
+        post_b.AddCondition(successor_b.post_condition.value());
+        post_a.Union(post_b, &analyzer);
+        return post_a;
+      } else {
+        // We don't know which successor applies.  Therefore, the
+        // only buffer constraints that can be used are those that
+        // appear in both successors.
+        post_a.Intersection(post_b, &analyzer);
+        return post_a;
+      }
+    }();
+
+    // Step 2: Collect knowns provided as a result of executing this block
+    auto unused_at_block_start = [&]() {
+      if (num_previous_visits >= max_revisits) {
+        return BufferState();
+      }
+      auto prior_state = block.unused_at_block_end;
+      prior_state.BackpropUnusedIndices(axis_var_lookup_, block.touch_points, &analyzer);
+      prior_state.RemoveFreeParameters(free_predicate_parameters_, &analyzer);
+      return prior_state;
+    }();
+
+    // Step 3: If any changes are made to the post knowns since the
+    // previous time we visited this block, mark the successor block
+    // as needing to be visited.
+    if (num_previous_visits == 0 ||
+        !unused_at_block_start.IsEquivalentTo(block.unused_at_block_start, &analyzer)) {
+      block.unused_at_block_start = std::move(unused_at_block_start);
+      for (const auto& pred : block.predecessors) {
+        to_visit.insert(pred.index);
+      }
+    }
+  }
+}
+
+bool ControlFlowGraph::IsOverwrittenWithoutEffect(const tir::BufferStore& store,
+                                                  const Stmt& context) const {
+  Optional<Array<Var>> index_variables = GetIndexVariables(store->buffer);
+  if (!index_variables) {
+    return false;
+  }
+
+  auto it = control_flow_lookup_.find(context.get());
+  ICHECK(it != control_flow_lookup_.end())
+      << "Context " << PrettyPrint(context) << " did not occur within analyzed statement";
+  const auto& context_block = control_flow_[it->second];
+
+  auto [store_touch, free_params] = context_block.MakeBufferTouch(
+      store->buffer, index_variables.value(), store->indices, BufferTouch::AccessType::Write,
+      BufferLoad(store->buffer, store->indices));
+
+  Analyzer local_analyzer;
+  local_analyzer.Bind(free_predicate_parameters_);
+  local_analyzer.Bind(iterator_ranges_);
+  local_analyzer.Bind(free_params);
+  local_analyzer.rewrite_simplify.SetEnabledExtensions(
+      RewriteSimplifier::kTransitivelyProveInequalities);
+
+  PrimExpr predicate = store_touch.predicate && store_touch.AtLoopIteration();
+
+  predicate = SimplifyAsAndOfOrs(predicate, &local_analyzer);
+
+  for (const auto& unused : context_block.unused_at_block_end.constraints_) {
+    if (store_touch.buffer.same_as(unused.buffer)) {
+      PrimExpr difference = SimplifyAsAndOfOrs(predicate && !unused.predicate, &local_analyzer);
+      if (is_zero(difference)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+PrimExpr ControlFlowGraph::SimplifyInContext(PrimExpr expr, const tir::Stmt& context,
+                                             Analyzer* analyzer) const {
+  size_t context_index = [&]() {
+    auto it = control_flow_lookup_.find(context.get());
+    ICHECK(it != control_flow_lookup_.end())
+        << "Context did not occur in the Stmt provided to BufferTouchPattern's constructor";
+    return it->second;
+  }();
+
+  PrimExpr constraint = Bool(true);
+  for (const auto& known : non_buffer_assumptions_) {
+    constraint = constraint && known;
+  }
+  With<ConstraintContext> constraint_context(analyzer, constraint);
+
+  expr = control_flow_[context_index].known_at_block_start.SubstituteKnownBufferValues(
+      std::move(expr), axis_var_lookup_, analyzer);
+
+  expr = analyzer->Simplify(std::move(expr));
+  return expr;
+}
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/analysis/control_flow_graph.h b/src/tir/analysis/control_flow_graph.h
new file mode 100644
index 000000000000..aa9023ba29dd
--- /dev/null
+++ b/src/tir/analysis/control_flow_graph.h
@@ -0,0 +1,653 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file control_flow_graph.h
+ * \brief Utility for extracting and interacting with buffer touch points
+ */
+
+#include <tvm/arith/analyzer.h>
+#include <tvm/arith/int_solver.h>
+#include <tvm/runtime/container/array.h>
+#include <tvm/tir/buffer.h>
+#include <tvm/tir/stmt.h>
+#include <tvm/tir/var.h>
+
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#ifndef TVM_TIR_ANALYSIS_CONTROL_FLOW_GRAPH_H_
+#define TVM_TIR_ANALYSIS_CONTROL_FLOW_GRAPH_H_
+
+namespace tvm {
+namespace tir {
+
+/*! \brief Represents an interaction with a buffer */
+struct BufferTouch {
+  enum class AccessType {
+    /*! \brief Buffer access occurs in BufferLoad */
+    Read,
+
+    /*! \brief Buffer access occurs in BufferStore */
+    Write,
+
+    /*! \brief Buffer access occurs in tir::builtin::assume() */
+    Assume,
+  };
+
+  BufferTouch(Buffer buffer, PrimExpr predicate, PrimExpr value)
+      : buffer(buffer),
+        predicate(predicate),
+        value(value),
+        loop_var_expressions({}),
+        touch_type(AccessType::Assume) {}
+
+  BufferTouch(Buffer buffer, PrimExpr predicate, PrimExpr value,
+              std::vector<std::pair<Var, PrimExpr>> loop_var_expressions, AccessType touch_type)
+      : buffer(buffer),
+        predicate(predicate),
+        value(value),
+        loop_var_expressions(loop_var_expressions),
+        touch_type(touch_type) {}
+
+  /*! \brief The buffer being touched */
+  Buffer buffer;
+
+  /*! \brief A predicate that is true when this touch applies
+   *
+   * May be in terms of axis variables to indicate touches that impact
+   * only a portion of a buffer.
+   */
+  PrimExpr predicate;
+
+  /*! \brief The value in this buffer after the touch
+   *
+   * May be in terms of axis variables to indicate a known
+   * non-constant value.  May be in terms of a BufferLoad to indicate
+   * an unknown value.
+   */
+  PrimExpr value;
+
+  /*! \brief Active loops during the buffer touch
+   *
+   * The vector contains one entry for each loop that contains the
+   * buffer touch.  The `Var` item in each entry is the loop variable
+   * itself.  The `PrimExpr` item is an expression for the loop
+   * variable in terms of the buffer axis variables in
+   * `ControlFlowGraph::axis_var_lookup_`.
+   *
+   * Used to construct boolean expressions indicating whether the loop
+   * iteration that performs this touch has been reached.
+   */
+  std::vector<std::pair<Var, PrimExpr>> loop_var_expressions;
+
+  /*! \brief How the buffer was interacted with
+   *
+   * When used as a constraint (e.g. in BufferState), should use
+   * Assume.
+   */
+  AccessType touch_type{AccessType::Assume};
+
+  /*! \brief Generate a boolean expression that is true for indices
+   *  accessed by this touch during this iteration or a previous
+   *  loop iteration.
+   *
+   * Used during forward propagation, to track known values that were
+   * written in the current loop iteration, or in a preceding loop
+   * iteration.
+   */
+  PrimExpr BeforeLoopIteration() const;
+
+  /*! \brief Generate a boolean expression that is true for indices
+   *  accessed by this touch during this loop iteration.
+   *
+   * Used during speculative no-op insertion checks, to specify which
+   * indices must be later overwritten for a store to have no impact
+   * on final results.
+   */
+  PrimExpr AtLoopIteration() const;
+
+  /*! \brief Generate a boolean expression that is true for indices
+   *  accessed by this touch during this loop iteration or a
+   *  subsequent loop iteration.
+   *
+   * Used during backward propagation, to track indices that that are
+   * overwritten in the current loop iteration or in a later loop
+   * iteration.
+   */
+  PrimExpr AfterLoopIteration() const;
+
+  /* \brief Checks if this touch affects a subset of indices of another
+   *
+   * Returns true if the indices accessed by this touch are a subset
+   * of predicate is true can be proven to be a subset of the other
+   * subset.  Returns false if it cannot be proven to be a subset of
+   * ther other subset.
+   */
+  bool IsSubsetOf(const BufferTouch& other, arith::Analyzer* analyzer) const;
+
+  /* \brief Checks if this touch affects distinct indices from another
+   *
+   * Returns true if it can be proven that the two predicates cannot
+   * be simultaneously true.  Returns false if it cannot be proven
+   * that the two predicates are distinct.
+   */
+  bool IsDistinctFrom(const BufferTouch& other, arith::Analyzer* analyzer) const;
+
+  /* \brief Checks if this touch affects distinct indices from another
+   *
+   * Returns true if it can be proven that the two predicates cannot
+   * be simultaneously true.  Returns false if it cannot be proven
+   * that the two predicates are distinct.
+   */
+  bool IsEquivalentTo(const BufferTouch& other, arith::Analyzer* analyzer) const;
+
+  friend std::ostream& operator<<(std::ostream& os, const BufferTouch& expr);
+};
+
+/*! \brief Represents the known state of buffers at a specific point */
+class BufferState {
+ public:
+  /*! Default constructor
+   *
+   * Initialize the buffer state with no known information.
+   */
+  BufferState() {}
+
+  /*! \brief Replace BufferLoad instances with known values
+   *
+   * \param expr The expression to be updated.
+   *
+   * \param axis_var_lookup A map from buffer to the variables
+   * representing positions along the buffer's axes.
+   *
+   * \param analyzer The analyzer to use when validating a
+   * constraint's predicate.
+   *
+   * \returns The modified expression.  If no substitutions are made,
+   * the original expression is returned.
+   */
+  PrimExpr SubstituteKnownBufferValues(PrimExpr expr,
+                                       const Map<Buffer, Array<Var>>& axis_var_lookup,
+                                       arith::Analyzer* analyzer) const;
+
+  /*! \brief Apply a condition to all known constraints
+   *
+   * For example, when propagating pre-loop constraints into the body
+   * of a loop, add a condition that the loop iterator is zero.
+   *
+   * \param condition The condition to apply
+   */
+  void AddCondition(const PrimExpr& condition);
+
+  /*! \brief Perform a variable substitution for all constraints
+   *
+   * For example, when propagating constraints from the end of a loop
+   * to the beginning, replace `i` with `i-1`.
+   *
+   * \param var_remap The variable remapping to apply.
+   */
+  void Substitute(const Map<Var, PrimExpr>& var_remap, arith::Analyzer* analyzer);
+
+  /*! \brief Simplify the predicate of all constraints
+   *
+   * \param analyzer The analyzer with which to simplify
+   */
+  void Simplify(arith::Analyzer* analyzer);
+
+  /*! \brief Update the known buffer values based on buffer touches
+   *
+   * For any Write or Assume touches, update the known values.  For
+   * any Read touches, ignore.  Used to determine known values at the
+   * end of a control flow block, given the known values at the start.
+   *
+   * \param axis_var_lookup A map from buffer to the variables
+   * representing positions along the buffer's axes.
+   *
+   * \param touch_points The buffer touch points to apply
+   *
+   * \param analyzer The analyzer to use for simplifications
+   */
+  void ApplyTouches(const Map<Buffer, Array<Var>>& axis_var_lookup,
+                    const std::vector<BufferTouch>& touch_points, arith::Analyzer* analyzer);
+
+  /*! \brief Update unused buffer locations based on buffer touches
+   *
+   * For any Write, mark the written-to indices as unused.  (That is,
+   * immediately prior to assigning `buf[i] = expr`, the value stored
+   * at `buf[i]` is irrelevant.)  For any Read, mark the read-from
+   * indices as used.  This method is used to determine unused buffer
+   * indices at the start of a control flow block, given the unused
+   * buffer indices values at the end.
+   *
+   * \param axis_var_lookup A map from buffer to the variables
+   * representing positions along the buffer's axes.
+   *
+   * \param touch_points The buffer touch points to apply
+   *
+   * \param analyzer The analyzer to use for simplifications
+   */
+  void BackpropUnusedIndices(const Map<Buffer, Array<Var>>& axis_var_lookup,
+                             const std::vector<BufferTouch>& touch_points,
+                             arith::Analyzer* analyzer);
+
+  /*! \brief Remove free parameters from the constraints
+   *
+   * \param free_predicate_parameters
+   *
+   * \param analyzer The analyzer with which to simplify after removal
+   */
+  void RemoveFreeParameters(const Map<Var, Range>& free_predicate_parameters,
+                            arith::Analyzer* analyzer);
+
+  /*! \brief Check if two buffer states are equivalent
+   *
+   * \param other
+   *
+   * \param analyzer The analyzer used to check equality of PrimExpr
+   *
+   * \return True if the two states are provably equivalent, false otherwise.
+   */
+  bool IsEquivalentTo(const BufferState& other, arith::Analyzer* analyzer) const;
+
+  /* \brief Add known values provided by another state
+   *
+   * \param other The state with which to merge constraints
+   *
+   * \param analyzer The analyzer with which to simplify the result
+   */
+  void Union(const BufferState& other, arith::Analyzer* analyzer);
+
+  /* \brief Remove all known values not consistent with another state
+   *
+   * \param other The state with which to merge constraints
+   *
+   * \param analyzer The analyzer with which to simplify the result
+   */
+  void Intersection(const BufferState& other, arith::Analyzer* analyzer);
+
+  friend std::ostream& operator<<(std::ostream& os, const BufferState&);
+
+ private:
+  friend class ControlFlowGraph;
+  /*! \brief The known constraints */
+  std::vector<BufferTouch> constraints_;
+};
+
+/*! \brief Represents the flow of control through a `tir::Stmt`
+ *
+ * This class contains an internal representation of the possible
+ * control flow that may occur during execution of a `tir::Stmt`.  It
+ * consists of a collection of ControlFlowBlock objects, each of which
+ * represents a subset of operations performed during execution, along
+ * with edges that represent allowed transitions between
+ * `ControlFlowBlock`.
+ *
+ * In addition, the following restrictions are used.
+ *
+ * 1. Each block may have at most two predecessors, and at most two
+ *    successors.
+ *
+ * 2. Within each block, values stored in a buffer do not change.
+ *    That is, encountering a `BufferStore` node requires creating a
+ *    new block.
+ *
+ * For example, consider the following PrimFunc
+ *
+ * ```python
+ * @T.prim_func
+ * def func(T.Buffer[16, "float32"]):
+ *     for i in T.serial(16):
+ *         if i < 8:
+ *              B[i] = i
+ *         else:
+ *              B[i] = i-8
+ * ```
+ *
+ * The control flow graph would have eight control blocks.
+ *
+ * 1. function_entry, from the start of the function through the
+ *    evaluation of the loop's extent.
+ *
+ *    Predecessors: n/a
+ *    Successors: loop_start
+ *
+ * 2. loop_start, after entering the body of the loop, through the
+ *    evaluation of the conditional `i < 8`
+ *
+ *    Predecessors: function_entry, after_conditional
+ *    Successors: then_clause_start, else_clause_start
+ *
+ * 3. then_clause_start, after entering the then_clause of `i < 8`,
+ *    through evaluation of the value `i`.
+ *
+ *    Predecessors: loop_start
+ *    Successors: then_clause_end
+ *
+ * 4. then_clause_end, after storing to `B[i]` prior to exiting the
+ *    then_clause.
+ *
+ *    Predecessors: then_clause_start
+ *    Successors: after_conditional
+ *
+ * 5. else_clause_start, after entering the else_clause of `i < 8`,
+ *    through evaluation of the value `i-8`.
+ *
+ *    Predecessors: loop_start
+ *    Successors: else_clause_end
+ *
+ * 6. else_clause_end, after storing to `B[i]` prior to exiting the
+ *    else_clause.
+ *
+ *    Predecessors: else_clause_start
+ *    Successors: after_conditional
+ *
+ * 7. after_conditional, after the end of the if/then/else, before the
+ *    end of the loop body
+ *
+ *    Predecessors: then_clause_end, else_clause_end
+ *    Successors: loop_start, after_loop
+ *
+ * 8. after_loop, after the loop
+ *
+ *    Predecessors: after_conditional
+ *    Successors: n/a
+ *
+ *
+ * By identifying `BufferStore` nodes whose value does not depend on
+ * values stored in input buffers (e.g. initializing `buf[i] = 0.0`),
+ * or whose values are provided using `builtin::assume()`
+ * (e.g. `T.assume(buf[i] == 0.0)`), the value stored in a buffer at
+ * those indices may be known for a given control block.  These known
+ * values can then be propagated forward to successor blocks, to be
+ * used in context-dependent simplifications.
+ *
+ * In addition to the allowed transitions between control-flow
+ * blocks, each block also tracks the buffer touch points; which
+ * indices are read from a buffer, which values are written to which
+ * indices of a buffer, and assumptions are provided using
+ * `builtin::assume()`; that occur during the control-flow block.
+ *
+ * Note: The current implementation only tracks the values of
+ * buffers that are constrained to a specific value, and does not
+ * track inequalities that may partially constrain buffer values.
+ * That is, entering a scoped context with a data-dependent equality
+ * condition (e.g. `if buf[i] == value`) is tracked, but entering a
+ * scoped context with a data-dependent inequality condition
+ * (e.g. `if buf[i] > value`) is not tracked.
+ */
+class ControlFlowGraph {
+ public:
+  /* \brief Extract the touch pattern from a TIR statement
+   */
+  explicit ControlFlowGraph(const Stmt& stmt, size_t max_revisits = 5);
+
+  /* \brief Check if a write is overwritten without impacting final results
+   *
+   * \param store The store to be examined
+   *
+   * \param context The context in which the buffer store occurs, used
+   * to identify the control-flow block in which the store occurs.  In
+   * most cases, this will be the same object as the `store` itself.
+   *
+   * \param analyzer The analyzer to be used for simplifications
+   *
+   * \return True if the specified store can be proven to be
+   * overwritten without contributing to any later statements.
+   * Returns false otherwise.
+   */
+  bool IsOverwrittenWithoutEffect(const BufferStore& store, const Stmt& context) const;
+
+  /* \brief Simplify the expression, assuming it occurs within the given context
+   *
+   * \param expr The expression to be simplified.  Does not need to
+   * have occurred within the statement used to construct this
+   * BufferTouchPattern.
+   *
+   * \param context The statement where this expression occurred, or
+   * is to be inserted.  Must occur within the statement used to
+   * construct this BufferTouchPattern.
+   *
+   * \param analyzer The analyzer to be used for simplifications
+   *
+   * \returns The simplified statement
+   */
+  PrimExpr SimplifyInContext(PrimExpr expr, const Stmt& context, arith::Analyzer* analyzer) const;
+
+  /*! \brief Remove the specified BufferStore from the control-flow
+   *  graph
+   *
+   * Removing the specified store, which may reflow known values.
+   * This is necessary when simplifying sequential stores of the same
+   * value.  Otherwise, the first could be removed as a no-op because
+   * it is overwritten by the second, and the second could be removed
+   * as a no-op because it is the same value as the first.
+   *
+   * \param store The store to remove
+   */
+  void RemoveStore(const tir::BufferStore& store);
+
+  friend std::ostream& operator<<(std::ostream& os, const ControlFlowGraph& pattern);
+
+ private:
+  /*! \brief Return index variables representing locations within a
+   *   buffer.
+   *
+   * For a given buffer, will always return the same set of variables.
+   *
+   * \param buf The buffer being accessed
+   *
+   * \param indices The indices at which the buffer is being accessed.
+   * These are used to set the dtype of the buffer axis variables.
+   *
+   * \returns Variables representing a position along the buffer's axis.
+   */
+  Array<Var> GetIndexVariables(const Buffer& buf, const Array<PrimExpr>& indices);
+
+  /*! \brief Return index variables representing locations within a
+   *   buffer, if they have been generated before.
+   *
+   * For a given buffer, will always return the same set of variables.
+   *
+   * \param buf The buffer being accessed
+   *
+   * \returns Variables representing a position along the buffer's axis.
+   */
+  Optional<Array<Var>> GetIndexVariables(const Buffer& buf) const;
+
+  /*! \brief Propagate known values from known BufferStore/assume
+   *  subsequent control flow blocks
+   */
+  void ForwardPropagateKnownValues(size_t max_revisits);
+
+  /*! \brief Propagate overwritten/unused indices to preceding control
+   *  flow blocks
+   */
+  void BackwardPropagateUnusedValues(size_t max_revisits);
+
+  struct ControlFlowEdge {
+    /* \brief The source block of the control flow edge
+     *
+     * Lookup index into `control_flow_`
+     */
+    size_t index;
+
+    /*! \brief Variable remaps
+     *
+     * e.g. Replacing loop iterator `i` with `i-1` when following an
+     * edge from the end of a loop to the beginning of the loop.
+     */
+    Map<Var, PrimExpr> var_remap;
+
+    /*! \brief Condition that must to true after following this edge
+     *
+     * This is applied after variable remapping.  For example, `i >
+     * loop_min` when following the an edge from the end of a loop to
+     * the beginning of the loop.
+     */
+    Optional<PrimExpr> post_condition;
+  };
+  friend std::ostream& operator<<(std::ostream& os, const ControlFlowEdge& edge);
+
+  struct ControlFlowBlock {
+    struct LoopEntry {
+      Var loop_var;
+      PrimExpr loop_min;
+      PrimExpr loop_max;
+      Range loop_range;
+    };
+
+    /*! \brief Loop iterators that are active during this block */
+    std::vector<LoopEntry> active_loop_iterators;
+
+    /*! \brief Loop-dependent Let bindings that may appear within the block */
+    Map<Var, PrimExpr> let_bindings_using_loop;
+
+    /*! \brief Predicate that must be true to have reached this block */
+    PrimExpr scope_predicate{Bool(true)};
+
+    /*! \brief All known values prior to executing the block */
+    BufferState known_at_block_start;
+
+    /*! \brief All known values after executing the block */
+    BufferState known_at_block_end;
+
+    /*! \brief Indices whose value at the start of the block is known to be unused */
+    BufferState unused_at_block_start;
+
+    /*! \brief Indices whose value at the end of the block is known to be unused */
+    BufferState unused_at_block_end;
+
+    /* \brief Buffer touches that occur within the block
+     *
+     * All buffer touches within a block can be treated as occurring
+     * simultaneously.
+     */
+    std::vector<BufferTouch> touch_points;
+
+    /* \brief The blocks that occur after this block
+     *
+     * Lookup index into `control_flow_`
+     */
+    std::vector<ControlFlowEdge> successors;
+
+    /* \brief The blocks that occur before this block */
+    std::vector<ControlFlowEdge> predecessors;
+
+    /* \brief Construct a BufferTouch instance within this
+     * ControlFlowBlock
+     *
+     * \param graph The mutable ControlFlowGraph that owns the buffer
+     * touch.  Any free parameters used in the BufferTouch's predicate
+     * will be tracked by the ControlFlowGraph.
+     *
+     * \param buf The Buffer being accessed
+     *
+     * \param indices The indices at which the buffer is accessed, in
+     * terms of the loop variables.
+     *
+     * \param touch_type The type of touch being generated
+     *
+     * \param known_expr_value The value being written to the buffer
+     *
+     * \returns The newly generated BufferTouch
+     */
+    BufferTouch MakeBufferTouch(ControlFlowGraph* graph, const Buffer& buf,
+                                const Array<PrimExpr>& indices, BufferTouch::AccessType touch_type,
+                                PrimExpr known_value_expr) const;
+
+    /* \brief Construct a BufferTouch instance as if it occurred in
+     * this ControlFlowBlock
+     *
+     * Used when speculative checking if a BufferStore could be
+     * inserted.
+     *
+     * \param buf The Buffer being accessed
+     *
+     * \param index_variables The variables representing location
+     * within a buffer, with one variable for each axis of the buffer.
+     *
+     * \param indices The indices at which the buffer is accessed, in
+     * terms of the loop variables.
+     *
+     * \param touch_type The type of touch being generated
+     *
+     * \param known_expr_value The value being written to the buffer
+     *
+     * \returns The newly generated BufferTouch, and a map specifying
+     * all free parameters that may occur in the BufferTouch's
+     * predicate.
+     */
+    std::pair<BufferTouch, Map<Var, Range>> MakeBufferTouch(const Buffer& buf,
+                                                            Array<Var> index_variables,
+                                                            Array<PrimExpr> indices,
+                                                            BufferTouch::AccessType touch_type,
+                                                            PrimExpr known_value_expr) const;
+  };
+  friend std::ostream& operator<<(std::ostream& os, const ControlFlowBlock& pattern);
+
+  /* \brief The control flow that occurs within the analyzed statement */
+  std::vector<ControlFlowBlock> control_flow_;
+
+  /* \brief A lookup into control_flow_
+   *
+   * A map to look up the control flow block that contains the
+   * statement.
+   */
+  std::unordered_map<const StmtNode*, size_t> control_flow_lookup_;
+
+  /*! \brief A map from free parameters to their range
+   *
+   * A BufferStore/BufferLoad has indices in terms of loop iterators,
+   * while the internal BufferTouch must have predicate in terms of
+   * the buffer's axes.  While converting to the internal BufferTouch,
+   * reduction axes show up as free parameters.  Tracking the range of
+   * the free parameters allows them to be removed later, by requiring
+   * a predicate to be true for all values of the free parameters.
+   */
+  Map<Var, Range> free_predicate_parameters_;
+
+  /*! \brief Ranges of iterators found in the analyzed statement */
+  Map<Var, Range> iterator_ranges_;
+
+  /* \brief A map from buffer to the variables representing positions
+   * along the buffer's axes.
+   *
+   * This is stored here, rather than as part of the BufferState or
+   * BufferTouch, to ensure that all access of a buffer use the same
+   * variables to represent the buffer's axes, reducing the amount of
+   * variable substitution required.
+   */
+  Map<Buffer, Array<Var>> axis_var_lookup_;
+
+  /* \brief Assumptions that do not depend on buffer values
+   *
+   * These may be collected as part of the handling of `builtin::assume()`, and do not depend on any
+   * buffer.  Since TIR only allows mutable values as part of buffers, these assumptions may be used
+   * anywhere the
+   */
+  std::vector<PrimExpr> non_buffer_assumptions_;
+
+  friend class ControlFlowGraphBuilder;
+};
+
+}  // namespace tir
+}  // namespace tvm
+#endif  // TVM_TIR_ANALYSIS_CONTROL_FLOW_GRAPH_H_
diff --git a/src/tir/transforms/simplify.cc b/src/tir/transforms/simplify.cc
index 1dbf9e688027..49d3a9ceaef5 100644
--- a/src/tir/transforms/simplify.cc
+++ b/src/tir/transforms/simplify.cc
@@ -29,7 +29,10 @@
 #include <tvm/tir/op.h>
 #include <tvm/tir/transform.h>
 
+#include <optional>
+
 #include "../../arith/ir_mutator_with_analyzer.h"
+#include "../../tir/analysis/control_flow_graph.h"
 
 namespace tvm {
 namespace arith {
@@ -38,6 +41,8 @@ using namespace tir;
 
 struct SimplifyConfigNode : public tvm::AttrsNode<SimplifyConfigNode> {
   bool transitively_prove_inequalities;
+  bool propagate_knowns_to_prove_conditional;
+  bool propagate_knowns_to_simplify_expressions;
   bool convert_boolean_to_and_of_ors;
   bool apply_constraints_to_boolean_branches;
 
@@ -47,6 +52,17 @@ struct SimplifyConfigNode : public tvm::AttrsNode<SimplifyConfigNode> {
             "If true, simplify conditionals with transitive combinations of scoped constraints")
         .set_default(false);
 
+    TVM_ATTR_FIELD(propagate_knowns_to_prove_conditional)
+        .describe(
+            "If true, known buffer values are propagated and used to statically prove conditionals")
+        .set_default(false);
+
+    TVM_ATTR_FIELD(propagate_knowns_to_simplify_expressions)
+        .describe(
+            "If true, known buffer values are propagated and used to replace BufferLoad wherever "
+            "possible")
+        .set_default(false);
+
     TVM_ATTR_FIELD(convert_boolean_to_and_of_ors)
         .describe("If true, simplify conditionals into an AND of ORs")
         .set_default(false);
@@ -85,16 +101,46 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.Simplify", SimplifyConfig);
 
 class StmtSimplifier : public IRMutatorWithAnalyzer {
  public:
-  explicit StmtSimplifier(Analyzer* analyzer) : IRMutatorWithAnalyzer(analyzer) {}
+  static Stmt Apply(Stmt stmt, Analyzer* analyzer, Optional<SimplifyConfig> config_opt = NullOpt) {
+    auto config = config_opt.value_or(AttrsWithDefaultValues<arith::SimplifyConfig>());
+    analyzer->rewrite_simplify.SetEnabledExtensions(config->GetEnabledExtensions());
+
+    std::optional<ControlFlowGraph> touch_pattern = std::nullopt;
+    if (config->propagate_knowns_to_prove_conditional ||
+        config->propagate_knowns_to_simplify_expressions) {
+      touch_pattern = ControlFlowGraph(stmt);
+    }
+    StmtSimplifier simplifier(analyzer, config, std::move(touch_pattern));
+    return simplifier(std::move(stmt));
+  }
+
+ private:
+  explicit StmtSimplifier(Analyzer* analyzer, SimplifyConfig config,
+                          std::optional<ControlFlowGraph> touch_pattern)
+      : IRMutatorWithAnalyzer(analyzer), config_(config), touch_pattern_(touch_pattern) {}
 
   using Parent = IRMutatorWithAnalyzer;
   using Parent::VisitStmt;
   using Parent::VisitStmt_;
 
-  PrimExpr VisitExpr(const PrimExpr& expr) final { return analyzer_->Simplify(expr); }
+  PrimExpr VisitExpr(const PrimExpr& expr) final {
+    if (config_->propagate_knowns_to_simplify_expressions) {
+      return touch_pattern_->SimplifyInContext(expr, current_stmt_.value(), analyzer_);
+    } else {
+      return analyzer_->Simplify(expr);
+    }
+  }
 
   Stmt Simplify(Stmt stmt) { return operator()(std::move(stmt)); }
 
+  Stmt VisitStmt(const Stmt& stmt) override {
+    Optional<Stmt> cache = this->current_stmt_;
+    this->current_stmt_ = stmt;
+    Stmt output = Parent::VisitStmt(stmt);
+    this->current_stmt_ = std::move(cache);
+    return output;
+  }
+
   Stmt VisitStmt_(const ForNode* op) final {
     analyzer_->Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent));
     With<ConstraintContext> ctx1(analyzer_, op->loop_var >= op->min);
@@ -111,7 +157,7 @@ class StmtSimplifier : public IRMutatorWithAnalyzer {
     return SideEffect(op->value) <= CallEffectKind::kPure;
   }
 
-  Stmt VisitStmt_(const LetStmtNode* op) {
+  Stmt VisitStmt_(const LetStmtNode* op) override {
     PrimExpr value = this->VisitExpr(op->value);
     if (CanInlineLetStmt(op)) {
       // it is fine to discard the let binding
@@ -134,26 +180,24 @@ class StmtSimplifier : public IRMutatorWithAnalyzer {
     }
   }
 
-  Stmt VisitStmt_(const IfThenElseNode* op) {
-    PrimExpr cond = analyzer_->Simplify(Substitute(op->condition, non_inlined_bindings_));
-    if (const int64_t* as_int = as_const_int(cond)) {
-      if (*as_int) {
+  Stmt VisitStmt_(const IfThenElseNode* op) override {
+    if (Optional<Bool> cond = ProveCondition(op->condition)) {
+      if (cond.value()->value) {
         return this->VisitStmt(op->then_case);
       } else if (op->else_case) {
         return this->VisitStmt(op->else_case.value());
       } else {
         return Evaluate(0);
       }
+    } else {
+      return Parent::VisitStmt_(op);
     }
-    return Parent::VisitStmt_(op);
   }
 
-  PrimExpr VisitExpr_(const CallNode* op) {
+  PrimExpr VisitExpr_(const CallNode* op) override {
     if (op->op.same_as(builtin::if_then_else())) {
-      PrimExpr cond = this->VisitExpr(op->args[0]);
-      cond = analyzer_->Simplify(Substitute(std::move(cond), non_inlined_bindings_));
-      if (const int64_t* as_int = as_const_int(cond)) {
-        if (*as_int) {
+      if (Optional<Bool> cond = ProveCondition(op->args[0])) {
+        if (cond.value()->value) {
           return this->VisitExpr(op->args[1]);
         } else {
           return this->VisitExpr(op->args[2]);
@@ -196,23 +240,50 @@ class StmtSimplifier : public IRMutatorWithAnalyzer {
     return true;
   }
 
+  /* \brief Internal utility for checking conditionals
+   *
+   * Uses more aggressive optimization, such as performing additional
+   * inlining and tracking known buffer values.
+   */
+  Optional<Bool> ProveCondition(PrimExpr condition) const {
+    condition = Substitute(condition, non_inlined_bindings_);
+    if (config_->propagate_knowns_to_prove_conditional) {
+      ICHECK(touch_pattern_.has_value());
+      condition = touch_pattern_->SimplifyInContext(condition, current_stmt_.value(), analyzer_);
+    } else {
+      condition = analyzer_->Simplify(condition);
+    }
+    if (const int64_t* as_int = as_const_int(condition)) {
+      return Bool(*as_int);
+    } else {
+      return NullOpt;
+    }
+  }
+
+  SimplifyConfig config_;
+  std::optional<ControlFlowGraph> touch_pattern_;
+
   Map<Var, PrimExpr> non_inlined_bindings_;
+  Optional<Stmt> current_stmt_{NullOpt};
 };
 
 }  // namespace arith
 
 namespace tir {
+
+Stmt Simplify(Stmt stmt, arith::Analyzer* analyzer) {
+  return arith::StmtSimplifier::Apply(stmt, analyzer);
+}
+
 namespace transform {
 
 Pass Simplify() {
   auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) {
     arith::Analyzer analyzer;
-    auto cfg = ctx->GetConfig<arith::SimplifyConfig>("tir.Simplify")
-                   .value_or(AttrsWithDefaultValues<arith::SimplifyConfig>());
-    analyzer.rewrite_simplify.SetEnabledExtensions(cfg->GetEnabledExtensions());
+    auto cfg = ctx->GetConfig<arith::SimplifyConfig>("tir.Simplify");
 
     auto* n = f.CopyOnWrite();
-    n->body = arith::StmtSimplifier(&analyzer).Simplify(std::move(n->body));
+    n->body = arith::StmtSimplifier::Apply(std::move(n->body), &analyzer, cfg);
     return f;
   };
   return CreatePrimFuncPass(pass_func, 0, "tir.Simplify", {});
diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py
index 4477e1d9c713..4199cb9a56f7 100644
--- a/tests/python/unittest/test_arith_rewrite_simplify.py
+++ b/tests/python/unittest/test_arith_rewrite_simplify.py
@@ -16,7 +16,7 @@
 # under the License.
 import pytest
 import tvm
-from tvm import te
+from tvm import te, tir
 
 
 class RewriteChecker:
@@ -873,6 +873,65 @@ def test_cmp_simplify():
     ck.verify(fld(x + 2, 4) * 4 >= x - y, tvm.tir.LE(flm(x + 2, 4) + (-2), y))
     # End DivMod Rules
 
+    # merging flm/fld into known value
+    ck.verify(tir.all(fld(x, 8) == 3, flm(x, 8) == 4), x == 28)
+    ck.verify(tir.all(flm(x, 8) == 4, fld(x, 8) == 3), x == 28)
+    ck.verify(tir.all(fld(x, 8) == -3, flm(x, 8) == 4), x == -20)
+    ck.verify(tir.all(flm(x, 8) == 4, fld(x, 8) == -3), x == -20)
+
+    # Rewrite based on definition of integer division
+    ck.verify(tir.all(tvm.runtime.convert(0) <= x - y * 5, x - y * 5 < 5), y == fld(x, 5))
+    ck.verify(tir.all(x - y * 5 < 5, tvm.runtime.convert(0) <= x - y * 5), y == fld(x, 5))
+
+    # Narrow upper bound using floormod
+    ck.verify(tir.all(x < 20, flm(x, 5) < 2), tir.all(x < 17, flm(x, 5) < 2))
+    ck.verify(tir.all(x < 18, flm(x, 5) < 2), tir.all(x < 17, flm(x, 5) < 2))
+    ck.verify(tir.all(x <= 19, flm(x, 5) < 2), tir.all(x < 17, flm(x, 5) < 2))
+    ck.verify(tir.all(x <= 18, flm(x, 5) < 2), tir.all(x < 17, flm(x, 5) < 2))
+    ck.verify(tir.all(x < -20, flm(x, 5) < 2), tir.all(x < -23, flm(x, 5) < 2))
+    ck.verify(tir.all(x < 18 - 40, flm(x, 5) < 2), tir.all(x < 17 - 40, flm(x, 5) < 2))
+    ck.verify(tir.all(x <= -21, flm(x, 5) < 2), tir.all(x < -23, flm(x, 5) < 2))
+    ck.verify(tir.all(x <= -22, flm(x, 5) < 2), tir.all(x < -23, flm(x, 5) < 2))
+    # No change if the floormod cannot help narrow the upper bound
+    ck.verify(tir.all(x < 16, flm(x, 5) < 2), tir.all(x < 16, flm(x, 5) < 2))
+    ck.verify(tir.all(x <= 15, flm(x, 5) < 2), tir.all(x <= 15, flm(x, 5) < 2))
+
+    # Merge a known floordiv and an upper bound of floormod into a value range
+    ck.verify(
+        tir.all(fld(x, 10) == 5, flm(x, 10) < 7),
+        tir.all(tvm.runtime.convert(50) <= x, x < 57),
+    )
+    ck.verify(
+        tir.all(fld(x, 10) == 5, flm(x, 10) <= 7),
+        tir.all(tvm.runtime.convert(50) <= x, x <= 57),
+    )
+    ck.verify(
+        tir.all(fld(x, 10) == -5, flm(x, 10) < 7),
+        tir.all(tvm.runtime.convert(-50) <= x, x < -43),
+    )
+    ck.verify(
+        tir.all(fld(x, 10) == -5, flm(x, 10) <= 7),
+        tir.all(tvm.runtime.convert(-50) <= x, x <= -43),
+    )
+
+    # Merge a known floordiv and an lower bound of floormod into a value range
+    ck.verify(
+        tir.all(fld(x, 10) == 5, tvm.runtime.convert(7) < flm(x, 10)),
+        tir.all(tvm.runtime.convert(57) < x, x < 60),
+    )
+    ck.verify(
+        tir.all(fld(x, 10) == 5, tvm.runtime.convert(7) <= flm(x, 10)),
+        tir.all(tvm.runtime.convert(57) <= x, x < 60),
+    )
+    ck.verify(
+        tir.all(fld(x, 10) == -5, tvm.runtime.convert(7) < flm(x, 10)),
+        tir.all(tvm.runtime.convert(-43) < x, x < -40),
+    )
+    ck.verify(
+        tir.all(fld(x, 10) == -5, tvm.runtime.convert(7) <= flm(x, 10)),
+        tir.all(tvm.runtime.convert(-43) <= x, x < -40),
+    )
+
     ck.verify(tvm.te.min(x, 11) < 10, x < 10)
     ck.verify(tvm.te.min(x, 8) < 10, tvm.tir.const(1, "bool"))
     ck.verify(tvm.te.max(8, x) > 10, tvm.tir.LT(10, x))
diff --git a/tests/python/unittest/test_tir_transform_simplify.py b/tests/python/unittest/test_tir_transform_simplify.py
index 8d9c76c6b20d..fd98b715a4bc 100644
--- a/tests/python/unittest/test_tir_transform_simplify.py
+++ b/tests/python/unittest/test_tir_transform_simplify.py
@@ -140,6 +140,8 @@ class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
     transitively_prove_inequalities = False
     convert_boolean_to_and_of_ors = False
     apply_constraints_to_boolean_branches = False
+    propagate_knowns_to_prove_conditional = False
+    propagate_knowns_to_simplify_expressions = False
 
     def transform(self):
         def inner(mod):
@@ -148,6 +150,8 @@ def inner(mod):
                     "transitively_prove_inequalities": self.transitively_prove_inequalities,
                     "convert_boolean_to_and_of_ors": self.convert_boolean_to_and_of_ors,
                     "apply_constraints_to_boolean_branches": self.apply_constraints_to_boolean_branches,
+                    "propagate_knowns_to_prove_conditional": self.propagate_knowns_to_prove_conditional,
+                    "propagate_knowns_to_simplify_expressions": self.propagate_knowns_to_simplify_expressions,
                 }
             }
             with tvm.transform.PassContext(config=config):
@@ -777,7 +781,7 @@ def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
         A[0] = (i == 0 or j == 10 or k == 20) and (j == 10 or k != 30 or i == 0)
 
     def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
-        A[0] = i == 0 or j == 10 or k == 20
+        A[0] = j == 10 or k == 20 or i == 0
 
 
 class TestRewriteAsAndOfOrUsingSimplificationAcrossAnd(BaseBeforeAfter):
@@ -794,7 +798,7 @@ def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
         A[0] = (k == 20) and ((i == 0 or j == 10) and (k != 30))
 
     def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
-        A[0] = (k == 20) and (i == 0 or j == 10)
+        A[0] = (i == 0 or j == 10) and (k == 20)
 
 
 class TestRewriteAsAndOfOrUsingSimplificationWithinOr(BaseBeforeAfter):
@@ -815,7 +819,7 @@ def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
         A[0] = (i == 20) or (j == 0) or (i != 30)
 
     def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32, k: T.int32):
-        A[0] = (i != 30) or (j == 0)
+        A[0] = (j == 0) or (i != 30)
 
 
 class TestConditionalFloorMod(BaseBeforeAfter):
@@ -1049,5 +1053,640 @@ def func(A: T.Buffer[1, "bool"]):
         return func
 
 
+class TestProvableConditionWithOffset(BaseBeforeAfter):
+    """Use scoped-constraint to prove inequalities"""
+
+    transitively_prove_inequalities = False
+
+    def before(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32):
+        if i < j:
+            A[0] = i < j + 1
+
+    def expected(A: T.Buffer[1, "bool"], i: T.int32, j: T.int32):
+        if i < j:
+            A[0] = True
+
+
+class TestAlteredBufferContents(BaseBeforeAfter):
+    """Propagation of data-dependent conditionals.
+
+    A literal constraint must not be propagated if the values
+    referenced may change.  TIR requires single assignment of
+    variables, so Var objects may be assumed constant, but BufferLoad
+    may not.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[(1,), "int32"], n: T.int32):
+        if A[0] == n:
+            A[0] = A[0] + 1
+            # If the simplifier incorrectly uses the invalidated
+            # A[0]==n condition required to reach this point, then it
+            # will incorrectly simplify to the then-case.  If the
+            # simplifier correctly determines that A[0] now contains
+            # n+1, then it will correctly simplify to the else-case.
+            if A[0] == n:
+                A[0] = 5
+            else:
+                A[0] = 10
+
+    def expected(A: T.Buffer[(1,), "int32"], n: T.int32):
+        if A[0] == n:
+            A[0] = A[0] + 1
+            A[0] = 10
+
+
+class TestPossiblyAlteredBufferContents(BaseBeforeAfter):
+    """No simplification of data-dependent conditionals.
+
+    Like TestAlteredBufferContents, but the `m==0` conditional
+    prevents the value of `A[0]` from being known at the point of the
+    inner conditional, either as `A[0] == n` from the outer
+    conditional or as `A[0] == n+1` from the write statement.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[(1,), "int32"], n: T.int32, m: T.int32):
+        if A[0] == n:
+            if m == 0:
+                A[0] = A[0] + 1
+
+            if A[0] == n:
+                A[0] = 5
+            else:
+                A[0] = 10
+
+    expected = before
+
+
+class TestSimplifyInputAssumption(BaseBeforeAfter):
+    """A T.assume annotation may be used to simplify"""
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[1, "int32"], n: T.int32):
+        T.evaluate(T.assume(n == 0))
+        if n == 0:
+            A[0] = 42
+
+    def expected(A: T.Buffer[1, "int32"], n: T.int32):
+        T.evaluate(T.assume(n == 0))
+        A[0] = 42
+
+
+class TestSimplifyInputAssumption(BaseBeforeAfter):
+    """A T.assume annotation may be used to simplify"""
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[1, "int32"], n: T.int32):
+        T.evaluate(T.assume(n == 0))
+        if n == 0:
+            A[0] = 42
+
+    def expected(A: T.Buffer[1, "int32"], n: T.int32):
+        T.evaluate(T.assume(n == 0))
+        A[0] = 42
+
+
+class TestNoSimplifyFromScopedInputAssumption(BaseBeforeAfter):
+    """A T.assume inside a scope may not apply outside that scope"""
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[1, "int32"], n: T.int32, m: T.int32):
+        if m == 0:
+            T.evaluate(T.assume(n == 0))
+
+        if n == 0:
+            A[0] = 42
+
+    expected = before
+
+
+class TestSimplifyConditionalUsingBufferValue(BaseBeforeAfter):
+    """Simplify a conditional using the known value in the buffer"""
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[1, "int32"]):
+        A[0] = 0
+
+        if A[0] == 0:
+            A[0] = 42
+
+    def expected(A: T.Buffer[1, "int32"]):
+        A[0] = 0
+        A[0] = 42
+
+
+class TestKeepExpressionSimplifyUsingBufferValue(BaseBeforeAfter):
+    """Do not simplify expressions in general using known values in the buffer
+
+    For now, because this is equivalent to inlining, preventing this
+    usage from occurring.  Known buffer values may be used to prove
+    conditionals, but should not be used for other simplifications.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[1, "int32"], B: T.Buffer[1, "int32"]):
+        A[0] = 0
+        B[0] = A[0]
+
+    expected = before
+
+
+class TestSimplifyConditionalInLoopUsingBufferValue(BaseBeforeAfter):
+    """Simplify a conditional using the known value in the buffer
+
+    Like TestSimplifyConditionalUsingBufferValue, but the value used
+    to simplify is set in a previous loop.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[16, "int32"], B: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = i
+
+        for j in T.serial(16):
+            if A[j] == j:
+                B[j] = 42
+            else:
+                B[j] = 100
+
+    def expected(A: T.Buffer[16, "int32"], B: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = i
+
+        for j in T.serial(16):
+            B[j] = 42
+
+
+class TestSimplifyUsingBufferAssumption(BaseBeforeAfter):
+    """A T.assume may apply to a buffer's contents"""
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[1, "int32"]):
+        T.evaluate(T.assume(A[0] == 0))
+
+        if A[0] == 0:
+            A[0] = 42
+
+    def expected(A: T.Buffer[1, "int32"]):
+        T.evaluate(T.assume(A[0] == 0))
+        A[0] = 42
+
+
+class TestSimplifyUsingBufferAssumptionInLoop(BaseBeforeAfter):
+    """An assumption about buffer contents may apply to a range"""
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            T.evaluate(T.assume(A[i] == i))
+
+        for i in T.serial(16):
+            if A[i] < 100:
+                A[i] = 0
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            T.evaluate(T.assume(A[i] == i))
+
+        for i in T.serial(16):
+            A[i] = 0
+
+
+class TestSimplifyUsingPartiallyKnownBufferConditional(BaseBeforeAfter):
+    """An assumption about buffer contents may apply to only part of a buffer"""
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if 14 <= i:
+                T.evaluate(T.assume(A[i] == 0))
+
+        for i in T.serial(16):
+            if 14 <= i:
+                if A[i] == 0:
+                    A[i] = 42
+
+            else:
+                if A[i] == 0:
+                    A[i] = 100
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if 14 <= i:
+                T.evaluate(T.assume(A[i] == 0))
+
+        for i in T.serial(16):
+            if 14 <= i:
+                A[i] = 42
+
+            else:
+                if A[i] == 0:
+                    A[i] = 100
+
+
+class TestSimplifyUsingPartiallyKnownBufferExpression(BaseBeforeAfter):
+    """An assumption about buffer contents may apply to only part of a buffer
+
+    Like TestSimplifyUsingPartiallyKnownBufferConditional, but the
+    conditional is expressed as part of T.assume, instead of in the
+    control flow.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            T.evaluate(T.assume(i < 14 or A[i] == 0))
+
+        for i in T.serial(16):
+            if 14 <= i:
+                if A[i] == 0:
+                    A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            T.evaluate(T.assume(i < 14 or A[i] == 0))
+
+        for i in T.serial(16):
+            if 14 <= i:
+                A[i] = 42
+
+
+class TestNoSimplificationIfPredicateNotMet(BaseBeforeAfter):
+    """Assumptions about buffer contents must apply to all cases to be used
+
+    Like TestSimplifyUsingPartialBufferAssumptionInLoop, but the
+    predicate in the second loop does not match the predicate in the
+    first loop.  Therefore, the `T.assume` refers to a different set
+    of indices.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if 14 <= i:
+                T.evaluate(T.assume(A[i] == 0))
+
+        for i in T.serial(16):
+            if i < 14:
+                if A[i] == 0:
+                    A[i] = 42
+
+    expected = before
+
+
+class TestNoSimplifyUsingInvalidatedScopedConstraint(BaseBeforeAfter):
+    """A write may not be used for proofs outside its conditional"""
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i == 0:
+                A[i] = 0
+
+            if A[i] == 0:
+                A[i] = 42
+
+    expected = before
+
+
+class TestNoSimplifyUsingOverwrittenValue(BaseBeforeAfter):
+    """A write that may have been overwritten may not be treated as known
+
+    The appearance of "A[i] = 5" must prevent the earlier constraint
+    from being used for simplification.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            T.evaluate(T.assume(A[i] == 0))
+
+        for i in T.serial(16):
+            if i == 0:
+                A[i] = 5
+
+            if A[i] == 0:
+                A[i] = 42
+
+    expected = before
+
+
+class TestNoSimplifyUsingLoopDependentBufferValue(BaseBeforeAfter):
+    """Do not simplify assuming reads are invariant
+
+    If a buffer's value changes across loop iterations, the buffer's
+    value before the loop should not be used to simplify conditionals
+    within the loop.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[16, "int32"], B: T.Buffer[1, "int32"]):
+        B[0] = 0
+        for i in T.serial(16):
+            if B[0] < 10:
+                B[0] = A[i] * 2 + B[0]
+            else:
+                B[0] = A[i] + B[0]
+
+    expected = before
+
+
+class TestSimplifyPriorToOverwrittenValue(BaseBeforeAfter):
+    """A known value may be used until it is overwritten
+
+    Like TestNoSimplifyUsingOverwrittenValue, but the use of the
+    known `A[i]` value occurs before it is overwritten.
+
+    Like TestNoSimplifyUsingLoopDependentBufferValue, but the loop
+    iterations are all independent.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            T.evaluate(T.assume(A[i] == 0))
+
+        for i in T.serial(16):
+            if A[i] == 0:
+                A[i] = 17
+
+            if i == 0:
+                A[i] = 5
+
+            if A[i] == 0:
+                A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            T.evaluate(T.assume(A[i] == 0))
+
+        for i in T.serial(16):
+            A[i] = 17
+
+            if i == 0:
+                A[i] = 5
+
+            if A[i] == 0:
+                A[i] = 42
+
+
+class TestSimplifyElementWiseUsingPreLoopBufferValue(BaseBeforeAfter):
+    """Allow data-Do not simplify assuming reads are invariant
+
+    If an element-wise loop reads and overwrites a buffer value, the
+    pre-loop buffer value may be used to simplify conditions that
+    occur prior to the write.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[16, "int32"], B: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            B[i] = 0
+
+        for i in T.serial(16):
+            if B[i] < 10:
+                B[i] = A[i] * 2 + B[i]
+            else:
+                B[i] = A[i] + B[i]
+
+    def expected(A: T.Buffer[16, "int32"], B: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            B[i] = 0
+
+        for i in T.serial(16):
+            B[i] = A[i] * 2 + B[i]
+
+
+class TestSimplifyNonConditional(BaseBeforeAfter):
+    """Propagate a known value to later expressions."""
+
+    propagate_knowns_to_simplify_expressions = True
+
+    def before(A: T.Buffer[1, "int32"]):
+        A[0] = 0
+        A[0] = A[0] + 1
+
+    def expected(A: T.Buffer[1, "int32"]):
+        A[0] = 0
+        A[0] = 1
+
+
+class TestSuppressSimplifyNonConditional(BaseBeforeAfter):
+    """Propagate a known value to later expressions.
+
+    Like TestSimplifyNonConditional, but with data-propagation turned off.
+    """
+
+    propagate_knowns_to_simplify_expressions = False
+
+    def before(A: T.Buffer[1, "int32"]):
+        A[0] = 0
+        A[0] = A[0] + 1
+
+    expected = before
+
+
+class TestSimplifyUsingTransitiveKnownBufferValue(BaseBeforeAfter):
+    """Propagate known buffer values
+
+    If a known value of a buffer depends on another known value, it
+    can be tracked backwards through both.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[1, "int32"]):
+        T.evaluate(T.assume(A[0] == 0))
+
+        A[0] = A[0] + 1
+        A[0] = A[0] + 1
+        A[0] = A[0] + 1
+
+        if A[0] == 3:
+            A[0] = 42
+
+    def expected(A: T.Buffer[1, "int32"]):
+        T.evaluate(T.assume(A[0] == 0))
+
+        A[0] = A[0] + 1
+        A[0] = A[0] + 1
+        A[0] = A[0] + 1
+
+        A[0] = 42
+
+
+class TestSimplifyRampIndexBroadcastValue(BaseBeforeAfter):
+    """Simplifications involving buffer loads with ramp indices"""
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[4, "int32"]):
+        A[T.ramp(0, 1, 4)] = T.broadcast(0, 4)
+
+        if A[0] == 0:
+            A[0] = 42
+
+        if A[1] == 0:
+            A[1] = 60
+
+    def expected(A: T.Buffer[4, "int32"]):
+        A[T.ramp(0, 1, 4)] = T.broadcast(0, 4)
+
+        A[0] = 42
+        A[1] = 60
+
+
+class TestSimplifyRampIndexRampValue(BaseBeforeAfter):
+    """Simplifications involving buffer loads with ramp indices"""
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[4, "int32"]):
+        A[T.ramp(0, 1, 4)] = T.ramp(11, 1, 4)
+
+        if A[0] == 11:
+            A[0] = 42
+
+        if A[1] == 12:
+            A[1] = 60
+
+    def expected(A: T.Buffer[4, "int32"]):
+        A[T.ramp(0, 1, 4)] = T.ramp(11, 1, 4)
+
+        A[0] = 42
+        A[1] = 60
+
+
+class TestSimplifyUsingPartiallyProvenBufferValueGather(BaseBeforeAfter):
+    """Propagate known buffer values in part of buffer.
+
+    Even if a constraint can't be solved for all values in an
+    assignment, it may be provable in part of a buffer.  Here, the
+    known 0 values in the padding of A produces known 0 values in the
+    padding of B.
+    """
+
+    transitively_prove_inequalities = True
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[24, "int32"], B: T.Buffer[24, "int32"], F: T.Buffer[3, "int32"]):
+        # A has non-zero values only in the range 3 <= i < 17
+        for i in T.serial(24):
+            T.evaluate(T.assume(((3 <= i) and (i < 17)) or A[i] == 0))
+
+        # After convoluting with F, B has non-zero values only in the
+        # range 3 <= i < 19.
+        for i in T.serial(24):
+            B[i] = 0
+            for f in T.serial(3):
+                if 0 <= i - f:
+                    B[i] = B[i] + A[i - f] * F[f]
+
+        # Which means that this loop is unnecessary.  It would be
+        # removed entirely in tir.transform.RemoveNoOp, but here we
+        # want to test that the simplification works as intended.
+        for i in T.serial(24):
+            if i < 3 or 19 <= i:
+                if B[i] != 0:
+                    B[i] = 0
+
+    def expected(A: T.Buffer[24, "int32"], B: T.Buffer[24, "int32"], F: T.Buffer[3, "int32"]):
+        for i in T.serial(24):
+            T.evaluate(T.assume(((3 <= i) and (i < 17)) or A[i] == 0))
+
+        for i in T.serial(24):
+            B[i] = 0
+            for f in T.serial(3):
+                if 0 <= i - f:
+                    B[i] = B[i] + A[i - f] * F[f]
+
+        for i in T.serial(24):
+            if i < 3 or 19 <= i:
+                T.evaluate(0)
+
+
+class TestSimplifyUsingPartiallyProvenBufferValueScatter(BaseBeforeAfter):
+    """Propagate known buffer values in part of buffer.
+
+    Like TestSimplifyUsingPartiallyProvenBufferValueGather, but the
+    compute loop is over the input buffer A, rather than the output
+    buffer B.
+    """
+
+    propagate_knowns_to_prove_conditional = True
+
+    def before(A: T.Buffer[24, "int32"], B: T.Buffer[24, "int32"], F: T.Buffer[3, "int32"]):
+        # A has non-zero values only in the range 3 <= i < 17
+        for i in T.serial(24):
+            T.evaluate(T.assume(((3 <= i) and (i < 17)) or A[i] == 0))
+
+        for i in T.serial(24):
+            B[i] = 0
+
+        # After convoluting with F, B has non-zero values only in the
+        # range 3 <= i < 19.
+        for i in T.serial(24):
+            for f in T.serial(3):
+                if i + f >= 0 and i + f < 24:
+                    B[i + f] = B[i + f] + A[i] * F[f]
+
+        # Which means that this loop is unnecessary.  It actually gets
+        # removed in tir.transform.RemoveNoOp, but here we want to
+        # test that the simplification works as intended.
+        for i in T.serial(24):
+            if i < 3 or 19 <= i:
+                if B[i] != 0:
+                    B[i] = 0
+
+    def expected(A: T.Buffer[24, "int32"], B: T.Buffer[24, "int32"], F: T.Buffer[3, "int32"]):
+        for i in T.serial(24):
+            T.evaluate(T.assume(((3 <= i) and (i < 17)) or A[i] == 0))
+
+        for i in T.serial(24):
+            B[i] = 0
+
+        for i in T.serial(24):
+            for f in T.serial(3):
+                if i + f < 24:
+                    B[i + f] = B[i + f] + A[i] * F[f]
+
+        for i in T.serial(24):
+            if i < 3 or 19 <= i:
+                T.evaluate(0)
+
+
+class TestSimplifyBufferStore(BaseBeforeAfter):
+    """Simplification using prior known"""
+
+    propagate_knowns_to_simplify_expressions = True
+
+    def before(A: T.Buffer[1, "int32"]):
+        A[0] = 5
+        A[0] = A[0] + 7
+
+    def expected(A: T.Buffer[1, "int32"]):
+        A[0] = 5
+        A[0] = 12
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 14342a37f5b0d79884923f1fef8a1935e9f5ba61 Mon Sep 17 00:00:00 2001
From: Adam Straw <astraw@octoml.ai>
Date: Wed, 16 Nov 2022 13:18:44 -0800
Subject: [PATCH 599/704] [Hexagon] Enable Hexagon User DMA bypass mode
 (#13381)

Enables Hexagon User DMA bypass mode based on user-specified dma_bypass_cache option for DMA copies between DDR and VTCM.

The upside of this change is increased DMA bandwidth (up to 40 GBps observed using test_vtcm_bandwidth.py) and compute throughput using a 3-stage pipeline --- cache read, compute, cache write (up to 38 Gops using test_parallel_hvx_load_vtcm.py).

The downside of this change is the potential for data coherency issues resulting from the need to manage the cache in software when using DMA bypass hence the user dma_bypass_cache option to enable or disable bypass mode.

The strategy to manage the cache in software centers around the requirement for Hexagon to operate on HexagonBuffer objects regardless of scope --- DDR or VTCM. When copying to / from a HexagonBuffer we aggressively invalidate the cache for both the source and destination, both before and after the copy. Also note that the copy is now implemented with memcpy instead of DMA. With the cache clean after copy to / from a HexagonBuffer we can now use DMA bypass mode. However, this software cache management strategy is NOT infallible --- if a HexagonBuffer becomes dirty in the cache prior to a DMA with bypass mode enabled we may see data coherency issues.

Also simplifies Hexagon DMA flows by removing the unused mem_copy instrinsic and lowering as well as the hexagon_user_dma_1d_sync helper function which is replaced by calls to HexagonUserDMA::Copy and HexagonUserDMA::Wait.

* restore vtcm tests; add TODO for ION buffer; check IsVtcm pointers
---
 include/tvm/tir/builtin.h                     |   7 -
 src/driver/driver_api.cc                      |   1 +
 src/runtime/hexagon/hexagon_buffer.cc         |  18 +-
 src/runtime/hexagon/hexagon_device_api.cc     |  29 +--
 src/runtime/hexagon/hexagon_user_dma.cc       |  61 ++---
 src/runtime/hexagon/hexagon_user_dma.h        |   2 +-
 src/runtime/hexagon/hexagon_vtcm_pool.h       |  12 +
 src/tir/op/builtin.cc                         |   3 -
 src/tir/transforms/lower_async_dma.cc         |  14 +-
 src/tir/transforms/lower_tvm_builtin.cc       |  15 --
 .../hexagon/hexagon_user_dma_tests.cc         | 151 +++++++++++-
 .../test_hexagon/test_async_dma_pipeline.py   |   3 +-
 .../test_hexagon/test_cache_read_write.py     | 226 ------------------
 .../test_parallel_hvx_load_vtcm.py            |   9 +-
 .../test_software_pipeline_async.py           |   6 +-
 .../python/contrib/test_hexagon/test_vtcm.py  |  63 +++++
 .../test_hexagon/test_vtcm_bandwidth.py       |   3 +-
 17 files changed, 284 insertions(+), 339 deletions(-)
 delete mode 100644 tests/python/contrib/test_hexagon/test_cache_read_write.py
 create mode 100644 tests/python/contrib/test_hexagon/test_vtcm.py

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index 9f6b7f9ce5d1..d830ea579aa7 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -713,13 +713,6 @@ TVM_DLL const Op& texture2d_store();
  */
 TVM_DLL const Op& texture2d_load();
 
-/*!
- * \brief Copy 1d memory from source to destination
- * Same semantics as memcpy(destination, source, size)
- * Allows for device specific implementations e.g. direct memory access (DMA)
- */
-TVM_DLL const Op& mem_copy();
-
 /*!
  * \brief Initiate a non-blocking DMA copy from source to destination
  */
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index bb4990e3e502..e5e3998b1e7b 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -53,6 +53,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.dma_bypass_cache", Bool);
 
 using tvm::Array;
 using tvm::transform::Pass;
diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc
index c58026e83cfe..b8c7bd2cb96e 100644
--- a/src/runtime/hexagon/hexagon_buffer.cc
+++ b/src/runtime/hexagon/hexagon_buffer.cc
@@ -26,13 +26,12 @@
 
 #include "hexagon_common.h"
 #include "hexagon_device_api.h"
+#include "qurt_memory.h"
 
 namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length);
-
 struct Allocation {
   Allocation(size_t allocation_nbytes, size_t alignment)
       : allocation_nbytes_(allocation_nbytes), alignment_(alignment) {}
@@ -237,8 +236,19 @@ void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet&
 
   // Finally, do the memory copies.
   for (const auto& copy : macro_copies) {
-    int error_code = hexagon_user_dma_1d_sync(copy.dest, copy.src, copy.num_bytes);
-    CHECK_EQ(error_code, 0);
+    // clean Hexagon cache before / after memcpy to ensure clean cache state to enable usage of DMA
+    // bypass mode for increased DMA bandwidth
+    // TODO(HWE): Switch to ION Buffer to avoid need for memcpy and potentially lighten or alleviate
+    // the burden of cache invalidation in this code
+    qurt_mem_cache_clean(reinterpret_cast<qurt_addr_t>(copy.dest), copy.num_bytes,
+                         QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE);
+    qurt_mem_cache_clean(reinterpret_cast<qurt_addr_t>(copy.src), copy.num_bytes,
+                         QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE);
+    memcpy(copy.dest, copy.src, copy.num_bytes);
+    qurt_mem_cache_clean(reinterpret_cast<qurt_addr_t>(copy.dest), copy.num_bytes,
+                         QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE);
+    qurt_mem_cache_clean(reinterpret_cast<qurt_addr_t>(copy.src), copy.num_bytes,
+                         QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE);
   }
 }
 
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 7221be03cc53..1c3b139d39a3 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -38,8 +38,6 @@ namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length);
-
 HexagonDeviceAPI* HexagonDeviceAPI::Global() {
   static auto* inst = new HexagonDeviceAPI();
   return inst;
@@ -206,39 +204,38 @@ void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void
   memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
 }
 
-TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy_DLTensor")
+TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy_dltensor")
     .set_body([](TVMArgs args, TVMRetValue* rv) {
       DLTensor* dst = args[0];
       DLTensor* src = args[1];
       int size = args[2];
+      ICHECK(size > 0);
+      bool bypass_cache = args[3];
 
-      hexagon_user_dma_1d_sync(dst->data, src->data, size);
+      int ret = DMA_RETRY;
+      do {
+        ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(SYNC_DMA_QUEUE, dst->data, src->data,
+                                                          size, bypass_cache);
+      } while (ret == DMA_RETRY);
+      CHECK(ret == DMA_SUCCESS);
+      HexagonDeviceAPI::Global()->UserDMA()->Wait(SYNC_DMA_QUEUE, 0);
 
       *rv = static_cast<int32_t>(0);
     });
 
-TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) {
-  void* dst = args[0];
-  void* src = args[1];
-  int size = args[2];
-
-  int error_code = hexagon_user_dma_1d_sync(dst, src, size);
-  CHECK_EQ(error_code, 0);
-
-  *rv = static_cast<int32_t>(0);
-});
-
 TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy").set_body([](TVMArgs args, TVMRetValue* rv) {
   int queue_id = args[0];
   void* dst = args[1];
   void* src = args[2];
   int size = args[3];
   ICHECK(size > 0);
+  bool bypass_cache = args[3];
 
   int ret = DMA_RETRY;
   do {
-    ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(queue_id, dst, src, size);
+    ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(queue_id, dst, src, size, bypass_cache);
   } while (ret == DMA_RETRY);
+  CHECK(ret == DMA_SUCCESS);
   *rv = static_cast<int32_t>(ret);
 });
 
diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc
index 619338e39688..c30fd645bbd0 100644
--- a/src/runtime/hexagon/hexagon_user_dma.cc
+++ b/src/runtime/hexagon/hexagon_user_dma.cc
@@ -32,7 +32,7 @@ unsigned int HexagonUserDMA::Init() {
   return status;
 }
 
-int HexagonUserDMA::Copy(int queue_id, void* dst, void* src, uint32_t length) {
+int HexagonUserDMA::Copy(int queue_id, void* dst, void* src, uint32_t length, bool bypass_cache) {
   // length limited to 24 bits
   if (length > DESC_LENGTH_MASK) {
     return DMA_FAILURE;
@@ -66,8 +66,24 @@ int HexagonUserDMA::Copy(int queue_id, void* dst, void* src, uint32_t length) {
   dma_desc_set_desctype(dma_desc, DESC_DESCTYPE_1D);
   dma_desc_set_dstcomp(dma_desc, DESC_COMP_NONE);
   dma_desc_set_srccomp(dma_desc, DESC_COMP_NONE);
-  dma_desc_set_bypassdst(dma_desc, DESC_BYPASS_OFF);
-  dma_desc_set_bypasssrc(dma_desc, DESC_BYPASS_OFF);
+
+  bool dst_is_ddr = !HexagonDeviceAPI::Global()->VtcmPool()->IsVtcm(dst, length);
+  bool src_is_ddr = !HexagonDeviceAPI::Global()->VtcmPool()->IsVtcm(src, length);
+
+  // VTCM -> DDR with bypass enabled
+  if (dst_is_ddr && !src_is_ddr && bypass_cache) {
+    dma_desc_set_bypassdst(dma_desc, DESC_BYPASS_ON);
+  } else {
+    dma_desc_set_bypassdst(dma_desc, DESC_BYPASS_OFF);
+  }
+
+  // DDR -> VTCM with bypass enabled
+  if (src_is_ddr && !dst_is_ddr && bypass_cache) {
+    dma_desc_set_bypasssrc(dma_desc, DESC_BYPASS_ON);
+  } else {
+    dma_desc_set_bypasssrc(dma_desc, DESC_BYPASS_OFF);
+  }
+
   dma_desc_set_order(dma_desc, DESC_ORDER_ORDER);
   dma_desc_set_done(dma_desc, DESC_DONE_INCOMPLETE);
   dma_desc_set_src(dma_desc, src32);
@@ -117,45 +133,6 @@ HexagonUserDMA::~HexagonUserDMA() {
   delete descriptors_;
 }
 
-int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) {
-  HexagonUserDMA* user_dma = HexagonDeviceAPI::Global()->UserDMA();
-
-  // One DMA transfer can copy at most DESC_LENGTH_MASK bytes.
-  // Make the common case quick.
-  if (length <= DESC_LENGTH_MASK) {
-    // sync DMA -> `Copy` and then `Wait(0)`
-    int ret_val = user_dma->Copy(SYNC_DMA_QUEUE, dst, src, length);
-    if (ret_val != DMA_SUCCESS) return ret_val;
-    user_dma->Wait(SYNC_DMA_QUEUE, 0);
-    return DMA_SUCCESS;
-  }
-
-  // Split big transfers into smaller transfers.
-  char* cast_src = static_cast<char*>(src);
-  char* cast_dst = static_cast<char*>(dst);
-  for (uint32_t i = 0; i < length;) {
-    // Ensure there is no overflow while updating i
-    uint32_t cur_len = std::min<uint32_t>(length - i, DESC_LENGTH_MASK);
-    // sync DMA -> `Copy` and then `Wait(0)`
-    int ret_val = user_dma->Copy(SYNC_DMA_QUEUE, &cast_dst[i], &cast_src[i], cur_len);
-    if (ret_val != DMA_SUCCESS) return ret_val;
-    user_dma->Wait(SYNC_DMA_QUEUE, 0);
-    // 2 cases for new val for i:
-    // 1. length - i <= DESC_LENGTH_MASK (<= MAX_UINT)
-    //    new_i = i + (length - i) = length, no more iter
-    //            and no overflow (since (length - i) <= (MAX_UINT - i))
-    // 2. length - i > DESC_LENGTH_MASK
-    //    length > (i + DESC_LENGTH_MASK)
-    //    new_i = (i + DESC_LENGTH_MASK)
-    //    length > new_i for next iter, we're done
-    //    length - i > DESC_LENGTH_MASK
-    //    and length <= MAX_UINT,
-    //    so MAX_UINT >= length > DESC_LEN_MASK + i
-    //    MAX_UINT > (DESC_LEN_MASK + i), so no overflow
-    i += cur_len;
-  }
-  return DMA_SUCCESS;
-}
 }  // namespace hexagon
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h
index 01e143d255b4..9397a16e3f03 100644
--- a/src/runtime/hexagon/hexagon_user_dma.h
+++ b/src/runtime/hexagon/hexagon_user_dma.h
@@ -52,7 +52,7 @@ class HexagonUserDMA {
    * \param length Length in bytes to copy
    * \returns Status: DMA_SUCCESS or DMA_FAILURE
    */
-  int Copy(int queue_id, void* dst, void* src, uint32_t length);
+  int Copy(int queue_id, void* dst, void* src, uint32_t length, bool bypass_cache);
 
   /*!
    * \brief Wait until the number of DMAs in flight is less than or equal to some maximum
diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.h b/src/runtime/hexagon/hexagon_vtcm_pool.h
index 1c44a455196c..2e0918e997c4 100644
--- a/src/runtime/hexagon/hexagon_vtcm_pool.h
+++ b/src/runtime/hexagon/hexagon_vtcm_pool.h
@@ -70,6 +70,18 @@ class HexagonVtcmPool {
   //! \brief Returns the total number of bytes in this pool
   size_t TotalBytes() { return reinterpret_cast<size_t>(vtcm_size_); }
 
+  bool IsVtcm(void* ptr, unsigned size) {
+    auto char_ptr = static_cast<char*>(ptr);
+    CHECK(char_ptr != nullptr);
+    auto char_vtcm = static_cast<char*>(vtcm_data_);
+    CHECK(vtcm_data_ != nullptr);
+
+    if (char_ptr >= char_vtcm && (char_ptr + size) <= (char_vtcm + vtcm_size_)) {
+      return true;
+    }
+    return false;
+  }
+
  private:
   //! \brief Total size of VTCM pool
   unsigned int vtcm_size_;
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 9feba142eb6a..56ecba9e9ed9 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -290,9 +290,6 @@ TIR_DEFINE_BUILTIN_FUNC(texture2d_load)
     .set_attr<TVectorizable>("TVectorizable", true)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kOpaque));
 
-TIR_DEFINE_BUILTIN_FUNC(mem_copy).set_attr<TCallEffectKind>("TCallEffectKind",
-                                                            Integer(CallEffectKind::kOpaque));
-
 TIR_DEFINE_BUILTIN_FUNC(dma_copy).set_attr<TCallEffectKind>("TCallEffectKind",
                                                             Integer(CallEffectKind::kOpaque));
 
diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc
index 417e9d61f263..b9ba4d41b7da 100644
--- a/src/tir/transforms/lower_async_dma.cc
+++ b/src/tir/transforms/lower_async_dma.cc
@@ -32,7 +32,7 @@ namespace tir {
 
 class AsyncDMALowerer : public StmtExprMutator {
  public:
-  AsyncDMALowerer() {}
+  explicit AsyncDMALowerer(bool dma_bypass_cache) : dma_bypass_cache_(dma_bypass_cache) {}
 
   Stmt VisitStmt_(const AttrStmtNode* op) final {
     // Convert this, for example:
@@ -52,7 +52,7 @@ class AsyncDMALowerer : public StmtExprMutator {
       int queue_id = queue_id_node->value;
 
       // abort if we have not seen this queue ID in `copy` transform
-      if (queue_ids.find(queue_id) == queue_ids.end()) {
+      if (queue_ids_.find(queue_id) == queue_ids_.end()) {
         DLOG(INFO) << "AsyncDMALowerer exiting because the queue ID observed in the "
                       "`async_wait_queue_scope` transform has not been previously observed in the "
                       "`async_commit_queue_scope` transform";
@@ -160,7 +160,7 @@ class AsyncDMALowerer : public StmtExprMutator {
 
       // now that we are about to perform the `copy` transform
       // save queue ID for inspection in `wait` transform
-      queue_ids.insert(queue_id);
+      queue_ids_.insert(queue_id);
 
       return Evaluate(Call(DataType::Int(32), builtin::dma_copy(),
                            {queue_id,
@@ -168,13 +168,14 @@ class AsyncDMALowerer : public StmtExprMutator {
                                  {BufferLoad(bufferstorenode->buffer, store_index)}),
                             Call(DataType::Handle(), builtin::address_of(),
                                  {BufferLoad(bufferloadnode->buffer, load_index)}),
-                            for_loop->extent * bufferloadnode->dtype.bytes()}));
+                            for_loop->extent * bufferloadnode->dtype.bytes(), dma_bypass_cache_}));
     }
     return StmtExprMutator::VisitStmt_(op);
   }
 
  private:
-  std::set<int> queue_ids;
+  std::set<int> queue_ids_;
+  bool dma_bypass_cache_;
 };
 
 namespace transform {
@@ -182,7 +183,8 @@ namespace transform {
 Pass LowerAsyncDMA() {
   auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) {
     auto fptr = f.CopyOnWrite();
-    fptr->body = AsyncDMALowerer()(std::move(fptr->body));
+    bool dma_bypass_cache = ctx->GetConfig<Bool>("tir.dma_bypass_cache", Bool(false)).value();
+    fptr->body = AsyncDMALowerer(dma_bypass_cache)(std::move(fptr->body));
     return f;
   };
   return CreatePrimFuncPass(pass_func, 0, "tir.LowerAsyncDMA", {});
diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc
index f79682ef7ecc..25d62539721f 100644
--- a/src/tir/transforms/lower_tvm_builtin.cc
+++ b/src/tir/transforms/lower_tvm_builtin.cc
@@ -315,8 +315,6 @@ class BuiltinLower : public StmtExprMutator {
       return MakeArray(op);
     } else if (op->op.same_as(builtin::tvm_context_id())) {
       return make_zero(op->dtype);
-    } else if (op->op.same_as(builtin::mem_copy())) {
-      return MakeMemCopy(op);
     } else if (op->op.same_as(builtin::dma_copy())) {
       return MakeDMACopy(op);
     } else if (op->op.same_as(builtin::dma_wait())) {
@@ -326,19 +324,6 @@ class BuiltinLower : public StmtExprMutator {
     }
   }
 
-  PrimExpr MakeMemCopy(const CallNode* op) {
-    PrimExpr dst = op->args[0];
-    PrimExpr src = op->args[1];
-    PrimExpr size = op->args[2];
-
-    std::string fdevapi_prefix =
-        "device_api." + std::string(runtime::DeviceName(device_type_.as<IntImmNode>()->value));
-
-    Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(),
-                            {StringImm(fdevapi_prefix + ".mem_copy"), dst, src, size});
-    return VisitExpr(call_packed);
-  }
-
   PrimExpr MakeDMACopy(const CallNode* op) {
     PrimExpr queue_id = op->args[0];
     PrimExpr dst = op->args[1];
diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
index b76c7c652e6a..e4ffe3a0de9c 100644
--- a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc
@@ -53,6 +53,10 @@ class HexagonUserDMATest : public ::testing::Test {
   char* src_char = nullptr;
   char* dst_char = nullptr;
   uint32_t length = 0x4000;  // 16KB
+  const bool ENABLE_BYPASS = true;
+  const bool DISABLE_BYPASS = false;
+  Optional<String> global_scope{"global"};
+  Optional<String> global_vtcm_scope{"global.vtcm"};
 };
 
 TEST_F(HexagonUserDMATest, wait) {
@@ -67,14 +71,14 @@ TEST_F(HexagonUserDMATest, bad_copy) {
   void* src64 = reinterpret_cast<void*>(bigaddr);
   void* dst64 = reinterpret_cast<void*>(bigaddr);
   uint32_t biglength = 0x1000000;
-  ASSERT_NE(user_dma->Copy(queue_id, dst64, src, length), DMA_SUCCESS);
-  ASSERT_NE(user_dma->Copy(queue_id, dst, src64, length), DMA_SUCCESS);
-  ASSERT_NE(user_dma->Copy(queue_id, dst, src, biglength), DMA_SUCCESS);
+  ASSERT_NE(user_dma->Copy(queue_id, dst64, src, length, DISABLE_BYPASS), DMA_SUCCESS);
+  ASSERT_NE(user_dma->Copy(queue_id, dst, src64, length, DISABLE_BYPASS), DMA_SUCCESS);
+  ASSERT_NE(user_dma->Copy(queue_id, dst, src, biglength, DISABLE_BYPASS), DMA_SUCCESS);
 }
 
 TEST_F(HexagonUserDMATest, sync_dma) {
   // kick off 1 DMA
-  ret = user_dma->Copy(queue_id, dst, src, length);
+  ret = user_dma->Copy(queue_id, dst, src, length, DISABLE_BYPASS);
   ASSERT_EQ(ret, DMA_SUCCESS);
 
   // wait for DMA to complete
@@ -89,7 +93,7 @@ TEST_F(HexagonUserDMATest, sync_dma) {
 TEST_F(HexagonUserDMATest, async_dma_wait) {
   // kick off 10x duplicate DMAs
   for (uint32_t i = 0; i < 10; ++i) {
-    ret = user_dma->Copy(queue_id, dst, src, length);
+    ret = user_dma->Copy(queue_id, dst, src, length, DISABLE_BYPASS);
     ASSERT_EQ(ret, DMA_SUCCESS);
   }
 
@@ -108,7 +112,7 @@ TEST_F(HexagonUserDMATest, async_dma_wait) {
 TEST_F(HexagonUserDMATest, async_dma_poll) {
   // kick off 10x duplicate DMAs
   for (uint32_t i = 0; i < 10; ++i) {
-    ret = user_dma->Copy(queue_id, dst, src, length);
+    ret = user_dma->Copy(queue_id, dst, src, length, DISABLE_BYPASS);
     ASSERT_EQ(ret, DMA_SUCCESS);
   }
 
@@ -131,7 +135,7 @@ TEST_F(HexagonUserDMATest, pipeline) {
 
   for (uint32_t i = 0; i < pipeline_depth; ++i) {
     ret |= user_dma->Copy(queue_id, dst_char + i * pipeline_length, src_char + i * pipeline_length,
-                          pipeline_length);
+                          pipeline_length, DISABLE_BYPASS);
   }
 
   user_dma->Wait(queue_id, 3);
@@ -168,35 +172,35 @@ TEST_F(HexagonUserDMATest, pipeline_write_queue) {
 
   for (uint32_t i = 0; i < pipeline_depth; ++i) {
     ret |= user_dma->Copy(queue_id, dst_char + i * pipeline_length, src_char + i * pipeline_length,
-                          pipeline_length);
+                          pipeline_length, DISABLE_BYPASS);
   }
 
   user_dma->Wait(queue_id, 3);
   for (uint32_t i = 0; i < pipeline_length; ++i) {
     dst_char[i]++;
   }
-  ret |= user_dma->Copy(write_queue, src_char, dst_char, pipeline_length);
+  ret |= user_dma->Copy(write_queue, src_char, dst_char, pipeline_length, DISABLE_BYPASS);
 
   user_dma->Wait(queue_id, 2);
   for (uint32_t i = pipeline_length; i < 2 * pipeline_length; ++i) {
     dst_char[i]++;
   }
   ret |= user_dma->Copy(write_queue, src_char + pipeline_length, dst_char + pipeline_length,
-                        pipeline_length);
+                        pipeline_length, DISABLE_BYPASS);
 
   user_dma->Wait(queue_id, 1);
   for (uint32_t i = 2 * pipeline_length; i < 3 * pipeline_length; ++i) {
     dst_char[i]++;
   }
   ret |= user_dma->Copy(write_queue, src_char + 2 * pipeline_length, dst_char + 2 * pipeline_length,
-                        pipeline_length);
+                        pipeline_length, DISABLE_BYPASS);
 
   user_dma->Wait(queue_id, 0);
   for (uint32_t i = 3 * pipeline_length; i < 4 * pipeline_length; ++i) {
     dst_char[i]++;
   }
   ret |= user_dma->Copy(write_queue, src_char + 3 * pipeline_length, dst_char + 3 * pipeline_length,
-                        pipeline_length);
+                        pipeline_length, DISABLE_BYPASS);
   user_dma->Wait(write_queue, 0);
 
   // verify
@@ -214,7 +218,7 @@ TEST_F(HexagonUserDMATest, overflow_ring_buffer) {
   for (uint32_t i = 0; i < number_of_dmas; ++i) {
     do {
       ret = user_dma->Copy(queue_id, dst_char + i * length_of_each_dma,
-                           src_char + i * length_of_each_dma, length_of_each_dma);
+                           src_char + i * length_of_each_dma, length_of_each_dma, DISABLE_BYPASS);
     } while (ret == DMA_RETRY);
     ASSERT_EQ(ret, DMA_SUCCESS);
   }
@@ -224,3 +228,124 @@ TEST_F(HexagonUserDMATest, overflow_ring_buffer) {
     ASSERT_EQ(src_char[i], dst_char[i]);
   }
 }
+
+TEST_F(HexagonUserDMATest, sync_dma_bypass) {
+  HexagonBuffer srchb(length, kHexagonAllocAlignment, global_scope);
+  HexagonBuffer dsthb(length, kHexagonAllocAlignment, global_scope);
+  HexagonBuffer vtcmhb(length, kHexagonAllocAlignment, global_vtcm_scope);
+
+  // init src, dst HexagonBuffers
+  srchb.CopyFrom(src, length);
+  dsthb.CopyFrom(dst, length);
+
+  // DDR src -> VTCM
+  ret = user_dma->Copy(queue_id, vtcmhb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // VTCM -> DDR dst
+  ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcmhb.GetPointer(), length, ENABLE_BYPASS);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // wait for DMAs to complete
+  user_dma->Wait(queue_id, 0);
+
+  // copy answer from dst HexagonBuffer
+  dsthb.CopyTo(dst, length);
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+}
+
+TEST_F(HexagonUserDMATest, sync_dma_bypass_vtcm_to_vtcm) {
+  HexagonBuffer srchb(length, kHexagonAllocAlignment, global_scope);
+  HexagonBuffer dsthb(length, kHexagonAllocAlignment, global_scope);
+  HexagonBuffer vtcm1hb(length, kHexagonAllocAlignment, global_vtcm_scope);
+  HexagonBuffer vtcm2hb(length, kHexagonAllocAlignment, global_vtcm_scope);
+
+  // init src, dst HexagonBuffers
+  srchb.CopyFrom(src, length);
+  dsthb.CopyFrom(dst, length);
+
+  // DDR src -> VTCM
+  ret = user_dma->Copy(queue_id, vtcm1hb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // VTCM -> VTCM
+  // NOTE: Cache bypass is disabled for VTCM -> VTCM transfers
+  ret =
+      user_dma->Copy(queue_id, vtcm2hb.GetPointer(), vtcm1hb.GetPointer(), length, DISABLE_BYPASS);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // VTCM -> DDR dst
+  ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcm2hb.GetPointer(), length, ENABLE_BYPASS);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // wait for DMAs to complete
+  user_dma->Wait(queue_id, 0);
+
+  // copy answer from dst HexagonBuffer
+  dsthb.CopyTo(dst, length);
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+}
+
+TEST_F(HexagonUserDMATest, sync_dma_bypass_) {
+  HexagonBuffer srchb(length, kHexagonAllocAlignment, global_scope);
+  HexagonBuffer dsthb(length, kHexagonAllocAlignment, global_scope);
+  HexagonBuffer vtcmhb(length, kHexagonAllocAlignment, global_vtcm_scope);
+
+  // init src, dst HexagonBuffers
+  srchb.CopyFrom(src, length);
+  dsthb.CopyFrom(dst, length);
+
+  // DDR src -> VTCM
+  ret = user_dma->Copy(queue_id, vtcmhb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // VTCM -> DDR dst
+  ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcmhb.GetPointer(), length, ENABLE_BYPASS);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // wait for DMAs to complete
+  user_dma->Wait(queue_id, 0);
+
+  // copy answer from dst HexagonBuffer
+  dsthb.CopyTo(dst, length);
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+
+  // change src data
+  for (uint32_t i = 0; i < length; ++i) {
+    src_char[i] = 2;
+  }
+
+  // copy new src data to HexagonBuffer
+  srchb.CopyFrom(src, length);
+
+  // DDR src -> VTCM
+  ret = user_dma->Copy(queue_id, vtcmhb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // VTCM -> DDR dst
+  ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcmhb.GetPointer(), length, ENABLE_BYPASS);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // wait for DMAs to complete
+  user_dma->Wait(queue_id, 0);
+
+  // copy answer from dst HexagonBuffer
+  dsthb.CopyTo(dst, length);
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+}
diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
index a0b5e1e7e42f..a35eefd1a300 100644
--- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -89,6 +89,7 @@ def evaluate(
     with tvm.transform.PassContext(
         config={
             "tir.use_async_copy": use_async_copy,
+            "tir.dma_bypass_cache": 1,
             "tir.merge_async_commit_queue_scope": merge_async_commit_queue_scope,
         }
     ):
@@ -172,7 +173,7 @@ class TestAsyncDMAPipeline:
     size_a = tvm.testing.parameter(
         1024,
         64 * 64,
-        128 * 64,
+        # 128 * 64, # Only works on 8Gen1 HDK's
     )
 
     size_w = tvm.testing.parameter(
diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py
deleted file mode 100644
index 3ac297fd80d8..000000000000
--- a/tests/python/contrib/test_hexagon/test_cache_read_write.py
+++ /dev/null
@@ -1,226 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Lower cache_read and cache_write to Hexagon DMA via tensorize """
-
-import numpy as np
-
-import tvm.testing
-from tvm import te, tir
-from tvm.contrib.hexagon.session import Session
-from tvm.script import tir as T
-
-from .infrastructure import get_hexagon_target
-
-
-def intrin_mem_copy(shape, dtype, dst_scope, src_scope):
-    """Define and return tensor intrinsic for mem copy"""
-    src = te.placeholder(shape=shape, dtype=dtype, name="src")
-    dst = te.compute(shape, lambda i: src[i], name="dst")
-    size = shape[0] * np.dtype(dtype).itemsize
-
-    src_buffer = tvm.tir.decl_buffer(
-        shape,
-        dtype,
-        scope=src_scope,
-        offset_factor=1,
-        name="mem_copy_src_buffer",
-    )
-
-    dst_buffer = tvm.tir.decl_buffer(
-        shape,
-        dtype,
-        scope=dst_scope,
-        offset_factor=1,
-        name="mem_copy_dst_buffer",
-    )
-
-    zero_indices = [0 for _ in shape]
-
-    def intrin_func(ins, outs):
-        ir_builder = tvm.tir.ir_builder.create()
-
-        _src = ins[0]
-        _dst = outs[0]
-
-        dst_handle = ir_builder.buffer_ptr(dst_buffer)
-        src_handle = ir_builder.buffer_ptr(src_buffer)
-
-        ir_builder.emit(
-            tvm.tir.call_intrin(
-                "handle",
-                "tir.mem_copy",
-                tvm.tir.call_intrin("handle", "tir.address_of", dst_handle[zero_indices]),
-                tvm.tir.call_intrin("handle", "tir.address_of", src_handle[zero_indices]),
-                size,
-            )
-        )
-        return ir_builder.get()
-
-    return te.decl_tensor_intrin(dst.op, intrin_func, binds={src: src_buffer, dst: dst_buffer})
-
-
-def verify(hexagon_session: Session, schedule, x_tensor, y_tensor, z_tensor, size):
-    """Verify correctness with reference from numpy"""
-    print(tvm.lower(schedule, [x_tensor, y_tensor, z_tensor]))
-
-    func = tvm.build(
-        schedule,
-        [x_tensor, y_tensor, z_tensor],
-        get_hexagon_target("v68"),
-        name="dmacpy",
-    )
-
-    mod = hexagon_session.load_module(func)
-    x_array = tvm.nd.array(
-        np.random.randint(low=-128, high=127, size=size, dtype=x_tensor.dtype),
-        device=hexagon_session.device,
-    )
-    y_array = tvm.nd.array(
-        np.random.randint(low=-128, high=127, size=size, dtype=y_tensor.dtype),
-        device=hexagon_session.device,
-    )
-    z_array = tvm.nd.array(
-        np.random.randint(low=-128, high=127, size=size, dtype=z_tensor.dtype),
-        device=hexagon_session.device,
-    )
-    mod["dmacpy"](x_array, y_array, z_array)
-
-    ref = x_array.numpy() + y_array.numpy()
-    np.testing.assert_equal(z_array.numpy(), ref)
-
-
-@tvm.testing.requires_hexagon
-def test_cache_read_write(hexagon_session: Session):
-    """Test cache_read and cache_write to global.vtcm for hexagon"""
-    size = 128
-    outer_shape = (size,)
-    factor = 16
-    inner_shape = (factor,)
-    dtype = "int8"
-
-    x_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="x")
-    y_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="y")
-    z_tensor = te.compute(outer_shape, lambda i: x_tensor[i] + y_tensor[i], name="z")
-    s = te.create_schedule(z_tensor.op)
-
-    x_vtcm = s.cache_read(x_tensor, "global.vtcm", [z_tensor])
-    y_vtcm = s.cache_read(y_tensor, "global.vtcm", [z_tensor])
-    z_vtcm = s.cache_write(z_tensor, "global.vtcm")
-
-    zouter, _ = s[z_vtcm].split(z_vtcm.op.axis[0], factor=factor)
-
-    s[x_vtcm].compute_at(s[z_vtcm], zouter)
-    s[y_vtcm].compute_at(s[z_vtcm], zouter)
-
-    mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.vtcm", "global")
-
-    (cache_read_x,) = s[x_vtcm].op.axis
-    s[x_vtcm].tensorize(cache_read_x, mem_copy_read)
-
-    (cache_read_y,) = s[y_vtcm].op.axis
-    s[y_vtcm].tensorize(cache_read_y, mem_copy_read)
-
-    mem_copy_write = intrin_mem_copy(outer_shape, dtype, "global", "global.vtcm")
-
-    (cache_write_z,) = s[z_tensor].op.axis
-    s[z_tensor].tensorize(cache_write_z, mem_copy_write)
-
-    verify(hexagon_session, s, x_tensor, y_tensor, z_tensor, size)
-
-
-def layout_transform_2d(n):
-    return [n // 16, te.AXIS_SEPARATOR, n % 16]
-
-
-@tvm.testing.requires_hexagon
-def test_cache_read_write_2d(hexagon_session: Session):
-    """Test 2D cache_read and cache_write to global.vtcm for hexagon"""
-    size = 128
-    outer_shape = (size,)
-    factor = 16
-    inner_shape = (factor,)
-    dtype = "int8"
-
-    x_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="x")
-    y_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="y")
-    z_tensor = te.compute(outer_shape, lambda i: x_tensor[i] + y_tensor[i], name="z")
-    s = te.create_schedule(z_tensor.op)
-
-    x_vtcm = s.cache_read(x_tensor, "global.vtcm", [z_tensor])
-    y_vtcm = s.cache_read(y_tensor, "global.vtcm", [z_tensor])
-    z_vtcm = s.cache_write(z_tensor, "global.vtcm")
-
-    layout_x_vtcm = s[x_vtcm].transform_layout(layout_transform_2d)
-    layout_y_vtcm = s[y_vtcm].transform_layout(layout_transform_2d)
-    _ = s[z_vtcm].transform_layout(layout_transform_2d)
-
-    mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.vtcm", "global")
-    s[x_vtcm].tensorize(layout_x_vtcm[1], mem_copy_read)
-    s[y_vtcm].tensorize(layout_y_vtcm[1], mem_copy_read)
-
-    # The loop schedule over `z` is not modified when calling `transform_layout`
-    # on `z_vtcm` above therefore we must call `split` to modify the loop schedule
-    # over `z` to match the layout of `z_vtcm` such that we can accurately write
-    # `z_vtcm` back to `z` using memory copy intrinsic
-    _, zinner = s[z_tensor].split(z_tensor.op.axis[0], factor=factor)
-    mem_copy_write = intrin_mem_copy(inner_shape, dtype, "global", "global.vtcm")
-    s[z_tensor].tensorize(zinner, mem_copy_write)
-
-    verify(hexagon_session, s, x_tensor, y_tensor, z_tensor, size)
-
-
-@T.prim_func
-def scale_by_two(buffer_a: T.Buffer[(8192,), "int8"], buffer_c: T.Buffer[(8192,), "int8"]):
-    for i in T.serial(
-        0,
-        8192,
-    ):
-        with T.block("C"):
-            buffer_c[i] = buffer_a[i] * T.int8(2)
-
-
-def test_vtcm_lowering():
-    """Test lowering with vtcm mem scope"""
-    mod = tvm.IRModule.from_expr(scale_by_two.with_attr("global_symbol", "main"))
-    sch = tir.Schedule(mod, debug_mask="all")
-    block_c = sch.get_block("C")
-    (flat,) = sch.get_loops(block_c)
-    outer, _, _, _ = sch.split(flat, factors=[8, 4, 2, 128])
-    cache_block = sch.cache_read(block_c, 0, storage_scope="global.vtcm")
-    sch.compute_at(cache_block, outer)
-    lowered = tvm.lower(sch.mod["main"])
-
-    def ir_module_has_allocate_nodes(irmod):
-        nallocs = 0
-
-        def _visit(stmt):
-            nonlocal nallocs
-            if isinstance(stmt, tvm.tir.Allocate):
-                nallocs += 1
-
-        tvm.tir.stmt_functor.post_order_visit(irmod["main"].body, _visit)
-        return nallocs
-
-    assert not ir_module_has_allocate_nodes(lowered), (
-        "AllocateNode found in lowered IRModule, "
-        "VTCM allocations should have been lowered to tir.nd_mem_alloc_with_scope"
-    )
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
index e6fc0a3c201c..6cca44388d09 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -213,7 +213,7 @@ def operator(
         )
         T.evaluate(
             T.tvm_call_packed(
-                "device_api.hexagon.mem_copy_DLTensor",
+                "device_api.hexagon.dma_copy_dltensor",
                 T.tvm_stack_make_array(
                     a_global_vtcm.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
@@ -233,12 +233,13 @@ def operator(
                     dtype="handle",
                 ),
                 T.cast(size, dtype="int"),
+                True,  # bypass cache
                 dtype="int32",
             )
         )
         T.evaluate(
             T.tvm_call_packed(
-                "device_api.hexagon.mem_copy_DLTensor",
+                "device_api.hexagon.dma_copy_dltensor",
                 T.tvm_stack_make_array(
                     b_global_vtcm.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
@@ -258,6 +259,7 @@ def operator(
                     dtype="handle",
                 ),
                 T.cast(size, dtype="int"),
+                True,  # bypass cache
                 dtype="int32",
             )
         )
@@ -279,7 +281,7 @@ def operator(
                 )
         T.evaluate(
             T.tvm_call_packed(
-                "device_api.hexagon.mem_copy_DLTensor",
+                "device_api.hexagon.dma_copy_dltensor",
                 T.tvm_stack_make_array(
                     c_buffer.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
@@ -299,6 +301,7 @@ def operator(
                     dtype="handle",
                 ),
                 T.cast(size, dtype="int"),
+                True,  # bypass cache
                 dtype="int32",
             )
         )
diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
index ba7513a4f39c..387d0f20c4c2 100644
--- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
+++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py
@@ -178,7 +178,11 @@ def test_async_software_pipeline(
             ref = reference(a_np, b_np)
 
         with tvm.transform.PassContext(
-            config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": False}
+            config={
+                "tir.use_async_copy": 1,
+                "tir.dma_bypass_cache": 1,
+                "tir.merge_async_commit_queue_scope": False,
+            }
         ):
             # tvm.lower(schedule.mod["main"]).show()
             func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68"))
diff --git a/tests/python/contrib/test_hexagon/test_vtcm.py b/tests/python/contrib/test_hexagon/test_vtcm.py
new file mode 100644
index 000000000000..11188436a318
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_vtcm.py
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""VTCM Tests"""
+
+import tvm.testing
+from tvm import tir
+from tvm.script import tir as T
+
+
+@T.prim_func
+def scale_by_two(buffer_a: T.Buffer[(8192,), "int8"], buffer_c: T.Buffer[(8192,), "int8"]):
+    for i in T.serial(
+        0,
+        8192,
+    ):
+        with T.block("C"):
+            buffer_c[i] = buffer_a[i] * T.int8(2)
+
+
+def test_vtcm_lowering():
+    """Test lowering with vtcm mem scope"""
+    mod = tvm.IRModule.from_expr(scale_by_two.with_attr("global_symbol", "main"))
+    sch = tir.Schedule(mod, debug_mask="all")
+    block_c = sch.get_block("C")
+    (flat,) = sch.get_loops(block_c)
+    outer, _, _, _ = sch.split(flat, factors=[8, 4, 2, 128])
+    cache_block = sch.cache_read(block_c, 0, storage_scope="global.vtcm")
+    sch.compute_at(cache_block, outer)
+    lowered = tvm.lower(sch.mod["main"])
+
+    def ir_module_has_allocate_nodes(irmod):
+        nallocs = 0
+
+        def _visit(stmt):
+            nonlocal nallocs
+            if isinstance(stmt, tvm.tir.Allocate):
+                nallocs += 1
+
+        tvm.tir.stmt_functor.post_order_visit(irmod["main"].body, _visit)
+        return nallocs
+
+    assert not ir_module_has_allocate_nodes(lowered), (
+        "AllocateNode found in lowered IRModule, "
+        "VTCM allocations should have been lowered to tir.nd_mem_alloc_with_scope"
+    )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
index 980ac0cf4c2a..0b6b52335cb5 100644
--- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
+++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
@@ -61,7 +61,7 @@ def operator(a: T.handle, a_v: T.handle) -> None:
         a_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
         T.evaluate(
             T.tvm_call_packed(
-                "device_api.hexagon.mem_copy_DLTensor",
+                "device_api.hexagon.dma_copy_dltensor",
                 T.tvm_stack_make_array(
                     a_global_vtcm.data,
                     T.tvm_stack_make_shape(size, dtype="handle"),
@@ -81,6 +81,7 @@ def operator(a: T.handle, a_v: T.handle) -> None:
                     dtype="handle",
                 ),
                 T.cast(size, dtype="int"),
+                True,  # bypass cache
                 dtype="int32",
             )
         )

From eca361de4f20c82b5d74aa3f89aee9cfc592abaa Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Wed, 16 Nov 2022 19:10:37 -0500
Subject: [PATCH 600/704] [MetaSchedule] Fix segfault in gradient based
 scheduler (#13399)

Fix segfault in gradient based scheduler

Gradient based scheduler would segfault if no candidates can
be returned by the search strategy for some workload.
It's expected to tune other workloads that have candidates.
---
 .../task_scheduler/gradient_based.cc          | 17 ++--
 .../test_meta_schedule_task_scheduler.py      | 85 +++++++++++++++++++
 2 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/src/meta_schedule/task_scheduler/gradient_based.cc b/src/meta_schedule/task_scheduler/gradient_based.cc
index e0470337b536..5b261eec32a4 100644
--- a/src/meta_schedule/task_scheduler/gradient_based.cc
+++ b/src/meta_schedule/task_scheduler/gradient_based.cc
@@ -68,7 +68,9 @@ class GradientBasedNode final : public TaskSchedulerNode {
     }
     if (round_robin_rounds_ == n_tasks) {
       for (int i = 0; i < n_tasks; ++i) {
-        this->JoinRunningTask(i);
+        if (this->tasks_[i]->runner_futures.defined()) {
+          this->JoinRunningTask(i);
+        }
       }
       ++round_robin_rounds_;
     }
@@ -92,11 +94,10 @@ class GradientBasedNode final : public TaskSchedulerNode {
     for (int task_id : tasks_alive) {
       const std::vector<double>& best_latency = this->best_latency_history_.at(task_id);
       int n = best_latency.size();
-      ICHECK_GE(n, 1);
       double task_weight = this->tasks_[task_id]->task_weight;
       int w = this->window_size;
-      double best = best_latency[n - 1];
-      if (best < 1e9) {
+      if (n > 0 && best_latency[n - 1] < 1e9) {
+        double best = best_latency[n - 1];
         double g1 = (n >= 1 + w) ? (best_latency[n - 1 - w] - best) / w : 0.0;
         double g2 = best / n;
         double g = alpha * g1 + (1 - alpha) * g2;
@@ -124,9 +125,11 @@ class GradientBasedNode final : public TaskSchedulerNode {
   Array<RunnerResult> JoinRunningTask(int task_id) final {
     Array<RunnerResult> results = TaskSchedulerNode::JoinRunningTask(task_id);
     TaskRecordNode* task = this->tasks_[task_id].get();
-    this->best_latency_history_.at(task_id).push_back(
-        *std::min_element(task->latency_ms.begin(),  //
-                          task->latency_ms.end()));
+    if (task->latency_ms.size() > 0) {
+      this->best_latency_history_.at(task_id).push_back(
+          *std::min_element(task->latency_ms.begin(),  //
+                            task->latency_ms.end()));
+    }
     return results;
   }
 };
diff --git a/tests/python/unittest/test_meta_schedule_task_scheduler.py b/tests/python/unittest/test_meta_schedule_task_scheduler.py
index 33a019e3c555..ab0e3f0123dd 100644
--- a/tests/python/unittest/test_meta_schedule_task_scheduler.py
+++ b/tests/python/unittest/test_meta_schedule_task_scheduler.py
@@ -20,6 +20,7 @@
 from typing import Set
 
 import pytest
+
 import tvm
 import tvm.testing
 from tvm import meta_schedule as ms
@@ -352,6 +353,89 @@ def test_meta_schedule_task_scheduler_multiple_gradient_based():
         )
 
 
+def test_meta_schedule_task_scheduler_gradient_based_with_null_search_strategy():
+    """
+    When search strategy of one task returns empty list of candidates or None,
+    the scheduler should continue working as normal for other tasks
+    """
+
+    @ms.derived_object
+    class NullSearchStrategy(ms.search_strategy.PySearchStrategy):
+        def __init__(self, rounds_with_empty_candidates):
+            self.rounds_with_empty_candidates = rounds_with_empty_candidates
+
+        def _initialize_with_tune_context(self, context: "TuneContext") -> None:
+            pass
+
+        def pre_tuning(self, *args, **kwargs):
+            pass
+
+        def post_tuning(self):
+            pass
+
+        def generate_measure_candidates(self):
+            """
+            Returns empty list to indicate there is no result from search, while
+            the search isn't ended.
+            """
+            if self.rounds_with_empty_candidates:
+                self.rounds_with_empty_candidates -= 1
+                return []
+            return None
+
+        def notify_runner_results(self, *args, **kwargs):
+            pass
+
+        def clone(self):
+            return NullSearchStrategy(n=self.n)
+
+    tasks = [
+        ms.TuneContext(
+            MatmulModule,
+            target=tvm.target.Target("llvm"),
+            space_generator=_schedule_matmul,
+            search_strategy=NullSearchStrategy(rounds_with_empty_candidates=5),
+            task_name="Matmul",
+            rand_state=42,
+        ),
+        ms.TuneContext(
+            BatchMatmulModule,
+            target=tvm.target.Target("llvm"),
+            space_generator=_schedule_batch_matmul,
+            search_strategy=NullSearchStrategy(rounds_with_empty_candidates=0),
+            task_name="BatchMatmul",
+            rand_state=0x114514,
+        ),
+        ms.TuneContext(
+            MatmulReluModule,
+            target=tvm.target.Target("llvm"),
+            space_generator=_schedule_matmul,
+            search_strategy=ms.search_strategy.ReplayTrace(),
+            task_name="MatmulRelu",
+            rand_state=0xDEADBEEF,
+        ),
+    ]
+    database = ms.database.MemoryDatabase()
+    gradient_based = ms.task_scheduler.GradientBased()
+    gradient_based.tune(
+        tasks,
+        task_weights=[1.0, 1.0, 1.0],
+        builder=DummyBuilder(),
+        runner=DummyRunner(),
+        database=database,
+        measure_callbacks=[ms.measure_callback.AddToDatabase()],
+        max_trials_global=30,
+        max_trials_per_task=10,
+        num_trials_per_iter=6,
+        cost_model=None,
+    )
+
+    assert len(database) == 10
+    assert len(database.get_top_k(database.commit_workload(MatmulModule), 100)) == 0
+    assert len(database.get_top_k(database.commit_workload(BatchMatmulModule), 100)) == 0
+    assert len(database.get_top_k(database.commit_workload(MatmulReluModule), 100)) == 10
+
+
 if __name__ == "__main__":
     test_meta_schedule_task_scheduler_single()
     test_meta_schedule_task_scheduler_multiple()
@@ -359,3 +443,4 @@ def test_meta_schedule_task_scheduler_multiple_gradient_based():
     test_meta_schedule_task_scheduler_avoid_cyclic()
     test_meta_schedule_task_scheduler_override_next_task_id_only()
     test_meta_schedule_task_scheduler_multiple_gradient_based()
+    test_meta_schedule_task_scheduler_gradient_based_with_null_search_strategy()

From 59abd9e1068f7d6d4230c428042918fbfa0e3c44 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Wed, 16 Nov 2022 16:11:16 -0800
Subject: [PATCH 601/704] [Build] Fix MSVC compile option /bigobj (#13411)

/bigobj was not correctly added to C++ targets. This caused error using cmake build.
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b8d8f4c0239c..011b593157a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -152,7 +152,7 @@ if(MSVC)
   set(CMAKE_SUPPRESS_REGENERATION ON)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /bigobj")
+  add_compile_options(/bigobj)
 
   # MSVC already errors on undefined symbols, no additional flag needed.
   set(TVM_NO_UNDEFINED_SYMBOLS "")

From ad5c811411aaacf9e99594eea69a2dc343cd6c7e Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Wed, 16 Nov 2022 18:57:54 -0800
Subject: [PATCH 602/704] [TIR] Unify index data type when creating prim func
 (#13327)

* Added data type pass unification pass to by default promote data types of all indices and shapes to int64 when creating prim func.
* Added some fixes for lowering passes to make it compatible with int64 data type.
---
 include/tvm/tir/data_type_rewriter.h          | 155 +++++++
 include/tvm/tir/stmt.h                        |   1 +
 include/tvm/tir/stmt_functor.h                |  50 ---
 python/tvm/te/operation.py                    |   8 +-
 src/relay/backend/utils.cc                    |   2 +-
 src/te/operation/create_primfunc.cc           |  25 +-
 src/te/operation/create_primfunc.h            |   8 +-
 src/tir/ir/data_type_rewriter.cc              | 389 +++++++++++++++++-
 src/tir/ir/stmt_functor.cc                    |   1 +
 .../schedule/primitive/blockize_tensorize.cc  |  15 +
 src/tir/transforms/lower_match_buffer.cc      |   4 +-
 src/tir/transforms/narrow_datatype.cc         | 139 ++-----
 tests/cpp/data_type_rewriter_test.cc          |   2 +-
 .../test_meta_schedule_relay_integration.py   |  36 +-
 .../unittest/test_te_create_primfunc.py       |  25 +-
 .../unittest/test_tir_schedule_tensorize.py   |  20 +-
 .../test_tir_transform_narrow_datatype.py     |  33 +-
 17 files changed, 675 insertions(+), 238 deletions(-)
 create mode 100644 include/tvm/tir/data_type_rewriter.h

diff --git a/include/tvm/tir/data_type_rewriter.h b/include/tvm/tir/data_type_rewriter.h
new file mode 100644
index 000000000000..378addaba528
--- /dev/null
+++ b/include/tvm/tir/data_type_rewriter.h
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file data_type_rewriter.h
+ * \brief Rewrite the data type of expressions.
+ */
+#ifndef TVM_TIR_DATA_TYPE_REWRITER_H_
+#define TVM_TIR_DATA_TYPE_REWRITER_H_
+
+#include <tvm/tir/stmt_functor.h>
+
+#include <unordered_map>
+
+namespace tvm {
+namespace tir {
+
+/*!
+ * \brief Legalize the data types of expressions to make sure they are consistent with other
+ * parts of the program.
+ *
+ * It enforces the following rules:
+ * - The data type of the index variable in a loop must be consistent with the data type of the loop
+ *  bounds.
+ * - The data type of the binary and ternary expressions must be consistent with the data types of
+ * each of their operands.
+ * - The data type of the bounds and binding values of block iter vars must be consistent with the
+ * data type of the block iter vars.
+ *
+ * Usually we enforce the consistency of data types when constructing the IR nodes. However, such
+ * inconsistency may happen as a result of IR mutation in some passes. This class can be used as
+ * base class of such passes to ensure the consistency of data types.
+ */
+class DataTypeLegalizer : public StmtExprMutator {
+ protected:
+  Stmt VisitStmt_(const ForNode* op) override;
+  Stmt VisitStmt_(const AttrStmtNode* op) override;
+  Stmt VisitStmt_(const BlockRealizeNode* op) override;
+  Stmt VisitStmt_(const BlockNode* op) override;
+  PrimExpr VisitExpr_(const SelectNode* op) override;
+  PrimExpr VisitExpr_(const RampNode* op) override;
+  PrimExpr VisitExpr_(const AddNode* op) override;
+  PrimExpr VisitExpr_(const SubNode* op) override;
+  PrimExpr VisitExpr_(const MulNode* op) override;
+  PrimExpr VisitExpr_(const DivNode* op) override;
+  PrimExpr VisitExpr_(const ModNode* op) override;
+  PrimExpr VisitExpr_(const FloorDivNode* op) override;
+  PrimExpr VisitExpr_(const FloorModNode* op) override;
+  PrimExpr VisitExpr_(const MinNode* op) override;
+  PrimExpr VisitExpr_(const MaxNode* op) override;
+  PrimExpr VisitExpr_(const EQNode* op) override;
+  PrimExpr VisitExpr_(const NENode* op) override;
+  PrimExpr VisitExpr_(const LTNode* op) override;
+  PrimExpr VisitExpr_(const LENode* op) override;
+  PrimExpr VisitExpr_(const GTNode* op) override;
+  PrimExpr VisitExpr_(const GENode* op) override;
+  PrimExpr VisitExpr_(const CallNode* op) override;
+  PrimExpr VisitExpr_(const CastNode* op) override;
+
+  using StmtExprMutator::VisitExpr_;
+  using StmtExprMutator::VisitStmt_;
+
+  // a map from IterVar before rewrite to that after rewrite,
+  // ensures one old IterVar maps to exactly one new IterVar
+  std::unordered_map<const IterVarNode*, IterVar> ivmap_;
+};
+
+/*!
+ * \brief Data type rewriter for buffer indices.
+ *
+ * Detect the components of buffer indices that should be considered for data type rewriting.
+ * This class doesn't perform actual rewriting of data types. During recursive visiting, the
+ * internal flags `is_enabled_` and `is_conditional_` are used to indicate whether the current
+ * expression is a buffer index or a conditional expression, which can be used in the sub-classes to
+ * implement different rewriting rules.
+ */
+class IndexDataTypeRewriter : public DataTypeLegalizer {
+ protected:
+  using Parent = DataTypeLegalizer;
+  using Parent::VisitExpr_;
+  using Parent::VisitStmt_;
+
+  Stmt VisitStmt_(const BlockRealizeNode* op) override;
+  Stmt VisitStmt_(const BlockNode* op) override;
+  Stmt VisitStmt_(const BufferStoreNode* op) override;
+  PrimExpr VisitExpr_(const BufferLoadNode* op) override;
+  Array<PrimExpr> VisitIndices(Array<PrimExpr> indices);
+  Stmt VisitStmt_(const IfThenElseNode* op) override;
+  Stmt VisitStmt_(const DeclBufferNode* op) override;
+  Stmt VisitStmt_(const AllocateNode* op) override;
+  PrimExpr VisitExpr_(const EQNode* op) override;
+  PrimExpr VisitExpr_(const NENode* op) override;
+  PrimExpr VisitExpr_(const LTNode* op) override;
+  PrimExpr VisitExpr_(const LENode* op) override;
+  PrimExpr VisitExpr_(const GTNode* op) override;
+  PrimExpr VisitExpr_(const GENode* op) override;
+  PrimExpr VisitExpr_(const CallNode* op) override;
+  Stmt VisitStmt_(const ForNode* op) override;
+
+  Buffer VisitBuffer(const Buffer& buffer);
+  Buffer GetRemappedBuffer(const Buffer& buffer);
+  Map<String, ObjectRef> VisitBlockAnnotations(const Map<String, ObjectRef>& annotations);
+  BufferRegion VisitBufferRegion(const BufferRegion& region);
+  IterVar VisitIterVar(const IterVar& iter_var);
+  // indicator of index expr to rewrite
+  bool is_enabled_{false};
+  // indicator of condition
+  bool is_condition_{false};
+
+  Map<Var, Var> var_remap_;
+  Map<Buffer, Buffer> buffer_remap_;
+};
+
+/*!
+ * \brief Normalize the data types of buffer shapes and indices to the same data type.
+ *
+ * This pass rewrites the data types of buffer shapes and indices to the specified data type. It
+ * assumes the specified data type is large enough to hold the original ranges of buffer shapes and
+ * indices.
+ */
+class IndexDataTypeNormalizer : public IndexDataTypeRewriter {
+ public:
+  explicit IndexDataTypeNormalizer(DataType target_data_type);
+  PrimFunc Rewrite(PrimFunc func);
+
+ protected:
+  using Parent = IndexDataTypeRewriter;
+  using Parent::VisitExpr_;
+  using Parent::VisitStmt_;
+  PrimExpr VisitExpr_(const IntImmNode* op) final;
+  PrimExpr VisitExpr_(const VarNode* op) final;
+
+  DataType target_data_type_ = DataType::Int(64);
+};
+
+}  // namespace tir
+}  // namespace tvm
+
+#endif  // TVM_TIR_DATA_TYPE_REWRITER_H_
diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index e0e191b282e5..6865326b8849 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -858,6 +858,7 @@ class IfThenElse : public Stmt {
                      Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(IfThenElse, Stmt, IfThenElseNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(IfThenElseNode);
 };
 
 /*!
diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index 8057108803db..9f4b4b40e4cd 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -485,56 +485,6 @@ bool ContainsNode(const Stmt& stmt) {
   return visitor.contains_node;
 }
 
-/*!
- * \brief Legalize the data types of expressions to make sure they are consistent with other
- * parts of the program.
- *
- * It enforces the following rules:
- * - The data type of the index variable in a loop must be consistent with the data type of the loop
- *  bounds.
- * - The data type of the binary and ternary expressions must be consistent with the data types of
- * each of their operands.
- * - The data type of the bounds and binding values of block iter vars must be consistent with the
- * data type of the block iter vars.
- *
- * Usually we enforce the consistency of data types when constructing the IR nodes. However, such
- * inconsistency may happen as a result of IR mutation in some passes. This class can be used as
- * base class of such passes to ensure the consistency of data types.
- */
-class DataTypeLegalizer : public StmtExprMutator {
- protected:
-  Stmt VisitStmt_(const ForNode* op) override;
-
-  Stmt VisitStmt_(const AttrStmtNode* op) override;
-  Stmt VisitStmt_(const BlockRealizeNode* op) override;
-  Stmt VisitStmt_(const BlockNode* op) override;
-  PrimExpr VisitExpr_(const SelectNode* op) override;
-  PrimExpr VisitExpr_(const RampNode* op) override;
-  PrimExpr VisitExpr_(const AddNode* op) override;
-  PrimExpr VisitExpr_(const SubNode* op) override;
-  PrimExpr VisitExpr_(const MulNode* op) override;
-  PrimExpr VisitExpr_(const DivNode* op) override;
-  PrimExpr VisitExpr_(const ModNode* op) override;
-  PrimExpr VisitExpr_(const FloorDivNode* op) override;
-  PrimExpr VisitExpr_(const FloorModNode* op) override;
-  PrimExpr VisitExpr_(const MinNode* op) override;
-  PrimExpr VisitExpr_(const MaxNode* op) override;
-  PrimExpr VisitExpr_(const EQNode* op) override;
-  PrimExpr VisitExpr_(const NENode* op) override;
-  PrimExpr VisitExpr_(const LTNode* op) override;
-  PrimExpr VisitExpr_(const LENode* op) override;
-  PrimExpr VisitExpr_(const GTNode* op) override;
-  PrimExpr VisitExpr_(const GENode* op) override;
-  PrimExpr VisitExpr_(const CallNode* op) override;
-
-  using StmtExprMutator::VisitExpr_;
-  using StmtExprMutator::VisitStmt_;
-
-  // a map from IterVar before rewrite to that after rewrite,
-  // ensures one old IterVar maps to exactly one new IterVar
-  std::unordered_map<const IterVarNode*, IterVar> ivmap_;
-};
-
 }  // namespace tir
 }  // namespace tvm
 
diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py
index 846f88d38938..ae3ad7ca892a 100644
--- a/python/tvm/te/operation.py
+++ b/python/tvm/te/operation.py
@@ -19,7 +19,7 @@
 
 # pylint: disable=invalid-name
 from numbers import Integral as _Integral
-from typing import List
+from typing import List, Optional
 
 import tvm._ffi
 import tvm.arith._ffi_api
@@ -567,7 +567,9 @@ def reduce_axis(dom, name="rv", thread_tag="", span=None):
     return tvm.tir.IterVar(dom, name, 2, thread_tag, span)
 
 
-def create_prim_func(ops: List[_tensor.Tensor]) -> tvm.tir.PrimFunc:
+def create_prim_func(
+    ops: List[_tensor.Tensor], index_dtype_override: Optional[str] = None
+) -> tvm.tir.PrimFunc:
     """Create a TensorIR PrimFunc from tensor expression
 
     Parameters
@@ -619,4 +621,4 @@ def tir_matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
     """
     if not isinstance(ops, (list, tuple, Array)):
         ops = [ops]
-    return _ffi_api.CreatePrimFunc(ops)
+    return _ffi_api.CreatePrimFunc(ops, index_dtype_override)
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc
index 51bcab527d1b..183a3094e473 100644
--- a/src/relay/backend/utils.cc
+++ b/src/relay/backend/utils.cc
@@ -416,7 +416,7 @@ Optional<tir::PrimFunc> DefaultTIRConverterImpl(const Array<te::Tensor>& args,
       return NullOpt;
     }
   }
-  PrimFunc func = te::CreatePrimFuncWithConstants(args, constants);
+  PrimFunc func = te::CreatePrimFuncWithConstants(args, constants, DataType::Int(64));
   bool dynamic_loop_extent = false;
   tir::PostOrderVisit(func->body, [&dynamic_loop_extent](const ObjectRef& obj) -> void {
     if (const auto* loop = obj.as<tir::ForNode>()) {
diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index 0581ad60e8f4..d797350eed4f 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -22,6 +22,7 @@
 #include <tvm/arith/analyzer.h>
 #include <tvm/ir/name_supply.h>
 #include <tvm/runtime/registry.h>
+#include <tvm/tir/data_type_rewriter.h>
 #include <tvm/tir/function.h>
 #include <tvm/tir/stmt_functor.h>
 
@@ -493,7 +494,8 @@ PrimFunc GenerateAndCompletePrimFunc(const Array<te::Tensor>& arg_list,
 }
 
 PrimFunc CreatePrimFuncWithConstants(const Array<te::Tensor>& arg_list,
-                                     const Array<runtime::NDArray>& constants) {
+                                     const Array<runtime::NDArray>& constants,
+                                     std::optional<DataType> index_dtype_override) {
   // Infomations used in CreatePrimFunc and its sub-functions.
   CreateFuncInfo info(arg_list);
   // Root body stmts.
@@ -515,14 +517,27 @@ PrimFunc CreatePrimFuncWithConstants(const Array<te::Tensor>& arg_list,
   // Step 4. Create func and complete prim func.
   auto func = GenerateAndCompletePrimFunc(arg_list, root_stmts, &info);
   func = tir::BindParams(func, constants);
-  return LayoutFreePlaceholdersNormalizer().Process(std::move(func));
+  if (index_dtype_override.has_value()) {
+    func = IndexDataTypeNormalizer(index_dtype_override.value()).Rewrite(std::move(func));
+  }
+  auto result = LayoutFreePlaceholdersNormalizer().Process(std::move(func));
+  return result;
 }
 
-PrimFunc CreatePrimFunc(const Array<te::Tensor>& arg_list) {
-  return CreatePrimFuncWithConstants(arg_list, {});
+PrimFunc CreatePrimFunc(const Array<te::Tensor>& arg_list,
+                        std::optional<DataType> index_dtype_override) {
+  return CreatePrimFuncWithConstants(arg_list, {}, index_dtype_override);
 }
 
-TVM_REGISTER_GLOBAL("te.CreatePrimFunc").set_body_typed(CreatePrimFunc);
+TVM_REGISTER_GLOBAL("te.CreatePrimFunc").set_body([](TVMArgs args, TVMRetValue* ret) {
+  Array<te::Tensor> arg_list = args[0];
+  std::optional<DataType> index_dtype_override{std::nullopt};
+  // Add conversion to make std::optional compatible with FFI.
+  if (args[1].type_code() != kTVMNullptr) {
+    index_dtype_override = args[1].operator DataType();
+  }
+  *ret = CreatePrimFunc(arg_list, index_dtype_override);
+});
 
 }  // namespace tir
 }  // namespace tvm
diff --git a/src/te/operation/create_primfunc.h b/src/te/operation/create_primfunc.h
index b68d30a2fb82..4246347a16f3 100644
--- a/src/te/operation/create_primfunc.h
+++ b/src/te/operation/create_primfunc.h
@@ -24,11 +24,14 @@
 #include <tvm/te/tensor.h>
 #include <tvm/tir/function.h>
 
+#include <optional>
+
 namespace tvm {
 namespace tir {
 
 /*! \brief Use Tensor Expression to create a schedulable TensorIR func. */
-PrimFunc CreatePrimFunc(const Array<te::Tensor>& arg_list);
+PrimFunc CreatePrimFunc(const Array<te::Tensor>& arg_list,
+                        std::optional<DataType> index_dtype_override = std::nullopt);
 
 /*! \brief The same as above but create a PrimFunc with AllocateConstNode. If the size of the
  * constants array is N, the last N tensors in arg_list will be treated as constant tensors.
@@ -36,7 +39,8 @@ PrimFunc CreatePrimFunc(const Array<te::Tensor>& arg_list);
  * will be embedded in the body as AllocateConstNode.
  */
 PrimFunc CreatePrimFuncWithConstants(const Array<te::Tensor>& arg_list,
-                                     const Array<runtime::NDArray>& constants);
+                                     const Array<runtime::NDArray>& constants,
+                                     std::optional<DataType> index_dtype_override = std::nullopt);
 
 }  // namespace tir
 }  // namespace tvm
diff --git a/src/tir/ir/data_type_rewriter.cc b/src/tir/ir/data_type_rewriter.cc
index 102989acf6e0..fecb8e5fb70c 100644
--- a/src/tir/ir/data_type_rewriter.cc
+++ b/src/tir/ir/data_type_rewriter.cc
@@ -23,8 +23,8 @@
  */
 
 #include <tvm/tir/builtin.h>
+#include <tvm/tir/data_type_rewriter.h>
 #include <tvm/tir/op.h>
-#include <tvm/tir/stmt_functor.h>
 
 #include "./functor_common.h"
 
@@ -138,7 +138,11 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const RampNode* op) {
   }
 }
 
-#define DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)                 \
+PrimExpr DataTypeLegalizer::VisitExpr_(const CastNode* op) {
+  return StmtExprMutator::VisitExpr_(op);
+}
+
+#define TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)             \
   PrimExpr DataTypeLegalizer::VisitExpr_(const OP* op) {                  \
     PrimExpr a = this->VisitExpr(op->a);                                  \
     PrimExpr b = this->VisitExpr(op->b);                                  \
@@ -149,23 +153,23 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const RampNode* op) {
     }                                                                     \
   }
 
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(AddNode, operator+);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(SubNode, operator-);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MulNode, operator*);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(DivNode, div);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(ModNode, truncmod);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(FloorDivNode, floordiv);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(FloorModNode, floormod);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MinNode, min);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MaxNode, max);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(EQNode, operator==);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(NENode, operator!=);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(LENode, operator<=);
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(LTNode, operator<);  // NOLINT(*)
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(GTNode, operator>);  // NOLINT(*)
-DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(GENode, operator>=);
-
-#undef DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(AddNode, operator+);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(SubNode, operator-);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MulNode, operator*);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(DivNode, div);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(ModNode, truncmod);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(FloorDivNode, floordiv);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(FloorModNode, floormod);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MinNode, min);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MaxNode, max);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(EQNode, operator==);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(NENode, operator!=);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(LENode, operator<=);
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(LTNode, operator<);  // NOLINT(*)
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(GTNode, operator>);  // NOLINT(*)
+TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(GENode, operator>=);
+
+#undef TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH
 
 PrimExpr DataTypeLegalizer::VisitExpr_(const CallNode* op) {
   PrimExpr e = StmtExprMutator::VisitExpr_(op);
@@ -191,5 +195,352 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const CallNode* op) {
   return e;
 }
 
+Stmt IndexDataTypeRewriter::VisitStmt_(const AllocateNode* op) {
+  bool is_enabled = is_enabled_;
+  is_enabled_ = true;
+  auto new_extents = op->extents.Map([this](const PrimExpr& e) { return this->VisitExpr(e); });
+  auto new_cond = VisitExpr(op->condition);
+  is_enabled_ = is_enabled;
+  auto new_body = this->VisitStmt(op->body);
+  if (!new_extents.same_as(op->extents) || !new_cond.same_as(op->condition) ||
+      !new_body.same_as(op->body)) {
+    Allocate new_allocate = GetRef<Allocate>(op);
+    auto* n = new_allocate.CopyOnWrite();
+    n->extents = std::move(new_extents);
+    n->condition = std::move(new_cond);
+    n->body = std::move(new_body);
+    return std::move(new_allocate);
+  } else {
+    return GetRef<Stmt>(op);
+  }
+}
+
+Stmt IndexDataTypeRewriter::VisitStmt_(const DeclBufferNode* op) {
+  Buffer new_buffer = VisitBuffer(op->buffer);
+  DeclBuffer decl_buffer = Downcast<DeclBuffer>(StmtExprMutator::VisitStmt_(op));
+  if (!new_buffer.same_as(op->buffer)) {
+    decl_buffer.CopyOnWrite()->buffer = new_buffer;
+  }
+  return std::move(decl_buffer);
+}
+
+Stmt IndexDataTypeRewriter::VisitStmt_(const BlockRealizeNode* op) {
+  bool is_condition = is_condition_;
+  is_condition_ = true;
+  auto new_predicate = VisitExpr(op->predicate);
+  is_condition_ = is_condition;
+
+  bool is_enabled = is_enabled_;
+  is_enabled_ = true;
+  auto new_iter_values =
+      op->iter_values.Map([this](const PrimExpr& e) { return this->VisitExpr(e); });
+  is_enabled_ = is_enabled;
+  Block new_body = Downcast<Block>(this->VisitStmt(op->block));
+  if (!new_predicate.same_as(op->predicate) || !new_iter_values.same_as(op->iter_values) ||
+      !new_body.same_as(op->block)) {
+    BlockRealize new_block_realize = GetRef<BlockRealize>(op);
+    auto* n = new_block_realize.CopyOnWrite();
+    n->predicate = std::move(new_predicate);
+    n->iter_values = std::move(new_iter_values);
+    n->block = std::move(new_body);
+    return std::move(new_block_realize);
+  } else {
+    return GetRef<Stmt>(op);
+  }
+}
+
+Stmt IndexDataTypeRewriter::VisitStmt_(const BlockNode* op) {
+  Array<Buffer> new_alloc_buffers =
+      op->alloc_buffers.Map([this](const Buffer& buffer) { return this->VisitBuffer(buffer); });
+  Array<MatchBufferRegion> new_match_buffers =
+      op->match_buffers.Map([this](const MatchBufferRegion& match_buffer_region) {
+        Buffer new_buffer = this->VisitBuffer(match_buffer_region->buffer);
+        BufferRegion new_buffer_region = this->VisitBufferRegion(match_buffer_region->source);
+        if (!new_buffer.same_as(match_buffer_region->buffer) ||
+            !new_buffer_region.same_as(match_buffer_region->source)) {
+          return MatchBufferRegion(new_buffer, new_buffer_region);
+        } else {
+          return match_buffer_region;
+        }
+      });
+  Array<BufferRegion> new_reads = op->reads.Map(
+      [this](const BufferRegion& buffer_region) { return this->VisitBufferRegion(buffer_region); });
+  Array<BufferRegion> new_writes = op->writes.Map(
+      [this](const BufferRegion& buffer_region) { return this->VisitBufferRegion(buffer_region); });
+  Array<IterVar> new_iter_vars =
+      op->iter_vars.Map([this](const IterVar& iter_var) { return this->VisitIterVar(iter_var); });
+  Optional<Stmt> new_init = NullOpt;
+  if (op->init.defined()) {
+    new_init = this->VisitStmt(op->init.value());
+  }
+  Stmt new_body = this->VisitStmt(op->body);
+
+  if (!new_init.same_as(op->init) || !new_body.same_as(op->body) ||
+      !new_alloc_buffers.same_as(op->alloc_buffers) ||
+      !new_match_buffers.same_as(op->match_buffers) || !new_reads.same_as(op->reads) ||
+      !new_writes.same_as(op->writes) || new_iter_vars.same_as(op->iter_vars)) {
+    Block new_block = GetRef<Block>(op);
+    BlockNode* n = new_block.CopyOnWrite();
+    n->alloc_buffers = std::move(new_alloc_buffers);
+    n->match_buffers = std::move(new_match_buffers);
+    n->reads = std::move(new_reads);
+    n->writes = std::move(new_writes);
+    n->iter_vars = std::move(new_iter_vars);
+    n->init = std::move(new_init);
+    n->body = std::move(new_body);
+    return std::move(new_block);
+  }
+  return GetRef<Stmt>(op);
+}
+
+Map<String, ObjectRef> IndexDataTypeRewriter::VisitBlockAnnotations(
+    const Map<String, ObjectRef>& annotations) {
+  auto new_annotations = annotations;
+
+  std::function<ObjectRef(const ObjectRef&)> f_mutate_obj =
+      [this, &f_mutate_obj](const ObjectRef& obj) -> ObjectRef {
+    if (!obj.defined()) {
+      return obj;
+    }
+    if (obj->IsInstance<BufferNode>()) {
+      Buffer buffer = Downcast<Buffer>(obj);
+      if (Buffer new_buffer = GetRemappedBuffer(buffer); !new_buffer.same_as(buffer)) {
+        return new_buffer;
+      }
+    } else if (obj->IsInstance<ArrayNode>()) {
+      return Downcast<Array<ObjectRef>>(obj).Map(f_mutate_obj);
+    }
+    return obj;
+  };
+  for (const auto& [key, value] : annotations) {
+    auto new_value = f_mutate_obj(value);
+    if (!new_value.same_as(value)) {
+      new_annotations.Set(key, new_value);
+    }
+  }
+  return new_annotations;
+}
+
+Buffer IndexDataTypeRewriter::GetRemappedBuffer(const Buffer& buffer) {
+  if (auto it = buffer_remap_.find(buffer); it != buffer_remap_.end()) {
+    return (*it).second;
+  }
+  return buffer;
+}
+
+IterVar IndexDataTypeRewriter::VisitIterVar(const IterVar& iter_var) {
+  bool is_enabled = is_enabled_;
+  is_enabled_ = true;
+  Var new_var = Downcast<Var>(VisitExpr(iter_var->var));
+  PrimExpr min = VisitExpr(iter_var->dom->min);
+  PrimExpr extent = VisitExpr(iter_var->dom->extent);
+  is_enabled_ = is_enabled;
+  if (!new_var.same_as(iter_var->var) || !min.same_as(iter_var->dom->min) ||
+      !extent.same_as(iter_var->dom->extent)) {
+    IterVar new_iter_var = iter_var;
+    IterVarNode* n = new_iter_var.CopyOnWrite();
+    n->var = std::move(new_var);
+    n->dom = Range(min, extent);
+    return new_iter_var;
+  }
+  return iter_var;
+}
+
+Buffer IndexDataTypeRewriter::VisitBuffer(const Buffer& buffer) {
+  bool is_enabled = is_enabled_;
+
+  is_enabled_ = true;
+  Array<PrimExpr> new_shape =
+      buffer->shape.Map([&](const PrimExpr& e) { return this->VisitExpr(e); });
+  Array<PrimExpr> new_strides =
+      buffer->strides.Map([&](const PrimExpr& e) { return this->VisitExpr(e); });
+  auto new_elem_offset = VisitExpr(buffer->elem_offset);
+  is_enabled_ = is_enabled;
+
+  if (!buffer->shape.same_as(new_shape) || !buffer->strides.same_as(new_strides) ||
+      !buffer->elem_offset.same_as(new_elem_offset)) {
+    Buffer new_buffer = buffer;
+    BufferNode* new_buffer_node = new_buffer.CopyOnWrite();
+    new_buffer_node->shape = std::move(new_shape);
+    new_buffer_node->strides = std::move(new_strides);
+    new_buffer_node->elem_offset = std::move(new_elem_offset);
+    buffer_remap_.Set(buffer, new_buffer);
+    return new_buffer;
+  } else {
+    return buffer;
+  }
+}
+
+BufferRegion IndexDataTypeRewriter::VisitBufferRegion(const BufferRegion& buffer_region) {
+  Buffer remapped_buffer = GetRemappedBuffer(buffer_region->buffer);
+
+  bool is_enabled = is_enabled_;
+  is_enabled_ = true;
+  auto new_region = buffer_region->region.Map([&](const Range& range) {
+    return Range::FromMinExtent(this->VisitExpr(range->min), this->VisitExpr(range->extent));
+  });
+  is_enabled_ = is_enabled;
+
+  if (!remapped_buffer.same_as(buffer_region->buffer) ||
+      !new_region.same_as(buffer_region->region)) {
+    return BufferRegion(remapped_buffer, new_region);
+  } else {
+    return buffer_region;
+  }
+}
+
+Stmt IndexDataTypeRewriter::VisitStmt_(const BufferStoreNode* op) {
+  BufferStore store = GetRef<BufferStore>(op);
+
+  Buffer new_buffer = GetRemappedBuffer(op->buffer);
+  auto value = this->VisitExpr(op->value);
+  auto indices = VisitIndices(op->indices);
+
+  if (!new_buffer.same_as(op->buffer) || !value.same_as(op->value) ||
+      !indices.same_as(op->indices)) {
+    auto writer = store.CopyOnWrite();
+    writer->buffer = new_buffer;
+    writer->value = value;
+    writer->indices = indices;
+  }
+
+  return std::move(store);
+}
+
+PrimExpr IndexDataTypeRewriter::VisitExpr_(const BufferLoadNode* op) {
+  BufferLoad load = GetRef<BufferLoad>(op);
+
+  Buffer new_buffer = GetRemappedBuffer(op->buffer);
+  auto indices = VisitIndices(op->indices);
+
+  if (!new_buffer.same_as(op->buffer) || !indices.same_as(op->indices)) {
+    auto writer = load.CopyOnWrite();
+    writer->indices = indices;
+    writer->buffer = new_buffer;
+  }
+
+  return std::move(load);
+}
+
+Array<PrimExpr> IndexDataTypeRewriter::VisitIndices(Array<PrimExpr> indices) {
+  bool is_enabled = is_enabled_;
+  is_enabled_ = true;
+
+  auto fmutate = [this](const PrimExpr& index) { return this->VisitExpr(index); };
+  indices.MutateByApply(fmutate);
+
+  is_enabled_ = is_enabled;
+
+  return indices;
+}
+
+Stmt IndexDataTypeRewriter::VisitStmt_(const IfThenElseNode* op) {
+  bool is_condition = is_condition_;
+  is_condition_ = true;
+  PrimExpr cond = VisitExpr(op->condition);
+  is_condition_ = is_condition;
+
+  Stmt then_case = VisitStmt(op->then_case);
+  Optional<Stmt> else_case =
+      op->else_case.defined() ? Optional<Stmt>{VisitStmt(op->else_case.value())} : NullOpt;
+  if (!cond.same_as(op->condition) || !then_case.same_as(op->then_case) ||
+      !else_case.same_as(op->else_case)) {
+    IfThenElse new_stmt = GetRef<IfThenElse>(op);
+    auto* n = new_stmt.CopyOnWrite();
+    n->condition = std::move(cond);
+    n->then_case = std::move(then_case);
+    n->else_case = std::move(else_case);
+    return std::move(new_stmt);
+  }
+  return GetRef<Stmt>(op);
+}
+
+Stmt IndexDataTypeRewriter::VisitStmt_(const ForNode* op) {
+  bool is_enabled = is_enabled_;
+  is_enabled_ = true;
+  Var new_loop_var = Downcast<Var>(VisitExpr(op->loop_var));
+  PrimExpr min = VisitExpr(op->min);
+  PrimExpr extent = VisitExpr(op->extent);
+  is_enabled_ = is_enabled;
+
+  Stmt new_body = VisitStmt(op->body);
+
+  if (!new_loop_var.same_as(op->loop_var) || !min.same_as(op->min) || !extent.same_as(op->extent) ||
+      !new_body.same_as(op->body)) {
+    For new_for = GetRef<For>(op);
+    auto* n = new_for.CopyOnWrite();
+    n->loop_var = new_loop_var;
+    n->min = cast(new_loop_var.dtype(), min);
+    n->extent = cast(new_loop_var.dtype(), extent);
+    n->body = new_body;
+    return std::move(new_for);
+  } else {
+    return GetRef<Stmt>(op);
+  }
+}
+
+#define TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)                     \
+  PrimExpr IndexDataTypeRewriter::VisitExpr_(const OP* op) {                       \
+    bool is_enabled = is_enabled_;                                                 \
+    is_enabled_ = is_condition_ && op->a->dtype.is_int() && op->b->dtype.is_int(); \
+    auto result = Parent::VisitExpr_(op);                                          \
+    is_enabled_ = is_enabled;                                                      \
+    return std::move(result);                                                      \
+  }
+
+TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(EQNode, operator==);
+TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(NENode, operator!=);
+TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(LENode, operator<=);
+TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(LTNode, operator<);  // NOLINT(*)
+TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(GTNode, operator>);  // NOLINT(*)
+TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(GENode, operator>=);
+
+PrimExpr IndexDataTypeRewriter::VisitExpr_(const CallNode* op) {
+  // handle if_then_else condition
+  if (op->op.same_as(builtin::if_then_else())) {
+    bool is_condition = is_condition_;
+    is_condition_ = true;
+    PrimExpr cond = VisitExpr(op->args[0]);
+    is_condition_ = is_condition;
+    return if_then_else(cond, VisitExpr(op->args[1]), VisitExpr(op->args[2]));
+  }
+  return Parent::VisitExpr_(op);
+}
+
+#undef TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH
+
+IndexDataTypeNormalizer::IndexDataTypeNormalizer(DataType target_data_type)
+    : target_data_type_(std::move(target_data_type)) {}
+PrimFunc IndexDataTypeNormalizer::Rewrite(PrimFunc func) {
+  Map<Var, Buffer> new_buffer_map = func->buffer_map;
+  for (const auto& [var, buffer] : func->buffer_map) {
+    new_buffer_map.Set(var, VisitBuffer(buffer));
+  }
+  PrimFuncNode* new_func = func.CopyOnWrite();
+  new_func->buffer_map = std::move(new_buffer_map);
+  new_func->body = VisitStmt(std::move(new_func->body));
+  return func;
+}
+
+PrimExpr IndexDataTypeNormalizer::VisitExpr_(const IntImmNode* op) {
+  if (is_enabled_) {
+    ICHECK_LE(op->value, Downcast<Integer>(max_value(target_data_type_))->value);
+    return cast(target_data_type_, GetRef<IntImm>(op));
+  }
+  return GetRef<IntImm>(op);
+}
+
+PrimExpr IndexDataTypeNormalizer::VisitExpr_(const VarNode* op) {
+  if (auto it = var_remap_.find(GetRef<Var>(op)); it != var_remap_.end()) {
+    return (*it).second;
+  }
+  if (is_enabled_) {
+    Var new_var = GetRef<Var>(op).copy_with_dtype(target_data_type_);
+    var_remap_.Set(GetRef<Var>(op), new_var);
+    return std::move(new_var);
+  }
+  return GetRef<PrimExpr>(op);
+}
+
 }  // namespace tir
 }  // namespace tvm
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index e445432e5b6f..daa8fe703a08 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -21,6 +21,7 @@
  */
 #include <tvm/ir/module.h>
 #include <tvm/runtime/registry.h>
+#include <tvm/tir/data_type_rewriter.h>
 #include <tvm/tir/function.h>
 #include <tvm/tir/stmt_functor.h>
 
diff --git a/src/tir/schedule/primitive/blockize_tensorize.cc b/src/tir/schedule/primitive/blockize_tensorize.cc
index 98e30117e172..80a653c544b0 100644
--- a/src/tir/schedule/primitive/blockize_tensorize.cc
+++ b/src/tir/schedule/primitive/blockize_tensorize.cc
@@ -16,6 +16,8 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include <tvm/tir/data_type_rewriter.h>
+
 #include <functional>
 
 #include "../ir_comparator.h"
@@ -523,6 +525,19 @@ void Tensorize(ScheduleState self, const StmtSRef& sref, const TensorIntrin& int
   }
   PrimFunc intrin_desc = intrin->desc;
   PrimFunc intrin_impl = DeepCopy(intrin->impl);
+
+  int index_dtype_bits = -1;
+  auto f_update_max_dtype_bits_from_region = [&](const Array<BufferRegion>& buffer_regions) {
+    for (const BufferRegion& buffer_region : buffer_regions) {
+      for (const auto& range : buffer_region->region) {
+        index_dtype_bits = std::max(index_dtype_bits, range->min.dtype().bits());
+      }
+    }
+  };
+  f_update_max_dtype_bits_from_region(block_realize->block->reads);
+  f_update_max_dtype_bits_from_region(block_realize->block->writes);
+  ICHECK(index_dtype_bits > 0);
+  intrin_impl = IndexDataTypeNormalizer(DataType::Int(index_dtype_bits)).Rewrite(intrin_impl);
   // Step 2: Structural pattern matching
   TensorizeComparator comparator(self->mod, /*assert_mode=*/true);
   comparator.VisitStmt(block_realize, intrin_desc->body);
diff --git a/src/tir/transforms/lower_match_buffer.cc b/src/tir/transforms/lower_match_buffer.cc
index 9b915da6290b..2aa6d18b4d11 100644
--- a/src/tir/transforms/lower_match_buffer.cc
+++ b/src/tir/transforms/lower_match_buffer.cc
@@ -195,7 +195,7 @@ class MatchBufferLower : public StmtExprMutator {
         // Non-zero elem_offset is ill-defined for non-flat memory.
         // If needed in the future, will require `Array<PrimExpr>
         // elem_offsets`, with one offset for each flattened index.
-        Bind(buffer->elem_offset, 0);
+        Bind(buffer->elem_offset, make_const(buffer->elem_offset.dtype(), 0));
       }
     }
 
@@ -206,7 +206,7 @@ class MatchBufferLower : public StmtExprMutator {
     if (!buffer->strides.empty()) {
       ICHECK_EQ(buffer->strides.size(), buffer->shape.size());
       if (source_buffer->strides.empty()) {
-        PrimExpr stride = make_const(DataType::Int(32), 1);
+        PrimExpr stride = make_const(buffer->strides.back().dtype(), 1);
         for (size_t i = buffer->shape.size(); i > 0; --i) {
           const PrimExpr& shape = source_buffer->shape[i - 1 + offset];
           Bind(buffer->strides[i - 1], stride, buffer->name + ".strides_" + std::to_string(i - 1));
diff --git a/src/tir/transforms/narrow_datatype.cc b/src/tir/transforms/narrow_datatype.cc
index 2d287deec44c..fba813870bb1 100644
--- a/src/tir/transforms/narrow_datatype.cc
+++ b/src/tir/transforms/narrow_datatype.cc
@@ -24,11 +24,13 @@
 
 #include <tvm/runtime/registry.h>
 #include <tvm/tir/builtin.h>
+#include <tvm/tir/data_type_rewriter.h>
 #include <tvm/tir/op.h>
 #include <tvm/tir/transform.h>
 
 #include "../../arith/ir_mutator_with_analyzer.h"
 #include "../../arith/ir_visitor_with_analyzer.h"
+#include "../../printer/text_printer.h"
 
 namespace tvm {
 namespace tir {
@@ -102,6 +104,14 @@ class DataTypeVisitor final : public StmtExprVisitor {
     return StmtExprVisitor::VisitStmt_(op);
   }
 
+  void VisitStmt_(const BlockNode* op) {
+    for (const IterVar& iter : op->iter_vars) {
+      analyzer_.Bind(iter->var, Range::FromMinExtent(iter->dom->min, iter->dom->extent));
+      vextent_[iter->var.as<VarNode>()] = iter->dom->extent.dtype();
+    }
+    StmtExprVisitor::VisitStmt_(op);
+  }
+
   void VisitStmt_(const AttrStmtNode* op) {
     if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) {
       IterVar iv = Downcast<IterVar>(op->node);
@@ -187,11 +197,10 @@ class DataTypeVisitor final : public StmtExprVisitor {
   arith::ConstIntBoundAnalyzer::BoundMapType bound_;
 };
 
-class DataTypeRewriter : public DataTypeLegalizer {
-  using Parent = DataTypeLegalizer;
-
+class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
  public:
-  explicit DataTypeRewriter(int target_bits) : visitor_(target_bits) {}
+  using Parent = IndexDataTypeRewriter;
+  explicit NarrowDataTypeRewriter(int target_bits) : visitor_(target_bits) {}
 
   Stmt operator()(Stmt s) {
     visitor_(s);
@@ -225,78 +234,19 @@ class DataTypeRewriter : public DataTypeLegalizer {
     return PrimExpr();
   }
 
-  Stmt VisitStmt_(const BufferStoreNode* op) final {
-    BufferStore store = GetRef<BufferStore>(op);
-
-    auto value = this->VisitExpr(op->value);
-    auto indices = VisitIndices(op->indices);
-
-    if (!value.same_as(op->value) || !indices.same_as(op->indices)) {
-      auto writer = store.CopyOnWrite();
-      writer->value = value;
-      writer->indices = indices;
-    }
-
-    return std::move(store);
-  }
-
-  PrimExpr VisitExpr_(const BufferLoadNode* op) final {
-    BufferLoad load = GetRef<BufferLoad>(op);
-
-    auto indices = VisitIndices(op->indices);
-
-    if (!indices.same_as(op->indices)) {
-      auto writer = load.CopyOnWrite();
-      writer->indices = indices;
-    }
-
-    return std::move(load);
-  }
-
-  Array<PrimExpr> VisitIndices(Array<PrimExpr> indices) {
-    is_index_ = true;
-
-    auto fmutate = [this](const PrimExpr& index) { return this->VisitExpr(index); };
-    indices.MutateByApply(fmutate);
-
-    is_index_ = false;
-
-    return indices;
-  }
-
-  Stmt VisitStmt_(const IfThenElseNode* op) final {
-    IfThenElse updated = Downcast<IfThenElse>(Parent::VisitStmt_(op));
-    is_condition_ = true;
-    PrimExpr cond = VisitExpr(op->condition);
-    is_condition_ = false;
-    if (!cond.same_as(op->condition)) {
-      return std::move(IfThenElse(cond, updated->then_case, updated->else_case));
-    }
-    return std::move(updated);
-  }
-
   PrimExpr VisitExpr_(const VarNode* op) final {
-    if (visitor_.vmap.find(op) != visitor_.vmap.end()) {
-      if (vmap_.find(op) == vmap_.end()) {
-        vmap_[op] = Var(op->name_hint, visitor_.vmap[op]);
-      }
-      return vmap_[op];
-    }
-    return Parent::VisitExpr_(op);
-  }
-
-  PrimExpr VisitExpr_(const SizeVarNode* op) final {
-    if (visitor_.vmap.find(op) != visitor_.vmap.end()) {
-      if (vmap_.find(op) == vmap_.end()) {
-        vmap_[op] = SizeVar(op->name_hint, visitor_.vmap[op]);
-      }
-      return vmap_[op];
+    if (auto it = var_remap_.find(GetRef<Var>(op)); it != var_remap_.end()) {
+      return (*it).second;
+    } else if (visitor_.vmap.find(op) != visitor_.vmap.end()) {
+      Var v = Var(op->name_hint, visitor_.vmap[op]);
+      var_remap_.Set(GetRef<Var>(op), v);
+      return v;
     }
     return Parent::VisitExpr_(op);
   }
 
   PrimExpr VisitExpr_(const IntImmNode* op) final {
-    if (is_index_) {
+    if (is_enabled_) {
       if (visitor_.vmap.find(op) != visitor_.vmap.end()) {
         return IntImm(visitor_.vmap[op], op->value);
       }
@@ -305,7 +255,7 @@ class DataTypeRewriter : public DataTypeLegalizer {
   }
 
   PrimExpr VisitExpr_(const CastNode* op) final {
-    if (is_index_ && visitor_.vmap.find(op) != visitor_.vmap.end()) {
+    if (is_enabled_ && visitor_.vmap.find(op) != visitor_.vmap.end()) {
       PrimExpr e = Parent::VisitExpr_(op);
       const CastNode* new_op = e.as<CastNode>();
       ICHECK(new_op != nullptr) << "Expected type to be CastNode"
@@ -315,65 +265,24 @@ class DataTypeRewriter : public DataTypeLegalizer {
     return Parent::VisitExpr_(op);
   }
 
-  PrimExpr VisitExpr_(const EQNode* op) final;
-  PrimExpr VisitExpr_(const NENode* op) final;
-  PrimExpr VisitExpr_(const LTNode* op) final;
-  PrimExpr VisitExpr_(const LENode* op) final;
-  PrimExpr VisitExpr_(const GTNode* op) final;
-  PrimExpr VisitExpr_(const GENode* op) final;
-  PrimExpr VisitExpr_(const CallNode* op) final;
-
  private:
   // the internal visitor to deduce the narrowed dtype
   DataTypeVisitor visitor_;
   // a map from Var before rewrite to that after rewrite,
   // ensures one old Var maps to exactly one new Var
   std::unordered_map<const VarNode*, Var> vmap_;
-  // indicator of index expr to rewrite
-  bool is_index_{false};
-  // indicator of condition
-  bool is_condition_{false};
 };
 
-#define DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)                          \
-  PrimExpr DataTypeRewriter::VisitExpr_(const OP* op) {                             \
-    bool is_index = is_index_;                                                      \
-    bool rewrite = is_condition_ && op->a->dtype.is_int() && op->b->dtype.is_int(); \
-    if (rewrite) {                                                                  \
-      is_index_ = true;                                                             \
-    }                                                                               \
-    auto result = Parent::VisitExpr_(op);                                           \
-    is_index_ = is_index;                                                           \
-    return std::move(result);                                                       \
-  }
-
-DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(EQNode, operator==);
-DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(NENode, operator!=);
-DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(LENode, operator<=);
-DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(LTNode, operator<);  // NOLINT(*)
-DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(GTNode, operator>);  // NOLINT(*)
-DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(GENode, operator>=);
-
-PrimExpr DataTypeRewriter::VisitExpr_(const CallNode* op) {
-  // handle if_then_else condition
-  if (op->op.same_as(builtin::if_then_else())) {
-    bool is_condition = is_condition_;
-    is_condition_ = true;
-    PrimExpr cond = VisitExpr(op->args[0]);
-    is_condition_ = is_condition;
-    return if_then_else(cond, VisitExpr(op->args[1]), VisitExpr(op->args[2]));
-  }
-  return Parent::VisitExpr_(op);
+Stmt NarrowDataType(Stmt stmt, int target_bits) {
+  return NarrowDataTypeRewriter(target_bits)(stmt);
 }
 
-Stmt NarrowDataType(Stmt stmt, int target_bits) { return DataTypeRewriter(target_bits)(stmt); }
-
 namespace transform {
 
 Pass NarrowDataType(int target_bits) {
   auto pass_func = [target_bits](PrimFunc f, IRModule m, PassContext ctx) {
     auto* n = f.CopyOnWrite();
-    n->body = DataTypeRewriter(target_bits)(std::move(n->body));
+    n->body = NarrowDataTypeRewriter(target_bits)(std::move(n->body));
     return f;
   };
   return CreatePrimFuncPass(pass_func, 0, "tir.NarrowDataType", {});
diff --git a/tests/cpp/data_type_rewriter_test.cc b/tests/cpp/data_type_rewriter_test.cc
index d1ac9d782ce5..c5e6d4f75843 100644
--- a/tests/cpp/data_type_rewriter_test.cc
+++ b/tests/cpp/data_type_rewriter_test.cc
@@ -19,8 +19,8 @@
 
 #include <gtest/gtest.h>
 #include <tvm/tir/builtin.h>
+#include <tvm/tir/data_type_rewriter.h>
 #include <tvm/tir/op.h>
-#include <tvm/tir/stmt_functor.h>
 
 using namespace tvm;
 using namespace tvm::tir;
diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py
index bf302cd0e5bf..021db0f86ad2 100644
--- a/tests/python/unittest/test_meta_schedule_relay_integration.py
+++ b/tests/python/unittest/test_meta_schedule_relay_integration.py
@@ -391,21 +391,21 @@ def test_meta_schedule_te2primfunc_argument_order_and_lowering():
     class _fused_layout_transform:
         @T.prim_func
         def main( # type: ignore
-            placeholder: T.Buffer[(1, 3, 16, 16), "float32"], # type: ignore
-            T_layout_trans: T.Buffer[(1, 1, 16, 16, 3), "float32"], # type: ignore
+            placeholder: T.Buffer[(T.int64(1), T.int64(3), T.int64(16), T.int64(16)), "float32"], # type: ignore
+            T_layout_trans: T.Buffer[(T.int64(1), T.int64(1), T.int64(16), T.int64(16), T.int64(3)), "float32"], # type: ignore
         ) -> None: # type: ignore
             # function attr dict
             T.func_attr({"global_symbol": "main", "tir.noalias": True})
             # body
             # with T.block("root")
-            for i0, i1, i2, i3, i4 in T.grid(1, 1, 16, 16, 3):
+            for i0, i1, i2, i3, i4 in T.grid(T.int64(1), T.int64(1), T.int64(16), T.int64(16), T.int64(3)):
                 with T.block("T_layout_trans"):
                     ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
-                    T.reads(placeholder[ax0, ax1 * 3 + ax4, ax2, ax3])
+                    T.reads(placeholder[ax0, ax1 * T.int64(3) + ax4, ax2, ax3])
                     T.writes(T_layout_trans[ax0, ax1, ax2, ax3, ax4])
                     T_layout_trans[ax0, ax1, ax2, ax3, ax4] = T.if_then_else(
-                        ax0 < 1 and ax1 * 3 + ax4 < 3 and ax2 < 16 and ax3 < 16, # type: ignore
-                        placeholder[ax0, ax1 * 3 + ax4, ax2, ax3],
+                        ax0 < T.int64(1) and ax1 * T.int64(3) + ax4 < T.int64(3) and ax2 < T.int64(16) and ax3 < T.int64(16), # type: ignore
+                        placeholder[ax0, ax1 * T.int64(3) + ax4, ax2, ax3],
                         T.float32(0),
                         dtype="float32",
                     )
@@ -413,41 +413,41 @@ def main( # type: ignore
     @tvm.script.ir_module
     class _fused_layout_transform_1:
         @T.prim_func
-        def main(placeholder: T.Buffer[(1, 2, 16, 16, 4), "float32"], T_layout_trans: T.Buffer[(1, 8, 16, 16), "float32"]) -> None: # type: ignore
+        def main(placeholder: T.Buffer[(T.int64(1), T.int64(2), T.int64(16), T.int64(16), T.int64(4)), "float32"], T_layout_trans: T.Buffer[(T.int64(1), T.int64(8), T.int64(16), T.int64(16)), "float32"]) -> None: # type: ignore
             # function attr dict
             T.func_attr({"global_symbol": "main", "tir.noalias": True})
             # body
             # with T.block("root")
-            for i0, i1, i2, i3 in T.grid(1, 8, 16, 16):
+            for i0, i1, i2, i3 in T.grid(T.int64(1), T.int64(8), T.int64(16), T.int64(16)):
                 with T.block("T_layout_trans"):
                     ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
-                    T.reads(placeholder[ax0, ax1 // 4, ax2, ax3, ax1 % 4]) # type: ignore
+                    T.reads(placeholder[ax0, ax1 // T.int64(4), ax2, ax3, ax1 % T.int64(4)]) # type: ignore
                     T.writes(T_layout_trans[ax0, ax1, ax2, ax3])
-                    T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < 1 and ax1 < 8 and ax2 < 16 and ax3 < 16, placeholder[ax0, ax1 // 4, ax2, ax3, ax1 % 4], T.float32(0), dtype="float32") # type: ignore
+                    T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < T.int64(1) and ax1 < T.int64(8) and ax2 < T.int64(16) and ax3 < T.int64(16), placeholder[ax0, ax1 // T.int64(4), ax2, ax3, ax1 % T.int64(4)], T.float32(0), dtype="float32") # type: ignore
 
     @tvm.script.ir_module
     class _fused_nn_contrib_conv2d_NCHWc:
         @T.prim_func
-        def main(placeholder: T.Buffer[(1, 1, 16, 16, 3), "float32"], placeholder_1: T.Buffer[(2, 1, 5, 5, 3, 4), "float32"], conv2d_NCHWc: T.Buffer[(1, 2, 16, 16, 4), "float32"]) -> None: # type: ignore
+        def main(placeholder: T.Buffer[(T.int64(1), T.int64(1), T.int64(16), T.int64(16), T.int64(3)), "float32"], placeholder_1: T.Buffer[(T.int64(2), T.int64(1), T.int64(5), T.int64(5), T.int64(3), T.int64(4)), "float32"], conv2d_NCHWc: T.Buffer[(T.int64(1), T.int64(2), T.int64(16), T.int64(16), T.int64(4)), "float32"]) -> None: # type: ignore
             # function attr dict
             T.func_attr({"global_symbol": "main", "tir.noalias": True})
             # body
             # with T.block("root")
-            data_pad = T.alloc_buffer([1, 1, 20, 20, 3], dtype="float32")
-            for i0, i1, i2, i3, i4 in T.grid(1, 1, 20, 20, 3):
+            data_pad = T.alloc_buffer([T.int64(1), T.int64(1), T.int64(20), T.int64(20), T.int64(3)], dtype="float32")
+            for i0, i1, i2, i3, i4 in T.grid(T.int64(1), T.int64(1), T.int64(20), T.int64(20), T.int64(3)):
                 with T.block("data_pad"):
                     i0_1, i1_1, i2_1, i3_1, i4_1 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4])
-                    T.reads(placeholder[i0_1, i1_1, i2_1 - 2, i3_1 - 2, i4_1])
+                    T.reads(placeholder[i0_1, i1_1, i2_1 - T.int64(2), i3_1 - T.int64(2), i4_1])
                     T.writes(data_pad[i0_1, i1_1, i2_1, i3_1, i4_1])
-                    data_pad[i0_1, i1_1, i2_1, i3_1, i4_1] = T.if_then_else(2 <= i2_1 and i2_1 < 18 and 2 <= i3_1 and i3_1 < 18, placeholder[i0_1, i1_1, i2_1 - 2, i3_1 - 2, i4_1], T.float32(0), dtype="float32") # type: ignore # pylint: disable=R1716
-            for i0, i1, i2, i3, i4, i5, i6, i7 in T.grid(1, 2, 16, 16, 4, 3, 5, 5):
+                    data_pad[i0_1, i1_1, i2_1, i3_1, i4_1] = T.if_then_else(T.int64(2) <= i2_1 and i2_1 < T.int64(18) and T.int64(2) <= i3_1 and i3_1 < T.int64(18), placeholder[i0_1, i1_1, i2_1 - T.int64(2), i3_1 - T.int64(2), i4_1], T.float32(0), dtype="float32") # type: ignore # pylint: disable=R1716
+            for i0, i1, i2, i3, i4, i5, i6, i7 in T.grid(T.int64(1), T.int64(2), T.int64(16), T.int64(16), T.int64(4), T.int64(3), T.int64(5), T.int64(5)):
                 with T.block("conv2d_NCHWc"):
                     n, oc_chunk, oh, ow, oc_block, ic, kh, kw = T.axis.remap("SSSSSRRR", [i0, i1, i2, i3, i4, i5, i6, i7])
-                    T.reads(data_pad[n, ic // 3, oh + kh, ow + kw, ic % 3], placeholder_1[oc_chunk, ic // 3, kh, kw, ic % 3, oc_block]) # type: ignore
+                    T.reads(data_pad[n, ic // T.int64(3), oh + kh, ow + kw, ic % T.int64(3)], placeholder_1[oc_chunk, ic // T.int64(3), kh, kw, ic % T.int64(3), oc_block]) # type: ignore
                     T.writes(conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block])
                     with T.init():
                         conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = T.float32(0)
-                    conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] + data_pad[n, ic // 3, oh + kh, ow + kw, ic % 3] * placeholder_1[oc_chunk, ic // 3, kh, kw, ic % 3, oc_block] # type: ignore
+                    conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc[n, oc_chunk, oh, ow, oc_block] + data_pad[n, ic // T.int64(3), oh + kh, ow + kw, ic % T.int64(3)] * placeholder_1[oc_chunk, ic // T.int64(3), kh, kw, ic % T.int64(3), oc_block] # type: ignore
 
     # fmt: on
     # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument
diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py
index 7b8173d0b2d9..6662c7aca85b 100644
--- a/tests/python/unittest/test_te_create_primfunc.py
+++ b/tests/python/unittest/test_te_create_primfunc.py
@@ -44,8 +44,10 @@ def test_unique_name_reduction_block():
     assert isinstance(s.get_sref(s.get_block("sum_1")), tir.schedule.StmtSRef)
 
 
-def _check_workload(te_workload, tir_workload):
-    func = te.create_prim_func(te_workload())
+def _check_workload(te_workload, tir_workload, index_dtype_override=None):
+    func = te.create_prim_func(te_workload(), index_dtype_override)
+    print(func.script())
+    print(tvm.ir.base.get_first_structural_mismatch(func, tir_workload))
     tvm.ir.assert_structural_equal(func, tir_workload)
     # make sure that we can create schedule from the func
     s = tir.Schedule(func, debug_mask="all")
@@ -75,10 +77,29 @@ def tir_matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
             C[i, j] += A[i, k] * B[j, k]
 
 
+@T.prim_func
+def tir_matmul_int64(
+    A: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+    B: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+    C: T.Buffer[(T.int64(128), T.int64(128)), "float32"],
+) -> None:
+    T.func_attr({"global_symbol": "main", "tir.noalias": True})
+    for i0, j0, k0 in T.grid(T.int64(128), T.int64(128), T.int64(128)):
+        with T.block():
+            i, j, k = T.axis.remap("SSR", [i0, j0, k0])
+            with T.init():
+                C[i, j] = 0.0
+            C[i, j] += A[i, k] * B[j, k]
+
+
 def test_matmul():
     _check_workload(te_matmul, tir_matmul)
 
 
+def test_matmul_int64():
+    _check_workload(te_matmul, tir_matmul_int64, index_dtype_override="int64")
+
+
 def te_element_wise():
     A = te.placeholder((128, 128), name="A")
     B = te.compute((128, 128), lambda x, y: A[x, y] * 2, name="B")
diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py
index 0129cee53254..21cc39b71402 100644
--- a/tests/python/unittest/test_tir_schedule_tensorize.py
+++ b/tests/python/unittest/test_tir_schedule_tensorize.py
@@ -717,34 +717,34 @@ def tensorized_matmul_int64_shape(
                         ]
                     )
                     T.writes(C[vi * T.int64(16) : vi * T.int64(16) + T.int64(16), vj * T.int64(16) : vj * T.int64(16) + T.int64(16)])
-                    A_elem_offset = T.var("int32")
-                    B_elem_offset = T.var("int32")
-                    C_elem_offset = T.var("int32")
+                    A_elem_offset = T.var("int64")
+                    B_elem_offset = T.var("int64")
+                    C_elem_offset = T.var("int64")
                     A_sub = T.match_buffer(
                         A[vi * T.int64(16) : vi * T.int64(16) + T.int64(16), vk * T.int64(16) : vk * T.int64(16) + T.int64(16)],
-                        [16, 16],
+                        [T.int64(16), T.int64(16)],
                         elem_offset=A_elem_offset,
                     )
                     B_sub = T.match_buffer(
                         B[vj * T.int64(16) : vj * T.int64(16) + T.int64(16), vk * T.int64(16) : vk * T.int64(16) + T.int64(16)],
-                        [16, 16],
+                        [T.int64(16), T.int64(16)],
                         elem_offset=B_elem_offset,
                     )
                     C_sub = T.match_buffer(
                         C[vi * T.int64(16) : vi * T.int64(16) + T.int64(16), vj * T.int64(16) : vj * T.int64(16) + T.int64(16)],
-                        [16, 16],
+                        [T.int64(16), T.int64(16)],
                         elem_offset=C_elem_offset,
                     )
                     T.evaluate(
                         T.tvm_mma_sync(
                             C_sub.data,
-                            T.floordiv(C_sub.elem_offset, 256),
+                            T.floordiv(C_sub.elem_offset, T.int64(256)),
                             A_sub.data,
-                            T.floordiv(A_sub.elem_offset, 256),
+                            T.floordiv(A_sub.elem_offset, T.int64(256)),
                             B_sub.data,
-                            T.floordiv(B_sub.elem_offset, 256),
+                            T.floordiv(B_sub.elem_offset, T.int64(256)),
                             C_sub.data,
-                            T.floordiv(C_sub.elem_offset, 256),
+                            T.floordiv(C_sub.elem_offset, T.int64(256)),
                             dtype="handle",
                         )
                     )
diff --git a/tests/python/unittest/test_tir_transform_narrow_datatype.py b/tests/python/unittest/test_tir_transform_narrow_datatype.py
index 20818a5b326a..c9c513378595 100644
--- a/tests/python/unittest/test_tir_transform_narrow_datatype.py
+++ b/tests/python/unittest/test_tir_transform_narrow_datatype.py
@@ -19,6 +19,7 @@
 from tvm.driver.build_module import schedule_to_module
 from tvm.script import tir as T
 from tvm.tir import const
+import tvm.testing
 
 
 def lower_stmt(params, stmt, target_bits):
@@ -324,14 +325,26 @@ def expected_after(A: T.Buffer[128, "float32"], B: T.Buffer[130, "float32"]):
     tvm.ir.assert_structural_equal(after, expected_after)
 
 
+def test_block():
+    @T.prim_func
+    def before(A: T.Buffer[(128,), "float32"], B: T.Buffer[(128,), "float32"]):
+        for i in T.serial(0, T.int64(16)):
+            for j in T.serial(0, T.int64(8)):
+                with T.block():
+                    vi = T.axis.spatial(T.int64(128), i * T.int64(8) + j)
+                    B[vi] = A[vi] + T.float32(1)
+
+    @T.prim_func
+    def expected_after(A: T.Buffer[(128,), "float32"], B: T.Buffer[(128,), "float32"]):
+        for i in T.serial(0, T.int32(16)):
+            for j in T.serial(0, T.int32(8)):
+                with T.block():
+                    vi = T.axis.spatial(T.int32(128), i * T.int32(8) + j)
+                    B[vi] = A[vi] + T.float32(1)
+
+    after = tvm.tir.transform.NarrowDataType(32)(tvm.IRModule.from_expr(before))["main"]
+    tvm.ir.assert_structural_equal(after, expected_after)
+
+
 if __name__ == "__main__":
-    test_basic()
-    test_thread_axis()
-    test_thread_axis_2()
-    test_multilanes()
-    test_reduce()
-    test_slice()
-    test_relay_basic()
-    test_relay_take()
-    test_ramp_dtype_consistency()
-    test_condition()
+    tvm.testing.main()

From c98f3cd6f8b0dcd8b6b07fecd5a60174ec13dc5b Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Thu, 17 Nov 2022 09:49:04 +0000
Subject: [PATCH 603/704] [ACL] Enable int8 data type in QNN ADD (#13407)

This enables int8 data type to be used in Compute Library
for the Arm(r) Architecture (ACL) BYOC integration.
---
 python/tvm/relay/op/contrib/arm_compute_lib.py        | 2 +-
 src/relay/backend/contrib/arm_compute_lib/codegen.cc  | 2 +-
 src/runtime/contrib/arm_compute_lib/acl_utils.cc      | 2 ++
 tests/python/contrib/test_arm_compute_lib/test_add.py | 4 +++-
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index 9abd320b2956..d63cd8c83a93 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -511,7 +511,7 @@ def qnn_add(expr):
     """Check if the external ACL codegen for add should be used."""
     args = expr.args
     for typ in [args[0].checked_type, args[1].checked_type]:
-        if typ.dtype != "uint8":
+        if typ.dtype not in ["int8", "uint8"]:
             return False
 
     return True
diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc
index 81a5b5bbd9d8..3f11e63c7391 100644
--- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc
+++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc
@@ -292,7 +292,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
   /*!
    * \brief Create a JSON representation of a composite (global) average pooling operator.
    *
-   * A composite function is only created when using the uint8 datatype for these operators.
+   * A composite function is only created when using the int8/uint8 datatype for these operators.
    *
    * \param cn The call to be represented.
    * \return A JSON representation of a specific operator.
diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
index 238b7355de26..0f2dde5e36e1 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
@@ -130,6 +130,8 @@ arm_compute::DataType MakeACLDataType(const DLDataType& data_type) {
     return arm_compute::DataType::F32;
   } else if (data_type.code == DLDataTypeCode::kDLUInt && data_type.bits == 8) {
     return arm_compute::DataType::QASYMM8;
+  } else if (data_type.code == DLDataTypeCode::kDLInt && data_type.bits == 8) {
+    return arm_compute::DataType::QASYMM8_SIGNED;
   } else if (data_type.code == DLDataTypeCode::kDLInt && data_type.bits == 32) {
     return arm_compute::DataType::S32;
   } else {
diff --git a/tests/python/contrib/test_arm_compute_lib/test_add.py b/tests/python/contrib/test_arm_compute_lib/test_add.py
index ba324358f8e5..ee6fcf603cb0 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_add.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_add.py
@@ -92,7 +92,8 @@ def test_runtime_add():
 
     for dtype, low, high, atol, rtol, op, op_params in [
         ("float32", -127, 128, 1e-7, 1e-7, relay.add, {}),
-        ("uint8", 0, 255, 0.0, 1.0, relay.qnn.op.add, _qnn_params),
+        ("uint8", 0, 255, 1.0, 0.0, relay.qnn.op.add, _qnn_params),
+        ("int8", -127, 128, 1.0, 0.0, relay.qnn.op.add, _qnn_params),
     ]:
         shape = (2, 2)
         for inputs in [
@@ -125,6 +126,7 @@ def test_codegen_add():
     for dtype, op_name, op, qnn_params in [
         ("float32", "add", relay.add, {}),
         ("uint8", "qnn.add", relay.qnn.op.add, _qnn_params),
+        ("int8", "qnn.add", relay.qnn.op.add, _qnn_params),
     ]:
         for shape in [(1, 1), (2, 2, 2), (3, 3, 3, 3)]:
             func = _get_model(shape, dtype, iter(inputs), op, qnn_params)

From 5b1d2cc3e822367783a660e268ef3311f8a2f06b Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov <dmitriy.smirnov@arm.com>
Date: Thu, 17 Nov 2022 15:42:09 +0000
Subject: [PATCH 604/704] [usmp] Hill Climb greedy layout size check relaxed
 (#13369)

* [usmp] Hill Climb greedy layout size check relaxed

The check relaxed as the later permutations could lead to winning
combination

Change-Id: I74cff65f7899b419264b79d269c1ddb8624adc48

* added check for empty imput

Change-Id: I674b0ee9061c675a968d58ce120d10191bfc9b16
---
 src/tir/usmp/algo/hill_climb.cc | 78 +++++++++++++++++++++++++--------
 1 file changed, 60 insertions(+), 18 deletions(-)

diff --git a/src/tir/usmp/algo/hill_climb.cc b/src/tir/usmp/algo/hill_climb.cc
index ed90430277ec..1da9cef1eb6f 100644
--- a/src/tir/usmp/algo/hill_climb.cc
+++ b/src/tir/usmp/algo/hill_climb.cc
@@ -78,16 +78,18 @@ class HillClimbAllocator : public GreedyBase {
    * HillClimb's version of greedy allocation
    * \param buffer_info_vec - buffers in specific order for allocation
    */
-  alloc_map_t greedy(const std::vector<BufferInfo>& buffer_info_vec) {
+  alloc_map_t greedy(const std::vector<BufferInfo>& buffer_info_vec, bool* could_not_fit) {
     alloc_map_t pool_allocations(buffer_info_vec.size());
     for (const auto& buf_info : buffer_info_vec) {
       std::unordered_map<PoolInfo, size_t, ObjectPtrHash, ObjectPtrEqual> pool_offset_candidates;
+
+      // check whether we can fit the buffer into the empty pool candidate
       for (const auto& pool_info : buf_info->pool_candidates) {
         if (IsValidPlacement(pool_info, 0, buf_info->size_bytes->value)) {
           pool_offset_candidates[pool_info] = 0;
         }
       }
-
+      // select conflicting buffers which have already been allocated
       std::vector<const BufferInfoNode*> buf_conf;
       for (const auto& conflict_buf_info_obj : buf_info->conflicts) {
         const BufferInfoNode* conflict_buf_info = conflict_buf_info_obj.as<BufferInfoNode>();
@@ -106,14 +108,18 @@ class HillClimbAllocator : public GreedyBase {
       for (const auto* conflict_buf_info : buf_conf) {
         size_t next_offset = 0;
         auto pool_allocation = pool_allocations[conflict_buf_info];
-        next_offset =
-            pool_allocation->byte_offset.IntValue() + conflict_buf_info->size_bytes.IntValue();
-        next_offset = round_up_to_byte_alignment(next_offset, conflict_buf_info->alignment->value);
         if (!pool_offset_candidates.count(pool_allocation->pool_info)) {
           continue;
         }
+
+        next_offset =
+            pool_allocation->byte_offset.IntValue() + conflict_buf_info->size_bytes.IntValue();
+        next_offset = round_up_to_byte_alignment(next_offset, conflict_buf_info->alignment->value);
+
         if (IsValidPlacement(pool_allocation->pool_info, next_offset,
                              buf_info->size_bytes->value)) {
+          // extra check whether the previous attempt to fit the buffer is clashing with the current
+          // conflict
           if (next_offset > pool_offset_candidates[pool_allocation->pool_info] &&
               pool_offset_candidates[pool_allocation->pool_info] +
                       static_cast<size_t>(buf_info->size_bytes.IntValue()) >
@@ -124,7 +130,18 @@ class HillClimbAllocator : public GreedyBase {
           pool_offset_candidates.erase(pool_allocation->pool_info);
         }
       }
-      auto selected_pool = SelectPlacementPool(buf_info, pool_offset_candidates);
+      auto selected_pool = NullValue<PoolInfo>();
+      for (const auto& pi : buf_info->pool_candidates) {
+        if (pool_offset_candidates.count(pi)) {
+          selected_pool = pi;
+          break;
+        }
+      }
+
+      if (selected_pool.same_as(NullValue<PoolInfo>())) {
+        *could_not_fit = true;
+      }
+
       pool_allocations[buf_info.as<BufferInfoNode>()] =
           PoolAllocation(selected_pool, Integer(pool_offset_candidates[selected_pool]));
     }
@@ -140,6 +157,9 @@ class HillClimbAllocator : public GreedyBase {
     for (const auto& it : *pool_allocations) {
       const BufferInfoNode* buf = it.first;
       const PoolAllocation& pa = it.second;
+      if (pa->pool_info.same_as(NullValue<PoolInfo>())) {
+        continue;
+      }
       size_t high_sz = pa->byte_offset.IntValue() + buf->size_bytes.IntValue();
       if (pool_sizes[pa->pool_info] <= high_sz) {
         pool_sizes[pa->pool_info] = high_sz;
@@ -183,14 +203,16 @@ class HillClimbAllocator : public GreedyBase {
 #else
 #define rnd_func() rand()
 #endif
-
+    Map<BufferInfo, PoolAllocation> result;
+    if (!buffer_info_arr.size()) {
+      return result;
+    }
     std::vector<BufferInfo> buffer_info_vec;
     for (const auto& buffer_info : buffer_info_arr) {
       ICHECK(buffer_info->pool_candidates.size())
           << "Cannot process buffer \"" << buffer_info->name_hint << "\" with no pool candidates";
       buffer_info_vec.push_back(std::move(buffer_info));
     }
-
     sort_vector<BufferInfo>(&buffer_info_vec);
 
     // populate positional index map
@@ -232,27 +254,34 @@ class HillClimbAllocator : public GreedyBase {
 
     for (; attempts < _max_attempts; ++attempts) {
       rollback_pool_allocations = std::move(pool_allocations);
-      pool_allocations = std::move(greedy(buffer_info_vec));
+      bool could_not_fit = false;
+      pool_allocations = std::move(greedy(buffer_info_vec, &could_not_fit));
 
       // estimate result buffers
       std::unordered_map<PoolInfo, size_t, ObjectPtrHash, ObjectPtrEqual> pool_sizes =
           find_highest(&pool_allocations);
+      if (!pool_sizes.size()) {
+        CHECK(false) << "TVM USMP Error: Please increase the size_hints for memory pools.";
+      }
+
       // calculate summary
       size_t total = 0;
       for (const auto& el : pool_sizes) {
         total += el.second;
       }
       // accept/reject result heuristic
-      if (!total_size ||         /* first run */
-          (total_size > total || /* always accept if better or with some probability */
-           rnd_func() % 100 < static_cast<int>(50 * (total - total_size) / total / attempts))) {
+      if (!total_size || /* first run */
+          (!could_not_fit &&
+           (total_size > total || /* always accept if better or with some probability */
+            rnd_func() % 100 < static_cast<int>(50 * (total - total_size) / total / attempts)))) {
         // remember winning combination
         result_pool_allocations = pool_allocations;
-        total_size = total;
-
-        // reached desired size
-        if (total_size <= desired_bytes_) {
-          break;
+        if (!could_not_fit) {
+          total_size = total;
+          // reached desired size
+          if (total_size <= desired_bytes_) {
+            break;
+          }
         }
 
       } else {
@@ -267,11 +296,17 @@ class HillClimbAllocator : public GreedyBase {
       for (const auto& it : pool_allocations) {
         const auto* buf = it.first;
         const auto pa = it.second;
+        if (pa->pool_info.same_as(NullValue<PoolInfo>())) {
+          continue;
+        }
         size_t high_sz = pa->byte_offset.IntValue() + buf->size_bytes.IntValue();
         if (pool_sizes[pa->pool_info] == high_sz) {
           max_pool_buf.push_back(buf);
         }
       }
+      if (!max_pool_buf.size()) {
+        CHECK(false) << "TVM USMP Error: Please increase the size_hints for memory pools.";
+      }
       sort(max_pool_buf.begin(), max_pool_buf.end(),
            [&_pos](const auto* a, const auto* b) { return _pos(a) < _pos(b); });
       // pick highest
@@ -309,9 +344,16 @@ class HillClimbAllocator : public GreedyBase {
       swap_buffers(swap_i1, swap_i2);
     }
 
-    Map<BufferInfo, PoolAllocation> result;
     // return winning combination
     for (auto it : result_pool_allocations) {
+      // post-check that everything was fit
+      const BufferInfoNode* buf = it.first;
+      const PoolAllocation& pa = it.second;
+      if (NullValue<PoolInfo>().same_as(pa->pool_info) ||
+          !IsValidPlacement(pa->pool_info, pa->byte_offset->value, buf->size_bytes->value)) {
+        std::unordered_map<PoolInfo, size_t, ObjectPtrHash, ObjectPtrEqual> m = {};
+        SelectPlacementPool(GetRef<BufferInfo>(buf), m);
+      }
       result.Set(GetRef<BufferInfo>(it.first), it.second);
     }
     return result;

From 25ad54058a28782314405aa93c0d28ca6cd78077 Mon Sep 17 00:00:00 2001
From: Masahiro Hiramori <mhg00g13@gmail.com>
Date: Fri, 18 Nov 2022 03:09:35 +0900
Subject: [PATCH 605/704] [CI] Update minor git options (#13398)

The changes are listed below.
- avoid git checkout after git clone
---
 docker/install/ubuntu_install_ethosn_driver_stack.sh |  7 ++-----
 docker/install/ubuntu_install_ethosu_driver_stack.sh | 10 ++--------
 docker/install/ubuntu_install_papi.sh                |  6 ++----
 3 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/docker/install/ubuntu_install_ethosn_driver_stack.sh b/docker/install/ubuntu_install_ethosn_driver_stack.sh
index 1696b3230e2f..4c26497c3895 100755
--- a/docker/install/ubuntu_install_ethosn_driver_stack.sh
+++ b/docker/install/ubuntu_install_ethosn_driver_stack.sh
@@ -50,10 +50,7 @@ apt-install-and-clear -y \
     wget
 
 cd "$tmpdir"
-git clone "$repo_url" "$repo_dir"
+git clone --branch "$repo_revision" "$repo_url" "$repo_dir"
 
-cd "$repo_dir"
-git checkout "$repo_revision"
-
-cd "driver"
+cd "$repo_dir"/driver
 scons install_prefix="$install_path" install
diff --git a/docker/install/ubuntu_install_ethosu_driver_stack.sh b/docker/install/ubuntu_install_ethosu_driver_stack.sh
index d34445e2e80f..da2f955d3fb7 100755
--- a/docker/install/ubuntu_install_ethosu_driver_stack.sh
+++ b/docker/install/ubuntu_install_ethosu_driver_stack.sh
@@ -76,14 +76,8 @@ export PATH="/opt/arm/gcc-arm-none-eabi/bin:${PATH}"
 # Clone Arm(R) Ethos(TM)-U NPU driver stack
 mkdir -p "${ethosu_dir}"
 cd "${ethosu_dir}"
-git clone "https://review.mlplatform.org/ml/ethos-u/ethos-u-core-driver" core_driver
-cd core_driver
-git checkout tags/${ethosu_driver_ver}
-
-cd "${ethosu_dir}"
-git clone "https://review.mlplatform.org/ml/ethos-u/ethos-u-core-platform" core_platform
-cd core_platform
-git checkout tags/${ethosu_driver_ver}
+git clone --branch ${ethosu_driver_ver} "https://review.mlplatform.org/ml/ethos-u/ethos-u-core-driver" core_driver
+git clone --branch ${ethosu_driver_ver} "https://review.mlplatform.org/ml/ethos-u/ethos-u-core-platform" core_platform
 
 # Build Driver
 mkdir ${ethosu_dir}/core_driver/build && cd ${ethosu_dir}/core_driver/build
diff --git a/docker/install/ubuntu_install_papi.sh b/docker/install/ubuntu_install_papi.sh
index ebcca0b424a6..958144518590 100755
--- a/docker/install/ubuntu_install_papi.sh
+++ b/docker/install/ubuntu_install_papi.sh
@@ -26,11 +26,9 @@ apt-get update --fix-missing
 apt-install-and-clear -y linux-tools-common linux-tools-generic kmod
 
 cd /
-git clone https://bitbucket.org/icl/papi.git
 # Pulling the latest version of this has broken the images before. Checkout the tagged version below for now.
-cd papi
-git checkout papi-6-0-0-1-t
-cd src
+git clone --branch papi-6-0-0-1-t https://bitbucket.org/icl/papi.git
+cd papi/src
 export PAPI_CUDA_ROOT=/usr/local/cuda
 export PAPI_ROCM_ROOT=/opt/rocm
 ./configure --with-components="$1"

From 01a4725b6b8b36d3a9a1df666478310078160d2d Mon Sep 17 00:00:00 2001
From: Siyuan Feng <Hzfengsy@sjtu.edu.cn>
Date: Fri, 18 Nov 2022 02:27:05 +0800
Subject: [PATCH 606/704] [MetaSchedule][Fix] Fix Empty Run Time Issue when
 Benchmarking Result (#13406)

* [MetaSchedule][Fix] Fix Empty Run Time Issue when Benchmarking Result

As a follow-up fix PR for https://github.com/apache/tvm/pull/13354, which introduces a bug that the tuning will crash if the run time is empty (usually because of a runtime error).

* Update src/meta_schedule/measure_callback/update_cost_model.cc

Co-authored-by: Xiyou Zhou <xiyou.zhou@gmail.com>

* lint

Co-authored-by: Xiyou Zhou <xiyou.zhou@gmail.com>
---
 .../measure_callback/update_cost_model.cc     |  5 ++--
 .../test_meta_schedule_measure_callback.py    | 26 +++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/meta_schedule/measure_callback/update_cost_model.cc b/src/meta_schedule/measure_callback/update_cost_model.cc
index 8a8a43658409..6c217a6c4d65 100644
--- a/src/meta_schedule/measure_callback/update_cost_model.cc
+++ b/src/meta_schedule/measure_callback/update_cost_model.cc
@@ -42,8 +42,9 @@ class UpdateCostModelNode : public MeasureCallbackNode {
     pruned_candidate.reserve(n);
     pruned_runner_result.reserve(n);
     for (int i = 0; i < n; i++) {
-      if (!builder_results[i]->error_msg.defined() &&
-          Sum(runner_results[i]->run_secs.value()) > 0) {
+      if (!builder_results[i]->error_msg.defined() &&  //
+          (runner_results[i]->error_msg.defined() ||   //
+           Sum(runner_results[i]->run_secs.value()) > 0)) {
         pruned_candidate.push_back(measure_candidates[i]);
         pruned_runner_result.push_back(runner_results[i]);
       }
diff --git a/tests/python/unittest/test_meta_schedule_measure_callback.py b/tests/python/unittest/test_meta_schedule_measure_callback.py
index c3fbbbe97231..0b7b22d92bb7 100644
--- a/tests/python/unittest/test_meta_schedule_measure_callback.py
+++ b/tests/python/unittest/test_meta_schedule_measure_callback.py
@@ -148,8 +148,34 @@ def run(self, runner_inputs: List[ms.runner.RunnerInput]) -> List[ms.runner.Runn
         )
 
 
+def test_meta_schedule_measure_callback_update_cost_model_with_runtime_error():
+    @ms.derived_object
+    class EmptyRunnerFuture(ms.runner.PyRunnerFuture):
+        def done(self) -> bool:
+            return True
+
+        def result(self) -> ms.runner.RunnerResult:
+            return ms.runner.RunnerResult(None, "error")
+
+    @ms.derived_object
+    class EmptyRunner(ms.runner.PyRunner):
+        def run(self, runner_inputs: List[ms.runner.RunnerInput]) -> List[ms.runner.RunnerResult]:
+            return [EmptyRunnerFuture() for _ in runner_inputs]
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        ms.tune_tir(
+            mod=Matmul,
+            target="llvm -num-cores=1",
+            work_dir=work_dir,
+            max_trials_global=10,
+            runner=EmptyRunner(),
+            measure_callbacks=[ms.measure_callback.UpdateCostModel()],
+        )
+
+
 if __name__ == "__main__":
     test_meta_schedule_measure_callback()
     test_meta_schedule_measure_callback_fail()
     test_meta_schedule_measure_callback_as_string()
     test_meta_schedule_measure_callback_update_cost_model_with_zero()
+    test_meta_schedule_measure_callback_update_cost_model_with_runtime_error()

From b29ab5c6baea8884a5b9c0702c1c93da6a830866 Mon Sep 17 00:00:00 2001
From: Noah Verke <nverke@users.noreply.github.com>
Date: Thu, 17 Nov 2022 19:21:14 -0800
Subject: [PATCH 607/704] =?UTF-8?q?[Hexagon]=20Add=20test=20to=20show=20sc?=
 =?UTF-8?q?heduling=20of=20resnet50=20with=20async=20dma=20pipe=E2=80=A6?=
 =?UTF-8?q?=20(#13352)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Hexagon] Add test to show scheduling of resnet50 with async dma pipelines using metaschedule

* lint
---
 python/tvm/contrib/hexagon/meta_schedule.py   |  23 +-
 python/tvm/tir/tensor_intrin/hexagon.py       | 178 +++++++-------
 .../metaschedule_e2e/test_resnet50_int8.py    | 218 +++++++++++++++---
 3 files changed, 299 insertions(+), 120 deletions(-)

diff --git a/python/tvm/contrib/hexagon/meta_schedule.py b/python/tvm/contrib/hexagon/meta_schedule.py
index aaf3f8c7f8d5..dcc7d232d8c4 100644
--- a/python/tvm/contrib/hexagon/meta_schedule.py
+++ b/python/tvm/contrib/hexagon/meta_schedule.py
@@ -17,7 +17,14 @@
 """Meta schedule tuning utilities for Hexagon."""
 import os
 import tempfile
-from typing import Callable, List, Optional
+from typing import Callable, Dict, List, Optional
+import tvm
+
+from tvm.ir.module import IRModule
+from tvm.runtime import Module, NDArray
+from tvm.target import Target
+from tvm.driver import build as tvm_build
+from tvm.tir.transform import RemoveWeightLayoutRewriteBlock
 from tvm.contrib.popen_pool import PopenPoolExecutor
 from tvm.meta_schedule.utils import cpu_count, derived_object
 from tvm.meta_schedule.builder import LocalBuilder
@@ -121,14 +128,24 @@ def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path
     return costs
 
 
-def get_hexagon_local_builder():
+def get_hexagon_local_builder(pass_context: tvm.transform.PassContext = None):
     """Return Hexagon-compatible Builder for meta schedule."""
 
     def export_func(mod):
         binary_path = export_module(mod, tempfile.mkdtemp())
         return str(binary_path)
 
-    return LocalBuilder(f_export=export_func)
+    def default_build_with_context(
+        mod: IRModule, target: Target, _params: Optional[Dict[str, NDArray]]
+    ) -> Module:
+        with pass_context:
+            mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod)
+            return tvm_build(mod, target=target)
+
+    if pass_context is not None:
+        return LocalBuilder(f_build=default_build_with_context, f_export=export_func)
+    else:
+        return LocalBuilder(f_export=export_func)
 
 
 def get_hexagon_rpc_runner(
diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py
index 306c8cd2e14e..49c12c3e9dce 100644
--- a/python/tvm/tir/tensor_intrin/hexagon.py
+++ b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -20,98 +20,100 @@
 from .. import TensorIntrin
 
 
-@T.prim_func
-def dot_product_32x4_u8u8i32_desc(
-    A: T.Buffer((4,), "uint8", offset_factor=1),
-    B: T.Buffer((32, 4), "uint8", offset_factor=1),
-    C: T.Buffer((32,), "int32", offset_factor=1),
-) -> None:
-    with T.block("root"):
-        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
-        T.writes(C[0:32])
-        for i in T.serial(0, 32):
-            for k in T.serial(0, 4):
-                with T.block("update"):
-                    vi, vk = T.axis.remap("SR", [i, k])
-                    C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
-
-
-@T.prim_func
-def dot_product_32x4_u8u8i32_vrmpy(
-    A: T.Buffer((4,), "uint8", offset_factor=1),
-    B: T.Buffer((32, 4), "uint8", offset_factor=1),
-    C: T.Buffer((32,), "int32", offset_factor=1),
-) -> None:
-    with T.block("root"):
-        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
-        T.writes(C[0:32])
-
-        A_u8x4 = A.vload([0], "uint8x4")
-        A_i32 = T.reinterpret(A_u8x4, dtype="int32")
-
-        B_i8x128 = B.vload([0, 0], dtype="uint8x128")
-        B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
-
-        C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
-            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
-            T.uint32(3),
-            C[T.ramp(T.int32(0), 1, 32)],
-            B_i32x32,
-            A_i32,
-            dtype="int32x32",
-        )
-
-
-@T.prim_func
-def dot_product_32x4_u8i8i32_desc(
-    A: T.Buffer((4,), "uint8", offset_factor=1),
-    B: T.Buffer((32, 4), "int8", offset_factor=1),
-    C: T.Buffer((32,), "int32", offset_factor=1),
-) -> None:
-    with T.block("root"):
-        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
-        T.writes(C[0:32])
-        for i in T.serial(0, 32):
-            for k in T.serial(0, 4):
-                with T.block("update"):
-                    vi, vk = T.axis.remap("SR", [i, k])
-                    C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
-
-
-@T.prim_func
-def dot_product_32x4_u8i8i32_vrmpy(
-    A: T.Buffer((4,), "uint8", offset_factor=1),
-    B: T.Buffer((32, 4), "int8", offset_factor=1),
-    C: T.Buffer((32,), "int32", offset_factor=1),
-) -> None:
-    with T.block("root"):
-        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
-        T.writes(C[0:32])
-
-        A_u8x4 = A.vload([0], "uint8x4")
-        A_i32 = T.reinterpret(A_u8x4, dtype="int32")
-
-        B_i8x128 = B.vload([0, 0], dtype="int8x128")
-        B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
-
-        C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
-            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpybusv.acc.128B"),
-            T.uint32(3),
-            C[T.ramp(T.int32(0), 1, 32)],
-            T.broadcast(A_i32, 32),
-            B_i32x32,
-            dtype="int32x32",
-        )
+def generate_dot_product_32x4_u8u8i32(mem_scope="global"):
+    @T.prim_func
+    def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope)
+        B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scope)
+        C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope)
+        with T.block("root"):
+            T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+            T.writes(C[0:32])
+            for i in T.serial(0, 32):
+                for k in T.serial(0, 4):
+                    with T.block("update"):
+                        vi, vk = T.axis.remap("SR", [i, k])
+                        C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
+
+    @T.prim_func
+    def dot_product_32x4_u8u8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope)
+        B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scope)
+        C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope)
+        with T.block("root"):
+            T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+            T.writes(C[0:32])
+
+            A_u8x4 = A.vload([0], "uint8x4")
+            A_i32 = T.reinterpret(A_u8x4, dtype="int32")
+
+            B_i8x128 = B.vload([0, 0], dtype="uint8x128")
+            B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+
+            C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
+                T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+                T.uint32(3),
+                C[T.ramp(T.int32(0), 1, 32)],
+                B_i32x32,
+                A_i32,
+                dtype="int32x32",
+            )
+
+    return dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy
+
+
+def generate_dot_product_32x4_u8i8i32(mem_scope="global"):
+    @T.prim_func
+    def dot_product_32x4_u8i8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope)
+        B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scope)
+        C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope)
+        with T.block("root"):
+            T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+            T.writes(C[0:32])
+            for i in T.serial(0, 32):
+                for k in T.serial(0, 4):
+                    with T.block("update"):
+                        vi, vk = T.axis.remap("SR", [i, k])
+                        C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
+
+    @T.prim_func
+    def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope)
+        B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scope)
+        C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope)
+        with T.block("root"):
+            T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+            T.writes(C[0:32])
+
+            A_u8x4 = A.vload([0], "uint8x4")
+            A_i32 = T.reinterpret(A_u8x4, dtype="int32")
+
+            B_i8x128 = B.vload([0, 0], dtype="int8x128")
+            B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+
+            C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
+                T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpybusv.acc.128B"),
+                T.uint32(3),
+                C[T.ramp(T.int32(0), 1, 32)],
+                T.broadcast(A_i32, 32),
+                B_i32x32,
+                dtype="int32x32",
+            )
+
+    return dot_product_32x4_u8i8i32_desc, dot_product_32x4_u8i8i32_vrmpy
 
 
 VRMPY_u8u8i32_INTRIN = "dot_32x4_u8u8i32_vrmpy"
 
-TensorIntrin.register(
-    VRMPY_u8u8i32_INTRIN, dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy
-)
+TensorIntrin.register(VRMPY_u8u8i32_INTRIN, *generate_dot_product_32x4_u8u8i32())
 
 VRMPY_u8i8i32_INTRIN = "dot_32x4_u8i8i32_vrmpy"
 
-TensorIntrin.register(
-    VRMPY_u8i8i32_INTRIN, dot_product_32x4_u8i8i32_desc, dot_product_32x4_u8i8i32_vrmpy
-)
+TensorIntrin.register(VRMPY_u8i8i32_INTRIN, *generate_dot_product_32x4_u8i8i32())
+
+VRMPY_u8u8i32_VTCM_INTRIN = "dot_32x4_u8u8i32_vtcm_vrmpy"
+TensorIntrin.register(VRMPY_u8u8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8u8i32("global.vtcm"))
+
+VRMPY_u8i8i32_VTCM_INTRIN = "dot_32x4_u8i8i32_vtcm_vrmpy"
+TensorIntrin.register(VRMPY_u8i8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8i8i32("global.vtcm"))
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index 91eb67bbf457..e15b0a4e7ddb 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -18,7 +18,8 @@
 
 import os
 import tempfile
-from typing import Optional
+from types import MappingProxyType
+from typing import Any, Mapping, Optional
 
 import numpy as np
 import pytest
@@ -34,7 +35,11 @@
 from tvm.meta_schedule import postproc, schedule_rule
 from tvm.tir.schedule import BlockRV, Schedule
 from tvm.tir.schedule.analysis import has_block
-from tvm.tir.tensor_intrin.hexagon import VRMPY_u8i8i32_INTRIN, VRMPY_u8u8i32_INTRIN
+from tvm.tir.tensor_intrin.hexagon import (
+    VRMPY_u8i8i32_INTRIN,
+    VRMPY_u8u8i32_INTRIN,
+    VRMPY_u8i8i32_VTCM_INTRIN,
+)
 
 from ..infrastructure import get_hexagon_target
 
@@ -133,7 +138,6 @@ def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
             # from 36 to 23, with negligible performance difference.
             module_equality="anchor-block",
         )
-
         return ms.relay_integration.compile_relay(
             database=database,
             mod=mod,
@@ -142,10 +146,13 @@ def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
         )
 
 
-@pytest.mark.skip("End-to-end tuning is skipped on CI.")
 @tvm.testing.requires_hexagon
 def test_resnet50(hexagon_launcher):
     """Test Resnet50."""
+
+    if tvm.testing.utils.IS_IN_CI:
+        pytest.skip("Skipping test since it takes too long in CI.")
+
     if not os.path.exists(MODEL_JSON):
         pytest.skip(msg="Run python export_models.py first.")
 
@@ -200,6 +207,44 @@ def test_resnet50(hexagon_launcher):
         print(debug_ex.profile(input_name=inp.copy()))
 
 
+def evaluate_mod(hexagon_launcher, hexagon_lowered, llvm_lowered, input_name, inp, benchmark=False):
+    """Evaluate the Modules against llvm version."""
+    with hexagon_launcher.create_session() as session:
+        graph_mod = session.get_executor_from_factory(hexagon_lowered)
+        graph_mod.set_input(input_name, inp.copy())
+        graph_mod.run()
+        output = graph_mod.get_output(0).numpy()
+
+        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+        llvm_graph_mod.set_input(input_name, inp.copy())
+        llvm_graph_mod.run()
+        ref_result = llvm_graph_mod.get_output(0).numpy()
+
+        if benchmark:
+            time_ms = graph_mod.benchmark(session.device, number=1, repeat=1).mean * 1e3
+            print("hexagon time elapsed: ", time_ms)
+            debug_ex = session.get_graph_debug_executor(
+                hexagon_lowered.get_graph_json(), hexagon_lowered.lib
+            )
+            print(debug_ex.profile(input_name=inp.copy()))
+
+        np.testing.assert_allclose(ref_result, output, atol=1e-4, rtol=1e-5)
+
+
+def load_model():
+    """Load renset50 model."""
+    if not os.path.exists(MODEL_JSON):
+        pytest.skip(msg="Run python export_models.py first.")
+
+    with open(MODEL_JSON, "r") as file:
+        mod = tvm.ir.load_json(file.read())
+
+    with open(MODEL_PARAMS, "rb") as file:
+        params = relay.load_param_dict(file.read())
+
+    return mod, params
+
+
 def _schedule_packed_8x8x32_conv2d():
     """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
     using 8x8x32 packed layout.
@@ -268,22 +313,39 @@ def index_map_nchw32c_nchw8h8w32c(n_batch, channel, height, width, channel_32):
     return schedule_fn
 
 
-def tune_packed_8x8x32_template(mod, params, hexagon_launcher):
+def tune_conv2d_template(
+    mod,
+    scheduler,
+    schedule_tag,
+    params,
+    hexagon_launcher,
+    pass_config: Mapping[str, Any] = MappingProxyType({}),
+):
     """Generate packed 8*8*32 template."""
 
-    def schedule_rule_conv2d_packed_8x8x32(sch: Schedule, conv2d_block: BlockRV):
-        _schedule_packed_8x8x32_conv2d()(sch, conv2d_block)
+    def schedule_rule_conv2d(sch: Schedule, conv2d_block: BlockRV):
+        scheduler()(sch, conv2d_block)
         return [sch]
 
-    register_func("meta_schedule.conv2d_NCHWc_int8.hexagon", schedule_rule_conv2d_packed_8x8x32)
+    register_func(
+        "meta_schedule.conv2d_NCHWc_int8.{}.hexagon".format(schedule_tag), schedule_rule_conv2d
+    )
 
     def schedule_conv2d_for_tune(sch: Schedule):
-        _schedule_packed_8x8x32_conv2d()(sch)
+        scheduler()(sch)
 
     # This line is necessary for link-params to take effect during
     # task extraction and relay.build(...).
     mod = mod.with_attr("executor", EXECUTOR)
 
+    pass_context = None
+    if len(pass_config.items()) > 0:
+        pass_context = (
+            tvm.transform.PassContext(opt_level=3, config=pass_config)
+            if pass_config is not None
+            else None
+        )
+
     with tempfile.TemporaryDirectory() as work_dir:
         database = ms.relay_integration.tune_relay(
             mod=mod,
@@ -294,8 +356,8 @@ def schedule_conv2d_for_tune(sch: Schedule):
             max_trials_per_task=1,
             num_trials_per_iter=1,
             strategy="replay-trace",
-            builder=get_hexagon_local_builder(),
-            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
+            builder=get_hexagon_local_builder(pass_context),
+            runner=get_hexagon_rpc_runner(hexagon_launcher, number=1),
             # Apply MS auto scheduling rules for all blocks, but utilize
             # the custom block scheduling strategy registered above for
             # blocks annotated as `schedule_rule:meta_schedule.conv2d_NCHWc_int8`
@@ -318,33 +380,37 @@ def schedule_conv2d_for_tune(sch: Schedule):
             # are treated as distinct tuning tasks.
             module_equality="ignore-ndarray",
         )
+
+        # Add default options so that it still uses the base config.
+        pass_config["relay.backend.use_meta_schedule"] = True
+        pass_config["relay.backend.tir_converter"] = "default"
         return ms.relay_integration.compile_relay(
             database=database,
             mod=mod,
             target=TARGET_HEXAGON,
             params=params,
+            pass_config=pass_config,
         )
 
 
-@pytest.mark.skip("End-to-end tuning is skipped on CI.")
 @tvm.testing.requires_hexagon
 def test_packed_8x8x32_resnet50(hexagon_launcher):
     """Test packed 8*8*32 Resnet50"""
-    if not os.path.exists(MODEL_JSON):
-        pytest.skip(msg="Run python export_models.py first.")
 
-    with open(MODEL_JSON, "r") as file:
-        mod = tvm.ir.load_json(file.read())
+    if tvm.testing.utils.IS_IN_CI:
+        pytest.skip("Skipping test since it takes too long in CI.")
+
+    mod, params = load_model()
 
-    with open(MODEL_PARAMS, "rb") as file:
-        params = relay.load_param_dict(file.read())
     inp = np.random.randn(1, 3, 224, 224).astype("float32")
     input_name = "image"
 
     do_tune = True
 
     if do_tune:
-        hexagon_lowered = tune_packed_8x8x32_template(mod, params, hexagon_launcher)
+        hexagon_lowered = tune_conv2d_template(
+            mod, _schedule_packed_8x8x32_conv2d, "packed_8x8x32", params, hexagon_launcher
+        )
     else:
         with tvm.transform.PassContext(opt_level=3):
             hexagon_lowered = relay.build(
@@ -361,18 +427,112 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
             params=params,
         )
 
-    with hexagon_launcher.start_session() as session:
-        graph_mod = session.get_executor_from_factory(hexagon_lowered)
-        graph_mod.set_input(input_name, inp.copy())
-        graph_mod.run()
-        hexagon_output = graph_mod.get_output(0).numpy()
+    evaluate_mod(hexagon_launcher, hexagon_lowered, llvm_lowered, input_name, inp)
 
-        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-        llvm_graph_mod.set_input(input_name, inp.copy())
-        llvm_graph_mod.run()
-        ref_result = llvm_graph_mod.get_output(0).numpy()
 
-        np.testing.assert_allclose(ref_result, hexagon_output, atol=1e-4, rtol=1e-5)
+def _schedule_async_dma_conv2d():
+    """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
+    using 8x8x32 packed layout.
+    """
+
+    def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
+        if conv2d_block is None:
+            if has_block(sch, "conv2d_NCHWc_int8"):
+                conv2d_block = sch.get_block("conv2d_NCHWc_int8")
+            else:
+                return False
+
+        assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
+
+        # Apply scheduling
+
+        post_blocks = sch.get_consumers(conv2d_block)
+        if len(post_blocks) > 0:
+            # Fuse all intermediate post ops into the last op.
+            # This is equivalent to the traverse_inline function used in TE schedules.
+            while True:
+                next_post_blocks = []
+                for post_block in post_blocks:
+                    next_consumers = sch.get_consumers(post_block)
+                    if len(next_consumers) > 0:
+                        sch.compute_inline(post_block)
+                    next_post_blocks += next_consumers
+                if len(next_post_blocks) == 0:
+                    assert len(post_blocks) == 1
+                    outer_block = post_blocks[0]
+                    break
+                post_blocks = next_post_blocks
+        else:
+            outer_block = conv2d_block
+
+        # Move the conv2d mma into the injective post mma compute block
+        if outer_block != conv2d_block:
+            loops = sch.get_loops(outer_block)
+            # Compute at the second loop for pipelining.
+            sch.compute_at(conv2d_block, loops[1], preserve_unit_loops=True)
+
+        # Add cache for input and output for copying data to vtcm.
+        input_a_cache = sch.cache_read(conv2d_block, 0, "global.vtcm")
+        sch.compute_at(input_a_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_a_cache)[2:])
+
+        input_b_cache = sch.cache_read(conv2d_block, 1, "global.vtcm")
+        sch.compute_at(input_b_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_b_cache)[2:])
+
+        output_cache_write = sch.cache_write(conv2d_block, 0, "global.vtcm")
+        sch.fuse(*sch.get_loops(output_cache_write)[2:])
+
+        conv2d_loops = sch.get_loops(block=conv2d_block)
+        o_c, k_h, k_w, x_0, x_1, i_c = conv2d_loops[-6:]
+        ic_o, ic_i = sch.split(loop=i_c, factors=[None, 4], preserve_unit_iters=True)
+        oc_o, oc_i = sch.split(loop=o_c, factors=[None, 32], preserve_unit_iters=True)
+        sch.reorder(oc_o, k_h, k_w, x_0, x_1, ic_o, oc_i, ic_i)
+        new_loops = sch.get_loops(block=conv2d_block)
+        sch.parallel(new_loops[4])
+        sch.unroll(new_loops[5])
+        # TODO(nverke): Add compute optimizations here.
+        sch.blockize(loop=oc_i)
+
+        sch.tensorize(oc_i, VRMPY_u8i8i32_VTCM_INTRIN)
+
+        pipeline_loop = conv2d_loops[1]
+        sch.annotate(pipeline_loop, "software_pipeline_stage", [0, 0, 1, 2, 3])
+        sch.annotate(pipeline_loop, "software_pipeline_order", [0, 1, 2, 3, 4])
+        sch.annotate(pipeline_loop, "software_pipeline_async_stages", [0, 2])
+
+        return True
+
+    return schedule_fn
+
+
+@tvm.testing.requires_hexagon
+def test_async_dma_resnet50(hexagon_launcher):
+    """Test async dma Resnet50"""
+
+    if tvm.testing.utils.IS_IN_CI:
+        pytest.skip("Skipping test since it takes too long in CI.")
+
+    mod, params = load_model()
+
+    inp = np.random.randn(1, 3, 224, 224).astype("float32")
+    input_name = "image"
+
+    pass_config = {
+        "tir.use_async_copy": 1,
+        "tir.merge_async_commit_queue_scope": False,
+        "relay.backend.use_meta_schedule": True,
+        "relay.backend.tir_converter": "default",
+    }
+
+    hexagon_lowered = tune_conv2d_template(
+        mod, _schedule_async_dma_conv2d, "async_dma", params, hexagon_launcher, pass_config
+    )
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            mod, tvm.target.Target(TARGET_LLVM, host=TARGET_LLVM), params=params
+        )
+    evaluate_mod(hexagon_launcher, hexagon_lowered, llvm_lowered, input_name, inp, True)
 
 
 if __name__ == "__main__":

From 9cb36b1aada8b4d7833ca3ef21a7b23342c57825 Mon Sep 17 00:00:00 2001
From: Thierry Moreau <tmoreau@octoml.ai>
Date: Thu, 17 Nov 2022 23:05:46 -0500
Subject: [PATCH 608/704] [COMMUNITY] Egor Churaev -> Committer (#13422)

adding egor
---
 CONTRIBUTORS.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index fb1353e49d5d..7be6cd62b599 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -35,6 +35,7 @@ We do encourage everyone to work anything they are interested in.
 - [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs
 - [Wei Chen](https://github.com/wweic): @wweic - runtime, relay, vm
 - [Zhi Chen](https://github.com/zhiics) (PMC): @zhiics - relay, quantization, pass manager
+- [Egor Churaev](https://github.com/echuraev): @echuraev - metal, opencl, adreno
 - [Siyuan Feng](https://github.com/Hzfengsy) (PMC): @Hzfengsy - tir
 - [Josh Fromm](https://github.com/jwfromm) (PMC): @jwfromm - frontends, quantization, topi
 - [Mehrdad Hessar](https://github.com/mehrdadh): @mehrdadh - microTVM, hexagon
@@ -104,7 +105,7 @@ We do encourage everyone to work anything they are interested in.
 - [Christian Convey](https://github.com/cconvey/): @cconvey
 - [Meghan Cowan](https://github.com/cowanmeg): @cowanmeg
 - [Balint Cristian](https://github.com/cbalint13): @cbalint13
-- [Egor Churaev](https://github.com/echuraev): @echuraev - metal
+- [Egor Churaev](https://github.com/echuraev): @echuraev
 - [Xiaoqiang Dan](https://github.com/xqdan): @xqdan
 - [Haozheng Fan](https://github.com/hzfan): @hzfan
 - [Siyuan Feng](https://github.com/Hzfengsy): @Hzfengsy

From 53824d697a633260ac62777eafd624c6406d9d42 Mon Sep 17 00:00:00 2001
From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com>
Date: Fri, 18 Nov 2022 07:06:42 +0300
Subject: [PATCH 609/704] [Hexagon][QNN] Add TOPI strategies for qnn ops
 mul/tanh/subtract (#13416)

This commit adds compute/schedule implementation for Hexagon target for
QNN ops: qnn.mul, qnn.subtract, qnn.tanh. It works only if QNN
canonicalization pass was disabled.
---
 python/tvm/relay/qnn/op/_qnn.py               |  11 +-
 python/tvm/relay/qnn/strategy/generic.py      |  27 +++
 python/tvm/relay/qnn/strategy/hexagon.py      |  36 ++++
 python/tvm/topi/hexagon/qnn/nn.py             | 179 ++++++++++++++----
 src/relay/qnn/op/add.cc                       |   3 +-
 src/relay/qnn/op/mul.cc                       |   3 +-
 src/relay/qnn/op/requantize.cc                |   3 +
 src/relay/qnn/op/subtract.cc                  |   3 +-
 .../test_wo_qnn_canonicalization.py           | 178 +++++++++++++----
 9 files changed, 362 insertions(+), 81 deletions(-)

diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py
index 4e54583a3be0..64ef1ee92a1c 100644
--- a/python/tvm/relay/qnn/op/_qnn.py
+++ b/python/tvm/relay/qnn/op/_qnn.py
@@ -66,7 +66,16 @@ def simulated_dequantize_compute(attrs, inputs, output_type):
 
 # qnn.add
 register_strategy("qnn.add", strategy.qnn_add_strategy)
-register_pattern("qnn.add", OpPattern.BROADCAST)
+
+# qnn.subtract
+register_strategy("qnn.subtract", strategy.qnn_subtract_strategy)
+
+# qnn.mul
+register_strategy("qnn.mul", strategy.qnn_mul_strategy)
+
+# qnn.tanh
+register_strategy("qnn.tanh", strategy.qnn_tanh_strategy)
+register_pattern("qnn.tanh", OpPattern.ELEMWISE)
 
 # qnn.concatenate
 register_strategy("qnn.concatenate", strategy.qnn_concatenate_strategy)
diff --git a/python/tvm/relay/qnn/strategy/generic.py b/python/tvm/relay/qnn/strategy/generic.py
index 57a364f7e057..8275cf7f755e 100644
--- a/python/tvm/relay/qnn/strategy/generic.py
+++ b/python/tvm/relay/qnn/strategy/generic.py
@@ -213,6 +213,33 @@ def qnn_add_strategy(attrs, inputs, out_type, target):
     )
 
 
+@override_native_generic_func("qnn_subtract_strategy")
+def qnn_subtract_strategy(attrs, inputs, out_type, target):
+    """qnn.subtract generic strategy"""
+    raise RuntimeError(
+        "qnn.subtract is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
+
+
+@override_native_generic_func("qnn_mul_strategy")
+def qnn_mul_strategy(attrs, inputs, out_type, target):
+    """qnn.mul generic strategy"""
+    raise RuntimeError(
+        "qnn.mul is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
+
+
+@override_native_generic_func("qnn_tanh_strategy")
+def qnn_tanh_strategy(attrs, inputs, out_type, target):
+    """qnn.tanh generic strategy"""
+    raise RuntimeError(
+        "qnn.tanh is currently only supported with Hexagon. "
+        "Please run QNN Canonicalize pass to decompose this op into supported ops."
+    )
+
+
 @override_native_generic_func("qnn_concatenate_strategy")
 def qnn_concatenate_strategy(attrs, inputs, out_type, target):
     """qnn.concatenate generic strategy"""
diff --git a/python/tvm/relay/qnn/strategy/hexagon.py b/python/tvm/relay/qnn/strategy/hexagon.py
index c7f59cc096fc..d17812e3fbcc 100644
--- a/python/tvm/relay/qnn/strategy/hexagon.py
+++ b/python/tvm/relay/qnn/strategy/hexagon.py
@@ -71,6 +71,42 @@ def qnn_add_strategy_hexagon(attrs, inputs, out_type, target):
     return strategy
 
 
+@qnn_subtract_strategy.register("hexagon")
+def qnn_subtract_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.subtract strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_topi_compute(topi.hexagon.qnn_subtract),
+        wrap_topi_schedule(topi.hexagon.schedule_qnn_subtract),
+        name="qnn_subtract.hexagon",
+    )
+    return strategy
+
+
+@qnn_mul_strategy.register("hexagon")
+def qnn_mul_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.mul strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_topi_compute(topi.hexagon.qnn_mul),
+        wrap_topi_schedule(topi.hexagon.schedule_qnn_mul),
+        name="qnn_mul.hexagon",
+    )
+    return strategy
+
+
+@qnn_tanh_strategy.register("hexagon")
+def qnn_tanh_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.tanh strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_topi_compute(topi.hexagon.qnn_tanh),
+        wrap_topi_schedule(topi.hexagon.schedule_qnn_tanh),
+        name="qnn_tanh.hexagon",
+    )
+    return strategy
+
+
 @qnn_concatenate_strategy.register("hexagon")
 def qnn_concatenate_strategy_hexagon(attrs, inputs, out_type, target):
     """qnn.concatenate strategy for Hexagon"""
diff --git a/python/tvm/topi/hexagon/qnn/nn.py b/python/tvm/topi/hexagon/qnn/nn.py
index 40cfd0ee96b1..49220d0fd013 100644
--- a/python/tvm/topi/hexagon/qnn/nn.py
+++ b/python/tvm/topi/hexagon/qnn/nn.py
@@ -19,6 +19,7 @@
 
 import tvm
 from tvm import te, topi
+from ..utils import saturate
 from ...utils import get_const_tuple
 from ...nn.utils import get_pad_tuple
 from ...nn.pad import pad
@@ -33,6 +34,11 @@ def clip_cast(val, dtype):
     return te.max(tvm.te.min(val, const_max), const_min).astype(dtype)
 
 
+# Return True if given Tensor is scalar constant value.
+def is_constant(tensor: te.Tensor):
+    return tensor.ndim == 0
+
+
 def get_qnn_param(param, indices, axis):
     # Account scalar and 1D quantization parameters:
     if len(param.shape) == 0:
@@ -62,7 +68,7 @@ def default_schedule(outs):
     return s
 
 
-def qnn_quantize(data, output_scale, output_zero_point, axis, out_dtype):
+def qnn_quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"):
     """Compute for qnn.quantize
 
     Q_output = clamp((round(input_tensor/output_scale) + output_zero_point),
@@ -101,7 +107,7 @@ def schedule_qnn_quantize(outs):
     return default_schedule(outs)
 
 
-def qnn_dequantize(data, input_scale, input_zero_point, axis):
+def qnn_dequantize(data, input_scale, input_zero_point, axis=-1):
     """Compute for qnn.dequantize
 
     fp_output = input_scale * (Q_input - input_zero_point)
@@ -134,7 +140,7 @@ def schedule_qnn_dequantize(outs):
     return default_schedule(outs)
 
 
-def qnn_requantize(data, input_scale, input_zp, output_scale, output_zp, axis, out_dtype):
+def qnn_requantize(data, input_scale, input_zp, output_scale, output_zp, axis=-1, out_dtype="int8"):
     """Compute for qnn.requantize
 
     Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input))
@@ -177,37 +183,58 @@ def schedule_qnn_requantize(outs):
     return default_schedule(outs)
 
 
-def qnn_add(
-    lhs, rhs, lhs_scale, lhs_zero_point, rhs_scale, rhs_zero_point, output_scale, output_zero_point
+def compute_qnn_binary_op(
+    lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp, func
 ):
-    """Compute for qnn.add
+    """Compute for QNN binary operation
 
-    Q_output = zp_output + round((lhs_scale)/(scale_output) * (lhs_input - lhs_zp_input))
-                         + round((rhs_scale)/(scale_output) * (rhs_input - rhs_zp_input))
-
-    TODO: support 'axis' argument.
+    Q_output = output_zp + round((lhs_scale)/(output_scale) * (lhs_input - lhs_zp))
+                      _OP_ round((rhs_scale)/(output_scale) * (rhs_input - rhs_zp))
+    where _OP_ is add/subtract
     """
-
     assert lhs.dtype == rhs.dtype
     dtype = lhs.dtype
 
+    def _compute_const(x: te.Tensor, iscale, input_zp):
+        return te.round(te.multiply(te.div(iscale, output_scale), te.subtract(x, input_zp))).astype(
+            "int32"
+        )
+
+    def _compute_tensor(x: te.Tensor, iscale, input_zp):
+        return te.compute(
+            x.shape,
+            lambda *i: te.round(
+                te.multiply(te.div(iscale, output_scale), te.subtract(x(*i), input_zp))
+            ).astype("int32"),
+        )
+
+    if is_constant(lhs):
+        lhs_tensor = _compute_const(lhs, lhs_scale, lhs_zp)
+    else:
+        lhs_tensor = _compute_tensor(lhs, lhs_scale, lhs_zp)
+
+    if is_constant(rhs):
+        rhs_tensor = _compute_const(rhs, rhs_scale, rhs_zp)
+    else:
+        rhs_tensor = _compute_tensor(rhs, rhs_scale, rhs_zp)
+
+    # Binary op with broadcasting
+    tensor = func(lhs_tensor, rhs_tensor)
+
+    # Add output zero point and clip+cast.
     def _compute(*indices):
-        lvalue = lhs(*indices)
-        rvalue = rhs(*indices)
-        q_lv = te.round(
-            te.multiply(te.div(lhs_scale, output_scale), te.subtract(lvalue, lhs_zero_point))
-        ).astype("int32")
-        q_rv = te.round(
-            te.multiply(te.div(rhs_scale, output_scale), te.subtract(rvalue, rhs_zero_point))
-        ).astype("int32")
-        val = te.add(te.add(q_lv, q_rv), output_zero_point)
+        return saturate(te.add(tensor(*indices), output_zp), dtype).astype(dtype)
+
+    return te.compute(tensor.shape, _compute)
 
-        # clip + cast:
-        const_min = tvm.tir.min_value(dtype)
-        const_max = tvm.tir.max_value(dtype)
-        return te.max(tvm.te.min(val, const_max), const_min).astype(dtype)
 
-    return te.compute(lhs.shape, _compute)
+def qnn_add(lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp):
+    """Compute for qnn.add
+    TODO: support 'axis' argument.
+    """
+    return compute_qnn_binary_op(
+        lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp, topi.add
+    )
 
 
 def schedule_qnn_add(outs):
@@ -227,19 +254,99 @@ def schedule_qnn_add(outs):
     return default_schedule(outs)
 
 
-def requantize_tensor(tensor, i_scale, i_zp, o_scale, o_zp, out_dtype):
-    """Requantize tensor"""
+def qnn_subtract(lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp):
+    """Compute for qnn.subtract"""
 
-    def _compute(*indices):
-        value = tensor(*indices)
-        mul_value = te.round(
-            te.multiply(te.div(i_scale, o_scale), te.subtract(value, i_zp))
-        ).astype("int32")
-        rq_value = te.add(mul_value, o_zp)
+    return compute_qnn_binary_op(
+        lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp, topi.subtract
+    )
 
-        return clip_cast(rq_value, out_dtype)
 
-    return te.compute(tensor.shape, _compute)
+def schedule_qnn_subtract(outs):
+    """Schedule for qnn.subtract
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.add
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
+
+
+def qnn_mul(lhs, rhs, lhs_scale, lhs_zp, rhs_scale, rhs_zp, output_scale, output_zp):
+    """Compute for qnn.mul
+
+    mul = (lhs_input - lhs_zp) * (rhs_input - rhs_zp)
+    Q_output = requantize(mul, lhs_scale * rhs_scale, 0, output_scale, output_zp)
+    """
+    assert lhs.dtype == rhs.dtype
+    odtype = lhs.dtype
+
+    if is_constant(lhs):
+        lhs_tensor = lhs - lhs_zp
+    else:
+        lhs_tensor = te.compute(lhs.shape, lambda *i: te.subtract(lhs(*i), lhs_zp))
+
+    if is_constant(rhs):
+        rhs_tensor = rhs - rhs_zp
+    else:
+        rhs_tensor = te.compute(rhs.shape, lambda *i: te.subtract(rhs(*i), rhs_zp))
+
+    # Multiply with broadcasting.
+    mul = topi.multiply(lhs_tensor, rhs_tensor)
+
+    iscale = lhs_scale * rhs_scale
+    return qnn_requantize(mul, iscale, tvm.tir.const(0), output_scale, output_zp, out_dtype=odtype)
+
+
+def schedule_qnn_mul(outs):
+    """Schedule for qnn.mul
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.add
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
+
+
+def qnn_tanh(data, input_scale, input_zp, output_scale, output_zp):
+    """Compute for qnn.tanh
+
+    Q_output = quantize(tanh(dequantize(data)))
+    """
+    dq_tensor = qnn_dequantize(data, input_scale, input_zp)
+    tanh = te.compute(dq_tensor.shape, lambda *i: te.tanh(dq_tensor(*i)))
+    return qnn_quantize(tanh, output_scale, output_zp, out_dtype=data.dtype)
+
+
+def schedule_qnn_tanh(outs):
+    """Schedule for qnn.tanh
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of qnn.add
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return default_schedule(outs)
 
 
 def qnn_concatenate(data, axis, out_dtype):
@@ -282,7 +389,7 @@ def qnn_concatenate(data, axis, out_dtype):
         i_zp = data[i + args_num * 2]
 
         # Requantize tensors and add them to the list.
-        args.append(requantize_tensor(tensor, i_scale, i_zp, o_scale, o_zp, out_dtype))
+        args.append(qnn_requantize(tensor, i_scale, i_zp, o_scale, o_zp, out_dtype=out_dtype))
 
     # Call x86 implementation of concatenate.
     return concatenate(args, axis)
diff --git a/src/relay/qnn/op/add.cc b/src/relay/qnn/op/add.cc
index d087d9fa7796..0e0d3fdbc0dd 100644
--- a/src/relay/qnn/op/add.cc
+++ b/src/relay/qnn/op/add.cc
@@ -96,7 +96,8 @@ Expr QnnAddCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
 QNN_REGISTER_BINARY_OP("add")
     .describe("Elementwise add with broadcasting for quantized tensors.")
     .set_support_level(11)
-    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnAddCanonicalize);
+    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnAddCanonicalize)
+    .set_attr<TOpPattern>("TOpPattern", kBroadcast);
 
 }  // namespace qnn
 }  // namespace relay
diff --git a/src/relay/qnn/op/mul.cc b/src/relay/qnn/op/mul.cc
index 6dde61359df6..73c6eed44889 100644
--- a/src/relay/qnn/op/mul.cc
+++ b/src/relay/qnn/op/mul.cc
@@ -162,7 +162,8 @@ Expr QnnMulCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
 QNN_REGISTER_BINARY_OP("mul")
     .describe("Elementwise mul with broadcasting for quantized tensors.")
     .set_support_level(11)
-    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnMulCanonicalize);
+    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnMulCanonicalize)
+    .set_attr<TOpPattern>("TOpPattern", kBroadcast);
 
 }  // namespace qnn
 }  // namespace relay
diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc
index e199ea27f1e4..91df4a287ca7 100644
--- a/src/relay/qnn/op/requantize.cc
+++ b/src/relay/qnn/op/requantize.cc
@@ -384,6 +384,9 @@ Expr RequantizeLower(const Expr& input_tensor, const Expr& input_scale,
                      const Expr& input_zero_point, const Expr& output_scale,
                      const Expr& output_zero_point, const RequantizeAttrs* param,
                      const Array<IndexExpr>& input_shape, const DataType& out_dtype) {
+  // Check output scale validity.
+  ICHECK_NE(GetScalarFromConstant<float>(output_scale), 0.0)
+      << "QNN requantize output scale can not be equal to 0.0";
   // Check rounding validity.
   ICHECK(param->rounding == "UPWARD" || param->rounding == "TONEAREST")
       << "QNN requantize supports two rounding modes - UPWARD and "
diff --git a/src/relay/qnn/op/subtract.cc b/src/relay/qnn/op/subtract.cc
index 181501922086..962a3434cb72 100644
--- a/src/relay/qnn/op/subtract.cc
+++ b/src/relay/qnn/op/subtract.cc
@@ -97,7 +97,8 @@ Expr QnnSubtractCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
 QNN_REGISTER_BINARY_OP("subtract")
     .describe("Elementwise subtract with broadcasting for quantized tensors.")
     .set_support_level(11)
-    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnSubtractCanonicalize);
+    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnSubtractCanonicalize)
+    .set_attr<TOpPattern>("TOpPattern", kBroadcast);
 
 }  // namespace qnn
 }  // namespace relay
diff --git a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
index e4edf2919a00..06e738d9b70e 100644
--- a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
+++ b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
@@ -51,13 +51,33 @@ def test_no_qnn_pass():
     assert "qnn.dequantize" in opt_mod_2.astext(show_meta_data=False)
 
 
-def execute(executor, data_np, weight_np, bias_np=None):
-    executor.set_input("data", data_np)
-    executor.set_input("weight", weight_np)
-    if bias_np is not None:
-        executor.set_input("bias", bias_np)
-    executor.run()
-    return executor.get_output(0)
+def execute(mod_executor, inputs: dict):
+    for input_name, input_data in inputs.items():
+        mod_executor.set_input(input_name, input_data)
+    mod_executor.run()
+    return mod_executor.get_output(0).numpy()
+
+
+def build_hexagon_module(mod):
+    with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
+        hexagon_lowered = tvm.relay.build(
+            mod,
+            tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET),
+            executor=Executor("aot"),
+        )
+
+    return hexagon_lowered
+
+
+def build_ref_module(mod):
+    target_llvm = tvm.target.Target("llvm")
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            executor=Executor("aot"),
+        )
+    return llvm_lowered
 
 
 @tvm.testing.requires_hexagon
@@ -90,33 +110,24 @@ def test_qnn_conv2d_rq(hexagon_session: Session):
     )
     relay_mod = tvm.IRModule.from_expr(op5)
 
-    target_llvm = tvm.target.Target("llvm")
-    executor = Executor("aot")
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
-        hexagon_lowered = tvm.relay.build(
-            relay_mod,
-            tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET),
-            executor=executor,
-        )
+    # Compile for Hexagon
+    hexagon_lowered = build_hexagon_module(relay_mod)
 
-    with tvm.transform.PassContext(opt_level=3):
-        llvm_lowered = tvm.relay.build(
-            relay_mod,
-            tvm.target.Target(target_llvm, host=target_llvm),
-            executor=executor,
-        )
+    # Reference compilation
+    llvm_lowered = build_ref_module(relay_mod)
 
     data_np = np.random.rand(*data_shape) - 0.5
     weight_np = np.random.rand(*weight_shape) - 0.5
+    inputs = {"data": data_np, "weight": weight_np}
 
     hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    hexagon_output = execute(hx_m, data_np, weight_np)
+    hexagon_output = execute(hx_m, inputs)
 
     dev = tvm.cpu(0)
     llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev))
-    llvm_out = execute(llvm_m, data_np, weight_np)
+    llvm_out = execute(llvm_m, inputs)
 
-    np.testing.assert_equal(hexagon_output.numpy(), llvm_out.numpy())
+    np.testing.assert_equal(hexagon_output, llvm_out)
 
 
 @tvm.testing.requires_hexagon
@@ -152,34 +163,119 @@ def test_qnn_dense_bias_rq(hexagon_session: Session):
     )
     relay_mod = tvm.IRModule.from_expr(op5)
 
-    target_llvm = tvm.target.Target("llvm")
-    executor = Executor("aot")
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
-        hexagon_lowered = tvm.relay.build(
-            relay_mod,
-            tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET),
-            executor=executor,
-        )
+    # Compile for Hexagon
+    hexagon_lowered = build_hexagon_module(relay_mod)
 
-    with tvm.transform.PassContext(opt_level=3):
-        llvm_lowered = tvm.relay.build(
-            relay_mod,
-            tvm.target.Target(target_llvm, host=target_llvm),
-            executor=executor,
-        )
+    # Reference compilation
+    llvm_lowered = build_ref_module(relay_mod)
 
     data_np = np.random.rand(*data_shape) - 0.5
     weight_np = np.random.rand(*weight_shape) - 0.5
     bias_np = np.random.rand(*bias_shape)
+    inputs = {"data": data_np, "weight": weight_np, "bias": bias_np}
 
     hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    hexagon_output = execute(hx_m, data_np, weight_np, bias_np)
+    hexagon_output = execute(hx_m, inputs)
 
     dev = tvm.cpu(0)
     llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev))
-    llvm_out = execute(llvm_m, data_np, weight_np, bias_np)
+    llvm_out = execute(llvm_m, inputs)
+
+    np.testing.assert_equal(hexagon_output, llvm_out)
+
+
+class TestQnnBinaryOp:
+    """QNN binary op test class"""
+
+    operation = tvm.testing.parameter(
+        relay.qnn.op.add,
+        relay.qnn.op.subtract,
+        relay.qnn.op.mul,
+    )
+    dtype = tvm.testing.parameter("uint8", "int8")
+    input_shape = tvm.testing.parameter([256], [4, 256])
+
+    @tvm.testing.requires_hexagon
+    def test_qnn_binary_op_broadcasting(
+        self, hexagon_session: Session, operation, dtype, input_shape
+    ):
+        """qnn binary op test without QNN canonicalization."""
+        lhs_shape = [4, 256]
+        rhs_shape = input_shape
+        lhs = relay.var("lhs", shape=lhs_shape, dtype=dtype)
+        rhs = relay.var("rhs", shape=rhs_shape, dtype=dtype)
+        zp_const1 = 1
+        zp_const2 = 3
+
+        op = operation(
+            lhs,
+            rhs,
+            lhs_scale=relay.const(0.041, "float32"),
+            lhs_zero_point=relay.const(zp_const1, "int32"),
+            rhs_scale=relay.const(0.017, "float32"),
+            rhs_zero_point=relay.const(zp_const2, "int32"),
+            output_scale=relay.const(0.039, "float32"),
+            output_zero_point=relay.const(2, "int32"),
+        )
+        mod = tvm.IRModule.from_expr(op)
+
+        # Compile for Hexagon
+        hexagon_lowered = build_hexagon_module(mod)
+
+        # Reference compilation
+        llvm_lowered = build_ref_module(mod)
+
+        lhs_np = np.random.randint(np.iinfo(dtype).min + zp_const1, np.iinfo(dtype).max, lhs_shape)
+        rhs_np = np.random.randint(np.iinfo(dtype).min + zp_const2, np.iinfo(dtype).max, rhs_shape)
+        inputs = {"lhs": lhs_np, "rhs": rhs_np}
+
+        hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered)
+        hexagon_output = execute(hx_m, inputs)
+
+        dev = tvm.cpu(0)
+        llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev))
+        llvm_output = execute(llvm_m, inputs)
+
+        # Diff by 1 is Ok.
+        tvm.testing.assert_allclose(hexagon_output, llvm_output, atol=1)
+
+    @tvm.testing.requires_hexagon
+    def test_qnn_binary_op_scalar(self, hexagon_session: Session, operation):
+        """qnn binary op test without QNN canonicalization."""
+        lhs_shape = [4, 256]
+        lhs = relay.var("lhs", shape=lhs_shape, dtype="uint8")
+        rhs = relay.const(11, dtype="uint8")
+
+        op = operation(
+            lhs,
+            rhs,
+            lhs_scale=relay.const(0.049, "float32"),
+            lhs_zero_point=relay.const(1, "int32"),
+            rhs_scale=relay.const(0.067, "float32"),
+            rhs_zero_point=relay.const(3, "int32"),
+            output_scale=relay.const(0.041, "float32"),
+            output_zero_point=relay.const(2, "int32"),
+        )
+        mod = tvm.IRModule.from_expr(op)
+
+        # Compile for Hexagon
+        hexagon_lowered = build_hexagon_module(mod)
+
+        # Reference compilation
+        llvm_lowered = build_ref_module(mod)
+
+        lhs_np = np.random.randint(1, 255, size=lhs_shape)
+        inputs = {"lhs": lhs_np}
+
+        hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered)
+        hexagon_output = execute(hx_m, inputs)
+
+        dev = tvm.cpu(0)
+        llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev))
+        llvm_output = execute(llvm_m, inputs)
 
-    np.testing.assert_equal(hexagon_output.numpy(), llvm_out.numpy())
+        # Diff by 1 is Ok.
+        tvm.testing.assert_allclose(hexagon_output, llvm_output, atol=1)
 
 
 if __name__ == "__main__":

From 37a885553c83ef5c0fe5165f5547c58b696d9763 Mon Sep 17 00:00:00 2001
From: Alexey Yazev <113356454+Alexey-Yazev@users.noreply.github.com>
Date: Fri, 18 Nov 2022 08:55:27 -0800
Subject: [PATCH 610/704] [microNPU] Fix Cascader code generation without
 StorageRewrite (#13365)

There were extra memory allocations for buffers when parts of the buffer for the result were replaced with a buffer for the entire result (in ReplaceOperators pass)
summing up we received a larger size in the number of parts
---
 python/tvm/relay/backend/contrib/ethosu/tir/compiler.py   | 6 +-----
 python/tvm/relay/backend/contrib/ethosu/tir/passes.py     | 5 +++++
 .../contrib/test_ethosu/cascader/test_memory_reduction.py | 8 ++++----
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
index 4133aff6ef51..2cf45170e4e3 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
@@ -94,12 +94,8 @@ def lower_ethosu(sch, args, const_dict, name="main"):
         mod, const_dict = ethosu_passes.MergeConstants(const_dict)(mod)
         mod = ethosu_passes.CopyComputeReordering()(mod)
 
-        # When striping is enabled and if storage_rewrite is not run
-        # the striping results in incorrect code generation. This needs
-        # further investigation. Until such a time that is fixed, disable_storage_rewrite
-        # user directive will be overridden if striping is enabled.
         disable_storage_rewrite = curr_cfg.get("tir.disable_storage_rewrite", False)
-        if not disable_storage_rewrite or util.is_striping_enabled():
+        if not disable_storage_rewrite:
             mod = tvm.tir.transform.StorageRewrite()(mod)
 
         mod = tvm.tir.transform.RemoveNoOp()(mod)
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
index e15d126dd969..f313ff720500 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
@@ -72,6 +72,7 @@ def ReplaceOperators():
     producers_consumers = ProducersConsumers()
     replace_output_pointer = {}
     pointer_to_extents = {}
+    replaced_pointers = []
 
     ReplaceInfo = namedtuple("ReplaceInfo", ["pointer", "reallocate"])
 
@@ -136,9 +137,13 @@ def _replace_operator(stmt):
                     stmt, producers_consumers
                 )
                 if replace_pointer is not None:
+                    # Allocate pointer only once
+                    if replace_pointer in replaced_pointers:
+                        is_allocator = False
                     replace_output_pointer[output_pointer] = ReplaceInfo(
                         replace_pointer, is_allocator
                     )
+                    replaced_pointers.append(replace_pointer)
                 # Make the extern call
                 irb = tvm.tir.ir_builder.create()
                 irb.emit(tvm.tir.call_extern("handle", op_name, *info))
diff --git a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
index e88282240510..99238fa59337 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
@@ -171,10 +171,10 @@ def tf_graph(x):
 @pytest.mark.parametrize(
     "accel_type, expected_ws_size_without_striping, expected_ws_size_with_striping",
     [
-        ("ethos-u55-256", 180288, 15312),
-        ("ethos-u55-128", 180288, 15312),
-        ("ethos-u55-64", 180288, 14544),
-        ("ethos-u55-32", 180272, 14544),
+        ("ethos-u55-256", 180288, 15200),
+        ("ethos-u55-128", 180288, 15200),
+        ("ethos-u55-64", 180288, 14432),
+        ("ethos-u55-32", 180272, 14416),
     ],
 )
 def test_depthwise2d_conv2d_pooling(

From 490e0e3120f304a98607770502b5700ec6ab9d55 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 18 Nov 2022 10:02:55 -0800
Subject: [PATCH 611/704] [ci] Split out C++ unittests (#13335)

* [ci] Split out C++ unittests

This makes C++ unittests follow the normal flow of build -> upload
artifacts -> download and run tests. To simplify the changes there is a
new utility for interacting with S3.

* Comments

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                                   | 1498 ++++-------------
 ci/jenkins/Build.groovy.j2                    |  131 +-
 ci/jenkins/Deploy.groovy.j2                   |    2 +-
 ci/jenkins/Jenkinsfile.j2                     |    7 +-
 ci/jenkins/Test.groovy.j2                     |   70 +-
 ci/jenkins/macros.j2                          |   39 +-
 ci/scripts/jenkins/cmd_utils.py               |   11 +
 ci/scripts/jenkins/s3.py                      |  140 ++
 ...{task_ci_setup.sh => task_clear_pytest.sh} |   17 -
 tests/scripts/task_cpp_unittest.sh            |   30 +-
 tests/scripts/task_microtvm_cpp_tests.sh      |   42 +
 11 files changed, 636 insertions(+), 1351 deletions(-)
 create mode 100755 ci/scripts/jenkins/s3.py
 rename tests/scripts/{task_ci_setup.sh => task_clear_pytest.sh} (58%)
 create mode 100755 tests/scripts/task_microtvm_cpp_tests.sh

diff --git a/Jenkinsfile b/Jenkinsfile
index 9fd926430b6a..dfa9a7eda284 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-11-14T12:32:18.663464
+// Generated at 2022-11-17T23:53:21.059864
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -104,7 +104,8 @@ max_time = 180
 rebuild_docker_images = false
 
 // Filenames for stashing between build and test steps
-s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
 
 // Jenkins script root directory
 jenkins_scripts_root = "ci/scripts/jenkins"
@@ -672,8 +673,8 @@ def lint() {
 }
 def ci_setup(image) {
   sh (
-    script: "${docker_run} ${image} ./tests/scripts/task_ci_setup.sh",
-    label: 'Set up CI environment',
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
   )
 }
 
@@ -691,57 +692,43 @@ def fsim_test(image) {
   )
 }
 
-def cmake_build(image, path, make_flag) {
+def make_standalone_crt(image, build_dir) {
   sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
-    label: 'Run cmake build',
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
   )
 }
 
-def cpp_unittest(image) {
+def make_cpp_tests(image, build_dir) {
   sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
-    label: 'Build and run C++ tests',
-  )
-}
-
-def add_microtvm_permissions() {
-  sh(
-    script: 'find build/microtvm_template_projects -type f | grep qemu-hack | xargs chmod +x',
-    label: 'Add execute permissions for microTVM files',
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
   )
 }
 
-def add_hexagon_permissions() {
-  sh(
-    script: 'find build/hexagon_api_output -type f | xargs chmod +x',
-    label: 'Add execute permissions for hexagon files',
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
   )
 }
 
-// Run make. First try to do an incremental make from a previous workspace in hope to
-// accelerate the compilation. If something is wrong, clean the workspace and then
-// build from scratch.
-def make(docker_type, path, make_flag) {
-  timeout(time: max_time, unit: 'MINUTES') {
-    try {
-      cmake_build(docker_type, path, make_flag)
-    } catch (hudson.AbortException ae) {
-      // script exited due to user abort, directly throw instead of retry
-      if (ae.getMessage().contains('script returned exit code 143')) {
-        throw ae
-      }
-      echo 'Incremental compilation failed. Fall back to build from scratch'
-      sh (
-        script: "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}",
-        label: 'Clear old cmake workspace',
-      )
-      cmake_build(docker_type, path, make_flag)
-    }
-  }
-}
-
-
 def build() {
 stage('Build') {
   environment {
@@ -757,41 +744,21 @@ stage('Build') {
           docker_init(ci_gpu)
           timeout(time: max_time, unit: 'MINUTES') {
             sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
-          make("${ci_gpu} --no-gpu", 'build', '-j2')
+          cmake_build("${ci_gpu} --no-gpu", 'build', '-j2')
+          make_standalone_crt("${ci_gpu} --no-gpu", 'build')
           sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu/build/libtvm.so
-              md5sum build/libvta_fsim.so
-              retry 3 aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/gpu/build/libvta_fsim.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/gpu/build/libtvm_runtime.so
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/gpu/build/config.cmake
-              retry 3 aws s3 cp --no-progress build/microtvm_template_projects s3://${s3_prefix}/gpu/build/microtvm_template_projects --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu --items build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/microtvm_template_projects build/crttest build/standalone_crt build/build.ninja",
             label: 'Upload artifacts to S3',
           )
 
 
           // compiler test
-          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
-          make("${ci_gpu} --no-gpu", 'build2', '-j2')
+          sh "rm -rf build"
+          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
+          cmake_build("${ci_gpu} --no-gpu", 'build', '-j2')
+          make_standalone_crt("${ci_gpu} --no-gpu", 'build')
           sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu2/build/libtvm.so
-              md5sum build/libvta_fsim.so
-              retry 3 aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/gpu2/build/libvta_fsim.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/gpu2/build/libtvm_runtime.so
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/gpu2/build/config.cmake
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu2 --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/crttest build/standalone_crt build/build.ninja",
             label: 'Upload artifacts to S3',
           )
           }
@@ -813,22 +780,11 @@ stage('Build') {
             script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
             label: 'Create CPU cmake config',
           )
-          make(ci_cpu, 'build', '-j2')
+          cmake_build(ci_cpu, 'build', '-j2')
+          make_standalone_crt(ci_cpu, 'build')
+          make_cpp_tests(ci_cpu, 'build')
           sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              md5sum build/libvta_tsim.so
-              retry 3 aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/cpu/build/libvta_tsim.so
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cpu/build/libtvm.so
-              md5sum build/libvta_fsim.so
-              retry 3 aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/cpu/build/libvta_fsim.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/cpu/build/libtvm_runtime.so
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/cpu/build/config.cmake
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu --items build/libvta_tsim.so build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/standalone_crt build/build.ninja",
             label: 'Upload artifacts to S3',
           )
 
@@ -855,18 +811,10 @@ stage('Build') {
             script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
             label: 'Create CPU minimal cmake config',
           )
-          make(ci_minimal, 'build', '-j2')
+          cmake_build(ci_minimal, 'build', '-j2')
+          make_cpp_tests(ci_minimal, 'build')
           sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cpu-minimal/build/libtvm.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/cpu-minimal/build/libtvm_runtime.so
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/cpu-minimal/build/config.cmake
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu-minimal --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/cpptest build/build.ninja build/CMakeFiles/rules.ninja",
             label: 'Upload artifacts to S3',
           )
           }
@@ -888,7 +836,9 @@ stage('Build') {
             script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
             label: 'Create WASM cmake config',
           )
-          make(ci_wasm, 'build', '-j2')
+          cmake_build(ci_wasm, 'build', '-j2')
+          make_standalone_crt(ci_wasm, 'build')
+          make_cpp_tests(ci_wasm, 'build')
           cpp_unittest(ci_wasm)
           ci_setup(ci_wasm)
           sh (
@@ -914,22 +864,11 @@ stage('Build') {
             script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
             label: 'Create i386 cmake config',
           )
-          make(ci_i386, 'build', '-j2')
+          cmake_build(ci_i386, 'build', '-j2')
+          make_standalone_crt(ci_i386, 'build')
+          make_cpp_tests(ci_i386, 'build')
           sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              md5sum build/libvta_tsim.so
-              retry 3 aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/i386/build/libvta_tsim.so
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/i386/build/libtvm.so
-              md5sum build/libvta_fsim.so
-              retry 3 aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/i386/build/libvta_fsim.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/i386/build/libtvm_runtime.so
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/i386/build/config.cmake
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/i386 --items build/libvta_tsim.so build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/standalone_crt build/build.ninja build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja",
             label: 'Upload artifacts to S3',
           )
           }
@@ -951,20 +890,11 @@ stage('Build') {
             script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
             label: 'Create ARM cmake config',
           )
-          make(ci_arm, 'build', '-j4')
+          cmake_build(ci_arm, 'build', '-j4')
+          make_standalone_crt(ci_arm, 'build')
+          make_cpp_tests(ci_arm, 'build')
           sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/arm/build/libtvm.so
-              md5sum build/libvta_fsim.so
-              retry 3 aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/arm/build/libvta_fsim.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/arm/build/libtvm_runtime.so
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/arm/build/config.cmake
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/arm --items build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/crttest build/standalone_crt build/build.ninja",
             label: 'Upload artifacts to S3',
           )
           }
@@ -986,19 +916,11 @@ stage('Build') {
             script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
             label: 'Create Cortex-M cmake config',
           )
-          make(ci_cortexm, 'build', '-j2')
+          cmake_build(ci_cortexm, 'build', '-j2')
+          make_standalone_crt(ci_cortexm, 'build')
+          make_cpp_tests(ci_cortexm, 'build')
           sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cortexm/build/libtvm.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/cortexm/build/libtvm_runtime.so
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/cortexm/build/config.cmake
-              retry 3 aws s3 cp --no-progress build/microtvm_template_projects s3://${s3_prefix}/cortexm/build/microtvm_template_projects --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/crttest build/standalone_crt build/build.ninja build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/microtvm_template_projects",
             label: 'Upload artifacts to S3',
           )
           }
@@ -1020,23 +942,14 @@ stage('Build') {
             script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
             label: 'Create Hexagon cmake config',
           )
-          make(ci_hexagon, 'build', '-j2')
+          cmake_build(ci_hexagon, 'build', '-j2')
+          make_cpp_tests(ci_hexagon, 'build')
           sh (
             script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
             label: 'Build Hexagon API',
           )
           sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/hexagon/build/libtvm.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/hexagon/build/libtvm_runtime.so
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/hexagon/build/config.cmake
-              retry 3 aws s3 cp --no-progress build/hexagon_api_output s3://${s3_prefix}/hexagon/build/hexagon_api_output --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/hexagon_api_output",
             label: 'Upload artifacts to S3',
           )
           }
@@ -1058,19 +971,11 @@ stage('Build') {
             script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
             label: 'Create RISC-V cmake config',
           )
-          make(ci_riscv, 'build', '-j2')
+          cmake_build(ci_riscv, 'build', '-j2')
+          make_standalone_crt(ci_riscv, 'build')
+          make_cpp_tests(ci_riscv, 'build')
           sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/riscv/build/libtvm.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/riscv/build/libtvm_runtime.so
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/riscv/build/config.cmake
-              retry 3 aws s3 cp --no-progress build/microtvm_template_projects s3://${s3_prefix}/riscv/build/microtvm_template_projects --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/riscv --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/standalone_crt build/build.ninja build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/microtvm_template_projects",
             label: 'Upload artifacts to S3',
           )
           }
@@ -1085,6 +990,20 @@ stage('Build') {
 }
 }
 
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
 // We have to do this whacky split of the code from where it's used since the
 // JVM limits method length to 64k and we easily exceed that with all this
 // autogenerated code. This makes it so each test step is in its own method so
@@ -1105,41 +1024,28 @@ def shard_run_unittest_GPU_1_of_3() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu2",
+                  label: 'Download artifacts from S3',
+                )
 
-              cpp_unittest(ci_gpu)
+              sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
+              // These require a GPU to finish the build (i.e. CUDA needs to be load-able)
+              make_standalone_crt(ci_gpu, 'build')
+              // make_cpp_tests(ci_gpu, 'build')
+              // cpp_unittest(ci_gpu)
 
+              sh "rm -rf build"
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
+              sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
+              make_standalone_crt(ci_gpu, 'build')
+              make_cpp_tests(ci_gpu, 'build')
               cpp_unittest(ci_gpu)
+              micro_cpp_unittest(ci_gpu)
               sh (
                 script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh",
                 label: 'Run Python GPU unit tests',
@@ -1153,11 +1059,7 @@ def shard_run_unittest_GPU_1_of_3() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1188,20 +1090,9 @@ def shard_run_unittest_GPU_2_of_3() {
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -1221,11 +1112,7 @@ def shard_run_unittest_GPU_2_of_3() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1256,20 +1143,9 @@ def shard_run_unittest_GPU_3_of_3() {
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -1285,11 +1161,7 @@ def shard_run_unittest_GPU_3_of_3() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1321,22 +1193,9 @@ def shard_run_integration_CPU_1_of_4() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_cpu)
               sh (
@@ -1348,11 +1207,7 @@ def shard_run_integration_CPU_1_of_4() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1383,22 +1238,9 @@ def shard_run_integration_CPU_2_of_4() {
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_cpu)
               sh (
@@ -1410,11 +1252,7 @@ def shard_run_integration_CPU_2_of_4() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1445,22 +1283,9 @@ def shard_run_integration_CPU_3_of_4() {
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_cpu)
               sh (
@@ -1472,11 +1297,7 @@ def shard_run_integration_CPU_3_of_4() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1507,22 +1328,9 @@ def shard_run_integration_CPU_4_of_4() {
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_cpu)
               sh (
@@ -1534,11 +1342,7 @@ def shard_run_integration_CPU_4_of_4() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1570,23 +1374,13 @@ def shard_run_python_i386_1_of_3() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/i386",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_i386)
               cpp_unittest(ci_i386)
+              micro_cpp_unittest(ci_i386)
               python_unittest(ci_i386)
               sh (
                 script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
@@ -1597,11 +1391,7 @@ def shard_run_python_i386_1_of_3() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/python_i386 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1632,20 +1422,9 @@ def shard_run_python_i386_2_of_3() {
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/i386",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_i386)
               python_unittest(ci_i386)
@@ -1659,11 +1438,7 @@ def shard_run_python_i386_2_of_3() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/python_i386 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1694,20 +1469,9 @@ def shard_run_python_i386_3_of_3() {
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/i386",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_i386)
               python_unittest(ci_i386)
@@ -1720,11 +1484,7 @@ def shard_run_python_i386_3_of_3() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/python_i386 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1756,21 +1516,10 @@ def shard_run_test_Hexagon_1_of_8() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_hexagon_permissions()
               ci_setup(ci_hexagon)
               cpp_unittest(ci_hexagon)
               sh (
@@ -1782,11 +1531,7 @@ def shard_run_test_Hexagon_1_of_8() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1817,21 +1562,10 @@ def shard_run_test_Hexagon_2_of_8() {
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_hexagon_permissions()
               ci_setup(ci_hexagon)
               sh (
                 script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
@@ -1842,11 +1576,7 @@ def shard_run_test_Hexagon_2_of_8() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1877,21 +1607,10 @@ def shard_run_test_Hexagon_3_of_8() {
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_hexagon_permissions()
               ci_setup(ci_hexagon)
               sh (
                 script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
@@ -1902,11 +1621,7 @@ def shard_run_test_Hexagon_3_of_8() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1937,21 +1652,10 @@ def shard_run_test_Hexagon_4_of_8() {
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_hexagon_permissions()
               ci_setup(ci_hexagon)
               sh (
                 script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
@@ -1962,11 +1666,7 @@ def shard_run_test_Hexagon_4_of_8() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -1997,21 +1697,10 @@ def shard_run_test_Hexagon_5_of_8() {
               'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_hexagon_permissions()
               ci_setup(ci_hexagon)
               sh (
                 script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
@@ -2022,11 +1711,7 @@ def shard_run_test_Hexagon_5_of_8() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2057,21 +1742,10 @@ def shard_run_test_Hexagon_6_of_8() {
               'TVM_SHARD_INDEX=5',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_hexagon_permissions()
               ci_setup(ci_hexagon)
               sh (
                 script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
@@ -2082,11 +1756,7 @@ def shard_run_test_Hexagon_6_of_8() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2117,21 +1787,10 @@ def shard_run_test_Hexagon_7_of_8() {
               'TVM_SHARD_INDEX=6',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_hexagon_permissions()
               ci_setup(ci_hexagon)
               sh (
                 script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
@@ -2142,11 +1801,7 @@ def shard_run_test_Hexagon_7_of_8() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2177,21 +1832,10 @@ def shard_run_test_Hexagon_8_of_8() {
               'TVM_SHARD_INDEX=7',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_hexagon_permissions()
               ci_setup(ci_hexagon)
               sh (
                 script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
@@ -2202,11 +1846,7 @@ def shard_run_test_Hexagon_8_of_8() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2238,20 +1878,9 @@ def shard_run_integration_aarch64_1_of_4() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_arm)
               python_unittest(ci_arm)
@@ -2264,11 +1893,7 @@ def shard_run_integration_aarch64_1_of_4() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2299,20 +1924,9 @@ def shard_run_integration_aarch64_2_of_4() {
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_arm)
               python_unittest(ci_arm)
@@ -2325,11 +1939,7 @@ def shard_run_integration_aarch64_2_of_4() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2360,20 +1970,9 @@ def shard_run_integration_aarch64_3_of_4() {
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_arm)
               python_unittest(ci_arm)
@@ -2386,11 +1985,7 @@ def shard_run_integration_aarch64_3_of_4() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2421,20 +2016,9 @@ def shard_run_integration_aarch64_4_of_4() {
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_arm)
               python_unittest(ci_arm)
@@ -2447,11 +2031,7 @@ def shard_run_integration_aarch64_4_of_4() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2483,20 +2063,9 @@ def shard_run_topi_GPU_1_of_3() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -2508,11 +2077,7 @@ def shard_run_topi_GPU_1_of_3() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2543,20 +2108,9 @@ def shard_run_topi_GPU_2_of_3() {
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -2568,11 +2122,7 @@ def shard_run_topi_GPU_2_of_3() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2603,20 +2153,9 @@ def shard_run_topi_GPU_3_of_3() {
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -2628,11 +2167,7 @@ def shard_run_topi_GPU_3_of_3() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2664,20 +2199,9 @@ def shard_run_frontend_GPU_1_of_6() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -2689,11 +2213,7 @@ def shard_run_frontend_GPU_1_of_6() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2724,20 +2244,9 @@ def shard_run_frontend_GPU_2_of_6() {
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -2749,11 +2258,7 @@ def shard_run_frontend_GPU_2_of_6() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2784,20 +2289,9 @@ def shard_run_frontend_GPU_3_of_6() {
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -2809,11 +2303,7 @@ def shard_run_frontend_GPU_3_of_6() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2844,20 +2334,9 @@ def shard_run_frontend_GPU_4_of_6() {
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -2869,11 +2348,7 @@ def shard_run_frontend_GPU_4_of_6() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2904,20 +2379,9 @@ def shard_run_frontend_GPU_5_of_6() {
               'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -2929,11 +2393,7 @@ def shard_run_frontend_GPU_5_of_6() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -2964,20 +2424,9 @@ def shard_run_frontend_GPU_6_of_6() {
               'TVM_SHARD_INDEX=5',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_gpu)
               sh (
@@ -2989,11 +2438,7 @@ def shard_run_frontend_GPU_6_of_6() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3025,23 +2470,13 @@ def shard_run_topi_aarch64_1_of_2() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_arm)
               cpp_unittest(ci_arm)
+              micro_cpp_unittest(ci_arm)
               sh (
                 script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
                 label: 'Run test_arm_compute_lib test',
@@ -3055,11 +2490,7 @@ def shard_run_topi_aarch64_1_of_2() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_aarch64 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3090,20 +2521,9 @@ def shard_run_topi_aarch64_2_of_2() {
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_arm)
               sh (
@@ -3119,11 +2539,7 @@ def shard_run_topi_aarch64_2_of_2() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_aarch64 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3155,20 +2571,9 @@ def shard_run_frontend_aarch64_1_of_2() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_arm)
               sh (
@@ -3180,11 +2585,7 @@ def shard_run_frontend_aarch64_1_of_2() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_aarch64 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3215,20 +2616,9 @@ def shard_run_frontend_aarch64_2_of_2() {
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
 
               ci_setup(ci_arm)
               sh (
@@ -3240,11 +2630,7 @@ def shard_run_frontend_aarch64_2_of_2() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_aarch64 --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3276,23 +2662,13 @@ def shard_run_test_Cortex_M_1_of_12() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               cpp_unittest(ci_cortexm)
+              micro_cpp_unittest(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_demo_microtvm.sh",
                 label: 'Run microTVM demos',
@@ -3306,11 +2682,7 @@ def shard_run_test_Cortex_M_1_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3341,21 +2713,10 @@ def shard_run_test_Cortex_M_2_of_12() {
               'TVM_SHARD_INDEX=1',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3366,11 +2727,7 @@ def shard_run_test_Cortex_M_2_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3401,21 +2758,10 @@ def shard_run_test_Cortex_M_3_of_12() {
               'TVM_SHARD_INDEX=2',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3426,11 +2772,7 @@ def shard_run_test_Cortex_M_3_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3461,21 +2803,10 @@ def shard_run_test_Cortex_M_4_of_12() {
               'TVM_SHARD_INDEX=3',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3486,11 +2817,7 @@ def shard_run_test_Cortex_M_4_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3521,21 +2848,10 @@ def shard_run_test_Cortex_M_5_of_12() {
               'TVM_SHARD_INDEX=4',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3546,11 +2862,7 @@ def shard_run_test_Cortex_M_5_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3581,21 +2893,10 @@ def shard_run_test_Cortex_M_6_of_12() {
               'TVM_SHARD_INDEX=5',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3606,11 +2907,7 @@ def shard_run_test_Cortex_M_6_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3641,21 +2938,10 @@ def shard_run_test_Cortex_M_7_of_12() {
               'TVM_SHARD_INDEX=6',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3666,11 +2952,7 @@ def shard_run_test_Cortex_M_7_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3701,21 +2983,10 @@ def shard_run_test_Cortex_M_8_of_12() {
               'TVM_SHARD_INDEX=7',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3726,11 +2997,7 @@ def shard_run_test_Cortex_M_8_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3761,21 +3028,10 @@ def shard_run_test_Cortex_M_9_of_12() {
               'TVM_SHARD_INDEX=8',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3786,11 +3042,7 @@ def shard_run_test_Cortex_M_9_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3821,21 +3073,10 @@ def shard_run_test_Cortex_M_10_of_12() {
               'TVM_SHARD_INDEX=9',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3846,11 +3087,7 @@ def shard_run_test_Cortex_M_10_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3881,21 +3118,10 @@ def shard_run_test_Cortex_M_11_of_12() {
               'TVM_SHARD_INDEX=10',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3906,11 +3132,7 @@ def shard_run_test_Cortex_M_11_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -3941,21 +3163,10 @@ def shard_run_test_Cortex_M_12_of_12() {
               'TVM_SHARD_INDEX=11',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
@@ -3966,11 +3177,7 @@ def shard_run_test_Cortex_M_12_of_12() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -4002,23 +3209,13 @@ def shard_run_test_RISC_V_1_of_1() {
               'TVM_SHARD_INDEX=0',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
               sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/riscv/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/riscv/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/riscv/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/riscv/build/microtvm_template_projects build/microtvm_template_projects --recursive
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/riscv",
+                  label: 'Download artifacts from S3',
+                )
 
-              add_microtvm_permissions()
               ci_setup(ci_riscv)
               cpp_unittest(ci_cortexm)
+              micro_cpp_unittest(ci_cortexm)
               sh (
                 script: "${docker_run} ${ci_riscv} ./tests/scripts/task_riscv_microtvm.sh",
                 label: 'Run microTVM tests',
@@ -4028,11 +3225,7 @@ def shard_run_test_RISC_V_1_of_1() {
         } finally {
           try {
             sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_RISC_V --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_RISC_V --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -4059,18 +3252,9 @@ def run_unittest_minimal() {
             docker_init(ci_minimal)
             withEnv(['PLATFORM=minimal'], {
               sh(
-                    script: """
-                      set -eux
-                      . ${jenkins_scripts_root}/retry.sh
-                      retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu-minimal/build/libtvm.so build/libtvm.so
-                      md5sum build/libtvm.so
-                      retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu-minimal/build/libtvm_runtime.so build/libtvm_runtime.so
-                      md5sum build/libtvm_runtime.so
-                      retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu-minimal/build/config.cmake build/config.cmake
-                      md5sum build/config.cmake
-                    """,
-                    label: 'Download artifacts from S3',
-                  )
+              script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu-minimal",
+              label: 'Download artifacts from S3',
+            )
 
               cpp_unittest(ci_minimal)
               python_unittest(ci_minimal)
@@ -4078,11 +3262,7 @@ def run_unittest_minimal() {
           } finally {
             try {
               sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_CPU_MINIMAL --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_CPU_MINIMAL --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -4264,25 +3444,13 @@ stage('Test') {
               'TEST_STEP_NAME=unittest: CPU',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
                 sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
-                          md5sum build/libvta_tsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
 
                 ci_setup(ci_cpu)
                 cpp_unittest(ci_cpu)
+                micro_cpp_unittest(ci_cpu)
                 python_unittest(ci_cpu)
                 fsim_test(ci_cpu)
                 sh (
@@ -4293,11 +3461,7 @@ stage('Test') {
             } finally {
             try {
               sh(
-                script: """
-                  set -eux
-                  . ci/scripts/retry.sh
-                  retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/unittest_CPU --recursive
-                """,
+                script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_CPU --items build/pytest-results",
                 label: 'Upload JUnits to S3',
               )
 
@@ -4325,20 +3489,9 @@ stage('Test') {
               'TEST_STEP_NAME=frontend: CPU',
               "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
                 sh(
-                        script: """
-                          set -eux
-                          . ${jenkins_scripts_root}/retry.sh
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
-                          md5sum build/libtvm.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
-                          md5sum build/libvta_fsim.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
-                          md5sum build/libtvm_runtime.so
-                          retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
-                          md5sum build/config.cmake
-                        """,
-                        label: 'Download artifacts from S3',
-                      )
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
 
                 ci_setup(ci_cpu)
                 sh (
@@ -4349,11 +3502,7 @@ stage('Test') {
             } finally {
             try {
               sh(
-                script: """
-                  set -eux
-                  . ci/scripts/retry.sh
-                  retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_CPU --recursive
-                """,
+                script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_CPU --items build/pytest-results",
                 label: 'Upload JUnits to S3',
               )
 
@@ -4376,23 +3525,10 @@ stage('Test') {
           init_git()
           docker_init(ci_gpu)
           sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
-              md5sum build/libtvm.so
-              retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
-              md5sum build/libvta_fsim.so
-              retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
-              md5sum build/libtvm_runtime.so
-              retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
-              md5sum build/config.cmake
-              retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/microtvm_template_projects build/microtvm_template_projects --recursive
-            """,
-            label: 'Download artifacts from S3',
-          )
+      script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+      label: 'Download artifacts from S3',
+    )
 
-          add_microtvm_permissions()
           timeout(time: 180, unit: 'MINUTES') {
             ci_setup(ci_gpu)
             sh (
@@ -4401,17 +3537,12 @@ stage('Test') {
             )
           }
           sh(
-      script: """
-        set -eux
-        . ${jenkins_scripts_root}/retry.sh
-        md5sum docs.tgz
-        retry 3 aws s3 cp --no-progress docs.tgz s3://${s3_prefix}/docs/docs.tgz
-      """,
+      script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/docs --items docs.tgz",
       label: 'Upload artifacts to S3',
     )
 
           sh(
-            script: "aws s3 cp --no-progress _docs s3://${s3_prefix}/docs --recursive",
+            script: "aws s3 cp --no-progress _docs s3://${s3_bucket}/${s3_prefix}/docs --recursive",
             label: 'Upload docs to S3',
           )
         }
@@ -4515,14 +3646,9 @@ def deploy() {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
                     sh(
-                      script: """
-                        set -eux
-                        . ${jenkins_scripts_root}/retry.sh
-                        retry 3 aws s3 cp --no-progress s3://${s3_prefix}/docs/docs.tgz docs.tgz
-                        md5sum docs.tgz
-                      """,
-                      label: 'Download artifacts from S3',
-                    )
+                script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/docs",
+                label: 'Download artifacts from S3',
+              )
 
                     deploy_docs()
           }
diff --git a/ci/jenkins/Build.groovy.j2 b/ci/jenkins/Build.groovy.j2
index 49cffacdc16e..914e3e99b659 100644
--- a/ci/jenkins/Build.groovy.j2
+++ b/ci/jenkins/Build.groovy.j2
@@ -1,7 +1,7 @@
 def ci_setup(image) {
   sh (
-    script: "${docker_run} ${image} ./tests/scripts/task_ci_setup.sh",
-    label: 'Set up CI environment',
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
   )
 }
 
@@ -19,61 +19,43 @@ def fsim_test(image) {
   )
 }
 
-def cmake_build(image, path, make_flag) {
+def make_standalone_crt(image, build_dir) {
   sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
-    label: 'Run cmake build',
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
   )
 }
 
-def cpp_unittest(image) {
+def make_cpp_tests(image, build_dir) {
   sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
-    label: 'Build and run C++ tests',
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
   )
 }
 
-def add_microtvm_permissions() {
-  {% for folder in microtvm_template_projects %}
-  sh(
-    script: 'find {{ folder }} -type f | grep qemu-hack | xargs chmod +x',
-    label: 'Add execute permissions for microTVM files',
-  )
-  {% endfor %}
-}
-
-def add_hexagon_permissions() {
-  {% for folder in hexagon_api %}
-  sh(
-    script: 'find {{ folder }} -type f | xargs chmod +x',
-    label: 'Add execute permissions for hexagon files',
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
   )
-  {% endfor %}
 }
 
-// Run make. First try to do an incremental make from a previous workspace in hope to
-// accelerate the compilation. If something is wrong, clean the workspace and then
-// build from scratch.
-def make(docker_type, path, make_flag) {
-  timeout(time: max_time, unit: 'MINUTES') {
-    try {
-      cmake_build(docker_type, path, make_flag)
-    } catch (hudson.AbortException ae) {
-      // script exited due to user abort, directly throw instead of retry
-      if (ae.getMessage().contains('script returned exit code 143')) {
-        throw ae
-      }
-      echo 'Incremental compilation failed. Fall back to build from scratch'
-      sh (
-        script: "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}",
-        label: 'Clear old cmake workspace',
-      )
-      cmake_build(docker_type, path, make_flag)
-    }
-  }
-}
-
-
 def build() {
 stage('Build') {
   environment {
@@ -89,13 +71,16 @@ stage('Build') {
       docker_image='ci_gpu',
     ) %}
     sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
-    make("${ci_gpu} --no-gpu", 'build', '-j2')
-    {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
+    cmake_build("${ci_gpu} --no-gpu", 'build', '-j2')
+    make_standalone_crt("${ci_gpu} --no-gpu", 'build')
+    {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib + tvm_allvisible + microtvm_template_projects + crttest + standalone_crt) }}
 
     // compiler test
-    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
-    make("${ci_gpu} --no-gpu", 'build2', '-j2')
-    {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }}
+    sh "rm -rf build"
+    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
+    cmake_build("${ci_gpu} --no-gpu", 'build', '-j2')
+    make_standalone_crt("${ci_gpu} --no-gpu", 'build')
+    {{ m.upload_artifacts(tag='gpu2', filenames=tvm_lib + crttest + standalone_crt) }}
   {% endcall %}
 
   {% call m.build_step(
@@ -109,8 +94,10 @@ stage('Build') {
       script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
       label: 'Create CPU cmake config',
     )
-    make(ci_cpu, 'build', '-j2')
-    {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
+    cmake_build(ci_cpu, 'build', '-j2')
+    make_standalone_crt(ci_cpu, 'build')
+    make_cpp_tests(ci_cpu, 'build')
+    {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim + tvm_allvisible + crttest + cpptest + standalone_crt) }}
     ci_setup(ci_cpu)
     // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
     // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch
@@ -128,8 +115,9 @@ stage('Build') {
       script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
       label: 'Create CPU minimal cmake config',
     )
-    make(ci_minimal, 'build', '-j2')
-    {{ m.upload_artifacts(tag='cpu-minimal', filenames=tvm_lib) }}
+    cmake_build(ci_minimal, 'build', '-j2')
+    make_cpp_tests(ci_minimal, 'build')
+    {{ m.upload_artifacts(tag='cpu-minimal', filenames=tvm_lib + tvm_allvisible + cpptest) }}
   {% endcall %}
 
   {% call m.build_step(
@@ -143,7 +131,9 @@ stage('Build') {
       script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
       label: 'Create WASM cmake config',
     )
-    make(ci_wasm, 'build', '-j2')
+    cmake_build(ci_wasm, 'build', '-j2')
+    make_standalone_crt(ci_wasm, 'build')
+    make_cpp_tests(ci_wasm, 'build')
     cpp_unittest(ci_wasm)
     ci_setup(ci_wasm)
     sh (
@@ -163,8 +153,10 @@ stage('Build') {
       script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
       label: 'Create i386 cmake config',
     )
-    make(ci_i386, 'build', '-j2')
-    {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim) }}
+    cmake_build(ci_i386, 'build', '-j2')
+    make_standalone_crt(ci_i386, 'build')
+    make_cpp_tests(ci_i386, 'build')
+    {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim + standalone_crt + crttest + cpptest) }}
   {% endcall %}
 
   {% call m.build_step(
@@ -178,8 +170,10 @@ stage('Build') {
       script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
       label: 'Create ARM cmake config',
     )
-    make(ci_arm, 'build', '-j4')
-    {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib) }}
+    cmake_build(ci_arm, 'build', '-j4')
+    make_standalone_crt(ci_arm, 'build')
+    make_cpp_tests(ci_arm, 'build')
+    {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib + cpptest + crttest + standalone_crt) }}
   {% endcall %}
 
   {% call m.build_step(
@@ -193,8 +187,10 @@ stage('Build') {
       script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
       label: 'Create Cortex-M cmake config',
     )
-    make(ci_cortexm, 'build', '-j2')
-    {{ m.upload_artifacts(tag='cortexm', filenames=tvm_lib, folders=microtvm_template_projects) }}
+    cmake_build(ci_cortexm, 'build', '-j2')
+    make_standalone_crt(ci_cortexm, 'build')
+    make_cpp_tests(ci_cortexm, 'build')
+    {{ m.upload_artifacts(tag='cortexm', filenames=tvm_lib + tvm_allvisible + crttest + standalone_crt + cpptest + microtvm_template_projects) }}
   {% endcall %}
 
   {% call m.build_step(
@@ -208,12 +204,13 @@ stage('Build') {
       script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
       label: 'Create Hexagon cmake config',
     )
-    make(ci_hexagon, 'build', '-j2')
+    cmake_build(ci_hexagon, 'build', '-j2')
+    make_cpp_tests(ci_hexagon, 'build')
     sh (
       script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
       label: 'Build Hexagon API',
     )
-    {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib, folders=hexagon_api) }}
+    {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib + cpptest + hexagon_api) }}
   {% endcall %}
 
   {% call m.build_step(
@@ -227,8 +224,10 @@ stage('Build') {
       script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
       label: 'Create RISC-V cmake config',
     )
-    make(ci_riscv, 'build', '-j2')
-    {{ m.upload_artifacts(tag='riscv', filenames=tvm_lib, folders=microtvm_template_projects) }}
+    cmake_build(ci_riscv, 'build', '-j2')
+    make_standalone_crt(ci_riscv, 'build')
+    make_cpp_tests(ci_riscv, 'build')
+    {{ m.upload_artifacts(tag='riscv', filenames=tvm_lib + tvm_allvisible + standalone_crt + crttest + cpptest + microtvm_template_projects) }}
   {% endcall %}
 
   )
diff --git a/ci/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2
index f11d901258f6..9bb6257f7a74 100644
--- a/ci/jenkins/Deploy.groovy.j2
+++ b/ci/jenkins/Deploy.groovy.j2
@@ -91,7 +91,7 @@ def deploy() {
           ws="tvm/deploy-docs",
         ) %}
           init_git()
-          {{ m.download_artifacts(tag='docs', filenames=["docs.tgz"]) }}
+          {{ m.download_artifacts(tag='docs') }}
           deploy_docs()
         {% endcall %}
         {% call m.deploy_step(
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index 7ceef81e4e7c..2a3ade049361 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -93,12 +93,17 @@ rebuild_docker_images = false
 
 // Filenames for stashing between build and test steps
 {% set tvm_runtime = ['build/libtvm_runtime.so', 'build/config.cmake'] %}
+{% set crttest = ['build/crttest'] %}
+{% set tvm_allvisible = ['build/libtvm_allvisible.so'] %}
+{% set cpptest = ['build/cpptest', 'build/build.ninja', 'build/CMakeFiles/rules.ninja'] %}
 {% set tvm_lib = ['build/libtvm.so'] + tvm_runtime %}
 {% set tvm_multilib = ['build/libtvm.so', 'build/libvta_fsim.so'] + tvm_runtime %}
 {% set tvm_multilib_tsim = ['build/libvta_tsim.so'] + tvm_multilib %}
 {% set microtvm_template_projects = ['build/microtvm_template_projects',] %}
 {% set hexagon_api = ['build/hexagon_api_output',] %}
-s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+{% set standalone_crt = ['build/standalone_crt', 'build/build.ninja'] %}
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
 
 // Jenkins script root directory
 jenkins_scripts_root = "ci/scripts/jenkins"
diff --git a/ci/jenkins/Test.groovy.j2 b/ci/jenkins/Test.groovy.j2
index 52ed742d4cc0..eb7c8fdc0c0d 100644
--- a/ci/jenkins/Test.groovy.j2
+++ b/ci/jenkins/Test.groovy.j2
@@ -1,5 +1,19 @@
 {% set test_method_names = [] %}
 
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
 // We have to do this whacky split of the code from where it's used since the
 // JVM limits method length to 64k and we easily exceed that with all this
 // autogenerated code. This makes it so each test step is in its own method so
@@ -14,14 +28,23 @@
   test_method_names=test_method_names,
 ) %}
   {% if shard_index == 1 %}
-  {{ m.download_artifacts(tag='gpu2', filenames=tvm_multilib) }}
-  cpp_unittest(ci_gpu)
+  {{ m.download_artifacts(tag='gpu2') }}
+  sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
+  // These require a GPU to finish the build (i.e. CUDA needs to be load-able)
+  make_standalone_crt(ci_gpu, 'build')
+  // make_cpp_tests(ci_gpu, 'build')
+  // cpp_unittest(ci_gpu)
 
-  {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
+  sh "rm -rf build"
+  {{ m.download_artifacts(tag='gpu') }}
   ci_setup(ci_gpu)
+  sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
+  make_standalone_crt(ci_gpu, 'build')
+  make_cpp_tests(ci_gpu, 'build')
   cpp_unittest(ci_gpu)
+  micro_cpp_unittest(ci_gpu)
   {% else %}
-  {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
+  {{ m.download_artifacts(tag='gpu') }}
   ci_setup(ci_gpu)
   {% endif %}
   {% if shard_index == 2 or num_shards < 2 %}
@@ -48,7 +71,7 @@
   docker_image="ci_cpu",
   test_method_names=test_method_names,
 ) %}
-  {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
+  {{ m.download_artifacts(tag='cpu') }}
   ci_setup(ci_cpu)
   sh (
     script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
@@ -64,10 +87,11 @@
   docker_image="ci_i386",
   test_method_names=test_method_names,
 ) %}
-  {{ m.download_artifacts(tag='i386', filenames=tvm_multilib) }}
+  {{ m.download_artifacts(tag='i386') }}
   ci_setup(ci_i386)
   {% if shard_index == 1 %}
   cpp_unittest(ci_i386)
+  micro_cpp_unittest(ci_i386)
   {% endif %}
   python_unittest(ci_i386)
   sh (
@@ -87,8 +111,7 @@
   test_method_names=test_method_names,
   num_shards=8,
 ) %}
-  {{ m.download_artifacts(tag='hexagon', filenames=tvm_lib, folders=hexagon_api) }}
-  add_hexagon_permissions()
+  {{ m.download_artifacts(tag='hexagon') }}
   ci_setup(ci_hexagon)
   {% if shard_index == 1 %}
   cpp_unittest(ci_hexagon)
@@ -107,7 +130,7 @@
   docker_image="ci_arm",
   test_method_names=test_method_names,
 ) %}
-  {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
+  {{ m.download_artifacts(tag='arm') }}
   ci_setup(ci_arm)
   python_unittest(ci_arm)
   sh (
@@ -124,7 +147,7 @@
   docker_image="ci_gpu",
   test_method_names=test_method_names,
 ) %}
-  {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
+  {{ m.download_artifacts(tag='gpu') }}
   ci_setup(ci_gpu)
   sh (
     script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
@@ -140,7 +163,7 @@
   docker_image="ci_gpu",
   test_method_names=test_method_names,
 ) %}
-  {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
+  {{ m.download_artifacts(tag='gpu') }}
   ci_setup(ci_gpu)
   sh (
     script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
@@ -156,10 +179,11 @@
   num_shards=2,
   test_method_names=test_method_names,
 ) %}
-  {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
+  {{ m.download_artifacts(tag='arm') }}
   ci_setup(ci_arm)
   {% if shard_index == 1 %}
   cpp_unittest(ci_arm)
+  micro_cpp_unittest(ci_arm)
   {% endif %}
   sh (
     script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
@@ -179,7 +203,7 @@
   num_shards=2,
   test_method_names=test_method_names,
 ) %}
-  {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
+  {{ m.download_artifacts(tag='arm') }}
   ci_setup(ci_arm)
   sh (
     script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
@@ -195,11 +219,11 @@
   num_shards=12,
   test_method_names=test_method_names,
 ) %}
-  {{ m.download_artifacts(tag='cortexm', filenames=tvm_lib, folders=microtvm_template_projects) }}
-  add_microtvm_permissions()
+  {{ m.download_artifacts(tag='cortexm') }}
   ci_setup(ci_cortexm)
   {% if shard_index == 1%}
   cpp_unittest(ci_cortexm)
+  micro_cpp_unittest(ci_cortexm)
   sh (
     script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_demo_microtvm.sh",
     label: 'Run microTVM demos',
@@ -219,11 +243,11 @@
   num_shards=1,
   test_method_names=test_method_names,
 ) %}
-  {{ m.download_artifacts(tag='riscv', filenames=tvm_lib, folders=microtvm_template_projects) }}
-  add_microtvm_permissions()
+  {{ m.download_artifacts(tag='riscv') }}
   ci_setup(ci_riscv)
   {% if shard_index == 1%}
   cpp_unittest(ci_cortexm)
+  micro_cpp_unittest(ci_cortexm)
   {% endif %}
   sh (
     script: "${docker_run} ${ci_riscv} ./tests/scripts/task_riscv_microtvm.sh",
@@ -239,7 +263,7 @@ def run_unittest_minimal() {
       platform="minimal",
       docker_image="ci_minimal",
     ) %}
-      {{ m.download_artifacts(tag='cpu-minimal', filenames=tvm_lib) }}
+      {{ m.download_artifacts(tag='cpu-minimal') }}
       cpp_unittest(ci_minimal)
       python_unittest(ci_minimal)
   {% endcall %}
@@ -266,9 +290,10 @@ stage('Test') {
     platform="cpu",
     docker_image="ci_cpu",
   ) %}
-    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
+    {{ m.download_artifacts(tag='cpu') }}
     ci_setup(ci_cpu)
     cpp_unittest(ci_cpu)
+    micro_cpp_unittest(ci_cpu)
     python_unittest(ci_cpu)
     fsim_test(ci_cpu)
     sh (
@@ -283,7 +308,7 @@ stage('Test') {
     platform="cpu",
     docker_image="ci_cpu",
 ) %}
-    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib) }}
+    {{ m.download_artifacts(tag='cpu') }}
     ci_setup(ci_cpu)
     sh (
       script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh",
@@ -296,8 +321,7 @@ stage('Test') {
         ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) {
           init_git()
           docker_init(ci_gpu)
-          {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
-          add_microtvm_permissions()
+          {{ m.download_artifacts(tag='gpu') }}
           timeout(time: 180, unit: 'MINUTES') {
             ci_setup(ci_gpu)
             sh (
@@ -307,7 +331,7 @@ stage('Test') {
           }
           {{ m.upload_artifacts(tag='docs', filenames=["docs.tgz"]) }}
           sh(
-            script: "aws s3 cp --no-progress _docs s3://${s3_prefix}/docs --recursive",
+            script: "aws s3 cp --no-progress _docs s3://${s3_bucket}/${s3_prefix}/docs --recursive",
             label: 'Upload docs to S3',
           )
         }
diff --git a/ci/jenkins/macros.j2 b/ci/jenkins/macros.j2
index b8ac0de91ce6..ff59a4046179 100644
--- a/ci/jenkins/macros.j2
+++ b/ci/jenkins/macros.j2
@@ -21,11 +21,7 @@
 
 {% macro junit_to_s3(test_dir_name) %}
 sh(
-            script: """
-              set -eux
-              . ci/scripts/retry.sh
-              retry 3 aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/{{ test_dir_name }} --recursive
-            """,
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/{{ test_dir_name }} --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 {% endmacro %}
@@ -187,36 +183,17 @@ def {{ method_name }}() {
   },
 {% endmacro %}
 
-{% macro upload_artifacts(tag, filenames, folders=None) %}
+{% macro upload_artifacts(action, tag, filenames) %}
+{% set items = ' '.join(filenames) %}
 sh(
-      script: """
-        set -eux
-        . ${jenkins_scripts_root}/retry.sh
-        {% for filename in filenames %}
-        md5sum {{ filename }}
-        retry 3 aws s3 cp --no-progress {{ filename }} s3://${s3_prefix}/{{ tag }}/{{ filename }}
-        {% endfor %}
-        {% for folder in (folders or []) %}
-        retry 3 aws s3 cp --no-progress {{ folder }} s3://${s3_prefix}/{{ tag }}/{{ folder }} --recursive
-        {% endfor %}
-      """,
+      script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/{{ tag }} --items {{ items }}",
       label: 'Upload artifacts to S3',
     )
 {% endmacro %}
 
-{% macro download_artifacts(tag, filenames, folders=None) %}
+{% macro download_artifacts(tag) %}
 sh(
-            script: """
-              set -eux
-              . ${jenkins_scripts_root}/retry.sh
-              {% for filename in filenames %}
-              retry 3 aws s3 cp --no-progress s3://${s3_prefix}/{{ tag }}/{{ filename }} {{ filename }}
-              md5sum {{ filename }}
-              {% endfor %}
-              {% for folder in (folders or []) %}
-              retry 3 aws s3 cp --no-progress s3://${s3_prefix}/{{ tag }}/{{ folder }} {{ folder }} --recursive
-              {% endfor %}
-            """,
-            label: 'Download artifacts from S3',
-          )
+      script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/{{ tag }}",
+      label: 'Download artifacts from S3',
+    )
 {% endmacro %}
diff --git a/ci/scripts/jenkins/cmd_utils.py b/ci/scripts/jenkins/cmd_utils.py
index 52eaf9ac0ad2..1b282c50ba0f 100644
--- a/ci/scripts/jenkins/cmd_utils.py
+++ b/ci/scripts/jenkins/cmd_utils.py
@@ -20,6 +20,7 @@
 import logging
 import sys
 import re
+import tempfile
 from pathlib import Path
 from typing import List
 
@@ -53,6 +54,16 @@ def __init__(self, env=None, cwd=None):
             self.env.update(env)
         self.cwd = cwd
 
+    def tee(self, cmd: str, **kwargs):
+        """
+        Run 'cmd' in a shell then return the (process, stdout) as a tuple
+        """
+        with tempfile.NamedTemporaryFile(delete=False) as f:
+            proc = self.run(f"{cmd} | tee {f.name}", **kwargs)
+            with open(f.name, "r") as f:
+                output = f.read()
+            return proc, output
+
     def run(self, cmd: str, **kwargs):
         logging.info(f"+ {cmd}")
         defaults = {
diff --git a/ci/scripts/jenkins/s3.py b/ci/scripts/jenkins/s3.py
new file mode 100755
index 000000000000..f5aa535df8c4
--- /dev/null
+++ b/ci/scripts/jenkins/s3.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import logging
+import re
+from pathlib import Path
+from typing import List
+from enum import Enum
+
+from cmd_utils import Sh, REPO_ROOT, init_log
+
+RETRY_SCRIPT = REPO_ROOT / "ci" / "scripts" / "jenkins" / "retry.sh"
+S3_DOWNLOAD_REGEX = re.compile(r"download: s3://.* to (.*)")
+SH = Sh()
+
+
+class Action(Enum):
+    UPLOAD = 1
+    DOWNLOAD = 2
+
+
+def show_md5(item: str) -> None:
+    if not Path(item).is_dir():
+        sh.run(f"md5sum {item}")
+
+
+def parse_output_files(stdout: str) -> List[str]:
+    """
+    Grab the list of downloaded files from the output of 'aws s3 cp'. Lines look
+    like:
+
+        download: s3://some/prefix/a_file.txt to a_file.txt
+    """
+    files = []
+    for line in stdout.split("\n"):
+        line = line.strip()
+        if line == "":
+            continue
+        m = S3_DOWNLOAD_REGEX.match(line)
+        if m:
+            files.append(m.groups()[0])
+
+    return files
+
+
+def chmod(files: List[str]) -> None:
+    """
+    S3 has no concept of file permissions so add them back in here to every file
+    """
+    # Add execute bit for downloads
+    to_chmod = [str(f) for f in files]
+    logging.info(f"Adding execute bit for files: {to_chmod}")
+    if len(to_chmod) > 0:
+        SH.run(f"chmod +x {' '.join(to_chmod)}")
+
+
+def s3(source: str, destination: str, recursive: bool) -> List[str]:
+    """
+    Send or download the source to the destination in S3
+    """
+    cmd = f". {RETRY_SCRIPT.relative_to(REPO_ROOT)} && retry 3 aws s3 cp --no-progress"
+
+    if recursive:
+        cmd += " --recursive"
+
+    cmd += f" {source} {destination}"
+    _, stdout = SH.tee(cmd)
+    return stdout
+
+
+if __name__ == "__main__":
+    init_log()
+    help = "Uploads or downloads files from S3"
+    parser = argparse.ArgumentParser(description=help)
+    parser.add_argument("--action", help="either 'upload' or 'download'", required=True)
+    parser.add_argument("--bucket", help="s3 bucket", required=True)
+    parser.add_argument(
+        "--prefix", help="s3 bucket + tag (e.g. s3://tvm-ci-prod/PR-1234/cpu", required=True
+    )
+    parser.add_argument("--items", help="files and folders to upload", nargs="+")
+
+    args = parser.parse_args()
+    logging.info(args)
+
+    sh = Sh()
+
+    if Path.cwd() != REPO_ROOT:
+        logging.error(f"s3.py can only be executed from the repo root, instead was in {Path.cwd()}")
+        exit(1)
+
+    prefix = args.prefix.strip("/")
+    s3_path = f"s3://{args.bucket}/{prefix}"
+    logging.info(f"Using s3 path: {s3_path}")
+
+    if args.action == "upload":
+        action = Action.UPLOAD
+    elif args.action == "download":
+        action = Action.DOWNLOAD
+    else:
+        logging.error(f"Unsupported action: {args.action}")
+        exit(1)
+
+    if args.items is None:
+        if args.action == "upload":
+            logging.error(f"Cannot upload without --items")
+            exit(1)
+        else:
+            # Download the whole prefix
+            items = ["."]
+
+    else:
+        items = args.items
+
+    for item in items:
+        if action == Action.DOWNLOAD:
+            stdout = s3(source=s3_path, destination=item, recursive=True)
+            files = parse_output_files(stdout)
+            chmod(files)
+            for file in files:
+                # Show md5 after downloading
+                show_md5(file)
+        elif action == Action.UPLOAD:
+            show_md5(item)
+            s3(item, s3_path + "/" + item, recursive=Path(item).is_dir())
diff --git a/tests/scripts/task_ci_setup.sh b/tests/scripts/task_clear_pytest.sh
similarity index 58%
rename from tests/scripts/task_ci_setup.sh
rename to tests/scripts/task_clear_pytest.sh
index 5af0d0c9801e..0a24f62531d5 100755
--- a/tests/scripts/task_ci_setup.sh
+++ b/tests/scripts/task_clear_pytest.sh
@@ -18,23 +18,6 @@
 
 set -euxo pipefail
 
-# Script to setup additional python env.
-#
-# Use the following command to install the
-# package to /workspace/.local, these additional
-# packages will have precedence over the system packages.
-#
-# command: python3 -m pip install --user <package>==<version>
-#
-echo "Additional setup in ${CI_IMAGE_NAME}"
-
-# Rebuild standalone_crt in build/ tree. This file is not currently archived by pack_lib() in
-# Jenkinsfile. We expect config.cmake to be present from pack_lib().
-# TODO(areusch): Make pack_lib() pack all the data dependencies of TVM.
-python3 tests/scripts/task_build.py \
-    --sccache-bucket tvm-sccache-prod \
-    --cmake-target standalone_crt
-
 # Ensure no stale pytest-results remain from a previous test run.
 pushd build
 rm -rf pytest-results
diff --git a/tests/scripts/task_cpp_unittest.sh b/tests/scripts/task_cpp_unittest.sh
index 22cc937e8784..bc152dfba34f 100755
--- a/tests/scripts/task_cpp_unittest.sh
+++ b/tests/scripts/task_cpp_unittest.sh
@@ -28,38 +28,16 @@ else
     BUILD_DIR=build
 fi
 
-# Python is required by apps/bundle_deploy
-source tests/scripts/setup-pytest-env.sh
 
-export LD_LIBRARY_PATH="lib:${LD_LIBRARY_PATH:-}"
 # NOTE: important to use abspath, when VTA is enabled.
-export VTA_HW_PATH=`pwd`/3rdparty/vta-hw
+VTA_HW_PATH=$(pwd)/3rdparty/vta-hw
+export VTA_HW_PATH
 
 # to avoid CI thread throttling.
 export TVM_BIND_THREADS=0
 export OMP_NUM_THREADS=1
 
-# Build cpptest suite
-python3 tests/scripts/task_build.py \
-    --sccache-bucket tvm-sccache-prod \
-    --cmake-target cpptest \
-    --build-dir "${BUILD_DIR}"
-
-# crttest requries USE_MICRO to be enabled.
-if grep -Fq "USE_MICRO ON" ${BUILD_DIR}/TVMBuildOptions.txt; then
-  pushd "${BUILD_DIR}"
-  ninja crttest
-  popd
-fi
-
 pushd "${BUILD_DIR}"
-ctest --gtest_death_test_style=threadsafe
+# run cpp test executable
+./cpptest
 popd
-
-# Test MISRA-C runtime. It requires USE_MICRO to be enabled.
-if grep -Fq "USE_MICRO ON" ${BUILD_DIR}/TVMBuildOptions.txt; then
-  pushd apps/bundle_deploy
-  rm -rf build
-  make test_dynamic test_static
-  popd
-fi
diff --git a/tests/scripts/task_microtvm_cpp_tests.sh b/tests/scripts/task_microtvm_cpp_tests.sh
new file mode 100755
index 000000000000..cc36963afa5b
--- /dev/null
+++ b/tests/scripts/task_microtvm_cpp_tests.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euxo pipefail
+
+BUILD_DIR=$1
+
+# Python is required by apps/bundle_deploy
+source tests/scripts/setup-pytest-env.sh
+
+export LD_LIBRARY_PATH="lib:${LD_LIBRARY_PATH:-}"
+# NOTE: important to use abspath, when VTA is enabled.
+VTA_HW_PATH=$(pwd)/3rdparty/vta-hw
+export VTA_HW_PATH
+
+# to avoid CI thread throttling.
+export TVM_BIND_THREADS=0
+export OMP_NUM_THREADS=1
+
+# crttest requries USE_MICRO to be enabled.
+./build/crttest
+
+# Test MISRA-C runtime. It requires USE_MICRO to be enabled.
+pushd apps/bundle_deploy
+rm -rf build
+make test_dynamic test_static
+popd

From 4f35b4ded106ea850f136c7d990e9a9008b5888b Mon Sep 17 00:00:00 2001
From: Chris Sullivan <csullivan@octoml.ai>
Date: Fri, 18 Nov 2022 11:51:35 -0800
Subject: [PATCH 612/704] [TIR][Hexagon] Async DMA fixups (#13436)

* [TIR] Ensure the body of the async_wait attribute is visited.

* [TIR][Runtime] Pass 'bypass_cache' flag during lowering and correctly extract it from packed API.
---
 src/runtime/hexagon/hexagon_device_api.cc | 2 +-
 src/tir/transforms/lower_async_dma.cc     | 2 +-
 src/tir/transforms/lower_tvm_builtin.cc   | 6 ++++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 1c3b139d39a3..51cc976e46d3 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -229,7 +229,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy").set_body([](TVMArgs args, TVM
   void* src = args[2];
   int size = args[3];
   ICHECK(size > 0);
-  bool bypass_cache = args[3];
+  bool bypass_cache = args[4];
 
   int ret = DMA_RETRY;
   do {
diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc
index b9ba4d41b7da..9a950c10c776 100644
--- a/src/tir/transforms/lower_async_dma.cc
+++ b/src/tir/transforms/lower_async_dma.cc
@@ -71,7 +71,7 @@ class AsyncDMALowerer : public StmtExprMutator {
           Evaluate(Call(DataType::Int(32), builtin::dma_wait(), {queue_id, async_wait->value}));
 
       // concatenate the call with the body and return
-      return SeqStmt({call_dma_wait, async_wait->body});
+      return SeqStmt({call_dma_wait, StmtExprMutator::VisitStmt(async_wait->body)});
 
       // Convert this, for example:
       // attr [0] "async_commit_queue_scope" = 0;
diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc
index 25d62539721f..082a54f9c73d 100644
--- a/src/tir/transforms/lower_tvm_builtin.cc
+++ b/src/tir/transforms/lower_tvm_builtin.cc
@@ -329,12 +329,14 @@ class BuiltinLower : public StmtExprMutator {
     PrimExpr dst = op->args[1];
     PrimExpr src = op->args[2];
     PrimExpr size = op->args[3];
+    PrimExpr bypass_cache = op->args[4];
 
     std::string fdevapi_prefix =
         "device_api." + std::string(runtime::DeviceName(device_type_.as<IntImmNode>()->value));
 
-    Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(),
-                            {StringImm(fdevapi_prefix + ".dma_copy"), queue_id, dst, src, size});
+    Call call_packed =
+        Call(DataType::Int(32), builtin::tvm_call_packed(),
+             {StringImm(fdevapi_prefix + ".dma_copy"), queue_id, dst, src, size, bypass_cache});
     return VisitExpr(call_packed);
   }
 

From bfb4c00197c609207c294bf764bcec128c7a7775 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Fri, 18 Nov 2022 23:23:48 +0300
Subject: [PATCH 613/704] [OpenCL] Improve OpenCL version detection (#13434)

As it was mentioned in #13362, it would be nice to add check when the
user is using a version of libOpenCL.so that is too old.

In this PR we introduce this functionality. In the `init` method, we
traverse through all OpenCL devices and check their version. If the
version is older than the target version in TVM, then we notify the user
that we will skip this device. We cannot throw any exception from method
`init` because it is possible that you have compiled host code with
OpenCL support, but the host device won't have any OpenCL device which
is supported by TVM (e.g. they all have too old version of
libOpenCL.so). From OpenCL codegen we call function OpenCLModuleCreate.
In the OpenCLModuleCreate init function might be called and in this case
an exception will be generated on the host side although that the target
device might be supported by TVM. This is why we don't throw any
exceptions in the init function.

If in the runtime we use some OpenCL methods and the list of the devices
is empty, then we will generate an exception and notify user that
possible reason is because version of libOpenCL.so is too old.
---
 src/runtime/opencl/opencl_common.h      | 11 +++-
 src/runtime/opencl/opencl_device_api.cc | 70 ++++++++++++++++++-------
 2 files changed, 61 insertions(+), 20 deletions(-)

diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 7f7f083cf303..4c51158c29df 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -263,7 +263,7 @@ class OpenCLWorkspace : public DeviceAPI {
     ICHECK(IsOpenCLDevice(dev));
     this->Init();
     ICHECK(dev.device_id >= 0 && static_cast<size_t>(dev.device_id) < queues.size())
-        << "Invalid OpenCL device_id=" << dev.device_id;
+        << "Invalid OpenCL device_id=" << dev.device_id << ". " << GetError();
     return queues[dev.device_id];
   }
   // get the event queue of the context
@@ -271,7 +271,7 @@ class OpenCLWorkspace : public DeviceAPI {
     ICHECK(IsOpenCLDevice(dev));
     this->Init();
     ICHECK(dev.device_id >= 0 && static_cast<size_t>(dev.device_id) < queues.size())
-        << "Invalid OpenCL device_id=" << dev.device_id;
+        << "Invalid OpenCL device_id=" << dev.device_id << ". " << GetError();
     return events[dev.device_id];
   }
   // is current clCommandQueue in profiling mode
@@ -310,6 +310,13 @@ class OpenCLWorkspace : public DeviceAPI {
   static OpenCLWorkspace* Global();
 
   void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final;
+
+ private:
+  std::string GetError() {
+    if (this->devices.size() == 0) return noDevicesErrorMsg;
+    return "";
+  }
+  std::string noDevicesErrorMsg = "";
 };
 
 /*! \brief Thread local workspace */
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index d67864287dbc..58744c2cc615 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -25,6 +25,8 @@
 #include <tvm/runtime/profiling.h>
 #include <tvm/runtime/registry.h>
 
+#include <sstream>
+
 #include "opencl_common.h"
 
 namespace tvm {
@@ -33,6 +35,7 @@ namespace cl {
 
 std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name);
 std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name);
+std::string GetOpenCLVersion(cl_device_id pid);
 
 struct ImageInfo {
   size_t origin[3] = {};
@@ -111,7 +114,7 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
     *rv = static_cast<int>(index < devices.size());
     return;
   }
-  ICHECK_LT(index, devices.size()) << "Invalid device id " << index;
+  ICHECK_LT(index, devices.size()) << "Invalid device id " << index << ". " << GetError();
   switch (kind) {
     case kExist:
       break;
@@ -139,17 +142,9 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
       *rv = static_cast<int64_t>(value);
       break;
     }
-    case kComputeVersion: {
-      // String returned is "OpenCL $MAJOR.$MINOR $VENDOR_INFO".  To
-      // match other implementations, we want to return "$MAJOR.$MINOR"
-      std::string ret = GetDeviceInfo(devices[index], CL_DEVICE_VERSION);
-
-      const size_t version_start = 7;  // Length of initial "OpenCL " prefix to skip
-      const size_t version_end = ret.find(' ', version_start);
-      *rv = ret.substr(version_start, version_end - version_start);
+    case kComputeVersion:
+      *rv = GetOpenCLVersion(devices[index]);
       break;
-    }
-      return;
     case kDeviceName:
       *rv = GetDeviceInfo(devices[index], CL_DEVICE_NAME);
       break;
@@ -200,7 +195,7 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
 void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,
                                       DLDataType type_hint) {
   this->Init();
-  ICHECK(context != nullptr) << "No OpenCL device";
+  ICHECK(context != nullptr) << "No OpenCL device. " << GetError();
   cl_int err_code;
   cl::BufferDescriptor* desc = new cl::BufferDescriptor;
   // CL_INVALID_BUFFER_SIZE if size is 0.
@@ -245,7 +240,7 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
 cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height,
                                      DLDataType type_hint) {
   this->Init();
-  ICHECK(context != nullptr) << "No OpenCL device";
+  ICHECK(context != nullptr) << "No OpenCL device. " << GetError();
   cl_int err_code;
   cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint);
   cl_image_format format = {CL_RGBA, cl_type};
@@ -373,12 +368,23 @@ std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name) {
 std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name) {
   size_t ret_size;
   OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
-  std::string ret;
-  ret.resize(ret_size);
-  OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, &ret[0], nullptr));
+  char* info = new char[ret_size];
+  OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, info, nullptr));
+  std::string ret = info;
+  delete[] info;
   return ret;
 }
 
+std::string GetOpenCLVersion(cl_device_id pid) {
+  // String returned is "OpenCL $MAJOR.$MINOR $VENDOR_INFO".  To
+  // match other implementations, we want to return "$MAJOR.$MINOR"
+  std::string ret = GetDeviceInfo(pid, CL_DEVICE_VERSION);
+
+  const size_t version_start = 7;  // Length of initial "OpenCL " prefix to skip
+  const size_t version_end = ret.find(' ', version_start);
+  return ret.substr(version_start, version_end - version_start);
+}
+
 std::vector<cl_platform_id> GetPlatformIDs() {
   cl_uint ret_size;
   cl_int code = clGetPlatformIDs(0, nullptr, &ret_size);
@@ -432,16 +438,44 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
       LOG(WARNING) << "Using CPU OpenCL device";
       devices_matched = cl::GetDeviceIDs(platform_id, "cpu");
     }
-    if (devices_matched.size() > 0) {
+    std::vector<cl_device_id> supported_devices = {};
+    auto get_version_str = [](int version) {
+      std::ostringstream out;
+      out.precision(1);
+      out << std::fixed << version / 100.f;
+      return out.str();
+    };
+    for (auto& device : devices_matched) {
+      std::string ver = GetOpenCLVersion(device);
+      int opencl_version = std::stod(ver) * 100;
+      if (opencl_version >= CL_TARGET_OPENCL_VERSION) {
+        supported_devices.push_back(device);
+      } else {
+        std::string dev_msg = GetDeviceInfo(device, CL_DEVICE_NAME) +
+                              " has OpenCL version == " + get_version_str(opencl_version);
+        LOG(WARNING) << "TVM supports devices with OpenCL version >= "
+                     << get_version_str(CL_TARGET_OPENCL_VERSION) << ", device " << dev_msg
+                     << ". This device will be ignored.";
+
+        if (noDevicesErrorMsg.empty()) {
+          noDevicesErrorMsg =
+              "Probably this error happen because TVM supports devices with OpenCL version >= " +
+              get_version_str(CL_TARGET_OPENCL_VERSION) + ". We found the following devices:\n";
+        }
+        noDevicesErrorMsg += "\t" + dev_msg + "\n";
+      }
+    }
+    if (supported_devices.size() > 0) {
       this->platform_id = platform_id;
       this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME);
       this->device_type = device_type;
-      this->devices = devices_matched;
+      this->devices = supported_devices;
       break;
     }
   }
   if (this->platform_id == nullptr) {
     LOG(WARNING) << "No OpenCL device";
+    initialized_ = true;
     return;
   }
   cl_int err_code;

From 1888dbd72d143069d3dd7fa829d3628e02e700de Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Sat, 19 Nov 2022 01:54:35 +0530
Subject: [PATCH 614/704] [DOCKER][ADRENO] Enable autotvm tuning cache from
 tophub for Adreno GPU (#13427)

fix envoronment for RPC execution.
---
 python/tvm/autotvm/tophub.py                     | 2 ++
 tests/python/contrib/test_clml/infrastructure.py | 2 +-
 tests/scripts/task_python_adreno.sh              | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 0a51bb12b2a4..f705d591e6ee 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -57,6 +57,7 @@
     "intel_graphics": "v0.02",
     "vta": "v0.10",
     "amd_apu": "v0.01",
+    "adreno": "v0.01",
 }
 
 logger = logging.getLogger("autotvm")
@@ -70,6 +71,7 @@ def _alias(name):
         "vulkan": "opencl",
         "nvptx": "cuda",
         "amd_apu": "amd_apu",
+        "adreno": "adreno",
     }
     return table.get(name, name)
 
diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py
index 12accda3fda5..89c22255d77d 100644
--- a/tests/python/contrib/test_clml/infrastructure.py
+++ b/tests/python/contrib/test_clml/infrastructure.py
@@ -77,7 +77,7 @@ class Device:
     port = int(os.getenv("TVM_TRACKER_PORT", 9090))
     target = "opencl"
     target_host = "llvm -mtriple=aarch64-linux-gnu"
-    device_key = "android"
+    device_key = os.getenv("RPC_DEVICE_KEY", "android")
     cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++")
 
     def __init__(self):
diff --git a/tests/scripts/task_python_adreno.sh b/tests/scripts/task_python_adreno.sh
index 2b131ec762be..809df1ed2daf 100755
--- a/tests/scripts/task_python_adreno.sh
+++ b/tests/scripts/task_python_adreno.sh
@@ -28,6 +28,7 @@ export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-adreno
 
 export TVM_TRACKER_HOST=127.0.0.1
 export TVM_TRACKER_PORT=$(((RANDOM % 100) + 9100))
+export RPC_DEVICE_KEY="android"
 export RPC_TARGET="adreno"
 export TVM_NDK_CC="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang"
 
@@ -45,7 +46,7 @@ adb reverse tcp:${TVM_TRACKER_PORT} tcp:${TVM_TRACKER_PORT}
 adb forward tcp:5000 tcp:5000
 adb forward tcp:5001 tcp:5001
 adb forward tcp:5002 tcp:5002
-env adb shell "cd /data/local/tmp/tvm_ci; killall -9 tvm_rpc_ci; sleep 2; LD_LIBRARY_PATH=/data/local/tmp/tvm_ci/ ./tvm_rpc_ci server --host=0.0.0.0 --port=5000 --port-end=5010 --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=android" &
+env adb shell "cd /data/local/tmp/tvm_ci; killall -9 tvm_rpc_ci; sleep 2; LD_LIBRARY_PATH=/data/local/tmp/tvm_ci/ ./tvm_rpc_ci server --host=0.0.0.0 --port=5000 --port-end=5010 --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" &
 DEVICE_PID=$!
 sleep 5 # Wait for the device connections
 trap "{ kill ${TRACKER_PID}; kill ${DEVICE_PID}; }" 0

From 8eff158470894fa495710cbfacba7f1984de138b Mon Sep 17 00:00:00 2001
From: krishnaraj36 <45380557+krishnaraj36@users.noreply.github.com>
Date: Sat, 19 Nov 2022 01:55:12 +0530
Subject: [PATCH 615/704] [RUNTIME][CLML] Add fixes to clml runtime api
 (#13426)

* [RUNTIME][CLML] Add fixes to clml runtime

Handled unsupported configuration for ops in clml pattern table
and updated Dense op runtime api with conv 1x1 kernel api invocation.

* [RUNTIME][CLML] Add fixes to clml runtime

Handled unsupported configuration for ops in clml pattern table and
updated Dense op runtime api with conv 1x1 kernel api invocation.

* Fix the lint error for clml.py

* Fix the lint error for whitespace

* Fix the lint error for space

* Fix the lint error for if condition

* Fixes for lint error of whitespace

* Fixes for lint error of trailing space

* Fixes for lint error of trailing space

* Fixes for lint error of whitespace

* Fixes for lint error of trailingspace

Co-authored-by: kvegiraj <kvegiraj@qti.qualcomm.com>
---
 python/tvm/relay/op/contrib/clml.py       | 70 +++++++++++++++++------
 src/relay/backend/contrib/clml/codegen.cc | 26 ++++++++-
 src/relay/backend/contrib/clml/target.cc  | 41 +++++++++++++
 src/runtime/contrib/clml/clml_runtime.cc  | 38 +++++++-----
 4 files changed, 140 insertions(+), 35 deletions(-)
 create mode 100644 src/relay/backend/contrib/clml/target.cc

diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py
index d253544d45d9..c3d4eb84700d 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -137,11 +137,12 @@ def conv_pattern():
         pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
         pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
         pattern = pattern.optional(
-            lambda x: is_op("nn.batch_norm")(
-                x, is_constant(), is_constant(), is_constant(), is_constant()
+            lambda x: is_tuple_get_item(
+                is_op("nn.batch_norm")(
+                    x, is_constant(), is_constant(), is_constant(), is_constant()
+                )
             )
         )
-        pattern = pattern.optional(is_tuple_get_item)
         pattern = pattern.optional(is_op("nn.relu"))
         pattern = pattern.optional(is_op("clip"))
         return pattern
@@ -176,12 +177,13 @@ def dense_pattern():
 
     def pad_pattern():
         """Create a pad pattern."""
-        pattern = is_op("nn.pad")(wildcard(), wildcard())
+        pattern = is_op("nn.pad")(wildcard(), is_constant())
         return pattern
 
     def check_conv(extract):
         """Check conv pattern is supported by CLML."""
         call = extract
+        clip_found = False
         if isinstance(call, tvm.relay.expr.TupleGetItem):
             call = call.tuple_value
         elif call.op.name == "nn.relu":
@@ -189,6 +191,7 @@ def check_conv(extract):
             if isinstance(call, tvm.relay.expr.TupleGetItem):
                 call = call.tuple_value
         elif call.op.name == "clip":
+            clip_found = True
             if call.attrs["a_min"] != 0.0 or call.attrs["a_max"] != 6.0:
                 return False
             call = call.args[0]
@@ -200,6 +203,14 @@ def check_conv(extract):
         attrs, args = call.attrs, call.args
         if attrs.data_layout != "NCHW":
             return False
+        if (
+            (not clip_found)
+            and (attrs.kernel_size[0] == 3)
+            and (attrs.dilation[0] != 1)
+            and (attrs.groups != 1)
+            and (attrs.channels == attrs.groups)
+        ):
+            return False
         data_typ = args[0].checked_type
         kernel_typ = args[1].checked_type
         is_depthwise = is_depthwise_conv2d(
@@ -213,12 +224,44 @@ def check_conv(extract):
             return False
         return True
 
+    def check_binary_op(extract):
+        call = extract
+        if len(call.args[1].checked_type.shape) > 0:
+            return True
+        return False
+
+    def check_pad_op(extract):
+        call = extract
+        if len(call.attrs["pad_width"]) != 4:
+            return False
+        return True
+
+    def check_softmax_op(extract):
+        call = extract
+        if len(call.args[0].checked_type.shape) > 2:
+            return False
+        return True
+
+    def check_default_op(extract):
+        return True
+
     return [
         ("clml.conv2d", conv_pattern(), check_conv),
-        ("clml.dense", dense_pattern()),
-        ("clml.pad", pad_pattern()),
-        ("clml.concat", concat_pattern()),
-        ("clml.batch_norm", batch_norm_pattern()),
+        ("clml.dense", dense_pattern(), check_default_op),
+        ("clml.pad", pad_pattern(), check_pad_op),
+        ("clml.concat", concat_pattern(), check_default_op),
+        ("clml.batch_norm", batch_norm_pattern(), check_default_op),
+        ("clml.add", is_op("add")(wildcard(), wildcard()), check_binary_op),
+        ("clml.subtract", is_op("subtract")(wildcard(), wildcard()), check_binary_op),
+        ("clml.multiply", is_op("multiply")(wildcard(), wildcard()), check_binary_op),
+        ("clml.softmax", is_op("nn.softmax")(wildcard()), check_softmax_op),
+        ("clml.reshape", is_op("reshape")(wildcard()), check_default_op),
+        ("clml.avg_pool2d", is_op("nn.avg_pool2d")(wildcard()), check_default_op),
+        ("clml.max_pool2d", is_op("nn.max_pool2d")(wildcard()), check_default_op),
+        ("clml.global_avg_pool2d", is_op("nn.global_avg_pool2d")(wildcard()), check_default_op),
+        ("clml.global_max_pool2d", is_op("nn.global_max_pool2d")(wildcard()), check_default_op),
+        ("clml.relu", is_op("nn.relu")(wildcard()), check_default_op),
+        ("clml.clip", is_op("clip")(wildcard()), check_default_op),
     ]
 
 
@@ -230,17 +273,6 @@ def _func_wrapper(expr):
     return _func_wrapper
 
 
-_register_external_op_helper("clip")
-_register_external_op_helper("nn.relu")
-_register_external_op_helper("nn.global_avg_pool2d")
-_register_external_op_helper("nn.global_max_pool2d")
-_register_external_op_helper("nn.avg_pool2d")
-_register_external_op_helper("nn.max_pool2d")
-_register_external_op_helper("nn.softmax")
-_register_external_op_helper("reshape")
-_register_external_op_helper("add")
-_register_external_op_helper("subtract")
-_register_external_op_helper("multiply")
 _register_external_op_helper("minimum")
 _register_external_op_helper("maximum")
 
diff --git a/src/relay/backend/contrib/clml/codegen.cc b/src/relay/backend/contrib/clml/codegen.cc
index b89f05e17857..9ecec0c4531f 100644
--- a/src/relay/backend/contrib/clml/codegen.cc
+++ b/src/relay/backend/contrib/clml/codegen.cc
@@ -94,7 +94,7 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
     } else if (name == "clml.concat") {
       json_node = CreateConcatJSONNode(cn);
     } else {
-      LOG(FATAL) << "Unrecognized CLML  pattern: " << name;
+      json_node = CreateGenericJSONNode(cn);
     }
     return AddNode(json_node, GetRef<Expr>(cn));
   }
@@ -164,7 +164,7 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
       nodes.bn = current_call;
       current_call = current_call->args[0].as<CallNode>();
     }
-    if (backend::IsOp(current_call, "add")) {
+    if (backend::IsOp(current_call, "add") || backend::IsOp(current_call, "nn.bias_add")) {
       nodes.bias = current_call;
       current_call = current_call->args[0].as<CallNode>();
     }
@@ -387,6 +387,28 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
 
     return json_node;
   }
+
+  std::shared_ptr<JSONGraphNode> CreateGenericJSONNode(const CallNode* cn) {
+    const auto* fn = cn->op.as<FunctionNode>();
+    ICHECK(fn);
+    const auto* node = fn->body.as<CallNode>();
+
+    const auto* node_op = node->op.as<OpNode>();
+    ICHECK(node_op);
+    const std::string name = node_op->name;
+
+    std::vector<JSONGraphNodeEntry> inputs;
+    unsigned int i = 0;
+    for (i = 0; i < cn->args.size(); i++) {
+      inputs.push_back(VisitExpr(cn->args[i])[0]);
+    }
+    for (unsigned int j = i; j < node->args.size(); j++) {
+      inputs.push_back(VisitExpr(node->args[j])[0]);
+    }
+    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 1);
+    SetCallNodeAttribute(json_node, node);
+    return json_node;
+  }
 };
 
 /*!
diff --git a/src/relay/backend/contrib/clml/target.cc b/src/relay/backend/contrib/clml/target.cc
new file mode 100644
index 000000000000..c7f22c1315c8
--- /dev/null
+++ b/src/relay/backend/contrib/clml/target.cc
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/contrib/clml/target.cc
+ * \brief Registers the "clml" external codegen TargetKind.
+ */
+
+#include <tvm/target/target.h>
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+
+/*!
+ * \brief This external codegen target can use the CLML library linked into the TVM runtime.
+ *  - Patterns and custom compiler: python/tvm/relay/op/contrib/clml.py
+ *  - Runtime: src/runtime/contrib/clml/clml_runtime.cc
+ */
+TVM_REGISTER_TARGET_KIND("clml", kDLOpenCL)
+    .set_attr<Bool>(tvm::attr::kIsExternalCodegen, Bool(true));
+
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index cdc3b9a7b51c..7492e521b7f5 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -1144,28 +1144,38 @@ class CLMLRuntime : public JSONRuntimeBase {
                                              CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     auto wt_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
     bool has_bias = node.GetInputs().size() == 3 ? true : false;
-    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.n, wt_dims.c},
+    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {wt_dims.n, wt_dims.c, 1, 1},
                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+
     auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
     if (has_bias) {
       auto bias_dims = get_tensor_dims(nodes_[node.GetInputs()[2].id_]);
       bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 1, 1},
                                          CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
-    }
-
-    cl_ml_op_fully_connected_desc_qcom fc_desc = {1, CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM,
-                                                  cl_arithmetic_mode};
-    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
-
-    if (has_bias) {
-      result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(
-          workspace->context, 0, &fc_desc, input->tensor, weight->tensor, bias->tensor,
-          output->tensor, &op, tuning_cache);
     } else {
-      result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(workspace->context, 0, &fc_desc,
-                                                          input->tensor, weight->tensor, NULL,
-                                                          output->tensor, &op, tuning_cache);
+      cl_ml_tensor_desc_qcom desc = {};
+      desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
+      result =
+          h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor);
+      ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result;
+      bias->tensor = layer_.unusedTensor;
     }
+    // Output
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype, nullptr,
+                                             {1, wt_dims.n, 1, 1});
+    cl_ml_op_convolution_desc_qcom conv_desc = {CL_CONVOLUTION_MODE_CONVOLUTION_QCOM,
+                                                1,
+                                                4,
+                                                {0, 0},
+                                                {0, 0},
+                                                {1, 1},
+                                                {1, 1},
+                                                0,
+                                                cl_arithmetic_mode};
+
+    result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM(
+        workspace->context, 0, &conv_desc, input->tensor, weight->tensor, bias->tensor,
+        output->tensor, &op, NULL);
     ICHECK(op && result == CL_SUCCESS) << "Fully Connected Error:" << result;
 
     layer->function.push_back(op);

From b023e9539c33a2fa43207a122ec60ad11c67314b Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Fri, 18 Nov 2022 12:26:16 -0800
Subject: [PATCH 616/704] [Fix,Roofline] Handle zero length features in
 roofline (#13424)

These features may occur for extern ops.
---
 python/tvm/auto_scheduler/feature.py  | 2 ++
 python/tvm/utils/roofline/__init__.py | 2 ++
 src/auto_scheduler/feature.cc         | 5 ++++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/tvm/auto_scheduler/feature.py b/python/tvm/auto_scheduler/feature.py
index 491f8b7643b6..21db37abd43e 100644
--- a/python/tvm/auto_scheduler/feature.py
+++ b/python/tvm/auto_scheduler/feature.py
@@ -327,4 +327,6 @@ def named_features_from_primfunc(
     """
     features = features_from_primfunc(func, cache_line_bytes, max_n_bufs, log_scale)
     names = get_per_store_feature_names(max_n_bufs)
+    if features.shape[0] == 0:
+        return None
     return {name: features[:, i] for i, name in enumerate(names)}
diff --git a/python/tvm/utils/roofline/__init__.py b/python/tvm/utils/roofline/__init__.py
index 3b0144cb90e8..1129ac2c0e1d 100644
--- a/python/tvm/utils/roofline/__init__.py
+++ b/python/tvm/utils/roofline/__init__.py
@@ -142,6 +142,8 @@ def roofline_from_existing(
     for call in report.calls:
         if "Hash" in call.keys() and call["Hash"] in all_features:
             _, prim, features = all_features[call["Hash"]]
+            if features is None:
+                continue
 
             with target:
                 flops, peak_flops, flops_name = registry.estimate_peak_flops(
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index 0b5a157c8813..2f993c0c8b82 100644
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1737,7 +1737,10 @@ TVM_REGISTER_GLOBAL("auto_scheduler.FeaturesFromPrimFunc")
       std::vector<float> vec;
       GetPerStoreFeature(func, cache_line_size, max_n_bufs, &vec, log_scale);
       int64_t num_feature_rows = vec[0];  // first element is number of rows
-      int64_t row_length = (vec.size() - 1) / num_feature_rows;
+      int64_t row_length = 0;
+      if (num_feature_rows != 0) {
+        row_length = (vec.size() - 1) / num_feature_rows;
+      }
       auto ary =
           runtime::NDArray::Empty({num_feature_rows, row_length}, {kDLFloat, 32, 1}, {kDLCPU, 0});
       // NDArray is row major by default

From 80f8e8bb74d8f4d026116d86a7b1d81a9e69fbcf Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 18 Nov 2022 14:30:42 -0600
Subject: [PATCH 617/704] [TE] Remove binding to temporary variable (#13430)

The call to `analyzer->Simplify` returns a `PrimExpr` object.  Binding
a `const PrimExpr&` to this temporary object results in a dangling
reference.
---
 src/te/operation/create_primfunc.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index d797350eed4f..9f8d7d46a151 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -156,8 +156,8 @@ BlockRealize GenerateBlockFromTensors(const te::ComputeOp& compute_op,
       Var new_var(iter_var->var->name_hint, iter_var->var->dtype);
       var_map[iter_var->var.get()] = new_var;
 
-      const PrimExpr& dom_min = analyzer->Simplify(iter_var->dom->min);
-      const PrimExpr& dom_extent = analyzer->Simplify(iter_var->dom->extent);
+      PrimExpr dom_min = analyzer->Simplify(iter_var->dom->min);
+      PrimExpr dom_extent = analyzer->Simplify(iter_var->dom->extent);
       iter_vars.push_back(IterVar(Range::FromMinExtent(dom_min, dom_extent), new_var,
                                   iter_var->iter_type, iter_var->thread_tag, iter_var->span));
     }

From c1b8721695d52a57a86dd7030c4ca2dbbf46134b Mon Sep 17 00:00:00 2001
From: Alexey Yazev <113356454+Alexey-Yazev@users.noreply.github.com>
Date: Fri, 18 Nov 2022 14:16:04 -0800
Subject: [PATCH 618/704] [microNPU] Upgrade Vela to v3.5.0 (#13394)

---
 apps/microtvm/cmsisnn/requirements.txt        | 79 +++++++++----------
 apps/microtvm/ethosu/requirements.txt         | 79 +++++++++----------
 docker/install/ubuntu_install_vela.sh         |  2 +-
 .../how_to/work_with_microtvm/micro_ethosu.py |  4 +-
 python/gen_requirements.py                    |  2 +-
 5 files changed, 82 insertions(+), 84 deletions(-)

diff --git a/apps/microtvm/cmsisnn/requirements.txt b/apps/microtvm/cmsisnn/requirements.txt
index 6c699612dac5..ae8e0aacd738 100644
--- a/apps/microtvm/cmsisnn/requirements.txt
+++ b/apps/microtvm/cmsisnn/requirements.txt
@@ -7,8 +7,8 @@ cloudpickle==2.0.0 \
 decorator==5.1.0 \
     --hash=sha256:7b12e7c3c6ab203a29e157335e9122cb03de9ab7264b137594103fd4a683b374 \
     --hash=sha256:e59913af105b9860aa2c8d3272d9de5a56a4e608db9a2f167a8480b323d529a7
-ethos-u-vela==3.2.0 \
-    --hash=sha256:2deb06af5d5c71227aeba9a98cd1f65869250cf70f89759de3f03475a38b7b0b
+ethos-u-vela==3.5.0 \
+    --hash=sha256:e56c2f62e06439f45d07f2e6f41fd133a46fb7b6a2e0e6d3baf7ec1d947baca1
 flatbuffers==1.12 \
     --hash=sha256:63bb9a722d5e373701913e226135b28a6f6ac200d5cc7b4d919fa38d73b44610 \
     --hash=sha256:9e9ef47fa92625c4721036e7c4124182668dc6021d9e7c73704edd395648deb9
@@ -65,41 +65,40 @@ nose==1.3.7 \
     --hash=sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac \
     --hash=sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a \
     --hash=sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98
-numpy==1.19.5 \
-    --hash=sha256:012426a41bc9ab63bb158635aecccc7610e3eff5d31d1eb43bc099debc979d94 \
-    --hash=sha256:06fab248a088e439402141ea04f0fffb203723148f6ee791e9c75b3e9e82f080 \
-    --hash=sha256:0eef32ca3132a48e43f6a0f5a82cb508f22ce5a3d6f67a8329c81c8e226d3f6e \
-    --hash=sha256:1ded4fce9cfaaf24e7a0ab51b7a87be9038ea1ace7f34b841fe3b6894c721d1c \
-    --hash=sha256:2e55195bc1c6b705bfd8ad6f288b38b11b1af32f3c8289d6c50d47f950c12e76 \
-    --hash=sha256:2ea52bd92ab9f768cc64a4c3ef8f4b2580a17af0a5436f6126b08efbd1838371 \
-    --hash=sha256:36674959eed6957e61f11c912f71e78857a8d0604171dfd9ce9ad5cbf41c511c \
-    --hash=sha256:384ec0463d1c2671170901994aeb6dce126de0a95ccc3976c43b0038a37329c2 \
-    --hash=sha256:39b70c19ec771805081578cc936bbe95336798b7edf4732ed102e7a43ec5c07a \
-    --hash=sha256:400580cbd3cff6ffa6293df2278c75aef2d58d8d93d3c5614cd67981dae68ceb \
-    --hash=sha256:43d4c81d5ffdff6bae58d66a3cd7f54a7acd9a0e7b18d97abb255defc09e3140 \
-    --hash=sha256:50a4a0ad0111cc1b71fa32dedd05fa239f7fb5a43a40663269bb5dc7877cfd28 \
-    --hash=sha256:603aa0706be710eea8884af807b1b3bc9fb2e49b9f4da439e76000f3b3c6ff0f \
-    --hash=sha256:6149a185cece5ee78d1d196938b2a8f9d09f5a5ebfbba66969302a778d5ddd1d \
-    --hash=sha256:759e4095edc3c1b3ac031f34d9459fa781777a93ccc633a472a5468587a190ff \
-    --hash=sha256:7fb43004bce0ca31d8f13a6eb5e943fa73371381e53f7074ed21a4cb786c32f8 \
-    --hash=sha256:811daee36a58dc79cf3d8bdd4a490e4277d0e4b7d103a001a4e73ddb48e7e6aa \
-    --hash=sha256:8b5e972b43c8fc27d56550b4120fe6257fdc15f9301914380b27f74856299fea \
-    --hash=sha256:99abf4f353c3d1a0c7a5f27699482c987cf663b1eac20db59b8c7b061eabd7fc \
-    --hash=sha256:a0d53e51a6cb6f0d9082decb7a4cb6dfb33055308c4c44f53103c073f649af73 \
-    --hash=sha256:a12ff4c8ddfee61f90a1633a4c4afd3f7bcb32b11c52026c92a12e1325922d0d \
-    --hash=sha256:a4646724fba402aa7504cd48b4b50e783296b5e10a524c7a6da62e4a8ac9698d \
-    --hash=sha256:a76f502430dd98d7546e1ea2250a7360c065a5fdea52b2dffe8ae7180909b6f4 \
-    --hash=sha256:a9d17f2be3b427fbb2bce61e596cf555d6f8a56c222bd2ca148baeeb5e5c783c \
-    --hash=sha256:ab83f24d5c52d60dbc8cd0528759532736b56db58adaa7b5f1f76ad551416a1e \
-    --hash=sha256:aeb9ed923be74e659984e321f609b9ba54a48354bfd168d21a2b072ed1e833ea \
-    --hash=sha256:c843b3f50d1ab7361ca4f0b3639bf691569493a56808a0b0c54a051d260b7dbd \
-    --hash=sha256:cae865b1cae1ec2663d8ea56ef6ff185bad091a5e33ebbadd98de2cfa3fa668f \
-    --hash=sha256:cc6bd4fd593cb261332568485e20a0712883cf631f6f5e8e86a52caa8b2b50ff \
-    --hash=sha256:cf2402002d3d9f91c8b01e66fbb436a4ed01c6498fffed0e4c7566da1d40ee1e \
-    --hash=sha256:d051ec1c64b85ecc69531e1137bb9751c6830772ee5c1c426dbcfe98ef5788d7 \
-    --hash=sha256:d6631f2e867676b13026e2846180e2c13c1e11289d67da08d71cacb2cd93d4aa \
-    --hash=sha256:dbd18bcf4889b720ba13a27ec2f2aac1981bd41203b3a3b27ba7a33f88ae4827 \
-    --hash=sha256:df609c82f18c5b9f6cb97271f03315ff0dbe481a2a02e56aeb1b1a985ce38e60
+numpy==1.21.3 \
+    --hash=sha256:043e83bfc274649c82a6f09836943e4a4aebe5e33656271c7dbf9621dd58b8ec \
+    --hash=sha256:160ccc1bed3a8371bf0d760971f09bfe80a3e18646620e9ded0ad159d9749baa \
+    --hash=sha256:188031f833bbb623637e66006cf75e933e00e7231f67e2b45cf8189612bb5dc3 \
+    --hash=sha256:28f15209fb535dd4c504a7762d3bc440779b0e37d50ed810ced209e5cea60d96 \
+    --hash=sha256:29fb3dcd0468b7715f8ce2c0c2d9bbbaf5ae686334951343a41bd8d155c6ea27 \
+    --hash=sha256:2a6ee9620061b2a722749b391c0d80a0e2ae97290f1b32e28d5a362e21941ee4 \
+    --hash=sha256:300321e3985c968e3ae7fbda187237b225f3ffe6528395a5b7a5407f73cf093e \
+    --hash=sha256:32437f0b275c1d09d9c3add782516413e98cd7c09e6baf4715cbce781fc29912 \
+    --hash=sha256:3c09418a14471c7ae69ba682e2428cae5b4420a766659605566c0fa6987f6b7e \
+    --hash=sha256:49c6249260890e05b8111ebfc391ed58b3cb4b33e63197b2ec7f776e45330721 \
+    --hash=sha256:4cc9b512e9fb590797474f58b7f6d1f1b654b3a94f4fa8558b48ca8b3cfc97cf \
+    --hash=sha256:508b0b513fa1266875524ba8a9ecc27b02ad771fe1704a16314dc1a816a68737 \
+    --hash=sha256:50cd26b0cf6664cb3b3dd161ba0a09c9c1343db064e7c69f9f8b551f5104d654 \
+    --hash=sha256:5c4193f70f8069550a1788bd0cd3268ab7d3a2b70583dfe3b2e7f421e9aace06 \
+    --hash=sha256:5dfe9d6a4c39b8b6edd7990091fea4f852888e41919d0e6722fe78dd421db0eb \
+    --hash=sha256:63571bb7897a584ca3249c86dd01c10bcb5fe4296e3568b2e9c1a55356b6410e \
+    --hash=sha256:75621882d2230ab77fb6a03d4cbccd2038511491076e7964ef87306623aa5272 \
+    --hash=sha256:75eb7cadc8da49302f5b659d40ba4f6d94d5045fbd9569c9d058e77b0514c9e4 \
+    --hash=sha256:88a5d6b268e9ad18f3533e184744acdaa2e913b13148160b1152300c949bbb5f \
+    --hash=sha256:8a10968963640e75cc0193e1847616ab4c718e83b6938ae74dea44953950f6b7 \
+    --hash=sha256:90bec6a86b348b4559b6482e2b684db4a9a7eed1fa054b86115a48d58fbbf62a \
+    --hash=sha256:98339aa9911853f131de11010f6dd94c8cec254d3d1f7261528c3b3e3219f139 \
+    --hash=sha256:a99a6b067e5190ac6d12005a4d85aa6227c5606fa93211f86b1dafb16233e57d \
+    --hash=sha256:bffa2eee3b87376cc6b31eee36d05349571c236d1de1175b804b348dc0941e3f \
+    --hash=sha256:c6c2d535a7beb1f8790aaa98fd089ceab2e3dd7ca48aca0af7dc60e6ef93ffe1 \
+    --hash=sha256:cc14e7519fab2a4ed87d31f99c31a3796e4e1fe63a86ebdd1c5a1ea78ebd5896 \
+    --hash=sha256:dd0482f3fc547f1b1b5d6a8b8e08f63fdc250c58ce688dedd8851e6e26cff0f3 \
+    --hash=sha256:dde972a1e11bb7b702ed0e447953e7617723760f420decb97305e66fb4afc54f \
+    --hash=sha256:e54af82d68ef8255535a6cdb353f55d6b8cf418a83e2be3569243787a4f4866f \
+    --hash=sha256:e606e6316911471c8d9b4618e082635cfe98876007556e89ce03d52ff5e8fcf0 \
+    --hash=sha256:f41b018f126aac18583956c54544db437f25c7ee4794bcb23eb38bef8e5e192a \
+    --hash=sha256:f8f4625536926a155b80ad2bbff44f8cc59e9f2ad14cdda7acf4c135b4dc8ff2 \
+    --hash=sha256:fe52dbe47d9deb69b05084abd4b0df7abb39a3c51957c09f635520abd49b29dd
 Pillow==8.3.2 \
     --hash=sha256:0412516dcc9de9b0a1e0ae25a280015809de8270f134cc2c1e32c4eeb397cf30 \
     --hash=sha256:04835e68ef12904bc3e1fd002b33eea0779320d4346082bd5b24bec12ad9c3e9 \
@@ -209,9 +208,9 @@ scipy==1.5.4 \
     --hash=sha256:ed572470af2438b526ea574ff8f05e7f39b44ac37f712105e57fc4d53a6fb660 \
     --hash=sha256:f87b39f4d69cf7d7529d7b1098cb712033b17ea7714aed831b95628f483fd012 \
     --hash=sha256:fa789583fc94a7689b45834453fec095245c7e69c58561dc159b5d5277057e4c
-synr==0.4 \
-    --hash=sha256:2f280cdc73d6f98154c97f13130c9e387635060436a0bf07483bb8c6423ee8aa \
-    --hash=sha256:35cd3e0739ad8a4d52b742534f14149bd70f60f1ff8779d96b3484123ced3640
+synr==0.6.0 \
+    --hash=sha256:0b4e16b10c3988e1981e3372153a31956f74d86752eaaa55e8c4e7b7fe591e4e \
+    --hash=sha256:9399b27d9f21c5d439eae92e0159d6f521cc396d27149ac45473012a205a3c30
 tflite==2.4.0 \
     --hash=sha256:0510db1b48a3eec86bf9bb8d2749cd9d6d26d6a4fb329fd141bde5b4404932d1 \
     --hash=sha256:0796f6ce6eb2aef4a318f5509e5fb0ce808e29cd3094801b4abbb1d8575a28cd
diff --git a/apps/microtvm/ethosu/requirements.txt b/apps/microtvm/ethosu/requirements.txt
index 6c699612dac5..ae8e0aacd738 100644
--- a/apps/microtvm/ethosu/requirements.txt
+++ b/apps/microtvm/ethosu/requirements.txt
@@ -7,8 +7,8 @@ cloudpickle==2.0.0 \
 decorator==5.1.0 \
     --hash=sha256:7b12e7c3c6ab203a29e157335e9122cb03de9ab7264b137594103fd4a683b374 \
     --hash=sha256:e59913af105b9860aa2c8d3272d9de5a56a4e608db9a2f167a8480b323d529a7
-ethos-u-vela==3.2.0 \
-    --hash=sha256:2deb06af5d5c71227aeba9a98cd1f65869250cf70f89759de3f03475a38b7b0b
+ethos-u-vela==3.5.0 \
+    --hash=sha256:e56c2f62e06439f45d07f2e6f41fd133a46fb7b6a2e0e6d3baf7ec1d947baca1
 flatbuffers==1.12 \
     --hash=sha256:63bb9a722d5e373701913e226135b28a6f6ac200d5cc7b4d919fa38d73b44610 \
     --hash=sha256:9e9ef47fa92625c4721036e7c4124182668dc6021d9e7c73704edd395648deb9
@@ -65,41 +65,40 @@ nose==1.3.7 \
     --hash=sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac \
     --hash=sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a \
     --hash=sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98
-numpy==1.19.5 \
-    --hash=sha256:012426a41bc9ab63bb158635aecccc7610e3eff5d31d1eb43bc099debc979d94 \
-    --hash=sha256:06fab248a088e439402141ea04f0fffb203723148f6ee791e9c75b3e9e82f080 \
-    --hash=sha256:0eef32ca3132a48e43f6a0f5a82cb508f22ce5a3d6f67a8329c81c8e226d3f6e \
-    --hash=sha256:1ded4fce9cfaaf24e7a0ab51b7a87be9038ea1ace7f34b841fe3b6894c721d1c \
-    --hash=sha256:2e55195bc1c6b705bfd8ad6f288b38b11b1af32f3c8289d6c50d47f950c12e76 \
-    --hash=sha256:2ea52bd92ab9f768cc64a4c3ef8f4b2580a17af0a5436f6126b08efbd1838371 \
-    --hash=sha256:36674959eed6957e61f11c912f71e78857a8d0604171dfd9ce9ad5cbf41c511c \
-    --hash=sha256:384ec0463d1c2671170901994aeb6dce126de0a95ccc3976c43b0038a37329c2 \
-    --hash=sha256:39b70c19ec771805081578cc936bbe95336798b7edf4732ed102e7a43ec5c07a \
-    --hash=sha256:400580cbd3cff6ffa6293df2278c75aef2d58d8d93d3c5614cd67981dae68ceb \
-    --hash=sha256:43d4c81d5ffdff6bae58d66a3cd7f54a7acd9a0e7b18d97abb255defc09e3140 \
-    --hash=sha256:50a4a0ad0111cc1b71fa32dedd05fa239f7fb5a43a40663269bb5dc7877cfd28 \
-    --hash=sha256:603aa0706be710eea8884af807b1b3bc9fb2e49b9f4da439e76000f3b3c6ff0f \
-    --hash=sha256:6149a185cece5ee78d1d196938b2a8f9d09f5a5ebfbba66969302a778d5ddd1d \
-    --hash=sha256:759e4095edc3c1b3ac031f34d9459fa781777a93ccc633a472a5468587a190ff \
-    --hash=sha256:7fb43004bce0ca31d8f13a6eb5e943fa73371381e53f7074ed21a4cb786c32f8 \
-    --hash=sha256:811daee36a58dc79cf3d8bdd4a490e4277d0e4b7d103a001a4e73ddb48e7e6aa \
-    --hash=sha256:8b5e972b43c8fc27d56550b4120fe6257fdc15f9301914380b27f74856299fea \
-    --hash=sha256:99abf4f353c3d1a0c7a5f27699482c987cf663b1eac20db59b8c7b061eabd7fc \
-    --hash=sha256:a0d53e51a6cb6f0d9082decb7a4cb6dfb33055308c4c44f53103c073f649af73 \
-    --hash=sha256:a12ff4c8ddfee61f90a1633a4c4afd3f7bcb32b11c52026c92a12e1325922d0d \
-    --hash=sha256:a4646724fba402aa7504cd48b4b50e783296b5e10a524c7a6da62e4a8ac9698d \
-    --hash=sha256:a76f502430dd98d7546e1ea2250a7360c065a5fdea52b2dffe8ae7180909b6f4 \
-    --hash=sha256:a9d17f2be3b427fbb2bce61e596cf555d6f8a56c222bd2ca148baeeb5e5c783c \
-    --hash=sha256:ab83f24d5c52d60dbc8cd0528759532736b56db58adaa7b5f1f76ad551416a1e \
-    --hash=sha256:aeb9ed923be74e659984e321f609b9ba54a48354bfd168d21a2b072ed1e833ea \
-    --hash=sha256:c843b3f50d1ab7361ca4f0b3639bf691569493a56808a0b0c54a051d260b7dbd \
-    --hash=sha256:cae865b1cae1ec2663d8ea56ef6ff185bad091a5e33ebbadd98de2cfa3fa668f \
-    --hash=sha256:cc6bd4fd593cb261332568485e20a0712883cf631f6f5e8e86a52caa8b2b50ff \
-    --hash=sha256:cf2402002d3d9f91c8b01e66fbb436a4ed01c6498fffed0e4c7566da1d40ee1e \
-    --hash=sha256:d051ec1c64b85ecc69531e1137bb9751c6830772ee5c1c426dbcfe98ef5788d7 \
-    --hash=sha256:d6631f2e867676b13026e2846180e2c13c1e11289d67da08d71cacb2cd93d4aa \
-    --hash=sha256:dbd18bcf4889b720ba13a27ec2f2aac1981bd41203b3a3b27ba7a33f88ae4827 \
-    --hash=sha256:df609c82f18c5b9f6cb97271f03315ff0dbe481a2a02e56aeb1b1a985ce38e60
+numpy==1.21.3 \
+    --hash=sha256:043e83bfc274649c82a6f09836943e4a4aebe5e33656271c7dbf9621dd58b8ec \
+    --hash=sha256:160ccc1bed3a8371bf0d760971f09bfe80a3e18646620e9ded0ad159d9749baa \
+    --hash=sha256:188031f833bbb623637e66006cf75e933e00e7231f67e2b45cf8189612bb5dc3 \
+    --hash=sha256:28f15209fb535dd4c504a7762d3bc440779b0e37d50ed810ced209e5cea60d96 \
+    --hash=sha256:29fb3dcd0468b7715f8ce2c0c2d9bbbaf5ae686334951343a41bd8d155c6ea27 \
+    --hash=sha256:2a6ee9620061b2a722749b391c0d80a0e2ae97290f1b32e28d5a362e21941ee4 \
+    --hash=sha256:300321e3985c968e3ae7fbda187237b225f3ffe6528395a5b7a5407f73cf093e \
+    --hash=sha256:32437f0b275c1d09d9c3add782516413e98cd7c09e6baf4715cbce781fc29912 \
+    --hash=sha256:3c09418a14471c7ae69ba682e2428cae5b4420a766659605566c0fa6987f6b7e \
+    --hash=sha256:49c6249260890e05b8111ebfc391ed58b3cb4b33e63197b2ec7f776e45330721 \
+    --hash=sha256:4cc9b512e9fb590797474f58b7f6d1f1b654b3a94f4fa8558b48ca8b3cfc97cf \
+    --hash=sha256:508b0b513fa1266875524ba8a9ecc27b02ad771fe1704a16314dc1a816a68737 \
+    --hash=sha256:50cd26b0cf6664cb3b3dd161ba0a09c9c1343db064e7c69f9f8b551f5104d654 \
+    --hash=sha256:5c4193f70f8069550a1788bd0cd3268ab7d3a2b70583dfe3b2e7f421e9aace06 \
+    --hash=sha256:5dfe9d6a4c39b8b6edd7990091fea4f852888e41919d0e6722fe78dd421db0eb \
+    --hash=sha256:63571bb7897a584ca3249c86dd01c10bcb5fe4296e3568b2e9c1a55356b6410e \
+    --hash=sha256:75621882d2230ab77fb6a03d4cbccd2038511491076e7964ef87306623aa5272 \
+    --hash=sha256:75eb7cadc8da49302f5b659d40ba4f6d94d5045fbd9569c9d058e77b0514c9e4 \
+    --hash=sha256:88a5d6b268e9ad18f3533e184744acdaa2e913b13148160b1152300c949bbb5f \
+    --hash=sha256:8a10968963640e75cc0193e1847616ab4c718e83b6938ae74dea44953950f6b7 \
+    --hash=sha256:90bec6a86b348b4559b6482e2b684db4a9a7eed1fa054b86115a48d58fbbf62a \
+    --hash=sha256:98339aa9911853f131de11010f6dd94c8cec254d3d1f7261528c3b3e3219f139 \
+    --hash=sha256:a99a6b067e5190ac6d12005a4d85aa6227c5606fa93211f86b1dafb16233e57d \
+    --hash=sha256:bffa2eee3b87376cc6b31eee36d05349571c236d1de1175b804b348dc0941e3f \
+    --hash=sha256:c6c2d535a7beb1f8790aaa98fd089ceab2e3dd7ca48aca0af7dc60e6ef93ffe1 \
+    --hash=sha256:cc14e7519fab2a4ed87d31f99c31a3796e4e1fe63a86ebdd1c5a1ea78ebd5896 \
+    --hash=sha256:dd0482f3fc547f1b1b5d6a8b8e08f63fdc250c58ce688dedd8851e6e26cff0f3 \
+    --hash=sha256:dde972a1e11bb7b702ed0e447953e7617723760f420decb97305e66fb4afc54f \
+    --hash=sha256:e54af82d68ef8255535a6cdb353f55d6b8cf418a83e2be3569243787a4f4866f \
+    --hash=sha256:e606e6316911471c8d9b4618e082635cfe98876007556e89ce03d52ff5e8fcf0 \
+    --hash=sha256:f41b018f126aac18583956c54544db437f25c7ee4794bcb23eb38bef8e5e192a \
+    --hash=sha256:f8f4625536926a155b80ad2bbff44f8cc59e9f2ad14cdda7acf4c135b4dc8ff2 \
+    --hash=sha256:fe52dbe47d9deb69b05084abd4b0df7abb39a3c51957c09f635520abd49b29dd
 Pillow==8.3.2 \
     --hash=sha256:0412516dcc9de9b0a1e0ae25a280015809de8270f134cc2c1e32c4eeb397cf30 \
     --hash=sha256:04835e68ef12904bc3e1fd002b33eea0779320d4346082bd5b24bec12ad9c3e9 \
@@ -209,9 +208,9 @@ scipy==1.5.4 \
     --hash=sha256:ed572470af2438b526ea574ff8f05e7f39b44ac37f712105e57fc4d53a6fb660 \
     --hash=sha256:f87b39f4d69cf7d7529d7b1098cb712033b17ea7714aed831b95628f483fd012 \
     --hash=sha256:fa789583fc94a7689b45834453fec095245c7e69c58561dc159b5d5277057e4c
-synr==0.4 \
-    --hash=sha256:2f280cdc73d6f98154c97f13130c9e387635060436a0bf07483bb8c6423ee8aa \
-    --hash=sha256:35cd3e0739ad8a4d52b742534f14149bd70f60f1ff8779d96b3484123ced3640
+synr==0.6.0 \
+    --hash=sha256:0b4e16b10c3988e1981e3372153a31956f74d86752eaaa55e8c4e7b7fe591e4e \
+    --hash=sha256:9399b27d9f21c5d439eae92e0159d6f521cc396d27149ac45473012a205a3c30
 tflite==2.4.0 \
     --hash=sha256:0510db1b48a3eec86bf9bb8d2749cd9d6d26d6a4fb329fd141bde5b4404932d1 \
     --hash=sha256:0796f6ce6eb2aef4a318f5509e5fb0ce808e29cd3094801b4abbb1d8575a28cd
diff --git a/docker/install/ubuntu_install_vela.sh b/docker/install/ubuntu_install_vela.sh
index 9e32889cd4eb..8d43a4d6e112 100755
--- a/docker/install/ubuntu_install_vela.sh
+++ b/docker/install/ubuntu_install_vela.sh
@@ -20,4 +20,4 @@ set -e
 set -u
 set -o pipefail
 
-pip3 install ethos-u-vela==3.4.0
+pip3 install ethos-u-vela==3.5.0
diff --git a/gallery/how_to/work_with_microtvm/micro_ethosu.py b/gallery/how_to/work_with_microtvm/micro_ethosu.py
index 8e37a0ea5ec4..386c658ea818 100644
--- a/gallery/how_to/work_with_microtvm/micro_ethosu.py
+++ b/gallery/how_to/work_with_microtvm/micro_ethosu.py
@@ -87,7 +87,7 @@
 #     attrs==21.2.0
 #     cloudpickle==2.0.0
 #     decorator==5.1.0
-#     ethos-u-vela==3.2.0
+#     ethos-u-vela==3.5.0
 #     flatbuffers==1.12
 #     lxml==4.6.3
 #     nose==1.3.7
@@ -95,7 +95,7 @@
 #     Pillow==8.3.2
 #     psutil==5.8.0
 #     scipy==1.5.4
-#     synr==0.4
+#     synr==0.6
 #     tflite==2.4.0
 #     tornado==6.1
 #
diff --git a/python/gen_requirements.py b/python/gen_requirements.py
index 7e2c3e218618..9778937ae80b 100755
--- a/python/gen_requirements.py
+++ b/python/gen_requirements.py
@@ -247,7 +247,7 @@
         "docutils",
         "<0.17",
     ),  # Work around https://github.com/readthedocs/sphinx_rtd_theme/issues/1115
-    ("ethos-u-vela", "==3.2.0"),
+    ("ethos-u-vela", "==3.5.0"),
     ("future", None),
     ("h5py", "==2.10.0"),
     ("image", None),

From 1ff6e99556e228fc4d2af095ca3f5d1b94aeefd8 Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Sat, 19 Nov 2022 07:06:40 +0530
Subject: [PATCH 619/704] [OPENCL][TEXTURE] Test case enhancements and fixes
 for RPC (#13408)

RPC execution fails some times when opened multiple times per test case.
Making remote object initialized from pytest session instead.

Network test cases added.

dtypes enhanced to support multiple inputs.

Co-authored-by: Siva Rama Krishna Reddy B <sivb@blr-ubuntu-ripper.qualcomm.com>
---
 python/tvm/relay/frontend/tflite.py           |   2 +-
 tests/python/relay/opencl_texture/conftest.py |  39 ++++++
 .../test_conv2d_nchw_texture.py               | 100 +++++++++-----
 .../test_conv2d_nhwc_texture.py               |  74 +++++-----
 .../test_depthwise_conv2d_nchw_texture.py     |  24 ++--
 .../test_depthwise_conv2d_nhwc_texture.py     |  24 ++--
 .../relay/opencl_texture/test_network.py      |  66 +++++++++
 .../opencl_texture/test_reduction_texture.py  |  12 +-
 .../opencl_texture/utils/adreno_utils.py      | 130 ++++++++++++++----
 9 files changed, 351 insertions(+), 120 deletions(-)
 create mode 100644 tests/python/relay/opencl_texture/conftest.py
 create mode 100644 tests/python/relay/opencl_texture/test_network.py

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 3d2f4a2f25e6..09e6523534cf 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -1549,7 +1549,7 @@ def convert_gather(self, op):
         assert axis < data_dim, "Axis out of bounds"
 
         if self.has_expr(indices.tensor_idx):
-            indices_expr = self.get_expr(indices.tensor_idx)
+            indices_expr = _op.cast(self.get_expr(indices.tensor_idx), "int32")
         else:
             indices_val = self.get_tensor_value(indices)
             indices_expr = self.exp_tab.new_const(
diff --git a/tests/python/relay/opencl_texture/conftest.py b/tests/python/relay/opencl_texture/conftest.py
new file mode 100644
index 000000000000..6b9c91ec1067
--- /dev/null
+++ b/tests/python/relay/opencl_texture/conftest.py
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import tvm
+from tvm import rpc
+import pytest
+
+
+@pytest.fixture(scope="session")
+def remote():
+    if (
+        "TVM_TRACKER_HOST" in os.environ
+        and "TVM_TRACKER_PORT" in os.environ
+        and "RPC_DEVICE_KEY" in os.environ
+    ):
+
+        rpc_tracker_host = os.environ["TVM_TRACKER_HOST"]
+        rpc_tracker_port = int(os.environ["TVM_TRACKER_PORT"])
+        rpc_device_key = os.environ["RPC_DEVICE_KEY"]
+        tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port)
+        remote = tracker.request(rpc_device_key, priority=0, session_timeout=600)
+        return remote
+    else:
+        return None
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
index 5198cbdf6bc6..a0ca8423478e 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
@@ -30,7 +30,7 @@
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(target, dtype):
+def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(remote, target, dtype):
     input_shape = (1, 32, 42, 42)
     filter_shape = (96, 32, 3, 3)
     bias_shape = (1, 96, 1, 1)
@@ -65,12 +65,14 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, [], gpu_preprocess)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(target, dtype):
+def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(remote, target, dtype):
     input_shape = (1, 32, 40, 40)
     filter_shape = (96, 32, 2, 2)
     bias_shape = (1, 96, 1, 1)
@@ -105,12 +107,14 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, [], gpu_preprocess)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_35_35_strides(target, dtype):
+def test_conv2d_inceptionv3_35_35_strides(remote, target, dtype):
     input_shape = (1, 48, 35, 35)
     filter_shape = (64, 48, 5, 5)
     bias_shape = (1, 64, 1, 1)
@@ -145,12 +149,14 @@ def test_conv2d_inceptionv3_35_35_strides(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, [], gpu_preprocess)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_resnet50_v2_nchw_3c(target, dtype):
+def test_conv2d_resnet50_v2_nchw_3c(remote, target, dtype):
     input_shape = (1, 3, 224, 224)
     filter_shape = (64, 3, 7, 7)
     bias_shape = (1, 64, 1, 1)
@@ -186,12 +192,12 @@ def test_conv2d_resnet50_v2_nchw_3c(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_nchw_3c(target, dtype):
+def test_conv2d_inceptionv3_nchw_3c(remote, target, dtype):
     input_shape = (1, 3, 299, 299)
     filter_shape = (64, 3, 3, 3)
     bias_shape = (1, 64, 1, 1)
@@ -226,12 +232,12 @@ def test_conv2d_inceptionv3_nchw_3c(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_1x1_16c16spatial(target, dtype):
+def test_conv2d_1x1_16c16spatial(remote, target, dtype):
     input_shape = (1, 16, 256, 256)
     filter_shape = (32, 16, 4, 4)
     bias_shape = (1, 32, 1, 1)
@@ -266,12 +272,12 @@ def test_conv2d_1x1_16c16spatial(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_4x4_16c16pad(target, dtype):
+def test_conv2d_4x4_16c16pad(remote, target, dtype):
     input_shape = (1, 32, 256, 256)
     filter_shape = (32, 32, 4, 4)
     bias_shape = (1, 32, 1, 1)
@@ -306,12 +312,12 @@ def test_conv2d_4x4_16c16pad(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_4x4x4_16c16pad(target, dtype):
+def test_conv2d_4x4x4_16c16pad(remote, target, dtype):
     input_shape = (1, 32, 256, 256)
     filter_shape = (4, 32, 4, 4)
     bias_shape = (1, 4, 1, 1)
@@ -346,12 +352,12 @@ def test_conv2d_4x4x4_16c16pad(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_yolov3_v2_nchw_3c(target, dtype):
+def test_conv2d_yolov3_v2_nchw_3c(remote, target, dtype):
     input_shape = (1, 1024, 13, 13)
     filter_shape = (255, 1024, 1, 1)
     A = relay.var("data", shape=input_shape, dtype=dtype)
@@ -379,12 +385,12 @@ def test_conv2d_yolov3_v2_nchw_3c(target, dtype):
         "weight": tvm.nd.array(filter_data),
     }
 
-    build_run_compare(mod, params, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_vgg16_winograd_4d(target, dtype):
+def test_conv2d_vgg16_winograd_4d(remote, target, dtype):
     input_shape = (1, 512, 28, 28)
     filter_shape = (512, 512, 3, 3)
     bias_shape = (1, 512, 1, 1)
@@ -424,7 +430,7 @@ def test_conv2d_vgg16_winograd_4d(target, dtype):
             f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 512, 28, 28], "{dtype}"], ["TENSOR", [512, 512, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
         )
     graph = build_run_compare(
-        mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file
     )
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
@@ -432,7 +438,7 @@ def test_conv2d_vgg16_winograd_4d(target, dtype):
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_winograd_conv(target, dtype):
+def test_conv2d_winograd_conv(remote, target, dtype):
     input_shape = (1, 4, 3, 3)
     A = relay.var("data", shape=input_shape, dtype=dtype)
     filter_shape3 = (8, 4, 3, 3)
@@ -471,7 +477,7 @@ def test_conv2d_winograd_conv(target, dtype):
             f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 4, 3, 3], "{dtype}"], ["TENSOR", [8, 4, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
         )
     graph = build_run_compare(
-        mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file
     )
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
@@ -479,7 +485,7 @@ def test_conv2d_winograd_conv(target, dtype):
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_residual_block(target, dtype):
+def test_residual_block(remote, target, dtype):
     """
     - some kind of residual block followed by convolution to have texture after residual block
     - scalar data type verification which should be mapped to global memory scope
@@ -596,12 +602,14 @@ def test_residual_block(target, dtype):
             "",
         ]
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_concat(target, dtype):
+def test_concat(remote, target, dtype):
     """
         layout_transform (NCHW->NCHW4c)
                   |                      <- buffer
@@ -708,12 +716,14 @@ def test_concat(target, dtype):
 
     static_memory_scope = []
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_pooling_branching_texture_params(target, dtype):
+def test_pooling_branching_texture_params(remote, target, dtype):
     """
     Verification of the pooling and many branches having textures
                 layout_transform (NCHW->NCHW4c)
@@ -834,12 +844,14 @@ def test_pooling_branching_texture_params(target, dtype):
         "",
     ]
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_branching_texture_params(target, dtype):
+def test_branching_texture_params(remote, target, dtype):
     """
     Verification of passing texture to several consumers markup of relay variables in
     primary functions + on_device
@@ -958,13 +970,15 @@ def test_branching_texture_params(target, dtype):
         "",
     ]
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope
+    )
 
 
 # function repeat, params scope are different in reused functions
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_different_lowering_same_op(target, dtype):
+def test_conv2d_different_lowering_same_op(remote, target, dtype):
     """
     Use case for verification of caching compiled functions
     Three convolutions following by each other in this case should be
@@ -1040,12 +1054,14 @@ def test_conv2d_different_lowering_same_op(target, dtype):
         "",
     ]
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_winograd_non_rect(target, dtype):
+def test_conv2d_winograd_non_rect(remote, target, dtype):
     input_shape = (1, 771, 36, 64)
     A = relay.var("data", shape=input_shape, dtype=dtype)
     filter_shape = (128, 771, 3, 3)
@@ -1070,7 +1086,7 @@ def test_conv2d_winograd_non_rect(target, dtype):
             f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256 -texture_spatial_limit=16384 -thread_warp_size=1", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 771, 36, 64], "{dtype}"], ["TENSOR", [128, 771, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 5399, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 16], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 8]], ["tile_rc", "sp", [-1, 193]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
         )
     graph = build_run_compare(
-        mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file
     )
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
@@ -1079,7 +1095,7 @@ def test_conv2d_winograd_non_rect(target, dtype):
 # function repeat, params scope are different in reused functions
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_injective_nwo_inputs1(target, dtype):
+def test_injective_nwo_inputs1(remote, target, dtype):
     """
     Use case for verification of stability of annotation primary functions
     having several ops accepting data outside of Primary function
@@ -1170,13 +1186,15 @@ def test_injective_nwo_inputs1(target, dtype):
         "global",
         "global",
     ]
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope
+    )
 
 
 # function repeat, params scope are different in reused functions
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_injective_nwo_inputs2(target, dtype):
+def test_injective_nwo_inputs2(remote, target, dtype):
     """
     Use case for verification of stability of annotation primary functions
     having several ops accepting data outside of Primary function
@@ -1266,4 +1284,10 @@ def test_injective_nwo_inputs2(target, dtype):
         "global.texture",
         "global",
     ]
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope
+    )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
index 0b89e3dc9c7f..43979cc79a68 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py
@@ -31,7 +31,7 @@
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(target, dtype):
+def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(remote, target, dtype):
     input_shape = (1, 257, 257, 32)
     filter_shape = (1, 1, 32, 16)
     bias_shape = (filter_shape[-1],)
@@ -63,12 +63,12 @@ def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(target, dtype):
+def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(remote, target, dtype):
     input_shape = (1, 257, 257, 32)
     filter_shape = (1, 1, 32, 16)
     bias_shape = (filter_shape[-1],)
@@ -103,12 +103,12 @@ def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_4_35_35_32x3_3_144_16(target, dtype):
+def test_conv2d_4_35_35_32x3_3_144_16(remote, target, dtype):
     input_shape = (4, 35, 35, 32)
     filter_shape = (3, 3, 32, 16)
     bias_shape = (filter_shape[-1],)
@@ -141,12 +141,12 @@ def test_conv2d_4_35_35_32x3_3_144_16(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(target, dtype):
+def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(remote, target, dtype):
     input_shape = (1, 513, 513, 3)
     filter_shape = (3, 3, 3, 32)
     bias_shape = (filter_shape[-1],)
@@ -179,12 +179,12 @@ def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(target, dtype):
+def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(remote, target, dtype):
     input_shape = (1, 42, 42, 32)
     filter_shape = (3, 3, 32, 96)
     bias_shape = (1, 1, 1, 96)
@@ -219,12 +219,14 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, [], gpu_preprocess)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(target, dtype):
+def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(remote, target, dtype):
     input_shape = (1, 40, 40, 32)
     filter_shape = (2, 2, 32, 96)
     bias_shape = (1, 1, 1, 96)
@@ -259,12 +261,14 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, [], gpu_preprocess)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_35_35_strides(target, dtype):
+def test_conv2d_inceptionv3_35_35_strides(remote, target, dtype):
     input_shape = (1, 35, 35, 48)
     filter_shape = (5, 5, 48, 64)
     bias_shape = (1, 1, 1, 64)
@@ -299,12 +303,14 @@ def test_conv2d_inceptionv3_35_35_strides(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, [], gpu_preprocess)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_resnet50_v2_nhwc_3c(target, dtype):
+def test_conv2d_resnet50_v2_nhwc_3c(remote, target, dtype):
     input_shape = (1, 224, 224, 3)
     filter_shape = (7, 7, 3, 64)
     bias_shape = (1, 1, 1, 64)
@@ -340,12 +346,12 @@ def test_conv2d_resnet50_v2_nhwc_3c(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_inceptionv3_nhwc_3c(target, dtype):
+def test_conv2d_inceptionv3_nhwc_3c(remote, target, dtype):
     input_shape = (1, 299, 299, 3)
     filter_shape = (3, 3, 3, 64)
     bias_shape = (1, 1, 1, 64)
@@ -380,12 +386,12 @@ def test_conv2d_inceptionv3_nhwc_3c(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_1x1_16c16spatial(target, dtype):
+def test_conv2d_1x1_16c16spatial(remote, target, dtype):
     input_shape = (1, 128, 128, 16)
     filter_shape = (4, 4, 16, 32)
     bias_shape = (1, 1, 1, 32)
@@ -420,12 +426,12 @@ def test_conv2d_1x1_16c16spatial(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_4x4_16c16pad(target, dtype):
+def test_conv2d_4x4_16c16pad(remote, target, dtype):
     input_shape = (1, 256, 256, 32)
     filter_shape = (4, 4, 32, 32)
     bias_shape = (1, 1, 1, 32)
@@ -460,12 +466,12 @@ def test_conv2d_4x4_16c16pad(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_4x4x4_16c16pad(target, dtype):
+def test_conv2d_4x4x4_16c16pad(remote, target, dtype):
     input_shape = (1, 256, 256, 32)
     filter_shape = (4, 4, 32, 4)
     bias_shape = (1, 1, 1, 4)
@@ -499,12 +505,12 @@ def test_conv2d_4x4x4_16c16pad(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_yolov3_v2_nhwc_3c(target, dtype):
+def test_conv2d_yolov3_v2_nhwc_3c(remote, target, dtype):
     input_shape = (1, 13, 13, 1024)
     filter_shape = (1, 1, 1024, 255)
     A = relay.var("data", shape=input_shape, dtype=dtype)
@@ -532,12 +538,12 @@ def test_conv2d_yolov3_v2_nhwc_3c(target, dtype):
         "weight": tvm.nd.array(filter_data),
     }
 
-    build_run_compare(mod, params, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_vgg16_winograd_4d(target, dtype):
+def test_conv2d_vgg16_winograd_4d(remote, target, dtype):
     input_shape = (1, 28, 28, 512)
     filter_shape = (3, 3, 512, 512)
     bias_shape = (1, 1, 1, 512)
@@ -577,7 +583,7 @@ def test_conv2d_vgg16_winograd_4d(target, dtype):
             f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 512], "{dtype}"], ["TENSOR", [3, 3, 512, 512], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
         )
     graph = build_run_compare(
-        mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file
     )
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
@@ -585,7 +591,7 @@ def test_conv2d_vgg16_winograd_4d(target, dtype):
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_winograd_conv(target, dtype):
+def test_conv2d_winograd_conv(remote, target, dtype):
     input_shape = (1, 3, 3, 4)
     A = relay.var("data", shape=input_shape, dtype=dtype)
     filter_shape3 = (3, 3, 4, 8)
@@ -638,7 +644,7 @@ def test_conv2d_winograd_conv(target, dtype):
             f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 3, 3, 4], "{dtype}"], ["TENSOR", [3, 3, 4, 8], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
         )
     graph = build_run_compare(
-        mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file
     )
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
@@ -646,7 +652,7 @@ def test_conv2d_winograd_conv(target, dtype):
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_conv2d_winograd_non_rect(target, dtype):
+def test_conv2d_winograd_non_rect(remote, target, dtype):
     input_shape = (1, 36, 64, 771)
     A = relay.var("data", shape=input_shape, dtype=dtype)
     filter_shape = (3, 3, 771, 128)
@@ -678,7 +684,11 @@ def test_conv2d_winograd_non_rect(target, dtype):
             f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256 -texture_spatial_limit=16384 -thread_warp_size=1", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 36, 64, 771], "{dtype}"], ["TENSOR", [3, 3, 771, 128], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 5399, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 16], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 8]], ["tile_rc", "sp", [-1, 193]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n'
         )
     graph = build_run_compare(
-        mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file
     )
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py
index 0ac92d03b6f9..00e2c5a8c069 100644
--- a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py
@@ -27,7 +27,7 @@
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_bias_nchwc(target, dtype):
+def test_depthwise_conv2d_bias_nchwc(remote, target, dtype):
     input_shape = (1, 64, 112, 112)
     filter_shape = (64, 1, 3, 3)
     bias_shape = (1, 64, 1, 1)
@@ -64,12 +64,14 @@ def test_depthwise_conv2d_bias_nchwc(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, [], gpu_preprocess)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_nchwc(target, dtype):
+def test_depthwise_conv2d_nchwc(remote, target, dtype):
     input_shape = (1, 64, 112, 112)
     filter_shape = (64, 1, 3, 3)
     bias_shape = (1, 64, 1, 1)
@@ -101,12 +103,14 @@ def test_depthwise_conv2d_nchwc(target, dtype):
         "weight": tvm.nd.array(filter_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, [], gpu_preprocess)
+    build_run_compare(
+        remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess
+    )
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_bias_nchw(target, dtype):
+def test_depthwise_conv2d_bias_nchw(remote, target, dtype):
     input_shape = (1, 64, 112, 112)
     filter_shape = (64, 1, 3, 3)
     bias_shape = (1, 64, 1, 1)
@@ -143,12 +147,12 @@ def test_depthwise_conv2d_bias_nchw(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_repack_bias_nchw(target, dtype):
+def test_depthwise_conv2d_repack_bias_nchw(remote, target, dtype):
     input_shape = (1, 63, 112, 112)
     filter_shape = (63, 1, 3, 3)
     bias_shape = (1, 63, 1, 1)
@@ -185,4 +189,8 @@ def test_depthwise_conv2d_repack_bias_nchw(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py
index 3af7db3a4e1f..7d7f640294ce 100644
--- a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py
+++ b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py
@@ -27,7 +27,7 @@
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(target, dtype):
+def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(remote, target, dtype):
     input_shape = (1, 129, 129, 144)
     filter_shape = (3, 3, 144, 1)
     kernel_size = (filter_shape[0], filter_shape[1])
@@ -62,12 +62,12 @@ def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(target, dtype):
+def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(remote, target, dtype):
     input_shape = (4, 35, 35, 576)
     filter_shape = (3, 3, 576, 1)
     kernel_size = (filter_shape[0], filter_shape[1])
@@ -102,12 +102,12 @@ def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding(target, dtype):
+def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding(remote, target, dtype):
     input_shape = (1, 129, 129, 144)
     filter_shape = (3, 3, 144, 1)
     kernel_size = (filter_shape[0], filter_shape[1])
@@ -144,12 +144,12 @@ def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding(target,
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_1_513_513_7x3_3_7_1(target, dtype):
+def test_depthwise_conv2d_1_513_513_7x3_3_7_1(remote, target, dtype):
     input_shape = (1, 513, 513, 7)
     filter_shape = (3, 3, 7, 1)
     bias_shape = (filter_shape[2],)
@@ -183,12 +183,12 @@ def test_depthwise_conv2d_1_513_513_7x3_3_7_1(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_depthwise_conv2d_1_513_513_3x3_3_3_1(target, dtype):
+def test_depthwise_conv2d_1_513_513_3x3_3_3_1(remote, target, dtype):
     input_shape = (1, 513, 513, 3)
     filter_shape = (3, 3, 3, 1)
     bias_shape = (filter_shape[2],)
@@ -222,4 +222,8 @@ def test_depthwise_conv2d_1_513_513_3x3_3_3_1(target, dtype):
         "bias": tvm.nd.array(bias_data),
     }
 
-    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_network.py b/tests/python/relay/opencl_texture/test_network.py
new file mode 100644
index 000000000000..638be477d06c
--- /dev/null
+++ b/tests/python/relay/opencl_texture/test_network.py
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay import testing
+from tvm.contrib import utils
+from utils.adreno_utils import gpu_preprocess, build_run_compare, get_model
+import pytest
+from tvm.relay.op import register_mixed_precision_conversion
+
+
+def convert_to_fp16(mod, dtype):
+    from tvm.ir import IRModule
+
+    mod = IRModule.from_expr(mod)
+    seq = tvm.transform.Sequential(
+        [relay.transform.InferType(), relay.transform.ToMixedPrecision()]
+    )
+    with tvm.transform.PassContext(opt_level=3):
+        mod = seq(mod)
+        return mod
+
+
+def _test_mobilenet_v1(remote, target, dtype):
+    mod, params, inputs, dtypes = get_model(
+        "https://github.com/mlcommons/mobile_models/raw/main/v0_7/tflite/mobilenet_edgetpu_224_1.0_float.tflite",
+        "mobilenet_edgetpu_224_1.0_float.tflite",
+        "tflite",
+    )
+    if dtype == "float16":
+        mod = convert_to_fp16(mod["main"], dtype)
+    build_run_compare(remote, mod, params, inputs, dtypes, target, [])
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+@pytest.mark.skipif(tvm.testing.utils.IS_IN_CI, reason="CI doesn't support fp16(half datatypes)")
+def test_mobilenet_v1_fp16(remote, target):
+    _test_mobilenet_v1(remote, target, "float16")
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_mobilenet_v1_fp32(remote, target):
+    _test_mobilenet_v1(remote, target, "float32")
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/test_reduction_texture.py b/tests/python/relay/opencl_texture/test_reduction_texture.py
index b14aefd2f9ab..9dc8a8992d27 100644
--- a/tests/python/relay/opencl_texture/test_reduction_texture.py
+++ b/tests/python/relay/opencl_texture/test_reduction_texture.py
@@ -29,23 +29,27 @@
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_mean(target, dtype):
+def test_mean(remote, target, dtype):
     # NCHW
     input_shape = (1, 3, 720, 1280)
     A = relay.var("data", shape=input_shape, dtype=dtype)
     mean = relay.mean(A, axis=1, keepdims=True)
     mod = relay.Function([A], mean)
 
-    build_run_compare(mod, {}, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
 
 
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
-def test_argmax(target, dtype):
+def test_argmax(remote, target, dtype):
     # NCHW
     input_shape = (1, 3, 720, 1280)
     A = relay.var("data", shape=input_shape, dtype=dtype)
     argmax = relay.op.argmax(A, axis=[1])
     mod = relay.Function([A], argmax)
 
-    build_run_compare(mod, {}, {"data": input_shape}, dtype, target)
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relay/opencl_texture/utils/adreno_utils.py b/tests/python/relay/opencl_texture/utils/adreno_utils.py
index 27768c3d0cec..e2a271d9f68d 100644
--- a/tests/python/relay/opencl_texture/utils/adreno_utils.py
+++ b/tests/python/relay/opencl_texture/utils/adreno_utils.py
@@ -21,6 +21,8 @@
 import numpy as np
 from tvm import relay
 from tvm import autotvm
+from tvm import rpc
+from tvm.contrib import utils, ndk
 from tvm.relay import testing
 from tvm.relay.transform import recast
 from tvm.contrib import graph_runtime
@@ -47,25 +49,20 @@ def get_cpu_reference(mod, params1, input_shape, inputs):
 
 # build module run with opencl and cpu, compare results
 def build_run_compare(
+    remote,
     tvm_mod,
     params1,
     input_shape,
-    dtype="float32",
+    dtypes,
     target="llvm",
     static_mem_scopes=[],
     gpu_preprocess=None,
     stat_file=None,
 ):
-
-    if "TVM_TRACKER_HOST" in os.environ and "TVM_TRACKER_PORT" in os.environ:
-        rpc_tracker_host = os.environ["TVM_TRACKER_HOST"]
-        rpc_tracker_port = os.environ["TVM_TRACKER_PORT"]
-        run_on_host = 0
-        target_host = "llvm -mtriple=arm64-linux-android"
-        rpc_tracker_port = int(rpc_tracker_port)
-    else:
-        run_on_host = 1
+    if remote is None:
         target_host = "llvm"
+    else:
+        target_host = "llvm -mtriple=arm64-linux-android"
 
     if gpu_preprocess:
         tvm_mod_nchwc = gpu_preprocess(tvm_mod)
@@ -97,16 +94,10 @@ def build_run_compare(
     for i in range(0, len(static_mem_scopes)):
         assert static_mem_scopes[i] == graph_json["attrs"]["storage_scope"][1][i]
 
-    if run_on_host:
+    if remote is None:
         ctx = tvm.opencl()
         m = graph_runtime.create(graph, lib, ctx)
     else:
-        from tvm import rpc
-        from tvm.contrib import utils, ndk
-
-        rpc_key = "android"
-        tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port)
-        remote = tracker.request(rpc_key, priority=0, session_timeout=600)
         temp = utils.tempdir()
         dso_binary = "dev_lib_cl.so"
         dso_binary_path = temp.relpath(dso_binary)
@@ -117,22 +108,15 @@ def build_run_compare(
         m = graph_runtime.create(graph, rlib, ctx)
     m.set_input(**params)
     inputs = []
-    if isinstance(input_shape, dict):
-        for key in input_shape:
-            inputs.append(np.random.normal(size=input_shape[key]).astype(dtype))
-            m.set_input(key, inputs[-1])
-    else:
-        inputs.append(np.random.normal(size=input_shape).astype(dtype))
-        m.set_input("data", inputs[-1])
+    for key in input_shape:
+        inputs.append(np.random.normal(size=input_shape[key]).astype(dtypes[key]))
+        m.set_input(key, inputs[-1])
     m.run()
 
     ref_outputs = get_cpu_reference(tvm_mod, params1, input_shape, inputs)
     for i, ref_output in enumerate(ref_outputs):
         tvm_output = m.get_output(i)
         output = tvm_output.asnumpy()
-        # for index, x in np.ndenumerate(ref_output):
-        #     if abs(output[index] - x) > 0.01:
-        #         print(index, output[index], x)
 
         np.testing.assert_allclose(output, ref_output, rtol=1e-1, atol=1e-1)
     return graph
@@ -147,3 +131,95 @@ def gpu_preprocess(tvm_mod):
             mod = tvm.IRModule.from_expr(tvm_mod)
             tvm_mod_nchwc = seq(mod)
             return tvm_mod_nchwc
+
+
+def get_model(url, local_file, module):
+    def get_tensor_type_str(tensor_type):
+        """Get tensor type string representation when given TFLite tensor type"""
+        try:
+            from tflite.TensorType import TensorType
+        except ImportError:
+            raise ImportError("The tflite package must be installed")
+
+        if tensor_type == TensorType.INT8:
+            return "int8"
+        if tensor_type == TensorType.INT16:
+            return "int16"
+        if tensor_type == TensorType.UINT8:
+            return "uint8"
+        if tensor_type == TensorType.FLOAT16:
+            return "float16"
+        if tensor_type == TensorType.FLOAT32:
+            return "float32"
+        if tensor_type == TensorType.INT32:
+            return "int32"
+        if tensor_type == TensorType.INT64:
+            return "int64"
+        if tensor_type == TensorType.BOOL:
+            return "bool"
+        raise NotImplementedError(
+            "Tensor type {} is currently not supported".format(str(tensor_type))
+        )
+
+    if url is None:
+        model_path = local_file
+    else:
+        model_path = tvm.contrib.download.download_testdata(url, local_file, module=module)
+
+    with open(model_path, "rb") as f:
+        tflite_model_buf = f.read()
+
+    try:
+        import tflite.Model
+
+        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+    except AttributeError:
+        import tflite
+
+        tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+    except ImportError:
+        raise ImportError("The tflite package must be installed")
+
+    # keep the same as tflite
+    assert tflite_model.SubgraphsLength() == 1, "only support one subgraph (main subgraph)"
+    subgraph = tflite_model.Subgraphs(0)
+
+    # model inputs
+    model_inputs = subgraph.InputsAsNumpy()
+    shape_dict = {}
+    dtype_dict = {}
+    for model_input in model_inputs:
+        model_input_name = subgraph.Tensors(model_input).Name().decode("utf-8")
+        model_shape_length = subgraph.Tensors(model_input).ShapeLength()
+        model_input_shape = [
+            subgraph.Tensors(model_input).Shape(i) for i in range(model_shape_length)
+        ]
+        shape_dict[model_input_name] = model_input_shape
+        dtype_dict[model_input_name] = get_tensor_type_str(subgraph.Tensors(model_input).Type())
+
+    # model Outputs
+    model_outputs = subgraph.OutputsAsNumpy()
+    shape_dict_out = {}
+    dtype_dict_out = {}
+    for model_output in model_outputs:
+        model_output_name = subgraph.Tensors(model_output).Name().decode("utf-8")
+        model_shape_length = subgraph.Tensors(model_output).ShapeLength()
+        model_output_shape = [
+            subgraph.Tensors(model_output).Shape(i) for i in range(model_shape_length)
+        ]
+        shape_dict_out[model_output_name] = model_output_shape
+        dtype_dict_out[model_output_name] = get_tensor_type_str(
+            subgraph.Tensors(model_input).Type()
+        )
+
+    mod, params = relay.frontend.from_tflite(
+        tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict
+    )
+
+    layout_config = relay.transform.LayoutConfig(skip_layers=[])
+    desired_layouts = {"nn.conv2d": ["NCHW", "default"]}
+    seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
+    with tvm.transform.PassContext(opt_level=3):
+        mod = seq(mod)
+
+    return mod, params, shape_dict, dtype_dict

From 24b7d9fdd0cd5e6236f3f4834d21358c238ff48b Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Fri, 18 Nov 2022 17:41:19 -0800
Subject: [PATCH 620/704] [MetaSchedule] Fix Dynamic Loop from AutoBinding
 (#13421)

This PR fixes an issue that would cause dynamic loop when reorder is applied during auto binding incorrectly. This would cause failed cuda kernel generation. This PR provided a fix by changing the loop to fuse blocks.

Test is also added and locally tested on `nvidia/geforce-rtx-3070` target.

Thanks for Junru Shao for the guidance and Lite Ye for reporting this issue.
---
 .../meta_schedule/schedule/cuda/thread_bind.h |   1 +
 .../schedule/cuda/thread_bind.cc              |   2 +-
 src/meta_schedule/schedule/cuda/winograd.cc   |   3 +-
 .../test_meta_schedule_space_cuda_winograd.py | 241 ++++++++++++++++++
 4 files changed, 245 insertions(+), 2 deletions(-)

diff --git a/include/tvm/meta_schedule/schedule/cuda/thread_bind.h b/include/tvm/meta_schedule/schedule/cuda/thread_bind.h
index ae6d492bfe12..125d6dc11fc8 100644
--- a/include/tvm/meta_schedule/schedule/cuda/thread_bind.h
+++ b/include/tvm/meta_schedule/schedule/cuda/thread_bind.h
@@ -45,6 +45,7 @@ std::function<tir::ExprRV(int64_t)> MakeFactorSampler(tir::Schedule sch,
  * \param max_threadblocks The maximum number of threadblocks allowed.
  * \param max_threads_per_block The maximum number of threads allowed.
  * \param get_factor A function that returns the tiling factor.
+ * \return The binded loops in the order of blockIdx.x, threadIdx.x, and the rest.
  */
 Array<tir::LoopRV> BindSpatialLoop(tir::Schedule sch, tir::LoopRV loop,  //
                                    int64_t max_threadblocks, int64_t max_threads_per_block,
diff --git a/src/meta_schedule/schedule/cuda/thread_bind.cc b/src/meta_schedule/schedule/cuda/thread_bind.cc
index e5dd5068783d..b651b1f401cb 100644
--- a/src/meta_schedule/schedule/cuda/thread_bind.cc
+++ b/src/meta_schedule/schedule/cuda/thread_bind.cc
@@ -80,7 +80,7 @@ Array<LoopRV> BindSpatialLoop(Schedule sch, LoopRV loop, int64_t max_threadblock
     sch->Reorder({splits[1], splits[2], splits[0]});
     sch->Bind(splits[1], "blockIdx.x");
     sch->Bind(splits[2], "threadIdx.x");
-    return {splits[1], splits[2]};
+    return {splits[1], splits[2], splits[0]};
   }
 }
 
diff --git a/src/meta_schedule/schedule/cuda/winograd.cc b/src/meta_schedule/schedule/cuda/winograd.cc
index 5334c4df2ac9..59ed7bdc009a 100644
--- a/src/meta_schedule/schedule/cuda/winograd.cc
+++ b/src/meta_schedule/schedule/cuda/winograd.cc
@@ -117,7 +117,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.cuda.conv2d_nchw_winograd_data_pack")
         sch->Unroll(loops[4]);
         sch->Unroll(loops[5]);
         outer = BindSpatialLoop(sch, sch->Fuse({loops[2], loops[3]}), max_threadblocks,
-                                max_threads_per_block)[1];
+                                max_threads_per_block, /*get_factor=*/nullptr)
+                    .back();
       }
       {
         BlockRV data_pack_local = sch->CacheWrite(data_pack, 0, "local");
diff --git a/tests/python/unittest/test_meta_schedule_space_cuda_winograd.py b/tests/python/unittest/test_meta_schedule_space_cuda_winograd.py
index 16f9e64252ad..53a153b90522 100644
--- a/tests/python/unittest/test_meta_schedule_space_cuda_winograd.py
+++ b/tests/python/unittest/test_meta_schedule_space_cuda_winograd.py
@@ -350,6 +350,247 @@ def cuda_nchw_0(data: T.Buffer[(1, 64, 56, 56), "float32"], weight: T.Buffer[(6,
     )
 
 
+def test_cuda_nchw_add_relu():
+    # fmt: off
+    @T.prim_func
+    def nchw_add_relu(p0: T.Buffer[(2, 2048, 50, 75), "float32"], p1: T.Buffer[(4, 4, 2048, 2048), "float32"], p2: T.Buffer[(1, 2048, 1, 1), "float32"], T_relu: T.Buffer[(2, 2048, 50, 75), "float32"]):
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]})
+        # body
+        # with T.block("root")
+        data_pad = T.alloc_buffer([2, 2048, 52, 77], dtype="float32")
+        input_tile = T.alloc_buffer([2048, 1900, 4, 4], dtype="float32")
+        B = T.alloc_buffer([4, 4], dtype="float32")
+        data_pack = T.alloc_buffer([4, 4, 2048, 1900], dtype="float32")
+        bgemm = T.alloc_buffer([4, 4, 2048, 1900], dtype="float32")
+        A = T.alloc_buffer([4, 2], dtype="float32")
+        inverse = T.alloc_buffer([2048, 1900, 2, 2], dtype="float32")
+        conv2d_winograd = T.alloc_buffer([2, 2048, 50, 75], dtype="float32")
+        T_add = T.alloc_buffer([2, 2048, 50, 75], dtype="float32")
+        for i0, i1, i2, i3 in T.grid(2, 2048, 52, 77):
+            with T.block("data_pad"):
+                i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(p0[i0_1, i1_1, i2_1 - 1, i3_1 - 1])
+                T.writes(data_pad[i0_1, i1_1, i2_1, i3_1])
+                data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i2_1 and i2_1 < 51 and 1 <= i3_1 and i3_1 < 76, p0[i0_1, i1_1, i2_1 - 1, i3_1 - 1], T.float32(0), dtype="float32")
+        for i0, i1, i2, i3 in T.grid(2048, 1900, 4, 4):
+            with T.block("input_tile"):
+                ci, p, eps, nu = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(data_pad[p // 950, ci, p % 950 // 38 * 2 + eps, p % 38 * 2 + nu])
+                T.writes(input_tile[ci, p, eps, nu])
+                T.block_attr({"schedule_rule":"None"})
+                input_tile[ci, p, eps, nu] = data_pad[p // 950, ci, p % 950 // 38 * 2 + eps, p % 38 * 2 + nu]
+        for i0, i1 in T.grid(4, 4):
+            with T.block("B"):
+                i, j = T.axis.remap("SS", [i0, i1])
+                T.reads()
+                T.writes(B[i, j])
+                T.block_attr({"schedule_rule":"None"})
+                B[i, j] = T.Select(i % 4 == 3 and j % 4 == 3, T.float32(1), T.Select(i % 4 == 3 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 0, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))
+        for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 2048, 1900, 4, 4):
+            with T.block("data_pack"):
+                eps, nu, ci, p, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+                T.reads(input_tile[ci, p, r_a, r_b], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1])
+                T.writes(data_pack[eps, nu, ci, p])
+                T.block_attr({"schedule_rule":"conv2d_nchw_winograd_data_pack"})
+                with T.init():
+                    data_pack[eps, nu, ci, p] = T.float32(0)
+                data_pack[eps, nu, ci, p] = data_pack[eps, nu, ci, p] + input_tile[ci, p, r_a, r_b] * B[r_a, eps] * B[r_b, nu]
+        for i0, i1, i2, i3, i4 in T.grid(4, 4, 2048, 1900, 2048):
+            with T.block("bgemm"):
+                eps, nu, co, p, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4])
+                T.reads(data_pack[eps, nu, ci, p], p1[eps, nu, ci, co])
+                T.writes(bgemm[eps, nu, co, p])
+                with T.init():
+                    bgemm[eps, nu, co, p] = T.float32(0)
+                bgemm[eps, nu, co, p] = bgemm[eps, nu, co, p] + data_pack[eps, nu, ci, p] * p1[eps, nu, ci, co]
+        for i0, i1 in T.grid(4, 2):
+            with T.block("A"):
+                i, j = T.axis.remap("SS", [i0, i1])
+                T.reads()
+                T.writes(A[i, j])
+                T.block_attr({"schedule_rule":"None"})
+                A[i, j] = T.Select(i % 4 == 3 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 3 and j % 2 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 1 and j % 2 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 0 and j % 2 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 2 == 0, T.float32(1), T.float32(0)))))))))
+        for i0, i1, i2, i3, i4, i5 in T.grid(2048, 1900, 2, 2, 4, 4):
+            with T.block("inverse"):
+                co, p, vh, vw, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+                T.reads(bgemm[r_a, r_b, co, p], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1])
+                T.writes(inverse[co, p, vh, vw])
+                T.block_attr({"schedule_rule":"conv2d_nchw_winograd_inverse"})
+                with T.init():
+                    inverse[co, p, vh, vw] = T.float32(0)
+                inverse[co, p, vh, vw] = inverse[co, p, vh, vw] + bgemm[r_a, r_b, co, p] * A[r_a, vh] * A[r_b, vw]
+        for i0, i1, i2, i3 in T.grid(2, 2048, 50, 75):
+            with T.block("conv2d_winograd"):
+                n, co, h, w = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(inverse[co, n * 950 + h // 2 * 38 + w // 2, h % 2, w % 2])
+                T.writes(conv2d_winograd[n, co, h, w])
+                conv2d_winograd[n, co, h, w] = inverse[co, n * 950 + h // 2 * 38 + w // 2, h % 2, w % 2]
+        for i0, i1, i2, i3 in T.grid(2, 2048, 50, 75):
+            with T.block("T_add"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], p2[0, ax1, 0, 0])
+                T.writes(T_add[ax0, ax1, ax2, ax3])
+                T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + p2[0, ax1, 0, 0]
+        for i0, i1, i2, i3 in T.grid(2, 2048, 50, 75):
+            with T.block("T_relu"):
+                ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+                T.reads(T_add[ax0, ax1, ax2, ax3])
+                T.writes(T_relu[ax0, ax1, ax2, ax3])
+                T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0))
+
+    @T.prim_func
+    def nchw_add_relu_scheduled(p0: T.Buffer[(2, 2048, 50, 75), "float32"], p1: T.Buffer[(4, 4, 2048, 2048), "float32"], p2: T.Buffer[(1, 2048, 1, 1), "float32"], T_relu: T.Buffer[(2, 2048, 50, 75), "float32"]):
+        # function attr dict
+        T.func_attr({"layout_free_buffers": [1], "tir.noalias": True, "global_symbol": "main"})
+        # body
+        with T.block("root"):
+            T.reads()
+            T.writes()
+            T.block_attr({"meta_schedule.unroll_explicit":1024})
+            input_tile_local = T.alloc_buffer([2048, 1900, 4, 4], dtype="float32", scope="local")
+            data_pack = T.alloc_buffer([4, 4, 2048, 1900], dtype="float32")
+            bgemm = T.alloc_buffer([4, 4, 2048, 1900], dtype="float32")
+            inverse_local = T.alloc_buffer([2048, 1900, 2, 2], dtype="float32", scope="local")
+            data_pack_local = T.alloc_buffer([4, 4, 2048, 1900], dtype="float32", scope="local")
+            bgemm_local = T.alloc_buffer([4, 4, 2048, 1900], dtype="float32", scope="local")
+            data_pack_shared = T.alloc_buffer([4, 4, 2048, 1900], dtype="float32", scope="shared")
+            p1_shared = T.alloc_buffer([4, 4, 2048, 2048], dtype="float32", scope="shared")
+            for i2_i3_fused_1 in T.thread_binding(256, thread="blockIdx.x"):
+                for i2_i3_fused_2 in T.thread_binding(1024, thread="threadIdx.x"):
+                    for i2_i3_fused_0 in T.serial(15):
+                        for ax0, ax1, ax2, ax3 in T.grid(1, 1, 4, 4):
+                            with T.block("input_tile"):
+                                T.where(i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2 < 3891200)
+                                ci = T.axis.spatial(2048, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) // 1900 + ax0)
+                                p = T.axis.spatial(1900, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) % 1900 + ax1)
+                                eps, nu = T.axis.remap("SS", [ax2, ax3])
+                                T.reads(p0[p // 950, ci, p % 950 // 38 * 2 + eps - 1, p % 38 * 2 + nu - 1])
+                                T.writes(input_tile_local[ci, p, eps, nu])
+                                T.block_attr({"schedule_rule":"None"})
+                                input_tile_local[ci, p, eps, nu] = T.if_then_else(1 <= p % 950 // 38 * 2 + eps and p % 950 // 38 * 2 + eps < 51 and 1 <= p % 38 * 2 + nu and p % 38 * 2 + nu < 76, p0[p // 950, ci, p % 950 // 38 * 2 + eps - 1, p % 38 * 2 + nu - 1], T.float32(0), dtype="float32")
+                        for i0 in T.unroll(4):
+                            for i1 in T.unroll(4):
+                                for i4 in T.unroll(4):
+                                    for i5 in T.unroll(4):
+                                        with T.block("data_pack"):
+                                            T.where((i2_i3_fused_0 * 256 + i2_i3_fused_1) * 1024 + i2_i3_fused_2 < 3891200)
+                                            eps, nu = T.axis.remap("SS", [i0, i1])
+                                            ci = T.axis.spatial(2048, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) // 1900)
+                                            p = T.axis.spatial(1900, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) % 1900)
+                                            r_a, r_b = T.axis.remap("RR", [i4, i5])
+                                            T.reads(input_tile_local[ci, p, r_a, r_b])
+                                            T.writes(data_pack_local[eps, nu, ci, p])
+                                            T.block_attr({"schedule_rule":"conv2d_nchw_winograd_data_pack"})
+                                            with T.init():
+                                                data_pack_local[eps, nu, ci, p] = T.float32(0)
+                                            data_pack_local[eps, nu, ci, p] = data_pack_local[eps, nu, ci, p] + input_tile_local[ci, p, r_a, r_b] * T.Select(r_a % 4 == 3 and eps % 4 == 3, T.float32(1), T.Select(r_a % 4 == 3 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 1, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 0, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 3, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 1 and eps % 4 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) * T.Select(r_b % 4 == 3 and nu % 4 == 3, T.float32(1), T.Select(r_b % 4 == 3 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 1, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 0, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 3, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 1 and nu % 4 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))
+                        for ax0, ax1, ax2, ax3 in T.grid(4, 4, 1, 1):
+                            with T.block("data_pack_local"):
+                                T.where(i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2 < 3891200)
+                                v0, v1 = T.axis.remap("SS", [ax0, ax1])
+                                v2 = T.axis.spatial(2048, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) // 1900 + ax2)
+                                v3 = T.axis.spatial(1900, (i2_i3_fused_0 * 262144 + i2_i3_fused_1 * 1024 + i2_i3_fused_2) % 1900 + ax3)
+                                T.reads(data_pack_local[v0, v1, v2, v3])
+                                T.writes(data_pack[v0, v1, v2, v3])
+                                data_pack[v0, v1, v2, v3] = data_pack_local[v0, v1, v2, v3]
+            for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(24320, thread="blockIdx.x"):
+                for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(2, thread="vthread.x"):
+                    for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(64, thread="threadIdx.x"):
+                        for i4_0 in T.serial(256):
+                            for ax0_ax1_ax2_ax3_fused in T.serial(640):
+                                with T.block("data_pack_shared"):
+                                    v0 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 12160 * 2 + ax0_ax1_ax2_ax3_fused // 320)
+                                    v1 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused % 12160 // 6080 * 2 + ax0_ax1_ax2_ax3_fused % 320 // 160)
+                                    v2 = T.axis.spatial(2048, i4_0 * 8 + ax0_ax1_ax2_ax3_fused % 160 // 20)
+                                    v3 = T.axis.spatial(1900, i0_0_i1_0_i2_0_i3_0_fused % 95 * 20 + ax0_ax1_ax2_ax3_fused % 20)
+                                    T.reads(data_pack[v0, v1, v2, v3])
+                                    T.writes(data_pack_shared[v0, v1, v2, v3])
+                                    T.block_attr({"meta_schedule.cooperative_fetch":1})
+                                    data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3]
+                            for ax0_ax1_ax2_ax3_fused in T.serial(1024):
+                                with T.block("p1_shared"):
+                                    v0 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 12160 * 2 + ax0_ax1_ax2_ax3_fused // 512)
+                                    v1 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused % 12160 // 6080 * 2 + ax0_ax1_ax2_ax3_fused % 512 // 256)
+                                    v2 = T.axis.spatial(2048, i4_0 * 8 + ax0_ax1_ax2_ax3_fused % 256 // 32)
+                                    v3 = T.axis.spatial(2048, i0_0_i1_0_i2_0_i3_0_fused % 6080 // 95 * 32 + ax0_ax1_ax2_ax3_fused % 32)
+                                    T.reads(p1[v0, v1, v2, v3])
+                                    T.writes(p1_shared[v0, v1, v2, v3])
+                                    T.block_attr({"meta_schedule.cooperative_fetch":4})
+                                    p1_shared[v0, v1, v2, v3] = p1[v0, v1, v2, v3]
+                            for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 2, 1, 1, 8, 1, 1, 2, 5):
+                                with T.block("bgemm"):
+                                    eps = T.axis.spatial(4, i0_4 + i0_0_i1_0_i2_0_i3_0_fused // 12160 * 2 + i0_2_i1_2_i2_2_i3_2_fused // 32 + i0_3)
+                                    nu = T.axis.spatial(4, i1_4 + i0_0_i1_0_i2_0_i3_0_fused % 12160 // 6080 * 2 + i1_3)
+                                    co = T.axis.spatial(2048, i0_0_i1_0_i2_0_i3_0_fused % 6080 // 95 * 32 + i0_1_i1_1_i2_1_i3_1_fused * 16 + i0_2_i1_2_i2_2_i3_2_fused % 32 // 4 * 2 + i2_3 * 2 + i2_4)
+                                    p = T.axis.spatial(1900, i0_0_i1_0_i2_0_i3_0_fused % 95 * 20 + i0_2_i1_2_i2_2_i3_2_fused % 4 * 5 + i3_3 * 5 + i3_4)
+                                    ci = T.axis.reduce(2048, i4_0 * 8 + i4_1 * 8 + i4_2)
+                                    T.reads(data_pack_shared[eps, nu, ci, p], p1_shared[eps, nu, ci, co])
+                                    T.writes(bgemm_local[eps, nu, co, p])
+                                    T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"})
+                                    with T.init():
+                                        bgemm_local[eps, nu, co, p] = T.float32(0)
+                                    bgemm_local[eps, nu, co, p] = bgemm_local[eps, nu, co, p] + data_pack_shared[eps, nu, ci, p] * p1_shared[eps, nu, ci, co]
+                        for ax0, ax1, ax2, ax3 in T.grid(1, 2, 2, 5):
+                            with T.block("bgemm_local"):
+                                v0 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 12160 * 2 + i0_2_i1_2_i2_2_i3_2_fused // 32 + ax0)
+                                v1 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused % 12160 // 6080 * 2 + ax1)
+                                v2 = T.axis.spatial(2048, i0_0_i1_0_i2_0_i3_0_fused % 6080 // 95 * 32 + i0_1_i1_1_i2_1_i3_1_fused * 16 + i0_2_i1_2_i2_2_i3_2_fused % 32 // 4 * 2 + ax2)
+                                v3 = T.axis.spatial(1900, i0_0_i1_0_i2_0_i3_0_fused % 95 * 20 + i0_2_i1_2_i2_2_i3_2_fused % 4 * 5 + ax3)
+                                T.reads(bgemm_local[v0, v1, v2, v3])
+                                T.writes(bgemm[v0, v1, v2, v3])
+                                bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3]
+            for i0, i1, i2_0, i3_0, ax0, ax1 in T.grid(2, 2048, 25, 38, 1, 1):
+                for ax2 in T.unroll(2):
+                    for ax3 in T.unroll(2):
+                        for ax4 in T.unroll(4):
+                            for ax5 in T.unroll(4):
+                                with T.block("inverse"):
+                                    co = T.axis.spatial(2048, i1 + ax0)
+                                    p = T.axis.spatial(1900, i0 * 950 + i2_0 * 38 + i3_0 + ax1)
+                                    vh, vw, r_a, r_b = T.axis.remap("SSRR", [ax2, ax3, ax4, ax5])
+                                    T.reads(bgemm[r_a, r_b, co, p])
+                                    T.writes(inverse_local[co, p, vh, vw])
+                                    T.block_attr({"schedule_rule":"conv2d_nchw_winograd_inverse"})
+                                    with T.init():
+                                        inverse_local[co, p, vh, vw] = T.float32(0)
+                                    inverse_local[co, p, vh, vw] = inverse_local[co, p, vh, vw] + bgemm[r_a, r_b, co, p] * T.Select(r_a % 4 == 3 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 3 and vh % 2 == 0, T.float32(0), T.Select(r_a % 4 == 2 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 2 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 1 and vh % 2 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 0 and vh % 2 == 1, T.float32(0), T.Select(r_a % 4 == 0 and vh % 2 == 0, T.float32(1), T.float32(0))))))))) * T.Select(r_b % 4 == 3 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 3 and vw % 2 == 0, T.float32(0), T.Select(r_b % 4 == 2 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 2 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 1 and vw % 2 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 0 and vw % 2 == 1, T.float32(0), T.Select(r_b % 4 == 0 and vw % 2 == 0, T.float32(1), T.float32(0)))))))))
+            for i0_i1_i2_i3_fused_1 in T.thread_binding(256, thread="blockIdx.x"):
+                for i0_i1_i2_i3_fused_2 in T.thread_binding(1024, thread="threadIdx.x"):
+                    for i0_i1_i2_i3_fused_0 in T.serial(59):
+                        with T.block("T_add"):
+                            T.where((i0_i1_i2_i3_fused_0 * 256 + i0_i1_i2_i3_fused_1) * 1024 + i0_i1_i2_i3_fused_2 < 15360000)
+                            ax0 = T.axis.spatial(2, (i0_i1_i2_i3_fused_0 * 262144 + i0_i1_i2_i3_fused_1 * 1024 + i0_i1_i2_i3_fused_2) // 7680000)
+                            ax1 = T.axis.spatial(2048, (i0_i1_i2_i3_fused_0 * 262144 + i0_i1_i2_i3_fused_1 * 1024 + i0_i1_i2_i3_fused_2) % 7680000 // 3750)
+                            ax2 = T.axis.spatial(50, (i0_i1_i2_i3_fused_0 * 262144 + i0_i1_i2_i3_fused_1 * 1024 + i0_i1_i2_i3_fused_2) % 3750 // 75)
+                            ax3 = T.axis.spatial(75, (i0_i1_i2_i3_fused_0 * 262144 + i0_i1_i2_i3_fused_1 * 1024 + i0_i1_i2_i3_fused_2) % 75)
+                            T.reads(inverse_local[ax1, ax0 * 950 + ax2 // 2 * 38 + ax3 // 2, ax2 % 2, ax3 % 2], p2[0, ax1, 0, 0])
+                            T.writes(T_relu[ax0, ax1, ax2, ax3])
+                            T_relu[ax0, ax1, ax2, ax3] = T.max(inverse_local[ax1, ax0 * 950 + ax2 // 2 * 38 + ax3 // 2, ax2 % 2, ax3 % 2] + p2[0, ax1, 0, 0], T.float32(0))
+    # fmt: on
+    decision_0 = [
+        ("SamplePerfectTile", [2, 1, 2, 1, 1]),
+        ("SamplePerfectTile", [2, 1, 1, 2, 1]),
+        ("SamplePerfectTile", [64, 2, 8, 1, 2]),
+        ("SamplePerfectTile", [95, 1, 4, 1, 5]),
+        ("SamplePerfectTile", [256, 1, 8]),
+        ("SampleCategorical", 0),
+        ("SampleCategorical", 3),
+        ("SampleCategorical", 4),
+    ]
+    with _target():
+        mod = nchw_add_relu
+    actual = _design_space(mod)
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[nchw_add_relu_scheduled],
+        expected_decisions=[decision_0],
+        debug_mask=0,
+    )
+
+
 if __name__ == "__main__":
     test_cuda_nhwc()
     test_cuda_nchw()
+    test_cuda_nchw_add_relu()

From 76a6e7141f01501b298ef08dc50a1d22fba1573b Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 18 Nov 2022 19:42:09 -0600
Subject: [PATCH 621/704] [TVMScript] Output elif where possible (#13433)

When a TIR `IfThenElseNode::else_case` is itself a `IfThenElse` node,
it may be printed as `elif` instead of `else: if ...`.  The `elif`
syntax was already handled on the parsing side, so this commit only
changes the TVMScript printer.

This change also incidentally altered the printer's output for an else
block whose condition is `True`.  Previously, these were dropped from
the output altogether.  Now, they are retained in the else block.
Removing no-op statements should be done by a transformation pass, not
a printer/parser round trip, so this change in behavior makes sense to
keep.
---
 src/printer/tvmscript_printer.cc              | 18 ++++++--
 .../unittest/test_tvmscript_roundtrip.py      | 42 +++++++++++++++++++
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index 0dc6240bc6ca..8f012f3b0eb3 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -1259,10 +1259,22 @@ Doc TVMScriptPrinter::VisitStmt_(const IfThenElseNode* op) {
   Doc doc;
   doc << "if " << Print(op->condition) << ":";
   doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->then_case));
-  if (!is_one(op->condition) && op->else_case) {
-    doc << Doc::NewLine();
-    doc << "else:" << Doc::Indent(4, Doc::NewLine() << PrintBody(op->else_case.value()));
+
+  Optional<Stmt> else_case = op->else_case;
+  while (else_case) {
+    if (auto* else_if = else_case.value().as<IfThenElseNode>()) {
+      doc << Doc::NewLine();
+      doc << "elif " << Print(else_if->condition) << ":";
+      doc << Doc::Indent(4, Doc::NewLine() << PrintBody(else_if->then_case));
+
+      else_case = else_if->else_case;
+    } else {
+      doc << Doc::NewLine();
+      doc << "else:" << Doc::Indent(4, Doc::NewLine() << PrintBody(else_case.value()));
+      break;
+    }
   }
+
   return doc;
 }
 
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index b8c8379c8a16..53b3cd69ea80 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3467,6 +3467,45 @@ def func(A: T.Buffer[1, "int32"]):
     return func
 
 
+def if_true_else():
+    @T.prim_func
+    def func() -> None:
+        if True:
+            T.evaluate(0)
+        else:
+            T.evaluate(1)
+
+    return func
+
+
+def elif_chain_without_else():
+    @T.prim_func
+    def func(i: T.int32) -> None:
+        if i == 0:
+            T.evaluate(0)
+        elif i == 1:
+            T.evaluate(1)
+        elif i == 2:
+            T.evaluate(2)
+
+    return func
+
+
+def elif_chain_with_else():
+    @T.prim_func
+    def func(i: T.int32) -> None:
+        if i == 0:
+            T.evaluate(0)
+        elif i == 1:
+            T.evaluate(1)
+        elif i == 2:
+            T.evaluate(2)
+        else:
+            T.evaluate(3)
+
+    return func
+
+
 ir_generator = tvm.testing.parameter(
     opt_gemm_normalize,
     opt_gemm_lower,
@@ -3519,6 +3558,9 @@ def func(A: T.Buffer[1, "int32"]):
     bool_cast,
     return_none,
     implicit_evaluate,
+    if_true_else,
+    elif_chain_without_else,
+    elif_chain_with_else,
 )
 
 
From 26d9b5a7c60fa8f7fe600f569274319c70702631 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Sat, 19 Nov 2022 07:29:31 -0800
Subject: [PATCH 622/704] [Hotfix] Skip Flaky Tests (Tracked in #13443)
 (#13444)

This PR skips a flaky test to unblock CI on main.
---
 tests/python/relay/opencl_texture/test_network.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/python/relay/opencl_texture/test_network.py b/tests/python/relay/opencl_texture/test_network.py
index 638be477d06c..46ee79697ea6 100644
--- a/tests/python/relay/opencl_texture/test_network.py
+++ b/tests/python/relay/opencl_texture/test_network.py
@@ -16,14 +16,15 @@
 # under the License.
 
 import re
-import tvm
+
 import numpy as np
+import pytest
+import tvm
 from tvm import relay
-from tvm.relay import testing
 from tvm.contrib import utils
-from utils.adreno_utils import gpu_preprocess, build_run_compare, get_model
-import pytest
+from tvm.relay import testing
 from tvm.relay.op import register_mixed_precision_conversion
+from utils.adreno_utils import build_run_compare, get_model, gpu_preprocess
 
 
 def convert_to_fp16(mod, dtype):
@@ -49,6 +50,7 @@ def _test_mobilenet_v1(remote, target, dtype):
     build_run_compare(remote, mod, params, inputs, dtypes, target, [])
 
 
+@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/13443")
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
 @pytest.mark.skipif(tvm.testing.utils.IS_IN_CI, reason="CI doesn't support fp16(half datatypes)")
@@ -56,6 +58,7 @@ def test_mobilenet_v1_fp16(remote, target):
     _test_mobilenet_v1(remote, target, "float16")
 
 
+@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/13443")
 @tvm.testing.requires_opencl
 @tvm.testing.parametrize_targets("opencl -device=adreno")
 def test_mobilenet_v1_fp32(remote, target):

From d6632070a01e23270f9f480efc39d09fc38eb55f Mon Sep 17 00:00:00 2001
From: Ruihang Lai <ruihangl@cs.cmu.edu>
Date: Sun, 20 Nov 2022 22:40:55 -0500
Subject: [PATCH 623/704] [Fix] Fix IndexDataTypeNormalizer to avoid redundant
 casting (#13449)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR fixes the behavior of IndexDataTypeNormalizer on CastNode.

## Background

Consider the following case,
```python
A = te.placeholder((tir.IntImm("int64", 2), tir.IntImm("int64", 4)), name="A")
B = topi.reshape(A, (4, 2))
func = te.create_prim_func([A, B], index_dtype_override=None)
```
the generated PrimFunc is
```python
@T.prim_func
def func(A: T.Buffer[(T.int64(2), T.int64(4)), "float32"], T_reshape: T.Buffer[(4, 2), "float32"]):
    for i0, i1 in T.grid(4, 2):
        with T.block("T_reshape"):
            ax0, ax1 = T.axis.remap("SS", [i0, i1])
            T.reads(A[(T.Cast("int64", ax0) * T.int64(2) + T.Cast("int64", ax1)) % T.int64(8) // T.int64(4), (T.Cast("int64", ax0) * T.int64(2) + T.Cast("int64", ax1)) % T.int64(4)])
            T.writes(T_reshape[ax0, ax1])
            T_reshape[ax0, ax1] = A[(T.Cast("int64", ax0) * T.int64(2) + T.Cast("int64", ax1)) % T.int64(8) // T.int64(4), (T.Cast("int64", ax0) * T.int64(2) + T.Cast("int64", ax1)) % T.int64(4)]
```
Here loop variables `ax0` and `ax1` have dtype int32, since the shape of the output buffer is in int32. Other other hand, the input buffer has shape in int64. So as the script above shows, CreatePrimFunc will cast the int32 variables to int64 first, and access the input buffer afterwards.

Now if we use the option `index_dtype_override` to specify an index dtype as below,
```python
func = te.create_prim_func([A, B], index_dtype_override="int64")
```
the generated function will be
```python
@T.prim_func
def func(A: T.Buffer[(T.int64(2), T.int64(4)), "float32"], T_reshape: T.Buffer[(T.int64(4), T.int64(2)), "float32"]):
    for i0, i1 in T.grid(T.int64(4), T.int64(2)):
        with T.block("T_reshape"):
            ax0, ax1 = T.axis.remap("SS", [i0, i1])
            T.reads(A[(T.Cast("int64", ax0) * T.int64(2) + T.Cast("int64", ax1)) % T.int64(8) // T.int64(4), (T.Cast("int64", ax0) * T.int64(2) + T.Cast("int64", ax1)) % T.int64(4)])
            T.writes(T_reshape[ax0, ax1])
            T_reshape[ax0, ax1] = A[(T.Cast("int64", ax0) * T.int64(2) + T.Cast("int64", ax1)) % T.int64(8) // T.int64(4), (T.Cast("int64", ax0) * T.int64(2) + T.Cast("int64", ax1)) % T.int64(4)]
```
Note that though all variables and the buffer shapes have dtype int64, there are still CastNodes such as `T.Cast("int64", ax0)` when `ax0` is already an int64 variable. We don’t want such redundant casting.

## Fix

To fix the issue above, this PR overrides the `VisitExpr_(const CastNode* cast)` method in IndexDataTypeNormalizer. When the `value` field of a CastNode already has the target dtype, we no longer cast it.
---
 include/tvm/tir/data_type_rewriter.h          |  1 +
 src/te/operation/create_primfunc.cc           |  2 +-
 src/tir/ir/data_type_rewriter.cc              | 11 +++++-
 .../unittest/test_te_create_primfunc.py       | 36 +++++++++++++++++--
 4 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/include/tvm/tir/data_type_rewriter.h b/include/tvm/tir/data_type_rewriter.h
index 378addaba528..bf90aaedfec0 100644
--- a/include/tvm/tir/data_type_rewriter.h
+++ b/include/tvm/tir/data_type_rewriter.h
@@ -145,6 +145,7 @@ class IndexDataTypeNormalizer : public IndexDataTypeRewriter {
   using Parent::VisitStmt_;
   PrimExpr VisitExpr_(const IntImmNode* op) final;
   PrimExpr VisitExpr_(const VarNode* op) final;
+  PrimExpr VisitExpr_(const CastNode* op) final;
 
   DataType target_data_type_ = DataType::Int(64);
 };
diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index 9f8d7d46a151..223f8dcd5dd0 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -496,7 +496,7 @@ PrimFunc GenerateAndCompletePrimFunc(const Array<te::Tensor>& arg_list,
 PrimFunc CreatePrimFuncWithConstants(const Array<te::Tensor>& arg_list,
                                      const Array<runtime::NDArray>& constants,
                                      std::optional<DataType> index_dtype_override) {
-  // Infomations used in CreatePrimFunc and its sub-functions.
+  // Information used in CreatePrimFunc and its sub-functions.
   CreateFuncInfo info(arg_list);
   // Root body stmts.
   Array<Stmt> root_stmts;
diff --git a/src/tir/ir/data_type_rewriter.cc b/src/tir/ir/data_type_rewriter.cc
index fecb8e5fb70c..27a59d970981 100644
--- a/src/tir/ir/data_type_rewriter.cc
+++ b/src/tir/ir/data_type_rewriter.cc
@@ -511,6 +511,7 @@ PrimExpr IndexDataTypeRewriter::VisitExpr_(const CallNode* op) {
 
 IndexDataTypeNormalizer::IndexDataTypeNormalizer(DataType target_data_type)
     : target_data_type_(std::move(target_data_type)) {}
+
 PrimFunc IndexDataTypeNormalizer::Rewrite(PrimFunc func) {
   Map<Var, Buffer> new_buffer_map = func->buffer_map;
   for (const auto& [var, buffer] : func->buffer_map) {
@@ -534,7 +535,7 @@ PrimExpr IndexDataTypeNormalizer::VisitExpr_(const VarNode* op) {
   if (auto it = var_remap_.find(GetRef<Var>(op)); it != var_remap_.end()) {
     return (*it).second;
   }
-  if (is_enabled_) {
+  if (is_enabled_ && op->dtype != target_data_type_) {
     Var new_var = GetRef<Var>(op).copy_with_dtype(target_data_type_);
     var_remap_.Set(GetRef<Var>(op), new_var);
     return std::move(new_var);
@@ -542,5 +543,13 @@ PrimExpr IndexDataTypeNormalizer::VisitExpr_(const VarNode* op) {
   return GetRef<PrimExpr>(op);
 }
 
+PrimExpr IndexDataTypeNormalizer::VisitExpr_(const CastNode* op) {
+  if (is_enabled_) {
+    PrimExpr value = IndexDataTypeNormalizer::VisitExpr(op->value);
+    return value->dtype == target_data_type_ ? value : Cast(target_data_type_, value);
+  }
+  return IndexDataTypeRewriter::VisitExpr_(op);
+}
+
 }  // namespace tir
 }  // namespace tvm
diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py
index 6662c7aca85b..9a5326650184 100644
--- a/tests/python/unittest/test_te_create_primfunc.py
+++ b/tests/python/unittest/test_te_create_primfunc.py
@@ -46,8 +46,6 @@ def test_unique_name_reduction_block():
 
 def _check_workload(te_workload, tir_workload, index_dtype_override=None):
     func = te.create_prim_func(te_workload(), index_dtype_override)
-    print(func.script())
-    print(tvm.ir.base.get_first_structural_mismatch(func, tir_workload))
     tvm.ir.assert_structural_equal(func, tir_workload)
     # make sure that we can create schedule from the func
     s = tir.Schedule(func, debug_mask="all")
@@ -575,6 +573,39 @@ def expected(
     _check_workload(te_func, expected)
 
 
+def te_reshape():
+    # The following is possible to be generated by TOPI. So we test this case.
+    A = te.placeholder((tvm.tir.IntImm("int64", 2), tvm.tir.IntImm("int64", 4)), name="A")
+    B = topi.reshape(A, (4, 2))
+    return [A, B]
+
+
+@T.prim_func
+def tir_reshape(
+    A: T.Buffer[(T.int64(2), T.int64(4)), "float32"],
+    T_reshape: T.Buffer[(T.int64(4), T.int64(2)), "float32"],
+):
+    T.func_attr({"global_symbol": "main", "tir.noalias": True})
+    for i0, i1 in T.grid(T.int64(4), T.int64(2)):
+        with T.block("T_reshape"):
+            ax0, ax1 = T.axis.remap("SS", [i0, i1])
+            T.reads(
+                A[
+                    (ax0 * T.int64(2) + ax1) % T.int64(8) // T.int64(4),
+                    (ax0 * T.int64(2) + ax1) % T.int64(4),
+                ]
+            )
+            T.writes(T_reshape[ax0, ax1])
+            T_reshape[ax0, ax1] = A[
+                (ax0 * T.int64(2) + ax1) % T.int64(8) // T.int64(4),
+                (ax0 * T.int64(2) + ax1) % T.int64(4),
+            ]
+
+
+def test_reshape():
+    _check_workload(te_reshape, tir_reshape, index_dtype_override="int64")
+
+
 if __name__ == "__main__":
     test_unique_name_complete_block()
     test_unique_name_reduction_block()
@@ -593,3 +624,4 @@ def expected(
     test_argmax_val_idx()
     test_int64_indices()
     test_zero_dim_add()
+    test_reshape()

From 41b04007aa5a36f81d183721c23560d77b7b4c1d Mon Sep 17 00:00:00 2001
From: Jiawei Liu <jaway.liu@gmail.com>
Date: Sun, 20 Nov 2022 21:55:06 -0600
Subject: [PATCH 624/704] [FIX][ONNX][Relay] onnx converter on matmul with
 scalar; bring back nn.matmul check (#13448)

This PR brings 2 bug fixes:
1. ONNX converter for matmul: ONNX matmul follows NumPy [rules](https://numpy.org/doc/stable/reference/generated/numpy.matmul.html):
> If the first argument is 1-D, it is promoted to a matrix by prepending a 1 to its dimensions. After matrix multiplication the prepended 1 is removed.
> If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. After matrix multiplication the appended 1 is removed.

The (my) previous fix https://github.com/apache/tvm/pull/11174 did not consider the second rule (append 1 dimension for the rhs vector).

2. Relay's `nn.matmul` takes 2-D matrices and the checker was removed in a recent PR https://github.com/apache/tvm/pull/13287. This PR puts the checker back to prevent process crashes (make it a readable TVMError) for readability (and also for that the CI in https://github.com/ise-uiuc/nnsmith/pull/64 won't be terminated while using TVM-10).
---
 python/tvm/relay/frontend/onnx.py          | 15 +++++++++++++--
 src/relay/op/nn/nn.h                       |  1 +
 tests/python/frontend/onnx/test_forward.py |  7 +++----
 tests/python/relay/test_op_level1.py       |  5 +++++
 4 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index a14bb47956ee..d185d143c7a6 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -358,8 +358,19 @@ def matmul_out_dtype(inputs, out_dtype):
         )
         return _op.reshape(output, fold_constant(final_shape))
 
-    if a_rank == 1:
-        return _op.squeeze(_op.nn.matmul(_op.expand_dims(inputs[0], axis=0), inputs[1]), axis=[0])
+    if a_rank == 1 or b_rank == 1:
+        axis = []
+        if a_rank == 1:
+            lhs = _op.expand_dims(inputs[0], axis=0)
+            axis.append(0)
+        else:
+            lhs = inputs[0]
+        if b_rank == 1:
+            rhs = _op.expand_dims(inputs[1], axis=1)
+            axis.append(-1)
+        else:
+            rhs = inputs[1]
+        return _op.squeeze(_op.nn.matmul(lhs, rhs), axis=axis)
 
     # Otherwise a simple dense op will get the job done.
     input_1_t = _op.transpose(inputs[1], axes=(1, 0))
diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h
index 30f7f9e3304d..f5497a4603bf 100644
--- a/src/relay/op/nn/nn.h
+++ b/src/relay/op/nn/nn.h
@@ -103,6 +103,7 @@ bool MatmulRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
       oshape.Set(oshape.size() - 1, tensor_b_elements / dshape[dshape.size() - 1]);
       // Otherwise just pull it out of the tensor_b shape directly.
     } else {
+      ICHECK(static_cast<int>(tensor_b->shape.size()) == 2);
       if (param->auto_scheduler_rewritten_layout.size() == 0 &&
           param->meta_schedule_original_shape.size() == 0) {
         // ensure inner dimension matches between data and weight. If one inner
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 017a1621d7d8..211d7f798aba 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -1298,10 +1298,7 @@ def test_matmul(target, dev):
     """test_matmul"""
 
     def test_one_matmul(a_shape, b_shape):
-        if len(a_shape) == 1:
-            out_shape = [b_shape[1]]
-        else:
-            out_shape = [a_shape[0], b_shape[1]]
+        out_shape = np.matmul(np.zeros(a_shape), np.zeros(b_shape)).shape
 
         a_array = np.random.uniform(size=a_shape).astype("float32")
         b_array = np.random.uniform(size=b_shape).astype("float32")
@@ -1323,6 +1320,8 @@ def test_one_matmul(a_shape, b_shape):
 
     test_one_matmul((4, 3), (3, 4))
     test_one_matmul((3,), (3, 1))
+    test_one_matmul((1, 3), (3,))
+    test_one_matmul((3,), (3,))
 
 
 @tvm.testing.parametrize_targets
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 1c93ee766a88..30d9d88ad7cb 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -602,6 +602,11 @@ def test_matmul_type_check():
     y = relay.nn.matmul(x, w)
     yy = run_infer_type(y)
 
+    i0 = relay.var("i0", shape=(1, 1), dtype="float32")
+    i1 = relay.var("i1", shape=(1,), dtype="float32")
+    with pytest.raises(tvm.TVMError):
+        run_infer_type(relay.nn.matmul(i0, i1))
+
 
 @tvm.testing.uses_gpu
 def test_matmul(executor_kind):

From edfeba5c3a2caf133d2d045d1c4d99f5c34aeb69 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Sun, 20 Nov 2022 19:55:39 -0800
Subject: [PATCH 625/704] [Roofline] Allow user choice of pass for saving
 lowered TIR (#13437)

Useful if users what to use a different pass for TIR extraction.
---
 python/tvm/utils/roofline/__init__.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/tvm/utils/roofline/__init__.py b/python/tvm/utils/roofline/__init__.py
index 1129ac2c0e1d..67cf1133ffa5 100644
--- a/python/tvm/utils/roofline/__init__.py
+++ b/python/tvm/utils/roofline/__init__.py
@@ -54,11 +54,18 @@ class SaveLoweredTIR:
     """Save TIR functions from right before final lowering. Right now this
     means right before tir.MakePackedAPI."""
 
-    def __init__(self):
+    def __init__(self, before_pass: str = "tir.MakePackedAPI"):
+        """
+        Parameters
+        ----------
+        before_pass: str
+            Pass before which the TIR is saved.
+        """
         self.functions = {}
+        self.before_pass = before_pass
 
     def run_before_pass(self, mod, info):
-        if info.name == "tir.MakePackedAPI":
+        if info.name == self.before_pass:
             for v, func in mod.functions.items():
                 if isinstance(func, tir.PrimFunc):
                     self.functions[v] = func

From 8136173a631bf6c7274d26285349225fcf6e495f Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Sun, 20 Nov 2022 23:36:57 -0800
Subject: [PATCH 626/704] [ci][docs] Fix docs deploy (#13442)

The docs deploy is broken following #13335:
https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4754/pipeline

This avoids downloading the whole docs directory (which is just used to
host documentation previews for PRs) and just grabs the `docs.tgz` which
is actually used to deploy the docs.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                 | 9 ++++-----
 ci/jenkins/Deploy.groovy.j2 | 5 ++++-
 ci/scripts/jenkins/s3.py    | 7 ++++++-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index dfa9a7eda284..23c022abd335 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-11-17T23:53:21.059864
+// Generated at 2022-11-19T01:24:31.191996
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -3646,10 +3646,9 @@ def deploy() {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
                     sh(
-                script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/docs",
-                label: 'Download artifacts from S3',
-              )
-
+                      script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/docs --items docs.tgz",
+                      label: 'Download docs folder from S3',
+                    )
                     deploy_docs()
           }
         }
diff --git a/ci/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2
index 9bb6257f7a74..5cfffc7caef3 100644
--- a/ci/jenkins/Deploy.groovy.j2
+++ b/ci/jenkins/Deploy.groovy.j2
@@ -91,7 +91,10 @@ def deploy() {
           ws="tvm/deploy-docs",
         ) %}
           init_git()
-          {{ m.download_artifacts(tag='docs') }}
+          sh(
+            script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/docs --items docs.tgz",
+            label: 'Download docs folder from S3',
+          )
           deploy_docs()
         {% endcall %}
         {% call m.deploy_step(
diff --git a/ci/scripts/jenkins/s3.py b/ci/scripts/jenkins/s3.py
index f5aa535df8c4..63fbaac5fafc 100755
--- a/ci/scripts/jenkins/s3.py
+++ b/ci/scripts/jenkins/s3.py
@@ -129,7 +129,12 @@ def s3(source: str, destination: str, recursive: bool) -> List[str]:
 
     for item in items:
         if action == Action.DOWNLOAD:
-            stdout = s3(source=s3_path, destination=item, recursive=True)
+            source = s3_path
+            recursive = True
+            if item != ".":
+                source = s3_path + "/" + item
+                recursive = False
+            stdout = s3(source=source, destination=item, recursive=recursive)
             files = parse_output_files(stdout)
             chmod(files)
             for file in files:

From b419c4b4de8d73b8e942b332074f4ae577348319 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Mon, 21 Nov 2022 11:11:14 -0800
Subject: [PATCH 627/704] [Roofline] Add fma (non-tensorcore) peak flops for
 CUDA (#13419)

Compute peak flops using fma instructions on CUDA targets. Supports
arbitrary datatypes.
---
 python/tvm/utils/roofline/cuda.py      | 94 ++++++++++++++++++++++++--
 tests/python/unittest/test_roofline.py |  7 ++
 2 files changed, 94 insertions(+), 7 deletions(-)

diff --git a/python/tvm/utils/roofline/cuda.py b/python/tvm/utils/roofline/cuda.py
index b6e8ae066459..5d80c808801b 100644
--- a/python/tvm/utils/roofline/cuda.py
+++ b/python/tvm/utils/roofline/cuda.py
@@ -154,10 +154,10 @@ def peak_flops_tensorcore_tir(
         if remote is None:
             raise RuntimeError("A RPCSession must be provided when using a remote device.")
         temp = utils.tempdir()
-        path = temp.relpath("peak_fma_flops.tar")
+        path = temp.relpath("peak_mma_flops.tar")
         f.export_library(path)
         remote.upload(path)
-        f = remote.load_module("peak_fma_flops.tar")
+        f = remote.load_module("peak_mma_flops.tar")
 
     x = nd.empty((16, 16), dtype=mat_dtype, device=dev)
     y = nd.empty((16, 16), dtype=acc_dtype, device=dev)
@@ -166,6 +166,81 @@ def peak_flops_tensorcore_tir(
     return n * 16 * 16 * 16 * 2 * sms * 8 / times.min
 
 
+@functools.lru_cache(maxsize=None)
+def estimate_peak_flops_fma(
+    target: Target,
+    dev: Device,
+    remote: Optional[RPCSession],
+    dtype: str,
+) -> Tuple[float, float, str]:
+    """Estimate the peak FLOP/s of a cuda device with fma operations (not using tensor cores).
+
+    References
+    ----------
+    https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.1.pdf
+
+    Parameters
+    ----------
+    target : Target
+        Target to run on. This should be as specific to the actual hardware as
+        possible.
+    dev : Device
+        Device to run on.
+    remote : Optional[RPCSession]
+      Remote session used to upload artifacts for runtime evaluation. Must be
+      the same session used to create `dev`.
+    dtype : str
+        Dtype of fma operation
+
+    Returns
+    -------
+    peak_flops : float
+        Approximate sustained FLOP/s of this target/device combo assuming
+        fma instructions. Addition and multiplications are each counted as
+        separate FLOPs.
+    """
+
+    vec_width = 32
+    warps = 16  # need 16 warps to get enough in-SM parallelism
+    sms = dev.multi_processor_count
+    n = 100000
+
+    @T.prim_func
+    def peak_flops_fma_tir(
+        A: T.Buffer((sms, warps, vec_width), dtype),
+        B: T.Buffer((sms, warps, vec_width), dtype),
+    ):
+        # pylint: disable=invalid-name, missing-function-docstring
+        shared = T.alloc_buffer((sms, warps, vec_width), dtype=dtype, scope="shared")
+        for sm in T.thread_binding(sms, thread="blockIdx.x"):
+            for warp in T.thread_binding(warps, thread="threadIdx.y"):
+                for t in T.thread_binding(vec_width, thread="threadIdx.x"):
+                    shared[sm, warp, t] = A[sm, warp, t]
+                    for _ in range(n):
+                        shared[sm, warp, t] = (
+                            shared[sm, warp, t] * shared[sm, warp, t] + shared[sm, warp, t]
+                        )
+                    B[sm, warp, t] = shared[sm, warp, t]
+
+    with transform.PassContext(opt_level=3):
+        f = build(peak_flops_fma_tir, target=target)
+
+    # upload to remote if running over rpc
+    if dev.device_type >= RPC_SESS_MASK:
+        if remote is None:
+            raise RuntimeError("A RPCSession must be provided when using a remote device.")
+        temp = utils.tempdir()
+        path = temp.relpath("peak_fma_flops.tar")
+        f.export_library(path)
+        remote.upload(path)
+        f = remote.load_module("peak_fma_flops.tar")
+
+    x = nd.empty((sms, warps, vec_width), dtype=dtype, device=dev)
+    y = nd.empty((sms, warps, vec_width), dtype=dtype, device=dev)
+    times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(x, y)
+    return n * warps * sms * vec_width * 2 / times.min
+
+
 @registry.estimate_peak_flops.register("cuda")
 def estimate_peak_flops(
     func: PrimFunc,  # pylint: disable=unused-argument
@@ -203,17 +278,22 @@ def estimate_peak_flops(
     name : str
         Dtype/intrinsic used by `func` to achieve peak flops.
     """
-    assert nvcc.have_tensorcore(
-        dev.compute_version
-    ), "CUDA roofline only works with devices that have tensorcores"
+    has_tensorcore = nvcc.have_tensorcore(dev.compute_version)
+    # assume that the first argument dtype is the same as all the others
+    dtype = list(func.buffer_map.values())[0].dtype
+    if dtype == "float16" and has_tensorcore:
+        peak_flops = estimate_peak_flops_tensorcore(target, dev, remote)
+        name = "float16 tensorcore"
+    else:
+        peak_flops = estimate_peak_flops_fma(target, dev, remote, dtype)
+        name = f"{dtype} fma"
     flops = np.sum(
         features["float_addsub"]
         + features["float_mul"]
         + features["float_mad"] * 2
         + features["float_divmod"]
     )
-    peak_flops = estimate_peak_flops_tensorcore(target, dev, remote)
-    return flops, peak_flops, "float16 tensorcore"
+    return flops, peak_flops, name
 
 
 @T.prim_func
diff --git a/tests/python/unittest/test_roofline.py b/tests/python/unittest/test_roofline.py
index a8bf4df497f6..7a1c3478c51d 100644
--- a/tests/python/unittest/test_roofline.py
+++ b/tests/python/unittest/test_roofline.py
@@ -62,6 +62,13 @@ def test_estimate_peak_flops_gpu():
         flops > 10**12 and flops < 10**14
     ), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"
 
+    # this test should run on all gpus
+    flops = tvm.utils.roofline.cuda.estimate_peak_flops_fma(target, dev, remote, "float32")
+    # most gpus since 2016 should be able to hit a TFLOP/s with fma instructions
+    assert (
+        flops > 10**12 and flops < 10**14
+    ), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"
+
 
 @tvm.testing.skip_if_32bit(reason="Cannot allocate enough memory on i386")
 @tvm.testing.requires_llvm

From 77f9c49b4e5182486d80fc44630ebc5e40535245 Mon Sep 17 00:00:00 2001
From: Ruihang Lai <ruihangl@cs.cmu.edu>
Date: Mon, 21 Nov 2022 14:14:42 -0500
Subject: [PATCH 628/704] [Fix][MetaSchedule] Param for rule AutoBind on Python
 side (#13454)

Following https://github.com/apache/tvm/pull/13206, this PR brings the new parameter added to the AutoBind schedule rule to Python side.
---
 python/tvm/meta_schedule/schedule_rule/auto_bind.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/tvm/meta_schedule/schedule_rule/auto_bind.py b/python/tvm/meta_schedule/schedule_rule/auto_bind.py
index c211093e9275..99a91f606e32 100644
--- a/python/tvm/meta_schedule/schedule_rule/auto_bind.py
+++ b/python/tvm/meta_schedule/schedule_rule/auto_bind.py
@@ -33,12 +33,15 @@ class AutoBind(ScheduleRule):
         The maximum number of threadblock on GPU.
     thread_extents: Optional[List[int]]
         Candidates of thread axis extent.
+    max_threads_per_block: int
+        The maximum number of threads per block, if it is known when this schedule rule is created.
     """
 
     def __init__(
         self,
         max_threadblocks: int = 256,
         thread_extents: Optional[List[int]] = None,
+        max_threads_per_block: int = -1,
     ) -> None:
         if thread_extents is None:
             thread_extents = [32, 64, 128, 256, 512, 1024]
@@ -46,4 +49,5 @@ def __init__(
             _ffi_api.ScheduleRuleAutoBind,  # type: ignore # pylint: disable=no-member
             max_threadblocks,
             thread_extents,
+            max_threads_per_block,
         )

From 1b3d77a8973c3542e74affe29e02fdf573cddc91 Mon Sep 17 00:00:00 2001
From: Alexey Voronov <avoronov.icemist@gmail.com>
Date: Mon, 21 Nov 2022 22:38:56 +0300
Subject: [PATCH 629/704] [RPC] Fix tracker connection termination (#13420)

* [RPC] Fix tracker connection termination

* [RPC] Unify work with random key

* additional usage of the random_key API change
---
 python/tvm/rpc/base.py                    | 36 +++++++++++++++-----
 python/tvm/rpc/proxy.py                   |  8 ++---
 python/tvm/rpc/server.py                  |  4 +--
 python/tvm/rpc/tracker.py                 |  2 +-
 tests/python/unittest/test_rpc_base.py    | 40 +++++++++++++++++++++++
 tests/python/unittest/test_runtime_rpc.py | 13 ++++----
 6 files changed, 81 insertions(+), 22 deletions(-)
 create mode 100644 tests/python/unittest/test_rpc_base.py

diff --git a/python/tvm/rpc/base.py b/python/tvm/rpc/base.py
index fea5169d7fde..6ab64f1e885c 100644
--- a/python/tvm/rpc/base.py
+++ b/python/tvm/rpc/base.py
@@ -120,7 +120,7 @@ def recvjson(sock):
     return data
 
 
-def random_key(prefix, cmap=None):
+def random_key(prefix, delimiter=":", cmap=None):
     """Generate a random key
 
     Parameters
@@ -128,6 +128,9 @@ def random_key(prefix, cmap=None):
     prefix : str
         The string prefix
 
+    delimiter : str
+        The delimiter
+
     cmap : dict
         Conflict map
 
@@ -136,13 +139,30 @@ def random_key(prefix, cmap=None):
     key : str
         The generated random key
     """
-    if cmap:
-        while True:
-            key = prefix + str(random.random())
-            if key not in cmap:
-                return key
-    else:
-        return prefix + str(random.random())
+    while True:
+        key = "{}{}{}".format(prefix, delimiter, random.random())
+        if not cmap or key not in cmap:
+            break
+    return key
+
+
+def split_random_key(key, delimiter=":"):
+    """Split a random key by delimiter into prefix and random part
+
+    Parameters
+    ----------
+    key : str
+        The generated random key
+
+    Returns
+    -------
+    prefix : str
+        The string prefix
+
+    random_part : str
+        The generated random
+    """
+    return key.rsplit(delimiter, 1)
 
 
 def connect_with_retry(addr, timeout=60, retry_period=5):
diff --git a/python/tvm/rpc/proxy.py b/python/tvm/rpc/proxy.py
index 50aa5a775d51..4c3144e7b5cd 100644
--- a/python/tvm/rpc/proxy.py
+++ b/python/tvm/rpc/proxy.py
@@ -319,10 +319,10 @@ def _regenerate_server_keys(self, keys):
         new_keys = []
         # re-generate the server match key, so old information is invalidated.
         for key in keys:
-            rpc_key, _ = key.split(":")
+            rpc_key, _ = base.split_random_key(key)
             handle = self._server_pool[key]
             del self._server_pool[key]
-            new_key = base.random_key(rpc_key + ":", keyset)
+            new_key = base.random_key(rpc_key, keyset)
             self._server_pool[new_key] = handle
             keyset.add(new_key)
             new_keys.append(new_key)
@@ -368,7 +368,7 @@ def _update_tracker(self, period_update=False):
             need_update_info = False
             # report new connections
             for key in self._tracker_pending_puts:
-                rpc_key = key.split(":")[0]
+                rpc_key, _ = base.split_random_key(key)
                 base.sendjson(
                     self._tracker_conn, [TrackerCode.PUT, rpc_key, (self._listen_port, key), None]
                 )
@@ -403,7 +403,7 @@ def _callback():
     def _handler_ready_tracker_mode(self, handler):
         """tracker mode to handle handler ready."""
         if handler.rpc_key.startswith("server:"):
-            key = base.random_key(handler.match_key + ":", self._server_pool)
+            key = base.random_key(handler.match_key, cmap=self._server_pool)
             handler.match_key = key
             self._server_pool[key] = handler
             self._tracker_pending_puts.append(key)
diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py
index a1a8d8de9288..7932e98aa20c 100644
--- a/python/tvm/rpc/server.py
+++ b/python/tvm/rpc/server.py
@@ -157,7 +157,7 @@ def _accept_conn(listen_sock, tracker_conn, ping_period=2):
         old_keyset = set()
         # Report resource to tracker
         if tracker_conn:
-            matchkey = base.random_key(rpc_key + ":")
+            matchkey = base.random_key(rpc_key)
             base.sendjson(tracker_conn, [TrackerCode.PUT, rpc_key, (port, matchkey), custom_addr])
             assert base.recvjson(tracker_conn) == TrackerCode.SUCCESS
         else:
@@ -182,7 +182,7 @@ def _accept_conn(listen_sock, tracker_conn, ping_period=2):
                     # regenerate match key if key is acquired but not used for a while
                     if unmatch_period_count * ping_period > unmatch_timeout + ping_period:
                         logger.info("no incoming connections, regenerate key ...")
-                        matchkey = base.random_key(rpc_key + ":", old_keyset)
+                        matchkey = base.random_key(rpc_key, cmap=old_keyset)
                         base.sendjson(
                             tracker_conn, [TrackerCode.PUT, rpc_key, (port, matchkey), custom_addr]
                         )
diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py
index 5440addac023..e65ed4a012f0 100644
--- a/python/tvm/rpc/tracker.py
+++ b/python/tvm/rpc/tracker.py
@@ -348,7 +348,7 @@ def close(self, conn):
         if "key" in conn._info:
             for value in conn.put_values:
                 _, _, _, key = value
-                rpc_key = key.split(":")[0]
+                rpc_key, _ = base.split_random_key(key)
                 self._scheduler_map[rpc_key].remove(value)
 
     def stop(self):
diff --git a/tests/python/unittest/test_rpc_base.py b/tests/python/unittest/test_rpc_base.py
new file mode 100644
index 000000000000..b7ee6eff5275
--- /dev/null
+++ b/tests/python/unittest/test_rpc_base.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from tvm.rpc import base
+import pytest
+import random
+
+
+@pytest.mark.parametrize("device_key", ["16e995b6", "127.0.0.1:5555"])
+def test_rpc_base_random_key(device_key):
+    random.seed(0)
+    key = base.random_key(device_key)
+    assert key.startswith(device_key)
+    res_device_key, _ = base.split_random_key(key)
+    assert device_key == res_device_key
+    # start with seed 0 as well, but use cmap arg(a conflict map)
+    # to generate another unique random key
+    random.seed(0)
+    new_key = base.random_key(device_key, cmap={key})
+    assert key != new_key
+    assert new_key.startswith(device_key)
+    res_device_key2, _ = base.split_random_key(new_key)
+    assert device_key == res_device_key2
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py
index 255d28a5eff2..665bfb9f5df4 100644
--- a/tests/python/unittest/test_runtime_rpc.py
+++ b/tests/python/unittest/test_runtime_rpc.py
@@ -448,10 +448,10 @@ def check_remote():
 
 
 @tvm.testing.requires_rpc
-def test_rpc_tracker_register():
+@pytest.mark.parametrize("device_key", ["test_device", "127.0.0.1:5555"])
+def test_rpc_tracker_register(device_key):
     # test registration
     tracker = Tracker(port=9000, port_end=10000)
-    device_key = "test_device"
     server1 = rpc.Server(
         host="127.0.0.1",
         port=9000,
@@ -521,10 +521,10 @@ def _target(host, port, device_key, timeout):
 
 
 @tvm.testing.requires_rpc
-def test_rpc_tracker_request():
+@pytest.mark.parametrize("device_key", ["test_device", "127.0.0.1:5555"])
+def test_rpc_tracker_request(device_key):
     # test concurrent request
     tracker = Tracker(port=9000, port_end=10000)
-    device_key = "test_device"
     server = rpc.Server(
         port=9000,
         port_end=10000,
@@ -562,15 +562,14 @@ def test_rpc_tracker_request():
 
 
 @tvm.testing.requires_rpc
-def test_rpc_tracker_via_proxy():
+@pytest.mark.parametrize("device_key", ["test_device", "127.0.0.1:5555"])
+def test_rpc_tracker_via_proxy(device_key):
     """
          tracker
          /     \
     Host   --   Proxy -- RPC server
     """
 
-    device_key = "test_device"
-
     tracker_server = Tracker(port=9000, port_end=9100)
     proxy_server = Proxy(
         host=tracker_server.host,

From 97789078117e58289a4530e6b162b815f6064473 Mon Sep 17 00:00:00 2001
From: krishnaraj36 <45380557+krishnaraj36@users.noreply.github.com>
Date: Tue, 22 Nov 2022 01:41:59 +0530
Subject: [PATCH 630/704] [RUNTIME][CLML] Fix Dense layer crash issue (#13451)

* [RUNTIME][CLML] Fix Dense layer crash issue

Fixed the bias tensor allocation failure issue for Dense layer
in CLML runtime.

* Fix the lint errors

Co-authored-by: kvegiraj <kvegiraj@qti.qualcomm.com>
---
 src/runtime/contrib/clml/clml_runtime.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index 7492e521b7f5..c3fa3051591f 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -1150,8 +1150,8 @@ class CLMLRuntime : public JSONRuntimeBase {
     auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
     if (has_bias) {
       auto bias_dims = get_tensor_dims(nodes_[node.GetInputs()[2].id_]);
-      bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 1, 1},
-                                         CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+      bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                         cl_dtype);
     } else {
       cl_ml_tensor_desc_qcom desc = {};
       desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;

From 3ccc3009a6a4f3cce4cbe9e24e6fa18cc1247f87 Mon Sep 17 00:00:00 2001
From: Robert Kimball <bobkimball@gmail.com>
Date: Mon, 21 Nov 2022 13:21:55 -0800
Subject: [PATCH 631/704] Fix building static tvm_runtime on windows (#13445)

The recent cmake minimum bump from 3.2 to 3.18 introduced breaking changes for building static tvm_runtime on windows. This PR fixes that problem.
---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 011b593157a8..736d516fa1f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -165,6 +165,8 @@ if(MSVC)
         string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
       endif(${flag_var} MATCHES "/MD")
     endforeach(flag_var)
+    # Static linking. cmake behavior changed in 3.15 making this necessary.
+    add_compile_options(/MT)
   endif()
   # Disable common MSVC warnings
   # Integer conversion warnings(e.g. int64 to int)

From 545f8dc927d4dc9fb1394c67c681ea40ec16db8d Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Mon, 21 Nov 2022 13:23:16 -0800
Subject: [PATCH 632/704] [TOPI] Add handwritten matvec for dynamic cases
 (#13423)

Add a handwritten matrix-vector multiplication implementation for
dynamic cases on cpu. This avoids crashing when a dynamic shape is
present.
---
 python/tvm/relay/op/strategy/x86.py         | 19 +++++++
 python/tvm/topi/x86/dense.py                | 60 +++++++++++++++++++++
 tests/python/topi/python/test_topi_dense.py |  3 ++
 3 files changed, 82 insertions(+)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 10d7fbb3a926..897f7c4e588f 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -507,10 +507,29 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
     return strategy
 
 
+def is_dynamic_shape(shape):
+    return any([isinstance(x, (tir.Any, tir.SizeVar)) for x in shape])
+
+
 @dense_strategy.register("cpu")
 def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
+
     strategy = _op.OpStrategy()
+    # For dynamic matrix-vector multiply we use a hand written kernel.
+    if (
+        isinstance(inputs[0].shape[0], (int, tir.IntImm))
+        and inputs[0].shape[0] == 1
+        and (is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape))
+    ):
+        strategy.add_implementation(
+            wrap_compute_dense(topi.x86.dense_dynamic),
+            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
+            name="dense_dynamic.x86",
+            plevel=20,
+        )
+        return strategy
+
     same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
     dtype = inputs[0].dtype
     u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index 8ddb8d7a5c9a..65a803781a57 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -480,3 +480,63 @@ def matmul_dnnl(
 def schedule_matmul_dnnl(_, outs):
     """Create schedule for matmul_dnnl."""
     return generic.schedule_extern(outs)
+
+
+def dense_dynamic(A, B, bias, dtype):
+    """Compute for dense with dynamic shape"""
+
+    assert A.shape[0] == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"
+
+    # Right now we only support matrix-vector multiplication with lhs as the
+    # vector. We don't need to do much optimization here because the access
+    # pattern and parallelization are straight forward.
+    def gen_ir(a, b, c):
+        ib = tvm.tir.ir_builder.create()
+        A = ib.buffer_ptr(a)
+        B = ib.buffer_ptr(b)
+        C = ib.buffer_ptr(c)
+        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
+            C[0, j] = 0.0
+            with ib.for_range(0, b.shape[1], name="k") as k:
+                C[0, j] += A[0, k] * B[j, k]
+        return ib.get()
+
+    def gen_ir_bias(a, b, bias, c):
+        ib = tvm.tir.ir_builder.create()
+        A = ib.buffer_ptr(a)
+        B = ib.buffer_ptr(b)
+        C = ib.buffer_ptr(c)
+        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
+            C[0, j] = bias[j]
+            with ib.for_range(0, b.shape[1], name="k") as k:
+                C[0, j] += A[0, k] * B[j, k]
+        return ib.get()
+
+    out_shape = (A.shape[0], B.shape[0])
+    out_buf = tvm.tir.decl_buffer(out_shape, dtype, "out_buf")
+    if bias is None:
+        out = te.extern(
+            [out_shape],
+            [A, B],
+            lambda ins, outs: gen_ir(*ins, *outs),
+            dtype=dtype,
+            out_buffers=[out_buf],
+            name="dense_dynamic_cpu",
+            tag="dense_dynamic_cpu",
+        )
+    else:
+        out = te.extern(
+            [out_shape],
+            [A, B, bias],
+            lambda ins, outs: gen_ir_bias(*ins, *outs),
+            dtype=dtype,
+            out_buffers=[out_buf],
+            name="dense_dynamic_cpu",
+            tag="dense_dynamic_cpu",
+        )
+    return out
+
+
+def schedule_dense_dynamic(outs):
+    """Create schedule for dense_dynamic."""
+    return generic.schedule_extern(outs)
diff --git a/tests/python/topi/python/test_topi_dense.py b/tests/python/topi/python/test_topi_dense.py
index 7e65e2449fd7..8f6523366878 100644
--- a/tests/python/topi/python/test_topi_dense.py
+++ b/tests/python/topi/python/test_topi_dense.py
@@ -45,6 +45,7 @@
     "cpu": [
         (topi.x86.dense_nopack, topi.x86.schedule_dense_nopack),
         (topi.x86.dense_pack, topi.x86.schedule_dense_pack),
+        (topi.x86.dense_dynamic, topi.x86.schedule_dense_dynamic),
     ],
     "gpu": [
         (topi.gpu.dense_small_batch, topi.gpu.schedule_dense_small_batch),
@@ -136,6 +137,8 @@ def test_dense(
         implementations = tvm.topi.testing.dispatch(target, _dense_implementations)
 
     for fcompute, fschedule in implementations:
+        if fcompute == topi.x86.dense_dynamic and (batch_size != 1 or in_dtype != "float32"):
+            continue
         with tvm.target.Target(target):
             D = fcompute(A, B, C if use_bias else None, out_dtype)
             D = topi.nn.relu(D)

From 7cfa62e255409b083d07b537765726a634c449c4 Mon Sep 17 00:00:00 2001
From: Yuanjing Shi <yuanjing@octoml.ai>
Date: Mon, 21 Nov 2022 18:20:11 -0800
Subject: [PATCH 633/704] [Meta Schedule] Patch ICHECK for `target_has_vnni` to
 avoid seg fault (#13441)

* patch to avoid seg fault with test

* address comments
---
 src/meta_schedule/space_generator/space_generator.cc           | 2 +-
 tests/python/unittest/test_meta_schedule_space_cpu_winograd.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc
index bd124511b83c..cb89b3b817af 100644
--- a/src/meta_schedule/space_generator/space_generator.cc
+++ b/src/meta_schedule/space_generator/space_generator.cc
@@ -25,7 +25,7 @@ String GetRuleKindFromTarget(const Target& target) {
   if (target->kind->name == "llvm") {
     static const PackedFunc* f_check_vnni =
         runtime::Registry::Get("tvm.topi.x86.utils.target_has_vnni");
-    ICHECK(*f_check_vnni != nullptr) << "The `target_has_vnni` func is not in tvm registry.";
+    ICHECK(f_check_vnni != nullptr) << "The `target_has_vnni` func is not in tvm registry.";
     if (target->GetAttr<String>("mcpu") &&
         (*f_check_vnni)(target->GetAttr<String>("mcpu").value())) {
       return "vnni";
diff --git a/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py b/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py
index 78b75d592ed4..135304286b4b 100644
--- a/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py
+++ b/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py
@@ -19,7 +19,6 @@
 from tvm.meta_schedule.testing.space_generation import (
     check_sketches,
     generate_design_space,
-    print_sketches,
 )
 from tvm.meta_schedule.testing.te_workload import create_te_workload
 from tvm.script import tir as T

From 3ad425fbdfc1cffd4b32457c538f4a61de454a45 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Mon, 21 Nov 2022 23:11:42 -0600
Subject: [PATCH 634/704] [Docker]Add privileged option for hardware with USB
 access testing (#13460)

add options
---
 docker/bash.sh | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/docker/bash.sh b/docker/bash.sh
index 5973c7013b85..6919ce9edb65 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -22,7 +22,7 @@
 #
 # Usage: docker/bash.sh [-i|--interactive] [--net=host] [-t|--tty]
 #          [--mount MOUNT_DIR] [--repo-mount-point REPO_MOUNT_POINT]
-#          [--dry-run] [--name NAME]
+#          [--dry-run] [--name NAME] [--privileged]
 #          <DOCKER_IMAGE_NAME> [--] [COMMAND]
 #
 # Usage: docker/bash.sh <CONTAINER_NAME>
@@ -97,6 +97,10 @@ Usage: docker/bash.sh [-i|--interactive] [--net=host] [-t|--tty]
     Set the name of the docker container, and the hostname that will
     appear inside the container.
 
+--privileged
+
+    Give extended privileges to this container.
+
 DOCKER_IMAGE_NAME
 
     The name of the docker container to be run.  This can be an
@@ -213,6 +217,11 @@ while (( $# )); do
             fi
             ;;
 
+        --privileged)
+            DOCKER_FLAGS+=( "--privileged" )
+            shift 1
+            ;;
+
         --env)
             DOCKER_ENV+=( --env "$2" )
             shift 2

From 723a13ac0891400a21724fcaaefbd4875dc35d9f Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Tue, 22 Nov 2022 04:41:00 -0500
Subject: [PATCH 635/704] [MetaSchedule] TorchBench tuning script: add option
 to disallow operators in sub graph (#13453)

* Add option to disallow operator in TorchDynamo

Add missing type annotation

* Fix lint
---
 .../meta_schedule/testing/torchbench/run.py   | 14 +++++
 .../meta_schedule/testing/torchbench/utils.py | 63 ++++++++++++++++++-
 2 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/python/tvm/meta_schedule/testing/torchbench/run.py b/python/tvm/meta_schedule/testing/torchbench/run.py
index 65e1a1a59f3e..55ac02b0b743 100644
--- a/python/tvm/meta_schedule/testing/torchbench/run.py
+++ b/python/tvm/meta_schedule/testing/torchbench/run.py
@@ -110,6 +110,7 @@
 from tvm._ffi import get_global_func
 from tvm.contrib.graph_executor import GraphModule
 from tvm.meta_schedule.testing.torchbench.utils import (
+    DisallowedOperator,
     load_torchdynamo_benchmark_runner,
     same,
     timed,
@@ -196,6 +197,12 @@ def parse_args():
         default=5,
         help="The number of rounds to warmup before starting to measure the performance.",
     )
+    args.add_argument(
+        "--disallowed-op",
+        type=str,
+        default="all",
+        help=DisallowedOperator.__doc__,
+    )
 
     # Model selection
     args.add_argument(
@@ -313,6 +320,12 @@ def parse_args():
 
     parsed = args.parse_args()
 
+    if parsed.disallowed_op == "all":
+        disallowed_op = set(DisallowedOperator)
+    else:
+        disallowed_op = {DisallowedOperator(v) for v in parsed.disallowed_op.split(",")}
+    parsed.disallowed_op = disallowed_op
+
     # Trim all args, otherwise it confuses the arg parser of timm_efficientdet
     sys.argv = sys.argv[:1]
 
@@ -335,6 +348,7 @@ def parse_args():
     IS_CUDA,
     cosine_similarity=ARGS.result_metric == ResultComparisonMetric.COSINE,
     float32=ARGS.float32,
+    disallowed_operators=ARGS.disallowed_op,
 )
 
 
diff --git a/python/tvm/meta_schedule/testing/torchbench/utils.py b/python/tvm/meta_schedule/testing/torchbench/utils.py
index 8bd022a9cb18..af81318d0006 100644
--- a/python/tvm/meta_schedule/testing/torchbench/utils.py
+++ b/python/tvm/meta_schedule/testing/torchbench/utils.py
@@ -19,13 +19,33 @@
 from TorchDynamo.
 """
 
+import functools
 import os
 import sys
 from dataclasses import dataclass
+from enum import Enum
+from typing import Set
 
 import torch  # type: ignore
 
 
+class DisallowedOperator(Enum):
+    """
+    The operators to disallow in the fx graph produced by TorchDynamo.
+    This is to workaround the limitation in TVM's PyTorch frontend.
+
+    - inplace_copy: aten::copy_ as inplace assign A[...] = ..., or method call A.copy_(...)
+    - einsum: torch.functional.einsum
+    - multihead_attention: torch.nn.MultiheadAttention
+    - as_stride: Tensor.as_stride
+    """
+
+    INPLACE_COPY = "inplace_copy"
+    EINSUM = "einsum"
+    MULTIHEAD_ATTENTION = "multihead_attention"
+    AS_STRIDE = "as_stride"
+
+
 def find_torchdynamo() -> str:
     """
     Find the directory of TorchDynamo repo.
@@ -57,14 +77,52 @@ def find_torchdynamo() -> str:
 sys.path.append(f"{DYNAMO_DIR}/benchmarks")
 
 # pylint: disable=wrong-import-position, unused-import
+import torchdynamo  # type: ignore
 from benchmarks.common import same, timed  # type: ignore
 from torchbench import TorchBenchmarkRunner  # type: ignore
 
 # pylint: disable=wrong-import-position, unused-import
 
 
+def _disallow_operators(disallowed_ops: Set[DisallowedOperator]):
+    """
+    Disallow certain operators in the fx graph produced by TorchDynamo.
+    There are two ways to disallow operator in TorchDynamo,
+    1. Use the disallow_in_graph API, which only applies to free function call.
+    2. Patch the TensorVariable class, which applies to method call on torch.Tensor.
+    """
+    disallowed_tensor_methods: Set[str] = set()
+
+    if DisallowedOperator.INPLACE_COPY in disallowed_ops:
+        torchdynamo.disallow_in_graph(torch.Tensor.copy_)
+        disallowed_tensor_methods.update({"copy_", "__setitem__"})
+
+    if DisallowedOperator.EINSUM in disallowed_ops:
+        torchdynamo.disallow_in_graph(torch.functional.einsum)
+
+    if DisallowedOperator.MULTIHEAD_ATTENTION in disallowed_ops:
+        torchdynamo.disallow_in_graph(torch.nn.MultiheadAttention)
+
+    if DisallowedOperator.AS_STRIDE in disallowed_ops:
+        disallowed_tensor_methods.add("as_stride")
+
+    tensor_variable_cls = torchdynamo.variables.tensor.TensorVariable
+    old_call_method = tensor_variable_cls.call_method
+
+    @functools.wraps(old_call_method)
+    def call_method(self, translator, name, args, kwargs):
+        if name in disallowed_tensor_methods:
+            raise torchdynamo.exc.Unsupported(f"Tensor.{name} not supported by TVM.")
+        return old_call_method(self, translator, name, args, kwargs)
+
+    tensor_variable_cls.call_method = call_method
+
+
 def load_torchdynamo_benchmark_runner(
-    is_cuda: bool, cosine_similarity: bool = False, float32: bool = False
+    is_cuda: bool,
+    cosine_similarity: bool = False,
+    float32: bool = False,
+    disallowed_operators: Set[DisallowedOperator] = None,
 ) -> TorchBenchmarkRunner:
     """
     Load the benchmark runner from TorchDynamo.
@@ -94,6 +152,9 @@ class RunnerArgs:
     runner.args = args
     runner.model_iter_fn = runner.forward_pass
 
+    if disallowed_operators:
+        _disallow_operators(disallowed_operators)
+
     if is_cuda:
         # pylint: disable=import-outside-toplevel
         import benchmarks.common  # type: ignore

From 27d8d4153b2117f0f0137dfbf848ddb67b0b5247 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 22 Nov 2022 10:58:55 +0000
Subject: [PATCH 636/704] Bump tensorflow from 2.9.1 to 2.9.3 in
 /apps/android_camera/models (#13455)

Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.9.1 to 2.9.3.
- [Release notes](https://github.com/tensorflow/tensorflow/releases)
- [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md)
- [Commits](https://github.com/tensorflow/tensorflow/compare/v2.9.1...v2.9.3)

---
updated-dependencies:
- dependency-name: tensorflow
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 apps/android_camera/models/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/android_camera/models/requirements.txt b/apps/android_camera/models/requirements.txt
index a44730c7ec37..dbf496b2d968 100644
--- a/apps/android_camera/models/requirements.txt
+++ b/apps/android_camera/models/requirements.txt
@@ -1,4 +1,4 @@
 keras==2.9
 mxnet
 scipy
-tensorflow==2.9.1
+tensorflow==2.9.3

From f38dbbbd328d2e29ff9ef4e5335cda5e0d4224f3 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 22 Nov 2022 13:56:52 +0000
Subject: [PATCH 637/704] Bump tflite from 2.4.0 to 2.10.0 in
 /apps/microtvm/cmsisnn (#13457)

Bumps [tflite](https://github.com/zhenhuaw-me/tflite) from 2.4.0 to 2.10.0.
- [Release notes](https://github.com/zhenhuaw-me/tflite/releases)
- [Commits](https://github.com/zhenhuaw-me/tflite/compare/v2.4.0...v2.10.0)

---
updated-dependencies:
- dependency-name: tflite
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 apps/microtvm/cmsisnn/requirements.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/microtvm/cmsisnn/requirements.txt b/apps/microtvm/cmsisnn/requirements.txt
index ae8e0aacd738..3399d4a7f7be 100644
--- a/apps/microtvm/cmsisnn/requirements.txt
+++ b/apps/microtvm/cmsisnn/requirements.txt
@@ -211,9 +211,9 @@ scipy==1.5.4 \
 synr==0.6.0 \
     --hash=sha256:0b4e16b10c3988e1981e3372153a31956f74d86752eaaa55e8c4e7b7fe591e4e \
     --hash=sha256:9399b27d9f21c5d439eae92e0159d6f521cc396d27149ac45473012a205a3c30
-tflite==2.4.0 \
-    --hash=sha256:0510db1b48a3eec86bf9bb8d2749cd9d6d26d6a4fb329fd141bde5b4404932d1 \
-    --hash=sha256:0796f6ce6eb2aef4a318f5509e5fb0ce808e29cd3094801b4abbb1d8575a28cd
+tflite==2.10.0 \
+    --hash=sha256:6818a5d7776958b803944ba0a1f4c4395559606d9e795d67ac467a8a3904757d \
+    --hash=sha256:89cb9f57df0f5345f8fad1381e0fae6180ded687113eb552cfbb60a05edc002c
 tornado==6.1 \
     --hash=sha256:0a00ff4561e2929a2c37ce706cb8233b7907e0cdc22eab98888aca5dd3775feb \
     --hash=sha256:0d321a39c36e5f2c4ff12b4ed58d41390460f798422c4504e09eb5678e09998c \

From e6629706479421783d7ee5fa29eb3fe862d9b981 Mon Sep 17 00:00:00 2001
From: Alan MacDonald <alanmacd@users.noreply.github.com>
Date: Tue, 22 Nov 2022 08:52:43 -0800
Subject: [PATCH 638/704] [microTVM] enable building microTVM components by
 default (#13073)

* enable micro by default

* apply patch for ranlib fix

* exclude windows build from including cctools

* remove check for USE_MICRO to build crttest

* disable USE_MICRO for hexagon builds

* remove USE_MICRO conditional from TVM python package init

* remove USE_MICRO restrictions for hexagon build

* remove USE_MICRO conditional for MISRA-C test

* set USE_MICRO to ON in tvm_option, set USE_MICRO to OFF for now in CPU minimal build

Co-authored-by: Mehrdad Hessar <mhessar@octoml.ai>
---
 CMakeLists.txt                               | 7 ++++++-
 cmake/config.cmake                           | 3 ---
 conda/recipe/build.sh                        | 3 +++
 conda/recipe/meta.yaml                       | 3 +++
 tests/scripts/task_config_build_arm.sh       | 1 -
 tests/scripts/task_config_build_cortexm.sh   | 1 -
 tests/scripts/task_config_build_cpu.sh       | 1 -
 tests/scripts/task_config_build_gpu.sh       | 1 -
 tests/scripts/task_config_build_gpu_other.sh | 1 -
 tests/scripts/task_config_build_hexagon.sh   | 2 --
 tests/scripts/task_config_build_i386.sh      | 1 -
 tests/scripts/task_config_build_minimal.sh   | 1 +
 tests/scripts/task_config_build_riscv.sh     | 1 -
 tests/scripts/task_config_build_wasm.sh      | 1 -
 14 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 736d516fa1f6..cb0bad8a50d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,7 +58,7 @@ tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF)
 tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF)
 tvm_option(USE_RTTI "Build with RTTI" ON)
 tvm_option(USE_MSVC_MT "Build with MT" OFF)
-tvm_option(USE_MICRO "Build with Micro TVM support" OFF)
+tvm_option(USE_MICRO "Build with Micro TVM support" ON)
 tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF)
 tvm_option(HIDE_PRIVATE_SYMBOLS "Compile with -fvisibility=hidden." OFF)
 tvm_option(USE_TF_TVMDSOOP "Build with TensorFlow TVMDSOOp" OFF)
@@ -118,6 +118,11 @@ tvm_option(USE_CLML "Build with CLML Codegen support" OFF)
 tvm_option(USE_CLML_GRAPH_EXECUTOR "Build with CLML graph runtime" OFF)
 tvm_option(USE_UMA "Build with UMA support" OFF)
 
+# disable microTVM for iOS and hexagon builds
+if(${CMAKE_SYSTEM_NAME} MATCHES "iOS" OR USE_HEXAGON)
+  set(USE_MICRO OFF)
+endif()
+
 # include directories
 include_directories(${CMAKE_INCLUDE_PATH})
 include_directories("include")
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 679f5c459e87..0c803c0b6a2e 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -96,9 +96,6 @@ set(USE_SPIRV_KHR_INTEGER_DOT_PRODUCT OFF)
 # Whether enable OpenGL runtime
 set(USE_OPENGL OFF)
 
-# Whether enable MicroTVM runtime
-set(USE_MICRO OFF)
-
 # Whether enable RPC runtime
 set(USE_RPC ON)
 
diff --git a/conda/recipe/build.sh b/conda/recipe/build.sh
index 0131fd65a48e..aa2d2f3d5fb3 100755
--- a/conda/recipe/build.sh
+++ b/conda/recipe/build.sh
@@ -45,6 +45,9 @@ rm -rf build || true
 mkdir -p build
 cd build
 
+export PREFIX="${PREFIX}/"
+cp -f ${PREFIX}/bin/ranlib $PREFIX
+
 cmake -DCMAKE_INSTALL_PREFIX="${PREFIX}" \
       -DCMAKE_BUILD_TYPE=Release \
       -DUSE_RPC=ON \
diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml
index 519b84c570d7..b8463ebdd059 100644
--- a/conda/recipe/meta.yaml
+++ b/conda/recipe/meta.yaml
@@ -45,6 +45,7 @@ requirements:
   host:
     - zlib
     - llvmdev >=11
+    - cctools # [not win]
 
 outputs:
   - name: {{ pkg_name }}-libs
@@ -62,10 +63,12 @@ outputs:
         - llvmdev >=11
         - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }}  # [cuda]
         - cudnn >=7.6.0  # [cuda]
+        - cctools # [not win]
       run:
         - llvmdev >=11
         - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }}  # [cuda]
         - cudnn >=7.6.0  # [cuda]
+        - cctools # [not win]
 
   - name: {{ pkg_name }}
     script: install_tvm_python.sh  # [not win]
diff --git a/tests/scripts/task_config_build_arm.sh b/tests/scripts/task_config_build_arm.sh
index 516e6ac86791..35ecde2904bb 100755
--- a/tests/scripts/task_config_build_arm.sh
+++ b/tests/scripts/task_config_build_arm.sh
@@ -25,7 +25,6 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_RPC ON\) >> config.cmake
-echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-8\) >> config.cmake
diff --git a/tests/scripts/task_config_build_cortexm.sh b/tests/scripts/task_config_build_cortexm.sh
index f15ed81711f6..5407079c1a2c 100755
--- a/tests/scripts/task_config_build_cortexm.sh
+++ b/tests/scripts/task_config_build_cortexm.sh
@@ -24,7 +24,6 @@ cd "$BUILD_DIR"
 cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
-echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_CMSISNN ON\) >> config.cmake
 echo set\(USE_ETHOSU ON\) >> config.cmake
 echo set\(USE_UMA ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index e3d8aa9a1d1b..66fc161ece96 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -24,7 +24,6 @@ cd "$BUILD_DIR"
 cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
-echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_DNNL ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index ca5f3e935c08..0b8be8ac4cbe 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -29,7 +29,6 @@ echo set\(USE_CUDA ON\) >> config.cmake
 echo set\(USE_VULKAN ON\) >> config.cmake
 echo set\(USE_OPENGL ON\) >> config.cmake
 echo set\(USE_OPENCL ON\) >> config.cmake
-echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_LLVM \"/usr/bin/llvm-config-9 --link-static\"\) >> config.cmake
 echo set\(USE_NNPACK ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_gpu_other.sh b/tests/scripts/task_config_build_gpu_other.sh
index 6fb10d44508a..747e1006e507 100755
--- a/tests/scripts/task_config_build_gpu_other.sh
+++ b/tests/scripts/task_config_build_gpu_other.sh
@@ -27,7 +27,6 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_OPENCL ON\) >> config.cmake
 echo set\(USE_ROCM ON\) >> config.cmake
-echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE OFF\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
diff --git a/tests/scripts/task_config_build_hexagon.sh b/tests/scripts/task_config_build_hexagon.sh
index 0736ed6b53b8..c8e70c00f97d 100755
--- a/tests/scripts/task_config_build_hexagon.sh
+++ b/tests/scripts/task_config_build_hexagon.sh
@@ -25,8 +25,6 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_RPC ON\) >> config.cmake
-echo set\(USE_MICRO ON\) >> config.cmake
-echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_LLVM "${CLANG_LLVM_HOME}/bin/llvm-config"\) >> config.cmake
 
 if [[ ${CI:-false} == "true" ]]; then
diff --git a/tests/scripts/task_config_build_i386.sh b/tests/scripts/task_config_build_i386.sh
index 369706dfd34a..18a7189e1470 100755
--- a/tests/scripts/task_config_build_i386.sh
+++ b/tests/scripts/task_config_build_i386.sh
@@ -25,7 +25,6 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_RPC ON\) >> config.cmake
-echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-4.0\) >> config.cmake
diff --git a/tests/scripts/task_config_build_minimal.sh b/tests/scripts/task_config_build_minimal.sh
index 651f54cea21b..27dc9d77567a 100755
--- a/tests/scripts/task_config_build_minimal.sh
+++ b/tests/scripts/task_config_build_minimal.sh
@@ -32,3 +32,4 @@ echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
+echo set\(USE_MICRO OFF\) >> config.cmake
diff --git a/tests/scripts/task_config_build_riscv.sh b/tests/scripts/task_config_build_riscv.sh
index 9e11e5e255e9..b39cb4b28e3b 100755
--- a/tests/scripts/task_config_build_riscv.sh
+++ b/tests/scripts/task_config_build_riscv.sh
@@ -24,7 +24,6 @@ cd "$BUILD_DIR"
 cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
-echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_CMSISNN ON\) >> config.cmake
 echo set\(USE_UMA ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_wasm.sh b/tests/scripts/task_config_build_wasm.sh
index daa5481bea9d..e96288e36b7e 100755
--- a/tests/scripts/task_config_build_wasm.sh
+++ b/tests/scripts/task_config_build_wasm.sh
@@ -24,7 +24,6 @@ cd "$BUILD_DIR"
 cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
-echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-11\) >> config.cmake

From 1c677684835dff05e71dad890b60e088fb79b6be Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Tue, 22 Nov 2022 15:17:09 -0800
Subject: [PATCH 639/704] [MetaSchedule][Minor] Unify Cuda-TensorCore Naming in
 Schedule Rule Kind (#13473)

This PR unifies `cuda-tensorcore` and `cuda_tensorcore` to the former one to be consistent with python side `create` functions for schedule rules, post processors, and mutators.
---
 src/meta_schedule/space_generator/space_generator.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc
index cb89b3b817af..2d69727384a7 100644
--- a/src/meta_schedule/space_generator/space_generator.cc
+++ b/src/meta_schedule/space_generator/space_generator.cc
@@ -42,7 +42,7 @@ String GetRuleKindFromTarget(const Target& target) {
         sm = sm.substr(3);
         try {
           if (std::stoi(sm) >= 75) {
-            return "cuda_tensorcore";
+            return "cuda-tensorcore";
           }
         } catch (const std::invalid_argument& e) {
           LOG(WARNING) << "ValueError: Unable to parse `target.arch`: " << sm
@@ -78,7 +78,7 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) {
       default_sch_rules = ScheduleRule::DefaultCUDA();
       default_postprocs = Postproc::DefaultCUDA();
       default_mutator_probs = Mutator::DefaultCUDA();
-    } else if (kind == "cuda_tensorcore") {
+    } else if (kind == "cuda-tensorcore") {
       default_sch_rules = ScheduleRule::DefaultCUDATensorCore();
       default_postprocs = Postproc::DefaultCUDATensorCore();
       default_mutator_probs = Mutator::DefaultCUDATensorCore();

From a41e192dc8a3d4246d54e35e3afbe2a9dd08d52b Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Tue, 22 Nov 2022 18:18:26 -0500
Subject: [PATCH 640/704] [MetaSchedule] TorchBench tuning script: add task
 extraction mode (#13452)

This PR adds the task extraction mode to the TorchBench tuning script. The task extraction mode will only import PyTorch model into TVM, extract tuning tasks from it and save them for further use.

This helps the use case where tasks should be extracted from a machine with GPU and then be tuned on a machine with powerful CPU with rpc workers.
---
 .../meta_schedule/testing/torchbench/run.py   | 28 +++++++++++++++----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/python/tvm/meta_schedule/testing/torchbench/run.py b/python/tvm/meta_schedule/testing/torchbench/run.py
index 55ac02b0b743..05efd466ee1d 100644
--- a/python/tvm/meta_schedule/testing/torchbench/run.py
+++ b/python/tvm/meta_schedule/testing/torchbench/run.py
@@ -94,6 +94,7 @@
 import contextlib
 import logging
 import os
+import pickle
 import sys
 import warnings
 from collections import defaultdict
@@ -125,28 +126,37 @@
 class RunMode(Enum):
     """
     The running mode of this script. Available values are:
-    - tune: Only tune the model and create the tuning database.
+    - extract: Only import the model and extract tuning tasks from it.
+    - tune: Only tune the tasks and create the tuning database.
     - eval: Only benchmark model using pre-existing tuning database.
     - all: Run both tuning and benchmark
     """
 
     ALL = "all"
+    EXTRACT = "extract"
     TUNE = "tune"
     EVAL = "eval"
 
+    @property
+    def should_extract(self):
+        """
+        Returns whether it should extract tuning tasks.
+        """
+        return self in (RunMode.ALL, RunMode.EXTRACT)
+
     @property
     def should_tune(self):
         """
-        Returns whether it should tune the model.
+        Returns whether it should tune the tasks.
         """
-        return self != RunMode.EVAL
+        return self in (RunMode.ALL, RunMode.TUNE)
 
     @property
     def should_eval(self):
         """
         Returns whether it should actually benchmark the model.
         """
-        return self != RunMode.TUNE
+        return self in (RunMode.ALL, RunMode.EVAL)
 
 
 class ResultComparisonMetric(Enum):
@@ -734,11 +744,19 @@ def main():
         profiler = stack.enter_context(ms.Profiler())
         stack.enter_context(torch.no_grad())
 
-        if ARGS.mode.should_tune:
+        tasks_path = os.path.join(ARGS.work_dir, "extracted_tasks")
+
+        if ARGS.mode.should_extract:
             task_collect_backend, extracted_tasks = create_tvm_task_collection_backend()
             task_collect_ctx = torchdynamo.optimize(task_collect_backend)
             task_collect_ctx(runner.model_iter_fn)(model, example_inputs)
+            with open(tasks_path, "wb") as f:
+                pickle.dump(extracted_tasks, f)
+        else:
+            with open(tasks_path, "rb") as f:
+                extracted_tasks = pickle.load(f)
 
+        if ARGS.mode.should_tune:
             tasks, task_weights = ms.relay_integration.extracted_tasks_to_tune_contexts(
                 extracted_tasks=extracted_tasks,
                 work_dir=ARGS.work_dir,

From e2fc4d7e984c406e2c3f87665f79af7a1d4674b7 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Tue, 22 Nov 2022 20:52:31 -0600
Subject: [PATCH 641/704] [TVMScript] Improvements tvm.script.highlight
 (#13438)

* [TVMScript] Improvements tvm.script.highlight

- Automatically use "black" formatter if available.

- Allow overrides of pygmentize style based on environment variable
  `TVM_PYGMENTIZE_STYLE`.

- Forwarded `black_format` argument from `show` method to `cprint`
---
 python/tvm/ir/module.py          |  16 +-
 python/tvm/script/highlight.py   | 264 +++++++++++++++++++++----------
 python/tvm/tir/function.py       |  16 +-
 python/tvm/tir/schedule/trace.py |  14 +-
 4 files changed, 211 insertions(+), 99 deletions(-)

diff --git a/python/tvm/ir/module.py b/python/tvm/ir/module.py
index 06537e2cdc4d..3ed7e57cb758 100644
--- a/python/tvm/ir/module.py
+++ b/python/tvm/ir/module.py
@@ -278,18 +278,24 @@ def script(self, tir_prefix: str = "T", show_meta: bool = False) -> str:
             self, tir_prefix, show_meta
         )  # type: ignore
 
-    def show(self, style: Optional[str] = None) -> None:
-        """
-        A sugar for print highlighted TVM script.
+    def show(self, style: Optional[str] = None, black_format: bool = True) -> None:
+        """A sugar for print highlighted TVM script.
+
         Parameters
         ----------
         style : str, optional
-            Pygments styles extended by "light" (default) and "dark", by default "light"
+
+            Pygmentize printing style, auto-detected if None.  See
+            `tvm.script.highlight.cprint` for more details.
+
+        black_format: bool
+
+            If true (default), use the formatter Black to format the TVMScript
         """
         from tvm.script.highlight import cprint  # pylint: disable=import-outside-toplevel
 
         # Use deferred import to avoid circular import while keeping cprint under tvm/script
-        cprint(self, style=style)
+        cprint(self, style=style, black_format=black_format)
 
     def get_attr(self, attr_key):
         """Get the IRModule attribute.
diff --git a/python/tvm/script/highlight.py b/python/tvm/script/highlight.py
index dc45b5a3f1cd..d12f6c276767 100644
--- a/python/tvm/script/highlight.py
+++ b/python/tvm/script/highlight.py
@@ -17,6 +17,7 @@
 """Highlight printed TVM script.
 """
 
+import os
 import sys
 import warnings
 from typing import Optional, Union
@@ -25,17 +26,30 @@
 from tvm.tir import PrimFunc
 
 
-def cprint(printable: Union[IRModule, PrimFunc, str], style: Optional[str] = None) -> None:
-    """
-    Print highlighted TVM script string with Pygments
+def cprint(
+    printable: Union[IRModule, PrimFunc, str],
+    style: Optional[str] = None,
+    black_format: bool = True,
+) -> None:
+    """Print TVMScript string with Pygments highlight and Black auto-formatting.
+
     Parameters
     ----------
     printable : Union[IRModule, PrimFunc, str]
-        The TVM script to be printed
+
+        The TVMScript to be printed
+
     style : str, optional
-        Printing style, auto-detected if None.
+
+        Pygmentize printing style, auto-detected if None.
+
+    black_format: bool
+
+        If true (default), use the formatter Black to format the TVMScript
+
     Notes
     -----
+
     The style parameter follows the Pygments style names or Style objects. Three
     built-in styles are extended: "light", "dark" and "ansi". By default, "light"
     will be used for notebook environment and terminal style will be "ansi" for
@@ -43,16 +57,103 @@ def cprint(printable: Union[IRModule, PrimFunc, str], style: Optional[str] = Non
     not installed, plain text will be printed with a one-time warning to suggest
     installing the Pygment library. Other Pygment styles can be found in
     https://pygments.org/styles/
+
+    The default pygmentize style can also be set with the environment
+    variable "TVM_PYGMENTIZE_STYLE".
     """
     if isinstance(printable, (IRModule, PrimFunc)):
         printable = printable.script()
+
+    if black_format:
+        printable = _format(printable)
+
+    is_in_notebook = "ipykernel" in sys.modules  # in notebook env (support html display).
+
+    style = _get_pygments_style(style, is_in_notebook)
+
+    if style is None:
+        print(printable)
+        return
+
+    # pylint: disable=import-outside-toplevel
+    from pygments import highlight
+    from pygments.formatters import HtmlFormatter, Terminal256Formatter
+    from pygments.lexers.python import Python3Lexer
+
+    if is_in_notebook:
+        from IPython import display  # pylint: disable=import-outside-toplevel
+
+        formatter = HtmlFormatter(style=style)
+        formatter.noclasses = True  # inline styles
+        html = highlight(printable, Python3Lexer(), formatter)
+        display.display(display.HTML(html))
+    else:
+        print(highlight(printable, Python3Lexer(), Terminal256Formatter(style=style)))
+
+
+def _format(code_str: str) -> str:
+    """Format a code string using Black.
+
+    Parameters
+    ----------
+    code_str: str
+
+        The string containing Python/TVMScript code to format
+
+    Returns
+    -------
+    formatted: str
+
+        The formatted Python/TVMScript code
+    """
+    try:
+        # pylint: disable=import-outside-toplevel
+        import black
+    except ImportError as err:
+        with warnings.catch_warnings():
+            warnings.simplefilter("once", UserWarning)
+            install_cmd = sys.executable + ' -m pip install "black==22.3.0" --upgrade --user'
+            warnings.warn(
+                str(err)
+                + "\n"
+                + "To print formatted TVM script, please install the formatter 'Black':\n"
+                + install_cmd,
+                category=UserWarning,
+            )
+        return code_str
+    else:
+        return black.format_str(code_str, mode=black.FileMode())
+
+
+def _get_pygments_style(
+    style: Optional[str], is_in_notebook: bool
+) -> Optional[Union["pygments.style.Style", str]]:
+    """Select a pygments style to use
+
+    Parameters
+    ----------
+    style: str
+
+        The style specifier to use.  If None, auto-select a style.
+
+    is_in_notebook: bool
+
+        Whether python is currently running in a jupyter notebook.
+        Used for automatic selection.
+
+    Returns
+    -------
+    style: Optional[Union['pygments.style.Style',str]]
+
+        If pygments is installed, the style object or string, suitable
+        for use as the "style" argument to pygments formatters.  If
+        pygments is not installed, returns None.
+
+    """
     try:
         # pylint: disable=import-outside-toplevel
         import pygments
         from packaging import version
-        from pygments import highlight
-        from pygments.formatters import HtmlFormatter, Terminal256Formatter
-        from pygments.lexers.python import Python3Lexer
         from pygments.style import Style
         from pygments.token import Comment, Keyword, Name, Number, Operator, String
 
@@ -69,82 +170,75 @@ def cprint(printable: Union[IRModule, PrimFunc, str], style: Optional[str] = Non
                 + install_cmd,
                 category=UserWarning,
             )
-        print(printable)
-    else:
+        return None
 
-        class JupyterLight(Style):
-            """A Jupyter-Notebook-like Pygments style configuration (aka. "light")"""
-
-            background_color = ""
-            styles = {
-                Keyword: "bold #008000",
-                Keyword.Type: "nobold #008000",
-                Name.Function: "#0000FF",
-                Name.Class: "bold #0000FF",
-                Name.Decorator: "#AA22FF",
-                String: "#BA2121",
-                Number: "#008000",
-                Operator: "bold #AA22FF",
-                Operator.Word: "bold #008000",
-                Comment: "italic #007979",
-            }
-
-        class VSCDark(Style):
-            """A VSCode-Dark-like Pygments style configuration (aka. "dark")"""
-
-            background_color = ""
-            styles = {
-                Keyword: "bold #c586c0",
-                Keyword.Type: "#82aaff",
-                Keyword.Namespace: "#4ec9b0",
-                Name.Class: "bold #569cd6",
-                Name.Function: "bold #dcdcaa",
-                Name.Decorator: "italic #fe4ef3",
-                String: "#ce9178",
-                Number: "#b5cea8",
-                Operator: "#bbbbbb",
-                Operator.Word: "#569cd6",
-                Comment: "italic #6a9956",
-            }
-
-        class AnsiTerminalDefault(Style):
-            """The default style for terminal display with ANSI colors (aka. "ansi")"""
-
-            background_color = ""
-            styles = {
-                Keyword: "bold ansigreen",
-                Keyword.Type: "nobold ansigreen",
-                Name.Class: "bold ansiblue",
-                Name.Function: "bold ansiblue",
-                Name.Decorator: "italic ansibrightmagenta",
-                String: "ansiyellow",
-                Number: "ansibrightgreen",
-                Operator: "bold ansimagenta",
-                Operator.Word: "bold ansigreen",
-                Comment: "italic ansibrightblack",
-            }
-
-        is_in_notebook = "ipykernel" in sys.modules  # in notebook env (support html display).
-
-        if style is None:
-            # choose style automatically according to the environment:
-            style = JupyterLight if is_in_notebook else AnsiTerminalDefault
-        elif style == "light":
-            style = JupyterLight
-        elif style == "dark":
-            style = VSCDark
-        elif style == "ansi":
-            style = AnsiTerminalDefault
-
-        if is_in_notebook:  # print with HTML display
-            from IPython.display import (  # pylint: disable=import-outside-toplevel
-                HTML,
-                display,
-            )
+    class JupyterLight(Style):
+        """A Jupyter-Notebook-like Pygments style configuration (aka. "light")"""
+
+        background_color = ""
+        styles = {
+            Keyword: "bold #008000",
+            Keyword.Type: "nobold #008000",
+            Name.Function: "#0000FF",
+            Name.Class: "bold #0000FF",
+            Name.Decorator: "#AA22FF",
+            String: "#BA2121",
+            Number: "#008000",
+            Operator: "bold #AA22FF",
+            Operator.Word: "bold #008000",
+            Comment: "italic #007979",
+        }
+
+    class VSCDark(Style):
+        """A VSCode-Dark-like Pygments style configuration (aka. "dark")"""
+
+        background_color = ""
+        styles = {
+            Keyword: "bold #c586c0",
+            Keyword.Type: "#82aaff",
+            Keyword.Namespace: "#4ec9b0",
+            Name.Class: "bold #569cd6",
+            Name.Function: "bold #dcdcaa",
+            Name.Decorator: "italic #fe4ef3",
+            String: "#ce9178",
+            Number: "#b5cea8",
+            Operator: "#bbbbbb",
+            Operator.Word: "#569cd6",
+            Comment: "italic #6a9956",
+        }
+
+    class AnsiTerminalDefault(Style):
+        """The default style for terminal display with ANSI colors (aka. "ansi")"""
+
+        background_color = ""
+        styles = {
+            Keyword: "bold ansigreen",
+            Keyword.Type: "nobold ansigreen",
+            Name.Class: "bold ansiblue",
+            Name.Function: "bold ansiblue",
+            Name.Decorator: "italic ansibrightmagenta",
+            String: "ansiyellow",
+            Number: "ansibrightgreen",
+            Operator: "bold ansimagenta",
+            Operator.Word: "bold ansigreen",
+            Comment: "italic ansibrightblack",
+        }
+
+    if style == "light":
+        return JupyterLight
+    elif style == "dark":
+        return VSCDark
+    elif style == "ansi":
+        return AnsiTerminalDefault
+
+    if style is not None:
+        return style
+
+    style_from_environment = os.environ.get("TVM_PYGMENTIZE_STYLE", "").strip()
+    if style_from_environment:
+        return style_from_environment
+
+    if is_in_notebook:
+        return JupyterLight
 
-            formatter = HtmlFormatter(style=JupyterLight)
-            formatter.noclasses = True  # inline styles
-            html = highlight(printable, Python3Lexer(), formatter)
-            display(HTML(html))
-        else:
-            print(highlight(printable, Python3Lexer(), Terminal256Formatter(style=style)))
+    return AnsiTerminalDefault
diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py
index c5cc922a3e48..082faeb456d3 100644
--- a/python/tvm/tir/function.py
+++ b/python/tvm/tir/function.py
@@ -189,18 +189,24 @@ def script(self, tir_prefix: str = "T", show_meta: bool = False) -> str:
             self, tir_prefix, show_meta
         )  # type: ignore
 
-    def show(self, style: Optional[str] = None) -> None:
-        """
-        A sugar for print highlighted TVM script.
+    def show(self, style: Optional[str] = None, black_format: bool = True) -> None:
+        """A sugar for print highlighted TVM script.
+
         Parameters
         ----------
         style : str, optional
-            Pygments styles extended by "light" (default) and "dark", by default "light"
+
+            Pygmentize printing style, auto-detected if None.  See
+            `tvm.script.highlight.cprint` for more details.
+
+        black_format: bool
+
+            If true (default), use the formatter Black to format the TVMScript
         """
         from tvm.script.highlight import cprint  # pylint: disable=import-outside-toplevel
 
         # Use deferred import to avoid circular import while keeping cprint under tvm/script
-        cprint(self, style=style)
+        cprint(self, style=style, black_format=black_format)
 
 
 @tvm._ffi.register_object("tir.TensorIntrin")
diff --git a/python/tvm/tir/schedule/trace.py b/python/tvm/tir/schedule/trace.py
index da599081df3b..99e48debfefd 100644
--- a/python/tvm/tir/schedule/trace.py
+++ b/python/tvm/tir/schedule/trace.py
@@ -259,16 +259,22 @@ def apply_json_to_schedule(json_obj: JSON_TYPE, sch: "Schedule") -> None:
         """
         _ffi_api.TraceApplyJSONToSchedule(json_obj, sch)  # type: ignore # pylint: disable=no-member
 
-    def show(self, style: Optional[str] = None) -> None:
-        """A sugar for print highlighted trace.
+    def show(self, style: Optional[str] = None, black_format: bool = True) -> None:
+        """A sugar for print highlighted TVM script.
 
         Parameters
         ----------
         style : str, optional
-            Pygments styles extended by "light" (default) and "dark", by default "light"
+
+            Pygmentize printing style, auto-detected if None.  See
+            `tvm.script.highlight.cprint` for more details.
+
+        black_format: bool
+
+            If true (default), use the formatter Black to format the TVMScript
         """
         from tvm.script.highlight import (  # pylint: disable=import-outside-toplevel
             cprint,
         )
 
-        cprint(str(self), style=style)
+        cprint(str(self), style=style, black_format=black_format)

From 8cccc253dae1474e17eca47ce67b7a34e0c41eba Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 22 Nov 2022 21:19:47 -0800
Subject: [PATCH 642/704] [ci] Enable CRT tests for CPU minimal build (#13471)

Fixes #13462
---
 Jenkinsfile                                | 6 ++++--
 ci/jenkins/Build.groovy.j2                 | 3 ++-
 ci/jenkins/Test.groovy.j2                  | 1 +
 tests/scripts/task_config_build_minimal.sh | 1 -
 4 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 23c022abd335..377832461ef6 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-11-19T01:24:31.191996
+// Generated at 2022-11-22T15:04:11.262643
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -812,9 +812,10 @@ stage('Build') {
             label: 'Create CPU minimal cmake config',
           )
           cmake_build(ci_minimal, 'build', '-j2')
+          make_standalone_crt(ci_minimal, 'build')
           make_cpp_tests(ci_minimal, 'build')
           sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu-minimal --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/cpptest build/build.ninja build/CMakeFiles/rules.ninja",
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu-minimal --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/standalone_crt build/build.ninja",
             label: 'Upload artifacts to S3',
           )
           }
@@ -3257,6 +3258,7 @@ def run_unittest_minimal() {
             )
 
               cpp_unittest(ci_minimal)
+              micro_cpp_unittest(ci_minimal)
               python_unittest(ci_minimal)
             })
           } finally {
diff --git a/ci/jenkins/Build.groovy.j2 b/ci/jenkins/Build.groovy.j2
index 914e3e99b659..7592079ef8d1 100644
--- a/ci/jenkins/Build.groovy.j2
+++ b/ci/jenkins/Build.groovy.j2
@@ -116,8 +116,9 @@ stage('Build') {
       label: 'Create CPU minimal cmake config',
     )
     cmake_build(ci_minimal, 'build', '-j2')
+    make_standalone_crt(ci_minimal, 'build')
     make_cpp_tests(ci_minimal, 'build')
-    {{ m.upload_artifacts(tag='cpu-minimal', filenames=tvm_lib + tvm_allvisible + cpptest) }}
+    {{ m.upload_artifacts(tag='cpu-minimal', filenames=tvm_lib + tvm_allvisible + crttest + cpptest + standalone_crt) }}
   {% endcall %}
 
   {% call m.build_step(
diff --git a/ci/jenkins/Test.groovy.j2 b/ci/jenkins/Test.groovy.j2
index eb7c8fdc0c0d..274a3e2dce6c 100644
--- a/ci/jenkins/Test.groovy.j2
+++ b/ci/jenkins/Test.groovy.j2
@@ -265,6 +265,7 @@ def run_unittest_minimal() {
     ) %}
       {{ m.download_artifacts(tag='cpu-minimal') }}
       cpp_unittest(ci_minimal)
+      micro_cpp_unittest(ci_minimal)
       python_unittest(ci_minimal)
   {% endcall %}
 }
diff --git a/tests/scripts/task_config_build_minimal.sh b/tests/scripts/task_config_build_minimal.sh
index 27dc9d77567a..651f54cea21b 100755
--- a/tests/scripts/task_config_build_minimal.sh
+++ b/tests/scripts/task_config_build_minimal.sh
@@ -32,4 +32,3 @@ echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
-echo set\(USE_MICRO OFF\) >> config.cmake

From b2058f4dd2e0ae1fc5ab51ac9f84b372a389a65a Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Wed, 23 Nov 2022 08:20:10 +0300
Subject: [PATCH 643/704] [CI][Docker] Store GTest sources in GPU docker image
 (#13468)

It is necessary to have google tests source code to build them for the
GPU device and run OpenCL cpp tests. In #13400 we enable OpenCL cpp
tests in CI.
---
 docker/Dockerfile.ci_gpu                    |  2 +-
 docker/install/ubuntu_install_googletest.sh | 21 ++++++++++++---------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index a9ddc22c97a2..9917e4cc78a7 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -39,7 +39,7 @@ COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source
 RUN bash /install/ubuntu_install_cmake_source.sh
 
 COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh
-RUN bash /install/ubuntu_install_googletest.sh
+RUN bash /install/ubuntu_install_googletest.sh /googletest
 
 ENV TVM_VENV /venv/apache-tvm-py3.7
 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
diff --git a/docker/install/ubuntu_install_googletest.sh b/docker/install/ubuntu_install_googletest.sh
index 6cc2a128a0d7..9a3473da4e8d 100755
--- a/docker/install/ubuntu_install_googletest.sh
+++ b/docker/install/ubuntu_install_googletest.sh
@@ -20,14 +20,19 @@ set -e
 set -u
 set -o pipefail
 
-tmpdir=$(mktemp -d)
+if [ $# -eq 0 ]; then
+    tmpdir=$(mktemp -d)
 
-cleanup()
-{
-  rm -rf "$tmpdir"
-}
+    cleanup()
+    {
+      rm -rf "$tmpdir"
+    }
 
-trap cleanup 0
+    trap cleanup 0
+else
+    tmpdir=$1
+    mkdir -p "$tmpdir"
+fi
 
 # GoogleTest uses a Live-at-Head philosophy:
 # https://github.com/google/googletest#live-at-head
@@ -38,16 +43,14 @@ repo_revision="830fb567285c63ab5b5873e2e8b02f2249864916"
 
 archive_name="${repo_revision}.tar.gz"
 archive_url="${repo_url}/archive/${archive_name}"
-archive_folder="googletest-${repo_revision}"
 archive_hash="10f10ed771efc64a1d8234a7e4801838a468f8990e5d6d8fcf63e89f8d1455c4f9c5adc0bb829669f381609a9abf84e4c91a7fdd7404630f375f38fb485ef0eb"
 
 cd "$tmpdir"
 
 curl -sL "${archive_url}" -o "${archive_name}"
 echo "$archive_hash" ${archive_name} | sha512sum -c
-tar xf "${archive_name}"
+tar xf "${archive_name}" --strip-components=1
 
-cd ${archive_folder}
 mkdir build
 cd build
 

From ca5bc958d10dd3a504d2f2256a588ff194b7cb34 Mon Sep 17 00:00:00 2001
From: Huan Mei <mable_meihuan@163.com>
Date: Wed, 23 Nov 2022 15:28:47 +0800
Subject: [PATCH 644/704] [tir]delete useless param in driver_api.cc (#13474)

delete useless param in driver_api.cc
---
 src/driver/driver_api.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index e5e3998b1e7b..90676e0b840b 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -422,8 +422,6 @@ std::pair<IRModule, IRModule> SplitMixedModule(IRModule mod_mixed, const Target&
 
 runtime::Module TIRToRuntime(const Map<Target, IRModule>& inputs_arg,
                              const Target& target_host_arg) {
-  auto pass_ctx = transform::PassContext::Current();
-
   std::vector<runtime::Module> device_modules;
   Map<Target, IRModule> inputs = inputs_arg;
   Target target_host = target_host_arg;

From dcea36e76d12f28342c700789d51ccc401f60bfb Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Wed, 23 Nov 2022 11:42:46 -0800
Subject: [PATCH 645/704] Add python venvs to demo_ images that build (#13435)

Add python venvs to demo_ images that build.
---
 docker/Dockerfile.demo_android  | 7 +++++++
 docker/Dockerfile.demo_rocm     | 8 ++++++--
 docker/Dockerfile.demo_vitis_ai | 4 ++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android
index 8a461269e75d..cb5a9e0015ab 100644
--- a/docker/Dockerfile.demo_android
+++ b/docker/Dockerfile.demo_android
@@ -28,8 +28,12 @@ RUN bash /install/ubuntu_setup_tz.sh
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
 COPY install/ubuntu_install_python.sh /install/ubuntu1804_install_python.sh
 RUN bash /install/ubuntu1804_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh
@@ -54,6 +58,9 @@ RUN bash /install/ubuntu_install_vulkan.sh
 
 ENV VULKAN_SDK=/usr
 
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
 RUN git clone https://github.com/KhronosGroup/OpenCL-Headers /usr/local/OpenCL-Headers/
 
 # Build TVM
diff --git a/docker/Dockerfile.demo_rocm b/docker/Dockerfile.demo_rocm
index 3f3ffef78e68..b4cb83f7f68c 100644
--- a/docker/Dockerfile.demo_rocm
+++ b/docker/Dockerfile.demo_rocm
@@ -26,8 +26,12 @@ RUN bash /install/ubuntu_setup_tz.sh
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
-RUN bash /install/ubuntu1804_install_python.sh
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh
diff --git a/docker/Dockerfile.demo_vitis_ai b/docker/Dockerfile.demo_vitis_ai
index fa024767fd1e..c90091e7ecd2 100644
--- a/docker/Dockerfile.demo_vitis_ai
+++ b/docker/Dockerfile.demo_vitis_ai
@@ -32,8 +32,12 @@ RUN bash /install/ubuntu_install_core.sh
 COPY install/ubuntu_install_vitis_ai_core.sh /install/ubuntu_install_vitis_ai_core.sh
 RUN bash /install/ubuntu_install_vitis_ai_core.sh
 
+ENV TVM_VENV /venv/apache-tvm-py3.7
+COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
 COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
 RUN bash /install/ubuntu_install_python.sh
+ENV PATH ${TVM_VENV}/bin:$PATH
+ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
 
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh

From 3680b3cb49f82a4dcff559ace509ed2ed3aa02da Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 23 Nov 2022 18:52:23 -0600
Subject: [PATCH 646/704] [microTVM][Zephyr] Add 'serial_number' option
 (#13377)

This commit adds a new option 'serial_number' to allow specifying the
serial device number in Zephyr, used to flash and to communicate with
target boards attached to specific serial ports, and adjusts Zephyr
tests to run with this new option. This is particularly useful in an
environment where multiple boards are connected to the host.

It also removes 'west_cmd' in test arguments, as it is not used, and
refactors Zephyr server API to set the required/optional project options
at the beginning of each API call.
---
 .../template_project/CMakeLists.txt.template  |   4 +-
 .../template_project/microtvm_api_server.py   | 192 +++++++++++-------
 tests/micro/project_api/__init__.py           |  18 ++
 tests/micro/project_api/test_project_api.py   |  34 +---
 .../test_zephyr_microtvm_api_server.py        |  31 +++
 tests/micro/project_api/utils.py              |  58 ++++++
 tests/micro/zephyr/README.md                  |   6 +
 tests/micro/zephyr/__init__.py                |  18 ++
 tests/micro/zephyr/conftest.py                |  18 +-
 tests/micro/zephyr/test_zephyr.py             | 123 ++++++-----
 tests/micro/zephyr/test_zephyr_aot_exec.py    |  34 +---
 .../zephyr/test_zephyr_aot_exec_standalone.py |  31 ++-
 tests/micro/zephyr/test_zephyr_armv7m.py      |  30 +--
 .../micro/zephyr/{test_utils.py => utils.py}  |  16 +-
 14 files changed, 382 insertions(+), 231 deletions(-)
 create mode 100644 tests/micro/project_api/__init__.py
 create mode 100644 tests/micro/project_api/test_zephyr_microtvm_api_server.py
 create mode 100644 tests/micro/project_api/utils.py
 create mode 100644 tests/micro/zephyr/__init__.py
 rename tests/micro/zephyr/{test_utils.py => utils.py} (96%)

diff --git a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
index bbd975315e88..9386576c394b 100644
--- a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
+++ b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
@@ -28,9 +28,7 @@ set(QEMU_PIPE <QEMU_PIPE> CACHE PATH "Path to QEMU pipe")
 find_package(Zephyr HINTS $ENV{ZEPHYR_BASE})
 project(microtvm_autogenerated_project)
 
-if(${ENABLE_CMSIS})
-  set(CMSIS_PATH <CMSIS_PATH>)
-
+if(DEFINED CMSIS_PATH)
   file(GLOB_RECURSE cmsis_lib_srcs
     ${CMSIS_PATH}/CMSIS/NN/Source/ActivationFunctions/*.c
     ${CMSIS_PATH}/CMSIS/NN/Source/BasicMathFunctions/*.c
diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
index 568d958fb033..9a8015d62571 100644
--- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py
+++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
@@ -97,6 +97,10 @@ def check_call(cmd_args, *args, **kwargs):
     + [(k, False) for k in ("0", "OFF", "NO", "FALSE", "N", "IGNORE", "NOTFOUND", "")]
 )
 
+CMSIS_PATH_ERROR = (
+    "cmsis_path is not defined! Please pass it as an option or set the `CMSIS_PATH` env variable."
+)
+
 
 class CMakeCache(collections.abc.Mapping):
     def __init__(self, path):
@@ -261,11 +265,11 @@ def generic_find_serial_port(serial_number=None):
 
 
 def _get_openocd_device_args(options):
-    serial_number = options.get("openocd_serial")
+    serial_number = options.get("serial_number")
     return ["--serial", generic_find_serial_port(serial_number)]
 
 
-def _get_nrf_device_args(options):
+def _get_nrf_device_args(serial_number: str):
     nrfjprog_args = ["nrfjprog", "--ids"]
     nrfjprog_ids = subprocess.check_output(nrfjprog_args, encoding="utf-8")
     if not nrfjprog_ids.strip("\n"):
@@ -273,17 +277,17 @@ def _get_nrf_device_args(options):
 
     boards = nrfjprog_ids.split("\n")[:-1]
     if len(boards) > 1:
-        if options["nrfjprog_snr"] is None:
+        if serial_number is None:
             raise BoardError(
                 "Multiple boards connected; specify one with nrfjprog_snr=: " f'{", ".join(boards)}'
             )
 
-        if str(options["nrfjprog_snr"]) not in boards:
+        if serial_number not in boards:
             raise BoardError(
-                f"nrfjprog_snr ({options['nrfjprog_snr']}) not found in {nrfjprog_args}: {boards}"
+                f"serial number ({serial_number}) not found in {nrfjprog_args}: {boards}"
             )
 
-        return ["--snr", options["nrfjprog_snr"]]
+        return ["--snr", serial_number]
 
     if not boards:
         return []
@@ -310,18 +314,11 @@ def _get_nrf_device_args(options):
         help=("If given, port number to use when running the local gdbserver."),
     ),
     server.ProjectOption(
-        "nrfjprog_snr",
-        optional=["open_transport"],
-        type="int",
-        default=None,
-        help=("When used with nRF targets, serial # of the attached board to use, from nrfjprog."),
-    ),
-    server.ProjectOption(
-        "openocd_serial",
-        optional=["open_transport"],
-        type="int",
+        "serial_number",
+        optional=["open_transport", "flash"],
+        type="str",
         default=None,
-        help=("When used with OpenOCD targets, serial # of the attached board to use."),
+        help=("Board serial number."),
     ),
     server.ProjectOption(
         "west_cmd",
@@ -372,7 +369,7 @@ def _get_nrf_device_args(options):
 ]
 
 
-def get_zephyr_base(options: dict):
+def get_zephyr_base(options: dict) -> str:
     """Returns Zephyr base path"""
     zephyr_base = options.get("zephyr_base", ZEPHYR_BASE)
     assert zephyr_base, "'zephyr_base' option not passed and not found by default!"
@@ -381,9 +378,15 @@ def get_zephyr_base(options: dict):
 
 def get_cmsis_path(options: dict) -> pathlib.Path:
     """Returns CMSIS dependency path"""
-    cmsis_path = options.get("cmsis_path")
-    assert cmsis_path, "'cmsis_path' option not passed!"
-    return pathlib.Path(cmsis_path)
+    cmsis_path = options.get("cmsis_path", os.environ.get("CMSIS_PATH", None))
+    if cmsis_path:
+        return pathlib.Path(cmsis_path)
+    return None
+
+
+def get_west_cmd(options: dict) -> str:
+    """Returns west command"""
+    return options.get("west_cmd", WEST_CMD)
 
 
 class Handler(server.ProjectAPIHandler):
@@ -421,8 +424,9 @@ def server_info_query(self, tvm_version):
         ),
     }
 
-    def _create_prj_conf(self, project_dir, options):
-        zephyr_board = options["board"]
+    def _create_prj_conf(
+        self, project_dir: pathlib.Path, board: str, project_type: str, config_main_stack_size
+    ):
         with open(project_dir / "prj.conf", "w") as f:
             f.write(
                 "# For UART used from main().\n"
@@ -433,7 +437,7 @@ def _create_prj_conf(self, project_dir, options):
             )
             f.write("# For TVMPlatformAbort().\n" "CONFIG_REBOOT=y\n" "\n")
 
-            if options["project_type"] == "host_driven":
+            if project_type == "host_driven":
                 f.write(
                     "CONFIG_TIMING_FUNCTIONS=y\n"
                     "# For RPC server C++ bindings.\n"
@@ -444,22 +448,22 @@ def _create_prj_conf(self, project_dir, options):
 
             f.write("# For math routines\n" "CONFIG_NEWLIB_LIBC=y\n" "\n")
 
-            if self._has_fpu(zephyr_board):
+            if self._has_fpu(board):
                 f.write("# For models with floating point.\n" "CONFIG_FPU=y\n" "\n")
 
             # Set main stack size, if needed.
-            if options.get("config_main_stack_size") is not None:
-                f.write(f"CONFIG_MAIN_STACK_SIZE={options['config_main_stack_size']}\n")
+            if config_main_stack_size is not None:
+                f.write(f"CONFIG_MAIN_STACK_SIZE={config_main_stack_size}\n")
 
             f.write("# For random number generation.\n" "CONFIG_TEST_RANDOM_GENERATOR=y\n")
 
             f.write("\n# Extra prj.conf directives\n")
             for line, board_list in self.EXTRA_PRJ_CONF_DIRECTIVES.items():
-                if zephyr_board in board_list:
+                if board in board_list:
                     f.write(f"{line}\n")
 
             # TODO(mehrdadh): due to https://github.com/apache/tvm/issues/12721
-            if zephyr_board not in ["qemu_riscv64"]:
+            if board not in ["qemu_riscv64"]:
                 f.write("# For setting -O2 in compiler.\n" "CONFIG_SPEED_OPTIMIZATIONS=y\n")
 
             f.write("\n")
@@ -467,7 +471,6 @@ def _create_prj_conf(self, project_dir, options):
     API_SERVER_CRT_LIBS_TOKEN = "<API_SERVER_CRT_LIBS>"
     CMAKE_ARGS_TOKEN = "<CMAKE_ARGS>"
     QEMU_PIPE_TOKEN = "<QEMU_PIPE>"
-    CMSIS_PATH_TOKEN = "<CMSIS_PATH>"
 
     CRT_LIBS_BY_PROJECT_TYPE = {
         "host_driven": "microtvm_rpc_server microtvm_rpc_common aot_executor_module aot_executor common",
@@ -504,42 +507,68 @@ def _cmsis_required(self, project_path: Union[str, pathlib.Path]) -> bool:
                     return True
         return False
 
-    def _generate_cmake_args(self, mlf_extracted_path, options) -> str:
+    def _generate_cmake_args(
+        self,
+        mlf_extracted_path: pathlib.Path,
+        board: str,
+        use_fvp: bool,
+        west_cmd: str,
+        zephyr_base: str,
+        verbose: bool,
+        cmsis_path: pathlib.Path,
+    ) -> str:
         cmake_args = "\n# cmake args\n"
-        if options.get("verbose"):
+        if verbose:
             cmake_args += "set(CMAKE_VERBOSE_MAKEFILE TRUE)\n"
 
-        if options.get("zephyr_base"):
-            cmake_args += f"set(ZEPHYR_BASE {options['zephyr_base']})\n"
+        if zephyr_base:
+            cmake_args += f"set(ZEPHYR_BASE {zephyr_base})\n"
 
-        if options.get("west_cmd"):
-            cmake_args += f"set(WEST {options['west_cmd']})\n"
+        if west_cmd:
+            cmake_args += f"set(WEST {west_cmd})\n"
 
-        if self._is_qemu(options["board"], options.get("use_fvp")):
+        if self._is_qemu(board, use_fvp):
             # Some boards support more than one emulator, so ensure QEMU is set.
             cmake_args += f"set(EMU_PLATFORM qemu)\n"
 
-        if self._is_fvp(options["board"], options.get("use_fvp")):
+        if self._is_fvp(board, use_fvp):
             cmake_args += "set(EMU_PLATFORM armfvp)\n"
             cmake_args += "set(ARMFVP_FLAGS -I)\n"
 
-        cmake_args += f"set(BOARD {options['board']})\n"
+        cmake_args += f"set(BOARD {board})\n"
 
-        enable_cmsis = self._cmsis_required(mlf_extracted_path)
-        if enable_cmsis:
-            assert os.environ.get("CMSIS_PATH"), "CMSIS_PATH is not defined."
-        cmake_args += f"set(ENABLE_CMSIS {str(enable_cmsis).upper()})\n"
+        if self._cmsis_required(mlf_extracted_path):
+            assert cmsis_path, CMSIS_PATH_ERROR
+        cmake_args += f"set(CMSIS_PATH {str(cmsis_path)})\n"
 
         return cmake_args
 
     def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options):
         zephyr_board = options["board"]
+        project_type = options["project_type"]
+
+        zephyr_base = get_zephyr_base(options)
+        warning_as_error = options.get("warning_as_error")
+        use_fvp = options.get("use_fvp")
+        west_cmd = get_west_cmd(options)
+        verbose = options.get("verbose")
+
+        recommended_heap_size = _get_recommended_heap_size_bytes(options)
+        heap_size_bytes = options.get("heap_size_bytes") or recommended_heap_size
+        board_mem_size = _get_board_mem_size_bytes(options)
+
+        compile_definitions = options.get("compile_definitions")
+        config_main_stack_size = options.get("config_main_stack_size")
+
+        extra_files_tar = options.get("extra_files_tar")
+
+        cmsis_path = get_cmsis_path(options)
 
         # Check Zephyr version
-        version = self._get_platform_version(get_zephyr_base(options))
+        version = self._get_platform_version(zephyr_base)
         if version != ZEPHYR_VERSION:
             message = f"Zephyr version found is not supported: found {version}, expected {ZEPHYR_VERSION}."
-            if options.get("warning_as_error") is not None and options["warning_as_error"]:
+            if warning_as_error is not None and warning_as_error:
                 raise server.ServerError(message=message)
             _LOG.warning(message)
 
@@ -570,9 +599,9 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
             os.makedirs(extract_path)
             tf.extractall(path=extract_path)
 
-        if self._is_qemu(zephyr_board, options.get("use_fvp")):
+        if self._is_qemu(zephyr_board, use_fvp):
             shutil.copytree(API_SERVER_DIR / "qemu-hack", project_dir / "qemu-hack")
-        elif self._is_fvp(zephyr_board, options.get("use_fvp")):
+        elif self._is_fvp(zephyr_board, use_fvp):
             shutil.copytree(API_SERVER_DIR / "fvp-hack", project_dir / "fvp-hack")
 
         # Populate CRT.
@@ -591,42 +620,43 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
             with open(API_SERVER_DIR / f"{CMAKELIST_FILENAME}.template", "r") as cmake_template_f:
                 for line in cmake_template_f:
                     if self.API_SERVER_CRT_LIBS_TOKEN in line:
-                        crt_libs = self.CRT_LIBS_BY_PROJECT_TYPE[options["project_type"]]
+                        crt_libs = self.CRT_LIBS_BY_PROJECT_TYPE[project_type]
                         line = line.replace("<API_SERVER_CRT_LIBS>", crt_libs)
 
                     if self.CMAKE_ARGS_TOKEN in line:
-                        line = self._generate_cmake_args(extract_path, options)
+                        line = self._generate_cmake_args(
+                            extract_path,
+                            zephyr_board,
+                            use_fvp,
+                            west_cmd,
+                            zephyr_base,
+                            verbose,
+                            cmsis_path,
+                        )
 
                     if self.QEMU_PIPE_TOKEN in line:
                         self.qemu_pipe_dir = pathlib.Path(tempfile.mkdtemp())
                         line = line.replace(self.QEMU_PIPE_TOKEN, str(self.qemu_pipe_dir / "fifo"))
 
-                    if self.CMSIS_PATH_TOKEN in line and self._cmsis_required(extract_path):
-                        line = line.replace(self.CMSIS_PATH_TOKEN, str(os.environ["CMSIS_PATH"]))
-
                     cmake_f.write(line)
 
-                heap_size = _get_recommended_heap_size_bytes(options)
-                if options.get("heap_size_bytes"):
-                    board_mem_size = _get_board_mem_size_bytes(options)
-                    heap_size = options["heap_size_bytes"]
-                    if board_mem_size is not None:
-                        assert (
-                            heap_size < board_mem_size
-                        ), f"Heap size {heap_size} is larger than memory size {board_mem_size} on this board."
+                if board_mem_size is not None:
+                    assert (
+                        heap_size_bytes < board_mem_size
+                    ), f"Heap size {heap_size_bytes} is larger than memory size {board_mem_size} on this board."
                 cmake_f.write(
-                    f"target_compile_definitions(app PUBLIC -DHEAP_SIZE_BYTES={heap_size})\n"
+                    f"target_compile_definitions(app PUBLIC -DHEAP_SIZE_BYTES={heap_size_bytes})\n"
                 )
 
-                if options.get("compile_definitions"):
-                    flags = options.get("compile_definitions")
+                if compile_definitions:
+                    flags = compile_definitions
                     for item in flags:
                         cmake_f.write(f"target_compile_definitions(app PUBLIC {item})\n")
 
-                if self._is_fvp(zephyr_board, options.get("use_fvp")):
+                if self._is_fvp(zephyr_board, use_fvp):
                     cmake_f.write(f"target_compile_definitions(app PUBLIC -DFVP=1)\n")
 
-        self._create_prj_conf(project_dir, options)
+        self._create_prj_conf(project_dir, zephyr_board, project_type, config_main_stack_size)
 
         # Populate crt-config.h
         crt_config_dir = project_dir / "crt_config"
@@ -637,20 +667,20 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
 
         # Populate src/
         src_dir = project_dir / "src"
-        if options["project_type"] != "host_driven" or self._is_fvp(
-            zephyr_board, options.get("use_fvp")
-        ):
-            shutil.copytree(API_SERVER_DIR / "src" / options["project_type"], src_dir)
+        if project_type != "host_driven" or self._is_fvp(zephyr_board, use_fvp):
+            shutil.copytree(API_SERVER_DIR / "src" / project_type, src_dir)
         else:
             src_dir.mkdir()
-            shutil.copy2(API_SERVER_DIR / "src" / options["project_type"] / "main.c", src_dir)
+            shutil.copy2(API_SERVER_DIR / "src" / project_type / "main.c", src_dir)
 
         # Populate extra_files
-        if options.get("extra_files_tar"):
-            with tarfile.open(options["extra_files_tar"], mode="r:*") as tf:
+        if extra_files_tar:
+            with tarfile.open(extra_files_tar, mode="r:*") as tf:
                 tf.extractall(project_dir)
 
     def build(self, options):
+        verbose = options.get("verbose", None)
+
         if BUILD_DIR.exists():
             shutil.rmtree(BUILD_DIR)
         BUILD_DIR.mkdir()
@@ -672,7 +702,7 @@ def build(self, options):
         check_call(["cmake", "-GNinja", ".."], cwd=BUILD_DIR, env=env)
 
         args = ["ninja"]
-        if options.get("verbose"):
+        if verbose:
             args.append("-v")
         check_call(args, cwd=BUILD_DIR, env=env)
 
@@ -706,6 +736,9 @@ def _has_fpu(cls, zephyr_board):
         return zephyr_board in fpu_boards
 
     def flash(self, options):
+        serial_number = options.get("serial_number")
+        west_cmd_list = get_west_cmd(options).split(" ")
+
         if _find_platform_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME):
             return  # NOTE: qemu requires no flash step--it is launched from open_transport.
 
@@ -717,10 +750,17 @@ def flash(self, options):
         zephyr_board = _find_board_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME)
         if zephyr_board.startswith("nrf5340dk") and _get_flash_runner() == "nrfjprog":
             recover_args = ["nrfjprog", "--recover"]
-            recover_args.extend(_get_nrf_device_args(options))
+            recover_args.extend(_get_nrf_device_args(serial_number))
             check_call(recover_args, cwd=API_SERVER_DIR / "build")
 
-        check_call(["ninja", "flash"], cwd=API_SERVER_DIR / "build")
+        flash_extra_args = []
+        if _get_flash_runner() == "openocd" and serial_number:
+            flash_extra_args = ["--cmd-pre-init", f"""hla_serial {serial_number}"""]
+
+        check_call(
+            west_cmd_list + ["flash", "-r", _get_flash_runner()] + flash_extra_args,
+            cwd=API_SERVER_DIR / "build",
+        )
 
     def open_transport(self, options):
         zephyr_board = _find_board_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME)
@@ -821,7 +861,7 @@ def _find_nrf_serial_port(cls, options):
 
     @classmethod
     def _find_openocd_serial_port(cls, options):
-        serial_number = options.get("openocd_serial")
+        serial_number = options.get("serial_number")
         return generic_find_serial_port(serial_number)
 
     @classmethod
diff --git a/tests/micro/project_api/__init__.py b/tests/micro/project_api/__init__.py
new file mode 100644
index 000000000000..09ce2f87f44a
--- /dev/null
+++ b/tests/micro/project_api/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Test Project API in different platforms infrastructure."""
diff --git a/tests/micro/project_api/test_project_api.py b/tests/micro/project_api/test_project_api.py
index b85f6c09536f..33ece65897a3 100644
--- a/tests/micro/project_api/test_project_api.py
+++ b/tests/micro/project_api/test_project_api.py
@@ -24,6 +24,8 @@
 from tvm.relay.backend import Runtime
 from tvm.micro.testing import get_target
 
+from .utils import build_project_api
+
 API_GENERATE_PROJECT = "generate_project"
 API_BUILD = "build"
 API_FLASH = "flash"
@@ -55,37 +57,7 @@ def test_default_options_exist(platform):
 @tvm.testing.requires_micro
 def test_project_minimal_options(platform):
     """Test template project with minimum projectOptions"""
-    shape = (10,)
-    dtype = "int8"
-    x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype))
-    xx = relay.multiply(x, x)
-    z = relay.add(xx, relay.const(np.ones(shape=shape, dtype=dtype)))
-    func = relay.Function([x], z)
-    ir_mod = tvm.IRModule.from_expr(func)
-
-    if platform == "arduino":
-        board = "due"
-    elif platform == "zephyr":
-        board = "qemu_x86"
-
-    runtime = Runtime("crt", {"system-lib": True})
-    target = get_target(platform, board)
-    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
-        mod = tvm.relay.build(ir_mod, target=target, runtime=runtime)
-
-    project_options = {
-        "project_type": "host_driven",
-        "board": board,
-    }
-
-    temp_dir = tvm.contrib.utils.tempdir()
-    project = tvm.micro.generate_project(
-        tvm.micro.get_microtvm_template_projects(platform),
-        mod,
-        temp_dir / "project",
-        project_options,
-    )
-    project.build()
+    build_project_api(platform)
 
 
 if __name__ == "__main__":
diff --git a/tests/micro/project_api/test_zephyr_microtvm_api_server.py b/tests/micro/project_api/test_zephyr_microtvm_api_server.py
new file mode 100644
index 000000000000..68b98f2fa527
--- /dev/null
+++ b/tests/micro/project_api/test_zephyr_microtvm_api_server.py
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+
+import tvm
+
+from .utils import build_project_api
+
+
+@tvm.testing.requires_micro
+def test_option_cmsis_path():
+    """Test project API without CMSIS_PATH environment variable."""
+    cmsis_path = os.environ.get("CMSIS_PATH", None)
+    del os.environ["CMSIS_PATH"]
+    build_project_api("zephyr")
+    os.environ["CMSIS_PATH"] = cmsis_path
diff --git a/tests/micro/project_api/utils.py b/tests/micro/project_api/utils.py
new file mode 100644
index 000000000000..6f1b41877d3d
--- /dev/null
+++ b/tests/micro/project_api/utils.py
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.relay.backend import Runtime
+from tvm.micro.testing import get_target
+
+
+def build_project_api(platform: str):
+    """Build a relay module with Project API."""
+    shape = (10,)
+    dtype = "int8"
+    x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype))
+    xx = relay.multiply(x, x)
+    z = relay.add(xx, relay.const(np.ones(shape=shape, dtype=dtype)))
+    func = relay.Function([x], z)
+    ir_mod = tvm.IRModule.from_expr(func)
+
+    if platform == "arduino":
+        board = "due"
+    elif platform == "zephyr":
+        board = "qemu_x86"
+
+    runtime = Runtime("crt", {"system-lib": True})
+    target = get_target(platform, board)
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        mod = tvm.relay.build(ir_mod, target=target, runtime=runtime)
+
+    project_options = {
+        "project_type": "host_driven",
+        "board": board,
+    }
+
+    temp_dir = tvm.contrib.utils.tempdir()
+    project = tvm.micro.generate_project(
+        tvm.micro.get_microtvm_template_projects(platform),
+        mod,
+        temp_dir / "project",
+        project_options,
+    )
+    project.build()
diff --git a/tests/micro/zephyr/README.md b/tests/micro/zephyr/README.md
index d41045c3752f..7ae98bbd2d42 100644
--- a/tests/micro/zephyr/README.md
+++ b/tests/micro/zephyr/README.md
@@ -40,3 +40,9 @@ To see the list of supported values for `--board`, run:
 ```
 $ pytest test_zephyr.py --help
 ```
+
+If you like to test with a real hardware, you have the option to pass the serial number
+for your development board.
+```
+$ pytest test_zephyr.py --board=nrf5340dk_nrf5340_cpuapp --serial="0672FF5"
+```
diff --git a/tests/micro/zephyr/__init__.py b/tests/micro/zephyr/__init__.py
new file mode 100644
index 000000000000..15b94a9f78d0
--- /dev/null
+++ b/tests/micro/zephyr/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Testing infrastructure for microTVM Zephyr """
diff --git a/tests/micro/zephyr/conftest.py b/tests/micro/zephyr/conftest.py
index 52e89481b7bb..d3dbf22e4746 100644
--- a/tests/micro/zephyr/conftest.py
+++ b/tests/micro/zephyr/conftest.py
@@ -23,20 +23,17 @@
 
 
 def pytest_addoption(parser):
-    parser.addoption(
-        "--west-cmd", default="west", help="Path to `west` command for flashing device."
-    )
     parser.addoption(
         "--use-fvp",
         action="store_true",
         default=False,
         help="If set true, use the FVP emulator to run the test",
     )
-
-
-@pytest.fixture(scope="session")
-def west_cmd(request):
-    return request.config.getoption("--west-cmd")
+    parser.addoption(
+        "--serial",
+        default=None,
+        help="If set true, use the FVP emulator to run the test",
+    )
 
 
 @pytest.fixture
@@ -44,6 +41,11 @@ def use_fvp(request):
     return request.config.getoption("--use-fvp")
 
 
+@pytest.fixture
+def serial_number(request):
+    return request.config.getoption("--serial")
+
+
 @pytest.fixture(autouse=True)
 def xfail_on_fvp(request, use_fvp):
     """mark the tests as xfail if running on fvp."""
diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py
index 0ef1dd2ce211..a8fb26133970 100644
--- a/tests/micro/zephyr/test_zephyr.py
+++ b/tests/micro/zephyr/test_zephyr.py
@@ -33,13 +33,21 @@
 from tvm.contrib import utils
 from tvm.micro.testing.utils import check_tune_log
 
-import test_utils
+from . import utils
 
 _LOG = logging.getLogger(__name__)
 
 
 def _make_sess_from_op(
-    temp_dir, model, zephyr_board, west_cmd, op_name, sched, arg_bufs, build_config, use_fvp
+    temp_dir,
+    model,
+    zephyr_board,
+    op_name,
+    sched,
+    arg_bufs,
+    build_config,
+    use_fvp,
+    serial_number,
 ):
     runtime = Runtime("crt", {"system-lib": True})
     target = tvm.target.target.micro(model)
@@ -47,27 +55,27 @@ def _make_sess_from_op(
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         mod = tvm.build(sched, arg_bufs, target=target, runtime=runtime, name=op_name)
 
-    return _make_session(temp_dir, zephyr_board, west_cmd, mod, build_config, use_fvp)
+    return _make_session(temp_dir, zephyr_board, mod, build_config, use_fvp, serial_number)
 
 
-def _make_session(temp_dir, zephyr_board, west_cmd, mod, build_config, use_fvp):
+def _make_session(temp_dir, zephyr_board, mod, build_config, use_fvp, serial_number):
     config_main_stack_size = None
-    if test_utils.qemu_boards(zephyr_board):
+    if utils.qemu_boards(zephyr_board):
         config_main_stack_size = 1536
 
     project_options = {
         "project_type": "host_driven",
-        "west_cmd": west_cmd,
         "verbose": bool(build_config.get("debug")),
         "board": zephyr_board,
         "arm_fvp_path": "/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4/FVP_Corstone_SSE-300_Ethos-U55",
         "use_fvp": bool(use_fvp),
+        "serial_number": serial_number,
     }
     if config_main_stack_size is not None:
         project_options["config_main_stack_size"] = config_main_stack_size
 
     project = tvm.micro.generate_project(
-        str(test_utils.TEMPLATE_PROJECT_DIR),
+        str(utils.TEMPLATE_PROJECT_DIR),
         mod,
         temp_dir / "project",
         project_options,
@@ -77,13 +85,23 @@ def _make_session(temp_dir, zephyr_board, west_cmd, mod, build_config, use_fvp):
     return tvm.micro.Session(project.transport())
 
 
-def _make_add_sess(temp_dir, model, zephyr_board, west_cmd, build_config, use_fvp, dtype="int8"):
+def _make_add_sess(
+    temp_dir, model, zephyr_board, build_config, use_fvp, serial_number, dtype="int8"
+):
     A = tvm.te.placeholder((2,), dtype=dtype)
     B = tvm.te.placeholder((1,), dtype=dtype)
     C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C")
     sched = tvm.te.create_schedule(C.op)
     return _make_sess_from_op(
-        temp_dir, model, zephyr_board, west_cmd, "add", sched, [A, B, C], build_config, use_fvp
+        temp_dir,
+        model,
+        zephyr_board,
+        "add",
+        sched,
+        [A, B, C],
+        build_config,
+        use_fvp,
+        serial_number,
     )
 
 
@@ -91,10 +109,10 @@ def _make_add_sess(temp_dir, model, zephyr_board, west_cmd, build_config, use_fv
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
 @pytest.mark.xfail_on_fvp()
-def test_add_uint(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
+def test_add_uint(workspace_dir, board, microtvm_debug, use_fvp, serial_number):
     """Test compiling the on-device runtime."""
 
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
 
     # NOTE: run test in a nested function so cPython will delete arrays before closing the session.
@@ -110,7 +128,7 @@ def test_basic_add(sess):
         system_lib.get_function("add")(A_data, B_data, C_data)
         assert (C_data.numpy() == np.array([6, 7])).all()
 
-    with _make_add_sess(workspace_dir, model, board, west_cmd, build_config, use_fvp) as sess:
+    with _make_add_sess(workspace_dir, model, board, build_config, use_fvp, serial_number) as sess:
         test_basic_add(sess)
 
 
@@ -118,10 +136,10 @@ def test_basic_add(sess):
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
 @pytest.mark.xfail_on_fvp()
-def test_add_float(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
+def test_add_float(workspace_dir, board, microtvm_debug, use_fvp, serial_number):
     """Test compiling the on-device runtime."""
-    model = test_utils.ZEPHYR_BOARDS[board]
-    if not test_utils.has_fpu(board):
+    model = utils.ZEPHYR_BOARDS[board]
+    if not utils.has_fpu(board):
         pytest.skip(f"FPU not enabled for {board}")
 
     build_config = {"debug": microtvm_debug}
@@ -140,7 +158,13 @@ def test_basic_add(sess):
         assert (C_data.numpy() == np.array([7, 8])).all()
 
     with _make_add_sess(
-        workspace_dir, model, board, west_cmd, build_config, use_fvp, dtype="float32"
+        workspace_dir,
+        model,
+        board,
+        build_config,
+        use_fvp,
+        serial_number,
+        dtype="float32",
     ) as sess:
         test_basic_add(sess)
 
@@ -148,10 +172,10 @@ def test_basic_add(sess):
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
 @pytest.mark.xfail_on_fvp()
-def test_platform_timer(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
+def test_platform_timer(workspace_dir, board, microtvm_debug, use_fvp, serial_number):
     """Test compiling the on-device runtime."""
 
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
 
     # NOTE: run test in a nested function so cPython will delete arrays before closing the session.
@@ -172,16 +196,16 @@ def test_basic_add(sess):
         assert result.mean > 0
         assert len(result.results) == 3
 
-    with _make_add_sess(workspace_dir, model, board, west_cmd, build_config, use_fvp) as sess:
+    with _make_add_sess(workspace_dir, model, board, build_config, use_fvp, serial_number) as sess:
         test_basic_add(sess)
 
 
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
 @pytest.mark.xfail_on_fvp()
-def test_relay(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
+def test_relay(workspace_dir, board, microtvm_debug, use_fvp, serial_number):
     """Testing a simple relay graph"""
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
     shape = (10,)
     dtype = "int8"
@@ -198,7 +222,7 @@ def test_relay(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         mod = tvm.relay.build(ir_mod, target=target, runtime=runtime)
 
-    with _make_session(workspace_dir, board, west_cmd, mod, build_config, use_fvp) as session:
+    with _make_session(workspace_dir, board, mod, build_config, use_fvp, serial_number) as session:
         graph_mod = tvm.micro.create_local_graph_executor(
             mod.get_graph_json(), session.get_system_lib(), session.device
         )
@@ -213,9 +237,9 @@ def test_relay(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
 @pytest.mark.xfail_on_fvp()
-def test_onnx(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
+def test_onnx(workspace_dir, board, microtvm_debug, use_fvp, serial_number):
     """Testing a simple ONNX model."""
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
 
     this_dir = pathlib.Path(os.path.dirname(__file__))
@@ -245,7 +269,9 @@ def test_onnx(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
         lowered = relay.build(relay_mod, target, params=params, executor=executor, runtime=runtime)
         graph = lowered.get_graph_json()
 
-    with _make_session(workspace_dir, board, west_cmd, lowered, build_config, use_fvp) as session:
+    with _make_session(
+        workspace_dir, board, lowered, build_config, use_fvp, serial_number
+    ) as session:
         graph_mod = tvm.micro.create_local_graph_executor(
             graph, session.get_system_lib(), session.device
         )
@@ -268,12 +294,12 @@ def check_result(
     relay_mod,
     model,
     zephyr_board,
-    west_cmd,
     map_inputs,
     out_shape,
     result,
     build_config,
     use_fvp,
+    serial_number,
 ):
     """Helper function to verify results"""
     TOL = 1e-5
@@ -282,7 +308,9 @@ def check_result(
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         mod = tvm.relay.build(relay_mod, target=target, runtime=runtime)
 
-    with _make_session(temp_dir, zephyr_board, west_cmd, mod, build_config, use_fvp) as session:
+    with _make_session(
+        temp_dir, zephyr_board, mod, build_config, use_fvp, serial_number
+    ) as session:
         rt_mod = tvm.micro.create_local_graph_executor(
             mod.get_graph_json(), session.get_system_lib(), session.device
         )
@@ -304,9 +332,9 @@ def check_result(
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
 @pytest.mark.xfail_on_fvp()
-def test_byoc_microtvm(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
+def test_byoc_microtvm(workspace_dir, board, microtvm_debug, use_fvp, serial_number):
     """This is a simple test case to check BYOC capabilities of microTVM"""
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
     x = relay.var("x", shape=(10, 10))
     w0 = relay.var("w0", shape=(10, 10))
@@ -361,20 +389,20 @@ def test_byoc_microtvm(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
         ),
         model=model,
         zephyr_board=board,
-        west_cmd=west_cmd,
         build_config=build_config,
         use_fvp=use_fvp,
+        serial_number=serial_number,
     )
 
 
 def _make_add_sess_with_shape(
-    temp_dir, model, zephyr_board, west_cmd, shape, build_config, use_fvp
+    temp_dir, model, zephyr_board, shape, build_config, use_fvp, serial_number
 ):
     A = tvm.te.placeholder(shape, dtype="int8")
     C = tvm.te.compute(A.shape, lambda i: A[i] + A[i], name="C")
     sched = tvm.te.create_schedule(C.op)
     return _make_sess_from_op(
-        temp_dir, model, zephyr_board, west_cmd, "add", sched, [A, C], build_config, use_fvp
+        temp_dir, model, zephyr_board, "add", sched, [A, C], build_config, use_fvp, serial_number
     )
 
 
@@ -389,9 +417,9 @@ def _make_add_sess_with_shape(
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
 @pytest.mark.xfail_on_fvp()
-def test_rpc_large_array(workspace_dir, board, west_cmd, microtvm_debug, shape, use_fvp):
+def test_rpc_large_array(workspace_dir, board, microtvm_debug, shape, use_fvp, serial_number):
     """Test large RPC array transfer."""
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
 
     # NOTE: run test in a nested function so cPython will delete arrays before closing the session.
@@ -404,20 +432,20 @@ def test_tensors(sess):
         assert (C_data.numpy() == np.zeros(shape)).all()
 
     with _make_add_sess_with_shape(
-        workspace_dir, model, board, west_cmd, shape, build_config, use_fvp
+        workspace_dir, model, board, shape, build_config, use_fvp, serial_number
     ) as sess:
         test_tensors(sess)
 
 
 @pytest.mark.xfail(strict=False, reason="See https://github.com/apache/tvm/issues/10297")
 @tvm.testing.requires_micro
-def test_autotune_conv2d(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
+def test_autotune_conv2d(workspace_dir, board, microtvm_debug, use_fvp, serial_number):
     """Test AutoTune for microTVM Zephyr"""
     if board != "qemu_x86":
         pytest.xfail(f"Autotune fails on {board}.")
 
     runtime = Runtime("crt", {"system-lib": True})
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
 
     # Create a Relay model
@@ -452,21 +480,21 @@ def test_autotune_conv2d(workspace_dir, board, west_cmd, microtvm_debug, use_fvp
     assert len(tasks) > 0
 
     config_main_stack_size = None
-    if test_utils.qemu_boards(board):
+    if utils.qemu_boards(board):
         config_main_stack_size = 1536
 
     project_options = {
         "board": board,
-        "west_cmd": west_cmd,
         "verbose": 1,
         "project_type": "host_driven",
         "use_fvp": bool(use_fvp),
+        "serial_number": serial_number,
     }
     if config_main_stack_size is not None:
         project_options["config_main_stack_size"] = config_main_stack_size
 
     module_loader = tvm.micro.AutoTvmModuleLoader(
-        template_project_dir=test_utils.TEMPLATE_PROJECT_DIR,
+        template_project_dir=utils.TEMPLATE_PROJECT_DIR,
         project_options=project_options,
     )
 
@@ -510,7 +538,7 @@ def test_autotune_conv2d(workspace_dir, board, west_cmd, microtvm_debug, use_fvp
         lowered = tvm.relay.build(mod, target=target, runtime=runtime, params=params)
 
     temp_dir = utils.tempdir()
-    with _make_session(temp_dir, board, west_cmd, lowered, build_config, use_fvp) as session:
+    with _make_session(temp_dir, board, lowered, build_config, use_fvp, serial_number) as session:
         graph_mod = tvm.micro.create_local_graph_executor(
             lowered.get_graph_json(), session.get_system_lib(), session.device
         )
@@ -525,7 +553,9 @@ def test_autotune_conv2d(workspace_dir, board, west_cmd, microtvm_debug, use_fvp
             lowered_tuned = tvm.relay.build(mod, target=target, runtime=runtime, params=params)
 
     temp_dir = utils.tempdir()
-    with _make_session(temp_dir, board, west_cmd, lowered_tuned, build_config, use_fvp) as session:
+    with _make_session(
+        temp_dir, board, lowered_tuned, build_config, use_fvp, serial_number
+    ) as session:
         graph_mod = tvm.micro.create_local_graph_executor(
             lowered_tuned.get_graph_json(), session.get_system_lib(), session.device
         )
@@ -538,13 +568,11 @@ def test_autotune_conv2d(workspace_dir, board, west_cmd, microtvm_debug, use_fvp
 
 
 @tvm.testing.requires_micro
-def test_schedule_build_with_cmsis_dependency(
-    workspace_dir, board, west_cmd, microtvm_debug, use_fvp
-):
+def test_schedule_build_with_cmsis_dependency(workspace_dir, board, microtvm_debug, use_fvp):
     """Test Relay schedule with CMSIS dependency. This test shows if microTVM Auto tuning
     with Zephyr breaks if CMSIS dependency was required for a schedule.
     """
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
     target = tvm.target.target.micro(model, options=["-keys=arm_cpu,cpu"])
 
@@ -575,7 +603,6 @@ def test_schedule_build_with_cmsis_dependency(
 
     project_options = {
         "project_type": "host_driven",
-        "west_cmd": west_cmd,
         "verbose": bool(build_config.get("debug")),
         "board": board,
         "cmsis_path": os.getenv("CMSIS_PATH"),
@@ -584,7 +611,7 @@ def test_schedule_build_with_cmsis_dependency(
 
     project_dir = workspace_dir / "project"
     project = tvm.micro.generate_project(
-        str(test_utils.TEMPLATE_PROJECT_DIR),
+        str(utils.TEMPLATE_PROJECT_DIR),
         mod,
         project_dir,
         project_options,
diff --git a/tests/micro/zephyr/test_zephyr_aot_exec.py b/tests/micro/zephyr/test_zephyr_aot_exec.py
index 9ebba8ec08cb..a67cf0830a70 100644
--- a/tests/micro/zephyr/test_zephyr_aot_exec.py
+++ b/tests/micro/zephyr/test_zephyr_aot_exec.py
@@ -14,33 +14,21 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import logging
-import os
-import pathlib
-import sys
-import logging
-
 import pytest
 import numpy as np
 
-import onnx
-from PIL import Image
-
 import tvm
 import tvm.testing
 import tvm.relay as relay
 from tvm.relay.backend import Executor, Runtime
-from tvm.relay.testing import byoc
 from tvm.contrib import utils
-from tvm.micro.testing.utils import check_tune_log
-from tvm._ffi import get_global_func, register_func
 
-import test_utils
+from . import utils
 
 
-def _make_session(workspace_dir, zephyr_board, west_cmd, mod, build_config, use_fvp):
+def _make_session(workspace_dir, zephyr_board, mod, build_config, use_fvp, serial_number):
     config_main_stack_size = None
-    if test_utils.qemu_boards(zephyr_board):
+    if utils.qemu_boards(zephyr_board):
         # fyi: qemu_riscv64 seems to be the greediest stack user
         config_main_stack_size = 4096
     else:
@@ -49,17 +37,17 @@ def _make_session(workspace_dir, zephyr_board, west_cmd, mod, build_config, use_
 
     project_options = {
         "project_type": "host_driven",
-        "west_cmd": west_cmd,
         "verbose": bool(build_config.get("debug")),
         "board": zephyr_board,
         "arm_fvp_path": "/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4/FVP_Corstone_SSE-300_Ethos-U55",
         "use_fvp": bool(use_fvp),
+        "serial_number": serial_number,
     }
     if config_main_stack_size is not None:
         project_options["config_main_stack_size"] = config_main_stack_size
 
     project = tvm.micro.generate_project(
-        str(test_utils.TEMPLATE_PROJECT_DIR),
+        str(utils.TEMPLATE_PROJECT_DIR),
         mod,
         workspace_dir / "project",
         project_options,
@@ -72,10 +60,10 @@ def _make_session(workspace_dir, zephyr_board, west_cmd, mod, build_config, use_
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
 @pytest.mark.xfail_on_fvp()
-def test_relay(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
+def test_relay(workspace_dir, board, microtvm_debug, use_fvp, serial_number):
     """Testing a simple relay graph"""
 
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
     shape = (10,)
     dtype = "int8"
@@ -93,7 +81,7 @@ def test_relay(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         mod = tvm.relay.build(ir_mod, target=target, runtime=runtime, executor=executor)
 
-    with _make_session(workspace_dir, board, west_cmd, mod, build_config, use_fvp) as session:
+    with _make_session(workspace_dir, board, mod, build_config, use_fvp, serial_number) as session:
 
         aot_executor = tvm.runtime.executor.aot_executor.AotModule(session.create_aot_executor())
 
@@ -107,10 +95,10 @@ def test_relay(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
 @pytest.mark.xfail_on_fvp()
-def test_aot_executor(workspace_dir, board, west_cmd, microtvm_debug, use_fvp):
+def test_aot_executor(workspace_dir, board, microtvm_debug, use_fvp, serial_number):
     """Test use of the AOT executor with microTVM."""
 
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
     shape = (10,)
     dtype = "int8"
@@ -158,7 +146,7 @@ def do_test():
         aot_executor.set_input("b", B_np_new)
         assert (B_data.numpy() == B_np_new).all()
 
-    with _make_session(workspace_dir, board, west_cmd, mod, build_config, use_fvp) as session:
+    with _make_session(workspace_dir, board, mod, build_config, use_fvp, serial_number) as session:
         do_test()
 
 
diff --git a/tests/micro/zephyr/test_zephyr_aot_exec_standalone.py b/tests/micro/zephyr/test_zephyr_aot_exec_standalone.py
index 2941bb1befc4..9e015448e91b 100644
--- a/tests/micro/zephyr/test_zephyr_aot_exec_standalone.py
+++ b/tests/micro/zephyr/test_zephyr_aot_exec_standalone.py
@@ -14,14 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import io
-import logging
 import os
-import sys
-import logging
 import pathlib
-import tarfile
-import tempfile
 
 import pytest
 import numpy as np
@@ -31,17 +25,16 @@
 from tvm.micro.project_api import server
 import tvm.relay as relay
 from tvm.relay.backend import Executor, Runtime
-
 from tvm.contrib.download import download_testdata
 
-import test_utils
+from . import utils
 
 
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521", "mps3_an547"])
-def test_tflite(workspace_dir, board, west_cmd, microtvm_debug):
+def test_tflite(workspace_dir, board, microtvm_debug, serial_number):
     """Testing a TFLite model."""
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     input_shape = (1, 49, 10, 1)
     output_shape = (1, 12)
     build_config = {"debug": microtvm_debug}
@@ -77,30 +70,30 @@ def test_tflite(workspace_dir, board, west_cmd, microtvm_debug):
     sample_path = download_testdata(sample_url, "keyword_spotting_int8_6.pyc.npy", module="data")
     sample = np.load(sample_path)
 
-    project, _ = test_utils.generate_project(
+    project, _ = utils.generate_project(
         workspace_dir,
         board,
-        west_cmd,
         lowered,
         build_config,
         sample,
         output_shape,
         "int8",
-        load_cmsis=False,
+        False,
+        serial_number,
     )
 
-    result, time = test_utils.run_model(project)
+    result, _ = utils.run_model(project)
     assert result == 6
 
 
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521", "mps3_an547"])
-def test_qemu_make_fail(workspace_dir, board, west_cmd, microtvm_debug):
+def test_qemu_make_fail(workspace_dir, board, microtvm_debug, serial_number):
     """Testing QEMU make fail."""
     if board not in ["qemu_x86", "mps2_an521", "mps3_an547"]:
         pytest.skip(msg="Only for QEMU targets.")
 
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
     build_config = {"debug": microtvm_debug}
     shape = (10,)
     dtype = "float32"
@@ -119,16 +112,16 @@ def test_qemu_make_fail(workspace_dir, board, west_cmd, microtvm_debug):
         lowered = relay.build(ir_mod, target, executor=executor, runtime=runtime)
 
     sample = np.zeros(shape=shape, dtype=dtype)
-    project, project_dir = test_utils.generate_project(
+    project, project_dir = utils.generate_project(
         workspace_dir,
         board,
-        west_cmd,
         lowered,
         build_config,
         sample,
         shape,
         dtype,
-        load_cmsis=False,
+        False,
+        serial_number,
     )
 
     file_path = pathlib.Path(project_dir) / "build" / "build.ninja"
diff --git a/tests/micro/zephyr/test_zephyr_armv7m.py b/tests/micro/zephyr/test_zephyr_armv7m.py
index 6a1dff254591..eb709382024d 100644
--- a/tests/micro/zephyr/test_zephyr_armv7m.py
+++ b/tests/micro/zephyr/test_zephyr_armv7m.py
@@ -14,14 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-from json import load
-import logging
 import os
 import pathlib
-import sys
-import tarfile
-import tempfile
 
 import pytest
 import numpy as np
@@ -35,9 +29,7 @@
 from tvm.contrib.download import download_testdata
 from tvm.relay.backend import Executor, Runtime
 
-import test_utils
-
-_LOG = logging.getLogger(__name__)
+from . import utils
 
 
 def _open_tflite_model():
@@ -105,7 +97,7 @@ def _apply_desired_layout_no_simd(relay_mod):
 @tvm.testing.requires_micro
 @pytest.mark.skip_boards(["mps2_an521"])
 @pytest.mark.xfail(reason="due https://github.com/apache/tvm/issues/12619")
-def test_armv7m_intrinsic(workspace_dir, board, west_cmd, microtvm_debug):
+def test_armv7m_intrinsic(workspace_dir, board, microtvm_debug, serial_number):
     """Testing a ARM v7m SIMD extension."""
     if board not in [
         "mps2_an521",
@@ -116,7 +108,7 @@ def test_armv7m_intrinsic(workspace_dir, board, west_cmd, microtvm_debug):
     ]:
         pytest.skip(msg="Platform does not support ARM v7m SIMD extension.")
 
-    model = test_utils.ZEPHYR_BOARDS[board]
+    model = utils.ZEPHYR_BOARDS[board]
 
     build_config = {"debug": microtvm_debug}
 
@@ -151,31 +143,31 @@ def test_armv7m_intrinsic(workspace_dir, board, west_cmd, microtvm_debug):
             relay_mod_no_simd, target, params=params, runtime=runtime, executor=executor
         )
 
-        simd_project, _ = test_utils.generate_project(
+        simd_project, _ = utils.generate_project(
             workspace_dir_simd,
             board,
-            west_cmd,
             lowered_simd,
             build_config,
             sample,
             output_shape,
             "float32",
-            load_cmsis=True,
+            True,
+            serial_number,
         )
-        result_simd, time_simd = test_utils.run_model(simd_project)
+        result_simd, time_simd = utils.run_model(simd_project)
 
-        no_simd_project, _ = test_utils.generate_project(
+        no_simd_project, _ = utils.generate_project(
             workspace_dir_no_simd,
             board,
-            west_cmd,
             lowered_no_simd,
             build_config,
             sample,
             output_shape,
             "float32",
-            load_cmsis=False,
+            False,
+            serial_number,
         )
-        result_no_simd, time_no_simd = test_utils.run_model(no_simd_project)
+        result_no_simd, time_no_simd = utils.run_model(no_simd_project)
 
     assert result_no_simd == result_simd == 2
 
diff --git a/tests/micro/zephyr/test_utils.py b/tests/micro/zephyr/utils.py
similarity index 96%
rename from tests/micro/zephyr/test_utils.py
rename to tests/micro/zephyr/utils.py
index 695bf2e9caae..05b209094420 100644
--- a/tests/micro/zephyr/test_utils.py
+++ b/tests/micro/zephyr/utils.py
@@ -76,7 +76,7 @@ def has_fpu(board: str):
 
 
 def build_project(
-    temp_dir, zephyr_board, west_cmd, mod, build_config, simd=False, extra_files_tar=None
+    temp_dir, zephyr_board, mod, build_config, serial_number, simd=False, extra_files_tar=None
 ):
     project_dir = temp_dir / "project"
 
@@ -88,9 +88,9 @@ def build_project(
         project_options = {
             "extra_files_tar": extra_files_tar,
             "project_type": "aot_standalone_demo",
-            "west_cmd": west_cmd,
             "verbose": bool(build_config.get("debug")),
             "board": zephyr_board,
+            "serial_number": serial_number,
             "compile_definitions": [
                 # TODO(mehrdadh): It fails without offset.
                 f"-DWORKSPACE_SIZE={workspace_size + 128}",
@@ -197,7 +197,15 @@ def run_model(project):
 
 
 def generate_project(
-    temp_dir, board, west_cmd, lowered, build_config, sample, output_shape, output_type, load_cmsis
+    temp_dir,
+    board,
+    lowered,
+    build_config,
+    sample,
+    output_shape,
+    output_type,
+    load_cmsis,
+    serial_number,
 ):
     with tempfile.NamedTemporaryFile() as tar_temp_file:
         with tarfile.open(tar_temp_file.name, "w:gz") as tf:
@@ -222,9 +230,9 @@ def generate_project(
         project, project_dir = build_project(
             temp_dir,
             board,
-            west_cmd,
             lowered,
             build_config,
+            serial_number,
             simd=load_cmsis,
             extra_files_tar=tar_temp_file.name,
         )

From fae4c5f1b3ad39bee5acd3e725a005a24db0ffbd Mon Sep 17 00:00:00 2001
From: Lite Ye <yelite958@gmail.com>
Date: Fri, 25 Nov 2022 03:47:58 -0500
Subject: [PATCH 647/704] [TIR] Fix an error when the result of compute_at has
 unit loop (#13481)

---
 src/tir/schedule/primitive/compute_at.cc      |  2 +-
 .../unittest/test_tir_schedule_compute_at.py  | 52 +++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc
index 83342e351b91..5a5a53615744 100644
--- a/src/tir/schedule/primitive/compute_at.cc
+++ b/src/tir/schedule/primitive/compute_at.cc
@@ -265,7 +265,7 @@ class ScopeReconstructor : private StmtMutator {
         loop_vars.push_back(var);
         loop_extents.push_back(analyzer->Simplify(iter_dom->extent));
         iter_values.push_back(iter_dom->min + var);
-        analyzer->Bind(var, Range::FromMinExtent(0, iter_dom->extent));
+        analyzer->Bind(var, Range::FromMinExtent(IntImm(var.dtype(), 0), iter_dom->extent));
       } else {
         iter_values.push_back(iter_dom->min);
       }
diff --git a/tests/python/unittest/test_tir_schedule_compute_at.py b/tests/python/unittest/test_tir_schedule_compute_at.py
index 72cba1a8fdc4..34ca937cc2ba 100644
--- a/tests/python/unittest/test_tir_schedule_compute_at.py
+++ b/tests/python/unittest/test_tir_schedule_compute_at.py
@@ -1505,5 +1505,57 @@ def main_reverse_compute_at(
     tvm.ir.assert_structural_equal(main_reverse_compute_at, sch.mod["main"])
 
 
+def test_reverse_compute_at_with_unit_loop():
+    @T.prim_func
+    def main(A: T.Buffer[(128, 128), "float32"], D: T.Buffer[(1, 2, 1), "float32"]) -> None:
+        B = T.alloc_buffer([128, 128], dtype="float32")
+        for i_0, j_0, i_1 in T.grid(T.int64(8), T.int64(8), T.int64(16)):
+            for j_1 in T.serial(T.int64(16)):
+                with T.block("B"):
+                    vi = T.axis.spatial(T.int64(128), i_0 * T.int64(16) + i_1)
+                    vj = T.axis.spatial(T.int64(128), j_0 * T.int64(16) + j_1)
+                    T.reads(A[vi, vj])
+                    T.writes(B[vi, vj])
+                    B[vi, vj] = A[vi, vj] * T.float32(2)
+        for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(2), T.int64(1)):
+            with T.block("D"):
+                v0, v1, v2 = T.axis.remap("SSS", [ax0, ax1, ax2])
+                T.reads(B[v0, v1])
+                T.writes(D[v0, v1, v2])
+                D[v0, v1, v2] = B[v0, v1] + T.float32(1)
+
+    @T.prim_func
+    def main_reverse_compute_at(
+        A: T.Buffer[(128, 128), "float32"], D: T.Buffer[(1, 2, 1), "float32"]
+    ):
+        B = T.alloc_buffer([128, 128], dtype="float32")
+        for i_0, j_0, i_1 in T.grid(T.int64(8), T.int64(8), T.int64(16)):
+            for j_1 in T.serial(T.int64(16)):
+                with T.block("B"):
+                    vi = T.axis.spatial(T.int64(128), i_0 * T.int64(16) + i_1)
+                    vj = T.axis.spatial(T.int64(128), j_0 * T.int64(16) + j_1)
+                    T.reads(A[vi, vj])
+                    T.writes(B[vi, vj])
+                    B[vi, vj] = A[vi, vj] * T.float32(2)
+            for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(16), T.int64(1)):
+                with T.block("D"):
+                    T.where(
+                        i_0 * T.int64(16) + i_1 < T.int64(1)
+                        and j_0 * T.int64(16) + ax1 < T.int64(2)
+                    )
+                    v0 = T.axis.spatial(T.int64(1), i_0 * T.int64(16) + i_1 + ax0)
+                    v1 = T.axis.spatial(T.int64(2), j_0 * T.int64(16) + ax1)
+                    v2 = T.axis.spatial(T.int64(1), ax2)
+                    T.reads(B[v0, v1])
+                    T.writes(D[v0, v1, v2])
+                    D[v0, v1, v2] = B[v0, v1] + T.float32(1)
+
+    sch = tir.Schedule(main, debug_mask="all")
+    block_d = sch.get_block("D")
+    axis = sch.get_loops("B")[2]
+    sch.reverse_compute_at(block_d, axis, preserve_unit_loops=True, index=1)
+    tvm.ir.assert_structural_equal(main_reverse_compute_at, sch.mod["main"])
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 2105b937cd6b4f426156813d6562f2845e5604f0 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Fri, 25 Nov 2022 23:40:20 +0300
Subject: [PATCH 648/704] [OpenCL][Adreno] Remove PrimFunc parameters
 annotation (#13483)

`DeviceAwareVisitExpr_` for `FunctionNode` had been modifying state of
the parameters and specified memory scope for virtual device.
`VisitExpr` functions shouldn't modify state of the objects. This is why
this function was removed. After removing this function, the memory
scope of the input tensors starts to be equal to empty line instead of
"global".  But empty line and "global" will be transformed to the same
memory object. This is why there is no difference between empty line and
"global".
---
 src/relay/transforms/annotate_texture_storage.cc   | 12 ------------
 .../opencl_texture/test_conv2d_nchw_texture.py     | 14 +++++++-------
 2 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc
index 277c5e1da424..9b700bef2a46 100644
--- a/src/relay/transforms/annotate_texture_storage.cc
+++ b/src/relay/transforms/annotate_texture_storage.cc
@@ -136,18 +136,6 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
 
   void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
 
-  void DeviceAwareVisitExpr_(const FunctionNode* function_node) final {
-    if (!function_node->HasNonzeroAttr(attr::kPrimitive)) {
-      for (auto&& param : function_node->params) {
-        auto virtual_device = GetVirtualDevice(param);
-        param->virtual_device_ =
-            VirtualDevice(virtual_device->device_type(), virtual_device->virtual_device_id,
-                          virtual_device->target, "global");
-      }
-    }
-    transform::DeviceAwareExprVisitor::DeviceAwareVisitExpr_(function_node);
-  }
-
   void DeviceAwareVisitExpr_(const CallNode* call) final {
     // Check the contents of this primitive function
     if (const auto* fn = call->op.as<FunctionNode>()) {
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
index a0ca8423478e..d3fff68ae7cb 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
@@ -576,7 +576,7 @@ def test_residual_block(remote, target, dtype):
     }
     if dtype == "float16":
         static_memory_scope = [
-            "global",
+            "",
             "global.texture",
             "global.texture-weight",
             "global.texture-weight",
@@ -590,7 +590,7 @@ def test_residual_block(remote, target, dtype):
         ]
     else:
         static_memory_scope = [
-            "global",
+            "",
             "global.texture",
             "global.texture-weight",
             "global.texture-weight",
@@ -829,7 +829,7 @@ def test_pooling_branching_texture_params(remote, target, dtype):
     }
 
     static_memory_scope = [
-        "global",
+        "",
         "global.texture",
         "global.texture-weight",
         "global.texture",
@@ -956,7 +956,7 @@ def test_branching_texture_params(remote, target, dtype):
     }
 
     static_memory_scope = [
-        "global",
+        "",
         "global.texture",
         "global.texture-weight",
         "global.texture",
@@ -1045,7 +1045,7 @@ def test_conv2d_different_lowering_same_op(remote, target, dtype):
     }
 
     static_memory_scope = [
-        "global",
+        "",
         "global.texture",
         "global.texture-weight",
         "global.texture",
@@ -1177,7 +1177,7 @@ def test_injective_nwo_inputs1(remote, target, dtype):
     }
 
     static_memory_scope = [
-        "global",
+        "",
         "global.texture",
         "global.texture-nhwc",
         "global.texture",
@@ -1275,7 +1275,7 @@ def test_injective_nwo_inputs2(remote, target, dtype):
     }
 
     static_memory_scope = [
-        "global",
+        "",
         "global.texture",
         "global.texture-nhwc",
         "global.texture",

From f5a102c83c9200cf9dba1250a7431f57e018f83c Mon Sep 17 00:00:00 2001
From: neildhickey <nhickey1@gmail.com>
Date: Fri, 25 Nov 2022 21:49:18 +0000
Subject: [PATCH 649/704] [CMSIS-NN] Support for int16 in fully connected layer
 (#13484)

* [CMSIS-NN] Support for int16 in fully connected layer

-Pattern matching and RelatToTIR introduce int16 support
-Added int16 variants to fully_connected tests
---
 python/tvm/relay/op/contrib/cmsisnn.py        | 33 +++++++++++------
 .../backend/contrib/cmsisnn/relay_to_tir.cc   | 35 +++++++++++++++----
 .../backend/contrib/cmsisnn/tir_to_runtime.cc |  3 +-
 .../test_cmsisnn/test_fully_connected.py      | 25 ++++++++-----
 4 files changed, 70 insertions(+), 26 deletions(-)

diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index 8964937469c4..779fe35c3718 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -231,27 +231,40 @@ def check_qnn_fully_connected(pattern):
             requantize = pattern
         requantize_input = requantize.args[0]
         bias_add = None
-        bias_dtype = "int32"
         if str(requantize_input.op.name) == "nn.bias_add":
             bias_add = requantize_input
             fc = bias_add.args[0]
-            bias_dtype = bias_add.args[1].checked_type.dtype
         else:
             fc = requantize_input
         fc_input = fc.args[0]
         fc_weight = fc.args[1]
 
+        are_dtypes_valid = False
+        fc_input_dtype = fc_input.checked_type.dtype
+        if bias_add:
+            bias_dtype = bias_add.args[1].checked_type.dtype
+        else:
+            bias_dtype = "int32" if fc_input_dtype == "int8" else "int64"
+
+        valid_dtypes = None
+        if fc_input_dtype == "int8":
+            valid_dtypes = ("int8", "int8", "int32", "int32", "int8")
+        elif fc_input_dtype == "int16":
+            valid_dtypes = ("int16", "int8", "int64", "int64", "int16")
+
+        if (
+            fc_input_dtype,
+            fc_weight.checked_type.dtype,
+            bias_dtype,
+            fc.attrs.out_dtype,
+            pattern.checked_type.dtype,
+        ) == valid_dtypes:
+            are_dtypes_valid = True
+
         # kernel zero_point should be 0
         kernel_zp = fc.args[3].data.numpy().item(0)
 
-        return (
-            fc.attrs.out_dtype == "int32"
-            and fc_input.checked_type.dtype == "int8"
-            and fc_weight.checked_type.dtype == "int8"
-            and pattern.checked_type.dtype == "int8"
-            and bias_dtype == "int32"
-            and kernel_zp == 0
-        )
+        return are_dtypes_valid and kernel_zp == 0
 
     def qnn_avg_pool2d_pattern():
         """Matches average pooling with optional Relu"""
diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
index 1ea020e884de..c9e41589fb4b 100644
--- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
+++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
@@ -192,6 +192,11 @@ class RelayToTIRVisitor : public MixedModeMutator {
     std::string kernel_layout = conv2d_attrs->kernel_layout.c_str();
     int32_t clip_min = std::numeric_limits<int8_t>::min();
     int32_t clip_max = std::numeric_limits<int8_t>::max();
+
+    if (dtype_bits == 16) {
+      clip_min = std::numeric_limits<int16_t>::min();
+      clip_max = std::numeric_limits<int16_t>::max();
+    }
     if (clip_call) {
       const ClipAttrs* clip_attrs = clip_call->attrs.as<ClipAttrs>();
       clip_min = clip_attrs->a_min;
@@ -309,6 +314,14 @@ class RelayToTIRVisitor : public MixedModeMutator {
       fc_call = requantize_input;
     }
 
+    // Extract the size of the input parameter from the call arguments. Other params are based off
+    // the input size
+    int32_t dtype_bits = fc_call->args[0]->type_as<TensorTypeNode>()->dtype.bits();
+    int32_t input_bits = dtype_bits;
+    int32_t filter_bits = 8;
+    int32_t bias_bits = dtype_bits * 4U;
+    int32_t output_bits = dtype_bits;
+
     // TIR variables are created in the order they appear in the Relay partitioned function
     // %1 = qnn.dense(%input, %weight_const_0, input_zero_point_scalar, kernel_zero_point_scalar,
     //                 %input_scale_scalar, %kernel_scale_scalar)
@@ -317,12 +330,12 @@ class RelayToTIRVisitor : public MixedModeMutator {
     //                     %output_scale_scalar, %output_zero_point_scalar)
     // clip(%3, a_min=%min_scalar, a_max=%max_scalar)
     BufferCreator buffer_creator;
-    tir::Var input = buffer_creator.CreateBufferVar("input", DataType::Handle(8));
-    tir::Var filter = buffer_creator.CreateBufferVar("filter", DataType::Handle(8));
+    tir::Var input = buffer_creator.CreateBufferVar("input", DataType::Handle(input_bits));
+    tir::Var filter = buffer_creator.CreateBufferVar("filter", DataType::Handle(filter_bits));
     if (bias_add_call) {
-      buffer_creator.CreateBufferVar("bias", DataType::Handle(32));
+      buffer_creator.CreateBufferVar("bias", DataType::Handle(bias_bits));
     }
-    tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(8));
+    tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(output_bits));
 
     // Individual arguments to the structs arguments of the CMSIS-NN API are filled into call_extern
     // https://github.com/ARM-software/CMSIS_5/blob/def6f800f95661eb3451d317f7d0dde504f6020d/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L50
@@ -341,8 +354,13 @@ class RelayToTIRVisitor : public MixedModeMutator {
       clip_min = clip_attrs->a_min;
       clip_max = clip_attrs->a_max;
     } else {
-      clip_min = -128;
-      clip_max = 127;
+      if (dtype_bits == 8) {
+        clip_min = std::numeric_limits<int8_t>::min();
+        clip_max = std::numeric_limits<int8_t>::max();
+      } else {
+        clip_min = std::numeric_limits<int16_t>::min();
+        clip_max = std::numeric_limits<int16_t>::max();
+      }
     }
 
     double quantized_multiplier =
@@ -366,7 +384,10 @@ class RelayToTIRVisitor : public MixedModeMutator {
 
     Array<PrimExpr> cmsisnn_output_shape{batch_size, 1, 1, out_channels};
 
-    tvm::Array<PrimExpr> call_ext_args = {tir::StringImm("arm_fully_connected_s8"), input, filter};
+    std::string cmsisnn_api =
+        dtype_bits == 16 ? "arm_fully_connected_s16" : "arm_fully_connected_s8";
+
+    tvm::Array<PrimExpr> call_ext_args = {tir::StringImm(cmsisnn_api), input, filter};
     if (bias_add_call) {
       call_ext_args.push_back(buffer_creator.GetBufferVar("bias"));
     }
diff --git a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
index ae9f195ca509..b5c5058ddbc0 100644
--- a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
+++ b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
@@ -115,7 +115,8 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
                cmsis_func_name == "arm_depthwise_conv_wrapper_s8" ||
                cmsis_func_name == "arm_depthwise_conv_wrapper_s16") {
       EmitConv2D(op);
-    } else if (cmsis_func_name == "arm_fully_connected_s8") {
+    } else if (cmsis_func_name == "arm_fully_connected_s8" ||
+               cmsis_func_name == "arm_fully_connected_s16") {
       EmitFullyConnected(op);
     } else if (cmsis_func_name == "arm_avgpool_s8" || cmsis_func_name == "arm_max_pool_s8") {
       EmitPool2D(op);
diff --git a/tests/python/contrib/test_cmsisnn/test_fully_connected.py b/tests/python/contrib/test_cmsisnn/test_fully_connected.py
index 6fa1cc687f81..3b220eb42c9b 100644
--- a/tests/python/contrib/test_cmsisnn/test_fully_connected.py
+++ b/tests/python/contrib/test_cmsisnn/test_fully_connected.py
@@ -32,6 +32,7 @@
     assert_partitioned_function,
     assert_no_external_function,
     create_test_runner,
+    get_kernel_bias_dtype,
 )
 
 
@@ -46,6 +47,7 @@ def make_model(
     output_scale,
     dtype,
     kernel_dtype,
+    bias_dtype,
     out_channels,
     enable_bias,
     relu_type="NONE",
@@ -70,11 +72,11 @@ def make_model(
         input_scale=relay.const(input_scale, "float32"),
         kernel_scale=relay.const(kernel_scale, "float32"),
         units=out_channels,
-        out_dtype="int32",
+        out_dtype=bias_dtype,
     )
 
-    bias = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype="int32"))
-    bias_const = relay.const(bias, "int32")
+    bias = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype=bias_dtype))
+    bias_const = relay.const(bias, bias_dtype)
     last_op = relay.nn.bias_add(dense, bias_const) if enable_bias else dense
     requant_input_sc = input_scale * kernel_scale
     last_op = relay.qnn.op.requantize(
@@ -91,6 +93,7 @@ def make_model(
 
 
 @tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize("dtype", ["int8", "int16"])
 @pytest.mark.parametrize("in_shape", [(2, 28), (1, 64)])
 @pytest.mark.parametrize("out_channels", [12, 128])
 @pytest.mark.parametrize("enable_bias", [False, True])
@@ -101,7 +104,8 @@ def make_model(
 @pytest.mark.parametrize(
     "compiler_cpu, cpu_flags", [("cortex-m55", "+nomve"), ("cortex-m55", ""), ("cortex-m7", "")]
 )
-def test_op_int8(
+def test_ops(
+    dtype,
     in_shape,
     enable_bias,
     input_zero_point,
@@ -115,7 +119,7 @@ def test_op_int8(
     interface_api = "c"
     use_unpacked_api = True
 
-    dtype = "int8"
+    kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype)
     kernel_zero_point = 0
     kernel_shape = [out_channels, in_shape[1]]
     conv2d_kernel_shape = (1, 1, kernel_shape[0], kernel_shape[1])
@@ -140,7 +144,8 @@ def test_op_int8(
         output_zero_point,
         output_scale,
         dtype,
-        dtype,
+        kernel_dtype,
+        bias_dtype,
         out_channels,
         enable_bias,
     )
@@ -170,13 +175,15 @@ def test_op_int8(
 
 def parameterize_for_invalid_model(test):
     """Generates parameters for non int8 inputs to fully connected layer"""
-    in_dtype = ["uint8", "int8"]
+    in_dtype = ["uint8", "int8", "int16"]
     kernel_dtype = ["uint8", "int8"]
     kernel_zero_point = [-33, 10, 0]
     all_combinations = itertools.product(in_dtype, kernel_dtype, kernel_zero_point)
     all_combinations = filter(
         lambda parameters: not (
-            parameters[0] == "int8" and parameters[1] == "int8" and parameters[2] == 0
+            (parameters[0] == "int8" or parameters[0] == "int16")
+            and parameters[1] == "int8"
+            and parameters[2] == 0
         ),
         all_combinations,
     )
@@ -199,6 +206,7 @@ def test_invalid_parameters(
     input_scale = 1
     input_zero_point = 24
     kernel_scale = [0.11, 0.0237]
+    _, bias_dtype = get_kernel_bias_dtype(in_dtype)
 
     kernel_shape = [out_channels, in_shape[1]]
     conv2d_kernel_shape = [1, 1, kernel_shape[0], kernel_shape[1]]
@@ -223,6 +231,7 @@ def test_invalid_parameters(
         output_scale=output_scale,
         dtype=in_dtype,
         kernel_dtype=kernel_dtype,
+        bias_dtype=bias_dtype,
         out_channels=out_channels,
         enable_bias=True,
     )

From 101e3a4ade226a2b9cdef6437a285af18aef9cf8 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 25 Nov 2022 16:05:37 -0600
Subject: [PATCH 650/704] [TIR][Transform] Optional data-flow analysis in
 RemoveNoOp (#13217)

* [TIR][Transform] Optional data-flow analysis in RemoveNoOp

Previously, `RemoveNoOp` would remove statements that could be locally
analyzed as having no effect (e.g. `For` with empty loop extents).
This commit adds opt-in use of data-flow analysis to identify
two types of statements that are no-ops based on their context:

* Buffer stores that are overwritten without ever being read.

  ```python
  buf[i] = 5 # Overwritten by next statement
  buf[i] = 10
  ```

* Storing a value that is already known to be present.

  ```python
  buf[0:16] = T.ramp(0, 16, 1)
  buf[5] = 5 # Previous load already stored this value
  ```

* Avoid dangling pointers in var_range_map_
---
 src/arith/rewrite_simplify.cc                 |   7 +
 src/tir/analysis/control_flow_graph.cc        | 117 ++--
 src/tir/analysis/control_flow_graph.h         |  12 +-
 src/tir/transforms/remove_no_op.cc            | 230 ++++++--
 .../test_tir_transform_remove_no_op.py        | 521 ++++++++++++++++++
 .../unittest/test_tir_transform_simplify.py   |   1 +
 6 files changed, 796 insertions(+), 92 deletions(-)

diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index e6d876cf5aa8..90c448f4ea5c 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -1644,6 +1644,11 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(LT ret) {
     TVM_TRY_RECURSIVE_REWRITE(x + c1 < c2, x < c2 - c1);
     TVM_TRY_RECURSIVE_REWRITE(x - c1 < c2, x < c2 + c1);
     TVM_TRY_REWRITE(x - c1 < 0, x < c1);
+
+    TVM_TRY_RECURSIVE_REWRITE(x - 1 < y, x <= y);
+    TVM_TRY_RECURSIVE_REWRITE(x < y + 1, x <= y);
+    TVM_TRY_RECURSIVE_REWRITE(x + (-1) < y, x <= y);
+    TVM_TRY_RECURSIVE_REWRITE(x < y - (-1), x <= y);
     // clang-format on
   }
   return std::move(ret);
@@ -1886,6 +1891,8 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) {
   TVM_TRY_REWRITE(x <= y || y < x, ctrue);
   TVM_TRY_REWRITE(y < x || x <= y, ctrue);
 
+  TVM_TRY_REWRITE(x < y || y < x, x != y);
+
   TVM_TRY_REWRITE_IF(x < c1 || c2 < x, ctrue, c2.Eval()->value < c1.Eval()->value);
   TVM_TRY_REWRITE_IF(c2 < x || x < c1, ctrue, c2.Eval()->value < c1.Eval()->value);
 
diff --git a/src/tir/analysis/control_flow_graph.cc b/src/tir/analysis/control_flow_graph.cc
index 42c5c8bb82d5..2e537450d232 100644
--- a/src/tir/analysis/control_flow_graph.cc
+++ b/src/tir/analysis/control_flow_graph.cc
@@ -31,6 +31,7 @@
 #include <tvm/tir/op.h>
 #include <tvm/tir/stmt_functor.h>
 
+#include <algorithm>
 #include <numeric>
 #include <optional>
 #include <queue>
@@ -819,10 +820,30 @@ BufferTouch ControlFlowGraph::ControlFlowBlock::MakeBufferTouch(ControlFlowGraph
   return buffer_touch;
 }
 
-ControlFlowGraph::ControlFlowGraph(const tir::Stmt& stmt, size_t max_revisits) {
+ControlFlowGraph::ControlFlowGraph(const tir::Stmt& stmt, size_t max_revisits)
+    : max_revisits_(max_revisits) {
   ControlFlowGraphBuilder::Build(this, stmt);
-  ForwardPropagateKnownValues(max_revisits);
-  BackwardPropagateUnusedValues(max_revisits);
+  ForwardPropagateKnownValues();
+  BackwardPropagateUnusedValues();
+}
+
+void ControlFlowGraph::RemoveStore(const tir::BufferStore& store) {
+  size_t context_index = [&]() {
+    auto it = control_flow_lookup_.find(store.get());
+    ICHECK(it != control_flow_lookup_.end())
+        << "BufferStore did not occur in the Stmt provided to BufferTouchPattern's constructor";
+    return it->second;
+  }();
+
+  auto& touch_points = control_flow_[context_index].touch_points;
+
+  touch_points.erase(std::remove_if(touch_points.begin(), touch_points.end(),
+                                    [](const BufferTouch& touch) {
+                                      return touch.touch_type == BufferTouch::AccessType::Write;
+                                    }),
+                     touch_points.end());
+  ForwardPropagateKnownValues(context_index);
+  BackwardPropagateUnusedValues(context_index);
 }
 
 std::ostream& operator<<(std::ostream& os, const ControlFlowGraph::ControlFlowEdge& edge) {
@@ -1327,33 +1348,38 @@ Array<Var> ControlFlowGraph::GetIndexVariables(const Buffer& buf, const Array<Pr
   return vars;
 }
 
-void ControlFlowGraph::ForwardPropagateKnownValues(size_t max_revisits) {
+void ControlFlowGraph::ForwardPropagateKnownValues(std::optional<size_t> flow_from) {
   // Values to visit when searching.  Using a std::set to
   // preferentially visit nodes near the start of the control flow.
   std::set<size_t> to_visit;
 
-  // Map from a block's index
-  std::unordered_map<size_t, size_t> visit_count_lookup;
-
-  // Initiatize the locations to search from, propagating values
-  // forward from all locations that have a known value.
-  for (size_t i = 0; i < control_flow_.size(); i++) {
-    bool has_known_value = false;
-    for (const auto& touch : control_flow_[i].touch_points) {
-      if (!HasBufferLoad(touch.value)) {
-        has_known_value = true;
-        break;
+  if (flow_from.has_value()) {
+    to_visit.insert(flow_from.value());
+  } else {
+    // Initiatize the locations to search from, propagating values
+    // forward from all locations that have a known value.
+    for (size_t i = 0; i < control_flow_.size(); i++) {
+      bool has_known_value = false;
+      for (const auto& touch : control_flow_[i].touch_points) {
+        if (!HasBufferLoad(touch.value)) {
+          has_known_value = true;
+          break;
+        }
       }
-    }
 
-    if (has_known_value) {
-      to_visit.insert(i);
+      if (has_known_value) {
+        to_visit.insert(i);
+      }
     }
   }
 
+  // Map from a block's index
+  std::unordered_map<size_t, size_t> visit_count_lookup;
+
   Analyzer analyzer;
   analyzer.rewrite_simplify.SetEnabledExtensions(arith::RewriteSimplifier::Extension(
       arith::RewriteSimplifier::kTransitivelyProveInequalities |
+      arith::RewriteSimplifier::kConvertBooleanToAndOfOrs |
       arith::RewriteSimplifier::kApplyConstraintsToBooleanBranches));
 
   analyzer.Bind(iterator_ranges_);
@@ -1369,7 +1395,7 @@ void ControlFlowGraph::ForwardPropagateKnownValues(size_t max_revisits) {
 
     // Step 1: Collect known values provided from each predecessor
     block.known_at_block_start = [&]() -> BufferState {
-      if (num_previous_visits >= max_revisits) {
+      if (num_previous_visits >= max_revisits_) {
         return BufferState();
       }
 
@@ -1437,7 +1463,7 @@ void ControlFlowGraph::ForwardPropagateKnownValues(size_t max_revisits) {
 
     // Step 2: Collect knowns provided as a result of executing this block
     auto post_state = [&]() {
-      if (num_previous_visits >= max_revisits) {
+      if (num_previous_visits >= max_revisits_) {
         return BufferState();
       }
       auto post_state = block.known_at_block_start;
@@ -1459,29 +1485,35 @@ void ControlFlowGraph::ForwardPropagateKnownValues(size_t max_revisits) {
   }
 }
 
-void ControlFlowGraph::BackwardPropagateUnusedValues(size_t max_revisits) {
+void ControlFlowGraph::BackwardPropagateUnusedValues(std::optional<size_t> flow_from) {
   // Values to visit when searching.  Using a std::set to
   // preferentially visit nodes near the end of the control flow.
   std::set<size_t> to_visit;
 
-  // Map from a block's index
-  std::unordered_map<size_t, size_t> visit_count_lookup;
-
-  // Initiatize the locations to search from, propagating values
-  // backward from anywhere that performs a write.
-  for (size_t i = 0; i < control_flow_.size(); i++) {
-    const auto& touch_points = control_flow_[i].touch_points;
-    bool performs_write = std::any_of(
-        touch_points.begin(), touch_points.end(),
-        [](const auto& touch) { return touch.touch_type == BufferTouch::AccessType::Write; });
-    if (performs_write) {
-      to_visit.insert(i);
+  if (flow_from.has_value()) {
+    to_visit.insert(flow_from.value());
+  } else {
+    // Initiatize the locations to search from, propagating values
+    // backward from anywhere that performs a write.
+    for (size_t i = 0; i < control_flow_.size(); i++) {
+      const auto& touch_points = control_flow_[i].touch_points;
+      bool performs_write = std::any_of(
+          touch_points.begin(), touch_points.end(),
+          [](const auto& touch) { return touch.touch_type == BufferTouch::AccessType::Write; });
+      if (performs_write) {
+        to_visit.insert(i);
+      }
     }
   }
 
+  // Map from a block's index
+  std::unordered_map<size_t, size_t> visit_count_lookup;
+
   Analyzer analyzer;
-  analyzer.rewrite_simplify.SetEnabledExtensions(
-      arith::RewriteSimplifier::kTransitivelyProveInequalities);
+  analyzer.rewrite_simplify.SetEnabledExtensions(arith::RewriteSimplifier::Extension(
+      arith::RewriteSimplifier::kTransitivelyProveInequalities |
+      arith::RewriteSimplifier::kConvertBooleanToAndOfOrs |
+      arith::RewriteSimplifier::kApplyConstraintsToBooleanBranches));
 
   analyzer.Bind(iterator_ranges_);
   analyzer.Bind(free_predicate_parameters_);
@@ -1496,7 +1528,7 @@ void ControlFlowGraph::BackwardPropagateUnusedValues(size_t max_revisits) {
 
     // Step 1: Collect known unused indices provided by each successor
     block.unused_at_block_end = [&]() -> BufferState {
-      if (num_previous_visits >= max_revisits) {
+      if (num_previous_visits >= max_revisits_) {
         return BufferState();
       }
       ICHECK_LE(block.successors.size(), 2)
@@ -1561,7 +1593,7 @@ void ControlFlowGraph::BackwardPropagateUnusedValues(size_t max_revisits) {
 
     // Step 2: Collect knowns provided as a result of executing this block
     auto unused_at_block_start = [&]() {
-      if (num_previous_visits >= max_revisits) {
+      if (num_previous_visits >= max_revisits_) {
         return BufferState();
       }
       auto prior_state = block.unused_at_block_end;
@@ -1603,8 +1635,10 @@ bool ControlFlowGraph::IsOverwrittenWithoutEffect(const tir::BufferStore& store,
   local_analyzer.Bind(free_predicate_parameters_);
   local_analyzer.Bind(iterator_ranges_);
   local_analyzer.Bind(free_params);
-  local_analyzer.rewrite_simplify.SetEnabledExtensions(
-      RewriteSimplifier::kTransitivelyProveInequalities);
+  local_analyzer.rewrite_simplify.SetEnabledExtensions(arith::RewriteSimplifier::Extension(
+      arith::RewriteSimplifier::kTransitivelyProveInequalities |
+      arith::RewriteSimplifier::kConvertBooleanToAndOfOrs |
+      arith::RewriteSimplifier::kApplyConstraintsToBooleanBranches));
 
   PrimExpr predicate = store_touch.predicate && store_touch.AtLoopIteration();
 
@@ -1630,13 +1664,16 @@ PrimExpr ControlFlowGraph::SimplifyInContext(PrimExpr expr, const tir::Stmt& con
     return it->second;
   }();
 
+  const auto& control_flow_block = control_flow_[context_index];
+
   PrimExpr constraint = Bool(true);
   for (const auto& known : non_buffer_assumptions_) {
     constraint = constraint && known;
   }
   With<ConstraintContext> constraint_context(analyzer, constraint);
+  With<ConstraintContext> control_flow_scope(analyzer, control_flow_block.scope_predicate);
 
-  expr = control_flow_[context_index].known_at_block_start.SubstituteKnownBufferValues(
+  expr = control_flow_block.known_at_block_start.SubstituteKnownBufferValues(
       std::move(expr), axis_var_lookup_, analyzer);
 
   expr = analyzer->Simplify(std::move(expr));
diff --git a/src/tir/analysis/control_flow_graph.h b/src/tir/analysis/control_flow_graph.h
index aa9023ba29dd..590392cf658a 100644
--- a/src/tir/analysis/control_flow_graph.h
+++ b/src/tir/analysis/control_flow_graph.h
@@ -29,6 +29,7 @@
 #include <tvm/tir/stmt.h>
 #include <tvm/tir/var.h>
 
+#include <optional>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -474,13 +475,17 @@ class ControlFlowGraph {
 
   /*! \brief Propagate known values from known BufferStore/assume
    *  subsequent control flow blocks
+   *
+   * \param flow_from If specified, re-flow only from that block.
    */
-  void ForwardPropagateKnownValues(size_t max_revisits);
+  void ForwardPropagateKnownValues(std::optional<size_t> flow_from = std::nullopt);
 
   /*! \brief Propagate overwritten/unused indices to preceding control
    *  flow blocks
+   *
+   * \param flow_from If specified, re-flow only from that block.
    */
-  void BackwardPropagateUnusedValues(size_t max_revisits);
+  void BackwardPropagateUnusedValues(std::optional<size_t> flow_from = std::nullopt);
 
   struct ControlFlowEdge {
     /* \brief The source block of the control flow edge
@@ -646,6 +651,9 @@ class ControlFlowGraph {
   std::vector<PrimExpr> non_buffer_assumptions_;
 
   friend class ControlFlowGraphBuilder;
+
+  /*! \brief The maximum number of revisits while flowing constraints */
+  size_t max_revisits_;
 };
 
 }  // namespace tir
diff --git a/src/tir/transforms/remove_no_op.cc b/src/tir/transforms/remove_no_op.cc
index 41250408a7f2..3374f975f5ac 100644
--- a/src/tir/transforms/remove_no_op.cc
+++ b/src/tir/transforms/remove_no_op.cc
@@ -29,21 +29,71 @@
 #include <tvm/tir/stmt_functor.h>
 #include <tvm/tir/transform.h>
 
+#include <optional>
 #include <unordered_map>
 
 #include "../../arith/const_fold.h"
+#include "../../arith/ir_mutator_with_analyzer.h"
+#include "../analysis/control_flow_graph.h"
 #include "ir_utils.h"
 
 namespace tvm {
 namespace tir {
 
+struct RemoveNoOpConfigNode : public tvm::AttrsNode<RemoveNoOpConfigNode> {
+  bool use_dataflow_analysis;
+
+  TVM_DECLARE_ATTRS(RemoveNoOpConfigNode, "tir.transform.RemoveNoOpConfig") {
+    TVM_ATTR_FIELD(use_dataflow_analysis)
+        .describe(
+            "If true, known buffer values are propagated and used "
+            "to statically prove statements as no-ops.")
+        .set_default(false);
+  }
+};
+
+class RemoveNoOpConfig : public Attrs {
+ public:
+  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(RemoveNoOpConfig, Attrs, RemoveNoOpConfigNode);
+};
+
+TVM_REGISTER_NODE_TYPE(RemoveNoOpConfigNode);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.RemoveNoOp", RemoveNoOpConfig);
+
 // Mark the statement of each stage.
-class NoOpRemover : public StmtMutator {
+class NoOpRemover : public arith::IRMutatorWithAnalyzer {
  public:
+  static Stmt Apply(Stmt stmt, arith::Analyzer* analyzer,
+                    std::optional<ControlFlowGraph> touch_pattern, const StmtNode* context) {
+    NoOpRemover visitor(analyzer, touch_pattern, context);
+    return visitor(std::move(stmt));
+  }
+
+ private:
+  using Parent = IRMutatorWithAnalyzer;
+  using Parent::VisitStmt;
+  using Parent::VisitStmt_;
+
+  NoOpRemover(arith::Analyzer* analyzer, std::optional<ControlFlowGraph> touch_pattern,
+              const StmtNode* context)
+      : Parent(analyzer), touch_pattern_(touch_pattern), context_(context) {}
+
   Stmt VisitStmt_(const LetStmtNode* op) final {
-    Stmt stmt = StmtMutator::VisitStmt_(op);
+    Stmt stmt = Parent::VisitStmt_(op);
     op = stmt.as<LetStmtNode>();
-    return is_no_op(op->body) ? MakeEvaluate(op->value) : stmt;
+    if (is_no_op(op->body)) {
+      return MakeEvaluate(op->value);
+    }
+
+    bool body_uses_bound_variable =
+        !UsesVar(op->body, [&](const VarNode* var) { return var == op->var.get(); });
+    if (body_uses_bound_variable && HasSideEffect(op->value)) {
+      return SeqStmt({MakeEvaluate(op->value), op->body});
+    } else if (body_uses_bound_variable) {
+      return op->body;
+    } else {
+      return stmt;
+    }
   }
   Stmt VisitStmt_(const AttrStmtNode* op) final {
     if (op->attr_key == "pragma_debug_skip_region") {
@@ -58,24 +108,26 @@ class NoOpRemover : public StmtMutator {
         // We assume that such wait is a nop.
         auto inner = op->body.as<AttrStmtNode>();
         ICHECK(inner);
-        return StmtMutator::VisitStmt(inner->body);
+        return Parent::VisitStmt(inner->body);
       }
     }
 
-    Stmt stmt = StmtMutator::VisitStmt_(op);
+    Stmt stmt = Parent::VisitStmt_(op);
     op = stmt.as<AttrStmtNode>();
     return is_no_op(op->body) ? MakeEvaluate(op->value) : stmt;
   }
   Stmt VisitStmt_(const IfThenElseNode* op) final {
-    Stmt stmt = StmtMutator::VisitStmt_(op);
+    Stmt stmt = Parent::VisitStmt_(op);
     op = stmt.as<IfThenElseNode>();
     if (op->else_case) {
-      if (is_no_op(op->else_case.value())) {
-        if (is_no_op(op->then_case)) {
-          return MakeEvaluate(op->condition);
-        } else {
-          return IfThenElse(op->condition, op->then_case);
-        }
+      bool no_op_else = is_no_op(op->else_case.value());
+      bool no_op_then = is_no_op(op->then_case);
+      if (no_op_else && no_op_then) {
+        return MakeEvaluate(op->condition);
+      } else if (no_op_else) {
+        return IfThenElse(op->condition, op->then_case);
+      } else if (no_op_then) {
+        return IfThenElse(!op->condition, op->else_case.value());
       } else {
         return stmt;
       }
@@ -88,13 +140,13 @@ class NoOpRemover : public StmtMutator {
     }
   }
   Stmt VisitStmt_(const ForNode* op) final {
-    var_range_map_[op->loop_var.get()] = arith::IntSet::FromMinExtent(op->min, op->extent);
     auto extent_range = arith::EvalSet(op->extent, var_range_map_);
     if (!arith::is_neg_inf(extent_range.max()) && !arith::is_pos_inf(extent_range.max()) &&
-        analyzer_.CanProve(extent_range.max() <= 0)) {
+        analyzer_->CanProve(extent_range.max() <= 0)) {
       return Evaluate(0);
     }
-    Stmt stmt = StmtMutator::VisitStmt_(op);
+    var_range_map_[op->loop_var.get()] = arith::IntSet::FromMinExtent(op->min, op->extent);
+    Stmt stmt = Parent::VisitStmt_(op);
     var_range_map_.erase(op->loop_var.get());
     op = stmt.as<ForNode>();
     if (is_zero(op->extent)) {
@@ -114,42 +166,104 @@ class NoOpRemover : public StmtMutator {
     return is_no_op(op->body) ? op->body : stmt;
   }
   Stmt VisitStmt_(const EvaluateNode* op) final {
-    if (SideEffect(op->value) > CallEffectKind::kReadState) return GetRef<Stmt>(op);
-    return Evaluate(0);
+    if (HasSideEffect(op->value)) {
+      return GetRef<Stmt>(op);
+    } else {
+      return Evaluate(0);
+    }
   }
 
   Stmt VisitStmt_(const SeqStmtNode* op) final {
-    Stmt ret = StmtMutator::VisitSeqStmt_(op, true);
-    op = ret.as<SeqStmtNode>();
-    ICHECK(op != nullptr);
-    bool need_compact = false;
-    for (size_t i = 0; i < op->size(); ++i) {
-      if (is_no_op(op->seq[i])) need_compact = true;
-    }
+    auto ret = Downcast<SeqStmt>(StmtMutator::VisitSeqStmt_(op, true));
+
+    bool need_compact = std::any_of(ret->seq.begin(), ret->seq.end(),
+                                    [](const auto& stmt) { return is_no_op(stmt); });
+
     if (need_compact) {
-      auto n = CopyOnWrite(op);
-      size_t top = 0;
-      for (size_t i = 0; i < n->seq.size(); ++i) {
-        if (!is_no_op(n->seq[i])) {
-          n->seq.Set(top++, n->seq[i]);
+      Array<Stmt> filtered;
+      for (Stmt stmt : ret->seq) {
+        if (!is_no_op(stmt)) {
+          filtered.push_back(std::move(stmt));
         }
       }
-      if (top == 1) {
-        return n->seq[0];
-      } else {
-        n->seq.resize(top);
-        return Stmt(n);
-      }
+      ret = SeqStmt(filtered);
+    }
+
+    if (ret->size() == 0) {
+      return Evaluate(0);
+    } else if (ret->size() == 1) {
+      return ret->seq[0];
     } else {
-      if (op->size() == 1) {
-        return op->seq[0];
-      } else {
-        return ret;
+      return std::move(ret);
+    }
+  }
+
+  Stmt VisitStmt_(const BufferStoreNode* op) final {
+    BufferStore store = GetRef<BufferStore>(op);
+
+    // Helper function that returns a statement containing only the
+    // side effects of evaluating this BufferStore, but not the store
+    // itself.
+    auto only_side_effects = [&]() {
+      Array<Stmt> statements;
+      statements.push_back(MakeEvaluate(store->value));
+      for (const auto& index : store->indices) {
+        statements.push_back(MakeEvaluate(index));
+      }
+      return this->VisitStmt(SeqStmt(statements));
+    };
+
+    if (touch_pattern_.has_value()) {
+      // A write that is later overwritten is a no-op.
+      Stmt context = context_ ? GetRef<Stmt>(context_) : store;
+      if (touch_pattern_->IsOverwrittenWithoutEffect(store, context)) {
+        touch_pattern_->RemoveStore(store);
+        return only_side_effects();
+      }
+
+      // A write whose destination is known to already contain the
+      // values to be written is a no-op.
+      PrimExpr stores_existing_value = store->value == BufferLoad(store->buffer, store->indices);
+
+      PrimExpr simplified =
+          touch_pattern_->SimplifyInContext(stores_existing_value, context, analyzer_);
+      if (auto* as_int = as_const_int(simplified); as_int && *as_int) {
+        return only_side_effects();
       }
     }
+
+    // If the stored value is a load from the same location, the
+    // statement is a no-op, regardless of contextual information.
+    if (const BufferLoadNode* load = store->value.as<BufferLoadNode>()) {
+      if (load->buffer->data.same_as(store->buffer->data) &&
+          analyzer_->CanProveEqual(load->buffer->elem_offset, store->buffer->elem_offset) &&
+          ArrayValueEqual(load->buffer->shape, store->buffer->shape) &&
+          ArrayValueEqual(load->buffer->strides, store->buffer->strides) &&
+          ArrayValueEqual(load->indices, store->indices)) {
+        return only_side_effects();
+      }
+    }
+
+    return std::move(store);
   }
 
  private:
+  bool ArrayValueEqual(const Array<PrimExpr>& a, const Array<PrimExpr>& b) {
+    if (a.size() != b.size()) {
+      return false;
+    }
+    for (size_t i = 0; i < a.size(); i++) {
+      if (!analyzer_->CanProveEqual(a[i], b[i])) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool HasSideEffect(const PrimExpr& value) {
+    return SideEffect(value) > CallEffectKind::kReadState;
+  }
+
   Stmt MakeEvaluate(PrimExpr value) {
     if (SideEffect(value) > CallEffectKind::kReadState) {
       return Evaluate(value);
@@ -158,31 +272,47 @@ class NoOpRemover : public StmtMutator {
     }
   }
   Stmt MakeEvaluate(const Array<PrimExpr>& values) {
-    Stmt stmt;
+    Array<Stmt> stmts;
     for (PrimExpr e : values) {
       if (SideEffect(e) > CallEffectKind::kReadState) {
-        if (stmt.defined()) {
-          stmt = SeqStmt({stmt, Evaluate(e)});
-        } else {
-          stmt = Evaluate(e);
-        }
+        stmts.push_back(Evaluate(e));
       }
     }
-    return stmt.defined() ? stmt : Evaluate(0);
+
+    if (stmts.size() == 0) {
+      return Evaluate(0);
+    } else if (stmts.size() == 1) {
+      return stmts[0];
+    } else {
+      return SeqStmt(stmts);
+    }
   }
 
   std::unordered_map<const VarNode*, arith::IntSet> var_range_map_;
-  arith::Analyzer analyzer_;
+  std::optional<ControlFlowGraph> touch_pattern_;
+  const StmtNode* context_;
 };
 
-Stmt RemoveNoOp(Stmt stmt) { return NoOpRemover()(std::move(stmt)); }
-
 namespace transform {
 
 Pass RemoveNoOp() {
   auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) {
+    std::optional<ControlFlowGraph> touch_pattern = std::nullopt;
+
+    RemoveNoOpConfig config = ctx->GetConfig<RemoveNoOpConfig>("tir.RemoveNoOp")
+                                  .value_or(AttrsWithDefaultValues<RemoveNoOpConfig>());
+    if (config->use_dataflow_analysis) {
+      touch_pattern.emplace(f->body);
+    }
+
+    arith::Analyzer analyzer;
+    analyzer.rewrite_simplify.SetEnabledExtensions(arith::RewriteSimplifier::Extension(
+        arith::RewriteSimplifier::kTransitivelyProveInequalities |
+        arith::RewriteSimplifier::kConvertBooleanToAndOfOrs |
+        arith::RewriteSimplifier::kApplyConstraintsToBooleanBranches));
+
     auto* n = f.CopyOnWrite();
-    n->body = NoOpRemover()(std::move(n->body));
+    n->body = NoOpRemover::Apply(std::move(n->body), &analyzer, std::move(touch_pattern), nullptr);
     return f;
   };
   return CreatePrimFuncPass(pass_func, 0, "tir.RemoveNoOp", {});
diff --git a/tests/python/unittest/test_tir_transform_remove_no_op.py b/tests/python/unittest/test_tir_transform_remove_no_op.py
index 820e32eb7e72..ce37329b7ed3 100644
--- a/tests/python/unittest/test_tir_transform_remove_no_op.py
+++ b/tests/python/unittest/test_tir_transform_remove_no_op.py
@@ -19,6 +19,8 @@
 from tvm.script import tir as T
 import tvm.testing
 
+import pytest
+
 
 def nop():
     return tvm.tir.Evaluate(0)
@@ -82,5 +84,524 @@ def main(A: T.Buffer[(16), "int32"], B: T.Buffer[(16), "int32"]) -> None:
     assert isinstance(ret, tvm.tir.Evaluate)
 
 
+class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
+    use_dataflow_analysis = False
+
+    def transform(self):
+        def inner(mod):
+            config = {
+                "tir.RemoveNoOp": {
+                    "use_dataflow_analysis": self.use_dataflow_analysis,
+                }
+            }
+            with tvm.transform.PassContext(config=config):
+                mod = tvm.tir.transform.RemoveNoOp()(mod)
+            return mod
+
+        return inner
+
+
+class TestRemoveEmptyForLoop(BaseBeforeAfter):
+    """A for-loop whose body is a no-op is itself a no-op."""
+
+    def before():
+        for i in T.serial(16):
+            T.evaluate(0)
+
+    def expected():
+        T.evaluate(0)
+
+
+class TestRemoveZeroExtentLoop(BaseBeforeAfter):
+    """A for-loop with no extent is a no-op."""
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(0):
+            A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        T.evaluate(0)
+
+
+class TestRemoveUnusedLet(BaseBeforeAfter):
+    """A let statement that is never used is a no-op."""
+
+    def before(A: T.Buffer[16, "int32"]):
+        x = 5
+        for i in T.serial(16):
+            A[i] = 0
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 0
+
+
+class TestRemoveLetUsedOnlyInNoOp(BaseBeforeAfter):
+    """A let statement that is never used is a no-op.
+
+    Similar to TestRemoveUnusedLet, but the usage of the let binding
+    may have been removed by an earlier removal of another no-op.
+    """
+
+    def before(A: T.Buffer[16, "int32"]):
+        x = 5
+        for i in T.serial(0):
+            A[i] = x
+
+    def expected(A: T.Buffer[16, "int32"]):
+        T.evaluate(0)
+
+
+class TestKeepSideEffectsOfLet(BaseBeforeAfter):
+    """The side effects of a no-op let must be kept."""
+
+    def before():
+        x = T.call_extern("extern_func", dtype="int32")
+        T.evaluate(0)
+
+    def expected():
+        T.evaluate(T.call_extern("extern_func", dtype="int32"))
+
+
+class TestRemoveEmptyThenCase(BaseBeforeAfter):
+    """A no-op then_case can be removed."""
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 8:
+                T.evaluate(0)
+            else:
+                A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if not (i < 8):
+                A[i] = 42
+
+
+class TestRemoveEmptyElseCase(BaseBeforeAfter):
+    """A no-op else_case can be removed."""
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 8:
+                A[i] = 42
+            else:
+                T.evaluate(0)
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 8:
+                A[i] = 42
+
+
+class TestRemoveUnusedWrite(BaseBeforeAfter):
+    """For two sequential writes, the first is a no-op"""
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 100
+            A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 42
+
+
+class TestSuppressRemovalOfUnusedWrite(BaseBeforeAfter):
+    """Dataflow analysis requires the config to opt-in
+
+    Like TestRemoveUnusedWrite, but dataflow analysis isn't enabled.
+    """
+
+    use_dataflow_analysis = False
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 100
+            A[i] = 42
+
+    expected = before
+
+
+class TestKeepSideEffectsOfUnusedWrite(BaseBeforeAfter):
+    """For two sequential writes, the first value may have side effects"""
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = T.call_extern("extern_func", dtype="int32")
+            A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            T.evaluate(T.call_extern("extern_func", dtype="int32"))
+            A[i] = 42
+
+
+class TestKeepFirstWriteWhenUsed(BaseBeforeAfter):
+    """For two sequential writes, keep the first if it is used"""
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 100
+            A[i] = A[i] + 1
+
+    expected = before
+
+
+class TestRemoveOverwrittenLoop(BaseBeforeAfter):
+    """Remove repeated writes to the same region
+
+    If two loops write to the same region, the first is a no-op.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 100
+
+        for i in T.serial(16):
+            A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 42
+
+
+class TestRemoveOverwrittenSubloop(BaseBeforeAfter):
+    """Remove repeated writes to the same region
+
+    If the first loop writes to a subset of the region, the first loop
+    is a no-op.  Similar to TestRemoveOverwrittenLoop, but the first
+    loop's extents are a subset of the second loop.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(4, 12):
+            A[i] = 100
+
+        for i in T.serial(16):
+            A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 42
+
+
+class TestKeepPartiallyOverwrittenLoop(BaseBeforeAfter):
+    """Keep partially overwritten regions
+
+    If the second loop doesn't entirely overwrite the first, the first
+    may not be removed be kept.
+    """
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 100
+
+        for i in T.serial(16):
+            if i < 12:
+                A[i] = 42
+
+    expected = before
+
+
+class TestRemoveOverwrittenPredicatedLoopWithIdenticalCondition(BaseBeforeAfter):
+    """Remove repeated writes to the same predicated region.
+
+    Similar to TestKeepPartiallyOverwrittenLoop, except the first loop
+    has the same predicate as the second, and can therefore be
+    removed.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 12:
+                A[i] = 100
+
+        for i in T.serial(16):
+            if i < 12:
+                A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 12:
+                A[i] = 42
+
+
+class TestRemoveOverwrittenPredicatedLoopWithProvableCondition(BaseBeforeAfter):
+    """Remove repeated writes to the same predicated region.
+
+    Similar to
+    TestRemoveOverwrittenPredicatedLoopWithIdenticalCondition, except
+    the first loop's predicate is not a precise match for the second
+    loop's predicate.  So long as the regions written in the first
+    loop are a subset of those written in the second loop, they can be
+    removed.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 10:
+                A[i] = 100
+
+        for i in T.serial(16):
+            if i // 4 < 3:
+                A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i // 4 < 3:
+                A[i] = 42
+
+
+class TestRemoveSeparatedOverwrites(BaseBeforeAfter):
+    """Remove repeated writes to the same predicated region.
+
+    Similar to TestRemoveOverwrittenLoopRegion, but with an
+    independent loop between the first and second write of the buffer.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"], B: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 100
+
+        for i in T.serial(16):
+            B[i] = 0
+
+        for i in T.serial(16):
+            A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"], B: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            B[i] = 0
+
+        for i in T.serial(16):
+            A[i] = 42
+
+
+@pytest.mark.xfail(reason="Not implemented yet")
+class TestRemoveSeparatedOverwriteOfPredicatedLoop(BaseBeforeAfter):
+    """Remove repeated writes to the same predicated region.
+
+    Similar to TestRemoveSeparatedOverwrites, but the independent loop
+    between the first and second writes writes to a different subset
+    of the same buffer.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 12:
+                A[i] = 100
+
+        for i in T.serial(16):
+            if i > 12:
+                A[i] = 15
+
+        for i in T.serial(16):
+            if i < 12:
+                A[i] = 42
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i > 12:
+                A[i] = 15
+
+        for i in T.serial(16):
+            if i < 12:
+                A[i] = 42
+
+
+class TestRemoveReadWrite(BaseBeforeAfter):
+    """Writing a value to the same location as was just read is a no-op."""
+
+    def before(A: T.Buffer[1, "int32"]):
+        A[0] = A[0]
+
+    def expected(A: T.Buffer[1, "int32"]):
+        T.evaluate(0)
+
+
+class TestKeepReadWriteToDifferentIndices(BaseBeforeAfter):
+    """Writing a value to a different index should not be removed"""
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(15):
+            A[i] = A[i + 1]
+
+    expected = before
+
+
+class TestRemoveReadWriteSameIndexDifferentExpression(BaseBeforeAfter):
+    """Writing a value to the same location as the read is a no-op.
+
+    If the value of the index can be proven to be the same, then the
+    no-op can be removed, even if they have different forms of the
+    expression.
+    """
+
+    def before(A: T.Buffer[16, "int32"]):
+        for io, ii in T.grid(4, 4):
+            i = 4 * io + ii
+            A[4 * io + ii] = A[i]
+
+    def expected(A: T.Buffer[16, "int32"]):
+        T.evaluate(0)
+
+
+class TestRemoveReadWriteSameIndexUsingConstraint(BaseBeforeAfter):
+    """Writing a value to the same location as the read is a no-op.
+
+    If the value of the index can be proven to be the same, then the
+    no-op can be removed.  This may require using the a constraint
+    that is known from a conditional containing the read/write.
+    """
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i != 0:
+                A[i] = A[i - 1]
+            else:
+                A[i] = A[0]
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i != 0:
+                A[i] = A[i - 1]
+
+
+class TestRemoveWritingOfKnownValue(BaseBeforeAfter):
+    """Writing a value that already exists at that index is a no-op"""
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = i
+
+        A[4] = 4
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = i
+
+
+class TestKeepOneOfDuplicateLoops(BaseBeforeAfter):
+    """Must not reason based on a touch point after removing it.
+
+    If the first loop is removed because it is overwritten by the
+    second loop, and the second loop is removed because it writes the
+    same value as the first loop, the overall transformation is no
+    longer valid.  In this case, only one of the two should be
+    removed.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = i
+
+        for i in T.serial(16):
+            A[i] = i
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = i
+
+
+class TestRemoveEmptyTemporary(BaseBeforeAfter):
+    """An allocation with a no-op body is a no-op."""
+
+    def before():
+        A = T.allocate([16], "int32", "local")
+        T.evaluate(0)
+
+    def expected():
+        T.evaluate(0)
+
+
+@pytest.mark.xfail(reason="Not implemented yet")
+class TestRemoveUnusedTemporary(BaseBeforeAfter):
+    """An unused allocation is a no-op."""
+
+    def before(A: T.Buffer[16, "int32"]):
+        B = T.allocate([16], "int32", "local")
+        for i in T.serial(16):
+            A[i] = 1
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 1
+
+
+@pytest.mark.xfail(reason="Not implemented yet")
+class TestRemoveUnusedWriteIntoTemporary(BaseBeforeAfter):
+    """A write that only impacts a temporary allocation is a no-op."""
+
+    def before():
+        A = T.decl_buffer([16], "int32", scope="local")
+        for i in T.serial(16):
+            A[i] = 0
+
+    def expected():
+        T.evaluate(0)
+
+
+class TestKeepUsedWriteIntoTemporary(BaseBeforeAfter):
+    """A write into a temporary that is used later must be kept."""
+
+    def before(B: T.Buffer[16, "int32"]):
+        A = T.decl_buffer([16], "int32", scope="local")
+        for i in T.serial(16):
+            A[i] = 0
+
+        for i in T.serial(16):
+            B[i] = A[i]
+
+    expected = before
+
+
+@pytest.mark.xfail(reason="Not implemented yet")
+class TestRemoveWriteIntoTemporary(BaseBeforeAfter):
+    """A write that only impacts a temporary allocation is a no-op."""
+
+    def before(A: T.Buffer[16, "int32"], C: T.Buffer[1, "int32"]):
+        B = T.decl_buffer([16], "int32", scope="local")
+        for i in T.serial(16):
+            B[i] = A[i]
+
+        C[0] = 0
+        for i in T.serial(16):
+            C[0] = C[0] + B[i]
+
+        for i in T.serial(16):
+            B[i] = 0
+
+    def expected(A: T.Buffer[16, "int32"], C: T.Buffer[1, "int32"]):
+        B = T.decl_buffer([16], "int32", scope="local")
+        for i in T.serial(16):
+            B[i] = A[i]
+
+        C[0] = 0
+        for i in T.serial(16):
+            C[0] = C[0] + B[i]
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_transform_simplify.py b/tests/python/unittest/test_tir_transform_simplify.py
index fd98b715a4bc..1ddc0e50d98f 100644
--- a/tests/python/unittest/test_tir_transform_simplify.py
+++ b/tests/python/unittest/test_tir_transform_simplify.py
@@ -1267,6 +1267,7 @@ class TestSimplifyUsingPartiallyKnownBufferConditional(BaseBeforeAfter):
     """An assumption about buffer contents may apply to only part of a buffer"""
 
     propagate_knowns_to_prove_conditional = True
+    apply_constraints_to_boolean_branches = True
 
     def before(A: T.Buffer[16, "int32"]):
         for i in T.serial(16):

From fc606c09b223df445dd6bc6a33d3e3bfbd670535 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 25 Nov 2022 16:37:45 -0600
Subject: [PATCH 651/704] [TIR][TVMScript] Cleaner printing of And/Or chains
 (#13432)

Even though the operator precedence of `And` has a higher precedence
than `Or`, removing parentheses based on this precedence can harm
readability.  This commit adds an exception to the TVMScript rules for
parentheses, to always insert parentheses between `And` and `Or`
operators.

In addition, adding a rewrite rule to preferentially produce And/Or
chains that may be expressed in a single left-associative chain of
operators.

Between these two changes, the readability of boolean expressions can
be improved.  Below is the motivating example for this change.  In
each case, the output had been passed through the `black` formatter.
Both expressions are equivalent, but the before-case was much more
difficult to read.

```python
x = (
    AAA == 0
    and BBB < 4
    or AAA == 7
    and 6 <= BBB
    or (CCC == 0 and DDD < 4 or CCC == 7 and 6 <= DDD)
)

x = (
    (AAA == 0 and BBB < 4)
    or (AAA == 7 and 6 <= BBB)
    or (CCC == 0 and DDD < 4)
    or (CCC == 7 and 6 <= DDD)
)
```
---
 src/arith/rewrite_simplify.cc                 |  9 ++++--
 src/printer/tvmscript_printer.cc              |  6 ++--
 .../unittest/test_arith_rewrite_simplify.py   |  9 ++++++
 .../test_tir_schedule_transform_layout.py     | 12 +++----
 .../unittest/test_tvmscript_roundtrip.py      | 32 +++++++++++++++++++
 5 files changed, 57 insertions(+), 11 deletions(-)

diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index 90c448f4ea5c..c9d92f992564 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -1737,7 +1737,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
   }
 
   // Pattern var to match any expression
-  PVar<PrimExpr> x, y;
+  PVar<PrimExpr> x, y, z;
   // Pattern var match IntImm
   PVar<IntImm> c1, c2, c3;
   PVar<int> lanes;
@@ -1815,6 +1815,9 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
                             c1 * c2 + c3 < x && x < (c1 + 1) * c2);
   TVM_TRY_RECURSIVE_REWRITE(c3 < floormod(x, c2) && floordiv(x, c2) == c1,
                             c1 * c2 + c3 < x && x < (c1 + 1) * c2);
+
+  TVM_TRY_RECURSIVE_REWRITE(x && (y && z), (x && y) && z);
+
   return ret;
 }
 
@@ -1874,7 +1877,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) {
   }
 
   // Pattern var to match any expression
-  PVar<PrimExpr> x, y;
+  PVar<PrimExpr> x, y, z;
   // Pattern var match IntImm
   PVar<IntImm> c1, c2;
   PVar<int> lanes;
@@ -1912,6 +1915,8 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) {
   TVM_TRY_RECURSIVE_REWRITE(x == y || x < y, x <= y);
   TVM_TRY_RECURSIVE_REWRITE(y == x || x < y, x <= y);
 
+  TVM_TRY_RECURSIVE_REWRITE(x || (y || z), (x || y) || z);
+
   return ret;
 }
 
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index 8f012f3b0eb3..05e514295c04 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -823,13 +823,15 @@ bool WillPrintConstScalar(const PrimExpr& expr) {
     ICHECK(rhs_precedence != ExprPrecedence::kUnknown);                                           \
     /* Update out_precedence of current node. */                                                  \
     *out_precedence = OpPrecedence;                                                               \
-    if (lhs_precedence > OpPrecedence) {                                                          \
+    if (lhs_precedence > OpPrecedence ||                                                          \
+        (lhs_precedence == ExprPrecedence::kAnd && OpPrecedence == ExprPrecedence::kOr)) {        \
       doc << "(" << lhs_doc << ")";                                                               \
     } else {                                                                                      \
       doc << lhs_doc;                                                                             \
     }                                                                                             \
     doc << OpString;                                                                              \
-    if (rhs_precedence >= OpPrecedence) {                                                         \
+    if (rhs_precedence >= OpPrecedence ||                                                         \
+        (rhs_precedence == ExprPrecedence::kAnd && OpPrecedence == ExprPrecedence::kOr)) {        \
       doc << "(" << rhs_doc << ")";                                                               \
     } else {                                                                                      \
       doc << rhs_doc;                                                                             \
diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py
index 4199cb9a56f7..d6c2cfe8bbdd 100644
--- a/tests/python/unittest/test_arith_rewrite_simplify.py
+++ b/tests/python/unittest/test_arith_rewrite_simplify.py
@@ -992,6 +992,15 @@ def test_logical_simplify():
     ck.verify(tvm.tir.Or(2 <= x, x <= 1), tvm.tir.const(True, "bool"))
     ck.verify(tvm.tir.Or(x != 1, x == 2), x != 1)
 
+    ck.verify(
+        tvm.tir.Or(x == 1, tvm.tir.Or(y == 1, z == 1)),
+        tvm.tir.Or(tvm.tir.Or(x == 1, y == 1), z == 1),
+    )
+    ck.verify(
+        tvm.tir.And(x == 1, tvm.tir.And(y == 1, z == 1)),
+        tvm.tir.And(tvm.tir.And(x == 1, y == 1), z == 1),
+    )
+
 
 def test_let_simplify():
     ck = RewriteChecker()
diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py
index e90478922324..faeaf8768681 100644
--- a/tests/python/unittest/test_tir_schedule_transform_layout.py
+++ b/tests/python/unittest/test_tir_schedule_transform_layout.py
@@ -872,13 +872,11 @@ def expected(A: T.Buffer[16, "int32"], n: T.int32):
                 B[vi, vj] = T.if_then_else(
                     # Checks if the transform introduced padding
                     -16 % n != 0
-                    and (
-                        # If so, is vi in the last group (which may
-                        # include padding).
-                        (vj + vi * n) // n == 16 // n
-                        # And is vj within the padding
-                        and 16 % n <= (vj + vi * n) % n
-                    ),
+                    # If so, is vi in the last group (which may
+                    # include padding).
+                    and (vj + vi * n) // n == 16 // n
+                    # And is vj within the padding
+                    and 16 % n <= (vj + vi * n) % n,
                     0,
                     A[vj + vi * n],
                     dtype="int32",
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index 53b3cd69ea80..0ead66bd609f 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3506,6 +3506,37 @@ def func(i: T.int32) -> None:
     return func
 
 
+def nested_boolean_expressions():
+    expressions = {
+        "and_lhs_and": lambda i, j, k: tir.all(tir.all(i, j), k),
+        "and_rhs_and": lambda i, j, k: tir.all(i, tir.all(j, k)),
+        "and_lhs_or": lambda i, j, k: tir.all(tir.any(i, j), k),
+        "and_rhs_or": lambda i, j, k: tir.all(i, tir.any(j, k)),
+        "or_lhs_and": lambda i, j, k: tir.any(tir.all(i, j), k),
+        "or_rhs_and": lambda i, j, k: tir.any(i, tir.all(j, k)),
+        "or_lhs_or": lambda i, j, k: tir.any(tir.any(i, j), k),
+        "or_rhs_or": lambda i, j, k: tir.any(i, tir.any(j, k)),
+        "and_of_ors": lambda i, j, k: tir.all(tir.any(i, j), tir.any(j, k), tir.any(i, k), i, j, k),
+        "or_of_ands": lambda i, j, k: tir.any(tir.all(i, j), tir.all(j, k), tir.all(i, k), i, j, k),
+    }
+
+    def make_ir_generator(name, expression):
+        def inner():
+            @T.prim_func
+            def func(A: T.Buffer[1, "bool"], i: T.bool, j: T.bool, k: T.bool):
+                A[0] = expression(i, j, k)
+
+            return func
+
+        inner.__name__ = f"nested_boolean_expr_{name}"
+        return inner
+
+    for name, expression in expressions.items():
+        generator = make_ir_generator(name, expression)
+
+        yield generator
+
+
 ir_generator = tvm.testing.parameter(
     opt_gemm_normalize,
     opt_gemm_lower,
@@ -3561,6 +3592,7 @@ def func(i: T.int32) -> None:
     if_true_else,
     elif_chain_without_else,
     elif_chain_with_else,
+    *nested_boolean_expressions(),
 )
 
 
From 9098b497bbbe20fb6c3b3ea6e062d2372cb0c8c9 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Fri, 25 Nov 2022 14:49:09 -0800
Subject: [PATCH 652/704] [TIR] Correct type annotation for `rfactor` (#13485)

---
 python/tvm/tir/schedule/schedule.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index 170179d0d4e8..69feaff53aa3 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -1875,7 +1875,7 @@ def after_decompose(a: ty.handle, c: ty.handle) -> None:
         return _ffi_api.ScheduleDecomposeReduction(self, block, loop)  # type: ignore # pylint: disable=no-member
 
     @type_checked
-    def rfactor(self, loop: LoopRV, factor_axis: int) -> LoopRV:
+    def rfactor(self, loop: LoopRV, factor_axis: int) -> BlockRV:
         """Factorize an associative reduction block by the specified loop.
 
         An associative reduction cannot be parallelized directly,

From 61a4f214122d2090e9b58602fa51af5a79c29d04 Mon Sep 17 00:00:00 2001
From: Hongyi Jin <3231950289@qq.com>
Date: Sun, 27 Nov 2022 09:59:26 -0500
Subject: [PATCH 653/704] [RUNTIME] Correctly handling export_module when
 exporting modules of different type (#13489)

---
 python/tvm/runtime/module.py                    | 17 ++++++++---------
 .../unittest/test_runtime_module_export.py      |  6 +++++-
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py
index e85b99234100..83b436939e9f 100644
--- a/python/tvm/runtime/module.py
+++ b/python/tvm/runtime/module.py
@@ -463,6 +463,7 @@ def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=No
         is_system_lib = False
         has_c_module = False
         llvm_target_string = None
+        global_object_format = "o"
         for index, module in enumerate(modules):
             if fcompile is not None and hasattr(fcompile, "object_format"):
                 if module.type_key == "c":
@@ -475,7 +476,7 @@ def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=No
                     object_format = module.format
                     has_c_module = True
                 else:
-                    object_format = fcompile.object_format
+                    global_object_format = object_format = fcompile.object_format
             else:
                 if module.type_key == "c":
                     if len(module.format) > 0:
@@ -494,16 +495,14 @@ def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=No
                     has_c_module = True
                 else:
                     assert module.type_key == "llvm" or module.type_key == "static_library"
-                    object_format = "o"
+                    global_object_format = object_format = "o"
+
             path_obj = os.path.join(workspace_dir, f"lib{index}.{object_format}")
             module.save(path_obj)
             files.append(path_obj)
-            is_system_lib = (
-                module.type_key == "llvm" and module.get_function("__tvm_is_system_module")()
-            )
-            llvm_target_string = (
-                module.type_key == "llvm" and module.get_function("_get_target_string")()
-            )
+            if module.type_key == "llvm":
+                is_system_lib = module.get_function("__tvm_is_system_module")()
+                llvm_target_string = module.get_function("_get_target_string")()
         if not fcompile:
             if file_name.endswith(".tar"):
                 fcompile = _tar.tar
@@ -520,7 +519,7 @@ def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=No
 
         if self.imported_modules:
             if enabled("llvm") and llvm_target_string:
-                path_obj = os.path.join(workspace_dir, f"devc.{object_format}")
+                path_obj = os.path.join(workspace_dir, f"devc.{global_object_format}")
                 m = _ffi_api.ModulePackImportsToLLVM(self, is_system_lib, llvm_target_string)
                 m.save(path_obj)
                 files.append(path_obj)
diff --git a/tests/python/unittest/test_runtime_module_export.py b/tests/python/unittest/test_runtime_module_export.py
index 72608fe36fb9..3f6acca18b89 100644
--- a/tests/python/unittest/test_runtime_module_export.py
+++ b/tests/python/unittest/test_runtime_module_export.py
@@ -21,6 +21,7 @@
 import tvm.testing
 
 from tvm.contrib import utils
+import os
 
 header_file_dir_path = utils.tempdir()
 
@@ -203,7 +204,10 @@ def verify_multi_c_mod_export():
         synthetic_cpu_lib.import_module(f)
         synthetic_cpu_lib.import_module(engine_module)
         kwargs = {"options": ["-O2", "-std=c++17", "-I" + header_file_dir_path.relpath("")]}
-        synthetic_cpu_lib.export_library(path_lib, fcompile=False, **kwargs)
+        work_dir = temp.relpath("work_dir")
+        os.mkdir(work_dir)
+        synthetic_cpu_lib.export_library(path_lib, fcompile=False, workspace_dir=work_dir, **kwargs)
+        assert os.path.exists(os.path.join(work_dir, "devc.o"))
         loaded_lib = tvm.runtime.load_module(path_lib)
         assert loaded_lib.type_key == "library"
         # dso modules are merged

From ae4fd7df7dce9333d7efc5454d393afa4f23d27b Mon Sep 17 00:00:00 2001
From: sisleyli <43139237+sisleyli@users.noreply.github.com>
Date: Mon, 28 Nov 2022 14:43:55 +0800
Subject: [PATCH 654/704] [Relay][Pattern] Enable rewrite_once in
 class:DFPatternRewrite (#13490)

Co-authored-by: Bin Li <binli1@amd.com>
---
 src/relay/transforms/simplify_expr.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/relay/transforms/simplify_expr.h b/src/relay/transforms/simplify_expr.h
index cbaa326b362b..bdd3f2ca6e6f 100644
--- a/src/relay/transforms/simplify_expr.h
+++ b/src/relay/transforms/simplify_expr.h
@@ -54,7 +54,7 @@ class DFPatternRewrite {
       Map<DFPattern, Array<Expr>> node_map = args[2];
       *rv = this->Callback(pre, post, node_map);
     };
-    return DFPatternCallback(pattern_, PackedFunc(func), require_type_);
+    return DFPatternCallback(pattern_, PackedFunc(func), require_type_, rewrite_once_);
   }
 
  protected:
@@ -62,6 +62,8 @@ class DFPatternRewrite {
   DFPattern pattern_;
   /*! \brief Whether or not the rewrite requires types to be inferred. */
   bool require_type_ = true;
+  /*! \brief Whether or not run the callback only once */
+  bool rewrite_once_ = false;
 };
 
 /*! \brief Helper class for composing rewrites and getting callbacks. */

From b8d7cd7fe0f7f268272eb8714163e540875d9c0f Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Mon, 28 Nov 2022 13:19:15 +0530
Subject: [PATCH 655/704] =?UTF-8?q?[DOCKER][ADRENO]=20we=20don't=20need=20?=
 =?UTF-8?q?microtvm=20being=20built=20for=20android=20cross=E2=80=A6=20(#1?=
 =?UTF-8?q?3486)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[DOCKER][ADRENO] we don't need microtvm being built for android cross compilation

PR:https://github.com/apache/tvm/pull/13073 makes microtvm built by default
with x86 compilers. Cross compilation fails due to this.
---
 tests/scripts/task_build_adreno_bins.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/scripts/task_build_adreno_bins.sh b/tests/scripts/task_build_adreno_bins.sh
index 6a9bbd9554f1..6b43d7cbc421 100755
--- a/tests/scripts/task_build_adreno_bins.sh
+++ b/tests/scripts/task_build_adreno_bins.sh
@@ -28,6 +28,7 @@ cd ${output_directory}
 
 cp ../cmake/config.cmake .
 
+echo set\(USE_MICRO OFF\) >> config.cmake
 echo set\(USE_CLML ON\) >> config.cmake
 echo set\(USE_CLML_GRAPH_EXECUTOR "${ADRENO_OPENCL}"\) >> config.cmake
 echo set\(USE_RPC ON\) >> config.cmake

From c38a0c50a7cde09d548f570f7aafa8e293ef1485 Mon Sep 17 00:00:00 2001
From: dsbarinov1 <71228944+dsbarinov1@users.noreply.github.com>
Date: Mon, 28 Nov 2022 11:25:55 +0300
Subject: [PATCH 656/704] [Adreno] Add documentation for Adreno deployment
 (#13393)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Adreno] Add documentation for Adreno deployment
Purpose:
assist TVM users compile and deploy on Adreno by expanding our documentation and providing sample scripts in TVM.

Information about PR:
The present PR consists globally of 3 parts.

The first part is an introductory article on compilation and deployment of neural networks on Adreno, covering such topics as: «Build TVM for Android/Adreno», «Advantages of textures» and «Differences in compilation and deployment of models for Adreno devices».

The second part is a straightforward example script for compiling and inferring models at different precisions for Adreno devices.

The third part is auxiliary files, images, etc.

* Add correct links to images + small fixes

* Remove images (.png)

* Add request_hook in deploy_model_on_adreno.py

* Fix trailing newline + add license

* No newline at the EOF + blanks

* Fix request hook placing

* Fix style

* Fix trailing

* Fix whitespaces

* Fix whitespaces v2

* Add newline at adreno.rst EOF

* Add license to adreno.rst

* Remove sphinx 'autosectionlabel' extension + modify cross-references in docs to work without this extension

* Set default values to tracker_host and tracker_port

* Add local_demo to be able to autogenerate docs

* Fix quotes

* Fix benchmark

* .
---
 docs/how_to/deploy/adreno.rst                 | 336 +++++++++++++++++
 docs/how_to/deploy/index.rst                  |   1 +
 .../deploy_models/deploy_model_on_adreno.py   | 351 ++++++++++++++++++
 3 files changed, 688 insertions(+)
 create mode 100644 docs/how_to/deploy/adreno.rst
 create mode 100644 gallery/how_to/deploy_models/deploy_model_on_adreno.py

diff --git a/docs/how_to/deploy/adreno.rst b/docs/how_to/deploy/adreno.rst
new file mode 100644
index 000000000000..af613aa5cb21
--- /dev/null
+++ b/docs/how_to/deploy/adreno.rst
@@ -0,0 +1,336 @@
+..  Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+..    http://www.apache.org/licenses/LICENSE-2.0
+
+..  Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+Deploy to Adreno GPU
+=======================================
+
+**Authors**: Daniil Barinov, Egor Churaev, Andrey Malyshev
+
+Introduction
+------------
+
+Adreno is a series of graphics processing unit (GPU) semiconductor
+intellectual property cores developed by Qualcomm and used in many of
+their SoCs.
+
+The Adreno GPU accelerates the rendering of complex geometries to
+deliver high-performance graphics and a rich user experience with low
+power consumption.
+
+This guide will demonstrate :ref:`the benefits of using textures with Adreno<advantages_of_the_textures>`,
+how to :ref:`build TVM with OpenCL<building_tvm_for_adreno>` (needed by Adreno devices) and TVM RPC
+enabled. It will also provide :ref:`example code<build_and_deploy_model_for_adreno>` to better understand the differences in compiling and deploying models
+for Adreno devices.
+
+.. _advantages_of_the_textures:
+
+Advantages of the Textures
+--------------------------
+
+One of the Adreno's advantages is the clever handling of textures. At
+the moment, TVM is able to benefit from this by having texture support
+for Adreno. The graph below shows the Adreno A5x architecture.
+
+|High-level overview of the Adreno A5x architecture for OpenCL|
+
+*Fig. 1 High-level overview of the Adreno A5x architecture for OpenCL*
+
+*source:* `OpenCL Optimization and Best Practices for Qualcomm Adreno GPUs <https://dl.acm.org/doi/10.1145/3204919.3204935>`_
+
+Reasons of using textures:
+
+-  Texture processor (TP) has a dedicated L1 cache, which is read-only cache and stores data
+   fetched from level-2 (L2) cache for texture operations (primary
+   reason)
+
+-  The handling of image boundaries is built-in.
+
+-  Supports numerous image format and data type combinations with
+   support for automatic format conversions
+
+Overall, with textures, it is possible to achieve a significant performance boost
+compared to OpenCL buffer based solutions.
+
+.. _building_tvm_for_adreno:
+
+Building TVM for Adreno
+-----------------------
+
+This section gives instructions on how to build the Android part of TVM
+with OpenCL and TVM RPC Server in order to deploy models on Adreno.
+
+Since the process of building TVM for Adreno is exactly the same as the
+process of building TVM for Android, please refer to these instructions:
+`TVM RPC
+Server <https://github.com/apache/tvm/tree/main/apps/cpp_rpc>`_.
+
+Since there are many required packages for Android, you can use the official Docker Image to build TVM.
+For more information refer to this guide: `Deploy the Pretrained Model on Android <https://tvm.apache.org/docs/how_to/deploy_models/deploy_model_on_android.html>`_.
+
+**Prerequisites**: Android NDK and Android Debug Bridge must
+be installed, the desired device must have OpenCL support and Android part of TVM must be built:
+
+- Read documentation about *Android NDK installation* here: https://developer.android.com/ndk
+- To get access to adb tools you can see *Android Debug Bridge installation* here: https://developer.android.com/studio/command-line/adb
+
+You can also build the android part of TVM locally. From the root
+folder of TVM:
+
+::
+
+   mkdir build_android
+   cd build_android
+   cmake .. -DUSE_OPENCL=ON -DUSE_MICRO=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_NATIVE_API_LEVEL=android-28 -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=ON -DANDROID_STL=c++_static -DUSE_CPP_RPC=ON
+   make -jN tvm_runtime tvm_rpc
+
+where **N** is the number of cores available on your *CPU*.
+
+At this stage you have built TVM for Adreno.
+
+.. _build_and_deploy_model_for_adreno:
+
+Build and deploy model for Adreno
+---------------------------------
+
+In this section we will focus on target, needed to compile and deploy models for Adreno, demonstrate
+the differences in generated kernels with and without textures and, in addition, the
+possibility of choosing a different precision for model compilation will
+be considered.
+
+For the complete step-py-step process of compiling and deploying models on
+Adreno, including selection of precision, running the inference of the
+model, getting the predictions, and measuring the performance please refer to this tutorial: `How To Deploy model on Adreno <https://tvm.apache.org/docs/how_to/deploy_models/deploy_model_on_adreno.html>`_
+
+|Android deployment pipeline|
+
+*Fig.2 Deployment pipeline on Adreno devices*
+
+The figure above demonstrates a generalized pipeline for deploying and running neural network models on android devices.
+As can be seen from the figure, the compiled model has a set_input() and a run() methods,
+which *prepare the inputs* for inference and *execute the inference* on the remote device using the Graph Executor runtime module.
+
+Adreno target
+~~~~~~~~~~~~~
+
+Normally, when compiling models for Android using OpenCL, the
+corresponding target is used
+
+.. code:: python
+
+   target="opencl"
+
+Using Adreno, we want to get all the benefits of textures, so we have to
+use the following target to generate texture leveraging kernels
+
+.. code:: python
+
+   target="opencl -device=adreno"
+
+Let's write a simple model with one convolutional (conv2d) layer and take a look at generated kernels for these
+two targets
+
+.. code:: python
+
+   import tvm
+   from tvm import relay
+   import numpy as np
+
+   input_shape=(1, 56, 56, 32)
+   filter_shape=(3, 3, 32, 64)
+   filter = np.random.rand(*filter_shape)
+
+   dtype="float32"
+   input = tvm.relay.var("input", shape=input_shape, dtype=dtype)
+   weight = tvm.relay.var("weight", shape=filter_shape, dtype=dtype)
+   D = relay.nn.conv2d(input, weight, padding=(1, 1), data_layout="NHWC", kernel_layout="HWIO", out_dtype=dtype)
+
+   mod = relay.Function([input, weight], D)
+   params = {
+      "weight": tvm.nd.array(filter)
+   }
+
+Now compile our model with the classic OpenCL target and print its modules:
+
+.. code:: python
+
+   target="opencl"
+
+   with tvm.transform.PassContext(opt_level=3):
+      graph, lib, params = relay.build_module.build(mod, target, params=params)
+   print(lib.imported_modules[0].get_source())
+
+Notice that the generated convolution kernel has pointers in
+the initialization of the function. The kernels generated with the above target are buffer-based.
+
+.. code:: c
+
+   __kernel void tvmgen_default_fused_nn_conv2d_kernel0(__global float* restrict p0, __global double* restrict p1, __global float* restrict conv2d_nhwc) {
+   // body..
+
+
+Now take a look at “opencl -device=adreno” target:
+
+.. code:: python
+
+   target="opencl -device=adreno"
+
+   with tvm.transform.PassContext(opt_level=3):
+      graph, lib, params = relay.build_module.build(mod, target, params=params)
+   print(lib.imported_modules[0].get_source())
+
+The kernels generated this way is actually working with 2d arrays, leveraging textures
+
+.. code:: c
+
+   __kernel void tvmgen_default_fused_nn_conv2d_kernel0(__write_only image2d_t pad_temp_global_texture, __read_only image2d_t p0) {
+   // body..
+
+*image2d_t* is a built-in OpenCL types that represents two-dimensional image object and provides several additional functions.
+When we use *image2d_t* we read *4 elements at one time*, and it helps to utilize hardware in a more efficient way.
+
+Precisions
+~~~~~~~~~~
+The right choice of precision for a specific workload can greatly increase the efficiency of the solution,
+shifting the initial balance of precision and speed to the side that is a priority for the problem.
+
+We can choose from *float16*, *float16_acc32* (Mixed Precision), *float32* (standard).
+
+**Float16**
+
+To leverage the GPU hardware capabilities and utilize the benefits of half precision computation and memory management,
+we can convert an original model having floating points operation to a model operating with half precision.
+Choosing lower precision will positively affect the performance of the model, but it may also have a decrease in the accuracy of the model.
+To do the conversion you need to write a simple conversion function and specify the *dtype* value of "float16" before calling the function:
+
+.. code:: python
+
+   def  convert_to_dtype(mod, dtype):
+      # downcast to float16
+      if  dtype == "float16":
+         global  conv2d_acc = "float16"
+         from  tvm.ir  import  IRModule
+         mod = IRModule.from_expr(mod)
+         seq = tvm.transform.Sequential(
+            [
+                  relay.transform.InferType(),
+                  relay.transform.ToMixedPrecision()
+            ]
+         )
+         with  tvm.transform.PassContext(opt_level=3):
+            mod = seq(mod)
+      return  mod
+
+   dtype="float16"
+   mod = convert_to_dtype(mod["main"], dtype)
+
+We then can compile our model in any convinient way
+
+.. code:: python
+
+   with  tvm.transform.PassContext(opt_level=3):
+       lib = relay.build(
+           mod, target_host=target_host, target=target, params=params
+       )
+
+**float16_acc32 (Mixed Precision)**
+
+ToMixedPrecision pass traverse over the network and split network to clusters of ops dealing with float or float16 data types.
+The clusters are defined by three types of operations:
+- Operations always be converted into float16 data type
+- Operations which can be converted if they follow by converted cluster
+- Operations never be converted to the float16 data type
+This list is defined in the ToMixedPrecision implementation here
+`relay/transform/mixed_precision.py <https://github.com/apache/tvm/blob/main/python/tvm/relay/transform/mixed_precision.py#L34>`_
+and can be overridden by user
+
+In some cases, we want higher precision in accumulation than the input data.
+This is supported, for example, for conv2d and dense operations. To override accumulation type you need to register
+function with ``@register_mixed_precision_conversion`` decorator to modify parameters of ``ToMixedPrecision`` conversion
+
+.. code:: python
+
+   from  tvm.relay.op  import  register_mixed_precision_conversion
+
+   conv2d_acc = "float32"
+
+   # Pick a priority > 10 to overwrite defaults, higher priorities take precedence
+   @register_mixed_precision_conversion("nn.conv2d", level=11)
+   def  conv2d_mixed_precision_rule(call_node: "relay.Call", mixed_precision_type: str):
+       global  conv2d_acc
+       return [
+           # always do main calculation in mixed_precision_type
+           relay.transform.mixed_precision.MIXED_PRECISION_ALWAYS,
+           # the dtype for the accumulator
+           conv2d_acc,
+           # the output dtype for the operation (usually fp16)
+           mixed_precision_type,
+       ]
+
+   # Same for dense
+   @register_mixed_precision_conversion("nn.dense", level=11)
+   def  conv2d_mixed_precision_rule(call_node: "relay.Call", mixed_precision_type: str):
+       global  conv2d_acc
+       return [
+           relay.transform.mixed_precision.MIXED_PRECISION_ALWAYS,
+           conv2d_acc,
+           mixed_precision_type,
+       ]
+
+Now we need to modify the conversion function by adding some logical "forks" and ToMixedPrecision() call,
+then create a Relay graph from desired model in any convinient way and obtain **mod** (which is IR representation of the model),
+after which we can convert it to the required **dtype** and then assemble our model sequentialy
+
+.. code:: python
+
+   def  convert_to_dtype(mod, dtype):
+       # downcast to float16
+       if  dtype == "float16"  or  dtype == "float16_acc32":
+           global  conv2d_acc
+           conv2d_acc = "float16"  if  dtype == "float16"  else  "float32"
+           from  tvm.ir  import  IRModule
+           mod = IRModule.from_expr(mod)
+           seq = tvm.transform.Sequential(
+               [
+                   relay.transform.InferType(),
+                   relay.transform.ToMixedPrecision()
+               ]
+           )
+           with tvm.transform.PassContext(
+                config={"relay.ToMixedPrecision.keep_orig_output_dtype": True},
+                opt_level=3):
+            mod = seq(mod)
+       return  mod
+
+   dtype="float16_acc32"
+   mod = convert_to_dtype(mod["main"], dtype)
+   dtype = "float32"  if  dtype == "float32"  else  "float16"
+
+The ``ToMixedPrecision`` method is a pass to convert an FP32 relay graph into an FP16 version (with
+FP16 or FP32 accumulation dtypes). Doing this transformation is useful for reducing model size
+as it halves the expected size of the weights (FP16_acc16 case).
+
+From this point onwards, we can compile our model as normal
+
+.. code:: python
+
+   with  tvm.transform.PassContext(opt_level=3):
+       lib = relay.build(
+           mod, target_host=target_host, target=target, params=params
+       )
+
+.. |High-level overview of the Adreno A5x architecture for OpenCL| image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/how-to/adreno_architecture.png
+.. |Android deployment pipeline| image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/how-to/android_deployment_pipeline.jpg
diff --git a/docs/how_to/deploy/index.rst b/docs/how_to/deploy/index.rst
index 74bae0f9234e..ac1e2a127616 100644
--- a/docs/how_to/deploy/index.rst
+++ b/docs/how_to/deploy/index.rst
@@ -169,6 +169,7 @@ target device without relying on RPC. See the following resources on how to do s
 
    cpp_deploy
    android
+   adreno
    integrate
    hls
    arm_compute_lib
diff --git a/gallery/how_to/deploy_models/deploy_model_on_adreno.py b/gallery/how_to/deploy_models/deploy_model_on_adreno.py
new file mode 100644
index 000000000000..d6ed1f1f99a3
--- /dev/null
+++ b/gallery/how_to/deploy_models/deploy_model_on_adreno.py
@@ -0,0 +1,351 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+.. _tutorial-deploy-model-on-adreno:
+
+Deploy the Pretrained Model on Adreno
+=======================================
+**Author**: Daniil Barinov
+
+This article is a step-by-step tutorial to deploy pretrained Pytorch ResNet-18 model on Adreno (on different precisions).
+
+For us to begin with, PyTorch must be installed.
+TorchVision is also required since we will be using it as our model zoo.
+
+A quick solution is to install it via pip:
+
+.. code-block:: bash
+
+  pip install torch
+  pip install torchvision
+
+Besides that, you should have TVM builded for Android.
+See the following instructions on how to build it.
+
+`Deploy to Adreno GPU <https://tvm.apache.org/docs/how_to/deploy/adreno.html>`_
+
+After the build section there should be two files in *build* directory «libtvm_runtime.so» and «tvm_rpc».
+Let's push them to the device and run TVM RPC Server.
+"""
+
+######################################################################
+# TVM RPC Server
+# --------------
+# To get the hash of the device use:
+#
+# .. code-block:: bash
+#
+#   adb devices
+#
+# Then to upload these two files to the device you should use:
+#
+# .. code-block:: bash
+#
+#   adb -s <device_hash> push {libtvm_runtime.so,tvm_rpc} /data/local/tmp
+#
+# At this moment you will have «libtvm_runtime.so» and «tvm_rpc» on path /data/local/tmp on your device.
+# Sometimes cmake can’t find «libc++_shared.so». Use:
+#
+# .. code-block:: bash
+#
+#   find ${ANDROID_NDK_HOME} -name libc++_shared.so
+#
+# to find it and also push it with adb on the desired device:
+#
+# .. code-block:: bash
+#
+#   adb -s <device_hash> push libc++_shared.so /data/local/tmp
+#
+# We are now ready to run the TVM RPC Server.
+# Launch rpc_tracker with following line in 1st console:
+#
+# .. code-block:: bash
+#
+#   python3 -m tvm.exec.rpc_tracker --port 9190
+#
+# Then we need to run tvm_rpc server from under the desired device in 2nd console:
+#
+# .. code-block:: bash
+#
+#   adb -s <device_hash> reverse tcp:9190 tcp:9190
+#   adb -s <device_hash> forward tcp:9090 tcp:9090
+#   adb -s <device_hash> forward tcp:9091 tcp:9091
+#   adb -s <device_hash> forward tcp:9092 tcp:9092
+#   adb -s <device_hash> forward tcp:9093 tcp:9093
+#   adb -s <device_hash> shell LD_LIBRARY_PATH=/data/local/tmp /data/local/tmp/tvm_rpc server --host=0.0.0.0 --port=9090 --tracker=127.0.0.1:9190 --key=android --port-end=9190
+#
+# Before proceeding to compile and infer model, specify TVM_TRACKER_HOST and TVM_TRACKER_PORT
+#
+# .. code-block:: bash
+#
+#   export TVM_TRACKER_HOST=0.0.0.0
+#   export TVM_TRACKER_PORT=9190
+#
+# check that the tracker is running and the device is available
+#
+# .. code-block:: bash
+#
+#     python -m tvm.exec.query_rpc_tracker --port 9190
+#
+# For example, if we have 1 Android device,
+# the output can be:
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    android      1      1     0
+#    ----------------------------------
+
+#################################################################
+# Load a test image
+# -----------------
+# As an example we would use classical cat image from ImageNet
+
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
+from PIL import Image
+from tvm.contrib.download import download_testdata
+from matplotlib import pyplot as plt
+import numpy as np
+
+img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
+img_path = download_testdata(img_url, "cat.png", module="data")
+img = Image.open(img_path).resize((224, 224))
+plt.imshow(img)
+plt.show()
+
+# Preprocess the image and convert to tensor
+from torchvision import transforms
+
+my_preprocess = transforms.Compose(
+    [
+        transforms.Resize(256),
+        transforms.CenterCrop(224),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ]
+)
+img = my_preprocess(img)
+img = np.expand_dims(img, 0)
+
+#################################################################
+# Load pretrained Pytorch model
+# -----------------------------
+# Create a Relay graph from a Pytorch ResNet-18 model
+import os
+import torch
+import torchvision
+import tvm
+from tvm import te
+from tvm import relay, rpc
+from tvm.contrib import utils, ndk
+from tvm.contrib import graph_executor
+
+model_name = "resnet18"
+model = getattr(torchvision.models, model_name)(pretrained=True)
+model = model.eval()
+
+# We grab the TorchScripted model via tracing
+input_shape = [1, 3, 224, 224]
+input_data = torch.randn(input_shape)
+scripted_model = torch.jit.trace(model, input_data).eval()
+
+# Input name can be arbitrary
+input_name = "input0"
+shape_list = [(input_name, img.shape)]
+mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
+
+#################################################################
+# Precisions
+# ----------
+# Since TVM support Mixed Precision, we need to register mixed_precision_conversion:
+from tvm.relay.op import register_mixed_precision_conversion
+
+conv2d_acc = "float32"
+
+
+@register_mixed_precision_conversion("nn.conv2d", level=11)
+def conv2d_mixed_precision_rule(call_node: "relay.Call", mixed_precision_type: str):
+    global conv2d_acc
+    return [
+        relay.transform.mixed_precision.MIXED_PRECISION_ALWAYS,
+        conv2d_acc,
+        mixed_precision_type,
+    ]
+
+
+@register_mixed_precision_conversion("nn.dense", level=11)
+def conv2d_mixed_precision_rule(call_node: "relay.Call", mixed_precision_type: str):
+    global conv2d_acc
+    return [
+        relay.transform.mixed_precision.MIXED_PRECISION_ALWAYS,
+        conv2d_acc,
+        mixed_precision_type,
+    ]
+
+
+#################################################################
+# and also define the conversion function itself
+def convert_to_dtype(mod, dtype):
+    # downcast to float16
+    if dtype == "float16" or dtype == "float16_acc32":
+        global conv2d_acc
+        conv2d_acc = "float16" if dtype == "float16" else "float32"
+        from tvm.ir import IRModule
+
+        mod = IRModule.from_expr(mod)
+        seq = tvm.transform.Sequential(
+            [relay.transform.InferType(), relay.transform.ToMixedPrecision()]
+        )
+        with tvm.transform.PassContext(opt_level=3):
+            mod = seq(mod)
+    return mod
+
+
+#################################################################
+# Let's choose "float16_acc32" for example.
+dtype = "float16_acc32"
+mod = convert_to_dtype(mod["main"], dtype)
+dtype = "float32" if dtype == "float32" else "float16"
+
+print(mod)
+
+#################################################################
+# As you can see in the IR, the architecture now contains cast operations, which are
+# needed to convert to FP16 precision.
+# You can also use "float16" or "float32" precisions as other dtype options.
+
+#################################################################
+# Compile the model with relay
+# ----------------------------
+# Specify Adreno target before compiling to generate texture
+# leveraging kernels and get all the benefits of textures
+# Note: This generated example running on our x86 server for demonstration.
+# If running it on the Android device, we need to
+# specify its instruction set. Set :code:`local_demo` to False if you want
+# to run this tutorial with a real device.
+
+local_demo = True
+
+# by default on CPU target will execute.
+# select 'cpu', 'opencl' and 'vulkan'
+test_target = "cpu"
+
+# Change target configuration.
+# Run `adb shell cat /proc/cpuinfo` to find the arch.
+arch = "arm64"
+target = tvm.target.Target("llvm -mtriple=%s-linux-android" % arch)
+
+if local_demo:
+    target = tvm.target.Target("llvm")
+elif test_target == "opencl":
+    target = tvm.target.Target("opencl", host=target)
+elif test_target == "vulkan":
+    target = tvm.target.Target("vulkan", host=target)
+
+with tvm.transform.PassContext(opt_level=3):
+    lib = relay.build(mod, target=target, params=params)
+
+#################################################################
+# Deploy the Model Remotely by RPC
+# --------------------------------
+# Using RPC you can deploy the model from host
+# machine to the remote Adreno device
+
+rpc_tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
+rpc_tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
+key = "android"
+
+if local_demo:
+    remote = rpc.LocalSession()
+else:
+    tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port)
+    # When running a heavy model, we should increase the `session_timeout`
+    remote = tracker.request(key, priority=0, session_timeout=60)
+
+if local_demo:
+    dev = remote.cpu(0)
+elif test_target == "opencl":
+    dev = remote.cl(0)
+elif test_target == "vulkan":
+    dev = remote.vulkan(0)
+else:
+    dev = remote.cpu(0)
+
+temp = utils.tempdir()
+dso_binary = "dev_lib_cl.so"
+dso_binary_path = temp.relpath(dso_binary)
+fcompile = ndk.create_shared if not local_demo else None
+lib.export_library(dso_binary_path, fcompile)
+remote_path = "/data/local/tmp/" + dso_binary
+remote.upload(dso_binary_path)
+rlib = remote.load_module(dso_binary)
+m = graph_executor.GraphModule(rlib["default"](dev))
+
+#################################################################
+# Run inference
+# -------------
+# We now can set inputs, infer our model and get predictions as output
+m.set_input(input_name, tvm.nd.array(img.astype("float32")))
+m.run()
+tvm_output = m.get_output(0)
+
+#################################################################
+# Get predictions and performance statistic
+# -----------------------------------------
+# This piece of code displays the top-1 and top-5 predictions, as
+# well as provides information about the model's performance
+from os.path import join, isfile
+from matplotlib import pyplot as plt
+from tvm.contrib import download
+
+# Download ImageNet categories
+categ_url = "https://github.com/uwsampl/web-data/raw/main/vta/models/"
+categ_fn = "synset.txt"
+download.download(join(categ_url, categ_fn), categ_fn)
+synset = eval(open(categ_fn).read())
+
+top_categories = np.argsort(tvm_output.asnumpy()[0])
+top5 = np.flip(top_categories, axis=0)[:5]
+
+# Report top-1 classification result
+print("Top-1 id: {}, class name: {}".format(top5[1 - 1], synset[top5[1 - 1]]))
+
+# Report top-5 classification results
+print("\nTop5 predictions: \n")
+print("\t#1:", synset[top5[1 - 1]])
+print("\t#2:", synset[top5[2 - 1]])
+print("\t#3:", synset[top5[3 - 1]])
+print("\t#4:", synset[top5[4 - 1]])
+print("\t#5:", synset[top5[5 - 1]])
+print("\t", top5)
+ImageNetClassifier = False
+for k in top_categories[-5:]:
+    if "cat" in synset[k]:
+        ImageNetClassifier = True
+assert ImageNetClassifier, "Failed ImageNet classifier validation check"
+
+print("Evaluate inference time cost...")
+print(m.benchmark(dev, number=1, repeat=10))

From 5d8fc204a59f5d7cfed3d948084593866b16ceef Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Mon, 28 Nov 2022 09:23:21 +0000
Subject: [PATCH 657/704] [ACL] Enable int8 data type in QNN DENSE (#13487)

This enables int8 data type to be used in Compute Library
for the Arm(r) Architecture (ACL) BYOC integration.
---
 .../tvm/relay/op/contrib/arm_compute_lib.py   |  6 ++--
 .../test_arm_compute_lib/test_dense.py        | 29 ++++++++++++-------
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index d63cd8c83a93..95500c91e1f4 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -258,7 +258,7 @@ def check_dense(extract):
 
     def check_qnn_dense(extract):
         """Check qnn conv pattern is supported by ACL."""
-        if extract.attrs.out_dtype != "uint8":
+        if extract.attrs.out_dtype not in ("uint8", "int8"):
             return False
         call = extract
         while call.op.name != "qnn.dense":
@@ -414,10 +414,10 @@ def qnn_dense(expr):
     """Check if the external ACL codegen for qnn.dense should be used."""
     attrs, args = expr.attrs, expr.args
     data_typ = args[0].checked_type
-    if data_typ.dtype != "uint8":
+    if data_typ.dtype not in ("uint8", "int8"):
         return False
     kernel_typ = args[1].checked_type
-    if len(kernel_typ.shape) != 2 or kernel_typ.dtype != "uint8":
+    if len(kernel_typ.shape) != 2 or kernel_typ.dtype not in ("uint8", "int8"):
         return False
     if attrs.out_dtype != "int32":
         return False
diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py
index 6bdff0fdb857..fa6057dd9a63 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_dense.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py
@@ -15,8 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 """Arm Compute Library integration dense tests."""
-
 import numpy as np
+import pytest
 
 import tvm
 from tvm import relay
@@ -104,14 +104,15 @@ def _get_qnn_model(
         relay.const(0, "int32"),  # input zero point
         relay.const(output_sc, "float32"),  # output scale
         relay.const(output_zp, "int32"),  # output zero point
-        out_dtype="uint8",
+        out_dtype=dtype,
     )
     return out, params
 
 
 def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False):
     output_shape = (shape[0], units)
-    out_dtype = "int32" if dtype == "uint8" else "float32"
+    qnn_dtypes = ("uint8", "int8")
+    out_dtype = "int32" if dtype in qnn_dtypes else "float32"
 
     node = {
         "op": "kernel",
@@ -136,7 +137,7 @@ def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False):
     ]
 
     # qnn.dense params, input and kernel
-    if dtype == "uint8":
+    if dtype in qnn_dtypes:
         node["name"] = "qnn.dense"
         for param_dtype in ["int32", "float32"]:
             for _ in range(2):
@@ -149,7 +150,7 @@ def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False):
                 )
 
     if has_bias:
-        bias_dtype = "int32" if dtype == "uint8" else "float32"
+        bias_dtype = "int32" if dtype in qnn_dtypes else "float32"
         bias_shape = (
             [1, weight_shape[0]]
             if dtype == "float32" and weight_shape[0] != 1
@@ -164,7 +165,7 @@ def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False):
         )
 
     # qnn.dense params, output
-    if dtype == "uint8":
+    if dtype in qnn_dtypes:
         for param_dtype in ["float32", "int32"]:
             inputs.append(
                 {"op": "const", "name": "", "attrs": {"shape": [[[]]], "dtype": [[param_dtype]]}}
@@ -251,7 +252,14 @@ def test_codegen_dense():
         verify_codegen(func, exp_codegen)
 
 
-def test_qnn_dense():
+@pytest.mark.parametrize(
+    "dtype,min_range,max_range",
+    [
+        ("uint8", 0, 255),
+        ("int8", -127, 128),
+    ],
+)
+def test_qnn_dense(dtype, min_range, max_range):
     Device.load("test_config.json")
 
     if skip_runtime_test():
@@ -260,7 +268,6 @@ def test_qnn_dense():
     device = Device()
     np.random.seed(0)
 
-    dtype = "uint8"
     trials = [
         [(1, 2), (2, 2), 2, True],
         [(1, 2), (2, 2), 2, False],
@@ -277,7 +284,7 @@ def test_qnn_dense():
     ]
     for shape, weight_shape, units, composite in trials:
         outputs = []
-        inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))}
+        inputs = {"a": tvm.nd.array(np.random.uniform(min_range, max_range, shape).astype(dtype))}
         input_zp = 100
         input_sc = 0.5
         kernel_zp = 50
@@ -329,13 +336,13 @@ def test_qnn_dense():
         verify(outputs, atol=1, rtol=0, config=config, verify_saturation=True)
 
 
-def test_codegen_qnn_dense():
+@pytest.mark.parametrize("dtype", ["uint8", "int8"])
+def test_codegen_qnn_dense(dtype):
     if skip_codegen_test():
         return
 
     np.random.seed(0)
 
-    dtype = "uint8"
     trials = [
         [(1, 2), (2, 2), 2, True],
         [(1, 2), (2, 2), 2, False],

From b6151bcaa239c48dbec3c79aeef62c9c7780e147 Mon Sep 17 00:00:00 2001
From: Duture <duture@foxmail.com>
Date: Mon, 28 Nov 2022 18:35:11 +0800
Subject: [PATCH 658/704] Fix typo in golang sample (#13476)

Create complex.go

At line132, raw code is `inSlice := make([]float32, (244 * 244 * 3))`,
Howover, at line90 the input shape set `tshapeIn  := []int64{1, 224, 224, 3}`,
So there is an error uncatched.
---
 golang/sample/complex.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/golang/sample/complex.go b/golang/sample/complex.go
index 91821c978e96..c048207b8b5e 100644
--- a/golang/sample/complex.go
+++ b/golang/sample/complex.go
@@ -129,7 +129,7 @@ func main() {
     fmt.Printf("Module params loaded\n")
 
     // Set some data in input Array
-    inSlice := make([]float32, (244 * 244 * 3))
+    inSlice := make([]float32, (224 * 224 * 3))
     rand.Seed(10)
     rand.Shuffle(len(inSlice), func(i, j int) {inSlice[i],
                                                inSlice[j] = rand.Float32(),

From fc59b6dbdf445c09f70d20c8156cc940f696fdcd Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 28 Nov 2022 10:37:53 +0000
Subject: [PATCH 659/704] Bump pillow from 8.3.2 to 9.3.0 in
 /apps/microtvm/ethosu (#13464)

Bumps [pillow](https://github.com/python-pillow/Pillow) from 8.3.2 to 9.3.0.
- [Release notes](https://github.com/python-pillow/Pillow/releases)
- [Changelog](https://github.com/python-pillow/Pillow/blob/main/CHANGES.rst)
- [Commits](https://github.com/python-pillow/Pillow/compare/8.3.2...9.3.0)

---
updated-dependencies:
- dependency-name: pillow
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 apps/microtvm/ethosu/requirements.txt | 116 ++++++++++++++------------
 1 file changed, 62 insertions(+), 54 deletions(-)

diff --git a/apps/microtvm/ethosu/requirements.txt b/apps/microtvm/ethosu/requirements.txt
index ae8e0aacd738..d9593a8184e9 100644
--- a/apps/microtvm/ethosu/requirements.txt
+++ b/apps/microtvm/ethosu/requirements.txt
@@ -99,60 +99,68 @@ numpy==1.21.3 \
     --hash=sha256:f41b018f126aac18583956c54544db437f25c7ee4794bcb23eb38bef8e5e192a \
     --hash=sha256:f8f4625536926a155b80ad2bbff44f8cc59e9f2ad14cdda7acf4c135b4dc8ff2 \
     --hash=sha256:fe52dbe47d9deb69b05084abd4b0df7abb39a3c51957c09f635520abd49b29dd
-Pillow==8.3.2 \
-    --hash=sha256:0412516dcc9de9b0a1e0ae25a280015809de8270f134cc2c1e32c4eeb397cf30 \
-    --hash=sha256:04835e68ef12904bc3e1fd002b33eea0779320d4346082bd5b24bec12ad9c3e9 \
-    --hash=sha256:06d1adaa284696785375fa80a6a8eb309be722cf4ef8949518beb34487a3df71 \
-    --hash=sha256:085a90a99404b859a4b6c3daa42afde17cb3ad3115e44a75f0d7b4a32f06a6c9 \
-    --hash=sha256:0b9911ec70731711c3b6ebcde26caea620cbdd9dcb73c67b0730c8817f24711b \
-    --hash=sha256:10e00f7336780ca7d3653cf3ac26f068fa11b5a96894ea29a64d3dc4b810d630 \
-    --hash=sha256:11c27e74bab423eb3c9232d97553111cc0be81b74b47165f07ebfdd29d825875 \
-    --hash=sha256:11eb7f98165d56042545c9e6db3ce394ed8b45089a67124298f0473b29cb60b2 \
-    --hash=sha256:13654b521fb98abdecec105ea3fb5ba863d1548c9b58831dd5105bb3873569f1 \
-    --hash=sha256:15ccb81a6ffc57ea0137f9f3ac2737ffa1d11f786244d719639df17476d399a7 \
-    --hash=sha256:18a07a683805d32826c09acfce44a90bf474e6a66ce482b1c7fcd3757d588df3 \
-    --hash=sha256:19ec4cfe4b961edc249b0e04b5618666c23a83bc35842dea2bfd5dfa0157f81b \
-    --hash=sha256:1c3ff00110835bdda2b1e2b07f4a2548a39744bb7de5946dc8e95517c4fb2ca6 \
-    --hash=sha256:27a330bf7014ee034046db43ccbb05c766aa9e70b8d6c5260bfc38d73103b0ba \
-    --hash=sha256:2b11c9d310a3522b0fd3c35667914271f570576a0e387701f370eb39d45f08a4 \
-    --hash=sha256:2c661542c6f71dfd9dc82d9d29a8386287e82813b0375b3a02983feac69ef864 \
-    --hash=sha256:2cde7a4d3687f21cffdf5bb171172070bb95e02af448c4c8b2f223d783214056 \
-    --hash=sha256:2d5e9dc0bf1b5d9048a94c48d0813b6c96fccfa4ccf276d9c36308840f40c228 \
-    --hash=sha256:2f23b2d3079522fdf3c09de6517f625f7a964f916c956527bed805ac043799b8 \
-    --hash=sha256:35d27687f027ad25a8d0ef45dd5208ef044c588003cdcedf05afb00dbc5c2deb \
-    --hash=sha256:35d409030bf3bd05fa66fb5fdedc39c521b397f61ad04309c90444e893d05f7d \
-    --hash=sha256:4326ea1e2722f3dc00ed77c36d3b5354b8fb7399fb59230249ea6d59cbed90da \
-    --hash=sha256:4abc247b31a98f29e5224f2d31ef15f86a71f79c7f4d2ac345a5d551d6393073 \
-    --hash=sha256:4d89a2e9219a526401015153c0e9dd48319ea6ab9fe3b066a20aa9aee23d9fd3 \
-    --hash=sha256:4e59e99fd680e2b8b11bbd463f3c9450ab799305d5f2bafb74fefba6ac058616 \
-    --hash=sha256:548794f99ff52a73a156771a0402f5e1c35285bd981046a502d7e4793e8facaa \
-    --hash=sha256:56fd98c8294f57636084f4b076b75f86c57b2a63a8410c0cd172bc93695ee979 \
-    --hash=sha256:59697568a0455764a094585b2551fd76bfd6b959c9f92d4bdec9d0e14616303a \
-    --hash=sha256:6bff50ba9891be0a004ef48828e012babaaf7da204d81ab9be37480b9020a82b \
-    --hash=sha256:6cb3dd7f23b044b0737317f892d399f9e2f0b3a02b22b2c692851fb8120d82c6 \
-    --hash=sha256:7dbfbc0020aa1d9bc1b0b8bcf255a7d73f4ad0336f8fd2533fcc54a4ccfb9441 \
-    --hash=sha256:838eb85de6d9307c19c655c726f8d13b8b646f144ca6b3771fa62b711ebf7624 \
-    --hash=sha256:8b68f565a4175e12e68ca900af8910e8fe48aaa48fd3ca853494f384e11c8bcd \
-    --hash=sha256:8f284dc1695caf71a74f24993b7c7473d77bc760be45f776a2c2f4e04c170550 \
-    --hash=sha256:963ebdc5365d748185fdb06daf2ac758116deecb2277ec5ae98139f93844bc09 \
-    --hash=sha256:a048dad5ed6ad1fad338c02c609b862dfaa921fcd065d747194a6805f91f2196 \
-    --hash=sha256:a1bd983c565f92779be456ece2479840ec39d386007cd4ae83382646293d681b \
-    --hash=sha256:a66566f8a22561fc1a88dc87606c69b84fa9ce724f99522cf922c801ec68f5c1 \
-    --hash=sha256:bcb04ff12e79b28be6c9988f275e7ab69f01cc2ba319fb3114f87817bb7c74b6 \
-    --hash=sha256:bd24054aaf21e70a51e2a2a5ed1183560d3a69e6f9594a4bfe360a46f94eba83 \
-    --hash=sha256:be25cb93442c6d2f8702c599b51184bd3ccd83adebd08886b682173e09ef0c3f \
-    --hash=sha256:c691b26283c3a31594683217d746f1dad59a7ae1d4cfc24626d7a064a11197d4 \
-    --hash=sha256:cc9d0dec711c914ed500f1d0d3822868760954dce98dfb0b7382a854aee55d19 \
-    --hash=sha256:ce2e5e04bb86da6187f96d7bab3f93a7877830981b37f0287dd6479e27a10341 \
-    --hash=sha256:ce651ca46d0202c302a535d3047c55a0131a720cf554a578fc1b8a2aff0e7d96 \
-    --hash=sha256:d0c8ebbfd439c37624db98f3877d9ed12c137cadd99dde2d2eae0dab0bbfc355 \
-    --hash=sha256:d675a876b295afa114ca8bf42d7f86b5fb1298e1b6bb9a24405a3f6c8338811c \
-    --hash=sha256:dde3f3ed8d00c72631bc19cbfff8ad3b6215062a5eed402381ad365f82f0c18c \
-    --hash=sha256:e5a31c07cea5edbaeb4bdba6f2b87db7d3dc0f446f379d907e51cc70ea375629 \
-    --hash=sha256:f514c2717012859ccb349c97862568fdc0479aad85b0270d6b5a6509dbc142e2 \
-    --hash=sha256:fc0db32f7223b094964e71729c0361f93db43664dd1ec86d3df217853cedda87 \
-    --hash=sha256:fd4fd83aa912d7b89b4b4a1580d30e2a4242f3936882a3f433586e5ab97ed0d5 \
-    --hash=sha256:feb5db446e96bfecfec078b943cc07744cc759893cef045aa8b8b6d6aaa8274e
+Pillow==9.3.0 \
+    --hash=sha256:03150abd92771742d4a8cd6f2fa6246d847dcd2e332a18d0c15cc75bf6703040 \
+    --hash=sha256:073adb2ae23431d3b9bcbcff3fe698b62ed47211d0716b067385538a1b0f28b8 \
+    --hash=sha256:0b07fffc13f474264c336298d1b4ce01d9c5a011415b79d4ee5527bb69ae6f65 \
+    --hash=sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2 \
+    --hash=sha256:12ce4932caf2ddf3e41d17fc9c02d67126935a44b86df6a206cf0d7161548627 \
+    --hash=sha256:15c42fb9dea42465dfd902fb0ecf584b8848ceb28b41ee2b58f866411be33f07 \
+    --hash=sha256:18498994b29e1cf86d505edcb7edbe814d133d2232d256db8c7a8ceb34d18cef \
+    --hash=sha256:1c7c8ae3864846fc95f4611c78129301e203aaa2af813b703c55d10cc1628535 \
+    --hash=sha256:22b012ea2d065fd163ca096f4e37e47cd8b59cf4b0fd47bfca6abb93df70b34c \
+    --hash=sha256:276a5ca930c913f714e372b2591a22c4bd3b81a418c0f6635ba832daec1cbcfc \
+    --hash=sha256:2e0918e03aa0c72ea56edbb00d4d664294815aa11291a11504a377ea018330d3 \
+    --hash=sha256:3033fbe1feb1b59394615a1cafaee85e49d01b51d54de0cbf6aa8e64182518a1 \
+    --hash=sha256:3168434d303babf495d4ba58fc22d6604f6e2afb97adc6a423e917dab828939c \
+    --hash=sha256:32a44128c4bdca7f31de5be641187367fe2a450ad83b833ef78910397db491aa \
+    --hash=sha256:3dd6caf940756101205dffc5367babf288a30043d35f80936f9bfb37f8355b32 \
+    --hash=sha256:40e1ce476a7804b0fb74bcfa80b0a2206ea6a882938eaba917f7a0f004b42502 \
+    --hash=sha256:41e0051336807468be450d52b8edd12ac60bebaa97fe10c8b660f116e50b30e4 \
+    --hash=sha256:4390e9ce199fc1951fcfa65795f239a8a4944117b5935a9317fb320e7767b40f \
+    --hash=sha256:502526a2cbfa431d9fc2a079bdd9061a2397b842bb6bc4239bb176da00993812 \
+    --hash=sha256:51e0e543a33ed92db9f5ef69a0356e0b1a7a6b6a71b80df99f1d181ae5875636 \
+    --hash=sha256:57751894f6618fd4308ed8e0c36c333e2f5469744c34729a27532b3db106ee20 \
+    --hash=sha256:5d77adcd56a42d00cc1be30843d3426aa4e660cab4a61021dc84467123f7a00c \
+    --hash=sha256:655a83b0058ba47c7c52e4e2df5ecf484c1b0b0349805896dd350cbc416bdd91 \
+    --hash=sha256:68943d632f1f9e3dce98908e873b3a090f6cba1cbb1b892a9e8d97c938871fbe \
+    --hash=sha256:6c738585d7a9961d8c2821a1eb3dcb978d14e238be3d70f0a706f7fa9316946b \
+    --hash=sha256:73bd195e43f3fadecfc50c682f5055ec32ee2c933243cafbfdec69ab1aa87cad \
+    --hash=sha256:772a91fc0e03eaf922c63badeca75e91baa80fe2f5f87bdaed4280662aad25c9 \
+    --hash=sha256:77ec3e7be99629898c9a6d24a09de089fa5356ee408cdffffe62d67bb75fdd72 \
+    --hash=sha256:7db8b751ad307d7cf238f02101e8e36a128a6cb199326e867d1398067381bff4 \
+    --hash=sha256:801ec82e4188e935c7f5e22e006d01611d6b41661bba9fe45b60e7ac1a8f84de \
+    --hash=sha256:82409ffe29d70fd733ff3c1025a602abb3e67405d41b9403b00b01debc4c9a29 \
+    --hash=sha256:828989c45c245518065a110434246c44a56a8b2b2f6347d1409c787e6e4651ee \
+    --hash=sha256:829f97c8e258593b9daa80638aee3789b7df9da5cf1336035016d76f03b8860c \
+    --hash=sha256:871b72c3643e516db4ecf20efe735deb27fe30ca17800e661d769faab45a18d7 \
+    --hash=sha256:89dca0ce00a2b49024df6325925555d406b14aa3efc2f752dbb5940c52c56b11 \
+    --hash=sha256:90fb88843d3902fe7c9586d439d1e8c05258f41da473952aa8b328d8b907498c \
+    --hash=sha256:97aabc5c50312afa5e0a2b07c17d4ac5e865b250986f8afe2b02d772567a380c \
+    --hash=sha256:9aaa107275d8527e9d6e7670b64aabaaa36e5b6bd71a1015ddd21da0d4e06448 \
+    --hash=sha256:9f47eabcd2ded7698106b05c2c338672d16a6f2a485e74481f524e2a23c2794b \
+    --hash=sha256:a0a06a052c5f37b4ed81c613a455a81f9a3a69429b4fd7bb913c3fa98abefc20 \
+    --hash=sha256:ab388aaa3f6ce52ac1cb8e122c4bd46657c15905904b3120a6248b5b8b0bc228 \
+    --hash=sha256:ad58d27a5b0262c0c19b47d54c5802db9b34d38bbf886665b626aff83c74bacd \
+    --hash=sha256:ae5331c23ce118c53b172fa64a4c037eb83c9165aba3a7ba9ddd3ec9fa64a699 \
+    --hash=sha256:af0372acb5d3598f36ec0914deed2a63f6bcdb7b606da04dc19a88d31bf0c05b \
+    --hash=sha256:afa4107d1b306cdf8953edde0534562607fe8811b6c4d9a486298ad31de733b2 \
+    --hash=sha256:b03ae6f1a1878233ac620c98f3459f79fd77c7e3c2b20d460284e1fb370557d4 \
+    --hash=sha256:b0915e734b33a474d76c28e07292f196cdf2a590a0d25bcc06e64e545f2d146c \
+    --hash=sha256:b4012d06c846dc2b80651b120e2cdd787b013deb39c09f407727ba90015c684f \
+    --hash=sha256:b472b5ea442148d1c3e2209f20f1e0bb0eb556538690fa70b5e1f79fa0ba8dc2 \
+    --hash=sha256:b59430236b8e58840a0dfb4099a0e8717ffb779c952426a69ae435ca1f57210c \
+    --hash=sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3 \
+    --hash=sha256:b9a65733d103311331875c1dca05cb4606997fd33d6acfed695b1232ba1df193 \
+    --hash=sha256:bac18ab8d2d1e6b4ce25e3424f709aceef668347db8637c2296bcf41acb7cf48 \
+    --hash=sha256:bca31dd6014cb8b0b2db1e46081b0ca7d936f856da3b39744aef499db5d84d02 \
+    --hash=sha256:be55f8457cd1eac957af0c3f5ece7bc3f033f89b114ef30f710882717670b2a8 \
+    --hash=sha256:c7025dce65566eb6e89f56c9509d4f628fddcedb131d9465cacd3d8bac337e7e \
+    --hash=sha256:c935a22a557a560108d780f9a0fc426dd7459940dc54faa49d83249c8d3e760f \
+    --hash=sha256:dbb8e7f2abee51cef77673be97760abff1674ed32847ce04b4af90f610144c7b \
+    --hash=sha256:e6ea6b856a74d560d9326c0f5895ef8050126acfdc7ca08ad703eb0081e82b74 \
+    --hash=sha256:ebf2029c1f464c59b8bdbe5143c79fa2045a581ac53679733d3a91d400ff9efb \
+    --hash=sha256:f1ff2ee69f10f13a9596480335f406dd1f70c3650349e2be67ca3139280cade0
 psutil==5.8.0 \
     --hash=sha256:0066a82f7b1b37d334e68697faba68e5ad5e858279fd6351c8ca6024e8d6ba64 \
     --hash=sha256:02b8292609b1f7fcb34173b25e48d0da8667bc85f81d7476584d889c6e0f2131 \

From a92a3dd84f8b3ae994e7a4bef9305dc00c425285 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Mon, 28 Nov 2022 12:11:30 +0000
Subject: [PATCH 660/704] [ACL] Enable int8 data type in QNN CONV2D (#13496)

This enables CONV2D int8 data type to be used in Compute Library
for the Arm(r) Architecture (ACL) BYOC integration.
---
 .../tvm/relay/op/contrib/arm_compute_lib.py   |  7 +++---
 .../test_arm_compute_lib/infrastructure.py    | 18 +++++++++++++
 .../test_arm_compute_lib/test_conv2d.py       | 25 +++++++++++--------
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index 95500c91e1f4..76e65e0f41aa 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -242,7 +242,7 @@ def check_conv(extract):
 
     def check_qnn_conv(extract):
         """Check qnn conv pattern is supported by ACL."""
-        if extract.attrs.out_dtype != "uint8":
+        if extract.attrs.out_dtype not in ("uint8", "int8"):
             return False
         call = extract
         while call.op.name != "qnn.conv2d":
@@ -347,16 +347,17 @@ def conv2d(expr):
 def qnn_conv2d(expr):
     """Check if the external ACL codegen for qnn.conv2d should be used."""
     attrs, args = expr.attrs, expr.args
+    qnn_dtypes = ("uint8", "int8")
 
     if attrs.data_layout != "NHWC":
         return False
     if attrs.out_dtype != "int32" and attrs.out_dtype != "":
         return False
     data_typ = args[0].checked_type
-    if len(data_typ.shape) != 4 or data_typ.shape[0] != 1 or data_typ.dtype != "uint8":
+    if len(data_typ.shape) != 4 or data_typ.shape[0] != 1 or data_typ.dtype not in qnn_dtypes:
         return False
     kernel_typ = args[1].checked_type
-    if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "uint8":
+    if len(kernel_typ.shape) != 4 or kernel_typ.dtype not in qnn_dtypes:
         return False
     is_depthwise = is_depthwise_conv2d(
         data_typ.shape,
diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
index 74170d3d3c71..7d802e5b3a9b 100644
--- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py
+++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
@@ -30,6 +30,9 @@
 from tvm.autotvm.measure import request_remote
 
 
+QNN_DTYPES = ("uint8", "int8")
+
+
 class Device:
     """
     Configuration for Arm Compute Library tests.
@@ -120,6 +123,21 @@ def load(cls, file_name):
         cls.cross_compile = test_config.get("cross_compile") or ""
 
 
+def get_low_high_atol_rtol(dtype):
+    """Returns a tuple with boundary values and and tolerance for ACL tests."""
+
+    if dtype == "float32":
+        low, high, atol, rtol = (-127, 128, 0.001, 0.001)
+    elif dtype == "uint8":
+        low, high, atol, rtol = (0, 255, 1, 0)
+    elif dtype == "int8":
+        low, high, atol, rtol = (-127, 128, 1, 0)
+    else:
+        raise Exception(f"dtype not expected: {dtype}")
+
+    return low, high, atol, rtol
+
+
 def get_cpu_op_count(mod):
     """Traverse graph counting ops offloaded to TVM."""
 
diff --git a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
index cc5bbfec7c69..8acf3d81d917 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
@@ -17,11 +17,14 @@
 """Arm Compute Library integration conv2d tests."""
 
 import numpy as np
+import pytest
 
 import tvm
 from tvm import relay
 
 from test_arm_compute_lib.infrastructure import (
+    QNN_DTYPES,
+    get_low_high_atol_rtol,
     skip_runtime_test,
     skip_codegen_test,
     build_and_run,
@@ -130,6 +133,8 @@ def _get_qnn_model(
     has_pad=False,
 ):
     """Return a model and any parameters it may have."""
+    low, high, _, _ = get_low_high_atol_rtol(dtype)
+
     a = relay.var(next(var_names), shape=shape, dtype=dtype)
     if has_pad:
         p = ((0, 0), (padding[0], padding[0]), (padding[1], padding[1]), (0, 0))
@@ -145,7 +150,7 @@ def _get_qnn_model(
         weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
     else:
         weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups)
-    w = tvm.nd.array(np.random.uniform(0, 255, weight_shape).astype(dtype))
+    w = tvm.nd.array(np.random.uniform(low, high, weight_shape).astype(dtype))
     weights = relay.const(w, dtype)
     out = relay.qnn.op.conv2d(
         a,
@@ -179,7 +184,7 @@ def _get_qnn_model(
         relay.const(0, "int32"),  # input zero point
         relay.const(output_sc, "float32"),  # output scale
         relay.const(output_zp, "int32"),  # output zero point
-        out_dtype="uint8",
+        out_dtype=dtype,
     )
     return req, params
 
@@ -202,7 +207,7 @@ def _get_expected_codegen(
     output_height = ((shape[1] - kernel_h + padding[0] + padding[2]) / strides[0]) + 1
     output_width = ((shape[2] - kernel_w + padding[1] + padding[3]) / strides[1]) + 1
     output_shape = (1, int(output_height), int(output_width), channels)
-    out_dtype = "int32" if dtype == "uint8" else "float32"
+    out_dtype = "int32" if dtype in QNN_DTYPES else "float32"
     is_depthwise = shape[3] == channels == groups
     weight_format = "IHWO" if is_depthwise else "OHWI"
     if weight_format == "IHWO":
@@ -248,7 +253,7 @@ def _get_expected_codegen(
     ]
 
     # qnn.conv2d params, input and kernel
-    if dtype == "uint8":
+    if dtype in QNN_DTYPES:
         node["name"] = "qnn." + node["name"].split(".")[1]
         for param_dtype in ["int32", "float32"]:
             for _ in range(2):
@@ -261,7 +266,7 @@ def _get_expected_codegen(
                 )
 
     if has_bias:
-        bias_dtype = "int32" if dtype == "uint8" else "float32"
+        bias_dtype = "int32" if dtype in QNN_DTYPES else "float32"
         inputs.append(
             {
                 "op": "const",
@@ -274,7 +279,7 @@ def _get_expected_codegen(
         )
 
     # qnn.conv2d params, output
-    if dtype == "uint8":
+    if dtype in QNN_DTYPES:
         for param_dtype in ["float32", "int32"]:
             inputs.append(
                 {"op": "const", "name": "", "attrs": {"shape": [[[]]], "dtype": [[param_dtype]]}}
@@ -429,7 +434,8 @@ def test_codegen_conv2d():
         verify_codegen(func, exp_codegen, 1)
 
 
-def test_qnn_conv2d():
+@pytest.mark.parametrize("dtype", QNN_DTYPES)
+def test_qnn_conv2d(dtype):
     Device.load("test_config.json")
 
     if skip_runtime_test():
@@ -438,7 +444,6 @@ def test_qnn_conv2d():
     device = Device()
     np.random.seed(0)
 
-    dtype = "uint8"
     trials = [
         # Normal convolution
         [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
@@ -531,11 +536,11 @@ def test_qnn_conv2d():
         verify(outputs, atol=atol, rtol=0, config=config, verify_saturation=True)
 
 
-def test_codegen_qnn_conv2d():
+@pytest.mark.parametrize("dtype", QNN_DTYPES)
+def test_codegen_qnn_conv2d(dtype):
     if skip_codegen_test():
         return
 
-    dtype = "uint8"
     trials = [
         # Normal convolution
         [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],

From 61144f9d8711aad15d2b1a19d983a4891e1be706 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Mon, 28 Nov 2022 12:21:03 +0000
Subject: [PATCH 661/704] [ACL] Enable int8 data type in CONCATENATE (#13497)

This enables CONCATENATE int8 data type to be used in Compute Library
for the Arm(r) Architecture (ACL) BYOC integration.
---
 .../tvm/relay/op/contrib/arm_compute_lib.py   |  2 +-
 .../test_arm_compute_lib/test_concatenate.py  | 72 ++++++++++---------
 2 files changed, 41 insertions(+), 33 deletions(-)

diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index 76e65e0f41aa..97f695bacaeb 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -285,7 +285,7 @@ def check_concatenate(expr):
             return False
         attrs, type_args = expr.attrs, expr.type_args
         for idx in range(len(type_args[0].fields)):
-            if type_args[0].fields[idx].dtype not in ["float32", "uint8"]:
+            if type_args[0].fields[idx].dtype not in ["float32", "uint8", "int8"]:
                 return False
         # ACL concatenate only supports maximum 4 dimensions input tensor
         if attrs.axis not in [-4, -3, -2, -1, 0, 1, 2, 3]:
diff --git a/tests/python/contrib/test_arm_compute_lib/test_concatenate.py b/tests/python/contrib/test_arm_compute_lib/test_concatenate.py
index deba26a0db56..55072f37c2bf 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_concatenate.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_concatenate.py
@@ -17,6 +17,7 @@
 """Arm Compute Library integration concatenate tests."""
 
 import numpy as np
+import pytest
 
 import tvm
 from tvm import relay
@@ -88,16 +89,9 @@ def _get_expected_codegen(input_shape_a, input_shape_b, input_shape_c, axis, dty
     return [input_a, input_b, input_c, node]
 
 
-def test_concatenate():
-    Device.load("test_config.json")
-
-    if skip_runtime_test():
-        return
-
-    device = Device()
-    np.random.seed(0)
-
-    for input_shape_a, input_shape_b, input_shape_c, axis, dtype in [
+@pytest.mark.parametrize(
+    "input_shape_a, input_shape_b, input_shape_c, axis, dtype",
+    [
         ([1, 234, 234, 256], [2, 234, 234, 256], [3, 234, 234, 256], 0, "float32"),
         ([1, 1, 234, 256], [1, 2, 234, 256], [1, 3, 234, 256], 1, "float32"),
         ([1, 234, 234, 1], [1, 234, 234, 2], [1, 234, 234, 3], -1, "float32"),
@@ -106,29 +100,43 @@ def test_concatenate():
         ([1, 1, 234, 256], [1, 2, 234, 256], [1, 3, 234, 256], 1, "uint8"),
         ([1, 234, 234, 1], [1, 234, 234, 2], [1, 234, 234, 3], -1, "uint8"),
         ([1, 234, 234, 256], [2, 234, 234, 256], [3, 234, 234, 256], -4, "uint8"),
-    ]:
-        outputs = []
-        inputs = {
-            "a": tvm.nd.array(np.random.randn(*input_shape_a).astype(dtype)),
-            "b": tvm.nd.array(np.random.randn(*input_shape_b).astype(dtype)),
-            "c": tvm.nd.array(np.random.randn(*input_shape_c).astype(dtype)),
-        }
-        func = _get_model(
-            inputs["a"].shape, inputs["b"].shape, inputs["c"].shape, axis, dtype, iter(inputs)
+        ([1, 234, 234, 256], [2, 234, 234, 256], [3, 234, 234, 256], 0, "int8"),
+        ([1, 1, 234, 256], [1, 2, 234, 256], [1, 3, 234, 256], 1, "int8"),
+        ([1, 234, 234, 1], [1, 234, 234, 2], [1, 234, 234, 3], -1, "int8"),
+        ([1, 234, 234, 256], [2, 234, 234, 256], [3, 234, 234, 256], -4, "int8"),
+    ],
+)
+def test_concatenate(input_shape_a, input_shape_b, input_shape_c, axis, dtype):
+    Device.load("test_config.json")
+
+    if skip_runtime_test():
+        return
+
+    device = Device()
+    np.random.seed(0)
+
+    outputs = []
+    inputs = {
+        "a": tvm.nd.array(np.random.randn(*input_shape_a).astype(dtype)),
+        "b": tvm.nd.array(np.random.randn(*input_shape_b).astype(dtype)),
+        "c": tvm.nd.array(np.random.randn(*input_shape_c).astype(dtype)),
+    }
+    func = _get_model(
+        inputs["a"].shape, inputs["b"].shape, inputs["c"].shape, axis, dtype, iter(inputs)
+    )
+    for acl in [False, True]:
+        outputs.append(
+            build_and_run(func, inputs, 1, None, device, enable_acl=acl, disabled_ops=[])[0]
         )
-        for acl in [False, True]:
-            outputs.append(
-                build_and_run(func, inputs, 1, None, device, enable_acl=acl, disabled_ops=[])[0]
-            )
-
-        config = {
-            "input_shape_a": input_shape_a,
-            "input_shape_b": input_shape_b,
-            "input_shape_c": input_shape_c,
-            "axis": axis,
-            "dtype": dtype,
-        }
-        verify(outputs, atol=1e-7, rtol=1e-7, config=config)
+
+    config = {
+        "input_shape_a": input_shape_a,
+        "input_shape_b": input_shape_b,
+        "input_shape_c": input_shape_c,
+        "axis": axis,
+        "dtype": dtype,
+    }
+    verify(outputs, atol=1e-7, rtol=1e-7, config=config)
 
 
 def test_codegen_concatenate():

From 25e98dd5e4bd989ade14e36df1fc2e1f81be7885 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Mon, 28 Nov 2022 13:22:24 +0000
Subject: [PATCH 662/704] [ACL] Enable int8 data type in pooling operators
 (#13488)

This enables int8 data type to be used in Compute Library
for the Arm(r) Architecture (ACL) BYOC integration.

This PR covers:
- nn.max_pool2d
- nn.avg_pool2d
- nn.l2_pool2d
- nn.global_avg_pool2d
- nn.global_max_pool2d

Co-authored-by: Luke Hutton <luke.hutton@arm.com>
---
 .../tvm/relay/op/contrib/arm_compute_lib.py   |   6 +-
 .../test_arm_compute_lib/test_pooling.py      | 414 ++++++++++--------
 2 files changed, 228 insertions(+), 192 deletions(-)

diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index 97f695bacaeb..1b9abb0948b5 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -267,7 +267,7 @@ def check_qnn_dense(extract):
 
     def check_avg_pool2d(extract):
         """Check average pool2d pattern is supported by ACL."""
-        if extract.attrs.dtype != "uint8":
+        if extract.attrs.dtype not in ("uint8", "int8"):
             return False
         pool = extract.args[0]
         if pool.args[0].attrs.dtype != "int32":
@@ -440,7 +440,7 @@ def max_pool2d(expr):
     if attrs.layout != "NHWC":
         return False
     typ = args[0].checked_type
-    if typ.dtype not in ["float32", "uint8"]:
+    if typ.dtype not in ["float32", "uint8", "int8"]:
         return False
     return check_dilation(attrs)
 
@@ -468,7 +468,7 @@ def global_max_pool2d(expr):
     """Check if the external ACL codegen for gloval_maxpool2d should be used."""
     attrs, args = expr.attrs, expr.args
     typ = args[0].checked_type
-    if typ.dtype not in ["float32", "uint8"]:
+    if typ.dtype not in ["float32", "uint8", "int8"]:
         return False
     if attrs.layout != "NHWC":
         return False
diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
index b174f9a78866..f08fa0059ddc 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
@@ -15,8 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 """Arm Compute Library integration pooling tests."""
-
 import numpy as np
+import pytest
+
 import tvm
 from tvm import relay, testing
 
@@ -46,6 +47,7 @@ def _get_pooling_model(
     if len(padding) == 2:
         padding = (padding[0], padding[1], padding[0], padding[1])
     out = relay.var(next(var_names), shape=shape, dtype=dtype)
+    qnn_dtypes = ("uint8", "int8")
 
     if typef == "nn.max_pool2d":
         out = relay.nn.max_pool2d(
@@ -58,7 +60,7 @@ def _get_pooling_model(
             layout="NHWC",
         )
     elif typef == "nn.avg_pool2d":
-        if dtype == "uint8":
+        if dtype in qnn_dtypes:
             out = relay.cast(out, "int32")
         out = relay.nn.avg_pool2d(
             out,
@@ -70,8 +72,8 @@ def _get_pooling_model(
             count_include_pad=count_include_pad,
             layout="NHWC",
         )
-        if dtype == "uint8":
-            out = relay.cast(out, "uint8")
+        if dtype in qnn_dtypes:
+            out = relay.cast(out, dtype)
     elif typef == "nn.l2_pool2d":
         out = relay.power(out, relay.const(2.0))
         out = relay.nn.avg_pool2d(
@@ -93,15 +95,16 @@ def _get_pooling_model(
 def _get_global_pooling_model(shape, dtype, typef, var_names):
     """Return a model and any parameters it may have."""
     out = relay.var(next(var_names), shape=shape, dtype=dtype)
+    qnn_dtypes = ("uint8", "int8")
 
     if typef == "nn.global_max_pool2d":
         out = relay.nn.global_max_pool2d(out, layout="NHWC")
     elif typef == "nn.global_avg_pool2d":
-        if dtype == "uint8":
+        if dtype in qnn_dtypes:
             out = relay.cast(out, "int32")
         out = relay.nn.global_avg_pool2d(out, layout="NHWC")
-        if dtype == "uint8":
-            out = relay.cast(out, "uint8")
+        if dtype in qnn_dtypes:
+            out = relay.cast(out, dtype)
     else:
         raise ValueError("Function not supported")
 
@@ -160,7 +163,59 @@ def _get_expected_global_pooling_codegen(shape, dtype, typef):
     return [input, node]
 
 
-def test_pooling():
+def _get_low_high_atol_rtol(dtype):
+    if dtype == "float32":
+        low, high, atol, rtol = (-127, 128, 0.001, 0.001)
+    elif dtype == "uint8":
+        low, high, atol, rtol = (0, 255, 1, 0)
+    elif dtype == "int8":
+        low, high, atol, rtol = (-127, 128, 1, 0)
+    else:
+        pytest.fail(f"dtype not expected: {dtype}")
+
+    return low, high, atol, rtol
+
+
+# fmt: off
+@pytest.mark.parametrize(
+     "typef,dtype,size,stride,dilation,pad,ceil_mode,count_include_pad,input_shape,expected_ops",
+     [
+        ("nn.max_pool2d", "float32",  (3, 3), (2, 2), (1, 1), (0, 0), False, False, (27, 27, 512), (0, 1),),
+        ("nn.max_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 0), False, True,  (16, 16, 16),  (0, 1),),
+        ("nn.max_pool2d", "float32",  (3, 3), (2, 2), (1, 1), (1, 1), True,  True,  (15, 15, 16),  (0, 1),),
+        ("nn.max_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),),
+        ("nn.max_pool2d", "uint8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),),
+        ("nn.max_pool2d", "uint8", (2, 2), (2, 2), (1, 1), (1, 1), True,  True,  (15, 15, 16),  (0, 1),),
+        ("nn.max_pool2d", "uint8", (2, 2), (2, 2), (3, 2), (1, 1), True,  True,  (15, 15, 16),  (1, 0),),
+        ("nn.max_pool2d", "int8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),),
+        ("nn.max_pool2d", "int8", (2, 2), (2, 2), (1, 1), (1, 1), True,  True,  (15, 15, 16),  (0, 1),),
+        ("nn.max_pool2d", "int8", (2, 2), (2, 2), (3, 2), (1, 1), True,  True,  (15, 15, 16),  (1, 0),),
+        ("nn.avg_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (1, 1), False, False, (16, 16, 16),  (0, 1),),
+        ("nn.avg_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 0), False, True,  (16, 16, 16),  (0, 1),),
+        ("nn.avg_pool2d", "float32",  (3, 3), (2, 2), (3, 2), (0, 1), True,  False, (15, 15, 16),  (1, 0),),
+        # 20.05: "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"
+        # ["nn.avg_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), False, True, (16, 16, 16)],
+        ("nn.avg_pool2d", "uint8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),),
+        ("nn.avg_pool2d", "int8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),),
+        ("nn.l2_pool2d",  "float32",  (2, 2), (2, 2), (1, 1), (0, 1), True,  False, (16, 16, 16),  (0, 1),),
+        ("nn.l2_pool2d",  "float32",  (3, 3), (2, 2), (1, 1), (0, 0), False, False, (16, 16, 16),  (0, 1),),
+        ("nn.l2_pool2d",  "float32",  (2, 2), (2, 2), (1, 1), (1, 1), False, True,  (15, 15, 16),  (0, 1),),
+
+     ],
+)
+# fmt: on
+def test_pooling(
+    typef,
+    dtype,
+    size,
+    stride,
+    dilation,
+    pad,
+    ceil_mode,
+    count_include_pad,
+    input_shape,
+    expected_ops,
+):
     Device.load("test_config.json")
 
     if skip_runtime_test():
@@ -169,91 +224,77 @@ def test_pooling():
     device = Device()
     np.random.seed(0)
 
-    fp32_dtype = ("float32", -127, 128, 0.001, 0.001)
-    uint8_dtype = ("uint8", 0, 255, 1, 0)
-    # fmt: off
-    trials = [
-        ["nn.max_pool2d", fp32_dtype,  (3, 3), (2, 2), (1, 1), (0, 0), False, False, (27, 27, 512), (0, 1),],
-        ["nn.max_pool2d", fp32_dtype,  (2, 2), (2, 2), (1, 1), (0, 0), False, True,  (16, 16, 16),  (0, 1),],
-        ["nn.max_pool2d", fp32_dtype,  (3, 3), (2, 2), (1, 1), (1, 1), True,  True,  (15, 15, 16),  (0, 1),],
-        ["nn.max_pool2d", fp32_dtype,  (2, 2), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),],
-        ["nn.max_pool2d", uint8_dtype, (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),],
-        ["nn.max_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), (1, 1), True,  True,  (15, 15, 16),  (0, 1),],
-        ["nn.max_pool2d", uint8_dtype, (2, 2), (2, 2), (3, 2), (1, 1), True,  True,  (15, 15, 16),  (1, 0),],
-        ["nn.avg_pool2d", fp32_dtype,  (2, 2), (2, 2), (1, 1), (1, 1), False, False, (16, 16, 16),  (0, 1),],
-        ["nn.avg_pool2d", fp32_dtype,  (2, 2), (2, 2), (1, 1), (0, 0), False, True,  (16, 16, 16),  (0, 1),],
-        ["nn.avg_pool2d", fp32_dtype,  (3, 3), (2, 2), (3, 2), (0, 1), True,  False, (15, 15, 16),  (1, 0),],
-        # 20.05: "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"
-        # ["nn.avg_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), False, True, (16, 16, 16)],
-        ["nn.avg_pool2d", uint8_dtype, (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16),  (0, 1),],
-        ["nn.l2_pool2d",  fp32_dtype,  (2, 2), (2, 2), (1, 1), (0, 1), True,  False, (16, 16, 16),  (0, 1),],
-        ["nn.l2_pool2d",  fp32_dtype,  (3, 3), (2, 2), (1, 1), (0, 0), False, False, (16, 16, 16),  (0, 1),],
-        ["nn.l2_pool2d",  fp32_dtype,  (2, 2), (2, 2), (1, 1), (1, 1), False, True,  (15, 15, 16),  (0, 1),],
-    ]
-    # fmt: on
-    for (
+    low, high, atol, rtol = _get_low_high_atol_rtol(dtype)
+    tvm_ops, acl_partitions = expected_ops
+
+    shape = (1, *input_shape)
+    outputs = []
+    inputs = {
+        "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
+    }
+
+    func = _get_pooling_model(
+        shape,
+        dtype,
         typef,
-        (dtype, low, high, atol, rtol),
         size,
         stride,
         dilation,
         pad,
         ceil_mode,
         count_include_pad,
-        input_shape,
-        (tvm_ops, acl_partitions),
-    ) in trials:
-        shape = (1, *input_shape)
-        outputs = []
-        inputs = {
-            "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
-        }
-
-        func = _get_pooling_model(
-            shape,
-            dtype,
-            typef,
-            size,
-            stride,
-            dilation,
-            pad,
-            ceil_mode,
-            count_include_pad,
-            iter(inputs),
+        iter(inputs),
+    )
+
+    config = {
+        "size": size,
+        "stride": stride,
+        "shape": shape,
+        "pooling type": typef,
+        "dtype": dtype,
+        "padding": pad,
+        "dilation": dilation,
+        "ceil_mode": ceil_mode,
+        "count_include_pad": count_include_pad,
+        "inputs": inputs,
+    }
+    verify_saturation = True if dtype == "uint8" else False
+    for acl in [False, True]:
+        outputs.append(
+            build_and_run(
+                func,
+                inputs,
+                1,
+                None,
+                device,
+                enable_acl=acl,
+                tvm_ops=tvm_ops,
+                acl_partitions=acl_partitions,
+                config=config,
+            )[0]
         )
 
-        config = {
-            "size": size,
-            "stride": stride,
-            "shape": shape,
-            "pooling type": typef,
-            "dtype": dtype,
-            "padding": pad,
-            "dilation": dilation,
-            "ceil_mode": ceil_mode,
-            "count_include_pad": count_include_pad,
-            "inputs": inputs,
-        }
-        verify_saturation = True if dtype == "uint8" else False
-        for acl in [False, True]:
-            outputs.append(
-                build_and_run(
-                    func,
-                    inputs,
-                    1,
-                    None,
-                    device,
-                    enable_acl=acl,
-                    tvm_ops=tvm_ops,
-                    acl_partitions=acl_partitions,
-                    config=config,
-                )[0]
-            )
-
-        verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=verify_saturation)
-
-
-def test_global_pooling():
+    verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=verify_saturation)
+
+
+@pytest.mark.parametrize(
+    "typef,dtype,input_shape",
+    [
+        ["nn.global_max_pool2d", "float32", (8, 8, 16)],
+        ["nn.global_max_pool2d", "float32", (9, 9, 16)],
+        ["nn.global_max_pool2d", "uint8", (8, 8, 16)],
+        ["nn.global_max_pool2d", "uint8", (9, 9, 16)],
+        ["nn.global_max_pool2d", "int8", (8, 8, 16)],
+        ["nn.global_max_pool2d", "int8", (9, 9, 16)],
+        ["nn.global_avg_pool2d", "float32", (8, 8, 16)],
+        ["nn.global_avg_pool2d", "float32", (9, 9, 16)],
+        ["nn.global_avg_pool2d", "uint8", (8, 8, 16)],
+        ["nn.global_avg_pool2d", "uint8", (9, 9, 16)],
+        ["nn.global_avg_pool2d", "int8", (8, 8, 16)],
+        ["nn.global_avg_pool2d", "int8", (9, 9, 16)],
+    ],
+)
+def test_global_pooling(typef, dtype, input_shape):
     Device.load("test_config.json")
 
     if skip_runtime_test():
@@ -262,118 +303,113 @@ def test_global_pooling():
     device = Device()
     np.random.seed(0)
 
-    fp32_dtype = ("float32", -127, 128, 0.001, 0.001)
-    uint8_dtype = ("uint8", 0, 255, 1, 0)
-
-    trials = [
-        ["nn.global_max_pool2d", fp32_dtype, (8, 8, 16)],
-        ["nn.global_max_pool2d", fp32_dtype, (9, 9, 16)],
-        ["nn.global_max_pool2d", fp32_dtype, (8, 8, 16)],
-        ["nn.global_max_pool2d", uint8_dtype, (8, 8, 16)],
-        ["nn.global_max_pool2d", uint8_dtype, (9, 9, 16)],
-        ["nn.global_avg_pool2d", fp32_dtype, (8, 8, 16)],
-        ["nn.global_avg_pool2d", fp32_dtype, (8, 8, 16)],
-        ["nn.global_avg_pool2d", fp32_dtype, (9, 9, 16)],
-        ["nn.global_avg_pool2d", uint8_dtype, (8, 8, 16)],
-        ["nn.global_avg_pool2d", uint8_dtype, (8, 8, 16)],
-    ]
-
-    for typef, (dtype, low, high, atol, rtol), input_shape in trials:
-        shape = (1, *input_shape)
-        outputs = []
-        inputs = {
-            "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
-        }
-
-        func = _get_global_pooling_model(shape, dtype, typef, iter(inputs))
-        config = {
-            "shape": shape,
-            "pooling type": typef,
-            "dtype": dtype,
-        }
-        verify_saturation = True if dtype == "uint8" else False
-
-        for acl in [False, True]:
-            outputs.append(
-                build_and_run(func, inputs, 1, None, device, enable_acl=acl, config=config)[0]
-            )
-
-        verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=verify_saturation)
-
-
-def test_codegen_pooling():
-    if skip_codegen_test():
-        return
+    low, high, rtol, atol = _get_low_high_atol_rtol(dtype)
 
-    fp32_dtype = ("float32", -127, 128)
-    uint8_dtype = ("uint8", 0, 255)
-    # fmt: off
-    trials = [
-        ["nn.max_pool2d", fp32_dtype,  (2, 2), (2, 2), (1, 1), (0, 0), False,  True, (16, 16, 16), (0, 1),],
-        ["nn.max_pool2d", fp32_dtype,  (3, 3), (2, 2), (1, 1), (1, 1),  True,  True, (15, 15, 16), (0, 1),],
-        ["nn.max_pool2d", fp32_dtype,  (2, 2), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),],
-        ["nn.max_pool2d", uint8_dtype, (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),],
-        ["nn.max_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), (1, 1),  True,  True, (15, 15, 16), (0, 1),],
-        ["nn.max_pool2d", uint8_dtype, (2, 2), (2, 2), (3, 2), (1, 1),  True,  True, (15, 15, 16), (1, 0),],
-        ["nn.avg_pool2d", fp32_dtype,  (2, 2), (2, 2), (1, 1), (1, 1), False, False, (16, 16, 16), (0, 1),],
-        ["nn.avg_pool2d", fp32_dtype,  (2, 2), (2, 2), (1, 1), (1, 1), False, False, (16, 16, 16), (0, 1),],
-        ["nn.avg_pool2d", fp32_dtype,  (2, 2), (2, 2), (1, 1), (0, 0), False,  True, (16, 16, 16), (0, 1),],
-        ["nn.avg_pool2d", fp32_dtype,  (3, 3), (2, 2), (3, 2), (0, 1),  True, False, (15, 15, 16), (1, 0),],
-        ["nn.avg_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), (1, 1), False,  True, (16, 16, 16), (0, 1),],
-        ["nn.avg_pool2d", uint8_dtype, (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),],
-        ["nn.l2_pool2d",  fp32_dtype,  (2, 2), (2, 2), (1, 1), (0, 1),  True, False, (15, 15, 16), (0, 1),],
-        ["nn.l2_pool2d",  fp32_dtype,  (3, 3), (2, 2), (1, 1), (0, 0), False, False, (16, 16, 16), (0, 1),],
-        ["nn.l2_pool2d",  fp32_dtype,  (2, 2), (2, 2), (1, 1), (1, 1), False,  True, (15, 15, 16), (0, 1),],
-    ]
-    # fmt: on
-    for (
-        typef,
-        (dtype, low, high),
-        size,
-        stride,
-        dilation,
-        pad,
-        ceil_mode,
-        count_include_pad,
-        input_shape,
-        (tvm_ops, acl_partitions),
-    ) in trials:
-        shape = (1, *input_shape)
-        inputs = {"a"}
-        args = (shape, dtype, typef, size, stride, dilation, pad, False, False)
-        func = _get_pooling_model(*args, iter(inputs))
-        exp_codegen = _get_expected_pooling_codegen(*args)
+    shape = (1, *input_shape)
+    outputs = []
+    inputs = {
+        "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
+    }
 
-        verify_codegen(func, exp_codegen, acl_partitions, tvm_ops)
+    func = _get_global_pooling_model(shape, dtype, typef, iter(inputs))
+    config = {
+        "shape": shape,
+        "pooling type": typef,
+        "dtype": dtype,
+    }
+    verify_saturation = True if dtype in ("uint8", "int8") else False
 
+    for acl in [False, True]:
+        outputs.append(
+            build_and_run(func, inputs, 1, None, device, enable_acl=acl, config=config)[0]
+        )
+
+    verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=verify_saturation)
+
+
+# fmt: off
+@pytest.mark.parametrize(
+     "typef,dtype,size,stride,dilation,pad,ceil_mode,count_include_pad,input_shape,expected_ops",
+     [
+        ("nn.max_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 0), False,  True, (16, 16, 16), (0, 1),),
+        ("nn.max_pool2d", "float32",  (3, 3), (2, 2), (1, 1), (1, 1),  True,  True, (15, 15, 16), (0, 1),),
+        ("nn.max_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),),
+        ("nn.max_pool2d", "uint8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),),
+        ("nn.max_pool2d", "uint8", (2, 2), (2, 2), (1, 1), (1, 1),  True,  True, (15, 15, 16), (0, 1),),
+        ("nn.max_pool2d", "uint8", (2, 2), (2, 2), (3, 2), (1, 1),  True,  True, (15, 15, 16), (1, 0),),
+        ("nn.max_pool2d", "int8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),),
+        ("nn.max_pool2d", "int8", (2, 2), (2, 2), (1, 1), (1, 1),  True,  True, (15, 15, 16), (0, 1),),
+        ("nn.max_pool2d", "int8", (2, 2), (2, 2), (3, 2), (1, 1),  True,  True, (15, 15, 16), (1, 0),),
+        ("nn.avg_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (1, 1), False, False, (16, 16, 16), (0, 1),),
+        ("nn.avg_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (1, 1), False, False, (16, 16, 16), (0, 1),),
+        ("nn.avg_pool2d", "float32",  (2, 2), (2, 2), (1, 1), (0, 0), False,  True, (16, 16, 16), (0, 1),),
+        ("nn.avg_pool2d", "float32",  (3, 3), (2, 2), (3, 2), (0, 1),  True, False, (15, 15, 16), (1, 0),),
+        ("nn.avg_pool2d", "uint8", (2, 2), (2, 2), (1, 1), (1, 1), False,  True, (16, 16, 16), (0, 1),),
+        ("nn.avg_pool2d", "uint8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),),
+        ("nn.avg_pool2d", "int8", (2, 2), (2, 2), (1, 1), (1, 1), False,  True, (16, 16, 16), (0, 1),),
+        ("nn.avg_pool2d", "int8", (3, 3), (2, 2), (1, 1), (0, 1), False, False, (16, 16, 16), (0, 1),),
+        ("nn.l2_pool2d",  "float32",  (2, 2), (2, 2), (1, 1), (0, 1),  True, False, (15, 15, 16), (0, 1),),
+        ("nn.l2_pool2d",  "float32",  (3, 3), (2, 2), (1, 1), (0, 0), False, False, (16, 16, 16), (0, 1),),
+        ("nn.l2_pool2d",  "float32",  (2, 2), (2, 2), (1, 1), (1, 1), False,  True, (15, 15, 16), (0, 1),),
+     ],
+)
+# fmt: on
+def test_codegen_pooling(
+    typef,
+    dtype,
+    size,
+    stride,
+    dilation,
+    pad,
+    ceil_mode,
+    count_include_pad,
+    input_shape,
+    expected_ops,
+):
+    if skip_codegen_test():
+        return
 
-def test_codegen_global_pooling():
+    low, high, _, _ = _get_low_high_atol_rtol(dtype)
+    tvm_ops, acl_partitions = expected_ops
+
+    shape = (1, *input_shape)
+    inputs = {"a"}
+    args = (shape, dtype, typef, size, stride, dilation, pad, False, False)
+    func = _get_pooling_model(*args, iter(inputs))
+    exp_codegen = _get_expected_pooling_codegen(*args)
+
+    verify_codegen(func, exp_codegen, acl_partitions, tvm_ops)
+
+
+@pytest.mark.parametrize(
+    "typef,dtype,input_shape",
+    [
+        ("nn.global_max_pool2d", "float32", (8, 8, 16)),
+        ("nn.global_max_pool2d", "float32", (9, 9, 16)),
+        ("nn.global_max_pool2d", "uint8", (8, 8, 16)),
+        ("nn.global_max_pool2d", "uint8", (9, 9, 16)),
+        ("nn.global_max_pool2d", "int8", (8, 8, 16)),
+        ("nn.global_max_pool2d", "int8", (9, 9, 16)),
+        ("nn.global_avg_pool2d", "float32", (8, 8, 16)),
+        ("nn.global_avg_pool2d", "float32", (9, 9, 16)),
+        ("nn.global_avg_pool2d", "uint8", (8, 8, 16)),
+        ("nn.global_avg_pool2d", "uint8", (9, 9, 16)),
+        ("nn.global_avg_pool2d", "int8", (8, 8, 16)),
+        ("nn.global_avg_pool2d", "int8", (9, 9, 16)),
+    ],
+)
+def test_codegen_global_pooling(typef, dtype, input_shape):
     if skip_codegen_test():
         return
 
-    fp32_dtype = ("float32", -127, 128)
-    uint8_dtype = ("uint8", 0, 255)
-
-    trials = [
-        ["nn.global_max_pool2d", fp32_dtype, (8, 8, 16)],
-        ["nn.global_max_pool2d", fp32_dtype, (9, 9, 16)],
-        ["nn.global_max_pool2d", fp32_dtype, (8, 8, 16)],
-        ["nn.global_max_pool2d", uint8_dtype, (8, 8, 16)],
-        ["nn.global_max_pool2d", uint8_dtype, (9, 9, 16)],
-        ["nn.global_avg_pool2d", fp32_dtype, (8, 8, 16)],
-        ["nn.global_avg_pool2d", fp32_dtype, (8, 8, 16)],
-        ["nn.global_avg_pool2d", fp32_dtype, (9, 9, 16)],
-        ["nn.global_avg_pool2d", uint8_dtype, (8, 8, 16)],
-        ["nn.global_avg_pool2d", uint8_dtype, (8, 8, 16)],
-    ]
-
-    for typef, (dtype, low, high), input_shape in trials:
-        shape = (1, *input_shape)
-        inputs = {"a"}
-        args = (shape, dtype, typef)
-        func = _get_global_pooling_model(*args, iter(inputs))
-        exp_codegen = _get_expected_global_pooling_codegen(*args)
-        verify_codegen(func, exp_codegen, 1)
+    low, high, _, _ = _get_low_high_atol_rtol(dtype)
+
+    shape = (1, *input_shape)
+    inputs = {"a"}
+    args = (shape, dtype, typef)
+    func = _get_global_pooling_model(*args, iter(inputs))
+    exp_codegen = _get_expected_global_pooling_codegen(*args)
+    verify_codegen(func, exp_codegen, 1)
 
 
 if __name__ == "__main__":

From 1e5fc256491b3ba88d938fb51013ad5d32f70ac6 Mon Sep 17 00:00:00 2001
From: arangasa <76030063+arangasa@users.noreply.github.com>
Date: Mon, 28 Nov 2022 19:07:55 +0530
Subject: [PATCH 663/704] =?UTF-8?q?[Hexagon]Call=20Acquire/Release=20resou?=
 =?UTF-8?q?rces=20API=20in=20Hexagon=20Launcher=20durin=E2=80=A6=20(#13495?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[Hexagon]Call Acquire/Release resources API in Hexagon Launcher during RPC Open/Close
---
 apps/hexagon_launcher/launcher_hexagon.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/apps/hexagon_launcher/launcher_hexagon.cc b/apps/hexagon_launcher/launcher_hexagon.cc
index 2692caf90e66..63659cb5044d 100644
--- a/apps/hexagon_launcher/launcher_hexagon.cc
+++ b/apps/hexagon_launcher/launcher_hexagon.cc
@@ -47,11 +47,17 @@ static AEEResult error_too_small(const std::string& func_name, const std::string
 int __QAIC_HEADER(launcher_rpc_open)(const char* uri, remote_handle64* handle) {
   *handle = 0;  // Just use any value.
   reset_device_api();
+  static const tvm::runtime::PackedFunc acq_res =
+      get_runtime_func("device_api.hexagon.acquire_resources");
+  acq_res();
   return AEE_SUCCESS;
 }
 
 int __QAIC_HEADER(launcher_rpc_close)(remote_handle64 handle) {
   // Comment to stop clang-format from single-lining this function.
+  static const tvm::runtime::PackedFunc rel_res =
+      get_runtime_func("device_api.hexagon.release_resources");
+  rel_res();
   return AEE_SUCCESS;
 }
 

From 449d674e8dcdacaf954d19ebcc26416f7959d231 Mon Sep 17 00:00:00 2001
From: abhikran-quic <63697863+abhikran-quic@users.noreply.github.com>
Date: Mon, 28 Nov 2022 19:53:15 +0530
Subject: [PATCH 664/704] [TIR][Schedule] Add condition to check buffer type
 (#13429)

Add condition to check buffer type while creating array for padding.

Co-authored by: Anirudh Sundar Subramaniam <quic_sanirudh@quicinc.com>
---
 python/tvm/tir/schedule/schedule.py               |  4 ++--
 .../test_tir_schedule_transform_layout.py         | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index 69feaff53aa3..91c42f2a8d1d 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -2751,9 +2751,9 @@ def two_elementwise_transformed_intermediate_buffer(a: T.handle, c: T.handle) ->
             # buffer's type.  If the default `tvm.runtime.convert`
             # behavior is applied, these would be converted to
             # int32/float32, which may not match the buffer's type.
-            if isinstance(pad_value, int):
+            if "int" in buffer_obj.dtype and isinstance(pad_value, int):
                 pad_value = IntImm(buffer_obj.dtype, pad_value)
-            elif isinstance(pad_value, float):
+            elif "float" in buffer_obj.dtype and isinstance(pad_value, float):
                 pad_value = FloatImm(buffer_obj.dtype, pad_value)
             pad_value = IndexMap.from_func(
                 lambda *indices: pad_value, ndim=len(index_map.final_indices)
diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py
index faeaf8768681..1ab61c08b05e 100644
--- a/tests/python/unittest/test_tir_schedule_transform_layout.py
+++ b/tests/python/unittest/test_tir_schedule_transform_layout.py
@@ -533,6 +533,21 @@ def before():
     expected = tvm.tir.schedule.schedule.ScheduleError
 
 
+class TestErrorOnNonMatchingTypes(BasePaddingCompare):
+    """The padding must have the same dtype as the buffer"""
+
+    pad_value = tvm.testing.parameter(0)
+
+    def before():
+        A = T.alloc_buffer(14, "float32")
+        for i in T.serial(14):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                A[vi] = 0
+
+    expected = tvm.tir.schedule.schedule.ScheduleError
+
+
 class TestPaddedTransformIfThenElse(BasePaddingCompare):
     """Use if_then_else to represent padding, if possible.
 

From 2a812f90f311310e63d648c103be6d97a660c079 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 28 Nov 2022 15:15:22 +0000
Subject: [PATCH 665/704] Bump pillow from 8.3.2 to 9.3.0 in
 /apps/microtvm/cmsisnn (#13465)

Bumps [pillow](https://github.com/python-pillow/Pillow) from 8.3.2 to 9.3.0.
- [Release notes](https://github.com/python-pillow/Pillow/releases)
- [Changelog](https://github.com/python-pillow/Pillow/blob/main/CHANGES.rst)
- [Commits](https://github.com/python-pillow/Pillow/compare/8.3.2...9.3.0)

---
updated-dependencies:
- dependency-name: pillow
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 apps/microtvm/cmsisnn/requirements.txt | 116 +++++++++++++------------
 1 file changed, 62 insertions(+), 54 deletions(-)

diff --git a/apps/microtvm/cmsisnn/requirements.txt b/apps/microtvm/cmsisnn/requirements.txt
index 3399d4a7f7be..72ae166963ee 100644
--- a/apps/microtvm/cmsisnn/requirements.txt
+++ b/apps/microtvm/cmsisnn/requirements.txt
@@ -99,60 +99,68 @@ numpy==1.21.3 \
     --hash=sha256:f41b018f126aac18583956c54544db437f25c7ee4794bcb23eb38bef8e5e192a \
     --hash=sha256:f8f4625536926a155b80ad2bbff44f8cc59e9f2ad14cdda7acf4c135b4dc8ff2 \
     --hash=sha256:fe52dbe47d9deb69b05084abd4b0df7abb39a3c51957c09f635520abd49b29dd
-Pillow==8.3.2 \
-    --hash=sha256:0412516dcc9de9b0a1e0ae25a280015809de8270f134cc2c1e32c4eeb397cf30 \
-    --hash=sha256:04835e68ef12904bc3e1fd002b33eea0779320d4346082bd5b24bec12ad9c3e9 \
-    --hash=sha256:06d1adaa284696785375fa80a6a8eb309be722cf4ef8949518beb34487a3df71 \
-    --hash=sha256:085a90a99404b859a4b6c3daa42afde17cb3ad3115e44a75f0d7b4a32f06a6c9 \
-    --hash=sha256:0b9911ec70731711c3b6ebcde26caea620cbdd9dcb73c67b0730c8817f24711b \
-    --hash=sha256:10e00f7336780ca7d3653cf3ac26f068fa11b5a96894ea29a64d3dc4b810d630 \
-    --hash=sha256:11c27e74bab423eb3c9232d97553111cc0be81b74b47165f07ebfdd29d825875 \
-    --hash=sha256:11eb7f98165d56042545c9e6db3ce394ed8b45089a67124298f0473b29cb60b2 \
-    --hash=sha256:13654b521fb98abdecec105ea3fb5ba863d1548c9b58831dd5105bb3873569f1 \
-    --hash=sha256:15ccb81a6ffc57ea0137f9f3ac2737ffa1d11f786244d719639df17476d399a7 \
-    --hash=sha256:18a07a683805d32826c09acfce44a90bf474e6a66ce482b1c7fcd3757d588df3 \
-    --hash=sha256:19ec4cfe4b961edc249b0e04b5618666c23a83bc35842dea2bfd5dfa0157f81b \
-    --hash=sha256:1c3ff00110835bdda2b1e2b07f4a2548a39744bb7de5946dc8e95517c4fb2ca6 \
-    --hash=sha256:27a330bf7014ee034046db43ccbb05c766aa9e70b8d6c5260bfc38d73103b0ba \
-    --hash=sha256:2b11c9d310a3522b0fd3c35667914271f570576a0e387701f370eb39d45f08a4 \
-    --hash=sha256:2c661542c6f71dfd9dc82d9d29a8386287e82813b0375b3a02983feac69ef864 \
-    --hash=sha256:2cde7a4d3687f21cffdf5bb171172070bb95e02af448c4c8b2f223d783214056 \
-    --hash=sha256:2d5e9dc0bf1b5d9048a94c48d0813b6c96fccfa4ccf276d9c36308840f40c228 \
-    --hash=sha256:2f23b2d3079522fdf3c09de6517f625f7a964f916c956527bed805ac043799b8 \
-    --hash=sha256:35d27687f027ad25a8d0ef45dd5208ef044c588003cdcedf05afb00dbc5c2deb \
-    --hash=sha256:35d409030bf3bd05fa66fb5fdedc39c521b397f61ad04309c90444e893d05f7d \
-    --hash=sha256:4326ea1e2722f3dc00ed77c36d3b5354b8fb7399fb59230249ea6d59cbed90da \
-    --hash=sha256:4abc247b31a98f29e5224f2d31ef15f86a71f79c7f4d2ac345a5d551d6393073 \
-    --hash=sha256:4d89a2e9219a526401015153c0e9dd48319ea6ab9fe3b066a20aa9aee23d9fd3 \
-    --hash=sha256:4e59e99fd680e2b8b11bbd463f3c9450ab799305d5f2bafb74fefba6ac058616 \
-    --hash=sha256:548794f99ff52a73a156771a0402f5e1c35285bd981046a502d7e4793e8facaa \
-    --hash=sha256:56fd98c8294f57636084f4b076b75f86c57b2a63a8410c0cd172bc93695ee979 \
-    --hash=sha256:59697568a0455764a094585b2551fd76bfd6b959c9f92d4bdec9d0e14616303a \
-    --hash=sha256:6bff50ba9891be0a004ef48828e012babaaf7da204d81ab9be37480b9020a82b \
-    --hash=sha256:6cb3dd7f23b044b0737317f892d399f9e2f0b3a02b22b2c692851fb8120d82c6 \
-    --hash=sha256:7dbfbc0020aa1d9bc1b0b8bcf255a7d73f4ad0336f8fd2533fcc54a4ccfb9441 \
-    --hash=sha256:838eb85de6d9307c19c655c726f8d13b8b646f144ca6b3771fa62b711ebf7624 \
-    --hash=sha256:8b68f565a4175e12e68ca900af8910e8fe48aaa48fd3ca853494f384e11c8bcd \
-    --hash=sha256:8f284dc1695caf71a74f24993b7c7473d77bc760be45f776a2c2f4e04c170550 \
-    --hash=sha256:963ebdc5365d748185fdb06daf2ac758116deecb2277ec5ae98139f93844bc09 \
-    --hash=sha256:a048dad5ed6ad1fad338c02c609b862dfaa921fcd065d747194a6805f91f2196 \
-    --hash=sha256:a1bd983c565f92779be456ece2479840ec39d386007cd4ae83382646293d681b \
-    --hash=sha256:a66566f8a22561fc1a88dc87606c69b84fa9ce724f99522cf922c801ec68f5c1 \
-    --hash=sha256:bcb04ff12e79b28be6c9988f275e7ab69f01cc2ba319fb3114f87817bb7c74b6 \
-    --hash=sha256:bd24054aaf21e70a51e2a2a5ed1183560d3a69e6f9594a4bfe360a46f94eba83 \
-    --hash=sha256:be25cb93442c6d2f8702c599b51184bd3ccd83adebd08886b682173e09ef0c3f \
-    --hash=sha256:c691b26283c3a31594683217d746f1dad59a7ae1d4cfc24626d7a064a11197d4 \
-    --hash=sha256:cc9d0dec711c914ed500f1d0d3822868760954dce98dfb0b7382a854aee55d19 \
-    --hash=sha256:ce2e5e04bb86da6187f96d7bab3f93a7877830981b37f0287dd6479e27a10341 \
-    --hash=sha256:ce651ca46d0202c302a535d3047c55a0131a720cf554a578fc1b8a2aff0e7d96 \
-    --hash=sha256:d0c8ebbfd439c37624db98f3877d9ed12c137cadd99dde2d2eae0dab0bbfc355 \
-    --hash=sha256:d675a876b295afa114ca8bf42d7f86b5fb1298e1b6bb9a24405a3f6c8338811c \
-    --hash=sha256:dde3f3ed8d00c72631bc19cbfff8ad3b6215062a5eed402381ad365f82f0c18c \
-    --hash=sha256:e5a31c07cea5edbaeb4bdba6f2b87db7d3dc0f446f379d907e51cc70ea375629 \
-    --hash=sha256:f514c2717012859ccb349c97862568fdc0479aad85b0270d6b5a6509dbc142e2 \
-    --hash=sha256:fc0db32f7223b094964e71729c0361f93db43664dd1ec86d3df217853cedda87 \
-    --hash=sha256:fd4fd83aa912d7b89b4b4a1580d30e2a4242f3936882a3f433586e5ab97ed0d5 \
-    --hash=sha256:feb5db446e96bfecfec078b943cc07744cc759893cef045aa8b8b6d6aaa8274e
+Pillow==9.3.0 \
+    --hash=sha256:03150abd92771742d4a8cd6f2fa6246d847dcd2e332a18d0c15cc75bf6703040 \
+    --hash=sha256:073adb2ae23431d3b9bcbcff3fe698b62ed47211d0716b067385538a1b0f28b8 \
+    --hash=sha256:0b07fffc13f474264c336298d1b4ce01d9c5a011415b79d4ee5527bb69ae6f65 \
+    --hash=sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2 \
+    --hash=sha256:12ce4932caf2ddf3e41d17fc9c02d67126935a44b86df6a206cf0d7161548627 \
+    --hash=sha256:15c42fb9dea42465dfd902fb0ecf584b8848ceb28b41ee2b58f866411be33f07 \
+    --hash=sha256:18498994b29e1cf86d505edcb7edbe814d133d2232d256db8c7a8ceb34d18cef \
+    --hash=sha256:1c7c8ae3864846fc95f4611c78129301e203aaa2af813b703c55d10cc1628535 \
+    --hash=sha256:22b012ea2d065fd163ca096f4e37e47cd8b59cf4b0fd47bfca6abb93df70b34c \
+    --hash=sha256:276a5ca930c913f714e372b2591a22c4bd3b81a418c0f6635ba832daec1cbcfc \
+    --hash=sha256:2e0918e03aa0c72ea56edbb00d4d664294815aa11291a11504a377ea018330d3 \
+    --hash=sha256:3033fbe1feb1b59394615a1cafaee85e49d01b51d54de0cbf6aa8e64182518a1 \
+    --hash=sha256:3168434d303babf495d4ba58fc22d6604f6e2afb97adc6a423e917dab828939c \
+    --hash=sha256:32a44128c4bdca7f31de5be641187367fe2a450ad83b833ef78910397db491aa \
+    --hash=sha256:3dd6caf940756101205dffc5367babf288a30043d35f80936f9bfb37f8355b32 \
+    --hash=sha256:40e1ce476a7804b0fb74bcfa80b0a2206ea6a882938eaba917f7a0f004b42502 \
+    --hash=sha256:41e0051336807468be450d52b8edd12ac60bebaa97fe10c8b660f116e50b30e4 \
+    --hash=sha256:4390e9ce199fc1951fcfa65795f239a8a4944117b5935a9317fb320e7767b40f \
+    --hash=sha256:502526a2cbfa431d9fc2a079bdd9061a2397b842bb6bc4239bb176da00993812 \
+    --hash=sha256:51e0e543a33ed92db9f5ef69a0356e0b1a7a6b6a71b80df99f1d181ae5875636 \
+    --hash=sha256:57751894f6618fd4308ed8e0c36c333e2f5469744c34729a27532b3db106ee20 \
+    --hash=sha256:5d77adcd56a42d00cc1be30843d3426aa4e660cab4a61021dc84467123f7a00c \
+    --hash=sha256:655a83b0058ba47c7c52e4e2df5ecf484c1b0b0349805896dd350cbc416bdd91 \
+    --hash=sha256:68943d632f1f9e3dce98908e873b3a090f6cba1cbb1b892a9e8d97c938871fbe \
+    --hash=sha256:6c738585d7a9961d8c2821a1eb3dcb978d14e238be3d70f0a706f7fa9316946b \
+    --hash=sha256:73bd195e43f3fadecfc50c682f5055ec32ee2c933243cafbfdec69ab1aa87cad \
+    --hash=sha256:772a91fc0e03eaf922c63badeca75e91baa80fe2f5f87bdaed4280662aad25c9 \
+    --hash=sha256:77ec3e7be99629898c9a6d24a09de089fa5356ee408cdffffe62d67bb75fdd72 \
+    --hash=sha256:7db8b751ad307d7cf238f02101e8e36a128a6cb199326e867d1398067381bff4 \
+    --hash=sha256:801ec82e4188e935c7f5e22e006d01611d6b41661bba9fe45b60e7ac1a8f84de \
+    --hash=sha256:82409ffe29d70fd733ff3c1025a602abb3e67405d41b9403b00b01debc4c9a29 \
+    --hash=sha256:828989c45c245518065a110434246c44a56a8b2b2f6347d1409c787e6e4651ee \
+    --hash=sha256:829f97c8e258593b9daa80638aee3789b7df9da5cf1336035016d76f03b8860c \
+    --hash=sha256:871b72c3643e516db4ecf20efe735deb27fe30ca17800e661d769faab45a18d7 \
+    --hash=sha256:89dca0ce00a2b49024df6325925555d406b14aa3efc2f752dbb5940c52c56b11 \
+    --hash=sha256:90fb88843d3902fe7c9586d439d1e8c05258f41da473952aa8b328d8b907498c \
+    --hash=sha256:97aabc5c50312afa5e0a2b07c17d4ac5e865b250986f8afe2b02d772567a380c \
+    --hash=sha256:9aaa107275d8527e9d6e7670b64aabaaa36e5b6bd71a1015ddd21da0d4e06448 \
+    --hash=sha256:9f47eabcd2ded7698106b05c2c338672d16a6f2a485e74481f524e2a23c2794b \
+    --hash=sha256:a0a06a052c5f37b4ed81c613a455a81f9a3a69429b4fd7bb913c3fa98abefc20 \
+    --hash=sha256:ab388aaa3f6ce52ac1cb8e122c4bd46657c15905904b3120a6248b5b8b0bc228 \
+    --hash=sha256:ad58d27a5b0262c0c19b47d54c5802db9b34d38bbf886665b626aff83c74bacd \
+    --hash=sha256:ae5331c23ce118c53b172fa64a4c037eb83c9165aba3a7ba9ddd3ec9fa64a699 \
+    --hash=sha256:af0372acb5d3598f36ec0914deed2a63f6bcdb7b606da04dc19a88d31bf0c05b \
+    --hash=sha256:afa4107d1b306cdf8953edde0534562607fe8811b6c4d9a486298ad31de733b2 \
+    --hash=sha256:b03ae6f1a1878233ac620c98f3459f79fd77c7e3c2b20d460284e1fb370557d4 \
+    --hash=sha256:b0915e734b33a474d76c28e07292f196cdf2a590a0d25bcc06e64e545f2d146c \
+    --hash=sha256:b4012d06c846dc2b80651b120e2cdd787b013deb39c09f407727ba90015c684f \
+    --hash=sha256:b472b5ea442148d1c3e2209f20f1e0bb0eb556538690fa70b5e1f79fa0ba8dc2 \
+    --hash=sha256:b59430236b8e58840a0dfb4099a0e8717ffb779c952426a69ae435ca1f57210c \
+    --hash=sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3 \
+    --hash=sha256:b9a65733d103311331875c1dca05cb4606997fd33d6acfed695b1232ba1df193 \
+    --hash=sha256:bac18ab8d2d1e6b4ce25e3424f709aceef668347db8637c2296bcf41acb7cf48 \
+    --hash=sha256:bca31dd6014cb8b0b2db1e46081b0ca7d936f856da3b39744aef499db5d84d02 \
+    --hash=sha256:be55f8457cd1eac957af0c3f5ece7bc3f033f89b114ef30f710882717670b2a8 \
+    --hash=sha256:c7025dce65566eb6e89f56c9509d4f628fddcedb131d9465cacd3d8bac337e7e \
+    --hash=sha256:c935a22a557a560108d780f9a0fc426dd7459940dc54faa49d83249c8d3e760f \
+    --hash=sha256:dbb8e7f2abee51cef77673be97760abff1674ed32847ce04b4af90f610144c7b \
+    --hash=sha256:e6ea6b856a74d560d9326c0f5895ef8050126acfdc7ca08ad703eb0081e82b74 \
+    --hash=sha256:ebf2029c1f464c59b8bdbe5143c79fa2045a581ac53679733d3a91d400ff9efb \
+    --hash=sha256:f1ff2ee69f10f13a9596480335f406dd1f70c3650349e2be67ca3139280cade0
 psutil==5.8.0 \
     --hash=sha256:0066a82f7b1b37d334e68697faba68e5ad5e858279fd6351c8ca6024e8d6ba64 \
     --hash=sha256:02b8292609b1f7fcb34173b25e48d0da8667bc85f81d7476584d889c6e0f2131 \

From 36d18e905bc58e18ed80356b9161ddfbbbf098bf Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 29 Nov 2022 02:54:06 +0900
Subject: [PATCH 666/704] [TIR] Fix buffer shape and IndexMap indices dtype
 mismatch (#13463)

* [TIR] Fix buffer shape and IndexMap indices dtype mismatch

* turn off debug_mask to suppress flaky VerifySRefTree error

* add comment

* add missing const auto&, handle cases dtypes partially match

* massively simplify test case
---
 .../primitive/layout_transformation.cc        | 34 ++++++++++++++++++-
 .../test_tir_schedule_transform_layout.py     | 21 ++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc
index c0b4ddfb4ac3..bf618af8de54 100644
--- a/src/tir/schedule/primitive/layout_transformation.cc
+++ b/src/tir/schedule/primitive/layout_transformation.cc
@@ -1055,13 +1055,45 @@ class TransformationIntroducesPaddingError : public ScheduleError {
   PrimExpr padding_predicate_;
 };
 
+// Make the dtypes of indices in IndexMap be the same as the dtype of the buffer shape, to avoid
+// dtype-mismatch issues later.
+IndexMap LegalizeIndexMapDType(const IndexMap& index_map, const Buffer& buf) {
+  const auto& initial_indices_orig = index_map->initial_indices;
+  ICHECK(buf->shape.size() == initial_indices_orig.size());
+
+  Array<Var> initial_indices;
+  Map<Var, PrimExpr> var_map;
+
+  for (size_t i = 0; i < buf->shape.size(); ++i) {
+    if (buf->shape[i]->dtype != initial_indices_orig[i].dtype()) {
+      auto new_idx = Var(initial_indices_orig[i]->name_hint, buf->shape[i]->dtype);
+      initial_indices.push_back(new_idx);
+      var_map.Set(initial_indices_orig[i], new_idx);
+    } else {
+      initial_indices.push_back(initial_indices_orig[i]);
+    }
+  }
+
+  if (!var_map.empty()) {
+    auto final_indices = index_map->final_indices.Map([&](PrimExpr index) {
+      return SubstituteWithDataTypeLegalization(index,
+                                                [&](const Var& var) { return var_map.Get(var); });
+    });
+    return IndexMap(initial_indices, final_indices);
+  }
+  return index_map;
+}
+
 void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
-                     BufferIndexType buffer_index_type, const IndexMap& index_map,
+                     BufferIndexType buffer_index_type, const IndexMap& index_map_orig,
                      const Optional<IndexMap>& pad_value) {
   // Step 1: Input handling and error checking
   const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref);
   Buffer old_buffer =
       GetNthAccessBuffer(self, GetRef<Block>(block_ptr), buffer_index, buffer_index_type);
+
+  auto index_map = LegalizeIndexMapDType(index_map_orig, old_buffer);
+
   auto [defining_site_sref, is_alloc] = GetBufferDefiningSite(block_sref, old_buffer);
   if (defining_site_sref.defined() && !is_alloc) {
     throw BufferIsSubregionError(self->mod, old_buffer);
diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py
index 1ab61c08b05e..b4e49316f123 100644
--- a/tests/python/unittest/test_tir_schedule_transform_layout.py
+++ b/tests/python/unittest/test_tir_schedule_transform_layout.py
@@ -938,5 +938,26 @@ def expected(a: T.handle):
                 A[i, j] = T.if_then_else(i == 3 and 2 <= j, 0, 42, dtype="int32")
 
 
+def test_index_map_dtype_legalize():
+    """Test dtype legalization of the index map indices."""
+
+    @T.prim_func
+    def func(A: T.Buffer[T.int64(58), "int32"]):
+        for i in T.serial(T.int64(58)):
+            with T.block("block"):
+                vi = T.axis.remap("S", [i])
+                T.writes(A[vi])
+                A[vi] = 0
+
+    sch = tir.Schedule(func)
+
+    # # The following error is raised from the IterVar constructor without the dtype legalization.
+    # # TVMError: Check failed: dom->extent.dtype() == var.dtype() (int64 vs. int32) :
+    # # The dtype of the extent of an IterVar (int64) must match its associated Var's dtype (int32)
+    sch.transform_layout(
+        sch.get_block("block"), buffer="A", index_map=lambda h: [h // 8, h % 8], pad_value=0
+    )
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 3252362d94aaccedeb760a1c29df12910474cfd1 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Mon, 28 Nov 2022 09:55:21 -0800
Subject: [PATCH 667/704] [MetaSchedule] Enhance Database Validation Script
 (#13459)

* Unify output function.

* Support reuse input & local mod results.

* Fix issues.

* Remove exception inputs in results.

* Check point.

* Check point.

* Check point.

* Resolve issues.

* Avoid nullptr.

* Linting.

* Linting.

* Move function out of initializer.

* Support local runner.
---
 .../testing/validate_database.py              | 781 ++++++++++++++----
 .../measure_callback/remove_build_artifact.cc |   2 +-
 2 files changed, 638 insertions(+), 145 deletions(-)

diff --git a/python/tvm/meta_schedule/testing/validate_database.py b/python/tvm/meta_schedule/testing/validate_database.py
index 5e48bfb6b04e..a5981a78d645 100644
--- a/python/tvm/meta_schedule/testing/validate_database.py
+++ b/python/tvm/meta_schedule/testing/validate_database.py
@@ -15,22 +15,26 @@
 # specific language governing permissions and limitations
 # under the License.
 """JSON Database validation script"""
-from typing import Union, Callable, List
-from distutils.util import strtobool
 import argparse
 import logging
 import warnings
+import itertools
+from statistics import mean
+from distutils.util import strtobool
+from typing import Callable, Tuple, Union, List, Any
 import numpy as np  # type: ignore
 
 import tvm
-from tvm.target import Target
-from tvm.ir import IRModule
-from tvm.tir import Schedule
 from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
-from tvm.meta_schedule.testing.tune_utils import create_calculator, generate_input_data
 from tvm._ffi import get_global_func, register_func
+from tvm.ir import IRModule
 from tvm.support import describe
+from tvm.target import Target
+from tvm.tir import Schedule
+from tvm.tir.schedule import Trace
+from tvm.meta_schedule.utils import remove_build_dir
+from tvm.meta_schedule.testing.tune_utils import generate_input_data
+from tvm.tir.tensor_intrin import *  # type: ignore # pylint: disable=wildcard-import,unused-wildcard-import
 
 DELIMITOR = "\n" + "-" * 30 + "\n"
 
@@ -55,20 +59,24 @@ def _parse_args():
         required=False,
         help="The baseline target to compile the original module.",
     )
+    args.add_argument(
+        "--top-k",
+        type=int,
+        default=10**9,
+        required=False,
+        help="The number of top-k tuning records to validate for each unique original workload.",
+    )
     args.add_argument(
         "--rpc-host",
         type=str,
-        required=True,
     )
     args.add_argument(
         "--rpc-port",
         type=int,
-        required=True,
     )
     args.add_argument(
         "--rpc-key",
         type=str,
-        required=True,
     )
     args.add_argument(
         "--number",
@@ -91,31 +99,152 @@ def _parse_args():
         help="example: True / False",
         required=True,
     )
+    args.add_argument(
+        "--input-generator-func",
+        type=str,
+        default="tvm.meta_schedule.testing.default_input_generator",
+    )
+    args.add_argument(
+        "--check-metric-func",
+        type=str,
+        default="tvm.meta_schedule.testing.default_check_metric",
+    )
     parsed = args.parse_args()
     parsed.target = tvm.target.Target(parsed.target)
-    parsed.rpc_config = ms.runner.RPCConfig(
-        tracker_host=parsed.rpc_host,
-        tracker_port=parsed.rpc_port,
-        tracker_key=parsed.rpc_key,
-        session_timeout_sec=600,
-    )
+    if parsed.rpc_host is not None and parsed.rpc_port is not None and parsed.rpc_key is not None:
+        parsed.rpc_config = ms.runner.RPCConfig(
+            tracker_host=parsed.rpc_host,
+            tracker_port=parsed.rpc_port,
+            tracker_key=parsed.rpc_key,
+            session_timeout_sec=600,
+        )
+    else:
+        parsed.rpc_config = None
+        warnings.warn("RPC config is not provided, will use local runner.")
     if parsed.cpu_flush and parsed.target.kind.name != "llvm":
         warnings.warn("cpu_flush is only supported on llvm target")
     return parsed
 
 
+# arg parser
+ARGS = _parse_args()
+
 # logging
 logging.basicConfig(
     format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
 )
 logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
+logging.getLogger("tvm.meta_schedule.runner").setLevel(logging.WARN)
 
-# arg parser
-ARGS = _parse_args()
+
+def get_device_type(target: Target) -> str:
+    """Get the device type string from a target.
+
+    Parameters
+    ----------
+    target : Target
+        The target to get the device type from.
+
+    Returns
+    -------
+    device_type : str
+        The device type string.
+    """
+    if target.kind.name == "llvm":
+        return "cpu"
+    elif target.kind.name == "cuda":
+        return "cuda"
+    else:
+        raise RuntimeError(f"Unsupported target kind for device type: {target.kind.name}")
+
+
+def get_runtime_device(target: Target) -> tvm.runtime.Device:
+    """Get the runtime device from a target.
+
+    Parameters
+    ----------
+    target : Target
+        The target to get the runtime device from.
+
+    Returns
+    -------
+    device : tvm.runtime.Device
+        The runtime device.
+    """
+    if target.kind.name == "llvm":
+        return tvm.cpu()
+    elif target.kind.name == "cuda":
+        return tvm.cuda()
+    else:
+        raise RuntimeError(f"Unsupported target kind for runtime device: {target.kind.name}")
+
+
+def check_and_run(func: Union[str, Callable], *args, **kwargs) -> Any:
+    """Check if the function is a string or a callable, and run it."""
+    if isinstance(func, str):
+        func = get_global_func(func)
+    return func(*args, **kwargs)  # type: ignore
+
+
+class OriginalModule:
+    """Original module class for deduplication."""
+
+    def __init__(self, mod: IRModule):
+        self.mod = mod
+
+    def __eq__(self, __o: "OriginalModule") -> bool:  # type: ignore
+        return tvm.ir.structural_equal(self.mod, __o.mod)
+
+    def __hash__(self) -> int:
+        return tvm.ir.structural_hash(self.mod)
+
+
+def initializer() -> None:
+    """Initializer function to register the functions on PopenWorker."""
+
+    @register_func("tvm.meta_schedule.testing.default_check_metric")
+    def default_check_metric(  # pylint: disable=unused-variable,unreachable-code
+        lhs: List[tvm.nd.NDArray], rhs: List[tvm.nd.NDArray]
+    ) -> bool:
+        """Check if the outputs are equal
+
+        Parameters
+        ----------
+        lhs : List[tvm.nd.NDArray]
+            The first list of NDArrays to compare.
+
+        rhs : List[tvm.nd.NDArray]
+            The second list of NDArrays to compare.
+
+        Returns
+        -------
+        is_equal : bool
+            Whether the two lists of NDArrays are equal.
+        """
+        assert len(lhs) == len(rhs), "Different number of outputs from two modules"
+        for i in range(len(lhs)):  # pylint: disable=consider-using-enumerate
+            if not np.allclose(lhs[i].numpy(), rhs[i].numpy(), rtol=1e-3, atol=2e-3):
+                return False
+        return True
 
 
 @register_func("tvm.meta_schedule.testing.default_input_generator")
-def default_input_generator(mod: IRModule) -> List[tvm.nd.NDArray]:
+def default_input_generator(  # pylint: disable=unused-variable
+    mod: IRModule,
+) -> List[tvm.nd.NDArray]:
+    """Default input generator function
+
+    Parameters
+    ----------
+    mod : IRModule
+        The IRModule to generate the input data for.
+
+    Returns
+    -------
+    inputs : List[tvm.nd.NDArray]
+        The generated input data.
+    """
+
     args_info = ms.arg_info.TensorInfo.from_prim_func(mod["main"])
     inputs = [
         tvm.nd.array(generate_input_data(input_shape=arg_info.shape, input_dtype=arg_info.dtype))
@@ -124,158 +253,522 @@ def default_input_generator(mod: IRModule) -> List[tvm.nd.NDArray]:
     return inputs
 
 
-@register_func("tvm.meta_schedule.testing.default_check_metric")
-def default_check_metric(a: List[tvm.nd.NDArray], b: List[tvm.nd.NDArray]) -> bool:
-    assert len(a) == len(b), "Different number of outputs from two modules"
-    for i, _ in enumerate(a):
-        if not np.allclose(a[i].numpy(), b[i].numpy(), rtol=1e-3, atol=2e-3):
-            return False
-    return True
+def to_numpy(a: List[tvm.nd.NDArray]) -> List[np.ndarray]:
+    """Convert a list of TVM NDArray to a list of numpy array
 
+    Parameters
+    ----------
+    a : List[tvm.nd.NDArray]
+        The list of TVM NDArray to be converted
 
-def validate_correctness(
-    original_mod: IRModule,  # compiled for "baseline_target"
-    scheduled_mod: IRModule,  # compiled for "target"
-    *,
-    baseline_target: Target,
-    target: Target,
-    dev_type: str,
-    rpc_config: ms.runner.RPCConfig,
-    f_input_generator: Union[
-        str, Callable[[IRModule], List[tvm.nd.NDArray]]
-    ] = default_input_generator,
-    f_check_metric: Union[
-        str, Callable[[tvm.nd.NDArray, tvm.nd.NDArray], bool]
-    ] = default_check_metric,
-) -> bool:
-    """Function to validate the correctness of a scheduled module.
+    Returns
+    -------
+    b : List[np.ndarray]
+        The list of numpy array
+    """
+    assert a is not None, "Empty result cannot be converted to numpy"
+    return [x.numpy() for x in a]
+
+
+def to_tvm_ndarray(a: List[np.ndarray]) -> List[tvm.nd.NDArray]:
+    """Convert a list of numpy array to a list of TVM NDArray
 
     Parameters
     ----------
-    original_mod : IRModule
-        The original module to be compiled.
-    scheduled_mod : IRModule
-        The scheduled module to be compiled.
-    baseline_target : Target
-        The baseline target to compile the original module.
-    target : Target
-        The target to compile the scheduled module.
-    dev_type : str
-        The device type to run the module via rpc.
-    rpc_config : RPCConfig
-        The RPCConfig to run the scheduled module.
-    f_input_generator : Union[str, Callable]
-        The function to generate the input data.
-    f_check_metric : Union[str, Callable]
-        The function to check the metric.
+    a : List[np.ndarray]
+        The list of numpy array to be converted.
 
     Returns
     -------
-    result : bool
-        The result of the validation.
+    b : List[tvm.nd.NDArray]
+        The list of TVM NDArray.
     """
+    assert a is not None, "Empty result cannot be converted to TVM NDArray"
+    return [tvm.nd.array(x) for x in a]
 
-    def to_numpy(a: List[tvm.nd.NDArray]) -> List[np.ndarray]:
-        """Convert a list of TVM NDArray to a list of numpy array"""
-        assert a is not None, "Empty result cannot be converted to numpy"
-        return [x.numpy() for x in a]
-
-    def to_tvm_ndarray(a: List[np.ndarray]) -> List[tvm.nd.NDArray]:
-        """Convert a list of numpy array to a list of TVM NDArray"""
-        assert a is not None, "Empty result cannot be converted to TVM NDArray"
-        return [tvm.nd.array(x) for x in a]
-
-    def build_and_run(mod: IRModule, target: Target, dev_type: str) -> np.ndarray:
-        """Build and run the module on the target device."""
-        rt_mod = tvm.build(mod, target=target)
-        return run_module_via_rpc(
-            rpc_config=rpc_config,
-            lib=rt_mod,
-            dev_type=dev_type,
-            args={i: v for i, v in enumerate(inputs)},  # pylint: disable=unnecessary-comprehension
-            continuation=create_calculator(backend="tir"),
-            backend="tir",
-        )
 
-    # fetch functions & prepare inputs
-    if isinstance(f_input_generator, str):
-        f_input_generator = get_global_func(f_input_generator)
-    if isinstance(f_check_metric, str):
-        f_check_metric = get_global_func(f_check_metric)
-    inputs = to_numpy(f_input_generator(original_mod))  # type: ignore
-    # build & run original result
-    original_res = to_numpy(build_and_run(original_mod, target=baseline_target, dev_type="cpu"))
-    scheduled_res = to_numpy(build_and_run(scheduled_mod, target=target, dev_type=dev_type))
-    # check metric
-    if f_check_metric(to_tvm_ndarray(original_res), to_tvm_ndarray(scheduled_res)):  # type: ignore
-        return True
-    else:
-        print(
-            ("\n\n").join(
+def is_failed_record(record: ms.database.TuningRecord) -> bool:
+    """Check if a tuning record is failed.
+
+    Parameters
+    ----------
+    record : TuningRecord
+        The tuning record to check.
+
+    Returns
+    -------
+    is_failed : bool
+    """
+    return len(record.run_secs) == 1 and record.run_secs[0] == 1e9
+
+
+def print_with_counter_func(counter: int, total: int) -> Callable:
+    """Print with counter
+
+    Parameters
+    ----------
+    counter : int
+        The counter to print with.
+    total : int
+        The total number of items to print with.
+
+    Returns
+    -------
+    print_result : Callable
+        The print result function.
+    """
+
+    def print_result(
+        result: str,
+        *,
+        original_mod: IRModule = None,
+        scheduled_mod: IRModule = None,
+        inputs: List[np.ndarray] = None,
+        original_res: List[np.ndarray] = None,
+        scheduled_res: List[np.ndarray] = None,
+        original_run_secs: List[float] = None,
+        scheduled_run_secs: List[float] = None,
+        exception: Exception = None,
+        trace: str = None,
+    ) -> None:
+        """Print the validation result."""
+        status = f"Progress {counter: 6d} / {total: 6d} (estimated) checked, result: {result:>10}, "
+
+        if result in ["pass", "wrong answer"]:
+            status += (
+                f"original: {mean(original_run_secs) * 1e3: 10.3f} ms, "
+                f"scheduled: {mean(scheduled_run_secs) * 1e3: 10.3f} ms"
+            )
+
+        output = [status]
+        if result not in ["pass", "skip"]:
+            output.extend(
                 [
-                    "Validation failed!",
-                    "Original Result:" + DELIMITOR + str(original_res),
-                    "Scheduled Result:" + DELIMITOR + str(scheduled_res),
-                    "Input:" + DELIMITOR + str(inputs),
                     "Original IRModule:" + DELIMITOR + original_mod.script(),
                     "Scheduled IRModule:" + DELIMITOR + scheduled_mod.script(),
+                    "Trace" + DELIMITOR + str(trace),
                 ]
             )
+            if result == "wrong answer":
+                output.extend(
+                    [
+                        "Input:" + DELIMITOR + str(inputs),
+                        "Original Result:" + DELIMITOR + str(original_res),
+                        "Scheduled Result:" + DELIMITOR + str(scheduled_res),
+                        "Max Diff:"
+                        + DELIMITOR
+                        + str(
+                            [
+                                np.max(np.abs(original_res[i] - scheduled_res[i]))
+                                for i in range(len(original_res))
+                            ]
+                        )
+                        + "\n",
+                    ]
+                )
+            elif result == "exception":
+                output.extend(["Exception:" + DELIMITOR + str(exception) + "\n"])
+            else:
+                raise ValueError(f"Unknown result: {result}")
+        print("\n\n".join(output))
+
+    return print_result
+
+
+def make_alloc_arg_and_check(
+    inputs: List[np.ndarray],
+    original_mod: IRModule,
+    scheduled_mod: IRModule,
+    trace: str,
+    original_res: List[np.ndarray],
+    original_run_secs: List[float],
+    print_result: Callable,
+) -> Tuple[Callable, Callable]:
+    """Make alloc_arg and check functions for the given inputs and collect results.
+
+    Parameters
+    ----------
+    inputs : List[np.ndarray]
+        The inputs to the two modules.
+    original_mod : IRModule
+        The original IRModule.
+    scheduled_mod : IRModule
+        The scheduled IRModule.
+    trace : str
+        The trace of the scheduled IRModule.
+    original_res : List[np.ndarray]
+        The original results.
+    original_run_secs : List[float]
+        The original run times.
+    print_result : Callable
+        The print result function.
+
+    Returns
+    -------
+    f_with_args_alloc_argument : Callable
+        The function to allocate arguments.
+
+    f_with_args_run_evaluator : Callable
+        The function to run evaluator.
+    """
+
+    def f_with_args_alloc_argument_common(
+        device: tvm.runtime.Device,
+        args_info: ms.runner.rpc_runner.T_ARG_INFO_JSON_OBJ_LIST,  # pylint: disable=unused-argument
+        alloc_repeat: int,
+    ) -> List[ms.runner.rpc_runner.T_ARGUMENT_LIST]:
+        """Allocate arguments using the given inputs.
+
+        Parameters
+        ----------
+        session : RPCSession
+            The RPC session.
+        device : Device
+            The device.
+        args_info : T_ARG_INFO_JSON_OBJ_LIST
+            argument information.
+        alloc_repeat : int
+            The number of times to repeat the allocation.
+
+        Returns
+        -------
+        args_list : List[T_ARGUMENT_LIST]
+            The list of argument lists.
+        """
+        return [[tvm.nd.array(arg, device=device) for arg in inputs] for _ in range(alloc_repeat)]
+
+    def f_with_args_run_evaluator_common(
+        rt_mod: tvm.runtime.Module,
+        device: tvm.runtime.Device,
+        evaluator_config: ms.runner.EvaluatorConfig,
+        repeated_args: List[ms.runner.rpc_runner.T_ARGUMENT_LIST],
+    ) -> List[float]:
+        """With args function to run the evaluator
+
+        Parameters
+        ----------
+        session : tvm.rpc.RPCSession
+            The RPC session
+        rt_mod: Module
+            The runtime module
+        device: Device
+            The device to run the evaluator
+        evaluator_config: EvaluatorConfig
+            The evaluator config
+        repeated_args: List[T_ARGUMENT_LIST]
+            The repeated arguments
+
+        Returns
+        -------
+        costs: List[float]
+            The evaluator results
+        """
+        evaluator = rt_mod.time_evaluator(
+            func_name=rt_mod.entry_name,
+            dev=device,
+            number=evaluator_config.number,
+            repeat=evaluator_config.repeat,
+            min_repeat_ms=evaluator_config.min_repeat_ms,
+            f_preproc="cache_flush_cpu_non_first_arg"
+            if evaluator_config.enable_cpu_cache_flush
+            else "",
+        )
+
+        repeated_costs: List[List[float]] = []
+        for args in repeated_args:
+            device.sync()
+            profile_result = evaluator(*args)
+            repeated_costs.append(profile_result.results)
+        costs = [float(cost) for cost in itertools.chain.from_iterable(repeated_costs)]
+
+        assert len(repeated_args) == 1, "Only support one set of arguments"
+        scheduled_res = [arg.numpy() for arg in repeated_args[0]]  # type: ignore
+        # fetch comparison function
+        passed = check_and_run(
+            ARGS.check_metric_func,
+            to_tvm_ndarray(original_res),
+            to_tvm_ndarray(scheduled_res),
         )
-        return False
+
+        print_result(
+            result="pass" if passed else "wrong answer",
+            original_mod=original_mod,
+            scheduled_mod=scheduled_mod,
+            trace=trace,
+            inputs=inputs,
+            original_res=original_res,
+            scheduled_res=scheduled_res,
+            original_run_secs=original_run_secs,
+            scheduled_run_secs=costs,
+        )
+
+        return costs
+
+    def f_with_args_alloc_argument_rpc(
+        rpc_session: ms.runner.rpc_runner.RPCSession,  # pylint: disable=unused-argument
+        device: tvm.runtime.Device,
+        args_info: ms.runner.rpc_runner.T_ARG_INFO_JSON_OBJ_LIST,
+        alloc_repeat: int,
+    ) -> List[ms.runner.rpc_runner.T_ARGUMENT_LIST]:
+        return f_with_args_alloc_argument_common(device, args_info, alloc_repeat)
+
+    def f_with_args_run_evaluator_rpc(
+        rpc_session: ms.runner.rpc_runner.RPCSession,  # pylint: disable=unused-argument
+        rt_mod: tvm.runtime.Module,
+        device: tvm.runtime.Device,
+        evaluator_config: ms.runner.EvaluatorConfig,
+        repeated_args: List[ms.runner.rpc_runner.T_ARGUMENT_LIST],
+    ) -> List[float]:
+        return f_with_args_run_evaluator_common(rt_mod, device, evaluator_config, repeated_args)
+
+    if ARGS.rpc_config is None:
+        return f_with_args_alloc_argument_common, f_with_args_run_evaluator_common
+    else:
+        return f_with_args_alloc_argument_rpc, f_with_args_run_evaluator_rpc
+
+
+def local_build_and_run(
+    mod: IRModule,
+    target: Target,
+    device: tvm.runtime.Device,
+    inputs: List[np.ndarray],
+) -> Tuple[List[np.ndarray], List[float]]:
+    """Build and run the module locally.
+
+    Parameters
+    ----------
+    mod: IRModule
+        The module to build and run
+    target: Target
+        The target to build the module
+    device: Device
+        The device to run the module
+    inputs: List[np.ndarray]
+        The inputs to run the module
+
+    Returns
+    -------
+    res: List[np.ndarray]
+        The results of running the module
+    run_secs: List[float]
+        The running time of running the module
+    """
+    # potential memory leak https://github.com/apache/tvm/issues/11096
+    lib = tvm.build(mod, target=target)
+    tvm_inputs = [tvm.nd.array(inp, device=device) for inp in inputs]
+    device.sync()
+    func = lib.time_evaluator(lib.entry_name, dev=device, number=ARGS.number, repeat=ARGS.repeat)
+    benchmark_res = func(*tvm_inputs)
+    device.sync()
+    return [arg.numpy() for arg in tvm_inputs], list(benchmark_res.results)
+
+
+def _check_builder_result(builder_result: ms.builder.BuilderResult) -> None:
+    """Check if the builder result is defined.
+
+    Parameters
+    ----------
+    builder_result: BuilderResult
+        The builder result
+    """
+    assert builder_result.error_msg is None, "Builder failed: " + str(
+        builder_result.error_msg if builder_result.error_msg else "Empty error message"
+    )
+
+
+def _apply_trace(mod: IRModule, trace: Trace) -> IRModule:
+    """Apply the trace to the module.
+
+    Parameters
+    ----------
+    mod: IRModule
+        The module to apply the trace to
+    trace: Trace
+        The trace to apply
+
+    Returns
+    -------
+    mod: IRModule
+        The module with the trace applied
+    """
+    sch = Schedule(mod)
+    trace.apply_to_schedule(sch, remove_postproc=False)
+    return sch.mod
+
+
+def _build_all_mods(
+    mods: List[IRModule], builder: ms.builder.Builder, target: Target
+) -> List[ms.builder.BuilderResult]:
+    """Build all the modules.
+
+    Parameters
+    ----------
+    mods: List[IRModule]
+        The modules to build
+    builder: Builder
+        The builder to build the modules
+    target: Target
+        The target to build the modules
+
+    Returns
+    -------
+    builder_results: List[BuilderResult]
+        The builder results
+    """
+    builder_results = builder.build([ms.builder.BuilderInput(mod, target) for mod in mods])
+    assert len(builder_results) == len(
+        mods
+    ), f"Unexpected number of build results, expected {len(mods)} got {len(builder_results)}"
+    return builder_results
+
+
+def _run_single_mod(
+    builder_result: ms.builder.BuilderResult,
+    runner: ms.runner.Runner,
+    dev_type: str,
+) -> None:
+    """Run a single module.
+
+    Parameters
+    ----------
+    builder_result: BuilderResult
+        The builder result
+    runner: Runner
+        The runner to run the module
+    dev_type: str
+        The device type
+    """
+    runner_futures = runner.run(
+        # arginfo is not used in this case so we can pass an empty list
+        [ms.runner.RunnerInput(builder_result.artifact_path, device_type=dev_type, args_info=[])]
+    )
+    assert (
+        len(runner_futures) == 1
+    ), f"Unexpected number of runner futures, expected 1 got {len(runner_futures)}"
+    (runner_future,) = runner_futures  # pylint: disable=unbalanced-tuple-unpacking
+    runner_res = runner_future.result()
+    assert runner_res.error_msg is None, "Runner failed: " + (
+        runner_res.error_msg if runner_res.error_msg else "Empty error message"
+    )
 
 
 def main():
     """Main function"""
     describe()
-    database = ms.database.create(work_dir=ARGS.work_dir)
-    target = ARGS.target
-    if target.kind.name == "llvm":
-        dev_type = "cpu"
-    elif target.kind.name == "cuda":
-        dev_type = "cuda"
-    else:
-        raise RuntimeError(f"Unsupported target kind: {target.kind.name}")
-    records = database.get_all_tuning_records()
     with ms.Profiler() as profiler:
-        for i, record in enumerate(records):
-            scope_name = f"validate #{i}"
-            with profiler.timeit(scope_name):
-                original_mod = record.workload.mod
-                sch = Schedule(original_mod)
-                record.trace.apply_to_schedule(sch=sch, remove_postproc=False)
-                scheduled_mod = sch.mod
-                is_success = False
+        # initialize
+        target = ARGS.target
+        dev_type = get_device_type(target)
+        builder = ms.builder.LocalBuilder()
+        database = ms.database.create(work_dir=ARGS.work_dir)
+
+        # collect records
+        with profiler.timeit("collect records"):
+            records = database.get_all_tuning_records()
+        total = len(records)
+        print(
+            f"Total {total} records to be validated. "
+            f"Collected in {float(profiler.get()['collect records']): 3.3f} sec."
+        )
+
+        # collect unique original TIR
+        with profiler.timeit("deduplicate records"):
+            workloads = set()
+            for record in records:
+                workloads.add(OriginalModule(record.workload.mod))
+        print(
+            f"Total {len(workloads)} unique original TIR to validate. "
+            f"Deduplicated in {float(profiler.get()['deduplicate records']): 3.3f} sec."
+        )
+        if ARGS.top_k < 10**9:
+            print(f"Top {ARGS.top_k} records for each original TIR will be validated.")
+            total = len(workloads) * ARGS.top_k
+        print()
+
+        # validate correctness
+        counter = 0
+        for item in workloads:
+            original_mod = item.mod
+            records = database.get_top_k(
+                workload=database.commit_workload(original_mod), top_k=ARGS.top_k
+            )
+            if len(records) < ARGS.top_k:
+                total -= ARGS.top_k - len(records)
+            inputs = to_numpy(check_and_run(ARGS.input_generator_func, original_mod))
+            original_res, original_run_secs = local_build_and_run(
+                original_mod,
+                target=ARGS.baseline_target,
+                inputs=inputs,
+                device=get_runtime_device(ARGS.baseline_target),
+            )
+            scheduled_mods = [_apply_trace(original_mod, record.trace) for record in records]
+            builder_results = _build_all_mods(scheduled_mods, builder, target)  # type: ignore
+            for i, record in enumerate(records):
+                counter += 1
+                print_result = print_with_counter_func(counter=counter, total=total)
+                if is_failed_record(record):
+                    # skip failed records where run_secs is 1e9
+                    # these records are only negative samples for cost model
+                    print_result(result="skip")
+                    continue
                 try:
-                    is_success = validate_correctness(
-                        original_mod=original_mod,
-                        scheduled_mod=scheduled_mod,
-                        target=target,
-                        baseline_target=ARGS.baseline_target,
-                        dev_type=dev_type,
-                        rpc_config=ARGS.rpc_config,
+                    # prepare scheduled module
+                    scheduled_mod = scheduled_mods[i]
+                    # check build result
+                    builder_result = builder_results[i]
+                    _check_builder_result(builder_result)
+                    # fetch functions
+                    (
+                        f_with_args_alloc_argument,
+                        f_with_args_run_evaluator,
+                    ) = make_alloc_arg_and_check(
+                        inputs,
+                        original_mod,
+                        scheduled_mod,
+                        str(record.trace),
+                        original_res=original_res,
+                        original_run_secs=original_run_secs,
+                        print_result=print_result,
                     )
-                except Exception as e:  # pylint: disable=broad-except, invalid-name
-                    print(
-                        ("\n\n").join(
-                            [
-                                "Validation failed!",
-                                "Original IRModule:" + DELIMITOR + original_mod.script(),
-                                "Scheduled IRModule:" + DELIMITOR + scheduled_mod.script(),
-                                "Exception" + DELIMITOR + str(e),
-                            ]
-                        )
+                    # create runner
+                    evaluator_config = ms.runner.EvaluatorConfig(
+                        number=ARGS.number,
+                        repeat=ARGS.repeat,
+                        min_repeat_ms=ARGS.min_repeat_ms,
+                        enable_cpu_cache_flush=ARGS.cpu_flush,
                     )
-            if is_success:
-                print(
-                    f"Progress {i+1: 6d} / {len(records): 6d} checked,"
-                    f" used {float(profiler.get()[scope_name]): 3.3f} sec."
-                )
-            else:
-                return
+                    if ARGS.rpc_config is not None:
+                        runner: ms.Runner = ms.runner.RPCRunner(  # type: ignore
+                            ARGS.rpc_config,
+                            evaluator_config=evaluator_config,
+                            alloc_repeat=1,
+                            f_alloc_argument=f_with_args_alloc_argument,
+                            f_run_evaluator=f_with_args_run_evaluator,
+                            initializer=initializer,
+                        )
+                    else:
+                        runner: ms.Runner = ms.runner.LocalRunner(  # type: ignore
+                            evaluator_config=evaluator_config,
+                            alloc_repeat=1,
+                            f_alloc_argument=f_with_args_alloc_argument,
+                            f_run_evaluator=f_with_args_run_evaluator,
+                            initializer=initializer,
+                        )
 
-    print("Validation passed!")
-    print(f"Total time spent: {float(profiler.get()['Total']): 3.3f} sec.")
+                    # run and validate
+                    _run_single_mod(builder_result, runner, dev_type)  # type: ignore
+                except Exception as e:  # pylint: disable=broad-except, invalid-name
+                    # validation failed with exception
+                    print_result(
+                        result="exception",
+                        original_mod=original_mod,
+                        scheduled_mod=scheduled_mod,
+                        trace=str(record.trace),
+                        exception=e,
+                    )
+                # clean up
+                remove_build_dir(builder_result.artifact_path)
+    print(f"Validation finished! Total time spent: {float(profiler.get()['Total']): 3.3f} sec.")
 
 
 if __name__ == "__main__":
diff --git a/src/meta_schedule/measure_callback/remove_build_artifact.cc b/src/meta_schedule/measure_callback/remove_build_artifact.cc
index 0abbebf3b484..41e52adbae99 100644
--- a/src/meta_schedule/measure_callback/remove_build_artifact.cc
+++ b/src/meta_schedule/measure_callback/remove_build_artifact.cc
@@ -28,7 +28,7 @@ class RemoveBuildArtifactNode : public MeasureCallbackNode {
              const Array<BuilderResult>& builder_results,
              const Array<RunnerResult>& runner_results) final {
     static const PackedFunc* f_rm = runtime::Registry::Get("meta_schedule.remove_build_dir");
-    ICHECK(*f_rm != nullptr) << "The `remove_build_dir` func is not in tvm registry.";
+    ICHECK(f_rm != nullptr) << "The `remove_build_dir` func is not in tvm registry.";
     auto _ = Profiler::TimedScope("MeasureCallback/RemoveBuildArtifact");
     for (const BuilderResult& build_result : builder_results) {
       if (Optional<String> path = build_result->artifact_path) {

From 5b1a1e3d3931d2d17842b96e563f4162623290d1 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Mon, 28 Nov 2022 14:01:49 -0800
Subject: [PATCH 668/704] [skip ci][ci][wasm] Add package-lock.json to git
 (#13505)

This fixes the broken wasm build (e.g.
https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4810/pipeline/)
and adds the `package-lock.json` file to the repo to avoid these kinds
of issues in the future.
---
 web/.gitignore        |    1 -
 web/package-lock.json | 6724 +++++++++++++++++++++++++++++++++++++++++
 web/package.json      |    3 +-
 3 files changed, 6726 insertions(+), 2 deletions(-)
 create mode 100644 web/package-lock.json

diff --git a/web/.gitignore b/web/.gitignore
index a3135cf24b9d..082c5a26770b 100644
--- a/web/.gitignore
+++ b/web/.gitignore
@@ -2,5 +2,4 @@
 *~
 out
 node_modules
-package-lock.json
 build
diff --git a/web/package-lock.json b/web/package-lock.json
new file mode 100644
index 000000000000..7032e318877c
--- /dev/null
+++ b/web/package-lock.json
@@ -0,0 +1,6724 @@
+{
+  "name": "tvmjs",
+  "version": "0.11.0-dev0",
+  "lockfileVersion": 1,
+  "requires": true,
+  "dependencies": {
+    "@ampproject/remapping": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.2.0.tgz",
+      "integrity": "sha512-qRmjj8nj9qmLTQXXmaR1cck3UXSRMPrbsLJAasZpF+t3riI71BXed5ebIOYwQntykeZuhjsdweEc9BxH5Jc26w==",
+      "dev": true,
+      "requires": {
+        "@jridgewell/gen-mapping": "^0.1.0",
+        "@jridgewell/trace-mapping": "^0.3.9"
+      }
+    },
+    "@babel/code-frame": {
+      "version": "7.18.6",
+      "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.18.6.tgz",
+      "integrity": "sha512-TDCmlK5eOvH+eH7cdAFlNXeVJqWIQ7gW9tY1GJIpUtFb6CmjVyq2VM3u71bOyR8CRihcCgMUYoDNyLXao3+70Q==",
+      "dev": true,
+      "requires": {
+        "@babel/highlight": "^7.18.6"
+      }
+    },
+    "@babel/compat-data": {
+      "version": "7.20.5",
+      "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.20.5.tgz",
+      "integrity": "sha512-KZXo2t10+/jxmkhNXc7pZTqRvSOIvVv/+lJwHS+B2rErwOyjuVRh60yVpb7liQ1U5t7lLJ1bz+t8tSypUZdm0g==",
+      "dev": true
+    },
+    "@babel/core": {
+      "version": "7.20.5",
+      "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.20.5.tgz",
+      "integrity": "sha512-UdOWmk4pNWTm/4DlPUl/Pt4Gz4rcEMb7CY0Y3eJl5Yz1vI8ZJGmHWaVE55LoxRjdpx0z259GE9U5STA9atUinQ==",
+      "dev": true,
+      "requires": {
+        "@ampproject/remapping": "^2.1.0",
+        "@babel/code-frame": "^7.18.6",
+        "@babel/generator": "^7.20.5",
+        "@babel/helper-compilation-targets": "^7.20.0",
+        "@babel/helper-module-transforms": "^7.20.2",
+        "@babel/helpers": "^7.20.5",
+        "@babel/parser": "^7.20.5",
+        "@babel/template": "^7.18.10",
+        "@babel/traverse": "^7.20.5",
+        "@babel/types": "^7.20.5",
+        "convert-source-map": "^1.7.0",
+        "debug": "^4.1.0",
+        "gensync": "^1.0.0-beta.2",
+        "json5": "^2.2.1",
+        "semver": "^6.3.0"
+      },
+      "dependencies": {
+        "semver": {
+          "version": "6.3.0",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
+          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==",
+          "dev": true
+        }
+      }
+    },
+    "@babel/generator": {
+      "version": "7.20.5",
+      "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.20.5.tgz",
+      "integrity": "sha512-jl7JY2Ykn9S0yj4DQP82sYvPU+T3g0HFcWTqDLqiuA9tGRNIj9VfbtXGAYTTkyNEnQk1jkMGOdYka8aG/lulCA==",
+      "dev": true,
+      "requires": {
+        "@babel/types": "^7.20.5",
+        "@jridgewell/gen-mapping": "^0.3.2",
+        "jsesc": "^2.5.1"
+      },
+      "dependencies": {
+        "@jridgewell/gen-mapping": {
+          "version": "0.3.2",
+          "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.2.tgz",
+          "integrity": "sha512-mh65xKQAzI6iBcFzwv28KVWSmCkdRBWoOh+bYQGW3+6OZvbbN3TqMGo5hqYxQniRcH9F2VZIoJCm4pa3BPDK/A==",
+          "dev": true,
+          "requires": {
+            "@jridgewell/set-array": "^1.0.1",
+            "@jridgewell/sourcemap-codec": "^1.4.10",
+            "@jridgewell/trace-mapping": "^0.3.9"
+          }
+        }
+      }
+    },
+    "@babel/helper-compilation-targets": {
+      "version": "7.20.0",
+      "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.20.0.tgz",
+      "integrity": "sha512-0jp//vDGp9e8hZzBc6N/KwA5ZK3Wsm/pfm4CrY7vzegkVxc65SgSn6wYOnwHe9Js9HRQ1YTCKLGPzDtaS3RoLQ==",
+      "dev": true,
+      "requires": {
+        "@babel/compat-data": "^7.20.0",
+        "@babel/helper-validator-option": "^7.18.6",
+        "browserslist": "^4.21.3",
+        "semver": "^6.3.0"
+      },
+      "dependencies": {
+        "semver": {
+          "version": "6.3.0",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
+          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==",
+          "dev": true
+        }
+      }
+    },
+    "@babel/helper-environment-visitor": {
+      "version": "7.18.9",
+      "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.18.9.tgz",
+      "integrity": "sha512-3r/aACDJ3fhQ/EVgFy0hpj8oHyHpQc+LPtJoY9SzTThAsStm4Ptegq92vqKoE3vD706ZVFWITnMnxucw+S9Ipg==",
+      "dev": true
+    },
+    "@babel/helper-function-name": {
+      "version": "7.19.0",
+      "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.19.0.tgz",
+      "integrity": "sha512-WAwHBINyrpqywkUH0nTnNgI5ina5TFn85HKS0pbPDfxFfhyR/aNQEn4hGi1P1JyT//I0t4OgXUlofzWILRvS5w==",
+      "dev": true,
+      "requires": {
+        "@babel/template": "^7.18.10",
+        "@babel/types": "^7.19.0"
+      }
+    },
+    "@babel/helper-hoist-variables": {
+      "version": "7.18.6",
+      "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.18.6.tgz",
+      "integrity": "sha512-UlJQPkFqFULIcyW5sbzgbkxn2FKRgwWiRexcuaR8RNJRy8+LLveqPjwZV/bwrLZCN0eUHD/x8D0heK1ozuoo6Q==",
+      "dev": true,
+      "requires": {
+        "@babel/types": "^7.18.6"
+      }
+    },
+    "@babel/helper-module-imports": {
+      "version": "7.18.6",
+      "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.18.6.tgz",
+      "integrity": "sha512-0NFvs3VkuSYbFi1x2Vd6tKrywq+z/cLeYC/RJNFrIX/30Bf5aiGYbtvGXolEktzJH8o5E5KJ3tT+nkxuuZFVlA==",
+      "dev": true,
+      "requires": {
+        "@babel/types": "^7.18.6"
+      }
+    },
+    "@babel/helper-module-transforms": {
+      "version": "7.20.2",
+      "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.20.2.tgz",
+      "integrity": "sha512-zvBKyJXRbmK07XhMuujYoJ48B5yvvmM6+wcpv6Ivj4Yg6qO7NOZOSnvZN9CRl1zz1Z4cKf8YejmCMh8clOoOeA==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-environment-visitor": "^7.18.9",
+        "@babel/helper-module-imports": "^7.18.6",
+        "@babel/helper-simple-access": "^7.20.2",
+        "@babel/helper-split-export-declaration": "^7.18.6",
+        "@babel/helper-validator-identifier": "^7.19.1",
+        "@babel/template": "^7.18.10",
+        "@babel/traverse": "^7.20.1",
+        "@babel/types": "^7.20.2"
+      }
+    },
+    "@babel/helper-plugin-utils": {
+      "version": "7.20.2",
+      "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.20.2.tgz",
+      "integrity": "sha512-8RvlJG2mj4huQ4pZ+rU9lqKi9ZKiRmuvGuM2HlWmkmgOhbs6zEAw6IEiJ5cQqGbDzGZOhwuOQNtZMi/ENLjZoQ==",
+      "dev": true
+    },
+    "@babel/helper-simple-access": {
+      "version": "7.20.2",
+      "resolved": "https://registry.npmjs.org/@babel/helper-simple-access/-/helper-simple-access-7.20.2.tgz",
+      "integrity": "sha512-+0woI/WPq59IrqDYbVGfshjT5Dmk/nnbdpcF8SnMhhXObpTq2KNBdLFRFrkVdbDOyUmHBCxzm5FHV1rACIkIbA==",
+      "dev": true,
+      "requires": {
+        "@babel/types": "^7.20.2"
+      }
+    },
+    "@babel/helper-split-export-declaration": {
+      "version": "7.18.6",
+      "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.18.6.tgz",
+      "integrity": "sha512-bde1etTx6ZyTmobl9LLMMQsaizFVZrquTEHOqKeQESMKo4PlObf+8+JA25ZsIpZhT/WEd39+vOdLXAFG/nELpA==",
+      "dev": true,
+      "requires": {
+        "@babel/types": "^7.18.6"
+      }
+    },
+    "@babel/helper-string-parser": {
+      "version": "7.19.4",
+      "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.19.4.tgz",
+      "integrity": "sha512-nHtDoQcuqFmwYNYPz3Rah5ph2p8PFeFCsZk9A/48dPc/rGocJ5J3hAAZ7pb76VWX3fZKu+uEr/FhH5jLx7umrw==",
+      "dev": true
+    },
+    "@babel/helper-validator-identifier": {
+      "version": "7.19.1",
+      "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.19.1.tgz",
+      "integrity": "sha512-awrNfaMtnHUr653GgGEs++LlAvW6w+DcPrOliSMXWCKo597CwL5Acf/wWdNkf/tfEQE3mjkeD1YOVZOUV/od1w==",
+      "dev": true
+    },
+    "@babel/helper-validator-option": {
+      "version": "7.18.6",
+      "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.18.6.tgz",
+      "integrity": "sha512-XO7gESt5ouv/LRJdrVjkShckw6STTaB7l9BrpBaAHDeF5YZT+01PCwmR0SJHnkW6i8OwW/EVWRShfi4j2x+KQw==",
+      "dev": true
+    },
+    "@babel/helpers": {
+      "version": "7.20.6",
+      "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.20.6.tgz",
+      "integrity": "sha512-Pf/OjgfgFRW5bApskEz5pvidpim7tEDPlFtKcNRXWmfHGn9IEI2W2flqRQXTFb7gIPTyK++N6rVHuwKut4XK6w==",
+      "dev": true,
+      "requires": {
+        "@babel/template": "^7.18.10",
+        "@babel/traverse": "^7.20.5",
+        "@babel/types": "^7.20.5"
+      }
+    },
+    "@babel/highlight": {
+      "version": "7.18.6",
+      "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.18.6.tgz",
+      "integrity": "sha512-u7stbOuYjaPezCuLj29hNW1v64M2Md2qupEKP1fHc7WdOA3DgLh37suiSrZYY7haUB7iBeQZ9P1uiRF359do3g==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-validator-identifier": "^7.18.6",
+        "chalk": "^2.0.0",
+        "js-tokens": "^4.0.0"
+      }
+    },
+    "@babel/parser": {
+      "version": "7.20.5",
+      "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.20.5.tgz",
+      "integrity": "sha512-r27t/cy/m9uKLXQNWWebeCUHgnAZq0CpG1OwKRxzJMP1vpSU4bSIK2hq+/cp0bQxetkXx38n09rNu8jVkcK/zA==",
+      "dev": true
+    },
+    "@babel/plugin-syntax-async-generators": {
+      "version": "7.8.4",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-async-generators/-/plugin-syntax-async-generators-7.8.4.tgz",
+      "integrity": "sha512-tycmZxkGfZaxhMRbXlPXuVFpdWlXpir2W4AMhSJgRKzk/eDlIXOhb2LHWoLpDF7TEHylV5zNhykX6KAgHJmTNw==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.8.0"
+      }
+    },
+    "@babel/plugin-syntax-bigint": {
+      "version": "7.8.3",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-bigint/-/plugin-syntax-bigint-7.8.3.tgz",
+      "integrity": "sha512-wnTnFlG+YxQm3vDxpGE57Pj0srRU4sHE/mDkt1qv2YJJSeUAec2ma4WLUnUPeKjyrfntVwe/N6dCXpU+zL3Npg==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.8.0"
+      }
+    },
+    "@babel/plugin-syntax-class-properties": {
+      "version": "7.12.13",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-class-properties/-/plugin-syntax-class-properties-7.12.13.tgz",
+      "integrity": "sha512-fm4idjKla0YahUNgFNLCB0qySdsoPiZP3iQE3rky0mBUtMZ23yDJ9SJdg6dXTSDnulOVqiF3Hgr9nbXvXTQZYA==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.12.13"
+      }
+    },
+    "@babel/plugin-syntax-import-meta": {
+      "version": "7.10.4",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-import-meta/-/plugin-syntax-import-meta-7.10.4.tgz",
+      "integrity": "sha512-Yqfm+XDx0+Prh3VSeEQCPU81yC+JWZ2pDPFSS4ZdpfZhp4MkFMaDC1UqseovEKwSUpnIL7+vK+Clp7bfh0iD7g==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.10.4"
+      }
+    },
+    "@babel/plugin-syntax-json-strings": {
+      "version": "7.8.3",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-json-strings/-/plugin-syntax-json-strings-7.8.3.tgz",
+      "integrity": "sha512-lY6kdGpWHvjoe2vk4WrAapEuBR69EMxZl+RoGRhrFGNYVK8mOPAW8VfbT/ZgrFbXlDNiiaxQnAtgVCZ6jv30EA==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.8.0"
+      }
+    },
+    "@babel/plugin-syntax-logical-assignment-operators": {
+      "version": "7.10.4",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-logical-assignment-operators/-/plugin-syntax-logical-assignment-operators-7.10.4.tgz",
+      "integrity": "sha512-d8waShlpFDinQ5MtvGU9xDAOzKH47+FFoney2baFIoMr952hKOLp1HR7VszoZvOsV/4+RRszNY7D17ba0te0ig==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.10.4"
+      }
+    },
+    "@babel/plugin-syntax-nullish-coalescing-operator": {
+      "version": "7.8.3",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-nullish-coalescing-operator/-/plugin-syntax-nullish-coalescing-operator-7.8.3.tgz",
+      "integrity": "sha512-aSff4zPII1u2QD7y+F8oDsz19ew4IGEJg9SVW+bqwpwtfFleiQDMdzA/R+UlWDzfnHFCxxleFT0PMIrR36XLNQ==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.8.0"
+      }
+    },
+    "@babel/plugin-syntax-numeric-separator": {
+      "version": "7.10.4",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-numeric-separator/-/plugin-syntax-numeric-separator-7.10.4.tgz",
+      "integrity": "sha512-9H6YdfkcK/uOnY/K7/aA2xpzaAgkQn37yzWUMRK7OaPOqOpGS1+n0H5hxT9AUw9EsSjPW8SVyMJwYRtWs3X3ug==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.10.4"
+      }
+    },
+    "@babel/plugin-syntax-object-rest-spread": {
+      "version": "7.8.3",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-object-rest-spread/-/plugin-syntax-object-rest-spread-7.8.3.tgz",
+      "integrity": "sha512-XoqMijGZb9y3y2XskN+P1wUGiVwWZ5JmoDRwx5+3GmEplNyVM2s2Dg8ILFQm8rWM48orGy5YpI5Bl8U1y7ydlA==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.8.0"
+      }
+    },
+    "@babel/plugin-syntax-optional-catch-binding": {
+      "version": "7.8.3",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-optional-catch-binding/-/plugin-syntax-optional-catch-binding-7.8.3.tgz",
+      "integrity": "sha512-6VPD0Pc1lpTqw0aKoeRTMiB+kWhAoT24PA+ksWSBrFtl5SIRVpZlwN3NNPQjehA2E/91FV3RjLWoVTglWcSV3Q==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.8.0"
+      }
+    },
+    "@babel/plugin-syntax-optional-chaining": {
+      "version": "7.8.3",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-optional-chaining/-/plugin-syntax-optional-chaining-7.8.3.tgz",
+      "integrity": "sha512-KoK9ErH1MBlCPxV0VANkXW2/dw4vlbGDrFgz8bmUsBGYkFRcbRwMh6cIJubdPrkxRwuGdtCk0v/wPTKbQgBjkg==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.8.0"
+      }
+    },
+    "@babel/plugin-syntax-top-level-await": {
+      "version": "7.14.5",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-top-level-await/-/plugin-syntax-top-level-await-7.14.5.tgz",
+      "integrity": "sha512-hx++upLv5U1rgYfwe1xBQUhRmU41NEvpUvrp8jkrSCdvGSnM5/qdRMtylJ6PG5OFkBaHkbTAKTnd3/YyESRHFw==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.14.5"
+      }
+    },
+    "@babel/template": {
+      "version": "7.18.10",
+      "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.18.10.tgz",
+      "integrity": "sha512-TI+rCtooWHr3QJ27kJxfjutghu44DLnasDMwpDqCXVTal9RLp3RSYNh4NdBrRP2cQAoG9A8juOQl6P6oZG4JxA==",
+      "dev": true,
+      "requires": {
+        "@babel/code-frame": "^7.18.6",
+        "@babel/parser": "^7.18.10",
+        "@babel/types": "^7.18.10"
+      }
+    },
+    "@babel/traverse": {
+      "version": "7.20.5",
+      "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.20.5.tgz",
+      "integrity": "sha512-WM5ZNN3JITQIq9tFZaw1ojLU3WgWdtkxnhM1AegMS+PvHjkM5IXjmYEGY7yukz5XS4sJyEf2VzWjI8uAavhxBQ==",
+      "dev": true,
+      "requires": {
+        "@babel/code-frame": "^7.18.6",
+        "@babel/generator": "^7.20.5",
+        "@babel/helper-environment-visitor": "^7.18.9",
+        "@babel/helper-function-name": "^7.19.0",
+        "@babel/helper-hoist-variables": "^7.18.6",
+        "@babel/helper-split-export-declaration": "^7.18.6",
+        "@babel/parser": "^7.20.5",
+        "@babel/types": "^7.20.5",
+        "debug": "^4.1.0",
+        "globals": "^11.1.0"
+      },
+      "dependencies": {
+        "globals": {
+          "version": "11.12.0",
+          "resolved": "https://registry.npmjs.org/globals/-/globals-11.12.0.tgz",
+          "integrity": "sha512-WOBp/EEGUiIsJSp7wcv/y6MO+lV9UoncWqxuFfm8eBwzWNgyfBd6Gz+IeKQ9jCmyhoH99g15M3T+QaVHFjizVA==",
+          "dev": true
+        }
+      }
+    },
+    "@babel/types": {
+      "version": "7.20.5",
+      "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.20.5.tgz",
+      "integrity": "sha512-c9fst/h2/dcF7H+MJKZ2T0KjEQ8hY/BNnDk/H3XY8C4Aw/eWQXWn/lWntHF9ooUBnGmEvbfGrTgLWc+um0YDUg==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-string-parser": "^7.19.4",
+        "@babel/helper-validator-identifier": "^7.19.1",
+        "to-fast-properties": "^2.0.0"
+      }
+    },
+    "@bcoe/v8-coverage": {
+      "version": "0.2.3",
+      "resolved": "https://registry.npmjs.org/@bcoe/v8-coverage/-/v8-coverage-0.2.3.tgz",
+      "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==",
+      "dev": true
+    },
+    "@cnakazawa/watch": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/@cnakazawa/watch/-/watch-1.0.4.tgz",
+      "integrity": "sha512-v9kIhKwjeZThiWrLmj0y17CWoyddASLj9O2yvbZkbvw/N3rWOYy9zkV66ursAoVr0mV15bL8g0c4QZUE6cdDoQ==",
+      "dev": true,
+      "requires": {
+        "exec-sh": "^0.3.2",
+        "minimist": "^1.2.0"
+      }
+    },
+    "@istanbuljs/load-nyc-config": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@istanbuljs/load-nyc-config/-/load-nyc-config-1.1.0.tgz",
+      "integrity": "sha512-VjeHSlIzpv/NyD3N0YuHfXOPDIixcA1q2ZV98wsMqcYlPmv2n3Yb2lYP9XMElnaFVXg5A7YLTeLu6V84uQDjmQ==",
+      "dev": true,
+      "requires": {
+        "camelcase": "^5.3.1",
+        "find-up": "^4.1.0",
+        "get-package-type": "^0.1.0",
+        "js-yaml": "^3.13.1",
+        "resolve-from": "^5.0.0"
+      },
+      "dependencies": {
+        "resolve-from": {
+          "version": "5.0.0",
+          "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-5.0.0.tgz",
+          "integrity": "sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==",
+          "dev": true
+        }
+      }
+    },
+    "@istanbuljs/schema": {
+      "version": "0.1.3",
+      "resolved": "https://registry.npmjs.org/@istanbuljs/schema/-/schema-0.1.3.tgz",
+      "integrity": "sha512-ZXRY4jNvVgSVQ8DL3LTcakaAtXwTVUxE81hslsyD2AtoXW/wVob10HkOJ1X/pAlcI7D+2YoZKg5do8G/w6RYgA==",
+      "dev": true
+    },
+    "@jest/console": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/@jest/console/-/console-26.6.2.tgz",
+      "integrity": "sha512-IY1R2i2aLsLr7Id3S6p2BA82GNWryt4oSvEXLAKc+L2zdi89dSkE8xC1C+0kpATG4JhBJREnQOH7/zmccM2B0g==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "@types/node": "*",
+        "chalk": "^4.0.0",
+        "jest-message-util": "^26.6.2",
+        "jest-util": "^26.6.2",
+        "slash": "^3.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "@jest/core": {
+      "version": "26.6.3",
+      "resolved": "https://registry.npmjs.org/@jest/core/-/core-26.6.3.tgz",
+      "integrity": "sha512-xvV1kKbhfUqFVuZ8Cyo+JPpipAHHAV3kcDBftiduK8EICXmTFddryy3P7NfZt8Pv37rA9nEJBKCCkglCPt/Xjw==",
+      "dev": true,
+      "requires": {
+        "@jest/console": "^26.6.2",
+        "@jest/reporters": "^26.6.2",
+        "@jest/test-result": "^26.6.2",
+        "@jest/transform": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "@types/node": "*",
+        "ansi-escapes": "^4.2.1",
+        "chalk": "^4.0.0",
+        "exit": "^0.1.2",
+        "graceful-fs": "^4.2.4",
+        "jest-changed-files": "^26.6.2",
+        "jest-config": "^26.6.3",
+        "jest-haste-map": "^26.6.2",
+        "jest-message-util": "^26.6.2",
+        "jest-regex-util": "^26.0.0",
+        "jest-resolve": "^26.6.2",
+        "jest-resolve-dependencies": "^26.6.3",
+        "jest-runner": "^26.6.3",
+        "jest-runtime": "^26.6.3",
+        "jest-snapshot": "^26.6.2",
+        "jest-util": "^26.6.2",
+        "jest-validate": "^26.6.2",
+        "jest-watcher": "^26.6.2",
+        "micromatch": "^4.0.2",
+        "p-each-series": "^2.1.0",
+        "rimraf": "^3.0.0",
+        "slash": "^3.0.0",
+        "strip-ansi": "^6.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "rimraf": {
+          "version": "3.0.2",
+          "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
+          "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==",
+          "dev": true,
+          "requires": {
+            "glob": "^7.1.3"
+          }
+        },
+        "strip-ansi": {
+          "version": "6.0.1",
+          "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+          "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+          "dev": true,
+          "requires": {
+            "ansi-regex": "^5.0.1"
+          }
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "@jest/environment": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/@jest/environment/-/environment-26.6.2.tgz",
+      "integrity": "sha512-nFy+fHl28zUrRsCeMB61VDThV1pVTtlEokBRgqPrcT1JNq4yRNIyTHfyht6PqtUvY9IsuLGTrbG8kPXjSZIZwA==",
+      "dev": true,
+      "requires": {
+        "@jest/fake-timers": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "@types/node": "*",
+        "jest-mock": "^26.6.2"
+      }
+    },
+    "@jest/fake-timers": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/@jest/fake-timers/-/fake-timers-26.6.2.tgz",
+      "integrity": "sha512-14Uleatt7jdzefLPYM3KLcnUl1ZNikaKq34enpb5XG9i81JpppDb5muZvonvKyrl7ftEHkKS5L5/eB/kxJ+bvA==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "@sinonjs/fake-timers": "^6.0.1",
+        "@types/node": "*",
+        "jest-message-util": "^26.6.2",
+        "jest-mock": "^26.6.2",
+        "jest-util": "^26.6.2"
+      }
+    },
+    "@jest/globals": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/@jest/globals/-/globals-26.6.2.tgz",
+      "integrity": "sha512-85Ltnm7HlB/KesBUuALwQ68YTU72w9H2xW9FjZ1eL1U3lhtefjjl5c2MiUbpXt/i6LaPRvoOFJ22yCBSfQ0JIA==",
+      "dev": true,
+      "requires": {
+        "@jest/environment": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "expect": "^26.6.2"
+      }
+    },
+    "@jest/reporters": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/@jest/reporters/-/reporters-26.6.2.tgz",
+      "integrity": "sha512-h2bW53APG4HvkOnVMo8q3QXa6pcaNt1HkwVsOPMBV6LD/q9oSpxNSYZQYkAnjdMjrJ86UuYeLo+aEZClV6opnw==",
+      "dev": true,
+      "requires": {
+        "@bcoe/v8-coverage": "^0.2.3",
+        "@jest/console": "^26.6.2",
+        "@jest/test-result": "^26.6.2",
+        "@jest/transform": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "chalk": "^4.0.0",
+        "collect-v8-coverage": "^1.0.0",
+        "exit": "^0.1.2",
+        "glob": "^7.1.2",
+        "graceful-fs": "^4.2.4",
+        "istanbul-lib-coverage": "^3.0.0",
+        "istanbul-lib-instrument": "^4.0.3",
+        "istanbul-lib-report": "^3.0.0",
+        "istanbul-lib-source-maps": "^4.0.0",
+        "istanbul-reports": "^3.0.2",
+        "jest-haste-map": "^26.6.2",
+        "jest-resolve": "^26.6.2",
+        "jest-util": "^26.6.2",
+        "jest-worker": "^26.6.2",
+        "node-notifier": "^8.0.0",
+        "slash": "^3.0.0",
+        "source-map": "^0.6.0",
+        "string-length": "^4.0.1",
+        "terminal-link": "^2.0.0",
+        "v8-to-istanbul": "^7.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "@jest/source-map": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/@jest/source-map/-/source-map-26.6.2.tgz",
+      "integrity": "sha512-YwYcCwAnNmOVsZ8mr3GfnzdXDAl4LaenZP5z+G0c8bzC9/dugL8zRmxZzdoTl4IaS3CryS1uWnROLPFmb6lVvA==",
+      "dev": true,
+      "requires": {
+        "callsites": "^3.0.0",
+        "graceful-fs": "^4.2.4",
+        "source-map": "^0.6.0"
+      }
+    },
+    "@jest/test-result": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/@jest/test-result/-/test-result-26.6.2.tgz",
+      "integrity": "sha512-5O7H5c/7YlojphYNrK02LlDIV2GNPYisKwHm2QTKjNZeEzezCbwYs9swJySv2UfPMyZ0VdsmMv7jIlD/IKYQpQ==",
+      "dev": true,
+      "requires": {
+        "@jest/console": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "@types/istanbul-lib-coverage": "^2.0.0",
+        "collect-v8-coverage": "^1.0.0"
+      }
+    },
+    "@jest/test-sequencer": {
+      "version": "26.6.3",
+      "resolved": "https://registry.npmjs.org/@jest/test-sequencer/-/test-sequencer-26.6.3.tgz",
+      "integrity": "sha512-YHlVIjP5nfEyjlrSr8t/YdNfU/1XEt7c5b4OxcXCjyRhjzLYu/rO69/WHPuYcbCWkz8kAeZVZp2N2+IOLLEPGw==",
+      "dev": true,
+      "requires": {
+        "@jest/test-result": "^26.6.2",
+        "graceful-fs": "^4.2.4",
+        "jest-haste-map": "^26.6.2",
+        "jest-runner": "^26.6.3",
+        "jest-runtime": "^26.6.3"
+      }
+    },
+    "@jest/transform": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/@jest/transform/-/transform-26.6.2.tgz",
+      "integrity": "sha512-E9JjhUgNzvuQ+vVAL21vlyfy12gP0GhazGgJC4h6qUt1jSdUXGWJ1wfu/X7Sd8etSgxV4ovT1pb9v5D6QW4XgA==",
+      "dev": true,
+      "requires": {
+        "@babel/core": "^7.1.0",
+        "@jest/types": "^26.6.2",
+        "babel-plugin-istanbul": "^6.0.0",
+        "chalk": "^4.0.0",
+        "convert-source-map": "^1.4.0",
+        "fast-json-stable-stringify": "^2.0.0",
+        "graceful-fs": "^4.2.4",
+        "jest-haste-map": "^26.6.2",
+        "jest-regex-util": "^26.0.0",
+        "jest-util": "^26.6.2",
+        "micromatch": "^4.0.2",
+        "pirates": "^4.0.1",
+        "slash": "^3.0.0",
+        "source-map": "^0.6.1",
+        "write-file-atomic": "^3.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "@jest/types": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/@jest/types/-/types-26.6.2.tgz",
+      "integrity": "sha512-fC6QCp7Sc5sX6g8Tvbmj4XUTbyrik0akgRy03yjXbQaBWWNWGE7SGtJk98m0N8nzegD/7SggrUlivxo5ax4KWQ==",
+      "dev": true,
+      "requires": {
+        "@types/istanbul-lib-coverage": "^2.0.0",
+        "@types/istanbul-reports": "^3.0.0",
+        "@types/node": "*",
+        "@types/yargs": "^15.0.0",
+        "chalk": "^4.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "@jridgewell/gen-mapping": {
+      "version": "0.1.1",
+      "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.1.1.tgz",
+      "integrity": "sha512-sQXCasFk+U8lWYEe66WxRDOE9PjVz4vSM51fTu3Hw+ClTpUSQb718772vH3pyS5pShp6lvQM7SxgIDXXXmOX7w==",
+      "dev": true,
+      "requires": {
+        "@jridgewell/set-array": "^1.0.0",
+        "@jridgewell/sourcemap-codec": "^1.4.10"
+      }
+    },
+    "@jridgewell/resolve-uri": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.0.tgz",
+      "integrity": "sha512-F2msla3tad+Mfht5cJq7LSXcdudKTWCVYUgw6pLFOOHSTtZlj6SWNYAp+AhuqLmWdBO2X5hPrLcu8cVP8fy28w==",
+      "dev": true
+    },
+    "@jridgewell/set-array": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.1.2.tgz",
+      "integrity": "sha512-xnkseuNADM0gt2bs+BvhO0p78Mk762YnZdsuzFV018NoG1Sj1SCQvpSqa7XUaTam5vAGasABV9qXASMKnFMwMw==",
+      "dev": true
+    },
+    "@jridgewell/sourcemap-codec": {
+      "version": "1.4.14",
+      "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.14.tgz",
+      "integrity": "sha512-XPSJHWmi394fuUuzDnGz1wiKqWfo1yXecHQMRf2l6hztTO+nPru658AyDngaBe7isIxEkRsPR3FZh+s7iVa4Uw==",
+      "dev": true
+    },
+    "@jridgewell/trace-mapping": {
+      "version": "0.3.17",
+      "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.17.tgz",
+      "integrity": "sha512-MCNzAp77qzKca9+W/+I0+sEpaUnZoeasnghNeVc41VZCEKaCH73Vq3BZZ/SzWIgrqE4H4ceI+p+b6C0mHf9T4g==",
+      "dev": true,
+      "requires": {
+        "@jridgewell/resolve-uri": "3.1.0",
+        "@jridgewell/sourcemap-codec": "1.4.14"
+      }
+    },
+    "@rollup/plugin-commonjs": {
+      "version": "11.1.0",
+      "resolved": "https://registry.npmjs.org/@rollup/plugin-commonjs/-/plugin-commonjs-11.1.0.tgz",
+      "integrity": "sha512-Ycr12N3ZPN96Fw2STurD21jMqzKwL9QuFhms3SD7KKRK7oaXUsBU9Zt0jL/rOPHiPYisI21/rXGO3jr9BnLHUA==",
+      "dev": true,
+      "requires": {
+        "@rollup/pluginutils": "^3.0.8",
+        "commondir": "^1.0.1",
+        "estree-walker": "^1.0.1",
+        "glob": "^7.1.2",
+        "is-reference": "^1.1.2",
+        "magic-string": "^0.25.2",
+        "resolve": "^1.11.0"
+      }
+    },
+    "@rollup/plugin-node-resolve": {
+      "version": "7.1.3",
+      "resolved": "https://registry.npmjs.org/@rollup/plugin-node-resolve/-/plugin-node-resolve-7.1.3.tgz",
+      "integrity": "sha512-RxtSL3XmdTAE2byxekYLnx+98kEUOrPHF/KRVjLH+DEIHy6kjIw7YINQzn+NXiH/NTrQLAwYs0GWB+csWygA9Q==",
+      "dev": true,
+      "requires": {
+        "@rollup/pluginutils": "^3.0.8",
+        "@types/resolve": "0.0.8",
+        "builtin-modules": "^3.1.0",
+        "is-module": "^1.0.0",
+        "resolve": "^1.14.2"
+      }
+    },
+    "@rollup/pluginutils": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/@rollup/pluginutils/-/pluginutils-3.1.0.tgz",
+      "integrity": "sha512-GksZ6pr6TpIjHm8h9lSQ8pi8BE9VeubNT0OMJ3B5uZJ8pz73NPiqOtCog/x2/QzM1ENChPKxMDhiQuRHsqc+lg==",
+      "dev": true,
+      "requires": {
+        "@types/estree": "0.0.39",
+        "estree-walker": "^1.0.1",
+        "picomatch": "^2.2.2"
+      }
+    },
+    "@sinonjs/commons": {
+      "version": "1.8.6",
+      "resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-1.8.6.tgz",
+      "integrity": "sha512-Ky+XkAkqPZSm3NLBeUng77EBQl3cmeJhITaGHdYH8kjVB+aun3S4XBRti2zt17mtt0mIUDiNxYeoJm6drVvBJQ==",
+      "dev": true,
+      "requires": {
+        "type-detect": "4.0.8"
+      }
+    },
+    "@sinonjs/fake-timers": {
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/@sinonjs/fake-timers/-/fake-timers-6.0.1.tgz",
+      "integrity": "sha512-MZPUxrmFubI36XS1DI3qmI0YdN1gks62JtFZvxR67ljjSNCeK6U08Zx4msEWOXuofgqUt6zPHSi1H9fbjR/NRA==",
+      "dev": true,
+      "requires": {
+        "@sinonjs/commons": "^1.7.0"
+      }
+    },
+    "@tootallnate/once": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-1.1.2.tgz",
+      "integrity": "sha512-RbzJvlNzmRq5c3O09UipeuXno4tA1FE6ikOjxZK0tuxVv3412l64l5t1W5pj4+rJq9vpkm/kwiR07aZXnsKPxw==",
+      "dev": true
+    },
+    "@types/babel__core": {
+      "version": "7.1.20",
+      "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.1.20.tgz",
+      "integrity": "sha512-PVb6Bg2QuscZ30FvOU7z4guG6c926D9YRvOxEaelzndpMsvP+YM74Q/dAFASpg2l6+XLalxSGxcq/lrgYWZtyQ==",
+      "dev": true,
+      "requires": {
+        "@babel/parser": "^7.1.0",
+        "@babel/types": "^7.0.0",
+        "@types/babel__generator": "*",
+        "@types/babel__template": "*",
+        "@types/babel__traverse": "*"
+      }
+    },
+    "@types/babel__generator": {
+      "version": "7.6.4",
+      "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.6.4.tgz",
+      "integrity": "sha512-tFkciB9j2K755yrTALxD44McOrk+gfpIpvC3sxHjRawj6PfnQxrse4Clq5y/Rq+G3mrBurMax/lG8Qn2t9mSsg==",
+      "dev": true,
+      "requires": {
+        "@babel/types": "^7.0.0"
+      }
+    },
+    "@types/babel__template": {
+      "version": "7.4.1",
+      "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.1.tgz",
+      "integrity": "sha512-azBFKemX6kMg5Io+/rdGT0dkGreboUVR0Cdm3fz9QJWpaQGJRQXl7C+6hOTCZcMll7KFyEQpgbYI2lHdsS4U7g==",
+      "dev": true,
+      "requires": {
+        "@babel/parser": "^7.1.0",
+        "@babel/types": "^7.0.0"
+      }
+    },
+    "@types/babel__traverse": {
+      "version": "7.18.2",
+      "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.18.2.tgz",
+      "integrity": "sha512-FcFaxOr2V5KZCviw1TnutEMVUVsGt4D2hP1TAfXZAMKuHYW3xQhe3jTxNPWutgCJ3/X1c5yX8ZoGVEItxKbwBg==",
+      "dev": true,
+      "requires": {
+        "@babel/types": "^7.3.0"
+      }
+    },
+    "@types/eslint-visitor-keys": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/@types/eslint-visitor-keys/-/eslint-visitor-keys-1.0.0.tgz",
+      "integrity": "sha512-OCutwjDZ4aFS6PB1UZ988C4YgwlBHJd6wCeQqaLdmadZ/7e+w79+hbMUFC1QXDNCmdyoRfAFdm0RypzwR+Qpag==",
+      "dev": true
+    },
+    "@types/estree": {
+      "version": "0.0.39",
+      "resolved": "https://registry.npmjs.org/@types/estree/-/estree-0.0.39.tgz",
+      "integrity": "sha512-EYNwp3bU+98cpU4lAWYYL7Zz+2gryWH1qbdDTidVd6hkiR6weksdbMadyXKXNPEkQFhXM+hVO9ZygomHXp+AIw==",
+      "dev": true
+    },
+    "@types/graceful-fs": {
+      "version": "4.1.5",
+      "resolved": "https://registry.npmjs.org/@types/graceful-fs/-/graceful-fs-4.1.5.tgz",
+      "integrity": "sha512-anKkLmZZ+xm4p8JWBf4hElkM4XR+EZeA2M9BAkkTldmcyDY4mbdIJnRghDJH3Ov5ooY7/UAoENtmdMSkaAd7Cw==",
+      "dev": true,
+      "requires": {
+        "@types/node": "*"
+      }
+    },
+    "@types/istanbul-lib-coverage": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.4.tgz",
+      "integrity": "sha512-z/QT1XN4K4KYuslS23k62yDIDLwLFkzxOuMplDtObz0+y7VqJCaO2o+SPwHCvLFZh7xazvvoor2tA/hPz9ee7g==",
+      "dev": true
+    },
+    "@types/istanbul-lib-report": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/@types/istanbul-lib-report/-/istanbul-lib-report-3.0.0.tgz",
+      "integrity": "sha512-plGgXAPfVKFoYfa9NpYDAkseG+g6Jr294RqeqcqDixSbU34MZVJRi/P+7Y8GDpzkEwLaGZZOpKIEmeVZNtKsrg==",
+      "dev": true,
+      "requires": {
+        "@types/istanbul-lib-coverage": "*"
+      }
+    },
+    "@types/istanbul-reports": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/@types/istanbul-reports/-/istanbul-reports-3.0.1.tgz",
+      "integrity": "sha512-c3mAZEuK0lvBp8tmuL74XRKn1+y2dcwOUpH7x4WrF6gk1GIgiluDRgMYQtw2OFcBvAJWlt6ASU3tSqxp0Uu0Aw==",
+      "dev": true,
+      "requires": {
+        "@types/istanbul-lib-report": "*"
+      }
+    },
+    "@types/json-schema": {
+      "version": "7.0.11",
+      "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.11.tgz",
+      "integrity": "sha512-wOuvG1SN4Us4rez+tylwwwCV1psiNVOkJeM3AUWUNWg/jDQY2+HE/444y5gc+jBmRqASOm2Oeh5c1axHobwRKQ==",
+      "dev": true
+    },
+    "@types/node": {
+      "version": "12.20.55",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-12.20.55.tgz",
+      "integrity": "sha512-J8xLz7q2OFulZ2cyGTLE1TbbZcjpno7FaN6zdJNrgAdrJ+DZzh/uFR6YrTb4C+nXakvud8Q4+rbhoIWlYQbUFQ==",
+      "dev": true
+    },
+    "@types/normalize-package-data": {
+      "version": "2.4.1",
+      "resolved": "https://registry.npmjs.org/@types/normalize-package-data/-/normalize-package-data-2.4.1.tgz",
+      "integrity": "sha512-Gj7cI7z+98M282Tqmp2K5EIsoouUEzbBJhQQzDE3jSIRk6r9gsz0oUokqIUR4u1R3dMHo0pDHM7sNOHyhulypw==",
+      "dev": true
+    },
+    "@types/prettier": {
+      "version": "2.7.1",
+      "resolved": "https://registry.npmjs.org/@types/prettier/-/prettier-2.7.1.tgz",
+      "integrity": "sha512-ri0UmynRRvZiiUJdiz38MmIblKK+oH30MztdBVR95dv/Ubw6neWSb8u1XpRb72L4qsZOhz+L+z9JD40SJmfWow==",
+      "dev": true
+    },
+    "@types/resolve": {
+      "version": "0.0.8",
+      "resolved": "https://registry.npmjs.org/@types/resolve/-/resolve-0.0.8.tgz",
+      "integrity": "sha512-auApPaJf3NPfe18hSoJkp8EbZzer2ISk7o8mCC3M9he/a04+gbMF97NkpD2S8riMGvm4BMRI59/SZQSaLTKpsQ==",
+      "dev": true,
+      "requires": {
+        "@types/node": "*"
+      }
+    },
+    "@types/stack-utils": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.1.tgz",
+      "integrity": "sha512-Hl219/BT5fLAaz6NDkSuhzasy49dwQS/DSdu4MdggFB8zcXv7vflBI3xp7FEmkmdDkBUI2bPUNeMttp2knYdxw==",
+      "dev": true
+    },
+    "@types/yargs": {
+      "version": "15.0.14",
+      "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-15.0.14.tgz",
+      "integrity": "sha512-yEJzHoxf6SyQGhBhIYGXQDSCkJjB6HohDShto7m8vaKg9Yp0Yn8+71J9eakh2bnPg6BfsH9PRMhiRTZnd4eXGQ==",
+      "dev": true,
+      "requires": {
+        "@types/yargs-parser": "*"
+      }
+    },
+    "@types/yargs-parser": {
+      "version": "21.0.0",
+      "resolved": "https://registry.npmjs.org/@types/yargs-parser/-/yargs-parser-21.0.0.tgz",
+      "integrity": "sha512-iO9ZQHkZxHn4mSakYV0vFHAVDyEOIJQrV2uZ06HxEPcx+mt8swXoZHIbaaJ2crJYFfErySgktuTZ3BeLz+XmFA==",
+      "dev": true
+    },
+    "@typescript-eslint/eslint-plugin": {
+      "version": "2.34.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-2.34.0.tgz",
+      "integrity": "sha512-4zY3Z88rEE99+CNvTbXSyovv2z9PNOVffTWD2W8QF5s2prBQtwN2zadqERcrHpcR7O/+KMI3fcTAmUUhK/iQcQ==",
+      "dev": true,
+      "requires": {
+        "@typescript-eslint/experimental-utils": "2.34.0",
+        "functional-red-black-tree": "^1.0.1",
+        "regexpp": "^3.0.0",
+        "tsutils": "^3.17.1"
+      }
+    },
+    "@typescript-eslint/experimental-utils": {
+      "version": "2.34.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/experimental-utils/-/experimental-utils-2.34.0.tgz",
+      "integrity": "sha512-eS6FTkq+wuMJ+sgtuNTtcqavWXqsflWcfBnlYhg/nS4aZ1leewkXGbvBhaapn1q6qf4M71bsR1tez5JTRMuqwA==",
+      "dev": true,
+      "requires": {
+        "@types/json-schema": "^7.0.3",
+        "@typescript-eslint/typescript-estree": "2.34.0",
+        "eslint-scope": "^5.0.0",
+        "eslint-utils": "^2.0.0"
+      }
+    },
+    "@typescript-eslint/parser": {
+      "version": "2.34.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-2.34.0.tgz",
+      "integrity": "sha512-03ilO0ucSD0EPTw2X4PntSIRFtDPWjrVq7C3/Z3VQHRC7+13YB55rcJI3Jt+YgeHbjUdJPcPa7b23rXCBokuyA==",
+      "dev": true,
+      "requires": {
+        "@types/eslint-visitor-keys": "^1.0.0",
+        "@typescript-eslint/experimental-utils": "2.34.0",
+        "@typescript-eslint/typescript-estree": "2.34.0",
+        "eslint-visitor-keys": "^1.1.0"
+      }
+    },
+    "@typescript-eslint/typescript-estree": {
+      "version": "2.34.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-2.34.0.tgz",
+      "integrity": "sha512-OMAr+nJWKdlVM9LOqCqh3pQQPwxHAN7Du8DR6dmwCrAmxtiXQnhHJ6tBNtf+cggqfo51SG/FCwnKhXCIM7hnVg==",
+      "dev": true,
+      "requires": {
+        "debug": "^4.1.1",
+        "eslint-visitor-keys": "^1.1.0",
+        "glob": "^7.1.6",
+        "is-glob": "^4.0.1",
+        "lodash": "^4.17.15",
+        "semver": "^7.3.2",
+        "tsutils": "^3.17.1"
+      }
+    },
+    "@webgpu/types": {
+      "version": "0.0.31",
+      "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.0.31.tgz",
+      "integrity": "sha512-cvvCMSZBT4VsRNtt0lI6XQqvOIIWw6+NRUtnPUMDVDgsI4pCZColz3qzF5QcP9wIYOHEc3jssIBse8UWONKhlQ==",
+      "dev": true
+    },
+    "abab": {
+      "version": "2.0.6",
+      "resolved": "https://registry.npmjs.org/abab/-/abab-2.0.6.tgz",
+      "integrity": "sha512-j2afSsaIENvHZN2B8GOpF566vZ5WVk5opAiMTvWgaQT8DkbOqsTfvNAvHoRGU2zzP8cPoqys+xHTRDWW8L+/BA==",
+      "dev": true
+    },
+    "acorn": {
+      "version": "7.4.1",
+      "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz",
+      "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==",
+      "dev": true
+    },
+    "acorn-globals": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmjs.org/acorn-globals/-/acorn-globals-6.0.0.tgz",
+      "integrity": "sha512-ZQl7LOWaF5ePqqcX4hLuv/bLXYQNfNWw2c0/yX/TsPRKamzHcTGQnlCjHT3TsmkOUVEPS3crCxiPfdzE/Trlhg==",
+      "dev": true,
+      "requires": {
+        "acorn": "^7.1.1",
+        "acorn-walk": "^7.1.1"
+      }
+    },
+    "acorn-jsx": {
+      "version": "5.3.2",
+      "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz",
+      "integrity": "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==",
+      "dev": true
+    },
+    "acorn-walk": {
+      "version": "7.2.0",
+      "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-7.2.0.tgz",
+      "integrity": "sha512-OPdCF6GsMIP+Az+aWfAAOEt2/+iVDKE7oy6lJ098aoe59oAmK76qV6Gw60SbZ8jHuG2wH058GF4pLFbYamYrVA==",
+      "dev": true
+    },
+    "agent-base": {
+      "version": "6.0.2",
+      "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz",
+      "integrity": "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==",
+      "dev": true,
+      "requires": {
+        "debug": "4"
+      }
+    },
+    "ajv": {
+      "version": "6.12.6",
+      "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
+      "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
+      "dev": true,
+      "requires": {
+        "fast-deep-equal": "^3.1.1",
+        "fast-json-stable-stringify": "^2.0.0",
+        "json-schema-traverse": "^0.4.1",
+        "uri-js": "^4.2.2"
+      }
+    },
+    "ansi-escapes": {
+      "version": "4.3.2",
+      "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-4.3.2.tgz",
+      "integrity": "sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==",
+      "dev": true,
+      "requires": {
+        "type-fest": "^0.21.3"
+      },
+      "dependencies": {
+        "type-fest": {
+          "version": "0.21.3",
+          "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.21.3.tgz",
+          "integrity": "sha512-t0rzBq87m3fVcduHDUFhKmyyX+9eo6WQjZvf51Ea/M0Q7+T374Jp1aUiyUl0GKxp8M/OETVHSDvmkyPgvX+X2w==",
+          "dev": true
+        }
+      }
+    },
+    "ansi-regex": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
+      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+      "dev": true
+    },
+    "ansi-styles": {
+      "version": "3.2.1",
+      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz",
+      "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==",
+      "dev": true,
+      "requires": {
+        "color-convert": "^1.9.0"
+      }
+    },
+    "anymatch": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz",
+      "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==",
+      "dev": true,
+      "requires": {
+        "normalize-path": "^3.0.0",
+        "picomatch": "^2.0.4"
+      }
+    },
+    "argparse": {
+      "version": "1.0.10",
+      "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz",
+      "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==",
+      "dev": true,
+      "requires": {
+        "sprintf-js": "~1.0.2"
+      }
+    },
+    "arr-diff": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz",
+      "integrity": "sha512-YVIQ82gZPGBebQV/a8dar4AitzCQs0jjXwMPZllpXMaGjXPYVUawSxQrRsjhjupyVxEvbHgUmIhKVlND+j02kA==",
+      "dev": true
+    },
+    "arr-flatten": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/arr-flatten/-/arr-flatten-1.1.0.tgz",
+      "integrity": "sha512-L3hKV5R/p5o81R7O02IGnwpDmkp6E982XhtbuwSe3O4qOtMMMtodicASA1Cny2U+aCXcNpml+m4dPsvsJ3jatg==",
+      "dev": true
+    },
+    "arr-union": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz",
+      "integrity": "sha512-sKpyeERZ02v1FeCZT8lrfJq5u6goHCtpTAzPwJYe7c8SPFOboNjNg1vz2L4VTn9T4PQxEx13TbXLmYUcS6Ug7Q==",
+      "dev": true
+    },
+    "array-unique": {
+      "version": "0.3.2",
+      "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz",
+      "integrity": "sha512-SleRWjh9JUud2wH1hPs9rZBZ33H6T9HOiL0uwGnGx9FpE6wKGyfWugmbkEOIs6qWrZhg0LWeLziLrEwQJhs5mQ==",
+      "dev": true
+    },
+    "assign-symbols": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/assign-symbols/-/assign-symbols-1.0.0.tgz",
+      "integrity": "sha512-Q+JC7Whu8HhmTdBph/Tq59IoRtoy6KAm5zzPv00WdujX82lbAL8K7WVjne7vdCsAmbF4AYaDOPyO3k0kl8qIrw==",
+      "dev": true
+    },
+    "astral-regex": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/astral-regex/-/astral-regex-1.0.0.tgz",
+      "integrity": "sha512-+Ryf6g3BKoRc7jfp7ad8tM4TtMiaWvbF/1/sQcZPkkS7ag3D5nMBCe2UfOTONtAkaG0tO0ij3C5Lwmf1EiyjHg==",
+      "dev": true
+    },
+    "asynckit": {
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
+      "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
+      "dev": true
+    },
+    "atob": {
+      "version": "2.1.2",
+      "resolved": "https://registry.npmjs.org/atob/-/atob-2.1.2.tgz",
+      "integrity": "sha512-Wm6ukoaOGJi/73p/cl2GvLjTI5JM1k/O14isD73YML8StrH/7/lRFgmg8nICZgD3bZZvjwCGxtMOD3wWNAu8cg==",
+      "dev": true
+    },
+    "babel-jest": {
+      "version": "26.6.3",
+      "resolved": "https://registry.npmjs.org/babel-jest/-/babel-jest-26.6.3.tgz",
+      "integrity": "sha512-pl4Q+GAVOHwvjrck6jKjvmGhnO3jHX/xuB9d27f+EJZ/6k+6nMuPjorrYp7s++bKKdANwzElBWnLWaObvTnaZA==",
+      "dev": true,
+      "requires": {
+        "@jest/transform": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "@types/babel__core": "^7.1.7",
+        "babel-plugin-istanbul": "^6.0.0",
+        "babel-preset-jest": "^26.6.2",
+        "chalk": "^4.0.0",
+        "graceful-fs": "^4.2.4",
+        "slash": "^3.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "babel-plugin-istanbul": {
+      "version": "6.1.1",
+      "resolved": "https://registry.npmjs.org/babel-plugin-istanbul/-/babel-plugin-istanbul-6.1.1.tgz",
+      "integrity": "sha512-Y1IQok9821cC9onCx5otgFfRm7Lm+I+wwxOx738M/WLPZ9Q42m4IG5W0FNX8WLL2gYMZo3JkuXIH2DOpWM+qwA==",
+      "dev": true,
+      "requires": {
+        "@babel/helper-plugin-utils": "^7.0.0",
+        "@istanbuljs/load-nyc-config": "^1.0.0",
+        "@istanbuljs/schema": "^0.1.2",
+        "istanbul-lib-instrument": "^5.0.4",
+        "test-exclude": "^6.0.0"
+      },
+      "dependencies": {
+        "istanbul-lib-instrument": {
+          "version": "5.2.1",
+          "resolved": "https://registry.npmjs.org/istanbul-lib-instrument/-/istanbul-lib-instrument-5.2.1.tgz",
+          "integrity": "sha512-pzqtp31nLv/XFOzXGuvhCb8qhjmTVo5vjVk19XE4CRlSWz0KoeJ3bw9XsA7nOp9YBf4qHjwBxkDzKcME/J29Yg==",
+          "dev": true,
+          "requires": {
+            "@babel/core": "^7.12.3",
+            "@babel/parser": "^7.14.7",
+            "@istanbuljs/schema": "^0.1.2",
+            "istanbul-lib-coverage": "^3.2.0",
+            "semver": "^6.3.0"
+          }
+        },
+        "semver": {
+          "version": "6.3.0",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
+          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==",
+          "dev": true
+        }
+      }
+    },
+    "babel-plugin-jest-hoist": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/babel-plugin-jest-hoist/-/babel-plugin-jest-hoist-26.6.2.tgz",
+      "integrity": "sha512-PO9t0697lNTmcEHH69mdtYiOIkkOlj9fySqfO3K1eCcdISevLAE0xY59VLLUj0SoiPiTX/JU2CYFpILydUa5Lw==",
+      "dev": true,
+      "requires": {
+        "@babel/template": "^7.3.3",
+        "@babel/types": "^7.3.3",
+        "@types/babel__core": "^7.0.0",
+        "@types/babel__traverse": "^7.0.6"
+      }
+    },
+    "babel-preset-current-node-syntax": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/babel-preset-current-node-syntax/-/babel-preset-current-node-syntax-1.0.1.tgz",
+      "integrity": "sha512-M7LQ0bxarkxQoN+vz5aJPsLBn77n8QgTFmo8WK0/44auK2xlCXrYcUxHFxgU7qW5Yzw/CjmLRK2uJzaCd7LvqQ==",
+      "dev": true,
+      "requires": {
+        "@babel/plugin-syntax-async-generators": "^7.8.4",
+        "@babel/plugin-syntax-bigint": "^7.8.3",
+        "@babel/plugin-syntax-class-properties": "^7.8.3",
+        "@babel/plugin-syntax-import-meta": "^7.8.3",
+        "@babel/plugin-syntax-json-strings": "^7.8.3",
+        "@babel/plugin-syntax-logical-assignment-operators": "^7.8.3",
+        "@babel/plugin-syntax-nullish-coalescing-operator": "^7.8.3",
+        "@babel/plugin-syntax-numeric-separator": "^7.8.3",
+        "@babel/plugin-syntax-object-rest-spread": "^7.8.3",
+        "@babel/plugin-syntax-optional-catch-binding": "^7.8.3",
+        "@babel/plugin-syntax-optional-chaining": "^7.8.3",
+        "@babel/plugin-syntax-top-level-await": "^7.8.3"
+      }
+    },
+    "babel-preset-jest": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/babel-preset-jest/-/babel-preset-jest-26.6.2.tgz",
+      "integrity": "sha512-YvdtlVm9t3k777c5NPQIv6cxFFFapys25HiUmuSgHwIZhfifweR5c5Sf5nwE3MAbfu327CYSvps8Yx6ANLyleQ==",
+      "dev": true,
+      "requires": {
+        "babel-plugin-jest-hoist": "^26.6.2",
+        "babel-preset-current-node-syntax": "^1.0.0"
+      }
+    },
+    "balanced-match": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
+      "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==",
+      "dev": true
+    },
+    "base": {
+      "version": "0.11.2",
+      "resolved": "https://registry.npmjs.org/base/-/base-0.11.2.tgz",
+      "integrity": "sha512-5T6P4xPgpp0YDFvSWwEZ4NoE3aM4QBQXDzmVbraCkFj8zHM+mba8SyqB5DbZWyR7mYHo6Y7BdQo3MoA4m0TeQg==",
+      "dev": true,
+      "requires": {
+        "cache-base": "^1.0.1",
+        "class-utils": "^0.3.5",
+        "component-emitter": "^1.2.1",
+        "define-property": "^1.0.0",
+        "isobject": "^3.0.1",
+        "mixin-deep": "^1.2.0",
+        "pascalcase": "^0.1.1"
+      },
+      "dependencies": {
+        "define-property": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz",
+          "integrity": "sha512-cZTYKFWspt9jZsMscWo8sc/5lbPC9Q0N5nBLgb+Yd915iL3udB1uFgS3B8YCx66UVHq018DAVFoee7x+gxggeA==",
+          "dev": true,
+          "requires": {
+            "is-descriptor": "^1.0.0"
+          }
+        },
+        "is-accessor-descriptor": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz",
+          "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==",
+          "dev": true,
+          "requires": {
+            "kind-of": "^6.0.0"
+          }
+        },
+        "is-data-descriptor": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz",
+          "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==",
+          "dev": true,
+          "requires": {
+            "kind-of": "^6.0.0"
+          }
+        },
+        "is-descriptor": {
+          "version": "1.0.2",
+          "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz",
+          "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==",
+          "dev": true,
+          "requires": {
+            "is-accessor-descriptor": "^1.0.0",
+            "is-data-descriptor": "^1.0.0",
+            "kind-of": "^6.0.2"
+          }
+        }
+      }
+    },
+    "brace-expansion": {
+      "version": "1.1.11",
+      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
+      "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
+      "dev": true,
+      "requires": {
+        "balanced-match": "^1.0.0",
+        "concat-map": "0.0.1"
+      }
+    },
+    "braces": {
+      "version": "3.0.2",
+      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz",
+      "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==",
+      "dev": true,
+      "requires": {
+        "fill-range": "^7.0.1"
+      }
+    },
+    "browser-process-hrtime": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/browser-process-hrtime/-/browser-process-hrtime-1.0.0.tgz",
+      "integrity": "sha512-9o5UecI3GhkpM6DrXr69PblIuWxPKk9Y0jHBRhdocZ2y7YECBFCsHm79Pr3OyR2AvjhDkabFJaDJMYRazHgsow==",
+      "dev": true
+    },
+    "browserslist": {
+      "version": "4.21.4",
+      "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.21.4.tgz",
+      "integrity": "sha512-CBHJJdDmgjl3daYjN5Cp5kbTf1mUhZoS+beLklHIvkOWscs83YAhLlF3Wsh/lciQYAcbBJgTOD44VtG31ZM4Hw==",
+      "dev": true,
+      "requires": {
+        "caniuse-lite": "^1.0.30001400",
+        "electron-to-chromium": "^1.4.251",
+        "node-releases": "^2.0.6",
+        "update-browserslist-db": "^1.0.9"
+      }
+    },
+    "bser": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/bser/-/bser-2.1.1.tgz",
+      "integrity": "sha512-gQxTNE/GAfIIrmHLUE3oJyp5FO6HRBfhjnw4/wMmA63ZGDJnWBmgY/lyQBpnDUkGmAhbSe39tx2d/iTOAfglwQ==",
+      "dev": true,
+      "requires": {
+        "node-int64": "^0.4.0"
+      }
+    },
+    "buffer-from": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz",
+      "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==",
+      "dev": true
+    },
+    "builtin-modules": {
+      "version": "3.3.0",
+      "resolved": "https://registry.npmjs.org/builtin-modules/-/builtin-modules-3.3.0.tgz",
+      "integrity": "sha512-zhaCDicdLuWN5UbN5IMnFqNMhNfo919sH85y2/ea+5Yg9TsTkeZxpL+JLbp6cgYFS4sRLp3YV4S6yDuqVWHYOw==",
+      "dev": true
+    },
+    "cache-base": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/cache-base/-/cache-base-1.0.1.tgz",
+      "integrity": "sha512-AKcdTnFSWATd5/GCPRxr2ChwIJ85CeyrEyjRHlKxQ56d4XJMGym0uAiKn0xbLOGOl3+yRpOTi484dVCEc5AUzQ==",
+      "dev": true,
+      "requires": {
+        "collection-visit": "^1.0.0",
+        "component-emitter": "^1.2.1",
+        "get-value": "^2.0.6",
+        "has-value": "^1.0.0",
+        "isobject": "^3.0.1",
+        "set-value": "^2.0.0",
+        "to-object-path": "^0.3.0",
+        "union-value": "^1.0.0",
+        "unset-value": "^1.0.0"
+      }
+    },
+    "callsites": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz",
+      "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==",
+      "dev": true
+    },
+    "camelcase": {
+      "version": "5.3.1",
+      "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-5.3.1.tgz",
+      "integrity": "sha512-L28STB170nwWS63UjtlEOE3dldQApaJXZkOI1uMFfzf3rRuPegHaHesyee+YxQ+W6SvRDQV6UrdOdRiR153wJg==",
+      "dev": true
+    },
+    "caniuse-lite": {
+      "version": "1.0.30001434",
+      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001434.tgz",
+      "integrity": "sha512-aOBHrLmTQw//WFa2rcF1If9fa3ypkC1wzqqiKHgfdrXTWcU8C4gKVZT77eQAPWN1APys3+uQ0Df07rKauXGEYA==",
+      "dev": true
+    },
+    "capture-exit": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/capture-exit/-/capture-exit-2.0.0.tgz",
+      "integrity": "sha512-PiT/hQmTonHhl/HFGN+Lx3JJUznrVYJ3+AQsnthneZbvW7x+f08Tk7yLJTLEOUvBTbduLeeBkxEaYXUOUrRq6g==",
+      "dev": true,
+      "requires": {
+        "rsvp": "^4.8.4"
+      }
+    },
+    "chalk": {
+      "version": "2.4.2",
+      "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz",
+      "integrity": "sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==",
+      "dev": true,
+      "requires": {
+        "ansi-styles": "^3.2.1",
+        "escape-string-regexp": "^1.0.5",
+        "supports-color": "^5.3.0"
+      }
+    },
+    "char-regex": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/char-regex/-/char-regex-1.0.2.tgz",
+      "integrity": "sha512-kWWXztvZ5SBQV+eRgKFeh8q5sLuZY2+8WUIzlxWVTg+oGwY14qylx1KbKzHd8P6ZYkAg0xyIDU9JMHhyJMZ1jw==",
+      "dev": true
+    },
+    "chardet": {
+      "version": "0.7.0",
+      "resolved": "https://registry.npmjs.org/chardet/-/chardet-0.7.0.tgz",
+      "integrity": "sha512-mT8iDcrh03qDGRRmoA2hmBJnxpllMR+0/0qlzjqZES6NdiWDcZkCNAk4rPFZ9Q85r27unkiNNg8ZOiwZXBHwcA==",
+      "dev": true
+    },
+    "ci-info": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-2.0.0.tgz",
+      "integrity": "sha512-5tK7EtrZ0N+OLFMthtqOj4fI2Jeb88C4CAZPu25LDVUgXJ0A3Js4PMGqrn0JU1W0Mh1/Z8wZzYPxqUrXeBboCQ==",
+      "dev": true
+    },
+    "cjs-module-lexer": {
+      "version": "0.6.0",
+      "resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-0.6.0.tgz",
+      "integrity": "sha512-uc2Vix1frTfnuzxxu1Hp4ktSvM3QaI4oXl4ZUqL1wjTu/BGki9TrCWoqLTg/drR1KwAEarXuRFCG2Svr1GxPFw==",
+      "dev": true
+    },
+    "class-utils": {
+      "version": "0.3.6",
+      "resolved": "https://registry.npmjs.org/class-utils/-/class-utils-0.3.6.tgz",
+      "integrity": "sha512-qOhPa/Fj7s6TY8H8esGu5QNpMMQxz79h+urzrNYN6mn+9BnxlDGf5QZ+XeCDsxSjPqsSR56XOZOJmpeurnLMeg==",
+      "dev": true,
+      "requires": {
+        "arr-union": "^3.1.0",
+        "define-property": "^0.2.5",
+        "isobject": "^3.0.0",
+        "static-extend": "^0.1.1"
+      },
+      "dependencies": {
+        "define-property": {
+          "version": "0.2.5",
+          "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz",
+          "integrity": "sha512-Rr7ADjQZenceVOAKop6ALkkRAmH1A4Gx9hV/7ZujPUN2rkATqFO0JZLZInbAjpZYoJ1gUx8MRMQVkYemcbMSTA==",
+          "dev": true,
+          "requires": {
+            "is-descriptor": "^0.1.0"
+          }
+        }
+      }
+    },
+    "cli-cursor": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-3.1.0.tgz",
+      "integrity": "sha512-I/zHAwsKf9FqGoXM4WWRACob9+SNukZTd94DWF57E4toouRulbCxcUh6RKUEOQlYTHJnzkPMySvPNaaSLNfLZw==",
+      "dev": true,
+      "requires": {
+        "restore-cursor": "^3.1.0"
+      }
+    },
+    "cli-width": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/cli-width/-/cli-width-3.0.0.tgz",
+      "integrity": "sha512-FxqpkPPwu1HjuN93Omfm4h8uIanXofW0RxVEW3k5RKx+mJJYSthzNhp32Kzxxy3YAEZ/Dc/EWN1vZRY0+kOhbw==",
+      "dev": true
+    },
+    "cliui": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmjs.org/cliui/-/cliui-6.0.0.tgz",
+      "integrity": "sha512-t6wbgtoCXvAzst7QgXxJYqPt0usEfbgQdftEPbLL/cvv6HPE5VgvqCuAIDR0NgU52ds6rFwqrgakNLrHEjCbrQ==",
+      "dev": true,
+      "requires": {
+        "string-width": "^4.2.0",
+        "strip-ansi": "^6.0.0",
+        "wrap-ansi": "^6.2.0"
+      },
+      "dependencies": {
+        "strip-ansi": {
+          "version": "6.0.1",
+          "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+          "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+          "dev": true,
+          "requires": {
+            "ansi-regex": "^5.0.1"
+          }
+        }
+      }
+    },
+    "co": {
+      "version": "4.6.0",
+      "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
+      "integrity": "sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==",
+      "dev": true
+    },
+    "collect-v8-coverage": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/collect-v8-coverage/-/collect-v8-coverage-1.0.1.tgz",
+      "integrity": "sha512-iBPtljfCNcTKNAto0KEtDfZ3qzjJvqE3aTGZsbhjSBlorqpXJlaWWtPO35D+ZImoC3KWejX64o+yPGxhWSTzfg==",
+      "dev": true
+    },
+    "collection-visit": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/collection-visit/-/collection-visit-1.0.0.tgz",
+      "integrity": "sha512-lNkKvzEeMBBjUGHZ+q6z9pSJla0KWAQPvtzhEV9+iGyQYG+pBpl7xKDhxoNSOZH2hhv0v5k0y2yAM4o4SjoSkw==",
+      "dev": true,
+      "requires": {
+        "map-visit": "^1.0.0",
+        "object-visit": "^1.0.0"
+      }
+    },
+    "color-convert": {
+      "version": "1.9.3",
+      "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz",
+      "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==",
+      "dev": true,
+      "requires": {
+        "color-name": "1.1.3"
+      }
+    },
+    "color-name": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
+      "integrity": "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==",
+      "dev": true
+    },
+    "combined-stream": {
+      "version": "1.0.8",
+      "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
+      "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
+      "dev": true,
+      "requires": {
+        "delayed-stream": "~1.0.0"
+      }
+    },
+    "commondir": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/commondir/-/commondir-1.0.1.tgz",
+      "integrity": "sha512-W9pAhw0ja1Edb5GVdIF1mjZw/ASI0AlShXM83UUGe2DVr5TdAPEA1OA8m/g8zWp9x6On7gqufY+FatDbC3MDQg==",
+      "dev": true
+    },
+    "component-emitter": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/component-emitter/-/component-emitter-1.3.0.tgz",
+      "integrity": "sha512-Rd3se6QB+sO1TwqZjscQrurpEPIfO0/yYnSin6Q/rD3mOutHvUrCAhJub3r90uNb+SESBuE0QYoB90YdfatsRg==",
+      "dev": true
+    },
+    "concat-map": {
+      "version": "0.0.1",
+      "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
+      "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==",
+      "dev": true
+    },
+    "convert-source-map": {
+      "version": "1.9.0",
+      "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-1.9.0.tgz",
+      "integrity": "sha512-ASFBup0Mz1uyiIjANan1jzLQami9z1PoYSZCiiYW2FczPbenXc45FZdBZLzOT+r6+iciuEModtmCti+hjaAk0A==",
+      "dev": true
+    },
+    "copy-descriptor": {
+      "version": "0.1.1",
+      "resolved": "https://registry.npmjs.org/copy-descriptor/-/copy-descriptor-0.1.1.tgz",
+      "integrity": "sha512-XgZ0pFcakEUlbwQEVNg3+QAis1FyTL3Qel9FYy8pSkQqoG3PNoT0bOCQtOXcOkur21r2Eq2kI+IE+gsmAEVlYw==",
+      "dev": true
+    },
+    "cross-spawn": {
+      "version": "6.0.5",
+      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.5.tgz",
+      "integrity": "sha512-eTVLrBSt7fjbDygz805pMnstIs2VTBNkRm0qxZd+M7A5XDdxVRWO5MxGBXZhjY4cqLYLdtrGqRf8mBPmzwSpWQ==",
+      "dev": true,
+      "requires": {
+        "nice-try": "^1.0.4",
+        "path-key": "^2.0.1",
+        "semver": "^5.5.0",
+        "shebang-command": "^1.2.0",
+        "which": "^1.2.9"
+      },
+      "dependencies": {
+        "semver": {
+          "version": "5.7.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
+          "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==",
+          "dev": true
+        }
+      }
+    },
+    "cssom": {
+      "version": "0.4.4",
+      "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.4.4.tgz",
+      "integrity": "sha512-p3pvU7r1MyyqbTk+WbNJIgJjG2VmTIaB10rI93LzVPrmDJKkzKYMtxxyAvQXR/NS6otuzveI7+7BBq3SjBS2mw==",
+      "dev": true
+    },
+    "cssstyle": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-2.3.0.tgz",
+      "integrity": "sha512-AZL67abkUzIuvcHqk7c09cezpGNcxUxU4Ioi/05xHk4DQeTkWmGYftIE6ctU6AEt+Gn4n1lDStOtj7FKycP71A==",
+      "dev": true,
+      "requires": {
+        "cssom": "~0.3.6"
+      },
+      "dependencies": {
+        "cssom": {
+          "version": "0.3.8",
+          "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.3.8.tgz",
+          "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg==",
+          "dev": true
+        }
+      }
+    },
+    "data-urls": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-2.0.0.tgz",
+      "integrity": "sha512-X5eWTSXO/BJmpdIKCRuKUgSCgAN0OwliVK3yPKbwIWU1Tdw5BRajxlzMidvh+gwko9AfQ9zIj52pzF91Q3YAvQ==",
+      "dev": true,
+      "requires": {
+        "abab": "^2.0.3",
+        "whatwg-mimetype": "^2.3.0",
+        "whatwg-url": "^8.0.0"
+      }
+    },
+    "debug": {
+      "version": "4.3.4",
+      "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz",
+      "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==",
+      "dev": true,
+      "requires": {
+        "ms": "2.1.2"
+      }
+    },
+    "decamelize": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz",
+      "integrity": "sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==",
+      "dev": true
+    },
+    "decimal.js": {
+      "version": "10.4.2",
+      "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.2.tgz",
+      "integrity": "sha512-ic1yEvwT6GuvaYwBLLY6/aFFgjZdySKTE8en/fkU3QICTmRtgtSlFn0u0BXN06InZwtfCelR7j8LRiDI/02iGA==",
+      "dev": true
+    },
+    "decode-uri-component": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.0.tgz",
+      "integrity": "sha512-hjf+xovcEn31w/EUYdTXQh/8smFL/dzYjohQGEIgjyNavaJfBY2p5F527Bo1VPATxv0VYTUC2bOcXvqFwk78Og==",
+      "dev": true
+    },
+    "deep-is": {
+      "version": "0.1.4",
+      "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz",
+      "integrity": "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==",
+      "dev": true
+    },
+    "deepmerge": {
+      "version": "4.2.2",
+      "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.2.2.tgz",
+      "integrity": "sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg==",
+      "dev": true
+    },
+    "define-property": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/define-property/-/define-property-2.0.2.tgz",
+      "integrity": "sha512-jwK2UV4cnPpbcG7+VRARKTZPUWowwXA8bzH5NP6ud0oeAxyYPuGZUAC7hMugpCdz4BeSZl2Dl9k66CHJ/46ZYQ==",
+      "dev": true,
+      "requires": {
+        "is-descriptor": "^1.0.2",
+        "isobject": "^3.0.1"
+      },
+      "dependencies": {
+        "is-accessor-descriptor": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz",
+          "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==",
+          "dev": true,
+          "requires": {
+            "kind-of": "^6.0.0"
+          }
+        },
+        "is-data-descriptor": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz",
+          "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==",
+          "dev": true,
+          "requires": {
+            "kind-of": "^6.0.0"
+          }
+        },
+        "is-descriptor": {
+          "version": "1.0.2",
+          "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz",
+          "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==",
+          "dev": true,
+          "requires": {
+            "is-accessor-descriptor": "^1.0.0",
+            "is-data-descriptor": "^1.0.0",
+            "kind-of": "^6.0.2"
+          }
+        }
+      }
+    },
+    "delayed-stream": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
+      "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
+      "dev": true
+    },
+    "detect-newline": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/detect-newline/-/detect-newline-3.1.0.tgz",
+      "integrity": "sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==",
+      "dev": true
+    },
+    "diff-sequences": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-26.6.2.tgz",
+      "integrity": "sha512-Mv/TDa3nZ9sbc5soK+OoA74BsS3mL37yixCvUAQkiuA4Wz6YtwP/K47n2rv2ovzHZvoiQeA5FTQOschKkEwB0Q==",
+      "dev": true
+    },
+    "doctrine": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-3.0.0.tgz",
+      "integrity": "sha512-yS+Q5i3hBf7GBkd4KG8a7eBNNWNGLTaEwwYWUijIYM7zrlYDM0BFXHjjPWlWZ1Rg7UaddZeIDmi9jF3HmqiQ2w==",
+      "dev": true,
+      "requires": {
+        "esutils": "^2.0.2"
+      }
+    },
+    "domexception": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/domexception/-/domexception-2.0.1.tgz",
+      "integrity": "sha512-yxJ2mFy/sibVQlu5qHjOkf9J3K6zgmCxgJ94u2EdvDOV09H+32LtRswEcUsmUWN72pVLOEnTSRaIVVzVQgS0dg==",
+      "dev": true,
+      "requires": {
+        "webidl-conversions": "^5.0.0"
+      },
+      "dependencies": {
+        "webidl-conversions": {
+          "version": "5.0.0",
+          "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-5.0.0.tgz",
+          "integrity": "sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==",
+          "dev": true
+        }
+      }
+    },
+    "electron-to-chromium": {
+      "version": "1.4.284",
+      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.284.tgz",
+      "integrity": "sha512-M8WEXFuKXMYMVr45fo8mq0wUrrJHheiKZf6BArTKk9ZBYCKJEOU5H8cdWgDT+qCVZf7Na4lVUaZsA+h6uA9+PA==",
+      "dev": true
+    },
+    "emittery": {
+      "version": "0.7.2",
+      "resolved": "https://registry.npmjs.org/emittery/-/emittery-0.7.2.tgz",
+      "integrity": "sha512-A8OG5SR/ij3SsJdWDJdkkSYUjQdCUx6APQXem0SaEePBSRg4eymGYwBkKo1Y6DU+af/Jn2dBQqDBvjnr9Vi8nQ==",
+      "dev": true
+    },
+    "emoji-regex": {
+      "version": "8.0.0",
+      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
+      "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
+      "dev": true
+    },
+    "end-of-stream": {
+      "version": "1.4.4",
+      "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz",
+      "integrity": "sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==",
+      "dev": true,
+      "requires": {
+        "once": "^1.4.0"
+      }
+    },
+    "error-ex": {
+      "version": "1.3.2",
+      "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz",
+      "integrity": "sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g==",
+      "dev": true,
+      "requires": {
+        "is-arrayish": "^0.2.1"
+      }
+    },
+    "escalade": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz",
+      "integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw==",
+      "dev": true
+    },
+    "escape-string-regexp": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz",
+      "integrity": "sha512-vbRorB5FUQWvla16U8R/qgaFIya2qGzwDrNmCZuYKrbdSUMG6I1ZCGQRefkRVhuOkIGVne7BQ35DSfo1qvJqFg==",
+      "dev": true
+    },
+    "escodegen": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.0.0.tgz",
+      "integrity": "sha512-mmHKys/C8BFUGI+MAWNcSYoORYLMdPzjrknd2Vc+bUsjN5bXcr8EhrNB+UTqfL1y3I9c4fw2ihgtMPQLBRiQxw==",
+      "dev": true,
+      "requires": {
+        "esprima": "^4.0.1",
+        "estraverse": "^5.2.0",
+        "esutils": "^2.0.2",
+        "optionator": "^0.8.1",
+        "source-map": "~0.6.1"
+      },
+      "dependencies": {
+        "estraverse": {
+          "version": "5.3.0",
+          "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
+          "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
+          "dev": true
+        }
+      }
+    },
+    "eslint": {
+      "version": "6.8.0",
+      "resolved": "https://registry.npmjs.org/eslint/-/eslint-6.8.0.tgz",
+      "integrity": "sha512-K+Iayyo2LtyYhDSYwz5D5QdWw0hCacNzyq1Y821Xna2xSJj7cijoLLYmLxTQgcgZ9mC61nryMy9S7GRbYpI5Ig==",
+      "dev": true,
+      "requires": {
+        "@babel/code-frame": "^7.0.0",
+        "ajv": "^6.10.0",
+        "chalk": "^2.1.0",
+        "cross-spawn": "^6.0.5",
+        "debug": "^4.0.1",
+        "doctrine": "^3.0.0",
+        "eslint-scope": "^5.0.0",
+        "eslint-utils": "^1.4.3",
+        "eslint-visitor-keys": "^1.1.0",
+        "espree": "^6.1.2",
+        "esquery": "^1.0.1",
+        "esutils": "^2.0.2",
+        "file-entry-cache": "^5.0.1",
+        "functional-red-black-tree": "^1.0.1",
+        "glob-parent": "^5.0.0",
+        "globals": "^12.1.0",
+        "ignore": "^4.0.6",
+        "import-fresh": "^3.0.0",
+        "imurmurhash": "^0.1.4",
+        "inquirer": "^7.0.0",
+        "is-glob": "^4.0.0",
+        "js-yaml": "^3.13.1",
+        "json-stable-stringify-without-jsonify": "^1.0.1",
+        "levn": "^0.3.0",
+        "lodash": "^4.17.14",
+        "minimatch": "^3.0.4",
+        "mkdirp": "^0.5.1",
+        "natural-compare": "^1.4.0",
+        "optionator": "^0.8.3",
+        "progress": "^2.0.0",
+        "regexpp": "^2.0.1",
+        "semver": "^6.1.2",
+        "strip-ansi": "^5.2.0",
+        "strip-json-comments": "^3.0.1",
+        "table": "^5.2.3",
+        "text-table": "^0.2.0",
+        "v8-compile-cache": "^2.0.3"
+      },
+      "dependencies": {
+        "eslint-utils": {
+          "version": "1.4.3",
+          "resolved": "https://registry.npmjs.org/eslint-utils/-/eslint-utils-1.4.3.tgz",
+          "integrity": "sha512-fbBN5W2xdY45KulGXmLHZ3c3FHfVYmKg0IrAKGOkT/464PQsx2UeIzfz1RmEci+KLm1bBaAzZAh8+/E+XAeZ8Q==",
+          "dev": true,
+          "requires": {
+            "eslint-visitor-keys": "^1.1.0"
+          }
+        },
+        "eslint-visitor-keys": {
+          "version": "1.3.0",
+          "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz",
+          "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==",
+          "dev": true
+        },
+        "ignore": {
+          "version": "4.0.6",
+          "resolved": "https://registry.npmjs.org/ignore/-/ignore-4.0.6.tgz",
+          "integrity": "sha512-cyFDKrqc/YdcWFniJhzI42+AzS+gNwmUzOSFcRCQYwySuBBBy/KjuxWLZ/FHEH6Moq1NizMOBWyTcv8O4OZIMg==",
+          "dev": true
+        },
+        "regexpp": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/regexpp/-/regexpp-2.0.1.tgz",
+          "integrity": "sha512-lv0M6+TkDVniA3aD1Eg0DVpfU/booSu7Eev3TDO/mZKHBfVjgCGTV4t4buppESEYDtkArYFOxTJWv6S5C+iaNw==",
+          "dev": true
+        },
+        "semver": {
+          "version": "6.3.0",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
+          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==",
+          "dev": true
+        }
+      }
+    },
+    "eslint-scope": {
+      "version": "5.1.1",
+      "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-5.1.1.tgz",
+      "integrity": "sha512-2NxwbF/hZ0KpepYN0cNbo+FN6XoK7GaHlQhgx/hIZl6Va0bF45RQOOwhLIy8lQDbuCiadSLCBnH2CFYquit5bw==",
+      "dev": true,
+      "requires": {
+        "esrecurse": "^4.3.0",
+        "estraverse": "^4.1.1"
+      }
+    },
+    "eslint-utils": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/eslint-utils/-/eslint-utils-2.1.0.tgz",
+      "integrity": "sha512-w94dQYoauyvlDc43XnGB8lU3Zt713vNChgt4EWwhXAP2XkBvndfxF0AgIqKOOasjPIPzj9JqgwkwbCYD0/V3Zg==",
+      "dev": true,
+      "requires": {
+        "eslint-visitor-keys": "^1.1.0"
+      }
+    },
+    "eslint-visitor-keys": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz",
+      "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==",
+      "dev": true
+    },
+    "espree": {
+      "version": "6.2.1",
+      "resolved": "https://registry.npmjs.org/espree/-/espree-6.2.1.tgz",
+      "integrity": "sha512-ysCxRQY3WaXJz9tdbWOwuWr5Y/XrPTGX9Kiz3yoUXwW0VZ4w30HTkQLaGx/+ttFjF8i+ACbArnB4ce68a9m5hw==",
+      "dev": true,
+      "requires": {
+        "acorn": "^7.1.1",
+        "acorn-jsx": "^5.2.0",
+        "eslint-visitor-keys": "^1.1.0"
+      },
+      "dependencies": {
+        "eslint-visitor-keys": {
+          "version": "1.3.0",
+          "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz",
+          "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==",
+          "dev": true
+        }
+      }
+    },
+    "esprima": {
+      "version": "4.0.1",
+      "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz",
+      "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==",
+      "dev": true
+    },
+    "esquery": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/esquery/-/esquery-1.4.0.tgz",
+      "integrity": "sha512-cCDispWt5vHHtwMY2YrAQ4ibFkAL8RbH5YGBnZBc90MolvvfkkQcJro/aZiAQUlQ3qgrYS6D6v8Gc5G5CQsc9w==",
+      "dev": true,
+      "requires": {
+        "estraverse": "^5.1.0"
+      },
+      "dependencies": {
+        "estraverse": {
+          "version": "5.3.0",
+          "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
+          "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
+          "dev": true
+        }
+      }
+    },
+    "esrecurse": {
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz",
+      "integrity": "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==",
+      "dev": true,
+      "requires": {
+        "estraverse": "^5.2.0"
+      },
+      "dependencies": {
+        "estraverse": {
+          "version": "5.3.0",
+          "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
+          "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
+          "dev": true
+        }
+      }
+    },
+    "estraverse": {
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-4.3.0.tgz",
+      "integrity": "sha512-39nnKffWz8xN1BU/2c79n9nB9HDzo0niYUqx6xyqUnyoAnQyyWpOTdZEeiCch8BBu515t4wp9ZmgVfVhn9EBpw==",
+      "dev": true
+    },
+    "estree-walker": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-1.0.1.tgz",
+      "integrity": "sha512-1fMXF3YP4pZZVozF8j/ZLfvnR8NSIljt56UhbZ5PeeDmmGHpgpdwQt7ITlGvYaQukCvuBRMLEiKiYC+oeIg4cg==",
+      "dev": true
+    },
+    "esutils": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
+      "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==",
+      "dev": true
+    },
+    "exec-sh": {
+      "version": "0.3.6",
+      "resolved": "https://registry.npmjs.org/exec-sh/-/exec-sh-0.3.6.tgz",
+      "integrity": "sha512-nQn+hI3yp+oD0huYhKwvYI32+JFeq+XkNcD1GAo3Y/MjxsfVGmrrzrnzjWiNY6f+pUCP440fThsFh5gZrRAU/w==",
+      "dev": true
+    },
+    "execa": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/execa/-/execa-1.0.0.tgz",
+      "integrity": "sha512-adbxcyWV46qiHyvSp50TKt05tB4tK3HcmF7/nxfAdhnox83seTDbwnaqKO4sXRy7roHAIFqJP/Rw/AuEbX61LA==",
+      "dev": true,
+      "requires": {
+        "cross-spawn": "^6.0.0",
+        "get-stream": "^4.0.0",
+        "is-stream": "^1.1.0",
+        "npm-run-path": "^2.0.0",
+        "p-finally": "^1.0.0",
+        "signal-exit": "^3.0.0",
+        "strip-eof": "^1.0.0"
+      }
+    },
+    "exit": {
+      "version": "0.1.2",
+      "resolved": "https://registry.npmjs.org/exit/-/exit-0.1.2.tgz",
+      "integrity": "sha512-Zk/eNKV2zbjpKzrsQ+n1G6poVbErQxJ0LBOJXaKZ1EViLzH+hrLu9cdXI4zw9dBQJslwBEpbQ2P1oS7nDxs6jQ==",
+      "dev": true
+    },
+    "expand-brackets": {
+      "version": "2.1.4",
+      "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz",
+      "integrity": "sha512-w/ozOKR9Obk3qoWeY/WDi6MFta9AoMR+zud60mdnbniMcBxRuFJyDt2LdX/14A1UABeqk+Uk+LDfUpvoGKppZA==",
+      "dev": true,
+      "requires": {
+        "debug": "^2.3.3",
+        "define-property": "^0.2.5",
+        "extend-shallow": "^2.0.1",
+        "posix-character-classes": "^0.1.0",
+        "regex-not": "^1.0.0",
+        "snapdragon": "^0.8.1",
+        "to-regex": "^3.0.1"
+      },
+      "dependencies": {
+        "debug": {
+          "version": "2.6.9",
+          "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
+          "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
+          "dev": true,
+          "requires": {
+            "ms": "2.0.0"
+          }
+        },
+        "define-property": {
+          "version": "0.2.5",
+          "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz",
+          "integrity": "sha512-Rr7ADjQZenceVOAKop6ALkkRAmH1A4Gx9hV/7ZujPUN2rkATqFO0JZLZInbAjpZYoJ1gUx8MRMQVkYemcbMSTA==",
+          "dev": true,
+          "requires": {
+            "is-descriptor": "^0.1.0"
+          }
+        },
+        "extend-shallow": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz",
+          "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==",
+          "dev": true,
+          "requires": {
+            "is-extendable": "^0.1.0"
+          }
+        },
+        "ms": {
+          "version": "2.0.0",
+          "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
+          "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==",
+          "dev": true
+        }
+      }
+    },
+    "expect": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/expect/-/expect-26.6.2.tgz",
+      "integrity": "sha512-9/hlOBkQl2l/PLHJx6JjoDF6xPKcJEsUlWKb23rKE7KzeDqUZKXKNMW27KIue5JMdBV9HgmoJPcc8HtO85t9IA==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "ansi-styles": "^4.0.0",
+        "jest-get-type": "^26.3.0",
+        "jest-matcher-utils": "^26.6.2",
+        "jest-message-util": "^26.6.2",
+        "jest-regex-util": "^26.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        }
+      }
+    },
+    "extend-shallow": {
+      "version": "3.0.2",
+      "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-3.0.2.tgz",
+      "integrity": "sha512-BwY5b5Ql4+qZoefgMj2NUmx+tehVTH/Kf4k1ZEtOHNFcm2wSxMRo992l6X3TIgni2eZVTZ85xMOjF31fwZAj6Q==",
+      "dev": true,
+      "requires": {
+        "assign-symbols": "^1.0.0",
+        "is-extendable": "^1.0.1"
+      },
+      "dependencies": {
+        "is-extendable": {
+          "version": "1.0.1",
+          "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-1.0.1.tgz",
+          "integrity": "sha512-arnXMxT1hhoKo9k1LZdmlNyJdDDfy2v0fXjFlmok4+i8ul/6WlbVge9bhM74OpNPQPMGUToDtz+KXa1PneJxOA==",
+          "dev": true,
+          "requires": {
+            "is-plain-object": "^2.0.4"
+          }
+        }
+      }
+    },
+    "external-editor": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/external-editor/-/external-editor-3.1.0.tgz",
+      "integrity": "sha512-hMQ4CX1p1izmuLYyZqLMO/qGNw10wSv9QDCPfzXfyFrOaCSSoRfqE1Kf1s5an66J5JZC62NewG+mK49jOCtQew==",
+      "dev": true,
+      "requires": {
+        "chardet": "^0.7.0",
+        "iconv-lite": "^0.4.24",
+        "tmp": "^0.0.33"
+      }
+    },
+    "extglob": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz",
+      "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==",
+      "dev": true,
+      "requires": {
+        "array-unique": "^0.3.2",
+        "define-property": "^1.0.0",
+        "expand-brackets": "^2.1.4",
+        "extend-shallow": "^2.0.1",
+        "fragment-cache": "^0.2.1",
+        "regex-not": "^1.0.0",
+        "snapdragon": "^0.8.1",
+        "to-regex": "^3.0.1"
+      },
+      "dependencies": {
+        "define-property": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz",
+          "integrity": "sha512-cZTYKFWspt9jZsMscWo8sc/5lbPC9Q0N5nBLgb+Yd915iL3udB1uFgS3B8YCx66UVHq018DAVFoee7x+gxggeA==",
+          "dev": true,
+          "requires": {
+            "is-descriptor": "^1.0.0"
+          }
+        },
+        "extend-shallow": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz",
+          "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==",
+          "dev": true,
+          "requires": {
+            "is-extendable": "^0.1.0"
+          }
+        },
+        "is-accessor-descriptor": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz",
+          "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==",
+          "dev": true,
+          "requires": {
+            "kind-of": "^6.0.0"
+          }
+        },
+        "is-data-descriptor": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz",
+          "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==",
+          "dev": true,
+          "requires": {
+            "kind-of": "^6.0.0"
+          }
+        },
+        "is-descriptor": {
+          "version": "1.0.2",
+          "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz",
+          "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==",
+          "dev": true,
+          "requires": {
+            "is-accessor-descriptor": "^1.0.0",
+            "is-data-descriptor": "^1.0.0",
+            "kind-of": "^6.0.2"
+          }
+        }
+      }
+    },
+    "fast-deep-equal": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
+      "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==",
+      "dev": true
+    },
+    "fast-json-stable-stringify": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz",
+      "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==",
+      "dev": true
+    },
+    "fast-levenshtein": {
+      "version": "2.0.6",
+      "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz",
+      "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==",
+      "dev": true
+    },
+    "fb-watchman": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/fb-watchman/-/fb-watchman-2.0.2.tgz",
+      "integrity": "sha512-p5161BqbuCaSnB8jIbzQHOlpgsPmK5rJVDfDKO91Axs5NC1uu3HRQm6wt9cd9/+GtQQIO53JdGXXoyDpTAsgYA==",
+      "dev": true,
+      "requires": {
+        "bser": "2.1.1"
+      }
+    },
+    "figures": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/figures/-/figures-3.2.0.tgz",
+      "integrity": "sha512-yaduQFRKLXYOGgEn6AZau90j3ggSOyiqXU0F9JZfeXYhNa+Jk4X+s45A2zg5jns87GAFa34BBm2kXw4XpNcbdg==",
+      "dev": true,
+      "requires": {
+        "escape-string-regexp": "^1.0.5"
+      }
+    },
+    "file-entry-cache": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-5.0.1.tgz",
+      "integrity": "sha512-bCg29ictuBaKUwwArK4ouCaqDgLZcysCFLmM/Yn/FDoqndh/9vNuQfXRDvTuXKLxfD/JtZQGKFT8MGcJBK644g==",
+      "dev": true,
+      "requires": {
+        "flat-cache": "^2.0.1"
+      }
+    },
+    "fill-range": {
+      "version": "7.0.1",
+      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz",
+      "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==",
+      "dev": true,
+      "requires": {
+        "to-regex-range": "^5.0.1"
+      }
+    },
+    "find-cache-dir": {
+      "version": "3.3.2",
+      "resolved": "https://registry.npmjs.org/find-cache-dir/-/find-cache-dir-3.3.2.tgz",
+      "integrity": "sha512-wXZV5emFEjrridIgED11OoUKLxiYjAcqot/NJdAkOhlJ+vGzwhOAfcG5OX1jP+S0PcjEn8bdMJv+g2jwQ3Onig==",
+      "dev": true,
+      "requires": {
+        "commondir": "^1.0.1",
+        "make-dir": "^3.0.2",
+        "pkg-dir": "^4.1.0"
+      }
+    },
+    "find-up": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/find-up/-/find-up-4.1.0.tgz",
+      "integrity": "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==",
+      "dev": true,
+      "requires": {
+        "locate-path": "^5.0.0",
+        "path-exists": "^4.0.0"
+      }
+    },
+    "flat-cache": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-2.0.1.tgz",
+      "integrity": "sha512-LoQe6yDuUMDzQAEH8sgmh4Md6oZnc/7PjtwjNFSzveXqSHt6ka9fPBuso7IGf9Rz4uqnSnWiFH2B/zj24a5ReA==",
+      "dev": true,
+      "requires": {
+        "flatted": "^2.0.0",
+        "rimraf": "2.6.3",
+        "write": "1.0.3"
+      }
+    },
+    "flatted": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/flatted/-/flatted-2.0.2.tgz",
+      "integrity": "sha512-r5wGx7YeOwNWNlCA0wQ86zKyDLMQr+/RB8xy74M4hTphfmjlijTSSXGuH8rnvKZnfT9i+75zmd8jcKdMR4O6jA==",
+      "dev": true
+    },
+    "for-in": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz",
+      "integrity": "sha512-7EwmXrOjyL+ChxMhmG5lnW9MPt1aIeZEwKhQzoBUdTV0N3zuwWDZYVJatDvZ2OyzPUvdIAZDsCetk3coyMfcnQ==",
+      "dev": true
+    },
+    "form-data": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/form-data/-/form-data-3.0.1.tgz",
+      "integrity": "sha512-RHkBKtLWUVwd7SqRIvCZMEvAMoGUp0XU+seQiZejj0COz3RI3hWP4sCv3gZWWLjJTd7rGwcsF5eKZGii0r/hbg==",
+      "dev": true,
+      "requires": {
+        "asynckit": "^0.4.0",
+        "combined-stream": "^1.0.8",
+        "mime-types": "^2.1.12"
+      }
+    },
+    "fragment-cache": {
+      "version": "0.2.1",
+      "resolved": "https://registry.npmjs.org/fragment-cache/-/fragment-cache-0.2.1.tgz",
+      "integrity": "sha512-GMBAbW9antB8iZRHLoGw0b3HANt57diZYFO/HL1JGIC1MjKrdmhxvrJbupnVvpys0zsz7yBApXdQyfepKly2kA==",
+      "dev": true,
+      "requires": {
+        "map-cache": "^0.2.2"
+      }
+    },
+    "fs-extra": {
+      "version": "8.1.0",
+      "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-8.1.0.tgz",
+      "integrity": "sha512-yhlQgA6mnOJUKOsRUFsgJdQCvkKhcz8tlZG5HBQfReYZy46OwLcY+Zia0mtdHsOo9y/hP+CxMN0TU9QxoOtG4g==",
+      "dev": true,
+      "requires": {
+        "graceful-fs": "^4.2.0",
+        "jsonfile": "^4.0.0",
+        "universalify": "^0.1.0"
+      },
+      "dependencies": {
+        "universalify": {
+          "version": "0.1.2",
+          "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz",
+          "integrity": "sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg==",
+          "dev": true
+        }
+      }
+    },
+    "fs.realpath": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
+      "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==",
+      "dev": true
+    },
+    "fsevents": {
+      "version": "2.3.2",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
+      "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
+      "dev": true,
+      "optional": true
+    },
+    "function-bind": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz",
+      "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==",
+      "dev": true
+    },
+    "functional-red-black-tree": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/functional-red-black-tree/-/functional-red-black-tree-1.0.1.tgz",
+      "integrity": "sha512-dsKNQNdj6xA3T+QlADDA7mOSlX0qiMINjn0cgr+eGHGsbSHzTabcIogz2+p/iqP1Xs6EP/sS2SbqH+brGTbq0g==",
+      "dev": true
+    },
+    "gensync": {
+      "version": "1.0.0-beta.2",
+      "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz",
+      "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==",
+      "dev": true
+    },
+    "get-caller-file": {
+      "version": "2.0.5",
+      "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
+      "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
+      "dev": true
+    },
+    "get-package-type": {
+      "version": "0.1.0",
+      "resolved": "https://registry.npmjs.org/get-package-type/-/get-package-type-0.1.0.tgz",
+      "integrity": "sha512-pjzuKtY64GYfWizNAJ0fr9VqttZkNiK2iS430LtIHzjBEr6bX8Am2zm4sW4Ro5wjWW5cAlRL1qAMTcXbjNAO2Q==",
+      "dev": true
+    },
+    "get-stream": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz",
+      "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==",
+      "dev": true,
+      "requires": {
+        "pump": "^3.0.0"
+      }
+    },
+    "get-value": {
+      "version": "2.0.6",
+      "resolved": "https://registry.npmjs.org/get-value/-/get-value-2.0.6.tgz",
+      "integrity": "sha512-Ln0UQDlxH1BapMu3GPtf7CuYNwRZf2gwCuPqbyG6pB8WfmFpzqcy4xtAaAMUhnNqjMKTiCPZG2oMT3YSx8U2NA==",
+      "dev": true
+    },
+    "glob": {
+      "version": "7.2.3",
+      "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz",
+      "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==",
+      "dev": true,
+      "requires": {
+        "fs.realpath": "^1.0.0",
+        "inflight": "^1.0.4",
+        "inherits": "2",
+        "minimatch": "^3.1.1",
+        "once": "^1.3.0",
+        "path-is-absolute": "^1.0.0"
+      }
+    },
+    "glob-parent": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz",
+      "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==",
+      "dev": true,
+      "requires": {
+        "is-glob": "^4.0.1"
+      }
+    },
+    "globals": {
+      "version": "12.4.0",
+      "resolved": "https://registry.npmjs.org/globals/-/globals-12.4.0.tgz",
+      "integrity": "sha512-BWICuzzDvDoH54NHKCseDanAhE3CeDorgDL5MT6LMXXj2WCnd9UC2szdk4AWLfjdgNBCXLUanXYcpBBKOSWGwg==",
+      "dev": true,
+      "requires": {
+        "type-fest": "^0.8.1"
+      }
+    },
+    "graceful-fs": {
+      "version": "4.2.10",
+      "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.10.tgz",
+      "integrity": "sha512-9ByhssR2fPVsNZj478qUUbKfmL0+t5BDVyjShtyZZLiK7ZDAArFFfopyOTj0M05wE2tJPisA4iTnnXl2YoPvOA==",
+      "dev": true
+    },
+    "growly": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/growly/-/growly-1.3.0.tgz",
+      "integrity": "sha512-+xGQY0YyAWCnqy7Cd++hc2JqMYzlm0dG30Jd0beaA64sROr8C4nt8Yc9V5Ro3avlSUDTN0ulqP/VBKi1/lLygw==",
+      "dev": true,
+      "optional": true
+    },
+    "handlebars": {
+      "version": "4.7.7",
+      "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.7.tgz",
+      "integrity": "sha512-aAcXm5OAfE/8IXkcZvCepKU3VzW1/39Fb5ZuqMtgI/hT8X2YgoMvBY5dLhq/cpOvw7Lk1nK/UF71aLG/ZnVYRA==",
+      "dev": true,
+      "requires": {
+        "minimist": "^1.2.5",
+        "neo-async": "^2.6.0",
+        "source-map": "^0.6.1",
+        "uglify-js": "^3.1.4",
+        "wordwrap": "^1.0.0"
+      }
+    },
+    "has": {
+      "version": "1.0.3",
+      "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz",
+      "integrity": "sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw==",
+      "dev": true,
+      "requires": {
+        "function-bind": "^1.1.1"
+      }
+    },
+    "has-flag": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz",
+      "integrity": "sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw==",
+      "dev": true
+    },
+    "has-value": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/has-value/-/has-value-1.0.0.tgz",
+      "integrity": "sha512-IBXk4GTsLYdQ7Rvt+GRBrFSVEkmuOUy4re0Xjd9kJSUQpnTrWR4/y9RpfexN9vkAPMFuQoeWKwqzPozRTlasGw==",
+      "dev": true,
+      "requires": {
+        "get-value": "^2.0.6",
+        "has-values": "^1.0.0",
+        "isobject": "^3.0.0"
+      }
+    },
+    "has-values": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/has-values/-/has-values-1.0.0.tgz",
+      "integrity": "sha512-ODYZC64uqzmtfGMEAX/FvZiRyWLpAC3vYnNunURUnkGVTS+mI0smVsWaPydRBsE3g+ok7h960jChO8mFcWlHaQ==",
+      "dev": true,
+      "requires": {
+        "is-number": "^3.0.0",
+        "kind-of": "^4.0.0"
+      },
+      "dependencies": {
+        "is-number": {
+          "version": "3.0.0",
+          "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz",
+          "integrity": "sha512-4cboCqIpliH+mAvFNegjZQ4kgKc3ZUhQVr3HvWbSh5q3WH2v82ct+T2Y1hdU5Gdtorx/cLifQjqCbL7bpznLTg==",
+          "dev": true,
+          "requires": {
+            "kind-of": "^3.0.2"
+          },
+          "dependencies": {
+            "kind-of": {
+              "version": "3.2.2",
+              "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
+              "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==",
+              "dev": true,
+              "requires": {
+                "is-buffer": "^1.1.5"
+              }
+            }
+          }
+        },
+        "kind-of": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-4.0.0.tgz",
+          "integrity": "sha512-24XsCxmEbRwEDbz/qz3stgin8TTzZ1ESR56OMCN0ujYg+vRutNSiOj9bHH9u85DKgXguraugV5sFuvbD4FW/hw==",
+          "dev": true,
+          "requires": {
+            "is-buffer": "^1.1.5"
+          }
+        }
+      }
+    },
+    "highlight.js": {
+      "version": "10.7.3",
+      "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.7.3.tgz",
+      "integrity": "sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A==",
+      "dev": true
+    },
+    "hosted-git-info": {
+      "version": "2.8.9",
+      "resolved": "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-2.8.9.tgz",
+      "integrity": "sha512-mxIDAb9Lsm6DoOJ7xH+5+X4y1LU/4Hi50L9C5sIswK3JzULS4bwk1FvjdBgvYR4bzT4tuUQiC15FE2f5HbLvYw==",
+      "dev": true
+    },
+    "html-encoding-sniffer": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-2.0.1.tgz",
+      "integrity": "sha512-D5JbOMBIR/TVZkubHT+OyT2705QvogUW4IBn6nHd756OwieSF9aDYFj4dv6HHEVGYbHaLETa3WggZYWWMyy3ZQ==",
+      "dev": true,
+      "requires": {
+        "whatwg-encoding": "^1.0.5"
+      }
+    },
+    "html-escaper": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
+      "integrity": "sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==",
+      "dev": true
+    },
+    "http-proxy-agent": {
+      "version": "4.0.1",
+      "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-4.0.1.tgz",
+      "integrity": "sha512-k0zdNgqWTGA6aeIRVpvfVob4fL52dTfaehylg0Y4UvSySvOq/Y+BOyPrgpUrA7HylqvU8vIZGsRuXmspskV0Tg==",
+      "dev": true,
+      "requires": {
+        "@tootallnate/once": "1",
+        "agent-base": "6",
+        "debug": "4"
+      }
+    },
+    "https-proxy-agent": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
+      "integrity": "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==",
+      "dev": true,
+      "requires": {
+        "agent-base": "6",
+        "debug": "4"
+      }
+    },
+    "human-signals": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-1.1.1.tgz",
+      "integrity": "sha512-SEQu7vl8KjNL2eoGBLF3+wAjpsNfA9XMlXAYj/3EdaNfAlxKthD1xjEQfGOUhllCGGJVNY34bRr6lPINhNjyZw==",
+      "dev": true
+    },
+    "iconv-lite": {
+      "version": "0.4.24",
+      "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
+      "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==",
+      "dev": true,
+      "requires": {
+        "safer-buffer": ">= 2.1.2 < 3"
+      }
+    },
+    "import-fresh": {
+      "version": "3.3.0",
+      "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.0.tgz",
+      "integrity": "sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==",
+      "dev": true,
+      "requires": {
+        "parent-module": "^1.0.0",
+        "resolve-from": "^4.0.0"
+      }
+    },
+    "import-local": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/import-local/-/import-local-3.1.0.tgz",
+      "integrity": "sha512-ASB07uLtnDs1o6EHjKpX34BKYDSqnFerfTOJL2HvMqF70LnxpjkzDB8J44oT9pu4AMPkQwf8jl6szgvNd2tRIg==",
+      "dev": true,
+      "requires": {
+        "pkg-dir": "^4.2.0",
+        "resolve-cwd": "^3.0.0"
+      }
+    },
+    "imurmurhash": {
+      "version": "0.1.4",
+      "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz",
+      "integrity": "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==",
+      "dev": true
+    },
+    "inflight": {
+      "version": "1.0.6",
+      "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
+      "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==",
+      "dev": true,
+      "requires": {
+        "once": "^1.3.0",
+        "wrappy": "1"
+      }
+    },
+    "inherits": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
+      "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
+      "dev": true
+    },
+    "inquirer": {
+      "version": "7.3.3",
+      "resolved": "https://registry.npmjs.org/inquirer/-/inquirer-7.3.3.tgz",
+      "integrity": "sha512-JG3eIAj5V9CwcGvuOmoo6LB9kbAYT8HXffUl6memuszlwDC/qvFAJw49XJ5NROSFNPxp3iQg1GqkFhaY/CR0IA==",
+      "dev": true,
+      "requires": {
+        "ansi-escapes": "^4.2.1",
+        "chalk": "^4.1.0",
+        "cli-cursor": "^3.1.0",
+        "cli-width": "^3.0.0",
+        "external-editor": "^3.0.3",
+        "figures": "^3.0.0",
+        "lodash": "^4.17.19",
+        "mute-stream": "0.0.8",
+        "run-async": "^2.4.0",
+        "rxjs": "^6.6.0",
+        "string-width": "^4.1.0",
+        "strip-ansi": "^6.0.0",
+        "through": "^2.3.6"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "strip-ansi": {
+          "version": "6.0.1",
+          "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+          "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+          "dev": true,
+          "requires": {
+            "ansi-regex": "^5.0.1"
+          }
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "interpret": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/interpret/-/interpret-1.4.0.tgz",
+      "integrity": "sha512-agE4QfB2Lkp9uICn7BAqoscw4SZP9kTE2hxiFI3jBPmXJfdqiahTbUuKGsMoN2GtqL9AxhYioAcVvgsb1HvRbA==",
+      "dev": true
+    },
+    "is-accessor-descriptor": {
+      "version": "0.1.6",
+      "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz",
+      "integrity": "sha512-e1BM1qnDbMRG3ll2U9dSK0UMHuWOs3pY3AtcFsmvwPtKL3MML/Q86i+GilLfvqEs4GW+ExB91tQ3Ig9noDIZ+A==",
+      "dev": true,
+      "requires": {
+        "kind-of": "^3.0.2"
+      },
+      "dependencies": {
+        "kind-of": {
+          "version": "3.2.2",
+          "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
+          "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==",
+          "dev": true,
+          "requires": {
+            "is-buffer": "^1.1.5"
+          }
+        }
+      }
+    },
+    "is-arrayish": {
+      "version": "0.2.1",
+      "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz",
+      "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==",
+      "dev": true
+    },
+    "is-buffer": {
+      "version": "1.1.6",
+      "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
+      "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==",
+      "dev": true
+    },
+    "is-ci": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/is-ci/-/is-ci-2.0.0.tgz",
+      "integrity": "sha512-YfJT7rkpQB0updsdHLGWrvhBJfcfzNNawYDNIyQXJz0IViGf75O8EBPKSdvw2rF+LGCsX4FZ8tcr3b19LcZq4w==",
+      "dev": true,
+      "requires": {
+        "ci-info": "^2.0.0"
+      }
+    },
+    "is-core-module": {
+      "version": "2.11.0",
+      "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.11.0.tgz",
+      "integrity": "sha512-RRjxlvLDkD1YJwDbroBHMb+cukurkDWNyHx7D3oNB5x9rb5ogcksMC5wHCadcXoo67gVr/+3GFySh3134zi6rw==",
+      "dev": true,
+      "requires": {
+        "has": "^1.0.3"
+      }
+    },
+    "is-data-descriptor": {
+      "version": "0.1.4",
+      "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz",
+      "integrity": "sha512-+w9D5ulSoBNlmw9OHn3U2v51SyoCd0he+bB3xMl62oijhrspxowjU+AIcDY0N3iEJbUEkB15IlMASQsxYigvXg==",
+      "dev": true,
+      "requires": {
+        "kind-of": "^3.0.2"
+      },
+      "dependencies": {
+        "kind-of": {
+          "version": "3.2.2",
+          "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
+          "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==",
+          "dev": true,
+          "requires": {
+            "is-buffer": "^1.1.5"
+          }
+        }
+      }
+    },
+    "is-descriptor": {
+      "version": "0.1.6",
+      "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz",
+      "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==",
+      "dev": true,
+      "requires": {
+        "is-accessor-descriptor": "^0.1.6",
+        "is-data-descriptor": "^0.1.4",
+        "kind-of": "^5.0.0"
+      },
+      "dependencies": {
+        "kind-of": {
+          "version": "5.1.0",
+          "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz",
+          "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==",
+          "dev": true
+        }
+      }
+    },
+    "is-docker": {
+      "version": "2.2.1",
+      "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-2.2.1.tgz",
+      "integrity": "sha512-F+i2BKsFrH66iaUFc0woD8sLy8getkwTwtOBjvs56Cx4CgJDeKQeqfz8wAYiSb8JOprWhHH5p77PbmYCvvUuXQ==",
+      "dev": true,
+      "optional": true
+    },
+    "is-extendable": {
+      "version": "0.1.1",
+      "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz",
+      "integrity": "sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw==",
+      "dev": true
+    },
+    "is-extglob": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz",
+      "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==",
+      "dev": true
+    },
+    "is-fullwidth-code-point": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
+      "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
+      "dev": true
+    },
+    "is-generator-fn": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/is-generator-fn/-/is-generator-fn-2.1.0.tgz",
+      "integrity": "sha512-cTIB4yPYL/Grw0EaSzASzg6bBy9gqCofvWN8okThAYIxKJZC+udlRAmGbM0XLeniEJSs8uEgHPGuHSe1XsOLSQ==",
+      "dev": true
+    },
+    "is-glob": {
+      "version": "4.0.3",
+      "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz",
+      "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==",
+      "dev": true,
+      "requires": {
+        "is-extglob": "^2.1.1"
+      }
+    },
+    "is-module": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/is-module/-/is-module-1.0.0.tgz",
+      "integrity": "sha512-51ypPSPCoTEIN9dy5Oy+h4pShgJmPCygKfyRCISBI+JoWT/2oJvK8QPxmwv7b/p239jXrm9M1mlQbyKJ5A152g==",
+      "dev": true
+    },
+    "is-number": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz",
+      "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==",
+      "dev": true
+    },
+    "is-plain-object": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz",
+      "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==",
+      "dev": true,
+      "requires": {
+        "isobject": "^3.0.1"
+      }
+    },
+    "is-potential-custom-element-name": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
+      "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==",
+      "dev": true
+    },
+    "is-reference": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/is-reference/-/is-reference-1.2.1.tgz",
+      "integrity": "sha512-U82MsXXiFIrjCK4otLT+o2NA2Cd2g5MLoOVXUZjIOhLurrRxpEXzI8O0KZHr3IjLvlAH1kTPYSuqer5T9ZVBKQ==",
+      "dev": true,
+      "requires": {
+        "@types/estree": "*"
+      }
+    },
+    "is-stream": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz",
+      "integrity": "sha512-uQPm8kcs47jx38atAcWTVxyltQYoPT68y9aWYdV6yWXSyW8mzSat0TL6CiWdZeCdF3KrAvpVtnHbTv4RN+rqdQ==",
+      "dev": true
+    },
+    "is-typedarray": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz",
+      "integrity": "sha512-cyA56iCMHAh5CdzjJIa4aohJyeO1YbwLi3Jc35MmRU6poroFjIGZzUzupGiRPOjgHg9TLu43xbpwXk523fMxKA==",
+      "dev": true
+    },
+    "is-windows": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/is-windows/-/is-windows-1.0.2.tgz",
+      "integrity": "sha512-eXK1UInq2bPmjyX6e3VHIzMLobc4J94i4AWn+Hpq3OU5KkrRC96OAcR3PRJ/pGu6m8TRnBHP9dkXQVsT/COVIA==",
+      "dev": true
+    },
+    "is-wsl": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-2.2.0.tgz",
+      "integrity": "sha512-fKzAra0rGJUUBwGBgNkHZuToZcn+TtXHpeCgmkMJMMYx1sQDYaCSyjJBSCa2nH1DGm7s3n1oBnohoVTBaN7Lww==",
+      "dev": true,
+      "optional": true,
+      "requires": {
+        "is-docker": "^2.0.0"
+      }
+    },
+    "isarray": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
+      "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==",
+      "dev": true
+    },
+    "isexe": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",
+      "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
+      "dev": true
+    },
+    "isobject": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz",
+      "integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==",
+      "dev": true
+    },
+    "istanbul-lib-coverage": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.0.tgz",
+      "integrity": "sha512-eOeJ5BHCmHYvQK7xt9GkdHuzuCGS1Y6g9Gvnx3Ym33fz/HpLRYxiS0wHNr+m/MBC8B647Xt608vCDEvhl9c6Mw==",
+      "dev": true
+    },
+    "istanbul-lib-instrument": {
+      "version": "4.0.3",
+      "resolved": "https://registry.npmjs.org/istanbul-lib-instrument/-/istanbul-lib-instrument-4.0.3.tgz",
+      "integrity": "sha512-BXgQl9kf4WTCPCCpmFGoJkz/+uhvm7h7PFKUYxh7qarQd3ER33vHG//qaE8eN25l07YqZPpHXU9I09l/RD5aGQ==",
+      "dev": true,
+      "requires": {
+        "@babel/core": "^7.7.5",
+        "@istanbuljs/schema": "^0.1.2",
+        "istanbul-lib-coverage": "^3.0.0",
+        "semver": "^6.3.0"
+      },
+      "dependencies": {
+        "semver": {
+          "version": "6.3.0",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
+          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==",
+          "dev": true
+        }
+      }
+    },
+    "istanbul-lib-report": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/istanbul-lib-report/-/istanbul-lib-report-3.0.0.tgz",
+      "integrity": "sha512-wcdi+uAKzfiGT2abPpKZ0hSU1rGQjUQnLvtY5MpQ7QCTahD3VODhcu4wcfY1YtkGaDD5yuydOLINXsfbus9ROw==",
+      "dev": true,
+      "requires": {
+        "istanbul-lib-coverage": "^3.0.0",
+        "make-dir": "^3.0.0",
+        "supports-color": "^7.1.0"
+      },
+      "dependencies": {
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "istanbul-lib-source-maps": {
+      "version": "4.0.1",
+      "resolved": "https://registry.npmjs.org/istanbul-lib-source-maps/-/istanbul-lib-source-maps-4.0.1.tgz",
+      "integrity": "sha512-n3s8EwkdFIJCG3BPKBYvskgXGoy88ARzvegkitk60NxRdwltLOTaH7CUiMRXvwYorl0Q712iEjcWB+fK/MrWVw==",
+      "dev": true,
+      "requires": {
+        "debug": "^4.1.1",
+        "istanbul-lib-coverage": "^3.0.0",
+        "source-map": "^0.6.1"
+      }
+    },
+    "istanbul-reports": {
+      "version": "3.1.5",
+      "resolved": "https://registry.npmjs.org/istanbul-reports/-/istanbul-reports-3.1.5.tgz",
+      "integrity": "sha512-nUsEMa9pBt/NOHqbcbeJEgqIlY/K7rVWUX6Lql2orY5e9roQOthbR3vtY4zzf2orPELg80fnxxk9zUyPlgwD1w==",
+      "dev": true,
+      "requires": {
+        "html-escaper": "^2.0.0",
+        "istanbul-lib-report": "^3.0.0"
+      }
+    },
+    "jest": {
+      "version": "26.6.3",
+      "resolved": "https://registry.npmjs.org/jest/-/jest-26.6.3.tgz",
+      "integrity": "sha512-lGS5PXGAzR4RF7V5+XObhqz2KZIDUA1yD0DG6pBVmy10eh0ZIXQImRuzocsI/N2XZ1GrLFwTS27In2i2jlpq1Q==",
+      "dev": true,
+      "requires": {
+        "@jest/core": "^26.6.3",
+        "import-local": "^3.0.2",
+        "jest-cli": "^26.6.3"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "jest-cli": {
+          "version": "26.6.3",
+          "resolved": "https://registry.npmjs.org/jest-cli/-/jest-cli-26.6.3.tgz",
+          "integrity": "sha512-GF9noBSa9t08pSyl3CY4frMrqp+aQXFGFkf5hEPbh/pIUFYWMK6ZLTfbmadxJVcJrdRoChlWQsA2VkJcDFK8hg==",
+          "dev": true,
+          "requires": {
+            "@jest/core": "^26.6.3",
+            "@jest/test-result": "^26.6.2",
+            "@jest/types": "^26.6.2",
+            "chalk": "^4.0.0",
+            "exit": "^0.1.2",
+            "graceful-fs": "^4.2.4",
+            "import-local": "^3.0.2",
+            "is-ci": "^2.0.0",
+            "jest-config": "^26.6.3",
+            "jest-util": "^26.6.2",
+            "jest-validate": "^26.6.2",
+            "prompts": "^2.0.1",
+            "yargs": "^15.4.1"
+          }
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-changed-files": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-changed-files/-/jest-changed-files-26.6.2.tgz",
+      "integrity": "sha512-fDS7szLcY9sCtIip8Fjry9oGf3I2ht/QT21bAHm5Dmf0mD4X3ReNUf17y+bO6fR8WgbIZTlbyG1ak/53cbRzKQ==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "execa": "^4.0.0",
+        "throat": "^5.0.0"
+      },
+      "dependencies": {
+        "cross-spawn": {
+          "version": "7.0.3",
+          "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
+          "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+          "dev": true,
+          "requires": {
+            "path-key": "^3.1.0",
+            "shebang-command": "^2.0.0",
+            "which": "^2.0.1"
+          }
+        },
+        "execa": {
+          "version": "4.1.0",
+          "resolved": "https://registry.npmjs.org/execa/-/execa-4.1.0.tgz",
+          "integrity": "sha512-j5W0//W7f8UxAn8hXVnwG8tLwdiUy4FJLcSupCg6maBYZDpyBvTApK7KyuI4bKj8KOh1r2YH+6ucuYtJv1bTZA==",
+          "dev": true,
+          "requires": {
+            "cross-spawn": "^7.0.0",
+            "get-stream": "^5.0.0",
+            "human-signals": "^1.1.1",
+            "is-stream": "^2.0.0",
+            "merge-stream": "^2.0.0",
+            "npm-run-path": "^4.0.0",
+            "onetime": "^5.1.0",
+            "signal-exit": "^3.0.2",
+            "strip-final-newline": "^2.0.0"
+          }
+        },
+        "get-stream": {
+          "version": "5.2.0",
+          "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
+          "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==",
+          "dev": true,
+          "requires": {
+            "pump": "^3.0.0"
+          }
+        },
+        "is-stream": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz",
+          "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==",
+          "dev": true
+        },
+        "npm-run-path": {
+          "version": "4.0.1",
+          "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz",
+          "integrity": "sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==",
+          "dev": true,
+          "requires": {
+            "path-key": "^3.0.0"
+          }
+        },
+        "path-key": {
+          "version": "3.1.1",
+          "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz",
+          "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==",
+          "dev": true
+        },
+        "shebang-command": {
+          "version": "2.0.0",
+          "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
+          "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==",
+          "dev": true,
+          "requires": {
+            "shebang-regex": "^3.0.0"
+          }
+        },
+        "shebang-regex": {
+          "version": "3.0.0",
+          "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz",
+          "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==",
+          "dev": true
+        },
+        "which": {
+          "version": "2.0.2",
+          "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
+          "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==",
+          "dev": true,
+          "requires": {
+            "isexe": "^2.0.0"
+          }
+        }
+      }
+    },
+    "jest-config": {
+      "version": "26.6.3",
+      "resolved": "https://registry.npmjs.org/jest-config/-/jest-config-26.6.3.tgz",
+      "integrity": "sha512-t5qdIj/bCj2j7NFVHb2nFB4aUdfucDn3JRKgrZnplb8nieAirAzRSHP8uDEd+qV6ygzg9Pz4YG7UTJf94LPSyg==",
+      "dev": true,
+      "requires": {
+        "@babel/core": "^7.1.0",
+        "@jest/test-sequencer": "^26.6.3",
+        "@jest/types": "^26.6.2",
+        "babel-jest": "^26.6.3",
+        "chalk": "^4.0.0",
+        "deepmerge": "^4.2.2",
+        "glob": "^7.1.1",
+        "graceful-fs": "^4.2.4",
+        "jest-environment-jsdom": "^26.6.2",
+        "jest-environment-node": "^26.6.2",
+        "jest-get-type": "^26.3.0",
+        "jest-jasmine2": "^26.6.3",
+        "jest-regex-util": "^26.0.0",
+        "jest-resolve": "^26.6.2",
+        "jest-util": "^26.6.2",
+        "jest-validate": "^26.6.2",
+        "micromatch": "^4.0.2",
+        "pretty-format": "^26.6.2"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-diff": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-diff/-/jest-diff-26.6.2.tgz",
+      "integrity": "sha512-6m+9Z3Gv9wN0WFVasqjCL/06+EFCMTqDEUl/b87HYK2rAPTyfz4ZIuSlPhY51PIQRWx5TaxeF1qmXKe9gfN3sA==",
+      "dev": true,
+      "requires": {
+        "chalk": "^4.0.0",
+        "diff-sequences": "^26.6.2",
+        "jest-get-type": "^26.3.0",
+        "pretty-format": "^26.6.2"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-docblock": {
+      "version": "26.0.0",
+      "resolved": "https://registry.npmjs.org/jest-docblock/-/jest-docblock-26.0.0.tgz",
+      "integrity": "sha512-RDZ4Iz3QbtRWycd8bUEPxQsTlYazfYn/h5R65Fc6gOfwozFhoImx+affzky/FFBuqISPTqjXomoIGJVKBWoo0w==",
+      "dev": true,
+      "requires": {
+        "detect-newline": "^3.0.0"
+      }
+    },
+    "jest-each": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-each/-/jest-each-26.6.2.tgz",
+      "integrity": "sha512-Mer/f0KaATbjl8MCJ+0GEpNdqmnVmDYqCTJYTvoo7rqmRiDllmp2AYN+06F93nXcY3ur9ShIjS+CO/uD+BbH4A==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "chalk": "^4.0.0",
+        "jest-get-type": "^26.3.0",
+        "jest-util": "^26.6.2",
+        "pretty-format": "^26.6.2"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-environment-jsdom": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-environment-jsdom/-/jest-environment-jsdom-26.6.2.tgz",
+      "integrity": "sha512-jgPqCruTlt3Kwqg5/WVFyHIOJHsiAvhcp2qiR2QQstuG9yWox5+iHpU3ZrcBxW14T4fe5Z68jAfLRh7joCSP2Q==",
+      "dev": true,
+      "requires": {
+        "@jest/environment": "^26.6.2",
+        "@jest/fake-timers": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "@types/node": "*",
+        "jest-mock": "^26.6.2",
+        "jest-util": "^26.6.2",
+        "jsdom": "^16.4.0"
+      }
+    },
+    "jest-environment-node": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-environment-node/-/jest-environment-node-26.6.2.tgz",
+      "integrity": "sha512-zhtMio3Exty18dy8ee8eJ9kjnRyZC1N4C1Nt/VShN1apyXc8rWGtJ9lI7vqiWcyyXS4BVSEn9lxAM2D+07/Tag==",
+      "dev": true,
+      "requires": {
+        "@jest/environment": "^26.6.2",
+        "@jest/fake-timers": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "@types/node": "*",
+        "jest-mock": "^26.6.2",
+        "jest-util": "^26.6.2"
+      }
+    },
+    "jest-get-type": {
+      "version": "26.3.0",
+      "resolved": "https://registry.npmjs.org/jest-get-type/-/jest-get-type-26.3.0.tgz",
+      "integrity": "sha512-TpfaviN1R2pQWkIihlfEanwOXK0zcxrKEE4MlU6Tn7keoXdN6/3gK/xl0yEh8DOunn5pOVGKf8hB4R9gVh04ig==",
+      "dev": true
+    },
+    "jest-haste-map": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-haste-map/-/jest-haste-map-26.6.2.tgz",
+      "integrity": "sha512-easWIJXIw71B2RdR8kgqpjQrbMRWQBgiBwXYEhtGUTaX+doCjBheluShdDMeR8IMfJiTqH4+zfhtg29apJf/8w==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "@types/graceful-fs": "^4.1.2",
+        "@types/node": "*",
+        "anymatch": "^3.0.3",
+        "fb-watchman": "^2.0.0",
+        "fsevents": "^2.1.2",
+        "graceful-fs": "^4.2.4",
+        "jest-regex-util": "^26.0.0",
+        "jest-serializer": "^26.6.2",
+        "jest-util": "^26.6.2",
+        "jest-worker": "^26.6.2",
+        "micromatch": "^4.0.2",
+        "sane": "^4.0.3",
+        "walker": "^1.0.7"
+      }
+    },
+    "jest-jasmine2": {
+      "version": "26.6.3",
+      "resolved": "https://registry.npmjs.org/jest-jasmine2/-/jest-jasmine2-26.6.3.tgz",
+      "integrity": "sha512-kPKUrQtc8aYwBV7CqBg5pu+tmYXlvFlSFYn18ev4gPFtrRzB15N2gW/Roew3187q2w2eHuu0MU9TJz6w0/nPEg==",
+      "dev": true,
+      "requires": {
+        "@babel/traverse": "^7.1.0",
+        "@jest/environment": "^26.6.2",
+        "@jest/source-map": "^26.6.2",
+        "@jest/test-result": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "@types/node": "*",
+        "chalk": "^4.0.0",
+        "co": "^4.6.0",
+        "expect": "^26.6.2",
+        "is-generator-fn": "^2.0.0",
+        "jest-each": "^26.6.2",
+        "jest-matcher-utils": "^26.6.2",
+        "jest-message-util": "^26.6.2",
+        "jest-runtime": "^26.6.3",
+        "jest-snapshot": "^26.6.2",
+        "jest-util": "^26.6.2",
+        "pretty-format": "^26.6.2",
+        "throat": "^5.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-leak-detector": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-leak-detector/-/jest-leak-detector-26.6.2.tgz",
+      "integrity": "sha512-i4xlXpsVSMeKvg2cEKdfhh0H39qlJlP5Ex1yQxwF9ubahboQYMgTtz5oML35AVA3B4Eu+YsmwaiKVev9KCvLxg==",
+      "dev": true,
+      "requires": {
+        "jest-get-type": "^26.3.0",
+        "pretty-format": "^26.6.2"
+      }
+    },
+    "jest-matcher-utils": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-matcher-utils/-/jest-matcher-utils-26.6.2.tgz",
+      "integrity": "sha512-llnc8vQgYcNqDrqRDXWwMr9i7rS5XFiCwvh6DTP7Jqa2mqpcCBBlpCbn+trkG0KNhPu/h8rzyBkriOtBstvWhw==",
+      "dev": true,
+      "requires": {
+        "chalk": "^4.0.0",
+        "jest-diff": "^26.6.2",
+        "jest-get-type": "^26.3.0",
+        "pretty-format": "^26.6.2"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-message-util": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-message-util/-/jest-message-util-26.6.2.tgz",
+      "integrity": "sha512-rGiLePzQ3AzwUshu2+Rn+UMFk0pHN58sOG+IaJbk5Jxuqo3NYO1U2/MIR4S1sKgsoYSXSzdtSa0TgrmtUwEbmA==",
+      "dev": true,
+      "requires": {
+        "@babel/code-frame": "^7.0.0",
+        "@jest/types": "^26.6.2",
+        "@types/stack-utils": "^2.0.0",
+        "chalk": "^4.0.0",
+        "graceful-fs": "^4.2.4",
+        "micromatch": "^4.0.2",
+        "pretty-format": "^26.6.2",
+        "slash": "^3.0.0",
+        "stack-utils": "^2.0.2"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-mock": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-mock/-/jest-mock-26.6.2.tgz",
+      "integrity": "sha512-YyFjePHHp1LzpzYcmgqkJ0nm0gg/lJx2aZFzFy1S6eUqNjXsOqTK10zNRff2dNfssgokjkG65OlWNcIlgd3zew==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "@types/node": "*"
+      }
+    },
+    "jest-pnp-resolver": {
+      "version": "1.2.3",
+      "resolved": "https://registry.npmjs.org/jest-pnp-resolver/-/jest-pnp-resolver-1.2.3.tgz",
+      "integrity": "sha512-+3NpwQEnRoIBtx4fyhblQDPgJI0H1IEIkX7ShLUjPGA7TtUTvI1oiKi3SR4oBR0hQhQR80l4WAe5RrXBwWMA8w==",
+      "dev": true
+    },
+    "jest-regex-util": {
+      "version": "26.0.0",
+      "resolved": "https://registry.npmjs.org/jest-regex-util/-/jest-regex-util-26.0.0.tgz",
+      "integrity": "sha512-Gv3ZIs/nA48/Zvjrl34bf+oD76JHiGDUxNOVgUjh3j890sblXryjY4rss71fPtD/njchl6PSE2hIhvyWa1eT0A==",
+      "dev": true
+    },
+    "jest-resolve": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-resolve/-/jest-resolve-26.6.2.tgz",
+      "integrity": "sha512-sOxsZOq25mT1wRsfHcbtkInS+Ek7Q8jCHUB0ZUTP0tc/c41QHriU/NunqMfCUWsL4H3MHpvQD4QR9kSYhS7UvQ==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "chalk": "^4.0.0",
+        "graceful-fs": "^4.2.4",
+        "jest-pnp-resolver": "^1.2.2",
+        "jest-util": "^26.6.2",
+        "read-pkg-up": "^7.0.1",
+        "resolve": "^1.18.1",
+        "slash": "^3.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-resolve-dependencies": {
+      "version": "26.6.3",
+      "resolved": "https://registry.npmjs.org/jest-resolve-dependencies/-/jest-resolve-dependencies-26.6.3.tgz",
+      "integrity": "sha512-pVwUjJkxbhe4RY8QEWzN3vns2kqyuldKpxlxJlzEYfKSvY6/bMvxoFrYYzUO1Gx28yKWN37qyV7rIoIp2h8fTg==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "jest-regex-util": "^26.0.0",
+        "jest-snapshot": "^26.6.2"
+      }
+    },
+    "jest-runner": {
+      "version": "26.6.3",
+      "resolved": "https://registry.npmjs.org/jest-runner/-/jest-runner-26.6.3.tgz",
+      "integrity": "sha512-atgKpRHnaA2OvByG/HpGA4g6CSPS/1LK0jK3gATJAoptC1ojltpmVlYC3TYgdmGp+GLuhzpH30Gvs36szSL2JQ==",
+      "dev": true,
+      "requires": {
+        "@jest/console": "^26.6.2",
+        "@jest/environment": "^26.6.2",
+        "@jest/test-result": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "@types/node": "*",
+        "chalk": "^4.0.0",
+        "emittery": "^0.7.1",
+        "exit": "^0.1.2",
+        "graceful-fs": "^4.2.4",
+        "jest-config": "^26.6.3",
+        "jest-docblock": "^26.0.0",
+        "jest-haste-map": "^26.6.2",
+        "jest-leak-detector": "^26.6.2",
+        "jest-message-util": "^26.6.2",
+        "jest-resolve": "^26.6.2",
+        "jest-runtime": "^26.6.3",
+        "jest-util": "^26.6.2",
+        "jest-worker": "^26.6.2",
+        "source-map-support": "^0.5.6",
+        "throat": "^5.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-runtime": {
+      "version": "26.6.3",
+      "resolved": "https://registry.npmjs.org/jest-runtime/-/jest-runtime-26.6.3.tgz",
+      "integrity": "sha512-lrzyR3N8sacTAMeonbqpnSka1dHNux2uk0qqDXVkMv2c/A3wYnvQ4EXuI013Y6+gSKSCxdaczvf4HF0mVXHRdw==",
+      "dev": true,
+      "requires": {
+        "@jest/console": "^26.6.2",
+        "@jest/environment": "^26.6.2",
+        "@jest/fake-timers": "^26.6.2",
+        "@jest/globals": "^26.6.2",
+        "@jest/source-map": "^26.6.2",
+        "@jest/test-result": "^26.6.2",
+        "@jest/transform": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "@types/yargs": "^15.0.0",
+        "chalk": "^4.0.0",
+        "cjs-module-lexer": "^0.6.0",
+        "collect-v8-coverage": "^1.0.0",
+        "exit": "^0.1.2",
+        "glob": "^7.1.3",
+        "graceful-fs": "^4.2.4",
+        "jest-config": "^26.6.3",
+        "jest-haste-map": "^26.6.2",
+        "jest-message-util": "^26.6.2",
+        "jest-mock": "^26.6.2",
+        "jest-regex-util": "^26.0.0",
+        "jest-resolve": "^26.6.2",
+        "jest-snapshot": "^26.6.2",
+        "jest-util": "^26.6.2",
+        "jest-validate": "^26.6.2",
+        "slash": "^3.0.0",
+        "strip-bom": "^4.0.0",
+        "yargs": "^15.4.1"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-serializer": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-serializer/-/jest-serializer-26.6.2.tgz",
+      "integrity": "sha512-S5wqyz0DXnNJPd/xfIzZ5Xnp1HrJWBczg8mMfMpN78OJ5eDxXyf+Ygld9wX1DnUWbIbhM1YDY95NjR4CBXkb2g==",
+      "dev": true,
+      "requires": {
+        "@types/node": "*",
+        "graceful-fs": "^4.2.4"
+      }
+    },
+    "jest-snapshot": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-snapshot/-/jest-snapshot-26.6.2.tgz",
+      "integrity": "sha512-OLhxz05EzUtsAmOMzuupt1lHYXCNib0ECyuZ/PZOx9TrZcC8vL0x+DUG3TL+GLX3yHG45e6YGjIm0XwDc3q3og==",
+      "dev": true,
+      "requires": {
+        "@babel/types": "^7.0.0",
+        "@jest/types": "^26.6.2",
+        "@types/babel__traverse": "^7.0.4",
+        "@types/prettier": "^2.0.0",
+        "chalk": "^4.0.0",
+        "expect": "^26.6.2",
+        "graceful-fs": "^4.2.4",
+        "jest-diff": "^26.6.2",
+        "jest-get-type": "^26.3.0",
+        "jest-haste-map": "^26.6.2",
+        "jest-matcher-utils": "^26.6.2",
+        "jest-message-util": "^26.6.2",
+        "jest-resolve": "^26.6.2",
+        "natural-compare": "^1.4.0",
+        "pretty-format": "^26.6.2",
+        "semver": "^7.3.2"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-util": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-util/-/jest-util-26.6.2.tgz",
+      "integrity": "sha512-MDW0fKfsn0OI7MS7Euz6h8HNDXVQ0gaM9uW6RjfDmd1DAFcaxX9OqIakHIqhbnmF08Cf2DLDG+ulq8YQQ0Lp0Q==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "@types/node": "*",
+        "chalk": "^4.0.0",
+        "graceful-fs": "^4.2.4",
+        "is-ci": "^2.0.0",
+        "micromatch": "^4.0.2"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-validate": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-validate/-/jest-validate-26.6.2.tgz",
+      "integrity": "sha512-NEYZ9Aeyj0i5rQqbq+tpIOom0YS1u2MVu6+euBsvpgIme+FOfRmoC4R5p0JiAUpaFvFy24xgrpMknarR/93XjQ==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "camelcase": "^6.0.0",
+        "chalk": "^4.0.0",
+        "jest-get-type": "^26.3.0",
+        "leven": "^3.1.0",
+        "pretty-format": "^26.6.2"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "camelcase": {
+          "version": "6.3.0",
+          "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz",
+          "integrity": "sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==",
+          "dev": true
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-watcher": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-watcher/-/jest-watcher-26.6.2.tgz",
+      "integrity": "sha512-WKJob0P/Em2csiVthsI68p6aGKTIcsfjH9Gsx1f0A3Italz43e3ho0geSAVsmj09RWOELP1AZ/DXyJgOgDKxXQ==",
+      "dev": true,
+      "requires": {
+        "@jest/test-result": "^26.6.2",
+        "@jest/types": "^26.6.2",
+        "@types/node": "*",
+        "ansi-escapes": "^4.2.1",
+        "chalk": "^4.0.0",
+        "jest-util": "^26.6.2",
+        "string-length": "^4.0.1"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "jest-worker": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/jest-worker/-/jest-worker-26.6.2.tgz",
+      "integrity": "sha512-KWYVV1c4i+jbMpaBC+U++4Va0cp8OisU185o73T1vo99hqi7w8tSJfUXYswwqqrjzwxa6KpRK54WhPvwf5w6PQ==",
+      "dev": true,
+      "requires": {
+        "@types/node": "*",
+        "merge-stream": "^2.0.0",
+        "supports-color": "^7.0.0"
+      },
+      "dependencies": {
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "js-tokens": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
+      "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
+      "dev": true
+    },
+    "js-yaml": {
+      "version": "3.14.1",
+      "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.14.1.tgz",
+      "integrity": "sha512-okMH7OXXJ7YrN9Ok3/SXrnu4iX9yOk+25nqX4imS2npuvTYDmo/QEZoqwZkYaIDk3jVvBOTOIEgEhaLOynBS9g==",
+      "dev": true,
+      "requires": {
+        "argparse": "^1.0.7",
+        "esprima": "^4.0.0"
+      }
+    },
+    "jsdom": {
+      "version": "16.7.0",
+      "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.7.0.tgz",
+      "integrity": "sha512-u9Smc2G1USStM+s/x1ru5Sxrl6mPYCbByG1U/hUmqaVsm4tbNyS7CicOSRyuGQYZhTu0h84qkZZQ/I+dzizSVw==",
+      "dev": true,
+      "requires": {
+        "abab": "^2.0.5",
+        "acorn": "^8.2.4",
+        "acorn-globals": "^6.0.0",
+        "cssom": "^0.4.4",
+        "cssstyle": "^2.3.0",
+        "data-urls": "^2.0.0",
+        "decimal.js": "^10.2.1",
+        "domexception": "^2.0.1",
+        "escodegen": "^2.0.0",
+        "form-data": "^3.0.0",
+        "html-encoding-sniffer": "^2.0.1",
+        "http-proxy-agent": "^4.0.1",
+        "https-proxy-agent": "^5.0.0",
+        "is-potential-custom-element-name": "^1.0.1",
+        "nwsapi": "^2.2.0",
+        "parse5": "6.0.1",
+        "saxes": "^5.0.1",
+        "symbol-tree": "^3.2.4",
+        "tough-cookie": "^4.0.0",
+        "w3c-hr-time": "^1.0.2",
+        "w3c-xmlserializer": "^2.0.0",
+        "webidl-conversions": "^6.1.0",
+        "whatwg-encoding": "^1.0.5",
+        "whatwg-mimetype": "^2.3.0",
+        "whatwg-url": "^8.5.0",
+        "ws": "^7.4.6",
+        "xml-name-validator": "^3.0.0"
+      },
+      "dependencies": {
+        "acorn": {
+          "version": "8.8.1",
+          "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.8.1.tgz",
+          "integrity": "sha512-7zFpHzhnqYKrkYdUjF1HI1bzd0VygEGX8lFk4k5zVMqHEoES+P+7TKI+EvLO9WVMJ8eekdO0aDEK044xTXwPPA==",
+          "dev": true
+        }
+      }
+    },
+    "jsesc": {
+      "version": "2.5.2",
+      "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz",
+      "integrity": "sha512-OYu7XEzjkCQ3C5Ps3QIZsQfNpqoJyZZA99wd9aWd05NCtC5pWOkShK2mkL6HXQR6/Cy2lbNdPlZBpuQHXE63gA==",
+      "dev": true
+    },
+    "json-parse-even-better-errors": {
+      "version": "2.3.1",
+      "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz",
+      "integrity": "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==",
+      "dev": true
+    },
+    "json-schema-traverse": {
+      "version": "0.4.1",
+      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
+      "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==",
+      "dev": true
+    },
+    "json-stable-stringify-without-jsonify": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz",
+      "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==",
+      "dev": true
+    },
+    "json5": {
+      "version": "2.2.1",
+      "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.1.tgz",
+      "integrity": "sha512-1hqLFMSrGHRHxav9q9gNjJ5EXznIxGVO09xQRrwplcS8qs28pZ8s8hupZAmqDwZUmVZ2Qb2jnyPOWcDH8m8dlA==",
+      "dev": true
+    },
+    "jsonfile": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-4.0.0.tgz",
+      "integrity": "sha512-m6F1R3z8jjlf2imQHS2Qez5sjKWQzbuuhuJ/FKYFRZvPE3PuHcSMVZzfsLhGVOkfd20obL5SWEBew5ShlquNxg==",
+      "dev": true,
+      "requires": {
+        "graceful-fs": "^4.1.6"
+      }
+    },
+    "kind-of": {
+      "version": "6.0.3",
+      "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.3.tgz",
+      "integrity": "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==",
+      "dev": true
+    },
+    "kleur": {
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/kleur/-/kleur-3.0.3.tgz",
+      "integrity": "sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==",
+      "dev": true
+    },
+    "leven": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz",
+      "integrity": "sha512-qsda+H8jTaUaN/x5vzW2rzc+8Rw4TAQ/4KjB46IwK5VH+IlVeeeje/EoZRpiXvIqjFgK84QffqPztGI3VBLG1A==",
+      "dev": true
+    },
+    "levn": {
+      "version": "0.3.0",
+      "resolved": "https://registry.npmjs.org/levn/-/levn-0.3.0.tgz",
+      "integrity": "sha512-0OO4y2iOHix2W6ujICbKIaEQXvFQHue65vUG3pb5EUomzPI90z9hsA1VsO/dbIIpC53J8gxM9Q4Oho0jrCM/yA==",
+      "dev": true,
+      "requires": {
+        "prelude-ls": "~1.1.2",
+        "type-check": "~0.3.2"
+      }
+    },
+    "lines-and-columns": {
+      "version": "1.2.4",
+      "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz",
+      "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==",
+      "dev": true
+    },
+    "locate-path": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz",
+      "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==",
+      "dev": true,
+      "requires": {
+        "p-locate": "^4.1.0"
+      }
+    },
+    "lodash": {
+      "version": "4.17.21",
+      "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
+      "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==",
+      "dev": true
+    },
+    "lru-cache": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz",
+      "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==",
+      "dev": true,
+      "requires": {
+        "yallist": "^4.0.0"
+      }
+    },
+    "lunr": {
+      "version": "2.3.9",
+      "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.3.9.tgz",
+      "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==",
+      "dev": true
+    },
+    "magic-string": {
+      "version": "0.25.9",
+      "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.25.9.tgz",
+      "integrity": "sha512-RmF0AsMzgt25qzqqLc1+MbHmhdx0ojF2Fvs4XnOqz2ZOBXzzkEwc/dJQZCYHAn7v1jbVOjAZfK8msRn4BxO4VQ==",
+      "dev": true,
+      "requires": {
+        "sourcemap-codec": "^1.4.8"
+      }
+    },
+    "make-dir": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-3.1.0.tgz",
+      "integrity": "sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==",
+      "dev": true,
+      "requires": {
+        "semver": "^6.0.0"
+      },
+      "dependencies": {
+        "semver": {
+          "version": "6.3.0",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
+          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==",
+          "dev": true
+        }
+      }
+    },
+    "makeerror": {
+      "version": "1.0.12",
+      "resolved": "https://registry.npmjs.org/makeerror/-/makeerror-1.0.12.tgz",
+      "integrity": "sha512-JmqCvUhmt43madlpFzG4BQzG2Z3m6tvQDNKdClZnO3VbIudJYmxsT0FNJMeiB2+JTSlTQTSbU8QdesVmwJcmLg==",
+      "dev": true,
+      "requires": {
+        "tmpl": "1.0.5"
+      }
+    },
+    "map-cache": {
+      "version": "0.2.2",
+      "resolved": "https://registry.npmjs.org/map-cache/-/map-cache-0.2.2.tgz",
+      "integrity": "sha512-8y/eV9QQZCiyn1SprXSrCmqJN0yNRATe+PO8ztwqrvrbdRLA3eYJF0yaR0YayLWkMbsQSKWS9N2gPcGEc4UsZg==",
+      "dev": true
+    },
+    "map-visit": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/map-visit/-/map-visit-1.0.0.tgz",
+      "integrity": "sha512-4y7uGv8bd2WdM9vpQsiQNo41Ln1NvhvDRuVt0k2JZQ+ezN2uaQes7lZeZ+QQUHOLQAtDaBJ+7wCbi+ab/KFs+w==",
+      "dev": true,
+      "requires": {
+        "object-visit": "^1.0.0"
+      }
+    },
+    "marked": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/marked/-/marked-1.0.0.tgz",
+      "integrity": "sha512-Wo+L1pWTVibfrSr+TTtMuiMfNzmZWiOPeO7rZsQUY5bgsxpHesBEcIWJloWVTFnrMXnf/TL30eTFSGJddmQAng==",
+      "dev": true
+    },
+    "merge-stream": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz",
+      "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==",
+      "dev": true
+    },
+    "micromatch": {
+      "version": "4.0.5",
+      "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.5.tgz",
+      "integrity": "sha512-DMy+ERcEW2q8Z2Po+WNXuw3c5YaUSFjAO5GsJqfEl7UjvtIuFKO6ZrKvcItdy98dwFI2N1tg3zNIdKaQT+aNdA==",
+      "dev": true,
+      "requires": {
+        "braces": "^3.0.2",
+        "picomatch": "^2.3.1"
+      }
+    },
+    "mime-db": {
+      "version": "1.52.0",
+      "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
+      "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
+      "dev": true
+    },
+    "mime-types": {
+      "version": "2.1.35",
+      "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
+      "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
+      "dev": true,
+      "requires": {
+        "mime-db": "1.52.0"
+      }
+    },
+    "mimic-fn": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz",
+      "integrity": "sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==",
+      "dev": true
+    },
+    "minimatch": {
+      "version": "3.1.2",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
+      "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==",
+      "dev": true,
+      "requires": {
+        "brace-expansion": "^1.1.7"
+      }
+    },
+    "minimist": {
+      "version": "1.2.7",
+      "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.7.tgz",
+      "integrity": "sha512-bzfL1YUZsP41gmu/qjrEk0Q6i2ix/cVeAhbCbqH9u3zYutS1cLg00qhrD0M2MVdCcx4Sc0UpP2eBWo9rotpq6g==",
+      "dev": true
+    },
+    "mixin-deep": {
+      "version": "1.3.2",
+      "resolved": "https://registry.npmjs.org/mixin-deep/-/mixin-deep-1.3.2.tgz",
+      "integrity": "sha512-WRoDn//mXBiJ1H40rqa3vH0toePwSsGb45iInWlTySa+Uu4k3tYUSxa2v1KqAiLtvlrSzaExqS1gtk96A9zvEA==",
+      "dev": true,
+      "requires": {
+        "for-in": "^1.0.2",
+        "is-extendable": "^1.0.1"
+      },
+      "dependencies": {
+        "is-extendable": {
+          "version": "1.0.1",
+          "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-1.0.1.tgz",
+          "integrity": "sha512-arnXMxT1hhoKo9k1LZdmlNyJdDDfy2v0fXjFlmok4+i8ul/6WlbVge9bhM74OpNPQPMGUToDtz+KXa1PneJxOA==",
+          "dev": true,
+          "requires": {
+            "is-plain-object": "^2.0.4"
+          }
+        }
+      }
+    },
+    "mkdirp": {
+      "version": "0.5.6",
+      "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz",
+      "integrity": "sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==",
+      "dev": true,
+      "requires": {
+        "minimist": "^1.2.6"
+      }
+    },
+    "ms": {
+      "version": "2.1.2",
+      "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
+      "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==",
+      "dev": true
+    },
+    "mute-stream": {
+      "version": "0.0.8",
+      "resolved": "https://registry.npmjs.org/mute-stream/-/mute-stream-0.0.8.tgz",
+      "integrity": "sha512-nnbWWOkoWyUsTjKrhgD0dcz22mdkSnpYqbEjIm2nhwhuxlSkpywJmBo8h0ZqJdkp73mb90SssHkN4rsRaBAfAA==",
+      "dev": true
+    },
+    "nanomatch": {
+      "version": "1.2.13",
+      "resolved": "https://registry.npmjs.org/nanomatch/-/nanomatch-1.2.13.tgz",
+      "integrity": "sha512-fpoe2T0RbHwBTBUOftAfBPaDEi06ufaUai0mE6Yn1kacc3SnTErfb/h+X94VXzI64rKFHYImXSvdwGGCmwOqCA==",
+      "dev": true,
+      "requires": {
+        "arr-diff": "^4.0.0",
+        "array-unique": "^0.3.2",
+        "define-property": "^2.0.2",
+        "extend-shallow": "^3.0.2",
+        "fragment-cache": "^0.2.1",
+        "is-windows": "^1.0.2",
+        "kind-of": "^6.0.2",
+        "object.pick": "^1.3.0",
+        "regex-not": "^1.0.0",
+        "snapdragon": "^0.8.1",
+        "to-regex": "^3.0.1"
+      }
+    },
+    "natural-compare": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz",
+      "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==",
+      "dev": true
+    },
+    "neo-async": {
+      "version": "2.6.2",
+      "resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.2.tgz",
+      "integrity": "sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==",
+      "dev": true
+    },
+    "nice-try": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/nice-try/-/nice-try-1.0.5.tgz",
+      "integrity": "sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ==",
+      "dev": true
+    },
+    "node-int64": {
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz",
+      "integrity": "sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==",
+      "dev": true
+    },
+    "node-notifier": {
+      "version": "8.0.2",
+      "resolved": "https://registry.npmjs.org/node-notifier/-/node-notifier-8.0.2.tgz",
+      "integrity": "sha512-oJP/9NAdd9+x2Q+rfphB2RJCHjod70RcRLjosiPMMu5gjIfwVnOUGq2nbTjTUbmy0DJ/tFIVT30+Qe3nzl4TJg==",
+      "dev": true,
+      "optional": true,
+      "requires": {
+        "growly": "^1.3.0",
+        "is-wsl": "^2.2.0",
+        "semver": "^7.3.2",
+        "shellwords": "^0.1.1",
+        "uuid": "^8.3.0",
+        "which": "^2.0.2"
+      },
+      "dependencies": {
+        "which": {
+          "version": "2.0.2",
+          "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
+          "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==",
+          "dev": true,
+          "optional": true,
+          "requires": {
+            "isexe": "^2.0.0"
+          }
+        }
+      }
+    },
+    "node-releases": {
+      "version": "2.0.6",
+      "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.6.tgz",
+      "integrity": "sha512-PiVXnNuFm5+iYkLBNeq5211hvO38y63T0i2KKh2KnUs3RpzJ+JtODFjkD8yjLwnDkTYF1eKXheUwdssR+NRZdg==",
+      "dev": true
+    },
+    "normalize-package-data": {
+      "version": "2.5.0",
+      "resolved": "https://registry.npmjs.org/normalize-package-data/-/normalize-package-data-2.5.0.tgz",
+      "integrity": "sha512-/5CMN3T0R4XTj4DcGaexo+roZSdSFW/0AOOTROrjxzCG1wrWXEsGbRKevjlIL+ZDE4sZlJr5ED4YW0yqmkK+eA==",
+      "dev": true,
+      "requires": {
+        "hosted-git-info": "^2.1.4",
+        "resolve": "^1.10.0",
+        "semver": "2 || 3 || 4 || 5",
+        "validate-npm-package-license": "^3.0.1"
+      },
+      "dependencies": {
+        "semver": {
+          "version": "5.7.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
+          "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==",
+          "dev": true
+        }
+      }
+    },
+    "normalize-path": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz",
+      "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==",
+      "dev": true
+    },
+    "npm-run-path": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-2.0.2.tgz",
+      "integrity": "sha512-lJxZYlT4DW/bRUtFh1MQIWqmLwQfAxnqWG4HhEdjMlkrJYnJn0Jrr2u3mgxqaWsdiBc76TYkTG/mhrnYTuzfHw==",
+      "dev": true,
+      "requires": {
+        "path-key": "^2.0.0"
+      }
+    },
+    "nwsapi": {
+      "version": "2.2.2",
+      "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.2.tgz",
+      "integrity": "sha512-90yv+6538zuvUMnN+zCr8LuV6bPFdq50304114vJYJ8RDyK8D5O9Phpbd6SZWgI7PwzmmfN1upeOJlvybDSgCw==",
+      "dev": true
+    },
+    "object-copy": {
+      "version": "0.1.0",
+      "resolved": "https://registry.npmjs.org/object-copy/-/object-copy-0.1.0.tgz",
+      "integrity": "sha512-79LYn6VAb63zgtmAteVOWo9Vdj71ZVBy3Pbse+VqxDpEP83XuujMrGqHIwAXJ5I/aM0zU7dIyIAhifVTPrNItQ==",
+      "dev": true,
+      "requires": {
+        "copy-descriptor": "^0.1.0",
+        "define-property": "^0.2.5",
+        "kind-of": "^3.0.3"
+      },
+      "dependencies": {
+        "define-property": {
+          "version": "0.2.5",
+          "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz",
+          "integrity": "sha512-Rr7ADjQZenceVOAKop6ALkkRAmH1A4Gx9hV/7ZujPUN2rkATqFO0JZLZInbAjpZYoJ1gUx8MRMQVkYemcbMSTA==",
+          "dev": true,
+          "requires": {
+            "is-descriptor": "^0.1.0"
+          }
+        },
+        "kind-of": {
+          "version": "3.2.2",
+          "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
+          "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==",
+          "dev": true,
+          "requires": {
+            "is-buffer": "^1.1.5"
+          }
+        }
+      }
+    },
+    "object-visit": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/object-visit/-/object-visit-1.0.1.tgz",
+      "integrity": "sha512-GBaMwwAVK9qbQN3Scdo0OyvgPW7l3lnaVMj84uTOZlswkX0KpF6fyDBJhtTthf7pymztoN36/KEr1DyhF96zEA==",
+      "dev": true,
+      "requires": {
+        "isobject": "^3.0.0"
+      }
+    },
+    "object.pick": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/object.pick/-/object.pick-1.3.0.tgz",
+      "integrity": "sha512-tqa/UMy/CCoYmj+H5qc07qvSL9dqcs/WZENZ1JbtWBlATP+iVOe778gE6MSijnyCnORzDuX6hU+LA4SZ09YjFQ==",
+      "dev": true,
+      "requires": {
+        "isobject": "^3.0.1"
+      }
+    },
+    "once": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
+      "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
+      "dev": true,
+      "requires": {
+        "wrappy": "1"
+      }
+    },
+    "onetime": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz",
+      "integrity": "sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==",
+      "dev": true,
+      "requires": {
+        "mimic-fn": "^2.1.0"
+      }
+    },
+    "optionator": {
+      "version": "0.8.3",
+      "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.8.3.tgz",
+      "integrity": "sha512-+IW9pACdk3XWmmTXG8m3upGUJst5XRGzxMRjXzAuJ1XnIFNvfhjjIuYkDvysnPQ7qzqVzLt78BCruntqRhWQbA==",
+      "dev": true,
+      "requires": {
+        "deep-is": "~0.1.3",
+        "fast-levenshtein": "~2.0.6",
+        "levn": "~0.3.0",
+        "prelude-ls": "~1.1.2",
+        "type-check": "~0.3.2",
+        "word-wrap": "~1.2.3"
+      }
+    },
+    "os-tmpdir": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz",
+      "integrity": "sha512-D2FR03Vir7FIu45XBY20mTb+/ZSWB00sjU9jdQXt83gDrI4Ztz5Fs7/yy74g2N5SVQY4xY1qDr4rNddwYRVX0g==",
+      "dev": true
+    },
+    "p-each-series": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/p-each-series/-/p-each-series-2.2.0.tgz",
+      "integrity": "sha512-ycIL2+1V32th+8scbpTvyHNaHe02z0sjgh91XXjAk+ZeXoPN4Z46DVUnzdso0aX4KckKw0FNNFHdjZ2UsZvxiA==",
+      "dev": true
+    },
+    "p-finally": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz",
+      "integrity": "sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==",
+      "dev": true
+    },
+    "p-limit": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz",
+      "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==",
+      "dev": true,
+      "requires": {
+        "p-try": "^2.0.0"
+      }
+    },
+    "p-locate": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-4.1.0.tgz",
+      "integrity": "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==",
+      "dev": true,
+      "requires": {
+        "p-limit": "^2.2.0"
+      }
+    },
+    "p-try": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz",
+      "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==",
+      "dev": true
+    },
+    "parent-module": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz",
+      "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==",
+      "dev": true,
+      "requires": {
+        "callsites": "^3.0.0"
+      }
+    },
+    "parse-json": {
+      "version": "5.2.0",
+      "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz",
+      "integrity": "sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==",
+      "dev": true,
+      "requires": {
+        "@babel/code-frame": "^7.0.0",
+        "error-ex": "^1.3.1",
+        "json-parse-even-better-errors": "^2.3.0",
+        "lines-and-columns": "^1.1.6"
+      }
+    },
+    "parse5": {
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz",
+      "integrity": "sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw==",
+      "dev": true
+    },
+    "pascalcase": {
+      "version": "0.1.1",
+      "resolved": "https://registry.npmjs.org/pascalcase/-/pascalcase-0.1.1.tgz",
+      "integrity": "sha512-XHXfu/yOQRy9vYOtUDVMN60OEJjW013GoObG1o+xwQTpB9eYJX/BjXMsdW13ZDPruFhYYn0AG22w0xgQMwl3Nw==",
+      "dev": true
+    },
+    "path-exists": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz",
+      "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==",
+      "dev": true
+    },
+    "path-is-absolute": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
+      "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==",
+      "dev": true
+    },
+    "path-key": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/path-key/-/path-key-2.0.1.tgz",
+      "integrity": "sha512-fEHGKCSmUSDPv4uoj8AlD+joPlq3peND+HRYyxFz4KPw4z926S/b8rIuFs2FYJg3BwsxJf6A9/3eIdLaYC+9Dw==",
+      "dev": true
+    },
+    "path-parse": {
+      "version": "1.0.7",
+      "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz",
+      "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==",
+      "dev": true
+    },
+    "picocolors": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz",
+      "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==",
+      "dev": true
+    },
+    "picomatch": {
+      "version": "2.3.1",
+      "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz",
+      "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==",
+      "dev": true
+    },
+    "pirates": {
+      "version": "4.0.5",
+      "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.5.tgz",
+      "integrity": "sha512-8V9+HQPupnaXMA23c5hvl69zXvTwTzyAYasnkb0Tts4XvO4CliqONMOnvlq26rkhLC3nWDFBJf73LU1e1VZLaQ==",
+      "dev": true
+    },
+    "pkg-dir": {
+      "version": "4.2.0",
+      "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-4.2.0.tgz",
+      "integrity": "sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==",
+      "dev": true,
+      "requires": {
+        "find-up": "^4.0.0"
+      }
+    },
+    "posix-character-classes": {
+      "version": "0.1.1",
+      "resolved": "https://registry.npmjs.org/posix-character-classes/-/posix-character-classes-0.1.1.tgz",
+      "integrity": "sha512-xTgYBc3fuo7Yt7JbiuFxSYGToMoz8fLoE6TC9Wx1P/u+LfeThMOAqmuyECnlBaaJb+u1m9hHiXUEtwW4OzfUJg==",
+      "dev": true
+    },
+    "prelude-ls": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.1.2.tgz",
+      "integrity": "sha512-ESF23V4SKG6lVSGZgYNpbsiaAkdab6ZgOxe52p7+Kid3W3u3bxR4Vfd/o21dmN7jSt0IwgZ4v5MUd26FEtXE9w==",
+      "dev": true
+    },
+    "pretty-format": {
+      "version": "26.6.2",
+      "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-26.6.2.tgz",
+      "integrity": "sha512-7AeGuCYNGmycyQbCqd/3PWH4eOoX/OiCa0uphp57NVTeAGdJGaAliecxwBDHYQCIvrW7aDBZCYeNTP/WX69mkg==",
+      "dev": true,
+      "requires": {
+        "@jest/types": "^26.6.2",
+        "ansi-regex": "^5.0.0",
+        "ansi-styles": "^4.0.0",
+        "react-is": "^17.0.1"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        }
+      }
+    },
+    "progress": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz",
+      "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==",
+      "dev": true
+    },
+    "prompts": {
+      "version": "2.4.2",
+      "resolved": "https://registry.npmjs.org/prompts/-/prompts-2.4.2.tgz",
+      "integrity": "sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q==",
+      "dev": true,
+      "requires": {
+        "kleur": "^3.0.3",
+        "sisteransi": "^1.0.5"
+      }
+    },
+    "psl": {
+      "version": "1.9.0",
+      "resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz",
+      "integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag==",
+      "dev": true
+    },
+    "pump": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz",
+      "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==",
+      "dev": true,
+      "requires": {
+        "end-of-stream": "^1.1.0",
+        "once": "^1.3.1"
+      }
+    },
+    "punycode": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz",
+      "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==",
+      "dev": true
+    },
+    "querystringify": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.2.0.tgz",
+      "integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ==",
+      "dev": true
+    },
+    "react-is": {
+      "version": "17.0.2",
+      "resolved": "https://registry.npmjs.org/react-is/-/react-is-17.0.2.tgz",
+      "integrity": "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==",
+      "dev": true
+    },
+    "read-pkg": {
+      "version": "5.2.0",
+      "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-5.2.0.tgz",
+      "integrity": "sha512-Ug69mNOpfvKDAc2Q8DRpMjjzdtrnv9HcSMX+4VsZxD1aZ6ZzrIE7rlzXBtWTyhULSMKg076AW6WR5iZpD0JiOg==",
+      "dev": true,
+      "requires": {
+        "@types/normalize-package-data": "^2.4.0",
+        "normalize-package-data": "^2.5.0",
+        "parse-json": "^5.0.0",
+        "type-fest": "^0.6.0"
+      },
+      "dependencies": {
+        "type-fest": {
+          "version": "0.6.0",
+          "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.6.0.tgz",
+          "integrity": "sha512-q+MB8nYR1KDLrgr4G5yemftpMC7/QLqVndBmEEdqzmNj5dcFOO4Oo8qlwZE3ULT3+Zim1F8Kq4cBnikNhlCMlg==",
+          "dev": true
+        }
+      }
+    },
+    "read-pkg-up": {
+      "version": "7.0.1",
+      "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-7.0.1.tgz",
+      "integrity": "sha512-zK0TB7Xd6JpCLmlLmufqykGE+/TlOePD6qKClNW7hHDKFh/J7/7gCWGR7joEQEW1bKq3a3yUZSObOoWLFQ4ohg==",
+      "dev": true,
+      "requires": {
+        "find-up": "^4.1.0",
+        "read-pkg": "^5.2.0",
+        "type-fest": "^0.8.1"
+      }
+    },
+    "rechoir": {
+      "version": "0.6.2",
+      "resolved": "https://registry.npmjs.org/rechoir/-/rechoir-0.6.2.tgz",
+      "integrity": "sha512-HFM8rkZ+i3zrV+4LQjwQ0W+ez98pApMGM3HUrN04j3CqzPOzl9nmP15Y8YXNm8QHGv/eacOVEjqhmWpkRV0NAw==",
+      "dev": true,
+      "requires": {
+        "resolve": "^1.1.6"
+      }
+    },
+    "regex-not": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/regex-not/-/regex-not-1.0.2.tgz",
+      "integrity": "sha512-J6SDjUgDxQj5NusnOtdFxDwN/+HWykR8GELwctJ7mdqhcyy1xEc4SRFHUXvxTp661YaVKAjfRLZ9cCqS6tn32A==",
+      "dev": true,
+      "requires": {
+        "extend-shallow": "^3.0.2",
+        "safe-regex": "^1.1.0"
+      }
+    },
+    "regexpp": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/regexpp/-/regexpp-3.2.0.tgz",
+      "integrity": "sha512-pq2bWo9mVD43nbts2wGv17XLiNLya+GklZ8kaDLV2Z08gDCsGpnKn9BFMepvWuHCbyVvY7J5o5+BVvoQbmlJLg==",
+      "dev": true
+    },
+    "remove-trailing-separator": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/remove-trailing-separator/-/remove-trailing-separator-1.1.0.tgz",
+      "integrity": "sha512-/hS+Y0u3aOfIETiaiirUFwDBDzmXPvO+jAfKTitUngIPzdKc6Z0LoFjM/CK5PL4C+eKwHohlHAb6H0VFfmmUsw==",
+      "dev": true
+    },
+    "repeat-element": {
+      "version": "1.1.4",
+      "resolved": "https://registry.npmjs.org/repeat-element/-/repeat-element-1.1.4.tgz",
+      "integrity": "sha512-LFiNfRcSu7KK3evMyYOuCzv3L10TW7yC1G2/+StMjK8Y6Vqd2MG7r/Qjw4ghtuCOjFvlnms/iMmLqpvW/ES/WQ==",
+      "dev": true
+    },
+    "repeat-string": {
+      "version": "1.6.1",
+      "resolved": "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz",
+      "integrity": "sha512-PV0dzCYDNfRi1jCDbJzpW7jNNDRuCOG/jI5ctQcGKt/clZD+YcPS3yIlWuTJMmESC8aevCFmWJy5wjAFgNqN6w==",
+      "dev": true
+    },
+    "require-directory": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
+      "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
+      "dev": true
+    },
+    "require-main-filename": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/require-main-filename/-/require-main-filename-2.0.0.tgz",
+      "integrity": "sha512-NKN5kMDylKuldxYLSUfrbo5Tuzh4hd+2E8NPPX02mZtn1VuREQToYe/ZdlJy+J3uCpfaiGF05e7B8W0iXbQHmg==",
+      "dev": true
+    },
+    "requires-port": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz",
+      "integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==",
+      "dev": true
+    },
+    "resolve": {
+      "version": "1.22.1",
+      "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.1.tgz",
+      "integrity": "sha512-nBpuuYuY5jFsli/JIs1oldw6fOQCBioohqWZg/2hiaOybXOft4lonv85uDOKXdf8rhyK159cxU5cDcK/NKk8zw==",
+      "dev": true,
+      "requires": {
+        "is-core-module": "^2.9.0",
+        "path-parse": "^1.0.7",
+        "supports-preserve-symlinks-flag": "^1.0.0"
+      }
+    },
+    "resolve-cwd": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/resolve-cwd/-/resolve-cwd-3.0.0.tgz",
+      "integrity": "sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==",
+      "dev": true,
+      "requires": {
+        "resolve-from": "^5.0.0"
+      },
+      "dependencies": {
+        "resolve-from": {
+          "version": "5.0.0",
+          "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-5.0.0.tgz",
+          "integrity": "sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==",
+          "dev": true
+        }
+      }
+    },
+    "resolve-from": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz",
+      "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==",
+      "dev": true
+    },
+    "resolve-url": {
+      "version": "0.2.1",
+      "resolved": "https://registry.npmjs.org/resolve-url/-/resolve-url-0.2.1.tgz",
+      "integrity": "sha512-ZuF55hVUQaaczgOIwqWzkEcEidmlD/xl44x1UZnhOXcYuFN2S6+rcxpG+C1N3So0wvNI3DmJICUFfu2SxhBmvg==",
+      "dev": true
+    },
+    "restore-cursor": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-3.1.0.tgz",
+      "integrity": "sha512-l+sSefzHpj5qimhFSE5a8nufZYAM3sBSVMAPtYkmC+4EH2anSGaEMXSD0izRQbu9nfyQ9y5JrVmp7E8oZrUjvA==",
+      "dev": true,
+      "requires": {
+        "onetime": "^5.1.0",
+        "signal-exit": "^3.0.2"
+      }
+    },
+    "ret": {
+      "version": "0.1.15",
+      "resolved": "https://registry.npmjs.org/ret/-/ret-0.1.15.tgz",
+      "integrity": "sha512-TTlYpa+OL+vMMNG24xSlQGEJ3B/RzEfUlLct7b5G/ytav+wPrplCpVMFuwzXbkecJrb6IYo1iFb0S9v37754mg==",
+      "dev": true
+    },
+    "rimraf": {
+      "version": "2.6.3",
+      "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.3.tgz",
+      "integrity": "sha512-mwqeW5XsA2qAejG46gYdENaxXjx9onRNCfn7L0duuP4hCuTIi/QO7PDK07KJfp1d+izWPrzEJDcSqBa0OZQriA==",
+      "dev": true,
+      "requires": {
+        "glob": "^7.1.3"
+      }
+    },
+    "rollup": {
+      "version": "2.79.1",
+      "resolved": "https://registry.npmjs.org/rollup/-/rollup-2.79.1.tgz",
+      "integrity": "sha512-uKxbd0IhMZOhjAiD5oAFp7BqvkA4Dv47qpOCtaNvng4HBwdbWtdOh8f5nZNuk2rp51PMGk3bzfWu5oayNEuYnw==",
+      "dev": true,
+      "requires": {
+        "fsevents": "~2.3.2"
+      }
+    },
+    "rollup-plugin-typescript2": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/rollup-plugin-typescript2/-/rollup-plugin-typescript2-0.27.3.tgz",
+      "integrity": "sha512-gmYPIFmALj9D3Ga1ZbTZAKTXq1JKlTQBtj299DXhqYz9cL3g/AQfUvbb2UhH+Nf++cCq941W2Mv7UcrcgLzJJg==",
+      "dev": true,
+      "requires": {
+        "@rollup/pluginutils": "^3.1.0",
+        "find-cache-dir": "^3.3.1",
+        "fs-extra": "8.1.0",
+        "resolve": "1.17.0",
+        "tslib": "2.0.1"
+      },
+      "dependencies": {
+        "resolve": {
+          "version": "1.17.0",
+          "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.17.0.tgz",
+          "integrity": "sha512-ic+7JYiV8Vi2yzQGFWOkiZD5Z9z7O2Zhm9XMaTxdJExKasieFCr+yXZ/WmXsckHiKl12ar0y6XiXDx3m4RHn1w==",
+          "dev": true,
+          "requires": {
+            "path-parse": "^1.0.6"
+          }
+        },
+        "tslib": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.0.1.tgz",
+          "integrity": "sha512-SgIkNheinmEBgx1IUNirK0TUD4X9yjjBRTqqjggWCU3pUEqIk3/Uwl3yRixYKT6WjQuGiwDv4NomL3wqRCj+CQ==",
+          "dev": true
+        }
+      }
+    },
+    "rsvp": {
+      "version": "4.8.5",
+      "resolved": "https://registry.npmjs.org/rsvp/-/rsvp-4.8.5.tgz",
+      "integrity": "sha512-nfMOlASu9OnRJo1mbEk2cz0D56a1MBNrJ7orjRZQG10XDyuvwksKbuXNp6qa+kbn839HwjwhBzhFmdsaEAfauA==",
+      "dev": true
+    },
+    "run-async": {
+      "version": "2.4.1",
+      "resolved": "https://registry.npmjs.org/run-async/-/run-async-2.4.1.tgz",
+      "integrity": "sha512-tvVnVv01b8c1RrA6Ep7JkStj85Guv/YrMcwqYQnwjsAS2cTmmPGBBjAjpCW7RrSodNSoE2/qg9O4bceNvUuDgQ==",
+      "dev": true
+    },
+    "rxjs": {
+      "version": "6.6.7",
+      "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-6.6.7.tgz",
+      "integrity": "sha512-hTdwr+7yYNIT5n4AMYp85KA6yw2Va0FLa3Rguvbpa4W3I5xynaBZo41cM3XM+4Q6fRMj3sBYIR1VAmZMXYJvRQ==",
+      "dev": true,
+      "requires": {
+        "tslib": "^1.9.0"
+      }
+    },
+    "safe-regex": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz",
+      "integrity": "sha512-aJXcif4xnaNUzvUuC5gcb46oTS7zvg4jpMTnuqtrEPlR3vFr4pxtdTwaF1Qs3Enjn9HK+ZlwQui+a7z0SywIzg==",
+      "dev": true,
+      "requires": {
+        "ret": "~0.1.10"
+      }
+    },
+    "safer-buffer": {
+      "version": "2.1.2",
+      "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
+      "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
+      "dev": true
+    },
+    "sane": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/sane/-/sane-4.1.0.tgz",
+      "integrity": "sha512-hhbzAgTIX8O7SHfp2c8/kREfEn4qO/9q8C9beyY6+tvZ87EpoZ3i1RIEvp27YBswnNbY9mWd6paKVmKbAgLfZA==",
+      "dev": true,
+      "requires": {
+        "@cnakazawa/watch": "^1.0.3",
+        "anymatch": "^2.0.0",
+        "capture-exit": "^2.0.0",
+        "exec-sh": "^0.3.2",
+        "execa": "^1.0.0",
+        "fb-watchman": "^2.0.0",
+        "micromatch": "^3.1.4",
+        "minimist": "^1.1.1",
+        "walker": "~1.0.5"
+      },
+      "dependencies": {
+        "anymatch": {
+          "version": "2.0.0",
+          "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz",
+          "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==",
+          "dev": true,
+          "requires": {
+            "micromatch": "^3.1.4",
+            "normalize-path": "^2.1.1"
+          }
+        },
+        "braces": {
+          "version": "2.3.2",
+          "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz",
+          "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==",
+          "dev": true,
+          "requires": {
+            "arr-flatten": "^1.1.0",
+            "array-unique": "^0.3.2",
+            "extend-shallow": "^2.0.1",
+            "fill-range": "^4.0.0",
+            "isobject": "^3.0.1",
+            "repeat-element": "^1.1.2",
+            "snapdragon": "^0.8.1",
+            "snapdragon-node": "^2.0.1",
+            "split-string": "^3.0.2",
+            "to-regex": "^3.0.1"
+          },
+          "dependencies": {
+            "extend-shallow": {
+              "version": "2.0.1",
+              "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz",
+              "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==",
+              "dev": true,
+              "requires": {
+                "is-extendable": "^0.1.0"
+              }
+            }
+          }
+        },
+        "fill-range": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz",
+          "integrity": "sha512-VcpLTWqWDiTerugjj8e3+esbg+skS3M9e54UuR3iCeIDMXCLTsAH8hTSzDQU/X6/6t3eYkOKoZSef2PlU6U1XQ==",
+          "dev": true,
+          "requires": {
+            "extend-shallow": "^2.0.1",
+            "is-number": "^3.0.0",
+            "repeat-string": "^1.6.1",
+            "to-regex-range": "^2.1.0"
+          },
+          "dependencies": {
+            "extend-shallow": {
+              "version": "2.0.1",
+              "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz",
+              "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==",
+              "dev": true,
+              "requires": {
+                "is-extendable": "^0.1.0"
+              }
+            }
+          }
+        },
+        "is-number": {
+          "version": "3.0.0",
+          "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz",
+          "integrity": "sha512-4cboCqIpliH+mAvFNegjZQ4kgKc3ZUhQVr3HvWbSh5q3WH2v82ct+T2Y1hdU5Gdtorx/cLifQjqCbL7bpznLTg==",
+          "dev": true,
+          "requires": {
+            "kind-of": "^3.0.2"
+          },
+          "dependencies": {
+            "kind-of": {
+              "version": "3.2.2",
+              "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
+              "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==",
+              "dev": true,
+              "requires": {
+                "is-buffer": "^1.1.5"
+              }
+            }
+          }
+        },
+        "micromatch": {
+          "version": "3.1.10",
+          "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz",
+          "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==",
+          "dev": true,
+          "requires": {
+            "arr-diff": "^4.0.0",
+            "array-unique": "^0.3.2",
+            "braces": "^2.3.1",
+            "define-property": "^2.0.2",
+            "extend-shallow": "^3.0.2",
+            "extglob": "^2.0.4",
+            "fragment-cache": "^0.2.1",
+            "kind-of": "^6.0.2",
+            "nanomatch": "^1.2.9",
+            "object.pick": "^1.3.0",
+            "regex-not": "^1.0.0",
+            "snapdragon": "^0.8.1",
+            "to-regex": "^3.0.2"
+          }
+        },
+        "normalize-path": {
+          "version": "2.1.1",
+          "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-2.1.1.tgz",
+          "integrity": "sha512-3pKJwH184Xo/lnH6oyP1q2pMd7HcypqqmRs91/6/i2CGtWwIKGCkOOMTm/zXbgTEWHw1uNpNi/igc3ePOYHb6w==",
+          "dev": true,
+          "requires": {
+            "remove-trailing-separator": "^1.0.1"
+          }
+        },
+        "to-regex-range": {
+          "version": "2.1.1",
+          "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-2.1.1.tgz",
+          "integrity": "sha512-ZZWNfCjUokXXDGXFpZehJIkZqq91BcULFq/Pi7M5i4JnxXdhMKAK682z8bCW3o8Hj1wuuzoKcW3DfVzaP6VuNg==",
+          "dev": true,
+          "requires": {
+            "is-number": "^3.0.0",
+            "repeat-string": "^1.6.1"
+          }
+        }
+      }
+    },
+    "saxes": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/saxes/-/saxes-5.0.1.tgz",
+      "integrity": "sha512-5LBh1Tls8c9xgGjw3QrMwETmTMVk0oFgvrFSvWx62llR2hcEInrKNZ2GZCCuuy2lvWrdl5jhbpeqc5hRYKFOcw==",
+      "dev": true,
+      "requires": {
+        "xmlchars": "^2.2.0"
+      }
+    },
+    "semver": {
+      "version": "7.3.8",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
+      "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
+      "dev": true,
+      "requires": {
+        "lru-cache": "^6.0.0"
+      }
+    },
+    "set-blocking": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz",
+      "integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==",
+      "dev": true
+    },
+    "set-value": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/set-value/-/set-value-2.0.1.tgz",
+      "integrity": "sha512-JxHc1weCN68wRY0fhCoXpyK55m/XPHafOmK4UWD7m2CI14GMcFypt4w/0+NV5f/ZMby2F6S2wwA7fgynh9gWSw==",
+      "dev": true,
+      "requires": {
+        "extend-shallow": "^2.0.1",
+        "is-extendable": "^0.1.1",
+        "is-plain-object": "^2.0.3",
+        "split-string": "^3.0.1"
+      },
+      "dependencies": {
+        "extend-shallow": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz",
+          "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==",
+          "dev": true,
+          "requires": {
+            "is-extendable": "^0.1.0"
+          }
+        }
+      }
+    },
+    "shebang-command": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz",
+      "integrity": "sha512-EV3L1+UQWGor21OmnvojK36mhg+TyIKDh3iFBKBohr5xeXIhNBcx8oWdgkTEEQ+BEFFYdLRuqMfd5L84N1V5Vg==",
+      "dev": true,
+      "requires": {
+        "shebang-regex": "^1.0.0"
+      }
+    },
+    "shebang-regex": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-1.0.0.tgz",
+      "integrity": "sha512-wpoSFAxys6b2a2wHZ1XpDSgD7N9iVjg29Ph9uV/uaP9Ex/KXlkTZTeddxDPSYQpgvzKLGJke2UU0AzoGCjNIvQ==",
+      "dev": true
+    },
+    "shelljs": {
+      "version": "0.8.5",
+      "resolved": "https://registry.npmjs.org/shelljs/-/shelljs-0.8.5.tgz",
+      "integrity": "sha512-TiwcRcrkhHvbrZbnRcFYMLl30Dfov3HKqzp5tO5b4pt6G/SezKcYhmDg15zXVBswHmctSAQKznqNW2LO5tTDow==",
+      "dev": true,
+      "requires": {
+        "glob": "^7.0.0",
+        "interpret": "^1.0.0",
+        "rechoir": "^0.6.2"
+      }
+    },
+    "shellwords": {
+      "version": "0.1.1",
+      "resolved": "https://registry.npmjs.org/shellwords/-/shellwords-0.1.1.tgz",
+      "integrity": "sha512-vFwSUfQvqybiICwZY5+DAWIPLKsWO31Q91JSKl3UYv+K5c2QRPzn0qzec6QPu1Qc9eHYItiP3NdJqNVqetYAww==",
+      "dev": true,
+      "optional": true
+    },
+    "signal-exit": {
+      "version": "3.0.7",
+      "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
+      "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==",
+      "dev": true
+    },
+    "sisteransi": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/sisteransi/-/sisteransi-1.0.5.tgz",
+      "integrity": "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==",
+      "dev": true
+    },
+    "slash": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz",
+      "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==",
+      "dev": true
+    },
+    "slice-ansi": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-2.1.0.tgz",
+      "integrity": "sha512-Qu+VC3EwYLldKa1fCxuuvULvSJOKEgk9pi8dZeCVK7TqBfUNTH4sFkk4joj8afVSfAYgJoSOetjx9QWOJ5mYoQ==",
+      "dev": true,
+      "requires": {
+        "ansi-styles": "^3.2.0",
+        "astral-regex": "^1.0.0",
+        "is-fullwidth-code-point": "^2.0.0"
+      },
+      "dependencies": {
+        "is-fullwidth-code-point": {
+          "version": "2.0.0",
+          "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz",
+          "integrity": "sha512-VHskAKYM8RfSFXwee5t5cbN5PZeq1Wrh6qd5bkyiXIf6UQcN6w/A0eXM9r6t8d+GYOh+o6ZhiEnb88LN/Y8m2w==",
+          "dev": true
+        }
+      }
+    },
+    "snapdragon": {
+      "version": "0.8.2",
+      "resolved": "https://registry.npmjs.org/snapdragon/-/snapdragon-0.8.2.tgz",
+      "integrity": "sha512-FtyOnWN/wCHTVXOMwvSv26d+ko5vWlIDD6zoUJ7LW8vh+ZBC8QdljveRP+crNrtBwioEUWy/4dMtbBjA4ioNlg==",
+      "dev": true,
+      "requires": {
+        "base": "^0.11.1",
+        "debug": "^2.2.0",
+        "define-property": "^0.2.5",
+        "extend-shallow": "^2.0.1",
+        "map-cache": "^0.2.2",
+        "source-map": "^0.5.6",
+        "source-map-resolve": "^0.5.0",
+        "use": "^3.1.0"
+      },
+      "dependencies": {
+        "debug": {
+          "version": "2.6.9",
+          "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
+          "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
+          "dev": true,
+          "requires": {
+            "ms": "2.0.0"
+          }
+        },
+        "define-property": {
+          "version": "0.2.5",
+          "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz",
+          "integrity": "sha512-Rr7ADjQZenceVOAKop6ALkkRAmH1A4Gx9hV/7ZujPUN2rkATqFO0JZLZInbAjpZYoJ1gUx8MRMQVkYemcbMSTA==",
+          "dev": true,
+          "requires": {
+            "is-descriptor": "^0.1.0"
+          }
+        },
+        "extend-shallow": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz",
+          "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==",
+          "dev": true,
+          "requires": {
+            "is-extendable": "^0.1.0"
+          }
+        },
+        "ms": {
+          "version": "2.0.0",
+          "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
+          "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==",
+          "dev": true
+        },
+        "source-map": {
+          "version": "0.5.7",
+          "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz",
+          "integrity": "sha512-LbrmJOMUSdEVxIKvdcJzQC+nQhe8FUZQTXQy6+I75skNgn3OoQ0DZA8YnFa7gp8tqtL3KPf1kmo0R5DoApeSGQ==",
+          "dev": true
+        }
+      }
+    },
+    "snapdragon-node": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/snapdragon-node/-/snapdragon-node-2.1.1.tgz",
+      "integrity": "sha512-O27l4xaMYt/RSQ5TR3vpWCAB5Kb/czIcqUFOM/C4fYcLnbZUc1PkjTAMjof2pBWaSTwOUd6qUHcFGVGj7aIwnw==",
+      "dev": true,
+      "requires": {
+        "define-property": "^1.0.0",
+        "isobject": "^3.0.0",
+        "snapdragon-util": "^3.0.1"
+      },
+      "dependencies": {
+        "define-property": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz",
+          "integrity": "sha512-cZTYKFWspt9jZsMscWo8sc/5lbPC9Q0N5nBLgb+Yd915iL3udB1uFgS3B8YCx66UVHq018DAVFoee7x+gxggeA==",
+          "dev": true,
+          "requires": {
+            "is-descriptor": "^1.0.0"
+          }
+        },
+        "is-accessor-descriptor": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz",
+          "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==",
+          "dev": true,
+          "requires": {
+            "kind-of": "^6.0.0"
+          }
+        },
+        "is-data-descriptor": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz",
+          "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==",
+          "dev": true,
+          "requires": {
+            "kind-of": "^6.0.0"
+          }
+        },
+        "is-descriptor": {
+          "version": "1.0.2",
+          "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz",
+          "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==",
+          "dev": true,
+          "requires": {
+            "is-accessor-descriptor": "^1.0.0",
+            "is-data-descriptor": "^1.0.0",
+            "kind-of": "^6.0.2"
+          }
+        }
+      }
+    },
+    "snapdragon-util": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/snapdragon-util/-/snapdragon-util-3.0.1.tgz",
+      "integrity": "sha512-mbKkMdQKsjX4BAL4bRYTj21edOf8cN7XHdYUJEe+Zn99hVEYcMvKPct1IqNe7+AZPirn8BCDOQBHQZknqmKlZQ==",
+      "dev": true,
+      "requires": {
+        "kind-of": "^3.2.0"
+      },
+      "dependencies": {
+        "kind-of": {
+          "version": "3.2.2",
+          "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
+          "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==",
+          "dev": true,
+          "requires": {
+            "is-buffer": "^1.1.5"
+          }
+        }
+      }
+    },
+    "source-map": {
+      "version": "0.6.1",
+      "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
+      "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
+      "dev": true
+    },
+    "source-map-resolve": {
+      "version": "0.5.3",
+      "resolved": "https://registry.npmjs.org/source-map-resolve/-/source-map-resolve-0.5.3.tgz",
+      "integrity": "sha512-Htz+RnsXWk5+P2slx5Jh3Q66vhQj1Cllm0zvnaY98+NFx+Dv2CF/f5O/t8x+KaNdrdIAsruNzoh/KpialbqAnw==",
+      "dev": true,
+      "requires": {
+        "atob": "^2.1.2",
+        "decode-uri-component": "^0.2.0",
+        "resolve-url": "^0.2.1",
+        "source-map-url": "^0.4.0",
+        "urix": "^0.1.0"
+      }
+    },
+    "source-map-support": {
+      "version": "0.5.21",
+      "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz",
+      "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==",
+      "dev": true,
+      "requires": {
+        "buffer-from": "^1.0.0",
+        "source-map": "^0.6.0"
+      }
+    },
+    "source-map-url": {
+      "version": "0.4.1",
+      "resolved": "https://registry.npmjs.org/source-map-url/-/source-map-url-0.4.1.tgz",
+      "integrity": "sha512-cPiFOTLUKvJFIg4SKVScy4ilPPW6rFgMgfuZJPNoDuMs3nC1HbMUycBoJw77xFIp6z1UJQJOfx6C9GMH80DiTw==",
+      "dev": true
+    },
+    "sourcemap-codec": {
+      "version": "1.4.8",
+      "resolved": "https://registry.npmjs.org/sourcemap-codec/-/sourcemap-codec-1.4.8.tgz",
+      "integrity": "sha512-9NykojV5Uih4lgo5So5dtw+f0JgJX30KCNI8gwhz2J9A15wD0Ml6tjHKwf6fTSa6fAdVBdZeNOs9eJ71qCk8vA==",
+      "dev": true
+    },
+    "spdx-correct": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.1.1.tgz",
+      "integrity": "sha512-cOYcUWwhCuHCXi49RhFRCyJEK3iPj1Ziz9DpViV3tbZOwXD49QzIN3MpOLJNxh2qwq2lJJZaKMVw9qNi4jTC0w==",
+      "dev": true,
+      "requires": {
+        "spdx-expression-parse": "^3.0.0",
+        "spdx-license-ids": "^3.0.0"
+      }
+    },
+    "spdx-exceptions": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/spdx-exceptions/-/spdx-exceptions-2.3.0.tgz",
+      "integrity": "sha512-/tTrYOC7PPI1nUAgx34hUpqXuyJG+DTHJTnIULG4rDygi4xu/tfgmq1e1cIRwRzwZgo4NLySi+ricLkZkw4i5A==",
+      "dev": true
+    },
+    "spdx-expression-parse": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-3.0.1.tgz",
+      "integrity": "sha512-cbqHunsQWnJNE6KhVSMsMeH5H/L9EpymbzqTQ3uLwNCLZ1Q481oWaofqH7nO6V07xlXwY6PhQdQ2IedWx/ZK4Q==",
+      "dev": true,
+      "requires": {
+        "spdx-exceptions": "^2.1.0",
+        "spdx-license-ids": "^3.0.0"
+      }
+    },
+    "spdx-license-ids": {
+      "version": "3.0.12",
+      "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.12.tgz",
+      "integrity": "sha512-rr+VVSXtRhO4OHbXUiAF7xW3Bo9DuuF6C5jH+q/x15j2jniycgKbxU09Hr0WqlSLUs4i4ltHGXqTe7VHclYWyA==",
+      "dev": true
+    },
+    "split-string": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/split-string/-/split-string-3.1.0.tgz",
+      "integrity": "sha512-NzNVhJDYpwceVVii8/Hu6DKfD2G+NrQHlS/V/qgv763EYudVwEcMQNxd2lh+0VrUByXN/oJkl5grOhYWvQUYiw==",
+      "dev": true,
+      "requires": {
+        "extend-shallow": "^3.0.0"
+      }
+    },
+    "sprintf-js": {
+      "version": "1.0.3",
+      "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
+      "integrity": "sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==",
+      "dev": true
+    },
+    "stack-utils": {
+      "version": "2.0.6",
+      "resolved": "https://registry.npmjs.org/stack-utils/-/stack-utils-2.0.6.tgz",
+      "integrity": "sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==",
+      "dev": true,
+      "requires": {
+        "escape-string-regexp": "^2.0.0"
+      },
+      "dependencies": {
+        "escape-string-regexp": {
+          "version": "2.0.0",
+          "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-2.0.0.tgz",
+          "integrity": "sha512-UpzcLCXolUWcNu5HtVMHYdXJjArjsF9C0aNnquZYY4uW/Vu0miy5YoWvbV345HauVvcAUnpRuhMMcqTcGOY2+w==",
+          "dev": true
+        }
+      }
+    },
+    "static-extend": {
+      "version": "0.1.2",
+      "resolved": "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz",
+      "integrity": "sha512-72E9+uLc27Mt718pMHt9VMNiAL4LMsmDbBva8mxWUCkT07fSzEGMYUCk0XWY6lp0j6RBAG4cJ3mWuZv2OE3s0g==",
+      "dev": true,
+      "requires": {
+        "define-property": "^0.2.5",
+        "object-copy": "^0.1.0"
+      },
+      "dependencies": {
+        "define-property": {
+          "version": "0.2.5",
+          "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz",
+          "integrity": "sha512-Rr7ADjQZenceVOAKop6ALkkRAmH1A4Gx9hV/7ZujPUN2rkATqFO0JZLZInbAjpZYoJ1gUx8MRMQVkYemcbMSTA==",
+          "dev": true,
+          "requires": {
+            "is-descriptor": "^0.1.0"
+          }
+        }
+      }
+    },
+    "string-length": {
+      "version": "4.0.2",
+      "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz",
+      "integrity": "sha512-+l6rNN5fYHNhZZy41RXsYptCjA2Igmq4EG7kZAYFQI1E1VTXarr6ZPXBg6eq7Y6eK4FEhY6AJlyuFIb/v/S0VQ==",
+      "dev": true,
+      "requires": {
+        "char-regex": "^1.0.2",
+        "strip-ansi": "^6.0.0"
+      },
+      "dependencies": {
+        "strip-ansi": {
+          "version": "6.0.1",
+          "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+          "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+          "dev": true,
+          "requires": {
+            "ansi-regex": "^5.0.1"
+          }
+        }
+      }
+    },
+    "string-width": {
+      "version": "4.2.3",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
+      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
+      "dev": true,
+      "requires": {
+        "emoji-regex": "^8.0.0",
+        "is-fullwidth-code-point": "^3.0.0",
+        "strip-ansi": "^6.0.1"
+      },
+      "dependencies": {
+        "strip-ansi": {
+          "version": "6.0.1",
+          "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+          "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+          "dev": true,
+          "requires": {
+            "ansi-regex": "^5.0.1"
+          }
+        }
+      }
+    },
+    "strip-ansi": {
+      "version": "5.2.0",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.2.0.tgz",
+      "integrity": "sha512-DuRs1gKbBqsMKIZlrffwlug8MHkcnpjs5VPmL1PAh+mA30U0DTotfDZ0d2UUsXpPmPmMMJ6W773MaA3J+lbiWA==",
+      "dev": true,
+      "requires": {
+        "ansi-regex": "^4.1.0"
+      },
+      "dependencies": {
+        "ansi-regex": {
+          "version": "4.1.1",
+          "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.1.1.tgz",
+          "integrity": "sha512-ILlv4k/3f6vfQ4OoP2AGvirOktlQ98ZEL1k9FaQjxa3L1abBgbuTDAdPOpvbGncC0BTVQrl+OM8xZGK6tWXt7g==",
+          "dev": true
+        }
+      }
+    },
+    "strip-bom": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-4.0.0.tgz",
+      "integrity": "sha512-3xurFv5tEgii33Zi8Jtp55wEIILR9eh34FAW00PZf+JnSsTmV/ioewSgQl97JHvgjoRGwPShsWm+IdrxB35d0w==",
+      "dev": true
+    },
+    "strip-eof": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/strip-eof/-/strip-eof-1.0.0.tgz",
+      "integrity": "sha512-7FCwGGmx8mD5xQd3RPUvnSpUXHM3BWuzjtpD4TXsfcZ9EL4azvVVUscFYwD9nx8Kh+uCBC00XBtAykoMHwTh8Q==",
+      "dev": true
+    },
+    "strip-final-newline": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/strip-final-newline/-/strip-final-newline-2.0.0.tgz",
+      "integrity": "sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==",
+      "dev": true
+    },
+    "strip-json-comments": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz",
+      "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==",
+      "dev": true
+    },
+    "supports-color": {
+      "version": "5.5.0",
+      "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz",
+      "integrity": "sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==",
+      "dev": true,
+      "requires": {
+        "has-flag": "^3.0.0"
+      }
+    },
+    "supports-hyperlinks": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/supports-hyperlinks/-/supports-hyperlinks-2.3.0.tgz",
+      "integrity": "sha512-RpsAZlpWcDwOPQA22aCH4J0t7L8JmAvsCxfOSEwm7cQs3LshN36QaTkwd70DnBOXDWGssw2eUoc8CaRWT0XunA==",
+      "dev": true,
+      "requires": {
+        "has-flag": "^4.0.0",
+        "supports-color": "^7.0.0"
+      },
+      "dependencies": {
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
+    "supports-preserve-symlinks-flag": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz",
+      "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==",
+      "dev": true
+    },
+    "symbol-tree": {
+      "version": "3.2.4",
+      "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz",
+      "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==",
+      "dev": true
+    },
+    "table": {
+      "version": "5.4.6",
+      "resolved": "https://registry.npmjs.org/table/-/table-5.4.6.tgz",
+      "integrity": "sha512-wmEc8m4fjnob4gt5riFRtTu/6+4rSe12TpAELNSqHMfF3IqnA+CH37USM6/YR3qRZv7e56kAEAtd6nKZaxe0Ug==",
+      "dev": true,
+      "requires": {
+        "ajv": "^6.10.2",
+        "lodash": "^4.17.14",
+        "slice-ansi": "^2.1.0",
+        "string-width": "^3.0.0"
+      },
+      "dependencies": {
+        "emoji-regex": {
+          "version": "7.0.3",
+          "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz",
+          "integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==",
+          "dev": true
+        },
+        "is-fullwidth-code-point": {
+          "version": "2.0.0",
+          "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz",
+          "integrity": "sha512-VHskAKYM8RfSFXwee5t5cbN5PZeq1Wrh6qd5bkyiXIf6UQcN6w/A0eXM9r6t8d+GYOh+o6ZhiEnb88LN/Y8m2w==",
+          "dev": true
+        },
+        "string-width": {
+          "version": "3.1.0",
+          "resolved": "https://registry.npmjs.org/string-width/-/string-width-3.1.0.tgz",
+          "integrity": "sha512-vafcv6KjVZKSgz06oM/H6GDBrAtz8vdhQakGjFIvNrHA6y3HCF1CInLy+QLq8dTJPQ1b+KDUqDFctkdRW44e1w==",
+          "dev": true,
+          "requires": {
+            "emoji-regex": "^7.0.1",
+            "is-fullwidth-code-point": "^2.0.0",
+            "strip-ansi": "^5.1.0"
+          }
+        }
+      }
+    },
+    "terminal-link": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/terminal-link/-/terminal-link-2.1.1.tgz",
+      "integrity": "sha512-un0FmiRUQNr5PJqy9kP7c40F5BOfpGlYTrxonDChEZB7pzZxRNp/bt+ymiy9/npwXya9KH99nJ/GXFIiUkYGFQ==",
+      "dev": true,
+      "requires": {
+        "ansi-escapes": "^4.2.1",
+        "supports-hyperlinks": "^2.0.0"
+      }
+    },
+    "test-exclude": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmjs.org/test-exclude/-/test-exclude-6.0.0.tgz",
+      "integrity": "sha512-cAGWPIyOHU6zlmg88jwm7VRyXnMN7iV68OGAbYDk/Mh/xC/pzVPlQtY6ngoIH/5/tciuhGfvESU8GrHrcxD56w==",
+      "dev": true,
+      "requires": {
+        "@istanbuljs/schema": "^0.1.2",
+        "glob": "^7.1.4",
+        "minimatch": "^3.0.4"
+      }
+    },
+    "text-table": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz",
+      "integrity": "sha512-N+8UisAXDGk8PFXP4HAzVR9nbfmVJ3zYLAWiTIoqC5v5isinhr+r5uaO8+7r3BMfuNIufIsA7RdpVgacC2cSpw==",
+      "dev": true
+    },
+    "throat": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/throat/-/throat-5.0.0.tgz",
+      "integrity": "sha512-fcwX4mndzpLQKBS1DVYhGAcYaYt7vsHNIvQV+WXMvnow5cgjPphq5CaayLaGsjRdSCKZFNGt7/GYAuXaNOiYCA==",
+      "dev": true
+    },
+    "through": {
+      "version": "2.3.8",
+      "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
+      "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==",
+      "dev": true
+    },
+    "tmp": {
+      "version": "0.0.33",
+      "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.33.tgz",
+      "integrity": "sha512-jRCJlojKnZ3addtTOjdIqoRuPEKBvNXcGYqzO6zWZX8KfKEpnGY5jfggJQ3EjKuu8D4bJRr0y+cYJFmYbImXGw==",
+      "dev": true,
+      "requires": {
+        "os-tmpdir": "~1.0.2"
+      }
+    },
+    "tmpl": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz",
+      "integrity": "sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==",
+      "dev": true
+    },
+    "to-fast-properties": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-2.0.0.tgz",
+      "integrity": "sha512-/OaKK0xYrs3DmxRYqL/yDc+FxFUVYhDlXMhRmv3z915w2HF1tnN1omB354j8VUGO/hbRzyD6Y3sA7v7GS/ceog==",
+      "dev": true
+    },
+    "to-object-path": {
+      "version": "0.3.0",
+      "resolved": "https://registry.npmjs.org/to-object-path/-/to-object-path-0.3.0.tgz",
+      "integrity": "sha512-9mWHdnGRuh3onocaHzukyvCZhzvr6tiflAy/JRFXcJX0TjgfWA9pk9t8CMbzmBE4Jfw58pXbkngtBtqYxzNEyg==",
+      "dev": true,
+      "requires": {
+        "kind-of": "^3.0.2"
+      },
+      "dependencies": {
+        "kind-of": {
+          "version": "3.2.2",
+          "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
+          "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==",
+          "dev": true,
+          "requires": {
+            "is-buffer": "^1.1.5"
+          }
+        }
+      }
+    },
+    "to-regex": {
+      "version": "3.0.2",
+      "resolved": "https://registry.npmjs.org/to-regex/-/to-regex-3.0.2.tgz",
+      "integrity": "sha512-FWtleNAtZ/Ki2qtqej2CXTOayOH9bHDQF+Q48VpWyDXjbYxA4Yz8iDB31zXOBUlOHHKidDbqGVrTUvQMPmBGBw==",
+      "dev": true,
+      "requires": {
+        "define-property": "^2.0.2",
+        "extend-shallow": "^3.0.2",
+        "regex-not": "^1.0.2",
+        "safe-regex": "^1.1.0"
+      }
+    },
+    "to-regex-range": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
+      "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==",
+      "dev": true,
+      "requires": {
+        "is-number": "^7.0.0"
+      }
+    },
+    "tough-cookie": {
+      "version": "4.1.2",
+      "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.2.tgz",
+      "integrity": "sha512-G9fqXWoYFZgTc2z8Q5zaHy/vJMjm+WV0AkAeHxVCQiEB1b+dGvWzFW6QV07cY5jQ5gRkeid2qIkzkxUnmoQZUQ==",
+      "dev": true,
+      "requires": {
+        "psl": "^1.1.33",
+        "punycode": "^2.1.1",
+        "universalify": "^0.2.0",
+        "url-parse": "^1.5.3"
+      }
+    },
+    "tr46": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/tr46/-/tr46-2.1.0.tgz",
+      "integrity": "sha512-15Ih7phfcdP5YxqiB+iDtLoaTz4Nd35+IiAv0kQ5FNKHzXgdWqPoTIqEDDJmXceQt4JZk6lVPT8lnDlPpGDppw==",
+      "dev": true,
+      "requires": {
+        "punycode": "^2.1.1"
+      }
+    },
+    "tslib": {
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz",
+      "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==",
+      "dev": true
+    },
+    "tsutils": {
+      "version": "3.21.0",
+      "resolved": "https://registry.npmjs.org/tsutils/-/tsutils-3.21.0.tgz",
+      "integrity": "sha512-mHKK3iUXL+3UF6xL5k0PEhKRUBKPBCv/+RkEOpjRWxxx27KKRBmmA60A9pgOUvMi8GKhRMPEmjBRPzs2W7O1OA==",
+      "dev": true,
+      "requires": {
+        "tslib": "^1.8.1"
+      }
+    },
+    "type-check": {
+      "version": "0.3.2",
+      "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.3.2.tgz",
+      "integrity": "sha512-ZCmOJdvOWDBYJlzAoFkC+Q0+bUyEOS1ltgp1MGU03fqHG+dbi9tBFU2Rd9QKiDZFAYrhPh2JUf7rZRIuHRKtOg==",
+      "dev": true,
+      "requires": {
+        "prelude-ls": "~1.1.2"
+      }
+    },
+    "type-detect": {
+      "version": "4.0.8",
+      "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz",
+      "integrity": "sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g==",
+      "dev": true
+    },
+    "type-fest": {
+      "version": "0.8.1",
+      "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.8.1.tgz",
+      "integrity": "sha512-4dbzIzqvjtgiM5rw1k5rEHtBANKmdudhGyBEajN01fEyhaAIhsoKNy6y7+IN93IfpFtwY9iqi7kD+xwKhQsNJA==",
+      "dev": true
+    },
+    "typedarray-to-buffer": {
+      "version": "3.1.5",
+      "resolved": "https://registry.npmjs.org/typedarray-to-buffer/-/typedarray-to-buffer-3.1.5.tgz",
+      "integrity": "sha512-zdu8XMNEDepKKR+XYOXAVPtWui0ly0NtohUscw+UmaHiAWT8hrV1rr//H6V+0DvJ3OQ19S979M0laLfX8rm82Q==",
+      "dev": true,
+      "requires": {
+        "is-typedarray": "^1.0.0"
+      }
+    },
+    "typedoc": {
+      "version": "0.17.8",
+      "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.17.8.tgz",
+      "integrity": "sha512-/OyrHCJ8jtzu+QZ+771YaxQ9s4g5Z3XsQE3Ma7q+BL392xxBn4UMvvCdVnqKC2T/dz03/VXSLVKOP3lHmDdc/w==",
+      "dev": true,
+      "requires": {
+        "fs-extra": "^8.1.0",
+        "handlebars": "^4.7.6",
+        "highlight.js": "^10.0.0",
+        "lodash": "^4.17.15",
+        "lunr": "^2.3.8",
+        "marked": "1.0.0",
+        "minimatch": "^3.0.0",
+        "progress": "^2.0.3",
+        "shelljs": "^0.8.4",
+        "typedoc-default-themes": "^0.10.2"
+      }
+    },
+    "typedoc-default-themes": {
+      "version": "0.10.2",
+      "resolved": "https://registry.npmjs.org/typedoc-default-themes/-/typedoc-default-themes-0.10.2.tgz",
+      "integrity": "sha512-zo09yRj+xwLFE3hyhJeVHWRSPuKEIAsFK5r2u47KL/HBKqpwdUSanoaz5L34IKiSATFrjG5ywmIu98hPVMfxZg==",
+      "dev": true,
+      "requires": {
+        "lunr": "^2.3.8"
+      }
+    },
+    "typescript": {
+      "version": "3.9.10",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.9.10.tgz",
+      "integrity": "sha512-w6fIxVE/H1PkLKcCPsFqKE7Kv7QUwhU8qQY2MueZXWx5cPZdwFupLgKK3vntcK98BtNHZtAF4LA/yl2a7k8R6Q==",
+      "dev": true
+    },
+    "uglify-js": {
+      "version": "3.17.4",
+      "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.17.4.tgz",
+      "integrity": "sha512-T9q82TJI9e/C1TAxYvfb16xO120tMVFZrGA3f9/P4424DNu6ypK103y0GPFVa17yotwSyZW5iYXgjYHkGrJW/g==",
+      "dev": true,
+      "optional": true
+    },
+    "union-value": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/union-value/-/union-value-1.0.1.tgz",
+      "integrity": "sha512-tJfXmxMeWYnczCVs7XAEvIV7ieppALdyepWMkHkwciRpZraG/xwT+s2JN8+pr1+8jCRf80FFzvr+MpQeeoF4Xg==",
+      "dev": true,
+      "requires": {
+        "arr-union": "^3.1.0",
+        "get-value": "^2.0.6",
+        "is-extendable": "^0.1.1",
+        "set-value": "^2.0.1"
+      }
+    },
+    "universalify": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz",
+      "integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==",
+      "dev": true
+    },
+    "unset-value": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/unset-value/-/unset-value-1.0.0.tgz",
+      "integrity": "sha512-PcA2tsuGSF9cnySLHTLSh2qrQiJ70mn+r+Glzxv2TWZblxsxCC52BDlZoPCsz7STd9pN7EZetkWZBAvk4cgZdQ==",
+      "dev": true,
+      "requires": {
+        "has-value": "^0.3.1",
+        "isobject": "^3.0.0"
+      },
+      "dependencies": {
+        "has-value": {
+          "version": "0.3.1",
+          "resolved": "https://registry.npmjs.org/has-value/-/has-value-0.3.1.tgz",
+          "integrity": "sha512-gpG936j8/MzaeID5Yif+577c17TxaDmhuyVgSwtnL/q8UUTySg8Mecb+8Cf1otgLoD7DDH75axp86ER7LFsf3Q==",
+          "dev": true,
+          "requires": {
+            "get-value": "^2.0.3",
+            "has-values": "^0.1.4",
+            "isobject": "^2.0.0"
+          },
+          "dependencies": {
+            "isobject": {
+              "version": "2.1.0",
+              "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz",
+              "integrity": "sha512-+OUdGJlgjOBZDfxnDjYYG6zp487z0JGNQq3cYQYg5f5hKR+syHMsaztzGeml/4kGG55CSpKSpWTY+jYGgsHLgA==",
+              "dev": true,
+              "requires": {
+                "isarray": "1.0.0"
+              }
+            }
+          }
+        },
+        "has-values": {
+          "version": "0.1.4",
+          "resolved": "https://registry.npmjs.org/has-values/-/has-values-0.1.4.tgz",
+          "integrity": "sha512-J8S0cEdWuQbqD9//tlZxiMuMNmxB8PlEwvYwuxsTmR1G5RXUePEX/SJn7aD0GMLieuZYSwNH0cQuJGwnYunXRQ==",
+          "dev": true
+        }
+      }
+    },
+    "update-browserslist-db": {
+      "version": "1.0.10",
+      "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.10.tgz",
+      "integrity": "sha512-OztqDenkfFkbSG+tRxBeAnCVPckDBcvibKd35yDONx6OU8N7sqgwc7rCbkJ/WcYtVRZ4ba68d6byhC21GFh7sQ==",
+      "dev": true,
+      "requires": {
+        "escalade": "^3.1.1",
+        "picocolors": "^1.0.0"
+      }
+    },
+    "uri-js": {
+      "version": "4.4.1",
+      "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz",
+      "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==",
+      "dev": true,
+      "requires": {
+        "punycode": "^2.1.0"
+      }
+    },
+    "urix": {
+      "version": "0.1.0",
+      "resolved": "https://registry.npmjs.org/urix/-/urix-0.1.0.tgz",
+      "integrity": "sha512-Am1ousAhSLBeB9cG/7k7r2R0zj50uDRlZHPGbazid5s9rlF1F/QKYObEKSIunSjIOkJZqwRRLpvewjEkM7pSqg==",
+      "dev": true
+    },
+    "url-parse": {
+      "version": "1.5.10",
+      "resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.10.tgz",
+      "integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==",
+      "dev": true,
+      "requires": {
+        "querystringify": "^2.1.1",
+        "requires-port": "^1.0.0"
+      }
+    },
+    "use": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/use/-/use-3.1.1.tgz",
+      "integrity": "sha512-cwESVXlO3url9YWlFW/TA9cshCEhtu7IKJ/p5soJ/gGpj7vbvFrAY/eIioQ6Dw23KjZhYgiIo8HOs1nQ2vr/oQ==",
+      "dev": true
+    },
+    "uuid": {
+      "version": "8.3.2",
+      "resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz",
+      "integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==",
+      "dev": true,
+      "optional": true
+    },
+    "v8-compile-cache": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/v8-compile-cache/-/v8-compile-cache-2.3.0.tgz",
+      "integrity": "sha512-l8lCEmLcLYZh4nbunNZvQCJc5pv7+RCwa8q/LdUx8u7lsWvPDKmpodJAJNwkAhJC//dFY48KuIEmjtd4RViDrA==",
+      "dev": true
+    },
+    "v8-to-istanbul": {
+      "version": "7.1.2",
+      "resolved": "https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-7.1.2.tgz",
+      "integrity": "sha512-TxNb7YEUwkLXCQYeudi6lgQ/SZrzNO4kMdlqVxaZPUIUjCv6iSSypUQX70kNBSERpQ8fk48+d61FXk+tgqcWow==",
+      "dev": true,
+      "requires": {
+        "@types/istanbul-lib-coverage": "^2.0.1",
+        "convert-source-map": "^1.6.0",
+        "source-map": "^0.7.3"
+      },
+      "dependencies": {
+        "source-map": {
+          "version": "0.7.4",
+          "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.4.tgz",
+          "integrity": "sha512-l3BikUxvPOcn5E74dZiq5BGsTb5yEwhaTSzccU6t4sDOH8NWJCstKO5QT2CvtFoK6F0saL7p9xHAqHOlCPJygA==",
+          "dev": true
+        }
+      }
+    },
+    "validate-npm-package-license": {
+      "version": "3.0.4",
+      "resolved": "https://registry.npmjs.org/validate-npm-package-license/-/validate-npm-package-license-3.0.4.tgz",
+      "integrity": "sha512-DpKm2Ui/xN7/HQKCtpZxoRWBhZ9Z0kqtygG8XCgNQ8ZlDnxuQmWhj566j8fN4Cu3/JmbhsDo7fcAJq4s9h27Ew==",
+      "dev": true,
+      "requires": {
+        "spdx-correct": "^3.0.0",
+        "spdx-expression-parse": "^3.0.0"
+      }
+    },
+    "w3c-hr-time": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/w3c-hr-time/-/w3c-hr-time-1.0.2.tgz",
+      "integrity": "sha512-z8P5DvDNjKDoFIHK7q8r8lackT6l+jo/Ye3HOle7l9nICP9lf1Ci25fy9vHd0JOWewkIFzXIEig3TdKT7JQ5fQ==",
+      "dev": true,
+      "requires": {
+        "browser-process-hrtime": "^1.0.0"
+      }
+    },
+    "w3c-xmlserializer": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-2.0.0.tgz",
+      "integrity": "sha512-4tzD0mF8iSiMiNs30BiLO3EpfGLZUT2MSX/G+o7ZywDzliWQ3OPtTZ0PTC3B3ca1UAf4cJMHB+2Bf56EriJuRA==",
+      "dev": true,
+      "requires": {
+        "xml-name-validator": "^3.0.0"
+      }
+    },
+    "walker": {
+      "version": "1.0.8",
+      "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz",
+      "integrity": "sha512-ts/8E8l5b7kY0vlWLewOkDXMmPdLcVV4GmOQLyxuSswIJsweeFZtAsMF7k1Nszz+TYBQrlYRmzOnr398y1JemQ==",
+      "dev": true,
+      "requires": {
+        "makeerror": "1.0.12"
+      }
+    },
+    "webidl-conversions": {
+      "version": "6.1.0",
+      "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-6.1.0.tgz",
+      "integrity": "sha512-qBIvFLGiBpLjfwmYAaHPXsn+ho5xZnGvyGvsarywGNc8VyQJUMHJ8OBKGGrPER0okBeMDaan4mNBlgBROxuI8w==",
+      "dev": true
+    },
+    "whatwg-encoding": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-1.0.5.tgz",
+      "integrity": "sha512-b5lim54JOPN9HtzvK9HFXvBma/rnfFeqsic0hSpjtDbVxR3dJKLc+KB4V6GgiGOvl7CY/KNh8rxSo9DKQrnUEw==",
+      "dev": true,
+      "requires": {
+        "iconv-lite": "0.4.24"
+      }
+    },
+    "whatwg-mimetype": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-2.3.0.tgz",
+      "integrity": "sha512-M4yMwr6mAnQz76TbJm914+gPpB/nCwvZbJU28cUD6dR004SAxDLOOSUaB1JDRqLtaOV/vi0IC5lEAGFgrjGv/g==",
+      "dev": true
+    },
+    "whatwg-url": {
+      "version": "8.7.0",
+      "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-8.7.0.tgz",
+      "integrity": "sha512-gAojqb/m9Q8a5IV96E3fHJM70AzCkgt4uXYX2O7EmuyOnLrViCQlsEBmF9UQIu3/aeAIp2U17rtbpZWNntQqdg==",
+      "dev": true,
+      "requires": {
+        "lodash": "^4.7.0",
+        "tr46": "^2.1.0",
+        "webidl-conversions": "^6.1.0"
+      }
+    },
+    "which": {
+      "version": "1.3.1",
+      "resolved": "https://registry.npmjs.org/which/-/which-1.3.1.tgz",
+      "integrity": "sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ==",
+      "dev": true,
+      "requires": {
+        "isexe": "^2.0.0"
+      }
+    },
+    "which-module": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz",
+      "integrity": "sha512-B+enWhmw6cjfVC7kS8Pj9pCrKSc5txArRyaYGe088shv/FGWH+0Rjx/xPgtsWfsUtS27FkP697E4DDhgrgoc0Q==",
+      "dev": true
+    },
+    "word-wrap": {
+      "version": "1.2.3",
+      "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz",
+      "integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==",
+      "dev": true
+    },
+    "wordwrap": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz",
+      "integrity": "sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==",
+      "dev": true
+    },
+    "wrap-ansi": {
+      "version": "6.2.0",
+      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-6.2.0.tgz",
+      "integrity": "sha512-r6lPcBGxZXlIcymEu7InxDMhdW0KDxpLgoFLcguasxCaJ/SOIZwINatK9KY/tf+ZrlywOKU0UDj3ATXUBfxJXA==",
+      "dev": true,
+      "requires": {
+        "ansi-styles": "^4.0.0",
+        "string-width": "^4.1.0",
+        "strip-ansi": "^6.0.0"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "strip-ansi": {
+          "version": "6.0.1",
+          "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+          "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+          "dev": true,
+          "requires": {
+            "ansi-regex": "^5.0.1"
+          }
+        }
+      }
+    },
+    "wrappy": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
+      "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
+      "dev": true
+    },
+    "write": {
+      "version": "1.0.3",
+      "resolved": "https://registry.npmjs.org/write/-/write-1.0.3.tgz",
+      "integrity": "sha512-/lg70HAjtkUgWPVZhZcm+T4hkL8Zbtp1nFNOn3lRrxnlv50SRBv7cR7RqR+GMsd3hUXy9hWBo4CHTbFTcOYwig==",
+      "dev": true,
+      "requires": {
+        "mkdirp": "^0.5.1"
+      }
+    },
+    "write-file-atomic": {
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/write-file-atomic/-/write-file-atomic-3.0.3.tgz",
+      "integrity": "sha512-AvHcyZ5JnSfq3ioSyjrBkH9yW4m7Ayk8/9My/DD9onKeu/94fwrMocemO2QAJFAlnnDN+ZDS+ZjAR5ua1/PV/Q==",
+      "dev": true,
+      "requires": {
+        "imurmurhash": "^0.1.4",
+        "is-typedarray": "^1.0.0",
+        "signal-exit": "^3.0.2",
+        "typedarray-to-buffer": "^3.1.5"
+      }
+    },
+    "ws": {
+      "version": "7.5.9",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.9.tgz",
+      "integrity": "sha512-F+P9Jil7UiSKSkppIiD94dN07AwvFixvLIj1Og1Rl9GGMuNipJnV9JzjD6XuqmAeiswGvUmNLjr5cFuXwNS77Q==",
+      "dev": true
+    },
+    "xml-name-validator": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-3.0.0.tgz",
+      "integrity": "sha512-A5CUptxDsvxKJEU3yO6DuWBSJz/qizqzJKOMIfUJHETbBw/sFaDxgd6fxm1ewUaM0jZ444Fc5vC5ROYurg/4Pw==",
+      "dev": true
+    },
+    "xmlchars": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz",
+      "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==",
+      "dev": true
+    },
+    "y18n": {
+      "version": "4.0.3",
+      "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.3.tgz",
+      "integrity": "sha512-JKhqTOwSrqNA1NY5lSztJ1GrBiUodLMmIZuLiDaMRJ+itFd+ABVE8XBjOvIWL+rSqNDC74LCSFmlb/U4UZ4hJQ==",
+      "dev": true
+    },
+    "yallist": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz",
+      "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==",
+      "dev": true
+    },
+    "yargs": {
+      "version": "15.4.1",
+      "resolved": "https://registry.npmjs.org/yargs/-/yargs-15.4.1.tgz",
+      "integrity": "sha512-aePbxDmcYW++PaqBsJ+HYUFwCdv4LVvdnhBy78E57PIor8/OVvhMrADFFEDh8DHDFRv/O9i3lPhsENjO7QX0+A==",
+      "dev": true,
+      "requires": {
+        "cliui": "^6.0.0",
+        "decamelize": "^1.2.0",
+        "find-up": "^4.1.0",
+        "get-caller-file": "^2.0.1",
+        "require-directory": "^2.1.1",
+        "require-main-filename": "^2.0.0",
+        "set-blocking": "^2.0.0",
+        "string-width": "^4.2.0",
+        "which-module": "^2.0.0",
+        "y18n": "^4.0.0",
+        "yargs-parser": "^18.1.2"
+      }
+    },
+    "yargs-parser": {
+      "version": "18.1.3",
+      "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-18.1.3.tgz",
+      "integrity": "sha512-o50j0JeToy/4K6OZcaQmW6lyXXKhq7csREXcDwk2omFPJEwUNOVtJKvmDr9EI1fAJZUyZcRF7kxGBWmRXudrCQ==",
+      "dev": true,
+      "requires": {
+        "camelcase": "^5.0.0",
+        "decamelize": "^1.2.0"
+      }
+    }
+  }
+}
diff --git a/web/package.json b/web/package.json
index a45737ceb8d8..4cc88dc4c59e 100644
--- a/web/package.json
+++ b/web/package.json
@@ -17,14 +17,15 @@
   "devDependencies": {
     "@rollup/plugin-commonjs": "^11.1.0",
     "@rollup/plugin-node-resolve": "^7.1.3",
+    "@types/babel__traverse": "<=7.18.2",
     "@types/node": "^12.12.37",
     "@typescript-eslint/eslint-plugin": "^2.29.0",
     "@typescript-eslint/parser": "^2.29.0",
     "@webgpu/types": "^0.0.31",
     "eslint": "^6.8.0",
     "jest": "^26.0.1",
-    "rollup": "^2.7.6",
     "rollup-plugin-typescript2": "^0.27.0",
+    "rollup": "^2.7.6",
     "typedoc": "^0.17.6",
     "typescript": "^3.8.3",
     "ws": "^7.2.5"

From 435df5081a68d5e3b913684414b87f2e93de835f Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Mon, 28 Nov 2022 19:21:11 -0600
Subject: [PATCH 669/704] [microTVM] Use `serial_number` in Zephyr tutorials
 (#13479)

Add serial_number option to Zephyr tutorials so it's possible to run them in the hardware CI and specify the serial port by setting TVM_MICRO_SERIAL env. variable.
---
 gallery/how_to/work_with_microtvm/micro_aot.py      | 3 ++-
 gallery/how_to/work_with_microtvm/micro_autotune.py | 4 ++++
 gallery/how_to/work_with_microtvm/micro_tflite.py   | 3 ++-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/gallery/how_to/work_with_microtvm/micro_aot.py b/gallery/how_to/work_with_microtvm/micro_aot.py
index f02a1ebbbd0b..f702170f0bb9 100644
--- a/gallery/how_to/work_with_microtvm/micro_aot.py
+++ b/gallery/how_to/work_with_microtvm/micro_aot.py
@@ -106,6 +106,7 @@
     with open(boards_file) as f:
         boards = json.load(f)
     BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
+    SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None)
     TARGET = tvm.target.target.micro(boards[BOARD]["model"])
 
 ######################################################################
@@ -133,7 +134,7 @@
 
 if use_physical_hw:
     template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr"))
-    project_options = {"project_type": "host_driven", "board": BOARD}
+    project_options = {"project_type": "host_driven", "board": BOARD, "serial_number": SERIAL}
 
 temp_dir = tvm.contrib.utils.tempdir()
 generated_project_dir = temp_dir / "project"
diff --git a/gallery/how_to/work_with_microtvm/micro_autotune.py b/gallery/how_to/work_with_microtvm/micro_autotune.py
index 4c57717df889..ea83ef563940 100644
--- a/gallery/how_to/work_with_microtvm/micro_autotune.py
+++ b/gallery/how_to/work_with_microtvm/micro_autotune.py
@@ -101,6 +101,7 @@
         boards = json.load(f)
 
     BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
+    SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None)
     TARGET = tvm.target.target.micro(boards[BOARD]["model"])
 
 
@@ -156,6 +157,7 @@
             "west_cmd": "west",
             "verbose": False,
             "project_type": "host_driven",
+            "serial_number": SERIAL,
         },
     )
     builder = tvm.autotvm.LocalBuilder(
@@ -223,6 +225,7 @@
             "west_cmd": "west",
             "verbose": False,
             "project_type": "host_driven",
+            "serial_number": SERIAL,
         },
     )
 
@@ -266,6 +269,7 @@
             "west_cmd": "west",
             "verbose": False,
             "project_type": "host_driven",
+            "serial_number": SERIAL,
         },
     )
 
diff --git a/gallery/how_to/work_with_microtvm/micro_tflite.py b/gallery/how_to/work_with_microtvm/micro_tflite.py
index b04a2fdca18f..5822a1a1e97d 100644
--- a/gallery/how_to/work_with_microtvm/micro_tflite.py
+++ b/gallery/how_to/work_with_microtvm/micro_tflite.py
@@ -209,6 +209,7 @@
         boards = json.load(f)
 
     BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_f746zg")
+    SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None)
     TARGET = tvm.target.target.micro(boards[BOARD]["model"])
 
 #
@@ -291,7 +292,7 @@
 
 if use_physical_hw:
     template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr"))
-    project_options = {"project_type": "host_driven", "board": BOARD}
+    project_options = {"project_type": "host_driven", "board": BOARD, "serial_number": SERIAL}
 
 # Create a temporary directory
 

From 40d0ec7515c8866d2787f8be0c98b07a666629ce Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Mon, 28 Nov 2022 18:12:30 -0800
Subject: [PATCH 670/704] [ci][docker] Update Docker image tags (#13504)

To support #13400
---
 Jenkinsfile               | 18 +++++++++---------
 ci/jenkins/Jenkinsfile.j2 | 18 +++++++++---------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 377832461ef6..ea26b9e8ac02 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -49,16 +49,16 @@
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20221025-182121-e41d0ed6e'
-ci_gpu = 'tlcpack/ci-gpu:20221025-182121-e41d0ed6e'
-ci_cpu = 'tlcpack/ci-cpu:20221025-182121-e41d0ed6e'
-ci_minimal = 'tlcpack/ci-minimal:20221025-182121-e41d0ed6e'
-ci_wasm = 'tlcpack/ci-wasm:20221025-182121-e41d0ed6e'
-ci_i386 = 'tlcpack/ci-i386:20221025-182121-e41d0ed6e'
-ci_cortexm = 'tlcpack/ci-cortexm:20221025-182121-e41d0ed6e'
-ci_arm = 'tlcpack/ci-arm:20221025-182121-e41d0ed6e'
+ci_lint = 'tlcpack/ci-lint:20221128-070141-ae4fd7df7'
+ci_gpu = 'tlcpack/ci-gpu:20221128-070141-ae4fd7df7'
+ci_cpu = 'tlcpack/ci-cpu:20221128-070141-ae4fd7df7'
+ci_minimal = 'tlcpack/ci-minimal:20221128-070141-ae4fd7df7'
+ci_wasm = 'tlcpack/ci-wasm:20221128-070141-ae4fd7df7'
+ci_i386 = 'tlcpack/ci-i386:20221128-070141-ae4fd7df7'
+ci_cortexm = 'tlcpack/ci-cortexm:20221128-070141-ae4fd7df7'
+ci_arm = 'tlcpack/ci-arm:20221128-070141-ae4fd7df7'
 ci_hexagon = 'tlcpack/ci-hexagon:20221025-182121-e41d0ed6e'
-ci_riscv = 'tlcpack/ci-riscv:20221025-182121-e41d0ed6e'
+ci_riscv = 'tlcpack/ci-riscv:20221128-070141-ae4fd7df7'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index 2a3ade049361..3aa44294966e 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -51,16 +51,16 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 {% import 'ci/jenkins/macros.j2' as m with context -%}
 
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20221025-182121-e41d0ed6e'
-ci_gpu = 'tlcpack/ci-gpu:20221025-182121-e41d0ed6e'
-ci_cpu = 'tlcpack/ci-cpu:20221025-182121-e41d0ed6e'
-ci_minimal = 'tlcpack/ci-minimal:20221025-182121-e41d0ed6e'
-ci_wasm = 'tlcpack/ci-wasm:20221025-182121-e41d0ed6e'
-ci_i386 = 'tlcpack/ci-i386:20221025-182121-e41d0ed6e'
-ci_cortexm = 'tlcpack/ci-cortexm:20221025-182121-e41d0ed6e'
-ci_arm = 'tlcpack/ci-arm:20221025-182121-e41d0ed6e'
+ci_lint = 'tlcpack/ci-lint:20221128-070141-ae4fd7df7'
+ci_gpu = 'tlcpack/ci-gpu:20221128-070141-ae4fd7df7'
+ci_cpu = 'tlcpack/ci-cpu:20221128-070141-ae4fd7df7'
+ci_minimal = 'tlcpack/ci-minimal:20221128-070141-ae4fd7df7'
+ci_wasm = 'tlcpack/ci-wasm:20221128-070141-ae4fd7df7'
+ci_i386 = 'tlcpack/ci-i386:20221128-070141-ae4fd7df7'
+ci_cortexm = 'tlcpack/ci-cortexm:20221128-070141-ae4fd7df7'
+ci_arm = 'tlcpack/ci-arm:20221128-070141-ae4fd7df7'
 ci_hexagon = 'tlcpack/ci-hexagon:20221025-182121-e41d0ed6e'
-ci_riscv = 'tlcpack/ci-riscv:20221025-182121-e41d0ed6e'
+ci_riscv = 'tlcpack/ci-riscv:20221128-070141-ae4fd7df7'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images

From 95d2e9fa35524bbdafbe4ff758523eb571055d02 Mon Sep 17 00:00:00 2001
From: AndrewZhaoLuo <andrew.zhao.luo@gmail.com>
Date: Mon, 28 Nov 2022 19:36:42 -0800
Subject: [PATCH 671/704] [ARM] Add dynamic matvec support (#13502)

* [ARM] Add dynamic matvec support

* proper imports

Co-authored-by: Tristan Konolige <tkonolige@octoml.ai>
---
 python/tvm/relay/op/strategy/arm_cpu.py | 18 +++++++++++++++++-
 python/tvm/relay/op/strategy/x86.py     |  9 ++++-----
 python/tvm/topi/utils.py                |  7 ++++++-
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 5c25696a1ee1..261b979dedaf 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -21,7 +21,7 @@
 # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
 import re
 
-from tvm import relay, topi
+from tvm import relay, topi, tir
 
 from ....auto_scheduler import is_auto_scheduler_enabled
 from ....meta_schedule import is_meta_schedule_enabled
@@ -558,6 +558,22 @@ def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
             name="dense_dsp.arm_cpu",
         )
     else:
+        # For dynamic matrix-vector multiply we use a hand written kernel.
+        if (
+            isinstance(inputs[0].shape[0], (int, tir.IntImm))
+            and inputs[0].shape[0] == 1
+            and (
+                topi.utils.is_dynamic_shape(inputs[0].shape)
+                or topi.utils.is_dynamic_shape(inputs[1].shape)
+            )
+        ):
+            strategy.add_implementation(
+                wrap_compute_dense(topi.x86.dense_dynamic),
+                wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
+                name="dense_dynamic.x86",
+                plevel=20,
+            )
+            return strategy
         logger.warning("dense is not optimized for arm cpu.")
         strategy.add_implementation(
             wrap_compute_dense(
diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 897f7c4e588f..3e59209f5822 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -507,10 +507,6 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
     return strategy
 
 
-def is_dynamic_shape(shape):
-    return any([isinstance(x, (tir.Any, tir.SizeVar)) for x in shape])
-
-
 @dense_strategy.register("cpu")
 def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
@@ -520,7 +516,10 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
     if (
         isinstance(inputs[0].shape[0], (int, tir.IntImm))
         and inputs[0].shape[0] == 1
-        and (is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape))
+        and (
+            topi.utils.is_dynamic_shape(inputs[0].shape)
+            or topi.utils.is_dynamic_shape(inputs[1].shape)
+        )
     ):
         strategy.add_implementation(
             wrap_compute_dense(topi.x86.dense_dynamic),
diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py
index 8251dac4137b..7580eac0216d 100644
--- a/python/tvm/topi/utils.py
+++ b/python/tvm/topi/utils.py
@@ -23,7 +23,7 @@
 import numpy as np
 import tvm
 from tvm import te
-from tvm.tir import bijective_layout, layout
+from tvm.tir import Any, SizeVar, bijective_layout, layout
 
 from . import cpp, tag
 
@@ -534,3 +534,8 @@ def is_target(names):
     names = [names] if isinstance(names, str) else names
     target = tvm.target.Target.current(allow_none=False)
     return any(name in target.keys for name in names)
+
+
+def is_dynamic_shape(shape):
+    """Checks if any part of a shape is dynamic"""
+    return any([isinstance(x, (Any, SizeVar)) for x in shape])

From 57de9e7f3d2711582368903ce95f08b91216b7b5 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Mon, 28 Nov 2022 21:28:37 -0800
Subject: [PATCH 672/704] Revert "[microTVM] enable building microTVM
 components by default" (#13503)

* Revert "[microTVM] enable building microTVM components by default (#13073)"

This reverts commit e6629706479421783d7ee5fa29eb3fe862d9b981.

* USE_MICRO ON for minimal and wasm to work around CI limitation.

* revert USE_MICRO for wasm, unrelated
---
 CMakeLists.txt                               | 7 +------
 cmake/config.cmake                           | 3 +++
 conda/recipe/build.sh                        | 3 ---
 conda/recipe/meta.yaml                       | 3 ---
 tests/scripts/task_config_build_arm.sh       | 1 +
 tests/scripts/task_config_build_cortexm.sh   | 1 +
 tests/scripts/task_config_build_cpu.sh       | 1 +
 tests/scripts/task_config_build_gpu.sh       | 1 +
 tests/scripts/task_config_build_gpu_other.sh | 1 +
 tests/scripts/task_config_build_hexagon.sh   | 2 ++
 tests/scripts/task_config_build_i386.sh      | 1 +
 tests/scripts/task_config_build_minimal.sh   | 1 +
 tests/scripts/task_config_build_riscv.sh     | 1 +
 tests/scripts/task_config_build_wasm.sh      | 1 +
 14 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cb0bad8a50d6..736d516fa1f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,7 +58,7 @@ tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF)
 tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF)
 tvm_option(USE_RTTI "Build with RTTI" ON)
 tvm_option(USE_MSVC_MT "Build with MT" OFF)
-tvm_option(USE_MICRO "Build with Micro TVM support" ON)
+tvm_option(USE_MICRO "Build with Micro TVM support" OFF)
 tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF)
 tvm_option(HIDE_PRIVATE_SYMBOLS "Compile with -fvisibility=hidden." OFF)
 tvm_option(USE_TF_TVMDSOOP "Build with TensorFlow TVMDSOOp" OFF)
@@ -118,11 +118,6 @@ tvm_option(USE_CLML "Build with CLML Codegen support" OFF)
 tvm_option(USE_CLML_GRAPH_EXECUTOR "Build with CLML graph runtime" OFF)
 tvm_option(USE_UMA "Build with UMA support" OFF)
 
-# disable microTVM for iOS and hexagon builds
-if(${CMAKE_SYSTEM_NAME} MATCHES "iOS" OR USE_HEXAGON)
-  set(USE_MICRO OFF)
-endif()
-
 # include directories
 include_directories(${CMAKE_INCLUDE_PATH})
 include_directories("include")
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 0c803c0b6a2e..679f5c459e87 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -96,6 +96,9 @@ set(USE_SPIRV_KHR_INTEGER_DOT_PRODUCT OFF)
 # Whether enable OpenGL runtime
 set(USE_OPENGL OFF)
 
+# Whether enable MicroTVM runtime
+set(USE_MICRO OFF)
+
 # Whether enable RPC runtime
 set(USE_RPC ON)
 
diff --git a/conda/recipe/build.sh b/conda/recipe/build.sh
index aa2d2f3d5fb3..0131fd65a48e 100755
--- a/conda/recipe/build.sh
+++ b/conda/recipe/build.sh
@@ -45,9 +45,6 @@ rm -rf build || true
 mkdir -p build
 cd build
 
-export PREFIX="${PREFIX}/"
-cp -f ${PREFIX}/bin/ranlib $PREFIX
-
 cmake -DCMAKE_INSTALL_PREFIX="${PREFIX}" \
       -DCMAKE_BUILD_TYPE=Release \
       -DUSE_RPC=ON \
diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml
index b8463ebdd059..519b84c570d7 100644
--- a/conda/recipe/meta.yaml
+++ b/conda/recipe/meta.yaml
@@ -45,7 +45,6 @@ requirements:
   host:
     - zlib
     - llvmdev >=11
-    - cctools # [not win]
 
 outputs:
   - name: {{ pkg_name }}-libs
@@ -63,12 +62,10 @@ outputs:
         - llvmdev >=11
         - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }}  # [cuda]
         - cudnn >=7.6.0  # [cuda]
-        - cctools # [not win]
       run:
         - llvmdev >=11
         - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }}  # [cuda]
         - cudnn >=7.6.0  # [cuda]
-        - cctools # [not win]
 
   - name: {{ pkg_name }}
     script: install_tvm_python.sh  # [not win]
diff --git a/tests/scripts/task_config_build_arm.sh b/tests/scripts/task_config_build_arm.sh
index 35ecde2904bb..516e6ac86791 100755
--- a/tests/scripts/task_config_build_arm.sh
+++ b/tests/scripts/task_config_build_arm.sh
@@ -25,6 +25,7 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_RPC ON\) >> config.cmake
+echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-8\) >> config.cmake
diff --git a/tests/scripts/task_config_build_cortexm.sh b/tests/scripts/task_config_build_cortexm.sh
index 5407079c1a2c..f15ed81711f6 100755
--- a/tests/scripts/task_config_build_cortexm.sh
+++ b/tests/scripts/task_config_build_cortexm.sh
@@ -24,6 +24,7 @@ cd "$BUILD_DIR"
 cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
+echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_CMSISNN ON\) >> config.cmake
 echo set\(USE_ETHOSU ON\) >> config.cmake
 echo set\(USE_UMA ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index 66fc161ece96..e3d8aa9a1d1b 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -24,6 +24,7 @@ cd "$BUILD_DIR"
 cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
+echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_DNNL ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 0b8be8ac4cbe..ca5f3e935c08 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -29,6 +29,7 @@ echo set\(USE_CUDA ON\) >> config.cmake
 echo set\(USE_VULKAN ON\) >> config.cmake
 echo set\(USE_OPENGL ON\) >> config.cmake
 echo set\(USE_OPENCL ON\) >> config.cmake
+echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_LLVM \"/usr/bin/llvm-config-9 --link-static\"\) >> config.cmake
 echo set\(USE_NNPACK ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_gpu_other.sh b/tests/scripts/task_config_build_gpu_other.sh
index 747e1006e507..6fb10d44508a 100755
--- a/tests/scripts/task_config_build_gpu_other.sh
+++ b/tests/scripts/task_config_build_gpu_other.sh
@@ -27,6 +27,7 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_OPENCL ON\) >> config.cmake
 echo set\(USE_ROCM ON\) >> config.cmake
+echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE OFF\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
diff --git a/tests/scripts/task_config_build_hexagon.sh b/tests/scripts/task_config_build_hexagon.sh
index c8e70c00f97d..0736ed6b53b8 100755
--- a/tests/scripts/task_config_build_hexagon.sh
+++ b/tests/scripts/task_config_build_hexagon.sh
@@ -25,6 +25,8 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_RPC ON\) >> config.cmake
+echo set\(USE_MICRO ON\) >> config.cmake
+echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_LLVM "${CLANG_LLVM_HOME}/bin/llvm-config"\) >> config.cmake
 
 if [[ ${CI:-false} == "true" ]]; then
diff --git a/tests/scripts/task_config_build_i386.sh b/tests/scripts/task_config_build_i386.sh
index 18a7189e1470..369706dfd34a 100755
--- a/tests/scripts/task_config_build_i386.sh
+++ b/tests/scripts/task_config_build_i386.sh
@@ -25,6 +25,7 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_RPC ON\) >> config.cmake
+echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-4.0\) >> config.cmake
diff --git a/tests/scripts/task_config_build_minimal.sh b/tests/scripts/task_config_build_minimal.sh
index 651f54cea21b..9c8e101a7043 100755
--- a/tests/scripts/task_config_build_minimal.sh
+++ b/tests/scripts/task_config_build_minimal.sh
@@ -32,3 +32,4 @@ echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
+echo set\(USE_MICRO ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_riscv.sh b/tests/scripts/task_config_build_riscv.sh
index b39cb4b28e3b..9e11e5e255e9 100755
--- a/tests/scripts/task_config_build_riscv.sh
+++ b/tests/scripts/task_config_build_riscv.sh
@@ -24,6 +24,7 @@ cd "$BUILD_DIR"
 cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
+echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_CMSISNN ON\) >> config.cmake
 echo set\(USE_UMA ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_wasm.sh b/tests/scripts/task_config_build_wasm.sh
index e96288e36b7e..daa5481bea9d 100755
--- a/tests/scripts/task_config_build_wasm.sh
+++ b/tests/scripts/task_config_build_wasm.sh
@@ -24,6 +24,7 @@ cd "$BUILD_DIR"
 cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
+echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-11\) >> config.cmake

From c0ba8a195115ed5b6f5c53df951972b98276d15a Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Tue, 29 Nov 2022 11:09:24 +0300
Subject: [PATCH 673/704] [docs][Adreno] Remove unnecessary compilation flag
 (#13509)

Flag `-DUSE_MICRO=OFF` was added to avoid this issue #13482. In #13503 these changes were reverted.
Removed unnecessary compilation flag to avoid user confusing.
---
 docs/how_to/deploy/adreno.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/how_to/deploy/adreno.rst b/docs/how_to/deploy/adreno.rst
index af613aa5cb21..7f4616fbf797 100644
--- a/docs/how_to/deploy/adreno.rst
+++ b/docs/how_to/deploy/adreno.rst
@@ -94,7 +94,7 @@ folder of TVM:
 
    mkdir build_android
    cd build_android
-   cmake .. -DUSE_OPENCL=ON -DUSE_MICRO=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_NATIVE_API_LEVEL=android-28 -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=ON -DANDROID_STL=c++_static -DUSE_CPP_RPC=ON
+   cmake .. -DUSE_OPENCL=ON -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_NATIVE_API_LEVEL=android-28 -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=ON -DANDROID_STL=c++_static -DUSE_CPP_RPC=ON
    make -jN tvm_runtime tvm_rpc
 
 where **N** is the number of cores available on your *CPU*.

From f6f7feafb297993f5f035de7f814407a2b876967 Mon Sep 17 00:00:00 2001
From: neildhickey <neil.hickey@arm.com>
Date: Tue, 29 Nov 2022 15:08:48 +0000
Subject: [PATCH 674/704] [CMSIS-NN] Support int16 handling for pooling
 functions (#13498)

[CMSIS-NN] Support int16 handling for pooling functions

-Pattern matching and RelayToTIR introduce int16 support
-Added int16 variants to fully_connected tests
---
 python/tvm/relay/op/contrib/cmsisnn.py        | 12 +++++---
 .../backend/contrib/cmsisnn/relay_to_tir.cc   | 29 +++++++++++++++----
 .../backend/contrib/cmsisnn/tir_to_runtime.cc |  3 +-
 .../contrib/test_cmsisnn/test_pooling.py      | 12 ++++----
 4 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index 779fe35c3718..4581378dcd24 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -287,8 +287,10 @@ def check_qnn_avg_pool2d(pattern):
         return (
             pooling.attrs.layout == "NHWC"
             and int(input_op.checked_type.shape[0]) == 1
-            and input_op.checked_type.dtype == "int8"
-            and output.checked_type.dtype == "int8"
+            and (
+                (input_op.checked_type.dtype == "int8" and output.checked_type.dtype == "int8")
+                or (input_op.checked_type.dtype == "int16" and output.checked_type.dtype == "int16")
+            )
         )
 
     def qnn_max_pool2d_pattern():
@@ -310,8 +312,10 @@ def check_qnn_max_pool2d(pattern):
         return (
             pooling.attrs.layout == "NHWC"
             and int(input_op.checked_type.shape[0]) == 1
-            and input_op.checked_type.dtype == "int8"
-            and output.checked_type.dtype == "int8"
+            and (
+                (input_op.checked_type.dtype == "int8" and output.checked_type.dtype == "int8")
+                or (input_op.checked_type.dtype == "int16" and output.checked_type.dtype == "int16")
+            )
         )
 
     def binary_op_pattern(op):
diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
index c9e41589fb4b..f8685dc4df47 100644
--- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
+++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
@@ -428,12 +428,19 @@ class RelayToTIRVisitor : public MixedModeMutator {
       pool = final_call;
     }
 
+    int32_t dtype_bits = final_call->type_as<TensorTypeNode>()->dtype.bits();
+
     // prepare cmsis_nn_pool_params
     int32_t stride_h, stride_w, padding_h, padding_w, pool_size_h, pool_size_w;
     int32_t clip_min, clip_max;
     std::string cmsisnn_api;
     if (pool_name == "cmsis-nn.qnn_avg_pool2d") {
-      cmsisnn_api = "arm_avgpool_s8";
+      if (dtype_bits == 8) {
+        cmsisnn_api = "arm_avgpool_s8";
+      } else {
+        cmsisnn_api = "arm_avgpool_s16";
+      }
+
       const AvgPool2DAttrs* attrs = pool->attrs.as<AvgPool2DAttrs>();
       stride_h = qnn::get_const_int(attrs->strides[0]);
       stride_w = qnn::get_const_int(attrs->strides[1]);
@@ -442,7 +449,12 @@ class RelayToTIRVisitor : public MixedModeMutator {
       pool_size_h = qnn::get_const_int(attrs->pool_size[0]);
       pool_size_w = qnn::get_const_int(attrs->pool_size[1]);
     } else {
-      cmsisnn_api = "arm_max_pool_s8";
+      if (dtype_bits == 8) {
+        cmsisnn_api = "arm_max_pool_s8";
+      } else {
+        cmsisnn_api = "arm_max_pool_s16";
+      }
+
       const MaxPool2DAttrs* attrs = pool->attrs.as<MaxPool2DAttrs>();
       stride_h = qnn::get_const_int(attrs->strides[0]);
       stride_w = qnn::get_const_int(attrs->strides[1]);
@@ -456,8 +468,13 @@ class RelayToTIRVisitor : public MixedModeMutator {
       clip_min = clip_attrs->a_min;
       clip_max = clip_attrs->a_max;
     } else {
-      clip_min = -128;
-      clip_max = 127;
+      if (dtype_bits == 8) {
+        clip_min = std::numeric_limits<int8_t>::min();
+        clip_max = std::numeric_limits<int8_t>::max();
+      } else {
+        clip_min = std::numeric_limits<int16_t>::min();
+        clip_max = std::numeric_limits<int16_t>::max();
+      }
     }
 
     tvm::Array<PrimExpr> scalar_args = {ToArg(stride_h),  ToArg(stride_w), ToArg(padding_h),
@@ -472,8 +489,8 @@ class RelayToTIRVisitor : public MixedModeMutator {
     Array<PrimExpr> cmsisnn_output_shape{1, output_shape[1], output_shape[2], output_shape[3]};
 
     BufferCreator buffer_creator;
-    tir::Var input = buffer_creator.CreateBufferVar("input", DataType::Handle(8));
-    tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(8));
+    tir::Var input = buffer_creator.CreateBufferVar("input", DataType::Handle(dtype_bits));
+    tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(dtype_bits));
     tvm::Array<PrimExpr> call_ext_args = {tir::StringImm(cmsisnn_api), input, output};
 
     int context_buffer_size = 0;
diff --git a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
index b5c5058ddbc0..420e8618a4f9 100644
--- a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
+++ b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
@@ -118,7 +118,8 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     } else if (cmsis_func_name == "arm_fully_connected_s8" ||
                cmsis_func_name == "arm_fully_connected_s16") {
       EmitFullyConnected(op);
-    } else if (cmsis_func_name == "arm_avgpool_s8" || cmsis_func_name == "arm_max_pool_s8") {
+    } else if (cmsis_func_name == "arm_avgpool_s8" || cmsis_func_name == "arm_avgpool_s16" ||
+               cmsis_func_name == "arm_max_pool_s8" || cmsis_func_name == "arm_max_pool_s16") {
       EmitPool2D(op);
     }
     return;
diff --git a/tests/python/contrib/test_cmsisnn/test_pooling.py b/tests/python/contrib/test_cmsisnn/test_pooling.py
index 29140ad2e656..7657e0e63220 100644
--- a/tests/python/contrib/test_cmsisnn/test_pooling.py
+++ b/tests/python/contrib/test_cmsisnn/test_pooling.py
@@ -81,6 +81,7 @@ def make_model(
 
 
 @tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize("dtype", ["int16", "int8"])
 @pytest.mark.parametrize("in_shape", [(1, 28, 28, 12), (1, 64, 100, 4)])
 @pytest.mark.parametrize(
     "pool_size, strides, padding", [((3, 3), (2, 2), "SAME"), ((2, 2), (1, 1), "VALID")]
@@ -91,7 +92,8 @@ def make_model(
 @pytest.mark.parametrize(
     "compiler_cpu, cpu_flags", [("cortex-m55", "+nomve"), ("cortex-m55", ""), ("cortex-m7", "")]
 )
-def test_op_int8(
+def test_ops(
+    dtype,
     in_shape,
     pool_size,
     strides,
@@ -103,18 +105,17 @@ def test_op_int8(
     compiler_cpu,
     cpu_flags,
 ):
-    """Tests QNN pooling op for int8 inputs"""
+    """Tests QNN pooling op for int8 and int16 pooling"""
     interface_api = "c"
     use_unpacked_api = True
 
-    dtype = "int8"
-
     model = make_model(
         pool_op=pool_type,
         shape=in_shape,
         pool_size=pool_size,
         strides=strides,
         padding=padding,
+        dtype=dtype,
         scale=scale,
         zero_point=zero_point,
         relu_type=relu_type,
@@ -130,7 +131,7 @@ def test_op_int8(
     in_min, in_max = get_range_for_dtype_str(dtype)
     np.random.seed(0)
     inputs = {
-        "input": np.random.randint(in_min, high=in_max, size=in_shape, dtype="int8"),
+        "input": np.random.randint(in_min, high=in_max, size=in_shape, dtype=dtype),
     }
     output_list = generate_ref_data(orig_mod["main"], inputs)
     compile_and_run(
@@ -211,7 +212,6 @@ def test_int8_pool_with_float32_input(
 def test_invalid_datatype(op):
     """Checks CMSIS-NN partitioning for non int8 dtype"""
     model = make_model(pool_op=op, dtype="int64")
-
     orig_mod = make_module(model)
     cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod)
     assert_no_external_function(cmsisnn_mod)

From b9f89a2eb9c37cedfaaa4f316fd86ed5ba5dcf1f Mon Sep 17 00:00:00 2001
From: Benson Muite <bkmgit@users.noreply.github.com>
Date: Tue, 29 Nov 2022 18:39:26 +0300
Subject: [PATCH 675/704] [Docs][Bug] Fix broken link to tvmc python (#13499)

Ensure correct link is generated to the next page in the documentation.
Update description to indicate next section introduces Python API, and
the section after that discusses the same material but using Python
rather than the command line interface.
---
 gallery/tutorial/tvmc_command_line_driver.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/gallery/tutorial/tvmc_command_line_driver.py b/gallery/tutorial/tvmc_command_line_driver.py
index 39e5f06311cd..3f4413e848ce 100644
--- a/gallery/tutorial/tvmc_command_line_driver.py
+++ b/gallery/tutorial/tvmc_command_line_driver.py
@@ -518,6 +518,8 @@
 # To see what other options are available, please have a look at ``tvmc
 # --help``.
 #
-# In the next tutorial, `Compiling and Optimizing a Model with the Python
-# Interface <auto_tuning_with_pyton>`_, we will cover the same compilation
-# and optimization steps using the Python interface.
+# In the `next tutorial <tvmc_python>`, we introduce the Python interface to TVM,
+# and in the tutorial after that,
+# `Compiling and Optimizing a Model with the Python Interface <autotvm_relay_x86>`,
+# we will cover the same compilation and optimization steps using the Python
+# interface.

From b587e33027ced167cd66ae26646e8dc5b3293120 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Tue, 29 Nov 2022 15:52:54 -0600
Subject: [PATCH 676/704] [microTVM][Tutorial] Fix micro_aot and micro_autotune
 tutorials (#13513)

Added config_main_stack_size to autotune since running graph debugger requires more workspace. Also added to micro_aot because of KWS model.
Also removed west_cmd option in tutorial because of #13377
---
 gallery/how_to/work_with_microtvm/micro_aot.py      | 7 ++++++-
 gallery/how_to/work_with_microtvm/micro_autotune.py | 5 ++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/gallery/how_to/work_with_microtvm/micro_aot.py b/gallery/how_to/work_with_microtvm/micro_aot.py
index f702170f0bb9..4d6890f8d936 100644
--- a/gallery/how_to/work_with_microtvm/micro_aot.py
+++ b/gallery/how_to/work_with_microtvm/micro_aot.py
@@ -134,7 +134,12 @@
 
 if use_physical_hw:
     template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr"))
-    project_options = {"project_type": "host_driven", "board": BOARD, "serial_number": SERIAL}
+    project_options = {
+        "project_type": "host_driven",
+        "board": BOARD,
+        "serial_number": SERIAL,
+        "config_main_stack_size": 4096,
+    }
 
 temp_dir = tvm.contrib.utils.tempdir()
 generated_project_dir = temp_dir / "project"
diff --git a/gallery/how_to/work_with_microtvm/micro_autotune.py b/gallery/how_to/work_with_microtvm/micro_autotune.py
index ea83ef563940..13bf4efac138 100644
--- a/gallery/how_to/work_with_microtvm/micro_autotune.py
+++ b/gallery/how_to/work_with_microtvm/micro_autotune.py
@@ -154,7 +154,6 @@
         template_project_dir=pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")),
         project_options={
             "board": BOARD,
-            "west_cmd": "west",
             "verbose": False,
             "project_type": "host_driven",
             "serial_number": SERIAL,
@@ -222,10 +221,10 @@
         temp_dir / "project",
         {
             "board": BOARD,
-            "west_cmd": "west",
             "verbose": False,
             "project_type": "host_driven",
             "serial_number": SERIAL,
+            "config_main_stack_size": 4096,
         },
     )
 
@@ -266,10 +265,10 @@
         temp_dir / "project",
         {
             "board": BOARD,
-            "west_cmd": "west",
             "verbose": False,
             "project_type": "host_driven",
             "serial_number": SERIAL,
+            "config_main_stack_size": 4096,
         },
     )
 

From 694d4bf5eaf65df4eaad93188830112c6b139956 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 29 Nov 2022 13:58:21 -0800
Subject: [PATCH 677/704] [tir] Add copy on write to all nodes (#13512)

This enables copy on write methods for all nodes since some were missing
it before (see #13012 for more context)

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 include/tvm/ir/expr.h  |  2 ++
 include/tvm/tir/expr.h | 30 ++++++++++++++++++++++++++++++
 include/tvm/tir/stmt.h | 12 ++++++++++++
 3 files changed, 44 insertions(+)

diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h
index 94927b4892eb..bb4c468f452f 100644
--- a/include/tvm/ir/expr.h
+++ b/include/tvm/ir/expr.h
@@ -526,6 +526,7 @@ class IntImm : public PrimExpr {
   TVM_DLL IntImm(DataType dtype, int64_t value, Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(IntImm, PrimExpr, IntImmNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(IntImmNode);
 };
 
 /*!
@@ -572,6 +573,7 @@ class FloatImm : public PrimExpr {
   TVM_DLL FloatImm(DataType dtype, double value, Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(FloatImm, PrimExpr, FloatImmNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(FloatImmNode);
 };
 
 /*!
diff --git a/include/tvm/tir/expr.h b/include/tvm/tir/expr.h
index 674ff0b7f43c..689b1c0a17ad 100644
--- a/include/tvm/tir/expr.h
+++ b/include/tvm/tir/expr.h
@@ -79,6 +79,7 @@ class StringImm : public PrimExpr {
  public:
   TVM_DLL StringImm(String value, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(StringImm, PrimExpr, StringImmNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(StringImmNode);
 };
 
 /*!
@@ -117,6 +118,7 @@ class Cast : public PrimExpr {
  public:
   TVM_DLL Cast(DataType dtype, PrimExpr value, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Cast, PrimExpr, CastNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(CastNode);
 };
 
 /*!
@@ -165,6 +167,7 @@ class Add : public PrimExpr {
  public:
   TVM_DLL Add(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Add, PrimExpr, AddNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(AddNode);
 };
 
 /*! \brief a - b */
@@ -181,6 +184,7 @@ class Sub : public PrimExpr {
  public:
   TVM_DLL Sub(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Sub, PrimExpr, SubNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(SubNode);
 };
 
 /*! \brief a * b */
@@ -197,6 +201,7 @@ class Mul : public PrimExpr {
  public:
   TVM_DLL Mul(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Mul, PrimExpr, MulNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(MulNode);
 };
 
 /*!
@@ -216,6 +221,7 @@ class Div : public PrimExpr {
  public:
   TVM_DLL Div(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Div, PrimExpr, DivNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(DivNode);
 };
 
 /*!
@@ -235,6 +241,7 @@ class Mod : public PrimExpr {
  public:
   TVM_DLL Mod(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Mod, PrimExpr, ModNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(ModNode);
 };
 
 /*! \brief Floor division, floor(a/b) */
@@ -251,6 +258,7 @@ class FloorDiv : public PrimExpr {
  public:
   TVM_DLL FloorDiv(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(FloorDiv, PrimExpr, FloorDivNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(FloorDivNode);
 };
 
 /*! \brief The remainder of the floordiv */
@@ -267,6 +275,7 @@ class FloorMod : public PrimExpr {
  public:
   TVM_DLL FloorMod(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(FloorMod, PrimExpr, FloorModNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(FloorModNode);
 };
 
 /*! \brief min(a, b) */
@@ -283,6 +292,7 @@ class Min : public PrimExpr {
  public:
   TVM_DLL Min(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Min, PrimExpr, MinNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(MinNode);
 };
 
 /*! \brief max(a, b) */
@@ -299,6 +309,7 @@ class Max : public PrimExpr {
  public:
   TVM_DLL Max(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Max, PrimExpr, MaxNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(MaxNode);
 };
 
 /*!
@@ -347,6 +358,7 @@ class EQ : public PrimExpr {
  public:
   TVM_DLL EQ(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(EQ, PrimExpr, EQNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(EQNode);
 };
 
 /*! \brief a != b */
@@ -363,6 +375,7 @@ class NE : public PrimExpr {
  public:
   TVM_DLL NE(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(NE, PrimExpr, NENode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(NENode);
 };
 
 /*! \brief a < b */
@@ -379,6 +392,7 @@ class LT : public PrimExpr {
  public:
   TVM_DLL LT(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(LT, PrimExpr, LTNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(LTNode);
 };
 
 /*! \brief a <= b */
@@ -395,6 +409,7 @@ class LE : public PrimExpr {
  public:
   TVM_DLL LE(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(LE, PrimExpr, LENode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(LENode);
 };
 
 /*! \brief a > b */
@@ -411,6 +426,7 @@ class GT : public PrimExpr {
  public:
   TVM_DLL GT(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(GT, PrimExpr, GTNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(GTNode);
 };
 
 /*! \brief a >= b */
@@ -427,6 +443,7 @@ class GE : public PrimExpr {
  public:
   TVM_DLL GE(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(GE, PrimExpr, GENode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(GENode);
 };
 
 /*! \brief a && b */
@@ -466,6 +483,7 @@ class And : public PrimExpr {
  public:
   TVM_DLL And(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(And, PrimExpr, AndNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(AndNode);
 };
 
 /*! \brief a || b */
@@ -505,6 +523,7 @@ class Or : public PrimExpr {
  public:
   TVM_DLL Or(PrimExpr a, PrimExpr b, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Or, PrimExpr, OrNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(OrNode);
 };
 
 /*! \brief !a */
@@ -540,6 +559,7 @@ class Not : public PrimExpr {
  public:
   TVM_DLL Not(PrimExpr a, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Not, PrimExpr, NotNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(NotNode);
 };
 
 /*!
@@ -591,6 +611,7 @@ class Select : public PrimExpr {
   TVM_DLL Select(PrimExpr condition, PrimExpr true_value, PrimExpr false_value, Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(Select, PrimExpr, SelectNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(SelectNode);
 };
 
 /*!
@@ -706,6 +727,7 @@ class ProducerLoad : public PrimExpr {
   TVM_DLL explicit ProducerLoad(DataProducer producer, Array<PrimExpr> indices, Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(ProducerLoad, PrimExpr, ProducerLoadNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(ProducerLoadNode);
 };
 
 /*!
@@ -765,6 +787,7 @@ class Load : public PrimExpr {
   TVM_DLL Load(DataType dtype, Var buffer_var, PrimExpr index, PrimExpr predicate,
                Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Load, PrimExpr, LoadNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(LoadNode);
 };
 
 /*!
@@ -817,6 +840,7 @@ class Ramp : public PrimExpr {
  public:
   TVM_DLL Ramp(PrimExpr base, PrimExpr stride, int lanes, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Ramp, PrimExpr, RampNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(RampNode);
 };
 
 /*! \brief Create a vector where all the elements are value. */
@@ -856,6 +880,7 @@ class Broadcast : public PrimExpr {
  public:
   TVM_DLL Broadcast(PrimExpr value, int lanes, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Broadcast, PrimExpr, BroadcastNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(BroadcastNode);
 };
 
 /*!
@@ -902,6 +927,7 @@ class Let : public PrimExpr {
  public:
   TVM_DLL Let(Var var, PrimExpr value, PrimExpr body, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Let, PrimExpr, LetNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(LetNode);
 };
 
 /*!
@@ -948,6 +974,7 @@ class Call : public PrimExpr {
  public:
   TVM_DLL Call(DataType dtype, RelayExpr op, Array<PrimExpr> args, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Call, PrimExpr, CallNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(CallNode);
 };
 
 /*!
@@ -995,6 +1022,7 @@ class Shuffle : public PrimExpr {
   TVM_DLL static PrimExpr ExtractElement(PrimExpr vector, int index, Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(Shuffle, PrimExpr, ShuffleNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(ShuffleNode);
 };
 
 // Reduce operator
@@ -1124,6 +1152,7 @@ class Reduce : public PrimExpr {
                  int value_index, Array<PrimExpr> init, Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(Reduce, PrimExpr, ReduceNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(ReduceNode);
 };
 
 /*! \brief Any shape. */
@@ -1159,6 +1188,7 @@ class Any : public PrimExpr {
   TVM_DLL Any(Span span = Span());
 
   TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Any, PrimExpr, AnyNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(AnyNode);
 };
 
 /*
diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index 6865326b8849..5beea44cdb1a 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -102,6 +102,7 @@ class LetStmt : public Stmt {
   TVM_DLL LetStmt(Var var, PrimExpr value, Stmt body, Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(LetStmt, Stmt, LetStmtNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(LetStmtNode);
 };
 
 /*!
@@ -158,6 +159,7 @@ class AttrStmt : public Stmt {
   TVM_DLL AttrStmt(ObjectRef node, String attr_key, PrimExpr value, Stmt body, Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(AttrStmt, Stmt, AttrStmtNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(AttrStmtNode);
 };
 
 /*!
@@ -206,6 +208,7 @@ class AssertStmt : public Stmt {
   TVM_DLL AssertStmt(PrimExpr condition, PrimExpr message, Stmt body, Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(AssertStmt, Stmt, AssertStmtNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(AssertStmtNode);
 };
 
 /*!
@@ -271,6 +274,7 @@ class Store : public Stmt {
                 Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(Store, Stmt, StoreNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(StoreNode);
 };
 
 /*!
@@ -442,6 +446,7 @@ class ProducerStore : public Stmt {
                         Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(ProducerStore, Stmt, ProducerStoreNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(ProducerStoreNode);
 };
 
 /*!
@@ -505,6 +510,7 @@ class ProducerRealize : public Stmt {
                           String storage_scope = "", Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(ProducerRealize, Stmt, ProducerRealizeNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(ProducerRealizeNode);
 };
 
 /*!
@@ -679,6 +685,7 @@ class AllocateConst : public Stmt {
                         Map<String, ObjectRef> annotations = Map<String, ObjectRef>(),
                         Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(AllocateConst, Stmt, AllocateConstNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(AllocateConstNode);
 };
 
 /*! \brief Declare a buffer that can be used in the body */
@@ -812,6 +819,7 @@ class SeqStmt : public Stmt {
   };
 
   TVM_DEFINE_OBJECT_REF_METHODS(SeqStmt, Stmt, SeqStmtNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(SeqStmtNode);
 };
 
 /*!
@@ -898,6 +906,7 @@ class Evaluate : public Stmt {
   explicit Evaluate(int value, Span span = Span()) : Evaluate(PrimExpr(value), span) {}
 
   TVM_DEFINE_OBJECT_REF_METHODS(Evaluate, Stmt, EvaluateNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(EvaluateNode);
 };
 
 /*!
@@ -1055,6 +1064,7 @@ class While : public Stmt {
   TVM_DLL While(PrimExpr condition, Stmt body, Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(While, Stmt, WhileNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(WhileNode);
 };
 
 /*!
@@ -1099,6 +1109,7 @@ class Prefetch : public Stmt {
   TVM_DLL explicit Prefetch(Buffer buffer, Array<Range> bounds, Span span = Span());
 
   TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Prefetch, Stmt, PrefetchNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(PrefetchNode);
 };
 
 /*!
@@ -1203,6 +1214,7 @@ class MatchBufferRegion : public ObjectRef {
   TVM_DLL explicit MatchBufferRegion(Buffer buffer, BufferRegion source);
 
   TVM_DEFINE_OBJECT_REF_METHODS(MatchBufferRegion, ObjectRef, MatchBufferRegionNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(MatchBufferRegionNode);
 };
 
 /*!

From e47eed13d9a0af3bab5fc6fafbc8228797419249 Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Tue, 29 Nov 2022 19:54:41 -0600
Subject: [PATCH 678/704] Add methods to get the size of VTCM on device as well
 as the allocated size of the HexagonVtcmPool (#13511)

* Get device size and allocated size of VTCM

* Add python API
---
 src/runtime/hexagon/hexagon_device_api.cc         |  6 ++++++
 src/runtime/hexagon/hexagon_vtcm_pool.cc          | 15 ++++++++-------
 src/runtime/hexagon/hexagon_vtcm_pool.h           | 14 ++++++++++----
 .../hexagon/hexagon_vtcm_pool_tests.cc            |  9 +++++++--
 4 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 51cc976e46d3..a1d55db42f28 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -299,6 +299,12 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.release_resources")
       api->ReleaseResources();
     });
 
+TVM_REGISTER_GLOBAL("device_api.hexagon.vtcm_device_bytes")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      HexagonDeviceAPI* api = HexagonDeviceAPI::Global();
+      *rv = static_cast<int32_t>(api->VtcmPool()->VtcmDeviceBytes());
+    });
+
 TVM_REGISTER_GLOBAL("device_api.hexagon").set_body([](TVMArgs args, TVMRetValue* rv) {
   DeviceAPI* ptr = HexagonDeviceAPI::Global();
   *rv = static_cast<void*>(ptr);
diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.cc b/src/runtime/hexagon/hexagon_vtcm_pool.cc
index 17089852a954..8373ef61c9a4 100644
--- a/src/runtime/hexagon/hexagon_vtcm_pool.cc
+++ b/src/runtime/hexagon/hexagon_vtcm_pool.cc
@@ -29,20 +29,19 @@ HexagonVtcmPool::HexagonVtcmPool() {
   compute_res_attr_t res_info;
   HEXAGON_SAFE_CALL(HAP_compute_res_attr_init(&res_info));
 
-  unsigned int total_block_size;
   unsigned int avail_block_size;
   compute_res_vtcm_page_t total_block_layout;
   compute_res_vtcm_page_t avail_block_layout;
 
-  HEXAGON_SAFE_CALL(compute_resource_query_VTCM(/* application_id = */ 0, &total_block_size,
+  HEXAGON_SAFE_CALL(compute_resource_query_VTCM(/* application_id = */ 0, &vtcm_device_size_,
                                                 &total_block_layout, &avail_block_size,
                                                 &avail_block_layout));
-  DLOG(INFO) << "HexagonVtcmPool total " << total_block_size << " avail " << avail_block_size;
+  DLOG(INFO) << "HexagonVtcmPool total " << vtcm_device_size_ << " avail " << avail_block_size;
   CHECK(avail_block_size >= (1024 * 1024)) << "Less than 1MB VTCM available";
 
   // allocate nbytes of vtcm on a single page
   HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param_v2(&res_info,
-                                                           /*vtcm_size = */ total_block_size,
+                                                           /*vtcm_size = */ vtcm_device_size_,
                                                            /*min_page_size = */ 1,
                                                            /*min_vtcm_size = */ avail_block_size));
 
@@ -50,11 +49,13 @@ HexagonVtcmPool::HexagonVtcmPool() {
   // hanging, both in the simulator and on hardware.
   context_id_ = HAP_compute_res_acquire(&res_info, /*timeout = */ 0);
   CHECK(context_id_) << "HAP_compute_res_acquire failed to acquire requested VTCM resource.";
-  HEXAGON_SAFE_CALL(HAP_compute_res_attr_get_vtcm_ptr_v2(&res_info, &vtcm_data_, &vtcm_size_));
+  HEXAGON_SAFE_CALL(
+      HAP_compute_res_attr_get_vtcm_ptr_v2(&res_info, &vtcm_data_, &vtcm_allocated_size_));
   CHECK(vtcm_data_ != nullptr) << "HAP_compute_res_acquire returned nullptr when allocating VTCM.";
-  CHECK(vtcm_size_ >= avail_block_size)
+  CHECK(vtcm_allocated_size_ >= avail_block_size)
       << "HAP_compute_res_acquire failed to allocate minimum amount of VTCM";
-  free_.emplace_back(std::pair<char*, size_t>(static_cast<char*>(vtcm_data_), vtcm_size_));
+  free_.emplace_back(
+      std::pair<char*, size_t>(static_cast<char*>(vtcm_data_), vtcm_allocated_size_));
   // DebugDump();
 }
 
diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.h b/src/runtime/hexagon/hexagon_vtcm_pool.h
index 2e0918e997c4..88b8f1470cf3 100644
--- a/src/runtime/hexagon/hexagon_vtcm_pool.h
+++ b/src/runtime/hexagon/hexagon_vtcm_pool.h
@@ -68,7 +68,10 @@ class HexagonVtcmPool {
   void Free(void* ptr, size_t nbytes);
 
   //! \brief Returns the total number of bytes in this pool
-  size_t TotalBytes() { return reinterpret_cast<size_t>(vtcm_size_); }
+  size_t VtcmDeviceBytes() { return reinterpret_cast<size_t>(vtcm_device_size_); }
+
+  //! \brief Returns the total number of bytes in this pool
+  size_t VtcmAllocatedBytes() { return reinterpret_cast<size_t>(vtcm_allocated_size_); }
 
   bool IsVtcm(void* ptr, unsigned size) {
     auto char_ptr = static_cast<char*>(ptr);
@@ -76,15 +79,18 @@ class HexagonVtcmPool {
     auto char_vtcm = static_cast<char*>(vtcm_data_);
     CHECK(vtcm_data_ != nullptr);
 
-    if (char_ptr >= char_vtcm && (char_ptr + size) <= (char_vtcm + vtcm_size_)) {
+    if (char_ptr >= char_vtcm && (char_ptr + size) <= (char_vtcm + vtcm_allocated_size_)) {
       return true;
     }
     return false;
   }
 
  private:
-  //! \brief Total size of VTCM pool
-  unsigned int vtcm_size_;
+  //! \brief Total size of VTCM memory on device
+  unsigned int vtcm_device_size_;
+
+  //! \brief Total size of VTCM pool allocated on device
+  unsigned int vtcm_allocated_size_;
 
   //! \brief Pointer to the beginning of the pool
   void* vtcm_data_;
diff --git a/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc b/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
index 81bd31cc84d5..8240241eee26 100644
--- a/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_vtcm_pool_tests.cc
@@ -27,13 +27,15 @@ using namespace tvm::runtime::hexagon;
 class HexagonVtcmPoolTest : public ::testing::Test {
   void SetUp() override {
     vtcm_pool = HexagonDeviceAPI::Global()->VtcmPool();
-    max_bytes = vtcm_pool->TotalBytes();
+    max_bytes = vtcm_pool->VtcmAllocatedBytes();
+    device_bytes = vtcm_pool->VtcmDeviceBytes();
   }
   void TearDown() override {}
 
  public:
   HexagonVtcmPool* vtcm_pool;
   size_t max_bytes;
+  size_t device_bytes;
   size_t four_k_block = 4096;
   size_t two_k_block = 2048;
   size_t one_k_block = 1024;
@@ -44,6 +46,9 @@ TEST_F(HexagonVtcmPoolTest, basic) {
   void* ptr;
   void* ptr2;
 
+  CHECK(device_bytes >= max_bytes) << "VTCM device size " << device_bytes
+                                   << " not greater than or equal to allocated size " << max_bytes;
+
   ptr = vtcm_pool->Allocate(max_bytes);
   CHECK((reinterpret_cast<uintptr_t>(ptr) & 0x7FF) == 0)
       << "Must be multiple of 2k " << ptr << " " << max_bytes;
@@ -123,7 +128,7 @@ TEST_F(HexagonVtcmPoolTest, free_alloc_combinations) {
   void* ptr3;
   void* ptr4;
   void* new_ptr;
-  size_t max_less_3_blocks = vtcm_pool->TotalBytes() - (3 * two_k_block);
+  size_t max_less_3_blocks = max_bytes - (3 * two_k_block);
   ptr1 = vtcm_pool->Allocate(two_k_block);
   ptr2 = vtcm_pool->Allocate(two_k_block);
   ptr3 = vtcm_pool->Allocate(two_k_block);

From 8dc8d248a1c3af96a4496900983bb8152ee128ae Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@quicinc.com>
Date: Tue, 29 Nov 2022 19:55:07 -0600
Subject: [PATCH 679/704] [LLVM] Switch to using New Pass Manager (NPM) with
 LLVM 16+ (#13515)

LLVM 16 has removed support for the legacy pass manager, and the LLVM
codegen in TVM will no longer even compile.
---
 src/target/llvm/codegen_amdgpu.cc |  3 ++
 src/target/llvm/codegen_llvm.cc   | 69 ++++++++++++++++++++++++++++++-
 src/target/llvm/codegen_llvm.h    |  3 ++
 src/target/llvm/codegen_nvptx.cc  |  3 ++
 4 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc
index 89b567b6b077..4e83e83ba7e3 100644
--- a/src/target/llvm/codegen_amdgpu.cc
+++ b/src/target/llvm/codegen_amdgpu.cc
@@ -202,9 +202,12 @@ class CodeGenAMDGPU : public CodeGenLLVM {
     }
   }
 
+#if TVM_LLVM_VERSION < 160
+  // This function only works with the legacy pass manager.
   void InitPassManagerBuilder(llvm::PassManagerBuilder* builder) final {
     // Additional optimization hook to tweak the builder.
   }
+#endif
 
   unsigned GetGlobalAddressSpace() const final { return 1; }
 
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 87479ec74237..dce7d0b82f0d 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -58,7 +58,6 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/IR/LLVMContext.h>
-#include <llvm/IR/LegacyPassManager.h>
 #include <llvm/IR/MDBuilder.h>
 #include <llvm/IR/Metadata.h>
 #include <llvm/IR/Module.h>
@@ -66,6 +65,14 @@
 #include <llvm/IRReader/IRReader.h>
 #include <llvm/Linker/Linker.h>
 #include <llvm/Pass.h>
+#if TVM_LLVM_VERSION >= 160
+#include <llvm/IR/Verifier.h>  // For VerifierPass
+#include <llvm/Passes/PassBuilder.h>
+#include <llvm/Passes/StandardInstrumentations.h>
+#else
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/Transforms/IPO/PassManagerBuilder.h>
+#endif
 #if TVM_LLVM_VERSION >= 100
 #include <llvm/Support/Alignment.h>
 #include <llvm/Support/TypeSize.h>
@@ -75,7 +82,6 @@
 #include <llvm/Support/SourceMgr.h>
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Transforms/IPO.h>
-#include <llvm/Transforms/IPO/PassManagerBuilder.h>
 #include <llvm/Transforms/Utils/ModuleUtils.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/crt/error_codes.h>
@@ -351,6 +357,64 @@ llvm::Value* CodeGenLLVM::CreateStorageSync(const CallNode* op) {
   return nullptr;
 }
 
+#if TVM_LLVM_VERSION >= 160
+
+// Use new pass manager
+
+void CodeGenLLVM::Optimize() {
+  llvm::TargetMachine* tm = llvm_target_->GetOrCreateTargetMachine();
+
+  bool debug_logging = false;
+  bool verify_each = false;
+
+  llvm::PipelineTuningOptions pto = llvm::PipelineTuningOptions();
+  llvm::PassInstrumentationCallbacks pic;
+  llvm::PassBuilder builder(tm, pto, llvm::None, &pic);
+
+  llvm::LoopAnalysisManager lam;
+  llvm::FunctionAnalysisManager fam;
+  llvm::CGSCCAnalysisManager cgam;
+  llvm::ModuleAnalysisManager mam;
+  builder.registerLoopAnalyses(lam);
+  builder.registerFunctionAnalyses(fam);
+  builder.registerCGSCCAnalyses(cgam);
+  builder.registerModuleAnalyses(mam);
+  builder.crossRegisterProxies(lam, fam, cgam, mam);
+
+  // Construct the default pass pipeline depending on the opt level.
+  std::string pipeline;
+  switch (llvm_target_->GetOptLevel()) {
+    case llvm::CodeGenOpt::Level::None:
+      pipeline = "default<O0>";
+      break;
+    case llvm::CodeGenOpt::Level::Less:
+      pipeline = "default<O1>";
+      break;
+    case llvm::CodeGenOpt::Level::Default:
+      pipeline = "default<O2>";
+      break;
+    default:
+      // CodeGenOpt::Level::Aggressive
+      pipeline = "default<O3>";
+      break;
+  }
+
+  llvm::StandardInstrumentations si(*llvm_target_->GetContext(), debug_logging, verify_each);
+  si.registerCallbacks(pic, &fam);
+  llvm::ModulePassManager mpass;
+  if (verify_each) {
+    mpass.addPass(llvm::VerifierPass());
+  }
+  if (auto err = builder.parsePassPipeline(mpass, pipeline)) {
+    LOG(FATAL) << "error parsing pass pipeline '" << pipeline
+               << "':" << llvm::toString(std::move(err)) << '\n';
+  }
+
+  mpass.run(*module_, mam);
+}
+
+#else  // TVM_LLVM_VERSION
+
 class FPassManager : public llvm::legacy::FunctionPassManager {
  public:
   explicit FPassManager(llvm::Module* m) : llvm::legacy::FunctionPassManager(m) {}
@@ -420,6 +484,7 @@ void CodeGenLLVM::Optimize() {
   fpass.doFinalization();
   mpass.run(*module_);
 }
+#endif  // TVM_LLVM_VERSION
 
 int CodeGenLLVM::NativeVectorBits(const runtime::StorageScope& storage_scope) const {
   return native_vector_bits_;
diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h
index 7a8daf2e761f..1ae9d14dc4ad 100644
--- a/src/target/llvm/codegen_llvm.h
+++ b/src/target/llvm/codegen_llvm.h
@@ -302,8 +302,11 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
   virtual llvm::Value* GetThreadIndex(const IterVar& iv);
   // Get the corresponding thread index
   virtual llvm::Value* CreateStorageSync(const CallNode* op);
+#if TVM_LLVM_VERSION < 160
+  // This function only works with the legacy pass manager.
   // apply optimization on the module.
   virtual void InitPassManagerBuilder(llvm::PassManagerBuilder* builder);
+#endif
   // Scalarize by iterating elements of e.
   // f is a callback that takes index and v.
   void Scalarize(const PrimExpr& e, std::function<void(int i, llvm::Value* v)> f);
diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc
index 2442d2ccbaa4..ff330e52d33c 100644
--- a/src/target/llvm/codegen_nvptx.cc
+++ b/src/target/llvm/codegen_nvptx.cc
@@ -183,9 +183,12 @@ class CodeGenNVPTX : public CodeGenLLVM {
     }
   }
 
+#if TVM_LLVM_VERSION < 160
+  // This function only works with the legacy pass manager.
   void InitPassManagerBuilder(llvm::PassManagerBuilder* builder) final {
     // Additional optimization hook to tweak the builder.
   }
+#endif
 
   void Optimize() final {
     for (auto& f : *module_) {

From c2dd53d5315b3073a14ced200ab55426ac69904e Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 30 Nov 2022 08:17:44 -0600
Subject: [PATCH 680/704] [TE][TIR] Improved naming when converting TE to
 schedulable TIR (#13431)

Prior to this commit, loop iterators were named `i0`, `i1`, and so on,
while the `BlockNode::iter_vars` used the name from the TE `IterVar`.
As a result, after `BlockNode::iter_vars` is lowered out, the
resulting `PrimFunc` no longer contained the user-generated iterator
names.  This commit updates the TIR conversion so that the loop
iterators take the name of the TE `IterVar`, and the
`BlockNode::iter_vars` are named `v_$IterVarName`.
---
 src/te/operation/create_primfunc.cc                | 14 +++++++-------
 .../test_meta_schedule_schedule_rule_mlt.py        | 10 +++++-----
 .../test_tir_transform_inject_software_pipeline.py |  4 ++--
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index 223f8dcd5dd0..21456af1bdf4 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -153,7 +153,7 @@ BlockRealize GenerateBlockFromTensors(const te::ComputeOp& compute_op,
   auto f_push_block_vars = [&iter_vars, &var_map, &analyzer](const Array<IterVar>& iters) {
     for (IterVar iter_var : iters) {
       // Create new var
-      Var new_var(iter_var->var->name_hint, iter_var->var->dtype);
+      Var new_var("v_" + iter_var->var->name_hint, iter_var->var->dtype);
       var_map[iter_var->var.get()] = new_var;
 
       PrimExpr dom_min = analyzer->Simplify(iter_var->dom->min);
@@ -307,12 +307,12 @@ Stmt GenerateStmtFromCompute(const te::ComputeOp& compute_op, CreateFuncInfo* in
   // Step 1. Creating loop vars for block bindings.
   Array<IterVar> axes = compute_op->axis;
   axes.insert(axes.end(), compute_op->reduce_axis.begin(), compute_op->reduce_axis.end());
-  Array<PrimExpr> bindings;
-  for (size_t i = 0; i < axes.size(); ++i) {
-    const IterVar& axis = axes[i];
-    int bits = std::max(axis->dom->min.dtype().bits(), axis->dom->extent.dtype().bits());
-    bindings.push_back(Var("i" + std::to_string(i), runtime::DataType::Int(bits)));
-  }
+
+  Array<PrimExpr> bindings = axes.Map([&](IterVar iter_var) -> PrimExpr {
+    int bits = std::max(iter_var->dom->min.dtype().bits(), iter_var->dom->extent.dtype().bits());
+    return Var(iter_var->var->name_hint, runtime::DataType::Int(bits));
+  });
+
   // Step 2. Generate block bodies.
   Array<Stmt> seq_stmt;
   if (compute_op->body[0]->IsInstance<ReduceNode>()) {
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
index 24e34302202b..2c5a44d7a29f 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
@@ -635,12 +635,12 @@ def test_cache_read_specify_consumer():
     )
 
     residual_block = """
-        for i0, i1 in T.grid(512, 512):
+        for ax0, ax1 in T.grid(512, 512):
             with T.block("T_add"):
-                ax0, ax1 = T.axis.remap("SS", [i0, i1])
-                T.reads(C[ax0, ax1], A[ax0, ax1])
-                T.writes(T_add[ax0, ax1])
-                T_add[ax0, ax1] = C[ax0, ax1] + A[ax0, ax1]
+                v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1])
+                T.reads(C[v_ax0, v_ax1], A[v_ax0, v_ax1])
+                T.writes(T_add[v_ax0, v_ax1])
+                T_add[v_ax0, v_ax1] = C[v_ax0, v_ax1] + A[v_ax0, v_ax1]
     """
 
     assert residual_block in space[0].mod.script()
diff --git a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
index c70525b05712..006b67d62697 100644
--- a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
+++ b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py
@@ -1507,7 +1507,7 @@ def test_async_pipelined_mma_gemm_simple():
     assert body.block.body.body[1].block.body.body.value == 3
 
     assert epilogue.block.body.body.block.body.body.attr_key == "async_wait_inflight_count"
-    assert str(epilogue.block.body.body.block.body.body.value) == "(2 - i2_0_0: int32)"
+    assert str(epilogue.block.body.body.block.body.body.value) == "(2 - k_0_0: int32)"
 
     build_and_run(sch)
 
@@ -1554,7 +1554,7 @@ def test_async_nested_pipeline_mma_gemm_ideal_annotation():
     assert body.block.body.body[1].block.body.body.attr_key == "async_wait_inflight_count"
     assert body.block.body.body[1].block.body.body.value == 2
 
-    assert str(epilogue.block.body.body[0].block.body.body.value) == "(1 - i2_0_0: int32)"
+    assert str(epilogue.block.body.body[0].block.body.body.value) == "(1 - k_0_0: int32)"
 
     build_and_run(sch)
 

From ab3f54d3f72981bc5afab7f21ea3e65b9d1b34d8 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 30 Nov 2022 12:01:24 -0600
Subject: [PATCH 681/704] [CI] Add `set -x` for demo scripts (#13523)

Previously, the `tests/scripts/task_demo_microtvm.sh` script prints
the commands as they are run, but the
`apps/microtvm/ethosu/run_demo.sh` script did not.  As a result, it
could be difficult to tell which step stalled or failed during CI.
---
 apps/microtvm/cmsisnn/run_demo.sh | 1 +
 apps/microtvm/ethosu/run_demo.sh  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/apps/microtvm/cmsisnn/run_demo.sh b/apps/microtvm/cmsisnn/run_demo.sh
index 3b51f8418363..e5d1064e6e65 100755
--- a/apps/microtvm/cmsisnn/run_demo.sh
+++ b/apps/microtvm/cmsisnn/run_demo.sh
@@ -19,6 +19,7 @@
 set -e
 set -u
 set -o pipefail
+set -x
 
 # Show usage
 function show_usage() {
diff --git a/apps/microtvm/ethosu/run_demo.sh b/apps/microtvm/ethosu/run_demo.sh
index e48366e48fb3..7490f979b834 100755
--- a/apps/microtvm/ethosu/run_demo.sh
+++ b/apps/microtvm/ethosu/run_demo.sh
@@ -19,6 +19,7 @@
 set -e
 set -u
 set -o pipefail
+set -x
 
 # Show usage
 function show_usage() {

From 6782a35018d5df63486569f17296737d75eedc55 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 30 Nov 2022 12:28:42 -0600
Subject: [PATCH 682/704] [TIR] Disable RewriteSimplifier extensions in
 RemoveNoOp (#13524)

During a `tir.Simplify` pass, these extensions were conditionally
enabled based on the `PassContext`.  Prior to this commit, they were
enabled by default in the `tir.RemoveNoOp` pass, as the simplified
expressions were only used to prove/disprove a no-op, and did not
appear in the output TIR.  However, this caused performance issues for
some nested boolean expressions.

This PR disables the analyzer extensions for the analyzer used by
`tir.RemoveNoOp`.  The extensions are still used internally by
`ControlFlowGraph`, including during the data-flow analysis used if
`tir.transform.RemoveNoOpConfig.use_dataflow_analysis` is enabled, so
the opt-in data-dependent no-op removals are unaffected.

Related to issue #13508.
---
 src/tir/transforms/remove_no_op.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/tir/transforms/remove_no_op.cc b/src/tir/transforms/remove_no_op.cc
index 3374f975f5ac..52468e73d474 100644
--- a/src/tir/transforms/remove_no_op.cc
+++ b/src/tir/transforms/remove_no_op.cc
@@ -306,10 +306,6 @@ Pass RemoveNoOp() {
     }
 
     arith::Analyzer analyzer;
-    analyzer.rewrite_simplify.SetEnabledExtensions(arith::RewriteSimplifier::Extension(
-        arith::RewriteSimplifier::kTransitivelyProveInequalities |
-        arith::RewriteSimplifier::kConvertBooleanToAndOfOrs |
-        arith::RewriteSimplifier::kApplyConstraintsToBooleanBranches));
 
     auto* n = f.CopyOnWrite();
     n->body = NoOpRemover::Apply(std::move(n->body), &analyzer, std::move(touch_pattern), nullptr);

From c7d7164c421ead653d1a300a9610e8d9da14722b Mon Sep 17 00:00:00 2001
From: Valery Chernov <black.chervi@gmail.com>
Date: Thu, 1 Dec 2022 21:18:57 +0300
Subject: [PATCH 683/704] [QNN] support zero points as variable scalar for
 QnnBatchMatMul op (#13469)

* support zero points as variable scalar

* lint fix

* error logging was added for unsupported case when zero point is N-d tensor

* fix misprinting

* remove unnecessary TODOs

Co-authored-by: Valery Chernov <valery.chernov@deelvin.com>
---
 src/relay/qnn/op/batch_matmul.cc | 78 +++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/src/relay/qnn/op/batch_matmul.cc b/src/relay/qnn/op/batch_matmul.cc
index be5a314e8025..a948d9387d6b 100644
--- a/src/relay/qnn/op/batch_matmul.cc
+++ b/src/relay/qnn/op/batch_matmul.cc
@@ -96,20 +96,42 @@ Expr BatchMatmulFirstTerm(const Expr& quantized_x, const Expr& quantized_y,
 }
 
 Expr BatchMatmulSecondTerm(const Expr& x_quantized_data, const Expr& y_zero_point) {
-  Array<Integer> axes = {2};
-  return Multiply(y_zero_point, Sum(Cast(x_quantized_data, DataType::Int(32)), axes, true, false));
+  if (IsScalar(y_zero_point)) {
+    Array<Integer> axes = {2};
+    return Multiply(y_zero_point,
+                    Sum(Cast(x_quantized_data, DataType::Int(32)), axes, true, false));
+  } else {
+    LOG(FATAL) << "Tensor zero point (non-scalar) is not supported";
+    return Expr();
+  }
 }
 
 Expr BatchMatmulThirdTerm(const Expr& y_quantized_data, const Expr& x_zero_point,
                           int broadcast_dim_size) {
-  Array<Integer> axes = {2};
-  auto reducemult =
-      Multiply(x_zero_point, Sum(Cast(y_quantized_data, DataType::Int(32)), axes, true, false));
-  Array<Integer> newshape;
-
-  // dimension of 0 in reshape copies old dimension size
-  newshape = {0, 1, broadcast_dim_size};
-  return Reshape(reducemult, newshape);
+  if (IsScalar(x_zero_point)) {
+    Array<Integer> axes = {2};
+    auto reducemult =
+        Multiply(x_zero_point, Sum(Cast(y_quantized_data, DataType::Int(32)), axes, true, false));
+    Array<Integer> newshape;
+
+    // dimension of 0 in reshape copies old dimension size
+    newshape = {0, 1, broadcast_dim_size};
+    return Reshape(reducemult, newshape);
+  } else {
+    LOG(FATAL) << "Tensor zero point (non-scalar) is not supported";
+    return Expr();
+  }
+}
+
+Expr BatchMatmulFourthTerm(Expr x_zero_point, Expr y_zero_point, int reduction_dim_size) {
+  if (IsScalar(x_zero_point) && IsScalar(y_zero_point)) {
+    auto zero_point_mul = Multiply(x_zero_point, y_zero_point);
+    auto const_scale = MakeConstantScalar(DataType::Int(32), reduction_dim_size);
+    return Multiply(zero_point_mul, const_scale);
+  } else {
+    LOG(FATAL) << "Tensor zero point (non-scalar) is not supported";
+    return Expr();
+  }
 }
 
 Expr BatchMatmulFourthTerm(int x_zero_point_int, int y_zero_point_int, int reduction_dim_size) {
@@ -175,27 +197,31 @@ Expr QnnBatchMatmulCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
 
   const auto* qnn_batch_matmul_attrs = attrs.as<BatchMatmulAttrs>();
 
-  // Extract the integer zero points.
-  auto y_zero_point_int = GetScalarFromConstant<int>(y_zero_point);
-  auto x_zero_point_int = GetScalarFromConstant<int>(x_zero_point);
-
   // Get all the terms as described in the comments.
   auto term1 = BatchMatmulFirstTerm(quantized_x, quantized_y, qnn_batch_matmul_attrs);
   auto term2 = BatchMatmulSecondTerm(quantized_x, y_zero_point);
   auto term3 = BatchMatmulThirdTerm(quantized_y, x_zero_point, broadcast_dim_size);
-  auto term4 = BatchMatmulFourthTerm(x_zero_point_int, y_zero_point_int, reduction_dim_size);
-
-  // Combine those 4 terms depending on the zero points to get the best lowering.
-  if (x_zero_point_int == 0 && y_zero_point_int == 0) {
-    // term 2, 3 and 4 become zero.
-    return term1;
-  } else if (x_zero_point_int == 0 && y_zero_point_int != 0) {
-    // term 3 and term 4 become zero.
-    return Subtract(term1, term2);
-  } else if (x_zero_point_int != 0 && y_zero_point_int == 0) {
-    // term 2 and term 4 become zero.
-    return Subtract(term1, term3);
+
+  if (IsConstScalar(x_zero_point) && IsConstScalar(y_zero_point)) {
+    // Extract the integer zero points.
+    auto y_zero_point_int = GetScalarFromConstant<int>(y_zero_point);
+    auto x_zero_point_int = GetScalarFromConstant<int>(x_zero_point);
+    auto term4 = BatchMatmulFourthTerm(x_zero_point_int, y_zero_point_int, reduction_dim_size);
+    // Combine those 4 terms depending on the zero points to get the best lowering.
+    if (x_zero_point_int == 0 && y_zero_point_int == 0) {
+      // term 2, 3 and 4 become zero.
+      return term1;
+    } else if (x_zero_point_int == 0 && y_zero_point_int != 0) {
+      // term 3 and term 4 become zero.
+      return Subtract(term1, term2);
+    } else if (x_zero_point_int != 0 && y_zero_point_int == 0) {
+      // term 2 and term 4 become zero.
+      return Subtract(term1, term3);
+    } else {
+      return BatchMatmulCombineTerms(term1, term2, term3, term4);
+    }
   } else {
+    auto term4 = BatchMatmulFourthTerm(x_zero_point, y_zero_point, reduction_dim_size);
     return BatchMatmulCombineTerms(term1, term2, term3, term4);
   }
 }

From bf16b42edb94d016fd03ee68cf664d99c5f97e61 Mon Sep 17 00:00:00 2001
From: Anirudh Sundar Subramaniam <quic_sanirudh@quicinc.com>
Date: Fri, 2 Dec 2022 00:11:00 +0530
Subject: [PATCH 684/704] [Hexagon] Add HVX quant conv2d implementation
 (#13256)

* [Hexagon] Add HVX quant conv2d implementation

This patch adds a new HVX intrinsic implementation to perform quantized
convolution.

It assumes that the qnn.conv2d relay op is not
canonicalized and all the quantization parameters (scales and zero
points) are passed into the intrinsic implementation.

It also uses the fixed point computation function defined in hexagon
topi utils to compute a fixed point (combined) scale which is used to
perform the final requantization before returning the quantized output.

* Remove inline keywords and add debug asserts
---
 cmake/modules/Hexagon.cmake                   |   9 +
 .../tvm => src}/runtime/hexagon/ops/conv2d.h  | 145 +++++++-
 src/runtime/hexagon/ops/conv2d_fp16_hvx.cc    |  57 ++--
 src/runtime/hexagon/ops/conv2d_quant_hvx.cc   | 319 ++++++++++++++++++
 src/runtime/hexagon/ops/conv_utils.cc         | 170 ++++------
 .../hexagon/hexagon_conv_utils_test.h         | 102 ++++++
 .../hexagon/hexagon_fp16_utils_tests.cc       |  96 ++----
 .../hexagon/hexagon_quant_utils_tests.cc      | 224 ++++++++++++
 .../contrib/test_hexagon/infrastructure.py    |   2 +-
 .../topi/test_conv2d_quant_intrin.py          | 261 ++++++++++++++
 10 files changed, 1179 insertions(+), 206 deletions(-)
 rename {include/tvm => src}/runtime/hexagon/ops/conv2d.h (54%)
 create mode 100644 src/runtime/hexagon/ops/conv2d_quant_hvx.cc
 create mode 100644 tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h
 create mode 100644 tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc
 create mode 100644 tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py

diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index 31cece8a19e0..887211893558 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -178,6 +178,15 @@ if(BUILD_FOR_HEXAGON)
     "${TVMRT_SOURCE_DIR}/hexagon/ops/*.cc"
   )
 
+  include_directories(
+    "${TVMRT_SOURCE_DIR}/hexagon/ops"
+  )
+
+  set_source_files_properties(
+    "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_quant_hvx.cc"
+    PROPERTIES COMPILE_FLAGS "-mhvx"
+  )
+
   set_source_files_properties(
     "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc"
     PROPERTIES COMPILE_FLAGS "-mhvx"
diff --git a/include/tvm/runtime/hexagon/ops/conv2d.h b/src/runtime/hexagon/ops/conv2d.h
similarity index 54%
rename from include/tvm/runtime/hexagon/ops/conv2d.h
rename to src/runtime/hexagon/ops/conv2d.h
index d759149727e8..76c6cccff73d 100644
--- a/include/tvm/runtime/hexagon/ops/conv2d.h
+++ b/src/runtime/hexagon/ops/conv2d.h
@@ -20,6 +20,7 @@
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
 
+#include <algorithm>
 #include <cassert>
 
 #ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_
@@ -28,6 +29,7 @@
 namespace tvm {
 namespace runtime {
 namespace hexagon {
+namespace conv_utils {
 static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0};
 
 // Standalone DLTensor: the standalone-ness means that this object owns the shape
@@ -75,14 +77,36 @@ inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); }
 
 inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); }
 
-constexpr int xyc_to_sm_16b(int y, int x, int c) {
+constexpr int yxc_to_sm_16b(int y, int x, int c) {
   // Map y,x,c coordinates within a block to the offset (in 16-bit elements)
   // from the beginning of the block in spatial-major layout.
   // 10-bit spatial mask: yyyxcccccx
   assert(y >= 0 && x >= 0 && c >= 0);
+  assert(y < 8 && x < 4 && c < 32);
   return y << 7 | (x & 2) << 5 | c << 1 | (x & 1);
 }
 
+constexpr int yxc_to_sm_8b(int y, int x, int c) {
+  // Map y,x,c coordinates within a block to the offset (in 8-bit elements)
+  // from the beginning of the block in spatial-major layout.
+  // 10-bit spatial mask: yyyxxxccccc
+  assert(y >= 0 && x >= 0 && c >= 0);
+  assert(y < 8 && x < 8 && c < 32);
+  return y << 8 | x << 5 | c;
+}
+
+constexpr int hwio_to_sm_8b(int width, int y, int x, int i, int o) {
+  // Map y,x,i,o coordinates within a chunk (assuming the origin at the
+  // top-left spatial corner) to the offset (in 8-bit elements) from the
+  // beginning of the chunk in spatial-major layout.
+  // Spatial mask: p..piiioooooii, where p..p are position bits.
+  assert(width >= 1);
+  assert(y >= 0 && x >= 0 && i >= 0 && o >= 0);
+  assert(i < 32 && o < 32);
+  int p = y * width + (width - 1 - x);
+  return p << 10 | (i & 0x1c) << 5 | o << 2 | (i & 3);
+}
+
 constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) {
   // Map y,x,i,o coordinates within a chunk (assuming the origin at the
   // top-left spatial corner) to the offset (in 16-bit elements) from the
@@ -90,11 +114,12 @@ constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) {
   // Spatial mask: p..piiiioooooi, where p..p are position bits.
   assert(width >= 1);
   assert(y >= 0 && x >= 0 && i >= 0 && o >= 0);
+  assert(i < 32 && o < 32);
   int p = y * width + (width - 1 - x);
   return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1);
 }
 
-inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; }
+constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; }
 
 // Returns the block address at the given index
 // Assumptions
@@ -123,6 +148,10 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) {
  * The input is mapped into the below mentioned layout (notation similar to index map used for
  * transform layout):
  *
+ * For uint8_t type
+ * lambda n, h, w, c: n, h//8, w//8, c//32, AXIS_SEPARATOR, h%8, w%8, c%32
+ *
+ * For uint16_t type
  * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2
  *
  * where AXIS_SEPARATOR represents split up in the physical layout
@@ -133,7 +162,48 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) {
  * @param width
  * @param depth
  */
-void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth);
+template <typename T, int block_height, int block_width, int block_depth>
+void blockize_hwc(void* out, void* inp_flat, int height, int width, int depth) {
+  int (*index_func)(int, int, int);
+  if constexpr (std::is_same_v<T, uint8_t>)
+    index_func = yxc_to_sm_8b;
+  else if constexpr (std::is_same_v<T, uint16_t>)
+    index_func = yxc_to_sm_16b;
+  else
+    LOG_ERROR << "blockize_hwc is only supported for uint8_t and uint16_t types";
+
+  auto inp_data = static_cast<T*>(inp_flat);
+  auto out_data = static_cast<uintptr_t*>(out);
+  const int stride_x = depth;
+  const int stride_y = stride_x * width;
+
+  for (int cy = 0; cy < height; cy += block_height) {
+    for (int cx = 0; cx < width; cx += block_width) {
+      for (int cc = 0; cc < depth; cc += block_depth) {
+        auto block = reinterpret_cast<T*>(*out_data++);
+        int max_y = std::min(block_height, height - cy);
+        int max_x = std::min(block_width, width - cx);
+        int max_c = std::min(block_depth, depth - cc);
+        for (int y = 0; y < max_y; ++y) {
+          for (int x = 0; x < max_x; ++x) {
+            for (int c = 0; c < max_c; ++c) {
+              block[index_func(y, x, c)] =
+                  inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)];
+            }
+            for (int c = max_c; c < block_depth; ++c) block[index_func(y, x, c)] = 0;
+          }
+          for (int x = max_x; x < block_width; ++x) {
+            for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0;
+          }
+        }
+
+        for (int y = max_y; y < block_height; ++y)
+          for (int x = 0; x < block_width; ++x)
+            for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0;
+      }  // cc
+    }    // cx
+  }      // cy
+}
 
 /**
  * @brief Convert back from non-contguous layout to a flat layout
@@ -144,7 +214,42 @@ void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int dept
  * @param width
  * @param depth
  */
-void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth);
+template <typename T, int block_height, int block_width, int block_depth>
+void deblockize_hwc(void* out_flat, void* inp, int height, int width, int depth) {
+  int (*index_func)(int, int, int);
+  if constexpr (std::is_same_v<T, uint8_t>)
+    index_func = yxc_to_sm_8b;
+  else if constexpr (std::is_same_v<T, uint16_t>)
+    index_func = yxc_to_sm_16b;
+  else
+    LOG_ERROR << "deblockize_hwc is only supported for uint8_t and uint16_t types";
+
+  uintptr_t* inp_data = static_cast<uintptr_t*>(inp);
+  T* out_data = static_cast<T*>(out_flat);
+  const int stride_x = depth;
+  const int stride_y = stride_x * width;
+
+  for (int cy = 0; cy < height; cy += block_height) {
+    for (int cx = 0; cx < width; cx += block_width) {
+      for (int cc = 0; cc < depth; cc += block_depth) {
+        auto block = reinterpret_cast<T*>(*inp_data);
+        int max_y = std::min(block_height, height - cy);
+        int max_x = std::min(block_width, width - cx);
+        int max_c = std::min(block_depth, depth - cc);
+        for (int y = 0; y < max_y; ++y) {
+          for (int x = 0; x < max_x; ++x) {
+            for (int c = 0; c < max_c; ++c) {
+              out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] =
+                  block[index_func(y, x, c)];
+            }
+          }
+        }
+
+        inp_data++;
+      }
+    }
+  }
+}
 
 /**
  * @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below:
@@ -175,15 +280,42 @@ void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int de
  */
 void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height,
                        int width, int idepth, int odepth);
+void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width,
+                      int idepth, int odepth);
 
+template <typename T, int block_height, int block_width, int block_depth>
 SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat,
-                          bool copy_data);
+                          bool copy_data) {
+  tvm::runtime::String vtcm_scope = "global.vtcm";
+
+  // Allocate blocks for activations. We will use the block pointers
+  // directly from the allocated area.
+  int n = nhwc_flat->shape[0];
+  int h = round_up(nhwc_flat->shape[1], block_height);
+  int w = round_up(nhwc_flat->shape[2], block_width);
+  int c = round_up(nhwc_flat->shape[3], block_depth);
+  int64_t shape_2d[2] = {(n * h * w * c) / (block_height * block_width * block_depth),
+                         block_height * block_width * block_depth};
+  void* nhwc_vtcm =
+      device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope);
+  if (copy_data) {
+    blockize_hwc<T, block_height, block_width, block_depth>(
+        nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2], nhwc_flat->shape[3]);
+  }
 
-int calculate_num_weight_chunks(int64_t* shape_hwio);
+  return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm,
+                      {n, h / block_height, w / block_width, c / block_depth});
+}
+
+int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width,
+                                int chunk_in_channel, int chunk_out_channel);
 
 SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
                           int num_chunks, void** ptr_table);
 
+SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
+                             int num_chunks, void** ptr_table, int wgt_zp = 0);
+
 template <size_t N>
 void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor<N>& tensor) {
   if (auto* data_space = tensor.GetDataSpace()) {
@@ -191,6 +323,7 @@ void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor<N>& tensor) {
   }
 }
 
+}  // namespace conv_utils
 }  // namespace hexagon
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
index a478fbab352d..53ea0868ad2a 100644
--- a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
+++ b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
@@ -27,7 +27,7 @@
 #include <cassert>
 #include <cinttypes>
 
-#include "tvm/runtime/hexagon/ops/conv2d.h"
+#include "conv2d.h"
 
 // Current limitations:
 // - N in NHWC must be 1
@@ -68,7 +68,7 @@ namespace hexagon {
  */
 static inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi,
                                       int xio, int ci, int xii, const DLTensor& tensor) {
-  auto block_ptr = nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c);
+  auto block_ptr = conv_utils::nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c);
   auto block_offset = yi * 128 + xio * 64 + ci * 2 + xii;
   auto first_element_ptr = reinterpret_cast<uint16_t*>(block_ptr);
   return first_element_ptr + block_offset;
@@ -279,10 +279,10 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act,  // NOLINT(*)
         }
         int fx = (fw < wgt_chunk_thin_width) ? fw : ((fw - wgt_chunk_thin_width) % 4);
         int fy = fh % 8;
-        for (int c = 0; c < round_up(filt_idepth, 2); c += 2) {
+        for (int c = 0; c < conv_utils::round_up(filt_idepth, 2); c += 2) {
           int out_act_cc = c / 32;
           int ci = c % 32;
-          auto wgt_chunk = hwio_at(cr_filt, fch, fcw, out_act_cc, out_c);
+          auto wgt_chunk = conv_utils::hwio_at(cr_filt, fch, fcw, out_act_cc, out_c);
 
           // Find weight chunk offset ptr
           int max_x = (fcw == 0) ? wgt_chunk_thin_width : 4;
@@ -306,7 +306,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act,  // NOLINT(*)
                                                true_wo, ci, true_wi, cr_act);
           HVX_Vector act_vec = getInputVector(act_element_ptr);
 
-          auto wgt_chunk_offset = hwio_to_sm_16b(max_x, fy, fx, ci, 0);
+          auto wgt_chunk_offset = conv_utils::hwio_to_sm_16b(max_x, fy, fx, ci, 0);
           auto base_chunk_ptr = reinterpret_cast<uint16_t*>(wgt_chunk);
           auto chunk_ptr = base_chunk_ptr + wgt_chunk_offset;
 
@@ -404,7 +404,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act,  // NOLINT(*)
 
 int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val,
                        int out_code, void* res_handle) {
-  namespace hexagonrt = tvm::runtime::hexagon;
+  namespace conv_utils = tvm::runtime::hexagon::conv_utils;
   ICHECK_EQ(num_args, 7) << "Unexpected number of arguments";
   ICHECK_EQ(type_codes[0], kTVMDLTensorHandle)
       << "First argument is expected to be the input tensor";  // Input activations
@@ -440,50 +440,55 @@ int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue*
            << wgt_flat->shape[2] << "x" << wgt_flat->shape[3] << ", pad_top=" << pad_top
            << ", pad_left=" << pad_left;
 
-  auto* device_api = tvm::runtime::DeviceAPI::Get(hexagonrt::hexagon_device, false);
+  auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false);
   ICHECK(device_api != nullptr);
   tvm::runtime::String vtcm_scope = "global.vtcm";
 
-  auto act_vtcm = hexagonrt::prepare_nhwc(device_api, act_flat, /*copy_data=*/true);
+  auto act_vtcm =
+      conv_utils::prepare_nhwc<uint16_t, 8, 4, 32>(device_api, act_flat, /*copy_data=*/true);
 
   ICHECK_NE(wgt_flat->shape[0], 0) << "Weights height should not be zero";
   ICHECK_NE(wgt_flat->shape[1], 0) << "Weights width should not be zero";
   ICHECK_NE(wgt_flat->shape[2], 0) << "Weights input channels should not be zero";
   ICHECK_NE(wgt_flat->shape[3], 0) << "Weights output channels should not be zero";
-  int num_wgt_chunks = hexagonrt::calculate_num_weight_chunks(wgt_flat->shape);
+  int num_wgt_chunks = conv_utils::calculate_num_weight_chunks(
+      wgt_flat->shape, /* chunk_height */ 8, /* chunk_width */ 4, /* chunk_in_channel */ 32,
+      /* chunk_out_channel */ 32);
+
   LOG_INFO << "num_wgt_chunks: " << num_wgt_chunks;
   auto wgt_ptr_table =
       reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
-  auto wgt_vtcm = hexagonrt::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table);
+  auto wgt_vtcm = conv_utils::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table);
 
-  auto out_vtcm = hexagonrt::prepare_nhwc(device_api, out_flat, /*copy_data=*/false);
+  auto out_vtcm =
+      conv_utils::prepare_nhwc<uint16_t, 8, 4, 32>(device_api, out_flat, /*copy_data=*/false);
 
   // Prepare zero_block
   int64_t block_nbytes = 2048;
-  void* zero_block = device_api->AllocDataSpace(hexagonrt::hexagon_device, 1, &block_nbytes,
+  void* zero_block = device_api->AllocDataSpace(conv_utils::hexagon_device, 1, &block_nbytes,
                                                 tvm::runtime::DataType::UInt(8), vtcm_scope);
   memset(zero_block, 0, 2048);
 
   // FIXME: Setting bias to zero_block: this works for up to 256 output channels.
   auto bias_flat =
-      hexagonrt::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]);
-  auto act_shape = hexagonrt::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape);
-  auto filt_shape = hexagonrt::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape);
-  auto pad_shape = hexagonrt::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left});
-  auto out_shape = hexagonrt::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape);
+      conv_utils::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]);
+  auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape);
+  auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape);
+  auto pad_shape = conv_utils::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left});
+  auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape);
   bool relu = false;
 
-  hexagonrt::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, bias_flat,
-                                 filt_shape, pad_shape, relu, stride_h, stride_w,
-                                 hexagonrt::to_uint(zero_block));
+  tvm::runtime::hexagon::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape,
+                                             bias_flat, filt_shape, pad_shape, relu, stride_h,
+                                             stride_w, conv_utils::to_uint(zero_block));
 
-  hexagonrt::deblockize_hwc_16b(out_flat->data, out_vtcm.data, out_flat->shape[1],
-                                out_flat->shape[2], out_flat->shape[3]);
+  conv_utils::deblockize_hwc<uint16_t, 8, 4, 32>(out_flat->data, out_vtcm.data, out_flat->shape[1],
+                                                 out_flat->shape[2], out_flat->shape[3]);
 
-  device_api->FreeDataSpace(hexagonrt::hexagon_device, zero_block);
-  hexagonrt::release(device_api, out_vtcm);
-  hexagonrt::release(device_api, wgt_vtcm);
-  hexagonrt::release(device_api, act_vtcm);
+  device_api->FreeDataSpace(conv_utils::hexagon_device, zero_block);
+  conv_utils::release(device_api, out_vtcm);
+  conv_utils::release(device_api, wgt_vtcm);
+  conv_utils::release(device_api, act_vtcm);
 
   return 0;
 }
diff --git a/src/runtime/hexagon/ops/conv2d_quant_hvx.cc b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc
new file mode 100644
index 000000000000..682eebb137c0
--- /dev/null
+++ b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc
@@ -0,0 +1,319 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <hexagon_types.h>
+#include <hvx_hexagon_protos.h>
+#include <inttypes.h>
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/device_api.h>
+
+#include "conv2d.h"
+
+extern "C" int conv2d_packed_quant(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val,
+                                   int out_code, void* res_handle);
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+inline uint8_t* getElementPtr_int8(int block_out_y, int block_out_x, int block_out_c, int yi,
+                                   int xi, int ci, const DLTensor& block) {
+  auto block_ptr =
+      tvm::runtime::hexagon::conv_utils::nhwc_at(block, 0, block_out_y, block_out_x, block_out_c);
+  const int width_stride = 32;
+  const int height_stride = width_stride * 8;
+  auto block_offset = yi * height_stride + xi * width_stride + ci;
+  auto first_element_ptr = reinterpret_cast<uint8_t*>(block_ptr);
+  return first_element_ptr + block_offset;
+}
+
+inline int8_t* getWgtPtr_int8(int out_i, int out_o, int h, int w, int i, int o,
+                              const DLTensor& wgt_vtcm, int width) {
+  auto data = static_cast<intptr_t*>(wgt_vtcm.data);
+  auto chunk = data[out_i * wgt_vtcm.shape[3] + out_o];
+  auto base_chunk_ptr = reinterpret_cast<int8_t*>(chunk);
+  auto wgt_chunk_offset = tvm::runtime::hexagon::conv_utils::hwio_to_sm_8b(width, h, w, i, o);
+  return base_chunk_ptr + wgt_chunk_offset;
+}
+
+int32_t saturate_uint8(int32_t val) { return std::max(std::min(val, 255), 0); }
+
+int32_t saturate_int8(int32_t val) { return std::max(std::min(val, 127), -128); }
+
+/**
+ * @brief Compute the quantized convolution along with requantize with output quantization params to
+ * get uint8 outputs
+ *
+ * The quantized convolution is represented by the below equation
+ * out_scale(out_q - out_zp) = Σr,s,c(act_scale(act_q[n,h+r,w+s,c] - act_zp) *
+ *                             wgt_scale(wgt_q[r,s,c,o] - wgt_zp))
+ * => out_q = Σr,s,c((act_q[n,h+r,w+s,c] - act_zp) * (wgt_q[r,s,c,o] - wgt_zp))
+ *            * (act_scale*wgt_scale/out_scale) + out_zp
+ * out_q = Σr,s,c((act_q[n,h+r,w+s,c] - act_zp) * (wgt_zp_q[r,s,c,o])) *
+ * (act_scale*wgt_scale/out_scale) + out_zp, where wgt_zp_q = (wgt_q[r,s,c,o] - wgt_zp)
+ *
+ *  Assumptions/Limitations:
+ *  - Strided convolution is not yet supported so the stride variables are unused
+ *
+ * @param cr_out blockized output tensor with zeros already filled in
+ * @param cr_act blockized activations
+ * @param cr_filt Chunkified weights as returned from output of prepare_hwio
+ * @param out_shape Original output shape of the tensor before blockization
+ * @param act_shape Original input shape
+ * @param filt_shape Original filter shape
+ * @param act_scale Quantization scale for activation
+ * @param act_zp Activations zero point
+ * @param wgt_scale Quantization scale for weights
+ * @param wgt_zp Weights zero point
+ * @param out_scale Quantization scale for output
+ * @param out_zp Output zero point
+ * @param fixed_final_scale Fixed point value of final_scale= (act_scale*wgt_scale/out_scale)
+ * @param scale_factor Scale factor for the fixed_final_scale
+ */
+void conv_layer_int8_hvx_whole(DLTensor& cr_out, const DLTensor& cr_act,  // NOLINT(*)
+                               const DLTensor& cr_filt, const DLTensor& out_shape,
+                               const DLTensor& act_shape, const DLTensor& filt_shape,
+                               float act_scale, int act_zp, float wgt_scale, int wgt_zp,
+                               float out_scale, int out_zp, int fixed_final_scale,
+                               int scale_factor) {
+  namespace conv_utils = tvm::runtime::hexagon::conv_utils;
+  int filt_height = filt_shape.shape[0];
+  int filt_width = filt_shape.shape[1];
+  int filt_idepth = filt_shape.shape[2];
+
+  int a_depth = cr_act.shape[3];
+
+  int o_height = cr_out.shape[1];
+  int o_width = cr_out.shape[2];
+  int o_depth = cr_out.shape[3];
+
+  int out_height = out_shape.shape[1];
+  int out_width = out_shape.shape[2];
+
+  uint8_t act_zp_u8 = static_cast<uint8_t>(act_zp);
+  int8_t wgt_zp_i8 = static_cast<int8_t>(wgt_zp);
+
+  HVX_Vector act_zp_vec = Q6_Vb_vsplat_R(act_zp_u8);
+  HVX_Vector wgt_zp_vec = Q6_Vb_vsplat_R(wgt_zp_i8);
+  HVX_VectorPair wgt_zp_vec_pair = Q6_Wh_vsxt_Vb(wgt_zp_vec);
+
+  ICHECK_EQ(a_depth, cr_filt.shape[2]) << "input depth should match weights input channels";
+  ICHECK_EQ(o_depth, cr_filt.shape[3]) << "output depth should match the weights output channel";
+
+  uint32_t scale_u = static_cast<uint32_t>(fixed_final_scale);
+  HVX_Vector scale_vec = Q6_V_vsplat_R(scale_u);
+  uint32_t new_scale_factor = static_cast<uint32_t>(scale_factor - 16);
+  HVX_Vector out_zp_vec = Q6_V_vsplat_R(out_zp);
+
+  auto computeOutVec = [&cr_act, &cr_filt, &act_zp_vec, &wgt_zp_vec_pair, &out_zp_vec, &scale_vec,
+                        new_scale_factor, filt_height, filt_width,
+                        filt_idepth](int out_h, int out_w, int out_c, int h, int w) -> HVX_Vector {
+    HVX_Vector out_vec = Q6_V_vzero();
+    for (int fh = 0; fh < filt_height; ++fh) {
+      for (int fw = 0; fw < filt_width; ++fw) {
+        for (int c = 0; c < conv_utils::round_up(filt_idepth, 4); c += 4) {
+          int act_h = out_h * 8 + h + fh;
+          int act_ho = act_h / 8;
+          int act_hi = act_h % 8;
+
+          int act_w = out_w * 8 + w + fw;
+          int act_wo = act_w / 8;
+          int act_wi = act_w % 8;
+
+          int act_co = c / 32;
+          int act_ci = c % 32;
+
+          uint8_t* act_ptr =
+              getElementPtr_int8(act_ho, act_wo, act_co, act_hi, act_wi, act_ci, cr_act);
+
+          uint32_t four_act_elems = *reinterpret_cast<uint32_t*>(act_ptr);
+          HVX_Vector act_vec = Q6_V_vsplat_R(four_act_elems);
+          int8_t* wgt_ptr = getWgtPtr_int8(act_co, out_c, fh, fw, act_ci, 0, cr_filt, filt_width);
+
+          HVX_Vector* wgt_vec_ptr = reinterpret_cast<HVX_Vector*>(wgt_ptr);
+          HVX_Vector wgt_vec = *wgt_vec_ptr;
+
+          HVX_VectorPair act_vec_zp_diff = Q6_Wh_vsub_VubVub(act_vec, act_zp_vec);
+          HVX_VectorPair wgt_i16_vec_nodiff = Q6_Wh_vsxt_Vb(wgt_vec);
+          HVX_VectorPair wgt_i16_vec = Q6_Wh_vsub_WhWh_sat(wgt_i16_vec_nodiff, wgt_zp_vec_pair);
+
+          out_vec = Q6_Vw_vdmpyacc_VwVhVh_sat(out_vec, Q6_V_lo_W(act_vec_zp_diff),
+                                              Q6_V_lo_W(wgt_i16_vec));
+          out_vec = Q6_Vw_vdmpyacc_VwVhVh_sat(out_vec, Q6_V_hi_W(act_vec_zp_diff),
+                                              Q6_V_hi_W(wgt_i16_vec));
+        }
+      }
+    }
+    HVX_Vector mul_vec = Q6_Vw_vmpye_VwVuh(out_vec, scale_vec);
+    HVX_Vector scaled_vec = Q6_Vw_vasr_VwR(mul_vec, new_scale_factor);
+    HVX_Vector sum_vec = Q6_Vw_vadd_VwVw(scaled_vec, out_zp_vec);
+    return sum_vec;
+  };
+
+  auto saturateAndStore = [&cr_out, &computeOutVec](int out_h, int out_w, int out_c, int h, int w) {
+    uint8_t* out_ptr = getElementPtr_int8(out_h, out_w, out_c, h, w, 0, cr_out);
+    HVX_Vector* out_vec_ptr = reinterpret_cast<HVX_Vector*>(out_ptr);
+    HVX_Vector out_vec1, out_vec2, out_vec3, out_vec4, out_vec;
+    out_vec1 = computeOutVec(out_h, out_w, out_c, h, w);
+    out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1);
+    out_vec3 = computeOutVec(out_h, out_w, out_c, h, w + 2);
+    out_vec4 = computeOutVec(out_h, out_w, out_c, h, w + 3);
+
+    HVX_Vector half_vec1 = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1);
+    HVX_Vector half_vec2 = Q6_Vh_vpack_VwVw_sat(out_vec4, out_vec3);
+    out_vec = Q6_Vub_vpack_VhVh_sat(half_vec2, half_vec1);
+    *out_vec_ptr = out_vec;
+  };
+
+  for (int out_c = 0; out_c < o_depth; ++out_c) {
+    for (int out_h = 0; out_h < o_height; ++out_h) {
+      int max_y = std::min(8, out_height - out_h * 8);
+      for (int out_w = 0; out_w < o_width; ++out_w) {
+        int max_x = std::min(8, out_width - out_w * 8);
+        for (int h = 0; h < max_y; ++h) {
+          if (max_x == 8) {
+            for (int w = 0; w < max_x; w += 4) {
+              saturateAndStore(out_h, out_w, out_c, h, w);
+            }
+          } else {
+            int w = 0;
+            if (max_x >= 4) {
+              saturateAndStore(out_h, out_w, out_c, h, w);
+              w = 4;
+            }
+            uint8_t* out_ptr = getElementPtr_int8(out_h, out_w, out_c, h, w, 0, cr_out);
+            HVX_Vector* out_vec_ptr = reinterpret_cast<HVX_Vector*>(out_ptr);
+            HVX_Vector out_vec1, out_vec2, out_vec3, out_vec;
+            if (max_x % 4 == 1) {
+              out_vec1 = computeOutVec(out_h, out_w, out_c, h, w);
+              HVX_Vector half_vec = Q6_Vh_vpack_VwVw_sat(Q6_V_vzero(), out_vec1);
+              out_vec = Q6_Vub_vpack_VhVh_sat(Q6_V_vzero(), half_vec);
+              *out_vec_ptr = out_vec;
+            } else if (max_x % 4 == 2) {
+              out_vec1 = computeOutVec(out_h, out_w, out_c, h, w);
+              out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1);
+              HVX_Vector half_vec = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1);
+              out_vec = Q6_Vub_vpack_VhVh_sat(Q6_V_vzero(), half_vec);
+              *out_vec_ptr = out_vec;
+            } else if (max_x % 4 == 3) {
+              out_vec1 = computeOutVec(out_h, out_w, out_c, h, w);
+              out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1);
+              out_vec3 = computeOutVec(out_h, out_w, out_c, h, w + 2);
+              HVX_Vector half_vec1 = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1);
+              HVX_Vector half_vec2 = Q6_Vh_vpack_VwVw_sat(Q6_V_vzero(), out_vec3);
+              out_vec = Q6_Vub_vpack_VhVh_sat(half_vec2, half_vec1);
+              *out_vec_ptr = out_vec;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace hexagon
+}  // namespace runtime
+}  // namespace tvm
+
+int conv2d_packed_quant(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val,
+                        int out_code, void* res_handle) {
+  namespace conv_utils = tvm::runtime::hexagon::conv_utils;
+  ICHECK_EQ(num_args, 13) << "Unexpected number of arguments";
+  ICHECK_EQ(type_codes[0], kTVMDLTensorHandle)
+      << "First argument is expected to be the input tensor";  // Input activations
+  ICHECK_EQ(type_codes[1], kTVMDLTensorHandle)
+      << "Second argument is expected to be the weights tensor";  // Weights
+  ICHECK_EQ(type_codes[2], kDLFloat) << "Third argument is expected to be the activation scale";
+  ICHECK_EQ(type_codes[3], kDLInt) << "Fourth argument is expected to be the activation zero point";
+  ICHECK_EQ(type_codes[4], kDLFloat) << "Fifth argument is expected to be the weight scale";
+  ICHECK_EQ(type_codes[5], kDLInt) << "Sixth argument is expected to be the weight zero point";
+  ICHECK_EQ(type_codes[6], kDLFloat) << "Seventh argument is expected to be the output scale";
+  ICHECK_EQ(type_codes[7], kDLInt) << "Eigth argument is expected to be the output zero point";
+  ICHECK_EQ(type_codes[8], kDLInt) << "Nineth argument is expected to be the stride_h";  // stride_h
+  ICHECK_EQ(type_codes[9], kDLInt) << "Tenth argument is expected to be the stride_w";   // stride_w
+  ICHECK_EQ(type_codes[10], kDLInt) << "Eleventh argument is expected to be fixed final scale";
+  ICHECK_EQ(type_codes[11], kDLInt) << "Twelfth argument is expected to be scale factor";
+  ICHECK_EQ(type_codes[12], kTVMDLTensorHandle)
+      << "Thirteenth argument is expected to be the output tensor";  // output
+
+  auto* act_flat = static_cast<DLTensor*>(args[0].v_handle);
+  auto* wgt_flat = static_cast<DLTensor*>(args[1].v_handle);
+  auto* out_flat = static_cast<DLTensor*>(args[12].v_handle);
+
+  // Temporary assertion until multiple batches are supported
+  ICHECK_EQ(act_flat->shape[0], 1) << "Input batch size more than 1 is not supported yet";
+
+  // Temporary assertion until multiple batches are supported
+  ICHECK_EQ(out_flat->shape[0], 1) << "Output batch size more than 1 is not supported yet";
+
+  float act_scale = args[2].v_float64;
+  int act_zp = args[3].v_int64;
+  LOG_INFO << "act_scale: " << act_scale << ", act_zp: " << act_zp;
+
+  float wgt_scale = args[4].v_float64;
+  int wgt_zp = args[5].v_int64;
+  LOG_INFO << "wgt_scale: " << wgt_scale << ", wgt_zp: " << wgt_zp;
+
+  float out_scale = args[6].v_float64;
+  int out_zp = args[7].v_int64;
+  LOG_INFO << "out_scale: " << out_scale << ", out_zp: " << out_zp;
+
+  int stride_h = args[8].v_int64;
+  int stride_w = args[9].v_int64;
+  LOG_INFO << "stride_h: " << stride_h << ", stride_w: " << stride_w;
+
+  int fixed_final_scale = args[10].v_int64;
+  int scale_factor = args[11].v_int64;
+  LOG_INFO << "fixed_final_scale: " << fixed_final_scale << ", scale_factor: " << scale_factor;
+
+  auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false);
+  ICHECK(device_api != nullptr);
+  tvm::runtime::String vtcm_scope = "global.vtcm";
+
+  auto act_vtcm =
+      conv_utils::prepare_nhwc<uint8_t, 8, 8, 32>(device_api, act_flat, /*copy_data=*/true);
+
+  int num_wgt_chunks = conv_utils::calculate_num_weight_chunks(
+      wgt_flat->shape, /* chunk_height */ wgt_flat->shape[0],
+      /* chunk_width */ wgt_flat->shape[1], /* chunk_in_channel */ 32, /* chunk_out_channel */ 32);
+  auto wgt_ptr_table =
+      reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
+
+  auto wgt_vtcm =
+      conv_utils::prepare_hwio_8b(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table, wgt_zp);
+
+  auto out_vtcm =
+      conv_utils::prepare_nhwc<uint8_t, 8, 8, 32>(device_api, out_flat, /*copy_data=*/false);
+
+  auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape);
+  auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape);
+  auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape);
+
+  tvm::runtime::hexagon::conv_layer_int8_hvx_whole(
+      out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, filt_shape, act_scale, act_zp, wgt_scale,
+      wgt_zp, out_scale, out_zp, fixed_final_scale, scale_factor);
+
+  conv_utils::deblockize_hwc<uint8_t, 8, 8, 32>(out_flat->data, out_vtcm.data, out_flat->shape[1],
+                                                out_flat->shape[2], out_flat->shape[3]);
+
+  conv_utils::release(device_api, out_vtcm);
+  conv_utils::release(device_api, wgt_vtcm);
+  conv_utils::release(device_api, act_vtcm);
+
+  return 0;
+}
diff --git a/src/runtime/hexagon/ops/conv_utils.cc b/src/runtime/hexagon/ops/conv_utils.cc
index b10f7cc315b2..a40e23e463d4 100644
--- a/src/runtime/hexagon/ops/conv_utils.cc
+++ b/src/runtime/hexagon/ops/conv_utils.cc
@@ -17,96 +17,69 @@
  * under the License.
  */
 
-#include "tvm/runtime/hexagon/ops/conv2d.h"
+#include <type_traits>
+
+#include "conv2d.h"
 
 namespace tvm {
 namespace runtime {
 namespace hexagon {
+namespace conv_utils {
 
 /**
- * @brief Function to "blockize" the flat input data
- * The term "blockize" is used to mention that the data is stored in non-contiguous blocks
+ * @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below:
  *
- * The input is mapped into the below mentioned layout (notation similar to index map used for
- * transform layout):
+ * Weights are packed into the below mentioned layout (notation similar to index map):
+ * Since weights cannot be exactly represented into a index map notation, the
+ * base split up is mentioned below with a few deviations
  *
- * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2
+ * lambda h, w, i, o: o//32, i//32, h, w, (i%32)//4, o%32, i%4
  *
- * where AXIS_SEPARATOR represents split up in the physical layout
+ * The deviations are:
+ *  - w is actually stored in the right to left order, as in 3,2,1,0 instead of 0,1,2,3
  *
- * @param out Pre-allocated output memory pointer
- * @param inp_flat Flat input data pointer
- * @param height
- * @param width
- * @param depth
- */
-void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth) {
-  auto inp_data = static_cast<uint16_t*>(inp_flat);
-  auto out_data = static_cast<uintptr_t*>(out);
-  const int stride_x = depth;
-  const int stride_y = stride_x * width;
-
-  for (int cy = 0; cy < height; cy += 8) {
-    for (int cx = 0; cx < width; cx += 4) {
-      for (int cc = 0; cc < depth; cc += 32) {
-        auto block = reinterpret_cast<uint16_t*>(*out_data++);
-        int max_y = std::min(8, height - cy);
-        int max_x = std::min(4, width - cx);
-        int max_c = std::min(32, depth - cc);
-        for (int y = 0; y < max_y; ++y) {
-          for (int x = 0; x < max_x; ++x) {
-            for (int c = 0; c < max_c; ++c) {
-              block[xyc_to_sm_16b(y, x, c)] =
-                  inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)];
-            }
-            for (int c = max_c; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0;
-          }
-          for (int x = max_x; x < 4; ++x) {
-            for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0;
-          }
-        }
-
-        for (int y = max_y; y < 8; ++y)
-          for (int x = 0; x < 4; ++x)
-            for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0;
-      }  // cc
-    }    // cx
-  }      // cy
-}
-
-/**
- * @brief Convert back from non-contguous layout to a flat layout
- *
- * @param out_flat Pre-allocated output memory pointer
- * @param inp Blockized input data pointer
+ * @param out_ptr Base pointer table to be filled with the list of pointers to the first addresses
+ * of the "chunked" weights
+ * @param out_ptr_size The number of chunks
+ * @param out Pointer to pre-allocated output memory
+ * @param inp Pointer to flat input data
  * @param height
  * @param width
- * @param depth
+ * @param idepth
+ * @param odepth
  */
-void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth) {
-  uintptr_t* inp_data = static_cast<uintptr_t*>(inp);
-  uint16_t* out_data = static_cast<uint16_t*>(out_flat);
-  const int stride_x = depth;
+void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width,
+                      int idepth, int odepth, int wgt_zp) {
+  auto inp_data = static_cast<int8_t*>(inp);
+  auto out_data = static_cast<int8_t*>(out);
+  const int stride_i = odepth;
+  const int stride_x = stride_i * idepth;
   const int stride_y = stride_x * width;
 
-  for (int cy = 0; cy < height; cy += 8) {
-    for (int cx = 0; cx < width; cx += 4) {
-      for (int cc = 0; cc < depth; cc += 32) {
-        auto block = reinterpret_cast<uint16_t*>(*inp_data);
-        int max_y = std::min(8, height - cy);
-        int max_x = std::min(4, width - cx);
-        int max_c = std::min(32, depth - cc);
-        for (int y = 0; y < max_y; ++y) {
-          for (int x = 0; x < max_x; ++x) {
-            for (int c = 0; c < max_c; ++c) {
-              out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] =
-                  block[xyc_to_sm_16b(y, x, c)];
+  for (int ci = 0; ci < idepth; ci += 32) {
+    for (int co = 0; co < odepth; co += 32) {
+      int max_i = std::min(32, idepth - ci);
+      int max_o = std::min(32, odepth - co);
+
+      auto chunk = out_data;
+      for (int y = 0; y < height; ++y) {
+        for (int x = width - 1; x >= 0; --x) {
+          for (int i = 0; i < max_i; ++i) {
+            for (int o = 0; o < max_o; ++o) {
+              chunk[hwio_to_sm_8b(width, y, x, i, o)] =
+                  inp_data[y * stride_y + x * stride_x + (ci + i) * stride_i + (co + o)];
             }
+            for (int o = max_o; o < 32; ++o) chunk[hwio_to_sm_8b(width, y, x, i, o)] = wgt_zp;
           }
+          for (int i = max_i; i < 32; ++i)
+            for (int o = 0; o < 32; ++o) chunk[hwio_to_sm_8b(width, y, x, i, o)] = wgt_zp;
         }
-
-        inp_data++;
       }
+
+      *out_ptr++ = chunk;
+      out_data += height * width * 32 * 32;
+      out_ptr_size--;
+      assert(out_ptr_size >= 0);
     }
   }
 }
@@ -184,25 +157,27 @@ void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, i
   }
 }
 
-SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat,
-                          bool copy_data) {
+std::tuple<int, int, int, int> getHWIO(const DLTensor* hwio_flat) {
+  int h = hwio_flat->shape[0];
+  int w = hwio_flat->shape[1];
+  int i = round_up(hwio_flat->shape[2], 32);
+  int o = round_up(hwio_flat->shape[3], 32);
+  return std::make_tuple(h, w, i, o);
+}
+
+SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
+                             int num_chunks, void** ptr_table, int wgt_zp) {
   tvm::runtime::String vtcm_scope = "global.vtcm";
 
-  // Allocate blocks for activations. We will use the block pointers
-  // directly from the allocated area.
-  int n = nhwc_flat->shape[0];
-  int h = round_up(nhwc_flat->shape[1], 8);
-  int w = round_up(nhwc_flat->shape[2], 4);
-  int c = round_up(nhwc_flat->shape[3], 32);
-  int64_t shape_2d[2] = {(n * h * w * c) / (8 * 4 * 32), 8 * 4 * 32};
-  void* nhwc_vtcm =
-      device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope);
-  if (copy_data) {
-    blockize_hwc_16b(nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2],
-                     nhwc_flat->shape[3]);
-  }
+  auto [h, w, i, o] = getHWIO(hwio_flat);
+  int64_t shape_1d[] = {h * w * i * o};
+  void* hwio_vtcm =
+      device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope);
 
-  return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm, {n, h / 8, w / 4, c / 32});
+  chunkify_hwio_8b(ptr_table, num_chunks, hwio_vtcm, hwio_flat->data, hwio_flat->shape[0],
+                   hwio_flat->shape[1], hwio_flat->shape[2], hwio_flat->shape[3], wgt_zp);
+
+  return SDLTensor<4>(ptr_table, hwio_flat->dtype, hwio_vtcm, {1, 1, i / 32, o / 32});
 }
 
 SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
@@ -214,10 +189,7 @@ SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* h
   // height- or width-wise, so filter chunks may have different sizes.
   // A filter chunk is a block of size HxWx32x32, where H, W are at most
   // height and width of a block respectively.
-  int h = hwio_flat->shape[0];
-  int w = hwio_flat->shape[1];
-  int i = round_up(hwio_flat->shape[2], 32);
-  int o = round_up(hwio_flat->shape[3], 32);
+  auto [h, w, i, o] = getHWIO(hwio_flat);
   int64_t shape_1d[] = {h * w * i * o};
   void* hwio_vtcm =
       device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope);
@@ -229,15 +201,19 @@ SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* h
                       {round_up(h, 8) / 8, round_up(w, 4) / 4, i / 32, o / 32});
 }
 
-int calculate_num_weight_chunks(int64_t* shape_hwio) {
-  int h = round_up(shape_hwio[0], 8);
-  int w = round_up(shape_hwio[1], 4);
-  int i = round_up(shape_hwio[2], 32);
-  int o = round_up(shape_hwio[3], 32);
+int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width,
+                                int chunk_in_channel, int chunk_out_channel) {
+  // Define slower roundup that doesn't assume multiplier 'p' to be power of 2
+  auto roundup = [](int v, int p) { return (v + p - 1) - ((v + p - 1) % p); };
+  int h = roundup(shape_hwio[0], chunk_height);
+  int w = roundup(shape_hwio[1], chunk_width);
+  int i = roundup(shape_hwio[2], chunk_in_channel);
+  int o = roundup(shape_hwio[3], chunk_out_channel);
 
-  return (h * w * i * o) / (8 * 4 * 32 * 32);
+  return (h * w * i * o) / (chunk_height * chunk_width * chunk_in_channel * chunk_out_channel);
 }
 
+}  // namespace conv_utils
 }  // namespace hexagon
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h
new file mode 100644
index 000000000000..07e15966863e
--- /dev/null
+++ b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_TESTS_CPPRUNTIME_HEXAGON_HEXAGON_CONV_UTILS_H
+#define TVM_TESTS_CPPRUNTIME_HEXAGON_HEXAGON_CONV_UTILS_H
+
+#include <dlpack/dlpack.h>
+#include <gtest/gtest.h>
+
+#include <limits>
+
+#include "conv2d.h"
+
+using namespace tvm::runtime::hexagon::conv_utils;
+
+template <typename T>
+class HexagonUtilsTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    vtcm_scope = "global.vtcm";
+    device_api = tvm::runtime::DeviceAPI::Get(hexagon_device, false);
+    float16.code = kDLFloat;
+    float16.bits = 16;
+    float16.lanes = 1;
+
+    uint8.code = kDLUInt;
+    uint8.bits = 8;
+    uint8.lanes = 1;
+
+    int8.code = kDLInt;
+    int8.bits = 8;
+    int8.lanes = 1;
+  }
+
+  void setupTensor(std::tuple<int64_t, int64_t, int64_t, int64_t> shape, DLDataType dtype) {
+    auto [s1, s2, s3, s4] = shape;
+    tensor_shape[0] = s1;
+    tensor_shape[1] = s2;
+    tensor_shape[2] = s3;
+    tensor_shape[3] = s4;
+    int64_t shape_1d[1] = {s1 * s2 * s3 * s4};
+
+    flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, dtype, vtcm_scope);
+    flat_mem_data = static_cast<T*>(flat_mem);
+    fill_vals(flat_mem_data, shape_1d[0]);
+
+    flat_tensor.data = flat_mem;
+    flat_tensor.device = hexagon_device;
+    flat_tensor.ndim = 4;
+    flat_tensor.dtype = dtype;
+    flat_tensor.shape = tensor_shape;
+    flat_tensor.strides = nullptr;
+    flat_tensor.byte_offset = 0;
+  }
+
+  void TearDownTensor() {
+    if (flat_tensor.data) device_api->FreeDataSpace(hexagon_device, flat_mem);
+  }
+
+  static void fill_vals(T* arr, int size) {
+    // Testing with uint16 instead of float16 as generating random float16 is not easy within c++
+    auto max = std::numeric_limits<T>::max();
+    srand(std::time(0));
+    for (int i = 0; i < size; ++i) {
+      arr[i] = static_cast<T>(std::rand() % max);
+    }
+  }
+
+  static int flattened_idx(int nn, int hh, int ww, int cc, int64_t* shape) {
+    int h = shape[1];
+    int w = shape[2];
+    int c = shape[3];
+    return cc + c * (ww + w * (hh + h * (nn)));
+  }
+
+  DLTensor flat_tensor;
+  void* flat_mem;
+  T* flat_mem_data;
+  tvm::runtime::DeviceAPI* device_api;
+  tvm::runtime::String vtcm_scope;
+  DLDataType float16;
+  DLDataType int8, uint8;
+  int64_t tensor_shape[4];
+};
+
+#endif
diff --git a/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc
index 3b922fa6c2a8..5f2ef490d020 100644
--- a/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc
+++ b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc
@@ -26,73 +26,14 @@
 #include <string>
 #include <tuple>
 
-#include "tvm/runtime/hexagon/ops/conv2d.h"
+#include "conv2d.h"
+#include "hexagon_conv_utils_test.h"
 
 using namespace tvm::runtime::hexagon;
 
-class HexagonUtilsTest : public ::testing::Test {
- public:
-  void SetUp() override {
-    vtcm_scope = "global.vtcm";
-    device_api = tvm::runtime::DeviceAPI::Get(hexagon_device, false);
-    float16.code = kDLFloat;
-    float16.bits = 16;
-    float16.lanes = 1;
-  }
-
-  void setupTensor(std::tuple<int64_t, int64_t, int64_t, int64_t> shape) {
-    auto [s1, s2, s3, s4] = shape;
-    tensor_shape[0] = s1;
-    tensor_shape[1] = s2;
-    tensor_shape[2] = s3;
-    tensor_shape[3] = s4;
-    int64_t shape_1d[1] = {s1 * s2 * s3 * s4};
-
-    flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope);
-    flat_mem_data = static_cast<uint16_t*>(flat_mem);
-    fill_vals(flat_mem_data, shape_1d[0]);
-
-    flat_tensor.data = flat_mem;
-    flat_tensor.device = hexagon_device;
-    flat_tensor.ndim = 4;
-    flat_tensor.dtype = float16;
-    flat_tensor.shape = tensor_shape;
-    flat_tensor.strides = nullptr;
-    flat_tensor.byte_offset = 0;
-  }
-
-  void TearDownTensor() {
-    if (flat_tensor.data) device_api->FreeDataSpace(hexagon_device, flat_mem);
-  }
-
-  static void fill_vals(uint16_t* arr, int size) {
-    // Testing with uint16 instead of float16 as generating random float16 is not easy within c++
-    uint16_t max = UINT16_MAX;
-    srand(std::time(0));
-    for (int i = 0; i < size; ++i) {
-      arr[i] = static_cast<uint16_t>(std::rand() % max);
-    }
-  }
-
-  static int flattened_idx(int nn, int hh, int ww, int cc, int64_t* shape) {
-    int h = shape[1];
-    int w = shape[2];
-    int c = shape[3];
-    return cc + c * (ww + w * (hh + h * (nn)));
-  }
-
-  DLTensor flat_tensor;
-  void* flat_mem;
-  uint16_t* flat_mem_data;
-  tvm::runtime::DeviceAPI* device_api;
-  tvm::runtime::String vtcm_scope;
-  DLDataType float16;
-  int64_t tensor_shape[4];
-};
-
 // Parameterized test fixture with 4 params representing n, h, w, c
 class HexagonUtilsActivationsBlockizeTest
-    : public HexagonUtilsTest,
+    : public HexagonUtilsTest<uint16_t>,
       public ::testing::WithParamInterface<std::tuple<
           std::tuple<int64_t, int64_t, int64_t, int64_t>, std::tuple<int, int, int, int>>> {};
 
@@ -122,11 +63,12 @@ INSTANTIATE_TEST_SUITE_P(
 TEST_F(HexagonUtilsActivationsBlockizeTest, prepare_nhwc) {
   auto shape = std::make_tuple(1, 14, 7, 60);
   auto [n, h, w, c] = shape;
-  setupTensor(shape);
+  setupTensor(shape, float16);
 
   // // copy_data is set to false here as there's a separate test for blockize when copy_data
   // becomes true
-  auto blocked_tensor = prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false);
+  auto blocked_tensor =
+      prepare_nhwc<uint16_t, 8, 4, 32>(device_api, &flat_tensor, /*copy_data=*/false);
 
   EXPECT_EQ(blocked_tensor.shape[0], n);
   EXPECT_EQ(blocked_tensor.shape[1], round_up(h, 8) / 8);
@@ -139,7 +81,7 @@ TEST_F(HexagonUtilsActivationsBlockizeTest, prepare_nhwc) {
 
 TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) {
   auto shape_tuple = std::get<0>(GetParam());
-  setupTensor(shape_tuple);
+  setupTensor(shape_tuple, float16);
   auto [n, h, w, c] = shape_tuple;
   int64_t shape[] = {n, h, w, c};
 
@@ -150,7 +92,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) {
 
   void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope);
   int64_t blocked_shape[] = {n, h_rounded / 8, w_rounded / 4, c_rounded / 32};
-  blockize_hwc_16b(blocked_mem, flat_mem, h, w, c);
+  blockize_hwc<uint16_t, 8, 4, 32>(blocked_mem, flat_mem, h, w, c);
 
   std::function<int(int, int, int, int, int64_t*)> flatten =
       HexagonUtilsActivationsBlockizeTest::flattened_idx;
@@ -159,7 +101,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) {
     auto* blocks = static_cast<uintptr_t*>(blocked_mem);
     int blockIdx = flatten(nn, hh / 8, ww / 4, cc / 32, blocked_shape);
     uint16_t* block = reinterpret_cast<uint16_t*>(blocks[blockIdx]);
-    return block[xyc_to_sm_16b(hh % 8, ww % 4, cc % 32)];
+    return block[yxc_to_sm_16b(hh % 8, ww % 4, cc % 32)];
   };
 
   auto [nn, hh, ww, cc] = std::get<1>(GetParam());
@@ -172,7 +114,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) {
 
 TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) {
   auto shape_tuple = std::get<0>(GetParam());
-  setupTensor(shape_tuple);
+  setupTensor(shape_tuple, float16);
   auto [n, h, w, c] = shape_tuple;
   int64_t shape[] = {n, h, w, c};
   int64_t shape_1d[1] = {n * h * w * c};
@@ -183,11 +125,11 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) {
   int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 4 * 32), 8 * 4 * 32};
 
   void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope);
-  blockize_hwc_16b(blocked_mem, flat_mem, h, w, c);
+  blockize_hwc<uint16_t, 8, 4, 32>(blocked_mem, flat_mem, h, w, c);
 
   void* deblocked_flat_mem =
       device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope);
-  deblockize_hwc_16b(deblocked_flat_mem, blocked_mem, h, w, c);
+  deblockize_hwc<uint16_t, 8, 4, 32>(deblocked_flat_mem, blocked_mem, h, w, c);
   auto* deblocked_flat_mem_data = static_cast<uint16_t*>(deblocked_flat_mem);
 
   auto [nn, hh, ww, cc] = std::get<1>(GetParam());
@@ -201,7 +143,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) {
 }
 
 class HexagonUtilsWeightsChunkifyTest
-    : public HexagonUtilsTest,
+    : public HexagonUtilsTest<uint16_t>,
       public ::testing::WithParamInterface<std::tuple<
           std::tuple<int64_t, int64_t, int64_t, int64_t>, std::tuple<int, int, int, int>>> {};
 
@@ -231,7 +173,9 @@ INSTANTIATE_TEST_SUITE_P(
 
 TEST_F(HexagonUtilsWeightsChunkifyTest, calculate_num_weight_chunks) {
   int64_t shape[] = {3, 3, 40, 40};
-  int num_wgt_chunks = calculate_num_weight_chunks(shape);
+  int num_wgt_chunks =
+      calculate_num_weight_chunks(shape, /* chunk_height */ 8, /* chunk_width */ 4,
+                                  /* chunk_in_channel */ 32, /* chunk_out_channel */ 32);
   EXPECT_EQ(num_wgt_chunks, 4);
 }
 
@@ -239,11 +183,11 @@ TEST_F(HexagonUtilsWeightsChunkifyTest, prepare_hwio) {
   int64_t shape[] = {3, 3, 40, 40};
   auto [h, w, i, o] = shape;
   auto shape_tuple = std::make_tuple(h, w, i, o);
-  setupTensor(shape_tuple);
+  setupTensor(shape_tuple, float16);
 
   // copy_data is set to false here as there's a separate test for blockize when copy_data becomes
   // true
-  auto num_wgt_chunks = calculate_num_weight_chunks(shape);
+  auto num_wgt_chunks = calculate_num_weight_chunks(shape, 8, 4, 32, 32);
   auto wgt_ptr_table =
       reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
   auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table);
@@ -260,10 +204,10 @@ TEST_F(HexagonUtilsWeightsChunkifyTest, prepare_hwio) {
 TEST_P(HexagonUtilsWeightsChunkifyTest, chunkify_hwio_16b) {
   auto [shape_tuple, indices] = GetParam();
   auto [h, w, i, o] = shape_tuple;
-  setupTensor(shape_tuple);
+  setupTensor(shape_tuple, float16);
   int64_t shape[] = {h, w, i, o};
 
-  auto num_wgt_chunks = calculate_num_weight_chunks(shape);
+  auto num_wgt_chunks = calculate_num_weight_chunks(shape, 8, 4, 32, 32);
   auto wgt_ptr_table =
       reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
   auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table);
diff --git a/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc b/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc
new file mode 100644
index 000000000000..449c69736050
--- /dev/null
+++ b/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <dlpack/dlpack.h>
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <ctime>
+#include <functional>
+#include <string>
+#include <tuple>
+
+#include "conv2d.h"
+#include "hexagon_conv_utils_test.h"
+
+using namespace tvm::runtime::hexagon::conv_utils;
+
+// Parameterized test fixture with 4 params representing n, h, w, c
+class HexagonUtilsQuantActivationsBlockizeTest
+    : public HexagonUtilsTest<uint8_t>,
+      public ::testing::WithParamInterface<std::tuple<
+          std::tuple<int64_t, int64_t, int64_t, int64_t>, std::tuple<int, int, int, int>>> {};
+
+// TODO (quic-sanirudh): See if we can test with random generated indices
+INSTANTIATE_TEST_SUITE_P(
+    BlockizeDeblockizeTestFixtures, HexagonUtilsQuantActivationsBlockizeTest,
+    ::testing::Combine(::testing::Values(std::make_tuple(1, 14, 7, 60)),
+                       ::testing::Values(std::make_tuple(0, 0, 0, 0),   // first element
+                                         std::make_tuple(0, 7, 3, 31),  // last element
+                                         // Remaining are random element tests
+                                         std::make_tuple(0, 13, 6, 59),
+                                         std::make_tuple(0, 0, 0, 32), std::make_tuple(0, 0, 4, 32),
+                                         std::make_tuple(0, 2, 3, 4), std::make_tuple(0, 5, 6, 7),
+                                         std::make_tuple(0, 10, 4, 12))),
+    [](const ::testing::TestParamInfo<HexagonUtilsQuantActivationsBlockizeTest::ParamType>& info) {
+      // Can use info.param here to generate the test suffix
+      auto indices = std::get<1>(info.param);
+      int h = std::get<1>(indices);
+      int w = std::get<2>(indices);
+      int c = std::get<3>(indices);
+      // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0
+      std::string name =
+          "hwc" + std::to_string(h) + "x" + std::to_string(w) + "x" + std::to_string(c);
+      return name;
+    });
+
+TEST_F(HexagonUtilsQuantActivationsBlockizeTest, prepare_nhwc) {
+  auto shape = std::make_tuple(1, 14, 7, 60);
+  auto [n, h, w, c] = shape;
+  setupTensor(shape, uint8);
+
+  // // copy_data is set to false here as there's a separate test for blockize when copy_data
+  // becomes true
+  auto blocked_tensor =
+      prepare_nhwc<uint8_t, 8, 8, 32>(device_api, &flat_tensor, /*copy_data=*/false);
+
+  EXPECT_EQ(blocked_tensor.shape[0], n);
+  EXPECT_EQ(blocked_tensor.shape[1], round_up(h, 8) / 8);
+  EXPECT_EQ(blocked_tensor.shape[2], round_up(w, 8) / 8);
+  EXPECT_EQ(blocked_tensor.shape[3], round_up(c, 32) / 32);
+
+  TearDownTensor();
+  release(device_api, blocked_tensor);
+}
+
+TEST_P(HexagonUtilsQuantActivationsBlockizeTest, blockize_hwc_8b) {
+  auto shape_tuple = std::get<0>(GetParam());
+  setupTensor(shape_tuple, uint8);
+  auto [n, h, w, c] = shape_tuple;
+  int64_t shape[] = {n, h, w, c};
+
+  int h_rounded = round_up(h, 8);
+  int w_rounded = round_up(w, 8);
+  int c_rounded = round_up(c, 32);
+  int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 8 * 32), 8 * 8 * 32};
+
+  void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, uint8, vtcm_scope);
+  int64_t blocked_shape[] = {n, h_rounded / 8, w_rounded / 8, c_rounded / 32};
+  blockize_hwc<uint8_t, 8, 8, 32>(blocked_mem, flat_mem, h, w, c);
+
+  std::function<int(int, int, int, int, int64_t*)> flatten =
+      HexagonUtilsQuantActivationsBlockizeTest::flattened_idx;
+
+  auto getBlockedElem = [&blocked_shape, blocked_mem, flatten](int nn, int hh, int ww, int cc) {
+    auto* blocks = static_cast<uintptr_t*>(blocked_mem);
+    int blockIdx = flatten(nn, hh / 8, ww / 8, cc / 32, blocked_shape);
+    uint8_t* block = reinterpret_cast<uint8_t*>(blocks[blockIdx]);
+    return block[yxc_to_sm_8b(hh % 8, ww % 8, cc % 32)];
+  };
+
+  auto [nn, hh, ww, cc] = std::get<1>(GetParam());
+
+  EXPECT_EQ(flat_mem_data[flattened_idx(nn, hh, ww, cc, shape)], getBlockedElem(nn, hh, ww, cc));
+
+  TearDownTensor();
+  device_api->FreeDataSpace(hexagon_device, blocked_mem);
+}
+
+TEST_P(HexagonUtilsQuantActivationsBlockizeTest, deblockize_hwc_8b) {
+  auto shape_tuple = std::get<0>(GetParam());
+  setupTensor(shape_tuple, uint8);
+  auto [n, h, w, c] = shape_tuple;
+  int64_t shape[] = {n, h, w, c};
+  int64_t shape_1d[1] = {n * h * w * c};
+
+  int h_rounded = round_up(h, 8);
+  int w_rounded = round_up(w, 8);
+  int c_rounded = round_up(c, 32);
+  int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 8 * 32), 8 * 8 * 32};
+
+  void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, uint8, vtcm_scope);
+  blockize_hwc<uint8_t, 8, 8, 32>(blocked_mem, flat_mem, h, w, c);
+
+  void* deblocked_flat_mem =
+      device_api->AllocDataSpace(hexagon_device, 1, shape_1d, uint8, vtcm_scope);
+  deblockize_hwc<uint8_t, 8, 8, 32>(deblocked_flat_mem, blocked_mem, h, w, c);
+  auto* deblocked_flat_mem_data = static_cast<uint8_t*>(deblocked_flat_mem);
+
+  auto [nn, hh, ww, cc] = std::get<1>(GetParam());
+
+  auto idx = flattened_idx(nn, hh, ww, cc, shape);
+  EXPECT_EQ(flat_mem_data[idx], deblocked_flat_mem_data[idx]);
+
+  TearDownTensor();
+  device_api->FreeDataSpace(hexagon_device, blocked_mem);
+  device_api->FreeDataSpace(hexagon_device, deblocked_flat_mem);
+}
+
+class HexagonUtilsQuantWeightsChunkifyTest
+    : public HexagonUtilsTest<int8_t>,
+      public ::testing::WithParamInterface<std::tuple<
+          std::tuple<int64_t, int64_t, int64_t, int64_t>, std::tuple<int, int, int, int>>> {};
+
+INSTANTIATE_TEST_SUITE_P(
+    ChunkifyDechunkifyTests, HexagonUtilsQuantWeightsChunkifyTest,
+    ::testing::Combine(::testing::Values(std::make_tuple(3, 3, 40, 40)),
+                       ::testing::Values(std::make_tuple(0, 0, 0, 0),    // first element
+                                         std::make_tuple(2, 2, 39, 39),  // Last element
+                                         // Remaining are random element tests
+                                         std::make_tuple(1, 1, 28, 33),
+                                         std::make_tuple(1, 2, 8, 38),
+                                         std::make_tuple(1, 0, 12, 15),
+                                         std::make_tuple(2, 1, 9, 22), std::make_tuple(0, 2, 6, 7),
+                                         std::make_tuple(1, 2, 3, 4))),
+    [](const ::testing::TestParamInfo<HexagonUtilsQuantWeightsChunkifyTest::ParamType>& info) {
+      // Can use info.param here to generate the test suffix
+      auto indices = std::get<1>(info.param);
+      int h = std::get<0>(indices);
+      int w = std::get<1>(indices);
+      int i = std::get<2>(indices);
+      int o = std::get<3>(indices);
+      // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0
+      std::string name = "hwio" + std::to_string(h) + std::to_string(w) + "x" + std::to_string(i) +
+                         "x" + std::to_string(o);
+      return name;
+    });
+
+TEST_F(HexagonUtilsQuantWeightsChunkifyTest, calculate_num_weight_chunks) {
+  int64_t shape[] = {3, 3, 40, 40};
+  int num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32);
+  EXPECT_EQ(num_wgt_chunks, 4);
+}
+
+TEST_F(HexagonUtilsQuantWeightsChunkifyTest, prepare_hwio) {
+  int64_t shape[] = {3, 3, 40, 40};
+  auto [h, w, i, o] = shape;
+  auto shape_tuple = std::make_tuple(h, w, i, o);
+  setupTensor(shape_tuple, int8);
+
+  // copy_data is set to false here as there's a separate test for blockize when copy_data becomes
+  // true
+  auto num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32);
+  auto wgt_ptr_table =
+      reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
+  auto chunked_tensor = prepare_hwio_8b(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table);
+
+  EXPECT_EQ(chunked_tensor.shape[0], 1);
+  EXPECT_EQ(chunked_tensor.shape[1], 1);
+  EXPECT_EQ(chunked_tensor.shape[2], round_up(i, 32) / 32);
+  EXPECT_EQ(chunked_tensor.shape[3], round_up(o, 32) / 32);
+
+  release(device_api, chunked_tensor);
+  TearDownTensor();
+}
+
+TEST_P(HexagonUtilsQuantWeightsChunkifyTest, chunkify_hwio_8b) {
+  auto [shape_tuple, indices] = GetParam();
+  auto [h, w, i, o] = shape_tuple;
+  setupTensor(shape_tuple, int8);
+  int64_t shape[] = {h, w, i, o};
+
+  auto num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32);
+  auto wgt_ptr_table =
+      reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
+  auto chunked_tensor = prepare_hwio_8b(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table);
+
+  auto getChunkedElem = [width = w, chunked_tensor](int hh, int ww, int ii, int oo) {
+    auto data = static_cast<intptr_t*>(chunked_tensor.data);
+    auto chunk = data[ii / 32 * chunked_tensor.shape[3] + oo / 32];
+    auto chunk_int8 = reinterpret_cast<int8_t*>(chunk);
+    return chunk_int8[hwio_to_sm_8b(width, hh, ww, ii % 32, oo % 32)];
+  };
+
+  auto [hh, ww, ii, oo] = indices;
+
+  EXPECT_EQ(flat_mem_data[flattened_idx(hh, ww, ii, oo, shape)], getChunkedElem(hh, ww, ii, oo));
+  release(device_api, chunked_tensor);
+}
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
index c03701f83ccc..5b13513c0fb3 100644
--- a/tests/python/contrib/test_hexagon/infrastructure.py
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -320,7 +320,7 @@ def quantize_np(arr_np: numpy.ndarray, dtype: str):
 
     scale = (fmax - fmin) / (qmax - qmin)
     zero_point = numpy.rint((fmax * qmin - fmin * qmax) / (fmax - fmin)).astype("int32")
-    quant_np = (arr_np / scale + zero_point).astype(dtype)
+    quant_np = numpy.clip(((arr_np / scale).round() + zero_point), qmin, qmax).astype(dtype)
     return quant_np, scale, zero_point
 
 
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py
new file mode 100644
index 000000000000..c26e6142ba5c
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py
@@ -0,0 +1,261 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test quantized conv2d HVX intrinsic implementation"""
+
+import numpy as np
+
+import tvm
+import tvm.contrib.hexagon
+from tvm.topi.hexagon.utils import get_fixed_point_value
+from tvm.topi.testing import conv2d_nhwc_python
+
+from ..infrastructure import get_hexagon_target, quantize_np
+
+
+def build_conv2d(target):
+    """Build and return the conv2d IRModule that calls the intrinsic implementation"""
+    act_n, act_h, act_w, act_c = (
+        tvm.te.var("an"),
+        tvm.te.var("ah"),
+        tvm.te.var("aw"),
+        tvm.te.var("ac"),
+    )
+    filt_h, filt_w, filt_o = tvm.te.var("filt_h"), tvm.te.var("filt_w"), tvm.te.var("filt_o")
+    act_scale, act_zp = tvm.te.var("act_scale", dtype="float32"), tvm.te.var("act_zp")
+    wgt_scale, wgt_zp = tvm.te.var("wgt_scale", dtype="float32"), tvm.te.var("wgt_zp")
+    out_scale, out_zp = tvm.te.var("out_scale", dtype="float32"), tvm.te.var("out_zp")
+    fixed_final_scale, scale_factor = tvm.te.var("fixed_final_scale", dtype="int32"), tvm.te.var(
+        "scale_factor"
+    )
+    stride_h, stride_w = tvm.te.var("stride_h"), tvm.te.var("stride_w")
+
+    act_flat = tvm.te.placeholder(
+        shape=(act_n, act_h, act_w, act_c), dtype="uint8", name="act_flat"
+    )
+    wgt_flat = tvm.te.placeholder(
+        shape=(filt_h, filt_w, act_c, filt_o), dtype="int8", name="wgt_flat"
+    )
+
+    out_flat = tvm.te.extern(
+        shape=(act_n, (act_h - filt_h) // stride_h + 1, (act_w - filt_w) // stride_w + 1, filt_o),
+        inputs=[act_flat, wgt_flat],
+        fcompute=lambda ins, outs: tvm.tir.call_cpacked(
+            "conv2d_packed_quant",  # Function from TVM runtime
+            ins[0],
+            ins[1],
+            act_scale,
+            act_zp,
+            wgt_scale,
+            wgt_zp,
+            out_scale,
+            out_zp,
+            stride_h,
+            stride_w,
+            fixed_final_scale,
+            scale_factor,
+            outs[0],
+            tvm.runtime.const(0),  # resource_handle (unused)
+        ),
+        dtype="uint8",
+    )
+
+    s = tvm.te.create_schedule(out_flat.op)
+
+    func_name = "conv2d_quant_hvx"
+    module = tvm.build(
+        s,
+        [
+            act_flat,
+            wgt_flat,
+            act_scale,
+            act_zp,
+            wgt_scale,
+            wgt_zp,
+            out_scale,
+            out_zp,
+            stride_h,
+            stride_w,
+            fixed_final_scale,
+            scale_factor,
+            out_flat,
+        ],
+        target=target,
+        name=func_name,
+    )
+
+    return module
+
+
+def gen_config(params):
+    """Utility function to generate useful ids for shape_parameters"""
+
+    dims = lambda vals: "x".join(map(str, vals))
+
+    config = {}
+    for param in params:
+        act_shape, wgt_shape, inp_stride = param
+        name = f"nhwc{dims(act_shape)}-hwio{dims(wgt_shape)}-stride{dims(inp_stride)}"
+        config[name] = param
+
+    return config
+
+
+class TestQuantConv2dIntrin:
+    """Test Quantized Conv2d Intrin class"""
+
+    shape_parameters = [
+        [
+            (1, 5, 5, 33),
+            (3, 3, 33, 33),
+            (1, 1),
+        ],
+        [
+            (1, 9, 8, 64),
+            (3, 3, 64, 64),
+            (1, 1),
+        ],
+        [
+            (1, 11, 16, 64),
+            (3, 3, 64, 32),
+            (1, 1),
+        ],
+        [
+            (1, 24, 8, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ],
+        [
+            (1, 4, 4, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ],
+        [
+            (1, 4, 5, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ],
+        [
+            (1, 4, 6, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ],
+        [
+            (1, 4, 7, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ],
+        [
+            (1, 4, 8, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ],
+        [
+            (1, 4, 9, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ],
+        [
+            (1, 4, 10, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ],
+        [
+            (1, 4, 11, 3),
+            (3, 3, 3, 3),
+            (1, 1),
+        ],
+        [
+            (1, 4, 4, 5),
+            (3, 3, 5, 3),
+            (1, 1),
+        ],
+    ]
+
+    config = gen_config(shape_parameters)
+    act_shape, wgt_shape, inp_stride = tvm.testing.parameters(*config.values(), ids=config.keys())
+    inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"])
+
+    @tvm.testing.requires_hexagon
+    def test_conv2d_quant(self, act_shape, wgt_shape, inp_stride, hexagon_session):
+        """Test quantized conv2d intrinsic implementation"""
+        assert act_shape[3] == wgt_shape[2]
+
+        # Currently, input offset does not affect the output shape
+        def get_out_shape(ash, wsh, inp_stride):
+            assert ash[3] == wsh[2]
+            osh = (
+                ash[0],
+                (ash[1] - wsh[0]) // inp_stride[0] + 1,
+                (ash[2] - wsh[1]) // inp_stride[1] + 1,
+                wsh[3],
+            )
+            assert tvm.tir.all([x > 0 for x in osh])
+            return osh
+
+        act_f = np.random.uniform(-1.5, 1.0, size=act_shape).astype("float32")
+        wgt_f = np.random.uniform(-1.5, 1.0, size=wgt_shape).astype("float32")
+
+        # Quanize activations using onnxruntime
+        act_q, act_scale, act_zp = quantize_np(act_f, dtype="uint8")
+        act_q = act_q.reshape(act_f.shape)
+
+        # Quanize weights using onnxruntime
+        wgt_q, wgt_scale, wgt_zp = quantize_np(wgt_f, dtype="int8")
+        wgt_q = wgt_q.reshape(wgt_f.shape)
+
+        # Generate reference output
+        ref_out = conv2d_nhwc_python(act_f, wgt_f, stride=inp_stride, padding="VALID")
+
+        ref_out_q, out_scale, out_zp = quantize_np(ref_out, dtype="uint8")
+        ref_out_q = ref_out_q.reshape(ref_out.shape)
+
+        final_scale = act_scale * wgt_scale / out_scale
+        fixed_final_scale, scale_factor = get_fixed_point_value(final_scale)
+
+        module = build_conv2d(get_hexagon_target("v69"))
+        mod = hexagon_session.load_module(module)
+
+        output_shape = get_out_shape(act_shape, wgt_shape, inp_stride)
+
+        output = tvm.nd.array(
+            np.zeros(output_shape, dtype="uint8"),
+            device=hexagon_session.device,
+        )
+        mod(
+            tvm.nd.array(act_q, device=hexagon_session.device),
+            tvm.nd.array(wgt_q, device=hexagon_session.device),
+            act_scale,
+            act_zp,
+            wgt_scale,
+            wgt_zp,
+            out_scale,
+            out_zp,
+            inp_stride[0],  # stride_height
+            inp_stride[1],  # stride_width
+            fixed_final_scale,
+            scale_factor,
+            output,
+        )
+
+        out_q = output.numpy()
+
+        tvm.testing.assert_allclose(out_q, ref_out_q, rtol=0, atol=2)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From afbfb7aa7e43732cb716f8e443df696110be6afc Mon Sep 17 00:00:00 2001
From: Alexey Voronov <avoronov.icemist@gmail.com>
Date: Thu, 1 Dec 2022 21:56:31 +0300
Subject: [PATCH 685/704] [TIR][Analysis][Hexagon] Add vtcm memory capacity
 verification for Hexagon target (#13349)

The main items that have been added are:

* tvm.tir.analysis.calculate_allocated_bytes(), to calculate allocated memory per memory scope

* tir.transform.VerifyVTCMLimit(limit), to verify if the size of the allocated vtcm memory satisfies the limit

* tvm.target.hexagon().vtcm_capacity, attribute to pass the limit

* tir.vtcm_capacity, context configuration attribute to pass the limit alternatively
---
 include/tvm/tir/analysis.h                    |  16 +++
 python/tvm/autotvm/measure/measure_methods.py |  33 ++++-
 python/tvm/target/target.py                   |   8 ++
 python/tvm/tir/analysis/analysis.py           |  16 +++
 python/tvm/tir/transform/transform.py         |  11 ++
 src/auto_scheduler/feature.cc                 |   7 ++
 src/auto_scheduler/search_policy/utils.h      |   5 +
 src/driver/driver_api.cc                      |  17 ++-
 src/target/target_kind.cc                     |   1 +
 .../analysis/calculate_allocated_memory.cc    | 117 ++++++++++++++++++
 .../contrib/test_hexagon/infrastructure.py    |   4 +-
 .../python/contrib/test_hexagon/test_vtcm.py  |  55 +++++---
 ...tir_analysis_calculate_allocated_memory.py | 101 +++++++++++++++
 13 files changed, 366 insertions(+), 25 deletions(-)
 create mode 100644 src/tir/analysis/calculate_allocated_memory.cc
 create mode 100644 tests/python/unittest/test_tir_analysis_calculate_allocated_memory.py

diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h
index e9796eca6505..cb31a7e5ee96 100644
--- a/include/tvm/tir/analysis.h
+++ b/include/tvm/tir/analysis.h
@@ -217,6 +217,12 @@ TVM_DLL size_t CalculateConstantBytes(const PrimFunc& func, const Integer& const
 TVM_DLL size_t CalculateWorkspaceBytes(const PrimFunc& func,
                                        const Integer& workspace_byte_alignment);
 
+/*!
+ * \brief Calculate the allocated memory per scope in bytes needed inside the TIR PrimFunc
+ * \param func The TIR PrimFunc for which the the allocated memory size to be calculated
+ */
+TVM_DLL tvm::Map<String, Integer> CalculateAllocatedBytes(const PrimFunc& func);
+
 /*!
  * \brief Detect the lowest common ancestor(LCA) of buffer access, including both high-level
  *        access(BufferLoad, BufferStore) and low-level access(Load, Store and opaque access).
@@ -294,6 +300,16 @@ TVM_DLL Pass VerifyMemory();
  */
 TVM_DLL Pass VerifyGPUCode(Map<String, PrimExpr> constraints);
 
+/*!
+ * \brief Pass to checks if the size of the allocated vtcm memory satisfies the limit
+ *
+ * \param limit The limit to check.
+ *
+ * \returns The pass.
+ * \sa tvm::tir::CalculateAllocatedBytes
+ */
+TVM_DLL Pass VerifyVTCMLimit(const Integer& limit);
+
 /*!
  * \brief Statically check TIR code for out of bounds array access.
  *
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 8fc0da89c4c6..f1c14c3cd914 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -330,7 +330,7 @@ def set_task(self, task):
             )
 
     def get_build_kwargs(self):
-        kwargs = {}
+        kwargs = {"checks": {}}
         if (
             "cuda" in self.task.target.keys
             or "opencl" in self.task.target.keys
@@ -340,13 +340,15 @@ def get_build_kwargs(self):
             remote = request_remote(self.key, self.host, self.port)
             dev = remote.device(str(self.task.target), 0)
             max_dims = dev.max_thread_dimensions
-            kwargs["check_gpu"] = {
+            kwargs["checks"]["gpu"] = {
                 "max_shared_memory_per_block": dev.max_shared_memory_per_block,
                 "max_threads_per_block": dev.max_threads_per_block,
                 "max_thread_x": max_dims[0],
                 "max_thread_y": max_dims[1],
                 "max_thread_z": max_dims[2],
             }
+        if "hexagon" in self.task.target.keys:
+            kwargs["checks"]["hexagon"] = {"vtcm_capacity": self.task.target.vtcm_capacity}
 
         return kwargs
 
@@ -493,11 +495,11 @@ def set_task(self, task):
         return server, tracker
 
 
-def _build_func_common(measure_input, runtime=None, check_gpu=None, build_option=None):
+def _build_func_common(measure_input, runtime=None, checks=None, build_option=None):
     """Common part for building a configuration"""
     target, task, config = measure_input
     target, task.target_host = Target.canon_target_and_host(target, task.target_host)
-
+    checks = checks or {}
     with target:
         s, args = task.instantiate(config)
 
@@ -526,8 +528,10 @@ def _build_func_common(measure_input, runtime=None, check_gpu=None, build_option
                 current_add_lower_pass = list(current_config["tir.add_lower_pass"])
             else:
                 current_add_lower_pass = []
-            if check_gpu:
-                current_add_lower_pass.append((2, gpu_verify_pass(**check_gpu)))
+            if checks.get("gpu"):
+                current_add_lower_pass.append((2, gpu_verify_pass(**checks.get("gpu"))))
+            if checks.get("hexagon"):
+                current_add_lower_pass.append((2, vtcm_verify_pass(**checks.get("hexagon"))))
             current_config["tir.add_lower_pass"] = current_add_lower_pass
 
             with tvm.ir.transform.PassContext(
@@ -872,3 +876,20 @@ def verify_pass(f, *_):
         return f
 
     return tvm.tir.transform.prim_func_pass(verify_pass, opt_level=0)
+
+
+def vtcm_verify_pass(**kwargs):
+    """Verify the validity of a hexagon kernel.
+    This pass will check vtcm memory usage.
+    """
+
+    def verify_pass(f, *_):
+        sizes = tvm.tir.analysis.calculate_allocated_bytes(f)
+        vtcm_capacity = kwargs.get("vtcm_capacity", 0)
+        vtcm_allocated = sizes.get("global.vtcm", 0)
+        if 0 < vtcm_capacity < vtcm_allocated:
+            raise InstantiationError("Skipped because of invalid vtcm memory usage limit")
+
+        return f
+
+    return tvm.tir.transform.prim_func_pass(verify_pass, opt_level=0)
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 7081f992afd9..06e1776965c2 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -182,6 +182,10 @@ def thread_warp_size(self):
     def max_function_args(self):
         return int(self.attrs.get("max_function_args", -1))
 
+    @property
+    def vtcm_capacity(self):
+        return int(self.attrs.get("vtcm-capacity", 0))
+
     @property
     def device_name(self):
         return str(self.attrs.get("device", ""))
@@ -642,6 +646,8 @@ def hexagon(cpu_ver="v66", **kwargs):
         Whether to use IEEE HVX instructions
     num_cores : int (default: 4)
         The number of HVX threads. This attribute is required by meta scheduler.
+    vtcm_capacity: int (default: 0)
+        Hexagon VTCM capacity limitation. If the value is 0, the capacity is treated as unbounded.
 
     Note: Floating point support in HVX requires LLVM 14+.
     """
@@ -675,6 +681,7 @@ def get_arch_version(cpu_ver):
         "llvm_options": None,
         "use_qfloat": arch_version >= 68,
         "use_ieee_fp": False,
+        "vtcm_capacity": 0,
     }
     config.update(kwargs)
 
@@ -748,6 +755,7 @@ def create_llvm_options(cpu_ver, config):  # pylint: disable=unused-argument
 
     num_cores = config["num_cores"] if "num_cores" in kwargs else 4
     args_list.append("--num-cores=%d" % num_cores)
+    args_list.append("--vtcm-capacity=%d" % config["vtcm_capacity"])
 
     return Target(" ".join(["hexagon"] + args_list))
 
diff --git a/python/tvm/tir/analysis/analysis.py b/python/tvm/tir/analysis/analysis.py
index efb869efd6dc..45b1f745c3de 100644
--- a/python/tvm/tir/analysis/analysis.py
+++ b/python/tvm/tir/analysis/analysis.py
@@ -201,6 +201,22 @@ def calculate_constant_bytes(func: PrimFunc, constant_byte_alignment: int) -> in
     return _ffi_api.calculate_constant_bytes(func, constant_byte_alignment)  # type: ignore
 
 
+def calculate_allocated_bytes(func: PrimFunc) -> Dict[str, int]:
+    """Calculate allocated memory per memory scope required by TIR PrimFuncs.
+
+    Parameters
+    ----------
+    func: tvm.tir.PrimFunc
+        The function to be detected.
+
+    Returns
+    -------
+    result : Dict[String, int]
+        Allocated memory size per scope in bytes.
+    """
+    return _ffi_api.calculate_allocated_bytes(func)  # type: ignore
+
+
 def detect_buffer_access_lca(func: PrimFunc) -> Dict[Buffer, Stmt]:
     """Detect the lowest common ancestor(LCA) of buffer access, including both high-level
     access(BufferLoad, BufferStore) and low-level access(Load, Store and opaque access).
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
index 82533a2f9f5a..81b90d5f4051 100644
--- a/python/tvm/tir/transform/transform.py
+++ b/python/tvm/tir/transform/transform.py
@@ -611,6 +611,17 @@ def VerifyMemory():
     return _ffi_api.VerifyMemory()  # type: ignore
 
 
+def VerifyVTCMLimit(limit: int):
+    """Verify if the size of the allocated vtcm memory satisfies the limit.
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.VerifyVTCMLimit(limit)  # type: ignore
+
+
 # pylint: disable=no-else-return,inconsistent-return-statements
 def HoistIfThenElse(variant: Optional[str] = None):
     """Hoist loop-invariant IfThenElse nodes to outside the eligible loops.
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index 2f993c0c8b82..4ce7ad13bc60 100644
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1401,6 +1401,13 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i
       const auto& optimize = tir::transform::Sequential(pass_list);
       optimize(mod);
     }
+    if (IsHexagonTask(task)) {
+      Target target = task->target;
+      const auto vtcm_capacity = target->GetAttr<Integer>("vtcm-capacity").value().IntValue();
+      const auto& optimize =
+          tir::transform::Sequential({tir::transform::VerifyVTCMLimit(vtcm_capacity)});
+      optimize(mod);
+    }
     const auto& optimize =
         tir::transform::Sequential(Array<tvm::transform::Pass>{tir::transform::Simplify()});
     mod = optimize(std::move(mod));
diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h
index 44b60de1d7ad..ca8979c0e829 100644
--- a/src/auto_scheduler/search_policy/utils.h
+++ b/src/auto_scheduler/search_policy/utils.h
@@ -58,6 +58,11 @@ inline bool IsGPUTask(const SearchTask& task) {
          device_type == kDLMetal || device_type == kDLROCM || device_type == kOpenGL;
 }
 
+/*! \brief Return whether the search task is targeting a Hexagon. */
+inline bool IsHexagonTask(const SearchTask& task) {
+  return (task)->target->GetTargetDeviceType() == kDLHexagon;
+}
+
 /*! \brief Return whether the search task is targeting a CUDA GPU. */
 inline bool IsCUDATask(const SearchTask& task) {
   return (task)->target->GetTargetDeviceType() == kDLCUDA;
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index 90676e0b840b..10d9e8023a61 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -54,6 +54,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.dma_bypass_cache", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.vtcm_capacity", Integer);
 
 using tvm::Array;
 using tvm::transform::Pass;
@@ -225,8 +226,6 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
   if (!disable_storage_rewrite) {
     pass_list.push_back(tir::transform::StorageRewrite());
   }
-  // LowerVtcmAlloc must occur after any transformations that modify memory allocation locations
-  pass_list.push_back(tir::transform::LowerVtcmAlloc());
   bool use_async_copy = pass_ctx->GetConfig<Bool>("tir.use_async_copy", Bool(false)).value();
 
   if (use_async_copy) {
@@ -532,11 +531,25 @@ runtime::Module build(const IRModule& funcs, const Target& target_arg,
   return TIRToRuntime(inputs, target_host);
 }
 
+int64_t GetVTCMCapacity(Target target, const transform::PassContext& pass_ctx) {
+  if (!target.defined()) target = Target::Current(/*allow_not_defined=*/true);
+  if (target.defined() && target->kind->name == "hexagon") {
+    auto value = Downcast<Integer>(target->attrs.at("vtcm-capacity"))->value;
+    if (value > 0) return value;
+  }
+  return pass_ctx->GetConfig<Integer>("tir.vtcm_capacity", Integer(0)).value()->value;
+}
+
 transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) {
   transform::PassContext pass_ctx = transform::PassContext::Current();
 
   Array<Pass> mixed_pass_list;
 
+  // VerifyVTCMLimit must occur before LowerVtcmAlloc
+  mixed_pass_list.push_back(tir::transform::VerifyVTCMLimit(GetVTCMCapacity(target, pass_ctx)));
+  // LowerVtcmAlloc must occur after any transformations that modify memory allocation locations
+  mixed_pass_list.push_back(tir::transform::LowerVtcmAlloc());
+
   mixed_pass_list.push_back(tir::transform::BindTarget(target));
 
   mixed_pass_list.push_back(tir::transform::VerifyMemory());
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index ef350004ad52..a87bb92c483b 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -421,6 +421,7 @@ TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon)
     .add_attr_option<String>("mtriple")
     .add_attr_option<Array<String>>("llvm-options")
     .add_attr_option<Integer>("num-cores")
+    .add_attr_option<Integer>("vtcm-capacity")
     .set_default_keys({"hexagon"});
 
 TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU);
diff --git a/src/tir/analysis/calculate_allocated_memory.cc b/src/tir/analysis/calculate_allocated_memory.cc
new file mode 100644
index 000000000000..01457508ab95
--- /dev/null
+++ b/src/tir/analysis/calculate_allocated_memory.cc
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tir/analysis/calculate_allocated_memory.cc
+ * \brief Calculate allocated memory per memory scope required by PrimFuncs.
+ */
+#include <tvm/arith/analyzer.h>
+#include <tvm/runtime/container/map.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/function.h>
+#include <tvm/tir/stmt_functor.h>
+#include <tvm/tir/usmp/utils.h>
+
+#include <algorithm>
+#include <map>
+#include <unordered_map>
+
+namespace tvm {
+namespace tir {
+
+template <typename T>
+class AllocationCalculator : public StmtExprVisitor {
+ public:
+  AllocationCalculator() = default;
+  tvm::Map<String, Integer> operator()(const PrimFunc& func);
+
+ private:
+  void VisitStmt_(const T* op) override;
+  std::unordered_map<std::string, int64_t> _max_size;
+  std::unordered_map<std::string, int64_t> _current_size;
+};
+
+template <typename T>
+tvm::Map<String, Integer> AllocationCalculator<T>::operator()(const PrimFunc& func) {
+  this->VisitStmt(func->body);
+  tvm::Map<String, Integer> res;
+  for (auto [k, v] : _max_size) {
+    res.Set(String(k), Integer(v));
+  }
+  return res;
+}
+
+std::string GetStorageScope(const Var& var) {
+  auto* ptr = var->type_annotation.as<PointerTypeNode>();
+  ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType";
+  return ptr->storage_scope;
+}
+
+template <typename T>
+void AllocationCalculator<T>::VisitStmt_(const T* op) {
+  std::string storage_scope = GetStorageScope(op->buffer_var);
+  auto search = _current_size.find(storage_scope);
+  if (search == _current_size.end()) {
+    _current_size[storage_scope] = 0;
+    _max_size[storage_scope] = 0;
+  }
+  auto size = op->ConstantAllocationSize() * op->dtype.bytes() * op->dtype.lanes();
+  _current_size[storage_scope] += size;
+  _max_size[storage_scope] = std::max(_current_size[storage_scope], _max_size[storage_scope]);
+  StmtExprVisitor::VisitStmt(op->body);
+  _current_size[storage_scope] -= size;
+}
+
+tvm::Map<String, Integer> CalculateAllocatedBytes(const PrimFunc& func) {
+  return AllocationCalculator<AllocateNode>()(func);
+}
+
+TVM_REGISTER_GLOBAL("tir.analysis.calculate_allocated_bytes").set_body_typed([](PrimFunc func) {
+  return CalculateAllocatedBytes(func);
+});
+
+namespace transform {
+
+Pass VerifyVTCMLimit(const Integer& limit) {
+  auto pass_func = [=](IRModule mod, PassContext ctx) {
+    for (auto kv : mod->functions) {
+      if (auto* n = kv.second.as<PrimFuncNode>()) {
+        auto func = GetRef<PrimFunc>(n);
+        auto sizes = CalculateAllocatedBytes(func);
+        const auto vtcm_allocated = sizes.Get("global.vtcm").value_or(0);
+        if (limit.IntValue() > 0 && vtcm_allocated.IntValue() > limit.IntValue()) {
+          LOG(FATAL) << "RuntimeError: The global.vtcm memory allocation limit has been "
+                        "exceeded(allocated: "
+                     << vtcm_allocated << ", limit: " << limit << ").\n"
+                     << "In function\n"
+                     << func;
+        }
+      }
+    }
+    return mod;
+  };
+  return tvm::transform::CreateModulePass(pass_func, 0, "tir.calculate_allocated_bytes", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.VerifyVTCMLimit").set_body_typed(VerifyVTCMLimit);
+
+}  // namespace transform
+}  // namespace tir
+}  // namespace tvm
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
index 5b13513c0fb3..fcb811fce742 100644
--- a/tests/python/contrib/test_hexagon/infrastructure.py
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -324,7 +324,7 @@ def quantize_np(arr_np: numpy.ndarray, dtype: str):
     return quant_np, scale, zero_point
 
 
-def get_hexagon_target(cpu_ver: str) -> tvm.target.Target:
+def get_hexagon_target(cpu_ver: str, **kwargs) -> tvm.target.Target:
     """Creates a Hexagon target"""
-    target = tvm.target.hexagon(cpu_ver)
+    target = tvm.target.hexagon(cpu_ver, **kwargs)
     return tvm.target.Target(target, host=target)
diff --git a/tests/python/contrib/test_hexagon/test_vtcm.py b/tests/python/contrib/test_hexagon/test_vtcm.py
index 11188436a318..e71f890740c1 100644
--- a/tests/python/contrib/test_hexagon/test_vtcm.py
+++ b/tests/python/contrib/test_hexagon/test_vtcm.py
@@ -16,9 +16,11 @@
 # under the License.
 """VTCM Tests"""
 
+import pytest
 import tvm.testing
 from tvm import tir
 from tvm.script import tir as T
+from .infrastructure import get_hexagon_target
 
 
 @T.prim_func
@@ -31,8 +33,7 @@ def scale_by_two(buffer_a: T.Buffer[(8192,), "int8"], buffer_c: T.Buffer[(8192,)
             buffer_c[i] = buffer_a[i] * T.int8(2)
 
 
-def test_vtcm_lowering():
-    """Test lowering with vtcm mem scope"""
+def get_scale_by_two_schedule():
     mod = tvm.IRModule.from_expr(scale_by_two.with_attr("global_symbol", "main"))
     sch = tir.Schedule(mod, debug_mask="all")
     block_c = sch.get_block("C")
@@ -40,23 +41,47 @@ def test_vtcm_lowering():
     outer, _, _, _ = sch.split(flat, factors=[8, 4, 2, 128])
     cache_block = sch.cache_read(block_c, 0, storage_scope="global.vtcm")
     sch.compute_at(cache_block, outer)
-    lowered = tvm.lower(sch.mod["main"])
+    return sch
 
-    def ir_module_has_allocate_nodes(irmod):
-        nallocs = 0
 
-        def _visit(stmt):
-            nonlocal nallocs
-            if isinstance(stmt, tvm.tir.Allocate):
-                nallocs += 1
+@tvm.testing.requires_hexagon
+def test_vtcm_building():
+    """Test building with vtcm mem scope"""
+    sch = get_scale_by_two_schedule()
+    target = get_hexagon_target("v68")
+    built = tvm.build(sch.mod, target=target)
+    assert "global.vtcm" in built.get_source("asm")
 
-        tvm.tir.stmt_functor.post_order_visit(irmod["main"].body, _visit)
-        return nallocs
 
-    assert not ir_module_has_allocate_nodes(lowered), (
-        "AllocateNode found in lowered IRModule, "
-        "VTCM allocations should have been lowered to tir.nd_mem_alloc_with_scope"
-    )
+@tvm.testing.requires_hexagon
+@pytest.mark.parametrize("vtcm_capacity,limited", [(8192, False), (1024, False), (128, True)])
+def test_vtcm_limit(vtcm_capacity, limited):
+    """Test building with vtcm mem scope limit"""
+    sch = get_scale_by_two_schedule()
+
+    def _raises_exception(f):
+        try:
+            f()
+        except tvm._ffi.base.TVMError:
+            return True
+        return False
+
+    target = get_hexagon_target("v68", vtcm_capacity=vtcm_capacity)
+
+    assert (
+        _raises_exception(lambda: tvm.build(sch.mod, target=target)) == limited
+    ), "Case 1 - arg. VTCM memory allocation limiter does not work correctly "
+
+    with target:
+        assert (
+            _raises_exception(lambda: tvm.build(sch.mod)) == limited
+        ), "Case 2 - with.VTCM memory allocation limiter does not work correctly "
+
+    with tvm.transform.PassContext(config={"tir.vtcm_capacity": vtcm_capacity}):
+        assert (
+            _raises_exception(lambda: tvm.build(sch.mod, target=get_hexagon_target("v68")))
+            == limited
+        ), "Case 3 - context. VTCM memory allocation limiter does not work correctly "
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_tir_analysis_calculate_allocated_memory.py b/tests/python/unittest/test_tir_analysis_calculate_allocated_memory.py
new file mode 100644
index 000000000000..1a2d50ef5d7f
--- /dev/null
+++ b/tests/python/unittest/test_tir_analysis_calculate_allocated_memory.py
@@ -0,0 +1,101 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+import tvm
+from tvm import tir
+from tvm.script import tir as T
+
+
+@T.prim_func
+def scale_by_two(a: T.Buffer[(128,), "int8"], c: T.Buffer[(128,), "int8"]):
+    for i in T.serial(128):
+        with T.block("C"):
+            c[i] = a[i] * T.int8(2)
+
+
+@T.prim_func
+def scale_by_two_three(a: T.Buffer[(128,), "int8"], c: T.Buffer[(128,), "int8"]):
+    B = T.alloc_buffer([128], dtype="int8", scope="global.vtcm")
+    for i in T.serial(128):
+        with T.block("B"):
+            B[i] = a[i] * T.int8(2)
+    for i in T.serial(128):
+        with T.block("C"):
+            c[i] = B[i] * T.int8(3)
+
+
+@pytest.mark.parametrize("primFunc,size", [(scale_by_two, 128), (scale_by_two_three, 256)])
+def test_scale_by(primFunc, size):
+    """Test calculate allocated bytes per scope"""
+    mod = tvm.IRModule.from_expr(primFunc.with_attr("global_symbol", "main"))
+    sch = tir.Schedule(mod, debug_mask="all")
+    block_c = sch.get_block("C")
+    (flat,) = sch.get_loops(block_c)
+    cache_block = sch.cache_read(block_c, 0, storage_scope="global.vtcm")
+    sch.compute_at(cache_block, flat)
+
+    mod = sch.mod
+    mod = tvm.tir.transform.ConvertBlocksToOpaque()(mod)
+    mod = tvm.tir.transform.LowerOpaqueBlock()(mod)
+    sizes = tvm.tir.analysis.calculate_allocated_bytes(mod["main"])
+    assert sizes.get("global.vtcm", 0) == size
+
+
+@T.prim_func
+def matmul_mix_scope(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, [128, 128], scope="global")
+    B = T.match_buffer(b, [128, 128], scope="global")
+    C = T.match_buffer(c, [128, 128], scope="global")
+    A_allocated = T.alloc_buffer([128, 128], dtype="float32", scope="global.texture")
+    B_allocated = T.alloc_buffer([128, 128], dtype="float32", scope="global.texture")
+    C_allocated = T.alloc_buffer([128, 128], dtype="float32", scope="global")
+
+    for i, j in T.grid(128, 128):
+        with T.block("A.allocated"):
+            A_allocated[i, j] = A[i, j]
+    for i, j in T.grid(128, 128):
+        with T.block("B.allocated"):
+            B_allocated[i, j] = B[i, j]
+
+    for i, j, k in T.grid(128, 128, 128):
+        with T.block("update"):
+            vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+            with T.init():
+                C_allocated[vi, vj] = 0.0
+            C_allocated[vi, vj] = C[vi, vj] + A_allocated[vi, vk] * B_allocated[vj, vk]
+
+    for i, j in T.grid(128, 128):
+        with T.block("C"):
+            C[i, j] = C_allocated[i, j]
+
+
+@pytest.mark.parametrize(
+    "scope,size", [("global", 65536), ("global.texture", 131072), ("global.texture-nhwc", 0)]
+)
+def test_matmul_mix_scope(scope, size):
+    """Test calculate allocated bytes per scope"""
+    mod = tvm.IRModule({"main": matmul_mix_scope})
+    mod = tvm.tir.transform.LowerInitBlock()(mod)
+    mod = tvm.tir.transform.ConvertBlocksToOpaque()(mod)
+    mod = tvm.tir.transform.LowerOpaqueBlock()(mod)
+    sizes = tvm.tir.analysis.calculate_allocated_bytes(mod["main"])
+    assert sizes.get(scope, 0) == size
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From a923ed3c89c303808c61882b7930751b1f7fb0b1 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 2 Dec 2022 09:01:39 -0600
Subject: [PATCH 686/704] [TIR] Introduce ReduceBranchingThroughOvercompute
 (#13299)

This PR introduces a new transform, which attempts to reduce branching
by adding overcompute, where the overcompute has provably no effect on
any observable value outside the PrimFunc.
---
 python/tvm/tir/transform/transform.py         |  11 +
 src/arith/rewrite_simplify.cc                 |  10 +-
 .../reduce_branching_through_overcompute.cc   | 178 ++++++++++++++
 src/tir/transforms/remove_no_op.cc            |  28 ++-
 src/tir/transforms/remove_no_op.h             |  60 +++++
 src/tir/transforms/simplify.h                 |  42 ++++
 ...rm_reduce_branching_through_overcompute.py | 219 ++++++++++++++++++
 7 files changed, 536 insertions(+), 12 deletions(-)
 create mode 100644 src/tir/transforms/reduce_branching_through_overcompute.cc
 create mode 100644 src/tir/transforms/remove_no_op.h
 create mode 100644 src/tir/transforms/simplify.h
 create mode 100644 tests/python/unittest/test_tir_transform_reduce_branching_through_overcompute.py

diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
index 81b90d5f4051..9b0e5748bcc0 100644
--- a/python/tvm/tir/transform/transform.py
+++ b/python/tvm/tir/transform/transform.py
@@ -242,6 +242,17 @@ def UnrollLoop():
     return _ffi_api.UnrollLoop()  # type: ignore
 
 
+def ReduceBranchingThroughOvercompute():
+    """Reduce branching by introducing overcompute
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.ReduceBranchingThroughOvercompute()  # type: ignore
+
+
 def RemoveNoOp():
     """Remove No Op from the Stmt.
 
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index c9d92f992564..f1838f5a9099 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -1388,8 +1388,12 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const EQNode* op) {
   EQ ret = Downcast<EQ>(IRMutatorWithAnalyzer::VisitExpr_(op));
   op = ret.get();
 
-  if (auto const_res = TryConstFold<EQ>(op->a, op->b)) return const_res.value();
-  if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
+  if (auto const_res = TryConstFold<EQ>(op->a, op->b)) {
+    return const_res.value();
+  }
+  if (auto match = TryMatchLiteralConstraint(ret)) {
+    return match.value();
+  }
 
   return ApplyRewriteRules(ret);
 }
@@ -1419,7 +1423,7 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(EQ ret) {
     TVM_TRY_REWRITE(x - c1 == 0, x == c1);
     TVM_TRY_REWRITE(c1 - x == 0, x == c1);
     TVM_TRY_REWRITE(x + c1 == 0, x == 0 - c1);
-    TVM_TRY_REWRITE(x * y == 0, x == 0 || y == 0);
+    TVM_TRY_RECURSIVE_REWRITE(x * y == 0, x == 0 || y == 0);
   }
   return std::move(ret);
 }
diff --git a/src/tir/transforms/reduce_branching_through_overcompute.cc b/src/tir/transforms/reduce_branching_through_overcompute.cc
new file mode 100644
index 000000000000..8c8824719276
--- /dev/null
+++ b/src/tir/transforms/reduce_branching_through_overcompute.cc
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file reduce_branching_through_overcompute.cc
+ *
+ * \brief Attempt to remove conditional statements by introducing
+ * extra computations that do not impact the final results.
+ */
+
+#include <tvm/tir/op.h>
+#include <tvm/tir/transform.h>
+
+#include <optional>
+
+#include "../../arith/ir_mutator_with_analyzer.h"
+#include "../analysis/control_flow_graph.h"
+#include "remove_no_op.h"
+#include "simplify.h"
+
+namespace tvm {
+namespace tir {
+
+struct ReduceBranchingThroughOvercomputeConfigNode
+    : public tvm::AttrsNode<ReduceBranchingThroughOvercomputeConfigNode> {
+  bool use_dataflow_analysis;
+
+  TVM_DECLARE_ATTRS(ReduceBranchingThroughOvercomputeConfigNode,
+                    "tir.transform.ReduceBranchingThroughOvercomputeConfig") {
+    TVM_ATTR_FIELD(use_dataflow_analysis)
+        .describe(
+            "If true, known buffer values are propagated and used "
+            "to statically prove that overcompute is valid.")
+        .set_default(false);
+  }
+};
+
+class ReduceBranchingThroughOvercomputeConfig : public Attrs {
+ public:
+  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(ReduceBranchingThroughOvercomputeConfig, Attrs,
+                                            ReduceBranchingThroughOvercomputeConfigNode);
+};
+
+TVM_REGISTER_NODE_TYPE(ReduceBranchingThroughOvercomputeConfigNode);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.ReduceBranchingThroughOvercompute",
+                                ReduceBranchingThroughOvercomputeConfig);
+
+struct ElseBranchFiller : StmtExprMutator {
+  Stmt VisitStmt_(const IfThenElseNode* op) override {
+    IfThenElse ret = Downcast<IfThenElse>(StmtExprMutator::VisitStmt_(op));
+    if (ret->else_case.defined()) {
+      return std::move(ret);
+    } else {
+      auto new_else_clause = Evaluate(0);
+      new_else_clauses.insert(new_else_clause);
+      return IfThenElse(ret->condition, ret->then_case, new_else_clause);
+    }
+  }
+
+  std::unordered_set<Evaluate, ObjectPtrHash, ObjectPtrEqual> new_else_clauses;
+};
+
+class ElseBranchStripper : public StmtExprMutator {
+ public:
+  ElseBranchStripper(
+      const std::unordered_set<Evaluate, ObjectPtrHash, ObjectPtrEqual>& new_else_clauses)
+      : new_else_clauses_(new_else_clauses) {}
+
+ private:
+  Stmt VisitStmt_(const IfThenElseNode* op) override {
+    IfThenElse ret = Downcast<IfThenElse>(StmtExprMutator::VisitStmt_(op));
+    auto as_eval = ret->else_case.as<EvaluateNode>();
+    if (as_eval && new_else_clauses_.count(GetRef<Evaluate>(as_eval))) {
+      return IfThenElse(ret->condition, ret->then_case);
+    } else {
+      return std::move(ret);
+    }
+  }
+
+  const std::unordered_set<Evaluate, ObjectPtrHash, ObjectPtrEqual>& new_else_clauses_;
+};
+
+class BranchReducer : public arith::IRMutatorWithAnalyzer {
+ public:
+  static Stmt Apply(Stmt stmt, const std::optional<ControlFlowGraph>& touch_pattern) {
+    arith::Analyzer analyzer;
+    BranchReducer visitor(&analyzer, touch_pattern);
+    return visitor(std::move(stmt));
+  }
+
+ private:
+  using Parent = IRMutatorWithAnalyzer;
+  using Parent::VisitStmt;
+  using Parent::VisitStmt_;
+
+  BranchReducer(arith::Analyzer* analyzer, const std::optional<ControlFlowGraph>& touch_pattern)
+      : Parent(analyzer), touch_pattern_(touch_pattern) {}
+
+  Stmt VisitStmt_(const IfThenElseNode* op) final {
+    IfThenElse cond = Downcast<IfThenElse>(Parent::VisitStmt_(op));
+
+    auto is_special_case = [&](PrimExpr condition, Stmt general_case, Stmt special_case) -> bool {
+      condition = analyzer_->rewrite_simplify(condition);
+      With<arith::ConstraintContext> constraint(analyzer_, condition);
+      Stmt stmt = RemoveNoOp(general_case, analyzer_, touch_pattern_, special_case.get());
+      return StructuralEqual()(stmt, special_case);
+    };
+
+    ICHECK(cond->else_case.defined() || !touch_pattern_.has_value())
+        << "Temp assert, should be true whenever touch pattern is available";
+    Stmt else_case = cond->else_case.value_or(Evaluate(0));
+
+    if (is_special_case(cond->condition, else_case, cond->then_case)) {
+      return else_case;
+    } else if (is_special_case(!cond->condition, cond->then_case, else_case)) {
+      return cond->then_case;
+    } else {
+      return std::move(cond);
+    }
+  }
+
+ private:
+  const std::optional<ControlFlowGraph>& touch_pattern_;
+};
+
+namespace transform {
+
+Pass ReduceBranchingThroughOvercompute() {
+  auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) {
+    arith::Analyzer analyzer;
+
+    ReduceBranchingThroughOvercomputeConfig config =
+        ctx->GetConfig<ReduceBranchingThroughOvercomputeConfig>(
+               "tir.ReduceBranchingThroughOvercompute")
+            .value_or(AttrsWithDefaultValues<ReduceBranchingThroughOvercomputeConfig>());
+
+    auto* n = f.CopyOnWrite();
+
+    std::optional<ControlFlowGraph> touch_pattern = std::nullopt;
+    ElseBranchFiller else_branch_filler;
+    if (config->use_dataflow_analysis) {
+      n->body = else_branch_filler(std::move(n->body));
+      touch_pattern.emplace(n->body);
+    }
+
+    n->body = BranchReducer::Apply(std::move(n->body), touch_pattern);
+
+    if (config->use_dataflow_analysis) {
+      n->body = ElseBranchStripper(else_branch_filler.new_else_clauses)(std::move(n->body));
+    }
+    return f;
+  };
+  return CreatePrimFuncPass(pass_func, 0, "tir.ReduceBranchingThroughOvercompute", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.ReduceBranchingThroughOvercompute")
+    .set_body_typed(ReduceBranchingThroughOvercompute);
+
+}  // namespace transform
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/transforms/remove_no_op.cc b/src/tir/transforms/remove_no_op.cc
index 52468e73d474..430c1f41bfaf 100644
--- a/src/tir/transforms/remove_no_op.cc
+++ b/src/tir/transforms/remove_no_op.cc
@@ -220,16 +220,21 @@ class NoOpRemover : public arith::IRMutatorWithAnalyzer {
         touch_pattern_->RemoveStore(store);
         return only_side_effects();
       }
+    }
 
-      // A write whose destination is known to already contain the
-      // values to be written is a no-op.
-      PrimExpr stores_existing_value = store->value == BufferLoad(store->buffer, store->indices);
-
-      PrimExpr simplified =
-          touch_pattern_->SimplifyInContext(stores_existing_value, context, analyzer_);
-      if (auto* as_int = as_const_int(simplified); as_int && *as_int) {
-        return only_side_effects();
-      }
+    // A write whose destination is known to already contain the
+    // values to be written is a no-op.
+    // PrimExpr stores_existing_value = store->value == BufferLoad(store->buffer, store->indices);
+    PrimExpr stores_existing_value = store->value - BufferLoad(store->buffer, store->indices) == 0;
+    if (touch_pattern_.has_value()) {
+      Stmt context_arg = context_ ? GetRef<Stmt>(context_) : Stmt(store);
+      stores_existing_value =
+          touch_pattern_->SimplifyInContext(stores_existing_value, context_arg, analyzer_);
+    } else {
+      stores_existing_value = analyzer_->Simplify(stores_existing_value);
+    }
+    if (is_one(stores_existing_value)) {
+      return only_side_effects();
     }
 
     // If the stored value is a load from the same location, the
@@ -293,6 +298,11 @@ class NoOpRemover : public arith::IRMutatorWithAnalyzer {
   const StmtNode* context_;
 };
 
+Stmt RemoveNoOp(Stmt stmt, arith::Analyzer* analyzer, std::optional<ControlFlowGraph> touch_pattern,
+                const StmtNode* context) {
+  return NoOpRemover::Apply(std::move(stmt), analyzer, std::move(touch_pattern), context);
+}
+
 namespace transform {
 
 Pass RemoveNoOp() {
diff --git a/src/tir/transforms/remove_no_op.h b/src/tir/transforms/remove_no_op.h
new file mode 100644
index 000000000000..e24c32b5da18
--- /dev/null
+++ b/src/tir/transforms/remove_no_op.h
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file remove_no_op.h
+ * \brief Helper functions to construct and compose IR nodes.
+ */
+#ifndef TVM_TIR_TRANSFORMS_REMOVE_NO_OP_H_
+#define TVM_TIR_TRANSFORMS_REMOVE_NO_OP_H_
+
+#include <tvm/arith/analyzer.h>
+#include <tvm/tir/stmt.h>
+
+#include <optional>
+
+#include "../analysis/control_flow_graph.h"
+
+namespace tvm {
+namespace tir {
+
+/* \brief Remove no-ops from the statement
+ *
+ * Applies the same behavior as the tir.transform.RemoveNoOp pass, but
+ * on a single statement, usable as a subroutine in other passes.
+ *
+ * \param stmt The TIR statement from which to remove no-ops
+ *
+ * \param analyzer The analyzer to use while proving no-ops
+ *
+ * \param control_flow The analyzed control-flow graph, which contains
+ * the `stmt` to be analyzed.  If provided, known buffer values will
+ * be used to remove no-ops.  (e.g. Removing `buf[i] = 0` in cases
+ * where `buf[i]` is known to already contain zero.)  If nullptr,
+ * known buffer values will not be used.
+ *
+ * \return The modified statement with no-ops removed
+ */
+Stmt RemoveNoOp(Stmt stmt, arith::Analyzer* analyzer,
+                std::optional<ControlFlowGraph> touch_pattern = std::nullopt,
+                const StmtNode* context = nullptr);
+
+}  // namespace tir
+}  // namespace tvm
+#endif  // TVM_TIR_TRANSFORMS_REMOVE_NO_OP_H_
diff --git a/src/tir/transforms/simplify.h b/src/tir/transforms/simplify.h
new file mode 100644
index 000000000000..43afc5e48dcb
--- /dev/null
+++ b/src/tir/transforms/simplify.h
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file simplify.h
+ * \brief Helper functions to construct and compose IR nodes.
+ */
+#ifndef TVM_TIR_TRANSFORMS_SIMPLIFY_H_
+#define TVM_TIR_TRANSFORMS_SIMPLIFY_H_
+
+#include <tvm/arith/analyzer.h>
+#include <tvm/tir/stmt.h>
+
+namespace tvm {
+namespace tir {
+
+/* \brief Simplifies the statement
+ *
+ * Applies the same behavior as the tir.transform.Simplify pass, but
+ * on a single statement, usable as a subroutine in other passes.
+ */
+Stmt Simplify(Stmt stmt, arith::Analyzer* analyzer);
+
+}  // namespace tir
+}  // namespace tvm
+#endif  // TVM_TIR_TRANSFORMS_SIMPLIFY_H_
diff --git a/tests/python/unittest/test_tir_transform_reduce_branching_through_overcompute.py b/tests/python/unittest/test_tir_transform_reduce_branching_through_overcompute.py
new file mode 100644
index 000000000000..13fbcc7594ec
--- /dev/null
+++ b/tests/python/unittest/test_tir_transform_reduce_branching_through_overcompute.py
@@ -0,0 +1,219 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+import tvm.testing
+from tvm.script import tir as T
+
+import pytest
+
+
+class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
+    use_dataflow_analysis = False
+
+    def transform(self):
+        def inner(mod):
+            config = {
+                "tir.ReduceBranchingThroughOvercompute": {
+                    "use_dataflow_analysis": self.use_dataflow_analysis,
+                }
+            }
+            with tvm.transform.PassContext(config=config):
+                mod = tvm.tir.transform.ReduceBranchingThroughOvercompute()(mod)
+            return mod
+
+        return inner
+
+
+class TestIntroduceNoOp(BaseBeforeAfter):
+    """Remove a conditional by introducing a no-op
+
+    If the else_case can have a no-op added in order to be identical
+    to the then_case, then the conditional can be removed.
+    """
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 14:
+                A[i] = 1
+                T.evaluate(0)
+            else:
+                A[i] = 1
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 1
+            T.evaluate(0)
+
+
+class TestIntroduceAdditionOfZero(BaseBeforeAfter):
+    """Insert a conditionally no-op statement
+
+    Overcompute doesn't need to explicitly be a no-op, and can be
+    something that simplifies to a no-op.  Here, when i==0, the
+    expression simplifies to ``A[0] = A[0]``, which is a no-op.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[1, "int32"]):
+        for i in T.serial(16):
+            if i > 0:
+                A[0] = A[0] + i * i
+
+    def expected(A: T.Buffer[1, "int32"]):
+        for i in T.serial(16):
+            A[0] = A[0] + i * i
+
+
+class TestIntroduceAdditionOfKnownZeroInBuffer(BaseBeforeAfter):
+    """Insert a conditionally no-op statement
+
+    Proving that the overcompute is a no-op may use known values that
+    are present in a buffer.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"], B: T.Buffer[1, "int32"]):
+        for i in T.serial(16):
+            T.evaluate(T.assume(i < 14 or A[i] == 0))
+
+        B[0] = 0
+        for i in T.serial(16):
+            if i < 14:
+                B[0] = B[0] + A[i]
+
+    def expected(A: T.Buffer[16, "int32"], B: T.Buffer[1, "int32"]):
+        for i in T.serial(16):
+            T.evaluate(T.assume(i < 14 or A[i] == 0))
+
+        B[0] = 0
+        for i in T.serial(16):
+            B[0] = B[0] + A[i]
+
+
+class TestIntroduceOverwrittenWrite(BaseBeforeAfter):
+    """Insert a write that is later overwritten.
+
+    Given two sequential writes to the same location without a read
+    occurring in-between, the first is a no-op.  Therefore, the
+    conditional in the first loop can be removed, with any temporary
+    values overwritten by the second loop.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 14:
+                A[i] = 1
+
+        for i in T.serial(16):
+            if i >= 14:
+                A[i] = 2
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 1
+
+        for i in T.serial(16):
+            if i >= 14:
+                A[i] = 2
+
+
+class TestMaintainValuesUsedLater(BaseBeforeAfter):
+    """Do not insert writes that would be used later.
+
+    As TestIntroduceOverwrittenWrite, except that the values stored at
+    A[14] and A[15] are used by the second loop.  Overwriting them in
+    the first loop would change the result, so the overcompute would
+    not be valid.
+    """
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 14:
+                A[i] = 1
+
+        for i in T.serial(16):
+            if i >= 14:
+                A[i] = A[i] + 1
+
+    expected = before
+
+
+class TestIdentifyOverwrittenWriteFromEquivalentExpressions(BaseBeforeAfter):
+    """Insert a write that is later overwritten.
+
+    As TestIntroduceOverwrittenWrite, but the conditionals used in the
+    first and second loop have different structures while referring to
+    the same elements.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 14:
+                A[i] = 1
+
+        for io, ii in T.grid(4, 4):
+            if io == 3 and ii >= 2:
+                A[4 * io + ii] = 2
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 1
+
+        for io, ii in T.grid(4, 4):
+            if io == 3 and ii >= 2:
+                A[4 * io + ii] = 2
+
+
+class TestIntroduceSupersetOverwrittenWrite(BaseBeforeAfter):
+    """Insert a write that is later overwritten.
+
+    As TestIntroduceOverwrittenWrite, but the elements written in the
+    second loop are not distinct from the elements in the first loop.
+    So long as the writes introduced by overcompute in the first loop
+    are a subset of the writes present in the second loop, the
+    overcompute can be introduced.
+    """
+
+    use_dataflow_analysis = True
+
+    def before(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            if i < 14:
+                A[i] = 1
+
+        for i in T.serial(16):
+            if i >= 14:
+                A[i] = 2
+
+    def expected(A: T.Buffer[16, "int32"]):
+        for i in T.serial(16):
+            A[i] = 1
+
+        for i in T.serial(16):
+            if i >= 14:
+                A[i] = 2
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 72c3399207793ee9c28639995c88a64115f2193b Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 2 Dec 2022 15:29:58 -0800
Subject: [PATCH 687/704] [ci][docker] Update NRF command line tools URL
 (#13541)

The old one began to 404, this one is taken for 10.2.1 from
https://www.nordicsemi.com/Products/Development-tools/nrf-command-line-tools/download
---
 docker/install/ubuntu_install_nrfjprog.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/install/ubuntu_install_nrfjprog.sh b/docker/install/ubuntu_install_nrfjprog.sh
index 372c39a06a58..1a82f057a950 100755
--- a/docker/install/ubuntu_install_nrfjprog.sh
+++ b/docker/install/ubuntu_install_nrfjprog.sh
@@ -22,7 +22,7 @@ set -o pipefail
 set -x
 
 NRF_COMMANDLINE_TOOLS_FILE=nRFCommandLineToolsLinuxamd64.tar.gz
-NRF_COMMANDLINE_TOOLS_URL=https://www.nordicsemi.com/-/media/Software-and-other-downloads/Desktop-software/nRF-command-line-tools/sw/Versions-10-x-x/10-12-1/nRFCommandLineTools10121Linuxamd64.tar.gz
+NRF_COMMANDLINE_TOOLS_URL=https://nsscprodmedia.blob.core.windows.net/prod/software-and-other-downloads/desktop-software/nrf-command-line-tools/sw/versions-10-x-x/10-12-1/nrfcommandlinetools10121linuxamd64.tar.gz
 NRF_COMMANDLINE_TOOLS_INSTALLER=nRF-Command-Line-Tools_10_12_1_Linux-amd64.deb
 JLINK_LINUX_INSTALLER=JLink_Linux_V688a_x86_64.deb
 

From d31a1fb0dbea484dec045c22ce2a756aa1071b38 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 2 Dec 2022 17:39:15 -0800
Subject: [PATCH 688/704] [ci] Dis-allow any non-S3 URLs in CI (#13283)

* [ci] Dis-allow any non-S3 URLs in CI

This PR makes it so any URLs accessed in tests in CI must be hosted in
S3. This improves reliability as we've seen even files on GitHub
sometimes serve 503s even when everything else is working fine. This
raises an error if any unallowed URL is detected and adds the remaining
few.
---
 tests/python/frontend/darknet/test_forward.py | 200 ++++++++++--------
 tests/scripts/request_hook/request_hook.py    | 161 +++++++++++++-
 2 files changed, 263 insertions(+), 98 deletions(-)

diff --git a/tests/python/frontend/darknet/test_forward.py b/tests/python/frontend/darknet/test_forward.py
index 5e6af51f3298..58695e1fd63f 100644
--- a/tests/python/frontend/darknet/test_forward.py
+++ b/tests/python/frontend/darknet/test_forward.py
@@ -34,15 +34,29 @@
 from tvm import relay
 
 REPO_URL = "https://github.com/dmlc/web-data/blob/main/darknet/"
-DARKNET_LIB = "libdarknet2.0.so"
-DARKNETLIB_URL = REPO_URL + "lib/" + DARKNET_LIB + "?raw=true"
-LIB = __darknetffi__.dlopen(download_testdata(DARKNETLIB_URL, DARKNET_LIB, module="darknet"))
 
-DARKNET_TEST_IMAGE_NAME = "dog.jpg"
-DARKNET_TEST_IMAGE_URL = REPO_URL + "data/" + DARKNET_TEST_IMAGE_NAME + "?raw=true"
-DARKNET_TEST_IMAGE_PATH = download_testdata(
-    DARKNET_TEST_IMAGE_URL, DARKNET_TEST_IMAGE_NAME, module="data"
-)
+# Lazily initialized
+DARKNET_TEST_IMAGE_PATH = None
+LIB = None
+
+
+def _lib():
+    global LIB
+    lib = "libdarknet2.0.so"
+    url = REPO_URL + "lib/" + lib + "?raw=true"
+    if LIB is None:
+        LIB = __darknetffi__.dlopen(download_testdata(url, lib, module="darknet"))
+
+    return LIB
+
+
+def _darknet_test_image_path():
+    global DARKNET_TEST_IMAGE_PATH
+    if DARKNET_TEST_IMAGE_PATH is None:
+        name = "dog.jpg"
+        url = REPO_URL + "data/" + name + "?raw=true"
+        DARKNET_TEST_IMAGE_PATH = download_testdata(url, name, module="data")
+    return DARKNET_TEST_IMAGE_PATH
 
 
 def astext(program, unify_free_vars=False):
@@ -96,7 +110,7 @@ def _get_tvm_output(net, data, build_dtype="float32", states=None):
 def _load_net(cfg_url, cfg_name, weights_url, weights_name):
     cfg_path = download_testdata(cfg_url, cfg_name, module="darknet")
     weights_path = download_testdata(weights_url, weights_name, module="darknet")
-    net = LIB.load_network(cfg_path.encode("utf-8"), weights_path.encode("utf-8"), 0)
+    net = _lib().load_network(cfg_path.encode("utf-8"), weights_path.encode("utf-8"), 0)
     return net
 
 
@@ -104,7 +118,7 @@ def verify_darknet_frontend(net, build_dtype="float32"):
     """Test network with given input image on both darknet and tvm"""
 
     def get_darknet_output(net, img):
-        LIB.network_predict_image(net, img)
+        _lib().network_predict_image(net, img)
         out = []
         for i in range(net.n):
             layer = net.layers[i]
@@ -147,8 +161,8 @@ def get_darknet_output(net, img):
 
     dtype = "float32"
 
-    img = LIB.letterbox_image(
-        LIB.load_image_color(DARKNET_TEST_IMAGE_PATH.encode("utf-8"), 0, 0), net.w, net.h
+    img = _lib().letterbox_image(
+        _lib().load_image_color(_darknet_test_image_path().encode("utf-8"), 0, 0), net.w, net.h
     )
     darknet_output = get_darknet_output(net, img)
     batch_size = 1
@@ -169,7 +183,7 @@ def _test_rnn_network(net, states):
     """Test network with given input data on both darknet and tvm"""
 
     def get_darknet_network_predict(net, data):
-        return LIB.network_predict(net, data)
+        return _lib().network_predict(net, data)
 
     ffi = FFI()
     np_arr = np.zeros([1, net.inputs], dtype="float32")
@@ -195,7 +209,7 @@ def test_forward_extraction():
     weights_url = "http://pjreddie.com/media/files/" + weights_name + "?raw=true"
     net = _load_net(cfg_url, cfg_name, weights_url, weights_name)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_alexnet():
@@ -207,7 +221,7 @@ def test_forward_alexnet():
     weights_url = "http://pjreddie.com/media/files/" + weights_name + "?raw=true"
     net = _load_net(cfg_url, cfg_name, weights_url, weights_name)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_resnet50():
@@ -219,7 +233,7 @@ def test_forward_resnet50():
     weights_url = "http://pjreddie.com/media/files/" + weights_name + "?raw=true"
     net = _load_net(cfg_url, cfg_name, weights_url, weights_name)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_resnext50():
@@ -231,7 +245,7 @@ def test_forward_resnext50():
     weights_url = "http://pjreddie.com/media/files/" + weights_name + "?raw=true"
     net = _load_net(cfg_url, cfg_name, weights_url, weights_name)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_yolov2():
@@ -244,7 +258,7 @@ def test_forward_yolov2():
     net = _load_net(cfg_url, cfg_name, weights_url, weights_name)
     build_dtype = {}
     verify_darknet_frontend(net, build_dtype)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_yolov3():
@@ -257,88 +271,88 @@ def test_forward_yolov3():
     net = _load_net(cfg_url, cfg_name, weights_url, weights_name)
     build_dtype = {}
     verify_darknet_frontend(net, build_dtype)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_convolutional():
     """test convolutional layer"""
-    net = LIB.make_network(1)
-    layer = LIB.make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0)
+    net = _lib().make_network(1)
+    layer = _lib().make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0)
     net.layers[0] = layer
     net.w = net.h = 224
-    LIB.resize_network(net, 224, 224)
+    _lib().resize_network(net, 224, 224)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_dense():
     """test fully connected layer"""
-    net = LIB.make_network(1)
-    layer = LIB.make_connected_layer(1, 75, 20, 1, 0, 0)
+    net = _lib().make_network(1)
+    layer = _lib().make_connected_layer(1, 75, 20, 1, 0, 0)
     net.layers[0] = layer
     net.w = net.h = 5
-    LIB.resize_network(net, 5, 5)
+    _lib().resize_network(net, 5, 5)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_dense_batchnorm():
     """test fully connected layer with batchnorm"""
-    net = LIB.make_network(1)
-    layer = LIB.make_connected_layer(1, 12, 2, 1, 1, 0)
+    net = _lib().make_network(1)
+    layer = _lib().make_connected_layer(1, 12, 2, 1, 1, 0)
     for i in range(5):
         layer.rolling_mean[i] = np.random.rand(1)
         layer.rolling_variance[i] = np.random.rand(1) + 0.5
         layer.scales[i] = np.random.rand(1)
     net.layers[0] = layer
     net.w = net.h = 2
-    LIB.resize_network(net, 2, 2)
+    _lib().resize_network(net, 2, 2)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_maxpooling():
     """test maxpooling layer"""
-    net = LIB.make_network(1)
-    layer = LIB.make_maxpool_layer(1, 224, 224, 3, 2, 2, 0)
+    net = _lib().make_network(1)
+    layer = _lib().make_maxpool_layer(1, 224, 224, 3, 2, 2, 0)
     net.layers[0] = layer
     net.w = net.h = 224
-    LIB.resize_network(net, 224, 224)
+    _lib().resize_network(net, 224, 224)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_avgpooling():
     """test avgerage pooling layer"""
-    net = LIB.make_network(1)
-    layer = LIB.make_avgpool_layer(1, 224, 224, 3)
+    net = _lib().make_network(1)
+    layer = _lib().make_avgpool_layer(1, 224, 224, 3)
     net.layers[0] = layer
     net.w = net.h = 224
-    LIB.resize_network(net, 224, 224)
+    _lib().resize_network(net, 224, 224)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_conv_batch_norm():
     """test batch normalization layer"""
-    net = LIB.make_network(1)
-    layer = LIB.make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 1, 0, 0, 0)
+    net = _lib().make_network(1)
+    layer = _lib().make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 1, 0, 0, 0)
     for i in range(32):
         layer.rolling_mean[i] = np.random.rand(1)
         layer.rolling_variance[i] = np.random.rand(1) + 0.5
     net.layers[0] = layer
     net.w = net.h = 224
-    LIB.resize_network(net, 224, 224)
+    _lib().resize_network(net, 224, 224)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_shortcut():
     """test shortcut layer"""
-    net = LIB.make_network(3)
-    layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0)
-    layer_2 = LIB.make_convolutional_layer(1, 111, 111, 32, 32, 1, 1, 1, 0, 1, 0, 0, 0, 0)
-    layer_3 = LIB.make_shortcut_layer(1, 0, 111, 111, 32, 111, 111, 32)
+    net = _lib().make_network(3)
+    layer_1 = _lib().make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0)
+    layer_2 = _lib().make_convolutional_layer(1, 111, 111, 32, 32, 1, 1, 1, 0, 1, 0, 0, 0, 0)
+    layer_3 = _lib().make_shortcut_layer(1, 0, 111, 111, 32, 111, 111, 32)
     layer_3.activation = ACTIVATION.RELU
     layer_3.alpha = 1
     layer_3.beta = 1
@@ -346,118 +360,118 @@ def test_forward_shortcut():
     net.layers[1] = layer_2
     net.layers[2] = layer_3
     net.w = net.h = 224
-    LIB.resize_network(net, 224, 224)
+    _lib().resize_network(net, 224, 224)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_reorg():
     """test reorg layer"""
-    net = LIB.make_network(2)
-    layer_1 = LIB.make_convolutional_layer(1, 222, 222, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0)
-    layer_2 = LIB.make_reorg_layer(1, 110, 110, 32, 2, 0, 0, 0)
+    net = _lib().make_network(2)
+    layer_1 = _lib().make_convolutional_layer(1, 222, 222, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0)
+    layer_2 = _lib().make_reorg_layer(1, 110, 110, 32, 2, 0, 0, 0)
     net.layers[0] = layer_1
     net.layers[1] = layer_2
     net.w = net.h = 222
-    LIB.resize_network(net, 222, 222)
+    _lib().resize_network(net, 222, 222)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_region():
     """test region layer"""
-    net = LIB.make_network(2)
-    layer_1 = LIB.make_convolutional_layer(1, 19, 19, 3, 425, 1, 1, 1, 0, 1, 0, 0, 0, 0)
-    layer_2 = LIB.make_region_layer(1, 19, 19, 5, 80, 4)
+    net = _lib().make_network(2)
+    layer_1 = _lib().make_convolutional_layer(1, 19, 19, 3, 425, 1, 1, 1, 0, 1, 0, 0, 0, 0)
+    layer_2 = _lib().make_region_layer(1, 19, 19, 5, 80, 4)
     layer_2.softmax = 1
     net.layers[0] = layer_1
     net.layers[1] = layer_2
     net.w = net.h = 19
-    LIB.resize_network(net, 19, 19)
+    _lib().resize_network(net, 19, 19)
     build_dtype = {}
     verify_darknet_frontend(net, build_dtype)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_yolo_op():
     """test yolo layer"""
-    net = LIB.make_network(2)
-    layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 14, 1, 3, 2, 0, 1, 0, 0, 0, 0)
-    layer_2 = LIB.make_yolo_layer(1, 111, 111, 2, 9, __darknetffi__.NULL, 2)
+    net = _lib().make_network(2)
+    layer_1 = _lib().make_convolutional_layer(1, 224, 224, 3, 14, 1, 3, 2, 0, 1, 0, 0, 0, 0)
+    layer_2 = _lib().make_yolo_layer(1, 111, 111, 2, 9, __darknetffi__.NULL, 2)
     net.layers[0] = layer_1
     net.layers[1] = layer_2
     net.w = net.h = 224
-    LIB.resize_network(net, 224, 224)
+    _lib().resize_network(net, 224, 224)
     build_dtype = {}
     verify_darknet_frontend(net, build_dtype)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_upsample():
     """test upsample layer"""
-    net = LIB.make_network(1)
-    layer = LIB.make_upsample_layer(1, 19, 19, 3, 3)
+    net = _lib().make_network(1)
+    layer = _lib().make_upsample_layer(1, 19, 19, 3, 3)
     layer.scale = 1
     net.layers[0] = layer
     net.w = net.h = 19
-    LIB.resize_network(net, 19, 19)
+    _lib().resize_network(net, 19, 19)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_l2normalize():
     """test l2 normalization layer"""
-    net = LIB.make_network(1)
-    layer = LIB.make_l2norm_layer(1, 224 * 224 * 3)
+    net = _lib().make_network(1)
+    layer = _lib().make_l2norm_layer(1, 224 * 224 * 3)
     layer.c = layer.out_c = 3
     layer.h = layer.out_h = 224
     layer.w = layer.out_w = 224
     net.layers[0] = layer
     net.w = net.h = 224
-    LIB.resize_network(net, 224, 224)
+    _lib().resize_network(net, 224, 224)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_elu():
     """test elu activation layer"""
-    net = LIB.make_network(1)
-    layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0)
+    net = _lib().make_network(1)
+    layer_1 = _lib().make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0)
     layer_1.activation = ACTIVATION.ELU
     net.layers[0] = layer_1
     net.w = net.h = 224
-    LIB.resize_network(net, 224, 224)
+    _lib().resize_network(net, 224, 224)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_softmax():
     """test softmax layer"""
-    net = LIB.make_network(1)
-    layer_1 = LIB.make_softmax_layer(1, 75, 1)
+    net = _lib().make_network(1)
+    layer_1 = _lib().make_softmax_layer(1, 75, 1)
     layer_1.temperature = 1
     net.layers[0] = layer_1
     net.w = net.h = 5
-    LIB.resize_network(net, net.w, net.h)
+    _lib().resize_network(net, net.w, net.h)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_softmax_temperature():
     """test softmax layer"""
-    net = LIB.make_network(1)
-    layer_1 = LIB.make_softmax_layer(1, 75, 1)
+    net = _lib().make_network(1)
+    layer_1 = _lib().make_softmax_layer(1, 75, 1)
     layer_1.temperature = 0.8
     net.layers[0] = layer_1
     net.w = net.h = 5
-    LIB.resize_network(net, net.w, net.h)
+    _lib().resize_network(net, net.w, net.h)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_activation_logistic():
     """test logistic activation layer"""
-    net = LIB.make_network(1)
+    net = _lib().make_network(1)
     batch = 1
     h = 224
     width = 224
@@ -472,7 +486,7 @@ def test_forward_activation_logistic():
     binary = 0
     xnor = 0
     adam = 0
-    layer_1 = LIB.make_convolutional_layer(
+    layer_1 = _lib().make_convolutional_layer(
         batch,
         h,
         width,
@@ -491,14 +505,14 @@ def test_forward_activation_logistic():
     net.layers[0] = layer_1
     net.w = width
     net.h = h
-    LIB.resize_network(net, net.w, net.h)
+    _lib().resize_network(net, net.w, net.h)
     verify_darknet_frontend(net)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 def test_forward_rnn():
     """test RNN layer"""
-    net = LIB.make_network(1)
+    net = _lib().make_network(1)
     batch = 1
     inputs = 4
     outputs = 4
@@ -506,15 +520,17 @@ def test_forward_rnn():
     activation = ACTIVATION.RELU
     batch_normalize = 0
     adam = 0
-    layer_1 = LIB.make_rnn_layer(batch, inputs, outputs, steps, activation, batch_normalize, adam)
+    layer_1 = _lib().make_rnn_layer(
+        batch, inputs, outputs, steps, activation, batch_normalize, adam
+    )
     net.layers[0] = layer_1
     net.inputs = inputs
     net.outputs = outputs
     net.w = net.h = 0
-    LIB.resize_network(net, net.w, net.h)
+    _lib().resize_network(net, net.w, net.h)
     states = {"rnn0_state": np.zeros([1, net.inputs])}
     _test_rnn_network(net, states)
-    LIB.free_network(net)
+    _lib().free_network(net)
 
 
 if __name__ == "__main__":
diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py
index dd1adf0dedd9..ce379b6b2cb3 100644
--- a/tests/scripts/request_hook/request_hook.py
+++ b/tests/scripts/request_hook/request_hook.py
@@ -20,6 +20,8 @@
 import urllib.request
 import logging
 
+from urllib.parse import quote
+
 LOGGER = None
 
 
@@ -30,22 +32,119 @@
     "http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel": f"{BASE}/bvlc_alexnet.caffemodel",
     "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel": f"{BASE}/bvlc_googlenet.caffemodel",
     "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz": f"{BASE}/tf-mobilenet_v1_1.0_224.tgz",
+    "http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03.tar.gz": f"{BASE}/models/object_detection/ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03.tar.gz",
+    "http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz": f"{BASE}/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz",
     "http://images.cocodataset.org/zips/val2017.zip": f"{BASE}/cocodataset-val2017.zip",
+    "http://pjreddie.com/media/files/alexnet.weights?raw=true": f"{BASE}/media/files/alexnet.weights"
+    + quote("?raw=true"),
+    "http://pjreddie.com/media/files/alexnet.weights?raw=true": f"{BASE}/media/files/alexnet.weights"
+    + quote("?raw=true"),
+    "http://pjreddie.com/media/files/extraction.weights?raw=true": f"{BASE}/media/files/extraction.weights"
+    + quote("?raw=true"),
+    "http://pjreddie.com/media/files/extraction.weights?raw=true": f"{BASE}/media/files/extraction.weights"
+    + quote("?raw=true"),
+    "http://pjreddie.com/media/files/resnet50.weights?raw=true": f"{BASE}/media/files/resnet50.weights"
+    + quote("?raw=true"),
+    "http://pjreddie.com/media/files/resnext50.weights?raw=true": f"{BASE}/media/files/resnext50.weights"
+    + quote("?raw=true"),
+    "http://pjreddie.com/media/files/yolov2.weights?raw=true": f"{BASE}/media/files/yolov2.weights"
+    + quote("?raw=true"),
+    "http://pjreddie.com/media/files/yolov3.weights?raw=true": f"{BASE}/media/files/yolov3.weights"
+    + quote("?raw=true"),
+    "http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz": f"{BASE}/imikolov/rnnlm/simple-examples.tgz",
     "https://bj.bcebos.com/x2paddle/models/paddle_resnet50.tar": f"{BASE}/bcebos-paddle_resnet50.tar",
     "https://data.deepai.org/stanfordcars.zip": f"{BASE}/deepai-stanfordcars.zip",
+    "https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth": f"{BASE}/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth",
+    "https://github.com/ARM-software/ML-zoo/blob/48f458af1e9065d9aad2ad94d24b58d6e7c00817/models/keyword_spotting/ds_cnn_small/tflite_int16/ds_cnn_quantized.tflite?raw=true": f"{BASE}/ARM-software/ML-zoo/blob/48f458af1e9065d9aad2ad94d24b58d6e7c00817/models/keyword_spotting/ds_cnn_small/tflite_int16/ds_cnn_quantized.tflite"
+    + quote("?raw=true"),
+    "https://raw.githubusercontent.com/tlc-pack/tophub/main/tophub/adreno_v0.01.log": f"{BASE}/tlc-pack/tophub/main/tophub/adreno_v0.01.log",
     "https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel": f"{BASE}/2022-10-05/MobileNet.mlmodel",
+    "https://docs-assets.developer.apple.com/coreml/models/Resnet50.mlmodel": f"{BASE}/coreml/models/Resnet50.mlmodel",
+    "https://download.pytorch.org/models/deeplabv3_mobilenet_v3_large-fc3c493d.pth": f"{BASE}/models/deeplabv3_mobilenet_v3_large-fc3c493d.pth",
+    "https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth": f"{BASE}/models/deeplabv3_resnet101_coco-586e9e4e.pth",
+    "https://download.pytorch.org/models/densenet121-a639ec97.pth": f"{BASE}/models/densenet121-a639ec97.pth",
+    "https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth": f"{BASE}/models/efficientnet_b4_rwightman-7eb33cd5.pth",
+    "https://download.pytorch.org/models/fcn_resnet101_coco-7ecb50ca.pth": f"{BASE}/models/fcn_resnet101_coco-7ecb50ca.pth",
+    "https://download.pytorch.org/models/googlenet-1378be20.pth": f"{BASE}/models/googlenet-1378be20.pth",
+    "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth": f"{BASE}/models/inception_v3_google-0cc3c7bd.pth",
     "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth": f"{BASE}/2022-10-05/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth",
+    "https://download.pytorch.org/models/mnasnet0.5_top1_67.823-3ffadce67e.pth": f"{BASE}/models/mnasnet0.5_top1_67.823-3ffadce67e.pth",
     "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth": f"{BASE}/2022-10-05/mobilenet_v2-b0353104.pth",
+    "https://download.pytorch.org/models/r3d_18-b3b3357e.pth": f"{BASE}/models/r3d_18-b3b3357e.pth",
     "https://download.pytorch.org/models/resnet18-f37072fd.pth": f"{BASE}/2022-10-05/resnet18-f37072fd.pth",
+    "https://download.pytorch.org/models/resnet50-0676ba61.pth": f"{BASE}/models/resnet50-0676ba61.pth",
+    "https://download.pytorch.org/models/squeezenet1_0-b66bff10.pth": f"{BASE}/models/squeezenet1_0-b66bff10.pth",
+    "https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth": f"{BASE}/models/squeezenet1_1-b8a52dc0.pth",
+    "https://download.pytorch.org/models/vgg16_features-amdegroot-88682ab5.pth": f"{BASE}/models/vgg16_features-amdegroot-88682ab5.pth",
     "https://gist.github.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/93672b029103648953c4e5ad3ac3aadf346a4cdc/super_resolution_0.2.onnx": f"{BASE}/2022-10-05/super_resolution_0.2.onnx",
     "https://gist.githubusercontent.com/zhreshold/4d0b62f3d01426887599d4f7ede23ee5/raw/596b27d23537e5a1b5751d2b0481ef172f58b539/imagenet1000_clsid_to_human.txt": f"{BASE}/2022-10-05/imagenet1000_clsid_to_human.txt",
+    "https://gist.githubusercontent.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/fa7ef0e9c9a5daea686d6473a62aacd1a5885849/cat.png": f"{BASE}/zhreshold/bcda4716699ac97ea44f791c24310193/raw/fa7ef0e9c9a5daea686d6473a62aacd1a5885849/cat.png",
+    "https://github.com/ARM-software/ML-zoo/raw/48a22ee22325d15d2371a6df24eb7d67e21dcc97/models/keyword_spotting/cnn_small/tflite_int8/cnn_s_quantized.tflite": f"{BASE}/ARM-software/ML-zoo/raw/48a22ee22325d15d2371a6df24eb7d67e21dcc97/models/keyword_spotting/cnn_small/tflite_int8/cnn_s_quantized.tflite",
+    "https://github.com/ARM-software/ML-zoo/raw/master/models/keyword_spotting/cnn_small/tflite_int8//cnn_s_quantized.tflite": f"{BASE}/ARM-software/ML-zoo/raw/master/models/keyword_spotting/cnn_small/tflite_int8//cnn_s_quantized.tflite",
+    "https://github.com/czh978/models_for_tvm_test/raw/main/tflite_graph_with_postprocess.pb": f"{BASE}/czh978/models_for_tvm_test/raw/main/tflite_graph_with_postprocess.pb",
+    "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true": f"{BASE}/dmlc/mxnet.js/blob/main/data/cat.png"
+    + quote("?raw=true"),
+    "https://github.com/dmlc/mxnet.js/raw/main/data/cat.png": f"{BASE}/dmlc/mxnet.js/raw/main/data/cat.png",
+    "https://github.com/dmlc/web-data/blob/main/darknet/cfg/yolov3.cfg?raw=true": f"{BASE}/dmlc/web-data/blob/main/darknet/cfg/yolov3.cfg"
+    + quote("?raw=true"),
+    "https://github.com/dmlc/web-data/blob/main/darknet/data/arial.ttf?raw=true": f"{BASE}/dmlc/web-data/blob/main/darknet/data/arial.ttf"
+    + quote("?raw=true"),
+    "https://github.com/dmlc/web-data/blob/main/darknet/data/coco.names?raw=true": f"{BASE}/dmlc/web-data/blob/main/darknet/data/coco.names"
+    + quote("?raw=true"),
+    "https://github.com/dmlc/web-data/blob/main/darknet/data/dog.jpg?raw=true": f"{BASE}/dmlc/web-data/blob/main/darknet/data/dog.jpg"
+    + quote("?raw=true"),
     "https://github.com/dmlc/web-data/blob/main/darknet/data/dog.jpg": f"{BASE}/dog.jpg",
+    "https://github.com/dmlc/web-data/blob/main/darknet/data/person.jpg?raw=true": f"{BASE}/dmlc/web-data/blob/main/darknet/data/person.jpg"
+    + quote("?raw=true"),
+    "https://github.com/dmlc/web-data/blob/main/darknet/lib/libdarknet2.0.so?raw=true": f"{BASE}/dmlc/web-data/blob/main/darknet/lib/libdarknet2.0.so"
+    + quote("?raw=true"),
     "https://github.com/dmlc/web-data/blob/main/gluoncv/detection/street_small.jpg?raw=true": f"{BASE}/2022-10-05/small_street_raw.jpg",
+    "https://github.com/dmlc/web-data/raw/main/darknet/cfg/yolov3.cfg": f"{BASE}/dmlc/web-data/raw/main/darknet/cfg/yolov3.cfg",
+    "https://github.com/dmlc/web-data/raw/main/darknet/data/arial.ttf": f"{BASE}/dmlc/web-data/raw/main/darknet/data/arial.ttf",
+    "https://github.com/dmlc/web-data/raw/main/darknet/data/coco.names": f"{BASE}/dmlc/web-data/raw/main/darknet/data/coco.names",
+    "https://github.com/dmlc/web-data/raw/main/darknet/data/dog.jpg": f"{BASE}/dmlc/web-data/raw/main/darknet/data/dog.jpg",
+    "https://github.com/dmlc/web-data/raw/main/darknet/data/person.jpg": f"{BASE}/dmlc/web-data/raw/main/darknet/data/person.jpg",
+    "https://github.com/dmlc/web-data/raw/main/darknet/lib/libdarknet2.0.so": f"{BASE}/dmlc/web-data/raw/main/darknet/lib/libdarknet2.0.so",
     "https://github.com/dmlc/web-data/raw/main/gluoncv/detection/street_small.jpg": f"{BASE}/2022-10-05/gluon-small-stree.jpg",
+    "https://github.com/dmlc/web-data/raw/main/tensorflow/models/Custom/placeholder.pb": f"{BASE}/dmlc/web-data/raw/main/tensorflow/models/Custom/placeholder.pb",
+    "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/classify_image_graph_def-with_shapes.pb": f"{BASE}/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/classify_image_graph_def-with_shapes.pb",
+    "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/elephant-299.jpg": f"{BASE}/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/elephant-299.jpg",
+    "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/imagenet_2012_challenge_label_map_proto.pbtxt": f"{BASE}/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/imagenet_2012_challenge_label_map_proto.pbtxt",
+    "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/imagenet_synset_to_human_label_map.txt": f"{BASE}/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/imagenet_synset_to_human_label_map.txt",
+    "https://github.com/dmlc/web-data/raw/main/tensorflow/models/RNN/ptb/ptb_model_with_lstmblockcell.pb": f"{BASE}/dmlc/web-data/raw/main/tensorflow/models/RNN/ptb/ptb_model_with_lstmblockcell.pb",
+    "https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/elephant-299.jpg": f"{BASE}/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/elephant-299.jpg",
+    "https://github.com/fernchen/CaffeModels/raw/master/resnet/ResNet-50-deploy.prototxt": f"{BASE}/fernchen/CaffeModels/raw/master/resnet/ResNet-50-deploy.prototxt",
+    "https://github.com/fernchen/CaffeModels/raw/master/resnet/ResNet-50-deploy.prototxt": f"{BASE}/fernchen/CaffeModels/raw/master/resnet/ResNet-50-deploy.prototxt",
+    "https://github.com/fernchen/CaffeModels/raw/master/resnet/ResNet-50-model.caffemodel": f"{BASE}/fernchen/CaffeModels/raw/master/resnet/ResNet-50-model.caffemodel",
+    "https://github.com/google/mediapipe/raw/v0.7.4/mediapipe/models/hand_landmark.tflite": f"{BASE}/google/mediapipe/raw/v0.7.4/mediapipe/models/hand_landmark.tflite",
     "https://github.com/JonathanCMitchell/mobilenet_v2_keras/releases/download/v1.1/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5": f"{BASE}/2022-10-05/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5",
     "https://github.com/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/vision/classification/mnist/model/mnist-1.onnx": f"{BASE}/onnx/mnist-1.onnx",
+    "https://github.com/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/vision/classification/resnet/model/resnet50-v2-7.onnx": f"{BASE}/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/vision/classification/resnet/model/resnet50-v2-7.onnx",
+    "https://github.com/onnx/models/raw/main/vision/classification/mobilenet/model/mobilenetv2-7.onnx": f"{BASE}/onnx/models/raw/main/vision/classification/mobilenet/model/mobilenetv2-7.onnx",
     "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v2-7.onnx": f"{BASE}/2022-10-05/resnet50-v2-7.onnx",
+    "https://github.com/pjreddie/darknet/blob/master/cfg/alexnet.cfg?raw=true": f"{BASE}/pjreddie/darknet/blob/master/cfg/alexnet.cfg"
+    + quote("?raw=true"),
+    "https://github.com/pjreddie/darknet/blob/master/cfg/extraction.cfg?raw=true": f"{BASE}/pjreddie/darknet/blob/master/cfg/extraction.cfg"
+    + quote("?raw=true"),
+    "https://github.com/pjreddie/darknet/blob/master/cfg/resnet50.cfg?raw=true": f"{BASE}/pjreddie/darknet/blob/master/cfg/resnet50.cfg"
+    + quote("?raw=true"),
+    "https://github.com/pjreddie/darknet/blob/master/cfg/resnext50.cfg?raw=true": f"{BASE}/pjreddie/darknet/blob/master/cfg/resnext50.cfg"
+    + quote("?raw=true"),
+    "https://github.com/pjreddie/darknet/blob/master/cfg/yolov2.cfg?raw=true": f"{BASE}/pjreddie/darknet/blob/master/cfg/yolov2.cfg"
+    + quote("?raw=true"),
     "https://github.com/pjreddie/darknet/blob/master/cfg/yolov3-tiny.cfg?raw=true": f"{BASE}/2022-10-05/yolov3-tiny-raw.cfg",
+    "https://github.com/pjreddie/darknet/blob/master/cfg/yolov3.cfg?raw=true": f"{BASE}/pjreddie/darknet/blob/master/cfg/yolov3.cfg"
+    + quote("?raw=true"),
+    "https://github.com/SebastianBoblestETAS/nn_models/blob/ce49c5de64889493161ca4194a20e0fd5eb707e6/lstm_1_in_3_out_2_ts_4.tflite?raw=true": f"{BASE}/SebastianBoblestETAS/nn_models/blob/ce49c5de64889493161ca4194a20e0fd5eb707e6/lstm_1_in_3_out_2_ts_4.tflite"
+    + quote("?raw=true"),
+    "https://github.com/shicai/MobileNet-Caffe/blob/master/mobilenet_v2.caffemodel?raw=true": f"{BASE}/shicai/MobileNet-Caffe/blob/master/mobilenet_v2.caffemodel"
+    + quote("?raw=true"),
+    "https://github.com/shicai/MobileNet-Caffe/raw/master/mobilenet_v2_deploy.prototxt": f"{BASE}/shicai/MobileNet-Caffe/raw/master/mobilenet_v2_deploy.prototxt",
+    "https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/examples/micro_speech/micro_speech.tflite": f"{BASE}/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/examples/micro_speech/micro_speech.tflite",
+    "https://github.com/tlc-pack/web-data/raw/25fe99fb00329a26bd37d3dca723da94316fd34c/testdata/microTVM/model/keyword_spotting_quant.tflite": f"{BASE}/tlc-pack/web-data/raw/25fe99fb00329a26bd37d3dca723da94316fd34c/testdata/microTVM/model/keyword_spotting_quant.tflite",
+    "https://github.com/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy": f"{BASE}/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy",
+    "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy": f"{BASE}/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy",
+    "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite": f"{BASE}/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite",
     "https://github.com/uwsampl/web-data/raw/main/vta/models/synset.txt": f"{BASE}/2022-10-05/synset.txt",
     "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_cora.torch": f"{BASE}/gcn_cora.torch",
     "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg": f"{BASE}/vta_cat.jpg",
@@ -56,26 +155,76 @@
     "https://pjreddie.com/media/files/yolov3.weights": f"{BASE}/yolov3.weights",
     "https://raw.githubusercontent.com/Cadene/pretrained-models.pytorch/master/data/imagenet_classes.txt": f"{BASE}/2022-10-05/imagenet_classes.txt",
     "https://raw.githubusercontent.com/Cadene/pretrained-models.pytorch/master/data/imagenet_synsets.txt": f"{BASE}/2022-10-05/imagenet_synsets.txt",
+    "https://raw.githubusercontent.com/dmlc/mxnet.js/main/data/cat.png": f"{BASE}/dmlc/mxnet.js/main/data/cat.png",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/darknet/cfg/yolov3.cfg": f"{BASE}/dmlc/web-data/main/darknet/cfg/yolov3.cfg",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/darknet/data/arial.ttf": f"{BASE}/dmlc/web-data/main/darknet/data/arial.ttf",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/darknet/data/coco.names": f"{BASE}/dmlc/web-data/main/darknet/data/coco.names",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/darknet/data/dog.jpg": f"{BASE}/dmlc/web-data/main/darknet/data/dog.jpg",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/darknet/data/person.jpg": f"{BASE}/dmlc/web-data/main/darknet/data/person.jpg",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/darknet/lib/libdarknet2.0.so": f"{BASE}/dmlc/web-data/main/darknet/lib/libdarknet2.0.so",
     "https://raw.githubusercontent.com/dmlc/web-data/main/gluoncv/detection/street_small.jpg": f"{BASE}/2022-10-05/small_street.jpg",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/models/InceptionV1/classify_image_graph_def-with_shapes.pb": f"{BASE}/dmlc/web-data/main/tensorflow/models/InceptionV1/classify_image_graph_def-with_shapes.pb",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/models/InceptionV1/elephant-299.jpg": f"{BASE}/dmlc/web-data/main/tensorflow/models/InceptionV1/elephant-299.jpg",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/models/InceptionV1/imagenet_2012_challenge_label_map_proto.pbtxt": f"{BASE}/dmlc/web-data/main/tensorflow/models/InceptionV1/imagenet_2012_challenge_label_map_proto.pbtxt",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/models/InceptionV1/imagenet_synset_to_human_label_map.txt": f"{BASE}/dmlc/web-data/main/tensorflow/models/InceptionV1/imagenet_synset_to_human_label_map.txt",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/models/object_detection/ssd_mobilenet_v1_coco_2018_01_28.tgz": f"{BASE}/dmlc/web-data/main/tensorflow/models/object_detection/ssd_mobilenet_v1_coco_2018_01_28.tgz",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/models/Quantized/inception_v1_quantized.tflite": f"{BASE}/dmlc/web-data/main/tensorflow/models/Quantized/inception_v1_quantized.tflite",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/models/Quantized/mobilenet_v2_quantized.tflite": f"{BASE}/dmlc/web-data/main/tensorflow/models/Quantized/mobilenet_v2_quantized.tflite",
+    "https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/models/Quantized/resnet_50_quantized.tflite": f"{BASE}/dmlc/web-data/main/tensorflow/models/Quantized/resnet_50_quantized.tflite",
     "https://raw.githubusercontent.com/dmlc/web-data/master/gluoncv/detection/street_small.jpg": f"{BASE}/2022-10-05/street_small.jpg",
     "https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/lite/java/demo/app/src/main/assets/labels_mobilenet_quant_v1_224.txt": f"{BASE}/2022-10-05/labels_mobilenet_quant_v1_224.txt",
+    "https://raw.githubusercontent.com/tlc-pack/tophub/main/tophub/arm_cpu_v0.08.log": f"{BASE}/tlc-pack/tophub/main/tophub/arm_cpu_v0.08.log",
+    "https://raw.githubusercontent.com/tlc-pack/tophub/main/tophub/cuda_v0.10.log": f"{BASE}/tlc-pack/tophub/main/tophub/cuda_v0.10.log",
+    "https://raw.githubusercontent.com/tlc-pack/tophub/main/tophub/llvm_v0.04.log": f"{BASE}/tlc-pack/tophub/main/tophub/llvm_v0.04.log",
     "https://raw.githubusercontent.com/tlc-pack/tophub/main/tophub/mali_v0.06.log": f"{BASE}/2022-10-05/mali_v0.06.log",
+    "https://raw.githubusercontent.com/tlc-pack/tophub/main/tophub/opencl_v0.04.log": f"{BASE}/tlc-pack/tophub/main/tophub/opencl_v0.04.log",
+    "https://raw.githubusercontent.com/tlc-pack/tophub/main/tophub/vta_v0.10.log": f"{BASE}/tlc-pack/tophub/main/tophub/vta_v0.10.log",
+    "https://raw.githubusercontent.com/tlc-pack/web-data/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy": f"{BASE}/tlc-pack/web-data/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy",
+    "https://raw.githubusercontent.com/tlc-pack/web-data/main/testdata/microTVM/model/keyword_spotting_quant.tflite": f"{BASE}/tlc-pack/web-data/main/testdata/microTVM/model/keyword_spotting_quant.tflite",
     "https://s3.amazonaws.com/model-server/inputs/kitten.jpg": f"{BASE}/2022-10-05/kitten.jpg",
     "https://s3.amazonaws.com/onnx-model-zoo/synset.txt": f"{BASE}/2022-10-05/synset-s3.txt",
+    "https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz": f"{BASE}/download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz",
+    "https://storage.googleapis.com/download.tensorflow.org/models/inception_v4_299_quant_20181026.tgz": f"{BASE}/download.tensorflow.org/models/inception_v4_299_quant_20181026.tgz",
+    "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz": f"{BASE}/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz",
+    "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz": f"{BASE}/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
+    "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz": f"{BASE}/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz",
+    "https://storage.googleapis.com/download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz": f"{BASE}/download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz",
     "https://storage.googleapis.com/download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz": f"{BASE}/2022-10-05/mobilenet_v2_1.0_224_quant.tgz",
+    "https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip": f"{BASE}/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip",
+    "https://storage.googleapis.com/download.tensorflow.org/models/tflite/digit_classifier/mnist.tflite": f"{BASE}/download.tensorflow.org/models/tflite/digit_classifier/mnist.tflite",
+    "https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v3_2018_04_27.tgz": f"{BASE}/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v3_2018_04_27.tgz",
+    "https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz": f"{BASE}/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz",
+    "https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz": f"{BASE}/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz",
+    "https://storage.googleapis.com/fast-convnets/tflite-models/mbv1_140_90_12b4_720.tflite": f"{BASE}/fast-convnets/tflite-models/mbv1_140_90_12b4_720.tflite",
+    "https://storage.googleapis.com/fast-convnets/tflite-models/mbv2_200_85_11-16b2_744.tflite": f"{BASE}/fast-convnets/tflite-models/mbv2_200_85_11-16b2_744.tflite",
+    "https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.4_224.tgz": f"{BASE}/mobilenet_v2/checkpoints/mobilenet_v2_1.4_224.tgz",
+    "https://storage.googleapis.com/mobilenet_v3/checkpoints/v3-large_224_1.0_float.tgz": f"{BASE}/mobilenet_v3/checkpoints/v3-large_224_1.0_float.tgz",
+    "https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/mobilenet_1_0_224_tf_no_top.h5": f"{BASE}/tensorflow/keras-applications/mobilenet/mobilenet_1_0_224_tf_no_top.h5",
+    "https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/mobilenet_1_0_224_tf.h5": f"{BASE}/tensorflow/keras-applications/mobilenet/mobilenet_1_0_224_tf.h5",
     "https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/mobilenet_2_5_128_tf.h5": f"{BASE}/2022-10-05/mobilenet_2_5_128_tf.h5",
-    "https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5": f"{BASE}/2022-10-05/resnet50_weights_tf_dim_ordering_tf_kernels.h5",
+    "https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5": f"{BASE}/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5",
+    "https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5": f"{BASE}/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5",
+    "https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels.h5": f"{BASE}/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels.h5",
+    "https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz": f"{BASE}/tensorflow/tf-keras-datasets/mnist.npz",
 }
 
 
 class TvmRequestHook(urllib.request.Request):
     def __init__(self, url, *args, **kwargs):
         LOGGER.info(f"Caught access to {url}")
-        if url in URL_MAP:
-            new_url = URL_MAP[url]
-            LOGGER.info(f"Mapped URL {url} to {new_url}")
-        else:
-            new_url = url
+        url = url.strip()
+        if url not in URL_MAP and not url.startswith(BASE):
+            # Dis-allow any accesses that aren't going through S3
+            msg = (
+                f"Uncaught URL found in CI: {url}. "
+                "A committer must upload the relevant file to S3 via"
+                "https://github.com/apache/tvm/actions/workflows/upload_ci_resource.yml"
+                "and add it to the mapping in tests/scripts/request_hook/request_hook.py"
+            )
+            raise RuntimeError(msg)
+
+        new_url = URL_MAP[url]
+        LOGGER.info(f"Mapped URL {url} to {new_url}")
         super().__init__(new_url, *args, **kwargs)
 
 
From 7bc41ecca26204112f266d146371d0e867ec1240 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Sun, 4 Dec 2022 13:40:33 -0600
Subject: [PATCH 689/704] [Logging] Mark LogFatal::~LogFatal as [[noreturn]]
 (#13542)

Using `LOG(FATAL) << "some error message";` throws an exception when
the internal `LogFatal` object is constructed.  However, the gcc warning
"control reaches end of non-void function" resulted in a common
pattern where a `LOG(FATAL)` statement was immediately followed by
return of a default value, in order to suppress the warning.

This commit marks the `LogFatal::~LogFatal` destructor with the C++11
attribute `[[noreturn]]`, indicating that calls into it will not
resume the normal control flow.  This suppresses the warning without
requiring the extra `return` statement after `LOG(FATAL)`, as the
compiler knows that control flow will not resume after `LOG(FATAL)`.
---
 apps/dso_plugin_module/plugin_module.cc          |  1 -
 include/tvm/runtime/data_type.h                  |  1 -
 include/tvm/runtime/device_api.h                 |  1 -
 include/tvm/runtime/logging.h                    |  6 ++++--
 include/tvm/runtime/packed_func.h                |  1 -
 include/tvm/tir/expr_functor.h                   |  1 -
 include/tvm/tir/op.h                             |  1 -
 include/tvm/tir/op_attr_types.h                  |  1 -
 include/tvm/tir/stmt.h                           |  1 -
 include/tvm/tir/stmt_functor.h                   |  1 -
 src/arith/bound_deducer.cc                       |  1 -
 src/arith/iter_affine_map.cc                     |  2 --
 src/arith/narrow_predicate_expression.cc         |  1 -
 src/arith/transitive_comparison_analyzer.cc      |  3 ---
 src/auto_scheduler/search_policy/utils.h         |  2 --
 src/auto_scheduler/utils.h                       |  1 -
 src/ir/function.cc                               |  1 -
 src/ir/module.cc                                 |  1 -
 src/meta_schedule/module_equality.cc             |  1 -
 src/parser/meta_ref.cc                           |  1 -
 src/parser/parser.cc                             |  1 -
 src/parser/token.h                               |  2 --
 src/printer/tvmscript_printer.cc                 |  3 ---
 src/relay/backend/aot/aot_lower_main.cc          |  1 -
 src/relay/backend/aot_executor_codegen.cc        |  1 -
 src/relay/backend/contrib/codegen_c/codegen.cc   |  1 -
 .../backend/contrib/codegen_json/codegen_json.h  |  1 -
 src/relay/backend/contrib/cutlass/codegen.cc     |  2 --
 src/relay/backend/contrib/dnnl/codegen.cc        |  3 ---
 src/relay/backend/graph_executor_codegen.cc      |  8 --------
 src/relay/backend/interpreter.cc                 |  6 ------
 src/relay/backend/te_compiler_cache.cc           |  6 ------
 src/relay/backend/vm/compiler.cc                 |  1 -
 src/relay/op/nn/sparse.cc                        |  2 --
 src/relay/qnn/utils.h                            |  2 --
 src/relay/transforms/defunctionalization.cc      |  1 -
 src/relay/transforms/fold_constant.cc            |  1 -
 src/relay/transforms/fold_scale_axis.cc          |  2 --
 src/relay/transforms/infer_layout_utils.cc       |  1 -
 src/relay/transforms/to_mixed_precision.cc       |  1 -
 src/runtime/contrib/arm_compute_lib/acl_utils.cc |  1 -
 src/runtime/contrib/bnns/bnns_json_runtime.cc    |  1 -
 src/runtime/contrib/clml/clml_runtime.cc         |  2 --
 src/runtime/contrib/cublas/cublas_utils.h        |  1 -
 src/runtime/contrib/dnnl/dnnl_tensor_requisite.h |  1 -
 src/runtime/contrib/tflite/tflite_runtime.cc     |  1 -
 .../graph_executor/debug/graph_executor_debug.cc |  1 -
 src/runtime/hexagon/hexagon_buffer.cc            |  1 -
 src/runtime/hexagon/hexagon_module.cc            |  1 -
 src/runtime/logging.cc                           |  1 -
 src/runtime/module.cc                            |  2 --
 src/runtime/opencl/opencl_common.h               |  1 -
 src/runtime/opencl/opencl_device_api.cc          |  1 -
 src/runtime/pack_args.h                          |  1 -
 src/runtime/pipeline/pipeline_executor.cc        |  1 -
 src/runtime/pipeline/pipeline_struct.h           |  3 ---
 src/runtime/rpc/rpc_event_impl.cc                |  6 ++----
 src/runtime/rpc/rpc_module.cc                    |  1 -
 src/runtime/rpc/rpc_session.cc                   |  1 -
 src/runtime/stackvm/stackvm.cc                   |  1 -
 src/runtime/stackvm/stackvm.h                    |  4 ----
 src/runtime/thread_storage_scope.h               |  2 --
 src/runtime/vm/executable.cc                     |  2 --
 src/runtime/vm/vm.cc                             |  1 -
 src/runtime/vulkan/vulkan_device.cc              |  1 -
 src/support/base64.h                             |  1 -
 src/support/scalars.cc                           |  2 --
 src/support/socket.h                             |  1 -
 src/target/llvm/codegen_amdgpu.cc                |  1 -
 src/target/llvm/codegen_cpu.cc                   |  2 --
 src/target/llvm/codegen_llvm.cc                  | 16 ++--------------
 src/target/llvm/codegen_nvptx.cc                 |  1 -
 src/target/source/ptx.cc                         |  2 --
 src/target/spirv/codegen_spirv.cc                |  2 --
 src/target/target.cc                             |  1 -
 src/te/autodiff/jacobian.cc                      |  1 -
 src/te/schedule/schedule_dataflow_rewrite.cc     |  1 -
 src/tir/analysis/stmt_finding.cc                 |  1 -
 src/tir/ir/expr_functor.cc                       |  1 -
 src/tir/ir/stmt_functor.cc                       |  3 ---
 src/tir/op/op.cc                                 |  6 ------
 src/tir/schedule/primitive/cache_index.cc        |  1 -
 src/tir/schedule/primitive/cache_read_write.cc   |  3 ---
 src/tir/schedule/primitive/compute_inline.cc     |  2 --
 .../schedule/primitive/layout_transformation.cc  |  1 -
 src/tir/transforms/bf16_legalize.cc              |  2 --
 src/tir/transforms/bound_checker.cc              |  2 --
 src/tir/transforms/inject_double_buffer.cc       |  2 --
 src/tir/transforms/inject_virtual_thread.cc      |  4 ----
 src/tir/transforms/ir_utils.cc                   |  2 --
 src/tir/transforms/lift_attr_scope.cc            |  1 -
 src/tir/transforms/lower_custom_datatypes.cc     |  2 --
 src/tir/transforms/lower_thread_allreduce.cc     |  2 --
 src/tir/transforms/lower_warp_memory.cc          |  2 --
 .../merge_dynamic_shared_memory_allocations.cc   |  2 --
 src/tir/transforms/narrow_datatype.cc            |  2 --
 src/tir/transforms/renew_defs.cc                 |  2 --
 src/tir/transforms/rewrite_unsafe_select.cc      |  1 -
 src/tir/transforms/simplify.cc                   |  1 -
 src/tir/transforms/split_host_device.cc          |  2 --
 src/tir/transforms/storage_flatten.cc            |  5 -----
 src/tir/transforms/storage_rewrite.cc            |  4 ----
 src/tir/transforms/thread_storage_sync.cc        |  2 --
 src/tir/transforms/unroll_loop.cc                |  1 -
 .../transforms/update_pointer_storage_scope.cc   |  2 --
 src/tir/transforms/vectorize_loop.cc             |  6 ------
 src/tir/usmp/algo/hill_climb.cc                  |  1 -
 vta/runtime/runtime.cc                           |  1 -
 web/emcc/webgpu_runtime.cc                       | 16 +++-------------
 109 files changed, 11 insertions(+), 219 deletions(-)

diff --git a/apps/dso_plugin_module/plugin_module.cc b/apps/dso_plugin_module/plugin_module.cc
index eed11f855693..bcf37fe760fd 100644
--- a/apps/dso_plugin_module/plugin_module.cc
+++ b/apps/dso_plugin_module/plugin_module.cc
@@ -43,7 +43,6 @@ class MyModuleNode : public ModuleNode {
       return TypedPackedFunc<int(int)>([sptr_to_self, this](int value) { return value_ * value; });
     } else {
       LOG(FATAL) << "unknown function " << name;
-      return PackedFunc();
     }
   }
 
diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h
index 7f68ce2ad5bb..089147798a0e 100644
--- a/include/tvm/runtime/data_type.h
+++ b/include/tvm/runtime/data_type.h
@@ -310,7 +310,6 @@ inline const char* DLDataTypeCode2Str(DLDataTypeCode type_code) {
       return "bfloat";
     default:
       LOG(FATAL) << "unknown type_code=" << static_cast<int>(type_code);
-      return "";
   }
 }
 
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index e517eb0d7f34..d3c2f9ba3857 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -289,7 +289,6 @@ inline const char* DeviceName(int type) {
       return "microdev";
     default:
       LOG(FATAL) << "unknown type =" << type;
-      return "Unknown";
   }
 }
 
diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h
index 2128fc485ba4..f0c303f7d7a3 100644
--- a/include/tvm/runtime/logging.h
+++ b/include/tvm/runtime/logging.h
@@ -353,7 +353,7 @@ class LogFatal {
 #pragma disagnostic push
 #pragma warning(disable : 4722)
 #endif
-  ~LogFatal() TVM_THROW_EXCEPTION { GetEntry().Finalize(); }
+  [[noreturn]] ~LogFatal() TVM_THROW_EXCEPTION { GetEntry().Finalize(); }
 #ifdef _MSC_VER
 #pragma disagnostic pop
 #endif
@@ -366,7 +366,9 @@ class LogFatal {
       this->file_ = file;
       this->lineno_ = lineno;
     }
-    TVM_NO_INLINE dmlc::Error Finalize() { throw InternalError(file_, lineno_, stream_.str()); }
+    [[noreturn]] TVM_NO_INLINE dmlc::Error Finalize() {
+      throw InternalError(file_, lineno_, stream_.str());
+    }
     std::ostringstream stream_;
     std::string file_;
     int lineno_;
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 326f9661dfde..a4054c71f335 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -1256,7 +1256,6 @@ inline const char* ArgTypeCode2Str(int type_code) {
       return "ObjectRValueRefArg";
     default:
       LOG(FATAL) << "unknown type_code=" << static_cast<int>(type_code);
-      return "";
   }
 }
 
diff --git a/include/tvm/tir/expr_functor.h b/include/tvm/tir/expr_functor.h
index b5f1d64a00c4..e148d5834f95 100644
--- a/include/tvm/tir/expr_functor.h
+++ b/include/tvm/tir/expr_functor.h
@@ -153,7 +153,6 @@ class ExprFunctor<R(const PrimExpr& n, Args...)> {
   virtual R VisitExpr_(const AnyNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExprDefault_(const Object* op, Args...) {
     LOG(FATAL) << "Do not have a default for " << op->GetTypeKey();
-    return R();
   }
 
  private:
diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h
index 0939e25efddf..9b48b0ccebd1 100644
--- a/include/tvm/tir/op.h
+++ b/include/tvm/tir/op.h
@@ -931,7 +931,6 @@ inline PrimExpr MakeConstScalar(DataType t, ValueType value, Span span = Span())
     return FloatImm(t, static_cast<double>(value), span);
   }
   LOG(FATAL) << "cannot make const for type " << t;
-  return PrimExpr();
 }
 
 template <>
diff --git a/include/tvm/tir/op_attr_types.h b/include/tvm/tir/op_attr_types.h
index fa409b27d12a..2dc174f7d2a1 100644
--- a/include/tvm/tir/op_attr_types.h
+++ b/include/tvm/tir/op_attr_types.h
@@ -119,7 +119,6 @@ inline std::ostream& operator<<(std::ostream& os, CallEffectKind side_effect) {
 
     default:
       LOG(FATAL) << "Unknown CallEffectKind: " << static_cast<int>(side_effect);
-      return os;
   }
 }
 
diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index 5beea44cdb1a..dc257b1e8a21 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -1649,7 +1649,6 @@ inline const char* ForKind2String(ForKind t) {
       return "thread_binding";
   }
   LOG(FATAL) << "Unknown ForKind" << t;
-  return "Unknown";
 }
 
 }  // namespace tir
diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index 9f4b4b40e4cd..3adb186fd561 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -103,7 +103,6 @@ class StmtFunctor<R(const Stmt& n, Args... args)> {
   virtual R VisitStmt_(const BlockRealizeNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmtDefault_(const Object* op, Args...) {
     LOG(FATAL) << "Do not have a default for " << op->GetTypeKey();
-    return R();
   }
 
  private:
diff --git a/src/arith/bound_deducer.cc b/src/arith/bound_deducer.cc
index ba6b11dbb71b..d4a3101378b0 100644
--- a/src/arith/bound_deducer.cc
+++ b/src/arith/bound_deducer.cc
@@ -216,7 +216,6 @@ CompareOp BoundDeducer::ReverseOp(CompareOp comp_op) {
       return kGreater;
     default:
       LOG(FATAL) << "Not a valid compare op";
-      return kGreater;  // return some default value
   }
 }
 
diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
index 7529019abda8..fa4f0fd5147b 100644
--- a/src/arith/iter_affine_map.cc
+++ b/src/arith/iter_affine_map.cc
@@ -1330,7 +1330,6 @@ IterSumExpr IterMapRewriter::PreprocessDividend(IterMapExpr dividend, PrimExpr o
     return fused;
   } else {
     LOG(FATAL) << "Unsupported subclass of IterMarkExpr";
-    return IterSumExpr();
   }
 }
 
@@ -1855,7 +1854,6 @@ class SubspaceDivider {
         return IterSplitExpr(IterMark(GetRef<IterSumExpr>(op), extent));
       } else {
         LOG(FATAL) << "Unknown IterMapExpr type";
-        return NullValue<IterSplitExpr>();
       }
     }
   };
diff --git a/src/arith/narrow_predicate_expression.cc b/src/arith/narrow_predicate_expression.cc
index 1c8931d2dec4..40c7ab3c54ac 100644
--- a/src/arith/narrow_predicate_expression.cc
+++ b/src/arith/narrow_predicate_expression.cc
@@ -192,7 +192,6 @@ class ExpressionNarrower : public tir::ExprMutator {
 
       default:
         LOG(FATAL) << "Unhandled Context, all legal values should be handled";
-        return Context::Maximize;
     }
   }
 
diff --git a/src/arith/transitive_comparison_analyzer.cc b/src/arith/transitive_comparison_analyzer.cc
index 36c2fb77074c..52010ec322c8 100644
--- a/src/arith/transitive_comparison_analyzer.cc
+++ b/src/arith/transitive_comparison_analyzer.cc
@@ -321,7 +321,6 @@ CompareResult Reverse(CompareResult res) {
       return CompareResult::kUnknown;
     default:
       LOG(FATAL) << "Invalid CompareResult: " << static_cast<int>(res);
-      return CompareResult::kInconsistent;
   }
 }
 
@@ -864,11 +863,9 @@ CompareResult TransitiveComparisonAnalyzer::Impl::MergeComparisons(
       case CompareResult::kGT:
       case CompareResult::kLT:
         LOG(FATAL) << "Internal error, normalized comparisons should only include <= and >=";
-        return CompareResult::kInconsistent;
 
       default:
         LOG(FATAL) << "Invalid CompareResult: " << static_cast<int>(cmp.result_);
-        return CompareResult::kInconsistent;
     }
   }
 
diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h
index ca8979c0e829..76069d61b490 100644
--- a/src/auto_scheduler/search_policy/utils.h
+++ b/src/auto_scheduler/search_policy/utils.h
@@ -94,7 +94,6 @@ inline int OperationToStage(const te::Operation& op, const State& state) {
     }
   }
   LOG(FATAL) << "Cannot find op: " << op;
-  return -1;
 }
 
 /********** Get Parameters **********/
@@ -536,7 +535,6 @@ inline Iterator GetLastReduceIteratorInOutermostReduceTile(const Stage& stage) {
   }
 
   LOG(FATAL) << "Cannot find the iterator.";
-  return stage->iters[0];
 }
 
 /*! \brief Get the target stage id of a history step in the new state.
diff --git a/src/auto_scheduler/utils.h b/src/auto_scheduler/utils.h
index f8c00d924dd1..f55cad00e4cc 100755
--- a/src/auto_scheduler/utils.h
+++ b/src/auto_scheduler/utils.h
@@ -89,7 +89,6 @@ inline int GetIndex(const Array<T>& array, const T& to_locate) {
     }
   }
   LOG(FATAL) << "Cannot find the item";
-  return -1;
 }
 
 /*! \brief Delete the item in a std::vector if it exists. */
diff --git a/src/ir/function.cc b/src/ir/function.cc
index c0cda704c424..dcfddd5f69d5 100644
--- a/src/ir/function.cc
+++ b/src/ir/function.cc
@@ -45,7 +45,6 @@ TVM_REGISTER_GLOBAL("ir.BaseFuncWithAttr")
         return WithAttr(Downcast<relay::Function>(std::move(func)), key, value);
       } else {
         LOG(FATAL) << "Do not support function type " << func->GetTypeKey();
-        return func;
       }
     });
 
diff --git a/src/ir/module.cc b/src/ir/module.cc
index 8d6de5a536a7..def94a046855 100644
--- a/src/ir/module.cc
+++ b/src/ir/module.cc
@@ -172,7 +172,6 @@ Constructor IRModuleNode::GetConstructor(const String& adt, const String& cons)
   }
 
   LOG(FATAL) << adt << " does not contain constructor " << cons;
-  return {};
 }
 
 tvm::Array<GlobalTypeVar> IRModuleNode::GetGlobalTypeVars() const {
diff --git a/src/meta_schedule/module_equality.cc b/src/meta_schedule/module_equality.cc
index f9ffe82aa271..f5757adf08a8 100644
--- a/src/meta_schedule/module_equality.cc
+++ b/src/meta_schedule/module_equality.cc
@@ -104,7 +104,6 @@ std::unique_ptr<ModuleEquality> ModuleEquality::Create(const std::string& mod_eq
     return std::make_unique<ModuleEqualityAnchorBlock>();
   }
   LOG(FATAL) << "Unknown module equality " << mod_eq_name;
-  return nullptr;
 }
 
 }  // namespace meta_schedule
diff --git a/src/parser/meta_ref.cc b/src/parser/meta_ref.cc
index c74b396900d8..6b0e8d0c5966 100644
--- a/src/parser/meta_ref.cc
+++ b/src/parser/meta_ref.cc
@@ -43,7 +43,6 @@ TVM_REGISTER_NODE_TYPE(MetaRefAttrs);
 bool MetaRefRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                 const TypeReporter& reporter) {
   LOG(FATAL) << "need to expand before type checking";
-  return true;
 }
 
 RELAY_REGISTER_OP("parser.MetaRef")
diff --git a/src/parser/parser.cc b/src/parser/parser.cc
index 548b5e90ff65..fe89857f2709 100644
--- a/src/parser/parser.cc
+++ b/src/parser/parser.cc
@@ -541,7 +541,6 @@ class Parser {
       return support::FloatImmToNDArray(Downcast<tvm::FloatImm>(token->data));
     } else {
       LOG(FATAL) << "internal error: should only call this function on numeric tokens";
-      return {};
     }
   }
 
diff --git a/src/parser/token.h b/src/parser/token.h
index 14e553d358f4..48a1bf70a250 100644
--- a/src/parser/token.h
+++ b/src/parser/token.h
@@ -216,7 +216,6 @@ std::string ToString(const TokenType& token_type) {
     // Older compilers warn even though the above code is exhaustive.
     default:
       LOG(FATAL) << "unreachable code";
-      return "";
   }
 }
 
@@ -339,7 +338,6 @@ std::string Pretty(const TokenType& token_type) {
     // Older compilers warn even though the above code is exhaustive.
     default:
       LOG(FATAL) << "unreachable code";
-      return "";
   }
 }
 
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index 05e514295c04..7fb1129d274e 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -756,18 +756,15 @@ Doc TVMScriptPrinter::Print(const ObjectRef& node) {
     return PrintTarget(node.as<TargetNode>());
   } else {
     LOG(FATAL) << "Do not know how to print " << node->GetTypeKey();
-    return Doc();
   }
 }
 
 Doc TVMScriptPrinter::VisitExprDefault_(const Object* op, ExprPrecedence* out_precedence) {
   LOG(FATAL) << "Do not know how to print " << op->GetTypeKey();
-  return Doc();
 }
 
 Doc TVMScriptPrinter::VisitStmtDefault_(const Object* op) {
   LOG(FATAL) << "Do not know how to print " << op->GetTypeKey();
-  return Doc();
 }
 
 Doc TVMScriptPrinter::VisitExpr_(const IntImmNode* op, ExprPrecedence* out_precedence) {
diff --git a/src/relay/backend/aot/aot_lower_main.cc b/src/relay/backend/aot/aot_lower_main.cc
index 2a4dfb84ddcf..fb13e8b66e5d 100644
--- a/src/relay/backend/aot/aot_lower_main.cc
+++ b/src/relay/backend/aot/aot_lower_main.cc
@@ -303,7 +303,6 @@ class AOTMainLowerer : public MixedModeVisitor {
       // TODO(mbs): device_copy cleaunp
       // Suspect treating as no-op is better since already built into the StorageInfo?
       LOG(FATAL) << "The AOT executor does not currently support device_copy";
-      return;
     }
 
     // At this point we should only see calls of the form call_lowered(@callee, (args...)),
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 3c0ab7c16f23..65088e38a563 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -618,7 +618,6 @@ class AOTExecutorCodegen : public MixedModeVisitor {
       // TODO(mbs): device_copy cleaunp
       // Suspect treating as no-op is better since already built into the StorageInfo?
       LOG(FATAL) << "The AOT executor does not currently support device_copy";
-      return;
     }
 
     // At this point we should only see calls of the form call_lowered(@callee, (args...)),
diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc
index dee3f939c50a..de41807431b0 100644
--- a/src/relay/backend/contrib/codegen_c/codegen.cc
+++ b/src/relay/backend/contrib/codegen_c/codegen.cc
@@ -76,7 +76,6 @@ class CodegenC : public backend::MemoizedExprTranslator<std::vector<Output>>, pu
  private:
   std::vector<Output> VisitExprDefault_(const Object* op) override {
     LOG(FATAL) << "C codegen doesn't support: " << op->GetTypeKey();
-    return {};
   }
 
   std::vector<Output> VisitExpr_(const VarNode* node) override {
diff --git a/src/relay/backend/contrib/codegen_json/codegen_json.h b/src/relay/backend/contrib/codegen_json/codegen_json.h
index de6d0f74061b..c1cde2a03bf2 100644
--- a/src/relay/backend/contrib/codegen_json/codegen_json.h
+++ b/src/relay/backend/contrib/codegen_json/codegen_json.h
@@ -250,7 +250,6 @@ class JSONSerializer : public MemoizedExprTranslator<std::vector<JSONGraphNodeEn
 
   std::vector<JSONGraphNodeEntry> VisitExprDefault_(const Object* op) {
     LOG(FATAL) << "JSON runtime currently doesn't support " << op->GetTypeKey();
-    return {};
   }
 
   std::vector<JSONGraphNodeEntry> VisitExpr_(const VarNode* vn) {
diff --git a/src/relay/backend/contrib/cutlass/codegen.cc b/src/relay/backend/contrib/cutlass/codegen.cc
index 2e76ab1cbbf6..173dcf5e5fcb 100644
--- a/src/relay/backend/contrib/cutlass/codegen.cc
+++ b/src/relay/backend/contrib/cutlass/codegen.cc
@@ -531,7 +531,6 @@ class CodegenCutlass : public backend::MemoizedExprTranslator<std::vector<Output
 
   std::vector<Output> VisitExprDefault_(const Object* op) final {
     LOG(FATAL) << "Cutlass codegen doesn't support: " << op->GetTypeKey();
-    return {};
   }
 
   std::vector<Output> VisitExpr_(const VarNode* node) final {
@@ -730,7 +729,6 @@ class CodegenCutlass : public backend::MemoizedExprTranslator<std::vector<Output
     }
 
     LOG(FATAL) << "Unknown composite function: " << pattern_name;
-    return {};
   }
 
   GenerateBodyOutput GenerateBody(const CallNode* root_call, const std::string& func_name,
diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc
index cbd11b4542fc..74cd19b3aaba 100644
--- a/src/relay/backend/contrib/dnnl/codegen.cc
+++ b/src/relay/backend/contrib/dnnl/codegen.cc
@@ -160,7 +160,6 @@ class CodegenDNNL : public MemoizedExprTranslator<std::vector<Output>>, public C
 
   std::vector<Output> VisitExprDefault_(const Object* op) final {
     LOG(FATAL) << "DNNL codegen doesn't support: " << op->GetTypeKey();
-    return {};
   }
 
   std::vector<Output> VisitExpr_(const VarNode* node) final {
@@ -262,7 +261,6 @@ class CodegenDNNL : public MemoizedExprTranslator<std::vector<Output>>, public C
     }
 
     LOG(FATAL) << "Unsupported op: " << AsText(call->op, false);
-    return {};
   }
 
   GenerateBodyOutput GenerateCompositeFunctionCall(const FunctionNode* callee,
@@ -282,7 +280,6 @@ class CodegenDNNL : public MemoizedExprTranslator<std::vector<Output>>, public C
     }
 
     LOG(FATAL) << "Unknown composite function:" << pattern_name;
-    return {};
   }
 
   GenerateBodyOutput GenerateBody(const CallNode* root_call, const std::string& func_name,
diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc
index ab725d82e676..78d4dde19a29 100644
--- a/src/relay/backend/graph_executor_codegen.cc
+++ b/src/relay/backend/graph_executor_codegen.cc
@@ -493,15 +493,12 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
 
   std::vector<GraphNodeRef> VisitExpr_(const OpNode* op) override {
     LOG(FATAL) << "All OpNodes should have been expanded";
-    return {};
   }
   std::vector<GraphNodeRef> VisitExpr_(const GlobalVarNode* op) override {
     LOG(FATAL) << "All GlobalVarNodes should be removed before graph executor's Codegen is called";
-    return {};
   }
   std::vector<GraphNodeRef> VisitExpr_(const IfNode* op) override {
     LOG(FATAL) << "Graph executor does not support control flow (found IfNode)";
-    return {};
   }
   std::vector<GraphNodeRef> VisitExpr_(const FunctionNode* op) override {
     ICHECK(op->GetAttr<String>(attr::kCompiler).defined())
@@ -510,23 +507,18 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
   }
   std::vector<GraphNodeRef> VisitExpr_(const RefCreateNode* op) override {
     LOG(FATAL) << "Graph executor does not support references (found RefCreateNode)";
-    return {};
   }
   std::vector<GraphNodeRef> VisitExpr_(const RefReadNode* op) override {
     LOG(FATAL) << "Graph executor does not support references (found RefReadNode)";
-    return {};
   }
   std::vector<GraphNodeRef> VisitExpr_(const RefWriteNode* op) override {
     LOG(FATAL) << "Graph executor does not support references (found RefWriteNode)";
-    return {};
   }
   std::vector<GraphNodeRef> VisitExpr_(const ConstructorNode* op) override {
     LOG(FATAL) << "Graph executor does not support ADTs (found ConstructorNode)";
-    return {};
   }
   std::vector<GraphNodeRef> VisitExpr_(const MatchNode* op) override {
     LOG(FATAL) << "Graph executor does not support matching (found MatchNode)";
-    return {};
   }
   /*!
    * \brief Generate Graph JSON
diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc
index 1019ecf358b1..e6c5ac0d6ef3 100644
--- a/src/relay/backend/interpreter.cc
+++ b/src/relay/backend/interpreter.cc
@@ -707,7 +707,6 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
     if (device_copy_props.body.defined()) {
       // TODO(mbs): device_copy cleanup
       LOG(FATAL) << "The interpreter does not support device_copy";
-      return {};
     } else if (call_lowered_props.lowered_func.defined()) {
       // Special case: Call a lowered TIR function.
 
@@ -837,7 +836,6 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
       }
     } else {
       LOG(FATAL) << "type error, type system should have caught this";
-      return ObjectRef();
     }
   }
 
@@ -848,7 +846,6 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
       return ADT::Tuple(std::vector<ObjectRef>());
     } else {
       LOG(FATAL) << "type error, type system should have caught this";
-      return ObjectRef();
     }
   }
 
@@ -860,7 +857,6 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
       return rv->value;
     } else {
       LOG(FATAL) << "type error, type system should have caught this";
-      return ObjectRef();
     }
   }
 
@@ -872,7 +868,6 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
       }
     }
     LOG(FATAL) << "did not find any match";
-    return ObjectRef();
   }
 
   bool VisitPattern_(const PatternConstructorNode* op, const ObjectRef& v) final {
@@ -1099,7 +1094,6 @@ TypedPackedFunc<ObjectRef(Array<Expr>)> EvalFunction(IRModule mod, Expr expr, De
     });
   } else {
     LOG(FATAL) << "expecting expression to have function type and evaluate to a closure";
-    return nullptr;
   }
 }
 
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index d235c17b2a4b..511f0a901d11 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -243,7 +243,6 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
 
   Array<te::Tensor> VisitExpr_(const VarNode* op) final {
     LOG(FATAL) << "Unexpected free variable " << PrettyPrint(GetRef<Var>(op));
-    return {};
   }
 
   Array<te::Tensor> VisitExpr_(const ConstantNode* op) final {
@@ -272,7 +271,6 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
               return make_const(dtype, static_cast<const double*>(data)[0]);
             } else {
               LOG(FATAL) << dtype << " not handled";
-              return tvm::PrimExpr();
             }
           },
           "compile_engine_const", topi::kBroadcast);
@@ -351,7 +349,6 @@ class LowerToTECompute : public backend::MemoizedExprTranslator<Array<te::Tensor
 
   Array<te::Tensor> VisitExpr_(const FunctionNode* op) final {
     LOG(FATAL) << "Primitive Functions can not contain nested functions.";
-    return Array<te::Tensor>();
   }
 
   Array<te::Tensor> VisitExpr_(const LetNode* op) final {
@@ -877,7 +874,6 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
     }
     if (param_states_.find(var) == param_states_.end()) {
       LOG(FATAL) << "Unexpected free variable " << PrettyPrint(var);
-      return {};
     } else {
       ICHECK(data_dependents_per_input_.size());
       auto data_dependent = data_dependents_per_input_.back();
@@ -934,7 +930,6 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
               return make_const(dtype, static_cast<const uint8_t*>(data)[0]);
             } else {
               LOG(FATAL) << "not handled";
-              return tvm::PrimExpr();
             }
           },
           "data_const", topi::kBroadcast);
@@ -1015,7 +1010,6 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
 
   Array<te::Tensor> VisitExpr_(const FunctionNode* op) final {
     LOG(FATAL) << "Nested functions are not allowed to be visited.";
-    return Array<te::Tensor>();
   }
 
   Array<te::Tensor> VisitExpr_(const LetNode* op) final {
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index b807f4195947..9ba90b9f676d 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -865,7 +865,6 @@ PackedFunc VMCompiler::GetFunction(const std::string& name, const ObjectPtr<Obje
     });
   } else {
     LOG(FATAL) << "Unknown packed function: " << name;
-    return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {});
   }
 }
 
diff --git a/src/relay/op/nn/sparse.cc b/src/relay/op/nn/sparse.cc
index e190a8b886e1..60c03895da46 100644
--- a/src/relay/op/nn/sparse.cc
+++ b/src/relay/op/nn/sparse.cc
@@ -65,7 +65,6 @@ bool SparseDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs
       return true;
     }
     LOG(FATAL) << "Unknown data ndim for nn.sparse_dense, should be 1 (CSR) or 3 (BSR)";
-    return false;
 
   } else {
     const auto* data = types[0].as<TensorTypeNode>();
@@ -89,7 +88,6 @@ bool SparseDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs
       return true;
     }
     LOG(FATAL) << "Unknown weight ndim for nn.sparse_dense, should be 1 (CSR) or 3 (BSR)";
-    return false;
   }
 }
 
diff --git a/src/relay/qnn/utils.h b/src/relay/qnn/utils.h
index 87195eb34d94..5005d6068524 100644
--- a/src/relay/qnn/utils.h
+++ b/src/relay/qnn/utils.h
@@ -57,7 +57,6 @@ static inline int32_t GetQmin(const DataType& dtype) {
     return static_cast<int32_t>(min_value[0]);
   } else {
     LOG(FATAL) << "Type not supported " << dtype;
-    return -1;  // To hide the warning
   }
 }
 
@@ -70,7 +69,6 @@ static inline int32_t GetQmax(const DataType& dtype) {
     return static_cast<int32_t>(max_value[0]);
   } else {
     LOG(FATAL) << "Type not supported " << dtype;
-    return -1;  // To hide the warning
   }
 }
 
diff --git a/src/relay/transforms/defunctionalization.cc b/src/relay/transforms/defunctionalization.cc
index 38e403a8d9b0..5ee3bbcef48f 100644
--- a/src/relay/transforms/defunctionalization.cc
+++ b/src/relay/transforms/defunctionalization.cc
@@ -289,7 +289,6 @@ class DefuncMutator : public ExprMutator {
       return Call(c, call_args);
     }
     LOG(FATAL) << "EncodeArg failed to cast arg into identifier node or function node";
-    return {};
   }
 
   /*!
diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc
index aee402836f89..dba412f81688 100644
--- a/src/relay/transforms/fold_constant.cc
+++ b/src/relay/transforms/fold_constant.cc
@@ -244,7 +244,6 @@ class ConstantFolder : public MixedModeMutator {
       return Tuple(fields);
     } else {
       LOG(FATAL) << "Cannot handle " << value->GetTypeKey();
-      return {};
     }
   }
 
diff --git a/src/relay/transforms/fold_scale_axis.cc b/src/relay/transforms/fold_scale_axis.cc
index 435c1ff1f255..69e100936839 100644
--- a/src/relay/transforms/fold_scale_axis.cc
+++ b/src/relay/transforms/fold_scale_axis.cc
@@ -919,7 +919,6 @@ Expr AddSubBackwardTransform(const Call& call, const Message& message, const Exp
     return Call(call->op, {lhs, rhs}, call->attrs, call->type_args);
   } else {
     LOG(FATAL) << "outstanding scale";
-    return Expr();
   }
 }
 
@@ -1104,7 +1103,6 @@ Expr BiasAddBackwardTransform(const Call& call, const Message& message, const Ex
     return Call(call->op, {lhs, rhs}, call->attrs, call->type_args);
   } else {
     LOG(FATAL) << "outstanding scale";
-    return Expr();
   }
 }
 
diff --git a/src/relay/transforms/infer_layout_utils.cc b/src/relay/transforms/infer_layout_utils.cc
index efe886c29d23..984e23ad15f1 100644
--- a/src/relay/transforms/infer_layout_utils.cc
+++ b/src/relay/transforms/infer_layout_utils.cc
@@ -148,7 +148,6 @@ Layout TryTransformLike(const Layout& old, const Layout& ref_old, const Layout&
     for (int i = 0; i < 26; ++i)
       if (!used[i]) return 'A' + i;
     LOG(FATAL) << "All letters are used";
-    return 0;
   };
 
   for (int j = old->axes.size() - 1, i = ref_old->axes.size() - 1; j >= 0; --i, --j) {
diff --git a/src/relay/transforms/to_mixed_precision.cc b/src/relay/transforms/to_mixed_precision.cc
index 18161b3c2508..820bc6e58e4d 100644
--- a/src/relay/transforms/to_mixed_precision.cc
+++ b/src/relay/transforms/to_mixed_precision.cc
@@ -213,7 +213,6 @@ class MixedPrecisionPass : public MixedModeMutator {
       return true;
     } else {
       LOG(FATAL) << "Unsupported type " << t << " we don't know how to handle";
-      return false;
     }
   }
 
diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
index 0f2dde5e36e1..9d65af721970 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
@@ -136,7 +136,6 @@ arm_compute::DataType MakeACLDataType(const DLDataType& data_type) {
     return arm_compute::DataType::S32;
   } else {
     LOG(FATAL) << "Datatype " << data_type << " unsupported by ACL runtime";
-    return arm_compute::DataType::UNKNOWN;
   }
 }
 
diff --git a/src/runtime/contrib/bnns/bnns_json_runtime.cc b/src/runtime/contrib/bnns/bnns_json_runtime.cc
index 87b01567cd30..cb921aa729a1 100644
--- a/src/runtime/contrib/bnns/bnns_json_runtime.cc
+++ b/src/runtime/contrib/bnns/bnns_json_runtime.cc
@@ -536,7 +536,6 @@ class BNNSJSONRuntime : public JSONRuntimeBase {
       if (dl_dtype.bits == 8) return BNNSDataTypeUInt8;
     }
     LOG(FATAL) << "Unsupported data type for BNNS runtime";
-    return BNNS::Dtype(0);
   }
 
   BNNSFilterParameters getCommonFilterParams() {
diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index c3fa3051591f..a667caaafcd8 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -574,7 +574,6 @@ class CLMLRuntime : public JSONRuntimeBase {
       return CL_HALF_FLOAT;
     } else {
       LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime";
-      return -1;
     }
   }
 
@@ -588,7 +587,6 @@ class CLMLRuntime : public JSONRuntimeBase {
       return CL_ARITHMETIC_MODE_FP16_QCOM;
     } else {
       LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime";
-      return CL_ARITHMETIC_MODE_FP32_QCOM;
     }
   }
 
diff --git a/src/runtime/contrib/cublas/cublas_utils.h b/src/runtime/contrib/cublas/cublas_utils.h
index 3edb8300be88..62863b8f7bc8 100644
--- a/src/runtime/contrib/cublas/cublas_utils.h
+++ b/src/runtime/contrib/cublas/cublas_utils.h
@@ -103,7 +103,6 @@ inline cudaDataType_t GetCudaDataType(DLDataType type) {
     }
   }
   LOG(FATAL) << "Unsupported cuda type";
-  return CUDA_R_16F;
 }
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h b/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h
index e3867f27bc71..689113f62865 100644
--- a/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h
+++ b/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h
@@ -274,7 +274,6 @@ class TensorRequisite {
     if (layout.find("O") != std::string::npos) return "OI" + sparse_dims[rank - 3];
 
     LOG(FATAL) << "Unknown layout " << layout << "There is no default scheme to handle it";
-    return {};
   }
 
   /*!
diff --git a/src/runtime/contrib/tflite/tflite_runtime.cc b/src/runtime/contrib/tflite/tflite_runtime.cc
index 3f3c75814547..2806cb33b840 100644
--- a/src/runtime/contrib/tflite/tflite_runtime.cc
+++ b/src/runtime/contrib/tflite/tflite_runtime.cc
@@ -86,7 +86,6 @@ DataType TfLiteDType2TVMDType(TfLiteType dtype) {
       return DataType::Float(16);
     default:
       LOG(FATAL) << "tflite data type not support yet: " << dtype;
-      return DataType::Float(32);
   }
 }
 
diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
index 2288aece626c..e0b970a3ad88 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -235,7 +235,6 @@ class GraphExecutorDebug : public GraphExecutor {
       }
     }
     LOG(FATAL) << "cannot find " << name << " among nodex";
-    return -1;
   }
 
   /*!
diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc
index b8c7bd2cb96e..3a3444faf4da 100644
--- a/src/runtime/hexagon/hexagon_buffer.cc
+++ b/src/runtime/hexagon/hexagon_buffer.cc
@@ -151,7 +151,6 @@ void* HexagonBuffer::GetPointer() {
     return allocations_.data();
   } else {
     LOG(FATAL) << "HexagonBuffer should be either 1-d or 2-d, not " << ndim_ << "-d";
-    return nullptr;
   }
 }
 
diff --git a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc
index c00e33f4c4a1..59c8aa931db6 100644
--- a/src/runtime/hexagon/hexagon_module.cc
+++ b/src/runtime/hexagon/hexagon_module.cc
@@ -45,7 +45,6 @@ HexagonModuleNode::HexagonModuleNode(std::string data, std::string fmt,
 PackedFunc HexagonModuleNode::GetFunction(const std::string& name,
                                           const ObjectPtr<Object>& sptr_to_self) {
   LOG(FATAL) << "HexagonModuleNode::GetFunction is not implemented.";
-  return PackedFunc();
 }
 
 std::string HexagonModuleNode::GetSource(const std::string& format) {
diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc
index d0ce77c931db..0569a78a0fcd 100644
--- a/src/runtime/logging.cc
+++ b/src/runtime/logging.cc
@@ -281,7 +281,6 @@ TvmLogDebugSettings TvmLogDebugSettings::ParseSpec(const char* opt_spec) {
     }
     if (name.empty()) {
       LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(name) << ": empty filename";
-      return settings;
     }
 
     name = FileToVLogMapKey(name);
diff --git a/src/runtime/module.cc b/src/runtime/module.cc
index 633dc7c17671..9ef57e905324 100644
--- a/src/runtime/module.cc
+++ b/src/runtime/module.cc
@@ -103,7 +103,6 @@ void ModuleNode::SaveToBinary(dmlc::Stream* stream) {
 
 std::string ModuleNode::GetSource(const std::string& format) {
   LOG(FATAL) << "Module[" << type_key() << "] does not support GetSource";
-  return "";
 }
 
 const PackedFunc* ModuleNode::GetFuncFromEnv(const std::string& name) {
@@ -131,7 +130,6 @@ const PackedFunc* ModuleNode::GetFuncFromEnv(const std::string& name) {
 
 std::string ModuleNode::GetFormat() {
   LOG(FATAL) << "Module[" << type_key() << "] does not support GetFormat";
-  return "";
 }
 
 bool ModuleNode::IsDSOExportable() const { return false; }
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 4c51158c29df..f0a68864d724 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -196,7 +196,6 @@ inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) {
     return CL_UNSIGNED_INT32;
   }
   LOG(FATAL) << "data type is not supported in OpenCL runtime yet: " << dtype;
-  return CL_FLOAT;
 }
 
 /*!
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index 58744c2cc615..1244fddf0983 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -79,7 +79,6 @@ cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope(
     return cl::BufferDescriptor::MemoryLayout::kImage2DNHWC;
   }
   LOG(FATAL) << "No memory layout defined for memory of scope: " << mem_scope.value();
-  return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
 }
 
 String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryLayout layout) {
diff --git a/src/runtime/pack_args.h b/src/runtime/pack_args.h
index 3776d18fafcc..5291b12fa13e 100644
--- a/src/runtime/pack_args.h
+++ b/src/runtime/pack_args.h
@@ -144,7 +144,6 @@ inline ArgConvertCode GetArgConvertCode(DLDataType t) {
     return HANDLE_TO_HANDLE;
   }
   LOG(FATAL) << "Cannot handle " << t << " as device function argument";
-  return HANDLE_TO_HANDLE;
 }
 
 template <int N, typename F>
diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc
index b5c560e255e3..39f995a3764a 100644
--- a/src/runtime/pipeline/pipeline_executor.cc
+++ b/src/runtime/pipeline/pipeline_executor.cc
@@ -87,7 +87,6 @@ PackedFunc PipelineExecutor::GetFunction(const std::string& name,
         [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetExecutionCount(); });
   } else {
     LOG(FATAL) << "Unknown packed function: " << name;
-    return PackedFunc();
   }
 }
 /*!
diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h
index 540103d0186c..9f14d9163c7e 100644
--- a/src/runtime/pipeline/pipeline_struct.h
+++ b/src/runtime/pipeline/pipeline_struct.h
@@ -204,7 +204,6 @@ class QueueData {
   DLTensor* CreateCopyFrom(const DLTensor* from) {
     if (!from) {
       LOG(FATAL) << "the 'from' pointer is a null pointer!";
-      return nullptr;
     }
     size_t fromLen = tvm::runtime::GetDataSize(*from);
     size_t toLen = data_ ? tvm::runtime::GetDataSize(*data_) : 0;
@@ -892,7 +891,6 @@ class BackendRuntime : public BasicRuntime {
   bool LoadBindingData(int input_index) {
     if (input_queue_.find(input_index) == input_queue_.end()) {
       LOG(FATAL) << "Not finding the associated input queue of the input " << input_index << " !";
-      return false;
     }
     auto queue = input_queue_[input_index];
     QueueData data;
@@ -913,7 +911,6 @@ class BackendRuntime : public BasicRuntime {
       auto output_idx = child.first;
       if (forward_queue_.find(output_idx) == forward_queue_.end()) {
         LOG(FATAL) << "Not find the forwarding queue map for output(" << output_idx << ")!";
-        return false;
       }
       NDArray output = GetOutput(output_idx);
       auto forward_queue_map = forward_queue_[output_idx];
diff --git a/src/runtime/rpc/rpc_event_impl.cc b/src/runtime/rpc/rpc_event_impl.cc
index 3bf9538c1879..64cc025f00a0 100644
--- a/src/runtime/rpc/rpc_event_impl.cc
+++ b/src/runtime/rpc/rpc_event_impl.cc
@@ -32,10 +32,8 @@ namespace tvm {
 namespace runtime {
 
 PackedFunc CreateEventDrivenServer(PackedFunc fsend, std::string name, std::string remote_key) {
-  static PackedFunc frecv([](TVMArgs args, TVMRetValue* rv) {
-    LOG(FATAL) << "Do not allow explicit receive";
-    return 0;
-  });
+  static PackedFunc frecv(
+      [](TVMArgs args, TVMRetValue* rv) { LOG(FATAL) << "Do not allow explicit receive"; });
 
   auto ch = std::make_unique<CallbackChannel>(fsend, frecv);
   std::shared_ptr<RPCEndpoint> sess = RPCEndpoint::Create(std::move(ch), name, remote_key);
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index 1578fce994f6..968bd773e453 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -191,7 +191,6 @@ class RPCModuleNode final : public ModuleNode {
 
   std::string GetSource(const std::string& format) final {
     LOG(FATAL) << "GetSource for rpc Module is not supported";
-    return "";
   }
 
   PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat,
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index df4f1ce42998..3df012e4d4a2 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -116,7 +116,6 @@ class RPCSessTable {
       }
     }
     LOG(FATAL) << "maximum number of RPC session reached";
-    return 0;
   }
 
  private:
diff --git a/src/runtime/stackvm/stackvm.cc b/src/runtime/stackvm/stackvm.cc
index 808dc4063c8b..5a4af57b5ec4 100644
--- a/src/runtime/stackvm/stackvm.cc
+++ b/src/runtime/stackvm/stackvm.cc
@@ -174,7 +174,6 @@ int64_t StackVM::PrintCode(std::ostream& os, int64_t pc) const {
     }
   }
   LOG(FATAL) << "unknown op code " << code[pc].op_code;
-  return 0;
 }
 
 std::ostream& operator<<(std::ostream& os, const StackVM& vm) {  // NOLINT(*)
diff --git a/src/runtime/stackvm/stackvm.h b/src/runtime/stackvm/stackvm.h
index e57cb0b03952..c967e99dbecb 100644
--- a/src/runtime/stackvm/stackvm.h
+++ b/src/runtime/stackvm/stackvm.h
@@ -379,10 +379,8 @@ class StackVM {
         return LE_F64;
       case MOD_I64:
         LOG(FATAL) << "cannot handle mod for float";
-        return ADD_F64;
       default:
         LOG(FATAL) << "cannot handle op " << code;
-        return ADD_F64;
     }
   }
   /*!
@@ -412,7 +410,6 @@ class StackVM {
       }
     }
     LOG(FATAL) << "Cannot load type " << t;
-    return ARRAY_LOAD_FP64;
   }
   /*!
    * \brief Get store opcode for type t
@@ -441,7 +438,6 @@ class StackVM {
       }
     }
     LOG(FATAL) << "Cannot store type " << t;
-    return ARRAY_STORE_FP64;
   }
   friend std::ostream& operator<<(std::ostream& os, const StackVM& vm);  // NOLINT(*)
 
diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h
index d0cca1c028f2..83477312dcc5 100644
--- a/src/runtime/thread_storage_scope.h
+++ b/src/runtime/thread_storage_scope.h
@@ -78,7 +78,6 @@ inline StorageRank DefaultStorageRank(int thread_scope_rank) {
       return StorageRank::kLocal;
     default: {
       LOG(FATAL) << "unknown rank";
-      return StorageRank::kGlobal;
     }
   }
 }
@@ -115,7 +114,6 @@ struct StorageScope {
         return "texture" + tag;
       default:
         LOG(FATAL) << "unknown storage scope";
-        return "";
     }
   }
   /*!
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index 2484ece3081d..082ff0556544 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -118,7 +118,6 @@ PackedFunc Executable::GetFunction(const std::string& name, const ObjectPtr<Obje
     });
   } else {
     LOG(FATAL) << "Unknown packed function: " << name;
-    return PackedFunc();
   }
 }
 
@@ -1012,7 +1011,6 @@ Instruction DeserializeInstruction(const VMInstructionSerializer& instr) {
     }
     default:
       LOG(FATAL) << "Invalid opcode" << instr.opcode;
-      return Instruction();
   }
 }
 
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index aaf4675733a8..72e624f7f6e0 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -243,7 +243,6 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
     });
   } else {
     LOG(FATAL) << "Unknown packed function: " << name;
-    return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {});
   }
 }
 
diff --git a/src/runtime/vulkan/vulkan_device.cc b/src/runtime/vulkan/vulkan_device.cc
index 7a6b92943c90..b3e017d03418 100644
--- a/src/runtime/vulkan/vulkan_device.cc
+++ b/src/runtime/vulkan/vulkan_device.cc
@@ -590,7 +590,6 @@ uint32_t FindMemoryType(const VulkanDevice& device, VkBufferCreateInfo info,
     type_bits >>= 1;
   }
   LOG(FATAL) << "Requested memory type not found";
-  return 0;
 }
 
 VulkanHostVisibleBuffer* GetOrAllocate(
diff --git a/src/support/base64.h b/src/support/base64.h
index 3aac9920a075..7b37afce66cc 100644
--- a/src/support/base64.h
+++ b/src/support/base64.h
@@ -245,7 +245,6 @@ class Base64OutStream : public dmlc::Stream {
   }
   virtual size_t Read(void* ptr, size_t size) {
     LOG(FATAL) << "Base64OutStream do not support read";
-    return 0;
   }
   /*!
    * \brief finish writing of all current base64 stream, do some post processing
diff --git a/src/support/scalars.cc b/src/support/scalars.cc
index 0ab16899bae9..4ba505922b21 100644
--- a/src/support/scalars.cc
+++ b/src/support/scalars.cc
@@ -170,7 +170,6 @@ IntImm ValueToIntImm(int64_t value, int width) {
     return IntImm(kInt64, value);
   } else {
     LOG(FATAL) << "Unrecognized int scalar width: " << width;
-    return {};
   }
 }
 
@@ -190,7 +189,6 @@ FloatImm ValueToFloatImm(double value, int width) {
     return FloatImm(kFloat64, value);
   } else {
     LOG(FATAL) << "Unrecognized float scalar width: " << width;
-    return {};
   }
 }
 
diff --git a/src/support/socket.h b/src/support/socket.h
index 52de2f72f548..be6910c02344 100644
--- a/src/support/socket.h
+++ b/src/support/socket.h
@@ -553,7 +553,6 @@ class TCPSocket : public Socket {
       if (ret == -1) {
         if (LastErrorWouldBlock()) {
           LOG(FATAL) << "would block";
-          return ndone;
         }
         Socket::Error("RecvAll");
       }
diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc
index 4e83e83ba7e3..327f23af2c73 100644
--- a/src/target/llvm/codegen_amdgpu.cc
+++ b/src/target/llvm/codegen_amdgpu.cc
@@ -198,7 +198,6 @@ class CodeGenAMDGPU : public CodeGenLLVM {
       return builder_->CreateCall(f, {});
     } else {
       LOG(FATAL) << "Do not support sync " << sync;
-      return nullptr;
     }
   }
 
diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc
index eb5c92e663fa..facb49660078 100644
--- a/src/target/llvm/codegen_cpu.cc
+++ b/src/target/llvm/codegen_cpu.cc
@@ -423,7 +423,6 @@ CodeGenLLVM::TypedPointer CodeGenCPU::CreateStructRefPtr(DataType t, llvm::Value
     }
     default:
       LOG(FATAL) << "unknown field code";
-      return TypedPointer();
   }
 }
 
@@ -1440,7 +1439,6 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const CallNode* op) {
         return builder_->CreateAlloca(t_tvm_array_, num);
       } else {
         LOG(FATAL) << "Unknown stack alloca type " << type;
-        return nullptr;
       }
     });
   } else {
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index dce7d0b82f0d..7aae17788800 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -128,7 +128,6 @@ std::unique_ptr<CodeGenLLVM> CodeGenLLVM::Create(LLVMTarget* llvm_target) {
     return std::unique_ptr<CodeGenLLVM>(static_cast<CodeGenLLVM*>(handle));
   } else {
     LOG(FATAL) << "unable to create codegen for target " << target;
-    return nullptr;  // unreachable
   }
 }
 
@@ -347,15 +346,9 @@ void CodeGenLLVM::AddMainFunction(const std::string& entry_func_name) {
   LOG(FATAL) << "not implemented";
 }
 
-llvm::Value* CodeGenLLVM::GetThreadIndex(const IterVar& iv) {
-  LOG(FATAL) << "not implemented";
-  return nullptr;
-}
+llvm::Value* CodeGenLLVM::GetThreadIndex(const IterVar& iv) { LOG(FATAL) << "not implemented"; }
 
-llvm::Value* CodeGenLLVM::CreateStorageSync(const CallNode* op) {
-  LOG(FATAL) << "not implemented";
-  return nullptr;
-}
+llvm::Value* CodeGenLLVM::CreateStorageSync(const CallNode* op) { LOG(FATAL) << "not implemented"; }
 
 #if TVM_LLVM_VERSION >= 160
 
@@ -547,7 +540,6 @@ llvm::Type* CodeGenLLVM::GetLLVMType(const Type& type) const {
     return t_void_;
   } else {
     LOG(FATAL) << "Type " << type << " does not have a corresponding LLVM Type";
-    return t_void_;
   }
 }
 
@@ -1382,14 +1374,12 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
   } else if (op->op.same_as(builtin::atomic_add())) {
     // TODO(masahi): Support atomic for CPU backend
     LOG(FATAL) << "CPU backend does not support atomic add yet.";
-    return nullptr;
   } else if (op->op.same_as(builtin::start_profile_intrinsic()) ||
              op->op.same_as(builtin::end_profile_intrinsic())) {
     LOG(INFO) << "Ignoring profile_intrinsic ... " << op->op;
     return nullptr;
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->op;
-    return nullptr;
   }
 }
 
@@ -1562,7 +1552,6 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const LetNode* op) {
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const LoadNode* op) {
   LOG(FATAL) << "Unexpected deprecated LoadNode.  Use BufferLoadNode instead.";
-  return nullptr;
 }
 
 bool CodeGenLLVM::HasAlignmentPadding(DataType dtype) {
@@ -1721,7 +1710,6 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) {
   } else {
     ICHECK(op->op.as<GlobalVarNode>());
     LOG(FATAL) << "Do not yet support cross function call";
-    return nullptr;
   }
 }
 
diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc
index ff330e52d33c..ec561667c1b0 100644
--- a/src/target/llvm/codegen_nvptx.cc
+++ b/src/target/llvm/codegen_nvptx.cc
@@ -179,7 +179,6 @@ class CodeGenNVPTX : public CodeGenLLVM {
       return builder_->CreateCall(f, {});
     } else {
       LOG(FATAL) << "Do not support sync " << sync;
-      return nullptr;
     }
   }
 
diff --git a/src/target/source/ptx.cc b/src/target/source/ptx.cc
index 881c425e7742..886242efe08c 100644
--- a/src/target/source/ptx.cc
+++ b/src/target/source/ptx.cc
@@ -121,7 +121,6 @@ inline DataType DTypeFromString(const std::string str) {
     return DataType::kBit64;
   } else {
     LOG(FATAL) << "Unrecognized PTX data type " << str;
-    return DataType(0);
   }
 }
 
@@ -162,7 +161,6 @@ LayoutType LayoutTypeFromString(const std::string& str) {
     return LayoutType::kColumnMajor;
   } else {
     LOG(FATAL) << "Unrecognized layout type " << str;
-    return LayoutType::kRowMajor;
   }
 }
 
diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc
index c291a478dd3f..e3ef5acb8331 100644
--- a/src/target/spirv/codegen_spirv.cc
+++ b/src/target/spirv/codegen_spirv.cc
@@ -199,7 +199,6 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const FloatImmNode* op) {
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const StringImmNode* op) {
   LOG(FATAL) << "StringImm is not supported in Device code";
-  return spirv::Value();
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const CastNode* op) {
@@ -398,7 +397,6 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     return spirv::Value();
   } else {
     LOG(FATAL) << "Unresolved call  " << op->op;
-    return spirv::Value();
   }
 }
 
diff --git a/src/target/target.cc b/src/target/target.cc
index cbebd0e10c46..24a418709ff3 100644
--- a/src/target/target.cc
+++ b/src/target/target.cc
@@ -504,7 +504,6 @@ std::string TargetInternal::StringifyAtomicType(const ObjectRef& obj) {
     return u;
   }
   LOG(FATAL) << "Cannot stringify this object";
-  return "";  // unreachable
 }
 
 std::string TargetInternal::StringifyArray(const ArrayNode& array) {
diff --git a/src/te/autodiff/jacobian.cc b/src/te/autodiff/jacobian.cc
index e61a590c409d..a77688b43efb 100644
--- a/src/te/autodiff/jacobian.cc
+++ b/src/te/autodiff/jacobian.cc
@@ -119,7 +119,6 @@ class JacobianMutator : public ExprMutator {
       return FloatImm(expr.dtype(), 0.0);
     } else {
       LOG(FATAL) << "Derivative of this intrinsic is not implemented: " << op->op;
-      return PrimExpr();
     }
   }
 
diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc
index 39243bf2216f..c1741e9e4ec5 100644
--- a/src/te/schedule/schedule_dataflow_rewrite.cc
+++ b/src/te/schedule/schedule_dataflow_rewrite.cc
@@ -459,7 +459,6 @@ Tensor Schedule::cache_write(const Tensor& tensor, const std::string& scope) {
     return (CacheWriteWithReLayoutTensor(*this, {tensor}, scope))[0];
   } else {
     LOG(FATAL) << "cache write only take ComputeOp or TensorComputeOp as writers";
-    return Tensor();
   }
 }
 
diff --git a/src/tir/analysis/stmt_finding.cc b/src/tir/analysis/stmt_finding.cc
index 107786a0eb38..1d8cb462c14b 100644
--- a/src/tir/analysis/stmt_finding.cc
+++ b/src/tir/analysis/stmt_finding.cc
@@ -98,7 +98,6 @@ Stmt GetEnclosingLoop(const BlockNode* block, Stmt func_body) {
   }
 
   LOG(FATAL) << "Enclosing loop not found for a block " << GetRef<Block>(block);
-  return Stmt();
 }
 
 const BlockNode* FindAnchorBlock(const IRModule& mod) {
diff --git a/src/tir/ir/expr_functor.cc b/src/tir/ir/expr_functor.cc
index da02e0316f48..b3b09e54f2e2 100644
--- a/src/tir/ir/expr_functor.cc
+++ b/src/tir/ir/expr_functor.cc
@@ -127,7 +127,6 @@ PrimExpr ExprMutator::VisitExpr_(const AnyNode* op) { return GetRef<PrimExpr>(op
 
 PrimExpr ExprMutator::VisitExpr_(const LoadNode* op) {
   LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-  return PrimExpr();
 }
 
 PrimExpr ExprMutator::VisitExpr_(const BufferLoadNode* op) {
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index daa8fe703a08..1d00b8bd364f 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -371,7 +371,6 @@ Stmt StmtMutator::VisitStmt_(const IfThenElseNode* op) {
 
 Stmt StmtMutator::VisitStmt_(const StoreNode* op) {
   LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-  return Stmt();
 }
 
 Stmt StmtMutator::VisitStmt_(const BufferStoreNode* op) {
@@ -676,12 +675,10 @@ class IRSubstitute : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc
index 9896fe40d833..044d8fd08da5 100644
--- a/src/tir/op/op.cc
+++ b/src/tir/op/op.cc
@@ -57,7 +57,6 @@ runtime::DataType GetRuntimeDataType(const Type& type) {
     return DataType::Void();
   } else {
     LOG(FATAL) << "Type " << type << " does not have a corresponding runtime::DataType";
-    return DataType::Handle();
   }
 }
 
@@ -206,7 +205,6 @@ PrimExpr max_value(const DataType& dtype, Span span) {
     return FloatImm(dtype, std::numeric_limits<float>::max(), span);
   }
   LOG(FATAL) << "Cannot decide max_value for type" << dtype;
-  return PrimExpr();
 }
 
 PrimExpr min_value(const DataType& dtype, Span span) {
@@ -241,7 +239,6 @@ PrimExpr min_value(const DataType& dtype, Span span) {
     return FloatImm(dtype, std::numeric_limits<float>::lowest(), span);
   }
   LOG(FATAL) << "Cannot decide min_value for type" << dtype;
-  return PrimExpr();
 }
 
 // infinity
@@ -256,7 +253,6 @@ PrimExpr infinity(const DataType& dtype, Span span) {
     }
   }
   LOG(FATAL) << "Cannot decide infinity for type " << dtype;
-  return PrimExpr();
 }
 
 namespace tir {
@@ -710,7 +706,6 @@ PrimExpr isnan(PrimExpr x, Span span) {
     }
   } else {
     LOG(FATAL) << "Data type " << x.dtype() << " not supported for isnan op. Skipping isnan op...";
-    return x;
   }
 }
 
@@ -724,7 +719,6 @@ PrimExpr isinf(PrimExpr x, Span span) {
     return abs(x, span) == infX && !isnan(x, span);
   } else {
     LOG(FATAL) << "Data type " << x.dtype() << " not supported for finiteness ops. Skipping it...";
-    return x;
   }
 }
 
diff --git a/src/tir/schedule/primitive/cache_index.cc b/src/tir/schedule/primitive/cache_index.cc
index 1db86a5444ff..c6f845541dd2 100644
--- a/src/tir/schedule/primitive/cache_index.cc
+++ b/src/tir/schedule/primitive/cache_index.cc
@@ -384,7 +384,6 @@ class CacheIndexRewriter : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
  private:
diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index b3e0e8f1274e..27244f157592 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -686,7 +686,6 @@ class CacheReadRewriter : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   PrimExpr VisitExpr_(const VarNode* op) final {
@@ -806,12 +805,10 @@ class CacheWriteRewriter : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const VarNode* op) final {
diff --git a/src/tir/schedule/primitive/compute_inline.cc b/src/tir/schedule/primitive/compute_inline.cc
index d54be8a05fdc..e4771c8b19f6 100644
--- a/src/tir/schedule/primitive/compute_inline.cc
+++ b/src/tir/schedule/primitive/compute_inline.cc
@@ -263,12 +263,10 @@ class BaseInliner : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   Stmt VisitStmt_(const ForNode* loop) final {
diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc
index bf618af8de54..bbbbd2fdf56f 100644
--- a/src/tir/schedule/primitive/layout_transformation.cc
+++ b/src/tir/schedule/primitive/layout_transformation.cc
@@ -587,7 +587,6 @@ class TransformLayoutPlanner : private StmtExprVisitor {
         return info.innermost_block_realize.value();
       } else {
         LOG(FATAL) << "Write occured outside of any block/loop";
-        return Stmt();
       }
     }();
     return EpiloguePlan{insert_after, stmt};
diff --git a/src/tir/transforms/bf16_legalize.cc b/src/tir/transforms/bf16_legalize.cc
index 040c48c79693..8c5982e80916 100644
--- a/src/tir/transforms/bf16_legalize.cc
+++ b/src/tir/transforms/bf16_legalize.cc
@@ -260,7 +260,6 @@ class BF16LowerRewriter : public StmtExprMutator {
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
@@ -277,7 +276,6 @@ class BF16LowerRewriter : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   PrimExpr VisitExpr_(const FloatImmNode* op) final {
diff --git a/src/tir/transforms/bound_checker.cc b/src/tir/transforms/bound_checker.cc
index 85aac3cee855..5a4178a018cf 100644
--- a/src/tir/transforms/bound_checker.cc
+++ b/src/tir/transforms/bound_checker.cc
@@ -80,12 +80,10 @@ class BoundChecker : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
diff --git a/src/tir/transforms/inject_double_buffer.cc b/src/tir/transforms/inject_double_buffer.cc
index d974e3c8108a..91052cbf572d 100644
--- a/src/tir/transforms/inject_double_buffer.cc
+++ b/src/tir/transforms/inject_double_buffer.cc
@@ -172,12 +172,10 @@ class DoubleBufferInjector : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc
index a1ebdcef9855..5b54b8abee8e 100644
--- a/src/tir/transforms/inject_virtual_thread.cc
+++ b/src/tir/transforms/inject_virtual_thread.cc
@@ -247,12 +247,10 @@ class VTInjector : public arith::IRMutatorWithAnalyzer {
   // Load
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
   // Store
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
   // BufferLoad
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
@@ -379,7 +377,6 @@ class VTInjector : public arith::IRMutatorWithAnalyzer {
   Stmt VisitStmt_(const WhileNode* op) final {
     // TODO(masahi): What should we do for While nodes?
     LOG(FATAL) << "WhileNode in InjectVirtualThread not supported yet";
-    return Stmt();
   }
 
   // Seq
@@ -528,7 +525,6 @@ class VirtualThreadInjector : public arith::IRMutatorWithAnalyzer {
 
   Stmt VisitStmt_(const ProducerStoreNode* op) final {
     LOG(FATAL) << "Need to call StorageFlatten first";
-    return GetRef<Stmt>(op);
   }
 };
 
diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc
index 6893aecc4d00..afd7ba43cf93 100644
--- a/src/tir/transforms/ir_utils.cc
+++ b/src/tir/transforms/ir_utils.cc
@@ -112,12 +112,10 @@ class IRConvertSSA final : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
diff --git a/src/tir/transforms/lift_attr_scope.cc b/src/tir/transforms/lift_attr_scope.cc
index 272e16d40d97..b340a94937f3 100644
--- a/src/tir/transforms/lift_attr_scope.cc
+++ b/src/tir/transforms/lift_attr_scope.cc
@@ -160,7 +160,6 @@ class AttrScopeLifter : public StmtMutator {
   Stmt VisitStmt_(const WhileNode* op) final {
     // TODO(masahi): Do we need a special handling for While nodes?
     LOG(FATAL) << "WhileNode not supported in LiftAttrScope.";
-    return Stmt();
   }
 
  private:
diff --git a/src/tir/transforms/lower_custom_datatypes.cc b/src/tir/transforms/lower_custom_datatypes.cc
index 3cf5ed2ecf7c..241b656ace6c 100644
--- a/src/tir/transforms/lower_custom_datatypes.cc
+++ b/src/tir/transforms/lower_custom_datatypes.cc
@@ -105,12 +105,10 @@ class CustomDatatypesLowerer : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
diff --git a/src/tir/transforms/lower_thread_allreduce.cc b/src/tir/transforms/lower_thread_allreduce.cc
index bd6b5185eb4a..cade9a90566d 100644
--- a/src/tir/transforms/lower_thread_allreduce.cc
+++ b/src/tir/transforms/lower_thread_allreduce.cc
@@ -111,12 +111,10 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
diff --git a/src/tir/transforms/lower_warp_memory.cc b/src/tir/transforms/lower_warp_memory.cc
index e12e2772ab22..9d2ff88540fc 100644
--- a/src/tir/transforms/lower_warp_memory.cc
+++ b/src/tir/transforms/lower_warp_memory.cc
@@ -295,12 +295,10 @@ class WarpAccessRewriter : protected StmtExprMutator {
 
   Stmt VisitStmt_(const StoreNode* op) override {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const LoadNode* op) override {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const BufferStoreNode* op) override {
diff --git a/src/tir/transforms/merge_dynamic_shared_memory_allocations.cc b/src/tir/transforms/merge_dynamic_shared_memory_allocations.cc
index e61af842b507..eab660e2a47b 100644
--- a/src/tir/transforms/merge_dynamic_shared_memory_allocations.cc
+++ b/src/tir/transforms/merge_dynamic_shared_memory_allocations.cc
@@ -309,12 +309,10 @@ class DynamicSharedMemoryRewriter : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
diff --git a/src/tir/transforms/narrow_datatype.cc b/src/tir/transforms/narrow_datatype.cc
index fba813870bb1..2f116a02295f 100644
--- a/src/tir/transforms/narrow_datatype.cc
+++ b/src/tir/transforms/narrow_datatype.cc
@@ -226,12 +226,10 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   PrimExpr VisitExpr_(const VarNode* op) final {
diff --git a/src/tir/transforms/renew_defs.cc b/src/tir/transforms/renew_defs.cc
index a185916a9a4c..90399f7a0586 100644
--- a/src/tir/transforms/renew_defs.cc
+++ b/src/tir/transforms/renew_defs.cc
@@ -159,12 +159,10 @@ class RenewDefMutator : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
  private:
diff --git a/src/tir/transforms/rewrite_unsafe_select.cc b/src/tir/transforms/rewrite_unsafe_select.cc
index 8a37f9958073..b4082e2040fd 100644
--- a/src/tir/transforms/rewrite_unsafe_select.cc
+++ b/src/tir/transforms/rewrite_unsafe_select.cc
@@ -69,7 +69,6 @@ class UnsafeExprDetector : public ExprFunctor<bool(const PrimExpr& n)> {
   }
   bool VisitExpr_(const LoadNode* op) {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return true;
   }
   bool VisitExpr_(const AddNode* op) final { return BinaryOp(op); }
   bool VisitExpr_(const SubNode* op) final { return BinaryOp(op); }
diff --git a/src/tir/transforms/simplify.cc b/src/tir/transforms/simplify.cc
index 49d3a9ceaef5..7dd52f941c46 100644
--- a/src/tir/transforms/simplify.cc
+++ b/src/tir/transforms/simplify.cc
@@ -209,7 +209,6 @@ class StmtSimplifier : public IRMutatorWithAnalyzer {
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   // eliminate useless stores
diff --git a/src/tir/transforms/split_host_device.cc b/src/tir/transforms/split_host_device.cc
index dc56a3ce762f..2de7d38d7d57 100644
--- a/src/tir/transforms/split_host_device.cc
+++ b/src/tir/transforms/split_host_device.cc
@@ -114,7 +114,6 @@ class VarUseDefAnalysis : public StmtExprMutator {
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
@@ -167,7 +166,6 @@ class VarUseDefAnalysis : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
diff --git a/src/tir/transforms/storage_flatten.cc b/src/tir/transforms/storage_flatten.cc
index eb0409e555a1..58f4eba83893 100644
--- a/src/tir/transforms/storage_flatten.cc
+++ b/src/tir/transforms/storage_flatten.cc
@@ -848,12 +848,10 @@ class BufferBindUnwrapper : public StmtExprMutator {
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const AttrStmtNode* op) final {
@@ -1206,7 +1204,6 @@ class BufferBindUnwrapper : public StmtExprMutator {
       return buf_map_[buffer.get()];
     } else {
       LOG(FATAL) << "Can't work around the undefined buffer";
-      return *static_cast<BufferEntry*>(nullptr);
     }
   }
 
@@ -1391,12 +1388,10 @@ class StorageFlattener : public StmtExprMutator {
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const AttrStmtNode* op) final {
diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc
index 67972ce67282..7e09bda70371 100644
--- a/src/tir/transforms/storage_rewrite.cc
+++ b/src/tir/transforms/storage_rewrite.cc
@@ -418,12 +418,10 @@ class StoragePlanRewriter : public StmtExprMutator {
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   template <typename Node>
@@ -1418,12 +1416,10 @@ class VectorTypeRewriter : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   template <typename Node>
diff --git a/src/tir/transforms/thread_storage_sync.cc b/src/tir/transforms/thread_storage_sync.cc
index 466a52d632a3..4becd8ffd74f 100644
--- a/src/tir/transforms/thread_storage_sync.cc
+++ b/src/tir/transforms/thread_storage_sync.cc
@@ -316,12 +316,10 @@ class ThreadSyncInserter : public StmtExprMutator {
   }
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
     if (sync_scope_.rank == StorageRank::kGlobal &&
diff --git a/src/tir/transforms/unroll_loop.cc b/src/tir/transforms/unroll_loop.cc
index e1d0688ab537..1e55cb22ee26 100644
--- a/src/tir/transforms/unroll_loop.cc
+++ b/src/tir/transforms/unroll_loop.cc
@@ -135,7 +135,6 @@ class LoopUnroller : public StmtExprMutator {
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
diff --git a/src/tir/transforms/update_pointer_storage_scope.cc b/src/tir/transforms/update_pointer_storage_scope.cc
index 69db85eda2df..3a9e4717241d 100644
--- a/src/tir/transforms/update_pointer_storage_scope.cc
+++ b/src/tir/transforms/update_pointer_storage_scope.cc
@@ -96,7 +96,6 @@ Buffer UpdatePointerStorageScope::GetUpdatedBuffer(Buffer buf) {
 
 PrimExpr UpdatePointerStorageScope::VisitExpr_(const LoadNode* op) {
   LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-  return PrimExpr();
 }
 
 PrimExpr UpdatePointerStorageScope::VisitExpr_(const BufferLoadNode* op) {
@@ -106,7 +105,6 @@ PrimExpr UpdatePointerStorageScope::VisitExpr_(const BufferLoadNode* op) {
 
 Stmt UpdatePointerStorageScope::VisitStmt_(const StoreNode* op) {
   LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-  return Stmt();
 }
 
 Stmt UpdatePointerStorageScope::VisitStmt_(const BufferStoreNode* op) {
diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc
index 8efed83ccdf1..6888ac625389 100644
--- a/src/tir/transforms/vectorize_loop.cc
+++ b/src/tir/transforms/vectorize_loop.cc
@@ -65,12 +65,10 @@ class VecAllocAccess : public StmtExprMutator {
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
-    return Stmt();
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
@@ -372,7 +370,6 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   // Load
   PrimExpr VisitExpr_(const LoadNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return PrimExpr();
   }
   // BufferLoad
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
@@ -420,7 +417,6 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   // Store
   Stmt VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated LoadNode.  Please use BufferLoadNode instead.";
-    return Stmt();
   }
   // BufferStore
   Stmt VisitStmt_(const BufferStoreNode* op) final {
@@ -504,7 +500,6 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   // While
   Stmt VisitStmt_(const WhileNode* op) final {
     LOG(FATAL) << "A while loop inside a vectorized loop not supported.";
-    return Stmt();
   }
   // LetStmt
   Stmt VisitStmt_(const LetStmtNode* op) final {
@@ -573,7 +568,6 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   // ProducerStore
   Stmt VisitStmt_(const ProducerStoreNode* op) final {
     LOG(FATAL) << "ProducerProvide cannot appear in a TIR PrimFunc";
-    return Stmt();
   }
 
  private:
diff --git a/src/tir/usmp/algo/hill_climb.cc b/src/tir/usmp/algo/hill_climb.cc
index 1da9cef1eb6f..6e1de1e43cd3 100644
--- a/src/tir/usmp/algo/hill_climb.cc
+++ b/src/tir/usmp/algo/hill_climb.cc
@@ -249,7 +249,6 @@ class HillClimbAllocator : public GreedyBase {
         return it->second;
       }
       LOG(FATAL) << "node is not indexed in the _pos_map";
-      return -1;
     };
 
     for (; attempts < _max_attempts; ++attempts) {
diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc
index c1215214cf51..b139fbda6819 100644
--- a/vta/runtime/runtime.cc
+++ b/vta/runtime/runtime.cc
@@ -955,7 +955,6 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
       return kStoreStage;
     }
     LOG(FATAL) << "not reached";
-    return kNoneStage;
   }
 
   // Get stage of memory and computation
diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc
index 073c613bd2c2..936c9938dd3a 100644
--- a/web/emcc/webgpu_runtime.cc
+++ b/web/emcc/webgpu_runtime.cc
@@ -105,27 +105,17 @@ class WebGPUDeviceAPI : public DeviceAPI {
   }
 
  public:
-  TVMStreamHandle CreateStream(Device dev) final {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
+  TVMStreamHandle CreateStream(Device dev) final { LOG(FATAL) << "Not implemented"; }
 
-  void FreeStream(Device dev, TVMStreamHandle stream) final {
-    LOG(FATAL) << "Not implemented";
-    return;
-  }
+  void FreeStream(Device dev, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; }
 
   void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) {
     LOG(FATAL) << "Not implemented";
-    return;
   }
 
   void StreamSync(Device dev, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; }
 
-  void SetStream(Device dev, TVMStreamHandle stream) final {
-    LOG(FATAL) << "Not implemented";
-    return;
-  }
+  void SetStream(Device dev, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; }
 
   void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final {
     return WebGPUThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size);

From 3a81aef40bca9479d4a691b3a80e42b01f3f8a0d Mon Sep 17 00:00:00 2001
From: Ruihang Lai <ruihangl@cs.cmu.edu>
Date: Sun, 4 Dec 2022 18:22:38 -0500
Subject: [PATCH 690/704] [Fix] Use proper target in VerifyGPUCode (#13548)

Previously, the VerifyGPUCode post-processor uses hardcoded target `Target("cuda")` for applying pass LowerIntrin. This is a bit problematic since the actual target can be other GPU target (e.g., Metal). Therefore, this PR changes the hardcoded target to be the actual target.
---
 src/meta_schedule/postproc/verify_gpu_code.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/meta_schedule/postproc/verify_gpu_code.cc b/src/meta_schedule/postproc/verify_gpu_code.cc
index ae6f3474bbd6..99ffc1bfcdf7 100644
--- a/src/meta_schedule/postproc/verify_gpu_code.cc
+++ b/src/meta_schedule/postproc/verify_gpu_code.cc
@@ -113,19 +113,20 @@ Integer Extract(const Target& target, const char* name) {
 /*! \brief Verify the correctness of the generated GPU code. */
 class VerifyGPUCodeNode : public PostprocNode {
  public:
+  Target target_{nullptr};
   Map<String, PrimExpr> target_constraints_{nullptr};
   int thread_warp_size_ = -1;
 
   void InitializeWithTuneContext(const TuneContext& context) final {
     ICHECK(context->target.defined());
-    Target target = context->target.value();
+    this->target_ = context->target.value();
     this->target_constraints_ = Map<String, PrimExpr>{
-        {"max_shared_memory_per_block", Extract(target, "max_shared_memory_per_block")},
-        {"max_threads_per_block", Extract(target, "max_threads_per_block")},
+        {"max_shared_memory_per_block", Extract(this->target_, "max_shared_memory_per_block")},
+        {"max_threads_per_block", Extract(this->target_, "max_threads_per_block")},
         {"max_vthread", Integer(8)},
         {"max_vector_bytes", Integer(16)},
     };
-    thread_warp_size_ = Extract(target, "thread_warp_size").IntValue();
+    thread_warp_size_ = Extract(this->target_, "thread_warp_size").IntValue();
   }
 
   bool Verify(const IRModule& mod) const {
@@ -180,7 +181,7 @@ class VerifyGPUCodeNode : public PostprocNode {
           transform::PassContext pass_ctx = transform::PassContext::Current();
           tir::PrimFunc f = WithAttr(GetRef<tir::PrimFunc>(prim_func), "global_symbol",
                                      runtime::String(g_var->name_hint));
-          f = WithAttr(f, tvm::attr::kTarget, Target("cuda"));  // Required for LowerIntrin
+          f = WithAttr(f, tvm::attr::kTarget, this->target_);  // Required for LowerIntrin
           bool noalias = pass_ctx->GetConfig<Bool>("tir.noalias", Bool(true)).value();
           if (noalias) {
             f = WithAttr(std::move(f), "tir.noalias", Bool(true));

From e7160d569a19aa00b0fd605abd970d0e9ed8b1d0 Mon Sep 17 00:00:00 2001
From: "yin.changsheng" <yin.changsheng@intellif.com>
Date: Mon, 5 Dec 2022 14:15:56 +0800
Subject: [PATCH 691/704] Add recursive on loop with marked kUnrolled (#13536)

Current LoopPartition pass, when the loop is marked kUnrolled, it returns directly
This PR enhance LoopPartition pass to continue recursive on loop with marked kUnrolled.
---
 src/tir/transforms/loop_partition.cc          |  3 +-
 .../test_tir_transform_loop_partition.py      | 69 +++++++++++++++++++
 2 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc
index 1d995ef26ed8..0d088526694d 100644
--- a/src/tir/transforms/loop_partition.cc
+++ b/src/tir/transforms/loop_partition.cc
@@ -597,7 +597,8 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim
   if (!opt_cond_value.has_value()) {
     if (has_partition_hint_ && unroll_loop_with_partition_hint_no_interval_ &&
         analyzer_.CanProve(max - min > 0)) {
-      return For(var, min, max - min + 1, ForKind::kUnrolled, body);
+      auto new_body = VisitAndMutate(body);
+      return For(var, min, max - min + 1, ForKind::kUnrolled, new_body);
     }
     return Stmt();
   }
diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py
index fe48aa7d8fd4..7dd8e794103e 100644
--- a/tests/python/unittest/test_tir_transform_loop_partition.py
+++ b/tests/python/unittest/test_tir_transform_loop_partition.py
@@ -677,6 +677,75 @@ def partitioned_main(
     assert tvm.ir.structural_equal(mod["main"], partitioned_main)
 
 
+def test_loop_partition_recursive_unroll_hint():
+    @T.prim_func
+    def main():
+        placeholder_0_dm = T.decl_buffer([1, 32, 32, 16], dtype="int8")
+        for i3_0 in T.serial(5, annotations={"pragma_loop_partition_hint": 1}):
+            for i2_0 in T.serial(2, annotations={"pragma_loop_partition_hint": 1}):
+                pad_temp = T.decl_buffer([1, 16, 16, 16], dtype="int8")
+                for ax0, ax1, ax2 in T.grid(16, 16, 16):
+                    if (
+                        6 <= i2_0 * 4 + ax0
+                        and i2_0 * 4 + ax0 < 26
+                        and 6 <= i3_0 * 4 + ax1
+                        and i3_0 * 4 + ax1 < 26
+                    ):
+                        pad_temp[
+                            0,
+                            i2_0 * 4 + ax0 - 6 + 6 - i2_0 * 4,
+                            i3_0 * 4 + ax1 - 6 + 6 - i3_0 * 4,
+                            ax2,
+                        ] = placeholder_0_dm[
+                            0,
+                            i2_0 * 4 + ax0 - 6 - -6,
+                            i3_0 * 4 + ax1 - 6 - -6,
+                            ax2,
+                        ]
+
+    @T.prim_func
+    def partitioned_main():
+        placeholder_0_dm = T.allocate([16384], "int8", "global")
+        placeholder_0_dm_1 = T.buffer_decl([16384], dtype="int8", data=placeholder_0_dm)
+        for i3_0 in T.unroll(2):
+            for i2_0 in T.unroll(2):
+                pad_temp = T.allocate([4096], "int8", "global")
+                pad_temp_1 = T.buffer_decl([4096], dtype="int8", data=pad_temp)
+                for ax0, ax1, ax2 in T.grid(16, 16, 16):
+                    if 6 <= i2_0 * 4 + ax0 and 6 <= i3_0 * 4 + ax1:
+                        pad_temp_1[ax0 * 256 + ax1 * 16 + ax2] = placeholder_0_dm_1[
+                            i2_0 * 2048 + ax0 * 512 + i3_0 * 64 + ax1 * 16 + ax2
+                        ]
+        for i2_0 in T.unroll(2):
+            pad_temp_2 = T.allocate([4096], "int8", "global")
+            pad_temp_3 = T.buffer_decl([4096], dtype="int8", data=pad_temp_2)
+            for ax0, ax1, ax2 in T.grid(16, 16, 16):
+                if 6 <= i2_0 * 4 + ax0:
+                    pad_temp_3[ax0 * 256 + ax1 * 16 + ax2] = placeholder_0_dm_1[
+                        i2_0 * 2048 + ax0 * 512 + ax1 * 16 + ax2 + 128
+                    ]
+        for i3_0 in T.unroll(2):
+            for i2_0 in T.unroll(2):
+                pad_temp_4 = T.allocate([4096], "int8", "global")
+                pad_temp_5 = T.buffer_decl([4096], dtype="int8", data=pad_temp_4)
+                for ax0, ax1, ax2 in T.grid(16, 16, 16):
+                    if 6 <= i2_0 * 4 + ax0 and i3_0 * 4 + ax1 < 14:
+                        pad_temp_5[ax0 * 256 + ax1 * 16 + ax2] = placeholder_0_dm_1[
+                            i2_0 * 2048 + ax0 * 512 + i3_0 * 64 + ax1 * 16 + ax2 + 192
+                        ]
+
+    mod = partition_from_scheduled_tir(
+        main,
+        {
+            "tir.LoopPartition": {
+                "partition_const_loop": True,
+                "unroll_loop_with_partition_hint_no_interval": True,
+            }
+        },
+    )
+    assert tvm.ir.structural_equal(mod["main"], partitioned_main)
+
+
 def test_loop_partition_keep_loop_annotations():
     @T.prim_func
     def before(A: T.Buffer[160, "int32"], B: T.Buffer[160, "int32"]) -> None:

From 012551ffda830d7992a467fce67cdf0ada3a1826 Mon Sep 17 00:00:00 2001
From: Alexey Yazev <113356454+Alexey-Yazev@users.noreply.github.com>
Date: Mon, 5 Dec 2022 15:11:28 +0400
Subject: [PATCH 692/704] [microNPU] Fix cascade scheduling stability (#13428)

For Plans/Proposals added sorting by the number of cycles in case the memory used matches.
---
 src/contrib/ethosu/cascader/pareto.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/contrib/ethosu/cascader/pareto.cc b/src/contrib/ethosu/cascader/pareto.cc
index e40a6602fa2a..5d025b57bbe4 100644
--- a/src/contrib/ethosu/cascader/pareto.cc
+++ b/src/contrib/ethosu/cascader/pareto.cc
@@ -91,6 +91,9 @@ std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans,
   }
 
   std::sort(plans.begin(), plans.end(), [](const Plan& a, const Plan& b) -> bool {
+    if (a->GetMemoryUsage() == b->GetMemoryUsage()) {
+      return a->GetCycles() < b->GetCycles();
+    }
     return a->GetMemoryUsage() < b->GetMemoryUsage();
   });
   std::vector<std::array<float, 2>> costs;
@@ -122,6 +125,9 @@ std::vector<Proposal> ParetoCullProposals(std::vector<Proposal> proposals, size_
   }
 
   std::sort(proposals.begin(), proposals.end(), [](const Proposal& a, const Proposal& b) -> bool {
+    if (a->GetMemoryUsage() == b->GetMemoryUsage()) {
+      return a->GetCycles() < b->GetCycles();
+    }
     return a->GetMemoryUsage() < b->GetMemoryUsage();
   });
   std::vector<std::array<float, 2>> costs;

From 2b110367d1e1df12a3e784b7cdcc1d769c97132c Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Mon, 5 Dec 2022 11:03:00 -0600
Subject: [PATCH 693/704] [microTVM][Arduino]Add `serial_number` to project
 options and tests (#13518)

This PR adds serial_number as a project option to Arduino project API. In addition, it is added to micro pytest_plugin to enable testing with assigned serial number.
---
 .../template_project/microtvm_api_server.py   |  51 +-
 apps/microtvm/poetry.lock                     | 556 +++++++++++++++---
 apps/microtvm/pyproject.toml                  |   1 +
 python/tvm/micro/testing/pytest_plugin.py     |  13 +
 tests/micro/arduino/README.md                 |   7 +
 .../arduino/test_arduino_error_detection.py   |   7 +-
 .../micro/arduino/test_arduino_rpc_server.py  |  91 ++-
 tests/micro/arduino/test_arduino_workflow.py  |   3 +-
 tests/micro/arduino/test_utils.py             |   3 +-
 tests/micro/zephyr/README.md                  |   2 +-
 tests/micro/zephyr/conftest.py                |  10 -
 11 files changed, 635 insertions(+), 109 deletions(-)

diff --git a/apps/microtvm/arduino/template_project/microtvm_api_server.py b/apps/microtvm/arduino/template_project/microtvm_api_server.py
index cb0022b3beee..4975d924dac1 100644
--- a/apps/microtvm/arduino/template_project/microtvm_api_server.py
+++ b/apps/microtvm/arduino/template_project/microtvm_api_server.py
@@ -91,7 +91,20 @@ class BoardAutodetectFailed(Exception):
         optional=["flash", "open_transport"],
         type="int",
         default=None,
-        help="Port to use for connecting to hardware.",
+        help=(
+            "Port to use for connecting to hardware. "
+            "If port and serial_number options are not set it will try to autodetect the port."
+        ),
+    ),
+    server.ProjectOption(
+        "serial_number",
+        optional=["open_transport", "flash"],
+        type="str",
+        default=None,
+        help=(
+            "Board serial number. If both serial_number and port options are set,"
+            " it will throw exception."
+        ),
     ),
 ]
 
@@ -525,6 +538,7 @@ def _parse_connected_boards(self, tabular_str):
             yield device
 
     def _auto_detect_port(self, arduino_cli_cmd: str, board: str) -> str:
+        # It is assumed only one board with this type is connected to this host machine.
         list_cmd = [self._get_arduino_cli_cmd(arduino_cli_cmd), "board", "list"]
         list_cmd_output = subprocess.run(
             list_cmd, check=True, stdout=subprocess.PIPE
@@ -538,10 +552,34 @@ def _auto_detect_port(self, arduino_cli_cmd: str, board: str) -> str:
         # If no compatible boards, raise an error
         raise BoardAutodetectFailed()
 
-    def _get_arduino_port(self, arduino_cli_cmd: str, board: str, port: int):
+    def _get_arduino_port(
+        self, arduino_cli_cmd: str, board: str, port: int = None, serial_number: str = None
+    ):
+        """Returns Arduino serial port.
+        If both port and serial_number are set, it throw Runtime exception.
+        If none of those options are set, it tries to autodetect the serial port.
+        """
+        # TODO: This is to avoid breaking GPU docker on running the tutorials.
+        import serial.tools.list_ports
+
+        if serial_number and port:
+            raise RuntimeError(
+                "port and serial_number cannot be set together. Please set only one."
+            )
+
         if not self._port:
             if port:
                 self._port = port
+            elif serial_number:
+                com_ports = serial.tools.list_ports.comports()
+                for port in com_ports:
+                    if port.serial_number == serial_number:
+                        self._port = port.device
+                        break
+                if not self._port:
+                    raise BoardAutodetectFailed(
+                        f"Detecting port with board serial_number {serial_number} failed."
+                    )
             else:
                 self._port = self._auto_detect_port(arduino_cli_cmd, board)
 
@@ -565,12 +603,14 @@ def flash(self, options):
         warning_as_error = options.get("warning_as_error")
         port = options.get("port")
         board = options.get("board")
+        serial_number = options.get("serial_number")
+
         if not board:
             board = self._get_board_from_makefile(API_SERVER_DIR / MAKEFILE_FILENAME)
 
         cli_command = self._get_arduino_cli_cmd(arduino_cli_cmd)
         self._check_platform_version(cli_command, warning_as_error)
-        port = self._get_arduino_port(cli_command, board, port)
+        port = self._get_arduino_port(cli_command, board, port, serial_number)
 
         upload_cmd = ["make", "flash", f"PORT={port}"]
         for _ in range(self.FLASH_MAX_RETRIES):
@@ -594,6 +634,7 @@ def flash(self, options):
             )
 
     def open_transport(self, options):
+        # TODO: This is to avoid breaking GPU docker on running the tutorials.
         import serial
         import serial.tools.list_ports
 
@@ -601,6 +642,8 @@ def open_transport(self, options):
         arduino_cli_cmd = options.get("arduino_cli_cmd")
         port = options.get("port")
         board = options.get("board")
+        serial_number = options.get("serial_number")
+
         if not board:
             board = self._get_board_from_makefile(API_SERVER_DIR / MAKEFILE_FILENAME)
 
@@ -608,7 +651,7 @@ def open_transport(self, options):
         if self._serial is not None:
             return
 
-        port = self._get_arduino_port(arduino_cli_cmd, board, port)
+        port = self._get_arduino_port(arduino_cli_cmd, board, port, serial_number)
 
         # It takes a moment for the Arduino code to finish initializing
         # and start communicating over serial
diff --git a/apps/microtvm/poetry.lock b/apps/microtvm/poetry.lock
index 3637b69f869c..124d9bd1f78b 100644
--- a/apps/microtvm/poetry.lock
+++ b/apps/microtvm/poetry.lock
@@ -53,7 +53,7 @@ python-versions = ">=3.7"
 typing-extensions = {version = "*", markers = "python_version < \"3.8\""}
 
 [package.extras]
-tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"]
+tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]
 
 [[package]]
 name = "astroid"
@@ -97,8 +97,8 @@ optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 
 [package.extras]
-azure-pipelines = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "pytest-azurepipelines"]
-dev = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "sphinx", "pre-commit"]
+azure-pipelines = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-azurepipelines", "six", "zope.interface"]
+dev = ["coverage", "hypothesis", "pre-commit", "pympler", "pytest (>=4.3.0)", "six", "sphinx", "zope.interface"]
 docs = ["sphinx", "zope.interface"]
 tests = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"]
 
@@ -348,9 +348,9 @@ optional = false
 python-versions = ">=3.7"
 
 [package.extras]
-all = ["fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "zopfli (>=0.1.4)", "lz4 (>=1.7.4.2)", "matplotlib", "sympy", "skia-pathops (>=0.5.0)", "uharfbuzz (>=0.23.0)", "brotlicffi (>=0.8.0)", "scipy", "brotli (>=1.0.1)", "munkres", "unicodedata2 (>=14.0.0)", "xattr"]
+all = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres", "scipy", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.23.0)", "unicodedata2 (>=14.0.0)", "xattr", "zopfli (>=0.1.4)"]
 graphite = ["lz4 (>=1.7.4.2)"]
-interpolatable = ["scipy", "munkres"]
+interpolatable = ["munkres", "scipy"]
 lxml = ["lxml (>=4.0,<5)"]
 pathops = ["skia-pathops (>=0.5.0)"]
 plot = ["matplotlib"]
@@ -359,7 +359,7 @@ symfont = ["sympy"]
 type1 = ["xattr"]
 ufo = ["fs (>=2.2.0,<3)"]
 unicode = ["unicodedata2 (>=14.0.0)"]
-woff = ["zopfli (>=0.1.4)", "brotlicffi (>=0.8.0)", "brotli (>=1.0.1)"]
+woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"]
 
 [[package]]
 name = "future"
@@ -392,7 +392,7 @@ rsa = {version = ">=3.1.4,<5", markers = "python_version >= \"3.6\""}
 six = ">=1.9.0"
 
 [package.extras]
-aiohttp = ["requests (>=2.20.0,<3.0.0dev)", "aiohttp (>=3.6.2,<4.0.0dev)"]
+aiohttp = ["aiohttp (>=3.6.2,<4.0.0dev)", "requests (>=2.20.0,<3.0.0dev)"]
 enterprise_cert = ["cryptography (==36.0.2)", "pyopenssl (==22.0.0)"]
 pyopenssl = ["pyopenssl (>=20.0.0)"]
 reauth = ["pyu2f (>=0.1.5)"]
@@ -432,9 +432,9 @@ optional = true
 python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
 
 [package.extras]
-dev = ["tox (>=3.0)", "flake8", "pep8-naming", "wheel", "twine"]
+dev = ["flake8", "pep8-naming", "tox (>=3.0)", "twine", "wheel"]
 docs = ["sphinx (>=1.3)", "sphinx-rtd-theme"]
-test = ["mock (>=2)", "pytest (>=3.4)", "pytest-mock (>=1.8)", "pytest-cov"]
+test = ["mock (>=2)", "pytest (>=3.4)", "pytest-cov", "pytest-mock (>=1.8)"]
 
 [[package]]
 name = "grpcio"
@@ -503,9 +503,9 @@ typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""}
 zipp = ">=0.5"
 
 [package.extras]
-docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"]
+docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"]
 perf = ["ipython"]
-testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"]
+testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"]
 
 [[package]]
 name = "iniconfig"
@@ -524,10 +524,10 @@ optional = false
 python-versions = ">=3.6.1,<4.0"
 
 [package.extras]
-pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
-requirements_deprecated_finder = ["pipreqs", "pip-api"]
 colors = ["colorama (>=0.4.3,<0.5.0)"]
+pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
 plugins = ["setuptools"]
+requirements_deprecated_finder = ["pip-api", "pipreqs"]
 
 [[package]]
 name = "jinja2"
@@ -564,9 +564,9 @@ numpy = ">=1.9.1"
 six = ">=1.9.0"
 
 [package.extras]
-image = ["scipy (>=0.14)", "Pillow (>=5.2.0)"]
+image = ["Pillow (>=5.2.0)", "scipy (>=0.14)"]
 pep8 = ["flake8"]
-tests = ["pandas", "pillow", "tensorflow", "keras", "pytest", "pytest-xdist", "pytest-cov"]
+tests = ["keras", "pandas", "pillow", "pytest", "pytest-cov", "pytest-xdist", "tensorflow"]
 
 [[package]]
 name = "kiwisolver"
@@ -775,7 +775,7 @@ python-versions = ">=3.5"
 numpy = ">=1.7"
 
 [package.extras]
-docs = ["sphinx (==1.2.3)", "sphinxcontrib-napoleon", "sphinx-rtd-theme", "numpydoc"]
+docs = ["numpydoc", "sphinx (==1.2.3)", "sphinx-rtd-theme", "sphinxcontrib-napoleon"]
 tests = ["pytest", "pytest-cov", "pytest-pep8"]
 
 [[package]]
@@ -814,8 +814,8 @@ optional = false
 python-versions = ">=3.7"
 
 [package.extras]
-docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx-autodoc-typehints (>=1.12)", "sphinx (>=4)"]
-test = ["appdirs (==1.4.4)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)", "pytest (>=6)"]
+docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"]
+test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"]
 
 [[package]]
 name = "pluggy"
@@ -849,7 +849,7 @@ optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 
 [package.extras]
-test = ["ipaddress", "mock", "enum34", "pywin32", "wmi"]
+test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
 
 [[package]]
 name = "py"
@@ -957,7 +957,7 @@ optional = false
 python-versions = ">=3.6.8"
 
 [package.extras]
-diagrams = ["railroad-diagrams", "jinja2"]
+diagrams = ["jinja2", "railroad-diagrams"]
 
 [[package]]
 name = "pyserial"
@@ -1041,6 +1041,14 @@ category = "dev"
 optional = false
 python-versions = "*"
 
+[[package]]
+name = "pyusb"
+version = "1.2.1"
+description = "Python USB access module"
+category = "main"
+optional = false
+python-versions = ">=3.6.0"
+
 [[package]]
 name = "pyyaml"
 version = "6.0"
@@ -1185,8 +1193,8 @@ sphinxcontrib-serializinghtml = "*"
 
 [package.extras]
 docs = ["sphinxcontrib-websupport"]
-lint = ["flake8 (>=3.5.0)", "isort", "mypy (>=0.800)", "docutils-stubs"]
-test = ["pytest", "pytest-cov", "html5lib", "cython", "typed-ast"]
+lint = ["docutils-stubs", "flake8 (>=3.5.0)", "isort", "mypy (>=0.800)"]
+test = ["cython", "html5lib", "pytest", "pytest-cov", "typed-ast"]
 
 [[package]]
 name = "sphinx-gallery"
@@ -1226,7 +1234,7 @@ optional = false
 python-versions = ">=3.5"
 
 [package.extras]
-lint = ["flake8", "mypy", "docutils-stubs"]
+lint = ["docutils-stubs", "flake8", "mypy"]
 test = ["pytest"]
 
 [[package]]
@@ -1238,7 +1246,7 @@ optional = false
 python-versions = ">=3.5"
 
 [package.extras]
-lint = ["flake8", "mypy", "docutils-stubs"]
+lint = ["docutils-stubs", "flake8", "mypy"]
 test = ["pytest"]
 
 [[package]]
@@ -1250,8 +1258,8 @@ optional = false
 python-versions = ">=3.6"
 
 [package.extras]
-lint = ["flake8", "mypy", "docutils-stubs"]
-test = ["pytest", "html5lib"]
+lint = ["docutils-stubs", "flake8", "mypy"]
+test = ["html5lib", "pytest"]
 
 [[package]]
 name = "sphinxcontrib-jsmath"
@@ -1262,7 +1270,7 @@ optional = false
 python-versions = ">=3.5"
 
 [package.extras]
-test = ["pytest", "flake8", "mypy"]
+test = ["flake8", "mypy", "pytest"]
 
 [[package]]
 name = "sphinxcontrib-qthelp"
@@ -1273,7 +1281,7 @@ optional = false
 python-versions = ">=3.5"
 
 [package.extras]
-lint = ["flake8", "mypy", "docutils-stubs"]
+lint = ["docutils-stubs", "flake8", "mypy"]
 test = ["pytest"]
 
 [[package]]
@@ -1285,7 +1293,7 @@ optional = false
 python-versions = ">=3.5"
 
 [package.extras]
-lint = ["flake8", "mypy", "docutils-stubs"]
+lint = ["docutils-stubs", "flake8", "mypy"]
 test = ["pytest"]
 
 [[package]]
@@ -1502,8 +1510,8 @@ optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4"
 
 [package.extras]
-brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"]
-secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"]
+brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"]
+secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"]
 socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
 
 [[package]]
@@ -1541,7 +1549,7 @@ numpy = "*"
 scipy = "*"
 
 [package.extras]
-dask = ["dask", "pandas", "distributed"]
+dask = ["dask", "distributed", "pandas"]
 datatable = ["datatable"]
 pandas = ["pandas"]
 plotting = ["graphviz", "matplotlib"]
@@ -1556,8 +1564,8 @@ optional = false
 python-versions = ">=3.7"
 
 [package.extras]
-docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"]
-testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"]
+docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"]
+testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"]
 
 [extras]
 importer-caffe2 = ["torch"]
@@ -1574,10 +1582,13 @@ xgboost = ["xgboost"]
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.7, <3.9"
-content-hash = "ee9733f342ea5000eb05f186db9986ce80eb31bcb6adbd4139b8968003f0e353"
+content-hash = "ce29099c550654168ca21ab9b5aa3ab1f989da3dfd6e422b9ba2fde30a53cd02"
 
 [metadata.files]
-absl-py = []
+absl-py = [
+    {file = "absl-py-1.2.0.tar.gz", hash = "sha256:f568809938c49abbda89826223c992b630afd23c638160ad7840cfe347710d97"},
+    {file = "absl_py-1.2.0-py3-none-any.whl", hash = "sha256:5d15f85b8cc859c6245bc9886ba664460ed96a6fee895416caa37d669ee74a9a"},
+]
 alabaster = [
     {file = "alabaster-0.7.12-py2.py3-none-any.whl", hash = "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359"},
     {file = "alabaster-0.7.12.tar.gz", hash = "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"},
@@ -1590,12 +1601,17 @@ asgiref = [
     {file = "asgiref-3.5.2-py3-none-any.whl", hash = "sha256:1d2880b792ae8757289136f1db2b7b99100ce959b2aa57fd69dab783d05afac4"},
     {file = "asgiref-3.5.2.tar.gz", hash = "sha256:4a29362a6acebe09bf1d6640db38c1dc3d9217c68e6f9f6204d72667fc19a424"},
 ]
-astroid = []
+astroid = [
+    {file = "astroid-2.11.7-py3-none-any.whl", hash = "sha256:86b0a340a512c65abf4368b80252754cda17c02cdbbd3f587dddf98112233e7b"},
+    {file = "astroid-2.11.7.tar.gz", hash = "sha256:bb24615c77f4837c707669d16907331374ae8a964650a66999da3f5ca68dc946"},
+]
 astunparse = [
     {file = "astunparse-1.6.3-py2.py3-none-any.whl", hash = "sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8"},
     {file = "astunparse-1.6.3.tar.gz", hash = "sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872"},
 ]
-atomicwrites = []
+atomicwrites = [
+    {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"},
+]
 attrs = [
     {file = "attrs-19.3.0-py2.py3-none-any.whl", hash = "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c"},
     {file = "attrs-19.3.0.tar.gz", hash = "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"},
@@ -1606,8 +1622,14 @@ autodocsumm = [
 autoflake = [
     {file = "autoflake-1.4.tar.gz", hash = "sha256:61a353012cff6ab94ca062823d1fb2f692c4acda51c76ff83a8d77915fba51ea"},
 ]
-autopep8 = []
-babel = []
+autopep8 = [
+    {file = "autopep8-1.7.0-py2.py3-none-any.whl", hash = "sha256:6f09e90a2be784317e84dc1add17ebfc7abe3924239957a37e5040e27d812087"},
+    {file = "autopep8-1.7.0.tar.gz", hash = "sha256:ca9b1a83e53a7fad65d731dc7a2a2d50aa48f43850407c59f6a1a306c4201142"},
+]
+babel = [
+    {file = "Babel-2.10.3-py3-none-any.whl", hash = "sha256:ff56f4892c1c4bf0d814575ea23471c230d544203c7748e8c68f0089478d48eb"},
+    {file = "Babel-2.10.3.tar.gz", hash = "sha256:7614553711ee97490f732126dc077f8d0ae084ebc6a96e23db1482afabdb2c51"},
+]
 black = [
     {file = "black-19.10b0-py36-none-any.whl", hash = "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b"},
     {file = "black-19.10b0.tar.gz", hash = "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"},
@@ -1616,9 +1638,80 @@ cachetools = [
     {file = "cachetools-5.2.0-py3-none-any.whl", hash = "sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db"},
     {file = "cachetools-5.2.0.tar.gz", hash = "sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757"},
 ]
-certifi = []
-cffi = []
-charset-normalizer = []
+certifi = [
+    {file = "certifi-2022.6.15-py3-none-any.whl", hash = "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"},
+    {file = "certifi-2022.6.15.tar.gz", hash = "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d"},
+]
+cffi = [
+    {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"},
+    {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"},
+    {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"},
+    {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"},
+    {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"},
+    {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"},
+    {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"},
+    {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"},
+    {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"},
+    {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"},
+    {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"},
+    {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"},
+    {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"},
+    {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"},
+    {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"},
+    {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"},
+    {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"},
+    {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"},
+    {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"},
+    {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"},
+    {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"},
+    {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"},
+    {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"},
+    {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"},
+    {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"},
+    {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"},
+    {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"},
+    {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"},
+    {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"},
+    {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"},
+    {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"},
+    {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"},
+    {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"},
+    {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"},
+    {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"},
+    {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"},
+    {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"},
+    {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"},
+    {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"},
+    {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"},
+    {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"},
+    {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"},
+    {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"},
+    {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"},
+    {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"},
+    {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"},
+    {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"},
+    {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"},
+    {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"},
+    {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"},
+    {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"},
+    {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"},
+    {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"},
+    {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"},
+    {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"},
+    {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"},
+    {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"},
+    {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"},
+    {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"},
+    {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"},
+    {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"},
+    {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"},
+    {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"},
+    {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"},
+]
+charset-normalizer = [
+    {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"},
+    {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"},
+]
 click = [
     {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"},
     {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"},
@@ -1669,17 +1762,29 @@ dill = [
     {file = "dill-0.3.5.1-py2.py3-none-any.whl", hash = "sha256:33501d03270bbe410c72639b350e941882a8b0fd55357580fbc873fba0c59302"},
     {file = "dill-0.3.5.1.tar.gz", hash = "sha256:d75e41f3eff1eee599d738e76ba8f4ad98ea229db8b085318aa2b3333a208c86"},
 ]
-django = []
+django = [
+    {file = "Django-3.2.15-py3-none-any.whl", hash = "sha256:115baf5049d5cf4163e43492cdc7139c306ed6d451e7d3571fe9612903903713"},
+    {file = "Django-3.2.15.tar.gz", hash = "sha256:f71934b1a822f14a86c9ac9634053689279cd04ae69cb6ade4a59471b886582b"},
+]
 docformatter = [
     {file = "docformatter-1.4.tar.gz", hash = "sha256:064e6d81f04ac96bc0d176cbaae953a0332482b22d3ad70d47c8a7f2732eef6f"},
 ]
-docutils = []
-execnet = []
+docutils = [
+    {file = "docutils-0.19-py3-none-any.whl", hash = "sha256:5e1de4d849fee02c63b040a4a3fd567f4ab104defd8a5511fbbc24a8a017efbc"},
+    {file = "docutils-0.19.tar.gz", hash = "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6"},
+]
+execnet = [
+    {file = "execnet-1.9.0-py2.py3-none-any.whl", hash = "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"},
+    {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"},
+]
 flatbuffers = [
     {file = "flatbuffers-2.0-py2.py3-none-any.whl", hash = "sha256:3751954f0604580d3219ae49a85fafec9d85eec599c0b96226e1bc0b48e57474"},
     {file = "flatbuffers-2.0.tar.gz", hash = "sha256:12158ab0272375eab8db2d663ae97370c33f152b27801fa6024e1d6105fd4dd2"},
 ]
-fonttools = []
+fonttools = [
+    {file = "fonttools-4.36.0-py3-none-any.whl", hash = "sha256:cb91ef8d5a435d90aeb3ab814b2548c6b515df5bc13b4c5adaa23778f2f79823"},
+    {file = "fonttools-4.36.0.zip", hash = "sha256:e637d2fe06bddabbfc488e02ef32d04d561e3c71e9ba11abc7782ea753ceb218"},
+]
 future = [
     {file = "future-0.18.2.tar.gz", hash = "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"},
 ]
@@ -1687,7 +1792,10 @@ gast = [
     {file = "gast-0.4.0-py3-none-any.whl", hash = "sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4"},
     {file = "gast-0.4.0.tar.gz", hash = "sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1"},
 ]
-google-auth = []
+google-auth = [
+    {file = "google-auth-2.10.0.tar.gz", hash = "sha256:7904dbd44b745c7323fef29565adee2fe7ff48473e2d94443aced40b0404a395"},
+    {file = "google_auth-2.10.0-py2.py3-none-any.whl", hash = "sha256:1deba4a54f95ef67b4139eaf5c20eaa7047215eec9f6a2344599b8596db8863b"},
+]
 google-auth-oauthlib = [
     {file = "google-auth-oauthlib-0.4.6.tar.gz", hash = "sha256:a90a072f6993f2c327067bf65270046384cda5a8ecb20b94ea9a687f1f233a7a"},
     {file = "google_auth_oauthlib-0.4.6-py2.py3-none-any.whl", hash = "sha256:3f2a6e802eebbb6fb736a370fbf3b055edcb6b52878bf2f26330b5e041316c73"},
@@ -1701,7 +1809,54 @@ graphviz = [
     {file = "graphviz-0.8.4-py2.py3-none-any.whl", hash = "sha256:7caa53f0b0be42c5f2eaa3f3d71dcc863b15bacceb5d531c2ad7519e1980ff82"},
     {file = "graphviz-0.8.4.zip", hash = "sha256:4958a19cbd8461757a08db308a4a15c3d586660417e1e364f0107d2fe481689f"},
 ]
-grpcio = []
+grpcio = [
+    {file = "grpcio-1.48.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:4a049a032144641ed5d073535c0dc69eb6029187cc729a66946c86dcc8eec3a1"},
+    {file = "grpcio-1.48.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:f8bc76f5cd95f5476e5285fe5d3704a9332586a569fbbccef551b0b6f7a270f9"},
+    {file = "grpcio-1.48.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:448d397fe88e9fef8170f019b86abdc4d554ae311aaf4dbff1532fde227d3308"},
+    {file = "grpcio-1.48.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8f9b6b6f7c83869d2316c5d13f953381881a16741275a34ec5ed5762f11b206e"},
+    {file = "grpcio-1.48.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5bd8541c4b6b43c9024496d30b4a12346325d3a17a1f3c80ad8924caed1e35c3"},
+    {file = "grpcio-1.48.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:877d33aeba05ae0b9e81761a694914ed33613f655c35f6bbcf4ebbcb984e0167"},
+    {file = "grpcio-1.48.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cd01a8201fd8ab2ce496f7e65975da1f1e629eac8eea84ead0fd77e32e4350cd"},
+    {file = "grpcio-1.48.0-cp310-cp310-win32.whl", hash = "sha256:0388da923dff58ba7f711233e41c2b749b5817b8e0f137a107672d9c15a1009c"},
+    {file = "grpcio-1.48.0-cp310-cp310-win_amd64.whl", hash = "sha256:8dcffdb8921fd88857ae350fd579277a5f9315351e89ed9094ef28927a46d40d"},
+    {file = "grpcio-1.48.0-cp36-cp36m-linux_armv7l.whl", hash = "sha256:2138c50331232f56178c2b36dcfa6ad67aad705fe410955f3b2a53d722191b89"},
+    {file = "grpcio-1.48.0-cp36-cp36m-macosx_10_10_x86_64.whl", hash = "sha256:af2d80f142da2a6af45204a5ca2374e2747af07a99de54a1164111e169a761ff"},
+    {file = "grpcio-1.48.0-cp36-cp36m-manylinux_2_17_aarch64.whl", hash = "sha256:59284bd4cdf47c147c26d91aca693765318d524328f6ece2a1a0b85a12a362af"},
+    {file = "grpcio-1.48.0-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3ebfe356c0c6750379cd194bf2b7e5d1d2f29db1832358f05a73e9290db98c"},
+    {file = "grpcio-1.48.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc2619a31339e1c53731f54761f1a2cb865d3421f690e00ef3e92f90d2a0c5ae"},
+    {file = "grpcio-1.48.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:7df637405de328a54c1c8c08a3206f974c7a577730f90644af4c3400b7bfde2d"},
+    {file = "grpcio-1.48.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:9e73b95969a579798bfbeb85d376695cce5172357fb52e450467ceb8e7365152"},
+    {file = "grpcio-1.48.0-cp36-cp36m-win32.whl", hash = "sha256:059e9d58b5aba7fb9eabe3a4d2ac49e1dcbc2b54b0f166f6475e40b7f4435343"},
+    {file = "grpcio-1.48.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7cebcf645170f0c82ef71769544f9ac4515993a4d367f5900aba2eb4ecd2a32f"},
+    {file = "grpcio-1.48.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:8af3a8845df35b838104d6fb1ae7f4969d248cf037fa2794916d31e917346f72"},
+    {file = "grpcio-1.48.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:a1ef40975ec9ced6c17ce7fbec9825823da782fa606f0b92392646ff3886f198"},
+    {file = "grpcio-1.48.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:7cccbf6db31f2a78e1909047ff69620f94a4e6e53251858e9502fbbff5714b48"},
+    {file = "grpcio-1.48.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f3f142579f58def64c0850f0bb0eb1b425ae885f5669dda5b73ade64ad2b753"},
+    {file = "grpcio-1.48.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:656c6f6f7b815bca3054780b8cdfa1e4e37cd36c887a48558d00c2cf85f31697"},
+    {file = "grpcio-1.48.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:cba4538e8a2ef123ea570e7b1d62162e158963c2471e35d79eb9690c971a10c0"},
+    {file = "grpcio-1.48.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:9daa67820fafceec6194ed1686c1783816e62d6756ff301ba93e682948836846"},
+    {file = "grpcio-1.48.0-cp37-cp37m-win32.whl", hash = "sha256:7ec264a7fb413e0c804a7a48a6f7d7212742955a60724c44d793da35a8f30873"},
+    {file = "grpcio-1.48.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a2b1b33b92359388b8164807313dcbb3317101b038a5d54342982560329d958f"},
+    {file = "grpcio-1.48.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:7b820696a5ce7b98f459f234698cb323f89b355373789188efa126d7f47a2a92"},
+    {file = "grpcio-1.48.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:e4dfae66ebc165c46c5b7048eb554472ee72fbaab2c2c2da7f9b1621c81e077c"},
+    {file = "grpcio-1.48.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:f7115038edce33b494e0138b0bd31a2eb6595d45e2eed23be46bc32886feb741"},
+    {file = "grpcio-1.48.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b4e996282238943ca114628255be61980e38b25f73a08ae2ffd02b63eaf70d3a"},
+    {file = "grpcio-1.48.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13dad31f5155fa555d393511cc8108c41b1b5b54dc4c24c27d4694ddd7a78fad"},
+    {file = "grpcio-1.48.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:c84b9d90b2641963de98b35bb7a2a51f78119fe5bd00ef27246ba9f4f0835e36"},
+    {file = "grpcio-1.48.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:41b65166779d7dafac4c98380ac19f690f1c5fe18083a71d370df87b24dd30ff"},
+    {file = "grpcio-1.48.0-cp38-cp38-win32.whl", hash = "sha256:b890e5f5fbc21cb994894f73ecb2faaa66697d8debcb228a5adb0622b9bec3b2"},
+    {file = "grpcio-1.48.0-cp38-cp38-win_amd64.whl", hash = "sha256:5fe3af539d2f50891ed93aed3064ffbcc38bf848aa3f7ed1fbedcce139c57302"},
+    {file = "grpcio-1.48.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:a4ed57f4e3d91259551e6765782b22d9e8b8178fec43ebf8e1b2c392c4ced37b"},
+    {file = "grpcio-1.48.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:60843d8184e171886dd7a93d6672e2ef0b08dfd4f88da7421c10b46b6e031ac4"},
+    {file = "grpcio-1.48.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:0ecba22f25ccde2442be7e7dd7fa746905d628f03312b4a0c9961f0d99771f53"},
+    {file = "grpcio-1.48.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34f5917f0c49a04633dc12d483c8aee6f6d9f69133b700214d3703f72a72f501"},
+    {file = "grpcio-1.48.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f4c4ad8ad7e2cf3a272cbc96734d56635e6543939022f17e0c4487f7d2a45bf9"},
+    {file = "grpcio-1.48.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:111fb2f5f4a069f331ae23106145fd16dd4e1112ca223858a922068614dac6d2"},
+    {file = "grpcio-1.48.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:beb0573daa49889efcfea0a6e995b4f39d481aa1b94e1257617406ef417b56a6"},
+    {file = "grpcio-1.48.0-cp39-cp39-win32.whl", hash = "sha256:ce70254a082cb767217b2fdee374cc79199d338d46140753438cd6d67c609b2f"},
+    {file = "grpcio-1.48.0-cp39-cp39-win_amd64.whl", hash = "sha256:ae3fd135666448058fe277d93c10e0f18345fbcbb015c4642de2fa3db6f0c205"},
+    {file = "grpcio-1.48.0.tar.gz", hash = "sha256:eaf4bb73819863440727195411ab3b5c304f6663625e66f348e91ebe0a039306"},
+]
 h5py = [
     {file = "h5py-3.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d77af42cb751ad6cc44f11bae73075a07429a5cf2094dfde2b1e716e059b3911"},
     {file = "h5py-3.7.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:63beb8b7b47d0896c50de6efb9a1eaa81dbe211f3767e7dd7db159cea51ba37a"},
@@ -1731,7 +1886,10 @@ idna = [
 image = [
     {file = "image-1.5.33.tar.gz", hash = "sha256:baa2e09178277daa50f22fd6d1d51ec78f19c12688921cb9ab5808743f097126"},
 ]
-imagesize = []
+imagesize = [
+    {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"},
+    {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"},
+]
 importlib-metadata = [
     {file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"},
     {file = "importlib_metadata-4.12.0.tar.gz", hash = "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670"},
@@ -1755,7 +1913,76 @@ keras-preprocessing = [
     {file = "Keras_Preprocessing-1.1.2-py2.py3-none-any.whl", hash = "sha256:7b82029b130ff61cc99b55f3bd27427df4838576838c5b2f65940e4fcec99a7b"},
     {file = "Keras_Preprocessing-1.1.2.tar.gz", hash = "sha256:add82567c50c8bc648c14195bf544a5ce7c1f76761536956c3d2978970179ef3"},
 ]
-kiwisolver = []
+kiwisolver = [
+    {file = "kiwisolver-1.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2f5e60fabb7343a836360c4f0919b8cd0d6dbf08ad2ca6b9cf90bf0c76a3c4f6"},
+    {file = "kiwisolver-1.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:10ee06759482c78bdb864f4109886dff7b8a56529bc1609d4f1112b93fe6423c"},
+    {file = "kiwisolver-1.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c79ebe8f3676a4c6630fd3f777f3cfecf9289666c84e775a67d1d358578dc2e3"},
+    {file = "kiwisolver-1.4.4-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:abbe9fa13da955feb8202e215c4018f4bb57469b1b78c7a4c5c7b93001699938"},
+    {file = "kiwisolver-1.4.4-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7577c1987baa3adc4b3c62c33bd1118c3ef5c8ddef36f0f2c950ae0b199e100d"},
+    {file = "kiwisolver-1.4.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8ad8285b01b0d4695102546b342b493b3ccc6781fc28c8c6a1bb63e95d22f09"},
+    {file = "kiwisolver-1.4.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8ed58b8acf29798b036d347791141767ccf65eee7f26bde03a71c944449e53de"},
+    {file = "kiwisolver-1.4.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a68b62a02953b9841730db7797422f983935aeefceb1679f0fc85cbfbd311c32"},
+    {file = "kiwisolver-1.4.4-cp310-cp310-win32.whl", hash = "sha256:e92a513161077b53447160b9bd8f522edfbed4bd9759e4c18ab05d7ef7e49408"},
+    {file = "kiwisolver-1.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:3fe20f63c9ecee44560d0e7f116b3a747a5d7203376abeea292ab3152334d004"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e0ea21f66820452a3f5d1655f8704a60d66ba1191359b96541eaf457710a5fc6"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bc9db8a3efb3e403e4ecc6cd9489ea2bac94244f80c78e27c31dcc00d2790ac2"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d5b61785a9ce44e5a4b880272baa7cf6c8f48a5180c3e81c59553ba0cb0821ca"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c2dbb44c3f7e6c4d3487b31037b1bdbf424d97687c1747ce4ff2895795c9bf69"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6295ecd49304dcf3bfbfa45d9a081c96509e95f4b9d0eb7ee4ec0530c4a96514"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bd472dbe5e136f96a4b18f295d159d7f26fd399136f5b17b08c4e5f498cd494"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bf7d9fce9bcc4752ca4a1b80aabd38f6d19009ea5cbda0e0856983cf6d0023f5"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d6601aed50c74e0ef02f4204da1816147a6d3fbdc8b3872d263338a9052c51"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:877272cf6b4b7e94c9614f9b10140e198d2186363728ed0f701c6eee1baec1da"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:db608a6757adabb32f1cfe6066e39b3706d8c3aa69bbc353a5b61edad36a5cb4"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:5853eb494c71e267912275e5586fe281444eb5e722de4e131cddf9d442615626"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:f0a1dbdb5ecbef0d34eb77e56fcb3e95bbd7e50835d9782a45df81cc46949750"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:283dffbf061a4ec60391d51e6155e372a1f7a4f5b15d59c8505339454f8989e4"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-win32.whl", hash = "sha256:d06adcfa62a4431d404c31216f0f8ac97397d799cd53800e9d3efc2fbb3cf14e"},
+    {file = "kiwisolver-1.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:e7da3fec7408813a7cebc9e4ec55afed2d0fd65c4754bc376bf03498d4e92686"},
+    {file = "kiwisolver-1.4.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:62ac9cc684da4cf1778d07a89bf5f81b35834cb96ca523d3a7fb32509380cbf6"},
+    {file = "kiwisolver-1.4.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41dae968a94b1ef1897cb322b39360a0812661dba7c682aa45098eb8e193dbdf"},
+    {file = "kiwisolver-1.4.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:02f79693ec433cb4b5f51694e8477ae83b3205768a6fb48ffba60549080e295b"},
+    {file = "kiwisolver-1.4.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0611a0a2a518464c05ddd5a3a1a0e856ccc10e67079bb17f265ad19ab3c7597"},
+    {file = "kiwisolver-1.4.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:db5283d90da4174865d520e7366801a93777201e91e79bacbac6e6927cbceede"},
+    {file = "kiwisolver-1.4.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1041feb4cda8708ce73bb4dcb9ce1ccf49d553bf87c3954bdfa46f0c3f77252c"},
+    {file = "kiwisolver-1.4.4-cp37-cp37m-win32.whl", hash = "sha256:a553dadda40fef6bfa1456dc4be49b113aa92c2a9a9e8711e955618cd69622e3"},
+    {file = "kiwisolver-1.4.4-cp37-cp37m-win_amd64.whl", hash = "sha256:03baab2d6b4a54ddbb43bba1a3a2d1627e82d205c5cf8f4c924dc49284b87166"},
+    {file = "kiwisolver-1.4.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:841293b17ad704d70c578f1f0013c890e219952169ce8a24ebc063eecf775454"},
+    {file = "kiwisolver-1.4.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f4f270de01dd3e129a72efad823da90cc4d6aafb64c410c9033aba70db9f1ff0"},
+    {file = "kiwisolver-1.4.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f9f39e2f049db33a908319cf46624a569b36983c7c78318e9726a4cb8923b26c"},
+    {file = "kiwisolver-1.4.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c97528e64cb9ebeff9701e7938653a9951922f2a38bd847787d4a8e498cc83ae"},
+    {file = "kiwisolver-1.4.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d1573129aa0fd901076e2bfb4275a35f5b7aa60fbfb984499d661ec950320b0"},
+    {file = "kiwisolver-1.4.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ad881edc7ccb9d65b0224f4e4d05a1e85cf62d73aab798943df6d48ab0cd79a1"},
+    {file = "kiwisolver-1.4.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b428ef021242344340460fa4c9185d0b1f66fbdbfecc6c63eff4b7c29fad429d"},
+    {file = "kiwisolver-1.4.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:2e407cb4bd5a13984a6c2c0fe1845e4e41e96f183e5e5cd4d77a857d9693494c"},
+    {file = "kiwisolver-1.4.4-cp38-cp38-win32.whl", hash = "sha256:75facbe9606748f43428fc91a43edb46c7ff68889b91fa31f53b58894503a191"},
+    {file = "kiwisolver-1.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:5bce61af018b0cb2055e0e72e7d65290d822d3feee430b7b8203d8a855e78766"},
+    {file = "kiwisolver-1.4.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8c808594c88a025d4e322d5bb549282c93c8e1ba71b790f539567932722d7bd8"},
+    {file = "kiwisolver-1.4.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f0a71d85ecdd570ded8ac3d1c0f480842f49a40beb423bb8014539a9f32a5897"},
+    {file = "kiwisolver-1.4.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b533558eae785e33e8c148a8d9921692a9fe5aa516efbdff8606e7d87b9d5824"},
+    {file = "kiwisolver-1.4.4-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:efda5fc8cc1c61e4f639b8067d118e742b812c930f708e6667a5ce0d13499e29"},
+    {file = "kiwisolver-1.4.4-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7c43e1e1206cd421cd92e6b3280d4385d41d7166b3ed577ac20444b6995a445f"},
+    {file = "kiwisolver-1.4.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc8d3bd6c72b2dd9decf16ce70e20abcb3274ba01b4e1c96031e0c4067d1e7cd"},
+    {file = "kiwisolver-1.4.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4ea39b0ccc4f5d803e3337dd46bcce60b702be4d86fd0b3d7531ef10fd99a1ac"},
+    {file = "kiwisolver-1.4.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968f44fdbf6dd757d12920d63b566eeb4d5b395fd2d00d29d7ef00a00582aac9"},
+    {file = "kiwisolver-1.4.4-cp39-cp39-win32.whl", hash = "sha256:da7e547706e69e45d95e116e6939488d62174e033b763ab1496b4c29b76fabea"},
+    {file = "kiwisolver-1.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:ba59c92039ec0a66103b1d5fe588fa546373587a7d68f5c96f743c3396afc04b"},
+    {file = "kiwisolver-1.4.4-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:91672bacaa030f92fc2f43b620d7b337fd9a5af28b0d6ed3f77afc43c4a64b5a"},
+    {file = "kiwisolver-1.4.4-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:787518a6789009c159453da4d6b683f468ef7a65bbde796bcea803ccf191058d"},
+    {file = "kiwisolver-1.4.4-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da152d8cdcab0e56e4f45eb08b9aea6455845ec83172092f09b0e077ece2cf7a"},
+    {file = "kiwisolver-1.4.4-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:ecb1fa0db7bf4cff9dac752abb19505a233c7f16684c5826d1f11ebd9472b871"},
+    {file = "kiwisolver-1.4.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:28bc5b299f48150b5f822ce68624e445040595a4ac3d59251703779836eceff9"},
+    {file = "kiwisolver-1.4.4-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:81e38381b782cc7e1e46c4e14cd997ee6040768101aefc8fa3c24a4cc58e98f8"},
+    {file = "kiwisolver-1.4.4-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2a66fdfb34e05b705620dd567f5a03f239a088d5a3f321e7b6ac3239d22aa286"},
+    {file = "kiwisolver-1.4.4-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:872b8ca05c40d309ed13eb2e582cab0c5a05e81e987ab9c521bf05ad1d5cf5cb"},
+    {file = "kiwisolver-1.4.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:70e7c2e7b750585569564e2e5ca9845acfaa5da56ac46df68414f29fea97be9f"},
+    {file = "kiwisolver-1.4.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9f85003f5dfa867e86d53fac6f7e6f30c045673fa27b603c397753bebadc3008"},
+    {file = "kiwisolver-1.4.4-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e307eb9bd99801f82789b44bb45e9f541961831c7311521b13a6c85afc09767"},
+    {file = "kiwisolver-1.4.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1792d939ec70abe76f5054d3f36ed5656021dcad1322d1cc996d4e54165cef9"},
+    {file = "kiwisolver-1.4.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6cb459eea32a4e2cf18ba5fcece2dbdf496384413bc1bae15583f19e567f3b2"},
+    {file = "kiwisolver-1.4.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:36dafec3d6d6088d34e2de6b85f9d8e2324eb734162fba59d2ba9ed7a2043d5b"},
+    {file = "kiwisolver-1.4.4.tar.gz", hash = "sha256:d41997519fcba4a1e46eb4a2fe31bc12f0ff957b2b81bac28db24744f333e955"},
+]
 lazy-object-proxy = [
     {file = "lazy-object-proxy-1.7.1.tar.gz", hash = "sha256:d609c75b986def706743cdebe5e47553f4a5a1da9c5ff66d76013ef396b5a8a4"},
     {file = "lazy_object_proxy-1.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bb8c5fd1684d60a9902c60ebe276da1f2281a318ca16c1d0a96db28f62e9166b"},
@@ -1795,8 +2022,22 @@ lazy-object-proxy = [
     {file = "lazy_object_proxy-1.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:677ea950bef409b47e51e733283544ac3d660b709cfce7b187f5ace137960d61"},
     {file = "lazy_object_proxy-1.7.1-pp37.pp38-none-any.whl", hash = "sha256:d66906d5785da8e0be7360912e99c9188b70f52c422f9fc18223347235691a84"},
 ]
-libclang = []
-markdown = []
+libclang = [
+    {file = "libclang-14.0.6-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:8791cf3c3b087c373a6d61e9199da7a541da922c9ddcfed1122090586b996d6e"},
+    {file = "libclang-14.0.6-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:7b06fc76bd1e67c8b04b5719bf2ac5d6a323b289b245dfa9e468561d99538188"},
+    {file = "libclang-14.0.6-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:e429853939423f276a25140b0b702442d7da9a09e001c05e48df888336947614"},
+    {file = "libclang-14.0.6-py2.py3-none-manylinux2010_x86_64.whl", hash = "sha256:206d2789e4450a37d054e63b70451a6fc1873466397443fa13de2b3d4adb2796"},
+    {file = "libclang-14.0.6-py2.py3-none-manylinux2014_aarch64.whl", hash = "sha256:e2add1703129b2abe066fb1890afa880870a89fd6ab4ec5d2a7a8dc8d271677e"},
+    {file = "libclang-14.0.6-py2.py3-none-manylinux2014_armv7l.whl", hash = "sha256:5dd3c6fca1b007d308a4114afa8e4e9d32f32b2572520701d45fcc626ac5cd6c"},
+    {file = "libclang-14.0.6-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:cfb0e892ebb5dff6bd498ab5778adb8581f26a00fd8347b3c76c989fe2fd04f7"},
+    {file = "libclang-14.0.6-py2.py3-none-win_amd64.whl", hash = "sha256:ea03c12675151837660cdd5dce65bd89320896ac3421efef43a36678f113ce95"},
+    {file = "libclang-14.0.6-py2.py3-none-win_arm64.whl", hash = "sha256:2e4303e04517fcd11173cb2e51a7070eed71e16ef45d4e26a82c5e881cac3d27"},
+    {file = "libclang-14.0.6.tar.gz", hash = "sha256:9052a8284d8846984f6fa826b1d7460a66d3b23a486d782633b42b6e3b418789"},
+]
+markdown = [
+    {file = "Markdown-3.4.1-py3-none-any.whl", hash = "sha256:08fb8465cffd03d10b9dd34a5c3fea908e20391a2a90b88d66362cb05beed186"},
+    {file = "Markdown-3.4.1.tar.gz", hash = "sha256:3b809086bb6efad416156e00a0da66fe47618a5d6918dd688f53f40c8e4cfeff"},
+]
 markupsafe = [
     {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"},
     {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"},
@@ -1839,7 +2080,43 @@ markupsafe = [
     {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"},
     {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"},
 ]
-matplotlib = []
+matplotlib = [
+    {file = "matplotlib-3.5.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a206a1b762b39398efea838f528b3a6d60cdb26fe9d58b48265787e29cd1d693"},
+    {file = "matplotlib-3.5.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cd45a6f3e93a780185f70f05cf2a383daed13c3489233faad83e81720f7ede24"},
+    {file = "matplotlib-3.5.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d62880e1f60e5a30a2a8484432bcb3a5056969dc97258d7326ad465feb7ae069"},
+    {file = "matplotlib-3.5.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ab29589cef03bc88acfa3a1490359000c18186fc30374d8aa77d33cc4a51a4a"},
+    {file = "matplotlib-3.5.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2886cc009f40e2984c083687251821f305d811d38e3df8ded414265e4583f0c5"},
+    {file = "matplotlib-3.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c995f7d9568f18b5db131ab124c64e51b6820a92d10246d4f2b3f3a66698a15b"},
+    {file = "matplotlib-3.5.3-cp310-cp310-win32.whl", hash = "sha256:6bb93a0492d68461bd458eba878f52fdc8ac7bdb6c4acdfe43dba684787838c2"},
+    {file = "matplotlib-3.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:2e6d184ebe291b9e8f7e78bbab7987d269c38ea3e062eace1fe7d898042ef804"},
+    {file = "matplotlib-3.5.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6ea6aef5c4338e58d8d376068e28f80a24f54e69f09479d1c90b7172bad9f25b"},
+    {file = "matplotlib-3.5.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:839d47b8ead7ad9669aaacdbc03f29656dc21f0d41a6fea2d473d856c39c8b1c"},
+    {file = "matplotlib-3.5.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3b4fa56159dc3c7f9250df88f653f085068bcd32dcd38e479bba58909254af7f"},
+    {file = "matplotlib-3.5.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:94ff86af56a3869a4ae26a9637a849effd7643858a1a04dd5ee50e9ab75069a7"},
+    {file = "matplotlib-3.5.3-cp37-cp37m-win32.whl", hash = "sha256:35a8ad4dddebd51f94c5d24bec689ec0ec66173bf614374a1244c6241c1595e0"},
+    {file = "matplotlib-3.5.3-cp37-cp37m-win_amd64.whl", hash = "sha256:43e9d3fa077bf0cc95ded13d331d2156f9973dce17c6f0c8b49ccd57af94dbd9"},
+    {file = "matplotlib-3.5.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:22227c976ad4dc8c5a5057540421f0d8708c6560744ad2ad638d48e2984e1dbc"},
+    {file = "matplotlib-3.5.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bf618a825deb6205f015df6dfe6167a5d9b351203b03fab82043ae1d30f16511"},
+    {file = "matplotlib-3.5.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9befa5954cdbc085e37d974ff6053da269474177921dd61facdad8023c4aeb51"},
+    {file = "matplotlib-3.5.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3840c280ebc87a48488a46f760ea1c0c0c83fcf7abbe2e6baf99d033fd35fd8"},
+    {file = "matplotlib-3.5.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dacddf5bfcec60e3f26ec5c0ae3d0274853a258b6c3fc5ef2f06a8eb23e042be"},
+    {file = "matplotlib-3.5.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b428076a55fb1c084c76cb93e68006f27d247169f056412607c5c88828d08f88"},
+    {file = "matplotlib-3.5.3-cp38-cp38-win32.whl", hash = "sha256:874df7505ba820e0400e7091199decf3ff1fde0583652120c50cd60d5820ca9a"},
+    {file = "matplotlib-3.5.3-cp38-cp38-win_amd64.whl", hash = "sha256:b28de401d928890187c589036857a270a032961411934bdac4cf12dde3d43094"},
+    {file = "matplotlib-3.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3211ba82b9f1518d346f6309df137b50c3dc4421b4ed4815d1d7eadc617f45a1"},
+    {file = "matplotlib-3.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6fe807e8a22620b4cd95cfbc795ba310dc80151d43b037257250faf0bfcd82bc"},
+    {file = "matplotlib-3.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5c096363b206a3caf43773abebdbb5a23ea13faef71d701b21a9c27fdcef72f4"},
+    {file = "matplotlib-3.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bcdfcb0f976e1bac6721d7d457c17be23cf7501f977b6a38f9d38a3762841f7"},
+    {file = "matplotlib-3.5.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1e64ac9be9da6bfff0a732e62116484b93b02a0b4d4b19934fb4f8e7ad26ad6a"},
+    {file = "matplotlib-3.5.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:73dd93dc35c85dece610cca8358003bf0760d7986f70b223e2306b4ea6d1406b"},
+    {file = "matplotlib-3.5.3-cp39-cp39-win32.whl", hash = "sha256:879c7e5fce4939c6aa04581dfe08d57eb6102a71f2e202e3314d5fbc072fd5a0"},
+    {file = "matplotlib-3.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:ab8d26f07fe64f6f6736d635cce7bfd7f625320490ed5bfc347f2cdb4fae0e56"},
+    {file = "matplotlib-3.5.3-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:99482b83ebf4eb6d5fc6813d7aacdefdd480f0d9c0b52dcf9f1cc3b2c4b3361a"},
+    {file = "matplotlib-3.5.3-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f814504e459c68118bf2246a530ed953ebd18213dc20e3da524174d84ed010b2"},
+    {file = "matplotlib-3.5.3-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57f1b4e69f438a99bb64d7f2c340db1b096b41ebaa515cf61ea72624279220ce"},
+    {file = "matplotlib-3.5.3-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:d2484b350bf3d32cae43f85dcfc89b3ed7bd2bcd781ef351f93eb6fb2cc483f9"},
+    {file = "matplotlib-3.5.3.tar.gz", hash = "sha256:339cac48b80ddbc8bfd05daae0a3a73414651a8596904c2a881cfd1edb65f26c"},
+]
 mccabe = [
     {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"},
     {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
@@ -2139,16 +2416,25 @@ pyasn1-modules = [
     {file = "pyasn1_modules-0.2.8-py3.6.egg", hash = "sha256:cbac4bc38d117f2a49aeedec4407d23e8866ea4ac27ff2cf7fb3e5b570df19e0"},
     {file = "pyasn1_modules-0.2.8-py3.7.egg", hash = "sha256:c29a5e5cc7a3f05926aff34e097e84f8589cd790ce0ed41b67aed6857b26aafd"},
 ]
-pycodestyle = []
+pycodestyle = [
+    {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"},
+    {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"},
+]
 pycparser = [
     {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"},
     {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
 ]
-pyflakes = []
+pyflakes = [
+    {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"},
+    {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"},
+]
 pyformat = [
     {file = "pyformat-0.7.tar.gz", hash = "sha256:eb7b0e93f768c6f92e2cb06307deaa3a5141c7c61cd472b1a7918e30d09df20f"},
 ]
-pygments = []
+pygments = [
+    {file = "Pygments-2.13.0-py3-none-any.whl", hash = "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"},
+    {file = "Pygments-2.13.0.tar.gz", hash = "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1"},
+]
 pylint = [
     {file = "pylint-2.13.9-py3-none-any.whl", hash = "sha256:705c620d388035bdd9ff8b44c5bcdd235bfb49d276d488dd2c8ff1736aa42526"},
     {file = "pylint-2.13.9.tar.gz", hash = "sha256:095567c96e19e6f57b5b907e67d265ff535e588fe26b12b5ebe1fc5645b2c731"},
@@ -2161,14 +2447,30 @@ pyserial = [
     {file = "pyserial-3.5-py2.py3-none-any.whl", hash = "sha256:c4451db6ba391ca6ca299fb3ec7bae67a5c55dde170964c7a14ceefec02f2cf0"},
     {file = "pyserial-3.5.tar.gz", hash = "sha256:3c77e014170dfffbd816e6ffc205e9842efb10be9f58ec16d3e8675b4925cddb"},
 ]
-pytest = []
-pytest-forked = []
-pytest-xdist = []
+pytest = [
+    {file = "pytest-7.1.2-py3-none-any.whl", hash = "sha256:13d0e3ccfc2b6e26be000cb6568c832ba67ba32e719443bfe725814d3c42433c"},
+    {file = "pytest-7.1.2.tar.gz", hash = "sha256:a06a0425453864a270bc45e71f783330a7428defb4230fb5e6a731fde06ecd45"},
+]
+pytest-forked = [
+    {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"},
+    {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"},
+]
+pytest-xdist = [
+    {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"},
+    {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"},
+]
 python-dateutil = [
     {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
     {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
 ]
-pytz = []
+pytz = [
+    {file = "pytz-2022.2.1-py2.py3-none-any.whl", hash = "sha256:220f481bdafa09c3955dfbdddb7b57780e9a94f5127e35456a48589b9e0c0197"},
+    {file = "pytz-2022.2.1.tar.gz", hash = "sha256:cea221417204f2d1a2aa03ddae3e867921971d0d76f14d87abb4414415bbdcf5"},
+]
+pyusb = [
+    {file = "pyusb-1.2.1-py3-none-any.whl", hash = "sha256:2b4c7cb86dbadf044dfb9d3a4ff69fd217013dbe78a792177a3feb172449ea36"},
+    {file = "pyusb-1.2.1.tar.gz", hash = "sha256:a4cc7404a203144754164b8b40994e2849fde1cfff06b08492f12fff9d9de7b9"},
+]
 pyyaml = [
     {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
     {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},
@@ -2177,6 +2479,13 @@ pyyaml = [
     {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
     {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
     {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
+    {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
+    {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
+    {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
+    {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
     {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
     {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
     {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
@@ -2208,13 +2517,94 @@ recommonmark = [
     {file = "recommonmark-0.6.0-py2.py3-none-any.whl", hash = "sha256:2ec4207a574289355d5b6ae4ae4abb29043346ca12cdd5f07d374dc5987d2852"},
     {file = "recommonmark-0.6.0.tar.gz", hash = "sha256:29cd4faeb6c5268c633634f2d69aef9431e0f4d347f90659fd0aab20e541efeb"},
 ]
-regex = []
-requests = []
+regex = [
+    {file = "regex-2022.7.25-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:55911aba9bae9ad826971d2c80428425625a3dd0c00b94e9bb19361888b983a6"},
+    {file = "regex-2022.7.25-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1dee18c683a0603445ff9e77ffc39f1a3997f43ee07ae04ac80228fc5565fc4d"},
+    {file = "regex-2022.7.25-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42702dba0281bcafbcf194770ecb987d60854946071c622777e6d207b3c169bc"},
+    {file = "regex-2022.7.25-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff0e0c3a48c635529a1723d2fea9326da1dacdba5db20be1a4eeaf56580e3949"},
+    {file = "regex-2022.7.25-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b5f1e598b9b823fb37f2f1baf930bb5f30ae4a3d9b67dfdc63f8f2374f336679"},
+    {file = "regex-2022.7.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e19695f7b8de8a3b7d940288abedf48dfcfc0cd8d36f360e5b1bc5e1c3f02a72"},
+    {file = "regex-2022.7.25-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dd0b115c4fab388b1131c89518cdd98db38d88c55cedfffc71de33c92eeee9c6"},
+    {file = "regex-2022.7.25-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8e324436b7f8bbb8e7b3c4593b01d1dce7215befc83a60569ff34a38d6c250ae"},
+    {file = "regex-2022.7.25-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:39ed69803697f1e1e9f1fb1e0b5a8116c55c130745ecd39485cc6255d3b9f046"},
+    {file = "regex-2022.7.25-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:513be18bcf5f27076990dd111f72270d33188653e772023985be92a2c5438382"},
+    {file = "regex-2022.7.25-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e4a72f70ad7aa3df8244da55cf21e28b6f0640a8d8e0065dfa7ec477dd2b4ea4"},
+    {file = "regex-2022.7.25-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:3ef5a4ced251a501962d1c8797d15978dd97661721e337cbe88d8bcdb9cd0d56"},
+    {file = "regex-2022.7.25-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f86be4e30cf2ffcd67845251c8549d70740cd6eec77bd38d977c4c0640eefc24"},
+    {file = "regex-2022.7.25-cp310-cp310-win32.whl", hash = "sha256:4d4640ab9fd3659378eab2ee6f47c3e04b4a269bf206475652c6d8520a9301cc"},
+    {file = "regex-2022.7.25-cp310-cp310-win_amd64.whl", hash = "sha256:af3d5c74af5ae5d04d597ea61e5e9e0b84e84509e58d1e52aaefbae81cb697bb"},
+    {file = "regex-2022.7.25-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a23653a18c1d69760a2d8b6793478815cf5dc8c12f3b6e608e50aed49829f0ef"},
+    {file = "regex-2022.7.25-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ccf10d7d0f25a3c5e123c97ffbab8d4b1429a3c25fbd50812010075bd5d844fd"},
+    {file = "regex-2022.7.25-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:933752abc9931cb53eccbd4ab3aedbcd0f1797c0a1b19ed385952e265636b2b6"},
+    {file = "regex-2022.7.25-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:750b5de7982e568c1bb60388dea1c3abd674d1d579b87ef1b945ba4da53eb5e2"},
+    {file = "regex-2022.7.25-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fac0dd2f11a165a79e271a04226378a008c83368031c6a9294a6df9cd1c13c05"},
+    {file = "regex-2022.7.25-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48018c71ce7b2fe80c1eb16b9104d7d04d07567e9333159810a4ae5ef8cdf01f"},
+    {file = "regex-2022.7.25-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:15bc8cddffe3a9181572c6bcdf45b145691fff1b5712767e7d7a6ef5d32f424f"},
+    {file = "regex-2022.7.25-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:50dd20fd10dafd9b697f1c0629285790d86e66946caa2c6a1135f67846d9b495"},
+    {file = "regex-2022.7.25-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:438b36fbf9446b94325eaeeb1336e2291cd81daeef91b9c728c0946ffbc42ba4"},
+    {file = "regex-2022.7.25-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:7378a6fba8a043b3c5fb8cf915044c814ebb2463b0a7137ec09ae0b1b10f5484"},
+    {file = "regex-2022.7.25-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:609a97626bf310e8cd7c79173e6ed8acab7f01ed4519b7936e998b54b3eb8d31"},
+    {file = "regex-2022.7.25-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:9b8d411a547b47852020242f9c384da35d4c65ccf159ae55a3ba0e50b6220932"},
+    {file = "regex-2022.7.25-cp36-cp36m-win32.whl", hash = "sha256:fbbf9858a3043f632c9da2a82e4ce895016dfb401f59ab110900121121ee73b7"},
+    {file = "regex-2022.7.25-cp36-cp36m-win_amd64.whl", hash = "sha256:1903a2a6c4463488452e953a49f7e6663cfea9ff5e75b09333cbcc840e727a5b"},
+    {file = "regex-2022.7.25-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:76696de39cbbbf976aa85cbd7b1f3ea2d98b3bc9889f6739fdb6cda85a7f05aa"},
+    {file = "regex-2022.7.25-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0c12e5c14eeb5e484c688f2db57ca4a8182d09b40ab69f73147dc32bcdf849d"},
+    {file = "regex-2022.7.25-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bbc0c5b350036ce49a8fd6015a29e4621de725fa99d9e985d3d76b820d44e5a9"},
+    {file = "regex-2022.7.25-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c942696b541ce6be4e3cc2c963b48671277b38ebd4a28af803b511b2885759b7"},
+    {file = "regex-2022.7.25-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fddd2ef742f05a18fde1d1c74df12fa6f426945cfb6fefba3fa1c5380e2dd2bf"},
+    {file = "regex-2022.7.25-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1b83baa19355c8dd0ec23e725f18450be01bc464ba1f1865cfada03594fa629"},
+    {file = "regex-2022.7.25-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3ef700d411b900fcff91f1ef16771bf085a9f9a376d16d8a643e8a20ff6dcb7b"},
+    {file = "regex-2022.7.25-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b24133df3d3c57a901f6a63ba3783d6eed1d0561ed1cafd027f0789e76a10615"},
+    {file = "regex-2022.7.25-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1228f5a6be5b45ce7b66a69a77682632f0ce64cea1d7da505f33972e01f1f3fe"},
+    {file = "regex-2022.7.25-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:9eec276e6419de4f93824f9373b28a2a8eaed04f28514000cc6a41b64703d804"},
+    {file = "regex-2022.7.25-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:ab950bbafafe9bf2e0a75b9f17291500fa7794f398834f1f4a71c18dddece130"},
+    {file = "regex-2022.7.25-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a60840ebd37fe0152b5be50b56e8a958e1430837475311986f867dabad1c7474"},
+    {file = "regex-2022.7.25-cp37-cp37m-win32.whl", hash = "sha256:a0c38edcc78556625cbadf48eb87decd5d3c5e82fc4810dd22c19a5498d2329d"},
+    {file = "regex-2022.7.25-cp37-cp37m-win_amd64.whl", hash = "sha256:f755fba215ddafa26211e33ac91b48dcebf84ff28590790e5b7711b46fa4095d"},
+    {file = "regex-2022.7.25-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8d928237cf78cfe3b46b608f87e255c45a1e11d04e7dd2c49cb60200cbd6f987"},
+    {file = "regex-2022.7.25-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ea9f01224c25101c5f2c6dceebd29d1431525637d596241935640e4de0fbb822"},
+    {file = "regex-2022.7.25-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91d2a85a4a134011eb517f2a752f4e488b0a4f6b6ad00ef247f9fac57f9ff4f0"},
+    {file = "regex-2022.7.25-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9163ef45bfebc39838848330cb94f79b563f738c60fc0a20a7f0a30f13ec1573"},
+    {file = "regex-2022.7.25-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0798f6b97c3f8139c95af7b128a60909f5305b2e431a012083063298b2481e5d"},
+    {file = "regex-2022.7.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03cdd06061426378a83e8a5bdec9cc71b964c35e329f68fb7058d08791780c83"},
+    {file = "regex-2022.7.25-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f898bf0a9613cc8b7f7af6fdcd80cc8e7659787908834c63391f22271fdb1c14"},
+    {file = "regex-2022.7.25-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b131c7c94da56f8f1c59b4540c37c20973119608ec8cf42b3ebb40a94f3afc2c"},
+    {file = "regex-2022.7.25-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a2afa24d06301f4ffcb00244d30df1c12e65cabf30dcb0ba8b871d6b0c54d19e"},
+    {file = "regex-2022.7.25-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d3ce546e54cfafa9dee60b11b7f99b87058d81ab62bd05e366fc5bf6b2c1383a"},
+    {file = "regex-2022.7.25-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:f7329e66c6bd9950eb428f225db3982e5f54e53d3d95951da424dce9aa621eae"},
+    {file = "regex-2022.7.25-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ae6cd6ce16681d345592d74a0a92b25a9530d4055be460af425e654d891cdee4"},
+    {file = "regex-2022.7.25-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fddd7ddd520661085ffd91f1db74b18e4cf5ed9b6e939aa7d31ca1ea67bc7621"},
+    {file = "regex-2022.7.25-cp38-cp38-win32.whl", hash = "sha256:f049a9fdacdbc4e84afcec7a3b14a8309699a7347c95a525d49c4b9a9c353cee"},
+    {file = "regex-2022.7.25-cp38-cp38-win_amd64.whl", hash = "sha256:50497f3d8a1e8d8055c6da1768c98f5b618039e572aacdcccd642704db6077eb"},
+    {file = "regex-2022.7.25-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:89f4c531409ef01aa12b7c15bb489415e219c186725d44bc12a8f279afde3fe2"},
+    {file = "regex-2022.7.25-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:535a2392a0f11f7df80f43e63a5b69c51bb29a10a690e4ae5ad721b9fe50684d"},
+    {file = "regex-2022.7.25-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f3de4baf25e960a3048a6ecd0246cedcdfeb462a741d55e9a42e91add5a4a99"},
+    {file = "regex-2022.7.25-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2c8f542c5afd36e60237dbbabc95722135047d4c2844b9c4bff74c7177a50a1"},
+    {file = "regex-2022.7.25-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc49d9c6289df4c7895c85094872ef98ce7f609ba0ecbeb77acdd7f8362cda7d"},
+    {file = "regex-2022.7.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:730cc311757153d59bf2bcf06d4026e3c998c1919c06557ad0e382235049b376"},
+    {file = "regex-2022.7.25-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:14882770017436aabe4cfa2651a9777f9faa2625bc0f6cdaec362697a8a964c3"},
+    {file = "regex-2022.7.25-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1991348464df42a6bc04601e1241dfa4a9ec4d599338dc64760f2c299e1cb996"},
+    {file = "regex-2022.7.25-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:03d7ff80e3a276ef460baaa745d425162c19d8ea093d60ecf47f52ffee37aea5"},
+    {file = "regex-2022.7.25-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ed42feff196aaf262db1878d5ac553a3bcef147caf1362e7095f1115b71ae0e1"},
+    {file = "regex-2022.7.25-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:4433690ff474fd95a3058085aed5fe12ac4e09d4f4b2b983de35e3a6c899afa0"},
+    {file = "regex-2022.7.25-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:454c2c81d34eb4e1d015acbca0488789c17fc84188e336365eaa31a16c964c04"},
+    {file = "regex-2022.7.25-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a06d6ada6bef79aaa550ef37c7d529da60b81c02838d9dd9c5ab788becfc57d4"},
+    {file = "regex-2022.7.25-cp39-cp39-win32.whl", hash = "sha256:cc018ce0f1b62df155a5b9c9a81464040a87e97fd9bd05e0febe92568c63e678"},
+    {file = "regex-2022.7.25-cp39-cp39-win_amd64.whl", hash = "sha256:26d6e9a6431626c20821d0165a4c4508acb20a57e4c04ee77c96f01b7fe4c09c"},
+    {file = "regex-2022.7.25.tar.gz", hash = "sha256:bd0883e86964cd61360ffc36dbebbc49b928e92a306f886eab02c11dfde5b7aa"},
+]
+requests = [
+    {file = "requests-2.28.1-py3-none-any.whl", hash = "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"},
+    {file = "requests-2.28.1.tar.gz", hash = "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983"},
+]
 requests-oauthlib = [
     {file = "requests-oauthlib-1.3.1.tar.gz", hash = "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"},
     {file = "requests_oauthlib-1.3.1-py2.py3-none-any.whl", hash = "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5"},
 ]
-rsa = []
+rsa = [
+    {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"},
+    {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"},
+]
 scipy = [
     {file = "scipy-1.7.3-1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:c9e04d7e9b03a8a6ac2045f7c5ef741be86727d8f49c45db45f244bdd2bcff17"},
     {file = "scipy-1.7.3-1-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b0e0aeb061a1d7dcd2ed59ea57ee56c9b23dd60100825f98238c06ee5cc4467e"},
@@ -2295,7 +2685,9 @@ sqlparse = [
     {file = "sqlparse-0.4.2-py3-none-any.whl", hash = "sha256:48719e356bb8b42991bdbb1e8b83223757b93789c00910a616a071910ca4a64d"},
     {file = "sqlparse-0.4.2.tar.gz", hash = "sha256:0c00730c74263a94e5a9919ade150dfc3b19c574389985446148402998287dae"},
 ]
-tensorboard = []
+tensorboard = [
+    {file = "tensorboard-2.10.0-py3-none-any.whl", hash = "sha256:76c91a5e8959cd2208cc32cb17a0cb002badabb66a06ac2af02a7810f49a59e3"},
+]
 tensorboard-data-server = [
     {file = "tensorboard_data_server-0.6.1-py3-none-any.whl", hash = "sha256:809fe9887682d35c1f7d1f54f0f40f98bb1f771b14265b453ca051e2ce58fca7"},
     {file = "tensorboard_data_server-0.6.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:fa8cef9be4fcae2f2363c88176638baf2da19c5ec90addb49b1cde05c95c88ee"},
@@ -2392,7 +2784,19 @@ torchvision = [
     {file = "torchvision-0.12.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b93a767f44e3933cb3b01a6fe9727db54590f57b7dac09d5aaf15966c6c151dd"},
     {file = "torchvision-0.12.0-cp39-cp39-win_amd64.whl", hash = "sha256:edab05f7ba9f648c00435b384ffdbd7bde79a3b8ea893813fb50f6ccf28b1e76"},
 ]
-tornado = []
+tornado = [
+    {file = "tornado-6.2-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:20f638fd8cc85f3cbae3c732326e96addff0a15e22d80f049e00121651e82e72"},
+    {file = "tornado-6.2-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:87dcafae3e884462f90c90ecc200defe5e580a7fbbb4365eda7c7c1eb809ebc9"},
+    {file = "tornado-6.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba09ef14ca9893954244fd872798b4ccb2367c165946ce2dd7376aebdde8e3ac"},
+    {file = "tornado-6.2-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b8150f721c101abdef99073bf66d3903e292d851bee51910839831caba341a75"},
+    {file = "tornado-6.2-cp37-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3a2f5999215a3a06a4fc218026cd84c61b8b2b40ac5296a6db1f1451ef04c1e"},
+    {file = "tornado-6.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5f8c52d219d4995388119af7ccaa0bcec289535747620116a58d830e7c25d8a8"},
+    {file = "tornado-6.2-cp37-abi3-musllinux_1_1_i686.whl", hash = "sha256:6fdfabffd8dfcb6cf887428849d30cf19a3ea34c2c248461e1f7d718ad30b66b"},
+    {file = "tornado-6.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:1d54d13ab8414ed44de07efecb97d4ef7c39f7438cf5e976ccd356bebb1b5fca"},
+    {file = "tornado-6.2-cp37-abi3-win32.whl", hash = "sha256:5c87076709343557ef8032934ce5f637dbb552efa7b21d08e89ae7619ed0eb23"},
+    {file = "tornado-6.2-cp37-abi3-win_amd64.whl", hash = "sha256:e5f923aa6a47e133d1cf87d60700889d7eae68988704e20c75fb2d65677a8e4b"},
+    {file = "tornado-6.2.tar.gz", hash = "sha256:9b630419bde84ec666bfd7ea0a4cb2a8a651c2d5cccdbdd1972a0c859dfc3c13"},
+]
 typed-ast = [
     {file = "typed_ast-1.5.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:669dd0c4167f6f2cd9f57041e03c3c2ebf9063d0757dc89f79ba1daa2bfca9d4"},
     {file = "typed_ast-1.5.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:211260621ab1cd7324e0798d6be953d00b74e0428382991adfddb352252f1d62"},
@@ -2419,15 +2823,24 @@ typed-ast = [
     {file = "typed_ast-1.5.4-cp39-cp39-win_amd64.whl", hash = "sha256:0fdbcf2fef0ca421a3f5912555804296f0b0960f0418c440f5d6d3abb549f3e1"},
     {file = "typed_ast-1.5.4.tar.gz", hash = "sha256:39e21ceb7388e4bb37f4c679d72707ed46c2fbf2a5609b8b8ebc4b067d977df2"},
 ]
-typing-extensions = []
+typing-extensions = [
+    {file = "typing_extensions-4.3.0-py3-none-any.whl", hash = "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02"},
+    {file = "typing_extensions-4.3.0.tar.gz", hash = "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"},
+]
 unify = [
     {file = "unify-0.5.tar.gz", hash = "sha256:8ddce812b2457212b7598fe574c9e6eb3ad69710f445391338270c7f8a71723c"},
 ]
 untokenize = [
     {file = "untokenize-0.1.1.tar.gz", hash = "sha256:3865dbbbb8efb4bb5eaa72f1be7f3e0be00ea8b7f125c69cbd1f5fda926f37a2"},
 ]
-urllib3 = []
-werkzeug = []
+urllib3 = [
+    {file = "urllib3-1.26.11-py2.py3-none-any.whl", hash = "sha256:c33ccba33c819596124764c23a97d25f32b28433ba0dedeb77d873a38722c9bc"},
+    {file = "urllib3-1.26.11.tar.gz", hash = "sha256:ea6e8fb210b19d950fab93b60c9009226c63a28808bc8386e05301e25883ac0a"},
+]
+werkzeug = [
+    {file = "Werkzeug-2.2.2-py3-none-any.whl", hash = "sha256:f979ab81f58d7318e064e99c4506445d60135ac5cd2e177a2de0089bfd4c9bd5"},
+    {file = "Werkzeug-2.2.2.tar.gz", hash = "sha256:7ea2d48322cc7c0f8b3a215ed73eabd7b5d75d0b50e31ab006286ccff9e00b8f"},
+]
 wrapt = [
     {file = "wrapt-1.14.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3"},
     {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef"},
@@ -2502,4 +2915,7 @@ xgboost = [
     {file = "xgboost-1.6.1-py3-none-win_amd64.whl", hash = "sha256:3adcb7e4ccf774d5e0128c01e5c381303c3799910ab0f2e996160fe3cd23b7fc"},
     {file = "xgboost-1.6.1.tar.gz", hash = "sha256:24072028656f3428e7b8aabf77340ece057f273e41f7f85d67ccaefb7454bb18"},
 ]
-zipp = []
+zipp = [
+    {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"},
+    {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"},
+]
diff --git a/apps/microtvm/pyproject.toml b/apps/microtvm/pyproject.toml
index 0455e1b3e960..b75975e2ffa7 100644
--- a/apps/microtvm/pyproject.toml
+++ b/apps/microtvm/pyproject.toml
@@ -111,6 +111,7 @@ tensorflow-estimator = {version = "^2.1", optional = true}
 tflite = {version = "2.1.0", optional = true}
 wheel = "*"
 cloudpickle = "^1.6.0"
+pyusb = "^1.2.1"
 
 
 [tool.poetry.extras]
diff --git a/python/tvm/micro/testing/pytest_plugin.py b/python/tvm/micro/testing/pytest_plugin.py
index 9864b49abb61..c32377fb7e7d 100644
--- a/python/tvm/micro/testing/pytest_plugin.py
+++ b/python/tvm/micro/testing/pytest_plugin.py
@@ -60,6 +60,14 @@ def pytest_addoption(parser):
             "Also, it will enable debug level logging in project generation."
         ),
     )
+    parser.addoption(
+        "--serial-number",
+        default=None,
+        help=(
+            "Board serial number. This is used to run test on a "
+            "specific board when multiple boards with the same type exist."
+        ),
+    )
 
 
 def pytest_generate_tests(metafunc):
@@ -130,3 +138,8 @@ def pytest_configure(config):
         "markers",
         "skip_boards(board): skip test for the given board",
     )
+
+
+@pytest.fixture
+def serial_number(request):
+    return request.config.getoption("--serial-number")
diff --git a/tests/micro/arduino/README.md b/tests/micro/arduino/README.md
index 2b37599849f7..36cd7d5f46d7 100644
--- a/tests/micro/arduino/README.md
+++ b/tests/micro/arduino/README.md
@@ -33,3 +33,10 @@ To see the list of supported values for `--board`, run:
 ```
 $ pytest --help
 ```
+
+If you would like to test with a real hardware and need to target one of many
+identical devices, you have the option to pass the serial number for your
+development board.
+```
+$ pytest --board=due --serial-number="4873ce"
+```
diff --git a/tests/micro/arduino/test_arduino_error_detection.py b/tests/micro/arduino/test_arduino_error_detection.py
index de5e5bb56c94..f1278094b484 100644
--- a/tests/micro/arduino/test_arduino_error_detection.py
+++ b/tests/micro/arduino/test_arduino_error_detection.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import sys
 import pytest
 
 from tvm.micro.project_api.server import ServerError
@@ -25,8 +24,10 @@
 
 
 @pytest.fixture
-def project(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
-    return test_utils.make_kws_project(board, arduino_cli_cmd, microtvm_debug, workspace_dir)
+def project(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number):
+    return test_utils.make_kws_project(
+        board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number
+    )
 
 
 def test_blank_project_compiles(workspace_dir, project):
diff --git a/tests/micro/arduino/test_arduino_rpc_server.py b/tests/micro/arduino/test_arduino_rpc_server.py
index e3d97bfdf915..ae22fb9499b8 100644
--- a/tests/micro/arduino/test_arduino_rpc_server.py
+++ b/tests/micro/arduino/test_arduino_rpc_server.py
@@ -38,7 +38,15 @@
 import test_utils
 
 
-def _make_session(model, arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config):
+def _make_session(
+    model,
+    arduino_board,
+    arduino_cli_cmd,
+    workspace_dir,
+    mod,
+    build_config,
+    serial_number: str = None,
+):
     project = tvm.micro.generate_project(
         str(test_utils.TEMPLATE_PROJECT_DIR),
         mod,
@@ -48,6 +56,7 @@ def _make_session(model, arduino_board, arduino_cli_cmd, workspace_dir, mod, bui
             "arduino_cli_cmd": arduino_cli_cmd,
             "project_type": "host_driven",
             "verbose": bool(build_config.get("debug")),
+            "serial_number": serial_number,
         },
     )
     project.build()
@@ -56,30 +65,50 @@ def _make_session(model, arduino_board, arduino_cli_cmd, workspace_dir, mod, bui
 
 
 def _make_sess_from_op(
-    model, arduino_board, arduino_cli_cmd, workspace_dir, op_name, sched, arg_bufs, build_config
+    model,
+    arduino_board,
+    arduino_cli_cmd,
+    workspace_dir,
+    op_name,
+    sched,
+    arg_bufs,
+    build_config,
+    serial_number: str = None,
 ):
     target = tvm.target.target.micro(model)
     runtime = Runtime("crt", {"system-lib": True})
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         mod = tvm.build(sched, arg_bufs, target=target, runtime=runtime, name=op_name)
 
-    return _make_session(model, arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config)
+    return _make_session(
+        model, arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config, serial_number
+    )
 
 
-def _make_add_sess(model, arduino_board, arduino_cli_cmd, workspace_dir, build_config):
+def _make_add_sess(
+    model, arduino_board, arduino_cli_cmd, workspace_dir, build_config, serial_number: str = None
+):
     A = tvm.te.placeholder((2,), dtype="int8")
     B = tvm.te.placeholder((1,), dtype="int8")
     C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C")
     sched = tvm.te.create_schedule(C.op)
     return _make_sess_from_op(
-        model, arduino_board, arduino_cli_cmd, workspace_dir, "add", sched, [A, B, C], build_config
+        model,
+        arduino_board,
+        arduino_cli_cmd,
+        workspace_dir,
+        "add",
+        sched,
+        [A, B, C],
+        build_config,
+        serial_number,
     )
 
 
 # The same test code can be executed on both the QEMU simulation and on real hardware.
 @tvm.testing.requires_micro
 @pytest.mark.requires_hardware
-def test_compile_runtime(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
+def test_compile_runtime(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number):
     """Test compiling the on-device runtime."""
 
     model = test_utils.ARDUINO_BOARDS[board]
@@ -98,13 +127,15 @@ def test_basic_add(sess):
         system_lib.get_function("add")(A_data, B_data, C_data)
         assert (C_data.numpy() == np.array([6, 7])).all()
 
-    with _make_add_sess(model, board, arduino_cli_cmd, workspace_dir, build_config) as sess:
+    with _make_add_sess(
+        model, board, arduino_cli_cmd, workspace_dir, build_config, serial_number
+    ) as sess:
         test_basic_add(sess)
 
 
 @tvm.testing.requires_micro
 @pytest.mark.requires_hardware
-def test_platform_timer(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
+def test_platform_timer(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number):
     """Test compiling the on-device runtime."""
 
     model = test_utils.ARDUINO_BOARDS[board]
@@ -128,13 +159,15 @@ def test_basic_add(sess):
         assert result.mean > 0
         assert len(result.results) == 3
 
-    with _make_add_sess(model, board, arduino_cli_cmd, workspace_dir, build_config) as sess:
+    with _make_add_sess(
+        model, board, arduino_cli_cmd, workspace_dir, build_config, serial_number
+    ) as sess:
         test_basic_add(sess)
 
 
 @tvm.testing.requires_micro
 @pytest.mark.requires_hardware
-def test_relay(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
+def test_relay(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number):
     """Testing a simple relay graph"""
     model = test_utils.ARDUINO_BOARDS[board]
     build_config = {"debug": microtvm_debug}
@@ -153,7 +186,9 @@ def test_relay(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         mod = tvm.relay.build(func, target=target, runtime=runtime)
 
-    with _make_session(model, board, arduino_cli_cmd, workspace_dir, mod, build_config) as session:
+    with _make_session(
+        model, board, arduino_cli_cmd, workspace_dir, mod, build_config, serial_number
+    ) as session:
         graph_mod = tvm.micro.create_local_graph_executor(
             mod.get_graph_json(), session.get_system_lib(), session.device
         )
@@ -167,7 +202,7 @@ def test_relay(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
 
 @tvm.testing.requires_micro
 @pytest.mark.requires_hardware
-def test_onnx(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
+def test_onnx(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number):
     """Testing a simple ONNX model."""
     model = test_utils.ARDUINO_BOARDS[board]
     build_config = {"debug": microtvm_debug}
@@ -197,7 +232,7 @@ def test_onnx(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
         graph = lowered.get_graph_json()
 
     with _make_session(
-        model, board, arduino_cli_cmd, workspace_dir, lowered, build_config
+        model, board, arduino_cli_cmd, workspace_dir, lowered, build_config, serial_number
     ) as session:
         graph_mod = tvm.micro.create_local_graph_executor(
             graph, session.get_system_lib(), session.device
@@ -227,6 +262,7 @@ def check_result(
     out_shape,
     result,
     build_config,
+    serial_number,
 ):
     """Helper function to verify results"""
     TOL = 1e-5
@@ -236,7 +272,7 @@ def check_result(
         mod = tvm.relay.build(relay_mod, target=target, runtime=runtime)
 
     with _make_session(
-        model, arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config
+        model, arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config, serial_number
     ) as session:
         rt_mod = tvm.micro.create_local_graph_executor(
             mod.get_graph_json(), session.get_system_lib(), session.device
@@ -258,7 +294,7 @@ def check_result(
 
 @tvm.testing.requires_micro
 @pytest.mark.requires_hardware
-def test_byoc_microtvm(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
+def test_byoc_microtvm(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number):
     """This is a simple test case to check BYOC capabilities of microTVM"""
     model = test_utils.ARDUINO_BOARDS[board]
     build_config = {"debug": microtvm_debug}
@@ -318,17 +354,32 @@ def test_byoc_microtvm(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
         arduino_board=board,
         arduino_cli_cmd=arduino_cli_cmd,
         workspace_dir=workspace_dir,
+        serial_number=serial_number,
     )
 
 
 def _make_add_sess_with_shape(
-    model, arduino_board, arduino_cli_cmd, workspace_dir, shape, build_config
+    model,
+    arduino_board,
+    arduino_cli_cmd,
+    workspace_dir,
+    shape,
+    build_config,
+    serial_number: str = None,
 ):
     A = tvm.te.placeholder(shape, dtype="int8")
     C = tvm.te.compute(A.shape, lambda i: A[i] + A[i], name="C")
     sched = tvm.te.create_schedule(C.op)
     return _make_sess_from_op(
-        model, arduino_board, arduino_cli_cmd, workspace_dir, "add", sched, [A, C], build_config
+        model,
+        arduino_board,
+        arduino_cli_cmd,
+        workspace_dir,
+        "add",
+        sched,
+        [A, C],
+        build_config,
+        serial_number,
     )
 
 
@@ -342,7 +393,9 @@ def _make_add_sess_with_shape(
 )
 @tvm.testing.requires_micro
 @pytest.mark.requires_hardware
-def test_rpc_large_array(board, arduino_cli_cmd, microtvm_debug, workspace_dir, shape):
+def test_rpc_large_array(
+    board, arduino_cli_cmd, microtvm_debug, workspace_dir, shape, serial_number
+):
     """Test large RPC array transfer."""
     model = test_utils.ARDUINO_BOARDS[board]
     build_config = {"debug": microtvm_debug}
@@ -357,7 +410,7 @@ def test_tensors(sess):
         assert (C_data.numpy() == np.zeros(shape)).all()
 
     with _make_add_sess_with_shape(
-        model, board, arduino_cli_cmd, workspace_dir, shape, build_config
+        model, board, arduino_cli_cmd, workspace_dir, shape, build_config, serial_number
     ) as sess:
         test_tensors(sess)
 
diff --git a/tests/micro/arduino/test_arduino_workflow.py b/tests/micro/arduino/test_arduino_workflow.py
index 8d5d541d408c..51898424aee5 100644
--- a/tests/micro/arduino/test_arduino_workflow.py
+++ b/tests/micro/arduino/test_arduino_workflow.py
@@ -56,8 +56,9 @@ def project_dir(workflow_workspace_dir):
 @pytest.fixture(scope="module")
 def project(request, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir):
     board = request.config.getoption("--board")
+    serial_number = request.config.getoption("--serial-number")
     return test_utils.make_kws_project(
-        board, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir
+        board, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir, serial_number
     )
 
 
diff --git a/tests/micro/arduino/test_utils.py b/tests/micro/arduino/test_utils.py
index b27d4bb7aa10..d81edc845b98 100644
--- a/tests/micro/arduino/test_utils.py
+++ b/tests/micro/arduino/test_utils.py
@@ -61,7 +61,7 @@ def make_workspace_dir(test_name, board):
     return t
 
 
-def make_kws_project(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
+def make_kws_project(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number: str):
     this_dir = pathlib.Path(__file__).parent
     model = ARDUINO_BOARDS[board]
     build_config = {"debug": microtvm_debug}
@@ -88,5 +88,6 @@ def make_kws_project(board, arduino_cli_cmd, microtvm_debug, workspace_dir):
             "arduino_cli_cmd": arduino_cli_cmd,
             "project_type": "example_project",
             "verbose": bool(build_config.get("debug")),
+            "serial_number": serial_number,
         },
     )
diff --git a/tests/micro/zephyr/README.md b/tests/micro/zephyr/README.md
index 7ae98bbd2d42..f6917bf5e2f4 100644
--- a/tests/micro/zephyr/README.md
+++ b/tests/micro/zephyr/README.md
@@ -44,5 +44,5 @@ $ pytest test_zephyr.py --help
 If you like to test with a real hardware, you have the option to pass the serial number
 for your development board.
 ```
-$ pytest test_zephyr.py --board=nrf5340dk_nrf5340_cpuapp --serial="0672FF5"
+$ pytest test_zephyr.py --board=nrf5340dk_nrf5340_cpuapp --serial-number="0672FF5"
 ```
diff --git a/tests/micro/zephyr/conftest.py b/tests/micro/zephyr/conftest.py
index d3dbf22e4746..aa1759d770fd 100644
--- a/tests/micro/zephyr/conftest.py
+++ b/tests/micro/zephyr/conftest.py
@@ -29,11 +29,6 @@ def pytest_addoption(parser):
         default=False,
         help="If set true, use the FVP emulator to run the test",
     )
-    parser.addoption(
-        "--serial",
-        default=None,
-        help="If set true, use the FVP emulator to run the test",
-    )
 
 
 @pytest.fixture
@@ -41,11 +36,6 @@ def use_fvp(request):
     return request.config.getoption("--use-fvp")
 
 
-@pytest.fixture
-def serial_number(request):
-    return request.config.getoption("--serial")
-
-
 @pytest.fixture(autouse=True)
 def xfail_on_fvp(request, use_fvp):
     """mark the tests as xfail if running on fvp."""

From 3e956ce9dadd9f919ee2c78d64d9bc0e813bef68 Mon Sep 17 00:00:00 2001
From: billishyahao <yahao.he@gmail.com>
Date: Tue, 6 Dec 2022 04:19:31 +0800
Subject: [PATCH 694/704] [DNNL][BYOC] enable dense_bias_sum fusion (#13550)

This patch is to enable dense_bias_sum fusion and add its relevant unittest.
---
 python/tvm/relay/op/contrib/dnnl.py           | 23 ++++++++++++++++
 src/runtime/contrib/dnnl/dnnl_json_runtime.cc |  4 +++
 tests/python/contrib/test_dnnl.py             | 26 +++++++++++++++++++
 3 files changed, 53 insertions(+)

diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py
index f7752e41b056..bdf910d704ce 100644
--- a/python/tvm/relay/op/contrib/dnnl.py
+++ b/python/tvm/relay/op/contrib/dnnl.py
@@ -207,6 +207,28 @@ def make_conv_bias_sum_relu_pattern(conv_type, has_relu=True):
     return out
 
 
+def make_dense_bias_sum_pattern():
+    """Create patterns with sum op.
+
+    Parameters
+    ----------
+    N/A
+
+    Returns
+    -------
+    out : CallPattern
+        Call node sequence.
+    """
+    data1 = wildcard()
+    weight = wildcard()
+    bias = wildcard()
+    data2 = wildcard()
+    out = is_op("nn.dense")(data1, weight)
+    out = is_op("add")(out, bias)
+    out = is_op("add")(out, data2)
+    return "dnnl.dense_bias_sum", out
+
+
 def get_op_name(expr):
     """Get the operator name from an expression."""
     if isinstance(expr, Op):
@@ -438,6 +460,7 @@ def pattern_table():
     dnnl_patterns = list()
     dnnl_patterns.append(make_qnn_conv2d_pattern())
     dnnl_patterns.append(make_qnn_dense_pattern())
+    dnnl_patterns.append(make_dense_bias_sum_pattern())
     dnnl_patterns.append(
         (
             "dnnl.conv2d_bias_sum_relu",
diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc
index 9deab71b5102..ba06d082c4e0 100644
--- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc
+++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc
@@ -470,6 +470,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase {
 
   void Dense(const size_t& nid) {
     auto node = nodes_[nid];
+    auto op_name = node.GetOpName();
 
     // Setup attributes.
     auto src_tr = GetInput(nid, 0);
@@ -500,6 +501,9 @@ class DNNLJSONRuntime : public JSONRuntimeBase {
 
     // TODO(@apeskov): Simulation of inplace primitive. just as PoC.
     auto sum_in_tr = GetInputByName(nid, "sum_idx");
+    if (op_name.find("_sum") != std::string::npos) {
+      sum_in_tr = GetInput(nid, node.GetInputs().size() - 1);
+    }
 
     Submit(dnnl::inner_product_forward(dense_prim_desc),
            {{DNNL_ARG_SRC, src_tr},
diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py
index f23b3c70aa96..f60472c923ae 100755
--- a/tests/python/contrib/test_dnnl.py
+++ b/tests/python/contrib/test_dnnl.py
@@ -831,6 +831,32 @@ def get_conv2d_bn_sum_relu(x_shape, k_shape, dtype="float32"):
     run_and_verify_func(config, run_module=run_module, dtype=dtype)
 
 
+def test_dense_bias_sum(run_module, dtype="float32"):
+    x_shape = (4, 32)
+    k_shape = (16, 32)
+
+    def get_dense_bias_sum(x_shape, k_shape, dtype="float32"):
+        out, dic, param_lst = get_dense_bias(x_shape=x_shape, k_shape=k_shape, dtype=dtype)
+
+        sum_in = relay.var("sum_in", shape=x_shape, dtype=dtype)
+        ker = relay.var("ker", shape=(k_shape), dtype=dtype)
+        dense_sum = relay.nn.dense(sum_in, ker, units=k_shape[0])
+
+        # sum over two dense outputs to meet inplace condition
+        out = relay.add(out, dense_sum)
+        dic["sum_in"] = x_shape
+        dic["ker"] = k_shape
+        param_lst += ["ker"]
+        return out, dic, param_lst
+
+    dense_bias_sum, dic, param_lst = get_dense_bias_sum(x_shape, k_shape, dtype=dtype)
+    dense_bias_sum = tvm.IRModule.from_expr(dense_bias_sum)
+    print("hebi-dbg:")
+    print(dense_bias_sum)
+    config = dense_bias_sum, dic, param_lst
+    run_and_verify_func(config, run_module=run_module, dtype=dtype)
+
+
 def test_conv2d_transpose(run_module, dtype="float32"):
     x_shape = (1, 32, 8, 8)
     for k_shape, groups in [((32, 16, 3, 3), 1), ((32, 1, 3, 3), 32), ((32, 4, 3, 3), 16)]:

From 3482eab1c55ba3c456ad229005be57906e1c5edb Mon Sep 17 00:00:00 2001
From: ninesheep <ninesheep@live.cn>
Date: Tue, 6 Dec 2022 04:27:27 +0800
Subject: [PATCH 695/704] [Fix Bug]fix the bug of schedule batch_matmul_int8 on
 cuda (#13551)

* [Fix Bug]fix the bug of tensorflow frontend when parsing Range layer

* [Fix Bug]fix the bug of schedule batch_matmul_int8 on cuda

Co-authored-by: wangjiuyang <wang.jiuyang@intellif.com>
---
 python/tvm/topi/cuda/batch_matmul.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py
index 4e476094f2d9..d2f5c9b9c586 100644
--- a/python/tvm/topi/cuda/batch_matmul.py
+++ b/python/tvm/topi/cuda/batch_matmul.py
@@ -352,7 +352,7 @@ def _schedule_batch_matmul_int8(cfg, s, output):
     cfg.define_split("tile_k", K // k_factor, num_outputs=2)
     cfg.define_knob("auto_unroll_max_step", [0, 256, 512, 1024])
 
-    batch_matmul_op = s.outputs[0]
+    batch_matmul_op = s[output].op
     s[input_x].compute_inline()
     s[input_y].compute_inline()
 
@@ -373,6 +373,10 @@ def _schedule_batch_matmul_int8(cfg, s, output):
         dtypes = (input_x.dtype, input_y.dtype)
         s[batch_matmul_cache].tensorize(ki, dp4a("shared", "shared", "local", dtypes))
 
+    if batch_matmul_op not in s.outputs:
+        s[output].compute_inline()
+        batch_matmul_op = s.outputs[0]
+
     # tile axis
     f, m, n = batch_matmul_op.axis
     kernel_scope, f = s[batch_matmul_op].split(f, nparts=1)

From 965490e618f066e7ed762f7f60bac4900a66c4d9 Mon Sep 17 00:00:00 2001
From: shengxinhu <sxhu2019@gmail.com>
Date: Tue, 6 Dec 2022 05:20:17 +0800
Subject: [PATCH 696/704] [Relay] Optimize transform shape (#13519)

* [Relay] Optimize transform shape

* add test
---
 src/tir/ir/data_layout.cc      | 10 +---------
 tests/python/relay/test_any.py |  9 +++++++++
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/tir/ir/data_layout.cc b/src/tir/ir/data_layout.cc
index f784f7b49aac..3b22ffc71173 100644
--- a/src/tir/ir/data_layout.cc
+++ b/src/tir/ir/data_layout.cc
@@ -334,13 +334,9 @@ inline Array<PrimExpr> TransformShape(const Array<PrimExpr>& src_shape,
   // for minor-axis, simply bind it as 0, so that we can reuse forward/backward_rule,
   // e.g., (C * 16 + c) / 32
   std::unordered_map<const tir::VarNode*, PrimExpr> bind_map;
-  std::unordered_set<size_t> symbolic_var_set;
   for (size_t i = 0; i < src_shape.size(); ++i) {
     PrimExpr orig_shape = src_shape[i];
     IterVar orig_axis = src_axis[i];
-    if (orig_shape.as<tir::AnyNode>()) {
-      symbolic_var_set.insert(i);
-    }
     if (!LayoutAxis::Get(orig_axis).IsPrimal()) {
       if (orig_shape.defined()) {
         const auto* orig_shape_const = orig_shape.as<IntImmNode>();
@@ -369,11 +365,7 @@ inline Array<PrimExpr> TransformShape(const Array<PrimExpr>& src_shape,
     if (!LayoutAxis::Get(axis).IsPrimal()) {
       result.push_back(axis->dom->extent);
     } else {
-      if (symbolic_var_set.count(i)) {
-        result.push_back(tir::Any());
-      } else {
-        result.push_back(ana.Simplify(tir::Substitute(rule, bind_map)));
-      }
+      result.push_back(ana.Simplify(tir::Substitute(rule, bind_map)));
     }
   }
 
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index f602a17e2412..37aa2271a520 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -446,6 +446,15 @@ def test_any_layout_transform():
     verify_any_layout_transform((16, 1), "CH", "C4cH", (16, 1), (4, 4, 1))
 
 
+def test_bilayout_with_any():
+    bilayout = tvm.tir.bijective_layout("NCHW", "NHWC")
+    assert isinstance(bilayout, tvm.tir.BijectiveLayout)
+    dst_shape = bilayout.forward_shape((relay.Any(), 32, 7, relay.Any()))
+    assert dst_shape[3] == 32
+    src_shape = bilayout.backward_shape(dst_shape)
+    assert src_shape[1] == 32
+
+
 def verify_any_expand_dims(data_shape, axis, num_newaxis, static_data_shape, ref_out_shape):
     mod = tvm.IRModule()
     dtype = "float32"

From 6574e1603452f6865949647bc8e3bed4dca5e55e Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 6 Dec 2022 16:51:12 +0900
Subject: [PATCH 697/704] [MetaSchedule][Hexagon] Add postproc for verifying
 VTCM usage (#13538)

* add new postproc VerifyVTCMLimit

* remove pass

* add test

* add doc, missing file

* Add back VectorizeLoop in prereq lowering pass

* fix lint
---
 include/tvm/meta_schedule/postproc.h          |   5 +
 include/tvm/tir/analysis.h                    |   8 ++
 python/tvm/meta_schedule/postproc/__init__.py |   1 +
 .../postproc/verify_vtcm_limit.py             |  31 +++++
 src/meta_schedule/postproc/postproc.cc        |   7 +-
 .../postproc/verify_vtcm_limit.cc             | 104 ++++++++++++++
 .../analysis/calculate_allocated_memory.cc    |   9 ++
 ...eta_schedule_postproc_verify_vtcm_limit.py | 127 ++++++++++++++++++
 8 files changed, 288 insertions(+), 4 deletions(-)
 create mode 100644 python/tvm/meta_schedule/postproc/verify_vtcm_limit.py
 create mode 100644 src/meta_schedule/postproc/verify_vtcm_limit.cc
 create mode 100644 tests/python/unittest/test_meta_schedule_postproc_verify_vtcm_limit.py

diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h
index 13fe47058740..76f8d71ad65b 100644
--- a/include/tvm/meta_schedule/postproc.h
+++ b/include/tvm/meta_schedule/postproc.h
@@ -144,6 +144,11 @@ class Postproc : public runtime::ObjectRef {
    * \return The postprocessor created
    */
   TVM_DLL static Postproc VerifyGPUCode();
+  /*!
+   * \brief Verifies that the VTCM usage of a given schedule is within the provided limit.
+   * \return The postprocessor created
+   */
+  TVM_DLL static Postproc VerifyVTCMLimit();
   /*!
    * \brief Creates a postprocessor that rewrites the layout of input tensor
    * \note Weight layout rewrite is supported so far, activation layout rewrite will be added.
diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h
index cb31a7e5ee96..a8edc2675fc4 100644
--- a/include/tvm/tir/analysis.h
+++ b/include/tvm/tir/analysis.h
@@ -169,6 +169,14 @@ TVM_DLL bool VerifyMemory(const PrimFunc& func);
  */
 TVM_DLL bool VerifyGPUCode(const PrimFunc& func, Map<String, PrimExpr> constraints);
 
+/*!
+ * \brief Verifies that the VTCM usage of the given prim_func is within the provided limit.
+ * \param func The function to be checked.
+ * \param limit The limit to check.
+ * \return true if the VTCM usage is within the provided limit.
+ */
+TVM_DLL bool VerifyVTCMLimit(const PrimFunc& func, Integer limit);
+
 /*!
  * \brief Auto detect the block access region according to its body stmt
  *        It will detect the access region as an array in order of appearance in AST
diff --git a/python/tvm/meta_schedule/postproc/__init__.py b/python/tvm/meta_schedule/postproc/__init__.py
index f70b740d7bd7..0598a53e2ac1 100644
--- a/python/tvm/meta_schedule/postproc/__init__.py
+++ b/python/tvm/meta_schedule/postproc/__init__.py
@@ -24,3 +24,4 @@
 from .rewrite_tensorize import RewriteTensorize
 from .rewrite_unbound_block import RewriteUnboundBlock
 from .verify_gpu_code import VerifyGPUCode
+from .verify_vtcm_limit import VerifyVTCMLimit
diff --git a/python/tvm/meta_schedule/postproc/verify_vtcm_limit.py b/python/tvm/meta_schedule/postproc/verify_vtcm_limit.py
new file mode 100644
index 000000000000..28d202d5b338
--- /dev/null
+++ b/python/tvm/meta_schedule/postproc/verify_vtcm_limit.py
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""A postprocessor that verifies the VTCM usage of a given schedule."""
+
+from tvm._ffi.registry import register_object
+from .. import _ffi_api
+from .postproc import Postproc
+
+
+@register_object("meta_schedule.VerifyVTCMLimit")
+class VerifyVTCMLimit(Postproc):
+    """Verifies that the VTCM usage of a given schedule is within the provided limit."""
+
+    def __init__(self) -> None:
+        self.__init_handle_by_constructor__(
+            _ffi_api.PostprocVerifyVTCMLimit,  # type: ignore # pylint: disable=no-member
+        )
diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc
index c614f3230d59..dba523d094bf 100644
--- a/src/meta_schedule/postproc/postproc.cc
+++ b/src/meta_schedule/postproc/postproc.cc
@@ -94,10 +94,9 @@ Array<Postproc> Postproc::DefaultCUDATensorCore() {
 
 Array<Postproc> Postproc::DefaultHexagon() {
   return Array<Postproc>{
-      Postproc::DisallowDynamicLoop(),
-      Postproc::RewriteParallelVectorizeUnroll(),
-      Postproc::RewriteReductionBlock(),
-      Postproc::RewriteLayout(),
+      Postproc::DisallowDynamicLoop(),   Postproc::RewriteParallelVectorizeUnroll(),
+      Postproc::RewriteReductionBlock(), Postproc::RewriteLayout(),
+      Postproc::VerifyVTCMLimit(),
   };
 }
 
diff --git a/src/meta_schedule/postproc/verify_vtcm_limit.cc b/src/meta_schedule/postproc/verify_vtcm_limit.cc
new file mode 100644
index 000000000000..a6b577de9acc
--- /dev/null
+++ b/src/meta_schedule/postproc/verify_vtcm_limit.cc
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/tir/transform.h>
+
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+class VerifyVTCMLimitNode : public PostprocNode {
+ public:
+  Integer vtcm_capacity;
+
+  void InitializeWithTuneContext(const TuneContext& context) final {
+    ICHECK(context->target.defined());
+    Target target = context->target.value();
+    ICHECK(target->kind->name == "hexagon");
+    // The value of 0 will disable VTCM verification.
+    vtcm_capacity = target->GetAttr<Integer>("vtcm-capacity").value_or(0);
+  }
+
+  bool Verify(const IRModule& mod) const {
+    for (const auto& kv : mod->functions) {
+      if (const auto* prim_func = kv.second.as<tir::PrimFuncNode>()) {
+        if (!tir::VerifyVTCMLimit(GetRef<tir::PrimFunc>(prim_func), vtcm_capacity)) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  bool Apply(const tir::Schedule& sch) final {
+    IRModule mod = sch->mod();
+    for (const auto& kv : mod->functions) {
+      const GlobalVar& g_var = kv.first;
+      const BaseFunc& base_func = kv.second;
+      if (const auto* prim_func = base_func.as<tir::PrimFuncNode>()) {
+        IRModule lowered{nullptr};
+        try {
+          auto pass_list = Array<tvm::transform::Pass>();
+          pass_list.push_back(tir::transform::LowerInitBlock());
+          pass_list.push_back(tir::transform::PlanAndUpdateBufferAllocationLocation());
+          pass_list.push_back(tir::transform::ConvertBlocksToOpaque());
+          pass_list.push_back(tir::transform::CompactBufferAllocation());
+          pass_list.push_back(tir::transform::LowerMatchBuffer());
+          pass_list.push_back(tir::transform::InjectSoftwarePipeline());
+          pass_list.push_back(tir::transform::LowerOpaqueBlock());
+          pass_list.push_back(tir::transform::FlattenBuffer());
+          pass_list.push_back(tir::transform::Simplify());
+          pass_list.push_back(tir::transform::VectorizeLoop(true));
+          pass_list.push_back(tir::transform::StorageRewrite());
+          transform::PassContext pass_ctx = transform::PassContext::Current();
+          tir::PrimFunc f = WithAttr(GetRef<tir::PrimFunc>(prim_func), "global_symbol",
+                                     runtime::String(g_var->name_hint));
+          IRModule mod = IRModule(Map<GlobalVar, BaseFunc>({{GlobalVar(g_var->name_hint), f}}));
+          lowered = tvm::transform::Sequential(pass_list)(std::move(mod));
+        } catch (const dmlc::Error& e) {
+          return false;
+        }
+        if (!Verify(lowered)) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  Postproc Clone() const {
+    ObjectPtr<VerifyVTCMLimitNode> n = make_object<VerifyVTCMLimitNode>(*this);
+    return Postproc(n);
+  }
+
+  static constexpr const char* _type_key = "meta_schedule.VerifyVTCMLimit";
+  TVM_DECLARE_FINAL_OBJECT_INFO(VerifyVTCMLimitNode, PostprocNode);
+};
+
+Postproc Postproc::VerifyVTCMLimit() {
+  ObjectPtr<VerifyVTCMLimitNode> n = make_object<VerifyVTCMLimitNode>();
+  return Postproc(n);
+}
+
+TVM_REGISTER_NODE_TYPE(VerifyVTCMLimitNode);
+TVM_REGISTER_GLOBAL("meta_schedule.PostprocVerifyVTCMLimit")
+    .set_body_typed(Postproc::VerifyVTCMLimit);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/tir/analysis/calculate_allocated_memory.cc b/src/tir/analysis/calculate_allocated_memory.cc
index 01457508ab95..9da8ec435524 100644
--- a/src/tir/analysis/calculate_allocated_memory.cc
+++ b/src/tir/analysis/calculate_allocated_memory.cc
@@ -87,6 +87,15 @@ TVM_REGISTER_GLOBAL("tir.analysis.calculate_allocated_bytes").set_body_typed([](
   return CalculateAllocatedBytes(func);
 });
 
+bool VerifyVTCMLimit(const PrimFunc& func, Integer limit) {
+  auto sizes = CalculateAllocatedBytes(func);
+  const auto vtcm_allocated = sizes.Get("global.vtcm").value_or(0);
+  if (limit.IntValue() > 0 && vtcm_allocated.IntValue() > limit.IntValue()) {
+    return false;
+  }
+  return true;
+}
+
 namespace transform {
 
 Pass VerifyVTCMLimit(const Integer& limit) {
diff --git a/tests/python/unittest/test_meta_schedule_postproc_verify_vtcm_limit.py b/tests/python/unittest/test_meta_schedule_postproc_verify_vtcm_limit.py
new file mode 100644
index 000000000000..55ea0a6ed80f
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_postproc_verify_vtcm_limit.py
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
+import tvm
+import tvm.testing
+from tvm import meta_schedule as ms
+from tvm import tir
+from tvm.script import tir as T
+
+
+def _create_context(mod, target) -> ms.TuneContext:
+    return ms.TuneContext(
+        mod=mod,
+        target=target,
+        space_generator=ms.space_generator.PostOrderApply(
+            sch_rules=[],
+            postprocs=[ms.postproc.VerifyVTCMLimit()],
+            mutator_probs={},
+        ),
+        task_name="test",
+    )
+
+
+# pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument,not-callable,misplaced-comparison-constant
+# fmt: off
+
+
+@tvm.script.ir_module
+class Conv2dNCHWcVTCM:
+    @T.prim_func
+    def main(p0: T.Buffer[(T.int64(1), T.int64(2), T.int64(56), T.int64(56), T.int64(32)), "uint8"], p1: T.Buffer[(T.int64(2), T.int64(2), T.int64(3), T.int64(3), T.int64(8), T.int64(32), T.int64(4)), "uint8"], conv2d_NCHWc_int8: T.Buffer[(T.int64(1), T.int64(2), T.int64(54), T.int64(54), T.int64(32)), "int32"]):
+        T.func_attr({"tir.noalias": True, "global_symbol": "main"})
+        p0_global_vtcm = T.alloc_buffer([T.int64(1), T.int64(2), T.int64(56), T.int64(56), T.int64(32)], dtype="uint8", scope="global.vtcm")
+        p1_global_vtcm = T.alloc_buffer([T.int64(2), T.int64(2), T.int64(3), T.int64(3), T.int64(8), T.int64(32), T.int64(4)], dtype="uint8", scope="global.vtcm")
+        for n_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step":16, "pragma_unroll_explicit":1}):
+            for oc_chunk_0, oh_0, ow_0, oc_block_0_0 in T.grid(T.int64(2), T.int64(2), T.int64(2), T.int64(1)):
+                for oc_chunk_1_init, oh_1_init, ow_1_init, oc_chunk_2_init, oh_2_init, ow_2_init in T.grid(T.int64(1), T.int64(27), T.int64(3), T.int64(1), T.int64(1), T.int64(9)):
+                    with T.block("conv2d_NCHWc_int8_o_init"):
+                        v_n = T.axis.spatial(T.int64(1), T.int64(0))
+                        v_oc_chunk = T.axis.spatial(T.int64(2), oc_chunk_1_init + oc_chunk_2_init + oc_chunk_0)
+                        v_oh = T.axis.spatial(T.int64(54), oh_2_init + oh_0 * T.int64(27) + oh_1_init)
+                        v_ow = T.axis.spatial(T.int64(54), ow_0 * T.int64(27) + ow_1_init * T.int64(9) + ow_2_init)
+                        v_oc_block_o = T.axis.spatial(T.int64(1), T.int64(0))
+                        T.reads()
+                        T.writes(conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, T.int64(0) : T.int64(32)])
+                        for oc_block_1 in T.vectorized(T.int64(32)):
+                            with T.block("conv2d_NCHWc_int8_init"):
+                                v_oc_block_i_init = T.axis.spatial(T.int64(32), oc_block_1)
+                                T.reads()
+                                T.writes(conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, v_oc_block_i_init])
+                                conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, v_oc_block_i_init] = 0
+                for kh_0_kw_0_ic_outer_0_ic_f_inner_0_ic_s_inner_0_0_fused in T.serial(T.int64(2), annotations={"software_pipeline_async_stages":[0], "software_pipeline_order":[0, 1, 2], "software_pipeline_stage":[0, 0, 1]}):
+                    for ax0_ax1_ax2_ax3_ax4_fused in T.serial(T.int64(26912)):
+                        with T.block("p0_global.vtcm"):
+                            v0 = T.axis.spatial(T.int64(1), T.int64(0))
+                            v1 = T.axis.spatial(T.int64(2), ax0_ax1_ax2_ax3_ax4_fused // T.int64(13456))
+                            v2 = T.axis.spatial(T.int64(56), oh_0 * T.int64(27) + ax0_ax1_ax2_ax3_ax4_fused % T.int64(13456) // T.int64(464))
+                            v3 = T.axis.spatial(T.int64(56), ow_0 * T.int64(27) + ax0_ax1_ax2_ax3_ax4_fused % T.int64(464) // T.int64(16))
+                            v4 = T.axis.spatial(T.int64(32), kh_0_kw_0_ic_outer_0_ic_f_inner_0_ic_s_inner_0_0_fused * T.int64(16) + ax0_ax1_ax2_ax3_ax4_fused % T.int64(16))
+                            T.reads(p0[v0, v1, v2, v3, v4])
+                            T.writes(p0_global_vtcm[v0, v1, v2, v3, v4])
+                            p0_global_vtcm[v0, v1, v2, v3, v4] = p0[v0, v1, v2, v3, v4]
+                    for ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused in T.serial(T.int64(9216)):
+                        with T.block("p1_global.vtcm"):
+                            v0 = T.axis.spatial(T.int64(2), oc_chunk_0)
+                            v1 = T.axis.spatial(T.int64(2), ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused // T.int64(4608))
+                            v2 = T.axis.spatial(T.int64(3), ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % T.int64(4608) // T.int64(1536))
+                            v3 = T.axis.spatial(T.int64(3), ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % T.int64(1536) // T.int64(512))
+                            v4 = T.axis.spatial(T.int64(8), kh_0_kw_0_ic_outer_0_ic_f_inner_0_ic_s_inner_0_0_fused * T.int64(4) + ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % T.int64(512) // T.int64(128))
+                            v5 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % T.int64(128) // T.int64(4))
+                            v6 = T.axis.spatial(T.int64(4), ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % T.int64(4))
+                            T.reads(p1[v0, v1, v2, v3, v4, v5, v6])
+                            T.writes(p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6])
+                            p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6] = p1[v0, v1, v2, v3, v4, v5, v6]
+                    for n_1, oc_chunk_1, oh_1, ow_1, oc_block_0_1, kh_1, kw_1, ic_outer_1, ic_f_inner_1, ic_s_inner_0_1, n_2, oc_chunk_2, oh_2, ow_2, oc_block_0_2 in T.grid(T.int64(1), T.int64(1), T.int64(27), T.int64(3), T.int64(1), T.int64(3), T.int64(3), T.int64(2), T.int64(4), T.int64(1), T.int64(1), T.int64(1), T.int64(1), T.int64(9), T.int64(1)):
+                        with T.block("conv2d_NCHWc_int8_o_update"):
+                            v_n = T.axis.spatial(T.int64(1), T.int64(0))
+                            v_oc_chunk = T.axis.spatial(T.int64(2), oc_chunk_1 + oc_chunk_2 + oc_chunk_0)
+                            v_oh = T.axis.spatial(T.int64(54), oh_2 + oh_0 * T.int64(27) + oh_1)
+                            v_ow = T.axis.spatial(T.int64(54), ow_0 * T.int64(27) + ow_1 * T.int64(9) + ow_2)
+                            v_oc_block_o = T.axis.spatial(T.int64(1), T.int64(0))
+                            v_kh, v_kw, v_ic_outer = T.axis.remap("RRR", [kh_1, kw_1, ic_outer_1])
+                            v_ic_f_inner = T.axis.reduce(T.int64(8), kh_0_kw_0_ic_outer_0_ic_f_inner_0_ic_s_inner_0_0_fused * T.int64(4) + ic_f_inner_1)
+                            v_ic_s_inner_o = T.axis.reduce(T.int64(1), T.int64(0))
+                            T.reads(conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, T.int64(0) : T.int64(32)], p0_global_vtcm[v_n, v_ic_outer, v_oh + v_kh, v_ow + v_kw, v_ic_f_inner * T.int64(4) : v_ic_f_inner * T.int64(4) + T.int64(4)], p1_global_vtcm[v_oc_chunk, v_ic_outer, v_kh, v_kw, v_ic_f_inner, T.int64(0) : T.int64(32), T.int64(0) : T.int64(4)])
+                            T.writes(conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, T.int64(0) : T.int64(32)])
+                            for oc_block_1, ic_s_inner_1 in T.grid(T.int64(32), T.int64(4)):
+                                with T.block("conv2d_NCHWc_int8"):
+                                    v_oc_block_i, v_ic_s_inner_i = T.axis.remap("SR", [oc_block_1, ic_s_inner_1])
+                                    T.reads(conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, v_oc_block_i], p0_global_vtcm[v_n, v_ic_outer, v_oh + v_kh, v_ow + v_kw, v_ic_f_inner * T.int64(4) + v_ic_s_inner_i], p1_global_vtcm[v_oc_chunk, v_ic_outer, v_kh, v_kw, v_ic_f_inner, v_oc_block_i, v_ic_s_inner_i])
+                                    T.writes(conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, v_oc_block_i])
+                                    T.block_attr({"meta_schedule.tiling_structure":"SRSRS"})
+                                    conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, v_oc_block_i] = conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, v_oc_block_i] + T.Cast("int32", p0_global_vtcm[v_n, v_ic_outer, v_oh + v_kh, v_ow + v_kw, v_ic_f_inner * T.int64(4) + v_ic_s_inner_i]) * T.Cast("int32", p1_global_vtcm[v_oc_chunk, v_ic_outer, v_kh, v_kw, v_ic_f_inner, v_oc_block_i, v_ic_s_inner_i])
+
+#fmt on
+
+
+def test_conv2d_vtcm():
+    def get_target(vtcm_cap):
+        target = tvm.target.hexagon("v68", vtcm_capacity=vtcm_cap)
+        return tvm.target.Target(target, host=target)
+
+    sch = tir.Schedule(Conv2dNCHWcVTCM, debug_mask="all")
+
+    ctx = _create_context(Conv2dNCHWcVTCM, target=get_target(70000))
+    assert not ctx.space_generator.postprocs[0].apply(sch)
+
+    ctx = _create_context(Conv2dNCHWcVTCM, target=get_target(75000))
+    assert ctx.space_generator.postprocs[0].apply(sch)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 8d31b25bb8f237da9a437f5263202ff032bef606 Mon Sep 17 00:00:00 2001
From: Anirudh Sundar Subramaniam <quic_sanirudh@quicinc.com>
Date: Tue, 6 Dec 2022 15:15:17 +0530
Subject: [PATCH 698/704] [TIR] [Hexagon] Add vdmpy intrinsic and
 transform_layout for tests (#13557)

[TIR] Add vdmpy intrinsic and transform_layout for tests

This patch adds the vdmpy hexagon intrinsic and a sample tensorization
test for the same.

This patch modifies the test to use transform_layout instead of a packed
tensor in the compute to make it obvious that this example is just
matmul with a different data layout for one of the inputs
---
 python/tvm/tir/tensor_intrin/hexagon.py       | 46 +++++++++++++++++++
 .../unittest/test_tir_schedule_tensorize.py   | 42 ++++++++++++-----
 2 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py
index 49c12c3e9dce..5e5749055bb0 100644
--- a/python/tvm/tir/tensor_intrin/hexagon.py
+++ b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -104,6 +104,48 @@ def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> Non
     return dot_product_32x4_u8i8i32_desc, dot_product_32x4_u8i8i32_vrmpy
 
 
+def generate_dot_product_32x2_i16i16i32(mem_scope="global"):
+    @T.prim_func
+    def dot_product_32x2_i16i16i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scope)
+        B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scope)
+        C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope)
+        with T.block("root"):
+            T.reads(C[0:32], A[0:2], B[0:32, 0:2])
+            T.writes(C[0:32])
+            for i in T.serial(0, 32):
+                for k in T.serial(0, 2):
+                    with T.block("update"):
+                        vi, vk = T.axis.remap("SR", [i, k])
+                        C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
+
+    @T.prim_func
+    def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scope)
+        B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scope)
+        C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope)
+        with T.block("root"):
+            T.reads(C[0:32], A[0:2], B[0:32, 0:2])
+            T.writes(C[0:32])
+
+            A_i16x2 = A.vload([0], "int16x2")
+            A_i32 = T.reinterpret(A_i16x2, dtype="int32")
+
+            B_i16x64 = B.vload([0, 0], dtype="int16x64")
+            B_i32x32 = T.reinterpret(B_i16x64, dtype="int32x32")
+
+            C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
+                T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vdmpyhvsat.acc.128B"),
+                T.uint32(3),
+                C[T.ramp(T.int32(0), 1, 32)],
+                T.Broadcast(A_i32, 32),
+                B_i32x32,
+                dtype="int32x32",
+            )
+
+    return dot_product_32x2_i16i16i32_desc, dot_product_32x2_i16i16i32_vdmpy
+
+
 VRMPY_u8u8i32_INTRIN = "dot_32x4_u8u8i32_vrmpy"
 
 TensorIntrin.register(VRMPY_u8u8i32_INTRIN, *generate_dot_product_32x4_u8u8i32())
@@ -112,6 +154,10 @@ def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> Non
 
 TensorIntrin.register(VRMPY_u8i8i32_INTRIN, *generate_dot_product_32x4_u8i8i32())
 
+VDMPY_i16i16i32_INTRIN = "dot_product_32x2_i16i16i32_vdmpy"
+
+TensorIntrin.register(VDMPY_i16i16i32_INTRIN, *generate_dot_product_32x2_i16i16i32())
+
 VRMPY_u8u8i32_VTCM_INTRIN = "dot_32x4_u8u8i32_vtcm_vrmpy"
 TensorIntrin.register(VRMPY_u8u8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8u8i32("global.vtcm"))
 
diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py
index 21cc39b71402..fc0bdc146c88 100644
--- a/tests/python/unittest/test_tir_schedule_tensorize.py
+++ b/tests/python/unittest/test_tir_schedule_tensorize.py
@@ -30,7 +30,7 @@
 )
 from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN
 from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN
-from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN, VDMPY_i16i16i32_INTRIN
 
 # fmt: off
 # pylint: disable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,unexpected-keyword-arg,too-many-nested-blocks
@@ -540,33 +540,31 @@ def test_tensorize_with_annotation():
     verify_trace_roundtrip(sch=s, mod=func)
 
 
-def get_matmul_packed(m, n, k, lhs_type, int32_lanes, rhs_dtype="int8"):
+def get_matmul_packed(m, n, k, lhs_type, rhs_dtype="int8"):
     X = te.placeholder((m, k), name="X", dtype=lhs_type)
-    packed_W = te.placeholder((n // int32_lanes, k // 4, int32_lanes, 4), name="packedW", dtype=rhs_dtype)
+    W = te.placeholder((n, k), name="W", dtype=rhs_dtype)
 
     ak = te.reduce_axis((0, k), name="k")
     matmul = te.compute(
         (m, n),
         lambda i, j: te.sum(
-            X[i, ak].astype("int32")
-            * packed_W[
-                tvm.tir.indexdiv(j, int32_lanes), tvm.tir.indexdiv(ak, 4), j % int32_lanes, ak % 4
-            ].astype("int32"),
+            X[i, ak].astype("int32") * W[j, ak].astype("int32"),
             axis=ak,
         ),
         name="compute",
     )
 
-    return te.create_prim_func([X, packed_W, matmul])
+    return te.create_prim_func([X, W, matmul])
 
 
 def test_tensorize_vnni():
     m, n, k = 128, 128, 128
 
-    func = get_matmul_packed(m, n, k, "uint8", 16)
+    func = get_matmul_packed(m, n, k, "uint8")
 
     sch = tir.Schedule(func, debug_mask="all")
     block = sch.get_block("compute")
+    sch.transform_layout(block, "W", lambda i, j: [i//16, j//4, i%16, j%4])
     _, j, k = sch.get_loops(block)
 
     _, ji = sch.split(j, factors=[None, 16])
@@ -582,11 +580,12 @@ def test_tensorize_vnni():
 def test_tensorize_arm_dot():
     m, n, k = 128, 128, 128
 
-    func = get_matmul_packed(m, n, k, "int8", 4)
+    func = get_matmul_packed(m, n, k, "int8")
 
     for intrin in [ARM_DOT_4x4_i8_SDOT_INTRIN, ARM_DOT_4x4_i8_NEON_INTRIN]:
         sch = tir.Schedule(func, debug_mask="all")
         block = sch.get_block("compute")
+        sch.transform_layout(block, "W", lambda i, j: [i//4, j//4, i%4, j%4])
         _, j, k = sch.get_loops(block)
 
         _, ji = sch.split(j, factors=[None, 4])
@@ -602,10 +601,11 @@ def test_tensorize_arm_dot():
 def test_tensorize_vrmpy():
     m, n, k = 128, 128, 128
 
-    func = get_matmul_packed(m, n, k, "uint8", 32, "uint8")
+    func = get_matmul_packed(m, n, k, "uint8", "uint8")
 
     sch = tir.Schedule(func, debug_mask="all")
     block = sch.get_block("compute")
+    sch.transform_layout(block, "W", lambda i, j: [i//32, j//4, i%32, j%4])
     _, j, k = sch.get_loops(block)
 
     _, ji = sch.split(j, factors=[None, 32])
@@ -618,6 +618,26 @@ def test_tensorize_vrmpy():
     verify_trace_roundtrip(sch=sch, mod=func)
 
 
+def test_tensorize_vdmpy():
+    m, n, k = 128, 128, 128
+
+    func = get_matmul_packed(m, n, k, "int16", "int16")
+
+    sch = tir.Schedule(func, debug_mask="all")
+    block = sch.get_block("compute")
+    sch.transform_layout(block, "W", lambda i, j: [i//32, j//2, i%32, j%2])
+    _, j, k = sch.get_loops(block)
+
+    _, ji = sch.split(j, factors=[None, 32])
+    ko, ki = sch.split(k, factors=[None, 2])
+    sch.reorder(ko, ji, ki)
+
+    sch.decompose_reduction(block, ko)
+    sch.tensorize(ji, VDMPY_i16i16i32_INTRIN)
+
+    verify_trace_roundtrip(sch=sch, mod=func)
+
+
 def test_tensorize_dpa4():
     m, n, k = 128, 128, 128
 

From bbba8d97fe9c32513f6143b54ea66ce8277b79d1 Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Wed, 7 Dec 2022 01:27:57 +0900
Subject: [PATCH 699/704] [microTVM] Modernize Arm Cortex-M convolution
 schedules (#13242)

* Quantized Corstone300 test draft

* Add QNN strategy with operator fusion for Cortex-M

Get QNN strategy running

QNN strategy with operator fusion

* Add assembly tensordot code from other PR

Assembly tensordot from other PR

Tensordot offset support

Hand tested tensordot code

* Helper work to support microTVM TIR schedules

Formatting fixes

Don't use automatic AOT building when skipping pass

Assorted tech for scheduling with TIR

Hacky int16 support

* TIR schedule for microTVM conv2d

Bugged schedule implementation

Passing test!

Works for all 1x1 conv2ds!

External QNN operator altering

Debugging work

Pad with correct constant

Broadly functional conv2d

Reorganize quantize convolution test

* TIR schedule for microTVM depthwise_conv2d

Working depthwise convolution for strides=1

Working depthwise convolution!

* Clean up code

Support Python 3.7

Clean up code to prepare for review

* Break qnn.py into helper functions

* Finish reorganizing qnn.py

* Fix linting

* Remove residual debug code and fix linting

* Try repairing unit tests

* Run black to fix linting

* Address code review comments

* Second round of code review

Second round of code review

Fix tensordot opts test

* Address @areusch code review

* More code review

* Catch VWW model download with request hook
---
 python/tvm/relay/op/nn/_nn.py                 |  17 +
 python/tvm/relay/qnn/strategy/__init__.py     |   1 +
 python/tvm/relay/qnn/strategy/arm_cpu.py      |  72 +++
 python/tvm/topi/arm_cpu/__init__.py           |   2 +
 python/tvm/topi/arm_cpu/conv2d.py             |  18 -
 python/tvm/topi/arm_cpu/depthwise_conv2d.py   |  20 -
 .../mprofile/dsp/micro_kernel/tensordot.py    | 469 ++++++++++++++----
 .../arm_cpu/mprofile/dsp/tensordot_conv2ds.py | 296 -----------
 python/tvm/topi/arm_cpu/qnn.py                | 370 ++++++++++++++
 python/tvm/topi/arm_cpu/qnn_alter_op.py       | 122 +++++
 python/tvm/topi/nn/qnn.py                     |  48 ++
 src/relay/qnn/op/convolution.cc               |   5 +-
 .../test_ethosn/test_convert_equivalents.py   |   4 +-
 .../relay/strategy/arm_cpu/test_conv2d.py     |  22 -
 .../strategy/arm_cpu/test_depthwise_conv2d.py |  31 --
 .../arm_cpu/test_generalized_conv2d.py        |  10 +-
 .../arm_cpu/test_quantized_convolution.py     | 358 +++++++++++++
 .../python/test_topi_conv2d_tensordot_opts.py | 415 ++++++++++++++++
 tests/scripts/request_hook/request_hook.py    |   1 +
 19 files changed, 1775 insertions(+), 506 deletions(-)
 create mode 100644 python/tvm/relay/qnn/strategy/arm_cpu.py
 delete mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py
 create mode 100644 python/tvm/topi/arm_cpu/qnn.py
 create mode 100644 python/tvm/topi/arm_cpu/qnn_alter_op.py
 create mode 100644 tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py
 create mode 100644 tests/python/topi/python/test_topi_conv2d_tensordot_opts.py

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 53aec11e5816..e956c82828c1 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -877,6 +877,23 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts):
     return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs)
 
 
+# QNN ops
+@reg.register_alter_op_layout("add")
+def alter_op_layout_add(attrs, inputs, tinfos, out_type):
+    """Alter the layout of a add op.
+
+    Useful for fusing the bias constant with an input zero point constant in a previous quantized
+    op. Only used when previous op is a quantized op, which is why it lives in topi.nn.qnn.
+    """
+    return topi.nn.qnn.qnn_add_alter_layout(attrs, inputs, tinfos, out_type)
+
+
+@reg.register_alter_op_layout("qnn.requantize")
+def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type):
+    """Alter the layout of a requantization op."""
+    return topi.nn.qnn.qnn_requantize_alter_layout(attrs, inputs, tinfos, out_type)
+
+
 # bitpack
 @reg.register_compute("nn.bitpack")
 def compute_bitpack(attrs, inputs, out_dtype):
diff --git a/python/tvm/relay/qnn/strategy/__init__.py b/python/tvm/relay/qnn/strategy/__init__.py
index 05778c3e9f86..d7b669a4fa42 100644
--- a/python/tvm/relay/qnn/strategy/__init__.py
+++ b/python/tvm/relay/qnn/strategy/__init__.py
@@ -20,4 +20,5 @@
 from __future__ import absolute_import as _abs
 
 from .generic import *
+from . import arm_cpu
 from . import hexagon
diff --git a/python/tvm/relay/qnn/strategy/arm_cpu.py b/python/tvm/relay/qnn/strategy/arm_cpu.py
new file mode 100644
index 000000000000..f8653817835e
--- /dev/null
+++ b/python/tvm/relay/qnn/strategy/arm_cpu.py
@@ -0,0 +1,72 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Quantized operator strategy for Arm CPU.
+
+As quantized op schedules, these are only used if the qnn.Legalize pass is disabled. The current
+schedules only work for fused operators with bias, as this is the most common use case. Only
+regular/depthwise conv2d is supported, but qnn_dense will be added eventually."""
+
+from tvm import topi, TVMError
+from .generic import qnn_conv2d_strategy
+from ... import op as _op
+from ...op.strategy.generic import is_depthwise_conv2d
+
+
+@qnn_conv2d_strategy.register("arm_cpu")
+def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target):
+    """qnn.conv2d strategy for Arm Cortex-M CPUs with DSP.
+
+    When computing convolutions, we want data that will be used to compute the same output values to
+    be adjacent in memory, as this lets us reuse memory loads and use more SIMD instructions.
+
+    For depthwise convolutions, channels do not interact with each other, so the NCHW and IOHW
+    layouts to the best job of keeping "related" data close. In contrast, computing one output of a
+    regular convolution requires reading all input channels, so NHWC and OHWI are best. Hence, these
+    are the layouts we support.
+    """
+
+    if not (target.features.has_dsp and "cortex-m" in target.mcpu):
+        raise TVMError(
+            "Quantized Arm schedules only exist for Cortex-M with DSP! "
+            "The qnn.Legalize pass should be run for other Arm processors."
+        )
+
+    data = inputs[0]
+    kernel = inputs[1]
+    data_layout = attrs.data_layout
+    kernel_layout = attrs.kernel_layout
+    groups = attrs.groups
+    strategy = _op.OpStrategy()
+
+    if groups == 1:
+        if data_layout == "NHWC" and kernel_layout == "OHWI":
+            strategy.add_implementation(
+                topi.arm_cpu.qnn_conv2d,
+                topi.arm_cpu.schedule_qnn_conv2d,
+                name="qnn_conv2d.arm_cpu",
+            )
+    elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups):
+        if data_layout == "NCHW" and kernel_layout == "IOHW":
+            strategy.add_implementation(
+                topi.arm_cpu.qnn_depthwise_conv2d,
+                topi.arm_cpu.schedule_qnn_depthwise_conv2d,
+                name="qnn_depthwise_conv2d.arm_cpu",
+            )
+    else:
+        raise TVMError("No Arm Cortex-M DSP strategy exists for generic group qnn.conv2d")
+
+    return strategy
diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py
index 20f92a8895dd..eba102662bc4 100644
--- a/python/tvm/topi/arm_cpu/__init__.py
+++ b/python/tvm/topi/arm_cpu/__init__.py
@@ -23,9 +23,11 @@
 from .conv2d_transpose import *
 from .conv2d_int8 import *
 from . import conv2d_alter_op
+from . import qnn_alter_op
 from .bitserial_conv2d import *
 from .bitserial_dense import *
 from .injective import *
 from .group_conv2d import *
 from .pooling import *
 from .dense import *
+from .qnn import *
diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py
index fc46f4b34f9d..ab489161a8fa 100644
--- a/python/tvm/topi/arm_cpu/conv2d.py
+++ b/python/tvm/topi/arm_cpu/conv2d.py
@@ -37,10 +37,6 @@
     conv2d_nhwc_dsp_compute,
     conv2d_nhwc_dsp_schedule,
 )
-from .mprofile.dsp.tensordot_conv2ds import (
-    conv2d_nhwc_ohwi_dsp_compute,
-    tensordot_conv2ds_schedule,
-)
 
 
 @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu")
@@ -522,17 +518,3 @@ def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
 def schedule_conv2d_nhwc_dsp(cfg, outs):
     """Create schedule for conv2d_nhwc_dsp"""
     return conv2d_nhwc_dsp_schedule(cfg, outs)
-
-
-@autotvm.register_topi_compute("conv2d_nhwc_ohwi_dsp.arm_cpu")
-def conv2d_nhwc_ohwi_dsp(cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype):
-    """Compute conv2d_nhwc_ohwi with v7e-m DSP instructions and the tensordot kernel."""
-    return conv2d_nhwc_ohwi_dsp_compute(
-        cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
-    )
-
-
-@autotvm.register_topi_schedule("conv2d_nhwc_ohwi_dsp.arm_cpu")
-def schedule_conv2d_nhwc_ohwi_dsp(cfg, outs):
-    """Create schedule for conv2d_nhwc_ohwi."""
-    return tensordot_conv2ds_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
index 9284b9474513..b6c15a30c037 100644
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -31,10 +31,6 @@
     depthwise_conv2d_nhwc_dsp_compute,
     depthwise_conv2d_nhwc_dsp_schedule,
 )
-from .mprofile.dsp.tensordot_conv2ds import (
-    depthwise_conv2d_nchw_oihw_dsp_compute,
-    tensordot_conv2ds_schedule,
-)
 
 
 @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu")
@@ -722,19 +718,3 @@ def depthwise_conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out
 def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs):
     """Create schedule for conv2d_nhwc_dsp"""
     return depthwise_conv2d_nhwc_dsp_schedule(cfg, outs)
-
-
-@autotvm.register_topi_compute("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
-def depthwise_conv2d_nchw_oihw_dsp(
-    cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
-):
-    """Compute depthwise_conv2d_nchw_oihw with v7e-m DSP instructions and the tensordot kernel."""
-    return depthwise_conv2d_nchw_oihw_dsp_compute(
-        cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
-    )
-
-
-@autotvm.register_topi_schedule("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
-def schedule_depthwise_conv2d_nchw_oihw_dsp(cfg, outs):
-    """Create schedule for depthwise_conv2d_nchw_oihw."""
-    return tensordot_conv2ds_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
index 0fdffc06cf4f..1d36e1dd1e9c 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
@@ -14,142 +14,391 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Computes a "jumpy tensordot" operator, which can be used to tensorize many common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d provided the data and kernel layouts
-are the optimal ones. When groups=1, the optimal data layout is NHWC and kernel layout is OHWI. When
-this is a depthwise convolution, the optimal data layout is NCHW and kernel layout is OIHW."""
+"""Generates optimized code to compute a tensor dot product on ARMv7E-M.
 
+This function can be used to tensorize many common operators including regular conv2d, depthwise
+conv2d, and grouped conv2d for some data and kernel layouts. When for regular convolution, use data
+layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout data layout is NCHW
+and kernel layout OIHW.
+
+The generated code will also work on v8-M chips that have the DSP instructions (unlike v7E-M, they
+are optional in v8-M). Note that the generated code does not use the (potentially very useful) MVE
+instructions present on some v8-M chips.
+"""
+
+from dataclasses import dataclass
+from itertools import chain
 import textwrap
+from typing import Iterator, Optional, Tuple
 
-from tvm import te, tir
 
-from .common import num_simd_lanes_per_word
+@dataclass
+class SMLAInstruction:
+    """Class for keeping track of an item in inventory."""
 
+    instruction: str
+    tensor_var: str
+    kernel_var: str
 
-def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix):
-    """Gets the C function name of the tensordot function."""
-    return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}"
+    def call_with_acle(self, accumulator_var: str) -> str:
+        return (
+            f"{accumulator_var} = __{self.instruction}"
+            f"({self.tensor_var}, {self.kernel_var}, {accumulator_var});"
+        )
 
+    def has_same_operands(self, other: "SMLAInstruction") -> bool:
+        return self.tensor_var == other.tensor_var and self.kernel_var == other.kernel_var
 
-def make_intrin_tensordot(slices, strides, tensordot_params):
-    """Helper function for constructing tensordot intrinsic. We can't construct the whole thing here
-    (as multiple schedules use tensordot and each must build the intrinstic differently) but we can
-    build part here to simplify the code."""
 
-    # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params
-    data, kernel, output = slices
-    data_strides, kernel_strides = strides
+def _get_c_function_name(num_outputs, dimensions, offsets, x_strides):
+    """Generates a C function name for tensordot.
 
-    data_buf = tir.decl_buffer(
-        data.shape, data.dtype, name="data", offset_factor=1, strides=data_strides
-    )
-    kernel_buf = tir.decl_buffer(
-        kernel.shape,
-        kernel.dtype,
-        name="kernel",
-        offset_factor=1,
-        strides=kernel_strides,
-    )
-    output_buf = tir.decl_buffer(
-        output.shape, output.dtype, name="output", offset_factor=1, strides=[1]
+    We do not need a suffix, as the generated function will have an #include guard. Unlike other
+    microTVM operators, _get_c_function_name is never called externally.
+    """
+    tensor_w, kernel_h, kernel_w = dimensions
+    return (
+        f"tensordot_opt_x{num_outputs}_int16_w{tensor_w}_"
+        + f"{kernel_h}x{kernel_w}_"
+        + "".join(map(str, offsets))
+        + (f"_{x_strides[0]}_{x_strides[1]}" if num_outputs > 1 else "")
     )
 
-    def intrin_func(ins, outs):
-        builder = tir.ir_builder.create()
-        builder.emit(
-            tir.call_extern(
-                "int32",
-                _get_func_name(*tensordot_params),
-                outs[0].access_ptr("w"),
-                ins[0].access_ptr("r"),
-                ins[1].access_ptr("r"),
-            )
-        )
-        return builder.get()
 
-    return te.decl_tensor_intrin(
-        output.op,
-        intrin_func,
-        binds={data: data_buf, kernel: kernel_buf, output: output_buf},
-    )
+def _init_biased_accumulators(num_outputs):
+    """Generates code to load the bias into the accumulators.
+
+    Addition is commutative, so we could add the bias before, during, or after performing our
+    multiply-accumulate operations. Where we add the bias does not change the overflow behavior.
+
+    Doing the bias add takes one cycle either way (if done at the beginning we can't use a SMULXY
+    trick to set sum_i to zero for "free"). However, doing it at the beginning frees up a register,
+    so we'll do it first.
+    """
+    assignments = [f"sum_{x:x} = *bias" for x in range(num_outputs)]
+    joined_assignments = ", ".join(assignments)
+    return f"int32_t {joined_assignments};"
+
+
+def _get_tensor_halfwords(dimensions, offset, num_outputs, in_stride) -> Iterator[Optional[Tuple]]:
+    """Gets the logical indices of the data that will be stored in memory at the tensor pointer.
+
+    Returns an Iterator of Optional[Tuple], while skipping over word-aligned pairs of unrelated
+    halfwords. The returned iterator is as short as possible while having even length and containing
+    all relevant tensor data. Tuples in the returned Iterator represent an (y, x) offset from the
+    top-left tensor position being used in this convolution. We need to be aware of the None values
+    so our code is correctly word-aligned.
+
+    One consequence of these requirements - each row in the tensor is broken into word-aligned pairs
+    of halfwords (which are later combined into full words). See the test cases (located in
+    tests/python/topi/python/test_topi_conv2d_tensordot_opts.py) for usage examples.
+    """
+
+    tensor_w, kernel_h, kernel_w = dimensions
+    max_x_val = (num_outputs - 1) * in_stride + kernel_w
+    halfwords = []
+
+    for y in range(kernel_h):
+        # If needed, pad so the beginning of the row is word-aligned
+        if (y * tensor_w + offset) % 2 == 1:
+            halfwords.append(None)
+
+        for x in range(max_x_val):
+            halfwords.append((y, x))
+
+        # If needed, pad so the row length is word aligned
+        if (y * tensor_w + offset + max_x_val) % 2 == 1:
+            halfwords.append(None)
+    return halfwords
+
+
+def _get_kernel_halfwords(dimensions, offset) -> Iterator[Optional[Tuple]]:
+    """Gets the logical indices of the data that will be stored in memory at the kernel pointer.
 
+    Returns an Iterator of Optional[Tuple]. The returned iterator is as short as possible while
+    having even length and containing all kernel data. Tuples in the returned Iterator represent
+    an (y, x) position in the kernel, while None values represent other, irrelevant data. We need
+    to be aware of the None values so our code is correctly word-aligned.
 
-def tensordot_impl(in_dtype: str, tensor_h: int, jump: int, tensor_w: int, suffix: str) -> str:
-    """Generates C code for taking the dot products of two `tensor_h` * `tensor_w` tensors. Also has
-    a `jump` argument that advances the pointer of one tensor by that many words after each row. The
-    `jump` and `tensor_w` values must be word-aligned for the input data type, as non-word-aligned
-    memory access is slow on the Cortex-M series. Depending on the input datatype, the code may
-    contain DSP instructions for Arm v7e-m. C code contains DSP instructions for Arm v7e-m. See
-    the below pseudocode for reference:
-
-    tensordot(out_ptr, dat_ptr, ker_ptr) {
-        sum = 0;
-        for (i = 0; i < tensor_h; i++) {
-            for (j = 0; j < tensor_w; j++) {
-                sum += (*dat_ptr++) * (*ker_ptr++);
-            }
-            dat_ptr += jump;
-        }
-        *out_ptr = sum;
-    }
+    See test cases in tests/python/topi/python/test_topi_conv2d_tensordot_opts.py for examples.
     """
+    _, kernel_h, kernel_w = dimensions
+    halfwords = []
 
-    simd_lanes = num_simd_lanes_per_word(in_dtype)
-    assert tensor_w % simd_lanes == 0
-    assert jump % simd_lanes == 0
+    # Kernel data starts `offset` places after the pointer value
+    if offset == 1:
+        halfwords.append(None)
 
-    if in_dtype == "int8":
-        inner_loop = """
-              uint32_t tensor_c20 = __SXTB16(tensor_batch);
-              uint32_t kernel_c20 = __SXTB16(kernel_batch);
-              sum = __SMLAD(tensor_c20, kernel_c20, sum);
+    for y in range(kernel_h):
+        for x in range(kernel_w):
+            halfwords.append((y, x))
 
-              uint32_t tensor_c31 = __SXTB16(__ROR(tensor_batch, 8));
-              uint32_t kernel_c31 = __SXTB16(__ROR(kernel_batch, 8));
-              sum = __SMLAD(tensor_c31, kernel_c31, sum);"""
+    # Make sure the returned iterator has even length by padding with an "unknown" value. We want
+    # even length as this corresponds to an integer number of int32 words.
+    if (kernel_h * kernel_w + offset) % 2 == 1:
+        halfwords.append(None)
+    return halfwords
 
-    elif in_dtype == "int16":
-        inner_loop = """
-              sum = __SMLAD(tensor_batch, kernel_batch, sum);"""
 
-    elif in_dtype == "int32":
-        inner_loop = """
-              // Compiles to a single MAC instruction
-              sum += tensor_batch * kernel_batch;"""
+def _get_int16_alias(position) -> str:
+    if position is None:
+        return "unknown"
+    y, x = position
+    return f"y{y:0>2x}_x{x:0>2x}"
+
+
+def _load_tensor_vars(halfwords, tensor_w) -> Iterator[str]:
+    assert len(halfwords) % 2 == 0
+    offset = int(not bool(halfwords[0]))
+
+    for i in range(0, len(halfwords), 2):
+        var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}"
+        y, x = halfwords[i + 1] or halfwords[i]
+        tensor_index = (y * tensor_w + x + offset) // 2
+        yield f"int32_t tensor__{var_name} = tensor[{tensor_index}];"
+
+
+def _load_kernel_vars(halfwords) -> Iterator[str]:
+    assert len(halfwords) % 2 == 0
+    for i in range(0, len(halfwords), 2):
+        var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}"
+        yield f"int32_t kernel__{var_name} = kernel[{i // 2}];"
+
+
+def _get_draft_macs(
+    kernel_dims, tensor_halfwords, kernel_halfwords, offset
+) -> Iterator[SMLAInstruction]:
+    """Generates unrolled MAC instructions to compute one tensordot sum.
+
+    Unrolling these loops increases code size a tiny bit (< 0.02 KB), but makes the generated code
+    much faster. The generated code does not use SIMD instructions - they are added later by
+    _apply_simd_optimizations.
+
+    We return an iterator of SMLAInstruction named tuples. Returning an iterator lets us do
+    optimizations by iterator chaining.
+    """
+
+    def get_var(y, x, halfwords) -> Tuple[str, str]:
+        i = halfwords.index((y, x))
+        if i % 2 == 0:
+            return f"{_get_int16_alias((y, x))}__{_get_int16_alias(halfwords[i + 1])}", "b"
+        return f"{_get_int16_alias(halfwords[i - 1])}__{_get_int16_alias((y, x))}", "t"
+
+    kernel_h, kernel_w = kernel_dims
+    for y in range(kernel_h):
+        for x in range(kernel_w):
+            tensor_var, tensor_half = get_var(y, x + offset, tensor_halfwords)
+            kernel_var, kernel_half = get_var(y, x, kernel_halfwords)
+            instruction = f"smla{tensor_half}{kernel_half}"
+            yield SMLAInstruction(instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}")
+
+
+def _apply_simd_optimizations(instruction_tuples) -> Iterator[SMLAInstruction]:
+    """When possible, fuses single MACs into SIMD MAC instructions.
+
+    The compiler cannot do this automatically, as calling __smlaxy forces the SMLAxy instruction to
+    be used. This function takes as input an iterator of SMLAInstructions and returns an iterator of
+    SMLAInstructions (possibly of different length).
+    """
+    curr_tuple = next(instruction_tuples, None)
+    while curr_tuple:
+        next_tuple = next(instruction_tuples, None)
+        if next_tuple is None:
+            yield curr_tuple
+            break
+
+        if curr_tuple.has_same_operands(next_tuple):
+            instructions = sorted([curr_tuple.instruction, next_tuple.instruction])
+            if instructions == ["smlabb", "smlatt"]:
+                yield SMLAInstruction("smlad", curr_tuple.tensor_var, curr_tuple.kernel_var)
+                next_tuple = next(instruction_tuples, None)
+            elif instructions == ["smlabt", "smlatb"]:
+                yield SMLAInstruction("smladx", curr_tuple.tensor_var, curr_tuple.kernel_var)
+                next_tuple = next(instruction_tuples, None)
+            else:
+                yield curr_tuple
+
+        else:
+            yield curr_tuple
+        curr_tuple = next_tuple
+
+
+def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
+    """Converts an iterator of SMLAInstructions into lines of C code.
+
+    We want the compiler to re-order these with the memory loads, so we generate them as a series of
+    calls to instruction aliases instead of as a single `asm` block.
+    """
+
+    for smla_instruction in instruction_tuples:
+        assert "smla" in smla_instruction.instruction
+
+        # We call the instruction using the Arm C Language Extensions. Using ACLE gives better
+        # cross-compiler compatibility than using __builtin functions.
+        yield smla_instruction.call_with_acle(f"sum_{index}")
+
+
+def _requantize_sums(num_outputs, requantize_shift, output_zero_point) -> Iterator[str]:
+    """Generates code to requantize the accumulator values.
+
+    The generated code does not use floating point instructions, as it simulates floating point
+    multiplication with an a int64 multiply + shift. The bias is added at the beginning, so we can
+    skip doing it now. The shift is hard-coded, as this saves a few cycles without hurting accuracy
+    in "most" cases.
+
+    It's *possible* we could save one more cycle here by pre-multiplying the bias with the
+    requantize multiplier, and then doing the bias addition and shift in the same cycle (via <op2>).
+    However, it's complicated and only saves one cycle.
+
+    It's also worth noting the SSAT16 operation doesn't help us here. The data isn't stored as two
+    halfwords in a word, and rearrainging it would take at least one cycle. Two SSAT operations is
+    just as good.
+
+    Calling __ssat directly is a little bit gross, but GCC and Clang are unreliable about compiling
+    other ways of writing this. Both the multiply + shift and shift + saturation combine to one
+    instruction each.
+    """
+
+    yield "int32_t scale_val = *scale;"
+    for i in range(num_outputs):
+        yield f"int32_t requant_{i} = (sum_{i} * (int64_t) scale_val) >> {requantize_shift - 1};"
+        yield f"requant_{i} = (requant_{i} + 1) >> 1;"
+        yield f"requant_{i} = __ssat(requant_{i} + {output_zero_point}, 8);"
+
+
+def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]:
+    """Generates code to write the requantized sums to memory.
+
+    Note - halfword packing here *does* help. It seems
+    like it wouldn't, as doing two pipelined int16 stores takes two cycles - the same as halfword
+    packing plus a pipelined int32 store. We still do the int16 stores when there is an output
+    stride, though.
+
+    However, this lets the compiler re-order instructions to better preserve memory, as it doesn't
+    like breaking apart the store instructions (as this messes up pipelining).
+    """
+
+    if stride > 1:
+        for i in range(num_outputs):
+            yield f"((int16_t*) output)[{i * stride + offset}] = (int16_t) requant_{i};"
 
     else:
-        raise ValueError(f"No tensordot implementation exists for dtype '{in_dtype}'!")
+        num_packed = (num_outputs - offset) // 2
+        for i in range(num_packed):
+            index = 2 * i + offset
+            yield f"int32_t packed_res_{i} = requant_{index} + (requant_{index + 1} << 16);"
 
-    function_name = _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix)
-    return textwrap.dedent(
-        (
-            f"""
-        #include <stdint.h>
-        #include <arm_nnsupportfunctions.h>
+        if offset == 1:
+            yield "((int16_t*) output)[1] = (int16_t) requant_0;"
 
-        #ifdef __cplusplus
-        extern "C"
-        #endif
-        __STATIC_FORCEINLINE int32_t {function_name}(
-            uint32_t *out,
-            uint32_t *tensor,
-            uint32_t *kernel) {{
-
-          uint32_t sum = 0;
-
-          #pragma GCC unroll {tensor_h}
-          for (int i = 0; i < {tensor_h}; i++) {{
-            #pragma GCC unroll {tensor_w // simd_lanes}
-            for (int j = 0; j < {tensor_w // simd_lanes}; j++) {{
-              uint32_t tensor_batch = *tensor++;
-              uint32_t kernel_batch = *kernel++;
-              {inner_loop.strip()}
-            }}
-            tensor += {jump // simd_lanes};
-          }}
-          out[0] = sum;
+        for i in range(num_packed):
+            yield f"output[{offset + i}] = packed_res_{i};"
+
+        if (offset + num_outputs) % 2 == 1:
+            yield f"((int16_t*) output)[{num_packed * 2}] = (int16_t) requant_{num_packed * 2};"
+
+
+def tensordot_int16_impl(
+    num_outputs: int,
+    dimensions: Tuple[int, int, int],
+    offsets: Tuple[int, int, int],
+    x_strides: Tuple[int, int],
+    requantize_shift: int = 33,
+    output_zero_point: int = -128,
+) -> Tuple[str, str]:
+    """Generates code to compute a tensor dot product with requantization.
+
+    The generated function takes pointers to the output, tensor, and kernel as input. All pointers
+    must be word aligned. Only works with `int16` data type. The generated code is optimized for the
+    ARMv7E-M architecture.
+
+    Parameters
+    ----------
+    num_outputs: int
+        The number of tensordot outputs to compute per function call. Computing more than one at
+        once makes us much faster by reducing how often overlapping data is loaded. However, setting
+        this too high causes us to run out of registers and need to store data on the stack. We
+        should autotune this, but num_outputs=2 is usually OK.
+
+    dimensions: Tuple[int, int, int]
+        The dimensions of each tensordot operation. dimensions[1] and dimensions[2] are the height
+        and width of the kernel, respectively. dimensions[0] is the width of the data tensor, which
+        is usually larger than the kernel.
+
+    offsets: Tuple[int, int, int]
+        Each value is 0 or 1, and represents how far after the given data, kernel, and output
+        pointers (respectively) we should start reading/writing. This prevents us from having to
+        check if each pointer is aligned or unaligned at runtime, making us faster.
+
+    x_strides: Tuple[int, int]
+        The distance (in halfwords) between the start of each input tensor, and where to write each
+        output result respectively. Only used when num_outputs > 1.
+
+    requantize_shift: int
+        The distance to right shift after multiplying by the requantization scale. Defaults to 33,
+        as this lets us skip a shift operation.
+
+    outout_zero_point: int
+        The output zero point, which will be subtracted after scale multiplication but before
+        clipping. Defaults to -128, as most models always use this.
+
+    Returns
+    -------
+    func_name, func_code: Tuple[str, str]
+        The name and source code of the generated function.
+    """
+    function_name = _get_c_function_name(num_outputs, dimensions, offsets, x_strides)
+    tensor_w, kernel_h, kernel_w = dimensions
+    tensor_offset, kernel_offset, output_offset = offsets
+    assert tensor_offset < 2 and kernel_offset < 2 and output_offset < 2
+    in_stride, out_stride = x_strides
+
+    tensor_halfwords = _get_tensor_halfwords(dimensions, tensor_offset, num_outputs, in_stride)
+    kernel_halfwords = _get_kernel_halfwords(dimensions, kernel_offset)
+    load_tensor_lines = _load_tensor_vars(tensor_halfwords, tensor_w)
+    load_kernel_lines = _load_kernel_vars(kernel_halfwords)
+
+    def gen_single_loop_macs(index):
+        draft_macs_iter = _get_draft_macs(
+            (kernel_h, kernel_w), tensor_halfwords, kernel_halfwords, index * in_stride
+        )
+        draft_macs_iter = _apply_simd_optimizations(draft_macs_iter)
+        return _expand_instruction_tuples(draft_macs_iter, index)
+
+    multiply_acc_lines = chain.from_iterable(gen_single_loop_macs(i) for i in range(num_outputs))
+    requantize_lines = _requantize_sums(
+        num_outputs, requantize_shift=requantize_shift, output_zero_point=output_zero_point
+    )
+    write_out_lines = _write_sums_to_memory(num_outputs, output_offset, out_stride)
+
+    def insert_lines(lines):
+        return ("\n" + " " * 10).join(lines)
+
+    # It's very common for one model to have different layers that use identical tensordot
+    # functions. To prevent function re-definition errors, we need an #include guard. This is better
+    # than adding a random suffix, as it saves flash memory.
+    code = textwrap.dedent(
+        f"""
+        #ifndef {function_name.upper()}_EXISTS
+        #define {function_name.upper()}_EXISTS
+        #include <arm_acle.h>
+        __attribute__((always_inline)) static inline int32_t {function_name}(
+            int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale
+        ) {{
+          {_init_biased_accumulators(num_outputs)}
+
+          {insert_lines(load_tensor_lines)}
+
+          {insert_lines(load_kernel_lines)}
+
+          {insert_lines(multiply_acc_lines)}
+
+          {insert_lines(requantize_lines)}
+
+          {insert_lines(write_out_lines)}
           return 0;
         }}
+        #endif
         """
-        )
     )
+    return (function_name, code)
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py b/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py
deleted file mode 100644
index 79564f98edfc..000000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Implementations of several conv2d variations, all tensorized using tensordot and optimized for
-Cortex-M DSP. Currently contains a standard conv2d and depthwise conv2d implementation, but could be
-extended to add a grouped conv2d operator. Due to the way we tensorize, this schedule ONLY works
-when the data and kernel layouts are NCHWxc and OIHWxi respectively, where x is the number of
-input channels divided by the number of groups."""
-
-import random
-import string
-from typing import Callable, Tuple, Union
-
-import tvm
-from tvm import te
-from tvm.tir import indexdiv, indexmod
-from tvm.topi.utils import traverse_inline
-from tvm.topi.nn.pad import pad
-
-from .micro_kernel.tensordot import (
-    make_intrin_tensordot,
-    tensordot_impl,
-)
-
-
-def _unpack_2d_argument(argument: Union[int, Tuple]) -> Tuple:
-    if isinstance(argument, int):
-        return (argument, argument)
-    assert len(argument) == 2
-    return argument
-
-
-def _check_no_dilation(dilation: Union[int, Tuple]) -> None:
-    """Takes a dilation argument as an integer or tuple, and makes sure both dimensions are 1.
-    Dilation prevents us from using DSP instructions, so this schedule can't work (aside from the
-    niche case where dilation_h == stride_h and dilation_w == stride_w, which is rare enough we
-    probably don't need to support it)."""
-
-    dilation_h, dilation_w = _unpack_2d_argument(dilation)
-    assert dilation_h == dilation_w == 1
-
-
-def _unpack_padding(padding: Tuple) -> Tuple:
-    assert isinstance(padding, tuple)
-    if len(padding) == 2:
-        (pad_up, pad_down), (pad_left, pad_right) = padding
-    else:
-        pad_up, pad_left, pad_down, pad_right = padding
-    return pad_up, pad_left, pad_down, pad_right
-
-
-def _pad_if_needed(data: te.tensor.Tensor, layout: str, padding: Tuple) -> te.tensor.Tensor:
-    """Performs padding on a te.tensor.Tensor object if necessary. If padding = (0, 0, 0, 0), the
-    input tensor is returned unmodified. We only care about tuples here - "VALID" and "SAME" padding
-    will be converted by the importer TFLite importer if present."""
-
-    pad_up, pad_left, pad_down, pad_right = padding
-    if not any(padding):
-        return data
-
-    # We want to pad the "H" and "W" columns, and their position depends on the layout
-    pad_before, pad_after = [0, 0, 0, 0], [0, 0, 0, 0]
-    pad_before[layout.index("H")] = pad_up
-    pad_before[layout.index("W")] = pad_left
-    pad_after[layout.index("H")] = pad_down
-    pad_after[layout.index("W")] = pad_right
-    return pad(data, pad_before, pad_after, name="padded_data")
-
-
-def _compute_output_dim(
-    data_dim: int, kernel_dim: int, pad_before: int, pad_after: int, stride: int
-) -> int:
-    """Computes an output dimension of a convolution, given the data dimension, kernel dimension,
-    padding, and stride along that axis. Note that when stride > 1, this division will often not
-    be perfectly even."""
-    return (data_dim + pad_before + pad_after - kernel_dim) // stride + 1
-
-
-def _wrap_te_compute(
-    shape: Tuple,
-    fcompute: Callable[[int, int, int, int], tvm.ir.PrimExpr],
-    desired_out_layout: str,
-    current_out_layout: str = "NHWC",
-    **kwargs,
-) -> te.tensor.Tensor:
-    """Wrapper over te.compute that allows the output layout to be easily changed."""
-    assert current_out_layout.isalpha() and desired_out_layout.isalpha()
-    assert sorted(current_out_layout) == sorted(desired_out_layout)
-    forward_order = (current_out_layout.index(c) for c in desired_out_layout)
-    reverse_order = (desired_out_layout.index(c) for c in current_out_layout)
-
-    return te.compute(
-        tuple(shape[i] for i in forward_order),
-        lambda *args: fcompute(*(args[i] for i in reverse_order)),
-        **kwargs,
-    )
-
-
-def _get_suffix() -> str:
-    """Returns a random eight-character string to append to C function names. Prevents accidental
-    re-definition of functions if the same operator appears twice in a Relay graph."""
-    return "".join(random.choices(string.ascii_uppercase, k=8))
-
-
-def conv2d_nhwc_ohwi_dsp_compute(
-    _cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
-):
-    """Standard conv2d schedule that can be tensorized using tensordot."""
-
-    stride_h, stride_w = _unpack_2d_argument(strides)
-    pad_up, pad_left, pad_down, pad_right = _unpack_padding(padding)
-    _check_no_dilation(dilation)
-
-    batch_size, data_h, data_w, in_channels = data.shape
-    output_channels, kernel_h, kernel_w, _ = kernel.shape
-    assert kernel.shape[3] == in_channels
-
-    output_h = _compute_output_dim(data_h, kernel_h, pad_up, pad_down, stride_h)
-    output_w = _compute_output_dim(data_w, kernel_w, pad_left, pad_right, stride_w)
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-    kc_i = te.reduce_axis((0, in_channels), name="rc")
-
-    padded_data = _pad_if_needed(data, "NHWC", (pad_up, pad_left, pad_down, pad_right))
-    return _wrap_te_compute(
-        (batch_size, output_h, output_w, output_channels),
-        lambda n, y, x, c: te.sum(
-            padded_data[n, y * stride_h + kh_i, x * stride_w + kw_i, kc_i].astype(out_dtype)
-            * kernel[c, kh_i, kw_i, kc_i].astype(out_dtype),
-            axis=(kh_i, kw_i, kc_i),
-        ),
-        out_layout,
-        name="conv2d",
-        tag="conv2d_nhwc_ohwi_dsp",
-    )
-
-
-def _make_conv2d_tensorization(padded_data, kernel):
-    _, _, padded_w, in_channels = padded_data.shape
-    _, kernel_h, kernel_w, _ = kernel.shape
-    in_dtype = padded_data.dtype
-    suffix = _get_suffix()
-    assert in_dtype == kernel.dtype
-
-    data_slice = te.placeholder((kernel_h, kernel_w, in_channels), name="a", dtype=in_dtype)
-    kernel_slice = te.placeholder((kernel_h, kernel_w, in_channels), name="b", dtype=in_dtype)
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-    kc_i = te.reduce_axis((0, in_channels), name="kc_i")
-
-    output_slice = te.compute(
-        (1,),
-        lambda k: te.sum(
-            data_slice[kh_i, kw_i, kc_i].astype("int32")
-            * kernel_slice[kh_i, kw_i, kc_i].astype("int32"),
-            axis=[kh_i, kw_i, kc_i],
-        ),
-        name="c",
-    )
-
-    # TVM has a really strange bug where the outer reduction axis (kh_i) having length 1 causes the
-    # decl_buffer strides check to fail. height_stride is a dark magic workaround for this.
-    height_stride = in_channels * padded_w if kernel_h > 1 else in_channels
-    jump = (padded_w - kernel_w) * in_channels
-    tensordot_params = (in_dtype, kernel_h, jump, kernel_w * in_channels, suffix)
-    intrin_tensordot = make_intrin_tensordot(
-        (data_slice, kernel_slice, output_slice),
-        ([height_stride, in_channels, 1], [kernel_w * in_channels, in_channels, 1]),
-        tensordot_params,
-    )
-
-    tensordot_code = tensordot_impl(*tensordot_params)
-    return (intrin_tensordot, tensordot_code)
-
-
-def depthwise_conv2d_nchw_oihw_dsp_compute(
-    _cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
-):
-    """Depthwise conv2d schedule that can be tensorized using tensordot."""
-
-    stride_h, stride_w = _unpack_2d_argument(strides)
-    pad_up, pad_left, pad_down, pad_right = _unpack_padding(padding)
-    _check_no_dilation(dilation)
-
-    batch_size, in_channels, data_h, data_w = data.shape
-    _, c_mul, kernel_h, kernel_w = kernel.shape
-    output_channels = in_channels * c_mul
-    assert kernel.shape[0] == in_channels
-
-    output_h = _compute_output_dim(data_h, kernel_h, pad_up, pad_down, stride_h)
-    output_w = _compute_output_dim(data_w, kernel_w, pad_left, pad_right, stride_w)
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-
-    padded_data = _pad_if_needed(data, "NCHW", (pad_up, pad_left, pad_down, pad_right))
-    return _wrap_te_compute(
-        (batch_size, output_h, output_w, output_channels),
-        lambda n, y, x, c: te.sum(
-            padded_data[
-                n,
-                indexdiv(c, c_mul),
-                y * stride_h + kh_i,
-                x * stride_w + kw_i,
-            ].astype(out_dtype)
-            * kernel[indexdiv(c, c_mul), indexmod(c, c_mul), kh_i, kw_i].astype(out_dtype),
-            axis=(kh_i, kw_i),
-        ),
-        out_layout,
-        name="depthwise_conv2d",
-        tag="depthwise_conv2d_nchw_oihw_dsp",
-    )
-
-
-def _make_depthwise_conv2d_tensorization(padded_data, kernel):
-    _, _, _, padded_w = padded_data.shape
-    _, _, kernel_h, kernel_w = kernel.shape
-
-    in_dtype = padded_data.dtype
-    suffix = _get_suffix()
-    assert in_dtype == kernel.dtype
-
-    data_slice = te.placeholder((kernel_h, kernel_w), name="a", dtype=in_dtype)
-    kernel_slice = te.placeholder((kernel_h, kernel_w), name="b", dtype=in_dtype)
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-
-    output_slice = te.compute(
-        (1,),
-        lambda k: te.sum(
-            data_slice[kh_i, kw_i].astype("int32") * kernel_slice[kh_i, kw_i].astype("int32"),
-            axis=[kh_i, kw_i],
-        ),
-        name="c",
-    )
-
-    jump = padded_w - kernel_w
-    tensordot_params = (in_dtype, kernel_h, jump, kernel_w, suffix)
-    intrin_tensordot = make_intrin_tensordot(
-        (data_slice, kernel_slice, output_slice),
-        ([padded_w, 1], [kernel_w, 1]),
-        tensordot_params,
-    )
-
-    tensordot_code = tensordot_impl(*tensordot_params)
-    return (intrin_tensordot, tensordot_code)
-
-
-def tensordot_conv2ds_schedule(_cfg, outs):
-    """Schedule function using v7e-m DSP instructions for all the conv2d operators in this file. We
-    use one schedule function for them all, because they are tensorized with the same kernel."""
-
-    schedule = te.create_schedule([x.op for x in outs])
-
-    def _callback(operator):
-        if "conv2d" in operator.tag:
-            output = operator.output(0)
-            padded_data = output.op.input_tensors[0]
-            kernel = output.op.input_tensors[1]
-
-            if operator.tag == "conv2d_nhwc_ohwi_dsp":
-                b_ax, y_ax, x_ax, co_ax = schedule[output].op.axis
-                kh_ax, kw_ax, ci_ax = schedule[output].op.reduce_axis
-                schedule[output].reorder(b_ax, y_ax, x_ax, co_ax, kh_ax, kw_ax, ci_ax)
-                intrin, code = _make_conv2d_tensorization(padded_data, kernel)
-
-            elif operator.tag == "depthwise_conv2d_nchw_oihw_dsp":
-                b_ax, y_ax, x_ax, co_ax = schedule[output].op.axis
-                kh_ax, kw_ax = schedule[output].op.reduce_axis
-                schedule[output].reorder(b_ax, co_ax, y_ax, x_ax, kh_ax, kw_ax)
-                intrin, code = _make_depthwise_conv2d_tensorization(padded_data, kernel)
-
-            else:
-                raise ValueError(f"Cannot tensorize {operator.tag} with tensordot!")
-
-            schedule[output].tensorize(kh_ax, intrin)
-            schedule[output].pragma(b_ax, "import_c", code)
-
-    traverse_inline(schedule, outs[-1].op, _callback)
-    return schedule
diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py
new file mode 100644
index 000000000000..fad64cc09bb8
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/qnn.py
@@ -0,0 +1,370 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Contains TVMScript implementations of some QNN operators for Arm.
+
+Currently, the only ops with compute functions are fused regular and depthwise convolutions for
+Arm Cortex-M with DSP.
+"""
+
+from typing import Tuple
+
+import tvm
+from tvm import te
+from tvm.tir import const
+from tvm.script import tir as T
+from ..utils import get_const_tuple
+from .mprofile.dsp.micro_kernel import tensordot
+
+
+def int_ceil_division(x, y):
+    return -(x // -y)
+
+
+def _compute_output_dim(data_length, kernel_length, stride):
+    return int_ceil_division(data_length + 1 - kernel_length, stride)
+
+
+def _pick_tensordot_impl(attrs, inputs, num_outputs=2, is_depthwise=False):
+    """Helper function that chooses the right implementation of micro_kernel.tensordot.
+
+    Takes as input the parameters of the conv2d, and returns a tuple of TWO (function_name,
+    function_code). The first pair (the aligned one) is for even numbered output channels, and the
+    second pair (the offset one) is for odd-numbered output channels. This function is used for
+    regular and depthwise convolutions.
+
+    We need different implementations for even vs odd numbered output channels, because the "start"
+    of an odd output channel in the data tensor or kernel might or might not be on a word boundary,
+    and the tensordot code expects all input pointers to be word-aligned.
+    """
+    data, kernel = inputs[0:2]
+    rq_output_zero_point_const = inputs[10]
+    assert len(rq_output_zero_point_const.op.body) == 1
+    output_zero_point = rq_output_zero_point_const.op.body[0]
+
+    _, stride_w = get_const_tuple(attrs.strides)
+
+    if is_depthwise:
+        assert attrs.data_layout == "NCHW"
+        assert attrs.kernel_layout == "IOHW"
+        _, _, height, width = get_const_tuple(data.shape)
+        _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
+
+        dimensions = (width, kernel_h, kernel_w)
+        in_stride = stride_w
+        data_per_oc_size = height * width
+    else:
+        assert attrs.data_layout == "NHWC"
+        assert attrs.kernel_layout == "OHWI"
+        _, height, width, in_channels = get_const_tuple(data.shape)
+        out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape)
+
+        dimensions = (width * in_channels, kernel_h, kernel_w * in_channels)
+        in_stride = in_channels * stride_w
+        data_per_oc_size = 0
+
+    assert attrs.out_layout is not None
+    if attrs.out_layout == "NHWC":
+        out_stride = out_channels
+    elif attrs.out_layout == "NCHW":
+        out_stride = 1
+    else:
+        raise ValueError(f"Unsupported output layout {attrs.out_layout}!")
+
+    x_strides = (in_stride, out_stride)
+    aligned_func = tensordot.tensordot_int16_impl(
+        num_outputs,
+        dimensions,
+        (0, 0, 0),
+        x_strides,
+        output_zero_point=output_zero_point,
+    )
+
+    kernel_per_oc_size = dimensions[1] * dimensions[2]
+
+    offsets = (data_per_oc_size % 2, kernel_per_oc_size % 2, 0)
+    offset_func = tensordot.tensordot_int16_impl(
+        num_outputs,
+        dimensions,
+        offsets,
+        x_strides,
+        output_zero_point=output_zero_point,
+    )
+
+    return (aligned_func, offset_func)
+
+
+def _make_tscript_ptr(buffer, offset, length, dtype="int16"):
+    return T.tvm_access_ptr(
+        T.type_annotation(dtype=dtype),
+        buffer.data,
+        offset,
+        length,
+        1,
+        dtype="handle",
+    )
+
+
+def _make_tscript_call(func_name, *args):
+    return T.evaluate(T.call_extern(func_name, *args, dtype="int32"))
+
+
+def _make_conv2d_primfunc(
+    call_dimensions: Tuple,
+    buffer_shapes: Tuple[Tuple, Tuple, Tuple, Tuple, Tuple],
+    aligned_func: Tuple[str, str],
+    offset_func: Tuple[str, str],
+    ptr_gens: Tuple,
+):
+    height, width, out_channels = call_dimensions
+    data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes
+    aligned_func_name, aligned_func_code = aligned_func
+    offset_func_name, offset_func_code = offset_func
+    output_ptr, data_ptr, kernel_ptr = ptr_gens
+
+    # If the functions are identical, we can skip the second loop
+    if aligned_func_name == offset_func_name:
+        aligned_channels = out_channels
+        offset_channels = tvm.tir.const(0)
+        c_step = tvm.tir.const(1)
+    else:
+        aligned_channels = out_channels // 2
+        offset_channels = out_channels // 2
+        c_step = tvm.tir.const(2)
+
+    def bias_ptr(bias, c):
+        return _make_tscript_ptr(bias, c, 1, dtype="int32")
+
+    def scale_ptr(scale, c):
+        return _make_tscript_ptr(scale, c, 1, dtype="int32")
+
+    @T.prim_func
+    def biased_quantized_conv2d(
+        data_handle: T.handle,
+        kernel_handle: T.handle,
+        bias_handle: T.handle,
+        scale_handle: T.handle,
+        output_handle: T.handle,
+    ) -> None:
+
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        data = T.match_buffer(data_handle, data_shape, dtype="int16")
+        kernel = T.match_buffer(kernel_handle, kernel_shape, dtype="int16")
+        bias = T.match_buffer(bias_handle, bias_shape, dtype="int32")
+
+        # We don't specify a data type for the requantization scale, even though we will read it as
+        # an int32. This is because we must pretend it is a float32, as Relay's requantize op only
+        # allows floating point scales.
+        scale = T.match_buffer(scale_handle, scale_shape)
+        output = T.match_buffer(output_handle, output_shape, dtype="int16")
+
+        # This hack prevents TVM from seeing these variables as "unused". I should be using T.reads
+        # and T.writes, but they don't work. I think it's an issue with BufferTouchedDomain.
+        # pylint: disable=unused-variable
+        output[0, 0, 0, 0] = 0
+        __1 = data[0, 0, 0, 0]
+        __2 = kernel[0, 0, 0, 0]
+        __3 = bias[0, 0, 0, 0]
+        __4 = scale[0]
+        # pylint: enable=unused-variable
+
+        for c_ax, y_ax, x_ax in T.grid(aligned_channels, height, width):
+            with T.block("conv2d_aligned"):
+                T.block_attr({"pragma_import_c": aligned_func_code})
+                y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax])
+                _make_tscript_call(
+                    aligned_func_name,
+                    output_ptr(output, y, x, c * c_step),
+                    data_ptr(data, y, x, c * c_step),
+                    kernel_ptr(kernel, c * c_step),
+                    bias_ptr(bias, c * c_step),
+                    scale_ptr(scale, c * c_step),
+                )
+
+        for c_ax, y_ax, x_ax in T.grid(offset_channels, height, width):
+            with T.block("conv2d_offset"):
+                T.block_attr({"pragma_import_c": offset_func_code})
+                y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax])
+                _make_tscript_call(
+                    offset_func_name,
+                    output_ptr(output, y, x, c * c_step + 1),
+                    data_ptr(data, y, x, c * c_step + 1, offset=1),
+                    kernel_ptr(kernel, c * c_step + 1, offset=1),
+                    bias_ptr(bias, c * c_step + 1),
+                    scale_ptr(scale, c * c_step + 1),
+                )
+
+    return biased_quantized_conv2d
+
+
+def qnn_conv2d(attrs, inputs, out_type):
+    """Compute for qnn.conv2d with NHWC layout.
+
+    Note that this is a DIFFERENT layout from the Hexagon variant, because they have special
+    instructions Cortex-M doesn't have. We expect the kernel to have OHWI layout. We also assume
+    that padding is not necessary, as it will have been done by another pass.
+    """
+
+    # Make a few checks to unpack the function arguments and ensure it was called with the right
+    # arguments. Note that unlike most schedules, qnn_conv2d does not use a wrapper.
+    assert len(inputs) == 11
+    data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8]
+    output_layout = attrs.out_layout
+    assert output_layout == "NHWC"
+
+    _, height, width, in_channels = get_const_tuple(data.shape)
+    out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape)
+    y_stride, x_stride = get_const_tuple(attrs.strides)
+
+    out_height = _compute_output_dim(height, kernel_h, y_stride)
+    out_width = _compute_output_dim(width, kernel_w, x_stride)
+
+    # Decide how many sums our function should have running at the same time. Doing
+    # this lets us do "more work" for each memory load, but doing too many of them causes us to run
+    # out of registers. Currently this is set to either 1 or 2, but autotuning this value would
+    # improve performance a lot. Tracked by https://github.com/apache/tvm/issues/13528.
+
+    num_outputs = 2
+
+    # Next, decide whether whether we need "parity alternation". For example, if we have an
+    # 8x3x3x3 kernel (8 output channels, height 3, width 3, input channels 3) in the OHWI layout,
+    # then every output channel kernel slice will be 27 halfwords. This means every other output
+    # channel will not be word aligned, which will cause slowness/crashes!
+
+    # We solve this problem by handling the "aligned" and "offset" output channels with different
+    # versions of our tensordot function. The "aligned func" assumes that the start positions of the
+    # output, data, and kernel are given exactly by their pointer. The "offset" version assumes that
+    # the "true" start of the output is the value in the output pointer, plus an offset of 0 or 1.
+    # _pick_tensordot_impl decides whether this is the case. If not, we only want to generate one
+    # function (to save flash), so offset_func is a tuple of empty strings.
+
+    aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, False)
+
+    # Helper functions to make pointers
+    def output_ptr(buffer, y, x, c):
+        return _make_tscript_ptr(
+            buffer,
+            y * const(out_width * out_channels) + x * const(out_channels * num_outputs) + c,
+            1,
+        )
+
+    # We need to disable pylint's unused argument checker, as the kwarg offset is unused but must
+    # be present for compatibility. We cannot add an underscore as we normally would, as this makes
+    # the keyword not match.
+
+    # pylint: disable=unused-argument
+    def data_ptr(buffer, y, x, c, offset=0):
+        return _make_tscript_ptr(
+            buffer,
+            y * const(y_stride * width * in_channels)
+            + x * const(x_stride * num_outputs * in_channels),
+            1,
+        )
+
+    # pylint: enable=unused-argument
+
+    def kernel_ptr(buffer, c, offset=0):
+        return _make_tscript_ptr(
+            buffer,
+            c * const(kernel_h * kernel_w * in_channels) - offset,
+            1,
+        )
+
+    prim_func = _make_conv2d_primfunc(
+        (const(out_height), const(out_width // num_outputs), const(out_channels)),
+        (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape),
+        aligned_func,
+        offset_func,
+        (output_ptr, data_ptr, kernel_ptr),
+    )
+
+    output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16")
+    return [output]
+
+
+def schedule_qnn_conv2d(_attrs, _outs, _target):
+    """Schedule function for qnn.conv2d."""
+    return None
+
+
+def qnn_depthwise_conv2d(attrs, inputs, out_type):
+    """Compute for qnn.depthwise_conv2d with NCHW layout.
+
+    Works basically the same way as regular conv2d - see above.
+    """
+
+    assert len(inputs) == 11
+    data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8]
+    output_layout = attrs.out_layout
+    assert output_layout == "NHWC"
+
+    _, _, height, width = get_const_tuple(data.shape)
+    _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
+    _, out_height, out_width, _ = get_const_tuple(out_type.shape)
+    y_stride, x_stride = get_const_tuple(attrs.strides)
+
+    out_height = _compute_output_dim(height, kernel_h, y_stride)
+    out_width = _compute_output_dim(width, kernel_w, x_stride)
+
+    num_outputs = 2
+
+    aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, True)
+
+    # Helper functions for making pointers.
+    def output_ptr(buffer, y, x, c):
+        return _make_tscript_ptr(
+            buffer,
+            y * const(out_width * out_channels) + x * const(out_channels * num_outputs) + c,
+            1,
+        )
+
+    def data_ptr(buffer, y, x, c, offset=0):
+        if height * width % 2 == 1:
+            x_ptr_offset = tvm.tir.const(-1)
+        else:
+            x_ptr_offset = tvm.tir.const(0)
+
+        return _make_tscript_ptr(
+            buffer,
+            c * const(width * height)
+            + y * const(y_stride * width)
+            + x * const(x_stride * num_outputs)
+            + offset * x_ptr_offset,
+            1,
+        )
+
+    def kernel_ptr(buffer, c, offset=0):
+        return _make_tscript_ptr(
+            buffer,
+            c * tvm.tir.const(kernel_h * kernel_w) - offset,
+            1,
+        )
+
+    prim_func = _make_conv2d_primfunc(
+        (const(out_height), const(out_width // num_outputs), const(out_channels)),
+        (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape),
+        aligned_func,
+        offset_func,
+        (output_ptr, data_ptr, kernel_ptr),
+    )
+
+    output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16")
+    return [output]
+
+
+def schedule_qnn_depthwise_conv2d(_attrs, _outs, _target):
+    """Schedule function for qnn.depthwise_conv2d."""
+    return None
diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py
new file mode 100644
index 000000000000..00225493db96
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py
@@ -0,0 +1,122 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Arm Cortex-M specific optimizations for quantized operators."""
+
+import numpy as np
+
+from tvm import nd, relay, target
+from ..nn import qnn_requantize_alter_layout, qnn_add_alter_layout
+
+
+@qnn_requantize_alter_layout.register(["arm_cpu"])
+def alter_requantize_layout(attrs, inputs, _tinfos, _out_type):
+    """Changes a floating point requantize op to use int64 multiply + shift for microTVM.
+
+    Usually, this is done by QNN legalization. However, microTVM wants to manually choose the
+    integer rounding constants in order to:
+        (a) Have int32, not int64 constants
+        (b) Use a constant rounding shift to skip a memory load.
+
+    Ideally, we would pick these constants in the requantize (or fused) schedule. Unfortunately that
+    is not currently possible, so we pick them with `alter_layout` as a hack. This will only work if
+    the requantize schedule "plays along" with this hack.
+    """
+
+    # Only microTVM Cortex-M boards with DSP use the relevant schedules
+    current_target = target.Target.current(allow_none=False)
+    if not (current_target.features.has_dsp and "cortex-m" in current_target.mcpu):
+        return None
+
+    _, in_scale, _, out_scale, _ = inputs
+    in_scale_numpy = in_scale.data.numpy().astype("float64")
+    out_scale_scalar = out_scale.data.numpy().item()
+
+    # Shifting by 33 and rounding means shifting by 32, adding 1, and shifting by 1 again. This is
+    # useful, because shifting a multiplication product by 32 can be done for "free" with SMMUL
+    scales = ((in_scale_numpy / out_scale_scalar) * 2**33).astype("int32")
+
+    # Requantize ops in Relay do not support int32 scales - if we try to use one, requantize.cc will
+    # raise an error. As a hacky work-around, we change the scale dtype to float32, without changing
+    # underlying data. This works, as our compute function knows to interpret the scale as an int32.
+
+    # This is only a work-around - a better long-term solution would be adding a new integer
+    # requantize op, which takes integer scales, shifts, and rounding behavior.
+    fake_float_scales = scales.view("float32")
+
+    scale_constant = relay.Constant(nd.array(fake_float_scales))
+    return relay.qnn.op.requantize(inputs[0], scale_constant, *inputs[2:], **attrs)
+
+
+def _is_qnn_op_depthwise_conv2d(qnn_conv2d_op):
+    return relay.op.strategy.generic.is_depthwise_conv2d(
+        qnn_conv2d_op.args[0].type_annotation.shape,
+        qnn_conv2d_op.attrs.data_layout,
+        qnn_conv2d_op.args[1].data.shape,
+        qnn_conv2d_op.attrs.kernel_layout,
+        qnn_conv2d_op.attrs.groups,
+    )
+
+
+@qnn_add_alter_layout.register(["arm_cpu"])
+def alter_add_layout(_attrs, inputs, _tinfos, _out_type):
+    """Fuses the zero point for a previous quantized operator with this add operation.
+
+    Currently only supports qnn.conv2d, but qnn.dense support should be added. Note that this
+    optimization means we must pad tensors with the input zero point, and NOT with zero.
+    """
+
+    prev_op, biases = inputs
+    if not hasattr(prev_op, "op"):
+        return None
+    if prev_op.op.name != "qnn.conv2d":
+        return None
+
+    # We should not perform this alteration if the target has a uint * int SIMD MAC operation (since
+    # these do (x - (-128)) * y efficiently, and conv_input_zp is usually -128). For now, we
+    # restrict this optimization to just Cortex-M devices, but it might be helpful on others too.
+    current_target = target.Target.current(allow_none=False)
+    if not "cortex-m" in current_target.mcpu:
+        return None
+
+    conv_input_zp = prev_op.args[2].data.numpy().item()
+    kernel = prev_op.args[1].data.numpy()
+
+    if _is_qnn_op_depthwise_conv2d(prev_op):
+        axes_to_sum = "HW"
+    elif prev_op.attrs.groups == 1:
+        axes_to_sum = "HWI"
+    else:
+        # This alteration does not currently support grouped conv2d
+        return None
+    axes_to_sum = tuple(map(prev_op.attrs.kernel_layout.index, axes_to_sum))
+    element_sums = np.sum(kernel, axis=axes_to_sum).flatten()
+
+    # The zero point is subtracted from the input elements, so we need a "-" sign here
+    zp_shifted_sums = element_sums * (-conv_input_zp)
+
+    # We want to make sure new_biases is representable as an int32. It's tempting to just check
+    # whether arr.dtype == "int32" (since Numpy will automatically increase dtype in some cases)
+    # but this leads to weird wrapping behavior and doesn't work. We must do it manually.
+    new_biases = biases.data.numpy().astype("int64") + zp_shifted_sums
+    if new_biases.min() < -(2**31) or new_biases.max() > 2**31 - 1:
+        return None
+
+    new_input_zp = relay.Constant(nd.array(np.int32(0)))
+    new_conv_args = (*prev_op.args[:2], new_input_zp, *prev_op.args[3:])
+    new_conv_op = relay.qnn.op.conv2d(*new_conv_args, **prev_op.attrs)
+    bias_constant = relay.Constant(nd.array(new_biases.astype("int32")))
+    return relay.add(new_conv_op, bias_constant)
diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py
index caed28580037..222f7a7c223e 100644
--- a/python/tvm/topi/nn/qnn.py
+++ b/python/tvm/topi/nn/qnn.py
@@ -188,3 +188,51 @@ def _dispatch_sim_dequantize(value):
         return intn_value
 
     return te.compute(data.shape, lambda *indices: _dispatch_sim_dequantize(data)[indices])
+
+
+@tvm.target.generic_func
+def qnn_requantize_alter_layout(_attrs, _inputs, _tinfos, _out_type):
+    """Change requantize layout.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : tvm.relay.Expr
+        Grouped input symbols
+    tinfos : list
+        Input shape and dtype
+    out_type: type
+        The output type
+
+    Note
+    ----
+    Unlike other TOPI functions, this function operates on both graph level and operator level.
+    """
+    return None
+
+
+@tvm.target.generic_func
+def qnn_add_alter_layout(_attrs, _inputs, _tinfos, _out_type):
+    """Change add layout.
+
+    Add is not a QNN-specific function, but this generic exists so that bias add operations can be
+    fused with input zero point add optimizations, which only happens if the previous operator is
+    quantized.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : tvm.relay.Expr
+        Grouped input symbols
+    tinfos : list
+        Input shape and dtype
+    out_type: type
+        The output type
+
+    Note
+    ----
+    Unlike other TOPI functions, this function operates on both graph level and operator level.
+    """
+    return None
diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc
index 64a5a02e6e25..2170ba76e060 100644
--- a/src/relay/qnn/op/convolution.cc
+++ b/src/relay/qnn/op/convolution.cc
@@ -53,8 +53,9 @@ bool QnnConv2DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8) ||
          data->dtype == DataType::Int(16))
       << "Expected qnn conv2d type(int8, uint8, int16) for input but was " << data->dtype;
-  ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8))
-      << "Expected qnn conv2d type(int8, uint8) for weight but was " << weight->dtype;
+  ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8) ||
+         weight->dtype == DataType::Int(16))
+      << "Expected qnn conv2d type(int8, uint8, int16) for weight but was " << weight->dtype;
   ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32) ||
          param->out_dtype == DataType::Int(64))
       << "Expected qnn conv2d type(int16, int32, int64) for output but was " << param->out_dtype;
diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
index a3e48f4424ad..58173a9ea6c3 100644
--- a/tests/python/contrib/test_ethosn/test_convert_equivalents.py
+++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
@@ -120,7 +120,7 @@ def expected():
 @requires_ethosn
 @pytest.mark.parametrize(
     "dtype,shape,constant_shape",
-    [("int8", (1, 4, 4), (4,)), ("int16", (1, 16, 12, 4), (1, 1, 1, 4))],
+    [("int8", (1, 4, 4), (4,)), ("int32", (1, 16, 12, 4), (1, 1, 1, 4))],
 )
 def test_unsupported_multiply_to_depthwise(dtype, shape, constant_shape):
     """Check that unsupported variants of multiply to depthwise are not converted."""
@@ -339,7 +339,7 @@ def visit_call(self, call):
 
 @requires_ethosn
 @pytest.mark.parametrize(
-    "dtype,lhs_shape,rhs_shape", [("uint8", (1, 4, 4), (1, 1, 4)), ("int16", (1, 4, 4, 4), (4,))]
+    "dtype,lhs_shape,rhs_shape", [("uint8", (1, 4, 4), (1, 1, 4)), ("int32", (1, 4, 4, 4), (4,))]
 )
 def test_unsupported_add_to_depthwise(dtype, lhs_shape, rhs_shape):
     """Check that unsupported variants of add are not converted."""
diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_conv2d.py
index 6cf4bbb8e6ed..1b9c1a5e2e94 100644
--- a/tests/python/relay/strategy/arm_cpu/test_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_conv2d.py
@@ -93,28 +93,6 @@ class TestConv2d_NHWC_Spatial_Pack(Conv2dTests):
     schedule_name = parameter("conv2d_nhwc_spatial_pack.arm_cpu")
 
 
-class TestConv2d_Tensordot(Conv2dTests):
-    """This test is for the regular conv2d schedule tensorized using tensordot."""
-
-    data_shape, kernel_size, num_filter, strides, padding = parameters(
-        # Disabled because these kernels are not an integral number of words
-        # ((1, 32, 32, 1), (3, 3), 12, 1, 0),
-        # ((1, 32, 10, 3), (3, 3), 16, 1, 0),
-        # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1)),
-        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0)),
-        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0),
-        ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1)),
-        ((4, 16, 16, 16), (5, 5), 8, 2, 0),
-    )
-    dilation = parameter(1)
-    in_dtype = parameter("int8", "int16", "int32")
-
-    data_layout = parameter("NHWC")
-    kernel_layout = parameter("OHWI")
-    out_layout = parameter("NHWC", "NCHW")
-    schedule_name = parameter("conv2d_nhwc_ohwi_dsp.arm_cpu")
-
-
 class TestConv2d_NCHW_Spatial_Pack(Conv2dTests):
     """This test is for conv2d_nchw_spatial_pack.arm_cpu schedule."""
 
diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
index f45d27bdaee9..95ae105f9166 100644
--- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
@@ -110,36 +110,5 @@ class TestDepthwiseConv2d_NHWC_HWOI_DSP(DepthwiseConv2dTests):
     schedule_name = parameter("depthwise_conv2d_nhwc_dsp.arm_cpu")
 
 
-class TestDepthwiseConv2d_Tensordot(DepthwiseConv2dTests):
-    """This test is for the depthwise_conv2d schedule tensorized using tensordot."""
-
-    data_shape, kernel_size, num_filter, strides, padding, in_dtype = parameters(
-        # Currently, our schedule requires kernel_w be divisible by the number of simd lanes given
-        # its dtype. This means 3x3 and 5x5 kernels do not work on int16 or int8 for now. If you had
-        # to, you could hack around this by padding the data and kernel.
-        ((1, 48, 48, 8), (3, 3), 8, (1, 1), 1, "int32"),
-        ((1, 48, 48, 16), (3, 3), 16, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 24, 24, 32), (3, 3), 32, (1, 1), 1, "int32"),
-        ((1, 24, 24, 32), (3, 3), 32, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 12, 12, 64), (3, 3), 64, (1, 1), 1, "int32"),
-        ((1, 12, 12, 64), (3, 3), 64, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 6, 6, 128), (3, 3), 128, (1, 1), 1, "int32"),
-        ((1, 6, 6, 128), (3, 3), 128, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 3, 3, 256), (3, 3), 256, (1, 1), 1, "int32"),
-        ((1, 25, 5, 64), (3, 3), 64, (1, 1), 1, "int32"),
-        ((1, 24, 24, 8), (5, 5), 8, (1, 1), 1, "int32"),
-        ((1, 24, 24, 8), (3, 5), 8, (1, 1), 1, "int32"),
-        # These "evenly divisible" kernels work on smaller dtypes.
-        ((1, 48, 48, 8), (3, 2), 8, 1, 0, "int16"),
-        ((1, 48, 48, 8), (4, 4), 8, 1, 0, "int8"),
-    )
-    dilation = parameter(1)
-
-    data_layout = parameter("NCHW")
-    kernel_layout = parameter("OIHW")
-    out_layout = parameter("NHWC", "NCHW")
-    schedule_name = parameter("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
-
-
 if __name__ == "__main__":
     main()
diff --git a/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py
index 499d677e8f95..d48c7e138fba 100644
--- a/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py
@@ -26,7 +26,7 @@
 from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
 
 
-def _change_ndarray_layout(arr, src_layout, dst_layout):
+def change_ndarray_layout(arr, src_layout, dst_layout):
     """Makes a copy of an ndarray, reshaping it to a new data layout.
 
     Parameter
@@ -96,7 +96,7 @@ def test_conv2d(
 
         ref_relay_op = relay.op.nn.conv2d(
             ref_input_var,
-            relay.const(_change_ndarray_layout(ref_kernel_data, "HWIO", self.ref_kernel_layout)),
+            relay.const(change_ndarray_layout(ref_kernel_data, "HWIO", self.ref_kernel_layout)),
             kernel_size=kernel_size,
             strides=strides,
             padding=padding,
@@ -113,11 +113,11 @@ def test_conv2d(
         # Reshape output dictionary to match out_layout
         assert len(ref_outputs) == 1
         output_tensor_name, output_tensor = next(iter(ref_outputs.items()))
-        ref_outputs[output_tensor_name] = _change_ndarray_layout(output_tensor, "NHWC", out_layout)
+        ref_outputs[output_tensor_name] = change_ndarray_layout(output_tensor, "NHWC", out_layout)
 
-        test_input_data = _change_ndarray_layout(ref_input_data, "NHWC", data_layout)
+        test_input_data = change_ndarray_layout(ref_input_data, "NHWC", data_layout)
         test_input_var = relay.var("input", relay.TensorType(test_input_data.shape, in_dtype))
-        test_kernel_data = _change_ndarray_layout(ref_kernel_data, "HWIO", kernel_layout)
+        test_kernel_data = change_ndarray_layout(ref_kernel_data, "HWIO", kernel_layout)
 
         test_relay_op = relay.op.nn.conv2d(
             test_input_var,
diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py
new file mode 100644
index 000000000000..573231f9632c
--- /dev/null
+++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py
@@ -0,0 +1,358 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""microTVM cares a lot about the convolution + bias + requantize + fused ReLU use case. There have
+been some accuracy issues in the past, so this test steps through a model (MobileNetV1) layer by
+layer and ensures there is 1-1 correspondance at each step. This test would run way faster if we ran
+the model all at once, but then we wouldn't know which layers had issues.
+
+Furthermore, this test uses some in-development optimizations for microTVM that aren't part of the
+main pipeline.
+"""
+
+import numpy as np
+from PIL import Image
+import pytest
+
+import tvm
+import tvm.testing
+from tvm import meta_schedule, relay
+from tvm.testing.aot import AOTTestModel, run_and_check, AOTCompiledTestModel
+from tvm.relay.backend import Executor, Runtime
+from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
+from tvm.contrib.download import download_testdata
+from test_generalized_conv2d import change_ndarray_layout
+
+
+# The model is the v0.7 version of the TinyML person detection (aka visual wake words) model. This
+# is an RGB 96x96 MobileNet V1 model.
+MODEL_URL = "https://github.com/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite"
+SAMPLE_URL = (
+    "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/elephant-299.jpg"
+)
+
+
+@pytest.fixture(scope="module")
+def interpreter():
+    """Returns a TFLite interpreter with the MLPerf Tiny visual wakewords model loaded, with an
+    elephant image run through it, and with all intermediate layer outputs saved."""
+
+    # Make sure the Tensorflow import is skipped if the test is being skipped. This is needed to
+    # prevent the "python: i386" tests from failing, as they don't have Tensorflow installed.
+    import tensorflow as tf  # pylint: disable=import-outside-toplevel
+
+    # Download the reference model
+    rel_model_path = "model_microtvm_mobilenetv1.tflite"
+    file = download_testdata(MODEL_URL, rel_model_path, overwrite=False)
+
+    # Load it into TensorFlow and allocate memory
+    interpreter = tf.lite.Interpreter(file, experimental_preserve_all_tensors=True)
+    interpreter.allocate_tensors()
+
+    # Download an image. The neuron activations are strange if we use random data or ones,
+    # so downloading an image is useful.
+    rel_image_path = "image_microtvm_mobilenetv1.jpg"
+    img_path = download_testdata(SAMPLE_URL, rel_image_path, overwrite=False)
+    image = Image.open(img_path).resize((96, 96))
+    image_data_hwc_uint8 = np.asarray(image)
+    assert image_data_hwc_uint8.shape == (96, 96, 3)
+    assert image_data_hwc_uint8.dtype == "uint8"
+    image_data_nhwc_int8 = (image_data_hwc_uint8 + 128).view("int8").reshape((1, 96, 96, 3))
+
+    # Load the image into the TFLite interpreter and compute all intermediate tensor values
+    input_details = interpreter.get_input_details()
+    interpreter.set_tensor(input_details[0]["index"], image_data_nhwc_int8)
+    interpreter.invoke()
+    return interpreter
+
+
+def _get_mobilenet_v1_layer_attributes(layer_num):
+    """Returns the relevant padding and stride for a given layer in a MobileNetV1 model. It's a huge
+    headache to read this data from TensorFlow, as it is not user accessible via the interpreter. If
+    we really wanted to, we would have to parse the .tflite file ourselves. This function is a bit
+    of a hack, but lets us skip that."""
+
+    if layer_num == 0:  # Regular conv2d
+        return ((0, 0, 1, 1), (2, 2), False)
+    if layer_num % 2 == 0:  # 1x1 conv2d
+        return ((0, 0, 0, 0), (1, 1), False)
+    if layer_num in [3, 7, 11, 23]:  # Downsizing depthwise_conv2d layers
+        return ((0, 0, 1, 1), (2, 2), True)
+    # Depthwise conv2d
+    return ((1, 1, 1, 1), (1, 1), True)
+
+
+def _get_relu_activation_prefix(layer_num):
+    if layer_num == 0:
+        return "model/activation/Relu;"
+    return f"model/activation_{layer_num}/Relu;"
+
+
+def _get_main_path_tensor_details(details, tensor_num):
+    """A "main path" tensor is a fused layer input/output. Gets the tensor details from the tensor
+    index, where 0 gives the original input tensor, 1 gives the output of the first fused
+    convolution layer, and so on. TFLite names are a little wack, so we get this information by
+    finding the SECOND tensor (which has the suffix "1") for each ReLU activation (the first tensor
+    is the bias)."""
+
+    if tensor_num == 0:
+        return details[0]
+    prefix = _get_relu_activation_prefix(tensor_num - 1)
+    detail = next(d for d in details if d["name"].startswith(prefix) and d["name"].endswith("1"))
+    assert len(detail["shape"]) == 4
+    assert detail["dtype"] == np.int8
+    return detail
+
+
+def _get_bias_details(details, layer_num):
+    """Gets the tensor details for the bias tensor for the corresponding convolution layer. The
+    bias tensors always appear before the main path tensors, so we don't have to check the ending to
+    make sure we have the right one."""
+    prefix = _get_relu_activation_prefix(layer_num)
+    detail = next(d for d in details if d["name"].startswith(prefix))
+    assert len(detail["shape"]) == 1
+    assert detail["dtype"] == np.int32
+    return detail
+
+
+def _get_kernel_details(details, layer_num):
+    """Gets the tensor details for the kernel tensor for the corresponding convolution layer. These
+    have a different naming scheme from the main path and bias tensors, as they are converted before
+    activation function fusion. Note that regular vs depthwise conv2ds have different prefixes."""
+
+    if layer_num == 0:
+        prefix = "model/conv2d/Conv2D"
+    elif layer_num % 2 == 0:
+        prefix = f"model/conv2d_{layer_num // 2}/"
+    else:
+        prefix = f"model/batch_normalization_{layer_num}/"
+
+    detail = next(d for d in details if d["name"].startswith(prefix))
+    assert len(detail["shape"]) == 4
+    assert detail["dtype"] == np.int8
+    return detail
+
+
+def _get_quant_scale_const(quantization_dict, as_scalar=False):
+    scales = quantization_dict["scales"]
+    if as_scalar:
+        assert len(scales) == 1
+        scales = scales[0]
+    return relay.const(scales, "float32")
+
+
+def _get_quant_zp_const(quantization_dict, as_scalar=False):
+    zero_points = quantization_dict["zero_points"]
+    if as_scalar:
+        assert len(zero_points) == 1
+        zero_points = zero_points[0]
+    return relay.const(zero_points, "int32")
+
+
+def _change_layout(data, old_layout, new_layout, dtype):
+    return change_ndarray_layout(data, old_layout, new_layout).astype(dtype)
+
+
+def _load_tflite_layer(interpreter, layer):
+    tensor_details = interpreter.get_tensor_details()
+
+    def lookup(detail):
+        return interpreter.get_tensor(detail["index"]), detail["quantization_parameters"]
+
+    input_data = lookup(_get_main_path_tensor_details(tensor_details, layer))
+    kernel_data = lookup(_get_kernel_details(tensor_details, layer))
+    bias_data = lookup(_get_bias_details(tensor_details, layer))
+    output_data = lookup(_get_main_path_tensor_details(tensor_details, layer + 1))
+    return input_data, kernel_data, bias_data, output_data
+
+
+def _make_relay_partial_func(relay_op, *args, **kwargs):
+    return lambda op: relay_op(op, *args, **kwargs)
+
+
+def _make_conv2d_op(kernel, data_quant, kernel_quant, hyperparams, is_depthwise=False):
+    dtype, padding, strides, data_layout, kernel_layout, output_layout = hyperparams
+    kernel_size = kernel.shape[1:3]
+    if is_depthwise:
+        channels = groups = kernel.shape[3]
+    else:
+        channels = kernel.shape[0]
+        groups = 1
+
+    kernel_ndarr = _change_layout(kernel, "OHWI", kernel_layout, dtype)
+
+    return _make_relay_partial_func(
+        relay.qnn.op.conv2d,
+        relay.const(kernel_ndarr, dtype),
+        input_zero_point=_get_quant_zp_const(data_quant, as_scalar=True),
+        kernel_zero_point=_get_quant_zp_const(kernel_quant),
+        input_scale=_get_quant_scale_const(data_quant, as_scalar=True),
+        kernel_scale=_get_quant_scale_const(kernel_quant),
+        kernel_size=kernel_size,
+        data_layout=data_layout,
+        kernel_layout="IOHW" if is_depthwise else kernel_layout,
+        dilation=(1, 1),
+        strides=strides,
+        padding=padding,
+        groups=groups,
+        channels=channels,
+        out_dtype="int32",
+        out_layout=output_layout,
+    )
+
+
+def _make_bias_op(bias, output_layout):
+    requantize_axis = output_layout.index("C")
+    return _make_relay_partial_func(
+        relay.op.nn.bias_add,
+        relay.const(bias, "int32"),
+        axis=requantize_axis,
+    )
+
+
+def _make_requantize_op(bias_quant, output_quant, output_dtype, output_layout):
+    requantize_axis = output_layout.index("C")
+    return _make_relay_partial_func(
+        relay.qnn.op.requantize,
+        _get_quant_scale_const(bias_quant),
+        _get_quant_zp_const(bias_quant),
+        _get_quant_scale_const(output_quant, as_scalar=True),
+        _get_quant_zp_const(output_quant, as_scalar=True),
+        axis=requantize_axis,
+        compute_dtype="int64",
+        out_dtype=output_dtype,
+    )
+
+
+def _make_aot_model(params, hyperparams, layouts, is_depthwise=False):
+    tensors, quantizations = zip(*params)
+    data, kernel, bias, output = tensors
+    data_quant, kernel_quant, bias_quant, output_quant = quantizations
+
+    dtype, padding, _strides = hyperparams
+    data_layout, _, output_layout = layouts
+
+    if any(padding):
+        pad_const = int(data_quant["zero_points"][0])
+        pad_before = (0, padding[0], padding[1], 0)
+        pad_after = (0, padding[2], padding[3], 0)
+        data = np.pad(data, tuple(zip(pad_before, pad_after)), constant_values=pad_const)
+    data_ndarr = _change_layout(data, "NHWC", data_layout, dtype)
+    output_ndarr = _change_layout(output, "NHWC", output_layout, dtype)
+
+    input_var = relay.var("input", relay.TensorType(data_ndarr.shape, dtype))
+    conv2d = _make_conv2d_op(kernel, data_quant, kernel_quant, hyperparams + layouts, is_depthwise)
+    bias = _make_bias_op(bias, output_layout)
+    requantize = _make_requantize_op(bias_quant, output_quant, dtype, output_layout)
+
+    relay_mod = requantize(bias(conv2d(input_var)))
+    relay_func = relay.Function([input_var], relay_mod)
+    return AOTTestModel(
+        module=tvm.IRModule.from_expr(relay_func),
+        inputs={"input": data_ndarr},
+        outputs={"output": output_ndarr},
+        output_tolerance=1,
+    )
+
+
+def _make_target():
+    return tvm.target.Target("c -keys=arm_cpu -mcpu=cortex-m7")
+
+
+def _make_executor():
+    return Executor(
+        "aot",
+        {
+            "workspace-byte-alignment": 8,
+            "constant-byte-alignment": 8,
+            "interface-api": "c",
+            "unpacked-api": True,
+        },
+    )
+
+
+@pytest.mark.parametrize("layer", range(23))
+@tvm.testing.requires_corstone300
+def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer):
+    """Checks microTVM output against TFLite for one MobileNetV1 layer.
+
+    Loads the input, kernel, bias, expected output, and quantization parameters from the specified
+    layer in a TFLite Interpreter. That information is used to construct a Relay Function with the
+    same structure. The Function is run using microTVM and AOTTestModel, and we verify microTVM's
+    output is the same as the TFLite ground truth.
+
+    This function only cross-checks the first 23 layers in MobileNetV1, which are regular and
+    depthwise 2D convolutions (this function only works for 2D convolutions). We do not test the
+    average pool, dense, or softmax layers at the end of the model.
+
+    Note that we disable the QNN Legalization pass. This allows TVM to use its QNN compute
+    definitions, fuse the three operations together, and perform other optimizations.
+
+    Parameters
+    ----------
+    interpreter: tensorflow.lite.python.interpreter.Interpreter
+        A TensorFlow Lite interpreter for a MobileNetV1 model, where invoke() has already been
+        called and experimental_preserve_all_tensors=True. Should be passed as a Pytest fixture.
+
+    layer: int
+        The index of the layer to check against TensorFlow's ground truth values.
+    """
+    dtype = "int16"
+
+    tensor, kernel, bias, output = _load_tflite_layer(interpreter, layer)
+
+    padding, strides, is_depthwise = _get_mobilenet_v1_layer_attributes(layer)
+    if is_depthwise:
+        data_layout, kernel_layout, output_layout = "NCHW", "OIHW", "NHWC"
+    else:
+        data_layout, kernel_layout, output_layout = "NHWC", "OHWI", "NHWC"
+
+    test_model = _make_aot_model(
+        (tensor, kernel, bias, output),
+        (dtype, padding, strides),
+        (data_layout, kernel_layout, output_layout),
+        is_depthwise=is_depthwise,
+    )
+
+    def schedule_fn(_sch):
+        return True
+
+    with tvm.transform.PassContext(
+        opt_level=3,
+        config={
+            "tir.disable_vectorize": True,
+            "relay.backend.use_meta_schedule": True,
+            "relay.backend.tir_converter": "allow_extern",
+        },
+        disabled_pass=["qnn.Legalize"],
+    ), meta_schedule.database.ScheduleFnDatabase(schedule_fn):
+        executor_factory = tvm.relay.build(
+            test_model.module,
+            _make_target(),
+            executor=_make_executor(),
+            runtime=Runtime("crt"),
+            params=test_model.params,
+            mod_name=test_model.name,
+        )
+        compiled = AOTCompiledTestModel(model=test_model, executor_factory=executor_factory)
+
+    run_and_check(
+        models=[compiled],
+        runner=AOT_CORSTONE300_RUNNER,
+        interface_api="c",
+        workspace_byte_alignment=8,
+        constant_byte_alignment=8,
+    )
diff --git a/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py
new file mode 100644
index 000000000000..46d2797ba394
--- /dev/null
+++ b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py
@@ -0,0 +1,415 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for functions in tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot.
+
+Contains a few unit tests, followed by integration tests for common use cases. Note that we do not
+run the generated code - we just make sure the strings match exactly.
+
+Note that a *lot* of instruction reordering happens during compilation from C to assembly (by GCC or
+Clang). I've verified that this instruction reordering happens correctly for all the functions here.
+For more details on why the generated code is the way it is, see `tensordot_int16_impl`."""
+
+import textwrap
+
+from tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot import (
+    _get_tensor_halfwords,
+    _get_kernel_halfwords,
+    tensordot_int16_impl,
+)
+
+
+def test_get_tensor_halfwords():
+    """Tests the _get_tensor_halfwords helper function in tensordot.py.
+
+    This function loads the logical indices of the data that will be stored in memory at the tensor
+    pointer. See the function docstring for more details.
+    """
+
+    # fmt: off
+    # A simple 3x3 depthwise convolution computing one output and with in_stride = 1. Note that each
+    # row is padded with None at the end to make the rows word-aligned.
+    assert _get_tensor_halfwords((48, 3, 3), 0, 1, 1) == [
+        (0, 0), (0, 1), (0, 2), None,
+        (1, 0), (1, 1), (1, 2), None,
+        (2, 0), (2, 1), (2, 2), None
+    ]
+
+    # If the tensor width is odd, padding alternates before/after every row.
+    assert _get_tensor_halfwords((49, 3, 3), 0, 1, 1) == [
+        (0, 0), (0, 1), (0, 2), None,
+        None, (1, 0), (1, 1), (1, 2),
+        (2, 0), (2, 1), (2, 2), None
+    ]
+
+    # If we are computing multiple outputs, more tensor data becomes relevant.
+    assert _get_tensor_halfwords((48, 3, 3), 0, 2, 1) == [
+        (0, 0), (0, 1), (0, 2), (0, 3),
+        (1, 0), (1, 1), (1, 2), (1, 3),
+        (2, 0), (2, 1), (2, 2), (2, 3)
+    ]
+
+    # If offset=1, relevant data starts one halfword after the kernel pointer.
+    assert _get_tensor_halfwords((48, 3, 3), 1, 1, 1) == [
+        None, (0, 0), (0, 1), (0, 2),
+        None, (1, 0), (1, 1), (1, 2),
+        None, (2, 0), (2, 1), (2, 2)
+    ]
+
+    # These adjustments can be (and often are) used together.
+    assert _get_tensor_halfwords((49, 3, 3), 1, 2, 2) == [
+        None, (0, 0), (0, 1), (0, 2), (0, 3), (0, 4),
+        (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), None,
+        None, (2, 0), (2, 1), (2, 2), (2, 3), (2, 4)
+    ]
+    # fmt: on
+
+
+def test_get_kernel_halfwords():
+    """Tests the _get_kernel_halfwords helper function in tensordot.py.
+
+    This function loads the logical indices of the data that will be stored in memory at the kernel
+    pointer. See the function docstring for more details.
+    """
+
+    # fmt: off
+    # Example of a kernel for a 3x3 depthwise convolution channel
+    assert _get_kernel_halfwords((96, 3, 3), 0) == [
+        (0, 0), (0, 1), (0, 2),
+        (1, 0), (1, 1), (1, 2),
+        (2, 0), (2, 1), (2, 2),
+        None,
+    ]
+
+    # Example of a kernel for a 1x1 regular convolution with 4 channels
+    assert _get_kernel_halfwords((48, 1, 4), 1) == [
+        None, (0, 0), (0, 1), (0, 2), (0, 3), None,
+    ]
+    # fmt: on
+
+
+def test_write_3x3_depthwise_code():
+    """This is the function that would be generated for a 1x4x48x48 NCHW input tensor with "SAME"
+    padding. We are only computing one sum at once, so we don't need stride or output. Note that
+    this is pretty inefficient - it would be much better to compute a few sums concurrently.
+
+    When inlined, this code compiles (with armv7-a clang 11) into:
+
+    tensordot_opt_x1_int16_w48_3x3_000(int*, int*, int*, int*, int*):
+        ldr.w   lr, [r3]
+        ldrd    r11, r4, [r1]
+        ldrd    r5, r9, [r1, #96]
+        ldrd    r10, r8, [r1, #192]
+        ldm.w   r2, {r1, r6, r7}
+        ldr.w   r12, [sp, #36]
+        smlad   r1, r11, r1, lr
+        smlabb  r1, r4, r6, r1
+        smlatb  r1, r6, r5, r1
+        ldrd    r3, r2, [r2, #12]
+        smlatb  r1, r5, r7, r1
+        smlatb  r1, r7, r9, r1
+        smlad   r1, r10, r3, r1
+        ldr.w   r3, [r12]
+        smlabb  r1, r8, r2, r1
+        smmul   r1, r3, r1
+        ssat    r1, #8, r1, asr #8
+        strh    r1, [r0]
+    """
+    _, code = tensordot_int16_impl(1, (48, 3, 3), (0, 0, 0), (1, 1))
+    assert code == textwrap.dedent(
+        """
+    #ifndef TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS
+    #define TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS
+    #include <arm_acle.h>
+    __attribute__((always_inline)) static inline int32_t tensordot_opt_x1_int16_w48_3x3_000(
+        int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale
+    ) {
+      int32_t sum_0 = *bias;
+
+      int32_t tensor__y00_x00__y00_x01 = tensor[0];
+      int32_t tensor__y00_x02__unknown = tensor[1];
+      int32_t tensor__y01_x00__y01_x01 = tensor[24];
+      int32_t tensor__y01_x02__unknown = tensor[25];
+      int32_t tensor__y02_x00__y02_x01 = tensor[48];
+      int32_t tensor__y02_x02__unknown = tensor[49];
+
+      int32_t kernel__y00_x00__y00_x01 = kernel[0];
+      int32_t kernel__y00_x02__y01_x00 = kernel[1];
+      int32_t kernel__y01_x01__y01_x02 = kernel[2];
+      int32_t kernel__y02_x00__y02_x01 = kernel[3];
+      int32_t kernel__y02_x02__unknown = kernel[4];
+
+      sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
+      sum_0 = __smlabb(tensor__y00_x02__unknown, kernel__y00_x02__y01_x00, sum_0);
+      sum_0 = __smlabt(tensor__y01_x00__y01_x01, kernel__y00_x02__y01_x00, sum_0);
+      sum_0 = __smlatb(tensor__y01_x00__y01_x01, kernel__y01_x01__y01_x02, sum_0);
+      sum_0 = __smlabt(tensor__y01_x02__unknown, kernel__y01_x01__y01_x02, sum_0);
+      sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
+      sum_0 = __smlabb(tensor__y02_x02__unknown, kernel__y02_x02__unknown, sum_0);
+
+      int32_t scale_val = *scale;
+      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32;
+      requant_0 = (requant_0 + 1) >> 1;
+      requant_0 = __ssat(requant_0 + -128, 8);
+
+      ((int16_t*) output)[0] = (int16_t) requant_0;
+      return 0;
+    }
+    #endif
+    """
+    )
+
+
+def test_odd_width_3x3_depthwise_strides_code():
+    """This is the function that would be generated for a 1x4x48x48 NCHW input tensor with "SAME"
+    padding and (2, 2) strides, being written into NHWC layout. The layout change is encoded by
+    out_stride = 4. This is a common use case seen in MobileNetV1, among others.
+
+    Note that despite the rows not being word-aligned, the *tensor pointer will always be word
+    aligned (satisfying this requirement) since y_stride = 2."""
+
+    _, code = tensordot_int16_impl(2, (49, 3, 3), (0, 0, 0), (2, 4))
+    assert code == textwrap.dedent(
+        """
+    #ifndef TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS
+    #define TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS
+    #include <arm_acle.h>
+    __attribute__((always_inline)) static inline int32_t tensordot_opt_x2_int16_w49_3x3_000_2_4(
+        int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale
+    ) {
+      int32_t sum_0 = *bias, sum_1 = *bias;
+
+      int32_t tensor__y00_x00__y00_x01 = tensor[0];
+      int32_t tensor__y00_x02__y00_x03 = tensor[1];
+      int32_t tensor__y00_x04__unknown = tensor[2];
+      int32_t tensor__unknown__y01_x00 = tensor[24];
+      int32_t tensor__y01_x01__y01_x02 = tensor[25];
+      int32_t tensor__y01_x03__y01_x04 = tensor[26];
+      int32_t tensor__y02_x00__y02_x01 = tensor[49];
+      int32_t tensor__y02_x02__y02_x03 = tensor[50];
+      int32_t tensor__y02_x04__unknown = tensor[51];
+
+      int32_t kernel__y00_x00__y00_x01 = kernel[0];
+      int32_t kernel__y00_x02__y01_x00 = kernel[1];
+      int32_t kernel__y01_x01__y01_x02 = kernel[2];
+      int32_t kernel__y02_x00__y02_x01 = kernel[3];
+      int32_t kernel__y02_x02__unknown = kernel[4];
+
+      sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
+      sum_0 = __smlabb(tensor__y00_x02__y00_x03, kernel__y00_x02__y01_x00, sum_0);
+      sum_0 = __smlatt(tensor__unknown__y01_x00, kernel__y00_x02__y01_x00, sum_0);
+      sum_0 = __smlad(tensor__y01_x01__y01_x02, kernel__y01_x01__y01_x02, sum_0);
+      sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
+      sum_0 = __smlabb(tensor__y02_x02__y02_x03, kernel__y02_x02__unknown, sum_0);
+      sum_1 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x00__y00_x01, sum_1);
+      sum_1 = __smlabb(tensor__y00_x04__unknown, kernel__y00_x02__y01_x00, sum_1);
+      sum_1 = __smlatt(tensor__y01_x01__y01_x02, kernel__y00_x02__y01_x00, sum_1);
+      sum_1 = __smlad(tensor__y01_x03__y01_x04, kernel__y01_x01__y01_x02, sum_1);
+      sum_1 = __smlad(tensor__y02_x02__y02_x03, kernel__y02_x00__y02_x01, sum_1);
+      sum_1 = __smlabb(tensor__y02_x04__unknown, kernel__y02_x02__unknown, sum_1);
+
+      int32_t scale_val = *scale;
+      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32;
+      requant_0 = (requant_0 + 1) >> 1;
+      requant_0 = __ssat(requant_0 + -128, 8);
+      int32_t requant_1 = (sum_1 * (int64_t) scale_val) >> 32;
+      requant_1 = (requant_1 + 1) >> 1;
+      requant_1 = __ssat(requant_1 + -128, 8);
+
+      ((int16_t*) output)[0] = (int16_t) requant_0;
+      ((int16_t*) output)[4] = (int16_t) requant_1;
+      return 0;
+    }
+    #endif
+    """
+    )
+
+
+def test_1x1x8_convolution_code():
+    """This is the function that would be generated for a 1x48x48x8 NHWC input tensor under
+    standard convolution with a 1x1 kernel. This is a common use case seen in MobileNetV1,
+    among others. In this scenario, a very high amount of memory re-use means that summing
+    four channels at once makes us faster."""
+
+    _, code = tensordot_int16_impl(4, (48 * 8, 1, 8), (0, 0, 0), (8, 1))
+    assert code == textwrap.dedent(
+        """
+    #ifndef TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS
+    #define TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS
+    #include <arm_acle.h>
+    __attribute__((always_inline)) static inline int32_t tensordot_opt_x4_int16_w384_1x8_000_8_1(
+        int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale
+    ) {
+      int32_t sum_0 = *bias, sum_1 = *bias, sum_2 = *bias, sum_3 = *bias;
+
+      int32_t tensor__y00_x00__y00_x01 = tensor[0];
+      int32_t tensor__y00_x02__y00_x03 = tensor[1];
+      int32_t tensor__y00_x04__y00_x05 = tensor[2];
+      int32_t tensor__y00_x06__y00_x07 = tensor[3];
+      int32_t tensor__y00_x08__y00_x09 = tensor[4];
+      int32_t tensor__y00_x0a__y00_x0b = tensor[5];
+      int32_t tensor__y00_x0c__y00_x0d = tensor[6];
+      int32_t tensor__y00_x0e__y00_x0f = tensor[7];
+      int32_t tensor__y00_x10__y00_x11 = tensor[8];
+      int32_t tensor__y00_x12__y00_x13 = tensor[9];
+      int32_t tensor__y00_x14__y00_x15 = tensor[10];
+      int32_t tensor__y00_x16__y00_x17 = tensor[11];
+      int32_t tensor__y00_x18__y00_x19 = tensor[12];
+      int32_t tensor__y00_x1a__y00_x1b = tensor[13];
+      int32_t tensor__y00_x1c__y00_x1d = tensor[14];
+      int32_t tensor__y00_x1e__y00_x1f = tensor[15];
+
+      int32_t kernel__y00_x00__y00_x01 = kernel[0];
+      int32_t kernel__y00_x02__y00_x03 = kernel[1];
+      int32_t kernel__y00_x04__y00_x05 = kernel[2];
+      int32_t kernel__y00_x06__y00_x07 = kernel[3];
+
+      sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
+      sum_0 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x02__y00_x03, sum_0);
+      sum_0 = __smlad(tensor__y00_x04__y00_x05, kernel__y00_x04__y00_x05, sum_0);
+      sum_0 = __smlad(tensor__y00_x06__y00_x07, kernel__y00_x06__y00_x07, sum_0);
+      sum_1 = __smlad(tensor__y00_x08__y00_x09, kernel__y00_x00__y00_x01, sum_1);
+      sum_1 = __smlad(tensor__y00_x0a__y00_x0b, kernel__y00_x02__y00_x03, sum_1);
+      sum_1 = __smlad(tensor__y00_x0c__y00_x0d, kernel__y00_x04__y00_x05, sum_1);
+      sum_1 = __smlad(tensor__y00_x0e__y00_x0f, kernel__y00_x06__y00_x07, sum_1);
+      sum_2 = __smlad(tensor__y00_x10__y00_x11, kernel__y00_x00__y00_x01, sum_2);
+      sum_2 = __smlad(tensor__y00_x12__y00_x13, kernel__y00_x02__y00_x03, sum_2);
+      sum_2 = __smlad(tensor__y00_x14__y00_x15, kernel__y00_x04__y00_x05, sum_2);
+      sum_2 = __smlad(tensor__y00_x16__y00_x17, kernel__y00_x06__y00_x07, sum_2);
+      sum_3 = __smlad(tensor__y00_x18__y00_x19, kernel__y00_x00__y00_x01, sum_3);
+      sum_3 = __smlad(tensor__y00_x1a__y00_x1b, kernel__y00_x02__y00_x03, sum_3);
+      sum_3 = __smlad(tensor__y00_x1c__y00_x1d, kernel__y00_x04__y00_x05, sum_3);
+      sum_3 = __smlad(tensor__y00_x1e__y00_x1f, kernel__y00_x06__y00_x07, sum_3);
+
+      int32_t scale_val = *scale;
+      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32;
+      requant_0 = (requant_0 + 1) >> 1;
+      requant_0 = __ssat(requant_0 + -128, 8);
+      int32_t requant_1 = (sum_1 * (int64_t) scale_val) >> 32;
+      requant_1 = (requant_1 + 1) >> 1;
+      requant_1 = __ssat(requant_1 + -128, 8);
+      int32_t requant_2 = (sum_2 * (int64_t) scale_val) >> 32;
+      requant_2 = (requant_2 + 1) >> 1;
+      requant_2 = __ssat(requant_2 + -128, 8);
+      int32_t requant_3 = (sum_3 * (int64_t) scale_val) >> 32;
+      requant_3 = (requant_3 + 1) >> 1;
+      requant_3 = __ssat(requant_3 + -128, 8);
+
+      int32_t packed_res_0 = requant_0 + (requant_1 << 16);
+      int32_t packed_res_1 = requant_2 + (requant_3 << 16);
+      output[0] = packed_res_0;
+      output[1] = packed_res_1;
+      return 0;
+    }
+    #endif
+    """
+    )
+
+
+def test_3x3x3_offset_convolution_code():
+    """This is the function that would be generated for a 1x96x96x3 NHWC input tensor under
+    standard convolution with a 3x3x3 kernel - the first layer of MobileNetV1. This is special, as
+    it means that every other kernel channel will not start on an even numbered halfword. We won't
+    have this issue for the input tensor, as we will always compute two positions at a time.
+
+    To solve this 'every other' issue, we will need two different version of this function to
+    alternate between. This alternation will be handled in TIR scheduling. Here, we just test the
+    version where the kernel is not word aligned.
+
+    Also tests the requantize_shift and output_zero_point keyword args. These might be needed for
+    some ResNet models (like image classification from MLPerf Tiny).
+    """
+
+    _, code = tensordot_int16_impl(
+        1,
+        (96 * 3, 3, 9),
+        (1, 1, 1),
+        (3, 1),
+        requantize_shift=40,
+        output_zero_point=4,
+    )
+    assert code == textwrap.dedent(
+        """
+    #ifndef TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS
+    #define TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS
+    #include <arm_acle.h>
+    __attribute__((always_inline)) static inline int32_t tensordot_opt_x1_int16_w288_3x9_111(
+        int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale
+    ) {
+      int32_t sum_0 = *bias;
+
+      int32_t tensor__unknown__y00_x00 = tensor[0];
+      int32_t tensor__y00_x01__y00_x02 = tensor[1];
+      int32_t tensor__y00_x03__y00_x04 = tensor[2];
+      int32_t tensor__y00_x05__y00_x06 = tensor[3];
+      int32_t tensor__y00_x07__y00_x08 = tensor[4];
+      int32_t tensor__unknown__y01_x00 = tensor[144];
+      int32_t tensor__y01_x01__y01_x02 = tensor[145];
+      int32_t tensor__y01_x03__y01_x04 = tensor[146];
+      int32_t tensor__y01_x05__y01_x06 = tensor[147];
+      int32_t tensor__y01_x07__y01_x08 = tensor[148];
+      int32_t tensor__unknown__y02_x00 = tensor[288];
+      int32_t tensor__y02_x01__y02_x02 = tensor[289];
+      int32_t tensor__y02_x03__y02_x04 = tensor[290];
+      int32_t tensor__y02_x05__y02_x06 = tensor[291];
+      int32_t tensor__y02_x07__y02_x08 = tensor[292];
+
+      int32_t kernel__unknown__y00_x00 = kernel[0];
+      int32_t kernel__y00_x01__y00_x02 = kernel[1];
+      int32_t kernel__y00_x03__y00_x04 = kernel[2];
+      int32_t kernel__y00_x05__y00_x06 = kernel[3];
+      int32_t kernel__y00_x07__y00_x08 = kernel[4];
+      int32_t kernel__y01_x00__y01_x01 = kernel[5];
+      int32_t kernel__y01_x02__y01_x03 = kernel[6];
+      int32_t kernel__y01_x04__y01_x05 = kernel[7];
+      int32_t kernel__y01_x06__y01_x07 = kernel[8];
+      int32_t kernel__y01_x08__y02_x00 = kernel[9];
+      int32_t kernel__y02_x01__y02_x02 = kernel[10];
+      int32_t kernel__y02_x03__y02_x04 = kernel[11];
+      int32_t kernel__y02_x05__y02_x06 = kernel[12];
+      int32_t kernel__y02_x07__y02_x08 = kernel[13];
+
+      sum_0 = __smlatt(tensor__unknown__y00_x00, kernel__unknown__y00_x00, sum_0);
+      sum_0 = __smlad(tensor__y00_x01__y00_x02, kernel__y00_x01__y00_x02, sum_0);
+      sum_0 = __smlad(tensor__y00_x03__y00_x04, kernel__y00_x03__y00_x04, sum_0);
+      sum_0 = __smlad(tensor__y00_x05__y00_x06, kernel__y00_x05__y00_x06, sum_0);
+      sum_0 = __smlad(tensor__y00_x07__y00_x08, kernel__y00_x07__y00_x08, sum_0);
+      sum_0 = __smlatb(tensor__unknown__y01_x00, kernel__y01_x00__y01_x01, sum_0);
+      sum_0 = __smlabt(tensor__y01_x01__y01_x02, kernel__y01_x00__y01_x01, sum_0);
+      sum_0 = __smlatb(tensor__y01_x01__y01_x02, kernel__y01_x02__y01_x03, sum_0);
+      sum_0 = __smlabt(tensor__y01_x03__y01_x04, kernel__y01_x02__y01_x03, sum_0);
+      sum_0 = __smlatb(tensor__y01_x03__y01_x04, kernel__y01_x04__y01_x05, sum_0);
+      sum_0 = __smlabt(tensor__y01_x05__y01_x06, kernel__y01_x04__y01_x05, sum_0);
+      sum_0 = __smlatb(tensor__y01_x05__y01_x06, kernel__y01_x06__y01_x07, sum_0);
+      sum_0 = __smlabt(tensor__y01_x07__y01_x08, kernel__y01_x06__y01_x07, sum_0);
+      sum_0 = __smlatb(tensor__y01_x07__y01_x08, kernel__y01_x08__y02_x00, sum_0);
+      sum_0 = __smlatt(tensor__unknown__y02_x00, kernel__y01_x08__y02_x00, sum_0);
+      sum_0 = __smlad(tensor__y02_x01__y02_x02, kernel__y02_x01__y02_x02, sum_0);
+      sum_0 = __smlad(tensor__y02_x03__y02_x04, kernel__y02_x03__y02_x04, sum_0);
+      sum_0 = __smlad(tensor__y02_x05__y02_x06, kernel__y02_x05__y02_x06, sum_0);
+      sum_0 = __smlad(tensor__y02_x07__y02_x08, kernel__y02_x07__y02_x08, sum_0);
+
+      int32_t scale_val = *scale;
+      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 39;
+      requant_0 = (requant_0 + 1) >> 1;
+      requant_0 = __ssat(requant_0 + 4, 8);
+
+      ((int16_t*) output)[1] = (int16_t) requant_0;
+      return 0;
+    }
+    #endif
+    """
+    )
diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py
index ce379b6b2cb3..cb24353539a4 100644
--- a/tests/scripts/request_hook/request_hook.py
+++ b/tests/scripts/request_hook/request_hook.py
@@ -145,6 +145,7 @@
     "https://github.com/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy": f"{BASE}/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy",
     "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy": f"{BASE}/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy",
     "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite": f"{BASE}/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite",
+    "https://github.com/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite": f"{BASE}/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite",
     "https://github.com/uwsampl/web-data/raw/main/vta/models/synset.txt": f"{BASE}/2022-10-05/synset.txt",
     "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_cora.torch": f"{BASE}/gcn_cora.torch",
     "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg": f"{BASE}/vta_cat.jpg",

From 8d04e1ea3cfc40942c685dd75f0d5fde421afc6e Mon Sep 17 00:00:00 2001
From: Gavin Uberti <guberti@users.noreply.github.com>
Date: Wed, 7 Dec 2022 02:12:26 +0900
Subject: [PATCH 700/704] [ci] Fix upload_ci_resource.yml and update related
 documentation (#13562)

The workflow https://github.com/apache/tvm/actions/workflows/upload_ci_resource.yml uses SHA-256, not SHA-512 hashes. This  PR fixes a comment in `request_hook.py` to reflect this, and fixes some other formatting too.

It also adds a trailing quote to `upload_ci_resource.yml' to prevent it from failing (e.g. https://github.com/apache/tvm/actions/runs/3629030778/jobs/6120748470).
---
 .github/workflows/upload_ci_resource.yml   | 2 +-
 tests/scripts/request_hook/request_hook.py | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/upload_ci_resource.yml b/.github/workflows/upload_ci_resource.yml
index 6d85c26c25b3..c0af1a5b1ceb 100644
--- a/.github/workflows/upload_ci_resource.yml
+++ b/.github/workflows/upload_ci_resource.yml
@@ -57,4 +57,4 @@ jobs:
           aws s3 cp downloaded_file "s3://tvm-ci-resources/$UPLOAD_PATH"
           echo "The item is available at https://tvm-ci-resources.s3.us-west-2.amazonaws.com/$UPLOAD_PATH"
           echo "Add this line to tests/scripts/request_hook/request_hook.py"
-          echo "    \"$URL\": f\"{BASE}/$UPLOAD_PATH\",
+          echo "    \"$URL\": f\"{BASE}/$UPLOAD_PATH\","
diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py
index cb24353539a4..4e3db220e0b4 100644
--- a/tests/scripts/request_hook/request_hook.py
+++ b/tests/scripts/request_hook/request_hook.py
@@ -25,7 +25,8 @@
 LOGGER = None
 
 
-# To update this list, run the workflow <HERE> with the URL to download and the SHA512 of the file
+# To update this list, run https://github.com/apache/tvm/actions/workflows/upload_ci_resource.yml
+# with the URL to download and the SHA-256 hash of the file.
 BASE = "https://tvm-ci-resources.s3.us-west-2.amazonaws.com"
 URL_MAP = {
     "http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec": f"{BASE}/mxnet-val_256_q90.rec",
@@ -218,8 +219,8 @@ def __init__(self, url, *args, **kwargs):
             # Dis-allow any accesses that aren't going through S3
             msg = (
                 f"Uncaught URL found in CI: {url}. "
-                "A committer must upload the relevant file to S3 via"
-                "https://github.com/apache/tvm/actions/workflows/upload_ci_resource.yml"
+                "A committer must upload the relevant file to S3 via "
+                "https://github.com/apache/tvm/actions/workflows/upload_ci_resource.yml "
                 "and add it to the mapping in tests/scripts/request_hook/request_hook.py"
             )
             raise RuntimeError(msg)

From a1d46645d27e376afaa2c141493854e0dced7621 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 7 Dec 2022 04:07:55 +0900
Subject: [PATCH 701/704] [TIR] Fix remaining dtype mismatch issue caused by
 SubspaceDivide (#13558)

---
 src/arith/iter_affine_map.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
index fa4f0fd5147b..adba61632fb2 100644
--- a/src/arith/iter_affine_map.cc
+++ b/src/arith/iter_affine_map.cc
@@ -1910,7 +1910,7 @@ class SubspaceDivider {
       return DivisionResult::Failure();
     }
     bool need_predicate = !analyzer_->CanProveEqual(extent, mark_extent);
-    const IterMark& outer_mark = MarkFromArgsAndBase(outer_args, 0);
+    const IterMark& outer_mark = MarkFromArgsAndBase(outer_args, make_const(dtype, 0));
     const IterMark& inner_mark = MarkFromArgsAndBase(inner_args, expr->base);
     IterSumExpr outer_source = Downcast<IterSumExpr>(outer_mark->source);
     IterSumExpr inner_source = Downcast<IterSumExpr>(inner_mark->source);

From 8826c1c386ca0bbba96cbd2ea3be235ab30411d4 Mon Sep 17 00:00:00 2001
From: Krishna Bindumadhavan <31140965+f2013519@users.noreply.github.com>
Date: Wed, 7 Dec 2022 00:45:06 +0530
Subject: [PATCH 702/704] Mark base64.h encode and decode API's as inline
 (#13556)

---
 src/contrib/torch/base64.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/contrib/torch/base64.h b/src/contrib/torch/base64.h
index 859fd1abcfd0..d7dac4b86cc8 100644
--- a/src/contrib/torch/base64.h
+++ b/src/contrib/torch/base64.h
@@ -36,7 +36,7 @@
 namespace tvm {
 namespace support {
 
-size_t b64strlen(const std::string b64str) {
+inline size_t b64strlen(const std::string b64str) {
   ICHECK(b64str.size() % 4 == 0) << "invalid base64 encoding";
   size_t length = b64str.size() / 4 * 3;
   if (b64str[b64str.size() - 2] == '=') {
@@ -47,7 +47,7 @@ size_t b64strlen(const std::string b64str) {
   return length;
 }
 
-void b64decode(const std::string b64str, u_char* ret) {
+inline void b64decode(const std::string b64str, u_char* ret) {
   size_t index = 0;
   const auto length = b64str.size();
   for (size_t i = 0; i < length; i += 4) {

From eb2781f7c900f437d86d562fd0a53a887f02aa90 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 7 Dec 2022 04:46:37 +0900
Subject: [PATCH 703/704] [MetaSchedule] Make `MultiLevelTiling` apply
 condition customizable (#13535)

Currently, the condition for applying MultiLevelTiling is given by NeedsMultiLevelTiling function and only multi-argument reduction operations like conv2d or dense are selected. This PR changes this condition to be customizable for use cases where we might want to apply MultiLevelTiling to more ops (pooling etc).

* define max pool on blocked layout

* introduce filter_fn to MultiLevelTiling
---
 include/tvm/meta_schedule/schedule_rule.h     |   7 +-
 .../schedule_rule/multi_level_tiling.py       |  10 +-
 .../schedule_rule/multi_level_tiling.cc       |  21 ++-
 .../schedule_rule/multi_level_tiling.h        |   2 +
 .../test_meta_schedule_schedule_rule_mlt.py   | 158 +++++++++++++++++-
 5 files changed, 181 insertions(+), 17 deletions(-)

diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h
index a3d6c7ef68bf..879dd076a8b5 100644
--- a/include/tvm/meta_schedule/schedule_rule.h
+++ b/include/tvm/meta_schedule/schedule_rule.h
@@ -148,6 +148,10 @@ class ScheduleRule : public runtime::ObjectRef {
    * NullOpt means disable vectorization
    * \param reuse_read Data reuse configuration for reading. NullOpt means no reuse.
    * \param reuse_write Data reuse configuration for writing. NullOpt means no reuse.
+   * \param filter_fn A function that can be passed to overwrite the default condition for applying
+   * MultiLevelTiling to a block. Its signature must be (Schedule, BlockRV) -> bool.
+   * This is useful if there is a need to apply MultiLevelTiling to an operation / block which is
+   * ignored  by default. This function should return True for a block that should be tiled.
    * \return The schedule rule created
    */
   TVM_DLL static ScheduleRule MultiLevelTiling(String structure,                             //
@@ -155,7 +159,8 @@ class ScheduleRule : public runtime::ObjectRef {
                                                Optional<Integer> max_innermost_factor,       //
                                                Optional<Array<Integer>> vector_load_lens,    //
                                                Optional<Map<String, ObjectRef>> reuse_read,  //
-                                               Optional<Map<String, ObjectRef>> reuse_write);
+                                               Optional<Map<String, ObjectRef>> reuse_write,
+                                               Optional<runtime::PackedFunc> filter_fn = NullOpt);
 
   /*!
    * \brief Extension of MultiLevelTiling for auto-tensorization with a single intrinsic.
diff --git a/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py b/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
index e91382dd017a..19651a2ce18e 100644
--- a/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
+++ b/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
@@ -15,8 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 """Multi-level tiling with reuse."""
-from typing import Any, Dict, List, Mapping, NamedTuple, Optional
+from typing import Any, Dict, List, Mapping, NamedTuple, Optional, Callable
 
+from tvm.tir.schedule import Schedule, BlockRV
 from tvm._ffi import register_object
 
 from .. import _ffi_api
@@ -62,6 +63,11 @@ class MultiLevelTiling(ScheduleRule):
         Data reuse configuration for reading. None means no reuse.
     reuse_write : Optional[ReuseType]
         Data reuse configuration for writing. None means no reuse.
+    filter_fn: Optional[Callable[[Schedule, BlockRV], bool]]
+        A function that can be passed to overwrite the default condition for applying
+        MultiLevelTiling to a block. This is useful if there is a need to apply MultiLevelTiling
+        to an operation / block which is ignored by default. This function should return True
+        for a block that should be tiled (based on the block name, for example).
     """
 
     def __init__(
@@ -72,6 +78,7 @@ def __init__(
         vector_load_lens: Optional[List[int]] = None,
         reuse_read: Optional[ReuseType] = None,
         reuse_write: Optional[ReuseType] = None,
+        filter_fn: Optional[Callable[[Schedule, BlockRV], bool]] = None,
     ) -> None:
         self.__init_handle_by_constructor__(
             _ffi_api.ScheduleRuleMultiLevelTiling,  # type: ignore # pylint: disable=no-member
@@ -81,6 +88,7 @@ def __init__(
             vector_load_lens,
             reuse_read.as_dict() if reuse_read is not None else None,
             reuse_write.as_dict() if reuse_write is not None else None,
+            filter_fn,
         )
 
 
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
index fe24357fcad5..324eedafb98a 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -92,16 +92,17 @@ void MultiLevelTilingNode::InitializeWithTuneContext(const TuneContext& context)
 
 // Entry of the mega rule; Inherited from ScheduleRuleNode
 Array<Schedule> MultiLevelTilingNode::Apply(const Schedule& sch, const BlockRV& block_rv) {
-  if (!NeedsMultiLevelTiling(sch->state(), sch->GetSRef(block_rv))) {
-    return {sch};
-  }
-  sch->Annotate(block_rv, tir::attr::meta_schedule_tiling_structure, structure);
+  if ((filter_fn_ && filter_fn_.value()(sch, sch->GetSRef(block_rv))) ||
+      NeedsMultiLevelTiling(sch->state(), sch->GetSRef(block_rv))) {
+    sch->Annotate(block_rv, tir::attr::meta_schedule_tiling_structure, structure);
 
-  Array<Schedule> results;
-  for (auto&& state : ApplySubRules({State(sch, block_rv)})) {
-    results.push_back(std::move(state->sch));
+    Array<Schedule> results;
+    for (auto&& state : ApplySubRules({State(sch, block_rv)})) {
+      results.push_back(std::move(state->sch));
+    }
+    return results;
   }
-  return results;
+  return {sch};
 }
 
 // Inherited from ScheduleRuleNode
@@ -320,9 +321,11 @@ ScheduleRule ScheduleRule::MultiLevelTiling(String structure, Optional<Array<Str
                                             Optional<Integer> max_innermost_factor,
                                             Optional<Array<Integer>> vector_load_lens,
                                             Optional<Map<String, ObjectRef>> reuse_read,
-                                            Optional<Map<String, ObjectRef>> reuse_write) {
+                                            Optional<Map<String, ObjectRef>> reuse_write,
+                                            Optional<runtime::PackedFunc> filter_fn) {
   auto node = MultiLevelTilingInitCommon<MultiLevelTilingNode>(
       structure, tile_binds, max_innermost_factor, vector_load_lens, reuse_read, reuse_write);
+  node->filter_fn_ = filter_fn;
   return ScheduleRule(node);
 }
 
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.h b/src/meta_schedule/schedule_rule/multi_level_tiling.h
index 98b4634af106..d8725a3060b1 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.h
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.h
@@ -194,6 +194,8 @@ class MultiLevelTilingNode : public ScheduleRuleNode {
   int max_threads_per_block_;
   /*! \brief The logging function */
   PackedFunc logger;
+  /*! \brief The function to overwrite the default condition for applying MultiLevelTiling. */
+  Optional<PackedFunc> filter_fn_;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     v->Visit("structure", &structure);
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
index 2c5a44d7a29f..6d4dcd996475 100644
--- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
+import tvm.testing
 from tvm import meta_schedule as ms
 from tvm import target, te
 from tvm.meta_schedule.testing import te_workload
@@ -646,10 +647,155 @@ def test_cache_read_specify_consumer():
     assert residual_block in space[0].mod.script()
 
 
+def test_max_pool_blocked():
+    # fmt off
+    @T.prim_func
+    def pool_blocked_cache_read_write(
+        X: T.Buffer[(1, 2, 8, 8, 8, 8, 32), "uint8"],
+        pool: T.Buffer[(1, 2, 4, 4, 8, 8, 32), "uint8"],
+    ):
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        pool_global = T.alloc_buffer([1, 2, 4, 4, 8, 8, 32], dtype="uint8")
+        X_global = T.alloc_buffer([1, 2, 8, 8, 8, 8, 32], dtype="uint8")
+        for b_0, c_o_0, h_o_0, w_o_0, h_i_0, w_i_0, c_i_0 in T.grid(1, 2, 4, 1, 8, 1, 4):
+            for ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused in T.serial(896):
+                with T.block("X_global"):
+                    v0 = T.axis.spatial(1, 0)
+                    v1 = T.axis.spatial(2, c_o_0)
+                    v2 = T.axis.spatial(8, h_o_0 * 2)
+                    v3 = T.axis.spatial(8, ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused // 128)
+                    v4 = T.axis.spatial(
+                        8, h_i_0 % 4 * 2 + ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % 128 // 64
+                    )
+                    v5 = T.axis.spatial(8, ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % 64 // 8)
+                    v6 = T.axis.spatial(32, c_i_0 * 8 + ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % 8)
+                    T.reads(X[v0, v1, v2, v3, v4, v5, v6])
+                    T.writes(X_global[v0, v1, v2, v3, v4, v5, v6])
+                    X_global[v0, v1, v2, v3, v4, v5, v6] = X[v0, v1, v2, v3, v4, v5, v6]
+            for wh, ww, b_1, c_o_1, h_o_1, w_o_1, h_i_1, w_i_1, c_i_1 in T.grid(
+                2, 2, 1, 1, 1, 4, 1, 8, 8
+            ):
+                with T.block("pool"):
+                    v_b = T.axis.spatial(1, b_1 + b_0)
+                    v_c_o = T.axis.spatial(2, c_o_0 + c_o_1)
+                    v_h_o = T.axis.spatial(4, h_o_1 + h_o_0)
+                    v_w_o = T.axis.spatial(4, w_o_0 * 4 + w_o_1)
+                    v_h_i = T.axis.spatial(8, h_i_1 + h_i_0)
+                    v_w_i = T.axis.spatial(8, w_i_0 * 8 + w_i_1)
+                    v_c_i = T.axis.spatial(32, c_i_0 * 8 + c_i_1)
+                    v_wh, v_ww = T.axis.remap("RR", [wh, ww])
+                    T.reads(
+                        X_global[
+                            v_b,
+                            v_c_o,
+                            v_h_i // 8 * 2 + v_h_o * 2,
+                            v_w_i // 8 * 2 + v_w_o * 2,
+                            v_h_i % 4 * 2 + v_wh,
+                            v_w_i % 4 * 2 + v_ww,
+                            v_c_i,
+                        ]
+                    )
+                    T.writes(pool_global[v_b, v_c_o, v_h_o, v_w_o, v_h_i, v_w_i, v_c_i])
+                    T.block_attr({"meta_schedule.tiling_structure": "SRS"})
+                    with T.init():
+                        pool_global[v_b, v_c_o, v_h_o, v_w_o, v_h_i, v_w_i, v_c_i] = T.uint8(0)
+                    pool_global[v_b, v_c_o, v_h_o, v_w_o, v_h_i, v_w_i, v_c_i] = T.max(
+                        pool_global[v_b, v_c_o, v_h_o, v_w_o, v_h_i, v_w_i, v_c_i],
+                        X_global[
+                            v_b,
+                            v_c_o,
+                            v_h_i // 8 * 2 + v_h_o * 2,
+                            v_w_i // 8 * 2 + v_w_o * 2,
+                            v_h_i % 4 * 2 + v_wh,
+                            v_w_i % 4 * 2 + v_ww,
+                            v_c_i,
+                        ],
+                    )
+            for ax0, ax1, ax2, ax3, ax4, ax5, ax6 in T.grid(1, 1, 1, 4, 1, 8, 8):
+                with T.block("pool_global"):
+                    v0 = T.axis.spatial(1, ax0)
+                    v1 = T.axis.spatial(2, c_o_0 + ax1)
+                    v2 = T.axis.spatial(4, h_o_0 + ax2)
+                    v3 = T.axis.spatial(4, ax3)
+                    v4 = T.axis.spatial(8, h_i_0 + ax4)
+                    v5 = T.axis.spatial(8, ax5)
+                    v6 = T.axis.spatial(32, c_i_0 * 8 + ax6)
+                    T.reads(pool_global[v0, v1, v2, v3, v4, v5, v6])
+                    T.writes(pool[v0, v1, v2, v3, v4, v5, v6])
+                    pool[v0, v1, v2, v3, v4, v5, v6] = pool_global[v0, v1, v2, v3, v4, v5, v6]
+
+    # fmt on
+
+    def max_pool_blocked_compute(height, width, channel):
+        ishape = (1, channel // 32, height // 8, width // 8, 8, 8, 32)
+        oshape = (1, channel // 32, height // 8 // 2, width // 8 // 2, 8, 8, 32)
+        X = te.placeholder(ishape, name="X", dtype="uint8")
+
+        window_h = te.reduce_axis((0, 2), name="wh")
+        window_w = te.reduce_axis((0, 2), name="ww")
+
+        out = te.compute(
+            oshape,
+            lambda b, c_o, h_o, w_o, h_i, w_i, c_i: te.max(
+                X[
+                    b,
+                    c_o,
+                    (h_o * 8 + h_i) // 8 * 2,
+                    (w_o * 8 + w_i) // 8 * 2,
+                    (h_o * 8 + h_i) % 4 * 2 + window_h,
+                    (w_o * 8 + w_i) % 4 * 2 + window_w,
+                    c_i,
+                ],
+                axis=[window_h, window_w],
+            ),
+            name="pool",
+        )
+        return [X, out]
+
+    height = width = 64
+    channel = 64
+
+    mod = te.create_prim_func(max_pool_blocked_compute(height, width, channel))
+
+    actual = generate_design_space(
+        kind="llvm",
+        mod=mod,
+        target=Target("llvm"),
+        types=None,
+        sch_rules=[
+            ms.schedule_rule.MultiLevelTiling(
+                structure="SRS",
+                tile_binds=None,
+                max_innermost_factor=64,
+                vector_load_lens=None,
+                reuse_read=ms.schedule_rule.ReuseType(
+                    req="must",
+                    levels=[1],
+                    scope="global",
+                ),
+                reuse_write=ms.schedule_rule.ReuseType(req="must", levels=[1], scope="global"),
+                filter_fn=lambda sch, block_rv: sch.get(block_rv).name_hint == "pool",
+            )
+        ],
+    )
+
+    decision = [
+        ("SamplePerfectTile", [1, 1]),
+        ("SamplePerfectTile", [2, 1]),
+        ("SamplePerfectTile", [4, 1]),
+        ("SamplePerfectTile", [1, 4]),
+        ("SamplePerfectTile", [8, 1]),
+        ("SamplePerfectTile", [1, 8]),
+        ("SamplePerfectTile", [4, 8]),
+    ]
+
+    check_sketches(
+        mod,
+        sketches=actual,
+        expected_mods=[pool_blocked_cache_read_write],
+        expected_decisions=[decision],
+    )
+
+
 if __name__ == "__main__":
-    test_cpu_matmul()
-    test_cpu_matmul_relu()
-    test_cuda_matmul()
-    test_cuda_matmul_relu()
-    test_cuda_sum_with_trivial_block_iter()
-    test_multi_level_tiling_hexagon()
+    tvm.testing.main()

From 9374738b29c986e863d331ffe29e1b36828a8565 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 6 Dec 2022 13:52:31 -0800
Subject: [PATCH 704/704] [ci] Split Jenkinsfile into platform-specific jobs
 (#13300)

This breaks up the Jenkinsfile into ones for GPU, CPU, etc. This removes a false dependency between the build and test steps (e.g. before the GPU tests had to wait on the Hexagon build to complete) and makes the Jenkins UI a bit better since there's not 30 tests to scroll through to find a failure. An example can be found in my fork here: https://github.com/driazati/tvm/pull/38 in the checks box. Before this is merged https://github.com/tlc-pack/ci/blob/main/jenkins/jenkins-jobs/prod/tvm.yaml will need to be updated to accept webhooks from apache/tvm instead of my fork.

See #13337 for more context
---
 .gitattributes                                |    1 +
 ci/jenkins/Build.groovy.j2                    |  236 ---
 ci/jenkins/Deploy.groovy.j2                   |  174 ---
 ci/jenkins/DockerBuild.groovy.j2              |  118 --
 ci/jenkins/Lint.groovy.j2                     |   19 -
 ci/jenkins/Makefile                           |   27 -
 ci/jenkins/Test.groovy.j2                     |  344 -----
 ci/jenkins/data.py                            |  122 ++
 ci/jenkins/generate.py                        |  146 +-
 ci/jenkins/generated/arm_jenkinsfile.groovy   |  944 ++++++++++++
 .../generated/cortexm_jenkinsfile.groovy      | 1124 ++++++++++++++
 ci/jenkins/generated/cpu_jenkinsfile.groovy   |  842 +++++++++++
 .../generated/docker_jenkinsfile.groovy       |  960 ++++++++++++
 ci/jenkins/generated/gpu_jenkinsfile.groovy   | 1294 +++++++++++++++++
 .../generated/hexagon_jenkinsfile.groovy      |  931 ++++++++++++
 ci/jenkins/generated/i386_jenkinsfile.groovy  |  693 +++++++++
 ci/jenkins/generated/lint_jenkinsfile.groovy  |  545 +++++++
 .../generated/minimal_jenkinsfile.groovy      |  589 ++++++++
 ci/jenkins/generated/riscv_jenkinsfile.groovy |  594 ++++++++
 ci/jenkins/generated/wasm_jenkinsfile.groovy  |  530 +++++++
 ci/jenkins/requirements.txt                   |    1 -
 .../templates/arm_jenkinsfile.groovy.j2       |   99 ++
 .../templates/cortexm_jenkinsfile.groovy.j2   |   64 +
 .../templates/cpu_jenkinsfile.groovy.j2       |   97 ++
 .../templates/docker_jenkinsfile.groovy.j2    |  239 +++
 .../templates/gpu_jenkinsfile.groovy.j2       |  206 +++
 .../templates/hexagon_jenkinsfile.groovy.j2   |   63 +
 .../templates/i386_jenkinsfile.groovy.j2      |   65 +
 .../templates/lint_jenkinsfile.groovy.j2      |   58 +
 .../templates/minimal_jenkinsfile.groovy.j2   |   55 +
 .../templates/riscv_jenkinsfile.groovy.j2     |   62 +
 ci/jenkins/templates/utils/Build.groovy.j2    |   57 +
 .../{ => templates/utils}/Prepare.groovy.j2   |    3 +-
 ci/jenkins/templates/utils/Test.groovy.j2     |   13 +
 .../utils/base.groovy.j2}                     |   63 +-
 ci/jenkins/{ => templates/utils}/macros.j2    |  105 +-
 .../templates/wasm_jenkinsfile.groovy.j2      |   40 +
 ci/scripts/jenkins/open_docker_update_pr.py   |   56 +-
 docker/dev_common.sh                          |   19 +-
 tests/lint/check_file_type.py                 |    2 +
 tests/lint/rat-excludes                       |    9 +-
 tests/python/ci/test_ci.py                    |    4 +-
 42 files changed, 10428 insertions(+), 1185 deletions(-)
 delete mode 100644 ci/jenkins/Build.groovy.j2
 delete mode 100644 ci/jenkins/Deploy.groovy.j2
 delete mode 100644 ci/jenkins/DockerBuild.groovy.j2
 delete mode 100644 ci/jenkins/Lint.groovy.j2
 delete mode 100644 ci/jenkins/Makefile
 delete mode 100644 ci/jenkins/Test.groovy.j2
 create mode 100644 ci/jenkins/data.py
 create mode 100644 ci/jenkins/generated/arm_jenkinsfile.groovy
 create mode 100644 ci/jenkins/generated/cortexm_jenkinsfile.groovy
 create mode 100644 ci/jenkins/generated/cpu_jenkinsfile.groovy
 create mode 100644 ci/jenkins/generated/docker_jenkinsfile.groovy
 create mode 100644 ci/jenkins/generated/gpu_jenkinsfile.groovy
 create mode 100644 ci/jenkins/generated/hexagon_jenkinsfile.groovy
 create mode 100644 ci/jenkins/generated/i386_jenkinsfile.groovy
 create mode 100644 ci/jenkins/generated/lint_jenkinsfile.groovy
 create mode 100644 ci/jenkins/generated/minimal_jenkinsfile.groovy
 create mode 100644 ci/jenkins/generated/riscv_jenkinsfile.groovy
 create mode 100644 ci/jenkins/generated/wasm_jenkinsfile.groovy
 delete mode 100644 ci/jenkins/requirements.txt
 create mode 100644 ci/jenkins/templates/arm_jenkinsfile.groovy.j2
 create mode 100644 ci/jenkins/templates/cortexm_jenkinsfile.groovy.j2
 create mode 100644 ci/jenkins/templates/cpu_jenkinsfile.groovy.j2
 create mode 100644 ci/jenkins/templates/docker_jenkinsfile.groovy.j2
 create mode 100644 ci/jenkins/templates/gpu_jenkinsfile.groovy.j2
 create mode 100644 ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2
 create mode 100644 ci/jenkins/templates/i386_jenkinsfile.groovy.j2
 create mode 100644 ci/jenkins/templates/lint_jenkinsfile.groovy.j2
 create mode 100644 ci/jenkins/templates/minimal_jenkinsfile.groovy.j2
 create mode 100644 ci/jenkins/templates/riscv_jenkinsfile.groovy.j2
 create mode 100644 ci/jenkins/templates/utils/Build.groovy.j2
 rename ci/jenkins/{ => templates/utils}/Prepare.groovy.j2 (98%)
 create mode 100644 ci/jenkins/templates/utils/Test.groovy.j2
 rename ci/jenkins/{Jenkinsfile.j2 => templates/utils/base.groovy.j2} (63%)
 rename ci/jenkins/{ => templates/utils}/macros.j2 (58%)
 create mode 100644 ci/jenkins/templates/wasm_jenkinsfile.groovy.j2

diff --git a/.gitattributes b/.gitattributes
index 1c7a460675f8..d82bd5436b21 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,2 @@
 Jenkinsfile linguist-generated=true
+ci/jenkins/generated/* linguist-generated=true
diff --git a/ci/jenkins/Build.groovy.j2 b/ci/jenkins/Build.groovy.j2
deleted file mode 100644
index 7592079ef8d1..000000000000
--- a/ci/jenkins/Build.groovy.j2
+++ /dev/null
@@ -1,236 +0,0 @@
-def ci_setup(image) {
-  sh (
-    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
-    label: 'Clean up old workspace',
-  )
-}
-
-def python_unittest(image) {
-  sh (
-    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
-    label: 'Run Python unit tests',
-  )
-}
-
-def fsim_test(image) {
-  sh (
-    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
-    label: 'Run VTA tests in FSIM',
-  )
-}
-
-def make_standalone_crt(image, build_dir) {
-  sh (
-    script: """
-      set -eux
-      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
-        --sccache-bucket tvm-sccache-prod \
-        --cmake-target standalone_crt \
-        --build-dir build
-      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
-        --sccache-bucket tvm-sccache-prod \
-        --cmake-target crttest \
-        --build-dir build
-      """,
-    label: 'Make standalone CRT',
-  )
-}
-
-def make_cpp_tests(image, build_dir) {
-  sh (
-    script: """
-      set -eux
-      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
-        --sccache-bucket tvm-sccache-prod \
-        --cmake-target cpptest \
-        --build-dir ${build_dir}
-      """,
-    label: 'Make C++ tests',
-  )
-}
-
-def cmake_build(image, path, make_flag) {
-  sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
-    label: 'Run cmake build',
-  )
-}
-
-def build() {
-stage('Build') {
-  environment {
-    SKIP_SLOW_TESTS = "${skip_slow_tests}"
-  }
-  parallel(
-
-  {% call m.build_step(
-      name='BUILD: GPU',
-      node='CPU-SMALL',
-      condition='!skip_ci',
-      ws='tvm/build-gpu',
-      docker_image='ci_gpu',
-    ) %}
-    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
-    cmake_build("${ci_gpu} --no-gpu", 'build', '-j2')
-    make_standalone_crt("${ci_gpu} --no-gpu", 'build')
-    {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib + tvm_allvisible + microtvm_template_projects + crttest + standalone_crt) }}
-
-    // compiler test
-    sh "rm -rf build"
-    sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
-    cmake_build("${ci_gpu} --no-gpu", 'build', '-j2')
-    make_standalone_crt("${ci_gpu} --no-gpu", 'build')
-    {{ m.upload_artifacts(tag='gpu2', filenames=tvm_lib + crttest + standalone_crt) }}
-  {% endcall %}
-
-  {% call m.build_step(
-      name='BUILD: CPU',
-      node='CPU-SMALL',
-      condition='!skip_ci && is_docs_only_build != 1',
-      ws='tvm/build-cpu',
-      docker_image='ci_cpu',
-    ) %}
-    sh (
-      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
-      label: 'Create CPU cmake config',
-    )
-    cmake_build(ci_cpu, 'build', '-j2')
-    make_standalone_crt(ci_cpu, 'build')
-    make_cpp_tests(ci_cpu, 'build')
-    {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim + tvm_allvisible + crttest + cpptest + standalone_crt) }}
-    ci_setup(ci_cpu)
-    // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
-    // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch
-    sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test')
-  {% endcall %}
-
-  {% call m.build_step(
-      name='BUILD: CPU MINIMAL',
-      node='CPU-SMALL',
-      condition='!skip_ci && is_docs_only_build != 1',
-      ws='tvm/build-cpu-minimal',
-      docker_image='ci_minimal',
-    ) %}
-    sh (
-      script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
-      label: 'Create CPU minimal cmake config',
-    )
-    cmake_build(ci_minimal, 'build', '-j2')
-    make_standalone_crt(ci_minimal, 'build')
-    make_cpp_tests(ci_minimal, 'build')
-    {{ m.upload_artifacts(tag='cpu-minimal', filenames=tvm_lib + tvm_allvisible + crttest + cpptest + standalone_crt) }}
-  {% endcall %}
-
-  {% call m.build_step(
-      name='BUILD: WASM',
-      node='CPU-SMALL',
-      condition='!skip_ci && is_docs_only_build != 1',
-      ws='tvm/build-wasm',
-      docker_image='ci_wasm',
-    ) %}
-    sh (
-      script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
-      label: 'Create WASM cmake config',
-    )
-    cmake_build(ci_wasm, 'build', '-j2')
-    make_standalone_crt(ci_wasm, 'build')
-    make_cpp_tests(ci_wasm, 'build')
-    cpp_unittest(ci_wasm)
-    ci_setup(ci_wasm)
-    sh (
-      script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh",
-      label: 'Run WASM lint and tests',
-    )
-  {% endcall %}
-
-  {% call m.build_step(
-      name='BUILD: i386',
-      node='CPU-SMALL',
-      condition='!skip_ci && is_docs_only_build != 1',
-      ws='tvm/build-i386',
-      docker_image='ci_i386',
-    ) %}
-    sh (
-      script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
-      label: 'Create i386 cmake config',
-    )
-    cmake_build(ci_i386, 'build', '-j2')
-    make_standalone_crt(ci_i386, 'build')
-    make_cpp_tests(ci_i386, 'build')
-    {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim + standalone_crt + crttest + cpptest) }}
-  {% endcall %}
-
-  {% call m.build_step(
-      name='BUILD: arm',
-      node='ARM-SMALL',
-      condition='!skip_ci && is_docs_only_build != 1',
-      ws='tvm/build-arm',
-      docker_image='ci_arm',
-    ) %}
-    sh (
-      script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
-      label: 'Create ARM cmake config',
-    )
-    cmake_build(ci_arm, 'build', '-j4')
-    make_standalone_crt(ci_arm, 'build')
-    make_cpp_tests(ci_arm, 'build')
-    {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib + cpptest + crttest + standalone_crt) }}
-  {% endcall %}
-
-  {% call m.build_step(
-      name='BUILD: Cortex-M',
-      node='CPU-SMALL',
-      condition='!skip_ci && is_docs_only_build != 1',
-      ws='tvm/build-cortexm',
-      docker_image='ci_cortexm',
-    ) %}
-    sh (
-      script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
-      label: 'Create Cortex-M cmake config',
-    )
-    cmake_build(ci_cortexm, 'build', '-j2')
-    make_standalone_crt(ci_cortexm, 'build')
-    make_cpp_tests(ci_cortexm, 'build')
-    {{ m.upload_artifacts(tag='cortexm', filenames=tvm_lib + tvm_allvisible + crttest + standalone_crt + cpptest + microtvm_template_projects) }}
-  {% endcall %}
-
-  {% call m.build_step(
-      name='BUILD: Hexagon',
-      node='CPU-SMALL',
-      condition='!skip_ci && is_docs_only_build != 1',
-      ws='tvm/build-hexagon',
-      docker_image='ci_hexagon',
-    ) %}
-    sh (
-      script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
-      label: 'Create Hexagon cmake config',
-    )
-    cmake_build(ci_hexagon, 'build', '-j2')
-    make_cpp_tests(ci_hexagon, 'build')
-    sh (
-      script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
-      label: 'Build Hexagon API',
-    )
-    {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib + cpptest + hexagon_api) }}
-  {% endcall %}
-
-  {% call m.build_step(
-      name='BUILD: RISC-V',
-      node='CPU-SMALL',
-      condition='!skip_ci && is_docs_only_build != 1',
-      ws='tvm/build-riscv',
-      docker_image='ci_riscv',
-    ) %}
-    sh (
-      script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
-      label: 'Create RISC-V cmake config',
-    )
-    cmake_build(ci_riscv, 'build', '-j2')
-    make_standalone_crt(ci_riscv, 'build')
-    make_cpp_tests(ci_riscv, 'build')
-    {{ m.upload_artifacts(tag='riscv', filenames=tvm_lib + tvm_allvisible + standalone_crt + crttest + cpptest + microtvm_template_projects) }}
-  {% endcall %}
-
-  )
-}
-}
diff --git a/ci/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2
deleted file mode 100644
index 5cfffc7caef3..000000000000
--- a/ci/jenkins/Deploy.groovy.j2
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
-stage('Build packages') {
-  parallel 'conda CPU': {
-    node('CPU') {
-      sh "${docker_run} tlcpack/conda-cpu ./conda/build_cpu.sh
-    }
-  },
-  'conda cuda': {
-    node('CPU') {
-      sh "${docker_run} tlcpack/conda-cuda90 ./conda/build_cuda.sh
-      sh "${docker_run} tlcpack/conda-cuda100 ./conda/build_cuda.sh
-    }
-  }
-// Here we could upload the packages to anaconda for releases
-// and/or the main branch
-}
-*/
-
-
-def update_docker(ecr_image, hub_image) {
-  if (ecr_image == null) {
-    sh("image was not rebuilt, skipping")
-    return
-  }
-  if (!ecr_image.contains("amazonaws.com")) {
-    sh("echo \"Skipping '${ecr_image}' -> '${hub_image}' since it doesn\'t look like an ECR image\"")
-    return
-  }
-  docker_init(ecr_image)
-  sh(
-    script: """
-    set -eux
-    . ${jenkins_scripts_root}/retry.sh
-    docker tag \
-      ${ecr_image} \
-      ${hub_image}
-    retry 5 docker push ${hub_image}
-    """,
-    label: "Update ${hub_image} on Docker Hub",
-  )
-}
-
-def deploy_docs() {
-  // Note: This code must stay in the Jenkinsfile to ensure that it runs
-  // from a trusted context only
-  sh(
-    script: '''
-      set -eux
-      rm -rf tvm-site
-      git clone -b $DOCS_DEPLOY_BRANCH --depth=1 https://github.com/apache/tvm-site
-      cd tvm-site
-      git status
-      git checkout -B $DOCS_DEPLOY_BRANCH
-
-      git ls-tree HEAD docs/ --name-only | grep -vP '^docs/v\\d' | xargs rm -rf
-      mkdir -p docs
-      tar xf ../docs.tgz -C docs
-      COMMIT=$(cat docs/commit_hash)
-      git add .
-      git config user.name tvm-bot
-      git config user.email 95660001+tvm-bot@users.noreply.github.com
-      git commit -m"deploying docs (apache/tvm@$COMMIT)"
-      git status
-    ''',
-    label: 'Unpack docs and update tvm-site'
-  )
-
-  withCredentials([string(
-    credentialsId: 'docs-push-token',
-    variable: 'GITHUB_TOKEN',
-    )]) {
-    sh(
-      script: '''
-        cd tvm-site
-        git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git
-        git push deploy $DOCS_DEPLOY_BRANCH || true
-      ''',
-      label: 'Upload docs to apache/tvm-site'
-    )
-  }
-}
-
-
-def deploy() {
-  stage('Deploy') {
-    if (env.BRANCH_NAME == 'main') {
-      parallel(
-        {% call m.deploy_step(
-          name="Deploy Docs",
-          feature_flag="env.DOCS_DEPLOY_ENABLED == 'yes'",
-          ws="tvm/deploy-docs",
-        ) %}
-          init_git()
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/docs --items docs.tgz",
-            label: 'Download docs folder from S3',
-          )
-          deploy_docs()
-        {% endcall %}
-        {% call m.deploy_step(
-          name="Upload built Docker images",
-          feature_flag="env.DEPLOY_DOCKER_IMAGES == 'yes' && rebuild_docker_images && upstream_revision != null",
-          ws="tvm/deploy-docker",
-        ) %}
-          init_git()
-          try {
-            withCredentials([string(
-              credentialsId: 'dockerhub-tlcpackstaging-key',
-              variable: 'DOCKERHUB_KEY',
-            )]) {
-              sh(
-                script: 'docker login -u tlcpackstaging -p ${DOCKERHUB_KEY}',
-                label: 'Log in to Docker Hub',
-              )
-            }
-            def date_Ymd_HMS = sh(
-              script: 'python3 -c \'import datetime; print(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))\'',
-              label: 'Determine date',
-              returnStdout: true,
-            ).trim()
-            def tag = "${date_Ymd_HMS}-${upstream_revision.substring(0, 8)}"
-            {% for image in images %}
-            update_docker(built_{{ image.name }}, "tlcpackstaging/{{ image.name }}:${tag}")
-            {% endfor %}
-          } finally {
-            sh(
-              script: 'docker logout',
-              label: 'Clean up login credentials'
-            )
-          }
-        {% endcall %}
-        {% call m.deploy_step(
-          name="Tag tlcpackstaging to tlcpack",
-          feature_flag="env.DOCS_DEPLOY_ENABLED == 'yes'",
-          ws="tvm/tag-images",
-        ) %}
-          init_git()
-          withCredentials([string(
-            credentialsId: 'dockerhub-tlcpack-key',
-            variable: 'TLCPACK_TOKEN',
-          )]) {
-            try {
-              sh(
-                script: 'echo $TLCPACK_TOKEN | docker login --username octomldriazati --password-stdin',
-                label: 'Log in to Docker Hub'
-              )
-              {% for image in images %}
-              if ({{ image.name }}.contains("tlcpackstaging")) {
-                // Push image to tlcpack
-                def tag = {{ image.name }}.split(":")[1]
-                sh(
-                  script: """
-                    set -eux
-                    . ${jenkins_scripts_root}/retry.sh
-                    docker pull tlcpackstaging/{{ image.name }}:${tag}
-                    docker tag tlcpackstaging/{{ image.name }}:${tag} tlcpack/{{ image.name.replace("_", "-") }}:${tag}
-                    retry 5 docker push tlcpack/{{ image.name.replace("_", "-") }}:${tag}
-                  """,
-                  label: 'Tag tlcpackstaging/{{ image.name }} image to tlcpack',
-                )
-              }
-              {% endfor %}
-            } finally {
-              sh(
-                script: 'docker logout',
-                label: 'Clean up login credentials'
-              )
-            }
-          }
-        {% endcall %}
-      )
-    }
-  }
-}
diff --git a/ci/jenkins/DockerBuild.groovy.j2 b/ci/jenkins/DockerBuild.groovy.j2
deleted file mode 100644
index 69e0db4f9e4f..000000000000
--- a/ci/jenkins/DockerBuild.groovy.j2
+++ /dev/null
@@ -1,118 +0,0 @@
-def ecr_push(full_name) {
-  aws_account_id = sh(
-    returnStdout: true,
-    script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"',
-    label: 'Get AWS ID'
-  ).trim()
-
-  def ecr_name = "${aws_account_id}.{{ aws_ecr_url }}/${full_name}"
-  try {
-    withEnv([
-      "AWS_ACCOUNT_ID=${aws_account_id}",
-      'AWS_DEFAULT_REGION={{ aws_default_region }}',
-      "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) {
-      sh(
-        script: '''
-          set -eux
-          aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO
-        ''',
-        label: 'Log in to ECR'
-      )
-      sh(
-        script: """
-          set -x
-          . ${jenkins_scripts_root}/retry.sh
-          docker tag ${full_name} \$AWS_ECR_REPO/${full_name}
-          retry 5 docker push \$AWS_ECR_REPO/${full_name}
-        """,
-        label: 'Upload image to ECR'
-      )
-    }
-  } finally {
-    withEnv([
-      "AWS_ACCOUNT_ID=${aws_account_id}",
-      'AWS_DEFAULT_REGION={{ aws_default_region }}',
-      "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) {
-      sh(
-        script: 'docker logout $AWS_ECR_REPO',
-        label: 'Clean up login credentials'
-      )
-    }
-  }
-  return ecr_name
-}
-
-def ecr_pull(full_name) {
-  aws_account_id = sh(
-    returnStdout: true,
-    script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"',
-    label: 'Get AWS ID'
-  ).trim()
-
-  try {
-    withEnv([
-      "AWS_ACCOUNT_ID=${aws_account_id}",
-      'AWS_DEFAULT_REGION={{ aws_default_region }}',
-      "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) {
-      sh(
-        script: '''
-          set -eux
-          aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO
-        ''',
-        label: 'Log in to ECR'
-      )
-      sh(
-        script: """
-          set -eux
-          . ${jenkins_scripts_root}/retry.sh
-          retry 5 docker pull ${full_name}
-        """,
-        label: 'Pull image from ECR'
-      )
-    }
-  } finally {
-    withEnv([
-      "AWS_ACCOUNT_ID=${aws_account_id}",
-      'AWS_DEFAULT_REGION={{ aws_default_region }}',
-      "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) {
-      sh(
-        script: 'docker logout $AWS_ECR_REPO',
-        label: 'Clean up login credentials'
-      )
-    }
-  }
-}
-
-def build_image(image_name) {
-  hash = sh(
-    returnStdout: true,
-    script: 'git log -1 --format=\'%h\''
-  ).trim()
-  def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}"
-  sh(
-    script: "${docker_build} ${image_name} --spec ${full_name}",
-    label: 'Build docker image'
-  )
-  return ecr_push(full_name)
-}
-
-
-def build_docker_images() {
-  stage('Docker Image Build') {
-    parallel(
-    {% for image in images %}
-      '{{ image.name }}': {
-        node('{{ image.platform }}') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            init_git()
-            // We're purposefully not setting the built image here since they
-            // are not yet being uploaded to tlcpack
-            // {{ image.name }} = build_image('{{ image.name }}')
-            built_{{ image.name }} = build_image('{{ image.name }}');
-          }
-        }
-      },
-    {% endfor %}
-    )
-  }
-}
diff --git a/ci/jenkins/Lint.groovy.j2 b/ci/jenkins/Lint.groovy.j2
deleted file mode 100644
index 3ede64301c93..000000000000
--- a/ci/jenkins/Lint.groovy.j2
+++ /dev/null
@@ -1,19 +0,0 @@
-def lint() {
-  stage('Lint') {
-    parallel(
-      {% call m.sharded_lint_step(
-          name='Lint',
-          num_shards=2,
-          node='CPU-SMALL',
-          ws='tvm/lint',
-          docker_image='ci_lint',
-        )
-      %}
-        sh (
-          script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh",
-          label: 'Run lint',
-        )
-      {% endcall %}
-    )
-  }
-}
diff --git a/ci/jenkins/Makefile b/ci/jenkins/Makefile
deleted file mode 100644
index 5c9e0ac54057..000000000000
--- a/ci/jenkins/Makefile
+++ /dev/null
@@ -1,27 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-_venv: requirements.txt
-	rm -rf _venv
-	python3 -mvenv _venv
-	_venv/bin/pip3 install -r requirements.txt
-
-all: _venv
-	_venv/bin/python3 generate.py
-
-.PHONY: all venv
-.DEFAULT_GOAL=all
diff --git a/ci/jenkins/Test.groovy.j2 b/ci/jenkins/Test.groovy.j2
deleted file mode 100644
index 274a3e2dce6c..000000000000
--- a/ci/jenkins/Test.groovy.j2
+++ /dev/null
@@ -1,344 +0,0 @@
-{% set test_method_names = [] %}
-
-def cpp_unittest(image) {
-  sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
-    label: 'Run C++ tests',
-  )
-}
-
-def micro_cpp_unittest(image) {
-  sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
-    label: 'Run microTVM C++ tests',
-  )
-}
-
-// We have to do this whacky split of the code from where it's used since the
-// JVM limits method length to 64k and we easily exceed that with all this
-// autogenerated code. This makes it so each test step is in its own method so
-// that each individual method isn't too big.
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="unittest: GPU",
-  num_shards=3,
-  node="GPU",
-  ws="tvm/ut-python-gpu",
-  platform="gpu",
-  docker_image="ci_gpu",
-  test_method_names=test_method_names,
-) %}
-  {% if shard_index == 1 %}
-  {{ m.download_artifacts(tag='gpu2') }}
-  sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
-  // These require a GPU to finish the build (i.e. CUDA needs to be load-able)
-  make_standalone_crt(ci_gpu, 'build')
-  // make_cpp_tests(ci_gpu, 'build')
-  // cpp_unittest(ci_gpu)
-
-  sh "rm -rf build"
-  {{ m.download_artifacts(tag='gpu') }}
-  ci_setup(ci_gpu)
-  sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
-  make_standalone_crt(ci_gpu, 'build')
-  make_cpp_tests(ci_gpu, 'build')
-  cpp_unittest(ci_gpu)
-  micro_cpp_unittest(ci_gpu)
-  {% else %}
-  {{ m.download_artifacts(tag='gpu') }}
-  ci_setup(ci_gpu)
-  {% endif %}
-  {% if shard_index == 2 or num_shards < 2 %}
-  sh (
-    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh",
-    label: 'Run Java unit tests',
-  )
-  {% endif %}
-  sh (
-    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh",
-    label: 'Run Python GPU unit tests',
-  )
-  sh (
-    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh",
-    label: 'Run Python GPU integration tests',
-  )
-{% endcall %}
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="integration: CPU",
-  node="CPU-SMALL",
-  num_shards=4,
-  ws="tvm/integration-python-cpu",
-  platform="cpu",
-  docker_image="ci_cpu",
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='cpu') }}
-  ci_setup(ci_cpu)
-  sh (
-    script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-    label: 'Run CPU integration tests',
-  )
-{% endcall %}
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="python: i386",
-  node="CPU-SMALL",
-  num_shards=3,
-  ws="tvm/integration-python-i386",
-  platform="i386",
-  docker_image="ci_i386",
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='i386') }}
-  ci_setup(ci_i386)
-  {% if shard_index == 1 %}
-  cpp_unittest(ci_i386)
-  micro_cpp_unittest(ci_i386)
-  {% endif %}
-  python_unittest(ci_i386)
-  sh (
-    script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
-    label: 'Run i386 integration tests',
-  )
-  {% if shard_index == 2 or num_shards < 2 %}
-  fsim_test(ci_i386)
-  {% endif %}
-{% endcall %}
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="test: Hexagon",
-  node="CPU-SMALL",
-  ws="tvm/test-hexagon",
-  platform="hexagon",
-  docker_image="ci_hexagon",
-  test_method_names=test_method_names,
-  num_shards=8,
-) %}
-  {{ m.download_artifacts(tag='hexagon') }}
-  ci_setup(ci_hexagon)
-  {% if shard_index == 1 %}
-  cpp_unittest(ci_hexagon)
-  {% endif %}
-  sh (
-    script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-    label: 'Run Hexagon tests',
-  )
-{% endcall %}
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="integration: aarch64",
-  num_shards=4,
-  node="ARM-SMALL",
-  ws="tvm/ut-python-arm",
-  platform="arm",
-  docker_image="ci_arm",
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='arm') }}
-  ci_setup(ci_arm)
-  python_unittest(ci_arm)
-  sh (
-    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-    label: 'Run CPU integration tests',
-  )
-{% endcall %}
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="topi: GPU",
-  node="GPU",
-  num_shards=3,
-  ws="tvm/topi-python-gpu",
-  platform="gpu",
-  docker_image="ci_gpu",
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='gpu') }}
-  ci_setup(ci_gpu)
-  sh (
-    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-    label: 'Run TOPI tests',
-  )
-{% endcall %}
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="frontend: GPU",
-  node="GPU",
-  num_shards=6,
-  ws="tvm/frontend-python-gpu",
-  platform="gpu",
-  docker_image="ci_gpu",
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='gpu') }}
-  ci_setup(ci_gpu)
-  sh (
-    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
-    label: 'Run Python frontend tests',
-  )
-{% endcall %}
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="topi: aarch64",
-  node="ARM-SMALL",
-  ws="tvm/ut-python-arm",
-  platform="arm",
-  docker_image="ci_arm",
-  num_shards=2,
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='arm') }}
-  ci_setup(ci_arm)
-  {% if shard_index == 1 %}
-  cpp_unittest(ci_arm)
-  micro_cpp_unittest(ci_arm)
-  {% endif %}
-  sh (
-    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
-    label: 'Run test_arm_compute_lib test',
-  )
-  sh (
-    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
-    label: 'Run TOPI tests',
-  )
-{% endcall %}
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="frontend: aarch64",
-  node="ARM-SMALL",
-  ws="tvm/frontend-python-arm",
-  platform="arm",
-  docker_image="ci_arm",
-  num_shards=2,
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='arm') }}
-  ci_setup(ci_arm)
-  sh (
-    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
-    label: 'Run Python frontend tests',
-  )
-{% endcall %}
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="test: Cortex-M",
-  node="CPU-SMALL",
-  ws="tvm/test-cortexm",
-  platform="cortexm",
-  docker_image="ci_cortexm",
-  num_shards=12,
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='cortexm') }}
-  ci_setup(ci_cortexm)
-  {% if shard_index == 1%}
-  cpp_unittest(ci_cortexm)
-  micro_cpp_unittest(ci_cortexm)
-  sh (
-    script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_demo_microtvm.sh",
-    label: 'Run microTVM demos',
-  )
-  {% endif %}
-  sh (
-    script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
-    label: 'Run microTVM tests',
-  )
-{% endcall %}
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="test: RISC-V",
-  node="CPU-SMALL",
-  ws="tvm/test-riscv",
-  platform="riscv",
-  docker_image="ci_riscv",
-  num_shards=1,
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='riscv') }}
-  ci_setup(ci_riscv)
-  {% if shard_index == 1%}
-  cpp_unittest(ci_cortexm)
-  micro_cpp_unittest(ci_cortexm)
-  {% endif %}
-  sh (
-    script: "${docker_run} ${ci_riscv} ./tests/scripts/task_riscv_microtvm.sh",
-    label: 'Run microTVM tests',
-  )
-{% endcall %}
-
-def run_unittest_minimal() {
-  {% call m.test_step_body(
-      name="unittest: CPU MINIMAL",
-      node="CPU-SMALL",
-      ws="tvm/ut-python-cpu-minimal",
-      platform="minimal",
-      docker_image="ci_minimal",
-    ) %}
-      {{ m.download_artifacts(tag='cpu-minimal') }}
-      cpp_unittest(ci_minimal)
-      micro_cpp_unittest(ci_minimal)
-      python_unittest(ci_minimal)
-  {% endcall %}
-}
-
-def test() {
-stage('Test') {
-  environment {
-    SKIP_SLOW_TESTS = "${skip_slow_tests}"
-  }
-  parallel(
-  {% for stage_name, method_name in test_method_names %}
-  '{{ stage_name }}': {
-    {{ method_name }}()
-  },
-  {% endfor %}
-  'unittest: CPU MINIMAL': {
-    run_unittest_minimal()
-  },
-  {% call m.test_step(
-    name="unittest: CPU",
-    node="CPU-SMALL",
-    ws="tvm/ut-python-cpu",
-    platform="cpu",
-    docker_image="ci_cpu",
-  ) %}
-    {{ m.download_artifacts(tag='cpu') }}
-    ci_setup(ci_cpu)
-    cpp_unittest(ci_cpu)
-    micro_cpp_unittest(ci_cpu)
-    python_unittest(ci_cpu)
-    fsim_test(ci_cpu)
-    sh (
-      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh",
-      label: 'Run VTA tests in TSIM',
-    )
-  {% endcall %}
-  {% call m.test_step(
-    name="frontend: CPU",
-    node="CPU-SMALL",
-    ws="tvm/frontend-python-cpu",
-    platform="cpu",
-    docker_image="ci_cpu",
-) %}
-    {{ m.download_artifacts(tag='cpu') }}
-    ci_setup(ci_cpu)
-    sh (
-      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh",
-      label: 'Run Python frontend tests',
-    )
-  {% endcall %}
-  'docs: GPU': {
-    if (!skip_ci) {
-      node('GPU') {
-        ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) {
-          init_git()
-          docker_init(ci_gpu)
-          {{ m.download_artifacts(tag='gpu') }}
-          timeout(time: 180, unit: 'MINUTES') {
-            ci_setup(ci_gpu)
-            sh (
-              script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh",
-              label: 'Build docs',
-            )
-          }
-          {{ m.upload_artifacts(tag='docs', filenames=["docs.tgz"]) }}
-          sh(
-            script: "aws s3 cp --no-progress _docs s3://${s3_bucket}/${s3_prefix}/docs --recursive",
-            label: 'Upload docs to S3',
-          )
-        }
-      }
-    }
-  },
-  )
-}
-}
diff --git a/ci/jenkins/data.py b/ci/jenkins/data.py
new file mode 100644
index 000000000000..492608870e01
--- /dev/null
+++ b/ci/jenkins/data.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import sys
+
+
+files_to_stash = {
+    # Executables and build files needed to run c++ tests
+    "cpptest": ["build/cpptest", "build/build.ninja", "build/CMakeFiles/rules.ninja"],
+    # Executables and build files needed to c runtime tests
+    "crttest": ["build/crttest"],
+    # Folder for hexagon build
+    "hexagon_api": [
+        "build/hexagon_api_output",
+    ],
+    # Folder for microtvm build
+    "microtvm_template_projects": [
+        "build/microtvm_template_projects",
+    ],
+    # Folders and build files for c runtime
+    "standalone_crt": ["build/standalone_crt", "build/build.ninja"],
+    # This library is produced with HIDE_PRIVATE_SYMBOLS=ON
+    "tvm_allvisible": ["build/libtvm_allvisible.so"],
+    # runtime files
+    "tvm_runtime": ["build/libtvm_runtime.so", "build/config.cmake"],
+    # compiler files
+    "tvm_lib": ["build/libtvm.so", "build/libtvm_runtime.so", "build/config.cmake"],
+    # compiler files and fsim
+    "tvm_multilib": [
+        "build/libtvm.so",
+        "build/libvta_fsim.so",
+        "build/libtvm_runtime.so",
+        "build/config.cmake",
+    ],
+    # compiler files, fsim, and tsim
+    "tvm_multilib_tsim": [
+        "build/libvta_tsim.so",
+        "build/libtvm.so",
+        "build/libvta_fsim.so",
+        "build/libtvm_runtime.so",
+        "build/config.cmake",
+    ],
+}
+
+
+# AWS info
+aws_default_region = "us-west-2"
+aws_ecr_url = "dkr.ecr." + aws_default_region + ".amazonaws.com"
+
+# Docker Images
+docker_images = {
+    "ci_arm": {
+        "tag": "tlcpack/ci-arm:20221013-060115-61c9742ea",
+        "platform": "ARM",
+    },
+    "ci_cortexm": {
+        "tag": "tlcpack/ci-cortexm:20221013-060115-61c9742ea",
+        "platform": "CPU",
+    },
+    "ci_cpu": {
+        "tag": "tlcpack/ci-cpu:20221013-060115-61c9742ea",
+        "platform": "CPU",
+    },
+    "ci_gpu": {
+        "tag": "tlcpack/ci-gpu:20221019-060125-0b4836739",
+        "platform": "GPU",
+    },
+    "ci_hexagon": {
+        "tag": "tlcpack/ci-hexagon:20221013-060115-61c9742ea",
+        "platform": "CPU",
+    },
+    "ci_i386": {
+        "tag": "tlcpack/ci-i386:20221013-060115-61c9742ea",
+        "platform": "CPU",
+    },
+    "ci_lint": {
+        "tag": "tlcpack/ci-lint:20221013-060115-61c9742ea",
+        "platform": "CPU",
+    },
+    "ci_minimal": {
+        "tag": "tlcpack/ci-minimal:20221013-060115-61c9742ea",
+        "platform": "CPU",
+    },
+    "ci_riscv": {
+        "tag": "tlcpack/ci-riscv:20221013-060115-61c9742ea",
+        "platform": "CPU",
+    },
+    "ci_wasm": {
+        "tag": "tlcpack/ci-wasm:20221013-060115-61c9742ea",
+        "platform": "CPU",
+    },
+}
+
+data = {
+    "images": [{"name": k, "platform": v["platform"]} for k, v in docker_images.items()],
+    "aws_default_region": aws_default_region,
+    "aws_ecr_url": aws_ecr_url,
+    **{k: v["tag"] for k, v in docker_images.items()},
+    **files_to_stash,
+}
+
+if __name__ == "__main__":
+    # This is used in docker/dev_common.sh to look up image tags
+    name = sys.argv[1]
+    if name in docker_images:
+        print(docker_images[name]["tag"])
+    else:
+        exit(1)
diff --git a/ci/jenkins/generate.py b/ci/jenkins/generate.py
index 07bf4b5a8dad..30c12be5f95d 100644
--- a/ci/jenkins/generate.py
+++ b/ci/jenkins/generate.py
@@ -23,12 +23,16 @@
 import textwrap
 
 from pathlib import Path
-from typing import List
+from typing import List, Optional
+from dataclasses import dataclass
+
+from data import data
 
 
 REPO_ROOT = Path(__file__).resolve().parent.parent.parent
-JENKINSFILE_TEMPLATE = REPO_ROOT / "ci" / "jenkins" / "Jenkinsfile.j2"
-JENKINSFILE = REPO_ROOT / "Jenkinsfile"
+JENKINS_DIR = REPO_ROOT / "ci" / "jenkins"
+TEMPLATES_DIR = JENKINS_DIR / "templates"
+GENERATED_DIR = JENKINS_DIR / "generated"
 
 
 class Change:
@@ -37,50 +41,12 @@ class Change:
     FULL = object()
 
 
-data = {
-    "images": [
-        {
-            "name": "ci_arm",
-            "platform": "ARM",
-        },
-        {
-            "name": "ci_cortexm",
-            "platform": "CPU",
-        },
-        {
-            "name": "ci_cpu",
-            "platform": "CPU",
-        },
-        {
-            "name": "ci_gpu",
-            "platform": "CPU",
-        },
-        {
-            "name": "ci_hexagon",
-            "platform": "CPU",
-        },
-        {
-            "name": "ci_i386",
-            "platform": "CPU",
-        },
-        {
-            "name": "ci_lint",
-            "platform": "CPU",
-        },
-        {
-            "name": "ci_minimal",
-            "platform": "CPU",
-        },
-        {
-            "name": "ci_riscv",
-            "platform": "CPU",
-        },
-        {
-            "name": "ci_wasm",
-            "platform": "CPU",
-        },
-    ]
-}
+@dataclass
+class ChangeData:
+    diff: Optional[str]
+    content: str
+    destination: Path
+    source: Path
 
 
 def lines_without_generated_tag(content):
@@ -133,36 +99,44 @@ def change_type(lines: List[str]) -> Change:
         return Change.FULL
 
 
-if __name__ == "__main__":
-    help = "Regenerate Jenkinsfile from template"
-    parser = argparse.ArgumentParser(description=help)
-    parser.add_argument("--force", action="store_true", help="always overwrite timestamp")
-    parser.add_argument("--check", action="store_true", help="just verify the output didn't change")
-    args = parser.parse_args()
-
-    with open(JENKINSFILE) as f:
-        content = f.read()
+def update_jenkinsfile(source: Path) -> ChangeData:
+    destination = GENERATED_DIR / source.stem
 
     data["generated_time"] = datetime.datetime.now().isoformat()
-    timestamp_match = re.search(r"^// Generated at (.*)$", content, flags=re.MULTILINE)
-    if not timestamp_match:
-        raise RuntimeError("Could not find timestamp in Jenkinsfile")
-    original_timestamp = timestamp_match.groups()[0]
+    if destination.exists():
+        with open(destination) as f:
+            old_generated_content = f.read()
+
+        timestamp_match = re.search(
+            r"^// Generated at (.*)$", old_generated_content, flags=re.MULTILINE
+        )
+        if not timestamp_match:
+            raise RuntimeError(
+                f"Could not find timestamp in Jenkinsfile: {destination.relative_to(TEMPLATES_DIR)}"
+            )
+        original_timestamp = timestamp_match.groups()[0]
 
     environment = jinja2.Environment(
-        loader=jinja2.FileSystemLoader(REPO_ROOT),
+        loader=jinja2.FileSystemLoader(TEMPLATES_DIR),
         undefined=jinja2.StrictUndefined,
         lstrip_blocks=True,
         trim_blocks=True,
         keep_trailing_newline=True,
     )
-    template = environment.get_template(str(JENKINSFILE_TEMPLATE.relative_to(REPO_ROOT)))
+    template = environment.get_template(str(source.relative_to(TEMPLATES_DIR)))
     new_content = template.render(**data)
 
+    if not destination.exists():
+        # New file, create it from scratch
+        return ChangeData(
+            diff=new_content, content=new_content, source=source, destination=destination
+        )
+
     diff = [
         line
         for line in difflib.unified_diff(
-            lines_without_generated_tag(content), lines_without_generated_tag(new_content)
+            lines_without_generated_tag(old_generated_content),
+            lines_without_generated_tag(new_content),
         )
     ]
     change = change_type(diff)
@@ -173,17 +147,30 @@ def change_type(lines: List[str]) -> Change:
 
     diff = "".join(diff)
 
+    return ChangeData(diff=diff, content=new_content, source=source, destination=destination)
+
+
+if __name__ == "__main__":
+    help = "Regenerate Jenkinsfile from template"
+    parser = argparse.ArgumentParser(description=help)
+    parser.add_argument("--force", action="store_true", help="always overwrite timestamp")
+    parser.add_argument("--check", action="store_true", help="just verify the output didn't change")
+    args = parser.parse_args()
+
+    sources = TEMPLATES_DIR.glob("*_jenkinsfile.groovy.j2")
+    changes = [update_jenkinsfile(source) for source in sources if source.name != "base.groovy.j2"]
+
     if args.check:
-        if not diff:
-            print("Success, the newly generated Jenkinsfile matched the one on disk")
+        if all(not data.diff for data in changes):
+            print("Success, the newly generated Jenkinsfiles matched the ones on disk")
             exit(0)
         else:
             print(
                 textwrap.dedent(
                     """
-                Newly generated Jenkinsfile did not match the one on disk! If you have made
-                edits to the Jenkinsfile, move them to 'jenkins/Jenkinsfile.j2' and
-                regenerate the Jenkinsfile from the template with
+                Newly generated Jenkinsfiles did not match the ones on disk! If you have made
+                edits to the Jenkinsfiles in generated/, move them to the corresponding source and
+                regenerate the Jenkinsfiles from the templates with
 
                     python3 -m pip install -r jenkins/requirements.txt
                     python3 jenkins/generate.py
@@ -192,13 +179,20 @@ def change_type(lines: List[str]) -> Change:
             """
                 ).strip()
             )
-            print(diff)
+            for data in changes:
+                if data.diff:
+                    source = data.source.relative_to(REPO_ROOT)
+                    print(source)
+                    print(data.diff)
+
             exit(1)
     else:
-        with open(JENKINSFILE, "w") as f:
-            f.write(new_content)
-        if not diff:
-            print(f"Wrote output to {JENKINSFILE.relative_to(REPO_ROOT)}, no changes made")
-        else:
-            print(f"Wrote output to {JENKINSFILE.relative_to(REPO_ROOT)}, changes:")
-            print(diff)
+        for data in changes:
+            with open(data.destination, "w") as f:
+                f.write(data.content)
+
+            if not data.diff:
+                print(f"Wrote output to {data.destination.relative_to(REPO_ROOT)}, no changes made")
+            else:
+                print(f"Wrote output to {data.destination.relative_to(REPO_ROOT)}, changes:")
+                print(data.diff)
diff --git a/ci/jenkins/generated/arm_jenkinsfile.groovy b/ci/jenkins/generated/arm_jenkinsfile.groovy
new file mode 100644
index 000000000000..f387687528c0
--- /dev/null
+++ b/ci/jenkins/generated/arm_jenkinsfile.groovy
@@ -0,0 +1,944 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T14:48:42.092397
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+def build() {
+  stage('Build') {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('ARM-SMALL') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-arm") {
+          init_git()
+          docker_init(ci_arm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
+          script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
+          label: 'Create ARM cmake config',
+        )
+        cmake_build(ci_arm, 'build', '-j4')
+        make_standalone_crt(ci_arm, 'build')
+        make_cpp_tests(ci_arm, 'build')
+        sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/arm --items build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/crttest build/standalone_crt build/build.ninja",
+            label: 'Upload artifacts to S3',
+          )
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: arm')
+    }
+  }
+}
+build()
+
+
+
+def shard_run_integration_aarch64_1_of_4() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+        try {
+          init_git()
+          docker_init(ci_arm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_arm)
+              python_unittest(ci_arm)
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('integration: aarch64 1 of 4')
+  }
+}
+
+def shard_run_integration_aarch64_2_of_4() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+        try {
+          init_git()
+          docker_init(ci_arm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=1',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_arm)
+              python_unittest(ci_arm)
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('integration: aarch64 2 of 4')
+  }
+}
+
+def shard_run_integration_aarch64_3_of_4() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+        try {
+          init_git()
+          docker_init(ci_arm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=2',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_arm)
+              python_unittest(ci_arm)
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('integration: aarch64 3 of 4')
+  }
+}
+
+def shard_run_integration_aarch64_4_of_4() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+        try {
+          init_git()
+          docker_init(ci_arm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=integration: aarch64',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=3',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_arm)
+              python_unittest(ci_arm)
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('integration: aarch64 4 of 4')
+  }
+}
+
+
+
+def shard_run_topi_aarch64_1_of_2() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+        try {
+          init_git()
+          docker_init(ci_arm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=topi: aarch64',
+              'TVM_NUM_SHARDS=2',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_arm)
+              cpp_unittest(ci_arm)
+              micro_cpp_unittest(ci_arm)
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
+                label: 'Run test_arm_compute_lib test',
+              )
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
+                label: 'Run TOPI tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_aarch64 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('topi: aarch64 1 of 2')
+  }
+}
+
+def shard_run_topi_aarch64_2_of_2() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+        try {
+          init_git()
+          docker_init(ci_arm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=topi: aarch64',
+              'TVM_NUM_SHARDS=2',
+              'TVM_SHARD_INDEX=1',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_arm)
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
+                label: 'Run test_arm_compute_lib test',
+              )
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
+                label: 'Run TOPI tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_aarch64 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('topi: aarch64 2 of 2')
+  }
+}
+
+
+
+def shard_run_frontend_aarch64_1_of_2() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") {
+        try {
+          init_git()
+          docker_init(ci_arm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=frontend: aarch64',
+              'TVM_NUM_SHARDS=2',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_arm)
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
+                label: 'Run Python frontend tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_aarch64 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('frontend: aarch64 1 of 2')
+  }
+}
+
+def shard_run_frontend_aarch64_2_of_2() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('ARM-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") {
+        try {
+          init_git()
+          docker_init(ci_arm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=arm',
+              'TEST_STEP_NAME=frontend: aarch64',
+              'TVM_NUM_SHARDS=2',
+              'TVM_SHARD_INDEX=1',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_arm)
+              sh (
+                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
+                label: 'Run Python frontend tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_aarch64 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('frontend: aarch64 2 of 2')
+  }
+}
+
+
+def test() {
+  stage('Test') {
+    environment {
+      SKIP_SLOW_TESTS = "${skip_slow_tests}"
+    }
+    parallel(
+    'integration: aarch64 1 of 4': {
+      shard_run_integration_aarch64_1_of_4()
+    },
+    'integration: aarch64 2 of 4': {
+      shard_run_integration_aarch64_2_of_4()
+    },
+    'integration: aarch64 3 of 4': {
+      shard_run_integration_aarch64_3_of_4()
+    },
+    'integration: aarch64 4 of 4': {
+      shard_run_integration_aarch64_4_of_4()
+    },
+    'topi: aarch64 1 of 2': {
+      shard_run_topi_aarch64_1_of_2()
+    },
+    'topi: aarch64 2 of 2': {
+      shard_run_topi_aarch64_2_of_2()
+    },
+    'frontend: aarch64 1 of 2': {
+      shard_run_frontend_aarch64_1_of_2()
+    },
+    'frontend: aarch64 2 of 2': {
+      shard_run_frontend_aarch64_2_of_2()
+    },
+    )
+  }
+}
+test()
diff --git a/ci/jenkins/generated/cortexm_jenkinsfile.groovy b/ci/jenkins/generated/cortexm_jenkinsfile.groovy
new file mode 100644
index 000000000000..76dbbbb7a3d8
--- /dev/null
+++ b/ci/jenkins/generated/cortexm_jenkinsfile.groovy
@@ -0,0 +1,1124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T14:48:41.929980
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+def build() {
+  stage('Build') {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cortexm") {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
+          script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
+          label: 'Create Cortex-M cmake config',
+        )
+        cmake_build(ci_cortexm, 'build', '-j2')
+        make_standalone_crt(ci_cortexm, 'build')
+        make_cpp_tests(ci_cortexm, 'build')
+        sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/crttest build/standalone_crt build/build.ninja build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/microtvm_template_projects",
+            label: 'Upload artifacts to S3',
+          )
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: Cortex-M')
+    }
+  }
+}
+build()
+
+
+
+def shard_run_test_Cortex_M_1_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              cpp_unittest(ci_cortexm)
+              micro_cpp_unittest(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_demo_microtvm.sh",
+                label: 'Run microTVM demos',
+              )
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 1 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_2_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=1',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 2 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_3_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=2',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 3 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_4_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=3',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 4 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_5_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=4',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 5 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_6_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=5',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 6 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_7_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=6',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 7 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_8_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=7',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 8 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_9_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=8',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 9 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_10_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=9',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 10 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_11_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=10',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 11 of 12')
+  }
+}
+
+def shard_run_test_Cortex_M_12_of_12() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") {
+        try {
+          init_git()
+          docker_init(ci_cortexm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cortexm',
+              'TEST_STEP_NAME=test: Cortex-M',
+              'TVM_NUM_SHARDS=12',
+              'TVM_SHARD_INDEX=11',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cortexm",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Cortex_M --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Cortex-M 12 of 12')
+  }
+}
+
+
+def test() {
+  stage('Test') {
+    environment {
+      SKIP_SLOW_TESTS = "${skip_slow_tests}"
+    }
+    parallel(
+    'test: Cortex-M 1 of 12': {
+      shard_run_test_Cortex_M_1_of_12()
+    },
+    'test: Cortex-M 2 of 12': {
+      shard_run_test_Cortex_M_2_of_12()
+    },
+    'test: Cortex-M 3 of 12': {
+      shard_run_test_Cortex_M_3_of_12()
+    },
+    'test: Cortex-M 4 of 12': {
+      shard_run_test_Cortex_M_4_of_12()
+    },
+    'test: Cortex-M 5 of 12': {
+      shard_run_test_Cortex_M_5_of_12()
+    },
+    'test: Cortex-M 6 of 12': {
+      shard_run_test_Cortex_M_6_of_12()
+    },
+    'test: Cortex-M 7 of 12': {
+      shard_run_test_Cortex_M_7_of_12()
+    },
+    'test: Cortex-M 8 of 12': {
+      shard_run_test_Cortex_M_8_of_12()
+    },
+    'test: Cortex-M 9 of 12': {
+      shard_run_test_Cortex_M_9_of_12()
+    },
+    'test: Cortex-M 10 of 12': {
+      shard_run_test_Cortex_M_10_of_12()
+    },
+    'test: Cortex-M 11 of 12': {
+      shard_run_test_Cortex_M_11_of_12()
+    },
+    'test: Cortex-M 12 of 12': {
+      shard_run_test_Cortex_M_12_of_12()
+    },
+    )
+  }
+}
+test()
diff --git a/ci/jenkins/generated/cpu_jenkinsfile.groovy b/ci/jenkins/generated/cpu_jenkinsfile.groovy
new file mode 100644
index 000000000000..ad168c591872
--- /dev/null
+++ b/ci/jenkins/generated/cpu_jenkinsfile.groovy
@@ -0,0 +1,842 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T14:48:42.120032
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+def build() {
+  stage('Build') {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cpu") {
+          init_git()
+          docker_init(ci_cpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
+          script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
+          label: 'Create CPU cmake config',
+        )
+        cmake_build(ci_cpu, 'build', '-j2')
+        make_standalone_crt(ci_cpu, 'build')
+        make_cpp_tests(ci_cpu, 'build')
+        sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu --items build/libvta_tsim.so build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/standalone_crt build/build.ninja",
+            label: 'Upload artifacts to S3',
+          )
+
+        ci_setup(ci_cpu)
+        // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
+        // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch
+        sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test')
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: CPU')
+    }
+  }
+}
+build()
+
+
+
+def shard_run_integration_CPU_1_of_4() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
+        try {
+          init_git()
+          docker_init(ci_cpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cpu)
+              sh (
+                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('integration: CPU 1 of 4')
+  }
+}
+
+def shard_run_integration_CPU_2_of_4() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
+        try {
+          init_git()
+          docker_init(ci_cpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=1',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cpu)
+              sh (
+                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('integration: CPU 2 of 4')
+  }
+}
+
+def shard_run_integration_CPU_3_of_4() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
+        try {
+          init_git()
+          docker_init(ci_cpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=2',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cpu)
+              sh (
+                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('integration: CPU 3 of 4')
+  }
+}
+
+def shard_run_integration_CPU_4_of_4() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
+        try {
+          init_git()
+          docker_init(ci_cpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cpu',
+              'TEST_STEP_NAME=integration: CPU',
+              'TVM_NUM_SHARDS=4',
+              'TVM_SHARD_INDEX=3',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cpu)
+              sh (
+                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
+                label: 'Run CPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('integration: CPU 4 of 4')
+  }
+}
+
+
+
+def shard_run_unittest_CPU_1_of_1() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") {
+        try {
+          init_git()
+          docker_init(ci_cpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cpu',
+              'TEST_STEP_NAME=unittest: CPU',
+              'TVM_NUM_SHARDS=1',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cpu)
+              cpp_unittest(ci_cpu)
+              micro_cpp_unittest(ci_cpu)
+              python_unittest(ci_cpu)
+              fsim_test(ci_cpu)
+              sh (
+                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh",
+                label: 'Run VTA tests in TSIM',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_CPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('unittest: CPU 1 of 1')
+  }
+}
+
+
+def shard_run_frontend_CPU_1_of_1() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-cpu") {
+        try {
+          init_git()
+          docker_init(ci_cpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=cpu',
+              'TEST_STEP_NAME=frontend: CPU',
+              'TVM_NUM_SHARDS=1',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_cpu)
+              sh (
+                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh",
+                label: 'Run Python frontend tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_CPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('frontend: CPU 1 of 1')
+  }
+}
+
+
+def test() {
+  stage('Test') {
+    environment {
+      SKIP_SLOW_TESTS = "${skip_slow_tests}"
+    }
+    parallel(
+    'integration: CPU 1 of 4': {
+      shard_run_integration_CPU_1_of_4()
+    },
+    'integration: CPU 2 of 4': {
+      shard_run_integration_CPU_2_of_4()
+    },
+    'integration: CPU 3 of 4': {
+      shard_run_integration_CPU_3_of_4()
+    },
+    'integration: CPU 4 of 4': {
+      shard_run_integration_CPU_4_of_4()
+    },
+    'unittest: CPU 1 of 1': {
+      shard_run_unittest_CPU_1_of_1()
+    },
+    'frontend: CPU 1 of 1': {
+      shard_run_frontend_CPU_1_of_1()
+    },
+    )
+  }
+}
+test()
diff --git a/ci/jenkins/generated/docker_jenkinsfile.groovy b/ci/jenkins/generated/docker_jenkinsfile.groovy
new file mode 100644
index 000000000000..28e81efb7bf0
--- /dev/null
+++ b/ci/jenkins/generated/docker_jenkinsfile.groovy
@@ -0,0 +1,960 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T14:48:41.987490
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+def ecr_push(full_name) {
+  aws_account_id = sh(
+    returnStdout: true,
+    script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"',
+    label: 'Get AWS ID'
+  ).trim()
+
+  def ecr_name = "${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com/${full_name}"
+  try {
+    withEnv([
+      "AWS_ACCOUNT_ID=${aws_account_id}",
+      'AWS_DEFAULT_REGION=us-west-2',
+      "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) {
+      sh(
+        script: '''
+          set -eux
+          aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO
+        ''',
+        label: 'Log in to ECR'
+      )
+      sh(
+        script: """
+          set -x
+          . ${jenkins_scripts_root}/retry.sh
+          docker tag ${full_name} \$AWS_ECR_REPO/${full_name}
+          retry 5 docker push \$AWS_ECR_REPO/${full_name}
+        """,
+        label: 'Upload image to ECR'
+      )
+    }
+  } finally {
+    withEnv([
+      "AWS_ACCOUNT_ID=${aws_account_id}",
+      'AWS_DEFAULT_REGION=us-west-2',
+      "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) {
+      sh(
+        script: 'docker logout $AWS_ECR_REPO',
+        label: 'Clean up login credentials'
+      )
+    }
+  }
+  return ecr_name
+}
+
+def ecr_pull(full_name) {
+  aws_account_id = sh(
+    returnStdout: true,
+    script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"',
+    label: 'Get AWS ID'
+  ).trim()
+
+  try {
+    withEnv([
+      "AWS_ACCOUNT_ID=${aws_account_id}",
+      'AWS_DEFAULT_REGION=us-west-2',
+      "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) {
+      sh(
+        script: '''
+          set -eux
+          aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO
+        ''',
+        label: 'Log in to ECR'
+      )
+      sh(
+        script: """
+          set -eux
+          . ci/scripts/retry.sh
+          retry 5 docker pull ${full_name}
+        """,
+        label: 'Pull image from ECR'
+      )
+    }
+  } finally {
+    withEnv([
+      "AWS_ACCOUNT_ID=${aws_account_id}",
+      'AWS_DEFAULT_REGION=us-west-2',
+      "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) {
+      sh(
+        script: 'docker logout $AWS_ECR_REPO',
+        label: 'Clean up login credentials'
+      )
+    }
+  }
+}
+
+def build_image(image_name) {
+  hash = sh(
+    returnStdout: true,
+    script: 'git log -1 --format=\'%h\''
+  ).trim()
+  def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}"
+  sh(
+    script: "${docker_build} ${image_name} --spec ${full_name}",
+    label: 'Build docker image'
+  )
+  return ecr_push(full_name)
+}
+
+def update_docker(ecr_image, hub_image) {
+  if (ecr_image == null) {
+    sh("image was not rebuilt, skipping")
+    return
+  }
+  if (!ecr_image.contains("amazonaws.com")) {
+    sh("echo \"Skipping '${ecr_image}' -> '${hub_image}' since it doesn\'t look like an ECR image\"")
+    return
+  }
+  docker_init(ecr_image)
+  sh(
+    script: """
+    set -eux
+    . ${jenkins_scripts_root}/retry.sh
+    docker tag \
+      ${ecr_image} \
+      ${hub_image}
+    retry 5 docker push ${hub_image}
+    """,
+    label: "Update ${hub_image} on Docker Hub",
+  )
+}
+
+stage('Docker Image Build') {
+  parallel(
+    'ci_arm': {
+      node('ARM') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // ci_arm = build_image('ci_arm')
+          built_ci_arm = build_image('ci_arm');
+        }
+      }
+    },
+    'ci_cortexm': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // ci_cortexm = build_image('ci_cortexm')
+          built_ci_cortexm = build_image('ci_cortexm');
+        }
+      }
+    },
+    'ci_cpu': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // ci_cpu = build_image('ci_cpu')
+          built_ci_cpu = build_image('ci_cpu');
+        }
+      }
+    },
+    'ci_gpu': {
+      node('GPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // ci_gpu = build_image('ci_gpu')
+          built_ci_gpu = build_image('ci_gpu');
+        }
+      }
+    },
+    'ci_hexagon': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // ci_hexagon = build_image('ci_hexagon')
+          built_ci_hexagon = build_image('ci_hexagon');
+        }
+      }
+    },
+    'ci_i386': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // ci_i386 = build_image('ci_i386')
+          built_ci_i386 = build_image('ci_i386');
+        }
+      }
+    },
+    'ci_lint': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // ci_lint = build_image('ci_lint')
+          built_ci_lint = build_image('ci_lint');
+        }
+      }
+    },
+    'ci_minimal': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // ci_minimal = build_image('ci_minimal')
+          built_ci_minimal = build_image('ci_minimal');
+        }
+      }
+    },
+    'ci_riscv': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // ci_riscv = build_image('ci_riscv')
+          built_ci_riscv = build_image('ci_riscv');
+        }
+      }
+    },
+    'ci_wasm': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // ci_wasm = build_image('ci_wasm')
+          built_ci_wasm = build_image('ci_wasm');
+        }
+      }
+    },
+  )
+}
+
+def deploy() {
+  stage('Deploy') {
+    if (env.BRANCH_NAME == 'main') {
+      parallel(
+  'Upload built Docker images': {
+    if (env.DEPLOY_DOCKER_IMAGES == 'yes' && rebuild_docker_images && upstream_revision != null) {
+      node('CPU') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docker") {
+          timeout(time: max_time, unit: 'MINUTES') {
+            init_git()
+                    try {
+                      withCredentials([string(
+                        credentialsId: 'dockerhub-tlcpackstaging-key',
+                        variable: 'DOCKERHUB_KEY',
+                      )]) {
+                        sh(
+                          script: 'docker login -u tlcpackstaging -p ${DOCKERHUB_KEY}',
+                          label: 'Log in to Docker Hub',
+                        )
+                      }
+                      def date_Ymd_HMS = sh(
+                        script: 'python3 -c \'import datetime; print(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))\'',
+                        label: 'Determine date',
+                        returnStdout: true,
+                      ).trim()
+                      def tag = "${date_Ymd_HMS}-${upstream_revision.substring(0, 8)}"
+                      update_docker(built_ci_arm, "tlcpackstaging/ci_arm:${tag}")
+                      update_docker(built_ci_cortexm, "tlcpackstaging/ci_cortexm:${tag}")
+                      update_docker(built_ci_cpu, "tlcpackstaging/ci_cpu:${tag}")
+                      update_docker(built_ci_gpu, "tlcpackstaging/ci_gpu:${tag}")
+                      update_docker(built_ci_hexagon, "tlcpackstaging/ci_hexagon:${tag}")
+                      update_docker(built_ci_i386, "tlcpackstaging/ci_i386:${tag}")
+                      update_docker(built_ci_lint, "tlcpackstaging/ci_lint:${tag}")
+                      update_docker(built_ci_minimal, "tlcpackstaging/ci_minimal:${tag}")
+                      update_docker(built_ci_riscv, "tlcpackstaging/ci_riscv:${tag}")
+                      update_docker(built_ci_wasm, "tlcpackstaging/ci_wasm:${tag}")
+                    } finally {
+                      sh(
+                        script: 'docker logout',
+                        label: 'Clean up login credentials'
+                      )
+                    }
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('Upload built Docker images')
+    }
+  },
+  'Tag tlcpackstaging to tlcpack': {
+    if (env.DEPLOY_DOCKER_IMAGES == 'yes') {
+      node('CPU') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/tag-images") {
+          timeout(time: max_time, unit: 'MINUTES') {
+            init_git()
+                    withCredentials([string(
+                      credentialsId: 'dockerhub-tlcpack-key',
+                      variable: 'TLCPACK_TOKEN',
+                    )]) {
+                      try {
+                        sh(
+                          script: 'echo $TLCPACK_TOKEN | docker login --username octomldriazati --password-stdin',
+                          label: 'Log in to Docker Hub'
+                        )
+                        if (ci_arm.contains("tlcpackstaging")) {
+                          // Push image to tlcpack
+                          def tag = ci_arm.split(":")[1]
+                          sh(
+                            script: """
+                              set -eux
+                              . ${jenkins_scripts_root}/retry.sh
+                              docker pull tlcpackstaging/ci_arm:${tag}
+                              docker tag tlcpackstaging/ci_arm:${tag} tlcpack/ci-arm:${tag}
+                              retry 5 docker push tlcpack/ci-arm:${tag}
+                            """,
+                            label: 'Tag tlcpackstaging/ci_arm image to tlcpack',
+                          )
+                        }
+                        if (ci_cortexm.contains("tlcpackstaging")) {
+                          // Push image to tlcpack
+                          def tag = ci_cortexm.split(":")[1]
+                          sh(
+                            script: """
+                              set -eux
+                              . ${jenkins_scripts_root}/retry.sh
+                              docker pull tlcpackstaging/ci_cortexm:${tag}
+                              docker tag tlcpackstaging/ci_cortexm:${tag} tlcpack/ci-cortexm:${tag}
+                              retry 5 docker push tlcpack/ci-cortexm:${tag}
+                            """,
+                            label: 'Tag tlcpackstaging/ci_cortexm image to tlcpack',
+                          )
+                        }
+                        if (ci_cpu.contains("tlcpackstaging")) {
+                          // Push image to tlcpack
+                          def tag = ci_cpu.split(":")[1]
+                          sh(
+                            script: """
+                              set -eux
+                              . ${jenkins_scripts_root}/retry.sh
+                              docker pull tlcpackstaging/ci_cpu:${tag}
+                              docker tag tlcpackstaging/ci_cpu:${tag} tlcpack/ci-cpu:${tag}
+                              retry 5 docker push tlcpack/ci-cpu:${tag}
+                            """,
+                            label: 'Tag tlcpackstaging/ci_cpu image to tlcpack',
+                          )
+                        }
+                        if (ci_gpu.contains("tlcpackstaging")) {
+                          // Push image to tlcpack
+                          def tag = ci_gpu.split(":")[1]
+                          sh(
+                            script: """
+                              set -eux
+                              . ${jenkins_scripts_root}/retry.sh
+                              docker pull tlcpackstaging/ci_gpu:${tag}
+                              docker tag tlcpackstaging/ci_gpu:${tag} tlcpack/ci-gpu:${tag}
+                              retry 5 docker push tlcpack/ci-gpu:${tag}
+                            """,
+                            label: 'Tag tlcpackstaging/ci_gpu image to tlcpack',
+                          )
+                        }
+                        if (ci_hexagon.contains("tlcpackstaging")) {
+                          // Push image to tlcpack
+                          def tag = ci_hexagon.split(":")[1]
+                          sh(
+                            script: """
+                              set -eux
+                              . ${jenkins_scripts_root}/retry.sh
+                              docker pull tlcpackstaging/ci_hexagon:${tag}
+                              docker tag tlcpackstaging/ci_hexagon:${tag} tlcpack/ci-hexagon:${tag}
+                              retry 5 docker push tlcpack/ci-hexagon:${tag}
+                            """,
+                            label: 'Tag tlcpackstaging/ci_hexagon image to tlcpack',
+                          )
+                        }
+                        if (ci_i386.contains("tlcpackstaging")) {
+                          // Push image to tlcpack
+                          def tag = ci_i386.split(":")[1]
+                          sh(
+                            script: """
+                              set -eux
+                              . ${jenkins_scripts_root}/retry.sh
+                              docker pull tlcpackstaging/ci_i386:${tag}
+                              docker tag tlcpackstaging/ci_i386:${tag} tlcpack/ci-i386:${tag}
+                              retry 5 docker push tlcpack/ci-i386:${tag}
+                            """,
+                            label: 'Tag tlcpackstaging/ci_i386 image to tlcpack',
+                          )
+                        }
+                        if (ci_lint.contains("tlcpackstaging")) {
+                          // Push image to tlcpack
+                          def tag = ci_lint.split(":")[1]
+                          sh(
+                            script: """
+                              set -eux
+                              . ${jenkins_scripts_root}/retry.sh
+                              docker pull tlcpackstaging/ci_lint:${tag}
+                              docker tag tlcpackstaging/ci_lint:${tag} tlcpack/ci-lint:${tag}
+                              retry 5 docker push tlcpack/ci-lint:${tag}
+                            """,
+                            label: 'Tag tlcpackstaging/ci_lint image to tlcpack',
+                          )
+                        }
+                        if (ci_minimal.contains("tlcpackstaging")) {
+                          // Push image to tlcpack
+                          def tag = ci_minimal.split(":")[1]
+                          sh(
+                            script: """
+                              set -eux
+                              . ${jenkins_scripts_root}/retry.sh
+                              docker pull tlcpackstaging/ci_minimal:${tag}
+                              docker tag tlcpackstaging/ci_minimal:${tag} tlcpack/ci-minimal:${tag}
+                              retry 5 docker push tlcpack/ci-minimal:${tag}
+                            """,
+                            label: 'Tag tlcpackstaging/ci_minimal image to tlcpack',
+                          )
+                        }
+                        if (ci_riscv.contains("tlcpackstaging")) {
+                          // Push image to tlcpack
+                          def tag = ci_riscv.split(":")[1]
+                          sh(
+                            script: """
+                              set -eux
+                              . ${jenkins_scripts_root}/retry.sh
+                              docker pull tlcpackstaging/ci_riscv:${tag}
+                              docker tag tlcpackstaging/ci_riscv:${tag} tlcpack/ci-riscv:${tag}
+                              retry 5 docker push tlcpack/ci-riscv:${tag}
+                            """,
+                            label: 'Tag tlcpackstaging/ci_riscv image to tlcpack',
+                          )
+                        }
+                        if (ci_wasm.contains("tlcpackstaging")) {
+                          // Push image to tlcpack
+                          def tag = ci_wasm.split(":")[1]
+                          sh(
+                            script: """
+                              set -eux
+                              . ${jenkins_scripts_root}/retry.sh
+                              docker pull tlcpackstaging/ci_wasm:${tag}
+                              docker tag tlcpackstaging/ci_wasm:${tag} tlcpack/ci-wasm:${tag}
+                              retry 5 docker push tlcpack/ci-wasm:${tag}
+                            """,
+                            label: 'Tag tlcpackstaging/ci_wasm image to tlcpack',
+                          )
+                        }
+                      } finally {
+                        sh(
+                          script: 'docker logout',
+                          label: 'Clean up login credentials'
+                        )
+                      }
+                    }
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('Tag tlcpackstaging to tlcpack')
+    }
+  },
+      )
+    }
+  }
+}
+
+deploy()
diff --git a/ci/jenkins/generated/gpu_jenkinsfile.groovy b/ci/jenkins/generated/gpu_jenkinsfile.groovy
new file mode 100644
index 000000000000..c226255e0e6e
--- /dev/null
+++ b/ci/jenkins/generated/gpu_jenkinsfile.groovy
@@ -0,0 +1,1294 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T14:48:42.195581
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+def build() {
+  stage('Build') {
+    if (!skip_ci) {
+      node('CPU-SMALL') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-gpu") {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
+        cmake_build("${ci_gpu} --no-gpu", 'build', '-j2')
+        make_standalone_crt("${ci_gpu} --no-gpu", 'build')
+        sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu --items build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/microtvm_template_projects build/crttest build/standalone_crt build/build.ninja",
+            label: 'Upload artifacts to S3',
+          )
+
+
+        // compiler test
+        sh "rm -rf build"
+        sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
+        cmake_build("${ci_gpu} --no-gpu", 'build', '-j2')
+        make_standalone_crt("${ci_gpu} --no-gpu", 'build')
+        sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu2 --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/crttest build/standalone_crt build/build.ninja",
+            label: 'Upload artifacts to S3',
+          )
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: GPU')
+    }
+  }
+}
+build()
+
+
+
+def shard_run_unittest_GPU_1_of_3() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=unittest: GPU',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu2",
+                  label: 'Download artifacts from S3',
+                )
+
+              sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
+              // These require a GPU to finish the build (i.e. CUDA needs to be load-able)
+              make_standalone_crt(ci_gpu, 'build')
+              // make_cpp_tests(ci_gpu, 'build')
+              // cpp_unittest(ci_gpu)
+
+              sh "rm -rf build"
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
+              make_standalone_crt(ci_gpu, 'build')
+              make_cpp_tests(ci_gpu, 'build')
+              cpp_unittest(ci_gpu)
+              micro_cpp_unittest(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh",
+                label: 'Run Python GPU unit tests',
+              )
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh",
+                label: 'Run Python GPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('unittest: GPU 1 of 3')
+  }
+}
+
+def shard_run_unittest_GPU_2_of_3() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=unittest: GPU',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=1',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh",
+                label: 'Run Java unit tests',
+              )
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh",
+                label: 'Run Python GPU unit tests',
+              )
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh",
+                label: 'Run Python GPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('unittest: GPU 2 of 3')
+  }
+}
+
+def shard_run_unittest_GPU_3_of_3() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=unittest: GPU',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=2',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh",
+                label: 'Run Python GPU unit tests',
+              )
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh",
+                label: 'Run Python GPU integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('unittest: GPU 3 of 3')
+  }
+}
+
+
+
+def shard_run_topi_GPU_1_of_3() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=topi: GPU',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+                label: 'Run TOPI tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('topi: GPU 1 of 3')
+  }
+}
+
+def shard_run_topi_GPU_2_of_3() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=topi: GPU',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=1',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+                label: 'Run TOPI tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('topi: GPU 2 of 3')
+  }
+}
+
+def shard_run_topi_GPU_3_of_3() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=topi: GPU',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=2',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+                label: 'Run TOPI tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/topi_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('topi: GPU 3 of 3')
+  }
+}
+
+
+
+def shard_run_frontend_GPU_1_of_6() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
+              'TVM_NUM_SHARDS=6',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                label: 'Run Python frontend tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('frontend: GPU 1 of 6')
+  }
+}
+
+def shard_run_frontend_GPU_2_of_6() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
+              'TVM_NUM_SHARDS=6',
+              'TVM_SHARD_INDEX=1',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                label: 'Run Python frontend tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('frontend: GPU 2 of 6')
+  }
+}
+
+def shard_run_frontend_GPU_3_of_6() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
+              'TVM_NUM_SHARDS=6',
+              'TVM_SHARD_INDEX=2',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                label: 'Run Python frontend tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('frontend: GPU 3 of 6')
+  }
+}
+
+def shard_run_frontend_GPU_4_of_6() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
+              'TVM_NUM_SHARDS=6',
+              'TVM_SHARD_INDEX=3',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                label: 'Run Python frontend tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('frontend: GPU 4 of 6')
+  }
+}
+
+def shard_run_frontend_GPU_5_of_6() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
+              'TVM_NUM_SHARDS=6',
+              'TVM_SHARD_INDEX=4',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                label: 'Run Python frontend tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('frontend: GPU 5 of 6')
+  }
+}
+
+def shard_run_frontend_GPU_6_of_6() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=frontend: GPU',
+              'TVM_NUM_SHARDS=6',
+              'TVM_SHARD_INDEX=5',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                label: 'Run Python frontend tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/frontend_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('frontend: GPU 6 of 6')
+  }
+}
+
+
+
+def shard_run_docs_GPU_1_of_1() {
+  if (!skip_ci) {
+    node('GPU') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/docs-python-gpu") {
+        try {
+          init_git()
+          docker_init(ci_gpu)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=gpu',
+              'TEST_STEP_NAME=docs: GPU',
+              'TVM_NUM_SHARDS=1',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/gpu",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_gpu)
+              sh (
+                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh",
+                label: 'Build docs',
+              )
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/docs --items docs.tgz",
+                  label: 'Upload artifacts to S3',
+                )
+
+              sh(
+                script: "aws s3 cp --no-progress _docs s3://${s3_bucket}/${s3_prefix}/docs --recursive",
+                label: 'Upload docs to S3',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/docs_GPU --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('docs: GPU 1 of 1')
+  }
+}
+
+
+
+def test() {
+  stage('Test') {
+    environment {
+      SKIP_SLOW_TESTS = "${skip_slow_tests}"
+    }
+    parallel(
+    'unittest: GPU 1 of 3': {
+      shard_run_unittest_GPU_1_of_3()
+    },
+    'unittest: GPU 2 of 3': {
+      shard_run_unittest_GPU_2_of_3()
+    },
+    'unittest: GPU 3 of 3': {
+      shard_run_unittest_GPU_3_of_3()
+    },
+    'topi: GPU 1 of 3': {
+      shard_run_topi_GPU_1_of_3()
+    },
+    'topi: GPU 2 of 3': {
+      shard_run_topi_GPU_2_of_3()
+    },
+    'topi: GPU 3 of 3': {
+      shard_run_topi_GPU_3_of_3()
+    },
+    'frontend: GPU 1 of 6': {
+      shard_run_frontend_GPU_1_of_6()
+    },
+    'frontend: GPU 2 of 6': {
+      shard_run_frontend_GPU_2_of_6()
+    },
+    'frontend: GPU 3 of 6': {
+      shard_run_frontend_GPU_3_of_6()
+    },
+    'frontend: GPU 4 of 6': {
+      shard_run_frontend_GPU_4_of_6()
+    },
+    'frontend: GPU 5 of 6': {
+      shard_run_frontend_GPU_5_of_6()
+    },
+    'frontend: GPU 6 of 6': {
+      shard_run_frontend_GPU_6_of_6()
+    },
+    'docs: GPU 1 of 1': {
+      shard_run_docs_GPU_1_of_1()
+    },
+    )
+  }
+}
+test()
+
+
+
+def deploy_docs() {
+  // Note: This code must stay in the Jenkinsfile to ensure that it runs
+  // from a trusted context only
+  sh(
+    script: '''
+      set -eux
+      rm -rf tvm-site
+      git clone -b $DOCS_DEPLOY_BRANCH --depth=1 https://github.com/apache/tvm-site
+      cd tvm-site
+      git status
+      git checkout -B $DOCS_DEPLOY_BRANCH
+
+      git ls-tree HEAD docs/ --name-only | grep -vP '^docs/v\\d' | xargs rm -rf
+      mkdir -p docs
+      tar xf ../docs.tgz -C docs
+      COMMIT=$(cat docs/commit_hash)
+      git add .
+      git config user.name tvm-bot
+      git config user.email 95660001+tvm-bot@users.noreply.github.com
+      git commit -m"deploying docs (apache/tvm@$COMMIT)"
+      git status
+    ''',
+    label: 'Unpack docs and update tvm-site'
+  )
+
+  withCredentials([string(
+    credentialsId: 'docs-push-token',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh(
+      script: '''
+        cd tvm-site
+        git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git
+        git push deploy $DOCS_DEPLOY_BRANCH || true
+      ''',
+      label: 'Upload docs to apache/tvm-site'
+    )
+  }
+}
+
+def deploy() {
+  stage('Deploy') {
+    if (env.BRANCH_NAME == 'main') {
+      parallel(
+  'Deploy Docs': {
+    if (env.DOCS_DEPLOY_ENABLED == 'yes') {
+      node('CPU') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docs") {
+          timeout(time: max_time, unit: 'MINUTES') {
+            init_git()
+                    sh(
+                script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/docs",
+                label: 'Download artifacts from S3',
+              )
+
+                    deploy_docs()
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('Deploy Docs')
+    }
+  },
+      )
+    }
+  }
+}
+
+deploy()
diff --git a/ci/jenkins/generated/hexagon_jenkinsfile.groovy b/ci/jenkins/generated/hexagon_jenkinsfile.groovy
new file mode 100644
index 000000000000..6296d0c5c868
--- /dev/null
+++ b/ci/jenkins/generated/hexagon_jenkinsfile.groovy
@@ -0,0 +1,931 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T14:48:42.065368
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+def build() {
+  stage('Build') {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-hexagon") {
+          init_git()
+          docker_init(ci_hexagon)
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
+          script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
+          label: 'Create Hexagon cmake config',
+        )
+        cmake_build(ci_hexagon, 'build', '-j2')
+        make_cpp_tests(ci_hexagon, 'build')
+        sh (
+          script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
+          label: 'Build Hexagon API',
+        )
+        sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/hexagon_api_output",
+            label: 'Upload artifacts to S3',
+          )
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: Hexagon')
+    }
+  }
+}
+build()
+
+
+
+
+def shard_run_test_Hexagon_1_of_8() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+        try {
+          init_git()
+          docker_init(ci_hexagon)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_hexagon)
+              cpp_unittest(ci_hexagon)
+              sh (
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Hexagon 1 of 8')
+  }
+}
+
+def shard_run_test_Hexagon_2_of_8() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+        try {
+          init_git()
+          docker_init(ci_hexagon)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=1',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_hexagon)
+              sh (
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Hexagon 2 of 8')
+  }
+}
+
+def shard_run_test_Hexagon_3_of_8() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+        try {
+          init_git()
+          docker_init(ci_hexagon)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=2',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_hexagon)
+              sh (
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Hexagon 3 of 8')
+  }
+}
+
+def shard_run_test_Hexagon_4_of_8() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+        try {
+          init_git()
+          docker_init(ci_hexagon)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=3',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_hexagon)
+              sh (
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Hexagon 4 of 8')
+  }
+}
+
+def shard_run_test_Hexagon_5_of_8() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+        try {
+          init_git()
+          docker_init(ci_hexagon)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=4',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_hexagon)
+              sh (
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Hexagon 5 of 8')
+  }
+}
+
+def shard_run_test_Hexagon_6_of_8() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+        try {
+          init_git()
+          docker_init(ci_hexagon)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=5',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_hexagon)
+              sh (
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Hexagon 6 of 8')
+  }
+}
+
+def shard_run_test_Hexagon_7_of_8() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+        try {
+          init_git()
+          docker_init(ci_hexagon)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=6',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_hexagon)
+              sh (
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Hexagon 7 of 8')
+  }
+}
+
+def shard_run_test_Hexagon_8_of_8() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
+        try {
+          init_git()
+          docker_init(ci_hexagon)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=hexagon',
+              'TEST_STEP_NAME=test: Hexagon',
+              'TVM_NUM_SHARDS=8',
+              'TVM_SHARD_INDEX=7',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_hexagon)
+              sh (
+                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+                label: 'Run Hexagon tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: Hexagon 8 of 8')
+  }
+}
+
+
+def test() {
+  stage('Test') {
+    environment {
+      SKIP_SLOW_TESTS = "${skip_slow_tests}"
+    }
+    parallel(
+    'test: Hexagon 1 of 8': {
+      shard_run_test_Hexagon_1_of_8()
+    },
+    'test: Hexagon 2 of 8': {
+      shard_run_test_Hexagon_2_of_8()
+    },
+    'test: Hexagon 3 of 8': {
+      shard_run_test_Hexagon_3_of_8()
+    },
+    'test: Hexagon 4 of 8': {
+      shard_run_test_Hexagon_4_of_8()
+    },
+    'test: Hexagon 5 of 8': {
+      shard_run_test_Hexagon_5_of_8()
+    },
+    'test: Hexagon 6 of 8': {
+      shard_run_test_Hexagon_6_of_8()
+    },
+    'test: Hexagon 7 of 8': {
+      shard_run_test_Hexagon_7_of_8()
+    },
+    'test: Hexagon 8 of 8': {
+      shard_run_test_Hexagon_8_of_8()
+    },
+    )
+  }
+}
+test()
diff --git a/ci/jenkins/generated/i386_jenkinsfile.groovy b/ci/jenkins/generated/i386_jenkinsfile.groovy
new file mode 100644
index 000000000000..f0170f586721
--- /dev/null
+++ b/ci/jenkins/generated/i386_jenkinsfile.groovy
@@ -0,0 +1,693 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T14:48:42.016799
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+def build() {
+  stage('Build') {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-i386") {
+          init_git()
+          docker_init(ci_i386)
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
+          script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
+          label: 'Create i386 cmake config',
+        )
+        cmake_build(ci_i386, 'build', '-j2')
+        make_standalone_crt(ci_i386, 'build')
+        make_cpp_tests(ci_i386, 'build')
+        sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/i386 --items build/libvta_tsim.so build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/standalone_crt build/build.ninja build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja",
+            label: 'Upload artifacts to S3',
+          )
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: i386')
+    }
+  }
+}
+build()
+
+
+
+
+def shard_run_python_i386_1_of_3() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
+        try {
+          init_git()
+          docker_init(ci_i386)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=i386',
+              'TEST_STEP_NAME=python: i386',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/i386",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_i386)
+              cpp_unittest(ci_i386)
+              micro_cpp_unittest(ci_i386)
+              python_unittest(ci_i386)
+              sh (
+                script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
+                label: 'Run i386 integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/python_i386 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('python: i386 1 of 3')
+  }
+}
+
+def shard_run_python_i386_2_of_3() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
+        try {
+          init_git()
+          docker_init(ci_i386)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=i386',
+              'TEST_STEP_NAME=python: i386',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=1',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/i386",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_i386)
+              python_unittest(ci_i386)
+              sh (
+                script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
+                label: 'Run i386 integration tests',
+              )
+              fsim_test(ci_i386)
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/python_i386 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('python: i386 2 of 3')
+  }
+}
+
+def shard_run_python_i386_3_of_3() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
+        try {
+          init_git()
+          docker_init(ci_i386)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=i386',
+              'TEST_STEP_NAME=python: i386',
+              'TVM_NUM_SHARDS=3',
+              'TVM_SHARD_INDEX=2',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/i386",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_i386)
+              python_unittest(ci_i386)
+              sh (
+                script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
+                label: 'Run i386 integration tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/python_i386 --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('python: i386 3 of 3')
+  }
+}
+
+
+def test() {
+  stage('Test') {
+    environment {
+      SKIP_SLOW_TESTS = "${skip_slow_tests}"
+    }
+    parallel(
+    'python: i386 1 of 3': {
+      shard_run_python_i386_1_of_3()
+    },
+    'python: i386 2 of 3': {
+      shard_run_python_i386_2_of_3()
+    },
+    'python: i386 3 of 3': {
+      shard_run_python_i386_3_of_3()
+    },
+    )
+  }
+}
+test()
diff --git a/ci/jenkins/generated/lint_jenkinsfile.groovy b/ci/jenkins/generated/lint_jenkinsfile.groovy
new file mode 100644
index 000000000000..ee63a1008b13
--- /dev/null
+++ b/ci/jenkins/generated/lint_jenkinsfile.groovy
@@ -0,0 +1,545 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T14:48:42.041376
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+
+stage('Lint') {
+  parallel(
+  'Lint 1 of 2': {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") {
+        init_git()
+        docker_init(ci_lint)
+        timeout(time: max_time, unit: 'MINUTES') {
+          withEnv([
+            'TVM_NUM_SHARDS=2',
+            'TEST_STEP_NAME=Lint',
+            'TVM_SHARD_INDEX=0',
+            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+            sh (
+              script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh",
+              label: 'Run lint',
+            )
+          })
+        }
+      }
+    }
+  },
+  'Lint 2 of 2': {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") {
+        init_git()
+        docker_init(ci_lint)
+        timeout(time: max_time, unit: 'MINUTES') {
+          withEnv([
+            'TVM_NUM_SHARDS=2',
+            'TEST_STEP_NAME=Lint',
+            'TVM_SHARD_INDEX=1',
+            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+            sh (
+              script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh",
+              label: 'Run lint',
+            )
+          })
+        }
+      }
+    }
+  },
+  )
+}
diff --git a/ci/jenkins/generated/minimal_jenkinsfile.groovy b/ci/jenkins/generated/minimal_jenkinsfile.groovy
new file mode 100644
index 000000000000..4c9f469b3bb6
--- /dev/null
+++ b/ci/jenkins/generated/minimal_jenkinsfile.groovy
@@ -0,0 +1,589 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T23:21:03.010229
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+def build() {
+  stage('Build') {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cpu-minimal") {
+          init_git()
+          docker_init(ci_minimal)
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
+          script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
+          label: 'Create CPU minimal cmake config',
+        )
+        cmake_build(ci_minimal, 'build', '-j2')
+        make_standalone_crt(ci_minimal, 'build')
+        make_cpp_tests(ci_minimal, 'build')
+        sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu-minimal --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/standalone_crt build/build.ninja",
+            label: 'Upload artifacts to S3',
+          )
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: CPU MINIMAL')
+    }
+  }
+}
+build()
+
+
+
+
+def shard_run_unittest_CPU_MINIMAL_1_of_1() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu-minimal") {
+        try {
+          init_git()
+          docker_init(ci_minimal)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=minimal',
+              'TEST_STEP_NAME=unittest: CPU MINIMAL',
+              'TVM_NUM_SHARDS=1',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu-minimal",
+                  label: 'Download artifacts from S3',
+                )
+
+              cpp_unittest(ci_minimal)
+              python_unittest(ci_minimal)
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_CPU_MINIMAL --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('unittest: CPU MINIMAL 1 of 1')
+  }
+}
+
+
+
+def test() {
+  stage('Test') {
+    environment {
+      SKIP_SLOW_TESTS = "${skip_slow_tests}"
+    }
+    parallel(
+    'unittest: CPU MINIMAL 1 of 1': {
+      shard_run_unittest_CPU_MINIMAL_1_of_1()
+    },
+    )
+  }
+}
+test()
diff --git a/ci/jenkins/generated/riscv_jenkinsfile.groovy b/ci/jenkins/generated/riscv_jenkinsfile.groovy
new file mode 100644
index 000000000000..b485e9906f4c
--- /dev/null
+++ b/ci/jenkins/generated/riscv_jenkinsfile.groovy
@@ -0,0 +1,594 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T14:48:42.170796
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+def build() {
+  stage('Build') {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-riscv") {
+          init_git()
+          docker_init(ci_riscv)
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
+          script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
+          label: 'Create RISC-V cmake config',
+        )
+        cmake_build(ci_riscv, 'build', '-j2')
+        make_standalone_crt(ci_riscv, 'build')
+        make_cpp_tests(ci_riscv, 'build')
+        sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/riscv --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/standalone_crt build/build.ninja build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/microtvm_template_projects",
+            label: 'Upload artifacts to S3',
+          )
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: RISC-V')
+    }
+  }
+}
+build()
+
+
+
+
+
+def shard_run_test_RISC_V_1_of_1() {
+  if (!skip_ci && is_docs_only_build != 1) {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-riscv") {
+        try {
+          init_git()
+          docker_init(ci_riscv)
+          timeout(time: max_time, unit: 'MINUTES') {
+            withEnv([
+              'PLATFORM=riscv',
+              'TEST_STEP_NAME=test: RISC-V',
+              'TVM_NUM_SHARDS=1',
+              'TVM_SHARD_INDEX=0',
+              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+              sh(
+                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/riscv",
+                  label: 'Download artifacts from S3',
+                )
+
+              ci_setup(ci_riscv)
+              cpp_unittest(ci_cortexm)
+              micro_cpp_unittest(ci_cortexm)
+              sh (
+                script: "${docker_run} ${ci_riscv} ./tests/scripts/task_riscv_microtvm.sh",
+                label: 'Run microTVM tests',
+              )
+            })
+          }
+        } finally {
+          try {
+            sh(
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_RISC_V --items build/pytest-results",
+            label: 'Upload JUnits to S3',
+          )
+
+            junit 'build/pytest-results/*.xml'
+          } catch (Exception e) {
+            echo 'Exception during JUnit upload: ' + e.toString()
+          }
+        }
+      }
+    }
+  } else {
+    Utils.markStageSkippedForConditional('test: RISC-V 1 of 1')
+  }
+}
+
+
+def test() {
+  stage('Test') {
+    environment {
+      SKIP_SLOW_TESTS = "${skip_slow_tests}"
+    }
+    parallel(
+    'test: RISC-V 1 of 1': {
+      shard_run_test_RISC_V_1_of_1()
+    },
+    )
+  }
+}
+test()
diff --git a/ci/jenkins/generated/wasm_jenkinsfile.groovy b/ci/jenkins/generated/wasm_jenkinsfile.groovy
new file mode 100644
index 000000000000..0c7c2ccf2aaa
--- /dev/null
+++ b/ci/jenkins/generated/wasm_jenkinsfile.groovy
@@ -0,0 +1,530 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Docker env used for testing
+// Different image may have different version tag
+// because some of them are more stable than anoter.
+//
+// Docker images are maintained by PMC, cached in dockerhub
+// and remains relatively stable over the time.
+// Flow for upgrading docker env(need commiter)
+//
+// - Send PR to upgrade build script in the repo
+// - Build the new docker image
+// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR
+// - Fix any issues wrt to the new image version in the PR
+// - Merge the PR and now we are in new version
+// - Tag the new version as the lates
+// - Periodically cleanup the old versions on local workers
+//
+
+// ============================= IMPORTANT NOTE =============================
+// This file is generated by 'jenkins/generate.py'. Do not edit this file directly!
+// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with
+// 'python3 jenkins/generate.py'
+// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
+// always rebased on main before merging:
+// Generated at 2022-12-05T14:48:42.147157
+
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
+ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
+ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
+ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
+ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
+ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
+ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
+ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
+ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
+ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+
+// Parameters to allow overriding (in Jenkins UI), the images
+// to be used by a given build. When provided, they take precedence
+// over default values above.
+properties([
+  parameters([
+    string(name: 'ci_arm_param', defaultValue: ''),
+    string(name: 'ci_cortexm_param', defaultValue: ''),
+    string(name: 'ci_cpu_param', defaultValue: ''),
+    string(name: 'ci_gpu_param', defaultValue: ''),
+    string(name: 'ci_hexagon_param', defaultValue: ''),
+    string(name: 'ci_i386_param', defaultValue: ''),
+    string(name: 'ci_lint_param', defaultValue: ''),
+    string(name: 'ci_minimal_param', defaultValue: ''),
+    string(name: 'ci_riscv_param', defaultValue: ''),
+    string(name: 'ci_wasm_param', defaultValue: ''),
+  ])
+])
+
+// Placeholders for newly built Docker image names (if rebuild_docker_images
+// is used)
+  built_ci_arm = null;
+  built_ci_cortexm = null;
+  built_ci_cpu = null;
+  built_ci_gpu = null;
+  built_ci_hexagon = null;
+  built_ci_i386 = null;
+  built_ci_lint = null;
+  built_ci_minimal = null;
+  built_ci_riscv = null;
+  built_ci_wasm = null;
+
+// Global variable assigned during Sanity Check that holds the sha1 which should be
+// merged into the PR in all branches.
+upstream_revision = null
+
+// command to start a docker container
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME'
+docker_build = 'docker/build.sh'
+// timeout in minutes
+max_time = 180
+rebuild_docker_images = false
+
+s3_bucket = 'tvm-jenkins-artifacts-prod'
+s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// Jenkins script root directory
+jenkins_scripts_root = "ci/scripts/jenkins"
+
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
+def per_exec_ws(folder) {
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// initialize source codes
+def init_git() {
+  retry(5) {
+    checkout scm
+  }
+
+  // Add more info about job node
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Determine merge commit to use for all stages
+  if (env.BRANCH_NAME == 'main') {
+    // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main.
+    update_upstream_revision("HEAD")
+  } else {
+    // This is PR branch so merge with latest main.
+    merge_with_main()
+  }
+
+  sh(
+    script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 3 timeout 5m git submodule update --init -f --jobs 0
+    """,
+    label: 'Update git submodules',
+  )
+  checkout_trusted_files()
+}
+
+def update_upstream_revision(git_ref) {
+  if (upstream_revision == null) {
+    upstream_revision = sh(
+      script: "git log -1 ${git_ref} --format=\'%H\'",
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+}
+
+def merge_with_main() {
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  update_upstream_revision("FETCH_HEAD")
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+}
+
+def docker_init(image) {
+  // Clear out all Docker images that aren't going to be used
+  sh(
+    script: """
+    set -eux
+    docker image ls --all
+    IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}}  {{.ID}}')
+
+    echo -e "Found images:\\n\$IMAGES"
+    echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }
+
+    docker image ls --all
+    """,
+    label: 'Clean old Docker images',
+  )
+
+  if (image.contains("amazonaws.com")) {
+    // If this string is in the image name it's from ECR and needs to be pulled
+    // with the right credentials
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
+  } else {
+    sh(
+      script: """
+      set -eux
+      . ${jenkins_scripts_root}/retry.sh
+      retry 5 docker pull ${image}
+      """,
+      label: 'Pull docker image',
+    )
+  }
+}
+
+def should_skip_slow_tests(pr_number) {
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0
+}
+
+def cancel_previous_build() {
+  // cancel previous build if it is not on main.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def checkout_trusted_files() {
+  // trust everything from branch builds
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    return;
+  }
+
+  // trust peoople listed in CONTRIBUTING.md
+  grep_code = sh(
+    returnStatus: true,
+    script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'",
+    label: 'Check if change is from a contributor',
+  )
+
+  if (grep_code == 1) {
+    // Any scripts that run on the bare host and not inside a Docker container
+    // (especially those that access secrets) should be checked out here so
+    // only trusted versions are used in CI
+    sh(
+      script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.",
+      label: 'Check out trusted files',
+    )
+  }
+}
+
+def should_skip_ci(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./${jenkins_scripts_root}/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def check_pr(pr_number) {
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // never skip CI on build sourced from a branch
+    return false
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh (
+      script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}",
+      label: 'Check PR title and body',
+    )
+  }
+
+}
+
+def prepare() {
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
+        init_git()
+
+        check_pr(env.CHANGE_ID)
+
+        if (env.DETERMINE_DOCKER_IMAGES == 'yes') {
+          sh(
+            script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ",
+            label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images',
+          )
+          // Pull image names from the results of should_rebuild_docker.py
+          ci_arm = sh(
+            script: "cat .docker-image-names/ci_arm",
+            label: "Find docker image name for ci_arm",
+            returnStdout: true,
+          ).trim()
+          ci_cortexm = sh(
+            script: "cat .docker-image-names/ci_cortexm",
+            label: "Find docker image name for ci_cortexm",
+            returnStdout: true,
+          ).trim()
+          ci_cpu = sh(
+            script: "cat .docker-image-names/ci_cpu",
+            label: "Find docker image name for ci_cpu",
+            returnStdout: true,
+          ).trim()
+          ci_gpu = sh(
+            script: "cat .docker-image-names/ci_gpu",
+            label: "Find docker image name for ci_gpu",
+            returnStdout: true,
+          ).trim()
+          ci_hexagon = sh(
+            script: "cat .docker-image-names/ci_hexagon",
+            label: "Find docker image name for ci_hexagon",
+            returnStdout: true,
+          ).trim()
+          ci_i386 = sh(
+            script: "cat .docker-image-names/ci_i386",
+            label: "Find docker image name for ci_i386",
+            returnStdout: true,
+          ).trim()
+          ci_lint = sh(
+            script: "cat .docker-image-names/ci_lint",
+            label: "Find docker image name for ci_lint",
+            returnStdout: true,
+          ).trim()
+          ci_minimal = sh(
+            script: "cat .docker-image-names/ci_minimal",
+            label: "Find docker image name for ci_minimal",
+            returnStdout: true,
+          ).trim()
+          ci_riscv = sh(
+            script: "cat .docker-image-names/ci_riscv",
+            label: "Find docker image name for ci_riscv",
+            returnStdout: true,
+          ).trim()
+          ci_wasm = sh(
+            script: "cat .docker-image-names/ci_wasm",
+            label: "Find docker image name for ci_wasm",
+            returnStdout: true,
+          ).trim()
+        }
+
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cortexm = params.ci_cortexm_param ?: ci_cortexm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_minimal = params.ci_minimal_param ?: ci_minimal
+        ci_riscv = params.ci_riscv_param ?: ci_riscv
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cortexm = ${ci_cortexm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_minimal = ${ci_minimal}"
+          echo " ci_riscv = ${ci_riscv}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docs.sh",
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: "./${jenkins_scripts_root}/git_change_docker.sh",
+          label: 'Check for any docker changes',
+        )
+
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
+
+cancel_previous_build()
+
+prepare()
+def build() {
+  stage('Build') {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-wasm") {
+          init_git()
+          docker_init(ci_wasm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh (
+          script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
+          label: 'Create WASM cmake config',
+        )
+        cmake_build(ci_wasm, 'build', '-j2')
+        make_standalone_crt(ci_wasm, 'build')
+        make_cpp_tests(ci_wasm, 'build')
+        cpp_unittest(ci_wasm)
+        ci_setup(ci_wasm)
+        sh (
+          script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh",
+          label: 'Run WASM lint and tests',
+        )
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: WASM')
+    }
+  }
+}
+build()
diff --git a/ci/jenkins/requirements.txt b/ci/jenkins/requirements.txt
deleted file mode 100644
index d8086eca6e41..000000000000
--- a/ci/jenkins/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-Jinja2>=3.0.0
diff --git a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..6cffd5cbbe66
--- /dev/null
+++ b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+{% call m.invoke_build(
+  name='BUILD: arm',
+  node='ARM-SMALL',
+  condition='!skip_ci && is_docs_only_build != 1',
+  ws='tvm/build-arm',
+  docker_image='ci_arm',
+) %}
+  sh (
+    script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
+    label: 'Create ARM cmake config',
+  )
+  cmake_build(ci_arm, 'build', '-j4')
+  make_standalone_crt(ci_arm, 'build')
+  make_cpp_tests(ci_arm, 'build')
+  {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib + cpptest + crttest + standalone_crt) }}
+{% endcall %}
+
+{% set test_method_names = [] %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="integration: aarch64",
+  num_shards=4,
+  node="ARM-SMALL",
+  ws="tvm/ut-python-arm",
+  platform="arm",
+  docker_image="ci_arm",
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='arm') }}
+  ci_setup(ci_arm)
+  python_unittest(ci_arm)
+  sh (
+    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+    label: 'Run CPU integration tests',
+  )
+{% endcall %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="topi: aarch64",
+  node="ARM-SMALL",
+  ws="tvm/ut-python-arm",
+  platform="arm",
+  docker_image="ci_arm",
+  num_shards=2,
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='arm') }}
+  ci_setup(ci_arm)
+  {% if shard_index == 1 %}
+  cpp_unittest(ci_arm)
+  micro_cpp_unittest(ci_arm)
+  {% endif %}
+  sh (
+    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
+    label: 'Run test_arm_compute_lib test',
+  )
+  sh (
+    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
+    label: 'Run TOPI tests',
+  )
+{% endcall %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="frontend: aarch64",
+  node="ARM-SMALL",
+  ws="tvm/frontend-python-arm",
+  platform="arm",
+  docker_image="ci_arm",
+  num_shards=2,
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='arm') }}
+  ci_setup(ci_arm)
+  sh (
+    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
+    label: 'Run Python frontend tests',
+  )
+{% endcall %}
+
+{{ m.invoke_tests(test_method_names) -}}
diff --git a/ci/jenkins/templates/cortexm_jenkinsfile.groovy.j2 b/ci/jenkins/templates/cortexm_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..acbc147f408e
--- /dev/null
+++ b/ci/jenkins/templates/cortexm_jenkinsfile.groovy.j2
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+{% call m.invoke_build(
+  name='BUILD: Cortex-M',
+  node='CPU-SMALL',
+  condition='!skip_ci && is_docs_only_build != 1',
+  ws='tvm/build-cortexm',
+  docker_image='ci_cortexm',
+) %}
+  sh (
+    script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build",
+    label: 'Create Cortex-M cmake config',
+  )
+  cmake_build(ci_cortexm, 'build', '-j2')
+  make_standalone_crt(ci_cortexm, 'build')
+  make_cpp_tests(ci_cortexm, 'build')
+  {{ m.upload_artifacts(tag='cortexm', filenames=tvm_lib + tvm_allvisible + crttest + standalone_crt + cpptest + microtvm_template_projects) }}
+{% endcall %}
+
+{% set test_method_names = [] %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="test: Cortex-M",
+  node="CPU-SMALL",
+  ws="tvm/test-cortexm",
+  platform="cortexm",
+  docker_image="ci_cortexm",
+  num_shards=12,
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='cortexm') }}
+  ci_setup(ci_cortexm)
+  {% if shard_index == 1%}
+  cpp_unittest(ci_cortexm)
+  micro_cpp_unittest(ci_cortexm)
+  sh (
+    script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_demo_microtvm.sh",
+    label: 'Run microTVM demos',
+  )
+  {% endif %}
+  sh (
+    script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh",
+    label: 'Run microTVM tests',
+  )
+{% endcall %}
+
+{{ m.invoke_tests(test_method_names) -}}
diff --git a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..fa2be6584ff0
--- /dev/null
+++ b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+{% call m.invoke_build(
+  name='BUILD: CPU',
+  node='CPU-SMALL',
+  condition='!skip_ci && is_docs_only_build != 1',
+  ws='tvm/build-cpu',
+  docker_image='ci_cpu',
+) %}
+  sh (
+    script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
+    label: 'Create CPU cmake config',
+  )
+  cmake_build(ci_cpu, 'build', '-j2')
+  make_standalone_crt(ci_cpu, 'build')
+  make_cpp_tests(ci_cpu, 'build')
+  {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim + tvm_allvisible + crttest + cpptest + standalone_crt) }}
+  ci_setup(ci_cpu)
+  // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
+  // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch
+  sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test')
+{% endcall %}
+
+{% set test_method_names = [] %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="integration: CPU",
+  node="CPU-SMALL",
+  num_shards=4,
+  ws="tvm/integration-python-cpu",
+  platform="cpu",
+  docker_image="ci_cpu",
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='cpu') }}
+  ci_setup(ci_cpu)
+  sh (
+    script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
+    label: 'Run CPU integration tests',
+  )
+{% endcall %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="unittest: CPU",
+  node="CPU-SMALL",
+  ws="tvm/ut-python-cpu",
+  platform="cpu",
+  num_shards=1,
+  docker_image="ci_cpu",
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='cpu') }}
+  ci_setup(ci_cpu)
+  cpp_unittest(ci_cpu)
+  micro_cpp_unittest(ci_cpu)
+  python_unittest(ci_cpu)
+  fsim_test(ci_cpu)
+  sh (
+    script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh",
+    label: 'Run VTA tests in TSIM',
+  )
+{% endcall %}
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="frontend: CPU",
+  node="CPU-SMALL",
+  ws="tvm/frontend-python-cpu",
+  platform="cpu",
+  num_shards=1,
+  docker_image="ci_cpu",
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='cpu') }}
+  ci_setup(ci_cpu)
+  sh (
+    script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh",
+    label: 'Run Python frontend tests',
+  )
+{% endcall %}
+
+{{ m.invoke_tests(test_method_names) -}}
diff --git a/ci/jenkins/templates/docker_jenkinsfile.groovy.j2 b/ci/jenkins/templates/docker_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..016a1c7bc8e9
--- /dev/null
+++ b/ci/jenkins/templates/docker_jenkinsfile.groovy.j2
@@ -0,0 +1,239 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+def ecr_push(full_name) {
+  aws_account_id = sh(
+    returnStdout: true,
+    script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"',
+    label: 'Get AWS ID'
+  ).trim()
+
+  def ecr_name = "${aws_account_id}.{{ aws_ecr_url }}/${full_name}"
+  try {
+    withEnv([
+      "AWS_ACCOUNT_ID=${aws_account_id}",
+      'AWS_DEFAULT_REGION={{ aws_default_region }}',
+      "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) {
+      sh(
+        script: '''
+          set -eux
+          aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO
+        ''',
+        label: 'Log in to ECR'
+      )
+      sh(
+        script: """
+          set -x
+          . ${jenkins_scripts_root}/retry.sh
+          docker tag ${full_name} \$AWS_ECR_REPO/${full_name}
+          retry 5 docker push \$AWS_ECR_REPO/${full_name}
+        """,
+        label: 'Upload image to ECR'
+      )
+    }
+  } finally {
+    withEnv([
+      "AWS_ACCOUNT_ID=${aws_account_id}",
+      'AWS_DEFAULT_REGION={{ aws_default_region }}',
+      "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) {
+      sh(
+        script: 'docker logout $AWS_ECR_REPO',
+        label: 'Clean up login credentials'
+      )
+    }
+  }
+  return ecr_name
+}
+
+def ecr_pull(full_name) {
+  aws_account_id = sh(
+    returnStdout: true,
+    script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"',
+    label: 'Get AWS ID'
+  ).trim()
+
+  try {
+    withEnv([
+      "AWS_ACCOUNT_ID=${aws_account_id}",
+      'AWS_DEFAULT_REGION={{ aws_default_region }}',
+      "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) {
+      sh(
+        script: '''
+          set -eux
+          aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO
+        ''',
+        label: 'Log in to ECR'
+      )
+      sh(
+        script: """
+          set -eux
+          . ci/scripts/retry.sh
+          retry 5 docker pull ${full_name}
+        """,
+        label: 'Pull image from ECR'
+      )
+    }
+  } finally {
+    withEnv([
+      "AWS_ACCOUNT_ID=${aws_account_id}",
+      'AWS_DEFAULT_REGION={{ aws_default_region }}',
+      "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) {
+      sh(
+        script: 'docker logout $AWS_ECR_REPO',
+        label: 'Clean up login credentials'
+      )
+    }
+  }
+}
+
+def build_image(image_name) {
+  hash = sh(
+    returnStdout: true,
+    script: 'git log -1 --format=\'%h\''
+  ).trim()
+  def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}"
+  sh(
+    script: "${docker_build} ${image_name} --spec ${full_name}",
+    label: 'Build docker image'
+  )
+  return ecr_push(full_name)
+}
+
+def update_docker(ecr_image, hub_image) {
+  if (ecr_image == null) {
+    sh("image was not rebuilt, skipping")
+    return
+  }
+  if (!ecr_image.contains("amazonaws.com")) {
+    sh("echo \"Skipping '${ecr_image}' -> '${hub_image}' since it doesn\'t look like an ECR image\"")
+    return
+  }
+  docker_init(ecr_image)
+  sh(
+    script: """
+    set -eux
+    . ${jenkins_scripts_root}/retry.sh
+    docker tag \
+      ${ecr_image} \
+      ${hub_image}
+    retry 5 docker push ${hub_image}
+    """,
+    label: "Update ${hub_image} on Docker Hub",
+  )
+}
+
+stage('Docker Image Build') {
+  parallel(
+  {% for image in images %}
+    '{{ image.name }}': {
+      node('{{ image.platform }}') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          // We're purposefully not setting the built image here since they
+          // are not yet being uploaded to tlcpack
+          // {{ image.name }} = build_image('{{ image.name }}')
+          built_{{ image.name }} = build_image('{{ image.name }}');
+        }
+      }
+    },
+  {% endfor %}
+  )
+}
+
+def deploy() {
+  stage('Deploy') {
+    if (env.BRANCH_NAME == 'main') {
+      parallel(
+        {% call m.deploy_step(
+          name="Upload built Docker images",
+          feature_flag="env.DEPLOY_DOCKER_IMAGES == 'yes' && rebuild_docker_images && upstream_revision != null",
+          ws="tvm/deploy-docker",
+        ) %}
+          init_git()
+          try {
+            withCredentials([string(
+              credentialsId: 'dockerhub-tlcpackstaging-key',
+              variable: 'DOCKERHUB_KEY',
+            )]) {
+              sh(
+                script: 'docker login -u tlcpackstaging -p ${DOCKERHUB_KEY}',
+                label: 'Log in to Docker Hub',
+              )
+            }
+            def date_Ymd_HMS = sh(
+              script: 'python3 -c \'import datetime; print(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))\'',
+              label: 'Determine date',
+              returnStdout: true,
+            ).trim()
+            def tag = "${date_Ymd_HMS}-${upstream_revision.substring(0, 8)}"
+            {% for image in images %}
+            update_docker(built_{{ image.name }}, "tlcpackstaging/{{ image.name }}:${tag}")
+            {% endfor %}
+          } finally {
+            sh(
+              script: 'docker logout',
+              label: 'Clean up login credentials'
+            )
+          }
+        {% endcall %}
+        {% call m.deploy_step(
+          name="Tag tlcpackstaging to tlcpack",
+          feature_flag="env.DEPLOY_DOCKER_IMAGES == 'yes'",
+          ws="tvm/tag-images",
+        ) %}
+          init_git()
+          withCredentials([string(
+            credentialsId: 'dockerhub-tlcpack-key',
+            variable: 'TLCPACK_TOKEN',
+          )]) {
+            try {
+              sh(
+                script: 'echo $TLCPACK_TOKEN | docker login --username octomldriazati --password-stdin',
+                label: 'Log in to Docker Hub'
+              )
+              {% for image in images %}
+              if ({{ image.name }}.contains("tlcpackstaging")) {
+                // Push image to tlcpack
+                def tag = {{ image.name }}.split(":")[1]
+                sh(
+                  script: """
+                    set -eux
+                    . ${jenkins_scripts_root}/retry.sh
+                    docker pull tlcpackstaging/{{ image.name }}:${tag}
+                    docker tag tlcpackstaging/{{ image.name }}:${tag} tlcpack/{{ image.name.replace("_", "-") }}:${tag}
+                    retry 5 docker push tlcpack/{{ image.name.replace("_", "-") }}:${tag}
+                  """,
+                  label: 'Tag tlcpackstaging/{{ image.name }} image to tlcpack',
+                )
+              }
+              {% endfor %}
+            } finally {
+              sh(
+                script: 'docker logout',
+                label: 'Clean up login credentials'
+              )
+            }
+          }
+        {% endcall %}
+      )
+    }
+  }
+}
+
+deploy()
diff --git a/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2 b/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..4a11a1bc427a
--- /dev/null
+++ b/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2
@@ -0,0 +1,206 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+{% call m.invoke_build(
+  name='BUILD: GPU',
+  node='CPU-SMALL',
+  condition='!skip_ci',
+  ws='tvm/build-gpu',
+  docker_image='ci_gpu',
+) %}
+  sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
+  cmake_build("${ci_gpu} --no-gpu", 'build', '-j2')
+  make_standalone_crt("${ci_gpu} --no-gpu", 'build')
+  {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib + tvm_allvisible + microtvm_template_projects + crttest + standalone_crt) }}
+
+  // compiler test
+  sh "rm -rf build"
+  sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
+  cmake_build("${ci_gpu} --no-gpu", 'build', '-j2')
+  make_standalone_crt("${ci_gpu} --no-gpu", 'build')
+  {{ m.upload_artifacts(tag='gpu2', filenames=tvm_lib + crttest + standalone_crt) }}
+{% endcall %}
+
+{% set test_method_names = [] %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="unittest: GPU",
+  num_shards=3,
+  node="GPU",
+  ws="tvm/ut-python-gpu",
+  platform="gpu",
+  docker_image="ci_gpu",
+  test_method_names=test_method_names,
+) %}
+  {% if shard_index == 1 %}
+  {{ m.download_artifacts(tag='gpu2') }}
+  sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build"
+  // These require a GPU to finish the build (i.e. CUDA needs to be load-able)
+  make_standalone_crt(ci_gpu, 'build')
+  // make_cpp_tests(ci_gpu, 'build')
+  // cpp_unittest(ci_gpu)
+
+  sh "rm -rf build"
+  {{ m.download_artifacts(tag='gpu') }}
+  ci_setup(ci_gpu)
+  sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
+  make_standalone_crt(ci_gpu, 'build')
+  make_cpp_tests(ci_gpu, 'build')
+  cpp_unittest(ci_gpu)
+  micro_cpp_unittest(ci_gpu)
+  {% else %}
+  {{ m.download_artifacts(tag='gpu') }}
+  ci_setup(ci_gpu)
+  {% endif %}
+  {% if shard_index == 2 or num_shards < 2 %}
+  sh (
+    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh",
+    label: 'Run Java unit tests',
+  )
+  {% endif %}
+  sh (
+    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh",
+    label: 'Run Python GPU unit tests',
+  )
+  sh (
+    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh",
+    label: 'Run Python GPU integration tests',
+  )
+{% endcall %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="topi: GPU",
+  node="GPU",
+  num_shards=3,
+  ws="tvm/topi-python-gpu",
+  platform="gpu",
+  docker_image="ci_gpu",
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='gpu') }}
+  ci_setup(ci_gpu)
+  sh (
+    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+    label: 'Run TOPI tests',
+  )
+{% endcall %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="frontend: GPU",
+  node="GPU",
+  num_shards=6,
+  ws="tvm/frontend-python-gpu",
+  platform="gpu",
+  docker_image="ci_gpu",
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='gpu') }}
+  ci_setup(ci_gpu)
+  sh (
+    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+    label: 'Run Python frontend tests',
+  )
+{% endcall %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="docs: GPU",
+  node="GPU",
+  num_shards=1,
+  ws="tvm/docs-python-gpu",
+  platform="gpu",
+  docker_image="ci_gpu",
+  condition="!skip_ci",
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='gpu') }}
+  ci_setup(ci_gpu)
+  sh (
+    script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh",
+    label: 'Build docs',
+  )
+  {{ m.upload_artifacts(tag='docs', filenames=["docs.tgz"]) }}
+  sh(
+    script: "aws s3 cp --no-progress _docs s3://${s3_bucket}/${s3_prefix}/docs --recursive",
+    label: 'Upload docs to S3',
+  )
+{% endcall %}
+
+
+{{ m.invoke_tests(test_method_names) }}
+
+
+def deploy_docs() {
+  // Note: This code must stay in the Jenkinsfile to ensure that it runs
+  // from a trusted context only
+  sh(
+    script: '''
+      set -eux
+      rm -rf tvm-site
+      git clone -b $DOCS_DEPLOY_BRANCH --depth=1 https://github.com/apache/tvm-site
+      cd tvm-site
+      git status
+      git checkout -B $DOCS_DEPLOY_BRANCH
+
+      git ls-tree HEAD docs/ --name-only | grep -vP '^docs/v\\d' | xargs rm -rf
+      mkdir -p docs
+      tar xf ../docs.tgz -C docs
+      COMMIT=$(cat docs/commit_hash)
+      git add .
+      git config user.name tvm-bot
+      git config user.email 95660001+tvm-bot@users.noreply.github.com
+      git commit -m"deploying docs (apache/tvm@$COMMIT)"
+      git status
+    ''',
+    label: 'Unpack docs and update tvm-site'
+  )
+
+  withCredentials([string(
+    credentialsId: 'docs-push-token',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh(
+      script: '''
+        cd tvm-site
+        git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git
+        git push deploy $DOCS_DEPLOY_BRANCH || true
+      ''',
+      label: 'Upload docs to apache/tvm-site'
+    )
+  }
+}
+
+def deploy() {
+  stage('Deploy') {
+    if (env.BRANCH_NAME == 'main') {
+      parallel(
+        {% call m.deploy_step(
+          name="Deploy Docs",
+          feature_flag="env.DOCS_DEPLOY_ENABLED == 'yes'",
+          ws="tvm/deploy-docs",
+        ) %}
+          init_git()
+          {{ m.download_artifacts(tag='docs') }}
+          deploy_docs()
+        {% endcall %}
+      )
+    }
+  }
+}
+
+deploy()
diff --git a/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2 b/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..140c227154a1
--- /dev/null
+++ b/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+{% call m.invoke_build(
+  name='BUILD: Hexagon',
+  node='CPU-SMALL',
+  condition='!skip_ci && is_docs_only_build != 1',
+  ws='tvm/build-hexagon',
+  docker_image='ci_hexagon',
+) %}
+  sh (
+    script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
+    label: 'Create Hexagon cmake config',
+  )
+  cmake_build(ci_hexagon, 'build', '-j2')
+  make_cpp_tests(ci_hexagon, 'build')
+  sh (
+    script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
+    label: 'Build Hexagon API',
+  )
+  {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib + cpptest + hexagon_api) }}
+{% endcall %}
+
+
+{% set test_method_names = [] %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="test: Hexagon",
+  node="CPU-SMALL",
+  ws="tvm/test-hexagon",
+  platform="hexagon",
+  docker_image="ci_hexagon",
+  test_method_names=test_method_names,
+  num_shards=8,
+) %}
+  {{ m.download_artifacts(tag='hexagon') }}
+  ci_setup(ci_hexagon)
+  {% if shard_index == 1 %}
+  cpp_unittest(ci_hexagon)
+  {% endif %}
+  sh (
+    script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+    label: 'Run Hexagon tests',
+  )
+{% endcall %}
+
+{{ m.invoke_tests(test_method_names) -}}
diff --git a/ci/jenkins/templates/i386_jenkinsfile.groovy.j2 b/ci/jenkins/templates/i386_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..1825e0cbd6bd
--- /dev/null
+++ b/ci/jenkins/templates/i386_jenkinsfile.groovy.j2
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+{% call m.invoke_build(
+  name='BUILD: i386',
+  node='CPU-SMALL',
+  condition='!skip_ci && is_docs_only_build != 1',
+  ws='tvm/build-i386',
+  docker_image='ci_i386',
+) %}
+  sh (
+    script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
+    label: 'Create i386 cmake config',
+  )
+  cmake_build(ci_i386, 'build', '-j2')
+  make_standalone_crt(ci_i386, 'build')
+  make_cpp_tests(ci_i386, 'build')
+  {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim + standalone_crt + crttest + cpptest) }}
+{% endcall %}
+
+
+{% set test_method_names = [] %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="python: i386",
+  node="CPU-SMALL",
+  num_shards=3,
+  ws="tvm/integration-python-i386",
+  platform="i386",
+  docker_image="ci_i386",
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='i386') }}
+  ci_setup(ci_i386)
+  {% if shard_index == 1 %}
+  cpp_unittest(ci_i386)
+  micro_cpp_unittest(ci_i386)
+  {% endif %}
+  python_unittest(ci_i386)
+  sh (
+    script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
+    label: 'Run i386 integration tests',
+  )
+  {% if shard_index == 2 or num_shards < 2 %}
+  fsim_test(ci_i386)
+  {% endif %}
+{% endcall %}
+
+{{ m.invoke_tests(test_method_names) -}}
diff --git a/ci/jenkins/templates/lint_jenkinsfile.groovy.j2 b/ci/jenkins/templates/lint_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..7423ee171007
--- /dev/null
+++ b/ci/jenkins/templates/lint_jenkinsfile.groovy.j2
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+{% macro sharded_lint_step(name, num_shards, docker_image, node, ws) %}
+{% for shard_index in range(1, num_shards + 1) %}
+  '{{ name }} {{ shard_index }} of {{ num_shards }}': {
+    node('{{ node }}') {
+      ws({{ m.per_exec_ws(ws) }}) {
+        init_git()
+        docker_init({{ docker_image }})
+        timeout(time: max_time, unit: 'MINUTES') {
+          withEnv([
+            'TVM_NUM_SHARDS={{ num_shards }}',
+            'TEST_STEP_NAME={{ name }}',
+            'TVM_SHARD_INDEX={{ shard_index - 1 }}',
+            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
+            {{ caller() | trim | indent(width=6) }}
+          })
+        }
+      }
+    }
+  },
+{% endfor %}
+{% endmacro %}
+
+stage('Lint') {
+  parallel(
+    {% call sharded_lint_step(
+      name='Lint',
+      num_shards=2,
+      node='CPU-SMALL',
+      ws='tvm/lint',
+      docker_image='ci_lint',
+    )
+    %}
+      sh (
+        script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh",
+        label: 'Run lint',
+      )
+    {% endcall %}
+  )
+}
diff --git a/ci/jenkins/templates/minimal_jenkinsfile.groovy.j2 b/ci/jenkins/templates/minimal_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..87db883745cc
--- /dev/null
+++ b/ci/jenkins/templates/minimal_jenkinsfile.groovy.j2
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+{% call m.invoke_build(
+  name='BUILD: CPU MINIMAL',
+  node='CPU-SMALL',
+  condition='!skip_ci && is_docs_only_build != 1',
+  ws='tvm/build-cpu-minimal',
+  docker_image='ci_minimal',
+) %}
+  sh (
+    script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build",
+    label: 'Create CPU minimal cmake config',
+  )
+  cmake_build(ci_minimal, 'build', '-j2')
+  make_standalone_crt(ci_minimal, 'build')
+  make_cpp_tests(ci_minimal, 'build')
+  {{ m.upload_artifacts(tag='cpu-minimal', filenames=tvm_lib + tvm_allvisible + crttest + cpptest + standalone_crt) }}
+{% endcall %}
+
+
+{% set test_method_names = [] %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="unittest: CPU MINIMAL",
+  node="CPU-SMALL",
+  num_shards=1,
+  ws="tvm/ut-python-cpu-minimal",
+  platform="minimal",
+  docker_image="ci_minimal",
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='cpu-minimal') }}
+  cpp_unittest(ci_minimal)
+  python_unittest(ci_minimal)
+{% endcall %}
+
+
+{{ m.invoke_tests(test_method_names) -}}
diff --git a/ci/jenkins/templates/riscv_jenkinsfile.groovy.j2 b/ci/jenkins/templates/riscv_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..35bb84cdf7e5
--- /dev/null
+++ b/ci/jenkins/templates/riscv_jenkinsfile.groovy.j2
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+{% call m.invoke_build(
+  name='BUILD: RISC-V',
+  node='CPU-SMALL',
+  condition='!skip_ci && is_docs_only_build != 1',
+  ws='tvm/build-riscv',
+  docker_image='ci_riscv',
+) %}
+  sh (
+    script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build",
+    label: 'Create RISC-V cmake config',
+  )
+  cmake_build(ci_riscv, 'build', '-j2')
+  make_standalone_crt(ci_riscv, 'build')
+  make_cpp_tests(ci_riscv, 'build')
+  {{ m.upload_artifacts(tag='riscv', filenames=tvm_lib + tvm_allvisible + standalone_crt + crttest + cpptest + microtvm_template_projects) }}
+{% endcall %}
+
+
+
+{% set test_method_names = [] %}
+
+{% call(shard_index, num_shards) m.sharded_test_step(
+  name="test: RISC-V",
+  node="CPU-SMALL",
+  ws="tvm/test-riscv",
+  platform="riscv",
+  docker_image="ci_riscv",
+  num_shards=1,
+  test_method_names=test_method_names,
+) %}
+  {{ m.download_artifacts(tag='riscv') }}
+  ci_setup(ci_riscv)
+  {% if shard_index == 1%}
+  cpp_unittest(ci_cortexm)
+  micro_cpp_unittest(ci_cortexm)
+  {% endif %}
+  sh (
+    script: "${docker_run} ${ci_riscv} ./tests/scripts/task_riscv_microtvm.sh",
+    label: 'Run microTVM tests',
+  )
+{% endcall %}
+
+{{ m.invoke_tests(test_method_names) -}}
diff --git a/ci/jenkins/templates/utils/Build.groovy.j2 b/ci/jenkins/templates/utils/Build.groovy.j2
new file mode 100644
index 000000000000..362e8341350f
--- /dev/null
+++ b/ci/jenkins/templates/utils/Build.groovy.j2
@@ -0,0 +1,57 @@
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh",
+    label: 'Clean up old workspace',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def make_standalone_crt(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target standalone_crt \
+        --build-dir build
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target crttest \
+        --build-dir build
+      """,
+    label: 'Make standalone CRT',
+  )
+}
+
+def make_cpp_tests(image, build_dir) {
+  sh (
+    script: """
+      set -eux
+      ${docker_run} ${image} python3 ./tests/scripts/task_build.py \
+        --sccache-bucket tvm-sccache-prod \
+        --cmake-target cpptest \
+        --build-dir ${build_dir}
+      """,
+    label: 'Make C++ tests',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/templates/utils/Prepare.groovy.j2
similarity index 98%
rename from ci/jenkins/Prepare.groovy.j2
rename to ci/jenkins/templates/utils/Prepare.groovy.j2
index 6a82a887ede6..099bde5bc770 100644
--- a/ci/jenkins/Prepare.groovy.j2
+++ b/ci/jenkins/templates/utils/Prepare.groovy.j2
@@ -75,7 +75,8 @@ def docker_init(image) {
   if (image.contains("amazonaws.com")) {
     // If this string is in the image name it's from ECR and needs to be pulled
     // with the right credentials
-    ecr_pull(image)
+    // ecr_pull(image)
+    sh "echo Pulling from AWS is not implemented && exit 1"
   } else {
     sh(
       script: """
diff --git a/ci/jenkins/templates/utils/Test.groovy.j2 b/ci/jenkins/templates/utils/Test.groovy.j2
new file mode 100644
index 000000000000..1e70869cfc2c
--- /dev/null
+++ b/ci/jenkins/templates/utils/Test.groovy.j2
@@ -0,0 +1,13 @@
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Run C++ tests',
+  )
+}
+
+def micro_cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build",
+    label: 'Run microTVM C++ tests',
+  )
+}
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/templates/utils/base.groovy.j2
similarity index 63%
rename from ci/jenkins/Jenkinsfile.j2
rename to ci/jenkins/templates/utils/base.groovy.j2
index 3aa44294966e..0854091c7a65 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/templates/utils/base.groovy.j2
@@ -1,4 +1,3 @@
-#!groovy
 // -*- mode: groovy -*-
 
 // Licensed to the Apache Software Foundation (ASF) under one
@@ -48,20 +47,18 @@
 // Generated at {{ generated_time }}
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
-{% import 'ci/jenkins/macros.j2' as m with context -%}
-
-// NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
-ci_lint = 'tlcpack/ci-lint:20221128-070141-ae4fd7df7'
-ci_gpu = 'tlcpack/ci-gpu:20221128-070141-ae4fd7df7'
-ci_cpu = 'tlcpack/ci-cpu:20221128-070141-ae4fd7df7'
-ci_minimal = 'tlcpack/ci-minimal:20221128-070141-ae4fd7df7'
-ci_wasm = 'tlcpack/ci-wasm:20221128-070141-ae4fd7df7'
-ci_i386 = 'tlcpack/ci-i386:20221128-070141-ae4fd7df7'
-ci_cortexm = 'tlcpack/ci-cortexm:20221128-070141-ae4fd7df7'
-ci_arm = 'tlcpack/ci-arm:20221128-070141-ae4fd7df7'
-ci_hexagon = 'tlcpack/ci-hexagon:20221025-182121-e41d0ed6e'
-ci_riscv = 'tlcpack/ci-riscv:20221128-070141-ae4fd7df7'
-// <--- End of regex-scanned config.
+{% import 'utils/macros.j2' as m with context -%}
+
+ci_lint = '{{ ci_lint }}'
+ci_gpu = '{{ ci_gpu }}'
+ci_cpu = '{{ ci_cpu }}'
+ci_minimal = '{{ ci_minimal }}'
+ci_wasm = '{{ ci_wasm }}'
+ci_i386 = '{{ ci_i386 }}'
+ci_cortexm = '{{ ci_cortexm }}'
+ci_arm = '{{ ci_arm }}'
+ci_hexagon = '{{ ci_hexagon }}'
+ci_riscv = '{{ ci_riscv }}'
 
 // Parameters to allow overriding (in Jenkins UI), the images
 // to be used by a given build. When provided, they take precedence
@@ -91,48 +88,20 @@ docker_build = 'docker/build.sh'
 max_time = 180
 rebuild_docker_images = false
 
-// Filenames for stashing between build and test steps
-{% set tvm_runtime = ['build/libtvm_runtime.so', 'build/config.cmake'] %}
-{% set crttest = ['build/crttest'] %}
-{% set tvm_allvisible = ['build/libtvm_allvisible.so'] %}
-{% set cpptest = ['build/cpptest', 'build/build.ninja', 'build/CMakeFiles/rules.ninja'] %}
-{% set tvm_lib = ['build/libtvm.so'] + tvm_runtime %}
-{% set tvm_multilib = ['build/libtvm.so', 'build/libvta_fsim.so'] + tvm_runtime %}
-{% set tvm_multilib_tsim = ['build/libvta_tsim.so'] + tvm_multilib %}
-{% set microtvm_template_projects = ['build/microtvm_template_projects',] %}
-{% set hexagon_api = ['build/hexagon_api_output',] %}
-{% set standalone_crt = ['build/standalone_crt', 'build/build.ninja'] %}
 s3_bucket = 'tvm-jenkins-artifacts-prod'
 s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
 
 // Jenkins script root directory
 jenkins_scripts_root = "ci/scripts/jenkins"
-{% set aws_default_region = "us-west-2" %}
-{% set aws_ecr_url = "dkr.ecr." + aws_default_region + ".amazonaws.com" %}
+
 
 // General note: Jenkins has limits on the size of a method (or top level code)
 // that are pretty strict, so most usage of groovy methods in these templates
 // are purely to satisfy the JVM
-{% include "ci/jenkins/Prepare.groovy.j2" %}
-{% include "ci/jenkins/DockerBuild.groovy.j2" %}
-{% include "ci/jenkins/Lint.groovy.j2" %}
-{% include "ci/jenkins/Build.groovy.j2" %}
-{% include "ci/jenkins/Test.groovy.j2" %}
-{% include "ci/jenkins/Deploy.groovy.j2" %}
-
+{% include "utils/Prepare.groovy.j2" %}
+{% include "utils/Build.groovy.j2" %}
+{% include "utils/Test.groovy.j2" %}
 
 cancel_previous_build()
 
 prepare()
-
-if (rebuild_docker_images) {
-  build_docker_images()
-}
-
-lint()
-
-build()
-
-test()
-
-deploy()
diff --git a/ci/jenkins/macros.j2 b/ci/jenkins/templates/utils/macros.j2
similarity index 58%
rename from ci/jenkins/macros.j2
rename to ci/jenkins/templates/utils/macros.j2
index ff59a4046179..5c65318477da 100644
--- a/ci/jenkins/macros.j2
+++ b/ci/jenkins/templates/utils/macros.j2
@@ -26,13 +26,13 @@ sh(
           )
 {% endmacro %}
 
-{% macro sharded_test_step(name, num_shards, node, ws, docker_image, platform, test_method_names) %}
+{% macro sharded_test_step(name, num_shards, node, ws, docker_image, platform, test_method_names, condition="!skip_ci && is_docs_only_build != 1") %}
 
 {% for shard_index in range(1, num_shards + 1) %}
 {% set method_name = "shard_run_" + name.replace(":", "").replace(" ", "-").replace("-", "_") + "_" + shard_index|string + "_of_" + num_shards|string %}
 {% set test_dir_name = name.replace(":", "").replace(" ", "-").replace("-", "_")|string %}
 def {{ method_name }}() {
-  if (!skip_ci && is_docs_only_build != 1) {
+  if ({{ condition }}) {
     node('{{ node }}') {
       ws({{ per_exec_ws(ws) }}) {
         try {
@@ -67,30 +67,9 @@ def {{ method_name }}() {
 {% endfor %}
 {% endmacro %}
 
-{% macro sharded_lint_step(name, num_shards, docker_image, node, ws) %}
-{% for shard_index in range(1, num_shards + 1) %}
-  '{{ name }} {{ shard_index }} of {{ num_shards }}': {
-    node('{{ node }}') {
-      ws({{ per_exec_ws(ws) }}) {
-        init_git()
-        docker_init({{ docker_image }})
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'TVM_NUM_SHARDS={{ num_shards }}',
-            'TEST_STEP_NAME={{ name }}',
-            'TVM_SHARD_INDEX={{ shard_index - 1 }}',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            {{ caller() | trim | indent(width=6) }}
-          })
-        }
-      }
-    }
-  },
-{% endfor %}
-{% endmacro %}
-
-{% macro build_step(name, condition, node, docker_image, ws) %}
-  '{{ name }}': {
+{% macro invoke_build(name, condition, node, docker_image, ws) %}
+def build() {
+  stage('Build') {
     if ({{ condition }}) {
       node('{{ node }}') {
         ws({{ per_exec_ws(ws) }}) {
@@ -104,67 +83,27 @@ def {{ method_name }}() {
     } else {
       Utils.markStageSkippedForConditional('{{ name }}')
     }
-  },
-{% endmacro %}
-
-{% macro test_step_body(name, node, ws, docker_image, platform) %}
-{% set test_dir_name = name.replace(":", "").replace(" ", "-").replace("-", "_")|string %}
-  if (!skip_ci && is_docs_only_build != 1) {
-    node('{{ node }}') {
-      ws({{ per_exec_ws(ws) }}) {
-        timeout(time: max_time, unit: 'MINUTES') {
-          try {
-            init_git()
-            docker_init({{ docker_image }})
-            withEnv(['PLATFORM={{ platform }}'], {
-              {{ caller() | indent(width=8) | trim }}
-            })
-          } finally {
-            try {
-              {{ junit_to_s3(test_dir_name) | indent(width=0) }}
-              junit 'build/pytest-results/*.xml'
-            } catch (Exception e) {
-              echo 'Exception during JUnit upload: ' + e.toString()
-            }
-          }
-        }
-      }
-    }
-  } else {
-    Utils.markStageSkippedForConditional('{{ name }}')
   }
+}
+build()
 {% endmacro %}
 
-{% macro test_step(name, node, ws, docker_image, platform) %}
-{% set test_dir_name = name.replace(":", "").replace(" ", "-").replace("-", "_")|string %}
-  '{{ name }}': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('{{ node }}') {
-        ws({{ per_exec_ws(ws) }}) {
-          timeout(time: max_time, unit: 'MINUTES') {
-            try {
-              init_git()
-              docker_init({{ docker_image }})
-              withEnv(['PLATFORM={{ platform }}',
-              'TEST_STEP_NAME={{ name }}',
-              "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-                {{ caller() | indent(width=12) | trim }}
-              })
-            } finally {
-            try {
-              {{ junit_to_s3(test_dir_name) | indent(width=4) }}
-              junit 'build/pytest-results/*.xml'
-            } catch (Exception e) {
-              echo 'Exception during JUnit upload: ' + e.toString()
-            }
-            }
-          }
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('{{ name }}')
+{% macro invoke_tests(test_method_names) %}
+def test() {
+  stage('Test') {
+    environment {
+      SKIP_SLOW_TESTS = "${skip_slow_tests}"
     }
-  },
+    parallel(
+    {% for stage_name, method_name in test_method_names %}
+    '{{ stage_name }}': {
+      {{ method_name }}()
+    },
+    {% endfor %}
+    )
+  }
+}
+test()
 {% endmacro %}
 
 {% macro deploy_step(name, feature_flag, ws) %}
diff --git a/ci/jenkins/templates/wasm_jenkinsfile.groovy.j2 b/ci/jenkins/templates/wasm_jenkinsfile.groovy.j2
new file mode 100644
index 000000000000..085fc12220b5
--- /dev/null
+++ b/ci/jenkins/templates/wasm_jenkinsfile.groovy.j2
@@ -0,0 +1,40 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{% include "utils/base.groovy.j2" with context %}
+{% import 'utils/macros.j2' as m with context -%}
+
+{% call m.invoke_build(
+  name='BUILD: WASM',
+  node='CPU-SMALL',
+  condition='!skip_ci && is_docs_only_build != 1',
+  ws='tvm/build-wasm',
+  docker_image='ci_wasm',
+) %}
+  sh (
+    script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
+    label: 'Create WASM cmake config',
+  )
+  cmake_build(ci_wasm, 'build', '-j2')
+  make_standalone_crt(ci_wasm, 'build')
+  make_cpp_tests(ci_wasm, 'build')
+  cpp_unittest(ci_wasm)
+  ci_setup(ci_wasm)
+  sh (
+    script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh",
+    label: 'Run WASM lint and tests',
+  )
+{% endcall %}
diff --git a/ci/scripts/jenkins/open_docker_update_pr.py b/ci/scripts/jenkins/open_docker_update_pr.py
index 9dcb241d5fd8..0531bdb6780b 100755
--- a/ci/scripts/jenkins/open_docker_update_pr.py
+++ b/ci/scripts/jenkins/open_docker_update_pr.py
@@ -22,15 +22,16 @@
 import os
 import json
 import re
+import shlex
 from urllib import error
 from typing import List, Dict, Any, Optional, Callable
 from git_utils import git, parse_remote, GitHubRepo
 from cmd_utils import REPO_ROOT, init_log
 from should_rebuild_docker import docker_api
 
-JENKINSFILE = REPO_ROOT / "ci" / "jenkins" / "Jenkinsfile.j2"
-GENERATED_JENKINSFILE = REPO_ROOT / "Jenkinsfile"
-GENERATE_SCRIPT = REPO_ROOT / "ci" / "jenkins" / "generate.py"
+JENKINS_DIR = REPO_ROOT / "ci" / "jenkins"
+IMAGES_FILE = JENKINS_DIR / "data.py"
+GENERATE_SCRIPT = JENKINS_DIR / "generate.py"
 GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
 BRANCH = "nightly-docker-update"
 
@@ -125,51 +126,41 @@ def latest_tlcpackstaging_image(source: str) -> Optional[str]:
     user, repo = parse_remote(remote)
 
     # Read the existing images from the Jenkinsfile
-    logging.info(f"Reading {JENKINSFILE}")
-    with open(JENKINSFILE) as f:
+    logging.info(f"Reading {IMAGES_FILE}")
+    with open(IMAGES_FILE) as f:
         content = f.readlines()
 
     # Build a new Jenkinsfile with the latest images from tlcpack or tlcpackstaging
-    new_content = []
     replacements = {}
+
     for line in content:
-        m = re.match(r"^(ci_[a-zA-Z0-9]+) = \'(.*)\'", line.strip())
+        m = re.match(r'"tag": "(.*)",', line.strip())
         if m is not None:
+            image_spec = m.groups()[0]
             logging.info(f"Found match on line {line.strip()}")
-            groups = m.groups()
-            new_image = latest_tlcpackstaging_image(groups[1])
+            new_image = latest_tlcpackstaging_image(image_spec)
             if new_image is None:
                 logging.info(f"No new image found")
-                new_content.append(line)
             else:
                 logging.info(f"Using new image {new_image}")
-                new_line = f"{groups[0]} = '{new_image}'\n"
-                new_content.append(new_line)
+                new_line = f'        "tag": "{new_image}",'
                 replacements[line] = new_line
-        else:
-            new_content.append(line)
 
-    # Write out the new content
-    if args.dry_run:
-        logging.info(f"Dry run, would have written new content to {JENKINSFILE}")
-    else:
-        logging.info(f"Writing new content to {JENKINSFILE}")
-        with open(JENKINSFILE, "w") as f:
-            f.write("".join(new_content))
+    # Re-generate the Jenkinsfiles
+    command = f"python3 {shlex.quote(str(GENERATE_SCRIPT))}"
 
-    # Re-generate the Jenkinsfile
-    logging.info(f"Editing {GENERATED_JENKINSFILE}")
-    with open(GENERATED_JENKINSFILE) as f:
-        generated_content = f.read()
+    content = "\n".join(content)
+    for old_line, new_line in replacements.items():
+        content = content.replace(old_line, new_line)
 
-    for original_line, new_line in replacements.items():
-        generated_content = generated_content.replace(original_line, new_line)
+    print(f"Updated to:\n{content}")
 
     if args.dry_run:
-        print(f"Would have written:\n{generated_content}")
+        print(f"Would have run:\n{command}")
     else:
-        with open(GENERATED_JENKINSFILE, "w") as f:
-            f.write(generated_content)
+        with open(IMAGES_FILE, "w") as f:
+            f.write(content)
+        Sh().run(command)
 
     # Publish the PR
     title = "[ci][docker] Nightly Docker image update"
@@ -177,12 +168,11 @@ def latest_tlcpackstaging_image(source: str) -> Optional[str]:
     message = f"{title}\n\n\n{body}"
 
     if args.dry_run:
-        logging.info("Dry run, would have committed Jenkinsfile")
+        logging.info("Dry run, would have committed Jenkinsfiles")
     else:
         logging.info(f"Creating git commit")
         git(["checkout", "-B", BRANCH])
-        git(["add", str(JENKINSFILE.relative_to(REPO_ROOT))])
-        git(["add", str(GENERATED_JENKINSFILE.relative_to(REPO_ROOT))])
+        git(["add", str(JENKINS_DIR.relative_to(REPO_ROOT))])
         git(["config", "user.name", "tvm-bot"])
         git(["config", "user.email", "95660001+tvm-bot@users.noreply.github.com"])
         git(["commit", "-m", message])
diff --git a/docker/dev_common.sh b/docker/dev_common.sh
index 59ab8db395a1..1ec04ecc90d8 100644
--- a/docker/dev_common.sh
+++ b/docker/dev_common.sh
@@ -28,24 +28,9 @@ INVOCATION_PWD="$(pwd)"
 GIT_TOPLEVEL=$(cd $(dirname ${BASH_SOURCE[0]}) && git rev-parse --show-toplevel)
 
 
-function filter_jenkinsfile() {
-    local echo_on=0;
-    while read line; do
-        if [ "${line}" == "// NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->" ]; then
-            echo_on=1
-        elif [ "${line}" == "// <--- End of regex-scanned config." ]; then
-            break
-        elif [ ${echo_on} -eq 1 ]; then
-            echo "$line"
-        fi
-    done
-}
-
-
 function lookup_image_spec() {
-    img_line=$(cat "${GIT_TOPLEVEL}/Jenkinsfile" | filter_jenkinsfile | grep -E "^${1} = ")
-    if [ -n "${img_line}" ]; then
-        img_spec=$(echo "${img_line}" | sed -E "s/${1} = '([^\"]*)'/\1/")
+    img_spec=$(python3 "${GIT_TOPLEVEL}/ci/jenkins/data.py" "$1")
+    if [ -n "${img_spec}" ]; then
         has_similar_docker_image=1
         docker inspect "${1}" &>/dev/null || has_similar_docker_image=0
         if [ ${has_similar_docker_image} -ne 0 ]; then
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index 2b8b61c41361..f5d5a2f0a370 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -89,6 +89,8 @@
     "ld",
     # Jinja2 templates
     "j2",
+    # Jenkinsfiles
+    "groovy",
 }
 
 # List of file names allowed
diff --git a/tests/lint/rat-excludes b/tests/lint/rat-excludes
index 1cdb78e31913..e6338dc81b3c 100644
--- a/tests/lint/rat-excludes
+++ b/tests/lint/rat-excludes
@@ -52,10 +52,5 @@ MANIFEST
 rat-excludes
 Cargo.lock
 
-# Included template files
-Build.groovy.j2
-Deploy.groovy.j2
-DockerBuild.groovy.j2
-Lint.groovy.j2
-Prepare.groovy.j2
-Test.groovy.j2
+# Template files for Jenkins
+.*\.groovy\.j2
diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index 710f152c9b1e..cf31b50b63ec 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -1213,7 +1213,7 @@ def test_github_tag_teams(tmpdir_factory, source_type, data, check):
         },
         expected="Using tlcpackstaging tag on tlcpack",
         expected_images=[
-            "ci_arm = 'tlcpack/ci-arm:456-456-abc'",
+            '"tag": "tlcpack/ci-arm:456-456-abc"',
         ],
     ),
     tlcpack_update=dict(
@@ -1235,7 +1235,7 @@ def test_github_tag_teams(tmpdir_factory, source_type, data, check):
         },
         expected="Found newer image, using: tlcpack",
         expected_images=[
-            "ci_arm = 'tlcpack/ci-arm:234-234-abc'",
+            '"tag": "tlcpack/ci-arm:234-234-abc",',
         ],
     ),
 )